merge operators

2026-05-10 02:46:34 +00:00 · 2019-05-31 21:27:28 -05:00
parent 7128cb976b 879e7f2ec9
commit 28f6ff1412
7 changed files with 1137 additions and 2 deletions
--- a/k8s/elasticsearch-cluster.yaml
+++ b/k8s/elasticsearch-cluster.yaml
@@ -0,0 +1,21 @@
+apiVersion: enterprises.upmc.com/v1
+kind: ElasticsearchCluster
+metadata:
+  name: es
+spec:
+  kibana:
+    image: docker.elastic.co/kibana/kibana-oss:6.1.3
+    image-pull-policy: Always
+  cerebro:
+    image: upmcenterprises/cerebro:0.7.2
+    image-pull-policy: Always
+  elastic-search-image: upmcenterprises/docker-elasticsearch-kubernetes:6.1.3_0
+  image-pull-policy: Always
+  client-node-replicas: 2
+  master-node-replicas: 3
+  data-node-replicas: 3
+  network-host: 0.0.0.0
+  use-ssl: false
+  data-volume-size: 10Gi
+  java-options: "-Xms512m -Xmx512m"
+
--- a/k8s/elasticsearch-operator.yaml
+++ b/k8s/elasticsearch-operator.yaml
@@ -0,0 +1,94 @@
+# This is mirrored from https://github.com/upmc-enterprises/elasticsearch-operator/blob/master/example/controller.yaml but using the elasticsearch-operator namespace instead of operator
+---
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: elasticsearch-operator
+---
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: elasticsearch-operator
+  namespace: elasticsearch-operator
+---
+apiVersion: rbac.authorization.k8s.io/v1beta1
+kind: ClusterRole
+metadata:
+  name: elasticsearch-operator
+rules:
+- apiGroups: ["extensions"]
+  resources: ["deployments", "replicasets", "daemonsets"]
+  verbs: ["create", "get", "update", "delete", "list"]
+- apiGroups: ["apiextensions.k8s.io"]
+  resources: ["customresourcedefinitions"]
+  verbs: ["create", "get", "update", "delete", "list"]
+- apiGroups: ["storage.k8s.io"]
+  resources: ["storageclasses"]
+  verbs: ["get", "list", "create", "delete", "deletecollection"]
+- apiGroups: [""]
+  resources: ["persistentvolumes", "persistentvolumeclaims", "services", "secrets", "configmaps"] 
+  verbs: ["create", "get", "update", "delete", "list"]
+- apiGroups: ["batch"]
+  resources: ["cronjobs", "jobs"]
+  verbs: ["create", "get", "deletecollection", "delete"]
+- apiGroups: [""]
+  resources: ["pods"]
+  verbs: ["list", "get", "watch"]
+- apiGroups: ["apps"]
+  resources: ["statefulsets", "deployments"]
+  verbs: ["*"]
+- apiGroups: ["enterprises.upmc.com"]
+  resources: ["elasticsearchclusters"]
+  verbs: ["*"]
+---
+apiVersion: rbac.authorization.k8s.io/v1beta1
+kind: ClusterRoleBinding
+metadata:
+  name: elasticsearch-operator
+  namespace: elasticsearch-operator
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: elasticsearch-operator
+subjects:
+- kind: ServiceAccount
+  name: elasticsearch-operator
+  namespace: elasticsearch-operator
+---
+apiVersion: extensions/v1beta1
+kind: Deployment
+metadata:
+  name: elasticsearch-operator
+  namespace: elasticsearch-operator
+spec:
+  replicas: 1
+  template:
+    metadata:
+      labels:
+        name: elasticsearch-operator
+    spec:
+      containers:
+      - name: operator
+        image: upmcenterprises/elasticsearch-operator:0.2.0
+        imagePullPolicy: Always
+        env:
+        - name: NAMESPACE
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.namespace
+        ports:
+        - containerPort: 8000
+          name: http
+        livenessProbe:
+          httpGet:
+            path: /live
+            port: 8000
+          initialDelaySeconds: 10
+          timeoutSeconds: 10
+        readinessProbe:
+          httpGet:
+            path: /ready
+            port: 8000
+          initialDelaySeconds: 10
+          timeoutSeconds: 5
+      serviceAccount: elasticsearch-operator
--- a/k8s/filebeat.yaml
+++ b/k8s/filebeat.yaml
@@ -0,0 +1,167 @@
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: filebeat-config
+  namespace: kube-system
+  labels:
+    k8s-app: filebeat
+data:
+  filebeat.yml: |-
+    filebeat.config:
+      inputs:
+        # Mounted `filebeat-inputs` configmap:
+        path: ${path.config}/inputs.d/*.yml
+        # Reload inputs configs as they change:
+        reload.enabled: false
+      modules:
+        path: ${path.config}/modules.d/*.yml
+        # Reload module configs as they change:
+        reload.enabled: false
+
+    # To enable hints based autodiscover, remove `filebeat.config.inputs` configuration and uncomment this:
+    #filebeat.autodiscover:
+    #  providers:
+    #    - type: kubernetes
+    #      hints.enabled: true
+
+    processors:
+      - add_cloud_metadata:
+
+    cloud.id: ${ELASTIC_CLOUD_ID}
+    cloud.auth: ${ELASTIC_CLOUD_AUTH}
+
+    output.elasticsearch:
+      hosts: ['${ELASTICSEARCH_HOST:elasticsearch}:${ELASTICSEARCH_PORT:9200}']
+      username: ${ELASTICSEARCH_USERNAME}
+      password: ${ELASTICSEARCH_PASSWORD}
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: filebeat-inputs
+  namespace: kube-system
+  labels:
+    k8s-app: filebeat
+data:
+  kubernetes.yml: |-
+    - type: docker
+      containers.ids:
+      - "*"
+      processors:
+        - add_kubernetes_metadata:
+            in_cluster: true
+---
+apiVersion: extensions/v1beta1
+kind: DaemonSet
+metadata:
+  name: filebeat
+  namespace: kube-system
+  labels:
+    k8s-app: filebeat
+spec:
+  template:
+    metadata:
+      labels:
+        k8s-app: filebeat
+    spec:
+      serviceAccountName: filebeat
+      terminationGracePeriodSeconds: 30
+      containers:
+      - name: filebeat
+        image: docker.elastic.co/beats/filebeat-oss:7.0.1
+        args: [
+          "-c", "/etc/filebeat.yml",
+          "-e",
+        ]
+        env:
+        - name: ELASTICSEARCH_HOST
+          value: elasticsearch-es.default.svc.cluster.local
+        - name: ELASTICSEARCH_PORT
+          value: "9200"
+        - name: ELASTICSEARCH_USERNAME
+          value: elastic
+        - name: ELASTICSEARCH_PASSWORD
+          value: changeme
+        - name: ELASTIC_CLOUD_ID
+          value:
+        - name: ELASTIC_CLOUD_AUTH
+          value:
+        securityContext:
+          runAsUser: 0
+          # If using Red Hat OpenShift uncomment this:
+          #privileged: true
+        resources:
+          limits:
+            memory: 200Mi
+          requests:
+            cpu: 100m
+            memory: 100Mi
+        volumeMounts:
+        - name: config
+          mountPath: /etc/filebeat.yml
+          readOnly: true
+          subPath: filebeat.yml
+        - name: inputs
+          mountPath: /usr/share/filebeat/inputs.d
+          readOnly: true
+        - name: data
+          mountPath: /usr/share/filebeat/data
+        - name: varlibdockercontainers
+          mountPath: /var/lib/docker/containers
+          readOnly: true
+      volumes:
+      - name: config
+        configMap:
+          defaultMode: 0600
+          name: filebeat-config
+      - name: varlibdockercontainers
+        hostPath:
+          path: /var/lib/docker/containers
+      - name: inputs
+        configMap:
+          defaultMode: 0600
+          name: filebeat-inputs
+      # data folder stores a registry of read status for all files, so we don't send everything again on a Filebeat pod restart
+      - name: data
+        hostPath:
+          path: /var/lib/filebeat-data
+          type: DirectoryOrCreate
+---
+apiVersion: rbac.authorization.k8s.io/v1beta1
+kind: ClusterRoleBinding
+metadata:
+  name: filebeat
+subjects:
+- kind: ServiceAccount
+  name: filebeat
+  namespace: kube-system
+roleRef:
+  kind: ClusterRole
+  name: filebeat
+  apiGroup: rbac.authorization.k8s.io
+---
+apiVersion: rbac.authorization.k8s.io/v1beta1
+kind: ClusterRole
+metadata:
+  name: filebeat
+  labels:
+    k8s-app: filebeat
+rules:
+- apiGroups: [""] # "" indicates the core API group
+  resources:
+  - namespaces
+  - pods
+  verbs:
+  - get
+  - watch
+  - list
+---
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: filebeat
+  namespace: kube-system
+  labels:
+    k8s-app: filebeat
+---
--- a/k8s/local-path-storage.yaml
+++ b/k8s/local-path-storage.yaml
@@ -0,0 +1,110 @@
+# This is a local copy of:
+# https://github.com/rancher/local-path-provisioner/blob/master/deploy/local-path-storage.yaml
+---
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: local-path-storage
+---
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: local-path-provisioner-service-account
+  namespace: local-path-storage
+---
+apiVersion: rbac.authorization.k8s.io/v1beta1
+kind: ClusterRole
+metadata:
+  name: local-path-provisioner-role
+  namespace: local-path-storage
+rules:
+- apiGroups: [""]
+  resources: ["nodes", "persistentvolumeclaims"]
+  verbs: ["get", "list", "watch"]
+- apiGroups: [""]
+  resources: ["endpoints", "persistentvolumes", "pods"]
+  verbs: ["*"]
+- apiGroups: [""]
+  resources: ["events"]
+  verbs: ["create", "patch"]
+- apiGroups: ["storage.k8s.io"]
+  resources: ["storageclasses"]
+  verbs: ["get", "list", "watch"]
+---
+apiVersion: rbac.authorization.k8s.io/v1beta1
+kind: ClusterRoleBinding
+metadata:
+  name: local-path-provisioner-bind
+  namespace: local-path-storage
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: local-path-provisioner-role
+subjects:
+- kind: ServiceAccount
+  name: local-path-provisioner-service-account
+  namespace: local-path-storage
+---
+apiVersion: apps/v1beta2
+kind: Deployment
+metadata:
+  name: local-path-provisioner
+  namespace: local-path-storage
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: local-path-provisioner
+  template:
+    metadata:
+      labels:
+        app: local-path-provisioner
+    spec:
+      serviceAccountName: local-path-provisioner-service-account
+      containers:
+      - name: local-path-provisioner
+        image: rancher/local-path-provisioner:v0.0.8
+        imagePullPolicy: Always
+        command:
+        - local-path-provisioner
+        - --debug
+        - start
+        - --config
+        - /etc/config/config.json
+        volumeMounts:
+        - name: config-volume
+          mountPath: /etc/config/
+        env:
+        - name: POD_NAMESPACE
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.namespace
+      volumes:
+        - name: config-volume
+          configMap:
+            name: local-path-config
+---
+apiVersion: storage.k8s.io/v1
+kind: StorageClass
+metadata:
+  name: local-path
+provisioner: rancher.io/local-path
+volumeBindingMode: WaitForFirstConsumer
+reclaimPolicy: Delete
+---
+kind: ConfigMap
+apiVersion: v1
+metadata:
+  name: local-path-config
+  namespace: local-path-storage
+data:
+  config.json: |-
+        {
+                "nodePathMap":[
+                {
+                        "node":"DEFAULT_PATH_FOR_NON_LISTED_NODES",
+                        "paths":["/opt/local-path-provisioner"]
+                }
+                ]
+        }
+
--- a/slides/k8s/operators-design.md
+++ b/slides/k8s/operators-design.md
@@ -0,0 +1,356 @@
+## What does it take to write an operator?
+
+- Writing a quick-and-dirty operator, or a POC/MVP, is easy
+
+- Writing a robust operator is hard
+
+- We will describe the general idea
+
+- We will identify some of the associated challenges
+
+- We will list a few tools that can help us
+
+---
+
+## Top-down vs. bottom-up
+
+- Both approaches are possible
+
+- Let's see what they entail, and their respective pros and cons
+
+---
+
+## Top-down approach
+
+- Start with high-level design (see next slide)
+
+- Pros:
+
+  - can yield cleaner design that will be more robust
+
+- Cons:
+
+  - must be able to anticipate all the events that might happen
+
+  - design will be better only to the extend of what we anticipated
+
+  - hard to anticipate if we don't have production experience
+
+---
+
+## High-level design
+
+- What are we solving?
+
+  (e.g.: geographic databases backed by PostGIS with Redis caches)
+
+- What are our use-cases, stories?
+
+  (e.g.: adding/resizing caches and read replicas; load balancing queries)
+
+- What kind of outage do we want to address?
+
+  (e.g.: loss of individual node, pod, volume)
+
+- What are our *non-features*, the things we don't want to address?
+
+  (e.g.: loss of datacenter/zone; differentiating between read and write queries;
+  <br/>
+  cache invalidation; upgrading to newer major versions of Redis, PostGIS, PostgreSQL)
+
+---
+
+## Low-level design
+
+- What Custom Resource Definitions do we need?
+
+  (one, many?)
+
+- How will we store configuration information?
+
+  (part of the CRD spec fields, annotations, other?)
+
+- Do we need to store state? If so, where?
+
+  - state that is small and doesn't change much can be stored via the Kubernetes API
+    <br/>
+    (e.g.: leader information, configuration, credentials)
+
+  - things that are big and/or change a lot should go elsewhere
+    <br/>
+    (e.g.: metrics, bigger configuration file like GeoIP)
+
+---
+
+class: extra-details
+
+## What can we store via the Kubernetes API?
+
+- The API server stores most Kubernetes resources into etcd
+
+- Etcd is designed for reliability, not for performance
+
+- If our storage needs exceed what etcd can offer, we need to use something else:
+
+  - either directly
+
+  - or by extending the API server
+    <br/>(for instance by using the agregation layer, like [metrics server](https://github.com/kubernetes-incubator/metrics-server) does)
+
+---
+
+## Bottom-up approach
+
+- Start with existing Kubernetes resources (Deployment, Stateful Set...)
+
+- Run the system in production
+
+- Add scripts, automation, to facilitate day-to-day operations
+
+- Turn the scripts into an operator
+
+- Pros: simpler to get started; reflects actual use-cases
+
+- Cons: can result in convoluted designs requiring extensive refactor
+
+---
+
+## General idea
+
+- Our operator will watch its CRDs *and associated resources*
+
+- Drawing state diagrams and finite state automata helps a lot
+
+- It's OK if some transitions lead to a big catch-all "human intervention"
+
+- Over time, we will learn about new failure modes and add to these diagrams
+
+- It's OK to start with CRD creation / deletion and prevent any modification
+
+  (that's the easy POC/MVP we were talking about)
+
+- *Presentation* and *validation* will help our users
+
+  (more on that later)
+
+---
+
+## Challenges
+
+- Reacting to infrastructure disruption can seem hard at first
+
+- Kubernetes gives us a lot of primitives to help:
+
+  - Pods and Persistent Volumes will *eventually* recover
+
+  - Stateful Sets give us easy ways to "add N copies" of a thing
+
+- The real challenges come with configuration changes
+
+  (i.e., what to do when our users update our CRDs)
+
+- Keep in mind that [some] of the [largest] cloud [outages] haven't been caused by [natural catastrophes], or even code bugs, but by configuration changes
+
+[some]: https://www.datacenterdynamics.com/news/gcp-outage-mainone-leaked-google-cloudflare-ip-addresses-china-telecom/
+[largest]: https://aws.amazon.com/message/41926/
+[outages]: https://aws.amazon.com/message/65648/
+[natural catastrophes]: https://www.datacenterknowledge.com/amazon/aws-says-it-s-never-seen-whole-data-center-go-down
+
+---
+
+## Configuration changes
+
+- It is helpful to analyze and understand how Kubernetes controllers work:
+
+  - watch resource for modifications
+
+  - compare desired state (CRD) and current state
+
+  - issue actions to converge state
+
+- Configuration changes will probably require *another* state diagram or FSA
+
+- Again, it's OK to have transitions labeled as "unsupported"
+
+  (i.e. reject some modifications because we can't execute them)
+
+---
+
+## Tools
+
+- CoreOS / RedHat Operator Framework
+
+  [GitHub](https://github.com/operator-framework)
+  |
+  [Blog](https://developers.redhat.com/blog/2018/12/18/introduction-to-the-kubernetes-operator-framework/)
+  |
+  [Intro talk](https://www.youtube.com/watch?v=8k_ayO1VRXE)
+  |
+  [Deep dive talk](https://www.youtube.com/watch?v=fu7ecA2rXmc)
+
+- Zalando Kubernetes Operator Pythonic Framework (KOPF)
+
+  [GitHub](https://github.com/zalando-incubator/kopf)
+  |
+  [Docs](https://kopf.readthedocs.io/)
+  |
+  [Step-by-step tutorial](https://kopf.readthedocs.io/en/stable/walkthrough/problem/)
+
+- Mesosphere Kubernetes Universal Declarative Operator (KUDO)
+
+  [GitHub](https://github.com/kudobuilder/kudo)
+  |
+  [Blog](https://mesosphere.com/blog/announcing-maestro-a-declarative-no-code-approach-to-kubernetes-day-2-operators/)
+  |
+  [Docs](https://kudo.dev/)
+  |
+  [Zookeeper example](https://github.com/kudobuilder/frameworks/tree/master/repo/stable/zookeeper)
+
+---
+
+## Validation
+
+- By default, a CRD is "free form"
+
+  (we can put pretty much anything we want in it)
+
+- When creating a CRD, we can provide an OpenAPI v3 schema
+  ([Example](https://github.com/amaizfinance/redis-operator/blob/master/deploy/crds/k8s_v1alpha1_redis_crd.yaml#L34))
+
+- The API server will then validate resources created/edited with this schema
+
+- If we need a stronger validation, we can use a Validating Admission Webhook:
+
+  - run an [admission webhook server](https://kubernetes.io/docs/reference/access-authn-authz/extensible-admission-controllers/#write-an-admission-webhook-server) to receive validation requests
+
+  - register the webhook by creating a [ValidatingWebhookConfiguration](https://kubernetes.io/docs/reference/access-authn-authz/extensible-admission-controllers/#configure-admission-webhooks-on-the-fly)
+
+  - each time the API server receives a request matching the configuration,
+    <br/>the request is sent to our server for validation
+
+---
+
+## Presentation
+
+- By default, `kubectl get mycustomresource` won't display much information
+
+  (just the name and age of each resource)
+
+- When creating a CRD, we can specify additional columns to print
+  ([Example](https://github.com/amaizfinance/redis-operator/blob/master/deploy/crds/k8s_v1alpha1_redis_crd.yaml#L6),
+  [Docs](https://kubernetes.io/docs/tasks/access-kubernetes-api/custom-resources/custom-resource-definitions/#additional-printer-columns))
+
+- By default, `kubectl describe mycustomresource` will also be generic
+
+- `kubectl describe` can show events related to our custom resources
+
+  (for that, we need to create Event resources, and fill the `involvedObject` field)
+
+- For scalable resources, we can define a `scale` sub-resource
+
+- This will enable the use of `kubectl scale` and other scaling-related operations
+
+---
+
+## About scaling
+
+- It is possible to use the HPA (Horizontal Pod Autoscaler) with CRDs
+
+- But it is not always desirable
+
+- The HPA works very well for homogenous, stateless workloads
+
+- For other workloads, your mileage may vary
+
+- Some systems can scale across multiple dimensions
+
+  (for instance: increase number of replicas, or number of shards?)
+
+- If autoscaling is desired, the operator will have to take complex decisions
+
+  (example: Zalando's Elasticsearch Operator ([Video](https://www.youtube.com/watch?v=lprE0J0kAq0)))
+
+---
+
+## Versioning
+
+- As our operator evolves over time, we may have to change the CRD
+
+  (add, remove, change fields)
+
+- Like every other resource in Kubernetes, [custom resources are versioned](https://kubernetes.io/docs/tasks/access-kubernetes-api/custom-resources/custom-resource-definition-versioning/
+)
+
+- When creating a CRD, we need to specify a *list* of versions
+
+- Versions can be marked as `stored` and/or `served`
+
+---
+
+## Stored version
+
+- Exactly one version has to be marked as the `stored` version
+
+- As the name implies, it is the one that will be stored in etcd
+
+- Resources in storage are never converted automatically
+
+  (we need to read and re-write them ourselves)
+
+- Yes, this means that we can have different versions in etcd at any time
+
+- Our code needs to handle all the versions that still exist in storage
+
+---
+
+## Served versions
+
+- By default, the Kubernetes API will serve resources "as-is"
+
+  (using their stored version)
+
+- It will assume that all versions are compatible storage-wise
+
+  (i.e. that the spec and fields are compatible between versions)
+
+- We can provide [conversion webhooks](https://kubernetes.io/docs/tasks/access-kubernetes-api/custom-resources/custom-resource-definition-versioning/#webhook-conversion) to "translate" requests
+
+  (the alternative is to upgrade all stored resources and stop serving old versions)
+
+---
+
+## Operator reliability
+
+- Remember that the operator itself must be resilient
+
+  (e.g.: the node running it can fail)
+
+- Our operator must be able to restart and recover gracefully
+
+- Do not store state locally
+
+  (unless we can reconstruct that state when we restart)
+
+- As indicated earlier, we can use the Kubernetes API to store data:
+
+  - in the custom resources themselves
+
+  - in other resources' annotations
+
+---
+
+## Beyond CRDs
+
+- CRDs cannot use custom storage (e.g. for time series data)
+
+- CRDs cannot support arbitrary subresources (like logs or exec for Pods)
+
+- CRDs cannot support protobuf (for faster, more efficient communication)
+
+- If we need these things, we can use the [aggregation layer](https://kubernetes.io/docs/concepts/extend-kubernetes/api-extension/apiserver-aggregation/) instead
+
+- The aggregation layer proxies all requests below a specific path to another server
+
+  (this is used e.g. by the metrics server)
+
+- [This documentation page](https://kubernetes.io/docs/concepts/extend-kubernetes/api-extension/custom-resources/#choosing-a-method-for-adding-custom-resources) compares the features of CRDs and API aggregation
--- a/slides/k8s/operators.md
+++ b/slides/k8s/operators.md
@@ -0,0 +1,389 @@
+# Operators
+
+- Operators are one of the many ways to extend Kubernetes
+
+- We will define operators
+
+- We will see how they work
+
+- We will install a specific operator (for ElasticSearch)
+
+- We will use it to provision an ElasticSearch cluster
+
+---
+
+## What are operators?
+
+*An operator represents **human operational knowledge in software,**
+<br/>
+to reliably manage an application.
+— [CoreOS](https://coreos.com/blog/introducing-operators.html)*
+
+Examples:
+
+- Deploying and configuring replication with MySQL, PostgreSQL ...
+
+- Setting up Elasticsearch, Kafka, RabbitMQ, Zookeeper ...
+
+- Reacting to failures when intervention is needed
+
+- Scaling up and down these systems
+
+---
+
+## What are they made from?
+
+- Operators combine two things:
+
+  - Custom Resource Definitions
+
+  - controller code watching the corresponding resources and acting upon them
+
+- A given operator can define one or multiple CRDs
+
+- The controller code (control loop) typically runs within the cluster
+
+  (running as a Deployment with 1 replica is a common scenario)
+
+- But it could also run elsewhere
+
+  (nothing mandates that the code run on the cluster, as long as it has API access)
+
+---
+
+## Why use operators?
+
+- Kubernetes gives us Deployments, StatefulSets, Services ...
+
+- These mechanisms give us building blocks to deploy applications
+
+- They work great for services that are made of *N* identical containers
+
+  (like stateless ones)
+
+- They also work great for some stateful applications like Consul, etcd ...
+
+  (with the help of highly persistent volumes)
+
+- They're not enough for complex services:
+
+  - where different containers have different roles
+
+  - where extra steps have to be taken when scaling or replacing containers
+
+---
+
+## Use-cases for operators
+
+- Systems with primary/secondary replication
+
+  Examples: MariaDB, MySQL, PostgreSQL, Redis ...
+
+- Systems where different groups of nodes have different roles
+
+  Examples: ElasticSearch, MongoDB ...
+
+- Systems with complex dependencies (that are themselves managed with operators)
+
+  Examples: Flink or Kafka, which both depend on Zookeeper
+
+---
+
+## More use-cases
+
+- Representing and managing external resources
+
+  (Example: [AWS Service Operator](https://operatorhub.io/operator/alpha/aws-service-operator.v0.0.1))
+
+- Managing complex cluster add-ons
+
+  (Example: [Istio operator](https://operatorhub.io/operator/beta/istio-operator.0.1.6))
+
+- Deploying and managing our applications' lifecycles
+
+  (more on that later)
+
+---
+
+## How operators work
+
+- An operator creates one or more CRDs
+
+  (i.e., it creates new "Kinds" of resources on our cluster)
+
+- The operator also runs a *controller* that will watch its resources
+
+- Each time we create/update/delete a resource, the controller is notified
+
+  (we could write our own cheap controller with `kubectl get --watch`)
+
+---
+
+## One operator in action
+
+- We will install the UPMC Enterprises ElasticSearch operator
+
+- This operator requires PersistentVolumes
+
+- We will install Rancher's [local path storage provisioner](https://github.com/rancher/local-path-provisioner) to automatically create these
+
+- Then, we will create an ElasticSearch resource
+
+- The operator will detect that resource and provision the cluster
+
+---
+
+## Installing a Persistent Volume provisioner
+
+(This step can be skipped if you already have a dynamic volume provisioner.)
+
+- This provisioner creates Persistent Volumes backed by `hostPath`
+
+  (local directories on our nodes)
+
+- It doesn't require anything special ...
+
+- ... But losing a node = losing the volumes on that node!
+
+.exercise[
+
+- Install the local path storage provisioner:
+  ```bash
+  kubectl apply -f ~/container.training/k8s/local-path-storage.yaml
+  ```
+
+]
+
+---
+
+## Making sure we have a default StorageClass
+
+- The ElasticSearch operator will create StatefulSets
+
+- These StatefulSets will instantiate PersistentVolumeClaims
+
+- These PVCs need to be explicitly associated with a StorageClass
+
+- Or we need to tag a StorageClass to be used as the default one
+
+.exercise[
+
+- List StorageClasses:
+  ```bash
+  kubectl get storageclasses
+  ```
+
+]
+
+We should see the `local-path` StorageClass.
+
+---
+
+## Setting a default StorageClass
+
+- This is done by adding an annotation to the StorageClass:
+
+  `storageclass.kubernetes.io/is-default-class: true`
+
+.exercise[
+
+- Tag the StorageClass so that it's the default one:
+  ```bash
+  kubectl annotate storageclass local-path \
+            storageclass.kubernetes.io/is-default-class=true
+  ```
+
+- Check the result:
+  ```bash
+  kubectl get storageclasses
+  ```
+
+]
+
+Now, the StorageClass should have `(default)` next to its name.
+
+---
+
+## Install the ElasticSearch operator
+
+- The operator needs:
+
+  - a Deployment for its controller
+  - a ServiceAccount, ClusterRole, ClusterRoleBinding for permissions
+  - a Namespace
+
+- We have grouped all the definitions for these resources in a YAML file
+
+.exercise[
+
+- Install the operator:
+  ```bash
+  kubectl apply -f ~/container.training/k8s/elasticsearch-operator.yaml
+  ```
+
+]
+
+---
+
+## Wait for the operator to be ready
+
+- Some operators require to create their CRDs separately
+
+- This operator will create its CRD itself
+
+  (i.e. the CRD is not listed in the YAML that we applied earlier)
+
+.exercise[
+
+- Wait until the `elasticsearchclusters` CRD shows up:
+  ```bash
+  kubectl get crds
+  ```
+
+]
+
+---
+
+## Create an ElasticSearch resource
+
+- We can now create a resource with `kind: ElasticsearchCluster`
+
+- The YAML for that resource will specify all the desired parameters:
+
+  - how many nodes do we want of each type (client, master, data)
+  - image to use
+  - add-ons (kibana, cerebro, ...)
+  - whether to use TLS or not
+  - etc.
+
+.exercise[
+
+- Create our ElasticSearch cluster:
+  ```bash
+  kubectl apply -f ~/container.training/k8s/elasticsearch-cluster.yaml
+  ```
+
+]
+
+---
+
+## Operator in action
+
+- Over the next minutes, the operator will create:
+
+  - StatefulSets (one for master nodes, one for data nodes)
+
+  - Deployments (for client nodes; and for add-ons like cerebro and kibana)
+
+  - Services (for all these pods)
+
+.exercise[
+
+- Wait for all the StatefulSets to be fully up and running:
+  ```bash
+  kubectl get statefulsets -w
+  ```
+
+]
+
+---
+
+## Connecting to our cluster
+
+- Since connecting directly to the ElasticSearch API is a bit raw,
+  <br/>we'll connect to the cerebro frontend instead
+
+.exercise[
+
+- Edit the cerebro service to change its type from ClusterIP to NodePort:
+  ```bash
+  kubectl patch svc cerebro-es -p "spec: { type: NodePort }"
+  ```
+
+- Retrieve the NodePort that was allocated:
+  ```bash
+  kubectl get svc cerebreo-es
+  ```
+
+- Connect to that port with a browser
+
+]
+
+---
+
+## (Bonus) Setup filebeat
+
+- Let's send some data to our brand new ElasticSearch cluster!
+
+- We'll deploy a filebeat DaemonSet to collect node logs
+
+.exercise[
+
+- Deploy filebeat:
+  ```bash
+  kubectl apply -f ~/container.training/k8s/filebeat.yaml
+  ```
+
+]
+
+We should see at least one index being created in cerebro.
+
+---
+
+## (Bonus) Access log data with kibana
+
+- Let's expose kibana (by making kibana-es a NodePort too)
+
+- Then access kibana
+
+- We'll need to configure kibana indexes
+
+---
+
+## Deploying our apps with operators
+
+- It is very simple to deploy with `kubectl run` / `kubectl expose`
+
+- We can unlock more features by writing YAML and using `kubectl apply`
+
+- Kustomize or Helm let us deploy in multiple environments
+
+  (and adjust/tweak parameters in each environment)
+
+- We can also use an operator to deploy our application
+
+---
+
+## Pros and cons of deploying with operators
+
+- The app definition and configuration is persisted in the Kubernetes API
+
+- Multiple instances of the app can be manipulated with `kubectl get`
+
+- We can add labels, annotations to the app instances
+
+- Our controller can execute custom code for any lifecycle event
+
+- However, we need to write this controller
+
+- We need to be careful about changes
+
+  (what happens when the resource `spec` is updated?)
+
+---
+
+## Operators are not magic
+
+- Look at the ElasticSearch resource definition
+
+  (`~/container.training/k8s/elasticsearch-cluster.yaml`)
+
+- What should happen if we flip the `use-tls` flag? Twice?
+
+- What should happen if we remove / re-add the kibana or cerebro sections?
+
+- What should happen if we change the number of nodes?
+
+- What if we want different images or parameters for the different nodes?
+
+*Operators can be very powerful, iff we know exactly the scenarios that they can handle.*
--- a/slides/kadm-fourday.yml
+++ b/slides/kadm-fourday.yml
@@ -93,5 +93,3 @@ chapters:
 - - k8s/lastwords-admin.md
  - k8s/links.md
  - shared/thankyou.md
-
-