diff --git a/k8s/elasticsearch-cluster.yaml b/k8s/elasticsearch-cluster.yaml
new file mode 100644
index 00000000..23d8108d
--- /dev/null
+++ b/k8s/elasticsearch-cluster.yaml
@@ -0,0 +1,21 @@
+apiVersion: enterprises.upmc.com/v1
+kind: ElasticsearchCluster
+metadata:
+ name: es
+spec:
+ kibana:
+ image: docker.elastic.co/kibana/kibana-oss:6.1.3
+ image-pull-policy: Always
+ cerebro:
+ image: upmcenterprises/cerebro:0.7.2
+ image-pull-policy: Always
+ elastic-search-image: upmcenterprises/docker-elasticsearch-kubernetes:6.1.3_0
+ image-pull-policy: Always
+ client-node-replicas: 2
+ master-node-replicas: 3
+ data-node-replicas: 3
+ network-host: 0.0.0.0
+ use-ssl: false
+ data-volume-size: 10Gi
+ java-options: "-Xms512m -Xmx512m"
+
diff --git a/k8s/elasticsearch-operator.yaml b/k8s/elasticsearch-operator.yaml
new file mode 100644
index 00000000..0049541e
--- /dev/null
+++ b/k8s/elasticsearch-operator.yaml
@@ -0,0 +1,94 @@
+# This is mirrored from https://github.com/upmc-enterprises/elasticsearch-operator/blob/master/example/controller.yaml but using the elasticsearch-operator namespace instead of operator
+---
+apiVersion: v1
+kind: Namespace
+metadata:
+ name: elasticsearch-operator
+---
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+ name: elasticsearch-operator
+ namespace: elasticsearch-operator
+---
+apiVersion: rbac.authorization.k8s.io/v1beta1
+kind: ClusterRole
+metadata:
+ name: elasticsearch-operator
+rules:
+- apiGroups: ["extensions"]
+ resources: ["deployments", "replicasets", "daemonsets"]
+ verbs: ["create", "get", "update", "delete", "list"]
+- apiGroups: ["apiextensions.k8s.io"]
+ resources: ["customresourcedefinitions"]
+ verbs: ["create", "get", "update", "delete", "list"]
+- apiGroups: ["storage.k8s.io"]
+ resources: ["storageclasses"]
+ verbs: ["get", "list", "create", "delete", "deletecollection"]
+- apiGroups: [""]
+ resources: ["persistentvolumes", "persistentvolumeclaims", "services", "secrets", "configmaps"]
+ verbs: ["create", "get", "update", "delete", "list"]
+- apiGroups: ["batch"]
+ resources: ["cronjobs", "jobs"]
+ verbs: ["create", "get", "deletecollection", "delete"]
+- apiGroups: [""]
+ resources: ["pods"]
+ verbs: ["list", "get", "watch"]
+- apiGroups: ["apps"]
+ resources: ["statefulsets", "deployments"]
+ verbs: ["*"]
+- apiGroups: ["enterprises.upmc.com"]
+ resources: ["elasticsearchclusters"]
+ verbs: ["*"]
+---
+apiVersion: rbac.authorization.k8s.io/v1beta1
+kind: ClusterRoleBinding
+metadata:
+ name: elasticsearch-operator
+ namespace: elasticsearch-operator
+roleRef:
+ apiGroup: rbac.authorization.k8s.io
+ kind: ClusterRole
+ name: elasticsearch-operator
+subjects:
+- kind: ServiceAccount
+ name: elasticsearch-operator
+ namespace: elasticsearch-operator
+---
+apiVersion: extensions/v1beta1
+kind: Deployment
+metadata:
+ name: elasticsearch-operator
+ namespace: elasticsearch-operator
+spec:
+ replicas: 1
+ template:
+ metadata:
+ labels:
+ name: elasticsearch-operator
+ spec:
+ containers:
+ - name: operator
+ image: upmcenterprises/elasticsearch-operator:0.2.0
+ imagePullPolicy: Always
+ env:
+ - name: NAMESPACE
+ valueFrom:
+ fieldRef:
+ fieldPath: metadata.namespace
+ ports:
+ - containerPort: 8000
+ name: http
+ livenessProbe:
+ httpGet:
+ path: /live
+ port: 8000
+ initialDelaySeconds: 10
+ timeoutSeconds: 10
+ readinessProbe:
+ httpGet:
+ path: /ready
+ port: 8000
+ initialDelaySeconds: 10
+ timeoutSeconds: 5
+ serviceAccount: elasticsearch-operator
diff --git a/k8s/filebeat.yaml b/k8s/filebeat.yaml
new file mode 100644
index 00000000..690e9613
--- /dev/null
+++ b/k8s/filebeat.yaml
@@ -0,0 +1,167 @@
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+ name: filebeat-config
+ namespace: kube-system
+ labels:
+ k8s-app: filebeat
+data:
+ filebeat.yml: |-
+ filebeat.config:
+ inputs:
+ # Mounted `filebeat-inputs` configmap:
+ path: ${path.config}/inputs.d/*.yml
+ # Reload inputs configs as they change:
+ reload.enabled: false
+ modules:
+ path: ${path.config}/modules.d/*.yml
+ # Reload module configs as they change:
+ reload.enabled: false
+
+ # To enable hints based autodiscover, remove `filebeat.config.inputs` configuration and uncomment this:
+ #filebeat.autodiscover:
+ # providers:
+ # - type: kubernetes
+ # hints.enabled: true
+
+ processors:
+ - add_cloud_metadata:
+
+ cloud.id: ${ELASTIC_CLOUD_ID}
+ cloud.auth: ${ELASTIC_CLOUD_AUTH}
+
+ output.elasticsearch:
+ hosts: ['${ELASTICSEARCH_HOST:elasticsearch}:${ELASTICSEARCH_PORT:9200}']
+ username: ${ELASTICSEARCH_USERNAME}
+ password: ${ELASTICSEARCH_PASSWORD}
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+ name: filebeat-inputs
+ namespace: kube-system
+ labels:
+ k8s-app: filebeat
+data:
+ kubernetes.yml: |-
+ - type: docker
+ containers.ids:
+ - "*"
+ processors:
+ - add_kubernetes_metadata:
+ in_cluster: true
+---
+apiVersion: extensions/v1beta1
+kind: DaemonSet
+metadata:
+ name: filebeat
+ namespace: kube-system
+ labels:
+ k8s-app: filebeat
+spec:
+ template:
+ metadata:
+ labels:
+ k8s-app: filebeat
+ spec:
+ serviceAccountName: filebeat
+ terminationGracePeriodSeconds: 30
+ containers:
+ - name: filebeat
+ image: docker.elastic.co/beats/filebeat-oss:7.0.1
+ args: [
+ "-c", "/etc/filebeat.yml",
+ "-e",
+ ]
+ env:
+ - name: ELASTICSEARCH_HOST
+ value: elasticsearch-es.default.svc.cluster.local
+ - name: ELASTICSEARCH_PORT
+ value: "9200"
+ - name: ELASTICSEARCH_USERNAME
+ value: elastic
+ - name: ELASTICSEARCH_PASSWORD
+ value: changeme
+ - name: ELASTIC_CLOUD_ID
+ value:
+ - name: ELASTIC_CLOUD_AUTH
+ value:
+ securityContext:
+ runAsUser: 0
+ # If using Red Hat OpenShift uncomment this:
+ #privileged: true
+ resources:
+ limits:
+ memory: 200Mi
+ requests:
+ cpu: 100m
+ memory: 100Mi
+ volumeMounts:
+ - name: config
+ mountPath: /etc/filebeat.yml
+ readOnly: true
+ subPath: filebeat.yml
+ - name: inputs
+ mountPath: /usr/share/filebeat/inputs.d
+ readOnly: true
+ - name: data
+ mountPath: /usr/share/filebeat/data
+ - name: varlibdockercontainers
+ mountPath: /var/lib/docker/containers
+ readOnly: true
+ volumes:
+ - name: config
+ configMap:
+ defaultMode: 0600
+ name: filebeat-config
+ - name: varlibdockercontainers
+ hostPath:
+ path: /var/lib/docker/containers
+ - name: inputs
+ configMap:
+ defaultMode: 0600
+ name: filebeat-inputs
+ # data folder stores a registry of read status for all files, so we don't send everything again on a Filebeat pod restart
+ - name: data
+ hostPath:
+ path: /var/lib/filebeat-data
+ type: DirectoryOrCreate
+---
+apiVersion: rbac.authorization.k8s.io/v1beta1
+kind: ClusterRoleBinding
+metadata:
+ name: filebeat
+subjects:
+- kind: ServiceAccount
+ name: filebeat
+ namespace: kube-system
+roleRef:
+ kind: ClusterRole
+ name: filebeat
+ apiGroup: rbac.authorization.k8s.io
+---
+apiVersion: rbac.authorization.k8s.io/v1beta1
+kind: ClusterRole
+metadata:
+ name: filebeat
+ labels:
+ k8s-app: filebeat
+rules:
+- apiGroups: [""] # "" indicates the core API group
+ resources:
+ - namespaces
+ - pods
+ verbs:
+ - get
+ - watch
+ - list
+---
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+ name: filebeat
+ namespace: kube-system
+ labels:
+ k8s-app: filebeat
+---
diff --git a/k8s/local-path-storage.yaml b/k8s/local-path-storage.yaml
new file mode 100644
index 00000000..7374a02a
--- /dev/null
+++ b/k8s/local-path-storage.yaml
@@ -0,0 +1,110 @@
+# This is a local copy of:
+# https://github.com/rancher/local-path-provisioner/blob/master/deploy/local-path-storage.yaml
+---
+apiVersion: v1
+kind: Namespace
+metadata:
+ name: local-path-storage
+---
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+ name: local-path-provisioner-service-account
+ namespace: local-path-storage
+---
+apiVersion: rbac.authorization.k8s.io/v1beta1
+kind: ClusterRole
+metadata:
+ name: local-path-provisioner-role
+ namespace: local-path-storage
+rules:
+- apiGroups: [""]
+ resources: ["nodes", "persistentvolumeclaims"]
+ verbs: ["get", "list", "watch"]
+- apiGroups: [""]
+ resources: ["endpoints", "persistentvolumes", "pods"]
+ verbs: ["*"]
+- apiGroups: [""]
+ resources: ["events"]
+ verbs: ["create", "patch"]
+- apiGroups: ["storage.k8s.io"]
+ resources: ["storageclasses"]
+ verbs: ["get", "list", "watch"]
+---
+apiVersion: rbac.authorization.k8s.io/v1beta1
+kind: ClusterRoleBinding
+metadata:
+ name: local-path-provisioner-bind
+ namespace: local-path-storage
+roleRef:
+ apiGroup: rbac.authorization.k8s.io
+ kind: ClusterRole
+ name: local-path-provisioner-role
+subjects:
+- kind: ServiceAccount
+ name: local-path-provisioner-service-account
+ namespace: local-path-storage
+---
+apiVersion: apps/v1beta2
+kind: Deployment
+metadata:
+ name: local-path-provisioner
+ namespace: local-path-storage
+spec:
+ replicas: 1
+ selector:
+ matchLabels:
+ app: local-path-provisioner
+ template:
+ metadata:
+ labels:
+ app: local-path-provisioner
+ spec:
+ serviceAccountName: local-path-provisioner-service-account
+ containers:
+ - name: local-path-provisioner
+ image: rancher/local-path-provisioner:v0.0.8
+ imagePullPolicy: Always
+ command:
+ - local-path-provisioner
+ - --debug
+ - start
+ - --config
+ - /etc/config/config.json
+ volumeMounts:
+ - name: config-volume
+ mountPath: /etc/config/
+ env:
+ - name: POD_NAMESPACE
+ valueFrom:
+ fieldRef:
+ fieldPath: metadata.namespace
+ volumes:
+ - name: config-volume
+ configMap:
+ name: local-path-config
+---
+apiVersion: storage.k8s.io/v1
+kind: StorageClass
+metadata:
+ name: local-path
+provisioner: rancher.io/local-path
+volumeBindingMode: WaitForFirstConsumer
+reclaimPolicy: Delete
+---
+kind: ConfigMap
+apiVersion: v1
+metadata:
+ name: local-path-config
+ namespace: local-path-storage
+data:
+ config.json: |-
+ {
+ "nodePathMap":[
+ {
+ "node":"DEFAULT_PATH_FOR_NON_LISTED_NODES",
+ "paths":["/opt/local-path-provisioner"]
+ }
+ ]
+ }
+
diff --git a/slides/k8s/operators-design.md b/slides/k8s/operators-design.md
new file mode 100644
index 00000000..9a02f2b8
--- /dev/null
+++ b/slides/k8s/operators-design.md
@@ -0,0 +1,356 @@
+## What does it take to write an operator?
+
+- Writing a quick-and-dirty operator, or a POC/MVP, is easy
+
+- Writing a robust operator is hard
+
+- We will describe the general idea
+
+- We will identify some of the associated challenges
+
+- We will list a few tools that can help us
+
+---
+
+## Top-down vs. bottom-up
+
+- Both approaches are possible
+
+- Let's see what they entail, and their respective pros and cons
+
+---
+
+## Top-down approach
+
+- Start with high-level design (see next slide)
+
+- Pros:
+
+ - can yield cleaner design that will be more robust
+
+- Cons:
+
+ - must be able to anticipate all the events that might happen
+
+  - design will be better only to the extent of what we anticipated
+
+ - hard to anticipate if we don't have production experience
+
+---
+
+## High-level design
+
+- What are we solving?
+
+ (e.g.: geographic databases backed by PostGIS with Redis caches)
+
+- What are our use-cases, stories?
+
+ (e.g.: adding/resizing caches and read replicas; load balancing queries)
+
+- What kind of outage do we want to address?
+
+ (e.g.: loss of individual node, pod, volume)
+
+- What are our *non-features*, the things we don't want to address?
+
+ (e.g.: loss of datacenter/zone; differentiating between read and write queries;
+
+ cache invalidation; upgrading to newer major versions of Redis, PostGIS, PostgreSQL)
+
+---
+
+## Low-level design
+
+- What Custom Resource Definitions do we need?
+
+ (one, many?)
+
+- How will we store configuration information?
+
+ (part of the CRD spec fields, annotations, other?)
+
+- Do we need to store state? If so, where?
+
+ - state that is small and doesn't change much can be stored via the Kubernetes API
+
+ (e.g.: leader information, configuration, credentials)
+
+ - things that are big and/or change a lot should go elsewhere
+
+ (e.g.: metrics, bigger configuration file like GeoIP)
+
+---
+
+class: extra-details
+
+## What can we store via the Kubernetes API?
+
+- The API server stores most Kubernetes resources into etcd
+
+- Etcd is designed for reliability, not for performance
+
+- If our storage needs exceed what etcd can offer, we need to use something else:
+
+ - either directly
+
+ - or by extending the API server
+
+    (for instance by using the aggregation layer, like [metrics server](https://github.com/kubernetes-incubator/metrics-server) does)
+
+---
+
+## Bottom-up approach
+
+- Start with existing Kubernetes resources (Deployment, Stateful Set...)
+
+- Run the system in production
+
+- Add scripts, automation, to facilitate day-to-day operations
+
+- Turn the scripts into an operator
+
+- Pros: simpler to get started; reflects actual use-cases
+
+- Cons: can result in convoluted designs requiring extensive refactoring
+
+---
+
+## General idea
+
+- Our operator will watch its CRDs *and associated resources*
+
+- Drawing state diagrams and finite state automata helps a lot
+
+- It's OK if some transitions lead to a big catch-all "human intervention"
+
+- Over time, we will learn about new failure modes and add to these diagrams
+
+- It's OK to start with CRD creation / deletion and prevent any modification
+
+ (that's the easy POC/MVP we were talking about)
+
+- *Presentation* and *validation* will help our users
+
+ (more on that later)
+
+---
+
+## Challenges
+
+- Reacting to infrastructure disruption can seem hard at first
+
+- Kubernetes gives us a lot of primitives to help:
+
+ - Pods and Persistent Volumes will *eventually* recover
+
+ - Stateful Sets give us easy ways to "add N copies" of a thing
+
+- The real challenges come with configuration changes
+
+ (i.e., what to do when our users update our CRDs)
+
+- Keep in mind that [some] of the [largest] cloud [outages] haven't been caused by [natural catastrophes], or even code bugs, but by configuration changes
+
+[some]: https://www.datacenterdynamics.com/news/gcp-outage-mainone-leaked-google-cloudflare-ip-addresses-china-telecom/
+[largest]: https://aws.amazon.com/message/41926/
+[outages]: https://aws.amazon.com/message/65648/
+[natural catastrophes]: https://www.datacenterknowledge.com/amazon/aws-says-it-s-never-seen-whole-data-center-go-down
+
+---
+
+## Configuration changes
+
+- It is helpful to analyze and understand how Kubernetes controllers work:
+
+ - watch resource for modifications
+
+ - compare desired state (CRD) and current state
+
+ - issue actions to converge state
+
+- Configuration changes will probably require *another* state diagram or FSA
+
+- Again, it's OK to have transitions labeled as "unsupported"
+
+ (i.e. reject some modifications because we can't execute them)
+
+---
+
+## Tools
+
+- CoreOS / RedHat Operator Framework
+
+ [GitHub](https://github.com/operator-framework)
+ |
+ [Blog](https://developers.redhat.com/blog/2018/12/18/introduction-to-the-kubernetes-operator-framework/)
+ |
+ [Intro talk](https://www.youtube.com/watch?v=8k_ayO1VRXE)
+ |
+ [Deep dive talk](https://www.youtube.com/watch?v=fu7ecA2rXmc)
+
+- Zalando Kubernetes Operator Pythonic Framework (KOPF)
+
+ [GitHub](https://github.com/zalando-incubator/kopf)
+ |
+ [Docs](https://kopf.readthedocs.io/)
+ |
+ [Step-by-step tutorial](https://kopf.readthedocs.io/en/stable/walkthrough/problem/)
+
+- Mesosphere Kubernetes Universal Declarative Operator (KUDO)
+
+ [GitHub](https://github.com/kudobuilder/kudo)
+ |
+ [Blog](https://mesosphere.com/blog/announcing-maestro-a-declarative-no-code-approach-to-kubernetes-day-2-operators/)
+ |
+ [Docs](https://kudo.dev/)
+ |
+ [Zookeeper example](https://github.com/kudobuilder/frameworks/tree/master/repo/stable/zookeeper)
+
+---
+
+## Validation
+
+- By default, a CRD is "free form"
+
+ (we can put pretty much anything we want in it)
+
+- When creating a CRD, we can provide an OpenAPI v3 schema
+ ([Example](https://github.com/amaizfinance/redis-operator/blob/master/deploy/crds/k8s_v1alpha1_redis_crd.yaml#L34))
+
+- The API server will then validate resources created/edited with this schema
+
+- If we need a stronger validation, we can use a Validating Admission Webhook:
+
+ - run an [admission webhook server](https://kubernetes.io/docs/reference/access-authn-authz/extensible-admission-controllers/#write-an-admission-webhook-server) to receive validation requests
+
+ - register the webhook by creating a [ValidatingWebhookConfiguration](https://kubernetes.io/docs/reference/access-authn-authz/extensible-admission-controllers/#configure-admission-webhooks-on-the-fly)
+
+ - each time the API server receives a request matching the configuration,
+
+    the request is sent to our server for validation
+
+---
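+
+class: extra-details
+
+## Example: CRD validation schema
+
+- Here is a minimal sketch of a CRD carrying an OpenAPI v3 schema
+
+  (illustrative only: it reuses the fields of our `elasticsearch-cluster.yaml`, but it is *not* the CRD that the operator actually creates; v1beta1 syntax)
+
+```yaml
+apiVersion: apiextensions.k8s.io/v1beta1
+kind: CustomResourceDefinition
+metadata:
+  name: elasticsearchclusters.enterprises.upmc.com
+spec:
+  group: enterprises.upmc.com
+  version: v1
+  scope: Namespaced
+  names:
+    plural: elasticsearchclusters
+    kind: ElasticsearchCluster
+  validation:
+    openAPIV3Schema:
+      properties:
+        spec:
+          properties:
+            data-node-replicas:
+              type: integer
+              minimum: 1      # reject clusters without at least one data node
+            use-ssl:
+              type: boolean
+```
+
+- With this schema in place, applying a resource with e.g. `data-node-replicas: 0` is rejected by the API server
+
+---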
+
+## Presentation
+
+- By default, `kubectl get mycustomresource` won't display much information
+
+ (just the name and age of each resource)
+
+- When creating a CRD, we can specify additional columns to print
+ ([Example](https://github.com/amaizfinance/redis-operator/blob/master/deploy/crds/k8s_v1alpha1_redis_crd.yaml#L6),
+ [Docs](https://kubernetes.io/docs/tasks/access-kubernetes-api/custom-resources/custom-resource-definitions/#additional-printer-columns))
+
+- By default, `kubectl describe mycustomresource` will also be generic
+
+- `kubectl describe` can show events related to our custom resources
+
+ (for that, we need to create Event resources, and fill the `involvedObject` field)
+
+- For scalable resources, we can define a `scale` sub-resource
+
+- This will enable the use of `kubectl scale` and other scaling-related operations
+
+---
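+
+class: extra-details
+
+## Example: printer columns and `scale` subresource
+
+- A sketch of the corresponding CRD fields (v1beta1 syntax; the paths assume a spec shaped like our `elasticsearch-cluster.yaml` and a `status.replicas` field maintained by the controller):
+
+```yaml
+  # excerpt of a CustomResourceDefinition spec
+  additionalPrinterColumns:
+  - name: Data
+    type: integer
+    JSONPath: .spec.data-node-replicas
+  - name: Age
+    type: date
+    JSONPath: .metadata.creationTimestamp
+  subresources:
+    scale:
+      specReplicasPath: .spec.data-node-replicas
+      statusReplicasPath: .status.replicas
+```
+
+- The first block adds columns to `kubectl get`; the second one enables `kubectl scale`
+
+---
+
+class: extra-details
+
+## Example: emitting an Event
+
+- A sketch of an Event that an operator could create so that it shows up in `kubectl describe` (reason and message are made up):
+
+```yaml
+apiVersion: v1
+kind: Event
+metadata:
+  name: es.scale-up-001
+  namespace: default
+type: Normal
+reason: ScalingDataNodes
+message: "scaling data nodes from 3 to 4"
+involvedObject:
+  apiVersion: enterprises.upmc.com/v1
+  kind: ElasticsearchCluster
+  name: es
+  namespace: default
+  # in practice, also set involvedObject.uid so kubectl describe can match the event
+```
+
+---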
+
+## About scaling
+
+- It is possible to use the HPA (Horizontal Pod Autoscaler) with CRDs
+
+- But it is not always desirable
+
+- The HPA works very well for homogeneous, stateless workloads
+
+- For other workloads, your mileage may vary
+
+- Some systems can scale across multiple dimensions
+
+ (for instance: increase number of replicas, or number of shards?)
+
+- If autoscaling is desired, the operator will have to take complex decisions
+
+ (example: Zalando's Elasticsearch Operator ([Video](https://www.youtube.com/watch?v=lprE0J0kAq0)))
+
+---
+
+## Versioning
+
+- As our operator evolves over time, we may have to change the CRD
+
+ (add, remove, change fields)
+
+- Like every other resource in Kubernetes, [custom resources are versioned](https://kubernetes.io/docs/tasks/access-kubernetes-api/custom-resources/custom-resource-definition-versioning/)
+
+- When creating a CRD, we need to specify a *list* of versions
+
+- Versions can be marked as `stored` and/or `served`
+
+---
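+
+class: extra-details
+
+## Example: declaring versions
+
+- A sketch of the `versions` list in a CRD (note that in the actual CRD spec, the "stored" flag is spelled `storage`):
+
+```yaml
+  # excerpt of a CustomResourceDefinition spec
+  versions:
+  - name: v1
+    served: true
+    storage: true     # exactly one version is the stored one
+  - name: v2alpha1
+    served: true
+    storage: false
+```
+
+---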
+
+## Stored version
+
+- Exactly one version has to be marked as the `stored` version
+
+- As the name implies, it is the one that will be stored in etcd
+
+- Resources in storage are never converted automatically
+
+ (we need to read and re-write them ourselves)
+
+- Yes, this means that we can have different versions in etcd at any time
+
+- Our code needs to handle all the versions that still exist in storage
+
+---
+
+## Served versions
+
+- By default, the Kubernetes API will serve resources "as-is"
+
+ (using their stored version)
+
+- It will assume that all versions are compatible storage-wise
+
+ (i.e. that the spec and fields are compatible between versions)
+
+- We can provide [conversion webhooks](https://kubernetes.io/docs/tasks/access-kubernetes-api/custom-resources/custom-resource-definition-versioning/#webhook-conversion) to "translate" requests
+
+ (the alternative is to upgrade all stored resources and stop serving old versions)
+
+---
+
+## Operator reliability
+
+- Remember that the operator itself must be resilient
+
+ (e.g.: the node running it can fail)
+
+- Our operator must be able to restart and recover gracefully
+
+- Do not store state locally
+
+ (unless we can reconstruct that state when we restart)
+
+- As indicated earlier, we can use the Kubernetes API to store data:
+
+ - in the custom resources themselves
+
+ - in other resources' annotations
+
+---
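+
+class: extra-details
+
+## Example: storing state in annotations
+
+- A sketch of small operator state kept on the custom resource itself
+
+  (the annotation keys below are made up; each operator picks its own)
+
+```yaml
+apiVersion: enterprises.upmc.com/v1
+kind: ElasticsearchCluster
+metadata:
+  name: es
+  annotations:
+    # hypothetical keys used by the operator to recover after a restart
+    es-operator.example.com/current-leader: es-data-1
+    es-operator.example.com/last-applied-config: "data-node-replicas=3"
+```
+
+---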
+
+## Beyond CRDs
+
+- CRDs cannot use custom storage (e.g. for time series data)
+
+- CRDs cannot support arbitrary subresources (like logs or exec for Pods)
+
+- CRDs cannot support protobuf (for faster, more efficient communication)
+
+- If we need these things, we can use the [aggregation layer](https://kubernetes.io/docs/concepts/extend-kubernetes/api-extension/apiserver-aggregation/) instead
+
+- The aggregation layer proxies all requests below a specific path to another server
+
+ (this is used e.g. by the metrics server)
+
+- [This documentation page](https://kubernetes.io/docs/concepts/extend-kubernetes/api-extension/custom-resources/#choosing-a-method-for-adding-custom-resources) compares the features of CRDs and API aggregation
diff --git a/slides/k8s/operators.md b/slides/k8s/operators.md
new file mode 100644
index 00000000..fdf6bbf6
--- /dev/null
+++ b/slides/k8s/operators.md
@@ -0,0 +1,389 @@
+# Operators
+
+- Operators are one of the many ways to extend Kubernetes
+
+- We will define operators
+
+- We will see how they work
+
+- We will install a specific operator (for ElasticSearch)
+
+- We will use it to provision an ElasticSearch cluster
+
+---
+
+## What are operators?
+
+*An operator represents **human operational knowledge in software,**
+to reliably manage an application.*
+
+— [CoreOS](https://coreos.com/blog/introducing-operators.html)
+
+Examples:
+
+- Deploying and configuring replication with MySQL, PostgreSQL ...
+
+- Setting up Elasticsearch, Kafka, RabbitMQ, Zookeeper ...
+
+- Reacting to failures when intervention is needed
+
+- Scaling up and down these systems
+
+---
+
+## What are they made from?
+
+- Operators combine two things:
+
+ - Custom Resource Definitions
+
+ - controller code watching the corresponding resources and acting upon them
+
+- A given operator can define one or multiple CRDs
+
+- The controller code (control loop) typically runs within the cluster
+
+ (running as a Deployment with 1 replica is a common scenario)
+
+- But it could also run elsewhere
+
+ (nothing mandates that the code run on the cluster, as long as it has API access)
+
+---
+
+## Why use operators?
+
+- Kubernetes gives us Deployments, StatefulSets, Services ...
+
+- These mechanisms give us building blocks to deploy applications
+
+- They work great for services that are made of *N* identical containers
+
+ (like stateless ones)
+
+- They also work great for some stateful applications like Consul, etcd ...
+
+ (with the help of highly persistent volumes)
+
+- They're not enough for complex services:
+
+ - where different containers have different roles
+
+ - where extra steps have to be taken when scaling or replacing containers
+
+---
+
+## Use-cases for operators
+
+- Systems with primary/secondary replication
+
+ Examples: MariaDB, MySQL, PostgreSQL, Redis ...
+
+- Systems where different groups of nodes have different roles
+
+ Examples: ElasticSearch, MongoDB ...
+
+- Systems with complex dependencies (that are themselves managed with operators)
+
+ Examples: Flink or Kafka, which both depend on Zookeeper
+
+---
+
+## More use-cases
+
+- Representing and managing external resources
+
+ (Example: [AWS Service Operator](https://operatorhub.io/operator/alpha/aws-service-operator.v0.0.1))
+
+- Managing complex cluster add-ons
+
+ (Example: [Istio operator](https://operatorhub.io/operator/beta/istio-operator.0.1.6))
+
+- Deploying and managing our applications' lifecycles
+
+ (more on that later)
+
+---
+
+## How operators work
+
+- An operator creates one or more CRDs
+
+ (i.e., it creates new "Kinds" of resources on our cluster)
+
+- The operator also runs a *controller* that will watch its resources
+
+- Each time we create/update/delete a resource, the controller is notified
+
+ (we could write our own cheap controller with `kubectl get --watch`)
+
+---
+
+## One operator in action
+
+- We will install the UPMC Enterprises ElasticSearch operator
+
+- This operator requires PersistentVolumes
+
+- We will install Rancher's [local path storage provisioner](https://github.com/rancher/local-path-provisioner) to automatically create these
+
+- Then, we will create an ElasticSearch resource
+
+- The operator will detect that resource and provision the cluster
+
+---
+
+## Installing a Persistent Volume provisioner
+
+(This step can be skipped if you already have a dynamic volume provisioner.)
+
+- This provisioner creates Persistent Volumes backed by `hostPath`
+
+ (local directories on our nodes)
+
+- It doesn't require anything special ...
+
+- ... But losing a node = losing the volumes on that node!
+
+.exercise[
+
+- Install the local path storage provisioner:
+ ```bash
+ kubectl apply -f ~/container.training/k8s/local-path-storage.yaml
+ ```
+
+]
+
+---
+
+## Making sure we have a default StorageClass
+
+- The ElasticSearch operator will create StatefulSets
+
+- These StatefulSets will instantiate PersistentVolumeClaims
+
+- These PVCs need to be explicitly associated with a StorageClass
+
+- Or we need to tag a StorageClass to be used as the default one
+
+.exercise[
+
+- List StorageClasses:
+ ```bash
+ kubectl get storageclasses
+ ```
+
+]
+
+We should see the `local-path` StorageClass.
+
+---
+
+## Setting a default StorageClass
+
+- This is done by adding an annotation to the StorageClass:
+
+ `storageclass.kubernetes.io/is-default-class: true`
+
+.exercise[
+
+- Tag the StorageClass so that it's the default one:
+ ```bash
+ kubectl annotate storageclass local-path \
+ storageclass.kubernetes.io/is-default-class=true
+ ```
+
+- Check the result:
+ ```bash
+ kubectl get storageclasses
+ ```
+
+]
+
+Now, the StorageClass should have `(default)` next to its name.
+
+---
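+
+class: extra-details
+
+## Setting the default StorageClass declaratively
+
+- Instead of annotating with `kubectl annotate`, the annotation can also be set in the StorageClass manifest itself
+
+  (sketch below; the other fields are the ones from `local-path-storage.yaml`)
+
+```yaml
+apiVersion: storage.k8s.io/v1
+kind: StorageClass
+metadata:
+  name: local-path
+  annotations:
+    storageclass.kubernetes.io/is-default-class: "true"
+provisioner: rancher.io/local-path
+volumeBindingMode: WaitForFirstConsumer
+reclaimPolicy: Delete
+```
+
+---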
+
+## Install the ElasticSearch operator
+
+- The operator needs:
+
+ - a Deployment for its controller
+ - a ServiceAccount, ClusterRole, ClusterRoleBinding for permissions
+ - a Namespace
+
+- We have grouped all the definitions for these resources in a YAML file
+
+.exercise[
+
+- Install the operator:
+ ```bash
+ kubectl apply -f ~/container.training/k8s/elasticsearch-operator.yaml
+ ```
+
+]
+
+---
+
+## Wait for the operator to be ready
+
+- Some operators require their CRDs to be created separately
+
+- This operator will create its CRD itself
+
+ (i.e. the CRD is not listed in the YAML that we applied earlier)
+
+.exercise[
+
+- Wait until the `elasticsearchclusters` CRD shows up:
+ ```bash
+ kubectl get crds
+ ```
+
+]
+
+---
+
+## Create an ElasticSearch resource
+
+- We can now create a resource with `kind: ElasticsearchCluster`
+
+- The YAML for that resource will specify all the desired parameters:
+
+ - how many nodes do we want of each type (client, master, data)
+ - image to use
+ - add-ons (kibana, cerebro, ...)
+ - whether to use TLS or not
+ - etc.
+
+.exercise[
+
+- Create our ElasticSearch cluster:
+ ```bash
+ kubectl apply -f ~/container.training/k8s/elasticsearch-cluster.yaml
+ ```
+
+]
+
+---
+
+## Operator in action
+
+- Over the next minutes, the operator will create:
+
+ - StatefulSets (one for master nodes, one for data nodes)
+
+ - Deployments (for client nodes; and for add-ons like cerebro and kibana)
+
+ - Services (for all these pods)
+
+.exercise[
+
+- Wait for all the StatefulSets to be fully up and running:
+ ```bash
+ kubectl get statefulsets -w
+ ```
+
+]
+
+---
+
+## Connecting to our cluster
+
+- Since connecting directly to the ElasticSearch API is a bit raw,
+
+  we'll connect to the cerebro frontend instead
+
+.exercise[
+
+- Edit the cerebro service to change its type from ClusterIP to NodePort:
+ ```bash
+ kubectl patch svc cerebro-es -p "spec: { type: NodePort }"
+ ```
+
+- Retrieve the NodePort that was allocated:
+ ```bash
+  kubectl get svc cerebro-es
+ ```
+
+- Connect to that port with a browser
+
+]
+
+---
+
+## (Bonus) Setup filebeat
+
+- Let's send some data to our brand new ElasticSearch cluster!
+
+- We'll deploy a filebeat DaemonSet to collect node logs
+
+.exercise[
+
+- Deploy filebeat:
+ ```bash
+ kubectl apply -f ~/container.training/k8s/filebeat.yaml
+ ```
+
+]
+
+We should see at least one index being created in cerebro.
+
+---
+
+## (Bonus) Access log data with kibana
+
+- Let's expose kibana (by making kibana-es a NodePort too)
+
+- Then access kibana
+
+- We'll need to configure kibana indexes
+
+---
+
+## Deploying our apps with operators
+
+- It is very simple to deploy with `kubectl run` / `kubectl expose`
+
+- We can unlock more features by writing YAML and using `kubectl apply`
+
+- Kustomize or Helm let us deploy in multiple environments
+
+ (and adjust/tweak parameters in each environment)
+
+- We can also use an operator to deploy our application
+
+---
+
+## Pros and cons of deploying with operators
+
+- The app definition and configuration is persisted in the Kubernetes API
+
+- Multiple instances of the app can be manipulated with `kubectl get`
+
+- We can add labels, annotations to the app instances
+
+- Our controller can execute custom code for any lifecycle event
+
+- However, we need to write this controller
+
+- We need to be careful about changes
+
+ (what happens when the resource `spec` is updated?)
+
+---
+
+## Operators are not magic
+
+- Look at the ElasticSearch resource definition
+
+ (`~/container.training/k8s/elasticsearch-cluster.yaml`)
+
+- What should happen if we flip the `use-ssl` flag? Twice?
+
+- What should happen if we remove / re-add the kibana or cerebro sections?
+
+- What should happen if we change the number of nodes?
+
+- What if we want different images or parameters for the different nodes?
+
+*Operators can be very powerful, iff we know exactly the scenarios that they can handle.*
diff --git a/slides/kadm-fourday.yml b/slides/kadm-fourday.yml
index 6b6eb9d4..8c713631 100644
--- a/slides/kadm-fourday.yml
+++ b/slides/kadm-fourday.yml
@@ -93,5 +93,3 @@ chapters:
- - k8s/lastwords-admin.md
- k8s/links.md
- shared/thankyou.md
-
-