🍀 SUADEO - 4 days, 5 hours/day

Fix #636 : kustomize commonLabels typo error
➕ Add ngrok token instructions
2026-02-28 16:30:21 +00:00 · 2024-03-10 20:05:12 +01:00 · 2024-02-28 06:08:43 +01:00 · 2024-02-21 23:40:19 +01:00 · 2024-02-21 22:04:33 +01:00 · 2024-02-21 22:02:34 +01:00
28 changed files with 1237 additions and 175 deletions
--- a/dockercoins/hasher/Dockerfile
+++ b/dockercoins/hasher/Dockerfile
@@ -1,6 +1,6 @@
 FROM ruby:alpine
 RUN apk add --update build-base curl
-RUN gem install sinatra
+RUN gem install sinatra --version '~> 3'
 RUN gem install thin
 ADD hasher.rb /
 CMD ["ruby", "hasher.rb"]
--- a/k8s/pod-disruption-budget.yaml
+++ b/k8s/pod-disruption-budget.yaml
@@ -0,0 +1,13 @@
+apiVersion: policy/v1
+kind: PodDisruptionBudget
+metadata:
+  name: my-pdb
+spec:
+  #minAvailable: 2
+  #minAvailable: 90%
+  maxUnavailable: 1
+  #maxUnavailable: 10%
+  selector:
+    matchLabels:
+      app: my-app
+
--- a/prepare-labs/dns-netlify.sh
+++ b/prepare-labs/dns-netlify.sh
@@ -12,7 +12,7 @@
  echo "$0 del <recordid>"
  echo ""
  echo "Example to create a A record for eu.container.training:"
-  echo "$0 add eu 185.145.250.0"
+  echo "$0 add eu A 185.145.250.0"
  echo ""
  exit 1
 }
@@ -49,27 +49,29 @@ ZONE_ID=$(netlify dns_zones |

 _list() {
  netlify dns_zones/$ZONE_ID/dns_records |
-    jq -r '.[] | select(.type=="A") | [.hostname, .type, .value, .id] | @tsv'
+    jq -r '.[] | select(.type=="A" or .type=="AAAA") | [.hostname, .type, .value, .id] | @tsv' |
+    sort |
+    column --table
 }

 _add() {
  NAME=$1.$DOMAIN
-  ADDR=$2
-
+  TYPE=$2
+  VALUE=$3

  # It looks like if we create two identical records, then delete one of them,
  # Netlify DNS ends up in a weird state (the name doesn't resolve anymore even
  # though it's still visible through the API and the website?)

  if netlify dns_zones/$ZONE_ID/dns_records |
-          jq '.[] | select(.hostname=="'$NAME'" and .type=="A" and .value=="'$ADDR'")' |
+          jq '.[] | select(.hostname=="'$NAME'" and .type=="'$TYPE'" and .value=="'$VALUE'")' |
          grep .
  then
    echo "It looks like that record already exists. Refusing to create it."
    exit 1
  fi

-  netlify dns_zones/$ZONE_ID/dns_records type=A hostname=$NAME value=$ADDR ttl=300
+  netlify dns_zones/$ZONE_ID/dns_records type=$TYPE hostname=$NAME value=$VALUE ttl=300

  netlify dns_zones/$ZONE_ID/dns_records |
          jq '.[] | select(.hostname=="'$NAME'")'
@@ -88,7 +90,7 @@ case "$1" in
    _list
    ;;
  add)
-    _add $2 $3
+    _add $2 $3 $4
    ;;
  del)
    _del $2
--- a/prepare-labs/konk.sh
+++ b/prepare-labs/konk.sh
@@ -13,6 +13,8 @@ TF_VAR_location=fr-par-2 \
 # set kubeconfig file
 cp tags/konk/stage2/kubeconfig.101 ~/kubeconfig

+export KUBECONFIG=~/kubeconfig
+
 # set external_ip labels
 kubectl get nodes -o=jsonpath='{range .items[*]}{.metadata.name} {.status.addresses[?(@.type=="ExternalIP")].address}{"\n"}{end}' |
 while read node address; do
--- a/prepare-labs/lib/commands.sh
+++ b/prepare-labs/lib/commands.sh
@@ -421,18 +421,18 @@ _cmd_kubebins() {
    TAG=$1
    need_tag

-    ##VERSION##
    if [ "$KUBEVERSION" = "" ]; then
        KUBEVERSION="$(curl -fsSL https://cdn.dl.k8s.io/release/stable.txt | sed s/^v//)"
    fi

+    ##VERSION##
    case "$KUBEVERSION" in
    1.19.*)
      ETCD_VERSION=v3.4.13
      CNI_VERSION=v0.8.7
      ;;
    *)
-      ETCD_VERSION=v3.5.9
+      ETCD_VERSION=v3.5.10
      CNI_VERSION=v1.3.0
      ;;
    esac
@@ -466,24 +466,36 @@ _cmd_kubepkgs() {
    TAG=$1
    need_tag

-    if [ "$KUBEVERSION" ]; then
-        pssh "
-        sudo tee /etc/apt/preferences.d/kubernetes <<EOF
+    # Prior to September 2023, there was a single Kubernetes package repo that
+    # contained packages for all versions, so we could just add that repo
+    # and install whatever was the latest version available there.
+    # Things have changed (versions after September 2023, e.g. 1.28.3 are
+    # not in the old repo) and now there is a different repo for each
+    # minor version, so we need to figure out what minor version we are
+    # installing to add the corresponding repo.
+    if [ "$KUBEVERSION" = "" ]; then
+        KUBEVERSION="$(curl -fsSL https://cdn.dl.k8s.io/release/stable.txt | sed s/^v//)"
+    fi
+    KUBEREPOVERSION="$(echo $KUBEVERSION | cut -d. -f1-2)"
+
+    # Since the new repo doesn't have older versions, add a safety check here.
+    MINORVERSION="$(echo $KUBEVERSION | cut -d. -f2)"
+    if [ "$MINORVERSION" -lt 24 ]; then
+        die "Cannot install kubepkgs for versions before 1.24."
+    fi
+
+    pssh "
+    sudo tee /etc/apt/preferences.d/kubernetes <<EOF
 Package: kubectl kubeadm kubelet
 Pin: version $KUBEVERSION-*
 Pin-Priority: 1000
 EOF"
-    fi
-
-    # As of February 27th, 2023, packages.cloud.google.com seems broken
-    # (serves HTTP 500 errors for the GPG key), so let's pre-load that key.
-    pssh -I "sudo apt-key add -" < lib/kubernetes-apt-key.gpg

    # Install packages
    pssh --timeout 200 "
-    #curl -s https://packages.cloud.google.com/apt/doc/apt-key.gpg |
-    #sudo apt-key add - &&
-    echo deb http://apt.kubernetes.io/ kubernetes-xenial main |
+    curl -fsSL https://pkgs.k8s.io/core:/stable:/v$KUBEREPOVERSION/deb/Release.key | 
+    gpg --dearmor | sudo tee /etc/apt/keyrings/kubernetes-apt-keyring.gpg &&
+    echo 'deb [signed-by=/etc/apt/keyrings/kubernetes-apt-keyring.gpg] https://pkgs.k8s.io/core:/stable:/v$KUBEREPOVERSION/deb/ /' |
    sudo tee /etc/apt/sources.list.d/kubernetes.list"
    pssh --timeout 200 "
    sudo apt-get update -q &&
@@ -729,7 +741,7 @@ EOF
    # Install popeye
    pssh "
    if [ ! -x /usr/local/bin/popeye ]; then
-        FILENAME=popeye_Linux_$HERP_DERP_ARCH.tar.gz &&
+        FILENAME=popeye_Linux_$ARCH.tar.gz &&
        curl -fsSL https://github.com/derailed/popeye/releases/latest/download/\$FILENAME |
        sudo tar -zxvf- -C /usr/local/bin popeye
        popeye version
@@ -817,6 +829,14 @@ EOF
        sudo tar -zxvf- -C /usr/local/bin kubent
        kubent --version
    fi"
+
+    # Ngrok. Note that unfortunately, this is the x86_64 binary.
+    # We might have to rethink how to handle this for multi-arch environments.
+    pssh "
+    if [ ! -x /usr/local/bin/ngrok ]; then
+        curl -fsSL https://bin.equinox.io/c/bNyj1mQVY4c/ngrok-v3-stable-linux-amd64.tgz |
+        sudo tar -zxvf- -C /usr/local/bin ngrok
+    fi"
 }

 _cmd kubereset "Wipe out Kubernetes configuration on all nodes"
@@ -950,12 +970,19 @@ _cmd_standardize() {
    # Disable unattended upgrades so that they don't mess up with the subsequent steps
    pssh sudo rm -f /etc/apt/apt.conf.d/50unattended-upgrades

-    # Digital Ocean's cloud init disables password authentication; re-enable it.
+    # Some cloud providers think that it's smart to disable password authentication.
+    # We need to re-enable it, though.
+    # Digital Ocean
    pssh "
    if [ -f /etc/ssh/sshd_config.d/50-cloud-init.conf ]; then
        sudo rm /etc/ssh/sshd_config.d/50-cloud-init.conf
        sudo systemctl restart ssh.service
    fi"
+    # AWS
+    pssh "if [ -f /etc/ssh/sshd_config.d/60-cloudimg-settings.conf ]; then
+        sudo rm /etc/ssh/sshd_config.d/60-cloudimg-settings.conf
+        sudo systemctl restart ssh.service
+    fi"

    # Special case for oracle since their iptables blocks everything but SSH
    pssh "
--- a/prepare-labs/lib/kubernetes-apt-key.gpg
+++ b/prepare-labs/lib/kubernetes-apt-key.gpg
--- a/prepare-labs/lib/pssh.sh
+++ b/prepare-labs/lib/pssh.sh
@@ -17,6 +17,12 @@ pssh() {

    echo "[parallel-ssh] $@"

+    # There are some routers that really struggle with the number of TCP
+    # connections that we open when deploying large fleets of clusters.
+    # We're adding a 1 second delay here, but this can be cranked up if
+    # necessary - or down to zero, too.
+    sleep ${PSSH_DELAY_PRE-1}
+
    $(which pssh || which parallel-ssh) -h $HOSTFILE -l ubuntu \
        --par ${PSSH_PARALLEL_CONNECTIONS-100} \
        --timeout 300 \
--- a/prepare-labs/map-dns.sh
+++ b/prepare-labs/map-dns.sh
@@ -0,0 +1,16 @@
+#!/bin/sh
+
+DOMAINS=~/Dropbox/domains.txt
+IPS=ips.txt
+
+. ./dns-cloudflare.sh
+
+paste "$DOMAINS" "$IPS" | while read domain ips; do
+  if ! [ "$domain" ]; then
+    echo "⚠️ No more domains!"
+    exit 1
+  fi
+  _clear_zone "$domain"
+  _populate_zone "$domain" $ips
+done
+echo "✅ All done."
--- a/prepare-labs/settings/admin-oldversion.env
+++ b/prepare-labs/settings/admin-oldversion.env
@@ -7,7 +7,7 @@ USER_PASSWORD=training

 # For a list of old versions, check:
 # https://kubernetes.io/releases/patch-releases/#non-active-branch-history
-KUBEVERSION=1.22.5
+KUBEVERSION=1.24.14

 STEPS="
  wait
--- a/prepare-labs/settings/portal.env
+++ b/prepare-labs/settings/portal.env
@@ -1,5 +1,7 @@
 #export TF_VAR_node_size=GP2.4
 #export TF_VAR_node_size=g6-standard-6
+#export TF_VAR_node_size=m7i.xlarge
+

 CLUSTERSIZE=1

--- a/prepare-labs/terraform/one-kubernetes/scaleway/main.tf
+++ b/prepare-labs/terraform/one-kubernetes/scaleway/main.tf
@@ -1,10 +1,23 @@
+resource "scaleway_vpc_private_network" "_" {
+}
+
+# This is a kind of hack to use a custom security group with Kapsule.
+# See https://www.scaleway.com/en/docs/containers/kubernetes/reference-content/secure-cluster-with-private-network/
+
+resource "scaleway_instance_security_group" "_" {
+  name                    = "kubernetes ${split("/", scaleway_k8s_cluster._.id)[1]}"
+  inbound_default_policy  = "accept"
+  outbound_default_policy = "accept"
+}
+
 resource "scaleway_k8s_cluster" "_" {
-  name = var.cluster_name
-  #region                     = var.location
+  name                        = var.cluster_name
  tags                        = var.common_tags
  version                     = local.k8s_version
+  type                        = "kapsule"
  cni                         = "cilium"
  delete_additional_resources = true
+  private_network_id          = scaleway_vpc_private_network._.id
 }

 resource "scaleway_k8s_pool" "_" {
@@ -17,6 +30,7 @@ resource "scaleway_k8s_pool" "_" {
  max_size    = var.max_nodes_per_pool
  autoscaling = var.max_nodes_per_pool > var.min_nodes_per_pool
  autohealing = true
+  depends_on = [ scaleway_instance_security_group._ ]
 }

 data "scaleway_k8s_version" "_" {
--- a/slides/_redirects
+++ b/slides/_redirects
@@ -2,7 +2,7 @@
 #/ /kube-halfday.yml.html 200!
 #/ /kube-fullday.yml.html 200!
 #/ /kube-twodays.yml.html 200!
-/ /dojo.yml.html 200!
+/ /all.yml.html 200!

 # And this allows to do "git clone https://container.training".
 /info/refs service=git-upload-pack https://github.com/jpetazzo/container.training/info/refs?service=git-upload-pack
--- a/slides/all.yml
+++ b/slides/all.yml
@@ -0,0 +1,110 @@
+title: |
+  Docker & Kubernetes
+
+chat: "[FIXME](https://FIXME.container.training/mattermost/)"
+
+gitrepo: github.com/jpetazzo/container.training
+
+slides: https://2024-04-suadeo.container.training/
+
+#slidenumberprefix: "#SomeHashTag &mdash; "
+
+exclude:
+- self-paced
+
+content:
+- shared/title.md
+- logistics.md
+- containers/intro.md
+- shared/about-slides.md
+- shared/chat-room-im.md
+#- shared/chat-room-zoom-meeting.md
+#- shared/chat-room-zoom-webinar.md
+- shared/toc.md
+- # DAY 1
+  #- containers/Docker_Overview.md
+  #- containers/Docker_History.md
+  - containers/Training_Environment.md
+  #- containers/Installing_Docker.md
+  - containers/First_Containers.md
+  - containers/Background_Containers.md
+  - containers/Initial_Images.md
+  - containers/Building_Images_Interactively.md
+  - containers/Building_Images_With_Dockerfiles.md
+  - containers/Cmd_And_Entrypoint.md
+  - containers/Copying_Files_During_Build.md
+  - containers/Exercise_Dockerfile_Basic.md
+  - containers/Dockerfile_Tips.md
+  - containers/Multi_Stage_Builds.md
+  - containers/Container_Networking_Basics.md
+- # DAY 2
+  - containers/Local_Development_Workflow.md
+  - containers/Getting_Inside.md
+  - containers/Container_Network_Model.md
+  - containers/Compose_For_Dev_Stacks.md
+  - containers/Exercise_Composefile.md
+  - containers/Exercise_Dockerfile_Advanced.md
+  - |
+    # Kubernetes
+  - shared/connecting.md
+  #- k8s/versions-k8s.md
+  - shared/sampleapp.md
+  #- shared/composescale.md
+  #- shared/hastyconclusions.md
+  - shared/composedown.md
+  - k8s/concepts-k8s.md
+- # DAY 3
+  - k8s/kubectlget.md
+  - k8s/kubectl-run.md
+  - k8s/kubectlexpose.md
+  - k8s/service-types.md
+  - k8s/kubenet.md
+  - k8s/shippingimages.md
+  #- k8s/buildshiprun-selfhosted.md
+  - k8s/buildshiprun-dockerhub.md
+  - k8s/labels-annotations.md
+  - k8s/kubectl-logs.md
+  - k8s/logs-cli.md
+  - exercises/k8sfundamentals-details.md
+  #- k8s/exercise-wordsmith.md
+  - k8s/ourapponkube.md
+  #- k8s/setup-overview.md
+  - k8s/setup-devel.md
+  #- k8s/setup-managed.md
+  #- k8s/setup-selfhosted.md
+  - k8s/localkubeconfig.md
+  - k8s/accessinternal.md
+  #- k8s/kubectlproxy.md
+  - shared/declarative.md
+  - k8s/declarative.md
+  - k8s/deploymentslideshow.md
+  - exercises/localcluster-details.md
+- # DAY 4
+  #- k8s/kubectlscale.md
+  - shared/yaml.md
+  - k8s/yamldeploy.md
+  - k8s/namespaces.md
+  - k8s/scalingdockercoins.md
+  - shared/hastyconclusions.md
+  - k8s/daemonset.md
+  - k8s/rollout.md
+  - k8s/healthchecks.md
+  #- k8s/healthchecks-more.md
+  - k8s/volumes.md
+  - k8s/configuration.md
+  - k8s/secrets.md
+  - exercises/yaml-details.md
+  - shared/thankyou.md
+- 
+  - |
+    # (Docker extras)
+  - containers/Start_And_Attach.md
+  - containers/Naming_And_Inspecting.md
+  - containers/Labels.md
+  - containers/Advanced_Dockerfiles.md
+  - containers/Network_Drivers.md
+-
+  - |
+    # (Kubernetes extras)
+  - k8s/k9s.md
+  - k8s/ingress.md
--- a/slides/dojo.yml
+++ b/slides/dojo.yml
@@ -1,15 +0,0 @@
-title: |
-  FIXME
-
-#chat: "[Slack](https://dockercommunity.slack.com/messages/C7GKACWDV)"
-#chat: "[Gitter](https://gitter.im/jpetazzo/training-20180413-paris)"
-chat: "FIXME"
-
-gitrepo: github.com/jpetazzo/container.training
-
-slides: https://2023-11-dojo.container.training/
-
-#slidenumberprefix: "#SomeHashTag &mdash; "
-
-content:
- fixme.md
--- a/slides/exercises/polykuberbac-brief.md
+++ b/slides/exercises/polykuberbac-brief.md
@@ -0,0 +1,11 @@
+## Exercise — Enable RBAC
+
+- Enable RBAC on a manually-deployed control plane
+
+- This involves:
+
+  - generating different certificates
+
+  - distributing the certificates to the controllers
+
+  - enabling the proper authorizers in API server
--- a/slides/exercises/polykuberbac-details.md
+++ b/slides/exercises/polykuberbac-details.md
@@ -0,0 +1,117 @@
+# Exercise — Enable RBAC
+
+- We want to enable RBAC on the "polykube" cluster
+
+  (it doesn't matter whether we have 1 or multiple nodes)
+
+- Ideally, we want to have, for instance:
+
+  - one key, certificate, and kubeconfig for a cluster admin
+
+  - one key, certificate, and kubeconfig for a user
+    <br/>
+    (with permissions in a single namespace)
+
+- Bonus points: enable the NodeAuthorizer too!
+
+- Check the following slides for hints
+
+---
+
+## Step 1
+
+- Enable RBAC itself!
+
+--
+
+- This is done with an API server command-line flag
+
+--
+
+- Check [the documentation][kube-apiserver-doc] to see the flag
+
+--
+
+- For now, only enable `--authorization-mode=RBAC`
+
+[kube-apiserver-doc]: https://kubernetes.io/docs/reference/command-line-tools-reference/kube-apiserver/
+
+---
+
+## Step 2
+
+- Our certificate doesn't work anymore, we need to generate a new one
+
+--
+
+- We need a certificate that will have *some* (ideally *all*) permissions
+
+--
+
+- Two options:
+
+  - use the equivalent of "root" (identity that completely skips permission checks)
+
+  - a "non-root" identity but which is granted permissions with RBAC
+
+--
+
+- The "non-root" option looks nice, but to grant permissions, we need permissions
+
+- So let's start with the equivalent of "root"!
+
+--
+
+- The Kubernetes equivalent of `root` is the group `system:masters`
+
+---
+
+## Step 2, continued
+
+- We need to generate a certificate for a user belonging to group `system:masters`
+
+--
+
+- In Kubernetes certificates, groups are encoded with the "organization" field
+
+--
+
+- That corresponds to `O=system:masters`
+
+--
+
+- In other words we need to generate a new certificate, but with a subject of:
+
+  `/CN=admin/O=system:masters/` (the `CN` doesn't matter)
+
+- That certificate should be able to interact with the API server, like before
+
+---
+
+## Step 3
+
+- Now, all our controllers have permissions issues
+
+- We need to either:
+
+  - use that `system:masters` cert everywhere
+
+  - generate different certs for every controller, with the proper identities
+
+- Suggestion: use `system:masters` everywhere to begin with
+
+  (and make sure the cluster is back on its feet)
+
+---
+
+## Step 4
+
+At this point, there are two possible forks in the road:
+
+1. Generate certs for the control plane controllers
+
+   (`kube-controller-manager`, `kube-scheduler`)
+
+2. Generate cert(s) for the node(s) and enable `NodeAuthorizer`
+
+Good luck!
--- a/slides/k8s/admission.md
+++ b/slides/k8s/admission.md
@@ -198,6 +198,64 @@ Some examples ...

  (the Node "echo" app, the Flask app, and one ngrok tunnel for each of them)

+- We will need an ngrok account for the tunnels
+
+  (a free account is fine)
+
+---
+
+class: extra-details
+
+## What's ngrok?
+
+- Ngrok provides secure tunnels to access local services
+
+- Example: run `ngrok http 1234`
+
+- `ngrok` will display a publicly-available URL (e.g. https://xxxxyyyyzzzz.ngrok.app)
+
+- Connections to https://xxxxyyyyzzzz.ngrok.app will terminate at `localhost:1234`
+
+- Basic product is free; extra features (vanity domains, end-to-end TLS...) for $$$
+
+- Perfect to develop our webhook!
+
+---
+
+class: extra-details
+
+## Ngrok in production
+
+- Ngrok was initially known for its local webhook development features
+
+- It now supports production scenarios as well
+
+  (load balancing, WAF, authentication, circuit-breaking...)
+
+- Including some that are very relevant to Kubernetes
+
+  (e.g. [ngrok Ingress Controller](https://github.com/ngrok/kubernetes-ingress-controller))
+
+---
+
+## Ngrok tokens
+
+- If you're attending a live training, you might have an ngrok token
+
+- Look in `~/ngrok.env` and if that file exists, copy it to the stack:
+
+.lab[
+
+```bash
+cp ~/ngrok.env ~/container.training/webhooks/admission/.env
+```
+
+]
+
+---
+
+## Starting the whole stack
+
 .lab[

 - Go to the webhook directory:
@@ -216,28 +274,6 @@ Some examples ...

 ---

-class: extra-details
-
-## What's ngrok?
-
- Ngrok provides secure tunnels to access local services
-
- Example: run `ngrok http 1234`
-
- `ngrok` will display a publicly-available URL (e.g. https://xxxxyyyyzzzz.ngrok.io)
-
- Connections to https://xxxxyyyyzzzz.ngrok.io will terminate at `localhost:1234`
-
- Basic product is free; extra features (vanity domains, end-to-end TLS...) for $$$
-
- Perfect to develop our webhook!
-
- Probably not for production, though
-
-  (webhook requests and responses now pass through the ngrok platform)
-
---
-
 ## Update the webhook configuration

 - We have a webhook configuration in `k8s/webhook-configuration.yaml`
@@ -543,6 +579,23 @@ Shell to the rescue!

  (it should only allow values of `red`, `green`, `blue`)

+---
+
+## Coming soon...
+
+- Kubernetes Validating Admission Policies
+
+- Integrated with the Kubernetes API server
+
+- Lets us define policies using [CEL (Common Expression Language)][cel-spec]
+
+- Available in beta in Kubernetes 1.28 <!-- ##VERSION## -->
+
+- Check this [CNCF Blog Post][cncf-blog-vap] for more details
+
+[cncf-blog-vap]: https://www.cncf.io/blog/2023/09/14/policy-management-in-kubernetes-is-changing/
+[cel-spec]: https://github.com/google/cel-spec
+
 ???

 :EN:- Dynamic admission control with webhooks
--- a/slides/k8s/authn-authz.md
+++ b/slides/k8s/authn-authz.md
@@ -856,7 +856,7 @@ class: extra-details
 - To learn more about Kubernetes attacks and threat models around RBAC:

  📽️ [Hacking into Kubernetes Security for Beginners](https://www.youtube.com/watch?v=mLsCm9GVIQg)
-  by [Ellen Körbes](https://twitter.com/ellenkorbes)
+  by [V Körbes](https://twitter.com/veekorbes)
  and [Tabitha Sable](https://twitter.com/TabbySable)

 ---
--- a/slides/k8s/cluster-upgrade.md
+++ b/slides/k8s/cluster-upgrade.md
@@ -507,6 +507,86 @@ kubeadm should now agree to upgrade to 1.23.X.

 ---

+## And now, was that a good idea?
+
+--
+
+**Almost!**
+
+--
+
+- The official recommendation is to *drain* a node before performing node maintenance
+
+  (migrate all workloads off the node before upgrading it)
+
+- How do we do that?
+
+- Is it really necessary?
+
+- Let's see!
+
+---
+
+## Draining a node
+
+- This can be achieved with the `kubectl drain` command, which will:
+
+  - *cordon* the node (prevent new pods from being scheduled there)
+
+  - *evict* all the pods running on the node (delete them gracefully)
+
+  - the evicted pods will automatically be recreated somewhere else
+
+  - evictions might be blocked in some cases (Pod Disruption Budgets, `emptyDir` volumes)
+
+- Once the node is drained, it can safely be upgraded, restarted...
+
+- Once it's ready, it can be put back in commission with `kubectl uncordon`
+
+---
+
+## Is it necessary?
+
+- When upgrading kubelet from one patch-level version to another:
+
+  - it's *probably fine*
+
+- When upgrading system packages:
+
+  - it's *probably fine*
+
+  - except [when it's not][datadog-systemd-outage]
+
+- When upgrading the kernel:
+
+  - it's *probably fine*
+
+  - ...as long as we can tolerate a restart of the containers on the node
+
+  - ...and that they will be unavailable for a few minutes (during the reboot)
+
+[datadog-systemd-outage]: https://www.datadoghq.com/blog/engineering/2023-03-08-deep-dive-into-platform-level-impact/
+
+---
+
+## Is it necessary?
+
+- When upgrading kubelet from one minor version to another:
+
+  - it *may or may not be fine*
+
+  - in some cases (e.g. migrating from Docker to containerd) it *will not*
+
+- Here's what [the documentation][node-upgrade-docs] says:
+
+  *Draining nodes before upgrading kubelet ensures that pods are re-admitted and containers are re-created, which may be necessary to resolve some security issues or other important bugs.*
+
+- Do it at your own risk, and if you do, test extensively in staging environments!
+
+[node-upgrade-docs]: https://kubernetes.io/docs/tasks/administer-cluster/cluster-upgrade/#manual-deployments
+
+---
+
 class: extra-details

 ## Skipping versions
--- a/slides/k8s/disruptions.md
+++ b/slides/k8s/disruptions.md
@@ -0,0 +1,513 @@
+# Disruptions
+
+In a perfect world...
+
+- hardware never fails
+
+- software never has bugs
+
+- ...and never needs to be updated
+
+- ...and uses a predictable amount of resources
+
+- ...and these resources are infinite anyways
+
+- network latency and packet loss are zero
+
+- humans never make mistakes
+
+--
+
+😬
+
+---
+
+## Disruptions
+
+In the real world...
+
+- hardware will fail randomly (without advance notice)
+
+- software has bugs
+
+- ...and we constantly add new features
+
+- ...and will sometimes use more resources than expected
+
+- ...and these resources are limited
+
+- network latency and packet loss are NOT zero
+
+- humans make mistakes (shutting down the wrong machine, the wrong app...)
+
+---
+
+## Disruptions
+
+- In Kubernetes, a "disruption" is something that stops the execution of a Pod
+
+- There are **voluntary** and **involuntary** disruptions
+
+  - voluntary = directly initiated by humans (including by mistake!)
+
+  - involuntary = everything else
+
+- In this section, we're going to see what they are and how to prevent them
+
+  (or at least, mitigate their effects)
+
+---
+
+## Node outage
+
+- Example: hardware failure (server or network), low-level error
+
+  (includes kernel bugs, issues affecting underlying hypervisors or infrastructure...)
+
+- **Involuntary** disruption (even if it results from human error!)
+
+- Consequence: all workloads on that node become unresponsive
+
+- Mitigations:
+
+  - scale workloads to at least 2 replicas (or more if quorum is needed)
+
+  - add anti-affinity scheduling constraints (to avoid having all pods on the same node)
+
+---
+
+## Node outage play-by-play
+
+- Node goes down (or disconnected from network)
+
+- Its lease (in Namespace `kube-node-lease`) doesn't get renewed
+
+- Controller manager detects that and marks the node as "unreachable"
+
+  (this adds both `NoSchedule` and `NoExecute` taints to the node)
+
+- Eventually, the `NoExecute` taint will evict these pods
+
+- This will trigger creation of replacement pods by owner controllers
+
+  (except for pods with a stable network identity, e.g. in a Stateful Set!)
+
+---
+
+## Node outage notes
+
+- By default, pods will tolerate the `unreachable:NoExecute` taint for 5 minutes
+
+  (toleration automatically added by Admission controller `DefaultTolerationSeconds`)
+
+- Pods of a Stateful Set don't recover automatically:
+
+  - as long as the Pod exists, a replacement Pod can't be created
+
+  - the Pod will exist as long as its Node exists
+
+  - deleting the Node (manually or automatically) will recover the Pod
+
+---
+
+## Memory/disk pressure
+
+- Example: available memory on a node goes below a specific threshold
+
+  (because a pod is using too much memory and no limit was set)
+
+- **Involuntary** disruption
+
+- Consequence: kubelet starts to *evict* some pods
+
+- Mitigations:
+
+  - set *resource limits* on containers to prevent them from using too much resources
+
+  - set *resource requests* on containers to make sure they don't get evicted
+    <br/>
+    (as long as they use less than what they requested)
+
+  - make sure that apps don't use more resources than what they've requested
+
+---
+
+## Memory/disk pressure play-by-play
+
+- Memory leak in an application container, slowly causing very high memory usage
+
+- Overall free memory on the node goes below the *soft* or the *hard* threshold
+
+  (default hard threshold = 100Mi; default soft threshold = none)
+
+- When reaching the *soft* threshold:
+
+  - kubelet waits until the "eviction soft grace period" expires
+
+  - then (if resource usage is still above the threshold) it gracefully evicts pods
+
+- When reaching the *hard* threshold:
+
+  - kubelet immediately and forcefully evicts pods
+
+---
+
+## Which pods are evicted?
+
+- Kubelet only considers pods that are using *more* than what they requested
+
+  (and only for the resource that is under pressure, e.g. RAM or disk usage)
+
+- First, it sorts pods by *priority¹* (as set with the `priorityClassName` in the pod spec)
+
+- Then, by how much their resource usage exceeds their request
+
+  (again, for the resource that is under pressure)
+
+- It evicts pods until enough resources have been freed up
+
+---
+
+## Soft (graceful) vs hard (forceful) eviction
+
+- Soft eviction = graceful shutdown of the pod
+
+  (honors the pod's `terminationGracePeriodSeconds` timeout)
+
+- Hard eviction = immediate shutdown of the pod
+
+  (kills all containers immediately)
+
+---
+
+## Memory/disk pressure notes
+
+- If resource usage increases *very fast*, kubelet might not catch it fast enough
+
+- For memory: this will trigger the kernel out-of-memory killer
+
+  - containers killed by OOM are automatically restarted (no eviction)
+
+  - eviction might happen at a later point though (if memory usage stays high)
+
+- For disk: there is no "out-of-disk" killer, but writes will fail
+
+  - the `write` system call fails with `errno = ENOSPC` / `No space left on device`
+
+  - eviction typically happens shortly after (when kubelet catches up)
+
+- When relying on disk/memory bursts a lot, using `priorityClasses` might help
+
+---
+
+## Memory/disk pressure delays
+
+- By default, no soft threshold is defined
+
+- Defining it requires setting both the threshold and the grace period
+
+- Grace periods can be different for the different types of resources
+
+- When a node is under pressure, kubelet places a `NoSchedule` taint
+
+  (to avoid adding more pods while the node is under pressure)
+
+- Once the node is no longer under pressure, kubelet clears the taint
+
+  (after waiting an extra timeout, `evictionPressureTransitionPeriod`, 5 min by default)
+
+---
+
+## Accidental deletion
+
+- Example: developer deletes the wrong Deployment, the wrong Namespace...
+
+- **Voluntary** disruption
+
+  (from Kubernetes' perspective!)
+
+- Consequence: application is down
+
+- Mitigations:
+
+  - only deploy to production systems through e.g. gitops workflows
+
+  - enforce peer review of changes
+
+  - only give users limited (e.g. read-only) access to production systems
+
+  - use canary deployments (might not catch all mistakes though!)
+
+---
+
+## Bad code deployment
+
+- Example: critical bug introduced, application crashes immediately or is non-functional
+
+- **Voluntary** disruption
+
+  (again, from Kubernetes' perspective!)
+
+- Consequence: application is down
+
+- Mitigations:
+
+  - readiness probes can mitigate immediate crashes
+    <br/>
+    (rolling update continues only when enough pods are ready)
+
+  - delayed crashes will require a rollback
+    <br/>
+    (manual intervention, or automated by a canary system)
+
+---
+
+## Node shutdown
+
+- Example: scaling down a cluster to save money
+
+- **Voluntary** disruption
+
+- Consequence:
+
+  - all workloads running on that node are terminated
+
+  - this might disrupt workloads that have too many replicas on that node
+
+  - or workloads that should not be interrupted at all
+
+- Mitigations:
+
+  - terminate workloads one at a time, coordinating with users
+
+--
+
+🤔
+
+---
+
+## Node shutdown
+
+- Example: scaling down a cluster to save money
+
+- **Voluntary** disruption
+
+- Consequence:
+
+  - all workloads running on that node are terminated
+
+  - this might disrupt workloads that have too many replicas on that node
+
+  - or workloads that should not be interrupted at all
+
+- Mitigations:
+
+  - ~~terminate workloads one at a time, coordinating with users~~
+
+  - use Pod Disruption Budgets
+
+---
+
+## Pod Disruption Budgets
+
+- A PDB is a kind of *contract* between:
+
+  - "admins" = folks maintaining the cluster (e.g. adding/removing/updating nodes)
+
+  - "users" = folks deploying apps and workloads on the cluster
+
+- A PDB expresses something like:
+
+  *in that particular set of pods, do not "disrupt" more than X at a time*
+
+- Examples:
+
+  - in that set of frontend pods, do not disrupt more than 1 at a time
+
+  - in that set of worker pods, always have at least 10 ready
+    <br/>
+    (do not disrupt them if it would bring down the number of ready pods below 10)
+
+---
+
+## PDB - user side
+
+- Cluster users create a PDB with a manifest like this one:
+
+```yaml
+@@INCLUDE[k8s/pod-disruption-budget.yaml]
+```
+
+- The PDB must indicate either `minAvailable` or `maxUnavailable`
+
+---
+
+## Rounding logic
+
+- Percentages are rounded **up**
+
+- When specifying `maxUnavailable` as a percentage, this can result in a higher percentage
+
+  (e.g. `maxUnavailable: 50%` with 3 pods can result in 2 pods being unavailable!)
+
+---
+
+## Unmanaged pods
+
+- Specifying `minAvailable: X` works all the time
+
+- Specifying `minAvailable: X%` or `maxUnavailable` requires *managed pods*
+
+  (pods that belong to a controller, e.g. Replica Set, Stateful Set...)
+
+- This is because the PDB controller needs to know the total number of pods
+
+  (given by the `replicas` field, not merely by counting pod objects)
+
+- The PDB controller will try to resolve the controller using the pod selector
+
+- If that fails, the PDB controller will emit warning events
+
+  (visible with `kubectl describe pdb ...`)
+
+---
+
+## Zero
+
+- `maxUnavailable: 0` means "do not disrupt my pods"
+
+- Same thing if `minAvailable` is greater than or equal to the number of pods
+
+- In that case, cluster admins are supposed to get in touch with cluster users
+
+- This will prevent fully automated operation
+
+  (and some cluster admins' automated systems might not honor that request)
+
+---
+
+## PDB - admin side
+
+- As a cluster admin, we need to follow certain rules
+
+- Only shut down (or restart) a node when no pods are running on that node
+
+  (except system pods belonging to Daemon Sets)
+
+- To remove pods running on a node, we should use the *eviction API*
+
+  (which will check PDB constraints and honor them)
+
+- To prevent new pods from being scheduled on a node, we can use a *taint*
+
+- These operations are streamlined by `kubectl drain`, which will:
+
+  - *cordon* the node (add a `NoSchedule` taint)
+
+  - invoke the *eviction API* to remove pods while respecting their PDBs
+
+---
+
+## Theory vs practice
+
+- `kubectl drain` won't evict pods using `emptyDir` volumes
+
+  (unless the `--delete-emptydir-data` flag is passed as well)
+
+- Make sure that `emptyDir` volumes don't hold anything important
+
+  (they shouldn't, but... who knows!)
+
+- Kubernetes lacks a standard way for users to express:
+
+  *this `emptyDir` volume can/cannot be safely deleted*
+
+- If a PDB forbids an eviction, this requires manual coordination
+
+---
+
+class: extra-details
+
+## Unhealthy pod eviction policy
+
+- By default, unhealthy pods can only be evicted if the PDB allows it
+
+  (unhealthy = running, but not ready)
+
+- In many cases, unhealthy pods aren't functional anyway, and can be removed
+
+- This behavior is enabled by setting the appropriate field in the PDB manifest:
+ 
+```yaml
+spec:
+  unhealthyPodEvictionPolicy: AlwaysAllow
+```
+
+---
+
+## Node upgrade
+
+- Example: upgrading kubelet or the Linux kernel on a node
+
+- **Voluntary** disruption
+
+- Consequence:
+
+  - all workloads running on that node are temporarily interrupted, and restarted
+
+  - this might disrupt these workloads
+
+- Mitigations:
+
+  - migrate workloads off the node first (as if we were shutting it down)
+
+---
+
+## Node upgrade notes
+
+- Is it necessary to drain a node before doing an upgrade?
+
+- From [the documentation][node-upgrade-docs]:
+
+  *Draining nodes before upgrading kubelet ensures that pods are re-admitted and containers are re-created, which may be necessary to resolve some security issues or other important bugs.*
+
+- It's *probably* safe to upgrade in-place for:
+
+  - kernel upgrades
+
+  - kubelet patch-level upgrades (1.X.Y → 1.X.Z)
+
+- It's *probably* better to drain the node for minor-version kubelet upgrades (1.X → 1.Y)
+
+- When in doubt, test extensively in staging environments!
+
+[node-upgrade-docs]: https://kubernetes.io/docs/tasks/administer-cluster/cluster-upgrade/#manual-deployments
+
+---
+
+## Manual rescheduling
+
+- Example: moving workloads around to accommodate noisy neighbors or other issues
+
+  (e.g. pod X is doing a lot of disk I/O and this is starving other pods)
+
+- **Voluntary** disruption
+
+- Consequence:
+
+  - the moved workloads are temporarily interrupted
+
+- Mitigations:
+
+  - define an appropriate number of replicas, declare PDBs
+
+  - use the [eviction API][eviction-API] to move workloads
+
+[eviction-API]: https://kubernetes.io/docs/concepts/scheduling-eviction/api-eviction/
+
+???
+
+:EN:- Voluntary and involuntary disruptions
+:EN:- Pod Disruption Budgets
+:FR:- "Disruptions" volontaires et involontaires
+:FR:- Pod Disruption Budgets
--- a/slides/k8s/dmuc-medium.md
+++ b/slides/k8s/dmuc-medium.md
@@ -462,7 +462,7 @@ The "context" section references the "cluster" and "credentials" that we defined

 ---

-## Review the kubeconfig filfe
+## Review the kubeconfig file

 The kubeconfig file should look like this:

--- a/slides/k8s/kustomize.md
+++ b/slides/k8s/kustomize.md
@@ -337,7 +337,7 @@ kustomize edit add label app.kubernetes.io/name:dockercoins

 - Assuming that `commonLabels` have been set as shown on the previous slide:
  ```bash
-    kubectl apply -k . --prune --selector app.kubernetes.io.name=dockercoins
+    kubectl apply -k . --prune --selector app.kubernetes.io/name=dockercoins
  ```

 - ... This command removes resources that have been removed from the kustomization
--- a/slides/k8s/kyverno.md
+++ b/slides/k8s/kyverno.md
@@ -536,12 +536,12 @@ Note: the `apiVersion` field appears to be optional.
 - Excerpt:
  ```yaml
      generate: 
-      kind: LimitRange
-      name: default-limitrange
-      namespace: "{{request.object.metadata.name}}" 
-      data:
-        spec:
-          limits:
+        kind: LimitRange
+        name: default-limitrange
+        namespace: "{{request.object.metadata.name}}" 
+        data:
+          spec:
+            limits:
  ```

 - Note that we have to specify the `namespace`
--- a/slides/k8s/resource-limits.md
+++ b/slides/k8s/resource-limits.md
@@ -6,11 +6,53 @@

 - We can specify *limits* and/or *requests*

- We can specify quantities of CPU and/or memory
+- We can specify quantities of CPU and/or memory and/or ephemeral storage

 ---

-## CPU vs memory
+## Requests vs limits
+
+- *Requests* are *guaranteed reservations* of resources
+
+- They are used for scheduling purposes
+
+- Kubelet will use cgroups to e.g. guarantee a minimum amount of CPU time
+
+- A container **can** use more than its requested resources
+
+- A container using *less* than what it requested should never be killed or throttled
+
+- A node **cannot** be overcommitted with requests
+
+  (the sum of all requests **cannot** be higher than resources available on the node)
+
+- A small amount of resources is set aside for system components
+
+  (this explains why there is a difference between "capacity" and "allocatable")
+
+---
+
+## Requests vs limits
+
+- *Limits* are "hard limits" (a container **cannot** exceed its limits)
+
+- They aren't taken into account by the scheduler
+
+- A container exceeding its memory limit is killed instantly
+
+  (by the kernel out-of-memory killer)
+
+- A container exceeding its CPU limit is throttled
+
+- A container exceeding its disk limit is killed
+
+  (usually with a small delay, since this is checked periodically by kubelet)
+
+- On a given node, the sum of all limits **can** be higher than the node size
+
+---
+
+## Compressible vs incompressible resources

 - CPU is a *compressible resource*

@@ -24,7 +66,29 @@

  - if we have N GB RAM and need 2N, we might run at... 0.1% speed!

- As a result, exceeding limits will have different consequences for CPU and memory
+- Disk is also an *incompressible resource*
+
+  - when the disk is full, writes will fail
+
+  - applications may or may not crash but persistent apps will be in trouble
+
+---
+
+## Running low on CPU
+
+- Two ways for a container to "run low" on CPU:
+
+  - it's hitting its CPU limit
+
+  - all CPUs on the node are at 100% utilization
+
+- The app in the container will run slower
+
+  (compared to running without a limit, or if CPU cycles were available)
+
+- No other consequence
+
+  (but this could affect SLA/SLO for latency-sensitive applications!)

 ---

@@ -136,9 +200,7 @@ For more details, check [this blog post](https://erickhun.com/posts/kubernetes-f

 ## Running low on memory

- When the system runs low on memory, it starts to reclaim used memory
-
-  (we talk about "memory pressure")
+- When the kernel runs low on memory, it starts to reclaim used memory

 - Option 1: free up some buffers and caches

@@ -162,71 +224,91 @@ For more details, check [this blog post](https://erickhun.com/posts/kubernetes-f

 - If a container exceeds its memory *limit*, it gets killed immediately

- If a node is overcommitted and under memory pressure, it will terminate some pods
+- If a node's memory usage gets too high, it will *evict* some pods

-  (see next slide for some details about what "overcommit" means here!)
+  (we say that the node is "under pressure", more on that in a bit!)

 [KEP 2400]: https://github.com/kubernetes/enhancements/blob/master/keps/sig-node/2400-node-swap/README.md#implementation-history

 ---

-## Overcommitting resources
+## Running low on disk

- *Limits* are "hard limits" (a container *cannot* exceed its limits)
+- When the kubelet runs low on disk, it starts to reclaim disk space

-  - a container exceeding its memory limit is killed
+  (similarly to what the kernel does, but in different categories)

-  - a container exceeding its CPU limit is throttled
+- Option 1: garbage collect dead pods and containers

- On a given node, the sum of pod *limits* can be higher than the node size
+  (no consequence, but their logs will be deleted)

- *Requests* are used for scheduling purposes
+- Option 2: remove unused images

-  - a container can use more than its requested CPU or RAM amounts
+  (no consequence, but these images will have to be repulled if we need them later)

-  - a container using *less* than what it requested should never be killed or throttled
+- Option 3: evict pods and remove them to reclaim their disk usage

- On a given node, the sum of pod *requests* cannot be higher than the node size
+- Note: this only applies to *ephemeral storage*, not to e.g. Persistent Volumes!

 ---

-## Pod quality of service
+## Ephemeral storage?

-Each pod is assigned a QoS class (visible in `status.qosClass`).
+- This includes:

- If limits = requests:
+  - the *read-write layer* of the container
+    <br/>
+    (any file creation/modification outside of its volumes)

-  - as long as the container uses less than the limit, it won't be affected
+  - `emptyDir` volumes mounted in the container

-  - if all containers in a pod have *(limits=requests)*, QoS is considered "Guaranteed"
+  - the container logs stored on the node

- If requests &lt; limits:
+- This does not include:

-  - as long as the container uses less than the request, it won't be affected
+  - the container image

-  - otherwise, it might be killed/evicted if the node gets overloaded
-
-  - if at least one container has *(requests&lt;limits)*, QoS is considered "Burstable"
-
- If a pod doesn't have any request nor limit, QoS is considered "BestEffort"
+  - other types of volumes (e.g. Persistent Volumes, `hostPath`, or `local` volumes)

 ---

-## Quality of service impact
+class: extra-details

- When a node is overloaded, BestEffort pods are killed first
+## Disk limit enforcement

- Then, Burstable pods that exceed their requests
+- Disk usage is periodically measured by kubelet

- Burstable and Guaranteed pods below their requests are never killed
+  (with something equivalent to `du`)

-  (except if their node fails)
+- There can be a small delay before pod termination when disk limit is exceeded

- If we only use Guaranteed pods, no pod should ever be killed
+- It's also possible to enable filesystem *project quotas*

-  (as long as they stay within their limits)
+  (e.g. with EXT4 or XFS)

-(Pod QoS is also explained in [this page](https://kubernetes.io/docs/tasks/configure-pod-container/quality-service-pod/) of the Kubernetes documentation and in [this blog post](https://medium.com/google-cloud/quality-of-service-class-qos-in-kubernetes-bb76a89eb2c6).)
+- Remember that container logs are also accounted for!
+
+  (container log rotation/retention is managed by kubelet)
+
+---
+
+class: extra-details
+
+## `nodefs` and `imagefs`
+
+- `nodefs` is the main filesystem of the node
+
+  (holding, notably, `emptyDir` volumes and container logs)
+
+- Optionally, the container engine can be configured to use an `imagefs`
+
+- `imagefs` will store container images and container writable layers
+
+- When there is a separate `imagefs`, its disk usage is tracked independently
+
+- If `imagefs` usage gets too high, kubelet will remove old images first
+
+  (conversely, if `nodefs` usage gets too high, kubelet won't remove old images)

 ---

@@ -304,6 +386,46 @@ class: extra-details

 ---

+## Pod quality of service
+
+Each pod is assigned a QoS class (visible in `status.qosClass`).
+
+- If limits = requests:
+
+  - as long as the container uses less than the limit, it won't be affected
+
+  - if all containers in a pod have *(limits=requests)*, QoS is considered "Guaranteed"
+
+- If requests &lt; limits:
+
+  - as long as the container uses less than the request, it won't be affected
+
+  - otherwise, it might be killed/evicted if the node gets overloaded
+
+  - if at least one container has *(requests&lt;limits)*, QoS is considered "Burstable"
+
+- If a pod doesn't have any request nor limit, QoS is considered "BestEffort"
+
+---
+
+## Quality of service impact
+
+- When a node is overloaded, BestEffort pods are killed first
+
+- Then, Burstable pods that exceed their requests
+
+- Burstable and Guaranteed pods below their requests are never killed
+
+  (except if their node fails)
+
+- If we only use Guaranteed pods, no pod should ever be killed
+
+  (as long as they stay within their limits)
+
+(Pod QoS is also explained in [this page](https://kubernetes.io/docs/tasks/configure-pod-container/quality-service-pod/) of the Kubernetes documentation and in [this blog post](https://medium.com/google-cloud/quality-of-service-class-qos-in-kubernetes-bb76a89eb2c6).)
+
+---
+
 ## Specifying resources

 - Resource requests are expressed at the *container* level
@@ -316,9 +438,9 @@ class: extra-details

  (so 100m = 0.1)

- Memory is expressed in bytes
+- Memory and ephemeral disk storage are expressed in bytes

- Memory can be expressed with k, M, G, T, ki, Mi, Gi, Ti suffixes
+- These can have k, M, G, T, ki, Mi, Gi, Ti suffixes

  (corresponding to 10^3, 10^6, 10^9, 10^12, 2^10, 2^20, 2^30, 2^40)

@@ -334,11 +456,13 @@ containers:
  image: jpetazzo/color
  resources:
    limits:
-      memory: "100Mi"
      cpu: "100m"
-    requests:
+      ephemeral-storage: 10M
      memory: "100Mi"
+    requests:
      cpu: "10m"
+      ephemeral-storage: 10M
+      memory: "100Mi"
 ```

 This set of resources makes sure that this service won't be killed (as long as it stays below 100 MB of RAM), but allows its CPU usage to be throttled if necessary.
@@ -365,7 +489,7 @@ This set of resources makes sure that this service won't be killed (as long as i

 ---

-## We need default resource values
+## We need to specify resource values

 - If we do not set resource values at all:

@@ -379,9 +503,33 @@ This set of resources makes sure that this service won't be killed (as long as i

  - if the request is zero, the scheduler can't make a smart placement decision

- To address this, we can set default values for resources
+- This is fine when learning/testing, absolutely not in production!

- This is done with a LimitRange object
+---
+
+## How should we set resources?
+
+- Option 1: manually, for each container
+
+  - simple, effective, but tedious
+
+- Option 2: automatically, with the [Vertical Pod Autoscaler (VPA)][vpa]
+
+  - relatively simple, very minimal involvement beyond initial setup
+
+  - not compatible with HPAv1, can disrupt long-running workloads (see [limitations][vpa-limitations])
+
+- Option 3: semi-automatically, with tools like [Robusta KRR][robusta]
+
+  - good compromise between manual work and automation
+
+- Option 4: by creating LimitRanges in our Namespaces
+
+  - relatively simple, but "one-size-fits-all" approach might not always work
+
+[robusta]: https://github.com/robusta-dev/krr
+[vpa]: https://github.com/kubernetes/autoscaler/tree/master/vertical-pod-autoscaler
+[vpa-limitations]: https://github.com/kubernetes/autoscaler/tree/master/vertical-pod-autoscaler#known-limitations

 ---

@@ -636,7 +784,7 @@ class: extra-details

  - ResourceQuota per namespace

- Let's see a simple recommendation to get started with resource limits
+- Let's see one possible strategy to get started with resource limits

 ---

--- a/slides/k8s/setup-overview.md
+++ b/slides/k8s/setup-overview.md
@@ -166,17 +166,15 @@

 - [Kubernetes The Hard Way](https://github.com/kelseyhightower/kubernetes-the-hard-way) by Kelsey Hightower

-  - step by step guide to install Kubernetes on Google Cloud
-
-  - covers certificates, high availability ...
-
-  - *“Kubernetes The Hard Way is optimized for learning, which means taking the long route to ensure you understand each task required to bootstrap a Kubernetes cluster.”*
+  *step by step guide to install Kubernetes on GCP, with certificates, HA...*

 - [Deep Dive into Kubernetes Internals for Builders and Operators](https://www.youtube.com/watch?v=3KtEAa7_duA)

-  - conference presentation showing step-by-step control plane setup
+  *conference talk setting up a simplified Kubernetes cluster - no security or HA*

-  - emphasis on simplicity, not on security and availability
+- 🇫🇷[Démystifions les composants internes de Kubernetes](https://www.youtube.com/watch?v=OCMNA0dSAzc)
+
+  *improved version of the previous one, with certs and recent k8s versions*

 ---

--- a/slides/logistics-template.md
+++ b/slides/logistics-template.md
@@ -1,42 +1,8 @@
 ## Introductions

-⚠️ This slide should be customized by the tutorial instructor(s).
+- Hello! I'm Jérôme Petazzoni ([@jpetazzo@hachyderm.io], Enix SAS)

-<!--
-
- Hello! We are:
-
-   - 👷🏻‍♀️ AJ ([@s0ulshake], [EphemeraSearch], [Quantgene])
-
-   - 🚁 Alexandre ([@alexbuisine], Enix SAS)
-
-   - 🐳 Jérôme ([@jpetazzo], [@jpetazzo@hachyderm.io], Ardan Labs)
-
-   - 🐳 Jérôme ([@jpetazzo], [@jpetazzo@hachyderm.io], Enix SAS)
-
-   - 🐳 Jérôme ([@jpetazzo], [@jpetazzo@hachyderm.io], Tiny Shell Script LLC)
-
-->
-
-<!--
-
- The training will run for 4 hours, with a 10 minutes break every hour
-
-  (the middle break will be a bit longer)
-
-->
-
-<!--
-
- The workshop will run from XXX to YYY
-
- There will be a lunch break at ZZZ
-
-  (And coffee breaks!)
-
-->
-
-<!--
+- The workshop will run from FIXME

 - Feel free to interrupt for questions at any time

@@ -44,20 +10,6 @@

 - Live feedback, questions, help: @@CHAT@@

-->
-
-<!--
-
- You ~~should~~ must ask questions! Lots of questions!
-
-  (especially when you see full screen container pictures)
-
- Use @@CHAT@@ to ask questions, get help, etc.
-
-->
-
-<!-- -->
-
 [@alexbuisine]: https://twitter.com/alexbuisine
 [EphemeraSearch]: https://ephemerasearch.com/
 [@jpetazzo]: https://twitter.com/jpetazzo
--- a/slides/workshop.css
+++ b/slides/workshop.css
@@ -15,6 +15,7 @@ h1, h2, h3, h4, h5, h6 {
  font-weight: bold;
  font-size: 45px !important;
  margin-top: 0.5em;
+  margin-bottom: 0.75em;
 }

 code {
--- a/webhooks/admission/docker-compose.yml
+++ b/webhooks/admission/docker-compose.yml
@@ -1,3 +1,11 @@
+# Note: Ngrok doesn't have an "anonymous" mode anymore.
+# This means that it requires an authentication token.
+# That said, all you need is a free account; so if you're
+# doing the labs on admission webhooks and want to try
+# this Compose file, I highly recommend that you create
+# an Ngrok account and set the NGROK_AUTHTOKEN environment
+# variable to your authentication token.
+
 version: "3"

 services:
@@ -5,6 +13,8 @@ services:
  ngrok-echo:
    image: ngrok/ngrok
    command: http --log=stdout localhost:3000
+    environment:
+    - NGROK_AUTHTOKEN
    ports:
    - 3000

@@ -16,6 +26,8 @@ services:
  ngrok-flask:
    image: ngrok/ngrok
    command: http --log=stdout localhost:5000
+    environment:
+    - NGROK_AUTHTOKEN
    ports:
    - 5000
Author	SHA1	Message	Date
Jérôme Petazzoni	477feee80b	🍀 SUADEO - 4 days, 5 hours/day	2024-03-10 20:05:12 +01:00
Alix Lourme	41330f8302	Fix #636 : kustomize commonLabels typo error	2024-02-28 06:08:43 +01:00
Jérôme Petazzoni	4fcd490b30	➕ Add ngrok token instructions	2024-02-21 23:40:19 +01:00
Jérôme Petazzoni	633c29b62c	➕ Install Ngrok binary	2024-02-21 22:04:33 +01:00
Jérôme Petazzoni	0802701f11	🗝️ Fix AWS cloud init settings that disable password auth	2024-02-21 22:02:34 +01:00
Jérôme Petazzoni	c407e178d5	🔗 Fix popeye download link	2024-02-19 22:38:03 +01:00
Jérôme Petazzoni	cb574d7cdd	📍 Pin sinatra version in dockercoins/hasher Sinatra 4.0 was released very recently and something broke. Let's pin Sinatra to version 3.	2024-02-17 23:42:59 +01:00
Jérôme Petazzoni	84988644df	🐞 Fix minor issue in konk helper script	2024-01-28 17:08:36 +01:00
Jérôme Petazzoni	3ab64d79e4	🔧 Add script to map DNS to clusters with CloudFlare	2024-01-28 17:08:14 +01:00
Jérôme Petazzoni	6391b4d896	🔗 Add link to Denis Germain's Devoxx presentation	2024-01-15 22:02:07 +01:00
Jérôme Petazzoni	57e8c6ee2f	📃 Update ngrok information	2024-01-15 15:44:35 +01:00
Jérôme Petazzoni	42443df0dc	♻️ Update Scaleway Terraform config (VPC now mandatory; sec group)	2024-01-08 15:47:58 +01:00
Jérôme Petazzoni	9289d453bc	🐞 Unvoluntary → Involuntary	2023-12-08 16:54:24 -06:00
Jérôme Petazzoni	3d8059c631	🐞 Fix YAML indentation	2023-12-08 15:13:58 -06:00
Jérôme Petazzoni	7ff17fbabd	🔧 Add AWS instance size for portal, while we're at it	2023-12-07 15:22:03 -06:00
Jérôme Petazzoni	dbfda8b458	🐞 Typo fix	2023-12-06 15:31:09 -06:00
Jérôme Petazzoni	c8fc67c995	📃 Update V's name and social media link	2023-12-04 16:41:03 -06:00
Jérôme Petazzoni	28222db2e4	⏳ Add 1-second pre-pssh delay Seems to help with AT&T fiber router. (Actually it takes a longer delay to make a difference, like 10 seconds, but this patch makes the delay configurable.)	2023-12-04 16:38:33 -06:00
Jérôme Petazzoni	a38f930858	📦 Use new k8s package repositories	2023-12-03 21:33:25 -06:00
Jérôme Petazzoni	2cef200726	➕ Add DMUC+RBAC exercises	2023-12-03 15:38:43 -06:00
Jérôme Petazzoni	1f77a52137	📃 Flesh out upgrade information Add the official policy (which is to drain nodes before upgrading), and give some explanations about when it may/may not be fine to upgrade without draining nodes.	2023-11-30 16:45:11 -06:00
Jérôme Petazzoni	b188e0f8a9	🔧 Mention priorityClasses around resource pressure	2023-11-30 16:10:12 -06:00
Jérôme Petazzoni	ac203a128d	➕ Add content about disruptions and PDB	2023-11-30 15:36:32 -06:00
Jérôme Petazzoni	a9920e5cf0	🌐 Add IPv6 support in netlify DNS scriptlet	2023-11-30 15:32:03 -06:00
Jérôme Petazzoni	d1047f950d	📃 Update resource limits to add ephemeral-storage	2023-11-29 14:23:24 -06:00
Jérôme Petazzoni	e380509ffe	💈 Tweak CSS for consistent spacing after titles	2023-11-29 14:22:54 -06:00
Jérôme Petazzoni	b5c754211e	➕ Mention Validating Admission Policies and CEL	2023-11-24 12:29:44 -06:00