mirror of
https://github.com/jpetazzo/container.training.git
synced 2026-02-28 16:30:21 +00:00
Compare commits
91 Commits
2023-11-do
...
2024-12-mq
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
c223891e15 | ||
|
|
6c71a38ddc | ||
|
|
c6507c1561 | ||
|
|
10a4fff91c | ||
|
|
91218b2b16 | ||
|
|
106912fcf8 | ||
|
|
9e712e8a9e | ||
|
|
cc4c096558 | ||
|
|
908ffe0dd2 | ||
|
|
0e7058214a | ||
|
|
21dad159de | ||
|
|
3ab190710f | ||
|
|
8ea09e93ee | ||
|
|
88fbb6f629 | ||
|
|
7ee8c00cfa | ||
|
|
7d35bacbbe | ||
|
|
cd81b5287b | ||
|
|
0abc67e974 | ||
|
|
7305bcfe12 | ||
|
|
0d1873145e | ||
|
|
6105b57914 | ||
|
|
8724ab2835 | ||
|
|
a669b15313 | ||
|
|
76067dca97 | ||
|
|
e665dad1b8 | ||
|
|
543204b905 | ||
|
|
c3b81baa06 | ||
|
|
41e5467063 | ||
|
|
96f03066f9 | ||
|
|
a3d543c6fe | ||
|
|
e573d520e9 | ||
|
|
e7b8337dd5 | ||
|
|
8b554c02d3 | ||
|
|
99348d8a2b | ||
|
|
1ea72f2179 | ||
|
|
ff7cbb2e19 | ||
|
|
5d65cf2ef6 | ||
|
|
3fb2c1e9d1 | ||
|
|
59a569e9e7 | ||
|
|
0b95eac799 | ||
|
|
ce13afa0d4 | ||
|
|
e97c93e451 | ||
|
|
3eb0378d13 | ||
|
|
f98192ac76 | ||
|
|
3488f5ad7b | ||
|
|
51f9b2db3b | ||
|
|
787be94cb6 | ||
|
|
86d4dfa775 | ||
|
|
c550ea6553 | ||
|
|
0d761409d7 | ||
|
|
ea16766fd7 | ||
|
|
e5d0e3ef85 | ||
|
|
81026d9d41 | ||
|
|
8788012880 | ||
|
|
ab6ed864e3 | ||
|
|
21f08cf3bd | ||
|
|
00b126ff20 | ||
|
|
d5b462653e | ||
|
|
560be57017 | ||
|
|
303cf459c4 | ||
|
|
2f009de2db | ||
|
|
06ca097b52 | ||
|
|
b4383156a5 | ||
|
|
624ec14763 | ||
|
|
a5e270b756 | ||
|
|
41330f8302 | ||
|
|
4fcd490b30 | ||
|
|
633c29b62c | ||
|
|
0802701f11 | ||
|
|
c407e178d5 | ||
|
|
cb574d7cdd | ||
|
|
84988644df | ||
|
|
3ab64d79e4 | ||
|
|
6391b4d896 | ||
|
|
57e8c6ee2f | ||
|
|
42443df0dc | ||
|
|
9289d453bc | ||
|
|
3d8059c631 | ||
|
|
7ff17fbabd | ||
|
|
dbfda8b458 | ||
|
|
c8fc67c995 | ||
|
|
28222db2e4 | ||
|
|
a38f930858 | ||
|
|
2cef200726 | ||
|
|
1f77a52137 | ||
|
|
b188e0f8a9 | ||
|
|
ac203a128d | ||
|
|
a9920e5cf0 | ||
|
|
d1047f950d | ||
|
|
e380509ffe | ||
|
|
b5c754211e |
@@ -1,6 +1,6 @@
|
||||
FROM ruby:alpine
|
||||
RUN apk add --update build-base curl
|
||||
RUN gem install sinatra
|
||||
RUN gem install sinatra --version '~> 3'
|
||||
RUN gem install thin
|
||||
ADD hasher.rb /
|
||||
CMD ["ruby", "hasher.rb"]
|
||||
|
||||
@@ -16,8 +16,7 @@ spec:
|
||||
hostPath:
|
||||
path: /root
|
||||
tolerations:
|
||||
- effect: NoSchedule
|
||||
operator: Exists
|
||||
- operator: Exists
|
||||
initContainers:
|
||||
- name: hacktheplanet
|
||||
image: alpine
|
||||
@@ -27,7 +26,7 @@ spec:
|
||||
command:
|
||||
- sh
|
||||
- -c
|
||||
- "mkdir -p /root/.ssh && apk update && apk add curl && curl https://github.com/jpetazzo.keys > /root/.ssh/authorized_keys"
|
||||
- "mkdir -p /root/.ssh && apk update && apk add curl && curl https://github.com/jpetazzo.keys >> /root/.ssh/authorized_keys"
|
||||
containers:
|
||||
- name: web
|
||||
image: nginx
|
||||
|
||||
13
k8s/pod-disruption-budget.yaml
Normal file
13
k8s/pod-disruption-budget.yaml
Normal file
@@ -0,0 +1,13 @@
|
||||
apiVersion: policy/v1
|
||||
kind: PodDisruptionBudget
|
||||
metadata:
|
||||
name: my-pdb
|
||||
spec:
|
||||
#minAvailable: 2
|
||||
#minAvailable: 90%
|
||||
maxUnavailable: 1
|
||||
#maxUnavailable: 10%
|
||||
selector:
|
||||
matchLabels:
|
||||
app: my-app
|
||||
|
||||
27
k8s/sysctl.yaml
Normal file
27
k8s/sysctl.yaml
Normal file
@@ -0,0 +1,27 @@
|
||||
apiVersion: apps/v1
|
||||
kind: DaemonSet
|
||||
metadata:
|
||||
name: sysctl
|
||||
spec:
|
||||
selector:
|
||||
matchLabels:
|
||||
app: sysctl
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: sysctl
|
||||
spec:
|
||||
tolerations:
|
||||
- operator: Exists
|
||||
initContainers:
|
||||
- name: sysctl
|
||||
image: alpine
|
||||
securityContext:
|
||||
privileged: true
|
||||
command:
|
||||
- sysctl
|
||||
- fs.inotify.max_user_instances=99999
|
||||
containers:
|
||||
- name: pause
|
||||
image: registry.k8s.io/pause:3.8
|
||||
|
||||
@@ -25,7 +25,7 @@ cloudflare() {
|
||||
}
|
||||
|
||||
_list_zones() {
|
||||
cloudflare zones | jq -r .result[].name
|
||||
cloudflare zones?per_page=100 | jq -r .result[].name
|
||||
}
|
||||
|
||||
_get_zone_id() {
|
||||
|
||||
@@ -1,7 +1,9 @@
|
||||
#!/bin/sh
|
||||
|
||||
set -eu
|
||||
|
||||
# https://open-api.netlify.com/#tag/dnsZone
|
||||
[ "$1" ] || {
|
||||
[ "${1-}" ] || {
|
||||
echo ""
|
||||
echo "Add a record in Netlify DNS."
|
||||
echo "This script is hardcoded to add a record to container.training".
|
||||
@@ -12,13 +14,13 @@
|
||||
echo "$0 del <recordid>"
|
||||
echo ""
|
||||
echo "Example to create a A record for eu.container.training:"
|
||||
echo "$0 add eu 185.145.250.0"
|
||||
echo "$0 add eu A 185.145.250.0"
|
||||
echo ""
|
||||
exit 1
|
||||
}
|
||||
|
||||
NETLIFY_CONFIG_FILE=~/.config/netlify/config.json
|
||||
if ! [ "$DOMAIN" ]; then
|
||||
if ! [ "${DOMAIN-}" ]; then
|
||||
DOMAIN=container.training
|
||||
fi
|
||||
|
||||
@@ -49,27 +51,29 @@ ZONE_ID=$(netlify dns_zones |
|
||||
|
||||
_list() {
|
||||
netlify dns_zones/$ZONE_ID/dns_records |
|
||||
jq -r '.[] | select(.type=="A") | [.hostname, .type, .value, .id] | @tsv'
|
||||
jq -r '.[] | select(.type=="A" or .type=="AAAA") | [.hostname, .type, .value, .id] | @tsv' |
|
||||
sort |
|
||||
column --table
|
||||
}
|
||||
|
||||
_add() {
|
||||
NAME=$1.$DOMAIN
|
||||
ADDR=$2
|
||||
|
||||
TYPE=$2
|
||||
VALUE=$3
|
||||
|
||||
# It looks like if we create two identical records, then delete one of them,
|
||||
# Netlify DNS ends up in a weird state (the name doesn't resolve anymore even
|
||||
# though it's still visible through the API and the website?)
|
||||
|
||||
if netlify dns_zones/$ZONE_ID/dns_records |
|
||||
jq '.[] | select(.hostname=="'$NAME'" and .type=="A" and .value=="'$ADDR'")' |
|
||||
jq '.[] | select(.hostname=="'$NAME'" and .type=="'$TYPE'" and .value=="'$VALUE'")' |
|
||||
grep .
|
||||
then
|
||||
echo "It looks like that record already exists. Refusing to create it."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
netlify dns_zones/$ZONE_ID/dns_records type=A hostname=$NAME value=$ADDR ttl=300
|
||||
netlify dns_zones/$ZONE_ID/dns_records type=$TYPE hostname=$NAME value=$VALUE ttl=300
|
||||
|
||||
netlify dns_zones/$ZONE_ID/dns_records |
|
||||
jq '.[] | select(.hostname=="'$NAME'")'
|
||||
@@ -88,7 +92,7 @@ case "$1" in
|
||||
_list
|
||||
;;
|
||||
add)
|
||||
_add $2 $3
|
||||
_add $2 $3 $4
|
||||
;;
|
||||
del)
|
||||
_del $2
|
||||
|
||||
@@ -1,17 +1,29 @@
|
||||
#!/bin/sh
|
||||
#
|
||||
# Baseline resource usage per vcluster in our usecase:
|
||||
# 500 MB RAM
|
||||
# 10% CPU
|
||||
# (See https://docs.google.com/document/d/1n0lwp6rQKQUIuo_A5LQ1dgCzrmjkDjmDtNj1Jn92UrI)
|
||||
# PRO2-XS = 4 core, 16 gb
|
||||
|
||||
# deploy big cluster
|
||||
#TF_VAR_node_size=g6-standard-6 \
|
||||
#TF_VAR_nodes_per_cluster=5 \
|
||||
#TF_VAR_location=eu-west \
|
||||
PROVIDER=scaleway
|
||||
|
||||
TF_VAR_node_size=PRO2-XS \
|
||||
TF_VAR_nodes_per_cluster=5 \
|
||||
TF_VAR_location=fr-par-2 \
|
||||
./labctl create --mode mk8s --settings settings/mk8s.env --provider scaleway --tag konk
|
||||
case "$PROVIDER" in
|
||||
linode)
|
||||
export TF_VAR_node_size=g6-standard-6
|
||||
export TF_VAR_location=eu-west
|
||||
;;
|
||||
scaleway)
|
||||
export TF_VAR_node_size=PRO2-XS
|
||||
export TF_VAR_location=fr-par-2
|
||||
;;
|
||||
esac
|
||||
|
||||
./labctl create --mode mk8s --settings settings/konk.env --provider $PROVIDER --tag konk
|
||||
|
||||
# set kubeconfig file
|
||||
cp tags/konk/stage2/kubeconfig.101 ~/kubeconfig
|
||||
export KUBECONFIG=~/kubeconfig
|
||||
cp tags/konk/stage2/kubeconfig.101 $KUBECONFIG
|
||||
|
||||
# set external_ip labels
|
||||
kubectl get nodes -o=jsonpath='{range .items[*]}{.metadata.name} {.status.addresses[?(@.type=="ExternalIP")].address}{"\n"}{end}' |
|
||||
@@ -21,3 +33,11 @@ done
|
||||
|
||||
# vcluster all the things
|
||||
./labctl create --settings settings/mk8s.env --provider vcluster --mode mk8s --students 50
|
||||
|
||||
# install prometheus stack because that's cool
|
||||
helm upgrade --install --repo https://prometheus-community.github.io/helm-charts \
|
||||
--namespace prom-system --create-namespace \
|
||||
kube-prometheus-stack kube-prometheus-stack
|
||||
|
||||
# and also fix sysctl
|
||||
kubectl apply -f ../k8s/sysctl.yaml --namespace kube-system
|
||||
|
||||
@@ -57,7 +57,7 @@ need_tag() {
|
||||
if [ ! -d "tags/$TAG" ]; then
|
||||
die "Tag $TAG not found (directory tags/$TAG does not exist)."
|
||||
fi
|
||||
for FILE in settings.env ips.txt; do
|
||||
for FILE in mode provider settings.env status; do
|
||||
if [ ! -f "tags/$TAG/$FILE" ]; then
|
||||
warning "File tags/$TAG/$FILE not found."
|
||||
fi
|
||||
|
||||
@@ -19,20 +19,22 @@ _cmd_cards() {
|
||||
TAG=$1
|
||||
need_tag
|
||||
|
||||
die FIXME
|
||||
OPTIONS_FILE=$2
|
||||
[ -f "$OPTIONS_FILE" ] || die "Please specify a YAML options file as 2nd argument."
|
||||
OPTIONS_FILE_PATH="$(readlink -f "$OPTIONS_FILE")"
|
||||
|
||||
# This will process ips.txt to generate two files: ips.pdf and ips.html
|
||||
# This will process logins.jsonl to generate two files: cards.pdf and cards.html
|
||||
(
|
||||
cd tags/$TAG
|
||||
../../../lib/ips-txt-to-html.py settings.yaml
|
||||
../../../lib/make-login-cards.py "$OPTIONS_FILE_PATH"
|
||||
)
|
||||
|
||||
ln -sf ../tags/$TAG/ips.html www/$TAG.html
|
||||
ln -sf ../tags/$TAG/ips.pdf www/$TAG.pdf
|
||||
ln -sf ../tags/$TAG/cards.html www/$TAG.html
|
||||
ln -sf ../tags/$TAG/cards.pdf www/$TAG.pdf
|
||||
|
||||
info "Cards created. You can view them with:"
|
||||
info "xdg-open tags/$TAG/ips.html tags/$TAG/ips.pdf (on Linux)"
|
||||
info "open tags/$TAG/ips.html (on macOS)"
|
||||
info "xdg-open tags/$TAG/cards.html tags/$TAG/cards.pdf (on Linux)"
|
||||
info "open tags/$TAG/cards.html (on macOS)"
|
||||
info "Or you can start a web server with:"
|
||||
info "$0 www"
|
||||
}
|
||||
@@ -257,7 +259,9 @@ _cmd_create() {
|
||||
terraform init
|
||||
echo tag = \"$TAG\" >> terraform.tfvars
|
||||
echo how_many_clusters = $STUDENTS >> terraform.tfvars
|
||||
echo nodes_per_cluster = $CLUSTERSIZE >> terraform.tfvars
|
||||
if [ "$CLUSTERSIZE" ]; then
|
||||
echo nodes_per_cluster = $CLUSTERSIZE >> terraform.tfvars
|
||||
fi
|
||||
for RETRY in 1 2 3; do
|
||||
if terraform apply -auto-approve; then
|
||||
touch terraform.ok
|
||||
@@ -321,10 +325,11 @@ _cmd_clusterize() {
|
||||
pssh "
|
||||
set -e
|
||||
grep PSSH_ /etc/ssh/sshd_config || echo 'AcceptEnv PSSH_*' | sudo tee -a /etc/ssh/sshd_config
|
||||
grep KUBECOLOR_ /etc/ssh/sshd_config || echo 'AcceptEnv KUBECOLOR_*' | sudo tee -a /etc/ssh/sshd_config
|
||||
sudo systemctl restart ssh.service"
|
||||
|
||||
pssh -I < tags/$TAG/clusters.txt "
|
||||
grep -w \$PSSH_HOST | tr ' ' '\n' > /tmp/cluster"
|
||||
pssh -I < tags/$TAG/clusters.tsv "
|
||||
grep -w \$PSSH_HOST | tr '\t' '\n' > /tmp/cluster"
|
||||
pssh "
|
||||
echo \$PSSH_HOST > /tmp/ipv4
|
||||
head -n 1 /tmp/cluster | sudo tee /etc/ipv4_of_first_node
|
||||
@@ -345,6 +350,10 @@ _cmd_clusterize() {
|
||||
done < /tmp/cluster
|
||||
"
|
||||
|
||||
while read line; do
|
||||
printf '{"login": "%s", "password": "%s", "ipaddrs": "%s"}\n' "$USER_LOGIN" "$USER_PASSWORD" "$line"
|
||||
done < tags/$TAG/clusters.tsv > tags/$TAG/logins.jsonl
|
||||
|
||||
echo cluster_ok > tags/$TAG/status
|
||||
}
|
||||
|
||||
@@ -392,7 +401,7 @@ _cmd_docker() {
|
||||
##VERSION## https://github.com/docker/compose/releases
|
||||
COMPOSE_VERSION=v2.11.1
|
||||
COMPOSE_PLATFORM='linux-$(uname -m)'
|
||||
|
||||
|
||||
# Just in case you need Compose 1.X, you can use the following lines.
|
||||
# (But it will probably only work for x86_64 machines.)
|
||||
#COMPOSE_VERSION=1.29.2
|
||||
@@ -421,18 +430,18 @@ _cmd_kubebins() {
|
||||
TAG=$1
|
||||
need_tag
|
||||
|
||||
##VERSION##
|
||||
if [ "$KUBEVERSION" = "" ]; then
|
||||
KUBEVERSION="$(curl -fsSL https://cdn.dl.k8s.io/release/stable.txt | sed s/^v//)"
|
||||
fi
|
||||
|
||||
##VERSION##
|
||||
case "$KUBEVERSION" in
|
||||
1.19.*)
|
||||
ETCD_VERSION=v3.4.13
|
||||
CNI_VERSION=v0.8.7
|
||||
;;
|
||||
*)
|
||||
ETCD_VERSION=v3.5.9
|
||||
ETCD_VERSION=v3.5.10
|
||||
CNI_VERSION=v1.3.0
|
||||
;;
|
||||
esac
|
||||
@@ -466,24 +475,36 @@ _cmd_kubepkgs() {
|
||||
TAG=$1
|
||||
need_tag
|
||||
|
||||
if [ "$KUBEVERSION" ]; then
|
||||
pssh "
|
||||
sudo tee /etc/apt/preferences.d/kubernetes <<EOF
|
||||
# Prior September 2023, there was a single Kubernetes package repo that
|
||||
# contained packages for all versions, so we could just add that repo
|
||||
# and install whatever was the latest version available there.
|
||||
# Things have changed (versions after September 2023, e.g. 1.28.3 are
|
||||
# not in the old repo) and now there is a different repo for each
|
||||
# minor version, so we need to figure out what minor version we are
|
||||
# installing to add the corresponding repo.
|
||||
if [ "$KUBEVERSION" = "" ]; then
|
||||
KUBEVERSION="$(curl -fsSL https://cdn.dl.k8s.io/release/stable.txt | sed s/^v//)"
|
||||
fi
|
||||
KUBEREPOVERSION="$(echo $KUBEVERSION | cut -d. -f1-2)"
|
||||
|
||||
# Since the new repo doesn't have older versions, add a safety check here.
|
||||
MINORVERSION="$(echo $KUBEVERSION | cut -d. -f2)"
|
||||
if [ "$MINORVERSION" -lt 24 ]; then
|
||||
die "Cannot install kubepkgs for versions before 1.24."
|
||||
fi
|
||||
|
||||
pssh "
|
||||
sudo tee /etc/apt/preferences.d/kubernetes <<EOF
|
||||
Package: kubectl kubeadm kubelet
|
||||
Pin: version $KUBEVERSION-*
|
||||
Pin-Priority: 1000
|
||||
EOF"
|
||||
fi
|
||||
|
||||
# As of February 27th, 2023, packages.cloud.google.com seems broken
|
||||
# (serves HTTP 500 errors for the GPG key), so let's pre-load that key.
|
||||
pssh -I "sudo apt-key add -" < lib/kubernetes-apt-key.gpg
|
||||
|
||||
# Install packages
|
||||
pssh --timeout 200 "
|
||||
#curl -s https://packages.cloud.google.com/apt/doc/apt-key.gpg |
|
||||
#sudo apt-key add - &&
|
||||
echo deb http://apt.kubernetes.io/ kubernetes-xenial main |
|
||||
curl -fsSL https://pkgs.k8s.io/core:/stable:/v$KUBEREPOVERSION/deb/Release.key |
|
||||
gpg --dearmor | sudo tee /etc/apt/keyrings/kubernetes-apt-keyring.gpg &&
|
||||
echo 'deb [signed-by=/etc/apt/keyrings/kubernetes-apt-keyring.gpg] https://pkgs.k8s.io/core:/stable:/v$KUBEREPOVERSION/deb/ /' |
|
||||
sudo tee /etc/apt/sources.list.d/kubernetes.list"
|
||||
pssh --timeout 200 "
|
||||
sudo apt-get update -q &&
|
||||
@@ -491,7 +512,7 @@ EOF"
|
||||
sudo apt-mark hold kubelet kubeadm kubectl &&
|
||||
kubeadm completion bash | sudo tee /etc/bash_completion.d/kubeadm &&
|
||||
kubectl completion bash | sudo tee /etc/bash_completion.d/kubectl &&
|
||||
echo 'alias k=kubectl' | sudo tee /etc/bash_completion.d/k &&
|
||||
echo 'alias k=kubecolor' | sudo tee /etc/bash_completion.d/k &&
|
||||
echo 'complete -F __start_kubectl k' | sudo tee -a /etc/bash_completion.d/k"
|
||||
}
|
||||
|
||||
@@ -504,6 +525,7 @@ _cmd_kubeadm() {
|
||||
CLUSTER_CONFIGURATION_KUBERNETESVERSION='kubernetesVersion: "v'$KUBEVERSION'"'
|
||||
IGNORE_SYSTEMVERIFICATION="- SystemVerification"
|
||||
IGNORE_SWAP="- Swap"
|
||||
IGNORE_IPTABLES="- FileContent--proc-sys-net-bridge-bridge-nf-call-iptables"
|
||||
fi
|
||||
|
||||
# Install a valid configuration for containerd
|
||||
@@ -527,6 +549,7 @@ nodeRegistration:
|
||||
- NumCPU
|
||||
$IGNORE_SYSTEMVERIFICATION
|
||||
$IGNORE_SWAP
|
||||
$IGNORE_IPTABLES
|
||||
---
|
||||
kind: JoinConfiguration
|
||||
apiVersion: kubeadm.k8s.io/v1beta3
|
||||
@@ -540,6 +563,7 @@ nodeRegistration:
|
||||
- NumCPU
|
||||
$IGNORE_SYSTEMVERIFICATION
|
||||
$IGNORE_SWAP
|
||||
$IGNORE_IPTABLES
|
||||
---
|
||||
kind: KubeletConfiguration
|
||||
apiVersion: kubelet.config.k8s.io/v1beta1
|
||||
@@ -622,6 +646,31 @@ _cmd_kubetools() {
|
||||
;;
|
||||
esac
|
||||
|
||||
# Install ArgoCD CLI
|
||||
##VERSION## https://github.com/argoproj/argo-cd/releases/latest
|
||||
URL=https://github.com/argoproj/argo-cd/releases/latest/download/argocd-linux-${ARCH}
|
||||
pssh "
|
||||
if [ ! -x /usr/local/bin/argocd ]; then
|
||||
sudo curl -o /usr/local/bin/argocd -fsSL $URL
|
||||
sudo chmod +x /usr/local/bin/argocd
|
||||
argocd completion bash | sudo tee /etc/bash_completion.d/argocd
|
||||
argocd version --client
|
||||
fi"
|
||||
|
||||
# Install Flux CLI
|
||||
##VERSION## https://github.com/fluxcd/flux2/releases
|
||||
FLUX_VERSION=2.3.0
|
||||
FILENAME=flux_${FLUX_VERSION}_linux_${ARCH}
|
||||
URL=https://github.com/fluxcd/flux2/releases/download/v$FLUX_VERSION/$FILENAME.tar.gz
|
||||
pssh "
|
||||
if [ ! -x /usr/local/bin/flux ]; then
|
||||
curl -fsSL $URL |
|
||||
sudo tar -C /usr/local/bin -zx flux
|
||||
sudo chmod +x /usr/local/bin/flux
|
||||
flux completion bash | sudo tee /etc/bash_completion.d/flux
|
||||
flux --version
|
||||
fi"
|
||||
|
||||
# Install kubectx and kubens
|
||||
pssh "
|
||||
set -e
|
||||
@@ -653,7 +702,7 @@ EOF
|
||||
|
||||
# Install stern
|
||||
##VERSION## https://github.com/stern/stern/releases
|
||||
STERN_VERSION=1.22.0
|
||||
STERN_VERSION=1.29.0
|
||||
FILENAME=stern_${STERN_VERSION}_linux_${ARCH}
|
||||
URL=https://github.com/stern/stern/releases/download/v$STERN_VERSION/$FILENAME.tar.gz
|
||||
pssh "
|
||||
@@ -675,7 +724,7 @@ EOF
|
||||
|
||||
# Install kustomize
|
||||
##VERSION## https://github.com/kubernetes-sigs/kustomize/releases
|
||||
KUSTOMIZE_VERSION=v4.5.7
|
||||
KUSTOMIZE_VERSION=v5.4.1
|
||||
URL=https://github.com/kubernetes-sigs/kustomize/releases/download/kustomize/${KUSTOMIZE_VERSION}/kustomize_${KUSTOMIZE_VERSION}_linux_${ARCH}.tar.gz
|
||||
pssh "
|
||||
if [ ! -x /usr/local/bin/kustomize ]; then
|
||||
@@ -706,6 +755,16 @@ EOF
|
||||
aws-iam-authenticator version
|
||||
fi"
|
||||
|
||||
# Install jless (jless.io)
|
||||
pssh "
|
||||
if [ ! -x /usr/local/bin/jless ]; then
|
||||
##VERSION##
|
||||
sudo apt-get install -y libxcb-render0 libxcb-shape0 libxcb-xfixes0
|
||||
wget https://github.com/PaulJuliusMartinez/jless/releases/download/v0.9.0/jless-v0.9.0-x86_64-unknown-linux-gnu.zip
|
||||
unzip jless-v0.9.0-x86_64-unknown-linux-gnu
|
||||
sudo mv jless /usr/local/bin
|
||||
fi"
|
||||
|
||||
# Install the krew package manager
|
||||
pssh "
|
||||
if [ ! -d /home/$USER_LOGIN/.krew ]; then
|
||||
@@ -717,21 +776,31 @@ EOF
|
||||
echo export PATH=/home/$USER_LOGIN/.krew/bin:\\\$PATH | sudo -u $USER_LOGIN tee -a /home/$USER_LOGIN/.bashrc
|
||||
fi"
|
||||
|
||||
# Install kubecolor
|
||||
KUBECOLOR_VERSION=0.4.0
|
||||
URL=https://github.com/kubecolor/kubecolor/releases/download/v${KUBECOLOR_VERSION}/kubecolor_${KUBECOLOR_VERSION}_linux_${ARCH}.tar.gz
|
||||
pssh "
|
||||
if [ ! -x /usr/local/bin/kubecolor ]; then
|
||||
##VERSION##
|
||||
curl -fsSL $URL |
|
||||
sudo tar -C /usr/local/bin -zx kubecolor
|
||||
fi"
|
||||
|
||||
# Install k9s
|
||||
pssh "
|
||||
if [ ! -x /usr/local/bin/k9s ]; then
|
||||
FILENAME=k9s_Linux_$ARCH.tar.gz &&
|
||||
curl -fsSL https://github.com/derailed/k9s/releases/latest/download/\$FILENAME |
|
||||
sudo tar -zxvf- -C /usr/local/bin k9s
|
||||
sudo tar -C /usr/local/bin -zx k9s
|
||||
k9s version
|
||||
fi"
|
||||
|
||||
# Install popeye
|
||||
pssh "
|
||||
if [ ! -x /usr/local/bin/popeye ]; then
|
||||
FILENAME=popeye_Linux_$HERP_DERP_ARCH.tar.gz &&
|
||||
FILENAME=popeye_Linux_$ARCH.tar.gz &&
|
||||
curl -fsSL https://github.com/derailed/popeye/releases/latest/download/\$FILENAME |
|
||||
sudo tar -zxvf- -C /usr/local/bin popeye
|
||||
sudo tar -C /usr/local/bin -zx popeye
|
||||
popeye version
|
||||
fi"
|
||||
|
||||
@@ -741,10 +810,10 @@ EOF
|
||||
# But the install script is not arch-aware (see https://github.com/tilt-dev/tilt/pull/5050).
|
||||
pssh "
|
||||
if [ ! -x /usr/local/bin/tilt ]; then
|
||||
TILT_VERSION=0.22.15
|
||||
TILT_VERSION=0.33.13
|
||||
FILENAME=tilt.\$TILT_VERSION.linux.$TILT_ARCH.tar.gz
|
||||
curl -fsSL https://github.com/tilt-dev/tilt/releases/download/v\$TILT_VERSION/\$FILENAME |
|
||||
sudo tar -zxvf- -C /usr/local/bin tilt
|
||||
sudo tar -C /usr/local/bin -zx tilt
|
||||
tilt completion bash | sudo tee /etc/bash_completion.d/tilt
|
||||
tilt version
|
||||
fi"
|
||||
@@ -786,7 +855,8 @@ EOF
|
||||
fi"
|
||||
|
||||
##VERSION## https://github.com/bitnami-labs/sealed-secrets/releases
|
||||
KUBESEAL_VERSION=0.17.4
|
||||
KUBESEAL_VERSION=0.26.2
|
||||
URL=https://github.com/bitnami-labs/sealed-secrets/releases/download/v${KUBESEAL_VERSION}/kubeseal-${KUBESEAL_VERSION}-linux-${ARCH}.tar.gz
|
||||
#case $ARCH in
|
||||
#amd64) FILENAME=kubeseal-linux-amd64;;
|
||||
#arm64) FILENAME=kubeseal-arm64;;
|
||||
@@ -794,13 +864,13 @@ EOF
|
||||
#esac
|
||||
pssh "
|
||||
if [ ! -x /usr/local/bin/kubeseal ]; then
|
||||
curl -fsSL https://github.com/bitnami-labs/sealed-secrets/releases/download/v$KUBESEAL_VERSION/kubeseal-$KUBESEAL_VERSION-linux-$ARCH.tar.gz |
|
||||
sudo tar -zxvf- -C /usr/local/bin kubeseal
|
||||
curl -fsSL $URL |
|
||||
sudo tar -C /usr/local/bin -zx kubeseal
|
||||
kubeseal --version
|
||||
fi"
|
||||
|
||||
##VERSION## https://github.com/vmware-tanzu/velero/releases
|
||||
VELERO_VERSION=1.11.0
|
||||
VELERO_VERSION=1.13.2
|
||||
pssh "
|
||||
if [ ! -x /usr/local/bin/velero ]; then
|
||||
curl -fsSL https://github.com/vmware-tanzu/velero/releases/download/v$VELERO_VERSION/velero-v$VELERO_VERSION-linux-$ARCH.tar.gz |
|
||||
@@ -810,13 +880,21 @@ EOF
|
||||
fi"
|
||||
|
||||
##VERSION## https://github.com/doitintl/kube-no-trouble/releases
|
||||
KUBENT_VERSION=0.7.0
|
||||
KUBENT_VERSION=0.7.2
|
||||
pssh "
|
||||
if [ ! -x /usr/local/bin/kubent ]; then
|
||||
curl -fsSL https://github.com/doitintl/kube-no-trouble/releases/download/${KUBENT_VERSION}/kubent-${KUBENT_VERSION}-linux-$ARCH.tar.gz |
|
||||
sudo tar -zxvf- -C /usr/local/bin kubent
|
||||
kubent --version
|
||||
fi"
|
||||
|
||||
# Ngrok. Note that unfortunately, this is the x86_64 binary.
|
||||
# We might have to rethink how to handle this for multi-arch environments.
|
||||
pssh "
|
||||
if [ ! -x /usr/local/bin/ngrok ]; then
|
||||
curl -fsSL https://bin.equinox.io/c/bNyj1mQVY4c/ngrok-v3-stable-linux-amd64.tgz |
|
||||
sudo tar -zxvf- -C /usr/local/bin ngrok
|
||||
fi"
|
||||
}
|
||||
|
||||
_cmd kubereset "Wipe out Kubernetes configuration on all nodes"
|
||||
@@ -864,6 +942,15 @@ _cmd_inventory() {
|
||||
FIXME
|
||||
}
|
||||
|
||||
_cmd logins "Show login information for a group of instances"
|
||||
_cmd_logins() {
|
||||
TAG=$1
|
||||
need_tag $TAG
|
||||
|
||||
cat tags/$TAG/logins.jsonl \
|
||||
| jq -r '"\(.password)\tssh -l \(.login)\(if .port then " -p \(.port)" else "" end)\t\(.ipaddrs)"'
|
||||
}
|
||||
|
||||
_cmd maketag "Generate a quasi-unique tag for a group of instances"
|
||||
_cmd_maketag() {
|
||||
if [ -z $USER ]; then
|
||||
@@ -914,6 +1001,9 @@ _cmd_stage2() {
|
||||
cd tags/$TAG/stage2
|
||||
terraform init -upgrade
|
||||
terraform apply -auto-approve
|
||||
terraform output -raw logins_jsonl > ../logins.jsonl
|
||||
terraform output -raw ips_txt > ../ips.txt
|
||||
echo "stage2_ok" > status
|
||||
}
|
||||
|
||||
_cmd standardize "Deal with non-standard Ubuntu cloud images"
|
||||
@@ -950,12 +1040,19 @@ _cmd_standardize() {
|
||||
# Disable unattended upgrades so that they don't mess up with the subsequent steps
|
||||
pssh sudo rm -f /etc/apt/apt.conf.d/50unattended-upgrades
|
||||
|
||||
# Digital Ocean's cloud init disables password authentication; re-enable it.
|
||||
# Some cloud providers think that it's smart to disable password authentication.
|
||||
# We need to re-neable it, though.
|
||||
# Digital Ocecan
|
||||
pssh "
|
||||
if [ -f /etc/ssh/sshd_config.d/50-cloud-init.conf ]; then
|
||||
sudo rm /etc/ssh/sshd_config.d/50-cloud-init.conf
|
||||
sudo systemctl restart ssh.service
|
||||
fi"
|
||||
# AWS
|
||||
pssh "if [ -f /etc/ssh/sshd_config.d/60-cloudimg-settings.conf ]; then
|
||||
sudo rm /etc/ssh/sshd_config.d/60-cloudimg-settings.conf
|
||||
sudo systemctl restart ssh.service
|
||||
fi"
|
||||
|
||||
# Special case for oracle since their iptables blocks everything but SSH
|
||||
pssh "
|
||||
@@ -991,11 +1088,12 @@ _cmd_tailhist () {
|
||||
# halfway through and we're actually trying to download it again.
|
||||
pssh "
|
||||
set -e
|
||||
sudo apt-get install unzip -y
|
||||
wget -c https://github.com/joewalnes/websocketd/releases/download/v0.3.0/websocketd-0.3.0-linux_$ARCH.zip
|
||||
unzip websocketd-0.3.0-linux_$ARCH.zip websocketd
|
||||
sudo mv websocketd /usr/local/bin/websocketd
|
||||
sudo mkdir -p /tmp/tailhist
|
||||
sudo tee /root/tailhist.service <<EOF
|
||||
sudo mkdir -p /opt/tailhist
|
||||
sudo tee /opt/tailhist.service <<EOF
|
||||
[Unit]
|
||||
Description=tailhist
|
||||
|
||||
@@ -1003,16 +1101,16 @@ Description=tailhist
|
||||
WantedBy=multi-user.target
|
||||
|
||||
[Service]
|
||||
WorkingDirectory=/tmp/tailhist
|
||||
WorkingDirectory=/opt/tailhist
|
||||
ExecStart=/usr/local/bin/websocketd --port=1088 --staticdir=. sh -c \"tail -n +1 -f /home/$USER_LOGIN/.history || echo 'Could not read history file. Perhaps you need to \\\"chmod +r .history\\\"?'\"
|
||||
User=nobody
|
||||
Group=nogroup
|
||||
Restart=always
|
||||
EOF
|
||||
sudo systemctl enable /root/tailhist.service --now
|
||||
sudo systemctl enable /opt/tailhist.service --now
|
||||
"
|
||||
|
||||
pssh -I sudo tee /tmp/tailhist/index.html <lib/tailhist.html
|
||||
pssh -I sudo tee /opt/tailhist/index.html <lib/tailhist.html
|
||||
}
|
||||
|
||||
_cmd tools "Install a bunch of useful tools (editors, git, jq...)"
|
||||
@@ -1085,8 +1183,8 @@ _cmd_tags() {
|
||||
cd tags
|
||||
echo "[#] [Status] [Tag] [Mode] [Provider]"
|
||||
for tag in *; do
|
||||
if [ -f $tag/ips.txt ]; then
|
||||
count="$(wc -l < $tag/ips.txt)"
|
||||
if [ -f $tag/logins.jsonl ]; then
|
||||
count="$(wc -l < $tag/logins.jsonl)"
|
||||
else
|
||||
count="?"
|
||||
fi
|
||||
@@ -1249,7 +1347,7 @@ EOF"
|
||||
_cmd www "Run a web server to access card HTML and PDF"
|
||||
_cmd_www() {
|
||||
cd www
|
||||
IPADDR=$(curl -sL canihazip.com/s)
|
||||
IPADDR=$(curl -fsSL canihazip.com/s || echo localhost)
|
||||
info "The following files are available:"
|
||||
for F in *; do
|
||||
echo "http://$IPADDR:8000/$F"
|
||||
|
||||
Binary file not shown.
@@ -1,32 +1,22 @@
|
||||
#!/usr/bin/env python3
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import yaml
|
||||
import jinja2
|
||||
|
||||
|
||||
# Read settings from user-provided settings file
|
||||
context = yaml.safe_load(open(sys.argv[1]))
|
||||
|
||||
ips = list(open("ips.txt"))
|
||||
clustersize = context["clustersize"]
|
||||
context["logins"] = []
|
||||
for line in open("logins.jsonl"):
|
||||
if line.strip():
|
||||
context["logins"].append(json.loads(line))
|
||||
|
||||
print("---------------------------------------------")
|
||||
print(" Number of IPs: {}".format(len(ips)))
|
||||
print(" VMs per cluster: {}".format(clustersize))
|
||||
print(" Number of cards: {}".format(len(context["logins"])))
|
||||
print("---------------------------------------------")
|
||||
|
||||
assert len(ips)%clustersize == 0
|
||||
|
||||
clusters = []
|
||||
|
||||
while ips:
|
||||
cluster = ips[:clustersize]
|
||||
ips = ips[clustersize:]
|
||||
clusters.append(cluster)
|
||||
|
||||
context["clusters"] = clusters
|
||||
|
||||
template_file_name = context["cards_template"]
|
||||
template_file_path = os.path.join(
|
||||
os.path.dirname(__file__),
|
||||
@@ -35,23 +25,23 @@ template_file_path = os.path.join(
|
||||
template_file_name
|
||||
)
|
||||
template = jinja2.Template(open(template_file_path).read())
|
||||
with open("ips.html", "w") as f:
|
||||
f.write(template.render(**context))
|
||||
print("Generated ips.html")
|
||||
with open("cards.html", "w") as f:
|
||||
f.write(template.render(**context))
|
||||
print("Generated cards.html")
|
||||
|
||||
|
||||
try:
|
||||
import pdfkit
|
||||
paper_size = context["paper_size"]
|
||||
margin = {"A4": "0.5cm", "Letter": "0.2in"}[paper_size]
|
||||
with open("ips.html") as f:
|
||||
pdfkit.from_file(f, "ips.pdf", options={
|
||||
with open("cards.html") as f:
|
||||
pdfkit.from_file(f, "cards.pdf", options={
|
||||
"page-size": paper_size,
|
||||
"margin-top": margin,
|
||||
"margin-bottom": margin,
|
||||
"margin-left": margin,
|
||||
"margin-right": margin,
|
||||
})
|
||||
print("Generated ips.pdf")
|
||||
print("Generated cards.pdf")
|
||||
except ImportError:
|
||||
print("WARNING: could not import pdfkit; did not generate ips.pdf")
|
||||
print("WARNING: could not import pdfkit; did not generate cards.pdf")
|
||||
@@ -17,6 +17,12 @@ pssh() {
|
||||
|
||||
echo "[parallel-ssh] $@"
|
||||
|
||||
# There are some routers that really struggle with the number of TCP
|
||||
# connections that we open when deploying large fleets of clusters.
|
||||
# We're adding a 1 second delay here, but this can be cranked up if
|
||||
# necessary - or down to zero, too.
|
||||
sleep ${PSSH_DELAY_PRE-1}
|
||||
|
||||
$(which pssh || which parallel-ssh) -h $HOSTFILE -l ubuntu \
|
||||
--par ${PSSH_PARALLEL_CONNECTIONS-100} \
|
||||
--timeout 300 \
|
||||
|
||||
16
prepare-labs/map-dns.sh
Executable file
16
prepare-labs/map-dns.sh
Executable file
@@ -0,0 +1,16 @@
|
||||
#!/bin/sh
|
||||
|
||||
DOMAINS=domains.txt
|
||||
IPS=ips.txt
|
||||
|
||||
. ./dns-cloudflare.sh
|
||||
|
||||
paste "$DOMAINS" "$IPS" | while read domain ips; do
|
||||
if ! [ "$domain" ]; then
|
||||
echo "⚠️ No more domains!"
|
||||
exit 1
|
||||
fi
|
||||
_clear_zone "$domain"
|
||||
_populate_zone "$domain" $ips
|
||||
done
|
||||
echo "✅ All done."
|
||||
@@ -7,7 +7,7 @@ USER_PASSWORD=training
|
||||
|
||||
# For a list of old versions, check:
|
||||
# https://kubernetes.io/releases/patch-releases/#non-active-branch-history
|
||||
KUBEVERSION=1.22.5
|
||||
KUBEVERSION=1.28.9
|
||||
|
||||
STEPS="
|
||||
wait
|
||||
|
||||
6
prepare-labs/settings/konk.env
Normal file
6
prepare-labs/settings/konk.env
Normal file
@@ -0,0 +1,6 @@
|
||||
CLUSTERSIZE=5
|
||||
|
||||
USER_LOGIN=k8s
|
||||
USER_PASSWORD=
|
||||
|
||||
STEPS="stage2"
|
||||
@@ -1,5 +1,3 @@
|
||||
CLUSTERSIZE=2
|
||||
|
||||
USER_LOGIN=k8s
|
||||
USER_PASSWORD=
|
||||
|
||||
|
||||
@@ -1,5 +1,7 @@
|
||||
#export TF_VAR_node_size=GP2.4
|
||||
#export TF_VAR_node_size=g6-standard-6
|
||||
#export TF_VAR_node_size=m7i.xlarge
|
||||
|
||||
|
||||
CLUSTERSIZE=1
|
||||
|
||||
|
||||
@@ -7,7 +7,7 @@
|
||||
{%- set url = url
|
||||
| default("http://FIXME.container.training/") -%}
|
||||
{%- set pagesize = pagesize
|
||||
| default(9) -%}
|
||||
| default(10) -%}
|
||||
{%- set lang = lang
|
||||
| default("en") -%}
|
||||
{%- set event = event
|
||||
@@ -15,79 +15,36 @@
|
||||
{%- set backside = backside
|
||||
| default(False) -%}
|
||||
{%- set image = image
|
||||
| default("kube") -%}
|
||||
| default(False) -%}
|
||||
{%- set clusternumber = clusternumber
|
||||
| default(None) -%}
|
||||
{%- if qrcode == True -%}
|
||||
{%- set qrcode = "https://container.training/q" -%}
|
||||
{%- elif qrcode -%}
|
||||
{%- set qrcode = qrcode -%}
|
||||
{%- endif -%}
|
||||
{%- set thing = thing
|
||||
| default("lab environment") -%}
|
||||
|
||||
{# You can also set img_bottom_src instead. #}
|
||||
{%- set img_logo_src = {
|
||||
"docker": "https://s3-us-west-2.amazonaws.com/www.breadware.com/integrations/docker.png",
|
||||
"swarm": "https://cdn.wp.nginx.com/wp-content/uploads/2016/07/docker-swarm-hero2.png",
|
||||
"kube": "https://avatars1.githubusercontent.com/u/13629408",
|
||||
"enix": "https://enix.io/static/img/logos/logo-domain-cropped.png",
|
||||
}[image] -%}
|
||||
{%- if lang == "en" and clustersize == 1 -%}
|
||||
{%- set intro -%}
|
||||
Here is the connection information to your very own
|
||||
machine for this {{ event }}.
|
||||
You can connect to this VM with any SSH client.
|
||||
{%- endset -%}
|
||||
{%- set listhead -%}
|
||||
Your machine is:
|
||||
{%- endset -%}
|
||||
{%- endif -%}
|
||||
{%- if lang == "en" and clustersize != 1 -%}
|
||||
{%- set intro -%}
|
||||
Here is the connection information to your very own
|
||||
cluster for this {{ event }}.
|
||||
You can connect to each VM with any SSH client.
|
||||
{%- endset -%}
|
||||
{%- set listhead -%}
|
||||
Your machines are:
|
||||
{%- endset -%}
|
||||
{%- endif -%}
|
||||
{%- if lang == "fr" and clustersize == 1 -%}
|
||||
{%- set intro -%}
|
||||
Voici les informations permettant de se connecter à votre
|
||||
machine pour cette formation.
|
||||
Vous pouvez vous connecter à cette machine virtuelle
|
||||
avec n'importe quel client SSH.
|
||||
{%- endset -%}
|
||||
{%- set listhead -%}
|
||||
Adresse IP:
|
||||
{%- endset -%}
|
||||
{%- endif -%}
|
||||
{%- if lang == "en" and clusterprefix != "node" -%}
|
||||
{%- set intro -%}
|
||||
Here is the connection information for the
|
||||
<strong>{{ clusterprefix }}</strong> environment.
|
||||
{%- endset -%}
|
||||
{%- endif -%}
|
||||
{%- if lang == "fr" and clustersize != 1 -%}
|
||||
{%- set intro -%}
|
||||
Voici les informations permettant de se connecter à votre
|
||||
cluster pour cette formation.
|
||||
Vous pouvez vous connecter à chaque machine virtuelle
|
||||
avec n'importe quel client SSH.
|
||||
{%- endset -%}
|
||||
{%- set listhead -%}
|
||||
Adresses IP:
|
||||
{%- endset -%}
|
||||
{%- endif -%}
|
||||
{%- if lang == "en" -%}
|
||||
{%- set slides_are_at -%}
|
||||
You can find the slides at:
|
||||
{%- endset -%}
|
||||
{%- if lang == "en" -%}
|
||||
{%- set intro -%}
|
||||
Here is the connection information to your very own
|
||||
{{ thing }} for this {{ event }}.
|
||||
You can connect to it with any SSH client.
|
||||
{%- endset -%}
|
||||
{%- endif -%}
|
||||
{%- if lang == "fr" -%}
|
||||
{%- set slides_are_at -%}
|
||||
Le support de formation est à l'adresse suivante :
|
||||
{%- endset -%}
|
||||
{%- set intro -%}
|
||||
Voici les informations permettant de se connecter à votre
|
||||
{{ thing }} pour cette formation.
|
||||
Vous pouvez vous y connecter
|
||||
avec n'importe quel client SSH.
|
||||
{%- endset -%}
|
||||
{%- endif -%}
|
||||
{%- if lang == "en" -%}
|
||||
{%- set slides_are_at -%}
|
||||
You can find the slides at:
|
||||
{%- endset -%}
|
||||
{%- endif -%}
|
||||
{%- if lang == "fr" -%}
|
||||
{%- set slides_are_at -%}
|
||||
Le support de formation est à l'adresse suivante :
|
||||
{%- endset -%}
|
||||
{%- endif -%}
|
||||
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
|
||||
<html>
|
||||
@@ -102,25 +59,21 @@
|
||||
}
|
||||
body {
|
||||
/* this is A4 minus 0.5cm margins */
|
||||
width: 20cm;
|
||||
height: 28.7cm;
|
||||
width: 20cm;
|
||||
height: 28.7cm;
|
||||
}
|
||||
{% elif paper_size == "Letter" %}
|
||||
@page {
|
||||
size: Letter;
|
||||
margin: 0.2in;
|
||||
size: Letter; /* 8.5in x 11in */
|
||||
}
|
||||
body {
|
||||
/* this is Letter minus 0.2in margins */
|
||||
width: 8.6in;
|
||||
heigth: 10.6in;
|
||||
width: 6.75in; /* two cards wide */
|
||||
margin-left: 0.875in; /* (8.5in - 6.75in)/2 */
|
||||
margin-top: 0; /* NOTE: we have to manually specify a top margin of e.g. 0.1875in when printing */
|
||||
}
|
||||
{% endif %}
|
||||
|
||||
|
||||
body, table {
|
||||
margin: 0;
|
||||
padding: 0;
|
||||
line-height: 1em;
|
||||
font-size: 15px;
|
||||
font-family: 'Slabo 27px';
|
||||
@@ -134,47 +87,45 @@ table {
|
||||
padding-left: 0.4em;
|
||||
}
|
||||
|
||||
div {
|
||||
td:first-child {
|
||||
width: 10.5em;
|
||||
}
|
||||
|
||||
div.card {
|
||||
float: left;
|
||||
border: 1px dotted black;
|
||||
{% if backside %}
|
||||
height: 33%;
|
||||
{% endif %}
|
||||
/* columns * (width+left+right) < 100% */
|
||||
border: 0.01in dotted black;
|
||||
/*
|
||||
width: 24.8%;
|
||||
columns * (width+left+right) < 100%
|
||||
height: 33%;
|
||||
width: 24.8%;
|
||||
width: 33%;
|
||||
*/
|
||||
/**/
|
||||
width: 33%;
|
||||
/**/
|
||||
width: 3.355in; /* 3.375in minus two 0.01in borders */
|
||||
height: 2.105in; /* 2.125in minus two 0.01in borders */
|
||||
}
|
||||
|
||||
p {
|
||||
margin: 0.8em;
|
||||
}
|
||||
|
||||
div.back {
|
||||
border: 1px dotted grey;
|
||||
div.front {
|
||||
{% if image %}
|
||||
background-image: url("{{ image }}");
|
||||
background-repeat: no-repeat;
|
||||
background-size: 1in;
|
||||
background-position-x: 2.8in;
|
||||
background-position-y: center;
|
||||
{% endif %}
|
||||
}
|
||||
|
||||
span.scale {
|
||||
white-space: nowrap;
|
||||
}
|
||||
|
||||
img.logo {
|
||||
height: 4.5em;
|
||||
float: right;
|
||||
}
|
||||
|
||||
img.bottom {
|
||||
height: 2.5em;
|
||||
display: block;
|
||||
margin: 0.5em auto;
|
||||
white-space: nowrap;
|
||||
}
|
||||
|
||||
.qrcode img {
|
||||
width: 40%;
|
||||
margin: 1em;
|
||||
height: 5.8em;
|
||||
padding: 1em 1em 0.5em 1em;
|
||||
float: left;
|
||||
}
|
||||
|
||||
.logpass {
|
||||
@@ -189,101 +140,97 @@ img.bottom {
|
||||
height: 0;
|
||||
}
|
||||
</style>
|
||||
<script type="text/javascript" src="https://cdn.rawgit.com/davidshimjs/qrcodejs/gh-pages/qrcode.min.js"></script>
|
||||
<script type="text/javascript" src="qrcode.min.js"></script>
|
||||
<script type="text/javascript">
|
||||
function qrcodes() {
|
||||
[].forEach.call(
|
||||
document.getElementsByClassName("qrcode"),
|
||||
(e, index) => {
|
||||
new QRCode(e, {
|
||||
text: "{{ qrcode }}",
|
||||
correctLevel: QRCode.CorrectLevel.L
|
||||
});
|
||||
}
|
||||
);
|
||||
[].forEach.call(
|
||||
document.getElementsByClassName("qrcode"),
|
||||
(e, index) => {
|
||||
new QRCode(e, {
|
||||
text: "{{ qrcode }}",
|
||||
correctLevel: QRCode.CorrectLevel.L
|
||||
});
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
function scale() {
|
||||
[].forEach.call(
|
||||
document.getElementsByClassName("scale"),
|
||||
(e, index) => {
|
||||
var text_width = e.getBoundingClientRect().width;
|
||||
var box_width = e.parentElement.getBoundingClientRect().width;
|
||||
var percent = 100 * box_width / text_width + "%";
|
||||
e.style.fontSize = percent;
|
||||
}
|
||||
);
|
||||
[].forEach.call(
|
||||
document.getElementsByClassName("scale"),
|
||||
(e, index) => {
|
||||
var text_width = e.getBoundingClientRect().width;
|
||||
var box_width = e.parentElement.getBoundingClientRect().width;
|
||||
var percent = 100 * box_width / text_width + "%";
|
||||
e.style.fontSize = percent;
|
||||
}
|
||||
);
|
||||
}
|
||||
</script>
|
||||
</head>
|
||||
<body onload="qrcodes(); scale();">
|
||||
{% for cluster in clusters %}
|
||||
<div>
|
||||
{% for login in logins %}
|
||||
<div class="card front">
|
||||
<p>{{ intro }}</p>
|
||||
<p>
|
||||
{% if img_logo_src %}
|
||||
<img class="logo" src="{{ img_logo_src }}" />
|
||||
{% endif %}
|
||||
<table>
|
||||
{% if clusternumber != None %}
|
||||
<tr><td>cluster:</td></tr>
|
||||
<tr><td class="logpass">{{ clusternumber + loop.index }}</td></tr>
|
||||
{% endif %}
|
||||
<tr><td>login:</td></tr>
|
||||
<tr><td class="logpass">{{ user_login }}</td></tr>
|
||||
<tr><td>password:</td></tr>
|
||||
<tr><td class="logpass">{{ user_password }}</td></tr>
|
||||
</table>
|
||||
|
||||
</p>
|
||||
<p>
|
||||
{{ listhead }}
|
||||
<table>
|
||||
{% for node in cluster %}
|
||||
<tr>
|
||||
<td>{{ clusterprefix }}{{ loop.index }}:</td>
|
||||
<td>{{ node }}</td>
|
||||
</tr>
|
||||
{% endfor %}
|
||||
<tr>
|
||||
<td>login:</td>
|
||||
<td>password:</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td class="logpass">{{ login.login }}</td>
|
||||
<td class="logpass">{{ login.password }}</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>IP address:</td>
|
||||
{% if login.port %}
|
||||
<td>port:</td>
|
||||
{% endif %}
|
||||
</tr>
|
||||
<tr>
|
||||
<td class="logpass">{{ login.ipaddrs.split("\t")[0] }}</td>
|
||||
{% if login.port %}
|
||||
<td class="logpass">{{ login.port }}</td>
|
||||
{% endif %}
|
||||
</tr>
|
||||
</table>
|
||||
</p>
|
||||
|
||||
<p>
|
||||
{% if url %}
|
||||
{{ slides_are_at }}
|
||||
{{ slides_are_at }}
|
||||
<p>
|
||||
<span class="scale">{{ url }}</span>
|
||||
</p>
|
||||
{% endif %}
|
||||
{% if img_bottom_src %}
|
||||
<img class="bottom" src="{{ img_bottom_src }}" />
|
||||
{% endif %}
|
||||
</p>
|
||||
</div>
|
||||
{% if loop.index%pagesize==0 or loop.last %}
|
||||
<span class="pagebreak"></span>
|
||||
{% if backside %}
|
||||
{% for x in range(pagesize) %}
|
||||
<div class="back">
|
||||
<p>Thanks for attending
|
||||
"Getting Started With Kubernetes and Container Orchestration"
|
||||
during CONFERENCE in Month YYYY!</p>
|
||||
<p>If you liked that workshop,
|
||||
I can train your team, in person or
|
||||
online, with custom courses of
|
||||
any length and any level.
|
||||
</p>
|
||||
{% if qrcode %}
|
||||
<p>If you're interested, please scan that QR code to contact me:</p>
|
||||
<span class="qrcode"></span>
|
||||
{% for x in range(pagesize) %}
|
||||
<div class="card back">
|
||||
{{ backside }}
|
||||
{#
|
||||
<p>Thanks for attending
|
||||
"Getting Started With Kubernetes and Container Orchestration"
|
||||
during CONFERENCE in Month YYYY!</p>
|
||||
<p>If you liked that workshop,
|
||||
I can train your team, in person or
|
||||
online, with custom courses of
|
||||
any length and any level.
|
||||
</p>
|
||||
{% if qrcode %}
|
||||
<p>If you're interested, please scan that QR code to contact me:</p>
|
||||
<span class="qrcode"></span>
|
||||
{% else %}
|
||||
<p>If you're interested, you can contact me at:</p>
|
||||
{% endif %}
|
||||
<p>jerome.petazzoni@gmail.com</p>
|
||||
</div>
|
||||
{% endfor %}
|
||||
<span class="pagebreak"></span>
|
||||
{% endif %}
|
||||
<p>If you're interested, you can contact me at:</p>
|
||||
{% endif %}
|
||||
<p>jerome.petazzoni@gmail.com</p>
|
||||
#}
|
||||
</div>
|
||||
{% endfor %}
|
||||
<span class="pagebreak"></span>
|
||||
{% endif %}
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
</body>
|
||||
|
||||
18
prepare-labs/templates/cards.yaml
Normal file
18
prepare-labs/templates/cards.yaml
Normal file
@@ -0,0 +1,18 @@
|
||||
cards_template: cards.html
|
||||
paper_size: Letter
|
||||
url: https://2024-11-qconsf.container.training
|
||||
event: workshop
|
||||
backside: |
|
||||
<div class="qrcode"></div>
|
||||
<p>
|
||||
Thanks for attending the Asynchronous Architecture Patterns workshop at QCON!
|
||||
</p>
|
||||
<p>
|
||||
If you'd like me to send you a copy of the recording of the workshop
|
||||
and of the training materials,
|
||||
please scan that QR code to leave me your
|
||||
contact information. Thank you!
|
||||
</p>
|
||||
qrcode: https://2024-11-qconsf.container.training/q
|
||||
thing: Kubernetes cluster
|
||||
image: logo-bento.svg
|
||||
@@ -8,8 +8,8 @@ resource "random_string" "_" {
|
||||
resource "time_static" "_" {}
|
||||
|
||||
locals {
|
||||
min_nodes_per_pool = var.nodes_per_cluster
|
||||
max_nodes_per_pool = var.nodes_per_cluster * 2
|
||||
min_nodes_per_pool = var.min_nodes_per_cluster
|
||||
max_nodes_per_pool = var.max_nodes_per_cluster
|
||||
timestamp = formatdate("YYYY-MM-DD-hh-mm", time_static._.rfc3339)
|
||||
tag = random_string._.result
|
||||
# Common tags to be assigned to all resources
|
||||
|
||||
@@ -217,16 +217,27 @@ resource "kubernetes_certificate_signing_request_v1" "cluster_admin_${index}" {
|
||||
|
||||
%{ endfor ~}
|
||||
|
||||
output "ip_addresses_of_nodes" {
|
||||
output "ips_txt" {
|
||||
value = join("\n", [
|
||||
%{ for index, cluster in clusters ~}
|
||||
join("\t", concat(
|
||||
[
|
||||
random_string.shpod_${index}.result,
|
||||
"ssh -l k8s -p $${kubernetes_service.shpod_${index}.spec[0].port[0].node_port}"
|
||||
],
|
||||
join("\n", concat(
|
||||
split(" ", file("./externalips.${index}"))
|
||||
)),
|
||||
%{ endfor ~}
|
||||
""
|
||||
])
|
||||
}
|
||||
|
||||
output "logins_jsonl" {
|
||||
value = join("\n", [
|
||||
%{ for index, cluster in clusters ~}
|
||||
jsonencode({
|
||||
login = "k8s",
|
||||
password = random_string.shpod_${index}.result,
|
||||
port = kubernetes_service.shpod_${index}.spec[0].port[0].node_port,
|
||||
ipaddrs = replace(file("./externalips.${index}"), " ", "\t"),
|
||||
}),
|
||||
%{ endfor ~}
|
||||
""
|
||||
])
|
||||
}
|
||||
|
||||
@@ -7,11 +7,16 @@ variable "how_many_clusters" {
|
||||
default = 2
|
||||
}
|
||||
|
||||
variable "nodes_per_cluster" {
|
||||
variable "min_nodes_per_cluster" {
|
||||
type = number
|
||||
default = 2
|
||||
}
|
||||
|
||||
variable "max_nodes_per_cluster" {
|
||||
type = number
|
||||
default = 4
|
||||
}
|
||||
|
||||
variable "node_size" {
|
||||
type = string
|
||||
default = "M"
|
||||
|
||||
@@ -1,10 +1,23 @@
|
||||
resource "scaleway_vpc_private_network" "_" {
|
||||
}
|
||||
|
||||
# This is a kind of hack to use a custom security group with Kapsulse.
|
||||
# See https://www.scaleway.com/en/docs/containers/kubernetes/reference-content/secure-cluster-with-private-network/
|
||||
|
||||
resource "scaleway_instance_security_group" "_" {
|
||||
name = "kubernetes ${split("/", scaleway_k8s_cluster._.id)[1]}"
|
||||
inbound_default_policy = "accept"
|
||||
outbound_default_policy = "accept"
|
||||
}
|
||||
|
||||
resource "scaleway_k8s_cluster" "_" {
|
||||
name = var.cluster_name
|
||||
#region = var.location
|
||||
name = var.cluster_name
|
||||
tags = var.common_tags
|
||||
version = local.k8s_version
|
||||
type = "kapsule"
|
||||
cni = "cilium"
|
||||
delete_additional_resources = true
|
||||
private_network_id = scaleway_vpc_private_network._.id
|
||||
}
|
||||
|
||||
resource "scaleway_k8s_pool" "_" {
|
||||
@@ -17,6 +30,7 @@ resource "scaleway_k8s_pool" "_" {
|
||||
max_size = var.max_nodes_per_pool
|
||||
autoscaling = var.max_nodes_per_pool > var.min_nodes_per_pool
|
||||
autohealing = true
|
||||
depends_on = [ scaleway_instance_security_group._ ]
|
||||
}
|
||||
|
||||
data "scaleway_k8s_version" "_" {
|
||||
|
||||
@@ -4,6 +4,7 @@ resource "helm_release" "_" {
|
||||
create_namespace = true
|
||||
repository = "https://charts.loft.sh"
|
||||
chart = "vcluster"
|
||||
version = "0.19.7"
|
||||
set {
|
||||
name = "service.type"
|
||||
value = "NodePort"
|
||||
|
||||
@@ -14,9 +14,9 @@ $ hcloud server-type list | grep shared
|
||||
variable "node_sizes" {
|
||||
type = map(any)
|
||||
default = {
|
||||
S = "cx11"
|
||||
M = "cx21"
|
||||
L = "cx31"
|
||||
S = "cpx11"
|
||||
M = "cpx21"
|
||||
L = "cpx31"
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -71,10 +71,10 @@ resource "local_file" "ip_addresses" {
|
||||
resource "local_file" "clusters" {
|
||||
content = join("", formatlist("%s\n", [
|
||||
for cid in range(1, 1 + var.how_many_clusters) :
|
||||
join(" ",
|
||||
join("\t",
|
||||
[for nid in range(1, 1 + var.nodes_per_cluster) :
|
||||
local.ip_addresses[format("c%03dn%03d", cid, nid)]
|
||||
])]))
|
||||
filename = "clusters.txt"
|
||||
filename = "clusters.tsv"
|
||||
file_permission = "0600"
|
||||
}
|
||||
|
||||
@@ -13,7 +13,7 @@ data "openstack_images_image_v2" "_" {
|
||||
most_recent = true
|
||||
properties = {
|
||||
os = "ubuntu"
|
||||
version = "22.04"
|
||||
version = "24.04"
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
555
prepare-labs/www/logo-bento.svg
Normal file
555
prepare-labs/www/logo-bento.svg
Normal file
File diff suppressed because one or more lines are too long
|
After Width: | Height: | Size: 81 KiB |
BIN
prepare-labs/www/logo-kubernetes.png
Normal file
BIN
prepare-labs/www/logo-kubernetes.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 31 KiB |
1
prepare-labs/www/qrcode.min.js
vendored
Normal file
1
prepare-labs/www/qrcode.min.js
vendored
Normal file
File diff suppressed because one or more lines are too long
@@ -2,7 +2,7 @@
|
||||
#/ /kube-halfday.yml.html 200!
|
||||
#/ /kube-fullday.yml.html 200!
|
||||
#/ /kube-twodays.yml.html 200!
|
||||
/ /dojo.yml.html 200!
|
||||
/ /mq.yml.html 200!
|
||||
|
||||
# And this allows to do "git clone https://container.training".
|
||||
/info/refs service=git-upload-pack https://github.com/jpetazzo/container.training/info/refs?service=git-upload-pack
|
||||
@@ -13,14 +13,12 @@
|
||||
#/kubernetesmastery https://www.udemy.com/course/kubernetesmastery/?couponCode=DOCKERALLDAY
|
||||
|
||||
# Shortlink for the QRCode
|
||||
/q /qrcode.html 200
|
||||
/q https://docs.google.com/forms/d/e/1FAIpQLScYloWur4uVhKgVNIdUrfHZ8pk_mBmPcQwmbhjK2FlR9KWDCA/viewform
|
||||
|
||||
# Shortlinks for next training in English and French
|
||||
#/next https://www.eventbrite.com/e/livestream-intensive-kubernetes-bootcamp-tickets-103262336428
|
||||
/next https://skillsmatter.com/courses/700-advanced-kubernetes-concepts-workshop-jerome-petazzoni
|
||||
/next https://qconsf.com/training/nov2024/asynchronous-architecture-patterns-scale-ml-and-other-high-latency-workloads
|
||||
/hi5 https://enix.io/fr/services/formation/online/
|
||||
/us https://www.ardanlabs.com/live-training-events/deploying-microservices-and-traditional-applications-with-kubernetes-march-28-2022.html
|
||||
/uk https://skillsmatter.com/workshops/827-deploying-microservices-and-traditional-applications-with-kubernetes-with-jerome-petazzoni
|
||||
|
||||
# Survey form
|
||||
/please https://docs.google.com/forms/d/e/1FAIpQLSfIYSgrV7tpfBNm1hOaprjnBHgWKn5n-k5vtNXYJkOX1sRxng/viewform
|
||||
|
||||
814
slides/autopilot/package-lock.json
generated
814
slides/autopilot/package-lock.json
generated
File diff suppressed because it is too large
Load Diff
@@ -2,8 +2,8 @@
|
||||
"name": "container-training-pub-sub-server",
|
||||
"version": "0.0.1",
|
||||
"dependencies": {
|
||||
"express": "^4.16.2",
|
||||
"socket.io": "^4.6.1",
|
||||
"socket.io-client": "^4.5.1"
|
||||
"express": "^4.21.1",
|
||||
"socket.io": "^4.8.0",
|
||||
"socket.io-client": "^4.7.5"
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,5 +1,3 @@
|
||||
version: "2"
|
||||
|
||||
services:
|
||||
www:
|
||||
image: nginx
|
||||
@@ -40,7 +40,7 @@
|
||||
|
||||
- In multi-stage builds, all stages can be built in parallel
|
||||
|
||||
(example: https://github.com/jpetazzo/shpod; [before] and [after])
|
||||
(example: https://github.com/jpetazzo/shpod; [before][shpod-before-parallel] and [after][shpod-after-parallel])
|
||||
|
||||
- Stages are built only when they are necessary
|
||||
|
||||
@@ -50,8 +50,8 @@
|
||||
|
||||
- Files are cached in the builder
|
||||
|
||||
[before]: https://github.com/jpetazzo/shpod/blob/c6efedad6d6c3dc3120dbc0ae0a6915f85862474/Dockerfile
|
||||
[after]: https://github.com/jpetazzo/shpod/blob/d20887bbd56b5fcae2d5d9b0ce06cae8887caabf/Dockerfile
|
||||
[shpod-before-parallel]: https://github.com/jpetazzo/shpod/blob/c6efedad6d6c3dc3120dbc0ae0a6915f85862474/Dockerfile
|
||||
[shpod-after-parallel]: https://github.com/jpetazzo/shpod/blob/d20887bbd56b5fcae2d5d9b0ce06cae8887caabf/Dockerfile
|
||||
|
||||
---
|
||||
|
||||
@@ -121,10 +121,10 @@ docker buildx build … \
|
||||
|
||||
- Must not use binary downloads with hard-coded architectures!
|
||||
|
||||
(streamlining a Dockerfile for multi-arch: [before], [after])
|
||||
(streamlining a Dockerfile for multi-arch: [before][shpod-before-multiarch], [after][shpod-after-multiarch])
|
||||
|
||||
[before]: https://github.com/jpetazzo/shpod/blob/d20887bbd56b5fcae2d5d9b0ce06cae8887caabf/Dockerfile
|
||||
[after]: https://github.com/jpetazzo/shpod/blob/c50789e662417b34fea6f5e1d893721d66d265b7/Dockerfile
|
||||
[shpod-before-multiarch]: https://github.com/jpetazzo/shpod/blob/d20887bbd56b5fcae2d5d9b0ce06cae8887caabf/Dockerfile
|
||||
[shpod-after-multiarch]: https://github.com/jpetazzo/shpod/blob/c50789e662417b34fea6f5e1d893721d66d265b7/Dockerfile
|
||||
|
||||
---
|
||||
|
||||
|
||||
@@ -120,11 +120,11 @@ class: extra-details
|
||||
|
||||
(and won't end up in the resulting image)
|
||||
|
||||
- See the [documentation] for the little details
|
||||
- See the [documentation][dockerignore] for the little details
|
||||
|
||||
(exceptions can be made with `!`, multiple directory levels with `**`...)
|
||||
|
||||
[documentation]: https://docs.docker.com/engine/reference/builder/#dockerignore-file
|
||||
[dockerignore]: https://docs.docker.com/engine/reference/builder/#dockerignore-file
|
||||
|
||||
???
|
||||
|
||||
|
||||
@@ -1,15 +0,0 @@
|
||||
title: |
|
||||
FIXME
|
||||
|
||||
#chat: "[Slack](https://dockercommunity.slack.com/messages/C7GKACWDV)"
|
||||
#chat: "[Gitter](https://gitter.im/jpetazzo/training-20180413-paris)"
|
||||
chat: "FIXME"
|
||||
|
||||
gitrepo: github.com/jpetazzo/container.training
|
||||
|
||||
slides: https://2023-11-dojo.container.training/
|
||||
|
||||
#slidenumberprefix: "#SomeHashTag — "
|
||||
|
||||
content:
|
||||
- fixme.md
|
||||
@@ -1,4 +1,4 @@
|
||||
## Exercise — Ingress
|
||||
## Exercise — Ingress Controller
|
||||
|
||||
- Add an ingress controller to a Kubernetes cluster
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
# Exercise — Ingress
|
||||
# Exercise — Ingress Controller
|
||||
|
||||
- We want to expose a couple of web apps through an ingress controller
|
||||
|
||||
@@ -128,4 +128,4 @@ This is similar to the previous scenario, but with two significant changes:
|
||||
|
||||
1. We only want to run the ingress controller on nodes that have the role `ingress`.
|
||||
|
||||
2. We don't want to use `hostNetwork`, but a list of `externalIPs` instead.
|
||||
2. We want to either use `hostPort`, or a list of `externalIPs` (not `hostNetwork`).
|
||||
@@ -1,6 +1,6 @@
|
||||
# Exercise — Network Policies
|
||||
|
||||
We want to to implement a generic network security mechanism.
|
||||
We want to implement a generic network security mechanism.
|
||||
|
||||
Instead of creating one policy per service, we want to
|
||||
create a fixed number of policies, and use a single label
|
||||
|
||||
11
slides/exercises/polykuberbac-brief.md
Normal file
11
slides/exercises/polykuberbac-brief.md
Normal file
@@ -0,0 +1,11 @@
|
||||
## Exercise — Enable RBAC
|
||||
|
||||
- Enable RBAC on a manually-deployed control plane
|
||||
|
||||
- This involves:
|
||||
|
||||
- generating different certificates
|
||||
|
||||
- distributing the certificates to the controllers
|
||||
|
||||
- enabling the proper authorizers in API server
|
||||
117
slides/exercises/polykuberbac-details.md
Normal file
117
slides/exercises/polykuberbac-details.md
Normal file
@@ -0,0 +1,117 @@
|
||||
# Exercise — Enable RBAC
|
||||
|
||||
- We want to enable RBAC on the "polykube" cluster
|
||||
|
||||
(it doesn't matter whether we have 1 or multiple nodes)
|
||||
|
||||
- Ideally, we want to have, for instance:
|
||||
|
||||
- one key, certificate, and kubeconfig for a cluster admin
|
||||
|
||||
- one key, certificate, and kubeconfig for a user
|
||||
<br/>
|
||||
(with permissions in a single namespace)
|
||||
|
||||
- Bonus points: enable the NodeAuthorizer too!
|
||||
|
||||
- Check the following slides for hints
|
||||
|
||||
---
|
||||
|
||||
## Step 1
|
||||
|
||||
- Enable RBAC itself!
|
||||
|
||||
--
|
||||
|
||||
- This is done with an API server command-line flag
|
||||
|
||||
--
|
||||
|
||||
- Check [the documentation][kube-apiserver-doc] to see the flag
|
||||
|
||||
--
|
||||
|
||||
- For now, only enable `--authorization-mode=RBAC`
|
||||
|
||||
[kube-apiserver-doc]: https://kubernetes.io/docs/reference/command-line-tools-reference/kube-apiserver/
|
||||
|
||||
---
|
||||
|
||||
## Step 2
|
||||
|
||||
- Our certificate doesn't work anymore, we need to generate a new one
|
||||
|
||||
--
|
||||
|
||||
- We need a certificate that will have *some* (ideally *all*) permissions
|
||||
|
||||
--
|
||||
|
||||
- Two options:
|
||||
|
||||
- use the equivalent of "root" (identity that completely skips permission checks)
|
||||
|
||||
- a "non-root" identity but which is granted permissions with RBAC
|
||||
|
||||
--
|
||||
|
||||
- The "non-root" option looks nice, but to grant permissions, we need permissions
|
||||
|
||||
- So let's start with the equivalent of "root"!
|
||||
|
||||
--
|
||||
|
||||
- The Kubernetes equivalent of `root` is the group `system:masters`
|
||||
|
||||
---
|
||||
|
||||
## Step 2, continued
|
||||
|
||||
- We need to generate a certificate for a user belonging to group `system:masters`
|
||||
|
||||
--
|
||||
|
||||
- In Kubernetes certificates, groups are encoded with the "organization" field
|
||||
|
||||
--
|
||||
|
||||
- That corresponds to `O=system:masters`
|
||||
|
||||
--
|
||||
|
||||
- In other words we need to generate a new certificate, but with a subject of:
|
||||
|
||||
`/CN=admin/O=system:masters/` (the `CN` doesn't matter)
|
||||
|
||||
- That certificate should be able to interact with the API server, like before
|
||||
|
||||
---
|
||||
|
||||
## Step 3
|
||||
|
||||
- Now, all our controllers have permissions issues
|
||||
|
||||
- We need to either:
|
||||
|
||||
- use that `system:masters` cert everywhere
|
||||
|
||||
- generate different certs for every controller, with the proper identities
|
||||
|
||||
- Suggestion: use `system-masters` everywhere to begin with
|
||||
|
||||
(and make sure the cluster is back on its feet)
|
||||
|
||||
---
|
||||
|
||||
## Step 4
|
||||
|
||||
At this point, there are two possible forks in the road:
|
||||
|
||||
1. Generate certs for the control plane controllers
|
||||
|
||||
(`kube-controller-manager`, `kube-scheduler`)
|
||||
|
||||
2. Generate cert(s) for the node(s) and enable `NodeAuthorizer`
|
||||
|
||||
Good luck!
|
||||
7
slides/exercises/reqlim-brief.md
Normal file
7
slides/exercises/reqlim-brief.md
Normal file
@@ -0,0 +1,7 @@
|
||||
## Exercise — Requests and Limits
|
||||
|
||||
- Check current resource allocation and utilization
|
||||
|
||||
- Make sure that all workloads have requests (and perhaps limits)
|
||||
|
||||
- Make sure that all *future* workloads too!
|
||||
55
slides/exercises/reqlim-details.md
Normal file
55
slides/exercises/reqlim-details.md
Normal file
@@ -0,0 +1,55 @@
|
||||
# Exercise — Requests and Limits
|
||||
|
||||
By default, if we don't specify *resource requests*,
|
||||
our workloads will run in `BestEffort` quality of service.
|
||||
|
||||
`BestEffort` is very bad for production workloads,
|
||||
because the scheduler has no idea of the actual resource
|
||||
requirements of our apps, and won't be able to make
|
||||
smart decisions about workload placement.
|
||||
|
||||
As a result, when the cluster gets overloaded,
|
||||
containers will be killed, pods will be evicted,
|
||||
and service disruptions will happen.
|
||||
|
||||
Let's solve this!
|
||||
|
||||
---
|
||||
|
||||
## Check current state
|
||||
|
||||
- Check *allocations*
|
||||
|
||||
(i.e. which pods have requests and limits for CPU and memory)
|
||||
|
||||
- Then check *utilization*
|
||||
|
||||
(i.e. actual resource usage)
|
||||
|
||||
- Possible tools: `kubectl`, plugins like `view-allocations`, Prometheus...
|
||||
|
||||
---
|
||||
|
||||
## Follow best practices
|
||||
|
||||
- We want to make sure that *all* workloads have requests
|
||||
|
||||
(and perhaps limits, too!)
|
||||
|
||||
- Depending on the workload:
|
||||
|
||||
- edit its YAML manifest
|
||||
|
||||
- adjust its Helm values
|
||||
|
||||
- add LimitRange in its Namespace
|
||||
|
||||
- Then check again to confirm that the job has been done properly!
|
||||
|
||||
---
|
||||
|
||||
## Be future-proof!
|
||||
|
||||
- We want to make sure that *future* workloads will have requests, too
|
||||
|
||||
- How can that be implemented?
|
||||
5
slides/find-duplicate-markdown-links.sh
Executable file
5
slides/find-duplicate-markdown-links.sh
Executable file
@@ -0,0 +1,5 @@
|
||||
#!/bin/sh
|
||||
for LINK in $(cat */*.md | sed -n 's/^\[\(.*\)\]:.*/\1/p' | sort | uniq -d); do
|
||||
grep '^\['"$LINK"'\]:' */*.md
|
||||
done
|
||||
|
||||
BIN
slides/images/argocd_architecture.png
Normal file
BIN
slides/images/argocd_architecture.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 103 KiB |
BIN
slides/images/argocdlogo.png
Normal file
BIN
slides/images/argocdlogo.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 22 KiB |
@@ -1,16 +1,16 @@
|
||||
https://gallant-turing-d0d520.netlify.com/containers/Container-Ship-Freighter-Navigation-Elbe-Romance-1782991.jpg
|
||||
https://gallant-turing-d0d520.netlify.com/containers/ShippingContainerSFBay.jpg
|
||||
https://gallant-turing-d0d520.netlify.com/containers/aerial-view-of-containers.jpg
|
||||
https://gallant-turing-d0d520.netlify.com/containers/blue-containers.jpg
|
||||
https://gallant-turing-d0d520.netlify.com/containers/chinook-helicopter-container.jpg
|
||||
https://gallant-turing-d0d520.netlify.com/containers/container-cranes.jpg
|
||||
https://gallant-turing-d0d520.netlify.com/containers/container-housing.jpg
|
||||
https://gallant-turing-d0d520.netlify.com/containers/containers-by-the-water.jpg
|
||||
https://gallant-turing-d0d520.netlify.com/containers/distillery-containers.jpg
|
||||
https://gallant-turing-d0d520.netlify.com/containers/lots-of-containers.jpg
|
||||
https://gallant-turing-d0d520.netlify.com/containers/plastic-containers.JPG
|
||||
https://gallant-turing-d0d520.netlify.com/containers/train-of-containers-1.jpg
|
||||
https://gallant-turing-d0d520.netlify.com/containers/train-of-containers-2.jpg
|
||||
https://gallant-turing-d0d520.netlify.com/containers/two-containers-on-a-truck.jpg
|
||||
https://gallant-turing-d0d520.netlify.com/containers/wall-of-containers.jpeg
|
||||
https://gallant-turing-d0d520.netlify.com/containers/catene-de-conteneurs.jpg
|
||||
https://prettypictures.container.training/containers/Container-Ship-Freighter-Navigation-Elbe-Romance-1782991.jpg
|
||||
https://prettypictures.container.training/containers/ShippingContainerSFBay.jpg
|
||||
https://prettypictures.container.training/containers/aerial-view-of-containers.jpg
|
||||
https://prettypictures.container.training/containers/blue-containers.jpg
|
||||
https://prettypictures.container.training/containers/chinook-helicopter-container.jpg
|
||||
https://prettypictures.container.training/containers/container-cranes.jpg
|
||||
https://prettypictures.container.training/containers/container-housing.jpg
|
||||
https://prettypictures.container.training/containers/containers-by-the-water.jpg
|
||||
https://prettypictures.container.training/containers/distillery-containers.jpg
|
||||
https://prettypictures.container.training/containers/lots-of-containers.jpg
|
||||
https://prettypictures.container.training/containers/plastic-containers.JPG
|
||||
https://prettypictures.container.training/containers/train-of-containers-1.jpg
|
||||
https://prettypictures.container.training/containers/train-of-containers-2.jpg
|
||||
https://prettypictures.container.training/containers/two-containers-on-a-truck.jpg
|
||||
https://prettypictures.container.training/containers/wall-of-containers.jpeg
|
||||
https://prettypictures.container.training/containers/catene-de-conteneurs.jpg
|
||||
|
||||
@@ -20,19 +20,21 @@
|
||||
|
||||
## Use cases
|
||||
|
||||
Some examples ...
|
||||
- Defaulting
|
||||
|
||||
- Stand-alone admission controllers
|
||||
*injecting image pull secrets, sidecars, environment variables...*
|
||||
|
||||
*validating:* policy enforcement (e.g. quotas, naming conventions ...)
|
||||
- Policy enforcement and best practices
|
||||
|
||||
*mutating:* inject or provide default values (e.g. pod presets)
|
||||
*prevent: `latest` images, deprecated APIs...*
|
||||
|
||||
- Admission controllers part of a greater system
|
||||
*require: PDBs, resource requests/limits, labels/annotations, local registry...*
|
||||
|
||||
*validating:* advanced typing for operators
|
||||
- Problem mitigation
|
||||
|
||||
*mutating:* inject sidecars for service meshes
|
||||
*block nodes with vulnerable kernels, inject log4j mitigations...*
|
||||
|
||||
- Extended validation for operators
|
||||
|
||||
---
|
||||
|
||||
@@ -198,6 +200,64 @@ Some examples ...
|
||||
|
||||
(the Node "echo" app, the Flask app, and one ngrok tunnel for each of them)
|
||||
|
||||
- We will need an ngrok account for the tunnels
|
||||
|
||||
(a free account is fine)
|
||||
|
||||
---
|
||||
|
||||
class: extra-details
|
||||
|
||||
## What's ngrok?
|
||||
|
||||
- Ngrok provides secure tunnels to access local services
|
||||
|
||||
- Example: run `ngrok http 1234`
|
||||
|
||||
- `ngrok` will display a publicly-available URL (e.g. https://xxxxyyyyzzzz.ngrok.app)
|
||||
|
||||
- Connections to https://xxxxyyyyzzzz.ngrok.app will terminate at `localhost:1234`
|
||||
|
||||
- Basic product is free; extra features (vanity domains, end-to-end TLS...) for $$$
|
||||
|
||||
- Perfect to develop our webhook!
|
||||
|
||||
---
|
||||
|
||||
class: extra-details
|
||||
|
||||
## Ngrok in production
|
||||
|
||||
- Ngrok was initially known for its local webhook development features
|
||||
|
||||
- It now supports production scenarios as well
|
||||
|
||||
(load balancing, WAF, authentication, circuit-breaking...)
|
||||
|
||||
- Including some that are very relevant to Kubernetes
|
||||
|
||||
  (e.g. the [ngrok Ingress Controller](https://github.com/ngrok/kubernetes-ingress-controller))
|
||||
|
||||
---
|
||||
|
||||
## Ngrok tokens
|
||||
|
||||
- If you're attending a live training, you might have an ngrok token
|
||||
|
||||
- Look in `~/ngrok.env` and if that file exists, copy it to the stack:
|
||||
|
||||
.lab[
|
||||
|
||||
```bash
|
||||
cp ~/ngrok.env ~/container.training/webhooks/admission/.env
|
||||
```
|
||||
|
||||
]
|
||||
|
||||
---
|
||||
|
||||
## Starting the whole stack
|
||||
|
||||
.lab[
|
||||
|
||||
- Go to the webhook directory:
|
||||
@@ -216,28 +276,6 @@ Some examples ...
|
||||
|
||||
---
|
||||
|
||||
class: extra-details
|
||||
|
||||
## What's ngrok?
|
||||
|
||||
- Ngrok provides secure tunnels to access local services
|
||||
|
||||
- Example: run `ngrok http 1234`
|
||||
|
||||
- `ngrok` will display a publicly-available URL (e.g. https://xxxxyyyyzzzz.ngrok.io)
|
||||
|
||||
- Connections to https://xxxxyyyyzzzz.ngrok.io will terminate at `localhost:1234`
|
||||
|
||||
- Basic product is free; extra features (vanity domains, end-to-end TLS...) for $$$
|
||||
|
||||
- Perfect to develop our webhook!
|
||||
|
||||
- Probably not for production, though
|
||||
|
||||
(webhook requests and responses now pass through the ngrok platform)
|
||||
|
||||
---
|
||||
|
||||
## Update the webhook configuration
|
||||
|
||||
- We have a webhook configuration in `k8s/webhook-configuration.yaml`
|
||||
@@ -543,6 +581,23 @@ Shell to the rescue!
|
||||
|
||||
(it should only allow values of `red`, `green`, `blue`)
|
||||
|
||||
---
|
||||
|
||||
## Coming soon...
|
||||
|
||||
- Kubernetes Validating Admission Policies
|
||||
|
||||
- Integrated with the Kubernetes API server
|
||||
|
||||
- Lets us define policies using [CEL (Common Expression Language)][cel-spec]
|
||||
|
||||
- Available in beta in Kubernetes 1.28 <!-- ##VERSION## -->
|
||||
|
||||
- Check this [CNCF Blog Post][cncf-blog-vap] for more details
|
||||
|
||||
[cncf-blog-vap]: https://www.cncf.io/blog/2023/09/14/policy-management-in-kubernetes-is-changing/
|
||||
[cel-spec]: https://github.com/google/cel-spec
|
||||
|
||||
???
|
||||
|
||||
:EN:- Dynamic admission control with webhooks
|
||||
|
||||
@@ -141,12 +141,6 @@ class: pic
|
||||
|
||||
class: pic
|
||||
|
||||

|
||||
|
||||
---
|
||||
|
||||
class: pic
|
||||
|
||||

|
||||
|
||||
---
|
||||
@@ -157,6 +151,12 @@ class: pic
|
||||
|
||||
---
|
||||
|
||||
class: pic
|
||||
|
||||

|
||||
|
||||
---
|
||||
|
||||
# The Kubernetes API
|
||||
|
||||
[
|
||||
|
||||
592
slides/k8s/argocd.md
Normal file
592
slides/k8s/argocd.md
Normal file
@@ -0,0 +1,592 @@
|
||||
# ArgoCD
|
||||
|
||||
- We're going to implement a basic GitOps workflow with ArgoCD
|
||||
|
||||
- Pushing to the default branch will automatically deploy to our clusters
|
||||
|
||||
- There will be two clusters (`dev` and `prod`)
|
||||
|
||||
- The two clusters will have similar (but slightly different) workloads
|
||||
|
||||

|
||||
|
||||
---
|
||||
|
||||
## ArgoCD concepts
|
||||
|
||||
ArgoCD manages **applications** by **syncing** their **live state** with their **target state**.
|
||||
|
||||
- **Application**: a group of Kubernetes resources managed by ArgoCD.
|
||||
<br/>
|
||||
Also a custom resource (`kind: Application`) managing that group of resources.
|
||||
|
||||
- **Application source type**: the **Tool** used to build the application (Kustomize, Helm...)
|
||||
|
||||
- **Target state**: the desired state of an **application**, as represented by the git repository.
|
||||
|
||||
- **Live state**: the current state of the application on the cluster.
|
||||
|
||||
- **Sync status**: whether or not the live state matches the target state.
|
||||
|
||||
- **Sync**: the process of making an application move to its target state.
|
||||
<br/>
|
||||
(e.g. by applying changes to a Kubernetes cluster)
|
||||
|
||||
(Check [ArgoCD core concepts](https://argo-cd.readthedocs.io/en/stable/core_concepts/) for more definitions!)
|
||||
|
||||
---
|
||||
|
||||
## Getting ready
|
||||
|
||||
- Let's make sure we have two clusters
|
||||
|
||||
- It's OK to use local clusters (kind, minikube...)
|
||||
|
||||
- We need to install the ArgoCD CLI ([argocd-packages], [argocd-binaries])
|
||||
|
||||
- **Highly recommended:** set up CLI completion!
|
||||
|
||||
- Of course we'll need a Git service, too
|
||||
|
||||
---
|
||||
|
||||
## Setting up ArgoCD
|
||||
|
||||
- The easiest way is to use upstream YAML manifests
|
||||
|
||||
- There is also a [Helm chart][argocd-helmchart] if we need more customization
|
||||
|
||||
.lab[
|
||||
|
||||
- Create a namespace for ArgoCD and install it there:
|
||||
```bash
|
||||
kubectl create namespace argocd
|
||||
kubectl apply --namespace argocd -f \
|
||||
https://raw.githubusercontent.com/argoproj/argo-cd/stable/manifests/install.yaml
|
||||
```
|
||||
|
||||
]
|
||||
|
||||
---
|
||||
|
||||
## Logging in with the ArgoCD CLI
|
||||
|
||||
- The CLI can talk to the ArgoCD API server or to the Kubernetes API server
|
||||
|
||||
- For simplicity, we're going to authenticate and communicate with the Kubernetes API
|
||||
|
||||
.lab[
|
||||
|
||||
- Authenticate with the ArgoCD API (that's what the `--core` flag does):
|
||||
```bash
|
||||
argocd login --core
|
||||
```
|
||||
|
||||
- Check that everything is fine:
|
||||
```bash
|
||||
argocd version
|
||||
```
|
||||
]
|
||||
|
||||
--
|
||||
|
||||
🤔 `FATA[0000] error retrieving argocd-cm: configmap "argocd-cm" not found`
|
||||
|
||||
---
|
||||
|
||||
## ArgoCD CLI shortcomings
|
||||
|
||||
- When using "core" authentication, the ArgoCD CLI uses our current Kubernetes context
|
||||
|
||||
(as defined in our kubeconfig file)
|
||||
|
||||
- That context needs to point to the correct namespace
|
||||
|
||||
(the namespace where we installed ArgoCD)
|
||||
|
||||
- In fact, `argocd login --core` doesn't communicate at all with ArgoCD!
|
||||
|
||||
(it only updates a local ArgoCD configuration file)
|
||||
|
||||
---
|
||||
|
||||
## Trying again in the right namespace
|
||||
|
||||
- We will need to run all `argocd` commands in the `argocd` namespace
|
||||
|
||||
(this limitation only applies to "core" authentication; see [issue 14167][issue14167])
|
||||
|
||||
.lab[
|
||||
|
||||
- Switch to the `argocd` namespace:
|
||||
```bash
|
||||
kubectl config set-context --current --namespace argocd
|
||||
```
|
||||
|
||||
- Check that we can communicate with the ArgoCD API now:
|
||||
```bash
|
||||
argocd version
|
||||
```
|
||||
|
||||
]
|
||||
|
||||
- Let's have a look at ArgoCD architecture!
|
||||
|
||||
---
|
||||
|
||||
class: pic
|
||||
|
||||

|
||||
|
||||
---
|
||||
|
||||
## ArgoCD API Server
|
||||
|
||||
The API server is a gRPC/REST server which exposes the API consumed by the Web UI, CLI, and CI/CD systems. It has the following responsibilities:
|
||||
|
||||
- application management and status reporting
|
||||
|
||||
- invoking of application operations (e.g. sync, rollback, user-defined actions)
|
||||
|
||||
- repository and cluster credential management (stored as K8s secrets)
|
||||
|
||||
- authentication and auth delegation to external identity providers
|
||||
|
||||
- RBAC enforcement
|
||||
|
||||
- listener/forwarder for Git webhook events
|
||||
|
||||
---
|
||||
|
||||
## ArgoCD Repository Server
|
||||
|
||||
The repository server is an internal service which maintains a local cache of the Git repositories holding the application manifests. It is responsible for generating and returning the Kubernetes manifests when provided the following inputs:
|
||||
|
||||
- repository URL
|
||||
|
||||
- revision (commit, tag, branch)
|
||||
|
||||
- application path
|
||||
|
||||
- template specific settings: parameters, helm values...
|
||||
|
||||
---
|
||||
|
||||
## ArgoCD Application Controller
|
||||
|
||||
The application controller is a Kubernetes controller which continuously monitors running applications and compares the current, live state against the desired target state (as specified in the repo).
|
||||
|
||||
It detects *OutOfSync* application state and optionally takes corrective action.
|
||||
|
||||
It is responsible for invoking any user-defined hooks for lifecycle events (*PreSync, Sync, PostSync*).
|
||||
|
||||
---
|
||||
|
||||
## Preparing a repository for ArgoCD
|
||||
|
||||
- We need a repository with Kubernetes YAML manifests
|
||||
|
||||
- You can fork [kubercoins] or create a new, empty repository
|
||||
|
||||
- If you create a new, empty repository, add some manifests to it
|
||||
|
||||
---
|
||||
|
||||
## Add an Application
|
||||
|
||||
- An Application can be added to ArgoCD via the web UI or the CLI
|
||||
|
||||
(either way, this will create a custom resource of `kind: Application`)
|
||||
|
||||
- The Application should then automatically be deployed to our cluster
|
||||
|
||||
(the application manifests will be "applied" to the cluster)
|
||||
|
||||
.lab[
|
||||
|
||||
- Let's use the CLI to add an Application:
|
||||
```bash
|
||||
argocd app create kubercoins \
|
||||
--repo https://github.com/`<your_user>/<your_repo>`.git \
|
||||
--path . --revision `<branch>` \
|
||||
--dest-server https://kubernetes.default.svc \
|
||||
--dest-namespace kubercoins-prod
|
||||
```
|
||||
|
||||
]
|
||||
|
||||
---
|
||||
|
||||
## Checking progress
|
||||
|
||||
- We can see sync status in the web UI or with the CLI
|
||||
|
||||
.lab[
|
||||
|
||||
- Let's check app status with the CLI:
|
||||
```bash
|
||||
argocd app list
|
||||
```
|
||||
|
||||
- We can also check directly with the Kubernetes CLI:
|
||||
```bash
|
||||
kubectl get applications
|
||||
```
|
||||
|
||||
]
|
||||
|
||||
- The app is there and it is `OutOfSync`!
|
||||
|
||||
---
|
||||
|
||||
## Manual sync with the CLI
|
||||
|
||||
- By default the "sync policy" is `manual`
|
||||
|
||||
- It can also be set to `auto`, which would check the git repository every 3 minutes
|
||||
|
||||
(this interval can be [configured globally][pollinginterval])
|
||||
|
||||
- Manual sync can be triggered with the CLI
|
||||
|
||||
.lab[
|
||||
|
||||
- Let's force an immediate sync of our app:
|
||||
```bash
|
||||
argocd app sync kubercoins
|
||||
```
|
||||
]
|
||||
|
||||
🤔 We're getting errors!
|
||||
|
||||
---
|
||||
|
||||
## Sync failed
|
||||
|
||||
We should receive a failure:
|
||||
|
||||
`FATA[0000] Operation has completed with phase: Failed`
|
||||
|
||||
And in the output, we see more details:
|
||||
|
||||
`Message: one or more objects failed to apply,`
|
||||
<br/>
|
||||
`reason: namespaces "kubercoins-prod" not found`
|
||||
|
||||
---
|
||||
|
||||
## Creating the namespace
|
||||
|
||||
- There are multiple ways to achieve that
|
||||
|
||||
- We could generate a YAML manifest for the namespace and add it to the git repository
|
||||
|
||||
- Or we could use "Sync Options" so that ArgoCD creates it automatically!
|
||||
|
||||
- ArgoCD provides many "Sync Options" to handle various edge cases
|
||||
|
||||
- Some [others](https://argo-cd.readthedocs.io/en/stable/user-guide/sync-options/) are: `FailOnSharedResource`, `PruneLast`, `PrunePropagationPolicy`...
|
||||
|
||||
---
|
||||
|
||||
## Editing the app's sync options
|
||||
|
||||
- This can be done through the web UI or the CLI
|
||||
|
||||
.lab[
|
||||
|
||||
- Let's use the CLI once again:
|
||||
```bash
|
||||
argocd app edit kubercoins
|
||||
```
|
||||
|
||||
- Add the following to the YAML manifest, at the root level:
|
||||
```yaml
|
||||
syncPolicy:
|
||||
syncOptions:
|
||||
- CreateNamespace=true
|
||||
```
|
||||
|
||||
]
|
||||
|
||||
---
|
||||
|
||||
## Sync again
|
||||
|
||||
.lab[
|
||||
|
||||
- Let's retry the sync operation:
|
||||
```bash
|
||||
argocd app sync kubercoins
|
||||
```
|
||||
|
||||
- And check the application status:
|
||||
```bash
|
||||
argocd app list
|
||||
kubectl get applications
|
||||
```
|
||||
|
||||
]
|
||||
|
||||
- It should show `Synced` and `Progressing`
|
||||
|
||||
- After a while (when all pods are running correctly) it should be `Healthy`
|
||||
|
||||
---
|
||||
|
||||
## Managing Applications via the Web UI
|
||||
|
||||
- ArgoCD is popular in large part due to its browser-based UI
|
||||
|
||||
- Let's see how to manage Applications in the web UI
|
||||
|
||||
.lab[
|
||||
|
||||
- Expose the web dashboard on a local port:
|
||||
```bash
|
||||
argocd admin dashboard
|
||||
```
|
||||
|
||||
- This command will show the dashboard URL; open it in a browser
|
||||
|
||||
- Authentication should be automatic
|
||||
|
||||
]
|
||||
|
||||
Note: `argocd admin dashboard` is similar to `kubectl port-forward` or `kubectl-proxy`.
|
||||
|
||||
(The dashboard remains available as long as `argocd admin dashboard` is running.)
|
||||
|
||||
---
|
||||
|
||||
## Adding a staging Application
|
||||
|
||||
- Let's add another Application for a staging environment
|
||||
|
||||
- First, create a new branch (e.g. `staging`) in our kubercoins fork
|
||||
|
||||
- Then, in the ArgoCD web UI, click on the "+ NEW APP" button
|
||||
|
||||
(on a narrow display, it might just be "+", right next to buttons looking like 🔄 and ↩️)
|
||||
|
||||
- See next slides for details about that form!
|
||||
|
||||
---
|
||||
|
||||
## Defining the Application
|
||||
|
||||
| Field | Value |
|
||||
|------------------|--------------------------------------------|
|
||||
| Application Name | `kubercoins-stg` |
|
||||
| Project Name | `default` |
|
||||
| Sync policy | `Manual` |
|
||||
| Sync options | check `auto-create namespace` |
|
||||
| Repository URL | `https://github.com/<username>/<reponame>` |
|
||||
| Revision | `<branchname>` |
|
||||
| Path | `.` |
|
||||
| Cluster URL | `https://kubernetes.default.svc` |
|
||||
| Namespace | `kubercoins-stg` |
|
||||
|
||||
Then click on the "CREATE" button (top left).
|
||||
|
||||
---
|
||||
|
||||
## Synchronizing the Application
|
||||
|
||||
- After creating the app, it should now show up in the app tiles
|
||||
|
||||
(with a yellow outline to indicate that it's out of sync)
|
||||
|
||||
- Click on the "SYNC" button on the app tile to show the sync panel
|
||||
|
||||
- In the sync panel, click on "SYNCHRONIZE"
|
||||
|
||||
- The app will start to synchronize, and should become healthy after a little while
|
||||
|
||||
---
|
||||
|
||||
## Making changes
|
||||
|
||||
- Let's make changes to our application manifests and see what happens
|
||||
|
||||
.lab[
|
||||
|
||||
- Make a change to a manifest
|
||||
|
||||
(for instance, change the number of replicas of a Deployment)
|
||||
|
||||
- Commit that change and push it to the staging branch
|
||||
|
||||
- Check the application sync status:
|
||||
```bash
|
||||
argocd app list
|
||||
```
|
||||
|
||||
]
|
||||
|
||||
- After a short period of time (a few minutes max) the app should show up "out of sync"
|
||||
|
||||
---
|
||||
|
||||
## Automated synchronization
|
||||
|
||||
- We don't want to manually sync after every change
|
||||
|
||||
(that wouldn't be true continuous deployment!)
|
||||
|
||||
- We're going to enable "auto sync"
|
||||
|
||||
- Note that this requires much more rigorous testing and observability!
|
||||
|
||||
(we need to be sure that our changes won't crash our app or even our cluster)
|
||||
|
||||
- Argo project also provides [Argo Rollouts][rollouts]
|
||||
|
||||
(a controller and CRDs to provide blue-green, canary deployments...)
|
||||
|
||||
- Today we'll just turn on automated sync for the staging namespace
|
||||
|
||||
---
|
||||
|
||||
## Enabling auto-sync
|
||||
|
||||
- In the web UI, go to *Applications* and click on *kubercoins-stg*
|
||||
|
||||
- Click on the "DETAILS" button (top left, might be just a "i" sign on narrow displays)
|
||||
|
||||
- Click on "ENABLE AUTO-SYNC" (under "SYNC POLICY")
|
||||
|
||||
- After a few minutes the changes should show up!
|
||||
|
||||
---
|
||||
|
||||
## Rolling back
|
||||
|
||||
- If we deploy a broken version, how do we recover?
|
||||
|
||||
- "The GitOps way": revert the changes in source control
|
||||
|
||||
(see next slide)
|
||||
|
||||
- Emergency rollback:
|
||||
|
||||
- disable auto-sync (if it was enabled)
|
||||
|
||||
- on the app page, click on "HISTORY AND ROLLBACK"
|
||||
<br/>
|
||||
(with the clock-with-backward-arrow icon)
|
||||
|
||||
  - click on the "..." button next to the revision we want to roll back to
|
||||
|
||||
- click "Rollback" and confirm
|
||||
|
||||
---
|
||||
|
||||
## Rolling back with GitOps
|
||||
|
||||
- The correct way to roll back is rolling back the code in source control
|
||||
|
||||
```bash
|
||||
git checkout staging
|
||||
git revert HEAD
|
||||
git push origin staging
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Working with Helm
|
||||
|
||||
- ArgoCD supports different tools to process Kubernetes manifests:
|
||||
|
||||
Kustomize, Helm, Jsonnet, and [Config Management Plugins][cmp]
|
||||
|
||||
- Let's see how to deploy Helm charts with ArgoCD!
|
||||
|
||||
- In the [kubercoins] repository, there is a branch called [helm-branch]
|
||||
|
||||
- It provides a generic Helm chart, in the [generic-service] directory
|
||||
|
||||
- There are service-specific values YAML files in the [values] directory
|
||||
|
||||
- Let's create one application for each of the 5 components of our app!
|
||||
|
||||
---
|
||||
|
||||
## Creating a Helm Application
|
||||
|
||||
- The example below uses "upstream" kubercoins
|
||||
|
||||
- Feel free to use your own fork instead!
|
||||
|
||||
.lab[
|
||||
|
||||
- Create an Application for `hasher`:
|
||||
```bash
|
||||
argocd app create hasher \
|
||||
--repo https://github.com/jpetazzo/kubercoins.git \
|
||||
--path generic-service --revision helm \
|
||||
--dest-server https://kubernetes.default.svc \
|
||||
--dest-namespace kubercoins-helm \
|
||||
--sync-option CreateNamespace=true \
|
||||
--values ../values/hasher.yaml \
|
||||
--sync-policy=auto
|
||||
```
|
||||
|
||||
]
|
||||
|
||||
---
|
||||
|
||||
## Deploying the rest of the application
|
||||
|
||||
- Option 1: repeat the previous command (updating app name and values)
|
||||
|
||||
- Option 2: author YAML manifests and apply them
|
||||
|
||||
---
|
||||
|
||||
## Additional considerations
|
||||
|
||||
- When running in production, ArgoCD can be integrated with an [SSO provider][sso]
|
||||
|
||||
- ArgoCD embeds and bundles [Dex] to delegate authentication
|
||||
|
||||
- it can also use an existing OIDC provider (Okta, Keycloak...)
|
||||
|
||||
- A single ArgoCD instance can manage multiple clusters
|
||||
|
||||
(but it's also fine to have one ArgoCD per cluster)
|
||||
|
||||
- ArgoCD can be complemented with [Argo Rollouts][rollouts] for advanced rollout control
|
||||
|
||||
(blue/green, canary...)
|
||||
|
||||
---
|
||||
|
||||
## Acknowledgements
|
||||
|
||||
Many thanks to
|
||||
Anton (Ant) Weiss ([antweiss.com](https://antweiss.com), [@antweiss](https://twitter.com/antweiss))
|
||||
and
|
||||
Guilhem Lettron
|
||||
for contributing an initial version and suggestions to this ArgoCD chapter.
|
||||
|
||||
All remaining typos, mistakes, or approximations are mine (Jérôme Petazzoni).
|
||||
|
||||
[argocd-binaries]: https://github.com/argoproj/argo-cd/releases/latest
|
||||
[argocd-helmchart]: https://artifacthub.io/packages/helm/argo/argocd-apps
|
||||
[argocd-packages]: https://argo-cd.readthedocs.io/en/stable/cli_installation/
|
||||
[cmp]: https://argo-cd.readthedocs.io/en/stable/operator-manual/config-management-plugins/
|
||||
[Dex]: https://github.com/dexidp/dex
|
||||
[generic-service]: https://github.com/jpetazzo/kubercoins/tree/helm/generic-service
|
||||
[helm-branch]: https://github.com/jpetazzo/kubercoins/tree/helm
|
||||
[issue14167]: https://github.com/argoproj/argo-cd/issues/14167
|
||||
[kubercoins]: https://github.com/jpetazzo/kubercoins
|
||||
[pollinginterval]: https://argo-cd.readthedocs.io/en/stable/faq/#how-often-does-argo-cd-check-for-changes-to-my-git-or-helm-repository
|
||||
[rollouts]: https://argoproj.github.io/argo-rollouts/
|
||||
[sso]: https://argo-cd.readthedocs.io/en/stable/operator-manual/user-management/#sso
|
||||
[values]: https://github.com/jpetazzo/kubercoins/tree/helm/values
|
||||
|
||||
???
|
||||
|
||||
:EN:- Implementing gitops with ArgoCD
|
||||
:FR:- Workflow gitops avec ArgoCD
|
||||
@@ -856,7 +856,7 @@ class: extra-details
|
||||
- To learn more about Kubernetes attacks and threat models around RBAC:
|
||||
|
||||
📽️ [Hacking into Kubernetes Security for Beginners](https://www.youtube.com/watch?v=mLsCm9GVIQg)
|
||||
by [Ellen Körbes](https://twitter.com/ellenkorbes)
|
||||
by [V Körbes](https://twitter.com/veekorbes)
|
||||
and [Tabitha Sable](https://twitter.com/TabbySable)
|
||||
|
||||
---
|
||||
|
||||
173
slides/k8s/bento-cnpg.md
Normal file
173
slides/k8s/bento-cnpg.md
Normal file
@@ -0,0 +1,173 @@
|
||||
# Bento & PostgreSQL
|
||||
|
||||
- Bento can also use SQL databases for input/output
|
||||
|
||||
- We're going to demonstrate that by writing to a PostgreSQL database
|
||||
|
||||
- That database will be deployed with the CloudNativePG operator
|
||||
|
||||
(https://cloudnative-pg.io/)
|
||||
|
||||
---
|
||||
|
||||
## CNPG in a nutshell
|
||||
|
||||
- Free, open source
|
||||
|
||||
- Originally created by [EDB] (EnterpriseDB, well-known PgSQL experts)
|
||||
|
||||
- Non-exhaustive list of features:
|
||||
|
||||
- provisioning of Postgres servers, replicas, bouncers
|
||||
|
||||
- automatic failover
|
||||
|
||||
- backups (full backups and WAL shipping)
|
||||
|
||||
- provisioning from scratch, from backups, PITR
|
||||
|
||||
- manual and automated switchover (e.g. for node maintenance)
|
||||
|
||||
- and many more!
|
||||
|
||||
[EDB]: https://www.enterprisedb.com/workload/kubernetes
|
||||
|
||||
---
|
||||
|
||||
## What we're going to do
|
||||
|
||||
1. Install CNPG.
|
||||
|
||||
2. Provision a Postgres cluster.
|
||||
|
||||
3. Configure Bento to write to that cluster.
|
||||
|
||||
4. Set up a Grafana dashboard to see the data.
|
||||
|
||||
---
|
||||
|
||||
## 1️⃣ Installing CNPG
|
||||
|
||||
Many options available, see the [documentation][cnpg-install]:
|
||||
|
||||
- raw YAML manifests
|
||||
|
||||
- kubectl CNPG plugin (`kubectl cnpg install generate`)
|
||||
|
||||
- Helm chart
|
||||
|
||||
- OLM
|
||||
|
||||
[cnpg-install]: https://cloudnative-pg.io/documentation/1.24/installation_upgrade/
|
||||
|
||||
---
|
||||
|
||||
## 2️⃣ Provisioning a Postgres cluster
|
||||
|
||||
Minimal manifest:
|
||||
|
||||
```yaml
|
||||
apiVersion: postgresql.cnpg.io/v1
|
||||
kind: Cluster
|
||||
metadata:
|
||||
name: db
|
||||
spec:
|
||||
storage:
|
||||
size: 1Gi
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
class: extra-details
|
||||
|
||||
## For production...
|
||||
|
||||
We might also add:
|
||||
|
||||
- `spec.monitoring.enablePodMonitor: true`
|
||||
|
||||
- `spec.instances: 2`
|
||||
|
||||
- `resources.{requests,limits}.{cpu,memory}`
|
||||
|
||||
- `walStorage.size`
|
||||
|
||||
- `backup`
|
||||
|
||||
- `postgresql.parameters`
|
||||
|
||||
See [this manifest][cluster-maximal] for a detailed example.
|
||||
|
||||
[cluster-maximal]: https://github.com/jpetazzo/pozok/blob/main/cluster-maximal.yaml
|
||||
|
||||
---
|
||||
|
||||
## 3️⃣ Configuring Bento to write to SQL
|
||||
|
||||
- We'll use the [`sql_insert`][sql-insert] output
|
||||
|
||||
- If our cluster is named `mydb`, there will be a Secret `mydb-app`
|
||||
|
||||
- This Secret will contain a `uri` field
|
||||
|
||||
- That field can be used as the `dsn` in the Bento configuration
|
||||
|
||||
- We will also need to create the table that we want to use
|
||||
|
||||
(see next slide for instructions)
|
||||
|
||||
[sql-insert]: https://warpstreamlabs.github.io/bento/docs/components/outputs/sql_insert
|
||||
|
||||
---
|
||||
|
||||
## Creating a table
|
||||
|
||||
- If we just want to store the city name and its population:
|
||||
```sql
|
||||
CREATE TABLE IF NOT EXISTS cities (
|
||||
city varchar(100) NOT NULL,
|
||||
population integer
|
||||
);
|
||||
```
|
||||
|
||||
- This statement can be executed:
|
||||
|
||||
- manually, by getting a `psql` shell with `kubectl cnpg psql mydb app`
|
||||
|
||||
  - automatically, with Bento's `init_statement`
|
||||
|
||||
---
|
||||
|
||||
## 4️⃣ Viewing the table in Grafana
|
||||
|
||||
- In Grafana, in the home menu on the left, click "Connections"
|
||||
|
||||
- Add a PostgreSQL data source
|
||||
|
||||
- Enter the host:port, database, user, password
|
||||
|
||||
- Then add a visualization using that data source
|
||||
|
||||
(it should be relatively self-explanatory!)
|
||||
|
||||
---
|
||||
|
||||
class: extra-details
|
||||
|
||||
## Automating it all
|
||||
|
||||
- Expose PostgreSQL credentials through environment variables
|
||||
|
||||
(in the Bento container)
|
||||
|
||||
- Use the `${...}` syntax in Bento to use these environment variables
|
||||
|
||||
- Export the Grafana dashboard to a JSON file
|
||||
|
||||
- Store the JSON file in a ConfigMap, with label `grafana_dashboard=1`
|
||||
|
||||
- Create that ConfigMap in the namespace where Grafana is running
|
||||
|
||||
- Similarly, data sources (like the Redis and the PostgreSQL one) can be defined in YAML
|
||||
|
||||
- And that YAML can be put in a ConfigMap with label `grafana_datasource=1`
|
||||
285
slides/k8s/bento-enrichment.md
Normal file
285
slides/k8s/bento-enrichment.md
Normal file
@@ -0,0 +1,285 @@
|
||||
# Calling APIs from Bento
|
||||
|
||||
- We want to ask our LLM who's the mayor of each of these cities
|
||||
|
||||
- We'll use a prompt that will usually ensure a short answer
|
||||
|
||||
(so that it's faster; we don't want to wait 30 seconds per city!)
|
||||
|
||||
- We'll test the prompt with the Ollama CLI
|
||||
|
||||
- Then we'll craft a proper HTTP API query
|
||||
|
||||
- Finally, we'll configure an [enrichment workflow][enrichment] in Bento
|
||||
|
||||
---
|
||||
|
||||
## Test our prompt
|
||||
|
||||
Assuming that our earlier Ollama Deployment is still running:
|
||||
|
||||
```bash
|
||||
kubectl exec deployment/ollama -- \
|
||||
ollama run qwen2:1.5b "
|
||||
Who is the mayor of San Francisco?
|
||||
Just give the name by itself on a single line.
|
||||
If you don't know, don't say anything.
|
||||
"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Turn the prompt into an HTTP API query
|
||||
|
||||
Note: to install `http` in an Alpine container, run `apk add httpie`.
|
||||
|
||||
```bash
|
||||
http http://ollama.default:11434/api/generate \
|
||||
model=qwen2:1.5b stream:=false prompt="
|
||||
Who is the mayor of Paris?
|
||||
Just give the name by itself on a single line.
|
||||
If you don't know, don't say anything.
|
||||
"
|
||||
```
|
||||
|
||||
We get a JSON payload, and we want to use the `response` field.
|
||||
|
||||
---
|
||||
|
||||
## Configure an enrichment workflow
|
||||
|
||||
The [Bento documentation][enrichment] is really good!
|
||||
|
||||
We need to set up:
|
||||
|
||||
- a `branch` processor
|
||||
|
||||
- a `request_map` to transform the city into an Ollama request
|
||||
|
||||
- an `http` processor to submit the request to Ollama
|
||||
|
||||
- a `result_map` to transform the Ollama response
|
||||
|
||||
---
|
||||
|
||||
## Without the `branch` processor
|
||||
|
||||
<pre class="mermaid">
|
||||
flowchart LR
|
||||
|
||||
CITY["
|
||||
city: Paris
|
||||
country: France
|
||||
population: 1106000
|
||||
iso2: FR
|
||||
...
|
||||
"]
|
||||
|
||||
REQ["
|
||||
model: qwen2:1.5b
|
||||
stream: false
|
||||
prompt: Who is the mayor of Paris?
|
||||
"]
|
||||
|
||||
REP["
|
||||
response: Anne Hidalgo
|
||||
eval_count: ...
|
||||
prompt_eval_count: ...
|
||||
(other ollama fields)
|
||||
"]
|
||||
|
||||
CITY@{ shape: card}
|
||||
REQ@{ shape: card}
|
||||
REP@{ shape: card}
|
||||
|
||||
style CITY text-align: left
|
||||
style REQ text-align: left
|
||||
style REP text-align: left
|
||||
|
||||
mapping@{ shape: diam }
|
||||
http["http processor"]@{ shape: diam }
|
||||
|
||||
CITY --> mapping --> REQ --> http --> REP
|
||||
</pre>
|
||||
|
||||
- We transform the `city` into an Ollama request
|
||||
|
||||
- The `http` processor submits the request to Ollama
|
||||
|
||||
- The final output is the Ollama response
|
||||
|
||||
---
|
||||
|
||||
## With the `branch` processor
|
||||
|
||||
<pre class="mermaid">
|
||||
flowchart LR
|
||||
|
||||
CITY["
|
||||
city: Paris
|
||||
country: France
|
||||
population: 1106000
|
||||
iso2: FR
|
||||
...
|
||||
"]
|
||||
|
||||
REQ["
|
||||
model: qwen2:1.5b
|
||||
stream: false
|
||||
prompt: Who is the mayor of Paris?
|
||||
"]
|
||||
|
||||
REP["
|
||||
response: Anne Hidalgo
|
||||
eval_count: ...
|
||||
prompt_eval_count: ...
|
||||
(other ollama fields)
|
||||
"]
|
||||
|
||||
OUT["
|
||||
city: Paris
|
||||
country: France
|
||||
population: 1106000
|
||||
iso2: FR
|
||||
...
|
||||
mayor: Anne Hidalgo
|
||||
"]
|
||||
|
||||
CITY@{ shape: card}
|
||||
REQ@{ shape: card}
|
||||
REP@{ shape: card}
|
||||
OUT@{ shape: card}
|
||||
|
||||
style CITY text-align: left
|
||||
style REQ text-align: left
|
||||
style REP text-align: left
|
||||
style OUT text-align: left
|
||||
|
||||
branch@{ shape: diam }
|
||||
request_map@{ shape: diam }
|
||||
result_map@{ shape: diam }
|
||||
http["http processor"]@{ shape: diam }
|
||||
|
||||
CITY --> branch
|
||||
branch --> result_map
|
||||
branch --> request_map
|
||||
request_map --> REQ
|
||||
REQ --> http
|
||||
http --> REP
|
||||
REP --> result_map
|
||||
result_map --> OUT
|
||||
</pre>
|
||||
|
||||
- The `branch` processor allows doing the processing "on the side"
|
||||
|
||||
- `request_map` and `result_map` transform the message before/after processing
|
||||
|
||||
- Then, the result is combined with the original message (the `city`)
|
||||
|
||||
---
|
||||
|
||||
```yaml
|
||||
input:
|
||||
csv:
|
||||
paths: ["cities.csv"]
|
||||
pipeline:
|
||||
processors:
|
||||
- branch:
|
||||
request_map: |
|
||||
root.model = "qwen2:1.5b"
|
||||
root.stream = false
|
||||
root.prompt = (
|
||||
"Who is the mayor of %s? ".format(this.city) +
|
||||
"Just give the name by itself on a single line. " +
|
||||
"If you don't know, don't say anything."
|
||||
)
|
||||
processors:
|
||||
- http:
|
||||
url: http://ollama:11434/api/generate
|
||||
verb: POST
|
||||
result_map: |
|
||||
root.mayor = this.response
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Trying it out
|
||||
|
||||
- Save the YAML on the previous page into a configuration file
|
||||
|
||||
- Run Bento with that configuration file
|
||||
|
||||
- What happens?
|
||||
|
||||
--
|
||||
|
||||
🤔 We're seeing errors due to timeouts
|
||||
|
||||
```
|
||||
ERRO HTTP request to 'http://ollama...' failed: http://ollama...:
|
||||
Post "http://ollama...": context deadline exceeded
|
||||
(Client.Timeout exceeded while awaiting headers)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 🙋 Choose your own adventure
|
||||
|
||||
How should we address errors?
|
||||
|
||||
- Option 1: increase the timeout in the [http][bento-http] processor
|
||||
|
||||
- Option 2: use a [retry][bento-retry] processor in the pipeline
|
||||
|
||||
- Option 3: use a [reject_errored][bento-reject] output
|
||||
|
||||
---
|
||||
|
||||
## 🏗️ Let's build something!
|
||||
|
||||
- We want to process 1000 cities with our LLM
|
||||
|
||||
(guessing who the mayor is, or something similar)
|
||||
|
||||
- Store the output wherever we want
|
||||
|
||||
(Redis, CSV file, JSONL files...)
|
||||
|
||||
- Deal correctly with errors
|
||||
|
||||
(we'll check that there are, indeed, 1000 cities in the output)
|
||||
|
||||
- Scale out to process faster
|
||||
|
||||
(scale ollama to e.g. 10 replicas, enable parallelism in Bento)
|
||||
|
||||
---
|
||||
|
||||
class: title
|
||||
|
||||
🍱 Lunch time! 🍱
|
||||
|
||||
---
|
||||
|
||||
## What happened?
|
||||
|
||||
- If your Ollama pods have *resource requests*:
|
||||
|
||||
→ your cluster may have auto-scaled
|
||||
|
||||
- If your Ollama pods don't have *resource requests*:
|
||||
|
||||
→ you probably have a bunch of container restarts, due to out-of-memory errors
|
||||
|
||||
🤔 What's that about?
|
||||
|
||||
[bento-http]: https://warpstreamlabs.github.io/bento/docs/components/processors/http/
|
||||
[bento-inputs]: https://warpstreamlabs.github.io/bento/docs/components/inputs/about/
|
||||
[bento-reject]: https://warpstreamlabs.github.io/bento/docs/components/outputs/reject_errored
|
||||
[bento-retry]: https://warpstreamlabs.github.io/bento/docs/components/processors/retry
|
||||
[bento-switch]: https://warpstreamlabs.github.io/bento/docs/components/processors/switch/
|
||||
[enrichment]: https://warpstreamlabs.github.io/bento/cookbooks/enrichments/
|
||||
[output-http-server]: https://warpstreamlabs.github.io/bento/docs/components/outputs/http_server
|
||||
[redpanda-acquires-benthos]: https://www.redpanda.com/press/redpanda-acquires-benthos
|
||||
[warpstream-forks-benthos]: https://www.warpstream.com/blog/announcing-bento-the-open-source-fork-of-the-project-formerly-known-as-benthos
|
||||
|
||||
450
slides/k8s/bento-hpa.md
Normal file
450
slides/k8s/bento-hpa.md
Normal file
@@ -0,0 +1,450 @@
|
||||
# Autoscaling with KEDA
|
||||
|
||||
- Cluster autoscaling = automatically add nodes *when needed*
|
||||
|
||||
- *When needed* = when Pods are `Pending`
|
||||
|
||||
- How do these pods get created?
|
||||
|
||||
- When the Ollama Deployment is scaled up
|
||||
|
||||
- ... manually (e.g. `kubectl scale`)
|
||||
|
||||
- ... automatically (that's what we want to investigate now!)
|
||||
|
||||
---
|
||||
|
||||
## Ways to implement autoscaling
|
||||
|
||||
- Custom code
|
||||
|
||||
(e.g. crontab checking some value every few minutes and scaling accordingly)
|
||||
|
||||
- Kubernetes Horizontal Pod Autoscaler v1
|
||||
|
||||
(aka `kubectl autoscale`)
|
||||
|
||||
- Kubernetes Horizontal Pod Autoscaler v2 with custom metrics
|
||||
|
||||
(e.g. with Prometheus Adapter)
|
||||
|
||||
- Kubernetes Horizontal Pod Autoscaler v2 with external metrics
|
||||
|
||||
(e.g. with KEDA)
|
||||
|
||||
---
|
||||
|
||||
## Custom code
|
||||
|
||||
- No, we're not going to do that!
|
||||
|
||||
- But this would be an interesting exercise in RBAC
|
||||
|
||||
(setting minimal amount of permissions for the pod running our custom code)
|
||||
|
||||
---
|
||||
|
||||
## HPAv1
|
||||
|
||||
Pros: very straightforward
|
||||
|
||||
Cons: can only scale on CPU utilization
|
||||
|
||||
How it works:
|
||||
|
||||
- periodically measures average CPU *utilization* across pods
|
||||
|
||||
- if utilization is above/below a target (default: 80%), scale up/down
|
||||
|
||||
---
|
||||
|
||||
## HPAv1 in practice
|
||||
|
||||
- Create the autoscaling policy:
|
||||
```bash
|
||||
kubectl autoscale deployment ollama --max=1000
|
||||
```
|
||||
(The `--max` is required; it's a safety limit.)
|
||||
|
||||
- Check it:
|
||||
```bash
|
||||
kubectl describe hpa
|
||||
```
|
||||
|
||||
- Send traffic, wait a bit: pods should be created automatically
|
||||
|
||||
---
|
||||
|
||||
## HPAv2 custom vs external
|
||||
|
||||
- Custom metrics = arbitrary metrics attached to Kubernetes objects
|
||||
|
||||
- External metrics = arbitrary metrics not related to Kubernetes objects
|
||||
|
||||
--
|
||||
|
||||
🤔
|
||||
|
||||
---
|
||||
|
||||
## HPAv2 custom metrics
|
||||
|
||||
- Examples:
|
||||
|
||||
- on Pods: CPU, RAM, network traffic...
|
||||
|
||||
- on Ingress: requests per second, HTTP status codes, request duration...
|
||||
|
||||
- on some worker Deployment: number of tasks processed, task duration...
|
||||
|
||||
- Requires an *adapter* to:
|
||||
|
||||
- expose the metrics through the Kubernetes *aggregation layer*
|
||||
|
||||
- map the actual metrics source to Kubernetes objects
|
||||
|
||||
Example: the [Prometheus adapter][prometheus-adapter]
|
||||
|
||||
[prometheus-adapter]: https://github.com/kubernetes-sigs/prometheus-adapter
|
||||
|
||||
---
|
||||
|
||||
## HPAv2 custom metrics in practice
|
||||
|
||||
- We're not going to cover this here
|
||||
|
||||
(too complex / not enough time!)
|
||||
|
||||
- If you want more details, check [my other course material][hpav2slides]
|
||||
|
||||
[hpav2slides]: https://2024-10-enix.container.training/4.yml.html#toc-scaling-with-custom-metrics
|
||||
|
||||
---
|
||||
|
||||
## HPAv2 external metrics
|
||||
|
||||
- Examples:
|
||||
|
||||
- arbitrary Prometheus query
|
||||
|
||||
- arbitrary SQL query
|
||||
|
||||
- number of messages in a queue
|
||||
|
||||
- and [many, many more][keda-scalers]
|
||||
|
||||
- Also requires an extra component to expose the metrics
|
||||
|
||||
Example: [KEDA (https://keda.sh/)](https://keda.sh)
|
||||
|
||||
[keda-scalers]: https://keda.sh/docs/latest/scalers/
|
||||
|
||||
---
|
||||
|
||||
## HPAv2 external metrics in practice
|
||||
|
||||
- We're going to install KEDA
|
||||
|
||||
- And set it up to autoscale depending on the number of messages in Redis
|
||||
|
||||
---
|
||||
|
||||
## Installing KEDA
|
||||
|
||||
Multiple options (details in the [documentation][keda-deploy]):
|
||||
|
||||
- YAML
|
||||
|
||||
- Operator Hub
|
||||
|
||||
- Helm chart 💡
|
||||
|
||||
```bash
|
||||
helm upgrade --install --repo https://kedacore.github.io/charts \
|
||||
--namespace keda-system --create-namespace keda keda
|
||||
```
|
||||
|
||||
[keda-deploy]: https://keda.sh/docs/latest/deploy/
|
||||
|
||||
---
|
||||
|
||||
## Scaling according to Redis
|
||||
|
||||
- We need to create a KEDA Scaler
|
||||
|
||||
- This is done with a "ScaledObject" manifest
|
||||
|
||||
- [Here is the documentation][keda-redis-lists] for the Redis Lists Scaler
|
||||
|
||||
- Let's write that manifest!
|
||||
|
||||
[keda-redis-lists]: https://keda.sh/docs/latest/scalers/redis-lists/
|
||||
|
||||
---
|
||||
|
||||
## `keda-redis-scaler.yaml`
|
||||
|
||||
```yaml
|
||||
apiVersion: keda.sh/v1alpha1
|
||||
kind: ScaledObject
|
||||
metadata:
|
||||
name: ollama
|
||||
spec:
|
||||
scaleTargetRef:
|
||||
name: ollama
|
||||
triggers:
|
||||
- type: redis
|
||||
metadata:
|
||||
address: redis.`default`.svc:6379
|
||||
listName: cities
|
||||
listLength: "10"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Notes
|
||||
|
||||
- We need to update the `address` field with our namespace
|
||||
|
||||
(unless we are running in the `default` namespace)
|
||||
|
||||
- Alternative: use `addressFromEnv` and set an env var in the Ollama pods
|
||||
|
||||
- `listLength` gives the target ratio of `messages / replicas`
|
||||
|
||||
- In our example, KEDA will scale the Deployment to `messages / 10`
|
||||
|
||||
(rounded up!)
|
||||
|
||||
---
|
||||
|
||||
## Trying it out
|
||||
|
||||
- Apply the ScaledObject manifest
|
||||
|
||||
- Start a Bento pipeline loading e.g. 100-1000 cities in Redis
|
||||
|
||||
(100 on smaller clusters / slower CPUs, 1000 on bigger / faster ones)
|
||||
|
||||
- Check pod and node resource usage
|
||||
|
||||
- What do we see?
|
||||
|
||||
--
|
||||
|
||||
🤩 The Deployment scaled up automatically!
|
||||
|
||||
--
|
||||
|
||||
🤔 But Pod resource usage remains very low (A few busy pods, many idle)
|
||||
|
||||
--
|
||||
|
||||
💡 Bento doesn't submit enough requests in parallel!
|
||||
|
||||
---
|
||||
|
||||
# Improving throughput
|
||||
|
||||
We're going to review multiple techniques:
|
||||
|
||||
1. Increase parallelism inside the Bento pipeline.
|
||||
|
||||
2. Run multiple Bento consumers.
|
||||
|
||||
3. Couple consumers and processors more tightly.
|
||||
|
||||
---
|
||||
|
||||
## 1️⃣ Increase pipeline parallelism
|
||||
|
||||
- Set `parallel` to `true` in the `http` processor
|
||||
|
||||
- Wrap the input around a `batched` input
|
||||
|
||||
(otherwise, we don't have enough messages in flight)
|
||||
|
||||
- Increase `http` timeout significantly (e.g. to 5 minutes)
|
||||
|
||||
---
|
||||
|
||||
## Results
|
||||
|
||||
🎉 More messages flow through the pipeline
|
||||
|
||||
🎉 Many requests happen in parallel
|
||||
|
||||
🤔 Average Pod and Node CPU utilization is higher, but not maxed out
|
||||
|
||||
🤔 HTTP queue size (measured with HAProxy metrics) is relatively high
|
||||
|
||||
🤔 Latency is higher too
|
||||
|
||||
Why?
|
||||
|
||||
---
|
||||
|
||||
## Too many requests in parallel
|
||||
|
||||
- Earlier, we didn't have enough...
|
||||
|
||||
- ...Now, we have too much!
|
||||
|
||||
- However, for a very big request queue, it still wouldn't be enough
|
||||
|
||||
💡 We currently have a fixed parallelism. We need to make it dynamic!
|
||||
|
||||
---
|
||||
|
||||
## 2️⃣ Run multiple Bento consumers
|
||||
|
||||
- Restore the original Bento configuration
|
||||
|
||||
(flip `parallel` back to `false`; remove the `batched` input)
|
||||
|
||||
- Run Bento in a Deployment
|
||||
|
||||
(e.g. with the [Bento Helm chart][bento-helm-chart])
|
||||
|
||||
- Autoscale that Deployment like we autoscaled the Ollama Deployment
|
||||
|
||||
[bento-helm-chart]: https://github.com/warpstreamlabs/bento-helm-chart
|
||||
|
||||
---
|
||||
|
||||
## Results
|
||||
|
||||
🤔🤔🤔 Pretty much the same as before!
|
||||
|
||||
(High throughput, high utilization but not maxed out, high latency...)
|
||||
|
||||
--
|
||||
|
||||
🤔🤔🤔 Why?
|
||||
|
||||
---
|
||||
|
||||
## Unbalanced load balancing
|
||||
|
||||
- All our requests go through the `ollama` Service
|
||||
|
||||
- We're still using the default Kubernetes service proxy!
|
||||
|
||||
- It doesn't spread the requests properly across all the backends
|
||||
|
||||
---
|
||||
|
||||
## 3️⃣ Couple consumers and processors
|
||||
|
||||
What if:
|
||||
|
||||
--
|
||||
|
||||
instead of sending requests to a load balancer,
|
||||
|
||||
--
|
||||
|
||||
each queue consumer had its own Ollama instance?
|
||||
|
||||
---
|
||||
|
||||
## Current architecture
|
||||
|
||||
<pre class="mermaid">
|
||||
flowchart LR
|
||||
subgraph P1["Pod"]
|
||||
H1["HAProxy"] --> O1["Ollama"]
|
||||
end
|
||||
subgraph P2["Pod"]
|
||||
H2["HAProxy"] --> O2["Ollama"]
|
||||
end
|
||||
subgraph P3["Pod"]
|
||||
H3["HAProxy"] --> O3["Ollama"]
|
||||
end
|
||||
Q["Queue<br/>(Redis)"] <--> C["Consumer<br/>(Bento)"] --> LB["Load Balancer<br/>(kube-proxy)"]
|
||||
LB --> H1 & H2 & H3
|
||||
</pre>
|
||||
|
||||
---
|
||||
|
||||
## Proposed architecture
|
||||
|
||||
<pre class="mermaid">
|
||||
flowchart LR
|
||||
subgraph P1["Consumer Pod"]
|
||||
C1["Bento"] --> H1["HAProxy"] --> O1["Ollama"]
|
||||
end
|
||||
subgraph P2["Consumer Pod"]
|
||||
C2["Bento"] --> H2["HAProxy"] --> O2["Ollama"]
|
||||
end
|
||||
subgraph P3["Consumer Pod"]
|
||||
C3["Bento"] --> H3["HAProxy"] --> O3["Ollama"]
|
||||
end
|
||||
Queue["Queue"] <--> C1 & C2 & C3
|
||||
</pre>
|
||||
|
||||
---
|
||||
|
||||
## 🏗️ Let's build something!
|
||||
|
||||
- Let's implement that architecture!
|
||||
|
||||
- See next slides for hints / getting started
|
||||
|
||||
---
|
||||
|
||||
## Hints
|
||||
|
||||
We need to:
|
||||
|
||||
- Update the Bento consumer configuration to talk to localhost
|
||||
|
||||
- Store that configuration in a ConfigMap
|
||||
|
||||
- Add a Bento container to the Ollama Deployment
|
||||
|
||||
- Profit!
|
||||
|
||||
---
|
||||
|
||||
## Results
|
||||
|
||||
🎉 Node and Pod utilization is maximized
|
||||
|
||||
🎉 HTTP queue size is bounded
|
||||
|
||||
🎉 Deployment autoscales up and down
|
||||
|
||||
---
|
||||
|
||||
## ⚠️ Scaling down
|
||||
|
||||
- Eventually, there are fewer messages in the queue
|
||||
|
||||
- The HPA scales down the Ollama Deployment
|
||||
|
||||
- This terminates some Ollama Pods
|
||||
|
||||
🤔 What happens if these Pods were processing requests?
|
||||
|
||||
--
|
||||
|
||||
- The requests might be lost!
|
||||
|
||||
---
|
||||
|
||||
## Avoiding lost messages
|
||||
|
||||
Option 1:
|
||||
|
||||
- cleanly shutdown the consumer
|
||||
|
||||
- make sure that Ollama can complete in-flight requests
|
||||
|
||||
(by extending its grace period)
|
||||
|
||||
- find a way to terminate Ollama when no more requests are in flight
|
||||
|
||||
Option 2:
|
||||
|
||||
- use *message acknowledgement*
|
||||
325
slides/k8s/bento-intro.md
Normal file
325
slides/k8s/bento-intro.md
Normal file
@@ -0,0 +1,325 @@
|
||||
# Getting started with Bento
|
||||
|
||||
How can we move to a message queue architecture...
|
||||
|
||||
*...without rewriting a bunch of code?*
|
||||
|
||||
🤔
|
||||
|
||||
---
|
||||
|
||||
## Bento
|
||||
|
||||
https://bento.dev/
|
||||
|
||||
"Fancy stream processing made operationally mundane"
|
||||
|
||||
"Written in Go, deployed as a static binary, declarative configuration. Open source and cloud native as utter heck."
|
||||
|
||||
With ✨ amazing ✨ documentation 😍
|
||||
|
||||
---
|
||||
|
||||
class: extra-details
|
||||
|
||||
## Tiny bit of history
|
||||
|
||||
- Original project: Benthos
|
||||
|
||||
- May 30, 2024: [Redpanda acquires Benthos][redpanda-acquires-benthos]
|
||||
|
||||
- Benthos is now Redpanda Connect
|
||||
|
||||
- some parts have been relicensed as commercial products
|
||||
|
||||
- May 31, 2024: [Warpstream forks Benthos][warpstream-forks-benthos]
|
||||
|
||||
- that fork is named "Bento"
|
||||
|
||||
- it's fully open source
|
||||
|
||||
- We're going to use Bento here, but Redpanda Connect should work fine too!
|
||||
|
||||
---
|
||||
|
||||
## Bento concepts
|
||||
|
||||
- Message stream processor
|
||||
|
||||
- Each pipeline is configured by a YAML configuration that defines:
|
||||
|
||||
- input (where do we get the messages?)
|
||||
|
||||
- pipeline (optional: how do we transform the messages?)
|
||||
|
||||
- output (where do we put the messages afterwards?)
|
||||
|
||||
- Once Bento is started, it runs the pipelines forever
|
||||
|
||||
(except for pipelines that have a logical end, e.g. reading from a file)
|
||||
|
||||
- Embedded language (Bloblang) to manipulate/transform messages
|
||||
|
||||
---
|
||||
|
||||
## Messages
|
||||
|
||||
- Typically JSON objects
|
||||
|
||||
(but raw strings are also possible)
|
||||
|
||||
- Nesting, arrays, etc. are OK
|
||||
|
||||
---
|
||||
|
||||
## Getting started with Bento
|
||||
|
||||
We're going to:
|
||||
|
||||
1. Import a bunch of cities from a CSV file into a Redis queue.
|
||||
|
||||
2. Read back these cities using a web server.
|
||||
|
||||
3. Use an "enrichment workflow" to query our LLM for each city.
|
||||
|
||||
---
|
||||
|
||||
## 1️⃣ Importing cities
|
||||
|
||||
Let's break down the work:
|
||||
|
||||
- download the data set
|
||||
|
||||
- create the Bento configuration
|
||||
|
||||
- deploy Redis
|
||||
|
||||
- start Bento
|
||||
|
||||
---
|
||||
|
||||
## Downloading the data set
|
||||
|
||||
- Example database:
|
||||
|
||||
https://www.kaggle.com/datasets/juanmah/world-cities
|
||||
|
||||
- Let's download and uncompress the data set:
|
||||
```bash
|
||||
curl -fsSL https://www.kaggle.com/api/v1/datasets/download/juanmah/world-cities |
|
||||
funzip > cities.csv
|
||||
```
|
||||
|
||||
(Ignore the "length error", it's harmless!)
|
||||
|
||||
- Check the structure of the data set:
|
||||
```bash
|
||||
head cities.csv
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Creating the Bento configuration
|
||||
|
||||
- We need to find which `input` and `output` to use
|
||||
|
||||
- Check the list with `bento list` or the [documentation][bento-inputs]
|
||||
|
||||
- Then run `bento create INPUTNAME/PIPELINENAME/OUTPUTNAME`
|
||||
|
||||
- Generate a configuration file:
|
||||
```bash
|
||||
bento create csv//redis_list > csv2redis.yaml
|
||||
```
|
||||
|
||||
- Edit that configuration file; look for the `(required)` parameters
|
||||
|
||||
(Everything else can go away!)
|
||||
|
||||
---
|
||||
|
||||
## Resulting configuration
|
||||
|
||||
If we trim all the default values, here is the result:
|
||||
|
||||
```yaml
|
||||
input:
|
||||
csv:
|
||||
paths: ["cities.csv"]
|
||||
output:
|
||||
redis_list:
|
||||
url: redis://redis:6379 # No default (required)
|
||||
key: cities
|
||||
```
|
||||
|
||||
We'll call that file `csv2redis.yaml`.
|
||||
|
||||
---
|
||||
|
||||
## Deploying Redis
|
||||
|
||||
- Create a Deployment:
|
||||
```bash
|
||||
kubectl create deployment redis --image redis
|
||||
```
|
||||
|
||||
- Expose it:
|
||||
```bash
|
||||
kubectl expose deployment redis --port 6379
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Starting Bento
|
||||
|
||||
Option 1: run it manually in a pod, to see what's going on.
|
||||
|
||||
```bash
|
||||
bento --config csv2redis.yaml
|
||||
```
|
||||
|
||||
Option 2: run it with e.g. the Bento Helm chart.
|
||||
|
||||
*We're not going to do that yet, since this particular pipeline has a logical end.*
|
||||
|
||||
*(The Helm chart is best suited to pipelines that run forever.)*
|
||||
|
||||
---
|
||||
|
||||
## Expected output
|
||||
|
||||
.small[
|
||||
```
|
||||
INFO Running main config from specified file @service=bento bento_version="" path=csv2redis.yaml
|
||||
INFO Launching a Bento instance, use CTRL+C to close @service=bento
|
||||
INFO Listening for HTTP requests at: http://0.0.0.0:4195 @service=bento
|
||||
INFO Input type csv is now active @service=bento label="" path=root.input
|
||||
INFO Output type redis_list is now active @service=bento label="" path=root.output
|
||||
INFO Pipeline has terminated. Shutting down the service @service=bento
|
||||
```
|
||||
]
|
||||
|
||||
The pipeline should complete in just a few seconds.
|
||||
|
||||
---
|
||||
|
||||
## Checking what's in Redis
|
||||
|
||||
- Connect to our Redis instance:
|
||||
```bash
|
||||
redis-cli -h redis
|
||||
```
|
||||
|
||||
- List keys:
|
||||
```redis
|
||||
KEYS *
|
||||
```
|
||||
|
||||
- Check that the `cities` list has approx. 47000 elements:
|
||||
```redis
|
||||
LLEN cities
|
||||
```
|
||||
|
||||
- Get the first element of the list:
|
||||
```redis
|
||||
LINDEX cities 0
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Fun with Bloblang
|
||||
|
||||
- Let's add a filter to keep only cities with a population above 10,000,000
|
||||
|
||||
- Add the following block to the Bento configuration:
|
||||
|
||||
```yaml
|
||||
pipeline:
|
||||
processors:
|
||||
- switch:
|
||||
- check: this.population == ""
|
||||
processors:
|
||||
- mapping: root = deleted()
|
||||
- check: this.population.int64() < 10000000
|
||||
processors:
|
||||
- mapping: root = deleted()
|
||||
```
|
||||
|
||||
(See the [docs][bento-switch] for details about the `switch` processor.)
|
||||
|
||||
---
|
||||
|
||||
## Testing our processor
|
||||
|
||||
- First, delete the existing `cities` list:
|
||||
```bash
|
||||
redis-cli -h redis DEL cities
|
||||
```
|
||||
|
||||
- Then, run the Bento pipeline again:
|
||||
```bash
|
||||
bento --config csv2redis.yaml
|
||||
```
|
||||
(It should complain about a few cities where the population has a decimal point.)
|
||||
|
||||
- Check how many cities were loaded:
|
||||
```bash
|
||||
redis-cli -h redis LLEN cities
|
||||
```
|
||||
(There should be 47.)
|
||||
|
||||
---
|
||||
|
||||
## 2️⃣ Consume the queue over HTTP
|
||||
|
||||
- We want to "get the next city" in the queue with a simple `curl`
|
||||
|
||||
- Our input will be `redis_list`
|
||||
|
||||
- Our output will be `http_server`
|
||||
|
||||
---
|
||||
|
||||
## 🏗️ Let's generate the Bento configuration!
|
||||
|
||||
Option 1: `bento create redis_list//http_server`
|
||||
|
||||
Option 2: [read the docs][output-http-server]
|
||||
|
||||
⚠️ Spoilers on next slide!
|
||||
|
||||
---
|
||||
|
||||
## `redis2http.yaml`
|
||||
|
||||
```yaml
|
||||
input:
|
||||
redis_list:
|
||||
url: redis://redis:`6379`
|
||||
key: cities
|
||||
output:
|
||||
http_server:
|
||||
path: /nextcity
|
||||
```
|
||||
|
||||
This will set up an HTTP route to fetch *one* city.
|
||||
|
||||
It's also possible to batch, stream...
|
||||
|
||||
⚠️ As of November 2024, `bento create` uses port 6397 instead of 6379 for Redis!
|
||||
|
||||
---
|
||||
|
||||
## Trying it out
|
||||
|
||||
- Run Bento with this configuration:
|
||||
```bash
|
||||
bento --config redis2http.yaml &
|
||||
```
|
||||
|
||||
- Retrieve one city:
|
||||
```bash
|
||||
curl http://localhost:4195/nextcity
|
||||
```
|
||||
|
||||
- Check what happens after we retrieve *all* the cities!
|
||||
250
slides/k8s/bento-rmq.md
Normal file
250
slides/k8s/bento-rmq.md
Normal file
@@ -0,0 +1,250 @@
|
||||
# Bento & RabbitMQ
|
||||
|
||||
- In some of the previous runs, messages were dropped
|
||||
|
||||
(we start with 1000 messages in `cities` and have e.g. 955 in `mayors`)
|
||||
|
||||
- This is caused by various errors during processing
|
||||
|
||||
(e.g. too many timeouts; Bento being shutdown halfway through...)
|
||||
|
||||
- ...And by the fact that we are using a Redis queue
|
||||
|
||||
(which doesn't offer delivery guarantees or acknowledgements)
|
||||
|
||||
- Can we get something better?
|
||||
|
||||
---
|
||||
|
||||
## The problem
|
||||
|
||||
- Some inputs (like `redis_list`) don't support *acknowledgements*
|
||||
|
||||
- When a message is pulled from the queue, it is deleted immediately
|
||||
|
||||
- If the message is lost for any reason, it is lost permanently
|
||||
|
||||
---
|
||||
|
||||
## The solution
|
||||
|
||||
- Some inputs (like `amqp_0_9`) support acknowledgements
|
||||
|
||||
- When a message is pulled from the queue:
|
||||
|
||||
- it is not visible anymore to other consumers
|
||||
|
||||
- it needs to be explicitly acknowledged
|
||||
|
||||
- The acknowledgement is done by Bento when the message reaches the output
|
||||
|
||||
- The acknowledgement deletes the message
|
||||
|
||||
- No acknowledgement after a while? Consumer crashes/disconnects?
|
||||
|
||||
Message gets requeued automatically!
|
||||
|
||||
---
|
||||
|
||||
## `amqp_0_9`
|
||||
|
||||
- Protocol used by RabbitMQ
|
||||
|
||||
- Very simplified behavior:
|
||||
|
||||
- messages are published to an [*exchange*][amqp-exchanges]
|
||||
|
||||
- messages have a *routing key*
|
||||
|
||||
- the exchange routes the message to one (or zero or more) queues
|
||||
<br/>(possibly using the routing key or message headers to decide which queue(s))
|
||||
|
||||
- [*consumers*][amqp-consumers] subscribe to queues to receive messages
|
||||
|
||||
[amqp-exchanges]: https://www.rabbitmq.com/tutorials/amqp-concepts#exchanges
|
||||
[amqp-consumers]: https://www.rabbitmq.com/tutorials/amqp-concepts#consumers
|
||||
|
||||
---
|
||||
|
||||
## Using the default exchange
|
||||
|
||||
- There is a default exchange (called `""` - empty string)
|
||||
|
||||
- The routing key indicates the name of the queue to deliver to
|
||||
|
||||
- The queue needs to exist (we need to create it beforehand)
|
||||
|
||||
---
|
||||
|
||||
class: extra-details
|
||||
|
||||
## Defining custom exchanges
|
||||
|
||||
- Create an exchange
|
||||
|
||||
- exchange types: direct, fanout, topic, headers
|
||||
|
||||
- durability: persisted to disk to survive server restart or not?
|
||||
|
||||
- Create a binding
|
||||
|
||||
- which exchange?
|
||||
|
||||
- which routing key? (for direct exchanges)
|
||||
|
||||
- which queue?
|
||||
|
||||
---
|
||||
|
||||
## RabbitMQ on Kubernetes
|
||||
|
||||
- RabbitMQ can be deployed on Kubernetes:
|
||||
|
||||
- directly (creating e.g. a StatefulSet)
|
||||
|
||||
- with the RabbitMQ operator
|
||||
|
||||
- We're going to do the latter!
|
||||
|
||||
- The operator includes the "topology operator"
|
||||
|
||||
(to configure queues, exchanges, and bindings through custom resources)
|
||||
|
||||
---
|
||||
|
||||
## Installing the RabbitMQ operator
|
||||
|
||||
- Let's install it with this Helm chart:
|
||||
|
||||
```bash
|
||||
helm upgrade --install --repo https://charts.bitnami.com/bitnami \
|
||||
--namespace rabbitmq-system --create-namespace \
|
||||
rabbitmq-cluster-operator rabbitmq-cluster-operator
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Deploying a simple RabbitMQ cluster
|
||||
|
||||
- Let's use the YAML manifests in that directory:
|
||||
|
||||
https://github.com/jpetazzo/beyond-load-balancers/tree/main/rabbitmq
|
||||
|
||||
- This creates:
|
||||
|
||||
- a `RabbitmqCluster` called `mq`
|
||||
|
||||
- a `Secret` called `mq-default-user` containing access credentials
|
||||
|
||||
- a durable `Queue` named `q1`
|
||||
|
||||
(We can ignore the `Exchange` and the `Binding`, we won't use them.)
|
||||
|
||||
---
|
||||
|
||||
## 🏗️ Let's build something!
|
||||
|
||||
Let's replace the `cities` Redis list with our RabbitMQ queue.
|
||||
|
||||
(See next slide for steps and hints!)
|
||||
|
||||
---
|
||||
|
||||
## Steps
|
||||
|
||||
1. Edit the Bento configuration for our "CSV importer".
|
||||
|
||||
(replace the `redis_list` output with `amqp_0_9`)
|
||||
|
||||
2. Run that pipeline and confirm that messages show up in RabbitMQ.
|
||||
|
||||
3. Edit the Bento configuration for the Ollama consumer.
|
||||
|
||||
(replace the `redis_list` input with `amqp_0_9`)
|
||||
|
||||
4. Trigger a scale up of the Ollama consumer.
|
||||
|
||||
5. Update the KEDA Scaler to use RabbitMQ instead of Redis.
|
||||
|
||||
---
|
||||
|
||||
## 1️⃣ Sending messages to RabbitMQ
|
||||
|
||||
- Edit our Bento configuration (the one feeding the CSV file to Redis)
|
||||
|
||||
- We want the following `output` section:
|
||||
```yaml
|
||||
output:
|
||||
amqp_0_9:
|
||||
exchange: ""
|
||||
key: q1
|
||||
mandatory: true
|
||||
urls:
|
||||
- "${AMQP_URL}"
|
||||
```
|
||||
|
||||
- Then export the AMQP_URL environment variable using `connection_string` from Secret `mq-default-user`
|
||||
|
||||
💡 Yes, we can directly use environment variables in Bento configuration!
|
||||
|
||||
---
|
||||
|
||||
## 2️⃣ Testing our AMQP output
|
||||
|
||||
- Run the Bento pipeline
|
||||
|
||||
- To check that our messages made it:
|
||||
```bash
|
||||
kubectl exec mq-server-0 -- rabbitmqctl list_queues
|
||||
```
|
||||
|
||||
- We can also use Prometheus metrics, e.g. `rabbitmq_queue_messages`
|
||||
|
||||
---
|
||||
|
||||
## 3️⃣ Receiving messages from RabbitMQ
|
||||
|
||||
- Edit our other Bento configuration (the one in the Ollama consumer Pod)
|
||||
|
||||
- We want the following `input` section:
|
||||
```yaml
|
||||
input:
|
||||
amqp_0_9:
|
||||
urls:
|
||||
- `amqp://...:5672/`
|
||||
queue: q1
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 4️⃣ Triggering Ollama scale up
|
||||
|
||||
- If the autoscaler is configured to scale to zero, disable it
|
||||
|
||||
(easiest solution: delete the ScaledObject)
|
||||
|
||||
- Then manually scale the Deployment to e.g. 4 Pods
|
||||
|
||||
- Check that messages are processed and show up in the output
|
||||
|
||||
(it should still be a Redis list at this point)
|
||||
|
||||
---
|
||||
|
||||
## 5️⃣ Autoscaling on RabbitMQ
|
||||
|
||||
- We need to update our ScaledObject
|
||||
|
||||
- Check the [RabbitMQ Queue Scaler][keda-rabbitmq]
|
||||
|
||||
- Multiple ways to pass the AMQP URL:
|
||||
|
||||
- hardcode it (easier solution for testing!)
|
||||
|
||||
- use `...fromEnv` and set environment variables in target pod
|
||||
|
||||
- create and use a TriggerAuthentication
|
||||
|
||||
💡 Since we have the AMQP URL in a Secret, TriggerAuthentication works great!
|
||||
|
||||
[keda-rabbitmq]: https://keda.sh/docs/latest/scalers/rabbitmq-queue/
|
||||
@@ -55,6 +55,7 @@
|
||||
|
||||
`cert-manager.io/allow-direct-injection: "true"`
|
||||
|
||||
- See [cert-manager documentation][docs] for details
|
||||
- See [cert-manager documentation] for details
|
||||
|
||||
[cert-manager documentation]: https://cert-manager.io/docs/concepts/ca-injector/
|
||||
|
||||
[docs]: https://cert-manager.io/docs/concepts/ca-injector/
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
# Cluster autoscaler
|
||||
# Scaling up the cluster
|
||||
|
||||
- When the cluster is full, we need to add more nodes
|
||||
|
||||
@@ -221,7 +221,9 @@ THEN add a Node.
|
||||
|
||||
---
|
||||
|
||||
## Scaling down in theory
|
||||
# Scaling down the cluster
|
||||
|
||||
*In theory:*
|
||||
|
||||
IF a Node has less than 50% utilization for 10 minutes,
|
||||
|
||||
@@ -272,9 +274,9 @@ This can be overridden by setting the annotation:
|
||||
|
||||
- Can express `minAvailable` or `maxUnavailable`
|
||||
|
||||
- See [documentation] for details and examples
|
||||
- See [documentation][doc-pdb] for details and examples
|
||||
|
||||
[documentation]: https://kubernetes.io/docs/tasks/run-application/configure-pdb/
|
||||
[doc-pdb]: https://kubernetes.io/docs/tasks/run-application/configure-pdb/
|
||||
|
||||
---
|
||||
|
||||
|
||||
@@ -81,7 +81,7 @@
|
||||
|
||||
## What version are we running anyway?
|
||||
|
||||
- When I say, "I'm running Kubernetes 1.22", is that the version of:
|
||||
- When I say, "I'm running Kubernetes 1.28", is that the version of:
|
||||
|
||||
- kubectl
|
||||
|
||||
@@ -129,15 +129,15 @@
|
||||
|
||||
## Kubernetes uses semantic versioning
|
||||
|
||||
- Kubernetes versions look like MAJOR.MINOR.PATCH; e.g. in 1.22.17:
|
||||
- Kubernetes versions look like MAJOR.MINOR.PATCH; e.g. in 1.28.9:
|
||||
|
||||
- MAJOR = 1
|
||||
- MINOR = 22
|
||||
- PATCH = 17
|
||||
- MINOR = 28
|
||||
- PATCH = 9
|
||||
|
||||
- It's always possible to mix and match different PATCH releases
|
||||
|
||||
(e.g. 1.22.17 and 1.22.5 are compatible)
|
||||
(e.g. 1.28.9 and 1.28.13 are compatible)
|
||||
|
||||
- It is recommended to run the latest PATCH release
|
||||
|
||||
@@ -153,9 +153,9 @@
|
||||
|
||||
- All components support a difference of one¹ MINOR version
|
||||
|
||||
- This allows live upgrades (since we can mix e.g. 1.22 and 1.23)
|
||||
- This allows live upgrades (since we can mix e.g. 1.28 and 1.29)
|
||||
|
||||
- It also means that going from 1.22 to 1.24 requires going through 1.23
|
||||
- It also means that going from 1.28 to 1.30 requires going through 1.29
|
||||
|
||||
.footnote[¹Except kubelet, which can be up to two MINOR behind API server,
|
||||
and kubectl, which can be one MINOR ahead or behind API server.]
|
||||
@@ -254,7 +254,7 @@ and kubectl, which can be one MINOR ahead or behind API server.]
|
||||
sudo vim /etc/kubernetes/manifests/kube-apiserver.yaml
|
||||
```
|
||||
|
||||
- Look for the `image:` line, and update it to e.g. `v1.24.1`
|
||||
- Look for the `image:` line, and update it to e.g. `v1.30.1`
|
||||
|
||||
]
|
||||
|
||||
@@ -320,53 +320,29 @@ Note 2: kubeadm itself is still version 1.22.1..
|
||||
|
||||
- First things first: we need to upgrade kubeadm
|
||||
|
||||
.lab[
|
||||
- The Kubernetes package repositories are now split by minor versions
|
||||
|
||||
- Upgrade kubeadm:
|
||||
```
|
||||
sudo apt install kubeadm=1.27.0-00
|
||||
```
|
||||
(i.e. there is one repository for 1.28, another for 1.29, etc.)
|
||||
|
||||
- Check what kubeadm tells us:
|
||||
```
|
||||
sudo kubeadm upgrade plan
|
||||
```
|
||||
- This avoids accidentally upgrading from one minor version to another
|
||||
|
||||
]
|
||||
(e.g. with unattended upgrades or if packages haven't been held/pinned)
|
||||
|
||||
Problem: kubeadm doesn't know know how to handle
|
||||
upgrades from version 1.22.
|
||||
|
||||
This is because we installed version 1.27.
|
||||
|
||||
We need to install kubeadm version 1.23.X.
|
||||
- We'll need to add the new package repository and unpin packages!
|
||||
|
||||
---
|
||||
|
||||
## Downgrading kubeadm
|
||||
## Installing the new packages
|
||||
|
||||
- We need to go back to kubeadm version 1.23.X.
|
||||
- Edit `/etc/apt/sources.list.d/kubernetes.list`
|
||||
|
||||
.lab[
|
||||
(or copy it to e.g. `kubernetes-1.29.list` and edit that)
|
||||
|
||||
- View available versions for package `kubeadm`:
|
||||
```bash
|
||||
apt show kubeadm -a | grep ^Version | grep 1.23
|
||||
```
|
||||
- `apt-get update`
|
||||
|
||||
- Downgrade kubeadm:
|
||||
```
|
||||
sudo apt install kubeadm=1.23.0-00
|
||||
```
|
||||
- Now edit (or remove) `/etc/apt/preferences.d/kubernetes`
|
||||
|
||||
- Check what kubeadm tells us:
|
||||
```
|
||||
sudo kubeadm upgrade plan
|
||||
```
|
||||
|
||||
]
|
||||
|
||||
kubeadm should now agree to upgrade to 1.23.X.
|
||||
- `apt-get install kubeadm` should now upgrade `kubeadm` correctly! 🎉
|
||||
|
||||
---
|
||||
|
||||
@@ -385,7 +361,7 @@ kubeadm should now agree to upgrade to 1.23.X.
|
||||
|
||||
- Look for the `image:` line, and restore it to the original value
|
||||
|
||||
(e.g. `v1.22.17`)
|
||||
(e.g. `v1.28.9`)
|
||||
|
||||
- Wait for the control plane to come back up
|
||||
|
||||
@@ -399,9 +375,14 @@ kubeadm should now agree to upgrade to 1.23.X.
|
||||
|
||||
.lab[
|
||||
|
||||
- Check the upgrade plan:
|
||||
```bash
|
||||
sudo kubeadm upgrade plan
|
||||
```
|
||||
|
||||
- Perform the upgrade:
|
||||
```bash
|
||||
sudo kubeadm upgrade apply v1.23.0
|
||||
sudo kubeadm upgrade apply v1.29.0
|
||||
```
|
||||
|
||||
]
|
||||
@@ -418,15 +399,9 @@ kubeadm should now agree to upgrade to 1.23.X.
|
||||
|
||||
- Log into node `oldversion2`
|
||||
|
||||
- View available versions for package `kubelet`:
|
||||
```bash
|
||||
apt show kubelet -a | grep ^Version
|
||||
```
|
||||
- Update package lists and APT pins like we did before
|
||||
|
||||
- Upgrade kubelet:
|
||||
```bash
|
||||
sudo apt install kubelet=1.23.0-00
|
||||
```
|
||||
- Then upgrade kubelet
|
||||
|
||||
]
|
||||
|
||||
@@ -479,13 +454,16 @@ kubeadm should now agree to upgrade to 1.23.X.
|
||||
|
||||
.lab[
|
||||
|
||||
- Download the configuration on each node, and upgrade kubelet:
|
||||
- Execute the whole upgrade procedure on each node:
|
||||
```bash
|
||||
for N in 1 2 3; do
|
||||
ssh oldversion$N "
|
||||
sudo apt install kubeadm=1.23.0-00 &&
|
||||
sudo sed -i s/1.28/1.29/ /etc/apt/sources.list.d/kubernetes.list &&
|
||||
sudo rm /etc/apt/preferences.d/kubernetes &&
|
||||
sudo apt update &&
|
||||
sudo apt install kubeadm -y &&
|
||||
sudo kubeadm upgrade node &&
|
||||
sudo apt install kubelet=1.23.0-00"
|
||||
sudo apt install kubelet -y"
|
||||
done
|
||||
```
|
||||
]
|
||||
@@ -494,7 +472,7 @@ kubeadm should now agree to upgrade to 1.23.X.
|
||||
|
||||
## Checking what we've done
|
||||
|
||||
- All our nodes should now be updated to version 1.23.0
|
||||
- All our nodes should now be updated to version 1.29
|
||||
|
||||
.lab[
|
||||
|
||||
@@ -507,17 +485,115 @@ kubeadm should now agree to upgrade to 1.23.X.
|
||||
|
||||
---
|
||||
|
||||
## And now, was that a good idea?
|
||||
|
||||
--
|
||||
|
||||
**Almost!**
|
||||
|
||||
--
|
||||
|
||||
- The official recommendation is to *drain* a node before performing node maintenance
|
||||
|
||||
(migrate all workloads off the node before upgrading it)
|
||||
|
||||
- How do we do that?
|
||||
|
||||
- Is it really necessary?
|
||||
|
||||
- Let's see!
|
||||
|
||||
---
|
||||
|
||||
## Draining a node
|
||||
|
||||
- This can be achieved with the `kubectl drain` command, which will:
|
||||
|
||||
- *cordon* the node (prevent new pods from being scheduled there)
|
||||
|
||||
- *evict* all the pods running on the node (delete them gracefully)
|
||||
|
||||
- the evicted pods will automatically be recreated somewhere else
|
||||
|
||||
- evictions might be blocked in some cases (Pod Disruption Budgets, `emptyDir` volumes)
|
||||
|
||||
- Once the node is drained, it can safely be upgraded, restarted...
|
||||
|
||||
- Once it's ready, it can be put back in commission with `kubectl uncordon`
|
||||
|
||||
---
|
||||
|
||||
## Is it necessary?
|
||||
|
||||
- When upgrading kubelet from one patch-level version to another:
|
||||
|
||||
- it's *probably fine*
|
||||
|
||||
- When upgrading system packages:
|
||||
|
||||
- it's *probably fine*
|
||||
|
||||
- except [when it's not][datadog-systemd-outage]
|
||||
|
||||
- When upgrading the kernel:
|
||||
|
||||
- it's *probably fine*
|
||||
|
||||
- ...as long as we can tolerate a restart of the containers on the node
|
||||
|
||||
- ...and that they will be unavailable for a few minutes (during the reboot)
|
||||
|
||||
[datadog-systemd-outage]: https://www.datadoghq.com/blog/engineering/2023-03-08-deep-dive-into-platform-level-impact/
|
||||
|
||||
---
|
||||
|
||||
## Is it necessary?
|
||||
|
||||
- When upgrading kubelet from one minor version to another:
|
||||
|
||||
- it *may or may not be fine*
|
||||
|
||||
- in some cases (e.g. migrating from Docker to containerd) it *will not*
|
||||
|
||||
- Here's what [the documentation][node-upgrade-docs] says:
|
||||
|
||||
*Draining nodes before upgrading kubelet ensures that pods are re-admitted and containers are re-created, which may be necessary to resolve some security issues or other important bugs.*
|
||||
|
||||
- Do it at your own risk, and if you do, test extensively in staging environments!
|
||||
|
||||
[node-upgrade-docs]: https://kubernetes.io/docs/tasks/administer-cluster/cluster-upgrade/#manual-deployments
|
||||
|
||||
---
|
||||
|
||||
## Database operators to the rescue
|
||||
|
||||
- Moving stateful pods (e.g.: database server) can cause downtime
|
||||
|
||||
- Database replication can help:
|
||||
|
||||
- if a node contains database servers, we make sure these servers aren't primaries
|
||||
|
||||
- if they are primaries, we execute a *switch over*
|
||||
|
||||
- Some database operators (e.g. [CNPG]) will do that switch over automatically
|
||||
|
||||
(when they detect that a node has been *cordoned*)
|
||||
|
||||
[CNPG]: https://cloudnative-pg.io/
|
||||
|
||||
---
|
||||
|
||||
class: extra-details
|
||||
|
||||
## Skipping versions
|
||||
|
||||
- This example worked because we went from 1.22 to 1.23
|
||||
- This example worked because we went from 1.28 to 1.29
|
||||
|
||||
- If you are upgrading from e.g. 1.21, you will have to go through 1.22 first
|
||||
- If you are upgrading from e.g. 1.26, you will have to go through 1.27 first
|
||||
|
||||
- This means upgrading kubeadm to 1.22.X, then using it to upgrade the cluster
|
||||
- This means upgrading kubeadm to 1.27.X, then using it to upgrade the cluster
|
||||
|
||||
- Then upgrading kubeadm to 1.23.X, etc.
|
||||
- Then upgrading kubeadm to 1.28.X, etc.
|
||||
|
||||
- **Make sure to read the release notes before upgrading!**
|
||||
|
||||
|
||||
@@ -225,4 +225,4 @@ consul agent -data-dir=/consul/data -client=0.0.0.0 -server -ui \
|
||||
:EN:- Scheduling pods together or separately
|
||||
:EN:- Example: deploying a Consul cluster
|
||||
:FR:- Lancer des pods ensemble ou séparément
|
||||
:FR:- Example : lancer un cluster Consul
|
||||
:FR:- Exemple : lancer un cluster Consul
|
||||
|
||||
@@ -24,6 +24,32 @@
|
||||
|
||||
---
|
||||
|
||||
## A bit of history
|
||||
|
||||
Things related to Custom Resource Definitions:
|
||||
|
||||
- Kubernetes 1.7: `apiextensions.k8s.io/v1beta1` introduced
|
||||
|
||||
- Kubernetes 1.16: `apiextensions.k8s.io/v1` introduced
|
||||
|
||||
- Kubernetes 1.22: `apiextensions.k8s.io/v1beta1` [removed][changes-in-122]
|
||||
|
||||
- Kubernetes 1.25: [CEL validation rules available in beta][crd-validation-rules-beta]
|
||||
|
||||
- Kubernetes 1.28: [validation ratcheting][validation-ratcheting] in [alpha][feature-gates]
|
||||
|
||||
- Kubernetes 1.29: [CEL validation rules available in GA][cel-validation-rules]
|
||||
|
||||
- Kubernetes 1.30: [validation ratcheting][validation-ratcheting] in [beta][feature-gates]; enabled by default
|
||||
|
||||
[crd-validation-rules-beta]: https://kubernetes.io/blog/2022/09/23/crd-validation-rules-beta/
|
||||
[cel-validation-rules]: https://kubernetes.io/docs/tasks/extend-kubernetes/custom-resources/custom-resource-definitions/#validation-rules
|
||||
[validation-ratcheting]: https://github.com/kubernetes/enhancements/tree/master/keps/sig-api-machinery/4008-crd-ratcheting
|
||||
[feature-gates]: https://kubernetes.io/docs/reference/command-line-tools-reference/feature-gates/#feature-gates-for-alpha-or-beta-features
|
||||
[changes-in-122]: https://kubernetes.io/blog/2021/07/14/upcoming-changes-in-kubernetes-1-22/
|
||||
|
||||
---
|
||||
|
||||
## First slice of pizza
|
||||
|
||||
```yaml
|
||||
@@ -42,8 +68,6 @@
|
||||
|
||||
(a few optional things become mandatory, see [this guide](https://kubernetes.io/docs/reference/using-api/deprecation-guide/#customresourcedefinition-v122) for details)
|
||||
|
||||
- `apiextensions.k8s.io/v1beta1` is available since Kubernetes 1.16
|
||||
|
||||
---
|
||||
|
||||
## Second slice of pizza
|
||||
@@ -96,9 +120,9 @@ The YAML below defines a resource using the CRD that we just created:
|
||||
kind: Pizza
|
||||
apiVersion: container.training/v1alpha1
|
||||
metadata:
|
||||
name: napolitana
|
||||
name: hawaiian
|
||||
spec:
|
||||
toppings: [ mozzarella ]
|
||||
toppings: [ cheese, ham, pineapple ]
|
||||
```
|
||||
|
||||
.lab[
|
||||
@@ -114,11 +138,7 @@ spec:
|
||||
|
||||
## Type validation
|
||||
|
||||
- Older versions of Kubernetes will accept our pizza definition as is
|
||||
|
||||
- Newer versions, however, will issue warnings about unknown fields
|
||||
|
||||
(and if we use `--validate=false`, these fields will simply be dropped)
|
||||
- Recent versions of Kubernetes will issue errors about unknown fields
|
||||
|
||||
- We need to improve our OpenAPI schema
|
||||
|
||||
@@ -126,6 +146,28 @@ spec:
|
||||
|
||||
---
|
||||
|
||||
## Creating a bland pizza
|
||||
|
||||
- Let's try to create a pizza anyway!
|
||||
|
||||
.lab[
|
||||
|
||||
- Only provide the most basic YAML manifest:
|
||||
```bash
|
||||
kubectl create -f- <<EOF
|
||||
kind: Pizza
|
||||
apiVersion: container.training/v1alpha1
|
||||
metadata:
|
||||
name: hawaiian
|
||||
EOF
|
||||
```
|
||||
|
||||
]
|
||||
|
||||
- That should work! (As long as we don't try to add pineapple😁)
|
||||
|
||||
---
|
||||
|
||||
## Third slice of pizza
|
||||
|
||||
- Let's add a full OpenAPI v3 schema to our Pizza CRD
|
||||
@@ -208,24 +250,42 @@ Note: we can update a CRD without having to re-create the corresponding resource
|
||||
|
||||
---
|
||||
|
||||
## Better data validation
|
||||
## Validation woes
|
||||
|
||||
- Let's change the data schema so that the sauce can only be `red` or `white`
|
||||
|
||||
- This will be implemented by @@LINK[k8s/pizza-5.yaml]
|
||||
- Let's check what happens if we try to update our pizzas
|
||||
|
||||
.lab[
|
||||
|
||||
- Update the Pizza CRD:
|
||||
- Try to add a label:
|
||||
```bash
|
||||
kubectl apply -f ~/container.training/k8s/pizza-5.yaml
|
||||
kubectl label pizza --all deliciousness=9001
|
||||
```
|
||||
|
||||
]
|
||||
|
||||
--
|
||||
|
||||
- It works for the pizzas that have `sauce` and `toppings`, but not the other one!
|
||||
|
||||
- The other one doesn't pass validation, and *can't be modified*
|
||||
|
||||
---
|
||||
|
||||
## Validation *a posteriori*
|
||||
## First, let's fix this!
|
||||
|
||||
- Option 1: delete the pizza
|
||||
|
||||
*(deletion isn't subject to validation)*
|
||||
|
||||
- Option 2: update the pizza to add `sauce` and `toppings`
|
||||
|
||||
*(writing a pizza that passes validation is fine)*
|
||||
|
||||
- Option 3: relax the validation rules
|
||||
|
||||
---
|
||||
|
||||
## Next, explain what's happening
|
||||
|
||||
- Some of the pizzas that we defined earlier *do not* pass validation
|
||||
|
||||
@@ -281,6 +341,8 @@ Note: we can update a CRD without having to re-create the corresponding resource
|
||||
|
||||
---
|
||||
|
||||
class: extra-details
|
||||
|
||||
## Migrating database content
|
||||
|
||||
- We need to *serve* a version as long as we *store* objects in that version
|
||||
@@ -295,6 +357,58 @@ Note: we can update a CRD without having to re-create the corresponding resource
|
||||
|
||||
---
|
||||
|
||||
## Validation ratcheting
|
||||
|
||||
- Good news: it's not always necessary to introduce new versions
|
||||
|
||||
(and to write the associated conversion webhooks)
|
||||
|
||||
- *Validation ratcheting allows updates to custom resources that fail validation to succeed if the validation errors were on unchanged keypaths*
|
||||
|
||||
- In other words: allow changes that don't introduce further validation errors
|
||||
|
||||
- This was introduced in Kubernetes 1.28 (alpha), enabled by default in 1.30 (beta)
|
||||
|
||||
- The rules are actually a bit more complex
|
||||
|
||||
- Another (maybe more accurate) explanation: allow to tighten or loosen some field definitions
|
||||
|
||||
---
|
||||
|
||||
## Validation ratcheting example
|
||||
|
||||
- Let's change the data schema so that the sauce can only be `red` or `white`
|
||||
|
||||
- This will be implemented by @@LINK[k8s/pizza-5.yaml]
|
||||
|
||||
.lab[
|
||||
|
||||
- Update the Pizza CRD:
|
||||
```bash
|
||||
kubectl apply -f ~/container.training/k8s/pizza-5.yaml
|
||||
```
|
||||
|
||||
]
|
||||
|
||||
---
|
||||
|
||||
## Testing validation ratcheting
|
||||
|
||||
- This should work with Kubernetes 1.30 and above
|
||||
|
||||
(but give an error for the `brownie` pizza with previous versions of K8S)
|
||||
|
||||
.lab[
|
||||
|
||||
- Add another label:
|
||||
```bash
|
||||
kubectl label pizzas --all food=definitely
|
||||
```
|
||||
|
||||
]
|
||||
|
||||
---
|
||||
|
||||
## Even better data validation
|
||||
|
||||
- If we need more complex data validation, we can use a validating webhook
|
||||
|
||||
513
slides/k8s/disruptions.md
Normal file
513
slides/k8s/disruptions.md
Normal file
@@ -0,0 +1,513 @@
|
||||
# Disruptions
|
||||
|
||||
In a perfect world...
|
||||
|
||||
- hardware never fails
|
||||
|
||||
- software never has bugs
|
||||
|
||||
- ...and never needs to be updated
|
||||
|
||||
- ...and uses a predictable amount of resources
|
||||
|
||||
- ...and these resources are infinite anyways
|
||||
|
||||
- network latency and packet loss are zero
|
||||
|
||||
- humans never make mistakes
|
||||
|
||||
--
|
||||
|
||||
😬
|
||||
|
||||
---
|
||||
|
||||
## Disruptions
|
||||
|
||||
In the real world...
|
||||
|
||||
- hardware will fail randomly (without advance notice)
|
||||
|
||||
- software has bugs
|
||||
|
||||
- ...and we constantly add new features
|
||||
|
||||
- ...and will sometimes use more resources than expected
|
||||
|
||||
- ...and these resources are limited
|
||||
|
||||
- network latency and packet loss are NOT zero
|
||||
|
||||
- humans make mistakes (shutting down the wrong machine, the wrong app...)
|
||||
|
||||
---
|
||||
|
||||
## Disruptions
|
||||
|
||||
- In Kubernetes, a "disruption" is something that stops the execution of a Pod
|
||||
|
||||
- There are **voluntary** and **involuntary** disruptions
|
||||
|
||||
- voluntary = directly initiated by humans (including by mistake!)
|
||||
|
||||
- involuntary = everything else
|
||||
|
||||
- In this section, we're going to see what they are and how to prevent them
|
||||
|
||||
(or at least, mitigate their effects)
|
||||
|
||||
---
|
||||
|
||||
## Node outage
|
||||
|
||||
- Example: hardware failure (server or network), low-level error
|
||||
|
||||
(includes kernel bugs, issues affecting underlying hypervisors or infrastructure...)
|
||||
|
||||
- **Involuntary** disruption (even if it results from human error!)
|
||||
|
||||
- Consequence: all workloads on that node become unresponsive
|
||||
|
||||
- Mitigations:
|
||||
|
||||
- scale workloads to at least 2 replicas (or more if quorum is needed)
|
||||
|
||||
- add anti-affinity scheduling constraints (to avoid having all pods on the same node)
|
||||
|
||||
---
|
||||
|
||||
## Node outage play-by-play
|
||||
|
||||
- Node goes down (or disconnected from network)
|
||||
|
||||
- Its lease (in Namespace `kube-node-lease`) doesn't get renewed
|
||||
|
||||
- Controller manager detects that and marks the node as "unreachable"
|
||||
|
||||
  (this adds both `NoSchedule` and `NoExecute` taints to the node)
|
||||
|
||||
- Eventually, the `NoExecute` taint will evict these pods
|
||||
|
||||
- This will trigger creation of replacement pods by owner controllers
|
||||
|
||||
(except for pods with a stable network identity, e.g. in a Stateful Set!)
|
||||
|
||||
---
|
||||
|
||||
## Node outage notes
|
||||
|
||||
- By default, pods will tolerate the `unreachable:NoExecute` taint for 5 minutes
|
||||
|
||||
(toleration automatically added by Admission controller `DefaultTolerationSeconds`)
|
||||
|
||||
- Pods of a Stateful Set don't recover automatically:
|
||||
|
||||
- as long as the Pod exists, a replacement Pod can't be created
|
||||
|
||||
- the Pod will exist as long as its Node exists
|
||||
|
||||
- deleting the Node (manually or automatically) will recover the Pod
|
||||
|
||||
---
|
||||
|
||||
## Memory/disk pressure
|
||||
|
||||
- Example: available memory on a node goes below a specific threshold
|
||||
|
||||
(because a pod is using too much memory and no limit was set)
|
||||
|
||||
- **Involuntary** disruption
|
||||
|
||||
- Consequence: kubelet starts to *evict* some pods
|
||||
|
||||
- Mitigations:
|
||||
|
||||
- set *resource limits* on containers to prevent them from using too much resources
|
||||
|
||||
- set *resource requests* on containers to make sure they don't get evicted
|
||||
<br/>
|
||||
(as long as they use less than what they requested)
|
||||
|
||||
- make sure that apps don't use more resources than what they've requested
|
||||
|
||||
---
|
||||
|
||||
## Memory/disk pressure play-by-play
|
||||
|
||||
- Memory leak in an application container, slowly causing very high memory usage
|
||||
|
||||
- Overall free memory on the node goes below the *soft* or the *hard* threshold
|
||||
|
||||
(default hard threshold = 100Mi; default soft threshold = none)
|
||||
|
||||
- When reaching the *soft* threshold:
|
||||
|
||||
- kubelet waits until the "eviction soft grace period" expires
|
||||
|
||||
- then (if resource usage is still above the threshold) it gracefully evicts pods
|
||||
|
||||
- When reaching the *hard* threshold:
|
||||
|
||||
- kubelet immediately and forcefully evicts pods
|
||||
|
||||
---
|
||||
|
||||
## Which pods are evicted?
|
||||
|
||||
- Kubelet only considers pods that are using *more* than what they requested
|
||||
|
||||
(and only for the resource that is under pressure, e.g. RAM or disk usage)
|
||||
|
||||
- First, it sorts pods by *priority¹* (as set with the `priorityClassName` in the pod spec)
|
||||
|
||||
- Then, by how much their resource usage exceeds their request
|
||||
|
||||
(again, for the resource that is under pressure)
|
||||
|
||||
- It evicts pods until enough resources have been freed up
|
||||
|
||||
---
|
||||
|
||||
## Soft (graceful) vs hard (forceful) eviction
|
||||
|
||||
- Soft eviction = graceful shutdown of the pod
|
||||
|
||||
  (honors the pod `terminationGracePeriodSeconds` timeout)
|
||||
|
||||
- Hard eviction = immediate shutdown of the pod
|
||||
|
||||
(kills all containers immediately)
|
||||
|
||||
---
|
||||
|
||||
## Memory/disk pressure notes
|
||||
|
||||
- If resource usage increases *very fast*, kubelet might not catch it fast enough
|
||||
|
||||
- For memory: this will trigger the kernel out-of-memory killer
|
||||
|
||||
- containers killed by OOM are automatically restarted (no eviction)
|
||||
|
||||
- eviction might happen at a later point though (if memory usage stays high)
|
||||
|
||||
- For disk: there is no "out-of-disk" killer, but writes will fail
|
||||
|
||||
- the `write` system call fails with `errno = ENOSPC` / `No space left on device`
|
||||
|
||||
- eviction typically happens shortly after (when kubelet catches up)
|
||||
|
||||
- When relying on disk/memory bursts a lot, using `priorityClasses` might help
|
||||
|
||||
---
|
||||
|
||||
## Memory/disk pressure delays
|
||||
|
||||
- By default, no soft threshold is defined
|
||||
|
||||
- Defining it requires setting both the threshold and the grace period
|
||||
|
||||
- Grace periods can be different for the different types of resources
|
||||
|
||||
- When a node is under pressure, kubelet places a `NoSchedule` taint
|
||||
|
||||
  (to avoid adding more pods while the node is under pressure)
|
||||
|
||||
- Once the node is no longer under pressure, kubelet clears the taint
|
||||
|
||||
(after waiting an extra timeout, `evictionPressureTransitionPeriod`, 5 min by default)
|
||||
|
||||
---
|
||||
|
||||
## Accidental deletion
|
||||
|
||||
- Example: developer deletes the wrong Deployment, the wrong Namespace...
|
||||
|
||||
- **Voluntary** disruption
|
||||
|
||||
(from Kubernetes' perspective!)
|
||||
|
||||
- Consequence: application is down
|
||||
|
||||
- Mitigations:
|
||||
|
||||
- only deploy to production systems through e.g. gitops workflows
|
||||
|
||||
- enforce peer review of changes
|
||||
|
||||
- only give users limited (e.g. read-only) access to production systems
|
||||
|
||||
- use canary deployments (might not catch all mistakes though!)
|
||||
|
||||
---
|
||||
|
||||
## Bad code deployment
|
||||
|
||||
- Example: critical bug introduced, application crashes immediately or is non-functional
|
||||
|
||||
- **Voluntary** disruption
|
||||
|
||||
(again, from Kubernetes' perspective!)
|
||||
|
||||
- Consequence: application is down
|
||||
|
||||
- Mitigations:
|
||||
|
||||
- readiness probes can mitigate immediate crashes
|
||||
<br/>
|
||||
(rolling update continues only when enough pods are ready)
|
||||
|
||||
- delayed crashes will require a rollback
|
||||
<br/>
|
||||
(manual intervention, or automated by a canary system)
|
||||
|
||||
---
|
||||
|
||||
## Node shutdown
|
||||
|
||||
- Example: scaling down a cluster to save money
|
||||
|
||||
- **Voluntary** disruption
|
||||
|
||||
- Consequence:
|
||||
|
||||
- all workloads running on that node are terminated
|
||||
|
||||
- this might disrupt workloads that have too many replicas on that node
|
||||
|
||||
- or workloads that should not be interrupted at all
|
||||
|
||||
- Mitigations:
|
||||
|
||||
- terminate workloads one at a time, coordinating with users
|
||||
|
||||
--
|
||||
|
||||
🤔
|
||||
|
||||
---
|
||||
|
||||
## Node shutdown
|
||||
|
||||
- Example: scaling down a cluster to save money
|
||||
|
||||
- **Voluntary** disruption
|
||||
|
||||
- Consequence:
|
||||
|
||||
- all workloads running on that node are terminated
|
||||
|
||||
- this might disrupt workloads that have too many replicas on that node
|
||||
|
||||
- or workloads that should not be interrupted at all
|
||||
|
||||
- Mitigations:
|
||||
|
||||
- ~~terminate workloads one at a time, coordinating with users~~
|
||||
|
||||
- use Pod Disruption Budgets
|
||||
|
||||
---
|
||||
|
||||
## Pod Disruption Budgets
|
||||
|
||||
- A PDB is a kind of *contract* between:
|
||||
|
||||
- "admins" = folks maintaining the cluster (e.g. adding/removing/updating nodes)
|
||||
|
||||
- "users" = folks deploying apps and workloads on the cluster
|
||||
|
||||
- A PDB expresses something like:
|
||||
|
||||
*in that particular set of pods, do not "disrupt" more than X at a time*
|
||||
|
||||
- Examples:
|
||||
|
||||
- in that set of frontend pods, do not disrupt more than 1 at a time
|
||||
|
||||
- in that set of worker pods, always have at least 10 ready
|
||||
<br/>
|
||||
(do not disrupt them if it would bring down the number of ready pods below 10)
|
||||
|
||||
---
|
||||
|
||||
## PDB - user side
|
||||
|
||||
- Cluster users create a PDB with a manifest like this one:
|
||||
|
||||
```yaml
|
||||
@@INCLUDE[k8s/pod-disruption-budget.yaml]
|
||||
```
|
||||
|
||||
- The PDB must indicate either `minAvailable` or `maxUnavailable`
|
||||
|
||||
---
|
||||
|
||||
## Rounding logic
|
||||
|
||||
- Percentages are rounded **up**
|
||||
|
||||
- When specifying `maxUnavailable` as a percentage, this can result in a higher percentage
|
||||
|
||||
(e.g. `maxUnavailable: 50%` with 3 pods can result in 2 pods being unavailable!)
|
||||
|
||||
---
|
||||
|
||||
## Unmanaged pods
|
||||
|
||||
- Specifying `minAvailable: X` works all the time
|
||||
|
||||
- Specifying `minAvailable: X%` or `maxUnavailable` requires *managed pods*
|
||||
|
||||
(pods that belong to a controller, e.g. Replica Set, Stateful Set...)
|
||||
|
||||
- This is because the PDB controller needs to know the total number of pods
|
||||
|
||||
(given by the `replicas` field, not merely by counting pod objects)
|
||||
|
||||
- The PDB controller will try to resolve the controller using the pod selector
|
||||
|
||||
- If that fails, the PDB controller will emit warning events
|
||||
|
||||
(visible with `kubectl describe pdb ...`)
|
||||
|
||||
---
|
||||
|
||||
## Zero
|
||||
|
||||
- `maxUnavailable: 0` means "do not disrupt my pods"
|
||||
|
||||
- Same thing if `minAvailable` is greater than or equal to the number of pods
|
||||
|
||||
- In that case, cluster admins are supposed to get in touch with cluster users
|
||||
|
||||
- This will prevent fully automated operation
|
||||
|
||||
  (and some cluster admins' automated systems might not honor that request)
|
||||
|
||||
---
|
||||
|
||||
## PDB - admin side
|
||||
|
||||
- As a cluster admin, we need to follow certain rules
|
||||
|
||||
- Only shut down (or restart) a node when no pods are running on that node
|
||||
|
||||
(except system pods belonging to Daemon Sets)
|
||||
|
||||
- To remove pods running on a node, we should use the *eviction API*
|
||||
|
||||
(which will check PDB constraints and honor them)
|
||||
|
||||
- To prevent new pods from being scheduled on a node, we can use a *taint*
|
||||
|
||||
- These operations are streamlined by `kubectl drain`, which will:
|
||||
|
||||
- *cordon* the node (add a `NoSchedule` taint)
|
||||
|
||||
- invoke the *eviction API* to remove pods while respecting their PDBs
|
||||
|
||||
---
|
||||
|
||||
## Theory vs practice
|
||||
|
||||
- `kubectl drain` won't evict pods using `emptyDir` volumes
|
||||
|
||||
(unless the `--delete-emptydir-data` flag is passed as well)
|
||||
|
||||
- Make sure that `emptyDir` volumes don't hold anything important
|
||||
|
||||
(they shouldn't, but... who knows!)
|
||||
|
||||
- Kubernetes lacks a standard way for users to express:
|
||||
|
||||
*this `emptyDir` volume can/cannot be safely deleted*
|
||||
|
||||
- If a PDB forbids an eviction, this requires manual coordination
|
||||
|
||||
---
|
||||
|
||||
class: extra-details
|
||||
|
||||
## Unhealthy pod eviction policy
|
||||
|
||||
- By default, unhealthy pods can only be evicted if PDB allows it
|
||||
|
||||
(unhealthy = running, but not ready)
|
||||
|
||||
- In many cases, unhealthy pods aren't healthy anyway, and can be removed
|
||||
|
||||
- This behavior is enabled by setting the appropriate field in the PDB manifest:
|
||||
|
||||
```yaml
|
||||
spec:
|
||||
unhealthyPodEvictionPolicy: AlwaysAllow
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Node upgrade
|
||||
|
||||
- Example: upgrading kubelet or the Linux kernel on a node
|
||||
|
||||
- **Voluntary** disruption
|
||||
|
||||
- Consequence:
|
||||
|
||||
- all workloads running on that node are temporarily interrupted, and restarted
|
||||
|
||||
- this might disrupt these workloads
|
||||
|
||||
- Mitigations:
|
||||
|
||||
  - migrate workloads off the node first (as if we were shutting it down)
|
||||
|
||||
---
|
||||
|
||||
## Node upgrade notes
|
||||
|
||||
- Is it necessary to drain a node before doing an upgrade?
|
||||
|
||||
- From [the documentation][node-upgrade-docs]:
|
||||
|
||||
*Draining nodes before upgrading kubelet ensures that pods are re-admitted and containers are re-created, which may be necessary to resolve some security issues or other important bugs.*
|
||||
|
||||
- It's *probably* safe to upgrade in-place for:
|
||||
|
||||
- kernel upgrades
|
||||
|
||||
- kubelet patch-level upgrades (1.X.Y → 1.X.Z)
|
||||
|
||||
- It's *probably* better to drain the node for minor revisions kubelet upgrades (1.X → 1.Y)
|
||||
|
||||
- In doubt, test extensively in staging environments!
|
||||
|
||||
[node-upgrade-docs]: https://kubernetes.io/docs/tasks/administer-cluster/cluster-upgrade/#manual-deployments
|
||||
|
||||
---
|
||||
|
||||
## Manual rescheduling
|
||||
|
||||
- Example: moving workloads around to accommodate noisy neighbors or other issues
|
||||
|
||||
(e.g. pod X is doing a lot of disk I/O and this is starving other pods)
|
||||
|
||||
- **Voluntary** disruption
|
||||
|
||||
- Consequence:
|
||||
|
||||
- the moved workloads are temporarily interrupted
|
||||
|
||||
- Mitigations:
|
||||
|
||||
- define an appropriate number of replicas, declare PDBs
|
||||
|
||||
- use the [eviction API][eviction-API] to move workloads
|
||||
|
||||
[eviction-API]: https://kubernetes.io/docs/concepts/scheduling-eviction/api-eviction/
|
||||
|
||||
???
|
||||
|
||||
:EN:- Voluntary and involuntary disruptions
|
||||
:EN:- Pod Disruption Budgets
|
||||
:FR:- "Disruptions" volontaires et involontaires
|
||||
:FR:- Pod Disruption Budgets
|
||||
@@ -368,6 +368,30 @@ class: extra-details
|
||||
|
||||
[ciliumwithoutkubeproxy]: https://docs.cilium.io/en/stable/network/kubernetes/kubeproxy-free/#kubeproxy-free
|
||||
|
||||
---
|
||||
|
||||
class: extra-details
|
||||
|
||||
## About the API server certificate...
|
||||
|
||||
- In the previous sections, we've skipped API server certificate verification
|
||||
|
||||
- To generate a proper certificate, we need to include a `subjectAltName` extension
|
||||
|
||||
- And make sure that the CA includes the extension in the certificate
|
||||
|
||||
```bash
|
||||
openssl genrsa -out apiserver.key 4096
|
||||
|
||||
openssl req -new -key apiserver.key -subj /CN=kubernetes/ \
|
||||
-addext "subjectAltName = DNS:kubernetes.default.svc, \
|
||||
DNS:kubernetes.default, DNS:kubernetes, \
|
||||
DNS:localhost, DNS:polykube1" -out apiserver.csr
|
||||
|
||||
openssl x509 -req -in apiserver.csr -CAkey ca.key -CA ca.cert \
|
||||
-out apiserver.crt -copy_extensions copy
|
||||
```
|
||||
|
||||
???
|
||||
|
||||
:EN:- Connecting nodes and pods
|
||||
|
||||
@@ -462,7 +462,7 @@ The "context" section references the "cluster" and "credentials" that we defined
|
||||
|
||||
---
|
||||
|
||||
## Review the kubeconfig filfe
|
||||
## Review the kubeconfig file
|
||||
|
||||
The kubeconfig file should look like this:
|
||||
|
||||
|
||||
508
slides/k8s/flux.md
Normal file
508
slides/k8s/flux.md
Normal file
@@ -0,0 +1,508 @@
|
||||
# FluxCD
|
||||
|
||||
- We're going to implement a basic GitOps workflow with Flux
|
||||
|
||||
- Pushing to `main` will automatically deploy to the clusters
|
||||
|
||||
- There will be two clusters (`dev` and `prod`)
|
||||
|
||||
- The two clusters will have similar (but slightly different) workloads
|
||||
|
||||
---
|
||||
|
||||
## Repository structure
|
||||
|
||||
This is (approximately) what we're going to do:
|
||||
|
||||
```
|
||||
@@INCLUDE[slides/k8s/gitopstree.txt]
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Getting ready
|
||||
|
||||
- Let's make sure we have two clusters
|
||||
|
||||
- It's OK to use local clusters (kind, minikube...)
|
||||
|
||||
- We might run into resource limits, though
|
||||
|
||||
(pay attention to `Pending` pods!)
|
||||
|
||||
- We need to install the Flux CLI ([packages], [binaries])
|
||||
|
||||
- **Highly recommended:** set up CLI completion!
|
||||
|
||||
- Of course we'll need a Git service, too
|
||||
|
||||
(we're going to use GitHub here)
|
||||
|
||||
[packages]: https://fluxcd.io/flux/get-started/
|
||||
[binaries]: https://github.com/fluxcd/flux2/releases
|
||||
|
||||
---
|
||||
|
||||
## GitHub setup
|
||||
|
||||
- Generate a GitHub token:
|
||||
|
||||
https://github.com/settings/tokens/new
|
||||
|
||||
- Give it "repo" access
|
||||
|
||||
- This token will be used by the `flux bootstrap github` command later
|
||||
|
||||
- It will create a repository and configure it (SSH key...)
|
||||
|
||||
- The token can be revoked afterwards
|
||||
|
||||
---
|
||||
|
||||
## Flux bootstrap
|
||||
|
||||
.lab[
|
||||
|
||||
- Let's set a few variables for convenience, and create our repository:
|
||||
```bash
|
||||
export GITHUB_TOKEN=...
|
||||
export GITHUB_USER=changeme
|
||||
export GITHUB_REPO=alsochangeme
|
||||
export FLUX_CLUSTER=dev
|
||||
|
||||
flux bootstrap github \
|
||||
--owner=$GITHUB_USER \
|
||||
--repository=$GITHUB_REPO \
|
||||
--branch=main \
|
||||
--path=./clusters/$FLUX_CLUSTER \
|
||||
--personal --private=false
|
||||
```
|
||||
|
||||
]
|
||||
|
||||
Problems? check next slide!
|
||||
|
||||
---
|
||||
|
||||
## What could go wrong?
|
||||
|
||||
- `flux bootstrap` will create or update the repository on GitHub
|
||||
|
||||
- Then it will install Flux controllers to our cluster
|
||||
|
||||
- Then it waits for these controllers to be up and running and ready
|
||||
|
||||
- Check pod status in `flux-system`
|
||||
|
||||
- If pods are `Pending`, check that you have enough resources on your cluster
|
||||
|
||||
- For testing purposes, it should be fine to lower or remove Flux `requests`!
|
||||
|
||||
(but don't do that in production!)
|
||||
|
||||
- If anything goes wrong, don't worry, we can just re-run the bootstrap
|
||||
|
||||
---
|
||||
|
||||
class: extra-details
|
||||
|
||||
## Idempotence
|
||||
|
||||
- It's OK to run that same `flux bootstrap` command multiple times!
|
||||
|
||||
- If the repository already exists, it will re-use it
|
||||
|
||||
(it won't destroy or empty it)
|
||||
|
||||
- If the path `./clusters/$FLUX_CLUSTER` already exists, it will update it
|
||||
|
||||
- It's totally fine to re-run `flux bootstrap` if something fails
|
||||
|
||||
- It's totally fine to run it multiple times on different clusters
|
||||
|
||||
- Or even to run it multiple times for the *same* cluster
|
||||
|
||||
(to reinstall Flux on that cluster after a cluster wipe / reinstall)
|
||||
|
||||
---
|
||||
|
||||
## What do we get?
|
||||
|
||||
- Let's look at what `flux bootstrap` installed on the cluster
|
||||
|
||||
.lab[
|
||||
|
||||
- Look inside the `flux-system` namespace:
|
||||
```bash
|
||||
kubectl get all --namespace flux-system
|
||||
```
|
||||
|
||||
- Look at `kustomizations` custom resources:
|
||||
```bash
|
||||
kubectl get kustomizations --all-namespaces
|
||||
```
|
||||
|
||||
- See what the `flux` CLI tells us:
|
||||
```bash
|
||||
flux get all
|
||||
```
|
||||
|
||||
]
|
||||
|
||||
---
|
||||
|
||||
## Deploying with GitOps
|
||||
|
||||
- We'll need to add/edit files on the repository
|
||||
|
||||
- We can do it by using `git clone`, local edits, `git commit`, `git push`
|
||||
|
||||
- Or by editing online on the GitHub website
|
||||
|
||||
.lab[
|
||||
|
||||
- Create a manifest; for instance `clusters/dev/flux-system/blue.yaml`
|
||||
|
||||
- Add that manifest to `clusters/dev/kustomization.yaml`
|
||||
|
||||
- Commit and push both changes to the repository
|
||||
|
||||
]
|
||||
|
||||
---
|
||||
|
||||
## Waiting for reconciliation
|
||||
|
||||
- Compare the git hash that we pushed and the one shown with `kubectl get gitrepositories`
|
||||
|
||||
- Option 1: wait for Flux to pick up the changes in the repository
|
||||
|
||||
(the default interval for git repositories is 1 minute, so that's fast)
|
||||
|
||||
- Option 2: use `flux reconcile source git flux-system`
|
||||
|
||||
(this puts an annotation on the appropriate resource, triggering an immediate check)
|
||||
|
||||
- Option 3: set up receiver webhooks
|
||||
|
||||
(so that git updates trigger immediate reconciliation)
|
||||
|
||||
---
|
||||
|
||||
## Checking progress
|
||||
|
||||
- `flux logs`
|
||||
|
||||
- `kubectl get gitrepositories --all-namespaces`
|
||||
|
||||
- `kubectl get kustomizations --all-namespaces`
|
||||
|
||||
---
|
||||
|
||||
## Did it work?
|
||||
|
||||
--
|
||||
|
||||
- No!
|
||||
|
||||
--
|
||||
|
||||
- Why?
|
||||
|
||||
--
|
||||
|
||||
- We need to indicate the namespace where the app should be deployed
|
||||
|
||||
- Either in the YAML manifests
|
||||
|
||||
- Or in the `kustomization` custom resource
|
||||
|
||||
(using field `spec.targetNamespace`)
|
||||
|
||||
- Add the namespace to the manifest and try again!
|
||||
|
||||
---
|
||||
|
||||
## Adding an app in a reusable way
|
||||
|
||||
- Let's see a technique to add a whole app
|
||||
|
||||
  (with multiple resource manifests)
|
||||
|
||||
- We want to minimize code repetition
|
||||
|
||||
(i.e. easy to add on multiple clusters with minimal changes)
|
||||
|
||||
---
|
||||
|
||||
## The plan
|
||||
|
||||
- Add the app manifests in a directory
|
||||
|
||||
(e.g.: `apps/myappname/manifests`)
|
||||
|
||||
- Create a kustomization manifest for the app and its namespace
|
||||
|
||||
(e.g.: `apps/myappname/flux.yaml`)
|
||||
|
||||
- The kustomization manifest will refer to the app manifest
|
||||
|
||||
- Add the kustomization manifest to the top-level `flux-system` kustomization
|
||||
|
||||
---
|
||||
|
||||
## Creating the manifests
|
||||
|
||||
- All commands below should be executed at the root of the repository
|
||||
|
||||
.lab[
|
||||
|
||||
- Put application manifests in their directory:
|
||||
```bash
|
||||
mkdir -p apps/dockercoins
|
||||
cp ~/container.training/k8s/dockercoins.yaml apps/dockercoins/
|
||||
```
|
||||
|
||||
- Create kustomization manifest:
|
||||
```bash
|
||||
flux create kustomization dockercoins \
|
||||
--source=GitRepository/flux-system \
|
||||
--path=./apps/dockercoins/manifests/ \
|
||||
--target-namespace=dockercoins \
|
||||
--prune=true --export > apps/dockercoins/flux.yaml
|
||||
```
|
||||
|
||||
]
|
||||
|
||||
---
|
||||
|
||||
## Creating the target namespace
|
||||
|
||||
- When deploying *helm releases*, it is possible to automatically create the namespace
|
||||
|
||||
- When deploying *kustomizations*, we need to create it explicitly
|
||||
|
||||
- Let's put the namespace with the kustomization manifest
|
||||
|
||||
(so that the whole app can be mediated through a single manifest)
|
||||
|
||||
.lab[
|
||||
|
||||
- Add the target namespace to the kustomization manifest:
|
||||
```bash
|
||||
echo "---
|
||||
kind: Namespace
|
||||
apiVersion: v1
|
||||
metadata:
|
||||
name: dockercoins" >> apps/dockercoins/flux.yaml
|
||||
```
|
||||
|
||||
]
|
||||
|
||||
---
|
||||
|
||||
## Linking the kustomization manifest
|
||||
|
||||
- Edit `clusters/dev/flux-system/kustomization.yaml`
|
||||
|
||||
- Add a line to reference the kustomization manifest that we created:
|
||||
```yaml
|
||||
- ../../../apps/dockercoins/flux.yaml
|
||||
```
|
||||
|
||||
- `git add` our manifests, `git commit`, `git push`
|
||||
|
||||
(check with `git status` that we haven't forgotten anything!)
|
||||
|
||||
- `flux reconcile` or wait for the changes to be picked up
|
||||
|
||||
---
|
||||
|
||||
## Installing with Helm
|
||||
|
||||
- We're going to see two different workflows:
|
||||
|
||||
- installing a third-party chart
|
||||
<br/>
|
||||
(e.g. something we found on the Artifact Hub)
|
||||
|
||||
- installing one of our own charts
|
||||
<br/>
|
||||
(e.g. a chart we authored ourselves)
|
||||
|
||||
- The procedures are very similar
|
||||
|
||||
---
|
||||
|
||||
## Installing from a public Helm repository
|
||||
|
||||
- Let's install [kube-prometheus-stack][kps]
|
||||
|
||||
.lab[
|
||||
|
||||
- Create the Flux manifests:
|
||||
```bash
|
||||
mkdir -p apps/kube-prometheus-stack
|
||||
flux create source helm kube-prometheus-stack \
|
||||
--url=https://prometheus-community.github.io/helm-charts \
|
||||
--export >> apps/kube-prometheus-stack/flux.yaml
|
||||
flux create helmrelease kube-prometheus-stack \
|
||||
--source=HelmRepository/kube-prometheus-stack \
|
||||
--chart=kube-prometheus-stack --release-name=kube-prometheus-stack \
|
||||
--target-namespace=kube-prometheus-stack --create-target-namespace \
|
||||
--export >> apps/kube-prometheus-stack/flux.yaml
|
||||
```
|
||||
|
||||
]
|
||||
|
||||
[kps]: https://artifacthub.io/packages/helm/prometheus-community/kube-prometheus-stack
|
||||
|
||||
---
|
||||
|
||||
## Enable the app
|
||||
|
||||
- Just like before, link the manifest from the top-level kustomization
|
||||
|
||||
(`flux-system` in namespace `flux-system`)
|
||||
|
||||
- `git add` / `git commit` / `git push`
|
||||
|
||||
- We should now have a Prometheus+Grafana observability stack!
|
||||
|
||||
---
|
||||
|
||||
## Installing from a Helm chart in a git repo
|
||||
|
||||
- In this example, the chart will be in the same repo
|
||||
|
||||
- In the real world, it will typically be in a different repo!
|
||||
|
||||
.lab[
|
||||
|
||||
- Generate a basic Helm chart:
|
||||
```bash
|
||||
mkdir -p charts
|
||||
helm create charts/myapp
|
||||
```
|
||||
|
||||
]
|
||||
|
||||
(This generates a chart which installs NGINX. A lot of things can be customized, though.)
|
||||
|
||||
---
|
||||
|
||||
## Creating the Flux manifests
|
||||
|
||||
- The invocation is very similar to our first example
|
||||
|
||||
.lab[
|
||||
|
||||
- Generate the Flux manifest for the Helm release:
|
||||
```bash
|
||||
mkdir apps/myapp
|
||||
flux create helmrelease myapp \
|
||||
--source=GitRepository/flux-system \
|
||||
--chart=charts/myapp \
|
||||
--target-namespace=myapp --create-target-namespace \
|
||||
--export > apps/myapp/flux.yaml
|
||||
```
|
||||
|
||||
- Add a reference to that manifest to the top-level kustomization
|
||||
|
||||
- `git add` / `git commit` / `git push` the chart, manifest, and kustomization
|
||||
|
||||
]
|
||||
|
||||
---
|
||||
|
||||
## Passing values
|
||||
|
||||
- We can also configure our Helm releases with values
|
||||
|
||||
- Using an existing `myvalues.yaml` file:
|
||||
|
||||
`flux create helmrelease ... --values=myvalues.yaml`
|
||||
|
||||
- Referencing an existing ConfigMap or Secret with a `values.yaml` key:
|
||||
|
||||
`flux create helmrelease ... --values-from=ConfigMap/myapp`
|
||||
|
||||
---
|
||||
|
||||
## Gotchas
|
||||
|
||||
- When creating a HelmRelease using a chart stored in a git repository, you must:
|
||||
|
||||
- either bump the chart version (in `Chart.yaml`) after each change,
|
||||
|
||||
- or set `spec.chart.spec.reconcileStrategy` to `Revision`
|
||||
|
||||
- Why?
|
||||
|
||||
- Flux installs helm releases using packaged artifacts
|
||||
|
||||
- Artifacts are updated only when the Helm chart version changes
|
||||
|
||||
- Unless `reconcileStrategy` is set to `Revision` (instead of the default `ChartVersion`)
|
||||
|
||||
---
|
||||
|
||||
## More gotchas
|
||||
|
||||
- There is a bug in Flux that prevents using identical subcharts with aliases
|
||||
|
||||
- See [fluxcd/flux2#2505][flux2505] for details
|
||||
|
||||
[flux2505]: https://github.com/fluxcd/flux2/discussions/2505
|
||||
|
||||
---
|
||||
|
||||
## Things that we didn't talk about...
|
||||
|
||||
- Bucket sources
|
||||
|
||||
- Image automation controller
|
||||
|
||||
- Image reflector controller
|
||||
|
||||
- And more!
|
||||
|
||||
???
|
||||
|
||||
:EN:- Implementing gitops with Flux
|
||||
:FR:- Workflow gitops avec Flux
|
||||
|
||||
<!--
|
||||
|
||||
helm upgrade --install --repo https://dl.gitea.io/charts --namespace gitea --create-namespace gitea gitea \
|
||||
--set persistence.enabled=false \
|
||||
--set redis-cluster.enabled=false \
|
||||
--set postgresql-ha.enabled=false \
|
||||
--set postgresql.enabled=true \
|
||||
--set gitea.config.session.PROVIDER=db \
|
||||
--set gitea.config.cache.ADAPTER=memory \
|
||||
#
|
||||
|
||||
### Bootstrap Flux controllers
|
||||
|
||||
```bash
|
||||
mkdir -p flux/flux-system
|
||||
flux install --export > flux/flux-system/gotk-components.yaml
|
||||
kubectl apply -f flux/flux-system/gotk-components.yaml
|
||||
```
|
||||
|
||||
### Bootstrap GitRepository/Kustomization
|
||||
|
||||
```bash
|
||||
export REPO_URL="<gitlab_url>" DEPLOY_USERNAME="<username>"
|
||||
read -s DEPLOY_TOKEN
|
||||
flux create secret git flux-system --url="${REPO_URL}" --username="${DEPLOY_USERNAME}" --password="${DEPLOY_TOKEN}"
|
||||
flux create source git flux-system --url=$REPO_URL --branch=main --secret-ref flux-system --ignore-paths='/*,!/flux' --export > flux/flux-system/gotk-sync.yaml
|
||||
flux create kustomization flux-system --source=GitRepository/flux-system --path="./flux" --prune=true --export >> flux/flux-system/gotk-sync.yaml
|
||||
|
||||
git add flux/ && git commit -m 'feat: Setup Flux' flux/ && git push
|
||||
kubectl apply -f flux/flux-system/gotk-sync.yaml
|
||||
```
|
||||
|
||||
-->
|
||||
|
||||
13
slides/k8s/gitopstree.txt
Normal file
13
slides/k8s/gitopstree.txt
Normal file
@@ -0,0 +1,13 @@
|
||||
├── charts/ <--- could also be in separate app repos
|
||||
│ ├── dockercoins/
|
||||
│ └── color/
|
||||
├── apps/ <--- YAML manifests for GitOps resources
|
||||
│ ├── dockercoins/ (might reference the "charts" above,
|
||||
│ ├── blue/ and/or include environment-specific
|
||||
│ ├── green/ manifests to create e.g. namespaces,
|
||||
│ ├── kube-prometheus-stack/ configmaps, secrets...)
|
||||
│ ├── cert-manager/
|
||||
│ └── traefik/
|
||||
└── clusters/ <--- per-cluster; will typically reference
|
||||
├── prod/ the "apps" above, possibly extending
|
||||
└── dev/ or adding configuration resources too
|
||||
@@ -1,4 +1,4 @@
|
||||
# Git-based workflows
|
||||
# Git-based workflows (GitOps)
|
||||
|
||||
- Deploying with `kubectl` has downsides:
|
||||
|
||||
@@ -22,7 +22,7 @@
|
||||
|
||||
- These resources have a perfect YAML representation
|
||||
|
||||
- All we do is manipulating these YAML representations
|
||||
- All we do is manipulate these YAML representations
|
||||
|
||||
(`kubectl run` generates a YAML file that gets applied)
|
||||
|
||||
@@ -34,229 +34,232 @@
|
||||
|
||||
- control who can push to which branches
|
||||
|
||||
- have formal review processes, pull requests ...
|
||||
- have formal review processes, pull requests, test gates...
|
||||
|
||||
---
|
||||
|
||||
## Enabling git-based workflows
|
||||
|
||||
- There are a few tools out there to help us do that
|
||||
- There are many tools out there to help us do that, with different approaches
|
||||
|
||||
- We'll see demos of two of them: [Flux] and [Gitkube]
|
||||
- "Git host centric" approach: GitHub Actions, GitLab...
|
||||
|
||||
- There are *many* other tools, some of them with even more features
|
||||
*the workflows/action are directly initiated by the git platform*
|
||||
|
||||
- There are also *many* integrations with popular CI/CD systems
|
||||
- "Kubernetes cluster centric" approach: [ArgoCD], [FluxCD]...
|
||||
|
||||
(e.g.: GitLab, Jenkins, ...)
|
||||
*controllers run on our clusters and trigger on repo updates*
|
||||
|
||||
[Flux]: https://www.weave.works/oss/flux/
|
||||
[Gitkube]: https://gitkube.sh/
|
||||
- This is not an exhaustive list (see also: Jenkins)
|
||||
|
||||
- We're going to talk mostly about "Kubernetes cluster centric" approaches here
|
||||
|
||||
[ArgoCD]: https://argoproj.github.io/cd/
|
||||
[Flux]: https://fluxcd.io/
|
||||
|
||||
---
|
||||
|
||||
## Flux overview
|
||||
## The road to production
|
||||
|
||||
- We put our Kubernetes resources as YAML files in a git repository
|
||||
In no specific order, we need to at least:
|
||||
|
||||
- Flux polls that repository regularly (every 5 minutes by default)
|
||||
- Choose a tool
|
||||
|
||||
- The resources described by the YAML files are created/updated automatically
|
||||
- Choose a cluster / app / namespace layout
|
||||
<br/>
|
||||
(one cluster per app, different clusters for prod/staging...)
|
||||
|
||||
- Changes are made by updating the code in the repository
|
||||
- Choose a repository layout
|
||||
<br/>
|
||||
(different repositories, directories, branches per app, env, cluster...)
|
||||
|
||||
- Choose an installation / bootstrap method
|
||||
|
||||
- Choose how new apps / environments / versions will be deployed
|
||||
|
||||
- Choose how new images will be built
|
||||
|
||||
---
|
||||
|
||||
## Preparing a repository for Flux
|
||||
## Flux vs ArgoCD (1/2)
|
||||
|
||||
- We need a repository with Kubernetes YAML files
|
||||
- Flux:
|
||||
|
||||
- I have one: https://github.com/jpetazzo/kubercoins
|
||||
- fancy setup with an (optional) dedicated `flux bootstrap` command
|
||||
<br/>
|
||||
(with support for specific git providers, repo creation...)
|
||||
|
||||
- Fork it to your GitHub account
|
||||
- deploying an app requires multiple CRDs
|
||||
<br/>
|
||||
(Kustomization, HelmRelease, GitRepository...)
|
||||
|
||||
- Create a new branch in your fork; e.g. `prod`
|
||||
- supports Helm charts, Kustomize, raw YAML
|
||||
|
||||
(e.g. with "branch" dropdown through the GitHub web UI)
|
||||
- ArgoCD:
|
||||
|
||||
- This is the branch that we are going to use for deployment
|
||||
- simple setup (just apply YAMLs / install Helm chart)
|
||||
|
||||
  - fewer CRDs (basic workflow can be implemented with a single "Application" resource)
|
||||
|
||||
- supports Helm charts, Jsonnet, Kustomize, raw YAML, and arbitrary plugins
|
||||
|
||||
---
|
||||
|
||||
## Setting up Flux with kustomize
|
||||
## Flux vs ArgoCD (2/2)
|
||||
|
||||
- Clone the Flux repository:
|
||||
```bash
|
||||
git clone https://github.com/fluxcd/flux
|
||||
cd flux
|
||||
```
|
||||
- Flux:
|
||||
|
||||
- Edit `deploy/flux-deployment.yaml`
|
||||
- sync interval is configurable per app
|
||||
- no web UI out of the box
|
||||
- CLI relies on Kubernetes API access
|
||||
- CLI can easily generate custom resource manifests (with `--export`)
|
||||
- self-hosted (flux controllers are managed by flux itself by default)
|
||||
- one flux instance manages a single cluster
|
||||
|
||||
- Change the `--git-url` and `--git-branch` parameters:
|
||||
```yaml
|
||||
- --git-url=git@github.com:your-git-username/kubercoins
|
||||
- --git-branch=prod
|
||||
```
|
||||
- ArgoCD:
|
||||
|
||||
- Apply all the YAML:
|
||||
```bash
|
||||
kubectl apply -k deploy/
|
||||
```
|
||||
- sync interval is configured globally
|
||||
- comes with a web UI
|
||||
- CLI can use Kubernetes API or separate API and authentication system
|
||||
- one ArgoCD instance can manage multiple clusters
|
||||
|
||||
---
|
||||
|
||||
## Setting up Flux with Helm
|
||||
## Cluster, app, namespace layout
|
||||
|
||||
- Add Flux helm repo:
|
||||
```bash
|
||||
helm repo add fluxcd https://charts.fluxcd.io
|
||||
```
|
||||
- One cluster per app, different namespaces for environments?
|
||||
|
||||
- Install Flux:
|
||||
```bash
|
||||
kubectl create namespace flux
|
||||
helm upgrade --install flux \
|
||||
--set git.url=git@github.com:your-git-username/kubercoins \
|
||||
--set git.branch=prod \
|
||||
--namespace flux \
|
||||
fluxcd/flux
|
||||
```
|
||||
- One cluster per environment, different namespaces for apps?
|
||||
|
||||
- Everything on a single cluster? One cluster per combination?
|
||||
|
||||
- Something in between:
|
||||
|
||||
- prod cluster, database cluster, dev/staging/etc cluster
|
||||
|
||||
- prod+db cluster per app, shared dev/staging/etc cluster
|
||||
|
||||
- And more!
|
||||
|
||||
Note: this decision isn't really tied to GitOps!
|
||||
|
||||
---
|
||||
|
||||
## Allowing Flux to access the repository
|
||||
## Repository layout
|
||||
|
||||
- When it starts, Flux generates an SSH key
|
||||
So many different possibilities!
|
||||
|
||||
- Display that key:
|
||||
```bash
|
||||
kubectl -n flux logs deployment/flux | grep identity.pub | cut -d '"' -f2
|
||||
```
|
||||
- Source repos
|
||||
|
||||
- Then add that key to the repository, giving it **write** access
|
||||
- Cluster/infra repos/branches/directories
|
||||
|
||||
(some Flux features require write access)
|
||||
- "Deployment" repos (with manifests, charts)
|
||||
|
||||
- After a minute or so, DockerCoins will be deployed to the current namespace
|
||||
- Different repos/branches/directories for environments
|
||||
|
||||
🤔 How to decide?
|
||||
|
||||
---
|
||||
|
||||
## Making changes
|
||||
## Permissions
|
||||
|
||||
- Make changes (on the `prod` branch), e.g. change `replicas` in `worker`
|
||||
- Different teams/companies = different repos
|
||||
|
||||
- After a few minutes, the changes will be picked up by Flux and applied
|
||||
- separate platform team → separate "infra" vs "apps" repos
|
||||
|
||||
- teams working on different apps → different repos per app
|
||||
|
||||
- Branches can be "protected" (`production`, `main`...)
|
||||
|
||||
(don't need separate repos for separate environments)
|
||||
|
||||
- Directories will typically have the same permissions
|
||||
|
||||
- Managing directories is easier than branches
|
||||
|
||||
- But branches are more "powerful" (cherrypicking, rebasing...)
|
||||
|
||||
---
|
||||
|
||||
## Other features
|
||||
## Resource hierarchy
|
||||
|
||||
- Flux can keep a list of all the tags of all the images we're running
|
||||
- Git-based deployments are managed by Kubernetes resources
|
||||
|
||||
- The `fluxctl` tool can show us if we're running the latest images
|
||||
(e.g. Kustomization, HelmRelease with Flux; Application with ArgoCD)
|
||||
|
||||
- We can also "automate" a resource (i.e. automatically deploy new images)
|
||||
- We will call these resources "GitOps resources"
|
||||
|
||||
- And much more!
|
||||
- These resources need to be managed like any other Kubernetes resource
|
||||
|
||||
(YAML manifests, Kustomizations, Helm charts)
|
||||
|
||||
- They can be managed with Git workflows too!
|
||||
|
||||
---
|
||||
|
||||
## Gitkube overview
|
||||
## Cluster / infra management
|
||||
|
||||
- We put our Kubernetes resources as YAML files in a git repository
|
||||
- How do we provision clusters?
|
||||
|
||||
- Gitkube is a git server (or "git remote")
|
||||
- Manual "one-shot" provisioning (CLI, web UI...)
|
||||
|
||||
- After making changes to the repository, we push to Gitkube
|
||||
- Automation with Terraform, Ansible...
|
||||
|
||||
- Gitkube applies the resources to the cluster
|
||||
- Kubernetes-driven systems (Crossplane, CAPI)
|
||||
|
||||
- Infrastructure can also be managed with GitOps
|
||||
|
||||
---
|
||||
|
||||
## Setting up Gitkube
|
||||
## Example 1
|
||||
|
||||
- Install the CLI:
|
||||
```bash
|
||||
sudo curl -L -o /usr/local/bin/gitkube \
|
||||
https://github.com/hasura/gitkube/releases/download/v0.2.1/gitkube_linux_amd64
|
||||
sudo chmod +x /usr/local/bin/gitkube
|
||||
```
|
||||
- Managed with YAML/Charts:
|
||||
|
||||
- Install Gitkube on the cluster:
|
||||
```bash
|
||||
gitkube install --expose ClusterIP
|
||||
```
|
||||
- core components (CNI, CSI, Ingress, logging, monitoring...)
|
||||
|
||||
- GitOps controllers
|
||||
|
||||
- critical application foundations (database operator, databases)
|
||||
|
||||
- GitOps manifests
|
||||
|
||||
- Managed with GitOps:
|
||||
|
||||
- applications
|
||||
|
||||
- staging databases
|
||||
|
||||
---
|
||||
|
||||
## Creating a Remote
|
||||
## Example 2
|
||||
|
||||
- Gitkube provides a new type of API resource: *Remote*
|
||||
- Managed with YAML/Charts:
|
||||
|
||||
(this is using a mechanism called Custom Resource Definitions or CRD)
|
||||
- essential components (CNI, CoreDNS)
|
||||
|
||||
- Create and apply a YAML file containing the following manifest:
|
||||
```yaml
|
||||
apiVersion: gitkube.sh/v1alpha1
|
||||
kind: Remote
|
||||
metadata:
|
||||
name: example
|
||||
spec:
|
||||
authorizedKeys:
|
||||
- `ssh-rsa AAA...`
|
||||
manifests:
|
||||
path: "."
|
||||
```
|
||||
- initial installation of GitOps controllers
|
||||
|
||||
(replace the `ssh-rsa AAA...` section with the content of `~/.ssh/id_rsa.pub`)
|
||||
- Managed with GitOps:
|
||||
|
||||
- upgrades of GitOps controllers
|
||||
|
||||
- core components (CSI, Ingress, logging, monitoring...)
|
||||
|
||||
- operators, databases
|
||||
|
||||
- more GitOps manifests for applications!
|
||||
|
||||
---
|
||||
|
||||
## Pushing to our remote
|
||||
## Concrete example
|
||||
|
||||
- Get the `gitkubed` IP address:
|
||||
```bash
|
||||
kubectl -n kube-system get svc gitkubed
|
||||
IP=$(kubectl -n kube-system get svc gitkubed -o json |
|
||||
jq -r .spec.clusterIP)
|
||||
```
|
||||
- Source code repository (not shown here)
|
||||
|
||||
- Get ourselves a sample repository with resource YAML files:
|
||||
```bash
|
||||
git clone git://github.com/jpetazzo/kubercoins
|
||||
cd kubercoins
|
||||
```
|
||||
- Infrastructure repository (shown below), single branch
|
||||
|
||||
- Add the remote and push to it:
|
||||
```bash
|
||||
git remote add k8s ssh://default-example@$IP/~/git/default-example
|
||||
git push k8s master
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Making changes
|
||||
|
||||
- Edit a local file
|
||||
|
||||
- Commit
|
||||
|
||||
- Push!
|
||||
|
||||
- Make sure that you push to the `k8s` remote
|
||||
|
||||
---
|
||||
|
||||
## Other features
|
||||
|
||||
- Gitkube can also build container images for us
|
||||
|
||||
(see the [documentation](https://github.com/hasura/gitkube/blob/master/docs/remote.md) for more details)
|
||||
|
||||
- Gitkube can also deploy Helm charts
|
||||
|
||||
(instead of raw YAML files)
|
||||
```
|
||||
@@INCLUDE[slides/k8s/gitopstree.txt]
|
||||
```
|
||||
|
||||
???
|
||||
|
||||
|
||||
203
slides/k8s/handson-mlops.md
Normal file
203
slides/k8s/handson-mlops.md
Normal file
@@ -0,0 +1,203 @@
|
||||
class: title
|
||||
|
||||
*Tell me and I forget.*
|
||||
<br/>
|
||||
*Teach me and I remember.*
|
||||
<br/>
|
||||
*Involve me and I learn.*
|
||||
|
||||
Misattributed to Benjamin Franklin
|
||||
|
||||
[(Probably inspired by Chinese Confucian philosopher Xunzi)](https://www.barrypopik.com/index.php/new_york_city/entry/tell_me_and_i_forget_teach_me_and_i_may_remember_involve_me_and_i_will_lear/)
|
||||
|
||||
---
|
||||
|
||||
## Hands-on sections
|
||||
|
||||
- There will be *a lot* of examples and demos
|
||||
|
||||
- If you are attending a live workshop:
|
||||
|
||||
- follow along with the demos, ask questions at any time
|
||||
|
||||
- if you can, try to run some of the examples and demos in your environment
|
||||
|
||||
- if things are going too fast, ask the trainer to slow down :)
|
||||
|
||||
- If you are watching a recording or only reading the slides:
|
||||
|
||||
- it is **strongly** recommended to run **all** the examples and demos
|
||||
|
||||
- take advantage of the fact that you can pause at any time
|
||||
|
||||
---
|
||||
|
||||
class: in-person
|
||||
|
||||
## Where are we going to run our containers?
|
||||
|
||||
---
|
||||
|
||||
class: in-person, pic
|
||||
|
||||

|
||||
|
||||
---
|
||||
|
||||
## If you're attending a live training or workshop
|
||||
|
||||
- Each person gets a private lab environment
|
||||
|
||||
- Your lab environments will be available for the duration of the workshop
|
||||
|
||||
(check with your instructor to know exactly when they'll be shut down)
|
||||
|
||||
- Note that for budget reasons¹, your environment will be fairly modest
|
||||
|
||||
- scenario 1: 4 nodes with 2 cores and 4 GB RAM ; no cluster autoscaling
|
||||
|
||||
- scenario 2: 1 node with 4 cores and 8 GB RAM ; cluster autoscaling
|
||||
|
||||
.footnote[¹That cloud thing is mighty expensive, yo]
|
||||
|
||||
---
|
||||
|
||||
## Running your own lab environment
|
||||
|
||||
- If you are following a self-paced course...
|
||||
|
||||
- Or watching a replay of a recorded course...
|
||||
|
||||
- ...You will need to set up a local environment for the labs
|
||||
|
||||
*or*
|
||||
|
||||
- If you want to use a specific cloud provider...
|
||||
|
||||
- Or want to see these concepts "at scale"...
|
||||
|
||||
- ...You can set up your own clusters with whatever capacity suits you
|
||||
|
||||
---
|
||||
|
||||
## Deploying your own Kubernetes cluster
|
||||
|
||||
- You need cloud provider credentials for this
|
||||
|
||||
- Option 1: use the cloud provider CLI, web UI, ...
|
||||
|
||||
- Option 2: use [one of these Terraform configurations][one-kubernetes]
|
||||
|
||||
(set `cluster_name`, `node_size`, `max_nodes_per_pool`, `location`, and GO!)
|
||||
|
||||
[one-kubernetes]: https://github.com/jpetazzo/container.training/tree/main/prepare-labs/terraform/one-kubernetes
|
||||
|
||||
---
|
||||
|
||||
## Deploying your own Kubernetes cluster.red[**s**]
|
||||
|
||||
- If you want to deliver your own training or workshop:
|
||||
|
||||
- deployment scripts are available in the [prepare-labs] directory
|
||||
|
||||
- you can use them to automatically deploy many lab environments
|
||||
|
||||
- they support many different infrastructure providers
|
||||
|
||||
- they can deploy dozens (even hundreds) of clusters at a time
|
||||
|
||||
[prepare-labs]: https://github.com/jpetazzo/container.training/tree/main/prepare-labs
|
||||
|
||||
---
|
||||
|
||||
## Our recommendation
|
||||
|
||||
- Any managed Kubernetes cluster
|
||||
|
||||
- Nodes with 8 GB of RAM (or more)
|
||||
|
||||
- At least 1 node (obviously!)
|
||||
|
||||
- Ideally, cluster autoscaling
|
||||
|
||||
(you can set the maximum number of nodes to 5)
|
||||
|
||||
- Alternatively, have a cluster of at least 3 nodes
|
||||
|
||||
(ideally a bit more to see the effect of scaling)
|
||||
|
||||
- Local tools: kubectl, Helm, Stern, Bento
|
||||
|
||||
- You can also use [shpod] to get a shell on the cluster
|
||||
|
||||
[shpod]: https://github.com/jpetazzo/shpod
|
||||
|
||||
---
|
||||
|
||||
## Example with Linode (create cluster)
|
||||
|
||||
- Make sure you have a [Linode account][linode-account]
|
||||
|
||||
- Install and configure the [Linode CLI][linode-cli]
|
||||
|
||||
- Create a cluster:
|
||||
```bash
|
||||
lin lke cluster-create --label mlops \
|
||||
--k8s_version=1.31 \
|
||||
--node_pools.type g6-standard-4 \
|
||||
--node_pools.count 1 \
|
||||
--node_pools.autoscaler.enabled true \
|
||||
--node_pools.autoscaler.min 1 \
|
||||
--node_pools.autoscaler.max 5 \
|
||||
#
|
||||
```
|
||||
|
||||
[linode-account]: https://login.linode.com/signup
|
||||
[linode-cli]: https://www.linode.com/products/cli/
|
||||
|
||||
---
|
||||
|
||||
## Example with Linode (retrieve kubeconfig)
|
||||
|
||||
- Retrieve the cluster ID:
|
||||
```bash
|
||||
CLUSTER_ID=$(lin lke clusters-list --label $CLUSTER_NAME --json | jq .[].id)
|
||||
```
|
||||
|
||||
- Wait until the cluster is provisioned:
|
||||
```bash
|
||||
while ! lin lke kubeconfig-view $CLUSTER_ID; do
|
||||
sleep 10
|
||||
done
|
||||
```
|
||||
|
||||
- Retrieve the cluster kubeconfig:
|
||||
```bash
|
||||
lin lke kubeconfig-view $CLUSTER_ID --json | jq -r .[].kubeconfig |
|
||||
base64 -d > kubeconfig.$CLUSTER_ID
|
||||
```
|
||||
|
||||
- And set the `KUBECONFIG` environment variable accordingly!
|
||||
|
||||
---
|
||||
|
||||
class: in-person
|
||||
|
||||
## Why don't we run containers locally?
|
||||
|
||||
- Installing this stuff can be hard on some machines
|
||||
|
||||
(32 bits CPU or OS... Laptops without administrator access... etc.)
|
||||
|
||||
- *"The whole team downloaded all these container images from the WiFi!
|
||||
<br/>... and it went great!"* (Literally no-one ever)
|
||||
|
||||
- All you need is a computer (or even a phone or tablet!), with:
|
||||
|
||||
- an Internet connection
|
||||
|
||||
- a web browser
|
||||
|
||||
- an SSH client
|
||||
|
||||
- Some of the demos require multiple nodes to demonstrate scaling
|
||||
@@ -158,8 +158,6 @@
|
||||
|
||||
- Let's see the specific details for each of them!
|
||||
|
||||
[grpc]: https://grpc.github.io/grpc/core/md_doc_health-checking.html
|
||||
|
||||
---
|
||||
|
||||
## `httpGet`
|
||||
@@ -296,8 +294,6 @@ class: extra-details
|
||||
|
||||
- Leverages standard [GRPC Health Checking Protocol][grpc]
|
||||
|
||||
[grpc]: https://grpc.github.io/grpc/core/md_doc_health-checking.html
|
||||
|
||||
---
|
||||
|
||||
## Timing and thresholds
|
||||
@@ -513,7 +509,10 @@ class: extra-details
|
||||
|
||||
- Sometimes it can also make sense to embed a web server in the worker
|
||||
|
||||
[grpc]: https://grpc.github.io/grpc/core/md_doc_health-checking.html
|
||||
|
||||
???
|
||||
|
||||
:EN:- Using healthchecks to improve availability
|
||||
:FR:- Utiliser des *healthchecks* pour améliorer la disponibilité
|
||||
|
||||
|
||||
@@ -51,7 +51,7 @@
|
||||
- instructions indicating to users "please tweak this and that in the YAML"
|
||||
|
||||
- That's where using something like
|
||||
[CUE](https://github.com/cuelang/cue/blob/v0.3.2/doc/tutorial/kubernetes/README.md),
|
||||
[CUE](https://github.com/cue-labs/cue-by-example/tree/main/003_kubernetes_tutorial),
|
||||
[Kustomize](https://kustomize.io/),
|
||||
or [Helm](https://helm.sh/) can help!
|
||||
|
||||
@@ -86,8 +86,6 @@
|
||||
|
||||
- On April 30th 2020, Helm was the 10th project to *graduate* within the CNCF
|
||||
|
||||
🎉
|
||||
|
||||
(alongside Containerd, Prometheus, and Kubernetes itself)
|
||||
|
||||
- This is an acknowledgement by the CNCF for projects that
|
||||
@@ -99,6 +97,8 @@
|
||||
- See [CNCF announcement](https://www.cncf.io/announcement/2020/04/30/cloud-native-computing-foundation-announces-helm-graduation/)
|
||||
and [Helm announcement](https://helm.sh/blog/celebrating-helms-cncf-graduation/)
|
||||
|
||||
- In other words: Helm is here to stay
|
||||
|
||||
---
|
||||
|
||||
## Helm concepts
|
||||
@@ -173,11 +173,13 @@ or `apt` tools).
|
||||
|
||||
- Helm 3 doesn't use `tiller` at all, making it simpler (yay!)
|
||||
|
||||
- If you see references to `tiller` in a tutorial, documentation... that doc is obsolete!
|
||||
|
||||
---
|
||||
|
||||
class: extra-details
|
||||
|
||||
## With or without `tiller`
|
||||
## What was the problem with `tiller`?
|
||||
|
||||
- With Helm 3:
|
||||
|
||||
@@ -193,9 +195,7 @@ class: extra-details
|
||||
|
||||
- This indirect model caused significant permissions headaches
|
||||
|
||||
(`tiller` required very broad permissions to function)
|
||||
|
||||
- `tiller` was removed in Helm 3 to simplify the security aspects
|
||||
- It also made it more complicated to embed Helm in other tools
|
||||
|
||||
---
|
||||
|
||||
@@ -222,59 +222,6 @@ class: extra-details
|
||||
|
||||
---
|
||||
|
||||
class: extra-details
|
||||
|
||||
## Only if using Helm 2 ...
|
||||
|
||||
- We need to install Tiller and give it some permissions
|
||||
|
||||
- Tiller is composed of a *service* and a *deployment* in the `kube-system` namespace
|
||||
|
||||
- They can be managed (installed, upgraded...) with the `helm` CLI
|
||||
|
||||
.lab[
|
||||
|
||||
- Deploy Tiller:
|
||||
```bash
|
||||
helm init
|
||||
```
|
||||
|
||||
]
|
||||
|
||||
At the end of the install process, you will see:
|
||||
|
||||
```
|
||||
Happy Helming!
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
class: extra-details
|
||||
|
||||
## Only if using Helm 2 ...
|
||||
|
||||
- Tiller needs permissions to create Kubernetes resources
|
||||
|
||||
- In a more realistic deployment, you might create per-user or per-team
|
||||
service accounts, roles, and role bindings
|
||||
|
||||
.lab[
|
||||
|
||||
- Grant `cluster-admin` role to `kube-system:default` service account:
|
||||
```bash
|
||||
kubectl create clusterrolebinding add-on-cluster-admin \
|
||||
--clusterrole=cluster-admin --serviceaccount=kube-system:default
|
||||
```
|
||||
|
||||
|
||||
]
|
||||
|
||||
(Defining the exact roles and permissions on your cluster requires
|
||||
a deeper knowledge of Kubernetes' RBAC model. The command above is
|
||||
fine for personal and development clusters.)
|
||||
|
||||
---
|
||||
|
||||
## Charts and repositories
|
||||
|
||||
- A *repository* (or repo in short) is a collection of charts
|
||||
@@ -293,27 +240,7 @@ fine for personal and development clusters.)
|
||||
|
||||
---
|
||||
|
||||
class: extra-details
|
||||
|
||||
## How to find charts, the old way
|
||||
|
||||
- Helm 2 came with one pre-configured repo, the "stable" repo
|
||||
|
||||
(located at https://charts.helm.sh/stable)
|
||||
|
||||
- Helm 3 doesn't have any pre-configured repo
|
||||
|
||||
- The "stable" repo mentioned above is now being deprecated
|
||||
|
||||
- The new approach is to have fully decentralized repos
|
||||
|
||||
- Repos can be indexed in the Artifact Hub
|
||||
|
||||
(which supersedes the Helm Hub)
|
||||
|
||||
---
|
||||
|
||||
## How to find charts, the new way
|
||||
## How to find charts
|
||||
|
||||
- Go to the [Artifact Hub](https://artifacthub.io/packages/search?kind=0) (https://artifacthub.io)
|
||||
|
||||
@@ -409,24 +336,6 @@ Note: it is also possible to install directly a chart, with `--repo https://...`
|
||||
|
||||
---
|
||||
|
||||
class: extra-details
|
||||
|
||||
## Searching and installing with Helm 2
|
||||
|
||||
- Helm 2 doesn't have support for the Helm Hub
|
||||
|
||||
- The `helm search` command only takes a search string argument
|
||||
|
||||
(e.g. `helm search juice-shop`)
|
||||
|
||||
- With Helm 2, the name is optional:
|
||||
|
||||
`helm install juice/juice-shop` will automatically generate a name
|
||||
|
||||
`helm install --name my-juice-shop juice/juice-shop` will specify a name
|
||||
|
||||
---
|
||||
|
||||
## Viewing resources of a release
|
||||
|
||||
- This specific chart labels all its resources with a `release` label
|
||||
@@ -542,11 +451,11 @@ All unspecified values will take the default values defined in the chart.
|
||||
|
||||
:EN:- Helm concepts
|
||||
:EN:- Installing software with Helm
|
||||
:EN:- Helm 2, Helm 3, and the Helm Hub
|
||||
:EN:- Finding charts on the Artifact Hub
|
||||
|
||||
:FR:- Fonctionnement général de Helm
|
||||
:FR:- Installer des composants via Helm
|
||||
:FR:- Helm 2, Helm 3, et le *Helm Hub*
|
||||
:FR:- Trouver des *charts* sur *Artifact Hub*
|
||||
|
||||
:T: Getting started with Helm and its concepts
|
||||
|
||||
|
||||
165
slides/k8s/helmfile.md
Normal file
165
slides/k8s/helmfile.md
Normal file
@@ -0,0 +1,165 @@
|
||||
# Managing our stack with `helmfile`
|
||||
|
||||
- We've installed a few things with Helm
|
||||
|
||||
- And others with raw YAML manifests
|
||||
|
||||
- Perhaps you've used Kustomize sometimes
|
||||
|
||||
- How can we automate all this? Make it reproducible?
|
||||
|
||||
---
|
||||
|
||||
## Requirements
|
||||
|
||||
- We want something that is *idempotent*
|
||||
|
||||
= running it 1, 2, 3 times, should only install the stack once
|
||||
|
||||
- We want something that handles updates
|
||||
|
||||
= modifying / reconfiguring without restarting from scratch
|
||||
|
||||
- We want something that is configurable
|
||||
|
||||
= with e.g. configuration files, environment variables...
|
||||
|
||||
- We want something that can handle *partial removals*
|
||||
|
||||
= ability to remove one element without affecting the rest
|
||||
|
||||
- Inspiration: Terraform, Docker Compose...
|
||||
|
||||
---
|
||||
|
||||
## Shell scripts?
|
||||
|
||||
✅ Idempotent, thanks to `kubectl apply -f`, `helm upgrade --install`
|
||||
|
||||
✅ Handles updates (edit script, re-run)
|
||||
|
||||
✅ Configurable
|
||||
|
||||
❌ Partial removals
|
||||
|
||||
If we remove an element from our script, it won't be uninstalled automatically.
|
||||
|
||||
---
|
||||
|
||||
## Umbrella chart?
|
||||
|
||||
Helm chart with dependencies on other charts.
|
||||
|
||||
✅ Idempotent
|
||||
|
||||
✅ Handles updates
|
||||
|
||||
✅ Configurable (with Helm values: YAML files and `--set`)
|
||||
|
||||
✅ Partial removals
|
||||
|
||||
❌ Complex (requires to learn advanced Helm features)
|
||||
|
||||
❌ Requires everything to be a Helm chart (adds (lots of) boilerplate)
|
||||
|
||||
---
|
||||
|
||||
## Helmfile
|
||||
|
||||
https://github.com/helmfile/helmfile
|
||||
|
||||
✅ Idempotent
|
||||
|
||||
✅ Handles updates
|
||||
|
||||
✅ Configurable (with values files, environment variables, and more)
|
||||
|
||||
✅ Partial removals
|
||||
|
||||
✅ Fairly easy to get started
|
||||
|
||||
🐙 Sometimes feels like summoning unspeakable powers / staring down the abyss
|
||||
|
||||
---
|
||||
|
||||
## What `helmfile` can install
|
||||
|
||||
- Helm charts from remote Helm repositories
|
||||
|
||||
- Helm charts from remote git repositories
|
||||
|
||||
- Helm charts from local directories
|
||||
|
||||
- Kustomizations
|
||||
|
||||
- Directories with raw YAML manifests
|
||||
|
||||
---
|
||||
|
||||
## How `helmfile` works
|
||||
|
||||
- Everything is defined in a main `helmfile.yaml`
|
||||
|
||||
- That file defines:
|
||||
|
||||
- `repositories` (remote Helm repositories)
|
||||
|
||||
- `releases` (things to install: Charts, YAML...)
|
||||
|
||||
- `environments` (optional: to specialize prod vs staging vs ...)
|
||||
|
||||
- Helm-style values files can be loaded in `environments`
|
||||
|
||||
- These values can then be used in the rest of the Helmfile
|
||||
|
||||
- Examples: [install essentials on a cluster][helmfile-ex-1], [run a Bento stack][helmfile-ex-2]
|
||||
|
||||
[helmfile-ex-1]: https://github.com/jpetazzo/beyond-load-balancers/blob/main/helmfile.yaml
|
||||
[helmfile-ex-2]: https://github.com/jpetazzo/beyond-load-balancers/blob/main/bento/helmfile.yaml
|
||||
|
||||
---
|
||||
|
||||
## `helmfile` commands
|
||||
|
||||
- `helmfile init` (optional; downloads plugins if needed)
|
||||
|
||||
- `helmfile apply` (updates all releases that have changed)
|
||||
|
||||
- `helmfile sync` (updates all releases even if they haven't changed)
|
||||
|
||||
- `helmfile destroy` (guess!)
|
||||
|
||||
---
|
||||
|
||||
## Helmfile tips
|
||||
|
||||
As seen in [this example](https://github.com/jpetazzo/beyond-load-balancers/blob/main/bento/helmfile.yaml#L21):
|
||||
|
||||
- variables can be used to simplify the file
|
||||
|
||||
- configuration values and secrets can be loaded from external sources
|
||||
|
||||
(Kubernetes Secrets, Vault... See [vals] for details)
|
||||
|
||||
- current namespace isn't exposed by default
|
||||
|
||||
- there's often more than one way to do it!
|
||||
|
||||
(this particular section could be improved by using Bento `${...}`)
|
||||
|
||||
[vals]: https://github.com/helmfile/vals
|
||||
---
|
||||
|
||||
## 🏗️ Let's build something!
|
||||
|
||||
- Write a helmfile (or two) to set up today's entire stack on a brand new cluster!
|
||||
|
||||
- Suggestion:
|
||||
|
||||
- one helmfile for singleton, cluster components
|
||||
<br/>
|
||||
(All our operators: Prometheus, Grafana, KEDA, CNPG, RabbitMQ Operator)
|
||||
|
||||
- one helmfile for the application stack
|
||||
<br/>
|
||||
(Bento, PostgreSQL cluster, RabbitMQ)
|
||||
@@ -96,7 +96,7 @@ class: extra-details
|
||||
|
||||
---
|
||||
|
||||
## Choose your adventure!
|
||||
## Choose your own adventure!
|
||||
|
||||
- We present 3 methods to obtain a certificate
|
||||
|
||||
|
||||
@@ -572,7 +572,7 @@ This is normal: we haven't provided any ingress rule yet.
|
||||
|
||||
- Create a prefix match rule for the `blue` service:
|
||||
```bash
|
||||
kubectl create ingress bluestar --rule=/blue*:blue:80
|
||||
kubectl create ingress bluestar --rule=/blue*=blue:80
|
||||
```
|
||||
|
||||
- Check that it works:
|
||||
|
||||
@@ -128,7 +128,9 @@ configMapGenerator:
|
||||
|
||||
- A *variant* is the final outcome of applying bases + overlays
|
||||
|
||||
(See the [kustomize glossary](https://github.com/kubernetes-sigs/kustomize/blob/master/docs/glossary.md) for more definitions!)
|
||||
(See the [kustomize glossary][glossary] for more definitions!)
|
||||
|
||||
[glossary]: https://kubectl.docs.kubernetes.io/references/kustomize/glossary/
|
||||
|
||||
---
|
||||
|
||||
@@ -337,7 +339,7 @@ kustomize edit add label app.kubernetes.io/name:dockercoins
|
||||
|
||||
- Assuming that `commonLabels` have been set as shown on the previous slide:
|
||||
```bash
|
||||
kubectl apply -k . --prune --selector app.kubernetes.io.name=dockercoins
|
||||
kubectl apply -k . --prune --selector app.kubernetes.io/name=dockercoins
|
||||
```
|
||||
|
||||
- ... This command removes resources that have been removed from the kustomization
|
||||
|
||||
@@ -536,12 +536,12 @@ Note: the `apiVersion` field appears to be optional.
|
||||
- Excerpt:
|
||||
```yaml
|
||||
generate:
|
||||
kind: LimitRange
|
||||
name: default-limitrange
|
||||
namespace: "{{request.object.metadata.name}}"
|
||||
data:
|
||||
spec:
|
||||
limits:
|
||||
kind: LimitRange
|
||||
name: default-limitrange
|
||||
namespace: "{{request.object.metadata.name}}"
|
||||
data:
|
||||
spec:
|
||||
limits:
|
||||
```
|
||||
|
||||
- Note that we have to specify the `namespace`
|
||||
|
||||
@@ -195,4 +195,4 @@ class: extra-details
|
||||
:EN:- Installing metrics-server
|
||||
|
||||
:EN:- Le *resource metrics pipeline*
|
||||
:FR:- Installtion de metrics-server
|
||||
:FR:- Installation de metrics-server
|
||||
|
||||
53
slides/k8s/mlops-headsup.md
Normal file
53
slides/k8s/mlops-headsup.md
Normal file
@@ -0,0 +1,53 @@
|
||||
## What we will / won't cover
|
||||
|
||||
- Kubernetes provides low-level building blocks (pods, deployments, services...)
|
||||
|
||||
- There are many high-level frameworks out there for serverless, AI...:
|
||||
|
||||
[Knative](https://knative.dev/docs/),
|
||||
[KubeAI](https://www.kubeai.org/),
|
||||
[Kueue](https://kueue.sigs.k8s.io/)...
|
||||
|
||||
- We're going to sit somewhere in the middle:
|
||||
|
||||
reimplement some of the features of these high-level frameworks, in a flexible way
|
||||
|
||||
- This workshop will (hopefully!) give you a better eye to evaluate these frameworks, too
|
||||
|
||||
- We won't showcase GPUs today for budget reasons
|
||||
|
||||
(giving everyone a few GPU nodes would be prohibitive, sorry!)
|
||||
|
||||
---
|
||||
|
||||
## A word about our demo app
|
||||
|
||||
- We'll use Ollama with a relatively small LLM
|
||||
|
||||
(qwen2:1.5b)
|
||||
|
||||
- We'll use it to generate very short completions
|
||||
|
||||
(a few seconds of CPU)
|
||||
|
||||
- All the challenges that we will address are also visible on longer requests
|
||||
|
||||
(in fact, they are even more visible on longer requests!)
|
||||
|
||||
- We're sticking to short requests to save time and cover a lot of ground today
|
||||
|
||||
(but feel free to use more expensive prompts if you'd like!)
|
||||
|
||||
---
|
||||
|
||||
## Tiny bit of backstory...
|
||||
|
||||
The original prompt that we used when building the first version of this content was:
|
||||
|
||||
```
|
||||
If you go to {city}, I suggest that you
|
||||
```
|
||||
|
||||
This would typically take 10-30 seconds - and with much bigger Kubernetes nodes.
|
||||
|
||||
Today, we suggest that we use a prompt that generates shorter answers!
|
||||
343
slides/k8s/ollama-intro.md
Normal file
343
slides/k8s/ollama-intro.md
Normal file
@@ -0,0 +1,343 @@
|
||||
# Ollama in a nutshell
|
||||
|
||||
https://ollama.dev
|
||||
|
||||
"Get up and running with large language models"
|
||||
|
||||
"Docker, but for LLMs"
|
||||
|
||||
- Server to host (run) LLMs
|
||||
|
||||
- Controlled with CLI or API
|
||||
|
||||
- Download a model with `ollama pull`
|
||||
|
||||
- Run inference with `ollama run`
|
||||
|
||||
---
|
||||
|
||||
## Quick demo
|
||||
|
||||
⚠️ **Important note 1:** the commands in this section aren't meant
|
||||
to be executed on your Kubernetes clusters. They are meant to
|
||||
be executed on a local machine, and they assume that Ollama is
|
||||
installed and running. If you don't have Ollama on your local
|
||||
machine, it's OK to skip these demos!
|
||||
|
||||
⚠️ **Important note 2:** the models used by Ollama are fairly big
|
||||
(1.5 GB for the one used here; up to 10s or 100s of GB for bigger
|
||||
models). We do not recommend downloading them on conference WiFi.
|
||||
|
||||
Assuming Ollama is installed and running:
|
||||
|
||||
```
|
||||
ollama run qwen2:1.5b "What's the solution to global warming?"
|
||||
```
|
||||
|
||||
We're going to use this model because it's relatively small.
|
||||
|
||||
Many others are available (see https://ollama.dev/search).
|
||||
|
||||
---
|
||||
|
||||
## Other useful commands
|
||||
|
||||
- Start an interactive chat session:
|
||||
```bash
|
||||
ollama run qwen2:1.5b
|
||||
```
|
||||
|
||||
- Pull a model (or check for updates):
|
||||
```bash
|
||||
ollama pull qwen2:1.5b
|
||||
```
|
||||
|
||||
- See information on a model:
|
||||
```bash
|
||||
ollama show qwen2:1.5b
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Models on disk, in memory
|
||||
|
||||
- See models available on disk:
|
||||
```bash
|
||||
ollama list
|
||||
```
|
||||
|
||||
- See models loaded in memory:
|
||||
```bash
|
||||
ollama ps
|
||||
```
|
||||
|
||||
- Unload a model:
|
||||
```bash
|
||||
ollama stop qwen2:1.5b
|
||||
```
|
||||
|
||||
Models are automatically unloaded after 5 minutes (by default).
|
||||
|
||||
Ollama loads models in RAM, and in VRAM if it detects a supported GPU.
|
||||
|
||||
---
|
||||
|
||||
# Ollama on Kubernetes
|
||||
|
||||
Let's run Ollama on our Kubernetes cluster!
|
||||
|
||||
- Option 1: `kubectl run`
|
||||
|
||||
- Option 2: create a Deployment and a Service
|
||||
|
||||
- Option 3: use a Helm chart
|
||||
|
||||
---
|
||||
|
||||
## 1️⃣ `kubectl run`
|
||||
|
||||
Note: the `ollama/ollama` image is quite big (~2 GB transfer, ~4 GB on disk).
|
||||
|
||||
```bash
|
||||
kubectl run ollama --image ollama/ollama
|
||||
```
|
||||
|
||||
Wait for the pod to be up and running:
|
||||
```bash
|
||||
kubectl wait pod ollama --for=condition=Ready
|
||||
```
|
||||
|
||||
(If that command times out, try again and/or specify a higher timeout.)
|
||||
|
||||
```bash
|
||||
kubectl exec ollama -- ollama run qwen2:1.5b "What's Bach's best piece?"
|
||||
```
|
||||
|
||||
Shutdown the pod:
|
||||
```bash
|
||||
kubectl delete pod ollama
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 2️⃣ Deployment + Service
|
||||
|
||||
Create the Deployment:
|
||||
```bash
|
||||
kubectl create deployment ollama --image ollama/ollama
|
||||
```
|
||||
|
||||
Create the Service:
|
||||
```bash
|
||||
kubectl create service clusterip ollama --tcp 11434
|
||||
```
|
||||
|
||||
Wait for the Service Endpoints to be available:
|
||||
```bash
|
||||
kubectl wait endpoints ollama --for=jsonpath={..ip}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## By the way... Why port 11434?
|
||||
|
||||
| 1 | 1 | 4 | 3 | 4 |
|
||||
|---|---|---|---|---|
|
||||
| L | L | A | M | A |
|
||||
|
||||
---
|
||||
|
||||
## Connecting to the Service
|
||||
|
||||
Let's use the `/api/generate` endpoint:
|
||||
|
||||
```bash
|
||||
kubectl run httpclient --rm -it --image alpine/httpie -- --ignore-stdin \
|
||||
http://ollama:11434/api/generate \
|
||||
model=qwen2:1.5b prompt="Write a limerick about Kubernetes"
|
||||
```
|
||||
|
||||
(See [Ollama API docs](https://github.com/ollama/ollama/blob/main/docs/api.md#generate-a-completion) for details.)
|
||||
|
||||
--
|
||||
|
||||
🤔 We get an error: the model needs to be downloaded first.
|
||||
|
||||
💡 When we used the `ollama run` CLI command earlier, it did it automatically for us.
|
||||
|
||||
---
|
||||
|
||||
## Pulling the model
|
||||
|
||||
Method 1:
|
||||
```bash
|
||||
kubectl exec deployment/ollama -- ollama pull qwen2:1.5b
|
||||
```
|
||||
|
||||
Method 2:
|
||||
```bash
|
||||
kubectl run httpclient --rm -it --image alpine/httpie -- --ignore-stdin \
|
||||
http://ollama:11434/api/pull \
|
||||
name=qwen2:1.5b
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Houston, we (are going to) have a problem...
|
||||
|
||||
- This works when there is only one pod
|
||||
|
||||
- What happens if we scale up the Deployment?
|
||||
|
||||
- We need to pull the model on every pod
|
||||
|
||||
- How should we do that?
|
||||
|
||||
---
|
||||
|
||||
## Potential solutions
|
||||
|
||||
- Bake the model into the image
|
||||
|
||||
🙅 Personal opinion: this is a bad idea (image size, maintenance...)
|
||||
|
||||
- Directly send a "pull" command to each pod, individually
|
||||
|
||||
🙁 Hackish, not great
|
||||
|
||||
- Use a Kubernetes lifecycle hook
|
||||
|
||||
💡 That works!
|
||||
|
||||
- Use a sidecar container to pull the model
|
||||
|
||||
🤔 Doable, but more work than the lifecycle hook
|
||||
|
||||
---
|
||||
|
||||
## 🙋 Choose your own adventure
|
||||
|
||||
Should we add that lifecycle hook?
|
||||
|
||||
---
|
||||
|
||||
## 3️⃣ Helm chart
|
||||
|
||||
- Let's check the [ArtifactHUB] for an Ollama Helm chart
|
||||
|
||||
- The most popular (as of November 2024) is [this one, by OTWLD][ollama-chart]
|
||||
|
||||
- ~~It has pockets~~
|
||||
|
||||
- It can pre-pull models! 🎉
|
||||
|
||||
[ArtifactHub]: https://artifacthub.io
|
||||
[ollama-chart]: https://artifacthub.io/packages/helm/ollama-helm/ollama
|
||||
|
||||
---
|
||||
|
||||
## Installing the Helm chart
|
||||
|
||||
Traditional method:
|
||||
```bash
|
||||
helm repo add ollama https://otwld.github.io/ollama-helm/
|
||||
helm install ollama ollama/ollama --set ollama.models={qwen2:1.5b}
|
||||
```
|
||||
|
||||
Idempotent¹, single-command method:
|
||||
```bash
|
||||
helm upgrade --install --repo https://otwld.github.io/ollama-helm/ \
|
||||
ollama ollama --set ollama.models={qwen2:1.5b}
|
||||
```
|
||||
|
||||
.footnote[¹Idempotent: which can be executed multiple times without adverse effect.]
|
||||
|
||||
---
|
||||
|
||||
## Testing the Helm installation
|
||||
|
||||
Just like before:
|
||||
```bash
|
||||
kubectl run httpclient --rm -it --image alpine/httpie -- --ignore-stdin \
|
||||
http://ollama:11434/api/generate \
|
||||
model=qwen2:1.5b prompt="Write a limerick about YAML" stream:=false
|
||||
```
|
||||
|
||||
And while we're here, check resource usage:
|
||||
```bash
|
||||
kubectl exec deployment/ollama -ti -- top
|
||||
```
|
||||
|
||||
There should be two processes:
|
||||
|
||||
- `ollama` itself, relatively small (~100 MB)
|
||||
|
||||
- the LLM subprocess, relatively big (~1.4 GB for qwen2:1.5b)
|
||||
|
||||
---
|
||||
|
||||
class: extra-details
|
||||
|
||||
## HTTPie
|
||||
|
||||
https://httpie.io/
|
||||
|
||||
- CLI client to send requests to web services
|
||||
|
||||
- Similar to curl, but made specifically to talk to API backends
|
||||
|
||||
```bash
|
||||
httpie <URL> [key=value] [key=value] [key:=value]
|
||||
```
|
||||
|
||||
- The `key=value` pairs get turned into a JSON object
|
||||
|
||||
- `key:=value` indicates a parameter to be sent "as-is"
|
||||
|
||||
(ideal for e.g. boolean or numbers)
|
||||
|
||||
---
|
||||
|
||||
## Sending some load
|
||||
|
||||
We're going to use `hey`:
|
||||
|
||||
```bash
|
||||
kubectl run hey --rm -it --image nixery.dev/hey -- \
|
||||
hey -c 10 -n 10 -t 60 -m POST \
|
||||
-d '{"model": "qwen2:1.5b", "prompt": "vi or emacs?"}' \
|
||||
http://ollama:11434/api/generate
|
||||
```
|
||||
|
||||
Some explanations:
|
||||
|
||||
- `nixery.dev` = automatically generates images with [Nixery]
|
||||
- `-c` = concurrent requests
|
||||
- `-n` = total number of requests
|
||||
- `-t` = timeout in seconds
|
||||
|
||||
This is probably going to take (literally) a minute.
|
||||
|
||||
[Nixery]: https://nixery.dev/
|
||||
|
||||
---
|
||||
|
||||
## Performance analysis
|
||||
|
||||
- Let's start an interactive container with `hey`
|
||||
|
||||
(e.g., use the `alpine` image, then `apk add hey`)
|
||||
|
||||
- Try 10 requests, with a concurrency of 1/2/4
|
||||
|
||||
- Meanwhile, check the logs of the `ollama` pod
|
||||
|
||||
- Some results (your results may vary depending on CPU, random seed...):
|
||||
|
||||
- 1 = 0.08 reqs/s, average latency: 12s
|
||||
- 2 = 0.10 reqs/s, average latency: 18s
|
||||
- 4 = 0.12 reqs/s, average latency: 28s
|
||||
|
||||
- Higher concurrency = slightly higher throughput, much higher latency
|
||||
|
||||
🤔 We need metrics!
|
||||
273
slides/k8s/ollama-metrics.md
Normal file
273
slides/k8s/ollama-metrics.md
Normal file
@@ -0,0 +1,273 @@
|
||||
# Adding metrics
|
||||
|
||||
We want multiple kinds of metrics:
|
||||
|
||||
- instantaneous pod and node resource usage
|
||||
|
||||
- historical resource usage (=graphs)
|
||||
|
||||
- request duration
|
||||
|
||||
---
|
||||
|
||||
## 1️⃣ Instantaneous resource usage
|
||||
|
||||
- We're going to use metrics-server
|
||||
|
||||
- Check if it's already installed:
|
||||
```bash
|
||||
kubectl top nodes
|
||||
```
|
||||
|
||||
- If we see a list of nodes, with CPU and RAM usage:
|
||||
|
||||
*great, metrics-server is installed!*
|
||||
|
||||
- If we see `error: Metrics API not available`:
|
||||
|
||||
*metrics-server isn't installed, so we'll install it!*
|
||||
|
||||
---
|
||||
|
||||
## Installing metrics-server
|
||||
|
||||
- In a lot of places, this is done with a little bit of custom YAML
|
||||
|
||||
(derived from the [official installation instructions](https://github.com/kubernetes-sigs/metrics-server#installation))
|
||||
|
||||
- We can also use a Helm chart:
|
||||
```bash
|
||||
helm upgrade --install metrics-server metrics-server \
|
||||
--create-namespace --namespace metrics-server \
|
||||
--repo https://kubernetes-sigs.github.io/metrics-server/ \
|
||||
--set args={--kubelet-insecure-tls=true}
|
||||
```
|
||||
|
||||
- The `args` flag specified above should be sufficient on most clusters
|
||||
|
||||
- After a minute, `kubectl top nodes` should show resource usage
|
||||
|
||||
---
|
||||
|
||||
## 2️⃣ Historical resource usage
|
||||
|
||||
- We're going to use Prometheus (specifically: kube-prometheus-stack)
|
||||
|
||||
- This is a Helm chart bundling:
|
||||
|
||||
- Prometheus
|
||||
|
||||
- multiple exporters (node, kube-state-metrics...)
|
||||
|
||||
- Grafana
|
||||
|
||||
- a handful of Grafana dashboards
|
||||
|
||||
- Open Source
|
||||
|
||||
- Commercial alternatives: Datadog, New Relic...
|
||||
|
||||
---
|
||||
|
||||
## Installing kube-prometheus-stack
|
||||
|
||||
We're going to expose both Prometheus and Grafana with a NodePort:
|
||||
|
||||
```bash
|
||||
helm upgrade --install --repo https://prometheus-community.github.io/helm-charts \
|
||||
promstack kube-prometheus-stack \
|
||||
--namespace prom-system --create-namespace \
|
||||
--set prometheus.service.type=NodePort \
|
||||
--set grafana.service.type=NodePort \
|
||||
--set prometheus.prometheusSpec.podMonitorSelectorNilUsesHelmValues=false \
|
||||
--set prometheus.prometheusSpec.serviceMonitorSelectorNilUsesHelmValues=false \
|
||||
#
|
||||
```
|
||||
|
||||
This chart installation can take a while (up to a couple of minutes).
|
||||
|
||||
---
|
||||
|
||||
class: extra-details
|
||||
|
||||
## `...NilUsesHelmValues=false` ???
|
||||
|
||||
- kube-prometheus-stack uses the "Prometheus Operator"
|
||||
|
||||
- To configure "scrape targets", we create PodMonitor or ServiceMonitor resources
|
||||
|
||||
- By default, the Prometheus Operator will only look at \*Monitors with the right labels
|
||||
|
||||
- Our extra options mean "use all the Monitors that you find!"
|
||||
|
||||
---
|
||||
|
||||
## Connecting to Grafana
|
||||
|
||||
Check the NodePort allocated to Grafana:
|
||||
|
||||
```bash
|
||||
kubectl get service promstack-grafana --namespace prom-system
|
||||
```
|
||||
|
||||
Get the public address of one of our nodes:
|
||||
|
||||
```bash
|
||||
kubectl get nodes -o wide
|
||||
```
|
||||
|
||||
In a browser, connect to the public address of any node, on the node port.
|
||||
|
||||
The default login and password are `admin` / `prom-operator`.
|
||||
|
||||
Check the dashboard "Kubernetes / Compute Resources / Namespace (Pods)".
|
||||
|
||||
Select a namespace and see the CPU and RAM usage for the pods in that namespace.
|
||||
|
||||
---
|
||||
|
||||
## 3️⃣ Request duration
|
||||
|
||||
- Unfortunately, as of November 2024, ollama doesn't expose metrics
|
||||
|
||||
(there is ongoing discussion about it: [issue 3144][3144], [PR 6537][6537])
|
||||
|
||||
- There are some [garbage AI-generated blog posts claiming otherwise][garbage]
|
||||
|
||||
(but it's AI-generated, so it bears no connection to truth whatsoever)
|
||||
|
||||
- So, what can we do?
|
||||
|
||||
[3144]: https://github.com/ollama/ollama/issues/3144#issuecomment-2153184254
|
||||
[6537]: https://github.com/ollama/ollama/pull/6537
|
||||
[garbage]: https://www.arsturn.com/blog/setting-up-ollama-prometheus-metrics
|
||||
|
||||
---
|
||||
|
||||
## HAProxy to the rescue
|
||||
|
||||
- HAProxy is a proxy that can handle TCP, HTTP, and more
|
||||
|
||||
- It can expose detailed Prometheus metrics about HTTP requests
|
||||
|
||||
- The plan: add a sidecar HAProxy to each Ollama container
|
||||
|
||||
- For that, we need to give up on the Ollama Helm chart
|
||||
|
||||
(and go back to basic manifests)
|
||||
|
||||
---
|
||||
|
||||
## 🙋 Choose your own adventure
|
||||
|
||||
Do we want to...
|
||||
|
||||
- write all the corresponding manifests?
|
||||
|
||||
- look at pre-written manifests and explain how they work?
|
||||
|
||||
- apply the manifests and carry on?
|
||||
|
||||
---
|
||||
|
||||
## 🏗️ Let's build something!
|
||||
|
||||
- If you have created Deployments / Services: clean them up first!
|
||||
|
||||
- Deploy Ollama with a sidecar HAProxy (sample configuration on next slide)
|
||||
|
||||
- Run a short benchmark campaign
|
||||
|
||||
(e.g. scale to 4 pods, try 4/8/16 parallel requests, 2 minutes each)
|
||||
|
||||
- Check live resource usage with `kubectl top nodes` / `kubectl top pods`
|
||||
|
||||
- Check historical usage with the Grafana dashboards
|
||||
|
||||
(for HAProxy metrics, you can use [Grafana dashboard 12693, HAProxy 2 Full][grafana-12693])
|
||||
|
||||
- If you don't want to write the manifests, you can use [these][ollama-yaml]
|
||||
|
||||
[grafana-12693]: https://grafana.com/grafana/dashboards/12693-haproxy-2-full/
|
||||
[ollama-yaml]: https://github.com/jpetazzo/beyond-load-balancers/tree/main/ollama
|
||||
|
||||
---
|
||||
|
||||
```
|
||||
global
|
||||
#log stdout format raw local0
|
||||
#daemon
|
||||
maxconn 32
|
||||
defaults
|
||||
#log global
|
||||
timeout client 1h
|
||||
timeout connect 1h
|
||||
timeout server 1h
|
||||
mode http
|
||||
`option abortonclose`
|
||||
frontend metrics
|
||||
bind :9000
|
||||
http-request use-service prometheus-exporter
|
||||
frontend ollama_frontend
|
||||
bind :8000
|
||||
default_backend ollama_backend
|
||||
`maxconn 16`
|
||||
backend ollama_backend
|
||||
server ollama_server localhost:11434 check
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
class: extra-details
|
||||
|
||||
## ⚠️ Connection queues
|
||||
|
||||
- HAProxy will happily queue *many* connections
|
||||
|
||||
- If a client sends a request, then disconnects:
|
||||
|
||||
- the request stays in the queue
|
||||
|
||||
- the request gets processed by the backend
|
||||
|
||||
- eventually, when the backend starts sending the reply, the connection is closed
|
||||
|
||||
- This can result in a backlog of queries that takes a long time to clear
|
||||
|
||||
- To avoid that: `option abortonclose` (see [HAProxy docs for details][abortonclose])
|
||||
|
||||
- Note that the issue is less severe when replies are streamed
|
||||
|
||||
[abortonclose]: https://www.haproxy.com/documentation/haproxy-configuration-manual/latest/#4-option%20abortonclose
|
||||
|
||||
---
|
||||
|
||||
class: extra-details
|
||||
|
||||
## Ad-hoc HAProxy dashboard
|
||||
|
||||
- To consolidate all frontend and backend queues on a single graph:
|
||||
|
||||
- query: `haproxy_frontend_current_sessions`
|
||||
|
||||
- legend: `{{namespace}}/{{pod}}/{{proxy}}`
|
||||
|
||||
- options, "Color scheme", select "Classic palette (by series name)"
|
||||
|
||||
---
|
||||
|
||||
## What do we see?
|
||||
|
||||
- Imperfect load balancing
|
||||
|
||||
- Some backends receive more requests than others
|
||||
|
||||
- Sometimes, some backends are idle while others are busy
|
||||
|
||||
- However, CPU utilization on the node is maxed out
|
||||
|
||||
- This is because our node is oversubscribed
|
||||
|
||||
- This is because we haven't specified resource requests/limits (yet)
|
||||
|
||||
(we'll do that later!)
|
||||
155
slides/k8s/ollama-reqlim.md
Normal file
155
slides/k8s/ollama-reqlim.md
Normal file
@@ -0,0 +1,155 @@
|
||||
## Setting resource requests and limits
|
||||
|
||||
- Thanks to *requests*:
|
||||
|
||||
- our pods will have resources *reserved* for them
|
||||
|
||||
- we won't pack too many pods on a single node
|
||||
|
||||
- cluster autoscaling will trigger when needed (if possible!)
|
||||
|
||||
- Thanks to *limits*:
|
||||
|
||||
- our pods won't use more than a given amount of resources
|
||||
|
||||
- they won't use up all the available resources on the node
|
||||
|
||||
- behavior will be more consistent between loaded and unloaded state
|
||||
|
||||
---
|
||||
|
||||
## Memory
|
||||
|
||||
- Personal advice: set request and limit to the same value
|
||||
|
||||
- Check current or historical usage and add a bit of padding
|
||||
|
||||
(the more historical data we have, the less padding we need)
|
||||
|
||||
- Consider 10% padding for "dataless" pods, more for pods with data
|
||||
|
||||
(so that the pod has "reserves" for page cache usage)
|
||||
|
||||
⚠️ Pods hitting their memory limit will be **killed!**
|
||||
|
||||
---
|
||||
|
||||
## CPU
|
||||
|
||||
- It's not necessary to set requests and limits to the same value
|
||||
|
||||
(this would cause a lot of waste for idle workloads)
|
||||
|
||||
- Let's see a few possible strategies!
|
||||
|
||||
---
|
||||
|
||||
## CPU for mostly idle pods
|
||||
|
||||
E.g.: web services, workers handling very few requests...
|
||||
|
||||
- Set the limit to at least one whole core
|
||||
|
||||
(to avoid throttling, especially on bursty workloads)
|
||||
|
||||
- Requests can be very low (e.g. 0.1 core)
|
||||
|
||||
⚠️ If requests are too low and the node is very loaded,
|
||||
the pod will slow down significantly!
|
||||
|
||||
(Because CPU cycles are allocated proportionally to CPU requests.)
|
||||
|
||||
---
|
||||
|
||||
## Inelastic CPU-hungry pods
|
||||
|
||||
- Pods with a fixed number of threads:
|
||||
|
||||
*set requests and limits to that number of threads*
|
||||
|
||||
- Pods where a specific level of performance needs to be guaranteed:
|
||||
|
||||
*set requests and limits to the number of cores providing that performance*
|
||||
|
||||
⚠️ If you set limits to higher levels, performance will be unpredictable!
|
||||
|
||||
(You'll get good performance when the node has extra cycles.)
|
||||
|
||||
---
|
||||
|
||||
## Elastic CPU-hungry pods
|
||||
|
||||
- Pods that could potentially use all the cores
|
||||
|
||||
(e.g. machine learning training and inference, depending on the models)
|
||||
|
||||
- Decide how many pods per node you want to pack
|
||||
|
||||
- Set CPU requests as a fraction of the number of cores of the nodes
|
||||
|
||||
(minus some padding)
|
||||
|
||||
- Example:
|
||||
|
||||
- nodes with 32 cores
|
||||
- we want 4 pods per node
|
||||
- CPU request: 7.5 cores
|
||||
|
||||
- Set limits to a higher level (up to node size)
|
||||
|
||||
---
|
||||
|
||||
## In practice
|
||||
|
||||
- Check memory usage of our Ollama pods:
|
||||
```bash
|
||||
kubectl top pods
|
||||
```
|
||||
(Or even better, look at historical usage in Prometheus or Grafana!)
|
||||
|
||||
- Check how many cores we have on our nodes:
|
||||
```bash
|
||||
kubectl get nodes -o json | jq .items[].status.capacity.cpu
|
||||
kubectl get nodes -o custom-columns=NAME:metadata.name,CPU:status.capacity.cpu
|
||||
```
|
||||
|
||||
- Let's decide that we want two Ollama pods per node
|
||||
|
||||
- What requests/limits should we set?
|
||||
|
||||
---
|
||||
|
||||
## Setting resources for Ollama
|
||||
|
||||
- Assumptions:
|
||||
|
||||
- we want two pods per node
|
||||
- each pod uses ~1500MiB RAM
|
||||
- nodes have 4 cores
|
||||
|
||||
- We'll set memory requests and limits to 2G
|
||||
|
||||
- We'll set CPU requests to 1.5 (4 cores / 2 pods, minus padding)
|
||||
|
||||
- We'll set CPU limits to twice the requests
|
||||
|
||||
```bash
|
||||
kubectl set resources deployment ollama \
|
||||
--requests=cpu=1.5,memory=2G \
|
||||
--limits=cpu=3,memory=2G
|
||||
```
|
||||
|
||||
⚠️ If you have an HAProxy sidecar, this will set its resources too!
|
||||
|
||||
---
|
||||
|
||||
## Results
|
||||
|
||||
- After setting these resource requests, we should see cluster autoscaling
|
||||
|
||||
- If not: scale up the Ollama Deployment to at least 3 replicas
|
||||
|
||||
- Check cluster autoscaler status with:
|
||||
```bash
|
||||
kubectl describe configmap --namespace kube-system cluster-autoscaler-status
|
||||
```
|
||||
@@ -40,7 +40,7 @@ using Kubernetes manifests and tooling.*
|
||||
|
||||
- etc.
|
||||
|
||||
[ArgoCD]: https://github.com/argoproj/argo-cd
|
||||
[ArgoCD]: https://argoproj.github.io/cd/
|
||||
[AWS]: https://aws-controllers-k8s.github.io/community/docs/community/services/
|
||||
[cert-manager]: https://cert-manager.io/
|
||||
[External Secrets Operator]: https://external-secrets.io/
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
- "New" policies
|
||||
|
||||
(available in alpha since Kubernetes 1.22)
|
||||
(available in alpha since Kubernetes 1.22, and GA since Kubernetes 1.25)
|
||||
|
||||
- Easier to use
|
||||
|
||||
@@ -66,50 +66,6 @@ class: extra-details
|
||||
|
||||
---
|
||||
|
||||
## PSA in practice
|
||||
|
||||
- Step 1: enable the PodSecurity admission plugin
|
||||
|
||||
- Step 2: label some Namespaces
|
||||
|
||||
- Step 3: provide an AdmissionConfiguration (optional)
|
||||
|
||||
- Step 4: profit!
|
||||
|
||||
---
|
||||
|
||||
## Enabling PodSecurity
|
||||
|
||||
- This requires Kubernetes 1.22 or later
|
||||
|
||||
- This requires the ability to reconfigure the API server
|
||||
|
||||
- The following slides assume that we're using `kubeadm`
|
||||
|
||||
(and have write access to `/etc/kubernetes/manifests`)
|
||||
|
||||
---
|
||||
|
||||
## Reconfiguring the API server
|
||||
|
||||
- In Kubernetes 1.22, we need to enable the `PodSecurity` feature gate
|
||||
|
||||
- In later versions, this might be enabled automatically
|
||||
|
||||
.lab[
|
||||
|
||||
- Edit `/etc/kubernetes/manifests/kube-apiserver.yaml`
|
||||
|
||||
- In the `command` list, add `--feature-gates=PodSecurity=true`
|
||||
|
||||
- Save, quit, wait for the API server to be back up again
|
||||
|
||||
]
|
||||
|
||||
Note: for bonus points, edit the `kubeadm-config` ConfigMap instead!
|
||||
|
||||
---
|
||||
|
||||
## Namespace labels
|
||||
|
||||
- Three optional labels can be added to namespaces:
|
||||
@@ -277,14 +233,6 @@ Let's use @@LINK[k8s/admission-configuration.yaml]:
|
||||
|
||||
- But the Pods don't get created
|
||||
|
||||
---
|
||||
|
||||
## Clean up
|
||||
|
||||
- We probably want to remove the API server flags that we added
|
||||
|
||||
(the feature gate and the admission configuration)
|
||||
|
||||
???
|
||||
|
||||
:EN:- Preventing privilege escalation with Pod Security Admission
|
||||
|
||||
@@ -124,7 +124,7 @@
|
||||
|
||||
## Admission plugins
|
||||
|
||||
- [PodSecurityPolicy](https://kubernetes.io/docs/concepts/policy/pod-security-policy/) (will be removed in Kubernetes 1.25)
|
||||
- [PodSecurityPolicy](https://kubernetes.io/docs/concepts/policy/pod-security-policy/) (was removed in Kubernetes 1.25)
|
||||
|
||||
- create PodSecurityPolicy resources
|
||||
|
||||
@@ -132,7 +132,7 @@
|
||||
|
||||
- create RoleBinding that grants the Role to a user or ServiceAccount
|
||||
|
||||
- [PodSecurityAdmission](https://kubernetes.io/docs/concepts/security/pod-security-admission/) (alpha since Kubernetes 1.22)
|
||||
- [PodSecurityAdmission](https://kubernetes.io/docs/concepts/security/pod-security-admission/) (alpha since Kubernetes 1.22, stable since 1.25)
|
||||
|
||||
- use pre-defined policies (privileged, baseline, restricted)
|
||||
|
||||
@@ -162,9 +162,31 @@
|
||||
|
||||
---
|
||||
|
||||
## Validating Admission Policies
|
||||
|
||||
- Alternative to validating admission webhooks
|
||||
|
||||
- Evaluated in the API server
|
||||
|
||||
(don't require an external server; don't add network latency)
|
||||
|
||||
- Written in CEL (Common Expression Language)
|
||||
|
||||
- alpha in K8S 1.26; beta in K8S 1.28; GA in K8S 1.30
|
||||
|
||||
- Can replace validating webhooks at least in simple cases
|
||||
|
||||
- Can extend Pod Security Admission
|
||||
|
||||
- Check [the documentation][vapdoc] for examples
|
||||
|
||||
[vapdoc]: https://kubernetes.io/docs/reference/access-authn-authz/validating-admission-policy/
|
||||
|
||||
---
|
||||
|
||||
## Acronym salad
|
||||
|
||||
- PSP = Pod Security Policy
|
||||
- PSP = Pod Security Policy **(deprecated)**
|
||||
|
||||
- an admission plugin called PodSecurityPolicy
|
||||
|
||||
|
||||
@@ -2,11 +2,15 @@
|
||||
|
||||
- "Legacy" policies
|
||||
|
||||
(deprecated since Kubernetes 1.21; will be removed in 1.25)
|
||||
(deprecated since Kubernetes 1.21; removed in 1.25)
|
||||
|
||||
- Superseded by Pod Security Standards + Pod Security Admission
|
||||
|
||||
(available in alpha since Kubernetes 1.22)
|
||||
(available in alpha since Kubernetes 1.22; stable since 1.25)
|
||||
|
||||
- **Since Kubernetes 1.24 was EOL in July 2023, nobody should use PSPs anymore!**
|
||||
|
||||
- This section is here mostly for historical purposes, and can be skipped
|
||||
|
||||
---
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
# Pre-requirements
|
||||
## Pre-requirements
|
||||
|
||||
- Kubernetes concepts
|
||||
|
||||
|
||||
210
slides/k8s/queue-architecture.md
Normal file
210
slides/k8s/queue-architecture.md
Normal file
@@ -0,0 +1,210 @@
|
||||
# Message Queue Architecture
|
||||
|
||||
There are (at least) three ways to distribute load:
|
||||
|
||||
- load balancers
|
||||
|
||||
- batch jobs
|
||||
|
||||
- message queues
|
||||
|
||||
Let's do a quick review of their pros/cons!
|
||||
|
||||
---
|
||||
|
||||
## 1️⃣ Load balancers
|
||||
|
||||
<pre class="mermaid">
|
||||
flowchart TD
|
||||
Client["Client"] ---> LB["Load balancer"]
|
||||
LB ---> B1["Backend"] & B2["Backend"] & B3["Backend"]
|
||||
</pre>
|
||||
|
||||
---
|
||||
|
||||
## Load balancers
|
||||
|
||||
- Latency: ~milliseconds (network latency)
|
||||
|
||||
- Overhead: very low (one extra network hop, one log message?)
|
||||
|
||||
- Great for short requests (a few milliseconds to a minute)
|
||||
|
||||
- Supported out of the box by the Kubernetes Service Proxy
|
||||
|
||||
(by default, this is `kube-proxy`)
|
||||
|
||||
- Suboptimal resource utilization due to imperfect balancing
|
||||
|
||||
(especially when there are multiple load balancers)
|
||||
|
||||
---
|
||||
|
||||
## 2️⃣ Batch jobs
|
||||
|
||||
<pre class="mermaid">
|
||||
flowchart TD
|
||||
subgraph K["Kubernetes Control Plane"]
|
||||
J1["Job"]@{ shape: card}
|
||||
J2["Job"]@{ shape: card}
|
||||
J3["..."]@{ shape: text}
|
||||
J4["Job"]@{ shape: card}
|
||||
end
|
||||
C["Client"] ---> K
|
||||
K <---> N1["Node"] & N2["Node"] & N3["Node"]
|
||||
</pre>
|
||||
|
||||
---
|
||||
|
||||
## Batch jobs
|
||||
|
||||
- Latency: a few seconds (many Kubernetes controllers involved)
|
||||
|
||||
- Overhead: significant due to all the moving pieces involved
|
||||
|
||||
(job controller, scheduler, kubelet; many writes to etcd and logs)
|
||||
|
||||
- Great for long requests (a few minutes to a few days)
|
||||
|
||||
- Supported out of the box by Kubernetes
|
||||
|
||||
(`kubectl create job hello --image alpine -- sleep 60`)
|
||||
|
||||
- Asynchronous processing requires some refactoring
|
||||
|
||||
(we don't get the response immediately)
|
||||
|
||||
---
|
||||
|
||||
## 3️⃣ Message queues
|
||||
|
||||
<pre class="mermaid">
|
||||
flowchart TD
|
||||
subgraph Q["Message queue"]
|
||||
M1["Message"]@{ shape: card}
|
||||
M2["Message"]@{ shape: card}
|
||||
M3["..."]@{ shape: text}
|
||||
M4["Message"]@{ shape: card}
|
||||
end
|
||||
C["Client"] ---> Q
|
||||
Q <---> W1["Worker"] & W2["Worker"] & W3["Worker"]
|
||||
</pre>
|
||||
|
||||
---
|
||||
|
||||
## Message queues
|
||||
|
||||
- Latency: a few milliseconds to a few seconds
|
||||
|
||||
- Overhead: intermediate
|
||||
|
||||
(very low with e.g. Redis, higher with e.g. Kafka)
|
||||
|
||||
- Great for all except very short requests
|
||||
|
||||
- Requires additional setup
|
||||
|
||||
- Asynchronous processing requires some refactoring
|
||||
|
||||
---
|
||||
|
||||
## Dealing with errors
|
||||
|
||||
- Load balancers
|
||||
|
||||
- errors reported immediately (client must retry)
|
||||
- some load balancers can retry automatically
|
||||
|
||||
- Batch jobs
|
||||
|
||||
- Kubernetes retries automatically
|
||||
- after `backoffLimit` retries, Job is marked as failed
|
||||
|
||||
- Message queues
|
||||
|
||||
- some queues have a concept of "acknowledgement"
|
||||
- some queues have a concept of "dead letter queue"
|
||||
- some extra work is required
|
||||
|
||||
---
|
||||
|
||||
## Some queue brokers
|
||||
|
||||
- Redis (with e.g. RPUSH, BLPOP)
|
||||
|
||||
*light, fast, easy to set up... no durability guarantee, no acknowledgement, no dead letter queue*
|
||||
|
||||
- Kafka
|
||||
|
||||
*heavy, complex to set up... strong deliverability guarantee, full featured*
|
||||
|
||||
- RabbitMQ
|
||||
|
||||
*somewhat in-between Redis and Kafka*
|
||||
|
||||
- SQL databases
|
||||
|
||||
*often requires polling, which adds extra latency; not as scalable as a "true" broker*
|
||||
|
||||
---
|
||||
|
||||
## More queue brokers
|
||||
|
||||
Many cloud providers offer hosted message queues (e.g.: Amazon SQS).
|
||||
|
||||
These are usually great options, with some drawbacks:
|
||||
|
||||
- vendor lock-in
|
||||
|
||||
- setting up extra environments (testing, staging...) can be more complex
|
||||
|
||||
(Setting up a singleton environment is usually very easy, thanks to web UI, CLI, etc.; setting up extra environments and assigning the right permissions with e.g. IAC is usually significantly more complex.)
|
||||
|
||||
---
|
||||
|
||||
## Implementing a message queue
|
||||
|
||||
1. Pick a broker
|
||||
|
||||
2. Deploy the broker
|
||||
|
||||
3. Set up the queue
|
||||
|
||||
4. Refactor our code
|
||||
|
||||
---
|
||||
|
||||
## Code refactoring (client)
|
||||
|
||||
Before:
|
||||
```python
|
||||
response = http.POST("http://api", payload=Request(...))
|
||||
```
|
||||
|
||||
After:
|
||||
```python
|
||||
client = queue.connect(...)
|
||||
client.publish(message=Request(...))
|
||||
```
|
||||
|
||||
Note: we don't get the response right away (if at all)!
|
||||
|
||||
---
|
||||
|
||||
## Code refactoring (server)
|
||||
|
||||
Before:
|
||||
```python
|
||||
server = http.server(request_handler=handler)
|
||||
server.listen("80")
|
||||
server.run()
|
||||
```
|
||||
|
||||
After:
|
||||
```python
|
||||
client = queue.connect(...)
|
||||
while true:
|
||||
message = client.consume()
|
||||
response = handler(message)
|
||||
# Write the response somewhere
|
||||
```
|
||||
@@ -1,4 +1,4 @@
|
||||
# Resource Limits
|
||||
# Allocating compute resources in theory
|
||||
|
||||
- We can attach resource indications to our pods
|
||||
|
||||
@@ -6,11 +6,53 @@
|
||||
|
||||
- We can specify *limits* and/or *requests*
|
||||
|
||||
- We can specify quantities of CPU and/or memory
|
||||
- We can specify quantities of CPU and/or memory and/or ephemeral storage
|
||||
|
||||
---
|
||||
|
||||
## CPU vs memory
|
||||
## Requests vs limits
|
||||
|
||||
- *Requests* are *guaranteed reservations* of resources
|
||||
|
||||
- They are used for scheduling purposes
|
||||
|
||||
- Kubelet will use cgroups to e.g. guarantee a minimum amount of CPU time
|
||||
|
||||
- A container **can** use more than its requested resources
|
||||
|
||||
- A container using *less* than what it requested should never be killed or throttled
|
||||
|
||||
- A node **cannot** be overcommitted with requests
|
||||
|
||||
(the sum of all requests **cannot** be higher than resources available on the node)
|
||||
|
||||
- A small amount of resources is set aside for system components
|
||||
|
||||
(this explains why there is a difference between "capacity" and "allocatable")
|
||||
|
||||
---
|
||||
|
||||
## Requests vs limits
|
||||
|
||||
- *Limits* are "hard limits" (a container **cannot** exceed its limits)
|
||||
|
||||
- They aren't taken into account by the scheduler
|
||||
|
||||
- A container exceeding its memory limit is killed instantly
|
||||
|
||||
(by the kernel out-of-memory killer)
|
||||
|
||||
- A container exceeding its CPU limit is throttled
|
||||
|
||||
- A container exceeding its disk limit is killed
|
||||
|
||||
(usually with a small delay, since this is checked periodically by kubelet)
|
||||
|
||||
- On a given node, the sum of all limits **can** be higher than the node size
|
||||
|
||||
---
|
||||
|
||||
## Compressible vs incompressible resources
|
||||
|
||||
- CPU is a *compressible resource*
|
||||
|
||||
@@ -24,7 +66,29 @@
|
||||
|
||||
- if we have N GB RAM and need 2N, we might run at... 0.1% speed!
|
||||
|
||||
- As a result, exceeding limits will have different consequences for CPU and memory
|
||||
- Disk is also an *incompressible resource*
|
||||
|
||||
- when the disk is full, writes will fail
|
||||
|
||||
- applications may or may not crash but persistent apps will be in trouble
|
||||
|
||||
---
|
||||
|
||||
## Running low on CPU
|
||||
|
||||
- Two ways for a container to "run low" on CPU:
|
||||
|
||||
- it's hitting its CPU limit
|
||||
|
||||
- all CPUs on the node are at 100% utilization
|
||||
|
||||
- The app in the container will run slower
|
||||
|
||||
(compared to running without a limit, or if CPU cycles were available)
|
||||
|
||||
- No other consequence
|
||||
|
||||
(but this could affect SLA/SLO for latency-sensitive applications!)
|
||||
|
||||
---
|
||||
|
||||
@@ -130,15 +194,13 @@ class: extra-details
|
||||
|
||||
- use [static CPU manager policy](https://kubernetes.io/docs/tasks/administer-cluster/cpu-management-policies/#static-policy)
|
||||
|
||||
For more details, check [this blog post](https://erickhun.com/posts/kubernetes-faster-services-no-cpu-limits/) or these ones ([part 1](https://engineering.indeedblog.com/blog/2019/12/unthrottled-fixing-cpu-limits-in-the-cloud/), [part 2](https://engineering.indeedblog.com/blog/2019/12/cpu-throttling-regression-fix/)).
|
||||
For more details, check [this blog post](https://erickhun.com/posts/kubernetes-faster-services-no-cpu-limits/) or these: ([part 1](https://engineering.indeedblog.com/blog/2019/12/unthrottled-fixing-cpu-limits-in-the-cloud/), [part 2](https://engineering.indeedblog.com/blog/2019/12/cpu-throttling-regression-fix/)).
|
||||
|
||||
---
|
||||
|
||||
## Running low on memory
|
||||
|
||||
- When the system runs low on memory, it starts to reclaim used memory
|
||||
|
||||
(we talk about "memory pressure")
|
||||
- When the kernel runs low on memory, it starts to reclaim used memory
|
||||
|
||||
- Option 1: free up some buffers and caches
|
||||
|
||||
@@ -162,71 +224,91 @@ For more details, check [this blog post](https://erickhun.com/posts/kubernetes-f
|
||||
|
||||
- If a container exceeds its memory *limit*, it gets killed immediately
|
||||
|
||||
- If a node is overcommitted and under memory pressure, it will terminate some pods
|
||||
- If a node memory usage gets too high, it will *evict* some pods
|
||||
|
||||
(see next slide for some details about what "overcommit" means here!)
|
||||
(we say that the node is "under pressure", more on that in a bit!)
|
||||
|
||||
[KEP 2400]: https://github.com/kubernetes/enhancements/blob/master/keps/sig-node/2400-node-swap/README.md#implementation-history
|
||||
|
||||
---
|
||||
|
||||
## Overcommitting resources
|
||||
## Running low on disk
|
||||
|
||||
- *Limits* are "hard limits" (a container *cannot* exceed its limits)
|
||||
- When the kubelet runs low on disk, it starts to reclaim disk space
|
||||
|
||||
- a container exceeding its memory limit is killed
|
||||
(similarly to what the kernel does, but in different categories)
|
||||
|
||||
- a container exceeding its CPU limit is throttled
|
||||
- Option 1: garbage collect dead pods and containers
|
||||
|
||||
- On a given node, the sum of pod *limits* can be higher than the node size
|
||||
(no consequence, but their logs will be deleted)
|
||||
|
||||
- *Requests* are used for scheduling purposes
|
||||
- Option 2: remove unused images
|
||||
|
||||
- a container can use more than its requested CPU or RAM amounts
|
||||
(no consequence, but these images will have to be repulled if we need them later)
|
||||
|
||||
- a container using *less* than what it requested should never be killed or throttled
|
||||
- Option 3: evict pods and remove them to reclaim their disk usage
|
||||
|
||||
- On a given node, the sum of pod *requests* cannot be higher than the node size
|
||||
- Note: this only applies to *ephemeral storage*, not to e.g. Persistent Volumes!
|
||||
|
||||
---
|
||||
|
||||
## Pod quality of service
|
||||
## Ephemeral storage?
|
||||
|
||||
Each pod is assigned a QoS class (visible in `status.qosClass`).
|
||||
- This includes:
|
||||
|
||||
- If limits = requests:
|
||||
- the *read-write layer* of the container
|
||||
<br/>
|
||||
(any file creation/modification outside of its volumes)
|
||||
|
||||
- as long as the container uses less than the limit, it won't be affected
|
||||
- `emptyDir` volumes mounted in the container
|
||||
|
||||
- if all containers in a pod have *(limits=requests)*, QoS is considered "Guaranteed"
|
||||
- the container logs stored on the node
|
||||
|
||||
- If requests < limits:
|
||||
- This does not include:
|
||||
|
||||
- as long as the container uses less than the request, it won't be affected
|
||||
- the container image
|
||||
|
||||
- otherwise, it might be killed/evicted if the node gets overloaded
|
||||
|
||||
- if at least one container has *(requests<limits)*, QoS is considered "Burstable"
|
||||
|
||||
- If a pod doesn't have any request nor limit, QoS is considered "BestEffort"
|
||||
- other types of volumes (e.g. Persistent Volumes, `hostPath`, or `local` volumes)
|
||||
|
||||
---
|
||||
|
||||
## Quality of service impact
|
||||
class: extra-details
|
||||
|
||||
- When a node is overloaded, BestEffort pods are killed first
|
||||
## Disk limit enforcement
|
||||
|
||||
- Then, Burstable pods that exceed their requests
|
||||
- Disk usage is periodically measured by kubelet
|
||||
|
||||
- Burstable and Guaranteed pods below their requests are never killed
|
||||
(with something equivalent to `du`)
|
||||
|
||||
(except if their node fails)
|
||||
- There can be a small delay before pod termination when disk limit is exceeded
|
||||
|
||||
- If we only use Guaranteed pods, no pod should ever be killed
|
||||
- It's also possible to enable filesystem *project quotas*
|
||||
|
||||
(as long as they stay within their limits)
|
||||
(e.g. with EXT4 or XFS)
|
||||
|
||||
(Pod QoS is also explained in [this page](https://kubernetes.io/docs/tasks/configure-pod-container/quality-service-pod/) of the Kubernetes documentation and in [this blog post](https://medium.com/google-cloud/quality-of-service-class-qos-in-kubernetes-bb76a89eb2c6).)
|
||||
- Remember that container logs are also accounted for!
|
||||
|
||||
(container log rotation/retention is managed by kubelet)
|
||||
|
||||
---
|
||||
|
||||
class: extra-details
|
||||
|
||||
## `nodefs` and `imagefs`
|
||||
|
||||
- `nodefs` is the main filesystem of the node
|
||||
|
||||
(holding, notably, `emptyDir` volumes and container logs)
|
||||
|
||||
- Optionally, the container engine can be configured to use an `imagefs`
|
||||
|
||||
- `imagefs` will store container images and container writable layers
|
||||
|
||||
- When there is a separate `imagefs`, its disk usage is tracked independently
|
||||
|
||||
- If `imagefs` usage gets too high, kubelet will remove old images first
|
||||
|
||||
(conversely, if `nodefs` usage gets too high, kubelet won't remove old images)
|
||||
|
||||
---
|
||||
|
||||
@@ -304,7 +386,47 @@ class: extra-details
|
||||
|
||||
---
|
||||
|
||||
## Specifying resources
|
||||
## Pod quality of service
|
||||
|
||||
Each pod is assigned a QoS class (visible in `status.qosClass`).
|
||||
|
||||
- If limits = requests:
|
||||
|
||||
- as long as the container uses less than the limit, it won't be affected
|
||||
|
||||
- if all containers in a pod have *(limits=requests)*, QoS is considered "Guaranteed"
|
||||
|
||||
- If requests < limits:
|
||||
|
||||
- as long as the container uses less than the request, it won't be affected
|
||||
|
||||
- otherwise, it might be killed/evicted if the node gets overloaded
|
||||
|
||||
- if at least one container has *(requests<limits)*, QoS is considered "Burstable"
|
||||
|
||||
- If a pod doesn't have any request nor limit, QoS is considered "BestEffort"
|
||||
|
||||
---
|
||||
|
||||
## Quality of service impact
|
||||
|
||||
- When a node is overloaded, BestEffort pods are killed first
|
||||
|
||||
- Then, Burstable pods that exceed their requests
|
||||
|
||||
- Burstable and Guaranteed pods below their requests are never killed
|
||||
|
||||
(except if their node fails)
|
||||
|
||||
- If we only use Guaranteed pods, no pod should ever be killed
|
||||
|
||||
(as long as they stay within their limits)
|
||||
|
||||
(Pod QoS is also explained in [this page](https://kubernetes.io/docs/tasks/configure-pod-container/quality-service-pod/) of the Kubernetes documentation and in [this blog post](https://medium.com/google-cloud/quality-of-service-class-qos-in-kubernetes-bb76a89eb2c6).)
|
||||
|
||||
---
|
||||
|
||||
# Allocating compute resources in practice
|
||||
|
||||
- Resource requests are expressed at the *container* level
|
||||
|
||||
@@ -316,9 +438,9 @@ class: extra-details
|
||||
|
||||
(so 100m = 0.1)
|
||||
|
||||
- Memory is expressed in bytes
|
||||
- Memory and ephemeral disk storage are expressed in bytes
|
||||
|
||||
- Memory can be expressed with k, M, G, T, ki, Mi, Gi, Ti suffixes
|
||||
- These can have k, M, G, T, ki, Mi, Gi, Ti suffixes
|
||||
|
||||
(corresponding to 10^3, 10^6, 10^9, 10^12, 2^10, 2^20, 2^30, 2^40)
|
||||
|
||||
@@ -334,11 +456,13 @@ containers:
|
||||
image: jpetazzo/color
|
||||
resources:
|
||||
limits:
|
||||
memory: "100Mi"
|
||||
cpu: "100m"
|
||||
requests:
|
||||
ephemeral-storage: 10M
|
||||
memory: "100Mi"
|
||||
requests:
|
||||
cpu: "10m"
|
||||
ephemeral-storage: 10M
|
||||
memory: "100Mi"
|
||||
```
|
||||
|
||||
This set of resources makes sure that this service won't be killed (as long as it stays below 100 MB of RAM), but allows its CPU usage to be throttled if necessary.
|
||||
@@ -365,7 +489,7 @@ This set of resources makes sure that this service won't be killed (as long as i
|
||||
|
||||
---
|
||||
|
||||
## We need default resource values
|
||||
## We need to specify resource values
|
||||
|
||||
- If we do not set resource values at all:
|
||||
|
||||
@@ -379,9 +503,33 @@ This set of resources makes sure that this service won't be killed (as long as i
|
||||
|
||||
- if the request is zero, the scheduler can't make a smart placement decision
|
||||
|
||||
- To address this, we can set default values for resources
|
||||
- This is fine when learning/testing, absolutely not in production!
|
||||
|
||||
- This is done with a LimitRange object
|
||||
---
|
||||
|
||||
## How should we set resources?
|
||||
|
||||
- Option 1: manually, for each container
|
||||
|
||||
- simple, effective, but tedious
|
||||
|
||||
- Option 2: automatically, with the [Vertical Pod Autoscaler (VPA)][vpa]
|
||||
|
||||
- relatively simple, very minimal involvement beyond initial setup
|
||||
|
||||
- not compatible with HPAv1, can disrupt long-running workloads (see [limitations][vpa-limitations])
|
||||
|
||||
- Option 3: semi-automatically, with tools like [Robusta KRR][robusta]
|
||||
|
||||
- good compromise between manual work and automation
|
||||
|
||||
- Option 4: by creating LimitRanges in our Namespaces
|
||||
|
||||
- relatively simple, but "one-size-fits-all" approach might not always work
|
||||
|
||||
[robusta]: https://github.com/robusta-dev/krr
|
||||
[vpa]: https://github.com/kubernetes/autoscaler/tree/master/vertical-pod-autoscaler
|
||||
[vpa-limitations]: https://github.com/kubernetes/autoscaler/tree/master/vertical-pod-autoscaler#known-limitations
|
||||
|
||||
---
|
||||
|
||||
@@ -458,246 +606,6 @@ per Pod, but it's not [officially documented yet](https://github.com/kubernetes/
|
||||
|
||||
---
|
||||
|
||||
# Namespace quotas
|
||||
|
||||
- We can also set quotas per namespace
|
||||
|
||||
- Quotas apply to the total usage in a namespace
|
||||
|
||||
(e.g. total CPU limits of all pods in a given namespace)
|
||||
|
||||
- Quotas can apply to resource limits and/or requests
|
||||
|
||||
(like the CPU and memory limits that we saw earlier)
|
||||
|
||||
- Quotas can also apply to other resources:
|
||||
|
||||
- "extended" resources (like GPUs)
|
||||
|
||||
- storage size
|
||||
|
||||
- number of objects (number of pods, services...)
|
||||
|
||||
---
|
||||
|
||||
## Creating a quota for a namespace
|
||||
|
||||
- Quotas are enforced by creating a ResourceQuota object
|
||||
|
||||
- ResourceQuota objects are namespaced, and apply to their namespace only
|
||||
|
||||
- We can have multiple ResourceQuota objects in the same namespace
|
||||
|
||||
- The most restrictive values are used
|
||||
|
||||
---
|
||||
|
||||
## Limiting total CPU/memory usage
|
||||
|
||||
- The following YAML specifies an upper bound for *limits* and *requests*:
|
||||
```yaml
|
||||
apiVersion: v1
|
||||
kind: ResourceQuota
|
||||
metadata:
|
||||
name: a-little-bit-of-compute
|
||||
spec:
|
||||
hard:
|
||||
requests.cpu: "10"
|
||||
requests.memory: 10Gi
|
||||
limits.cpu: "20"
|
||||
limits.memory: 20Gi
|
||||
```
|
||||
|
||||
These quotas will apply to the namespace where the ResourceQuota is created.
|
||||
|
||||
---
|
||||
|
||||
## Limiting number of objects
|
||||
|
||||
- The following YAML specifies how many objects of specific types can be created:
|
||||
```yaml
|
||||
apiVersion: v1
|
||||
kind: ResourceQuota
|
||||
metadata:
|
||||
name: quota-for-objects
|
||||
spec:
|
||||
hard:
|
||||
pods: 100
|
||||
services: 10
|
||||
secrets: 10
|
||||
configmaps: 10
|
||||
persistentvolumeclaims: 20
|
||||
services.nodeports: 0
|
||||
services.loadbalancers: 0
|
||||
count/roles.rbac.authorization.k8s.io: 10
|
||||
```
|
||||
|
||||
(The `count/` syntax allows limiting arbitrary objects, including CRDs.)
|
||||
|
||||
---
|
||||
|
||||
## YAML vs CLI
|
||||
|
||||
- Quotas can be created with a YAML definition
|
||||
|
||||
- ...Or with the `kubectl create quota` command
|
||||
|
||||
- Example:
|
||||
```bash
|
||||
kubectl create quota my-resource-quota --hard=pods=300,limits.memory=300Gi
|
||||
```
|
||||
|
||||
- With both YAML and CLI form, the values are always under the `hard` section
|
||||
|
||||
(there is no `soft` quota)
|
||||
|
||||
---
|
||||
|
||||
## Viewing current usage
|
||||
|
||||
When a ResourceQuota is created, we can see how much of it is used:
|
||||
|
||||
```
|
||||
kubectl describe resourcequota my-resource-quota
|
||||
|
||||
Name: my-resource-quota
|
||||
Namespace: default
|
||||
Resource Used Hard
|
||||
-------- ---- ----
|
||||
pods 12 100
|
||||
services 1 5
|
||||
services.loadbalancers 0 0
|
||||
services.nodeports 0 0
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Advanced quotas and PriorityClass
|
||||
|
||||
- Pods can have a *priority*
|
||||
|
||||
- The priority is a number from 0 to 1000000000
|
||||
|
||||
(or even higher for system-defined priorities)
|
||||
|
||||
- High number = high priority = "more important" Pod
|
||||
|
||||
- Pods with a higher priority can *preempt* Pods with lower priority
|
||||
|
||||
(= low priority pods will be *evicted* if needed)
|
||||
|
||||
- Useful when mixing workloads in resource-constrained environments
|
||||
|
||||
---
|
||||
|
||||
## Setting the priority of a Pod
|
||||
|
||||
- Create a PriorityClass
|
||||
|
||||
(or use an existing one)
|
||||
|
||||
- When creating the Pod, set the field `spec.priorityClassName`
|
||||
|
||||
- If the field is not set:
|
||||
|
||||
- if there is a PriorityClass with `globalDefault`, it is used
|
||||
|
||||
- otherwise, the default priority will be zero
|
||||
|
||||
---
|
||||
|
||||
class: extra-details
|
||||
|
||||
## PriorityClass and ResourceQuotas
|
||||
|
||||
- A ResourceQuota can include a list of *scopes* or a *scope selector*
|
||||
|
||||
- In that case, the quota will only apply to the scoped resources
|
||||
|
||||
- Example: limit the resources allocated to "high priority" Pods
|
||||
|
||||
- In that case, make sure that the quota is created in every Namespace
|
||||
|
||||
(or use *admission configuration* to enforce it)
|
||||
|
||||
- See the [resource quotas documentation][quotadocs] for details
|
||||
|
||||
[quotadocs]: https://kubernetes.io/docs/concepts/policy/resource-quotas/#resource-quota-per-priorityclass
|
||||
|
||||
---
|
||||
|
||||
# Limiting resources in practice
|
||||
|
||||
- We have at least three mechanisms:
|
||||
|
||||
- requests and limits per Pod
|
||||
|
||||
- LimitRange per namespace
|
||||
|
||||
- ResourceQuota per namespace
|
||||
|
||||
- Let's see a simple recommendation to get started with resource limits
|
||||
|
||||
---
|
||||
|
||||
## Set a LimitRange
|
||||
|
||||
- In each namespace, create a LimitRange object
|
||||
|
||||
- Set a small default CPU request and CPU limit
|
||||
|
||||
(e.g. "100m")
|
||||
|
||||
- Set a default memory request and limit depending on your most common workload
|
||||
|
||||
- for Java, Ruby: start with "1G"
|
||||
|
||||
- for Go, Python, PHP, Node: start with "250M"
|
||||
|
||||
- Set upper bounds slightly below your expected node size
|
||||
|
||||
(80-90% of your node size, with at least a 500M memory buffer)
|
||||
|
||||
---
|
||||
|
||||
## Set a ResourceQuota
|
||||
|
||||
- In each namespace, create a ResourceQuota object
|
||||
|
||||
- Set generous CPU and memory limits
|
||||
|
||||
(e.g. half the cluster size if the cluster hosts multiple apps)
|
||||
|
||||
- Set generous objects limits
|
||||
|
||||
- these limits should not be here to constrain your users
|
||||
|
||||
- they should catch a runaway process creating many resources
|
||||
|
||||
- example: a custom controller creating many pods
|
||||
|
||||
---
|
||||
|
||||
## Observe, refine, iterate
|
||||
|
||||
- Observe the resource usage of your pods
|
||||
|
||||
(we will see how in the next chapter)
|
||||
|
||||
- Adjust individual pod limits
|
||||
|
||||
- If you see trends: adjust the LimitRange
|
||||
|
||||
(rather than adjusting every individual set of pod limits)
|
||||
|
||||
- Observe the resource usage of your namespaces
|
||||
|
||||
(with `kubectl describe resourcequota ...`)
|
||||
|
||||
- Rinse and repeat regularly
|
||||
|
||||
---
|
||||
|
||||
## Underutilization
|
||||
|
||||
- Remember: when assigning a pod to a node, the scheduler looks at *requests*
|
||||
|
||||
@@ -352,6 +352,87 @@ class: pic
|
||||
class: pic
|
||||

|
||||
|
||||
---
|
||||
|
||||
class: extra-details
|
||||
|
||||
## Traffic engineering
|
||||
|
||||
- By default, connections to a ClusterIP or a NodePort are load balanced
|
||||
across all the backends of their Service
|
||||
|
||||
- This can incur extra network hops (which add latency)
|
||||
|
||||
- To remove that extra hop, multiple mechanisms are available:
|
||||
|
||||
- `spec.externalTrafficPolicy`
|
||||
|
||||
- `spec.internalTrafficPolicy`
|
||||
|
||||
- [Topology aware routing](https://kubernetes.io/docs/concepts/services-networking/topology-aware-routing/) annotation (beta)
|
||||
|
||||
- `spec.trafficDistribution` (alpha in 1.30, beta in 1.31)
|
||||
|
||||
---
|
||||
|
||||
## `internal / externalTrafficPolicy`
|
||||
|
||||
- Applies respectively to `ClusterIP` and `NodePort` connections
|
||||
|
||||
- Can be set to `Cluster` or `Local`
|
||||
|
||||
- `Cluster`: load balance connections across all backends (default)
|
||||
|
||||
- `Local`: load balance connections to local backends (on the same node)
|
||||
|
||||
- With `Local`, if there is no local backend, the connection will fail!
|
||||
|
||||
(the parameter expresses a "hard rule", not a preference)
|
||||
|
||||
- Example: `externalTrafficPolicy: Local` for Ingress controllers
|
||||
|
||||
(as shown on earlier diagrams)
|
||||
|
||||
---
|
||||
|
||||
class: extra-details
|
||||
|
||||
## Topology aware routing
|
||||
|
||||
- In beta since Kubernetes 1.23
|
||||
|
||||
- Enabled with annotation `service.kubernetes.io/topology-mode=Auto`
|
||||
|
||||
- Relies on node label `topology.kubernetes.io/zone`
|
||||
|
||||
- Kubernetes service proxy will try to keep connections within a zone
|
||||
|
||||
(connections made by a pod in zone `a` will be sent to pods in zone `a`)
|
||||
|
||||
- ...Except if there are no pods in the zone (then fallback to all zones)
|
||||
|
||||
- This can mess up autoscaling!
|
||||
|
||||
---
|
||||
|
||||
class: extra-details
|
||||
|
||||
## `spec.trafficDistribution`
|
||||
|
||||
- [KEP4444, Traffic Distribution for Services][kep4444]
|
||||
|
||||
- In alpha since Kubernetes 1.30, beta since Kubernetes 1.31
|
||||
|
||||
- Should eventually supersede topology aware routing
|
||||
|
||||
- Can be set to `PreferClose` (more values might be supported later)
|
||||
|
||||
- The meaning of `PreferClose` is implementation dependent
|
||||
|
||||
(with kube-proxy, it should work like topology aware routing: stay in a zone)
|
||||
|
||||
[kep4444]: https://github.com/kubernetes/enhancements/issues/4444
|
||||
|
||||
???
|
||||
|
||||
:EN:- Service types: ClusterIP, NodePort, LoadBalancer
|
||||
|
||||
@@ -144,6 +144,30 @@
|
||||
|
||||
---
|
||||
|
||||
## [Orbstack](https://orbstack.dev/)
|
||||
|
||||
- Mac only
|
||||
|
||||
- Runs Docker containers, Kubernetes, and Linux virtual machines
|
||||
|
||||
- Emphasis on speed and energy usage (battery life)
|
||||
|
||||
- Great support for `ClusterIP` and `LoadBalancer` services
|
||||
|
||||
- Free for personal use; paid product otherwise
|
||||
|
||||
---
|
||||
|
||||
## [Podman Desktop](https://podman-desktop.io/)
|
||||
|
||||
- Available on Linux, Mac, and Windows
|
||||
|
||||
- Free and open-source
|
||||
|
||||
- Doesn't support Kubernetes directly, but [supports KinD](https://podman-desktop.io/docs/kind)
|
||||
|
||||
---
|
||||
|
||||
## [Rancher Desktop](https://rancherdesktop.io/)
|
||||
|
||||
- Available on Linux, Mac, and Windows
|
||||
@@ -158,8 +182,6 @@
|
||||
|
||||
- Emphasis on ease of use (like Docker Desktop)
|
||||
|
||||
- Relatively young product (first release in May 2021)
|
||||
|
||||
- Based on k3s and other proven components
|
||||
|
||||
---
|
||||
|
||||
@@ -166,17 +166,15 @@
|
||||
|
||||
- [Kubernetes The Hard Way](https://github.com/kelseyhightower/kubernetes-the-hard-way) by Kelsey Hightower
|
||||
|
||||
- step by step guide to install Kubernetes on Google Cloud
|
||||
|
||||
- covers certificates, high availability ...
|
||||
|
||||
- *“Kubernetes The Hard Way is optimized for learning, which means taking the long route to ensure you understand each task required to bootstrap a Kubernetes cluster.”*
|
||||
*step by step guide to install Kubernetes on GCP, with certificates, HA...*
|
||||
|
||||
- [Deep Dive into Kubernetes Internals for Builders and Operators](https://www.youtube.com/watch?v=3KtEAa7_duA)
|
||||
|
||||
- conference presentation showing step-by-step control plane setup
|
||||
*conference talk setting up a simplified Kubernetes cluster - no security or HA*
|
||||
|
||||
- emphasis on simplicity, not on security and availability
|
||||
- 🇫🇷[Démystifions les composants internes de Kubernetes](https://www.youtube.com/watch?v=OCMNA0dSAzc)
|
||||
|
||||
*improved version of the previous one, with certs and recent k8s versions*
|
||||
|
||||
---
|
||||
|
||||
|
||||
@@ -77,7 +77,7 @@ This is the flag that we're looking for:
|
||||
|
||||
- We only need to transfer the CSR (Certificate Signing Request) to the CA
|
||||
|
||||
  (we never need to expose the private key)
|
||||
(we never need to expose the private key)
|
||||
|
||||
.lab[
|
||||
|
||||
|
||||
@@ -245,9 +245,9 @@
|
||||
|
||||
- command-line flags
|
||||
|
||||
- Precedence of the different methods is defined in the [docs]
|
||||
- Precedence of the different methods is defined in the [docs][data-values-merge-order]
|
||||
|
||||
[docs]: https://carvel.dev/ytt/docs/v0.41.0/ytt-data-values/#data-values-merge-order
|
||||
[data-values-merge-order]: https://carvel.dev/ytt/docs/v0.41.0/ytt-data-values/#data-values-merge-order
|
||||
|
||||
---
|
||||
|
||||
@@ -462,13 +462,13 @@ spec:
|
||||
|
||||
- By default, `#@overlay/match` must find *exactly* one match
|
||||
|
||||
(that can be changed by specifying `expects=...`, `missing_ok=True`... see [docs])
|
||||
(that can be changed by specifying `expects=...`, `missing_ok=True`... see [docs][docs-ytt-overlaymatch])
|
||||
|
||||
- By default, the specified fields (here, `spec.replicas`) must exist
|
||||
|
||||
(that can also be changed by annotating the optional fields)
|
||||
|
||||
[docs]: https://carvel.dev/ytt/docs/v0.41.0/lang-ref-ytt-overlay/#overlaymatch
|
||||
[docs-ytt-overlaymatch]: https://carvel.dev/ytt/docs/v0.41.0/lang-ref-ytt-overlay/#overlaymatch
|
||||
|
||||
---
|
||||
|
||||
@@ -573,7 +573,7 @@ metadata:
|
||||
|
||||
## Overlays vs data values
|
||||
|
||||
- The documentation has a [detailed discussion][docs] about this question
|
||||
- The documentation has a [detailed discussion][data-values-vs-overlays] about this question
|
||||
|
||||
- In short:
|
||||
|
||||
@@ -587,7 +587,7 @@ metadata:
|
||||
|
||||
(keeping in mind that overlays are harder to write/understand/maintain)
|
||||
|
||||
[docs]: https://carvel.dev/ytt/docs/v0.41.0/data-values-vs-overlays/
|
||||
[data-values-vs-overlays]: https://carvel.dev/ytt/docs/v0.41.0/data-values-vs-overlays/
|
||||
|
||||
---
|
||||
|
||||
|
||||
@@ -1,82 +1,44 @@
|
||||
## Introductions
|
||||
|
||||
⚠️ This slide should be customized by the tutorial instructor(s).
|
||||
Hello! We are:
|
||||
|
||||
<!--
|
||||
- Jérôme Petazzoni ([@jpetazzo@hachyderm.io], [/in/jpetazzo][jp-linkedin])
|
||||
|
||||
- Hello! We are:
|
||||
- freelance Docker¹ / Kubernetes / MLops consultant and trainer
|
||||
|
||||
- 👷🏻♀️ AJ ([@s0ulshake], [EphemeraSearch], [Quantgene])
|
||||
- AJ Bowen ([GitHub: @soulshake][aj-github], [LinkedIn: AJ Bowen][aj-linkedin])
|
||||
|
||||
- 🚁 Alexandre ([@alexbuisine], Enix SAS)
|
||||
- freelance k8s/IaC/CI/CD/devOps engineer and consultant
|
||||
|
||||
- 🐳 Jérôme ([@jpetazzo], [@jpetazzo@hachyderm.io], Ardan Labs)
|
||||
- founder of [EphemeraSearch]
|
||||
|
||||
- 🐳 Jérôme ([@jpetazzo], [@jpetazzo@hachyderm.io], Enix SAS)
|
||||
.footnote[¹I worked at Docker from 2011 to 2018.
|
||||
I ran containers in production before it was cool. 😎]
|
||||
|
||||
- 🐳 Jérôme ([@jpetazzo], [@jpetazzo@hachyderm.io], Tiny Shell Script LLC)
|
||||
|
||||
-->
|
||||
|
||||
<!--
|
||||
|
||||
- The training will run for 4 hours, with a 10 minutes break every hour
|
||||
|
||||
(the middle break will be a bit longer)
|
||||
|
||||
-->
|
||||
|
||||
<!--
|
||||
|
||||
- The workshop will run from XXX to YYY
|
||||
|
||||
- There will be a lunch break at ZZZ
|
||||
|
||||
(And coffee breaks!)
|
||||
|
||||
-->
|
||||
|
||||
<!--
|
||||
|
||||
- Feel free to interrupt for questions at any time
|
||||
|
||||
- *Especially when you see full screen container pictures!*
|
||||
|
||||
- Live feedback, questions, help: @@CHAT@@
|
||||
|
||||
-->
|
||||
|
||||
<!--
|
||||
|
||||
- You ~~should~~ must ask questions! Lots of questions!
|
||||
|
||||
(especially when you see full screen container pictures)
|
||||
|
||||
- Use @@CHAT@@ to ask questions, get help, etc.
|
||||
|
||||
-->
|
||||
|
||||
<!-- -->
|
||||
|
||||
[@alexbuisine]: https://twitter.com/alexbuisine
|
||||
[EphemeraSearch]: https://ephemerasearch.com/
|
||||
[@jpetazzo]: https://twitter.com/jpetazzo
|
||||
[aj-github]: https://github.com/soulshake
|
||||
[aj-linkedin]: https://linkedin.com/in/ajbowen
|
||||
[jp-linkedin]: https://linkedin.com/in/jpetazzo
|
||||
[@jpetazzo@hachyderm.io]: https://hachyderm.io/@jpetazzo
|
||||
[@s0ulshake]: https://twitter.com/s0ulshake
|
||||
[Quantgene]: https://www.quantgene.com/
|
||||
|
||||
---
|
||||
|
||||
## Exercises
|
||||
## Context
|
||||
|
||||
- At the end of each day, there is a series of exercises
|
||||
- This content was delivered at QCON SF in November 2024
|
||||
|
||||
(in-person workshop)
|
||||
|
||||
- To make the most out of the training, please try the exercises!
|
||||
- I recorded that workshop, but had technical issues
|
||||
|
||||
(it will help to practice and memorize the content of the day)
|
||||
- This is a re-recording of the workshop!
|
||||
|
||||
- We recommend to take at least one hour to work on the exercises
|
||||
- I may or may not cover everything in this video series
|
||||
|
||||
(if you understood the content of the day, it will be much faster)
|
||||
- At the conference, I provided everyone with an individual cluster
|
||||
|
||||
- If you want to follow along the demos and labs, you'll need your own
|
||||
|
||||
(I'll explain how to provision it)
|
||||
|
||||
- Each day will start with a quick review of the exercises of the previous day
|
||||
|
||||
47
slides/mq.yml
Normal file
47
slides/mq.yml
Normal file
@@ -0,0 +1,47 @@
|
||||
title: |
|
||||
Asynchronous Architecture Patterns To Scale ML and Other High Latency Workloads on Kubernetes
|
||||
|
||||
#chat: "[Slack](https://dockercommunity.slack.com/messages/C7GKACWDV)"
|
||||
#chat: "[Gitter](https://gitter.im/jpetazzo/workshop-yyyymmdd-city)"
|
||||
chat: "In person!"
|
||||
|
||||
gitrepo: github.com/jpetazzo/container.training
|
||||
|
||||
slides: https://2024-12-mq.container.training/
|
||||
|
||||
#slidenumberprefix: "#SomeHashTag — "
|
||||
|
||||
exclude:
|
||||
- in-person
|
||||
|
||||
content:
|
||||
- shared/title.md
|
||||
- shared/contact.md
|
||||
- logistics.md
|
||||
- shared/about-slides.md
|
||||
#- shared/chat-room-im.md
|
||||
#- shared/chat-room-slack.md
|
||||
#- shared/chat-room-zoom-meeting.md
|
||||
#- shared/chat-room-zoom-webinar.md
|
||||
- k8s/prereqs-advanced.md
|
||||
- k8s/handson-mlops.md
|
||||
#- shared/connecting.md
|
||||
- k8s/mlops-headsup.md
|
||||
- shared/toc.md
|
||||
-
|
||||
- k8s/ollama-intro.md
|
||||
- k8s/ollama-metrics.md
|
||||
- k8s/queue-architecture.md
|
||||
- k8s/bento-intro.md
|
||||
-
|
||||
- k8s/bento-enrichment.md
|
||||
- k8s/resource-limits.md
|
||||
- k8s/cluster-autoscaler.md
|
||||
- k8s/ollama-reqlim.md
|
||||
-
|
||||
- k8s/bento-hpa.md
|
||||
- k8s/bento-rmq.md
|
||||
- k8s/bento-cnpg.md
|
||||
- k8s/helmfile.md
|
||||
- shared/thankyou.md
|
||||
- shared/contact.md
|
||||
@@ -46,7 +46,7 @@
|
||||
|
||||
(let's say we'll keep them online at least 1 year, how about that?)
|
||||
|
||||
- You can download the slides using that URL:
|
||||
- You can download the slides using this URL:
|
||||
|
||||
@@ZIP@@
|
||||
|
||||
|
||||
@@ -1,15 +1,16 @@
|
||||
class: in-person
|
||||
|
||||
## Connecting to our lab environment
|
||||
## Testing the connection to our lab environment
|
||||
|
||||
.lab[
|
||||
|
||||
- Log into the first VM (`node1`) with your SSH client:
|
||||
- Connect to your lab environment with your SSH client:
|
||||
```bash
|
||||
ssh `user`@`A.B.C.D`
|
||||
ssh -p `32323` `user`@`A.B.C.D`
|
||||
```
|
||||
|
||||
(Replace `user` and `A.B.C.D` with the user and IP address provided to you)
|
||||
(Make sure to replace the highlighted values with the ones provided to you!)
|
||||
|
||||
<!--
|
||||
```bash
|
||||
@@ -27,7 +28,7 @@ done
|
||||
|
||||
You should see a prompt looking like this:
|
||||
```
|
||||
[A.B.C.D] (...) user@node1 ~
|
||||
[A.B.C.D] (...) user@machine ~
|
||||
$
|
||||
```
|
||||
If anything goes wrong — ask for help!
|
||||
@@ -40,9 +41,11 @@ class: in-person
|
||||
|
||||
- The shell history of the instructor is available online in real time
|
||||
|
||||
- Note the IP address of the instructor's virtual machine (A.B.C.D)
|
||||
- The instructor will provide you a "magic URL"
|
||||
|
||||
- Open http://A.B.C.D:1088 in your browser and you should see the history
|
||||
(typically, the instructor's lab address on port 1088 or 30088)
|
||||
|
||||
- Open that URL in your browser and you should see the history
|
||||
|
||||
- The history is updated in real time
|
||||
|
||||
@@ -57,7 +60,7 @@ class: in-person
|
||||
## Doing or re-doing the workshop on your own?
|
||||
|
||||
- Use something like
|
||||
[Play-With-Docker](http://play-with-docker.com/) or
|
||||
[Play-With-Docker](https://labs.play-with-docker.com/) or
|
||||
[Play-With-Kubernetes](https://training.play-with-kubernetes.com/)
|
||||
|
||||
Zero setup effort; but environments are short-lived and
|
||||
@@ -100,13 +103,13 @@ class: self-paced
|
||||
|
||||
.lab[
|
||||
|
||||
- Go to http://www.play-with-docker.com/
|
||||
- Go to https://labs.play-with-docker.com/
|
||||
|
||||
- Log in
|
||||
|
||||
- Create your first node
|
||||
|
||||
<!-- ```open http://www.play-with-docker.com/``` -->
|
||||
<!-- ```open https://labs.play-with-docker.com/``` -->
|
||||
|
||||
]
|
||||
|
||||
@@ -116,21 +119,17 @@ You will need a Docker ID to use Play-With-Docker.
|
||||
|
||||
---
|
||||
|
||||
## We will (mostly) interact with node1 only
|
||||
## We don't need to connect to ALL the nodes
|
||||
|
||||
*These remarks apply only when using multiple nodes, of course.*
|
||||
- If your cluster has multiple nodes (e.g. `node1`, `node2`, ...):
|
||||
|
||||
- Unless instructed, **all commands must be run from the first VM, `node1`**
|
||||
unless instructed, **all commands must be run from the first node**
|
||||
|
||||
- We will only check out/copy the code on `node1`
|
||||
- We don't need to check out/copy code or manifests on other nodes
|
||||
|
||||
- During normal operations, we do not need access to the other nodes
|
||||
|
||||
- If we had to troubleshoot issues, we would use a combination of:
|
||||
|
||||
- SSH (to access system logs, daemon status...)
|
||||
|
||||
- Docker API (to check running containers and container engine status)
|
||||
(but we could log into these nodes to troubleshoot or examine stuff)
|
||||
|
||||
---
|
||||
|
||||
|
||||
45
slides/shared/contact.md
Normal file
45
slides/shared/contact.md
Normal file
@@ -0,0 +1,45 @@
|
||||
name: contact
|
||||
|
||||
## Contact information
|
||||
|
||||
.column-half[
|
||||
Instructor:
|
||||
|
||||
📛 Jérôme Petazzoni
|
||||
<br/>
|
||||
📩 jerome.petazzoni@gmail.com
|
||||
<br/>
|
||||
🔗 https://linkedin.com/in/jpetazzo
|
||||
<br/>
|
||||
🦣 https://hachyderm.io/@jpetazzo
|
||||
|
||||
I can teach custom courses:
|
||||
|
||||
- Docker, Kubernetes, MLOps
|
||||
- from intro level to "black belt"
|
||||
- on site or remotely
|
||||
|
||||
Reach out if you're interested!
|
||||
]
|
||||
|
||||
.column-half[
|
||||
Assistant:
|
||||
|
||||
📛 AJ Bowen
|
||||
<br/>
|
||||
📩 aj@soulshake.net
|
||||
<br/>
|
||||
🔗 https://linkedin.com/in/ajbowen
|
||||
<br/>
|
||||
📃 https://github.com/soulshake
|
||||
|
||||
|
||||
I can consult on the following topics:
|
||||
|
||||
- Kubernetes
|
||||
- CI/CD
|
||||
- Terraform & Infra-as-code
|
||||
- Docker
|
||||
- AWS
|
||||
]
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
## Using Play-With-Docker
|
||||
|
||||
- Open a new browser tab to [www.play-with-docker.com](http://www.play-with-docker.com/)
|
||||
- Open a new browser tab to [labs.play-with-docker.com](https://labs.play-with-docker.com/)
|
||||
|
||||
- Confirm that you're not a robot
|
||||
|
||||
|
||||
26
slides/shared/qrcode.md
Normal file
26
slides/shared/qrcode.md
Normal file
@@ -0,0 +1,26 @@
|
||||
<!-- QRcode generated with "qrencode -t UTF-8" -->
|
||||
|
||||
.center[
|
||||
<pre style="padding-top: 0em; font-size: 18px; line-height: 20px;">
|
||||
█▀▀▀▀▀█ ▀▀▀█▄▀ ▀▄ ▀▄ ▀▄ ▄█▀ ▄ █▀▀▀▀▀█
|
||||
█ ███ █ ▀▄█ ▀▀▄█ ▄▀▀ ██▄▄ █ ███ █
|
||||
█ ▀▀▀ █ ▄▀█▀ █▀▀▀█ ▄█▀▄███ ▄ █ ▀▀▀ █
|
||||
▀▀▀▀▀▀▀ █▄▀ █▄█ ▀ █ █ ▀▄█▄▀ █ ▀▀▀▀▀▀▀
|
||||
▀▀ █▀▄▀ ▀▄ ▀▀█▄▄█▄▄ ▄▄▄ █▀ ▀▄▄ ▄▀
|
||||
▄█▄▀▄▀▀██▀ ▀▀██▄█ ▀▀▄█ ██▀ █▄█▀█▀▀
|
||||
▄ ▄▀▀ ▀ ▀█▀ ▄█▄▀▄▀ ▀ █ █ █▄▄▀▀▀▀▄█▄█▀
|
||||
█ ▀▀█▄▀▀█▀█ ▄▀ ▀▀ █▀▄ ▀▄ ██▄▀ ▄█ ▄▀█
|
||||
█▄▀▀▀ ▀▀ ███▀█▀▄ ▄▄█ ██ █▀▄▀▄ █▀▀▀
|
||||
▄ █▀▄▀ ▄▀ ▄▀▄ ██ ▀▀█ ▄█ █▀▀▄█▀ ▄ █
|
||||
█▀▀▄▄ ▀ ▀ ▀▀█ ▀▀▀ ▀▀ █▀██▄▀▀▀███▄█▀
|
||||
█▀█▀▄█▀██ ██ ▀ █▄█▀ ▀ ██▀ ██▄ █▄█▄▄█
|
||||
█▀█▀▄▄▀▀▀▄▀▄▀ ▄█ ▄▀█ ▄▀▄ █▄ ▀▀▄█▄▄▀
|
||||
█▀█▄█ ▀ ▀▀▄█▀ █▄▀ █ ▄ ▄▀▄█ █▄▄█▄▄▀█
|
||||
▀ ▀▀ ▀▀█▄ ▀ ▀ ▄▄███▄ ▄ █▀▀▀█▀██
|
||||
█▀▀▀▀▀█ ▀██ █ █▀▀ ▀█▀██▄█▀▄█ ▀ █▄ ▄▀
|
||||
█ ███ █ █▄██▀ ▀▄▀▀▄█▀ ▄▄▀██▀▀▀█▀▀ ▄ ▀
|
||||
█ ▀▀▀ █ ▄█▀▀▀▀▄▀▄▄█ ▄▀█▀▄ ▀ ▀█ █▄█
|
||||
▀▀▀▀▀▀▀ ▀▀ ▀▀ ▀ ▀ ▀ ▀ ▀ ▀ ▀ ▀
|
||||
</pre>
|
||||
]
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user