From 3d75f1b37a5b2c5fb22e3c34458530ffa73bc339 Mon Sep 17 00:00:00 2001
From: Jean-Philippe Evrard
Date: Wed, 26 Aug 2020 13:14:04 +0200
Subject: [PATCH 1/2] Add smoke/basic functional test

Without this patch, we don't test on release whether kured actually
works and behaves well. This is a problem, as a functional issue could
have been hidden by a recent change, as our testing is minimalist (we
only test the usability, not the functionality).

Instead of testing manually, we should ensure this in CI.

This fixes it by adding a GitHub action which tests the previously
built artifacts before publishing a release.

The job consumes the helm chart in our code tree (note: this relies on
the last released image), and runs a functional test triggering a
coordinated restart of a whole 5 node cluster deployed with kind,
through GitHub actions.

Note: The GitHub action needs to reset the docker configuration, else
the reboot of the node (a docker container in kind) will fail. It will
be correctly triggered, but the node will not come back up, with its
systemd log mentioning: "Failed to attach 1 to compat systemd cgroup".
---
 .github/kind-cluster.yaml               |  8 +++
 .github/workflows/smoke-tests.yml       | 66 +++++++++++++++++
 tests/kind/create-reboot-sentinels.sh   | 32 +++++++++
 tests/kind/follow-coordinated-reboot.sh | 94 +++++++++++++++++++++++++
 4 files changed, 200 insertions(+)
 create mode 100644 .github/kind-cluster.yaml
 create mode 100644 .github/workflows/smoke-tests.yml
 create mode 100755 tests/kind/create-reboot-sentinels.sh
 create mode 100755 tests/kind/follow-coordinated-reboot.sh

diff --git a/.github/kind-cluster.yaml b/.github/kind-cluster.yaml
new file mode 100644
index 0000000..f30e384
--- /dev/null
+++ b/.github/kind-cluster.yaml
@@ -0,0 +1,8 @@
+kind: Cluster
+apiVersion: kind.x-k8s.io/v1alpha4
+nodes:
+- role: control-plane
+- role: control-plane
+- role: control-plane
+- role: worker
+- role: worker
diff --git a/.github/workflows/smoke-tests.yml b/.github/workflows/smoke-tests.yml
new file mode 100644
index 0000000..6b53ffe
--- /dev/null
+++ b/.github/workflows/smoke-tests.yml
@@ -0,0 +1,66 @@
+# This needs all the previous artifacts to be created:
+# the image should be published, the repo should be tagged, the helm
+# chart pushed. It's assumed everything is working, and we are doing
+# a last final test at the release creation time to _ensure_ we can
+# release. At this point it's still time to back off in case of problem,
+# not publish the release and iterate on tags.
+
+name: Smoke test
+
+on:
+  push:
+    branches:
+      - '**'
+  release:
+    types: created
+
+jobs:
+  deploy-manifests:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v2
+        with:
+          fetch-depth: "0"
+
+      - name: Workaround "Failed to attach 1 to compat systemd cgroup /actions_job/..." on gh actions
+        run: |
+          sudo bash << EOF
+          cp /etc/docker/daemon.json /etc/docker/daemon.json.old
+          echo '{}' > /etc/docker/daemon.json
+          systemctl restart docker || journalctl --no-pager -n 500
+          systemctl status docker
+          EOF
+
+      - name: Create 5 node kind cluster
+        uses: helm/kind-action@master
+        with:
+          config: .github/kind-cluster.yaml
+
+      - name: Deploy kured on default namespace with its helm chart
+        run: |
+          curl -o helm3.tgz https://get.helm.sh/helm-v3.3.0-linux-amd64.tar.gz
+          tar xf helm3.tgz
+          ./linux-amd64/helm install kured ./charts/kured/ --set configuration.period=1m
+          kubectl config set-context kind-chart-testing
+          kubectl get ds --all-namespaces
+          kubectl describe ds kured
+
+      - name: Ensure kured is ready
+        uses: nick-invision/retry@v1
+        with:
+          timeout_minutes: 10
+          max_attempts: 10
+          retry_wait_seconds: 60
+          # DESIRED CURRENT READY UP-TO-DATE AVAILABLE should all be = 5
+          command: "kubectl get ds kured | grep -E 'kured.*5.*5.*5.*5.*5' "
+
+      - name: Create reboot sentinel files
+        run: |
+          ./tests/kind/create-reboot-sentinels.sh
+
+      - name: Follow reboot until success
+        env:
+          DEBUG: true
+        run: |
+          ./tests/kind/follow-coordinated-reboot.sh
diff --git a/tests/kind/create-reboot-sentinels.sh b/tests/kind/create-reboot-sentinels.sh
new file mode 100755
index 0000000..e95dc3b
--- /dev/null
+++ b/tests/kind/create-reboot-sentinels.sh
@@ -0,0 +1,32 @@
+#!/usr/bin/env bash
+#
+# Create the reboot sentinel file on every node of a kind cluster.
+#
+# Environment:
+#   KUBECTL_CMD    kubectl command line; it is split on whitespace so that
+#                  it can carry a context and/or namespace
+#                  (default: kubectl).
+#   SENTINEL_FILE  path of the file touched inside each node container
+#                  (default: /var/run/reboot-required).
+
+set -o errexit -o nounset -o pipefail
+
+KUBECTL_CMD="${KUBECTL_CMD:-kubectl}"
+SENTINEL_FILE="${SENTINEL_FILE:-/var/run/reboot-required}"
+
+# Split into words: expanding "$KUBECTL_CMD" as a single word would turn
+# extra arguments such as --context into part of the command name.
+read -r -a kubectl_cmd <<< "$KUBECTL_CMD"
+
+echo "Creating reboot sentinel on all nodes"
+
+# Plain assignment, so that a kubectl failure aborts the script (errexit)
+# instead of silently looping over nothing.
+node_names=$("${kubectl_cmd[@]}" get nodes -o name)
+
+# $node_names is intentionally unquoted: one "node/<name>" entry per word.
+for nodename in $node_names; do
+  # The node name, without its "node/" prefix, is the container name.
+  docker exec "${nodename/node\//}" hostname
+  docker exec "${nodename/node\//}" touch "${SENTINEL_FILE}"
+done
diff --git a/tests/kind/follow-coordinated-reboot.sh b/tests/kind/follow-coordinated-reboot.sh
new file mode 100755
index 0000000..e3682e8
--- /dev/null
+++ b/tests/kind/follow-coordinated-reboot.sh
@@ -0,0 +1,94 @@
+#!/usr/bin/env bash
+#
+# Follow a coordinated reboot until every node has been seen unschedulable
+# once and has become schedulable again.
+#
+# Environment:
+#   NODECOUNT    number of nodes expected to reboot (default: 5).
+#   KUBECTL_CMD  kubectl command line; it is split on whitespace so that it
+#                can carry a context and/or namespace (default: kubectl).
+#   DEBUG        "true" prints the polled node states and, at the end,
+#                docker ps and the docker logs of the nodes (default: false).
+#
+# Exits 1 when the nodes have not all recovered after max_attempts polls.
+
+NODECOUNT=${NODECOUNT:-5}
+KUBECTL_CMD="${KUBECTL_CMD:-kubectl}"
+DEBUG="${DEBUG:-false}"
+
+# Split into words: expanding "$KUBECTL_CMD" as a single word would turn
+# extra arguments such as --context into part of the command name.
+read -r -a kubectl_cmd <<< "$KUBECTL_CMD"
+
+tmp_dir=$(mktemp -d -t kured-XXXX) || exit 1
+# Remove the scratch directory on every exit path, failures included.
+trap 'rm -rf -- "$tmp_dir"' EXIT
+
+declare -A was_unschedulable
+declare -A has_recovered
+
+max_attempts="60"
+sleep_time=60
+attempt_num=1
+
+set +o errexit
+echo "There are $NODECOUNT nodes in the cluster"
+until [ ${#was_unschedulable[@]} == "$NODECOUNT" ] && [ ${#has_recovered[@]} == "$NODECOUNT" ]
+do
+    echo "${#was_unschedulable[@]} nodes were removed from pool once:" "${!was_unschedulable[@]}"
+    echo "${#has_recovered[@]} nodes removed from the pool are now back:" "${!has_recovered[@]}"
+
+    "${kubectl_cmd[@]}" get nodes -o custom-columns=NAME:.metadata.name,SCHEDULABLE:.spec.unschedulable --no-headers > "$tmp_dir"/node_output
+    if [[ "$DEBUG" == "true" ]]; then
+        # This is useful to see if a node gets stuck after drain, and doesn't
+        # come back up.
+        echo "Result of command $KUBECTL_CMD get nodes ... showing unschedulable nodes:"
+        cat "$tmp_dir"/node_output
+    fi
+    while read -r node; do
+        unschedulable=$(echo "$node" | grep true | cut -f 1 -d ' ')
+        if [ -n "$unschedulable" ] && [ -z ${was_unschedulable["$unschedulable"]+x} ] ; then
+            echo "$unschedulable is now unschedulable!"
+            was_unschedulable["$unschedulable"]=1
+        fi
+        # kubectl prints "<none>" when .spec.unschedulable is unset, which is
+        # the state of a schedulable node. An empty pattern would match every
+        # line and report a node as recovered while it is still cordoned.
+        schedulable=$(echo "$node" | grep '<none>' | cut -f 1 -d ' ')
+        if [ -n "$schedulable" ] && [ ${was_unschedulable["$schedulable"]+x} ] && [ -z ${has_recovered["$schedulable"]+x} ]; then
+            echo "$schedulable has recovered!"
+            has_recovered["$schedulable"]=1
+        fi
+    done < "$tmp_dir"/node_output
+
+    if [[ "${#has_recovered[@]}" == "$NODECOUNT" ]]; then
+        echo "All nodes recovered."
+        break
+    else
+        if (( attempt_num == max_attempts ))
+        then
+            echo "Attempt $attempt_num failed and there are no more attempts left!"
+            exit 1
+        else
+            echo "Attempt $attempt_num failed! Trying again in $sleep_time seconds..."
+            sleep "$sleep_time"
+        fi
+    fi
+    (( attempt_num++ ))
+done
+if [[ "$DEBUG" == "true" ]]; then
+    # This is useful to see if containers have crashed
+    echo "Debug logs"
+    echo "docker ps:"
+    docker ps
+    # This is useful to see if the nodes have _properly_ rebooted.
+    # It should show the reboot/two container starts per node.
+    for name in chart-testing-control-plane chart-testing-control-plane2 chart-testing-control-plane3 chart-testing-worker chart-testing-worker2; do
+        echo "############################################################"
+        echo "docker logs for node $name:"
+        docker logs "$name"
+    done
+fi
+
+set -o errexit
+echo "Test successful"
From c9367eeff5578a5481020501570e6c0dd4022037 Mon Sep 17 00:00:00 2001
From: Jean-Philippe Evrard
Date: Thu, 26 Nov 2020 09:19:41 +0100
Subject: [PATCH 2/2] Always have latest helm binary installed

This will ease our maintenance.
---
 .github/workflows/smoke-tests.yml | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/smoke-tests.yml b/.github/workflows/smoke-tests.yml
index 6b53ffe..6f4a7e4 100644
--- a/.github/workflows/smoke-tests.yml
+++ b/.github/workflows/smoke-tests.yml
@@ -39,9 +39,12 @@ jobs:
 
       - name: Deploy kured on default namespace with its helm chart
         run: |
-          curl -o helm3.tgz https://get.helm.sh/helm-v3.3.0-linux-amd64.tar.gz
-          tar xf helm3.tgz
-          ./linux-amd64/helm install kured ./charts/kured/ --set configuration.period=1m
+          # Documented in official helm doc to live on the edge
+          # -fsSL: fail on HTTP errors and follow redirects, so that an error page is never piped into bash
+          curl -fsSL https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 | bash
+          # Refresh bins
+          hash -r
+          helm install kured ./charts/kured/ --set configuration.period=1m
           kubectl config set-context kind-chart-testing
           kubectl get ds --all-namespaces
           kubectl describe ds kured