diff --git a/.github/kind-cluster.yaml b/.github/kind-cluster.yaml
new file mode 100644
index 0000000..f30e384
--- /dev/null
+++ b/.github/kind-cluster.yaml
@@ -0,0 +1,8 @@
+kind: Cluster
+apiVersion: kind.x-k8s.io/v1alpha4
+nodes:
+- role: control-plane
+- role: control-plane
+- role: control-plane
+- role: worker
+- role: worker
diff --git a/.github/workflows/smoke-tests.yml b/.github/workflows/smoke-tests.yml
new file mode 100644
index 0000000..6f4a7e4
--- /dev/null
+++ b/.github/workflows/smoke-tests.yml
@@ -0,0 +1,68 @@
+# This needs all the previous artifacts to be created:
+# the image should be published, the repo should be tagged, the helm
+# chart pushed. It's assumed everything is working, and we are doing
+# a last final test at the release creation time to _ensure_ we can
+# release. At this point it's still time to back off in case of problem,
+# not publish the release and iterate on tags.
+
+name: Smoke test
+
+on:
+  push:
+    branches:
+      - '**'
+  release:
+    types: created
+
+jobs:
+  deploy-manifests:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v2
+        with:
+          fetch-depth: "0"
+
+      - name: Workaround "Failed to attach 1 to compat systemd cgroup /actions_job/..." on gh actions
+        run: |
+          sudo bash << EOF
+          cp /etc/docker/daemon.json /etc/docker/daemon.json.old
+          echo '{}' > /etc/docker/daemon.json
+          systemctl restart docker || journalctl --no-pager -n 500
+          systemctl status docker
+          EOF
+
+      - name: Create 5 node kind cluster
+        uses: helm/kind-action@master  # TODO(review): pin to a released tag - a mutable ref is not reproducible
+        with:
+          config: .github/kind-cluster.yaml
+
+      - name: Deploy kured on default namespace with its helm chart
+        run: |
+          # Documented in official helm doc to live on the edge
+          curl https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 | bash
+          # Refresh bins
+          hash -r
+          helm install kured ./charts/kured/ --set configuration.period=1m
+          kubectl config use-context kind-chart-testing  # was set-context, which only edits the kubeconfig entry and never switches to it
+          kubectl get ds --all-namespaces
+          kubectl describe ds kured
+
+      - name: Ensure kured is ready
+        uses: nick-invision/retry@v1
+        with:
+          timeout_minutes: 10
+          max_attempts: 10
+          retry_wait_seconds: 60
+          # DESIRED CURRENT READY UP-TO-DATE AVAILABLE should all be = 5
+          command: "kubectl get ds kured | grep -E 'kured.*5.*5.*5.*5.*5' "
+
+      - name: Create reboot sentinel files
+        run: |
+          ./tests/kind/create-reboot-sentinels.sh
+
+      - name: Follow reboot until success
+        env:
+          DEBUG: "true"
+        run: |
+          ./tests/kind/follow-coordinated-reboot.sh
diff --git a/tests/kind/create-reboot-sentinels.sh b/tests/kind/create-reboot-sentinels.sh
new file mode 100755
index 0000000..e95dc3b
--- /dev/null
+++ b/tests/kind/create-reboot-sentinels.sh
@@ -0,0 +1,12 @@
+#!/usr/bin/env bash
+
+# USE KUBECTL_CMD to pass context and/or namespaces.
+KUBECTL_CMD="${KUBECTL_CMD:-kubectl}"
+SENTINEL_FILE="${SENTINEL_FILE:-/var/run/reboot-required}"
+
+echo "Creating reboot sentinel on all nodes"
+
+for nodename in $($KUBECTL_CMD get nodes -o name); do  # unquoted on purpose: KUBECTL_CMD may carry extra args (context/namespace)
+    docker exec "${nodename/node\//}" hostname
+    docker exec "${nodename/node\//}" touch "${SENTINEL_FILE}"
+done
diff --git a/tests/kind/follow-coordinated-reboot.sh b/tests/kind/follow-coordinated-reboot.sh
new file mode 100755
index 0000000..e3682e8
--- /dev/null
+++ b/tests/kind/follow-coordinated-reboot.sh
@@ -0,0 +1,76 @@
+#!/usr/bin/env bash
+
+NODECOUNT=${NODECOUNT:-5}
+KUBECTL_CMD="${KUBECTL_CMD:-kubectl}"
+DEBUG="${DEBUG:-false}"
+
+tmp_dir=$(mktemp -d -t kured-XXXX)
+
+declare -A was_unschedulable
+declare -A has_recovered
+
+max_attempts="60"
+sleep_time=60
+attempt_num=1
+
+set +o errexit
+echo "There are $NODECOUNT nodes in the cluster"
+until [ ${#was_unschedulable[@]} == "$NODECOUNT" ] && [ ${#has_recovered[@]} == "$NODECOUNT" ]
+do
+    echo "${#was_unschedulable[@]} nodes were removed from pool once:" "${!was_unschedulable[@]}"
+    echo "${#has_recovered[@]} nodes removed from the pool are now back:" "${!has_recovered[@]}"
+
+    $KUBECTL_CMD get nodes -o custom-columns=NAME:.metadata.name,SCHEDULABLE:.spec.unschedulable --no-headers > "$tmp_dir"/node_output
+    if [[ "$DEBUG" == "true" ]]; then
+        # This is useful to see if a node gets stuck after drain, and doesn't
+        # come back up.
+        echo "Result of command $KUBECTL_CMD get nodes ... showing unschedulable nodes:"
+        cat "$tmp_dir"/node_output
+    fi
+    while read -r node; do
+        unschedulable=$(echo "$node" | grep true | cut -f 1 -d ' ')
+        if [ -n "$unschedulable" ] && [ -z "${was_unschedulable["$unschedulable"]+x}" ] ; then
+            echo "$unschedulable is now unschedulable!"
+            was_unschedulable["$unschedulable"]=1
+        fi
+        schedulable=$(echo "$node" | grep -v true | cut -f 1 -d ' ')  # was grep '': matched every line, so a still-cordoned node was instantly counted as recovered
+        if [ -n "$schedulable" ] && [ -n "${was_unschedulable["$schedulable"]+x}" ] && [ -z "${has_recovered["$schedulable"]+x}" ]; then
+            echo "$schedulable has recovered!"
+            has_recovered["$schedulable"]=1
+        fi
+    done < "$tmp_dir"/node_output
+
+    if [[ "${#has_recovered[@]}" == "$NODECOUNT" ]]; then
+        echo "All nodes recovered."
+        break
+    else
+        if (( attempt_num == max_attempts ))
+        then
+            echo "Attempt $attempt_num failed and there are no more attempts left!"
+            exit 1
+        else
+            echo "Attempt $attempt_num failed! Trying again in $sleep_time seconds..."
+            sleep "$sleep_time"
+        fi
+    fi
+    (( attempt_num++ ))
+done
+if [[ "$DEBUG" == "true" ]]; then
+    # This is useful to see if containers have crashed
+    echo "Debug logs"
+    echo "docker ps:"
+    docker ps
+    # This is useful to see if the nodes have _properly_ rebooted.
+    # It should show the reboot/two container starts per node.
+    for name in chart-testing-control-plane chart-testing-control-plane2 chart-testing-control-plane3 chart-testing-worker chart-testing-worker2; do
+        echo "############################################################"
+        echo "docker logs for node $name:"
+        docker logs "$name"
+    done
+fi
+
+set -o errexit
+echo "Test successful"
+
+rm "$tmp_dir"/node_output
+rmdir "$tmp_dir"