From 608abc6e89093f344979e70ecf5bea5e52e8758e Mon Sep 17 00:00:00 2001 From: Jean-Philippe Evrard Date: Tue, 15 Oct 2024 22:16:45 +0200 Subject: [PATCH] Increase CI coverage and provide new dev tool (#982) * Move to stable kind cluster filenames Without this, we have to rename files at every version. This is really unnecessary, we should only change the files and be done with it. This is a problem, as if we move to programmatic test running, the tests would need to be mutated at every k8s version. With this model, we know that only the kind-cluster files need to be modified for the tests to be automatically adapted. Signed-off-by: Jean-Philippe Evrard * Create e2e from go tests interface Without this, e2e tests need tons of manual work to test locally, and the results are not easily exposed. People are less likely to use the e2e tests if they are tough to use outside the CI. This commit makes it easier to run tests locally, and ensures the CI is closer to the Makefile. At the same time, this removes debt in the github workflows: By switching to newer versions of kind, we can remove the very old workaround for the failed to attach pid 1. Signed-off-by: Jean-Philippe Evrard * Add node stays as cordonned test Without this, it is impossible to prove that the node stays as cordonned after a reboot by kured. This refactor also adds the test in the CI, and makes sure the CI is a bit simpler, by using matrix more extensively. Signed-off-by: Jean-Philippe Evrard * Use hack dir instead of .tmp This is more idiomatic. 
Signed-off-by: Jean-Philippe Evrard --------- Signed-off-by: Jean-Philippe Evrard --- ...er-1.29.yaml => kind-cluster-current.yaml} | 0 ...uster-1.30.yaml => kind-cluster-next.yaml} | 0 ...r-1.28.yaml => kind-cluster-previous.yaml} | 0 .github/workflows/on-pr.yaml | 259 ++------------ .github/workflows/periodics-daily.yaml | 2 +- .gitignore | 3 + CONTRIBUTING.md | 41 ++- Makefile | 51 ++- .../installers}/goreleaser-install.sh | 0 tests/kind/create-reboot-sentinels.sh | 13 - tests/kind/main_test.go | 336 ++++++++++++++++++ .../kind/testfiles/create-reboot-sentinels.sh | 11 + .../follow-coordinated-reboot.sh | 32 +- .../kind/testfiles/node-stays-as-cordonned.sh | 59 +++ 14 files changed, 517 insertions(+), 290 deletions(-) rename .github/{kind-cluster-1.29.yaml => kind-cluster-current.yaml} (100%) rename .github/{kind-cluster-1.30.yaml => kind-cluster-next.yaml} (100%) rename .github/{kind-cluster-1.28.yaml => kind-cluster-previous.yaml} (100%) rename {.github/scripts => hack/installers}/goreleaser-install.sh (100%) delete mode 100755 tests/kind/create-reboot-sentinels.sh create mode 100644 tests/kind/main_test.go create mode 100755 tests/kind/testfiles/create-reboot-sentinels.sh rename tests/kind/{ => testfiles}/follow-coordinated-reboot.sh (75%) create mode 100755 tests/kind/testfiles/node-stays-as-cordonned.sh diff --git a/.github/kind-cluster-1.29.yaml b/.github/kind-cluster-current.yaml similarity index 100% rename from .github/kind-cluster-1.29.yaml rename to .github/kind-cluster-current.yaml diff --git a/.github/kind-cluster-1.30.yaml b/.github/kind-cluster-next.yaml similarity index 100% rename from .github/kind-cluster-1.30.yaml rename to .github/kind-cluster-next.yaml diff --git a/.github/kind-cluster-1.28.yaml b/.github/kind-cluster-previous.yaml similarity index 100% rename from .github/kind-cluster-1.28.yaml rename to .github/kind-cluster-previous.yaml diff --git a/.github/workflows/on-pr.yaml b/.github/workflows/on-pr.yaml index 0c6346c..db73960 100644 
--- a/.github/workflows/on-pr.yaml +++ b/.github/workflows/on-pr.yaml @@ -4,8 +4,8 @@ on: push: jobs: - pr-gotest: - name: Run go tests + pr-short-tests: + name: Run short go tests runs-on: ubuntu-latest steps: - name: checkout @@ -16,39 +16,13 @@ jobs: go-version-file: 'go.mod' check-latest: true - name: run tests - run: go test -json ./... > test.json + run: make test - name: Annotate tests if: always() uses: guyarb/golang-test-annoations@v0.8.0 with: test-results: test.json - pr-shellcheck: - name: Lint bash code with shellcheck - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - name: Run ShellCheck - uses: bewuethr/shellcheck-action@v2 - - pr-lint-code: - name: Lint golang code - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - name: Ensure go version - uses: actions/setup-go@v5 - with: - go-version-file: 'go.mod' - check-latest: true - - name: Lint cmd folder - uses: Jerome1337/golint-action@v1.0.3 - with: - golint-path: './cmd/...' - - name: Lint pkg folder - uses: Jerome1337/golint-action@v1.0.3 - with: - golint-path: './pkg/...' pr-check-docs-links: name: Check docs for incorrect links @@ -104,16 +78,20 @@ jobs: # (compared to helm charts, manifests cannot easily template changes based on versions) # Helm charts are _trailing_ releases, while manifests are done during development. # This test uses the "command" reboot-method. 
- e2e-manifests-command: - name: End-to-End test with kured with code and manifests from HEAD (command) + e2e-manifests: + name: End-to-End test with kured with code and manifests from HEAD runs-on: ubuntu-latest strategy: fail-fast: false matrix: - kubernetes: - - "1.28" - - "1.29" - - "1.30" + testname: + - "TestE2EWithCommand" + - "TestE2EWithSignal" + - "TestE2EConcurrentWithCommand" + kubernetes_version: + - "previous" + - "current" + - "next" steps: - uses: actions/checkout@v4 - name: Ensure go version @@ -130,75 +108,24 @@ jobs: - name: Find current tag version run: echo "sha_short=$(git rev-parse --short HEAD)" >> $GITHUB_OUTPUT id: tags - - name: Build artifacts - run: | - VERSION="${{ steps.tags.outputs.sha_short }}" DH_ORG="${{ github.repository_owner }}" make image - VERSION="${{ steps.tags.outputs.sha_short }}" DH_ORG="${{ github.repository_owner }}" make manifest - - - name: Workaround "Failed to attach 1 to compat systemd cgroup /actions_job/..." on gh actions - run: | - sudo bash << EOF - cp /etc/docker/daemon.json /etc/docker/daemon.json.old - echo '{}' > /etc/docker/daemon.json - systemctl restart docker || journalctl --no-pager -n 500 - systemctl status docker - EOF - - # Default name for helm/kind-action kind clusters is "chart-testing" - - name: Create kind cluster with 3 nodes + - name: Install kind uses: helm/kind-action@v1.10.0 with: - config: .github/kind-cluster-${{ matrix.kubernetes }}.yaml - version: v0.14.0 - - - name: Preload previously built images onto kind cluster - run: kind load docker-image ghcr.io/${{ github.repository }}:${{ steps.tags.outputs.sha_short }} --name chart-testing - - - name: Do not wait for an hour before detecting the rebootSentinel - run: | - sed -i 's/#\(.*\)--period=1h/\1--period=30s/g' kured-ds.yaml - - - name: Install kured with kubectl - run: | - kubectl apply -f kured-rbac.yaml && kubectl apply -f kured-ds.yaml - - - name: Ensure kured is ready - uses: nick-invision/retry@v3.0.0 - with: - timeout_minutes: 
10 - max_attempts: 10 - retry_wait_seconds: 60 - # DESIRED CURRENT READY UP-TO-DATE AVAILABLE should all be = to cluster_size - command: "kubectl get ds -n kube-system kured | grep -E 'kured.*3.*3.*3.*3.*3'" - - - name: Create reboot sentinel files - run: | - ./tests/kind/create-reboot-sentinels.sh - - - name: Follow reboot until success - env: - DEBUG: true - run: | - ./tests/kind/follow-coordinated-reboot.sh + install_only: true + version: v0.22.0 + - name: Run specific e2e tests + run: make e2e-test ARGS="-run ^${{ matrix.testname }}/${{ matrix.kubernetes_version }}" - # This ensures the latest code works with the manifests built from tree. - # It is useful for two things: - # - Test manifests changes (obviously), ensuring they don't break existing clusters - # - Ensure manifests work with the latest versions even with no manifest change - # (compared to helm charts, manifests cannot easily template changes based on versions) - # Helm charts are _trailing_ releases, while manifests are done during development. - # This test uses the "signal" reboot-method. - e2e-manifests-signal: - name: End-to-End test with kured with code and manifests from HEAD (signal) + e2e-tests-singleversion: + name: End-to-End test targetting a single version of kubernetes runs-on: ubuntu-latest strategy: fail-fast: false matrix: - kubernetes: - - "1.28" - - "1.29" - - "1.30" + testname: + - "TestCordonningIsKept/concurrency1" + - "TestCordonningIsKept/concurrency2" steps: - uses: actions/checkout@v4 - name: Ensure go version @@ -215,140 +142,10 @@ jobs: - name: Find current tag version run: echo "sha_short=$(git rev-parse --short HEAD)" >> $GITHUB_OUTPUT id: tags - - name: Build artifacts - run: | - VERSION="${{ steps.tags.outputs.sha_short }}" DH_ORG="${{ github.repository_owner }}" make image - VERSION="${{ steps.tags.outputs.sha_short }}" DH_ORG="${{ github.repository_owner }}" make manifest - - - name: Workaround "Failed to attach 1 to compat systemd cgroup /actions_job/..." 
on gh actions - run: | - sudo bash << EOF - cp /etc/docker/daemon.json /etc/docker/daemon.json.old - echo '{}' > /etc/docker/daemon.json - systemctl restart docker || journalctl --no-pager -n 500 - systemctl status docker - EOF - - # Default name for helm/kind-action kind clusters is "chart-testing" - - name: Create kind cluster with 3 nodes + - name: Install kind uses: helm/kind-action@v1.10.0 with: - config: .github/kind-cluster-${{ matrix.kubernetes }}.yaml - version: v0.14.0 - - - name: Preload previously built images onto kind cluster - run: kind load docker-image ghcr.io/${{ github.repository }}:${{ steps.tags.outputs.sha_short }} --name chart-testing - - - name: Do not wait for an hour before detecting the rebootSentinel - run: | - sed -i 's/#\(.*\)--period=1h/\1--period=15s/g' kured-ds-signal.yaml - - - name: Install kured with kubectl - run: | - kubectl apply -f kured-rbac.yaml && kubectl apply -f kured-ds-signal.yaml - - - name: Ensure kured is ready - uses: nick-invision/retry@v3.0.0 - with: - timeout_minutes: 10 - max_attempts: 10 - retry_wait_seconds: 60 - # DESIRED CURRENT READY UP-TO-DATE AVAILABLE should all be = to cluster_size - command: "kubectl get ds -n kube-system kured | grep -E 'kured.*3.*3.*3.*3.*3'" - - - name: Create reboot sentinel files - run: | - ./tests/kind/create-reboot-sentinels.sh - - - name: Follow reboot until success - env: - DEBUG: true - run: | - ./tests/kind/follow-coordinated-reboot.sh - - - - # This ensures the latest code works with the manifests built from tree. - # It is useful for two things: - # - Test manifests changes (obviously), ensuring they don't break existing clusters - # - Ensure manifests work with the latest versions even with no manifest change - # (compared to helm charts, manifests cannot easily template changes based on versions) - # Helm charts are _trailing_ releases, while manifests are done during development. 
- # Concurrency = 2 - e2e-manifests-concurent: - name: End-to-End test with kured with code and manifests from HEAD (concurrent) - runs-on: ubuntu-latest - strategy: - fail-fast: false - matrix: - kubernetes: - - "1.28" - - "1.29" - - "1.30" - steps: - - uses: actions/checkout@v4 - - name: Ensure go version - uses: actions/setup-go@v5 - with: - go-version-file: 'go.mod' - check-latest: true - - name: Set up QEMU - uses: docker/setup-qemu-action@v3 - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - name: Setup GoReleaser - run: make bootstrap-tools - - name: Find current tag version - run: echo "sha_short=$(git rev-parse --short HEAD)" >> $GITHUB_OUTPUT - id: tags - - name: Build artifacts - run: | - VERSION="${{ steps.tags.outputs.sha_short }}" DH_ORG="${{ github.repository_owner }}" make image - VERSION="${{ steps.tags.outputs.sha_short }}" DH_ORG="${{ github.repository_owner }}" make manifest - - - name: Workaround "Failed to attach 1 to compat systemd cgroup /actions_job/..." 
on gh actions - run: | - sudo bash << EOF - cp /etc/docker/daemon.json /etc/docker/daemon.json.old - echo '{}' > /etc/docker/daemon.json - systemctl restart docker || journalctl --no-pager -n 500 - systemctl status docker - EOF - - # Default name for helm/kind-action kind clusters is "chart-testing" - - name: Create kind cluster with 3 nodes - uses: helm/kind-action@v1.10.0 - with: - config: .github/kind-cluster-${{ matrix.kubernetes }}.yaml - version: v0.14.0 - - - name: Preload previously built images onto kind cluster - run: kind load docker-image ghcr.io/${{ github.repository }}:${{ steps.tags.outputs.sha_short }} --name chart-testing - - - name: Do not wait for an hour before detecting the rebootSentinel - run: | - sed -i 's/#\(.*\)--period=1h/\1--period=15s/g' kured-ds.yaml - sed -i 's/#\(.*\)--concurrency=1/\1--concurrency=2/g' kured-ds.yaml - - - name: Install kured with kubectl - run: | - kubectl apply -f kured-rbac.yaml && kubectl apply -f kured-ds.yaml - - - name: Ensure kured is ready - uses: nick-invision/retry@v3.0.0 - with: - timeout_minutes: 10 - max_attempts: 10 - retry_wait_seconds: 60 - # DESIRED CURRENT READY UP-TO-DATE AVAILABLE should all be = to cluster_size - command: "kubectl get ds -n kube-system kured | grep -E 'kured.*3.*3.*3.*3.*3'" - - - name: Create reboot sentinel files - run: | - ./tests/kind/create-reboot-sentinels.sh - - - name: Follow reboot until success - env: - DEBUG: true - run: | - ./tests/kind/follow-coordinated-reboot.sh + install_only: true + version: v0.22.0 + - name: Run specific e2e tests + run: make e2e-test ARGS="-run ^${{ matrix.testname }}" \ No newline at end of file diff --git a/.github/workflows/periodics-daily.yaml b/.github/workflows/periodics-daily.yaml index 45b47c7..52d67bc 100644 --- a/.github/workflows/periodics-daily.yaml +++ b/.github/workflows/periodics-daily.yaml @@ -12,7 +12,7 @@ jobs: - name: checkout uses: actions/checkout@v4 - name: run tests - run: go test -json ./... 
> test.json + run: make test - name: Annotate tests if: always() uses: guyarb/golang-test-annoations@v0.8.0 diff --git a/.gitignore b/.gitignore index 0f707f6..e8cd29e 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,6 @@ vendor build dist .tmp +test.json +tests/kind/testfiles/*.yaml +hack/bin/ diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 2cf2a25..6263e61 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -44,7 +44,7 @@ All Kured repositories are kept under . To find t ### Kured code - Kured's main code can be found in the [`cmd`](cmd) and [`pkg`](pkg) directories -- Its smoke tests are in the [`tests`](tests) directory +- Its e2e tests are in the [`tests`](tests) directory - We use [GoReleaser to build](.goreleaser.yml). - Every PR and tagged release is tested by [Kind in GitHub workflows](.github/workflows). @@ -75,12 +75,19 @@ efbb0c3: Document version compatibility in release notes Search the git log for inspiration for your cases. -Please update our `.github/workflows` with the new k8s images, starting by -the creation of a `.github/kind-cluster-.yaml`, then updating -our workflows with the new versions. +Please update our `.github/workflows` with the new k8s images. -Once you updated everything, make sure you update the support matrix in -the [installation docs](https://kured.dev/docs/installation/) as well. +For that, run the following: + +`cp .github/kind-cluster-current.yaml .github/kind-cluster-previous.yaml` +`cp .github/kind-cluster-next.yaml .github/kind-cluster-current.yaml` + +Then edit `.github/kind-cluster-next.yaml` to point to the new version. + +This will make the full test matrix updated (the CI and the test code). + +Once your code passes all tests, update the support matrix in +the [installation docs](https://kured.dev/docs/installation/). ### Updating other dependencies @@ -147,6 +154,13 @@ To test your code manually, follow the section Manual testing. 
## Manual (release) testing +### Quick Golang code testing + +Please run `make test` to run only the basic tests. It gives a good +idea of the code behaviour. + +### Manual functional testing + Before `kured` is released, we want to make sure it still works fine on the previous, current and next minor version of Kubernetes (with respect to the `client-go` & `kubectl` dependencies in use). For local testing e.g. @@ -162,11 +176,7 @@ results, if you login to a node and run: sudo touch /var/run/reboot-required ``` -### Example of golang testing - -Please run `make test`. You should have `golint` installed. - -### Example of testing with `minikube` +### Example of functional testing with `minikube` A test-run with `minikube` could look like this: @@ -214,6 +224,15 @@ kind create cluster --config .github/kind-cluster-.yaml ``` +### Example of testing with `kind` and `make` + +A test-run with `kind` and `make` can be done with the following command: + +```cli +# Build kured:dev image, build manifests, and run the "long" go tests +make e2e-test +``` + ## Publishing a new kured release ### Prepare Documentation diff --git a/Makefile b/Makefile index 378032d..ddba7a6 100644 --- a/Makefile +++ b/Makefile @@ -1,23 +1,25 @@ .DEFAULT: all .PHONY: all clean image minikube-publish manifest test kured-all -TEMPDIR=./.tmp -GORELEASER_CMD=$(TEMPDIR)/goreleaser +HACKDIR=./hack/bin +GORELEASER_CMD=$(HACKDIR)/goreleaser DH_ORG ?= kubereboot VERSION=$(shell git rev-parse --short HEAD) SUDO=$(shell docker info >/dev/null 2>&1 || echo "sudo -E") all: image -$(TEMPDIR): - mkdir -p $(TEMPDIR) +$(HACKDIR): + mkdir -p $(HACKDIR) .PHONY: bootstrap-tools -bootstrap-tools: $(TEMPDIR) - VERSION=v1.24.0 TMPDIR=.tmp bash .github/scripts/goreleaser-install.sh - curl -sSfL https://raw.githubusercontent.com/anchore/syft/main/install.sh | sh -s -- -b .tmp v1.0.1 - curl -sSfL https://github.com/sigstore/cosign/releases/download/v2.2.3/cosign-linux-amd64 -o .tmp/cosign - chmod +x .tmp/goreleaser 
.tmp/cosign .tmp/syft +bootstrap-tools: $(HACKDIR) + command -v $(HACKDIR)/goreleaser || VERSION=v1.24.0 TMPDIR=$(HACKDIR) bash hack/installers/goreleaser-install.sh + command -v $(HACKDIR)/syft || curl -sSfL https://raw.githubusercontent.com/anchore/syft/main/install.sh | sh -s -- -b $(HACKDIR) v1.0.1 + command -v $(HACKDIR)/cosign || curl -sSfL https://github.com/sigstore/cosign/releases/download/v2.2.3/cosign-linux-amd64 -o $(HACKDIR)/cosign + command -v $(HACKDIR)/shellcheck || (curl -sSfL https://github.com/koalaman/shellcheck/releases/download/stable/shellcheck-stable.linux.x86_64.tar.xz | tar -J -v -x shellcheck-stable/shellcheck && mv shellcheck-stable/shellcheck $(HACKDIR)/shellcheck && rmdir shellcheck-stable) + chmod +x $(HACKDIR)/goreleaser $(HACKDIR)/cosign $(HACKDIR)/syft $(HACKDIR)/shellcheck + # go install honnef.co/go/tools/cmd/staticcheck@latest clean: rm -rf ./dist @@ -35,7 +37,23 @@ kured-release-snapshot: $(GORELEASER_CMD) release --clean --snapshot image: kured - $(SUDO) docker buildx build --load -t ghcr.io/$(DH_ORG)/kured:$(VERSION) . + $(SUDO) docker buildx build --no-cache --load -t ghcr.io/$(DH_ORG)/kured:$(VERSION) . + +dev-image: image + $(SUDO) docker tag ghcr.io/$(DH_ORG)/kured:$(VERSION) kured:dev + +dev-manifest: + # basic e2e scenario + sed -e "s#image: ghcr.io/.*kured.*#image: kured:dev#g" -e 's/#\(.*\)--period=1h/\1--period=20s/g' kured-ds.yaml > tests/kind/testfiles/kured-ds.yaml + # signal e2e scenario + sed -e "s#image: ghcr.io/.*kured.*#image: kured:dev#g" -e 's/#\(.*\)--period=1h/\1--period=20s/g' kured-ds-signal.yaml > tests/kind/testfiles/kured-ds-signal.yaml + # concurrency e2e scenario + sed -e "s#image: ghcr.io/.*kured.*#image: kured:dev#g" -e 's/#\(.*\)--period=1h/\1--period=20s/g' -e 's/#\(.*\)--concurrency=1/\1--concurrency=2/g' kured-ds.yaml > tests/kind/testfiles/kured-ds-concurrent.yaml + + +e2e-test: dev-manifest dev-image + echo "Running ALL go tests" + go test -count=1 -v --parallel 4 ./... 
$(ARGS) minikube-publish: image $(SUDO) docker save ghcr.io/$(DH_ORG)/kured | (eval $$(minikube docker-env) && docker load) @@ -45,10 +63,9 @@ manifest: sed -i "s#image: ghcr.io/.*kured.*#image: ghcr.io/$(DH_ORG)/kured:$(VERSION)#g" kured-ds-signal.yaml echo "Please generate combined manifest if necessary" -test: - echo "Running go tests" - go test ./... - echo "Running golint on pkg" - golint ./pkg/... - echo "Running golint on cmd" - golint ./cmd/... +test: bootstrap-tools + echo "Running short go tests" + go test -test.short -json ./... > test.json + echo "Running shellcheck" + find . -name '*.sh' | xargs -n1 $(HACKDIR)/shellcheck + # Need to add staticcheck to replace golint as golint is deprecated, and staticcheck is the recommendation diff --git a/.github/scripts/goreleaser-install.sh b/hack/installers/goreleaser-install.sh similarity index 100% rename from .github/scripts/goreleaser-install.sh rename to hack/installers/goreleaser-install.sh diff --git a/tests/kind/create-reboot-sentinels.sh b/tests/kind/create-reboot-sentinels.sh deleted file mode 100755 index 51bd127..0000000 --- a/tests/kind/create-reboot-sentinels.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/usr/bin/env bash - -# USE KUBECTL_CMD to pass context and/or namespaces. -KUBECTL_CMD="${KUBECTL_CMD:-kubectl}" -SENTINEL_FILE="${SENTINEL_FILE:-/var/run/reboot-required}" - -echo "Creating reboot sentinel on worker nodes" - -# To speed up the system, let's not kill the control plane. 
-for nodename in $("$KUBECTL_CMD" get nodes -o name | grep -v control-plane); do - docker exec "${nodename/node\//}" hostname - docker exec "${nodename/node\//}" touch "${SENTINEL_FILE}" -done diff --git a/tests/kind/main_test.go b/tests/kind/main_test.go new file mode 100644 index 0000000..0a13c00 --- /dev/null +++ b/tests/kind/main_test.go @@ -0,0 +1,336 @@ +package kind + +import ( + "bytes" + "fmt" + "math/rand" + "os/exec" + "strconv" + "testing" + "time" +) + +const ( + kuredDevImage string = "kured:dev" +) + +// KindTest cluster deployed by each TestMain function, prepared to run a given test scenario. +type KindTest struct { + kindConfigPath string + clusterName string + timeout time.Duration + deployManifests []string + localImages []string + logsDir string + logBuffer bytes.Buffer + testInstance *testing.T // Maybe move this to testing.TB +} + +func (k *KindTest) Write(p []byte) (n int, err error) { + k.testInstance.Helper() + k.logBuffer.Write(p) + return len(p), nil +} + +func (k *KindTest) FlushLog() { + k.testInstance.Helper() + k.testInstance.Log(k.logBuffer.String()) + k.logBuffer.Reset() +} + +func (k *KindTest) RunCmd(cmdDetails ...string) error { + cmd := exec.Command(cmdDetails[0], cmdDetails[1:]...) + // by making KindTest a Writer, we can simply wire k to logs + // writing to k will write to proper logs. + cmd.Stdout = k + cmd.Stderr = k + + err := cmd.Run() + if err != nil { + return err + } + return nil +} + +// Option that can be passed to the NewKind function in order to change the configuration +// of the test cluster +type Option func(k *KindTest) + +// Deploy can be passed to NewKind to deploy extra components, in addition to the base deployment. +func Deploy(manifest string) Option { + return func(k *KindTest) { + k.deployManifests = append(k.deployManifests, manifest) + } +} + +// ExportLogs can be passed to NewKind to specify the folder where the kubernetes logs will be exported after the tests. 
+func ExportLogs(folder string) Option { + return func(k *KindTest) { + k.logsDir = folder + } +} + +// Timeout for long-running operations (e.g. deployments, readiness probes...) +func Timeout(t time.Duration) Option { + return func(k *KindTest) { + k.timeout = t + } +} + +// LocalImage is passed to NewKind to allow loading a local Docker image into the cluster +func LocalImage(nameTag string) Option { + return func(k *KindTest) { + k.localImages = append(k.localImages, nameTag) + } +} + +// NewKind creates a kind cluster given a name and set of Option instances. +func NewKindTester(kindClusterName string, filePath string, t *testing.T, options ...Option) *KindTest { + + k := &KindTest{ + clusterName: kindClusterName, + timeout: 10 * time.Minute, + kindConfigPath: filePath, + testInstance: t, + } + for _, option := range options { + option(k) + } + return k +} + +// Prepare the kind cluster. +func (k *KindTest) Create() error { + err := k.RunCmd("kind", "create", "cluster", "--name", k.clusterName, "--config", k.kindConfigPath) + + if err != nil { + return fmt.Errorf("failed to create cluster: %v", err) + } + + for _, img := range k.localImages { + if err := k.RunCmd("kind", "load", "docker-image", "--name", k.clusterName, img); err != nil { + return fmt.Errorf("failed to load image: %v", err) + } + } + for _, mf := range k.deployManifests { + kubectlContext := fmt.Sprintf("kind-%v", k.clusterName) + if err := k.RunCmd("kubectl", "--context", kubectlContext, "apply", "-f", mf); err != nil { + return fmt.Errorf("failed to deploy manifest: %v", err) + } + } + return nil +} + +func (k *KindTest) Destroy() error { + if k.logsDir != "" { + if err := k.RunCmd("kind", "export", "logs", k.logsDir, "--name", k.clusterName); err != nil { + return fmt.Errorf("failed to export logs: %v. 
will not teardown", err) + } + } + + if err := k.RunCmd("kind", "delete", "cluster", "--name", k.clusterName); err != nil { + return fmt.Errorf("failed to destroy cluster: %v", err) + } + return nil +} + +func TestE2EWithCommand(t *testing.T) { + t.Parallel() + if testing.Short() { + t.Skip("skipping test in short mode.") + } + + var kindClusterConfigs = []string{ + "previous", + "current", + "next", + } + // Iterate over each Kubernetes version + for _, version := range kindClusterConfigs { + version := version + // Define a subtest for each combination + t.Run(version, func(t *testing.T) { + t.Parallel() // Allow tests to run in parallel + + randomInt := fmt.Sprintf(strconv.Itoa(rand.Intn(100))) + kindClusterName := fmt.Sprintf("kured-e2e-command-%v-%v", version, randomInt) + kindClusterConfigFile := fmt.Sprintf("../../.github/kind-cluster-%v.yaml", version) + kindContext := fmt.Sprintf("kind-%v", kindClusterName) + + k := NewKindTester(kindClusterName, kindClusterConfigFile, t, LocalImage(kuredDevImage), Deploy("../../kured-rbac.yaml"), Deploy("testfiles/kured-ds.yaml")) + defer k.FlushLog() + + err := k.Create() + if err != nil { + t.Fatalf("Error creating cluster %v", err) + } + defer func(k *KindTest) { + err := k.Destroy() + if err != nil { + t.Fatalf("Error destroying cluster %v", err) + } + }(k) + + k.Write([]byte("Now running e2e tests")) + + if err := k.RunCmd("bash", "testfiles/create-reboot-sentinels.sh", kindContext); err != nil { + t.Fatalf("failed to create sentinels: %v", err) + } + + if err := k.RunCmd("bash", "testfiles/follow-coordinated-reboot.sh", kindContext); err != nil { + t.Fatalf("failed to follow reboot: %v", err) + } + }) + } +} + +func TestE2EWithSignal(t *testing.T) { + t.Parallel() + if testing.Short() { + t.Skip("skipping test in short mode.") + } + + var kindClusterConfigs = []string{ + "previous", + "current", + "next", + } + // Iterate over each Kubernetes version + for _, version := range kindClusterConfigs { + version := 
version + // Define a subtest for each combination + t.Run(version, func(t *testing.T) { + t.Parallel() // Allow tests to run in parallel + + randomInt := fmt.Sprintf(strconv.Itoa(rand.Intn(100))) + kindClusterName := fmt.Sprintf("kured-e2e-signal-%v-%v", version, randomInt) + kindClusterConfigFile := fmt.Sprintf("../../.github/kind-cluster-%v.yaml", version) + kindContext := fmt.Sprintf("kind-%v", kindClusterName) + + k := NewKindTester(kindClusterName, kindClusterConfigFile, t, LocalImage(kuredDevImage), Deploy("../../kured-rbac.yaml"), Deploy("testfiles/kured-ds-signal.yaml")) + defer k.FlushLog() + + err := k.Create() + if err != nil { + t.Fatalf("Error creating cluster %v", err) + } + defer func(k *KindTest) { + err := k.Destroy() + if err != nil { + t.Fatalf("Error destroying cluster %v", err) + } + }(k) + + k.Write([]byte("Now running e2e tests")) + + if err := k.RunCmd("bash", "testfiles/create-reboot-sentinels.sh", kindContext); err != nil { + t.Fatalf("failed to create sentinels: %v", err) + } + + if err := k.RunCmd("bash", "testfiles/follow-coordinated-reboot.sh", kindContext); err != nil { + t.Fatalf("failed to follow reboot: %v", err) + } + }) + } +} + +func TestE2EConcurrentWithCommand(t *testing.T) { + t.Parallel() + if testing.Short() { + t.Skip("skipping test in short mode.") + } + + var kindClusterConfigs = []string{ + "previous", + "current", + "next", + } + // Iterate over each Kubernetes version + for _, version := range kindClusterConfigs { + version := version + // Define a subtest for each combination + t.Run(version, func(t *testing.T) { + t.Parallel() // Allow tests to run in parallel + + randomInt := fmt.Sprintf(strconv.Itoa(rand.Intn(100))) + kindClusterName := fmt.Sprintf("kured-e2e-concurrentcommand-%v-%v", version, randomInt) + kindClusterConfigFile := fmt.Sprintf("../../.github/kind-cluster-%v.yaml", version) + kindContext := fmt.Sprintf("kind-%v", kindClusterName) + + k := NewKindTester(kindClusterName, kindClusterConfigFile, t, 
LocalImage(kuredDevImage), Deploy("../../kured-rbac.yaml"), Deploy("testfiles/kured-ds-concurrent.yaml")) + defer k.FlushLog() + + err := k.Create() + if err != nil { + t.Fatalf("Error creating cluster %v", err) + } + defer func(k *KindTest) { + err := k.Destroy() + if err != nil { + t.Fatalf("Error destroying cluster %v", err) + } + }(k) + + k.Write([]byte("Now running e2e tests")) + + if err := k.RunCmd("bash", "testfiles/create-reboot-sentinels.sh", kindContext); err != nil { + t.Fatalf("failed to create sentinels: %v", err) + } + + if err := k.RunCmd("bash", "testfiles/follow-coordinated-reboot.sh", kindContext); err != nil { + t.Fatalf("failed to follow reboot: %v", err) + } + }) + } +} + +func TestCordonningIsKept(t *testing.T) { + t.Parallel() + if testing.Short() { + t.Skip("skipping test in short mode.") + } + + var kindClusterConfigs = []string{ + "concurrency1", + "concurrency2", + } + // Iterate over each Kubernetes version + for _, version := range kindClusterConfigs { + version := version + // Define a subtest for each combination + t.Run(version, func(t *testing.T) { + t.Parallel() // Allow tests to run in parallel + + randomInt := fmt.Sprintf(strconv.Itoa(rand.Intn(100))) + kindClusterName := fmt.Sprintf("kured-e2e-cordon-%v-%v", version, randomInt) + kindClusterConfigFile := fmt.Sprintf("../../.github/kind-cluster-next.yaml") + kindContext := fmt.Sprintf("kind-%v", kindClusterName) + + var manifest string + if version == "concurrency1" { + manifest = fmt.Sprintf("testfiles/kured-ds.yaml") + } else { + manifest = fmt.Sprintf("testfiles/kured-ds-concurrent.yaml") + } + k := NewKindTester(kindClusterName, kindClusterConfigFile, t, LocalImage(kuredDevImage), Deploy("../../kured-rbac.yaml"), Deploy(manifest)) + defer k.FlushLog() + + err := k.Create() + if err != nil { + t.Fatalf("Error creating cluster %v", err) + } + defer func(k *KindTest) { + err := k.Destroy() + if err != nil { + t.Fatalf("Error destroying cluster %v", err) + } + }(k) + + 
k.Write([]byte("Now running e2e tests")) + + if err := k.RunCmd("bash", "testfiles/node-stays-as-cordonned.sh", kindContext); err != nil { + t.Fatalf("node did not reboot in time: %v", err) + } + }) + } +} diff --git a/tests/kind/testfiles/create-reboot-sentinels.sh b/tests/kind/testfiles/create-reboot-sentinels.sh new file mode 100755 index 0000000..016f201 --- /dev/null +++ b/tests/kind/testfiles/create-reboot-sentinels.sh @@ -0,0 +1,11 @@ +#!/usr/bin/env bash + +kubectl_flags=( ) +[[ "$1" != "" ]] && kubectl_flags=("${kubectl_flags[@]}" --context "$1") + +# To speed up the system, let's not kill the control plane. +for nodename in $(${KUBECTL_CMD:-kubectl} "${kubectl_flags[@]}" get nodes -o name | grep -v control-plane); do + echo "Creating reboot sentinel on $nodename" + docker exec "${nodename/node\//}" hostname + docker exec "${nodename/node\//}" touch "${SENTINEL_FILE:-/var/run/reboot-required}" +done diff --git a/tests/kind/follow-coordinated-reboot.sh b/tests/kind/testfiles/follow-coordinated-reboot.sh similarity index 75% rename from tests/kind/follow-coordinated-reboot.sh rename to tests/kind/testfiles/follow-coordinated-reboot.sh index 4559ec1..fd25c36 100755 --- a/tests/kind/follow-coordinated-reboot.sh +++ b/tests/kind/testfiles/follow-coordinated-reboot.sh @@ -1,11 +1,14 @@ #!/usr/bin/env bash -NODECOUNT=${NODECOUNT:-2} -KUBECTL_CMD="${KUBECTL_CMD:-kubectl}" +REBOOTCOUNT=${REBOOTCOUNT:-2} # By default we only create two sentinels in create-reboot-sentinels. DEBUG="${DEBUG:-false}" CONTAINER_NAME_FORMAT=${CONTAINER_NAME_FORMAT:-"chart-testing-*"} +kubectl_flags=( ) +[[ "$1" != "" ]] && kubectl_flags=("${kubectl_flags[@]}" --context "$1") + tmp_dir=$(mktemp -d -t kured-XXXX) + function gather_logs_and_cleanup { if [[ -f "$tmp_dir"/node_output ]]; then rm "$tmp_dir"/node_output @@ -18,15 +21,15 @@ function gather_logs_and_cleanup { # This is useful to see if containers have crashed. 
echo "docker ps -a:" docker ps -a - echo "docker journal logs" - journalctl -u docker --no-pager + echo "docker journal logs" + journalctl -u docker --no-pager # This is useful to see if the nodes have _properly_ rebooted. # It should show the reboot/two container starts per node. - for name in $(docker ps -a -f "name=${CONTAINER_NAME_FORMAT}" -q); do + for id in $(docker ps -a -q); do echo "############################################################" - echo "docker logs for container $name:" - docker logs "$name" + echo "docker logs for container $id:" + docker logs "$id" done fi @@ -42,23 +45,18 @@ attempt_num=1 # Get docker info of each of those kind containers. If one has crashed, restart it. set +o errexit -echo "There are $NODECOUNT nodes in the cluster" -until [ ${#was_unschedulable[@]} == "$NODECOUNT" ] && [ ${#has_recovered[@]} == "$NODECOUNT" ] +echo "There are $REBOOTCOUNT nodes total needing reboot in the cluster" +until [ ${#was_unschedulable[@]} == "$REBOOTCOUNT" ] && [ ${#has_recovered[@]} == "$REBOOTCOUNT" ] do echo "${#was_unschedulable[@]} nodes were removed from pool once:" "${!was_unschedulable[@]}" echo "${#has_recovered[@]} nodes removed from the pool are now back:" "${!has_recovered[@]}" - #"$KUBECTL_CMD" logs -n kube-system -l name=kured --ignore-errors > "$tmp_dir"/node_output - #if [[ "$DEBUG" == "true" ]]; then - # echo "Kured pod logs:" - # cat "$tmp_dir"/node_output - #fi - "$KUBECTL_CMD" get nodes -o custom-columns=NAME:.metadata.name,SCHEDULABLE:.spec.unschedulable --no-headers | grep -v control-plane > "$tmp_dir"/node_output + ${KUBECTL_CMD:-kubectl} "${kubectl_flags[@]}" get nodes -o custom-columns=NAME:.metadata.name,SCHEDULABLE:.spec.unschedulable --no-headers | grep -v control-plane > "$tmp_dir"/node_output if [[ "$DEBUG" == "true" ]]; then # This is useful to see if a node gets stuck after drain, and doesn't # come back up. - echo "Result of command $KUBECTL_CMD get nodes ... 
showing unschedulable nodes:" + echo "Result of command kubectl unschedulable nodes:" cat "$tmp_dir"/node_output fi @@ -81,7 +79,7 @@ do done < "$tmp_dir"/node_output - if [[ "${#has_recovered[@]}" == "$NODECOUNT" ]]; then + if [[ "${#has_recovered[@]}" == "$REBOOTCOUNT" ]]; then echo "All nodes recovered." break else
diff --git a/tests/kind/testfiles/node-stays-as-cordonned.sh b/tests/kind/testfiles/node-stays-as-cordonned.sh
new file mode 100755
--- /dev/null
+++ b/tests/kind/testfiles/node-stays-as-cordonned.sh
@@ -0,0 +1,109 @@
+#!/usr/bin/env bash
+#
+# Checks that a node cordoned before kured reboots it is still cordoned once
+# it is back, while a node that was not cordoned comes back schedulable.
+#
+# Usage: node-stays-as-cordonned.sh [kube-context]
+# Environment:
+#   KUBECTL_CMD    kubectl command to use (default: kubectl)
+#   SENTINEL_FILE  sentinel path inside each node (default: /var/run/reboot-required)
+#   WAIT_TIMEOUT   seconds each wait may last before the test fails (default: 600)
+
+# Fail on the first error instead of reporting success regardless. nounset
+# stays off: expanding the empty kubectl_flags array would abort on bash < 4.4.
+set -o errexit -o pipefail
+
+kubectl_flags=( )
+if [[ "$1" != "" ]]; then
+  kubectl_flags+=(--context "$1")
+fi
+
+sentinel_file="${SENTINEL_FILE:-/var/run/reboot-required}"
+wait_timeout="${WAIT_TIMEOUT:-600}"
+
+# Runs kubectl (or KUBECTL_CMD) with the selected context flags.
+run_kubectl() {
+  # KUBECTL_CMD is kept unquoted, as before, so a multi-word command still works.
+  # shellcheck disable=SC2086
+  ${KUBECTL_CMD:-kubectl} "${kubectl_flags[@]}" "$@"
+}
+
+# Aborts the test when the wait started at $1 (a $SECONDS value) has lasted
+# wait_timeout seconds or more; $2 describes what was being waited for.
+fail_if_timed_out() {
+  local started=$1 what=$2
+  if (( SECONDS - started >= wait_timeout )); then
+    echo "Timed out after ${wait_timeout}s waiting for ${what}" >&2
+    exit 1
+  fi
+}
+
+cordon() {
+  run_kubectl cordon "${precordonned_node}"
+}
+
+create_sentinel() {
+  docker exec "${precordonned_node}" touch "${sentinel_file}"
+  docker exec "${notcordonned_node}" touch "${sentinel_file}"
+}
+
+# Waits until the sentinel can no longer be seen on the pre-cordoned node.
+check_reboot_required() {
+  local started=$SECONDS
+  while docker exec "${precordonned_node}" stat "${sentinel_file}" > /dev/null; do
+    echo "Reboot still required"
+    fail_if_timed_out "$started" "${precordonned_node} to reboot"
+    sleep 3
+  done
+}
+
+# Polls node $1 until its STATUS column equals $2.
+wait_for_node_status() {
+  local node=$1 expected=$2 started=$SECONDS result
+  while true; do
+    # A failing kubectl call just yields an empty status and another attempt.
+    result=$(run_kubectl get node "${node}" --no-headers | awk '{print $2;}') || result=""
+    if [[ "${result}" == "${expected}" ]]; then
+      return 0
+    fi
+    echo "Node ${node} in state ${result}"
+    fail_if_timed_out "$started" "${node} to become ${expected}"
+    sleep 3
+  done
+}
+
+check_node_back_online_as_cordonned() {
+  sleep 5 # For safety, wait for 5 seconds, so that the kubectl command succeeds.
+  # This test might be giving us false positive until we work on reliability of the
+  # test.
+  wait_for_node_status "${precordonned_node}" "Ready,SchedulingDisabled"
+}
+
+check_node_back_online_as_uncordonned() {
+  wait_for_node_status "${notcordonned_node}" "Ready"
+}
+
+### Start main
+
+all_nodes=$(run_kubectl get nodes -o custom-columns=name:metadata.name --no-headers)
+worker_nodes=$(grep worker <<<"$all_nodes" || true)
+precordonned_node=$(head -n 1 <<<"$worker_nodes")
+notcordonned_node=$(tail -n 1 <<<"$worker_nodes")
+
+# One node is cordoned and the other is not, so two distinct workers are needed.
+if [[ -z "${precordonned_node}" || "${precordonned_node}" == "${notcordonned_node}" ]]; then
+  echo "Need at least two worker nodes, found: ${worker_nodes:-none}" >&2
+  exit 1
+fi
+
+# Wait for kured to install correctly
+sleep 15
+cordon
+create_sentinel
+check_reboot_required
+echo "Node has rebooted, but may take time to come back ready"
+check_node_back_online_as_cordonned
+check_node_back_online_as_uncordonned
+echo "Showing final node state"
+run_kubectl get nodes
+echo "Test successful"