* added notification when uncordoning
when reboot & uncordoning is successful -> notification will be sent
* added uncordon message tmpl
added message template for announcing successful uncordoning and reboot.
* added proper documentation about new flag
added readme note about new flag
* Added support for multi-arch image build
* Requested changes to multi-arch build
* Further optimizations of multi build
* multi needs QEMU for some pieces
* change main push for all platforms
* Update Dockerfile to call Makefile
* Remove manual workflow
* feat: update kubernetes dependencies
closes #525
Signed-off-by: Christian Kotzbauer <git@ckotzbauer.de>
* fix: update kind
Signed-off-by: Christian Kotzbauer <git@ckotzbauer.de>
* fix: missed kind-update
Signed-off-by: Christian Kotzbauer <git@ckotzbauer.de>
* build: another kind update
Signed-off-by: Christian Kotzbauer <git@ckotzbauer.de>
* fix: use new toleration
Signed-off-by: Christian Kotzbauer <git@ckotzbauer.de>
* fix: use both tolerations
Signed-off-by: Christian Kotzbauer <git@ckotzbauer.de>
* build: some debugging
Signed-off-by: Christian Kotzbauer <git@ckotzbauer.de>
* revert [skip ci]
Signed-off-by: Christian Kotzbauer <git@ckotzbauer.de>
Terminated pods should be excluded from blocking a reboot, as per https://github.com/weaveworks/kured/issues/227.
This adds status filters to the fieldSelector in order to do that. I've not updated tests here but have successfully tested the exact same filter using kubectl.
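In client-go terms the change boils down to extending the field selector that kured already passes when listing pods on the node. A minimal sketch of that selector (the function name and parameters here are illustrative, not kured's actual code):

```go
package main

import (
	"context"
	"fmt"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
)

// blockingPodCount lists pods on the given node that should block a reboot,
// excluding pods that have already terminated (Succeeded or Failed) -- the
// same filter that can be tested with `kubectl get pods --field-selector=...`.
func blockingPodCount(client kubernetes.Interface, nodeID, labelSelector string) (int, error) {
	fieldSelector := fmt.Sprintf("spec.nodeName=%s,status.phase!=Succeeded,status.phase!=Failed", nodeID)
	pods, err := client.CoreV1().Pods("").List(context.TODO(), metav1.ListOptions{
		LabelSelector: labelSelector, // e.g. a --blocking-pod-selector value
		FieldSelector: fieldSelector,
	})
	if err != nil {
		return 0, err
	}
	return len(pods.Items), nil
}
```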
This commit introduces a new flag '--log-format' that allows a user
to configure json logging on the pods. If the log-format
is not specified, the formatter will default to the existing
text formatter.
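A minimal sketch of how such a flag can be wired up with logrus (the logging library kured uses); the helper name here is illustrative:

```go
package main

import (
	log "github.com/sirupsen/logrus"
)

// configureLogFormat switches the logrus formatter according to --log-format.
// Anything other than "json" falls back to the existing text formatter.
func configureLogFormat(logFormat string) {
	switch logFormat {
	case "json":
		log.SetFormatter(&log.JSONFormatter{})
	default:
		log.SetFormatter(&log.TextFormatter{})
	}
}
```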
In this PR the slack-hook-url is translated
into shoutrrr syntax. Therefore, the slack package,
as well as the checks for slack-hook-url in the
drain and reboot functions, are removed.
Also added a unit test for flagCheck(); this
function also checks the (slack) URL syntax.
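The translation itself is mechanical: a Slack incoming-webhook URL carries three tokens in its path, and shoutrrr's slack service expects those same tokens in `slack://tokenA/tokenB/tokenC` form. A hedged sketch of such a conversion (helper name illustrative, not kured's exact code):

```go
package main

import (
	"fmt"
	"strings"
)

// slackHookToShoutrrr translates a legacy Slack incoming-webhook URL
// (https://hooks.slack.com/services/tokenA/tokenB/tokenC) into the
// equivalent shoutrrr URL (slack://tokenA/tokenB/tokenC).
func slackHookToShoutrrr(hookURL string) (string, error) {
	parts := strings.Split(strings.TrimSuffix(hookURL, "/"), "/services/")
	if len(parts) != 2 || parts[1] == "" {
		return "", fmt.Errorf("unrecognised slack hook URL: %s", hookURL)
	}
	return "slack://" + parts[1], nil
}
```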
Currently, kured issues the system reboot command immediately after
kubectl drain finishes.
This is a problem for processes that need extra time to finish but aren't
running on pods and therefore aren't controlled by kubectl drain (e.g.
de-registering nodes from external load balancers).
This patch solves the problem by introducing a `reboot-delay` command
line argument that can be used to add a delay after kubectl drain
finishes but before the reboot command is issued.
* prometheus labels incl tests
* enable label in main, add log, docs
* revert the option to query by label
* revert the option to query by label
* PromClient instantiated by func, whitespace removal
* revert whitespace fix for readability.
* revert removal of newlines for readability
* rename New to NewPromClient to improve readability
Co-authored-by: simp <simp@saxobank.com>
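For reference, the renamed constructor is roughly shaped like this (a sketch based on the Prometheus client_golang API; field names may differ from the actual kured code):

```go
package alerts

import (
	"github.com/prometheus/client_golang/api"
	v1 "github.com/prometheus/client_golang/api/prometheus/v1"
)

// PromClient wraps the Prometheus query API used to check for active alerts.
type PromClient struct {
	api v1.API
}

// NewPromClient is the renamed constructor: returning the concrete type from a
// named constructor (rather than a bare New) keeps call sites readable.
func NewPromClient(conf api.Config) (*PromClient, error) {
	client, err := api.NewClient(conf)
	if err != nil {
		return nil, err
	}
	return &PromClient{api: v1.NewAPI(client)}, nil
}
```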
shoutrrr now has versioned docs, allowing direct linking to the version that matches the one you use.
Changes should always be backwards compatible, but not the other way around.
Without this, we get multiple questions about our testing.
This should help clarify the tests and our coverage by:
- Simplifying our coverage
- Documenting better the purpose of each workflow file
- Documenting our testing and development activities better.
We are relying on master, which might break anytime (or, in this
case, has moved to another branch).
Instead we should rely on a stable version, and unfreeze if
necessary. Dependabot helps us maintain those releases anyway.
Without this patch, it's not clear that we added command line
arguments recently. This should expose our latest changes in the
future released manifest.
Without this change, the "Test helm chart (install) action" will
rightfully succeed when our helm chart gets installed and has
no syntax issues. However, it doesn't test if kured is properly
installed. For example, the helm chart can try to install a
yet unpublished image, and our test will succeed, as the syntax
is still valid.
This is a problem, as everything looks green, but it's not
effectively working. Our other jobs are focusing on code changes,
so they rightfully override the image tag, which is not what
we want in this "Test helm chart" action.
This fixes it by adding an extra job in the workflow, depending
on the chart testing.
Without this, we can't know if the exposed prometheus metrics
behave properly.
This is a problem, as the only way we can evaluate the success
(right now), is a compilation success or failure from kured.
While this is a good start, it doesn't translate to what we
claim to offer: A boolean showing if a reboot is required.
This fixes it by creating a new github action workflow testing
if the float64 gauge is properly showing 0 for no reboot, 1 for reboot.
This is done by exposing the metrics endpoint through a node port.
A helm chart change was required to have the ability to expose
the service on a node port. We connect to the kind node through
docker in the `tests/test-metrics.sh`, where we curl the nodeport,
extract the only relevant metric, and compare it to the expected result.
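The gauge that the new workflow asserts on is, in essence, a client_golang GaugeVec set to 0 or 1 per node. A minimal sketch (metric name and help text approximate kured's, everything else is illustrative):

```go
package main

import (
	"log"
	"net/http"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promhttp"
)

var rebootRequiredGauge = prometheus.NewGaugeVec(prometheus.GaugeOpts{
	Namespace: "kured",
	Name:      "reboot_required",
	Help:      "OS requires reboot due to software updates.",
}, []string{"node"})

func main() {
	prometheus.MustRegister(rebootRequiredGauge)

	// 1 when the sentinel says a reboot is needed, 0 otherwise.
	rebootRequiredGauge.WithLabelValues("node-a").Set(1)

	http.Handle("/metrics", promhttp.Handler())
	log.Fatal(http.ListenAndServe(":8080", nil))
}
```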
This supports throttling of reboots across the cluster
and allows rebooted nodes to reschedule pods, e.g.
to synchronize replicated state before rebooting the next node.
Without this patch, chart-testing is using the branch named
"master" by default.
This is a problem, as we just renamed our development branch
"main" instead of "master".
This should fix it by pointing to the right branch.
- Make markdownlint happier in a couple of places.
- Rename '*-master-*' files
- Change default branches of some other projects
we rely on. They moved to 'main' as well.
- Standardise version of actions/checkout.
- Update last release in README to 1.6.1.
- Bump chart version.
Eventually closes: #252
Signed-off-by: Daniel Holbach <daniel@weave.works>
Without this patch, the rebootCommand passed to invokeReboot is
ignored, and the command used for reboot is always systemctl reboot.
This is a problem, as we are aiming for flexible commands for this
release.
This fixes it by restoring the previous behaviour before commit
[1] happened.
[1]: 694957d56e
This patch gives the possibility to send notifications
across different technologies. Also, this patch deprecates
slack-hook-url, slack-username and slack-channel
(a warning informs the user).
Also updated the documentation (README).
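With shoutrrr the notification path collapses to a single call that works for Slack and every other supported service; a sketch (the helper name is illustrative, not kured's exact code):

```go
package main

import (
	"fmt"

	"github.com/containrrr/shoutrrr"
	log "github.com/sirupsen/logrus"
)

// notify sends a message through any shoutrrr-supported service
// (slack, teams, email, ...) identified by the --notify-url value.
func notify(notifyURL, messageTemplate, nodeID string) {
	if notifyURL == "" {
		return
	}
	if err := shoutrrr.Send(notifyURL, fmt.Sprintf(messageTemplate, nodeID)); err != nil {
		log.Warnf("Error notifying: %v", err)
	}
}
```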
Without this, go test will rightfully fail.
This is a problem, as we don't have go test enabled, but we want
to have this in the future.
This should fix it.
Without this patch, you cannot configure the reboot
command to use, or use another command to trigger
a reboot.
This is a problem, as multiple users have asked for
it in the past, and we are lacking flexibility.
This fixes it by introducing two new parameters,
- one to provide a custom reboot command.
This should help people running kured on
a non-systemd OS.
- one to provide a custom sentinel command.
This should help people running a non-Ubuntu OS,
as they can directly use their own command instead of
generating a file (useful for CentOS/SUSE).
For this, several refactors had to be done, to
remove global state in some functions. Making those
functions closer to "pure functions" helps us
increase our test coverage here and later.
As commandReboot was very close to rebootCommand,
the function to reboot the node has been renamed
to invokeReboot.
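A rough sketch of what the renamed invokeReboot looks like under these assumptions (the real implementation runs the command in the host's namespaces and handles privileges, which is omitted here):

```go
package main

import (
	"os/exec"

	log "github.com/sirupsen/logrus"
)

// invokeReboot runs the (now configurable) reboot command for the given node.
func invokeReboot(nodeID string, rebootCommand []string) {
	log.Infof("Running command: %s for node: %s", rebootCommand, nodeID)
	if err := exec.Command(rebootCommand[0], rebootCommand[1:]...).Run(); err != nil {
		log.Fatalf("Error invoking reboot command: %v", err)
	}
}
```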
Without this patch, we rely on global state in many functions for
which we check the reboot blockers.
This is a problem, as it's harder to test.
This patch fixes it by refactoring the reboot blockers. This also
includes a first series of unit tests for our main.
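The refactor converges on small blocker values that carry their own dependencies and can be faked in tests; conceptually something like this (a sketch of the intent, not the exact kured types):

```go
package main

// RebootBlocker is anything that can veto a reboot: a Prometheus checker
// holding its own client, a pod checker holding its own clientset, or a
// test fake -- no package-level globals involved.
type RebootBlocker interface {
	Blocked() bool
}

// rebootBlocked reports whether any configured blocker currently vetoes a reboot.
func rebootBlocked(blockers ...RebootBlocker) bool {
	for _, blocker := range blockers {
		if blocker.Blocked() {
			return true
		}
	}
	return false
}
```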
Without this patch, the version of 1.20 is taken in jobs as 1.2.
This is a problem, as it breaks all jobs, because there is no
file to provision a cluster with kubernetes 1.2 (and we shouldn't
do this!)
This fixes it by ensuring there is no mangling of the version
strings, and therefore the right file is used.
DeleteLocalData was deprecated for users of kubectl in 0.20 [1].
At the same time as the deprecation, the relevant code was also
removed [2] without warning: the DeleteLocalData field of the helper
structure was simply renamed to DeleteEmptyDirData, without shims
in the exposed pkg.
This is a problem, as it completely breaks kured.
This should fix it, by using the new field name.
[1]:
56ea9621b7
[2]:
56ea9621b7 (diff-041bdcdedca650a38a8d82cf15ab6f3665b7b84a0fb44a8bb5dcdc5cd944c63d)
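On kured's side the fix is essentially a one-field rename when building the kubectl drain helper; a sketch of the relevant construction (the surrounding options are illustrative defaults, not kured's exact values):

```go
package main

import (
	"os"
	"time"

	"k8s.io/client-go/kubernetes"
	"k8s.io/kubectl/pkg/drain"
)

// newDrainHelper shows the one-field change: DeleteLocalData became
// DeleteEmptyDirData in k8s.io/kubectl 0.20, with no deprecation shim.
func newDrainHelper(client kubernetes.Interface) *drain.Helper {
	return &drain.Helper{
		Client:              client,
		Force:               true,
		IgnoreAllDaemonSets: true,
		DeleteEmptyDirData:  true, // was: DeleteLocalData
		GracePeriodSeconds:  -1,
		Timeout:             time.Minute,
		Out:                 os.Stdout,
		ErrOut:              os.Stderr,
	}
}
```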
Without this patch, go.mod will lag behind for the kubernetes
packages, as it's not automatically tested by dependabot.
We should bump versions with each new minor release of kured.
This should fix it.
adds a new --annotate-nodes daemonset runtime argument, which does the following when enabled:
- adds a new node annotation "weave.works/kured-most-recent-reboot-needed" with a value of the current RFC3339 timestamp as soon as kured identifies that a node needs to be rebooted
- adds a new node annotation "weave.works/kured-reboot-in-progress" with a value of the current RFC3339 timestamp as soon as kured identifies that a node needs to be rebooted
- removes the annotation "weave.works/kured-reboot-in-progress" when kured has successfully rebooted the node
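A minimal sketch of how such an annotation update can be done with client-go (the real code may use a patch instead of a full node update):

```go
package main

import (
	"context"
	"time"

	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
)

// annotateNode adds (or refreshes) one of the kured reboot annotations with an
// RFC3339 timestamp of "now".
func annotateNode(client kubernetes.Interface, node *v1.Node, key string) error {
	if node.Annotations == nil {
		node.Annotations = map[string]string{}
	}
	node.Annotations[key] = time.Now().Format(time.RFC3339)
	_, err := client.CoreV1().Nodes().Update(context.TODO(), node, metav1.UpdateOptions{})
	return err
}
```

For example, kured would set `weave.works/kured-reboot-in-progress` this way just before draining, and remove the key again once the node has rebooted and been uncordoned.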
This changes the pre-reboot drain functionality so that it always runs, regardless of the value of the Unschedulable node property.
Because kubectl drain is idempotent, we shouldn't have to worry about whether the node has already been set to Unschedulable (perhaps due to a prior, unsuccessful loop of the kured reboot cycle): we can run it over and over again. And because this drain func actually does a cordon + drain (and it only performs the drain if a cordon is successful), we can be sure that we aren't going to be thrashing this node w/ respect to scheduled pods.
This also fixes an edge case: if the node has been marked Unschedulable out-of-band, but workloads remain Running on this node, kured will no longer reboot the node's underlying VM/machine while it is actively running pods.
Without this, it's possible that the helm chart documentation
contains the `image tag` version which might not be equal to
the version in the helm chart, as it's only an example.
This is confusing, so instead we should use make to edit the
application version everywhere.
This fixes it by updating the Makefile to modify text of the
chart's README using a regex looking for something similar to
a version; then I used the updated makefile to edit the README,
which in turn requires a bump of the version of the chart
itself.
Without this patch, the name of the image is not templated, which
causes the action to fail.
This should fix it, by ensuring the image scan action uses a
templated value, instead of incorrectly relying on shell templating,
which doesn't run in the action.
Without this patch, we are using outdated images in kind cluster
setup.
This should fix it, by removing 1.17 cluster (which is not tested
anymore), and updating 1.19 images.
Until a new alpine image is created, we should ensure the latest
packages are used, and therefore we should upgrade default
installed packages.
Without this patch, we'll have outdated and vulnerable packages
until a new 3.12 image is released.
This is a problem, as we'll publish broken images.
This should temporarily work around it, at the expense of larger
images (they contain the package cache).
Without this patch, dependabot will still try to bump some k8s
dependencies.
This is a problem, as we need to bump them together, manually.
This should fix it by removing them all from dependabot.
We are now testing the helm charts on each PR. They are now
ensured to be passing our tests and reviewed before merging.
This also means that the merged changes in the master branch
are reliable, and therefore can be consumed immediately.
Currently, we are waiting for a release to publish a helm
chart.
This is a problem as it means that the helm chart will
always lag behind, and we'll miss a few semantic versions,
if for example the helm chart is adapted multiple times
before the next release.
This should fix it by ensuring ALL the merged changes in
our helm chart will result in a new published helm chart.
Without this patch, chart linting will fail: more than two
spaces are needed before a comment in the helm chart values.
This fixes it by adding one more space, and moving the whole block
of comments for consistency.
This ensures we bump the code for 1.20.
It updates the testing to ensure kured works on a 1.20 cluster,
removes the testing on 1.17 (as it is now deprecated).
Libraries remain on 1.19, to avoid breaking 1.18 clusters.
Without this patch, the PR jobs are broken and no jobs are running.
This was a recently introduced typo in the last refactor of the
PR jobs.
This should fix it, and make the PR tests work again.
Without this, the golang version used is the golang version decided
by github.
This is a problem, as it might shift over time, without our control.
This fixes it by getting the golang version from the go.mod.
Without this patch, we'll get kubernetes updates.
This is not necessary, and could be even a problem on merge:
those kubernetes updates are done separately, knowingly,
to respect the life cycle of the kubernetes we need
(and stay one version below latest to have a larger coverage
of versions).
We could keep dependabot to update those on a lower frequency,
but that sounds clunky and not great. Instead disable them all,
and rely on the team to do this regular maintenance work.
There is a lot of duplicated code in this workflow.
This fixes it by making a unique job with parameters. The
matrix buys us the parallelisation and the fail-fast.
Without this patch, the lint action incorrectly returns everything
is fine.
This is a problem, as lint effectively is not running, and
therefore we could merge broken charts.
This fixes it by updating to the latest practices you can find
in the official chart-repo-actions.
(See the official example in
i1a9640d998/.github/workflows/lint-test.yaml)
- Made all the file extensions ".yaml"
- Regrouped actions together to make it easy to see when they
are useful: on-pr is useful at every PR, on-tag when we are
ready to tag next image, on-pr-chart when we have a PR to
modify the chart with the published image, on-release when
we have released and need to publish the final helm chart
- Regrouped periodic jobs together, to deal with stale prs/issues
and ensuring that our helm chart always works.
When a failure happens and the cluster doesn't manage to come
back up in time, we exit 1 and don't show the docker logs.
This is a problem, as we would benefit from a detailed docker
output on those cases, when debugging.
This fixes it by ensuring the logging is always done at the
exit of the script.
We don't need to test with kustomize, manifest testing is good
enough, as we just test that the manifests are correct, not that
they are functional (which would require a change in the poll time).
This extends our test coverage for kured-* manifest changes on PRs,
and any eventual changes in kubernetes/kubectl on periodics.
Signed-off-by: Jean-Philippe Evrard <open-source@a.spamming.party>
In the past, we had lint issues which were merged into the code,
and/or lint changed without us adapting our code.
This should allow us to stay on top of linting issue by
highlighting them in PRs.
Without this patch, we might hold old issues and PRs for a long
time. Instead we should close them. People can reopen if necessary.
This would show that we have a proper triage process, and a proper
way to handle those.
Without this patch, we need to build a package cache and then remove it.
Since apk can work with --no-cache and won't leave artifacts,
we should use it.
This will make the dockle best practices scanner happier.
Without this patch, there is no way we can see, in the development
process, if the image we are about to publish is insecure.
This is a problem as we might be releasing new versions of kured
with outdated base image which contains vulnerabilities.
This fixes it by creating a job which will show any eventual
vulnerability.
This automates the manifest and helm chart version handling.
Just pass the organisation and version in the make command to
update the manifests/helm charts.
This does not automate the helm chart version and does not
create a manifest used in the release process.
Without this patch, we don't test on release whether kured actually
works and behaves well.
This is a problem, as a functional issue could have been hidden by
a recent change, as our testing is minimalist (only test the
usability, not the functionality).
Instead of testing manually, we should ensure this in CI.
This fixes it by adding a github action which tests the previously
built artifacts before publishing a release. The job consumes the helm
chart in our code tree (note: this relies on the last released image),
and runs a functional test triggering a coordinated restart of a
whole 5-node cluster deployed with kind, through github actions.
Note: The github action needs to reset docker configuration, else
the reboot of the node (a docker container in kind) will fail.
It will be correctly triggered, but the node will not come back up,
with its systemd log mentioning: "Failed to attach 1 to compat systemd cgroup".
# Stale by default waits for 60 days before marking PR/issues as stale, and closes them after 21 days.
# Do not expire the first issues that would allow the community to grow.
- uses: actions/stale@v5
  with:
    repo-token: ${{ secrets.GITHUB_TOKEN }}
    stale-issue-message: 'This issue was automatically considered stale due to lack of activity. Please update it and/or join our slack channels to promote it, before it automatically closes (in 7 days).'
    stale-pr-message: 'This PR was automatically considered stale due to lack of activity. Please refresh it and/or join our slack channels to highlight it, before it automatically closes (in 7 days).'
    stale-issue-label: 'no-issue-activity'
    stale-pr-label: 'no-pr-activity'
    exempt-issue-labels: 'good first issue,keep'
    days-before-close: 21
check-docs-links:
  name: Check docs for incorrect links
  runs-on: ubuntu-latest
  steps:
    - uses: actions/checkout@v3
    - name: Link Checker
      id: lc
      uses: peter-evans/link-checker@v1
      with:
        args: -r *.md *.yaml */*/*.go -x .cluster.local
    - name: Fail if there were link errors
      run: exit ${{ steps.lc.outputs.exit_code }}
vuln-scan:
  name: Build image and scan it against known vulnerabilities
--drain-grace-period int time in seconds given to each pod to terminate gracefully, if negative, the default value specified in the pod will be used (default: -1)
--skip-wait-for-delete-timeout int when seconds is greater than zero, skip waiting for the pods whose deletion timestamp is older than N seconds while draining a node (default: 0)
--ds-name string name of daemonset on which to place lock (default "kured")
--ds-namespace string namespace containing daemonset on which to place lock (default "kube-system")
--end-time string schedule reboot only before this time of day (default "23:59:59")
--force-reboot bool force a reboot even if the drain is still running (default: false)
--drain-timeout duration timeout after which the drain is aborted (default: 0, infinite time)
-h, --help help for kured
--lock-annotation string annotation in which to record locking node (default "weave.works/kured-node-lock")
--lock-release-delay duration hold lock after reboot by this duration (default: 0, disabled)
--lock-ttl duration expire lock annotation after this duration (default: 0, disabled)
--message-template-uncordon string message template used to notify about a node being successfully uncordoned (default "Node %s rebooted & uncordoned successfully!")
--message-template-drain string message template used to notify about a node being drained (default "Draining node %s")
--message-template-reboot string message template used to notify about a node being rebooted (default "Rebooting node %s")
--notify-url url for reboot notifications (cannot be used together with the --slack-hook-url flag)
--period duration reboot check period (default 1h0m0s)
--prefer-no-schedule-taint string Taint name applied during pending node reboot (to prevent receiving additional pods from other rebooting nodes). Disabled by default. Set e.g. to "weave.works/kured-node-reboot" to enable tainting.
--prometheus-url string Prometheus instance to probe for active alerts
--reboot-command string command to run when a reboot is required by the sentinel (default "/sbin/systemctl reboot")
--reboot-days strings schedule reboot on these days (default [su,mo,tu,we,th,fr,sa])
--reboot-delay duration add a delay after drain finishes but before the reboot command is issued (default 0, no time)
--reboot-sentinel string path to file whose existence signals need to reboot (default "/var/run/reboot-required")
--reboot-sentinel-command string command for which a successful run signals need to reboot (default ""). If non-empty, sentinel file will be ignored.
--slack-channel string slack channel for reboot notifications
--slack-hook-url string slack hook URL for reboot notifications [deprecated in favor of --notify-url]
--slack-username string slack username for reboot notifications (default "kured")
--start-time string schedule reboot only after this time of day (default "0:00")
--time-zone string use this timezone for schedule inputs (default "UTC")
--log-format string log format specified as text or json, defaults to "text"
```
### Reboot Sentinel File & Period
values with `--reboot-sentinel` and `--period`. Each replica of the
daemon uses a random offset derived from the period on startup so that
nodes don't all contend for the lock simultaneously.
### Reboot Sentinel Command
Alternatively, a reboot sentinel command can be used. If a reboot
sentinel command is used, the reboot sentinel file presence will be
ignored. When the command exits with code `0`, kured will assume
that a reboot is required.
For example, if you're using RHEL or its derivatives, you can
set the sentinel command to `sh -c "! needs-restarting --reboothint"`
(by default the command will return `1` if a reboot is required,
so we wrap it in `sh -c` and add `!` to negate the return value).
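Under the hood this check reduces to running the configured command and looking only at its exit status; a hedged Go sketch (kured actually executes the command inside the host's namespaces, which is omitted here):

```go
package main

import (
	"os/exec"

	log "github.com/sirupsen/logrus"
)

// sentinelCommandSignalsReboot returns true when the configured sentinel
// command exits 0, i.e. a reboot is required.
func sentinelCommandSignalsReboot(sentinelCommand []string) bool {
	if err := exec.Command(sentinelCommand[0], sentinelCommand[1:]...).Run(); err != nil {
		// A non-zero exit (or a failure to run) means no reboot is required.
		log.Debugf("Sentinel command returned: %v", err)
		return false
	}
	return true
}
```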
We recommend setting `--slack-username` to be the name of the
environment, e.g. `dev` or `prod`.
Alternatively you can use the `--message-template-drain`, `--message-template-reboot` and `--message-template-uncordon` to customize the text of the message, e.g.
```cli
--message-template-drain="Draining node %s part of *my-cluster* in region *xyz*"
```
Here is the syntax:
- slack: `slack://tokenA/tokenB/tokenC`
(`slack://<USERNAME>@tokenA/tokenB/tokenC` - in case you want to [respect username](https://github.com/weaveworks/kured/issues/482))
(`--slack-hook-url` is deprecated but possible to use)
> NB the `-` at the end of the command is important - it instructs
> `kubectl` to remove that annotation entirely.
### Automatic Unlock
In exceptional circumstances (especially when used with cluster-autoscaler) a node
which holds the lock might be killed, and the annotation would then stay there forever.
Using `--lock-ttl=30m` will allow other nodes to take over if the TTL has expired (in this case 30min) and continue the reboot process.
### Delaying Lock Release
Using `--lock-release-delay=30m` will cause nodes to hold the lock for the specified time frame (in this case 30min) before it is released and the reboot process continues. This can be used to throttle reboots across the cluster.
## Building
Kured now uses [Go
Modules](https://github.com/golang/go/wiki/Modules), so build
instructions vary depending on where you have checked out the
repository:
**Building outside $GOPATH:**
```console
make
```
**Building inside $GOPATH:**
```console
GO111MODULE=on make
```
You can find the current preferred version of Golang in the [go.mod file](go.mod).
If you are interested in contributing code to kured, please take a look at
| `configuration.preferNoScheduleTaint` | Taint name applied during pending node reboot | `""` |
| `configuration.preRebootNodeLabels` | Array of key-value-pairs to add to nodes before cordoning for multiple cli-parameters `--pre-reboot-node-labels` | `[]` |
| `configuration.postRebootNodeLabels` | Array of key-value-pairs to add to nodes after uncordoning for multiple cli-parameters `--post-reboot-node-labels` | `[]` |
| `rbac.create` | Create RBAC roles | `true` |
| `serviceAccount.create` | Create a service account | `true` |
| `serviceAccount.name` | Service account name to create (or use if `serviceAccount.create` is false) | (chart fullname) |
| `containerSecurityContext.allowPrivilegeEscalation` | Enables `allowPrivilegeEscalation` in container-specific security context. If not set, it won't be configured. | |
| `resources` | Resources requests and limits. | `{}` |
| `metrics.create` | Create a ServiceMonitor for prometheus-operator | `false` |
| `metrics.namespace` | The namespace to create the ServiceMonitor in | `""` |
| `metrics.labels` | Additional labels for the ServiceMonitor | `{}` |
| `metrics.interval` | Interval prometheus should scrape the endpoint | `60s` |
| `metrics.scrapeTimeout` | A custom scrapeTimeout for prometheus | `""` |
| `service.create` | Create a Service for the metrics endpoint | `false` |
| `service.name` | Service name for the metrics endpoint | `""` |
| `service.port` | Port of the service to expose | `8080` |
| `service.annotations` | Annotations to apply to the service (eg to add Prometheus annotations) | `{}` |
| `priorityClassName` | Priority Class to be used by the pods | `""` |
| `tolerations` | Tolerations to apply to the daemonset (eg to allow running on master) | `[{"key": "node-role.kubernetes.io/control-plane", "effect": "NoSchedule"}]` for Kubernetes 1.24.0 and greater, otherwise `[{"key": "node-role.kubernetes.io/master", "effect": "NoSchedule"}]`|
| `affinity` | Affinity for the daemonset (ie, restrict which nodes kured runs on) | `{}` |
| `nodeSelector` | Node Selector for the daemonset (ie, restrict which nodes kured runs on) | `{}` |
| `volumeMounts` | Maps of volumes mount to mount | `{}` |
| `volumes` | Maps of volumes to mount | `{}` |
See https://github.com/weaveworks/kured#configuration for the values (not contained in the `configuration` object) that can be passed via `extraArgs`. Note that
```yaml
extraArgs:
  foo: 1
  bar-baz: 2
```
becomes `/usr/bin/kured ... --foo=1 --bar-baz=2`.
## Prometheus Metrics
Kured exposes a single prometheus metric indicating whether a reboot is required or not (see the [kured docs](https://github.com/weaveworks/kured#prometheus-metrics) for details).
# endTime: "" # only reboot before this time of day (default "23:59")
# lockAnnotation: "" # annotation in which to record locking node (default "weave.works/kured-node-lock")
period:"1m"# reboot check period (default 1h0m0s)
# forceReboot: false # force a reboot even if the drain fails or times out (default: false)
# drainGracePeriod: "" # time in seconds given to each pod to terminate gracefully, if negative, the default value specified in the pod will be used (default: -1)
# drainTimeout: "" # timeout after which the drain is aborted (default: 0, infinite time)
# skipWaitForDeleteTimeout: "" # when time is greater than zero, skip waiting for the pods whose deletion timestamp is older than N seconds while draining a node (default: 0)
# prometheusUrl: "" # Prometheus instance to probe for active alerts
# rebootDays: [] # only reboot on these days (default [su,mo,tu,we,th,fr,sa])
# rebootSentinel: "" # path to file whose existence signals need to reboot (default "/var/run/reboot-required")
# rebootSentinelCommand: "" # command for which a successful run signals need to reboot (default ""). If non-empty, sentinel file will be ignored.
# slackChannel: "" # slack channel for reboot notifications
# slackHookUrl: "" # slack hook URL for reboot notifications
tag:""# will default to the appVersion in Chart.yaml
pullPolicy:IfNotPresent
pullSecrets:[]
updateStrategy:RollingUpdate
# requires RollingUpdate updateStrategy
maxUnavailable:1
podAnnotations:{}
dsAnnotations:{}
extraArgs:{}
extraEnvVars:
# - name: slackHookUrl
#   valueFrom:
#     secretKeyRef:
#       name: secret_name
#       key: secret_key
# - name: regularEnvVariable
#   value: 123
configuration:
  lockTtl: 0 # force clean annotation after this amount of time (default 0, disabled)
  alertFilterRegexp: "" # alert names to ignore when checking for active alerts
  alertFiringOnly: false # only consider firing alerts when checking for active alerts
  blockingPodSelector: [] # label selector identifying pods whose presence should prevent reboots
  endTime: "" # only reboot before this time of day (default "23:59")
  lockAnnotation: "" # annotation in which to record locking node (default "weave.works/kured-node-lock")
  period: "" # reboot check period (default 1h0m0s)
  forceReboot: false # force a reboot even if the drain fails or times out (default: false)
  drainGracePeriod: "" # time in seconds given to each pod to terminate gracefully, if negative, the default value specified in the pod will be used (default: -1)
  drainTimeout: "" # timeout after which the drain is aborted (default: 0, infinite time)
  skipWaitForDeleteTimeout: "" # when time is greater than zero, skip waiting for the pods whose deletion timestamp is older than N seconds while draining a node (default: 0)
  prometheusUrl: "" # Prometheus instance to probe for active alerts
  rebootDays: [] # only reboot on these days (default [su,mo,tu,we,th,fr,sa])
  rebootSentinel: "" # path to file whose existence signals need to reboot (default "/var/run/reboot-required")
  rebootSentinelCommand: "" # command for which a successful run signals need to reboot (default ""). If non-empty, sentinel file will be ignored.
  rebootCommand: "/bin/systemctl reboot" # command to run when a reboot is required by the sentinel
  rebootDelay: "" # add a delay after drain finishes but before the reboot command is issued
  slackChannel: "" # slack channel for reboot notifications
  slackHookUrl: "" # slack hook URL for reboot notifications
  slackUsername: "" # slack username for reboot notifications (default "kured")
  notifyUrl: "" # notification URL with the syntax as follows: https://containrrr.dev/shoutrrr/services/overview/
  messageTemplateDrain: "" # slack message template when notifying about a node being drained (default "Draining node %s")
  messageTemplateReboot: "" # slack message template when notifying about a node being rebooted (default "Rebooted node %s")
  startTime: "" # only reboot after this time of day (default "0:00")
  timeZone: "" # time-zone to use (valid zones from "time" golang package)
  annotateNodes: false # enable 'weave.works/kured-reboot-in-progress' and 'weave.works/kured-most-recent-reboot-needed' node annotations to signify kured reboot operations
  lockReleaseDelay: 0 # hold lock after reboot by this amount of time (default 0, disabled)
  preferNoScheduleTaint: "" # Taint name applied during pending node reboot (to prevent receiving additional pods from other rebooting nodes). Disabled by default. Set e.g. to "weave.works/kured-node-reboot" to enable tainting.
  logFormat: "text" # log format specified as text or json, defaults to text
  preRebootNodeLabels: [] # labels to add to nodes before cordoning (default [])
  postRebootNodeLabels: [] # labels to add to nodes after uncordoning (default [])
rbac:
  create: true
serviceAccount:
  create: true
  name:
podSecurityPolicy:
  create: false
containerSecurityContext:
  privileged: true # Give permission to nsenter /proc/1/ns/mnt
  # allowPrivilegeEscalation: true # Needed when using defaultAllowPrivilegedEscalation: false in psp
"Taint name applied during pending node reboot (to prevent receiving additional pods from other rebooting nodes). Disabled by default. Set e.g. to \"weave.works/kured-node-reboot\" to enable tainting.")
"if set, the annotations 'weave.works/kured-reboot-in-progress' and 'weave.works/kured-most-recent-reboot-needed' will be given to nodes undergoing kured reboots")
			// Prefer to not schedule pods onto this node to avoid draining the same pod multiple times.
			preferNoScheduleTaint.Enable()
			continue
		}

		err = drain(client, node)
		if err != nil {
			if !forceReboot {
				log.Errorf("Unable to cordon or drain %s: %v, will release lock and retry cordon and drain before rebooting when lock is next acquired", node.GetName(), err)
				release(lock)
				log.Infof("Performing a best-effort uncordon after failed cordon and drain")
				uncordon(client, node)
				continue
			}
		}

		if rebootDelay > 0 {
			log.Infof("Delaying reboot for %v", rebootDelay)
			time.Sleep(rebootDelay)
		}

		invokeReboot(nodeID, rebootCommand)
		for {
			log.Infof("Waiting for reboot")
			time.Sleep(time.Minute)
		}
	}
}
// buildSentinelCommand creates the shell command line which will need wrapping to escape