Compare commits

..

3 Commits

Author         SHA1        Message                                                  Date
Frank Vissing  6a4dcc8fb3  reboot if forceRebootSentinel exists                     2018-06-01 13:29:49 +02:00
Frank Vissing  989594acbf  Update main.go (fix spelling)                            2018-05-15 14:19:08 +02:00
Frank Vissing  1b5565cbae  adding option to force reboot, ignoring active allerts  2018-05-15 12:40:54 +02:00
59 changed files with 777 additions and 4978 deletions
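The three commits above introduce a second, force-reboot sentinel: if that file exists, kured reboots the node even when active Prometheus alerts would otherwise block it (see the new `--force-reboot-sentinel` flag in the README diff below, defaulting to `/var/run/force-reboot-required`). A minimal sketch of that logic follows; the helper names are assumptions, since the actual `cmd/kured/main.go` change is not shown in this diff:

```go
package main

import (
	"fmt"
	"os"
)

// sentinelExists reports whether a sentinel file is present.
func sentinelExists(path string) bool {
	_, err := os.Stat(path)
	return err == nil
}

// shouldReboot is a hypothetical version of the check these commits add:
// the force sentinel triggers a reboot regardless of active alerts.
func shouldReboot(sentinel, forceSentinel string, activeAlerts bool) bool {
	if sentinelExists(forceSentinel) {
		return true // force sentinel present: ignore active alerts
	}
	return sentinelExists(sentinel) && !activeAlerts
}

func main() {
	fmt.Println(shouldReboot("/var/run/reboot-required", "/var/run/force-reboot-required", true))
}
```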

20
.circleci/config.yml Normal file

@@ -0,0 +1,20 @@
version: 2
jobs:
build:
working_directory: /go/src/github.com/weaveworks/kured
docker:
- image: circleci/golang:1.8
steps:
- checkout
- setup_remote_docker
- run: go get github.com/golang/dep/cmd/dep
- run: dep ensure
- run: make
- deploy:
name: Maybe push master images
command: |
if [ -z "${CIRCLE_TAG}" -a "${CIRCLE_BRANCH}" == "master" ]; then
docker login -u "$DOCKER_USER" -p "$DOCKER_PASS" quay.io
make publish-image
fi

7
.github/ct.yaml vendored

@@ -1,7 +0,0 @@
# See https://github.com/helm/chart-testing#configuration
remote: origin
target-branch: main
chart-dirs:
- charts
chart-repos: []
helm-extra-args: --timeout 600s

.github/dependabot.yml

@@ -1,21 +0,0 @@
version: 2
updates:
# Maintain dependencies for GitHub Actions
- package-ecosystem: "github-actions"
directory: "/"
schedule:
interval: "daily"
# Maintain dependencies for gomod
- package-ecosystem: "gomod"
directory: "/"
schedule:
interval: "daily"
ignore:
- dependency-name: "k8s.io/api"
- dependency-name: "k8s.io/apimachinery"
- dependency-name: "k8s.io/client-go"
- dependency-name: "k8s.io/kubectl"
- package-ecosystem: "docker"
directory: "cmd/kured"
schedule:
interval: "daily"

.github/kind-cluster-1.21.yaml

@@ -1,13 +0,0 @@
kind: Cluster
apiVersion: kind.x-k8s.io/v1alpha4
nodes:
- role: control-plane
image: kindest/node:v1.21.2
- role: control-plane
image: kindest/node:v1.21.2
- role: control-plane
image: kindest/node:v1.21.2
- role: worker
image: kindest/node:v1.21.2
- role: worker
image: kindest/node:v1.21.2

.github/kind-cluster-1.22.yaml

@@ -1,13 +0,0 @@
kind: Cluster
apiVersion: kind.x-k8s.io/v1alpha4
nodes:
- role: control-plane
image: kindest/node:v1.22.4
- role: control-plane
image: kindest/node:v1.22.4
- role: control-plane
image: kindest/node:v1.22.4
- role: worker
image: kindest/node:v1.22.4
- role: worker
image: kindest/node:v1.22.4

.github/kind-cluster-1.23.yaml

@@ -1,13 +0,0 @@
kind: Cluster
apiVersion: kind.x-k8s.io/v1alpha4
nodes:
- role: control-plane
image: "kindest/node:v1.23.0"
- role: control-plane
image: "kindest/node:v1.23.0"
- role: control-plane
image: "kindest/node:v1.23.0"
- role: worker
image: "kindest/node:v1.23.0"
- role: worker
image: "kindest/node:v1.23.0"


@@ -1,19 +0,0 @@
name: Publish helm chart
on:
push:
branches:
- "main"
paths:
- "charts/**"
jobs:
publish-helm-chart:
name: Publish latest chart
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- name: Publish Helm chart
uses: stefanprodan/helm-gh-pages@master
with:
token: ${{ secrets.GITHUB_TOKEN }}
charts_dir: charts


@@ -1,59 +0,0 @@
# We publish every merged commit in the form of an image
# named kured:<branch>-<short tag>
name: Push image of latest main
on:
push:
branches:
- main
jobs:
tag-scan-and-push-final-image:
name: "Build, scan, and publish tagged image"
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- name: Find go version
run: |
GO_VERSION=$(awk '/^go/ {print $2};' go.mod)
echo "::set-output name=version::${GO_VERSION}"
id: awk_gomod
- name: Ensure go version
uses: actions/setup-go@v2
with:
go-version: "${{ steps.awk_gomod.outputs.version }}"
- name: Login to DockerHub
uses: docker/login-action@v1
with:
username: ${{ secrets.DOCKERHUB_USERNAME_WEAVEWORKSKUREDCI }}
password: ${{ secrets.DOCKERHUB_TOKEN_WEAVEWORKSKUREDCI }}
- name: Login to ghcr.io
uses: docker/login-action@v1
with:
registry: ghcr.io
username: weave-ghcr-bot
password: ${{ secrets.KURED_WEAVE_GHCR_BOT_TOKEN }}
- name: Set up QEMU
uses: docker/setup-qemu-action@v1
- name: Set up Docker Buildx
id: buildx
uses: docker/setup-buildx-action@v1
- name: Find current tag version
run: echo "::set-output name=sha_short::$(git rev-parse --short HEAD)"
id: tags
- name: Build image
uses: docker/build-push-action@v2
with:
context: .
file: cmd/kured/Dockerfile.multi
platforms: linux/arm64, linux/amd64
push: true
tags: |
docker.io/${{ GITHUB.REPOSITORY }}:main-${{ steps.tags.outputs.sha_short }}
ghcr.io/${{ GITHUB.REPOSITORY }}:main-${{ steps.tags.outputs.sha_short }}


@@ -1,78 +0,0 @@
# This is just extra testing, for lint checks and basic installation.
# Those can fail earlier than functional tests (shorter tests)
# and give developers feedback sooner if they didn't test themselves.
name: PR - charts
on:
pull_request:
paths:
- "charts/**"
jobs:
# We create two jobs (with a matrix) instead of one to make those parallel.
# We don't need to conditionally check if something has changed, due to github actions
# tackling that for us.
# Fail-fast ensures that if one of those matrix jobs fails, the other one gets cancelled.
test-chart:
name: Test helm chart changes
runs-on: ubuntu-latest
strategy:
fail-fast: true
matrix:
test-action:
- lint
- install
steps:
- name: Checkout
uses: actions/checkout@v2
with:
fetch-depth: "0"
- uses: actions/setup-python@v2
with:
python-version: 3.7
# Helm is already present in github actions, so do not re-install it
- name: Setup chart testing
uses: helm/chart-testing-action@v2.2.0
- name: Create default kind cluster
uses: helm/kind-action@v1.2.0
with:
version: v0.11.0
if: ${{ matrix.test-action == 'install' }}
- name: Run chart tests
run: ct ${{ matrix.test-action }} --config .github/ct.yaml
# This doesn't re-use the ct actions, due to many limitations (auto tear down, no real testing)
deploy-chart:
name: Functional test of helm chart in its current state (needs published image of the helm chart)
runs-on: ubuntu-latest
needs: test-chart
steps:
- uses: actions/checkout@v2
# Default name for helm/kind-action kind clusters is "chart-testing"
- name: Create 1 node kind cluster
uses: helm/kind-action@v1.2.0
with:
version: v0.11.0
- name: Deploy kured on default namespace with its helm chart
run: |
# Documented in official helm doc to live on the edge
curl https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
# Refresh bins
hash -r
helm install kured ./charts/kured/ --set configuration.period=1m --wait
kubectl config set-context kind-chart-testing
kubectl get ds --all-namespaces
kubectl describe ds kured
- name: Test if successful deploy
uses: nick-invision/retry@v2.6.0
with:
timeout_minutes: 10
max_attempts: 10
retry_wait_seconds: 10
# DESIRED CURRENT READY UP-TO-DATE AVAILABLE should all be = to cluster_size
command: "kubectl get ds kured | grep -E 'kured.*1.*1.*1.*1.*1'"


@@ -1,336 +0,0 @@
name: PR
on:
pull_request:
push:
jobs:
pr-gotest:
name: Run go tests
runs-on: ubuntu-18.04
steps:
- name: checkout
uses: actions/checkout@v2
- name: Find go version
run: |
GO_VERSION=$(awk '/^go/ {print $2};' go.mod)
echo "::set-output name=version::${GO_VERSION}"
id: awk_gomod
- name: Ensure go version
uses: actions/setup-go@v2
with:
go-version: "${{ steps.awk_gomod.outputs.version }}"
- name: run tests
run: go test -json ./... > test.json
- name: Annotate tests
if: always()
uses: guyarb/golang-test-annoations@v0.5.0
with:
test-results: test.json
pr-shellcheck:
name: Lint bash code with shellcheck
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- name: Run ShellCheck
uses: bewuethr/shellcheck-action@v2
pr-lint-code:
name: Lint golang code
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- name: Find go version
run: |
GO_VERSION=$(awk '/^go/ {print $2};' go.mod)
echo "::set-output name=version::${GO_VERSION}"
id: awk_gomod
- name: Ensure go version
uses: actions/setup-go@v2
with:
go-version: "${{ steps.awk_gomod.outputs.version }}"
- name: Lint cmd folder
uses: Jerome1337/golint-action@v1.0.2
with:
golint-path: './cmd/...'
- name: Lint pkg folder
uses: Jerome1337/golint-action@v1.0.2
with:
golint-path: './pkg/...'
pr-check-docs-links:
name: Check docs for incorrect links
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- name: Link Checker
id: lc
uses: peter-evans/link-checker@v1
with:
args: -r *.md *.yaml */*/*.go -x .cluster.local
- name: Fail if there were link errors
run: exit ${{ steps.lc.outputs.exit_code }}
# This should not be made a mandatory test
# It is only used to make us aware of any potential security failure, that
# should trigger a bump of the image in build/.
pr-vuln-scan:
name: Build image and scan it against known vulnerabilities
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- name: Find go version
run: |
GO_VERSION=$(awk '/^go/ {print $2};' go.mod)
echo "::set-output name=version::${GO_VERSION}"
id: awk_gomod
- name: Ensure go version
uses: actions/setup-go@v2
with:
go-version: "${{ steps.awk_gomod.outputs.version }}"
- run: make DH_ORG="${{ github.repository_owner }}" VERSION="${{ github.sha }}" image
- uses: Azure/container-scan@v0
with:
image-name: docker.io/${{ github.repository_owner }}/kured:${{ github.sha }}
# This ensures the latest code works with the manifests built from tree.
# It is useful for two things:
# - Test manifests changes (obviously), ensuring they don't break existing clusters
# - Ensure manifests work with the latest versions even with no manifest change
# (compared to helm charts, manifests cannot easily template changes based on versions)
# Helm charts are _trailing_ releases, while manifests are done during development.
e2e-manifests:
name: End-to-End test with kured with code and manifests from HEAD
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
kubernetes:
- "1.21"
- "1.22"
- "1.23"
steps:
- uses: actions/checkout@v2
- name: Find go version
run: |
GO_VERSION=$(awk '/^go/ {print $2};' go.mod)
echo "::set-output name=version::${GO_VERSION}"
id: awk_gomod
- name: Ensure go version
uses: actions/setup-go@v2
with:
go-version: "${{ steps.awk_gomod.outputs.version }}"
- name: Build artifacts
run: |
make DH_ORG="${{ github.repository_owner }}" VERSION="${{ github.sha }}" image
make DH_ORG="${{ github.repository_owner }}" VERSION="${{ github.sha }}" manifest
- name: Workaround "Failed to attach 1 to compat systemd cgroup /actions_job/..." on gh actions
run: |
sudo bash << EOF
cp /etc/docker/daemon.json /etc/docker/daemon.json.old
echo '{}' > /etc/docker/daemon.json
systemctl restart docker || journalctl --no-pager -n 500
systemctl status docker
EOF
# Default name for helm/kind-action kind clusters is "chart-testing"
- name: Create kind cluster with 5 nodes
uses: helm/kind-action@v1.2.0
with:
config: .github/kind-cluster-${{ matrix.kubernetes }}.yaml
version: v0.11.0
- name: Preload previously built images onto kind cluster
run: kind load docker-image docker.io/${{ github.repository_owner }}/kured:${{ github.sha }} --name chart-testing
- name: Do not wait for an hour before detecting the rebootSentinel
run: |
sed -i 's/#\(.*\)--period=1h/\1--period=30s/g' kured-ds.yaml
- name: Install kured with kubectl
run: |
kubectl apply -f kured-rbac.yaml && kubectl apply -f kured-ds.yaml
- name: Ensure kured is ready
uses: nick-invision/retry@v2.6.0
with:
timeout_minutes: 10
max_attempts: 10
retry_wait_seconds: 60
# DESIRED CURRENT READY UP-TO-DATE AVAILABLE should all be = to cluster_size
command: "kubectl get ds -n kube-system kured | grep -E 'kured.*5.*5.*5.*5.*5'"
- name: Create reboot sentinel files
run: |
./tests/kind/create-reboot-sentinels.sh
- name: Follow reboot until success
env:
DEBUG: true
run: |
./tests/kind/follow-coordinated-reboot.sh
scenario-prom-helm:
name: Test prometheus with latest code from HEAD (=overrides image of the helm chart)
runs-on: ubuntu-latest
# only build with oldest and newest supported, it should be good enough.
strategy:
fail-fast: false
matrix:
kubernetes:
- "1.21"
steps:
- uses: actions/checkout@v2
- name: Find go version
run: |
GO_VERSION=$(awk '/^go/ {print $2};' go.mod)
echo "::set-output name=version::${GO_VERSION}"
id: awk_gomod
- name: Ensure go version
uses: actions/setup-go@v2
with:
go-version: "${{ steps.awk_gomod.outputs.version }}"
- name: Build artifacts
run: |
make DH_ORG="${{ github.repository_owner }}" VERSION="${{ github.sha }}" image
make DH_ORG="${{ github.repository_owner }}" VERSION="${{ github.sha }}" helm-chart
- name: Workaround 'Failed to attach 1 to compat systemd cgroup /actions_job/...' on gh actions
run: |
sudo bash << EOF
cp /etc/docker/daemon.json /etc/docker/daemon.json.old
echo '{}' > /etc/docker/daemon.json
systemctl restart docker || journalctl --no-pager -n 500
systemctl status docker
EOF
# Default name for helm/kind-action kind clusters is "chart-testing"
- name: Create 1 node kind cluster
uses: helm/kind-action@v1.2.0
with:
version: v0.11.0
- name: Preload previously built images onto kind cluster
run: kind load docker-image docker.io/${{ github.repository_owner }}/kured:${{ github.sha }} --name chart-testing
- name: Deploy kured on default namespace with its helm chart
run: |
# Documented in official helm doc to live on the edge
curl https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
# Refresh bins
hash -r
helm install kured ./charts/kured/ --wait --values ./charts/kured/ci/prometheus-values.yaml
kubectl config set-context kind-chart-testing
kubectl get ds --all-namespaces
kubectl describe ds kured
- name: Ensure kured is ready
uses: nick-invision/retry@v2.6.0
with:
timeout_minutes: 10
max_attempts: 10
retry_wait_seconds: 60
# DESIRED CURRENT READY UP-TO-DATE AVAILABLE
command: "kubectl get ds kured | grep -E 'kured.*1.*1.*1.*1.*1' "
- name: Get metrics (healthy)
uses: nick-invision/retry@v2.6.0
with:
timeout_minutes: 2
max_attempts: 12
retry_wait_seconds: 5
command: "./tests/kind/test-metrics.sh 0"
- name: Create reboot sentinel files
run: |
./tests/kind/create-reboot-sentinels.sh
- name: Get metrics (need reboot)
uses: nick-invision/retry@v2.6.0
with:
timeout_minutes: 15
max_attempts: 10
retry_wait_seconds: 60
command: "./tests/kind/test-metrics.sh 1"
# TEMPLATE Scenario testing.
# Note: keep in mind that the helm chart's appVersion is overridden to test the HEAD of your branch
# if you `make helm-chart`.
# This will allow you to test your scenario properly and not use an existing image which will not
# contain your feature.
# scenario-<REPLACETHIS>-helm:
# #example: Testing <REPLACETHIS> with helm chart and code from HEAD"
# name: "<REPLACETHIS>"
# runs-on: ubuntu-latest
# strategy:
# fail-fast: false
# # You can define your own kubernetes versions. For example if your helm chart change should behave differently with different kubernetes versions.
# matrix:
# kubernetes:
# - "1.20"
# steps:
# - uses: actions/checkout@v2
# - name: Find go version
# run: |
# GO_VERSION=$(awk '/^go/ {print $2};' go.mod)
# echo "::set-output name=version::${GO_VERSION}"
# id: awk_gomod
# - name: Ensure go version
# uses: actions/setup-go@v2
# with:
# go-version: "${{ steps.awk_gomod.outputs.version }}"
# - name: Build artifacts
# run: |
# make DH_ORG="${{ github.repository_owner }}" VERSION="${{ github.sha }}" image
# make DH_ORG="${{ github.repository_owner }}" VERSION="${{ github.sha }}" helm-chart
#
# - name: "Workaround 'Failed to attach 1 to compat systemd cgroup /actions_job/...' on gh actions"
# run: |
# sudo bash << EOF
# cp /etc/docker/daemon.json /etc/docker/daemon.json.old
# echo '{}' > /etc/docker/daemon.json
# systemctl restart docker || journalctl --no-pager -n 500
# systemctl status docker
# EOF
#
# # Default name for helm/kind-action kind clusters is "chart-testing"
# - name: Create 5 node kind cluster
# uses: helm/kind-action@master
# with:
# config: .github/kind-cluster-${{ matrix.kubernetes }}.yaml
#
# - name: Preload previously built images onto kind cluster
# run: kind load docker-image docker.io/${{ github.repository_owner }}/kured:${{ github.sha }} --name chart-testing
#
# - name: Deploy kured on default namespace with its helm chart
# run: |
# # Documented in official helm doc to live on the edge
# curl https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
# # Refresh bins
# hash -r
# helm install kured ./charts/kured/ --wait --values ./charts/kured/ci/<REPLACETHIS>-values.yaml
# kubectl config set-context kind-chart-testing
# kubectl get ds --all-namespaces
# kubectl describe ds kured
#
# - name: Ensure kured is ready
# uses: nick-invision/retry@v2.6.0
# with:
# timeout_minutes: 10
# max_attempts: 10
# retry_wait_seconds: 60
# # DESIRED CURRENT READY UP-TO-DATE AVAILABLE should all be = 5
# command: "kubectl get ds kured | grep -E 'kured.*5.*5.*5.*5.*5' "
#
# - name: Create reboot sentinel files
# run: |
# ./tests/kind/create-reboot-sentinels.sh
#
# - name: Test <REPLACETHIS>
# env:
# DEBUG: true
# run: |
# <TODO>


@@ -1,63 +0,0 @@
# when we add a tag to the repo, we should publish the kured image to a public repository
# if it's safe.
# It doesn't mean it's ready for release, but at least it's getting us started.
# The next step is to have a PR with the helm chart, to bump the version of the image used
name: Tag repo
on:
push:
tags:
- "*"
jobs:
tag-scan-and-push-final-image:
name: "Build, scan, and publish tagged image"
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- name: Find go version
run: |
GO_VERSION=$(awk '/^go/ {print $2};' go.mod)
echo "::set-output name=version::${GO_VERSION}"
id: awk_gomod
- name: Ensure go version
uses: actions/setup-go@v2
with:
go-version: "${{ steps.awk_gomod.outputs.version }}"
- name: Find current tag version
run: echo "::set-output name=version::${GITHUB_REF#refs/tags/}"
id: tags
- run: |
make DH_ORG="${{ github.repository_owner }}" VERSION="${{ steps.tags.outputs.version }}" image
- uses: Azure/container-scan@v0
with:
image-name: docker.io/${{ github.repository_owner }}/kured:${{ steps.tags.outputs.version }}
- name: Login to DockerHub
uses: docker/login-action@v1
with:
username: ${{ secrets.DOCKERHUB_USERNAME_WEAVEWORKSKUREDCI }}
password: ${{ secrets.DOCKERHUB_TOKEN_WEAVEWORKSKUREDCI }}
- name: Login to ghcr.io
uses: docker/login-action@v1
with:
registry: ghcr.io
username: weave-ghcr-bot
password: ${{ secrets.KURED_WEAVE_GHCR_BOT_TOKEN }}
# - name: Set up QEMU
# uses: docker/setup-qemu-action@v1
- name: Set up Docker Buildx
id: buildx
uses: docker/setup-buildx-action@v1
- name: Build image
uses: docker/build-push-action@v2
with:
context: .
file: cmd/kured/Dockerfile.multi
platforms: linux/arm64, linux/amd64, linux/arm/v7, linux/arm/v6, linux/386
push: true
tags: |
docker.io/${{ GITHUB.REPOSITORY }}:${{ steps.tags.outputs.version }}
ghcr.io/${{ GITHUB.REPOSITORY }}:${{ steps.tags.outputs.version }}


@@ -1,136 +0,0 @@
name: Daily jobs
on:
schedule:
- cron: "30 1 * * *"
jobs:
periodics-gotest:
name: Run go tests
runs-on: ubuntu-18.04
steps:
- name: checkout
uses: actions/checkout@v2
- name: run tests
run: go test -json ./... > test.json
- name: Annotate tests
if: always()
uses: guyarb/golang-test-annoations@v0.5.0
with:
test-results: test.json
periodics-mark-stale:
name: Mark stale issues and PRs
runs-on: ubuntu-latest
steps:
# Stale by default waits for 60 days before marking PR/issues as stale, and closes them after 21 days.
# Do not expire the first issues that would allow the community to grow.
- uses: actions/stale@v4
with:
repo-token: ${{ secrets.GITHUB_TOKEN }}
stale-issue-message: 'This issue was automatically considered stale due to lack of activity. Please update it and/or join our slack channels to promote it, before it automatically closes (in 7 days).'
stale-pr-message: 'This PR was automatically considered stale due to lack of activity. Please refresh it and/or join our slack channels to highlight it, before it automatically closes (in 7 days).'
stale-issue-label: 'no-issue-activity'
stale-pr-label: 'no-pr-activity'
exempt-issue-labels: 'good first issue,keep'
days-before-close: 21
check-docs-links:
name: Check docs for incorrect links
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- name: Link Checker
id: lc
uses: peter-evans/link-checker@v1
with:
args: -r *.md *.yaml */*/*.go -x .cluster.local
- name: Fail if there were link errors
run: exit ${{ steps.lc.outputs.exit_code }}
vuln-scan:
name: Build image and scan it against known vulnerabilities
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- name: Find go version
run: |
GO_VERSION=$(awk '/^go/ {print $2};' go.mod)
echo "::set-output name=version::${GO_VERSION}"
id: awk_gomod
- name: Ensure go version
uses: actions/setup-go@v2
with:
go-version: "${{ steps.awk_gomod.outputs.version }}"
- run: make DH_ORG="${{ github.repository_owner }}" VERSION="${{ github.sha }}" image
- uses: Azure/container-scan@v0
with:
image-name: docker.io/${{ github.repository_owner }}/kured:${{ github.sha }}
deploy-helm:
name: Ensure our currently released helm chart works on all kubernetes versions
runs-on: ubuntu-latest
# only build with oldest and newest supported, it should be good enough.
strategy:
matrix:
kubernetes:
- "1.21"
- "1.22"
- "1.23"
steps:
- uses: actions/checkout@v2
- name: Find go version
run: |
GO_VERSION=$(awk '/^go/ {print $2};' go.mod)
echo "::set-output name=version::${GO_VERSION}"
id: awk_gomod
- name: Ensure go version
uses: actions/setup-go@v2
with:
go-version: "${{ steps.awk_gomod.outputs.version }}"
- name: "Workaround 'Failed to attach 1 to compat systemd cgroup /actions_job/...' on gh actions"
run: |
sudo bash << EOF
cp /etc/docker/daemon.json /etc/docker/daemon.json.old
echo '{}' > /etc/docker/daemon.json
systemctl restart docker || journalctl --no-pager -n 500
systemctl status docker
EOF
# Default name for helm/kind-action kind clusters is "chart-testing"
- name: Create 5 node kind cluster
uses: helm/kind-action@v1.2.0
with:
config: .github/kind-cluster-${{ matrix.kubernetes }}.yaml
version: v0.11.0
- name: Deploy kured on default namespace with its helm chart
run: |
# Documented in official helm doc to live on the edge
curl https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
# Refresh bins
hash -r
helm install kured ./charts/kured/ --set configuration.period=1m
kubectl config set-context kind-chart-testing
kubectl get ds --all-namespaces
kubectl describe ds kured
- name: Ensure kured is ready
uses: nick-invision/retry@v2.6.0
with:
timeout_minutes: 10
max_attempts: 10
retry_wait_seconds: 60
# DESIRED CURRENT READY UP-TO-DATE AVAILABLE should all be = 5
command: "kubectl get ds kured | grep -E 'kured.*5.*5.*5.*5.*5' "
- name: Create reboot sentinel files
run: |
./tests/kind/create-reboot-sentinels.sh
- name: Follow reboot until success
env:
DEBUG: true
run: |
./tests/kind/follow-coordinated-reboot.sh

1
.gitignore vendored

@@ -1,3 +1,4 @@
cmd/kured/kured
cmd/prom-active-alerts/prom-active-alerts
vendor
build

DEVELOPMENT.md

@@ -1,235 +0,0 @@
# Developing `kured`
We love contributions to `kured`, no matter if you are [helping out on
Slack][slack], reporting or triaging [issues][issues] or contributing code
to `kured`.
In any case, it will make sense to familiarise yourself with the main
[README][readme] to understand the different features and options, which is
helpful for testing. The "building" section in particular makes sense if
you are planning to contribute code.
[slack]: README.md#getting-help
[issues]: https://github.com/weaveworks/kured/issues
[readme]: README.md
## Regular development activities
### Updating k8s support
Whenever we want to update e.g. the `kubectl` or `client-go` dependencies,
some RBAC changes might be necessary too.
This is what it took to support Kubernetes 1.14:
<https://github.com/weaveworks/kured/pull/75>
That the process can be more involved than that can be seen in
<https://github.com/weaveworks/kured/commits/support-k8s-1.10>
Please update our .github/workflows with the new k8s images, starting with
the creation of a .github/kind-cluster-<version>.yaml, then update
our workflows with the new versions.
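A minimal sketch of such a file, modelled on the kind-cluster configs shown earlier in this diff (the node image tag is a placeholder to fill in):
```yaml
# .github/kind-cluster-<version>.yaml
kind: Cluster
apiVersion: kind.x-k8s.io/v1alpha4
nodes:
  - role: control-plane
    image: kindest/node:v<version>
  - role: worker
    image: kindest/node:v<version>
```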
Once you updated everything, make sure you update the support matrix on
the main [README][readme] as well.
### Updating other dependencies
Dependabot proposes changes in our go.mod/go.sum.
Some of those changes are covered by CI testing, some are not.
Please make sure to test those not covered by CI (mostly the integration
with other tools) manually before merging.
### Review periodic jobs
We run periodic jobs (see also Automated testing section of this documentation).
Those should be monitored for failures.
If a failure happens in periodics, something terribly wrong must have happened
(or github is failing at the creation of a kind cluster). Please monitor those
failures carefully.
### Introducing new features
When you introduce a new feature, the kured team expects you to have tested
your change thoroughly. If possible, include all the necessary testing in your change.
If your change involves a user facing change (a change in kured's flags, for example),
please expose your new feature in our default manifest (`kured-ds.yaml`)
as a comment.
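A hypothetical example of such a commented-out flag in `kured-ds.yaml`; the `--period` line mirrors the one the PR workflow above un-comments with `sed`:
```yaml
      containers:
        - name: kured
          command:
            - /usr/bin/kured
#            - --period=1h
#            - --my-new-flag=some-value
```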
Do not update the helm chart directly.
Helm charts and our release manifests (see below) are our stable interfaces.
Any user facing changes will therefore have to wait for a while before being
exposed to our users.
This also means that when you expose a new feature, you should create another PR
for your changes in `charts/` to make your feature available for our next kured version.
In this change, you can directly bump the appVersion to the next minor version.
(for example, if current appVersion is 1.6.x, make sure you update your appVersion
to 1.7.0). It allows us to have an easy view of what we land each release.
Do not hesitate to increase the test coverage for your feature, whether it's unit
testing or full functional testing (even using helm charts).
### Increasing test coverage
We welcome any change that increases our test coverage.
See also our github issues for the label `testing`.
### Updating helm charts
Helm charts are continuously published. Any change in `charts/` will be immediately
pushed in production.
## Automated testing
Our CI is covered by github actions.
You can see their contents in .github/workflows.
We currently run:
- go tests and lint
- shellcheck
- a check for dead links in our docs
- a security check against our base image (alpine)
- a deep functional test using our manifests on all supported k8s versions
- basic deployment using our helm chart on any chart change
Changes in helm charts are not functionally tested on PRs. We assume that
the PRs to implement the feature are properly tested by our users and
contributors before merge.
To test your code manually, follow the section Manual testing.
## Manual (release) testing
Before `kured` is released, we want to make sure it still works fine on the
previous, current and next minor version of Kubernetes (with respect to the
`client-go` & `kubectl` dependencies in use). For local testing e.g.
`minikube` or `kind` can be sufficient. This will allow you to catch issues
that might not have been tested in our CI, like integration with other tools,
or your specific use case.
Deploy kured in your test scenario; make sure you pass the right `image` and
update e.g. the `period` and `reboot-days` options so you get immediate
results when you log in to a node and run:
```console
sudo touch /var/run/reboot-required
```
### Example of golang testing
Please run `make test`. You should have golint installed.
### Example of testing with `minikube`
A test-run with `minikube` could look like this:
```console
# start minikube
minikube start --vm-driver kvm2 --kubernetes-version <k8s-release>
# build kured image and publish to registry accessible by minikube
make image minikube-publish
# edit kured-ds.yaml to
# - point to new image
# - change e.g. period and reboot-days option for immediate results
minikube kubectl -- apply -f kured-rbac.yaml
minikube kubectl -- apply -f kured-ds.yaml
minikube kubectl -- logs daemonset.apps/kured -n kube-system -f
# Alternatively use helm to install the chart
# edit values-local.yaml to change any chart parameters
helm install kured ./charts/kured --namespace kube-system -f ./charts/kured/values.minikube.yaml
# In separate terminal
minikube ssh
sudo touch /var/run/reboot-required
minikube logs -f
```
Now check for the 'Commanding reboot' message and minikube going down.
Unfortunately as of today, you are going to run into
<https://github.com/kubernetes/minikube/issues/2874>. This means that
minikube won't come back easily. You will need to start minikube again.
Then you can check for the lock release.
If all the tests ran well, kured maintainers can reach out to the Weaveworks
team to get an upcoming `kured` release tested in the Dev environment for
real life testing.
### Example of testing with `kind`
A test-run with `kind` could look like this:
```console
# create kind cluster
kind create cluster --config .github/kind-cluster-<k8s-version>.yaml
# create reboot required files on pre-defined kind nodes
./tests/kind/create-reboot-sentinels.sh
# check if reboot is working fine
./tests/kind/follow-coordinated-reboot.sh
```
## Publishing a new kured release
### Prepare Documentation
Check that `README.md` has an updated compatibility matrix and that the
url in the `kubectl` incantation (under "Installation") is updated to the
new version you want to release.
### Create a tag on the repo
Before going further, we should freeze the code for a release by
tagging the code. The GitHub Action should start a new job and push
the new image to the registry.
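For example, to freeze the code for `1.3.0` (a version chosen to match the manifest example below):
```sh
git tag 1.3.0
git push origin 1.3.0
```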
### Create the combined manifest
Now create the `kured-<release>-dockerhub.yaml` for e.g. `1.3.0`:
```sh
VERSION=1.3.0
MANIFEST="kured-$VERSION-dockerhub.yaml"
make DH_ORG="weaveworks" VERSION="${VERSION}" manifest
cat kured-rbac.yaml > "$MANIFEST"
cat kured-ds.yaml >> "$MANIFEST"
```
### Publish release artifacts
Now you can head to the Github UI, use the version number as tag and upload the
`kured-<release>-dockerhub.yaml` file.
Please describe what's new and noteworthy in the release notes, list the PRs
that landed and give a shout-out to everyone who contributed.
Please also note down on which releases the upcoming `kured` release was
tested on. (Check old release notes if you're unsure.)
### Update the Helm chart
You can automatically bump the helm chart's application version
with the latest image tag by running:
```sh
make DH_ORG="weaveworks" VERSION="1.3.0" helm-chart
```
A change in the helm chart requires a bump of the `version`
in `charts/kured/Chart.yaml` (following the versioning rules).
Update it, and issue a PR. Upon merge, that PR will automatically
publish the chart to the gh-pages branch.
When there are open helm-chart PRs which are on hold until the helm-chart has been updated
with the new kured version, they can be merged now (unless a rebase is needed from the contributor).

431
Gopkg.lock generated Normal file

@@ -0,0 +1,431 @@
# This file is autogenerated, do not edit; changes may be undone by the next 'dep ensure'.
[[projects]]
name = "github.com/PuerkitoBio/purell"
packages = ["."]
revision = "0bcb03f4b4d0a9428594752bd2a3b9aa0a9d4bd4"
version = "v1.1.0"
[[projects]]
branch = "master"
name = "github.com/PuerkitoBio/urlesc"
packages = ["."]
revision = "bbf7a2afc14f93e1e0a5c06df524fbd75e5031e5"
[[projects]]
name = "github.com/asaskevich/govalidator"
packages = ["."]
revision = "73945b6115bfbbcc57d89b7316e28109364124e1"
version = "v7"
[[projects]]
branch = "master"
name = "github.com/beorn7/perks"
packages = ["quantile"]
revision = "4c0e84591b9aa9e6dcfdf3e020114cd81f89d5f9"
[[projects]]
branch = "master"
name = "github.com/cloudfoundry-incubator/candiedyaml"
packages = ["."]
revision = "99c3df83b51532e3615f851d8c2dbb638f5313bf"
[[projects]]
branch = "master"
name = "github.com/davecgh/go-spew"
packages = ["spew"]
revision = "5215b55f46b2b919f50a1df0eaa5886afe4e3b3d"
[[projects]]
branch = "master"
name = "github.com/docker/distribution"
packages = [
"digest",
"reference"
]
revision = "7365003236ca58bd7fa17ef1459328d13301d7d5"
[[projects]]
branch = "master"
name = "github.com/emicklei/go-restful"
packages = [
".",
"log"
]
revision = "b14c3a95fc27c52959d2eddc85066da3c14bf269"
[[projects]]
name = "github.com/emicklei/go-restful-swagger12"
packages = ["."]
revision = "dcef7f55730566d41eae5db10e7d6981829720f6"
version = "1.0.1"
[[projects]]
branch = "master"
name = "github.com/ghodss/yaml"
packages = ["."]
revision = "aa0c862057666179de291b67d9f093d12b5a8473"
[[projects]]
branch = "master"
name = "github.com/go-openapi/analysis"
packages = ["."]
revision = "8ed83f2ea9f00f945516462951a288eaa68bf0d6"
[[projects]]
branch = "master"
name = "github.com/go-openapi/errors"
packages = ["."]
revision = "03cfca65330da08a5a440053faf994a3c682b5bf"
[[projects]]
branch = "master"
name = "github.com/go-openapi/jsonpointer"
packages = ["."]
revision = "779f45308c19820f1a69e9a4cd965f496e0da10f"
[[projects]]
branch = "master"
name = "github.com/go-openapi/jsonreference"
packages = ["."]
revision = "36d33bfe519efae5632669801b180bf1a245da3b"
[[projects]]
branch = "master"
name = "github.com/go-openapi/loads"
packages = ["."]
revision = "a80dea3052f00e5f032e860dd7355cd0cc67e24d"
[[projects]]
branch = "master"
name = "github.com/go-openapi/spec"
packages = ["."]
revision = "e51c28f07047ad90caff03f6450908720d337e0c"
[[projects]]
branch = "master"
name = "github.com/go-openapi/strfmt"
packages = ["."]
revision = "610b6cacdcde6852f4de68998bd20ce1dac85b22"
[[projects]]
branch = "master"
name = "github.com/go-openapi/swag"
packages = ["."]
revision = "24ebf76d720bab64f62824d76bced3184a65490d"
[[projects]]
branch = "master"
name = "github.com/gogo/protobuf"
packages = [
"proto",
"sortkeys"
]
revision = "e33835a643a970c11ac74f6333f5f6866387a101"
[[projects]]
branch = "master"
name = "github.com/golang/glog"
packages = ["."]
revision = "23def4e6c14b4da8ac2ed8007337bc5eb5007998"
[[projects]]
branch = "master"
name = "github.com/golang/protobuf"
packages = ["proto"]
revision = "2bba0603135d7d7f5cb73b2125beeda19c09f4ef"
[[projects]]
branch = "master"
name = "github.com/google/gofuzz"
packages = ["."]
revision = "fd52762d25a41827db7ef64c43756fd4b9f7e382"
[[projects]]
branch = "master"
name = "github.com/inconshreveable/mousetrap"
packages = ["."]
revision = "76626ae9c91c4f2a10f34cad8ce83ea42c93bb75"
[[projects]]
branch = "master"
name = "github.com/juju/ratelimit"
packages = ["."]
revision = "5b9ff866471762aa2ab2dced63c9fb6f53921342"
[[projects]]
branch = "master"
name = "github.com/mailru/easyjson"
packages = [
"buffer",
"jlexer",
"jwriter"
]
revision = "2af9a745a611440bab0528e5ac19b2805a1c50eb"
[[projects]]
name = "github.com/matttproud/golang_protobuf_extensions"
packages = ["pbutil"]
revision = "3247c84500bff8d9fb6d579d800f20b3e091582c"
version = "v1.0.0"
[[projects]]
branch = "master"
name = "github.com/mitchellh/mapstructure"
packages = ["."]
revision = "d0303fe809921458f417bcf828397a65db30a7e4"
[[projects]]
branch = "master"
name = "github.com/prometheus/client_golang"
packages = [
"api/prometheus",
"prometheus",
"prometheus/promhttp"
]
revision = "5636dc67ae776adf5590da7349e70fbb9559972d"
[[projects]]
branch = "master"
name = "github.com/prometheus/client_model"
packages = ["go"]
revision = "6f3806018612930941127f2a7c6c453ba2c527d2"
[[projects]]
branch = "master"
name = "github.com/prometheus/common"
packages = [
"expfmt",
"internal/bitbucket.org/ww/goautoneg",
"model"
]
revision = "ebdfc6da46522d58825777cf1f90490a5b1ef1d8"
[[projects]]
branch = "master"
name = "github.com/prometheus/procfs"
packages = [
".",
"xfs"
]
revision = "e645f4e5aaa8506fc71d6edbc5c4ff02c04c46f2"
[[projects]]
name = "github.com/sirupsen/logrus"
packages = ["."]
revision = "c155da19408a8799da419ed3eeb0cb5db0ad5dbc"
version = "v1.0.5"
[[projects]]
branch = "master"
name = "github.com/spf13/cobra"
packages = ["."]
revision = "b24564e919247d7c870fe0ed3738c98d8741aca4"
[[projects]]
branch = "master"
name = "github.com/spf13/pflag"
packages = ["."]
revision = "367864438f1b1a3c7db4da06a2f55b144e6784e0"
[[projects]]
branch = "master"
name = "github.com/ugorji/go"
packages = ["codec"]
revision = "3487a5545b3d480987dfb0492035299077fab33a"
[[projects]]
branch = "master"
name = "golang.org/x/crypto"
packages = ["ssh/terminal"]
revision = "beb2a9779c3b677077c41673505f150149fce895"
[[projects]]
branch = "master"
name = "golang.org/x/net"
packages = [
"context",
"context/ctxhttp",
"http2",
"http2/hpack",
"idna"
]
revision = "2a35e686583654a1b89ca79c4ac78cb3d6529ca3"
[[projects]]
branch = "master"
name = "golang.org/x/sys"
packages = [
"unix",
"windows"
]
revision = "3b87a42e500a6dc65dae1a55d0b641295971163e"
[[projects]]
branch = "master"
name = "golang.org/x/text"
packages = [
"internal/gen",
"internal/triegen",
"internal/ucd",
"transform",
"unicode/cldr",
"unicode/norm",
"width"
]
revision = "a9a820217f98f7c8a207ec1e45a874e1fe12c478"
[[projects]]
branch = "master"
name = "gopkg.in/inf.v0"
packages = ["."]
revision = "3887ee99ecf07df5b447e9b00d9c0b2adaa9f3e4"
[[projects]]
branch = "v2"
name = "gopkg.in/mgo.v2"
packages = [
"bson",
"internal/json"
]
revision = "3f83fa5005286a7fe593b055f0d7771a7dce4655"
[[projects]]
branch = "v2"
name = "gopkg.in/yaml.v2"
packages = ["."]
revision = "cd8b52f8269e0feb286dfeef29f8fe4d5b397e0b"
[[projects]]
branch = "release-1.7"
name = "k8s.io/apimachinery"
packages = [
"pkg/api/equality",
"pkg/api/errors",
"pkg/api/meta",
"pkg/api/resource",
"pkg/apimachinery",
"pkg/apimachinery/announced",
"pkg/apimachinery/registered",
"pkg/apis/meta/v1",
"pkg/apis/meta/v1/unstructured",
"pkg/apis/meta/v1alpha1",
"pkg/conversion",
"pkg/conversion/queryparams",
"pkg/conversion/unstructured",
"pkg/fields",
"pkg/labels",
"pkg/openapi",
"pkg/runtime",
"pkg/runtime/schema",
"pkg/runtime/serializer",
"pkg/runtime/serializer/json",
"pkg/runtime/serializer/protobuf",
"pkg/runtime/serializer/recognizer",
"pkg/runtime/serializer/streaming",
"pkg/runtime/serializer/versioning",
"pkg/selection",
"pkg/types",
"pkg/util/clock",
"pkg/util/diff",
"pkg/util/errors",
"pkg/util/framer",
"pkg/util/intstr",
"pkg/util/json",
"pkg/util/net",
"pkg/util/rand",
"pkg/util/runtime",
"pkg/util/sets",
"pkg/util/validation",
"pkg/util/validation/field",
"pkg/util/wait",
"pkg/util/yaml",
"pkg/version",
"pkg/watch",
"third_party/forked/golang/reflect"
]
revision = "8ab5f3d8a330c2e9baaf84e39042db8d49034ae2"
[[projects]]
name = "k8s.io/client-go"
packages = [
"discovery",
"kubernetes",
"kubernetes/scheme",
"kubernetes/typed/admissionregistration/v1alpha1",
"kubernetes/typed/apps/v1beta1",
"kubernetes/typed/authentication/v1",
"kubernetes/typed/authentication/v1beta1",
"kubernetes/typed/authorization/v1",
"kubernetes/typed/authorization/v1beta1",
"kubernetes/typed/autoscaling/v1",
"kubernetes/typed/autoscaling/v2alpha1",
"kubernetes/typed/batch/v1",
"kubernetes/typed/batch/v2alpha1",
"kubernetes/typed/certificates/v1beta1",
"kubernetes/typed/core/v1",
"kubernetes/typed/extensions/v1beta1",
"kubernetes/typed/networking/v1",
"kubernetes/typed/policy/v1beta1",
"kubernetes/typed/rbac/v1alpha1",
"kubernetes/typed/rbac/v1beta1",
"kubernetes/typed/settings/v1alpha1",
"kubernetes/typed/storage/v1",
"kubernetes/typed/storage/v1beta1",
"pkg/api",
"pkg/api/v1",
"pkg/api/v1/ref",
"pkg/apis/admissionregistration",
"pkg/apis/admissionregistration/v1alpha1",
"pkg/apis/apps",
"pkg/apis/apps/v1beta1",
"pkg/apis/authentication",
"pkg/apis/authentication/v1",
"pkg/apis/authentication/v1beta1",
"pkg/apis/authorization",
"pkg/apis/authorization/v1",
"pkg/apis/authorization/v1beta1",
"pkg/apis/autoscaling",
"pkg/apis/autoscaling/v1",
"pkg/apis/autoscaling/v2alpha1",
"pkg/apis/batch",
"pkg/apis/batch/v1",
"pkg/apis/batch/v2alpha1",
"pkg/apis/certificates",
"pkg/apis/certificates/v1beta1",
"pkg/apis/extensions",
"pkg/apis/extensions/v1beta1",
"pkg/apis/networking",
"pkg/apis/networking/v1",
"pkg/apis/policy",
"pkg/apis/policy/v1beta1",
"pkg/apis/rbac",
"pkg/apis/rbac/v1alpha1",
"pkg/apis/rbac/v1beta1",
"pkg/apis/settings",
"pkg/apis/settings/v1alpha1",
"pkg/apis/storage",
"pkg/apis/storage/v1",
"pkg/apis/storage/v1beta1",
"pkg/util",
"pkg/util/parsers",
"pkg/version",
"rest",
"rest/watch",
"tools/clientcmd/api",
"tools/metrics",
"transport",
"util/cert",
"util/flowcontrol",
"util/integer"
]
revision = "d92e8497f71b7b4e0494e5bd204b48d34bd6f254"
version = "v4.0.0"
[solve-meta]
analyzer-name = "dep"
analyzer-version = 1
inputs-digest = "029e6d2251ccbf5acdfc3bc0c36f340dc3a98511b0d7338c3e9bb167e412a155"
solver-name = "gps-cdcl"
solver-version = 1

24
Gopkg.toml Normal file

@@ -0,0 +1,24 @@
[[constraint]]
name = "github.com/sirupsen/logrus"
version = "v1.0.5"
[[constraint]]
branch = "master"
name = "github.com/prometheus/client_golang"
[[constraint]]
branch = "master"
name = "github.com/prometheus/common"
[[constraint]]
branch = "master"
name = "github.com/spf13/cobra"
[[constraint]]
name = "k8s.io/client-go"
version = "v4.0.0"
[[constraint]]
name = "k8s.io/apimachinery"
branch = "release-1.7"

MAINTAINERS

@@ -1,5 +0,0 @@
Christian Kotzbauer <christian.kotzbauer@gmail.com> (@ckotzbauer)
Daniel Holbach <daniel@weave.works> (@dholbach)
Hidde Beydals <hidde@weave.works> (@hiddeco)
Jean-Philippe Evrard <jean-philippe.evrard@suse.com> (@evrardjp)
Jack Francis <jackfrancis@gmail.com> (@jackfrancis)

Makefile

@@ -1,17 +1,17 @@
.DEFAULT: all
.PHONY: all clean image publish-image minikube-publish manifest helm-chart test tests
.PHONY: all clean image publish-image minikube-publish
DH_ORG=weaveworks
VERSION=$(shell git symbolic-ref --short HEAD)-$(shell git rev-parse --short HEAD)
SUDO=$(shell docker info >/dev/null 2>&1 || echo "sudo -E")
all: image
clean:
go clean
rm -f cmd/kured/kured
rm -rf ./build
godeps=$(shell go list -f '{{join .Deps "\n"}}' $1 | grep -v /vendor/ | xargs go list -f '{{if not .Standard}}{{ $$dep := . }}{{range .GoFiles}}{{$$dep.Dir}}/{{.}} {{end}}{{end}}')
godeps=$(shell go get $1 && go list -f '{{join .Deps "\n"}}' $1 | grep -v /vendor/ | xargs go list -f '{{if not .Standard}}{{ $$dep := . }}{{range .GoFiles}}{{$$dep.Dir}}/{{.}} {{end}}{{end}}')
DEPS=$(call godeps,./cmd/kured)
@@ -22,34 +22,13 @@ cmd/kured/kured: cmd/kured/*.go
build/.image.done: cmd/kured/Dockerfile cmd/kured/kured
mkdir -p build
cp $^ build
$(SUDO) docker build -t docker.io/$(DH_ORG)/kured -f build/Dockerfile ./build
$(SUDO) docker tag docker.io/$(DH_ORG)/kured docker.io/$(DH_ORG)/kured:$(VERSION)
$(SUDO) docker tag docker.io/$(DH_ORG)/kured ghcr.io/$(DH_ORG)/kured:$(VERSION)
sudo -E docker build -t quay.io/$(DH_ORG)/kured:$(VERSION) -f build/Dockerfile ./build
touch $@
image: build/.image.done
publish-image: image
$(SUDO) docker push docker.io/$(DH_ORG)/kured:$(VERSION)
$(SUDO) docker push ghcr.io/$(DH_ORG)/kured:$(VERSION)
sudo -E docker push quay.io/$(DH_ORG)/kured:$(VERSION)
minikube-publish: image
$(SUDO) docker save docker.io/$(DH_ORG)/kured | (eval $$(minikube docker-env) && docker load)
manifest:
sed -i "s#image: docker.io/.*kured.*#image: docker.io/$(DH_ORG)/kured:$(VERSION)#g" kured-ds.yaml
echo "Please generate combined manifest if necessary"
helm-chart:
sed -i "s#repository:.*/kured#repository: $(DH_ORG)/kured#g" charts/kured/values.yaml
sed -i "s#appVersion:.*#appVersion: \"$(VERSION)\"#g" charts/kured/Chart.yaml
sed -i "s#\`[0-9]*\.[0-9]*\.[0-9]*\`#\`$(VERSION)\`#g" charts/kured/README.md
echo "Please bump version in charts/kured/Chart.yaml"
test: tests
echo "Running go tests"
go test ./...
echo "Running golint on pkg"
golint ./pkg/...
echo "Running golint on cmd"
golint ./cmd/...
sudo -E docker save quay.io/$(DH_ORG)/kured:$(VERSION) | (eval $$(minikube docker-env) && docker load)

289
README.md

@@ -1,29 +1,21 @@
# kured - Kubernetes Reboot Daemon
<img src="https://github.com/weaveworks/kured/raw/master/img/logo.png" align="right"/>
<img src="https://github.com/weaveworks/kured/raw/main/img/logo.png" align="right"/>
- [Introduction](#introduction)
- [Kubernetes & OS Compatibility](#kubernetes--os-compatibility)
- [Installation](#installation)
- [Configuration](#configuration)
- [Reboot Sentinel File & Period](#reboot-sentinel-file--period)
- [Setting a schedule](#setting-a-schedule)
- [Blocking Reboots via Alerts](#blocking-reboots-via-alerts)
- [Blocking Reboots via Pods](#blocking-reboots-via-pods)
- [Prometheus Metrics](#prometheus-metrics)
- [Notifications](#notifications)
- [Overriding Lock Configuration](#overriding-lock-configuration)
- [Operation](#operation)
- [Testing](#testing)
- [Disabling Reboots](#disabling-reboots)
- [Manual Unlock](#manual-unlock)
- [Automatic Unlock](#automatic-unlock)
- [Delaying Lock Release](#delaying-lock-release)
- [Building](#building)
- [Frequently Asked/Anticipated Questions](#frequently-askedanticipated-questions)
- [Why is there no `latest` tag on Docker Hub?](#why-is-there-no-latest-tag-on-docker-hub)
- [Getting Help](#getting-help)
* [Introduction](#introduction)
* [Kubernetes & OS Compatibility](#kubernetes-&-os-compatibility)
* [Installation](#installation)
* [Configuration](#configuration)
* [Reboot Sentinel File & Period](#reboot-sentinel-file-&-period)
* [Blocking Reboots via Alerts](#blocking-reboots-via-alerts)
* [Prometheus Metrics](#prometheus-metrics)
* [Slack Notifications](#slack-notifications)
* [Overriding Lock Configuration](#overriding-lock-configuration)
* [Operation](#operation)
* [Testing](#testing)
* [Disabling Reboots](#disabling-reboots)
* [Manual Unlock](#manual-unlock)
* [Building](#building)
* [Getting Help](#getting-help)
## Introduction
@@ -31,50 +23,33 @@ Kured (KUbernetes REboot Daemon) is a Kubernetes daemonset that
performs safe automatic node reboots when the need to do so is
indicated by the package management system of the underlying OS.
* Watches for the presence of a reboot sentinel file e.g. `/var/run/reboot-required`
or the successful run of a sentinel command.
* Watches for the presence of a reboot sentinel e.g. `/var/run/reboot-required`
* Utilises a lock in the API server to ensure only one node reboots at
a time
* Optionally defers reboots in the presence of active Prometheus alerts or selected pods
* Optionally defers reboots in the presence of active Prometheus alerts
* Cordons & drains worker nodes before reboot, uncordoning them after
## Kubernetes & OS Compatibility
The daemon image contains versions of `k8s.io/client-go` and
`k8s.io/kubectl` (the binary of `kubectl` in older releases) for the purposes of
maintaining the lock and draining worker nodes. Kubernetes aims to provide
forwards and backwards compatibility of one minor version between client and
server:
The daemon image contains a 1.7.x `k8s.io/client-go` and `kubectl`
binary for the purposes of maintaining the lock and draining worker
nodes. Whilst it has only been tested on a 1.7.x cluster, Kubernetes
typically has good forwards/backwards compatibility so there is a
reasonable chance it will work on adjacent versions; please file an
issue if this is not the case.
| kured | kubectl | k8s.io/client-go | k8s.io/apimachinery | expected kubernetes compatibility |
|-------|---------|------------------|---------------------|-----------------------------------|
| main | 1.22.4 | v0.22.4 | v0.22.4 | 1.21.x, 1.22.x, 1.23.x |
| 1.9.1 | 1.22.4 | v0.22.4 | v0.22.4 | 1.21.x, 1.22.x, 1.23.x |
| 1.8.1 | 1.21.4 | v0.21.4 | v0.21.4 | 1.20.x, 1.21.x, 1.22.x |
| 1.7.0 | 1.20.5 | v0.20.5 | v0.20.5 | 1.19.x, 1.20.x, 1.21.x |
| 1.6.1 | 1.19.4 | v0.19.4 | v0.19.4 | 1.18.x, 1.19.x, 1.20.x |
| 1.5.1 | 1.18.8 | v0.18.8 | v0.18.8 | 1.17.x, 1.18.x, 1.19.x |
| 1.4.4 | 1.17.7 | v0.17.0 | v0.17.0 | 1.16.x, 1.17.x, 1.18.x |
| 1.3.0 | 1.15.10 | v12.0.0 | release-1.15 | 1.15.x, 1.16.x, 1.17.x |
| 1.2.0 | 1.13.6 | v10.0.0 | release-1.13 | 1.12.x, 1.13.x, 1.14.x |
| 1.1.0 | 1.12.1 | v9.0.0 | release-1.12 | 1.11.x, 1.12.x, 1.13.x |
| 1.0.0 | 1.7.6 | v4.0.0 | release-1.7 | 1.6.x, 1.7.x, 1.8.x |
See the [release notes](https://github.com/weaveworks/kured/releases)
for specific version compatibility information, including which
combinations have been formally tested.
Versions >=1.1.0 enter the host mount namespace to invoke
`systemctl reboot`, so should work on any systemd distribution.
Additionally, the image contains a `systemctl` binary from Ubuntu
16.04 in order to command reboots. Again, although this has not been
tested against other systemd distributions there is a good chance that
it will work.
## Installation
To obtain a default installation without Prometheus alerting interlock
or Slack notifications:
```console
latest=$(curl -s https://api.github.com/repos/weaveworks/kured/releases | jq -r .[0].tag_name)
kubectl apply -f "https://github.com/weaveworks/kured/releases/download/$latest/kured-$latest-dockerhub.yaml"
```
kubectl apply -f https://github.com/weaveworks/kured/releases/download/1.0.0/kured-ds.yaml
```
If you want to customise the installation, download the manifest and
@@ -84,39 +59,18 @@ edit it in accordance with the following section before application.
The following arguments can be passed to kured via the daemonset pod template:
```console
```
Flags:
--alert-filter-regexp regexp.Regexp alert names to ignore when checking for active alerts
--alert-firing-only bool only consider firing alerts when checking for active alerts
--blocking-pod-selector stringArray label selector identifying pods whose presence should prevent reboots
--drain-grace-period int time in seconds given to each pod to terminate gracefully, if negative, the default value specified in the pod will be used (default: -1)
--skip-wait-for-delete-timeout int when seconds is greater than zero, skip waiting for the pods whose deletion timestamp is older than N seconds while draining a node (default: 0)
--ds-name string name of daemonset on which to place lock (default "kured")
--ds-namespace string namespace containing daemonset on which to place lock (default "kube-system")
--end-time string schedule reboot only before this time of day (default "23:59:59")
--force-reboot bool force a reboot even if the drain is still running (default: false)
--drain-timeout duration timeout after which the drain is aborted (default: 0, infinite time)
-h, --help help for kured
--lock-annotation string annotation in which to record locking node (default "weave.works/kured-node-lock")
--lock-release-delay duration hold lock after reboot by this duration (default: 0, disabled)
--lock-ttl duration expire lock annotation after this duration (default: 0, disabled)
--message-template-drain string message template used to notify about a node being drained (default "Draining node %s")
--message-template-reboot string message template used to notify about a node being rebooted (default "Rebooting node %s")
--notify-url url for reboot notifications (cannot use with --slack-hook-url flags)
--period duration reboot check period (default 1h0m0s)
--prefer-no-schedule-taint string Taint name applied during pending node reboot (to prevent receiving additional pods from other rebooting nodes). Disabled by default. Set e.g. to "weave.works/kured-node-reboot" to enable tainting.
--prometheus-url string Prometheus instance to probe for active alerts
--reboot-command string command to run when a reboot is required by the sentinel (default "/sbin/systemctl reboot")
--reboot-days strings schedule reboot on these days (default [su,mo,tu,we,th,fr,sa])
--reboot-delay duration add a delay after drain finishes but before the reboot command is issued (default 0, no time)
--reboot-sentinel string path to file whose existence signals need to reboot (default "/var/run/reboot-required")
--reboot-sentinel-command string command for which a successful run signals need to reboot (default ""). If non-empty, sentinel file will be ignored.
--slack-channel string slack channel for reboot notifications
--slack-hook-url string slack hook URL for reboot notifications [deprecated in favor of --notify-url]
--slack-username string slack username for reboot notifications (default "kured")
--start-time string schedule reboot only after this time of day (default "0:00")
--time-zone string use this timezone for schedule inputs (default "UTC")
--log-format string log format specified as text or json, defaults to "text"
--alert-filter-regexp value alert names to ignore when checking for active alerts
--ds-name string namespace containing daemonset on which to place lock (default "kube-system")
--ds-namespace string name of daemonset on which to place lock (default "kured")
--lock-annotation string annotation in which to record locking node (default "weave.works/kured-node-lock")
--period duration reboot check period (default 1h0m0s)
--prometheus-url string Prometheus instance to probe for active alerts
--reboot-sentinel string path to file whose existence signals need to reboot (default "/var/run/reboot-required")
--force-reboot-sentinel string path to file whose existence signals need to force reboot, i.e. ignore active prometheus alerts (default "/var/run/force-reboot-required")
--slack-hook-url string slack hook URL for reboot notifications
--slack-username string slack username for reboot notifications (default "kured")
```
### Reboot Sentinel File & Period
@@ -127,89 +81,32 @@ values with `--reboot-sentinel` and `--period`. Each replica of the
daemon uses a random offset derived from the period on startup so that
nodes don't all contend for the lock simultaneously.
Alternatively, a reboot sentinel command can be used. If a reboot
sentinel command is used, the reboot sentinel file presence will be
ignored.
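For example, a command whose successful run signals the need to reboot can be configured like this (the value shown is illustrative):
```console
--reboot-sentinel-command="test -f /var/run/reboot-required"
```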
### Setting a schedule
By default, kured will reboot any time it detects the sentinel, but this
may cause reboots during odd hours. While service disruption does not
normally occur, anything is possible and operators may want to restrict
reboots to predictable schedules. Use `--reboot-days`, `--start-time`,
`--end-time`, and `--time-zone` to set a schedule. For example, business
hours on the west coast USA can be specified with:
```console
--reboot-days=mon,tue,wed,thu,fri
--start-time=9am
--end-time=5pm
--time-zone=America/Los_Angeles
```
Times can be formatted in numerous ways, including `5pm`, `5:00pm` `17:00`,
and `17`. `--time-zone` represents a Go `time.Location`, and can be `UTC`,
`Local`, or any entry in the standard Linux tz database.
Note that when using smaller time windows, you should consider shortening
the sentinel check period (`--period`).
### Blocking Reboots via Alerts
You may find it desirable to block automatic node reboots when there
are active alerts - you can do so by providing the URL of your
Prometheus server:
```console
```
--prometheus-url=http://prometheus.monitoring.svc.cluster.local
```
By default the presence of *any* active (pending or firing) alerts
will block reboots, however you can ignore specific alerts:
```console
```
--alert-filter-regexp=^(RebootRequired|AnotherBenignAlert|...)$
```
You can also only block reboots for firing alerts:
```console
--alert-firing-only=true
```
See the section on Prometheus metrics for an important application of this
filter.
### Blocking Reboots via Pods
You can also block reboots of an _individual node_ when specific pods
are scheduled on it:
```console
--blocking-pod-selector=runtime=long,cost=expensive
```
Since label selector strings use commas to express logical 'and', you can
specify this parameter multiple times for 'or':
```console
--blocking-pod-selector=runtime=long,cost=expensive
--blocking-pod-selector=name=temperamental
```
In this case, the presence of either an (appropriately labelled) expensive long
running job or a known temperamental pod on a node will stop it rebooting.
> Try not to abuse this mechanism - it's better to strive for
> restartability where possible. If you do use it, make sure you set
> up a RebootRequired alert as described in the next section so that
> you can intervene manually if reboots are blocked for too long.
An important application of this filter will become apparent in the
next section.
### Prometheus Metrics
Each kured pod exposes a single gauge metric (`:8080/metrics`) that
indicates the presence of the sentinel file:
```console
```
# HELP kured_reboot_required OS requires reboot due to software updates.
# TYPE kured_reboot_required gauge
kured_reboot_required{node="ip-xxx-xxx-xxx-xxx.ec2.internal"} 0
@@ -219,7 +116,7 @@ The purpose of this metric is to power an alert which will summon an
operator if the cluster cannot reboot itself automatically for a
prolonged period:
```console
```
# Alert if a reboot is required for any machines. Acts as a failsafe for the
# reboot daemon, which will not reboot nodes if there are pending alerts save
# this one.
@@ -238,33 +135,15 @@ If you choose to employ such an alert and have configured kured to
probe for active alerts before rebooting, be sure to specify
`--alert-filter-regexp=^RebootRequired$` to avoid deadlock!
### Notifications
When you specify a formatted URL using `--notify-url`, kured will notify
about draining and rebooting nodes across a list of technologies.
If you specify a Slack hook via `--slack-hook-url` (deprecated, see below), kured will notify
you immediately prior to rebooting a node:
![Notification](img/slack-notification.png)
Alternatively, you can use `--message-template-drain` and `--message-template-reboot`
to customize the text of the message, e.g.
```console
--message-template-drain="Draining node %s part of *my-cluster* in region *xyz*"
```
Here is the syntax:
- slack: `slack://tokenA/tokenB/tokenC`
(`--slack-hook-url` is deprecated but possible to use)
- rocketchat: `rocketchat://[username@]rocketchat-host/token[/channel|@recipient]`
- teams: `teams://tName/token-a/token-b/token-c`
> **Attention**: as the [format of the URL has changed](https://github.com/containrrr/shoutrrr/issues/138), you also have to specify a `tName`
- email: `smtp://username:password@host:port/?fromAddress=fromAddress&toAddresses=recipient1[,recipient2,...]`
More details here: [containrrr.dev/shoutrrr/v0.4/services/overview](https://containrrr.dev/shoutrrr/v0.4/services/overview)
We recommend setting `--slack-username` to be the name of the
environment, e.g. `dev` or `prod`.
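Putting these flags together, a hypothetical Slack setup might look like this (the tokens are placeholders):
```console
--notify-url=slack://tokenA/tokenB/tokenC
--slack-username=prod
--message-template-drain="Draining node %s"
--message-template-reboot="Rebooting node %s"
```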
### Overriding Lock Configuration
@@ -287,7 +166,7 @@ if you have, you will have to adjust the commands accordingly.
You can test your configuration by provoking a reboot on a node:
```console
sudo touch /var/run/reboot-required
```
@@ -296,7 +175,7 @@ sudo touch /var/run/reboot-required
If you need to temporarily stop kured from rebooting any nodes, you
can take the lock manually:
```console
kubectl -n kube-system annotate ds kured weave.works/kured-node-lock='{"nodeID":"manual"}'
```
@@ -308,71 +187,25 @@ In exceptional circumstances, such as a node experiencing a permanent
failure whilst rebooting, manual intervention may be required to
remove the cluster lock:
```console
kubectl -n kube-system annotate ds kured weave.works/kured-node-lock-
```
> NB the `-` at the end of the command is important - it instructs
> `kubectl` to remove that annotation entirely.
### Automatic Unlock
In exceptional circumstances (especially when used with the cluster-autoscaler), a node
which holds the lock might be terminated, in which case the annotation would otherwise stay there forever.
Using `--lock-ttl=30m` allows other nodes to take over once the TTL has expired (in this case, 30 minutes) and continue the reboot process.
### Delaying Lock Release
Using `--lock-release-delay=30m` causes nodes to hold the lock for the specified time frame (in this case, 30 minutes) before it is released and the reboot process continues. This can be used to throttle reboots across the cluster.
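The two flags can be combined; for example (values are illustrative):
```console
--lock-ttl=30m
--lock-release-delay=5m
```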
## Building
Kured now uses [Go
Modules](https://github.com/golang/go/wiki/Modules), so build
instructions vary depending on where you have checked out the
repository:
**Building outside $GOPATH:**
```console
make
```
**Building inside $GOPATH:**
```console
GO111MODULE=on make
```
You can find the current preferred version of Golang in the [go.mod file](go.mod).
If you are interested in contributing code to kured, please take a look at
our [development][development] docs.
[development]: DEVELOPMENT.md
## Frequently Asked/Anticipated Questions
### Why is there no `latest` tag on Docker Hub?
Use of `latest` for production deployments is bad practice - see
[here](https://kubernetes.io/docs/concepts/configuration/overview) for
details. The manifest on `main` refers to `latest` for local
development testing with minikube only; for production use choose a
versioned manifest from the [release page](https://github.com/weaveworks/kured/releases/).
## Getting Help
If you have any questions about, feedback for, or problems with `kured`:
* Invite yourself to the <a href="https://slack.weave.works/" target="_blank">Weave Users Slack</a>.
* Ask a question on the [#kured](https://weave-community.slack.com/messages/kured/) slack channel.
* Send an email to <a href="mailto:weave-users@weave.works">weave-users@weave.works</a>.
* [File an issue](https://github.com/weaveworks/kured/issues/new).
* Join us in [our monthly meeting](https://docs.google.com/document/d/1bsHTjHhqaaZ7yJnXF6W8c89UB_yn-OoSZEmDnIP34n8/edit#),
  every fourth Wednesday of the month at 16:00 UTC.
We follow the [CNCF Code of Conduct](https://github.com/cncf/foundation/blob/master/code-of-conduct.md).
Your feedback is always welcome!

@@ -1,21 +0,0 @@
# Patterns to ignore when building packages.
# This supports shell glob matching, relative path matching, and
# negation (prefixed with !). Only one pattern per line.
.DS_Store
# Common VCS dirs
.git/
.gitignore
.bzr/
.bzrignore
.hg/
.hgignore
.svn/
# Common backup files
*.swp
*.bak
*.tmp
*~
# Various IDEs
.project
.idea/
*.tmproj

@@ -1,14 +0,0 @@
apiVersion: v1
appVersion: "1.9.1"
description: A Helm chart for kured
name: kured
version: 2.11.2
home: https://github.com/weaveworks/kured
maintainers:
- name: ckotzbauer
email: christian.kotzbauer@gmail.com
- name: davidkarlsen
email: david@davidkarlsen.com
sources:
- https://github.com/weaveworks/kured
icon: https://raw.githubusercontent.com/weaveworks/kured/main/img/logo.png

@@ -1,126 +0,0 @@
# Kured (KUbernetes REboot Daemon)
## Introduction
This chart installs the "Kubernetes Reboot Daemon" using the Helm Package Manager.
## Prerequisites
- Kubernetes 1.9+
## Installing the Chart
To install the chart with the release name `my-release`:
```bash
$ helm repo add kured https://weaveworks.github.io/kured
$ helm install my-release kured/kured
```
## Uninstalling the Chart
To uninstall/delete the `my-release` deployment:
```bash
$ helm delete my-release
```
The command removes all the Kubernetes components associated with the chart and deletes the release.
## Migrate from stable Helm-Chart
The following changes have been made compared to the stable chart:
- **[BREAKING CHANGE]** The `autolock` feature was removed. Use `configuration.startTime` and `configuration.endTime` instead (see the sketch after this list).
- Role inconsistencies have been fixed (allowed verbs for modifying the `DaemonSet`, apiGroup of `PodSecurityPolicy`)
- Added support for affinities.
- Configuration of cli-flags can be made through a `configuration` object.
- Added optional `Service` and `ServiceMonitor` support for metrics endpoint.
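As a sketch, the old autolock window maps onto the `configuration` object like this (the times are illustrative; the keys are documented in the table below):
```yaml
configuration:
  startTime: "9am"
  endTime: "5pm"
  timeZone: "America/Los_Angeles"
```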
## Configuration
| Config | Description | Default |
| ------ | ----------- | ------- |
| `image.repository` | Image repository | `weaveworks/kured` |
| `image.tag` | Image tag | `1.9.1` |
| `image.pullPolicy` | Image pull policy | `IfNotPresent` |
| `image.pullSecrets` | Image pull secrets | `[]` |
| `updateStrategy` | Daemonset update strategy | `RollingUpdate` |
| `maxUnavailable` | The max pods unavailable during a rolling update | `1` |
| `podAnnotations` | Annotations to apply to pods (eg to add Prometheus annotations) | `{}` |
| `dsAnnotations` | Annotations to apply to the kured DaemonSet | `{}` |
| `extraArgs` | Extra arguments to pass to `/usr/bin/kured`. See below. | `{}` |
| `extraEnvVars` | Array of environment variables to pass to the daemonset. | `{}` |
| `configuration.lockTtl` | cli-parameter `--lock-ttl` | `0` |
| `configuration.lockReleaseDelay` | cli-parameter `--lock-release-delay` | `0` |
| `configuration.alertFilterRegexp` | cli-parameter `--alert-filter-regexp` | `""` |
| `configuration.alertFiringOnly` | cli-parameter `--alert-firing-only` | `false` |
| `configuration.blockingPodSelector` | Array of selectors for multiple cli-parameters `--blocking-pod-selector` | `[]` |
| `configuration.endTime` | cli-parameter `--end-time` | `""` |
| `configuration.lockAnnotation` | cli-parameter `--lock-annotation` | `""` |
| `configuration.period` | cli-parameter `--period` | `""` |
| `configuration.forceReboot` | cli-parameter `--force-reboot` | `false` |
| `configuration.drainGracePeriod` | cli-parameter `--drain-grace-period` | `""` |
| `configuration.drainTimeout` | cli-parameter `--drain-timeout` | `""` |
| `configuration.skipWaitForDeleteTimeout` | cli-parameter `--skip-wait-for-delete-timeout` | `""` |
| `configuration.prometheusUrl` | cli-parameter `--prometheus-url` | `""` |
| `configuration.rebootDays` | Array of days for multiple cli-parameters `--reboot-days` | `[]` |
| `configuration.rebootSentinel` | cli-parameter `--reboot-sentinel` | `""` |
| `configuration.rebootSentinelCommand` | cli-parameter `--reboot-sentinel-command` | `""` |
| `configuration.rebootCommand` | cli-parameter `--reboot-command` | `""` |
| `configuration.rebootDelay` | cli-parameter `--reboot-delay` | `""` |
| `configuration.slackChannel` | cli-parameter `--slack-channel` | `""` |
| `configuration.slackHookUrl` | cli-parameter `--slack-hook-url` | `""` |
| `configuration.slackUsername` | cli-parameter `--slack-username` | `""` |
| `configuration.notifyUrl` | cli-parameter `--notify-url` | `""` |
| `configuration.messageTemplateDrain` | cli-parameter `--message-template-drain` | `""` |
| `configuration.messageTemplateReboot` | cli-parameter `--message-template-reboot` | `""` |
| `configuration.startTime` | cli-parameter `--start-time` | `""` |
| `configuration.timeZone` | cli-parameter `--time-zone` | `""` |
| `configuration.annotateNodes` | cli-parameter `--annotate-nodes` | `false` |
| `configuration.logFormat` | cli-parameter `--log-format` | `"text"` |
| `configuration.preferNoScheduleTaint` | Taint name applied during pending node reboot | `""` |
| `rbac.create` | Create RBAC roles | `true` |
| `serviceAccount.create` | Create a service account | `true` |
| `serviceAccount.name` | Service account name to create (or use if `serviceAccount.create` is false) | (chart fullname) |
| `podSecurityPolicy.create` | Create podSecurityPolicy | `false` |
| `resources` | Resources requests and limits. | `{}` |
| `metrics.create` | Create a ServiceMonitor for prometheus-operator | `false` |
| `metrics.namespace` | The namespace to create the ServiceMonitor in | `""` |
| `metrics.labels` | Additional labels for the ServiceMonitor | `{}` |
| `metrics.interval` | Interval prometheus should scrape the endpoint | `60s` |
| `metrics.scrapeTimeout` | A custom scrapeTimeout for prometheus | `""` |
| `service.create` | Create a Service for the metrics endpoint | `false` |
| `service.name` | Service name for the metrics endpoint | `""` |
| `service.port` | Port of the service to expose | `8080` |
| `service.annotations` | Annotations to apply to the service (eg to add Prometheus annotations) | `{}` |
| `podLabels` | Additional labels for pods (e.g. CostCenter=IT) | `{}` |
| `priorityClassName` | Priority Class to be used by the pods | `""` |
| `tolerations` | Tolerations to apply to the daemonset (eg to allow running on master) | `[{"key": "node-role.kubernetes.io/master", "effect": "NoSchedule"}]`|
| `affinity` | Affinity for the daemonset (ie, restrict which nodes kured runs on) | `{}` |
| `nodeSelector` | Node Selector for the daemonset (ie, restrict which nodes kured runs on) | `{}` |
| `volumeMounts` | Maps of volumes mount to mount | `{}` |
| `volumes` | Maps of volumes to mount | `{}` |
See https://github.com/weaveworks/kured#configuration for the flags that can be passed via `extraArgs` (those not covered by the `configuration` object). Note that
```yaml
extraArgs:
foo: 1
bar-baz: 2
```
becomes `/usr/bin/kured ... --foo=1 --bar-baz=2`.
## Prometheus Metrics
Kured exposes a single prometheus metric indicating whether a reboot is required or not (see the [kured docs](https://github.com/weaveworks/kured#prometheus-metrics) for details).
#### Prometheus-Operator
```yaml
metrics:
create: true
```
#### Prometheus Annotations
```yaml
service:
annotations:
prometheus.io/scrape: "true"
prometheus.io/path: "/metrics"
prometheus.io/port: "8080"
```

@@ -1,13 +0,0 @@
# This is tested twice:
# Basic install test with chart-testing (on charts PRs)
# Functional testing in PRs (other PRs)
service:
create: true
name: kured-prometheus-endpoint
port: 8080
type: NodePort
nodePort: 30000
# Do not override the configuration: period in this, so that
# We can test prometheus exposed metrics without rebooting.

@@ -1,3 +0,0 @@
Kured will check for /var/run/reboot-required, and reboot nodes when needed.
See https://github.com/weaveworks/kured/ for details.

@@ -1,72 +0,0 @@
{{/* vim: set filetype=mustache: */}}
{{/*
Expand the name of the chart.
*/}}
{{- define "kured.name" -}}
{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" -}}
{{- end -}}
{{/*
Create a default fully qualified app name.
We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec).
If release name contains chart name it will be used as a full name.
*/}}
{{- define "kured.fullname" -}}
{{- if .Values.fullnameOverride -}}
{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" -}}
{{- else -}}
{{- $name := default .Chart.Name .Values.nameOverride -}}
{{- if contains $name .Release.Name -}}
{{- .Release.Name | trunc 63 | trimSuffix "-" -}}
{{- else -}}
{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" -}}
{{- end -}}
{{- end -}}
{{- end -}}
{{/*
Create chart name and version as used by the chart label.
*/}}
{{- define "kured.chart" -}}
{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" -}}
{{- end -}}
{{/*
Create the name of the service account to use
*/}}
{{- define "kured.serviceAccountName" -}}
{{- if .Values.serviceAccount.create -}}
{{ default (include "kured.fullname" .) .Values.serviceAccount.name }}
{{- else -}}
{{ default "default" .Values.serviceAccount.name }}
{{- end -}}
{{- end -}}
{{/*
Return the appropriate apiVersion for podsecuritypolicy.
*/}}
{{- define "kured.psp.apiVersion" -}}
{{- if semverCompare "<1.10-0" .Capabilities.KubeVersion.GitVersion -}}
{{- print "extensions/v1beta1" -}}
{{- else -}}
{{- print "policy/v1beta1" -}}
{{- end -}}
{{- end -}}
{{/*
Returns a set of labels applied to each resource.
*/}}
{{- define "kured.labels" -}}
app: {{ template "kured.name" . }}
chart: {{ template "kured.chart" . }}
release: {{ .Release.Name }}
heritage: {{ .Release.Service }}
{{- end -}}
{{/*
Returns a set of matchLabels applied.
*/}}
{{- define "kured.matchLabels" -}}
app: {{ template "kured.name" . }}
release: {{ .Release.Name }}
{{- end -}}

@@ -1,30 +0,0 @@
{{- if .Values.rbac.create -}}
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: {{ template "kured.fullname" . }}
labels:
{{- include "kured.labels" . | nindent 4 }}
rules:
# Allow kured to read spec.unschedulable
# Allow kubectl to drain/uncordon
#
# NB: These permissions are tightly coupled to the bundled version of kubectl; the ones below
# match https://github.com/kubernetes/kubernetes/blob/v1.19.4/staging/src/k8s.io/kubectl/pkg/cmd/drain/drain.go
#
- apiGroups: [""]
resources: ["nodes"]
verbs: ["get", "patch"]
- apiGroups: [""]
resources: ["pods"]
verbs: ["list","delete","get"]
- apiGroups: ["extensions"]
resources: ["daemonsets"]
verbs: ["get"]
- apiGroups: ["apps"]
resources: ["daemonsets"]
verbs: ["get"]
- apiGroups: [""]
resources: ["pods/eviction"]
verbs: ["create"]
{{- end -}}

@@ -1,16 +0,0 @@
{{- if .Values.rbac.create -}}
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: {{ template "kured.fullname" . }}
labels:
{{- include "kured.labels" . | nindent 4 }}
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: {{ template "kured.fullname" . }}
subjects:
- kind: ServiceAccount
name: {{ template "kured.serviceAccountName" . }}
namespace: {{ .Release.Namespace }}
{{- end -}}

@@ -1,187 +0,0 @@
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: {{ template "kured.fullname" . }}
namespace: {{ .Release.Namespace }}
labels:
{{- include "kured.labels" . | nindent 4 }}
{{- if .Values.dsAnnotations }}
annotations:
{{- range $key, $value := .Values.dsAnnotations }}
{{ $key }}: {{ $value | quote }}
{{- end }}
{{- end }}
spec:
updateStrategy:
type: {{ .Values.updateStrategy }}
{{- if eq .Values.updateStrategy "RollingUpdate"}}
rollingUpdate:
maxUnavailable: {{ .Values.maxUnavailable }}
{{- end}}
selector:
matchLabels:
{{- include "kured.matchLabels" . | nindent 6 }}
template:
metadata:
labels:
{{- include "kured.labels" . | nindent 8 }}
{{- if .Values.podLabels }}
{{- toYaml .Values.podLabels | nindent 8 }}
{{- end }}
{{- if .Values.podAnnotations }}
annotations:
{{- range $key, $value := .Values.podAnnotations }}
{{ $key }}: {{ $value | quote }}
{{- end }}
{{- end }}
spec:
serviceAccountName: {{ template "kured.serviceAccountName" . }}
hostPID: true
restartPolicy: Always
{{- with .Values.image.pullSecrets }}
imagePullSecrets:
{{ toYaml . | indent 8 }}
{{- end }}
{{- if .Values.priorityClassName }}
priorityClassName: {{ .Values.priorityClassName }}
{{- end }}
containers:
- name: {{ .Chart.Name }}
image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}"
imagePullPolicy: {{ .Values.image.pullPolicy }}
securityContext:
privileged: true # Give permission to nsenter /proc/1/ns/mnt
resources:
{{ toYaml .Values.resources | indent 12 }}
command:
- /usr/bin/kured
args:
- --ds-name={{ template "kured.fullname" . }}
- --ds-namespace={{ .Release.Namespace }}
{{- if .Values.configuration.lockTtl }}
- --lock-ttl={{ .Values.configuration.lockTtl }}
{{- end }}
{{- if .Values.configuration.lockReleaseDelay }}
- --lock-release-delay={{ .Values.configuration.lockReleaseDelay }}
{{- end }}
{{- if .Values.configuration.alertFilterRegexp }}
- --alert-filter-regexp={{ .Values.configuration.alertFilterRegexp }}
{{- end }}
{{- if .Values.configuration.alertFiringOnly }}
- --alert-firing-only={{ .Values.configuration.alertFiringOnly }}
{{- end }}
{{- range .Values.configuration.blockingPodSelector }}
- --blocking-pod-selector={{ . }}
{{- end }}
{{- if .Values.configuration.endTime }}
- --end-time={{ .Values.configuration.endTime }}
{{- end }}
{{- if .Values.configuration.lockAnnotation }}
- --lock-annotation={{ .Values.configuration.lockAnnotation }}
{{- end }}
{{- if .Values.configuration.period }}
- --period={{ .Values.configuration.period }}
{{- end }}
{{- if .Values.configuration.forceReboot }}
- --force-reboot
{{- end }}
{{- if .Values.configuration.drainGracePeriod }}
- --drain-grace-period={{ .Values.configuration.drainGracePeriod }}
{{- end }}
{{- if .Values.configuration.drainTimeout }}
- --drain-timeout={{ .Values.configuration.drainTimeout }}
{{- end }}
{{- if .Values.configuration.skipWaitForDeleteTimeout }}
- --skip-wait-for-delete-timeout={{ .Values.configuration.skipWaitForDeleteTimeout }}
{{- end }}
{{- if .Values.configuration.prometheusUrl }}
- --prometheus-url={{ .Values.configuration.prometheusUrl }}
{{- end }}
{{- range .Values.configuration.rebootDays }}
- --reboot-days={{ . }}
{{- end }}
{{- if .Values.configuration.rebootSentinel }}
- --reboot-sentinel={{ .Values.configuration.rebootSentinel }}
{{- end }}
{{- if .Values.configuration.rebootSentinelCommand }}
- --reboot-sentinel-command={{ .Values.configuration.rebootSentinelCommand }}
{{- end }}
{{- if .Values.configuration.rebootCommand }}
- --reboot-command={{ .Values.configuration.rebootCommand }}
{{- end }}
{{- if .Values.configuration.rebootDelay }}
- --reboot-delay={{ .Values.configuration.rebootDelay }}
{{- end }}
{{- if .Values.configuration.slackChannel }}
- --slack-channel={{ .Values.configuration.slackChannel }}
{{- end }}
{{- if .Values.configuration.slackHookUrl }}
- --slack-hook-url={{ .Values.configuration.slackHookUrl }}
{{- end }}
{{- if .Values.configuration.slackUsername }}
- --slack-username={{ .Values.configuration.slackUsername }}
{{- end }}
{{- if .Values.configuration.notifyUrl }}
- --notify-url={{ .Values.configuration.notifyUrl }}
{{- end }}
{{- if .Values.configuration.messageTemplateDrain }}
- --message-template-drain={{ .Values.configuration.messageTemplateDrain }}
{{- end }}
{{- if .Values.configuration.messageTemplateReboot }}
- --message-template-reboot={{ .Values.configuration.messageTemplateReboot }}
{{- end }}
{{- if .Values.configuration.startTime }}
- --start-time={{ .Values.configuration.startTime }}
{{- end }}
{{- if .Values.configuration.timeZone }}
- --time-zone={{ .Values.configuration.timeZone }}
{{- end }}
{{- if .Values.configuration.annotateNodes }}
- --annotate-nodes={{ .Values.configuration.annotateNodes }}
{{- end }}
{{- if .Values.configuration.preferNoScheduleTaint }}
- --prefer-no-schedule-taint={{ .Values.configuration.preferNoScheduleTaint }}
{{- end }}
{{- if .Values.configuration.logFormat }}
- --log-format={{ .Values.configuration.logFormat }}
{{- end }}
{{- range $key, $value := .Values.extraArgs }}
{{- if $value }}
- --{{ $key }}={{ $value }}
{{- else }}
- --{{ $key }}
{{- end }}
{{- end }}
{{- if .Values.volumeMounts }}
volumeMounts:
{{- toYaml .Values.volumeMounts | nindent 12 }}
{{- end }}
ports:
- containerPort: 8080
name: metrics
env:
# Pass in the name of the node on which this pod is scheduled
# for use with drain/uncordon operations and lock acquisition
- name: KURED_NODE_ID
valueFrom:
fieldRef:
fieldPath: spec.nodeName
{{- if .Values.extraEnvVars }}
{{ toYaml .Values.extraEnvVars | nindent 12 }}
{{- end }}
{{- with .Values.tolerations }}
tolerations:
{{ toYaml . | indent 8 }}
{{- end }}
{{- with .Values.nodeSelector }}
nodeSelector:
{{ toYaml . | indent 8 }}
{{- end }}
{{- with .Values.affinity }}
affinity:
{{ toYaml . | indent 8 }}
{{- end }}
{{- if .Values.volumes }}
volumes:
{{- toYaml .Values.volumes | nindent 8 }}
{{- end }}

@@ -1,21 +0,0 @@
{{- if .Values.podSecurityPolicy.create}}
apiVersion: {{ template "kured.psp.apiVersion" . }}
kind: PodSecurityPolicy
metadata:
name: {{ template "kured.fullname" . }}
labels:
{{- include "kured.labels" . | nindent 4 }}
spec:
privileged: true
hostPID: true
allowedCapabilities: ['*']
fsGroup:
rule: RunAsAny
runAsUser:
rule: RunAsAny
seLinux:
rule: RunAsAny
supplementalGroups:
rule: RunAsAny
volumes: ['*']
{{- end }}

View File

@@ -1,30 +0,0 @@
{{- if .Values.rbac.create -}}
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
namespace: {{ .Release.Namespace }}
name: {{ template "kured.fullname" . }}
labels:
{{- include "kured.labels" . | nindent 4 }}
rules:
# Allow kured to lock/unlock itself
- apiGroups: ["extensions"]
resources: ["daemonsets"]
resourceNames: ["{{ template "kured.fullname" . }}"]
verbs: ["update", "patch"]
- apiGroups: ["apps"]
resources: ["daemonsets"]
resourceNames: ["{{ template "kured.fullname" . }}"]
verbs: ["update", "patch"]
{{- if .Values.podSecurityPolicy.create }}
- apiGroups: ["extensions"]
resources: ["podsecuritypolicies"]
resourceNames: ["{{ template "kured.fullname" . }}"]
verbs: ["use"]
- apiGroups: ["policy"]
resources: ["podsecuritypolicies"]
resourceNames: ["{{ template "kured.fullname" . }}"]
verbs: ["use"]
{{- end }}
{{- end -}}

@@ -1,17 +0,0 @@
{{- if .Values.rbac.create -}}
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
namespace: {{ .Release.Namespace }}
name: {{ template "kured.fullname" . }}
labels:
{{- include "kured.labels" . | nindent 4 }}
subjects:
- kind: ServiceAccount
namespace: {{ .Release.Namespace }}
name: {{ template "kured.serviceAccountName" . }}
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: Role
name: {{ template "kured.fullname" . }}
{{- end -}}

@@ -1,29 +0,0 @@
{{- if or .Values.service.create .Values.metrics.create }}
apiVersion: v1
kind: Service
metadata:
{{- if .Values.service.name }}
name: {{ .Values.service.name }}
{{- else }}
name: {{ template "kured.fullname" . }}
{{- end }}
labels:
{{- include "kured.labels" . | nindent 4 }}
{{- if .Values.service.annotations }}
annotations:
{{- range $key, $value := .Values.service.annotations }}
{{ $key }}: {{ $value | quote }}
{{- end }}
{{- end }}
spec:
type: {{ .Values.service.type }}
ports:
- name: metrics
port: {{ .Values.service.port }}
targetPort: 8080
{{- if eq .Values.service.type "NodePort" }}
nodePort: {{ .Values.service.nodePort }}
{{- end }}
selector:
{{- include "kured.matchLabels" . | nindent 4 }}
{{- end }}

View File

@@ -1,9 +0,0 @@
{{- if .Values.serviceAccount.create -}}
apiVersion: v1
kind: ServiceAccount
metadata:
name: {{ template "kured.serviceAccountName" . }}
namespace: {{ .Release.Namespace }}
labels:
{{- include "kured.labels" . | nindent 4 }}
{{- end -}}

@@ -1,31 +0,0 @@
{{- if .Values.metrics.create }}
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
name: {{ template "kured.fullname" . }}
{{- if .Values.metrics.namespace }}
namespace: {{ .Values.metrics.namespace }}
{{- end }}
labels:
{{- include "kured.labels" . | nindent 4 }}
{{- if .Values.metrics.labels }}
{{- toYaml .Values.metrics.labels | nindent 4 }}
{{- end }}
spec:
endpoints:
- interval: {{ .Values.metrics.interval }}
{{- if .Values.metrics.scrapeTimeout }}
scrapeTimeout: {{ .Values.metrics.scrapeTimeout }}
{{- end }}
honorLabels: true
targetPort: 8080
path: /metrics
scheme: http
jobLabel: "{{ .Release.Name }}"
selector:
matchLabels:
{{- include "kured.matchLabels" . | nindent 6 }}
namespaceSelector:
matchNames:
- {{ .Release.Namespace }}
{{- end }}

@@ -1,31 +0,0 @@
image:
repository: weaveworks/kured
tag: latest
configuration:
# annotationTtl: 0 # force clean annotation after this amount of time (default 0, disabled)
# alertFilterRegexp: "" # alert names to ignore when checking for active alerts
# alertFiringOnly: false # only consider firing alerts when checking for active alerts
# blockingPodSelector: [] # label selector identifying pods whose presence should prevent reboots
# endTime: "" # only reboot before this time of day (default "23:59")
# lockAnnotation: "" # annotation in which to record locking node (default "weave.works/kured-node-lock")
period: "1m" # reboot check period (default 1h0m0s)
# forceReboot: false # force a reboot even if the drain fails or times out (default: false)
# drainGracePeriod: "" # time in seconds given to each pod to terminate gracefully, if negative, the default value specified in the pod will be used (default: -1)
# drainTimeout: "" # timeout after which the drain is aborted (default: 0, infinite time)
# skipWaitForDeleteTimeout: "" # when time is greater than zero, skip waiting for the pods whose deletion timestamp is older than N seconds while draining a node (default: 0)
# prometheusUrl: "" # Prometheus instance to probe for active alerts
# rebootDays: [] # only reboot on these days (default [su,mo,tu,we,th,fr,sa])
# rebootSentinel: "" # path to file whose existence signals need to reboot (default "/var/run/reboot-required")
# rebootSentinelCommand: "" # command for which a successful run signals need to reboot (default ""). If non-empty, sentinel file will be ignored.
# slackChannel: "" # slack channel for reboot notifications
# slackHookUrl: "" # slack hook URL for reboot notifications
# slackUsername: "" # slack username for reboot notifications (default "kured")
# notifyUrl: "" # notification URL with the syntax as follows: https://containrrr.dev/shoutrrr/services/overview/
# messageTemplateDrain: "" # slack message template when notifying about a node being drained (default "Draining node %s")
# messageTemplateReboot: "" # slack message template when notifying about a node being rebooted (default "Rebooted node %s")
# startTime: "" # only reboot after this time of day (default "0:00")
# timeZone: "" # time-zone to use (valid zones from "time" golang package)
# annotateNodes: false # enable 'weave.works/kured-reboot-in-progress' and 'weave.works/kured-most-recent-reboot-needed' node annotations to signify kured reboot operations
# lockReleaseDelay: "5m" # hold lock after reboot by this amount of time (default 0, disabled)
# logFormat: "text" # log format specified as text or json, defaults to text

@@ -1,96 +0,0 @@
image:
repository: weaveworks/kured
tag: "" # will default to the appVersion in Chart.yaml
pullPolicy: IfNotPresent
pullSecrets: []
updateStrategy: RollingUpdate
# requires RollingUpdate updateStrategy
maxUnavailable: 1
podAnnotations: {}
dsAnnotations: {}
extraArgs: {}
extraEnvVars:
# - name: slackHookUrl
# valueFrom:
# secretKeyRef:
# name: secret_name
# key: secret_key
# - name: regularEnvVariable
# value: 123
configuration:
lockTtl: 0 # force clean annotation after this amount of time (default 0, disabled)
alertFilterRegexp: "" # alert names to ignore when checking for active alerts
alertFiringOnly: false # only consider firing alerts when checking for active alerts
blockingPodSelector: [] # label selector identifying pods whose presence should prevent reboots
endTime: "" # only reboot before this time of day (default "23:59")
lockAnnotation: "" # annotation in which to record locking node (default "weave.works/kured-node-lock")
period: "" # reboot check period (default 1h0m0s)
forceReboot: false # force a reboot even if the drain fails or times out (default: false)
drainGracePeriod: "" # time in seconds given to each pod to terminate gracefully, if negative, the default value specified in the pod will be used (default: -1)
drainTimeout: "" # timeout after which the drain is aborted (default: 0, infinite time)
skipWaitForDeleteTimeout: "" # when time is greater than zero, skip waiting for the pods whose deletion timestamp is older than N seconds while draining a node (default: 0)
prometheusUrl: "" # Prometheus instance to probe for active alerts
rebootDays: [] # only reboot on these days (default [su,mo,tu,we,th,fr,sa])
rebootSentinel: "" # path to file whose existence signals need to reboot (default "/var/run/reboot-required")
rebootSentinelCommand: "" # command for which a successful run signals need to reboot (default ""). If non-empty, sentinel file will be ignored.
rebootCommand: "/bin/systemctl reboot" # command to run when a reboot is required by the sentinel
rebootDelay: "" # add a delay after drain finishes but before the reboot command is issued
slackChannel: "" # slack channel for reboot notfications
slackHookUrl: "" # slack hook URL for reboot notfications
slackUsername: "" # slack username for reboot notfications (default "kured")
notifyUrl: "" # notification URL with the syntax as follows: https://containrrr.dev/shoutrrr/services/overview/
messageTemplateDrain: "" # slack message template when notifying about a node being drained (default "Draining node %s")
messageTemplateReboot: "" # slack message template when notifying about a node being rebooted (default "Rebooted node %s")
startTime: "" # only reboot after this time of day (default "0:00")
timeZone: "" # time-zone to use (valid zones from "time" golang package)
annotateNodes: false # enable 'weave.works/kured-reboot-in-progress' and 'weave.works/kured-most-recent-reboot-needed' node annotations to signify kured reboot operations
lockReleaseDelay: 0 # hold lock after reboot by this amount of time (default 0, disabled)
preferNoScheduleTaint: "" # Taint name applied during pending node reboot (to prevent receiving additional pods from other rebooting nodes). Disabled by default. Set e.g. to "weave.works/kured-node-reboot" to enable tainting.
logFormat: "text" # log format specified as text or json, defaults to text
rbac:
create: true
serviceAccount:
create: true
name:
podSecurityPolicy:
create: false
resources: {}
metrics:
create: false
namespace: ""
labels: {}
interval: 60s
scrapeTimeout: ""
service:
create: false
port: 8080
annotations: {}
name: ""
type: ClusterIP
podLabels: {}
priorityClassName: ""
tolerations:
- key: node-role.kubernetes.io/master
effect: NoSchedule
affinity: {}
nodeSelector: {}
volumeMounts: []
volumes: []

@@ -1,4 +1,6 @@
FROM alpine:3.15.0
RUN apk update --no-cache && apk upgrade --no-cache && apk add --no-cache ca-certificates tzdata
FROM ubuntu:16.04
RUN apt-get update && apt-get install -y ca-certificates && rm -rf /var/cache/apt
ADD https://storage.googleapis.com/kubernetes-release/release/v1.9.6/bin/linux/amd64/kubectl /usr/bin/kubectl
RUN chmod 0755 /usr/bin/kubectl
COPY ./kured /usr/bin/kured
ENTRYPOINT ["/usr/bin/kured"]

@@ -1,21 +0,0 @@
FROM --platform=$BUILDPLATFORM golang:bullseye AS build
ARG TARGETOS
ARG TARGETARCH
ARG TARGETVARIANT
ENV GOOS=$TARGETOS
ENV GOARCH=$TARGETARCH
ENV GOVARIANT=$TARGETVARIANT
WORKDIR /src
COPY go.mod go.sum ./
RUN go mod download
COPY . .
RUN go list -f '{{join .Deps "\n"}}' ./cmd/kured | grep -v /vendor/ | xargs go list -f '{{if not .Standard}}{{ $dep := . }}{{range .GoFiles}}{{$dep.Dir}}/{{.}} {{end}}{{end}}'
RUN CGO_ENABLED=0 go build -o cmd/kured/kured cmd/kured/*.go
FROM --platform=$TARGETPLATFORM alpine:3.15 as bin
RUN apk update --no-cache && apk upgrade --no-cache && apk add --no-cache ca-certificates tzdata
COPY --from=build /src/cmd/kured/kured /usr/bin/kured
ENTRYPOINT ["/usr/bin/kured"]

@@ -1,79 +1,41 @@
package main
import (
"context"
"encoding/json"
"fmt"
"math/rand"
"net/http"
"net/url"
"os"
"os/exec"
"regexp"
"strings"
"time"
papi "github.com/prometheus/client_golang/api"
log "github.com/sirupsen/logrus"
"github.com/spf13/cobra"
"github.com/spf13/pflag"
"github.com/spf13/viper"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/rest"
kubectldrain "k8s.io/kubectl/pkg/drain"
"github.com/google/shlex"
shoutrrr "github.com/containrrr/shoutrrr"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promhttp"
"github.com/weaveworks/kured/pkg/alerts"
"github.com/weaveworks/kured/pkg/daemonsetlock"
"github.com/weaveworks/kured/pkg/delaytick"
"github.com/weaveworks/kured/pkg/taints"
"github.com/weaveworks/kured/pkg/timewindow"
"github.com/weaveworks/kured/pkg/notifications/slack"
)
var (
version = "unreleased"
// Command line flags
forceReboot bool
drainTimeout time.Duration
rebootDelay time.Duration
period time.Duration
drainGracePeriod int
skipWaitForDeleteTimeoutSeconds int
dsNamespace string
dsName string
lockAnnotation string
lockTTL time.Duration
lockReleaseDelay time.Duration
prometheusURL string
preferNoScheduleTaintName string
alertFilter *regexp.Regexp
alertFiringOnly bool
rebootSentinelFile string
rebootSentinelCommand string
notifyURL string
slackHookURL string
slackUsername string
slackChannel string
messageTemplateDrain string
messageTemplateReboot string
podSelectors []string
rebootCommand string
logFormat string
nodeID string
rebootDays []string
rebootStart string
rebootEnd string
timezone string
annotateNodes bool
period time.Duration
dsNamespace string
dsName string
lockAnnotation string
prometheusURL string
alertFilter *regexp.Regexp
rebootSentinel string
forceRebootSentinel string
slackHookURL string
slackUsername string
// Metrics
rebootRequiredGauge = prometheus.NewGaugeVec(prometheus.GaugeOpts{
@@ -83,167 +45,46 @@ var (
}, []string{"node"})
)
const (
// KuredNodeLockAnnotation is the canonical string value for the kured node-lock annotation
KuredNodeLockAnnotation string = "weave.works/kured-node-lock"
// KuredRebootInProgressAnnotation is the canonical string value for the kured reboot-in-progress annotation
KuredRebootInProgressAnnotation string = "weave.works/kured-reboot-in-progress"
// KuredMostRecentRebootNeededAnnotation is the canonical string value for the kured most-recent-reboot-needed annotation
KuredMostRecentRebootNeededAnnotation string = "weave.works/kured-most-recent-reboot-needed"
// EnvPrefix The environment variable prefix of all environment variables bound to our command line flags.
EnvPrefix = "KURED"
)
func init() {
prometheus.MustRegister(rebootRequiredGauge)
}
func main() {
cmd := NewRootCommand()
if err := cmd.Execute(); err != nil {
log.Fatal(err)
}
}
// NewRootCommand construct the Cobra root command
func NewRootCommand() *cobra.Command {
rootCmd := &cobra.Command{
Use: "kured",
Short: "Kubernetes Reboot Daemon",
PersistentPreRunE: bindViper,
PreRun: flagCheck,
Run: root}
Use: "kured",
Short: "Kubernetes Reboot Daemon",
Run: root}
rootCmd.PersistentFlags().StringVar(&nodeID, "node-id", "",
"node name kured runs on, should be passed down from spec.nodeName via KURED_NODE_ID environment variable")
rootCmd.PersistentFlags().BoolVar(&forceReboot, "force-reboot", false,
"force a reboot even if the drain fails or times out (default: false)")
rootCmd.PersistentFlags().IntVar(&drainGracePeriod, "drain-grace-period", -1,
"time in seconds given to each pod to terminate gracefully, if negative, the default value specified in the pod will be used (default: -1)")
rootCmd.PersistentFlags().IntVar(&skipWaitForDeleteTimeoutSeconds, "skip-wait-for-delete-timeout", 0,
"when seconds is greater than zero, skip waiting for the pods whose deletion timestamp is older than N seconds while draining a node (default: 0)")
rootCmd.PersistentFlags().DurationVar(&drainTimeout, "drain-timeout", 0,
"timeout after which the drain is aborted (default: 0, infinite time)")
rootCmd.PersistentFlags().DurationVar(&rebootDelay, "reboot-delay", 0,
"delay reboot for this duration (default: 0, disabled)")
rootCmd.PersistentFlags().DurationVar(&period, "period", time.Minute*60,
"sentinel check period")
"reboot check period")
rootCmd.PersistentFlags().StringVar(&dsNamespace, "ds-namespace", "kube-system",
"namespace containing daemonset on which to place lock")
rootCmd.PersistentFlags().StringVar(&dsName, "ds-name", "kured",
"name of daemonset on which to place lock")
rootCmd.PersistentFlags().StringVar(&lockAnnotation, "lock-annotation", KuredNodeLockAnnotation,
rootCmd.PersistentFlags().StringVar(&lockAnnotation, "lock-annotation", "weave.works/kured-node-lock",
"annotation in which to record locking node")
rootCmd.PersistentFlags().DurationVar(&lockTTL, "lock-ttl", 0,
"expire lock annotation after this duration (default: 0, disabled)")
rootCmd.PersistentFlags().DurationVar(&lockReleaseDelay, "lock-release-delay", 0,
"delay lock release for this duration (default: 0, disabled)")
rootCmd.PersistentFlags().StringVar(&prometheusURL, "prometheus-url", "",
"Prometheus instance to probe for active alerts")
rootCmd.PersistentFlags().Var(&regexpValue{&alertFilter}, "alert-filter-regexp",
"alert names to ignore when checking for active alerts")
rootCmd.PersistentFlags().BoolVar(&alertFiringOnly, "alert-firing-only", false,
"only consider firing alerts when checking for active alerts (default: false)")
rootCmd.PersistentFlags().StringVar(&rebootSentinelFile, "reboot-sentinel", "/var/run/reboot-required",
"path to file whose existence triggers the reboot command")
rootCmd.PersistentFlags().StringVar(&preferNoScheduleTaintName, "prefer-no-schedule-taint", "",
"Taint name applied during pending node reboot (to prevent receiving additional pods from other rebooting nodes). Disabled by default. Set e.g. to \"weave.works/kured-node-reboot\" to enable tainting.")
rootCmd.PersistentFlags().StringVar(&rebootSentinelCommand, "reboot-sentinel-command", "",
"command for which a zero return code will trigger a reboot command")
rootCmd.PersistentFlags().StringVar(&rebootCommand, "reboot-command", "/bin/systemctl reboot",
"command to run when a reboot is required")
rootCmd.PersistentFlags().StringVar(&rebootSentinel, "reboot-sentinel", "/var/run/reboot-required",
"path to file whose existence signals need to reboot")
rootCmd.PersistentFlags().StringVar(&forceRebootSentinel, "force-reboot-sentinel", "/var/run/force-reboot-required",
"path to file whose existence signals need to force reboot")
rootCmd.PersistentFlags().StringVar(&slackHookURL, "slack-hook-url", "",
"slack hook URL for notifications")
"slack hook URL for reboot notfications")
rootCmd.PersistentFlags().StringVar(&slackUsername, "slack-username", "kured",
"slack username for notifications")
rootCmd.PersistentFlags().StringVar(&slackChannel, "slack-channel", "",
"slack channel for reboot notfications")
rootCmd.PersistentFlags().StringVar(&notifyURL, "notify-url", "",
"notify URL for reboot notfications")
rootCmd.PersistentFlags().StringVar(&messageTemplateDrain, "message-template-drain", "Draining node %s",
"message template used to notify about a node being drained")
rootCmd.PersistentFlags().StringVar(&messageTemplateReboot, "message-template-reboot", "Rebooting node %s",
"message template used to notify about a node being rebooted")
"slack username for reboot notfications")
rootCmd.PersistentFlags().StringArrayVar(&podSelectors, "blocking-pod-selector", nil,
"label selector identifying pods whose presence should prevent reboots")
rootCmd.PersistentFlags().StringSliceVar(&rebootDays, "reboot-days", timewindow.EveryDay,
"schedule reboot on these days")
rootCmd.PersistentFlags().StringVar(&rebootStart, "start-time", "0:00",
"schedule reboot only after this time of day")
rootCmd.PersistentFlags().StringVar(&rebootEnd, "end-time", "23:59:59",
"schedule reboot only before this time of day")
rootCmd.PersistentFlags().StringVar(&timezone, "time-zone", "UTC",
"use this timezone for schedule inputs")
rootCmd.PersistentFlags().BoolVar(&annotateNodes, "annotate-nodes", false,
"if set, the annotations 'weave.works/kured-reboot-in-progress' and 'weave.works/kured-most-recent-reboot-needed' will be given to nodes undergoing kured reboots")
rootCmd.PersistentFlags().StringVar(&logFormat, "log-format", "text",
"use text or json log format")
return rootCmd
}
// temporary func that checks for deprecated slack-notification-related flags
func flagCheck(cmd *cobra.Command, args []string) {
if slackHookURL != "" && notifyURL != "" {
log.Warnf("Cannot use both --notify-url and --slack-hook-url flags. Kured will use --notify-url flag only...")
if err := rootCmd.Execute(); err != nil {
log.Fatal(err)
}
if slackHookURL != "" {
log.Warnf("Deprecated flag(s). Please use --notify-url flag instead.")
trataURL, err := url.Parse(slackHookURL)
if err != nil {
log.Warnf("slack-hook-url is not properly formatted... no notification will be sent: %v\n", err)
}
if len(strings.Split(strings.Trim(trataURL.Path, "/services/"), "/")) != 3 {
log.Warnf("slack-hook-url is not properly formatted... no notification will be sent: unexpected number of / in URL\n")
} else {
notifyURL = fmt.Sprintf("slack://%s", strings.Trim(trataURL.Path, "/services/"))
}
}
}
// bindViper initializes viper and binds command flags with environment variables
func bindViper(cmd *cobra.Command, args []string) error {
v := viper.New()
v.SetEnvPrefix(EnvPrefix)
v.AutomaticEnv()
bindFlags(cmd, v)
return nil
}
// bindFlags binds each cobra flag to its associated viper configuration (environment variable)
func bindFlags(cmd *cobra.Command, v *viper.Viper) {
cmd.Flags().VisitAll(func(f *pflag.Flag) {
// Environment variables can't have dashes in them, so bind them to their equivalent keys with underscores
if strings.Contains(f.Name, "-") {
v.BindEnv(f.Name, flagToEnvVar(f.Name))
}
// Apply the viper config value to the flag when the flag is not set and viper has a value
if !f.Changed && v.IsSet(f.Name) {
val := v.Get(f.Name)
log.Infof("Binding %s command flag to environment variable: %s", f.Name, flagToEnvVar(f.Name))
cmd.Flags().Set(f.Name, fmt.Sprintf("%v", val))
}
})
}
// flagToEnvVar converts command flag name to equivalent environment variable name
func flagToEnvVar(flag string) string {
envVarSuffix := strings.ToUpper(strings.ReplaceAll(flag, "-", "_"))
return fmt.Sprintf("%s_%s", EnvPrefix, envVarSuffix)
}
// newCommand creates a new Command with stdout/stderr wired to our standard logger
func newCommand(name string, arg ...string) *exec.Cmd {
cmd := exec.Command(name, arg...)
cmd.Stdout = log.NewEntry(log.StandardLogger()).
WithField("cmd", cmd.Args[0]).
WithField("std", "out").
@@ -257,111 +98,58 @@ func newCommand(name string, arg ...string) *exec.Cmd {
return cmd
}
// buildHostCommand writes a new command to run in the host namespace
// Rancher-based distributions need a different PID
func buildHostCommand(pid int, command []string) []string {
// From the container, we nsenter into the proper PID to run the hostCommand.
// For this, the kured daemonset needs to be configured with hostPID:true and privileged:true
cmd := []string{"/usr/bin/nsenter", fmt.Sprintf("-m/proc/%d/ns/mnt", pid), "--"}
cmd = append(cmd, command...)
return cmd
}
func rebootRequired(sentinelCommand []string) bool {
if err := newCommand(sentinelCommand[0], sentinelCommand[1:]...).Run(); err != nil {
switch err := err.(type) {
case *exec.ExitError:
// We assume a non-zero exit code means 'reboot not required', but of course
// the user could have misconfigured the sentinel command or something else
// went wrong during its execution. In that case, not entering a reboot loop
// is the right thing to do, and we are logging stdout/stderr of the command
// so it should be obvious what is wrong.
return false
default:
// Something was grossly misconfigured, such as the command path being wrong.
log.Fatalf("Error invoking sentinel command: %v", err)
}
}
return true
}
// RebootBlocker interface should be implemented by types
// to know if their instantiations should block a reboot
type RebootBlocker interface {
isBlocked() bool
}
// PrometheusBlockingChecker contains info for connecting
// to prometheus, and can give info about whether a reboot should be blocked
type PrometheusBlockingChecker struct {
// prometheusClient to make prometheus-go-client and api config available
// into the PrometheusBlockingChecker struct
promClient *alerts.PromClient
// regexp used to get alerts
filter *regexp.Regexp
// bool to indicate if only firing alerts should be considered
firingOnly bool
}
// KubernetesBlockingChecker contains info for connecting
// to k8s, and can give info about whether a reboot should be blocked
type KubernetesBlockingChecker struct {
// client used to contact kubernetes API
client *kubernetes.Clientset
nodename string
// list used to filter pods (podSelector)
filter []string
}
func (pb PrometheusBlockingChecker) isBlocked() bool {
alertNames, err := pb.promClient.ActiveAlerts(pb.filter, pb.firingOnly)
if err != nil {
log.Warnf("Reboot blocked: prometheus query error: %v", err)
func sentinelExists() bool {
_, err := os.Stat(rebootSentinel)
switch {
case err == nil:
return true
case os.IsNotExist(err):
return false
default:
log.Fatalf("Unable to determine existence of sentinel: %v", err)
return false // unreachable; prevents compilation error
}
count := len(alertNames)
if count > 10 {
alertNames = append(alertNames[:10], "...")
}
if count > 0 {
log.Warnf("Reboot blocked: %d active alerts: %v", count, alertNames)
}
func forceRebootSentinelExists() bool {
_, err := os.Stat(forceRebootSentinel)
switch {
case err == nil:
return true
case os.IsNotExist(err):
return false
default:
log.Fatalf("Unable to determine existence of force reboot sentinel: %v", err)
return false // unreachable; prevents compilation error
}
return false
}
func (kb KubernetesBlockingChecker) isBlocked() bool {
fieldSelector := fmt.Sprintf("spec.nodeName=%s,status.phase!=Succeeded,status.phase!=Failed,status.phase!=Unknown", kb.nodename)
for _, labelSelector := range kb.filter {
podList, err := kb.client.CoreV1().Pods("").List(context.TODO(), metav1.ListOptions{
LabelSelector: labelSelector,
FieldSelector: fieldSelector,
Limit: 10})
func rebootRequired() bool {
if sentinelExists() || forceRebootSentinelExists() {
log.Infof("Reboot required")
return true
} else {
log.Infof("Reboot not required")
return false
}
}
func rebootBlocked() bool {
if forceRebootSentinelExists() {
log.Infof("Force reboot sentinel %v exists, force rebooting activated", forceRebootSentinel)
return false
}
if prometheusURL != "" {
alertNames, err := alerts.PrometheusActiveAlerts(prometheusURL, alertFilter)
if err != nil {
log.Warnf("Reboot blocked: pod query error: %v", err)
log.Warnf("Reboot blocked: prometheus query error: %v", err)
return true
}
if len(podList.Items) > 0 {
podNames := make([]string, 0, len(podList.Items))
for _, pod := range podList.Items {
podNames = append(podNames, pod.Name)
}
if len(podList.Continue) > 0 {
podNames = append(podNames, "...")
}
log.Warnf("Reboot blocked: matching pods: %v", podNames)
return true
count := len(alertNames)
if count > 10 {
alertNames = append(alertNames[:10], "...")
}
}
return false
}
func rebootBlocked(blockers ...RebootBlocker) bool {
for _, blocker := range blockers {
if blocker.isBlocked() {
if count > 0 {
log.Warnf("Reboot blocked: %d active alerts: %v", count, alertNames)
return true
}
}
@@ -379,8 +167,8 @@ func holding(lock *daemonsetlock.DaemonSetLock, metadata interface{}) bool {
return holding
}
func acquire(lock *daemonsetlock.DaemonSetLock, metadata interface{}, TTL time.Duration) bool {
holding, holder, err := lock.Acquire(metadata, TTL)
func acquire(lock *daemonsetlock.DaemonSetLock, metadata interface{}) bool {
holding, holder, err := lock.Acquire(metadata)
switch {
case err != nil:
log.Fatalf("Error acquiring lock: %v", err)
@@ -394,13 +182,6 @@ func acquire(lock *daemonsetlock.DaemonSetLock, metadata interface{}, TTL time.D
}
}
func throttle(releaseDelay time.Duration) {
if releaseDelay > 0 {
log.Infof("Delaying lock release by %v", releaseDelay)
time.Sleep(releaseDelay)
}
}
func release(lock *daemonsetlock.DaemonSetLock) {
log.Infof("Releasing lock")
if err := lock.Release(); err != nil {
@@ -408,78 +189,43 @@ func release(lock *daemonsetlock.DaemonSetLock) {
}
}
func drain(client *kubernetes.Clientset, node *v1.Node) {
nodename := node.GetName()
func drain(nodeID string) {
log.Infof("Draining node %s", nodeID)
drainCmd := newCommand("/usr/bin/kubectl", "drain",
"--ignore-daemonsets", "--delete-local-data", "--force", nodeID)
log.Infof("Draining node %s", nodename)
if notifyURL != "" {
if err := shoutrrr.Send(notifyURL, fmt.Sprintf(messageTemplateDrain, nodename)); err != nil {
log.Warnf("Error notifying: %v", err)
}
}
drainer := &kubectldrain.Helper{
Client: client,
Ctx: context.Background(),
GracePeriodSeconds: drainGracePeriod,
SkipWaitForDeleteTimeoutSeconds: skipWaitForDeleteTimeoutSeconds,
Force: true,
DeleteEmptyDirData: true,
IgnoreAllDaemonSets: true,
ErrOut: os.Stderr,
Out: os.Stdout,
Timeout: drainTimeout,
}
if err := kubectldrain.RunCordonOrUncordon(drainer, node, true); err != nil {
if !forceReboot {
log.Fatalf("Error cordonning %s: %v", nodename, err)
}
log.Errorf("Error cordonning %s: %v, continuing with reboot anyway", nodename, err)
return
}
if err := kubectldrain.RunNodeDrain(drainer, nodename); err != nil {
if !forceReboot {
log.Fatalf("Error draining %s: %v", nodename, err)
}
log.Errorf("Error draining %s: %v, continuing with reboot anyway", nodename, err)
return
if err := drainCmd.Run(); err != nil {
log.Fatalf("Error invoking drain command: %v", err)
}
}
func uncordon(client *kubernetes.Clientset, node *v1.Node) {
nodename := node.GetName()
log.Infof("Uncordoning node %s", nodename)
drainer := &kubectldrain.Helper{
Client: client,
ErrOut: os.Stderr,
Out: os.Stdout,
Ctx: context.Background(),
}
if err := kubectldrain.RunCordonOrUncordon(drainer, node, false); err != nil {
log.Fatalf("Error uncordonning %s: %v", nodename, err)
func uncordon(nodeID string) {
log.Infof("Uncordoning node %s", nodeID)
uncordonCmd := newCommand("/usr/bin/kubectl", "uncordon", nodeID)
if err := uncordonCmd.Run(); err != nil {
log.Fatalf("Error invoking uncordon command: %v", err)
}
}
func invokeReboot(nodeID string, rebootCommand []string) {
log.Infof("Running command: %s for node: %s", rebootCommand, nodeID)
func commandReboot(nodeID string) {
log.Infof("Commanding reboot")
if notifyURL != "" {
if err := shoutrrr.Send(notifyURL, fmt.Sprintf(messageTemplateReboot, nodeID)); err != nil {
log.Warnf("Error notifying: %v", err)
if slackHookURL != "" {
if err := slack.NotifyReboot(slackHookURL, slackUsername, nodeID); err != nil {
log.Warnf("Error notifying slack: %v", err)
}
}
if err := newCommand(rebootCommand[0], rebootCommand[1:]...).Run(); err != nil {
// Relies on /var/run/dbus/system_bus_socket bind mount to talk to systemd
rebootCmd := newCommand("/bin/systemctl", "reboot")
if err := rebootCmd.Run(); err != nil {
log.Fatalf("Error invoking reboot command: %v", err)
}
}
func maintainRebootRequiredMetric(nodeID string, sentinelCommand []string) {
func maintainRebootRequiredMetric(nodeID string) {
for {
if rebootRequired(sentinelCommand) {
if sentinelExists() {
rebootRequiredGauge.WithLabelValues(nodeID).Set(1)
} else {
rebootRequiredGauge.WithLabelValues(nodeID).Set(0)
@@ -493,45 +239,7 @@ type nodeMeta struct {
Unschedulable bool `json:"unschedulable"`
}
func addNodeAnnotations(client *kubernetes.Clientset, nodeID string, annotations map[string]string) {
node, err := client.CoreV1().Nodes().Get(context.TODO(), nodeID, metav1.GetOptions{})
if err != nil {
log.Fatalf("Error retrieving node object via k8s API: %s", err)
}
for k, v := range annotations {
node.Annotations[k] = v
log.Infof("Adding node %s annotation: %s=%s", node.GetName(), k, v)
}
bytes, err := json.Marshal(node)
if err != nil {
log.Fatalf("Error marshalling node object into JSON: %v", err)
}
_, err = client.CoreV1().Nodes().Patch(context.TODO(), node.GetName(), types.StrategicMergePatchType, bytes, metav1.PatchOptions{})
if err != nil {
var annotationsErr string
for k, v := range annotations {
annotationsErr += fmt.Sprintf("%s=%s ", k, v)
}
log.Fatalf("Error adding node annotations %s via k8s API: %v", annotationsErr, err)
}
}
func deleteNodeAnnotation(client *kubernetes.Clientset, nodeID, key string) {
log.Infof("Deleting node %s annotation %s", nodeID, key)
// JSON Patch takes as path input a JSON Pointer, defined in RFC6901
// So we replace all instances of "/" with "~1" as per:
// https://tools.ietf.org/html/rfc6901#section-3
patch := []byte(fmt.Sprintf("[{\"op\":\"remove\",\"path\":\"/metadata/annotations/%s\"}]", strings.ReplaceAll(key, "/", "~1")))
_, err := client.CoreV1().Nodes().Patch(context.TODO(), nodeID, types.JSONPatchType, patch, metav1.PatchOptions{})
if err != nil {
log.Fatalf("Error deleting node annotation %s via k8s API: %v", key, err)
}
}
func rebootAsRequired(nodeID string, rebootCommand []string, sentinelCommand []string, window *timewindow.TimeWindow, TTL time.Duration, releaseDelay time.Duration) {
func rebootAsRequired(nodeID string) {
config, err := rest.InClusterConfig()
if err != nil {
log.Fatal(err)
@@ -546,179 +254,50 @@ func rebootAsRequired(nodeID string, rebootCommand []string, sentinelCommand []s
nodeMeta := nodeMeta{}
if holding(lock, &nodeMeta) {
node, err := client.CoreV1().Nodes().Get(context.TODO(), nodeID, metav1.GetOptions{})
if err != nil {
log.Fatalf("Error retrieving node object via k8s API: %v", err)
}
if !nodeMeta.Unschedulable {
uncordon(client, node)
uncordon(nodeID)
}
// If we're holding the lock we know we've tried, in a prior run, to reboot
// So (1) we want to confirm that the reboot succeeded practically ( !rebootRequired() )
// And (2) check if we previously annotated the node that it was in the process of being rebooted,
// And finally (3) if it has that annotation, to delete it.
// This indicates to other node tools running on the cluster that this node may be a candidate for maintenance
if annotateNodes && !rebootRequired(sentinelCommand) {
if _, ok := node.Annotations[KuredRebootInProgressAnnotation]; ok {
deleteNodeAnnotation(client, nodeID, KuredRebootInProgressAnnotation)
}
}
throttle(releaseDelay)
release(lock)
}
preferNoScheduleTaint := taints.New(client, nodeID, preferNoScheduleTaintName, v1.TaintEffectPreferNoSchedule)
// Remove taint immediately during startup to quickly allow scheduling again.
if !rebootRequired(sentinelCommand) {
preferNoScheduleTaint.Disable()
}
// instantiate prometheus client
promClient, err := alerts.NewPromClient(papi.Config{Address: prometheusURL})
if err != nil {
log.Fatal("Unable to create prometheus client: ", err)
}
source := rand.NewSource(time.Now().UnixNano())
tick := delaytick.New(source, period)
for range tick {
if !window.Contains(time.Now()) {
// Remove taint outside the reboot time window to allow for normal operation.
preferNoScheduleTaint.Disable()
continue
}
for range tick {
if rebootRequired() && !rebootBlocked() {
node, err := client.CoreV1().Nodes().Get(nodeID, metav1.GetOptions{})
if err != nil {
log.Fatal(err)
}
nodeMeta.Unschedulable = node.Spec.Unschedulable
if !rebootRequired(sentinelCommand) {
log.Infof("Reboot not required")
preferNoScheduleTaint.Disable()
continue
}
log.Infof("Reboot required")
var blockCheckers []RebootBlocker
if prometheusURL != "" {
blockCheckers = append(blockCheckers, PrometheusBlockingChecker{promClient: promClient, filter: alertFilter, firingOnly: alertFiringOnly})
}
if podSelectors != nil {
blockCheckers = append(blockCheckers, KubernetesBlockingChecker{client: client, nodename: nodeID, filter: podSelectors})
}
if rebootBlocked(blockCheckers...) {
continue
}
node, err := client.CoreV1().Nodes().Get(context.TODO(), nodeID, metav1.GetOptions{})
if err != nil {
log.Fatalf("Error retrieving node object via k8s API: %v", err)
}
nodeMeta.Unschedulable = node.Spec.Unschedulable
var timeNowString string
if annotateNodes {
if _, ok := node.Annotations[KuredRebootInProgressAnnotation]; !ok {
timeNowString = time.Now().Format(time.RFC3339)
// Annotate this node to indicate that "I am going to be rebooted!"
// so that other node maintenance tools running on the cluster are aware that this node is in the process of a "state transition"
annotations := map[string]string{KuredRebootInProgressAnnotation: timeNowString}
// & annotate this node with a timestamp so that other node maintenance tools know how long it's been since this node has been marked for reboot
annotations[KuredMostRecentRebootNeededAnnotation] = timeNowString
addNodeAnnotations(client, nodeID, annotations)
if acquire(lock, &nodeMeta) {
if !nodeMeta.Unschedulable {
drain(nodeID)
}
commandReboot(nodeID)
for {
log.Infof("Waiting for reboot")
time.Sleep(time.Minute)
}
}
}
if !acquire(lock, &nodeMeta, TTL) {
// Prefer not to schedule pods onto this node to avoid draining the same pod multiple times.
preferNoScheduleTaint.Enable()
continue
}
drain(client, node)
if rebootDelay > 0 {
log.Infof("Delaying reboot for %v", rebootDelay)
time.Sleep(rebootDelay)
}
invokeReboot(nodeID, rebootCommand)
for {
log.Infof("Waiting for reboot")
time.Sleep(time.Minute)
}
}
}
// buildSentinelCommand creates the shell command line which will need wrapping to escape
// the container boundaries
func buildSentinelCommand(rebootSentinelFile string, rebootSentinelCommand string) []string {
if rebootSentinelCommand != "" {
cmd, err := shlex.Split(rebootSentinelCommand)
if err != nil {
log.Fatalf("Error parsing provided sentinel command: %v", err)
}
return cmd
}
return []string{"test", "-f", rebootSentinelFile}
}
// parseRebootCommand creates the shell command line which will need wrapping to escape
// the container boundaries
func parseRebootCommand(rebootCommand string) []string {
command, err := shlex.Split(rebootCommand)
if err != nil {
log.Fatalf("Error parsing provided reboot command: %v", err)
}
return command
}
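Both buildSentinelCommand and parseRebootCommand rely on shlex so that quoted arguments survive splitting; strings.Fields would break them apart. A quick sketch of the difference:
	args, _ := shlex.Split(`/bin/sh -c "systemctl reboot"`)
	// args == []string{"/bin/sh", "-c", "systemctl reboot"}
	fields := strings.Fields(`/bin/sh -c "systemctl reboot"`)
	// fields == []string{"/bin/sh", "-c", "\"systemctl", "reboot\""}; the quotes are not honoured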
func root(cmd *cobra.Command, args []string) {
if logFormat == "json" {
log.SetFormatter(&log.JSONFormatter{})
}
log.Infof("Kubernetes Reboot Daemon: %s", version)
nodeID := os.Getenv("KURED_NODE_ID")
if nodeID == "" {
log.Fatal("KURED_NODE_ID environment variable required")
}
window, err := timewindow.New(rebootDays, rebootStart, rebootEnd, timezone)
if err != nil {
log.Fatalf("Failed to build time window: %v", err)
}
sentinelCommand := buildSentinelCommand(rebootSentinelFile, rebootSentinelCommand)
restartCommand := parseRebootCommand(rebootCommand)
log.Infof("Node ID: %s", nodeID)
log.Infof("Lock Annotation: %s/%s:%s", dsNamespace, dsName, lockAnnotation)
if lockTTL > 0 {
log.Infof("Lock TTL set, lock will expire after: %v", lockTTL)
} else {
log.Info("Lock TTL not set, lock will remain until being released")
}
if lockReleaseDelay > 0 {
log.Infof("Lock release delay set, lock release will be delayed by: %v", lockReleaseDelay)
} else {
log.Info("Lock release delay not set, lock will be released immediately after rebooting")
}
log.Infof("PreferNoSchedule taint: %s", preferNoScheduleTaintName)
log.Infof("Blocking Pod Selectors: %v", podSelectors)
log.Infof("Reboot schedule: %v", window)
log.Infof("Reboot check command: %s every %v", sentinelCommand, period)
log.Infof("Reboot command: %s", restartCommand)
if annotateNodes {
log.Infof("Will annotate nodes during kured reboot operations")
}
log.Infof("Reboot Sentinel: %s every %v", rebootSentinel, period)
// To run those commands as if we were on the host, we'll use nsenter
// Relies on hostPID:true and privileged:true to enter host mount space
// PID set to 1, until we have a better discovery mechanism.
hostSentinelCommand := buildHostCommand(1, sentinelCommand)
hostRestartCommand := buildHostCommand(1, restartCommand)
go rebootAsRequired(nodeID, hostRestartCommand, hostSentinelCommand, window, lockTTL, lockReleaseDelay)
go maintainRebootRequiredMetric(nodeID, hostSentinelCommand)
go rebootAsRequired(nodeID)
go maintainRebootRequiredMetric(nodeID)
http.Handle("/metrics", promhttp.Handler())
log.Fatal(http.ListenAndServe(":8080", nil))
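buildHostCommand itself is not shown in this diff, but Test_buildHostCommand below pins its output exactly. A sketch consistent with that test:
	// Wrap a command with nsenter so it runs in the host mount namespace of the
	// given PID (1 = init). Relies on hostPID:true and privileged:true.
	func buildHostCommand(pid int, command []string) []string {
		cmd := []string{"/usr/bin/nsenter", fmt.Sprintf("-m/proc/%d/ns/mnt", pid), "--"}
		return append(cmd, command...)
	}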


@@ -1,235 +0,0 @@
package main
import (
"reflect"
"testing"
log "github.com/sirupsen/logrus"
"github.com/spf13/cobra"
"github.com/weaveworks/kured/pkg/alerts"
assert "gotest.tools/v3/assert"
papi "github.com/prometheus/client_golang/api"
)
type BlockingChecker struct {
blocking bool
}
func (fbc BlockingChecker) isBlocked() bool {
return fbc.blocking
}
var _ RebootBlocker = BlockingChecker{} // Verify that Type implements Interface.
var _ RebootBlocker = (*BlockingChecker)(nil) // Verify that *Type implements Interface.
func Test_flagCheck(t *testing.T) {
var cmd *cobra.Command
var args []string
slackHookURL = "https://hooks.slack.com/services/BLABLABA12345/IAM931A0VERY/COMPLICATED711854TOKEN1SET"
flagCheck(cmd, args)
if notifyURL != "slack://BLABLABA12345/IAM931A0VERY/COMPLICATED711854TOKEN1SET" {
t.Errorf("Slack URL Parsing is wrong: expecting %s but got %s\n", "slack://BLABLABA12345/IAM931A0VERY/COMPLICATED711854TOKEN1SET", notifyURL)
}
}
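The expected value above implies a straightforward rewrite of the legacy Slack flag into a shoutrrr URL; a minimal sketch of that conversion, assuming flagCheck does nothing more elaborate:
	hook := "https://hooks.slack.com/services/BLABLABA12345/IAM931A0VERY/COMPLICATED711854TOKEN1SET"
	notifyURL := "slack://" + strings.TrimPrefix(hook, "https://hooks.slack.com/services/")
	// notifyURL == "slack://BLABLABA12345/IAM931A0VERY/COMPLICATED711854TOKEN1SET"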
func Test_rebootBlocked(t *testing.T) {
noCheckers := []RebootBlocker{}
nonblockingChecker := BlockingChecker{blocking: false}
blockingChecker := BlockingChecker{blocking: true}
// Instantiate a prometheusClient with a broken_url
promClient, err := alerts.NewPromClient(papi.Config{Address: "broken_url"})
if err != nil {
log.Fatal("Can't create prometheusClient: ", err)
}
brokenPrometheusClient := PrometheusBlockingChecker{promClient: promClient, filter: nil, firingOnly: false}
type args struct {
blockers []RebootBlocker
}
tests := []struct {
name string
args args
want bool
}{
{
name: "Do not block on no blocker defined",
args: args{blockers: noCheckers},
want: false,
},
{
name: "Ensure a blocker blocks",
args: args{blockers: []RebootBlocker{blockingChecker}},
want: true,
},
{
name: "Ensure a non-blocker doesn't block",
args: args{blockers: []RebootBlocker{nonblockingChecker}},
want: false,
},
{
name: "Ensure one blocker is enough to block",
args: args{blockers: []RebootBlocker{nonblockingChecker, blockingChecker}},
want: true,
},
{
name: "Do block on error contacting prometheus API",
args: args{blockers: []RebootBlocker{brokenPrometheusClient}},
want: true,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
if got := rebootBlocked(tt.args.blockers...); got != tt.want {
t.Errorf("rebootBlocked() = %v, want %v", got, tt.want)
}
})
}
}
func Test_buildHostCommand(t *testing.T) {
type args struct {
pid int
command []string
}
tests := []struct {
name string
args args
want []string
}{
{
name: "Ensure command will run with nsenter",
args: args{pid: 1, command: []string{"ls", "-Fal"}},
want: []string{"/usr/bin/nsenter", "-m/proc/1/ns/mnt", "--", "ls", "-Fal"},
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
if got := buildHostCommand(tt.args.pid, tt.args.command); !reflect.DeepEqual(got, tt.want) {
t.Errorf("buildHostCommand() = %v, want %v", got, tt.want)
}
})
}
}
func Test_buildSentinelCommand(t *testing.T) {
type args struct {
rebootSentinelFile string
rebootSentinelCommand string
}
tests := []struct {
name string
args args
want []string
}{
{
name: "Ensure a sentinelFile generates a shell 'test' command with the right file",
args: args{
rebootSentinelFile: "/test1",
rebootSentinelCommand: "",
},
want: []string{"test", "-f", "/test1"},
},
{
name: "Ensure a sentinelCommand has priority over a sentinelFile if both are provided (because sentinelFile is always provided)",
args: args{
rebootSentinelFile: "/test1",
rebootSentinelCommand: "/sbin/reboot-required -r",
},
want: []string{"/sbin/reboot-required", "-r"},
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
if got := buildSentinelCommand(tt.args.rebootSentinelFile, tt.args.rebootSentinelCommand); !reflect.DeepEqual(got, tt.want) {
t.Errorf("buildSentinelCommand() = %v, want %v", got, tt.want)
}
})
}
}
func Test_parseRebootCommand(t *testing.T) {
type args struct {
rebootCommand string
}
tests := []struct {
name string
args args
want []string
}{
{
name: "Ensure a reboot command is properly parsed",
args: args{
rebootCommand: "/sbin/systemctl reboot",
},
want: []string{"/sbin/systemctl", "reboot"},
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
if got := parseRebootCommand(tt.args.rebootCommand); !reflect.DeepEqual(got, tt.want) {
t.Errorf("parseRebootCommand() = %v, want %v", got, tt.want)
}
})
}
}
func Test_rebootRequired(t *testing.T) {
type args struct {
sentinelCommand []string
}
tests := []struct {
name string
args args
want bool
}{
{
name: "Ensure rc = 0 means reboot required",
args: args{
sentinelCommand: []string{"true"},
},
want: true,
},
{
name: "Ensure rc != 0 means reboot NOT required",
args: args{
sentinelCommand: []string{"false"},
},
want: false,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
if got := rebootRequired(tt.args.sentinelCommand); got != tt.want {
t.Errorf("rebootRequired() = %v, want %v", got, tt.want)
}
})
}
}
func Test_rebootRequired_fatals(t *testing.T) {
cases := []struct {
param []string
expectFatal bool
}{
{
param: []string{"true"},
expectFatal: false,
},
{
param: []string{"./babar"},
expectFatal: true,
},
}
defer func() { log.StandardLogger().ExitFunc = nil }()
var fatal bool
log.StandardLogger().ExitFunc = func(int) { fatal = true }
for _, c := range cases {
fatal = false
rebootRequired(c.param)
assert.Equal(t, c.expectFatal, fatal)
}
}


@@ -0,0 +1,23 @@
package main
import (
"fmt"
"log"
"os"
"regexp"
"github.com/weaveworks/kured/pkg/alerts"
)
func main() {
if len(os.Args) != 3 {
log.Fatalf("USAGE: %s <prometheusURL> <filterRegexp>", os.Args[0])
}
count, err := alerts.PrometheusCountActive(os.Args[1], regexp.MustCompile(os.Args[2]))
if err != nil {
log.Fatal(err)
}
fmt.Println(count)
}

20
go.mod

@@ -1,20 +0,0 @@
module github.com/weaveworks/kured
go 1.16
require (
github.com/containrrr/shoutrrr v0.5.2
github.com/google/shlex v0.0.0-20191202100458-e7afc7fbc510
github.com/prometheus/client_golang v1.11.0
github.com/prometheus/common v0.32.1
github.com/sirupsen/logrus v1.8.1
github.com/spf13/cobra v1.3.0
github.com/spf13/pflag v1.0.5
github.com/spf13/viper v1.10.1
github.com/stretchr/testify v1.7.0
gotest.tools/v3 v3.0.3
k8s.io/api v0.22.4
k8s.io/apimachinery v0.22.4
k8s.io/client-go v0.22.4
k8s.io/kubectl v0.22.4
)

1189
go.sum

File diff suppressed because it is too large


@@ -1,40 +1,32 @@
---
apiVersion: v1
kind: ServiceAccount
metadata:
name: kured
namespace: kube-system
---
apiVersion: apps/v1
apiVersion: extensions/v1beta1
kind: DaemonSet
metadata:
name: kured # Must match `--ds-name`
namespace: kube-system # Must match `--ds-namespace`
spec:
selector:
matchLabels:
name: kured
updateStrategy:
type: RollingUpdate
template:
metadata:
labels:
name: kured
spec:
serviceAccountName: kured
tolerations:
- key: node-role.kubernetes.io/master
effect: NoSchedule
hostPID: true # Facilitate entering the host mount namespace via init
restartPolicy: Always
containers:
- name: kured
image: docker.io/weaveworks/kured:1.9.1
# If you find yourself here wondering why there is no
# :latest tag on Docker Hub, see the FAQ in the README
image: quay.io/weaveworks/kured
imagePullPolicy: IfNotPresent
securityContext:
privileged: true # Give permission to nsenter /proc/1/ns/mnt
command:
- /usr/bin/kured
# args:
# - --alert-filter-regexp=^RebootRequired$
# - --ds-name=kured
# - --ds-namespace=kube-system
# - --lock-annotation=weave.works/kured-node-lock
# - --period=1h
# - --prometheus-url=http://prometheus.monitoring.svc.cluster.local
# - --reboot-sentinel=/var/run/reboot-required
# - --slack-hook-url=https://hooks.slack.com/...
# - --slack-username=prod
#
# NO USER SERVICEABLE PARTS BEYOND THIS POINT
env:
# Pass in the name of the node on which this pod is scheduled
# for use with drain/uncordon operations and lock acquisition
@@ -42,37 +34,14 @@ spec:
valueFrom:
fieldRef:
fieldPath: spec.nodeName
command:
- /usr/bin/kured
# - --force-reboot=false
# - --drain-grace-period=-1
# - --skip-wait-for-delete-timeout=0
# - --drain-timeout=0
# - --period=1h
# - --ds-namespace=kube-system
# - --ds-name=kured
# - --lock-annotation=weave.works/kured-node-lock
# - --lock-ttl=0
# - --prometheus-url=http://prometheus.monitoring.svc.cluster.local
# - --alert-filter-regexp=^RebootRequired$
# - --alert-firing-only=false
# - --reboot-sentinel=/var/run/reboot-required
# - --prefer-no-schedule-taint=""
# - --reboot-sentinel-command=""
# - --slack-hook-url=https://hooks.slack.com/...
# - --slack-username=prod
# - --slack-channel=alerting
# - --notify-url="" # See also shoutrrr url format
# - --message-template-drain=Draining node %s
# - --message-template-reboot=Rebooting node %s
# - --blocking-pod-selector=runtime=long,cost=expensive
# - --blocking-pod-selector=name=temperamental
# - --blocking-pod-selector=...
# - --reboot-days=sun,mon,tue,wed,thu,fri,sat
# - --reboot-delay=90s
# - --start-time=0:00
# - --end-time=23:59:59
# - --time-zone=UTC
# - --annotate-nodes=false
# - --lock-release-delay=30m
# - --log-format=text
volumeMounts:
# Needed for two purposes:
# * Testing for the existence of /var/run/reboot-required
# * Accessing /var/run/dbus/system_bus_socket to effect reboot
- name: hostrun
mountPath: /var/run
restartPolicy: Always
volumes:
- name: hostrun
hostPath:
path: /var/run


@@ -1,63 +0,0 @@
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: kured
rules:
# Allow kured to read spec.unschedulable
# Allow kubectl to drain/uncordon
#
# NB: These permissions are tightly coupled to the bundled version of kubectl; the ones below
# match https://github.com/kubernetes/kubernetes/blob/v1.19.4/staging/src/k8s.io/kubectl/pkg/cmd/drain/drain.go
#
- apiGroups: [""]
resources: ["nodes"]
verbs: ["get", "patch"]
- apiGroups: [""]
resources: ["pods"]
verbs: ["list","delete","get"]
- apiGroups: ["apps"]
resources: ["daemonsets"]
verbs: ["get"]
- apiGroups: [""]
resources: ["pods/eviction"]
verbs: ["create"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: kured
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: kured
subjects:
- kind: ServiceAccount
name: kured
namespace: kube-system
---
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
namespace: kube-system
name: kured
rules:
# Allow kured to lock/unlock itself
- apiGroups: ["apps"]
resources: ["daemonsets"]
resourceNames: ["kured"]
verbs: ["update"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
namespace: kube-system
name: kured
subjects:
- kind: ServiceAccount
namespace: kube-system
name: kured
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: Role
name: kured


@@ -7,39 +7,21 @@ import (
"sort"
"time"
papi "github.com/prometheus/client_golang/api"
v1 "github.com/prometheus/client_golang/api/prometheus/v1"
"github.com/prometheus/client_golang/api/prometheus"
"github.com/prometheus/common/model"
)
// PromClient is a wrapper around the Prometheus Client interface and implements the api
// This way, the PromClient can be instantiated with the configuration the Client needs, and
// the ability to use the methods the api has, like Query and so on.
type PromClient struct {
papi papi.Client
api v1.API
}
// NewPromClient creates a new client to the Prometheus API.
// It returns an error on any problem.
func NewPromClient(conf papi.Config) (*PromClient, error) {
promClient, err := papi.NewClient(conf)
// Returns a list of names of active (e.g. pending or firing) alerts, filtered
// by the supplied regexp.
func PrometheusActiveAlerts(prometheusURL string, filter *regexp.Regexp) ([]string, error) {
client, err := prometheus.New(prometheus.Config{Address: prometheusURL})
if err != nil {
return nil, err
}
client := PromClient{papi: promClient, api: v1.NewAPI(promClient)}
return &client, nil
}
// ActiveAlerts is a method on PromClient; it returns the names of active alerts
// (i.e. pending or firing), filtered by the supplied regexp or by the includeLabels query.
// Filtering by regexp means that when the regex matches an alert name, that alert is
// excluded from the block-list and will NOT block rebooting. Querying by includeLabel
// means that if the query finds an alert, it is added to the block-list and WILL block rebooting.
func (p *PromClient) ActiveAlerts(filter *regexp.Regexp, firingOnly bool) ([]string, error) {
queryAPI := prometheus.NewQueryAPI(client)
// get all alerts from prometheus
value, _, err := p.api.Query(context.Background(), "ALERTS", time.Now())
value, err := queryAPI.Query(context.Background(), "ALERTS", time.Now())
if err != nil {
return nil, err
}
@@ -49,17 +31,17 @@ func (p *PromClient) ActiveAlerts(filter *regexp.Regexp, firingOnly bool) ([]str
activeAlertSet := make(map[string]bool)
for _, sample := range vector {
if alertName, isAlert := sample.Metric[model.AlertNameLabel]; isAlert && sample.Value != 0 {
if (filter == nil || !filter.MatchString(string(alertName))) && (!firingOnly || sample.Metric["alertstate"] == "firing") {
if filter == nil || !filter.MatchString(string(alertName)) {
activeAlertSet[string(alertName)] = true
}
}
}
var activeAlerts []string
for activeAlert := range activeAlertSet {
for activeAlert, _ := range activeAlertSet {
activeAlerts = append(activeAlerts, activeAlert)
}
sort.Strings(activeAlerts)
sort.Sort(sort.StringSlice(activeAlerts))
return activeAlerts, nil
}
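A usage sketch of the reworked API, assuming a reachable Prometheus at the same example address used in the manifest comments:
	promClient, err := alerts.NewPromClient(papi.Config{Address: "http://prometheus.monitoring.svc.cluster.local"})
	if err != nil {
		log.Fatal(err)
	}
	// Alerts matching the filter are excluded; firingOnly=true drops "pending" alerts.
	names, err := promClient.ActiveAlerts(regexp.MustCompile("^RebootRequired$"), true)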


@@ -1,141 +0,0 @@
package alerts
import (
"log"
"net/http"
"net/http/httptest"
"regexp"
"testing"
"github.com/prometheus/client_golang/api"
"github.com/stretchr/testify/assert"
)
type MockResponse struct {
StatusCode int
Body []byte
}
// MockServerProperties ties a mock response to a url and a method
type MockServerProperties struct {
URI string
HTTPMethod string
Response MockResponse
}
// NewMockServer sets up a new MockServer with the given properties and starts the server.
func NewMockServer(props ...MockServerProperties) *httptest.Server {
handler := http.HandlerFunc(
func(w http.ResponseWriter, r *http.Request) {
for _, proc := range props {
_, err := w.Write(proc.Response.Body)
if err != nil {
log.Fatal(err)
}
}
})
return httptest.NewServer(handler)
}
func TestActiveAlerts(t *testing.T) {
responsebody := `{"status":"success","data":{"resultType":"vector","result":[{"metric":{"__name__":"ALERTS","alertname":"GatekeeperViolations","alertstate":"firing","severity":"warning","team":"platform-infra"},"value":[1622472933.973,"1"]},{"metric":{"__name__":"ALERTS","alertname":"PodCrashing-dev","alertstate":"firing","container":"deployment","instance":"1.2.3.4:8080","job":"kube-state-metrics","namespace":"dev","pod":"dev-deployment-78dcbmf25v","severity":"critical","team":"dev"},"value":[1622472933.973,"1"]},{"metric":{"__name__":"ALERTS","alertname":"PodRestart-dev","alertstate":"firing","container":"deployment","instance":"1.2.3.4:1234","job":"kube-state-metrics","namespace":"qa","pod":"qa-job-deployment-78dcbmf25v","severity":"warning","team":"qa"},"value":[1622472933.973,"1"]},{"metric":{"__name__":"ALERTS","alertname":"PrometheusTargetDown","alertstate":"firing","job":"kubernetes-pods","severity":"warning","team":"platform-infra"},"value":[1622472933.973,"1"]},{"metric":{"__name__":"ALERTS","alertname":"ScheduledRebootFailing","alertstate":"pending","severity":"warning","team":"platform-infra"},"value":[1622472933.973,"1"]}]}}`
addr := "http://localhost:10001"
for _, tc := range []struct {
it string
rFilter string
respBody string
aName string
wantN int
firingOnly bool
}{
{
it: "should return no active alerts",
respBody: responsebody,
rFilter: "",
wantN: 0,
firingOnly: false,
},
{
it: "should return a subset of all alerts",
respBody: responsebody,
rFilter: "Pod",
wantN: 3,
firingOnly: false,
},
{
it: "should return all active alerts by regex",
respBody: responsebody,
rFilter: "*",
wantN: 5,
firingOnly: false,
},
{
it: "should return all active alerts by regex filter",
respBody: responsebody,
rFilter: "*",
wantN: 5,
firingOnly: false,
},
{
it: "should return only firing alerts if firingOnly is true",
respBody: responsebody,
rFilter: "*",
wantN: 4,
firingOnly: true,
},
{
it: "should return ScheduledRebootFailing active alerts",
respBody: `{"status":"success","data":{"resultType":"vector","result":[{"metric":{"__name__":"ALERTS","alertname":"ScheduledRebootFailing","alertstate":"pending","severity":"warning","team":"platform-infra"},"value":[1622472933.973,"1"]}]}}`,
aName: "ScheduledRebootFailing",
rFilter: "*",
wantN: 1,
firingOnly: false,
},
{
it: "should not return an active alert if RebootRequired is firing (regex filter)",
respBody: `{"status":"success","data":{"resultType":"vector","result":[{"metric":{"__name__":"ALERTS","alertname":"RebootRequired","alertstate":"pending","severity":"warning","team":"platform-infra"},"value":[1622472933.973,"1"]}]}}`,
rFilter: "RebootRequired",
wantN: 0,
firingOnly: false,
},
} {
// Start mockServer
mockServer := NewMockServer(MockServerProperties{
URI: addr,
HTTPMethod: http.MethodPost,
Response: MockResponse{
Body: []byte(tc.respBody),
},
})
// Close mockServer after all connections are gone
defer mockServer.Close()
t.Run(tc.it, func(t *testing.T) {
// regex filter
regex, _ := regexp.Compile(tc.rFilter)
// instantiate the prometheus client with the mockserver-address
p, err := NewPromClient(api.Config{Address: mockServer.URL})
if err != nil {
log.Fatal(err)
}
result, err := p.ActiveAlerts(regex, tc.firingOnly)
if err != nil {
log.Fatal(err)
}
// assert
assert.Equal(t, tc.wantN, len(result), "expected amount of alerts %v, got %v", tc.wantN, len(result))
if tc.aName != "" {
assert.Equal(t, tc.aName, result[0], "expected active alert %v, got %v", tc.aName, result[0])
}
})
}
}


@@ -1,25 +1,15 @@
package daemonsetlock
import (
"context"
"encoding/json"
"fmt"
"time"
v1 "k8s.io/api/apps/v1"
"k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/client-go/kubernetes"
)
const (
k8sAPICallRetrySleep = 5 * time.Second // How much time to wait in between retrying a k8s API call
k8sAPICallRetryTimeout = 5 * time.Minute // How long to wait until we determine that the k8s API is definitively unavailable
)
// DaemonSetLock holds all necessary information to do actions
// on the kured ds which holds lock info through annotations.
type DaemonSetLock struct {
client *kubernetes.Clientset
nodeID string
@@ -29,23 +19,19 @@ type DaemonSetLock struct {
}
type lockAnnotationValue struct {
NodeID string `json:"nodeID"`
Metadata interface{} `json:"metadata,omitempty"`
Created time.Time `json:"created"`
TTL time.Duration `json:"TTL"`
NodeID string `json:"nodeID"`
Metadata interface{} `json:"metadata,omitempty"`
}
// New creates a daemonsetLock object containing the necessary data for follow up k8s requests
func New(client *kubernetes.Clientset, nodeID, namespace, name, annotation string) *DaemonSetLock {
return &DaemonSetLock{client, nodeID, namespace, name, annotation}
}
// Acquire attempts to annotate the kured daemonset with lock info from instantiated DaemonSetLock using client-go
func (dsl *DaemonSetLock) Acquire(metadata interface{}, TTL time.Duration) (bool, string, error) {
func (dsl *DaemonSetLock) Acquire(metadata interface{}) (acquired bool, owner string, err error) {
for {
ds, err := dsl.GetDaemonSet(k8sAPICallRetrySleep, k8sAPICallRetryTimeout)
ds, err := dsl.client.ExtensionsV1beta1().DaemonSets(dsl.namespace).Get(dsl.name, metav1.GetOptions{})
if err != nil {
return false, "", fmt.Errorf("timed out trying to get daemonset %s in namespace %s: %w", dsl.name, dsl.namespace, err)
return false, "", err
}
valueString, exists := ds.ObjectMeta.Annotations[dsl.annotation]
@@ -54,23 +40,20 @@ func (dsl *DaemonSetLock) Acquire(metadata interface{}, TTL time.Duration) (bool
if err := json.Unmarshal([]byte(valueString), &value); err != nil {
return false, "", err
}
if !ttlExpired(value.Created, value.TTL) {
return value.NodeID == dsl.nodeID, value.NodeID, nil
}
return value.NodeID == dsl.nodeID, value.NodeID, nil
}
if ds.ObjectMeta.Annotations == nil {
ds.ObjectMeta.Annotations = make(map[string]string)
}
value := lockAnnotationValue{NodeID: dsl.nodeID, Metadata: metadata, Created: time.Now().UTC(), TTL: TTL}
value := lockAnnotationValue{NodeID: dsl.nodeID, Metadata: metadata}
valueBytes, err := json.Marshal(&value)
if err != nil {
return false, "", err
}
ds.ObjectMeta.Annotations[dsl.annotation] = string(valueBytes)
_, err = dsl.client.AppsV1().DaemonSets(dsl.namespace).Update(context.TODO(), ds, metav1.UpdateOptions{})
_, err = dsl.client.ExtensionsV1beta1().DaemonSets(dsl.namespace).Update(ds)
if err != nil {
if se, ok := err.(*errors.StatusError); ok && se.ErrStatus.Reason == metav1.StatusReasonConflict {
// Something else updated the resource between us reading and writing - try again soon
@@ -84,11 +67,10 @@ func (dsl *DaemonSetLock) Acquire(metadata interface{}, TTL time.Duration) (bool
}
}
// Test attempts to check the kured daemonset lock status (existence, expiry) from instantiated DaemonSetLock using client-go
func (dsl *DaemonSetLock) Test(metadata interface{}) (bool, error) {
ds, err := dsl.GetDaemonSet(k8sAPICallRetrySleep, k8sAPICallRetryTimeout)
func (dsl *DaemonSetLock) Test(metadata interface{}) (holding bool, err error) {
ds, err := dsl.client.ExtensionsV1beta1().DaemonSets(dsl.namespace).Get(dsl.name, metav1.GetOptions{})
if err != nil {
return false, fmt.Errorf("timed out trying to get daemonset %s in namespace %s: %w", dsl.name, dsl.namespace, err)
return false, err
}
valueString, exists := ds.ObjectMeta.Annotations[dsl.annotation]
@@ -97,21 +79,17 @@ func (dsl *DaemonSetLock) Test(metadata interface{}) (bool, error) {
if err := json.Unmarshal([]byte(valueString), &value); err != nil {
return false, err
}
if !ttlExpired(value.Created, value.TTL) {
return value.NodeID == dsl.nodeID, nil
}
return value.NodeID == dsl.nodeID, nil
}
return false, nil
}
// Release attempts to remove the lock data from the kured ds annotations using client-go
func (dsl *DaemonSetLock) Release() error {
for {
ds, err := dsl.GetDaemonSet(k8sAPICallRetrySleep, k8sAPICallRetryTimeout)
ds, err := dsl.client.ExtensionsV1beta1().DaemonSets(dsl.namespace).Get(dsl.name, metav1.GetOptions{})
if err != nil {
return fmt.Errorf("timed out trying to get daemonset %s in namespace %s: %w", dsl.name, dsl.namespace, err)
return err
}
valueString, exists := ds.ObjectMeta.Annotations[dsl.annotation]
@@ -120,7 +98,6 @@ func (dsl *DaemonSetLock) Release() error {
if err := json.Unmarshal([]byte(valueString), &value); err != nil {
return err
}
if value.NodeID != dsl.nodeID {
return fmt.Errorf("Not lock holder: %v", value.NodeID)
}
@@ -130,7 +107,7 @@ func (dsl *DaemonSetLock) Release() error {
delete(ds.ObjectMeta.Annotations, dsl.annotation)
_, err = dsl.client.AppsV1().DaemonSets(dsl.namespace).Update(context.TODO(), ds, metav1.UpdateOptions{})
_, err = dsl.client.ExtensionsV1beta1().DaemonSets(dsl.namespace).Update(ds)
if err != nil {
if se, ok := err.(*errors.StatusError); ok && se.ErrStatus.Reason == metav1.StatusReasonConflict {
// Something else updated the resource between us reading and writing - try again soon
@@ -143,28 +120,3 @@ func (dsl *DaemonSetLock) Release() error {
return nil
}
}
// GetDaemonSet returns the named DaemonSet resource from the DaemonSetLock's configured client
func (dsl *DaemonSetLock) GetDaemonSet(sleep, timeout time.Duration) (*v1.DaemonSet, error) {
var ds *v1.DaemonSet
var lastError error
err := wait.PollImmediate(sleep, timeout, func() (bool, error) {
ctx, cancel := context.WithTimeout(context.Background(), timeout)
defer cancel()
if ds, lastError = dsl.client.AppsV1().DaemonSets(dsl.namespace).Get(ctx, dsl.name, metav1.GetOptions{}); lastError != nil {
return false, nil
}
return true, nil
})
if err != nil {
return nil, fmt.Errorf("Timed out trying to get daemonset %s in namespace %s: %v", dsl.name, dsl.namespace, lastError)
}
return ds, nil
}
func ttlExpired(created time.Time, ttl time.Duration) bool {
if ttl > 0 && time.Since(created) >= ttl {
return true
}
return false
}
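With the new fields, the lock annotation value marshals into something like the sketch below (time.Duration has no custom JSON marshalling, so TTL appears as integer nanoseconds); ttlExpired then compares time.Since(created) against it, with TTL <= 0 meaning the lock never expires:
	// Hypothetical node name, for illustration.
	value := lockAnnotationValue{NodeID: "node-1", Created: time.Now().UTC(), TTL: 30 * time.Minute}
	b, _ := json.Marshal(&value)
	// b is JSON like {"nodeID":"node-1","created":"<RFC3339 timestamp>","TTL":1800000000000}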


@@ -1,28 +0,0 @@
package daemonsetlock
import (
"testing"
"time"
)
func TestTtlExpired(t *testing.T) {
d := time.Date(2020, 05, 05, 14, 15, 0, 0, time.UTC)
second, _ := time.ParseDuration("1s")
zero, _ := time.ParseDuration("0m")
tests := []struct {
created time.Time
ttl time.Duration
result bool
}{
{d, second, true},
{time.Now(), second, false},
{d, zero, false},
}
for i, tst := range tests {
if ttlExpired(tst.created, tst.ttl) != tst.result {
t.Errorf("Test %d failed, expected %v but got %v", i, tst.result, !tst.result)
}
}
}


@@ -5,7 +5,7 @@ import (
"time"
)
// New ticks regularly after an initial delay randomly distributed between d/2 and d + d/2
// Tick regularly after an initial delay randomly distributed between d/2 and d + d/2
func New(s rand.Source, d time.Duration) <-chan time.Time {
c := make(chan time.Time)
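	// The hunk is truncated here; a plausible completion, consistent with the
	// doc comment above (first tick after a uniform delay in [d/2, 3d/2), then
	// every d thereafter):
	go func() {
		r := rand.New(s)
		delay := d/2 + time.Duration(r.Int63n(int64(d)))
		time.Sleep(delay)
		for {
			c <- time.Now()
			time.Sleep(d)
		}
	}()
	return c
}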


@@ -0,0 +1,42 @@
package slack
import (
"bytes"
"encoding/json"
"fmt"
"net/http"
"time"
)
var (
httpClient = &http.Client{Timeout: 5 * time.Second}
)
type body struct {
Text string `json:"text,omitempty"`
Username string `json:"username,omitempty"`
}
func NotifyReboot(hookURL, username, nodeID string) error {
msg := body{
Text: fmt.Sprintf("Rebooting node %s", nodeID),
Username: username,
}
var buf bytes.Buffer
if err := json.NewEncoder(&buf).Encode(&msg); err != nil {
return err
}
resp, err := httpClient.Post(hookURL, "application/json", &buf)
if err != nil {
return err
}
defer resp.Body.Close()
if resp.StatusCode < 200 || resp.StatusCode >= 300 {
return fmt.Errorf(resp.Status)
}
return nil
}
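A usage sketch for the notifier, with a placeholder webhook URL:
	// The URL below is a placeholder, not a real webhook.
	if err := NotifyReboot("https://hooks.slack.com/services/T000/B000/XXXX", "kured", "node-1"); err != nil {
		log.Printf("slack notification failed: %v", err)
	}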


@@ -1,166 +0,0 @@
package taints
import (
"context"
"encoding/json"
"fmt"
log "github.com/sirupsen/logrus"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
"k8s.io/client-go/kubernetes"
)
// Taint allows to set soft and hard limitations for scheduling and executing pods on nodes.
type Taint struct {
client *kubernetes.Clientset
nodeID string
taintName string
effect v1.TaintEffect
exists bool
}
// New provides a new taint.
func New(client *kubernetes.Clientset, nodeID, taintName string, effect v1.TaintEffect) *Taint {
exists, _, _ := taintExists(client, nodeID, taintName)
return &Taint{
client: client,
nodeID: nodeID,
taintName: taintName,
effect: effect,
exists: exists,
}
}
// Enable creates the taint for a node. Creating an existing taint is a noop.
func (t *Taint) Enable() {
if t.taintName == "" {
return
}
if t.exists {
return
}
preferNoSchedule(t.client, t.nodeID, t.taintName, t.effect, true)
t.exists = true
}
// Disable removes the taint for a node. Removing a missing taint is a noop.
func (t *Taint) Disable() {
if t.taintName == "" {
return
}
if !t.exists {
return
}
preferNoSchedule(t.client, t.nodeID, t.taintName, t.effect, false)
t.exists = false
}
func taintExists(client *kubernetes.Clientset, nodeID, taintName string) (bool, int, *v1.Node) {
updatedNode, err := client.CoreV1().Nodes().Get(context.TODO(), nodeID, metav1.GetOptions{})
if err != nil || updatedNode == nil {
log.Fatalf("Error reading node %s: %v", nodeID, err)
}
for i, taint := range updatedNode.Spec.Taints {
if taint.Key == taintName {
return true, i, updatedNode
}
}
return false, 0, updatedNode
}
func preferNoSchedule(client *kubernetes.Clientset, nodeID, taintName string, effect v1.TaintEffect, shouldExists bool) {
taintExists, offset, updatedNode := taintExists(client, nodeID, taintName)
if taintExists && shouldExists {
log.Debugf("Taint %v exists already for node %v.", taintName, nodeID)
return
}
if !taintExists && !shouldExists {
log.Debugf("Taint %v already missing for node %v.", taintName, nodeID)
return
}
type patchTaints struct {
Op string `json:"op"`
Path string `json:"path"`
Value interface{} `json:"value,omitempty"`
}
taint := v1.Taint{
Key: taintName,
Effect: effect,
}
var patches []patchTaints
if len(updatedNode.Spec.Taints) == 0 {
// add first taint and ensure to keep current taints
patches = []patchTaints{
{
Op: "test",
Path: "/spec",
Value: updatedNode.Spec,
},
{
Op: "add",
Path: "/spec/taints",
Value: []v1.Taint{},
},
{
Op: "add",
Path: "/spec/taints/-",
Value: taint,
},
}
} else if taintExists {
// remove taint and ensure to test against race conditions
patches = []patchTaints{
{
Op: "test",
Path: fmt.Sprintf("/spec/taints/%d", offset),
Value: taint,
},
{
Op: "remove",
Path: fmt.Sprintf("/spec/taints/%d", offset),
},
}
} else {
// add missing taint to existing list
patches = []patchTaints{
{
Op: "add",
Path: "/spec/taints/-",
Value: taint,
},
}
}
patchBytes, err := json.Marshal(patches)
if err != nil {
log.Fatalf("Error encoding taint patch for node %s: %v", nodeID, err)
}
_, err = client.CoreV1().Nodes().Patch(context.TODO(), nodeID, types.JSONPatchType, patchBytes, metav1.PatchOptions{})
if err != nil {
log.Fatalf("Error patching taint for node %s: %v", nodeID, err)
}
if shouldExists {
log.Info("Node taint added")
} else {
log.Info("Node taint removed")
}
}
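Reusing the patchTaints and taint values from the function above, the removal case marshals to a compact JSON Patch; the leading "test" op makes the whole patch fail atomically if another controller moved or changed the taint between our read and our write:
	// Assuming the taint sits at index 0 and a hypothetical key "example.com/kured-taint":
	patches := []patchTaints{
		{Op: "test", Path: "/spec/taints/0", Value: taint},
		{Op: "remove", Path: "/spec/taints/0"},
	}
	// json.Marshal(patches) yields:
	// [{"op":"test","path":"/spec/taints/0","value":{"key":"example.com/kured-taint","effect":"PreferNoSchedule"}},
	//  {"op":"remove","path":"/spec/taints/0"}]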


@@ -1,91 +0,0 @@
package timewindow
import (
"fmt"
"strconv"
"strings"
"time"
)
// EveryDay contains all days of the week, and exports it
// for convenience use in the cmd line arguments.
var EveryDay = []string{"su", "mo", "tu", "we", "th", "fr", "sa"}
// dayStrings maps day strings to time.Weekdays
var dayStrings = map[string]time.Weekday{
"su": time.Sunday,
"sun": time.Sunday,
"sunday": time.Sunday,
"mo": time.Monday,
"mon": time.Monday,
"monday": time.Monday,
"tu": time.Tuesday,
"tue": time.Tuesday,
"tuesday": time.Tuesday,
"we": time.Wednesday,
"wed": time.Wednesday,
"wednesday": time.Wednesday,
"th": time.Thursday,
"thu": time.Thursday,
"thursday": time.Thursday,
"fr": time.Friday,
"fri": time.Friday,
"friday": time.Friday,
"sa": time.Saturday,
"sat": time.Saturday,
"saturday": time.Saturday,
}
type weekdays uint32
// parseWeekdays creates a set of weekdays from a string slice
func parseWeekdays(days []string) (weekdays, error) {
var result uint32
for _, day := range days {
if len(day) == 0 {
continue
}
weekday, err := parseWeekday(day)
if err != nil {
return weekdays(0), err
}
result |= 1 << uint32(weekday)
}
return weekdays(result), nil
}
// Contains returns true if the specified weekday is a member of this set.
func (w weekdays) Contains(day time.Weekday) bool {
return uint32(w)&(1<<uint32(day)) != 0
}
// String returns a string representation of the set of weekdays.
func (w weekdays) String() string {
var b strings.Builder
for i := uint32(0); i < 7; i++ {
if uint32(w)&(1<<i) != 0 {
b.WriteString(time.Weekday(i).String()[0:3])
} else {
b.WriteString("---")
}
}
return b.String()
}
func parseWeekday(day string) (time.Weekday, error) {
if n, err := strconv.Atoi(day); err == nil {
if n >= 0 && n < 7 {
return time.Weekday(n), nil
}
return time.Sunday, fmt.Errorf("Invalid weekday, number out of range: %s", day)
}
if weekday, ok := dayStrings[strings.ToLower(day)]; ok {
return weekday, nil
}
return time.Sunday, fmt.Errorf("Invalid weekday: %s", day)
}
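A worked example of the bitmask (Sunday is bit 0, Saturday bit 6):
	days, _ := parseWeekdays([]string{"mo", "we"})
	// Monday = 1 and Wednesday = 3, so days == weekdays(1<<1 | 1<<3)
	// days.Contains(time.Wednesday) == true
	// days.String() == "---Mon---Wed---------"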


@@ -1,46 +0,0 @@
package timewindow
import (
"strings"
"testing"
)
func TestParseWeekdays(t *testing.T) {
tests := []struct {
input string
result string
}{
{"0,4", "Sun---------Thu------"},
{"su,mo,tu", "SunMonTue------------"},
{"sunday,tu,thu", "Sun---Tue---Thu------"},
{"THURSDAY", "------------Thu------"},
{"we,WED,WeDnEsDaY", "---------Wed---------"},
{"", "---------------------"},
{",,,", "---------------------"},
}
for _, tst := range tests {
res, err := parseWeekdays(strings.Split(tst.input, ","))
if err != nil {
t.Errorf("Received error for input %s: %v", tst.input, err)
} else if res.String() != tst.result {
t.Errorf("Test %s: Expected %s got %s", tst.input, tst.result, res.String())
}
}
}
func TestParseWeekdaysErrors(t *testing.T) {
tests := []string{
"15",
"-8",
"8",
"mon,tue,wed,fridayyyy",
}
for _, tst := range tests {
_, err := parseWeekdays(strings.Split(tst, ","))
if err == nil {
t.Errorf("Expected to receive error for input %s", tst)
}
}
}


@@ -1,81 +0,0 @@
package timewindow
import (
"fmt"
"time"
)
// TimeWindow specifies a schedule of days and times.
type TimeWindow struct {
days weekdays
location *time.Location
startTime time.Time
endTime time.Time
}
// New creates a TimeWindow instance based on string inputs specifying a schedule.
func New(days []string, startTime, endTime, location string) (*TimeWindow, error) {
tw := &TimeWindow{}
var err error
if tw.days, err = parseWeekdays(days); err != nil {
return nil, err
}
if tw.location, err = time.LoadLocation(location); err != nil {
return nil, err
}
if tw.startTime, err = parseTime(startTime, tw.location); err != nil {
return nil, err
}
if tw.endTime, err = parseTime(endTime, tw.location); err != nil {
return nil, err
}
return tw, nil
}
// Contains determines whether the specified time is within this time window.
func (tw *TimeWindow) Contains(t time.Time) bool {
loctime := t.In(tw.location)
if !tw.days.Contains(loctime.Weekday()) {
return false
}
start := time.Date(loctime.Year(), loctime.Month(), loctime.Day(), tw.startTime.Hour(), tw.startTime.Minute(), tw.startTime.Second(), 0, tw.location)
end := time.Date(loctime.Year(), loctime.Month(), loctime.Day(), tw.endTime.Hour(), tw.endTime.Minute(), tw.endTime.Second(), 1e9-1, tw.location)
// Time-wrap validation:
// If the start time is after the end time, the window wraps around midnight.
// We then decide which day to wrap to: if loctime is before today's end time,
// we shift the start back to the previous day; otherwise we shift the end
// forward to the next day.
if tw.startTime.After(tw.endTime) {
if loctime.Before(end) {
start = start.Add(-24 * time.Hour)
} else {
end = end.Add(24 * time.Hour)
}
}
return (loctime.After(start) || loctime.Equal(start)) && (loctime.Before(end) || loctime.Equal(end))
}
// String returns a string representation of this time window.
func (tw *TimeWindow) String() string {
return fmt.Sprintf("%s between %02d:%02d and %02d:%02d %s", tw.days.String(), tw.startTime.Hour(), tw.startTime.Minute(), tw.endTime.Hour(), tw.endTime.Minute(), tw.location.String())
}
// parseTime tries to parse a time with several formats.
func parseTime(s string, loc *time.Location) (time.Time, error) {
fmts := []string{"15:04", "15:04:05", "03:04pm", "15", "03pm", "3pm"}
for _, f := range fmts {
if t, err := time.ParseInLocation(f, s, loc); err == nil {
return t, nil
}
}
return time.Now(), fmt.Errorf("Invalid time format: %s", s)
}
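A worked example of the wrap logic, mirroring the "9pm" to "5am" cases in the tests below:
	tw, _ := timewindow.New([]string{"mon", "tue", "wed", "thu", "fri"}, "9pm", "5am", "America/Los_Angeles")
	loc, _ := time.LoadLocation("America/Los_Angeles")
	t, _ := time.ParseInLocation("2006/01/02 15:04 MST", "2019/04/04 00:49 PDT", loc)
	// Thursday 00:49: start (21:00) is after end (05:00), and loctime is before
	// today's end, so the start is shifted back to Wednesday evening.
	tw.Contains(t) // == true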


@@ -1,97 +0,0 @@
package timewindow
import (
"strings"
"testing"
"time"
)
func TestTimeWindows(t *testing.T) {
type testcase struct {
time string
result bool
}
tests := []struct {
days string
start string
end string
loc string
cases []testcase
}{
{"mon,tue,wed,thu,fri", "9am", "5pm", "America/Los_Angeles", []testcase{
{"2019/03/31 10:00 PDT", false},
{"2019/04/04 00:49 PDT", false},
{"2019/04/04 12:00 PDT", true},
{"2019/04/04 11:59 UTC", false},
{"2019/04/05 08:59 PDT", false},
{"2019/04/05 9:01 PDT", true},
}},
{"mon,we,fri", "10:01", "11:30am", "America/Los_Angeles", []testcase{
{"2019/04/05 10:30 PDT", true},
{"2019/04/06 10:30 PDT", false},
{"2019/04/07 10:30 PDT", false},
{"2019/04/08 10:30 PDT", true},
{"2019/04/09 10:30 PDT", false},
{"2019/04/10 10:30 PDT", true},
{"2019/04/11 10:30 PDT", false},
}},
{"mo,tu,we,th,fr", "00:00", "23:59:59", "UTC", []testcase{
{"2019/04/18 00:00 UTC", true},
{"2019/04/18 23:59 UTC", true},
}},
{"mon,tue,wed,thu,fri", "9pm", "5am", "America/Los_Angeles", []testcase{
{"2019/03/30 04:00 PDT", false},
{"2019/03/31 10:00 PDT", false},
{"2019/03/31 22:00 PDT", false},
{"2019/04/04 00:49 PDT", true},
{"2019/04/04 12:00 PDT", false},
{"2019/04/04 22:49 PDT", true},
{"2019/04/05 00:49 PDT", true},
{"2019/04/05 08:59 PDT", false},
{"2019/04/05 9:01 PDT", false},
}},
{"mon,tue,wed,thu,fri", "11:59pm", "00:01am", "America/Los_Angeles", []testcase{
{"2019/04/04 23:58 PDT", false},
{"2019/04/04 23:59 PDT", true},
{"2019/04/05 00:00 PDT", true},
{"2019/04/05 00:01 PDT", true},
{"2019/04/05 00:02 PDT", false},
}},
{"mon,tue,wed,fri", "11:59pm", "00:01am", "America/Los_Angeles", []testcase{
{"2019/04/04 23:58 PDT", false},
{"2019/04/04 23:59 PDT", false}, // Even that this falls in the between the hours Thursday is not included so should not run
{"2019/04/05 00:00 PDT", true},
{"2019/04/05 00:02 PDT", false},
}},
{"mon,tue,wed,thu", "11:59pm", "00:01am", "America/Los_Angeles", []testcase{
{"2019/04/04 23:58 PDT", false},
{"2019/04/04 23:59 PDT", true},
{"2019/04/05 00:00 PDT", false}, // Even that this falls in the between the hours Friday is not included so should not run
{"2019/04/05 00:02 PDT", false},
}},
{"mon,tue,wed,thu,fri", "11:59pm", "00:01am", "UTC", []testcase{
{"2019/04/04 23:58 UTC", false},
{"2019/04/04 23:59 UTC", true},
{"2019/04/05 00:00 UTC", true},
{"2019/04/05 00:01 UTC", true},
{"2019/04/05 00:02 UTC", false},
}},
}
for i, tst := range tests {
tw, err := New(strings.Split(tst.days, ","), tst.start, tst.end, tst.loc)
if err != nil {
t.Errorf("Test [%d] failed to create TimeWindow: %v", i, err)
}
for _, cas := range tst.cases {
tm, err := time.ParseInLocation("2006/01/02 15:04 MST", cas.time, tw.location)
if err != nil {
t.Errorf("Failed to parse time \"%s\": %v", cas.time, err)
} else if cas.result != tw.Contains(tm) {
t.Errorf("(%s) contains (%s) didn't match expected result of %v", tw.String(), cas.time, cas.result)
}
}
}
}


@@ -1,12 +0,0 @@
#!/usr/bin/env bash
# USE KUBECTL_CMD to pass context and/or namespaces.
KUBECTL_CMD="${KUBECTL_CMD:-kubectl}"
SENTINEL_FILE="${SENTINEL_FILE:-/var/run/reboot-required}"
echo "Creating reboot sentinel on all nodes"
for nodename in $("$KUBECTL_CMD" get nodes -o name); do
docker exec "${nodename/node\//}" hostname
docker exec "${nodename/node\//}" touch "${SENTINEL_FILE}"
done


@@ -1,85 +0,0 @@
#!/usr/bin/env bash
NODECOUNT=${NODECOUNT:-5}
KUBECTL_CMD="${KUBECTL_CMD:-kubectl}"
DEBUG="${DEBUG:-false}"
CONTAINER_NAME_FORMAT=${CONTAINER_NAME_FORMAT:-"chart-testing-*"}
tmp_dir=$(mktemp -d -t kured-XXXX)
function gather_logs_and_cleanup {
if [[ -f "$tmp_dir"/node_output ]]; then
rm "$tmp_dir"/node_output
fi
rmdir "$tmp_dir"
# The next commands are useful regardless of success or failure.
if [[ "$DEBUG" == "true" ]]; then
echo "############################################################"
# This is useful to see if containers have crashed.
echo "docker ps -a:"
docker ps -a
echo "docker journal logs"
journalctl -u docker --no-pager
# This is useful to see if the nodes have _properly_ rebooted.
# It should show the reboot, i.e. two container starts per node.
for name in $(docker ps -a -f "name=${CONTAINER_NAME_FORMAT}" -q); do
echo "############################################################"
echo "docker logs for container $name:"
docker logs "$name"
done
fi
}
trap gather_logs_and_cleanup EXIT
declare -A was_unschedulable
declare -A has_recovered
max_attempts="60"
sleep_time=60
attempt_num=1
set +o errexit
echo "There are $NODECOUNT nodes in the cluster"
until [ ${#was_unschedulable[@]} == "$NODECOUNT" ] && [ ${#has_recovered[@]} == "$NODECOUNT" ]
do
echo "${#was_unschedulable[@]} nodes were removed from pool once:" "${!was_unschedulable[@]}"
echo "${#has_recovered[@]} nodes removed from the pool are now back:" "${!has_recovered[@]}"
"$KUBECTL_CMD" get nodes -o custom-columns=NAME:.metadata.name,SCHEDULABLE:.spec.unschedulable --no-headers > "$tmp_dir"/node_output
if [[ "$DEBUG" == "true" ]]; then
# This is useful to see if a node gets stuck after drain, and doesn't
# come back up.
echo "Result of command $KUBECTL_CMD get nodes ... showing unschedulable nodes:"
cat "$tmp_dir"/node_output
fi
while read -r node; do
unschedulable=$(echo "$node" | grep true | cut -f 1 -d ' ')
if [ -n "$unschedulable" ] && [ -z ${was_unschedulable["$unschedulable"]+x} ] ; then
echo "$unschedulable is now unschedulable!"
was_unschedulable["$unschedulable"]=1
fi
schedulable=$(echo "$node" | grep '<none>' | cut -f 1 -d ' ')
if [ -n "$schedulable" ] && [ ${was_unschedulable["$schedulable"]+x} ] && [ -z ${has_recovered["$schedulable"]+x} ]; then
echo "$schedulable has recovered!"
has_recovered["$schedulable"]=1
fi
done < "$tmp_dir"/node_output
if [[ "${#has_recovered[@]}" == "$NODECOUNT" ]]; then
echo "All nodes recovered."
break
else
if (( attempt_num == max_attempts ))
then
echo "Attempt $attempt_num failed and there are no more attempts left!"
exit 1
else
echo "Attempt $attempt_num failed! Trying again in $sleep_time seconds..."
sleep "$sleep_time"
fi
fi
(( attempt_num++ ))
done
echo "Test successful"


@@ -1,19 +0,0 @@
#!/usr/bin/env bash
expected="$1"
if [[ "$expected" != "0" && "$expected" != "1" ]]; then
echo "You should give an argument to this script, the gauge value (0 or 1)"
exit 1
fi
HOST="${HOST:-localhost}"
PORT="${PORT:-30000}"
NODENAME="${NODENAME-chart-testing-control-plane}"
reboot_required=$(docker exec "$NODENAME" curl "http://$HOST:$PORT/metrics" | awk '/^kured_reboot_required/{print $2}')
if [[ "$reboot_required" == "$expected" ]]; then
echo "Test success"
else
echo "Test failed"
exit 1
fi