mirror of
https://github.com/kubereboot/kured.git
synced 2026-02-17 10:49:50 +00:00
Compare commits
146 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
e1db60b2b5 | ||
|
|
f3295b99ef | ||
|
|
178ba93b5a | ||
|
|
f3ed0087d2 | ||
|
|
71a273a14c | ||
|
|
2b36eab0f8 | ||
|
|
aefd901b4e | ||
|
|
91b01b5524 | ||
|
|
f1255bff91 | ||
|
|
22a76f0da2 | ||
|
|
b52a9587f3 | ||
|
|
a6e1cf8191 | ||
|
|
d7576dce0f | ||
|
|
661af3b042 | ||
|
|
eec8ca1f9b | ||
|
|
15356fa26d | ||
|
|
7e3565a565 | ||
|
|
a3bc03b4b9 | ||
|
|
22ce5a2628 | ||
|
|
0f80b70478 | ||
|
|
28be690849 | ||
|
|
84292cc8c3 | ||
|
|
21b54227a7 | ||
|
|
8e3fb55ec4 | ||
|
|
1a6592851e | ||
|
|
bba3b8d83f | ||
|
|
9c6d6a6d82 | ||
|
|
997794eaac | ||
|
|
0763cdd95a | ||
|
|
c004566e97 | ||
|
|
077ef2488e | ||
|
|
06093ab53b | ||
|
|
4d2019c07f | ||
|
|
687aeda813 | ||
|
|
acddd6b675 | ||
|
|
54e7d93902 | ||
|
|
2666b49d01 | ||
|
|
ff1a27ba8b | ||
|
|
38ed636ecf | ||
|
|
8324b09bb9 | ||
|
|
fb8677e7ac | ||
|
|
bdd16d4e01 | ||
|
|
16e6d3c4d3 | ||
|
|
af824bfd6a | ||
|
|
8264a529d6 | ||
|
|
cd25017d67 | ||
|
|
4c1a23a047 | ||
|
|
8f86e1d4f8 | ||
|
|
79e19d84ba | ||
|
|
01396db3d1 | ||
|
|
d3b59b8922 | ||
|
|
eafe2c3d98 | ||
|
|
e4f1c7358c | ||
|
|
348b5b4c96 | ||
|
|
c8a3a6ff9d | ||
|
|
c196d4e97f | ||
|
|
efc98c8813 | ||
|
|
b108aa4d2d | ||
|
|
2ae0a82510 | ||
|
|
f95664156d | ||
|
|
891afda596 | ||
|
|
2b89170417 | ||
|
|
de59c2614d | ||
|
|
2e5cb81b4c | ||
|
|
fde91041d5 | ||
|
|
8a3f486ad9 | ||
|
|
513db7ce8c | ||
|
|
938cbd428c | ||
|
|
fa28b550b2 | ||
|
|
164183e1bc | ||
|
|
7d0499cc0a | ||
|
|
5e32864e0b | ||
|
|
718faf4d31 | ||
|
|
ac9e669b52 | ||
|
|
7c33ad8b6e | ||
|
|
6f8d36e8db | ||
|
|
688346e811 | ||
|
|
079425349d | ||
|
|
d7589b16d7 | ||
|
|
bab1425e1a | ||
|
|
4e1c05c5e3 | ||
|
|
2c7ca8261f | ||
|
|
6ebf9a96f9 | ||
|
|
adffa11796 | ||
|
|
1152d72d51 | ||
|
|
fb6a224f66 | ||
|
|
c671dce161 | ||
|
|
f8fc6e5017 | ||
|
|
effbf62987 | ||
|
|
6423bf0069 | ||
|
|
9c81caa92e | ||
|
|
978acba030 | ||
|
|
acef34e916 | ||
|
|
f72ef8c2ca | ||
|
|
3c2508050d | ||
|
|
483a5d8211 | ||
|
|
9b89a8c0fc | ||
|
|
b5a4bf432c | ||
|
|
cee15cfc32 | ||
|
|
b2b1940435 | ||
|
|
a9eb139f60 | ||
|
|
d6e478ec6b | ||
|
|
0955403470 | ||
|
|
a3f9796305 | ||
|
|
9473f831be | ||
|
|
3682eb36de | ||
|
|
3900ee8876 | ||
|
|
4c31084be8 | ||
|
|
6c9ee57dc1 | ||
|
|
3c5eb968d3 | ||
|
|
54c0e4e25f | ||
|
|
afac9d435a | ||
|
|
6af3f1abc1 | ||
|
|
a48da239bc | ||
|
|
c7d5810503 | ||
|
|
6e16e993d9 | ||
|
|
24f4925b3f | ||
|
|
c0333d186e | ||
|
|
7a2b4a6a1a | ||
|
|
fb7a7feb15 | ||
|
|
ffddfd7add | ||
|
|
a0bc7daa32 | ||
|
|
fd6f520b6e | ||
|
|
c2f275ebd0 | ||
|
|
01b0ca8cea | ||
|
|
aa45139b80 | ||
|
|
1654b75ec4 | ||
|
|
e4da44a774 | ||
|
|
e301908ae8 | ||
|
|
f442c6b632 | ||
|
|
8fc0a9daf2 | ||
|
|
4d783e4321 | ||
|
|
11f077f689 | ||
|
|
807b727ab3 | ||
|
|
c826d73695 | ||
|
|
5193f2de16 | ||
|
|
310c6c114d | ||
|
|
e1017f47fb | ||
|
|
42f69c7b1e | ||
|
|
e3f4a88a07 | ||
|
|
48dc84b3e6 | ||
|
|
816c732f39 | ||
|
|
0bd22c7c56 | ||
|
|
2850417e48 | ||
|
|
120bf713c0 | ||
|
|
9583df2e50 |
4
.github/dependabot.yml
vendored
4
.github/dependabot.yml
vendored
@@ -15,3 +15,7 @@ updates:
|
||||
- dependency-name: "k8s.io/apimachinery"
|
||||
- dependency-name: "k8s.io/client-go"
|
||||
- dependency-name: "k8s.io/kubectl"
|
||||
- package-ecosystem: "docker"
|
||||
directory: "cmd/kured"
|
||||
schedule:
|
||||
interval: "daily"
|
||||
|
||||
13
.github/kind-cluster-1.20.yaml
vendored
13
.github/kind-cluster-1.20.yaml
vendored
@@ -1,13 +0,0 @@
|
||||
kind: Cluster
|
||||
apiVersion: kind.x-k8s.io/v1alpha4
|
||||
nodes:
|
||||
- role: control-plane
|
||||
image: "kindest/node:v1.20.2"
|
||||
- role: control-plane
|
||||
image: "kindest/node:v1.20.2"
|
||||
- role: control-plane
|
||||
image: "kindest/node:v1.20.2"
|
||||
- role: worker
|
||||
image: "kindest/node:v1.20.2"
|
||||
- role: worker
|
||||
image: "kindest/node:v1.20.2"
|
||||
10
.github/kind-cluster-1.21.yaml
vendored
10
.github/kind-cluster-1.21.yaml
vendored
@@ -2,12 +2,12 @@ kind: Cluster
|
||||
apiVersion: kind.x-k8s.io/v1alpha4
|
||||
nodes:
|
||||
- role: control-plane
|
||||
image: kindest/node:v1.21.1
|
||||
image: kindest/node:v1.21.2
|
||||
- role: control-plane
|
||||
image: kindest/node:v1.21.1
|
||||
image: kindest/node:v1.21.2
|
||||
- role: control-plane
|
||||
image: kindest/node:v1.21.1
|
||||
image: kindest/node:v1.21.2
|
||||
- role: worker
|
||||
image: kindest/node:v1.21.1
|
||||
image: kindest/node:v1.21.2
|
||||
- role: worker
|
||||
image: kindest/node:v1.21.1
|
||||
image: kindest/node:v1.21.2
|
||||
|
||||
@@ -2,12 +2,12 @@ kind: Cluster
|
||||
apiVersion: kind.x-k8s.io/v1alpha4
|
||||
nodes:
|
||||
- role: control-plane
|
||||
image: kindest/node:v1.19.7
|
||||
image: kindest/node:v1.22.4
|
||||
- role: control-plane
|
||||
image: kindest/node:v1.19.7
|
||||
image: kindest/node:v1.22.4
|
||||
- role: control-plane
|
||||
image: kindest/node:v1.19.7
|
||||
image: kindest/node:v1.22.4
|
||||
- role: worker
|
||||
image: kindest/node:v1.19.7
|
||||
image: kindest/node:v1.22.4
|
||||
- role: worker
|
||||
image: kindest/node:v1.19.7
|
||||
image: kindest/node:v1.22.4
|
||||
13
.github/kind-cluster-1.23.yaml
vendored
Normal file
13
.github/kind-cluster-1.23.yaml
vendored
Normal file
@@ -0,0 +1,13 @@
|
||||
kind: Cluster
|
||||
apiVersion: kind.x-k8s.io/v1alpha4
|
||||
nodes:
|
||||
- role: control-plane
|
||||
image: "kindest/node:v1.23.0"
|
||||
- role: control-plane
|
||||
image: "kindest/node:v1.23.0"
|
||||
- role: control-plane
|
||||
image: "kindest/node:v1.23.0"
|
||||
- role: worker
|
||||
image: "kindest/node:v1.23.0"
|
||||
- role: worker
|
||||
image: "kindest/node:v1.23.0"
|
||||
33
.github/workflows/on-main-push.yaml
vendored
33
.github/workflows/on-main-push.yaml
vendored
@@ -29,10 +29,31 @@ jobs:
|
||||
username: ${{ secrets.DOCKERHUB_USERNAME_WEAVEWORKSKUREDCI }}
|
||||
password: ${{ secrets.DOCKERHUB_TOKEN_WEAVEWORKSKUREDCI }}
|
||||
|
||||
- name: Build image
|
||||
run: |
|
||||
make DH_ORG="${{ github.repository_owner }}" image
|
||||
- name: Login to ghcr.io
|
||||
uses: docker/login-action@v1
|
||||
with:
|
||||
registry: ghcr.io
|
||||
username: weave-ghcr-bot
|
||||
password: ${{ secrets.KURED_WEAVE_GHCR_BOT_TOKEN }}
|
||||
|
||||
- name: Publish image
|
||||
run: |
|
||||
make DH_ORG="${{ github.repository_owner }}" publish-image
|
||||
- name: Set up QEMU
|
||||
uses: docker/setup-qemu-action@v1
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
id: buildx
|
||||
uses: docker/setup-buildx-action@v1
|
||||
|
||||
- name: Find current tag version
|
||||
run: echo "::set-output name=sha_short::$(git rev-parse --short HEAD)"
|
||||
id: tags
|
||||
|
||||
- name: Build image
|
||||
uses: docker/build-push-action@v2
|
||||
with:
|
||||
context: .
|
||||
file: cmd/kured/Dockerfile.multi
|
||||
platforms: linux/arm64, linux/amd64
|
||||
push: true
|
||||
tags: |
|
||||
docker.io/${{ GITHUB.REPOSITORY }}:main-${{ steps.tags.outputs.sha_short }}
|
||||
ghcr.io/${{ GITHUB.REPOSITORY }}:main-${{ steps.tags.outputs.sha_short }}
|
||||
|
||||
8
.github/workflows/on-pr-charts.yaml
vendored
8
.github/workflows/on-pr-charts.yaml
vendored
@@ -32,10 +32,10 @@ jobs:
|
||||
|
||||
# Helm is already present in github actions, so do not re-install it
|
||||
- name: Setup chart testing
|
||||
uses: helm/chart-testing-action@v2.0.1
|
||||
uses: helm/chart-testing-action@v2.2.0
|
||||
|
||||
- name: Create default kind cluster
|
||||
uses: helm/kind-action@v1.1.0
|
||||
uses: helm/kind-action@v1.2.0
|
||||
with:
|
||||
version: v0.11.0
|
||||
if: ${{ matrix.test-action == 'install' }}
|
||||
@@ -53,7 +53,7 @@ jobs:
|
||||
|
||||
# Default name for helm/kind-action kind clusters is "chart-testing"
|
||||
- name: Create 1 node kind cluster
|
||||
uses: helm/kind-action@v1.1.0
|
||||
uses: helm/kind-action@v1.2.0
|
||||
with:
|
||||
version: v0.11.0
|
||||
|
||||
@@ -69,7 +69,7 @@ jobs:
|
||||
kubectl describe ds kured
|
||||
|
||||
- name: Test if successful deploy
|
||||
uses: nick-invision/retry@v2.4.0
|
||||
uses: nick-invision/retry@v2.6.0
|
||||
with:
|
||||
timeout_minutes: 10
|
||||
max_attempts: 10
|
||||
|
||||
29
.github/workflows/on-pr.yaml
vendored
29
.github/workflows/on-pr.yaml
vendored
@@ -10,11 +10,20 @@ jobs:
|
||||
steps:
|
||||
- name: checkout
|
||||
uses: actions/checkout@v2
|
||||
- name: Find go version
|
||||
run: |
|
||||
GO_VERSION=$(awk '/^go/ {print $2};' go.mod)
|
||||
echo "::set-output name=version::${GO_VERSION}"
|
||||
id: awk_gomod
|
||||
- name: Ensure go version
|
||||
uses: actions/setup-go@v2
|
||||
with:
|
||||
go-version: "${{ steps.awk_gomod.outputs.version }}"
|
||||
- name: run tests
|
||||
run: go test -json ./... > test.json
|
||||
- name: Annotate tests
|
||||
if: always()
|
||||
uses: guyarb/golang-test-annoations@v0.4.0
|
||||
uses: guyarb/golang-test-annoations@v0.5.0
|
||||
with:
|
||||
test-results: test.json
|
||||
|
||||
@@ -97,9 +106,9 @@ jobs:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
kubernetes:
|
||||
- "1.19"
|
||||
- "1.20"
|
||||
- "1.21"
|
||||
- "1.22"
|
||||
- "1.23"
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
- name: Find go version
|
||||
@@ -127,7 +136,7 @@ jobs:
|
||||
|
||||
# Default name for helm/kind-action kind clusters is "chart-testing"
|
||||
- name: Create kind cluster with 5 nodes
|
||||
uses: helm/kind-action@v1.1.0
|
||||
uses: helm/kind-action@v1.2.0
|
||||
with:
|
||||
config: .github/kind-cluster-${{ matrix.kubernetes }}.yaml
|
||||
version: v0.11.0
|
||||
@@ -144,7 +153,7 @@ jobs:
|
||||
kubectl apply -f kured-rbac.yaml && kubectl apply -f kured-ds.yaml
|
||||
|
||||
- name: Ensure kured is ready
|
||||
uses: nick-invision/retry@v2.4.0
|
||||
uses: nick-invision/retry@v2.6.0
|
||||
with:
|
||||
timeout_minutes: 10
|
||||
max_attempts: 10
|
||||
@@ -198,7 +207,7 @@ jobs:
|
||||
|
||||
# Default name for helm/kind-action kind clusters is "chart-testing"
|
||||
- name: Create 1 node kind cluster
|
||||
uses: helm/kind-action@v1.1.0
|
||||
uses: helm/kind-action@v1.2.0
|
||||
with:
|
||||
version: v0.11.0
|
||||
|
||||
@@ -217,7 +226,7 @@ jobs:
|
||||
kubectl describe ds kured
|
||||
|
||||
- name: Ensure kured is ready
|
||||
uses: nick-invision/retry@v2.4.0
|
||||
uses: nick-invision/retry@v2.6.0
|
||||
with:
|
||||
timeout_minutes: 10
|
||||
max_attempts: 10
|
||||
@@ -226,7 +235,7 @@ jobs:
|
||||
command: "kubectl get ds kured | grep -E 'kured.*1.*1.*1.*1.*1' "
|
||||
|
||||
- name: Get metrics (healthy)
|
||||
uses: nick-invision/retry@v2.4.0
|
||||
uses: nick-invision/retry@v2.6.0
|
||||
with:
|
||||
timeout_minutes: 2
|
||||
max_attempts: 12
|
||||
@@ -238,7 +247,7 @@ jobs:
|
||||
./tests/kind/create-reboot-sentinels.sh
|
||||
|
||||
- name: Get metrics (need reboot)
|
||||
uses: nick-invision/retry@v2.4.0
|
||||
uses: nick-invision/retry@v2.6.0
|
||||
with:
|
||||
timeout_minutes: 15
|
||||
max_attempts: 10
|
||||
@@ -308,7 +317,7 @@ jobs:
|
||||
# kubectl describe ds kured
|
||||
#
|
||||
# - name: Ensure kured is ready
|
||||
# uses: nick-invision/retry@v2.4.0
|
||||
# uses: nick-invision/retry@v2.6.0
|
||||
# with:
|
||||
# timeout_minutes: 10
|
||||
# max_attempts: 10
|
||||
|
||||
27
.github/workflows/on-tag.yaml
vendored
27
.github/workflows/on-tag.yaml
vendored
@@ -37,6 +37,27 @@ jobs:
|
||||
username: ${{ secrets.DOCKERHUB_USERNAME_WEAVEWORKSKUREDCI }}
|
||||
password: ${{ secrets.DOCKERHUB_TOKEN_WEAVEWORKSKUREDCI }}
|
||||
|
||||
- name: Publish image
|
||||
run: |
|
||||
make DH_ORG="${{ github.repository_owner }}" VERSION="${{ steps.tags.outputs.version }}" publish-image
|
||||
- name: Login to ghcr.io
|
||||
uses: docker/login-action@v1
|
||||
with:
|
||||
registry: ghcr.io
|
||||
username: weave-ghcr-bot
|
||||
password: ${{ secrets.KURED_WEAVE_GHCR_BOT_TOKEN }}
|
||||
|
||||
- name: Set up QEMU
|
||||
uses: docker/setup-qemu-action@v1
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
id: buildx
|
||||
uses: docker/setup-buildx-action@v1
|
||||
|
||||
- name: Build image
|
||||
uses: docker/build-push-action@v2
|
||||
with:
|
||||
context: .
|
||||
file: cmd/kured/Dockerfile.multi
|
||||
platforms: linux/arm64, linux/amd64, linux/arm/v7, linux/arm/v6, linux/386
|
||||
push: true
|
||||
tags: |
|
||||
docker.io/${{ GITHUB.REPOSITORY }}:${{ steps.tags.outputs.version }}
|
||||
ghcr.io/${{ GITHUB.REPOSITORY }}:${{ steps.tags.outputs.version }}
|
||||
|
||||
14
.github/workflows/periodics-daily.yaml
vendored
14
.github/workflows/periodics-daily.yaml
vendored
@@ -15,7 +15,7 @@ jobs:
|
||||
run: go test -json ./... > test.json
|
||||
- name: Annotate tests
|
||||
if: always()
|
||||
uses: guyarb/golang-test-annoations@v0.4.0
|
||||
uses: guyarb/golang-test-annoations@v0.5.0
|
||||
with:
|
||||
test-results: test.json
|
||||
|
||||
@@ -25,14 +25,14 @@ jobs:
|
||||
steps:
|
||||
# Stale by default waits for 60 days before marking PR/issues as stale, and closes them after 21 days.
|
||||
# Do not expire the first issues that would allow the community to grow.
|
||||
- uses: actions/stale@v3.0.19
|
||||
- uses: actions/stale@v4
|
||||
with:
|
||||
repo-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
stale-issue-message: 'This issue was automatically considered stale due to lack of activity. Please update it and/or join our slack channels to promote it, before it automatically closes (in 7 days).'
|
||||
stale-pr-message: 'This PR was automatically considered stale due to lack of activity. Please refresh it and/or join our slack channels to highlight it, before it automatically closes (in 7 days).'
|
||||
stale-issue-label: 'no-issue-activity'
|
||||
stale-pr-label: 'no-pr-activity'
|
||||
exempt-issue-labels: 'good-first-issue'
|
||||
exempt-issue-labels: 'good first issue,keep'
|
||||
days-before-close: 21
|
||||
|
||||
check-docs-links:
|
||||
@@ -74,9 +74,9 @@ jobs:
|
||||
strategy:
|
||||
matrix:
|
||||
kubernetes:
|
||||
- "1.19"
|
||||
- "1.20"
|
||||
- "1.21"
|
||||
- "1.22"
|
||||
- "1.23"
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
- name: Find go version
|
||||
@@ -100,7 +100,7 @@ jobs:
|
||||
|
||||
# Default name for helm/kind-action kind clusters is "chart-testing"
|
||||
- name: Create 5 node kind cluster
|
||||
uses: helm/kind-action@v1.1.0
|
||||
uses: helm/kind-action@v1.2.0
|
||||
with:
|
||||
config: .github/kind-cluster-${{ matrix.kubernetes }}.yaml
|
||||
version: v0.11.0
|
||||
@@ -117,7 +117,7 @@ jobs:
|
||||
kubectl describe ds kured
|
||||
|
||||
- name: Ensure kured is ready
|
||||
uses: nick-invision/retry@v2.4.0
|
||||
uses: nick-invision/retry@v2.6.0
|
||||
with:
|
||||
timeout_minutes: 10
|
||||
max_attempts: 10
|
||||
|
||||
@@ -187,19 +187,12 @@ Check that `README.md` has an updated compatibility matrix and that the
|
||||
url in the `kubectl` incantation (under "Installation") is updated to the
|
||||
new version you want to release.
|
||||
|
||||
### Create a tag on the repo and publish the image
|
||||
### Create a tag on the repo
|
||||
|
||||
Before going further, we should freeze the code for a release, by
|
||||
tagging the code, and publishing its immutable artifact: the kured
|
||||
docker image.
|
||||
tagging the code. The GitHub Action should start a new job and push
|
||||
the new image to the registry.
|
||||
|
||||
```sh
|
||||
make DH_ORG="weaveworks" VERSION="1.3.0" image
|
||||
```
|
||||
|
||||
Then docker push the image. In the future, that might be automatically
|
||||
done when creating a tag on the repository, with the help of github
|
||||
actions.
|
||||
|
||||
### Create the combined manifest
|
||||
|
||||
@@ -237,3 +230,6 @@ A change in the helm chart requires a bump of the `version`
|
||||
in `charts/kured/Chart.yaml` (following the versioning rules).
|
||||
Update it, and issue a PR. Upon merge, that PR will automatically
|
||||
publish the chart to the gh-pages branch.
|
||||
|
||||
When there are open helm-chart PRs which are on hold until the helm-chart has been updated
|
||||
with the new kured version, they can be merged now (unless a rebase is needed from the contributor).
|
||||
|
||||
@@ -2,3 +2,4 @@ Christian Kotzbauer <christian.kotzbauer@gmail.com> (@ckotzbauer)
|
||||
Daniel Holbach <daniel@weave.works> (@dholbach)
|
||||
Hidde Beydals <hidde@weave.works> (@hiddeco)
|
||||
Jean-Philippe Evrard <jean-philippe.evrard@suse.com> (@evrardjp)
|
||||
Jack Francis <jackfrancis@gmail.com> (@jackfrancis)
|
||||
|
||||
2
Makefile
2
Makefile
@@ -24,12 +24,14 @@ build/.image.done: cmd/kured/Dockerfile cmd/kured/kured
|
||||
cp $^ build
|
||||
$(SUDO) docker build -t docker.io/$(DH_ORG)/kured -f build/Dockerfile ./build
|
||||
$(SUDO) docker tag docker.io/$(DH_ORG)/kured docker.io/$(DH_ORG)/kured:$(VERSION)
|
||||
$(SUDO) docker tag docker.io/$(DH_ORG)/kured ghcr.io/$(DH_ORG)/kured:$(VERSION)
|
||||
touch $@
|
||||
|
||||
image: build/.image.done
|
||||
|
||||
publish-image: image
|
||||
$(SUDO) docker push docker.io/$(DH_ORG)/kured:$(VERSION)
|
||||
$(SUDO) docker push ghcr.io/$(DH_ORG)/kured:$(VERSION)
|
||||
|
||||
minikube-publish: image
|
||||
$(SUDO) docker save docker.io/$(DH_ORG)/kured | (eval $$(minikube docker-env) && docker load)
|
||||
|
||||
15
README.md
15
README.md
@@ -48,7 +48,9 @@ server:
|
||||
|
||||
| kured | kubectl | k8s.io/client-go | k8s.io/apimachinery | expected kubernetes compatibility |
|
||||
|-------|---------|------------------|---------------------|-----------------------------------|
|
||||
| main | 1.20.5 | v0.20.5 | v0.20.5 | 1.19.x, 1.20.x, 1.21.x |
|
||||
| main | 1.22.4 | v0.22.4 | v0.22.4 | 1.21.x, 1.22.x, 1.23.x |
|
||||
| 1.9.1 | 1.22.4 | v0.22.4 | v0.22.4 | 1.21.x, 1.22.x, 1.23.x |
|
||||
| 1.8.1 | 1.21.4 | v0.21.4 | v0.21.4 | 1.20.x, 1.21.x, 1.22.x |
|
||||
| 1.7.0 | 1.20.5 | v0.20.5 | v0.20.5 | 1.19.x, 1.20.x, 1.21.x |
|
||||
| 1.6.1 | 1.19.4 | v0.19.4 | v0.19.4 | 1.18.x, 1.19.x, 1.20.x |
|
||||
| 1.5.1 | 1.18.8 | v0.18.8 | v0.18.8 | 1.17.x, 1.18.x, 1.19.x |
|
||||
@@ -85,6 +87,7 @@ The following arguments can be passed to kured via the daemonset pod template:
|
||||
```console
|
||||
Flags:
|
||||
--alert-filter-regexp regexp.Regexp alert names to ignore when checking for active alerts
|
||||
--alert-firing-only bool only consider firing alerts when checking for active alerts
|
||||
--blocking-pod-selector stringArray label selector identifying pods whose presence should prevent reboots
|
||||
--drain-grace-period int time in seconds given to each pod to terminate gracefully, if negative, the default value specified in the pod will be used (default: -1)
|
||||
--skip-wait-for-delete-timeout int when seconds is greater than zero, skip waiting for the pods whose deletion timestamp is older than N seconds while draining a node (default: 0)
|
||||
@@ -95,6 +98,7 @@ Flags:
|
||||
--drain-timeout duration timeout after which the drain is aborted (default: 0, infinite time)
|
||||
-h, --help help for kured
|
||||
--lock-annotation string annotation in which to record locking node (default "weave.works/kured-node-lock")
|
||||
--lock-release-delay duration hold lock after reboot by this duration (default: 0, disabled)
|
||||
--lock-ttl duration expire lock annotation after this duration (default: 0, disabled)
|
||||
--message-template-drain string message template used to notify about a node being drained (default "Draining node %s")
|
||||
--message-template-reboot string message template used to notify about a node being rebooted (default "Rebooting node %s")
|
||||
@@ -104,6 +108,7 @@ Flags:
|
||||
--prometheus-url string Prometheus instance to probe for active alerts
|
||||
--reboot-command string command to run when a reboot is required by the sentinel (default "/sbin/systemctl reboot")
|
||||
--reboot-days strings schedule reboot on these days (default [su,mo,tu,we,th,fr,sa])
|
||||
--reboot-delay duration add a delay after drain finishes but before the reboot command is issued (default 0, no time)
|
||||
--reboot-sentinel string path to file whose existence signals need to reboot (default "/var/run/reboot-required")
|
||||
--reboot-sentinel-command string command for which a successful run signals need to reboot (default ""). If non-empty, sentinel file will be ignored.
|
||||
--slack-channel string slack channel for reboot notfications
|
||||
@@ -111,6 +116,7 @@ Flags:
|
||||
--slack-username string slack username for reboot notfications (default "kured")
|
||||
--start-time string schedule reboot only after this time of day (default "0:00")
|
||||
--time-zone string use this timezone for schedule inputs (default "UTC")
|
||||
--log-format string log format specified as text or json, defaults to "text"
|
||||
```
|
||||
|
||||
### Reboot Sentinel File & Period
|
||||
@@ -165,6 +171,11 @@ will block reboots, however you can ignore specific alerts:
|
||||
--alert-filter-regexp=^(RebootRequired|AnotherBenignAlert|...$
|
||||
```
|
||||
|
||||
You can also only block reboots for firing alerts:
|
||||
```console
|
||||
--alert-firing-only=true
|
||||
```
|
||||
|
||||
See the section on Prometheus metrics for an important application of this
|
||||
filter.
|
||||
|
||||
@@ -253,7 +264,7 @@ Here is the syntax:
|
||||
|
||||
- Email: `smtp://username:password@host:port/?fromAddress=fromAddress&toAddresses=recipient1[,recipient2,...]`
|
||||
|
||||
More details here: <https://github.com/containrrr/shoutrrr/blob/main/docs/services/overview.md>
|
||||
More details here: [containrrr.dev/shoutrrr/v0.4/services/overview](https://containrrr.dev/shoutrrr/v0.4/services/overview)
|
||||
|
||||
### Overriding Lock Configuration
|
||||
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
apiVersion: v1
|
||||
appVersion: "1.6.1"
|
||||
appVersion: "1.9.1"
|
||||
description: A Helm chart for kured
|
||||
name: kured
|
||||
version: 2.4.3
|
||||
version: 2.11.2
|
||||
home: https://github.com/weaveworks/kured
|
||||
maintainers:
|
||||
- name: ckotzbauer
|
||||
|
||||
@@ -36,31 +36,44 @@ The following changes have been made compared to the stable chart:
|
||||
| Config | Description | Default |
|
||||
| ------ | ----------- | ------- |
|
||||
| `image.repository` | Image repository | `weaveworks/kured` |
|
||||
| `image.tag` | Image tag | `1.6.1` |
|
||||
| `image.tag` | Image tag | `1.9.1` |
|
||||
| `image.pullPolicy` | Image pull policy | `IfNotPresent` |
|
||||
| `image.pullSecrets` | Image pull secrets | `[]` |
|
||||
| `updateStrategy` | Daemonset update strategy | `OnDelete` |
|
||||
| `updateStrategy` | Daemonset update strategy | `RollingUpdate` |
|
||||
| `maxUnavailable` | The max pods unavailable during a rolling update | `1` |
|
||||
| `podAnnotations` | Annotations to apply to pods (eg to add Prometheus annotations) | `{}` |
|
||||
| `dsAnnotations` | Annotations to apply to the kured DaemonSet | `{}` |
|
||||
| `extraArgs` | Extra arguments to pass to `/usr/bin/kured`. See below. | `{}` |
|
||||
| `extraEnvVars` | Array of environment variables to pass to the daemonset. | `{}` |
|
||||
| `configuration.lockTtl` | cli-parameter `--lock-ttl` | `0` |
|
||||
| `configuration.lockReleaseDelay` | cli-parameter `--lock-release-delay` | `0` |
|
||||
| `configuration.alertFilterRegexp` | cli-parameter `--alert-filter-regexp` | `""` |
|
||||
| `configuration.alertFiringOnly` | cli-parameter `--alert-firing-only` | `false` |
|
||||
| `configuration.blockingPodSelector` | Array of selectors for multiple cli-parameters `--blocking-pod-selector` | `[]` |
|
||||
| `configuration.endTime` | cli-parameter `--end-time` | `""` |
|
||||
| `configuration.lockAnnotation` | cli-parameter `--lock-annotation` | `""` |
|
||||
| `configuration.period` | cli-parameter `--period` | `""` |
|
||||
| `configuration.forceReboot` | cli-parameter `--force-reboot` | `false` |
|
||||
| `configuration.drainGracePeriod` | cli-parameter `--drain-grace-period` | `""` |
|
||||
| `configuration.drainTimeout` | cli-parameter `--drain-timeout` | `""` |
|
||||
| `configuration.skipWaitForDeleteTimeout` | cli-parameter `--skip-wait-for-delete-timeout` | `""` |
|
||||
| `configuration.prometheusUrl` | cli-parameter `--prometheus-url` | `""` |
|
||||
| `configuration.rebootDays` | Array of days for multiple cli-parameters `--reboot-days` | `[]` |
|
||||
| `configuration.rebootSentinel` | cli-parameter `--reboot-sentinel` | `""` |
|
||||
| `configuration.rebootSentinelCommand` | cli-parameter `--reboot-sentinel-command` | `""` |
|
||||
| `configuration.rebootCommand` | cli-parameter `--reboot-command` | `""` |
|
||||
| `configuration.rebootDelay` | cli-parameter `--reboot-delay` | `""` |
|
||||
| `configuration.slackChannel` | cli-parameter `--slack-channel` | `""` |
|
||||
| `configuration.slackHookUrl` | cli-parameter `--slack-hook-url` | `""` |
|
||||
| `configuration.slackUsername` | cli-parameter `--slack-username` | `""` |
|
||||
| `configuration.notifyUrl` | cli-parameter `--notify-url` | `""` |
|
||||
| `configuration.messageTemplateDrain` | cli-parameter `--message-template-drain` | `""` |
|
||||
| `configuration.messageTemplateReboot` | cli-parameter `--message-template-reboot` | `""` |
|
||||
| `configuration.startTime` | cli-parameter `--start-time` | `""` |
|
||||
| `configuration.timeZone` | cli-parameter `--time-zone` | `""` |
|
||||
| `configuration.annotateNodes` | cli-parameter `--annotate-nodes` | `false` |
|
||||
| `configuration.logFormat` | cli-parameter `--log-format` | `"text"` |
|
||||
| `configuration.preferNoScheduleTaint` | Taint name applied during pending node reboot | `""` |
|
||||
| `rbac.create` | Create RBAC roles | `true` |
|
||||
| `serviceAccount.create` | Create a service account | `true` |
|
||||
| `serviceAccount.name` | Service account name to create (or use if `serviceAccount.create` is false) | (chart fullname) |
|
||||
@@ -80,7 +93,8 @@ The following changes have been made compared to the stable chart:
|
||||
| `tolerations` | Tolerations to apply to the daemonset (eg to allow running on master) | `[{"key": "node-role.kubernetes.io/master", "effect": "NoSchedule"}]`|
|
||||
| `affinity` | Affinity for the daemonset (ie, restrict which nodes kured runs on) | `{}` |
|
||||
| `nodeSelector` | Node Selector for the daemonset (ie, restrict which nodes kured runs on) | `{}` |
|
||||
|
||||
| `volumeMounts` | Maps of volume mounts to mount | `{}` |
|
||||
| `volumes` | Maps of volumes to mount | `{}` |
|
||||
See https://github.com/weaveworks/kured#configuration for values (not contained in the `configuration` object) for `extraArgs`. Note that
|
||||
```yaml
|
||||
extraArgs:
|
||||
|
||||
@@ -5,6 +5,12 @@ metadata:
|
||||
namespace: {{ .Release.Namespace }}
|
||||
labels:
|
||||
{{- include "kured.labels" . | nindent 4 }}
|
||||
{{- if .Values.dsAnnotations }}
|
||||
annotations:
|
||||
{{- range $key, $value := .Values.dsAnnotations }}
|
||||
{{ $key }}: {{ $value | quote }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
spec:
|
||||
updateStrategy:
|
||||
type: {{ .Values.updateStrategy }}
|
||||
@@ -55,9 +61,15 @@ spec:
|
||||
{{- if .Values.configuration.lockTtl }}
|
||||
- --lock-ttl={{ .Values.configuration.lockTtl }}
|
||||
{{- end }}
|
||||
{{- if .Values.configuration.lockReleaseDelay }}
|
||||
- --lock-release-delay={{ .Values.configuration.lockReleaseDelay }}
|
||||
{{- end }}
|
||||
{{- if .Values.configuration.alertFilterRegexp }}
|
||||
- --alert-filter-regexp={{ .Values.configuration.alertFilterRegexp }}
|
||||
{{- end }}
|
||||
{{- if .Values.configuration.alertFiringOnly }}
|
||||
- --alert-firing-only={{ .Values.configuration.alertFiringOnly }}
|
||||
{{- end }}
|
||||
{{- range .Values.configuration.blockingPodSelector }}
|
||||
- --blocking-pod-selector={{ . }}
|
||||
{{- end }}
|
||||
@@ -70,6 +82,18 @@ spec:
|
||||
{{- if .Values.configuration.period }}
|
||||
- --period={{ .Values.configuration.period }}
|
||||
{{- end }}
|
||||
{{- if .Values.configuration.forceReboot }}
|
||||
- --force-reboot
|
||||
{{- end }}
|
||||
{{- if .Values.configuration.drainGracePeriod }}
|
||||
- --drain-grace-period={{ .Values.configuration.drainGracePeriod }}
|
||||
{{- end }}
|
||||
{{- if .Values.configuration.drainTimeout }}
|
||||
- --drain-timeout={{ .Values.configuration.drainTimeout }}
|
||||
{{- end }}
|
||||
{{- if .Values.configuration.skipWaitForDeleteTimeout }}
|
||||
- --skip-wait-for-delete-timeout={{ .Values.configuration.skipWaitForDeleteTimeout }}
|
||||
{{- end }}
|
||||
{{- if .Values.configuration.prometheusUrl }}
|
||||
- --prometheus-url={{ .Values.configuration.prometheusUrl }}
|
||||
{{- end }}
|
||||
@@ -79,6 +103,15 @@ spec:
|
||||
{{- if .Values.configuration.rebootSentinel }}
|
||||
- --reboot-sentinel={{ .Values.configuration.rebootSentinel }}
|
||||
{{- end }}
|
||||
{{- if .Values.configuration.rebootSentinelCommand }}
|
||||
- --reboot-sentinel-command={{ .Values.configuration.rebootSentinelCommand }}
|
||||
{{- end }}
|
||||
{{- if .Values.configuration.rebootCommand }}
|
||||
- --reboot-command={{ .Values.configuration.rebootCommand }}
|
||||
{{- end }}
|
||||
{{- if .Values.configuration.rebootDelay }}
|
||||
- --reboot-delay={{ .Values.configuration.rebootDelay }}
|
||||
{{- end }}
|
||||
{{- if .Values.configuration.slackChannel }}
|
||||
- --slack-channel={{ .Values.configuration.slackChannel }}
|
||||
{{- end }}
|
||||
@@ -88,6 +121,9 @@ spec:
|
||||
{{- if .Values.configuration.slackUsername }}
|
||||
- --slack-username={{ .Values.configuration.slackUsername }}
|
||||
{{- end }}
|
||||
{{- if .Values.configuration.notifyUrl }}
|
||||
- --notify-url={{ .Values.configuration.notifyUrl }}
|
||||
{{- end }}
|
||||
{{- if .Values.configuration.messageTemplateDrain }}
|
||||
- --message-template-drain={{ .Values.configuration.messageTemplateDrain }}
|
||||
{{- end }}
|
||||
@@ -103,6 +139,12 @@ spec:
|
||||
{{- if .Values.configuration.annotateNodes }}
|
||||
- --annotate-nodes={{ .Values.configuration.annotateNodes }}
|
||||
{{- end }}
|
||||
{{- if .Values.configuration.preferNoScheduleTaint }}
|
||||
- --prefer-no-schedule-taint={{ .Values.configuration.preferNoScheduleTaint }}
|
||||
{{- end }}
|
||||
{{- if .Values.configuration.logFormat }}
|
||||
- --log-format={{ .Values.configuration.logFormat }}
|
||||
{{- end }}
|
||||
{{- range $key, $value := .Values.extraArgs }}
|
||||
{{- if $value }}
|
||||
- --{{ $key }}={{ $value }}
|
||||
@@ -110,6 +152,10 @@ spec:
|
||||
- --{{ $key }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if .Values.volumeMounts }}
|
||||
volumeMounts:
|
||||
{{- toYaml .Values.volumeMounts | nindent 12 }}
|
||||
{{- end }}
|
||||
ports:
|
||||
- containerPort: 8080
|
||||
name: metrics
|
||||
@@ -135,3 +181,7 @@ spec:
|
||||
affinity:
|
||||
{{ toYaml . | indent 8 }}
|
||||
{{- end }}
|
||||
{{- if .Values.volumes }}
|
||||
volumes:
|
||||
{{- toYaml .Values.volumes | nindent 8 }}
|
||||
{{- end }}
|
||||
|
||||
@@ -3,20 +3,29 @@ image:
|
||||
tag: latest
|
||||
|
||||
configuration:
|
||||
# annotationTtl: 0 # force clean annotation after this ammount of time (default 0, disabled)
|
||||
# annotationTtl: 0 # force clean annotation after this amount of time (default 0, disabled)
|
||||
# alertFilterRegexp: "" # alert names to ignore when checking for active alerts
|
||||
# alertFiringOnly: false # only consider firing alerts when checking for active alerts
|
||||
# blockingPodSelector: [] # label selector identifying pods whose presence should prevent reboots
|
||||
# endTime: "" # only reboot before this time of day (default "23:59")
|
||||
# lockAnnotation: "" # annotation in which to record locking node (default "weave.works/kured-node-lock")
|
||||
period: "1m" # reboot check period (default 1h0m0s)
|
||||
# forceReboot: false # force a reboot even if the drain fails or times out (default: false)
|
||||
# drainGracePeriod: "" # time in seconds given to each pod to terminate gracefully, if negative, the default value specified in the pod will be used (default: -1)
|
||||
# drainTimeout: "" # timeout after which the drain is aborted (default: 0, infinite time)
|
||||
# skipWaitForDeleteTimeout: "" # when time is greater than zero, skip waiting for the pods whose deletion timestamp is older than N seconds while draining a node (default: 0)
|
||||
# prometheusUrl: "" # Prometheus instance to probe for active alerts
|
||||
# rebootDays: [] # only reboot on these days (default [su,mo,tu,we,th,fr,sa])
|
||||
# rebootSentinel: "" # path to file whose existence signals need to reboot (default "/var/run/reboot-required")
|
||||
# rebootSentinelCommand: "" # command for which a successful run signals need to reboot (default ""). If non-empty, sentinel file will be ignored.
|
||||
# slackChannel: "" # slack channel for reboot notifications
|
||||
# slackHookUrl: "" # slack hook URL for reboot notifications
|
||||
# slackUsername: "" # slack username for reboot notifications (default "kured")
|
||||
# notifyUrl: "" # notification URL with the syntax as follows: https://containrrr.dev/shoutrrr/services/overview/
|
||||
# messageTemplateDrain: "" # slack message template when notifying about a node being drained (default "Draining node %s")
|
||||
# messageTemplateReboot: "" # slack message template when notifying about a node being rebooted (default "Rebooted node %s")
|
||||
# startTime: "" # only reboot after this time of day (default "0:00")
|
||||
# timeZone: "" # time-zone to use (valid zones from "time" golang package)
|
||||
# annotateNodes: false # enable 'weave.works/kured-reboot-in-progress' and 'weave.works/kured-most-recent-reboot-needed' node annotations to signify kured reboot operations
|
||||
# annotateNodes: false # enable 'weave.works/kured-reboot-in-progress' and 'weave.works/kured-most-recent-reboot-needed' node annotations to signify kured reboot operations
|
||||
# lockReleaseDelay: "5m" # hold lock after reboot by this amount of time (default 0, disabled)
|
||||
# logFormat: "text" # log format specified as text or json, defaults to text
|
||||
|
||||
@@ -4,11 +4,12 @@ image:
|
||||
pullPolicy: IfNotPresent
|
||||
pullSecrets: []
|
||||
|
||||
updateStrategy: OnDelete
|
||||
updateStrategy: RollingUpdate
|
||||
# requires RollingUpdate updateStrategy
|
||||
maxUnavailable: 1
|
||||
|
||||
podAnnotations: {}
|
||||
dsAnnotations: {}
|
||||
|
||||
extraArgs: {}
|
||||
|
||||
@@ -22,23 +23,35 @@ extraEnvVars:
|
||||
# value: 123
|
||||
|
||||
configuration:
|
||||
lockTtl: 0 # force clean annotation after this ammount of time (default 0, disabled)
|
||||
lockTtl: 0 # force clean annotation after this amount of time (default 0, disabled)
|
||||
alertFilterRegexp: "" # alert names to ignore when checking for active alerts
|
||||
alertFiringOnly: false # only consider firing alerts when checking for active alerts
|
||||
blockingPodSelector: [] # label selector identifying pods whose presence should prevent reboots
|
||||
endTime: "" # only reboot before this time of day (default "23:59")
|
||||
lockAnnotation: "" # annotation in which to record locking node (default "weave.works/kured-node-lock")
|
||||
period: "" # reboot check period (default 1h0m0s)
|
||||
forceReboot: false # force a reboot even if the drain fails or times out (default: false)
|
||||
drainGracePeriod: "" # time in seconds given to each pod to terminate gracefully, if negative, the default value specified in the pod will be used (default: -1)
|
||||
drainTimeout: "" # timeout after which the drain is aborted (default: 0, infinite time)
|
||||
skipWaitForDeleteTimeout: "" # when time is greater than zero, skip waiting for the pods whose deletion timestamp is older than N seconds while draining a node (default: 0)
|
||||
prometheusUrl: "" # Prometheus instance to probe for active alerts
|
||||
rebootDays: [] # only reboot on these days (default [su,mo,tu,we,th,fr,sa])
|
||||
rebootSentinel: "" # path to file whose existence signals need to reboot (default "/var/run/reboot-required")
|
||||
rebootSentinelCommand: "" # command for which a successful run signals need to reboot (default ""). If non-empty, sentinel file will be ignored.
|
||||
rebootCommand: "/bin/systemctl reboot" # command to run when a reboot is required by the sentinel
|
||||
rebootDelay: "" # add a delay after drain finishes but before the reboot command is issued
|
||||
slackChannel: "" # slack channel for reboot notifications
|
||||
slackHookUrl: "" # slack hook URL for reboot notifications
|
||||
slackUsername: "" # slack username for reboot notifications (default "kured")
|
||||
notifyUrl: "" # notification URL with the syntax as follows: https://containrrr.dev/shoutrrr/services/overview/
|
||||
messageTemplateDrain: "" # slack message template when notifying about a node being drained (default "Draining node %s")
|
||||
messageTemplateReboot: "" # slack message template when notifying about a node being rebooted (default "Rebooted node %s")
|
||||
startTime: "" # only reboot after this time of day (default "0:00")
|
||||
timeZone: "" # time-zone to use (valid zones from "time" golang package)
|
||||
annotateNodes: false # enable 'weave.works/kured-reboot-in-progress' and 'weave.works/kured-most-recent-reboot-needed' node annotations to signify kured reboot operations
|
||||
annotateNodes: false # enable 'weave.works/kured-reboot-in-progress' and 'weave.works/kured-most-recent-reboot-needed' node annotations to signify kured reboot operations
|
||||
lockReleaseDelay: 0 # hold lock after reboot by this amount of time (default 0, disabled)
|
||||
preferNoScheduleTaint: "" # Taint name applied during pending node reboot (to prevent receiving additional pods from other rebooting nodes). Disabled by default. Set e.g. to "weave.works/kured-node-reboot" to enable tainting.
|
||||
logFormat: "text" # log format specified as text or json, defaults to text
|
||||
|
||||
rbac:
|
||||
create: true
|
||||
@@ -77,3 +90,7 @@ tolerations:
|
||||
affinity: {}
|
||||
|
||||
nodeSelector: {}
|
||||
|
||||
volumeMounts: []
|
||||
|
||||
volumes: []
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
FROM alpine:3.13
|
||||
FROM alpine:3.15.0
|
||||
RUN apk update --no-cache && apk upgrade --no-cache && apk add --no-cache ca-certificates tzdata
|
||||
COPY ./kured /usr/bin/kured
|
||||
ENTRYPOINT ["/usr/bin/kured"]
|
||||
|
||||
19
cmd/kured/Dockerfile.multi
Normal file
19
cmd/kured/Dockerfile.multi
Normal file
@@ -0,0 +1,19 @@
|
||||
FROM --platform=$BUILDPLATFORM golang:bullseye AS build
|
||||
|
||||
ARG TARGETOS
|
||||
ARG TARGETARCH
|
||||
ARG TARGETVARIANT
|
||||
|
||||
ENV GOOS=$TARGETOS
|
||||
ENV GOARCH=$TARGETARCH
|
||||
ENV GOVARIANT=$TARGETVARIANT
|
||||
|
||||
WORKDIR /src
|
||||
COPY . .
|
||||
RUN go list -f '{{join .Deps "\n"}}' ./cmd/kured | grep -v /vendor/ | xargs go list -f '{{if not .Standard}}{{ $dep := . }}{{range .GoFiles}}{{$dep.Dir}}/{{.}} {{end}}{{end}}'
|
||||
RUN CGO_ENABLED=0 go build -o cmd/kured/kured cmd/kured/*.go
|
||||
|
||||
FROM --platform=$TARGETPLATFORM alpine:3.15 as bin
|
||||
RUN apk update --no-cache && apk upgrade --no-cache && apk add --no-cache ca-certificates tzdata
|
||||
COPY --from=build /src/cmd/kured/kured /usr/bin/kured
|
||||
ENTRYPOINT ["/usr/bin/kured"]
|
||||
@@ -6,14 +6,18 @@ import (
|
||||
"fmt"
|
||||
"math/rand"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"os"
|
||||
"os/exec"
|
||||
"regexp"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
papi "github.com/prometheus/client_golang/api"
|
||||
log "github.com/sirupsen/logrus"
|
||||
"github.com/spf13/cobra"
|
||||
"github.com/spf13/pflag"
|
||||
"github.com/spf13/viper"
|
||||
v1 "k8s.io/api/core/v1"
|
||||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
||||
"k8s.io/apimachinery/pkg/types"
|
||||
@@ -29,7 +33,6 @@ import (
|
||||
"github.com/weaveworks/kured/pkg/alerts"
|
||||
"github.com/weaveworks/kured/pkg/daemonsetlock"
|
||||
"github.com/weaveworks/kured/pkg/delaytick"
|
||||
"github.com/weaveworks/kured/pkg/notifications/slack"
|
||||
"github.com/weaveworks/kured/pkg/taints"
|
||||
"github.com/weaveworks/kured/pkg/timewindow"
|
||||
)
|
||||
@@ -40,6 +43,7 @@ var (
|
||||
// Command line flags
|
||||
forceReboot bool
|
||||
drainTimeout time.Duration
|
||||
rebootDelay time.Duration
|
||||
period time.Duration
|
||||
drainGracePeriod int
|
||||
skipWaitForDeleteTimeoutSeconds int
|
||||
@@ -47,10 +51,11 @@ var (
|
||||
dsName string
|
||||
lockAnnotation string
|
||||
lockTTL time.Duration
|
||||
lockReleaseDelay time.Duration
|
||||
lockReleaseDelay time.Duration
|
||||
prometheusURL string
|
||||
preferNoScheduleTaintName string
|
||||
alertFilter *regexp.Regexp
|
||||
alertFiringOnly bool
|
||||
rebootSentinelFile string
|
||||
rebootSentinelCommand string
|
||||
notifyURL string
|
||||
@@ -61,6 +66,8 @@ var (
|
||||
messageTemplateReboot string
|
||||
podSelectors []string
|
||||
rebootCommand string
|
||||
logFormat string
|
||||
nodeID string
|
||||
|
||||
rebootDays []string
|
||||
rebootStart string
|
||||
@@ -83,6 +90,8 @@ const (
|
||||
KuredRebootInProgressAnnotation string = "weave.works/kured-reboot-in-progress"
|
||||
// KuredMostRecentRebootNeededAnnotation is the canonical string value for the kured most-recent-reboot-needed annotation
|
||||
KuredMostRecentRebootNeededAnnotation string = "weave.works/kured-most-recent-reboot-needed"
|
||||
// EnvPrefix The environment variable prefix of all environment variables bound to our command line flags.
|
||||
EnvPrefix = "KURED"
|
||||
)
|
||||
|
||||
func init() {
|
||||
@@ -90,20 +99,34 @@ func init() {
|
||||
}
|
||||
|
||||
func main() {
|
||||
rootCmd := &cobra.Command{
|
||||
Use: "kured",
|
||||
Short: "Kubernetes Reboot Daemon",
|
||||
PreRun: flagCheck,
|
||||
Run: root}
|
||||
cmd := NewRootCommand()
|
||||
|
||||
if err := cmd.Execute(); err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
// NewRootCommand constructs the Cobra root command
|
||||
func NewRootCommand() *cobra.Command {
|
||||
rootCmd := &cobra.Command{
|
||||
Use: "kured",
|
||||
Short: "Kubernetes Reboot Daemon",
|
||||
PersistentPreRunE: bindViper,
|
||||
PreRun: flagCheck,
|
||||
Run: root}
|
||||
|
||||
rootCmd.PersistentFlags().StringVar(&nodeID, "node-id", "",
|
||||
"node name kured runs on, should be passed down from spec.nodeName via KURED_NODE_ID environment variable")
|
||||
rootCmd.PersistentFlags().BoolVar(&forceReboot, "force-reboot", false,
|
||||
"force a reboot even if the drain is still running (default: false)")
|
||||
"force a reboot even if the drain fails or times out (default: false)")
|
||||
rootCmd.PersistentFlags().IntVar(&drainGracePeriod, "drain-grace-period", -1,
|
||||
"time in seconds given to each pod to terminate gracefully, if negative, the default value specified in the pod will be used (default: -1)")
|
||||
rootCmd.PersistentFlags().IntVar(&skipWaitForDeleteTimeoutSeconds, "skip-wait-for-delete-timeout", 0,
|
||||
"when seconds is greater than zero, skip waiting for the pods whose deletion timestamp is older than N seconds while draining a node (default: 0)")
|
||||
rootCmd.PersistentFlags().DurationVar(&drainTimeout, "drain-timeout", 0,
|
||||
"timeout after which the drain is aborted (default: 0, infinite time)")
|
||||
rootCmd.PersistentFlags().DurationVar(&rebootDelay, "reboot-delay", 0,
|
||||
"delay reboot for this duration (default: 0, disabled)")
|
||||
rootCmd.PersistentFlags().DurationVar(&period, "period", time.Minute*60,
|
||||
"sentinel check period")
|
||||
rootCmd.PersistentFlags().StringVar(&dsNamespace, "ds-namespace", "kube-system",
|
||||
@@ -120,6 +143,8 @@ func main() {
|
||||
"Prometheus instance to probe for active alerts")
|
||||
rootCmd.PersistentFlags().Var(®expValue{&alertFilter}, "alert-filter-regexp",
|
||||
"alert names to ignore when checking for active alerts")
|
||||
rootCmd.PersistentFlags().BoolVar(&alertFiringOnly, "alert-firing-only", false,
|
||||
"only consider firing alerts when checking for active alerts (default: false)")
|
||||
rootCmd.PersistentFlags().StringVar(&rebootSentinelFile, "reboot-sentinel", "/var/run/reboot-required",
|
||||
"path to file whose existence triggers the reboot command")
|
||||
rootCmd.PersistentFlags().StringVar(&preferNoScheduleTaintName, "prefer-no-schedule-taint", "",
|
||||
@@ -157,21 +182,63 @@ func main() {
|
||||
rootCmd.PersistentFlags().BoolVar(&annotateNodes, "annotate-nodes", false,
|
||||
"if set, the annotations 'weave.works/kured-reboot-in-progress' and 'weave.works/kured-most-recent-reboot-needed' will be given to nodes undergoing kured reboots")
|
||||
|
||||
if err := rootCmd.Execute(); err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
rootCmd.PersistentFlags().StringVar(&logFormat, "log-format", "text",
|
||||
"use text or json log format")
|
||||
|
||||
return rootCmd
|
||||
}
|
||||
|
||||
// temporary func that checks for deprecated slack-notification-related flags
|
||||
func flagCheck(cmd *cobra.Command, args []string) {
|
||||
if slackHookURL != "" && notifyURL != "" {
|
||||
log.Warnf("Cannot use both --notify-url and --slack-hook-url flags. Kured will use --notify-url flag only...")
|
||||
slackHookURL = ""
|
||||
}
|
||||
if slackChannel != "" || slackHookURL != "" || slackUsername != "" {
|
||||
if slackHookURL != "" {
|
||||
log.Warnf("Deprecated flag(s). Please use --notify-url flag instead.")
|
||||
trataURL, err := url.Parse(slackHookURL)
|
||||
if err != nil {
|
||||
log.Warnf("slack-hook-url is not properly formatted... no notification will be sent: %v\n", err)
|
||||
}
|
||||
if len(strings.Split(strings.Trim(trataURL.Path, "/services/"), "/")) != 3 {
|
||||
log.Warnf("slack-hook-url is not properly formatted... no notification will be sent: unexpected number of / in URL\n")
|
||||
} else {
|
||||
notifyURL = fmt.Sprintf("slack://%s", strings.Trim(trataURL.Path, "/services/"))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// bindViper initializes viper and binds command flags with environment variables
|
||||
func bindViper(cmd *cobra.Command, args []string) error {
|
||||
v := viper.New()
|
||||
|
||||
v.SetEnvPrefix(EnvPrefix)
|
||||
v.AutomaticEnv()
|
||||
bindFlags(cmd, v)
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// bindFlags binds each cobra flag to its associated viper configuration (environment variable)
|
||||
func bindFlags(cmd *cobra.Command, v *viper.Viper) {
|
||||
cmd.Flags().VisitAll(func(f *pflag.Flag) {
|
||||
// Environment variables can't have dashes in them, so bind them to their equivalent keys with underscores
|
||||
if strings.Contains(f.Name, "-") {
|
||||
v.BindEnv(f.Name, flagToEnvVar(f.Name))
|
||||
}
|
||||
|
||||
// Apply the viper config value to the flag when the flag is not set and viper has a value
|
||||
if !f.Changed && v.IsSet(f.Name) {
|
||||
val := v.Get(f.Name)
|
||||
log.Infof("Binding %s command flag to environment variable: %s", f.Name, flagToEnvVar(f.Name))
|
||||
cmd.Flags().Set(f.Name, fmt.Sprintf("%v", val))
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
// flagToEnvVar converts command flag name to equivalent environment variable name
|
||||
func flagToEnvVar(flag string) string {
|
||||
envVarSuffix := strings.ToUpper(strings.ReplaceAll(flag, "-", "_"))
|
||||
return fmt.Sprintf("%s_%s", EnvPrefix, envVarSuffix)
|
||||
}
|
||||
|
||||
// newCommand creates a new Command with stdout/stderr wired to our standard logger
|
||||
@@ -228,10 +295,13 @@ type RebootBlocker interface {
|
||||
// PrometheusBlockingChecker contains info for connecting
|
||||
// to prometheus, and can give info about whether a reboot should be blocked
|
||||
type PrometheusBlockingChecker struct {
|
||||
// URL to contact prometheus API for checking alerts
|
||||
promURL string
|
||||
// prometheusClient to make prometheus-go-client and api config available
|
||||
// into the PrometheusBlockingChecker struct
|
||||
promClient *alerts.PromClient
|
||||
// regexp used to get alerts
|
||||
filter *regexp.Regexp
|
||||
// bool to indicate if only firing alerts should be considered
|
||||
firingOnly bool
|
||||
}
|
||||
|
||||
// KubernetesBlockingChecker contains info for connecting
|
||||
@@ -245,7 +315,8 @@ type KubernetesBlockingChecker struct {
|
||||
}
|
||||
|
||||
func (pb PrometheusBlockingChecker) isBlocked() bool {
|
||||
alertNames, err := alerts.PrometheusActiveAlerts(pb.promURL, pb.filter)
|
||||
|
||||
alertNames, err := pb.promClient.ActiveAlerts(pb.filter, pb.firingOnly)
|
||||
if err != nil {
|
||||
log.Warnf("Reboot blocked: prometheus query error: %v", err)
|
||||
return true
|
||||
@@ -262,7 +333,7 @@ func (pb PrometheusBlockingChecker) isBlocked() bool {
|
||||
}
|
||||
|
||||
func (kb KubernetesBlockingChecker) isBlocked() bool {
|
||||
fieldSelector := fmt.Sprintf("spec.nodeName=%s", kb.nodename)
|
||||
fieldSelector := fmt.Sprintf("spec.nodeName=%s,status.phase!=Succeeded,status.phase!=Failed,status.phase!=Unknown", kb.nodename)
|
||||
for _, labelSelector := range kb.filter {
|
||||
podList, err := kb.client.CoreV1().Pods("").List(context.TODO(), metav1.ListOptions{
|
||||
LabelSelector: labelSelector,
|
||||
@@ -342,11 +413,6 @@ func drain(client *kubernetes.Clientset, node *v1.Node) {
|
||||
|
||||
log.Infof("Draining node %s", nodename)
|
||||
|
||||
if slackHookURL != "" {
|
||||
if err := slack.NotifyDrain(slackHookURL, slackUsername, slackChannel, messageTemplateDrain, nodename); err != nil {
|
||||
log.Warnf("Error notifying slack: %v", err)
|
||||
}
|
||||
}
|
||||
if notifyURL != "" {
|
||||
if err := shoutrrr.Send(notifyURL, fmt.Sprintf(messageTemplateDrain, nodename)); err != nil {
|
||||
log.Warnf("Error notifying: %v", err)
|
||||
@@ -390,6 +456,7 @@ func uncordon(client *kubernetes.Clientset, node *v1.Node) {
|
||||
Client: client,
|
||||
ErrOut: os.Stderr,
|
||||
Out: os.Stdout,
|
||||
Ctx: context.Background(),
|
||||
}
|
||||
if err := kubectldrain.RunCordonOrUncordon(drainer, node, false); err != nil {
|
||||
log.Fatalf("Error uncordonning %s: %v", nodename, err)
|
||||
@@ -399,12 +466,6 @@ func uncordon(client *kubernetes.Clientset, node *v1.Node) {
|
||||
func invokeReboot(nodeID string, rebootCommand []string) {
|
||||
log.Infof("Running command: %s for node: %s", rebootCommand, nodeID)
|
||||
|
||||
if slackHookURL != "" {
|
||||
if err := slack.NotifyReboot(slackHookURL, slackUsername, slackChannel, messageTemplateReboot, nodeID); err != nil {
|
||||
log.Warnf("Error notifying slack: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
if notifyURL != "" {
|
||||
if err := shoutrrr.Send(notifyURL, fmt.Sprintf(messageTemplateReboot, nodeID)); err != nil {
|
||||
log.Warnf("Error notifying: %v", err)
|
||||
@@ -513,6 +574,12 @@ func rebootAsRequired(nodeID string, rebootCommand []string, sentinelCommand []s
|
||||
preferNoScheduleTaint.Disable()
|
||||
}
|
||||
|
||||
// instantiate prometheus client
|
||||
promClient, err := alerts.NewPromClient(papi.Config{Address: prometheusURL})
|
||||
if err != nil {
|
||||
log.Fatal("Unable to create prometheus client: ", err)
|
||||
}
|
||||
|
||||
source := rand.NewSource(time.Now().UnixNano())
|
||||
tick := delaytick.New(source, period)
|
||||
for range tick {
|
||||
@@ -531,7 +598,7 @@ func rebootAsRequired(nodeID string, rebootCommand []string, sentinelCommand []s
|
||||
|
||||
var blockCheckers []RebootBlocker
|
||||
if prometheusURL != "" {
|
||||
blockCheckers = append(blockCheckers, PrometheusBlockingChecker{promURL: prometheusURL, filter: alertFilter})
|
||||
blockCheckers = append(blockCheckers, PrometheusBlockingChecker{promClient: promClient, filter: alertFilter, firingOnly: alertFiringOnly})
|
||||
}
|
||||
if podSelectors != nil {
|
||||
blockCheckers = append(blockCheckers, KubernetesBlockingChecker{client: client, nodename: nodeID, filter: podSelectors})
|
||||
@@ -567,6 +634,12 @@ func rebootAsRequired(nodeID string, rebootCommand []string, sentinelCommand []s
|
||||
}
|
||||
|
||||
drain(client, node)
|
||||
|
||||
if rebootDelay > 0 {
|
||||
log.Infof("Delaying reboot for %v", rebootDelay)
|
||||
time.Sleep(rebootDelay)
|
||||
}
|
||||
|
||||
invokeReboot(nodeID, rebootCommand)
|
||||
for {
|
||||
log.Infof("Waiting for reboot")
|
||||
@@ -599,9 +672,12 @@ func parseRebootCommand(rebootCommand string) []string {
|
||||
}
|
||||
|
||||
func root(cmd *cobra.Command, args []string) {
|
||||
if logFormat == "json" {
|
||||
log.SetFormatter(&log.JSONFormatter{})
|
||||
}
|
||||
|
||||
log.Infof("Kubernetes Reboot Daemon: %s", version)
|
||||
|
||||
nodeID := os.Getenv("KURED_NODE_ID")
|
||||
if nodeID == "" {
|
||||
log.Fatal("KURED_NODE_ID environment variable required")
|
||||
}
|
||||
|
||||
@@ -5,7 +5,11 @@ import (
|
||||
"testing"
|
||||
|
||||
log "github.com/sirupsen/logrus"
|
||||
"github.com/spf13/cobra"
|
||||
"github.com/weaveworks/kured/pkg/alerts"
|
||||
assert "gotest.tools/v3/assert"
|
||||
|
||||
papi "github.com/prometheus/client_golang/api"
|
||||
)
|
||||
|
||||
type BlockingChecker struct {
|
||||
@@ -19,11 +23,26 @@ func (fbc BlockingChecker) isBlocked() bool {
|
||||
var _ RebootBlocker = BlockingChecker{} // Verify that Type implements Interface.
|
||||
var _ RebootBlocker = (*BlockingChecker)(nil) // Verify that *Type implements Interface.
|
||||
|
||||
func Test_flagCheck(t *testing.T) {
|
||||
var cmd *cobra.Command
|
||||
var args []string
|
||||
slackHookURL = "https://hooks.slack.com/services/BLABLABA12345/IAM931A0VERY/COMPLICATED711854TOKEN1SET"
|
||||
flagCheck(cmd, args)
|
||||
if notifyURL != "slack://BLABLABA12345/IAM931A0VERY/COMPLICATED711854TOKEN1SET" {
|
||||
t.Errorf("Slack URL Parsing is wrong: expecting %s but got %s\n", "slack://BLABLABA12345/IAM931A0VERY/COMPLICATED711854TOKEN1SET", notifyURL)
|
||||
}
|
||||
}
|
||||
func Test_rebootBlocked(t *testing.T) {
|
||||
noCheckers := []RebootBlocker{}
|
||||
nonblockingChecker := BlockingChecker{blocking: false}
|
||||
blockingChecker := BlockingChecker{blocking: true}
|
||||
brokenPrometheusClient := PrometheusBlockingChecker{promURL: "", filter: nil}
|
||||
|
||||
// Instantiate a prometheusClient with a broken_url
|
||||
promClient, err := alerts.NewPromClient(papi.Config{Address: "broken_url"})
|
||||
if err != nil {
|
||||
log.Fatal("Can't create prometheusClient: ", err)
|
||||
}
|
||||
brokenPrometheusClient := PrometheusBlockingChecker{promClient: promClient, filter: nil, firingOnly: false}
|
||||
|
||||
type args struct {
|
||||
blockers []RebootBlocker
|
||||
|
||||
22
go.mod
22
go.mod
@@ -1,18 +1,20 @@
|
||||
module github.com/weaveworks/kured
|
||||
|
||||
go 1.15
|
||||
go 1.16
|
||||
|
||||
require (
|
||||
github.com/containrrr/shoutrrr v0.4.4
|
||||
github.com/containrrr/shoutrrr v0.5.2
|
||||
github.com/google/shlex v0.0.0-20191202100458-e7afc7fbc510
|
||||
github.com/prometheus/client_golang v1.10.0
|
||||
github.com/prometheus/common v0.25.0
|
||||
github.com/prometheus/client_golang v1.11.0
|
||||
github.com/prometheus/common v0.32.1
|
||||
github.com/sirupsen/logrus v1.8.1
|
||||
github.com/spf13/cobra v1.1.3
|
||||
golang.org/x/crypto v0.0.0-20210506145944-38f3c27a63bf // indirect
|
||||
github.com/spf13/cobra v1.3.0
|
||||
github.com/spf13/pflag v1.0.5
|
||||
github.com/spf13/viper v1.10.1
|
||||
github.com/stretchr/testify v1.7.0
|
||||
gotest.tools/v3 v3.0.3
|
||||
k8s.io/api v0.20.5
|
||||
k8s.io/apimachinery v0.20.5
|
||||
k8s.io/client-go v0.20.5
|
||||
k8s.io/kubectl v0.20.5
|
||||
k8s.io/api v0.22.4
|
||||
k8s.io/apimachinery v0.22.4
|
||||
k8s.io/client-go v0.22.4
|
||||
k8s.io/kubectl v0.22.4
|
||||
)
|
||||
|
||||
@@ -29,7 +29,7 @@ spec:
|
||||
restartPolicy: Always
|
||||
containers:
|
||||
- name: kured
|
||||
image: docker.io/weaveworks/kured
|
||||
image: docker.io/weaveworks/kured:1.9.1
|
||||
# If you find yourself here wondering why there is no
|
||||
# :latest tag on Docker Hub, see the FAQ in the README
|
||||
imagePullPolicy: IfNotPresent
|
||||
@@ -55,6 +55,7 @@ spec:
|
||||
# - --lock-ttl=0
|
||||
# - --prometheus-url=http://prometheus.monitoring.svc.cluster.local
|
||||
# - --alert-filter-regexp=^RebootRequired$
|
||||
# - --alert-firing-only=false
|
||||
# - --reboot-sentinel=/var/run/reboot-required
|
||||
# - --prefer-no-schedule-taint=""
|
||||
# - --reboot-sentinel-command=""
|
||||
@@ -68,8 +69,10 @@ spec:
|
||||
# - --blocking-pod-selector=name=temperamental
|
||||
# - --blocking-pod-selector=...
|
||||
# - --reboot-days=sun,mon,tue,wed,thu,fri,sat
|
||||
# - --reboot-delay=90s
|
||||
# - --start-time=0:00
|
||||
# - --end-time=23:59:59
|
||||
# - --time-zone=UTC
|
||||
# - --annotate-nodes=false
|
||||
# - --lock-release-delay=30m
|
||||
# - --log-format=text
|
||||
|
||||
@@ -7,22 +7,39 @@ import (
|
||||
"sort"
|
||||
"time"
|
||||
|
||||
"github.com/prometheus/client_golang/api"
|
||||
papi "github.com/prometheus/client_golang/api"
|
||||
v1 "github.com/prometheus/client_golang/api/prometheus/v1"
|
||||
"github.com/prometheus/common/model"
|
||||
)
|
||||
|
||||
// PrometheusActiveAlerts returns a list of names of active (e.g. pending or firing) alerts, filtered
|
||||
// by the supplied regexp.
|
||||
func PrometheusActiveAlerts(prometheusURL string, filter *regexp.Regexp) ([]string, error) {
|
||||
client, err := api.NewClient(api.Config{Address: prometheusURL})
|
||||
// PromClient is a wrapper around the Prometheus Client interface and implements the api
|
||||
// This way, the PromClient can be instantiated with the configuration the Client needs, and
|
||||
// the ability to use the methods the api has, like Query and so on.
|
||||
type PromClient struct {
|
||||
papi papi.Client
|
||||
api v1.API
|
||||
}
|
||||
|
||||
// NewPromClient creates a new client to the Prometheus API.
|
||||
// It returns an error on any problem.
|
||||
func NewPromClient(conf papi.Config) (*PromClient, error) {
|
||||
promClient, err := papi.NewClient(conf)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
client := PromClient{papi: promClient, api: v1.NewAPI(promClient)}
|
||||
return &client, nil
|
||||
}
|
||||
|
||||
queryAPI := v1.NewAPI(client)
|
||||
// ActiveAlerts is a method of type PromClient, it returns a list of names of active alerts
|
||||
// (e.g. pending or firing), filtered by the supplied regexp or by the includeLabels query.
|
||||
// filter by regexp means when the regex finds the alert-name; the alert is excluded from the
|
||||
// block-list and will NOT block rebooting. query by includeLabel means,
|
||||
// if the query finds an alert, it will include it to the block-list and it WILL block rebooting.
|
||||
func (p *PromClient) ActiveAlerts(filter *regexp.Regexp, firingOnly bool) ([]string, error) {
|
||||
|
||||
value, _, err := queryAPI.Query(context.Background(), "ALERTS", time.Now())
|
||||
// get all alerts from prometheus
|
||||
value, _, err := p.api.Query(context.Background(), "ALERTS", time.Now())
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
@@ -32,7 +49,7 @@ func PrometheusActiveAlerts(prometheusURL string, filter *regexp.Regexp) ([]stri
|
||||
activeAlertSet := make(map[string]bool)
|
||||
for _, sample := range vector {
|
||||
if alertName, isAlert := sample.Metric[model.AlertNameLabel]; isAlert && sample.Value != 0 {
|
||||
if filter == nil || !filter.MatchString(string(alertName)) {
|
||||
if (filter == nil || !filter.MatchString(string(alertName))) && (!firingOnly || sample.Metric["alertstate"] == "firing") {
|
||||
activeAlertSet[string(alertName)] = true
|
||||
}
|
||||
}
|
||||
@@ -42,7 +59,7 @@ func PrometheusActiveAlerts(prometheusURL string, filter *regexp.Regexp) ([]stri
|
||||
for activeAlert := range activeAlertSet {
|
||||
activeAlerts = append(activeAlerts, activeAlert)
|
||||
}
|
||||
sort.Sort(sort.StringSlice(activeAlerts))
|
||||
sort.Strings(activeAlerts)
|
||||
|
||||
return activeAlerts, nil
|
||||
}
|
||||
|
||||
141
pkg/alerts/prometheus_test.go
Normal file
141
pkg/alerts/prometheus_test.go
Normal file
@@ -0,0 +1,141 @@
|
||||
package alerts
|
||||
|
||||
import (
|
||||
"log"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
|
||||
"regexp"
|
||||
"testing"
|
||||
|
||||
"github.com/prometheus/client_golang/api"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
)
|
||||
|
||||
type MockResponse struct {
|
||||
StatusCode int
|
||||
Body []byte
|
||||
}
|
||||
|
||||
// MockServerProperties ties a mock response to a url and a method
|
||||
type MockServerProperties struct {
|
||||
URI string
|
||||
HTTPMethod string
|
||||
Response MockResponse
|
||||
}
|
||||
|
||||
// NewMockServer sets up a new MockServer with properties ad starts the server.
|
||||
func NewMockServer(props ...MockServerProperties) *httptest.Server {
|
||||
|
||||
handler := http.HandlerFunc(
|
||||
func(w http.ResponseWriter, r *http.Request) {
|
||||
for _, proc := range props {
|
||||
_, err := w.Write(proc.Response.Body)
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
}
|
||||
})
|
||||
return httptest.NewServer(handler)
|
||||
}
|
||||
|
||||
func TestActiveAlerts(t *testing.T) {
|
||||
responsebody := `{"status":"success","data":{"resultType":"vector","result":[{"metric":{"__name__":"ALERTS","alertname":"GatekeeperViolations","alertstate":"firing","severity":"warning","team":"platform-infra"},"value":[1622472933.973,"1"]},{"metric":{"__name__":"ALERTS","alertname":"PodCrashing-dev","alertstate":"firing","container":"deployment","instance":"1.2.3.4:8080","job":"kube-state-metrics","namespace":"dev","pod":"dev-deployment-78dcbmf25v","severity":"critical","team":"dev"},"value":[1622472933.973,"1"]},{"metric":{"__name__":"ALERTS","alertname":"PodRestart-dev","alertstate":"firing","container":"deployment","instance":"1.2.3.4:1234","job":"kube-state-metrics","namespace":"qa","pod":"qa-job-deployment-78dcbmf25v","severity":"warning","team":"qa"},"value":[1622472933.973,"1"]},{"metric":{"__name__":"ALERTS","alertname":"PrometheusTargetDown","alertstate":"firing","job":"kubernetes-pods","severity":"warning","team":"platform-infra"},"value":[1622472933.973,"1"]},{"metric":{"__name__":"ALERTS","alertname":"ScheduledRebootFailing","alertstate":"pending","severity":"warning","team":"platform-infra"},"value":[1622472933.973,"1"]}]}}`
|
||||
addr := "http://localhost:10001"
|
||||
|
||||
for _, tc := range []struct {
|
||||
it string
|
||||
rFilter string
|
||||
respBody string
|
||||
aName string
|
||||
wantN int
|
||||
firingOnly bool
|
||||
}{
|
||||
{
|
||||
it: "should return no active alerts",
|
||||
respBody: responsebody,
|
||||
rFilter: "",
|
||||
wantN: 0,
|
||||
firingOnly: false,
|
||||
},
|
||||
{
|
||||
it: "should return a subset of all alerts",
|
||||
respBody: responsebody,
|
||||
rFilter: "Pod",
|
||||
wantN: 3,
|
||||
firingOnly: false,
|
||||
},
|
||||
{
|
||||
it: "should return all active alerts by regex",
|
||||
respBody: responsebody,
|
||||
rFilter: "*",
|
||||
wantN: 5,
|
||||
firingOnly: false,
|
||||
},
|
||||
{
|
||||
it: "should return all active alerts by regex filter",
|
||||
respBody: responsebody,
|
||||
rFilter: "*",
|
||||
wantN: 5,
|
||||
firingOnly: false,
|
||||
},
|
||||
{
|
||||
it: "should return only firing alerts if firingOnly is true",
|
||||
respBody: responsebody,
|
||||
rFilter: "*",
|
||||
wantN: 4,
|
||||
firingOnly: true,
|
||||
},
|
||||
{
|
||||
it: "should return ScheduledRebootFailing active alerts",
|
||||
respBody: `{"status":"success","data":{"resultType":"vector","result":[{"metric":{"__name__":"ALERTS","alertname":"ScheduledRebootFailing","alertstate":"pending","severity":"warning","team":"platform-infra"},"value":[1622472933.973,"1"]}]}}`,
|
||||
aName: "ScheduledRebootFailing",
|
||||
rFilter: "*",
|
||||
wantN: 1,
|
||||
firingOnly: false,
|
||||
},
|
||||
{
|
||||
it: "should not return an active alert if RebootRequired is firing (regex filter)",
|
||||
respBody: `{"status":"success","data":{"resultType":"vector","result":[{"metric":{"__name__":"ALERTS","alertname":"RebootRequired","alertstate":"pending","severity":"warning","team":"platform-infra"},"value":[1622472933.973,"1"]}]}}`,
|
||||
rFilter: "RebootRequired",
|
||||
wantN: 0,
|
||||
firingOnly: false,
|
||||
},
|
||||
} {
|
||||
// Start mockServer
|
||||
mockServer := NewMockServer(MockServerProperties{
|
||||
URI: addr,
|
||||
HTTPMethod: http.MethodPost,
|
||||
Response: MockResponse{
|
||||
Body: []byte(tc.respBody),
|
||||
},
|
||||
})
|
||||
// Close mockServer after all connections are gone
|
||||
defer mockServer.Close()
|
||||
|
||||
t.Run(tc.it, func(t *testing.T) {
|
||||
|
||||
// regex filter
|
||||
regex, _ := regexp.Compile(tc.rFilter)
|
||||
|
||||
// instantiate the prometheus client with the mockserver-address
|
||||
p, err := NewPromClient(api.Config{Address: mockServer.URL})
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
|
||||
result, err := p.ActiveAlerts(regex, tc.firingOnly)
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
|
||||
// assert
|
||||
assert.Equal(t, tc.wantN, len(result), "expected amount of alerts %v, got %v", tc.wantN, len(result))
|
||||
|
||||
if tc.aName != "" {
|
||||
assert.Equal(t, tc.aName, result[0], "expected active alert %v, got %v", tc.aName, result[0])
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -1,54 +0,0 @@
|
||||
package slack
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"time"
|
||||
)
|
||||
|
||||
// httpClient is the package-wide HTTP client used to post webhook messages.
// Requests made through it are bounded by a 5-second timeout.
var httpClient = &http.Client{
	Timeout: 5 * time.Second,
}
|
||||
|
||||
// body is the JSON document sent as the webhook request payload.
// All fields carry the omitempty option, so a field holding the empty
// string is omitted from the encoded output.
type body struct {
	// Text is the message content.
	Text string `json:"text,omitempty"`
	// Username is encoded under the "username" key.
	Username string `json:"username,omitempty"`
	// Channel is encoded under the "channel" key.
	Channel string `json:"channel,omitempty"`
}
|
||||
|
||||
func notify(hookURL, username, channel, message string) error {
|
||||
msg := body{
|
||||
Text: message,
|
||||
Username: username,
|
||||
Channel: channel,
|
||||
}
|
||||
|
||||
var buf bytes.Buffer
|
||||
if err := json.NewEncoder(&buf).Encode(&msg); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
resp, err := httpClient.Post(hookURL, "application/json", &buf)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode < 200 || resp.StatusCode >= 300 {
|
||||
return fmt.Errorf(resp.Status)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// NotifyDrain is the exposed way to notify of a drain event onto a slack chan
|
||||
func NotifyDrain(hookURL, username, channel, messageTemplate, nodeID string) error {
|
||||
return notify(hookURL, username, channel, fmt.Sprintf(messageTemplate, nodeID))
|
||||
}
|
||||
|
||||
// NotifyReboot is the exposed way to notify of a reboot event onto a slack chan
|
||||
func NotifyReboot(hookURL, username, channel, messageTemplate, nodeID string) error {
|
||||
return notify(hookURL, username, channel, fmt.Sprintf(messageTemplate, nodeID))
|
||||
}
|
||||
Reference in New Issue
Block a user