mirror of
https://github.com/kubereboot/kured.git
synced 2026-02-14 17:39:49 +00:00
Compare commits
337 Commits
1.9.1
...
cleanup/to
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
59cbea5e25 | ||
|
|
776c35c1e1 | ||
|
|
9a4b8fdb32 | ||
|
|
3b9b190422 | ||
|
|
f22b1abd17 | ||
|
|
c159b37fcc | ||
|
|
351ca71787 | ||
|
|
16dc5e30d9 | ||
|
|
aa971697ff | ||
|
|
d019e7a50a | ||
|
|
ee81617645 | ||
|
|
d7adcf6e1e | ||
|
|
409ff0a3e6 | ||
|
|
3be3cd46b5 | ||
|
|
e8202c602c | ||
|
|
752176d16b | ||
|
|
d30a71e1d3 | ||
|
|
815df5e1e9 | ||
|
|
77327b3915 | ||
|
|
ec328e33d6 | ||
|
|
54e127c2ad | ||
|
|
1867c3253e | ||
|
|
05a3ff85a3 | ||
|
|
19846c73f2 | ||
|
|
ba62c32cbf | ||
|
|
4c75199b41 | ||
|
|
91eb403942 | ||
|
|
a27c755260 | ||
|
|
2a6d119b3b | ||
|
|
b17224addc | ||
|
|
a2f21ebe49 | ||
|
|
4d2f26f483 | ||
|
|
b358be7617 | ||
|
|
e88434b619 | ||
|
|
1b12e52434 | ||
|
|
64e40a62b0 | ||
|
|
6690396679 | ||
|
|
9acb2450ea | ||
|
|
e1a5b7d705 | ||
|
|
72f52f2c6f | ||
|
|
6df454c0eb | ||
|
|
c09e65eab1 | ||
|
|
a34c994f4b | ||
|
|
60c54bef31 | ||
|
|
8afa302680 | ||
|
|
de42273849 | ||
|
|
d3e2c9af95 | ||
|
|
00648786b7 | ||
|
|
c7f4380847 | ||
|
|
c659c25b94 | ||
|
|
f44ced2d04 | ||
|
|
e7d24bfff0 | ||
|
|
0378c8a8c5 | ||
|
|
2cfeb34c03 | ||
|
|
3bfacca254 | ||
|
|
46e1b9616b | ||
|
|
fe95f17503 | ||
|
|
462a063b6e | ||
|
|
e664de6c6f | ||
|
|
b666474cf1 | ||
|
|
64313f82ef | ||
|
|
59ba53584e | ||
|
|
b2ffc0d154 | ||
|
|
b7edf8b345 | ||
|
|
4e01e607cc | ||
|
|
1929c11297 | ||
|
|
28832f5cfb | ||
|
|
3c79c750e1 | ||
|
|
58afedd842 | ||
|
|
57783966db | ||
|
|
316a0ef4a3 | ||
|
|
7a86e65c69 | ||
|
|
efa0fe808d | ||
|
|
c2f97614dd | ||
|
|
e710e05658 | ||
|
|
4ff3378df5 | ||
|
|
002f331486 | ||
|
|
2993afb329 | ||
|
|
97e1f56008 | ||
|
|
4c9ed478d4 | ||
|
|
6e0af2f320 | ||
|
|
1ea3823069 | ||
|
|
0063141b89 | ||
|
|
3a1cfe395e | ||
|
|
ae3ab9f3e1 | ||
|
|
0b27a7ea80 | ||
|
|
2596dcdcab | ||
|
|
00c8b5254b | ||
|
|
6aa6a96b46 | ||
|
|
a7b155a78f | ||
|
|
031ceed1f1 | ||
|
|
0ceb062a47 | ||
|
|
a4fba5a5bc | ||
|
|
942f9d7eed | ||
|
|
fd58b79413 | ||
|
|
132215ee97 | ||
|
|
25662304c2 | ||
|
|
887b2e2427 | ||
|
|
6afa8513c8 | ||
|
|
94a4387407 | ||
|
|
9ab71c894f | ||
|
|
72eda8a7c3 | ||
|
|
7bb9b75e2a | ||
|
|
dfb8441078 | ||
|
|
0e0cf7fac1 | ||
|
|
06af12114d | ||
|
|
477f356571 | ||
|
|
ad1e9b8401 | ||
|
|
80628b1b79 | ||
|
|
30673c0391 | ||
|
|
35e7bf9897 | ||
|
|
f8551b6714 | ||
|
|
d87d585b9c | ||
|
|
6ff57552c7 | ||
|
|
36c78d94ce | ||
|
|
0bc867cf11 | ||
|
|
c6d9bf07e6 | ||
|
|
fb84fa8253 | ||
|
|
05414fb9d0 | ||
|
|
230fa45461 | ||
|
|
6aca815125 | ||
|
|
eed2df6493 | ||
|
|
ff773d96bd | ||
|
|
dd7081d58a | ||
|
|
504708e5cd | ||
|
|
77a57089f0 | ||
|
|
185761c024 | ||
|
|
a5c051d2e5 | ||
|
|
85c4fae0a6 | ||
|
|
99cfe852ba | ||
|
|
1a8eccc039 | ||
|
|
75478e8608 | ||
|
|
4098158c6d | ||
|
|
93bd87547a | ||
|
|
28a05ca941 | ||
|
|
5db3889bdc | ||
|
|
72c3113c56 | ||
|
|
6122192232 | ||
|
|
c8ba0d98dc | ||
|
|
80579a3a81 | ||
|
|
72c234e470 | ||
|
|
ae29a90f43 | ||
|
|
1e0e1e52d8 | ||
|
|
3299e8ebf7 | ||
|
|
f93a634bd1 | ||
|
|
c19045c344 | ||
|
|
2440b32c49 | ||
|
|
0fc09c6d31 | ||
|
|
ed8e3292f1 | ||
|
|
9ec2d83476 | ||
|
|
6456d50880 | ||
|
|
c9a229c02c | ||
|
|
d2d5ba4ee8 | ||
|
|
a7e227a259 | ||
|
|
67c0b9bfdb | ||
|
|
f008edd583 | ||
|
|
39451838aa | ||
|
|
60d0f90c80 | ||
|
|
3005892498 | ||
|
|
b74784d36a | ||
|
|
f9ff4b3ac6 | ||
|
|
eb877416ae | ||
|
|
16fd2db8fb | ||
|
|
0f2dff84cd | ||
|
|
bdd59a72f3 | ||
|
|
09bfa18f47 | ||
|
|
a713389ca5 | ||
|
|
6986342e12 | ||
|
|
064724b8f6 | ||
|
|
3f8b119b39 | ||
|
|
e5dc205494 | ||
|
|
c3c80834d0 | ||
|
|
8f96eb6688 | ||
|
|
16d33d96f7 | ||
|
|
590b6b87a4 | ||
|
|
53e17ec12a | ||
|
|
4485fb60b0 | ||
|
|
4d7ba069d3 | ||
|
|
0ff6390107 | ||
|
|
42bf7c5fd9 | ||
|
|
b39fa7f2b8 | ||
|
|
12a252532e | ||
|
|
121b5e8a12 | ||
|
|
e4512458de | ||
|
|
d0a3031b86 | ||
|
|
9db896fc6a | ||
|
|
91139a728a | ||
|
|
b0acc1e3e4 | ||
|
|
82e3c99964 | ||
|
|
857eecf36d | ||
|
|
0189c350d0 | ||
|
|
b340cd53e9 | ||
|
|
3aad30974a | ||
|
|
87ef8296b2 | ||
|
|
e3442b164b | ||
|
|
47f0b244fe | ||
|
|
6ee5fa8636 | ||
|
|
6b757de2ee | ||
|
|
ba1328ca12 | ||
|
|
8cabfb7d75 | ||
|
|
5ac4f7ec86 | ||
|
|
beecd839c2 | ||
|
|
e70af373fc | ||
|
|
cb4eccb22e | ||
|
|
fe6bea8c29 | ||
|
|
06fad838d4 | ||
|
|
15d0492e23 | ||
|
|
645c768001 | ||
|
|
1bbcbe93b7 | ||
|
|
c828d27fb2 | ||
|
|
e34f1a0947 | ||
|
|
50d024c3f9 | ||
|
|
e36a43c57c | ||
|
|
bce0bac183 | ||
|
|
d5217121ac | ||
|
|
82142f4d6a | ||
|
|
796014ab80 | ||
|
|
e5867b9f87 | ||
|
|
8343ddd9c5 | ||
|
|
92db607e89 | ||
|
|
c69a2449b2 | ||
|
|
e3032373ab | ||
|
|
5fd42b0085 | ||
|
|
9d28cac8b4 | ||
|
|
788e351a18 | ||
|
|
c20a5c2da9 | ||
|
|
8e42373fcb | ||
|
|
6966f628b9 | ||
|
|
41ae0b20a5 | ||
|
|
8f5b9abc19 | ||
|
|
774dc69e3b | ||
|
|
9377948f62 | ||
|
|
4d6cac66a6 | ||
|
|
9d4ebfc1f8 | ||
|
|
ad781ad6f0 | ||
|
|
9525ce53a3 | ||
|
|
c5bb9ae478 | ||
|
|
8cf12fa24e | ||
|
|
98fdb334aa | ||
|
|
87eda823e7 | ||
|
|
9788dba4f3 | ||
|
|
777f5b2cce | ||
|
|
055de3a949 | ||
|
|
7bea9d53c1 | ||
|
|
aa5a3f0ba9 | ||
|
|
8230add524 | ||
|
|
10d42b07a5 | ||
|
|
5a59c2f504 | ||
|
|
9c56b28282 | ||
|
|
dd0bce41be | ||
|
|
67c50b27ab | ||
|
|
e2e6e86e0c | ||
|
|
5aaa1e01bc | ||
|
|
00d5b4920a | ||
|
|
28c5332450 | ||
|
|
71b3f1dd7f | ||
|
|
95aee6828c | ||
|
|
66ce93ef09 | ||
|
|
1e76d65d00 | ||
|
|
f802373e0f | ||
|
|
6c34fee96b | ||
|
|
8dfccdbe48 | ||
|
|
db62f4aa0e | ||
|
|
115fea9d2a | ||
|
|
0734e270fa | ||
|
|
08774994ad | ||
|
|
90d2d9a39b | ||
|
|
35a6b8955d | ||
|
|
641c319eb8 | ||
|
|
bee558cd8f | ||
|
|
78064e1d2c | ||
|
|
29560f15b3 | ||
|
|
500a8a1bbb | ||
|
|
9e441ebee6 | ||
|
|
34f0df2605 | ||
|
|
cd7c4f8da3 | ||
|
|
9407c3f8f6 | ||
|
|
da59ebff70 | ||
|
|
d2d21f31c0 | ||
|
|
6191c73a3c | ||
|
|
48d112ba32 | ||
|
|
b12ae4eccd | ||
|
|
50aac294b7 | ||
|
|
c3cb2bbc6c | ||
|
|
67e979c198 | ||
|
|
9be88fb878 | ||
|
|
4fcf6e184b | ||
|
|
aa5c3e7783 | ||
|
|
5ab20e62d2 | ||
|
|
03e8c2116a | ||
|
|
9415f301a2 | ||
|
|
4d4d3982c2 | ||
|
|
84fa914fe6 | ||
|
|
d1e8b1b1a5 | ||
|
|
3487860e06 | ||
|
|
d965e7f67e | ||
|
|
4ab3bf9813 | ||
|
|
7397365c51 | ||
|
|
d771013cde | ||
|
|
195f4f0bee | ||
|
|
15735cd933 | ||
|
|
c44ecff3e5 | ||
|
|
1020e7179a | ||
|
|
5ff221b5b6 | ||
|
|
c7b5520859 | ||
|
|
bbdce6abe5 | ||
|
|
5b11ebcc3a | ||
|
|
8543cf25a2 | ||
|
|
6691996bc0 | ||
|
|
eb4acc69bf | ||
|
|
302578467d | ||
|
|
99e7b71ba4 | ||
|
|
e38d153fe7 | ||
|
|
7f6d4a1846 | ||
|
|
07208ef84b | ||
|
|
d6964180ca | ||
|
|
966698f3c6 | ||
|
|
445310b9b7 | ||
|
|
1eec15b5dd | ||
|
|
238e6993f3 | ||
|
|
1ca0203db2 | ||
|
|
9ddad78071 | ||
|
|
4918203ea9 | ||
|
|
640efa56b8 | ||
|
|
67232f00d9 | ||
|
|
ce32f9dc05 | ||
|
|
d82d295f2d | ||
|
|
580279f419 | ||
|
|
87508eb778 | ||
|
|
7d3b97541d | ||
|
|
93d6a783a1 | ||
|
|
b7494f5f80 | ||
|
|
8e1933cd28 | ||
|
|
96bf7c1add | ||
|
|
178ba93b5a | ||
|
|
f3ed0087d2 | ||
|
|
71a273a14c |
3
.clomonitor.yml
Normal file
3
.clomonitor.yml
Normal file
@@ -0,0 +1,3 @@
|
||||
exemptions:
|
||||
- check: analytics
|
||||
reason: "We don't track people"
|
||||
7
.github/ct.yaml
vendored
7
.github/ct.yaml
vendored
@@ -1,7 +0,0 @@
|
||||
# See https://github.com/helm/chart-testing#configuration
|
||||
remote: origin
|
||||
target-branch: main
|
||||
chart-dirs:
|
||||
- charts
|
||||
chart-repos: []
|
||||
helm-extra-args: --timeout 600s
|
||||
2
.github/dependabot.yml
vendored
2
.github/dependabot.yml
vendored
@@ -16,6 +16,6 @@ updates:
|
||||
- dependency-name: "k8s.io/client-go"
|
||||
- dependency-name: "k8s.io/kubectl"
|
||||
- package-ecosystem: "docker"
|
||||
directory: "cmd/kured"
|
||||
directory: "/"
|
||||
schedule:
|
||||
interval: "daily"
|
||||
|
||||
13
.github/kind-cluster-1.21.yaml
vendored
13
.github/kind-cluster-1.21.yaml
vendored
@@ -1,13 +0,0 @@
|
||||
kind: Cluster
|
||||
apiVersion: kind.x-k8s.io/v1alpha4
|
||||
nodes:
|
||||
- role: control-plane
|
||||
image: kindest/node:v1.21.2
|
||||
- role: control-plane
|
||||
image: kindest/node:v1.21.2
|
||||
- role: control-plane
|
||||
image: kindest/node:v1.21.2
|
||||
- role: worker
|
||||
image: kindest/node:v1.21.2
|
||||
- role: worker
|
||||
image: kindest/node:v1.21.2
|
||||
13
.github/kind-cluster-1.22.yaml
vendored
13
.github/kind-cluster-1.22.yaml
vendored
@@ -1,13 +0,0 @@
|
||||
kind: Cluster
|
||||
apiVersion: kind.x-k8s.io/v1alpha4
|
||||
nodes:
|
||||
- role: control-plane
|
||||
image: kindest/node:v1.22.4
|
||||
- role: control-plane
|
||||
image: kindest/node:v1.22.4
|
||||
- role: control-plane
|
||||
image: kindest/node:v1.22.4
|
||||
- role: worker
|
||||
image: kindest/node:v1.22.4
|
||||
- role: worker
|
||||
image: kindest/node:v1.22.4
|
||||
13
.github/kind-cluster-1.23.yaml
vendored
13
.github/kind-cluster-1.23.yaml
vendored
@@ -1,13 +0,0 @@
|
||||
kind: Cluster
|
||||
apiVersion: kind.x-k8s.io/v1alpha4
|
||||
nodes:
|
||||
- role: control-plane
|
||||
image: "kindest/node:v1.23.0"
|
||||
- role: control-plane
|
||||
image: "kindest/node:v1.23.0"
|
||||
- role: control-plane
|
||||
image: "kindest/node:v1.23.0"
|
||||
- role: worker
|
||||
image: "kindest/node:v1.23.0"
|
||||
- role: worker
|
||||
image: "kindest/node:v1.23.0"
|
||||
13
.github/kind-cluster-1.25.yaml
vendored
Normal file
13
.github/kind-cluster-1.25.yaml
vendored
Normal file
@@ -0,0 +1,13 @@
|
||||
kind: Cluster
|
||||
apiVersion: kind.x-k8s.io/v1alpha4
|
||||
nodes:
|
||||
- role: control-plane
|
||||
image: kindest/node:v1.25.11
|
||||
- role: control-plane
|
||||
image: kindest/node:v1.25.11
|
||||
- role: control-plane
|
||||
image: kindest/node:v1.25.11
|
||||
- role: worker
|
||||
image: kindest/node:v1.25.11
|
||||
- role: worker
|
||||
image: kindest/node:v1.25.11
|
||||
13
.github/kind-cluster-1.26.yaml
vendored
Normal file
13
.github/kind-cluster-1.26.yaml
vendored
Normal file
@@ -0,0 +1,13 @@
|
||||
kind: Cluster
|
||||
apiVersion: kind.x-k8s.io/v1alpha4
|
||||
nodes:
|
||||
- role: control-plane
|
||||
image: "kindest/node:v1.26.6"
|
||||
- role: control-plane
|
||||
image: "kindest/node:v1.26.6"
|
||||
- role: control-plane
|
||||
image: "kindest/node:v1.26.6"
|
||||
- role: worker
|
||||
image: "kindest/node:v1.26.6"
|
||||
- role: worker
|
||||
image: "kindest/node:v1.26.6"
|
||||
13
.github/kind-cluster-1.27.yaml
vendored
Normal file
13
.github/kind-cluster-1.27.yaml
vendored
Normal file
@@ -0,0 +1,13 @@
|
||||
kind: Cluster
|
||||
apiVersion: kind.x-k8s.io/v1alpha4
|
||||
nodes:
|
||||
- role: control-plane
|
||||
image: "kindest/node:v1.27.3"
|
||||
- role: control-plane
|
||||
image: "kindest/node:v1.27.3"
|
||||
- role: control-plane
|
||||
image: "kindest/node:v1.27.3"
|
||||
- role: worker
|
||||
image: "kindest/node:v1.27.3"
|
||||
- role: worker
|
||||
image: "kindest/node:v1.27.3"
|
||||
37
.github/scripts/goreleaser-install.sh
vendored
Normal file
37
.github/scripts/goreleaser-install.sh
vendored
Normal file
@@ -0,0 +1,37 @@
|
||||
#!/bin/sh
|
||||
set -e
|
||||
|
||||
RELEASES_URL="https://github.com/goreleaser/goreleaser/releases"
|
||||
FILE_BASENAME="goreleaser"
|
||||
|
||||
test -z "$VERSION" && {
|
||||
echo "Unable to get goreleaser version." >&2
|
||||
exit 1
|
||||
}
|
||||
|
||||
test -z "$TMPDIR" && TMPDIR="$(mktemp -d)"
|
||||
TAR_FILE="$TMPDIR/${FILE_BASENAME}_$(uname -s)_$(uname -m).tar.gz"
|
||||
export TAR_FILE
|
||||
|
||||
(
|
||||
echo "Downloading GoReleaser $VERSION..."
|
||||
curl -sfLo "$TAR_FILE" \
|
||||
"$RELEASES_URL/download/$VERSION/${FILE_BASENAME}_$(uname -s)_$(uname -m).tar.gz"
|
||||
cd "$TMPDIR"
|
||||
curl -sfLo "checksums.txt" "$RELEASES_URL/download/$VERSION/checksums.txt"
|
||||
curl -sfLo "checksums.txt.sig" "$RELEASES_URL/download/$VERSION/checksums.txt.sig"
|
||||
echo "Verifying checksums..."
|
||||
sha256sum --ignore-missing --quiet --check checksums.txt
|
||||
if command -v cosign >/dev/null 2>&1; then
|
||||
echo "Verifying signatures..."
|
||||
COSIGN_EXPERIMENTAL=1 cosign verify-blob \
|
||||
--signature checksums.txt.sig \
|
||||
checksums.txt
|
||||
else
|
||||
echo "Could not verify signatures, cosign is not installed."
|
||||
fi
|
||||
)
|
||||
|
||||
tar -xf "$TAR_FILE" -O goreleaser > "$TMPDIR/goreleaser"
|
||||
rm "$TMPDIR/checksums.txt" "$TMPDIR/checksums.txt.sig"
|
||||
rm "$TAR_FILE"
|
||||
75
.github/workflows/codeql.yml
vendored
Normal file
75
.github/workflows/codeql.yml
vendored
Normal file
@@ -0,0 +1,75 @@
|
||||
# For most projects, this workflow file will not need changing; you simply need
|
||||
# to commit it to your repository.
|
||||
#
|
||||
# You may wish to alter this file to override the set of languages analyzed,
|
||||
# or to provide custom queries or build logic.
|
||||
#
|
||||
# ******** NOTE ********
|
||||
# We have attempted to detect the languages in your repository. Please check
|
||||
# the `language` matrix defined below to confirm you have the correct set of
|
||||
# supported CodeQL languages.
|
||||
#
|
||||
name: "CodeQL"
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
push:
|
||||
branches: [ "main" ]
|
||||
pull_request:
|
||||
# The branches below must be a subset of the branches above
|
||||
branches: [ "main" ]
|
||||
schedule:
|
||||
- cron: '24 13 * * 3'
|
||||
|
||||
jobs:
|
||||
analyze:
|
||||
name: Analyze
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
actions: read
|
||||
contents: read
|
||||
security-events: write
|
||||
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
language: [ 'go' ]
|
||||
# CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ]
|
||||
# Learn more about CodeQL language support at https://aka.ms/codeql-docs/language-support
|
||||
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v3
|
||||
|
||||
# Initializes the CodeQL tools for scanning.
|
||||
- name: Initialize CodeQL
|
||||
uses: github/codeql-action/init@v2
|
||||
with:
|
||||
languages: ${{ matrix.language }}
|
||||
# If you wish to specify custom queries, you can do so here or in a config file.
|
||||
# By default, queries listed here will override any specified in a config file.
|
||||
# Prefix the list here with "+" to use these queries and those in the config file.
|
||||
|
||||
# Details on CodeQL's query packs refer to : https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs
|
||||
# queries: security-extended,security-and-quality
|
||||
|
||||
|
||||
# Autobuild attempts to build any compiled languages (C/C++, C#, or Java).
|
||||
# If this step fails, then you should remove it and run the build manually (see below)
|
||||
- name: Autobuild
|
||||
uses: github/codeql-action/autobuild@v2
|
||||
|
||||
# ℹ️ Command-line programs to run using the OS shell.
|
||||
# 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun
|
||||
|
||||
# If the Autobuild fails above, remove it and uncomment the following three lines.
|
||||
# modify them (or add more) to build your code if your project, please refer to the EXAMPLE below for guidance.
|
||||
|
||||
# - run: |
|
||||
# echo "Run, Build Application using script"
|
||||
# ./location_of_script_within_repo/buildscript.sh
|
||||
|
||||
- name: Perform CodeQL Analysis
|
||||
uses: github/codeql-action/analyze@v2
|
||||
with:
|
||||
category: "/language:${{matrix.language}}"
|
||||
19
.github/workflows/on-main-push-charts.yaml
vendored
19
.github/workflows/on-main-push-charts.yaml
vendored
@@ -1,19 +0,0 @@
|
||||
name: Publish helm chart
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- "main"
|
||||
paths:
|
||||
- "charts/**"
|
||||
|
||||
jobs:
|
||||
publish-helm-chart:
|
||||
name: Publish latest chart
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
- name: Publish Helm chart
|
||||
uses: stefanprodan/helm-gh-pages@master
|
||||
with:
|
||||
token: ${{ secrets.GITHUB_TOKEN }}
|
||||
charts_dir: charts
|
||||
86
.github/workflows/on-main-push.yaml
vendored
86
.github/workflows/on-main-push.yaml
vendored
@@ -5,41 +5,81 @@ on:
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
|
||||
env:
|
||||
REGISTRY: ghcr.io
|
||||
IMAGE_NAME: ${{ github.repository }}
|
||||
|
||||
jobs:
|
||||
tag-scan-and-push-final-image:
|
||||
name: "Build, scan, and publish tagged image"
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: write
|
||||
packages: write
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
|
||||
- name: Find go version
|
||||
run: |
|
||||
GO_VERSION=$(awk '/^go/ {print $2};' go.mod)
|
||||
echo "::set-output name=version::${GO_VERSION}"
|
||||
id: awk_gomod
|
||||
- uses: actions/checkout@v3
|
||||
|
||||
- name: Ensure go version
|
||||
uses: actions/setup-go@v2
|
||||
uses: actions/setup-go@v4
|
||||
with:
|
||||
go-version: "${{ steps.awk_gomod.outputs.version }}"
|
||||
|
||||
- name: Login to DockerHub
|
||||
uses: docker/login-action@v1
|
||||
with:
|
||||
username: ${{ secrets.DOCKERHUB_USERNAME_WEAVEWORKSKUREDCI }}
|
||||
password: ${{ secrets.DOCKERHUB_TOKEN_WEAVEWORKSKUREDCI }}
|
||||
go-version-file: 'go.mod'
|
||||
check-latest: true
|
||||
|
||||
- name: Login to ghcr.io
|
||||
uses: docker/login-action@v1
|
||||
uses: docker/login-action@v2
|
||||
with:
|
||||
registry: ghcr.io
|
||||
username: weave-ghcr-bot
|
||||
password: ${{ secrets.KURED_WEAVE_GHCR_BOT_TOKEN }}
|
||||
registry: ${{ env.REGISTRY }}
|
||||
username: ${{ github.actor }}
|
||||
password: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
- name: Extract metadata (tags, labels) for Docker
|
||||
id: meta
|
||||
uses: docker/metadata-action@818d4b7b91585d195f67373fd9cb0332e31a7175
|
||||
with:
|
||||
images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
|
||||
|
||||
- name: Set up QEMU
|
||||
uses: docker/setup-qemu-action@v2
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v2
|
||||
|
||||
- name: Find current tag version
|
||||
run: echo "sha_short=$(git rev-parse --short HEAD)" >> $GITHUB_OUTPUT
|
||||
id: tags
|
||||
|
||||
- name: Setup GoReleaser
|
||||
run: make bootstrap-tools
|
||||
|
||||
- name: Build binaries
|
||||
run: make kured-release-snapshot
|
||||
env:
|
||||
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
COSIGN_EXPERIMENTAL: 1
|
||||
|
||||
- name: Build image
|
||||
run: |
|
||||
make DH_ORG="${{ github.repository_owner }}" image
|
||||
uses: docker/build-push-action@v4
|
||||
with:
|
||||
context: .
|
||||
platforms: linux/arm64, linux/amd64, linux/arm/v7, linux/arm/v6, linux/386
|
||||
push: true
|
||||
labels: ${{ steps.meta.outputs.labels }}
|
||||
tags: |
|
||||
${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ steps.tags.outputs.sha_short }}
|
||||
|
||||
- name: Publish image
|
||||
- name: Generate SBOM
|
||||
run: |
|
||||
make DH_ORG="${{ github.repository_owner }}" publish-image
|
||||
.tmp/syft ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ steps.tags.outputs.sha_short }} -o spdx > kured.sbom
|
||||
|
||||
- name: Sign and attest artifacts
|
||||
run: |
|
||||
.tmp/cosign sign -f -r ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ steps.tags.outputs.sha_short }}
|
||||
|
||||
.tmp/cosign sign-blob --output-signature kured.sbom.sig --output-certificate kured.sbom.pem kured.sbom
|
||||
|
||||
.tmp/cosign attest -f --type spdx --predicate kured.sbom ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ steps.tags.outputs.sha_short }}
|
||||
.tmp/cosign attach sbom --type spdx --sbom kured.sbom ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ steps.tags.outputs.sha_short }}
|
||||
env:
|
||||
COSIGN_EXPERIMENTAL: 1
|
||||
|
||||
78
.github/workflows/on-pr-charts.yaml
vendored
78
.github/workflows/on-pr-charts.yaml
vendored
@@ -1,78 +0,0 @@
|
||||
#This is just extra testing, for lint check, and basic installation
|
||||
#Those can fail earlier than functional tests (shorter tests)
|
||||
# and give developer feedback soon if they didn't test themselves
|
||||
name: PR - charts
|
||||
on:
|
||||
pull_request:
|
||||
paths:
|
||||
- "charts/**"
|
||||
jobs:
|
||||
# We create two jobs (with a matrix) instead of one to make those parallel.
|
||||
# We don't need to conditionally check if something has changed, due to github actions
|
||||
# tackling that for us.
|
||||
# Fail-fast ensures that if one of those matrix job fail, the other one gets cancelled.
|
||||
test-chart:
|
||||
name: Test helm chart changes
|
||||
runs-on: ubuntu-latest
|
||||
strategy:
|
||||
fail-fast: true
|
||||
matrix:
|
||||
test-action:
|
||||
- lint
|
||||
- install
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v2
|
||||
with:
|
||||
fetch-depth: "0"
|
||||
|
||||
- uses: actions/setup-python@v2
|
||||
with:
|
||||
python-version: 3.7
|
||||
|
||||
# Helm is already present in github actions, so do not re-install it
|
||||
- name: Setup chart testing
|
||||
uses: helm/chart-testing-action@v2.1.0
|
||||
|
||||
- name: Create default kind cluster
|
||||
uses: helm/kind-action@v1.2.0
|
||||
with:
|
||||
version: v0.11.0
|
||||
if: ${{ matrix.test-action == 'install' }}
|
||||
|
||||
- name: Run chart tests
|
||||
run: ct ${{ matrix.test-action }} --config .github/ct.yaml
|
||||
|
||||
# This doesn't re-use the ct actions, due to many limitations (auto tear down, no real testing)
|
||||
deploy-chart:
|
||||
name: Functional test of helm chart in its current state (needs published image of the helm chart)
|
||||
runs-on: ubuntu-latest
|
||||
needs: test-chart
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
|
||||
# Default name for helm/kind-action kind clusters is "chart-testing"
|
||||
- name: Create 1 node kind cluster
|
||||
uses: helm/kind-action@v1.2.0
|
||||
with:
|
||||
version: v0.11.0
|
||||
|
||||
- name: Deploy kured on default namespace with its helm chart
|
||||
run: |
|
||||
# Documented in official helm doc to live on the edge
|
||||
curl https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
|
||||
# Refresh bins
|
||||
hash -r
|
||||
helm install kured ./charts/kured/ --set configuration.period=1m --wait
|
||||
kubectl config set-context kind-chart-testing
|
||||
kubectl get ds --all-namespaces
|
||||
kubectl describe ds kured
|
||||
|
||||
- name: Test if successful deploy
|
||||
uses: nick-invision/retry@v2.6.0
|
||||
with:
|
||||
timeout_minutes: 10
|
||||
max_attempts: 10
|
||||
retry_wait_seconds: 10
|
||||
# DESIRED CURRENT READY UP-TO-DATE AVAILABLE should all be = to cluster_size
|
||||
command: "kubectl get ds kured | grep -E 'kured.*1.*1.*1.*1.*1'"
|
||||
296
.github/workflows/on-pr.yaml
vendored
296
.github/workflows/on-pr.yaml
vendored
@@ -6,24 +6,20 @@ on:
|
||||
jobs:
|
||||
pr-gotest:
|
||||
name: Run go tests
|
||||
runs-on: ubuntu-18.04
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: checkout
|
||||
uses: actions/checkout@v2
|
||||
- name: Find go version
|
||||
run: |
|
||||
GO_VERSION=$(awk '/^go/ {print $2};' go.mod)
|
||||
echo "::set-output name=version::${GO_VERSION}"
|
||||
id: awk_gomod
|
||||
uses: actions/checkout@v3
|
||||
- name: Ensure go version
|
||||
uses: actions/setup-go@v2
|
||||
uses: actions/setup-go@v4
|
||||
with:
|
||||
go-version: "${{ steps.awk_gomod.outputs.version }}"
|
||||
go-version-file: 'go.mod'
|
||||
check-latest: true
|
||||
- name: run tests
|
||||
run: go test -json ./... > test.json
|
||||
- name: Annotate tests
|
||||
if: always()
|
||||
uses: guyarb/golang-test-annoations@v0.5.0
|
||||
uses: guyarb/golang-test-annoations@v0.7.0
|
||||
with:
|
||||
test-results: test.json
|
||||
|
||||
@@ -31,7 +27,7 @@ jobs:
|
||||
name: Lint bash code with shellcheck
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
- uses: actions/checkout@v3
|
||||
- name: Run ShellCheck
|
||||
uses: bewuethr/shellcheck-action@v2
|
||||
|
||||
@@ -39,22 +35,18 @@ jobs:
|
||||
name: Lint golang code
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
- name: Find go version
|
||||
run: |
|
||||
GO_VERSION=$(awk '/^go/ {print $2};' go.mod)
|
||||
echo "::set-output name=version::${GO_VERSION}"
|
||||
id: awk_gomod
|
||||
- uses: actions/checkout@v3
|
||||
- name: Ensure go version
|
||||
uses: actions/setup-go@v2
|
||||
uses: actions/setup-go@v4
|
||||
with:
|
||||
go-version: "${{ steps.awk_gomod.outputs.version }}"
|
||||
go-version-file: 'go.mod'
|
||||
check-latest: true
|
||||
- name: Lint cmd folder
|
||||
uses: Jerome1337/golint-action@v1.0.2
|
||||
uses: Jerome1337/golint-action@v1.0.3
|
||||
with:
|
||||
golint-path: './cmd/...'
|
||||
- name: Lint pkg folder
|
||||
uses: Jerome1337/golint-action@v1.0.2
|
||||
uses: Jerome1337/golint-action@v1.0.3
|
||||
with:
|
||||
golint-path: './pkg/...'
|
||||
|
||||
@@ -62,14 +54,14 @@ jobs:
|
||||
name: Check docs for incorrect links
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
- uses: actions/checkout@v3
|
||||
- name: Link Checker
|
||||
id: lc
|
||||
uses: peter-evans/link-checker@v1
|
||||
uses: lycheeverse/lychee-action@ec3ed119d4f44ad2673a7232460dc7dff59d2421
|
||||
env:
|
||||
GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN}}
|
||||
with:
|
||||
args: -r *.md *.yaml */*/*.go -x .cluster.local
|
||||
- name: Fail if there were link errors
|
||||
run: exit ${{ steps.lc.outputs.exit_code }}
|
||||
args: --verbose --no-progress '*.md' '*.yaml' '*/*/*.go' --exclude-link-local
|
||||
fail: true
|
||||
|
||||
# This should not be made a mandatory test
|
||||
# It is only used to make us aware of any potential security failure, that
|
||||
@@ -78,20 +70,32 @@ jobs:
|
||||
name: Build image and scan it against known vulnerabilities
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
- name: Find go version
|
||||
run: |
|
||||
GO_VERSION=$(awk '/^go/ {print $2};' go.mod)
|
||||
echo "::set-output name=version::${GO_VERSION}"
|
||||
id: awk_gomod
|
||||
- uses: actions/checkout@v3
|
||||
- name: Ensure go version
|
||||
uses: actions/setup-go@v2
|
||||
uses: actions/setup-go@v4
|
||||
with:
|
||||
go-version: "${{ steps.awk_gomod.outputs.version }}"
|
||||
- run: make DH_ORG="${{ github.repository_owner }}" VERSION="${{ github.sha }}" image
|
||||
- uses: Azure/container-scan@v0
|
||||
go-version-file: 'go.mod'
|
||||
check-latest: true
|
||||
- name: Set up QEMU
|
||||
uses: docker/setup-qemu-action@v2
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v2
|
||||
- name: Setup GoReleaser
|
||||
run: make bootstrap-tools
|
||||
- name: Find current tag version
|
||||
run: echo "sha_short=$(git rev-parse --short HEAD)" >> $GITHUB_OUTPUT
|
||||
id: tags
|
||||
- name: Build image
|
||||
run: VERSION="${{ steps.tags.outputs.sha_short }}" make image
|
||||
- name: Run Trivy vulnerability scanner
|
||||
uses: aquasecurity/trivy-action@41f05d9ecffa2ed3f1580af306000f734b733e54
|
||||
with:
|
||||
image-name: docker.io/${{ github.repository_owner }}/kured:${{ github.sha }}
|
||||
image-ref: 'ghcr.io/${{ github.repository }}:${{ steps.tags.outputs.sha_short }}'
|
||||
format: 'table'
|
||||
exit-code: '1'
|
||||
ignore-unfixed: true
|
||||
vuln-type: 'os,library'
|
||||
severity: 'CRITICAL,HIGH'
|
||||
|
||||
# This ensures the latest code works with the manifests built from tree.
|
||||
# It is useful for two things:
|
||||
@@ -106,24 +110,29 @@ jobs:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
kubernetes:
|
||||
- "1.21"
|
||||
- "1.22"
|
||||
- "1.23"
|
||||
- "1.25"
|
||||
- "1.26"
|
||||
- "1.27"
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
- name: Find go version
|
||||
run: |
|
||||
GO_VERSION=$(awk '/^go/ {print $2};' go.mod)
|
||||
echo "::set-output name=version::${GO_VERSION}"
|
||||
id: awk_gomod
|
||||
- uses: actions/checkout@v3
|
||||
- name: Ensure go version
|
||||
uses: actions/setup-go@v2
|
||||
uses: actions/setup-go@v4
|
||||
with:
|
||||
go-version: "${{ steps.awk_gomod.outputs.version }}"
|
||||
go-version-file: 'go.mod'
|
||||
check-latest: true
|
||||
- name: Set up QEMU
|
||||
uses: docker/setup-qemu-action@v2
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v2
|
||||
- name: Setup GoReleaser
|
||||
run: make bootstrap-tools
|
||||
- name: Find current tag version
|
||||
run: echo "sha_short=$(git rev-parse --short HEAD)" >> $GITHUB_OUTPUT
|
||||
id: tags
|
||||
- name: Build artifacts
|
||||
run: |
|
||||
make DH_ORG="${{ github.repository_owner }}" VERSION="${{ github.sha }}" image
|
||||
make DH_ORG="${{ github.repository_owner }}" VERSION="${{ github.sha }}" manifest
|
||||
VERSION="${{ steps.tags.outputs.sha_short }}" make image
|
||||
VERSION="${{ steps.tags.outputs.sha_short }}" make manifest
|
||||
|
||||
- name: Workaround "Failed to attach 1 to compat systemd cgroup /actions_job/..." on gh actions
|
||||
run: |
|
||||
@@ -136,13 +145,13 @@ jobs:
|
||||
|
||||
# Default name for helm/kind-action kind clusters is "chart-testing"
|
||||
- name: Create kind cluster with 5 nodes
|
||||
uses: helm/kind-action@v1.2.0
|
||||
uses: helm/kind-action@v1.8.0
|
||||
with:
|
||||
config: .github/kind-cluster-${{ matrix.kubernetes }}.yaml
|
||||
version: v0.11.0
|
||||
version: v0.14.0
|
||||
|
||||
- name: Preload previously built images onto kind cluster
|
||||
run: kind load docker-image docker.io/${{ github.repository_owner }}/kured:${{ github.sha }} --name chart-testing
|
||||
run: kind load docker-image ghcr.io/${{ github.repository }}:${{ steps.tags.outputs.sha_short }} --name chart-testing
|
||||
|
||||
- name: Do not wait for an hour before detecting the rebootSentinel
|
||||
run: |
|
||||
@@ -153,7 +162,7 @@ jobs:
|
||||
kubectl apply -f kured-rbac.yaml && kubectl apply -f kured-ds.yaml
|
||||
|
||||
- name: Ensure kured is ready
|
||||
uses: nick-invision/retry@v2.6.0
|
||||
uses: nick-invision/retry@v2.8.3
|
||||
with:
|
||||
timeout_minutes: 10
|
||||
max_attempts: 10
|
||||
@@ -171,32 +180,47 @@ jobs:
|
||||
run: |
|
||||
./tests/kind/follow-coordinated-reboot.sh
|
||||
|
||||
scenario-prom-helm:
|
||||
name: Test prometheus with latest code from HEAD (=overrides image of the helm chart)
|
||||
|
||||
|
||||
# This ensures the latest code works with the manifests built from tree.
|
||||
# It is useful for two things:
|
||||
# - Test manifests changes (obviously), ensuring they don't break existing clusters
|
||||
# - Ensure manifests work with the latest versions even with no manifest change
|
||||
# (compared to helm charts, manifests cannot easily template changes based on versions)
|
||||
# Helm charts are _trailing_ releases, while manifests are done during development.
|
||||
# Concurrency = 2
|
||||
e2e-manifests-concurent:
|
||||
name: End-to-End test with kured with code and manifests from HEAD (concurrent)
|
||||
runs-on: ubuntu-latest
|
||||
# only build with oldest and newest supported, it should be good enough.
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
kubernetes:
|
||||
- "1.21"
|
||||
- "1.25"
|
||||
- "1.26"
|
||||
- "1.27"
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
- name: Find go version
|
||||
run: |
|
||||
GO_VERSION=$(awk '/^go/ {print $2};' go.mod)
|
||||
echo "::set-output name=version::${GO_VERSION}"
|
||||
id: awk_gomod
|
||||
- uses: actions/checkout@v3
|
||||
- name: Ensure go version
|
||||
uses: actions/setup-go@v2
|
||||
uses: actions/setup-go@v4
|
||||
with:
|
||||
go-version: "${{ steps.awk_gomod.outputs.version }}"
|
||||
go-version-file: 'go.mod'
|
||||
check-latest: true
|
||||
- name: Set up QEMU
|
||||
uses: docker/setup-qemu-action@v2
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v2
|
||||
- name: Setup GoReleaser
|
||||
run: make bootstrap-tools
|
||||
- name: Find current tag version
|
||||
run: echo "sha_short=$(git rev-parse --short HEAD)" >> $GITHUB_OUTPUT
|
||||
id: tags
|
||||
- name: Build artifacts
|
||||
run: |
|
||||
make DH_ORG="${{ github.repository_owner }}" VERSION="${{ github.sha }}" image
|
||||
make DH_ORG="${{ github.repository_owner }}" VERSION="${{ github.sha }}" helm-chart
|
||||
VERSION="${{ steps.tags.outputs.sha_short }}" make image
|
||||
VERSION="${{ steps.tags.outputs.sha_short }}" make manifest
|
||||
|
||||
- name: Workaround 'Failed to attach 1 to compat systemd cgroup /actions_job/...' on gh actions
|
||||
- name: Workaround "Failed to attach 1 to compat systemd cgroup /actions_job/..." on gh actions
|
||||
run: |
|
||||
sudo bash << EOF
|
||||
cp /etc/docker/daemon.json /etc/docker/daemon.json.old
|
||||
@@ -206,131 +230,39 @@ jobs:
|
||||
EOF
|
||||
|
||||
# Default name for helm/kind-action kind clusters is "chart-testing"
|
||||
- name: Create 1 node kind cluster
|
||||
uses: helm/kind-action@v1.2.0
|
||||
- name: Create kind cluster with 5 nodes
|
||||
uses: helm/kind-action@v1.8.0
|
||||
with:
|
||||
version: v0.11.0
|
||||
config: .github/kind-cluster-${{ matrix.kubernetes }}.yaml
|
||||
version: v0.14.0
|
||||
|
||||
- name: Preload previously built images onto kind cluster
|
||||
run: kind load docker-image docker.io/${{ github.repository_owner }}/kured:${{ github.sha }} --name chart-testing
|
||||
run: kind load docker-image ghcr.io/${{ github.repository }}:${{ steps.tags.outputs.sha_short }} --name chart-testing
|
||||
|
||||
- name: Deploy kured on default namespace with its helm chart
|
||||
- name: Do not wait for an hour before detecting the rebootSentinel
|
||||
run: |
|
||||
# Documented in official helm doc to live on the edge
|
||||
curl https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
|
||||
# Refresh bins
|
||||
hash -r
|
||||
helm install kured ./charts/kured/ --wait --values ./charts/kured/ci/prometheus-values.yaml
|
||||
kubectl config set-context kind-chart-testing
|
||||
kubectl get ds --all-namespaces
|
||||
kubectl describe ds kured
|
||||
sed -i 's/#\(.*\)--period=1h/\1--period=30s/g' kured-ds.yaml
|
||||
sed -i 's/#\(.*\)--concurrency=1/\1--concurrency=2/g' kured-ds.yaml
|
||||
|
||||
- name: Install kured with kubectl
|
||||
run: |
|
||||
kubectl apply -f kured-rbac.yaml && kubectl apply -f kured-ds.yaml
|
||||
|
||||
- name: Ensure kured is ready
|
||||
uses: nick-invision/retry@v2.6.0
|
||||
uses: nick-invision/retry@v2.8.3
|
||||
with:
|
||||
timeout_minutes: 10
|
||||
max_attempts: 10
|
||||
retry_wait_seconds: 60
|
||||
# DESIRED CURRENT READY UP-TO-DATE AVAILABLE
|
||||
command: "kubectl get ds kured | grep -E 'kured.*1.*1.*1.*1.*1' "
|
||||
|
||||
- name: Get metrics (healthy)
|
||||
uses: nick-invision/retry@v2.6.0
|
||||
with:
|
||||
timeout_minutes: 2
|
||||
max_attempts: 12
|
||||
retry_wait_seconds: 5
|
||||
command: "./tests/kind/test-metrics.sh 0"
|
||||
# DESIRED CURRENT READY UP-TO-DATE AVAILABLE should all be = to cluster_size
|
||||
command: "kubectl get ds -n kube-system kured | grep -E 'kured.*5.*5.*5.*5.*5'"
|
||||
|
||||
- name: Create reboot sentinel files
|
||||
run: |
|
||||
./tests/kind/create-reboot-sentinels.sh
|
||||
|
||||
- name: Get metrics (need reboot)
|
||||
uses: nick-invision/retry@v2.6.0
|
||||
with:
|
||||
timeout_minutes: 15
|
||||
max_attempts: 10
|
||||
retry_wait_seconds: 60
|
||||
command: "./tests/kind/test-metrics.sh 1"
|
||||
|
||||
|
||||
# TEMPLATE Scenario testing.
|
||||
# Note: keep in mind that the helm chart's appVersion is overriden to test your HEAD of the branch,
|
||||
# if you `make helm-chart`.
|
||||
# This will allow you to test properly your scenario and not use an existing image which will not
|
||||
# contain your feature.
|
||||
|
||||
# scenario-<REPLACETHIS>-helm:
|
||||
# #example: Testing <REPLACETHIS> with helm chart and code from HEAD"
|
||||
# name: "<REPLACETHIS>"
|
||||
# runs-on: ubuntu-latest
|
||||
# strategy:
|
||||
# fail-fast: false
|
||||
# # You can define your own kubernetes versions. For example if your helm chart change should behave differently with different kubernetes versions.
|
||||
# matrix:
|
||||
# kubernetes:
|
||||
# - "1.20"
|
||||
# steps:
|
||||
# - uses: actions/checkout@v2
|
||||
# - name: Find go version
|
||||
# run: |
|
||||
# GO_VERSION=$(awk '/^go/ {print $2};' go.mod)
|
||||
# echo "::set-output name=version::${GO_VERSION}"
|
||||
# id: awk_gomod
|
||||
# - name: Ensure go version
|
||||
# uses: actions/setup-go@v2
|
||||
# with:
|
||||
# go-version: "${{ steps.awk_gomod.outputs.version }}"
|
||||
# - name: Build artifacts
|
||||
# run: |
|
||||
# make DH_ORG="${{ github.repository_owner }}" VERSION="${{ github.sha }}" image
|
||||
# make DH_ORG="${{ github.repository_owner }}" VERSION="${{ github.sha }}" helm-chart
|
||||
#
|
||||
# - name: "Workaround 'Failed to attach 1 to compat systemd cgroup /actions_job/...' on gh actions"
|
||||
# run: |
|
||||
# sudo bash << EOF
|
||||
# cp /etc/docker/daemon.json /etc/docker/daemon.json.old
|
||||
# echo '{}' > /etc/docker/daemon.json
|
||||
# systemctl restart docker || journalctl --no-pager -n 500
|
||||
# systemctl status docker
|
||||
# EOF
|
||||
#
|
||||
# # Default name for helm/kind-action kind clusters is "chart-testing"
|
||||
# - name: Create 5 node kind cluster
|
||||
# uses: helm/kind-action@master
|
||||
# with:
|
||||
# config: .github/kind-cluster-${{ matrix.kubernetes }}.yaml
|
||||
#
|
||||
# - name: Preload previously built images onto kind cluster
|
||||
# run: kind load docker-image docker.io/${{ github.repository_owner }}/kured:${{ github.sha }} --name chart-testing
|
||||
#
|
||||
# - name: Deploy kured on default namespace with its helm chart
|
||||
# run: |
|
||||
# # Documented in official helm doc to live on the edge
|
||||
# curl https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
|
||||
# # Refresh bins
|
||||
# hash -r
|
||||
# helm install kured ./charts/kured/ --wait --values ./charts/kured/ci/<REPLACETHIS>-values.yaml
|
||||
# kubectl config set-context kind-chart-testing
|
||||
# kubectl get ds --all-namespaces
|
||||
# kubectl describe ds kured
|
||||
#
|
||||
# - name: Ensure kured is ready
|
||||
# uses: nick-invision/retry@v2.6.0
|
||||
# with:
|
||||
# timeout_minutes: 10
|
||||
# max_attempts: 10
|
||||
# retry_wait_seconds: 60
|
||||
# # DESIRED CURRENT READY UP-TO-DATE AVAILABLE should all be = 5
|
||||
# command: "kubectl get ds kured | grep -E 'kured.*5.*5.*5.*5.*5' "
|
||||
#
|
||||
# - name: Create reboot sentinel files
|
||||
# run: |
|
||||
# ./tests/kind/create-reboot-sentinels.sh
|
||||
#
|
||||
# - name: Test <REPLACETHIS>
|
||||
# env:
|
||||
# DEBUG: true
|
||||
# run: |
|
||||
# <TODO>
|
||||
- name: Follow reboot until success
|
||||
env:
|
||||
DEBUG: true
|
||||
run: |
|
||||
./tests/kind/follow-coordinated-reboot.sh
|
||||
|
||||
97
.github/workflows/on-tag.yaml
vendored
97
.github/workflows/on-tag.yaml
vendored
@@ -7,43 +7,94 @@ on:
|
||||
push:
|
||||
tags:
|
||||
- "*"
|
||||
|
||||
env:
|
||||
REGISTRY: ghcr.io
|
||||
IMAGE_NAME: ${{ github.repository }}
|
||||
|
||||
jobs:
|
||||
tag-scan-and-push-final-image:
|
||||
name: "Build, scan, and publish tagged image"
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: write
|
||||
packages: write
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
- name: Find go version
|
||||
run: |
|
||||
GO_VERSION=$(awk '/^go/ {print $2};' go.mod)
|
||||
echo "::set-output name=version::${GO_VERSION}"
|
||||
id: awk_gomod
|
||||
- uses: actions/checkout@v3
|
||||
- name: Ensure go version
|
||||
uses: actions/setup-go@v2
|
||||
uses: actions/setup-go@v4
|
||||
with:
|
||||
go-version: "${{ steps.awk_gomod.outputs.version }}"
|
||||
go-version-file: 'go.mod'
|
||||
check-latest: true
|
||||
- name: Find current tag version
|
||||
run: echo "::set-output name=version::${GITHUB_REF#refs/tags/}"
|
||||
run: echo "version=${GITHUB_REF#refs/tags/}" >> $GITHUB_OUTPUT
|
||||
id: tags
|
||||
- run: |
|
||||
make DH_ORG="${{ github.repository_owner }}" VERSION="${{ steps.tags.outputs.version }}" image
|
||||
- uses: Azure/container-scan@v0
|
||||
- name: Set up QEMU
|
||||
uses: docker/setup-qemu-action@v2
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v2
|
||||
- name: Setup GoReleaser
|
||||
run: make bootstrap-tools
|
||||
- name: Build binaries
|
||||
run: make kured-release-tag
|
||||
env:
|
||||
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
COSIGN_EXPERIMENTAL: 1
|
||||
- name: Build single image for scan
|
||||
uses: docker/build-push-action@v4
|
||||
with:
|
||||
image-name: docker.io/${{ github.repository_owner }}/kured:${{ steps.tags.outputs.version }}
|
||||
context: .
|
||||
platforms: linux/amd64
|
||||
push: false
|
||||
load: true
|
||||
tags: |
|
||||
${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ steps.tags.outputs.version }}
|
||||
|
||||
- name: Login to DockerHub
|
||||
uses: docker/login-action@v1
|
||||
- name: Run Trivy vulnerability scanner
|
||||
uses: aquasecurity/trivy-action@41f05d9ecffa2ed3f1580af306000f734b733e54
|
||||
with:
|
||||
username: ${{ secrets.DOCKERHUB_USERNAME_WEAVEWORKSKUREDCI }}
|
||||
password: ${{ secrets.DOCKERHUB_TOKEN_WEAVEWORKSKUREDCI }}
|
||||
image-ref: '${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ steps.tags.outputs.version }}'
|
||||
format: 'table'
|
||||
exit-code: '1'
|
||||
ignore-unfixed: true
|
||||
vuln-type: 'os,library'
|
||||
severity: 'CRITICAL,HIGH'
|
||||
|
||||
- name: Login to ghcr.io
|
||||
uses: docker/login-action@v1
|
||||
uses: docker/login-action@v2
|
||||
with:
|
||||
registry: ghcr.io
|
||||
username: weave-ghcr-bot
|
||||
password: ${{ secrets.KURED_WEAVE_GHCR_BOT_TOKEN }}
|
||||
registry: ${{ env.REGISTRY }}
|
||||
username: ${{ github.actor }}
|
||||
password: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
- name: Publish image
|
||||
- name: Extract metadata (tags, labels) for Docker
|
||||
id: meta
|
||||
uses: docker/metadata-action@818d4b7b91585d195f67373fd9cb0332e31a7175
|
||||
with:
|
||||
images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
|
||||
|
||||
- name: Build release images
|
||||
uses: docker/build-push-action@v4
|
||||
with:
|
||||
context: .
|
||||
platforms: linux/arm64, linux/amd64, linux/arm/v7, linux/arm/v6, linux/386
|
||||
push: true
|
||||
labels: ${{ steps.meta.outputs.labels }}
|
||||
tags: |
|
||||
${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ steps.tags.outputs.version }}
|
||||
|
||||
- name: Generate SBOM
|
||||
run: |
|
||||
make DH_ORG="${{ github.repository_owner }}" VERSION="${{ steps.tags.outputs.version }}" publish-image
|
||||
.tmp/syft ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ steps.tags.outputs.version }} -o spdx > kured.sbom
|
||||
|
||||
- name: Sign and attest artifacts
|
||||
run: |
|
||||
.tmp/cosign sign -f -r ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ steps.tags.outputs.version }}
|
||||
|
||||
.tmp/cosign sign-blob --output-signature kured.sbom.sig kured.sbom
|
||||
|
||||
.tmp/cosign attest -f --type spdx --predicate kured.sbom ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ steps.tags.outputs.version }}
|
||||
.tmp/cosign attach sbom --type spdx --sbom kured.sbom ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ steps.tags.outputs.version }}
|
||||
env:
|
||||
COSIGN_EXPERIMENTAL: 1
|
||||
|
||||
122
.github/workflows/periodics-daily.yaml
vendored
122
.github/workflows/periodics-daily.yaml
vendored
@@ -7,15 +7,15 @@ on:
|
||||
jobs:
|
||||
periodics-gotest:
|
||||
name: Run go tests
|
||||
runs-on: ubuntu-18.04
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: checkout
|
||||
uses: actions/checkout@v2
|
||||
uses: actions/checkout@v3
|
||||
- name: run tests
|
||||
run: go test -json ./... > test.json
|
||||
- name: Annotate tests
|
||||
if: always()
|
||||
uses: guyarb/golang-test-annoations@v0.5.0
|
||||
uses: guyarb/golang-test-annoations@v0.7.0
|
||||
with:
|
||||
test-results: test.json
|
||||
|
||||
@@ -25,7 +25,7 @@ jobs:
|
||||
steps:
|
||||
# Stale by default waits for 60 days before marking PR/issues as stale, and closes them after 21 days.
|
||||
# Do not expire the first issues that would allow the community to grow.
|
||||
- uses: actions/stale@v4
|
||||
- uses: actions/stale@v8
|
||||
with:
|
||||
repo-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
stale-issue-message: 'This issue was automatically considered stale due to lack of activity. Please update it and/or join our slack channels to promote it, before it automatically closes (in 7 days).'
|
||||
@@ -39,98 +39,42 @@ jobs:
|
||||
name: Check docs for incorrect links
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
- uses: actions/checkout@v3
|
||||
- name: Link Checker
|
||||
id: lc
|
||||
uses: peter-evans/link-checker@v1
|
||||
uses: lycheeverse/lychee-action@ec3ed119d4f44ad2673a7232460dc7dff59d2421
|
||||
env:
|
||||
GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN}}
|
||||
with:
|
||||
args: -r *.md *.yaml */*/*.go -x .cluster.local
|
||||
- name: Fail if there were link errors
|
||||
run: exit ${{ steps.lc.outputs.exit_code }}
|
||||
args: --verbose --no-progress '*.md' '*.yaml' '*/*/*.go' --exclude-link-local
|
||||
fail: true
|
||||
|
||||
vuln-scan:
|
||||
name: Build image and scan it against known vulnerabilities
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
- name: Find go version
|
||||
run: |
|
||||
GO_VERSION=$(awk '/^go/ {print $2};' go.mod)
|
||||
echo "::set-output name=version::${GO_VERSION}"
|
||||
id: awk_gomod
|
||||
- uses: actions/checkout@v3
|
||||
- name: Ensure go version
|
||||
uses: actions/setup-go@v2
|
||||
uses: actions/setup-go@v4
|
||||
with:
|
||||
go-version: "${{ steps.awk_gomod.outputs.version }}"
|
||||
- run: make DH_ORG="${{ github.repository_owner }}" VERSION="${{ github.sha }}" image
|
||||
- uses: Azure/container-scan@v0
|
||||
go-version-file: 'go.mod'
|
||||
check-latest: true
|
||||
- name: Set up QEMU
|
||||
uses: docker/setup-qemu-action@v2
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v2
|
||||
- name: Setup GoReleaser
|
||||
run: make bootstrap-tools
|
||||
- name: Find current tag version
|
||||
run: echo "sha_short=$(git rev-parse --short HEAD)" >> $GITHUB_OUTPUT
|
||||
id: tags
|
||||
- name: Build artifacts
|
||||
run: VERSION="${{ steps.tags.outputs.sha_short }}" make image
|
||||
- name: Run Trivy vulnerability scanner
|
||||
uses: aquasecurity/trivy-action@41f05d9ecffa2ed3f1580af306000f734b733e54
|
||||
with:
|
||||
image-name: docker.io/${{ github.repository_owner }}/kured:${{ github.sha }}
|
||||
|
||||
deploy-helm:
|
||||
name: Ensure our currently released helm chart works on all kubernetes versions
|
||||
runs-on: ubuntu-latest
|
||||
# only build with oldest and newest supported, it should be good enough.
|
||||
strategy:
|
||||
matrix:
|
||||
kubernetes:
|
||||
- "1.21"
|
||||
- "1.22"
|
||||
- "1.23"
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
- name: Find go version
|
||||
run: |
|
||||
GO_VERSION=$(awk '/^go/ {print $2};' go.mod)
|
||||
echo "::set-output name=version::${GO_VERSION}"
|
||||
id: awk_gomod
|
||||
- name: Ensure go version
|
||||
uses: actions/setup-go@v2
|
||||
with:
|
||||
go-version: "${{ steps.awk_gomod.outputs.version }}"
|
||||
|
||||
- name: "Workaround 'Failed to attach 1 to compat systemd cgroup /actions_job/...' on gh actions"
|
||||
run: |
|
||||
sudo bash << EOF
|
||||
cp /etc/docker/daemon.json /etc/docker/daemon.json.old
|
||||
echo '{}' > /etc/docker/daemon.json
|
||||
systemctl restart docker || journalctl --no-pager -n 500
|
||||
systemctl status docker
|
||||
EOF
|
||||
|
||||
# Default name for helm/kind-action kind clusters is "chart-testing"
|
||||
- name: Create 5 node kind cluster
|
||||
uses: helm/kind-action@v1.2.0
|
||||
with:
|
||||
config: .github/kind-cluster-${{ matrix.kubernetes }}.yaml
|
||||
version: v0.11.0
|
||||
|
||||
- name: Deploy kured on default namespace with its helm chart
|
||||
run: |
|
||||
# Documented in official helm doc to live on the edge
|
||||
curl https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
|
||||
# Refresh bins
|
||||
hash -r
|
||||
helm install kured ./charts/kured/ --set configuration.period=1m
|
||||
kubectl config set-context kind-chart-testing
|
||||
kubectl get ds --all-namespaces
|
||||
kubectl describe ds kured
|
||||
|
||||
- name: Ensure kured is ready
|
||||
uses: nick-invision/retry@v2.6.0
|
||||
with:
|
||||
timeout_minutes: 10
|
||||
max_attempts: 10
|
||||
retry_wait_seconds: 60
|
||||
# DESIRED CURRENT READY UP-TO-DATE AVAILABLE should all be = 5
|
||||
command: "kubectl get ds kured | grep -E 'kured.*5.*5.*5.*5.*5' "
|
||||
|
||||
- name: Create reboot sentinel files
|
||||
run: |
|
||||
./tests/kind/create-reboot-sentinels.sh
|
||||
|
||||
- name: Follow reboot until success
|
||||
env:
|
||||
DEBUG: true
|
||||
run: |
|
||||
./tests/kind/follow-coordinated-reboot.sh
|
||||
image-ref: 'ghcr.io/${{ github.repository }}:${{ steps.tags.outputs.sha_short }}'
|
||||
format: 'table'
|
||||
exit-code: '1'
|
||||
ignore-unfixed: true
|
||||
vuln-type: 'os,library'
|
||||
severity: 'CRITICAL,HIGH'
|
||||
|
||||
2
.gitignore
vendored
2
.gitignore
vendored
@@ -1,3 +1,5 @@
|
||||
cmd/kured/kured
|
||||
vendor
|
||||
build
|
||||
dist
|
||||
.tmp
|
||||
|
||||
32
.goreleaser.yml
Normal file
32
.goreleaser.yml
Normal file
@@ -0,0 +1,32 @@
|
||||
project_name: kured
|
||||
before:
|
||||
hooks:
|
||||
- go mod tidy
|
||||
builds:
|
||||
- main: ./cmd/kured
|
||||
env:
|
||||
- CGO_ENABLED=0
|
||||
goos:
|
||||
- linux
|
||||
goarch:
|
||||
- amd64
|
||||
- arm64
|
||||
- arm
|
||||
- "386"
|
||||
goarm:
|
||||
- "6"
|
||||
- "7"
|
||||
ldflags:
|
||||
- -s -w -X main.version={{ if .IsSnapshot }}{{ .ShortCommit }}{{ else }}{{ .Version }}{{ end }}
|
||||
mod_timestamp: "{{ .CommitTimestamp }}"
|
||||
flags:
|
||||
- -trimpath
|
||||
|
||||
snapshot:
|
||||
name_template: "{{ .ShortCommit }}"
|
||||
|
||||
release:
|
||||
disable: true
|
||||
|
||||
changelog:
|
||||
skip: true
|
||||
6
.lycheeignore
Normal file
6
.lycheeignore
Normal file
@@ -0,0 +1,6 @@
|
||||
app.fossa.com
|
||||
cluster.local
|
||||
hooks.slack.com
|
||||
localhost
|
||||
slack://
|
||||
teams://
|
||||
3
CODE_OF_CONDUCT.md
Normal file
3
CODE_OF_CONDUCT.md
Normal file
@@ -0,0 +1,3 @@
|
||||
## Kured Community Code of Conduct
|
||||
|
||||
Kured follows the [CNCF Code of Conduct](https://github.com/cncf/foundation/blob/main/code-of-conduct.md).
|
||||
242
CONTRIBUTING.md
Normal file
242
CONTRIBUTING.md
Normal file
@@ -0,0 +1,242 @@
|
||||
# Developing `kured`
|
||||
|
||||
We love contributions to `kured`, no matter if you are [helping out on
|
||||
Slack][slack], reporting or triaging [issues][issues] or contributing code
|
||||
to `kured`.
|
||||
|
||||
In any case, it will make sense to familiarise yourself with the main
|
||||
[README][readme] to understand the different features and options, which is
|
||||
helpful for testing. The "building" section in particular makes sense if
|
||||
you are planning to contribute code.
|
||||
|
||||
[slack]: README.md#getting-help
|
||||
[issues]: https://github.com/kubereboot/kured/issues
|
||||
[readme]: README.md
|
||||
|
||||
## Certificate of Origin
|
||||
|
||||
By contributing to this project you agree to the Developer Certificate of
|
||||
Origin (DCO). This document was created by the Linux Kernel community and is a
|
||||
simple statement that you, as a contributor, have the legal right to make the
|
||||
contribution.
|
||||
|
||||
We require all commits to be signed. By signing off with your signature, you
|
||||
certify that you wrote the patch or otherwise have the right to contribute the
|
||||
material by the rules of the [DCO](DCO):
|
||||
|
||||
`Signed-off-by: Jane Doe <jane.doe@example.com>`
|
||||
|
||||
The signature must contain your real name
|
||||
(sorry, no pseudonyms or anonymous contributions)
|
||||
If your `user.name` and `user.email` are configured in your Git config,
|
||||
you can sign your commit automatically with `git commit -s`.
|
||||
|
||||
## Kured Repositories
|
||||
|
||||
All Kured repositories are kept under <https://github.com/kubereboot>. To find the code and work on the individual pieces that make Kured, here is our overview:
|
||||
|
||||
| Repositories | Contents |
|
||||
| --------------------------------------- | ------------------------- |
|
||||
| <https://github.com/kubereboot/kured> | Kured operator itself |
|
||||
| <https://github.com/kubereboot/charts> | Helm chart |
|
||||
| <https://github.com/kubereboot/website> | website and documentation |
|
||||
|
||||
## Regular development activities
|
||||
|
||||
### Prepare environment
|
||||
|
||||
Please run `make bootstrap-tools` once on a fresh repository clone to download several needed tools, e.g. GoReleaser.
|
||||
|
||||
### Updating k8s support
|
||||
|
||||
Whenever we want to update e.g. the `kubectl` or `client-go` dependencies,
|
||||
some RBAC changes might be necessary too.
|
||||
|
||||
This is what it took to support Kubernetes 1.14:
|
||||
<https://github.com/kubereboot/kured/pull/75>
|
||||
|
||||
That the process can be more involved based on kubernetes changes.
|
||||
For example, k8s 1.10 changes to apps triggered the following commits:
|
||||
|
||||
b3f9ddf: Bump client-go for optimum k8s 1.10 compatibility
|
||||
bc3f28d: Move deployment manifest to apps/v1
|
||||
908998a: Update RBAC permissions for kubectl v1.10.3
|
||||
efbb0c3: Document version compatibility in release notes
|
||||
5731b98: Add warning to Dockerfile re: upgrading kubectl
|
||||
|
||||
Search the git log for inspiration for your cases.
|
||||
|
||||
Please update our `.github/workflows` with the new k8s images, starting by
|
||||
the creation of a `.github/kind-cluster-<version>.yaml`, then updating
|
||||
our workflows with the new versions.
|
||||
|
||||
Once you updated everything, make sure you update the support matrix on
|
||||
the main [README][readme] as well.
|
||||
|
||||
### Updating other dependencies
|
||||
|
||||
Dependabot proposes changes in our go.mod/go.sum.
|
||||
Some of those changes are covered by CI testing, some are not.
|
||||
|
||||
Please make sure to test those not covered by CI (mostly the integration
|
||||
with other tools) manually before merging.
|
||||
|
||||
### Review periodic jobs
|
||||
|
||||
We run periodic jobs (see also Automated testing section of this documentation).
|
||||
Those should be monitored for failures.
|
||||
|
||||
If a failure happen in periodics, something terribly wrong must have happened
|
||||
(or github is failing at the creation of a kind cluster). Please monitor those
|
||||
failures carefully.
|
||||
|
||||
### Introducing new features
|
||||
|
||||
When you introduce a new feature, the kured team expects you to have tested
|
||||
your change thoroughly. If possible, include all the necessary testing in your change.
|
||||
|
||||
If your change involves a user facing change (change in flags of kured for example),
|
||||
please include expose your new feature in our default manifest (`kured-ds.yaml`),
|
||||
as a comment.
|
||||
|
||||
Our release manifests and helm charts are our stable interfaces.
|
||||
Any user facing changes will therefore have to wait for a release before being
|
||||
exposed to our users.
|
||||
|
||||
This also means that when you expose a new feature, you should create another PR
|
||||
for your changes in <https://github.com/kubereboot/charts> to make your feature
|
||||
available at the next kured version for helm users.
|
||||
|
||||
In the charts PR, you can directly bump the appVersion to the next minor version
|
||||
(you are introducing a new feature, which requires a bump of the minor number.
|
||||
For example, if current appVersion is 1.6.x, make sure you update your appVersion
|
||||
to 1.7.0). It allows us to have an easy view of what we land each release.
|
||||
|
||||
Do not hesitate to increase the test coverage for your feature, whether it's unit
|
||||
testing to full functional testing (even using helm charts)
|
||||
|
||||
### Increasing test coverage
|
||||
|
||||
We are welcoming any change to increase our test coverage.
|
||||
See also our github issues for the label `testing`.
|
||||
|
||||
## Automated testing
|
||||
|
||||
Our CI is covered by github actions.
|
||||
You can see their contents in .github/workflows.
|
||||
|
||||
We currently run:
|
||||
|
||||
- go tests and lint
|
||||
- `shellcheck`
|
||||
- a check for dead links in our docs
|
||||
- a security check against our base image (alpine)
|
||||
- a deep functional test using our manifests on all supported k8s versions
|
||||
|
||||
To test your code manually, follow the section Manual testing.
|
||||
|
||||
## Manual (release) testing
|
||||
|
||||
Before `kured` is released, we want to make sure it still works fine on the
|
||||
previous, current and next minor version of Kubernetes (with respect to the
|
||||
`client-go` & `kubectl` dependencies in use). For local testing e.g.
|
||||
`minikube` or `kind` can be sufficient. This will allow you to catch issues
|
||||
that might not have been tested in our CI, like integration with other tools,
|
||||
or your specific use case.
|
||||
|
||||
Deploy kured in your test scenario, make sure you pass the right `image`,
|
||||
update the e.g. `period` and `reboot-days` options, so you get immediate
|
||||
results, if you login to a node and run:
|
||||
|
||||
```console
|
||||
sudo touch /var/run/reboot-required
|
||||
```
|
||||
|
||||
### Example of golang testing
|
||||
|
||||
Please run `make test`. You should have `golint` installed.
|
||||
|
||||
### Example of testing with `minikube`
|
||||
|
||||
A test-run with `minikube` could look like this:
|
||||
|
||||
```console
|
||||
# start minikube
|
||||
minikube start --driver=kvm2 --kubernetes-version <k8s-release>
|
||||
|
||||
# build kured image and publish to registry accessible by minikube
|
||||
make image minikube-publish
|
||||
|
||||
# edit kured-ds.yaml to
|
||||
# - point to new image
|
||||
# - change e.g. period and reboot-days option for immediate results
|
||||
|
||||
minikube kubectl -- apply -f kured-rbac.yaml
|
||||
minikube kubectl -- apply -f kured-ds.yaml
|
||||
minikube kubectl -- logs daemonset.apps/kured -n kube-system -f
|
||||
|
||||
# In separate terminal
|
||||
minikube ssh
|
||||
sudo touch /var/run/reboot-required
|
||||
minikube logs -f
|
||||
```
|
||||
|
||||
Now check for the 'Commanding reboot' message and minikube going down.
|
||||
|
||||
Unfortunately as of today, you are going to run into
|
||||
<https://github.com/kubernetes/minikube/issues/2874>. This means that
|
||||
minikube won't come back easily. You will need to start minikube again.
|
||||
Then you can check for the lock release.
|
||||
|
||||
### Example of testing with `kind`
|
||||
|
||||
A test-run with `kind` could look like this:
|
||||
|
||||
```console
|
||||
# create kind cluster
|
||||
kind create cluster --config .github/kind-cluster-<k8s-version>.yaml
|
||||
|
||||
# create reboot required files on pre-defined kind nodes
|
||||
./tests/kind/create-reboot-sentinels.sh
|
||||
|
||||
# check if reboot is working fine
|
||||
./tests/kind/follow-coordinated-reboot.sh
|
||||
|
||||
```
|
||||
|
||||
## Publishing a new kured release
|
||||
|
||||
### Prepare Documentation
|
||||
|
||||
Check that `README.md` has an updated compatibility matrix and that the
|
||||
url in the `kubectl` incantation (under "Installation") is updated to the
|
||||
new version you want to release.
|
||||
|
||||
### Create a tag on the repo
|
||||
|
||||
Before going further, we should freeze the code for a release, by
|
||||
tagging the code. The Github-Action should start a new job and push
|
||||
the new image to the registry.
|
||||
|
||||
### Create the combined manifest
|
||||
|
||||
Now create the `kured-<release>-dockerhub.yaml` for e.g. `1.3.0`:
|
||||
|
||||
```sh
|
||||
VERSION=1.3.0
|
||||
MANIFEST="kured-$VERSION-dockerhub.yaml"
|
||||
make DH_ORG="kubereboot" VERSION="${VERSION}" manifest
|
||||
cat kured-rbac.yaml > "$MANIFEST"
|
||||
cat kured-ds.yaml >> "$MANIFEST"
|
||||
```
|
||||
|
||||
### Publish release artifacts
|
||||
|
||||
Now you can head to the Github UI, use the version number as tag and upload the
|
||||
`kured-<release>-dockerhub.yaml` file.
|
||||
|
||||
Please describe what's new and noteworthy in the release notes, list the PRs
|
||||
that landed and give a shout-out to everyone who contributed.
|
||||
|
||||
Please also note down on which releases the upcoming `kured` release was
|
||||
tested on. (Check old release notes if you're unsure.)
|
||||
36
DCO
Normal file
36
DCO
Normal file
@@ -0,0 +1,36 @@
|
||||
Developer Certificate of Origin
|
||||
Version 1.1
|
||||
|
||||
Copyright (C) 2004, 2006 The Linux Foundation and its contributors.
|
||||
660 York Street, Suite 102,
|
||||
San Francisco, CA 94110 USA
|
||||
|
||||
Everyone is permitted to copy and distribute verbatim copies of this
|
||||
license document, but changing it is not allowed.
|
||||
|
||||
|
||||
Developer's Certificate of Origin 1.1
|
||||
|
||||
By making a contribution to this project, I certify that:
|
||||
|
||||
(a) The contribution was created in whole or in part by me and I
|
||||
have the right to submit it under the open source license
|
||||
indicated in the file; or
|
||||
|
||||
(b) The contribution is based upon previous work that, to the best
|
||||
of my knowledge, is covered under an appropriate open source
|
||||
license and I have the right under that license to submit that
|
||||
work with modifications, whether created in whole or in part
|
||||
by me, under the same open source license (unless I am
|
||||
permitted to submit under a different license), as indicated
|
||||
in the file; or
|
||||
|
||||
(c) The contribution was provided directly to me by some other
|
||||
person who certified (a), (b) or (c) and I have not modified
|
||||
it.
|
||||
|
||||
(d) I understand and agree that this project and the contribution
|
||||
are public and that a record of the contribution (including all
|
||||
personal information I submit with it, including my sign-off) is
|
||||
maintained indefinitely and may be redistributed consistent with
|
||||
this project or the open source license(s) involved.
|
||||
236
DEVELOPMENT.md
236
DEVELOPMENT.md
@@ -1,235 +1 @@
|
||||
# Developing `kured`
|
||||
|
||||
We love contributions to `kured`, no matter if you are [helping out on
|
||||
Slack][slack], reporting or triaging [issues][issues] or contributing code
|
||||
to `kured`.
|
||||
|
||||
In any case, it will make sense to familiarise yourself with the main
|
||||
[README][readme] to understand the different features and options, which is
|
||||
helpful for testing. The "building" section in particular makes sense if
|
||||
you are planning to contribute code.
|
||||
|
||||
[slack]: README.md#getting-help
|
||||
[issues]: https://github.com/weaveworks/kured/issues
|
||||
[readme]: README.md
|
||||
|
||||
## Regular development activities
|
||||
|
||||
### Updating k8s support
|
||||
|
||||
Whenever we want to update e.g. the `kubectl` or `client-go` dependencies,
|
||||
some RBAC changes might be necessary too.
|
||||
|
||||
This is what it took to support Kubernetes 1.14:
|
||||
<https://github.com/weaveworks/kured/pull/75>
|
||||
|
||||
That the process can be more involved that that can be seen in
|
||||
<https://github.com/weaveworks/kured/commits/support-k8s-1.10>
|
||||
|
||||
Please update our .github/workflows with the new k8s images, starting by
|
||||
the creation of a .github/kind-cluster-<version>.yaml, then updating
|
||||
our workflows with the new versions.
|
||||
|
||||
Once you updated everything, make sure you update the support matrix on
|
||||
the main [README][readme] as well.
|
||||
|
||||
### Updating other dependencies
|
||||
|
||||
Dependabot proposes changes in our go.mod/go.sum.
|
||||
Some of those changes are covered by CI testing, some are not.
|
||||
|
||||
Please make sure to test those not covered by CI (mostly the integration
|
||||
with other tools) manually before merging.
|
||||
|
||||
### Review periodic jobs
|
||||
|
||||
We run periodic jobs (see also Automated testing section of this documentation).
|
||||
Those should be monitored for failures.
|
||||
|
||||
If a failure happen in periodics, something terribly wrong must have happened
|
||||
(or github is failing at the creation of a kind cluster). Please monitor those
|
||||
failures carefully.
|
||||
|
||||
### Introducing new features
|
||||
|
||||
When you introduce a new feature, the kured team expects you to have tested
|
||||
your change thoroughly. If possible, include all the necessary testing in your change.
|
||||
|
||||
If your change involves a user facing change (change in flags of kured for example),
|
||||
please include expose your new feature in our default manifest (`kured-ds.yaml`),
|
||||
as a comment.
|
||||
|
||||
Do not update the helm chart directly.
|
||||
Helm charts and our release manifests (see below) are our stable interfaces.
|
||||
Any user facing changes will therefore have to wait for a while before being
|
||||
exposed to our users.
|
||||
|
||||
This also means that when you expose a new feature, you should create another PR
|
||||
for your changes in `charts/` to make your feature available for our next kured version.
|
||||
In this change, you can directly bump the appVersion to the next minor version.
|
||||
(for example, if current appVersion is 1.6.x, make sure you update your appVersion
|
||||
to 1.7.0). It allows us to have an easy view of what we land each release.
|
||||
|
||||
Do not hesitate to increase the test coverage for your feature, whether it's unit
|
||||
testing to full functional testing (even using helm charts)
|
||||
|
||||
### Increasing test coverage
|
||||
|
||||
We are welcoming any change to increase our test coverage.
|
||||
See also our github issues for the label `testing`.
|
||||
|
||||
### Updating helm charts
|
||||
|
||||
Helm charts are continuously published. Any change in `charts/` will be immediately
|
||||
pushed in production.
|
||||
|
||||
## Automated testing
|
||||
|
||||
Our CI is covered by github actions.
|
||||
You can see their contents in .github/workflows.
|
||||
|
||||
We currently run:
|
||||
- go tests and lint
|
||||
- shellcheck
|
||||
- a check for dead links in our docs
|
||||
- a security check against our base image (alpine)
|
||||
- a deep functional test using our manifests on all supported k8s versions
|
||||
- basic deployment using our helm chart on any chart change
|
||||
|
||||
Changes in helm charts are not functionally tested on PRs. We assume that
|
||||
the PRs to implement the feature are properly tested by our users and
|
||||
contributors before merge.
|
||||
|
||||
To test your code manually, follow the section Manual testing.
|
||||
|
||||
## Manual (release) testing
|
||||
|
||||
Before `kured` is released, we want to make sure it still works fine on the
|
||||
previous, current and next minor version of Kubernetes (with respect to the
|
||||
`client-go` & `kubectl` dependencies in use). For local testing e.g.
|
||||
`minikube` or `kind` can be sufficient. This will allow you to catch issues
|
||||
that might not have been tested in our CI, like integration with other tools,
|
||||
or your specific use case.
|
||||
|
||||
Deploy kured in your test scenario, make sure you pass the right `image`,
|
||||
update the e.g. `period` and `reboot-days` options, so you get immediate
|
||||
results, if you login to a node and run:
|
||||
|
||||
```console
|
||||
sudo touch /var/run/reboot-required
|
||||
```
|
||||
|
||||
### Example of golang testing
|
||||
|
||||
Please run `make test`. You should have golint installed.
|
||||
|
||||
### Example of testing with `minikube`
|
||||
|
||||
A test-run with `minikube` could look like this:
|
||||
|
||||
```console
|
||||
# start minikube
|
||||
minikube start --vm-driver kvm2 --kubernetes-version <k8s-release>
|
||||
|
||||
# build kured image and publish to registry accessible by minikube
|
||||
make image minikube-publish
|
||||
|
||||
# edit kured-ds.yaml to
|
||||
# - point to new image
|
||||
# - change e.g. period and reboot-days option for immediate results
|
||||
|
||||
minikube kubectl -- apply -f kured-rbac.yaml
|
||||
minikube kubectl -- apply -f kured-ds.yaml
|
||||
minikube kubectl -- logs daemonset.apps/kured -n kube-system -f
|
||||
|
||||
# Alternatively use helm to install the chart
|
||||
# edit values-local.yaml to change any chart parameters
|
||||
helm install kured ./charts/kured --namespace kube-system -f ./charts/kured/values.minikube.yaml
|
||||
|
||||
# In separate terminal
|
||||
minikube ssh
|
||||
sudo touch /var/run/reboot-required
|
||||
minikube logs -f
|
||||
```
|
||||
|
||||
Now check for the 'Commanding reboot' message and minikube going down.
|
||||
|
||||
Unfortunately as of today, you are going to run into
|
||||
<https://github.com/kubernetes/minikube/issues/2874>. This means that
|
||||
minikube won't come back easily. You will need to start minikube again.
|
||||
Then you can check for the lock release.
|
||||
|
||||
If all the tests ran well, kured maintainers can reach out to the Weaveworks
|
||||
team to get an upcoming `kured` release tested in the Dev environment for
|
||||
real life testing.
|
||||
|
||||
### Example of testing with `kind`
|
||||
|
||||
A test-run with `kind` could look like this:
|
||||
|
||||
```console
|
||||
# create kind cluster
|
||||
kind create cluster --config .github/kind-cluster-<k8s-version>.yaml
|
||||
|
||||
# create reboot required files on pre-defined kind nodes
|
||||
./tests/kind/create-reboot-sentinels.sh
|
||||
|
||||
# check if reboot is working fine
|
||||
./tests/kind/follow-coordinated-reboot.sh
|
||||
|
||||
```
|
||||
|
||||
## Publishing a new kured release
|
||||
|
||||
### Prepare Documentation
|
||||
|
||||
Check that `README.md` has an updated compatibility matrix and that the
|
||||
url in the `kubectl` incantation (under "Installation") is updated to the
|
||||
new version you want to release.
|
||||
|
||||
### Create a tag on the repo
|
||||
|
||||
Before going further, we should freeze the code for a release, by
|
||||
tagging the code. The Github-Action should start a new job and push
|
||||
the new image to the registry.
|
||||
|
||||
|
||||
### Create the combined manifest
|
||||
|
||||
Now create the `kured-<release>-dockerhub.yaml` for e.g. `1.3.0`:
|
||||
|
||||
```sh
|
||||
VERSION=1.3.0
|
||||
MANIFEST="kured-$VERSION-dockerhub.yaml"
|
||||
make DH_ORG="weaveworks" VERSION="${VERSION}" manifest
|
||||
cat kured-rbac.yaml > "$MANIFEST"
|
||||
cat kured-ds.yaml >> "$MANIFEST"
|
||||
```
|
||||
|
||||
### Publish release artifacts
|
||||
|
||||
Now you can head to the Github UI, use the version number as tag and upload the
|
||||
`kured-<release>-dockerhub.yaml` file.
|
||||
|
||||
Please describe what's new and noteworthy in the release notes, list the PRs
|
||||
that landed and give a shout-out to everyone who contributed.
|
||||
|
||||
Please also note down on which releases the upcoming `kured` release was
|
||||
tested on. (Check old release notes if you're unsure.)
|
||||
|
||||
### Update the Helm chart
|
||||
|
||||
You can automatically bump the helm chart's application version
|
||||
with the latest image tag by running:
|
||||
|
||||
```sh
|
||||
make DH_ORG="weaveworks" VERSION="1.3.0" helm-chart
|
||||
```
|
||||
|
||||
A change in the helm chart requires a bump of the `version`
|
||||
in `charts/kured/Chart.yaml` (following the versioning rules).
|
||||
Update it, and issue a PR. Upon merge, that PR will automatically
|
||||
publish the chart to the gh-pages branch.
|
||||
|
||||
When there are open helm-chart PRs which are on hold until the helm-chart has been updated
|
||||
with the new kured version, they can be merged now (unless a rebase is needed from the contributor).
|
||||
This file was moved to [CONTRIBUTING.md](CONTRIBUTING.md).
|
||||
|
||||
25
Dockerfile
Normal file
25
Dockerfile
Normal file
@@ -0,0 +1,25 @@
|
||||
FROM --platform=$TARGETPLATFORM alpine:3.18.3 as bin
|
||||
|
||||
ARG TARGETOS
|
||||
ARG TARGETARCH
|
||||
ARG TARGETVARIANT
|
||||
|
||||
COPY dist/ /dist
|
||||
RUN set -ex \
|
||||
&& case "${TARGETARCH}" in \
|
||||
amd64) \
|
||||
SUFFIX="_v1" \
|
||||
;; \
|
||||
arm) \
|
||||
SUFFIX="_${TARGETVARIANT:1}" \
|
||||
;; \
|
||||
*) \
|
||||
SUFFIX="" \
|
||||
;; \
|
||||
esac \
|
||||
&& cp /dist/kured_${TARGETOS}_${TARGETARCH}${SUFFIX}/kured /dist/kured;
|
||||
|
||||
FROM --platform=$TARGETPLATFORM alpine:3.18.3
|
||||
RUN apk update --no-cache && apk upgrade --no-cache && apk add --no-cache ca-certificates tzdata
|
||||
COPY --from=bin /dist/kured /usr/bin/kured
|
||||
ENTRYPOINT ["/usr/bin/kured"]
|
||||
112
GOVERNANCE.md
Normal file
112
GOVERNANCE.md
Normal file
@@ -0,0 +1,112 @@
|
||||
# Project Governance
|
||||
|
||||
- [Values](#values)
|
||||
- [Maintainers](#maintainers)
|
||||
- [Becoming a Maintainer](#becoming-a-maintainer)
|
||||
- [Meetings](#meetings)
|
||||
- [Code of Conduct Enforcement](#code-of-conduct)
|
||||
- [Voting](#voting)
|
||||
|
||||
## Values
|
||||
|
||||
The Kured project and its leadership embrace the following values:
|
||||
|
||||
- Openness: Communication and decision-making happens in the open and is discoverable for future
|
||||
reference. As much as possible, all discussions and work take place in public
|
||||
forums and open repositories.
|
||||
|
||||
- Fairness: All stakeholders have the opportunity to provide feedback and submit
|
||||
contributions, which will be considered on their merits.
|
||||
|
||||
- Community over Product or Company: Sustaining and growing our community takes
|
||||
priority over shipping code or sponsors' organizational goals. Each
|
||||
contributor participates in the project as an individual.
|
||||
|
||||
- Inclusivity: We innovate through different perspectives and skill sets, which
|
||||
can only be accomplished in a welcoming and respectful environment.
|
||||
|
||||
- Participation: Responsibilities within the project are earned through
|
||||
participation, and there is a clear path up the contributor ladder into leadership
|
||||
positions.
|
||||
|
||||
- Consensus: Whether or not wider input is required, the Kured community believes that
|
||||
the best decisions are reached through Consensus
|
||||
<https://en.wikipedia.org/wiki/Consensus_decision-making>.
|
||||
|
||||
## Maintainers
|
||||
|
||||
Kured Maintainers have write access to the [project GitHub
|
||||
organisation](https://github.com/kubereboot). They can merge their own patches or patches
|
||||
from others. The current maintainers can be found in [MAINTAINERS][maintainers-file].
|
||||
Maintainers collectively manage the project's resources and contributors.
|
||||
|
||||
This privilege is granted with some expectation of responsibility: maintainers
|
||||
are people who care about the Kured project and want to help it grow and
|
||||
improve. A maintainer is not just someone who can make changes, but someone who
|
||||
has demonstrated their ability to collaborate with the team, get the most
|
||||
knowledgeable people to review code and docs, contribute high-quality code, and
|
||||
follow through to fix issues (in code or tests).
|
||||
|
||||
A maintainer is a contributor to the project's success and a citizen helping
|
||||
the project succeed.
|
||||
|
||||
## Becoming a Maintainer
|
||||
|
||||
To become a Maintainer you need to demonstrate the following:
|
||||
|
||||
- commitment to the project:
|
||||
- participate in discussions, contributions, code and documentation reviews
|
||||
for 3 months or more and participate in Slack discussions and meetings
|
||||
if possible,
|
||||
- perform reviews for 5 non-trivial pull requests,
|
||||
- contribute 5 non-trivial pull requests and have them merged,
|
||||
- ability to write quality code and/or documentation,
|
||||
- ability to collaborate with the team,
|
||||
- understanding of how the team works (policies, processes for testing and code review, etc),
|
||||
- understanding of the project's code base and coding and documentation style.
|
||||
|
||||
We realise that everybody brings different abilities and qualities to the team, that's
|
||||
why we are willing to change the rules somewhat depending on the circumstances.
|
||||
|
||||
A new Maintainer can apply by proposing a PR to the [MAINTAINERS
|
||||
file][maintainers-file]. A simple majority vote of existing Maintainers
|
||||
approves the application.
|
||||
|
||||
Maintainers who are selected will be granted the necessary GitHub rights,
|
||||
and invited to the [private maintainer mailing list][private-list].
|
||||
|
||||
## Meetings
|
||||
|
||||
Time zones permitting, Maintainers are expected to participate in the public
|
||||
developer meeting, details can be found [here][meeting-agenda].
|
||||
|
||||
Maintainers will also have closed meetings in order to discuss security reports
|
||||
or Code of Conduct violations. Such meetings should be scheduled by any
|
||||
Maintainer on receipt of a security issue or CoC report. All current Maintainers
|
||||
must be invited to such closed meetings, except for any Maintainer who is
|
||||
accused of a CoC violation.
|
||||
|
||||
## Code of Conduct
|
||||
|
||||
[Code of Conduct](./CODE_OF_CONDUCT.md) violations by community members will
|
||||
be discussed and resolved on the [private Maintainer mailing list][private-list].
|
||||
If the reported CoC violator is a Maintainer, the Maintainers will instead
|
||||
designate two Maintainers to work with CNCF staff in resolving the report.
|
||||
|
||||
## Voting
|
||||
|
||||
While most business in Kured is conducted by "lazy consensus", periodically
|
||||
the Maintainers may need to vote on specific actions or changes.
|
||||
A vote can be taken in [kured issues labeled 'decision'][decision-issues] or
|
||||
[the private Maintainer mailing list][private-list] for security or conduct
|
||||
matters. Votes may also be taken at [the developer meeting][meeting-agenda].
|
||||
Any Maintainer may demand a vote be taken.
|
||||
|
||||
Most votes require a simple majority of all Maintainers to succeed. Maintainers
|
||||
can be removed by a 2/3 majority vote of all Maintainers, and changes to this
|
||||
Governance require a 2/3 vote of all Maintainers.
|
||||
|
||||
[maintainers-file]: ./MAINTAINERS
|
||||
[private-list]: cncf-kured-maintainers@lists.cncf.io
|
||||
[meeting-agenda]: https://docs.google.com/document/d/1AWT8YDdqZY-Se6Y1oAlwtujWLVpNVK2M_F_Vfqw06aI/edit
|
||||
[decision-issues]: https://github.com/kubereboot/kured/labels/decision
|
||||
@@ -1,5 +1,5 @@
|
||||
Christian Kotzbauer <christian.kotzbauer@gmail.com> (@ckotzbauer)
|
||||
Daniel Holbach <daniel@weave.works> (@dholbach)
|
||||
Daniel Holbach <daniel.holbach@gmail.com> (@dholbach)
|
||||
Hidde Beydals <hidde@weave.works> (@hiddeco)
|
||||
Jean-Phillipe Evrard <jean-philippe.evrard@suse.com> (@evrardjp)
|
||||
Jack Francis <jackfrancis@gmail.com> (@jackfrancis)
|
||||
Jean-Philippe Evrard <open-source@a.spamming.party> (@evrardjp)
|
||||
|
||||
60
Makefile
60
Makefile
@@ -1,52 +1,50 @@
|
||||
.DEFAULT: all
|
||||
.PHONY: all clean image publish-image minikube-publish manifest helm-chart test tests
|
||||
.PHONY: all clean image minikube-publish manifest test kured-all
|
||||
|
||||
DH_ORG=weaveworks
|
||||
VERSION=$(shell git symbolic-ref --short HEAD)-$(shell git rev-parse --short HEAD)
|
||||
TEMPDIR=./.tmp
|
||||
GORELEASER_CMD=$(TEMPDIR)/goreleaser
|
||||
DH_ORG=kubereboot
|
||||
VERSION=$(shell git rev-parse --short HEAD)
|
||||
SUDO=$(shell docker info >/dev/null 2>&1 || echo "sudo -E")
|
||||
|
||||
all: image
|
||||
|
||||
$(TEMPDIR):
|
||||
mkdir -p $(TEMPDIR)
|
||||
|
||||
.PHONY: bootstrap-tools
|
||||
bootstrap-tools: $(TEMPDIR)
|
||||
VERSION=v1.11.4 TMPDIR=.tmp bash .github/scripts/goreleaser-install.sh
|
||||
curl -sSfL https://raw.githubusercontent.com/anchore/syft/main/install.sh | sh -s -- -b .tmp v0.58.0
|
||||
curl -sSfL https://github.com/sigstore/cosign/releases/download/v1.12.1/cosign-linux-amd64 -o .tmp/cosign
|
||||
chmod +x .tmp/goreleaser .tmp/cosign .tmp/syft
|
||||
|
||||
clean:
|
||||
rm -f cmd/kured/kured
|
||||
rm -rf ./build
|
||||
rm -rf ./dist
|
||||
|
||||
godeps=$(shell go list -f '{{join .Deps "\n"}}' $1 | grep -v /vendor/ | xargs go list -f '{{if not .Standard}}{{ $$dep := . }}{{range .GoFiles}}{{$$dep.Dir}}/{{.}} {{end}}{{end}}')
|
||||
kured:
|
||||
$(GORELEASER_CMD) build --rm-dist --single-target --snapshot
|
||||
|
||||
DEPS=$(call godeps,./cmd/kured)
|
||||
kured-all:
|
||||
$(GORELEASER_CMD) build --rm-dist --snapshot
|
||||
|
||||
cmd/kured/kured: $(DEPS)
|
||||
cmd/kured/kured: cmd/kured/*.go
|
||||
CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build -ldflags "-X main.version=$(VERSION)" -o $@ cmd/kured/*.go
|
||||
kured-release-tag:
|
||||
$(GORELEASER_CMD) release --rm-dist
|
||||
|
||||
build/.image.done: cmd/kured/Dockerfile cmd/kured/kured
|
||||
mkdir -p build
|
||||
cp $^ build
|
||||
$(SUDO) docker build -t docker.io/$(DH_ORG)/kured -f build/Dockerfile ./build
|
||||
$(SUDO) docker tag docker.io/$(DH_ORG)/kured docker.io/$(DH_ORG)/kured:$(VERSION)
|
||||
$(SUDO) docker tag docker.io/$(DH_ORG)/kured ghcr.io/$(DH_ORG)/kured:$(VERSION)
|
||||
touch $@
|
||||
kured-release-snapshot:
|
||||
$(GORELEASER_CMD) release --rm-dist --snapshot
|
||||
|
||||
image: build/.image.done
|
||||
|
||||
publish-image: image
|
||||
$(SUDO) docker push docker.io/$(DH_ORG)/kured:$(VERSION)
|
||||
$(SUDO) docker push ghcr.io/$(DH_ORG)/kured:$(VERSION)
|
||||
image: kured
|
||||
$(SUDO) docker buildx build --load -t ghcr.io/$(DH_ORG)/kured:$(VERSION) .
|
||||
|
||||
minikube-publish: image
|
||||
$(SUDO) docker save docker.io/$(DH_ORG)/kured | (eval $$(minikube docker-env) && docker load)
|
||||
$(SUDO) docker save ghcr.io/$(DH_ORG)/kured | (eval $$(minikube docker-env) && docker load)
|
||||
|
||||
manifest:
|
||||
sed -i "s#image: docker.io/.*kured.*#image: docker.io/$(DH_ORG)/kured:$(VERSION)#g" kured-ds.yaml
|
||||
sed -i "s#image: ghcr.io/.*kured.*#image: ghcr.io/$(DH_ORG)/kured:$(VERSION)#g" kured-ds.yaml
|
||||
echo "Please generate combined manifest if necessary"
|
||||
|
||||
helm-chart:
|
||||
sed -i "s#repository:.*/kured#repository: $(DH_ORG)/kured#g" charts/kured/values.yaml
|
||||
sed -i "s#appVersion:.*#appVersion: \"$(VERSION)\"#g" charts/kured/Chart.yaml
|
||||
sed -i "s#\`[0-9]*\.[0-9]*\.[0-9]*\`#\`$(VERSION)\`#g" charts/kured/README.md
|
||||
echo "Please bump version in charts/kured/Chart.yaml"
|
||||
|
||||
test: tests
|
||||
test:
|
||||
echo "Running go tests"
|
||||
go test ./...
|
||||
echo "Running golint on pkg"
|
||||
|
||||
396
README.md
396
README.md
@@ -1,29 +1,17 @@
|
||||
|
||||
# kured - Kubernetes Reboot Daemon
|
||||
|
||||
<img src="https://github.com/weaveworks/kured/raw/main/img/logo.png" align="right"/>
|
||||
[](https://artifacthub.io/packages/helm/kured/kured)
|
||||
[](https://app.fossa.com/projects/git%2Bgithub.com%2Fkubereboot%2Fkured?ref=badge_shield)
|
||||
[](https://clomonitor.io/projects/cncf/kured)
|
||||
|
||||
- [Introduction](#introduction)
|
||||
- [Kubernetes & OS Compatibility](#kubernetes--os-compatibility)
|
||||
- [Installation](#installation)
|
||||
- [Configuration](#configuration)
|
||||
- [Reboot Sentinel File & Period](#reboot-sentinel-file--period)
|
||||
- [Setting a schedule](#setting-a-schedule)
|
||||
- [Blocking Reboots via Alerts](#blocking-reboots-via-alerts)
|
||||
- [Blocking Reboots via Pods](#blocking-reboots-via-pods)
|
||||
- [Prometheus Metrics](#prometheus-metrics)
|
||||
- [Notifications](#notifications)
|
||||
- [Overriding Lock Configuration](#overriding-lock-configuration)
|
||||
- [Operation](#operation)
|
||||
- [Testing](#testing)
|
||||
- [Disabling Reboots](#disabling-reboots)
|
||||
- [Manual Unlock](#manual-unlock)
|
||||
- [Automatic Unlock](#automatic-unlock)
|
||||
- [Delaying Lock Release](#delaying-lock-release)
|
||||
- [Building](#building)
|
||||
- [Frequently Asked/Anticipated Questions](#frequently-askedanticipated-questions)
|
||||
- [Why is there no `latest` tag on Docker Hub?](#why-is-there-no-latest-tag-on-docker-hub)
|
||||
- [Getting Help](#getting-help)
|
||||
<img src="https://github.com/kubereboot/website/raw/main/static/img/kured.png" width="200" align="right"/>
|
||||
|
||||
- [kured - Kubernetes Reboot Daemon](#kured---kubernetes-reboot-daemon)
|
||||
- [Introduction](#introduction)
|
||||
- [Documentation](#documentation)
|
||||
- [Getting Help](#getting-help)
|
||||
- [Trademarks](#trademarks)
|
||||
- [License](#license)
|
||||
|
||||
## Introduction
|
||||
|
||||
@@ -31,348 +19,48 @@ Kured (KUbernetes REboot Daemon) is a Kubernetes daemonset that
|
||||
performs safe automatic node reboots when the need to do so is
|
||||
indicated by the package management system of the underlying OS.
|
||||
|
||||
* Watches for the presence of a reboot sentinel file e.g. `/var/run/reboot-required`
|
||||
- Watches for the presence of a reboot sentinel file e.g. `/var/run/reboot-required`
|
||||
or the successful run of a sentinel command.
|
||||
* Utilises a lock in the API server to ensure only one node reboots at
|
||||
- Utilises a lock in the API server to ensure only one node reboots at
|
||||
a time
|
||||
* Optionally defers reboots in the presence of active Prometheus alerts or selected pods
|
||||
* Cordons & drains worker nodes before reboot, uncordoning them after
|
||||
- Optionally defers reboots in the presence of active Prometheus alerts or selected pods
|
||||
- Cordons & drains worker nodes before reboot, uncordoning them after
|
||||
|
||||
## Kubernetes & OS Compatibility
|
||||
## Documentation
|
||||
|
||||
The daemon image contains versions of `k8s.io/client-go` and
|
||||
`k8s.io/kubectl` (the binary of `kubectl` in older releases) for the purposes of
|
||||
maintaining the lock and draining worker nodes. Kubernetes aims to provide
|
||||
forwards and backwards compatibility of one minor version between client and
|
||||
server:
|
||||
Find all our docs on <https://kured.dev>:
|
||||
|
||||
| kured | kubectl | k8s.io/client-go | k8s.io/apimachinery | expected kubernetes compatibility |
|
||||
|-------|---------|------------------|---------------------|-----------------------------------|
|
||||
| main | 1.22.4 | v0.22.4 | v0.22.4 | 1.21.x, 1.22.x, 1.23.x |
|
||||
| 1.9.1 | 1.22.4 | v0.22.4 | v0.22.4 | 1.21.x, 1.22.x, 1.23.x |
|
||||
| 1.8.1 | 1.21.4 | v0.21.4 | v0.21.4 | 1.20.x, 1.21.x, 1.22.x |
|
||||
| 1.7.0 | 1.20.5 | v0.20.5 | v0.20.5 | 1.19.x, 1.20.x, 1.21.x |
|
||||
| 1.6.1 | 1.19.4 | v0.19.4 | v0.19.4 | 1.18.x, 1.19.x, 1.20.x |
|
||||
| 1.5.1 | 1.18.8 | v0.18.8 | v0.18.8 | 1.17.x, 1.18.x, 1.19.x |
|
||||
| 1.4.4 | 1.17.7 | v0.17.0 | v0.17.0 | 1.16.x, 1.17.x, 1.18.x |
|
||||
| 1.3.0 | 1.15.10 | v12.0.0 | release-1.15 | 1.15.x, 1.16.x, 1.17.x |
|
||||
| 1.2.0 | 1.13.6 | v10.0.0 | release-1.13 | 1.12.x, 1.13.x, 1.14.x |
|
||||
| 1.1.0 | 1.12.1 | v9.0.0 | release-1.12 | 1.11.x, 1.12.x, 1.13.x |
|
||||
| 1.0.0 | 1.7.6 | v4.0.0 | release-1.7 | 1.6.x, 1.7.x, 1.8.x |
|
||||
- [All Kured Documentation](https://kured.dev/docs/)
|
||||
- [Installing Kured](https://kured.dev/docs/installation/)
|
||||
- [Configuring Kured](https://kured.dev/docs/configuration/)
|
||||
- [Operating Kured](https://kured.dev/docs/operation/)
|
||||
- [Developing Kured](https://kured.dev/docs/development/)
|
||||
|
||||
See the [release notes](https://github.com/weaveworks/kured/releases)
|
||||
for specific version compatibility information, including which
|
||||
combination have been formally tested.
|
||||
|
||||
Versions >=1.1.0 enter the host mount namespace to invoke
|
||||
`systemctl reboot`, so should work on any systemd distribution.
|
||||
|
||||
## Installation
|
||||
|
||||
To obtain a default installation without Prometheus alerting interlock
|
||||
or Slack notifications:
|
||||
|
||||
```console
|
||||
latest=$(curl -s https://api.github.com/repos/weaveworks/kured/releases | jq -r .[0].tag_name)
|
||||
kubectl apply -f "https://github.com/weaveworks/kured/releases/download/$latest/kured-$latest-dockerhub.yaml"
|
||||
```
|
||||
|
||||
If you want to customise the installation, download the manifest and
|
||||
edit it in accordance with the following section before application.
|
||||
|
||||
## Configuration
|
||||
|
||||
The following arguments can be passed to kured via the daemonset pod template:
|
||||
|
||||
```console
|
||||
Flags:
|
||||
--alert-filter-regexp regexp.Regexp alert names to ignore when checking for active alerts
|
||||
--alert-firing-only bool only consider firing alerts when checking for active alerts
|
||||
--blocking-pod-selector stringArray label selector identifying pods whose presence should prevent reboots
|
||||
--drain-grace-period int time in seconds given to each pod to terminate gracefully, if negative, the default value specified in the pod will be used (default: -1)
|
||||
--skip-wait-for-delete-timeout int when seconds is greater than zero, skip waiting for the pods whose deletion timestamp is older than N seconds while draining a node (default: 0)
|
||||
--ds-name string name of daemonset on which to place lock (default "kured")
|
||||
--ds-namespace string namespace containing daemonset on which to place lock (default "kube-system")
|
||||
--end-time string schedule reboot only before this time of day (default "23:59:59")
|
||||
--force-reboot bool force a reboot even if the drain is still running (default: false)
|
||||
--drain-timeout duration timeout after which the drain is aborted (default: 0, infinite time)
|
||||
-h, --help help for kured
|
||||
--lock-annotation string annotation in which to record locking node (default "weave.works/kured-node-lock")
|
||||
--lock-release-delay duration hold lock after reboot by this duration (default: 0, disabled)
|
||||
--lock-ttl duration expire lock annotation after this duration (default: 0, disabled)
|
||||
--message-template-drain string message template used to notify about a node being drained (default "Draining node %s")
|
||||
--message-template-reboot string message template used to notify about a node being rebooted (default "Rebooting node %s")
|
||||
--notify-url url for reboot notifications (cannot use with --slack-hook-url flags)
|
||||
--period duration reboot check period (default 1h0m0s)
|
||||
--prefer-no-schedule-taint string Taint name applied during pending node reboot (to prevent receiving additional pods from other rebooting nodes). Disabled by default. Set e.g. to "weave.works/kured-node-reboot" to enable tainting.
|
||||
--prometheus-url string Prometheus instance to probe for active alerts
|
||||
--reboot-command string command to run when a reboot is required by the sentinel (default "/sbin/systemctl reboot")
|
||||
--reboot-days strings schedule reboot on these days (default [su,mo,tu,we,th,fr,sa])
|
||||
--reboot-delay duration add a delay after drain finishes but before the reboot command is issued (default 0, no time)
|
||||
--reboot-sentinel string path to file whose existence signals need to reboot (default "/var/run/reboot-required")
|
||||
--reboot-sentinel-command string command for which a successful run signals need to reboot (default ""). If non-empty, sentinel file will be ignored.
|
||||
--slack-channel string slack channel for reboot notfications
|
||||
--slack-hook-url string slack hook URL for reboot notfications [deprecated in favor of --notify-url]
|
||||
--slack-username string slack username for reboot notfications (default "kured")
|
||||
--start-time string schedule reboot only after this time of day (default "0:00")
|
||||
--time-zone string use this timezone for schedule inputs (default "UTC")
|
||||
--log-format string log format specified as text or json, defaults to "text"
|
||||
```
|
||||
|
||||
### Reboot Sentinel File & Period
|
||||
|
||||
By default kured checks for the existence of
|
||||
`/var/run/reboot-required` every sixty minutes; you can override these
|
||||
values with `--reboot-sentinel` and `--period`. Each replica of the
|
||||
daemon uses a random offset derived from the period on startup so that
|
||||
nodes don't all contend for the lock simultaneously.
|
||||
|
||||
Alternatively, a reboot sentinel command can be used. If a reboot
|
||||
sentinel command is used, the reboot sentinel file presence will be
|
||||
ignored.
|
||||
|
||||
### Setting a schedule
|
||||
|
||||
By default, kured will reboot any time it detects the sentinel, but this
|
||||
may cause reboots during odd hours. While service disruption does not
|
||||
normally occur, anything is possible and operators may want to restrict
|
||||
reboots to predictable schedules. Use `--reboot-days`, `--start-time`,
|
||||
`--end-time`, and `--time-zone` to set a schedule. For example, business
|
||||
hours on the west coast USA can be specified with:
|
||||
|
||||
```console
|
||||
--reboot-days=mon,tue,wed,thu,fri
|
||||
--start-time=9am
|
||||
--end-time=5pm
|
||||
--time-zone=America/Los_Angeles
|
||||
```
|
||||
|
||||
Times can be formatted in numerous ways, including `5pm`, `5:00pm` `17:00`,
|
||||
and `17`. `--time-zone` represents a Go `time.Location`, and can be `UTC`,
|
||||
`Local`, or any entry in the standard Linux tz database.
|
||||
|
||||
Note that when using smaller time windows, you should consider shortening
|
||||
the sentinel check period (`--period`).
|
||||
|
||||
### Blocking Reboots via Alerts
|
||||
|
||||
You may find it desirable to block automatic node reboots when there
|
||||
are active alerts - you can do so by providing the URL of your
|
||||
Prometheus server:
|
||||
|
||||
```console
|
||||
--prometheus-url=http://prometheus.monitoring.svc.cluster.local
|
||||
```
|
||||
|
||||
By default the presence of *any* active (pending or firing) alerts
|
||||
will block reboots, however you can ignore specific alerts:
|
||||
|
||||
```console
|
||||
--alert-filter-regexp=^(RebootRequired|AnotherBenignAlert|...$
|
||||
```
|
||||
|
||||
You can also only block reboots for firing alerts:
|
||||
```console
|
||||
--alert-firing-only=true
|
||||
```
|
||||
|
||||
See the section on Prometheus metrics for an important application of this
|
||||
filter.
|
||||
|
||||
### Blocking Reboots via Pods
|
||||
|
||||
You can also block reboots of an _individual node_ when specific pods
|
||||
are scheduled on it:
|
||||
|
||||
```console
|
||||
--blocking-pod-selector=runtime=long,cost=expensive
|
||||
```
|
||||
|
||||
Since label selector strings use commas to express logical 'and', you can
|
||||
specify this parameter multiple times for 'or':
|
||||
|
||||
```console
|
||||
--blocking-pod-selector=runtime=long,cost=expensive
|
||||
--blocking-pod-selector=name=temperamental
|
||||
```
|
||||
|
||||
In this case, the presence of either an (appropriately labelled) expensive long
|
||||
running job or a known temperamental pod on a node will stop it rebooting.
|
||||
|
||||
> Try not to abuse this mechanism - it's better to strive for
|
||||
> restartability where possible. If you do use it, make sure you set
|
||||
> up a RebootRequired alert as described in the next section so that
|
||||
> you can intervene manually if reboots are blocked for too long.
|
||||
|
||||
### Prometheus Metrics
|
||||
|
||||
Each kured pod exposes a single gauge metric (`:8080/metrics`) that
|
||||
indicates the presence of the sentinel file:
|
||||
|
||||
```console
|
||||
# HELP kured_reboot_required OS requires reboot due to software updates.
|
||||
# TYPE kured_reboot_required gauge
|
||||
kured_reboot_required{node="ip-xxx-xxx-xxx-xxx.ec2.internal"} 0
|
||||
```
|
||||
|
||||
The purpose of this metric is to power an alert which will summon an
|
||||
operator if the cluster cannot reboot itself automatically for a
|
||||
prolonged period:
|
||||
|
||||
```console
|
||||
# Alert if a reboot is required for any machines. Acts as a failsafe for the
|
||||
# reboot daemon, which will not reboot nodes if there are pending alerts save
|
||||
# this one.
|
||||
ALERT RebootRequired
|
||||
IF max(kured_reboot_required) != 0
|
||||
FOR 24h
|
||||
LABELS { severity="warning" }
|
||||
ANNOTATIONS {
|
||||
summary = "Machine(s) require being rebooted, and the reboot daemon has failed to do so for 24 hours",
|
||||
impact = "Cluster nodes more vulnerable to security exploits. Eventually, no disk space left.",
|
||||
description = "Machine(s) require being rebooted, probably due to kernel update.",
|
||||
}
|
||||
```
|
||||
|
||||
If you choose to employ such an alert and have configured kured to
|
||||
probe for active alerts before rebooting, be sure to specify
|
||||
`--alert-filter-regexp=^RebootRequired$` to avoid deadlock!
|
||||
|
||||
### Notifications
|
||||
|
||||
When you specify a formatted URL using `--notify-url`, kured will notify
|
||||
about draining and rebooting nodes across a list of technologies.
|
||||
|
||||

|
||||
|
||||
Alternatively you can use the `--message-template-drain` and `--message-template-reboot` to customize the text of the message, e.g.
|
||||
|
||||
```cli
|
||||
--message-template-drain="Draining node %s part of *my-cluster* in region *xyz*"
|
||||
```
|
||||
|
||||
Here is the syntax:
|
||||
|
||||
- slack: `slack://tokenA/tokenB/tokenC`
|
||||
(`--slack-hook-url` is deprecated but possible to use)
|
||||
|
||||
- rocketchat: `rocketchat://[username@]rocketchat-host/token[/channel|@recipient]`
|
||||
|
||||
- teams: `teams://tName/token-a/token-b/token-c`
|
||||
|
||||
> **Attention** as the [format of the url has changed](https://github.com/containrrr/shoutrrr/issues/138) you also have to specify a `tName`
|
||||
|
||||
- Email: `smtp://username:password@host:port/?fromAddress=fromAddress&toAddresses=recipient1[,recipient2,...]`
|
||||
|
||||
More details here: [containrrr.dev/shoutrrr/v0.4/services/overview](https://containrrr.dev/shoutrrr/v0.4/services/overview)
|
||||
|
||||
### Overriding Lock Configuration
|
||||
|
||||
The `--ds-name` and `--ds-namespace` arguments should match the name and
|
||||
namespace of the daemonset used to deploy the reboot daemon - the locking is
|
||||
implemented by means of an annotation on this resource. The defaults match
|
||||
the daemonset YAML provided in the repository.
|
||||
|
||||
Similarly `--lock-annotation` can be used to change the name of the
|
||||
annotation kured will use to store the lock, but the default is almost
|
||||
certainly safe.
|
||||
|
||||
## Operation
|
||||
|
||||
The example commands in this section assume that you have not
|
||||
overriden the default lock annotation, daemonset name or namespace;
|
||||
if you have, you will have to adjust the commands accordingly.
|
||||
|
||||
### Testing
|
||||
|
||||
You can test your configuration by provoking a reboot on a node:
|
||||
|
||||
```console
|
||||
sudo touch /var/run/reboot-required
|
||||
```
|
||||
|
||||
### Disabling Reboots
|
||||
|
||||
If you need to temporarily stop kured from rebooting any nodes, you
|
||||
can take the lock manually:
|
||||
|
||||
```console
|
||||
kubectl -n kube-system annotate ds kured weave.works/kured-node-lock='{"nodeID":"manual"}'
|
||||
```
|
||||
|
||||
Don't forget to release it afterwards!
|
||||
|
||||
### Manual Unlock
|
||||
|
||||
In exceptional circumstances, such as a node experiencing a permanent
|
||||
failure whilst rebooting, manual intervention may be required to
|
||||
remove the cluster lock:
|
||||
|
||||
```console
|
||||
kubectl -n kube-system annotate ds kured weave.works/kured-node-lock-
|
||||
```
|
||||
|
||||
> NB the `-` at the end of the command is important - it instructs
|
||||
> `kubectl` to remove that annotation entirely.
|
||||
|
||||
### Automatic Unlock
|
||||
|
||||
In exceptional circumstances (especially when used with cluster-autoscaler) a node
|
||||
which holds lock might be killed thus annotation will stay there for ever.
|
||||
|
||||
Using `--lock-ttl=30m` will allow other nodes to take over if TTL has expired (in this case 30min) and continue reboot process.
|
||||
|
||||
### Delaying Lock Release
|
||||
|
||||
|
||||
Using `--lock-release-delay=30m` will cause nodes to hold the lock for the specified time frame (in this case 30min) before it is released and the reboot process continues. This can be used to throttle reboots across the cluster.
|
||||
|
||||
## Building
|
||||
|
||||
Kured now uses [Go
|
||||
Modules](https://github.com/golang/go/wiki/Modules), so build
|
||||
instructions vary depending on where you have checked out the
|
||||
repository:
|
||||
|
||||
**Building outside $GOPATH:**
|
||||
|
||||
```console
|
||||
make
|
||||
```
|
||||
|
||||
**Building inside $GOPATH:**
|
||||
|
||||
```console
|
||||
GO111MODULE=on make
|
||||
```
|
||||
|
||||
You can find the current preferred version of Golang in the [go.mod file](go.mod).
|
||||
|
||||
If you are interested in contributing code to kured, please take a look at
|
||||
our [development][development] docs.
|
||||
|
||||
[development]: DEVELOPMENT.md
|
||||
|
||||
## Frequently Asked/Anticipated Questions
|
||||
|
||||
### Why is there no `latest` tag on Docker Hub?
|
||||
|
||||
Use of `latest` for production deployments is bad practice - see
|
||||
[here](https://kubernetes.io/docs/concepts/configuration/overview) for
|
||||
details. The manifest on `main` refers to `latest` for local
|
||||
development testing with minikube only; for production use choose a
|
||||
versioned manifest from the [release page](https://github.com/weaveworks/kured/releases/).
|
||||
And there's much more!
|
||||
|
||||
## Getting Help
|
||||
|
||||
If you have any questions about, feedback for or problems with `kured`:
|
||||
|
||||
* Invite yourself to the <a href="https://slack.weave.works/" target="_blank">Weave Users Slack</a>.
|
||||
* Ask a question on the [#kured](https://weave-community.slack.com/messages/kured/) slack channel.
|
||||
* [File an issue](https://github.com/weaveworks/kured/issues/new).
|
||||
* Join us in [our monthly meeting](https://docs.google.com/document/d/1bsHTjHhqaaZ7yJnXF6W8c89UB_yn-OoSZEmDnIP34n8/edit#),
|
||||
every fourth Wednesday of the month at 16:00 UTC.
|
||||
- Invite yourself to the <a href="https://slack.cncf.io/" target="_blank">CNCF Slack</a>.
|
||||
- Ask a question on the [#kured](https://cloud-native.slack.com/archives/kured) slack channel.
|
||||
- [File an issue](https://github.com/kubereboot/kured/issues/new).
|
||||
- Join us in [our monthly meeting](https://docs.google.com/document/d/1AWT8YDdqZY-Se6Y1oAlwtujWLVpNVK2M_F_Vfqw06aI/edit),
|
||||
every first Wednesday of the month at 16:00 UTC.
|
||||
- You might want to [join the kured-dev mailing list](https://lists.cncf.io/g/cncf-kured-dev) as well.
|
||||
|
||||
We follow the [CNCF Code of Conduct](https://github.com/cncf/foundation/blob/master/code-of-conduct.md).
|
||||
We follow the [CNCF Code of Conduct](CODE_OF_CONDUCT.md).
|
||||
|
||||
Your feedback is always welcome!
|
||||
|
||||
## Trademarks
|
||||
|
||||
**Kured is a [Cloud Native Computing Foundation](https://cncf.io/) Sandbox project.**
|
||||
|
||||

|
||||
|
||||
The Linux Foundation® (TLF) has registered trademarks and uses trademarks. For a list of TLF trademarks, see [Trademark Usage](https://www.linuxfoundation.org/trademark-usage/).
|
||||
|
||||
## License
|
||||
|
||||
[](https://app.fossa.com/projects/git%2Bgithub.com%2Fkubereboot%2Fkured?ref=badge_large)
|
||||
|
||||
@@ -1,21 +0,0 @@
|
||||
# Patterns to ignore when building packages.
|
||||
# This supports shell glob matching, relative path matching, and
|
||||
# negation (prefixed with !). Only one pattern per line.
|
||||
.DS_Store
|
||||
# Common VCS dirs
|
||||
.git/
|
||||
.gitignore
|
||||
.bzr/
|
||||
.bzrignore
|
||||
.hg/
|
||||
.hgignore
|
||||
.svn/
|
||||
# Common backup files
|
||||
*.swp
|
||||
*.bak
|
||||
*.tmp
|
||||
*~
|
||||
# Various IDEs
|
||||
.project
|
||||
.idea/
|
||||
*.tmproj
|
||||
@@ -1,14 +0,0 @@
|
||||
apiVersion: v1
|
||||
appVersion: "1.9.1"
|
||||
description: A Helm chart for kured
|
||||
name: kured
|
||||
version: 2.11.1
|
||||
home: https://github.com/weaveworks/kured
|
||||
maintainers:
|
||||
- name: ckotzbauer
|
||||
email: christian.kotzbauer@gmail.com
|
||||
- name: davidkarlsen
|
||||
email: david@davidkarlsen.com
|
||||
sources:
|
||||
- https://github.com/weaveworks/kured
|
||||
icon: https://raw.githubusercontent.com/weaveworks/kured/main/img/logo.png
|
||||
@@ -1,125 +0,0 @@
|
||||
# Kured (KUbernetes REboot Daemon)
|
||||
|
||||
## Introduction
|
||||
This chart installs the "Kubernetes Reboot Daemon" using the Helm Package Manager.
|
||||
|
||||
## Prerequisites
|
||||
- Kubernetes 1.9+
|
||||
|
||||
## Installing the Chart
|
||||
To install the chart with the release name `my-release`:
|
||||
```bash
|
||||
$ helm repo add kured https://weaveworks.github.io/kured
|
||||
$ helm install my-release kured/kured
|
||||
```
|
||||
|
||||
## Uninstalling the Chart
|
||||
To uninstall/delete the `my-release` deployment:
|
||||
```bash
|
||||
$ helm delete my-release
|
||||
```
|
||||
|
||||
The command removes all the Kubernetes components associated with the chart and deletes the release.
|
||||
|
||||
|
||||
## Migrate from stable Helm-Chart
|
||||
The following changes have been made compared to the stable chart:
|
||||
- **[BREAKING CHANGE]** The `autolock` feature was removed. Use `configuration.startTime` and `configuration.endTime` instead.
|
||||
- Role inconsistencies have been fixed (allowed verbs for modifying the `DaemonSet`, apiGroup of `PodSecurityPolicy`)
|
||||
- Added support for affinities.
|
||||
- Configuration of cli-flags can be made through a `configuration` object.
|
||||
- Added optional `Service` and `ServiceMonitor` support for metrics endpoint.
|
||||
|
||||
|
||||
## Configuration
|
||||
|
||||
| Config | Description | Default |
|
||||
| ------ | ----------- | ------- |
|
||||
| `image.repository` | Image repository | `weaveworks/kured` |
|
||||
| `image.tag` | Image tag | `1.9.1` |
|
||||
| `image.pullPolicy` | Image pull policy | `IfNotPresent` |
|
||||
| `image.pullSecrets` | Image pull secrets | `[]` |
|
||||
| `updateStrategy` | Daemonset update strategy | `RollingUpdate` |
|
||||
| `maxUnavailable` | The max pods unavailable during a rolling update | `1` |
|
||||
| `podAnnotations` | Annotations to apply to pods (eg to add Prometheus annotations) | `{}` |
|
||||
| `extraArgs` | Extra arguments to pass to `/usr/bin/kured`. See below. | `{}` |
|
||||
| `extraEnvVars` | Array of environment variables to pass to the daemonset. | `{}` |
|
||||
| `configuration.lockTtl` | cli-parameter `--lock-ttl` | `0` |
|
||||
| `configuration.lockReleaseDelay` | cli-parameter `--lock-release-delay` | `0` |
|
||||
| `configuration.alertFilterRegexp` | cli-parameter `--alert-filter-regexp` | `""` |
|
||||
| `configuration.alertFiringOnly` | cli-parameter `--alert-firing-only` | `false` |
|
||||
| `configuration.blockingPodSelector` | Array of selectors for multiple cli-parameters `--blocking-pod-selector` | `[]` |
|
||||
| `configuration.endTime` | cli-parameter `--end-time` | `""` |
|
||||
| `configuration.lockAnnotation` | cli-parameter `--lock-annotation` | `""` |
|
||||
| `configuration.period` | cli-parameter `--period` | `""` |
|
||||
| `configuration.forceReboot` | cli-parameter `--force-reboot` | `false` |
|
||||
| `configuration.drainGracePeriod` | cli-parameter `--drain-grace-period` | `""` |
|
||||
| `configuration.drainTimeout` | cli-parameter `--drain-timeout` | `""` |
|
||||
| `configuration.skipWaitForDeleteTimeout` | cli-parameter `--skip-wait-for-delete-timeout` | `""` |
|
||||
| `configuration.prometheusUrl` | cli-parameter `--prometheus-url` | `""` |
|
||||
| `configuration.rebootDays` | Array of days for multiple cli-parameters `--reboot-days` | `[]` |
|
||||
| `configuration.rebootSentinel` | cli-parameter `--reboot-sentinel` | `""` |
|
||||
| `configuration.rebootSentinelCommand` | cli-parameter `--reboot-sentinel-command` | `""` |
|
||||
| `configuration.rebootCommand` | cli-parameter `--reboot-command` | `""` |
|
||||
| `configuration.rebootDelay` | cli-parameter `--reboot-delay` | `""` |
|
||||
| `configuration.slackChannel` | cli-parameter `--slack-channel` | `""` |
|
||||
| `configuration.slackHookUrl` | cli-parameter `--slack-hook-url` | `""` |
|
||||
| `configuration.slackUsername` | cli-parameter `--slack-username` | `""` |
|
||||
| `configuration.notifyUrl` | cli-parameter `--notify-url` | `""` |
|
||||
| `configuration.messageTemplateDrain` | cli-parameter `--message-template-drain` | `""` |
|
||||
| `configuration.messageTemplateReboot` | cli-parameter `--message-template-reboot` | `""` |
|
||||
| `configuration.startTime` | cli-parameter `--start-time` | `""` |
|
||||
| `configuration.timeZone` | cli-parameter `--time-zone` | `""` |
|
||||
| `configuration.annotateNodes` | cli-parameter `--annotate-nodes` | `false` |
|
||||
| `configuration.logFormat` | cli-parameter `--log-format` | `"text"` |
|
||||
| `configuration.preferNoScheduleTaint` | Taint name applied during pending node reboot | `""` |
|
||||
| `rbac.create` | Create RBAC roles | `true` |
|
||||
| `serviceAccount.create` | Create a service account | `true` |
|
||||
| `serviceAccount.name` | Service account name to create (or use if `serviceAccount.create` is false) | (chart fullname) |
|
||||
| `podSecurityPolicy.create` | Create podSecurityPolicy | `false` |
|
||||
| `resources` | Resources requests and limits. | `{}` |
|
||||
| `metrics.create` | Create a ServiceMonitor for prometheus-operator | `false` |
|
||||
| `metrics.namespace` | The namespace to create the ServiceMonitor in | `""` |
|
||||
| `metrics.labels` | Additional labels for the ServiceMonitor | `{}` |
|
||||
| `metrics.interval` | Interval prometheus should scrape the endpoint | `60s` |
|
||||
| `metrics.scrapeTimeout` | A custom scrapeTimeout for prometheus | `""` |
|
||||
| `service.create` | Create a Service for the metrics endpoint | `false` |
|
||||
| `service.name ` | Service name for the metrics endpoint | `""` |
|
||||
| `service.port` | Port of the service to expose | `8080` |
|
||||
| `service.annotations` | Annotations to apply to the service (eg to add Prometheus annotations) | `{}` |
|
||||
| `podLabels` | Additional labels for pods (e.g. CostCenter=IT) | `{}` |
|
||||
| `priorityClassName` | Priority Class to be used by the pods | `""` |
|
||||
| `tolerations` | Tolerations to apply to the daemonset (eg to allow running on master) | `[{"key": "node-role.kubernetes.io/master", "effect": "NoSchedule"}]`|
|
||||
| `affinity` | Affinity for the daemonset (ie, restrict which nodes kured runs on) | `{}` |
|
||||
| `nodeSelector` | Node Selector for the daemonset (ie, restrict which nodes kured runs on) | `{}` |
|
||||
| `volumeMounts` | Maps of volumes mount to mount | `{}` |
|
||||
| `volumes` | Maps of volumes to mount | `{}` |
|
||||
See https://github.com/weaveworks/kured#configuration for values (not contained in the `configuration` object) for `extraArgs`. Note that
|
||||
```yaml
|
||||
extraArgs:
|
||||
foo: 1
|
||||
bar-baz: 2
|
||||
```
|
||||
becomes `/usr/bin/kured ... --foo=1 --bar-baz=2`.
|
||||
|
||||
|
||||
## Prometheus Metrics
|
||||
|
||||
Kured exposes a single prometheus metric indicating whether a reboot is required or not (see [kured docs](https://github.com/weaveworks/kured#prometheus-metrics)) for details.
|
||||
|
||||
#### Prometheus-Operator
|
||||
|
||||
```yaml
|
||||
metrics:
|
||||
create: true
|
||||
```
|
||||
|
||||
#### Prometheus Annotations
|
||||
|
||||
```yaml
|
||||
service:
|
||||
annotations:
|
||||
prometheus.io/scrape: "true"
|
||||
prometheus.io/path: "/metrics"
|
||||
prometheus.io/port: "8080"
|
||||
```
|
||||
@@ -1,13 +0,0 @@
|
||||
# This is tested twice:
|
||||
# Basic install test with chart-testing (on charts PRs)
|
||||
# Functional testing in PRs (other PRs)
|
||||
|
||||
service:
|
||||
create: true
|
||||
name: kured-prometheus-endpoint
|
||||
port: 8080
|
||||
type: NodePort
|
||||
nodePort: 30000
|
||||
|
||||
# Do not override the configuration: period in this, so that
|
||||
# We can test prometheus exposed metrics without rebooting.
|
||||
@@ -1,3 +0,0 @@
|
||||
Kured will check for /var/run/reboot-required, and reboot nodes when needed.
|
||||
|
||||
See https://github.com/weaveworks/kured/ for details.
|
||||
@@ -1,72 +0,0 @@
|
||||
{{/* vim: set filetype=mustache: */}}
|
||||
{{/*
|
||||
Expand the name of the chart.
|
||||
*/}}
|
||||
{{- define "kured.name" -}}
|
||||
{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" -}}
|
||||
{{- end -}}
|
||||
|
||||
{{/*
|
||||
Create a default fully qualified app name.
|
||||
We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec).
|
||||
If release name contains chart name it will be used as a full name.
|
||||
*/}}
|
||||
{{- define "kured.fullname" -}}
|
||||
{{- if .Values.fullnameOverride -}}
|
||||
{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" -}}
|
||||
{{- else -}}
|
||||
{{- $name := default .Chart.Name .Values.nameOverride -}}
|
||||
{{- if contains $name .Release.Name -}}
|
||||
{{- .Release.Name | trunc 63 | trimSuffix "-" -}}
|
||||
{{- else -}}
|
||||
{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" -}}
|
||||
{{- end -}}
|
||||
{{- end -}}
|
||||
{{- end -}}
|
||||
|
||||
{{/*
|
||||
Create chart name and version as used by the chart label.
|
||||
*/}}
|
||||
{{- define "kured.chart" -}}
|
||||
{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" -}}
|
||||
{{- end -}}
|
||||
|
||||
{{/*
|
||||
Create the name of the service account to use
|
||||
*/}}
|
||||
{{- define "kured.serviceAccountName" -}}
|
||||
{{- if .Values.serviceAccount.create -}}
|
||||
{{ default (include "kured.fullname" .) .Values.serviceAccount.name }}
|
||||
{{- else -}}
|
||||
{{ default "default" .Values.serviceAccount.name }}
|
||||
{{- end -}}
|
||||
{{- end -}}
|
||||
|
||||
{{/*
|
||||
Return the appropriate apiVersion for podsecuritypolicy.
|
||||
*/}}
|
||||
{{- define "kured.psp.apiVersion" -}}
|
||||
{{- if semverCompare "<1.10-0" .Capabilities.KubeVersion.GitVersion -}}
|
||||
{{- print "extensions/v1beta1" -}}
|
||||
{{- else -}}
|
||||
{{- print "policy/v1beta1" -}}
|
||||
{{- end -}}
|
||||
{{- end -}}
|
||||
|
||||
{{/*
|
||||
Returns a set of labels applied to each resource.
|
||||
*/}}
|
||||
{{- define "kured.labels" -}}
|
||||
app: {{ template "kured.name" . }}
|
||||
chart: {{ template "kured.chart" . }}
|
||||
release: {{ .Release.Name }}
|
||||
heritage: {{ .Release.Service }}
|
||||
{{- end -}}
|
||||
|
||||
{{/*
|
||||
Returns a set of matchLabels applied.
|
||||
*/}}
|
||||
{{- define "kured.matchLabels" -}}
|
||||
app: {{ template "kured.name" . }}
|
||||
release: {{ .Release.Name }}
|
||||
{{- end -}}
|
||||
@@ -1,30 +0,0 @@
|
||||
{{- if .Values.rbac.create -}}
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: ClusterRole
|
||||
metadata:
|
||||
name: {{ template "kured.fullname" . }}
|
||||
labels:
|
||||
{{- include "kured.labels" . | nindent 4 }}
|
||||
rules:
|
||||
# Allow kured to read spec.unschedulable
|
||||
# Allow kubectl to drain/uncordon
|
||||
#
|
||||
# NB: These permissions are tightly coupled to the bundled version of kubectl; the ones below
|
||||
# match https://github.com/kubernetes/kubernetes/blob/v1.19.4/staging/src/k8s.io/kubectl/pkg/cmd/drain/drain.go
|
||||
#
|
||||
- apiGroups: [""]
|
||||
resources: ["nodes"]
|
||||
verbs: ["get", "patch"]
|
||||
- apiGroups: [""]
|
||||
resources: ["pods"]
|
||||
verbs: ["list","delete","get"]
|
||||
- apiGroups: ["extensions"]
|
||||
resources: ["daemonsets"]
|
||||
verbs: ["get"]
|
||||
- apiGroups: ["apps"]
|
||||
resources: ["daemonsets"]
|
||||
verbs: ["get"]
|
||||
- apiGroups: [""]
|
||||
resources: ["pods/eviction"]
|
||||
verbs: ["create"]
|
||||
{{- end -}}
|
||||
@@ -1,16 +0,0 @@
|
||||
{{- if .Values.rbac.create -}}
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: ClusterRoleBinding
|
||||
metadata:
|
||||
name: {{ template "kured.fullname" . }}
|
||||
labels:
|
||||
{{- include "kured.labels" . | nindent 4 }}
|
||||
roleRef:
|
||||
apiGroup: rbac.authorization.k8s.io
|
||||
kind: ClusterRole
|
||||
name: {{ template "kured.fullname" . }}
|
||||
subjects:
|
||||
- kind: ServiceAccount
|
||||
name: {{ template "kured.serviceAccountName" . }}
|
||||
namespace: {{ .Release.Namespace }}
|
||||
{{- end -}}
|
||||
@@ -1,181 +0,0 @@
|
||||
apiVersion: apps/v1
|
||||
kind: DaemonSet
|
||||
metadata:
|
||||
name: {{ template "kured.fullname" . }}
|
||||
namespace: {{ .Release.Namespace }}
|
||||
labels:
|
||||
{{- include "kured.labels" . | nindent 4 }}
|
||||
spec:
|
||||
updateStrategy:
|
||||
type: {{ .Values.updateStrategy }}
|
||||
{{- if eq .Values.updateStrategy "RollingUpdate"}}
|
||||
rollingUpdate:
|
||||
maxUnavailable: {{ .Values.maxUnavailable }}
|
||||
{{- end}}
|
||||
selector:
|
||||
matchLabels:
|
||||
{{- include "kured.matchLabels" . | nindent 6 }}
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
{{- include "kured.labels" . | nindent 8 }}
|
||||
{{- if .Values.podLabels }}
|
||||
{{- toYaml .Values.podLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- if .Values.podAnnotations }}
|
||||
annotations:
|
||||
{{- range $key, $value := .Values.podAnnotations }}
|
||||
{{ $key }}: {{ $value | quote }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
spec:
|
||||
serviceAccountName: {{ template "kured.serviceAccountName" . }}
|
||||
hostPID: true
|
||||
restartPolicy: Always
|
||||
{{- with .Values.image.pullSecrets }}
|
||||
imagePullSecrets:
|
||||
{{ toYaml . | indent 8 }}
|
||||
{{- end }}
|
||||
{{- if .Values.priorityClassName }}
|
||||
priorityClassName: {{ .Values.priorityClassName }}
|
||||
{{- end }}
|
||||
containers:
|
||||
- name: {{ .Chart.Name }}
|
||||
image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}"
|
||||
imagePullPolicy: {{ .Values.image.pullPolicy }}
|
||||
securityContext:
|
||||
privileged: true # Give permission to nsenter /proc/1/ns/mnt
|
||||
resources:
|
||||
{{ toYaml .Values.resources | indent 12 }}
|
||||
command:
|
||||
- /usr/bin/kured
|
||||
args:
|
||||
- --ds-name={{ template "kured.fullname" . }}
|
||||
- --ds-namespace={{ .Release.Namespace }}
|
||||
{{- if .Values.configuration.lockTtl }}
|
||||
- --lock-ttl={{ .Values.configuration.lockTtl }}
|
||||
{{- end }}
|
||||
{{- if .Values.configuration.lockReleaseDelay }}
|
||||
- --lock-release-delay={{ .Values.configuration.lockReleaseDelay }}
|
||||
{{- end }}
|
||||
{{- if .Values.configuration.alertFilterRegexp }}
|
||||
- --alert-filter-regexp={{ .Values.configuration.alertFilterRegexp }}
|
||||
{{- end }}
|
||||
{{- if .Values.configuration.alertFiringOnly }}
|
||||
- --alert-firing-only={{ .Values.configuration.alertFiringOnly }}
|
||||
{{- end }}
|
||||
{{- range .Values.configuration.blockingPodSelector }}
|
||||
- --blocking-pod-selector={{ . }}
|
||||
{{- end }}
|
||||
{{- if .Values.configuration.endTime }}
|
||||
- --end-time={{ .Values.configuration.endTime }}
|
||||
{{- end }}
|
||||
{{- if .Values.configuration.lockAnnotation }}
|
||||
- --lock-annotation={{ .Values.configuration.lockAnnotation }}
|
||||
{{- end }}
|
||||
{{- if .Values.configuration.period }}
|
||||
- --period={{ .Values.configuration.period }}
|
||||
{{- end }}
|
||||
{{- if .Values.configuration.forceReboot }}
|
||||
- --force-reboot
|
||||
{{- end }}
|
||||
{{- if .Values.configuration.drainGracePeriod }}
|
||||
- --drain-grace-period={{ .Values.configuration.drainGracePeriod }}
|
||||
{{- end }}
|
||||
{{- if .Values.configuration.drainTimeout }}
|
||||
- --drain-timeout={{ .Values.configuration.drainTimeout }}
|
||||
{{- end }}
|
||||
{{- if .Values.configuration.skipWaitForDeleteTimeout }}
|
||||
- --skip-wait-for-delete-timeout={{ .Values.configuration.skipWaitForDeleteTimeout }}
|
||||
{{- end }}
|
||||
{{- if .Values.configuration.prometheusUrl }}
|
||||
- --prometheus-url={{ .Values.configuration.prometheusUrl }}
|
||||
{{- end }}
|
||||
{{- range .Values.configuration.rebootDays }}
|
||||
- --reboot-days={{ . }}
|
||||
{{- end }}
|
||||
{{- if .Values.configuration.rebootSentinel }}
|
||||
- --reboot-sentinel={{ .Values.configuration.rebootSentinel }}
|
||||
{{- end }}
|
||||
{{- if .Values.configuration.rebootSentinelCommand }}
|
||||
- --reboot-sentinel-command={{ .Values.configuration.rebootSentinelCommand }}
|
||||
{{- end }}
|
||||
{{- if .Values.configuration.rebootCommand }}
|
||||
- --reboot-command={{ .Values.configuration.rebootCommand }}
|
||||
{{- end }}
|
||||
{{- if .Values.configuration.rebootDelay }}
|
||||
- --reboot-delay={{ .Values.configuration.rebootDelay }}
|
||||
{{- end }}
|
||||
{{- if .Values.configuration.slackChannel }}
|
||||
- --slack-channel={{ .Values.configuration.slackChannel }}
|
||||
{{- end }}
|
||||
{{- if .Values.configuration.slackHookUrl }}
|
||||
- --slack-hook-url={{ .Values.configuration.slackHookUrl }}
|
||||
{{- end }}
|
||||
{{- if .Values.configuration.slackUsername }}
|
||||
- --slack-username={{ .Values.configuration.slackUsername }}
|
||||
{{- end }}
|
||||
{{- if .Values.configuration.notifyUrl }}
|
||||
- --notify-url={{ .Values.configuration.notifyUrl }}
|
||||
{{- end }}
|
||||
{{- if .Values.configuration.messageTemplateDrain }}
|
||||
- --message-template-drain={{ .Values.configuration.messageTemplateDrain }}
|
||||
{{- end }}
|
||||
{{- if .Values.configuration.messageTemplateReboot }}
|
||||
- --message-template-reboot={{ .Values.configuration.messageTemplateReboot }}
|
||||
{{- end }}
|
||||
{{- if .Values.configuration.startTime }}
|
||||
- --start-time={{ .Values.configuration.startTime }}
|
||||
{{- end }}
|
||||
{{- if .Values.configuration.timeZone }}
|
||||
- --time-zone={{ .Values.configuration.timeZone }}
|
||||
{{- end }}
|
||||
{{- if .Values.configuration.annotateNodes }}
|
||||
- --annotate-nodes={{ .Values.configuration.annotateNodes }}
|
||||
{{- end }}
|
||||
{{- if .Values.configuration.preferNoScheduleTaint }}
|
||||
- --prefer-no-schedule-taint={{ .Values.configuration.preferNoScheduleTaint }}
|
||||
{{- end }}
|
||||
{{- if .Values.configuration.logFormat }}
|
||||
- --log-format={{ .Values.configuration.logFormat }}
|
||||
{{- end }}
|
||||
{{- range $key, $value := .Values.extraArgs }}
|
||||
{{- if $value }}
|
||||
- --{{ $key }}={{ $value }}
|
||||
{{- else }}
|
||||
- --{{ $key }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if .Values.volumeMounts }}
|
||||
volumeMounts:
|
||||
{{- toYaml .Values.volumeMounts | nindent 12 }}
|
||||
{{- end }}
|
||||
ports:
|
||||
- containerPort: 8080
|
||||
name: metrics
|
||||
env:
|
||||
# Pass in the name of the node on which this pod is scheduled
|
||||
# for use with drain/uncordon operations and lock acquisition
|
||||
- name: KURED_NODE_ID
|
||||
valueFrom:
|
||||
fieldRef:
|
||||
fieldPath: spec.nodeName
|
||||
{{- if .Values.extraEnvVars }}
|
||||
{{ toYaml .Values.extraEnvVars | nindent 12 }}
|
||||
{{- end }}
|
||||
{{- with .Values.tolerations }}
|
||||
tolerations:
|
||||
{{ toYaml . | indent 8 }}
|
||||
{{- end }}
|
||||
{{- with .Values.nodeSelector }}
|
||||
nodeSelector:
|
||||
{{ toYaml . | indent 8 }}
|
||||
{{- end }}
|
||||
{{- with .Values.affinity }}
|
||||
affinity:
|
||||
{{ toYaml . | indent 8 }}
|
||||
{{- end }}
|
||||
{{- if .Values.volumes }}
|
||||
volumes:
|
||||
{{- toYaml .Values.volumes | nindent 8 }}
|
||||
{{- end }}
|
||||
@@ -1,21 +0,0 @@
|
||||
{{- if .Values.podSecurityPolicy.create}}
|
||||
apiVersion: {{ template "kured.psp.apiVersion" . }}
|
||||
kind: PodSecurityPolicy
|
||||
metadata:
|
||||
name: {{ template "kured.fullname" . }}
|
||||
labels:
|
||||
{{- include "kured.labels" . | nindent 4 }}
|
||||
spec:
|
||||
privileged: true
|
||||
hostPID: true
|
||||
allowedCapabilities: ['*']
|
||||
fsGroup:
|
||||
rule: RunAsAny
|
||||
runAsUser:
|
||||
rule: RunAsAny
|
||||
seLinux:
|
||||
rule: RunAsAny
|
||||
supplementalGroups:
|
||||
rule: RunAsAny
|
||||
volumes: ['*']
|
||||
{{- end }}
|
||||
@@ -1,30 +0,0 @@
|
||||
{{- if .Values.rbac.create -}}
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: Role
|
||||
metadata:
|
||||
namespace: {{ .Release.Namespace }}
|
||||
name: {{ template "kured.fullname" . }}
|
||||
labels:
|
||||
{{- include "kured.labels" . | nindent 4 }}
|
||||
rules:
|
||||
# Allow kured to lock/unlock itself
|
||||
- apiGroups: ["extensions"]
|
||||
resources: ["daemonsets"]
|
||||
resourceNames: ["{{ template "kured.fullname" . }}"]
|
||||
verbs: ["update", "patch"]
|
||||
- apiGroups: ["apps"]
|
||||
resources: ["daemonsets"]
|
||||
resourceNames: ["{{ template "kured.fullname" . }}"]
|
||||
verbs: ["update", "patch"]
|
||||
{{- if .Values.podSecurityPolicy.create }}
|
||||
- apiGroups: ["extensions"]
|
||||
resources: ["podsecuritypolicies"]
|
||||
resourceNames: ["{{ template "kured.fullname" . }}"]
|
||||
verbs: ["use"]
|
||||
- apiGroups: ["policy"]
|
||||
resources: ["podsecuritypolicies"]
|
||||
resourceNames: ["{{ template "kured.fullname" . }}"]
|
||||
verbs: ["use"]
|
||||
{{- end }}
|
||||
|
||||
{{- end -}}
|
||||
@@ -1,17 +0,0 @@
|
||||
{{- if .Values.rbac.create -}}
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: RoleBinding
|
||||
metadata:
|
||||
namespace: {{ .Release.Namespace }}
|
||||
name: {{ template "kured.fullname" . }}
|
||||
labels:
|
||||
{{- include "kured.labels" . | nindent 4 }}
|
||||
subjects:
|
||||
- kind: ServiceAccount
|
||||
namespace: {{ .Release.Namespace }}
|
||||
name: {{ template "kured.serviceAccountName" . }}
|
||||
roleRef:
|
||||
apiGroup: rbac.authorization.k8s.io
|
||||
kind: Role
|
||||
name: {{ template "kured.fullname" . }}
|
||||
{{- end -}}
|
||||
@@ -1,29 +0,0 @@
|
||||
{{- if or .Values.service.create .Values.metrics.create }}
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
{{- if .Values.service.name }}
|
||||
name: {{ .Values.service.name }}
|
||||
{{- else }}
|
||||
name: {{ template "kured.fullname" . }}
|
||||
{{- end }}
|
||||
labels:
|
||||
{{- include "kured.labels" . | nindent 4 }}
|
||||
{{- if .Values.service.annotations }}
|
||||
annotations:
|
||||
{{- range $key, $value := .Values.service.annotations }}
|
||||
{{ $key }}: {{ $value | quote }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
spec:
|
||||
type: {{ .Values.service.type }}
|
||||
ports:
|
||||
- name: metrics
|
||||
port: {{ .Values.service.port }}
|
||||
targetPort: 8080
|
||||
{{- if eq .Values.service.type "NodePort" }}
|
||||
nodePort: {{ .Values.service.nodePort }}
|
||||
{{- end }}
|
||||
selector:
|
||||
{{- include "kured.matchLabels" . | nindent 4 }}
|
||||
{{- end }}
|
||||
@@ -1,9 +0,0 @@
|
||||
{{- if .Values.serviceAccount.create -}}
|
||||
apiVersion: v1
|
||||
kind: ServiceAccount
|
||||
metadata:
|
||||
name: {{ template "kured.serviceAccountName" . }}
|
||||
namespace: {{ .Release.Namespace }}
|
||||
labels:
|
||||
{{- include "kured.labels" . | nindent 4 }}
|
||||
{{- end -}}
|
||||
@@ -1,31 +0,0 @@
|
||||
{{- if .Values.metrics.create }}
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: ServiceMonitor
|
||||
metadata:
|
||||
name: {{ template "kured.fullname" . }}
|
||||
{{- if .Values.metrics.namespace }}
|
||||
namespace: {{ .Values.metrics.namespace }}
|
||||
{{- end }}
|
||||
labels:
|
||||
{{- include "kured.labels" . | nindent 4 }}
|
||||
{{- if .Values.metrics.labels }}
|
||||
{{- toYaml .Values.metrics.labels | nindent 4 }}
|
||||
{{- end }}
|
||||
spec:
|
||||
endpoints:
|
||||
- interval: {{ .Values.metrics.interval }}
|
||||
{{- if .Values.metrics.scrapeTimeout }}
|
||||
scrapeTimeout: {{ .Values.metrics.scrapeTimeout }}
|
||||
{{- end }}
|
||||
honorLabels: true
|
||||
targetPort: 8080
|
||||
path: /metrics
|
||||
scheme: http
|
||||
jobLabel: "{{ .Release.Name }}"
|
||||
selector:
|
||||
matchLabels:
|
||||
{{- include "kured.matchLabels" . | nindent 6 }}
|
||||
namespaceSelector:
|
||||
matchNames:
|
||||
- {{ .Release.Namespace }}
|
||||
{{- end }}
|
||||
@@ -1,31 +0,0 @@
|
||||
image:
|
||||
repository: weaveworks/kured
|
||||
tag: latest
|
||||
|
||||
configuration:
|
||||
# annotationTtl: 0 # force clean annotation after this amount of time (default 0, disabled)
|
||||
# alertFilterRegexp: "" # alert names to ignore when checking for active alerts
|
||||
# alertFiringOnly: false # only consider firing alerts when checking for active alerts
|
||||
# blockingPodSelector: [] # label selector identifying pods whose presence should prevent reboots
|
||||
# endTime: "" # only reboot before this time of day (default "23:59")
|
||||
# lockAnnotation: "" # annotation in which to record locking node (default "weave.works/kured-node-lock")
|
||||
period: "1m" # reboot check period (default 1h0m0s)
|
||||
# forceReboot: false # force a reboot even if the drain fails or times out (default: false)
|
||||
# drainGracePeriod: "" # time in seconds given to each pod to terminate gracefully, if negative, the default value specified in the pod will be used (default: -1)
|
||||
# drainTimeout: "" # timeout after which the drain is aborted (default: 0, infinite time)
|
||||
# skipWaitForDeleteTimeout: "" # when time is greater than zero, skip waiting for the pods whose deletion timestamp is older than N seconds while draining a node (default: 0)
|
||||
# prometheusUrl: "" # Prometheus instance to probe for active alerts
|
||||
# rebootDays: [] # only reboot on these days (default [su,mo,tu,we,th,fr,sa])
|
||||
# rebootSentinel: "" # path to file whose existence signals need to reboot (default "/var/run/reboot-required")
|
||||
# rebootSentinelCommand: "" # command for which a successful run signals need to reboot (default ""). If non-empty, sentinel file will be ignored.
|
||||
# slackChannel: "" # slack channel for reboot notfications
|
||||
# slackHookUrl: "" # slack hook URL for reboot notfications
|
||||
# slackUsername: "" # slack username for reboot notfications (default "kured")
|
||||
# notifyUrl: "" # notification URL with the syntax as follows: https://containrrr.dev/shoutrrr/services/overview/
|
||||
# messageTemplateDrain: "" # slack message template when notifying about a node being drained (default "Draining node %s")
|
||||
# messageTemplateReboot: "" # slack message template when notifying about a node being rebooted (default "Rebooted node %s")
|
||||
# startTime: "" # only reboot after this time of day (default "0:00")
|
||||
# timeZone: "" # time-zone to use (valid zones from "time" golang package)
|
||||
# annotateNodes: false # enable 'weave.works/kured-reboot-in-progress' and 'weave.works/kured-most-recent-reboot-needed' node annotations to signify kured reboot operations
|
||||
# lockReleaseDelay: "5m" # hold lock after reboot by this amount of time (default 0, disabled)
|
||||
# logFormat: "text" # log format specified as text or json, defaults to text
|
||||
@@ -1,95 +0,0 @@
|
||||
image:
|
||||
repository: weaveworks/kured
|
||||
tag: "" # will default to the appVersion in Chart.yaml
|
||||
pullPolicy: IfNotPresent
|
||||
pullSecrets: []
|
||||
|
||||
updateStrategy: RollingUpdate
|
||||
# requires RollingUpdate updateStrategy
|
||||
maxUnavailable: 1
|
||||
|
||||
podAnnotations: {}
|
||||
|
||||
extraArgs: {}
|
||||
|
||||
extraEnvVars:
|
||||
# - name: slackHookUrl
|
||||
# valueFrom:
|
||||
# secretKeyRef:
|
||||
# name: secret_name
|
||||
# key: secret_key
|
||||
# - name: regularEnvVariable
|
||||
# value: 123
|
||||
|
||||
configuration:
|
||||
lockTtl: 0 # force clean annotation after this amount of time (default 0, disabled)
|
||||
alertFilterRegexp: "" # alert names to ignore when checking for active alerts
|
||||
alertFiringOnly: false # only consider firing alerts when checking for active alerts
|
||||
blockingPodSelector: [] # label selector identifying pods whose presence should prevent reboots
|
||||
endTime: "" # only reboot before this time of day (default "23:59")
|
||||
lockAnnotation: "" # annotation in which to record locking node (default "weave.works/kured-node-lock")
|
||||
period: "" # reboot check period (default 1h0m0s)
|
||||
forceReboot: false # force a reboot even if the drain fails or times out (default: false)
|
||||
drainGracePeriod: "" # time in seconds given to each pod to terminate gracefully, if negative, the default value specified in the pod will be used (default: -1)
|
||||
drainTimeout: "" # timeout after which the drain is aborted (default: 0, infinite time)
|
||||
skipWaitForDeleteTimeout: "" # when time is greater than zero, skip waiting for the pods whose deletion timestamp is older than N seconds while draining a node (default: 0)
|
||||
prometheusUrl: "" # Prometheus instance to probe for active alerts
|
||||
rebootDays: [] # only reboot on these days (default [su,mo,tu,we,th,fr,sa])
|
||||
rebootSentinel: "" # path to file whose existence signals need to reboot (default "/var/run/reboot-required")
|
||||
rebootSentinelCommand: "" # command for which a successful run signals need to reboot (default ""). If non-empty, sentinel file will be ignored.
|
||||
rebootCommand: "/bin/systemctl reboot" # command to run when a reboot is required by the sentinel
|
||||
rebootDelay: "" # add a delay after drain finishes but before the reboot command is issued
|
||||
slackChannel: "" # slack channel for reboot notfications
|
||||
slackHookUrl: "" # slack hook URL for reboot notfications
|
||||
slackUsername: "" # slack username for reboot notfications (default "kured")
|
||||
notifyUrl: "" # notification URL with the syntax as follows: https://containrrr.dev/shoutrrr/services/overview/
|
||||
messageTemplateDrain: "" # slack message template when notifying about a node being drained (default "Draining node %s")
|
||||
messageTemplateReboot: "" # slack message template when notifying about a node being rebooted (default "Rebooted node %s")
|
||||
startTime: "" # only reboot after this time of day (default "0:00")
|
||||
timeZone: "" # time-zone to use (valid zones from "time" golang package)
|
||||
annotateNodes: false # enable 'weave.works/kured-reboot-in-progress' and 'weave.works/kured-most-recent-reboot-needed' node annotations to signify kured reboot operations
|
||||
lockReleaseDelay: 0 # hold lock after reboot by this amount of time (default 0, disabled)
|
||||
preferNoScheduleTaint: "" # Taint name applied during pending node reboot (to prevent receiving additional pods from other rebooting nodes). Disabled by default. Set e.g. to "weave.works/kured-node-reboot" to enable tainting.
|
||||
logFormat: "text" # log format specified as text or json, defaults to text
|
||||
|
||||
rbac:
|
||||
create: true
|
||||
|
||||
serviceAccount:
|
||||
create: true
|
||||
name:
|
||||
|
||||
podSecurityPolicy:
|
||||
create: false
|
||||
|
||||
resources: {}
|
||||
|
||||
metrics:
|
||||
create: false
|
||||
namespace: ""
|
||||
labels: {}
|
||||
interval: 60s
|
||||
scrapeTimeout: ""
|
||||
|
||||
service:
|
||||
create: false
|
||||
port: 8080
|
||||
annotations: {}
|
||||
name: ""
|
||||
type: ClusterIP
|
||||
|
||||
podLabels: {}
|
||||
|
||||
priorityClassName: ""
|
||||
|
||||
tolerations:
|
||||
- key: node-role.kubernetes.io/master
|
||||
effect: NoSchedule
|
||||
|
||||
affinity: {}
|
||||
|
||||
nodeSelector: {}
|
||||
|
||||
volumeMounts: []
|
||||
|
||||
volumes: []
|
||||
@@ -1,4 +0,0 @@
|
||||
FROM alpine:3.15.0
|
||||
RUN apk update --no-cache && apk upgrade --no-cache && apk add --no-cache ca-certificates tzdata
|
||||
COPY ./kured /usr/bin/kured
|
||||
ENTRYPOINT ["/usr/bin/kured"]
|
||||
@@ -9,7 +9,9 @@ import (
|
||||
"net/url"
|
||||
"os"
|
||||
"os/exec"
|
||||
"reflect"
|
||||
"regexp"
|
||||
"sort"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
@@ -28,13 +30,13 @@ import (
|
||||
"github.com/google/shlex"
|
||||
|
||||
shoutrrr "github.com/containrrr/shoutrrr"
|
||||
"github.com/kubereboot/kured/pkg/alerts"
|
||||
"github.com/kubereboot/kured/pkg/daemonsetlock"
|
||||
"github.com/kubereboot/kured/pkg/delaytick"
|
||||
"github.com/kubereboot/kured/pkg/taints"
|
||||
"github.com/kubereboot/kured/pkg/timewindow"
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
"github.com/prometheus/client_golang/prometheus/promhttp"
|
||||
"github.com/weaveworks/kured/pkg/alerts"
|
||||
"github.com/weaveworks/kured/pkg/daemonsetlock"
|
||||
"github.com/weaveworks/kured/pkg/delaytick"
|
||||
"github.com/weaveworks/kured/pkg/taints"
|
||||
"github.com/weaveworks/kured/pkg/timewindow"
|
||||
)
|
||||
|
||||
var (
|
||||
@@ -45,7 +47,10 @@ var (
|
||||
drainTimeout time.Duration
|
||||
rebootDelay time.Duration
|
||||
period time.Duration
|
||||
metricsHost string
|
||||
metricsPort int
|
||||
drainGracePeriod int
|
||||
drainPodSelector string
|
||||
skipWaitForDeleteTimeoutSeconds int
|
||||
dsNamespace string
|
||||
dsName string
|
||||
@@ -55,6 +60,7 @@ var (
|
||||
prometheusURL string
|
||||
preferNoScheduleTaintName string
|
||||
alertFilter *regexp.Regexp
|
||||
alertFilterMatchOnly bool
|
||||
alertFiringOnly bool
|
||||
rebootSentinelFile string
|
||||
rebootSentinelCommand string
|
||||
@@ -64,10 +70,14 @@ var (
|
||||
slackChannel string
|
||||
messageTemplateDrain string
|
||||
messageTemplateReboot string
|
||||
messageTemplateUncordon string
|
||||
podSelectors []string
|
||||
rebootCommand string
|
||||
logFormat string
|
||||
preRebootNodeLabels []string
|
||||
postRebootNodeLabels []string
|
||||
nodeID string
|
||||
concurrency int
|
||||
|
||||
rebootDays []string
|
||||
rebootStart string
|
||||
@@ -118,11 +128,17 @@ func NewRootCommand() *cobra.Command {
|
||||
rootCmd.PersistentFlags().StringVar(&nodeID, "node-id", "",
|
||||
"node name kured runs on, should be passed down from spec.nodeName via KURED_NODE_ID environment variable")
|
||||
rootCmd.PersistentFlags().BoolVar(&forceReboot, "force-reboot", false,
|
||||
"force a reboot even if the drain fails or times out (default: false)")
|
||||
"force a reboot even if the drain fails or times out")
|
||||
rootCmd.PersistentFlags().StringVar(&metricsHost, "metrics-host", "",
|
||||
"host where metrics will listen")
|
||||
rootCmd.PersistentFlags().IntVar(&metricsPort, "metrics-port", 8080,
|
||||
"port number where metrics will listen")
|
||||
rootCmd.PersistentFlags().IntVar(&drainGracePeriod, "drain-grace-period", -1,
|
||||
"time in seconds given to each pod to terminate gracefully, if negative, the default value specified in the pod will be used (default: -1)")
|
||||
"time in seconds given to each pod to terminate gracefully, if negative, the default value specified in the pod will be used")
|
||||
rootCmd.PersistentFlags().StringVar(&drainPodSelector, "drain-pod-selector", "",
|
||||
"only drain pods with labels matching the selector (default: '', all pods)")
|
||||
rootCmd.PersistentFlags().IntVar(&skipWaitForDeleteTimeoutSeconds, "skip-wait-for-delete-timeout", 0,
|
||||
"when seconds is greater than zero, skip waiting for the pods whose deletion timestamp is older than N seconds while draining a node (default: 0)")
|
||||
"when seconds is greater than zero, skip waiting for the pods whose deletion timestamp is older than N seconds while draining a node")
|
||||
rootCmd.PersistentFlags().DurationVar(&drainTimeout, "drain-timeout", 0,
|
||||
"timeout after which the drain is aborted (default: 0, infinite time)")
|
||||
rootCmd.PersistentFlags().DurationVar(&rebootDelay, "reboot-delay", 0,
|
||||
@@ -143,8 +159,10 @@ func NewRootCommand() *cobra.Command {
|
||||
"Prometheus instance to probe for active alerts")
|
||||
rootCmd.PersistentFlags().Var(®expValue{&alertFilter}, "alert-filter-regexp",
|
||||
"alert names to ignore when checking for active alerts")
|
||||
rootCmd.PersistentFlags().BoolVar(&alertFilterMatchOnly, "alert-filter-match-only", false,
|
||||
"Only block if the alert-filter-regexp matches active alerts")
|
||||
rootCmd.PersistentFlags().BoolVar(&alertFiringOnly, "alert-firing-only", false,
|
||||
"only consider firing alerts when checking for active alerts (default: false)")
|
||||
"only consider firing alerts when checking for active alerts")
|
||||
rootCmd.PersistentFlags().StringVar(&rebootSentinelFile, "reboot-sentinel", "/var/run/reboot-required",
|
||||
"path to file whose existence triggers the reboot command")
|
||||
rootCmd.PersistentFlags().StringVar(&preferNoScheduleTaintName, "prefer-no-schedule-taint", "",
|
||||
@@ -153,15 +171,19 @@ func NewRootCommand() *cobra.Command {
|
||||
"command for which a zero return code will trigger a reboot command")
|
||||
rootCmd.PersistentFlags().StringVar(&rebootCommand, "reboot-command", "/bin/systemctl reboot",
|
||||
"command to run when a reboot is required")
|
||||
rootCmd.PersistentFlags().IntVar(&concurrency, "concurrency", 1,
|
||||
"amount of nodes to concurrently reboot. Defaults to 1")
|
||||
|
||||
rootCmd.PersistentFlags().StringVar(&slackHookURL, "slack-hook-url", "",
|
||||
"slack hook URL for notifications")
|
||||
"slack hook URL for reboot notifications [deprecated in favor of --notify-url]")
|
||||
rootCmd.PersistentFlags().StringVar(&slackUsername, "slack-username", "kured",
|
||||
"slack username for notifications")
|
||||
"slack username for reboot notifications")
|
||||
rootCmd.PersistentFlags().StringVar(&slackChannel, "slack-channel", "",
|
||||
"slack channel for reboot notfications")
|
||||
"slack channel for reboot notifications")
|
||||
rootCmd.PersistentFlags().StringVar(¬ifyURL, "notify-url", "",
|
||||
"notify URL for reboot notfications")
|
||||
"notify URL for reboot notifications (cannot use with --slack-hook-url flags)")
|
||||
rootCmd.PersistentFlags().StringVar(&messageTemplateUncordon, "message-template-uncordon", "Node %s rebooted & uncordoned successfully!",
|
||||
"message template used to notify about a node being successfully uncordoned")
|
||||
rootCmd.PersistentFlags().StringVar(&messageTemplateDrain, "message-template-drain", "Draining node %s",
|
||||
"message template used to notify about a node being drained")
|
||||
rootCmd.PersistentFlags().StringVar(&messageTemplateReboot, "message-template-reboot", "Rebooting node %s",
|
||||
@@ -185,15 +207,23 @@ func NewRootCommand() *cobra.Command {
|
||||
rootCmd.PersistentFlags().StringVar(&logFormat, "log-format", "text",
|
||||
"use text or json log format")
|
||||
|
||||
rootCmd.PersistentFlags().StringSliceVar(&preRebootNodeLabels, "pre-reboot-node-labels", nil,
|
||||
"labels to add to nodes before cordoning")
|
||||
rootCmd.PersistentFlags().StringSliceVar(&postRebootNodeLabels, "post-reboot-node-labels", nil,
|
||||
"labels to add to nodes after uncordoning")
|
||||
|
||||
return rootCmd
|
||||
}
|
||||
|
||||
// temporary func that checks for deprecated slack-notification-related flags
|
||||
// func that checks for deprecated slack-notification-related flags and node labels that do not match
|
||||
func flagCheck(cmd *cobra.Command, args []string) {
|
||||
if slackHookURL != "" && notifyURL != "" {
|
||||
log.Warnf("Cannot use both --notify-url and --slack-hook-url flags. Kured will use --notify-url flag only...")
|
||||
}
|
||||
if slackHookURL != "" {
|
||||
if notifyURL != "" {
|
||||
notifyURL = stripQuotes(notifyURL)
|
||||
} else if slackHookURL != "" {
|
||||
slackHookURL = stripQuotes(slackHookURL)
|
||||
log.Warnf("Deprecated flag(s). Please use --notify-url flag instead.")
|
||||
trataURL, err := url.Parse(slackHookURL)
|
||||
if err != nil {
|
||||
@@ -205,6 +235,31 @@ func flagCheck(cmd *cobra.Command, args []string) {
|
||||
notifyURL = fmt.Sprintf("slack://%s", strings.Trim(trataURL.Path, "/services/"))
|
||||
}
|
||||
}
|
||||
var preRebootNodeLabelKeys, postRebootNodeLabelKeys []string
|
||||
for _, label := range preRebootNodeLabels {
|
||||
preRebootNodeLabelKeys = append(preRebootNodeLabelKeys, strings.Split(label, "=")[0])
|
||||
}
|
||||
for _, label := range postRebootNodeLabels {
|
||||
postRebootNodeLabelKeys = append(postRebootNodeLabelKeys, strings.Split(label, "=")[0])
|
||||
}
|
||||
sort.Strings(preRebootNodeLabelKeys)
|
||||
sort.Strings(postRebootNodeLabelKeys)
|
||||
if !reflect.DeepEqual(preRebootNodeLabelKeys, postRebootNodeLabelKeys) {
|
||||
log.Warnf("pre-reboot-node-labels keys and post-reboot-node-labels keys do not match. This may result in unexpected behaviour.")
|
||||
}
|
||||
}
|
||||
|
||||
// stripQuotes removes any literal single or double quote chars that surround a string
|
||||
func stripQuotes(str string) string {
|
||||
if len(str) > 2 {
|
||||
firstChar := str[0]
|
||||
lastChar := str[len(str)-1]
|
||||
if firstChar == lastChar && (firstChar == '"' || firstChar == '\'') {
|
||||
return str[1 : len(str)-1]
|
||||
}
|
||||
}
|
||||
// return the original string if it has a length of zero or one
|
||||
return str
|
||||
}
|
||||
|
||||
// bindViper initializes viper and binds command flags with environment variables
|
||||
@@ -269,7 +324,8 @@ func buildHostCommand(pid int, command []string) []string {
|
||||
}
|
||||
|
||||
func rebootRequired(sentinelCommand []string) bool {
|
||||
if err := newCommand(sentinelCommand[0], sentinelCommand[1:]...).Run(); err != nil {
|
||||
cmd := newCommand(sentinelCommand[0], sentinelCommand[1:]...)
|
||||
if err := cmd.Run(); err != nil {
|
||||
switch err := err.(type) {
|
||||
case *exec.ExitError:
|
||||
// We assume a non-zero exit code means 'reboot not required', but of course
|
||||
@@ -277,6 +333,9 @@ func rebootRequired(sentinelCommand []string) bool {
|
||||
// went wrong during its execution. In that case, not entering a reboot loop
|
||||
// is the right thing to do, and we are logging stdout/stderr of the command
|
||||
// so it should be obvious what is wrong.
|
||||
if cmd.ProcessState.ExitCode() != 1 {
|
||||
log.Warnf("sentinel command ended with unexpected exit code: %v", cmd.ProcessState.ExitCode())
|
||||
}
|
||||
return false
|
||||
default:
|
||||
// Something was grossly misconfigured, such as the command path being wrong.
|
||||
@@ -302,6 +361,8 @@ type PrometheusBlockingChecker struct {
|
||||
filter *regexp.Regexp
|
||||
// bool to indicate if only firing alerts should be considered
|
||||
firingOnly bool
|
||||
// bool to indicate that we're only blocking on alerts which match the filter
|
||||
filterMatchOnly bool
|
||||
}
|
||||
|
||||
// KubernetesBlockingChecker contains info for connecting
|
||||
@@ -315,8 +376,7 @@ type KubernetesBlockingChecker struct {
|
||||
}
|
||||
|
||||
func (pb PrometheusBlockingChecker) isBlocked() bool {
|
||||
|
||||
alertNames, err := pb.promClient.ActiveAlerts(pb.filter, pb.firingOnly)
|
||||
alertNames, err := pb.promClient.ActiveAlerts(pb.filter, pb.firingOnly, pb.filterMatchOnly)
|
||||
if err != nil {
|
||||
log.Warnf("Reboot blocked: prometheus query error: %v", err)
|
||||
return true
|
||||
@@ -335,7 +395,7 @@ func (pb PrometheusBlockingChecker) isBlocked() bool {
|
||||
func (kb KubernetesBlockingChecker) isBlocked() bool {
|
||||
fieldSelector := fmt.Sprintf("spec.nodeName=%s,status.phase!=Succeeded,status.phase!=Failed,status.phase!=Unknown", kb.nodename)
|
||||
for _, labelSelector := range kb.filter {
|
||||
podList, err := kb.client.CoreV1().Pods("").List(context.TODO(), metav1.ListOptions{
|
||||
podList, err := kb.client.CoreV1().Pods("").List(context.Background(), metav1.ListOptions{
|
||||
LabelSelector: labelSelector,
|
||||
FieldSelector: fieldSelector,
|
||||
Limit: 10})
|
||||
@@ -368,8 +428,14 @@ func rebootBlocked(blockers ...RebootBlocker) bool {
|
||||
return false
|
||||
}
|
||||
|
||||
func holding(lock *daemonsetlock.DaemonSetLock, metadata interface{}) bool {
|
||||
holding, err := lock.Test(metadata)
|
||||
func holding(lock *daemonsetlock.DaemonSetLock, metadata interface{}, isMultiLock bool) bool {
|
||||
var holding bool
|
||||
var err error
|
||||
if isMultiLock {
|
||||
holding, err = lock.TestMultiple()
|
||||
} else {
|
||||
holding, err = lock.Test(metadata)
|
||||
}
|
||||
if err != nil {
|
||||
log.Fatalf("Error testing lock: %v", err)
|
||||
}
|
||||
@@ -379,8 +445,17 @@ func holding(lock *daemonsetlock.DaemonSetLock, metadata interface{}) bool {
|
||||
return holding
|
||||
}
|
||||
|
||||
func acquire(lock *daemonsetlock.DaemonSetLock, metadata interface{}, TTL time.Duration) bool {
|
||||
holding, holder, err := lock.Acquire(metadata, TTL)
|
||||
func acquire(lock *daemonsetlock.DaemonSetLock, metadata interface{}, TTL time.Duration, maxOwners int) bool {
|
||||
var holding bool
|
||||
var holder string
|
||||
var err error
|
||||
if maxOwners > 1 {
|
||||
var holders []string
|
||||
holding, holders, err = lock.AcquireMultiple(metadata, TTL, maxOwners)
|
||||
holder = strings.Join(holders, ",")
|
||||
} else {
|
||||
holding, holder, err = lock.Acquire(metadata, TTL)
|
||||
}
|
||||
switch {
|
||||
case err != nil:
|
||||
log.Fatalf("Error acquiring lock: %v", err)
|
||||
@@ -401,16 +476,27 @@ func throttle(releaseDelay time.Duration) {
|
||||
}
|
||||
}
|
||||
|
||||
func release(lock *daemonsetlock.DaemonSetLock) {
|
||||
func release(lock *daemonsetlock.DaemonSetLock, isMultiLock bool) {
|
||||
log.Infof("Releasing lock")
|
||||
if err := lock.Release(); err != nil {
|
||||
|
||||
var err error
|
||||
if isMultiLock {
|
||||
err = lock.ReleaseMultiple()
|
||||
} else {
|
||||
err = lock.Release()
|
||||
}
|
||||
if err != nil {
|
||||
log.Fatalf("Error releasing lock: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func drain(client *kubernetes.Clientset, node *v1.Node) {
|
||||
func drain(client *kubernetes.Clientset, node *v1.Node) error {
|
||||
nodename := node.GetName()
|
||||
|
||||
if preRebootNodeLabels != nil {
|
||||
updateNodeLabels(client, node, preRebootNodeLabels)
|
||||
}
|
||||
|
||||
log.Infof("Draining node %s", nodename)
|
||||
|
||||
if notifyURL != "" {
|
||||
@@ -423,6 +509,7 @@ func drain(client *kubernetes.Clientset, node *v1.Node) {
|
||||
Client: client,
|
||||
Ctx: context.Background(),
|
||||
GracePeriodSeconds: drainGracePeriod,
|
||||
PodSelector: drainPodSelector,
|
||||
SkipWaitForDeleteTimeoutSeconds: skipWaitForDeleteTimeoutSeconds,
|
||||
Force: true,
|
||||
DeleteEmptyDirData: true,
|
||||
@@ -433,23 +520,18 @@ func drain(client *kubernetes.Clientset, node *v1.Node) {
|
||||
}
|
||||
|
||||
if err := kubectldrain.RunCordonOrUncordon(drainer, node, true); err != nil {
|
||||
if !forceReboot {
|
||||
log.Fatalf("Error cordonning %s: %v", nodename, err)
|
||||
}
|
||||
log.Errorf("Error cordonning %s: %v, continuing with reboot anyway", nodename, err)
|
||||
return
|
||||
log.Errorf("Error cordonning %s: %v", nodename, err)
|
||||
return err
|
||||
}
|
||||
|
||||
if err := kubectldrain.RunNodeDrain(drainer, nodename); err != nil {
|
||||
if !forceReboot {
|
||||
log.Fatalf("Error draining %s: %v", nodename, err)
|
||||
}
|
||||
log.Errorf("Error draining %s: %v, continuing with reboot anyway", nodename, err)
|
||||
return
|
||||
log.Errorf("Error draining %s: %v", nodename, err)
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func uncordon(client *kubernetes.Clientset, node *v1.Node) {
|
||||
func uncordon(client *kubernetes.Clientset, node *v1.Node) error {
|
||||
nodename := node.GetName()
|
||||
log.Infof("Uncordoning node %s", nodename)
|
||||
drainer := &kubectldrain.Helper{
|
||||
@@ -460,7 +542,11 @@ func uncordon(client *kubernetes.Clientset, node *v1.Node) {
|
||||
}
|
||||
if err := kubectldrain.RunCordonOrUncordon(drainer, node, false); err != nil {
|
||||
log.Fatalf("Error uncordonning %s: %v", nodename, err)
|
||||
return err
|
||||
} else if postRebootNodeLabels != nil {
|
||||
updateNodeLabels(client, node, postRebootNodeLabels)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func invokeReboot(nodeID string, rebootCommand []string) {
|
||||
@@ -493,10 +579,11 @@ type nodeMeta struct {
|
||||
Unschedulable bool `json:"unschedulable"`
|
||||
}
|
||||
|
||||
func addNodeAnnotations(client *kubernetes.Clientset, nodeID string, annotations map[string]string) {
|
||||
node, err := client.CoreV1().Nodes().Get(context.TODO(), nodeID, metav1.GetOptions{})
|
||||
func addNodeAnnotations(client *kubernetes.Clientset, nodeID string, annotations map[string]string) error {
|
||||
node, err := client.CoreV1().Nodes().Get(context.Background(), nodeID, metav1.GetOptions{})
|
||||
if err != nil {
|
||||
log.Fatalf("Error retrieving node object via k8s API: %s", err)
|
||||
log.Errorf("Error retrieving node object via k8s API: %s", err)
|
||||
return err
|
||||
}
|
||||
for k, v := range annotations {
|
||||
node.Annotations[k] = v
|
||||
@@ -505,29 +592,64 @@ func addNodeAnnotations(client *kubernetes.Clientset, nodeID string, annotations
|
||||
|
||||
bytes, err := json.Marshal(node)
|
||||
if err != nil {
|
||||
log.Fatalf("Error marshalling node object into JSON: %v", err)
|
||||
log.Errorf("Error marshalling node object into JSON: %v", err)
|
||||
return err
|
||||
}
|
||||
|
||||
_, err = client.CoreV1().Nodes().Patch(context.TODO(), node.GetName(), types.StrategicMergePatchType, bytes, metav1.PatchOptions{})
|
||||
_, err = client.CoreV1().Nodes().Patch(context.Background(), node.GetName(), types.StrategicMergePatchType, bytes, metav1.PatchOptions{})
|
||||
if err != nil {
|
||||
var annotationsErr string
|
||||
for k, v := range annotations {
|
||||
annotationsErr += fmt.Sprintf("%s=%s ", k, v)
|
||||
}
|
||||
log.Fatalf("Error adding node annotations %s via k8s API: %v", annotationsErr, err)
|
||||
log.Errorf("Error adding node annotations %s via k8s API: %v", annotationsErr, err)
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func deleteNodeAnnotation(client *kubernetes.Clientset, nodeID, key string) {
|
||||
func deleteNodeAnnotation(client *kubernetes.Clientset, nodeID, key string) error {
|
||||
log.Infof("Deleting node %s annotation %s", nodeID, key)
|
||||
|
||||
// JSON Patch takes as path input a JSON Pointer, defined in RFC6901
|
||||
// So we replace all instances of "/" with "~1" as per:
|
||||
// https://tools.ietf.org/html/rfc6901#section-3
|
||||
patch := []byte(fmt.Sprintf("[{\"op\":\"remove\",\"path\":\"/metadata/annotations/%s\"}]", strings.ReplaceAll(key, "/", "~1")))
|
||||
_, err := client.CoreV1().Nodes().Patch(context.TODO(), nodeID, types.JSONPatchType, patch, metav1.PatchOptions{})
|
||||
_, err := client.CoreV1().Nodes().Patch(context.Background(), nodeID, types.JSONPatchType, patch, metav1.PatchOptions{})
|
||||
if err != nil {
|
||||
log.Fatalf("Error deleting node annotation %s via k8s API: %v", key, err)
|
||||
log.Errorf("Error deleting node annotation %s via k8s API: %v", key, err)
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func updateNodeLabels(client *kubernetes.Clientset, node *v1.Node, labels []string) {
|
||||
labelsMap := make(map[string]string)
|
||||
for _, label := range labels {
|
||||
k := strings.Split(label, "=")[0]
|
||||
v := strings.Split(label, "=")[1]
|
||||
labelsMap[k] = v
|
||||
log.Infof("Updating node %s label: %s=%s", node.GetName(), k, v)
|
||||
}
|
||||
|
||||
bytes, err := json.Marshal(map[string]interface{}{
|
||||
"metadata": map[string]interface{}{
|
||||
"labels": labelsMap,
|
||||
},
|
||||
})
|
||||
if err != nil {
|
||||
log.Fatalf("Error marshalling node object into JSON: %v", err)
|
||||
}
|
||||
|
||||
_, err = client.CoreV1().Nodes().Patch(context.Background(), node.GetName(), types.StrategicMergePatchType, bytes, metav1.PatchOptions{})
|
||||
if err != nil {
|
||||
var labelsErr string
|
||||
for _, label := range labels {
|
||||
k := strings.Split(label, "=")[0]
|
||||
v := strings.Split(label, "=")[1]
|
||||
labelsErr += fmt.Sprintf("%s=%s ", k, v)
|
||||
}
|
||||
log.Errorf("Error updating node labels %s via k8s API: %v", labelsErr, err)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -545,26 +667,47 @@ func rebootAsRequired(nodeID string, rebootCommand []string, sentinelCommand []s
|
||||
lock := daemonsetlock.New(client, nodeID, dsNamespace, dsName, lockAnnotation)
|
||||
|
||||
nodeMeta := nodeMeta{}
|
||||
if holding(lock, &nodeMeta) {
|
||||
node, err := client.CoreV1().Nodes().Get(context.TODO(), nodeID, metav1.GetOptions{})
|
||||
if err != nil {
|
||||
log.Fatalf("Error retrieving node object via k8s API: %v", err)
|
||||
}
|
||||
if !nodeMeta.Unschedulable {
|
||||
uncordon(client, node)
|
||||
}
|
||||
// If we're holding the lock we know we've tried, in a prior run, to reboot
|
||||
// So (1) we want to confirm that the reboot succeeded practically ( !rebootRequired() )
|
||||
// And (2) check if we previously annotated the node that it was in the process of being rebooted,
|
||||
// And finally (3) if it has that annotation, to delete it.
|
||||
// This indicates to other node tools running on the cluster that this node may be a candidate for maintenance
|
||||
if annotateNodes && !rebootRequired(sentinelCommand) {
|
||||
if _, ok := node.Annotations[KuredRebootInProgressAnnotation]; ok {
|
||||
deleteNodeAnnotation(client, nodeID, KuredRebootInProgressAnnotation)
|
||||
source := rand.NewSource(time.Now().UnixNano())
|
||||
tick := delaytick.New(source, 1*time.Minute)
|
||||
for range tick {
|
||||
if holding(lock, &nodeMeta, concurrency > 1) {
|
||||
node, err := client.CoreV1().Nodes().Get(context.Background(), nodeID, metav1.GetOptions{})
|
||||
if err != nil {
|
||||
log.Errorf("Error retrieving node object via k8s API: %v", err)
|
||||
continue
|
||||
}
|
||||
if !nodeMeta.Unschedulable {
|
||||
err = uncordon(client, node)
|
||||
if err != nil {
|
||||
log.Errorf("Unable to uncordon %s: %v, will continue to hold lock and retry uncordon", node.GetName(), err)
|
||||
continue
|
||||
} else {
|
||||
if notifyURL != "" {
|
||||
if err := shoutrrr.Send(notifyURL, fmt.Sprintf(messageTemplateUncordon, nodeID)); err != nil {
|
||||
log.Warnf("Error notifying: %v", err)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// If we're holding the lock we know we've tried, in a prior run, to reboot
|
||||
// So (1) we want to confirm that the reboot succeeded practically ( !rebootRequired() )
|
||||
// And (2) check if we previously annotated the node that it was in the process of being rebooted,
|
||||
// And finally (3) if it has that annotation, to delete it.
|
||||
// This indicates to other node tools running on the cluster that this node may be a candidate for maintenance
|
||||
if annotateNodes && !rebootRequired(sentinelCommand) {
|
||||
if _, ok := node.Annotations[KuredRebootInProgressAnnotation]; ok {
|
||||
err := deleteNodeAnnotation(client, nodeID, KuredRebootInProgressAnnotation)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
}
|
||||
}
|
||||
throttle(releaseDelay)
|
||||
release(lock, concurrency > 1)
|
||||
break
|
||||
} else {
|
||||
break
|
||||
}
|
||||
throttle(releaseDelay)
|
||||
release(lock)
|
||||
}
|
||||
|
||||
preferNoScheduleTaint := taints.New(client, nodeID, preferNoScheduleTaintName, v1.TaintEffectPreferNoSchedule)
|
||||
@@ -580,8 +723,8 @@ func rebootAsRequired(nodeID string, rebootCommand []string, sentinelCommand []s
|
||||
log.Fatal("Unable to create prometheus client: ", err)
|
||||
}
|
||||
|
||||
source := rand.NewSource(time.Now().UnixNano())
|
||||
tick := delaytick.New(source, period)
|
||||
source = rand.NewSource(time.Now().UnixNano())
|
||||
tick = delaytick.New(source, period)
|
||||
for range tick {
|
||||
if !window.Contains(time.Now()) {
|
||||
// Remove taint outside the reboot time window to allow for normal operation.
|
||||
@@ -596,19 +739,7 @@ func rebootAsRequired(nodeID string, rebootCommand []string, sentinelCommand []s
|
||||
}
|
||||
log.Infof("Reboot required")
|
||||
|
||||
var blockCheckers []RebootBlocker
|
||||
if prometheusURL != "" {
|
||||
blockCheckers = append(blockCheckers, PrometheusBlockingChecker{promClient: promClient, filter: alertFilter, firingOnly: alertFiringOnly})
|
||||
}
|
||||
if podSelectors != nil {
|
||||
blockCheckers = append(blockCheckers, KubernetesBlockingChecker{client: client, nodename: nodeID, filter: podSelectors})
|
||||
}
|
||||
|
||||
if rebootBlocked(blockCheckers...) {
|
||||
continue
|
||||
}
|
||||
|
||||
node, err := client.CoreV1().Nodes().Get(context.TODO(), nodeID, metav1.GetOptions{})
|
||||
node, err := client.CoreV1().Nodes().Get(context.Background(), nodeID, metav1.GetOptions{})
|
||||
if err != nil {
|
||||
log.Fatalf("Error retrieving node object via k8s API: %v", err)
|
||||
}
|
||||
@@ -623,17 +754,41 @@ func rebootAsRequired(nodeID string, rebootCommand []string, sentinelCommand []s
|
||||
annotations := map[string]string{KuredRebootInProgressAnnotation: timeNowString}
|
||||
// & annotate this node with a timestamp so that other node maintenance tools know how long it's been since this node has been marked for reboot
|
||||
annotations[KuredMostRecentRebootNeededAnnotation] = timeNowString
|
||||
addNodeAnnotations(client, nodeID, annotations)
|
||||
err := addNodeAnnotations(client, nodeID, annotations)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if !acquire(lock, &nodeMeta, TTL) {
|
||||
if !holding(lock, &nodeMeta, concurrency > 1) && !acquire(lock, &nodeMeta, TTL, concurrency) {
|
||||
// Prefer to not schedule pods onto this node to avoid draing the same pod multiple times.
|
||||
preferNoScheduleTaint.Enable()
|
||||
continue
|
||||
}
|
||||
|
||||
drain(client, node)
|
||||
var blockCheckers []RebootBlocker
|
||||
if prometheusURL != "" {
|
||||
blockCheckers = append(blockCheckers, PrometheusBlockingChecker{promClient: promClient, filter: alertFilter, firingOnly: alertFiringOnly, filterMatchOnly: alertFilterMatchOnly})
|
||||
}
|
||||
if podSelectors != nil {
|
||||
blockCheckers = append(blockCheckers, KubernetesBlockingChecker{client: client, nodename: nodeID, filter: podSelectors})
|
||||
}
|
||||
|
||||
if rebootBlocked(blockCheckers...) {
|
||||
continue
|
||||
}
|
||||
|
||||
err = drain(client, node)
|
||||
if err != nil {
|
||||
if !forceReboot {
|
||||
log.Errorf("Unable to cordon or drain %s: %v, will release lock and retry cordon and drain before rebooting when lock is next acquired", node.GetName(), err)
|
||||
release(lock, concurrency > 1)
|
||||
log.Infof("Performing a best-effort uncordon after failed cordon and drain")
|
||||
uncordon(client, node)
|
||||
continue
|
||||
}
|
||||
}
|
||||
|
||||
if rebootDelay > 0 {
|
||||
log.Infof("Delaying reboot for %v", rebootDelay)
|
||||
@@ -706,6 +861,7 @@ func root(cmd *cobra.Command, args []string) {
|
||||
log.Infof("Blocking Pod Selectors: %v", podSelectors)
|
||||
log.Infof("Reboot schedule: %v", window)
|
||||
log.Infof("Reboot check command: %s every %v", sentinelCommand, period)
|
||||
log.Infof("Concurrency: %v", concurrency)
|
||||
log.Infof("Reboot command: %s", restartCommand)
|
||||
if annotateNodes {
|
||||
log.Infof("Will annotate nodes during kured reboot operations")
|
||||
@@ -721,5 +877,5 @@ func root(cmd *cobra.Command, args []string) {
|
||||
go maintainRebootRequiredMetric(nodeID, hostSentinelCommand)
|
||||
|
||||
http.Handle("/metrics", promhttp.Handler())
|
||||
log.Fatal(http.ListenAndServe(":8080", nil))
|
||||
log.Fatal(http.ListenAndServe(fmt.Sprintf("%s:%d", metricsHost, metricsPort), nil))
|
||||
}
|
||||
|
||||
@@ -6,7 +6,7 @@ import (
|
||||
|
||||
log "github.com/sirupsen/logrus"
|
||||
"github.com/spf13/cobra"
|
||||
"github.com/weaveworks/kured/pkg/alerts"
|
||||
"github.com/kubereboot/kured/pkg/alerts"
|
||||
assert "gotest.tools/v3/assert"
|
||||
|
||||
papi "github.com/prometheus/client_golang/api"
|
||||
@@ -27,11 +27,86 @@ func Test_flagCheck(t *testing.T) {
|
||||
var cmd *cobra.Command
|
||||
var args []string
|
||||
slackHookURL = "https://hooks.slack.com/services/BLABLABA12345/IAM931A0VERY/COMPLICATED711854TOKEN1SET"
|
||||
expected := "slack://BLABLABA12345/IAM931A0VERY/COMPLICATED711854TOKEN1SET"
|
||||
flagCheck(cmd, args)
|
||||
if notifyURL != "slack://BLABLABA12345/IAM931A0VERY/COMPLICATED711854TOKEN1SET" {
|
||||
t.Errorf("Slack URL Parsing is wrong: expecting %s but got %s\n", "slack://BLABLABA12345/IAM931A0VERY/COMPLICATED711854TOKEN1SET", notifyURL)
|
||||
if notifyURL != expected {
|
||||
t.Errorf("Slack URL Parsing is wrong: expecting %s but got %s\n", expected, notifyURL)
|
||||
}
|
||||
|
||||
// validate that surrounding quotes are stripped
|
||||
slackHookURL = "\"https://hooks.slack.com/services/BLABLABA12345/IAM931A0VERY/COMPLICATED711854TOKEN1SET\""
|
||||
expected = "slack://BLABLABA12345/IAM931A0VERY/COMPLICATED711854TOKEN1SET"
|
||||
flagCheck(cmd, args)
|
||||
if notifyURL != expected {
|
||||
t.Errorf("Slack URL Parsing is wrong: expecting %s but got %s\n", expected, notifyURL)
|
||||
}
|
||||
slackHookURL = "'https://hooks.slack.com/services/BLABLABA12345/IAM931A0VERY/COMPLICATED711854TOKEN1SET'"
|
||||
expected = "slack://BLABLABA12345/IAM931A0VERY/COMPLICATED711854TOKEN1SET"
|
||||
flagCheck(cmd, args)
|
||||
if notifyURL != expected {
|
||||
t.Errorf("Slack URL Parsing is wrong: expecting %s but got %s\n", expected, notifyURL)
|
||||
}
|
||||
slackHookURL = ""
|
||||
notifyURL = "\"teams://79b4XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX@acd8XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX/204cXXXXXXXXXXXXXXXXXXXXXXXXXXXX/a1f8XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX?host=XXXX.webhook.office.com\""
|
||||
expected = "teams://79b4XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX@acd8XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX/204cXXXXXXXXXXXXXXXXXXXXXXXXXXXX/a1f8XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX?host=XXXX.webhook.office.com"
|
||||
flagCheck(cmd, args)
|
||||
if notifyURL != expected {
|
||||
t.Errorf("notifyURL Parsing is wrong: expecting %s but got %s\n", expected, notifyURL)
|
||||
}
|
||||
notifyURL = "'teams://79b4XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX@acd8XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX/204cXXXXXXXXXXXXXXXXXXXXXXXXXXXX/a1f8XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX?host=XXXX.webhook.office.com'"
|
||||
expected = "teams://79b4XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX@acd8XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX/204cXXXXXXXXXXXXXXXXXXXXXXXXXXXX/a1f8XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX?host=XXXX.webhook.office.com"
|
||||
flagCheck(cmd, args)
|
||||
if notifyURL != expected {
|
||||
t.Errorf("notifyURL Parsing is wrong: expecting %s but got %s\n", expected, notifyURL)
|
||||
}
|
||||
}
|
||||
|
||||
func Test_stripQuotes(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
input string
|
||||
expected string
|
||||
}{
|
||||
{
|
||||
name: "string with no surrounding quotes is unchanged",
|
||||
input: "Hello, world!",
|
||||
expected: "Hello, world!",
|
||||
},
|
||||
{
|
||||
name: "string with surrounding double quotes should strip quotes",
|
||||
input: "\"Hello, world!\"",
|
||||
expected: "Hello, world!",
|
||||
},
|
||||
{
|
||||
name: "string with surrounding single quotes should strip quotes",
|
||||
input: "'Hello, world!'",
|
||||
expected: "Hello, world!",
|
||||
},
|
||||
{
|
||||
name: "string with unbalanced surrounding quotes is unchanged",
|
||||
input: "'Hello, world!\"",
|
||||
expected: "'Hello, world!\"",
|
||||
},
|
||||
{
|
||||
name: "string with length of one is unchanged",
|
||||
input: "'",
|
||||
expected: "'",
|
||||
},
|
||||
{
|
||||
name: "string with length of zero is unchanged",
|
||||
input: "",
|
||||
expected: "",
|
||||
},
|
||||
}
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
if got := stripQuotes(tt.input); !reflect.DeepEqual(got, tt.expected) {
|
||||
t.Errorf("stripQuotes() = %v, expected %v", got, tt.expected)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func Test_rebootBlocked(t *testing.T) {
|
||||
noCheckers := []RebootBlocker{}
|
||||
nonblockingChecker := BlockingChecker{blocking: false}
|
||||
|
||||
113
go.mod
113
go.mod
@@ -1,20 +1,105 @@
|
||||
module github.com/weaveworks/kured
|
||||
module github.com/kubereboot/kured
|
||||
|
||||
go 1.16
|
||||
go 1.19
|
||||
|
||||
replace golang.org/x/net => golang.org/x/net v0.7.0
|
||||
|
||||
replace github.com/emicklei/go-restful/v3 => github.com/emicklei/go-restful/v3 v3.10.2
|
||||
|
||||
require (
|
||||
github.com/containrrr/shoutrrr v0.5.2
|
||||
github.com/containrrr/shoutrrr v0.7.1
|
||||
github.com/google/shlex v0.0.0-20191202100458-e7afc7fbc510
|
||||
github.com/prometheus/client_golang v1.11.0
|
||||
github.com/prometheus/common v0.32.1
|
||||
github.com/sirupsen/logrus v1.8.1
|
||||
github.com/spf13/cobra v1.3.0
|
||||
github.com/google/uuid v1.3.0 // indirect
|
||||
github.com/prometheus/client_golang v1.16.0
|
||||
github.com/prometheus/common v0.44.0
|
||||
github.com/sirupsen/logrus v1.9.3
|
||||
github.com/spf13/cobra v1.7.0
|
||||
github.com/spf13/pflag v1.0.5
|
||||
github.com/spf13/viper v1.10.1
|
||||
github.com/stretchr/testify v1.7.0
|
||||
gotest.tools/v3 v3.0.3
|
||||
k8s.io/api v0.22.4
|
||||
k8s.io/apimachinery v0.22.4
|
||||
k8s.io/client-go v0.22.4
|
||||
k8s.io/kubectl v0.22.4
|
||||
github.com/spf13/viper v1.16.0
|
||||
github.com/stretchr/testify v1.8.4
|
||||
gotest.tools/v3 v3.5.0
|
||||
k8s.io/api v0.26.7
|
||||
k8s.io/apimachinery v0.26.7
|
||||
k8s.io/client-go v0.26.7
|
||||
k8s.io/kubectl v0.26.7
|
||||
)
|
||||
|
||||
require (
|
||||
github.com/Azure/go-ansiterm v0.0.0-20210617225240-d185dfc1b5a1 // indirect
|
||||
github.com/MakeNowJust/heredoc v1.0.0 // indirect
|
||||
github.com/beorn7/perks v1.0.1 // indirect
|
||||
github.com/cespare/xxhash/v2 v2.2.0 // indirect
|
||||
github.com/chai2010/gettext-go v1.0.2 // indirect
|
||||
github.com/davecgh/go-spew v1.1.1 // indirect
|
||||
github.com/emicklei/go-restful/v3 v3.9.0 // indirect
|
||||
github.com/evanphx/json-patch v4.12.0+incompatible // indirect
|
||||
github.com/exponent-io/jsonpath v0.0.0-20151013193312-d6023ce2651d // indirect
|
||||
github.com/fatih/color v1.14.1 // indirect
|
||||
github.com/fsnotify/fsnotify v1.6.0 // indirect
|
||||
github.com/go-errors/errors v1.0.1 // indirect
|
||||
github.com/go-logr/logr v1.2.3 // indirect
|
||||
github.com/go-openapi/jsonpointer v0.19.5 // indirect
|
||||
github.com/go-openapi/jsonreference v0.20.0 // indirect
|
||||
github.com/go-openapi/swag v0.19.14 // indirect
|
||||
github.com/gogo/protobuf v1.3.2 // indirect
|
||||
github.com/golang/protobuf v1.5.3 // indirect
|
||||
github.com/google/btree v1.0.1 // indirect
|
||||
github.com/google/gnostic v0.5.7-v3refs // indirect
|
||||
github.com/google/go-cmp v0.5.9 // indirect
|
||||
github.com/google/gofuzz v1.1.0 // indirect
|
||||
github.com/gregjones/httpcache v0.0.0-20180305231024-9cad4c3443a7 // indirect
|
||||
github.com/hashicorp/hcl v1.0.0 // indirect
|
||||
github.com/imdario/mergo v0.3.6 // indirect
|
||||
github.com/inconshreveable/mousetrap v1.1.0 // indirect
|
||||
github.com/josharian/intern v1.0.0 // indirect
|
||||
github.com/json-iterator/go v1.1.12 // indirect
|
||||
github.com/liggitt/tabwriter v0.0.0-20181228230101-89fcab3d43de // indirect
|
||||
github.com/magiconair/properties v1.8.7 // indirect
|
||||
github.com/mailru/easyjson v0.7.6 // indirect
|
||||
github.com/mattn/go-colorable v0.1.13 // indirect
|
||||
github.com/mattn/go-isatty v0.0.17 // indirect
|
||||
github.com/matttproud/golang_protobuf_extensions v1.0.4 // indirect
|
||||
github.com/mitchellh/go-wordwrap v1.0.0 // indirect
|
||||
github.com/mitchellh/mapstructure v1.5.0 // indirect
|
||||
github.com/moby/spdystream v0.2.0 // indirect
|
||||
github.com/moby/term v0.0.0-20220808134915-39b0c02b01ae // indirect
|
||||
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
|
||||
github.com/modern-go/reflect2 v1.0.2 // indirect
|
||||
github.com/monochromegane/go-gitignore v0.0.0-20200626010858-205db1a8cc00 // indirect
|
||||
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
|
||||
github.com/pelletier/go-toml/v2 v2.0.8 // indirect
|
||||
github.com/peterbourgon/diskv v2.0.1+incompatible // indirect
|
||||
github.com/pkg/errors v0.9.1 // indirect
|
||||
github.com/pmezard/go-difflib v1.0.0 // indirect
|
||||
github.com/prometheus/client_model v0.4.0 // indirect
|
||||
github.com/prometheus/procfs v0.10.1 // indirect
|
||||
github.com/russross/blackfriday/v2 v2.1.0 // indirect
|
||||
github.com/spf13/afero v1.9.5 // indirect
|
||||
github.com/spf13/cast v1.5.1 // indirect
|
||||
github.com/spf13/jwalterweatherman v1.1.0 // indirect
|
||||
github.com/subosito/gotenv v1.4.2 // indirect
|
||||
github.com/xlab/treeprint v1.1.0 // indirect
|
||||
go.starlark.net v0.0.0-20200306205701-8dd3e2ee1dd5 // indirect
|
||||
golang.org/x/net v0.10.0 // indirect
|
||||
golang.org/x/oauth2 v0.8.0 // indirect
|
||||
golang.org/x/sys v0.8.0 // indirect
|
||||
golang.org/x/term v0.6.0 // indirect
|
||||
golang.org/x/text v0.9.0 // indirect
|
||||
golang.org/x/time v0.1.0 // indirect
|
||||
google.golang.org/appengine v1.6.7 // indirect
|
||||
google.golang.org/protobuf v1.30.0 // indirect
|
||||
gopkg.in/inf.v0 v0.9.1 // indirect
|
||||
gopkg.in/ini.v1 v1.67.0 // indirect
|
||||
gopkg.in/yaml.v2 v2.4.0 // indirect
|
||||
gopkg.in/yaml.v3 v3.0.1 // indirect
|
||||
k8s.io/cli-runtime v0.26.7 // indirect
|
||||
k8s.io/component-base v0.26.7 // indirect
|
||||
k8s.io/klog/v2 v2.80.1 // indirect
|
||||
k8s.io/kube-openapi v0.0.0-20221012153701-172d655c2280 // indirect
|
||||
k8s.io/utils v0.0.0-20221107191617-1a15be271d1d // indirect
|
||||
sigs.k8s.io/json v0.0.0-20220713155537-f223a00ba0e2 // indirect
|
||||
sigs.k8s.io/kustomize/api v0.12.1 // indirect
|
||||
sigs.k8s.io/kustomize/kyaml v0.13.9 // indirect
|
||||
sigs.k8s.io/structured-merge-diff/v4 v4.2.3 // indirect
|
||||
sigs.k8s.io/yaml v1.3.0 // indirect
|
||||
)
|
||||
|
||||
BIN
img/cncf-color.png
Executable file
BIN
img/cncf-color.png
Executable file
Binary file not shown.
|
After Width: | Height: | Size: 24 KiB |
BIN
img/logo.png
BIN
img/logo.png
Binary file not shown.
|
Before Width: | Height: | Size: 16 KiB |
Binary file not shown.
|
Before Width: | Height: | Size: 7.2 KiB |
@@ -8,14 +8,14 @@ metadata:
|
||||
apiVersion: apps/v1
|
||||
kind: DaemonSet
|
||||
metadata:
|
||||
name: kured # Must match `--ds-name`
|
||||
name: kured # Must match `--ds-name`
|
||||
namespace: kube-system # Must match `--ds-namespace`
|
||||
spec:
|
||||
selector:
|
||||
matchLabels:
|
||||
name: kured
|
||||
updateStrategy:
|
||||
type: RollingUpdate
|
||||
type: RollingUpdate
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
@@ -23,18 +23,24 @@ spec:
|
||||
spec:
|
||||
serviceAccountName: kured
|
||||
tolerations:
|
||||
- key: node-role.kubernetes.io/control-plane
|
||||
effect: NoSchedule
|
||||
- key: node-role.kubernetes.io/master
|
||||
effect: NoSchedule
|
||||
hostPID: true # Facilitate entering the host mount namespace via init
|
||||
restartPolicy: Always
|
||||
containers:
|
||||
- name: kured
|
||||
image: docker.io/weaveworks/kured:1.9.1
|
||||
# If you find yourself here wondering why there is no
|
||||
# :latest tag on Docker Hub,see the FAQ in the README
|
||||
# If you find yourself here wondering why there is no
|
||||
# :latest tag on Docker Hub,see the FAQ in the README
|
||||
image: ghcr.io/kubereboot/kured:1.13.2
|
||||
imagePullPolicy: IfNotPresent
|
||||
securityContext:
|
||||
privileged: true # Give permission to nsenter /proc/1/ns/mnt
|
||||
readOnlyRootFilesystem: true
|
||||
ports:
|
||||
- containerPort: 8080
|
||||
name: metrics
|
||||
env:
|
||||
# Pass in the name of the node on which this pod is scheduled
|
||||
# for use with drain/uncordon operations and lock acquisition
|
||||
@@ -55,6 +61,7 @@ spec:
|
||||
# - --lock-ttl=0
|
||||
# - --prometheus-url=http://prometheus.monitoring.svc.cluster.local
|
||||
# - --alert-filter-regexp=^RebootRequired$
|
||||
# - --alert-filter-match-only=false
|
||||
# - --alert-firing-only=false
|
||||
# - --reboot-sentinel=/var/run/reboot-required
|
||||
# - --prefer-no-schedule-taint=""
|
||||
@@ -64,7 +71,8 @@ spec:
|
||||
# - --slack-channel=alerting
|
||||
# - --notify-url="" # See also shoutrrr url format
|
||||
# - --message-template-drain=Draining node %s
|
||||
# - --message-template-drain=Rebooting node %s
|
||||
# - --message-template-reboot=Rebooting node %s
|
||||
# - --message-template-uncordon=Node %s rebooted & uncordoned successfully!
|
||||
# - --blocking-pod-selector=runtime=long,cost=expensive
|
||||
# - --blocking-pod-selector=name=temperamental
|
||||
# - --blocking-pod-selector=...
|
||||
@@ -76,3 +84,6 @@ spec:
|
||||
# - --annotate-nodes=false
|
||||
# - --lock-release-delay=30m
|
||||
# - --log-format=text
|
||||
# - --metrics-host=""
|
||||
# - --metrics-port=8080
|
||||
# - --concurrency=1
|
||||
|
||||
@@ -36,7 +36,7 @@ func NewPromClient(conf papi.Config) (*PromClient, error) {
|
||||
// filter by regexp means when the regex finds the alert-name; the alert is exluded from the
|
||||
// block-list and will NOT block rebooting. query by includeLabel means,
|
||||
// if the query finds an alert, it will include it to the block-list and it WILL block rebooting.
|
||||
func (p *PromClient) ActiveAlerts(filter *regexp.Regexp, firingOnly bool) ([]string, error) {
|
||||
func (p *PromClient) ActiveAlerts(filter *regexp.Regexp, firingOnly, filterMatchOnly bool) ([]string, error) {
|
||||
|
||||
// get all alerts from prometheus
|
||||
value, _, err := p.api.Query(context.Background(), "ALERTS", time.Now())
|
||||
@@ -49,7 +49,7 @@ func (p *PromClient) ActiveAlerts(filter *regexp.Regexp, firingOnly bool) ([]str
|
||||
activeAlertSet := make(map[string]bool)
|
||||
for _, sample := range vector {
|
||||
if alertName, isAlert := sample.Metric[model.AlertNameLabel]; isAlert && sample.Value != 0 {
|
||||
if (filter == nil || !filter.MatchString(string(alertName))) && (!firingOnly || sample.Metric["alertstate"] == "firing") {
|
||||
if matchesRegex(filter, string(alertName), filterMatchOnly) && (!firingOnly || sample.Metric["alertstate"] == "firing") {
|
||||
activeAlertSet[string(alertName)] = true
|
||||
}
|
||||
}
|
||||
@@ -67,3 +67,11 @@ func (p *PromClient) ActiveAlerts(filter *regexp.Regexp, firingOnly bool) ([]str
|
||||
|
||||
return nil, fmt.Errorf("Unexpected value type: %v", value)
|
||||
}
|
||||
|
||||
func matchesRegex(filter *regexp.Regexp, alertName string, filterMatchOnly bool) bool {
|
||||
if filter == nil {
|
||||
return true
|
||||
}
|
||||
|
||||
return filter.MatchString(string(alertName)) == filterMatchOnly
|
||||
}
|
||||
|
||||
@@ -45,62 +45,87 @@ func TestActiveAlerts(t *testing.T) {
|
||||
addr := "http://localhost:10001"
|
||||
|
||||
for _, tc := range []struct {
|
||||
it string
|
||||
rFilter string
|
||||
respBody string
|
||||
aName string
|
||||
wantN int
|
||||
firingOnly bool
|
||||
it string
|
||||
rFilter string
|
||||
respBody string
|
||||
aName string
|
||||
wantN int
|
||||
firingOnly bool
|
||||
filterMatchOnly bool
|
||||
}{
|
||||
{
|
||||
it: "should return no active alerts",
|
||||
respBody: responsebody,
|
||||
rFilter: "",
|
||||
wantN: 0,
|
||||
firingOnly: false,
|
||||
it: "should return no active alerts",
|
||||
respBody: responsebody,
|
||||
rFilter: "",
|
||||
wantN: 0,
|
||||
firingOnly: false,
|
||||
filterMatchOnly: false,
|
||||
},
|
||||
{
|
||||
it: "should return a subset of all alerts",
|
||||
respBody: responsebody,
|
||||
rFilter: "Pod",
|
||||
wantN: 3,
|
||||
firingOnly: false,
|
||||
it: "should return a subset of all alerts",
|
||||
respBody: responsebody,
|
||||
rFilter: "Pod",
|
||||
wantN: 3,
|
||||
firingOnly: false,
|
||||
filterMatchOnly: false,
|
||||
},
|
||||
{
|
||||
it: "should return all active alerts by regex",
|
||||
respBody: responsebody,
|
||||
rFilter: "*",
|
||||
wantN: 5,
|
||||
firingOnly: false,
|
||||
it: "should return a subset of all alerts",
|
||||
respBody: responsebody,
|
||||
rFilter: "Gatekeeper",
|
||||
wantN: 1,
|
||||
firingOnly: false,
|
||||
filterMatchOnly: true,
|
||||
},
|
||||
{
|
||||
it: "should return all active alerts by regex filter",
|
||||
respBody: responsebody,
|
||||
rFilter: "*",
|
||||
wantN: 5,
|
||||
firingOnly: false,
|
||||
it: "should return all active alerts by regex",
|
||||
respBody: responsebody,
|
||||
rFilter: "*",
|
||||
wantN: 5,
|
||||
firingOnly: false,
|
||||
filterMatchOnly: false,
|
||||
},
|
||||
{
|
||||
it: "should return only firing alerts if firingOnly is true",
|
||||
respBody: responsebody,
|
||||
rFilter: "*",
|
||||
wantN: 4,
|
||||
firingOnly: true,
|
||||
it: "should return all active alerts by regex filter",
|
||||
respBody: responsebody,
|
||||
rFilter: "*",
|
||||
wantN: 5,
|
||||
firingOnly: false,
|
||||
filterMatchOnly: false,
|
||||
},
|
||||
{
|
||||
it: "should return ScheduledRebootFailing active alerts",
|
||||
respBody: `{"status":"success","data":{"resultType":"vector","result":[{"metric":{"__name__":"ALERTS","alertname":"ScheduledRebootFailing","alertstate":"pending","severity":"warning","team":"platform-infra"},"value":[1622472933.973,"1"]}]}}`,
|
||||
aName: "ScheduledRebootFailing",
|
||||
rFilter: "*",
|
||||
wantN: 1,
|
||||
firingOnly: false,
|
||||
it: "should return only firing alerts if firingOnly is true",
|
||||
respBody: responsebody,
|
||||
rFilter: "*",
|
||||
wantN: 4,
|
||||
firingOnly: true,
|
||||
filterMatchOnly: false,
|
||||
},
|
||||
|
||||
{
|
||||
it: "should return ScheduledRebootFailing active alerts",
|
||||
respBody: `{"status":"success","data":{"resultType":"vector","result":[{"metric":{"__name__":"ALERTS","alertname":"ScheduledRebootFailing","alertstate":"pending","severity":"warning","team":"platform-infra"},"value":[1622472933.973,"1"]}]}}`,
|
||||
aName: "ScheduledRebootFailing",
|
||||
rFilter: "*",
|
||||
wantN: 1,
|
||||
firingOnly: false,
|
||||
filterMatchOnly: false,
|
||||
},
|
||||
{
|
||||
it: "should not return an active alert if RebootRequired is firing (regex filter)",
|
||||
respBody: `{"status":"success","data":{"resultType":"vector","result":[{"metric":{"__name__":"ALERTS","alertname":"RebootRequired","alertstate":"pending","severity":"warning","team":"platform-infra"},"value":[1622472933.973,"1"]}]}}`,
|
||||
rFilter: "RebootRequired",
|
||||
wantN: 0,
|
||||
firingOnly: false,
|
||||
it: "should not return an active alert if RebootRequired is firing (regex filter)",
|
||||
respBody: `{"status":"success","data":{"resultType":"vector","result":[{"metric":{"__name__":"ALERTS","alertname":"RebootRequired","alertstate":"pending","severity":"warning","team":"platform-infra"},"value":[1622472933.973,"1"]}]}}`,
|
||||
rFilter: "RebootRequired",
|
||||
wantN: 0,
|
||||
firingOnly: false,
|
||||
filterMatchOnly: false,
|
||||
},
|
||||
{
|
||||
it: "should not return an active alert if RebootRequired is firing (regex filter)",
|
||||
respBody: `{"status":"success","data":{"resultType":"vector","result":[{"metric":{"__name__":"ALERTS","alertname":"RebootRequired","alertstate":"pending","severity":"warning","team":"platform-infra"},"value":[1622472933.973,"1"]}]}}`,
|
||||
rFilter: "RebootRequired",
|
||||
wantN: 1,
|
||||
firingOnly: false,
|
||||
filterMatchOnly: true,
|
||||
},
|
||||
} {
|
||||
// Start mockServer
|
||||
@@ -125,7 +150,7 @@ func TestActiveAlerts(t *testing.T) {
|
||||
log.Fatal(err)
|
||||
}
|
||||
|
||||
result, err := p.ActiveAlerts(regex, tc.firingOnly)
|
||||
result, err := p.ActiveAlerts(regex, tc.firingOnly, tc.filterMatchOnly)
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
|
||||
@@ -35,6 +35,11 @@ type lockAnnotationValue struct {
|
||||
TTL time.Duration `json:"TTL"`
|
||||
}
|
||||
|
||||
type multiLockAnnotationValue struct {
|
||||
MaxOwners int `json:"maxOwners"`
|
||||
LockAnnotations []lockAnnotationValue `json:"locks"`
|
||||
}
|
||||
|
||||
// New creates a daemonsetLock object containing the necessary data for follow up k8s requests
|
||||
func New(client *kubernetes.Clientset, nodeID, namespace, name, annotation string) *DaemonSetLock {
|
||||
return &DaemonSetLock{client, nodeID, namespace, name, annotation}
|
||||
@@ -70,7 +75,7 @@ func (dsl *DaemonSetLock) Acquire(metadata interface{}, TTL time.Duration) (bool
|
||||
}
|
||||
ds.ObjectMeta.Annotations[dsl.annotation] = string(valueBytes)
|
||||
|
||||
_, err = dsl.client.AppsV1().DaemonSets(dsl.namespace).Update(context.TODO(), ds, metav1.UpdateOptions{})
|
||||
_, err = dsl.client.AppsV1().DaemonSets(dsl.namespace).Update(context.Background(), ds, metav1.UpdateOptions{})
|
||||
if err != nil {
|
||||
if se, ok := err.(*errors.StatusError); ok && se.ErrStatus.Reason == metav1.StatusReasonConflict {
|
||||
// Something else updated the resource between us reading and writing - try again soon
|
||||
@@ -84,6 +89,92 @@ func (dsl *DaemonSetLock) Acquire(metadata interface{}, TTL time.Duration) (bool
|
||||
}
|
||||
}
|
||||
|
||||
// AcquireMultiple creates and annotates the daemonset with a multiple owner lock
|
||||
func (dsl *DaemonSetLock) AcquireMultiple(metadata interface{}, TTL time.Duration, maxOwners int) (bool, []string, error) {
|
||||
for {
|
||||
ds, err := dsl.GetDaemonSet(k8sAPICallRetrySleep, k8sAPICallRetryTimeout)
|
||||
if err != nil {
|
||||
return false, []string{}, fmt.Errorf("timed out trying to get daemonset %s in namespace %s: %w", dsl.name, dsl.namespace, err)
|
||||
}
|
||||
|
||||
annotation := multiLockAnnotationValue{}
|
||||
valueString, exists := ds.ObjectMeta.Annotations[dsl.annotation]
|
||||
if exists {
|
||||
if err := json.Unmarshal([]byte(valueString), &annotation); err != nil {
|
||||
return false, []string{}, fmt.Errorf("error getting multi lock: %w", err)
|
||||
}
|
||||
}
|
||||
|
||||
lockPossible, newAnnotation := dsl.canAcquireMultiple(annotation, metadata, TTL, maxOwners)
|
||||
if !lockPossible {
|
||||
return false, nodeIDsFromMultiLock(newAnnotation), nil
|
||||
}
|
||||
|
||||
if ds.ObjectMeta.Annotations == nil {
|
||||
ds.ObjectMeta.Annotations = make(map[string]string)
|
||||
}
|
||||
newAnnotationBytes, err := json.Marshal(&newAnnotation)
|
||||
if err != nil {
|
||||
return false, []string{}, fmt.Errorf("error marshalling new annotation lock: %w", err)
|
||||
}
|
||||
ds.ObjectMeta.Annotations[dsl.annotation] = string(newAnnotationBytes)
|
||||
|
||||
_, err = dsl.client.AppsV1().DaemonSets(dsl.namespace).Update(context.Background(), ds, metav1.UpdateOptions{})
|
||||
if err != nil {
|
||||
if se, ok := err.(*errors.StatusError); ok && se.ErrStatus.Reason == metav1.StatusReasonConflict {
|
||||
time.Sleep(time.Second)
|
||||
continue
|
||||
} else {
|
||||
return false, []string{}, fmt.Errorf("error updating daemonset with multi lock: %w", err)
|
||||
}
|
||||
}
|
||||
return true, nodeIDsFromMultiLock(newAnnotation), nil
|
||||
}
|
||||
}
|
||||
|
||||
func nodeIDsFromMultiLock(annotation multiLockAnnotationValue) []string {
|
||||
nodeIDs := make([]string, 0, len(annotation.LockAnnotations))
|
||||
for _, nodeLock := range annotation.LockAnnotations {
|
||||
nodeIDs = append(nodeIDs, nodeLock.NodeID)
|
||||
}
|
||||
return nodeIDs
|
||||
}
|
||||
|
||||
func (dsl *DaemonSetLock) canAcquireMultiple(annotation multiLockAnnotationValue, metadata interface{}, TTL time.Duration, maxOwners int) (bool, multiLockAnnotationValue) {
|
||||
newAnnotation := multiLockAnnotationValue{MaxOwners: maxOwners}
|
||||
freeSpace := false
|
||||
if annotation.LockAnnotations == nil || len(annotation.LockAnnotations) < maxOwners {
|
||||
freeSpace = true
|
||||
newAnnotation.LockAnnotations = annotation.LockAnnotations
|
||||
} else {
|
||||
for _, nodeLock := range annotation.LockAnnotations {
|
||||
if ttlExpired(nodeLock.Created, nodeLock.TTL) {
|
||||
freeSpace = true
|
||||
continue
|
||||
}
|
||||
newAnnotation.LockAnnotations = append(
|
||||
newAnnotation.LockAnnotations,
|
||||
nodeLock,
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
if freeSpace {
|
||||
newAnnotation.LockAnnotations = append(
|
||||
newAnnotation.LockAnnotations,
|
||||
lockAnnotationValue{
|
||||
NodeID: dsl.nodeID,
|
||||
Metadata: metadata,
|
||||
Created: time.Now().UTC(),
|
||||
TTL: TTL,
|
||||
},
|
||||
)
|
||||
return true, newAnnotation
|
||||
}
|
||||
|
||||
return false, multiLockAnnotationValue{}
|
||||
}
|
||||
|
||||
// Test attempts to check the kured daemonset lock status (existence, expiry) from instantiated DaemonSetLock using client-go
|
||||
func (dsl *DaemonSetLock) Test(metadata interface{}) (bool, error) {
|
||||
ds, err := dsl.GetDaemonSet(k8sAPICallRetrySleep, k8sAPICallRetryTimeout)
|
||||
@@ -106,6 +197,30 @@ func (dsl *DaemonSetLock) Test(metadata interface{}) (bool, error) {
|
||||
return false, nil
|
||||
}
|
||||
|
||||
// TestMultiple attempts to check the kured daemonset lock status for multi locks
|
||||
func (dsl *DaemonSetLock) TestMultiple() (bool, error) {
|
||||
ds, err := dsl.GetDaemonSet(k8sAPICallRetrySleep, k8sAPICallRetryTimeout)
|
||||
if err != nil {
|
||||
return false, fmt.Errorf("timed out trying to get daemonset %s in namespace %s: %w", dsl.name, dsl.namespace, err)
|
||||
}
|
||||
|
||||
valueString, exists := ds.ObjectMeta.Annotations[dsl.annotation]
|
||||
if exists {
|
||||
value := multiLockAnnotationValue{}
|
||||
if err := json.Unmarshal([]byte(valueString), &value); err != nil {
|
||||
return false, err
|
||||
}
|
||||
|
||||
for _, nodeLock := range value.LockAnnotations {
|
||||
if nodeLock.NodeID == dsl.nodeID && !ttlExpired(nodeLock.Created, nodeLock.TTL) {
|
||||
return true, nil
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return false, nil
|
||||
}
|
||||
|
||||
// Release attempts to remove the lock data from the kured ds annotations using client-go
|
||||
func (dsl *DaemonSetLock) Release() error {
|
||||
for {
|
||||
@@ -130,7 +245,56 @@ func (dsl *DaemonSetLock) Release() error {
|
||||
|
||||
delete(ds.ObjectMeta.Annotations, dsl.annotation)
|
||||
|
||||
_, err = dsl.client.AppsV1().DaemonSets(dsl.namespace).Update(context.TODO(), ds, metav1.UpdateOptions{})
|
||||
_, err = dsl.client.AppsV1().DaemonSets(dsl.namespace).Update(context.Background(), ds, metav1.UpdateOptions{})
|
||||
if err != nil {
|
||||
if se, ok := err.(*errors.StatusError); ok && se.ErrStatus.Reason == metav1.StatusReasonConflict {
|
||||
// Something else updated the resource between us reading and writing - try again soon
|
||||
time.Sleep(time.Second)
|
||||
continue
|
||||
} else {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
}
|
||||
|
||||
// ReleaseMultiple attempts to remove the lock data from the kured ds annotations using client-go
|
||||
func (dsl *DaemonSetLock) ReleaseMultiple() error {
|
||||
for {
|
||||
ds, err := dsl.GetDaemonSet(k8sAPICallRetrySleep, k8sAPICallRetryTimeout)
|
||||
if err != nil {
|
||||
return fmt.Errorf("timed out trying to get daemonset %s in namespace %s: %w", dsl.name, dsl.namespace, err)
|
||||
}
|
||||
|
||||
valueString, exists := ds.ObjectMeta.Annotations[dsl.annotation]
|
||||
modified := false
|
||||
value := multiLockAnnotationValue{}
|
||||
if exists {
|
||||
if err := json.Unmarshal([]byte(valueString), &value); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
for idx, nodeLock := range value.LockAnnotations {
|
||||
if nodeLock.NodeID == dsl.nodeID {
|
||||
value.LockAnnotations = append(value.LockAnnotations[:idx], value.LockAnnotations[idx+1:]...)
|
||||
modified = true
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if !exists || !modified {
|
||||
return fmt.Errorf("Lock not held")
|
||||
}
|
||||
|
||||
newAnnotationBytes, err := json.Marshal(value)
|
||||
if err != nil {
|
||||
return fmt.Errorf("error marshalling new annotation on release: %v", err)
|
||||
}
|
||||
ds.ObjectMeta.Annotations[dsl.annotation] = string(newAnnotationBytes)
|
||||
|
||||
_, err = dsl.client.AppsV1().DaemonSets(dsl.namespace).Update(context.Background(), ds, metav1.UpdateOptions{})
|
||||
if err != nil {
|
||||
if se, ok := err.(*errors.StatusError); ok && se.ErrStatus.Reason == metav1.StatusReasonConflict {
|
||||
// Something else updated the resource between us reading and writing - try again soon
|
||||
|
||||
@@ -1,6 +1,8 @@
|
||||
package daemonsetlock
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"sort"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
@@ -26,3 +28,181 @@ func TestTtlExpired(t *testing.T) {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func multiLockAnnotationsAreEqualByNodes(src, dst multiLockAnnotationValue) bool {
|
||||
srcNodes := []string{}
|
||||
for _, srcLock := range src.LockAnnotations {
|
||||
srcNodes = append(srcNodes, srcLock.NodeID)
|
||||
}
|
||||
sort.Strings(srcNodes)
|
||||
|
||||
dstNodes := []string{}
|
||||
for _, dstLock := range dst.LockAnnotations {
|
||||
dstNodes = append(dstNodes, dstLock.NodeID)
|
||||
}
|
||||
sort.Strings(dstNodes)
|
||||
|
||||
return reflect.DeepEqual(srcNodes, dstNodes)
|
||||
}
|
||||
|
||||
func TestCanAcquireMultiple(t *testing.T) {
|
||||
node1Name := "n1"
|
||||
node2Name := "n2"
|
||||
node3Name := "n3"
|
||||
testCases := []struct {
|
||||
name string
|
||||
daemonSetLock DaemonSetLock
|
||||
maxOwners int
|
||||
current multiLockAnnotationValue
|
||||
desired multiLockAnnotationValue
|
||||
lockPossible bool
|
||||
}{
|
||||
{
|
||||
name: "empty_lock",
|
||||
daemonSetLock: DaemonSetLock{
|
||||
nodeID: node1Name,
|
||||
},
|
||||
maxOwners: 2,
|
||||
current: multiLockAnnotationValue{},
|
||||
desired: multiLockAnnotationValue{
|
||||
MaxOwners: 2,
|
||||
LockAnnotations: []lockAnnotationValue{
|
||||
{NodeID: node1Name},
|
||||
},
|
||||
},
|
||||
lockPossible: true,
|
||||
},
|
||||
{
|
||||
name: "partial_lock",
|
||||
daemonSetLock: DaemonSetLock{
|
||||
nodeID: node1Name,
|
||||
},
|
||||
maxOwners: 2,
|
||||
current: multiLockAnnotationValue{
|
||||
MaxOwners: 2,
|
||||
LockAnnotations: []lockAnnotationValue{
|
||||
{NodeID: node2Name},
|
||||
},
|
||||
},
|
||||
desired: multiLockAnnotationValue{
|
||||
MaxOwners: 2,
|
||||
LockAnnotations: []lockAnnotationValue{
|
||||
{NodeID: node1Name},
|
||||
{NodeID: node2Name},
|
||||
},
|
||||
},
|
||||
lockPossible: true,
|
||||
},
|
||||
{
|
||||
name: "full_lock",
|
||||
daemonSetLock: DaemonSetLock{
|
||||
nodeID: node1Name,
|
||||
},
|
||||
maxOwners: 2,
|
||||
current: multiLockAnnotationValue{
|
||||
MaxOwners: 2,
|
||||
LockAnnotations: []lockAnnotationValue{
|
||||
{
|
||||
NodeID: node2Name,
|
||||
Created: time.Now().UTC().Add(-1 * time.Minute),
|
||||
TTL: time.Hour,
|
||||
},
|
||||
{
|
||||
NodeID: node3Name,
|
||||
Created: time.Now().UTC().Add(-1 * time.Minute),
|
||||
TTL: time.Hour,
|
||||
},
|
||||
},
|
||||
},
|
||||
desired: multiLockAnnotationValue{
|
||||
MaxOwners: 2,
|
||||
LockAnnotations: []lockAnnotationValue{
|
||||
{NodeID: node2Name},
|
||||
{NodeID: node3Name},
|
||||
},
|
||||
},
|
||||
lockPossible: false,
|
||||
},
|
||||
{
|
||||
name: "full_with_one_expired_lock",
|
||||
daemonSetLock: DaemonSetLock{
|
||||
nodeID: node1Name,
|
||||
},
|
||||
maxOwners: 2,
|
||||
current: multiLockAnnotationValue{
|
||||
MaxOwners: 2,
|
||||
LockAnnotations: []lockAnnotationValue{
|
||||
{
|
||||
NodeID: node2Name,
|
||||
Created: time.Now().UTC().Add(-1 * time.Hour),
|
||||
TTL: time.Minute,
|
||||
},
|
||||
{
|
||||
NodeID: node3Name,
|
||||
Created: time.Now().UTC().Add(-1 * time.Minute),
|
||||
TTL: time.Hour,
|
||||
},
|
||||
},
|
||||
},
|
||||
desired: multiLockAnnotationValue{
|
||||
MaxOwners: 2,
|
||||
LockAnnotations: []lockAnnotationValue{
|
||||
{NodeID: node1Name},
|
||||
{NodeID: node3Name},
|
||||
},
|
||||
},
|
||||
lockPossible: true,
|
||||
},
|
||||
{
|
||||
name: "full_with_all_expired_locks",
|
||||
daemonSetLock: DaemonSetLock{
|
||||
nodeID: node1Name,
|
||||
},
|
||||
maxOwners: 2,
|
||||
current: multiLockAnnotationValue{
|
||||
MaxOwners: 2,
|
||||
LockAnnotations: []lockAnnotationValue{
|
||||
{
|
||||
NodeID: node2Name,
|
||||
Created: time.Now().UTC().Add(-1 * time.Hour),
|
||||
TTL: time.Minute,
|
||||
},
|
||||
{
|
||||
NodeID: node3Name,
|
||||
Created: time.Now().UTC().Add(-1 * time.Hour),
|
||||
TTL: time.Minute,
|
||||
},
|
||||
},
|
||||
},
|
||||
desired: multiLockAnnotationValue{
|
||||
MaxOwners: 2,
|
||||
LockAnnotations: []lockAnnotationValue{
|
||||
{NodeID: node1Name},
|
||||
},
|
||||
},
|
||||
lockPossible: true,
|
||||
},
|
||||
}
|
||||
|
||||
for _, testCase := range testCases {
|
||||
t.Run(testCase.name, func(t *testing.T) {
|
||||
lockPossible, actual := testCase.daemonSetLock.canAcquireMultiple(testCase.current, struct{}{}, time.Minute, testCase.maxOwners)
|
||||
if lockPossible != testCase.lockPossible {
|
||||
t.Fatalf(
|
||||
"unexpected result for lock possible (got %t expected %t new annotation %v",
|
||||
lockPossible,
|
||||
testCase.lockPossible,
|
||||
actual,
|
||||
)
|
||||
}
|
||||
|
||||
if lockPossible && (!multiLockAnnotationsAreEqualByNodes(actual, testCase.desired) || testCase.desired.MaxOwners != actual.MaxOwners) {
|
||||
t.Fatalf(
|
||||
"expected lock %v but got %v",
|
||||
testCase.desired,
|
||||
actual,
|
||||
)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
@@ -65,7 +65,7 @@ func (t *Taint) Disable() {
|
||||
}
|
||||
|
||||
func taintExists(client *kubernetes.Clientset, nodeID, taintName string) (bool, int, *v1.Node) {
|
||||
updatedNode, err := client.CoreV1().Nodes().Get(context.TODO(), nodeID, metav1.GetOptions{})
|
||||
updatedNode, err := client.CoreV1().Nodes().Get(context.Background(), nodeID, metav1.GetOptions{})
|
||||
if err != nil || updatedNode == nil {
|
||||
log.Fatalf("Error reading node %s: %v", nodeID, err)
|
||||
}
|
||||
@@ -153,7 +153,7 @@ func preferNoSchedule(client *kubernetes.Clientset, nodeID, taintName string, ef
|
||||
log.Fatalf("Error encoding taint patch for node %s: %v", nodeID, err)
|
||||
}
|
||||
|
||||
_, err = client.CoreV1().Nodes().Patch(context.TODO(), nodeID, types.JSONPatchType, patchBytes, metav1.PatchOptions{})
|
||||
_, err = client.CoreV1().Nodes().Patch(context.Background(), nodeID, types.JSONPatchType, patchBytes, metav1.PatchOptions{})
|
||||
if err != nil {
|
||||
log.Fatalf("Error patching taint for node %s: %v", nodeID, err)
|
||||
}
|
||||
|
||||
@@ -46,6 +46,12 @@ do
|
||||
echo "${#was_unschedulable[@]} nodes were removed from pool once:" "${!was_unschedulable[@]}"
|
||||
echo "${#has_recovered[@]} nodes removed from the pool are now back:" "${!has_recovered[@]}"
|
||||
|
||||
#"$KUBECTL_CMD" logs -n kube-system -l name=kured --ignore-errors > "$tmp_dir"/node_output
|
||||
#if [[ "$DEBUG" == "true" ]]; then
|
||||
# echo "Kured pod logs:"
|
||||
# cat "$tmp_dir"/node_output
|
||||
#fi
|
||||
|
||||
"$KUBECTL_CMD" get nodes -o custom-columns=NAME:.metadata.name,SCHEDULABLE:.spec.unschedulable --no-headers > "$tmp_dir"/node_output
|
||||
if [[ "$DEBUG" == "true" ]]; then
|
||||
# This is useful to see if a node gets stuck after drain, and doesn't
|
||||
|
||||
Reference in New Issue
Block a user