Mirror of https://github.com/kubereboot/kured.git (synced 2026-02-14 17:39:49 +00:00)
Compare commits: 479 commits
| Author | SHA1 | Date |
|---|---|---|
The commit table lists the 479 compared commits, from 96f8d2f981 through 9ab71c894f as displayed. Only the abbreviated SHA1 column survived extraction (author, date, and commit messages were lost), so the individual rows are not reproduced here.
.github/kind-cluster-1.24.yaml (vendored) — 13 lines deleted
@@ -1,13 +0,0 @@
-kind: Cluster
-apiVersion: kind.x-k8s.io/v1alpha4
-nodes:
-- role: control-plane
-  image: "kindest/node:v1.24.7"
-- role: control-plane
-  image: "kindest/node:v1.24.7"
-- role: control-plane
-  image: "kindest/node:v1.24.7"
-- role: worker
-  image: "kindest/node:v1.24.7"
-- role: worker
-  image: "kindest/node:v1.24.7"
.github/kind-cluster-1.25.yaml (vendored) — 13 lines deleted
@@ -1,13 +0,0 @@
-kind: Cluster
-apiVersion: kind.x-k8s.io/v1alpha4
-nodes:
-- role: control-plane
-  image: kindest/node:v1.25.3
-- role: control-plane
-  image: kindest/node:v1.25.3
-- role: control-plane
-  image: kindest/node:v1.25.3
-- role: worker
-  image: kindest/node:v1.25.3
-- role: worker
-  image: kindest/node:v1.25.3
.github/kind-cluster-1.26.yaml (vendored) — 13 lines deleted
@@ -1,13 +0,0 @@
-kind: Cluster
-apiVersion: kind.x-k8s.io/v1alpha4
-nodes:
-- role: control-plane
-  image: "kindest/node:v1.26.0"
-- role: control-plane
-  image: "kindest/node:v1.26.0"
-- role: control-plane
-  image: "kindest/node:v1.26.0"
-- role: worker
-  image: "kindest/node:v1.26.0"
-- role: worker
-  image: "kindest/node:v1.26.0"
.github/kind-cluster-current.yaml (vendored, new file) — 9 lines added
@@ -0,0 +1,9 @@
+kind: Cluster
+apiVersion: kind.x-k8s.io/v1alpha4
+nodes:
+- role: control-plane
+  image: "kindest/node:v1.30.10"
+- role: worker
+  image: "kindest/node:v1.30.10"
+- role: worker
+  image: "kindest/node:v1.30.10"
.github/kind-cluster-next.yaml (vendored, new file) — 9 lines added
@@ -0,0 +1,9 @@
+kind: Cluster
+apiVersion: kind.x-k8s.io/v1alpha4
+nodes:
+- role: control-plane
+  image: "kindest/node:v1.31.6"
+- role: worker
+  image: "kindest/node:v1.31.6"
+- role: worker
+  image: "kindest/node:v1.31.6"
.github/kind-cluster-previous.yaml (vendored, new file) — 9 lines added
@@ -0,0 +1,9 @@
+kind: Cluster
+apiVersion: kind.x-k8s.io/v1alpha4
+nodes:
+- role: control-plane
+  image: "kindest/node:v1.29.14"
+- role: worker
+  image: "kindest/node:v1.29.14"
+- role: worker
+  image: "kindest/node:v1.29.14"
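These kind configs drive the CI matrix in the workflows below, but they can also be used to bring up the same clusters locally. A minimal sketch, assuming kind and kubectl are installed and Docker is running; the cluster name is arbitrary (the CI relies on "chart-testing", the helm/kind-action default):

```sh
# Create a 3-node cluster matching the "current" Kubernetes version used in CI
kind create cluster --name chart-testing --config .github/kind-cluster-current.yaml

# Check that the control-plane node and the two workers came up
kubectl get nodes

# Tear the cluster down when done
kind delete cluster --name chart-testing
```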
.github/workflows/codeql.yml (vendored) — 16 lines changed
@@ -21,6 +21,9 @@ on:
   schedule:
     - cron: '24 13 * * 3'

+permissions:
+  contents: read
+
 jobs:
   analyze:
     name: Analyze
@@ -38,12 +41,17 @@
     # Learn more about CodeQL language support at https://aka.ms/codeql-docs/language-support

     steps:
+    - name: Harden Runner
+      uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0
+      with:
+        egress-policy: audit
+
     - name: Checkout repository
-      uses: actions/checkout@v3
+      uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

     # Initializes the CodeQL tools for scanning.
     - name: Initialize CodeQL
-      uses: github/codeql-action/init@v2
+      uses: github/codeql-action/init@b56ba49b26e50535fa1e7f7db0f4f7b4bf65d80d # v3.28.10
      with:
        languages: ${{ matrix.language }}
        # If you wish to specify custom queries, you can do so here or in a config file.
@@ -57,7 +65,7 @@
     # Autobuild attempts to build any compiled languages (C/C++, C#, or Java).
     # If this step fails, then you should remove it and run the build manually (see below)
     - name: Autobuild
-      uses: github/codeql-action/autobuild@v2
+      uses: github/codeql-action/autobuild@b56ba49b26e50535fa1e7f7db0f4f7b4bf65d80d # v3.28.10

     # ℹ️ Command-line programs to run using the OS shell.
     # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun
@@ -70,6 +78,6 @@
     #   ./location_of_script_within_repo/buildscript.sh

     - name: Perform CodeQL Analysis
-      uses: github/codeql-action/analyze@v2
+      uses: github/codeql-action/analyze@b56ba49b26e50535fa1e7f7db0f4f7b4bf65d80d # v3.28.10
      with:
        category: "/language:${{matrix.language}}"
.github/workflows/dependency-review.yml (vendored, new file) — 27 lines added
@@ -0,0 +1,27 @@
+# Dependency Review Action
+#
+# This Action will scan dependency manifest files that change as part of a Pull Request,
+# surfacing known-vulnerable versions of the packages declared or updated in the PR.
+# Once installed, if the workflow run is marked as required,
+# PRs introducing known-vulnerable packages will be blocked from merging.
+#
+# Source repository: https://github.com/actions/dependency-review-action
+name: 'Dependency Review'
+on: [pull_request]
+
+permissions:
+  contents: read
+
+jobs:
+  dependency-review:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Harden Runner
+        uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0
+        with:
+          egress-policy: audit
+
+      - name: 'Checkout Repository'
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+      - name: 'Dependency Review'
+        uses: actions/dependency-review-action@3b139cfc5fae8b618d3eae3675e383bb1769c019 # v4.5.0
.github/workflows/on-main-push.yaml (vendored) — 37 lines changed
@@ -10,6 +10,9 @@ env:
   REGISTRY: ghcr.io
   IMAGE_NAME: ${{ github.repository }}

+permissions:
+  contents: read
+
 jobs:
   tag-scan-and-push-final-image:
     name: "Build, scan, and publish tagged image"
@@ -19,16 +22,21 @@ jobs:
       contents: write
       packages: write
     steps:
-      - uses: actions/checkout@v3
+      - name: Harden Runner
+        uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0
+        with:
+          egress-policy: audit
+
+      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

       - name: Ensure go version
-        uses: actions/setup-go@v3
+        uses: actions/setup-go@f111f3307d8850f501ac008e886eec1fd1932a34 # v5.3.0
         with:
           go-version-file: 'go.mod'
           check-latest: true

       - name: Login to ghcr.io
-        uses: docker/login-action@v2
+        uses: docker/login-action@9780b0c442fbb1117ed29e0efdff1e18412f7567 # v3.3.0
         with:
           registry: ${{ env.REGISTRY }}
           username: ${{ github.actor }}
@@ -36,15 +44,15 @@ jobs:
       - name: Extract metadata (tags, labels) for Docker
         id: meta
-        uses: docker/metadata-action@507c2f2dc502c992ad446e3d7a5dfbe311567a96
+        uses: docker/metadata-action@902fa8ec7d6ecbf8d84d538b9b233a880e428804
         with:
           images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}

       - name: Set up QEMU
-        uses: docker/setup-qemu-action@v2
+        uses: docker/setup-qemu-action@29109295f81e9208d7d86ff1c6c12d2833863392 # v3.6.0

       - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v2
+        uses: docker/setup-buildx-action@b5ca514318bd6ebac0fb2aedd5d36ec1b5c232a2 # v3.10.0

       - name: Find current tag version
         run: echo "sha_short=$(git rev-parse --short HEAD)" >> $GITHUB_OUTPUT
@@ -57,10 +65,9 @@ jobs:
         run: make kured-release-snapshot
         env:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-          COSIGN_EXPERIMENTAL: 1

       - name: Build image
-        uses: docker/build-push-action@v4
+        uses: docker/build-push-action@471d1dc4e07e5cdedd4c2171150001c434f0b7a4 # v6.15.0
         with:
           context: .
           platforms: linux/arm64, linux/amd64, linux/arm/v7, linux/arm/v6, linux/386
@@ -71,15 +78,11 @@ jobs:
       - name: Generate SBOM
         run: |
-          .tmp/syft ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ steps.tags.outputs.sha_short }} -o spdx > kured.sbom
+          hack/bin/syft ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ steps.tags.outputs.sha_short }} -o spdx > kured.sbom

       - name: Sign and attest artifacts
         run: |
-          .tmp/cosign sign -f -r ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ steps.tags.outputs.sha_short }}
-          .tmp/cosign sign-blob --output-signature kured.sbom.sig --output-certificate kured.sbom.pem kured.sbom
-          .tmp/cosign attest -f --type spdx --predicate kured.sbom ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ steps.tags.outputs.sha_short }}
-          .tmp/cosign attach sbom --type spdx --sbom kured.sbom ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ steps.tags.outputs.sha_short }}
-        env:
-          COSIGN_EXPERIMENTAL: 1
+          hack/bin/cosign sign -y -r ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ steps.tags.outputs.sha_short }}
+          hack/bin/cosign sign-blob -y --output-signature kured.sbom.sig --output-certificate kured.sbom.pem kured.sbom
+          hack/bin/cosign attest -y --type spdx --predicate kured.sbom ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ steps.tags.outputs.sha_short }}
+          hack/bin/cosign attach sbom --type spdx --sbom kured.sbom ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ steps.tags.outputs.sha_short }}
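Consumers of the published image can check the keyless signature produced by this workflow. A minimal sketch, assuming cosign v2 is installed; the tag is a placeholder and the certificate identity regexp below is an assumption that would need to match the actual signing workflow:

```sh
# Verify the keyless (Fulcio/Rekor) signature on a published kured image
cosign verify \
  --certificate-oidc-issuer https://token.actions.githubusercontent.com \
  --certificate-identity-regexp 'https://github.com/kubereboot/kured' \
  ghcr.io/kubereboot/kured:<tag>
```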
.github/workflows/on-pr.yaml (vendored) — 198 lines changed
@@ -4,59 +4,43 @@ on:
   push:

 jobs:
-  pr-gotest:
-    name: Run go tests
+  pr-short-tests:
+    name: Run short go tests
     runs-on: ubuntu-latest
     steps:
+      - name: Harden Runner
+        uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0
+        with:
+          egress-policy: audit
+
       - name: checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
       - name: Ensure go version
-        uses: actions/setup-go@v3
+        uses: actions/setup-go@f111f3307d8850f501ac008e886eec1fd1932a34 # v5.3.0
         with:
           go-version-file: 'go.mod'
           check-latest: true
       - name: run tests
-        run: go test -json ./... > test.json
+        run: make test
       - name: Annotate tests
         if: always()
-        uses: guyarb/golang-test-annoations@v0.6.0
+        uses: guyarb/golang-test-annoations@2941118d7ef622b1b3771d1ff6eae9e90659eb26 # v0.8.0
         with:
           test-results: test.json

-  pr-shellcheck:
-    name: Lint bash code with shellcheck
-    runs-on: ubuntu-latest
-    steps:
-      - uses: actions/checkout@v3
-      - name: Run ShellCheck
-        uses: bewuethr/shellcheck-action@v2
-
-  pr-lint-code:
-    name: Lint golang code
-    runs-on: ubuntu-latest
-    steps:
-      - uses: actions/checkout@v3
-      - name: Ensure go version
-        uses: actions/setup-go@v3
-        with:
-          go-version-file: 'go.mod'
-          check-latest: true
-      - name: Lint cmd folder
-        uses: Jerome1337/golint-action@v1.0.3
-        with:
-          golint-path: './cmd/...'
-      - name: Lint pkg folder
-        uses: Jerome1337/golint-action@v1.0.3
-        with:
-          golint-path: './pkg/...'
-
   pr-check-docs-links:
     name: Check docs for incorrect links
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v3
+      - name: Harden Runner
+        uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0
+        with:
+          egress-policy: audit
+
+      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
       - name: Link Checker
-        uses: lycheeverse/lychee-action@4dcb8bee2a0a4531cba1a1f392c54e8375d6dd81
+        uses: lycheeverse/lychee-action@f613c4a64e50d792e0b31ec34bbcbba12263c6a6
         env:
           GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN}}
         with:
@@ -70,29 +54,37 @@
     name: Build image and scan it against known vulnerabilities
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v3
+      - name: Harden Runner
+        uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0
+        with:
+          egress-policy: audit
+
+      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
       - name: Ensure go version
-        uses: actions/setup-go@v3
+        uses: actions/setup-go@f111f3307d8850f501ac008e886eec1fd1932a34 # v5.3.0
         with:
           go-version-file: 'go.mod'
           check-latest: true
       - name: Set up QEMU
-        uses: docker/setup-qemu-action@v2
+        uses: docker/setup-qemu-action@29109295f81e9208d7d86ff1c6c12d2833863392 # v3.6.0
       - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v2
+        uses: docker/setup-buildx-action@b5ca514318bd6ebac0fb2aedd5d36ec1b5c232a2 # v3.10.0
       - name: Setup GoReleaser
         run: make bootstrap-tools
       - name: Find current tag version
         run: echo "sha_short=$(git rev-parse --short HEAD)" >> $GITHUB_OUTPUT
         id: tags
       - name: Build image
-        run: VERSION="${{ steps.tags.outputs.sha_short }}" make image
-      - uses: Azure/container-scan@v0
-        env:
-          # See https://github.com/goodwithtech/dockle/issues/188
-          DOCKLE_HOST: "unix:///var/run/docker.sock"
+        run: VERSION="${{ steps.tags.outputs.sha_short }}" DH_ORG="${{ github.repository_owner }}" make image
+      - name: Run Trivy vulnerability scanner
+        uses: aquasecurity/trivy-action@18f2510ee396bbf400402947b394f2dd8c87dbb0
         with:
-          image-name: ghcr.io/${{ github.repository }}:${{ steps.tags.outputs.sha_short }}
+          image-ref: 'ghcr.io/${{ github.repository }}:${{ steps.tags.outputs.sha_short }}'
+          format: 'table'
+          exit-code: '1'
+          ignore-unfixed: true
+          vuln-type: 'os,library'
+          severity: 'CRITICAL,HIGH'

   # This ensures the latest code works with the manifests built from tree.
   # It is useful for two things:
@@ -100,79 +92,91 @@
   #   - Ensure manifests work with the latest versions even with no manifest change
   #     (compared to helm charts, manifests cannot easily template changes based on versions)
   #   Helm charts are _trailing_ releases, while manifests are done during development.
+  # This test uses the "command" reboot-method.
   e2e-manifests:
     name: End-to-End test with kured with code and manifests from HEAD
     runs-on: ubuntu-latest
     strategy:
       fail-fast: false
       matrix:
-        kubernetes:
-          - "1.24"
-          - "1.25"
-          - "1.26"
         testname:
           - "TestE2EWithCommand"
           - "TestE2EWithSignal"
           - "TestE2EConcurrentWithCommand"
           - "TestE2EConcurrentWithSignal"
+        kubernetes_version:
+          - "previous"
+          - "current"
+          - "next"
     steps:
-      - uses: actions/checkout@v3
+      - name: Harden Runner
+        uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0
+        with:
+          egress-policy: audit
+
+      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
       - name: Ensure go version
-        uses: actions/setup-go@v3
+        uses: actions/setup-go@f111f3307d8850f501ac008e886eec1fd1932a34 # v5.3.0
         with:
          go-version-file: 'go.mod'
          check-latest: true
       - name: Set up QEMU
-        uses: docker/setup-qemu-action@v2
+        uses: docker/setup-qemu-action@29109295f81e9208d7d86ff1c6c12d2833863392 # v3.6.0
       - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v2
+        uses: docker/setup-buildx-action@b5ca514318bd6ebac0fb2aedd5d36ec1b5c232a2 # v3.10.0
       - name: Setup GoReleaser
         run: make bootstrap-tools
       - name: Find current tag version
         run: echo "sha_short=$(git rev-parse --short HEAD)" >> $GITHUB_OUTPUT
         id: tags
       - name: Build artifacts
         run: |
           VERSION="${{ steps.tags.outputs.sha_short }}" make image
           VERSION="${{ steps.tags.outputs.sha_short }}" make manifest

-      - name: Workaround "Failed to attach 1 to compat systemd cgroup /actions_job/..." on gh actions
-        run: |
-          sudo bash << EOF
-          cp /etc/docker/daemon.json /etc/docker/daemon.json.old
-          echo '{}' > /etc/docker/daemon.json
-          systemctl restart docker || journalctl --no-pager -n 500
-          systemctl status docker
-          EOF
-
-      # Default name for helm/kind-action kind clusters is "chart-testing"
-      - name: Create kind cluster with 5 nodes
-        uses: helm/kind-action@v1.5.0
+      - name: Install kind
+        uses: helm/kind-action@a1b0e391336a6ee6713a0583f8c6240d70863de3 # v1.12.0
         with:
-          config: .github/kind-cluster-${{ matrix.kubernetes }}.yaml
-          version: v0.14.0
+          install_only: true
+          version: v0.27.0
+      - name: Run specific e2e tests
+        run: make e2e-test ARGS="-run ^${{ matrix.testname }}/${{ matrix.kubernetes_version }}"

-      - name: Preload previously built images onto kind cluster
-        run: kind load docker-image ghcr.io/${{ github.repository }}:${{ steps.tags.outputs.sha_short }} --name chart-testing
-
-      - name: Do not wait for an hour before detecting the rebootSentinel
-        run: |
-          sed -i 's/#\(.*\)--period=1h/\1--period=30s/g' kured-ds.yaml
-
-      - name: Install kured with kubectl
-        run: |
-          kubectl apply -f kured-rbac.yaml && kubectl apply -f kured-ds.yaml
-
-      - name: Ensure kured is ready
-        uses: nick-invision/retry@v2.8.3
-        with:
-          timeout_minutes: 10
-          max_attempts: 10
-          retry_wait_seconds: 60
-          # DESIRED CURRENT READY UP-TO-DATE AVAILABLE should all be = to cluster_size
-          command: "kubectl get ds -n kube-system kured | grep -E 'kured.*5.*5.*5.*5.*5'"
-
-      - name: Create reboot sentinel files
-        run: |
-          ./tests/kind/create-reboot-sentinels.sh
-
-      - name: Follow reboot until success
-        env:
-          DEBUG: true
-        run: |
-          ./tests/kind/follow-coordinated-reboot.sh
+  e2e-tests-singleversion:
+    name: End-to-End test targetting a single version of kubernetes
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        testname:
+          - "TestCordonningIsKept/concurrency1"
+          - "TestCordonningIsKept/concurrency2"
+          - "TestE2EBlocker/podblocker"
+    steps:
+      - name: Harden Runner
+        uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0
+        with:
+          egress-policy: audit
+
+      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+      - name: Ensure go version
+        uses: actions/setup-go@f111f3307d8850f501ac008e886eec1fd1932a34 # v5.3.0
+        with:
+          go-version-file: 'go.mod'
+          check-latest: true
+      - name: Set up QEMU
+        uses: docker/setup-qemu-action@29109295f81e9208d7d86ff1c6c12d2833863392 # v3.6.0
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@b5ca514318bd6ebac0fb2aedd5d36ec1b5c232a2 # v3.10.0
+      - name: Setup GoReleaser
+        run: make bootstrap-tools
+      - name: Find current tag version
+        run: echo "sha_short=$(git rev-parse --short HEAD)" >> $GITHUB_OUTPUT
+        id: tags
+      - name: Install kind
+        uses: helm/kind-action@a1b0e391336a6ee6713a0583f8c6240d70863de3 # v1.12.0
+        with:
+          install_only: true
+          version: v0.27.0
+          # Keep this until v1.31 (or superior) becomes the default kubectl version for the kind-action.
+          # It is used in podblocker shell script test to use --all-pods.
+          # If the podblocker e2e test relies on another way, this can also be removed.
+          kubectl_version: v1.31.0
+      - name: Run specific e2e tests
+        run: make e2e-test ARGS="-run ^${{ matrix.testname }}"
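The PR workflow now fails on unfixed CRITICAL/HIGH findings via the Trivy action instead of Azure/container-scan. A rough local equivalent, assuming the trivy CLI is installed and an image was built with `make image` (the tag below is a placeholder for the short commit SHA the Makefile uses):

```sh
# Scan a locally built image with settings similar to the CI job
trivy image \
  --severity CRITICAL,HIGH \
  --ignore-unfixed \
  --exit-code 1 \
  ghcr.io/kubereboot/kured:<sha_short>
```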
.github/workflows/on-tag.yaml (vendored) — 52 lines changed
@@ -12,6 +12,9 @@ env:
   REGISTRY: ghcr.io
   IMAGE_NAME: ${{ github.repository }}

+permissions:
+  contents: read
+
 jobs:
   tag-scan-and-push-final-image:
     name: "Build, scan, and publish tagged image"
@@ -21,9 +24,14 @@ jobs:
       contents: write
       packages: write
     steps:
-      - uses: actions/checkout@v3
+      - name: Harden Runner
+        uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0
+        with:
+          egress-policy: audit
+
+      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
       - name: Ensure go version
-        uses: actions/setup-go@v3
+        uses: actions/setup-go@f111f3307d8850f501ac008e886eec1fd1932a34 # v5.3.0
         with:
           go-version-file: 'go.mod'
           check-latest: true
@@ -31,18 +39,17 @@ jobs:
         run: echo "version=${GITHUB_REF#refs/tags/}" >> $GITHUB_OUTPUT
         id: tags
       - name: Set up QEMU
-        uses: docker/setup-qemu-action@v2
+        uses: docker/setup-qemu-action@29109295f81e9208d7d86ff1c6c12d2833863392 # v3.6.0
       - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v2
+        uses: docker/setup-buildx-action@b5ca514318bd6ebac0fb2aedd5d36ec1b5c232a2 # v3.10.0
       - name: Setup GoReleaser
         run: make bootstrap-tools
       - name: Build binaries
         run: make kured-release-tag
         env:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-          COSIGN_EXPERIMENTAL: 1
       - name: Build single image for scan
-        uses: docker/build-push-action@v4
+        uses: docker/build-push-action@471d1dc4e07e5cdedd4c2171150001c434f0b7a4 # v6.15.0
         with:
           context: .
           platforms: linux/amd64
@@ -51,15 +58,18 @@ jobs:
           tags: |
             ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ steps.tags.outputs.version }}

-      - uses: Azure/container-scan@v0
-        env:
-          # See https://github.com/goodwithtech/dockle/issues/188
-          DOCKLE_HOST: "unix:///var/run/docker.sock"
+      - name: Run Trivy vulnerability scanner
+        uses: aquasecurity/trivy-action@18f2510ee396bbf400402947b394f2dd8c87dbb0
         with:
-          image-name: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ steps.tags.outputs.version }}
+          image-ref: '${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ steps.tags.outputs.version }}'
+          format: 'table'
+          exit-code: '1'
+          ignore-unfixed: true
+          vuln-type: 'os,library'
+          severity: 'CRITICAL,HIGH'

       - name: Login to ghcr.io
-        uses: docker/login-action@v2
+        uses: docker/login-action@9780b0c442fbb1117ed29e0efdff1e18412f7567 # v3.3.0
         with:
           registry: ${{ env.REGISTRY }}
           username: ${{ github.actor }}
@@ -67,12 +77,12 @@ jobs:
       - name: Extract metadata (tags, labels) for Docker
         id: meta
-        uses: docker/metadata-action@507c2f2dc502c992ad446e3d7a5dfbe311567a96
+        uses: docker/metadata-action@902fa8ec7d6ecbf8d84d538b9b233a880e428804
         with:
           images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}

       - name: Build release images
-        uses: docker/build-push-action@v4
+        uses: docker/build-push-action@471d1dc4e07e5cdedd4c2171150001c434f0b7a4 # v6.15.0
         with:
           context: .
           platforms: linux/arm64, linux/amd64, linux/arm/v7, linux/arm/v6, linux/386
@@ -83,15 +93,11 @@ jobs:
       - name: Generate SBOM
         run: |
-          .tmp/syft ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ steps.tags.outputs.version }} -o spdx > kured.sbom
+          hack/bin/syft ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ steps.tags.outputs.version }} -o spdx > kured.sbom

       - name: Sign and attest artifacts
         run: |
-          .tmp/cosign sign -f -r ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ steps.tags.outputs.version }}
-          .tmp/cosign sign-blob --output-signature kured.sbom.sig kured.sbom
-          .tmp/cosign attest -f --type spdx --predicate kured.sbom ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ steps.tags.outputs.version }}
-          .tmp/cosign attach sbom --type spdx --sbom kured.sbom ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ steps.tags.outputs.version }}
-        env:
-          COSIGN_EXPERIMENTAL: 1
+          hack/bin/cosign sign -y -r ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ steps.tags.outputs.version }}
+          hack/bin/cosign sign-blob -y --output-signature kured.sbom.sig kured.sbom
+          hack/bin/cosign attest -y --type spdx --predicate kured.sbom ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ steps.tags.outputs.version }}
+          hack/bin/cosign attach sbom --type spdx --sbom kured.sbom ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ steps.tags.outputs.version }}
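The on-tag workflow above is triggered by pushing a version tag. A rough sketch of how a maintainer would kick it off, with <version> as a placeholder (the actual numbering and ordering follow the release steps in the CONTRIBUTING.md diff further down):

```sh
# Tag the release commit and push the tag; pushing it triggers on-tag.yaml
git tag <version>
git push origin <version>
```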
.github/workflows/periodics-daily.yaml (vendored) — 55 lines changed
@@ -9,13 +9,18 @@ jobs:
     name: Run go tests
     runs-on: ubuntu-latest
     steps:
+      - name: Harden Runner
+        uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0
+        with:
+          egress-policy: audit
+
       - name: checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
       - name: run tests
-        run: go test -json ./... > test.json
+        run: make test
       - name: Annotate tests
         if: always()
-        uses: guyarb/golang-test-annoations@v0.6.0
+        uses: guyarb/golang-test-annoations@2941118d7ef622b1b3771d1ff6eae9e90659eb26 # v0.8.0
         with:
           test-results: test.json

@@ -25,7 +30,12 @@ jobs:
     steps:
       # Stale by default waits for 60 days before marking PR/issues as stale, and closes them after 21 days.
       # Do not expire the first issues that would allow the community to grow.
-      - uses: actions/stale@v7
+      - name: Harden Runner
+        uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0
+        with:
+          egress-policy: audit
+
+      - uses: actions/stale@5bef64f19d7facfb25b37b414482c7164d639639 # v9.1.0
         with:
           repo-token: ${{ secrets.GITHUB_TOKEN }}
           stale-issue-message: 'This issue was automatically considered stale due to lack of activity. Please update it and/or join our slack channels to promote it, before it automatically closes (in 7 days).'
@@ -39,9 +49,14 @@ jobs:
     name: Check docs for incorrect links
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v3
+      - name: Harden Runner
+        uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0
+        with:
+          egress-policy: audit
+
+      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
       - name: Link Checker
-        uses: lycheeverse/lychee-action@4dcb8bee2a0a4531cba1a1f392c54e8375d6dd81
+        uses: lycheeverse/lychee-action@f613c4a64e50d792e0b31ec34bbcbba12263c6a6
         env:
           GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN}}
         with:
@@ -52,26 +67,34 @@ jobs:
     name: Build image and scan it against known vulnerabilities
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v3
+      - name: Harden Runner
+        uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0
+        with:
+          egress-policy: audit
+
+      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
       - name: Ensure go version
-        uses: actions/setup-go@v3
+        uses: actions/setup-go@f111f3307d8850f501ac008e886eec1fd1932a34 # v5.3.0
         with:
           go-version-file: 'go.mod'
           check-latest: true
       - name: Set up QEMU
-        uses: docker/setup-qemu-action@v2
+        uses: docker/setup-qemu-action@29109295f81e9208d7d86ff1c6c12d2833863392 # v3.6.0
       - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v2
+        uses: docker/setup-buildx-action@b5ca514318bd6ebac0fb2aedd5d36ec1b5c232a2 # v3.10.0
       - name: Setup GoReleaser
         run: make bootstrap-tools
       - name: Find current tag version
         run: echo "sha_short=$(git rev-parse --short HEAD)" >> $GITHUB_OUTPUT
         id: tags
       - name: Build artifacts
-        run: VERSION="${{ steps.tags.outputs.sha_short }}" make image
-      - uses: Azure/container-scan@v0
-        env:
-          # See https://github.com/goodwithtech/dockle/issues/188
-          DOCKLE_HOST: "unix:///var/run/docker.sock"
+        run: VERSION="${{ steps.tags.outputs.sha_short }}" DH_ORG="${{ github.repository_owner }}" make image
+      - name: Run Trivy vulnerability scanner
+        uses: aquasecurity/trivy-action@18f2510ee396bbf400402947b394f2dd8c87dbb0
         with:
-          image-name: ghcr.io/${{ github.repository }}:${{ steps.tags.outputs.sha_short }}
+          image-ref: 'ghcr.io/${{ github.repository }}:${{ steps.tags.outputs.sha_short }}'
+          format: 'table'
+          exit-code: '1'
+          ignore-unfixed: true
+          vuln-type: 'os,library'
+          severity: 'CRITICAL,HIGH'
.github/workflows/scorecard.yml (vendored, new file) — 78 lines added
@@ -0,0 +1,78 @@
+# This workflow uses actions that are not certified by GitHub. They are provided
+# by a third-party and are governed by separate terms of service, privacy
+# policy, and support documentation.
+
+name: Scorecard supply-chain security
+on:
+  # For Branch-Protection check. Only the default branch is supported. See
+  # https://github.com/ossf/scorecard/blob/main/docs/checks.md#branch-protection
+  branch_protection_rule:
+  # To guarantee Maintained check is occasionally updated. See
+  # https://github.com/ossf/scorecard/blob/main/docs/checks.md#maintained
+  schedule:
+    - cron: '34 3 * * 6'
+  push:
+    branches: [ "main" ]
+
+# Declare default permissions as read only.
+permissions: read-all
+
+jobs:
+  analysis:
+    name: Scorecard analysis
+    runs-on: ubuntu-latest
+    permissions:
+      # Needed to upload the results to code-scanning dashboard.
+      security-events: write
+      # Needed to publish results and get a badge (see publish_results below).
+      id-token: write
+      # Uncomment the permissions below if installing in a private repository.
+      # contents: read
+      # actions: read
+
+    steps:
+      - name: Harden Runner
+        uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0
+        with:
+          egress-policy: audit
+
+      - name: "Checkout code"
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        with:
+          persist-credentials: false
+
+      - name: "Run analysis"
+        uses: ossf/scorecard-action@f49aabe0b5af0936a0987cfb85d86b75731b0186 # v2.4.1
+        with:
+          results_file: results.sarif
+          results_format: sarif
+          # (Optional) "write" PAT token. Uncomment the `repo_token` line below if:
+          # - you want to enable the Branch-Protection check on a *public* repository, or
+          # - you are installing Scorecard on a *private* repository
+          # To create the PAT, follow the steps in https://github.com/ossf/scorecard-action?tab=readme-ov-file#authentication-with-fine-grained-pat-optional.
+          # repo_token: ${{ secrets.SCORECARD_TOKEN }}
+
+          # Public repositories:
+          #   - Publish results to OpenSSF REST API for easy access by consumers
+          #   - Allows the repository to include the Scorecard badge.
+          #   - See https://github.com/ossf/scorecard-action#publishing-results.
+          # For private repositories:
+          #   - `publish_results` will always be set to `false`, regardless
+          #     of the value entered here.
+          publish_results: true
+
+      # Upload the results as artifacts (optional). Commenting out will disable uploads of run results in SARIF
+      # format to the repository Actions tab.
+      - name: "Upload artifact"
+        uses: actions/upload-artifact@4cec3d8aa04e39d1a68397de0c4cd6fb9dce8ec1 # v3.pre.node20
+        with:
+          name: SARIF file
+          path: results.sarif
+          retention-days: 5
+
+      # Upload the results to GitHub's code scanning dashboard (optional).
+      # Commenting out will disable upload of results to your repo's Code Scanning dashboard
+      - name: "Upload to code-scanning"
+        uses: github/codeql-action/upload-sarif@b56ba49b26e50535fa1e7f7db0f4f7b4bf65d80d # v3.28.10
+        with:
+          sarif_file: results.sarif
.gitignore (vendored) — 4 lines changed
@@ -2,4 +2,6 @@ cmd/kured/kured
 vendor
 build
 dist
-.tmp
+test.json
+tests/kind/testfiles/*.yaml
+hack/bin/
@@ -2,3 +2,5 @@ app.fossa.com
 cluster.local
 hooks.slack.com
 localhost
+slack://
+teams://
.trivyignore (new empty file) — 0 lines

@@ -1,3 +1,3 @@
-## Kured Community Code of Conduct
+# Kured Community Code of Conduct

 Kured follows the [CNCF Code of Conduct](https://github.com/cncf/foundation/blob/main/code-of-conduct.md).
CONTRIBUTING.md — 131 lines changed
@@ -5,13 +5,13 @@ Slack][slack], reporting or triaging [issues][issues] or contributing code
 to `kured`.

 In any case, it will make sense to familiarise yourself with the main
-[README][readme] to understand the different features and options, which is
-helpful for testing. The "building" section in particular makes sense if
-you are planning to contribute code.
+[documentation][documentation] to understand the different features and
+options, which is helpful for testing. The "building" section in
+particular makes sense if you are planning to contribute code.

-[slack]: README.md#getting-help
+[slack]: https://github.com/kubereboot/kured/blob/main/README.md#getting-help
 [issues]: https://github.com/kubereboot/kured/issues
-[readme]: README.md
+[documentation]: https://kured.dev/docs

 ## Certificate of Origin

@@ -41,6 +41,15 @@ All Kured repositories are kept under <https://github.com/kubereboot>. To find t
 | <https://github.com/kubereboot/charts> | Helm chart |
 | <https://github.com/kubereboot/website> | website and documentation |

+### Kured code
+
+- Kured's main code can be found in the [`cmd`](cmd) and [`pkg`](pkg) directories
+- Its e2e tests are in the [`tests`](tests) directory
+- We use [GoReleaser to build](.goreleaser.yml).
+- Every PR and tagged release is tested by [Kind in GitHub workflows](.github/workflows).
+
+As a project, we try to follow all the official and obvious standards.
+
 ## Regular development activities

 ### Prepare environment
@@ -66,16 +75,23 @@ efbb0c3: Document version compatibility in release notes

 Search the git log for inspiration for your cases.

-Please update our `.github/workflows` with the new k8s images, starting by
-the creation of a `.github/kind-cluster-<version>.yaml`, then updating
-our workflows with the new versions.
+Please update our `.github/workflows` with the new k8s images.

-Once you updated everything, make sure you update the support matrix on
-the main [README][readme] as well.
+For that, run the following:
+
+`cp .github/kind-cluster-current.yaml .github/kind-cluster-previous.yaml`
+`cp .github/kind-cluster-next.yaml .github/kind-cluster-current.yaml`
+
+Then edit `.github/kind-cluster-next.yaml` to point to the new version.
+
+This will make the full test matrix updated (the CI and the test code).
+
+Once your code passes all tests, update the support matrix in
+the [installation docs](https://kured.dev/docs/installation/).

 ### Updating other dependencies

-Dependabot proposes changes in our go.mod/go.sum.
+Dependabot proposes changes in our `go.mod`/`go.sum`.
 Some of those changes are covered by CI testing, some are not.

 Please make sure to test those not covered by CI (mostly the integration
@@ -87,7 +103,7 @@ We run periodic jobs (see also Automated testing section of this documentation).
 Those should be monitored for failures.

 If a failure happen in periodics, something terribly wrong must have happened
-(or github is failing at the creation of a kind cluster). Please monitor those
+(or GitHub is failing at the creation of a kind cluster). Please monitor those
 failures carefully.

 ### Introducing new features
@@ -107,23 +123,24 @@ This also means that when you expose a new feature, you should create another PR
 for your changes in <https://github.com/kubereboot/charts> to make your feature
 available at the next kured version for helm users.

-In the charts PR, you can directly bump the appVersion to the next minor version
+In the charts PR, you can directly bump the `appVersion` to the next minor version
 (you are introducing a new feature, which requires a bump of the minor number.
-For example, if current appVersion is 1.6.x, make sure you update your appVersion
-to 1.7.0). It allows us to have an easy view of what we land each release.
+For example, if current `appVersion` is `1.6.x`, make sure you update your `appVersion`
+to `1.7.0`). It allows us to have an easy view of what we land each release.

 Do not hesitate to increase the test coverage for your feature, whether it's unit
-testing to full functional testing (even using helm charts)
+testing to full functional testing (even using helm charts).

 ### Increasing test coverage

 We are welcoming any change to increase our test coverage.
-See also our github issues for the label `testing`.
+See also our GitHub issues for the label
+[`testing`](https://github.com/kubereboot/kured/labels/testing).

 ## Automated testing

-Our CI is covered by github actions.
-You can see their contents in .github/workflows.
+Our CI is covered by GitHub actions.
+You can see their contents in `.github/workflows`.

 We currently run:

@@ -137,6 +154,13 @@ To test your code manually, follow the section Manual testing.

 ## Manual (release) testing

+### Quick Golang code testing
+
+Please run `make test` to run only the basic tests. It gives a good
+idea of the code behaviour.
+
 ### Manual functional testing

 Before `kured` is released, we want to make sure it still works fine on the
 previous, current and next minor version of Kubernetes (with respect to the
 `client-go` & `kubectl` dependencies in use). For local testing e.g.
@@ -152,15 +176,11 @@ results, if you login to a node and run:
 sudo touch /var/run/reboot-required
 ```

-### Example of golang testing
-
-Please run `make test`. You should have `golint` installed.
-
-### Example of testing with `minikube`
+### Example of functional testing with `minikube`

 A test-run with `minikube` could look like this:

-```console
+```cli
 # start minikube
 minikube start --driver=kvm2 --kubernetes-version <k8s-release>

@@ -192,7 +212,7 @@ Then you can check for the lock release.

 A test-run with `kind` could look like this:

-```console
+```cli
 # create kind cluster
 kind create cluster --config .github/kind-cluster-<k8s-version>.yaml

@@ -204,23 +224,44 @@ kind create cluster --config .github/kind-cluster-<k8s-version>.yaml

 ```

+### Example of testing with `kind` and `make`
+
+A test-run with `kind` and `make` can be done with the following command:
+
+```cli
+# Build kured:dev image, build manifests, and run the "long" go tests
+make e2e-test
+```
+
+You can alter test behaviour by passing arguments to this command.
+A few examples below:
+
+```shell
+# Run only TestE2EWithSignal test for the kubernetes version named "current" (see kind file)
+make e2e-test ARGS="-run ^TestE2EWithSignal/current"
+# Run all tests but make sure to extend the timeout, for slower machines.
+make e2e-test ARGS="-timeout 1200s'
+```
+
 ## Publishing a new kured release

 ### Prepare Documentation

-Check that `README.md` has an updated compatibility matrix and that the
-url in the `kubectl` incantation (under "Installation") is updated to the
-new version you want to release.
+Ensure the [compatibility matrix](https://kured.dev/docs/installation/) is
+updated to the new version you want to release.

-### Create a tag on the repo
+### Update the manifests with the new version

-Before going further, we should freeze the code for a release, by
-tagging the code. The Github-Action should start a new job and push
-the new image to the registry.
+Create a commit updating the manifest with future image [like this one](https://github.com/kubereboot/kured/commit/58091f6145771f426b4b9e012a43a9c847af2560).

-### Create the combined manifest
+### Create the new version tag on the repo

-Now create the `kured-<release>-dockerhub.yaml` for e.g. `1.3.0`:
+Tag the previously created commit with the future release version.
+The Github Actions workflow will push the new image to the registry.
+
+### Create the combined manifest for the new version
+
+Now create the `kured-<new version>-dockerhub.yaml` for e.g. `1.3.0`:

 ```sh
 VERSION=1.3.0
@@ -230,13 +271,23 @@ cat kured-rbac.yaml > "$MANIFEST"
 cat kured-ds.yaml >> "$MANIFEST"
 ```

-### Publish release artifacts
+### Publish new version release artifacts

-Now you can head to the Github UI, use the version number as tag and upload the
-`kured-<release>-dockerhub.yaml` file.
+Now you can head to the GitHub UI for releases, drafting a new
+release. Chose, as tag, the new version number.
+
+Click to generate the release notes.
+
+Fill, as name, "Kured <new version>".
+
+Edit the generated text.

 Please describe what's new and noteworthy in the release notes, list the PRs
 that landed and give a shout-out to everyone who contributed.

 Please also note down on which releases the upcoming `kured` release was
-tested on. (Check old release notes if you're unsure.)
+tested on or what it supports. (Check old release notes if you're unsure.)

+Before clicking on publishing release, upload the yaml manifest
+(`kured-<new version>-dockerhub.yaml`) file.
+
+Click on publish the release and set as the latest release.
@@ -1,4 +1,4 @@
-FROM --platform=$TARGETPLATFORM alpine:3.17.1 as bin
+FROM alpine:3.21.3@sha256:a8560b36e8b8210634f77d9f7f9efd7ffa463e380b75e2e74aff4511df3ef88c AS bin

 ARG TARGETOS
 ARG TARGETARCH
@@ -19,7 +19,7 @@ RUN set -ex \
     esac \
     && cp /dist/kured_${TARGETOS}_${TARGETARCH}${SUFFIX}/kured /dist/kured;

-FROM --platform=$TARGETPLATFORM alpine:3.17.1
+FROM alpine:3.21.3@sha256:a8560b36e8b8210634f77d9f7f9efd7ffa463e380b75e2e74aff4511df3ef88c
 RUN apk update --no-cache && apk upgrade --no-cache && apk add --no-cache ca-certificates tzdata
 COPY --from=bin /dist/kured /usr/bin/kured
 ENTRYPOINT ["/usr/bin/kured"]
@@ -108,5 +108,5 @@ Governance require a 2/3 vote of all Maintainers.

 [maintainers-file]: ./MAINTAINERS
 [private-list]: cncf-kured-maintainers@lists.cncf.io
-[meeting-agenda]: https://docs.google.com/document/d/1bsHTjHhqaaZ7yJnXF6W8c89UB_yn-OoSZEmDnIP34n8/edit#
+[meeting-agenda]: https://docs.google.com/document/d/1AWT8YDdqZY-Se6Y1oAlwtujWLVpNVK2M_F_Vfqw06aI/edit
 [decision-issues]: https://github.com/kubereboot/kured/labels/decision
@@ -1,5 +1,5 @@
-Christian Kotzbauer <christian.kotzbauer@gmail.com> (@ckotzbauer)
-Daniel Holbach <daniel@weave.works> (@dholbach)
-Hidde Beydals <hidde@weave.works> (@hiddeco)
+Christian Hopf <christian.kotzbauer@gmail.com> (@ckotzbauer)
+Daniel Holbach <daniel.holbach@gmail.com> (@dholbach)
+Hidde Beydals <hidde@hhh.computer> (@hiddeco)
 Jack Francis <jackfrancis@gmail.com> (@jackfrancis)
 Jean-Philippe Evrard <open-source@a.spamming.party> (@evrardjp)
Makefile — 73 lines changed
@@ -1,53 +1,74 @@
 .DEFAULT: all
 .PHONY: all clean image minikube-publish manifest test kured-all

-TEMPDIR=./.tmp
-GORELEASER_CMD=$(TEMPDIR)/goreleaser
-DH_ORG=kubereboot
+HACKDIR=./hack/bin
+GORELEASER_CMD=$(HACKDIR)/goreleaser
+DH_ORG ?= kubereboot
 VERSION=$(shell git rev-parse --short HEAD)
 SUDO=$(shell docker info >/dev/null 2>&1 || echo "sudo -E")

 all: image

-$(TEMPDIR):
-	mkdir -p $(TEMPDIR)
+$(HACKDIR):
+	mkdir -p $(HACKDIR)

 .PHONY: bootstrap-tools
-bootstrap-tools: $(TEMPDIR)
-	VERSION=v1.11.4 TMPDIR=.tmp bash .github/scripts/goreleaser-install.sh
-	curl -sSfL https://raw.githubusercontent.com/anchore/syft/main/install.sh | sh -s -- -b .tmp v0.58.0
-	curl -sSfL https://github.com/sigstore/cosign/releases/download/v1.12.1/cosign-linux-amd64 -o .tmp/cosign
-	chmod +x .tmp/goreleaser .tmp/cosign .tmp/syft
+bootstrap-tools: $(HACKDIR)
+	command -v $(HACKDIR)/goreleaser || VERSION=v1.24.0 TMPDIR=$(HACKDIR) bash hack/installers/goreleaser-install.sh
+	command -v $(HACKDIR)/syft || curl -sSfL https://raw.githubusercontent.com/anchore/syft/main/install.sh | sh -s -- -b $(HACKDIR) v1.0.1
+	command -v $(HACKDIR)/cosign || curl -sSfL https://github.com/sigstore/cosign/releases/download/v2.2.3/cosign-linux-amd64 -o $(HACKDIR)/cosign
+	command -v $(HACKDIR)/shellcheck || (curl -sSfL https://github.com/koalaman/shellcheck/releases/download/stable/shellcheck-stable.linux.x86_64.tar.xz | tar -J -v -x shellcheck-stable/shellcheck && mv shellcheck-stable/shellcheck $(HACKDIR)/shellcheck && rmdir shellcheck-stable)
+	chmod +x $(HACKDIR)/goreleaser $(HACKDIR)/cosign $(HACKDIR)/syft $(HACKDIR)/shellcheck
+	command -v staticcheck || go install honnef.co/go/tools/cmd/staticcheck@latest

 clean:
	rm -rf ./dist

-kured:
-	$(GORELEASER_CMD) build --rm-dist --single-target --snapshot
+kured: bootstrap-tools
+	$(GORELEASER_CMD) build --clean --single-target --snapshot

-kured-all:
-	$(GORELEASER_CMD) build --rm-dist --snapshot
+kured-all: bootstrap-tools
+	$(GORELEASER_CMD) build --clean --snapshot

-kured-release-tag:
-	$(GORELEASER_CMD) release --rm-dist
+kured-release-tag: bootstrap-tools
+	$(GORELEASER_CMD) release --clean

-kured-release-snapshot:
-	$(GORELEASER_CMD) release --rm-dist --snapshot
+kured-release-snapshot: bootstrap-tools
+	$(GORELEASER_CMD) release --clean --snapshot

 image: kured
-	$(SUDO) docker buildx build --load -t ghcr.io/$(DH_ORG)/kured:$(VERSION) .
+	$(SUDO) docker buildx build --no-cache --load -t ghcr.io/$(DH_ORG)/kured:$(VERSION) .

+dev-image: image
+	$(SUDO) docker tag ghcr.io/$(DH_ORG)/kured:$(VERSION) kured:dev
+
+dev-manifest:
+	# basic e2e scenario
+	sed -e "s#image: ghcr.io/.*kured.*#image: kured:dev#g" -e 's/#\(.*\)--period=1h/\1--period=20s/g' kured-ds.yaml > tests/kind/testfiles/kured-ds.yaml
+	# signal e2e scenario
+	sed -e "s#image: ghcr.io/.*kured.*#image: kured:dev#g" -e 's/#\(.*\)--period=1h/\1--period=20s/g' kured-ds-signal.yaml > tests/kind/testfiles/kured-ds-signal.yaml
+	# concurrency e2e command scenario
+	sed -e "s#image: ghcr.io/.*kured.*#image: kured:dev#g" -e 's/#\(.*\)--period=1h/\1--period=20s/g' -e 's/#\(.*\)--concurrency=1/\1--concurrency=2/g' kured-ds.yaml > tests/kind/testfiles/kured-ds-concurrent-command.yaml
+	# concurrency e2e signal scenario
+	sed -e "s#image: ghcr.io/.*kured.*#image: kured:dev#g" -e 's/#\(.*\)--period=1h/\1--period=20s/g' -e 's/#\(.*\)--concurrency=1/\1--concurrency=2/g' kured-ds-signal.yaml > tests/kind/testfiles/kured-ds-concurrent-signal.yaml
+	# pod blocker e2e signal scenario
+	sed -e "s#image: ghcr.io/.*kured.*#image: kured:dev#g" -e 's/#\(.*\)--period=1h/\1--period=20s/g' -e 's/#\(.*\)--blocking-pod-selector=name=temperamental/\1--blocking-pod-selector=app=blocker/g' kured-ds-signal.yaml > tests/kind/testfiles/kured-ds-podblocker.yaml
+
+e2e-test: dev-manifest dev-image
+	echo "Running ALL go tests"
+	go test -count=1 -v --parallel 4 ./... $(ARGS)
+
 minikube-publish: image
	$(SUDO) docker save ghcr.io/$(DH_ORG)/kured | (eval $$(minikube docker-env) && docker load)

 manifest:
	sed -i "s#image: ghcr.io/.*kured.*#image: ghcr.io/$(DH_ORG)/kured:$(VERSION)#g" kured-ds.yaml
	sed -i "s#image: ghcr.io/.*kured.*#image: ghcr.io/$(DH_ORG)/kured:$(VERSION)#g" kured-ds-signal.yaml
	echo "Please generate combined manifest if necessary"

-test:
-	echo "Running go tests"
-	go test ./...
-	echo "Running golint on pkg"
-	golint ./pkg/...
-	echo "Running golint on cmd"
-	golint ./cmd/...
+test: bootstrap-tools
+	echo "Running short go tests"
+	go test -test.short -json ./... > test.json
+	echo "Running shellcheck"
+	find . -name '*.sh' | xargs -n1 $(HACKDIR)/shellcheck
+	staticcheck ./...
@@ -3,8 +3,9 @@
|
||||
[](https://artifacthub.io/packages/helm/kured/kured)
|
||||
[](https://app.fossa.com/projects/git%2Bgithub.com%2Fkubereboot%2Fkured?ref=badge_shield)
|
||||
[](https://clomonitor.io/projects/cncf/kured)
|
||||
[](https://www.bestpractices.dev/projects/8867)
|
||||
|
||||
<img src="https://github.com/kubereboot/website/raw/main/static/img/kured.png" width="200" align="right"/>
|
||||
<img src="https://github.com/kubereboot/website/raw/main/static/img/kured.png" alt="kured logo" width="200" align="right"/>
|
||||
|
||||
- [kured - Kubernetes Reboot Daemon](#kured---kubernetes-reboot-daemon)
|
||||
- [Introduction](#introduction)
|
||||
@@ -45,7 +46,7 @@ If you have any questions about, feedback for or problems with `kured`:
|
||||
- Invite yourself to the <a href="https://slack.cncf.io/" target="_blank">CNCF Slack</a>.
|
||||
- Ask a question on the [#kured](https://cloud-native.slack.com/archives/kured) slack channel.
|
||||
- [File an issue](https://github.com/kubereboot/kured/issues/new).
|
||||
- Join us in [our monthly meeting](https://docs.google.com/document/d/1bsHTjHhqaaZ7yJnXF6W8c89UB_yn-OoSZEmDnIP34n8/edit#),
|
||||
- Join us in [our monthly meeting](https://docs.google.com/document/d/1AWT8YDdqZY-Se6Y1oAlwtujWLVpNVK2M_F_Vfqw06aI/edit),
|
||||
every first Wednesday of the month at 16:00 UTC.
|
||||
- You might want to [join the kured-dev mailing list](https://lists.cncf.io/g/cncf-kured-dev) as well.
|
||||
|
||||
|
||||
@@ -8,35 +8,32 @@ import (
|
||||
"net/http"
|
||||
"net/url"
|
||||
"os"
|
||||
"os/exec"
|
||||
"reflect"
|
||||
"regexp"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/containrrr/shoutrrr"
|
||||
"github.com/kubereboot/kured/internal"
|
||||
"github.com/kubereboot/kured/pkg/blockers"
|
||||
"github.com/kubereboot/kured/pkg/checkers"
|
||||
"github.com/kubereboot/kured/pkg/daemonsetlock"
|
||||
"github.com/kubereboot/kured/pkg/delaytick"
|
||||
"github.com/kubereboot/kured/pkg/reboot"
|
||||
"github.com/kubereboot/kured/pkg/taints"
|
||||
"github.com/kubereboot/kured/pkg/timewindow"
|
||||
papi "github.com/prometheus/client_golang/api"
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
"github.com/prometheus/client_golang/prometheus/promhttp"
|
||||
log "github.com/sirupsen/logrus"
|
||||
"github.com/spf13/cobra"
|
||||
"github.com/spf13/pflag"
|
||||
"github.com/spf13/viper"
|
||||
flag "github.com/spf13/pflag"
|
||||
v1 "k8s.io/api/core/v1"
|
||||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
||||
"k8s.io/apimachinery/pkg/types"
|
||||
"k8s.io/client-go/kubernetes"
|
||||
"k8s.io/client-go/rest"
|
||||
kubectldrain "k8s.io/kubectl/pkg/drain"
|
||||
|
||||
"github.com/google/shlex"
|
||||
|
||||
shoutrrr "github.com/containrrr/shoutrrr"
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
"github.com/prometheus/client_golang/prometheus/promhttp"
|
||||
"github.com/kubereboot/kured/pkg/alerts"
|
||||
"github.com/kubereboot/kured/pkg/daemonsetlock"
|
||||
"github.com/kubereboot/kured/pkg/delaytick"
|
||||
"github.com/kubereboot/kured/pkg/taints"
|
||||
"github.com/kubereboot/kured/pkg/timewindow"
|
||||
)
|
||||
|
||||
var (
|
||||
@@ -44,10 +41,15 @@ var (
|
||||
|
||||
// Command line flags
|
||||
forceReboot bool
|
||||
drainDelay time.Duration
|
||||
drainTimeout time.Duration
|
||||
rebootDelay time.Duration
|
||||
rebootMethod string
|
||||
period time.Duration
|
||||
metricsHost string
|
||||
metricsPort int
|
||||
drainGracePeriod int
|
||||
drainPodSelector string
|
||||
skipWaitForDeleteTimeoutSeconds int
|
||||
dsNamespace string
|
||||
dsName string
|
||||
@@ -56,7 +58,8 @@ var (
|
||||
lockReleaseDelay time.Duration
|
||||
prometheusURL string
|
||||
preferNoScheduleTaintName string
|
||||
alertFilter *regexp.Regexp
|
||||
alertFilter regexpValue
|
||||
alertFilterMatchOnly bool
|
||||
alertFiringOnly bool
|
||||
rebootSentinelFile string
|
||||
rebootSentinelCommand string
|
||||
@@ -69,10 +72,12 @@ var (
|
||||
messageTemplateUncordon string
|
||||
podSelectors []string
|
||||
rebootCommand string
|
||||
rebootSignal int
|
||||
logFormat string
|
||||
preRebootNodeLabels []string
|
||||
postRebootNodeLabels []string
|
||||
nodeID string
|
||||
concurrency int
|
||||
|
||||
rebootDays []string
|
||||
rebootStart string
|
||||
@@ -97,6 +102,8 @@ const (
|
||||
KuredMostRecentRebootNeededAnnotation string = "weave.works/kured-most-recent-reboot-needed"
|
||||
// EnvPrefix The environment variable prefix of all environment variables bound to our command line flags.
|
||||
EnvPrefix = "KURED"
|
||||
|
||||
sigTrminPlus5 = 34 + 5
|
||||
)
|
||||
|
||||
func init() {
|
||||
@@ -104,122 +111,185 @@ func init() {
|
||||
}
|
||||
|
||||
func main() {
|
||||
cmd := NewRootCommand()
|
||||
|
||||
if err := cmd.Execute(); err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
// NewRootCommand constructs the Cobra root command
|
||||
func NewRootCommand() *cobra.Command {
|
||||
rootCmd := &cobra.Command{
|
||||
Use: "kured",
|
||||
Short: "Kubernetes Reboot Daemon",
|
||||
PersistentPreRunE: bindViper,
|
||||
PreRun: flagCheck,
|
||||
Run: root}
|
||||
|
||||
rootCmd.PersistentFlags().StringVar(&nodeID, "node-id", "",
|
||||
flag.StringVar(&nodeID, "node-id", "",
|
||||
"node name kured runs on, should be passed down from spec.nodeName via KURED_NODE_ID environment variable")
|
||||
rootCmd.PersistentFlags().BoolVar(&forceReboot, "force-reboot", false,
|
||||
flag.BoolVar(&forceReboot, "force-reboot", false,
|
||||
"force a reboot even if the drain fails or times out")
|
||||
rootCmd.PersistentFlags().IntVar(&drainGracePeriod, "drain-grace-period", -1,
|
||||
flag.StringVar(&metricsHost, "metrics-host", "",
|
||||
"host where metrics will listen")
|
||||
flag.IntVar(&metricsPort, "metrics-port", 8080,
|
||||
"port number where metrics will listen")
|
||||
flag.IntVar(&drainGracePeriod, "drain-grace-period", -1,
|
||||
"time in seconds given to each pod to terminate gracefully, if negative, the default value specified in the pod will be used")
|
||||
rootCmd.PersistentFlags().IntVar(&skipWaitForDeleteTimeoutSeconds, "skip-wait-for-delete-timeout", 0,
|
||||
flag.StringVar(&drainPodSelector, "drain-pod-selector", "",
|
||||
"only drain pods with labels matching the selector (default: '', all pods)")
|
||||
flag.IntVar(&skipWaitForDeleteTimeoutSeconds, "skip-wait-for-delete-timeout", 0,
|
||||
"when seconds is greater than zero, skip waiting for the pods whose deletion timestamp is older than N seconds while draining a node")
|
||||
rootCmd.PersistentFlags().DurationVar(&drainTimeout, "drain-timeout", 0,
|
||||
flag.DurationVar(&drainDelay, "drain-delay", 0,
|
||||
"delay drain for this duration (default: 0, disabled)")
|
||||
flag.DurationVar(&drainTimeout, "drain-timeout", 0,
|
||||
"timeout after which the drain is aborted (default: 0, infinite time)")
|
||||
rootCmd.PersistentFlags().DurationVar(&rebootDelay, "reboot-delay", 0,
|
||||
flag.DurationVar(&rebootDelay, "reboot-delay", 0,
|
||||
"delay reboot for this duration (default: 0, disabled)")
|
||||
rootCmd.PersistentFlags().DurationVar(&period, "period", time.Minute*60,
|
||||
flag.StringVar(&rebootMethod, "reboot-method", "command",
|
||||
"method to use for reboots. Available: command")
|
||||
flag.DurationVar(&period, "period", time.Minute*60,
|
||||
"sentinel check period")
|
||||
rootCmd.PersistentFlags().StringVar(&dsNamespace, "ds-namespace", "kube-system",
|
||||
flag.StringVar(&dsNamespace, "ds-namespace", "kube-system",
|
||||
"namespace containing daemonset on which to place lock")
|
||||
rootCmd.PersistentFlags().StringVar(&dsName, "ds-name", "kured",
|
||||
flag.StringVar(&dsName, "ds-name", "kured",
|
||||
"name of daemonset on which to place lock")
|
||||
rootCmd.PersistentFlags().StringVar(&lockAnnotation, "lock-annotation", KuredNodeLockAnnotation,
|
||||
flag.StringVar(&lockAnnotation, "lock-annotation", KuredNodeLockAnnotation,
|
||||
"annotation in which to record locking node")
|
||||
rootCmd.PersistentFlags().DurationVar(&lockTTL, "lock-ttl", 0,
|
||||
flag.DurationVar(&lockTTL, "lock-ttl", 0,
|
||||
"expire lock annotation after this duration (default: 0, disabled)")
|
||||
rootCmd.PersistentFlags().DurationVar(&lockReleaseDelay, "lock-release-delay", 0,
|
||||
flag.DurationVar(&lockReleaseDelay, "lock-release-delay", 0,
|
||||
"delay lock release for this duration (default: 0, disabled)")
|
||||
rootCmd.PersistentFlags().StringVar(&prometheusURL, "prometheus-url", "",
|
||||
flag.StringVar(&prometheusURL, "prometheus-url", "",
|
||||
"Prometheus instance to probe for active alerts")
|
||||
rootCmd.PersistentFlags().Var(®expValue{&alertFilter}, "alert-filter-regexp",
|
||||
flag.Var(&alertFilter, "alert-filter-regexp",
|
||||
"alert names to ignore when checking for active alerts")
|
||||
rootCmd.PersistentFlags().BoolVar(&alertFiringOnly, "alert-firing-only", false,
|
||||
flag.BoolVar(&alertFilterMatchOnly, "alert-filter-match-only", false,
|
||||
"Only block if the alert-filter-regexp matches active alerts")
|
||||
flag.BoolVar(&alertFiringOnly, "alert-firing-only", false,
|
||||
"only consider firing alerts when checking for active alerts")
|
||||
rootCmd.PersistentFlags().StringVar(&rebootSentinelFile, "reboot-sentinel", "/var/run/reboot-required",
|
||||
flag.StringVar(&rebootSentinelFile, "reboot-sentinel", "/var/run/reboot-required",
|
||||
"path to file whose existence triggers the reboot command")
|
||||
rootCmd.PersistentFlags().StringVar(&preferNoScheduleTaintName, "prefer-no-schedule-taint", "",
|
||||
flag.StringVar(&preferNoScheduleTaintName, "prefer-no-schedule-taint", "",
|
||||
"Taint name applied during pending node reboot (to prevent receiving additional pods from other rebooting nodes). Disabled by default. Set e.g. to \"weave.works/kured-node-reboot\" to enable tainting.")
|
||||
rootCmd.PersistentFlags().StringVar(&rebootSentinelCommand, "reboot-sentinel-command", "",
|
||||
flag.StringVar(&rebootSentinelCommand, "reboot-sentinel-command", "",
|
||||
"command for which a zero return code will trigger a reboot command")
|
||||
rootCmd.PersistentFlags().StringVar(&rebootCommand, "reboot-command", "/bin/systemctl reboot",
|
||||
flag.StringVar(&rebootCommand, "reboot-command", "/bin/systemctl reboot",
|
||||
"command to run when a reboot is required")
|
||||
|
||||
rootCmd.PersistentFlags().StringVar(&slackHookURL, "slack-hook-url", "",
|
||||
flag.IntVar(&concurrency, "concurrency", 1,
|
||||
"amount of nodes to concurrently reboot. Defaults to 1")
|
||||
flag.IntVar(&rebootSignal, "reboot-signal", sigTrminPlus5,
|
||||
"signal to use for reboot, SIGRTMIN+5 by default.")
|
||||
flag.StringVar(&slackHookURL, "slack-hook-url", "",
|
||||
"slack hook URL for reboot notifications [deprecated in favor of --notify-url]")
|
||||
rootCmd.PersistentFlags().StringVar(&slackUsername, "slack-username", "kured",
|
||||
flag.StringVar(&slackUsername, "slack-username", "kured",
|
||||
"slack username for reboot notifications")
|
||||
rootCmd.PersistentFlags().StringVar(&slackChannel, "slack-channel", "",
|
||||
flag.StringVar(&slackChannel, "slack-channel", "",
|
||||
"slack channel for reboot notifications")
|
||||
rootCmd.PersistentFlags().StringVar(¬ifyURL, "notify-url", "",
|
||||
flag.StringVar(¬ifyURL, "notify-url", "",
|
||||
"notify URL for reboot notifications (cannot use with --slack-hook-url flags)")
|
||||
rootCmd.PersistentFlags().StringVar(&messageTemplateUncordon, "message-template-uncordon", "Node %s rebooted & uncordoned successfully!",
|
||||
flag.StringVar(&messageTemplateUncordon, "message-template-uncordon", "Node %s rebooted & uncordoned successfully!",
|
||||
"message template used to notify about a node being successfully uncordoned")
|
||||
rootCmd.PersistentFlags().StringVar(&messageTemplateDrain, "message-template-drain", "Draining node %s",
|
||||
flag.StringVar(&messageTemplateDrain, "message-template-drain", "Draining node %s",
|
||||
"message template used to notify about a node being drained")
|
||||
rootCmd.PersistentFlags().StringVar(&messageTemplateReboot, "message-template-reboot", "Rebooting node %s",
|
||||
flag.StringVar(&messageTemplateReboot, "message-template-reboot", "Rebooting node %s",
|
||||
"message template used to notify about a node being rebooted")
|
||||
|
||||
rootCmd.PersistentFlags().StringArrayVar(&podSelectors, "blocking-pod-selector", nil,
|
||||
flag.StringArrayVar(&podSelectors, "blocking-pod-selector", nil,
|
||||
"label selector identifying pods whose presence should prevent reboots")
|
||||
|
||||
rootCmd.PersistentFlags().StringSliceVar(&rebootDays, "reboot-days", timewindow.EveryDay,
|
||||
flag.StringSliceVar(&rebootDays, "reboot-days", timewindow.EveryDay,
|
||||
"schedule reboot on these days")
|
||||
rootCmd.PersistentFlags().StringVar(&rebootStart, "start-time", "0:00",
|
||||
flag.StringVar(&rebootStart, "start-time", "0:00",
|
||||
"schedule reboot only after this time of day")
|
||||
rootCmd.PersistentFlags().StringVar(&rebootEnd, "end-time", "23:59:59",
|
||||
flag.StringVar(&rebootEnd, "end-time", "23:59:59",
|
||||
"schedule reboot only before this time of day")
|
||||
rootCmd.PersistentFlags().StringVar(&timezone, "time-zone", "UTC",
|
||||
flag.StringVar(&timezone, "time-zone", "UTC",
|
||||
"use this timezone for schedule inputs")
|
||||
|
||||
rootCmd.PersistentFlags().BoolVar(&annotateNodes, "annotate-nodes", false,
|
||||
flag.BoolVar(&annotateNodes, "annotate-nodes", false,
|
||||
"if set, the annotations 'weave.works/kured-reboot-in-progress' and 'weave.works/kured-most-recent-reboot-needed' will be given to nodes undergoing kured reboots")
|
||||
|
||||
rootCmd.PersistentFlags().StringVar(&logFormat, "log-format", "text",
|
||||
flag.StringVar(&logFormat, "log-format", "text",
|
||||
"use text or json log format")
|
||||
|
||||
rootCmd.PersistentFlags().StringSliceVar(&preRebootNodeLabels, "pre-reboot-node-labels", nil,
|
||||
flag.StringSliceVar(&preRebootNodeLabels, "pre-reboot-node-labels", nil,
|
||||
"labels to add to nodes before cordoning")
|
||||
rootCmd.PersistentFlags().StringSliceVar(&postRebootNodeLabels, "post-reboot-node-labels", nil,
|
||||
flag.StringSliceVar(&postRebootNodeLabels, "post-reboot-node-labels", nil,
|
||||
"labels to add to nodes after uncordoning")
|
||||
|
||||
return rootCmd
|
||||
flag.Parse()
|
||||
|
||||
// Load flags from environment variables
|
||||
LoadFromEnv()
|
||||
|
||||
log.Infof("Kubernetes Reboot Daemon: %s", version)
|
||||
|
||||
if logFormat == "json" {
|
||||
log.SetFormatter(&log.JSONFormatter{})
|
||||
}
|
||||
|
||||
if nodeID == "" {
|
||||
log.Fatal("KURED_NODE_ID environment variable required")
|
||||
}
|
||||
log.Infof("Node ID: %s", nodeID)
|
||||
|
||||
notifyURL = validateNotificationURL(notifyURL, slackHookURL)
|
||||
|
||||
err := validateNodeLabels(preRebootNodeLabels, postRebootNodeLabels)
|
||||
if err != nil {
|
||||
log.Warnf(err.Error())
|
||||
}
|
||||
|
||||
log.Infof("PreferNoSchedule taint: %s", preferNoScheduleTaintName)
|
||||
|
||||
// This should be printed from blocker list instead of only blocking pod selectors
|
||||
log.Infof("Blocking Pod Selectors: %v", podSelectors)
|
||||
|
||||
log.Infof("Reboot period %v", period)
|
||||
log.Infof("Concurrency: %v", concurrency)
|
||||
|
||||
if annotateNodes {
|
||||
log.Infof("Will annotate nodes during kured reboot operations")
|
||||
}
|
||||
|
||||
// Now call the rest of the main loop.
|
||||
window, err := timewindow.New(rebootDays, rebootStart, rebootEnd, timezone)
|
||||
if err != nil {
|
||||
log.Fatalf("Failed to build time window: %v", err)
|
||||
}
|
||||
log.Infof("Reboot schedule: %v", window)
|
||||
|
||||
log.Infof("Reboot method: %s", rebootMethod)
|
||||
rebooter, err := internal.NewRebooter(rebootMethod, rebootCommand, rebootSignal)
|
||||
if err != nil {
|
||||
log.Fatalf("Failed to build rebooter: %v", err)
|
||||
}
|
||||
|
||||
rebootChecker, err := internal.NewRebootChecker(rebootSentinelCommand, rebootSentinelFile)
|
||||
if err != nil {
|
||||
log.Fatalf("Failed to build reboot checker: %v", err)
|
||||
}
|
||||
|
||||
config, err := rest.InClusterConfig()
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
|
||||
client, err := kubernetes.NewForConfig(config)
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
|
||||
var blockCheckers []blockers.RebootBlocker
|
||||
if prometheusURL != "" {
|
||||
blockCheckers = append(blockCheckers, blockers.NewPrometheusBlockingChecker(papi.Config{Address: prometheusURL}, alertFilter.Regexp, alertFiringOnly, alertFilterMatchOnly))
|
||||
}
|
||||
if podSelectors != nil {
|
||||
blockCheckers = append(blockCheckers, blockers.NewKubernetesBlockingChecker(client, nodeID, podSelectors))
|
||||
}
|
||||
log.Infof("Lock Annotation: %s/%s:%s", dsNamespace, dsName, lockAnnotation)
|
||||
if lockTTL > 0 {
|
||||
log.Infof("Lock TTL set, lock will expire after: %v", lockTTL)
|
||||
} else {
|
||||
log.Info("Lock TTL not set, lock will remain until being released")
|
||||
}
|
||||
if lockReleaseDelay > 0 {
|
||||
log.Infof("Lock release delay set, lock release will be delayed by: %v", lockReleaseDelay)
|
||||
} else {
|
||||
log.Info("Lock release delay not set, lock will be released immediately after rebooting")
|
||||
}
|
||||
lock := daemonsetlock.New(client, nodeID, dsNamespace, dsName, lockAnnotation, lockTTL, concurrency, lockReleaseDelay)
|
||||
|
||||
go rebootAsRequired(nodeID, rebooter, rebootChecker, blockCheckers, window, lock, client)
|
||||
go maintainRebootRequiredMetric(nodeID, rebootChecker)
|
||||
|
||||
http.Handle("/metrics", promhttp.Handler())
|
||||
log.Fatal(http.ListenAndServe(fmt.Sprintf("%s:%d", metricsHost, metricsPort), nil))
|
||||
}
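For context (not part of the diff): with the flag defaults shown above (--metrics-host "" and --metrics-port 8080), the promhttp handler registered in main serves Prometheus metrics on /metrics. A minimal sketch for probing it, assuming the defaults are unchanged and the endpoint is reachable on localhost:

package main

import (
	"fmt"
	"io"
	"log"
	"net/http"
)

func main() {
	// Scrape the endpoint exposed by promhttp.Handler() in kured's main().
	resp, err := http.Get("http://localhost:8080/metrics")
	if err != nil {
		log.Fatalf("metrics endpoint unreachable: %v", err)
	}
	defer resp.Body.Close()

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		log.Fatalf("reading metrics: %v", err)
	}
	// The output includes the per-node reboot-required gauge maintained above.
	fmt.Println(string(body))
}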
|
||||
|
||||
// func that checks for deprecated slack-notification-related flags and node labels that do not match
|
||||
func flagCheck(cmd *cobra.Command, args []string) {
|
||||
if slackHookURL != "" && notifyURL != "" {
|
||||
log.Warnf("Cannot use both --notify-url and --slack-hook-url flags. Kured will use --notify-url flag only...")
|
||||
}
|
||||
if notifyURL != "" {
|
||||
notifyURL = stripQuotes(notifyURL)
|
||||
} else if slackHookURL != "" {
|
||||
slackHookURL = stripQuotes(slackHookURL)
|
||||
log.Warnf("Deprecated flag(s). Please use --notify-url flag instead.")
|
||||
trataURL, err := url.Parse(slackHookURL)
|
||||
if err != nil {
|
||||
log.Warnf("slack-hook-url is not properly formatted... no notification will be sent: %v\n", err)
|
||||
}
|
||||
if len(strings.Split(strings.Trim(trataURL.Path, "/services/"), "/")) != 3 {
|
||||
log.Warnf("slack-hook-url is not properly formatted... no notification will be sent: unexpected number of / in URL\n")
|
||||
} else {
|
||||
notifyURL = fmt.Sprintf("slack://%s", strings.Trim(trataURL.Path, "/services/"))
|
||||
}
|
||||
}
|
||||
func validateNodeLabels(preRebootNodeLabels []string, postRebootNodeLabels []string) error {
|
||||
var preRebootNodeLabelKeys, postRebootNodeLabelKeys []string
|
||||
for _, label := range preRebootNodeLabels {
|
||||
preRebootNodeLabelKeys = append(preRebootNodeLabelKeys, strings.Split(label, "=")[0])
|
||||
@@ -230,8 +300,95 @@ func flagCheck(cmd *cobra.Command, args []string) {
|
||||
sort.Strings(preRebootNodeLabelKeys)
|
||||
sort.Strings(postRebootNodeLabelKeys)
|
||||
if !reflect.DeepEqual(preRebootNodeLabelKeys, postRebootNodeLabelKeys) {
|
||||
log.Warnf("pre-reboot-node-labels keys and post-reboot-node-labels keys do not match. This may result in unexpected behaviour.")
|
||||
return fmt.Errorf("pre-reboot-node-labels keys and post-reboot-node-labels keys do not match, resulting in unexpected behaviour")
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func validateNotificationURL(notifyURL string, slackHookURL string) string {
|
||||
switch {
|
||||
case slackHookURL != "" && notifyURL != "":
|
||||
log.Warnf("Cannot use both --notify-url (given: %v) and --slack-hook-url (given: %v) flags. Kured will only use --notify-url flag", slackHookURL, notifyURL)
|
||||
return validateNotificationURL(notifyURL, "")
|
||||
case notifyURL != "":
|
||||
return stripQuotes(notifyURL)
|
||||
case slackHookURL != "":
|
||||
log.Warnf("Deprecated flag(s). Please use --notify-url flag instead.")
|
||||
parsedURL, err := url.Parse(stripQuotes(slackHookURL))
|
||||
if err != nil {
|
||||
log.Errorf("slack-hook-url is not properly formatted... no notification will be sent: %v\n", err)
|
||||
return ""
|
||||
}
|
||||
if len(strings.Split(strings.Replace(parsedURL.Path, "/services/", "", -1), "/")) != 3 {
|
||||
log.Errorf("slack-hook-url is not properly formatted... no notification will be sent: unexpected number of / in URL\n")
|
||||
return ""
|
||||
}
|
||||
return fmt.Sprintf("slack://%s", strings.Replace(parsedURL.Path, "/services/", "", -1))
|
||||
}
|
||||
return ""
|
||||
}
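As a reading aid (not part of the diff), the deprecated-flag path of validateNotificationURL converts a legacy Slack webhook into a shoutrrr URL; the token values below are placeholders, not real credentials:

// Sketch of the conversion performed by validateNotificationURL above:
//
//   validateNotificationURL("", "https://hooks.slack.com/services/T000/B000/XXXX")
//   // returns "slack://T000/B000/XXXX"
//
//   validateNotificationURL("teams://example", "https://hooks.slack.com/services/T000/B000/XXXX")
//   // returns the --notify-url value (quotes stripped); --slack-hook-url is ignored with a warning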
|
||||
|
||||
// LoadFromEnv attempts to load environment variables corresponding to flags.
|
||||
// It looks for an environment variable with the uppercase version of the flag name (prefixed by EnvPrefix).
|
||||
func LoadFromEnv() {
|
||||
flag.VisitAll(func(f *flag.Flag) {
|
||||
envVarName := fmt.Sprintf("%s_%s", EnvPrefix, strings.ToUpper(strings.ReplaceAll(f.Name, "-", "_")))
|
||||
|
||||
if envValue, exists := os.LookupEnv(envVarName); exists {
|
||||
switch f.Value.Type() {
|
||||
case "int":
|
||||
if parsedVal, err := strconv.Atoi(envValue); err == nil {
|
||||
err := flag.Set(f.Name, strconv.Itoa(parsedVal))
|
||||
if err != nil {
|
||||
fmt.Printf("cannot set flag %s from env var named %s", f.Name, envVarName)
|
||||
os.Exit(1)
|
||||
} // Set int flag
|
||||
} else {
|
||||
fmt.Printf("Invalid value for env var named %s", envVarName)
|
||||
os.Exit(1)
|
||||
}
|
||||
case "string":
|
||||
err := flag.Set(f.Name, envValue)
|
||||
if err != nil {
|
||||
fmt.Printf("cannot set flag %s from env{%s}: %s\n", f.Name, envVarName, envValue)
|
||||
os.Exit(1)
|
||||
} // Set string flag
|
||||
case "bool":
|
||||
if parsedVal, err := strconv.ParseBool(envValue); err == nil {
|
||||
err := flag.Set(f.Name, strconv.FormatBool(parsedVal))
|
||||
if err != nil {
|
||||
fmt.Printf("cannot set flag %s from env{%s}: %s\n", f.Name, envVarName, envValue)
|
||||
os.Exit(1)
|
||||
} // Set boolean flag
|
||||
} else {
|
||||
fmt.Printf("Invalid value for %s: %s\n", envVarName, envValue)
|
||||
os.Exit(1)
|
||||
}
|
||||
case "duration":
|
||||
// Set duration from the environment variable (e.g., "1h30m")
|
||||
if _, err := time.ParseDuration(envValue); err == nil {
|
||||
flag.Set(f.Name, envValue)
|
||||
} else {
|
||||
fmt.Printf("Invalid duration for %s: %s\n", envVarName, envValue)
|
||||
os.Exit(1)
|
||||
}
|
||||
case "regexp":
|
||||
// For regexp, set it from the environment variable
|
||||
flag.Set(f.Name, envValue)
|
||||
case "stringSlice":
|
||||
// For stringSlice, split the environment variable by commas and set it
|
||||
err := flag.Set(f.Name, envValue)
|
||||
if err != nil {
|
||||
fmt.Printf("cannot set flag %s from env{%s}: %s\n", f.Name, envVarName, envValue)
|
||||
os.Exit(1)
|
||||
}
|
||||
default:
|
||||
fmt.Printf("Unsupported flag type for %s\n", f.Name)
|
||||
}
|
||||
}
|
||||
})
|
||||
|
||||
}
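The mapping applied by LoadFromEnv follows the convention described in its doc comment: dashes become underscores, the result is upper-cased, and the KURED_ prefix (EnvPrefix) is prepended. A small self-contained sketch of that naming rule; the flag names are only examples:

package main

import (
	"fmt"
	"strings"
)

// envName mirrors the envVarName construction inside LoadFromEnv above.
func envName(flagName string) string {
	return fmt.Sprintf("KURED_%s", strings.ToUpper(strings.ReplaceAll(flagName, "-", "_")))
}

func main() {
	fmt.Println(envName("drain-timeout")) // KURED_DRAIN_TIMEOUT
	fmt.Println(envName("reboot-days"))   // KURED_REBOOT_DAYS
	fmt.Println(envName("period"))        // KURED_PERIOD
}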
|
||||
|
||||
// stripQuotes removes any literal single or double quote chars that surround a string
|
||||
@@ -247,207 +404,6 @@ func stripQuotes(str string) string {
|
||||
return str
|
||||
}
|
||||
|
||||
// bindViper initializes viper and binds command flags with environment variables
|
||||
func bindViper(cmd *cobra.Command, args []string) error {
|
||||
v := viper.New()
|
||||
|
||||
v.SetEnvPrefix(EnvPrefix)
|
||||
v.AutomaticEnv()
|
||||
bindFlags(cmd, v)
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// bindFlags binds each cobra flag to its associated viper configuration (environment variable)
|
||||
func bindFlags(cmd *cobra.Command, v *viper.Viper) {
|
||||
cmd.Flags().VisitAll(func(f *pflag.Flag) {
|
||||
// Environment variables can't have dashes in them, so bind them to their equivalent keys with underscores
|
||||
if strings.Contains(f.Name, "-") {
|
||||
v.BindEnv(f.Name, flagToEnvVar(f.Name))
|
||||
}
|
||||
|
||||
// Apply the viper config value to the flag when the flag is not set and viper has a value
|
||||
if !f.Changed && v.IsSet(f.Name) {
|
||||
val := v.Get(f.Name)
|
||||
log.Infof("Binding %s command flag to environment variable: %s", f.Name, flagToEnvVar(f.Name))
|
||||
cmd.Flags().Set(f.Name, fmt.Sprintf("%v", val))
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
// flagToEnvVar converts command flag name to equivalent environment variable name
|
||||
func flagToEnvVar(flag string) string {
|
||||
envVarSuffix := strings.ToUpper(strings.ReplaceAll(flag, "-", "_"))
|
||||
return fmt.Sprintf("%s_%s", EnvPrefix, envVarSuffix)
|
||||
}
|
||||
|
||||
// newCommand creates a new Command with stdout/stderr wired to our standard logger
|
||||
func newCommand(name string, arg ...string) *exec.Cmd {
|
||||
cmd := exec.Command(name, arg...)
|
||||
cmd.Stdout = log.NewEntry(log.StandardLogger()).
|
||||
WithField("cmd", cmd.Args[0]).
|
||||
WithField("std", "out").
|
||||
WriterLevel(log.InfoLevel)
|
||||
|
||||
cmd.Stderr = log.NewEntry(log.StandardLogger()).
|
||||
WithField("cmd", cmd.Args[0]).
|
||||
WithField("std", "err").
|
||||
WriterLevel(log.WarnLevel)
|
||||
|
||||
return cmd
|
||||
}
|
||||
|
||||
// buildHostCommand writes a new command to run in the host namespace
|
||||
// Rancher-based setups need a different pid
|
||||
func buildHostCommand(pid int, command []string) []string {
|
||||
|
||||
// From the container, we nsenter into the proper PID to run the hostCommand.
|
||||
// For this, the kured daemonset needs to be configured with hostPID:true and privileged:true
|
||||
cmd := []string{"/usr/bin/nsenter", fmt.Sprintf("-m/proc/%d/ns/mnt", pid), "--"}
|
||||
cmd = append(cmd, command...)
|
||||
return cmd
|
||||
}
|
||||
|
||||
func rebootRequired(sentinelCommand []string) bool {
|
||||
if err := newCommand(sentinelCommand[0], sentinelCommand[1:]...).Run(); err != nil {
|
||||
switch err := err.(type) {
|
||||
case *exec.ExitError:
|
||||
// We assume a non-zero exit code means 'reboot not required', but of course
|
||||
// the user could have misconfigured the sentinel command or something else
|
||||
// went wrong during its execution. In that case, not entering a reboot loop
|
||||
// is the right thing to do, and we are logging stdout/stderr of the command
|
||||
// so it should be obvious what is wrong.
|
||||
return false
|
||||
default:
|
||||
// Something was grossly misconfigured, such as the command path being wrong.
|
||||
log.Fatalf("Error invoking sentinel command: %v", err)
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
// RebootBlocker interface should be implemented by types
|
||||
// to know if their instantiations should block a reboot
|
||||
type RebootBlocker interface {
|
||||
isBlocked() bool
|
||||
}
|
||||
|
||||
// PrometheusBlockingChecker contains info for connecting
|
||||
// to prometheus, and can give info about whether a reboot should be blocked
|
||||
type PrometheusBlockingChecker struct {
|
||||
// prometheusClient to make prometheus-go-client and api config available
|
||||
// into the PrometheusBlockingChecker struct
|
||||
promClient *alerts.PromClient
|
||||
// regexp used to get alerts
|
||||
filter *regexp.Regexp
|
||||
// bool to indicate if only firing alerts should be considered
|
||||
firingOnly bool
|
||||
}
|
||||
|
||||
// KubernetesBlockingChecker contains info for connecting
|
||||
// to k8s, and can give info about whether a reboot should be blocked
|
||||
type KubernetesBlockingChecker struct {
|
||||
// client used to contact kubernetes API
|
||||
client *kubernetes.Clientset
|
||||
nodename string
|
||||
// list used to filter pods (podSelector)
|
||||
filter []string
|
||||
}
|
||||
|
||||
func (pb PrometheusBlockingChecker) isBlocked() bool {
|
||||
|
||||
alertNames, err := pb.promClient.ActiveAlerts(pb.filter, pb.firingOnly)
|
||||
if err != nil {
|
||||
log.Warnf("Reboot blocked: prometheus query error: %v", err)
|
||||
return true
|
||||
}
|
||||
count := len(alertNames)
|
||||
if count > 10 {
|
||||
alertNames = append(alertNames[:10], "...")
|
||||
}
|
||||
if count > 0 {
|
||||
log.Warnf("Reboot blocked: %d active alerts: %v", count, alertNames)
|
||||
return true
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func (kb KubernetesBlockingChecker) isBlocked() bool {
|
||||
fieldSelector := fmt.Sprintf("spec.nodeName=%s,status.phase!=Succeeded,status.phase!=Failed,status.phase!=Unknown", kb.nodename)
|
||||
for _, labelSelector := range kb.filter {
|
||||
podList, err := kb.client.CoreV1().Pods("").List(context.TODO(), metav1.ListOptions{
|
||||
LabelSelector: labelSelector,
|
||||
FieldSelector: fieldSelector,
|
||||
Limit: 10})
|
||||
if err != nil {
|
||||
log.Warnf("Reboot blocked: pod query error: %v", err)
|
||||
return true
|
||||
}
|
||||
|
||||
if len(podList.Items) > 0 {
|
||||
podNames := make([]string, 0, len(podList.Items))
|
||||
for _, pod := range podList.Items {
|
||||
podNames = append(podNames, pod.Name)
|
||||
}
|
||||
if len(podList.Continue) > 0 {
|
||||
podNames = append(podNames, "...")
|
||||
}
|
||||
log.Warnf("Reboot blocked: matching pods: %v", podNames)
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func rebootBlocked(blockers ...RebootBlocker) bool {
|
||||
for _, blocker := range blockers {
|
||||
if blocker.isBlocked() {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func holding(lock *daemonsetlock.DaemonSetLock, metadata interface{}) bool {
|
||||
holding, err := lock.Test(metadata)
|
||||
if err != nil {
|
||||
log.Fatalf("Error testing lock: %v", err)
|
||||
}
|
||||
if holding {
|
||||
log.Infof("Holding lock")
|
||||
}
|
||||
return holding
|
||||
}
|
||||
|
||||
func acquire(lock *daemonsetlock.DaemonSetLock, metadata interface{}, TTL time.Duration) bool {
|
||||
holding, holder, err := lock.Acquire(metadata, TTL)
|
||||
switch {
|
||||
case err != nil:
|
||||
log.Fatalf("Error acquiring lock: %v", err)
|
||||
return false
|
||||
case !holding:
|
||||
log.Warnf("Lock already held: %v", holder)
|
||||
return false
|
||||
default:
|
||||
log.Infof("Acquired reboot lock")
|
||||
return true
|
||||
}
|
||||
}
|
||||
|
||||
func throttle(releaseDelay time.Duration) {
|
||||
if releaseDelay > 0 {
|
||||
log.Infof("Delaying lock release by %v", releaseDelay)
|
||||
time.Sleep(releaseDelay)
|
||||
}
|
||||
}
|
||||
|
||||
func release(lock *daemonsetlock.DaemonSetLock) {
|
||||
log.Infof("Releasing lock")
|
||||
if err := lock.Release(); err != nil {
|
||||
log.Fatalf("Error releasing lock: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func drain(client *kubernetes.Clientset, node *v1.Node) error {
|
||||
nodename := node.GetName()
|
||||
|
||||
@@ -455,6 +411,11 @@ func drain(client *kubernetes.Clientset, node *v1.Node) error {
|
||||
updateNodeLabels(client, node, preRebootNodeLabels)
|
||||
}
|
||||
|
||||
if drainDelay > 0 {
|
||||
log.Infof("Delaying drain for %v", drainDelay)
|
||||
time.Sleep(drainDelay)
|
||||
}
|
||||
|
||||
log.Infof("Draining node %s", nodename)
|
||||
|
||||
if notifyURL != "" {
|
||||
@@ -467,6 +428,7 @@ func drain(client *kubernetes.Clientset, node *v1.Node) error {
|
||||
Client: client,
|
||||
Ctx: context.Background(),
|
||||
GracePeriodSeconds: drainGracePeriod,
|
||||
PodSelector: drainPodSelector,
|
||||
SkipWaitForDeleteTimeoutSeconds: skipWaitForDeleteTimeoutSeconds,
|
||||
Force: true,
|
||||
DeleteEmptyDirData: true,
|
||||
@@ -506,23 +468,9 @@ func uncordon(client *kubernetes.Clientset, node *v1.Node) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func invokeReboot(nodeID string, rebootCommand []string) {
|
||||
log.Infof("Running command: %s for node: %s", rebootCommand, nodeID)
|
||||
|
||||
if notifyURL != "" {
|
||||
if err := shoutrrr.Send(notifyURL, fmt.Sprintf(messageTemplateReboot, nodeID)); err != nil {
|
||||
log.Warnf("Error notifying: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
if err := newCommand(rebootCommand[0], rebootCommand[1:]...).Run(); err != nil {
|
||||
log.Fatalf("Error invoking reboot command: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func maintainRebootRequiredMetric(nodeID string, sentinelCommand []string) {
|
||||
func maintainRebootRequiredMetric(nodeID string, checker checkers.Checker) {
|
||||
for {
|
||||
if rebootRequired(sentinelCommand) {
|
||||
if checker.RebootRequired() {
|
||||
rebootRequiredGauge.WithLabelValues(nodeID).Set(1)
|
||||
} else {
|
||||
rebootRequiredGauge.WithLabelValues(nodeID).Set(0)
|
||||
@@ -531,11 +479,6 @@ func maintainRebootRequiredMetric(nodeID string, sentinelCommand []string) {
|
||||
}
|
||||
}
|
||||
|
||||
// nodeMeta is used to remember information across reboots
|
||||
type nodeMeta struct {
|
||||
Unschedulable bool `json:"unschedulable"`
|
||||
}
|
||||
|
||||
func addNodeAnnotations(client *kubernetes.Clientset, nodeID string, annotations map[string]string) error {
|
||||
node, err := client.CoreV1().Nodes().Get(context.TODO(), nodeID, metav1.GetOptions{})
|
||||
if err != nil {
|
||||
@@ -610,30 +553,23 @@ func updateNodeLabels(client *kubernetes.Clientset, node *v1.Node, labels []stri
|
||||
}
|
||||
}
|
||||
|
||||
func rebootAsRequired(nodeID string, rebootCommand []string, sentinelCommand []string, window *timewindow.TimeWindow, TTL time.Duration, releaseDelay time.Duration) {
|
||||
config, err := rest.InClusterConfig()
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
func rebootAsRequired(nodeID string, rebooter reboot.Rebooter, checker checkers.Checker, blockCheckers []blockers.RebootBlocker, window *timewindow.TimeWindow, lock daemonsetlock.Lock, client *kubernetes.Clientset) {
|
||||
|
||||
client, err := kubernetes.NewForConfig(config)
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
|
||||
lock := daemonsetlock.New(client, nodeID, dsNamespace, dsName, lockAnnotation)
|
||||
|
||||
nodeMeta := nodeMeta{}
|
||||
source := rand.NewSource(time.Now().UnixNano())
|
||||
tick := delaytick.New(source, 1*time.Minute)
|
||||
for range tick {
|
||||
if holding(lock, &nodeMeta) {
|
||||
holding, lockData, err := lock.Holding()
|
||||
if err != nil {
|
||||
log.Errorf("Error testing lock: %v", err)
|
||||
}
|
||||
if holding {
|
||||
node, err := client.CoreV1().Nodes().Get(context.TODO(), nodeID, metav1.GetOptions{})
|
||||
if err != nil {
|
||||
log.Errorf("Error retrieving node object via k8s API: %v", err)
|
||||
continue
|
||||
}
|
||||
if !nodeMeta.Unschedulable {
|
||||
|
||||
if !lockData.Metadata.Unschedulable {
|
||||
err = uncordon(client, node)
|
||||
if err != nil {
|
||||
log.Errorf("Unable to uncordon %s: %v, will continue to hold lock and retry uncordon", node.GetName(), err)
|
||||
@@ -651,7 +587,7 @@ func rebootAsRequired(nodeID string, rebootCommand []string, sentinelCommand []s
|
||||
// And (2) check if we previously annotated the node that it was in the process of being rebooted,
|
||||
// And finally (3) if it has that annotation, to delete it.
|
||||
// This indicates to other node tools running on the cluster that this node may be a candidate for maintenance
|
||||
if annotateNodes && !rebootRequired(sentinelCommand) {
|
||||
if annotateNodes && !checker.RebootRequired() {
|
||||
if _, ok := node.Annotations[KuredRebootInProgressAnnotation]; ok {
|
||||
err := deleteNodeAnnotation(client, nodeID, KuredRebootInProgressAnnotation)
|
||||
if err != nil {
|
||||
@@ -659,8 +595,12 @@ func rebootAsRequired(nodeID string, rebootCommand []string, sentinelCommand []s
|
||||
}
|
||||
}
|
||||
}
|
||||
throttle(releaseDelay)
|
||||
release(lock)
|
||||
|
||||
err = lock.Release()
|
||||
if err != nil {
|
||||
log.Errorf("Error releasing lock, will retry: %v", err)
|
||||
continue
|
||||
}
|
||||
break
|
||||
} else {
|
||||
break
|
||||
@@ -670,16 +610,10 @@ func rebootAsRequired(nodeID string, rebootCommand []string, sentinelCommand []s
|
||||
preferNoScheduleTaint := taints.New(client, nodeID, preferNoScheduleTaintName, v1.TaintEffectPreferNoSchedule)
|
||||
|
||||
// Remove taint immediately during startup to quickly allow scheduling again.
|
||||
if !rebootRequired(sentinelCommand) {
|
||||
if !checker.RebootRequired() {
|
||||
preferNoScheduleTaint.Disable()
|
||||
}
|
||||
|
||||
// instantiate prometheus client
|
||||
promClient, err := alerts.NewPromClient(papi.Config{Address: prometheusURL})
|
||||
if err != nil {
|
||||
log.Fatal("Unable to create prometheus client: ", err)
|
||||
}
|
||||
|
||||
source = rand.NewSource(time.Now().UnixNano())
|
||||
tick = delaytick.New(source, period)
|
||||
for range tick {
|
||||
@@ -689,30 +623,18 @@ func rebootAsRequired(nodeID string, rebootCommand []string, sentinelCommand []s
|
||||
continue
|
||||
}
|
||||
|
||||
if !rebootRequired(sentinelCommand) {
|
||||
if !checker.RebootRequired() {
|
||||
log.Infof("Reboot not required")
|
||||
preferNoScheduleTaint.Disable()
|
||||
continue
|
||||
}
|
||||
log.Infof("Reboot required")
|
||||
|
||||
var blockCheckers []RebootBlocker
|
||||
if prometheusURL != "" {
|
||||
blockCheckers = append(blockCheckers, PrometheusBlockingChecker{promClient: promClient, filter: alertFilter, firingOnly: alertFiringOnly})
|
||||
}
|
||||
if podSelectors != nil {
|
||||
blockCheckers = append(blockCheckers, KubernetesBlockingChecker{client: client, nodename: nodeID, filter: podSelectors})
|
||||
}
|
||||
|
||||
if rebootBlocked(blockCheckers...) {
|
||||
continue
|
||||
}
|
||||
|
||||
node, err := client.CoreV1().Nodes().Get(context.TODO(), nodeID, metav1.GetOptions{})
|
||||
if err != nil {
|
||||
log.Fatalf("Error retrieving node object via k8s API: %v", err)
|
||||
}
|
||||
nodeMeta.Unschedulable = node.Spec.Unschedulable
|
||||
|
||||
nodeMeta := daemonsetlock.NodeMeta{Unschedulable: node.Spec.Unschedulable}
|
||||
|
||||
var timeNowString string
|
||||
if annotateNodes {
|
||||
@@ -730,17 +652,39 @@ func rebootAsRequired(nodeID string, rebootCommand []string, sentinelCommand []s
|
||||
}
|
||||
}
|
||||
|
||||
if !holding(lock, &nodeMeta) && !acquire(lock, &nodeMeta, TTL) {
|
||||
// Prefer to not schedule pods onto this node to avoid draining the same pod multiple times.
|
||||
preferNoScheduleTaint.Enable()
|
||||
var rebootRequiredBlockCondition string
|
||||
if blockers.RebootBlocked(blockCheckers...) {
|
||||
rebootRequiredBlockCondition = ", but blocked at this time"
|
||||
continue
|
||||
}
|
||||
log.Infof("Reboot required%s", rebootRequiredBlockCondition)
|
||||
|
||||
holding, _, err := lock.Holding()
|
||||
if err != nil {
|
||||
log.Errorf("Error testing lock: %v", err)
|
||||
}
|
||||
|
||||
if !holding {
|
||||
acquired, holder, err := lock.Acquire(nodeMeta)
|
||||
if err != nil {
|
||||
log.Errorf("Error acquiring lock: %v", err)
|
||||
}
|
||||
if !acquired {
|
||||
log.Warnf("Lock already held: %v", holder)
|
||||
// Prefer to not schedule pods onto this node to avoid draining the same pod multiple times.
|
||||
preferNoScheduleTaint.Enable()
|
||||
continue
|
||||
}
|
||||
}
|
||||
|
||||
err = drain(client, node)
|
||||
if err != nil {
|
||||
if !forceReboot {
|
||||
log.Errorf("Unable to cordon or drain %s: %v, will release lock and retry cordon and drain before rebooting when lock is next acquired", node.GetName(), err)
|
||||
release(lock)
|
||||
err = lock.Release()
|
||||
if err != nil {
|
||||
log.Errorf("Error releasing lock: %v", err)
|
||||
}
|
||||
log.Infof("Performing a best-effort uncordon after failed cordon and drain")
|
||||
uncordon(client, node)
|
||||
continue
|
||||
@@ -752,86 +696,20 @@ func rebootAsRequired(nodeID string, rebootCommand []string, sentinelCommand []s
|
||||
time.Sleep(rebootDelay)
|
||||
}
|
||||
|
||||
invokeReboot(nodeID, rebootCommand)
|
||||
if notifyURL != "" {
|
||||
if err := shoutrrr.Send(notifyURL, fmt.Sprintf(messageTemplateReboot, nodeID)); err != nil {
|
||||
log.Warnf("Error notifying: %v", err)
|
||||
}
|
||||
}
|
||||
log.Infof("Triggering reboot for node %v", nodeID)
|
||||
|
||||
err = rebooter.Reboot()
|
||||
if err != nil {
|
||||
log.Fatalf("Unable to reboot node: %v", err)
|
||||
}
|
||||
for {
|
||||
log.Infof("Waiting for reboot")
|
||||
time.Sleep(time.Minute)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// buildSentinelCommand creates the shell command line which will need wrapping to escape
|
||||
// the container boundaries
|
||||
func buildSentinelCommand(rebootSentinelFile string, rebootSentinelCommand string) []string {
|
||||
if rebootSentinelCommand != "" {
|
||||
cmd, err := shlex.Split(rebootSentinelCommand)
|
||||
if err != nil {
|
||||
log.Fatalf("Error parsing provided sentinel command: %v", err)
|
||||
}
|
||||
return cmd
|
||||
}
|
||||
return []string{"test", "-f", rebootSentinelFile}
|
||||
}
|
||||
|
||||
// parseRebootCommand creates the shell command line which will need wrapping to escape
|
||||
// the container boundaries
|
||||
func parseRebootCommand(rebootCommand string) []string {
|
||||
command, err := shlex.Split(rebootCommand)
|
||||
if err != nil {
|
||||
log.Fatalf("Error parsing provided reboot command: %v", err)
|
||||
}
|
||||
return command
|
||||
}
|
||||
|
||||
func root(cmd *cobra.Command, args []string) {
|
||||
if logFormat == "json" {
|
||||
log.SetFormatter(&log.JSONFormatter{})
|
||||
}
|
||||
|
||||
log.Infof("Kubernetes Reboot Daemon: %s", version)
|
||||
|
||||
if nodeID == "" {
|
||||
log.Fatal("KURED_NODE_ID environment variable required")
|
||||
}
|
||||
|
||||
window, err := timewindow.New(rebootDays, rebootStart, rebootEnd, timezone)
|
||||
if err != nil {
|
||||
log.Fatalf("Failed to build time window: %v", err)
|
||||
}
|
||||
|
||||
sentinelCommand := buildSentinelCommand(rebootSentinelFile, rebootSentinelCommand)
|
||||
restartCommand := parseRebootCommand(rebootCommand)
|
||||
|
||||
log.Infof("Node ID: %s", nodeID)
|
||||
log.Infof("Lock Annotation: %s/%s:%s", dsNamespace, dsName, lockAnnotation)
|
||||
if lockTTL > 0 {
|
||||
log.Infof("Lock TTL set, lock will expire after: %v", lockTTL)
|
||||
} else {
|
||||
log.Info("Lock TTL not set, lock will remain until being released")
|
||||
}
|
||||
if lockReleaseDelay > 0 {
|
||||
log.Infof("Lock release delay set, lock release will be delayed by: %v", lockReleaseDelay)
|
||||
} else {
|
||||
log.Info("Lock release delay not set, lock will be released immediately after rebooting")
|
||||
}
|
||||
log.Infof("PreferNoSchedule taint: %s", preferNoScheduleTaintName)
|
||||
log.Infof("Blocking Pod Selectors: %v", podSelectors)
|
||||
log.Infof("Reboot schedule: %v", window)
|
||||
log.Infof("Reboot check command: %s every %v", sentinelCommand, period)
|
||||
log.Infof("Reboot command: %s", restartCommand)
|
||||
if annotateNodes {
|
||||
log.Infof("Will annotate nodes during kured reboot operations")
|
||||
}
|
||||
|
||||
// To run those commands as if they were run on the host, we'll use nsenter
|
||||
// Relies on hostPID:true and privileged:true to enter host mount space
|
||||
// PID set to 1, until we have a better discovery mechanism.
|
||||
hostSentinelCommand := buildHostCommand(1, sentinelCommand)
|
||||
hostRestartCommand := buildHostCommand(1, restartCommand)
|
||||
|
||||
go rebootAsRequired(nodeID, hostRestartCommand, hostSentinelCommand, window, lockTTL, lockReleaseDelay)
|
||||
go maintainRebootRequiredMetric(nodeID, hostSentinelCommand)
|
||||
|
||||
http.Handle("/metrics", promhttp.Handler())
|
||||
log.Fatal(http.ListenAndServe(":8080", nil))
|
||||
}
|
||||
|
||||
@@ -3,61 +3,29 @@ package main
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
log "github.com/sirupsen/logrus"
|
||||
"github.com/spf13/cobra"
|
||||
"github.com/kubereboot/kured/pkg/alerts"
|
||||
assert "gotest.tools/v3/assert"
|
||||
|
||||
papi "github.com/prometheus/client_golang/api"
|
||||
)
|
||||
|
||||
type BlockingChecker struct {
|
||||
blocking bool
|
||||
}
|
||||
func TestValidateNotificationURL(t *testing.T) {
|
||||
|
||||
func (fbc BlockingChecker) isBlocked() bool {
|
||||
return fbc.blocking
|
||||
}
|
||||
|
||||
var _ RebootBlocker = BlockingChecker{} // Verify that Type implements Interface.
|
||||
var _ RebootBlocker = (*BlockingChecker)(nil) // Verify that *Type implements Interface.
|
||||
|
||||
func Test_flagCheck(t *testing.T) {
|
||||
var cmd *cobra.Command
|
||||
var args []string
|
||||
slackHookURL = "https://hooks.slack.com/services/BLABLABA12345/IAM931A0VERY/COMPLICATED711854TOKEN1SET"
|
||||
expected := "slack://BLABLABA12345/IAM931A0VERY/COMPLICATED711854TOKEN1SET"
|
||||
flagCheck(cmd, args)
|
||||
if notifyURL != expected {
|
||||
t.Errorf("Slack URL Parsing is wrong: expecting %s but got %s\n", expected, notifyURL)
|
||||
tests := []struct {
|
||||
name string
|
||||
slackHookURL string
|
||||
notifyURL string
|
||||
expected string
|
||||
}{
|
||||
{"slackHookURL only works fine", "https://hooks.slack.com/services/BLABLABA12345/IAM931A0VERY/COMPLICATED711854TOKEN1SET", "", "slack://BLABLABA12345/IAM931A0VERY/COMPLICATED711854TOKEN1SET"},
|
||||
{"slackHookURL and notify URL together only keeps notifyURL", "\"https://hooks.slack.com/services/BLABLABA12345/IAM931A0VERY/COMPLICATED711854TOKEN1SET\"", "teams://79b4XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX@acd8XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX/204cXXXXXXXXXXXXXXXXXXXXXXXXXXXX/a1f8XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX?host=XXXX.webhook.office.com", "teams://79b4XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX@acd8XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX/204cXXXXXXXXXXXXXXXXXXXXXXXXXXXX/a1f8XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX?host=XXXX.webhook.office.com"},
|
||||
{"slackHookURL removes extraneous double quotes", "\"https://hooks.slack.com/services/BLABLABA12345/IAM931A0VERY/COMPLICATED711854TOKEN1SET\"", "", "slack://BLABLABA12345/IAM931A0VERY/COMPLICATED711854TOKEN1SET"},
|
||||
{"slackHookURL removes extraneous single quotes", "'https://hooks.slack.com/services/BLABLABA12345/IAM931A0VERY/COMPLICATED711854TOKEN1SET'", "", "slack://BLABLABA12345/IAM931A0VERY/COMPLICATED711854TOKEN1SET"},
|
||||
{"notifyURL removes extraneous double quotes", "", "\"teams://79b4XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX@acd8XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX/204cXXXXXXXXXXXXXXXXXXXXXXXXXXXX/a1f8XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX?host=XXXX.webhook.office.com\"", "teams://79b4XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX@acd8XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX/204cXXXXXXXXXXXXXXXXXXXXXXXXXXXX/a1f8XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX?host=XXXX.webhook.office.com"},
|
||||
{"notifyURL removes extraneous single quotes", "", "'teams://79b4XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX@acd8XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX/204cXXXXXXXXXXXXXXXXXXXXXXXXXXXX/a1f8XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX?host=XXXX.webhook.office.com'", "teams://79b4XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX@acd8XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX/204cXXXXXXXXXXXXXXXXXXXXXXXXXXXX/a1f8XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX?host=XXXX.webhook.office.com"},
|
||||
}
|
||||
|
||||
// validate that surrounding quotes are stripped
|
||||
slackHookURL = "\"https://hooks.slack.com/services/BLABLABA12345/IAM931A0VERY/COMPLICATED711854TOKEN1SET\""
|
||||
expected = "slack://BLABLABA12345/IAM931A0VERY/COMPLICATED711854TOKEN1SET"
|
||||
flagCheck(cmd, args)
|
||||
if notifyURL != expected {
|
||||
t.Errorf("Slack URL Parsing is wrong: expecting %s but got %s\n", expected, notifyURL)
|
||||
}
|
||||
slackHookURL = "'https://hooks.slack.com/services/BLABLABA12345/IAM931A0VERY/COMPLICATED711854TOKEN1SET'"
|
||||
expected = "slack://BLABLABA12345/IAM931A0VERY/COMPLICATED711854TOKEN1SET"
|
||||
flagCheck(cmd, args)
|
||||
if notifyURL != expected {
|
||||
t.Errorf("Slack URL Parsing is wrong: expecting %s but got %s\n", expected, notifyURL)
|
||||
}
|
||||
slackHookURL = ""
|
||||
notifyURL = "\"teams://79b4XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX@acd8XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX/204cXXXXXXXXXXXXXXXXXXXXXXXXXXXX/a1f8XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX?host=XXXX.webhook.office.com\""
|
||||
expected = "teams://79b4XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX@acd8XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX/204cXXXXXXXXXXXXXXXXXXXXXXXXXXXX/a1f8XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX?host=XXXX.webhook.office.com"
|
||||
flagCheck(cmd, args)
|
||||
if notifyURL != expected {
|
||||
t.Errorf("notifyURL Parsing is wrong: expecting %s but got %s\n", expected, notifyURL)
|
||||
}
|
||||
notifyURL = "'teams://79b4XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX@acd8XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX/204cXXXXXXXXXXXXXXXXXXXXXXXXXXXX/a1f8XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX?host=XXXX.webhook.office.com'"
|
||||
expected = "teams://79b4XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX@acd8XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX/204cXXXXXXXXXXXXXXXXXXXXXXXXXXXX/a1f8XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX?host=XXXX.webhook.office.com"
|
||||
flagCheck(cmd, args)
|
||||
if notifyURL != expected {
|
||||
t.Errorf("notifyURL Parsing is wrong: expecting %s but got %s\n", expected, notifyURL)
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
if got := validateNotificationURL(tt.notifyURL, tt.slackHookURL); !reflect.DeepEqual(got, tt.expected) {
|
||||
t.Errorf("validateNotificationURL() = %v, expected %v", got, tt.expected)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
@@ -106,205 +74,3 @@ func Test_stripQuotes(t *testing.T) {
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func Test_rebootBlocked(t *testing.T) {
|
||||
noCheckers := []RebootBlocker{}
|
||||
nonblockingChecker := BlockingChecker{blocking: false}
|
||||
blockingChecker := BlockingChecker{blocking: true}
|
||||
|
||||
// Instantiate a prometheusClient with a broken_url
|
||||
promClient, err := alerts.NewPromClient(papi.Config{Address: "broken_url"})
|
||||
if err != nil {
|
||||
log.Fatal("Can't create prometheusClient: ", err)
|
||||
}
|
||||
brokenPrometheusClient := PrometheusBlockingChecker{promClient: promClient, filter: nil, firingOnly: false}
|
||||
|
||||
type args struct {
|
||||
blockers []RebootBlocker
|
||||
}
|
||||
tests := []struct {
|
||||
name string
|
||||
args args
|
||||
want bool
|
||||
}{
|
||||
{
|
||||
name: "Do not block on no blocker defined",
|
||||
args: args{blockers: noCheckers},
|
||||
want: false,
|
||||
},
|
||||
{
|
||||
name: "Ensure a blocker blocks",
|
||||
args: args{blockers: []RebootBlocker{blockingChecker}},
|
||||
want: true,
|
||||
},
|
||||
{
|
||||
name: "Ensure a non-blocker doesn't block",
|
||||
args: args{blockers: []RebootBlocker{nonblockingChecker}},
|
||||
want: false,
|
||||
},
|
||||
{
|
||||
name: "Ensure one blocker is enough to block",
|
||||
args: args{blockers: []RebootBlocker{nonblockingChecker, blockingChecker}},
|
||||
want: true,
|
||||
},
|
||||
{
|
||||
name: "Do block on error contacting prometheus API",
|
||||
args: args{blockers: []RebootBlocker{brokenPrometheusClient}},
|
||||
want: true,
|
||||
},
|
||||
}
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
if got := rebootBlocked(tt.args.blockers...); got != tt.want {
|
||||
t.Errorf("rebootBlocked() = %v, want %v", got, tt.want)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func Test_buildHostCommand(t *testing.T) {
|
||||
type args struct {
|
||||
pid int
|
||||
command []string
|
||||
}
|
||||
tests := []struct {
|
||||
name string
|
||||
args args
|
||||
want []string
|
||||
}{
|
||||
{
|
||||
name: "Ensure command will run with nsenter",
|
||||
args: args{pid: 1, command: []string{"ls", "-Fal"}},
|
||||
want: []string{"/usr/bin/nsenter", "-m/proc/1/ns/mnt", "--", "ls", "-Fal"},
|
||||
},
|
||||
}
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
if got := buildHostCommand(tt.args.pid, tt.args.command); !reflect.DeepEqual(got, tt.want) {
|
||||
t.Errorf("buildHostCommand() = %v, want %v", got, tt.want)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func Test_buildSentinelCommand(t *testing.T) {
|
||||
type args struct {
|
||||
rebootSentinelFile string
|
||||
rebootSentinelCommand string
|
||||
}
|
||||
tests := []struct {
|
||||
name string
|
||||
args args
|
||||
want []string
|
||||
}{
|
||||
{
|
||||
name: "Ensure a sentinelFile generates a shell 'test' command with the right file",
|
||||
args: args{
|
||||
rebootSentinelFile: "/test1",
|
||||
rebootSentinelCommand: "",
|
||||
},
|
||||
want: []string{"test", "-f", "/test1"},
|
||||
},
|
||||
{
|
||||
name: "Ensure a sentinelCommand has priority over a sentinelFile if both are provided (because sentinelFile is always provided)",
|
||||
args: args{
|
||||
rebootSentinelFile: "/test1",
|
||||
rebootSentinelCommand: "/sbin/reboot-required -r",
|
||||
},
|
||||
want: []string{"/sbin/reboot-required", "-r"},
|
||||
},
|
||||
}
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
if got := buildSentinelCommand(tt.args.rebootSentinelFile, tt.args.rebootSentinelCommand); !reflect.DeepEqual(got, tt.want) {
|
||||
t.Errorf("buildSentinelCommand() = %v, want %v", got, tt.want)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func Test_parseRebootCommand(t *testing.T) {
|
||||
type args struct {
|
||||
rebootCommand string
|
||||
}
|
||||
tests := []struct {
|
||||
name string
|
||||
args args
|
||||
want []string
|
||||
}{
|
||||
{
|
||||
name: "Ensure a reboot command is properly parsed",
|
||||
args: args{
|
||||
rebootCommand: "/sbin/systemctl reboot",
|
||||
},
|
||||
want: []string{"/sbin/systemctl", "reboot"},
|
||||
},
|
||||
}
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
if got := parseRebootCommand(tt.args.rebootCommand); !reflect.DeepEqual(got, tt.want) {
|
||||
t.Errorf("parseRebootCommand() = %v, want %v", got, tt.want)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func Test_rebootRequired(t *testing.T) {
|
||||
type args struct {
|
||||
sentinelCommand []string
|
||||
}
|
||||
tests := []struct {
|
||||
name string
|
||||
args args
|
||||
want bool
|
||||
}{
|
||||
{
|
||||
name: "Ensure rc = 0 means reboot required",
|
||||
args: args{
|
||||
sentinelCommand: []string{"true"},
|
||||
},
|
||||
want: true,
|
||||
},
|
||||
{
|
||||
name: "Ensure rc != 0 means reboot NOT required",
|
||||
args: args{
|
||||
sentinelCommand: []string{"false"},
|
||||
},
|
||||
want: false,
|
||||
},
|
||||
}
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
if got := rebootRequired(tt.args.sentinelCommand); got != tt.want {
|
||||
t.Errorf("rebootRequired() = %v, want %v", got, tt.want)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func Test_rebootRequired_fatals(t *testing.T) {
|
||||
cases := []struct {
|
||||
param []string
|
||||
expectFatal bool
|
||||
}{
|
||||
{
|
||||
param: []string{"true"},
|
||||
expectFatal: false,
|
||||
},
|
||||
{
|
||||
param: []string{"./babar"},
|
||||
expectFatal: true,
|
||||
},
|
||||
}
|
||||
|
||||
defer func() { log.StandardLogger().ExitFunc = nil }()
|
||||
var fatal bool
|
||||
log.StandardLogger().ExitFunc = func(int) { fatal = true }
|
||||
|
||||
for _, c := range cases {
|
||||
fatal = false
|
||||
rebootRequired(c.param)
|
||||
assert.Equal(t, c.expectFatal, fatal)
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -5,14 +5,14 @@ import (
|
||||
)
|
||||
|
||||
type regexpValue struct {
|
||||
value **regexp.Regexp
|
||||
*regexp.Regexp
|
||||
}
|
||||
|
||||
func (rev *regexpValue) String() string {
|
||||
if *rev.value == nil {
|
||||
if rev.Regexp == nil {
|
||||
return ""
|
||||
}
|
||||
return (*rev.value).String()
|
||||
return rev.Regexp.String()
|
||||
}
|
||||
|
||||
func (rev *regexpValue) Set(s string) error {
|
||||
@@ -20,12 +20,11 @@ func (rev *regexpValue) Set(s string) error {
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
*rev.value = value
|
||||
|
||||
rev.Regexp = value
|
||||
return nil
|
||||
}
|
||||
|
||||
// Type method returns the type of the flag as a string
|
||||
func (rev *regexpValue) Type() string {
|
||||
return "regexp.Regexp"
|
||||
return "regexp"
|
||||
}
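The regexpValue type above satisfies the three methods pflag expects from a custom flag value (String, Set, Type). The following is a rough, self-contained sketch of how such a value can be registered, assuming the spf13/pflag package already used by this module; the flag name is borrowed from the --alert-filter-regexp option in the manifests below, and the wiring itself is illustrative, not part of this diff.

package main

import (
	"fmt"
	"regexp"

	flag "github.com/spf13/pflag"
)

// regexpValue mirrors the type from the diff above so the sketch compiles on its own.
type regexpValue struct {
	*regexp.Regexp
}

func (rev *regexpValue) String() string {
	if rev.Regexp == nil {
		return ""
	}
	return rev.Regexp.String()
}

func (rev *regexpValue) Set(s string) error {
	value, err := regexp.Compile(s)
	if err != nil {
		return err
	}
	rev.Regexp = value
	return nil
}

// Type is required by pflag.Value and names the flag type in help output.
func (rev *regexpValue) Type() string {
	return "regexp"
}

func main() {
	rev := &regexpValue{}
	// Register the custom value; pflag calls Set with the raw command-line string.
	flag.Var(rev, "alert-filter-regexp", "alert names to ignore when deciding whether to block reboots")
	flag.Parse()
	fmt.Println("filter:", rev.String())
}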
|
||||
|
||||
go.mod (132 changed lines)
@@ -1,108 +1,90 @@
|
||||
module github.com/kubereboot/kured
|
||||
|
||||
go 1.18
|
||||
|
||||
replace (
|
||||
golang.org/x/net => golang.org/x/net v0.4.0
|
||||
golang.org/x/text => golang.org/x/text v0.3.8
|
||||
)
|
||||
go 1.22.12
|
||||
|
||||
require (
|
||||
github.com/containrrr/shoutrrr v0.7.1
|
||||
github.com/containrrr/shoutrrr v0.8.0
|
||||
github.com/google/shlex v0.0.0-20191202100458-e7afc7fbc510
|
||||
github.com/google/uuid v1.3.0 // indirect
|
||||
github.com/prometheus/client_golang v1.14.0
|
||||
github.com/prometheus/common v0.39.0
|
||||
github.com/sirupsen/logrus v1.9.0
|
||||
github.com/spf13/cobra v1.6.1
|
||||
github.com/spf13/pflag v1.0.5
|
||||
github.com/spf13/viper v1.15.0
|
||||
github.com/stretchr/testify v1.8.1
|
||||
gotest.tools/v3 v3.4.0
|
||||
k8s.io/api v0.25.5
|
||||
k8s.io/apimachinery v0.25.5
|
||||
k8s.io/client-go v0.25.5
|
||||
k8s.io/kubectl v0.25.5
|
||||
github.com/prometheus/client_golang v1.21.0
|
||||
github.com/prometheus/common v0.62.0
|
||||
github.com/sirupsen/logrus v1.9.3
|
||||
github.com/spf13/pflag v1.0.6
|
||||
github.com/stretchr/testify v1.10.0
|
||||
k8s.io/api v0.30.10
|
||||
k8s.io/apimachinery v0.30.10
|
||||
k8s.io/client-go v0.30.10
|
||||
k8s.io/kubectl v0.30.10
|
||||
)
|
||||
|
||||
require (
|
||||
github.com/Azure/go-ansiterm v0.0.0-20210617225240-d185dfc1b5a1 // indirect
|
||||
github.com/MakeNowJust/heredoc v1.0.0 // indirect
|
||||
github.com/PuerkitoBio/purell v1.1.1 // indirect
|
||||
github.com/PuerkitoBio/urlesc v0.0.0-20170810143723-de5bf2ad4578 // indirect
|
||||
github.com/beorn7/perks v1.0.1 // indirect
|
||||
github.com/cespare/xxhash/v2 v2.1.2 // indirect
|
||||
github.com/cespare/xxhash/v2 v2.3.0 // indirect
|
||||
github.com/chai2010/gettext-go v1.0.2 // indirect
|
||||
github.com/davecgh/go-spew v1.1.1 // indirect
|
||||
github.com/emicklei/go-restful/v3 v3.8.0 // indirect
|
||||
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect
|
||||
github.com/emicklei/go-restful/v3 v3.11.0 // indirect
|
||||
github.com/evanphx/json-patch v4.12.0+incompatible // indirect
|
||||
github.com/exponent-io/jsonpath v0.0.0-20151013193312-d6023ce2651d // indirect
|
||||
github.com/fatih/color v1.13.0 // indirect
|
||||
github.com/fsnotify/fsnotify v1.6.0 // indirect
|
||||
github.com/go-errors/errors v1.0.1 // indirect
|
||||
github.com/go-logr/logr v1.2.3 // indirect
|
||||
github.com/go-openapi/jsonpointer v0.19.5 // indirect
|
||||
github.com/go-openapi/jsonreference v0.19.5 // indirect
|
||||
github.com/go-openapi/swag v0.19.14 // indirect
|
||||
github.com/fatih/color v1.15.0 // indirect
|
||||
github.com/go-errors/errors v1.4.2 // indirect
|
||||
github.com/go-logr/logr v1.4.1 // indirect
|
||||
github.com/go-openapi/jsonpointer v0.19.6 // indirect
|
||||
github.com/go-openapi/jsonreference v0.20.2 // indirect
|
||||
github.com/go-openapi/swag v0.22.3 // indirect
|
||||
github.com/gogo/protobuf v1.3.2 // indirect
|
||||
github.com/golang/protobuf v1.5.2 // indirect
|
||||
github.com/golang/protobuf v1.5.4 // indirect
|
||||
github.com/google/btree v1.0.1 // indirect
|
||||
github.com/google/gnostic v0.5.7-v3refs // indirect
|
||||
github.com/google/go-cmp v0.5.9 // indirect
|
||||
github.com/google/gofuzz v1.1.0 // indirect
|
||||
github.com/google/gnostic-models v0.6.8 // indirect
|
||||
github.com/google/gofuzz v1.2.0 // indirect
|
||||
github.com/google/uuid v1.4.0 // indirect
|
||||
github.com/gorilla/websocket v1.5.0 // indirect
|
||||
github.com/gregjones/httpcache v0.0.0-20180305231024-9cad4c3443a7 // indirect
|
||||
github.com/hashicorp/hcl v1.0.0 // indirect
|
||||
github.com/imdario/mergo v0.3.6 // indirect
|
||||
github.com/inconshreveable/mousetrap v1.0.1 // indirect
|
||||
github.com/inconshreveable/mousetrap v1.1.0 // indirect
|
||||
github.com/josharian/intern v1.0.0 // indirect
|
||||
github.com/json-iterator/go v1.1.12 // indirect
|
||||
github.com/klauspost/compress v1.17.11 // indirect
|
||||
github.com/liggitt/tabwriter v0.0.0-20181228230101-89fcab3d43de // indirect
|
||||
github.com/magiconair/properties v1.8.7 // indirect
|
||||
github.com/mailru/easyjson v0.7.6 // indirect
|
||||
github.com/mailru/easyjson v0.7.7 // indirect
|
||||
github.com/mattn/go-colorable v0.1.13 // indirect
|
||||
github.com/mattn/go-isatty v0.0.16 // indirect
|
||||
github.com/matttproud/golang_protobuf_extensions v1.0.4 // indirect
|
||||
github.com/mitchellh/go-wordwrap v1.0.0 // indirect
|
||||
github.com/mitchellh/mapstructure v1.5.0 // indirect
|
||||
github.com/mattn/go-isatty v0.0.17 // indirect
|
||||
github.com/mitchellh/go-wordwrap v1.0.1 // indirect
|
||||
github.com/moby/spdystream v0.2.0 // indirect
|
||||
github.com/moby/term v0.0.0-20210619224110-3f7ff695adc6 // indirect
|
||||
github.com/moby/term v0.0.0-20221205130635-1aeaba878587 // indirect
|
||||
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
|
||||
github.com/modern-go/reflect2 v1.0.2 // indirect
|
||||
github.com/monochromegane/go-gitignore v0.0.0-20200626010858-205db1a8cc00 // indirect
|
||||
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
|
||||
github.com/pelletier/go-toml/v2 v2.0.6 // indirect
|
||||
github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f // indirect
|
||||
github.com/peterbourgon/diskv v2.0.1+incompatible // indirect
|
||||
github.com/pkg/errors v0.9.1 // indirect
|
||||
github.com/pmezard/go-difflib v1.0.0 // indirect
|
||||
github.com/prometheus/client_model v0.3.0 // indirect
|
||||
github.com/prometheus/procfs v0.8.0 // indirect
|
||||
github.com/russross/blackfriday v1.5.2 // indirect
|
||||
github.com/spf13/afero v1.9.3 // indirect
|
||||
github.com/spf13/cast v1.5.0 // indirect
|
||||
github.com/spf13/jwalterweatherman v1.1.0 // indirect
|
||||
github.com/subosito/gotenv v1.4.2 // indirect
|
||||
github.com/xlab/treeprint v1.1.0 // indirect
|
||||
go.starlark.net v0.0.0-20200306205701-8dd3e2ee1dd5 // indirect
|
||||
golang.org/x/net v0.4.0 // indirect
|
||||
golang.org/x/oauth2 v0.3.0 // indirect
|
||||
golang.org/x/sys v0.3.0 // indirect
|
||||
golang.org/x/term v0.3.0 // indirect
|
||||
golang.org/x/text v0.5.0 // indirect
|
||||
golang.org/x/time v0.1.0 // indirect
|
||||
google.golang.org/appengine v1.6.7 // indirect
|
||||
google.golang.org/protobuf v1.28.1 // indirect
|
||||
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
|
||||
github.com/prometheus/client_model v0.6.1 // indirect
|
||||
github.com/prometheus/procfs v0.15.1 // indirect
|
||||
github.com/russross/blackfriday/v2 v2.1.0 // indirect
|
||||
github.com/spf13/cobra v1.8.1 // indirect
|
||||
github.com/xlab/treeprint v1.2.0 // indirect
|
||||
go.starlark.net v0.0.0-20230525235612-a134d8f9ddca // indirect
|
||||
golang.org/x/net v0.33.0 // indirect
|
||||
golang.org/x/oauth2 v0.24.0 // indirect
|
||||
golang.org/x/sync v0.10.0 // indirect
|
||||
golang.org/x/sys v0.28.0 // indirect
|
||||
golang.org/x/term v0.27.0 // indirect
|
||||
golang.org/x/text v0.21.0 // indirect
|
||||
golang.org/x/time v0.5.0 // indirect
|
||||
google.golang.org/protobuf v1.36.1 // indirect
|
||||
gopkg.in/inf.v0 v0.9.1 // indirect
|
||||
gopkg.in/ini.v1 v1.67.0 // indirect
|
||||
gopkg.in/yaml.v2 v2.4.0 // indirect
|
||||
gopkg.in/yaml.v3 v3.0.1 // indirect
|
||||
k8s.io/cli-runtime v0.25.5 // indirect
|
||||
k8s.io/component-base v0.25.5 // indirect
|
||||
k8s.io/klog/v2 v2.70.1 // indirect
|
||||
k8s.io/kube-openapi v0.0.0-20220803162953-67bda5d908f1 // indirect
|
||||
k8s.io/utils v0.0.0-20220728103510-ee6ede2d64ed // indirect
|
||||
sigs.k8s.io/json v0.0.0-20220713155537-f223a00ba0e2 // indirect
|
||||
sigs.k8s.io/kustomize/api v0.12.1 // indirect
|
||||
sigs.k8s.io/kustomize/kyaml v0.13.9 // indirect
|
||||
sigs.k8s.io/structured-merge-diff/v4 v4.2.3 // indirect
|
||||
sigs.k8s.io/yaml v1.2.0 // indirect
|
||||
k8s.io/cli-runtime v0.30.10 // indirect
|
||||
k8s.io/component-base v0.30.10 // indirect
|
||||
k8s.io/klog/v2 v2.120.1 // indirect
|
||||
k8s.io/kube-openapi v0.0.0-20240228011516-70dd3763d340 // indirect
|
||||
k8s.io/utils v0.0.0-20230726121419-3b25d923346b // indirect
|
||||
sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd // indirect
|
||||
sigs.k8s.io/kustomize/api v0.13.5-0.20230601165947-6ce0bf390ce3 // indirect
|
||||
sigs.k8s.io/kustomize/kyaml v0.14.3-0.20230601165947-6ce0bf390ce3 // indirect
|
||||
sigs.k8s.io/structured-merge-diff/v4 v4.4.1 // indirect
|
||||
sigs.k8s.io/yaml v1.3.0 // indirect
|
||||
)
|
||||
|
||||
@@ -10,28 +10,21 @@ test -z "$VERSION" && {
|
||||
}
|
||||
|
||||
test -z "$TMPDIR" && TMPDIR="$(mktemp -d)"
|
||||
TAR_FILE="$TMPDIR/${FILE_BASENAME}_$(uname -s)_$(uname -m).tar.gz"
|
||||
# goreleaser uses arm64 instead of aarch64
|
||||
goreleaser_arch=$(uname -m | sed -e 's/aarch64/arm64/g' -e 's/ppc64le/ppc64/' -e 's/armv7l/armv7/' )
|
||||
TAR_FILE="$TMPDIR/${FILE_BASENAME}_$(uname -s)_${goreleaser_arch}.tar.gz"
|
||||
export TAR_FILE
|
||||
|
||||
(
|
||||
echo "Downloading GoReleaser $VERSION..."
|
||||
curl -sfLo "$TAR_FILE" \
|
||||
"$RELEASES_URL/download/$VERSION/${FILE_BASENAME}_$(uname -s)_$(uname -m).tar.gz"
|
||||
"$RELEASES_URL/download/$VERSION/${FILE_BASENAME}_$(uname -s)_${goreleaser_arch}.tar.gz"
|
||||
cd "$TMPDIR"
|
||||
curl -sfLo "checksums.txt" "$RELEASES_URL/download/$VERSION/checksums.txt"
|
||||
curl -sfLo "checksums.txt.sig" "$RELEASES_URL/download/$VERSION/checksums.txt.sig"
|
||||
echo "Verifying checksums..."
|
||||
sha256sum --ignore-missing --quiet --check checksums.txt
|
||||
if command -v cosign >/dev/null 2>&1; then
|
||||
echo "Verifying signatures..."
|
||||
COSIGN_EXPERIMENTAL=1 cosign verify-blob \
|
||||
--signature checksums.txt.sig \
|
||||
checksums.txt
|
||||
else
|
||||
echo "Could not verify signatures, cosign is not installed."
|
||||
fi
|
||||
)
|
||||
|
||||
tar -xf "$TAR_FILE" -O goreleaser > "$TMPDIR/goreleaser"
|
||||
rm "$TMPDIR/checksums.txt" "$TMPDIR/checksums.txt.sig"
|
||||
rm "$TMPDIR/checksums.txt"
|
||||
rm "$TAR_FILE"
|
||||
internal/validators.go (new file, 35 lines)
@@ -0,0 +1,35 @@
|
||||
package internal
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"github.com/kubereboot/kured/pkg/checkers"
|
||||
"github.com/kubereboot/kured/pkg/reboot"
|
||||
log "github.com/sirupsen/logrus"
|
||||
)
|
||||
|
||||
// NewRebooter validates the rebootMethod, rebootCommand, and rebootSignal input,
|
||||
// then chains to the right constructor.
|
||||
func NewRebooter(rebootMethod string, rebootCommand string, rebootSignal int) (reboot.Rebooter, error) {
|
||||
switch {
|
||||
case rebootMethod == "command":
|
||||
log.Infof("Reboot command: %s", rebootCommand)
|
||||
return reboot.NewCommandRebooter(rebootCommand)
|
||||
case rebootMethod == "signal":
|
||||
log.Infof("Reboot signal: %d", rebootSignal)
|
||||
return reboot.NewSignalRebooter(rebootSignal)
|
||||
default:
|
||||
return nil, fmt.Errorf("invalid reboot-method configured %s, expected signal or command", rebootMethod)
|
||||
}
|
||||
}
|
||||
|
||||
// NewRebootChecker validates the rebootSentinelCommand, rebootSentinelFile input,
|
||||
// then chains to the right constructor.
|
||||
func NewRebootChecker(rebootSentinelCommand string, rebootSentinelFile string) (checkers.Checker, error) {
|
||||
// An override of rebootSentinelCommand means a privileged command
|
||||
if rebootSentinelCommand != "" {
|
||||
log.Infof("Sentinel checker is (privileged) user provided command: %s", rebootSentinelCommand)
|
||||
return checkers.NewCommandChecker(rebootSentinelCommand, 1, true)
|
||||
}
|
||||
log.Infof("Sentinel checker is (unprivileged) testing for the presence of: %s", rebootSentinelFile)
|
||||
return checkers.NewFileRebootChecker(rebootSentinelFile)
|
||||
}
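NewRebooter and NewRebootChecker are thin validation layers in front of the pkg/reboot and pkg/checkers constructors. A hypothetical caller inside the kured module (Go internal packages cannot be imported from outside it) might combine them roughly as below; the flag values, the hourly loop, and the assumption that reboot.Rebooter exposes a Reboot() error method are illustrative, not part of this change.

package main

import (
	"log"
	"time"

	"github.com/kubereboot/kured/internal"
)

func main() {
	// "command" and "signal" are the two methods NewRebooter accepts.
	rebooter, err := internal.NewRebooter("command", "/bin/systemctl reboot", 39)
	if err != nil {
		log.Fatalf("invalid reboot configuration: %v", err)
	}

	// An empty sentinel command means the unprivileged file check is used.
	checker, err := internal.NewRebootChecker("", "/var/run/reboot-required")
	if err != nil {
		log.Fatalf("invalid sentinel configuration: %v", err)
	}

	for range time.Tick(time.Hour) {
		if checker.RebootRequired() {
			// Assumes the Rebooter interface exposes Reboot() error.
			if err := rebooter.Reboot(); err != nil {
				log.Printf("reboot failed: %v", err)
			}
		}
	}
}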
|
||||
kured-ds-signal.yaml (new file, 100 lines)
@@ -0,0 +1,100 @@
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: ServiceAccount
|
||||
metadata:
|
||||
name: kured
|
||||
namespace: kube-system
|
||||
---
|
||||
apiVersion: apps/v1
|
||||
kind: DaemonSet
|
||||
metadata:
|
||||
name: kured # Must match `--ds-name`
|
||||
namespace: kube-system # Must match `--ds-namespace`
|
||||
spec:
|
||||
selector:
|
||||
matchLabels:
|
||||
name: kured
|
||||
updateStrategy:
|
||||
type: RollingUpdate
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
name: kured
|
||||
spec:
|
||||
serviceAccountName: kured
|
||||
tolerations:
|
||||
- key: node-role.kubernetes.io/control-plane
|
||||
effect: NoSchedule
|
||||
- key: node-role.kubernetes.io/master
|
||||
effect: NoSchedule
|
||||
hostPID: true # Facilitate entering the host mount namespace via init
|
||||
restartPolicy: Always
|
||||
volumes:
|
||||
- name: sentinel
|
||||
hostPath:
|
||||
path: /var/run
|
||||
type: Directory
|
||||
containers:
|
||||
- name: kured
|
||||
# If you find yourself here wondering why there is no
|
||||
# :latest tag on Docker Hub, see the FAQ in the README
|
||||
image: ghcr.io/kubereboot/kured:1.17.1
|
||||
imagePullPolicy: IfNotPresent
|
||||
securityContext:
|
||||
privileged: false # nsenter into /proc/1/ns/mnt is not needed with the signal reboot method
|
||||
readOnlyRootFilesystem: true
|
||||
allowPrivilegeEscalation: false
|
||||
capabilities:
|
||||
drop: ["*"]
|
||||
add: ["CAP_KILL"]
|
||||
ports:
|
||||
- containerPort: 8080
|
||||
name: metrics
|
||||
env:
|
||||
# Pass in the name of the node on which this pod is scheduled
|
||||
# for use with drain/uncordon operations and lock acquisition
|
||||
- name: KURED_NODE_ID
|
||||
valueFrom:
|
||||
fieldRef:
|
||||
fieldPath: spec.nodeName
|
||||
volumeMounts:
|
||||
- mountPath: /sentinel
|
||||
name: sentinel
|
||||
readOnly: true
|
||||
command:
|
||||
- /usr/bin/kured
|
||||
- --reboot-sentinel=/sentinel/reboot-required
|
||||
- --reboot-method=signal
|
||||
# - --reboot-signal=39
|
||||
# - --force-reboot=false
|
||||
# - --drain-grace-period=-1
|
||||
# - --skip-wait-for-delete-timeout=0
|
||||
# - --drain-timeout=0
|
||||
# - --period=1h
|
||||
# - --ds-namespace=kube-system
|
||||
# - --ds-name=kured
|
||||
# - --lock-annotation=weave.works/kured-node-lock
|
||||
# - --lock-ttl=0
|
||||
# - --prometheus-url=http://prometheus.monitoring.svc.cluster.local
|
||||
# - --alert-filter-regexp=^RebootRequired$
|
||||
# - --alert-firing-only=false
|
||||
# - --prefer-no-schedule-taint=""
|
||||
# - --reboot-sentinel-command=""
|
||||
# - --slack-hook-url=https://hooks.slack.com/...
|
||||
# - --slack-username=prod
|
||||
# - --slack-channel=alerting
|
||||
# - --notify-url="" # See also shoutrrr url format
|
||||
# - --message-template-drain=Draining node %s
|
||||
# - --message-template-reboot=Rebooting node %s
|
||||
# - --message-template-uncordon=Node %s rebooted & uncordoned successfully!
|
||||
# - --blocking-pod-selector=runtime=long,cost=expensive
|
||||
# - --blocking-pod-selector=name=temperamental
|
||||
# - --blocking-pod-selector=...
|
||||
# - --reboot-days=sun,mon,tue,wed,thu,fri,sat
|
||||
# - --reboot-delay=90s
|
||||
# - --start-time=0:00
|
||||
# - --end-time=23:59:59
|
||||
# - --time-zone=UTC
|
||||
# - --annotate-nodes=false
|
||||
# - --lock-release-delay=30m
|
||||
# - --log-format=text
|
||||
@@ -8,14 +8,14 @@ metadata:
|
||||
apiVersion: apps/v1
|
||||
kind: DaemonSet
|
||||
metadata:
|
||||
name: kured # Must match `--ds-name`
|
||||
name: kured # Must match `--ds-name`
|
||||
namespace: kube-system # Must match `--ds-namespace`
|
||||
spec:
|
||||
selector:
|
||||
matchLabels:
|
||||
name: kured
|
||||
updateStrategy:
|
||||
type: RollingUpdate
|
||||
type: RollingUpdate
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
@@ -29,14 +29,23 @@ spec:
|
||||
effect: NoSchedule
|
||||
hostPID: true # Facilitate entering the host mount namespace via init
|
||||
restartPolicy: Always
|
||||
volumes:
|
||||
- name: sentinel
|
||||
hostPath:
|
||||
path: /var/run
|
||||
type: Directory
|
||||
containers:
|
||||
- name: kured
|
||||
image: ghcr.io/kubereboot/kured:1.12.1
|
||||
# If you find yourself here wondering why there is no
|
||||
# :latest tag on Docker Hub, see the FAQ in the README
|
||||
# If you find yourself here wondering why there is no
|
||||
# :latest tag on Docker Hub, see the FAQ in the README
|
||||
image: ghcr.io/kubereboot/kured:1.17.1
|
||||
imagePullPolicy: IfNotPresent
|
||||
securityContext:
|
||||
privileged: true # Give permission to nsenter /proc/1/ns/mnt
|
||||
readOnlyRootFilesystem: true
|
||||
ports:
|
||||
- containerPort: 8080
|
||||
name: metrics
|
||||
env:
|
||||
# Pass in the name of the node on which this pod is scheduled
|
||||
# for use with drain/uncordon operations and lock acquisition
|
||||
@@ -44,12 +53,19 @@ spec:
|
||||
valueFrom:
|
||||
fieldRef:
|
||||
fieldPath: spec.nodeName
|
||||
volumeMounts:
|
||||
- mountPath: /sentinel
|
||||
name: sentinel
|
||||
readOnly: true
|
||||
command:
|
||||
- /usr/bin/kured
|
||||
- --reboot-sentinel=/sentinel/reboot-required
|
||||
# - --force-reboot=false
|
||||
# - --drain-grace-period=-1
|
||||
# - --skip-wait-for-delete-timeout=0
|
||||
# - --drain-delay=0
|
||||
# - --drain-timeout=0
|
||||
# - --drain-pod-selector=""
|
||||
# - --period=1h
|
||||
# - --ds-namespace=kube-system
|
||||
# - --ds-name=kured
|
||||
@@ -57,10 +73,12 @@ spec:
|
||||
# - --lock-ttl=0
|
||||
# - --prometheus-url=http://prometheus.monitoring.svc.cluster.local
|
||||
# - --alert-filter-regexp=^RebootRequired$
|
||||
# - --alert-filter-match-only=false
|
||||
# - --alert-firing-only=false
|
||||
# - --reboot-sentinel=/var/run/reboot-required
|
||||
# - --prefer-no-schedule-taint=""
|
||||
# - --reboot-sentinel-command=""
|
||||
# - --reboot-method=command
|
||||
# - --reboot-signal=39
|
||||
# - --slack-hook-url=https://hooks.slack.com/...
|
||||
# - --slack-username=prod
|
||||
# - --slack-channel=alerting
|
||||
@@ -79,3 +97,6 @@ spec:
|
||||
# - --annotate-nodes=false
|
||||
# - --lock-release-delay=30m
|
||||
# - --log-format=text
|
||||
# - --metrics-host=""
|
||||
# - --metrics-port=8080
|
||||
# - --concurrency=1
|
||||
|
||||
@@ -1,69 +0,0 @@
|
||||
package alerts
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"regexp"
|
||||
"sort"
|
||||
"time"
|
||||
|
||||
papi "github.com/prometheus/client_golang/api"
|
||||
v1 "github.com/prometheus/client_golang/api/prometheus/v1"
|
||||
"github.com/prometheus/common/model"
|
||||
)
|
||||
|
||||
// PromClient is a wrapper around the Prometheus Client interface and implements the api
|
||||
// This way, the PromClient can be instantiated with the configuration the Client needs, and
|
||||
// the ability to use the methods the api has, like Query and so on.
|
||||
type PromClient struct {
|
||||
papi papi.Client
|
||||
api v1.API
|
||||
}
|
||||
|
||||
// NewPromClient creates a new client to the Prometheus API.
|
||||
// It returns an error on any problem.
|
||||
func NewPromClient(conf papi.Config) (*PromClient, error) {
|
||||
promClient, err := papi.NewClient(conf)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
client := PromClient{papi: promClient, api: v1.NewAPI(promClient)}
|
||||
return &client, nil
|
||||
}
|
||||
|
||||
// ActiveAlerts is a method of type PromClient, it returns a list of names of active alerts
|
||||
// (e.g. pending or firing), filtered by the supplied regexp or by the includeLabels query.
|
||||
// filter by regexp means when the regex finds the alert-name; the alert is excluded from the
|
||||
// block-list and will NOT block rebooting. query by includeLabel means,
|
||||
// if the query finds an alert, it will include it to the block-list and it WILL block rebooting.
|
||||
func (p *PromClient) ActiveAlerts(filter *regexp.Regexp, firingOnly bool) ([]string, error) {
|
||||
|
||||
// get all alerts from prometheus
|
||||
value, _, err := p.api.Query(context.Background(), "ALERTS", time.Now())
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if value.Type() == model.ValVector {
|
||||
if vector, ok := value.(model.Vector); ok {
|
||||
activeAlertSet := make(map[string]bool)
|
||||
for _, sample := range vector {
|
||||
if alertName, isAlert := sample.Metric[model.AlertNameLabel]; isAlert && sample.Value != 0 {
|
||||
if (filter == nil || !filter.MatchString(string(alertName))) && (!firingOnly || sample.Metric["alertstate"] == "firing") {
|
||||
activeAlertSet[string(alertName)] = true
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
var activeAlerts []string
|
||||
for activeAlert := range activeAlertSet {
|
||||
activeAlerts = append(activeAlerts, activeAlert)
|
||||
}
|
||||
sort.Strings(activeAlerts)
|
||||
|
||||
return activeAlerts, nil
|
||||
}
|
||||
}
|
||||
|
||||
return nil, fmt.Errorf("Unexpected value type: %v", value)
|
||||
}
|
||||
@@ -1,141 +0,0 @@
|
||||
package alerts
|
||||
|
||||
import (
|
||||
"log"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
|
||||
"regexp"
|
||||
"testing"
|
||||
|
||||
"github.com/prometheus/client_golang/api"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
)
|
||||
|
||||
type MockResponse struct {
|
||||
StatusCode int
|
||||
Body []byte
|
||||
}
|
||||
|
||||
// MockServerProperties ties a mock response to a url and a method
|
||||
type MockServerProperties struct {
|
||||
URI string
|
||||
HTTPMethod string
|
||||
Response MockResponse
|
||||
}
|
||||
|
||||
// NewMockServer sets up a new MockServer with properties and starts the server.
|
||||
func NewMockServer(props ...MockServerProperties) *httptest.Server {
|
||||
|
||||
handler := http.HandlerFunc(
|
||||
func(w http.ResponseWriter, r *http.Request) {
|
||||
for _, proc := range props {
|
||||
_, err := w.Write(proc.Response.Body)
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
}
|
||||
})
|
||||
return httptest.NewServer(handler)
|
||||
}
|
||||
|
||||
func TestActiveAlerts(t *testing.T) {
|
||||
responsebody := `{"status":"success","data":{"resultType":"vector","result":[{"metric":{"__name__":"ALERTS","alertname":"GatekeeperViolations","alertstate":"firing","severity":"warning","team":"platform-infra"},"value":[1622472933.973,"1"]},{"metric":{"__name__":"ALERTS","alertname":"PodCrashing-dev","alertstate":"firing","container":"deployment","instance":"1.2.3.4:8080","job":"kube-state-metrics","namespace":"dev","pod":"dev-deployment-78dcbmf25v","severity":"critical","team":"dev"},"value":[1622472933.973,"1"]},{"metric":{"__name__":"ALERTS","alertname":"PodRestart-dev","alertstate":"firing","container":"deployment","instance":"1.2.3.4:1234","job":"kube-state-metrics","namespace":"qa","pod":"qa-job-deployment-78dcbmf25v","severity":"warning","team":"qa"},"value":[1622472933.973,"1"]},{"metric":{"__name__":"ALERTS","alertname":"PrometheusTargetDown","alertstate":"firing","job":"kubernetes-pods","severity":"warning","team":"platform-infra"},"value":[1622472933.973,"1"]},{"metric":{"__name__":"ALERTS","alertname":"ScheduledRebootFailing","alertstate":"pending","severity":"warning","team":"platform-infra"},"value":[1622472933.973,"1"]}]}}`
|
||||
addr := "http://localhost:10001"
|
||||
|
||||
for _, tc := range []struct {
|
||||
it string
|
||||
rFilter string
|
||||
respBody string
|
||||
aName string
|
||||
wantN int
|
||||
firingOnly bool
|
||||
}{
|
||||
{
|
||||
it: "should return no active alerts",
|
||||
respBody: responsebody,
|
||||
rFilter: "",
|
||||
wantN: 0,
|
||||
firingOnly: false,
|
||||
},
|
||||
{
|
||||
it: "should return a subset of all alerts",
|
||||
respBody: responsebody,
|
||||
rFilter: "Pod",
|
||||
wantN: 3,
|
||||
firingOnly: false,
|
||||
},
|
||||
{
|
||||
it: "should return all active alerts by regex",
|
||||
respBody: responsebody,
|
||||
rFilter: "*",
|
||||
wantN: 5,
|
||||
firingOnly: false,
|
||||
},
|
||||
{
|
||||
it: "should return all active alerts by regex filter",
|
||||
respBody: responsebody,
|
||||
rFilter: "*",
|
||||
wantN: 5,
|
||||
firingOnly: false,
|
||||
},
|
||||
{
|
||||
it: "should return only firing alerts if firingOnly is true",
|
||||
respBody: responsebody,
|
||||
rFilter: "*",
|
||||
wantN: 4,
|
||||
firingOnly: true,
|
||||
},
|
||||
{
|
||||
it: "should return ScheduledRebootFailing active alerts",
|
||||
respBody: `{"status":"success","data":{"resultType":"vector","result":[{"metric":{"__name__":"ALERTS","alertname":"ScheduledRebootFailing","alertstate":"pending","severity":"warning","team":"platform-infra"},"value":[1622472933.973,"1"]}]}}`,
|
||||
aName: "ScheduledRebootFailing",
|
||||
rFilter: "*",
|
||||
wantN: 1,
|
||||
firingOnly: false,
|
||||
},
|
||||
{
|
||||
it: "should not return an active alert if RebootRequired is firing (regex filter)",
|
||||
respBody: `{"status":"success","data":{"resultType":"vector","result":[{"metric":{"__name__":"ALERTS","alertname":"RebootRequired","alertstate":"pending","severity":"warning","team":"platform-infra"},"value":[1622472933.973,"1"]}]}}`,
|
||||
rFilter: "RebootRequired",
|
||||
wantN: 0,
|
||||
firingOnly: false,
|
||||
},
|
||||
} {
|
||||
// Start mockServer
|
||||
mockServer := NewMockServer(MockServerProperties{
|
||||
URI: addr,
|
||||
HTTPMethod: http.MethodPost,
|
||||
Response: MockResponse{
|
||||
Body: []byte(tc.respBody),
|
||||
},
|
||||
})
|
||||
// Close mockServer after all connections are gone
|
||||
defer mockServer.Close()
|
||||
|
||||
t.Run(tc.it, func(t *testing.T) {
|
||||
|
||||
// regex filter
|
||||
regex, _ := regexp.Compile(tc.rFilter)
|
||||
|
||||
// instantiate the prometheus client with the mockserver-address
|
||||
p, err := NewPromClient(api.Config{Address: mockServer.URL})
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
|
||||
result, err := p.ActiveAlerts(regex, tc.firingOnly)
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
|
||||
// assert
|
||||
assert.Equal(t, tc.wantN, len(result), "expected amount of alerts %v, got %v", tc.wantN, len(result))
|
||||
|
||||
if tc.aName != "" {
|
||||
assert.Equal(t, tc.aName, result[0], "expected active alert %v, got %v", tc.aName, result[0])
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
pkg/blockers/blockers.go (new file, 18 lines)
@@ -0,0 +1,18 @@
|
||||
package blockers
|
||||
|
||||
// RebootBlocked checks whether any of the supplied blockers
// would block the reboot.
|
||||
func RebootBlocked(blockers ...RebootBlocker) bool {
|
||||
for _, blocker := range blockers {
|
||||
if blocker.IsBlocked() {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// RebootBlocker interface should be implemented by types
|
||||
// to know if their instantiations should block a reboot
|
||||
type RebootBlocker interface {
|
||||
IsBlocked() bool
|
||||
}
|
||||
pkg/blockers/blockers_test.go (new file, 65 lines)
@@ -0,0 +1,65 @@
|
||||
package blockers
|
||||
|
||||
import (
|
||||
papi "github.com/prometheus/client_golang/api"
|
||||
"testing"
|
||||
)
|
||||
|
||||
type BlockingChecker struct {
|
||||
blocking bool
|
||||
}
|
||||
|
||||
func (fbc BlockingChecker) IsBlocked() bool {
|
||||
return fbc.blocking
|
||||
}
|
||||
|
||||
func Test_rebootBlocked(t *testing.T) {
|
||||
noCheckers := []RebootBlocker{}
|
||||
nonblockingChecker := BlockingChecker{blocking: false}
|
||||
blockingChecker := BlockingChecker{blocking: true}
|
||||
|
||||
// Instantiate a prometheusClient with a broken_url
|
||||
brokenPrometheusClient := NewPrometheusBlockingChecker(papi.Config{Address: "broken_url"}, nil, false, false)
|
||||
|
||||
type args struct {
|
||||
blockers []RebootBlocker
|
||||
}
|
||||
tests := []struct {
|
||||
name string
|
||||
args args
|
||||
want bool
|
||||
}{
|
||||
{
|
||||
name: "Do not block on no blocker defined",
|
||||
args: args{blockers: noCheckers},
|
||||
want: false,
|
||||
},
|
||||
{
|
||||
name: "Ensure a blocker blocks",
|
||||
args: args{blockers: []RebootBlocker{blockingChecker}},
|
||||
want: true,
|
||||
},
|
||||
{
|
||||
name: "Ensure a non-blocker doesn't block",
|
||||
args: args{blockers: []RebootBlocker{nonblockingChecker}},
|
||||
want: false,
|
||||
},
|
||||
{
|
||||
name: "Ensure one blocker is enough to block",
|
||||
args: args{blockers: []RebootBlocker{nonblockingChecker, blockingChecker}},
|
||||
want: true,
|
||||
},
|
||||
{
|
||||
name: "Do block on error contacting prometheus API",
|
||||
args: args{blockers: []RebootBlocker{brokenPrometheusClient}},
|
||||
want: true,
|
||||
},
|
||||
}
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
if got := RebootBlocked(tt.args.blockers...); got != tt.want {
|
||||
t.Errorf("rebootBlocked() = %v, want %v", got, tt.want)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
pkg/blockers/kubernetespod.go (new file, 61 lines)
@@ -0,0 +1,61 @@
|
||||
package blockers
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
log "github.com/sirupsen/logrus"
|
||||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
||||
"k8s.io/client-go/kubernetes"
|
||||
)
|
||||
|
||||
// Compile-time checks to ensure the type implements the interface
|
||||
var (
|
||||
_ RebootBlocker = (*KubernetesBlockingChecker)(nil)
|
||||
)
|
||||
|
||||
// KubernetesBlockingChecker contains info for connecting
|
||||
// to k8s, and can give info about whether a reboot should be blocked
|
||||
type KubernetesBlockingChecker struct {
|
||||
// client used to contact kubernetes API
|
||||
client *kubernetes.Clientset
|
||||
nodeName string
|
||||
// list used to filter pods (podSelector)
|
||||
filter []string
|
||||
}
|
||||
|
||||
func NewKubernetesBlockingChecker(client *kubernetes.Clientset, nodename string, podSelectors []string) *KubernetesBlockingChecker {
|
||||
return &KubernetesBlockingChecker{
|
||||
client: client,
|
||||
nodeName: nodename,
|
||||
filter: podSelectors,
|
||||
}
|
||||
}
|
||||
|
||||
// IsBlocked for the KubernetesBlockingChecker checks whether any pod on the node is preventing
// the reboot. It warns in the logs about blocking, but does not return an error.
|
||||
func (kb KubernetesBlockingChecker) IsBlocked() bool {
|
||||
fieldSelector := fmt.Sprintf("spec.nodeName=%s,status.phase!=Succeeded,status.phase!=Failed,status.phase!=Unknown", kb.nodeName)
|
||||
for _, labelSelector := range kb.filter {
|
||||
podList, err := kb.client.CoreV1().Pods("").List(context.TODO(), metav1.ListOptions{
|
||||
LabelSelector: labelSelector,
|
||||
FieldSelector: fieldSelector,
|
||||
Limit: 10})
|
||||
if err != nil {
|
||||
log.Warnf("Reboot blocked: pod query error: %v", err)
|
||||
return true
|
||||
}
|
||||
|
||||
if len(podList.Items) > 0 {
|
||||
podNames := make([]string, 0, len(podList.Items))
|
||||
for _, pod := range podList.Items {
|
||||
podNames = append(podNames, pod.Name)
|
||||
}
|
||||
if len(podList.Continue) > 0 {
|
||||
podNames = append(podNames, "...")
|
||||
}
|
||||
log.Warnf("Reboot blocked: matching pods: %v", podNames)
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
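A rough sketch of how this blocker might be constructed from an in-cluster client; the node name is a placeholder and the label selector mirrors the commented --blocking-pod-selector examples in the manifests above. Not part of this change.

package main

import (
	"fmt"

	"github.com/kubereboot/kured/pkg/blockers"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/rest"
)

func main() {
	// In-cluster configuration, as a kured pod would use.
	config, err := rest.InClusterConfig()
	if err != nil {
		panic(err)
	}
	client := kubernetes.NewForConfigOrDie(config)

	// "node-1" is a placeholder; kured reads its node from KURED_NODE_ID.
	blocker := blockers.NewKubernetesBlockingChecker(client, "node-1", []string{"runtime=long,cost=expensive"})
	fmt.Println("reboot blocked:", blocker.IsBlocked())
}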
|
||||
pkg/blockers/prometheus.go (new file, 118 lines)
@@ -0,0 +1,118 @@
|
||||
package blockers
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
papi "github.com/prometheus/client_golang/api"
|
||||
v1 "github.com/prometheus/client_golang/api/prometheus/v1"
|
||||
"github.com/prometheus/common/model"
|
||||
log "github.com/sirupsen/logrus"
|
||||
"regexp"
|
||||
"sort"
|
||||
"time"
|
||||
)
|
||||
|
||||
// Compile-time checks to ensure the type implements the interface
|
||||
var (
|
||||
_ RebootBlocker = (*PrometheusBlockingChecker)(nil)
|
||||
)
|
||||
|
||||
// PrometheusBlockingChecker contains info for connecting
|
||||
// to prometheus, and can give info about whether a reboot should be blocked
|
||||
type PrometheusBlockingChecker struct {
|
||||
promConfig papi.Config
|
||||
// regexp used to get alerts
|
||||
filter *regexp.Regexp
|
||||
// bool to indicate if only firing alerts should be considered
|
||||
firingOnly bool
|
||||
// bool to indicate that we're only blocking on alerts which match the filter
|
||||
filterMatchOnly bool
|
||||
// storing the promClient
|
||||
promClient papi.Client
|
||||
}
|
||||
|
||||
func NewPrometheusBlockingChecker(config papi.Config, alertFilter *regexp.Regexp, firingOnly bool, filterMatchOnly bool) PrometheusBlockingChecker {
|
||||
promClient, _ := papi.NewClient(config)
|
||||
|
||||
return PrometheusBlockingChecker{
|
||||
promConfig: config,
|
||||
filter: alertFilter,
|
||||
firingOnly: firingOnly,
|
||||
filterMatchOnly: filterMatchOnly,
|
||||
promClient: promClient,
|
||||
}
|
||||
}
|
||||
|
||||
// IsBlocked for the prometheus checker queries for active alerts matching
// the arguments the PrometheusBlockingChecker was built with, which would
// block the reboot.
// As of today, no blocker information is returned by the method;
// the information is simply logged.
|
||||
func (pb PrometheusBlockingChecker) IsBlocked() bool {
|
||||
alertNames, err := pb.ActiveAlerts()
|
||||
if err != nil {
|
||||
log.Warnf("Reboot blocked: prometheus query error: %v", err)
|
||||
return true
|
||||
}
|
||||
count := len(alertNames)
|
||||
if count > 10 {
|
||||
alertNames = append(alertNames[:10], "...")
|
||||
}
|
||||
if count > 0 {
|
||||
log.Warnf("Reboot blocked: %d active alerts: %v", count, alertNames)
|
||||
return true
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// MetricLabel returns a friendlier name than the type name
// for the rebootBlockedCounter label.
|
||||
func (pb PrometheusBlockingChecker) MetricLabel() string {
|
||||
return "prometheus"
|
||||
}
|
||||
|
||||
// ActiveAlerts is a method of PrometheusBlockingChecker; it returns a list of names of active alerts
|
||||
// (e.g. pending or firing), filtered by the supplied regexp or by the includeLabels query.
|
||||
// filter by regexp means when the regexp finds the alert-name; the alert is excluded from the
|
||||
// block-list and will NOT block rebooting. query by includeLabel means,
|
||||
// if the query finds an alert, it will include it to the block-list, and it WILL block rebooting.
|
||||
func (pb PrometheusBlockingChecker) ActiveAlerts() ([]string, error) {
|
||||
api := v1.NewAPI(pb.promClient)
|
||||
|
||||
// get all alerts from prometheus
|
||||
value, _, err := api.Query(context.Background(), "ALERTS", time.Now())
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if value.Type() == model.ValVector {
|
||||
if vector, ok := value.(model.Vector); ok {
|
||||
activeAlertSet := make(map[string]bool)
|
||||
for _, sample := range vector {
|
||||
if alertName, isAlert := sample.Metric[model.AlertNameLabel]; isAlert && sample.Value != 0 {
|
||||
if matchesRegex(pb.filter, string(alertName), pb.filterMatchOnly) && (!pb.firingOnly || sample.Metric["alertstate"] == "firing") {
|
||||
activeAlertSet[string(alertName)] = true
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
var activeAlerts []string
|
||||
for activeAlert := range activeAlertSet {
|
||||
activeAlerts = append(activeAlerts, activeAlert)
|
||||
}
|
||||
sort.Strings(activeAlerts)
|
||||
|
||||
return activeAlerts, nil
|
||||
}
|
||||
}
|
||||
|
||||
return nil, fmt.Errorf("unexpected value type %v", value)
|
||||
}
|
||||
|
||||
func matchesRegex(filter *regexp.Regexp, alertName string, filterMatchOnly bool) bool {
|
||||
if filter == nil {
|
||||
return true
|
||||
}
|
||||
|
||||
return filter.MatchString(alertName) == filterMatchOnly
|
||||
}
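matchesRegex gives the filter two modes: with filterMatchOnly=false an alert whose name matches the filter is excluded and does NOT block, while with filterMatchOnly=true only matching alerts block. The following is a small illustrative sketch of constructing the checker, using the Prometheus URL and filter from the commented flags in the manifests above; it is not part of this change.

package main

import (
	"fmt"
	"regexp"

	"github.com/kubereboot/kured/pkg/blockers"
	papi "github.com/prometheus/client_golang/api"
)

func main() {
	filter := regexp.MustCompile("^RebootRequired$")
	checker := blockers.NewPrometheusBlockingChecker(
		papi.Config{Address: "http://prometheus.monitoring.svc.cluster.local"},
		filter,
		true,  // firingOnly: ignore alerts that are still pending
		false, // filterMatchOnly=false: alerts matching the filter are excluded from blocking
	)
	fmt.Println("reboot blocked:", checker.IsBlocked())
}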
|
||||
pkg/blockers/prometheus_test.go (new file, 163 lines)
@@ -0,0 +1,163 @@
|
||||
package blockers
|
||||
|
||||
import (
|
||||
"log"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
|
||||
"regexp"
|
||||
"testing"
|
||||
|
||||
"github.com/prometheus/client_golang/api"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
)
|
||||
|
||||
type MockResponse struct {
|
||||
StatusCode int
|
||||
Body []byte
|
||||
}
|
||||
|
||||
// MockServerProperties ties a mock response to a url and a method
|
||||
type MockServerProperties struct {
|
||||
URI string
|
||||
HTTPMethod string
|
||||
Response MockResponse
|
||||
}
|
||||
|
||||
// NewMockServer sets up a new MockServer with properties and starts the server.
|
||||
func NewMockServer(props ...MockServerProperties) *httptest.Server {
|
||||
|
||||
handler := http.HandlerFunc(
|
||||
func(w http.ResponseWriter, r *http.Request) {
|
||||
for _, proc := range props {
|
||||
_, err := w.Write(proc.Response.Body)
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
}
|
||||
})
|
||||
return httptest.NewServer(handler)
|
||||
}
|
||||
|
||||
func TestActiveAlerts(t *testing.T) {
|
||||
responsebody := `{"status":"success","data":{"resultType":"vector","result":[{"metric":{"__name__":"ALERTS","alertname":"GatekeeperViolations","alertstate":"firing","severity":"warning","team":"platform-infra"},"value":[1622472933.973,"1"]},{"metric":{"__name__":"ALERTS","alertname":"PodCrashing-dev","alertstate":"firing","container":"deployment","instance":"1.2.3.4:8080","job":"kube-state-metrics","namespace":"dev","pod":"dev-deployment-78dcbmf25v","severity":"critical","team":"dev"},"value":[1622472933.973,"1"]},{"metric":{"__name__":"ALERTS","alertname":"PodRestart-dev","alertstate":"firing","container":"deployment","instance":"1.2.3.4:1234","job":"kube-state-metrics","namespace":"qa","pod":"qa-job-deployment-78dcbmf25v","severity":"warning","team":"qa"},"value":[1622472933.973,"1"]},{"metric":{"__name__":"ALERTS","alertname":"PrometheusTargetDown","alertstate":"firing","job":"kubernetes-pods","severity":"warning","team":"platform-infra"},"value":[1622472933.973,"1"]},{"metric":{"__name__":"ALERTS","alertname":"ScheduledRebootFailing","alertstate":"pending","severity":"warning","team":"platform-infra"},"value":[1622472933.973,"1"]}]}}`
|
||||
addr := "http://localhost:10001"
|
||||
|
||||
for _, tc := range []struct {
|
||||
it string
|
||||
rFilter string
|
||||
respBody string
|
||||
aName string
|
||||
wantN int
|
||||
firingOnly bool
|
||||
filterMatchOnly bool
|
||||
}{
|
||||
{
|
||||
it: "should return no active alerts",
|
||||
respBody: responsebody,
|
||||
rFilter: "",
|
||||
wantN: 0,
|
||||
firingOnly: false,
|
||||
filterMatchOnly: false,
|
||||
},
|
||||
{
|
||||
it: "should return a subset of all alerts",
|
||||
respBody: responsebody,
|
||||
rFilter: "Pod",
|
||||
wantN: 3,
|
||||
firingOnly: false,
|
||||
filterMatchOnly: false,
|
||||
},
|
||||
{
|
||||
it: "should return only the alerts matching the filter when filterMatchOnly is set",
|
||||
respBody: responsebody,
|
||||
rFilter: "Gatekeeper",
|
||||
wantN: 1,
|
||||
firingOnly: false,
|
||||
filterMatchOnly: true,
|
||||
},
|
||||
{
|
||||
it: "should return all active alerts by regex",
|
||||
respBody: responsebody,
|
||||
rFilter: "*",
|
||||
wantN: 5,
|
||||
firingOnly: false,
|
||||
filterMatchOnly: false,
|
||||
},
|
||||
{
|
||||
it: "should return all active alerts by regex filter",
|
||||
respBody: responsebody,
|
||||
rFilter: "*",
|
||||
wantN: 5,
|
||||
firingOnly: false,
|
||||
filterMatchOnly: false,
|
||||
},
|
||||
{
|
||||
it: "should return only firing alerts if firingOnly is true",
|
||||
respBody: responsebody,
|
||||
rFilter: "*",
|
||||
wantN: 4,
|
||||
firingOnly: true,
|
||||
filterMatchOnly: false,
|
||||
},
|
||||
|
||||
{
|
||||
it: "should return ScheduledRebootFailing active alerts",
|
||||
respBody: `{"status":"success","data":{"resultType":"vector","result":[{"metric":{"__name__":"ALERTS","alertname":"ScheduledRebootFailing","alertstate":"pending","severity":"warning","team":"platform-infra"},"value":[1622472933.973,"1"]}]}}`,
|
||||
aName: "ScheduledRebootFailing",
|
||||
rFilter: "*",
|
||||
wantN: 1,
|
||||
firingOnly: false,
|
||||
filterMatchOnly: false,
|
||||
},
|
||||
{
|
||||
it: "should not return an active alert if RebootRequired is firing (regex filter)",
|
||||
respBody: `{"status":"success","data":{"resultType":"vector","result":[{"metric":{"__name__":"ALERTS","alertname":"RebootRequired","alertstate":"pending","severity":"warning","team":"platform-infra"},"value":[1622472933.973,"1"]}]}}`,
|
||||
rFilter: "RebootRequired",
|
||||
wantN: 0,
|
||||
firingOnly: false,
|
||||
filterMatchOnly: false,
|
||||
},
|
||||
{
|
||||
it: "should return the RebootRequired alert when filterMatchOnly is set (regex filter)",
|
||||
respBody: `{"status":"success","data":{"resultType":"vector","result":[{"metric":{"__name__":"ALERTS","alertname":"RebootRequired","alertstate":"pending","severity":"warning","team":"platform-infra"},"value":[1622472933.973,"1"]}]}}`,
|
||||
rFilter: "RebootRequired",
|
||||
wantN: 1,
|
||||
firingOnly: false,
|
||||
filterMatchOnly: true,
|
||||
},
|
||||
} {
|
||||
// Start mockServer
|
||||
mockServer := NewMockServer(MockServerProperties{
|
||||
URI: addr,
|
||||
HTTPMethod: http.MethodPost,
|
||||
Response: MockResponse{
|
||||
Body: []byte(tc.respBody),
|
||||
},
|
||||
})
|
||||
// Close mockServer after all connections are gone
|
||||
defer mockServer.Close()
|
||||
|
||||
t.Run(tc.it, func(t *testing.T) {
|
||||
|
||||
// regex filter
|
||||
regex, _ := regexp.Compile(tc.rFilter)
|
||||
|
||||
// instantiate the prometheus client with the mockserver-address
|
||||
p := NewPrometheusBlockingChecker(api.Config{Address: mockServer.URL}, regex, tc.firingOnly, tc.filterMatchOnly)
|
||||
|
||||
result, err := p.ActiveAlerts()
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
|
||||
// assert
|
||||
assert.Equal(t, tc.wantN, len(result), "expected amount of alerts %v, got %v", tc.wantN, len(result))
|
||||
|
||||
if tc.aName != "" {
|
||||
assert.Equal(t, tc.aName, result[0], "expected active alert %v, got %v", tc.aName, result[0])
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
pkg/checkers/checker.go (new file, 109 lines)
@@ -0,0 +1,109 @@
|
||||
package checkers
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"github.com/google/shlex"
|
||||
log "github.com/sirupsen/logrus"
|
||||
"os"
|
||||
"os/exec"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// Checker is the standard interface to use to check
|
||||
// if a reboot is required. Its types must implement a
|
||||
// RebootRequired method which returns a single boolean
|
||||
// clarifying whether a reboot is expected or not.
|
||||
type Checker interface {
|
||||
RebootRequired() bool
|
||||
}
|
||||
|
||||
// FileRebootChecker is the default reboot checker.
|
||||
// It is unprivileged, and tests the presence of a file
|
||||
type FileRebootChecker struct {
|
||||
FilePath string
|
||||
}
|
||||
|
||||
// RebootRequired checks the file presence
|
||||
// needs refactoring to also return an error, instead of leaking it inside the code.
|
||||
// This needs refactoring to get rid of NewCommand
|
||||
// This needs refactoring to only contain file location, instead of CheckCommand
|
||||
func (rc FileRebootChecker) RebootRequired() bool {
|
||||
if _, err := os.Stat(rc.FilePath); err == nil {
|
||||
return true
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// NewFileRebootChecker is the constructor for the file based reboot checker
|
||||
// TODO: Add extra input validation on filePath string here
|
||||
func NewFileRebootChecker(filePath string) (*FileRebootChecker, error) {
|
||||
return &FileRebootChecker{
|
||||
FilePath: filePath,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// CommandChecker is using a custom command to check
|
||||
// if a reboot is required. There are two modes of behaviour,
|
||||
// if Privileged is granted, the NamespacePid is used to nsenter
|
||||
// the given PID's namespace.
|
||||
type CommandChecker struct {
|
||||
CheckCommand []string
|
||||
NamespacePid int
|
||||
Privileged bool
|
||||
}
|
||||
|
||||
// RebootRequired for CommandChecker runs a command without returning
// any error it encounters. This should later be refactored to return the errors,
// instead of logging them (and exiting fatally) here.
|
||||
func (rc CommandChecker) RebootRequired() bool {
|
||||
bufStdout := new(bytes.Buffer)
|
||||
bufStderr := new(bytes.Buffer)
|
||||
cmd := exec.Command(rc.CheckCommand[0], rc.CheckCommand[1:]...)
|
||||
cmd.Stdout = bufStdout
|
||||
cmd.Stderr = bufStderr
|
||||
|
||||
if err := cmd.Run(); err != nil {
|
||||
switch err := err.(type) {
|
||||
case *exec.ExitError:
|
||||
// We assume a non-zero exit code means 'reboot not required', but of course
|
||||
// the user could have misconfigured the sentinel command or something else
|
||||
// went wrong during its execution. In that case, not entering a reboot loop
|
||||
// is the right thing to do, and we are logging stdout/stderr of the command
|
||||
// so it should be obvious what is wrong.
|
||||
if cmd.ProcessState.ExitCode() != 1 {
|
||||
log.Warn(fmt.Sprintf("sentinel command ended with unexpected exit code: %v", cmd.ProcessState.ExitCode()), "cmd", strings.Join(cmd.Args, " "), "stdout", bufStdout.String(), "stderr", bufStderr.String())
|
||||
}
|
||||
return false
|
||||
default:
|
||||
// Something was grossly misconfigured, such as the command path being wrong.
|
||||
log.Fatal(fmt.Sprintf("Error invoking sentinel command: %v", err), "cmd", strings.Join(cmd.Args, " "), "stdout", bufStdout.String(), "stderr", bufStderr.String())
|
||||
}
|
||||
}
|
||||
log.Info("checking if reboot is required", "cmd", strings.Join(cmd.Args, " "), "stdout", bufStdout.String(), "stderr", bufStderr.String())
|
||||
return true
|
||||
}
|
||||
|
||||
// NewCommandChecker is the constructor for the commandChecker, and by default
|
||||
// runs new commands in a privileged fashion.
|
||||
// Privileged means wrapping the command with nsenter.
|
||||
// It allows running a command from systemd's namespace, for example (pid 1).
// This relies on hostPID:true and privileged:true to enter the host mount space.
// For info, rancher-based setups need a different pid, which should be user supplied
// until we have a better discovery mechanism.
|
||||
func NewCommandChecker(sentinelCommand string, pid int, privileged bool) (*CommandChecker, error) {
|
||||
var cmd []string
|
||||
if privileged {
|
||||
cmd = append(cmd, "/usr/bin/nsenter", fmt.Sprintf("-m/proc/%d/ns/mnt", pid), "--")
|
||||
}
|
||||
parsedCommand, err := shlex.Split(sentinelCommand)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error parsing provided sentinel command: %v", err)
|
||||
}
|
||||
cmd = append(cmd, parsedCommand...)
|
||||
return &CommandChecker{
|
||||
CheckCommand: cmd,
|
||||
NamespacePid: pid,
|
||||
Privileged: privileged,
|
||||
}, nil
|
||||
}
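A minimal usage sketch of the two checkers, not part of this change. The sentinel path matches the manifest defaults, and the pid 1 / privileged arguments mirror how NewRebootChecker in internal/validators.go calls NewCommandChecker; the exact command is illustrative.

package main

import (
	"fmt"

	"github.com/kubereboot/kured/pkg/checkers"
)

func main() {
	// Unprivileged file check, the default behaviour.
	fileChecker, err := checkers.NewFileRebootChecker("/var/run/reboot-required")
	if err != nil {
		panic(err)
	}
	fmt.Println("file sentinel present:", fileChecker.RebootRequired())

	// Privileged command check: the command is wrapped with nsenter into pid 1's mount namespace.
	cmdChecker, err := checkers.NewCommandChecker("test -f /var/run/reboot-required", 1, true)
	if err != nil {
		panic(err)
	}
	fmt.Println("command sentinel says reboot:", cmdChecker.RebootRequired())
}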
|
||||
pkg/checkers/checker_test.go (new file, 87 lines)
@@ -0,0 +1,87 @@
|
||||
package checkers
|
||||
|
||||
import (
|
||||
log "github.com/sirupsen/logrus"
|
||||
"reflect"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func Test_nsEntering(t *testing.T) {
|
||||
type args struct {
|
||||
pid int
|
||||
command string
|
||||
privileged bool
|
||||
}
|
||||
tests := []struct {
|
||||
name string
|
||||
args args
|
||||
want []string
|
||||
}{
|
||||
{
|
||||
name: "Ensure command will run with nsenter",
|
||||
args: args{pid: 1, command: "ls -Fal", privileged: true},
|
||||
want: []string{"/usr/bin/nsenter", "-m/proc/1/ns/mnt", "--", "ls", "-Fal"},
|
||||
},
|
||||
}
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
cc, _ := NewCommandChecker(tt.args.command, tt.args.pid, tt.args.privileged)
|
||||
if !reflect.DeepEqual(cc.CheckCommand, tt.want) {
|
||||
t.Errorf("command parsed as %v, want %v", cc.CheckCommand, tt.want)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func Test_rebootRequired(t *testing.T) {
|
||||
type args struct {
|
||||
sentinelCommand []string
|
||||
}
|
||||
tests := []struct {
|
||||
name string
|
||||
args args
|
||||
want bool
|
||||
fatals bool
|
||||
}{
|
||||
{
|
||||
name: "Ensure rc = 0 means reboot required",
|
||||
args: args{
|
||||
sentinelCommand: []string{"true"},
|
||||
},
|
||||
want: true,
|
||||
fatals: false,
|
||||
},
|
||||
{
|
||||
name: "Ensure rc != 0 means reboot NOT required",
|
||||
args: args{
|
||||
sentinelCommand: []string{"false"},
|
||||
},
|
||||
want: false,
|
||||
fatals: false,
|
||||
},
|
||||
{
|
||||
name: "Ensure a wrong command fatals",
|
||||
args: args{
|
||||
sentinelCommand: []string{"./babar"},
|
||||
},
|
||||
want: true,
|
||||
fatals: true,
|
||||
},
|
||||
}
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
defer func() { log.StandardLogger().ExitFunc = nil }()
|
||||
fatal := false
|
||||
log.StandardLogger().ExitFunc = func(int) { fatal = true }
|
||||
|
||||
a := CommandChecker{CheckCommand: tt.args.sentinelCommand, NamespacePid: 1, Privileged: false}
|
||||
|
||||
if got := a.RebootRequired(); got != tt.want {
|
||||
t.Errorf("rebootRequired() = %v, want %v", got, tt.want)
|
||||
}
|
||||
if tt.fatals != fatal {
|
||||
t.Errorf("fatal flag is %v, want fatal %v", fatal, tt.fatals)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -4,6 +4,8 @@ import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
log "github.com/sirupsen/logrus"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
v1 "k8s.io/api/apps/v1"
|
||||
@@ -18,6 +20,21 @@ const (
|
||||
k8sAPICallRetryTimeout = 5 * time.Minute // How long to wait until we determine that the k8s API is definitively unavailable
|
||||
)
|
||||
|
||||
type Lock interface {
|
||||
Acquire(NodeMeta) (bool, string, error)
|
||||
Release() error
|
||||
Holding() (bool, LockAnnotationValue, error)
|
||||
}
|
||||
|
||||
type GenericLock struct {
|
||||
TTL time.Duration
|
||||
releaseDelay time.Duration
|
||||
}
|
||||
|
||||
type NodeMeta struct {
|
||||
Unschedulable bool `json:"unschedulable"`
|
||||
}
|
||||
|
||||
// DaemonSetLock holds all necessary information to do actions
|
||||
// on the kured ds which holds lock info through annotations.
|
||||
type DaemonSetLock struct {
|
||||
@@ -28,20 +45,90 @@ type DaemonSetLock struct {
|
||||
annotation string
|
||||
}
|
||||
|
||||
type lockAnnotationValue struct {
|
||||
// DaemonSetSingleLock holds all necessary information to do actions
|
||||
// on the kured ds which holds lock info through annotations.
|
||||
type DaemonSetSingleLock struct {
|
||||
GenericLock
|
||||
DaemonSetLock
|
||||
}
|
||||
|
||||
// DaemonSetMultiLock holds all necessary information to do actions
|
||||
// on the kured ds which holds lock info through annotations, valid
|
||||
// for multiple nodes
|
||||
type DaemonSetMultiLock struct {
|
||||
GenericLock
|
||||
DaemonSetLock
|
||||
maxOwners int
|
||||
}
|
||||
|
||||
// LockAnnotationValue contains the lock data,
|
||||
// which allows persistence across reboots, particularily recording if the
|
||||
// node was already unschedulable before kured reboot.
|
||||
// To be modified when using another type of lock storage.
|
||||
type LockAnnotationValue struct {
|
||||
NodeID string `json:"nodeID"`
|
||||
Metadata interface{} `json:"metadata,omitempty"`
|
||||
Metadata NodeMeta `json:"metadata,omitempty"`
|
||||
Created time.Time `json:"created"`
|
||||
TTL time.Duration `json:"TTL"`
|
||||
}
|
||||
|
||||
type multiLockAnnotationValue struct {
|
||||
MaxOwners int `json:"maxOwners"`
|
||||
LockAnnotations []LockAnnotationValue `json:"locks"`
|
||||
}
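For illustration, the lock is persisted as JSON in a DaemonSet annotation; the shape below is taken from NodeMeta and LockAnnotationValue above, with the structs mirrored locally so the sketch stays self-contained (the node ID is a placeholder).

package main

import (
	"encoding/json"
	"fmt"
	"time"
)

// Local mirrors of NodeMeta and LockAnnotationValue, to show the annotation payload shape.
type nodeMeta struct {
	Unschedulable bool `json:"unschedulable"`
}

type lockAnnotationValue struct {
	NodeID   string        `json:"nodeID"`
	Metadata nodeMeta      `json:"metadata,omitempty"`
	Created  time.Time     `json:"created"`
	TTL      time.Duration `json:"TTL"`
}

func main() {
	v := lockAnnotationValue{NodeID: "node-1", Created: time.Now().UTC(), TTL: time.Hour}
	b, _ := json.Marshal(&v)
	// Prints something like:
	// {"nodeID":"node-1","metadata":{"unschedulable":false},"created":"2024-01-01T00:00:00Z","TTL":3600000000000}
	fmt.Println(string(b))
}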
|
||||
|
||||
// New creates a daemonsetLock object containing the necessary data for follow up k8s requests
|
||||
func New(client *kubernetes.Clientset, nodeID, namespace, name, annotation string) *DaemonSetLock {
|
||||
return &DaemonSetLock{client, nodeID, namespace, name, annotation}
|
||||
func New(client *kubernetes.Clientset, nodeID, namespace, name, annotation string, TTL time.Duration, concurrency int, lockReleaseDelay time.Duration) Lock {
|
||||
if concurrency > 1 {
|
||||
return &DaemonSetMultiLock{
|
||||
GenericLock: GenericLock{
|
||||
TTL: TTL,
|
||||
releaseDelay: lockReleaseDelay,
|
||||
},
|
||||
DaemonSetLock: DaemonSetLock{
|
||||
client: client,
|
||||
nodeID: nodeID,
|
||||
namespace: namespace,
|
||||
name: name,
|
||||
annotation: annotation,
|
||||
},
|
||||
maxOwners: concurrency,
|
||||
}
|
||||
} else {
|
||||
return &DaemonSetSingleLock{
|
||||
GenericLock: GenericLock{
|
||||
TTL: TTL,
|
||||
releaseDelay: lockReleaseDelay,
|
||||
},
|
||||
DaemonSetLock: DaemonSetLock{
|
||||
client: client,
|
||||
nodeID: nodeID,
|
||||
namespace: namespace,
|
||||
name: name,
|
||||
annotation: annotation,
|
||||
},
|
||||
}
|
||||
}
|
||||
}
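New hides the single/multi distinction behind the Lock interface: a concurrency of 1 yields a DaemonSetSingleLock, anything higher a DaemonSetMultiLock. A rough caller sketch follows; the package path (pkg/daemonsetlock), the in-cluster client setup, and the node name are assumptions, while the DaemonSet name, namespace, and annotation match the manifests above. Not part of this change.

package main

import (
	"log"
	"time"

	"github.com/kubereboot/kured/pkg/daemonsetlock"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/rest"
)

func main() {
	config, err := rest.InClusterConfig()
	if err != nil {
		log.Fatal(err)
	}
	client := kubernetes.NewForConfigOrDie(config)

	// concurrency=1 returns a DaemonSetSingleLock; >1 returns a DaemonSetMultiLock.
	lock := daemonsetlock.New(client, "node-1", "kube-system", "kured",
		"weave.works/kured-node-lock", time.Hour, 1, 30*time.Minute)

	acquired, holder, err := lock.Acquire(daemonsetlock.NodeMeta{Unschedulable: false})
	if err != nil {
		log.Fatal(err)
	}
	if !acquired {
		log.Printf("lock currently held by %s", holder)
		return
	}
	defer func() {
		if err := lock.Release(); err != nil {
			log.Printf("release failed: %v", err)
		}
	}()
	// ... drain and reboot would happen between Acquire and Release ...
}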
|
||||
|
||||
// GetDaemonSet returns the named DaemonSet resource from the DaemonSetLock's configured client
|
||||
func (dsl *DaemonSetLock) GetDaemonSet(sleep, timeout time.Duration) (*v1.DaemonSet, error) {
|
||||
var ds *v1.DaemonSet
|
||||
var lastError error
|
||||
err := wait.PollUntilContextTimeout(context.Background(), sleep, timeout, true, func(ctx context.Context) (bool, error) {
|
||||
if ds, lastError = dsl.client.AppsV1().DaemonSets(dsl.namespace).Get(ctx, dsl.name, metav1.GetOptions{}); lastError != nil {
|
||||
return false, nil
|
||||
}
|
||||
return true, nil
|
||||
})
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("timed out trying to get daemonset %s in namespace %s: %v", dsl.name, dsl.namespace, lastError)
|
||||
}
|
||||
return ds, nil
|
||||
}
|
||||
|
||||
// Acquire attempts to annotate the kured daemonset with lock info from instantiated DaemonSetLock using client-go
func (dsl *DaemonSetLock) Acquire(metadata interface{}, TTL time.Duration) (bool, string, error) {
func (dsl *DaemonSetSingleLock) Acquire(nodeMetadata NodeMeta) (bool, string, error) {
	for {
		ds, err := dsl.GetDaemonSet(k8sAPICallRetrySleep, k8sAPICallRetryTimeout)
		if err != nil {
@@ -50,7 +137,7 @@ func (dsl *DaemonSetLock) Acquire(metadata interface{}, TTL time.Duration) (bool

		valueString, exists := ds.ObjectMeta.Annotations[dsl.annotation]
		if exists {
			value := lockAnnotationValue{}
			value := LockAnnotationValue{}
			if err := json.Unmarshal([]byte(valueString), &value); err != nil {
				return false, "", err
			}
@@ -63,7 +150,7 @@ func (dsl *DaemonSetLock) Acquire(metadata interface{}, TTL time.Duration) (bool
		if ds.ObjectMeta.Annotations == nil {
			ds.ObjectMeta.Annotations = make(map[string]string)
		}
		value := lockAnnotationValue{NodeID: dsl.nodeID, Metadata: metadata, Created: time.Now().UTC(), TTL: TTL}
		value := LockAnnotationValue{NodeID: dsl.nodeID, Metadata: nodeMetadata, Created: time.Now().UTC(), TTL: dsl.TTL}
		valueBytes, err := json.Marshal(&value)
		if err != nil {
			return false, "", err
@@ -85,29 +172,34 @@ func (dsl *DaemonSetLock) Acquire(metadata interface{}, TTL time.Duration) (bool
}

// Test attempts to check the kured daemonset lock status (existence, expiry) from instantiated DaemonSetLock using client-go
func (dsl *DaemonSetLock) Test(metadata interface{}) (bool, error) {
func (dsl *DaemonSetSingleLock) Holding() (bool, LockAnnotationValue, error) {
	var lockData LockAnnotationValue
	ds, err := dsl.GetDaemonSet(k8sAPICallRetrySleep, k8sAPICallRetryTimeout)
	if err != nil {
		return false, fmt.Errorf("timed out trying to get daemonset %s in namespace %s: %w", dsl.name, dsl.namespace, err)
		return false, lockData, fmt.Errorf("timed out trying to get daemonset %s in namespace %s: %w", dsl.name, dsl.namespace, err)
	}

	valueString, exists := ds.ObjectMeta.Annotations[dsl.annotation]
	if exists {
		value := lockAnnotationValue{Metadata: metadata}
		value := LockAnnotationValue{}
		if err := json.Unmarshal([]byte(valueString), &value); err != nil {
			return false, err
			return false, lockData, err
		}

		if !ttlExpired(value.Created, value.TTL) {
			return value.NodeID == dsl.nodeID, nil
			return value.NodeID == dsl.nodeID, value, nil
		}
	}

	return false, nil
	return false, lockData, nil
}

// Release attempts to remove the lock data from the kured ds annotations using client-go
func (dsl *DaemonSetLock) Release() error {
func (dsl *DaemonSetSingleLock) Release() error {
	if dsl.releaseDelay > 0 {
		log.Infof("Waiting %v before releasing lock", dsl.releaseDelay)
		time.Sleep(dsl.releaseDelay)
	}
	for {
		ds, err := dsl.GetDaemonSet(k8sAPICallRetrySleep, k8sAPICallRetryTimeout)
		if err != nil {
@@ -116,16 +208,16 @@ func (dsl *DaemonSetLock) Release() error {

		valueString, exists := ds.ObjectMeta.Annotations[dsl.annotation]
		if exists {
			value := lockAnnotationValue{}
			value := LockAnnotationValue{}
			if err := json.Unmarshal([]byte(valueString), &value); err != nil {
				return err
			}

			if value.NodeID != dsl.nodeID {
				return fmt.Errorf("Not lock holder: %v", value.NodeID)
				return fmt.Errorf("not lock holder: %v", value.NodeID)
			}
		} else {
			return fmt.Errorf("Lock not held")
			return fmt.Errorf("lock not held")
		}

		delete(ds.ObjectMeta.Annotations, dsl.annotation)
@@ -144,27 +236,173 @@ func (dsl *DaemonSetLock) Release() error {
	}
}

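Read together, Acquire, Holding and Release form the lifecycle the reboot daemon drives around a node restart. A minimal sketch of that cycle, using local stand-in types so it compiles on its own; the interface shape and control flow are inferred from the methods above and are not the actual kured control loop:

package sketch

import "fmt"

// Stand-ins for daemonsetlock.NodeMeta, LockAnnotationValue and the Lock
// interface returned by daemonsetlock.New; redefined here for illustration only.
type NodeMeta struct{ Unschedulable bool }
type LockAnnotationValue struct{ NodeID string }

type Lock interface {
	Acquire(NodeMeta) (bool, string, error)
	Holding() (bool, LockAnnotationValue, error)
	Release() error
}

// rebootCycle sketches the order in which the lock methods are driven.
func rebootCycle(lock Lock, nodeCordoned bool, reboot func() error) error {
	acquired, holder, err := lock.Acquire(NodeMeta{Unschedulable: nodeCordoned})
	if err != nil {
		return err
	}
	if !acquired {
		return fmt.Errorf("lock held by %s, retry later", holder)
	}

	if err := reboot(); err != nil {
		return err
	}

	// After the node is back, only release if we still hold the lock.
	if holding, _, err := lock.Holding(); err == nil && holding {
		return lock.Release()
	}
	return nil
}
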
// GetDaemonSet returns the named DaemonSet resource from the DaemonSetLock's configured client
func (dsl *DaemonSetLock) GetDaemonSet(sleep, timeout time.Duration) (*v1.DaemonSet, error) {
	var ds *v1.DaemonSet
	var lastError error
	err := wait.PollImmediate(sleep, timeout, func() (bool, error) {
		ctx, cancel := context.WithTimeout(context.Background(), timeout)
		defer cancel()
		if ds, lastError = dsl.client.AppsV1().DaemonSets(dsl.namespace).Get(ctx, dsl.name, metav1.GetOptions{}); lastError != nil {
			return false, nil
		}
		return true, nil
	})
	if err != nil {
		return nil, fmt.Errorf("Timed out trying to get daemonset %s in namespace %s: %v", dsl.name, dsl.namespace, lastError)
	}
	return ds, nil
}

func ttlExpired(created time.Time, ttl time.Duration) bool {
	if ttl > 0 && time.Since(created) >= ttl {
		return true
	}
	return false
}

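Note that a zero (or negative) TTL never expires: the expiry check only fires when ttl > 0, so such a lock lives until it is explicitly released. A short illustration, assuming it sits inside this package so ttlExpired is visible; the durations are arbitrary:

func ttlSemanticsDemo() {
	// A zero TTL never expires: the check only applies when ttl > 0.
	fmt.Println(ttlExpired(time.Now().Add(-2*time.Hour), time.Hour))    // true: held 2h against a 1h TTL
	fmt.Println(ttlExpired(time.Now().Add(-10*time.Minute), time.Hour)) // false: still within the TTL
	fmt.Println(ttlExpired(time.Now().Add(-240*time.Hour), 0))          // false: no TTL, held until released
}
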
func nodeIDsFromMultiLock(annotation multiLockAnnotationValue) []string {
	nodeIDs := make([]string, 0, len(annotation.LockAnnotations))
	for _, nodeLock := range annotation.LockAnnotations {
		nodeIDs = append(nodeIDs, nodeLock.NodeID)
	}
	return nodeIDs
}

func (dsl *DaemonSetLock) canAcquireMultiple(annotation multiLockAnnotationValue, metadata NodeMeta, TTL time.Duration, maxOwners int) (bool, multiLockAnnotationValue) {
	newAnnotation := multiLockAnnotationValue{MaxOwners: maxOwners}
	freeSpace := false
	if annotation.LockAnnotations == nil || len(annotation.LockAnnotations) < maxOwners {
		freeSpace = true
		newAnnotation.LockAnnotations = annotation.LockAnnotations
	} else {
		for _, nodeLock := range annotation.LockAnnotations {
			if ttlExpired(nodeLock.Created, nodeLock.TTL) {
				freeSpace = true
				continue
			}
			newAnnotation.LockAnnotations = append(
				newAnnotation.LockAnnotations,
				nodeLock,
			)
		}
	}

	if freeSpace {
		newAnnotation.LockAnnotations = append(
			newAnnotation.LockAnnotations,
			LockAnnotationValue{
				NodeID: dsl.nodeID,
				Metadata: metadata,
				Created: time.Now().UTC(),
				TTL: TTL,
			},
		)
		return true, newAnnotation
	}

	return false, multiLockAnnotationValue{}
}

// Acquire creates and annotates the daemonset with a multiple owner lock
func (dsl *DaemonSetMultiLock) Acquire(nodeMetaData NodeMeta) (bool, string, error) {
	for {
		ds, err := dsl.GetDaemonSet(k8sAPICallRetrySleep, k8sAPICallRetryTimeout)
		if err != nil {
			return false, "", fmt.Errorf("timed out trying to get daemonset %s in namespace %s: %w", dsl.name, dsl.namespace, err)
		}

		annotation := multiLockAnnotationValue{}
		valueString, exists := ds.ObjectMeta.Annotations[dsl.annotation]
		if exists {
			if err := json.Unmarshal([]byte(valueString), &annotation); err != nil {
				return false, "", fmt.Errorf("error getting multi lock: %w", err)
			}
		}

		lockPossible, newAnnotation := dsl.canAcquireMultiple(annotation, nodeMetaData, dsl.TTL, dsl.maxOwners)
		if !lockPossible {
			return false, strings.Join(nodeIDsFromMultiLock(newAnnotation), ","), nil
		}

		if ds.ObjectMeta.Annotations == nil {
			ds.ObjectMeta.Annotations = make(map[string]string)
		}
		newAnnotationBytes, err := json.Marshal(&newAnnotation)
		if err != nil {
			return false, "", fmt.Errorf("error marshalling new annotation lock: %w", err)
		}
		ds.ObjectMeta.Annotations[dsl.annotation] = string(newAnnotationBytes)

		_, err = dsl.client.AppsV1().DaemonSets(dsl.namespace).Update(context.Background(), ds, metav1.UpdateOptions{})
		if err != nil {
			if se, ok := err.(*errors.StatusError); ok && se.ErrStatus.Reason == metav1.StatusReasonConflict {
				time.Sleep(time.Second)
				continue
			} else {
				return false, "", fmt.Errorf("error updating daemonset with multi lock: %w", err)
			}
		}
		return true, strings.Join(nodeIDsFromMultiLock(newAnnotation), ","), nil
	}
}

// Holding attempts to check the kured daemonset lock status (existence, expiry) for multi locks
func (dsl *DaemonSetMultiLock) Holding() (bool, LockAnnotationValue, error) {
	var lockdata LockAnnotationValue
	ds, err := dsl.GetDaemonSet(k8sAPICallRetrySleep, k8sAPICallRetryTimeout)
	if err != nil {
		return false, lockdata, fmt.Errorf("timed out trying to get daemonset %s in namespace %s: %w", dsl.name, dsl.namespace, err)
	}

	valueString, exists := ds.ObjectMeta.Annotations[dsl.annotation]
	if exists {
		value := multiLockAnnotationValue{}
		if err := json.Unmarshal([]byte(valueString), &value); err != nil {
			return false, lockdata, err
		}

		for _, nodeLock := range value.LockAnnotations {
			if nodeLock.NodeID == dsl.nodeID && !ttlExpired(nodeLock.Created, nodeLock.TTL) {
				return true, nodeLock, nil
			}
		}
	}

	return false, lockdata, nil
}

// Release attempts to remove the lock data for a single node from the multi node annotation
func (dsl *DaemonSetMultiLock) Release() error {
	if dsl.releaseDelay > 0 {
		log.Infof("Waiting %v before releasing lock", dsl.releaseDelay)
		time.Sleep(dsl.releaseDelay)
	}
	for {
		ds, err := dsl.GetDaemonSet(k8sAPICallRetrySleep, k8sAPICallRetryTimeout)
		if err != nil {
			return fmt.Errorf("timed out trying to get daemonset %s in namespace %s: %w", dsl.name, dsl.namespace, err)
		}

		valueString, exists := ds.ObjectMeta.Annotations[dsl.annotation]
		modified := false
		value := multiLockAnnotationValue{}
		if exists {
			if err := json.Unmarshal([]byte(valueString), &value); err != nil {
				return err
			}

			for idx, nodeLock := range value.LockAnnotations {
				if nodeLock.NodeID == dsl.nodeID {
					value.LockAnnotations = append(value.LockAnnotations[:idx], value.LockAnnotations[idx+1:]...)
					modified = true
					break
				}
			}
		}

		if !exists || !modified {
			return fmt.Errorf("Lock not held")
		}

		newAnnotationBytes, err := json.Marshal(value)
		if err != nil {
			return fmt.Errorf("error marshalling new annotation on release: %v", err)
		}
		ds.ObjectMeta.Annotations[dsl.annotation] = string(newAnnotationBytes)

		_, err = dsl.client.AppsV1().DaemonSets(dsl.namespace).Update(context.TODO(), ds, metav1.UpdateOptions{})
		if err != nil {
			if se, ok := err.(*errors.StatusError); ok && se.ErrStatus.Reason == metav1.StatusReasonConflict {
				// Something else updated the resource between us reading and writing - try again soon
				time.Sleep(time.Second)
				continue
			} else {
				return err
			}
		}
		return nil
	}
}

@@ -1,6 +1,8 @@
package daemonsetlock

import (
	"reflect"
	"sort"
	"testing"
	"time"
)
@@ -26,3 +28,181 @@ func TestTtlExpired(t *testing.T) {
		}
	}
}

func multiLockAnnotationsAreEqualByNodes(src, dst multiLockAnnotationValue) bool {
	srcNodes := []string{}
	for _, srcLock := range src.LockAnnotations {
		srcNodes = append(srcNodes, srcLock.NodeID)
	}
	sort.Strings(srcNodes)

	dstNodes := []string{}
	for _, dstLock := range dst.LockAnnotations {
		dstNodes = append(dstNodes, dstLock.NodeID)
	}
	sort.Strings(dstNodes)

	return reflect.DeepEqual(srcNodes, dstNodes)
}

func TestCanAcquireMultiple(t *testing.T) {
	node1Name := "n1"
	node2Name := "n2"
	node3Name := "n3"
	testCases := []struct {
		name string
		daemonSetLock DaemonSetLock
		maxOwners int
		current multiLockAnnotationValue
		desired multiLockAnnotationValue
		lockPossible bool
	}{
		{
			name: "empty_lock",
			daemonSetLock: DaemonSetLock{
				nodeID: node1Name,
			},
			maxOwners: 2,
			current: multiLockAnnotationValue{},
			desired: multiLockAnnotationValue{
				MaxOwners: 2,
				LockAnnotations: []LockAnnotationValue{
					{NodeID: node1Name},
				},
			},
			lockPossible: true,
		},
		{
			name: "partial_lock",
			daemonSetLock: DaemonSetLock{
				nodeID: node1Name,
			},
			maxOwners: 2,
			current: multiLockAnnotationValue{
				MaxOwners: 2,
				LockAnnotations: []LockAnnotationValue{
					{NodeID: node2Name},
				},
			},
			desired: multiLockAnnotationValue{
				MaxOwners: 2,
				LockAnnotations: []LockAnnotationValue{
					{NodeID: node1Name},
					{NodeID: node2Name},
				},
			},
			lockPossible: true,
		},
		{
			name: "full_lock",
			daemonSetLock: DaemonSetLock{
				nodeID: node1Name,
			},
			maxOwners: 2,
			current: multiLockAnnotationValue{
				MaxOwners: 2,
				LockAnnotations: []LockAnnotationValue{
					{
						NodeID: node2Name,
						Created: time.Now().UTC().Add(-1 * time.Minute),
						TTL: time.Hour,
					},
					{
						NodeID: node3Name,
						Created: time.Now().UTC().Add(-1 * time.Minute),
						TTL: time.Hour,
					},
				},
			},
			desired: multiLockAnnotationValue{
				MaxOwners: 2,
				LockAnnotations: []LockAnnotationValue{
					{NodeID: node2Name},
					{NodeID: node3Name},
				},
			},
			lockPossible: false,
		},
		{
			name: "full_with_one_expired_lock",
			daemonSetLock: DaemonSetLock{
				nodeID: node1Name,
			},
			maxOwners: 2,
			current: multiLockAnnotationValue{
				MaxOwners: 2,
				LockAnnotations: []LockAnnotationValue{
					{
						NodeID: node2Name,
						Created: time.Now().UTC().Add(-1 * time.Hour),
						TTL: time.Minute,
					},
					{
						NodeID: node3Name,
						Created: time.Now().UTC().Add(-1 * time.Minute),
						TTL: time.Hour,
					},
				},
			},
			desired: multiLockAnnotationValue{
				MaxOwners: 2,
				LockAnnotations: []LockAnnotationValue{
					{NodeID: node1Name},
					{NodeID: node3Name},
				},
			},
			lockPossible: true,
		},
		{
			name: "full_with_all_expired_locks",
			daemonSetLock: DaemonSetLock{
				nodeID: node1Name,
			},
			maxOwners: 2,
			current: multiLockAnnotationValue{
				MaxOwners: 2,
				LockAnnotations: []LockAnnotationValue{
					{
						NodeID: node2Name,
						Created: time.Now().UTC().Add(-1 * time.Hour),
						TTL: time.Minute,
					},
					{
						NodeID: node3Name,
						Created: time.Now().UTC().Add(-1 * time.Hour),
						TTL: time.Minute,
					},
				},
			},
			desired: multiLockAnnotationValue{
				MaxOwners: 2,
				LockAnnotations: []LockAnnotationValue{
					{NodeID: node1Name},
				},
			},
			lockPossible: true,
		},
	}
	nm := NodeMeta{Unschedulable: false}
	for _, testCase := range testCases {
		t.Run(testCase.name, func(t *testing.T) {
			lockPossible, actual := testCase.daemonSetLock.canAcquireMultiple(testCase.current, nm, time.Minute, testCase.maxOwners)
			if lockPossible != testCase.lockPossible {
				t.Fatalf(
					"unexpected result for lock possible (got %t expected %t new annotation %v",
					lockPossible,
					testCase.lockPossible,
					actual,
				)
			}

			if lockPossible && (!multiLockAnnotationsAreEqualByNodes(actual, testCase.desired) || testCase.desired.MaxOwners != actual.MaxOwners) {
				t.Fatalf(
					"expected lock %v but got %v",
					testCase.desired,
					actual,
				)
			}
		})
	}
}

48  pkg/reboot/command.go  Normal file
@@ -0,0 +1,48 @@
package reboot

import (
	"bytes"
	"fmt"
	"github.com/google/shlex"
	log "github.com/sirupsen/logrus"
	"os/exec"
	"strings"
)

// CommandRebooter holds context-information for a reboot with command
type CommandRebooter struct {
	RebootCommand []string
}

// Reboot triggers the reboot command
func (c CommandRebooter) Reboot() error {
	log.Infof("Invoking command: %s", c.RebootCommand)

	bufStdout := new(bytes.Buffer)
	bufStderr := new(bytes.Buffer)
	cmd := exec.Command(c.RebootCommand[0], c.RebootCommand[1:]...)
	cmd.Stdout = bufStdout
	cmd.Stderr = bufStderr

	if err := cmd.Run(); err != nil {
		return fmt.Errorf("error invoking reboot command %s: %v (stdout: %v, stderr: %v)", c.RebootCommand, err, bufStdout.String(), bufStderr.String())
	}
	log.Info("Invoked reboot command", "cmd", strings.Join(cmd.Args, " "), "stdout", bufStdout.String(), "stderr", bufStderr.String())
	return nil
}

// NewCommandRebooter is the constructor to create a CommandRebooter from a string not
// yet shell lexed. You can skip this constructor if you parse the data correctly first
// when instantiating a CommandRebooter instance.
func NewCommandRebooter(rebootCommand string) (*CommandRebooter, error) {
	if rebootCommand == "" {
		return nil, fmt.Errorf("no reboot command specified")
	}
	cmd := []string{"/usr/bin/nsenter", fmt.Sprintf("-m/proc/%d/ns/mnt", 1), "--"}
	parsedCommand, err := shlex.Split(rebootCommand)
	if err != nil {
		return nil, fmt.Errorf("error %v when parsing reboot command %s", err, rebootCommand)
	}
	cmd = append(cmd, parsedCommand...)
	return &CommandRebooter{RebootCommand: cmd}, nil
}
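A short usage sketch: the constructor wraps whatever string is passed in an nsenter invocation targeting PID 1's mount namespace, so the command executes against the host rather than the container. The import path and the reboot string below are assumptions for illustration:

package main

import (
	log "github.com/sirupsen/logrus"

	"github.com/kubereboot/kured/pkg/reboot"
)

func main() {
	// Illustrative reboot command; the actual value depends on the deployment.
	rebooter, err := reboot.NewCommandRebooter("/bin/systemctl reboot")
	if err != nil {
		log.Fatalf("unable to build rebooter: %v", err)
	}
	// Internally becomes: /usr/bin/nsenter -m/proc/1/ns/mnt -- /bin/systemctl reboot
	if err := rebooter.Reboot(); err != nil {
		log.Errorf("reboot failed: %v", err)
	}
}
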
43  pkg/reboot/command_test.go  Normal file
@@ -0,0 +1,43 @@
package reboot

import (
	"reflect"
	"testing"
)

func TestNewCommandRebooter(t *testing.T) {
	type args struct {
		rebootCommand string
	}
	tests := []struct {
		name string
		args args
		want *CommandRebooter
		wantErr bool
	}{
		{
			name: "Ensure command is nsenter wrapped",
			args: args{"ls -Fal"},
			want: &CommandRebooter{RebootCommand: []string{"/usr/bin/nsenter", "-m/proc/1/ns/mnt", "--", "ls", "-Fal"}},
			wantErr: false,
		},
		{
			name: "Ensure empty command is erroring",
			args: args{""},
			want: nil,
			wantErr: true,
		},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			got, err := NewCommandRebooter(tt.args.rebootCommand)
			if (err != nil) != tt.wantErr {
				t.Errorf("NewCommandRebooter() error = %v, wantErr %v", err, tt.wantErr)
				return
			}
			if !reflect.DeepEqual(got, tt.want) {
				t.Errorf("NewCommandRebooter() got = %v, want %v", got, tt.want)
			}
		})
	}
}
9  pkg/reboot/reboot.go  Normal file
@@ -0,0 +1,9 @@
package reboot

// Rebooter is the standard interface to use to execute
// the reboot, after it has been considered as necessary.
// The Reboot method does not expect any return, yet should
// most likely be refactored in the future to return an error
type Rebooter interface {
	Reboot() error
}
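The interface is deliberately small, so an alternative reboot strategy only has to provide a single method. A hypothetical third implementation, not part of kured, that merely logs instead of rebooting (useful for dry runs) might look like this:

package reboot

import log "github.com/sirupsen/logrus"

// NoopRebooter is a hypothetical Rebooter that only logs; it is shown here
// purely to illustrate the interface and does not exist in kured.
type NoopRebooter struct{}

// Reboot satisfies the Rebooter interface without touching the host.
func (n NoopRebooter) Reboot() error {
	log.Info("dry-run: reboot requested but skipped")
	return nil
}
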
37  pkg/reboot/signal.go  Normal file
@@ -0,0 +1,37 @@
package reboot

import (
	"fmt"
	"os"
	"syscall"
)

// SignalRebooter holds context-information for a signal reboot.
type SignalRebooter struct {
	Signal int
}

// Reboot triggers the reboot signal
func (c SignalRebooter) Reboot() error {
	process, err := os.FindProcess(1)
	if err != nil {
		return fmt.Errorf("not running on Unix: %v", err)
	}

	err = process.Signal(syscall.Signal(c.Signal))
	// Either PID does not exist, or the signal does not work. Hoping for
	// a decent enough error.
	if err != nil {
		return fmt.Errorf("signal of SIGRTMIN+5 failed: %v", err)
	}
	return nil
}

// NewSignalRebooter is the constructor which sets the signal number.
// The constructor does not yet validate any input. It should be done in a later commit.
func NewSignalRebooter(sig int) (*SignalRebooter, error) {
	if sig < 1 {
		return nil, fmt.Errorf("invalid signal: %v", sig)
	}
	return &SignalRebooter{Signal: sig}, nil
}
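The error message above mentions SIGRTMIN+5 because that is the signal conventionally sent to PID 1 to request a reboot; on Linux with glibc SIGRTMIN is normally 34, so the numeric value would typically be 39. A small usage sketch; the import path and the signal arithmetic are assumptions based on that convention, not values taken from this diff:

package main

import (
	log "github.com/sirupsen/logrus"

	"github.com/kubereboot/kured/pkg/reboot"
)

func main() {
	// 34 + 5 assumes the usual glibc SIGRTMIN value of 34 (so SIGRTMIN+5 == 39);
	// adjust if the host's libc differs.
	sigRebooter, err := reboot.NewSignalRebooter(34 + 5)
	if err != nil {
		log.Fatalf("invalid signal: %v", err)
	}
	if err := sigRebooter.Reboot(); err != nil {
		log.Errorf("signalling PID 1 failed: %v", err)
	}
}
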
@@ -81,11 +81,11 @@ func parseWeekday(day string) (time.Weekday, error) {
		if n >= 0 && n < 7 {
			return time.Weekday(n), nil
		}
		return time.Sunday, fmt.Errorf("Invalid weekday, number out of range: %s", day)
		return time.Sunday, fmt.Errorf("invalid weekday, number out of range: %s", day)
	}

	if weekday, ok := dayStrings[strings.ToLower(day)]; ok {
		return weekday, nil
	}
	return time.Sunday, fmt.Errorf("Invalid weekday: %s", day)
	return time.Sunday, fmt.Errorf("invalid weekday: %s", day)
}

@@ -77,5 +77,5 @@ func parseTime(s string, loc *time.Location) (time.Time, error) {
		}
	}

	return time.Now(), fmt.Errorf("Invalid time format: %s", s)
	return time.Now(), fmt.Errorf("invalid time format: %s", s)
}

@@ -1,12 +0,0 @@
#!/usr/bin/env bash

# USE KUBECTL_CMD to pass context and/or namespaces.
KUBECTL_CMD="${KUBECTL_CMD:-kubectl}"
SENTINEL_FILE="${SENTINEL_FILE:-/var/run/reboot-required}"

echo "Creating reboot sentinel on all nodes"

for nodename in $("$KUBECTL_CMD" get nodes -o name); do
  docker exec "${nodename/node\//}" hostname
  docker exec "${nodename/node\//}" touch "${SENTINEL_FILE}"
done
429  tests/kind/main_test.go  Normal file
@@ -0,0 +1,429 @@
package kind

import (
	"bytes"
	"fmt"
	"math/rand"
	"os/exec"
	"strconv"
	"testing"
	"time"
)

const (
	kuredDevImage string = "kured:dev"
)

// KindTest cluster deployed by each TestMain function, prepared to run a given test scenario.
type KindTest struct {
	kindConfigPath string
	clusterName string
	timeout time.Duration
	deployManifests []string
	localImages []string
	logsDir string
	logBuffer bytes.Buffer
	testInstance *testing.T // Maybe move this to testing.TB
}

func (k *KindTest) Write(p []byte) (n int, err error) {
	k.testInstance.Helper()
	k.logBuffer.Write(p)
	return len(p), nil
}

func (k *KindTest) FlushLog() {
	k.testInstance.Helper()
	k.testInstance.Log(k.logBuffer.String())
	k.logBuffer.Reset()
}

func (k *KindTest) RunCmd(cmdDetails ...string) error {
	cmd := exec.Command(cmdDetails[0], cmdDetails[1:]...)
	// by making KindTest a Writer, we can simply wire k to logs
	// writing to k will write to proper logs.
	cmd.Stdout = k
	cmd.Stderr = k

	err := cmd.Run()
	if err != nil {
		return err
	}
	return nil
}

// Option that can be passed to the NewKindTester function in order to change the configuration
// of the test cluster
type Option func(k *KindTest)

// Deploy can be passed to NewKindTester to deploy extra components, in addition to the base deployment.
func Deploy(manifest string) Option {
	return func(k *KindTest) {
		k.deployManifests = append(k.deployManifests, manifest)
	}
}

// ExportLogs can be passed to NewKindTester to specify the folder where the kubernetes logs will be exported after the tests.
func ExportLogs(folder string) Option {
	return func(k *KindTest) {
		k.logsDir = folder
	}
}

// Timeout for long-running operations (e.g. deployments, readiness probes...)
func Timeout(t time.Duration) Option {
	return func(k *KindTest) {
		k.timeout = t
	}
}

// LocalImage is passed to NewKindTester to allow loading a local Docker image into the cluster
func LocalImage(nameTag string) Option {
	return func(k *KindTest) {
		k.localImages = append(k.localImages, nameTag)
	}
}

// NewKindTester creates a kind cluster tester given a name and set of Option instances.
func NewKindTester(kindClusterName string, filePath string, t *testing.T, options ...Option) *KindTest {

	k := &KindTest{
		clusterName: kindClusterName,
		timeout: 10 * time.Minute,
		kindConfigPath: filePath,
		testInstance: t,
	}
	for _, option := range options {
		option(k)
	}
	return k
}

// Prepare the kind cluster.
func (k *KindTest) Create() error {
	err := k.RunCmd("kind", "create", "cluster", "--name", k.clusterName, "--config", k.kindConfigPath)

	if err != nil {
		return fmt.Errorf("failed to create cluster: %v", err)
	}

	for _, img := range k.localImages {
		if err := k.RunCmd("kind", "load", "docker-image", "--name", k.clusterName, img); err != nil {
			return fmt.Errorf("failed to load image: %v", err)
		}
	}
	for _, mf := range k.deployManifests {
		kubectlContext := fmt.Sprintf("kind-%v", k.clusterName)
		if err := k.RunCmd("kubectl", "--context", kubectlContext, "apply", "-f", mf); err != nil {
			return fmt.Errorf("failed to deploy manifest: %v", err)
		}
	}
	return nil
}

func (k *KindTest) Destroy() error {
	if k.logsDir != "" {
		if err := k.RunCmd("kind", "export", "logs", k.logsDir, "--name", k.clusterName); err != nil {
			return fmt.Errorf("failed to export logs: %v. will not teardown", err)
		}
	}

	if err := k.RunCmd("kind", "delete", "cluster", "--name", k.clusterName); err != nil {
		return fmt.Errorf("failed to destroy cluster: %v", err)
	}
	return nil
}

func TestE2EWithCommand(t *testing.T) {
	t.Parallel()
	if testing.Short() {
		t.Skip("skipping test in short mode.")
	}

	var kindClusterConfigs = []string{
		"previous",
		"current",
		"next",
	}
	// Iterate over each Kubernetes version
	for _, version := range kindClusterConfigs {
		version := version
		// Define a subtest for each combination
		t.Run(version, func(t *testing.T) {
			t.Parallel() // Allow tests to run in parallel

			randomInt := strconv.Itoa(rand.Intn(100))
			kindClusterName := fmt.Sprintf("kured-e2e-command-%v-%v", version, randomInt)
			kindClusterConfigFile := fmt.Sprintf("../../.github/kind-cluster-%v.yaml", version)
			kindContext := fmt.Sprintf("kind-%v", kindClusterName)

			k := NewKindTester(kindClusterName, kindClusterConfigFile, t, LocalImage(kuredDevImage), Deploy("../../kured-rbac.yaml"), Deploy("testfiles/kured-ds.yaml"))
			defer k.FlushLog()

			err := k.Create()
			if err != nil {
				t.Fatalf("Error creating cluster %v", err)
			}
			defer func(k *KindTest) {
				err := k.Destroy()
				if err != nil {
					t.Fatalf("Error destroying cluster %v", err)
				}
			}(k)

			k.Write([]byte("Now running e2e tests"))

			if err := k.RunCmd("bash", "testfiles/create-reboot-sentinels.sh", kindContext); err != nil {
				t.Fatalf("failed to create sentinels: %v", err)
			}

			if err := k.RunCmd("bash", "testfiles/follow-coordinated-reboot.sh", kindContext); err != nil {
				t.Fatalf("failed to follow reboot: %v", err)
			}
		})
	}
}

func TestE2EWithSignal(t *testing.T) {
	t.Parallel()
	if testing.Short() {
		t.Skip("skipping test in short mode.")
	}

	var kindClusterConfigs = []string{
		"previous",
		"current",
		"next",
	}
	// Iterate over each Kubernetes version
	for _, version := range kindClusterConfigs {
		version := version
		// Define a subtest for each combination
		t.Run(version, func(t *testing.T) {
			t.Parallel() // Allow tests to run in parallel

			randomInt := strconv.Itoa(rand.Intn(100))
			kindClusterName := fmt.Sprintf("kured-e2e-signal-%v-%v", version, randomInt)
			kindClusterConfigFile := fmt.Sprintf("../../.github/kind-cluster-%v.yaml", version)
			kindContext := fmt.Sprintf("kind-%v", kindClusterName)

			k := NewKindTester(kindClusterName, kindClusterConfigFile, t, LocalImage(kuredDevImage), Deploy("../../kured-rbac.yaml"), Deploy("testfiles/kured-ds-signal.yaml"))
			defer k.FlushLog()

			err := k.Create()
			if err != nil {
				t.Fatalf("Error creating cluster %v", err)
			}
			defer func(k *KindTest) {
				err := k.Destroy()
				if err != nil {
					t.Fatalf("Error destroying cluster %v", err)
				}
			}(k)

			k.Write([]byte("Now running e2e tests"))

			if err := k.RunCmd("bash", "testfiles/create-reboot-sentinels.sh", kindContext); err != nil {
				t.Fatalf("failed to create sentinels: %v", err)
			}

			if err := k.RunCmd("bash", "testfiles/follow-coordinated-reboot.sh", kindContext); err != nil {
				t.Fatalf("failed to follow reboot: %v", err)
			}
		})
	}
}

func TestE2EConcurrentWithCommand(t *testing.T) {
	t.Parallel()
	if testing.Short() {
		t.Skip("skipping test in short mode.")
	}

	var kindClusterConfigs = []string{
		"previous",
		"current",
		"next",
	}
	// Iterate over each Kubernetes version
	for _, version := range kindClusterConfigs {
		version := version
		// Define a subtest for each combination
		t.Run(version, func(t *testing.T) {
			t.Parallel() // Allow tests to run in parallel

			randomInt := strconv.Itoa(rand.Intn(100))
			kindClusterName := fmt.Sprintf("kured-e2e-concurrentcommand-%v-%v", version, randomInt)
			kindClusterConfigFile := fmt.Sprintf("../../.github/kind-cluster-%v.yaml", version)
			kindContext := fmt.Sprintf("kind-%v", kindClusterName)

			k := NewKindTester(kindClusterName, kindClusterConfigFile, t, LocalImage(kuredDevImage), Deploy("../../kured-rbac.yaml"), Deploy("testfiles/kured-ds-concurrent-command.yaml"))
			defer k.FlushLog()

			err := k.Create()
			if err != nil {
				t.Fatalf("Error creating cluster %v", err)
			}
			defer func(k *KindTest) {
				err := k.Destroy()
				if err != nil {
					t.Fatalf("Error destroying cluster %v", err)
				}
			}(k)

			k.Write([]byte("Now running e2e tests"))

			if err := k.RunCmd("bash", "testfiles/create-reboot-sentinels.sh", kindContext); err != nil {
				t.Fatalf("failed to create sentinels: %v", err)
			}

			if err := k.RunCmd("bash", "testfiles/follow-coordinated-reboot.sh", kindContext); err != nil {
				t.Fatalf("failed to follow reboot: %v", err)
			}
		})
	}
}

func TestE2EConcurrentWithSignal(t *testing.T) {
	t.Parallel()
	if testing.Short() {
		t.Skip("skipping test in short mode.")
	}

	var kindClusterConfigs = []string{
		"previous",
		"current",
		"next",
	}
	// Iterate over each Kubernetes version
	for _, version := range kindClusterConfigs {
		version := version
		// Define a subtest for each combination
		t.Run(version, func(t *testing.T) {
			t.Parallel() // Allow tests to run in parallel

			randomInt := strconv.Itoa(rand.Intn(100))
			kindClusterName := fmt.Sprintf("kured-e2e-concurrentsignal-%v-%v", version, randomInt)
			kindClusterConfigFile := fmt.Sprintf("../../.github/kind-cluster-%v.yaml", version)
			kindContext := fmt.Sprintf("kind-%v", kindClusterName)

			k := NewKindTester(kindClusterName, kindClusterConfigFile, t, LocalImage(kuredDevImage), Deploy("../../kured-rbac.yaml"), Deploy("testfiles/kured-ds-concurrent-signal.yaml"))
			defer k.FlushLog()

			err := k.Create()
			if err != nil {
				t.Fatalf("Error creating cluster %v", err)
			}
			defer func(k *KindTest) {
				err := k.Destroy()
				if err != nil {
					t.Fatalf("Error destroying cluster %v", err)
				}
			}(k)

			k.Write([]byte("Now running e2e tests"))

			if err := k.RunCmd("bash", "testfiles/create-reboot-sentinels.sh", kindContext); err != nil {
				t.Fatalf("failed to create sentinels: %v", err)
			}

			if err := k.RunCmd("bash", "testfiles/follow-coordinated-reboot.sh", kindContext); err != nil {
				t.Fatalf("failed to follow reboot: %v", err)
			}
		})
	}
}

func TestCordonningIsKept(t *testing.T) {
	t.Parallel()
	if testing.Short() {
		t.Skip("skipping test in short mode.")
	}

	var kindClusterConfigs = []string{
		"concurrency1",
		"concurrency2",
	}
	// Iterate over each test variant
	for _, variant := range kindClusterConfigs {
		variant := variant
		// Define a subtest for each combination
		t.Run(variant, func(t *testing.T) {
			t.Parallel() // Allow tests to run in parallel

			randomInt := strconv.Itoa(rand.Intn(100))
			kindClusterName := fmt.Sprintf("kured-e2e-cordon-%v-%v", variant, randomInt)
			kindClusterConfigFile := "../../.github/kind-cluster-next.yaml"
			kindContext := fmt.Sprintf("kind-%v", kindClusterName)

			var manifest string
			if variant == "concurrency1" {
				manifest = "testfiles/kured-ds-signal.yaml"
			} else {
				manifest = "testfiles/kured-ds-concurrent-signal.yaml"
			}
			k := NewKindTester(kindClusterName, kindClusterConfigFile, t, LocalImage(kuredDevImage), Deploy("../../kured-rbac.yaml"), Deploy(manifest))
			defer k.FlushLog()

			err := k.Create()
			if err != nil {
				t.Fatalf("Error creating cluster %v", err)
			}
			defer func(k *KindTest) {
				err := k.Destroy()
				if err != nil {
					t.Fatalf("Error destroying cluster %v", err)
				}
			}(k)

			k.Write([]byte("Now running e2e tests"))

			if err := k.RunCmd("bash", "testfiles/node-stays-as-cordonned.sh", kindContext); err != nil {
				t.Fatalf("node did not reboot in time: %v", err)
			}
		})
	}
}
func TestE2EBlocker(t *testing.T) {
	t.Parallel()
	if testing.Short() {
		t.Skip("skipping test in short mode.")
	}

	var kindClusterConfigs = []string{
		"podblocker",
	}
	// Iterate over each variant of the test
	for _, variant := range kindClusterConfigs {
		variant := variant
		// Define a subtest for each combination
		t.Run(variant, func(t *testing.T) {
			t.Parallel() // Allow tests to run in parallel

			randomInt := strconv.Itoa(rand.Intn(100))
			kindClusterName := fmt.Sprintf("kured-e2e-cordon-%v-%v", variant, randomInt)
			kindClusterConfigFile := "../../.github/kind-cluster-next.yaml"
			kindContext := fmt.Sprintf("kind-%v", kindClusterName)

			k := NewKindTester(kindClusterName, kindClusterConfigFile, t, LocalImage(kuredDevImage), Deploy("../../kured-rbac.yaml"), Deploy(fmt.Sprintf("testfiles/kured-ds-%v.yaml", variant)))
			defer k.FlushLog()

			err := k.Create()
			if err != nil {
				t.Fatalf("Error creating cluster %v", err)
			}
			defer func(k *KindTest) {
				err := k.Destroy()
				if err != nil {
					t.Fatalf("Error destroying cluster %v", err)
				}
			}(k)

			k.Write([]byte("Now running e2e tests"))

			if err := k.RunCmd("bash", fmt.Sprintf("testfiles/%v.sh", variant), kindContext); err != nil {
				t.Fatalf("node blocker test did not succeed: %v", err)
			}
		})
	}
}
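The functional options compose at construction time; the e2e tests above use LocalImage and Deploy, while ExportLogs and Timeout serve the same purpose. A brief sketch of a test that also exports logs and shortens the timeout, assuming it sits alongside the helpers in this package; the cluster name and manifest paths are illustrative:

func TestExampleWithLogs(t *testing.T) {
	// Illustrative only: combines the functional options defined above.
	k := NewKindTester(
		"kured-e2e-example",
		"../../.github/kind-cluster-current.yaml",
		t,
		LocalImage(kuredDevImage),
		Deploy("../../kured-rbac.yaml"),
		Deploy("testfiles/kured-ds.yaml"),
		ExportLogs(t.TempDir()), // kind logs land here on Destroy()
		Timeout(5*time.Minute),
	)
	defer k.FlushLog()

	if err := k.Create(); err != nil {
		t.Fatalf("Error creating cluster %v", err)
	}
	defer func() {
		if err := k.Destroy(); err != nil {
			t.Fatalf("Error destroying cluster %v", err)
		}
	}()
}
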
11  tests/kind/testfiles/create-reboot-sentinels.sh  Executable file
@@ -0,0 +1,11 @@
#!/usr/bin/env bash

kubectl_flags=( )
[[ "$1" != "" ]] && kubectl_flags=("${kubectl_flags[@]}" --context "$1")

# To speed up the system, let's not kill the control plane.
for nodename in $(${KUBECTL_CMD:-kubectl} "${kubectl_flags[@]}" get nodes -o name | grep -v control-plane); do
  echo "Creating reboot sentinel on $nodename"
  docker exec "${nodename/node\//}" hostname
  docker exec "${nodename/node\//}" touch "${SENTINEL_FILE:-/var/run/reboot-required}"
done
@@ -1,11 +1,14 @@
#!/usr/bin/env bash

NODECOUNT=${NODECOUNT:-5}
KUBECTL_CMD="${KUBECTL_CMD:-kubectl}"
REBOOTCOUNT=${REBOOTCOUNT:-2} # By default we only create two sentinels in create-reboot-sentinels.
DEBUG="${DEBUG:-false}"
CONTAINER_NAME_FORMAT=${CONTAINER_NAME_FORMAT:-"chart-testing-*"}

kubectl_flags=( )
[[ "$1" != "" ]] && kubectl_flags=("${kubectl_flags[@]}" --context "$1")

tmp_dir=$(mktemp -d -t kured-XXXX)

function gather_logs_and_cleanup {
  if [[ -f "$tmp_dir"/node_output ]]; then
    rm "$tmp_dir"/node_output
@@ -18,15 +21,15 @@ function gather_logs_and_cleanup {
    # This is useful to see if containers have crashed.
    echo "docker ps -a:"
    docker ps -a
    echo "docker journal logs"
    journalctl -u docker --no-pager
    echo "docker journal logs"
    journalctl -u docker --no-pager

    # This is useful to see if the nodes have _properly_ rebooted.
    # It should show the reboot/two container starts per node.
    for name in $(docker ps -a -f "name=${CONTAINER_NAME_FORMAT}" -q); do
    for id in $(docker ps -a -q); do
      echo "############################################################"
      echo "docker logs for container $name:"
      docker logs "$name"
      echo "docker logs for container $id:"
      docker logs "$id"
    done

  fi
@@ -35,24 +38,28 @@ trap gather_logs_and_cleanup EXIT

declare -A was_unschedulable
declare -A has_recovered
max_attempts="60"
sleep_time=60
max_attempts="200"
sleep_time=5
attempt_num=1

# Get docker info of each of those kind containers. If one has crashed, restart it.

set +o errexit
echo "There are $NODECOUNT nodes in the cluster"
until [ ${#was_unschedulable[@]} == "$NODECOUNT" ] && [ ${#has_recovered[@]} == "$NODECOUNT" ]
echo "There are $REBOOTCOUNT nodes total needing reboot in the cluster"
until [ ${#was_unschedulable[@]} == "$REBOOTCOUNT" ] && [ ${#has_recovered[@]} == "$REBOOTCOUNT" ]
do
  echo "${#was_unschedulable[@]} nodes were removed from pool once:" "${!was_unschedulable[@]}"
  echo "${#has_recovered[@]} nodes removed from the pool are now back:" "${!has_recovered[@]}"

  "$KUBECTL_CMD" get nodes -o custom-columns=NAME:.metadata.name,SCHEDULABLE:.spec.unschedulable --no-headers > "$tmp_dir"/node_output

  ${KUBECTL_CMD:-kubectl} "${kubectl_flags[@]}" get nodes -o custom-columns=NAME:.metadata.name,SCHEDULABLE:.spec.unschedulable --no-headers | grep -v control-plane > "$tmp_dir"/node_output
  if [[ "$DEBUG" == "true" ]]; then
    # This is useful to see if a node gets stuck after drain, and doesn't
    # come back up.
    echo "Result of command $KUBECTL_CMD get nodes ... showing unschedulable nodes:"
    echo "Result of command kubectl unschedulable nodes:"
    cat "$tmp_dir"/node_output
  fi

  while read -r node; do
    unschedulable=$(echo "$node" | grep true | cut -f 1 -d ' ')
    if [ -n "$unschedulable" ] && [ -z ${was_unschedulable["$unschedulable"]+x} ] ; then
@@ -64,9 +71,15 @@ do
      echo "$schedulable has recovered!"
      has_recovered["$schedulable"]=1
    fi

    # If the container has crashed, restart it.
    node_name=$(echo "$node" | cut -f 1 -d ' ')
    stopped_container_id=$(docker container ls --filter=name="$node_name" --filter=status=exited -q)
    if [ -n "$stopped_container_id" ]; then echo "Node $stopped_container_id needs restart"; docker start "$stopped_container_id"; echo "Container started."; fi

  done < "$tmp_dir"/node_output

  if [[ "${#has_recovered[@]}" == "$NODECOUNT" ]]; then
  if [[ "${#has_recovered[@]}" == "$REBOOTCOUNT" ]]; then
    echo "All nodes recovered."
    break
  else
59  tests/kind/testfiles/node-stays-as-cordonned.sh  Executable file
@@ -0,0 +1,59 @@
#!/usr/bin/env bash

kubectl_flags=( )
[[ "$1" != "" ]] && kubectl_flags=("${kubectl_flags[@]}" --context "$1")

cordon() {
  kubectl "${kubectl_flags[@]}" cordon "${precordonned_node}"
}

create_sentinel() {
  docker exec "${precordonned_node}" touch "${SENTINEL_FILE:-/var/run/reboot-required}"
  docker exec "${notcordonned_node}" touch "${SENTINEL_FILE:-/var/run/reboot-required}"
}

check_reboot_required() {
  while true;
  do
    docker exec "${precordonned_node}" stat /var/run/reboot-required > /dev/null && echo "Reboot still required" || return 0
    sleep 3
  done
}

check_node_back_online_as_cordonned() {
  sleep 5 # For safety, wait for 5 seconds, so that the kubectl command succeeds.
  # This test might be giving us false positive until we work on reliability of the
  # test.
  while true;
  do
    result=$(kubectl "${kubectl_flags[@]}" get node "${precordonned_node}" --no-headers | awk '{print $2;}')
    test "${result}" != "Ready,SchedulingDisabled" && echo "Node ${precordonned_node} in state ${result}" || return 0
    sleep 3
  done
}

check_node_back_online_as_uncordonned() {
  while true;
  do
    result=$(kubectl "${kubectl_flags[@]}" get node "${notcordonned_node}" --no-headers | awk '{print $2;}')
    test "${result}" != "Ready" && echo "Node ${notcordonned_node} in state ${result}" || return 0
    sleep 3
  done
}
### Start main

worker_nodes=$(${KUBECTL_CMD:-kubectl} "${kubectl_flags[@]}" get nodes -o custom-columns=name:metadata.name --no-headers | grep worker)
precordonned_node=$(echo "$worker_nodes" | head -n 1)
notcordonned_node=$(echo "$worker_nodes" | tail -n 1)

# Wait for kured to install correctly
sleep 15
cordon
create_sentinel
check_reboot_required
echo "Node has rebooted, but may take time to come back ready"
check_node_back_online_as_cordonned
check_node_back_online_as_uncordonned
echo "Showing final node state"
${KUBECTL_CMD:-kubectl} "${kubectl_flags[@]}" get nodes
echo "Test successful"
54  tests/kind/testfiles/podblocker.sh  Executable file
@@ -0,0 +1,54 @@
#!/usr/bin/env bash

kubectl_flags=( )
[[ "$1" != "" ]] && kubectl_flags=("${kubectl_flags[@]}" --context "$1")

function gather_logs_and_cleanup {
  for id in $(docker ps -q); do
    echo "############################################################"
    echo "docker logs for container $id:"
    docker logs "$id"
  done
  ${KUBECTL_CMD:-kubectl} "${kubectl_flags[@]}" logs ds/kured --all-pods -n kube-system
}
trap gather_logs_and_cleanup EXIT

set +o errexit
worker=$(${KUBECTL_CMD:-kubectl} "${kubectl_flags[@]}" get nodes -o custom-columns=name:metadata.name --no-headers | grep worker | head -n 1)

${KUBECTL_CMD:-kubectl} "${kubectl_flags[@]}" label nodes "$worker" blocked-host=yes

${KUBECTL_CMD:-kubectl} "${kubectl_flags[@]}" apply -f - << EOF
apiVersion: v1
kind: Pod
metadata:
  name: nginx
  labels:
    app: blocker
spec:
  containers:
  - name: nginx
    image: nginx
    imagePullPolicy: IfNotPresent
  nodeSelector:
    blocked-host: "yes"
EOF

docker exec "$worker" touch "${SENTINEL_FILE:-/var/run/reboot-required}"

set -o errexit
max_attempts="100"
attempt_num=1
sleep_time=5

until ${KUBECTL_CMD:-kubectl} "${kubectl_flags[@]}" logs ds/kured --all-pods -n kube-system | grep -i -e "Reboot.*blocked"
do
  if (( attempt_num == max_attempts )); then
    echo "Attempt $attempt_num failed and there are no more attempts left!"
    exit 1
  else
    echo "Did not find 'reboot blocked' in the log, retrying in $sleep_time seconds (Attempt #$attempt_num)"
    sleep "$sleep_time"
  fi
  (( attempt_num++ ))
done