Mirror of https://github.com/kubereboot/kured.git (synced 2026-03-12 13:50:28 +00:00)

Compare commits: 1.12.0...cleanup/to (123 commits)
Commits in this range (abbreviated SHA1s):

59cbea5e25, 776c35c1e1, 9a4b8fdb32, 3b9b190422, f22b1abd17, c159b37fcc,
351ca71787, 16dc5e30d9, aa971697ff, d019e7a50a, ee81617645, d7adcf6e1e,
409ff0a3e6, 3be3cd46b5, e8202c602c, 752176d16b, d30a71e1d3, 815df5e1e9,
77327b3915, ec328e33d6, 54e127c2ad, 1867c3253e, 05a3ff85a3, 19846c73f2,
ba62c32cbf, 4c75199b41, 91eb403942, a27c755260, 2a6d119b3b, b17224addc,
a2f21ebe49, 4d2f26f483, b358be7617, e88434b619, 1b12e52434, 64e40a62b0,
6690396679, 9acb2450ea, e1a5b7d705, 72f52f2c6f, 6df454c0eb, c09e65eab1,
a34c994f4b, 60c54bef31, 8afa302680, de42273849, d3e2c9af95, 00648786b7,
c7f4380847, c659c25b94, f44ced2d04, e7d24bfff0, 0378c8a8c5, 2cfeb34c03,
3bfacca254, 46e1b9616b, fe95f17503, 462a063b6e, e664de6c6f, b666474cf1,
64313f82ef, 59ba53584e, b2ffc0d154, b7edf8b345, 4e01e607cc, 1929c11297,
28832f5cfb, 3c79c750e1, 58afedd842, 57783966db, 316a0ef4a3, 7a86e65c69,
efa0fe808d, c2f97614dd, e710e05658, 4ff3378df5, 002f331486, 2993afb329,
97e1f56008, 4c9ed478d4, 6e0af2f320, 1ea3823069, 0063141b89, 3a1cfe395e,
ae3ab9f3e1, 0b27a7ea80, 2596dcdcab, 00c8b5254b, 6aa6a96b46, a7b155a78f,
031ceed1f1, 0ceb062a47, a4fba5a5bc, 942f9d7eed, fd58b79413, 132215ee97,
25662304c2, 887b2e2427, 6afa8513c8, 94a4387407, 9ab71c894f, 72eda8a7c3,
7bb9b75e2a, dfb8441078, 0e0cf7fac1, 06af12114d, 477f356571, ad1e9b8401,
80628b1b79, 30673c0391, 35e7bf9897, f8551b6714, d87d585b9c, 6ff57552c7,
36c78d94ce, 0bc867cf11, c6d9bf07e6, fb84fa8253, 05414fb9d0, 230fa45461,
6aca815125, eed2df6493, ff773d96bd
.github/kind-cluster-1.24.yaml (vendored, deleted, 13 lines)

```diff
@@ -1,13 +0,0 @@
-kind: Cluster
-apiVersion: kind.x-k8s.io/v1alpha4
-nodes:
-- role: control-plane
-  image: "kindest/node:v1.24.7"
-- role: control-plane
-  image: "kindest/node:v1.24.7"
-- role: control-plane
-  image: "kindest/node:v1.24.7"
-- role: worker
-  image: "kindest/node:v1.24.7"
-- role: worker
-  image: "kindest/node:v1.24.7"
```
.github/kind-cluster-1.25.yaml (vendored, 10 lines changed)

```diff
@@ -2,12 +2,12 @@ kind: Cluster
 apiVersion: kind.x-k8s.io/v1alpha4
 nodes:
 - role: control-plane
-  image: kindest/node:v1.25.3
+  image: kindest/node:v1.25.11
 - role: control-plane
-  image: kindest/node:v1.25.3
+  image: kindest/node:v1.25.11
 - role: control-plane
-  image: kindest/node:v1.25.3
+  image: kindest/node:v1.25.11
 - role: worker
-  image: kindest/node:v1.25.3
+  image: kindest/node:v1.25.11
 - role: worker
-  image: kindest/node:v1.25.3
+  image: kindest/node:v1.25.11
```
.github/kind-cluster-1.26.yaml (vendored, 10 lines changed)

```diff
@@ -2,12 +2,12 @@ kind: Cluster
 apiVersion: kind.x-k8s.io/v1alpha4
 nodes:
 - role: control-plane
-  image: "kindest/node:v1.26.0"
+  image: "kindest/node:v1.26.6"
 - role: control-plane
-  image: "kindest/node:v1.26.0"
+  image: "kindest/node:v1.26.6"
 - role: control-plane
-  image: "kindest/node:v1.26.0"
+  image: "kindest/node:v1.26.6"
 - role: worker
-  image: "kindest/node:v1.26.0"
+  image: "kindest/node:v1.26.6"
 - role: worker
-  image: "kindest/node:v1.26.0"
+  image: "kindest/node:v1.26.6"
```
.github/kind-cluster-1.27.yaml (vendored, new file, 13 lines)

```diff
@@ -0,0 +1,13 @@
+kind: Cluster
+apiVersion: kind.x-k8s.io/v1alpha4
+nodes:
+- role: control-plane
+  image: "kindest/node:v1.27.3"
+- role: control-plane
+  image: "kindest/node:v1.27.3"
+- role: control-plane
+  image: "kindest/node:v1.27.3"
+- role: worker
+  image: "kindest/node:v1.27.3"
+- role: worker
+  image: "kindest/node:v1.27.3"
```
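Taken together, 1.24 drops out of the test matrix and the retained configs track newer kind node images. For local debugging, the same configs can be fed straight to the kind CLI; a minimal sketch (the cluster name is arbitrary, and `kind` plus `kubectl` are assumed to be installed):

```console
# Spin up the same 5-node topology CI uses (3 control-plane, 2 workers)
kind create cluster --name kured-test --config .github/kind-cluster-1.27.yaml

# All five nodes should report Ready before installing kured
kubectl get nodes
```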
.github/workflows/on-main-push.yaml (vendored, 6 lines changed)

```diff
@@ -22,7 +22,7 @@ jobs:
       - uses: actions/checkout@v3

       - name: Ensure go version
-        uses: actions/setup-go@v3
+        uses: actions/setup-go@v4
        with:
          go-version-file: 'go.mod'
          check-latest: true
@@ -36,7 +36,7 @@ jobs:

       - name: Extract metadata (tags, labels) for Docker
        id: meta
-        uses: docker/metadata-action@57396166ad8aefe6098280995947635806a0e6ea
+        uses: docker/metadata-action@818d4b7b91585d195f67373fd9cb0332e31a7175
        with:
          images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}

@@ -60,7 +60,7 @@ jobs:
          COSIGN_EXPERIMENTAL: 1

       - name: Build image
-        uses: docker/build-push-action@v3
+        uses: docker/build-push-action@v4
        with:
          context: .
          platforms: linux/arm64, linux/amd64, linux/arm/v7, linux/arm/v6, linux/386
```
.github/workflows/on-pr.yaml (vendored, 118 lines changed)

```diff
@@ -11,7 +11,7 @@ jobs:
       - name: checkout
        uses: actions/checkout@v3
       - name: Ensure go version
-        uses: actions/setup-go@v3
+        uses: actions/setup-go@v4
        with:
          go-version-file: 'go.mod'
          check-latest: true
@@ -19,7 +19,7 @@ jobs:
        run: go test -json ./... > test.json
       - name: Annotate tests
        if: always()
-        uses: guyarb/golang-test-annoations@v0.6.0
+        uses: guyarb/golang-test-annoations@v0.7.0
        with:
          test-results: test.json

@@ -37,7 +37,7 @@ jobs:
     steps:
       - uses: actions/checkout@v3
       - name: Ensure go version
-        uses: actions/setup-go@v3
+        uses: actions/setup-go@v4
        with:
          go-version-file: 'go.mod'
          check-latest: true
@@ -56,7 +56,7 @@ jobs:
     steps:
       - uses: actions/checkout@v3
       - name: Link Checker
-        uses: lycheeverse/lychee-action@4dcb8bee2a0a4531cba1a1f392c54e8375d6dd81
+        uses: lycheeverse/lychee-action@ec3ed119d4f44ad2673a7232460dc7dff59d2421
        env:
          GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN}}
        with:
@@ -72,7 +72,7 @@ jobs:
     steps:
       - uses: actions/checkout@v3
       - name: Ensure go version
-        uses: actions/setup-go@v3
+        uses: actions/setup-go@v4
        with:
          go-version-file: 'go.mod'
          check-latest: true
@@ -87,12 +87,15 @@ jobs:
        id: tags
       - name: Build image
        run: VERSION="${{ steps.tags.outputs.sha_short }}" make image
-      - uses: Azure/container-scan@v0
-        env:
-          # See https://github.com/goodwithtech/dockle/issues/188
-          DOCKLE_HOST: "unix:///var/run/docker.sock"
+      - name: Run Trivy vulnerability scanner
+        uses: aquasecurity/trivy-action@41f05d9ecffa2ed3f1580af306000f734b733e54
        with:
-          image-name: ghcr.io/${{ github.repository }}:${{ steps.tags.outputs.sha_short }}
+          image-ref: 'ghcr.io/${{ github.repository }}:${{ steps.tags.outputs.sha_short }}'
+          format: 'table'
+          exit-code: '1'
+          ignore-unfixed: true
+          vuln-type: 'os,library'
+          severity: 'CRITICAL,HIGH'

   # This ensures the latest code works with the manifests built from tree.
   # It is useful for two things:
@@ -107,13 +110,13 @@ jobs:
      fail-fast: false
      matrix:
        kubernetes:
-          - "1.24"
          - "1.25"
          - "1.26"
+          - "1.27"
     steps:
       - uses: actions/checkout@v3
       - name: Ensure go version
-        uses: actions/setup-go@v3
+        uses: actions/setup-go@v4
        with:
          go-version-file: 'go.mod'
          check-latest: true
@@ -142,7 +145,7 @@ jobs:

       # Default name for helm/kind-action kind clusters is "chart-testing"
       - name: Create kind cluster with 5 nodes
-        uses: helm/kind-action@v1.5.0
+        uses: helm/kind-action@v1.8.0
        with:
          config: .github/kind-cluster-${{ matrix.kubernetes }}.yaml
          version: v0.14.0
@@ -159,7 +162,94 @@ jobs:
          kubectl apply -f kured-rbac.yaml && kubectl apply -f kured-ds.yaml

       - name: Ensure kured is ready
-        uses: nick-invision/retry@v2.8.2
+        uses: nick-invision/retry@v2.8.3
        with:
          timeout_minutes: 10
          max_attempts: 10
          retry_wait_seconds: 60
          # DESIRED CURRENT READY UP-TO-DATE AVAILABLE should all be = to cluster_size
          command: "kubectl get ds -n kube-system kured | grep -E 'kured.*5.*5.*5.*5.*5'"

       - name: Create reboot sentinel files
        run: |
          ./tests/kind/create-reboot-sentinels.sh

       - name: Follow reboot until success
        env:
          DEBUG: true
        run: |
          ./tests/kind/follow-coordinated-reboot.sh

+  # This ensures the latest code works with the manifests built from tree.
+  # It is useful for two things:
+  # - Test manifests changes (obviously), ensuring they don't break existing clusters
+  # - Ensure manifests work with the latest versions even with no manifest change
+  #   (compared to helm charts, manifests cannot easily template changes based on versions)
+  # Helm charts are _trailing_ releases, while manifests are done during development.
+  # Concurrency = 2
+  e2e-manifests-concurent:
+    name: End-to-End test with kured with code and manifests from HEAD (concurrent)
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        kubernetes:
+          - "1.25"
+          - "1.26"
+          - "1.27"
+    steps:
+      - uses: actions/checkout@v3
+      - name: Ensure go version
+        uses: actions/setup-go@v4
+        with:
+          go-version-file: 'go.mod'
+          check-latest: true
+      - name: Set up QEMU
+        uses: docker/setup-qemu-action@v2
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v2
+      - name: Setup GoReleaser
+        run: make bootstrap-tools
+      - name: Find current tag version
+        run: echo "sha_short=$(git rev-parse --short HEAD)" >> $GITHUB_OUTPUT
+        id: tags
+      - name: Build artifacts
+        run: |
+          VERSION="${{ steps.tags.outputs.sha_short }}" make image
+          VERSION="${{ steps.tags.outputs.sha_short }}" make manifest
+
+      - name: Workaround "Failed to attach 1 to compat systemd cgroup /actions_job/..." on gh actions
+        run: |
+          sudo bash << EOF
+          cp /etc/docker/daemon.json /etc/docker/daemon.json.old
+          echo '{}' > /etc/docker/daemon.json
+          systemctl restart docker || journalctl --no-pager -n 500
+          systemctl status docker
+          EOF
+
+      # Default name for helm/kind-action kind clusters is "chart-testing"
+      - name: Create kind cluster with 5 nodes
+        uses: helm/kind-action@v1.8.0
+        with:
+          config: .github/kind-cluster-${{ matrix.kubernetes }}.yaml
+          version: v0.14.0
+
+      - name: Preload previously built images onto kind cluster
+        run: kind load docker-image ghcr.io/${{ github.repository }}:${{ steps.tags.outputs.sha_short }} --name chart-testing
+
+      - name: Do not wait for an hour before detecting the rebootSentinel
+        run: |
+          sed -i 's/#\(.*\)--period=1h/\1--period=30s/g' kured-ds.yaml
+          sed -i 's/#\(.*\)--concurrency=1/\1--concurrency=2/g' kured-ds.yaml
+
+      - name: Install kured with kubectl
+        run: |
+          kubectl apply -f kured-rbac.yaml && kubectl apply -f kured-ds.yaml
+
+      - name: Ensure kured is ready
+        uses: nick-invision/retry@v2.8.3
+        with:
+          timeout_minutes: 10
+          max_attempts: 10
```
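A detail worth noting in the new concurrent job: the two `sed` invocations (verbatim above) do double duty, uncommenting a flag in the manifest and changing its value in one pass. The capture group keeps the indentation and list dash while dropping the leading `#`:

```console
# In kured-ds.yaml the flag lines ship commented out, e.g.
#            # - --period=1h
# After the substitution the flag is live with the shorter interval:
#            - --period=30s
sed -i 's/#\(.*\)--period=1h/\1--period=30s/g' kured-ds.yaml
sed -i 's/#\(.*\)--concurrency=1/\1--concurrency=2/g' kured-ds.yaml
```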
.github/workflows/on-tag.yaml (vendored, 21 lines changed)

```diff
@@ -23,7 +23,7 @@ jobs:
     steps:
       - uses: actions/checkout@v3
       - name: Ensure go version
-        uses: actions/setup-go@v3
+        uses: actions/setup-go@v4
        with:
          go-version-file: 'go.mod'
          check-latest: true
@@ -42,7 +42,7 @@ jobs:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          COSIGN_EXPERIMENTAL: 1
       - name: Build single image for scan
-        uses: docker/build-push-action@v3
+        uses: docker/build-push-action@v4
        with:
          context: .
          platforms: linux/amd64
@@ -51,12 +51,15 @@ jobs:
          tags: |
            ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ steps.tags.outputs.version }}

-      - uses: Azure/container-scan@v0
-        env:
-          # See https://github.com/goodwithtech/dockle/issues/188
-          DOCKLE_HOST: "unix:///var/run/docker.sock"
+      - name: Run Trivy vulnerability scanner
+        uses: aquasecurity/trivy-action@41f05d9ecffa2ed3f1580af306000f734b733e54
        with:
-          image-name: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ steps.tags.outputs.version }}
+          image-ref: '${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ steps.tags.outputs.version }}'
+          format: 'table'
+          exit-code: '1'
+          ignore-unfixed: true
+          vuln-type: 'os,library'
+          severity: 'CRITICAL,HIGH'

       - name: Login to ghcr.io
        uses: docker/login-action@v2
@@ -67,12 +70,12 @@ jobs:

       - name: Extract metadata (tags, labels) for Docker
        id: meta
-        uses: docker/metadata-action@57396166ad8aefe6098280995947635806a0e6ea
+        uses: docker/metadata-action@818d4b7b91585d195f67373fd9cb0332e31a7175
        with:
          images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}

       - name: Build release images
-        uses: docker/build-push-action@v3
+        uses: docker/build-push-action@v4
        with:
          context: .
          platforms: linux/arm64, linux/amd64, linux/arm/v7, linux/arm/v6, linux/386
```
.github/workflows/periodics-daily.yaml (vendored, 21 lines changed)

```diff
@@ -15,7 +15,7 @@ jobs:
        run: go test -json ./... > test.json
       - name: Annotate tests
        if: always()
-        uses: guyarb/golang-test-annoations@v0.6.0
+        uses: guyarb/golang-test-annoations@v0.7.0
        with:
          test-results: test.json

@@ -25,7 +25,7 @@ jobs:
     steps:
       # Stale by default waits for 60 days before marking PR/issues as stale, and closes them after 21 days.
       # Do not expire the first issues that would allow the community to grow.
-      - uses: actions/stale@v6
+      - uses: actions/stale@v8
        with:
          repo-token: ${{ secrets.GITHUB_TOKEN }}
          stale-issue-message: 'This issue was automatically considered stale due to lack of activity. Please update it and/or join our slack channels to promote it, before it automatically closes (in 7 days).'
@@ -41,7 +41,7 @@ jobs:
     steps:
       - uses: actions/checkout@v3
       - name: Link Checker
-        uses: lycheeverse/lychee-action@4dcb8bee2a0a4531cba1a1f392c54e8375d6dd81
+        uses: lycheeverse/lychee-action@ec3ed119d4f44ad2673a7232460dc7dff59d2421
        env:
          GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN}}
        with:
@@ -54,7 +54,7 @@ jobs:
     steps:
       - uses: actions/checkout@v3
       - name: Ensure go version
-        uses: actions/setup-go@v3
+        uses: actions/setup-go@v4
        with:
          go-version-file: 'go.mod'
          check-latest: true
@@ -69,9 +69,12 @@ jobs:
        id: tags
       - name: Build artifacts
        run: VERSION="${{ steps.tags.outputs.sha_short }}" make image
-      - uses: Azure/container-scan@v0
-        env:
-          # See https://github.com/goodwithtech/dockle/issues/188
-          DOCKLE_HOST: "unix:///var/run/docker.sock"
+      - name: Run Trivy vulnerability scanner
+        uses: aquasecurity/trivy-action@41f05d9ecffa2ed3f1580af306000f734b733e54
        with:
-          image-name: ghcr.io/${{ github.repository }}:${{ steps.tags.outputs.sha_short }}
+          image-ref: 'ghcr.io/${{ github.repository }}:${{ steps.tags.outputs.sha_short }}'
+          format: 'table'
+          exit-code: '1'
+          ignore-unfixed: true
+          vuln-type: 'os,library'
+          severity: 'CRITICAL,HIGH'
```
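All three workflows now run the same Trivy step in place of Azure/container-scan. For reference, a roughly equivalent local scan, sketched under the assumption that the Trivy CLI is installed; the image tag is a placeholder, since kured publishes no `:latest` tag:

```console
# Mirrors the trivy-action inputs: table output, skip findings without an
# available fix, and fail (exit 1) on CRITICAL/HIGH OS or library CVEs.
trivy image \
  --format table \
  --exit-code 1 \
  --ignore-unfixed \
  --vuln-type os,library \
  --severity CRITICAL,HIGH \
  ghcr.io/kubereboot/kured:<tag>
```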
```diff
@@ -2,3 +2,5 @@ app.fossa.com
 cluster.local
 hooks.slack.com
 localhost
+slack://
+teams://
```
````diff
@@ -162,7 +162,7 @@ A test-run with `minikube` could look like this:
 
 ```console
 # start minikube
-minikube start --vm-driver kvm2 --kubernetes-version <k8s-release>
+minikube start --driver=kvm2 --kubernetes-version <k8s-release>
 
 # build kured image and publish to registry accessible by minikube
 make image minikube-publish
````
```diff
@@ -1,4 +1,4 @@
-FROM --platform=$TARGETPLATFORM alpine:3.17.0 as bin
+FROM --platform=$TARGETPLATFORM alpine:3.18.3 as bin
 
 ARG TARGETOS
 ARG TARGETARCH
@@ -19,7 +19,7 @@ RUN set -ex \
     esac \
     && cp /dist/kured_${TARGETOS}_${TARGETARCH}${SUFFIX}/kured /dist/kured;
 
-FROM --platform=$TARGETPLATFORM alpine:3.17.0
+FROM --platform=$TARGETPLATFORM alpine:3.18.3
 RUN apk update --no-cache && apk upgrade --no-cache && apk add --no-cache ca-certificates tzdata
 COPY --from=bin /dist/kured /usr/bin/kured
 ENTRYPOINT ["/usr/bin/kured"]
```
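The platform list in the release workflows pairs with this multi-stage Dockerfile. A hedged local equivalent of what docker/build-push-action drives, assuming a buildx builder with QEMU emulation (as set up by the workflow's setup-qemu/setup-buildx steps) and an illustrative tag:

```console
# Cross-build the kured image for the same platform set the release workflow uses
docker buildx build \
  --platform linux/arm64,linux/amd64,linux/arm/v7,linux/arm/v6,linux/386 \
  --tag ghcr.io/kubereboot/kured:dev \
  .
```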
```diff
@@ -108,5 +108,5 @@ Governance require a 2/3 vote of all Maintainers.
 
 [maintainers-file]: ./MAINTAINERS
 [private-list]: cncf-kured-maintainers@lists.cncf.io
-[meeting-agenda]: https://docs.google.com/document/d/1bsHTjHhqaaZ7yJnXF6W8c89UB_yn-OoSZEmDnIP34n8/edit#
+[meeting-agenda]: https://docs.google.com/document/d/1AWT8YDdqZY-Se6Y1oAlwtujWLVpNVK2M_F_Vfqw06aI/edit
 [decision-issues]: https://github.com/kubereboot/kured/labels/decision
```
```diff
@@ -1,5 +1,5 @@
 Christian Kotzbauer <christian.kotzbauer@gmail.com> (@ckotzbauer)
-Daniel Holbach <daniel@weave.works> (@dholbach)
+Daniel Holbach <daniel.holbach@gmail.com> (@dholbach)
 Hidde Beydals <hidde@weave.works> (@hiddeco)
 Jack Francis <jackfrancis@gmail.com> (@jackfrancis)
 Jean-Philippe Evrard <open-source@a.spamming.party> (@evrardjp)
```
```diff
@@ -45,7 +45,7 @@ If you have any questions about, feedback for or problems with `kured`:
 - Invite yourself to the <a href="https://slack.cncf.io/" target="_blank">CNCF Slack</a>.
 - Ask a question on the [#kured](https://cloud-native.slack.com/archives/kured) slack channel.
 - [File an issue](https://github.com/kubereboot/kured/issues/new).
-- Join us in [our monthly meeting](https://docs.google.com/document/d/1bsHTjHhqaaZ7yJnXF6W8c89UB_yn-OoSZEmDnIP34n8/edit#),
+- Join us in [our monthly meeting](https://docs.google.com/document/d/1AWT8YDdqZY-Se6Y1oAlwtujWLVpNVK2M_F_Vfqw06aI/edit),
   every first Wednesday of the month at 16:00 UTC.
 - You might want to [join the kured-dev mailing list](https://lists.cncf.io/g/cncf-kured-dev) as well.
```
```diff
@@ -30,13 +30,13 @@ import (
 	"github.com/google/shlex"
 
 	shoutrrr "github.com/containrrr/shoutrrr"
-	"github.com/prometheus/client_golang/prometheus"
-	"github.com/prometheus/client_golang/prometheus/promhttp"
 	"github.com/kubereboot/kured/pkg/alerts"
 	"github.com/kubereboot/kured/pkg/daemonsetlock"
 	"github.com/kubereboot/kured/pkg/delaytick"
 	"github.com/kubereboot/kured/pkg/taints"
 	"github.com/kubereboot/kured/pkg/timewindow"
+	"github.com/prometheus/client_golang/prometheus"
+	"github.com/prometheus/client_golang/prometheus/promhttp"
 )
 
 var (
@@ -47,7 +47,10 @@ var (
 	drainTimeout                    time.Duration
 	rebootDelay                     time.Duration
 	period                          time.Duration
+	metricsHost                     string
+	metricsPort                     int
 	drainGracePeriod                int
 	drainPodSelector                string
 	skipWaitForDeleteTimeoutSeconds int
 	dsNamespace                     string
 	dsName                          string
@@ -57,6 +60,7 @@ var (
 	prometheusURL             string
 	preferNoScheduleTaintName string
 	alertFilter               *regexp.Regexp
+	alertFilterMatchOnly      bool
 	alertFiringOnly           bool
 	rebootSentinelFile        string
 	rebootSentinelCommand     string
@@ -73,6 +77,7 @@ var (
 	preRebootNodeLabels  []string
 	postRebootNodeLabels []string
 	nodeID               string
+	concurrency          int
 
 	rebootDays  []string
 	rebootStart string
@@ -124,8 +129,14 @@ func NewRootCommand() *cobra.Command {
 		"node name kured runs on, should be passed down from spec.nodeName via KURED_NODE_ID environment variable")
 	rootCmd.PersistentFlags().BoolVar(&forceReboot, "force-reboot", false,
 		"force a reboot even if the drain fails or times out")
+	rootCmd.PersistentFlags().StringVar(&metricsHost, "metrics-host", "",
+		"host where metrics will listen")
+	rootCmd.PersistentFlags().IntVar(&metricsPort, "metrics-port", 8080,
+		"port number where metrics will listen")
 	rootCmd.PersistentFlags().IntVar(&drainGracePeriod, "drain-grace-period", -1,
 		"time in seconds given to each pod to terminate gracefully, if negative, the default value specified in the pod will be used")
 	rootCmd.PersistentFlags().StringVar(&drainPodSelector, "drain-pod-selector", "",
 		"only drain pods with labels matching the selector (default: '', all pods)")
 	rootCmd.PersistentFlags().IntVar(&skipWaitForDeleteTimeoutSeconds, "skip-wait-for-delete-timeout", 0,
 		"when seconds is greater than zero, skip waiting for the pods whose deletion timestamp is older than N seconds while draining a node")
 	rootCmd.PersistentFlags().DurationVar(&drainTimeout, "drain-timeout", 0,
@@ -148,6 +159,8 @@ func NewRootCommand() *cobra.Command {
 		"Prometheus instance to probe for active alerts")
 	rootCmd.PersistentFlags().Var(&regexpValue{&alertFilter}, "alert-filter-regexp",
 		"alert names to ignore when checking for active alerts")
+	rootCmd.PersistentFlags().BoolVar(&alertFilterMatchOnly, "alert-filter-match-only", false,
+		"Only block if the alert-filter-regexp matches active alerts")
 	rootCmd.PersistentFlags().BoolVar(&alertFiringOnly, "alert-firing-only", false,
 		"only consider firing alerts when checking for active alerts")
 	rootCmd.PersistentFlags().StringVar(&rebootSentinelFile, "reboot-sentinel", "/var/run/reboot-required",
@@ -158,6 +171,8 @@ func NewRootCommand() *cobra.Command {
 		"command for which a zero return code will trigger a reboot command")
 	rootCmd.PersistentFlags().StringVar(&rebootCommand, "reboot-command", "/bin/systemctl reboot",
 		"command to run when a reboot is required")
+	rootCmd.PersistentFlags().IntVar(&concurrency, "concurrency", 1,
+		"amount of nodes to concurrently reboot. Defaults to 1")
 
 	rootCmd.PersistentFlags().StringVar(&slackHookURL, "slack-hook-url", "",
 		"slack hook URL for reboot notifications [deprecated in favor of --notify-url]")
@@ -309,7 +324,8 @@ func buildHostCommand(pid int, command []string) []string {
 }
 
 func rebootRequired(sentinelCommand []string) bool {
-	if err := newCommand(sentinelCommand[0], sentinelCommand[1:]...).Run(); err != nil {
+	cmd := newCommand(sentinelCommand[0], sentinelCommand[1:]...)
+	if err := cmd.Run(); err != nil {
 		switch err := err.(type) {
 		case *exec.ExitError:
 			// We assume a non-zero exit code means 'reboot not required', but of course
@@ -317,6 +333,9 @@ func rebootRequired(sentinelCommand []string) bool {
 			// went wrong during its execution. In that case, not entering a reboot loop
 			// is the right thing to do, and we are logging stdout/stderr of the command
 			// so it should be obvious what is wrong.
+			if cmd.ProcessState.ExitCode() != 1 {
+				log.Warnf("sentinel command ended with unexpected exit code: %v", cmd.ProcessState.ExitCode())
+			}
 			return false
 		default:
 			// Something was grossly misconfigured, such as the command path being wrong.
@@ -342,6 +361,8 @@ type PrometheusBlockingChecker struct {
 	filter *regexp.Regexp
 	// bool to indicate if only firing alerts should be considered
 	firingOnly bool
+	// bool to indicate that we're only blocking on alerts which match the filter
+	filterMatchOnly bool
 }
 
 // KubernetesBlockingChecker contains info for connecting
@@ -355,8 +376,7 @@ type KubernetesBlockingChecker struct {
 }
 
 func (pb PrometheusBlockingChecker) isBlocked() bool {
-
-	alertNames, err := pb.promClient.ActiveAlerts(pb.filter, pb.firingOnly)
+	alertNames, err := pb.promClient.ActiveAlerts(pb.filter, pb.firingOnly, pb.filterMatchOnly)
 	if err != nil {
 		log.Warnf("Reboot blocked: prometheus query error: %v", err)
 		return true
@@ -375,7 +395,7 @@ func (pb PrometheusBlockingChecker) isBlocked() bool {
 func (kb KubernetesBlockingChecker) isBlocked() bool {
 	fieldSelector := fmt.Sprintf("spec.nodeName=%s,status.phase!=Succeeded,status.phase!=Failed,status.phase!=Unknown", kb.nodename)
 	for _, labelSelector := range kb.filter {
-		podList, err := kb.client.CoreV1().Pods("").List(context.TODO(), metav1.ListOptions{
+		podList, err := kb.client.CoreV1().Pods("").List(context.Background(), metav1.ListOptions{
 			LabelSelector: labelSelector,
 			FieldSelector: fieldSelector,
 			Limit:         10})
@@ -408,8 +428,14 @@ func rebootBlocked(blockers ...RebootBlocker) bool {
 	return false
 }
 
-func holding(lock *daemonsetlock.DaemonSetLock, metadata interface{}) bool {
-	holding, err := lock.Test(metadata)
+func holding(lock *daemonsetlock.DaemonSetLock, metadata interface{}, isMultiLock bool) bool {
+	var holding bool
+	var err error
+	if isMultiLock {
+		holding, err = lock.TestMultiple()
+	} else {
+		holding, err = lock.Test(metadata)
+	}
 	if err != nil {
 		log.Fatalf("Error testing lock: %v", err)
 	}
@@ -419,8 +445,17 @@ func holding(lock *daemonsetlock.DaemonSetLock, metadata interface{}) bool {
 	return holding
 }
 
-func acquire(lock *daemonsetlock.DaemonSetLock, metadata interface{}, TTL time.Duration) bool {
-	holding, holder, err := lock.Acquire(metadata, TTL)
+func acquire(lock *daemonsetlock.DaemonSetLock, metadata interface{}, TTL time.Duration, maxOwners int) bool {
+	var holding bool
+	var holder string
+	var err error
+	if maxOwners > 1 {
+		var holders []string
+		holding, holders, err = lock.AcquireMultiple(metadata, TTL, maxOwners)
+		holder = strings.Join(holders, ",")
+	} else {
+		holding, holder, err = lock.Acquire(metadata, TTL)
+	}
 	switch {
 	case err != nil:
 		log.Fatalf("Error acquiring lock: %v", err)
@@ -441,9 +476,16 @@ func throttle(releaseDelay time.Duration) {
 	}
 }
 
-func release(lock *daemonsetlock.DaemonSetLock) {
+func release(lock *daemonsetlock.DaemonSetLock, isMultiLock bool) {
 	log.Infof("Releasing lock")
-	if err := lock.Release(); err != nil {
+
+	var err error
+	if isMultiLock {
+		err = lock.ReleaseMultiple()
+	} else {
+		err = lock.Release()
+	}
+	if err != nil {
 		log.Fatalf("Error releasing lock: %v", err)
 	}
 }
@@ -467,6 +509,7 @@ func drain(client *kubernetes.Clientset, node *v1.Node) error {
 		Client:                          client,
 		Ctx:                             context.Background(),
 		GracePeriodSeconds:              drainGracePeriod,
+		PodSelector:                     drainPodSelector,
 		SkipWaitForDeleteTimeoutSeconds: skipWaitForDeleteTimeoutSeconds,
 		Force:                           true,
 		DeleteEmptyDirData:              true,
@@ -537,7 +580,7 @@ type nodeMeta struct {
 }
 
 func addNodeAnnotations(client *kubernetes.Clientset, nodeID string, annotations map[string]string) error {
-	node, err := client.CoreV1().Nodes().Get(context.TODO(), nodeID, metav1.GetOptions{})
+	node, err := client.CoreV1().Nodes().Get(context.Background(), nodeID, metav1.GetOptions{})
 	if err != nil {
 		log.Errorf("Error retrieving node object via k8s API: %s", err)
 		return err
@@ -553,7 +596,7 @@ func addNodeAnnotations(client *kubernetes.Clientset, nodeID string, annotations
 		return err
 	}
 
-	_, err = client.CoreV1().Nodes().Patch(context.TODO(), node.GetName(), types.StrategicMergePatchType, bytes, metav1.PatchOptions{})
+	_, err = client.CoreV1().Nodes().Patch(context.Background(), node.GetName(), types.StrategicMergePatchType, bytes, metav1.PatchOptions{})
 	if err != nil {
 		var annotationsErr string
 		for k, v := range annotations {
@@ -572,7 +615,7 @@ func deleteNodeAnnotation(client *kubernetes.Clientset, nodeID, key string) erro
 	// So we replace all instances of "/" with "~1" as per:
 	// https://tools.ietf.org/html/rfc6901#section-3
 	patch := []byte(fmt.Sprintf("[{\"op\":\"remove\",\"path\":\"/metadata/annotations/%s\"}]", strings.ReplaceAll(key, "/", "~1")))
-	_, err := client.CoreV1().Nodes().Patch(context.TODO(), nodeID, types.JSONPatchType, patch, metav1.PatchOptions{})
+	_, err := client.CoreV1().Nodes().Patch(context.Background(), nodeID, types.JSONPatchType, patch, metav1.PatchOptions{})
 	if err != nil {
 		log.Errorf("Error deleting node annotation %s via k8s API: %v", key, err)
 		return err
@@ -598,7 +641,7 @@ func updateNodeLabels(client *kubernetes.Clientset, node *v1.Node, labels []stri
 		log.Fatalf("Error marshalling node object into JSON: %v", err)
 	}
 
-	_, err = client.CoreV1().Nodes().Patch(context.TODO(), node.GetName(), types.StrategicMergePatchType, bytes, metav1.PatchOptions{})
+	_, err = client.CoreV1().Nodes().Patch(context.Background(), node.GetName(), types.StrategicMergePatchType, bytes, metav1.PatchOptions{})
 	if err != nil {
 		var labelsErr string
 		for _, label := range labels {
@@ -627,8 +670,8 @@ func rebootAsRequired(nodeID string, rebootCommand []string, sentinelCommand []s
 	source := rand.NewSource(time.Now().UnixNano())
 	tick := delaytick.New(source, 1*time.Minute)
 	for range tick {
-		if holding(lock, &nodeMeta) {
-			node, err := client.CoreV1().Nodes().Get(context.TODO(), nodeID, metav1.GetOptions{})
+		if holding(lock, &nodeMeta, concurrency > 1) {
+			node, err := client.CoreV1().Nodes().Get(context.Background(), nodeID, metav1.GetOptions{})
 			if err != nil {
 				log.Errorf("Error retrieving node object via k8s API: %v", err)
 				continue
@@ -660,7 +703,7 @@ func rebootAsRequired(nodeID string, rebootCommand []string, sentinelCommand []s
 				}
 			}
 			throttle(releaseDelay)
-			release(lock)
+			release(lock, concurrency > 1)
 			break
 		} else {
 			break
@@ -696,19 +739,7 @@ func rebootAsRequired(nodeID string, rebootCommand []string, sentinelCommand []s
 		}
 		log.Infof("Reboot required")
 
-		var blockCheckers []RebootBlocker
-		if prometheusURL != "" {
-			blockCheckers = append(blockCheckers, PrometheusBlockingChecker{promClient: promClient, filter: alertFilter, firingOnly: alertFiringOnly})
-		}
-		if podSelectors != nil {
-			blockCheckers = append(blockCheckers, KubernetesBlockingChecker{client: client, nodename: nodeID, filter: podSelectors})
-		}
-
-		if rebootBlocked(blockCheckers...) {
-			continue
-		}
-
-		node, err := client.CoreV1().Nodes().Get(context.TODO(), nodeID, metav1.GetOptions{})
+		node, err := client.CoreV1().Nodes().Get(context.Background(), nodeID, metav1.GetOptions{})
 		if err != nil {
 			log.Fatalf("Error retrieving node object via k8s API: %v", err)
 		}
@@ -730,17 +761,29 @@ func rebootAsRequired(nodeID string, rebootCommand []string, sentinelCommand []s
 			}
 		}
 
-		if !holding(lock, &nodeMeta) && !acquire(lock, &nodeMeta, TTL) {
+		if !holding(lock, &nodeMeta, concurrency > 1) && !acquire(lock, &nodeMeta, TTL, concurrency) {
 			// Prefer to not schedule pods onto this node to avoid draing the same pod multiple times.
 			preferNoScheduleTaint.Enable()
 			continue
 		}
 
+		var blockCheckers []RebootBlocker
+		if prometheusURL != "" {
+			blockCheckers = append(blockCheckers, PrometheusBlockingChecker{promClient: promClient, filter: alertFilter, firingOnly: alertFiringOnly, filterMatchOnly: alertFilterMatchOnly})
+		}
+		if podSelectors != nil {
+			blockCheckers = append(blockCheckers, KubernetesBlockingChecker{client: client, nodename: nodeID, filter: podSelectors})
+		}
+
+		if rebootBlocked(blockCheckers...) {
+			continue
+		}
+
 		err = drain(client, node)
 		if err != nil {
 			if !forceReboot {
 				log.Errorf("Unable to cordon or drain %s: %v, will release lock and retry cordon and drain before rebooting when lock is next acquired", node.GetName(), err)
-				release(lock)
+				release(lock, concurrency > 1)
 				log.Infof("Performing a best-effort uncordon after failed cordon and drain")
 				uncordon(client, node)
 				continue
@@ -818,6 +861,7 @@ func root(cmd *cobra.Command, args []string) {
 	log.Infof("Blocking Pod Selectors: %v", podSelectors)
 	log.Infof("Reboot schedule: %v", window)
 	log.Infof("Reboot check command: %s every %v", sentinelCommand, period)
+	log.Infof("Concurrency: %v", concurrency)
 	log.Infof("Reboot command: %s", restartCommand)
 	if annotateNodes {
 		log.Infof("Will annotate nodes during kured reboot operations")
@@ -833,5 +877,5 @@ func root(cmd *cobra.Command, args []string) {
 	go maintainRebootRequiredMetric(nodeID, hostSentinelCommand)
 
 	http.Handle("/metrics", promhttp.Handler())
-	log.Fatal(http.ListenAndServe(":8080", nil))
+	log.Fatal(http.ListenAndServe(fmt.Sprintf("%s:%d", metricsHost, metricsPort), nil))
 }
```
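In sum, the binary gains three user-facing knobs (`--concurrency`, `--metrics-host`, `--metrics-port`) plus `--alert-filter-match-only`, which inverts the alert filter from an ignore-list into a match-only list. An illustrative invocation using the flag names from the diff above; the values are examples, not defaults:

```console
# --concurrency=2                lets two nodes hold the reboot lock at once
# --metrics-host/--metrics-port  bind /metrics somewhere other than the old fixed :8080
# --alert-filter-match-only      block reboots only when the regexp matches an active alert
kured --concurrency=2 \
      --metrics-host=0.0.0.0 --metrics-port=9090 \
      --alert-filter-regexp='^RebootRequired$' --alert-filter-match-only=true
```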
go.mod (99 lines changed)

```diff
@@ -1,54 +1,48 @@
 module github.com/kubereboot/kured
 
-go 1.18
+go 1.19
 
-replace (
-	// Fix CVE-2022-1996 (for v2, Go Modules incompatible)
-	github.com/emicklei/go-restful => github.com/emicklei/go-restful v2.16.0+incompatible
+replace golang.org/x/net => golang.org/x/net v0.7.0
 
-	golang.org/x/net => golang.org/x/net v0.0.0-20220906165146-f3363e06e74c
-	golang.org/x/text => golang.org/x/text v0.3.8
-)
+replace github.com/emicklei/go-restful/v3 => github.com/emicklei/go-restful/v3 v3.10.2
 
 require (
-	github.com/containrrr/shoutrrr v0.6.1
+	github.com/containrrr/shoutrrr v0.7.1
 	github.com/google/shlex v0.0.0-20191202100458-e7afc7fbc510
-	github.com/google/uuid v1.1.5 // indirect
-	github.com/prometheus/client_golang v1.14.0
-	github.com/prometheus/common v0.39.0
-	github.com/sirupsen/logrus v1.9.0
-	github.com/spf13/cobra v1.6.1
+	github.com/google/uuid v1.3.0 // indirect
+	github.com/prometheus/client_golang v1.16.0
+	github.com/prometheus/common v0.44.0
+	github.com/sirupsen/logrus v1.9.3
+	github.com/spf13/cobra v1.7.0
 	github.com/spf13/pflag v1.0.5
-	github.com/spf13/viper v1.14.0
-	github.com/stretchr/testify v1.8.1
-	gotest.tools/v3 v3.4.0
-	k8s.io/api v0.25.5
-	k8s.io/apimachinery v0.25.5
-	k8s.io/client-go v0.25.5
-	k8s.io/kubectl v0.25.5
+	github.com/spf13/viper v1.16.0
+	github.com/stretchr/testify v1.8.4
+	gotest.tools/v3 v3.5.0
+	k8s.io/api v0.26.7
+	k8s.io/apimachinery v0.26.7
+	k8s.io/client-go v0.26.7
+	k8s.io/kubectl v0.26.7
 )
 
 require (
 	github.com/Azure/go-ansiterm v0.0.0-20210617225240-d185dfc1b5a1 // indirect
 	github.com/MakeNowJust/heredoc v1.0.0 // indirect
-	github.com/PuerkitoBio/purell v1.1.1 // indirect
-	github.com/PuerkitoBio/urlesc v0.0.0-20170810143723-de5bf2ad4578 // indirect
 	github.com/beorn7/perks v1.0.1 // indirect
-	github.com/cespare/xxhash/v2 v2.1.2 // indirect
+	github.com/cespare/xxhash/v2 v2.2.0 // indirect
 	github.com/chai2010/gettext-go v1.0.2 // indirect
 	github.com/davecgh/go-spew v1.1.1 // indirect
-	github.com/emicklei/go-restful/v3 v3.8.0 // indirect
+	github.com/emicklei/go-restful/v3 v3.9.0 // indirect
 	github.com/evanphx/json-patch v4.12.0+incompatible // indirect
 	github.com/exponent-io/jsonpath v0.0.0-20151013193312-d6023ce2651d // indirect
-	github.com/fatih/color v1.13.0 // indirect
+	github.com/fatih/color v1.14.1 // indirect
 	github.com/fsnotify/fsnotify v1.6.0 // indirect
 	github.com/go-errors/errors v1.0.1 // indirect
 	github.com/go-logr/logr v1.2.3 // indirect
 	github.com/go-openapi/jsonpointer v0.19.5 // indirect
-	github.com/go-openapi/jsonreference v0.19.5 // indirect
+	github.com/go-openapi/jsonreference v0.20.0 // indirect
 	github.com/go-openapi/swag v0.19.14 // indirect
 	github.com/gogo/protobuf v1.3.2 // indirect
-	github.com/golang/protobuf v1.5.2 // indirect
+	github.com/golang/protobuf v1.5.3 // indirect
 	github.com/google/btree v1.0.1 // indirect
 	github.com/google/gnostic v0.5.7-v3refs // indirect
 	github.com/google/go-cmp v0.5.9 // indirect
@@ -56,57 +50,56 @@ require (
 	github.com/gregjones/httpcache v0.0.0-20180305231024-9cad4c3443a7 // indirect
 	github.com/hashicorp/hcl v1.0.0 // indirect
 	github.com/imdario/mergo v0.3.6 // indirect
-	github.com/inconshreveable/mousetrap v1.0.1 // indirect
+	github.com/inconshreveable/mousetrap v1.1.0 // indirect
 	github.com/josharian/intern v1.0.0 // indirect
 	github.com/json-iterator/go v1.1.12 // indirect
 	github.com/liggitt/tabwriter v0.0.0-20181228230101-89fcab3d43de // indirect
-	github.com/magiconair/properties v1.8.6 // indirect
+	github.com/magiconair/properties v1.8.7 // indirect
 	github.com/mailru/easyjson v0.7.6 // indirect
-	github.com/mattn/go-colorable v0.1.12 // indirect
-	github.com/mattn/go-isatty v0.0.14 // indirect
+	github.com/mattn/go-colorable v0.1.13 // indirect
+	github.com/mattn/go-isatty v0.0.17 // indirect
 	github.com/matttproud/golang_protobuf_extensions v1.0.4 // indirect
 	github.com/mitchellh/go-wordwrap v1.0.0 // indirect
 	github.com/mitchellh/mapstructure v1.5.0 // indirect
 	github.com/moby/spdystream v0.2.0 // indirect
-	github.com/moby/term v0.0.0-20210619224110-3f7ff695adc6 // indirect
+	github.com/moby/term v0.0.0-20220808134915-39b0c02b01ae // indirect
 	github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
 	github.com/modern-go/reflect2 v1.0.2 // indirect
 	github.com/monochromegane/go-gitignore v0.0.0-20200626010858-205db1a8cc00 // indirect
 	github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
-	github.com/pelletier/go-toml v1.9.5 // indirect
-	github.com/pelletier/go-toml/v2 v2.0.5 // indirect
+	github.com/pelletier/go-toml/v2 v2.0.8 // indirect
 	github.com/peterbourgon/diskv v2.0.1+incompatible // indirect
 	github.com/pkg/errors v0.9.1 // indirect
 	github.com/pmezard/go-difflib v1.0.0 // indirect
-	github.com/prometheus/client_model v0.3.0 // indirect
-	github.com/prometheus/procfs v0.8.0 // indirect
-	github.com/russross/blackfriday v1.5.2 // indirect
-	github.com/spf13/afero v1.9.2 // indirect
-	github.com/spf13/cast v1.5.0 // indirect
+	github.com/prometheus/client_model v0.4.0 // indirect
+	github.com/prometheus/procfs v0.10.1 // indirect
+	github.com/russross/blackfriday/v2 v2.1.0 // indirect
+	github.com/spf13/afero v1.9.5 // indirect
+	github.com/spf13/cast v1.5.1 // indirect
 	github.com/spf13/jwalterweatherman v1.1.0 // indirect
-	github.com/subosito/gotenv v1.4.1 // indirect
+	github.com/subosito/gotenv v1.4.2 // indirect
 	github.com/xlab/treeprint v1.1.0 // indirect
 	go.starlark.net v0.0.0-20200306205701-8dd3e2ee1dd5 // indirect
-	golang.org/x/net v0.4.0 // indirect
-	golang.org/x/oauth2 v0.3.0 // indirect
-	golang.org/x/sys v0.3.0 // indirect
-	golang.org/x/term v0.3.0 // indirect
-	golang.org/x/text v0.5.0 // indirect
-	golang.org/x/time v0.0.0-20220609170525-579cf78fd858 // indirect
+	golang.org/x/net v0.10.0 // indirect
+	golang.org/x/oauth2 v0.8.0 // indirect
+	golang.org/x/sys v0.8.0 // indirect
+	golang.org/x/term v0.6.0 // indirect
+	golang.org/x/text v0.9.0 // indirect
+	golang.org/x/time v0.1.0 // indirect
 	google.golang.org/appengine v1.6.7 // indirect
-	google.golang.org/protobuf v1.28.1 // indirect
+	google.golang.org/protobuf v1.30.0 // indirect
 	gopkg.in/inf.v0 v0.9.1 // indirect
 	gopkg.in/ini.v1 v1.67.0 // indirect
 	gopkg.in/yaml.v2 v2.4.0 // indirect
 	gopkg.in/yaml.v3 v3.0.1 // indirect
-	k8s.io/cli-runtime v0.25.5 // indirect
-	k8s.io/component-base v0.25.5 // indirect
-	k8s.io/klog/v2 v2.70.1 // indirect
-	k8s.io/kube-openapi v0.0.0-20220803162953-67bda5d908f1 // indirect
-	k8s.io/utils v0.0.0-20220728103510-ee6ede2d64ed // indirect
+	k8s.io/cli-runtime v0.26.7 // indirect
+	k8s.io/component-base v0.26.7 // indirect
+	k8s.io/klog/v2 v2.80.1 // indirect
+	k8s.io/kube-openapi v0.0.0-20221012153701-172d655c2280 // indirect
+	k8s.io/utils v0.0.0-20221107191617-1a15be271d1d // indirect
 	sigs.k8s.io/json v0.0.0-20220713155537-f223a00ba0e2 // indirect
 	sigs.k8s.io/kustomize/api v0.12.1 // indirect
 	sigs.k8s.io/kustomize/kyaml v0.13.9 // indirect
 	sigs.k8s.io/structured-merge-diff/v4 v4.2.3 // indirect
-	sigs.k8s.io/yaml v1.2.0 // indirect
+	sigs.k8s.io/yaml v1.3.0 // indirect
 )
```
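A bump of this shape is usually produced with standard module tooling rather than by hand-editing; a plausible sketch, with module versions taken from the diff:

```console
go get k8s.io/api@v0.26.7 k8s.io/apimachinery@v0.26.7 k8s.io/client-go@v0.26.7 k8s.io/kubectl@v0.26.7
go get github.com/containrrr/shoutrrr@v0.7.1
go mod tidy   # drops indirects that fall away, e.g. PuerkitoBio/purell and urlesc
```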
```diff
@@ -8,14 +8,14 @@ metadata:
 apiVersion: apps/v1
 kind: DaemonSet
 metadata:
-  name: kured            # Must match `--ds-name`
+  name: kured # Must match `--ds-name`
   namespace: kube-system # Must match `--ds-namespace`
 spec:
   selector:
     matchLabels:
       name: kured
   updateStrategy:
-    type: RollingUpdate 
+    type: RollingUpdate
   template:
     metadata:
       labels:
@@ -31,12 +31,16 @@ spec:
       restartPolicy: Always
       containers:
         - name: kured
-          image: ghcr.io/kubereboot/kured:1.12.0
-          # If you find yourself here wondering why there is no
-          # :latest tag on Docker Hub,see the FAQ in the README
+          # If you find yourself here wondering why there is no
+          # :latest tag on Docker Hub,see the FAQ in the README
+          image: ghcr.io/kubereboot/kured:1.13.2
           imagePullPolicy: IfNotPresent
           securityContext:
             privileged: true # Give permission to nsenter /proc/1/ns/mnt
+            readOnlyRootFilesystem: true
           ports:
             - containerPort: 8080
               name: metrics
           env:
             # Pass in the name of the node on which this pod is scheduled
             # for use with drain/uncordon operations and lock acquisition
@@ -57,6 +61,7 @@ spec:
            # - --lock-ttl=0
            # - --prometheus-url=http://prometheus.monitoring.svc.cluster.local
            # - --alert-filter-regexp=^RebootRequired$
+           # - --alert-filter-match-only=false
            # - --alert-firing-only=false
            # - --reboot-sentinel=/var/run/reboot-required
            # - --prefer-no-schedule-taint=""
@@ -79,3 +84,6 @@ spec:
            # - --annotate-nodes=false
            # - --lock-release-delay=30m
            # - --log-format=text
+           # - --metrics-host=""
+           # - --metrics-port=8080
+           # - --concurrency=1
```
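The e2e jobs exercise exactly this manifest; the same smoke test works by hand, with the commands lifted from the workflow steps above:

```console
kubectl apply -f kured-rbac.yaml && kubectl apply -f kured-ds.yaml

# DESIRED/CURRENT/READY/UP-TO-DATE/AVAILABLE should all equal the node count
kubectl get ds -n kube-system kured
```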
```diff
@@ -36,7 +36,7 @@ func NewPromClient(conf papi.Config) (*PromClient, error) {
 // filter by regexp means when the regex finds the alert-name; the alert is exluded from the
 // block-list and will NOT block rebooting. query by includeLabel means,
 // if the query finds an alert, it will include it to the block-list and it WILL block rebooting.
-func (p *PromClient) ActiveAlerts(filter *regexp.Regexp, firingOnly bool) ([]string, error) {
+func (p *PromClient) ActiveAlerts(filter *regexp.Regexp, firingOnly, filterMatchOnly bool) ([]string, error) {
 
 	// get all alerts from prometheus
 	value, _, err := p.api.Query(context.Background(), "ALERTS", time.Now())
@@ -49,7 +49,7 @@ func (p *PromClient) ActiveAlerts(filter *regexp.Regexp, firingOnly bool) ([]str
 	activeAlertSet := make(map[string]bool)
 	for _, sample := range vector {
 		if alertName, isAlert := sample.Metric[model.AlertNameLabel]; isAlert && sample.Value != 0 {
-			if (filter == nil || !filter.MatchString(string(alertName))) && (!firingOnly || sample.Metric["alertstate"] == "firing") {
+			if matchesRegex(filter, string(alertName), filterMatchOnly) && (!firingOnly || sample.Metric["alertstate"] == "firing") {
 				activeAlertSet[string(alertName)] = true
 			}
 		}
@@ -67,3 +67,11 @@ func (p *PromClient) ActiveAlerts(filter *regexp.Regexp, firingOnly bool) ([]str
 
 	return nil, fmt.Errorf("Unexpected value type: %v", value)
 }
+
+func matchesRegex(filter *regexp.Regexp, alertName string, filterMatchOnly bool) bool {
+	if filter == nil {
+		return true
+	}
+
+	return filter.MatchString(string(alertName)) == filterMatchOnly
+}
```
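The behavioral pivot is in `matchesRegex`: with `filterMatchOnly` false the regexp stays an ignore-list (matching alerts are excluded from blocking), while true turns it into a match-only list (only matching alerts block). A standalone sketch of the same rule:

```go
package main

import (
	"fmt"
	"regexp"
)

// Same decision rule as the matchesRegex helper added in this diff:
// a nil filter blocks on every alert; otherwise the regexp match result
// must equal filterMatchOnly for the alert to count as blocking.
func matchesRegex(filter *regexp.Regexp, alertName string, filterMatchOnly bool) bool {
	if filter == nil {
		return true
	}
	return filter.MatchString(alertName) == filterMatchOnly
}

func main() {
	f := regexp.MustCompile("^RebootRequired$")
	fmt.Println(matchesRegex(f, "RebootRequired", false)) // false: matched alerts are ignored
	fmt.Println(matchesRegex(f, "RebootRequired", true))  // true: only matched alerts block
	fmt.Println(matchesRegex(f, "DiskFull", false))       // true: unmatched alerts still block
}
```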
```diff
@@ -45,62 +45,87 @@ func TestActiveAlerts(t *testing.T) {
 	addr := "http://localhost:10001"
 
 	for _, tc := range []struct {
-		it         string
-		rFilter    string
-		respBody   string
-		aName      string
-		wantN      int
-		firingOnly bool
+		it              string
+		rFilter         string
+		respBody        string
+		aName           string
+		wantN           int
+		firingOnly      bool
+		filterMatchOnly bool
 	}{
 		{
-			it:         "should return no active alerts",
-			respBody:   responsebody,
-			rFilter:    "",
-			wantN:      0,
-			firingOnly: false,
+			it:              "should return no active alerts",
+			respBody:        responsebody,
+			rFilter:         "",
+			wantN:           0,
+			firingOnly:      false,
+			filterMatchOnly: false,
 		},
 		{
-			it:         "should return a subset of all alerts",
-			respBody:   responsebody,
-			rFilter:    "Pod",
-			wantN:      3,
-			firingOnly: false,
+			it:              "should return a subset of all alerts",
+			respBody:        responsebody,
+			rFilter:         "Pod",
+			wantN:           3,
+			firingOnly:      false,
+			filterMatchOnly: false,
+		},
+		{
+			it:              "should return a subset of all alerts",
+			respBody:        responsebody,
+			rFilter:         "Gatekeeper",
+			wantN:           1,
+			firingOnly:      false,
+			filterMatchOnly: true,
 		},
 		{
-			it:         "should return all active alerts by regex",
-			respBody:   responsebody,
-			rFilter:    "*",
-			wantN:      5,
-			firingOnly: false,
+			it:              "should return all active alerts by regex",
+			respBody:        responsebody,
+			rFilter:         "*",
+			wantN:           5,
+			firingOnly:      false,
+			filterMatchOnly: false,
 		},
 		{
-			it:         "should return all active alerts by regex filter",
-			respBody:   responsebody,
-			rFilter:    "*",
-			wantN:      5,
-			firingOnly: false,
+			it:              "should return all active alerts by regex filter",
+			respBody:        responsebody,
+			rFilter:         "*",
+			wantN:           5,
+			firingOnly:      false,
+			filterMatchOnly: false,
 		},
 		{
-			it:         "should return only firing alerts if firingOnly is true",
-			respBody:   responsebody,
-			rFilter:    "*",
-			wantN:      4,
-			firingOnly: true,
+			it:              "should return only firing alerts if firingOnly is true",
+			respBody:        responsebody,
+			rFilter:         "*",
+			wantN:           4,
+			firingOnly:      true,
+			filterMatchOnly: false,
 		},
 		{
-			it:         "should return ScheduledRebootFailing active alerts",
-			respBody:   `{"status":"success","data":{"resultType":"vector","result":[{"metric":{"__name__":"ALERTS","alertname":"ScheduledRebootFailing","alertstate":"pending","severity":"warning","team":"platform-infra"},"value":[1622472933.973,"1"]}]}}`,
-			aName:      "ScheduledRebootFailing",
-			rFilter:    "*",
-			wantN:      1,
-			firingOnly: false,
+			it:              "should return ScheduledRebootFailing active alerts",
+			respBody:        `{"status":"success","data":{"resultType":"vector","result":[{"metric":{"__name__":"ALERTS","alertname":"ScheduledRebootFailing","alertstate":"pending","severity":"warning","team":"platform-infra"},"value":[1622472933.973,"1"]}]}}`,
+			aName:           "ScheduledRebootFailing",
+			rFilter:         "*",
+			wantN:           1,
+			firingOnly:      false,
+			filterMatchOnly: false,
 		},
 		{
-			it:         "should not return an active alert if RebootRequired is firing (regex filter)",
-			respBody:   `{"status":"success","data":{"resultType":"vector","result":[{"metric":{"__name__":"ALERTS","alertname":"RebootRequired","alertstate":"pending","severity":"warning","team":"platform-infra"},"value":[1622472933.973,"1"]}]}}`,
-			rFilter:    "RebootRequired",
-			wantN:      0,
-			firingOnly: false,
+			it:              "should not return an active alert if RebootRequired is firing (regex filter)",
+			respBody:        `{"status":"success","data":{"resultType":"vector","result":[{"metric":{"__name__":"ALERTS","alertname":"RebootRequired","alertstate":"pending","severity":"warning","team":"platform-infra"},"value":[1622472933.973,"1"]}]}}`,
+			rFilter:         "RebootRequired",
+			wantN:           0,
+			firingOnly:      false,
+			filterMatchOnly: false,
+		},
+		{
+			it:              "should not return an active alert if RebootRequired is firing (regex filter)",
+			respBody:        `{"status":"success","data":{"resultType":"vector","result":[{"metric":{"__name__":"ALERTS","alertname":"RebootRequired","alertstate":"pending","severity":"warning","team":"platform-infra"},"value":[1622472933.973,"1"]}]}}`,
+			rFilter:         "RebootRequired",
+			wantN:           1,
+			firingOnly:      false,
+			filterMatchOnly: true,
 		},
 	} {
 		// Start mockServer
@@ -125,7 +150,7 @@ func TestActiveAlerts(t *testing.T) {
 			log.Fatal(err)
 		}
 
-		result, err := p.ActiveAlerts(regex, tc.firingOnly)
+		result, err := p.ActiveAlerts(regex, tc.firingOnly, tc.filterMatchOnly)
 		if err != nil {
 			log.Fatal(err)
 		}
```
@@ -35,6 +35,11 @@ type lockAnnotationValue struct {
|
||||
TTL time.Duration `json:"TTL"`
|
||||
}
|
||||
|
||||
type multiLockAnnotationValue struct {
|
||||
MaxOwners int `json:"maxOwners"`
|
||||
LockAnnotations []lockAnnotationValue `json:"locks"`
|
||||
}
|
||||
|
||||
// New creates a daemonsetLock object containing the necessary data for follow up k8s requests
|
||||
func New(client *kubernetes.Clientset, nodeID, namespace, name, annotation string) *DaemonSetLock {
|
||||
return &DaemonSetLock{client, nodeID, namespace, name, annotation}
|
||||
@@ -70,7 +75,7 @@ func (dsl *DaemonSetLock) Acquire(metadata interface{}, TTL time.Duration) (bool
|
||||
}
|
||||
ds.ObjectMeta.Annotations[dsl.annotation] = string(valueBytes)
|
||||
|
||||
_, err = dsl.client.AppsV1().DaemonSets(dsl.namespace).Update(context.TODO(), ds, metav1.UpdateOptions{})
|
||||
_, err = dsl.client.AppsV1().DaemonSets(dsl.namespace).Update(context.Background(), ds, metav1.UpdateOptions{})
|
||||
if err != nil {
|
||||
if se, ok := err.(*errors.StatusError); ok && se.ErrStatus.Reason == metav1.StatusReasonConflict {
|
||||
// Something else updated the resource between us reading and writing - try again soon
|
||||
@@ -84,6 +89,92 @@ func (dsl *DaemonSetLock) Acquire(metadata interface{}, TTL time.Duration) (bool
|
||||
}
|
||||
}
|
||||
|
||||
// AcquireMultiple creates and annotates the daemonset with a multiple owner lock
|
||||
func (dsl *DaemonSetLock) AcquireMultiple(metadata interface{}, TTL time.Duration, maxOwners int) (bool, []string, error) {
|
||||
for {
|
||||
ds, err := dsl.GetDaemonSet(k8sAPICallRetrySleep, k8sAPICallRetryTimeout)
|
||||
if err != nil {
|
||||
return false, []string{}, fmt.Errorf("timed out trying to get daemonset %s in namespace %s: %w", dsl.name, dsl.namespace, err)
|
||||
}
|
||||
|
||||
annotation := multiLockAnnotationValue{}
|
||||
valueString, exists := ds.ObjectMeta.Annotations[dsl.annotation]
|
||||
if exists {
|
||||
if err := json.Unmarshal([]byte(valueString), &annotation); err != nil {
|
||||
return false, []string{}, fmt.Errorf("error getting multi lock: %w", err)
|
||||
}
|
||||
}
|
||||
|
||||
lockPossible, newAnnotation := dsl.canAcquireMultiple(annotation, metadata, TTL, maxOwners)
|
||||
if !lockPossible {
|
||||
return false, nodeIDsFromMultiLock(newAnnotation), nil
|
||||
}
|
||||
|
||||
if ds.ObjectMeta.Annotations == nil {
|
||||
ds.ObjectMeta.Annotations = make(map[string]string)
|
||||
}
|
||||
newAnnotationBytes, err := json.Marshal(&newAnnotation)
|
||||
if err != nil {
|
||||
return false, []string{}, fmt.Errorf("error marshalling new annotation lock: %w", err)
|
||||
}
|
||||
ds.ObjectMeta.Annotations[dsl.annotation] = string(newAnnotationBytes)
|
||||
|
||||
_, err = dsl.client.AppsV1().DaemonSets(dsl.namespace).Update(context.Background(), ds, metav1.UpdateOptions{})
|
||||
if err != nil {
|
||||
if se, ok := err.(*errors.StatusError); ok && se.ErrStatus.Reason == metav1.StatusReasonConflict {
|
||||
time.Sleep(time.Second)
|
||||
continue
|
||||
} else {
|
||||
return false, []string{}, fmt.Errorf("error updating daemonset with multi lock: %w", err)
|
||||
}
|
||||
}
|
||||
return true, nodeIDsFromMultiLock(newAnnotation), nil
|
||||
}
|
||||
}
|
||||
|
||||
func nodeIDsFromMultiLock(annotation multiLockAnnotationValue) []string {
|
||||
nodeIDs := make([]string, 0, len(annotation.LockAnnotations))
|
||||
for _, nodeLock := range annotation.LockAnnotations {
|
||||
nodeIDs = append(nodeIDs, nodeLock.NodeID)
|
||||
}
|
||||
return nodeIDs
|
||||
}
|
||||
|
||||
func (dsl *DaemonSetLock) canAcquireMultiple(annotation multiLockAnnotationValue, metadata interface{}, TTL time.Duration, maxOwners int) (bool, multiLockAnnotationValue) {
|
||||
newAnnotation := multiLockAnnotationValue{MaxOwners: maxOwners}
|
||||
freeSpace := false
|
||||
if annotation.LockAnnotations == nil || len(annotation.LockAnnotations) < maxOwners {
|
||||
freeSpace = true
|
||||
newAnnotation.LockAnnotations = annotation.LockAnnotations
|
||||
} else {
|
||||
for _, nodeLock := range annotation.LockAnnotations {
|
||||
if ttlExpired(nodeLock.Created, nodeLock.TTL) {
|
||||
freeSpace = true
|
||||
continue
|
||||
}
|
||||
newAnnotation.LockAnnotations = append(
|
||||
newAnnotation.LockAnnotations,
|
||||
nodeLock,
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
if freeSpace {
|
||||
newAnnotation.LockAnnotations = append(
|
||||
newAnnotation.LockAnnotations,
|
||||
lockAnnotationValue{
|
||||
NodeID: dsl.nodeID,
|
||||
Metadata: metadata,
|
||||
Created: time.Now().UTC(),
|
||||
TTL: TTL,
|
||||
},
|
||||
)
|
||||
return true, newAnnotation
|
||||
}
|
||||
|
||||
return false, multiLockAnnotationValue{}
|
||||
}

// Test attempts to check the kured daemonset lock status (existence, expiry) from the instantiated DaemonSetLock using client-go
func (dsl *DaemonSetLock) Test(metadata interface{}) (bool, error) {
	ds, err := dsl.GetDaemonSet(k8sAPICallRetrySleep, k8sAPICallRetryTimeout)
@@ -106,6 +197,30 @@ func (dsl *DaemonSetLock) Test(metadata interface{}) (bool, error) {
	return false, nil
}

// TestMultiple attempts to check whether this node holds an unexpired slot in the kured daemonset multi-owner lock
func (dsl *DaemonSetLock) TestMultiple() (bool, error) {
	ds, err := dsl.GetDaemonSet(k8sAPICallRetrySleep, k8sAPICallRetryTimeout)
	if err != nil {
		return false, fmt.Errorf("timed out trying to get daemonset %s in namespace %s: %w", dsl.name, dsl.namespace, err)
	}

	valueString, exists := ds.ObjectMeta.Annotations[dsl.annotation]
	if exists {
		value := multiLockAnnotationValue{}
		if err := json.Unmarshal([]byte(valueString), &value); err != nil {
			return false, err
		}

		// Holding the lock means this node has an unexpired entry in the owner list.
		for _, nodeLock := range value.LockAnnotations {
			if nodeLock.NodeID == dsl.nodeID && !ttlExpired(nodeLock.Created, nodeLock.TTL) {
				return true, nil
			}
		}
	}

	return false, nil
}
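
Together, TestMultiple and ReleaseMultiple (below) support a hold-across-reboot flow. A hypothetical sketch of the post-boot path, not kured's actual startup code:

// releaseIfHeld is a hypothetical post-reboot sketch: if this node still
// owns an unexpired slot from before the reboot, give it back.
func releaseIfHeld(lock *DaemonSetLock) error {
    holding, err := lock.TestMultiple()
    if err != nil {
        return err
    }
    if holding {
        return lock.ReleaseMultiple()
    }
    return nil
}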

// Release attempts to remove the lock data from the kured ds annotations using client-go
func (dsl *DaemonSetLock) Release() error {
	for {
@@ -130,7 +245,56 @@ func (dsl *DaemonSetLock) Release() error {

		delete(ds.ObjectMeta.Annotations, dsl.annotation)

-		_, err = dsl.client.AppsV1().DaemonSets(dsl.namespace).Update(context.TODO(), ds, metav1.UpdateOptions{})
+		_, err = dsl.client.AppsV1().DaemonSets(dsl.namespace).Update(context.Background(), ds, metav1.UpdateOptions{})
		if err != nil {
			if se, ok := err.(*errors.StatusError); ok && se.ErrStatus.Reason == metav1.StatusReasonConflict {
				// Something else updated the resource between us reading and writing - try again soon
				time.Sleep(time.Second)
				continue
			}
			return err
		}
		return nil
	}
}
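
The `-`/`+` pair above captures the point of this cleanup: context.TODO and context.Background both return an empty, never-cancelled root context, so the swap is behaviour-neutral. Background signals a deliberate top-level context, while TODO flags a call site still waiting for real context plumbing. A runnable illustration:

package main

import (
    "context"
    "fmt"
)

func main() {
    // Both are empty root contexts: no deadline, no values, never cancelled.
    fmt.Println(context.Background().Err(), context.TODO().Err()) // <nil> <nil>
}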

// ReleaseMultiple attempts to remove this node's entry from the multi-owner lock data in the kured ds annotations using client-go
func (dsl *DaemonSetLock) ReleaseMultiple() error {
	for {
		ds, err := dsl.GetDaemonSet(k8sAPICallRetrySleep, k8sAPICallRetryTimeout)
		if err != nil {
			return fmt.Errorf("timed out trying to get daemonset %s in namespace %s: %w", dsl.name, dsl.namespace, err)
		}

		valueString, exists := ds.ObjectMeta.Annotations[dsl.annotation]
		modified := false
		value := multiLockAnnotationValue{}
		if exists {
			if err := json.Unmarshal([]byte(valueString), &value); err != nil {
				return err
			}

			for idx, nodeLock := range value.LockAnnotations {
				if nodeLock.NodeID == dsl.nodeID {
					value.LockAnnotations = append(value.LockAnnotations[:idx], value.LockAnnotations[idx+1:]...)
					modified = true
					break
				}
			}
		}

		if !exists || !modified {
			return fmt.Errorf("lock not held")
		}

		newAnnotationBytes, err := json.Marshal(value)
		if err != nil {
			return fmt.Errorf("error marshalling new annotation on release: %w", err)
		}
		ds.ObjectMeta.Annotations[dsl.annotation] = string(newAnnotationBytes)

		_, err = dsl.client.AppsV1().DaemonSets(dsl.namespace).Update(context.Background(), ds, metav1.UpdateOptions{})
		if err != nil {
			if se, ok := err.(*errors.StatusError); ok && se.ErrStatus.Reason == metav1.StatusReasonConflict {
				// Something else updated the resource between us reading and writing - try again soon

@@ -1,6 +1,8 @@
package daemonsetlock

import (
+	"reflect"
+	"sort"
	"testing"
	"time"
)
@@ -26,3 +28,181 @@ func TestTtlExpired(t *testing.T) {
		}
	}
}

// multiLockAnnotationsAreEqualByNodes compares two multi-lock annotations by
// their sorted owner node IDs, ignoring creation timestamps and TTLs.
func multiLockAnnotationsAreEqualByNodes(src, dst multiLockAnnotationValue) bool {
	srcNodes := []string{}
	for _, srcLock := range src.LockAnnotations {
		srcNodes = append(srcNodes, srcLock.NodeID)
	}
	sort.Strings(srcNodes)

	dstNodes := []string{}
	for _, dstLock := range dst.LockAnnotations {
		dstNodes = append(dstNodes, dstLock.NodeID)
	}
	sort.Strings(dstNodes)

	return reflect.DeepEqual(srcNodes, dstNodes)
}

func TestCanAcquireMultiple(t *testing.T) {
	node1Name := "n1"
	node2Name := "n2"
	node3Name := "n3"
	testCases := []struct {
		name          string
		daemonSetLock DaemonSetLock
		maxOwners     int
		current       multiLockAnnotationValue
		desired       multiLockAnnotationValue
		lockPossible  bool
	}{
		{
			name: "empty_lock",
			daemonSetLock: DaemonSetLock{
				nodeID: node1Name,
			},
			maxOwners: 2,
			current:   multiLockAnnotationValue{},
			desired: multiLockAnnotationValue{
				MaxOwners: 2,
				LockAnnotations: []lockAnnotationValue{
					{NodeID: node1Name},
				},
			},
			lockPossible: true,
		},
		{
			name: "partial_lock",
			daemonSetLock: DaemonSetLock{
				nodeID: node1Name,
			},
			maxOwners: 2,
			current: multiLockAnnotationValue{
				MaxOwners: 2,
				LockAnnotations: []lockAnnotationValue{
					{NodeID: node2Name},
				},
			},
			desired: multiLockAnnotationValue{
				MaxOwners: 2,
				LockAnnotations: []lockAnnotationValue{
					{NodeID: node1Name},
					{NodeID: node2Name},
				},
			},
			lockPossible: true,
		},
		{
			name: "full_lock",
			daemonSetLock: DaemonSetLock{
				nodeID: node1Name,
			},
			maxOwners: 2,
			current: multiLockAnnotationValue{
				MaxOwners: 2,
				LockAnnotations: []lockAnnotationValue{
					{
						NodeID:  node2Name,
						Created: time.Now().UTC().Add(-1 * time.Minute),
						TTL:     time.Hour,
					},
					{
						NodeID:  node3Name,
						Created: time.Now().UTC().Add(-1 * time.Minute),
						TTL:     time.Hour,
					},
				},
			},
			desired: multiLockAnnotationValue{
				MaxOwners: 2,
				LockAnnotations: []lockAnnotationValue{
					{NodeID: node2Name},
					{NodeID: node3Name},
				},
			},
			lockPossible: false,
		},
		{
			name: "full_with_one_expired_lock",
			daemonSetLock: DaemonSetLock{
				nodeID: node1Name,
			},
			maxOwners: 2,
			current: multiLockAnnotationValue{
				MaxOwners: 2,
				LockAnnotations: []lockAnnotationValue{
					{
						NodeID:  node2Name,
						Created: time.Now().UTC().Add(-1 * time.Hour),
						TTL:     time.Minute,
					},
					{
						NodeID:  node3Name,
						Created: time.Now().UTC().Add(-1 * time.Minute),
						TTL:     time.Hour,
					},
				},
			},
			desired: multiLockAnnotationValue{
				MaxOwners: 2,
				LockAnnotations: []lockAnnotationValue{
					{NodeID: node1Name},
					{NodeID: node3Name},
				},
			},
			lockPossible: true,
		},
		{
			name: "full_with_all_expired_locks",
			daemonSetLock: DaemonSetLock{
				nodeID: node1Name,
			},
			maxOwners: 2,
			current: multiLockAnnotationValue{
				MaxOwners: 2,
				LockAnnotations: []lockAnnotationValue{
					{
						NodeID:  node2Name,
						Created: time.Now().UTC().Add(-1 * time.Hour),
						TTL:     time.Minute,
					},
					{
						NodeID:  node3Name,
						Created: time.Now().UTC().Add(-1 * time.Hour),
						TTL:     time.Minute,
					},
				},
			},
			desired: multiLockAnnotationValue{
				MaxOwners: 2,
				LockAnnotations: []lockAnnotationValue{
					{NodeID: node1Name},
				},
			},
			lockPossible: true,
		},
	}

	for _, testCase := range testCases {
		t.Run(testCase.name, func(t *testing.T) {
			lockPossible, actual := testCase.daemonSetLock.canAcquireMultiple(testCase.current, struct{}{}, time.Minute, testCase.maxOwners)
			if lockPossible != testCase.lockPossible {
				t.Fatalf(
					"unexpected result for lock possible (got %t, expected %t, new annotation %v)",
					lockPossible,
					testCase.lockPossible,
					actual,
				)
			}

			if lockPossible && (!multiLockAnnotationsAreEqualByNodes(actual, testCase.desired) || testCase.desired.MaxOwners != actual.MaxOwners) {
				t.Fatalf(
					"expected lock %v but got %v",
					testCase.desired,
					actual,
				)
			}
		})
	}
}
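
These table-driven cases exercise canAcquireMultiple in isolation, with no API server needed. Assuming the package sits at pkg/daemonsetlock as in the kured tree, they can be run with:

    go test ./pkg/daemonsetlock -run TestCanAcquireMultiple -v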

@@ -65,7 +65,7 @@ func (t *Taint) Disable() {
}

func taintExists(client *kubernetes.Clientset, nodeID, taintName string) (bool, int, *v1.Node) {
-	updatedNode, err := client.CoreV1().Nodes().Get(context.TODO(), nodeID, metav1.GetOptions{})
+	updatedNode, err := client.CoreV1().Nodes().Get(context.Background(), nodeID, metav1.GetOptions{})
	if err != nil || updatedNode == nil {
		log.Fatalf("Error reading node %s: %v", nodeID, err)
	}
@@ -153,7 +153,7 @@ func preferNoSchedule(client *kubernetes.Clientset, nodeID, taintName string, ef
		log.Fatalf("Error encoding taint patch for node %s: %v", nodeID, err)
	}

-	_, err = client.CoreV1().Nodes().Patch(context.TODO(), nodeID, types.JSONPatchType, patchBytes, metav1.PatchOptions{})
+	_, err = client.CoreV1().Nodes().Patch(context.Background(), nodeID, types.JSONPatchType, patchBytes, metav1.PatchOptions{})
	if err != nil {
		log.Fatalf("Error patching taint for node %s: %v", nodeID, err)
	}

@@ -46,6 +46,12 @@ do
    echo "${#was_unschedulable[@]} nodes were removed from pool once:" "${!was_unschedulable[@]}"
    echo "${#has_recovered[@]} nodes removed from the pool are now back:" "${!has_recovered[@]}"

    #"$KUBECTL_CMD" logs -n kube-system -l name=kured --ignore-errors > "$tmp_dir"/node_output
    #if [[ "$DEBUG" == "true" ]]; then
    #    echo "Kured pod logs:"
    #    cat "$tmp_dir"/node_output
    #fi

    "$KUBECTL_CMD" get nodes -o custom-columns=NAME:.metadata.name,SCHEDULABLE:.spec.unschedulable --no-headers > "$tmp_dir"/node_output
    if [[ "$DEBUG" == "true" ]]; then
        # This is useful to see if a node gets stuck after drain, and doesn't