Compare commits

..

2 Commits

Author SHA1 Message Date
Christian Kotzbauer
59cbea5e25 feat: add another background
Signed-off-by: Christian Kotzbauer <git@ckotzbauer.de>
2023-08-14 19:08:44 +02:00
Christian Kotzbauer
776c35c1e1 cleanup: use Background context
Signed-off-by: Christian Kotzbauer <git@ckotzbauer.de>
2023-08-14 19:08:23 +02:00
56 changed files with 2639 additions and 2754 deletions

13
.github/kind-cluster-1.25.yaml vendored Normal file
View File

@@ -0,0 +1,13 @@
# kind cluster layout for Kubernetes v1.25: three control-plane nodes and
# two workers (five nodes in total), all pinned to the same node image.
kind: Cluster
apiVersion: kind.x-k8s.io/v1alpha4
nodes:
  - role: control-plane
    image: "kindest/node:v1.25.11"
  - role: control-plane
    image: "kindest/node:v1.25.11"
  - role: control-plane
    image: "kindest/node:v1.25.11"
  - role: worker
    image: "kindest/node:v1.25.11"
  - role: worker
    image: "kindest/node:v1.25.11"

13
.github/kind-cluster-1.26.yaml vendored Normal file
View File

@@ -0,0 +1,13 @@
# kind cluster layout for Kubernetes v1.26: three control-plane nodes and
# two workers (five nodes in total), all pinned to the same node image.
kind: Cluster
apiVersion: kind.x-k8s.io/v1alpha4
nodes:
  - role: control-plane
    image: "kindest/node:v1.26.6"
  - role: control-plane
    image: "kindest/node:v1.26.6"
  - role: control-plane
    image: "kindest/node:v1.26.6"
  - role: worker
    image: "kindest/node:v1.26.6"
  - role: worker
    image: "kindest/node:v1.26.6"

13
.github/kind-cluster-1.27.yaml vendored Normal file
View File

@@ -0,0 +1,13 @@
# kind cluster layout for Kubernetes v1.27: three control-plane nodes and
# two workers (five nodes in total), all pinned to the same node image.
kind: Cluster
apiVersion: kind.x-k8s.io/v1alpha4
nodes:
  - role: control-plane
    image: "kindest/node:v1.27.3"
  - role: control-plane
    image: "kindest/node:v1.27.3"
  - role: control-plane
    image: "kindest/node:v1.27.3"
  - role: worker
    image: "kindest/node:v1.27.3"
  - role: worker
    image: "kindest/node:v1.27.3"

View File

@@ -1,9 +0,0 @@
# kind cluster layout: one control-plane node and two workers,
# all pinned to the kindest/node v1.33.4 image.
kind: Cluster
apiVersion: kind.x-k8s.io/v1alpha4
nodes:
  - role: control-plane
    image: "kindest/node:v1.33.4"
  - role: worker
    image: "kindest/node:v1.33.4"
  - role: worker
    image: "kindest/node:v1.33.4"

View File

@@ -1,9 +0,0 @@
# kind cluster layout: one control-plane node and two workers,
# all pinned to the kindest/node v1.34.0 image.
kind: Cluster
apiVersion: kind.x-k8s.io/v1alpha4
nodes:
  - role: control-plane
    image: "kindest/node:v1.34.0"
  - role: worker
    image: "kindest/node:v1.34.0"
  - role: worker
    image: "kindest/node:v1.34.0"

View File

@@ -1,9 +0,0 @@
# kind cluster layout: one control-plane node and two workers,
# all pinned to the kindest/node v1.32.8 image.
kind: Cluster
apiVersion: kind.x-k8s.io/v1alpha4
nodes:
  - role: control-plane
    image: "kindest/node:v1.32.8"
  - role: worker
    image: "kindest/node:v1.32.8"
  - role: worker
    image: "kindest/node:v1.32.8"

View File

@@ -10,21 +10,28 @@ test -z "$VERSION" && {
}
test -z "$TMPDIR" && TMPDIR="$(mktemp -d)"
# goreleaser uses arm64 instead of aarch64
goreleaser_arch=$(uname -m | sed -e 's/aarch64/arm64/g' -e 's/ppc64le/ppc64/' -e 's/armv7l/armv7/' )
TAR_FILE="$TMPDIR/${FILE_BASENAME}_$(uname -s)_${goreleaser_arch}.tar.gz"
TAR_FILE="$TMPDIR/${FILE_BASENAME}_$(uname -s)_$(uname -m).tar.gz"
export TAR_FILE
(
echo "Downloading GoReleaser $VERSION..."
curl -sfLo "$TAR_FILE" \
"$RELEASES_URL/download/$VERSION/${FILE_BASENAME}_$(uname -s)_${goreleaser_arch}.tar.gz"
"$RELEASES_URL/download/$VERSION/${FILE_BASENAME}_$(uname -s)_$(uname -m).tar.gz"
cd "$TMPDIR"
curl -sfLo "checksums.txt" "$RELEASES_URL/download/$VERSION/checksums.txt"
curl -sfLo "checksums.txt.sig" "$RELEASES_URL/download/$VERSION/checksums.txt.sig"
echo "Verifying checksums..."
sha256sum --ignore-missing --quiet --check checksums.txt
if command -v cosign >/dev/null 2>&1; then
echo "Verifying signatures..."
COSIGN_EXPERIMENTAL=1 cosign verify-blob \
--signature checksums.txt.sig \
checksums.txt
else
echo "Could not verify signatures, cosign is not installed."
fi
)
tar -xf "$TAR_FILE" -O goreleaser > "$TMPDIR/goreleaser"
rm "$TMPDIR/checksums.txt"
rm "$TMPDIR/checksums.txt" "$TMPDIR/checksums.txt.sig"
rm "$TAR_FILE"

View File

@@ -19,10 +19,7 @@ on:
# The branches below must be a subset of the branches above
branches: [ "main" ]
schedule:
- cron: '24 13 * * 6'
permissions:
contents: read
- cron: '24 13 * * 3'
jobs:
analyze:
@@ -41,17 +38,12 @@ jobs:
# Learn more about CodeQL language support at https://aka.ms/codeql-docs/language-support
steps:
- name: Harden Runner
uses: step-security/harden-runner@ec9f2d5744a09debf3a187a3f4f675c53b671911 # v2.13.0
with:
egress-policy: audit
- name: Checkout repository
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
uses: actions/checkout@v3
# Initializes the CodeQL tools for scanning.
- name: Initialize CodeQL
uses: github/codeql-action/init@3c3833e0f8c1c83d449a7478aa59c036a9165498 # v3.29.11
uses: github/codeql-action/init@v2
with:
languages: ${{ matrix.language }}
# If you wish to specify custom queries, you can do so here or in a config file.
@@ -65,7 +57,7 @@ jobs:
# Autobuild attempts to build any compiled languages (C/C++, C#, or Java).
# If this step fails, then you should remove it and run the build manually (see below)
- name: Autobuild
uses: github/codeql-action/autobuild@3c3833e0f8c1c83d449a7478aa59c036a9165498 # v3.29.11
uses: github/codeql-action/autobuild@v2
# Command-line programs to run using the OS shell.
# 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun
@@ -78,6 +70,6 @@ jobs:
# ./location_of_script_within_repo/buildscript.sh
- name: Perform CodeQL Analysis
uses: github/codeql-action/analyze@3c3833e0f8c1c83d449a7478aa59c036a9165498 # v3.29.11
uses: github/codeql-action/analyze@v2
with:
category: "/language:${{matrix.language}}"

View File

@@ -1,27 +0,0 @@
# Dependency Review Action
#
# This Action will scan dependency manifest files that change as part of a Pull Request,
# surfacing known-vulnerable versions of the packages declared or updated in the PR.
# Once installed, if the workflow run is marked as required,
# PRs introducing known-vulnerable packages will be blocked from merging.
#
# Source repository: https://github.com/actions/dependency-review-action
name: 'Dependency Review'
on: [pull_request]

# Workflow-wide default: the token may only read repository contents.
permissions:
  contents: read

jobs:
  dependency-review:
    runs-on: ubuntu-latest
    steps:
      # Runs first; egress policy is "audit" rather than "block".
      - name: Harden Runner
        uses: step-security/harden-runner@ec9f2d5744a09debf3a187a3f4f675c53b671911 # v2.13.0
        with:
          egress-policy: audit
      - name: 'Checkout Repository'
        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
      - name: 'Dependency Review'
        uses: actions/dependency-review-action@595b5aeba73380359d98a5e087f648dbb0edce1b # v4.7.3

View File

@@ -10,9 +10,6 @@ env:
REGISTRY: ghcr.io
IMAGE_NAME: ${{ github.repository }}
permissions:
contents: read
jobs:
tag-scan-and-push-final-image:
name: "Build, scan, and publish tagged image"
@@ -22,21 +19,16 @@ jobs:
contents: write
packages: write
steps:
- name: Harden Runner
uses: step-security/harden-runner@ec9f2d5744a09debf3a187a3f4f675c53b671911 # v2.13.0
with:
egress-policy: audit
- uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
- uses: actions/checkout@v3
- name: Ensure go version
uses: actions/setup-go@d35c59abb061a4a6fb18e82ac0862c26744d6ab5 # v5.5.0
uses: actions/setup-go@v4
with:
go-version-file: 'go.mod'
check-latest: true
- name: Login to ghcr.io
uses: docker/login-action@184bdaa0721073962dff0199f1fb9940f07167d1 # v3.5.0
uses: docker/login-action@v2
with:
registry: ${{ env.REGISTRY }}
username: ${{ github.actor }}
@@ -44,15 +36,15 @@ jobs:
- name: Extract metadata (tags, labels) for Docker
id: meta
uses: docker/metadata-action@c1e51972afc2121e065aed6d45c65596fe445f3f
uses: docker/metadata-action@818d4b7b91585d195f67373fd9cb0332e31a7175
with:
images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
- name: Set up QEMU
uses: docker/setup-qemu-action@29109295f81e9208d7d86ff1c6c12d2833863392 # v3.6.0
uses: docker/setup-qemu-action@v2
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@e468171a9de216ec08956ac3ada2f0791b6bd435 # v3.11.1
uses: docker/setup-buildx-action@v2
- name: Find current tag version
run: echo "sha_short=$(git rev-parse --short HEAD)" >> $GITHUB_OUTPUT
@@ -65,9 +57,10 @@ jobs:
run: make kured-release-snapshot
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
COSIGN_EXPERIMENTAL: 1
- name: Build image
uses: docker/build-push-action@263435318d21b8e681c14492fe198d362a7d2c83 # v6.18.0
uses: docker/build-push-action@v4
with:
context: .
platforms: linux/arm64, linux/amd64, linux/arm/v7, linux/arm/v6, linux/386
@@ -78,11 +71,15 @@ jobs:
- name: Generate SBOM
run: |
hack/bin/syft ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ steps.tags.outputs.sha_short }} -o spdx > kured.sbom
.tmp/syft ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ steps.tags.outputs.sha_short }} -o spdx > kured.sbom
- name: Sign and attest artifacts
run: |
hack/bin/cosign sign -y -r ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ steps.tags.outputs.sha_short }}
hack/bin/cosign sign-blob -y --output-signature kured.sbom.sig --output-certificate kured.sbom.pem kured.sbom
hack/bin/cosign attest -y --type spdx --predicate kured.sbom ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ steps.tags.outputs.sha_short }}
hack/bin/cosign attach sbom --type spdx --sbom kured.sbom ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ steps.tags.outputs.sha_short }}
.tmp/cosign sign -f -r ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ steps.tags.outputs.sha_short }}
.tmp/cosign sign-blob --output-signature kured.sbom.sig --output-certificate kured.sbom.pem kured.sbom
.tmp/cosign attest -f --type spdx --predicate kured.sbom ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ steps.tags.outputs.sha_short }}
.tmp/cosign attach sbom --type spdx --sbom kured.sbom ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ steps.tags.outputs.sha_short }}
env:
COSIGN_EXPERIMENTAL: 1

View File

@@ -1,25 +0,0 @@
# Checks links found in Markdown, YAML and Go sources with lychee.
# Triggered on every pull request, and on pushes that touch a Markdown file.
name: Verify Docs Links
on:
  pull_request:
  push:
    paths:
      - '**.md'

jobs:
  pr-check-docs-links:
    name: Check docs for incorrect links
    runs-on: ubuntu-latest
    steps:
      - name: Harden Runner
        uses: step-security/harden-runner@ec9f2d5744a09debf3a187a3f4f675c53b671911 # v2.13.0
        with:
          egress-policy: audit
      - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
      - name: Link Checker
        uses: lycheeverse/lychee-action@885c65f3dc543b57c898c8099f4e08c8afd178a2
        env:
          GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN}}
        with:
          args: --verbose --no-progress '*.md' '*.yaml' '*/*/*.go' --exclude-link-local
          # Make the job fail when the link check reports a problem.
          fail: true

View File

@@ -4,30 +4,65 @@ on:
push:
jobs:
pr-short-tests:
name: Run short go tests
pr-gotest:
name: Run go tests
runs-on: ubuntu-latest
steps:
- name: Harden Runner
uses: step-security/harden-runner@ec9f2d5744a09debf3a187a3f4f675c53b671911 # v2.13.0
with:
egress-policy: audit
- name: checkout
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
uses: actions/checkout@v3
- name: Ensure go version
uses: actions/setup-go@d35c59abb061a4a6fb18e82ac0862c26744d6ab5 # v5.5.0
uses: actions/setup-go@v4
with:
go-version-file: 'go.mod'
check-latest: true
- name: run tests
run: make test
run: go test -json ./... > test.json
- name: Annotate tests
if: always()
uses: guyarb/golang-test-annoations@2941118d7ef622b1b3771d1ff6eae9e90659eb26 # v0.8.0
uses: guyarb/golang-test-annoations@v0.7.0
with:
test-results: test.json
pr-shellcheck:
name: Lint bash code with shellcheck
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- name: Run ShellCheck
uses: bewuethr/shellcheck-action@v2
pr-lint-code:
name: Lint golang code
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- name: Ensure go version
uses: actions/setup-go@v4
with:
go-version-file: 'go.mod'
check-latest: true
- name: Lint cmd folder
uses: Jerome1337/golint-action@v1.0.3
with:
golint-path: './cmd/...'
- name: Lint pkg folder
uses: Jerome1337/golint-action@v1.0.3
with:
golint-path: './pkg/...'
pr-check-docs-links:
name: Check docs for incorrect links
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- name: Link Checker
uses: lycheeverse/lychee-action@ec3ed119d4f44ad2673a7232460dc7dff59d2421
env:
GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN}}
with:
args: --verbose --no-progress '*.md' '*.yaml' '*/*/*.go' --exclude-link-local
fail: true
# This should not be made a mandatory test
# It is only used to make us aware of any potential security failure, that
# should trigger a bump of the image in build/.
@@ -35,30 +70,25 @@ jobs:
name: Build image and scan it against known vulnerabilities
runs-on: ubuntu-latest
steps:
- name: Harden Runner
uses: step-security/harden-runner@ec9f2d5744a09debf3a187a3f4f675c53b671911 # v2.13.0
with:
egress-policy: audit
- uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
- uses: actions/checkout@v3
- name: Ensure go version
uses: actions/setup-go@d35c59abb061a4a6fb18e82ac0862c26744d6ab5 # v5.5.0
uses: actions/setup-go@v4
with:
go-version-file: 'go.mod'
check-latest: true
- name: Set up QEMU
uses: docker/setup-qemu-action@29109295f81e9208d7d86ff1c6c12d2833863392 # v3.6.0
uses: docker/setup-qemu-action@v2
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@e468171a9de216ec08956ac3ada2f0791b6bd435 # v3.11.1
uses: docker/setup-buildx-action@v2
- name: Setup GoReleaser
run: make bootstrap-tools
- name: Find current tag version
run: echo "sha_short=$(git rev-parse --short HEAD)" >> $GITHUB_OUTPUT
id: tags
- name: Build image
run: VERSION="${{ steps.tags.outputs.sha_short }}" DH_ORG="${{ github.repository_owner }}" make image
run: VERSION="${{ steps.tags.outputs.sha_short }}" make image
- name: Run Trivy vulnerability scanner
uses: aquasecurity/trivy-action@f9424c10c36e288d5fa79bd3dfd1aeb2d6eae808
uses: aquasecurity/trivy-action@41f05d9ecffa2ed3f1580af306000f734b733e54
with:
image-ref: 'ghcr.io/${{ github.repository }}:${{ steps.tags.outputs.sha_short }}'
format: 'table'
@@ -73,91 +103,166 @@ jobs:
# - Ensure manifests work with the latest versions even with no manifest change
# (compared to helm charts, manifests cannot easily template changes based on versions)
# Helm charts are _trailing_ releases, while manifests are done during development.
# This test uses the "command" reboot-method.
e2e-manifests:
name: End-to-End test with kured with code and manifests from HEAD
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
testname:
- "TestE2EWithCommand"
- "TestE2EWithSignal"
- "TestE2EConcurrentWithCommand"
- "TestE2EConcurrentWithSignal"
kubernetes_version:
- "previous"
- "current"
- "next"
kubernetes:
- "1.25"
- "1.26"
- "1.27"
steps:
- name: Harden Runner
uses: step-security/harden-runner@ec9f2d5744a09debf3a187a3f4f675c53b671911 # v2.13.0
with:
egress-policy: audit
- uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
- uses: actions/checkout@v3
- name: Ensure go version
uses: actions/setup-go@d35c59abb061a4a6fb18e82ac0862c26744d6ab5 # v5.5.0
uses: actions/setup-go@v4
with:
go-version-file: 'go.mod'
check-latest: true
- name: Set up QEMU
uses: docker/setup-qemu-action@29109295f81e9208d7d86ff1c6c12d2833863392 # v3.6.0
uses: docker/setup-qemu-action@v2
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@e468171a9de216ec08956ac3ada2f0791b6bd435 # v3.11.1
uses: docker/setup-buildx-action@v2
- name: Setup GoReleaser
run: make bootstrap-tools
- name: Find current tag version
run: echo "sha_short=$(git rev-parse --short HEAD)" >> $GITHUB_OUTPUT
id: tags
- name: Install kind
uses: helm/kind-action@a1b0e391336a6ee6713a0583f8c6240d70863de3 # v1.12.0
- name: Build artifacts
run: |
VERSION="${{ steps.tags.outputs.sha_short }}" make image
VERSION="${{ steps.tags.outputs.sha_short }}" make manifest
- name: Workaround "Failed to attach 1 to compat systemd cgroup /actions_job/..." on gh actions
run: |
sudo bash << EOF
cp /etc/docker/daemon.json /etc/docker/daemon.json.old
echo '{}' > /etc/docker/daemon.json
systemctl restart docker || journalctl --no-pager -n 500
systemctl status docker
EOF
# Default name for helm/kind-action kind clusters is "chart-testing"
- name: Create kind cluster with 5 nodes
uses: helm/kind-action@v1.8.0
with:
install_only: true
version: v0.30.0
- name: Run specific e2e tests
run: make e2e-test ARGS="-run ^${{ matrix.testname }}/${{ matrix.kubernetes_version }}"
config: .github/kind-cluster-${{ matrix.kubernetes }}.yaml
version: v0.14.0
- name: Preload previously built images onto kind cluster
run: kind load docker-image ghcr.io/${{ github.repository }}:${{ steps.tags.outputs.sha_short }} --name chart-testing
- name: Do not wait for an hour before detecting the rebootSentinel
run: |
sed -i 's/#\(.*\)--period=1h/\1--period=30s/g' kured-ds.yaml
- name: Install kured with kubectl
run: |
kubectl apply -f kured-rbac.yaml && kubectl apply -f kured-ds.yaml
- name: Ensure kured is ready
uses: nick-invision/retry@v2.8.3
with:
timeout_minutes: 10
max_attempts: 10
retry_wait_seconds: 60
# DESIRED, CURRENT, READY, UP-TO-DATE and AVAILABLE should all be equal to cluster_size
command: "kubectl get ds -n kube-system kured | grep -E 'kured.*5.*5.*5.*5.*5'"
- name: Create reboot sentinel files
run: |
./tests/kind/create-reboot-sentinels.sh
- name: Follow reboot until success
env:
DEBUG: true
run: |
./tests/kind/follow-coordinated-reboot.sh
e2e-tests-singleversion:
name: End-to-End test targetting a single version of kubernetes
# This ensures the latest code works with the manifests built from tree.
# It is useful for two things:
# - Test manifests changes (obviously), ensuring they don't break existing clusters
# - Ensure manifests work with the latest versions even with no manifest change
# (compared to helm charts, manifests cannot easily template changes based on versions)
# Helm charts are _trailing_ releases, while manifests are done during development.
# Concurrency = 2
e2e-manifests-concurent:
name: End-to-End test with kured with code and manifests from HEAD (concurrent)
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
testname:
- "TestCordonningIsKept/concurrency1"
- "TestCordonningIsKept/concurrency2"
- "TestE2EBlocker/podblocker"
kubernetes:
- "1.25"
- "1.26"
- "1.27"
steps:
- name: Harden Runner
uses: step-security/harden-runner@ec9f2d5744a09debf3a187a3f4f675c53b671911 # v2.13.0
with:
egress-policy: audit
- uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
- uses: actions/checkout@v3
- name: Ensure go version
uses: actions/setup-go@d35c59abb061a4a6fb18e82ac0862c26744d6ab5 # v5.5.0
uses: actions/setup-go@v4
with:
go-version-file: 'go.mod'
check-latest: true
- name: Set up QEMU
uses: docker/setup-qemu-action@29109295f81e9208d7d86ff1c6c12d2833863392 # v3.6.0
uses: docker/setup-qemu-action@v2
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@e468171a9de216ec08956ac3ada2f0791b6bd435 # v3.11.1
uses: docker/setup-buildx-action@v2
- name: Setup GoReleaser
run: make bootstrap-tools
- name: Find current tag version
run: echo "sha_short=$(git rev-parse --short HEAD)" >> $GITHUB_OUTPUT
id: tags
- name: Install kind
uses: helm/kind-action@a1b0e391336a6ee6713a0583f8c6240d70863de3 # v1.12.0
- name: Build artifacts
run: |
VERSION="${{ steps.tags.outputs.sha_short }}" make image
VERSION="${{ steps.tags.outputs.sha_short }}" make manifest
- name: Workaround "Failed to attach 1 to compat systemd cgroup /actions_job/..." on gh actions
run: |
sudo bash << EOF
cp /etc/docker/daemon.json /etc/docker/daemon.json.old
echo '{}' > /etc/docker/daemon.json
systemctl restart docker || journalctl --no-pager -n 500
systemctl status docker
EOF
# Default name for helm/kind-action kind clusters is "chart-testing"
- name: Create kind cluster with 5 nodes
uses: helm/kind-action@v1.8.0
with:
install_only: true
version: v0.30.0
# Keep this until v1.31 (or superior) becomes the default kubectl version for the kind-action.
# It is used in podblocker shell script test to use --all-pods.
# If the podblocker e2e test relies on another way, this can also be removed.
kubectl_version: v1.31.0
- name: Run specific e2e tests
run: make e2e-test ARGS="-run ^${{ matrix.testname }}"
config: .github/kind-cluster-${{ matrix.kubernetes }}.yaml
version: v0.14.0
- name: Preload previously built images onto kind cluster
run: kind load docker-image ghcr.io/${{ github.repository }}:${{ steps.tags.outputs.sha_short }} --name chart-testing
- name: Do not wait for an hour before detecting the rebootSentinel
run: |
sed -i 's/#\(.*\)--period=1h/\1--period=30s/g' kured-ds.yaml
sed -i 's/#\(.*\)--concurrency=1/\1--concurrency=2/g' kured-ds.yaml
- name: Install kured with kubectl
run: |
kubectl apply -f kured-rbac.yaml && kubectl apply -f kured-ds.yaml
- name: Ensure kured is ready
uses: nick-invision/retry@v2.8.3
with:
timeout_minutes: 10
max_attempts: 10
retry_wait_seconds: 60
# DESIRED, CURRENT, READY, UP-TO-DATE and AVAILABLE should all be equal to cluster_size
command: "kubectl get ds -n kube-system kured | grep -E 'kured.*5.*5.*5.*5.*5'"
- name: Create reboot sentinel files
run: |
./tests/kind/create-reboot-sentinels.sh
- name: Follow reboot until success
env:
DEBUG: true
run: |
./tests/kind/follow-coordinated-reboot.sh

View File

@@ -12,9 +12,6 @@ env:
REGISTRY: ghcr.io
IMAGE_NAME: ${{ github.repository }}
permissions:
contents: read
jobs:
tag-scan-and-push-final-image:
name: "Build, scan, and publish tagged image"
@@ -24,14 +21,9 @@ jobs:
contents: write
packages: write
steps:
- name: Harden Runner
uses: step-security/harden-runner@ec9f2d5744a09debf3a187a3f4f675c53b671911 # v2.13.0
with:
egress-policy: audit
- uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
- uses: actions/checkout@v3
- name: Ensure go version
uses: actions/setup-go@d35c59abb061a4a6fb18e82ac0862c26744d6ab5 # v5.5.0
uses: actions/setup-go@v4
with:
go-version-file: 'go.mod'
check-latest: true
@@ -39,17 +31,18 @@ jobs:
run: echo "version=${GITHUB_REF#refs/tags/}" >> $GITHUB_OUTPUT
id: tags
- name: Set up QEMU
uses: docker/setup-qemu-action@29109295f81e9208d7d86ff1c6c12d2833863392 # v3.6.0
uses: docker/setup-qemu-action@v2
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@e468171a9de216ec08956ac3ada2f0791b6bd435 # v3.11.1
uses: docker/setup-buildx-action@v2
- name: Setup GoReleaser
run: make bootstrap-tools
- name: Build binaries
run: make kured-release-tag
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
COSIGN_EXPERIMENTAL: 1
- name: Build single image for scan
uses: docker/build-push-action@263435318d21b8e681c14492fe198d362a7d2c83 # v6.18.0
uses: docker/build-push-action@v4
with:
context: .
platforms: linux/amd64
@@ -59,7 +52,7 @@ jobs:
${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ steps.tags.outputs.version }}
- name: Run Trivy vulnerability scanner
uses: aquasecurity/trivy-action@f9424c10c36e288d5fa79bd3dfd1aeb2d6eae808
uses: aquasecurity/trivy-action@41f05d9ecffa2ed3f1580af306000f734b733e54
with:
image-ref: '${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ steps.tags.outputs.version }}'
format: 'table'
@@ -69,7 +62,7 @@ jobs:
severity: 'CRITICAL,HIGH'
- name: Login to ghcr.io
uses: docker/login-action@184bdaa0721073962dff0199f1fb9940f07167d1 # v3.5.0
uses: docker/login-action@v2
with:
registry: ${{ env.REGISTRY }}
username: ${{ github.actor }}
@@ -77,12 +70,12 @@ jobs:
- name: Extract metadata (tags, labels) for Docker
id: meta
uses: docker/metadata-action@c1e51972afc2121e065aed6d45c65596fe445f3f
uses: docker/metadata-action@818d4b7b91585d195f67373fd9cb0332e31a7175
with:
images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
- name: Build release images
uses: docker/build-push-action@263435318d21b8e681c14492fe198d362a7d2c83 # v6.18.0
uses: docker/build-push-action@v4
with:
context: .
platforms: linux/arm64, linux/amd64, linux/arm/v7, linux/arm/v6, linux/386
@@ -93,11 +86,15 @@ jobs:
- name: Generate SBOM
run: |
hack/bin/syft ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ steps.tags.outputs.version }} -o spdx > kured.sbom
.tmp/syft ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ steps.tags.outputs.version }} -o spdx > kured.sbom
- name: Sign and attest artifacts
run: |
hack/bin/cosign sign -y -r ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ steps.tags.outputs.version }}
hack/bin/cosign sign-blob -y --output-signature kured.sbom.sig kured.sbom
hack/bin/cosign attest -y --type spdx --predicate kured.sbom ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ steps.tags.outputs.version }}
hack/bin/cosign attach sbom --type spdx --sbom kured.sbom ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ steps.tags.outputs.version }}
.tmp/cosign sign -f -r ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ steps.tags.outputs.version }}
.tmp/cosign sign-blob --output-signature kured.sbom.sig kured.sbom
.tmp/cosign attest -f --type spdx --predicate kured.sbom ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ steps.tags.outputs.version }}
.tmp/cosign attach sbom --type spdx --sbom kured.sbom ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ steps.tags.outputs.version }}
env:
COSIGN_EXPERIMENTAL: 1

View File

@@ -2,25 +2,20 @@ name: Daily jobs
on:
schedule:
- cron: "30 1 * * 6"
- cron: "30 1 * * *"
jobs:
periodics-gotest:
name: Run go tests
runs-on: ubuntu-latest
steps:
- name: Harden Runner
uses: step-security/harden-runner@ec9f2d5744a09debf3a187a3f4f675c53b671911 # v2.13.0
with:
egress-policy: audit
- name: checkout
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
uses: actions/checkout@v3
- name: run tests
run: make test
run: go test -json ./... > test.json
- name: Annotate tests
if: always()
uses: guyarb/golang-test-annoations@2941118d7ef622b1b3771d1ff6eae9e90659eb26 # v0.8.0
uses: guyarb/golang-test-annoations@v0.7.0
with:
test-results: test.json
@@ -30,12 +25,7 @@ jobs:
steps:
# Stale by default waits for 60 days before marking PR/issues as stale, and closes them after 21 days.
# Do not expire the first issues that would allow the community to grow.
- name: Harden Runner
uses: step-security/harden-runner@ec9f2d5744a09debf3a187a3f4f675c53b671911 # v2.13.0
with:
egress-policy: audit
- uses: actions/stale@5bef64f19d7facfb25b37b414482c7164d639639 # v9.1.0
- uses: actions/stale@v8
with:
repo-token: ${{ secrets.GITHUB_TOKEN }}
stale-issue-message: 'This issue was automatically considered stale due to lack of activity. Please update it and/or join our slack channels to promote it, before it automatically closes (in 7 days).'
@@ -49,14 +39,9 @@ jobs:
name: Check docs for incorrect links
runs-on: ubuntu-latest
steps:
- name: Harden Runner
uses: step-security/harden-runner@ec9f2d5744a09debf3a187a3f4f675c53b671911 # v2.13.0
with:
egress-policy: audit
- uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
- uses: actions/checkout@v3
- name: Link Checker
uses: lycheeverse/lychee-action@885c65f3dc543b57c898c8099f4e08c8afd178a2
uses: lycheeverse/lychee-action@ec3ed119d4f44ad2673a7232460dc7dff59d2421
env:
GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN}}
with:
@@ -67,30 +52,25 @@ jobs:
name: Build image and scan it against known vulnerabilities
runs-on: ubuntu-latest
steps:
- name: Harden Runner
uses: step-security/harden-runner@ec9f2d5744a09debf3a187a3f4f675c53b671911 # v2.13.0
with:
egress-policy: audit
- uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
- uses: actions/checkout@v3
- name: Ensure go version
uses: actions/setup-go@d35c59abb061a4a6fb18e82ac0862c26744d6ab5 # v5.5.0
uses: actions/setup-go@v4
with:
go-version-file: 'go.mod'
check-latest: true
- name: Set up QEMU
uses: docker/setup-qemu-action@29109295f81e9208d7d86ff1c6c12d2833863392 # v3.6.0
uses: docker/setup-qemu-action@v2
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@e468171a9de216ec08956ac3ada2f0791b6bd435 # v3.11.1
uses: docker/setup-buildx-action@v2
- name: Setup GoReleaser
run: make bootstrap-tools
- name: Find current tag version
run: echo "sha_short=$(git rev-parse --short HEAD)" >> $GITHUB_OUTPUT
id: tags
- name: Build artifacts
run: VERSION="${{ steps.tags.outputs.sha_short }}" DH_ORG="${{ github.repository_owner }}" make image
run: VERSION="${{ steps.tags.outputs.sha_short }}" make image
- name: Run Trivy vulnerability scanner
uses: aquasecurity/trivy-action@f9424c10c36e288d5fa79bd3dfd1aeb2d6eae808
uses: aquasecurity/trivy-action@41f05d9ecffa2ed3f1580af306000f734b733e54
with:
image-ref: 'ghcr.io/${{ github.repository }}:${{ steps.tags.outputs.sha_short }}'
format: 'table'

View File

@@ -1,78 +0,0 @@
# This workflow uses actions that are not certified by GitHub. They are provided
# by a third-party and are governed by separate terms of service, privacy
# policy, and support documentation.

name: Scorecard supply-chain security
on:
  # For Branch-Protection check. Only the default branch is supported. See
  # https://github.com/ossf/scorecard/blob/main/docs/checks.md#branch-protection
  branch_protection_rule:
  # To guarantee Maintained check is occasionally updated. See
  # https://github.com/ossf/scorecard/blob/main/docs/checks.md#maintained
  schedule:
    - cron: '34 3 * * 6'
  push:
    branches: [ "main" ]

# Declare default permissions as read only.
permissions: read-all

jobs:
  analysis:
    name: Scorecard analysis
    runs-on: ubuntu-latest
    permissions:
      # Needed to upload the results to code-scanning dashboard.
      security-events: write
      # Needed to publish results and get a badge (see publish_results below).
      id-token: write
      # Uncomment the permissions below if installing in a private repository.
      # contents: read
      # actions: read

    steps:
      - name: Harden Runner
        uses: step-security/harden-runner@ec9f2d5744a09debf3a187a3f4f675c53b671911 # v2.13.0
        with:
          egress-policy: audit

      - name: "Checkout code"
        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
        with:
          persist-credentials: false

      - name: "Run analysis"
        uses: ossf/scorecard-action@05b42c624433fc40578a4040d5cf5e36ddca8cde # v2.4.2
        with:
          results_file: results.sarif
          results_format: sarif
          # (Optional) "write" PAT token. Uncomment the `repo_token` line below if:
          # - you want to enable the Branch-Protection check on a *public* repository, or
          # - you are installing Scorecard on a *private* repository
          # To create the PAT, follow the steps in https://github.com/ossf/scorecard-action?tab=readme-ov-file#authentication-with-fine-grained-pat-optional.
          # repo_token: ${{ secrets.SCORECARD_TOKEN }}

          # Public repositories:
          #   - Publish results to OpenSSF REST API for easy access by consumers
          #   - Allows the repository to include the Scorecard badge.
          #   - See https://github.com/ossf/scorecard-action#publishing-results.
          # For private repositories:
          #   - `publish_results` will always be set to `false`, regardless
          #     of the value entered here.
          publish_results: true

      # Upload the results as artifacts (optional). Commenting out will disable uploads of run results in SARIF
      # format to the repository Actions tab.
      - name: "Upload artifact"
        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v3.pre.node20
        with:
          name: SARIF file
          path: results.sarif
          retention-days: 5

      # Upload the results to GitHub's code scanning dashboard (optional).
      # Commenting out will disable upload of results to your repo's Code Scanning dashboard
      - name: "Upload to code-scanning"
        uses: github/codeql-action/upload-sarif@3c3833e0f8c1c83d449a7478aa59c036a9165498 # v3.29.11
        with:
          sarif_file: results.sarif

4
.gitignore vendored
View File

@@ -2,6 +2,4 @@ cmd/kured/kured
vendor
build
dist
test.json
tests/kind/testfiles/*.yaml
hack/bin/
.tmp

View File

@@ -1,37 +0,0 @@
# golangci-lint configuration.
#
# NOTE(review): this file declares `version: "2"` but several keys below
# (`linters-settings`, `output.format`, `output.uniq-by-line`,
# `output.sort-results`, top-level `modules-download-mode`) look like the
# v1 layout. Confirm with `golangci-lint config verify` before relying on
# these settings being honoured.
version: "2"
# timeout: 5m  (can be added if needed)
modules-download-mode: readonly

run:
  # Do not lint test files.
  tests: false

linters:
  enable:
    - govet
    - staticcheck
    - unused
    - contextcheck
    - goconst
    - gosec
    - testifylint
    - errcheck
    - revive

linters-settings:
  errcheck:
    check-type-assertions: true
    check-blank: true
  revive:
    severity: warning
    confidence: 0.8
    rules:
      - name: indent-error-flow
      - name: var-naming
      - name: import-shadowing
      # https://github.com/mgechev/revive/blob/HEAD/RULES_DESCRIPTIONS.md#package-comments
      - name: package-comments # This is not working!
        disabled: true

output:
  format: colored-line-number
  print-issued-lines: true
  print-linter-name: true
  uniq-by-line: false
  sort-results: true

View File

View File

@@ -1,3 +1,3 @@
# Kured Community Code of Conduct
## Kured Community Code of Conduct
Kured follows the [CNCF Code of Conduct](https://github.com/cncf/foundation/blob/main/code-of-conduct.md).

View File

@@ -5,13 +5,13 @@ Slack][slack], reporting or triaging [issues][issues] or contributing code
to `kured`.
In any case, it will make sense to familiarise yourself with the main
[documentation][documentation] to understand the different features and
options, which is helpful for testing. The "building" section in
particular makes sense if you are planning to contribute code.
[README][readme] to understand the different features and options, which is
helpful for testing. The "building" section in particular makes sense if
you are planning to contribute code.
[slack]: https://github.com/kubereboot/kured/blob/main/README.md#getting-help
[slack]: README.md#getting-help
[issues]: https://github.com/kubereboot/kured/issues
[documentation]: https://kured.dev/docs
[readme]: README.md
## Certificate of Origin
@@ -41,15 +41,6 @@ All Kured repositories are kept under <https://github.com/kubereboot>. To find t
| <https://github.com/kubereboot/charts> | Helm chart |
| <https://github.com/kubereboot/website> | website and documentation |
### Kured code
- Kured's main code can be found in the [`cmd`](cmd) and [`pkg`](pkg) directories
- Its e2e tests are in the [`tests`](tests) directory
- We use [GoReleaser to build](.goreleaser.yml).
- Every PR and tagged release is tested by [Kind in GitHub workflows](.github/workflows).
As a project, we try to follow all the official and obvious standards.
## Regular development activities
### Prepare environment
@@ -75,23 +66,16 @@ efbb0c3: Document version compatibility in release notes
Search the git log for inspiration for your cases.
Please update our `.github/workflows` with the new k8s images.
Please update our `.github/workflows` with the new k8s images, starting with
the creation of a `.github/kind-cluster-<version>.yaml`, then updating
our workflows with the new versions.
For that, run the following:
`cp .github/kind-cluster-current.yaml .github/kind-cluster-previous.yaml`
`cp .github/kind-cluster-next.yaml .github/kind-cluster-current.yaml`
Then edit `.github/kind-cluster-next.yaml` to point to the new version.
This will make the full test matrix updated (the CI and the test code).
Once your code passes all tests, update the support matrix in
the [installation docs](https://kured.dev/docs/installation/).
Once you have updated everything, make sure you update the support matrix on
the main [README][readme] as well.
### Updating other dependencies
Dependabot proposes changes in our `go.mod`/`go.sum`.
Dependabot proposes changes in our go.mod/go.sum.
Some of those changes are covered by CI testing, some are not.
Please make sure to test those not covered by CI (mostly the integration
@@ -103,7 +87,7 @@ We run periodic jobs (see also Automated testing section of this documentation).
Those should be monitored for failures.
If a failure happens in the periodic jobs, something terribly wrong must have happened
(or GitHub is failing at the creation of a kind cluster). Please monitor those
(or github is failing at the creation of a kind cluster). Please monitor those
failures carefully.
### Introducing new features
@@ -123,28 +107,27 @@ This also means that when you expose a new feature, you should create another PR
for your changes in <https://github.com/kubereboot/charts> to make your feature
available at the next kured version for helm users.
In the charts PR, you can directly bump the `appVersion` to the next minor version
In the charts PR, you can directly bump the appVersion to the next minor version
(you are introducing a new feature, which requires a bump of the minor number.
For example, if current `appVersion` is `1.6.x`, make sure you update your `appVersion`
to `1.7.0`). It allows us to have an easy view of what we land each release.
For example, if current appVersion is 1.6.x, make sure you update your appVersion
to 1.7.0). It allows us to have an easy view of what we land each release.
Do not hesitate to increase the test coverage for your feature, whether it's unit
testing to full functional testing (even using helm charts).
testing to full functional testing (even using helm charts)
### Increasing test coverage
We are welcoming any change to increase our test coverage.
See also our GitHub issues for the label
[`testing`](https://github.com/kubereboot/kured/labels/testing).
See also our github issues for the label `testing`.
## Automated testing
Our CI is covered by GitHub actions.
You can see their contents in `.github/workflows`.
Our CI is covered by github actions.
You can see their contents in .github/workflows.
We currently run:
- go tests and golangci-lint
- go tests and lint
- `shellcheck`
- a check for dead links in our docs
- a security check against our base image (alpine)
@@ -154,22 +137,6 @@ To test your code manually, follow the section Manual testing.
## Manual (release) testing
### Quick Golang code testing
Please run `make test` to run only the basic tests. It gives a good
idea of the code behaviour.
### Linting
We use [`golangci-lint`](https://golangci-lint.run/) for Go code linting.
To run lint checks locally:
```bash
make lint
```
### Manual functional testing
Before `kured` is released, we want to make sure it still works fine on the
previous, current and next minor version of Kubernetes (with respect to the
`client-go` & `kubectl` dependencies in use). For local testing e.g.
@@ -185,11 +152,15 @@ results, if you login to a node and run:
sudo touch /var/run/reboot-required
```
### Example of functional testing with `minikube`
### Example of golang testing
Please run `make test`. You should have `golint` installed.
### Example of testing with `minikube`
A test-run with `minikube` could look like this:
```cli
```console
# start minikube
minikube start --driver=kvm2 --kubernetes-version <k8s-release>
@@ -221,7 +192,7 @@ Then you can check for the lock release.
A test-run with `kind` could look like this:
```cli
```console
# create kind cluster
kind create cluster --config .github/kind-cluster-<k8s-version>.yaml
@@ -233,44 +204,23 @@ kind create cluster --config .github/kind-cluster-<k8s-version>.yaml
```
### Example of testing with `kind` and `make`
A test-run with `kind` and `make` can be done with the following command:
```cli
# Build kured:dev image, build manifests, and run the "long" go tests
make e2e-test
```
You can alter test behaviour by passing arguments to this command.
A few examples below:
```shell
# Run only TestE2EWithSignal test for the kubernetes version named "current" (see kind file)
make e2e-test ARGS="-run ^TestE2EWithSignal/current"
# Run all tests but make sure to extend the timeout, for slower machines.
make e2e-test ARGS="-timeout 1200s"
```
## Publishing a new kured release
### Prepare Documentation
Ensure the [compatibility matrix](https://kured.dev/docs/installation/) is
updated to the new version you want to release.
Check that `README.md` has an updated compatibility matrix and that the
url in the `kubectl` incantation (under "Installation") is updated to the
new version you want to release.
### Update the manifests with the new version
### Create a tag on the repo
Create a commit updating the manifest with future image [like this one](https://github.com/kubereboot/kured/commit/58091f6145771f426b4b9e012a43a9c847af2560).
Before going further, we should freeze the code for a release, by
tagging the code. The Github-Action should start a new job and push
the new image to the registry.
### Create the new version tag on the repo
### Create the combined manifest
Tag the previously created commit with the future release version.
The Github Actions workflow will push the new image to the registry.
### Create the combined manifest for the new version
Now create the `kured-<new version>-dockerhub.yaml` for e.g. `1.3.0`:
Now create the `kured-<release>-dockerhub.yaml` for e.g. `1.3.0`:
```sh
VERSION=1.3.0
@@ -280,23 +230,13 @@ cat kured-rbac.yaml > "$MANIFEST"
cat kured-ds.yaml >> "$MANIFEST"
```
### Publish new version release artifacts
### Publish release artifacts
Now you can head to the GitHub UI for releases, drafting a new
release. Choose, as tag, the new version number.
Click to generate the release notes.
Fill, as name, "Kured <new version>".
Edit the generated text.
Now you can head to the Github UI, use the version number as tag and upload the
`kured-<release>-dockerhub.yaml` file.
Please describe what's new and noteworthy in the release notes, list the PRs
that landed and give a shout-out to everyone who contributed.
Please also note down on which releases the upcoming `kured` release was
tested on or what it supports. (Check old release notes if you're unsure.)
Before clicking on publishing release, upload the yaml manifest
(`kured-<new version>-dockerhub.yaml`) file.
Click on publish the release and set as the latest release.
tested on. (Check old release notes if you're unsure.)

View File

@@ -1,4 +1,4 @@
FROM alpine:3.22.1@sha256:4bcff63911fcb4448bd4fdacec207030997caf25e9bea4045fa6c8c44de311d1 AS bin
FROM --platform=$TARGETPLATFORM alpine:3.18.3 as bin
ARG TARGETOS
ARG TARGETARCH
@@ -19,7 +19,7 @@ RUN set -ex \
esac \
&& cp /dist/kured_${TARGETOS}_${TARGETARCH}${SUFFIX}/kured /dist/kured;
FROM alpine:3.22.1@sha256:4bcff63911fcb4448bd4fdacec207030997caf25e9bea4045fa6c8c44de311d1
FROM --platform=$TARGETPLATFORM alpine:3.18.3
RUN apk update --no-cache && apk upgrade --no-cache && apk add --no-cache ca-certificates tzdata
COPY --from=bin /dist/kured /usr/bin/kured
ENTRYPOINT ["/usr/bin/kured"]

View File

@@ -1,12 +1,5 @@
In alphabetical order:
Christian Hopf <christian.kotzbauer@gmail.com> (@ckotzbauer)
Hidde Beydals <hidde@hhh.computer> (@hiddeco)
Christian Kotzbauer <christian.kotzbauer@gmail.com> (@ckotzbauer)
Daniel Holbach <daniel.holbach@gmail.com> (@dholbach)
Hidde Beydals <hidde@weave.works> (@hiddeco)
Jack Francis <jackfrancis@gmail.com> (@jackfrancis)
Jean-Philippe Evrard <open-source@a.spamming.party> (@evrardjp)
Retired maintainers:
- Daniel Holbach
Thank you for your involvement, and let us not say "farewell" ...

View File

@@ -1,77 +1,53 @@
.DEFAULT: all
.PHONY: all clean image minikube-publish manifest test kured-all lint
.PHONY: all clean image minikube-publish manifest test kured-all
HACKDIR=./hack/bin
GORELEASER_CMD=$(HACKDIR)/goreleaser
DH_ORG ?= kubereboot
TEMPDIR=./.tmp
GORELEASER_CMD=$(TEMPDIR)/goreleaser
DH_ORG=kubereboot
VERSION=$(shell git rev-parse --short HEAD)
SUDO=$(shell docker info >/dev/null 2>&1 || echo "sudo -E")
all: image
$(HACKDIR):
mkdir -p $(HACKDIR)
$(TEMPDIR):
mkdir -p $(TEMPDIR)
.PHONY: bootstrap-tools
bootstrap-tools: $(HACKDIR)
command -v $(HACKDIR)/goreleaser || VERSION=v1.24.0 TMPDIR=$(HACKDIR) bash hack/installers/goreleaser-install.sh
command -v $(HACKDIR)/syft || curl -sSfL https://raw.githubusercontent.com/anchore/syft/main/install.sh | sh -s -- -b $(HACKDIR) v1.0.1
command -v $(HACKDIR)/cosign || curl -sSfL https://github.com/sigstore/cosign/releases/download/v2.2.3/cosign-linux-amd64 -o $(HACKDIR)/cosign
command -v $(HACKDIR)/shellcheck || (curl -sSfL https://github.com/koalaman/shellcheck/releases/download/stable/shellcheck-stable.linux.x86_64.tar.xz | tar -J -v -x shellcheck-stable/shellcheck && mv shellcheck-stable/shellcheck $(HACKDIR)/shellcheck && rmdir shellcheck-stable)
chmod +x $(HACKDIR)/goreleaser $(HACKDIR)/cosign $(HACKDIR)/syft $(HACKDIR)/shellcheck
command -v $(HACKDIR)/golangci-lint || curl -sSfL https://raw.githubusercontent.com/golangci/golangci-lint/master/install.sh | sh -s -- -b $(HACKDIR) v2.1.6
bootstrap-tools: $(TEMPDIR)
VERSION=v1.11.4 TMPDIR=.tmp bash .github/scripts/goreleaser-install.sh
curl -sSfL https://raw.githubusercontent.com/anchore/syft/main/install.sh | sh -s -- -b .tmp v0.58.0
curl -sSfL https://github.com/sigstore/cosign/releases/download/v1.12.1/cosign-linux-amd64 -o .tmp/cosign
chmod +x .tmp/goreleaser .tmp/cosign .tmp/syft
clean:
rm -rf ./dist
kured: bootstrap-tools
$(GORELEASER_CMD) build --clean --single-target --snapshot
kured:
$(GORELEASER_CMD) build --rm-dist --single-target --snapshot
kured-all: bootstrap-tools
$(GORELEASER_CMD) build --clean --snapshot
kured-all:
$(GORELEASER_CMD) build --rm-dist --snapshot
kured-release-tag: bootstrap-tools
$(GORELEASER_CMD) release --clean
kured-release-tag:
$(GORELEASER_CMD) release --rm-dist
kured-release-snapshot: bootstrap-tools
$(GORELEASER_CMD) release --clean --snapshot
kured-release-snapshot:
$(GORELEASER_CMD) release --rm-dist --snapshot
image: kured
$(SUDO) docker buildx build --no-cache --load -t ghcr.io/$(DH_ORG)/kured:$(VERSION) .
dev-image: image
$(SUDO) docker tag ghcr.io/$(DH_ORG)/kured:$(VERSION) kured:dev
dev-manifest:
# basic e2e scenario
sed -e "s#image: ghcr.io/.*kured.*#image: kured:dev#g" -e 's/#\(.*\)--period=1h/\1--period=20s/g' kured-ds.yaml > tests/kind/testfiles/kured-ds.yaml
# signal e2e scenario
sed -e "s#image: ghcr.io/.*kured.*#image: kured:dev#g" -e 's/#\(.*\)--period=1h/\1--period=20s/g' kured-ds-signal.yaml > tests/kind/testfiles/kured-ds-signal.yaml
# concurrency e2e command scenario
sed -e "s#image: ghcr.io/.*kured.*#image: kured:dev#g" -e 's/#\(.*\)--period=1h/\1--period=20s/g' -e 's/#\(.*\)--concurrency=1/\1--concurrency=2/g' kured-ds.yaml > tests/kind/testfiles/kured-ds-concurrent-command.yaml
# concurrency e2e signal scenario
sed -e "s#image: ghcr.io/.*kured.*#image: kured:dev#g" -e 's/#\(.*\)--period=1h/\1--period=20s/g' -e 's/#\(.*\)--concurrency=1/\1--concurrency=2/g' kured-ds-signal.yaml > tests/kind/testfiles/kured-ds-concurrent-signal.yaml
# pod blocker e2e signal scenario
sed -e "s#image: ghcr.io/.*kured.*#image: kured:dev#g" -e 's/#\(.*\)--period=1h/\1--period=20s/g' -e 's/#\(.*\)--blocking-pod-selector=name=temperamental/\1--blocking-pod-selector=app=blocker/g' kured-ds-signal.yaml > tests/kind/testfiles/kured-ds-podblocker.yaml
e2e-test: dev-manifest dev-image
echo "Running ALL go tests"
go test -count=1 -v --parallel 4 ./... $(ARGS)
$(SUDO) docker buildx build --load -t ghcr.io/$(DH_ORG)/kured:$(VERSION) .
minikube-publish: image
$(SUDO) docker save ghcr.io/$(DH_ORG)/kured | (eval $$(minikube docker-env) && docker load)
manifest:
sed -i "s#image: ghcr.io/.*kured.*#image: ghcr.io/$(DH_ORG)/kured:$(VERSION)#g" kured-ds.yaml
sed -i "s#image: ghcr.io/.*kured.*#image: ghcr.io/$(DH_ORG)/kured:$(VERSION)#g" kured-ds-signal.yaml
echo "Please generate combined manifest if necessary"
test: lint
echo "Running short go tests"
go test -test.short -json ./... > test.json
lint: bootstrap-tools
echo "Running shellcheck"
find . -name '*.sh' | xargs -n1 $(HACKDIR)/shellcheck
@echo "Running golangci-lint..."
$(HACKDIR)/golangci-lint run ./...
test:
echo "Running go tests"
go test ./...
echo "Running golint on pkg"
golint ./pkg/...
echo "Running golint on cmd"
golint ./cmd/...

View File

@@ -3,9 +3,8 @@
[![Artifact HUB](https://img.shields.io/endpoint?url=https://artifacthub.io/badge/repository/kured)](https://artifacthub.io/packages/helm/kured/kured)
[![FOSSA Status](https://app.fossa.com/api/projects/git%2Bgithub.com%2Fkubereboot%2Fkured.svg?type=shield)](https://app.fossa.com/projects/git%2Bgithub.com%2Fkubereboot%2Fkured?ref=badge_shield)
[![CLOMonitor](https://img.shields.io/endpoint?url=https://clomonitor.io/api/projects/cncf/kured/badge)](https://clomonitor.io/projects/cncf/kured)
[![OpenSSF Best Practices](https://www.bestpractices.dev/projects/8867/badge)](https://www.bestpractices.dev/projects/8867)
<img src="https://github.com/kubereboot/website/raw/main/static/img/kured.png" alt="kured logo" width="200" align="right"/>
<img src="https://github.com/kubereboot/website/raw/main/static/img/kured.png" width="200" align="right"/>
- [kured - Kubernetes Reboot Daemon](#kured---kubernetes-reboot-daemon)
- [Introduction](#introduction)

File diff suppressed because it is too large Load Diff

View File

@@ -3,29 +3,61 @@ package main
import (
"reflect"
"testing"
log "github.com/sirupsen/logrus"
"github.com/spf13/cobra"
"github.com/kubereboot/kured/pkg/alerts"
assert "gotest.tools/v3/assert"
papi "github.com/prometheus/client_golang/api"
)
func TestValidateNotificationURL(t *testing.T) {
type BlockingChecker struct {
blocking bool
}
tests := []struct {
name string
slackHookURL string
notifyURL string
expected string
}{
{"slackHookURL only works fine", "https://hooks.slack.com/services/BLABLABA12345/IAM931A0VERY/COMPLICATED711854TOKEN1SET", "", "slack://BLABLABA12345/IAM931A0VERY/COMPLICATED711854TOKEN1SET"},
{"slackHookURL and notify URL together only keeps notifyURL", "\"https://hooks.slack.com/services/BLABLABA12345/IAM931A0VERY/COMPLICATED711854TOKEN1SET\"", "teams://79b4XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX@acd8XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX/204cXXXXXXXXXXXXXXXXXXXXXXXXXXXX/a1f8XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX?host=XXXX.webhook.office.com", "teams://79b4XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX@acd8XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX/204cXXXXXXXXXXXXXXXXXXXXXXXXXXXX/a1f8XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX?host=XXXX.webhook.office.com"},
{"slackHookURL removes extraneous double quotes", "\"https://hooks.slack.com/services/BLABLABA12345/IAM931A0VERY/COMPLICATED711854TOKEN1SET\"", "", "slack://BLABLABA12345/IAM931A0VERY/COMPLICATED711854TOKEN1SET"},
{"slackHookURL removes extraneous single quotes", "'https://hooks.slack.com/services/BLABLABA12345/IAM931A0VERY/COMPLICATED711854TOKEN1SET'", "", "slack://BLABLABA12345/IAM931A0VERY/COMPLICATED711854TOKEN1SET"},
{"notifyURL removes extraneous double quotes", "", "\"teams://79b4XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX@acd8XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX/204cXXXXXXXXXXXXXXXXXXXXXXXXXXXX/a1f8XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX?host=XXXX.webhook.office.com\"", "teams://79b4XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX@acd8XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX/204cXXXXXXXXXXXXXXXXXXXXXXXXXXXX/a1f8XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX?host=XXXX.webhook.office.com"},
{"notifyURL removes extraneous single quotes", "", "'teams://79b4XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX@acd8XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX/204cXXXXXXXXXXXXXXXXXXXXXXXXXXXX/a1f8XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX?host=XXXX.webhook.office.com'", "teams://79b4XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX@acd8XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX/204cXXXXXXXXXXXXXXXXXXXXXXXXXXXX/a1f8XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX?host=XXXX.webhook.office.com"},
func (fbc BlockingChecker) isBlocked() bool {
return fbc.blocking
}
var _ RebootBlocker = BlockingChecker{} // Verify that Type implements Interface.
var _ RebootBlocker = (*BlockingChecker)(nil) // Verify that *Type implements Interface.
func Test_flagCheck(t *testing.T) {
var cmd *cobra.Command
var args []string
slackHookURL = "https://hooks.slack.com/services/BLABLABA12345/IAM931A0VERY/COMPLICATED711854TOKEN1SET"
expected := "slack://BLABLABA12345/IAM931A0VERY/COMPLICATED711854TOKEN1SET"
flagCheck(cmd, args)
if notifyURL != expected {
t.Errorf("Slack URL Parsing is wrong: expecting %s but got %s\n", expected, notifyURL)
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
if got := validateNotificationURL(tt.notifyURL, tt.slackHookURL); !reflect.DeepEqual(got, tt.expected) {
t.Errorf("validateNotificationURL() = %v, expected %v", got, tt.expected)
}
})
// validate that surrounding quotes are stripped
slackHookURL = "\"https://hooks.slack.com/services/BLABLABA12345/IAM931A0VERY/COMPLICATED711854TOKEN1SET\""
expected = "slack://BLABLABA12345/IAM931A0VERY/COMPLICATED711854TOKEN1SET"
flagCheck(cmd, args)
if notifyURL != expected {
t.Errorf("Slack URL Parsing is wrong: expecting %s but got %s\n", expected, notifyURL)
}
slackHookURL = "'https://hooks.slack.com/services/BLABLABA12345/IAM931A0VERY/COMPLICATED711854TOKEN1SET'"
expected = "slack://BLABLABA12345/IAM931A0VERY/COMPLICATED711854TOKEN1SET"
flagCheck(cmd, args)
if notifyURL != expected {
t.Errorf("Slack URL Parsing is wrong: expecting %s but got %s\n", expected, notifyURL)
}
slackHookURL = ""
notifyURL = "\"teams://79b4XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX@acd8XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX/204cXXXXXXXXXXXXXXXXXXXXXXXXXXXX/a1f8XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX?host=XXXX.webhook.office.com\""
expected = "teams://79b4XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX@acd8XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX/204cXXXXXXXXXXXXXXXXXXXXXXXXXXXX/a1f8XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX?host=XXXX.webhook.office.com"
flagCheck(cmd, args)
if notifyURL != expected {
t.Errorf("notifyURL Parsing is wrong: expecting %s but got %s\n", expected, notifyURL)
}
notifyURL = "'teams://79b4XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX@acd8XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX/204cXXXXXXXXXXXXXXXXXXXXXXXXXXXX/a1f8XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX?host=XXXX.webhook.office.com'"
expected = "teams://79b4XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX@acd8XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX/204cXXXXXXXXXXXXXXXXXXXXXXXXXXXX/a1f8XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX?host=XXXX.webhook.office.com"
flagCheck(cmd, args)
if notifyURL != expected {
t.Errorf("notifyURL Parsing is wrong: expecting %s but got %s\n", expected, notifyURL)
}
}
@@ -74,3 +106,205 @@ func Test_stripQuotes(t *testing.T) {
})
}
}
// Test_rebootBlocked exercises rebootBlocked with different combinations of
// RebootBlocker implementations: no blockers, a single blocking or
// non-blocking checker, a mix of both, and a Prometheus-backed checker whose
// client is configured with an unusable address.
func Test_rebootBlocked(t *testing.T) {
	noCheckers := []RebootBlocker{}
	nonblockingChecker := BlockingChecker{blocking: false}
	blockingChecker := BlockingChecker{blocking: true}

	// Instantiate a prometheusClient with a broken_url
	promClient, err := alerts.NewPromClient(papi.Config{Address: "broken_url"})
	if err != nil {
		// Report setup failures through the testing framework so the failure
		// is attributed to this test instead of exiting the test binary via
		// the logger.
		t.Fatalf("Can't create prometheusClient: %v", err)
	}
	brokenPrometheusClient := PrometheusBlockingChecker{promClient: promClient, filter: nil, firingOnly: false}

	type args struct {
		blockers []RebootBlocker
	}
	tests := []struct {
		name string
		args args
		want bool
	}{
		{
			name: "Do not block on no blocker defined",
			args: args{blockers: noCheckers},
			want: false,
		},
		{
			name: "Ensure a blocker blocks",
			args: args{blockers: []RebootBlocker{blockingChecker}},
			want: true,
		},
		{
			name: "Ensure a non-blocker doesn't block",
			args: args{blockers: []RebootBlocker{nonblockingChecker}},
			want: false,
		},
		{
			name: "Ensure one blocker is enough to block",
			args: args{blockers: []RebootBlocker{nonblockingChecker, blockingChecker}},
			want: true,
		},
		{
			// The expectation encoded here is fail-safe behaviour: when the
			// Prometheus query cannot succeed, the reboot is blocked.
			name: "Do block on error contacting prometheus API",
			args: args{blockers: []RebootBlocker{brokenPrometheusClient}},
			want: true,
		},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			if got := rebootBlocked(tt.args.blockers...); got != tt.want {
				t.Errorf("rebootBlocked() = %v, want %v", got, tt.want)
			}
		})
	}
}
// Test_buildHostCommand checks that buildHostCommand prefixes the given
// command with the nsenter invocation expected for the supplied pid.
func Test_buildHostCommand(t *testing.T) {
	cases := []struct {
		name    string
		pid     int
		command []string
		want    []string
	}{
		{
			name:    "Ensure command will run with nsenter",
			pid:     1,
			command: []string{"ls", "-Fal"},
			want:    []string{"/usr/bin/nsenter", "-m/proc/1/ns/mnt", "--", "ls", "-Fal"},
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			got := buildHostCommand(tc.pid, tc.command)
			if !reflect.DeepEqual(got, tc.want) {
				t.Errorf("buildHostCommand() = %v, want %v", got, tc.want)
			}
		})
	}
}
// Test_buildSentinelCommand checks which command buildSentinelCommand
// produces from a sentinel file path and an optional sentinel command.
func Test_buildSentinelCommand(t *testing.T) {
	cases := []struct {
		name            string
		sentinelFile    string
		sentinelCommand string
		want            []string
	}{
		{
			name:            "Ensure a sentinelFile generates a shell 'test' command with the right file",
			sentinelFile:    "/test1",
			sentinelCommand: "",
			want:            []string{"test", "-f", "/test1"},
		},
		{
			name:            "Ensure a sentinelCommand has priority over a sentinelFile if both are provided (because sentinelFile is always provided)",
			sentinelFile:    "/test1",
			sentinelCommand: "/sbin/reboot-required -r",
			want:            []string{"/sbin/reboot-required", "-r"},
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			got := buildSentinelCommand(tc.sentinelFile, tc.sentinelCommand)
			if !reflect.DeepEqual(got, tc.want) {
				t.Errorf("buildSentinelCommand() = %v, want %v", got, tc.want)
			}
		})
	}
}
// Test_parseRebootCommand checks that parseRebootCommand splits a reboot
// command line into its individual arguments.
func Test_parseRebootCommand(t *testing.T) {
	cases := []struct {
		name          string
		rebootCommand string
		want          []string
	}{
		{
			name:          "Ensure a reboot command is properly parsed",
			rebootCommand: "/sbin/systemctl reboot",
			want:          []string{"/sbin/systemctl", "reboot"},
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			got := parseRebootCommand(tc.rebootCommand)
			if !reflect.DeepEqual(got, tc.want) {
				t.Errorf("parseRebootCommand() = %v, want %v", got, tc.want)
			}
		})
	}
}
// Test_rebootRequired checks that rebootRequired reports true when the
// sentinel command succeeds and false when it fails.
func Test_rebootRequired(t *testing.T) {
	cases := []struct {
		name            string
		sentinelCommand []string
		want            bool
	}{
		{
			name:            "Ensure rc = 0 means reboot required",
			sentinelCommand: []string{"true"},
			want:            true,
		},
		{
			name:            "Ensure rc != 0 means reboot NOT required",
			sentinelCommand: []string{"false"},
			want:            false,
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			got := rebootRequired(tc.sentinelCommand)
			if got != tc.want {
				t.Errorf("rebootRequired() = %v, want %v", got, tc.want)
			}
		})
	}
}
// Test_rebootRequired_fatals checks whether rebootRequired triggers the
// standard logger's exit hook for a given sentinel command. The hook is
// replaced with a recorder for the duration of the test and reset to nil
// afterwards.
func Test_rebootRequired_fatals(t *testing.T) {
	scenarios := []struct {
		param       []string
		expectFatal bool
	}{
		{param: []string{"true"}, expectFatal: false},
		{param: []string{"./babar"}, expectFatal: true},
	}

	stdLogger := log.StandardLogger()
	defer func() { stdLogger.ExitFunc = nil }()

	// Record the exit request instead of terminating the test process.
	var exited bool
	stdLogger.ExitFunc = func(int) { exited = true }

	for _, sc := range scenarios {
		exited = false
		rebootRequired(sc.param)
		assert.Equal(t, sc.expectFatal, exited)
	}
}

View File

@@ -5,14 +5,14 @@ import (
)
type regexpValue struct {
*regexp.Regexp
value **regexp.Regexp
}
func (rev *regexpValue) String() string {
if rev.Regexp == nil {
if *rev.value == nil {
return ""
}
return rev.Regexp.String()
return (*rev.value).String()
}
func (rev *regexpValue) Set(s string) error {
@@ -20,11 +20,12 @@ func (rev *regexpValue) Set(s string) error {
if err != nil {
return err
}
rev.Regexp = value
*rev.value = value
return nil
}
// Type method returns the type of the flag as a string
func (rev *regexpValue) Type() string {
return "regexp"
return "regexp.Regexp"
}

136
go.mod
View File

@@ -1,89 +1,105 @@
module github.com/kubereboot/kured
go 1.24.6
go 1.19
replace golang.org/x/net => golang.org/x/net v0.7.0
replace github.com/emicklei/go-restful/v3 => github.com/emicklei/go-restful/v3 v3.10.2
require (
github.com/containrrr/shoutrrr v0.8.0
github.com/containrrr/shoutrrr v0.7.1
github.com/google/shlex v0.0.0-20191202100458-e7afc7fbc510
github.com/prometheus/client_golang v1.23.0
github.com/prometheus/common v0.65.0
github.com/google/uuid v1.3.0 // indirect
github.com/prometheus/client_golang v1.16.0
github.com/prometheus/common v0.44.0
github.com/sirupsen/logrus v1.9.3
github.com/spf13/pflag v1.0.7
github.com/stretchr/testify v1.11.1
k8s.io/api v0.33.4
k8s.io/apimachinery v0.33.4
k8s.io/client-go v0.33.4
k8s.io/kubectl v0.33.4
github.com/spf13/cobra v1.7.0
github.com/spf13/pflag v1.0.5
github.com/spf13/viper v1.16.0
github.com/stretchr/testify v1.8.4
gotest.tools/v3 v3.5.0
k8s.io/api v0.26.7
k8s.io/apimachinery v0.26.7
k8s.io/client-go v0.26.7
k8s.io/kubectl v0.26.7
)
require (
github.com/Azure/go-ansiterm v0.0.0-20230124172434-306776ec8161 // indirect
github.com/Azure/go-ansiterm v0.0.0-20210617225240-d185dfc1b5a1 // indirect
github.com/MakeNowJust/heredoc v1.0.0 // indirect
github.com/beorn7/perks v1.0.1 // indirect
github.com/blang/semver/v4 v4.0.0 // indirect
github.com/cespare/xxhash/v2 v2.3.0 // indirect
github.com/cespare/xxhash/v2 v2.2.0 // indirect
github.com/chai2010/gettext-go v1.0.2 // indirect
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect
github.com/emicklei/go-restful/v3 v3.11.0 // indirect
github.com/exponent-io/jsonpath v0.0.0-20210407135951-1de76d718b3f // indirect
github.com/fatih/color v1.15.0 // indirect
github.com/fxamacker/cbor/v2 v2.7.0 // indirect
github.com/go-errors/errors v1.4.2 // indirect
github.com/go-logr/logr v1.4.2 // indirect
github.com/go-openapi/jsonpointer v0.21.0 // indirect
github.com/go-openapi/jsonreference v0.20.2 // indirect
github.com/go-openapi/swag v0.23.0 // indirect
github.com/davecgh/go-spew v1.1.1 // indirect
github.com/emicklei/go-restful/v3 v3.9.0 // indirect
github.com/evanphx/json-patch v4.12.0+incompatible // indirect
github.com/exponent-io/jsonpath v0.0.0-20151013193312-d6023ce2651d // indirect
github.com/fatih/color v1.14.1 // indirect
github.com/fsnotify/fsnotify v1.6.0 // indirect
github.com/go-errors/errors v1.0.1 // indirect
github.com/go-logr/logr v1.2.3 // indirect
github.com/go-openapi/jsonpointer v0.19.5 // indirect
github.com/go-openapi/jsonreference v0.20.0 // indirect
github.com/go-openapi/swag v0.19.14 // indirect
github.com/gogo/protobuf v1.3.2 // indirect
github.com/google/btree v1.1.3 // indirect
github.com/google/gnostic-models v0.6.9 // indirect
github.com/google/go-cmp v0.7.0 // indirect
github.com/google/uuid v1.6.0 // indirect
github.com/gorilla/websocket v1.5.4-0.20250319132907-e064f32e3674 // indirect
github.com/gregjones/httpcache v0.0.0-20190611155906-901d90724c79 // indirect
github.com/golang/protobuf v1.5.3 // indirect
github.com/google/btree v1.0.1 // indirect
github.com/google/gnostic v0.5.7-v3refs // indirect
github.com/google/go-cmp v0.5.9 // indirect
github.com/google/gofuzz v1.1.0 // indirect
github.com/gregjones/httpcache v0.0.0-20180305231024-9cad4c3443a7 // indirect
github.com/hashicorp/hcl v1.0.0 // indirect
github.com/imdario/mergo v0.3.6 // indirect
github.com/inconshreveable/mousetrap v1.1.0 // indirect
github.com/josharian/intern v1.0.0 // indirect
github.com/json-iterator/go v1.1.12 // indirect
github.com/liggitt/tabwriter v0.0.0-20181228230101-89fcab3d43de // indirect
github.com/mailru/easyjson v0.7.7 // indirect
github.com/magiconair/properties v1.8.7 // indirect
github.com/mailru/easyjson v0.7.6 // indirect
github.com/mattn/go-colorable v0.1.13 // indirect
github.com/mattn/go-isatty v0.0.17 // indirect
github.com/mitchellh/go-wordwrap v1.0.1 // indirect
github.com/moby/spdystream v0.5.0 // indirect
github.com/moby/term v0.5.0 // indirect
github.com/matttproud/golang_protobuf_extensions v1.0.4 // indirect
github.com/mitchellh/go-wordwrap v1.0.0 // indirect
github.com/mitchellh/mapstructure v1.5.0 // indirect
github.com/moby/spdystream v0.2.0 // indirect
github.com/moby/term v0.0.0-20220808134915-39b0c02b01ae // indirect
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
github.com/modern-go/reflect2 v1.0.2 // indirect
github.com/monochromegane/go-gitignore v0.0.0-20200626010858-205db1a8cc00 // indirect
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f // indirect
github.com/pelletier/go-toml/v2 v2.0.8 // indirect
github.com/peterbourgon/diskv v2.0.1+incompatible // indirect
github.com/pkg/errors v0.9.1 // indirect
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
github.com/prometheus/client_model v0.6.2 // indirect
github.com/prometheus/procfs v0.16.1 // indirect
github.com/pmezard/go-difflib v1.0.0 // indirect
github.com/prometheus/client_model v0.4.0 // indirect
github.com/prometheus/procfs v0.10.1 // indirect
github.com/russross/blackfriday/v2 v2.1.0 // indirect
github.com/spf13/cobra v1.8.1 // indirect
github.com/x448/float16 v0.8.4 // indirect
github.com/xlab/treeprint v1.2.0 // indirect
golang.org/x/net v0.40.0 // indirect
golang.org/x/oauth2 v0.30.0 // indirect
golang.org/x/sync v0.14.0 // indirect
golang.org/x/sys v0.33.0 // indirect
golang.org/x/term v0.32.0 // indirect
golang.org/x/text v0.25.0 // indirect
golang.org/x/time v0.9.0 // indirect
google.golang.org/protobuf v1.36.6 // indirect
gopkg.in/evanphx/json-patch.v4 v4.12.0 // indirect
github.com/spf13/afero v1.9.5 // indirect
github.com/spf13/cast v1.5.1 // indirect
github.com/spf13/jwalterweatherman v1.1.0 // indirect
github.com/subosito/gotenv v1.4.2 // indirect
github.com/xlab/treeprint v1.1.0 // indirect
go.starlark.net v0.0.0-20200306205701-8dd3e2ee1dd5 // indirect
golang.org/x/net v0.10.0 // indirect
golang.org/x/oauth2 v0.8.0 // indirect
golang.org/x/sys v0.8.0 // indirect
golang.org/x/term v0.6.0 // indirect
golang.org/x/text v0.9.0 // indirect
golang.org/x/time v0.1.0 // indirect
google.golang.org/appengine v1.6.7 // indirect
google.golang.org/protobuf v1.30.0 // indirect
gopkg.in/inf.v0 v0.9.1 // indirect
gopkg.in/ini.v1 v1.67.0 // indirect
gopkg.in/yaml.v2 v2.4.0 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
k8s.io/cli-runtime v0.33.4 // indirect
k8s.io/component-base v0.33.4 // indirect
k8s.io/klog/v2 v2.130.1 // indirect
k8s.io/kube-openapi v0.0.0-20250318190949-c8a335a9a2ff // indirect
k8s.io/utils v0.0.0-20241104100929-3ea5e8cea738 // indirect
sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3 // indirect
sigs.k8s.io/kustomize/api v0.19.0 // indirect
sigs.k8s.io/kustomize/kyaml v0.19.0 // indirect
sigs.k8s.io/randfill v1.0.0 // indirect
sigs.k8s.io/structured-merge-diff/v4 v4.6.0 // indirect
sigs.k8s.io/yaml v1.4.0 // indirect
k8s.io/cli-runtime v0.26.7 // indirect
k8s.io/component-base v0.26.7 // indirect
k8s.io/klog/v2 v2.80.1 // indirect
k8s.io/kube-openapi v0.0.0-20221012153701-172d655c2280 // indirect
k8s.io/utils v0.0.0-20221107191617-1a15be271d1d // indirect
sigs.k8s.io/json v0.0.0-20220713155537-f223a00ba0e2 // indirect
sigs.k8s.io/kustomize/api v0.12.1 // indirect
sigs.k8s.io/kustomize/kyaml v0.13.9 // indirect
sigs.k8s.io/structured-merge-diff/v4 v4.2.3 // indirect
sigs.k8s.io/yaml v1.3.0 // indirect
)

1310
go.sum

File diff suppressed because it is too large Load Diff

View File

@@ -1,41 +0,0 @@
// Package internal provides convenient tools which shouldn't be in cmd/main
// It will eventually provide internal validation and chaining logic to select
// appropriate reboot and sentinel check methods based on configuration.
// It validates user input and instantiates the correct checker and rebooter implementations
// for use elsewhere in kured.
package internal
import (
"fmt"
"github.com/kubereboot/kured/pkg/checkers"
"github.com/kubereboot/kured/pkg/reboot"
log "github.com/sirupsen/logrus"
)
// NewRebooter selects and builds the reboot.Rebooter matching rebootMethod.
//
//   - "command": logs rebootCommand and delegates to reboot.NewCommandRebooter.
//   - "signal":  logs rebootSignal and delegates to reboot.NewSignalRebooter.
//
// Any other rebootMethod yields a nil Rebooter and an error naming the
// rejected value. Errors from the delegated constructors are passed through.
func NewRebooter(rebootMethod string, rebootCommand string, rebootSignal int) (reboot.Rebooter, error) {
	if rebootMethod == "command" {
		log.Infof("Reboot command: %s", rebootCommand)
		return reboot.NewCommandRebooter(rebootCommand)
	}
	if rebootMethod == "signal" {
		log.Infof("Reboot signal: %d", rebootSignal)
		return reboot.NewSignalRebooter(rebootSignal)
	}
	return nil, fmt.Errorf("invalid reboot-method configured %s, expected signal or command", rebootMethod)
}
// NewRebootChecker picks the sentinel checker implementation.
//
// With an empty rebootSentinelCommand the file-based checker for
// rebootSentinelFile is returned. A non-empty rebootSentinelCommand overrides
// the file check and is run as a privileged command checker targeting pid 1.
func NewRebootChecker(rebootSentinelCommand string, rebootSentinelFile string) (checkers.Checker, error) {
	if rebootSentinelCommand == "" {
		log.Infof("Sentinel checker is (unprivileged) testing for the presence of: %s", rebootSentinelFile)
		return checkers.NewFileRebootChecker(rebootSentinelFile)
	}

	// An override of rebootSentinelCommand means a privileged command
	log.Infof("Sentinel checker is (privileged) user provided command: %s", rebootSentinelCommand)
	return checkers.NewCommandChecker(rebootSentinelCommand, 1, true)
}

View File

@@ -1,100 +0,0 @@
---
apiVersion: v1
kind: ServiceAccount
metadata:
name: kured
namespace: kube-system
---
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: kured # Must match `--ds-name`
namespace: kube-system # Must match `--ds-namespace`
spec:
selector:
matchLabels:
name: kured
updateStrategy:
type: RollingUpdate
template:
metadata:
labels:
name: kured
spec:
serviceAccountName: kured
tolerations:
- key: node-role.kubernetes.io/control-plane
effect: NoSchedule
- key: node-role.kubernetes.io/master
effect: NoSchedule
hostPID: true # Facilitate entering the host mount namespace via init
restartPolicy: Always
volumes:
- name: sentinel
hostPath:
path: /var/run
type: Directory
containers:
- name: kured
# If you find yourself here wondering why there is no
# :latest tag on Docker Hub, see the FAQ in the README
image: ghcr.io/kubereboot/kured:1.20.0
imagePullPolicy: IfNotPresent
securityContext:
privileged: false # Give permission to nsenter /proc/1/ns/mnt
readOnlyRootFilesystem: true
allowPrivilegeEscalation: false
capabilities:
drop: ["*"]
add: ["CAP_KILL"]
ports:
- containerPort: 8080
name: metrics
env:
# Pass in the name of the node on which this pod is scheduled
# for use with drain/uncordon operations and lock acquisition
- name: KURED_NODE_ID
valueFrom:
fieldRef:
fieldPath: spec.nodeName
volumeMounts:
- mountPath: /sentinel
name: sentinel
readOnly: true
command:
- /usr/bin/kured
- --reboot-sentinel=/sentinel/reboot-required
- --reboot-method=signal
# - --reboot-signal=39
# - --force-reboot=false
# - --drain-grace-period=-1
# - --skip-wait-for-delete-timeout=0
# - --drain-timeout=0
# - --period=1h
# - --ds-namespace=kube-system
# - --ds-name=kured
# - --lock-annotation=weave.works/kured-node-lock
# - --lock-ttl=0
# - --prometheus-url=http://prometheus.monitoring.svc.cluster.local
# - --alert-filter-regexp=^RebootRequired$
# - --alert-firing-only=false
# - --prefer-no-schedule-taint=""
# - --reboot-sentinel-command=""
# - --slack-hook-url=https://hooks.slack.com/...
# - --slack-username=prod
# - --slack-channel=alerting
# - --notify-url="" # See also shoutrrr url format
# - --message-template-drain=Draining node %s
# - --message-template-reboot=Rebooting node %s
# - --message-template-uncordon=Node %s rebooted & uncordoned successfully!
# - --blocking-pod-selector=runtime=long,cost=expensive
# - --blocking-pod-selector=name=temperamental
# - --blocking-pod-selector=...
# - --reboot-days=sun,mon,tue,wed,thu,fri,sat
# - --reboot-delay=90s
# - --start-time=0:00
# - --end-time=23:59:59
# - --time-zone=UTC
# - --annotate-nodes=false
# - --lock-release-delay=30m
# - --log-format=text

View File

@@ -29,16 +29,11 @@ spec:
effect: NoSchedule
hostPID: true # Facilitate entering the host mount namespace via init
restartPolicy: Always
volumes:
- name: sentinel
hostPath:
path: /var/run
type: Directory
containers:
- name: kured
# If you find yourself here wondering why there is no
# :latest tag on Docker Hub, see the FAQ in the README
image: ghcr.io/kubereboot/kured:1.20.0
image: ghcr.io/kubereboot/kured:1.13.2
imagePullPolicy: IfNotPresent
securityContext:
privileged: true # Give permission to nsenter /proc/1/ns/mnt
@@ -53,19 +48,12 @@ spec:
valueFrom:
fieldRef:
fieldPath: spec.nodeName
volumeMounts:
- mountPath: /sentinel
name: sentinel
readOnly: true
command:
- /usr/bin/kured
- --reboot-sentinel=/sentinel/reboot-required
# - --force-reboot=false
# - --drain-grace-period=-1
# - --skip-wait-for-delete-timeout=0
# - --drain-delay=0
# - --drain-timeout=0
# - --drain-pod-selector=""
# - --period=1h
# - --ds-namespace=kube-system
# - --ds-name=kured
@@ -75,10 +63,9 @@ spec:
# - --alert-filter-regexp=^RebootRequired$
# - --alert-filter-match-only=false
# - --alert-firing-only=false
# - --reboot-sentinel=/var/run/reboot-required
# - --prefer-no-schedule-taint=""
# - --reboot-sentinel-command=""
# - --reboot-method=command
# - --reboot-signal=39
# - --slack-hook-url=https://hooks.slack.com/...
# - --slack-username=prod
# - --slack-channel=alerting

77
pkg/alerts/prometheus.go Normal file
View File

@@ -0,0 +1,77 @@
package alerts
import (
"context"
"fmt"
"regexp"
"sort"
"time"
papi "github.com/prometheus/client_golang/api"
v1 "github.com/prometheus/client_golang/api/prometheus/v1"
"github.com/prometheus/common/model"
)
// PromClient bundles a Prometheus HTTP client with the v1 query API built on
// top of it. NewPromClient fills both fields from a single papi.Config, so a
// PromClient is ready to issue queries (see ActiveAlerts) once constructed.
type PromClient struct {
// papi is the underlying client created by papi.NewClient.
papi papi.Client
// api is the v1 API wrapper created from papi; ActiveAlerts queries through it.
api v1.API
}
// NewPromClient builds a PromClient from conf.
// If papi.NewClient rejects the configuration, that error is returned
// unchanged together with a nil *PromClient.
func NewPromClient(conf papi.Config) (*PromClient, error) {
	httpClient, err := papi.NewClient(conf)
	if err != nil {
		return nil, err
	}

	return &PromClient{
		papi: httpClient,
		api:  v1.NewAPI(httpClient),
	}, nil
}
// ActiveAlerts queries Prometheus for the "ALERTS" series and returns the
// sorted, de-duplicated names of the alerts that should block a reboot.
//
// A sample is considered when it carries the alert-name label and its value
// is non-zero. It is then kept when both of the following hold:
//
//   - filter is nil, or filter.MatchString(name) == filterMatchOnly. With
//     filterMatchOnly=false, alerts matching the regexp are excluded from
//     the block-list and will NOT block rebooting; with filterMatchOnly=true
//     only alerts matching the regexp are kept.
//   - firingOnly is false, or the sample's "alertstate" label is "firing".
//
// The query error is returned as-is. A result that is not a vector yields an
// error naming the unexpected result type. When nothing is kept, the
// returned slice is nil.
func (p *PromClient) ActiveAlerts(filter *regexp.Regexp, firingOnly, filterMatchOnly bool) ([]string, error) {
	// get all alerts from prometheus
	value, _, err := p.api.Query(context.Background(), "ALERTS", time.Now())
	if err != nil {
		return nil, err
	}

	vector, ok := value.(model.Vector)
	if !ok {
		// Report the type only: dumping the whole value could be arbitrarily large.
		return nil, fmt.Errorf("unexpected value type: %v", value.Type())
	}

	// The same alert name can appear in several samples; collect names as a set.
	activeAlertSet := make(map[string]struct{})
	for _, sample := range vector {
		alertName, isAlert := sample.Metric[model.AlertNameLabel]
		if !isAlert || sample.Value == 0 {
			continue
		}
		if firingOnly && sample.Metric["alertstate"] != "firing" {
			continue
		}
		if matchesRegex(filter, string(alertName), filterMatchOnly) {
			activeAlertSet[string(alertName)] = struct{}{}
		}
	}

	// Left nil (not empty) when no alert qualifies, as callers may rely on it.
	var activeAlerts []string
	for activeAlert := range activeAlertSet {
		activeAlerts = append(activeAlerts, activeAlert)
	}
	sort.Strings(activeAlerts)

	return activeAlerts, nil
}
func matchesRegex(filter *regexp.Regexp, alertName string, filterMatchOnly bool) bool {
if filter == nil {
return true
}
return filter.MatchString(string(alertName)) == filterMatchOnly
}

View File

@@ -1,4 +1,4 @@
package blockers
package alerts
import (
"log"
@@ -145,9 +145,12 @@ func TestActiveAlerts(t *testing.T) {
regex, _ := regexp.Compile(tc.rFilter)
// instantiate the prometheus client with the mockserver-address
p := NewPrometheusBlockingChecker(api.Config{Address: mockServer.URL}, regex, tc.firingOnly, tc.filterMatchOnly)
p, err := NewPromClient(api.Config{Address: mockServer.URL})
if err != nil {
log.Fatal(err)
}
result, err := p.ActiveAlerts()
result, err := p.ActiveAlerts(regex, tc.firingOnly, tc.filterMatchOnly)
if err != nil {
log.Fatal(err)
}

View File

@@ -1,21 +0,0 @@
// Package blockers provides interfaces and implementations for determining
// whether a system should be prevented from rebooting.
// You can use this package if you fork Kured's main loop.
package blockers
// RebootBlocked reports whether any of the given blockers currently blocks
// a reboot. Blockers are consulted in order and evaluation stops at the
// first one that blocks. With no blockers the result is false.
func RebootBlocked(blockers ...RebootBlocker) bool {
	blocked := false
	for i := 0; i < len(blockers) && !blocked; i++ {
		blocked = blockers[i].IsBlocked()
	}
	return blocked
}
// RebootBlocker is implemented by types that can veto a reboot.
// RebootBlocked calls IsBlocked on each blocker it is given and treats a
// true result as "do not reboot now".
type RebootBlocker interface {
// IsBlocked returns true when this blocker wants the reboot to be held back.
IsBlocked() bool
}

View File

@@ -1,65 +0,0 @@
package blockers
import (
papi "github.com/prometheus/client_golang/api"
"testing"
)
// BlockingChecker is a test double whose IsBlocked answer is fixed at
// construction time through the blocking field.
type BlockingChecker struct {
	blocking bool
}

// IsBlocked returns the canned answer stored in the checker.
func (stub BlockingChecker) IsBlocked() bool {
	answer := stub.blocking
	return answer
}
func Test_rebootBlocked(t *testing.T) {
noCheckers := []RebootBlocker{}
nonblockingChecker := BlockingChecker{blocking: false}
blockingChecker := BlockingChecker{blocking: true}
// Instantiate a prometheusClient with a broken_url
brokenPrometheusClient := NewPrometheusBlockingChecker(papi.Config{Address: "broken_url"}, nil, false, false)
type args struct {
blockers []RebootBlocker
}
tests := []struct {
name string
args args
want bool
}{
{
name: "Do not block on no blocker defined",
args: args{blockers: noCheckers},
want: false,
},
{
name: "Ensure a blocker blocks",
args: args{blockers: []RebootBlocker{blockingChecker}},
want: true,
},
{
name: "Ensure a non-blocker doesn't block",
args: args{blockers: []RebootBlocker{nonblockingChecker}},
want: false,
},
{
name: "Ensure one blocker is enough to block",
args: args{blockers: []RebootBlocker{nonblockingChecker, blockingChecker}},
want: true,
},
{
name: "Do block on error contacting prometheus API",
args: args{blockers: []RebootBlocker{brokenPrometheusClient}},
want: true,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
if got := RebootBlocked(tt.args.blockers...); got != tt.want {
t.Errorf("rebootBlocked() = %v, want %v", got, tt.want)
}
})
}
}

View File

@@ -1,64 +0,0 @@
package blockers
import (
"context"
"fmt"
log "github.com/sirupsen/logrus"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/kubernetes"
)
// Compile-time check that *KubernetesBlockingChecker satisfies RebootBlocker;
// the build fails here if the IsBlocked method is ever removed or changed.
var (
_ RebootBlocker = (*KubernetesBlockingChecker)(nil)
)
// KubernetesBlockingChecker contains info for connecting
// to k8s, and can give info about whether a reboot should be blocked.
// IsBlocked lists pods on nodeName once per entry in filter.
type KubernetesBlockingChecker struct {
// client used to contact kubernetes API
client *kubernetes.Clientset
// nodeName is the node whose pods are inspected (used in the field selector)
nodeName string
// list of label selectors used to filter pods (podSelector); one query per entry
filter []string
}
// NewKubernetesBlockingChecker returns a KubernetesBlockingChecker that uses
// client to look for pods on node nodename matching any of podSelectors.
// The arguments are stored as given; no validation or API call happens here.
func NewKubernetesBlockingChecker(client *kubernetes.Clientset, nodename string, podSelectors []string) *KubernetesBlockingChecker {
	checker := new(KubernetesBlockingChecker)
	checker.client = client
	checker.nodeName = nodename
	checker.filter = podSelectors
	return checker
}
// IsBlocked reports whether a pod on the checker's node prevents the reboot.
//
// For each configured label selector it lists pods in all namespaces that
// are scheduled on the node and are not in phase Succeeded, Failed or
// Unknown (at most 10 per request). The first selector that returns any pod
// blocks the reboot. A failing list call also blocks. Both outcomes are
// logged as warnings; no error is returned to the caller.
func (kb KubernetesBlockingChecker) IsBlocked() bool {
	podsOnNode := "spec.nodeName=" + kb.nodeName + ",status.phase!=Succeeded,status.phase!=Failed,status.phase!=Unknown"

	for _, selector := range kb.filter {
		opts := metav1.ListOptions{
			LabelSelector: selector,
			FieldSelector: podsOnNode,
			Limit:         10,
		}
		pods, err := kb.client.CoreV1().Pods("").List(context.TODO(), opts)
		if err != nil {
			log.Warnf("Reboot blocked: pod query error: %v", err)
			return true
		}
		if len(pods.Items) == 0 {
			continue
		}

		names := make([]string, 0, len(pods.Items))
		for i := range pods.Items {
			names = append(names, pods.Items[i].Name)
		}
		// A continue token means the listing was truncated by Limit.
		if pods.Continue != "" {
			names = append(names, "...")
		}
		log.Warnf("Reboot blocked: matching pods: %v", names)
		return true
	}

	return false
}

View File

@@ -1,121 +0,0 @@
package blockers
import (
"context"
"fmt"
"regexp"
"sort"
"time"
papi "github.com/prometheus/client_golang/api"
v1 "github.com/prometheus/client_golang/api/prometheus/v1"
"github.com/prometheus/common/model"
log "github.com/sirupsen/logrus"
)
// Compile-time check that *PrometheusBlockingChecker satisfies RebootBlocker;
// the build fails here if the IsBlocked method is ever removed or changed.
var (
_ RebootBlocker = (*PrometheusBlockingChecker)(nil)
)
// PrometheusBlockingChecker contains info for connecting
// to prometheus, and can give info about whether a reboot should be blocked.
// All fields are set once by NewPrometheusBlockingChecker.
type PrometheusBlockingChecker struct {
// promConfig is the API configuration the checker was constructed with
promConfig papi.Config
// regexp applied to alert names; nil means every alert is considered
filter *regexp.Regexp
// bool to indicate if only firing alerts should be considered
firingOnly bool
// bool to indicate that we're only blocking on alerts which match the filter
// (when false, alerts matching the filter are the ones ignored)
filterMatchOnly bool
// promClient is the client built from promConfig; ActiveAlerts queries through it
promClient papi.Client
}
// NewPrometheusBlockingChecker creates a new PrometheusBlockingChecker using the given
// Prometheus API config, alert filter, and filtering options.
//
// The signature has no error return, so a failure of papi.NewClient cannot be
// propagated to the caller. Instead of being silently dropped, it is logged
// as a warning. The checker is still returned, holding whatever client
// papi.NewClient produced (presumably nil on error — confirm against the
// client_golang version in use).
func NewPrometheusBlockingChecker(config papi.Config, alertFilter *regexp.Regexp, firingOnly bool, filterMatchOnly bool) PrometheusBlockingChecker {
	promClient, err := papi.NewClient(config)
	if err != nil {
		log.Warnf("Unable to create prometheus client for address %q: %v", config.Address, err)
	}

	return PrometheusBlockingChecker{
		promConfig:      config,
		filter:          alertFilter,
		firingOnly:      firingOnly,
		filterMatchOnly: filterMatchOnly,
		promClient:      promClient,
	}
}
// IsBlocked reports whether Prometheus alerts currently block a reboot.
//
// It blocks when ActiveAlerts fails (the error is logged) or when at least
// one alert is returned. In the latter case the total count is logged along
// with at most the first 10 alert names, followed by "..." when truncated.
// The names themselves are only logged, not returned.
func (pb PrometheusBlockingChecker) IsBlocked() bool {
	active, err := pb.ActiveAlerts()
	if err != nil {
		log.Warnf("Reboot blocked: prometheus query error: %v", err)
		return true
	}

	total := len(active)
	if total == 0 {
		return false
	}

	// Keep the log line short: show ten names at most.
	if total > 10 {
		active = append(active[:10], "...")
	}
	log.Warnf("Reboot blocked: %d active alerts: %v", total, active)
	return true
}
// MetricLabel returns the fixed label "prometheus", a friendlier name than
// the Go type name, intended for the rebootBlockedCounter label.
func (pb PrometheusBlockingChecker) MetricLabel() string {
	const label = "prometheus"
	return label
}
// ActiveAlerts queries Prometheus for the "ALERTS" series and returns the
// sorted, de-duplicated names of the alerts that should block a reboot.
//
// A sample is considered when it carries the alert-name label and its value
// is non-zero. It is then kept when both of the following hold:
//
//   - pb.filter is nil, or pb.filter.MatchString(name) == pb.filterMatchOnly.
//     With filterMatchOnly=false, alerts matching the regexp are excluded
//     from the block-list and will NOT block rebooting; with
//     filterMatchOnly=true only alerts matching the regexp are kept.
//   - pb.firingOnly is false, or the sample's "alertstate" label is "firing".
//
// An error is returned when the checker has no Prometheus client, when the
// query fails, or when the result is not a vector. When nothing is kept, the
// returned slice is nil.
func (pb PrometheusBlockingChecker) ActiveAlerts() ([]string, error) {
	// The constructor cannot report a client-creation failure, so the client
	// may be missing here. Return an error rather than panicking inside
	// Query; IsBlocked treats an error as "blocked".
	if pb.promClient == nil {
		return nil, fmt.Errorf("prometheus client is not initialized")
	}

	api := v1.NewAPI(pb.promClient)

	// get all alerts from prometheus
	value, _, err := api.Query(context.Background(), "ALERTS", time.Now())
	if err != nil {
		return nil, err
	}

	vector, ok := value.(model.Vector)
	if !ok {
		return nil, fmt.Errorf("unexpected value type %v", value)
	}

	// The same alert name can appear in several samples; collect names as a set.
	activeAlertSet := make(map[string]bool)
	for _, sample := range vector {
		alertName, isAlert := sample.Metric[model.AlertNameLabel]
		if !isAlert || sample.Value == 0 {
			continue
		}
		if matchesRegex(pb.filter, string(alertName), pb.filterMatchOnly) && (!pb.firingOnly || sample.Metric["alertstate"] == "firing") {
			activeAlertSet[string(alertName)] = true
		}
	}

	// Left nil (not empty) when no alert qualifies, as callers may rely on it.
	var activeAlerts []string
	for activeAlert := range activeAlertSet {
		activeAlerts = append(activeAlerts, activeAlert)
	}
	sort.Strings(activeAlerts)

	return activeAlerts, nil
}
func matchesRegex(filter *regexp.Regexp, alertName string, filterMatchOnly bool) bool {
if filter == nil {
return true
}
return filter.MatchString(alertName) == filterMatchOnly
}

View File

@@ -1,117 +0,0 @@
// Package checkers provides interfaces and implementations for determining
// whether a system reboot is required. It includes checkers based on file
// presence or custom commands, and supports privileged command execution
// in containerized environments. These checkers are used by kured to
// detect conditions that should trigger node reboots.
// You can use this package if you fork Kured's main loop.
package checkers
import (
"bytes"
"fmt"
"os"
"os/exec"
"strings"
"github.com/google/shlex"
log "github.com/sirupsen/logrus"
)
// Checker is the standard interface to use to check
// if a reboot is required. Its types must implement a
// RebootRequired method which returns a single boolean
// clarifying whether a reboot is expected or not.
type Checker interface {
// RebootRequired returns true when the implementation considers a reboot necessary.
RebootRequired() bool
}
// FileRebootChecker is the default reboot checker.
// It is unprivileged, and tests the presence of a files
type FileRebootChecker struct {
FilePath string
}
// RebootRequired checks the file presence
// needs refactoring to also return an error, instead of leaking it inside the code.
// This needs refactoring to get rid of NewCommand
// This needs refactoring to only contain file location, instead of CheckCommand
func (rc FileRebootChecker) RebootRequired() bool {
if _, err := os.Stat(rc.FilePath); err == nil {
return true
}
return false
}
// NewFileRebootChecker is the constructor for the file based reboot checker
// TODO: Add extra input validation on filePath string here
func NewFileRebootChecker(filePath string) (*FileRebootChecker, error) {
return &FileRebootChecker{
FilePath: filePath,
}, nil
}
// CommandChecker is using a custom command to check
// if a reboot is required. There are two modes of behaviour,
// if Privileged is granted, the NamespacePid is used to nsenter
// the given PID's namespace.
type CommandChecker struct {
// CheckCommand is the full argv that RebootRequired executes:
// element 0 is the program, the rest are its arguments.
CheckCommand []string
// NamespacePid is the pid recorded by NewCommandChecker; when Privileged,
// the constructor uses it to build the nsenter mount-namespace argument.
NamespacePid int
// Privileged records whether the constructor wrapped the command with nsenter.
Privileged bool
}
// RebootRequired runs CheckCommand and maps its outcome to a boolean:
//
//   - the command runs and exits 0: reboot required (true);
//   - the command runs and exits non-zero: no reboot required (false); any
//     exit code other than 1 is additionally logged as a warning;
//   - the command cannot be run at all: logged at Fatal level, which exits
//     the process through logrus' exit function. If that function has been
//     replaced and returns, execution continues and true is returned.
//
// The command line, stdout and stderr are attached to every log entry as
// structured fields. No error is returned; this should later be refactored
// to return errors instead of logging and fataling here.
// CheckCommand must contain at least the program name.
func (rc CommandChecker) RebootRequired() bool {
	bufStdout := new(bytes.Buffer)
	bufStderr := new(bytes.Buffer)
	// #nosec G204 -- CheckCommand is controlled and validated internally
	cmd := exec.Command(rc.CheckCommand[0], rc.CheckCommand[1:]...)
	cmd.Stdout = bufStdout
	cmd.Stderr = bufStderr

	err := cmd.Run()

	// Built after Run so the captured output is included. logrus' Warn, Fatal
	// and Info concatenate their arguments, so key/value pairs must be passed
	// as fields rather than as extra arguments.
	fields := log.Fields{
		"cmd":    strings.Join(cmd.Args, " "),
		"stdout": bufStdout.String(),
		"stderr": bufStderr.String(),
	}

	if err != nil {
		switch err.(type) {
		case *exec.ExitError:
			// We assume a non-zero exit code means 'reboot not required', but of course
			// the user could have misconfigured the sentinel command or something else
			// went wrong during its execution. In that case, not entering a reboot loop
			// is the right thing to do, and we are logging stdout/stderr of the command
			// so it should be obvious what is wrong.
			if code := cmd.ProcessState.ExitCode(); code != 1 {
				log.WithFields(fields).Warnf("sentinel command ended with unexpected exit code: %v", code)
			}
			return false
		default:
			// Something was grossly misconfigured, such as the command path being wrong.
			log.WithFields(fields).Fatalf("Error invoking sentinel command: %v", err)
		}
	}

	log.WithFields(fields).Info("checking if reboot is required")
	return true
}
// NewCommandChecker builds a CommandChecker from a shell-like command string.
//
// sentinelCommand is split into arguments with shlex; a parse failure is
// returned as an error. When privileged is true the arguments are prefixed
// with "/usr/bin/nsenter -m/proc/<pid>/ns/mnt --" so the command runs in the
// mount namespace of pid (for example systemd's, pid 1).
// This relies on hostPID:true and privileged:true to enter host mount space.
// For info, rancher based need different pid, which should be user given,
// until we have a better discovery mechanism.
func NewCommandChecker(sentinelCommand string, pid int, privileged bool) (*CommandChecker, error) {
	parsedCommand, err := shlex.Split(sentinelCommand)
	if err != nil {
		return nil, fmt.Errorf("error parsing provided sentinel command: %v", err)
	}

	var argv []string
	if privileged {
		argv = []string{"/usr/bin/nsenter", fmt.Sprintf("-m/proc/%d/ns/mnt", pid), "--"}
	}
	argv = append(argv, parsedCommand...)

	checker := &CommandChecker{
		CheckCommand: argv,
		NamespacePid: pid,
		Privileged:   privileged,
	}
	return checker, nil
}

View File

@@ -1,87 +0,0 @@
package checkers
import (
log "github.com/sirupsen/logrus"
"reflect"
"testing"
)
// Test_nsEntering checks that a privileged CommandChecker prefixes the parsed
// command with the nsenter invocation for the requested pid.
func Test_nsEntering(t *testing.T) {
	cases := []struct {
		name       string
		pid        int
		command    string
		privileged bool
		want       []string
	}{
		{
			name:       "Ensure command will run with nsenter",
			pid:        1,
			command:    "ls -Fal",
			privileged: true,
			want:       []string{"/usr/bin/nsenter", "-m/proc/1/ns/mnt", "--", "ls", "-Fal"},
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			checker, _ := NewCommandChecker(tc.command, tc.pid, tc.privileged)
			got := checker.CheckCommand
			if !reflect.DeepEqual(got, tc.want) {
				t.Errorf("command parsed as %v, want %v", got, tc.want)
			}
		})
	}
}
// Test_rebootRequired runs CommandChecker.RebootRequired against commands
// that succeed, fail, and cannot be started. logrus' exit function is
// replaced for the duration of each case so a Fatal log is recorded instead
// of terminating the test binary.
func Test_rebootRequired(t *testing.T) {
	cases := []struct {
		name            string
		sentinelCommand []string
		want            bool
		fatals          bool
	}{
		{
			name:            "Ensure rc = 0 means reboot required",
			sentinelCommand: []string{"true"},
			want:            true,
			fatals:          false,
		},
		{
			name:            "Ensure rc != 0 means reboot NOT required",
			sentinelCommand: []string{"false"},
			want:            false,
			fatals:          false,
		},
		{
			name:            "Ensure a wrong command fatals",
			sentinelCommand: []string{"./babar"},
			want:            true,
			fatals:          true,
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			logger := log.StandardLogger()
			exited := false
			logger.ExitFunc = func(int) { exited = true }
			defer func() { logger.ExitFunc = nil }()

			checker := CommandChecker{CheckCommand: tc.sentinelCommand, NamespacePid: 1, Privileged: false}
			got := checker.RebootRequired()
			if got != tc.want {
				t.Errorf("rebootRequired() = %v, want %v", got, tc.want)
			}
			if tc.fatals != exited {
				t.Errorf("fatal flag is %v, want fatal %v", exited, tc.fatals)
			}
		})
	}
}

View File

@@ -1,18 +1,11 @@
// Package daemonsetlock provides mechanisms for leader election and locking
// using Kubernetes DaemonSets. It enables distributed coordination of operations
// (such as reboots) by ensuring only one node acts as the leader at any time,
// leveraging Kubernetes primitives for safe, atomic locking in clusters.
package daemonsetlock
import (
"context"
"encoding/json"
"fmt"
"strings"
"time"
log "github.com/sirupsen/logrus"
v1 "k8s.io/api/apps/v1"
"k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -25,25 +18,6 @@ const (
k8sAPICallRetryTimeout = 5 * time.Minute // How long to wait until we determine that the k8s API is definitively unavailable
)
// Lock defines the interface for acquiring, releasing, and checking
// the status of a reboot coordination lock.
type Lock interface {
Acquire(NodeMeta) (bool, string, error)
Release() error
Holding() (bool, LockAnnotationValue, error)
}
// GenericLock holds the configuration for lock TTL and the delay before releasing it.
type GenericLock struct {
TTL time.Duration
releaseDelay time.Duration
}
// NodeMeta contains metadata about a node relevant to scheduling decisions.
type NodeMeta struct {
Unschedulable bool `json:"unschedulable"`
}
// DaemonSetLock holds all necessary information to do actions
// on the kured ds which holds lock info through annotations.
type DaemonSetLock struct {
@@ -54,98 +28,34 @@ type DaemonSetLock struct {
annotation string
}
// DaemonSetSingleLock holds all necessary information to do actions
// on the kured ds which holds lock info through annotations.
type DaemonSetSingleLock struct {
GenericLock
DaemonSetLock
}
// DaemonSetMultiLock holds all necessary information to do actions
// on the kured ds which holds lock info through annotations, valid
// for multiple nodes
type DaemonSetMultiLock struct {
GenericLock
DaemonSetLock
maxOwners int
}
// LockAnnotationValue contains the lock data,
// which allows persistence across reboots, particularily recording if the
// node was already unschedulable before kured reboot.
// To be modified when using another type of lock storage.
type LockAnnotationValue struct {
type lockAnnotationValue struct {
NodeID string `json:"nodeID"`
Metadata NodeMeta `json:"metadata,omitempty"`
Metadata interface{} `json:"metadata,omitempty"`
Created time.Time `json:"created"`
TTL time.Duration `json:"TTL"`
}
type multiLockAnnotationValue struct {
MaxOwners int `json:"maxOwners"`
LockAnnotations []LockAnnotationValue `json:"locks"`
LockAnnotations []lockAnnotationValue `json:"locks"`
}
// New creates a daemonsetLock object containing the necessary data for follow up k8s requests
func New(client *kubernetes.Clientset, nodeID, namespace, name, annotation string, TTL time.Duration, concurrency int, lockReleaseDelay time.Duration) Lock {
if concurrency > 1 {
return &DaemonSetMultiLock{
GenericLock: GenericLock{
TTL: TTL,
releaseDelay: lockReleaseDelay,
},
DaemonSetLock: DaemonSetLock{
client: client,
nodeID: nodeID,
namespace: namespace,
name: name,
annotation: annotation,
},
maxOwners: concurrency,
}
}
return &DaemonSetSingleLock{
GenericLock: GenericLock{
TTL: TTL,
releaseDelay: lockReleaseDelay,
},
DaemonSetLock: DaemonSetLock{
client: client,
nodeID: nodeID,
namespace: namespace,
name: name,
annotation: annotation,
},
}
}
// GetDaemonSet returns the named DaemonSet resource from the DaemonSetLock's configured client
func (dsl *DaemonSetLock) GetDaemonSet(sleep, timeout time.Duration) (*v1.DaemonSet, error) {
var ds *v1.DaemonSet
var lastError error
err := wait.PollUntilContextTimeout(context.Background(), sleep, timeout, true, func(ctx context.Context) (bool, error) {
if ds, lastError = dsl.client.AppsV1().DaemonSets(dsl.namespace).Get(ctx, dsl.name, metav1.GetOptions{}); lastError != nil {
return false, nil
}
return true, nil
})
if err != nil {
return nil, fmt.Errorf("timed out trying to get daemonset %s in namespace %s: %v", dsl.name, dsl.namespace, lastError)
}
return ds, nil
func New(client *kubernetes.Clientset, nodeID, namespace, name, annotation string) *DaemonSetLock {
return &DaemonSetLock{client, nodeID, namespace, name, annotation}
}
// Acquire attempts to annotate the kured daemonset with lock info from instantiated DaemonSetLock using client-go
func (dsl *DaemonSetSingleLock) Acquire(nodeMetadata NodeMeta) (bool, string, error) {
func (dsl *DaemonSetLock) Acquire(metadata interface{}, TTL time.Duration) (bool, string, error) {
for {
ds, err := dsl.GetDaemonSet(k8sAPICallRetrySleep, k8sAPICallRetryTimeout)
if err != nil {
return false, "", fmt.Errorf("timed out trying to get daemonset %s in namespace %s: %w", dsl.name, dsl.namespace, err)
}
valueString, exists := ds.Annotations[dsl.annotation]
valueString, exists := ds.ObjectMeta.Annotations[dsl.annotation]
if exists {
value := LockAnnotationValue{}
value := lockAnnotationValue{}
if err := json.Unmarshal([]byte(valueString), &value); err != nil {
return false, "", err
}
@@ -155,108 +65,73 @@ func (dsl *DaemonSetSingleLock) Acquire(nodeMetadata NodeMeta) (bool, string, er
}
}
if ds.Annotations == nil {
ds.Annotations = make(map[string]string)
if ds.ObjectMeta.Annotations == nil {
ds.ObjectMeta.Annotations = make(map[string]string)
}
value := LockAnnotationValue{
NodeID: dsl.nodeID,
Metadata: nodeMetadata,
Created: time.Now().UTC(),
TTL: dsl.TTL,
}
value := lockAnnotationValue{NodeID: dsl.nodeID, Metadata: metadata, Created: time.Now().UTC(), TTL: TTL}
valueBytes, err := json.Marshal(&value)
if err != nil {
return false, "", err
}
ds.Annotations[dsl.annotation] = string(valueBytes)
ds.ObjectMeta.Annotations[dsl.annotation] = string(valueBytes)
_, err = dsl.client.AppsV1().DaemonSets(dsl.namespace).Update(context.TODO(), ds, metav1.UpdateOptions{})
_, err = dsl.client.AppsV1().DaemonSets(dsl.namespace).Update(context.Background(), ds, metav1.UpdateOptions{})
if err != nil {
if se, ok := err.(*errors.StatusError); ok && se.ErrStatus.Reason == metav1.StatusReasonConflict {
// Something else updated the resource between us reading and writing - try again soon
time.Sleep(time.Second)
continue
} else {
return false, "", err
}
return false, "", err
}
return true, dsl.nodeID, nil
}
}
// Holding checks if the current node still holds the lock based on the DaemonSet annotation.
func (dsl *DaemonSetSingleLock) Holding() (bool, LockAnnotationValue, error) {
var lockData LockAnnotationValue
ds, err := dsl.GetDaemonSet(k8sAPICallRetrySleep, k8sAPICallRetryTimeout)
if err != nil {
return false, lockData, fmt.Errorf("timed out trying to get daemonset %s in namespace %s: %w", dsl.name, dsl.namespace, err)
}
valueString, exists := ds.Annotations[dsl.annotation]
if exists {
value := LockAnnotationValue{}
if err := json.Unmarshal([]byte(valueString), &value); err != nil {
return false, lockData, err
}
if !ttlExpired(value.Created, value.TTL) {
return value.NodeID == dsl.nodeID, value, nil
}
}
return false, lockData, nil
}
// Release attempts to remove the lock data from the kured ds annotations using client-go
func (dsl *DaemonSetSingleLock) Release() error {
if dsl.releaseDelay > 0 {
log.Infof("Waiting %v before releasing lock", dsl.releaseDelay)
time.Sleep(dsl.releaseDelay)
}
// AcquireMultiple creates and annotates the daemonset with a multiple owner lock
func (dsl *DaemonSetLock) AcquireMultiple(metadata interface{}, TTL time.Duration, maxOwners int) (bool, []string, error) {
for {
ds, err := dsl.GetDaemonSet(k8sAPICallRetrySleep, k8sAPICallRetryTimeout)
if err != nil {
return fmt.Errorf("timed out trying to get daemonset %s in namespace %s: %w", dsl.name, dsl.namespace, err)
return false, []string{}, fmt.Errorf("timed out trying to get daemonset %s in namespace %s: %w", dsl.name, dsl.namespace, err)
}
valueString, exists := ds.Annotations[dsl.annotation]
annotation := multiLockAnnotationValue{}
valueString, exists := ds.ObjectMeta.Annotations[dsl.annotation]
if exists {
value := LockAnnotationValue{}
if err := json.Unmarshal([]byte(valueString), &value); err != nil {
return err
if err := json.Unmarshal([]byte(valueString), &annotation); err != nil {
return false, []string{}, fmt.Errorf("error getting multi lock: %w", err)
}
if value.NodeID != dsl.nodeID {
return fmt.Errorf("not lock holder: %v", value.NodeID)
}
} else {
return fmt.Errorf("lock not held")
}
delete(ds.Annotations, dsl.annotation)
lockPossible, newAnnotation := dsl.canAcquireMultiple(annotation, metadata, TTL, maxOwners)
if !lockPossible {
return false, nodeIDsFromMultiLock(newAnnotation), nil
}
_, err = dsl.client.AppsV1().DaemonSets(dsl.namespace).Update(context.TODO(), ds, metav1.UpdateOptions{})
if ds.ObjectMeta.Annotations == nil {
ds.ObjectMeta.Annotations = make(map[string]string)
}
newAnnotationBytes, err := json.Marshal(&newAnnotation)
if err != nil {
return false, []string{}, fmt.Errorf("error marshalling new annotation lock: %w", err)
}
ds.ObjectMeta.Annotations[dsl.annotation] = string(newAnnotationBytes)
_, err = dsl.client.AppsV1().DaemonSets(dsl.namespace).Update(context.Background(), ds, metav1.UpdateOptions{})
if err != nil {
if se, ok := err.(*errors.StatusError); ok && se.ErrStatus.Reason == metav1.StatusReasonConflict {
// Something else updated the resource between us reading and writing - try again soon
time.Sleep(time.Second)
continue
} else {
return false, []string{}, fmt.Errorf("error updating daemonset with multi lock: %w", err)
}
return err
}
return nil
return true, nodeIDsFromMultiLock(newAnnotation), nil
}
}
func ttlExpired(created time.Time, ttl time.Duration) bool {
if ttl > 0 && time.Since(created) >= ttl {
return true
}
return false
}
func nodeIDsFromMultiLock(annotation multiLockAnnotationValue) []string {
nodeIDs := make([]string, 0, len(annotation.LockAnnotations))
for _, nodeLock := range annotation.LockAnnotations {
@@ -265,7 +140,7 @@ func nodeIDsFromMultiLock(annotation multiLockAnnotationValue) []string {
return nodeIDs
}
func (dsl *DaemonSetLock) canAcquireMultiple(annotation multiLockAnnotationValue, metadata NodeMeta, TTL time.Duration, maxOwners int) (bool, multiLockAnnotationValue) {
func (dsl *DaemonSetLock) canAcquireMultiple(annotation multiLockAnnotationValue, metadata interface{}, TTL time.Duration, maxOwners int) (bool, multiLockAnnotationValue) {
newAnnotation := multiLockAnnotationValue{MaxOwners: maxOwners}
freeSpace := false
if annotation.LockAnnotations == nil || len(annotation.LockAnnotations) < maxOwners {
@@ -287,7 +162,7 @@ func (dsl *DaemonSetLock) canAcquireMultiple(annotation multiLockAnnotationValue
if freeSpace {
newAnnotation.LockAnnotations = append(
newAnnotation.LockAnnotations,
LockAnnotationValue{
lockAnnotationValue{
NodeID: dsl.nodeID,
Metadata: metadata,
Created: time.Now().UTC(),
@@ -300,87 +175,99 @@ func (dsl *DaemonSetLock) canAcquireMultiple(annotation multiLockAnnotationValue
return false, multiLockAnnotationValue{}
}
// Acquire tries to add this node to the multiple-owner lock stored in the
// daemonset annotation.
//
// It reads the daemonset, decodes the current lock annotation (if any), asks
// canAcquireMultiple whether a slot is available and, if so, writes the new
// annotation back. When the update is rejected with a conflict (something
// else modified the daemonset between our read and write) the whole
// read-modify-write cycle is retried after one second.
//
// It returns whether the lock was acquired, the comma-separated node IDs found
// in the annotation returned by canAcquireMultiple, and any error encountered
// while reading, decoding, encoding or updating.
func (dsl *DaemonSetMultiLock) Acquire(nodeMetaData NodeMeta) (bool, string, error) {
	for {
		ds, err := dsl.GetDaemonSet(k8sAPICallRetrySleep, k8sAPICallRetryTimeout)
		if err != nil {
			return false, "", fmt.Errorf("timed out trying to get daemonset %s in namespace %s: %w", dsl.name, dsl.namespace, err)
		}

		annotation := multiLockAnnotationValue{}
		if valueString, exists := ds.Annotations[dsl.annotation]; exists {
			if err := json.Unmarshal([]byte(valueString), &annotation); err != nil {
				return false, "", fmt.Errorf("error getting multi lock: %w", err)
			}
		}

		lockPossible, newAnnotation := dsl.canAcquireMultiple(annotation, nodeMetaData, dsl.TTL, dsl.maxOwners)
		if !lockPossible {
			return false, strings.Join(nodeIDsFromMultiLock(newAnnotation), ","), nil
		}

		// A daemonset without any annotation has a nil map; writing to it would panic.
		if ds.Annotations == nil {
			ds.Annotations = make(map[string]string)
		}
		newAnnotationBytes, err := json.Marshal(&newAnnotation)
		if err != nil {
			return false, "", fmt.Errorf("error marshalling new annotation lock: %w", err)
		}
		ds.Annotations[dsl.annotation] = string(newAnnotationBytes)

		_, err = dsl.client.AppsV1().DaemonSets(dsl.namespace).Update(context.Background(), ds, metav1.UpdateOptions{})
		if err != nil {
			// IsConflict also recognises conflict errors that have been wrapped,
			// which a direct type assertion on *errors.StatusError would miss.
			if errors.IsConflict(err) {
				// Something else updated the resource between us reading and writing - try again soon
				time.Sleep(time.Second)
				continue
			}
			return false, "", fmt.Errorf("error updating daemonset with multi lock: %w", err)
		}
		return true, strings.Join(nodeIDsFromMultiLock(newAnnotation), ","), nil
	}
}
// Holding checks whether the current node is holding a valid lock for the DaemonSetMultiLock.
func (dsl *DaemonSetMultiLock) Holding() (bool, LockAnnotationValue, error) {
var lockdata LockAnnotationValue
// Test attempts to check the kured daemonset lock status (existence, expiry) from instantiated DaemonSetLock using client-go
func (dsl *DaemonSetLock) Test(metadata interface{}) (bool, error) {
ds, err := dsl.GetDaemonSet(k8sAPICallRetrySleep, k8sAPICallRetryTimeout)
if err != nil {
return false, lockdata, fmt.Errorf("timed out trying to get daemonset %s in namespace %s: %w", dsl.name, dsl.namespace, err)
return false, fmt.Errorf("timed out trying to get daemonset %s in namespace %s: %w", dsl.name, dsl.namespace, err)
}
valueString, exists := ds.Annotations[dsl.annotation]
valueString, exists := ds.ObjectMeta.Annotations[dsl.annotation]
if exists {
value := lockAnnotationValue{Metadata: metadata}
if err := json.Unmarshal([]byte(valueString), &value); err != nil {
return false, err
}
if !ttlExpired(value.Created, value.TTL) {
return value.NodeID == dsl.nodeID, nil
}
}
return false, nil
}
// TestMultiple attempts to check the kured daemonset lock status for multi locks
func (dsl *DaemonSetLock) TestMultiple() (bool, error) {
ds, err := dsl.GetDaemonSet(k8sAPICallRetrySleep, k8sAPICallRetryTimeout)
if err != nil {
return false, fmt.Errorf("timed out trying to get daemonset %s in namespace %s: %w", dsl.name, dsl.namespace, err)
}
valueString, exists := ds.ObjectMeta.Annotations[dsl.annotation]
if exists {
value := multiLockAnnotationValue{}
if err := json.Unmarshal([]byte(valueString), &value); err != nil {
return false, lockdata, err
return false, err
}
for _, nodeLock := range value.LockAnnotations {
if nodeLock.NodeID == dsl.nodeID && !ttlExpired(nodeLock.Created, nodeLock.TTL) {
return true, nodeLock, nil
return true, nil
}
}
}
return false, lockdata, nil
return false, nil
}
// Release attempts to remove the lock data for a single node from the multi node annotation
func (dsl *DaemonSetMultiLock) Release() error {
if dsl.releaseDelay > 0 {
log.Infof("Waiting %v before releasing lock", dsl.releaseDelay)
time.Sleep(dsl.releaseDelay)
}
// Release attempts to remove the lock data from the kured ds annotations using client-go
func (dsl *DaemonSetLock) Release() error {
for {
ds, err := dsl.GetDaemonSet(k8sAPICallRetrySleep, k8sAPICallRetryTimeout)
if err != nil {
return fmt.Errorf("timed out trying to get daemonset %s in namespace %s: %w", dsl.name, dsl.namespace, err)
}
valueString, exists := ds.Annotations[dsl.annotation]
valueString, exists := ds.ObjectMeta.Annotations[dsl.annotation]
if exists {
value := lockAnnotationValue{}
if err := json.Unmarshal([]byte(valueString), &value); err != nil {
return err
}
if value.NodeID != dsl.nodeID {
return fmt.Errorf("Not lock holder: %v", value.NodeID)
}
} else {
return fmt.Errorf("Lock not held")
}
delete(ds.ObjectMeta.Annotations, dsl.annotation)
_, err = dsl.client.AppsV1().DaemonSets(dsl.namespace).Update(context.Background(), ds, metav1.UpdateOptions{})
if err != nil {
if se, ok := err.(*errors.StatusError); ok && se.ErrStatus.Reason == metav1.StatusReasonConflict {
// Something else updated the resource between us reading and writing - try again soon
time.Sleep(time.Second)
continue
} else {
return err
}
}
return nil
}
}
// ReleaseMultiple attempts to remove the lock data from the kured ds annotations using client-go
func (dsl *DaemonSetLock) ReleaseMultiple() error {
for {
ds, err := dsl.GetDaemonSet(k8sAPICallRetrySleep, k8sAPICallRetryTimeout)
if err != nil {
return fmt.Errorf("timed out trying to get daemonset %s in namespace %s: %w", dsl.name, dsl.namespace, err)
}
valueString, exists := ds.ObjectMeta.Annotations[dsl.annotation]
modified := false
value := multiLockAnnotationValue{}
if exists {
@@ -405,17 +292,43 @@ func (dsl *DaemonSetMultiLock) Release() error {
if err != nil {
return fmt.Errorf("error marshalling new annotation on release: %v", err)
}
ds.Annotations[dsl.annotation] = string(newAnnotationBytes)
ds.ObjectMeta.Annotations[dsl.annotation] = string(newAnnotationBytes)
_, err = dsl.client.AppsV1().DaemonSets(dsl.namespace).Update(context.TODO(), ds, metav1.UpdateOptions{})
_, err = dsl.client.AppsV1().DaemonSets(dsl.namespace).Update(context.Background(), ds, metav1.UpdateOptions{})
if err != nil {
if se, ok := err.(*errors.StatusError); ok && se.ErrStatus.Reason == metav1.StatusReasonConflict {
// Something else updated the resource between us reading and writing - try again soon
time.Sleep(time.Second)
continue
} else {
return err
}
return err
}
return nil
}
}
// GetDaemonSet returns the named DaemonSet resource from the DaemonSetLock's configured client.
//
// The Get call is attempted immediately and then repeated every `sleep` until
// it succeeds or `timeout` elapses. Each individual call is additionally
// bounded by a context carrying the same `timeout`. On failure the error of
// the last attempt is wrapped into the returned error so callers can inspect
// it with errors.Is / errors.As.
func (dsl *DaemonSetLock) GetDaemonSet(sleep, timeout time.Duration) (*v1.DaemonSet, error) {
	var ds *v1.DaemonSet
	var lastError error
	err := wait.PollImmediate(sleep, timeout, func() (bool, error) {
		ctx, cancel := context.WithTimeout(context.Background(), timeout)
		defer cancel()
		if ds, lastError = dsl.client.AppsV1().DaemonSets(dsl.namespace).Get(ctx, dsl.name, metav1.GetOptions{}); lastError != nil {
			// Swallow the error here so polling continues; it is reported below on timeout.
			return false, nil
		}
		return true, nil
	})
	if err != nil {
		cause := lastError
		if cause == nil {
			// Defensive: never hand a nil operand to %w.
			cause = err
		}
		return nil, fmt.Errorf("Timed out trying to get daemonset %s in namespace %s: %w", dsl.name, dsl.namespace, cause)
	}
	return ds, nil
}
func ttlExpired(created time.Time, ttl time.Duration) bool {
if ttl > 0 && time.Since(created) >= ttl {
return true
}
return false
}

View File

@@ -66,7 +66,7 @@ func TestCanAcquireMultiple(t *testing.T) {
current: multiLockAnnotationValue{},
desired: multiLockAnnotationValue{
MaxOwners: 2,
LockAnnotations: []LockAnnotationValue{
LockAnnotations: []lockAnnotationValue{
{NodeID: node1Name},
},
},
@@ -80,13 +80,13 @@ func TestCanAcquireMultiple(t *testing.T) {
maxOwners: 2,
current: multiLockAnnotationValue{
MaxOwners: 2,
LockAnnotations: []LockAnnotationValue{
LockAnnotations: []lockAnnotationValue{
{NodeID: node2Name},
},
},
desired: multiLockAnnotationValue{
MaxOwners: 2,
LockAnnotations: []LockAnnotationValue{
LockAnnotations: []lockAnnotationValue{
{NodeID: node1Name},
{NodeID: node2Name},
},
@@ -101,7 +101,7 @@ func TestCanAcquireMultiple(t *testing.T) {
maxOwners: 2,
current: multiLockAnnotationValue{
MaxOwners: 2,
LockAnnotations: []LockAnnotationValue{
LockAnnotations: []lockAnnotationValue{
{
NodeID: node2Name,
Created: time.Now().UTC().Add(-1 * time.Minute),
@@ -116,7 +116,7 @@ func TestCanAcquireMultiple(t *testing.T) {
},
desired: multiLockAnnotationValue{
MaxOwners: 2,
LockAnnotations: []LockAnnotationValue{
LockAnnotations: []lockAnnotationValue{
{NodeID: node2Name},
{NodeID: node3Name},
},
@@ -131,7 +131,7 @@ func TestCanAcquireMultiple(t *testing.T) {
maxOwners: 2,
current: multiLockAnnotationValue{
MaxOwners: 2,
LockAnnotations: []LockAnnotationValue{
LockAnnotations: []lockAnnotationValue{
{
NodeID: node2Name,
Created: time.Now().UTC().Add(-1 * time.Hour),
@@ -146,7 +146,7 @@ func TestCanAcquireMultiple(t *testing.T) {
},
desired: multiLockAnnotationValue{
MaxOwners: 2,
LockAnnotations: []LockAnnotationValue{
LockAnnotations: []lockAnnotationValue{
{NodeID: node1Name},
{NodeID: node3Name},
},
@@ -161,7 +161,7 @@ func TestCanAcquireMultiple(t *testing.T) {
maxOwners: 2,
current: multiLockAnnotationValue{
MaxOwners: 2,
LockAnnotations: []LockAnnotationValue{
LockAnnotations: []lockAnnotationValue{
{
NodeID: node2Name,
Created: time.Now().UTC().Add(-1 * time.Hour),
@@ -176,17 +176,17 @@ func TestCanAcquireMultiple(t *testing.T) {
},
desired: multiLockAnnotationValue{
MaxOwners: 2,
LockAnnotations: []LockAnnotationValue{
LockAnnotations: []lockAnnotationValue{
{NodeID: node1Name},
},
},
lockPossible: true,
},
}
nm := NodeMeta{Unschedulable: false}
for _, testCase := range testCases {
t.Run(testCase.name, func(t *testing.T) {
lockPossible, actual := testCase.daemonSetLock.canAcquireMultiple(testCase.current, nm, time.Minute, testCase.maxOwners)
lockPossible, actual := testCase.daemonSetLock.canAcquireMultiple(testCase.current, struct{}{}, time.Minute, testCase.maxOwners)
if lockPossible != testCase.lockPossible {
t.Fatalf(
"unexpected result for lock possible (got %t expected %t new annotation %v",

View File

@@ -1,9 +1,3 @@
// Package delaytick provides utilities for scheduling periodic events
// with an initial randomized delay. It is primarily used to delay the
// start of regular ticks, helping to avoid thundering herd problems
// when multiple nodes begin operations simultaneously.
// You can use this random ticker in other projects, but there is
// no guarantee that it will stay (the initial plan was to move it to internal).
package delaytick
import (
@@ -16,7 +10,6 @@ func New(s rand.Source, d time.Duration) <-chan time.Time {
c := make(chan time.Time)
go func() {
// #nosec G404 -- math/rand is used here for non-security timing jitter
random := rand.New(s)
time.Sleep(time.Duration(float64(d)/2 + float64(d)*random.Float64()))
c <- time.Now()

View File

@@ -1,49 +0,0 @@
package reboot
import (
"bytes"
"fmt"
"os/exec"
"strings"
"github.com/google/shlex"
log "github.com/sirupsen/logrus"
)
// CommandRebooter holds context-information for a reboot with command
type CommandRebooter struct {
RebootCommand []string
}
// Reboot triggers the reboot command.
//
// RebootCommand[0] is executed with the remaining elements as arguments.
// Stdout and stderr of the command are captured and reported either in the
// returned error (on failure) or as log fields (on success).
// It returns an error when no command is configured or when the command
// could not be run successfully.
func (c CommandRebooter) Reboot() error {
	// Guard against an empty command: indexing RebootCommand[0] would panic.
	if len(c.RebootCommand) == 0 {
		return fmt.Errorf("no reboot command specified")
	}
	log.Infof("Invoking command: %s", c.RebootCommand)

	var bufStdout, bufStderr bytes.Buffer
	cmd := exec.Command(c.RebootCommand[0], c.RebootCommand[1:]...) // #nosec G204
	cmd.Stdout = &bufStdout
	cmd.Stderr = &bufStderr

	if err := cmd.Run(); err != nil {
		return fmt.Errorf("error invoking reboot command %s: %w (stdout: %v, stderr: %v)", c.RebootCommand, err, bufStdout.String(), bufStderr.String())
	}

	// logrus' Info concatenates its arguments like fmt.Sprint; key/value pairs
	// have to be attached through WithFields to be logged as structured data.
	log.WithFields(log.Fields{
		"cmd":    strings.Join(cmd.Args, " "),
		"stdout": bufStdout.String(),
		"stderr": bufStderr.String(),
	}).Info("Invoked reboot command")
	return nil
}
// NewCommandRebooter is the constructor to create a CommandRebooter from a string not
// yet shell lexed. You can skip this constructor if you parse the data correctly first
// when instantiating a CommandRebooter instance.
//
// The lexed command is prefixed with an nsenter invocation targeting the mount
// namespace of PID 1. An error is returned when rebootCommand is empty, lexes
// to no words at all (e.g. only whitespace), or cannot be lexed.
func NewCommandRebooter(rebootCommand string) (*CommandRebooter, error) {
	if rebootCommand == "" {
		return nil, fmt.Errorf("no reboot command specified")
	}
	parsedCommand, err := shlex.Split(rebootCommand)
	if err != nil {
		return nil, fmt.Errorf("error %w when parsing reboot command %s", err, rebootCommand)
	}
	// A whitespace-only input lexes to zero words; without this check we would
	// end up invoking nsenter with no command to run.
	if len(parsedCommand) == 0 {
		return nil, fmt.Errorf("no reboot command specified")
	}

	cmd := []string{"/usr/bin/nsenter", fmt.Sprintf("-m/proc/%d/ns/mnt", 1), "--"}
	cmd = append(cmd, parsedCommand...)
	return &CommandRebooter{RebootCommand: cmd}, nil
}

View File

@@ -1,43 +0,0 @@
package reboot
import (
"reflect"
"testing"
)
func TestNewCommandRebooter(t *testing.T) {
type args struct {
rebootCommand string
}
tests := []struct {
name string
args args
want *CommandRebooter
wantErr bool
}{
{
name: "Ensure command is nsenter wrapped",
args: args{"ls -Fal"},
want: &CommandRebooter{RebootCommand: []string{"/usr/bin/nsenter", "-m/proc/1/ns/mnt", "--", "ls", "-Fal"}},
wantErr: false,
},
{
name: "Ensure empty command is erroring",
args: args{""},
want: nil,
wantErr: true,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
got, err := NewCommandRebooter(tt.args.rebootCommand)
if (err != nil) != tt.wantErr {
t.Errorf("NewCommandRebooter() error = %v, wantErr %v", err, tt.wantErr)
return
}
if !reflect.DeepEqual(got, tt.want) {
t.Errorf("NewCommandRebooter() got = %v, want %v", got, tt.want)
}
})
}
}

View File

@@ -1,13 +0,0 @@
// Package reboot provides mechanisms to trigger node reboots using different
// methods, like custom commands or signals.
// Each of those includes constructors and interfaces for handling different reboot
// strategies, supporting privileged command execution via nsenter for containerized environments.
package reboot
// Rebooter is the standard interface to use to execute
// the reboot, after it has been considered as necessary.
// Reboot reports a failure to trigger the reboot through
// its error return value.
type Rebooter interface {
Reboot() error
}

View File

@@ -1,37 +0,0 @@
package reboot
import (
"fmt"
"os"
"syscall"
)
// SignalRebooter holds context-information for a signal reboot.
type SignalRebooter struct {
Signal int
}
// Reboot triggers the reboot by sending the configured signal to PID 1.
//
// It returns an error when the process handle for PID 1 cannot be obtained
// or when delivering the signal fails.
func (c SignalRebooter) Reboot() error {
	process, err := os.FindProcess(1)
	if err != nil {
		return fmt.Errorf("not running on Unix: %w", err)
	}

	// Either PID does not exist, or the signal does not work. Hoping for
	// a decent enough error. The configured signal number is reported rather
	// than a hard-coded name, since the signal is caller-supplied.
	if err := process.Signal(syscall.Signal(c.Signal)); err != nil {
		return fmt.Errorf("signal %d to PID 1 failed: %w", c.Signal, err)
	}
	return nil
}
// NewSignalRebooter is the constructor which sets the signal number.
// Signal numbers lower than 1 are rejected with an error; no upper
// bound is checked.
func NewSignalRebooter(sig int) (*SignalRebooter, error) {
if sig < 1 {
return nil, fmt.Errorf("invalid signal: %v", sig)
}
return &SignalRebooter{Signal: sig}, nil
}

View File

@@ -1,6 +1,3 @@
// Package taints provides utilities to manage Kubernetes node taints for controlling
// pod scheduling and execution. It allows setting, removing, and checking taints on nodes,
// using Kubernetes client-go and JSON patching for atomic updates.
package taints
import (
@@ -68,7 +65,7 @@ func (t *Taint) Disable() {
}
func taintExists(client *kubernetes.Clientset, nodeID, taintName string) (bool, int, *v1.Node) {
updatedNode, err := client.CoreV1().Nodes().Get(context.TODO(), nodeID, metav1.GetOptions{})
updatedNode, err := client.CoreV1().Nodes().Get(context.Background(), nodeID, metav1.GetOptions{})
if err != nil || updatedNode == nil {
log.Fatalf("Error reading node %s: %v", nodeID, err)
}
@@ -156,7 +153,7 @@ func preferNoSchedule(client *kubernetes.Clientset, nodeID, taintName string, ef
log.Fatalf("Error encoding taint patch for node %s: %v", nodeID, err)
}
_, err = client.CoreV1().Nodes().Patch(context.TODO(), nodeID, types.JSONPatchType, patchBytes, metav1.PatchOptions{})
_, err = client.CoreV1().Nodes().Patch(context.Background(), nodeID, types.JSONPatchType, patchBytes, metav1.PatchOptions{})
if err != nil {
log.Fatalf("Error patching taint for node %s: %v", nodeID, err)
}

View File

@@ -50,7 +50,7 @@ func parseWeekdays(days []string) (weekdays, error) {
if err != nil {
return weekdays(0), err
}
// #nosec G115 -- weekday is guaranteed to be between 0 and 6 by parseWeekday()
result |= 1 << uint32(weekday)
}
@@ -59,7 +59,6 @@ func parseWeekdays(days []string) (weekdays, error) {
// Contains returns true if the specified weekday is a member of this set.
func (w weekdays) Contains(day time.Weekday) bool {
// #nosec G115 -- day is time.Weekday [0-6], shift safe within uint32
return uint32(w)&(1<<uint32(day)) != 0
}
@@ -82,11 +81,11 @@ func parseWeekday(day string) (time.Weekday, error) {
if n >= 0 && n < 7 {
return time.Weekday(n), nil
}
return time.Sunday, fmt.Errorf("invalid weekday, number out of range: %s", day)
return time.Sunday, fmt.Errorf("Invalid weekday, number out of range: %s", day)
}
if weekday, ok := dayStrings[strings.ToLower(day)]; ok {
return weekday, nil
}
return time.Sunday, fmt.Errorf("invalid weekday: %s", day)
return time.Sunday, fmt.Errorf("Invalid weekday: %s", day)
}

View File

@@ -1,7 +1,3 @@
// Package timewindow provides utilities for handling days of the week,
// including parsing, representing, and manipulating sets of weekdays.
// It enables flexible specification of time windows for reboot operations
// in kured, supporting various formats and convenience functions.
package timewindow
import (
@@ -81,5 +77,5 @@ func parseTime(s string, loc *time.Location) (time.Time, error) {
}
}
return time.Now(), fmt.Errorf("invalid time format: %s", s)
return time.Now(), fmt.Errorf("Invalid time format: %s", s)
}

View File

@@ -0,0 +1,12 @@
#!/usr/bin/env bash

# Creates the reboot sentinel file on every node reported by kubectl.
# Each node is reached with `docker exec`, so the node name (without its
# "node/" prefix) must also be the name of a running Docker container.
#
# USE KUBECTL_CMD to pass context and/or namespaces,
# e.g. KUBECTL_CMD="kubectl --context kind-foo".
KUBECTL_CMD="${KUBECTL_CMD:-kubectl}"
SENTINEL_FILE="${SENTINEL_FILE:-/var/run/reboot-required}"

# Split KUBECTL_CMD into words so that extra flags are passed as separate
# arguments instead of being treated as part of the executable name.
read -r -a kubectl_cmd <<< "$KUBECTL_CMD"

echo "Creating reboot sentinel on all nodes"

for nodename in $("${kubectl_cmd[@]}" get nodes -o name); do
    # `kubectl get nodes -o name` prints "node/<name>"; keep only <name>.
    container="${nodename#node/}"
    docker exec "$container" hostname
    docker exec "$container" touch "${SENTINEL_FILE}"
done

View File

@@ -1,14 +1,11 @@
#!/usr/bin/env bash
REBOOTCOUNT=${REBOOTCOUNT:-2} # By default we only create two sentinels in create-reboot-sentinels.
NODECOUNT=${NODECOUNT:-5}
KUBECTL_CMD="${KUBECTL_CMD:-kubectl}"
DEBUG="${DEBUG:-false}"
CONTAINER_NAME_FORMAT=${CONTAINER_NAME_FORMAT:-"chart-testing-*"}
kubectl_flags=( )
[[ "$1" != "" ]] && kubectl_flags=("${kubectl_flags[@]}" --context "$1")
tmp_dir=$(mktemp -d -t kured-XXXX)
function gather_logs_and_cleanup {
if [[ -f "$tmp_dir"/node_output ]]; then
rm "$tmp_dir"/node_output
@@ -21,15 +18,15 @@ function gather_logs_and_cleanup {
# This is useful to see if containers have crashed.
echo "docker ps -a:"
docker ps -a
echo "docker journal logs"
journalctl -u docker --no-pager
echo "docker journal logs"
journalctl -u docker --no-pager
# This is useful to see if the nodes have _properly_ rebooted.
# It should show the reboot/two container starts per node.
for id in $(docker ps -a -q); do
for name in $(docker ps -a -f "name=${CONTAINER_NAME_FORMAT}" -q); do
echo "############################################################"
echo "docker logs for container $id:"
docker logs "$id"
echo "docker logs for container $name:"
docker logs "$name"
done
fi
@@ -38,28 +35,30 @@ trap gather_logs_and_cleanup EXIT
declare -A was_unschedulable
declare -A has_recovered
max_attempts="200"
sleep_time=5
max_attempts="60"
sleep_time=60
attempt_num=1
# Get docker info of each of those kind containers. If one has crashed, restart it.
set +o errexit
echo "There are $REBOOTCOUNT nodes total needing reboot in the cluster"
until [ ${#was_unschedulable[@]} == "$REBOOTCOUNT" ] && [ ${#has_recovered[@]} == "$REBOOTCOUNT" ]
echo "There are $NODECOUNT nodes in the cluster"
until [ ${#was_unschedulable[@]} == "$NODECOUNT" ] && [ ${#has_recovered[@]} == "$NODECOUNT" ]
do
echo "${#was_unschedulable[@]} nodes were removed from pool once:" "${!was_unschedulable[@]}"
echo "${#has_recovered[@]} nodes removed from the pool are now back:" "${!has_recovered[@]}"
#"$KUBECTL_CMD" logs -n kube-system -l name=kured --ignore-errors > "$tmp_dir"/node_output
#if [[ "$DEBUG" == "true" ]]; then
# echo "Kured pod logs:"
# cat "$tmp_dir"/node_output
#fi
${KUBECTL_CMD:-kubectl} "${kubectl_flags[@]}" get nodes -o custom-columns=NAME:.metadata.name,SCHEDULABLE:.spec.unschedulable --no-headers | grep -v control-plane > "$tmp_dir"/node_output
"$KUBECTL_CMD" get nodes -o custom-columns=NAME:.metadata.name,SCHEDULABLE:.spec.unschedulable --no-headers > "$tmp_dir"/node_output
if [[ "$DEBUG" == "true" ]]; then
# This is useful to see if a node gets stuck after drain, and doesn't
# come back up.
echo "Result of command kubectl unschedulable nodes:"
echo "Result of command $KUBECTL_CMD get nodes ... showing unschedulable nodes:"
cat "$tmp_dir"/node_output
fi
while read -r node; do
unschedulable=$(echo "$node" | grep true | cut -f 1 -d ' ')
if [ -n "$unschedulable" ] && [ -z ${was_unschedulable["$unschedulable"]+x} ] ; then
@@ -71,15 +70,9 @@ do
echo "$schedulable has recovered!"
has_recovered["$schedulable"]=1
fi
# If the container has crashed, restart it.
node_name=$(echo "$node" | cut -f 1 -d ' ')
stopped_container_id=$(docker container ls --filter=name="$node_name" --filter=status=exited -q)
if [ -n "$stopped_container_id" ]; then echo "Node $stopped_container_id needs restart"; docker start "$stopped_container_id"; echo "Container started."; fi
done < "$tmp_dir"/node_output
if [[ "${#has_recovered[@]}" == "$REBOOTCOUNT" ]]; then
if [[ "${#has_recovered[@]}" == "$NODECOUNT" ]]; then
echo "All nodes recovered."
break
else

View File

@@ -1,429 +0,0 @@
package kind
import (
"bytes"
"fmt"
"math/rand"
"os/exec"
"strconv"
"testing"
"time"
)
const (
kuredDevImage string = "kured:dev"
)
// KindTest cluster deployed by each TestMain function, prepared to run a given test scenario.
type KindTest struct {
kindConfigPath string
clusterName string
timeout time.Duration
deployManifests []string
localImages []string
logsDir string
logBuffer bytes.Buffer
testInstance *testing.T // Maybe move this to testing.TB
}
func (k *KindTest) Write(p []byte) (n int, err error) {
k.testInstance.Helper()
k.logBuffer.Write(p)
return len(p), nil
}
func (k *KindTest) FlushLog() {
k.testInstance.Helper()
k.testInstance.Log(k.logBuffer.String())
k.logBuffer.Reset()
}
// RunCmd runs the given command: cmdDetails[0] is the program and the
// remaining elements are its arguments. Both stdout and stderr of the
// command are written to the KindTest log buffer.
// It returns an error when no command is given or when running it fails.
func (k *KindTest) RunCmd(cmdDetails ...string) error {
	// Guard against an empty call: indexing cmdDetails[0] would panic.
	if len(cmdDetails) == 0 {
		return fmt.Errorf("no command specified")
	}
	cmd := exec.Command(cmdDetails[0], cmdDetails[1:]...)
	// by making KindTest a Writer, we can simply wire k to logs
	// writing to k will write to proper logs.
	cmd.Stdout = k
	cmd.Stderr = k

	return cmd.Run()
}
// Option that can be passed to the NewKind function in order to change the configuration
// of the test cluster
type Option func(k *KindTest)
// Deploy can be passed to NewKind to deploy extra components, in addition to the base deployment.
func Deploy(manifest string) Option {
return func(k *KindTest) {
k.deployManifests = append(k.deployManifests, manifest)
}
}
// ExportLogs can be passed to NewKind to specify the folder where the kubernetes logs will be exported after the tests.
func ExportLogs(folder string) Option {
return func(k *KindTest) {
k.logsDir = folder
}
}
// Timeout for long-running operations (e.g. deployments, readiness probes...)
func Timeout(t time.Duration) Option {
return func(k *KindTest) {
k.timeout = t
}
}
// LocalImage is passed to NewKind to allow loading a local Docker image into the cluster
func LocalImage(nameTag string) Option {
return func(k *KindTest) {
k.localImages = append(k.localImages, nameTag)
}
}
// NewKindTester builds a KindTest for the given cluster name, kind config file
// path and test instance, then applies the given Option instances in order.
// The timeout is initialised to 10 minutes before the options run, so an
// Option may override it. No cluster is created here; see Create.
func NewKindTester(kindClusterName string, filePath string, t *testing.T, options ...Option) *KindTest {
k := &KindTest{
clusterName: kindClusterName,
timeout: 10 * time.Minute,
kindConfigPath: filePath,
testInstance: t,
}
for _, option := range options {
option(k)
}
return k
}
// Prepare the kind cluster.
func (k *KindTest) Create() error {
err := k.RunCmd("kind", "create", "cluster", "--name", k.clusterName, "--config", k.kindConfigPath)
if err != nil {
return fmt.Errorf("failed to create cluster: %v", err)
}
for _, img := range k.localImages {
if err := k.RunCmd("kind", "load", "docker-image", "--name", k.clusterName, img); err != nil {
return fmt.Errorf("failed to load image: %v", err)
}
}
for _, mf := range k.deployManifests {
kubectlContext := fmt.Sprintf("kind-%v", k.clusterName)
if err := k.RunCmd("kubectl", "--context", kubectlContext, "apply", "-f", mf); err != nil {
return fmt.Errorf("failed to deploy manifest: %v", err)
}
}
return nil
}
func (k *KindTest) Destroy() error {
if k.logsDir != "" {
if err := k.RunCmd("kind", "export", "logs", k.logsDir, "--name", k.clusterName); err != nil {
return fmt.Errorf("failed to export logs: %v. will not teardown", err)
}
}
if err := k.RunCmd("kind", "delete", "cluster", "--name", k.clusterName); err != nil {
return fmt.Errorf("failed to destroy cluster: %v", err)
}
return nil
}
func TestE2EWithCommand(t *testing.T) {
t.Parallel()
if testing.Short() {
t.Skip("skipping test in short mode.")
}
var kindClusterConfigs = []string{
"previous",
"current",
"next",
}
// Iterate over each Kubernetes version
for _, version := range kindClusterConfigs {
version := version
// Define a subtest for each combination
t.Run(version, func(t *testing.T) {
t.Parallel() // Allow tests to run in parallel
randomInt := strconv.Itoa(rand.Intn(100))
kindClusterName := fmt.Sprintf("kured-e2e-command-%v-%v", version, randomInt)
kindClusterConfigFile := fmt.Sprintf("../../.github/kind-cluster-%v.yaml", version)
kindContext := fmt.Sprintf("kind-%v", kindClusterName)
k := NewKindTester(kindClusterName, kindClusterConfigFile, t, LocalImage(kuredDevImage), Deploy("../../kured-rbac.yaml"), Deploy("testfiles/kured-ds.yaml"))
defer k.FlushLog()
err := k.Create()
if err != nil {
t.Fatalf("Error creating cluster %v", err)
}
defer func(k *KindTest) {
err := k.Destroy()
if err != nil {
t.Fatalf("Error destroying cluster %v", err)
}
}(k)
k.Write([]byte("Now running e2e tests"))
if err := k.RunCmd("bash", "testfiles/create-reboot-sentinels.sh", kindContext); err != nil {
t.Fatalf("failed to create sentinels: %v", err)
}
if err := k.RunCmd("bash", "testfiles/follow-coordinated-reboot.sh", kindContext); err != nil {
t.Fatalf("failed to follow reboot: %v", err)
}
})
}
}
func TestE2EWithSignal(t *testing.T) {
t.Parallel()
if testing.Short() {
t.Skip("skipping test in short mode.")
}
var kindClusterConfigs = []string{
"previous",
"current",
"next",
}
// Iterate over each Kubernetes version
for _, version := range kindClusterConfigs {
version := version
// Define a subtest for each combination
t.Run(version, func(t *testing.T) {
t.Parallel() // Allow tests to run in parallel
randomInt := strconv.Itoa(rand.Intn(100))
kindClusterName := fmt.Sprintf("kured-e2e-signal-%v-%v", version, randomInt)
kindClusterConfigFile := fmt.Sprintf("../../.github/kind-cluster-%v.yaml", version)
kindContext := fmt.Sprintf("kind-%v", kindClusterName)
k := NewKindTester(kindClusterName, kindClusterConfigFile, t, LocalImage(kuredDevImage), Deploy("../../kured-rbac.yaml"), Deploy("testfiles/kured-ds-signal.yaml"))
defer k.FlushLog()
err := k.Create()
if err != nil {
t.Fatalf("Error creating cluster %v", err)
}
defer func(k *KindTest) {
err := k.Destroy()
if err != nil {
t.Fatalf("Error destroying cluster %v", err)
}
}(k)
k.Write([]byte("Now running e2e tests"))
if err := k.RunCmd("bash", "testfiles/create-reboot-sentinels.sh", kindContext); err != nil {
t.Fatalf("failed to create sentinels: %v", err)
}
if err := k.RunCmd("bash", "testfiles/follow-coordinated-reboot.sh", kindContext); err != nil {
t.Fatalf("failed to follow reboot: %v", err)
}
})
}
}
func TestE2EConcurrentWithCommand(t *testing.T) {
t.Parallel()
if testing.Short() {
t.Skip("skipping test in short mode.")
}
var kindClusterConfigs = []string{
"previous",
"current",
"next",
}
// Iterate over each Kubernetes version
for _, version := range kindClusterConfigs {
version := version
// Define a subtest for each combination
t.Run(version, func(t *testing.T) {
t.Parallel() // Allow tests to run in parallel
randomInt := strconv.Itoa(rand.Intn(100))
kindClusterName := fmt.Sprintf("kured-e2e-concurrentcommand-%v-%v", version, randomInt)
kindClusterConfigFile := fmt.Sprintf("../../.github/kind-cluster-%v.yaml", version)
kindContext := fmt.Sprintf("kind-%v", kindClusterName)
k := NewKindTester(kindClusterName, kindClusterConfigFile, t, LocalImage(kuredDevImage), Deploy("../../kured-rbac.yaml"), Deploy("testfiles/kured-ds-concurrent-command.yaml"))
defer k.FlushLog()
err := k.Create()
if err != nil {
t.Fatalf("Error creating cluster %v", err)
}
defer func(k *KindTest) {
err := k.Destroy()
if err != nil {
t.Fatalf("Error destroying cluster %v", err)
}
}(k)
k.Write([]byte("Now running e2e tests"))
if err := k.RunCmd("bash", "testfiles/create-reboot-sentinels.sh", kindContext); err != nil {
t.Fatalf("failed to create sentinels: %v", err)
}
if err := k.RunCmd("bash", "testfiles/follow-coordinated-reboot.sh", kindContext); err != nil {
t.Fatalf("failed to follow reboot: %v", err)
}
})
}
}
func TestE2EConcurrentWithSignal(t *testing.T) {
t.Parallel()
if testing.Short() {
t.Skip("skipping test in short mode.")
}
var kindClusterConfigs = []string{
"previous",
"current",
"next",
}
// Iterate over each Kubernetes version
for _, version := range kindClusterConfigs {
version := version
// Define a subtest for each combination
t.Run(version, func(t *testing.T) {
t.Parallel() // Allow tests to run in parallel
randomInt := strconv.Itoa(rand.Intn(100))
kindClusterName := fmt.Sprintf("kured-e2e-concurrentsignal-%v-%v", version, randomInt)
kindClusterConfigFile := fmt.Sprintf("../../.github/kind-cluster-%v.yaml", version)
kindContext := fmt.Sprintf("kind-%v", kindClusterName)
k := NewKindTester(kindClusterName, kindClusterConfigFile, t, LocalImage(kuredDevImage), Deploy("../../kured-rbac.yaml"), Deploy("testfiles/kured-ds-concurrent-signal.yaml"))
defer k.FlushLog()
err := k.Create()
if err != nil {
t.Fatalf("Error creating cluster %v", err)
}
defer func(k *KindTest) {
err := k.Destroy()
if err != nil {
t.Fatalf("Error destroying cluster %v", err)
}
}(k)
k.Write([]byte("Now running e2e tests"))
if err := k.RunCmd("bash", "testfiles/create-reboot-sentinels.sh", kindContext); err != nil {
t.Fatalf("failed to create sentinels: %v", err)
}
if err := k.RunCmd("bash", "testfiles/follow-coordinated-reboot.sh", kindContext); err != nil {
t.Fatalf("failed to follow reboot: %v", err)
}
})
}
}
// TestCordonningIsKept runs testfiles/node-stays-as-cordonned.sh against a
// cluster built from the "next" kind configuration, once per kured manifest
// variant:
//
//   - "concurrency1" deploys testfiles/kured-ds-signal.yaml
//   - "concurrency2" deploys testfiles/kured-ds-concurrent-signal.yaml
//
// Each variant is a parallel subtest with its own cluster, which is destroyed
// when the subtest returns. The whole test is skipped in -short mode.
func TestCordonningIsKept(t *testing.T) {
	t.Parallel()
	if testing.Short() {
		t.Skip("skipping test in short mode.")
	}

	variants := []struct {
		name     string
		manifest string
	}{
		{name: "concurrency1", manifest: "testfiles/kured-ds-signal.yaml"},
		{name: "concurrency2", manifest: "testfiles/kured-ds-concurrent-signal.yaml"},
	}

	for _, variant := range variants {
		variant := variant // per-iteration copy for the parallel closure below
		t.Run(variant.name, func(t *testing.T) {
			t.Parallel()

			// A random suffix keeps cluster names apart between runs.
			suffix := strconv.Itoa(rand.Intn(100))
			clusterName := fmt.Sprintf("kured-e2e-cordon-%v-%v", variant.name, suffix)
			kubeContext := fmt.Sprintf("kind-%v", clusterName)

			k := NewKindTester(
				clusterName,
				"../../.github/kind-cluster-next.yaml",
				t,
				LocalImage(kuredDevImage),
				Deploy("../../kured-rbac.yaml"),
				Deploy(variant.manifest),
			)
			defer k.FlushLog()

			if err := k.Create(); err != nil {
				t.Fatalf("Error creating cluster %v", err)
			}
			// Registered only after a successful Create, so a failed creation
			// does not attempt a destroy.
			defer func() {
				if err := k.Destroy(); err != nil {
					t.Fatalf("Error destroying cluster %v", err)
				}
			}()

			k.Write([]byte("Now running e2e tests"))

			if err := k.RunCmd("bash", "testfiles/node-stays-as-cordonned.sh", kubeContext); err != nil {
				t.Fatalf("node did not reboot in time: %v", err)
			}
		})
	}
}
// TestE2EBlocker runs the reboot-blocker scenarios against a cluster built
// from the "next" kind configuration.
//
// For every variant name V (currently only "podblocker") a parallel subtest
// deploys testfiles/kured-ds-V.yaml and then runs testfiles/V.sh with the kube
// context as its argument. The cluster is destroyed when the subtest returns.
// The whole test is skipped in -short mode.
func TestE2EBlocker(t *testing.T) {
	t.Parallel()
	if testing.Short() {
		t.Skip("skipping test in short mode.")
	}

	for _, variant := range []string{"podblocker"} {
		variant := variant // per-iteration copy for the parallel closure below
		t.Run(variant, func(t *testing.T) {
			t.Parallel()

			// A random suffix keeps cluster names apart between runs.
			// NOTE(review): the "cordon" prefix is the same one used by
			// TestCordonningIsKept; possibly a copy-paste leftover — confirm
			// before renaming.
			suffix := strconv.Itoa(rand.Intn(100))
			clusterName := fmt.Sprintf("kured-e2e-cordon-%v-%v", variant, suffix)
			kubeContext := fmt.Sprintf("kind-%v", clusterName)

			// Both the manifest and the driver script are derived from the
			// variant name.
			manifest := fmt.Sprintf("testfiles/kured-ds-%v.yaml", variant)
			script := fmt.Sprintf("testfiles/%v.sh", variant)

			k := NewKindTester(
				clusterName,
				"../../.github/kind-cluster-next.yaml",
				t,
				LocalImage(kuredDevImage),
				Deploy("../../kured-rbac.yaml"),
				Deploy(manifest),
			)
			defer k.FlushLog()

			if err := k.Create(); err != nil {
				t.Fatalf("Error creating cluster %v", err)
			}
			// Registered only after a successful Create, so a failed creation
			// does not attempt a destroy.
			defer func() {
				if err := k.Destroy(); err != nil {
					t.Fatalf("Error destroying cluster %v", err)
				}
			}()

			k.Write([]byte("Now running e2e tests"))

			if err := k.RunCmd("bash", script, kubeContext); err != nil {
				t.Fatalf("node blocker test did not succeed: %v", err)
			}
		})
	}
}

View File

@@ -1,11 +0,0 @@
#!/usr/bin/env bash
# Create the reboot sentinel file on every non-control-plane node.
#
# Usage: <script> [KUBE_CONTEXT]
#   KUBE_CONTEXT   optional kubectl context; passed as --context when given.
#
# Environment:
#   KUBECTL_CMD    kubectl command to run (default: kubectl)
#   SENTINEL_FILE  sentinel path inside the node container
#                  (default: /var/run/reboot-required)
#
# Exits non-zero when the nodes cannot be listed, when no worker node is
# found, or when the sentinel cannot be created on any one of the nodes.

# Stop at the first failing command, and let a failing kubectl surface through
# the pipeline below instead of being masked by grep's exit status.
set -o errexit -o pipefail

kubectl_flags=( )
if [[ "$1" != "" ]]; then
    kubectl_flags=("${kubectl_flags[@]}" --context "$1")
fi

# To speed up the system, let's not kill the control plane.
# The list is captured before the loop: a failure inside `for x in $(...)`
# would be ignored and the loop would simply run zero times.
if ! worker_nodes=$(${KUBECTL_CMD:-kubectl} "${kubectl_flags[@]}" get nodes -o name | grep -v control-plane); then
    echo "Could not list any worker node" >&2
    exit 1
fi

# Word splitting is intentional here: one node reference per line.
for nodename in ${worker_nodes}; do
    # `get nodes -o name` prints "node/<name>"; the container is named <name>.
    container="${nodename#node/}"
    echo "Creating reboot sentinel on $nodename"
    docker exec "${container}" hostname
    docker exec "${container}" touch "${SENTINEL_FILE:-/var/run/reboot-required}"
done

View File

@@ -1,59 +0,0 @@
#!/usr/bin/env bash
# Check that a node which was cordoned before its reboot is still cordoned
# once it is back, while a node that was schedulable comes back schedulable.
#
# Usage: <script> [KUBE_CONTEXT]
#   KUBE_CONTEXT   optional kubectl context; passed as --context when given.
#
# Environment:
#   KUBECTL_CMD    kubectl command to run (default: kubectl)
#   SENTINEL_FILE  sentinel path inside the node containers
#                  (default: /var/run/reboot-required)
#
# The polling loops have no timeout of their own; the caller is expected to
# bound the total run time.

kubectl_flags=( )
[[ "$1" != "" ]] && kubectl_flags=("${kubectl_flags[@]}" --context "$1")

# Resolved once so that the file being created and the file being polled are
# always the same one, including when SENTINEL_FILE is overridden.
sentinel_file="${SENTINEL_FILE:-/var/run/reboot-required}"

# Run kubectl (or the KUBECTL_CMD override) with the optional --context flag.
# Every kubectl call in this script goes through here so the override and the
# context are applied consistently.
kube() {
    ${KUBECTL_CMD:-kubectl} "${kubectl_flags[@]}" "$@"
}

# Mark the first worker as unschedulable before any reboot is requested.
cordon() {
    kube cordon "${precordonned_node}"
}

# Request a reboot on both the cordoned and the schedulable worker.
create_sentinel() {
    docker exec "${precordonned_node}" touch "${sentinel_file}"
    docker exec "${notcordonned_node}" touch "${sentinel_file}"
}

# Poll every 3 seconds until the sentinel can no longer be stat'ed on the
# pre-cordoned node.
check_reboot_required() {
    while true; do
        if docker exec "${precordonned_node}" stat "${sentinel_file}" > /dev/null; then
            echo "Reboot still required"
        else
            return 0
        fi
        sleep 3
    done
}

# Poll every 3 seconds until the second column of `kubectl get node` for
# node $1 equals $2, reporting the observed state on every miss.
wait_for_node_status() {
    local node="$1" expected="$2" result
    while true; do
        result=$(kube get node "${node}" --no-headers | awk '{print $2;}')
        if [[ "${result}" == "${expected}" ]]; then
            return 0
        fi
        echo "Node ${node} in state ${result}"
        sleep 3
    done
}

# Wait until the pre-cordoned node is Ready and still unschedulable.
check_node_back_online_as_cordonned() {
    sleep 5 # For safety, wait for 5 seconds, so that the kubectl command succeeds.
    # This test might be giving us false positive until we work on reliability of the
    # test.
    wait_for_node_status "${precordonned_node}" "Ready,SchedulingDisabled"
}

# Wait until the other node is Ready and schedulable.
check_node_back_online_as_uncordonned() {
    wait_for_node_status "${notcordonned_node}" "Ready"
}

### Start main
worker_nodes=$(kube get nodes -o custom-columns=name:metadata.name --no-headers | grep worker)
# First worker gets cordoned up front, last worker is left schedulable.
precordonned_node=$(echo "$worker_nodes" | head -n 1)
notcordonned_node=$(echo "$worker_nodes" | tail -n 1)

# Wait for kured to install correctly
sleep 15
cordon
create_sentinel
check_reboot_required
echo "Node has rebooted, but may take time to come back ready"
check_node_back_online_as_cordonned
check_node_back_online_as_uncordonned
echo "Showing final node state"
kube get nodes
echo "Test successful"

View File

@@ -1,54 +0,0 @@
#!/usr/bin/env bash
# Blocking-pod scenario: label the first worker node, schedule a pod labelled
# app=blocker onto it, request a reboot of that node, then wait for the kured
# logs to contain a line matching "Reboot.*blocked".
#
# Usage: <script> [KUBE_CONTEXT]
#   KUBE_CONTEXT   optional kubectl context; passed as --context when given.
#
# Environment:
#   KUBECTL_CMD    kubectl command to run (default: kubectl)
#   SENTINEL_FILE  sentinel path inside the node container
#                  (default: /var/run/reboot-required)
#
# Exits 1 when the expected log line has not shown up after 100 attempts
# spaced 5 seconds apart.

kubectl_flags=( )
if [[ "$1" != "" ]]; then
    kubectl_flags=("${kubectl_flags[@]}" --context "$1")
fi

# Run kubectl (or the KUBECTL_CMD override) with the optional --context flag.
kube() {
    ${KUBECTL_CMD:-kubectl} "${kubectl_flags[@]}" "$@"
}

# EXIT handler: print the logs of every running docker container, followed by
# the logs of all kured pods.
function gather_logs_and_cleanup {
    local container
    for container in $(docker ps -q); do
        echo "############################################################"
        echo "docker logs for container $container:"
        docker logs "$container"
    done
    kube logs ds/kured --all-pods -n kube-system
}
trap gather_logs_and_cleanup EXIT

# Setup is best effort: errors are tolerated until errexit is switched on
# further down.
set +o errexit
worker=$(kube get nodes -o custom-columns=name:metadata.name --no-headers | grep worker | head -n 1)

kube label nodes "$worker" blocked-host=yes

# The nodeSelector pins the pod to the node labelled just above.
kube apply -f - << EOF
apiVersion: v1
kind: Pod
metadata:
  name: nginx
  labels:
    app: blocker
spec:
  containers:
  - name: nginx
    image: nginx
    imagePullPolicy: IfNotPresent
  nodeSelector:
    blocked-host: "yes"
EOF

docker exec "$worker" touch "${SENTINEL_FILE:-/var/run/reboot-required}"

set -o errexit
attempt_limit="100"
attempt=1
pause=5

# Keep polling the kured logs until the blocked-reboot message appears.
while ! kube logs ds/kured --all-pods -n kube-system | grep -i -e "Reboot.*blocked"
do
    if (( attempt == attempt_limit )); then
        echo "Attempt $attempt failed and there are no more attempts left!"
        exit 1
    fi
    echo "Did not find 'reboot blocked' in the log, retrying in $pause seconds (Attempt #$attempt)"
    sleep "$pause"
    attempt=$(( attempt + 1 ))
done