Compare commits

...

44 Commits
1.4.0 ... 1.4.4

Author SHA1 Message Date
Daniel Holbach
2762837dac Merge pull request #164 from dholbach/1.4.4-release
Prep for 1.4.4 release
2020-07-01 11:27:51 +02:00
Daniel Holbach
d507361a45 update chart version 2020-07-01 10:49:12 +02:00
Daniel Holbach
1d1f22c93b Prep for 1.4.4 release
Drop bit in the docs about updating image tag - not necessary
	if you use the instructions.
2020-07-01 10:43:06 +02:00
Daniel Holbach
644aca3fa0 Merge pull request #163 from ckotzbauer/chart-fixes
Additional chart changes for service-handling
2020-06-30 20:28:30 +02:00
Christian Kotzbauer
59b078f38d bump and fix
Signed-off-by: Christian Kotzbauer <christian.kotzbauer@gmail.com>
2020-06-30 19:21:06 +02:00
Christian Kotzbauer
36cef41c20 split matchLabels template
Signed-off-by: Christian Kotzbauer <christian.kotzbauer@gmail.com>
2020-06-30 19:18:47 +02:00
Christian Kotzbauer
eb617adc2b restructured and improved service
Signed-off-by: Christian Kotzbauer <christian.kotzbauer@gmail.com>
2020-06-30 19:15:32 +02:00
Daniel Holbach
2afd04ddd3 Merge pull request #162 from ckotzbauer/chart-fixes
Several small chart fixes
2020-06-30 18:25:53 +02:00
Christian Kotzbauer
3eb7f17b3a bumped kured to upcoming 1.4.3
fixed servicemonitor indent
fixed quotes for arguments

Signed-off-by: Christian Kotzbauer <christian.kotzbauer@gmail.com>
2020-06-30 18:00:05 +02:00
Daniel Holbach
a87e7b28d2 Merge pull request #158 from dholbach/use-github-token
Use GitHub token
2020-06-30 10:42:10 +02:00
Daniel Holbach
c6f341ec16 update things for 1.4.2 release 2020-06-30 10:37:39 +02:00
Daniel Holbach
14bda85a03 Use GITHUB_TOKEN for releasing chart
https://help.github.com/en/actions/configuring-and-managing-workflows/authenticating-with-the-github_token#about-the-github_token-secret
2020-06-30 10:35:13 +02:00
Daniel Holbach
d6f7609081 Merge pull request #157 from dholbach/update-readme
Update readme
2020-06-30 10:19:57 +02:00
Daniel Holbach
2b4830a0f6 make markdownlint happier 2020-06-30 09:58:43 +02:00
Daniel Holbach
688ba8ef72 update version 2020-06-30 09:53:44 +02:00
Daniel Holbach
aebe9463fd Merge pull request #155 from ckotzbauer/chart-release-1.4.1
prepare chart-release for 1.4.1
2020-06-30 09:30:59 +02:00
Christian Kotzbauer
900f58ae2d prepare chart-release for 1.4.1
Signed-off-by: Christian Kotzbauer <christian.kotzbauer@gmail.com>
2020-06-29 17:22:51 +02:00
Daniel Holbach
b177a9f6eb Merge pull request #152 from dholbach/revert-139
Revert #139
2020-06-26 17:37:43 +02:00
Daniel Holbach
8fafad18bb Revert #139
This is a follow-up to #150, so we can get a 1.4.x release
	out that will be geared towards k8s 1.1[6-8].

	Update to latest 1.17 kubectl: 1.17.7.
2020-06-26 17:30:01 +02:00
Daniel Holbach
cc806b886c Merge pull request #150 from ckotzbauer/initial-helm-chart
Initial Kured Helm-Chart
2020-06-26 15:52:28 +02:00
Christian Kotzbauer
b78ba8e73b allow multiple blockingPodSelector and rebootDays to be defined
Signed-off-by: Christian Kotzbauer <christian.kotzbauer@gmail.com>
2020-06-22 16:21:30 +02:00
Christian Kotzbauer
1ddd45d90b updated maintainers and version
Signed-off-by: Christian Kotzbauer <christian.kotzbauer@gmail.com>
2020-06-22 15:20:48 +02:00
Christian Kotzbauer
a815867584 updated chart docs
Signed-off-by: Christian Kotzbauer <christian.kotzbauer@gmail.com>
2020-06-20 12:04:27 +02:00
Christian Kotzbauer
d271165496 rearrange default values
Signed-off-by: Christian Kotzbauer <christian.kotzbauer@gmail.com>
2020-06-20 10:50:54 +02:00
Christian Kotzbauer
2bb7b7937e add service and serviceMonitor
Signed-off-by: Christian Kotzbauer <christian.kotzbauer@gmail.com>
2020-06-20 10:46:54 +02:00
Christian Kotzbauer
2afa0a9da7 add cli-flags as config-object
Signed-off-by: Christian Kotzbauer <christian.kotzbauer@gmail.com>
2020-06-20 10:17:50 +02:00
Christian Kotzbauer
ab0b5d137c add affinity support; removed duplicate restartPolicy
Signed-off-by: Christian Kotzbauer <christian.kotzbauer@gmail.com>
2020-06-20 09:52:41 +02:00
Christian Kotzbauer
d3ea5639f4 fix role inconsistencies
Signed-off-by: Christian Kotzbauer <christian.kotzbauer@gmail.com>
2020-06-20 09:49:20 +02:00
Christian Kotzbauer
02bb6d650e use template function for labels
Signed-off-by: Christian Kotzbauer <christian.kotzbauer@gmail.com>
2020-06-20 09:45:39 +02:00
Christian Kotzbauer
c473caafc8 remove autolock feature
Signed-off-by: Christian Kotzbauer <christian.kotzbauer@gmail.com>
2020-06-20 09:37:44 +02:00
Christian Kotzbauer
a574a67c61 add unmodified chart from stable repo
Signed-off-by: Christian Kotzbauer <christian.kotzbauer@gmail.com>
2020-06-20 09:36:07 +02:00
Christian Kotzbauer
4420dc82d6 add chart github-actions
Signed-off-by: Christian Kotzbauer <christian.kotzbauer@gmail.com>
2020-06-20 09:35:35 +02:00
Daniel Holbach
cdbcf8d4a0 Merge pull request #149 from dholbach/circleci-ignore-ghpages
make sure Circle CI ignores the 'gh-pages' branch
2020-06-17 12:50:56 +02:00
Daniel Holbach
3508110f52 make sure Circle CI ignores the 'gh-pages' branch 2020-06-17 12:27:01 +02:00
Bryan Boreham
ec75533394 Merge pull request #119 from michalschott/annotationTTL
Adding --annotation-ttl for automatic unlock
2020-05-20 11:30:44 +01:00
Daniel Holbach
edeefcd2b9 Merge pull request #139 from dholbach/move-to-1.18
replay changes from #127
2020-05-11 16:36:51 +02:00
Michal Schott
cf03bc587c Adding unit tests for ttlExpired. 2020-05-05 22:37:18 +02:00
Michal Schott
59a6700add Renaming flag as suggested. 2020-05-05 20:52:10 +02:00
Michal Schott
64ebf53264 Typo in logic. 2020-05-05 14:32:41 +02:00
Michal Schott
615e3d4840 Calculate time difference easier. 2020-05-05 14:10:23 +02:00
Michal Schott
1257d97ead Be clean when this feature is disabled. 2020-05-05 14:10:23 +02:00
Michal Schott
1fc2522c0f Removing spurious change. 2020-05-05 14:10:23 +02:00
Michal Schott
7fb16fed9b Adding annotationTTL. 2020-05-05 14:10:22 +02:00
Daniel Holbach
72a31030db replay changes from #127 2020-05-01 09:07:16 +02:00
26 changed files with 743 additions and 51 deletions

View File

@@ -24,3 +24,5 @@ workflows:
filters:
tags:
only: /.*/
branches:
ignore: gh-pages

6
.github/ct.yaml vendored Normal file
View File

@@ -0,0 +1,6 @@
# See https://github.com/helm/chart-testing#configuration
remote: origin
chart-dirs:
- charts
chart-repos: []
helm-extra-args: --timeout 600s

32
.github/workflows/chart-lint.yml vendored Normal file
View File

@@ -0,0 +1,32 @@
name: lint-chart
on:
pull_request:
paths:
- "charts/**"
jobs:
lint-test:
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v2
with:
fetch-depth: "0"
- name: Run chart-testing (lint)
id: lint
uses: helm/chart-testing-action@v1.0.0-rc.2
with:
command: lint
config: .github/ct.yaml
- name: Create kind cluster
uses: helm/kind-action@v1.0.0-rc.1
if: steps.lint.outputs.changed == 'true'
- name: Run chart-testing (install)
uses: helm/chart-testing-action@v1.0.0-rc.2
with:
command: install
config: .github/ct.yaml

16
.github/workflows/chart-release.yml vendored Normal file
View File

@@ -0,0 +1,16 @@
name: release-chart
on:
push:
tags:
- "*"
jobs:
publish:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- name: Publish Helm chart
uses: stefanprodan/helm-gh-pages@master
with:
token: ${{ secrets.GITHUB_TOKEN }}
charts_dir: charts

View File

@@ -91,9 +91,6 @@ cat kured-ds.yaml >> "$MANIFEST"
sed -i "s#docker.io/weaveworks/kured#docker.io/weaveworks/kured:$VERSION#g" "$MANIFEST"
```
The last thing you need to do is update the `image:` to point to the release
tag, e.g. `docker.io/weaveworks/kured:1.3.0`.
Now you can head to the Github UI, use the version number as tag and upload the
`kured-<release>-dockerhub.yaml` file.

View File

@@ -1,21 +1,24 @@
# kured - Kubernetes Reboot Daemon
<img src="https://github.com/weaveworks/kured/raw/master/img/logo.png" align="right"/>
* [Introduction](#introduction)
* [Kubernetes & OS Compatibility](#kubernetes-&-os-compatibility)
* [Installation](#installation)
* [Configuration](#configuration)
* [Reboot Sentinel File & Period](#reboot-sentinel-file-&-period)
* [Setting a schedule](#setting-a-schedule)
* [Blocking Reboots via Alerts](#blocking-reboots-via-alerts)
* [Blocking Reboots via Pods](#blocking-reboots-via-pods)
* [Prometheus Metrics](#prometheus-metrics)
* [Slack Notifications](#slack-notifications)
* [Overriding Lock Configuration](#overriding-lock-configuration)
* [Reboot Sentinel File & Period](#reboot-sentinel-file-&-period)
* [Setting a schedule](#setting-a-schedule)
* [Blocking Reboots via Alerts](#blocking-reboots-via-alerts)
* [Blocking Reboots via Pods](#blocking-reboots-via-pods)
* [Prometheus Metrics](#prometheus-metrics)
* [Slack Notifications](#slack-notifications)
* [Overriding Lock Configuration](#overriding-lock-configuration)
* [Operation](#operation)
* [Testing](#testing)
* [Disabling Reboots](#disabling-reboots)
* [Manual Unlock](#manual-unlock)
* [Testing](#testing)
* [Disabling Reboots](#disabling-reboots)
* [Manual Unlock](#manual-unlock)
* [Automatic Unlock](#automatic-unlock)
* [Building](#building)
* [Frequently Asked/Anticipated Questions](#frequently-askedanticipated-questions)
* [Getting Help](#getting-help)
@@ -41,12 +44,12 @@ compatibility of one minor version between client and server:
| kured | kubectl | k8s.io/client-go | k8s.io/apimachinery | expected kubernetes compatibility |
|--------|---------|------------------|---------------------|-----------------------------------|
| master | 1.17.5 | v0.17.0 | v0.17.0 | 1.16.x, 1.17.x, 1.18.x |
| 1.4.0 | 1.17.5 | v0.17.0 | v0.17.0 | 1.16.x, 1.17.x, 1.18.x |
| master | 1.17.7 | v0.17.0 | v0.17.0 | 1.16.x, 1.17.x, 1.18.x |
| 1.4.4 | 1.17.7 | v0.17.0 | v0.17.0 | 1.16.x, 1.17.x, 1.18.x |
| 1.3.0 | 1.15.10 | v12.0.0 | release-1.15 | 1.15.x, 1.16.x, 1.17.x |
| 1.2.0 | 1.13.6 | v10.0.0 | release-1.13 | 1.12.x, 1.13.x, 1.14.x |
| 1.1.0 | 1.12.1 | v9.0.0 | release-1.12 | 1.11.x, 1.12.x, 1.13.x |
| 1.0.0 | 1.7.6 | v4.0.0 | release-1.7 | 1.6.x, 1.7.x, 1.8.x |
| 1.0.0 | 1.7.6 | v4.0.0 | release-1.7 | 1.6.x, 1.7.x, 1.8.x |
See the [release notes](https://github.com/weaveworks/kured/releases)
for specific version compatibility information, including which
@@ -60,7 +63,7 @@ Versions >=1.1.0 enter the host mount namespace to invoke
To obtain a default installation without Prometheus alerting interlock
or Slack notifications:
```
```console
kubectl apply -f https://github.com/weaveworks/kured/releases/download/1.3.0/kured-1.3.0-dockerhub.yaml
```
@@ -71,8 +74,9 @@ edit it in accordance with the following section before application.
The following arguments can be passed to kured via the daemonset pod template:
```
```console
Flags:
--annotation-ttl time force clean annotation after this amount of time (default 0, disabled)
--alert-filter-regexp regexp.Regexp alert names to ignore when checking for active alerts
--blocking-pod-selector stringArray label selector identifying pods whose presence should prevent reboots
--ds-name string name of daemonset on which to place lock (default "kured")
@@ -108,11 +112,11 @@ reboots to predictable schedules. Use `--reboot-days`, `--start-time`,
`--end-time`, and `--time-zone` to set a schedule. For example, business
hours on the west coast USA can be specified with:
```
--reboot-days mon,tue,wed,thu,fri
--start-time 9am
--end-time 5pm
--time-zone America/Los_Angeles
```console
--reboot-days mon,tue,wed,thu,fri
--start-time 9am
--end-time 5pm
--time-zone America/Los_Angeles
```
Times can be formatted in numerous ways, including `5pm`, `5:00pm`, `17:00`,
@@ -128,14 +132,14 @@ You may find it desirable to block automatic node reboots when there
are active alerts - you can do so by providing the URL of your
Prometheus server:
```
```console
--prometheus-url=http://prometheus.monitoring.svc.cluster.local
```
By default the presence of *any* active (pending or firing) alerts
will block reboots, however you can ignore specific alerts:
```
```console
--alert-filter-regexp=^(RebootRequired|AnotherBenignAlert|...)$
```
@@ -147,14 +151,14 @@ filter.
You can also block reboots of an _individual node_ when specific pods
are scheduled on it:
```
```console
--blocking-pod-selector=runtime=long,cost=expensive
```
Since label selector strings use commas to express logical 'and', you can
specify this parameter multiple times for 'or':
```
```console
--blocking-pod-selector=runtime=long,cost=expensive
--blocking-pod-selector=name=temperamental
```
@@ -172,7 +176,7 @@ running job or a known temperamental pod on a node will stop it rebooting.
Each kured pod exposes a single gauge metric (`:8080/metrics`) that
indicates the presence of the sentinel file:
```
```console
# HELP kured_reboot_required OS requires reboot due to software updates.
# TYPE kured_reboot_required gauge
kured_reboot_required{node="ip-xxx-xxx-xxx-xxx.ec2.internal"} 0
@@ -182,7 +186,7 @@ The purpose of this metric is to power an alert which will summon an
operator if the cluster cannot reboot itself automatically for a
prolonged period:
```
```console
# Alert if a reboot is required for any machines. Acts as a failsafe for the
# reboot daemon, which will not reboot nodes if there are pending alerts save
# this one.
@@ -206,7 +210,7 @@ probe for active alerts before rebooting, be sure to specify
If you specify a Slack hook via `--slack-hook-url`, kured will notify
you immediately prior to rebooting a node:
<img src="https://github.com/weaveworks/kured/raw/master/img/slack-notification.png"/>
![Notification](img/slack-notification.png)
We recommend setting `--slack-username` to be the name of the
environment, e.g. `dev` or `prod`.
@@ -232,7 +236,7 @@ if you have, you will have to adjust the commands accordingly.
You can test your configuration by provoking a reboot on a node:
```
```console
sudo touch /var/run/reboot-required
```
@@ -241,7 +245,7 @@ sudo touch /var/run/reboot-required
If you need to temporarily stop kured from rebooting any nodes, you
can take the lock manually:
```
```console
kubectl -n kube-system annotate ds kured weave.works/kured-node-lock='{"nodeID":"manual"}'
```
@@ -253,12 +257,20 @@ In exceptional circumstances, such as a node experiencing a permanent
failure whilst rebooting, manual intervention may be required to
remove the cluster lock:
```
```console
kubectl -n kube-system annotate ds kured weave.works/kured-node-lock-
```
> NB the `-` at the end of the command is important - it instructs
> `kubectl` to remove that annotation entirely.
### Automatic Unlock
In exceptional circumstances (especially when used with cluster-autoscaler) a node
that holds the lock might be killed, in which case the annotation would stay in place forever.
Using `--annotation-ttl=30m` allows other nodes to take over the lock once the TTL has expired (in this case 30 minutes) and continue the reboot process.
## Building
See the [CircleCI config](.circleci/config.yml) for the preferred
@@ -269,13 +281,13 @@ repository:
**Building outside $GOPATH:**
```
```console
make
```
**Building inside $GOPATH:**
```
```console
GO111MODULE=on make
```
@@ -298,10 +310,10 @@ versioned manifest from the [release page](https://github.com/weaveworks/kured/r
If you have any questions about, feedback for or problems with `kured`:
- Invite yourself to the <a href="https://slack.weave.works/" target="_blank">Weave Users Slack</a>.
- Ask a question on the [#kured](https://weave-community.slack.com/messages/kured/) slack channel.
- [File an issue](https://github.com/weaveworks/kured/issues/new).
- Join us in [our monthly meeting](https://docs.google.com/document/d/1bsHTjHhqaaZ7yJnXF6W8c89UB_yn-OoSZEmDnIP34n8/edit#),
* Invite yourself to the <a href="https://slack.weave.works/" target="_blank">Weave Users Slack</a>.
* Ask a question on the [#kured](https://weave-community.slack.com/messages/kured/) slack channel.
* [File an issue](https://github.com/weaveworks/kured/issues/new).
* Join us in [our monthly meeting](https://docs.google.com/document/d/1bsHTjHhqaaZ7yJnXF6W8c89UB_yn-OoSZEmDnIP34n8/edit#),
every fourth Wednesday of the month at 16:00 UTC.
Your feedback is always welcome!

21
charts/kured/.helmignore Normal file
View File

@@ -0,0 +1,21 @@
# Patterns to ignore when building packages.
# This supports shell glob matching, relative path matching, and
# negation (prefixed with !). Only one pattern per line.
.DS_Store
# Common VCS dirs
.git/
.gitignore
.bzr/
.bzrignore
.hg/
.hgignore
.svn/
# Common backup files
*.swp
*.bak
*.tmp
*~
# Various IDEs
.project
.idea/
*.tmproj

14
charts/kured/Chart.yaml Normal file
View File

@@ -0,0 +1,14 @@
apiVersion: v1
appVersion: "1.4.4"
description: A Helm chart for kured
name: kured
version: 2.0.3
home: https://github.com/weaveworks/kured
maintainers:
- name: dholbach
email: daniel@weave.works
- name: ckotzbauer
email: christian.kotzbauer@gmail.com
sources:
- https://github.com/weaveworks/kured
icon: https://raw.githubusercontent.com/weaveworks/kured/master/img/logo.png

105
charts/kured/README.md Normal file
View File

@@ -0,0 +1,105 @@
# Kured (KUbernetes REboot Daemon)
## Introduction
This chart installs the "Kubernetes Reboot Daemon" using the Helm Package Manager.
## Prerequisites
- Kubernetes 1.9+
## Installing the Chart
To install the chart with the release name `my-release`:
```bash
$ helm repo add kured https://weaveworks.github.io/kured
$ helm install my-release kured/kured
```
## Uninstalling the Chart
To uninstall/delete the `my-release` deployment:
```bash
$ helm delete my-release
```
The command removes all the Kubernetes components associated with the chart and deletes the release.
## Migrate from stable Helm-Chart
The following changes have been made compared to the stable chart:
- **[BREAKING CHANGE]** The `autolock` feature was removed. Use `configuration.startTime` and `configuration.endTime` instead.
- Role inconsistencies have been fixed (allowed verbs for modifying the `DaemonSet`, apiGroup of `PodSecurityPolicy`)
- Added support for affinities.
- Configuration of cli-flags can be made through a `configuration` object.
- Added optional `Service` and `ServiceMonitor` support for metrics endpoint.
## Configuration
| Config | Description | Default |
| ------ | ----------- | ------- |
| `image.repository` | Image repository | `weaveworks/kured` |
| `image.tag` | Image tag | `1.4.4` |
| `image.pullPolicy` | Image pull policy | `IfNotPresent` |
| `image.pullSecrets` | Image pull secrets | `[]` |
| `updateStrategy` | Daemonset update strategy | `OnDelete` |
| `podAnnotations` | Annotations to apply to pods (eg to add Prometheus annotations) | `{}` |
| `extraArgs` | Extra arguments to pass to `/usr/bin/kured`. See below. | `{}` |
| `configuration.annotationTtl` | cli-parameter `--annotation-ttl` | `0` |
| `configuration.alertFilterRegexp` | cli-parameter `--alert-filter-regexp` | `""` |
| `configuration.blockingPodSelector` | Array of selectors for multiple cli-parameters `--blocking-pod-selector` | `[]` |
| `configuration.endTime` | cli-parameter `--end-time` | `""` |
| `configuration.lockAnnotation` | cli-parameter `--lock-annotation` | `""` |
| `configuration.period` | cli-parameter `--period` | `""` |
| `configuration.prometheusUrl` | cli-parameter `--prometheus-url` | `""` |
| `configuration.rebootDays` | Array of days for multiple cli-parameters `--reboot-days` | `[]` |
| `configuration.rebootSentinel` | cli-parameter `--reboot-sentinel` | `""` |
| `configuration.slackChannel` | cli-parameter `--slack-channel` | `""` |
| `configuration.slackHookUrl` | cli-parameter `--slack-hook-url` | `""` |
| `configuration.slackUsername` | cli-parameter `--slack-username` | `""` |
| `configuration.startTime` | cli-parameter `--start-time` | `""` |
| `configuration.timeZone` | cli-parameter `--time-zone` | `""` |
| `rbac.create` | Create RBAC roles | `true` |
| `serviceAccount.create` | Create a service account | `true` |
| `serviceAccount.name` | Service account name to create (or use if `serviceAccount.create` is false) | (chart fullname) |
| `podSecurityPolicy.create` | Create podSecurityPolicy | `false` |
| `resources` | Resources requests and limits. | `{}` |
| `metrics.create` | Create a ServiceMonitor for prometheus-operator | `false` |
| `metrics.namespace` | The namespace to create the ServiceMonitor in | `""` |
| `metrics.labels` | Additional labels for the ServiceMonitor | `{}` |
| `metrics.interval` | Interval prometheus should scrape the endpoint | `60s` |
| `metrics.scrapeTimeout` | A custom scrapeTimeout for prometheus | `""` |
| `service.create` | Create a Service for the metrics endpoint | `false` |
| `service.port` | Port of the service to expose | `8080` |
| `service.annotations` | Annotations to apply to the service (eg to add Prometheus annotations) | `{}` |
| `priorityClassName` | Priority Class to be used by the pods | `""` |
| `tolerations` | Tolerations to apply to the daemonset (eg to allow running on master) | `[{"key": "node-role.kubernetes.io/master", "effect": "NoSchedule"}]`|
| `affinity` | Affinity for the daemonset (ie, restrict which nodes kured runs on) | `{}` |
| `nodeSelector` | Node Selector for the daemonset (ie, restrict which nodes kured runs on) | `{}` |
See https://github.com/weaveworks/kured#configuration for values (not contained in the `configuration` object) for `extraArgs`. Note that
```yaml
extraArgs:
foo: 1
bar-baz: 2
```
becomes `/usr/bin/kured ... --foo=1 --bar-baz=2`.
## Prometheus Metrics
Kured exposes a single prometheus metric indicating whether a reboot is required or not (see [kured docs](https://github.com/weaveworks/kured#prometheus-metrics) for details).
#### Prometheus-Operator
```yaml
metrics:
create: true
```
#### Prometheus Annotations
```yaml
service:
annotations:
prometheus.io/scrape: "true"
prometheus.io/path: "/metrics"
prometheus.io/port: "8080"
```

View File

@@ -0,0 +1,3 @@
Kured will check for /var/run/reboot-required, and reboot nodes when needed.
See https://github.com/weaveworks/kured/ for details.

View File

@@ -0,0 +1,72 @@
{{/* vim: set filetype=mustache: */}}
{{/*
Expand the name of the chart.
*/}}
{{- define "kured.name" -}}
{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" -}}
{{- end -}}
{{/*
Create a default fully qualified app name.
We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec).
If release name contains chart name it will be used as a full name.
*/}}
{{- define "kured.fullname" -}}
{{- if .Values.fullnameOverride -}}
{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" -}}
{{- else -}}
{{- $name := default .Chart.Name .Values.nameOverride -}}
{{- if contains $name .Release.Name -}}
{{- .Release.Name | trunc 63 | trimSuffix "-" -}}
{{- else -}}
{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" -}}
{{- end -}}
{{- end -}}
{{- end -}}
{{/*
Create chart name and version as used by the chart label.
*/}}
{{- define "kured.chart" -}}
{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" -}}
{{- end -}}
{{/*
Create the name of the service account to use
*/}}
{{- define "kured.serviceAccountName" -}}
{{- if .Values.serviceAccount.create -}}
{{ default (include "kured.fullname" .) .Values.serviceAccount.name }}
{{- else -}}
{{ default "default" .Values.serviceAccount.name }}
{{- end -}}
{{- end -}}
{{/*
Return the appropriate apiVersion for podsecuritypolicy.
*/}}
{{- define "kured.psp.apiVersion" -}}
{{- if semverCompare "<1.10-0" .Capabilities.KubeVersion.GitVersion -}}
{{- print "extensions/v1beta1" -}}
{{- else -}}
{{- print "policy/v1beta1" -}}
{{- end -}}
{{- end -}}
{{/*
Returns a set of labels applied to each resource.
*/}}
{{- define "kured.labels" -}}
app: {{ template "kured.name" . }}
chart: {{ template "kured.chart" . }}
release: {{ .Release.Name }}
heritage: {{ .Release.Service }}
{{- end -}}
{{/*
Returns a set of matchLabels applied.
*/}}
{{- define "kured.matchLabels" -}}
app: {{ template "kured.name" . }}
release: {{ .Release.Name }}
{{- end -}}

View File

@@ -0,0 +1,30 @@
{{- if .Values.rbac.create -}}
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: {{ template "kured.fullname" . }}
labels:
{{- include "kured.labels" . | nindent 4 }}
rules:
# Allow kured to read spec.unschedulable
# Allow kubectl to drain/uncordon
#
# NB: These permissions are tightly coupled to the bundled version of kubectl; the ones below
# match https://github.com/kubernetes/kubernetes/blob/v1.12.1/pkg/kubectl/cmd/drain.go
#
- apiGroups: [""]
resources: ["nodes"]
verbs: ["get", "patch"]
- apiGroups: [""]
resources: ["pods"]
verbs: ["list","delete","get"]
- apiGroups: ["extensions"]
resources: ["daemonsets"]
verbs: ["get"]
- apiGroups: ["apps"]
resources: ["daemonsets"]
verbs: ["get"]
- apiGroups: [""]
resources: ["pods/eviction"]
verbs: ["create"]
{{- end -}}

View File

@@ -0,0 +1,16 @@
{{- if .Values.rbac.create -}}
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: {{ template "kured.fullname" . }}
labels:
{{- include "kured.labels" . | nindent 4 }}
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: {{ template "kured.fullname" . }}
subjects:
- kind: ServiceAccount
name: {{ template "kured.serviceAccountName" . }}
namespace: {{ .Release.Namespace }}
{{- end -}}

View File

@@ -0,0 +1,118 @@
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: {{ template "kured.fullname" . }}
namespace: {{ .Release.Namespace }}
labels:
{{- include "kured.labels" . | nindent 4 }}
spec:
updateStrategy:
type: {{ .Values.updateStrategy }}
selector:
matchLabels:
{{- include "kured.matchLabels" . | nindent 6 }}
template:
metadata:
labels:
{{- include "kured.labels" . | nindent 8 }}
{{- if .Values.podAnnotations }}
annotations:
{{- range $key, $value := .Values.podAnnotations }}
{{ $key }}: {{ $value | quote }}
{{- end }}
{{- end }}
spec:
serviceAccountName: {{ template "kured.serviceAccountName" . }}
hostPID: true
restartPolicy: Always
{{- with .Values.image.pullSecrets }}
imagePullSecrets:
{{ toYaml . | indent 8 }}
{{- end }}
{{- if .Values.priorityClassName }}
priorityClassName: {{ .Values.priorityClassName }}
{{- end }}
containers:
- name: {{ .Chart.Name }}
image: "{{ .Values.image.repository }}:{{ .Values.image.tag }}"
imagePullPolicy: {{ .Values.image.pullPolicy }}
securityContext:
privileged: true # Give permission to nsenter /proc/1/ns/mnt
resources:
{{ toYaml .Values.resources | indent 12 }}
command:
- /usr/bin/kured
args:
- --ds-name={{ template "kured.fullname" . }}
- --ds-namespace={{ .Release.Namespace }}
{{- if .Values.configuration.annotationTtl }}
- --annotation-ttl={{ .Values.configuration.annotationTtl }}
{{- end }}
{{- if .Values.configuration.alertFilterRegexp }}
- --alert-filter-regexp={{ .Values.configuration.alertFilterRegexp | quote }}
{{- end }}
{{- range .Values.configuration.blockingPodSelector }}
- --blocking-pod-selector={{ . }}
{{- end }}
{{- if .Values.configuration.endTime }}
- --end-time={{ .Values.configuration.endTime }}
{{- end }}
{{- if .Values.configuration.lockAnnotation }}
- --lock-annotation={{ .Values.configuration.lockAnnotation }}
{{- end }}
{{- if .Values.configuration.period }}
- --period={{ .Values.configuration.period }}
{{- end }}
{{- if .Values.configuration.prometheusUrl }}
- --prometheus-url={{ .Values.configuration.prometheusUrl }}
{{- end }}
{{- range .Values.configuration.rebootDays }}
- --reboot-days={{ . }}
{{- end }}
{{- if .Values.configuration.rebootSentinel }}
- --reboot-sentinel={{ .Values.configuration.rebootSentinel }}
{{- end }}
{{- if .Values.configuration.slackChannel }}
- --slack-channel={{ .Values.configuration.slackChannel }}
{{- end }}
{{- if .Values.configuration.slackHookUrl }}
- --slack-hook-url={{ .Values.configuration.slackHookUrl }}
{{- end }}
{{- if .Values.configuration.slackUsername }}
- --slack-username={{ .Values.configuration.slackUsername }}
{{- end }}
{{- if .Values.configuration.startTime }}
- --start-time={{ .Values.configuration.startTime }}
{{- end }}
{{- if .Values.configuration.timeZone }}
- --time-zone={{ .Values.configuration.timeZone }}
{{- end }}
{{- range $key, $value := .Values.extraArgs }}
{{- if $value }}
- --{{ $key }}={{ $value }}
{{- else }}
- --{{ $key }}
{{- end }}
{{- end }}
ports:
- containerPort: 8080
name: metrics
env:
# Pass in the name of the node on which this pod is scheduled
# for use with drain/uncordon operations and lock acquisition
- name: KURED_NODE_ID
valueFrom:
fieldRef:
fieldPath: spec.nodeName
{{- with .Values.tolerations }}
tolerations:
{{ toYaml . | indent 8 }}
{{- end }}
{{- with .Values.nodeSelector }}
nodeSelector:
{{ toYaml . | indent 8 }}
{{- end }}
{{- with .Values.affinity }}
affinity:
{{ toYaml . | indent 8 }}
{{- end }}

View File

@@ -0,0 +1,21 @@
{{- if .Values.podSecurityPolicy.create}}
apiVersion: {{ template "kured.psp.apiVersion" . }}
kind: PodSecurityPolicy
metadata:
name: {{ template "kured.fullname" . }}
labels:
{{- include "kured.labels" . | nindent 4 }}
spec:
privileged: true
hostPID: true
allowedCapabilities: ['*']
fsGroup:
rule: RunAsAny
runAsUser:
rule: RunAsAny
seLinux:
rule: RunAsAny
supplementalGroups:
rule: RunAsAny
volumes: ['*']
{{- end }}

View File

@@ -0,0 +1,30 @@
{{- if .Values.rbac.create -}}
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
namespace: {{ .Release.Namespace }}
name: {{ template "kured.fullname" . }}
labels:
{{- include "kured.labels" . | nindent 4 }}
rules:
# Allow kured to lock/unlock itself
- apiGroups: ["extensions"]
resources: ["daemonsets"]
resourceNames: ["{{ template "kured.fullname" . }}"]
verbs: ["update", "patch"]
- apiGroups: ["apps"]
resources: ["daemonsets"]
resourceNames: ["{{ template "kured.fullname" . }}"]
verbs: ["update", "patch"]
{{- if .Values.podSecurityPolicy.create }}
- apiGroups: ["extensions"]
resources: ["podsecuritypolicies"]
resourceNames: ["{{ template "kured.fullname" . }}"]
verbs: ["use"]
- apiGroups: ["policy"]
resources: ["podsecuritypolicies"]
resourceNames: ["{{ template "kured.fullname" . }}"]
verbs: ["use"]
{{- end }}
{{- end -}}

View File

@@ -0,0 +1,17 @@
{{- if .Values.rbac.create -}}
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
namespace: {{ .Release.Namespace }}
name: {{ template "kured.fullname" . }}
labels:
{{- include "kured.labels" . | nindent 4 }}
subjects:
- kind: ServiceAccount
namespace: {{ .Release.Namespace }}
name: {{ template "kured.serviceAccountName" . }}
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: Role
name: {{ template "kured.fullname" . }}
{{- end -}}

View File

@@ -0,0 +1,22 @@
{{- if or .Values.service.create .Values.metrics.create }}
apiVersion: v1
kind: Service
metadata:
name: {{ template "kured.fullname" . }}
labels:
{{- include "kured.labels" . | nindent 4 }}
{{- if .Values.service.annotations }}
annotations:
{{- range $key, $value := .Values.service.annotations }}
{{ $key }}: {{ $value | quote }}
{{- end }}
{{- end }}
spec:
type: ClusterIP
ports:
- name: metrics
port: {{ .Values.service.port }}
targetPort: 8080
selector:
{{- include "kured.matchLabels" . | nindent 4 }}
{{- end }}

View File

@@ -0,0 +1,9 @@
{{- if .Values.serviceAccount.create -}}
apiVersion: v1
kind: ServiceAccount
metadata:
name: {{ template "kured.serviceAccountName" . }}
namespace: {{ .Release.Namespace }}
labels:
{{- include "kured.labels" . | nindent 4 }}
{{- end -}}

View File

@@ -0,0 +1,31 @@
{{- if .Values.metrics.create }}
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
name: {{ template "kured.fullname" . }}
{{- if .Values.metrics.namespace }}
namespace: {{ .Values.metrics.namespace }}
{{- end }}
labels:
{{- include "kured.labels" . | nindent 4 }}
{{- if .Values.metrics.labels }}
{{- toYaml .Values.metrics.labels | nindent 4 }}
{{- end }}
spec:
endpoints:
- interval: {{ .Values.metrics.interval }}
{{- if .Values.metrics.scrapeTimeout }}
scrapeTimeout: {{ .Values.metrics.scrapeTimeout }}
{{- end }}
honorLabels: true
targetPort: 8080
path: /metrics
scheme: http
jobLabel: "{{ .Release.Name }}"
selector:
matchLabels:
{{- include "kured.matchLabels" . | nindent 6 }}
namespaceSelector:
matchNames:
- {{ .Release.Namespace }}
{{- end }}

61
charts/kured/values.yaml Normal file
View File

@@ -0,0 +1,61 @@
image:
repository: weaveworks/kured
tag: 1.4.4
pullPolicy: IfNotPresent
pullSecrets: []
updateStrategy: OnDelete
podAnnotations: {}
extraArgs: {}
configuration:
annotationTtl: 0 # force clean annotation after this amount of time (default 0, disabled)
alertFilterRegexp: "" # alert names to ignore when checking for active alerts
blockingPodSelector: [] # label selector identifying pods whose presence should prevent reboots
endTime: "" # only reboot before this time of day (default "23:59")
lockAnnotation: "" # annotation in which to record locking node (default "weave.works/kured-node-lock")
period: "" # reboot check period (default 1h0m0s)
prometheusUrl: "" # Prometheus instance to probe for active alerts
rebootDays: [] # only reboot on these days (default [su,mo,tu,we,th,fr,sa])
rebootSentinel: "" # path to file whose existence signals need to reboot (default "/var/run/reboot-required")
slackChannel: "" # slack channel for reboot notifications
slackHookUrl: "" # slack hook URL for reboot notifications
slackUsername: "" # slack username for reboot notifications (default "kured")
startTime: "" # only reboot after this time of day (default "0:00")
timeZone: "" # time-zone to use (valid zones from "time" golang package)
rbac:
create: true
serviceAccount:
create: true
name:
podSecurityPolicy:
create: false
resources: {}
metrics:
create: false
namespace: ""
labels: {}
interval: 60s
scrapeTimeout: ""
service:
create: false
port: 8080
annotations: {}
priorityClassName: ""
tolerations:
- key: node-role.kubernetes.io/master
effect: NoSchedule
affinity: {}
nodeSelector: {}

View File

@@ -1,7 +1,7 @@
FROM alpine:3.11
RUN apk update && apk add ca-certificates tzdata && rm -rf /var/cache/apk/*
# NB: you may need to update RBAC permissions when upgrading kubectl - see kured-rbac.yaml for details
ADD https://storage.googleapis.com/kubernetes-release/release/v1.17.5/bin/linux/amd64/kubectl /usr/bin/kubectl
ADD https://storage.googleapis.com/kubernetes-release/release/v1.17.7/bin/linux/amd64/kubectl /usr/bin/kubectl
RUN chmod 0755 /usr/bin/kubectl
COPY ./kured /usr/bin/kured
ENTRYPOINT ["/usr/bin/kured"]

View File

@@ -45,6 +45,8 @@ var (
rebootEnd string
timezone string
annotationTTL time.Duration
// Metrics
rebootRequiredGauge = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Subsystem: "kured",
@@ -97,6 +99,9 @@ func main() {
rootCmd.PersistentFlags().StringVar(&timezone, "time-zone", "UTC",
"use this timezone for schedule inputs")
rootCmd.PersistentFlags().DurationVar(&annotationTTL, "annotation-ttl", 0,
"force clean annotation after this ammount of time (default 0, disabled)")
if err := rootCmd.Execute(); err != nil {
log.Fatal(err)
}
@@ -204,8 +209,8 @@ func holding(lock *daemonsetlock.DaemonSetLock, metadata interface{}) bool {
return holding
}
func acquire(lock *daemonsetlock.DaemonSetLock, metadata interface{}) bool {
holding, holder, err := lock.Acquire(metadata)
func acquire(lock *daemonsetlock.DaemonSetLock, metadata interface{}, TTL time.Duration) bool {
holding, holder, err := lock.Acquire(metadata, TTL)
switch {
case err != nil:
log.Fatalf("Error acquiring lock: %v", err)
@@ -283,7 +288,7 @@ type nodeMeta struct {
Unschedulable bool `json:"unschedulable"`
}
func rebootAsRequired(nodeID string, window *timewindow.TimeWindow) {
func rebootAsRequired(nodeID string, window *timewindow.TimeWindow, TTL time.Duration) {
config, err := rest.InClusterConfig()
if err != nil {
log.Fatal(err)
@@ -314,7 +319,7 @@ func rebootAsRequired(nodeID string, window *timewindow.TimeWindow) {
}
nodeMeta.Unschedulable = node.Spec.Unschedulable
if acquire(lock, &nodeMeta) {
if acquire(lock, &nodeMeta, TTL) {
if !nodeMeta.Unschedulable {
drain(nodeID)
}
@@ -346,8 +351,13 @@ func root(cmd *cobra.Command, args []string) {
log.Infof("Reboot Sentinel: %s every %v", rebootSentinel, period)
log.Infof("Blocking Pod Selectors: %v", podSelectors)
log.Infof("Reboot on: %v", window)
if annotationTTL > 0 {
log.Infof("Force annotation cleanup after: %v", annotationTTL)
} else {
log.Info("Force annotation cleanup disabled.")
}
go rebootAsRequired(nodeID, window)
go rebootAsRequired(nodeID, window, annotationTTL)
go maintainRebootRequiredMetric(nodeID)
http.Handle("/metrics", promhttp.Handler())

View File

@@ -8,7 +8,7 @@ rules:
# Allow kubectl to drain/uncordon
#
# NB: These permissions are tightly coupled to the bundled version of kubectl; the ones below
# match https://github.com/kubernetes/kubernetes/blob/v1.17.5/staging/src/k8s.io/kubectl/pkg/cmd/drain/drain.go
# match https://github.com/kubernetes/kubernetes/blob/v1.17.7/staging/src/k8s.io/kubectl/pkg/cmd/drain/drain.go
#
- apiGroups: [""]
resources: ["nodes"]

View File

@@ -19,15 +19,17 @@ type DaemonSetLock struct {
}
type lockAnnotationValue struct {
NodeID string `json:"nodeID"`
Metadata interface{} `json:"metadata,omitempty"`
NodeID string `json:"nodeID"`
Metadata interface{} `json:"metadata,omitempty"`
Created time.Time `json:"created"`
TTL time.Duration `json:"TTL"`
}
// New returns a pointer to a DaemonSetLock populated from the given client,
// node ID, namespace, name and annotation.
func New(client *kubernetes.Clientset, nodeID, namespace, name, annotation string) *DaemonSetLock {
// Positional (unkeyed) literal: the argument order here must match the
// field order of DaemonSetLock.
return &DaemonSetLock{client, nodeID, namespace, name, annotation}
}
func (dsl *DaemonSetLock) Acquire(metadata interface{}) (acquired bool, owner string, err error) {
func (dsl *DaemonSetLock) Acquire(metadata interface{}, TTL time.Duration) (acquired bool, owner string, err error) {
for {
ds, err := dsl.client.AppsV1().DaemonSets(dsl.namespace).Get(dsl.name, metav1.GetOptions{})
if err != nil {
@@ -40,13 +42,18 @@ func (dsl *DaemonSetLock) Acquire(metadata interface{}) (acquired bool, owner st
if err := json.Unmarshal([]byte(valueString), &value); err != nil {
return false, "", err
}
if ttlExpired(value.Created, value.TTL) {
return true, value.NodeID, nil
}
return value.NodeID == dsl.nodeID, value.NodeID, nil
}
if ds.ObjectMeta.Annotations == nil {
ds.ObjectMeta.Annotations = make(map[string]string)
}
value := lockAnnotationValue{NodeID: dsl.nodeID, Metadata: metadata}
value := lockAnnotationValue{NodeID: dsl.nodeID, Metadata: metadata, Created: time.Now().UTC(), TTL: TTL}
valueBytes, err := json.Marshal(&value)
if err != nil {
return false, "", err
@@ -79,6 +86,11 @@ func (dsl *DaemonSetLock) Test(metadata interface{}) (holding bool, err error) {
if err := json.Unmarshal([]byte(valueString), &value); err != nil {
return false, err
}
if ttlExpired(value.Created, value.TTL) {
return true, nil
}
return value.NodeID == dsl.nodeID, nil
}
@@ -98,7 +110,7 @@ func (dsl *DaemonSetLock) Release() error {
if err := json.Unmarshal([]byte(valueString), &value); err != nil {
return err
}
if value.NodeID != dsl.nodeID {
if value.NodeID != dsl.nodeID && !ttlExpired(value.Created, value.TTL) {
return fmt.Errorf("Not lock holder: %v", value.NodeID)
}
} else {
@@ -120,3 +132,10 @@ func (dsl *DaemonSetLock) Release() error {
return nil
}
}
// ttlExpired reports whether at least ttl has elapsed since created.
// A zero or negative ttl means expiry is disabled, so it always reports false.
func ttlExpired(created time.Time, ttl time.Duration) bool {
	if ttl <= 0 {
		return false
	}
	age := time.Since(created)
	return age >= ttl
}

View File

@@ -0,0 +1,28 @@
package daemonsetlock
import (
"testing"
"time"
)
// TestTtlExpired exercises ttlExpired with a long-past timestamp, a fresh
// timestamp, and a zero TTL.
func TestTtlExpired(t *testing.T) {
	past := time.Date(2020, time.May, 5, 14, 15, 0, 0, time.UTC)

	cases := []struct {
		created time.Time
		ttl     time.Duration
		want    bool
	}{
		{created: past, ttl: time.Second, want: true},
		{created: time.Now(), ttl: time.Second, want: false},
		{created: past, ttl: 0, want: false},
	}

	for idx := range cases {
		c := cases[idx]
		got := ttlExpired(c.created, c.ttl)
		if got == c.want {
			continue
		}
		// On a mismatch got is necessarily the negation of want.
		t.Errorf("Test %d failed, expected %v but got %v", idx, c.want, got)
	}
}