mirror of
https://github.com/kubernetes/node-problem-detector.git
synced 2026-02-14 18:09:57 +00:00
Add health-check-monitor
This commit is contained in:
13
Makefile
13
Makefile
@@ -113,6 +113,14 @@ endif
|
||||
-tags "$(BUILD_TAGS)" \
|
||||
./test/e2e/problemmaker/problem_maker.go
|
||||
|
||||
./bin/health-checker: $(PKG_SOURCES)
|
||||
CGO_ENABLED=$(CGO_ENABLED) GOOS=linux GO111MODULE=on go build \
|
||||
-mod vendor \
|
||||
-o bin/health-checker \
|
||||
-ldflags '-X $(PKG)/pkg/version.version=$(VERSION)' \
|
||||
-tags "$(BUILD_TAGS)" \
|
||||
cmd/healthchecker/health_checker.go
|
||||
|
||||
Dockerfile: Dockerfile.in
|
||||
sed -e 's|@BASEIMAGE@|$(BASEIMAGE)|g' $< >$@
|
||||
ifneq ($(ENABLE_JOURNALD), 1)
|
||||
@@ -134,12 +142,12 @@ e2e-test: vet fmt build-tar
|
||||
-boskos-project-type=$(BOSKOS_PROJECT_TYPE) -job-name=$(JOB_NAME) \
|
||||
-artifacts-dir=$(ARTIFACTS)
|
||||
|
||||
build-binaries: ./bin/node-problem-detector ./bin/log-counter
|
||||
build-binaries: ./bin/node-problem-detector ./bin/log-counter ./bin/health-checker
|
||||
|
||||
build-container: build-binaries Dockerfile
|
||||
docker build -t $(IMAGE) .
|
||||
|
||||
build-tar: ./bin/node-problem-detector ./bin/log-counter ./test/bin/problem-maker
|
||||
build-tar: ./bin/node-problem-detector ./bin/log-counter ./bin/health-checker ./test/bin/problem-maker
|
||||
tar -zcvf $(TARBALL) bin/ config/ test/e2e-install.sh test/bin/problem-maker
|
||||
sha1sum $(TARBALL)
|
||||
md5sum $(TARBALL)
|
||||
@@ -164,6 +172,7 @@ push-tar: build-tar
|
||||
push: push-container push-tar
|
||||
|
||||
clean:
|
||||
rm -f bin/health-checker
|
||||
rm -f bin/log-counter
|
||||
rm -f bin/node-problem-detector
|
||||
rm -f test/bin/problem-maker
|
||||
|
||||
57
cmd/healthchecker/health_checker.go
Normal file
57
cmd/healthchecker/health_checker.go
Normal file
@@ -0,0 +1,57 @@
|
||||
/*
|
||||
Copyright 2020 The Kubernetes Authors All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package main
|
||||
|
||||
import (
|
||||
"flag"
|
||||
"fmt"
|
||||
"os"
|
||||
|
||||
"github.com/spf13/pflag"
|
||||
|
||||
"k8s.io/node-problem-detector/cmd/healthchecker/options"
|
||||
"k8s.io/node-problem-detector/pkg/custompluginmonitor/types"
|
||||
"k8s.io/node-problem-detector/pkg/healthchecker"
|
||||
)
|
||||
|
||||
func main() {
|
||||
// Set glog flag so that it does not log to files.
|
||||
if err := flag.Set("logtostderr", "true"); err != nil {
|
||||
fmt.Printf("Failed to set logtostderr=true: %v", err)
|
||||
os.Exit(int(types.Unknown))
|
||||
}
|
||||
|
||||
hco := options.NewHealthCheckerOptions()
|
||||
hco.AddFlags(pflag.CommandLine)
|
||||
pflag.Parse()
|
||||
hco.SetDefaults()
|
||||
if err := hco.IsValid(); err != nil {
|
||||
fmt.Println(err)
|
||||
os.Exit(int(types.Unknown))
|
||||
}
|
||||
|
||||
hc, err := healthchecker.NewHealthChecker(hco)
|
||||
if err != nil {
|
||||
fmt.Println(err)
|
||||
os.Exit(int(types.Unknown))
|
||||
}
|
||||
if !hc.CheckHealth() {
|
||||
fmt.Printf("%v:%v was found unhealthy; repair flag : %v\n", hco.Component, hco.SystemdService, hco.EnableRepair)
|
||||
os.Exit(int(types.NonOK))
|
||||
}
|
||||
os.Exit(int(types.OK))
|
||||
}
|
||||
102
cmd/healthchecker/options/options.go
Normal file
102
cmd/healthchecker/options/options.go
Normal file
@@ -0,0 +1,102 @@
|
||||
/*
|
||||
Copyright 2020 The Kubernetes Authors All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package options
|
||||
|
||||
import (
|
||||
"flag"
|
||||
"fmt"
|
||||
"time"
|
||||
|
||||
"github.com/spf13/pflag"
|
||||
|
||||
"k8s.io/node-problem-detector/pkg/healthchecker/types"
|
||||
)
|
||||
|
||||
// NewHealthCheckerOptions returns an empty health check options struct.
|
||||
func NewHealthCheckerOptions() *HealthCheckerOptions {
|
||||
return &HealthCheckerOptions{}
|
||||
}
|
||||
|
||||
// HealthCheckerOptions are the options used to configure the health checker.
type HealthCheckerOptions struct {
	// Component is the component to check health for (kubelet, docker or cri).
	Component string
	// SystemdService is the systemd service backing the component. When left
	// empty, SetDefaults derives it from Component.
	SystemdService string
	// EnableRepair toggles the repair attempt for an unhealthy component.
	EnableRepair bool
	// CriCtlPath is the path to the crictl binary used for the cri component.
	CriCtlPath string
	// CriSocketPath is the CRI socket passed to crictl.
	CriSocketPath string
	// CoolDownTime is how long the service must have been up before a repair
	// is attempted.
	CoolDownTime time.Duration
	// HealthCheckTimeout is how long to wait for the health check before the
	// component is marked unhealthy.
	HealthCheckTimeout time.Duration
}
|
||||
|
||||
// AddFlags adds health checker command line options to pflag.
|
||||
func (hco *HealthCheckerOptions) AddFlags(fs *pflag.FlagSet) {
|
||||
fs.StringVar(&hco.Component, "component", types.KubeletComponent,
|
||||
"The component to check health for. Supports kubelet, docker and cri")
|
||||
fs.StringVar(&hco.SystemdService, "systemd-service", "",
|
||||
"The underlying systemd service responsible for the component. Set to the corresponding component for docker and kubelet, containerd for cri.")
|
||||
fs.BoolVar(&hco.EnableRepair, "enable-repair", true, "Flag to enable/disable repair attempt for the component.")
|
||||
fs.StringVar(&hco.CriCtlPath, "crictl-path", types.DefaultCriCtl,
|
||||
"The path to the crictl binary. This is used to check health of cri component.")
|
||||
fs.StringVar(&hco.CriSocketPath, "cri-socket-path", types.DefaultCriSocketPath,
|
||||
"The path to the cri socket. Used with crictl to specify the socket path.")
|
||||
fs.DurationVar(&hco.CoolDownTime, "cooldown-time", types.DefaultCoolDownTime,
|
||||
"The duration to wait for the service to be up before attempting repair.")
|
||||
fs.DurationVar(&hco.HealthCheckTimeout, "health-check-timeout", types.DefaultHealthCheckTimeout,
|
||||
"The time to wait before marking the component as unhealthy.")
|
||||
}
|
||||
|
||||
// IsValid validates health checker command line options.
|
||||
// Returns error if invalid, nil otherwise.
|
||||
func (hco *HealthCheckerOptions) IsValid() error {
|
||||
// Make sure the component specified is valid.
|
||||
if hco.Component != types.KubeletComponent && hco.Component != types.DockerComponent && hco.Component != types.CRIComponent {
|
||||
return fmt.Errorf("the component specified is not supported. Supported components are : <kubelet/docker/cri>")
|
||||
}
|
||||
// Make sure the systemd service is specified if repair is enabled.
|
||||
if hco.EnableRepair && hco.SystemdService == "" {
|
||||
return fmt.Errorf("systemd-service cannot be empty when repair is enabled")
|
||||
}
|
||||
// Skip checking further if the component is not cri.
|
||||
if hco.Component != types.CRIComponent {
|
||||
return nil
|
||||
}
|
||||
// Make sure the crictl path is not empty for cri component.
|
||||
if hco.Component == types.CRIComponent && hco.CriCtlPath == "" {
|
||||
return fmt.Errorf("the crictl-path cannot be empty for cri component")
|
||||
}
|
||||
// Make sure the cri socker path is not empty for cri component.
|
||||
if hco.Component == types.CRIComponent && hco.CriSocketPath == "" {
|
||||
return fmt.Errorf("the cri-socket-path cannot be empty for cri component")
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// SetDefaults sets the defaults values for the dependent flags.
|
||||
func (hco *HealthCheckerOptions) SetDefaults() {
|
||||
if hco.SystemdService != "" {
|
||||
return
|
||||
}
|
||||
if hco.Component != types.CRIComponent {
|
||||
hco.SystemdService = hco.Component
|
||||
return
|
||||
}
|
||||
hco.SystemdService = types.ContainerdService
|
||||
}
|
||||
|
||||
func init() {
|
||||
pflag.CommandLine.AddGoFlagSet(flag.CommandLine)
|
||||
}
|
||||
76
cmd/healthchecker/options/options_test.go
Normal file
76
cmd/healthchecker/options/options_test.go
Normal file
@@ -0,0 +1,76 @@
|
||||
/*
|
||||
Copyright 2020 The Kubernetes Authors All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package options
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
|
||||
"k8s.io/node-problem-detector/pkg/healthchecker/types"
|
||||
)
|
||||
|
||||
func TestIsValid(t *testing.T) {
|
||||
testCases := []struct {
|
||||
name string
|
||||
hco HealthCheckerOptions
|
||||
expectError bool
|
||||
}{
|
||||
{
|
||||
name: "valid component",
|
||||
hco: HealthCheckerOptions{
|
||||
Component: types.KubeletComponent,
|
||||
},
|
||||
expectError: false,
|
||||
},
|
||||
{
|
||||
name: "invalid component",
|
||||
hco: HealthCheckerOptions{
|
||||
Component: "wrongComponent",
|
||||
},
|
||||
expectError: true,
|
||||
},
|
||||
{
|
||||
name: "empty crictl-path with cri",
|
||||
hco: HealthCheckerOptions{
|
||||
Component: types.CRIComponent,
|
||||
CriCtlPath: "",
|
||||
EnableRepair: false,
|
||||
},
|
||||
expectError: true,
|
||||
},
|
||||
{
|
||||
name: "empty systemd-service and repair enabled",
|
||||
hco: HealthCheckerOptions{
|
||||
Component: types.KubeletComponent,
|
||||
EnableRepair: true,
|
||||
SystemdService: "",
|
||||
},
|
||||
expectError: true,
|
||||
},
|
||||
}
|
||||
|
||||
for _, test := range testCases {
|
||||
t.Run(test.name, func(t *testing.T) {
|
||||
if test.expectError {
|
||||
assert.Error(t, test.hco.IsValid(), "HealthChecker option %+v is invalid. Expected IsValid to return error.", test.hco)
|
||||
} else {
|
||||
assert.NoError(t, test.hco.IsValid(), "HealthChecker option %+v is valid. Expected IsValid to return nil.", test.hco)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
33
config/health-checker-docker.json
Normal file
33
config/health-checker-docker.json
Normal file
@@ -0,0 +1,33 @@
|
||||
{
|
||||
"plugin": "custom",
|
||||
"pluginConfig": {
|
||||
"invoke_interval": "10s",
|
||||
"timeout": "3m",
|
||||
"max_output_length": 80,
|
||||
"concurrency": 1
|
||||
},
|
||||
"source": "health-checker",
|
||||
"metricsReporting": true,
|
||||
"conditions": [
|
||||
{
|
||||
"type": "ContainerRuntimeUnhealthy",
|
||||
"reason": "ContainerRuntimeIsHealthy",
|
||||
"message": "Container runtime on the node is functioning properly"
|
||||
}
|
||||
],
|
||||
"rules": [
|
||||
{
|
||||
"type": "permanent",
|
||||
"condition": "ContainerRuntimeUnhealthy",
|
||||
"reason": "DockerUnhealthy",
|
||||
"path": "/home/kubernetes/bin/health-checker",
|
||||
"args": [
|
||||
"--component=docker",
|
||||
"--enable-repair=false",
|
||||
"--cooldown-time=2m",
|
||||
"--health-check-timeout=60s"
|
||||
],
|
||||
"timeout": "3m"
|
||||
}
|
||||
]
|
||||
}
|
||||
33
config/health-checker-kubelet.json
Normal file
33
config/health-checker-kubelet.json
Normal file
@@ -0,0 +1,33 @@
|
||||
{
|
||||
"plugin": "custom",
|
||||
"pluginConfig": {
|
||||
"invoke_interval": "10s",
|
||||
"timeout": "3m",
|
||||
"max_output_length": 80,
|
||||
"concurrency": 1
|
||||
},
|
||||
"source": "health-checker",
|
||||
"metricsReporting": true,
|
||||
"conditions": [
|
||||
{
|
||||
"type": "KubeletUnhealthy",
|
||||
"reason": "KubeletIsHealthy",
|
||||
"message": "kubelet on the node is functioning properly"
|
||||
}
|
||||
],
|
||||
"rules": [
|
||||
{
|
||||
"type": "permanent",
|
||||
"condition": "KubeletUnhealthy",
|
||||
"reason": "KubeletUnhealthy",
|
||||
"path": "/home/kubernetes/bin/health-checker",
|
||||
"args": [
|
||||
"--component=kubelet",
|
||||
"--enable-repair=false",
|
||||
"--cooldown-time=1m",
|
||||
"--health-check-timeout=10s"
|
||||
],
|
||||
"timeout": "3m"
|
||||
}
|
||||
]
|
||||
}
|
||||
163
pkg/healthchecker/health_checker.go
Normal file
163
pkg/healthchecker/health_checker.go
Normal file
@@ -0,0 +1,163 @@
|
||||
/*
|
||||
Copyright 2020 The Kubernetes Authors All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package healthchecker
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"net/http"
|
||||
"os/exec"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/golang/glog"
|
||||
|
||||
"k8s.io/node-problem-detector/cmd/healthchecker/options"
|
||||
"k8s.io/node-problem-detector/pkg/healthchecker/types"
|
||||
)
|
||||
|
||||
// healthChecker checks the health of a single component and, when enabled,
// attempts to repair it. The check, repair and uptime behaviours are held as
// function fields so they can be chosen per component (and replaced in tests).
type healthChecker struct {
	// enableRepair controls whether CheckHealth may call repairFunc.
	enableRepair bool
	// healthCheckFunc reports whether the component is currently healthy.
	healthCheckFunc func() bool
	// The repair is "best-effort" and ignores the error from the underlying actions.
	// The commands used to kill the process will fail if the service is down,
	// and that failure is deliberately ignored.
	repairFunc func()
	// uptimeFunc returns how long the underlying service has been running.
	uptimeFunc func() (time.Duration, error)
	// crictlPath is the path to the crictl binary, copied from the options.
	crictlPath string
	// healthCheckTimeout is the health check timeout, copied from the options.
	healthCheckTimeout time.Duration
	// coolDownTime is the uptime the service must exceed before a repair is
	// attempted.
	coolDownTime time.Duration
}
|
||||
|
||||
// NewHealthChecker returns a new health checker configured with the given options.
|
||||
func NewHealthChecker(hco *options.HealthCheckerOptions) (types.HealthChecker, error) {
|
||||
hc := &healthChecker{
|
||||
enableRepair: hco.EnableRepair,
|
||||
crictlPath: hco.CriCtlPath,
|
||||
healthCheckTimeout: hco.HealthCheckTimeout,
|
||||
coolDownTime: hco.CoolDownTime,
|
||||
}
|
||||
hc.healthCheckFunc = getHealthCheckFunc(hco)
|
||||
hc.repairFunc = getRepairFunc(hco)
|
||||
hc.uptimeFunc = getUptimeFunc(hco.SystemdService)
|
||||
return hc, nil
|
||||
}
|
||||
|
||||
// getUptimeFunc returns the time for which the given service has been running.
|
||||
func getUptimeFunc(service string) func() (time.Duration, error) {
|
||||
return func() (time.Duration, error) {
|
||||
out, err := execCommand(types.CmdTimeout, "systemctl", "show", service, "--property=ActiveEnterTimestamp")
|
||||
if err != nil {
|
||||
return time.Duration(0), err
|
||||
}
|
||||
val := strings.Split(out, "=")
|
||||
if len(val) < 2 {
|
||||
return time.Duration(0), errors.New("could not parse the service uptime time correctly")
|
||||
}
|
||||
t, err := time.Parse(types.UptimeTimeLayout, val[1])
|
||||
if err != nil {
|
||||
return time.Duration(0), err
|
||||
}
|
||||
return time.Since(t), nil
|
||||
}
|
||||
}
|
||||
|
||||
// getRepairFunc returns the repair function based on the component.
|
||||
func getRepairFunc(hco *options.HealthCheckerOptions) func() {
|
||||
switch hco.Component {
|
||||
case types.DockerComponent:
|
||||
// Use "docker ps" for docker health check. Not using crictl for docker to remove
|
||||
// dependency on the kubelet.
|
||||
return func() {
|
||||
execCommand(types.CmdTimeout, "pkill", "-SIGUSR1", "dockerd")
|
||||
execCommand(types.CmdTimeout, "systemctl", "kill", "--kill-who=main", hco.SystemdService)
|
||||
}
|
||||
default:
|
||||
// Just kill the service for all other components
|
||||
return func() {
|
||||
execCommand(types.CmdTimeout, "systemctl", "kill", "--kill-who=main", hco.SystemdService)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// getHealthCheckFunc returns the health check function based on the component.
|
||||
func getHealthCheckFunc(hco *options.HealthCheckerOptions) func() bool {
|
||||
switch hco.Component {
|
||||
case types.KubeletComponent:
|
||||
return func() bool {
|
||||
httpClient := http.Client{Timeout: hco.HealthCheckTimeout}
|
||||
response, err := httpClient.Get(types.KubeletHealthCheckEndpoint)
|
||||
if err != nil || response.StatusCode != http.StatusOK {
|
||||
return false
|
||||
}
|
||||
return true
|
||||
}
|
||||
case types.DockerComponent:
|
||||
return func() bool {
|
||||
if _, err := execCommand(hco.HealthCheckTimeout, "docker", "ps"); err != nil {
|
||||
return false
|
||||
}
|
||||
return true
|
||||
}
|
||||
case types.CRIComponent:
|
||||
return func() bool {
|
||||
if _, err := execCommand(hco.HealthCheckTimeout, hco.CriCtlPath, "--runtime-endpoint="+hco.CriSocketPath, "--image-endpoint="+hco.CriSocketPath, "pods"); err != nil {
|
||||
return false
|
||||
}
|
||||
return true
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// CheckHealth checks for the health of the component and tries to repair if enabled.
|
||||
// Returns true if healthy, false otherwise.
|
||||
func (hc *healthChecker) CheckHealth() bool {
|
||||
healthy := hc.healthCheckFunc()
|
||||
if healthy {
|
||||
return true
|
||||
}
|
||||
// The service is unhealthy.
|
||||
// Attempt repair based on flag.
|
||||
if hc.enableRepair {
|
||||
glog.Infof("health-checker: component is unhealthy, proceeding to repair")
|
||||
// repair if the service has been up for the cool down period.
|
||||
uptime, err := hc.uptimeFunc()
|
||||
if err != nil {
|
||||
glog.Infof("health-checker: %v\n", err.Error())
|
||||
}
|
||||
glog.Infof("health-checker: component uptime: %v\n", uptime)
|
||||
if uptime > hc.coolDownTime {
|
||||
hc.repairFunc()
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// execCommand executes the bash command and returns the (output, error) from command, error if timeout occurs.
|
||||
func execCommand(timeout time.Duration, command string, args ...string) (string, error) {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), timeout)
|
||||
defer cancel()
|
||||
|
||||
cmd := exec.CommandContext(ctx, command, args...)
|
||||
glog.Infof("health-checker: executing command : %v\n", cmd)
|
||||
out, err := cmd.Output()
|
||||
if err != nil {
|
||||
glog.Infof("health-checker: command failed : %v, %v\n", err.Error(), out)
|
||||
return "", err
|
||||
}
|
||||
return strings.TrimSuffix(string(out), "\n"), nil
|
||||
}
|
||||
118
pkg/healthchecker/health_checker_test.go
Normal file
118
pkg/healthchecker/health_checker_test.go
Normal file
@@ -0,0 +1,118 @@
|
||||
/*
|
||||
Copyright 2020 The Kubernetes Authors All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package healthchecker
|
||||
|
||||
import (
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"k8s.io/node-problem-detector/pkg/healthchecker/types"
|
||||
)
|
||||
|
||||
// repairCalled records whether the test repairFunc has run. It is reset by
// NewTestHealthChecker, so each test case starts from false.
var repairCalled bool
|
||||
|
||||
func NewTestHealthChecker(repairFunc func(), healthCheckFunc func() bool, uptimeFunc func() (time.Duration, error), enableRepair bool) types.HealthChecker {
|
||||
repairCalled = false
|
||||
return &healthChecker{
|
||||
enableRepair: enableRepair,
|
||||
healthCheckFunc: healthCheckFunc,
|
||||
repairFunc: repairFunc,
|
||||
uptimeFunc: uptimeFunc,
|
||||
healthCheckTimeout: time.Second,
|
||||
coolDownTime: 2 * time.Second,
|
||||
}
|
||||
}
|
||||
|
||||
// healthyFunc is a health check stub that always reports healthy.
func healthyFunc() bool {
	return true
}
|
||||
|
||||
// unhealthyFunc is a health check stub that always reports unhealthy.
func unhealthyFunc() bool {
	return false
}
|
||||
|
||||
func repairFunc() {
|
||||
repairCalled = true
|
||||
}
|
||||
|
||||
// longServiceUptimeFunc is an uptime stub reporting one hour, which exceeds
// the two second cool down used by NewTestHealthChecker.
func longServiceUptimeFunc() (time.Duration, error) {
	return 1 * time.Hour, nil
}
|
||||
|
||||
// shortServiceUptimeFunc is an uptime stub reporting one second, which is
// below the two second cool down used by NewTestHealthChecker.
func shortServiceUptimeFunc() (time.Duration, error) {
	return 1 * time.Second, nil
}
|
||||
|
||||
func TestHealthCheck(t *testing.T) {
|
||||
for _, tc := range []struct {
|
||||
description string
|
||||
enableRepair bool
|
||||
healthy bool
|
||||
healthCheckFunc func() bool
|
||||
uptimeFunc func() (time.Duration, error)
|
||||
repairFunc func()
|
||||
repairCalled bool
|
||||
}{
|
||||
{
|
||||
description: "healthy component",
|
||||
enableRepair: true,
|
||||
healthy: true,
|
||||
healthCheckFunc: healthyFunc,
|
||||
repairFunc: repairFunc,
|
||||
uptimeFunc: shortServiceUptimeFunc,
|
||||
repairCalled: false,
|
||||
},
|
||||
{
|
||||
description: "unhealthy component and disabled repair",
|
||||
enableRepair: false,
|
||||
healthy: false,
|
||||
healthCheckFunc: unhealthyFunc,
|
||||
repairFunc: repairFunc,
|
||||
uptimeFunc: shortServiceUptimeFunc,
|
||||
repairCalled: false,
|
||||
},
|
||||
{
|
||||
description: "unhealthy component, enabled repair and component in cool dowm",
|
||||
enableRepair: true,
|
||||
healthy: false,
|
||||
healthCheckFunc: unhealthyFunc,
|
||||
repairFunc: repairFunc,
|
||||
uptimeFunc: shortServiceUptimeFunc,
|
||||
repairCalled: false,
|
||||
},
|
||||
{
|
||||
description: "unhealthy component, enabled repair and component out of cool dowm",
|
||||
enableRepair: true,
|
||||
healthy: false,
|
||||
healthCheckFunc: unhealthyFunc,
|
||||
repairFunc: repairFunc,
|
||||
uptimeFunc: longServiceUptimeFunc,
|
||||
repairCalled: true,
|
||||
},
|
||||
} {
|
||||
t.Run(tc.description, func(t *testing.T) {
|
||||
hc := NewTestHealthChecker(tc.repairFunc, tc.healthCheckFunc, tc.uptimeFunc, tc.enableRepair)
|
||||
healthy := hc.CheckHealth()
|
||||
if healthy != tc.healthy {
|
||||
t.Errorf("incorrect health returned got %t; expected %t", healthy, tc.healthy)
|
||||
}
|
||||
if repairCalled != tc.repairCalled {
|
||||
t.Errorf("incorrect repairCalled got %t; expected %t", repairCalled, tc.repairCalled)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
37
pkg/healthchecker/types/types.go
Normal file
37
pkg/healthchecker/types/types.go
Normal file
@@ -0,0 +1,37 @@
|
||||
/*
|
||||
Copyright 2020 The Kubernetes Authors All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package types
|
||||
|
||||
import "time"
|
||||
|
||||
const (
	// DefaultCoolDownTime is the default for the cooldown-time flag.
	DefaultCoolDownTime = 2 * time.Minute
	// DefaultHealthCheckTimeout is the default for the health-check-timeout flag.
	DefaultHealthCheckTimeout = 10 * time.Second
	// CmdTimeout bounds the helper commands (systemctl, pkill) run for the
	// uptime lookup and for repair.
	CmdTimeout = 10 * time.Second
	// DefaultCriCtl is the default path to the crictl binary.
	DefaultCriCtl = "/usr/bin/crictl"
	// DefaultCriSocketPath is the default CRI socket passed to crictl.
	DefaultCriSocketPath = "unix:///var/run/containerd/containerd.sock"
	// KubeletComponent selects the kubelet health check.
	KubeletComponent = "kubelet"
	// CRIComponent selects the crictl-based health check.
	CRIComponent = "cri"
	// DockerComponent selects the docker health check.
	DockerComponent = "docker"
	// ContainerdService is the systemd service used by default for the cri
	// component.
	ContainerdService = "containerd"
	// KubeletHealthCheckEndpoint is the URL queried for the kubelet health check.
	KubeletHealthCheckEndpoint = "http://127.0.0.1:10248/healthz"
	// UptimeTimeLayout is the time.Parse layout for the ActiveEnterTimestamp
	// value printed by systemctl. The zone is the literal text "UTC", so
	// timestamps printed with any other zone name do not parse.
	UptimeTimeLayout = "Mon 2006-01-02 15:04:05 UTC"
)
|
||||
|
||||
// HealthChecker checks the health of a component.
type HealthChecker interface {
	// CheckHealth reports whether the component is healthy.
	CheckHealth() bool
}
|
||||
Reference in New Issue
Block a user