mirror of
https://github.com/kubernetes/node-problem-detector.git
synced 2026-02-14 18:09:57 +00:00
Add kubelet apiserver connection fail check in health checker
This commit is contained in:
@@ -49,7 +49,12 @@ func main() {
|
||||
fmt.Println(err)
|
||||
os.Exit(int(types.Unknown))
|
||||
}
|
||||
if !hc.CheckHealth() {
|
||||
healthy, err := hc.CheckHealth()
|
||||
if err != nil {
|
||||
fmt.Printf("error checking %v health: %v\n", hco.Component, err)
|
||||
os.Exit(int(types.Unknown))
|
||||
}
|
||||
if !healthy {
|
||||
fmt.Printf("%v:%v was found unhealthy; repair flag : %v\n", hco.Component, hco.SystemdService, hco.EnableRepair)
|
||||
os.Exit(int(types.NonOK))
|
||||
}
|
||||
|
||||
@@ -21,6 +21,7 @@ import (
|
||||
"errors"
|
||||
"net/http"
|
||||
"os/exec"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
@@ -33,7 +34,7 @@ import (
|
||||
type healthChecker struct {
|
||||
component string
|
||||
enableRepair bool
|
||||
healthCheckFunc func() bool
|
||||
healthCheckFunc func() (bool, error)
|
||||
// The repair is "best-effort" and ignores the error from the underlying actions.
|
||||
// The bash commands to kill the process will fail if the service is down, hence the error is ignored.
|
||||
repairFunc func()
|
||||
@@ -102,30 +103,23 @@ func getRepairFunc(hco *options.HealthCheckerOptions) func() {
|
||||
}
|
||||
|
||||
// getHealthCheckFunc returns the health check function based on the component.
|
||||
func getHealthCheckFunc(hco *options.HealthCheckerOptions) func() bool {
|
||||
func getHealthCheckFunc(hco *options.HealthCheckerOptions) func() (bool, error) {
|
||||
switch hco.Component {
|
||||
case types.KubeletComponent:
|
||||
return func() bool {
|
||||
httpClient := http.Client{Timeout: hco.HealthCheckTimeout}
|
||||
response, err := httpClient.Get(types.KubeletHealthCheckEndpoint)
|
||||
if err != nil || response.StatusCode != http.StatusOK {
|
||||
return false
|
||||
}
|
||||
return true
|
||||
}
|
||||
return getKubeletHealthCheckFunc(hco.HealthCheckTimeout)
|
||||
case types.DockerComponent:
|
||||
return func() bool {
|
||||
return func() (bool, error) {
|
||||
if _, err := execCommand(hco.HealthCheckTimeout, "docker", "ps"); err != nil {
|
||||
return false
|
||||
return false, nil
|
||||
}
|
||||
return true
|
||||
return true, nil
|
||||
}
|
||||
case types.CRIComponent:
|
||||
return func() bool {
|
||||
return func() (bool, error) {
|
||||
if _, err := execCommand(hco.HealthCheckTimeout, hco.CriCtlPath, "--runtime-endpoint="+hco.CriSocketPath, "--image-endpoint="+hco.CriSocketPath, "pods"); err != nil {
|
||||
return false
|
||||
return false, nil
|
||||
}
|
||||
return true
|
||||
return true, nil
|
||||
}
|
||||
}
|
||||
return nil
|
||||
@@ -133,10 +127,13 @@ func getHealthCheckFunc(hco *options.HealthCheckerOptions) func() bool {
|
||||
|
||||
// CheckHealth checks for the health of the component and tries to repair if enabled.
|
||||
// Returns true if healthy, false otherwise.
|
||||
func (hc *healthChecker) CheckHealth() bool {
|
||||
healthy := hc.healthCheckFunc()
|
||||
func (hc *healthChecker) CheckHealth() (bool, error) {
|
||||
healthy, err := hc.healthCheckFunc()
|
||||
if err != nil {
|
||||
return healthy, err
|
||||
}
|
||||
if healthy {
|
||||
return true
|
||||
return true, nil
|
||||
}
|
||||
// The service is unhealthy.
|
||||
// Attempt repair based on flag.
|
||||
@@ -152,14 +149,13 @@ func (hc *healthChecker) CheckHealth() bool {
|
||||
hc.repairFunc()
|
||||
}
|
||||
}
|
||||
return false
|
||||
return false, nil
|
||||
}
|
||||
|
||||
// execCommand executes the bash command and returns the (output, error) from command, error if timeout occurs.
|
||||
func execCommand(timeout time.Duration, command string, args ...string) (string, error) {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), timeout)
|
||||
defer cancel()
|
||||
|
||||
cmd := exec.CommandContext(ctx, command, args...)
|
||||
out, err := cmd.Output()
|
||||
if err != nil {
|
||||
@@ -168,3 +164,66 @@ func execCommand(timeout time.Duration, command string, args ...string) (string,
|
||||
}
|
||||
return strings.TrimSuffix(string(out), "\n"), nil
|
||||
}
|
||||
|
||||
// kubeletHttpHealthCheck checks the health api response on kubelet.
|
||||
// Returns true for healthy, false otherwise.
|
||||
func kubeletHttpHealthCheck(healthCheckTimeout time.Duration) bool {
|
||||
httpClient := http.Client{Timeout: healthCheckTimeout}
|
||||
response, err := httpClient.Get(types.KubeletHealthCheckEndpoint)
|
||||
if err != nil || response.StatusCode != http.StatusOK {
|
||||
glog.Info("kubelet failed http health check")
|
||||
return false
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
// kubeletConnectionHealthCheck checks for the kubelet-apiserver connection issue
|
||||
// by checking repeated occurrences of log "use of closed network connection" in kubelet logs.
|
||||
// Returns true if the pattern does not exist 10 times since start of service or the last 10 min, false otherwise.
|
||||
func kubeletConnectionHealthCheck() (bool, error) {
|
||||
kubeletUptimeFunc := getUptimeFunc(types.KubeletComponent)
|
||||
uptime, err := kubeletUptimeFunc()
|
||||
if err != nil {
|
||||
return true, err
|
||||
}
|
||||
logStartTime := time.Now().Add(-uptime).Format(types.LogParsingTimeLayout)
|
||||
if err != nil {
|
||||
return true, err
|
||||
}
|
||||
out, err := execCommand(types.CmdTimeout, "/bin/sh", "-c",
|
||||
// Query kubelet logs since the logStartTime
|
||||
`journalctl --unit kubelet --since "`+logStartTime+
|
||||
// Grep the pattern for lost connection
|
||||
`" | grep -i "`+types.KubeletClosedConnectionLogPattern+
|
||||
// Get the count of occurrences
|
||||
`" | wc -l`)
|
||||
if err != nil {
|
||||
return true, err
|
||||
}
|
||||
occurrences, err := strconv.Atoi(out)
|
||||
if err != nil {
|
||||
return true, err
|
||||
}
|
||||
if occurrences >= types.KubeletClosedConnectionLogPatternThresholdCount {
|
||||
glog.Infof("kubelet failed apiserver connection check, log pattern occurrences: %v", occurrences)
|
||||
return false, nil
|
||||
}
|
||||
return true, nil
|
||||
}
|
||||
|
||||
// getKubeletHealthCheckFunc returns a function that checks for kubelet health and
|
||||
// return false if identified as unhealthy, true otherwise.
|
||||
func getKubeletHealthCheckFunc(healthCheckTimeout time.Duration) func() (bool, error) {
|
||||
return func() (bool, error) {
|
||||
httpHealthy := kubeletHttpHealthCheck(healthCheckTimeout)
|
||||
connectionHealthy, err := kubeletConnectionHealthCheck()
|
||||
// The plugin will return Unknown status code in case there is any error in
|
||||
// checking kubelet health.
|
||||
if err != nil {
|
||||
glog.Infof("Error in determining apiserver connection health: %v", err)
|
||||
return false, err
|
||||
}
|
||||
healthy := httpHealthy && connectionHealthy
|
||||
return healthy, nil
|
||||
}
|
||||
}
|
||||
|
||||
@@ -25,7 +25,7 @@ import (
|
||||
|
||||
var repairCalled bool
|
||||
|
||||
func NewTestHealthChecker(repairFunc func(), healthCheckFunc func() bool, uptimeFunc func() (time.Duration, error), enableRepair bool) types.HealthChecker {
|
||||
func NewTestHealthChecker(repairFunc func(), healthCheckFunc func() (bool, error), uptimeFunc func() (time.Duration, error), enableRepair bool) types.HealthChecker {
|
||||
repairCalled = false
|
||||
return &healthChecker{
|
||||
enableRepair: enableRepair,
|
||||
@@ -37,12 +37,12 @@ func NewTestHealthChecker(repairFunc func(), healthCheckFunc func() bool, uptime
|
||||
}
|
||||
}
|
||||
|
||||
func healthyFunc() bool {
|
||||
return true
|
||||
func healthyFunc() (bool, error) {
|
||||
return true, nil
|
||||
}
|
||||
|
||||
func unhealthyFunc() bool {
|
||||
return false
|
||||
func unhealthyFunc() (bool, error) {
|
||||
return false, nil
|
||||
}
|
||||
|
||||
func repairFunc() {
|
||||
@@ -62,7 +62,7 @@ func TestHealthCheck(t *testing.T) {
|
||||
description string
|
||||
enableRepair bool
|
||||
healthy bool
|
||||
healthCheckFunc func() bool
|
||||
healthCheckFunc func() (bool, error)
|
||||
uptimeFunc func() (time.Duration, error)
|
||||
repairFunc func()
|
||||
repairCalled bool
|
||||
@@ -106,7 +106,10 @@ func TestHealthCheck(t *testing.T) {
|
||||
} {
|
||||
t.Run(tc.description, func(t *testing.T) {
|
||||
hc := NewTestHealthChecker(tc.repairFunc, tc.healthCheckFunc, tc.uptimeFunc, tc.enableRepair)
|
||||
healthy := hc.CheckHealth()
|
||||
healthy, err := hc.CheckHealth()
|
||||
if err != nil {
|
||||
t.Errorf("unexpected error occurred got %v; expected nil", err)
|
||||
}
|
||||
if healthy != tc.healthy {
|
||||
t.Errorf("incorrect health returned got %t; expected %t", healthy, tc.healthy)
|
||||
}
|
||||
|
||||
@@ -19,19 +19,25 @@ package types
|
||||
import "time"
|
||||
|
||||
const (
|
||||
DefaultCoolDownTime = 2 * time.Minute
|
||||
DefaultHealthCheckTimeout = 10 * time.Second
|
||||
CmdTimeout = 10 * time.Second
|
||||
DefaultCriCtl = "/usr/bin/crictl"
|
||||
DefaultCriSocketPath = "unix:///var/run/containerd/containerd.sock"
|
||||
KubeletComponent = "kubelet"
|
||||
CRIComponent = "cri"
|
||||
DockerComponent = "docker"
|
||||
ContainerdService = "containerd"
|
||||
KubeletHealthCheckEndpoint = "http://127.0.0.1:10248/healthz"
|
||||
UptimeTimeLayout = "Mon 2006-01-02 15:04:05 UTC"
|
||||
DefaultCoolDownTime = 2 * time.Minute
|
||||
DefaultHealthCheckTimeout = 10 * time.Second
|
||||
CmdTimeout = 10 * time.Second
|
||||
UptimeTimeLayout = "Mon 2006-01-02 15:04:05 UTC"
|
||||
LogParsingTimeLayout = "2006-01-02 15:04:05"
|
||||
|
||||
DefaultCriCtl = "/usr/bin/crictl"
|
||||
DefaultCriSocketPath = "unix:///var/run/containerd/containerd.sock"
|
||||
|
||||
KubeletComponent = "kubelet"
|
||||
CRIComponent = "cri"
|
||||
DockerComponent = "docker"
|
||||
ContainerdService = "containerd"
|
||||
|
||||
KubeletHealthCheckEndpoint = "http://127.0.0.1:10248/healthz"
|
||||
KubeletClosedConnectionLogPattern = "use of closed network connection"
|
||||
KubeletClosedConnectionLogPatternThresholdCount = 10
|
||||
)
|
||||
|
||||
type HealthChecker interface {
|
||||
CheckHealth() bool
|
||||
CheckHealth() (bool, error)
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user