diff --git a/cmd/healthchecker/health_checker.go b/cmd/healthchecker/health_checker.go index 7917fc7b..3cce076c 100644 --- a/cmd/healthchecker/health_checker.go +++ b/cmd/healthchecker/health_checker.go @@ -49,7 +49,12 @@ func main() { fmt.Println(err) os.Exit(int(types.Unknown)) } - if !hc.CheckHealth() { + healthy, err := hc.CheckHealth() + if err != nil { + fmt.Printf("error checking %v health: %v\n", hco.Component, err) + os.Exit(int(types.Unknown)) + } + if !healthy { fmt.Printf("%v:%v was found unhealthy; repair flag : %v\n", hco.Component, hco.SystemdService, hco.EnableRepair) os.Exit(int(types.NonOK)) } diff --git a/pkg/healthchecker/health_checker.go b/pkg/healthchecker/health_checker.go index e0eb448f..ac552d08 100644 --- a/pkg/healthchecker/health_checker.go +++ b/pkg/healthchecker/health_checker.go @@ -21,6 +21,7 @@ import ( "errors" "net/http" "os/exec" + "strconv" "strings" "time" @@ -33,7 +34,7 @@ import ( type healthChecker struct { component string enableRepair bool - healthCheckFunc func() bool + healthCheckFunc func() (bool, error) // The repair is "best-effort" and ignores the error from the underlying actions. // The bash commands to kill the process will fail if the service is down and hence ignore. repairFunc func() @@ -102,30 +103,23 @@ func getRepairFunc(hco *options.HealthCheckerOptions) func() { } // getHealthCheckFunc returns the health check function based on the component. -func getHealthCheckFunc(hco *options.HealthCheckerOptions) func() bool { +func getHealthCheckFunc(hco *options.HealthCheckerOptions) func() (bool, error) { switch hco.Component { case types.KubeletComponent: - return func() bool { - httpClient := http.Client{Timeout: hco.HealthCheckTimeout} - response, err := httpClient.Get(types.KubeletHealthCheckEndpoint) - if err != nil || response.StatusCode != http.StatusOK { - return false - } - return true - } + return getKubeletHealthCheckFunc(hco.HealthCheckTimeout) case types.DockerComponent: - return func() bool { + return func() (bool, error) { if _, err := execCommand(hco.HealthCheckTimeout, "docker", "ps"); err != nil { - return false + return false, nil } - return true + return true, nil } case types.CRIComponent: - return func() bool { + return func() (bool, error) { if _, err := execCommand(hco.HealthCheckTimeout, hco.CriCtlPath, "--runtime-endpoint="+hco.CriSocketPath, "--image-endpoint="+hco.CriSocketPath, "pods"); err != nil { - return false + return false, nil } - return true + return true, nil } } return nil @@ -133,10 +127,13 @@ func getHealthCheckFunc(hco *options.HealthCheckerOptions) func() bool { // CheckHealth checks for the health of the component and tries to repair if enabled. // Returns true if healthy, false otherwise. -func (hc *healthChecker) CheckHealth() bool { - healthy := hc.healthCheckFunc() +func (hc *healthChecker) CheckHealth() (bool, error) { + healthy, err := hc.healthCheckFunc() + if err != nil { + return healthy, err + } if healthy { - return true + return true, nil } // The service is unhealthy. // Attempt repair based on flag. @@ -152,14 +149,13 @@ func (hc *healthChecker) CheckHealth() bool { hc.repairFunc() } } - return false + return false, nil } // execCommand executes the bash command and returns the (output, error) from command, error if timeout occurs. func execCommand(timeout time.Duration, command string, args ...string) (string, error) { ctx, cancel := context.WithTimeout(context.Background(), timeout) defer cancel() - cmd := exec.CommandContext(ctx, command, args...) out, err := cmd.Output() if err != nil { @@ -168,3 +164,66 @@ func execCommand(timeout time.Duration, command string, args ...string) (string, } return strings.TrimSuffix(string(out), "\n"), nil } + +// kubeletHttpHealthCheck checks the health api response on kubelet. +// Returns true for healthy, false otherwise. +func kubeletHttpHealthCheck(healthCheckTimeout time.Duration) bool { + httpClient := http.Client{Timeout: healthCheckTimeout} + response, err := httpClient.Get(types.KubeletHealthCheckEndpoint) + if err != nil || response.StatusCode != http.StatusOK { + glog.Info("kubelet failed http health check") + return false + } + return true +} + +// kubeletConnectionHealthCheck checks for the kubelet-apiserver connection issue +// by checking repeated occurrences of log "use of closed network connection" in kubelet logs. +// Returns true if the pattern does not exist 10 times since start of service or the last 10 min, false otherwise. +func kubeletConnectionHealthCheck() (bool, error) { + kubeletUptimeFunc := getUptimeFunc(types.KubeletComponent) + uptime, err := kubeletUptimeFunc() + if err != nil { + return true, err + } + logStartTime := time.Now().Add(-uptime).Format(types.LogParsingTimeLayout) + if err != nil { + return true, err + } + out, err := execCommand(types.CmdTimeout, "/bin/sh", "-c", + // Query kubelet logs since the logStartTime + `journalctl --unit kubelet --since "`+logStartTime+ + // Grep the pattern for lost connection + `" | grep -i "`+types.KubeletClosedConnectionLogPattern+ + // Get the count of occurrences + `" | wc -l`) + if err != nil { + return true, err + } + occurrences, err := strconv.Atoi(out) + if err != nil { + return true, err + } + if occurrences >= types.KubeletClosedConnectionLogPatternThresholdCount { + glog.Infof("kubelet failed apiserver connection check, log pattern occurrences: %v", occurrences) + return false, nil + } + return true, nil +} + +// getKubeletHealthCheckFunc returns a function that checks for kubelet health and +// return false if identified as unhealthy, true otherwise. +func getKubeletHealthCheckFunc(healthCheckTimeout time.Duration) func() (bool, error) { + return func() (bool, error) { + httpHealthy := kubeletHttpHealthCheck(healthCheckTimeout) + connectionHealthy, err := kubeletConnectionHealthCheck() + // The plugin will return Unknown status code in case there is any error in + // checking kubelet health. + if err != nil { + glog.Infof("Error in determining apiserver connection health: %v", err) + return false, err + } + healthy := httpHealthy && connectionHealthy + return healthy, nil + } +} diff --git a/pkg/healthchecker/health_checker_test.go b/pkg/healthchecker/health_checker_test.go index aa21097e..bec7ff5b 100644 --- a/pkg/healthchecker/health_checker_test.go +++ b/pkg/healthchecker/health_checker_test.go @@ -25,7 +25,7 @@ import ( var repairCalled bool -func NewTestHealthChecker(repairFunc func(), healthCheckFunc func() bool, uptimeFunc func() (time.Duration, error), enableRepair bool) types.HealthChecker { +func NewTestHealthChecker(repairFunc func(), healthCheckFunc func() (bool, error), uptimeFunc func() (time.Duration, error), enableRepair bool) types.HealthChecker { repairCalled = false return &healthChecker{ enableRepair: enableRepair, @@ -37,12 +37,12 @@ func NewTestHealthChecker(repairFunc func(), healthCheckFunc func() bool, uptime } } -func healthyFunc() bool { - return true +func healthyFunc() (bool, error) { + return true, nil } -func unhealthyFunc() bool { - return false +func unhealthyFunc() (bool, error) { + return false, nil } func repairFunc() { @@ -62,7 +62,7 @@ func TestHealthCheck(t *testing.T) { description string enableRepair bool healthy bool - healthCheckFunc func() bool + healthCheckFunc func() (bool, error) uptimeFunc func() (time.Duration, error) repairFunc func() repairCalled bool @@ -106,7 +106,10 @@ func TestHealthCheck(t *testing.T) { } { t.Run(tc.description, func(t *testing.T) { hc := NewTestHealthChecker(tc.repairFunc, tc.healthCheckFunc, tc.uptimeFunc, tc.enableRepair) - healthy := hc.CheckHealth() + healthy, err := hc.CheckHealth() + if err != nil { + t.Errorf("unexpected error occurred got %v; expected nil", err) + } if healthy != tc.healthy { t.Errorf("incorrect health returned got %t; expected %t", healthy, tc.healthy) } diff --git a/pkg/healthchecker/types/types.go b/pkg/healthchecker/types/types.go index c4334c66..3f9c6123 100644 --- a/pkg/healthchecker/types/types.go +++ b/pkg/healthchecker/types/types.go @@ -19,19 +19,25 @@ package types import "time" const ( - DefaultCoolDownTime = 2 * time.Minute - DefaultHealthCheckTimeout = 10 * time.Second - CmdTimeout = 10 * time.Second - DefaultCriCtl = "/usr/bin/crictl" - DefaultCriSocketPath = "unix:///var/run/containerd/containerd.sock" - KubeletComponent = "kubelet" - CRIComponent = "cri" - DockerComponent = "docker" - ContainerdService = "containerd" - KubeletHealthCheckEndpoint = "http://127.0.0.1:10248/healthz" - UptimeTimeLayout = "Mon 2006-01-02 15:04:05 UTC" + DefaultCoolDownTime = 2 * time.Minute + DefaultHealthCheckTimeout = 10 * time.Second + CmdTimeout = 10 * time.Second + UptimeTimeLayout = "Mon 2006-01-02 15:04:05 UTC" + LogParsingTimeLayout = "2006-01-02 15:04:05" + + DefaultCriCtl = "/usr/bin/crictl" + DefaultCriSocketPath = "unix:///var/run/containerd/containerd.sock" + + KubeletComponent = "kubelet" + CRIComponent = "cri" + DockerComponent = "docker" + ContainerdService = "containerd" + + KubeletHealthCheckEndpoint = "http://127.0.0.1:10248/healthz" + KubeletClosedConnectionLogPattern = "use of closed network connection" + KubeletClosedConnectionLogPatternThresholdCount = 10 ) type HealthChecker interface { - CheckHealth() bool + CheckHealth() (bool, error) }