Add kubelet-apiserver connection failure check to the health checker

This commit is contained in:
Archit Bansal
2020-11-05 23:31:43 -08:00
parent f42281ee26
commit 2513756583
4 changed files with 114 additions and 41 deletions

View File

@@ -49,7 +49,12 @@ func main() {
fmt.Println(err)
os.Exit(int(types.Unknown))
}
if !hc.CheckHealth() {
healthy, err := hc.CheckHealth()
if err != nil {
fmt.Printf("error checking %v health: %v\n", hco.Component, err)
os.Exit(int(types.Unknown))
}
if !healthy {
fmt.Printf("%v:%v was found unhealthy; repair flag : %v\n", hco.Component, hco.SystemdService, hco.EnableRepair)
os.Exit(int(types.NonOK))
}

View File

@@ -21,6 +21,7 @@ import (
"errors"
"net/http"
"os/exec"
"strconv"
"strings"
"time"
@@ -33,7 +34,7 @@ import (
type healthChecker struct {
component string
enableRepair bool
healthCheckFunc func() bool
healthCheckFunc func() (bool, error)
// The repair is "best-effort" and ignores the error from the underlying actions.
// The bash commands to kill the process will fail if the service is down and hence ignore.
repairFunc func()
@@ -102,30 +103,23 @@ func getRepairFunc(hco *options.HealthCheckerOptions) func() {
}
// getHealthCheckFunc returns the health check function based on the component.
func getHealthCheckFunc(hco *options.HealthCheckerOptions) func() bool {
func getHealthCheckFunc(hco *options.HealthCheckerOptions) func() (bool, error) {
switch hco.Component {
case types.KubeletComponent:
return func() bool {
httpClient := http.Client{Timeout: hco.HealthCheckTimeout}
response, err := httpClient.Get(types.KubeletHealthCheckEndpoint)
if err != nil || response.StatusCode != http.StatusOK {
return false
}
return true
}
return getKubeletHealthCheckFunc(hco.HealthCheckTimeout)
case types.DockerComponent:
return func() bool {
return func() (bool, error) {
if _, err := execCommand(hco.HealthCheckTimeout, "docker", "ps"); err != nil {
return false
return false, nil
}
return true
return true, nil
}
case types.CRIComponent:
return func() bool {
return func() (bool, error) {
if _, err := execCommand(hco.HealthCheckTimeout, hco.CriCtlPath, "--runtime-endpoint="+hco.CriSocketPath, "--image-endpoint="+hco.CriSocketPath, "pods"); err != nil {
return false
return false, nil
}
return true
return true, nil
}
}
return nil
@@ -133,10 +127,13 @@ func getHealthCheckFunc(hco *options.HealthCheckerOptions) func() bool {
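// CheckHealth checks for the health of the component and tries to repair if enabled.
// Returns true if healthy, false otherwise. A non-nil error is returned when the
// health check function itself fails; in that case no repair is attempted.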
func (hc *healthChecker) CheckHealth() bool {
healthy := hc.healthCheckFunc()
func (hc *healthChecker) CheckHealth() (bool, error) {
healthy, err := hc.healthCheckFunc()
if err != nil {
return healthy, err
}
if healthy {
return true
return true, nil
}
// The service is unhealthy.
// Attempt repair based on flag.
@@ -152,14 +149,13 @@ func (hc *healthChecker) CheckHealth() bool {
hc.repairFunc()
}
}
return false
return false, nil
}
// execCommand executes the bash command and returns the (output, error) from command, error if timeout occurs.
func execCommand(timeout time.Duration, command string, args ...string) (string, error) {
ctx, cancel := context.WithTimeout(context.Background(), timeout)
defer cancel()
cmd := exec.CommandContext(ctx, command, args...)
out, err := cmd.Output()
if err != nil {
@@ -168,3 +164,66 @@ func execCommand(timeout time.Duration, command string, args ...string) (string,
}
return strings.TrimSuffix(string(out), "\n"), nil
}
// kubeletHttpHealthCheck checks the health api response on kubelet.
//
// It issues a GET against types.KubeletHealthCheckEndpoint, bounding the whole
// request with healthCheckTimeout. Returns true only when the request succeeds
// with HTTP 200; any transport error or other status code is logged and
// reported as unhealthy (false).
func kubeletHttpHealthCheck(healthCheckTimeout time.Duration) bool {
	httpClient := http.Client{Timeout: healthCheckTimeout}
	response, err := httpClient.Get(types.KubeletHealthCheckEndpoint)
	if err != nil {
		// On error the response is not usable, so there is no body to close.
		glog.Info("kubelet failed http health check")
		return false
	}
	// The body must be closed on every path (including non-200 responses),
	// otherwise the underlying connection is leaked.
	defer response.Body.Close()
	if response.StatusCode != http.StatusOK {
		glog.Info("kubelet failed http health check")
		return false
	}
	return true
}
// kubeletConnectionHealthCheck checks for the kubelet-apiserver connection issue
// by counting occurrences of types.KubeletClosedConnectionLogPattern in the
// kubelet journal, starting from (now - uptime), where uptime is the value
// reported by the kubelet uptime function.
//
// Returns (false, nil) when the number of matching log lines reaches
// types.KubeletClosedConnectionLogPatternThresholdCount, and (true, nil) otherwise.
// A non-nil error means the check itself could not be carried out (uptime lookup,
// log query, or count parsing failed); callers should not rely on the boolean then.
func kubeletConnectionHealthCheck() (bool, error) {
	kubeletUptimeFunc := getUptimeFunc(types.KubeletComponent)
	uptime, err := kubeletUptimeFunc()
	if err != nil {
		return true, err
	}
	// Formatting a time cannot fail, so no error check is needed here.
	logStartTime := time.Now().Add(-uptime).Format(types.LogParsingTimeLayout)
	// The exit status of the pipeline is that of `wc`, so "no matching lines"
	// yields a count of 0 rather than an error from grep.
	out, err := execCommand(types.CmdTimeout, "/bin/sh", "-c",
		// Query kubelet logs since the logStartTime
		`journalctl --unit kubelet --since "`+logStartTime+
			// Grep the pattern for lost connection (case-insensitive)
			`" | grep -i "`+types.KubeletClosedConnectionLogPattern+
			// Get the count of occurrences
			`" | wc -l`)
	if err != nil {
		return true, err
	}
	// Some wc implementations pad the count with whitespace; strip it so that
	// Atoi does not reject an otherwise valid number.
	occurrences, err := strconv.Atoi(strings.TrimSpace(out))
	if err != nil {
		return true, err
	}
	if occurrences >= types.KubeletClosedConnectionLogPatternThresholdCount {
		glog.Infof("kubelet failed apiserver connection check, log pattern occurrences: %v", occurrences)
		return false, nil
	}
	return true, nil
}
// getKubeletHealthCheckFunc builds the kubelet health check closure.
//
// The returned function runs the HTTP health check (bounded by
// healthCheckTimeout) followed by the apiserver connection check. It reports
// healthy only when both checks pass. If the connection check cannot be
// evaluated, the error is propagated so that the plugin can report an Unknown
// status instead of a health verdict.
func getKubeletHealthCheckFunc(healthCheckTimeout time.Duration) func() (bool, error) {
	check := func() (bool, error) {
		// Both checks always run, HTTP first, regardless of the HTTP result.
		httpOK := kubeletHttpHealthCheck(healthCheckTimeout)
		connOK, connErr := kubeletConnectionHealthCheck()
		if connErr != nil {
			glog.Infof("Error in determining apiserver connection health: %v", connErr)
			return false, connErr
		}
		if !httpOK {
			return false, nil
		}
		return connOK, nil
	}
	return check
}

View File

@@ -25,7 +25,7 @@ import (
var repairCalled bool
func NewTestHealthChecker(repairFunc func(), healthCheckFunc func() bool, uptimeFunc func() (time.Duration, error), enableRepair bool) types.HealthChecker {
func NewTestHealthChecker(repairFunc func(), healthCheckFunc func() (bool, error), uptimeFunc func() (time.Duration, error), enableRepair bool) types.HealthChecker {
repairCalled = false
return &healthChecker{
enableRepair: enableRepair,
@@ -37,12 +37,12 @@ func NewTestHealthChecker(repairFunc func(), healthCheckFunc func() bool, uptime
}
}
func healthyFunc() bool {
return true
func healthyFunc() (bool, error) {
return true, nil
}
func unhealthyFunc() bool {
return false
func unhealthyFunc() (bool, error) {
return false, nil
}
func repairFunc() {
@@ -62,7 +62,7 @@ func TestHealthCheck(t *testing.T) {
description string
enableRepair bool
healthy bool
healthCheckFunc func() bool
healthCheckFunc func() (bool, error)
uptimeFunc func() (time.Duration, error)
repairFunc func()
repairCalled bool
@@ -106,7 +106,10 @@ func TestHealthCheck(t *testing.T) {
} {
t.Run(tc.description, func(t *testing.T) {
hc := NewTestHealthChecker(tc.repairFunc, tc.healthCheckFunc, tc.uptimeFunc, tc.enableRepair)
healthy := hc.CheckHealth()
healthy, err := hc.CheckHealth()
if err != nil {
t.Errorf("unexpected error occurred got %v; expected nil", err)
}
if healthy != tc.healthy {
t.Errorf("incorrect health returned got %t; expected %t", healthy, tc.healthy)
}

View File

@@ -19,19 +19,25 @@ package types
import "time"
const (
DefaultCoolDownTime = 2 * time.Minute
DefaultHealthCheckTimeout = 10 * time.Second
CmdTimeout = 10 * time.Second
DefaultCriCtl = "/usr/bin/crictl"
DefaultCriSocketPath = "unix:///var/run/containerd/containerd.sock"
KubeletComponent = "kubelet"
CRIComponent = "cri"
DockerComponent = "docker"
ContainerdService = "containerd"
KubeletHealthCheckEndpoint = "http://127.0.0.1:10248/healthz"
UptimeTimeLayout = "Mon 2006-01-02 15:04:05 UTC"
DefaultCoolDownTime = 2 * time.Minute
DefaultHealthCheckTimeout = 10 * time.Second
CmdTimeout = 10 * time.Second
UptimeTimeLayout = "Mon 2006-01-02 15:04:05 UTC"
LogParsingTimeLayout = "2006-01-02 15:04:05"
DefaultCriCtl = "/usr/bin/crictl"
DefaultCriSocketPath = "unix:///var/run/containerd/containerd.sock"
KubeletComponent = "kubelet"
CRIComponent = "cri"
DockerComponent = "docker"
ContainerdService = "containerd"
KubeletHealthCheckEndpoint = "http://127.0.0.1:10248/healthz"
KubeletClosedConnectionLogPattern = "use of closed network connection"
KubeletClosedConnectionLogPatternThresholdCount = 10
)
type HealthChecker interface {
CheckHealth() bool
CheckHealth() (bool, error)
}