Merge pull request #536 from abansal4032/configure-log-pattern

Make log pattern check configurable in health checker
2026-05-06 01:07:07 +00:00 · 2021-02-17 18:32:51 -08:00
parent fc4f167caa 100f2bf8e6
commit ee5f2d1aa5
4 changed files with 206 additions and 46 deletions
--- a/cmd/healthchecker/options/options.go
+++ b/cmd/healthchecker/options/options.go
@@ -40,6 +40,7 @@ type HealthCheckerOptions struct {
 	CriSocketPath      string
 	CoolDownTime       time.Duration
 	HealthCheckTimeout time.Duration
+	LogPatterns        types.LogPatternFlag
 }

 // AddFlags adds health checker command line options to pflag.
@@ -57,6 +58,8 @@ func (hco *HealthCheckerOptions) AddFlags(fs *pflag.FlagSet) {
 		"The duration to wait for the service to be up before attempting repair.")
 	fs.DurationVar(&hco.HealthCheckTimeout, "health-check-timeout", types.DefaultHealthCheckTimeout,
 		"The time to wait before marking the component as unhealthy.")
+	fs.Var(&hco.LogPatterns, "log-pattern",
+		"The log pattern to look for in service journald logs. The format for flag value <failureThresholdCount>:<logPattern>")
 }

 // IsValid validates health checker command line options.
--- a/pkg/healthchecker/health_checker.go
+++ b/pkg/healthchecker/health_checker.go
@@ -33,6 +33,7 @@ import (

 type healthChecker struct {
 	component       string
+	systemdService  string
 	enableRepair    bool
 	healthCheckFunc func() (bool, error)
 	// The repair is "best-effort" and ignores the error from the underlying actions.
@@ -42,6 +43,7 @@ type healthChecker struct {
 	crictlPath         string
 	healthCheckTimeout time.Duration
 	coolDownTime       time.Duration
+	logPatternsToCheck map[string]int
 }

 // NewHealthChecker returns a new health checker configured with the given options.
@@ -52,6 +54,8 @@ func NewHealthChecker(hco *options.HealthCheckerOptions) (types.HealthChecker, e
 		crictlPath:         hco.CriCtlPath,
 		healthCheckTimeout: hco.HealthCheckTimeout,
 		coolDownTime:       hco.CoolDownTime,
+		systemdService:     hco.SystemdService,
+		logPatternsToCheck: hco.LogPatterns.GetLogPatternCountMap(),
 	}
 	hc.healthCheckFunc = getHealthCheckFunc(hco)
 	hc.repairFunc = getRepairFunc(hco)
@@ -106,7 +110,14 @@ func getRepairFunc(hco *options.HealthCheckerOptions) func() {
 func getHealthCheckFunc(hco *options.HealthCheckerOptions) func() (bool, error) {
 	switch hco.Component {
 	case types.KubeletComponent:
-		return getKubeletHealthCheckFunc(hco.HealthCheckTimeout)
+		return func() (bool, error) {
+			httpClient := http.Client{Timeout: hco.HealthCheckTimeout}
+			response, err := httpClient.Get(types.KubeletHealthCheckEndpoint)
+			if err != nil || response.StatusCode != http.StatusOK {
+				return false, nil
+			}
+			return true, nil
+		}
 	case types.DockerComponent:
 		return func() (bool, error) {
 			if _, err := execCommand(hco.HealthCheckTimeout, "docker", "ps"); err != nil {
@@ -132,7 +143,11 @@ func (hc *healthChecker) CheckHealth() (bool, error) {
 	if err != nil {
 		return healthy, err
 	}
-	if healthy {
+	logPatternHealthy, err := logPatternHealthCheck(hc.systemdService, hc.logPatternsToCheck)
+	if err != nil {
+		return logPatternHealthy, err
+	}
+	if healthy && logPatternHealthy {
 		return true, nil
 	}
 	// The service is unhealthy.
@@ -165,24 +180,14 @@ func execCommand(timeout time.Duration, command string, args ...string) (string,
 	return strings.TrimSuffix(string(out), "\n"), nil
 }

-// kubeletHttpHealthCheck checks the health api response on kubelet.
-// Returns true for healthy, false otherwise.
-func kubeletHttpHealthCheck(healthCheckTimeout time.Duration) bool {
-	httpClient := http.Client{Timeout: healthCheckTimeout}
-	response, err := httpClient.Get(types.KubeletHealthCheckEndpoint)
-	if err != nil || response.StatusCode != http.StatusOK {
-		glog.Info("kubelet failed http health check")
-		return false
+// logPatternHealthCheck checks for the provided logPattern occurrences in the service logs.
+// Returns true if the pattern is empty or does not exist logThresholdCount times since start of service, false otherwise.
+func logPatternHealthCheck(service string, logPatternsToCheck map[string]int) (bool, error) {
+	if len(logPatternsToCheck) == 0 {
+		return true, nil
 	}
-	return true
-}
-
-// kubeletConnectionHealthCheck checks for the kubelet-apiserver connection issue
-// by checking repeated occurrences of log "use of closed network connection" in kubelet logs.
-// Returns true if the pattern does not exist 10 times since start of service or the last 10 min, false otherwise.
-func kubeletConnectionHealthCheck() (bool, error) {
-	kubeletUptimeFunc := getUptimeFunc(types.KubeletComponent)
-	uptime, err := kubeletUptimeFunc()
+	uptimeFunc := getUptimeFunc(service)
+	uptime, err := uptimeFunc()
 	if err != nil {
 		return true, err
 	}
@@ -190,11 +195,23 @@ func kubeletConnectionHealthCheck() (bool, error) {
 	if err != nil {
 		return true, err
 	}
+	for pattern, count := range logPatternsToCheck {
+		healthy, err := checkForPattern(service, logStartTime, pattern, count)
+		if err != nil || !healthy {
+			return healthy, err
+		}
+	}
+	return true, nil
+}
+
+// checkForPattern returns (true, nil) if logPattern occurs atleast logCountThreshold number of times since last
+// service restart. (false, nil) otherwise.
+func checkForPattern(service, logStartTime, logPattern string, logCountThreshold int) (bool, error) {
 	out, err := execCommand(types.CmdTimeout, "/bin/sh", "-c",
-		// Query kubelet logs since the logStartTime
-		`journalctl --unit kubelet --since "`+logStartTime+
-			// Grep the pattern for lost connection
-			`" | grep -i "`+types.KubeletClosedConnectionLogPattern+
+		// Query service logs since the logStartTime
+		`journalctl --unit "`+service+`" --since "`+logStartTime+
+			// Grep the pattern
+			`" | grep -i "`+logPattern+
 			// Get the count of occurrences
 			`" | wc -l`)
 	if err != nil {
@@ -204,26 +221,9 @@ func kubeletConnectionHealthCheck() (bool, error) {
 	if err != nil {
 		return true, err
 	}
-	if occurrences >= types.KubeletClosedConnectionLogPatternThresholdCount {
-		glog.Infof("kubelet failed apiserver connection check, log pattern occurrences: %v", occurrences)
+	if occurrences >= logCountThreshold {
+		glog.Infof("%s failed log pattern check, %s occurrences: %v", service, logPattern, occurrences)
 		return false, nil
 	}
 	return true, nil
 }
-
-// getKubeletHealthCheckFunc returns a function that checks for kubelet health and
-// return false if identified as unhealthy, true otherwise.
-func getKubeletHealthCheckFunc(healthCheckTimeout time.Duration) func() (bool, error) {
-	return func() (bool, error) {
-		httpHealthy := kubeletHttpHealthCheck(healthCheckTimeout)
-		connectionHealthy, err := kubeletConnectionHealthCheck()
-		// The plugin will return Unknown status code in case there is any error in
-		// checking kubelet health.
-		if err != nil {
-			glog.Infof("Error in determining apiserver connection health: %v", err)
-			return false, err
-		}
-		healthy := httpHealthy && connectionHealthy
-		return healthy, nil
-	}
-}
--- a/pkg/healthchecker/types/types.go
+++ b/pkg/healthchecker/types/types.go
@@ -16,7 +16,12 @@ limitations under the License.

 package types

-import "time"
+import (
+	"fmt"
+	"strconv"
+	"strings"
+	"time"
+)

 const (
 	DefaultCoolDownTime       = 2 * time.Minute
@@ -33,11 +38,63 @@ const (
 	DockerComponent   = "docker"
 	ContainerdService = "containerd"

-	KubeletHealthCheckEndpoint                      = "http://127.0.0.1:10248/healthz"
-	KubeletClosedConnectionLogPattern               = "use of closed network connection"
-	KubeletClosedConnectionLogPatternThresholdCount = 10
+	KubeletHealthCheckEndpoint = "http://127.0.0.1:10248/healthz"
+
+	LogPatternFlagSeparator = ":"
 )

 type HealthChecker interface {
 	CheckHealth() (bool, error)
 }
+
+// LogPatternFlag defines the flag for log pattern health check.
+// It contains a map of <log pattern> to <failure threshold for the pattern>
+type LogPatternFlag struct {
+	logPatternCountMap map[string]int
+}
+
+// String implements the String function for flag.Value interface
+func (lpf *LogPatternFlag) String() string {
+	result := ""
+	for k, v := range lpf.logPatternCountMap {
+		if result != "" {
+			result += " "
+		}
+		result += fmt.Sprintf("%v:%v", k, v)
+	}
+	return result
+}
+
+// Set implements the Set function for flag.Value interface
+func (lpf *LogPatternFlag) Set(value string) error {
+	if lpf.logPatternCountMap == nil {
+		lpf.logPatternCountMap = make(map[string]int)
+	}
+	items := strings.Split(value, ",")
+	for _, item := range items {
+		val := strings.SplitN(item, LogPatternFlagSeparator, 2)
+		if len(val) != 2 {
+			return fmt.Errorf("invalid format of the flag value: %v", val)
+		}
+		countThreshold, err := strconv.Atoi(val[0])
+		if err != nil || countThreshold == 0 {
+			return fmt.Errorf("invalid format for the flag value: %v: %v", val, err)
+		}
+		pattern := val[1]
+		if pattern == "" {
+			return fmt.Errorf("invalid format for the flag value: %v: %v", val, err)
+		}
+		lpf.logPatternCountMap[pattern] = countThreshold
+	}
+	return nil
+}
+
+// Type implements the Type function for flag.Value interface
+func (lpf *LogPatternFlag) Type() string {
+	return "logPatternFlag"
+}
+
+// GetLogPatternCountMap returns the stored log count map
+func (lpf *LogPatternFlag) GetLogPatternCountMap() map[string]int {
+	return lpf.logPatternCountMap
+}
--- a/pkg/healthchecker/types/types_test.go
+++ b/pkg/healthchecker/types/types_test.go
@@ -0,0 +1,100 @@
+/*
+Copyright 2021 The Kubernetes Authors All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package types
+
+import (
+	"reflect"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+)
+
+func TestLogPatternFlag(t *testing.T) {
+	testCases := []struct {
+		name                       string
+		value                      string
+		expectedStringVal          string
+		expectedLogPatternCountMap map[string]int
+		expectSetError             bool
+	}{
+		{
+			name:                       "valid single flag value",
+			value:                      "10:pattern1",
+			expectedStringVal:          "pattern1:10",
+			expectedLogPatternCountMap: map[string]int{"pattern1": 10},
+			expectSetError:             false,
+		},
+		{
+			name:                       "valid multiple flag values",
+			value:                      "10:pattern1,20:pattern2",
+			expectedStringVal:          "pattern1:10 pattern2:20",
+			expectedLogPatternCountMap: map[string]int{"pattern1": 10, "pattern2": 20},
+			expectSetError:             false,
+		},
+		{
+			name:           "empty log pattern",
+			value:          "10:",
+			expectSetError: true,
+		},
+		{
+			name:           "0 failure threshold count",
+			value:          "0:pattern1",
+			expectSetError: true,
+		},
+		{
+			name:           "empty failure threshold count",
+			value:          ":pattern1",
+			expectSetError: true,
+		},
+		{
+			name:           "empty failure threshold count and pattern",
+			value:          ":",
+			expectSetError: true,
+		},
+		{
+			name:           "non integer value in failure threshold",
+			value:          "notAnInteger:pattern1",
+			expectSetError: true,
+		},
+		{
+			name:                       "valid log pattern with ':'",
+			value:                      "10:pattern1a:pattern1b,20:pattern2",
+			expectedStringVal:          "pattern1a:pattern1b:10 pattern2:20",
+			expectedLogPatternCountMap: map[string]int{"pattern1a:pattern1b": 10, "pattern2": 20},
+			expectSetError:             false,
+		},
+	}
+
+	for _, test := range testCases {
+		t.Run(test.name, func(t *testing.T) {
+			flag := LogPatternFlag{}
+			err := flag.Set(test.value)
+			if test.expectSetError {
+				assert.Error(t, err)
+			} else {
+				assert.NoError(t, err)
+				actualStringVal := flag.String()
+				actualLogPatternCountMap := flag.GetLogPatternCountMap()
+				assert.Equal(t, test.expectedStringVal, actualStringVal)
+				if !reflect.DeepEqual(test.expectedLogPatternCountMap, actualLogPatternCountMap) {
+					t.Fatalf("logPatternCountMap mismatch, expected: %v, actual: %v", test.expectedLogPatternCountMap, actualLogPatternCountMap)
+				}
+				assert.Equal(t, test.expectedLogPatternCountMap, actualLogPatternCountMap)
+			}
+		})
+	}
+}