Merge pull request #536 from abansal4032/configure-log-pattern

Make log pattern check configurable in health checker
This commit is contained in:
Kubernetes Prow Robot
2021-02-17 18:32:51 -08:00
committed by GitHub
4 changed files with 206 additions and 46 deletions

View File

@@ -40,6 +40,7 @@ type HealthCheckerOptions struct {
CriSocketPath string
CoolDownTime time.Duration
HealthCheckTimeout time.Duration
LogPatterns types.LogPatternFlag
}
// AddFlags adds health checker command line options to pflag.
@@ -57,6 +58,8 @@ func (hco *HealthCheckerOptions) AddFlags(fs *pflag.FlagSet) {
"The duration to wait for the service to be up before attempting repair.")
fs.DurationVar(&hco.HealthCheckTimeout, "health-check-timeout", types.DefaultHealthCheckTimeout,
"The time to wait before marking the component as unhealthy.")
fs.Var(&hco.LogPatterns, "log-pattern",
"The log pattern to look for in service journald logs. The format for flag value <failureThresholdCount>:<logPattern>")
}
// IsValid validates health checker command line options.

View File

@@ -33,6 +33,7 @@ import (
type healthChecker struct {
component string
systemdService string
enableRepair bool
healthCheckFunc func() (bool, error)
// The repair is "best-effort" and ignores the error from the underlying actions.
@@ -42,6 +43,7 @@ type healthChecker struct {
crictlPath string
healthCheckTimeout time.Duration
coolDownTime time.Duration
logPatternsToCheck map[string]int
}
// NewHealthChecker returns a new health checker configured with the given options.
@@ -52,6 +54,8 @@ func NewHealthChecker(hco *options.HealthCheckerOptions) (types.HealthChecker, e
crictlPath: hco.CriCtlPath,
healthCheckTimeout: hco.HealthCheckTimeout,
coolDownTime: hco.CoolDownTime,
systemdService: hco.SystemdService,
logPatternsToCheck: hco.LogPatterns.GetLogPatternCountMap(),
}
hc.healthCheckFunc = getHealthCheckFunc(hco)
hc.repairFunc = getRepairFunc(hco)
@@ -106,7 +110,14 @@ func getRepairFunc(hco *options.HealthCheckerOptions) func() {
func getHealthCheckFunc(hco *options.HealthCheckerOptions) func() (bool, error) {
switch hco.Component {
case types.KubeletComponent:
return getKubeletHealthCheckFunc(hco.HealthCheckTimeout)
return func() (bool, error) {
httpClient := http.Client{Timeout: hco.HealthCheckTimeout}
response, err := httpClient.Get(types.KubeletHealthCheckEndpoint)
if err != nil || response.StatusCode != http.StatusOK {
return false, nil
}
return true, nil
}
case types.DockerComponent:
return func() (bool, error) {
if _, err := execCommand(hco.HealthCheckTimeout, "docker", "ps"); err != nil {
@@ -132,7 +143,11 @@ func (hc *healthChecker) CheckHealth() (bool, error) {
if err != nil {
return healthy, err
}
if healthy {
logPatternHealthy, err := logPatternHealthCheck(hc.systemdService, hc.logPatternsToCheck)
if err != nil {
return logPatternHealthy, err
}
if healthy && logPatternHealthy {
return true, nil
}
// The service is unhealthy.
@@ -165,24 +180,14 @@ func execCommand(timeout time.Duration, command string, args ...string) (string,
return strings.TrimSuffix(string(out), "\n"), nil
}
// kubeletHttpHealthCheck checks the health api response on kubelet.
// Returns true for healthy, false otherwise.
func kubeletHttpHealthCheck(healthCheckTimeout time.Duration) bool {
httpClient := http.Client{Timeout: healthCheckTimeout}
response, err := httpClient.Get(types.KubeletHealthCheckEndpoint)
if err != nil || response.StatusCode != http.StatusOK {
glog.Info("kubelet failed http health check")
return false
// logPatternHealthCheck checks for the provided logPattern occurrences in the service logs.
// Returns true if the pattern is empty or does not exist logThresholdCount times since start of service, false otherwise.
func logPatternHealthCheck(service string, logPatternsToCheck map[string]int) (bool, error) {
if len(logPatternsToCheck) == 0 {
return true, nil
}
return true
}
// kubeletConnectionHealthCheck checks for the kubelet-apiserver connection issue
// by checking repeated occurrences of log "use of closed network connection" in kubelet logs.
// Returns true if the pattern does not exist 10 times since start of service or the last 10 min, false otherwise.
func kubeletConnectionHealthCheck() (bool, error) {
kubeletUptimeFunc := getUptimeFunc(types.KubeletComponent)
uptime, err := kubeletUptimeFunc()
uptimeFunc := getUptimeFunc(service)
uptime, err := uptimeFunc()
if err != nil {
return true, err
}
@@ -190,11 +195,23 @@ func kubeletConnectionHealthCheck() (bool, error) {
if err != nil {
return true, err
}
for pattern, count := range logPatternsToCheck {
healthy, err := checkForPattern(service, logStartTime, pattern, count)
if err != nil || !healthy {
return healthy, err
}
}
return true, nil
}
// checkForPattern returns (true, nil) if logPattern occurs atleast logCountThreshold number of times since last
// service restart. (false, nil) otherwise.
func checkForPattern(service, logStartTime, logPattern string, logCountThreshold int) (bool, error) {
out, err := execCommand(types.CmdTimeout, "/bin/sh", "-c",
// Query kubelet logs since the logStartTime
`journalctl --unit kubelet --since "`+logStartTime+
// Grep the pattern for lost connection
`" | grep -i "`+types.KubeletClosedConnectionLogPattern+
// Query service logs since the logStartTime
`journalctl --unit "`+service+`" --since "`+logStartTime+
// Grep the pattern
`" | grep -i "`+logPattern+
// Get the count of occurrences
`" | wc -l`)
if err != nil {
@@ -204,26 +221,9 @@ func kubeletConnectionHealthCheck() (bool, error) {
if err != nil {
return true, err
}
if occurrences >= types.KubeletClosedConnectionLogPatternThresholdCount {
glog.Infof("kubelet failed apiserver connection check, log pattern occurrences: %v", occurrences)
if occurrences >= logCountThreshold {
glog.Infof("%s failed log pattern check, %s occurrences: %v", service, logPattern, occurrences)
return false, nil
}
return true, nil
}
// getKubeletHealthCheckFunc returns a function that checks for kubelet health and
// return false if identified as unhealthy, true otherwise.
func getKubeletHealthCheckFunc(healthCheckTimeout time.Duration) func() (bool, error) {
return func() (bool, error) {
httpHealthy := kubeletHttpHealthCheck(healthCheckTimeout)
connectionHealthy, err := kubeletConnectionHealthCheck()
// The plugin will return Unknown status code in case there is any error in
// checking kubelet health.
if err != nil {
glog.Infof("Error in determining apiserver connection health: %v", err)
return false, err
}
healthy := httpHealthy && connectionHealthy
return healthy, nil
}
}

View File

@@ -16,7 +16,12 @@ limitations under the License.
package types
import "time"
import (
"fmt"
"strconv"
"strings"
"time"
)
const (
DefaultCoolDownTime = 2 * time.Minute
@@ -33,11 +38,63 @@ const (
DockerComponent = "docker"
ContainerdService = "containerd"
KubeletHealthCheckEndpoint = "http://127.0.0.1:10248/healthz"
KubeletClosedConnectionLogPattern = "use of closed network connection"
KubeletClosedConnectionLogPatternThresholdCount = 10
KubeletHealthCheckEndpoint = "http://127.0.0.1:10248/healthz"
LogPatternFlagSeparator = ":"
)
type HealthChecker interface {
CheckHealth() (bool, error)
}
// LogPatternFlag defines the flag for log pattern health check.
// It contains a map of <log pattern> to <failure threshold for the pattern>
type LogPatternFlag struct {
logPatternCountMap map[string]int
}
// String implements the String function for flag.Value interface
func (lpf *LogPatternFlag) String() string {
result := ""
for k, v := range lpf.logPatternCountMap {
if result != "" {
result += " "
}
result += fmt.Sprintf("%v:%v", k, v)
}
return result
}
// Set implements the Set function for flag.Value interface
func (lpf *LogPatternFlag) Set(value string) error {
if lpf.logPatternCountMap == nil {
lpf.logPatternCountMap = make(map[string]int)
}
items := strings.Split(value, ",")
for _, item := range items {
val := strings.SplitN(item, LogPatternFlagSeparator, 2)
if len(val) != 2 {
return fmt.Errorf("invalid format of the flag value: %v", val)
}
countThreshold, err := strconv.Atoi(val[0])
if err != nil || countThreshold == 0 {
return fmt.Errorf("invalid format for the flag value: %v: %v", val, err)
}
pattern := val[1]
if pattern == "" {
return fmt.Errorf("invalid format for the flag value: %v: %v", val, err)
}
lpf.logPatternCountMap[pattern] = countThreshold
}
return nil
}
// Type implements the Type function for flag.Value interface
func (lpf *LogPatternFlag) Type() string {
return "logPatternFlag"
}
// GetLogPatternCountMap returns the stored log count map
func (lpf *LogPatternFlag) GetLogPatternCountMap() map[string]int {
return lpf.logPatternCountMap
}

View File

@@ -0,0 +1,100 @@
/*
Copyright 2021 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package types
import (
"reflect"
"testing"
"github.com/stretchr/testify/assert"
)
func TestLogPatternFlag(t *testing.T) {
testCases := []struct {
name string
value string
expectedStringVal string
expectedLogPatternCountMap map[string]int
expectSetError bool
}{
{
name: "valid single flag value",
value: "10:pattern1",
expectedStringVal: "pattern1:10",
expectedLogPatternCountMap: map[string]int{"pattern1": 10},
expectSetError: false,
},
{
name: "valid multiple flag values",
value: "10:pattern1,20:pattern2",
expectedStringVal: "pattern1:10 pattern2:20",
expectedLogPatternCountMap: map[string]int{"pattern1": 10, "pattern2": 20},
expectSetError: false,
},
{
name: "empty log pattern",
value: "10:",
expectSetError: true,
},
{
name: "0 failure threshold count",
value: "0:pattern1",
expectSetError: true,
},
{
name: "empty failure threshold count",
value: ":pattern1",
expectSetError: true,
},
{
name: "empty failure threshold count and pattern",
value: ":",
expectSetError: true,
},
{
name: "non integer value in failure threshold",
value: "notAnInteger:pattern1",
expectSetError: true,
},
{
name: "valid log pattern with ':'",
value: "10:pattern1a:pattern1b,20:pattern2",
expectedStringVal: "pattern1a:pattern1b:10 pattern2:20",
expectedLogPatternCountMap: map[string]int{"pattern1a:pattern1b": 10, "pattern2": 20},
expectSetError: false,
},
}
for _, test := range testCases {
t.Run(test.name, func(t *testing.T) {
flag := LogPatternFlag{}
err := flag.Set(test.value)
if test.expectSetError {
assert.Error(t, err)
} else {
assert.NoError(t, err)
actualStringVal := flag.String()
actualLogPatternCountMap := flag.GetLogPatternCountMap()
assert.Equal(t, test.expectedStringVal, actualStringVal)
if !reflect.DeepEqual(test.expectedLogPatternCountMap, actualLogPatternCountMap) {
t.Fatalf("logPatternCountMap mismatch, expected: %v, actual: %v", test.expectedLogPatternCountMap, actualLogPatternCountMap)
}
assert.Equal(t, test.expectedLogPatternCountMap, actualLogPatternCountMap)
}
})
}
}