mirror of
https://github.com/kubernetes/node-problem-detector.git
synced 2026-05-03 07:47:03 +00:00
Merge pull request #458 from abansal4032/logging-improvements
Log custom plugin stderr only if the status is not ok.
This commit is contained in:
@@ -89,7 +89,7 @@ func (p *Plugin) Run() {
|
|||||||
|
|
||||||
// run each rule in parallel and wait for them to complete
|
// run each rule in parallel and wait for them to complete
|
||||||
func (p *Plugin) runRules() {
|
func (p *Plugin) runRules() {
|
||||||
glog.Info("Start to run custom plugins")
|
glog.V(3).Info("Start to run custom plugins")
|
||||||
|
|
||||||
for _, rule := range p.config.Rules {
|
for _, rule := range p.config.Rules {
|
||||||
p.syncChan <- struct{}{}
|
p.syncChan <- struct{}{}
|
||||||
@@ -120,7 +120,7 @@ func (p *Plugin) runRules() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
p.Wait()
|
p.Wait()
|
||||||
glog.Info("Finish running custom plugins")
|
glog.V(3).Info("Finish running custom plugins")
|
||||||
}
|
}
|
||||||
|
|
||||||
// readFromReader reads the maxBytes from the reader and drains the rest.
|
// readFromReader reads the maxBytes from the reader and drains the rest.
|
||||||
@@ -203,12 +203,6 @@ func (p *Plugin) run(rule cpmtypes.CustomRule) (exitStatus cpmtypes.Status, outp
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// log the stderr from the plugin
|
|
||||||
if len(stderr) != 0 {
|
|
||||||
glog.Infof("Start logs from plugin %q \n %s", rule.Path, string(stderr))
|
|
||||||
glog.Infof("End logs from plugin %q", rule.Path)
|
|
||||||
}
|
|
||||||
|
|
||||||
// trim suffix useless bytes
|
// trim suffix useless bytes
|
||||||
output = string(stdout)
|
output = string(stdout)
|
||||||
output = strings.TrimSpace(output)
|
output = strings.TrimSpace(output)
|
||||||
@@ -227,8 +221,10 @@ func (p *Plugin) run(rule cpmtypes.CustomRule) (exitStatus cpmtypes.Status, outp
|
|||||||
case 0:
|
case 0:
|
||||||
return cpmtypes.OK, output
|
return cpmtypes.OK, output
|
||||||
case 1:
|
case 1:
|
||||||
|
logPluginStderr(rule.Path, string(stderr))
|
||||||
return cpmtypes.NonOK, output
|
return cpmtypes.NonOK, output
|
||||||
default:
|
default:
|
||||||
|
logPluginStderr(rule.Path, string(stderr))
|
||||||
return cpmtypes.Unknown, output
|
return cpmtypes.Unknown, output
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -237,3 +233,10 @@ func (p *Plugin) Stop() {
|
|||||||
p.tomb.Stop()
|
p.tomb.Stop()
|
||||||
glog.Info("Stop plugin execution")
|
glog.Info("Stop plugin execution")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func logPluginStderr(path, logs string) {
|
||||||
|
if len(logs) != 0 {
|
||||||
|
glog.Infof("Start logs from plugin %q \n %s", path, string(logs))
|
||||||
|
glog.Infof("End logs from plugin %q", path)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
@@ -31,6 +31,7 @@ import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
type healthChecker struct {
|
type healthChecker struct {
|
||||||
|
component string
|
||||||
enableRepair bool
|
enableRepair bool
|
||||||
healthCheckFunc func() bool
|
healthCheckFunc func() bool
|
||||||
// The repair is "best-effort" and ignores the error from the underlying actions.
|
// The repair is "best-effort" and ignores the error from the underlying actions.
|
||||||
@@ -45,6 +46,7 @@ type healthChecker struct {
|
|||||||
// NewHealthChecker returns a new health checker configured with the given options.
|
// NewHealthChecker returns a new health checker configured with the given options.
|
||||||
func NewHealthChecker(hco *options.HealthCheckerOptions) (types.HealthChecker, error) {
|
func NewHealthChecker(hco *options.HealthCheckerOptions) (types.HealthChecker, error) {
|
||||||
hc := &healthChecker{
|
hc := &healthChecker{
|
||||||
|
component: hco.Component,
|
||||||
enableRepair: hco.EnableRepair,
|
enableRepair: hco.EnableRepair,
|
||||||
crictlPath: hco.CriCtlPath,
|
crictlPath: hco.CriCtlPath,
|
||||||
healthCheckTimeout: hco.HealthCheckTimeout,
|
healthCheckTimeout: hco.HealthCheckTimeout,
|
||||||
@@ -139,14 +141,14 @@ func (hc *healthChecker) CheckHealth() bool {
|
|||||||
// The service is unhealthy.
|
// The service is unhealthy.
|
||||||
// Attempt repair based on flag.
|
// Attempt repair based on flag.
|
||||||
if hc.enableRepair {
|
if hc.enableRepair {
|
||||||
glog.Infof("health-checker: component is unhealthy, proceeding to repair")
|
|
||||||
// repair if the service has been up for the cool down period.
|
// repair if the service has been up for the cool down period.
|
||||||
uptime, err := hc.uptimeFunc()
|
uptime, err := hc.uptimeFunc()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
glog.Infof("health-checker: %v\n", err.Error())
|
glog.Infof("error in getting uptime for %v: %v\n", hc.component, err)
|
||||||
}
|
}
|
||||||
glog.Infof("health-checker: component uptime: %v\n", uptime)
|
glog.Infof("%v is unhealthy, component uptime: %v\n", hc.component, uptime)
|
||||||
if uptime > hc.coolDownTime {
|
if uptime > hc.coolDownTime {
|
||||||
|
glog.Infof("%v cooldown period of %v exceeded, repairing", hc.component, hc.coolDownTime)
|
||||||
hc.repairFunc()
|
hc.repairFunc()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -159,10 +161,9 @@ func execCommand(timeout time.Duration, command string, args ...string) (string,
|
|||||||
defer cancel()
|
defer cancel()
|
||||||
|
|
||||||
cmd := exec.CommandContext(ctx, command, args...)
|
cmd := exec.CommandContext(ctx, command, args...)
|
||||||
glog.Infof("health-checker: executing command : %v\n", cmd)
|
|
||||||
out, err := cmd.Output()
|
out, err := cmd.Output()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
glog.Infof("health-checker: command failed : %v, %v\n", err.Error(), out)
|
glog.Infof("command %v failed: %v, %v\n", cmd, err, out)
|
||||||
return "", err
|
return "", err
|
||||||
}
|
}
|
||||||
return strings.TrimSuffix(string(out), "\n"), nil
|
return strings.TrimSuffix(string(out), "\n"), nil
|
||||||
|
|||||||
Reference in New Issue
Block a user