mirror of
https://github.com/kubernetes/node-problem-detector.git
synced 2026-02-19 20:40:07 +00:00
Fix kernel monitor issues:
* Change `unregister_netdevice` to be an event to fix #47. * Change `KernelPanic` to `KernelOops` because we can't handle kernel panic currently. * Use system boot time instead of "StartPattern" to fix #48.
This commit is contained in:
@@ -1,8 +1,7 @@
|
||||
{
|
||||
"plugin": "journald",
|
||||
"logPath": "/var/log/journal",
|
||||
"lookback": "10m",
|
||||
"startPattern": "Initializing cgroup subsys cpuset",
|
||||
"lookback": "5m",
|
||||
"bufferSize": 10,
|
||||
"source": "kernel-monitor",
|
||||
"conditions": [
|
||||
@@ -25,12 +24,17 @@
|
||||
},
|
||||
{
|
||||
"type": "temporary",
|
||||
"reason": "KernelPanic",
|
||||
"reason": "UnregisterNetDevice",
|
||||
"pattern": "unregister_netdevice: waiting for \\w+ to become free. Usage count = \\d+"
|
||||
},
|
||||
{
|
||||
"type": "temporary",
|
||||
"reason": "KernelOops",
|
||||
"pattern": "BUG: unable to handle kernel NULL pointer dereference at .*"
|
||||
},
|
||||
{
|
||||
"type": "temporary",
|
||||
"reason": "KernelPanic",
|
||||
"reason": "KernelOops",
|
||||
"pattern": "divide error: 0000 \\[#\\d+\\] SMP"
|
||||
},
|
||||
{
|
||||
@@ -44,12 +48,6 @@
|
||||
"condition": "KernelDeadlock",
|
||||
"reason": "DockerHung",
|
||||
"pattern": "task docker:\\w+ blocked for more than \\w+ seconds\\."
|
||||
},
|
||||
{
|
||||
"type": "permanent",
|
||||
"condition": "KernelDeadlock",
|
||||
"reason": "UnregisterNetDeviceIssue",
|
||||
"pattern": "unregister_netdevice: waiting for \\w+ to become free. Usage count = \\d+"
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
@@ -34,8 +34,6 @@ type MonitorConfig struct {
|
||||
DefaultConditions []types.Condition `json:"conditions"`
|
||||
// Rules are the rules kernel monitor will follow to parse the log file.
|
||||
Rules []kerntypes.Rule `json:"rules"`
|
||||
// StartPattern is the pattern of the start line
|
||||
StartPattern string `json:"startPattern, omitempty"`
|
||||
}
|
||||
|
||||
// applyDefaultConfiguration applies default configurations.
|
||||
|
||||
@@ -70,10 +70,7 @@ func NewKernelMonitorOrDie(configPath string) KernelMonitor {
|
||||
glog.Fatalf("Failed to validate matching rules %#v: %v", k.config.Rules, err)
|
||||
}
|
||||
glog.Infof("Finish parsing log file: %+v", k.config)
|
||||
k.watcher, err = logwatchers.GetLogWatcher(k.config.WatcherConfig)
|
||||
if err != nil {
|
||||
glog.Fatalf("Failed to create log watcher with watcher config %#v: %v", k.config.WatcherConfig, err)
|
||||
}
|
||||
k.watcher = logwatchers.GetLogWatcherOrDie(k.config.WatcherConfig)
|
||||
k.buffer = NewLogBuffer(k.config.BufferSize)
|
||||
// A 1000 size channel should be big enough.
|
||||
k.output = make(chan *types.Status, 1000)
|
||||
@@ -117,12 +114,6 @@ func (k *kernelMonitor) parseLog(log *kerntypes.KernelLog) {
|
||||
// Once there is new log, kernel monitor will push it into the log buffer and try
|
||||
// to match each rule. If any rule is matched, kernel monitor will report a status.
|
||||
k.buffer.Push(log)
|
||||
if matched := k.buffer.Match(k.config.StartPattern); len(matched) != 0 {
|
||||
// Reset the condition if a start log shows up.
|
||||
glog.Infof("Found start log %q, re-initialize the status", generateMessage(matched))
|
||||
k.initializeStatus()
|
||||
return
|
||||
}
|
||||
for _, rule := range k.config.Rules {
|
||||
matched := k.buffer.Match(rule.Pattern)
|
||||
if len(matched) == 0 {
|
||||
|
||||
@@ -17,8 +17,6 @@ limitations under the License.
|
||||
package logwatchers
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"k8s.io/node-problem-detector/pkg/kernelmonitor/logwatchers/types"
|
||||
|
||||
"github.com/golang/glog"
|
||||
@@ -32,12 +30,13 @@ func registerLogWatcher(name string, create types.WatcherCreateFunc) {
|
||||
createFuncs[name] = create
|
||||
}
|
||||
|
||||
// GetLogWatcher get a log watcher based on the passed in configuration.
|
||||
func GetLogWatcher(config types.WatcherConfig) (types.LogWatcher, error) {
|
||||
// GetLogWatcherOrDie get a log watcher based on the passed in configuration.
|
||||
// The function panics when encounts an error.
|
||||
func GetLogWatcherOrDie(config types.WatcherConfig) types.LogWatcher {
|
||||
create, ok := createFuncs[config.Plugin]
|
||||
if !ok {
|
||||
return nil, fmt.Errorf("no create function found for plugin %q", config.Plugin)
|
||||
glog.Fatalf("No create function found for plugin %q", config.Plugin)
|
||||
}
|
||||
glog.Infof("Use log watcher of plugin %q", config.Plugin)
|
||||
return create(config), nil
|
||||
return create(config)
|
||||
}
|
||||
|
||||
@@ -24,5 +24,5 @@ const syslogPluginName = "syslog"
|
||||
|
||||
func init() {
|
||||
// Register the syslog plugin.
|
||||
registerLogWatcher(syslogPluginName, syslog.NewSyslogWatcher)
|
||||
registerLogWatcher(syslogPluginName, syslog.NewSyslogWatcherOrDie)
|
||||
}
|
||||
|
||||
@@ -20,6 +20,7 @@ import (
|
||||
"bufio"
|
||||
"bytes"
|
||||
"io"
|
||||
"syscall"
|
||||
"time"
|
||||
|
||||
utilclock "code.cloudfoundry.org/clock"
|
||||
@@ -35,15 +36,22 @@ type syslogWatcher struct {
|
||||
reader *bufio.Reader
|
||||
closer io.Closer
|
||||
logCh chan *kerntypes.KernelLog
|
||||
uptime time.Time
|
||||
tomb *util.Tomb
|
||||
clock utilclock.Clock
|
||||
}
|
||||
|
||||
// NewSyslogWatcher creates a new kernel log watcher.
|
||||
func NewSyslogWatcher(cfg types.WatcherConfig) types.LogWatcher {
|
||||
// NewSyslogWatcherOrDie creates a new kernel log watcher. The function panics
|
||||
// when encounters an error.
|
||||
func NewSyslogWatcherOrDie(cfg types.WatcherConfig) types.LogWatcher {
|
||||
var info syscall.Sysinfo_t
|
||||
if err := syscall.Sysinfo(&info); err != nil {
|
||||
glog.Fatalf("Failed to get system info: %v", err)
|
||||
}
|
||||
return &syslogWatcher{
|
||||
cfg: cfg,
|
||||
tomb: util.NewTomb(),
|
||||
cfg: cfg,
|
||||
uptime: time.Now().Add(time.Duration(-info.Uptime * int64(time.Second))),
|
||||
tomb: util.NewTomb(),
|
||||
// A capacity 1000 buffer should be enough
|
||||
logCh: make(chan *kerntypes.KernelLog, 1000),
|
||||
clock: utilclock.NewClock(),
|
||||
@@ -51,7 +59,7 @@ func NewSyslogWatcher(cfg types.WatcherConfig) types.LogWatcher {
|
||||
}
|
||||
|
||||
// Make sure NewSyslogWathcer is types.WatcherCreateFunc.
|
||||
var _ types.WatcherCreateFunc = NewSyslogWatcher
|
||||
var _ types.WatcherCreateFunc = NewSyslogWatcherOrDie
|
||||
|
||||
// Watch starts the syslog watcher.
|
||||
func (s *syslogWatcher) Watch() (<-chan *kerntypes.KernelLog, error) {
|
||||
@@ -113,8 +121,8 @@ func (s *syslogWatcher) watchLoop() {
|
||||
glog.Warningf("Unable to parse line: %q, %v", line, err)
|
||||
continue
|
||||
}
|
||||
// If the log is older than look back duration, discard it.
|
||||
if s.clock.Since(log.Timestamp) > lookback {
|
||||
// If the log is older than look back duration or system boot time, discard it.
|
||||
if s.clock.Since(log.Timestamp) > lookback || log.Timestamp.Before(s.uptime) {
|
||||
continue
|
||||
}
|
||||
s.logCh <- log
|
||||
|
||||
@@ -36,6 +36,7 @@ func TestWatch(t *testing.T) {
|
||||
testCases := []struct {
|
||||
log string
|
||||
logs []kerntypes.KernelLog
|
||||
uptime time.Time
|
||||
lookback string
|
||||
}{
|
||||
{
|
||||
@@ -96,6 +97,26 @@ func TestWatch(t *testing.T) {
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
// The start point is at the end of the log file, we look back, but
|
||||
// system rebooted at in the middle of the log file.
|
||||
log: `Jan 2 03:04:03 kernel: [0.000000] 1
|
||||
Jan 2 03:04:04 kernel: [1.000000] 2
|
||||
Jan 2 03:04:05 kernel: [2.000000] 3
|
||||
`,
|
||||
uptime: time.Date(time.Now().Year(), time.January, 2, 3, 4, 4, 0, time.Local),
|
||||
lookback: "2s",
|
||||
logs: []kerntypes.KernelLog{
|
||||
{
|
||||
Timestamp: now.Add(-time.Second),
|
||||
Message: "2",
|
||||
},
|
||||
{
|
||||
Timestamp: now,
|
||||
Message: "3",
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
for c, test := range testCases {
|
||||
t.Logf("TestCase #%d: %#v", c+1, test)
|
||||
@@ -108,11 +129,13 @@ func TestWatch(t *testing.T) {
|
||||
_, err = f.Write([]byte(test.log))
|
||||
assert.NoError(t, err)
|
||||
|
||||
w := NewSyslogWatcher(types.WatcherConfig{
|
||||
w := NewSyslogWatcherOrDie(types.WatcherConfig{
|
||||
Plugin: "syslog",
|
||||
LogPath: f.Name(),
|
||||
Lookback: test.lookback,
|
||||
})
|
||||
// Set the uptime.
|
||||
w.(*syslogWatcher).uptime = test.uptime
|
||||
// Set the fake clock.
|
||||
w.(*syslogWatcher).clock = fakeClock
|
||||
logCh, err := w.Watch()
|
||||
|
||||
Reference in New Issue
Block a user