Fix kernel monitor issues:

* Change `unregister_netdevice` to be an event to fix #47.
* Change `KernelPanic` to `KernelOops` because we can't handle kernel
panic currently.
* Use system boot time instead of "StartPattern" to fix #48.
This commit is contained in:
Random-Liu
2017-01-20 18:09:01 -08:00
parent b66c4df364
commit d281cb8a15
7 changed files with 54 additions and 37 deletions

View File

@@ -1,8 +1,7 @@
{
"plugin": "journald",
"logPath": "/var/log/journal",
"lookback": "10m",
"startPattern": "Initializing cgroup subsys cpuset",
"lookback": "5m",
"bufferSize": 10,
"source": "kernel-monitor",
"conditions": [
@@ -25,12 +24,17 @@
},
{
"type": "temporary",
"reason": "KernelPanic",
"reason": "UnregisterNetDevice",
"pattern": "unregister_netdevice: waiting for \\w+ to become free. Usage count = \\d+"
},
{
"type": "temporary",
"reason": "KernelOops",
"pattern": "BUG: unable to handle kernel NULL pointer dereference at .*"
},
{
"type": "temporary",
"reason": "KernelPanic",
"reason": "KernelOops",
"pattern": "divide error: 0000 \\[#\\d+\\] SMP"
},
{
@@ -44,12 +48,6 @@
"condition": "KernelDeadlock",
"reason": "DockerHung",
"pattern": "task docker:\\w+ blocked for more than \\w+ seconds\\."
},
{
"type": "permanent",
"condition": "KernelDeadlock",
"reason": "UnregisterNetDeviceIssue",
"pattern": "unregister_netdevice: waiting for \\w+ to become free. Usage count = \\d+"
}
]
}

View File

@@ -34,8 +34,6 @@ type MonitorConfig struct {
DefaultConditions []types.Condition `json:"conditions"`
// Rules are the rules kernel monitor will follow to parse the log file.
Rules []kerntypes.Rule `json:"rules"`
// StartPattern is the pattern of the start line
StartPattern string `json:"startPattern, omitempty"`
}
// applyDefaultConfiguration applies default configurations.

View File

@@ -70,10 +70,7 @@ func NewKernelMonitorOrDie(configPath string) KernelMonitor {
glog.Fatalf("Failed to validate matching rules %#v: %v", k.config.Rules, err)
}
glog.Infof("Finish parsing log file: %+v", k.config)
k.watcher, err = logwatchers.GetLogWatcher(k.config.WatcherConfig)
if err != nil {
glog.Fatalf("Failed to create log watcher with watcher config %#v: %v", k.config.WatcherConfig, err)
}
k.watcher = logwatchers.GetLogWatcherOrDie(k.config.WatcherConfig)
k.buffer = NewLogBuffer(k.config.BufferSize)
// A 1000 size channel should be big enough.
k.output = make(chan *types.Status, 1000)
@@ -117,12 +114,6 @@ func (k *kernelMonitor) parseLog(log *kerntypes.KernelLog) {
// Once there is new log, kernel monitor will push it into the log buffer and try
// to match each rule. If any rule is matched, kernel monitor will report a status.
k.buffer.Push(log)
if matched := k.buffer.Match(k.config.StartPattern); len(matched) != 0 {
// Reset the condition if a start log shows up.
glog.Infof("Found start log %q, re-initialize the status", generateMessage(matched))
k.initializeStatus()
return
}
for _, rule := range k.config.Rules {
matched := k.buffer.Match(rule.Pattern)
if len(matched) == 0 {

View File

@@ -17,8 +17,6 @@ limitations under the License.
package logwatchers
import (
"fmt"
"k8s.io/node-problem-detector/pkg/kernelmonitor/logwatchers/types"
"github.com/golang/glog"
@@ -32,12 +30,13 @@ func registerLogWatcher(name string, create types.WatcherCreateFunc) {
createFuncs[name] = create
}
// GetLogWatcher get a log watcher based on the passed in configuration.
func GetLogWatcher(config types.WatcherConfig) (types.LogWatcher, error) {
// GetLogWatcherOrDie get a log watcher based on the passed in configuration.
// The function panics when encounts an error.
func GetLogWatcherOrDie(config types.WatcherConfig) types.LogWatcher {
create, ok := createFuncs[config.Plugin]
if !ok {
return nil, fmt.Errorf("no create function found for plugin %q", config.Plugin)
glog.Fatalf("No create function found for plugin %q", config.Plugin)
}
glog.Infof("Use log watcher of plugin %q", config.Plugin)
return create(config), nil
return create(config)
}

View File

@@ -24,5 +24,5 @@ const syslogPluginName = "syslog"
func init() {
// Register the syslog plugin.
registerLogWatcher(syslogPluginName, syslog.NewSyslogWatcher)
registerLogWatcher(syslogPluginName, syslog.NewSyslogWatcherOrDie)
}

View File

@@ -20,6 +20,7 @@ import (
"bufio"
"bytes"
"io"
"syscall"
"time"
utilclock "code.cloudfoundry.org/clock"
@@ -35,15 +36,22 @@ type syslogWatcher struct {
reader *bufio.Reader
closer io.Closer
logCh chan *kerntypes.KernelLog
uptime time.Time
tomb *util.Tomb
clock utilclock.Clock
}
// NewSyslogWatcher creates a new kernel log watcher.
func NewSyslogWatcher(cfg types.WatcherConfig) types.LogWatcher {
// NewSyslogWatcherOrDie creates a new kernel log watcher. The function panics
// when encounters an error.
func NewSyslogWatcherOrDie(cfg types.WatcherConfig) types.LogWatcher {
var info syscall.Sysinfo_t
if err := syscall.Sysinfo(&info); err != nil {
glog.Fatalf("Failed to get system info: %v", err)
}
return &syslogWatcher{
cfg: cfg,
tomb: util.NewTomb(),
cfg: cfg,
uptime: time.Now().Add(time.Duration(-info.Uptime * int64(time.Second))),
tomb: util.NewTomb(),
// A capacity 1000 buffer should be enough
logCh: make(chan *kerntypes.KernelLog, 1000),
clock: utilclock.NewClock(),
@@ -51,7 +59,7 @@ func NewSyslogWatcher(cfg types.WatcherConfig) types.LogWatcher {
}
// Make sure NewSyslogWathcer is types.WatcherCreateFunc.
var _ types.WatcherCreateFunc = NewSyslogWatcher
var _ types.WatcherCreateFunc = NewSyslogWatcherOrDie
// Watch starts the syslog watcher.
func (s *syslogWatcher) Watch() (<-chan *kerntypes.KernelLog, error) {
@@ -113,8 +121,8 @@ func (s *syslogWatcher) watchLoop() {
glog.Warningf("Unable to parse line: %q, %v", line, err)
continue
}
// If the log is older than look back duration, discard it.
if s.clock.Since(log.Timestamp) > lookback {
// If the log is older than look back duration or system boot time, discard it.
if s.clock.Since(log.Timestamp) > lookback || log.Timestamp.Before(s.uptime) {
continue
}
s.logCh <- log

View File

@@ -36,6 +36,7 @@ func TestWatch(t *testing.T) {
testCases := []struct {
log string
logs []kerntypes.KernelLog
uptime time.Time
lookback string
}{
{
@@ -96,6 +97,26 @@ func TestWatch(t *testing.T) {
},
},
},
{
// The start point is at the end of the log file, we look back, but
// system rebooted at in the middle of the log file.
log: `Jan 2 03:04:03 kernel: [0.000000] 1
Jan 2 03:04:04 kernel: [1.000000] 2
Jan 2 03:04:05 kernel: [2.000000] 3
`,
uptime: time.Date(time.Now().Year(), time.January, 2, 3, 4, 4, 0, time.Local),
lookback: "2s",
logs: []kerntypes.KernelLog{
{
Timestamp: now.Add(-time.Second),
Message: "2",
},
{
Timestamp: now,
Message: "3",
},
},
},
}
for c, test := range testCases {
t.Logf("TestCase #%d: %#v", c+1, test)
@@ -108,11 +129,13 @@ func TestWatch(t *testing.T) {
_, err = f.Write([]byte(test.log))
assert.NoError(t, err)
w := NewSyslogWatcher(types.WatcherConfig{
w := NewSyslogWatcherOrDie(types.WatcherConfig{
Plugin: "syslog",
LogPath: f.Name(),
Lookback: test.lookback,
})
// Set the uptime.
w.(*syslogWatcher).uptime = test.uptime
// Set the fake clock.
w.(*syslogWatcher).clock = fakeClock
logCh, err := w.Watch()