From 27cc83140882b487f48daee467b37f4ca547c417 Mon Sep 17 00:00:00 2001 From: Random-Liu Date: Fri, 3 Feb 2017 03:06:44 -0800 Subject: [PATCH] Add arbitrary daemon log support --- config/docker-monitor-filelog.json | 14 ++ config/docker-monitor.json | 12 ++ config/kernel-monitor-filelog.json | 58 ++++++++ config/kernel-monitor.json | 3 + .../logwatchers/journald/log_watcher.go | 25 ++-- .../logwatchers/syslog/helpers.go | 128 +++++++++--------- .../logwatchers/syslog/helpers_test.go | 50 +++++-- .../logwatchers/syslog/log_watcher.go | 51 +++++-- .../logwatchers/syslog/log_watcher_test.go | 33 +++-- .../logwatchers/types/log_watcher.go | 3 + 10 files changed, 272 insertions(+), 105 deletions(-) create mode 100644 config/docker-monitor-filelog.json create mode 100644 config/docker-monitor.json create mode 100644 config/kernel-monitor-filelog.json diff --git a/config/docker-monitor-filelog.json b/config/docker-monitor-filelog.json new file mode 100644 index 00000000..d4ee0c62 --- /dev/null +++ b/config/docker-monitor-filelog.json @@ -0,0 +1,14 @@ +{ + "plugin": "syslog", + "pluginConfig": { + "timestamp": "^time=\"(\\S*)\"", + "message": "msg=\"([^\n]*)\"", + "timestampFormat": "2006-01-02T15:04:05.999999999-07:00" + }, + "logPath": "/var/log/docker.log", + "lookback": "5m", + "bufferSize": 10, + "source": "docker-monitor", + "conditions": [], + "rules": [] +} diff --git a/config/docker-monitor.json b/config/docker-monitor.json new file mode 100644 index 00000000..226e555c --- /dev/null +++ b/config/docker-monitor.json @@ -0,0 +1,12 @@ +{ + "plugin": "journald", + "pluginConfig": { + "source": "docker" + }, + "logPath": "/var/log/journal", + "lookback": "5m", + "bufferSize": 10, + "source": "docker-monitor", + "conditions": [], + "rules": [] +} diff --git a/config/kernel-monitor-filelog.json b/config/kernel-monitor-filelog.json new file mode 100644 index 00000000..e95b5ad1 --- /dev/null +++ b/config/kernel-monitor-filelog.json @@ -0,0 +1,58 @@ +{ + "plugin": "syslog", + "pluginConfig": { + "timestamp": "^.{15}", + "message": "kernel: \\[.*\\] (.*)", + "timestampFormat": "Jan _2 15:04:05" + }, + "logPath": "/var/log/kern.log", + "lookback": "5m", + "bufferSize": 10, + "source": "kernel-monitor", + "conditions": [ + { + "type": "KernelDeadlock", + "reason": "KernelHasNoDeadlock", + "message": "kernel has no deadlock" + } + ], + "rules": [ + { + "type": "temporary", + "reason": "OOMKilling", + "pattern": "Kill process \\d+ (.+) score \\d+ or sacrifice child\\nKilled process \\d+ (.+) total-vm:\\d+kB, anon-rss:\\d+kB, file-rss:\\d+kB" + }, + { + "type": "temporary", + "reason": "TaskHung", + "pattern": "task \\S+:\\w+ blocked for more than \\w+ seconds\\." + }, + { + "type": "temporary", + "reason": "UnregisterNetDevice", + "pattern": "unregister_netdevice: waiting for \\w+ to become free. Usage count = \\d+" + }, + { + "type": "temporary", + "reason": "KernelOops", + "pattern": "BUG: unable to handle kernel NULL pointer dereference at .*" + }, + { + "type": "temporary", + "reason": "KernelOops", + "pattern": "divide error: 0000 \\[#\\d+\\] SMP" + }, + { + "type": "permanent", + "condition": "KernelDeadlock", + "reason": "AUFSUmountHung", + "pattern": "task umount\\.aufs:\\w+ blocked for more than \\w+ seconds\\." + }, + { + "type": "permanent", + "condition": "KernelDeadlock", + "reason": "DockerHung", + "pattern": "task docker:\\w+ blocked for more than \\w+ seconds\\." + } + ] +} diff --git a/config/kernel-monitor.json b/config/kernel-monitor.json index 4b6e656d..dcd5113e 100644 --- a/config/kernel-monitor.json +++ b/config/kernel-monitor.json @@ -1,5 +1,8 @@ { "plugin": "journald", + "pluginConfig": { + "source": "kernel" + }, "logPath": "/var/log/journal", "lookback": "5m", "bufferSize": 10, diff --git a/pkg/kernelmonitor/logwatchers/journald/log_watcher.go b/pkg/kernelmonitor/logwatchers/journald/log_watcher.go index 7aecd5ac..aae33a71 100644 --- a/pkg/kernelmonitor/logwatchers/journald/log_watcher.go +++ b/pkg/kernelmonitor/logwatchers/journald/log_watcher.go @@ -114,8 +114,13 @@ func (j *journaldWatcher) watchLoop() { } } -// defaultJournalLogPath is the default path of journal log. -const defaultJournalLogPath = "/var/log/journal" +const ( + // defaultJournalLogPath is the default path of journal log. + defaultJournalLogPath = "/var/log/journal" + + // configSourceKey is the key of source configuration in the plugin configuration. + configSourceKey = "source" +) // getJournal returns a journal client. func getJournal(cfg types.WatcherConfig) (*sdjournal.Journal, error) { @@ -140,14 +145,18 @@ func getJournal(cfg types.WatcherConfig) (*sdjournal.Journal, error) { if err != nil { return nil, fmt.Errorf("failed to lookback %q: %v", since, err) } - // TODO(random-liu): Make this configurable to support parsing other logs. - kernelMatch := sdjournal.Match{ - Field: sdjournal.SD_JOURNAL_FIELD_TRANSPORT, - Value: "kernel", + // Empty source is not allowed and treated as an error. + source := cfg.PluginConfig[configSourceKey] + if source == "" { + return nil, fmt.Errorf("failed to filter journal log, empty source is not allowed") } - err = journal.AddMatch(kernelMatch.String()) + match := sdjournal.Match{ + Field: sdjournal.SD_JOURNAL_FIELD_SYSLOG_IDENTIFIER, + Value: source, + } + err = journal.AddMatch(match.String()) if err != nil { - return nil, fmt.Errorf("failed to add log filter %#v: %v", kernelMatch, err) + return nil, fmt.Errorf("failed to add log filter %#v: %v", match, err) } return journal, nil } diff --git a/pkg/kernelmonitor/logwatchers/syslog/helpers.go b/pkg/kernelmonitor/logwatchers/syslog/helpers.go index 2412c9bb..3f2ed9c3 100644 --- a/pkg/kernelmonitor/logwatchers/syslog/helpers.go +++ b/pkg/kernelmonitor/logwatchers/syslog/helpers.go @@ -17,86 +17,88 @@ package syslog import ( "fmt" - "io" - "os" - "strings" + "regexp" "time" kerntypes "k8s.io/node-problem-detector/pkg/kernelmonitor/types" - "github.com/google/cadvisor/utils/tail" + "github.com/golang/glog" ) -// translate translates the log line into internal type. -func translate(line string) (*kerntypes.KernelLog, error) { - timestamp, message, err := parseLine(line) - if err != nil { - return nil, err +// translator translates log line into internal log type based on user defined +// regular expression. +type translator struct { + timestampRegexp *regexp.Regexp + messageRegexp *regexp.Regexp + timestampFormat string +} + +const ( + // NOTE that we support submatch for both timestamp and message regular expressions. When + // there are multiple matches returned by submatch, only **the last** is used. + // timestampKey is the key of timestamp regular expression in the plugin configuration. + timestampKey = "timestamp" + // messageKey is the key of message regular expression in the plugin configuration. + messageKey = "message" + // timestampFormatKey is the key of timestamp format string in the plugin configuration. + timestampFormatKey = "timestampFormat" +) + +func newTranslatorOrDie(pluginConfig map[string]string) *translator { + if err := validatePluginConfig(pluginConfig); err != nil { + glog.Errorf("Failed to validate plugin configuration %+v: %v", pluginConfig, err) } + return &translator{ + timestampRegexp: regexp.MustCompile(pluginConfig[timestampKey]), + messageRegexp: regexp.MustCompile(pluginConfig[messageKey]), + timestampFormat: pluginConfig[timestampFormatKey], + } +} + +// translate translates the log line into internal type. +func (t *translator) translate(line string) (*kerntypes.KernelLog, error) { + // Parse timestamp. + matches := t.timestampRegexp.FindStringSubmatch(line) + if len(matches) == 0 { + return nil, fmt.Errorf("no timestamp found in line %q with regular expression %v", line, t.timestampRegexp) + } + timestamp, err := time.ParseInLocation(t.timestampFormat, matches[len(matches)-1], time.Local) + if err != nil { + return nil, fmt.Errorf("failed to parse timestamp %q: %v", matches[len(matches)-1], err) + } + // Formalize the timestmap. + timestamp = formalizeTimestamp(timestamp) + // Parse message. + matches = t.messageRegexp.FindStringSubmatch(line) + if len(matches) == 0 { + return nil, fmt.Errorf("no message found in line %q with regular expression %v", line, t.messageRegexp) + } + message := matches[len(matches)-1] return &kerntypes.KernelLog{ Timestamp: timestamp, Message: message, }, nil } -const ( - // timestampLen is the length of timestamp in syslog logging format. - timestampLen = 15 - // messagePrefix is the character before real message. - messagePrefix = "]" -) - -// parseLine parses one log line into timestamp and message. -func parseLine(line string) (time.Time, string, error) { - // Trim the spaces to make sure timestamp could be found - line = strings.TrimSpace(line) - if len(line) < timestampLen { - return time.Time{}, "", fmt.Errorf("the line is too short: %q", line) +// validatePluginConfig validates whether the plugin configuration. +func validatePluginConfig(cfg map[string]string) error { + if cfg[timestampKey] == "" { + return fmt.Errorf("unexpected empty timestamp regular expression") } - // Example line: Jan 1 00:00:00 hostname kernel: [0.000000] component: log message - now := time.Now() - // There is no time zone information in kernel log timestamp, apply the current time - // zone. - timestamp, err := time.ParseInLocation(time.Stamp, line[:timestampLen], time.Local) - if err != nil { - return time.Time{}, "", fmt.Errorf("error parsing timestamp in line %q: %v", line, err) + if cfg[messageKey] == "" { + return fmt.Errorf("unexpected empty message regular expression") } - // There is no year information in kernel log timestamp, apply the current year. - // This could go wrong during looking back phase after kernel monitor is started, - // and the old logs are generated in old year. - timestamp = timestamp.AddDate(now.Year(), 0, 0) - - loc := strings.Index(line, messagePrefix) - if loc == -1 { - return timestamp, "", fmt.Errorf("can't find message prefix %q in line %q", messagePrefix, line) + if cfg[timestampFormatKey] == "" { + return fmt.Errorf("unexpected empty timestamp format string") } - message := strings.Trim(line[loc+1:], " ") - - return timestamp, message, nil + return nil } -// defaultKernelLogPath the default path of syslog kernel log. -const defaultKernelLogPath = "/var/log/kern.log" - -// getLogReader returns log reader for syslog log. Note that getLogReader doesn't look back -// to the rolled out logs. -func getLogReader(path string) (io.ReadCloser, error) { - if path == "" { - path = defaultKernelLogPath +// formalizeTimestamp formalizes the timestamp. We need this because some log doesn't contain full +// timestamp, e.g. syslog. +func formalizeTimestamp(t time.Time) time.Time { + if t.Year() == 0 { + t = t.AddDate(time.Now().Year(), 0, 0) } - // To handle log rotation, tail will not report error immediately if - // the file doesn't exist. So we check file existence frist. - // This could go wrong during mid-rotation. It should recover after - // several restart when the log file is created again. The chance - // is slim but we should still fix this in the future. - // TODO(random-liu): Handle log missing during rotation. - _, err := os.Stat(path) - if err != nil { - return nil, fmt.Errorf("failed to stat the file %q: %v", path, err) - } - tail, err := tail.NewTail(path) - if err != nil { - return nil, fmt.Errorf("failed to tail the file %q: %v", path, err) - } - return tail, nil + return t } diff --git a/pkg/kernelmonitor/logwatchers/syslog/helpers_test.go b/pkg/kernelmonitor/logwatchers/syslog/helpers_test.go index c738ed9f..8e128c19 100644 --- a/pkg/kernelmonitor/logwatchers/syslog/helpers_test.go +++ b/pkg/kernelmonitor/logwatchers/syslog/helpers_test.go @@ -21,6 +21,7 @@ import ( "time" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" kerntypes "k8s.io/node-problem-detector/pkg/kernelmonitor/types" ) @@ -28,12 +29,18 @@ import ( func TestTranslate(t *testing.T) { year := time.Now().Year() testCases := []struct { - input string - err bool - log *kerntypes.KernelLog + config map[string]string + input string + err bool + log *kerntypes.KernelLog }{ { - input: "May 1 12:23:45 hostname kernel: [0.000000] component: log message", + // missing year and timezone + // "timestamp": "^.{15}", + // "message": "kernel \\[.*\\] (.*)", + // "timestampFormat": "Jan _2 15:04:05", + config: getTestPluginConfig(), + input: "May 1 12:23:45 hostname kernel: [0.000000] component: log message", log: &kerntypes.KernelLog{ Timestamp: time.Date(year, time.May, 1, 12, 23, 45, 0, time.Local), Message: "component: log message", @@ -41,7 +48,8 @@ func TestTranslate(t *testing.T) { }, { // no log message - input: "May 21 12:23:45 hostname kernel: [9.999999]", + config: getTestPluginConfig(), + input: "May 21 12:23:45 hostname kernel: [9.999999] ", log: &kerntypes.KernelLog{ Timestamp: time.Date(year, time.May, 21, 12, 23, 45, 0, time.Local), Message: "", @@ -49,18 +57,36 @@ func TestTranslate(t *testing.T) { }, { // the right square bracket is missing - input: "May 21 12:23:45 hostname kernel: [9.999999 component: log message", - err: true, + config: getTestPluginConfig(), + input: "May 21 12:23:45 hostname kernel: [9.999999 component: log message", + err: true, + }, + { + // contains full timestamp + config: map[string]string{ + "timestamp": "^time=\"(\\S*)\"", + "message": "msg=\"([^\n]*)\"", + "timestampFormat": "2006-01-02T15:04:05.999999999-07:00", + }, + input: `time="2017-02-01T17:58:34.999999999-08:00" level=error msg="test log line1\n test log line2"`, + log: &kerntypes.KernelLog{ + Timestamp: time.Date(2017, 2, 1, 17, 58, 34, 999999999, time.FixedZone("PST", -8*3600)), + Message: `test log line1\n test log line2`, + }, }, } for c, test := range testCases { t.Logf("TestCase #%d: %#v", c+1, test) - log, err := translate(test.input) - if (err != nil) != test.err { - t.Errorf("case %d: error assertion failed, got log: %+v, error: %v", c+1, log, err) - continue + trans := newTranslatorOrDie(test.config) + log, err := trans.translate(test.input) + if !test.err { + require.NoError(t, err) + // Use RFC3339Nano to make it easier for comparison. + assert.Equal(t, test.log.Timestamp.Format(time.RFC3339Nano), log.Timestamp.Format(time.RFC3339Nano)) + assert.Equal(t, test.log.Message, log.Message) + } else { + require.Error(t, err) } - assert.Equal(t, test.log, log) } } diff --git a/pkg/kernelmonitor/logwatchers/syslog/log_watcher.go b/pkg/kernelmonitor/logwatchers/syslog/log_watcher.go index 84545790..6b6df7c8 100644 --- a/pkg/kernelmonitor/logwatchers/syslog/log_watcher.go +++ b/pkg/kernelmonitor/logwatchers/syslog/log_watcher.go @@ -19,12 +19,16 @@ package syslog import ( "bufio" "bytes" + "fmt" "io" + "os" + "strings" "syscall" "time" utilclock "code.cloudfoundry.org/clock" "github.com/golang/glog" + "github.com/google/cadvisor/utils/tail" "k8s.io/node-problem-detector/pkg/kernelmonitor/logwatchers/types" kerntypes "k8s.io/node-problem-detector/pkg/kernelmonitor/types" @@ -32,13 +36,14 @@ import ( ) type syslogWatcher struct { - cfg types.WatcherConfig - reader *bufio.Reader - closer io.Closer - logCh chan *kerntypes.KernelLog - uptime time.Time - tomb *util.Tomb - clock utilclock.Clock + cfg types.WatcherConfig + reader *bufio.Reader + closer io.Closer + translator *translator + logCh chan *kerntypes.KernelLog + uptime time.Time + tomb *util.Tomb + clock utilclock.Clock } // NewSyslogWatcherOrDie creates a new kernel log watcher. The function panics @@ -49,9 +54,10 @@ func NewSyslogWatcherOrDie(cfg types.WatcherConfig) types.LogWatcher { glog.Fatalf("Failed to get system info: %v", err) } return &syslogWatcher{ - cfg: cfg, - uptime: time.Now().Add(time.Duration(-info.Uptime * int64(time.Second))), - tomb: util.NewTomb(), + cfg: cfg, + translator: newTranslatorOrDie(cfg.PluginConfig), + uptime: time.Now().Add(time.Duration(-info.Uptime * int64(time.Second))), + tomb: util.NewTomb(), // A capacity 1000 buffer should be enough logCh: make(chan *kerntypes.KernelLog, 1000), clock: utilclock.NewClock(), @@ -116,7 +122,7 @@ func (s *syslogWatcher) watchLoop() { } line = buffer.String() buffer.Reset() - log, err := translate(line) + log, err := s.translator.translate(strings.TrimSuffix(line, "\n")) if err != nil { glog.Warningf("Unable to parse line: %q, %v", line, err) continue @@ -128,3 +134,26 @@ func (s *syslogWatcher) watchLoop() { s.logCh <- log } } + +// getLogReader returns log reader for syslog log. Note that getLogReader doesn't look back +// to the rolled out logs. +func getLogReader(path string) (io.ReadCloser, error) { + if path == "" { + return nil, fmt.Errorf("unexpected empty log path") + } + // To handle log rotation, tail will not report error immediately if + // the file doesn't exist. So we check file existence first. + // This could go wrong during mid-rotation. It should recover after + // several restart when the log file is created again. The chance + // is slim but we should still fix this in the future. + // TODO(random-liu): Handle log missing during rotation. + _, err := os.Stat(path) + if err != nil { + return nil, fmt.Errorf("failed to stat the file %q: %v", path, err) + } + tail, err := tail.NewTail(path) + if err != nil { + return nil, fmt.Errorf("failed to tail the file %q: %v", path, err) + } + return tail, nil +} diff --git a/pkg/kernelmonitor/logwatchers/syslog/log_watcher_test.go b/pkg/kernelmonitor/logwatchers/syslog/log_watcher_test.go index 2886a021..5f1f376a 100644 --- a/pkg/kernelmonitor/logwatchers/syslog/log_watcher_test.go +++ b/pkg/kernelmonitor/logwatchers/syslog/log_watcher_test.go @@ -29,6 +29,16 @@ import ( "github.com/stretchr/testify/assert" ) +// getTestPluginConfig returns a plugin config for test. Use configuration for +// kernel log in test. +func getTestPluginConfig() map[string]string { + return map[string]string{ + "timestamp": "^.{15}", + "message": "kernel: \\[.*\\] (.*)", + "timestampFormat": "Jan _2 15:04:05", + } +} + func TestWatch(t *testing.T) { // now is a fake time now := time.Date(time.Now().Year(), time.January, 2, 3, 4, 5, 0, time.Local) @@ -42,8 +52,8 @@ func TestWatch(t *testing.T) { { // The start point is at the head of the log file. log: `Jan 2 03:04:05 kernel: [0.000000] 1 - Jan 2 03:04:06 kernel: [1.000000] 2 - Jan 2 03:04:07 kernel: [2.000000] 3 +Jan 2 03:04:06 kernel: [1.000000] 2 +Jan 2 03:04:07 kernel: [2.000000] 3 `, lookback: "0", logs: []kerntypes.KernelLog{ @@ -64,8 +74,8 @@ func TestWatch(t *testing.T) { { // The start point is in the middle of the log file. log: `Jan 2 03:04:04 kernel: [0.000000] 1 - Jan 2 03:04:05 kernel: [1.000000] 2 - Jan 2 03:04:06 kernel: [2.000000] 3 +Jan 2 03:04:05 kernel: [1.000000] 2 +Jan 2 03:04:06 kernel: [2.000000] 3 `, lookback: "0", logs: []kerntypes.KernelLog{ @@ -82,8 +92,8 @@ func TestWatch(t *testing.T) { { // The start point is at the end of the log file, but we look back. log: `Jan 2 03:04:03 kernel: [0.000000] 1 - Jan 2 03:04:04 kernel: [1.000000] 2 - Jan 2 03:04:05 kernel: [2.000000] 3 +Jan 2 03:04:04 kernel: [1.000000] 2 +Jan 2 03:04:05 kernel: [2.000000] 3 `, lookback: "1s", logs: []kerntypes.KernelLog{ @@ -101,8 +111,8 @@ func TestWatch(t *testing.T) { // The start point is at the end of the log file, we look back, but // system rebooted at in the middle of the log file. log: `Jan 2 03:04:03 kernel: [0.000000] 1 - Jan 2 03:04:04 kernel: [1.000000] 2 - Jan 2 03:04:05 kernel: [2.000000] 3 +Jan 2 03:04:04 kernel: [1.000000] 2 +Jan 2 03:04:05 kernel: [2.000000] 3 `, uptime: time.Date(time.Now().Year(), time.January, 2, 3, 4, 4, 0, time.Local), lookback: "2s", @@ -130,9 +140,10 @@ func TestWatch(t *testing.T) { assert.NoError(t, err) w := NewSyslogWatcherOrDie(types.WatcherConfig{ - Plugin: "syslog", - LogPath: f.Name(), - Lookback: test.lookback, + Plugin: "syslog", + PluginConfig: getTestPluginConfig(), + LogPath: f.Name(), + Lookback: test.lookback, }) // Set the uptime. w.(*syslogWatcher).uptime = test.uptime diff --git a/pkg/kernelmonitor/logwatchers/types/log_watcher.go b/pkg/kernelmonitor/logwatchers/types/log_watcher.go index c916bbfe..c079c018 100644 --- a/pkg/kernelmonitor/logwatchers/types/log_watcher.go +++ b/pkg/kernelmonitor/logwatchers/types/log_watcher.go @@ -33,6 +33,9 @@ type WatcherConfig struct { // Plugin is the name of plugin which is currently used. // Currently supported: syslog, journald. Plugin string `json:"plugin, omitempty"` + // PluginConfig is a key/value configuration of a plugin. Valid configurations + // are defined in different log watcher plugin. + PluginConfig map[string]string `json:"pluginConfig, omitempty"` // LogPath is the path to the log LogPath string `json:"logPath, omitempty"` // Lookback is the time kernel watcher looks up