From 532f933bd81e31f1df16065867ecd07c747c07c0 Mon Sep 17 00:00:00 2001 From: Lantao Liu Date: Thu, 23 Jun 2016 22:54:47 -0700 Subject: [PATCH] This PR: 1) Add lookback support in kernel monitor. After started, Kernel monitor will check some old logs to detect problems which happened before last node reboot. 2) Add `lookback` and `startPattern` in kernel monitor configuration. * `lookback` specifies how long time kernel monitor should look back. * `startPattern` specifies which log indicates the node is started. kernel monitor will clear all current node conditions once it finds a node start log. This makes sure that old problems won't change the node condition. 3) Add support for kernel panic monitoring, the null pointer and divide 0 kernel panic will be surfaced as event. Usually kernel monitor will report these events during looking back phase. --- config/kernel-monitor.json | 12 +++ demo/demo | 4 +- demo/divide_zero | 36 +++++++++ demo/null_pointer | 36 +++++++++ node_problem_detector.go | 2 +- pkg/kernelmonitor/kernel_log_watcher.go | 49 +++++++++---- pkg/kernelmonitor/kernel_log_watcher_test.go | 57 +++++++++++---- pkg/kernelmonitor/kernel_monitor.go | 73 +++++++++++-------- pkg/kernelmonitor/kernel_monitor_test.go | 10 +-- pkg/kernelmonitor/translator/translator.go | 55 +++++++------- .../translator/translator_test.go | 23 +++--- pkg/kernelmonitor/types/types.go | 6 +- 12 files changed, 249 insertions(+), 114 deletions(-) create mode 100644 demo/divide_zero create mode 100644 demo/null_pointer diff --git a/config/kernel-monitor.json b/config/kernel-monitor.json index c038c4b3..ae88ee2c 100644 --- a/config/kernel-monitor.json +++ b/config/kernel-monitor.json @@ -1,5 +1,7 @@ { "logPath": "/log/kern.log", + "lookback": "10m", + "startPattern": "Initializing cgroup subsys cpuset", "bufferSize": 10, "source": "kernel-monitor", "conditions": [ @@ -20,6 +22,16 @@ "reason": "TaskHung", "pattern": "task \\S+:\\w+ blocked for more than \\w+ seconds\\." }, + { + "type": "temporary", + "reason": "KernelPanic", + "pattern": "BUG: unable to handle kernel NULL pointer dereference at .*" + }, + { + "type": "temporary", + "reason": "KernelPanic", + "pattern": "divide error: 0000 \\[#\\d+\\] SMP" + }, { "type": "permanent", "condition": "KernelDeadlock", diff --git a/demo/demo b/demo/demo index 18c29c59..21d773e2 100755 --- a/demo/demo +++ b/demo/demo @@ -31,8 +31,8 @@ rebootAll() { reboot() { LATEST=`gcloud compute ssh -n root@$1 --zone=$ZONE "tail -1 $KERNLOG"` - PREFIX=`echo $LATEST | cut -d "[" -f 1 -`"[0.000000]" - runCmd "gcloud compute ssh -n root@$1 --zone=$ZONE \"echo '$PREFIX reboot' >> $KERNLOG\"" + PREFIX=`echo $LATEST | cut -d "]" -f 1 -`"]" + runCmd "gcloud compute ssh -n root@$1 --zone=$ZONE \"echo '$PREFIX Initializing cgroup subsys cpuset' >> $KERNLOG\"" } case $1 in diff --git a/demo/divide_zero b/demo/divide_zero new file mode 100644 index 00000000..02a9e8d1 --- /dev/null +++ b/demo/divide_zero @@ -0,0 +1,36 @@ +divide error: 0000 [#1] SMP +Modules linked in: dm_thin_pool dm_persistent_data dm_bio_prison dm_bufio libcrc32c xt_statistic xt_nat xt_mark ipt_REJECT xt_tcpudp xt_comment loop veth binfmt_misc sch_htb ipt_MASQUERADE iptable_nat nf_conntrack_ipv4 nf_defrag_ipv4 nf_nat_ipv4 xt_addrtype iptable_filter ip_tables x_tables nf_nat nf_conntrack bridge stp llc aufs(C) nfsd auth_rpcgss oid_registry nfs_acl nfs lockd fscache sunrpc crc32_pclmul ppdev aesni_intel aes_x86_64 lrw gf128mul glue_helper ablk_helper cryptd evdev psmouse serio_raw parport_pc ttm parport drm_kms_helper drm i2c_piix4 i2c_core processor button thermal_sys autofs4 ext4 crc16 mbcache jbd2 btrfs xor raid6_pq dm_mod ata_generic crct10dif_pclmul crct10dif_common xen_netfront xen_blkfront crc32c_intel ata_piix libata scsi_mod floppy +CPU: 1 PID: 1853 Comm: docker Tainted: G C 3.16.0-4-amd64 #1 Debian 3.16.7-ckt20-1+deb8u4 +Hardware name: Xen HVM domU, BIOS 4.2.amazon 05/12/2016 +task: ffff8801e3657470 ti: ffff8801e47a8000 task.ti: ffff8801e47a8000 +RIP: 0010:[] [] pool_io_hints+0xf0/0x1a0 [dm_thin_pool] +RSP: 0018:ffff8801e47abbc8 EFLAGS: 00010246 +RAX: 0000000000010000 RBX: ffff8801e4736840 RCX: ffff8801c2662000 +RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff8801e48c4080 +RBP: ffff8801e47abc10 R08: 0000000000000000 R09: 0000000000000000 +R10: 0000000000000000 R11: 0000000000000246 R12: ffffffffa057f5c8 +R13: 0000000000000001 R14: ffff8801e47abc90 R15: 0000000000000131 +FS: 00007ff465daf700(0000) GS:ffff8801efc20000(0000) knlGS:0000000000000000 +CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 +CR2: 000000c207f1c3fb CR3: 00000001e2a5a000 CR4: 00000000001406e0 +Stack: + ffffffff810a7c71 0000000043e06d70 ffffc9000115f040 0000000000000000 + 0000000043e06d70 ffffc9000115f040 0000000000000000 ffff8800e9da3800 + ffffffffa00ba615 000fffffffffffff 00000000ffffffff 00000000000000ff +Call Trace: + [] ? complete+0x31/0x40 + [] ? dm_calculate_queue_limits+0x95/0x130 [dm_mod] + [] ? dm_swap_table+0x73/0x320 [dm_mod] + [] ? crc_t10dif_generic+0x101/0x1000 [crct10dif_common] + [] ? table_load+0x330/0x330 [dm_mod] + [] ? dev_suspend+0x95/0x220 [dm_mod] + [] ? ctl_ioctl+0x205/0x430 [dm_mod] + [] ? dm_ctl_ioctl+0xf/0x20 [dm_mod] + [] ? do_vfs_ioctl+0x2cf/0x4b0 + [] ? SyS_futex+0x6e/0x150 + [] ? SyS_ioctl+0x81/0xa0 + [] ? system_call_fast_compare_end+0x10/0x15 +Code: 0f 84 a5 00 00 00 3b 96 10 06 00 00 49 c7 c4 c8 f5 57 a0 77 26 8b b6 18 06 00 00 89 d0 c1 e0 09 48 39 f0 0f 82 92 00 00 00 31 d2 <48> f7 f6 85 d2 74 2d 49 c7 c4 70 f5 57 a0 66 90 48 89 e6 e8 28 +RIP [] pool_io_hints+0xf0/0x1a0 [dm_thin_pool] + RSP +---[ end trace fcce781faebae9ce ]--- diff --git a/demo/null_pointer b/demo/null_pointer new file mode 100644 index 00000000..cc2914f5 --- /dev/null +++ b/demo/null_pointer @@ -0,0 +1,36 @@ +BUG: unable to handle kernel NULL pointer dereference at 0000000000000078 +IP: [] pick_next_task_fair+0x6b8/0x820 +PGD 0 +Oops: 0000 [#1] SMP +Modules linked in: binfmt_misc ipt_REJECT xt_nat xt_tcpudp xt_multiport veth xt_conntrack ipt_MASQUERADE iptable_nat nf_conntrack_ipv4 nf_defrag_ipv4 nf_nat_ipv4 xt_addrtype iptable_filter ip_tables x_tables nf_nat nf_conntrack bridge stp llc aufs(C) cpufreq_conservative cpufreq_powersave cpufreq_stats cpufreq_userspace xfs libcrc32c x86_pkg_temp_thermal intel_powerclamp intel_rapl coretemp kvm_intel kvm crc32_pclmul iTCO_wdt iTCO_vendor_support aesni_intel aes_x86_64 lrw gf128mul ttm glue_helper evdev ablk_helper cryptd drm_kms_helper drm serio_raw shpchp i2c_i801 lpc_ich mfd_core wmi ipmi_msghandler processor tpm_infineon thermal_sys tpm_tis tpm acpi_pad button autofs4 ext4 crc16 mbcache jbd2 btrfs xor raid6_pq dm_mod raid1 md_mod sg sd_mod crc_t10dif crct10dif_generic crct10dif_pclmul crct10dif_common crc32c_intel ehci_pci ahci igb libahci xhci_hcd ehci_hcd i2c_algo_bit i2c_core libata dca ptp usbcore scsi_mod pps_core usb_common +CPU: 11 PID: 63 Comm: ksoftirqd/11 Tainted: G C 3.16.0-4-amd64 #1 Debian 3.16.7-ckt20-1+deb8u3 +Hardware name: ASUSTeK COMPUTER INC. Z10PA-U8 Series/Z10PA-U8 Series, BIOS 0601 06/26/2015 +task: ffff881fe1d0cb60 ti: ffff881fe1d18000 task.ti: ffff881fe1d18000 +RIP: 0010:[] [] pick_next_task_fair+0x6b8/0x820 +RSP: 0018:ffff881fe1d1bde0 EFLAGS: 00010046 +RAX: 0000000000000000 RBX: ffff881fa84a3580 RCX: 0000000000000000 +RDX: 0000000000000001 RSI: ffff881fb5837628 RDI: ffff881d1fab2308 +RBP: ffff881fb5837600 R08: 0000000000000000 R09: 000000000000b9dc +R10: 0000000000000004 R11: 0000000000000005 R12: 0000000000000000 +R13: 0000000000000000 R14: 0000000000000000 R15: ffff88207fd72f00 +FS: 0000000000000000(0000) GS:ffff88207fd60000(0000) knlGS:0000000000000000 +CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 +CR2: 0000000000000078 CR3: 0000000001813000 CR4: 00000000001407e0 +Stack: + ffff881d1fab2290 000000018109ef94 ffff881fe1d0cb60 0000000000012f00 + ffff88207fd72f78 ffffffff8101b975 ffff881fe1d0cfb8 ffff881fe1d0cb60 + ffff88207fd72f00 000000000000000b 0000000000000000 0000000000000000 +Call Trace: + [] ? sched_clock+0x5/0x10 + [] ? __schedule+0x106/0x700 + [] ? smpboot_thread_fn+0xc6/0x190 + [] ? SyS_setgroups+0x170/0x170 + [] ? kthread+0xbd/0xe0 + [] ? kthread_create_on_node+0x180/0x180 + [] ? ret_from_fork+0x58/0x90 + [] ? kthread_create_on_node+0x180/0x180 +Code: 49 8b 7c 24 78 48 39 fd 74 2f 44 8b 73 68 45 8b 6c 24 68 45 39 ee 0f 8e c7 00 00 00 48 89 ef 48 89 de e8 fc 91 ff ff 48 8b 5b 70 <49> 8b 7c 24 78 48 8b 6b 78 48 39 fd 75 d1 48 85 ed 74 cc 4c 89 +RIP [] pick_next_task_fair+0x6b8/0x820 + RSP +CR2: 0000000000000078 +---[ end trace 61f6991dc9ee5be0 ]--- diff --git a/node_problem_detector.go b/node_problem_detector.go index 28e39308..4eabc1e1 100644 --- a/node_problem_detector.go +++ b/node_problem_detector.go @@ -24,7 +24,7 @@ import ( ) var ( - kernelMonitorConfigPath = flag.String("kernel-monitor", "/config/kernel_monitor", "The path to the kernel monitor config file") + kernelMonitorConfigPath = flag.String("kernel-monitor", "/config/kernel_monitor.json", "The path to the kernel monitor config file") ) func main() { diff --git a/pkg/kernelmonitor/kernel_log_watcher.go b/pkg/kernelmonitor/kernel_log_watcher.go index c74f8c8e..9aab7e31 100644 --- a/pkg/kernelmonitor/kernel_log_watcher.go +++ b/pkg/kernelmonitor/kernel_log_watcher.go @@ -18,7 +18,9 @@ package kernelmonitor import ( "bufio" + "fmt" "os" + "time" "k8s.io/node-problem-detector/pkg/kernelmonitor/translator" "k8s.io/node-problem-detector/pkg/kernelmonitor/types" @@ -26,6 +28,7 @@ import ( "github.com/golang/glog" "github.com/hpcloud/tail" + utilclock "github.com/pivotal-golang/clock" ) const ( @@ -34,7 +37,12 @@ const ( // WatcherConfig is the configuration of kernel log watcher. type WatcherConfig struct { + // KernelLogPath is the path to the kernel log KernelLogPath string `json:"logPath, omitempty"` + // StartPattern is the pattern of the start line + StartPattern string `json:"startPattern, omitempty"` + // Lookback is the time kernel watcher looks up + Lookback string `json:"lookback, omitempty"` } // KernelLogWatcher watches and translates the kernel log. Once there is new log line, @@ -53,6 +61,7 @@ type kernelLogWatcher struct { tl *tail.Tail logCh chan *types.KernelLog tomb *util.Tomb + clock utilclock.Clock } // NewKernelLogWatcher creates a new kernel log watcher. @@ -63,6 +72,7 @@ func NewKernelLogWatcher(cfg WatcherConfig) KernelLogWatcher { tomb: util.NewTomb(), // A capacity 1000 buffer should be enough logCh: make(chan *types.KernelLog, 1000), + clock: utilclock.NewClock(), } } @@ -144,39 +154,46 @@ func (k *kernelLogWatcher) watchLoop() { } } -// getStartPoint parses the newest kernel log file and try to find the latest reboot point. -// Currently we rely on the kernel log timestamp to find the reboot point. The basic idea -// is straight forward: In the whole lifecycle of a node, the kernel log timestamp should -// always increase, only when it is reboot, the timestamp will decrease. We just parse the -// log and find the latest timestamp decreasing, then it should be the latest reboot point. -// TODO(random-liu): A drawback is that if the node is started long time ago, we'll only get -// logs in the newest kernel log file. We may want to improve this in the future. +// getStartPoint finds the start point to parse the log. The start point is either +// the line at (now - lookback) or the first line of kernel log. +// Notice that, kernel log watcher doesn't look back to the rolled out logs. func (k *kernelLogWatcher) getStartPoint(path string) (int64, error) { f, err := os.Open(path) if err != nil { - return -1, err + return 0, fmt.Errorf("failed to open file %q: %v", path, err) } defer f.Close() + lookback, err := parseDuration(k.cfg.Lookback) + if err != nil { + return 0, fmt.Errorf("failed to parse duration %q: %v", k.cfg.Lookback, err) + } start := int64(0) - total := 0 - lastTimestamp := int64(0) reader := bufio.NewReader(f) done := false for !done { line, err := reader.ReadString('\n') if err != nil { + if len(line) == 0 { + // No need to continue parsing if nothing is read + break + } done = true } - total += len(line) log, err := k.trans.Translate(line) if err != nil { glog.Infof("unable to parse line: %q, %v", line, err) - continue + } else if k.clock.Since(log.Timestamp) <= lookback { + break } - if log.Timestamp < lastTimestamp { - start = int64(total - len(line)) - } - lastTimestamp = log.Timestamp + start += int64(len(line)) } return start, nil } + +func parseDuration(s string) (time.Duration, error) { + // If the duration is not configured, just return 0 by default + if s == "" { + return 0, nil + } + return time.ParseDuration(s) +} diff --git a/pkg/kernelmonitor/kernel_log_watcher_test.go b/pkg/kernelmonitor/kernel_log_watcher_test.go index 3bd7d742..9ce1848f 100644 --- a/pkg/kernelmonitor/kernel_log_watcher_test.go +++ b/pkg/kernelmonitor/kernel_log_watcher_test.go @@ -24,50 +24,73 @@ import ( "time" "k8s.io/node-problem-detector/pkg/kernelmonitor/types" + + "github.com/pivotal-golang/clock/fakeclock" ) func TestGetStartPoint(t *testing.T) { + // now is a fake time + now := time.Date(time.Now().Year(), time.January, 2, 3, 4, 5, 0, time.Local) + fakeClock := fakeclock.NewFakeClock(now) testCases := []struct { - log string - logs []types.KernelLog - err bool + log string + logs []types.KernelLog + lookback string }{ { // The start point is at the head of the log file. - log: `kernel: [1.000000] 1 - kernel: [2.000000] 2 - kernel: [3.000000] 3 + log: `Jan 2 03:04:05 kernel: [0.000000] 1 + Jan 2 03:04:06 kernel: [1.000000] 2 + Jan 2 03:04:07 kernel: [2.000000] 3 `, logs: []types.KernelLog{ { - Timestamp: 1000000, + Timestamp: now, Message: "1", }, { - Timestamp: 2000000, + Timestamp: now.Add(time.Second), Message: "2", }, { - Timestamp: 3000000, + Timestamp: now.Add(2 * time.Second), Message: "3", }, }, }, { // The start point is in the middle of the log file. - log: `kernel: [3.000000] 3 - kernel: [1.000000] 1 - kernel: [2.000000] 2 + log: `Jan 2 03:04:04 kernel: [0.000000] 1 + Jan 2 03:04:05 kernel: [1.000000] 2 + Jan 2 03:04:06 kernel: [2.000000] 3 `, logs: []types.KernelLog{ { - Timestamp: 1000000, - Message: "1", + Timestamp: now, + Message: "2", }, { - Timestamp: 2000000, + Timestamp: now.Add(time.Second), + Message: "3", + }, + }, + }, + { + // The start point is at the end of the log file, but we look back. + log: `Jan 2 03:04:03 kernel: [0.000000] 1 + Jan 2 03:04:04 kernel: [1.000000] 2 + Jan 2 03:04:05 kernel: [2.000000] 3 + `, + lookback: "1s", + logs: []types.KernelLog{ + { + Timestamp: now.Add(-time.Second), Message: "2", }, + { + Timestamp: now, + Message: "3", + }, }, }, } @@ -84,7 +107,9 @@ func TestGetStartPoint(t *testing.T) { if err != nil { t.Fatal(err) } - w := NewKernelLogWatcher(WatcherConfig{KernelLogPath: f.Name()}) + w := NewKernelLogWatcher(WatcherConfig{KernelLogPath: f.Name(), Lookback: test.lookback}) + // Set the fake clock. + w.(*kernelLogWatcher).clock = fakeClock logCh, err := w.Watch() if err != nil { t.Fatal(err) diff --git a/pkg/kernelmonitor/kernel_monitor.go b/pkg/kernelmonitor/kernel_monitor.go index cd7293e4..c71ea28f 100644 --- a/pkg/kernelmonitor/kernel_monitor.go +++ b/pkg/kernelmonitor/kernel_monitor.go @@ -58,7 +58,6 @@ type kernelMonitor struct { buffer LogBuffer config MonitorConfig conditions []types.Condition - uptime time.Time logCh <-chan *kerntypes.KernelLog output chan *types.Status tomb *util.Tomb @@ -77,8 +76,6 @@ func NewKernelMonitorOrDie(configPath string) KernelMonitor { if err != nil { panic(err) } - // Initialize the default node conditions - k.conditions = initialConditions(k.config.DefaultConditions) err = validateRules(k.config.Rules) if err != nil { panic(err) @@ -89,8 +86,6 @@ func NewKernelMonitorOrDie(configPath string) KernelMonitor { if err != nil { panic(err) } - k.uptime = time.Now().Add(time.Duration(-info.Uptime * int64(time.Second))) - glog.Infof("Got system boot time: %v", k.uptime) k.watcher = NewKernelLogWatcher(k.config.WatcherConfig) k.buffer = NewLogBuffer(k.config.BufferSize) // A 1000 size channel should be big enough. @@ -117,22 +112,11 @@ func (k *kernelMonitor) Stop() { // monitorLoop is the main loop of kernel monitor. func (k *kernelMonitor) monitorLoop() { defer k.tomb.Done() - k.output <- k.initialStatus() // Update the initial status + k.initializeStatus() for { select { case log := <-k.logCh: - // Once there is new log, kernel monitor will push it into the log buffer and try - // to match each rule. If any rule is matched, kernel monitor will report a status. - k.buffer.Push(log) - for _, rule := range k.config.Rules { - matched := k.buffer.Match(rule.Pattern) - if len(matched) == 0 { - continue - } - status := k.generateStatus(matched, rule) - glog.Infof("New status generated: %+v", status) - k.output <- status - } + k.parseLog(log) case <-k.tomb.Stopping(): k.watcher.Stop() glog.Infof("Kernel monitor stopped") @@ -141,15 +125,33 @@ func (k *kernelMonitor) monitorLoop() { } } +// parseLog parses one log line. +func (k *kernelMonitor) parseLog(log *kerntypes.KernelLog) { + // Once there is new log, kernel monitor will push it into the log buffer and try + // to match each rule. If any rule is matched, kernel monitor will report a status. + k.buffer.Push(log) + if matched := k.buffer.Match(k.config.StartPattern); len(matched) != 0 { + // Reset the condition if a start log shows up. + glog.Infof("Found start log %q, re-initialize the status", generateMessage(matched)) + k.initializeStatus() + return + } + for _, rule := range k.config.Rules { + matched := k.buffer.Match(rule.Pattern) + if len(matched) == 0 { + continue + } + status := k.generateStatus(matched, rule) + glog.Infof("New status generated: %+v", status) + k.output <- status + } +} + // generateStatus generates status from the logs. func (k *kernelMonitor) generateStatus(logs []*kerntypes.KernelLog, rule kerntypes.Rule) *types.Status { // We use the timestamp of the first log line as the timestamp of the status. - timestamp := k.generateTimestamp(logs[0].Timestamp) - messages := []string{} - for _, log := range logs { - messages = append(messages, log.Message) - } - message := concatLogs(messages) + timestamp := logs[0].Timestamp + message := generateMessage(logs) var events []types.Event if rule.Type == kerntypes.Temp { // For temporary error only generate event @@ -181,14 +183,13 @@ func (k *kernelMonitor) generateStatus(logs []*kerntypes.KernelLog, rule kerntyp } } -// generateTimestamp converts the kernel log time to real time. -func (k *kernelMonitor) generateTimestamp(timestamp int64) time.Time { - return k.uptime.Add(time.Duration(timestamp * int64(time.Microsecond))) -} - -// initialStatus returns the initial status with initial condition. -func (k *kernelMonitor) initialStatus() *types.Status { - return &types.Status{ +// initializeStatus initializes the internal condition and also reports it to the node problem detector. +func (k *kernelMonitor) initializeStatus() { + // Initialize the default node conditions + k.conditions = initialConditions(k.config.DefaultConditions) + glog.Infof("Initalize condition generated: %+v", k.conditions) + // Update the initial status + k.output <- &types.Status{ Source: k.config.Source, Conditions: k.conditions, } @@ -215,3 +216,11 @@ func validateRules(rules []kerntypes.Rule) error { } return nil } + +func generateMessage(logs []*kerntypes.KernelLog) string { + messages := []string{} + for _, log := range logs { + messages = append(messages, log.Message) + } + return concatLogs(messages) +} diff --git a/pkg/kernelmonitor/kernel_monitor_test.go b/pkg/kernelmonitor/kernel_monitor_test.go index 1dd51a2f..26dc1856 100644 --- a/pkg/kernelmonitor/kernel_monitor_test.go +++ b/pkg/kernelmonitor/kernel_monitor_test.go @@ -32,7 +32,6 @@ const ( ) func TestGenerateStatus(t *testing.T) { - uptime := time.Unix(1000, 0) initConditions := []types.Condition{ { Type: testConditionA, @@ -47,11 +46,11 @@ func TestGenerateStatus(t *testing.T) { } logs := []*kerntypes.KernelLog{ { - Timestamp: 100000, + Timestamp: time.Unix(1000, 1000), Message: "test message 1", }, { - Timestamp: 200000, + Timestamp: time.Unix(2000, 2000), Message: "test message 2", }, } @@ -72,7 +71,7 @@ func TestGenerateStatus(t *testing.T) { { Type: testConditionA, Status: true, - Transition: time.Unix(1000, 100000*1000), + Transition: time.Unix(1000, 1000), Reason: "test reason", Message: "test message 1\ntest message 2", }, @@ -89,7 +88,7 @@ func TestGenerateStatus(t *testing.T) { Source: testSource, Events: []types.Event{{ Severity: types.Warn, - Timestamp: time.Unix(1000, 100000*1000), + Timestamp: time.Unix(1000, 1000), Reason: "test reason", Message: "test message 1\ntest message 2", }}, @@ -102,7 +101,6 @@ func TestGenerateStatus(t *testing.T) { Source: testSource, }, conditions: initConditions, - uptime: uptime, } got := k.generateStatus(logs, test.rule) if !reflect.DeepEqual(&test.expected, got) { diff --git a/pkg/kernelmonitor/translator/translator.go b/pkg/kernelmonitor/translator/translator.go index 4dc4d9bf..1915a6e4 100644 --- a/pkg/kernelmonitor/translator/translator.go +++ b/pkg/kernelmonitor/translator/translator.go @@ -18,8 +18,8 @@ package translator import ( "fmt" - "strconv" "strings" + "time" "k8s.io/node-problem-detector/pkg/kernelmonitor/types" ) @@ -41,11 +41,7 @@ func NewDefaultTranslator() Translator { } func (t *defaultTranslator) Translate(line string) (*types.KernelLog, error) { - timestr, message, err := parseLine(line) - if err != nil { - return nil, err - } - timestamp, err := parseTimestamp(timestr) + timestamp, message, err := t.parseLine(line) if err != nil { return nil, err } @@ -55,32 +51,35 @@ func (t *defaultTranslator) Translate(line string) (*types.KernelLog, error) { }, nil } -func parseLine(line string) (string, string, error) { +var ( + timestampLen = 15 + messagePrefix = "]" +) + +func (t *defaultTranslator) parseLine(line string) (time.Time, string, error) { + // Trim the spaces to make sure timestamp could be found + line = strings.TrimSpace(line) + if len(line) < timestampLen { + return time.Time{}, "", fmt.Errorf("the line is too short: %q", line) + } // Example line: Jan 1 00:00:00 hostname kernel: [0.000000] component: log message - timestampPrefix := "kernel: [" - timestampSuffix := "]" - idx := strings.Index(line, timestampPrefix) - if idx == -1 { - return "", "", fmt.Errorf("can't find timestamp prefix %q in line %q", timestampPrefix, line) + now := time.Now() + // There is no time zone information in kernel log timestamp, apply the current time + // zone. + timestamp, err := time.ParseInLocation(time.Stamp, line[:timestampLen], time.Local) + if err != nil { + return time.Time{}, "", fmt.Errorf("error parsing timestamp in line %q: %v", line, err) } - line = line[idx+len(timestampPrefix):] + // There is no year information in kernel log timestamp, apply the current year. + // This could go wrong during looking back phase after kernel monitor is started, + // and the old logs are generated in old year. + timestamp = timestamp.AddDate(now.Year(), 0, 0) - idx = strings.Index(line, timestampSuffix) - if idx == -1 { - return "", "", fmt.Errorf("can't find timestamp suffix %q in line %q", timestampSuffix, line) + loc := strings.Index(line, messagePrefix) + if loc == -1 { + return timestamp, "", fmt.Errorf("can't find message prefix %q in line %q", messagePrefix, line) } - - timestamp := strings.Trim(line[:idx], " ") - message := strings.Trim(line[idx+1:], " ") + message := strings.Trim(line[loc+1:], " ") return timestamp, message, nil } - -func parseTimestamp(timestamp string) (int64, error) { - f, err := strconv.ParseFloat(timestamp, 64) - if err != nil { - return 0, err - } - // seconds to microseconds - return int64(f * 1000000), nil -} diff --git a/pkg/kernelmonitor/translator/translator_test.go b/pkg/kernelmonitor/translator/translator_test.go index 032c0349..98333ac4 100644 --- a/pkg/kernelmonitor/translator/translator_test.go +++ b/pkg/kernelmonitor/translator/translator_test.go @@ -18,33 +18,32 @@ package translator import ( "testing" + "time" ) func TestDefaultTranslator(t *testing.T) { tr := NewDefaultTranslator() - + year := time.Now().Year() testCases := []struct { input string err bool - timestamp int64 + timestamp time.Time message string }{ { - input: "Jan 1 00:00:00 hostname kernel: [9.999999] component: log message", - timestamp: 9999999, + input: "May 1 12:23:45 hostname kernel: [0.000000] component: log message", + timestamp: time.Date(year, time.May, 1, 12, 23, 45, 0, time.Local), message: "component: log message", }, { - input: "Jan 1 00:00:00 hostname kernel: [9.999999]", - timestamp: 9999999, + // no log message + input: "May 21 12:23:45 hostname kernel: [9.999999]", + timestamp: time.Date(year, time.May, 21, 12, 23, 45, 0, time.Local), message: "", }, { - input: "Jan 1 00:00:00 hostname kernel: [9.999999 component: log message", - err: true, - }, - { - input: "Jan 1 00:00:00 hostname user: [9.999999] component: log message", + // the right square bracket is missing + input: "May 21 12:23:45 hostname kernel: [9.999999 component: log message", err: true, }, } @@ -58,7 +57,7 @@ func TestDefaultTranslator(t *testing.T) { continue } if test.timestamp != log.Timestamp || test.message != log.Message { - t.Errorf("case %d: expect timestamp: %d, message: %q; got %+v", c+1, test.timestamp, test.message, log) + t.Errorf("case %d: expect %v, %q; got %v, %q", c+1, test.timestamp, test.message, log.Timestamp, log.Message) } } } diff --git a/pkg/kernelmonitor/types/types.go b/pkg/kernelmonitor/types/types.go index 5545ffaf..0793463c 100644 --- a/pkg/kernelmonitor/types/types.go +++ b/pkg/kernelmonitor/types/types.go @@ -16,10 +16,14 @@ limitations under the License. package types +import ( + "time" +) + // KernelLog is the log item returned by translator. It's very easy to extend this // to support other log monitoring, such as docker log monitoring. type KernelLog struct { - Timestamp int64 // microseconds since kernel boot + Timestamp time.Time Message string }