Files
node-problem-detector/pkg/kernelmonitor/kernel_log_watcher.go
Lantao Liu 532f933bd8 This PR:
1) Add lookback support to the kernel monitor. Once started, the kernel monitor
checks older logs to detect problems that happened before the last
node reboot.
2) Add `lookback` and `startPattern` in kernel monitor configuration.
  * `lookback` specifies how far back in time the kernel monitor should look.
  * `startPattern` specifies which log indicates the node is started.
  kernel monitor will clear all current node conditions once it finds
  a node start log. This makes sure that old problems won't change the
  node condition.
3) Add support for kernel panic monitoring: null-pointer and divide-by-zero
kernel panics will be surfaced as events. Usually the kernel monitor will
report these events during the lookback phase.
2016-08-20 19:11:26 -07:00

200 lines
5.9 KiB
Go

/*
Copyright 2016 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package kernelmonitor
import (
"bufio"
"fmt"
"os"
"time"
"k8s.io/node-problem-detector/pkg/kernelmonitor/translator"
"k8s.io/node-problem-detector/pkg/kernelmonitor/types"
"k8s.io/node-problem-detector/pkg/kernelmonitor/util"
"github.com/golang/glog"
"github.com/hpcloud/tail"
utilclock "github.com/pivotal-golang/clock"
)
// defaultKernelLogPath is the kernel log file that is watched when the
// configuration does not specify a path.
const defaultKernelLogPath = "/var/log/kern.log"
// WatcherConfig is the configuration of kernel log watcher.
//
// Note on the struct tags: encoding/json matches tag options exactly, so the
// option list must not contain whitespace. A tag such as
// `json:"logPath, omitempty"` silently disables omitempty.
type WatcherConfig struct {
	// KernelLogPath is the path to the kernel log. When it is empty, Watch
	// falls back to defaultKernelLogPath.
	KernelLogPath string `json:"logPath,omitempty"`
	// StartPattern is the pattern of the log line that marks the node start.
	// It is not consumed anywhere in this file.
	StartPattern string `json:"startPattern,omitempty"`
	// Lookback is how far back in time the kernel watcher looks when it
	// starts. It is a duration string parsed with time.ParseDuration; an
	// empty value means a zero duration.
	Lookback string `json:"lookback,omitempty"`
}
// KernelLogWatcher watches and translates the kernel log. Whenever a new log
// line appears, it is translated into a *types.KernelLog and reported on the
// watch channel.
type KernelLogWatcher interface {
// Watch starts the kernel log watcher and returns a receive-only channel on
// which translated kernel logs are delivered, or an error if the watcher
// could not be started.
// The implementation in this file returns a nil channel together with a nil
// error when the kernel log file does not exist.
Watch() (<-chan *types.KernelLog, error)
// Stop stops the kernel log watcher.
Stop()
}
// kernelLogWatcher is the KernelLogWatcher implementation returned by
// NewKernelLogWatcher. It tails the kernel log file and forwards every line
// that can be translated.
type kernelLogWatcher struct {
// trans is the translator that converts a raw log line into the internal format.
trans translator.Translator
// cfg is the watcher configuration passed to NewKernelLogWatcher.
cfg WatcherConfig
// tl is the tail handle on the kernel log file; it is set by Watch.
tl *tail.Tail
// logCh is the buffered channel on which translated logs are delivered.
// It is closed when watchLoop returns.
logCh chan *types.KernelLog
// tomb is used to ask watchLoop to stop and to signal that it has finished.
tomb *util.Tomb
// clock is used by getStartPoint to compute how old a log entry is.
clock utilclock.Clock
}
// NewKernelLogWatcher returns a kernel log watcher built from cfg. The
// watcher uses the default translator, a fresh tomb, a real clock and a
// buffered log channel; nothing is read until Watch is called.
func NewKernelLogWatcher(cfg WatcherConfig) KernelLogWatcher {
	// A capacity 1000 buffer should be enough
	const logBufferSize = 1000

	w := new(kernelLogWatcher)
	w.trans = translator.NewDefaultTranslator()
	w.cfg = cfg
	w.tomb = util.NewTomb()
	w.logCh = make(chan *types.KernelLog, logBufferSize)
	w.clock = utilclock.NewClock()
	return w
}
// Watch starts tailing the kernel log and returns the channel on which the
// translated logs are delivered. The configured log path is used when set,
// otherwise defaultKernelLogPath. Tailing starts at the offset computed by
// getStartPoint, and the lines are processed by watchLoop in its own
// goroutine.
//
// When the kernel log file does not exist, Watch returns a nil channel and a
// nil error (see the note in the body).
func (k *kernelLogWatcher) Watch() (<-chan *types.KernelLog, error) {
	logPath := k.cfg.KernelLogPath
	if logPath == "" {
		logPath = defaultKernelLogPath
	}

	// NOTE(random-liu): This is a hack. KernelMonitor does not support every OS distro
	// (e.g. GCI). Ideally it would only run on nodes with a supported distro, but
	// NodeProblemDetector runs as a DaemonSet and therefore has to be deployed on every
	// node (there is no node affinity support for DaemonSet yet, see #22205). On a node
	// with an unsupported distro (e.g. the gke/gce master node runs GCI) KernelMonitor
	// would keep returning an error and NodeProblemDetector would be restarted again and
	// again.
	// As a temporary workaround, when the kernel log file can't be found we only print a
	// log and return a nil channel with no error. Receiving from a nil channel blocks
	// forever, so NodeProblemDetector simply blocks instead of crash looping.
	// TODO(random-liu):
	// 1. Add journald support to support GCI.
	// 2. Schedule KernelMonitor only on supported nodes (with node label and selector).
	_, statErr := os.Stat(logPath)
	if os.IsNotExist(statErr) {
		glog.Infof("kernel log %q is not found, kernel monitor doesn't support the os distro", logPath)
		return nil, nil
	}

	offset, err := k.getStartPoint(logPath)
	if err != nil {
		return nil, err
	}

	// TODO(random-liu): If the file gets recreated between computing the offset and
	// starting the tail, the logic will go wrong here.
	// TODO(random-liu): Rate limit tail file.
	// TODO(random-liu): Figure out what happens if log lines are removed.
	tailCfg := tail.Config{
		Location: &tail.SeekInfo{
			Offset: offset,
			Whence: os.SEEK_SET,
		},
		Poll:   true,
		ReOpen: true,
		Follow: true,
	}
	k.tl, err = tail.TailFile(logPath, tailCfg)
	if err != nil {
		return nil, err
	}

	glog.Info("Start watching kernel log")
	go k.watchLoop()
	return k.logCh, nil
}
// Stop asks the watch loop to stop by calling Stop on the watcher's tomb;
// watchLoop observes the request through tomb.Stopping().
// NOTE(review): whether this call blocks until watchLoop has called
// tomb.Done depends on util.Tomb, which is not visible here — confirm.
func (k *kernelLogWatcher) Stop() {
k.tomb.Stop()
}
// watchLoop is the main watch loop of kernel log watcher. It runs in its own
// goroutine (started by Watch) and, for every line delivered by the tail,
// translates the line and forwards the result on logCh. Lines that carry a
// tail error or that cannot be translated are logged and skipped.
//
// The loop ends when a stop is requested through the tomb or when the tail's
// Lines channel is closed. On exit logCh is closed and the tomb is marked
// done.
func (k *kernelLogWatcher) watchLoop() {
	defer func() {
		close(k.logCh)
		k.tomb.Done()
	}()
	for {
		select {
		case line, ok := <-k.tl.Lines:
			if !ok {
				// A receive from a closed channel yields a nil *tail.Line;
				// dereferencing it below would panic. If the tail ever
				// closes Lines there is nothing left to watch, so stop.
				glog.Errorf("Tail channel is closed, stop watching kernel log")
				return
			}
			// Notice that tail has trimmed '\n'
			if line.Err != nil {
				glog.Errorf("Tail error: %v", line.Err)
				continue
			}
			log, err := k.trans.Translate(line.Text)
			if err != nil {
				// Log the text of the line, not the *tail.Line pointer.
				glog.Infof("Unable to parse line: %q, %v", line.Text, err)
				continue
			}
			k.logCh <- log
		case <-k.tomb.Stopping():
			k.tl.Stop()
			glog.Infof("Stop watching kernel log")
			return
		}
	}
}
// getStartPoint returns the byte offset in the kernel log at path from which
// parsing should start. The file is scanned line by line from the beginning
// and the scan stops at the first line that can be translated and whose
// timestamp is no older than (now - lookback); the offset of that line is
// returned. Lines that cannot be translated are logged and skipped. If no
// line qualifies, the returned offset is the total length of the data read.
// Notice that, kernel log watcher doesn't look back to the rolled out logs.
//
// An error is returned when the file cannot be opened or when the configured
// lookback is not a valid duration.
func (k *kernelLogWatcher) getStartPoint(path string) (int64, error) {
	file, err := os.Open(path)
	if err != nil {
		return 0, fmt.Errorf("failed to open file %q: %v", path, err)
	}
	defer file.Close()

	lookback, err := parseDuration(k.cfg.Lookback)
	if err != nil {
		return 0, fmt.Errorf("failed to parse duration %q: %v", k.cfg.Lookback, err)
	}

	var offset int64
	reader := bufio.NewReader(file)
	for {
		text, readErr := reader.ReadString('\n')
		if readErr != nil && len(text) == 0 {
			// No need to continue parsing if nothing is read
			break
		}
		entry, parseErr := k.trans.Translate(text)
		if parseErr != nil {
			glog.Infof("unable to parse line: %q, %v", text, parseErr)
		} else if k.clock.Since(entry.Timestamp) <= lookback {
			// First entry inside the lookback window: start here.
			break
		}
		offset += int64(len(text))
		if readErr != nil {
			// The last, partial line has been accounted for; nothing more to read.
			break
		}
	}
	return offset, nil
}
// parseDuration converts s into a time.Duration using time.ParseDuration.
// An empty string means the duration is not configured and yields 0 with no
// error.
func parseDuration(s string) (time.Duration, error) {
	if s != "" {
		return time.ParseDuration(s)
	}
	// Not configured: default to 0.
	return 0, nil
}