diff --git a/Makefile b/Makefile index ae3f6646..8052843f 100644 --- a/Makefile +++ b/Makefile @@ -77,6 +77,11 @@ fmt: version: @echo $(VERSION) +./bin/log-counter: $(PKG_SOURCES) + CGO_ENABLED=$(CGO_ENABLED) GOOS=linux go build -o bin/log-counter \ + -ldflags '-X $(PKG)/pkg/version.version=$(VERSION)' \ + $(BUILD_TAGS) cmd/logcounter/log_counter.go + ./bin/node-problem-detector: $(PKG_SOURCES) CGO_ENABLED=$(CGO_ENABLED) GOOS=linux go build -o bin/node-problem-detector \ -ldflags '-X $(PKG)/pkg/version.version=$(VERSION)' \ @@ -88,10 +93,10 @@ Dockerfile: Dockerfile.in test: vet fmt go test -timeout=1m -v -race ./cmd/options ./pkg/... $(BUILD_TAGS) -build-container: ./bin/node-problem-detector Dockerfile +build-container: ./bin/node-problem-detector ./bin/log-counter Dockerfile docker build -t $(IMAGE) . -build-tar: ./bin/node-problem-detector +build-tar: ./bin/node-problem-detector ./bin/log-counter tar -zcvf $(TARBALL) bin/ config/ sha1sum $(TARBALL) md5sum $(TARBALL) @@ -107,5 +112,6 @@ push-tar: build-tar push: push-container push-tar clean: + rm -f bin/log-counter rm -f bin/node-problem-detector rm -f node-problem-detector-*.tar.gz diff --git a/cmd/logcounter/log_counter.go b/cmd/logcounter/log_counter.go new file mode 100644 index 00000000..6d8ebdb4 --- /dev/null +++ b/cmd/logcounter/log_counter.go @@ -0,0 +1,46 @@ +/* +Copyright 2018 The Kubernetes Authors All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package main + +import ( + "fmt" + "os" + + "github.com/spf13/pflag" + + "k8s.io/node-problem-detector/cmd/logcounter/options" + "k8s.io/node-problem-detector/pkg/custompluginmonitor/types" + "k8s.io/node-problem-detector/pkg/logcounter" +) + +func main() { + fedo := options.NewLogCounterOptions() + fedo.AddFlags(pflag.CommandLine) + pflag.Parse() + + counter, err := logcounter.NewKmsgLogCounter(fedo) + if err != nil { + fmt.Print(err) + os.Exit(int(types.Unknown)) + } + actual := counter.Count() + if actual >= fedo.Count { + fmt.Printf("Found %d matching logs, which meets the threshold of %d\n", actual, fedo.Count) + os.Exit(int(types.NonOK)) + } + os.Exit(int(types.OK)) +} diff --git a/cmd/logcounter/options/options.go b/cmd/logcounter/options/options.go new file mode 100644 index 00000000..23aec1a1 --- /dev/null +++ b/cmd/logcounter/options/options.go @@ -0,0 +1,48 @@ +/* +Copyright 2018 The Kubernetes Authors All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package options + +import ( + "flag" + + "github.com/spf13/pflag" +) + +func NewLogCounterOptions() *LogCounterOptions { + return &LogCounterOptions{} +} + +// LogCounterOptions contains frequent event detector command line and application options. +type LogCounterOptions struct { + // command line options. See flag descriptions for the description + Lookback string + Pattern string + Count int +} + +// AddFlags adds log counter command line options to pflag. +func (fedo *LogCounterOptions) AddFlags(fs *pflag.FlagSet) { + fs.StringVar(&fedo.Lookback, "lookback", "", "The time log watcher looks up") + fs.StringVar(&fedo.Pattern, "pattern", "", + "The regular expression to match the problem in log. The pattern must match to the end of the line.") + fs.IntVar(&fedo.Count, "count", 1, + "The number of times the pattern must be found to trigger the condition") +} + +func init() { + pflag.CommandLine.AddGoFlagSet(flag.CommandLine) +} diff --git a/config/kernel-monitor-counter.json b/config/kernel-monitor-counter.json new file mode 100644 index 00000000..e217e927 --- /dev/null +++ b/config/kernel-monitor-counter.json @@ -0,0 +1,31 @@ +{ + "plugin": "custom", + "pluginConfig": { + "invoke_interval": "5m", + "timeout": "1m", + "max_output_length": 80, + "concurrency": 1 + }, + "source": "kernel-monitor", + "conditions": [ + { + "type": "FrequentUnregisterNetDevice", + "reason": "NoFrequentUnregisterNetDevice", + "message": "node is functioning properly" + } + ], + "rules": [ + { + "type": "permanent", + "condition": "FrequentUnregisterNetDevice", + "reason": "UnregisterNetDevice", + "path": "/home/kubernetes/bin/log-counter", + "args": [ + "--lookback=20m", + "--count=3", + "--pattern=unregister_netdevice: waiting for \\w+ to become free. Usage count = \\d+" + ], + "timeout": "1m" + } + ] +} diff --git a/pkg/custompluginmonitor/plugin/plugin.go b/pkg/custompluginmonitor/plugin/plugin.go index 71813a0e..587fff34 100644 --- a/pkg/custompluginmonitor/plugin/plugin.go +++ b/pkg/custompluginmonitor/plugin/plugin.go @@ -108,7 +108,7 @@ func (p *Plugin) run(rule cpmtypes.CustomRule) (exitStatus cpmtypes.Status, outp } defer cancel() - cmd := exec.CommandContext(ctx, rule.Path) + cmd := exec.CommandContext(ctx, rule.Path, rule.Args...) stdout, err := cmd.Output() if err != nil { if _, ok := err.(*exec.ExitError); !ok { diff --git a/pkg/custompluginmonitor/types/types.go b/pkg/custompluginmonitor/types/types.go index eb5b3af2..c0b24172 100644 --- a/pkg/custompluginmonitor/types/types.go +++ b/pkg/custompluginmonitor/types/types.go @@ -48,6 +48,8 @@ type CustomRule struct { Reason string `json:"reason"` // Path is the path to the custom plugin. Path string `json:"path"` + // Args is the args passed to the custom plugin. + Args []string `json:"args"` // Timeout is the timeout string for the custom plugin to execute. TimeoutString *string `json:"timeout"` // Timeout is the timeout for the custom plugin to execute. diff --git a/pkg/logcounter/log_counter.go b/pkg/logcounter/log_counter.go new file mode 100644 index 00000000..1b2f55d2 --- /dev/null +++ b/pkg/logcounter/log_counter.go @@ -0,0 +1,78 @@ +/* +Copyright 2018 The Kubernetes Authors All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package logcounter + +import ( + "fmt" + "time" + + "k8s.io/kubernetes/pkg/util/clock" + + "k8s.io/node-problem-detector/cmd/logcounter/options" + "k8s.io/node-problem-detector/pkg/logcounter/types" + "k8s.io/node-problem-detector/pkg/systemlogmonitor" + "k8s.io/node-problem-detector/pkg/systemlogmonitor/logwatchers/kmsg" + watchertypes "k8s.io/node-problem-detector/pkg/systemlogmonitor/logwatchers/types" + systemtypes "k8s.io/node-problem-detector/pkg/systemlogmonitor/types" +) + +const ( + bufferSize = 1000 + timeout = 1 * time.Second +) + +type logCounter struct { + logCh <-chan *systemtypes.Log + buffer systemlogmonitor.LogBuffer + pattern string + clock clock.Clock +} + +func NewKmsgLogCounter(options *options.LogCounterOptions) (types.LogCounter, error) { + watcher := kmsg.NewKmsgWatcher(watchertypes.WatcherConfig{Lookback: options.Lookback}) + logCh, err := watcher.Watch() + if err != nil { + return nil, fmt.Errorf("error watching kmsg: %v", err) + } + return &logCounter{ + logCh: logCh, + buffer: systemlogmonitor.NewLogBuffer(bufferSize), + pattern: options.Pattern, + clock: clock.RealClock{}, + }, nil +} + +func (e *logCounter) Count() (count int) { + start := e.clock.Now() + for { + select { + case log := <-e.logCh: + // We only want to count events up until the time at which we started. + // Otherwise we would run forever + if start.Before(log.Timestamp) { + return + } + e.buffer.Push(log) + if len(e.buffer.Match(e.pattern)) != 0 { + count++ + } + case <-e.clock.After(timeout): + // Don't block forever if we do not get any new messages + return + } + } +} diff --git a/pkg/logcounter/log_counter_test.go b/pkg/logcounter/log_counter_test.go new file mode 100644 index 00000000..29d01c5e --- /dev/null +++ b/pkg/logcounter/log_counter_test.go @@ -0,0 +1,129 @@ +/* +Copyright 2018 The Kubernetes Authors All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package logcounter + +import ( + "testing" + "time" + + "k8s.io/kubernetes/pkg/util/clock" + + "k8s.io/node-problem-detector/pkg/logcounter/types" + "k8s.io/node-problem-detector/pkg/systemlogmonitor" + systemtypes "k8s.io/node-problem-detector/pkg/systemlogmonitor/types" +) + +func NewTestLogCounter(pattern string, startTime time.Time) (types.LogCounter, *clock.FakeClock, chan *systemtypes.Log) { + logCh := make(chan *systemtypes.Log) + clock := clock.NewFakeClock(startTime) + return &logCounter{ + logCh: logCh, + buffer: systemlogmonitor.NewLogBuffer(bufferSize), + pattern: pattern, + clock: clock, + }, clock, logCh +} + +func TestCount(t *testing.T) { + startTime := time.Now() + for _, tc := range []struct { + description string + logs []*systemtypes.Log + pattern string + expectedCount int + }{ + { + description: "no logs", + logs: []*systemtypes.Log{}, + pattern: "", + expectedCount: 0, + }, + { + description: "one matching log", + logs: []*systemtypes.Log{ + { + Timestamp: startTime.Add(-time.Second), + Message: "0", + }, + }, + pattern: "0", + expectedCount: 1, + }, + { + description: "one non-matching log", + logs: []*systemtypes.Log{ + { + Timestamp: startTime.Add(-time.Second), + Message: "1", + }, + }, + pattern: "0", + expectedCount: 0, + }, + { + description: "log too new", + logs: []*systemtypes.Log{ + { + Timestamp: startTime.Add(time.Second), + Message: "0", + }, + }, + pattern: "0", + expectedCount: 0, + }, + { + description: "many logs", + logs: []*systemtypes.Log{ + { + Timestamp: startTime.Add(-time.Second), + Message: "0", + }, + { + Timestamp: startTime.Add(-time.Second), + Message: "0", + }, + { + Timestamp: startTime.Add(-time.Second), + Message: "1", + }, + { + Timestamp: startTime.Add(time.Second), + Message: "0", + }, + }, + pattern: "0", + expectedCount: 2, + }, + } { + t.Run(tc.description, func(t *testing.T) { + counter, fakeClock, logCh := NewTestLogCounter(tc.pattern, startTime) + go func(logs []*systemtypes.Log, ch chan<- *systemtypes.Log) { + for _, log := range logs { + ch <- log + } + // trigger the timeout to ensure the test doesn't block permenantly + for { + fakeClock.Step(2 * timeout) + } + }(tc.logs, logCh) + actualCount := counter.Count() + if actualCount != tc.expectedCount { + t.Errorf("got %d; expected %d", actualCount, tc.expectedCount) + } + }) + } +} diff --git a/pkg/logcounter/types/types.go b/pkg/logcounter/types/types.go new file mode 100644 index 00000000..640d3a78 --- /dev/null +++ b/pkg/logcounter/types/types.go @@ -0,0 +1,21 @@ +/* +Copyright 2018 The Kubernetes Authors All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package types + +type LogCounter interface { + Count() int +}