From fbebcf311b341a2f057ee72c3ed0fe05e220ecce Mon Sep 17 00:00:00 2001 From: Xuewei Zhang Date: Wed, 26 Jun 2019 15:44:55 -0700 Subject: [PATCH] Report metrics from system-log-monitor --- README.md | 2 +- config/docker-monitor.json | 1 + config/kernel-monitor.json | 1 + config/systemd-monitor.json | 1 + pkg/problemmetrics/problem_metrics.go | 114 +++++ pkg/problemmetrics/problem_metrics_stub.go | 35 ++ pkg/problemmetrics/problem_metrics_test.go | 277 +++++++++++ pkg/systemlogmonitor/README.md | 27 + pkg/systemlogmonitor/config.go | 15 +- pkg/systemlogmonitor/log_monitor.go | 48 +- pkg/systemlogmonitor/log_monitor_test.go | 553 ++++++++++++++++++++- pkg/systemlogmonitor/types/types.go | 3 +- pkg/systemstatsmonitor/disk_collector.go | 71 ++- pkg/systemstatsmonitor/host_collector.go | 37 +- pkg/util/metrics/fakes.go | 108 ++++ pkg/util/metrics/fakes_test.go | 243 +++++++++ pkg/util/metrics/helpers.go | 155 +++++- test/build.sh | 4 +- 18 files changed, 1608 insertions(+), 87 deletions(-) create mode 100644 pkg/problemmetrics/problem_metrics.go create mode 100644 pkg/problemmetrics/problem_metrics_stub.go create mode 100644 pkg/problemmetrics/problem_metrics_test.go create mode 100644 pkg/util/metrics/fakes.go create mode 100644 pkg/util/metrics/fakes_test.go diff --git a/README.md b/README.md index 8d8c22ce..dfebc102 100644 --- a/README.md +++ b/README.md @@ -61,7 +61,7 @@ List of supported problem daemons: | Problem Daemon | NodeCondition | Description | Disabling Build Tag | |----------------|:---------------:|:------------|:--------------------| -| [KernelMonitor](https://github.com/kubernetes/node-problem-detector/blob/master/config/kernel-monitor.json) | KernelDeadlock | A system log monitor monitors kernel log and reports problem according to predefined rules. | disable_system_log_monitor +| [KernelMonitor](https://github.com/kubernetes/node-problem-detector/blob/master/config/kernel-monitor.json) | KernelDeadlock | A system log monitor monitors kernel log and reports problems and metrics according to predefined rules. | disable_system_log_monitor | [AbrtAdaptor](https://github.com/kubernetes/node-problem-detector/blob/master/config/abrt-adaptor.json) | None | Monitor ABRT log messages and report them further. ABRT (Automatic Bug Report Tool) is health monitoring daemon able to catch kernel problems as well as application crashes of various kinds occurred on the host. For more information visit the [link](https://github.com/abrt). | disable_system_log_monitor | [CustomPluginMonitor](https://github.com/kubernetes/node-problem-detector/blob/master/config/custom-plugin-monitor.json) | On-demand(According to users configuration) | A custom plugin monitor for node-problem-detector to invoke and check various node problems with user defined check scripts. See proposal [here](https://docs.google.com/document/d/1jK_5YloSYtboj-DtfjmYKxfNnUxCAvohLnsH5aGCAYQ/edit#). | disable_custom_plugin_monitor | [SystemStatsMonitor](https://github.com/kubernetes/node-problem-detector/blob/master/config/system-stats-monitor.json) | None(Could be added in the future) | A system stats monitor for node-problem-detector to collect various health-related system stats as metrics. See proposal [here](https://docs.google.com/document/d/1SeaUz6kBavI283Dq8GBpoEUDrHA2a795xtw0OvjM568/edit). | disable_system_stats_monitor diff --git a/config/docker-monitor.json b/config/docker-monitor.json index d6294d4c..15cc950a 100644 --- a/config/docker-monitor.json +++ b/config/docker-monitor.json @@ -7,6 +7,7 @@ "lookback": "5m", "bufferSize": 10, "source": "docker-monitor", + "metricsReporting": true, "conditions": [], "rules": [ { diff --git a/config/kernel-monitor.json b/config/kernel-monitor.json index aa2dd4a5..5588b608 100644 --- a/config/kernel-monitor.json +++ b/config/kernel-monitor.json @@ -4,6 +4,7 @@ "lookback": "5m", "bufferSize": 10, "source": "kernel-monitor", + "metricsReporting": true, "conditions": [ { "type": "KernelDeadlock", diff --git a/config/systemd-monitor.json b/config/systemd-monitor.json index 17376fe9..4db8cf90 100644 --- a/config/systemd-monitor.json +++ b/config/systemd-monitor.json @@ -7,6 +7,7 @@ "lookback": "", "bufferSize": 10, "source": "systemd-monitor", + "metricsReporting": true, "conditions": [], "rules": [ { diff --git a/pkg/problemmetrics/problem_metrics.go b/pkg/problemmetrics/problem_metrics.go new file mode 100644 index 00000000..2d82ab04 --- /dev/null +++ b/pkg/problemmetrics/problem_metrics.go @@ -0,0 +1,114 @@ +/* +Copyright 2019 The Kubernetes Authors All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package problemmetrics + +import ( + "errors" + "fmt" + "sync" + + "github.com/golang/glog" + + "k8s.io/node-problem-detector/pkg/util/metrics" +) + +// GlobalProblemMetricsManager is a singleton of ProblemMetricsManager, +// which should be used to manage all problem-converted metrics across all +// problem daemons. +var GlobalProblemMetricsManager *ProblemMetricsManager + +func init() { + GlobalProblemMetricsManager = NewProblemMetricsManagerOrDie() +} + +// ProblemMetricsManager manages problem-converted metrics. +// ProblemMetricsManager is thread-safe. +type ProblemMetricsManager struct { + problemCounter metrics.Int64MetricInterface + problemGauge metrics.Int64MetricInterface + problemTypeToReason map[string]string + problemTypeToReasonMutex sync.Mutex +} + +func NewProblemMetricsManagerOrDie() *ProblemMetricsManager { + pmm := ProblemMetricsManager{} + + var err error + pmm.problemCounter, err = metrics.NewInt64Metric( + "problem_counter", + "Number of times a specific type of problem have occurred.", + "1", + metrics.Sum, + []string{"reason"}) + if err != nil { + glog.Fatalf("Failed to create problem_counter metric: %v", err) + } + + pmm.problemGauge, err = metrics.NewInt64Metric( + "problem_gauge", + "Whether a specific type of problem is affecting the node or not.", + "1", + metrics.LastValue, + []string{"type", "reason"}) + if err != nil { + glog.Fatalf("Failed to create problem_gauge metric: %v", err) + } + + pmm.problemTypeToReason = make(map[string]string) + + return &pmm +} + +// IncrementProblemCounter increments the value of a problem counter. +func (pmm *ProblemMetricsManager) IncrementProblemCounter(reason string, count int64) error { + if pmm.problemCounter == nil { + return errors.New("problem counter is being incremented before initialized.") + } + + return pmm.problemCounter.Record(map[string]string{"reason": reason}, count) +} + +// SetProblemGauge sets the value of a problem gauge. +func (pmm *ProblemMetricsManager) SetProblemGauge(problemType string, reason string, value bool) error { + if pmm.problemGauge == nil { + return errors.New("problem gauge is being set before initialized.") + } + + pmm.problemTypeToReasonMutex.Lock() + defer pmm.problemTypeToReasonMutex.Unlock() + + // We clear the last reason, because the expected behavior is that at any point of time, + // for each type of permanent problem, there should be at most one reason got set to 1. + // This behavior is consistent with the behavior of node condition in Kubernetes. + // However, problemGauges with different "type" and "reason" are considered as different + // metrics in Prometheus. So we need to clear the previous metrics explicitly. + if lastReason, ok := pmm.problemTypeToReason[problemType]; ok { + err := pmm.problemGauge.Record(map[string]string{"type": problemType, "reason": lastReason}, 0) + if err != nil { + return fmt.Errorf("failed to clear previous reason %q for type %q: %v", + problemType, lastReason, err) + } + } + + pmm.problemTypeToReason[problemType] = reason + + var valueInt int64 + if value { + valueInt = 1 + } + return pmm.problemGauge.Record(map[string]string{"type": problemType, "reason": reason}, valueInt) +} diff --git a/pkg/problemmetrics/problem_metrics_stub.go b/pkg/problemmetrics/problem_metrics_stub.go new file mode 100644 index 00000000..de78d5e2 --- /dev/null +++ b/pkg/problemmetrics/problem_metrics_stub.go @@ -0,0 +1,35 @@ +/* +Copyright 2019 The Kubernetes Authors All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package problemmetrics + +import ( + "k8s.io/node-problem-detector/pkg/util/metrics" +) + +// NewProblemMetricsManagerStub creates a ProblemMetricsManager stubbed by fake metrics. +// The stubbed ProblemMetricsManager and fake metrics are returned. +func NewProblemMetricsManagerStub() (*ProblemMetricsManager, *metrics.FakeInt64Metric, *metrics.FakeInt64Metric) { + fakeProblemCounter := metrics.NewFakeInt64Metric("problem_counter", metrics.Sum, []string{"reason"}) + fakeProblemGauge := metrics.NewFakeInt64Metric("problem_gauge", metrics.LastValue, []string{"type", "reason"}) + + pmm := ProblemMetricsManager{} + pmm.problemCounter = metrics.Int64MetricInterface(fakeProblemCounter) + pmm.problemGauge = metrics.Int64MetricInterface(fakeProblemGauge) + pmm.problemTypeToReason = make(map[string]string) + + return &pmm, fakeProblemCounter, fakeProblemGauge +} diff --git a/pkg/problemmetrics/problem_metrics_test.go b/pkg/problemmetrics/problem_metrics_test.go new file mode 100644 index 00000000..ff9ff8a8 --- /dev/null +++ b/pkg/problemmetrics/problem_metrics_test.go @@ -0,0 +1,277 @@ +/* +Copyright 2019 The Kubernetes Authors All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package problemmetrics + +import ( + "testing" + + "github.com/stretchr/testify/assert" + + "k8s.io/node-problem-detector/pkg/util/metrics" +) + +func TestNewProblem(t *testing.T) { + testCases := []struct { + name string + reasons []string + counts []int64 + expectedMetrics []metrics.Int64MetricRepresentation + }{ + { + name: "no problem at all", + reasons: []string{}, + counts: []int64{}, + expectedMetrics: []metrics.Int64MetricRepresentation{}, + }, + { + name: "one problem happened", + reasons: []string{"foo"}, + counts: []int64{1}, + expectedMetrics: []metrics.Int64MetricRepresentation{ + { + Name: "problem_counter", + Labels: map[string]string{"reason": "foo"}, + Value: 1, + }, + }, + }, + { + name: "one problem happened twice", + reasons: []string{"foo", "foo"}, + counts: []int64{1, 1}, + expectedMetrics: []metrics.Int64MetricRepresentation{ + { + Name: "problem_counter", + Labels: map[string]string{"reason": "foo"}, + Value: 2, + }, + }, + }, + { + name: "two problem happened various times", + reasons: []string{"foo", "bar", "foo"}, + counts: []int64{1, 1, 1}, + expectedMetrics: []metrics.Int64MetricRepresentation{ + { + Name: "problem_counter", + Labels: map[string]string{"reason": "foo"}, + Value: 2, + }, + { + Name: "problem_counter", + Labels: map[string]string{"reason": "bar"}, + Value: 1, + }, + }, + }, + { + name: "two problem initialized", + reasons: []string{"foo", "bar"}, + counts: []int64{0, 0}, + expectedMetrics: []metrics.Int64MetricRepresentation{ + { + Name: "problem_counter", + Labels: map[string]string{"reason": "foo"}, + Value: 0, + }, + { + Name: "problem_counter", + Labels: map[string]string{"reason": "bar"}, + Value: 0, + }, + }, + }, + { + name: "two problem first initialized, then happened various times", + reasons: []string{"foo", "bar", "foo", "bar", "foo"}, + counts: []int64{0, 0, 1, 1, 1}, + expectedMetrics: []metrics.Int64MetricRepresentation{ + { + Name: "problem_counter", + Labels: map[string]string{"reason": "foo"}, + Value: 2, + }, + { + Name: "problem_counter", + Labels: map[string]string{"reason": "bar"}, + Value: 1, + }, + }, + }, + } + for _, test := range testCases { + t.Run(test.name, func(t *testing.T) { + pmm, fakeProblemCounter, fakeProblemGauge := NewProblemMetricsManagerStub() + + for idx, reason := range test.reasons { + pmm.IncrementProblemCounter(reason, test.counts[idx]) + } + + gotMetrics := append(fakeProblemCounter.ListMetrics(), fakeProblemGauge.ListMetrics()...) + assert.ElementsMatch(t, test.expectedMetrics, gotMetrics, + "expected metrics: %+v, got: %+v", test.expectedMetrics, gotMetrics) + }) + } +} + +func TestSetProblemGauge(t *testing.T) { + type argumentType struct { + problemType string + reason string + value bool + } + + testCases := []struct { + name string + arguments []argumentType + expectedMetrics []metrics.Int64MetricRepresentation + }{ + { + name: "no permanent problem at all", + arguments: []argumentType{}, + expectedMetrics: []metrics.Int64MetricRepresentation{}, + }, + { + name: "one permanent problem was set once", + arguments: []argumentType{ + {"ProblemTypeA", "ReasonFoo", true}, + }, + expectedMetrics: []metrics.Int64MetricRepresentation{ + { + Name: "problem_gauge", + Labels: map[string]string{"type": "ProblemTypeA", "reason": "ReasonFoo"}, + Value: 1, + }, + }, + }, + { + name: "one permanent problem was set twice with same reason", + arguments: []argumentType{ + {"ProblemTypeA", "ReasonFoo", true}, + {"ProblemTypeA", "ReasonFoo", true}, + }, + expectedMetrics: []metrics.Int64MetricRepresentation{ + { + Name: "problem_gauge", + Labels: map[string]string{"type": "ProblemTypeA", "reason": "ReasonFoo"}, + Value: 1, + }, + }, + }, + { + name: "one permanent problem was set twice with different reasons", + arguments: []argumentType{ + {"ProblemTypeA", "ReasonFoo", true}, + {"ProblemTypeA", "ReasonBar", true}, + }, + expectedMetrics: []metrics.Int64MetricRepresentation{ + { + Name: "problem_gauge", + Labels: map[string]string{"type": "ProblemTypeA", "reason": "ReasonFoo"}, + Value: 0, + }, + { + Name: "problem_gauge", + Labels: map[string]string{"type": "ProblemTypeA", "reason": "ReasonBar"}, + Value: 1, + }, + }, + }, + { + name: "one permanent problem was set then cleared", + arguments: []argumentType{ + {"ProblemTypeA", "ReasonFoo", true}, + {"ProblemTypeA", "", false}, + }, + expectedMetrics: []metrics.Int64MetricRepresentation{ + { + Name: "problem_gauge", + Labels: map[string]string{"type": "ProblemTypeA", "reason": ""}, + Value: 0, + }, + { + Name: "problem_gauge", + Labels: map[string]string{"type": "ProblemTypeA", "reason": "ReasonFoo"}, + Value: 0, + }, + }, + }, + { + name: "one permanent problem was set, cleared, and set again", + arguments: []argumentType{ + {"ProblemTypeA", "ReasonFoo", true}, + {"ProblemTypeA", "", false}, + {"ProblemTypeA", "ReasonBar", true}, + }, + expectedMetrics: []metrics.Int64MetricRepresentation{ + { + Name: "problem_gauge", + Labels: map[string]string{"type": "ProblemTypeA", "reason": ""}, + Value: 0, + }, + { + Name: "problem_gauge", + Labels: map[string]string{"type": "ProblemTypeA", "reason": "ReasonFoo"}, + Value: 0, + }, + { + Name: "problem_gauge", + Labels: map[string]string{"type": "ProblemTypeA", "reason": "ReasonBar"}, + Value: 1, + }, + }, + }, + { + name: "two permanent problems were set and one of them got cleared", + arguments: []argumentType{ + {"ProblemTypeA", "ReasonFoo", true}, + {"ProblemTypeB", "ReasonBar", true}, + {"ProblemTypeA", "", false}, + }, + expectedMetrics: []metrics.Int64MetricRepresentation{ + { + Name: "problem_gauge", + Labels: map[string]string{"type": "ProblemTypeA", "reason": ""}, + Value: 0, + }, + { + Name: "problem_gauge", + Labels: map[string]string{"type": "ProblemTypeA", "reason": "ReasonFoo"}, + Value: 0, + }, + { + Name: "problem_gauge", + Labels: map[string]string{"type": "ProblemTypeB", "reason": "ReasonBar"}, + Value: 1, + }, + }, + }, + } + for _, test := range testCases { + t.Run(test.name, func(t *testing.T) { + pmm, fakeProblemCounter, fakeProblemGauge := NewProblemMetricsManagerStub() + + for _, argument := range test.arguments { + pmm.SetProblemGauge(argument.problemType, argument.reason, argument.value) + } + + gotMetrics := append(fakeProblemCounter.ListMetrics(), fakeProblemGauge.ListMetrics()...) + assert.ElementsMatch(t, test.expectedMetrics, gotMetrics, + "expected metrics: %+v, got: %+v", test.expectedMetrics, gotMetrics) + }) + } +} diff --git a/pkg/systemlogmonitor/README.md b/pkg/systemlogmonitor/README.md index 4de7a1d3..29487da9 100644 --- a/pkg/systemlogmonitor/README.md +++ b/pkg/systemlogmonitor/README.md @@ -86,3 +86,30 @@ field in the configuration file is the log path. You can always configure System log monitor uses [Log Watcher](./logwatchers/types/log_watcher.go) to support different log management tools. It is easy to implement a new log watcher. + +## Metrics Reporting + +By setting the boolean `metricsReporting` at top level, you can choose to enable or disable +metrics reporting of System Log Monitor. If you omit the field, it will be set to `true` by +default. + +Temporary problems will be reported as counter metrics, such as below example: + +``` +# HELP problem_counter Number of times a specific type of problem have occurred. +# TYPE problem_counter counter +problem_counter{reason="TaskHung"} 2 +``` + +Permanent problems will be reported as both gauge metrics and counter metrics, such as below +example: + +``` +# HELP problem_counter Number of times a specific type of problem have occurred. +# TYPE problem_counter counter +problem_counter{reason="DockerHung"} 1 +# HELP problem_gauge Whether a specific type of problem is affecting the node or not. +# TYPE problem_gauge gauge +problem_gauge{condition="KernelDeadlock",reason="DockerHung"} 1 +``` + diff --git a/pkg/systemlogmonitor/config.go b/pkg/systemlogmonitor/config.go index 6c73f824..2ac0013f 100644 --- a/pkg/systemlogmonitor/config.go +++ b/pkg/systemlogmonitor/config.go @@ -24,6 +24,12 @@ import ( "k8s.io/node-problem-detector/pkg/types" ) +var ( + defaultBufferSize = 10 + defaultLookback = "0" + defaultEnableMetricsReporting = true +) + // MonitorConfig is the configuration of log monitor. type MonitorConfig struct { // WatcherConfig is the configuration of log watcher. @@ -36,15 +42,20 @@ type MonitorConfig struct { DefaultConditions []types.Condition `json:"conditions"` // Rules are the rules log monitor will follow to parse the log file. Rules []systemlogtypes.Rule `json:"rules"` + // EnableMetricsReporting describes whether to report problems as metrics or not. + EnableMetricsReporting *bool `json:"metricsReporting,omitempty"` } // ApplyConfiguration applies default configurations. func (mc *MonitorConfig) ApplyDefaultConfiguration() { if mc.BufferSize == 0 { - mc.BufferSize = 10 + mc.BufferSize = defaultBufferSize + } + if mc.EnableMetricsReporting == nil { + mc.EnableMetricsReporting = &defaultEnableMetricsReporting } if mc.WatcherConfig.Lookback == "" { - mc.WatcherConfig.Lookback = "0" + mc.WatcherConfig.Lookback = defaultLookback } } diff --git a/pkg/systemlogmonitor/log_monitor.go b/pkg/systemlogmonitor/log_monitor.go index 15f051a7..b20af5d8 100644 --- a/pkg/systemlogmonitor/log_monitor.go +++ b/pkg/systemlogmonitor/log_monitor.go @@ -24,6 +24,7 @@ import ( "github.com/golang/glog" "k8s.io/node-problem-detector/pkg/problemdaemon" + "k8s.io/node-problem-detector/pkg/problemmetrics" "k8s.io/node-problem-detector/pkg/systemlogmonitor/logwatchers" watchertypes "k8s.io/node-problem-detector/pkg/systemlogmonitor/logwatchers/types" logtypes "k8s.io/node-problem-detector/pkg/systemlogmonitor/types" @@ -55,9 +56,8 @@ type logMonitor struct { // NewLogMonitorOrDie create a new LogMonitor, panic if error occurs. func NewLogMonitorOrDie(configPath string) types.Monitor { - l := &logMonitor{ - tomb: tomb.NewTomb(), - } + l := &logMonitor{tomb: tomb.NewTomb()} + f, err := ioutil.ReadFile(configPath) if err != nil { glog.Fatalf("Failed to read configuration file %q: %v", configPath, err) @@ -73,13 +73,36 @@ func NewLogMonitorOrDie(configPath string) types.Monitor { glog.Fatalf("Failed to validate matching rules %+v: %v", l.config.Rules, err) } glog.Infof("Finish parsing log monitor config file: %+v", l.config) + l.watcher = logwatchers.GetLogWatcherOrDie(l.config.WatcherConfig) l.buffer = NewLogBuffer(l.config.BufferSize) // A 1000 size channel should be big enough. l.output = make(chan *types.Status, 1000) + + if *l.config.EnableMetricsReporting { + initializeProblemMetricsOrDie(l.config.Rules) + } return l } +// initializeProblemMetricsOrDie creates problem metrics for all problems and set the value to 0, +// panic if error occurs. +func initializeProblemMetricsOrDie(rules []systemlogtypes.Rule) { + for _, rule := range rules { + if rule.Type == types.Perm { + err := problemmetrics.GlobalProblemMetricsManager.SetProblemGauge(rule.Condition, rule.Reason, false) + if err != nil { + glog.Fatalf("Failed to initialize problem gauge metrics for problem %q, reason %q: %v", + rule.Condition, rule.Reason, err) + } + } + err := problemmetrics.GlobalProblemMetricsManager.IncrementProblemCounter(rule.Reason, 0) + if err != nil { + glog.Fatalf("Failed to initialize problem counter metrics for %q: %v", rule.Reason, err) + } + } +} + func (l *logMonitor) Start() (<-chan *types.Status, error) { glog.Info("Start log monitor") var err error @@ -142,6 +165,12 @@ func (l *logMonitor) generateStatus(logs []*logtypes.Log, rule systemlogtypes.Ru Reason: rule.Reason, Message: message, }) + if *l.config.EnableMetricsReporting { + err := problemmetrics.GlobalProblemMetricsManager.IncrementProblemCounter(rule.Reason, 1) + if err != nil { + glog.Errorf("Failed to update problem counter metrics for %q: %v", rule.Reason, err) + } + } } else { // For permanent error changes the condition for i := range l.conditions { @@ -159,6 +188,18 @@ func (l *logMonitor) generateStatus(logs []*logtypes.Log, rule systemlogtypes.Ru rule.Reason, timestamp, )) + + if *l.config.EnableMetricsReporting { + err := problemmetrics.GlobalProblemMetricsManager.SetProblemGauge(rule.Condition, rule.Reason, true) + if err != nil { + glog.Errorf("Failed to update problem gauge metrics for problem %q, reason %q: %v", + rule.Condition, rule.Reason, err) + } + err = problemmetrics.GlobalProblemMetricsManager.IncrementProblemCounter(rule.Reason, 1) + if err != nil { + glog.Errorf("Failed to update problem counter metrics for %q: %v", rule.Reason, err) + } + } } condition.Status = types.True condition.Reason = rule.Reason @@ -166,6 +207,7 @@ func (l *logMonitor) generateStatus(logs []*logtypes.Log, rule systemlogtypes.Ru } } } + return &types.Status{ Source: l.config.Source, // TODO(random-liu): Aggregate events and conditions and then do periodically report. diff --git a/pkg/systemlogmonitor/log_monitor_test.go b/pkg/systemlogmonitor/log_monitor_test.go index 957fa0bc..e68aaa4d 100644 --- a/pkg/systemlogmonitor/log_monitor_test.go +++ b/pkg/systemlogmonitor/log_monitor_test.go @@ -24,9 +24,11 @@ import ( "github.com/stretchr/testify/assert" "k8s.io/node-problem-detector/pkg/problemdaemon" + "k8s.io/node-problem-detector/pkg/problemmetrics" logtypes "k8s.io/node-problem-detector/pkg/systemlogmonitor/types" "k8s.io/node-problem-detector/pkg/types" "k8s.io/node-problem-detector/pkg/util" + "k8s.io/node-problem-detector/pkg/util/metrics" ) const ( @@ -41,7 +43,7 @@ func TestRegistration(t *testing.T) { "System log monitor failed to register itself as a problem daemon.") } -func TestGenerateStatus(t *testing.T) { +func TestGenerateStatusForConditions(t *testing.T) { initConditions := []types.Condition{ { Type: testConditionA, @@ -141,9 +143,558 @@ func TestGenerateStatus(t *testing.T) { // during the test. conditions: append([]types.Condition{}, initConditions...), } + (&l.config).ApplyDefaultConfiguration() got := l.generateStatus(logs, test.rule) if !reflect.DeepEqual(&test.expected, got) { t.Errorf("case %d: expected status %+v, got %+v", c+1, test.expected, got) } } } + +func TestGenerateStatusForMetrics(t *testing.T) { + testCases := []struct { + name string + conditions []types.Condition + triggeredRules []logtypes.Rule + expectedMetrics []metrics.Int64MetricRepresentation + }{ + { + name: "one temporary problem that has not happened", + conditions: []types.Condition{}, + triggeredRules: []logtypes.Rule{}, + expectedMetrics: []metrics.Int64MetricRepresentation{}, + }, + { + name: "one temporary problem happened once", + conditions: []types.Condition{}, + triggeredRules: []logtypes.Rule{ + { + Type: types.Temp, + Reason: "problem reason foo", + }, + }, + expectedMetrics: []metrics.Int64MetricRepresentation{ + { + Name: "problem_counter", + Labels: map[string]string{"reason": "problem reason foo"}, + Value: 1, + }, + }, + }, + { + name: "one temporary problem happened twice", + conditions: []types.Condition{}, + triggeredRules: []logtypes.Rule{ + { + Type: types.Temp, + Reason: "problem reason foo", + }, + { + Type: types.Temp, + Reason: "problem reason foo", + }, + }, + expectedMetrics: []metrics.Int64MetricRepresentation{ + { + Name: "problem_counter", + Labels: map[string]string{"reason": "problem reason foo"}, + Value: 2, + }, + }, + }, + { + name: "two different temporary problems happened", + conditions: []types.Condition{}, + triggeredRules: []logtypes.Rule{ + { + Type: types.Temp, + Reason: "problem reason foo", + }, + { + Type: types.Temp, + Reason: "problem reason bar", + }, + }, + expectedMetrics: []metrics.Int64MetricRepresentation{ + { + Name: "problem_counter", + Labels: map[string]string{"reason": "problem reason foo"}, + Value: 1, + }, + { + Name: "problem_counter", + Labels: map[string]string{"reason": "problem reason bar"}, + Value: 1, + }, + }, + }, + { + name: "one permanent problem that is happening", + conditions: []types.Condition{ + { + Type: "ConditionA", + Status: types.False, + }, + }, + triggeredRules: []logtypes.Rule{ + { + Type: types.Perm, + Condition: "ConditionA", + Reason: "problem reason foo", + }, + }, + expectedMetrics: []metrics.Int64MetricRepresentation{ + { + Name: "problem_gauge", + Labels: map[string]string{"type": "ConditionA", "reason": "problem reason foo"}, + Value: 1, + }, + { + Name: "problem_counter", + Labels: map[string]string{"reason": "problem reason foo"}, + Value: 1, + }, + }, + }, + { + name: "one permanent problem observed twice with same reason", + conditions: []types.Condition{ + { + Type: "ConditionA", + Status: types.False, + }, + }, + triggeredRules: []logtypes.Rule{ + { + Type: types.Perm, + Condition: "ConditionA", + Reason: "problem reason foo", + }, + { + Type: types.Perm, + Condition: "ConditionA", + Reason: "problem reason foo", + }, + }, + expectedMetrics: []metrics.Int64MetricRepresentation{ + { + Name: "problem_gauge", + Labels: map[string]string{"type": "ConditionA", "reason": "problem reason foo"}, + Value: 1, + }, + { + Name: "problem_counter", + Labels: map[string]string{"reason": "problem reason foo"}, + Value: 1, + }, + }, + }, + { + name: "one permanent problem observed twice with different reasons", + conditions: []types.Condition{ + { + Type: "ConditionA", + Status: types.False, + }, + }, + triggeredRules: []logtypes.Rule{ + { + Type: types.Perm, + Condition: "ConditionA", + Reason: "problem reason foo", + }, + { + Type: types.Perm, + Condition: "ConditionA", + Reason: "problem reason bar", + }, + }, + expectedMetrics: []metrics.Int64MetricRepresentation{ + { + Name: "problem_gauge", + Labels: map[string]string{"type": "ConditionA", "reason": "problem reason foo"}, + Value: 0, + }, + { + Name: "problem_gauge", + Labels: map[string]string{"type": "ConditionA", "reason": "problem reason bar"}, + Value: 1, + }, + { + Name: "problem_counter", + Labels: map[string]string{"reason": "problem reason foo"}, + Value: 1, + }, + { + Name: "problem_counter", + Labels: map[string]string{"reason": "problem reason bar"}, + Value: 1, + }, + }, + }, + { + name: "two permanent problem observed once each", + conditions: []types.Condition{ + { + Type: "ConditionA", + Status: types.False, + }, + { + Type: "ConditionB", + Status: types.False, + }, + }, + triggeredRules: []logtypes.Rule{ + { + Type: types.Perm, + Condition: "ConditionA", + Reason: "problem reason foo", + }, + { + Type: types.Perm, + Condition: "ConditionB", + Reason: "problem reason bar", + }, + }, + expectedMetrics: []metrics.Int64MetricRepresentation{ + { + Name: "problem_gauge", + Labels: map[string]string{"type": "ConditionA", "reason": "problem reason foo"}, + Value: 1, + }, + { + Name: "problem_gauge", + Labels: map[string]string{"type": "ConditionB", "reason": "problem reason bar"}, + Value: 1, + }, + { + Name: "problem_counter", + Labels: map[string]string{"reason": "problem reason foo"}, + Value: 1, + }, + { + Name: "problem_counter", + Labels: map[string]string{"reason": "problem reason bar"}, + Value: 1, + }, + }, + }, + } + for _, test := range testCases { + t.Run(test.name, func(t *testing.T) { + l := &logMonitor{} + l.conditions = test.conditions + (&l.config).ApplyDefaultConfiguration() + + originalGlobalProblemMetricsManager := problemmetrics.GlobalProblemMetricsManager + defer func() { + problemmetrics.GlobalProblemMetricsManager = originalGlobalProblemMetricsManager + }() + + fakePMM, fakeProblemCounter, fakeProblemGauge := problemmetrics.NewProblemMetricsManagerStub() + problemmetrics.GlobalProblemMetricsManager = fakePMM + + for _, rule := range test.triggeredRules { + l.generateStatus([]*logtypes.Log{{}}, rule) + } + + gotMetrics := append(fakeProblemCounter.ListMetrics(), fakeProblemGauge.ListMetrics()...) + + assert.ElementsMatch(t, test.expectedMetrics, gotMetrics, + "expected metrics: %+v, got: %+v", test.expectedMetrics, gotMetrics) + }) + } +} + +func TestInitializeProblemMetricsOrDie(t *testing.T) { + testCases := []struct { + name string + rules []logtypes.Rule + expectedMetrics []metrics.Int64MetricRepresentation + }{ + { + name: "no problem type at all", + rules: []logtypes.Rule{}, + expectedMetrics: []metrics.Int64MetricRepresentation{}, + }, + { + name: "one type of temporary problem", + rules: []logtypes.Rule{ + { + Type: types.Temp, + Reason: "problem reason foo", + }, + }, + expectedMetrics: []metrics.Int64MetricRepresentation{ + { + Name: "problem_counter", + Labels: map[string]string{"reason": "problem reason foo"}, + Value: 0, + }, + }, + }, + { + name: "one type of permanent problem", + rules: []logtypes.Rule{ + { + Type: types.Perm, + Condition: "ConditionA", + Reason: "problem reason foo", + }, + }, + expectedMetrics: []metrics.Int64MetricRepresentation{ + { + Name: "problem_gauge", + Labels: map[string]string{"type": "ConditionA", "reason": "problem reason foo"}, + Value: 0, + }, + { + Name: "problem_counter", + Labels: map[string]string{"reason": "problem reason foo"}, + Value: 0, + }, + }, + }, + { + name: "duplicate temporary problem types", + rules: []logtypes.Rule{ + { + Type: types.Temp, + Reason: "problem reason foo", + }, + { + Type: types.Temp, + Reason: "problem reason foo", + }, + }, + expectedMetrics: []metrics.Int64MetricRepresentation{ + { + Name: "problem_counter", + Labels: map[string]string{"reason": "problem reason foo"}, + Value: 0, + }, + }, + }, + { + name: "multiple temporary problem types", + rules: []logtypes.Rule{ + { + Type: types.Temp, + Reason: "problem reason foo", + }, + { + Type: types.Temp, + Reason: "problem reason bar", + }, + }, + expectedMetrics: []metrics.Int64MetricRepresentation{ + { + Name: "problem_counter", + Labels: map[string]string{"reason": "problem reason foo"}, + Value: 0, + }, + { + Name: "problem_counter", + Labels: map[string]string{"reason": "problem reason bar"}, + Value: 0, + }, + }, + }, + { + name: "multiple permanent problem types with same condition", + rules: []logtypes.Rule{ + { + Type: types.Perm, + Condition: "ConditionA", + Reason: "problem reason foo", + }, + { + Type: types.Perm, + Condition: "ConditionA", + Reason: "problem reason bar", + }, + }, + expectedMetrics: []metrics.Int64MetricRepresentation{ + { + Name: "problem_gauge", + Labels: map[string]string{"type": "ConditionA", "reason": "problem reason foo"}, + Value: 0, + }, + { + Name: "problem_gauge", + Labels: map[string]string{"type": "ConditionA", "reason": "problem reason bar"}, + Value: 0, + }, + { + Name: "problem_counter", + Labels: map[string]string{"reason": "problem reason foo"}, + Value: 0, + }, + { + Name: "problem_counter", + Labels: map[string]string{"reason": "problem reason bar"}, + Value: 0, + }, + }, + }, + { + name: "multiple permanent problem types with different conditions", + rules: []logtypes.Rule{ + { + Type: types.Perm, + Condition: "ConditionA", + Reason: "problem reason foo", + }, + { + Type: types.Perm, + Condition: "ConditionB", + Reason: "problem reason bar", + }, + }, + expectedMetrics: []metrics.Int64MetricRepresentation{ + { + Name: "problem_gauge", + Labels: map[string]string{"type": "ConditionA", "reason": "problem reason foo"}, + Value: 0, + }, + { + Name: "problem_gauge", + Labels: map[string]string{"type": "ConditionB", "reason": "problem reason bar"}, + Value: 0, + }, + { + Name: "problem_counter", + Labels: map[string]string{"reason": "problem reason foo"}, + Value: 0, + }, + { + Name: "problem_counter", + Labels: map[string]string{"reason": "problem reason bar"}, + Value: 0, + }, + }, + }, + { + name: "duplicate permanent problem types", + rules: []logtypes.Rule{ + { + Type: types.Perm, + Condition: "ConditionA", + Reason: "problem reason foo", + }, + { + Type: types.Perm, + Condition: "ConditionA", + Reason: "problem reason foo", + }, + }, + expectedMetrics: []metrics.Int64MetricRepresentation{ + { + Name: "problem_gauge", + Labels: map[string]string{"type": "ConditionA", "reason": "problem reason foo"}, + Value: 0, + }, + { + Name: "problem_counter", + Labels: map[string]string{"reason": "problem reason foo"}, + Value: 0, + }, + }, + }, + { + name: "mixture of temporary and permanent problem types", + rules: []logtypes.Rule{ + { + Type: types.Temp, + Reason: "problem reason foo", + }, + { + Type: types.Perm, + Condition: "ConditionA", + Reason: "problem reason hello", + }, + { + Type: types.Perm, + Condition: "ConditionA", + Reason: "problem reason foo", + }, + { + Type: types.Perm, + Condition: "ConditionB", + Reason: "problem reason foo", + }, + { + Type: types.Perm, + Condition: "ConditionB", + Reason: "problem reason bar", + }, + { + Type: types.Temp, + Reason: "problem reason foo", + }, + { + Type: types.Temp, + Reason: "problem reason bar", + }, + }, + expectedMetrics: []metrics.Int64MetricRepresentation{ + { + Name: "problem_gauge", + Labels: map[string]string{"type": "ConditionA", "reason": "problem reason hello"}, + Value: 0, + }, + { + Name: "problem_gauge", + Labels: map[string]string{"type": "ConditionA", "reason": "problem reason foo"}, + Value: 0, + }, + { + Name: "problem_gauge", + Labels: map[string]string{"type": "ConditionB", "reason": "problem reason foo"}, + Value: 0, + }, + { + Name: "problem_gauge", + Labels: map[string]string{"type": "ConditionB", "reason": "problem reason bar"}, + Value: 0, + }, + { + Name: "problem_counter", + Labels: map[string]string{"reason": "problem reason hello"}, + Value: 0, + }, + { + Name: "problem_counter", + Labels: map[string]string{"reason": "problem reason foo"}, + Value: 0, + }, + { + Name: "problem_counter", + Labels: map[string]string{"reason": "problem reason bar"}, + Value: 0, + }, + }, + }, + } + for _, test := range testCases { + t.Run(test.name, func(t *testing.T) { + l := &logMonitor{} + (&l.config).ApplyDefaultConfiguration() + + originalGlobalProblemMetricsManager := problemmetrics.GlobalProblemMetricsManager + defer func() { + problemmetrics.GlobalProblemMetricsManager = originalGlobalProblemMetricsManager + }() + + fakePMM, fakeProblemCounter, fakeProblemGauge := problemmetrics.NewProblemMetricsManagerStub() + problemmetrics.GlobalProblemMetricsManager = fakePMM + + initializeProblemMetricsOrDie(test.rules) + + gotMetrics := append(fakeProblemCounter.ListMetrics(), fakeProblemGauge.ListMetrics()...) + + assert.ElementsMatch(t, test.expectedMetrics, gotMetrics, + "expected metrics: %+v, got: %+v", test.expectedMetrics, gotMetrics) + }) + } +} diff --git a/pkg/systemlogmonitor/types/types.go b/pkg/systemlogmonitor/types/types.go index c2494b86..b46facd4 100644 --- a/pkg/systemlogmonitor/types/types.go +++ b/pkg/systemlogmonitor/types/types.go @@ -17,8 +17,9 @@ limitations under the License. package types import ( - "k8s.io/node-problem-detector/pkg/types" "time" + + "k8s.io/node-problem-detector/pkg/types" ) // Log is the log item returned by translator. It's very easy to extend this diff --git a/pkg/systemstatsmonitor/disk_collector.go b/pkg/systemstatsmonitor/disk_collector.go index fbf9c844..b439e577 100644 --- a/pkg/systemstatsmonitor/disk_collector.go +++ b/pkg/systemstatsmonitor/disk_collector.go @@ -24,19 +24,15 @@ import ( "github.com/golang/glog" "github.com/shirou/gopsutil/disk" - "go.opencensus.io/stats" - "go.opencensus.io/stats/view" - "go.opencensus.io/tag" ssmtypes "k8s.io/node-problem-detector/pkg/systemstatsmonitor/types" "k8s.io/node-problem-detector/pkg/util/metrics" ) type diskCollector struct { - keyDevice tag.Key - mIOTime *stats.Int64Measure - mWeightedIO *stats.Int64Measure - mAvgQueueLen *stats.Float64Measure + mIOTime *metrics.Int64Metric + mWeightedIO *metrics.Int64Metric + mAvgQueueLen *metrics.Float64Metric config *ssmtypes.DiskStatsConfig @@ -48,36 +44,34 @@ func NewDiskCollectorOrDie(diskConfig *ssmtypes.DiskStatsConfig) *diskCollector dc := diskCollector{config: diskConfig} var err error - dc.keyDevice, err = tag.NewKey("device") + dc.mIOTime, err = metrics.NewInt64Metric( + diskConfig.MetricsConfigs["disk/io_time"].DisplayName, + "The IO time spent on the disk", + "second", + metrics.LastValue, + []string{"device"}) if err != nil { - glog.Fatalf("Failed to create device tag during initializing disk collector: %v", err) + glog.Fatalf("Error initializing metric for disk/io_time: %v", err) } - if diskConfig.MetricsConfigs["disk/io_time"].DisplayName != "" { - dc.mIOTime = metrics.NewInt64Metric( - diskConfig.MetricsConfigs["disk/io_time"].DisplayName, - "The IO time spent on the disk", - "second", - view.LastValue(), - []tag.Key{dc.keyDevice}) + dc.mWeightedIO, err = metrics.NewInt64Metric( + diskConfig.MetricsConfigs["disk/weighted_io"].DisplayName, + "The weighted IO on the disk", + "second", + metrics.LastValue, + []string{"device"}) + if err != nil { + glog.Fatalf("Error initializing metric for disk/weighted_io: %v", err) } - if diskConfig.MetricsConfigs["disk/weighted_io"].DisplayName != "" { - dc.mWeightedIO = metrics.NewInt64Metric( - diskConfig.MetricsConfigs["disk/weighted_io"].DisplayName, - "The weighted IO on the disk", - "second", - view.LastValue(), - []tag.Key{dc.keyDevice}) - } - - if diskConfig.MetricsConfigs["disk/avg_queue_len"].DisplayName != "" { - dc.mAvgQueueLen = metrics.NewFloat64Metric( - diskConfig.MetricsConfigs["disk/avg_queue_len"].DisplayName, - "The average queue length on the disk", - "second", - view.LastValue(), - []tag.Key{dc.keyDevice}) + dc.mAvgQueueLen, err = metrics.NewFloat64Metric( + diskConfig.MetricsConfigs["disk/avg_queue_len"].DisplayName, + "The average queue length on the disk", + "second", + metrics.LastValue, + []string{"device"}) + if err != nil { + glog.Fatalf("Error initializing metric for disk/avg_queue_len: %v", err) } dc.historyIOTime = make(map[string]uint64) @@ -119,20 +113,15 @@ func (dc *diskCollector) collect() { } // Attach label {"device": deviceName} to the metrics. - deviceCtx, err := tag.New(context.Background(), tag.Upsert(dc.keyDevice, deviceName)) - if err != nil { - glog.Errorf("Failed to create context with device tag: %v", err) - deviceCtx = context.Background() - } - + tags := map[string]string{"device": deviceName} if dc.mIOTime != nil { - stats.Record(deviceCtx, dc.mIOTime.M(int64(ioCountersStat.IoTime))) + dc.mIOTime.Record(tags, int64(ioCountersStat.IoTime)) } if dc.mWeightedIO != nil { - stats.Record(deviceCtx, dc.mWeightedIO.M(int64(ioCountersStat.WeightedIO))) + dc.mWeightedIO.Record(tags, int64(ioCountersStat.WeightedIO)) } if dc.mAvgQueueLen != nil { - stats.Record(deviceCtx, dc.mAvgQueueLen.M(avgQueueLen)) + dc.mAvgQueueLen.Record(tags, avgQueueLen) } } } diff --git a/pkg/systemstatsmonitor/host_collector.go b/pkg/systemstatsmonitor/host_collector.go index fdbcaa1e..c55a4541 100644 --- a/pkg/systemstatsmonitor/host_collector.go +++ b/pkg/systemstatsmonitor/host_collector.go @@ -17,13 +17,8 @@ limitations under the License. package systemstatsmonitor import ( - "context" - "github.com/golang/glog" "github.com/shirou/gopsutil/host" - "go.opencensus.io/stats" - "go.opencensus.io/stats/view" - "go.opencensus.io/tag" ssmtypes "k8s.io/node-problem-detector/pkg/systemstatsmonitor/types" "k8s.io/node-problem-detector/pkg/util" @@ -31,40 +26,35 @@ import ( ) type hostCollector struct { - tags []tag.Mutator - uptime *stats.Int64Measure + tags map[string]string + uptime *metrics.Int64Metric } func NewHostCollectorOrDie(hostConfig *ssmtypes.HostStatsConfig) *hostCollector { - hc := hostCollector{} + hc := hostCollector{map[string]string{}, nil} - keyKernelVersion, err := tag.NewKey("kernel_version") - if err != nil { - glog.Fatalf("Failed to create kernel_version tag during initializing host collector: %v", err) - } kernelVersion, err := host.KernelVersion() if err != nil { glog.Fatalf("Failed to retrieve kernel version: %v", err) } - hc.tags = append(hc.tags, tag.Upsert(keyKernelVersion, kernelVersion)) + hc.tags["kernel_version"] = kernelVersion - keyOSVersion, err := tag.NewKey("os_version") - if err != nil { - glog.Fatalf("Failed to create os_version tag during initializing host collector: %v", err) - } osVersion, err := util.GetOSVersion() if err != nil { glog.Fatalf("Failed to retrieve OS version: %v", err) } - hc.tags = append(hc.tags, tag.Upsert(keyOSVersion, osVersion)) + hc.tags["os_version"] = osVersion if hostConfig.MetricsConfigs["host/uptime"].DisplayName != "" { - hc.uptime = metrics.NewInt64Metric( + hc.uptime, err = metrics.NewInt64Metric( hostConfig.MetricsConfigs["host/uptime"].DisplayName, "The uptime of the operating system", "second", - view.LastValue(), - []tag.Key{keyKernelVersion, keyOSVersion}) + metrics.LastValue, + []string{"kernel_version", "os_version"}) + if err != nil { + glog.Fatalf("Error initializing metric for host/uptime: %v", err) + } } return &hc @@ -82,9 +72,6 @@ func (hc *hostCollector) collect() { } if hc.uptime != nil { - err := stats.RecordWithTags(context.Background(), hc.tags, hc.uptime.M(int64(uptime))) - if err != nil { - glog.Errorf("Failed to record current uptime (%d seconds) of the host: %v", uptime, err) - } + hc.uptime.Record(hc.tags, int64(uptime)) } } diff --git a/pkg/util/metrics/fakes.go b/pkg/util/metrics/fakes.go new file mode 100644 index 00000000..b9842030 --- /dev/null +++ b/pkg/util/metrics/fakes.go @@ -0,0 +1,108 @@ +/* +Copyright 2019 The Kubernetes Authors All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +package metrics + +import ( + "errors" + "fmt" + "reflect" +) + +// Int64MetricRepresentation represents a snapshot of an int64 metrics. +// This is used for inspecting fake metrics. +type Int64MetricRepresentation struct { + // Name is the metric name. + Name string + // Labels contains all metric labels in key-value pair format. + Labels map[string]string + // Value is the value of the metric. + Value int64 +} + +// Int64MetricInterface is used to create test double for Int64Metric. +type Int64MetricInterface interface { + // Record records a measurement for the metric, with provided tags as metric labels. + Record(tags map[string]string, measurement int64) error +} + +// FakeInt64Metric implements Int64MetricInterface. +// FakeInt64Metric can be used as a test double for Int64MetricInterface, allowing +// inspection of the metrics. +type FakeInt64Metric struct { + name string + aggregation Aggregation + allowedTags map[string]bool + metrics []Int64MetricRepresentation +} + +func NewFakeInt64Metric(name string, aggregation Aggregation, tagNames []string) *FakeInt64Metric { + if name == "" { + return nil + } + + allowedTags := make(map[string]bool) + for _, tagName := range tagNames { + allowedTags[tagName] = true + } + + fake := FakeInt64Metric{name, aggregation, allowedTags, []Int64MetricRepresentation{}} + return &fake +} + +func (fake *FakeInt64Metric) Record(tags map[string]string, measurement int64) error { + labels := make(map[string]string) + for tagName, tagValue := range tags { + if _, ok := fake.allowedTags[tagName]; !ok { + return fmt.Errorf("tag %q is not allowed", tagName) + } + labels[tagName] = tagValue + } + + metric := Int64MetricRepresentation{ + Name: fake.name, + Labels: labels, + } + + // If there is a metric with equavalent labels, reuse it. + metricIndex := -1 + for index, existingMetric := range fake.metrics { + if !reflect.DeepEqual(existingMetric.Labels, metric.Labels) { + continue + } + metricIndex = index + break + } + // If there is no metric with equalvalent labels, create a new one. + if metricIndex == -1 { + fake.metrics = append(fake.metrics, metric) + metricIndex = len(fake.metrics) - 1 + } + + switch fake.aggregation { + case LastValue: + fake.metrics[metricIndex].Value = measurement + case Sum: + fake.metrics[metricIndex].Value += measurement + default: + return errors.New("unsupported aggregation type") + } + return nil +} + +// ListMetrics returns a snapshot of the current metrics. +func (fake *FakeInt64Metric) ListMetrics() []Int64MetricRepresentation { + return fake.metrics +} diff --git a/pkg/util/metrics/fakes_test.go b/pkg/util/metrics/fakes_test.go new file mode 100644 index 00000000..01266af2 --- /dev/null +++ b/pkg/util/metrics/fakes_test.go @@ -0,0 +1,243 @@ +/* +Copyright 2019 The Kubernetes Authors All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +package metrics + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestFakeInt64Metric(t *testing.T) { + type recordType struct { + tags map[string]string + measurement int64 + } + testCases := []struct { + name string + metricName string + aggregation Aggregation + tagNames []string + records []recordType + expectedMetrics []Int64MetricRepresentation + }{ + { + name: "empty sum metric", + metricName: "foo", + aggregation: Sum, + tagNames: []string{}, + records: []recordType{}, + expectedMetrics: []Int64MetricRepresentation{}, + }, + { + name: "sum metric with no tag", + metricName: "foo", + aggregation: Sum, + tagNames: []string{}, + records: []recordType{ + { + tags: map[string]string{}, + measurement: 1, + }, + { + tags: map[string]string{}, + measurement: 2, + }, + }, + expectedMetrics: []Int64MetricRepresentation{ + { + Name: "foo", + Labels: map[string]string{}, + Value: 3, + }, + }, + }, + { + name: "sum metric with one tag", + metricName: "foo", + aggregation: Sum, + tagNames: []string{"A"}, + records: []recordType{ + { + tags: map[string]string{"A": "1"}, + measurement: 1, + }, + { + tags: map[string]string{"A": "1"}, + measurement: 2, + }, + }, + expectedMetrics: []Int64MetricRepresentation{ + { + Name: "foo", + Labels: map[string]string{"A": "1"}, + Value: 3, + }, + }, + }, + { + name: "sum metric with different tags", + metricName: "foo", + aggregation: Sum, + tagNames: []string{"A", "B"}, + records: []recordType{ + { + tags: map[string]string{"A": "1"}, + measurement: 1, + }, + { + tags: map[string]string{"B": "2"}, + measurement: 2, + }, + { + tags: map[string]string{}, + measurement: 4, + }, + { + tags: map[string]string{"B": "3"}, + measurement: 8, + }, + { + tags: map[string]string{"A": "1"}, + measurement: 16, + }, + }, + expectedMetrics: []Int64MetricRepresentation{ + { + Name: "foo", + Labels: map[string]string{}, + Value: 4, + }, + { + Name: "foo", + Labels: map[string]string{"A": "1"}, + Value: 17, + }, + { + Name: "foo", + Labels: map[string]string{"B": "2"}, + Value: 2, + }, + { + Name: "foo", + Labels: map[string]string{"B": "3"}, + Value: 8, + }, + }, + }, + { + name: "empty gauge metric", + metricName: "foo", + aggregation: LastValue, + tagNames: []string{}, + records: []recordType{}, + expectedMetrics: []Int64MetricRepresentation{}, + }, + { + name: "gauge metric with one measurement", + metricName: "foo", + aggregation: LastValue, + tagNames: []string{}, + records: []recordType{ + { + tags: map[string]string{}, + measurement: 2, + }, + }, + expectedMetrics: []Int64MetricRepresentation{ + { + Name: "foo", + Labels: map[string]string{}, + Value: 2, + }, + }, + }, + { + name: "gauge metric with multiple measurements under same tag", + metricName: "foo", + aggregation: LastValue, + tagNames: []string{"A"}, + records: []recordType{ + { + tags: map[string]string{"A": "1"}, + measurement: 2, + }, + { + tags: map[string]string{"A": "1"}, + measurement: 4, + }, + }, + expectedMetrics: []Int64MetricRepresentation{ + { + Name: "foo", + Labels: map[string]string{"A": "1"}, + Value: 4, + }, + }, + }, + { + name: "gauge metric with multiple measurements under different tags", + metricName: "foo", + aggregation: LastValue, + tagNames: []string{"A", "B"}, + records: []recordType{ + { + tags: map[string]string{"A": "1"}, + measurement: 2, + }, + { + tags: map[string]string{"B": "2"}, + measurement: 4, + }, + { + tags: map[string]string{"A": "1", "B": "2"}, + measurement: 8, + }, + }, + expectedMetrics: []Int64MetricRepresentation{ + { + Name: "foo", + Labels: map[string]string{"A": "1"}, + Value: 2, + }, + { + Name: "foo", + Labels: map[string]string{"B": "2"}, + Value: 4, + }, + { + Name: "foo", + Labels: map[string]string{"A": "1", "B": "2"}, + Value: 8, + }, + }, + }, + } + + for _, test := range testCases { + t.Run(test.name, func(t *testing.T) { + metric := NewFakeInt64Metric(test.metricName, test.aggregation, test.tagNames) + + for _, record := range test.records { + metric.Record(record.tags, record.measurement) + } + + gotMetrics := metric.ListMetrics() + assert.ElementsMatch(t, test.expectedMetrics, gotMetrics, + "expected metrics: %+v, got: %+v", test.expectedMetrics, gotMetrics) + }) + } +} diff --git a/pkg/util/metrics/helpers.go b/pkg/util/metrics/helpers.go index 7c3eeecb..14e6f5d6 100644 --- a/pkg/util/metrics/helpers.go +++ b/pkg/util/metrics/helpers.go @@ -16,41 +16,174 @@ limitations under the License. package metrics import ( + "context" + "fmt" + "sync" + "go.opencensus.io/stats" "go.opencensus.io/stats/view" "go.opencensus.io/tag" ) -// NewInt64Metric create a stats.Int64 metrics, returns nil when name is empty. -func NewInt64Metric(name string, description string, unit string, aggregation *view.Aggregation, tagKeys []tag.Key) *stats.Int64Measure { +var tagMap map[string]tag.Key +var tagMapMutex sync.RWMutex + +func init() { + tagMapMutex.Lock() + tagMap = make(map[string]tag.Key) + tagMapMutex.Unlock() +} + +// Int64Metric represents an int64 metric. +type Int64Metric struct { + name string + measure *stats.Int64Measure +} + +// Aggregation defines how measurements should be aggregated into data points. +type Aggregation string + +const ( + // LastValue means last measurement overwrites previous measurements (gauge metric). + LastValue Aggregation = "LastValue" + // Sum means last measurement will be added onto previous measurements (counter metric). + Sum Aggregation = "Sum" +) + +// NewInt64Metric create a Int64Metric metric, returns nil when name is empty. +func NewInt64Metric(name string, description string, unit string, aggregation Aggregation, tagNames []string) (*Int64Metric, error) { if name == "" { - return nil + return nil, nil } + + tagKeys, err := getTagKeysFromNames(tagNames) + if err != nil { + return nil, fmt.Errorf("failed to create metric %q because of tag creation failure: %v", name, err) + } + + var aggregationMethod *view.Aggregation + switch aggregation { + case LastValue: + aggregationMethod = view.LastValue() + case Sum: + aggregationMethod = view.Sum() + default: + return nil, fmt.Errorf("unknown aggregation option %q", aggregation) + } + measure := stats.Int64(name, description, unit) newView := &view.View{ Name: name, Measure: measure, Description: description, - Aggregation: aggregation, + Aggregation: aggregationMethod, TagKeys: tagKeys, } view.Register(newView) - return measure + + metric := Int64Metric{name, measure} + return &metric, nil } -// NewFloat64Metric create a stats.Float64 metrics, returns nil when name is empty. -func NewFloat64Metric(name string, description string, unit string, aggregation *view.Aggregation, tagKeys []tag.Key) *stats.Float64Measure { - if name == "" { - return nil +// Record records a measurement for the metric, with provided tags as metric labels. +func (metric *Int64Metric) Record(tags map[string]string, measurement int64) error { + var mutators []tag.Mutator + + tagMapMutex.RLock() + defer tagMapMutex.RUnlock() + + for tagName, tagValue := range tags { + tagKey, ok := tagMap[tagName] + if !ok { + return fmt.Errorf("referencing none existing tag %q in metric %q", tagName, metric.name) + } + mutators = append(mutators, tag.Upsert(tagKey, tagValue)) } + + return stats.RecordWithTags( + context.Background(), + mutators, + metric.measure.M(measurement)) +} + +// Float64Metric represents an float64 metric. +type Float64Metric struct { + name string + measure *stats.Float64Measure +} + +// NewFloat64Metric create a Float64Metric metrics, returns nil when name is empty. +func NewFloat64Metric(name string, description string, unit string, aggregation Aggregation, tagNames []string) (*Float64Metric, error) { + if name == "" { + return nil, nil + } + + tagKeys, err := getTagKeysFromNames(tagNames) + if err != nil { + return nil, fmt.Errorf("failed to create metric %q because of tag creation failure: %v", name, err) + } + + var aggregationMethod *view.Aggregation + switch aggregation { + case LastValue: + aggregationMethod = view.LastValue() + case Sum: + aggregationMethod = view.Sum() + default: + return nil, fmt.Errorf("unknown aggregation option %q", aggregation) + } + measure := stats.Float64(name, description, unit) newView := &view.View{ Name: name, Measure: measure, Description: description, - Aggregation: aggregation, + Aggregation: aggregationMethod, TagKeys: tagKeys, } view.Register(newView) - return measure + + metric := Float64Metric{name, measure} + return &metric, nil +} + +// Record records a measurement for the metric, with provided tags as metric labels. +func (metric *Float64Metric) Record(tags map[string]string, measurement float64) error { + var mutators []tag.Mutator + + tagMapMutex.RLock() + defer tagMapMutex.RUnlock() + + for tagName, tagValue := range tags { + tagKey, ok := tagMap[tagName] + if !ok { + return fmt.Errorf("referencing none existing tag %q in metric %q", tagName, metric.name) + } + mutators = append(mutators, tag.Upsert(tagKey, tagValue)) + } + + return stats.RecordWithTags( + context.Background(), + mutators, + metric.measure.M(measurement)) +} + +func getTagKeysFromNames(tagNames []string) ([]tag.Key, error) { + tagMapMutex.Lock() + defer tagMapMutex.Unlock() + + var tagKeys []tag.Key + var err error + for _, tagName := range tagNames { + tagKey, ok := tagMap[tagName] + if !ok { + tagKey, err = tag.NewKey(tagName) + if err != nil { + return []tag.Key{}, fmt.Errorf("failed to create tag %q: %v", tagName, err) + } + tagMap[tagName] = tagKey + } + tagKeys = append(tagKeys, tagKey) + } + return tagKeys, nil } diff --git a/test/build.sh b/test/build.sh index 8cab8509..c014fa11 100755 --- a/test/build.sh +++ b/test/build.sh @@ -106,8 +106,8 @@ function build-npd-custom-flags() { flags="--v=2" flags+=" --logtostderr" - flags+=" --system-log-monitors=${km_config},${dm_config},${sm_config}" - flags+=" --custom-plugin-monitors=${custom_km_config},${custom_dm_config},${custom_sm_config}" + flags+=" --config.system-log-monitor=${km_config},${dm_config},${sm_config}" + flags+=" --config.custom-plugin-monitor=${custom_km_config},${custom_dm_config},${custom_sm_config}" flags+=" --port=20256" export NODE_PROBLEM_DETECTOR_CUSTOM_FLAGS=${flags}