Report metrics from system-log-monitor

2026-05-21 08:36:33 +00:00 · 2019-06-26 15:44:55 -07:00
parent 30babe906e
commit fbebcf311b
18 changed files with 1608 additions and 87 deletions
--- a/pkg/problemmetrics/problem_metrics.go
+++ b/pkg/problemmetrics/problem_metrics.go
@@ -0,0 +1,114 @@
+/*
+Copyright 2019 The Kubernetes Authors All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package problemmetrics
+
+import (
+	"errors"
+	"fmt"
+	"sync"
+
+	"github.com/golang/glog"
+
+	"k8s.io/node-problem-detector/pkg/util/metrics"
+)
+
+// GlobalProblemMetricsManager is the package-level singleton ProblemMetricsManager.
+// It should be used to manage all problem-converted metrics across all problem
+// daemons. It is assigned by the init function below.
+var GlobalProblemMetricsManager *ProblemMetricsManager
+
+func init() {
+	GlobalProblemMetricsManager = NewProblemMetricsManagerOrDie()
+}
+
+// ProblemMetricsManager manages problem-converted metrics: a problem counter and a problem gauge.
+// ProblemMetricsManager is thread-safe.
+type ProblemMetricsManager struct {
+	problemCounter           metrics.Int64MetricInterface // metric recorded by IncrementProblemCounter
+	problemGauge             metrics.Int64MetricInterface // metric recorded by SetProblemGauge
+	problemTypeToReason      map[string]string            // reason most recently passed to SetProblemGauge, keyed by problem type
+	problemTypeToReasonMutex sync.Mutex                   // held by SetProblemGauge while it uses problemTypeToReason
+}
+
+func NewProblemMetricsManagerOrDie() *ProblemMetricsManager {
+	pmm := ProblemMetricsManager{}
+
+	var err error
+	pmm.problemCounter, err = metrics.NewInt64Metric(
+		"problem_counter",
+		"Number of times a specific type of problem have occurred.",
+		"1",
+		metrics.Sum,
+		[]string{"reason"})
+	if err != nil {
+		glog.Fatalf("Failed to create problem_counter metric: %v", err)
+	}
+
+	pmm.problemGauge, err = metrics.NewInt64Metric(
+		"problem_gauge",
+		"Whether a specific type of problem is affecting the node or not.",
+		"1",
+		metrics.LastValue,
+		[]string{"type", "reason"})
+	if err != nil {
+		glog.Fatalf("Failed to create problem_gauge metric: %v", err)
+	}
+
+	pmm.problemTypeToReason = make(map[string]string)
+
+	return &pmm
+}
+
+// IncrementProblemCounter records count on the problem counter labeled with the given reason.
+// It returns an error if the counter metric is nil; otherwise it returns the result of Record.
+func (pmm *ProblemMetricsManager) IncrementProblemCounter(reason string, count int64) error {
+	if pmm.problemCounter == nil {
+		return errors.New("problem counter is being incremented before initialized")
+	}
+	return pmm.problemCounter.Record(map[string]string{"reason": reason}, count)
+}
+
+// SetProblemGauge records 1 (value true) or 0 (value false) on the gauge labeled with problemType
+// and reason, after first resetting the reason previously recorded for problemType to 0.
+func (pmm *ProblemMetricsManager) SetProblemGauge(problemType string, reason string, value bool) error {
+	if pmm.problemGauge == nil {
+		return errors.New("problem gauge is being set before initialized")
+	}
+
+	pmm.problemTypeToReasonMutex.Lock()
+	defer pmm.problemTypeToReasonMutex.Unlock()
+
+	// We clear the last reason, because the expected behavior is that at any point of time,
+	// for each type of permanent problem, there should be at most one reason set to 1.
+	// This behavior is consistent with the behavior of node condition in Kubernetes.
+	// However, problemGauges with different "type" and "reason" are considered as different
+	// metrics in Prometheus. So we need to clear the previous metrics explicitly.
+	if lastReason, ok := pmm.problemTypeToReason[problemType]; ok {
+		err := pmm.problemGauge.Record(map[string]string{"type": problemType, "reason": lastReason}, 0)
+		if err != nil {
+			return fmt.Errorf("failed to clear previous reason %q for type %q: %v", lastReason, problemType, err)
+		}
+	}
+
+	pmm.problemTypeToReason[problemType] = reason
+
+	var valueInt int64
+	if value {
+		valueInt = 1
+	}
+	return pmm.problemGauge.Record(map[string]string{"type": problemType, "reason": reason}, valueInt)
+}
--- a/pkg/problemmetrics/problem_metrics_stub.go
+++ b/pkg/problemmetrics/problem_metrics_stub.go
@@ -0,0 +1,35 @@
+/*
+Copyright 2019 The Kubernetes Authors All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package problemmetrics
+
+import (
+	"k8s.io/node-problem-detector/pkg/util/metrics"
+)
+
+// NewProblemMetricsManagerStub builds a ProblemMetricsManager whose counter and gauge are fake
+// metrics. It returns the manager, the fake counter, and the fake gauge, in that order.
+func NewProblemMetricsManagerStub() (*ProblemMetricsManager, *metrics.FakeInt64Metric, *metrics.FakeInt64Metric) {
+	counter := metrics.NewFakeInt64Metric("problem_counter", metrics.Sum, []string{"reason"})
+	gauge := metrics.NewFakeInt64Metric("problem_gauge", metrics.LastValue, []string{"type", "reason"})
+
+	stub := &ProblemMetricsManager{
+		problemCounter:      counter,
+		problemGauge:        gauge,
+		problemTypeToReason: make(map[string]string),
+	}
+	return stub, counter, gauge
+}
--- a/pkg/problemmetrics/problem_metrics_test.go
+++ b/pkg/problemmetrics/problem_metrics_test.go
@@ -0,0 +1,277 @@
+/*
+Copyright 2019 The Kubernetes Authors All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package problemmetrics
+
+import (
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+
+	"k8s.io/node-problem-detector/pkg/util/metrics"
+)
+
+// TestNewProblem checks the metrics listed after a sequence of IncrementProblemCounter calls.
+func TestNewProblem(t *testing.T) {
+	testCases := []struct {
+		name            string
+		reasons         []string
+		counts          []int64
+		expectedMetrics []metrics.Int64MetricRepresentation
+	}{
+		{
+			name:            "no problem at all",
+			reasons:         []string{},
+			counts:          []int64{},
+			expectedMetrics: []metrics.Int64MetricRepresentation{},
+		},
+		{
+			name:    "one problem happened",
+			reasons: []string{"foo"},
+			counts:  []int64{1},
+			expectedMetrics: []metrics.Int64MetricRepresentation{
+				{
+					Name:   "problem_counter",
+					Labels: map[string]string{"reason": "foo"},
+					Value:  1,
+				},
+			},
+		},
+		{
+			name:    "one problem happened twice",
+			reasons: []string{"foo", "foo"},
+			counts:  []int64{1, 1},
+			expectedMetrics: []metrics.Int64MetricRepresentation{
+				{
+					Name:   "problem_counter",
+					Labels: map[string]string{"reason": "foo"},
+					Value:  2,
+				},
+			},
+		},
+		{
+			name:    "two problem happened various times",
+			reasons: []string{"foo", "bar", "foo"},
+			counts:  []int64{1, 1, 1},
+			expectedMetrics: []metrics.Int64MetricRepresentation{
+				{
+					Name:   "problem_counter",
+					Labels: map[string]string{"reason": "foo"},
+					Value:  2,
+				},
+				{
+					Name:   "problem_counter",
+					Labels: map[string]string{"reason": "bar"},
+					Value:  1,
+				},
+			},
+		},
+		{
+			name:    "two problem initialized",
+			reasons: []string{"foo", "bar"},
+			counts:  []int64{0, 0},
+			expectedMetrics: []metrics.Int64MetricRepresentation{
+				{
+					Name:   "problem_counter",
+					Labels: map[string]string{"reason": "foo"},
+					Value:  0,
+				},
+				{
+					Name:   "problem_counter",
+					Labels: map[string]string{"reason": "bar"},
+					Value:  0,
+				},
+			},
+		},
+		{
+			name:    "two problem first initialized, then happened various times",
+			reasons: []string{"foo", "bar", "foo", "bar", "foo"},
+			counts:  []int64{0, 0, 1, 1, 1},
+			expectedMetrics: []metrics.Int64MetricRepresentation{
+				{
+					Name:   "problem_counter",
+					Labels: map[string]string{"reason": "foo"},
+					Value:  2,
+				},
+				{
+					Name:   "problem_counter",
+					Labels: map[string]string{"reason": "bar"},
+					Value:  1,
+				},
+			},
+		},
+	}
+	for _, test := range testCases {
+		t.Run(test.name, func(t *testing.T) {
+			pmm, fakeProblemCounter, fakeProblemGauge := NewProblemMetricsManagerStub()
+			for idx, reason := range test.reasons {
+				assert.NoError(t, pmm.IncrementProblemCounter(reason, test.counts[idx]))
+			}
+
+			gotMetrics := append(fakeProblemCounter.ListMetrics(), fakeProblemGauge.ListMetrics()...)
+			assert.ElementsMatch(t, test.expectedMetrics, gotMetrics,
+				"expected metrics: %+v, got: %+v", test.expectedMetrics, gotMetrics)
+		})
+	}
+}
+
+// TestSetProblemGauge checks the metrics listed after a sequence of SetProblemGauge calls.
+func TestSetProblemGauge(t *testing.T) {
+	type argumentType struct {
+		problemType string
+		reason      string
+		value       bool
+	}
+
+	testCases := []struct {
+		name            string
+		arguments       []argumentType
+		expectedMetrics []metrics.Int64MetricRepresentation
+	}{
+		{
+			name:            "no permanent problem at all",
+			arguments:       []argumentType{},
+			expectedMetrics: []metrics.Int64MetricRepresentation{},
+		},
+		{
+			name: "one permanent problem was set once",
+			arguments: []argumentType{
+				{"ProblemTypeA", "ReasonFoo", true},
+			},
+			expectedMetrics: []metrics.Int64MetricRepresentation{
+				{
+					Name:   "problem_gauge",
+					Labels: map[string]string{"type": "ProblemTypeA", "reason": "ReasonFoo"},
+					Value:  1,
+				},
+			},
+		},
+		{
+			name: "one permanent problem was set twice with same reason",
+			arguments: []argumentType{
+				{"ProblemTypeA", "ReasonFoo", true},
+				{"ProblemTypeA", "ReasonFoo", true},
+			},
+			expectedMetrics: []metrics.Int64MetricRepresentation{
+				{
+					Name:   "problem_gauge",
+					Labels: map[string]string{"type": "ProblemTypeA", "reason": "ReasonFoo"},
+					Value:  1,
+				},
+			},
+		},
+		{
+			name: "one permanent problem was set twice with different reasons",
+			arguments: []argumentType{
+				{"ProblemTypeA", "ReasonFoo", true},
+				{"ProblemTypeA", "ReasonBar", true},
+			},
+			expectedMetrics: []metrics.Int64MetricRepresentation{
+				{
+					Name:   "problem_gauge",
+					Labels: map[string]string{"type": "ProblemTypeA", "reason": "ReasonFoo"},
+					Value:  0,
+				},
+				{
+					Name:   "problem_gauge",
+					Labels: map[string]string{"type": "ProblemTypeA", "reason": "ReasonBar"},
+					Value:  1,
+				},
+			},
+		},
+		{
+			name: "one permanent problem was set then cleared",
+			arguments: []argumentType{
+				{"ProblemTypeA", "ReasonFoo", true},
+				{"ProblemTypeA", "", false},
+			},
+			expectedMetrics: []metrics.Int64MetricRepresentation{
+				{
+					Name:   "problem_gauge",
+					Labels: map[string]string{"type": "ProblemTypeA", "reason": ""},
+					Value:  0,
+				},
+				{
+					Name:   "problem_gauge",
+					Labels: map[string]string{"type": "ProblemTypeA", "reason": "ReasonFoo"},
+					Value:  0,
+				},
+			},
+		},
+		{
+			name: "one permanent problem was set, cleared, and set again",
+			arguments: []argumentType{
+				{"ProblemTypeA", "ReasonFoo", true},
+				{"ProblemTypeA", "", false},
+				{"ProblemTypeA", "ReasonBar", true},
+			},
+			expectedMetrics: []metrics.Int64MetricRepresentation{
+				{
+					Name:   "problem_gauge",
+					Labels: map[string]string{"type": "ProblemTypeA", "reason": ""},
+					Value:  0,
+				},
+				{
+					Name:   "problem_gauge",
+					Labels: map[string]string{"type": "ProblemTypeA", "reason": "ReasonFoo"},
+					Value:  0,
+				},
+				{
+					Name:   "problem_gauge",
+					Labels: map[string]string{"type": "ProblemTypeA", "reason": "ReasonBar"},
+					Value:  1,
+				},
+			},
+		},
+		{
+			name: "two permanent problems were set and one of them got cleared",
+			arguments: []argumentType{
+				{"ProblemTypeA", "ReasonFoo", true},
+				{"ProblemTypeB", "ReasonBar", true},
+				{"ProblemTypeA", "", false},
+			},
+			expectedMetrics: []metrics.Int64MetricRepresentation{
+				{
+					Name:   "problem_gauge",
+					Labels: map[string]string{"type": "ProblemTypeA", "reason": ""},
+					Value:  0,
+				},
+				{
+					Name:   "problem_gauge",
+					Labels: map[string]string{"type": "ProblemTypeA", "reason": "ReasonFoo"},
+					Value:  0,
+				},
+				{
+					Name:   "problem_gauge",
+					Labels: map[string]string{"type": "ProblemTypeB", "reason": "ReasonBar"},
+					Value:  1,
+				},
+			},
+		},
+	}
+	for _, test := range testCases {
+		t.Run(test.name, func(t *testing.T) {
+			pmm, fakeProblemCounter, fakeProblemGauge := NewProblemMetricsManagerStub()
+			for _, argument := range test.arguments {
+				assert.NoError(t, pmm.SetProblemGauge(argument.problemType, argument.reason, argument.value))
+			}
+
+			gotMetrics := append(fakeProblemCounter.ListMetrics(), fakeProblemGauge.ListMetrics()...)
+			assert.ElementsMatch(t, test.expectedMetrics, gotMetrics,
+				"expected metrics: %+v, got: %+v", test.expectedMetrics, gotMetrics)
+		})
+	}
+}