Report metrics from system-log-monitor

This commit is contained in:
Xuewei Zhang
2019-06-26 15:44:55 -07:00
parent 30babe906e
commit fbebcf311b
18 changed files with 1608 additions and 87 deletions

View File

@@ -0,0 +1,114 @@
/*
Copyright 2019 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package problemmetrics
import (
"errors"
"fmt"
"sync"
"github.com/golang/glog"
"k8s.io/node-problem-detector/pkg/util/metrics"
)
// GlobalProblemMetricsManager is a singleton of ProblemMetricsManager,
// which should be used to manage all problem-converted metrics across all
// problem daemons.
var GlobalProblemMetricsManager *ProblemMetricsManager
func init() {
GlobalProblemMetricsManager = NewProblemMetricsManagerOrDie()
}
// ProblemMetricsManager manages problem-converted metrics.
// ProblemMetricsManager is thread-safe.
// NOTE(review): IncrementProblemCounter takes no lock, so the thread-safety
// claim relies on the underlying metric's Record being safe for concurrent
// use — confirm against the metrics package.
type ProblemMetricsManager struct {
// problemCounter is the metric recorded by IncrementProblemCounter, labeled by "reason".
problemCounter metrics.Int64MetricInterface
// problemGauge is the metric recorded by SetProblemGauge, labeled by "type" and "reason".
problemGauge metrics.Int64MetricInterface
// problemTypeToReason maps each problem type to the reason most recently
// passed to SetProblemGauge, so that the previous gauge can be reset to 0.
problemTypeToReason map[string]string
// problemTypeToReasonMutex guards problemTypeToReason; SetProblemGauge holds
// it for its whole read-clear-write sequence.
problemTypeToReasonMutex sync.Mutex
}
// NewProblemMetricsManagerOrDie creates a ProblemMetricsManager with its
// problem_counter and problem_gauge metrics registered through the metrics
// package. If either metric cannot be created, it reports the failure with
// glog.Fatalf instead of returning an error.
func NewProblemMetricsManagerOrDie() *ProblemMetricsManager {
	// Counter: a running sum keyed only by the problem reason.
	counter, err := metrics.NewInt64Metric(
		"problem_counter",
		"Number of times a specific type of problem have occurred.",
		"1",
		metrics.Sum,
		[]string{"reason"})
	if err != nil {
		glog.Fatalf("Failed to create problem_counter metric: %v", err)
	}

	// Gauge: last recorded value keyed by problem type and reason.
	gauge, err := metrics.NewInt64Metric(
		"problem_gauge",
		"Whether a specific type of problem is affecting the node or not.",
		"1",
		metrics.LastValue,
		[]string{"type", "reason"})
	if err != nil {
		glog.Fatalf("Failed to create problem_gauge metric: %v", err)
	}

	return &ProblemMetricsManager{
		problemCounter:      counter,
		problemGauge:        gauge,
		problemTypeToReason: map[string]string{},
	}
}
// IncrementProblemCounter records count against the problem counter labeled
// with the given reason. It returns an error when the counter metric is nil,
// and otherwise returns whatever the metric's Record call returns.
func (pmm *ProblemMetricsManager) IncrementProblemCounter(reason string, count int64) error {
	if counter := pmm.problemCounter; counter != nil {
		labels := map[string]string{"reason": reason}
		return counter.Record(labels, count)
	}
	return errors.New("problem counter is being incremented before initialized.")
}
// SetProblemGauge sets the value of a problem gauge.
//
// The gauge labeled {type: problemType, reason: reason} is recorded as 1 when
// value is true and as 0 otherwise. If a reason was previously recorded for
// the same problemType, the gauge for that earlier reason is first recorded
// as 0.
//
// It returns an error if the gauge metric is nil, if clearing the previous
// reason fails (in which case the new reason is not recorded), or if
// recording the new value fails.
func (pmm *ProblemMetricsManager) SetProblemGauge(problemType string, reason string, value bool) error {
	if pmm.problemGauge == nil {
		return errors.New("problem gauge is being set before initialized.")
	}

	// The lock covers the whole read-clear-write sequence so that concurrent
	// callers cannot interleave updates for the same problem type.
	pmm.problemTypeToReasonMutex.Lock()
	defer pmm.problemTypeToReasonMutex.Unlock()

	// We clear the last reason because, at any point in time, each type of
	// permanent problem should have at most one reason set to 1. This matches
	// the behavior of node conditions in Kubernetes. However, problem gauges
	// with different "type" and "reason" labels are distinct metrics in
	// Prometheus, so the previous one has to be reset explicitly.
	if lastReason, ok := pmm.problemTypeToReason[problemType]; ok {
		err := pmm.problemGauge.Record(map[string]string{"type": problemType, "reason": lastReason}, 0)
		if err != nil {
			// Argument order must follow the format string: reason first, then type.
			return fmt.Errorf("failed to clear previous reason %q for type %q: %v",
				lastReason, problemType, err)
		}
	}

	pmm.problemTypeToReason[problemType] = reason

	var valueInt int64
	if value {
		valueInt = 1
	}
	return pmm.problemGauge.Record(map[string]string{"type": problemType, "reason": reason}, valueInt)
}

View File

@@ -0,0 +1,35 @@
/*
Copyright 2019 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package problemmetrics
import (
"k8s.io/node-problem-detector/pkg/util/metrics"
)
// NewProblemMetricsManagerStub builds a ProblemMetricsManager whose counter and
// gauge are backed by fake metrics instead of real ones.
// It returns, in order: the stubbed manager, the fake problem counter, and the
// fake problem gauge.
func NewProblemMetricsManagerStub() (*ProblemMetricsManager, *metrics.FakeInt64Metric, *metrics.FakeInt64Metric) {
	counterFake := metrics.NewFakeInt64Metric("problem_counter", metrics.Sum, []string{"reason"})
	gaugeFake := metrics.NewFakeInt64Metric("problem_gauge", metrics.LastValue, []string{"type", "reason"})

	stub := &ProblemMetricsManager{
		problemCounter:      counterFake,
		problemGauge:        gaugeFake,
		problemTypeToReason: map[string]string{},
	}
	return stub, counterFake, gaugeFake
}

View File

@@ -0,0 +1,277 @@
/*
Copyright 2019 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package problemmetrics
import (
"testing"
"github.com/stretchr/testify/assert"
"k8s.io/node-problem-detector/pkg/util/metrics"
)
// TestNewProblem checks that IncrementProblemCounter accumulates the recorded
// counts per reason in the problem_counter metric. Each case replays
// reasons[i] with counts[i] against a freshly stubbed manager and compares
// every metric listed by the fake counter and fake gauge with expectedMetrics.
func TestNewProblem(t *testing.T) {
	testCases := []struct {
		name            string
		reasons         []string
		counts          []int64
		expectedMetrics []metrics.Int64MetricRepresentation
	}{
		{
			name:            "no problem at all",
			reasons:         []string{},
			counts:          []int64{},
			expectedMetrics: []metrics.Int64MetricRepresentation{},
		},
		{
			name:    "one problem happened",
			reasons: []string{"foo"},
			counts:  []int64{1},
			expectedMetrics: []metrics.Int64MetricRepresentation{
				{
					Name:   "problem_counter",
					Labels: map[string]string{"reason": "foo"},
					Value:  1,
				},
			},
		},
		{
			name:    "one problem happened twice",
			reasons: []string{"foo", "foo"},
			counts:  []int64{1, 1},
			expectedMetrics: []metrics.Int64MetricRepresentation{
				{
					Name:   "problem_counter",
					Labels: map[string]string{"reason": "foo"},
					Value:  2,
				},
			},
		},
		{
			name:    "two problem happened various times",
			reasons: []string{"foo", "bar", "foo"},
			counts:  []int64{1, 1, 1},
			expectedMetrics: []metrics.Int64MetricRepresentation{
				{
					Name:   "problem_counter",
					Labels: map[string]string{"reason": "foo"},
					Value:  2,
				},
				{
					Name:   "problem_counter",
					Labels: map[string]string{"reason": "bar"},
					Value:  1,
				},
			},
		},
		{
			name:    "two problem initialized",
			reasons: []string{"foo", "bar"},
			counts:  []int64{0, 0},
			expectedMetrics: []metrics.Int64MetricRepresentation{
				{
					Name:   "problem_counter",
					Labels: map[string]string{"reason": "foo"},
					Value:  0,
				},
				{
					Name:   "problem_counter",
					Labels: map[string]string{"reason": "bar"},
					Value:  0,
				},
			},
		},
		{
			name:    "two problem first initialized, then happened various times",
			reasons: []string{"foo", "bar", "foo", "bar", "foo"},
			counts:  []int64{0, 0, 1, 1, 1},
			expectedMetrics: []metrics.Int64MetricRepresentation{
				{
					Name:   "problem_counter",
					Labels: map[string]string{"reason": "foo"},
					Value:  2,
				},
				{
					Name:   "problem_counter",
					Labels: map[string]string{"reason": "bar"},
					Value:  1,
				},
			},
		},
	}

	for _, test := range testCases {
		t.Run(test.name, func(t *testing.T) {
			pmm, fakeProblemCounter, fakeProblemGauge := NewProblemMetricsManagerStub()

			for idx, reason := range test.reasons {
				// A failed Record would otherwise surface only as a confusing
				// metrics mismatch below, so stop at the first error.
				if err := pmm.IncrementProblemCounter(reason, test.counts[idx]); err != nil {
					t.Fatalf("IncrementProblemCounter(%q, %d) returned unexpected error: %v",
						reason, test.counts[idx], err)
				}
			}

			gotMetrics := append(fakeProblemCounter.ListMetrics(), fakeProblemGauge.ListMetrics()...)

			assert.ElementsMatch(t, test.expectedMetrics, gotMetrics,
				"expected metrics: %+v, got: %+v", test.expectedMetrics, gotMetrics)
		})
	}
}
// TestSetProblemGauge checks that SetProblemGauge records the requested gauge
// value and resets the gauge of the previously recorded reason for the same
// problem type. Each case replays its arguments in order against a freshly
// stubbed manager and compares every metric listed by the fake counter and
// fake gauge with expectedMetrics.
func TestSetProblemGauge(t *testing.T) {
	type argumentType struct {
		problemType string
		reason      string
		value       bool
	}

	testCases := []struct {
		name            string
		arguments       []argumentType
		expectedMetrics []metrics.Int64MetricRepresentation
	}{
		{
			name:            "no permanent problem at all",
			arguments:       []argumentType{},
			expectedMetrics: []metrics.Int64MetricRepresentation{},
		},
		{
			name: "one permanent problem was set once",
			arguments: []argumentType{
				{"ProblemTypeA", "ReasonFoo", true},
			},
			expectedMetrics: []metrics.Int64MetricRepresentation{
				{
					Name:   "problem_gauge",
					Labels: map[string]string{"type": "ProblemTypeA", "reason": "ReasonFoo"},
					Value:  1,
				},
			},
		},
		{
			name: "one permanent problem was set twice with same reason",
			arguments: []argumentType{
				{"ProblemTypeA", "ReasonFoo", true},
				{"ProblemTypeA", "ReasonFoo", true},
			},
			expectedMetrics: []metrics.Int64MetricRepresentation{
				{
					Name:   "problem_gauge",
					Labels: map[string]string{"type": "ProblemTypeA", "reason": "ReasonFoo"},
					Value:  1,
				},
			},
		},
		{
			name: "one permanent problem was set twice with different reasons",
			arguments: []argumentType{
				{"ProblemTypeA", "ReasonFoo", true},
				{"ProblemTypeA", "ReasonBar", true},
			},
			expectedMetrics: []metrics.Int64MetricRepresentation{
				{
					Name:   "problem_gauge",
					Labels: map[string]string{"type": "ProblemTypeA", "reason": "ReasonFoo"},
					Value:  0,
				},
				{
					Name:   "problem_gauge",
					Labels: map[string]string{"type": "ProblemTypeA", "reason": "ReasonBar"},
					Value:  1,
				},
			},
		},
		{
			name: "one permanent problem was set then cleared",
			arguments: []argumentType{
				{"ProblemTypeA", "ReasonFoo", true},
				{"ProblemTypeA", "", false},
			},
			expectedMetrics: []metrics.Int64MetricRepresentation{
				{
					Name:   "problem_gauge",
					Labels: map[string]string{"type": "ProblemTypeA", "reason": ""},
					Value:  0,
				},
				{
					Name:   "problem_gauge",
					Labels: map[string]string{"type": "ProblemTypeA", "reason": "ReasonFoo"},
					Value:  0,
				},
			},
		},
		{
			name: "one permanent problem was set, cleared, and set again",
			arguments: []argumentType{
				{"ProblemTypeA", "ReasonFoo", true},
				{"ProblemTypeA", "", false},
				{"ProblemTypeA", "ReasonBar", true},
			},
			expectedMetrics: []metrics.Int64MetricRepresentation{
				{
					Name:   "problem_gauge",
					Labels: map[string]string{"type": "ProblemTypeA", "reason": ""},
					Value:  0,
				},
				{
					Name:   "problem_gauge",
					Labels: map[string]string{"type": "ProblemTypeA", "reason": "ReasonFoo"},
					Value:  0,
				},
				{
					Name:   "problem_gauge",
					Labels: map[string]string{"type": "ProblemTypeA", "reason": "ReasonBar"},
					Value:  1,
				},
			},
		},
		{
			name: "two permanent problems were set and one of them got cleared",
			arguments: []argumentType{
				{"ProblemTypeA", "ReasonFoo", true},
				{"ProblemTypeB", "ReasonBar", true},
				{"ProblemTypeA", "", false},
			},
			expectedMetrics: []metrics.Int64MetricRepresentation{
				{
					Name:   "problem_gauge",
					Labels: map[string]string{"type": "ProblemTypeA", "reason": ""},
					Value:  0,
				},
				{
					Name:   "problem_gauge",
					Labels: map[string]string{"type": "ProblemTypeA", "reason": "ReasonFoo"},
					Value:  0,
				},
				{
					Name:   "problem_gauge",
					Labels: map[string]string{"type": "ProblemTypeB", "reason": "ReasonBar"},
					Value:  1,
				},
			},
		},
	}

	for _, test := range testCases {
		t.Run(test.name, func(t *testing.T) {
			pmm, fakeProblemCounter, fakeProblemGauge := NewProblemMetricsManagerStub()

			for _, argument := range test.arguments {
				// A failed Record would otherwise surface only as a confusing
				// metrics mismatch below, so stop at the first error.
				if err := pmm.SetProblemGauge(argument.problemType, argument.reason, argument.value); err != nil {
					t.Fatalf("SetProblemGauge(%q, %q, %t) returned unexpected error: %v",
						argument.problemType, argument.reason, argument.value, err)
				}
			}

			gotMetrics := append(fakeProblemCounter.ListMetrics(), fakeProblemGauge.ListMetrics()...)

			assert.ElementsMatch(t, test.expectedMetrics, gotMetrics,
				"expected metrics: %+v, got: %+v", test.expectedMetrics, gotMetrics)
		})
	}
}