Merge pull request #334 from xueweiz/cumulative

Metric format fixes on host/uptime and disk/*
This commit is contained in:
Kubernetes Prow Robot
2019-08-19 12:27:31 -07:00
committed by GitHub
2 changed files with 23 additions and 14 deletions

View File

@@ -29,6 +29,8 @@ import (
"k8s.io/node-problem-detector/pkg/util/metrics"
)
const deviceNameLabel = "device_name"
type diskCollector struct {
mIOTime *metrics.Int64Metric
mWeightedIO *metrics.Int64Metric
@@ -44,22 +46,25 @@ func NewDiskCollectorOrDie(diskConfig *ssmtypes.DiskStatsConfig) *diskCollector
dc := diskCollector{config: diskConfig}
var err error
// Use metrics.Sum aggregation method to ensure the metric is a counter/cumulative metric.
dc.mIOTime, err = metrics.NewInt64Metric(
diskConfig.MetricsConfigs["disk/io_time"].DisplayName,
"The IO time spent on the disk",
"second",
metrics.LastValue,
[]string{"device"})
metrics.Sum,
[]string{deviceNameLabel})
if err != nil {
glog.Fatalf("Error initializing metric for disk/io_time: %v", err)
}
// Use metrics.Sum aggregation method to ensure the metric is a counter/cumulative metric.
dc.mWeightedIO, err = metrics.NewInt64Metric(
diskConfig.MetricsConfigs["disk/weighted_io"].DisplayName,
"The weighted IO on the disk",
"second",
metrics.LastValue,
[]string{"device"})
metrics.Sum,
[]string{deviceNameLabel})
if err != nil {
glog.Fatalf("Error initializing metric for disk/weighted_io: %v", err)
}
@@ -69,7 +74,7 @@ func NewDiskCollectorOrDie(diskConfig *ssmtypes.DiskStatsConfig) *diskCollector
"The average queue length on the disk",
"second",
metrics.LastValue,
[]string{"device"})
[]string{deviceNameLabel})
if err != nil {
glog.Fatalf("Error initializing metric for disk/avg_queue_len: %v", err)
}
@@ -112,13 +117,13 @@ func (dc *diskCollector) collect() {
avgQueueLen = float64(ioCountersStat.WeightedIO-lastWeightedIO) / float64(ioCountersStat.IoTime-lastIOTime)
}
// Attach label {"device": deviceName} to the metrics.
tags := map[string]string{"device": deviceName}
// Attach label {"device_name": deviceName} to the metrics.
tags := map[string]string{deviceNameLabel: deviceName}
if dc.mIOTime != nil {
dc.mIOTime.Record(tags, int64(ioCountersStat.IoTime))
dc.mIOTime.Record(tags, int64(ioCountersStat.IoTime-lastIOTime))
}
if dc.mWeightedIO != nil {
dc.mWeightedIO.Record(tags, int64(ioCountersStat.WeightedIO))
dc.mWeightedIO.Record(tags, int64(ioCountersStat.WeightedIO-lastWeightedIO))
}
if dc.mAvgQueueLen != nil {
dc.mAvgQueueLen.Record(tags, avgQueueLen)

View File

@@ -26,12 +26,13 @@ import (
)
type hostCollector struct {
tags map[string]string
uptime *metrics.Int64Metric
tags map[string]string
uptime *metrics.Int64Metric
lastUptime int64
}
func NewHostCollectorOrDie(hostConfig *ssmtypes.HostStatsConfig) *hostCollector {
hc := hostCollector{map[string]string{}, nil}
hc := hostCollector{map[string]string{}, nil, 0}
kernelVersion, err := host.KernelVersion()
if err != nil {
@@ -45,12 +46,13 @@ func NewHostCollectorOrDie(hostConfig *ssmtypes.HostStatsConfig) *hostCollector
}
hc.tags["os_version"] = osVersion
// Use metrics.Sum aggregation method to ensure the metric is a counter/cumulative metric.
if hostConfig.MetricsConfigs["host/uptime"].DisplayName != "" {
hc.uptime, err = metrics.NewInt64Metric(
hostConfig.MetricsConfigs["host/uptime"].DisplayName,
"The uptime of the operating system",
"second",
metrics.LastValue,
metrics.Sum,
[]string{"kernel_version", "os_version"})
if err != nil {
glog.Fatalf("Error initializing metric for host/uptime: %v", err)
@@ -70,8 +72,10 @@ func (hc *hostCollector) collect() {
glog.Errorf("Failed to retrieve uptime of the host: %v", err)
return
}
uptimeSeconds := int64(uptime)
if hc.uptime != nil {
hc.uptime.Record(hc.tags, int64(uptime))
hc.uptime.Record(hc.tags, uptimeSeconds-hc.lastUptime)
}
hc.lastUptime = uptimeSeconds
}