Merge pull request #334 from xueweiz/cumulative

Metric format fixes on host/uptime and disk/*
2026-05-12 20:29:00 +00:00 · 2019-08-19 12:27:31 -07:00
parent 424b864291 82c2368795
commit 6aa308db81
2 changed files with 23 additions and 14 deletions
--- a/pkg/systemstatsmonitor/disk_collector.go
+++ b/pkg/systemstatsmonitor/disk_collector.go
@@ -29,6 +29,8 @@ import (
 	"k8s.io/node-problem-detector/pkg/util/metrics"
 )

+const deviceNameLabel = "device_name"
+
 type diskCollector struct {
 	mIOTime      *metrics.Int64Metric
 	mWeightedIO  *metrics.Int64Metric
@@ -44,22 +46,25 @@ func NewDiskCollectorOrDie(diskConfig *ssmtypes.DiskStatsConfig) *diskCollector
 	dc := diskCollector{config: diskConfig}

 	var err error
+
+	// Use the metrics.Sum aggregation so that disk/io_time is a counter/cumulative metric; collect() therefore records the change from the last IO time rather than the raw value.
 	dc.mIOTime, err = metrics.NewInt64Metric(
 		diskConfig.MetricsConfigs["disk/io_time"].DisplayName,
 		"The IO time spent on the disk",
 		"second",
-		metrics.LastValue,
-		[]string{"device"})
+		metrics.Sum,
+		[]string{deviceNameLabel})
 	if err != nil {
 		glog.Fatalf("Error initializing metric for disk/io_time: %v", err)
 	}

+	// Use the metrics.Sum aggregation so that disk/weighted_io is a counter/cumulative metric; collect() therefore records the change from the last weighted IO rather than the raw value.
 	dc.mWeightedIO, err = metrics.NewInt64Metric(
 		diskConfig.MetricsConfigs["disk/weighted_io"].DisplayName,
 		"The weighted IO on the disk",
 		"second",
-		metrics.LastValue,
-		[]string{"device"})
+		metrics.Sum,
+		[]string{deviceNameLabel})
 	if err != nil {
 		glog.Fatalf("Error initializing metric for disk/weighted_io: %v", err)
 	}
@@ -69,7 +74,7 @@ func NewDiskCollectorOrDie(diskConfig *ssmtypes.DiskStatsConfig) *diskCollector
 		"The average queue length on the disk",
 		"second",
 		metrics.LastValue,
-		[]string{"device"})
+		[]string{deviceNameLabel})
 	if err != nil {
 		glog.Fatalf("Error initializing metric for disk/avg_queue_len: %v", err)
 	}
@@ -112,13 +117,13 @@ func (dc *diskCollector) collect() {
 			avgQueueLen = float64(ioCountersStat.WeightedIO-lastWeightedIO) / float64(ioCountersStat.IoTime-lastIOTime)
 		}

-		// Attach label {"device": deviceName} to the metrics.
-		tags := map[string]string{"device": deviceName}
+		// Attach label {"device_name": deviceName} to the metrics.
+		tags := map[string]string{deviceNameLabel: deviceName}
 		if dc.mIOTime != nil {
-			dc.mIOTime.Record(tags, int64(ioCountersStat.IoTime))
+			dc.mIOTime.Record(tags, int64(ioCountersStat.IoTime-lastIOTime))
 		}
 		if dc.mWeightedIO != nil {
-			dc.mWeightedIO.Record(tags, int64(ioCountersStat.WeightedIO))
+			dc.mWeightedIO.Record(tags, int64(ioCountersStat.WeightedIO-lastWeightedIO))
 		}
 		if dc.mAvgQueueLen != nil {
 			dc.mAvgQueueLen.Record(tags, avgQueueLen)
--- a/pkg/systemstatsmonitor/host_collector.go
+++ b/pkg/systemstatsmonitor/host_collector.go
@@ -26,12 +26,13 @@ import (
 )

 type hostCollector struct {
-	tags   map[string]string
-	uptime *metrics.Int64Metric
+	tags       map[string]string
+	uptime     *metrics.Int64Metric
+	lastUptime int64
 }

 func NewHostCollectorOrDie(hostConfig *ssmtypes.HostStatsConfig) *hostCollector {
-	hc := hostCollector{map[string]string{}, nil}
+	hc := hostCollector{map[string]string{}, nil, 0}

 	kernelVersion, err := host.KernelVersion()
 	if err != nil {
@@ -45,12 +46,13 @@ func NewHostCollectorOrDie(hostConfig *ssmtypes.HostStatsConfig) *hostCollector
 	}
 	hc.tags["os_version"] = osVersion

+	// Create the host/uptime metric only when a display name is set for it; the metrics.Sum aggregation ensures it is a counter/cumulative metric.
 	if hostConfig.MetricsConfigs["host/uptime"].DisplayName != "" {
 		hc.uptime, err = metrics.NewInt64Metric(
 			hostConfig.MetricsConfigs["host/uptime"].DisplayName,
 			"The uptime of the operating system",
 			"second",
-			metrics.LastValue,
+			metrics.Sum,
 			[]string{"kernel_version", "os_version"})
 		if err != nil {
 			glog.Fatalf("Error initializing metric for host/uptime: %v", err)
@@ -70,8 +72,10 @@ func (hc *hostCollector) collect() {
 		glog.Errorf("Failed to retrieve uptime of the host: %v", err)
 		return
 	}
+	uptimeSeconds := int64(uptime)

 	if hc.uptime != nil {
-		hc.uptime.Record(hc.tags, int64(uptime))
+		hc.uptime.Record(hc.tags, uptimeSeconds-hc.lastUptime)
 	}
+	hc.lastUptime = uptimeSeconds
 }