From 82c236879516c9b591484fa7c8933a6d0d5ad374 Mon Sep 17 00:00:00 2001
From: Xuewei Zhang <xueweiz@google.com>
Date: Fri, 16 Aug 2019 13:36:50 -0700
Subject: [PATCH] Metric format fixes on host/uptime and disk/*

1. host/uptime, disk/io_time and disk/weighted_io should be
counter/cumulative metrics. SO we have to use the Sum aggregation method
rather than LastValue aggregation method (which will declare the metric
as gauge metric).

2. Renamed label "device" for disk/* metrics to "device_name".
This is to clarify that it is device_name (sda1) rather than device_path
(/dev/sda1)
---
 pkg/systemstatsmonitor/disk_collector.go | 23 ++++++++++++++---------
 pkg/systemstatsmonitor/host_collector.go | 14 +++++++++-----
 2 files changed, 23 insertions(+), 14 deletions(-)

diff --git a/pkg/systemstatsmonitor/disk_collector.go b/pkg/systemstatsmonitor/disk_collector.go
index b439e577..b8328111 100644
--- a/pkg/systemstatsmonitor/disk_collector.go
+++ b/pkg/systemstatsmonitor/disk_collector.go
@@ -29,6 +29,8 @@ import (
 	"k8s.io/node-problem-detector/pkg/util/metrics"
 )
 
+const deviceNameLabel = "device_name"
+
 type diskCollector struct {
 	mIOTime      *metrics.Int64Metric
 	mWeightedIO  *metrics.Int64Metric
@@ -44,22 +46,25 @@ func NewDiskCollectorOrDie(diskConfig *ssmtypes.DiskStatsConfig) *diskCollector
 	dc := diskCollector{config: diskConfig}
 
 	var err error
+
+	// Use metrics.Sum aggregation method to ensure the metric is a counter/cumulative metric.
 	dc.mIOTime, err = metrics.NewInt64Metric(
 		diskConfig.MetricsConfigs["disk/io_time"].DisplayName,
 		"The IO time spent on the disk",
 		"second",
-		metrics.LastValue,
-		[]string{"device"})
+		metrics.Sum,
+		[]string{deviceNameLabel})
 	if err != nil {
 		glog.Fatalf("Error initializing metric for disk/io_time: %v", err)
 	}
 
+	// Use metrics.Sum aggregation method to ensure the metric is a counter/cumulative metric.
 	dc.mWeightedIO, err = metrics.NewInt64Metric(
 		diskConfig.MetricsConfigs["disk/weighted_io"].DisplayName,
 		"The weighted IO on the disk",
 		"second",
-		metrics.LastValue,
-		[]string{"device"})
+		metrics.Sum,
+		[]string{deviceNameLabel})
 	if err != nil {
 		glog.Fatalf("Error initializing metric for disk/weighted_io: %v", err)
 	}
@@ -69,7 +74,7 @@ func NewDiskCollectorOrDie(diskConfig *ssmtypes.DiskStatsConfig) *diskCollector
 		"The average queue length on the disk",
 		"second",
 		metrics.LastValue,
-		[]string{"device"})
+		[]string{deviceNameLabel})
 	if err != nil {
 		glog.Fatalf("Error initializing metric for disk/avg_queue_len: %v", err)
 	}
@@ -112,13 +117,13 @@ func (dc *diskCollector) collect() {
 			avgQueueLen = float64(ioCountersStat.WeightedIO-lastWeightedIO) / float64(ioCountersStat.IoTime-lastIOTime)
 		}
 
-		// Attach label {"device": deviceName} to the metrics.
-		tags := map[string]string{"device": deviceName}
+		// Attach label {"device_name": deviceName} to the metrics.
+		tags := map[string]string{deviceNameLabel: deviceName}
 		if dc.mIOTime != nil {
-			dc.mIOTime.Record(tags, int64(ioCountersStat.IoTime))
+			dc.mIOTime.Record(tags, int64(ioCountersStat.IoTime-lastIOTime))
 		}
 		if dc.mWeightedIO != nil {
-			dc.mWeightedIO.Record(tags, int64(ioCountersStat.WeightedIO))
+			dc.mWeightedIO.Record(tags, int64(ioCountersStat.WeightedIO-lastWeightedIO))
 		}
 		if dc.mAvgQueueLen != nil {
 			dc.mAvgQueueLen.Record(tags, avgQueueLen)
diff --git a/pkg/systemstatsmonitor/host_collector.go b/pkg/systemstatsmonitor/host_collector.go
index c55a4541..5cdafe04 100644
--- a/pkg/systemstatsmonitor/host_collector.go
+++ b/pkg/systemstatsmonitor/host_collector.go
@@ -26,12 +26,13 @@ import (
 )
 
 type hostCollector struct {
-	tags   map[string]string
-	uptime *metrics.Int64Metric
+	tags       map[string]string
+	uptime     *metrics.Int64Metric
+	lastUptime int64
 }
 
 func NewHostCollectorOrDie(hostConfig *ssmtypes.HostStatsConfig) *hostCollector {
-	hc := hostCollector{map[string]string{}, nil}
+	hc := hostCollector{map[string]string{}, nil, 0}
 
 	kernelVersion, err := host.KernelVersion()
 	if err != nil {
@@ -45,12 +46,13 @@ func NewHostCollectorOrDie(hostConfig *ssmtypes.HostStatsConfig) *hostCollector
 	}
 	hc.tags["os_version"] = osVersion
 
+	// Use metrics.Sum aggregation method to ensure the metric is a counter/cumulative metric.
 	if hostConfig.MetricsConfigs["host/uptime"].DisplayName != "" {
 		hc.uptime, err = metrics.NewInt64Metric(
 			hostConfig.MetricsConfigs["host/uptime"].DisplayName,
 			"The uptime of the operating system",
 			"second",
-			metrics.LastValue,
+			metrics.Sum,
 			[]string{"kernel_version", "os_version"})
 		if err != nil {
 			glog.Fatalf("Error initializing metric for host/uptime: %v", err)
@@ -70,8 +72,10 @@ func (hc *hostCollector) collect() {
 		glog.Errorf("Failed to retrieve uptime of the host: %v", err)
 		return
 	}
+	uptimeSeconds := int64(uptime)
 
 	if hc.uptime != nil {
-		hc.uptime.Record(hc.tags, int64(uptime))
+		hc.uptime.Record(hc.tags, uptimeSeconds-hc.lastUptime)
 	}
+	hc.lastUptime = uptimeSeconds
 }