From 82c236879516c9b591484fa7c8933a6d0d5ad374 Mon Sep 17 00:00:00 2001 From: Xuewei Zhang Date: Fri, 16 Aug 2019 13:36:50 -0700 Subject: [PATCH] Metric format fixes on host/uptime and disk/* 1. host/uptime, disk/io_time and disk/weighted_io should be counter/cumulative metrics. SO we have to use the Sum aggregation method rather than LastValue aggregation method (which will declare the metric as gauge metric). 2. Renamed label "device" for disk/* metrics to "device_name". This is to clarify that it is device_name (sda1) rather than device_path (/dev/sda1) --- pkg/systemstatsmonitor/disk_collector.go | 23 ++++++++++++++--------- pkg/systemstatsmonitor/host_collector.go | 14 +++++++++----- 2 files changed, 23 insertions(+), 14 deletions(-) diff --git a/pkg/systemstatsmonitor/disk_collector.go b/pkg/systemstatsmonitor/disk_collector.go index b439e577..b8328111 100644 --- a/pkg/systemstatsmonitor/disk_collector.go +++ b/pkg/systemstatsmonitor/disk_collector.go @@ -29,6 +29,8 @@ import ( "k8s.io/node-problem-detector/pkg/util/metrics" ) +const deviceNameLabel = "device_name" + type diskCollector struct { mIOTime *metrics.Int64Metric mWeightedIO *metrics.Int64Metric @@ -44,22 +46,25 @@ func NewDiskCollectorOrDie(diskConfig *ssmtypes.DiskStatsConfig) *diskCollector dc := diskCollector{config: diskConfig} var err error + + // Use metrics.Sum aggregation method to ensure the metric is a counter/cumulative metric. dc.mIOTime, err = metrics.NewInt64Metric( diskConfig.MetricsConfigs["disk/io_time"].DisplayName, "The IO time spent on the disk", "second", - metrics.LastValue, - []string{"device"}) + metrics.Sum, + []string{deviceNameLabel}) if err != nil { glog.Fatalf("Error initializing metric for disk/io_time: %v", err) } + // Use metrics.Sum aggregation method to ensure the metric is a counter/cumulative metric. dc.mWeightedIO, err = metrics.NewInt64Metric( diskConfig.MetricsConfigs["disk/weighted_io"].DisplayName, "The weighted IO on the disk", "second", - metrics.LastValue, - []string{"device"}) + metrics.Sum, + []string{deviceNameLabel}) if err != nil { glog.Fatalf("Error initializing metric for disk/weighted_io: %v", err) } @@ -69,7 +74,7 @@ func NewDiskCollectorOrDie(diskConfig *ssmtypes.DiskStatsConfig) *diskCollector "The average queue length on the disk", "second", metrics.LastValue, - []string{"device"}) + []string{deviceNameLabel}) if err != nil { glog.Fatalf("Error initializing metric for disk/avg_queue_len: %v", err) } @@ -112,13 +117,13 @@ func (dc *diskCollector) collect() { avgQueueLen = float64(ioCountersStat.WeightedIO-lastWeightedIO) / float64(ioCountersStat.IoTime-lastIOTime) } - // Attach label {"device": deviceName} to the metrics. - tags := map[string]string{"device": deviceName} + // Attach label {"device_name": deviceName} to the metrics. + tags := map[string]string{deviceNameLabel: deviceName} if dc.mIOTime != nil { - dc.mIOTime.Record(tags, int64(ioCountersStat.IoTime)) + dc.mIOTime.Record(tags, int64(ioCountersStat.IoTime-lastIOTime)) } if dc.mWeightedIO != nil { - dc.mWeightedIO.Record(tags, int64(ioCountersStat.WeightedIO)) + dc.mWeightedIO.Record(tags, int64(ioCountersStat.WeightedIO-lastWeightedIO)) } if dc.mAvgQueueLen != nil { dc.mAvgQueueLen.Record(tags, avgQueueLen) diff --git a/pkg/systemstatsmonitor/host_collector.go b/pkg/systemstatsmonitor/host_collector.go index c55a4541..5cdafe04 100644 --- a/pkg/systemstatsmonitor/host_collector.go +++ b/pkg/systemstatsmonitor/host_collector.go @@ -26,12 +26,13 @@ import ( ) type hostCollector struct { - tags map[string]string - uptime *metrics.Int64Metric + tags map[string]string + uptime *metrics.Int64Metric + lastUptime int64 } func NewHostCollectorOrDie(hostConfig *ssmtypes.HostStatsConfig) *hostCollector { - hc := hostCollector{map[string]string{}, nil} + hc := hostCollector{map[string]string{}, nil, 0} kernelVersion, err := host.KernelVersion() if err != nil { @@ -45,12 +46,13 @@ func NewHostCollectorOrDie(hostConfig *ssmtypes.HostStatsConfig) *hostCollector } hc.tags["os_version"] = osVersion + // Use metrics.Sum aggregation method to ensure the metric is a counter/cumulative metric. if hostConfig.MetricsConfigs["host/uptime"].DisplayName != "" { hc.uptime, err = metrics.NewInt64Metric( hostConfig.MetricsConfigs["host/uptime"].DisplayName, "The uptime of the operating system", "second", - metrics.LastValue, + metrics.Sum, []string{"kernel_version", "os_version"}) if err != nil { glog.Fatalf("Error initializing metric for host/uptime: %v", err) @@ -70,8 +72,10 @@ func (hc *hostCollector) collect() { glog.Errorf("Failed to retrieve uptime of the host: %v", err) return } + uptimeSeconds := int64(uptime) if hc.uptime != nil { - hc.uptime.Record(hc.tags, int64(uptime)) + hc.uptime.Record(hc.tags, uptimeSeconds-hc.lastUptime) } + hc.lastUptime = uptimeSeconds }