From 8648fe265a0c8d883bb19c34972d0fb2a0453547 Mon Sep 17 00:00:00 2001 From: Karan Goel Date: Thu, 21 Jan 2021 16:28:08 -0800 Subject: [PATCH] add metric for per-cpu, per-stage timing --- config/system-stats-monitor.json | 197 +++++++++--------- .../stackdriver/stackdriver_exporter.go | 1 + pkg/systemstatsmonitor/README.md | 1 + pkg/systemstatsmonitor/cpu_collector.go | 57 +++-- pkg/systemstatsmonitor/labels.go | 6 + pkg/util/metrics/metric.go | 1 + 6 files changed, 151 insertions(+), 112 deletions(-) diff --git a/config/system-stats-monitor.json b/config/system-stats-monitor.json index 162fd0a8..d3aa107d 100644 --- a/config/system-stats-monitor.json +++ b/config/system-stats-monitor.json @@ -1,99 +1,102 @@ { - "cpu": { - "metricsConfigs": { - "cpu/runnable_task_count": { - "displayName": "cpu/runnable_task_count" - }, - "cpu/usage_time": { - "displayName": "cpu/usage_time" - }, - "cpu/load_1m": { - "displayName": "cpu/load_1m" - }, - "cpu/load_5m": { - "displayName": "cpu/load_5m" - }, - "cpu/load_15m": { - "displayName": "cpu/load_15m" - }, - "system/processes_total": { - "displayName": "system/processes_total" - }, - "system/procs_running": { - "displayName": "system/procs_running" - }, - "system/procs_blocked": { - "displayName": "system/procs_blocked" - }, - "system/interrupts_total": { - "displayName": "system/interrupts_total" - } - } - }, - "disk": { - "metricsConfigs": { - "disk/io_time": { - "displayName": "disk/io_time" - }, - "disk/weighted_io": { - "displayName": "disk/weighted_io" - }, - "disk/avg_queue_len": { - "displayName": "disk/avg_queue_len" - }, - "disk/operation_count": { - "displayName": "disk/operation_count" - }, - "disk/merged_operation_count": { - "displayName": "disk/merged_operation_count" - }, - "disk/operation_bytes_count": { - "displayName": "disk/operation_bytes_count" - }, - "disk/operation_time": { - "displayName": "disk/operation_time" - }, - "disk/bytes_used": { - "displayName": "disk/bytes_used" - } - }, - "includeRootBlk": 
true, - "includeAllAttachedBlk": true, - "lsblkTimeout": "5s" - }, - "host": { - "metricsConfigs": { - "host/uptime": { - "displayName": "host/uptime" - } - } - }, - "memory": { - "metricsConfigs": { - "memory/bytes_used": { - "displayName": "memory/bytes_used" - }, - "memory/anonymous_used": { - "displayName": "memory/anonymous_used" - }, - "memory/page_cache_used": { - "displayName": "memory/page_cache_used" - }, - "memory/unevictable_used": { - "displayName": "memory/unevictable_used" - }, - "memory/dirty_used": { - "displayName": "memory/dirty_used" - } - } - }, - "osFeature": { - "metricsConfigs": { - "system/os_feature": { - "displayName": "system/os_feature" - } - }, - "KnownModulesConfigPath": "config/guestosconfig/known-modules.json" - }, - "invokeInterval": "60s" + "cpu": { + "metricsConfigs": { + "cpu/load_15m": { + "displayName": "cpu/load_15m" + }, + "cpu/load_1m": { + "displayName": "cpu/load_1m" + }, + "cpu/load_5m": { + "displayName": "cpu/load_5m" + }, + "cpu/runnable_task_count": { + "displayName": "cpu/runnable_task_count" + }, + "cpu/usage_time": { + "displayName": "cpu/usage_time" + }, + "system/cpu_stat": { + "displayName": "system/cpu_stat" + }, + "system/interrupts_total": { + "displayName": "system/interrupts_total" + }, + "system/processes_total": { + "displayName": "system/processes_total" + }, + "system/procs_blocked": { + "displayName": "system/procs_blocked" + }, + "system/procs_running": { + "displayName": "system/procs_running" + } + } + }, + "disk": { + "includeAllAttachedBlk": true, + "includeRootBlk": true, + "lsblkTimeout": "5s", + "metricsConfigs": { + "disk/avg_queue_len": { + "displayName": "disk/avg_queue_len" + }, + "disk/bytes_used": { + "displayName": "disk/bytes_used" + }, + "disk/io_time": { + "displayName": "disk/io_time" + }, + "disk/merged_operation_count": { + "displayName": "disk/merged_operation_count" + }, + "disk/operation_bytes_count": { + "displayName": "disk/operation_bytes_count" + }, + 
"disk/operation_count": { + "displayName": "disk/operation_count" + }, + "disk/operation_time": { + "displayName": "disk/operation_time" + }, + "disk/weighted_io": { + "displayName": "disk/weighted_io" + } + } + }, + "host": { + "metricsConfigs": { + "host/uptime": { + "displayName": "host/uptime" + } + } + }, + "invokeInterval": "60s", + "memory": { + "metricsConfigs": { + "memory/anonymous_used": { + "displayName": "memory/anonymous_used" + }, + "memory/bytes_used": { + "displayName": "memory/bytes_used" + }, + "memory/dirty_used": { + "displayName": "memory/dirty_used" + }, + "memory/page_cache_used": { + "displayName": "memory/page_cache_used" + }, + "memory/unevictable_used": { + "displayName": "memory/unevictable_used" + } + } + }, + "osFeature": { + "KnownModulesConfigPath": "config/guestosconfig/known-modules.json", + "metricsConfigs": { + "system/os_feature": { + "displayName": "system/os_feature" + } + } + } } diff --git a/pkg/exporters/stackdriver/stackdriver_exporter.go b/pkg/exporters/stackdriver/stackdriver_exporter.go index 50a52289..f14eb0b9 100644 --- a/pkg/exporters/stackdriver/stackdriver_exporter.go +++ b/pkg/exporters/stackdriver/stackdriver_exporter.go @@ -73,6 +73,7 @@ var NPDMetricToSDMetric = map[metrics.MetricID]string{ metrics.SystemProcsRunning: "kubernetes.io/internal/node/guest/system/procs_running", metrics.SystemProcsBlocked: "kubernetes.io/internal/node/guest/system/procs_blocked", metrics.SystemInterruptsTotal: "kubernetes.io/internal/node/guest/system/interrupts_total", + metrics.SystemCPUStat: "kubernetes.io/internal/node/guest/system/cpu_stat", metrics.NetDevRxBytes: "kubernetes.io/internal/node/guest/net/rx_bytes", metrics.NetDevRxPackets: "kubernetes.io/internal/node/guest/net/rx_packets", metrics.NetDevRxErrors: "kubernetes.io/internal/node/guest/net/rx_errors", diff --git a/pkg/systemstatsmonitor/README.md b/pkg/systemstatsmonitor/README.md index 2153009e..f098b7ae 100644 --- a/pkg/systemstatsmonitor/README.md +++ 
b/pkg/systemstatsmonitor/README.md @@ -32,6 +32,7 @@ Below metrics are collected from `cpu` component: * `system/procs_running`: Number of processes currently running. * `system/procs_blocked`: Number of processes currently blocked. * `system/interrupts_total`: Total number of interrupts serviced (cumulative). +* `system/cpu_stat`: Cumulative time each CPU spent in various stages. Collected from `/proc/stat`. Has a label for `cpu` and `stage`. [/proc doc]: http://man7.org/linux/man-pages/man5/proc.5.html diff --git a/pkg/systemstatsmonitor/cpu_collector.go b/pkg/systemstatsmonitor/cpu_collector.go index 54d9ac03..bdc08355 100644 --- a/pkg/systemstatsmonitor/cpu_collector.go +++ b/pkg/systemstatsmonitor/cpu_collector.go @@ -17,6 +17,8 @@ limitations under the License. package systemstatsmonitor import ( + "fmt" + "github.com/golang/glog" "github.com/prometheus/procfs" "github.com/shirou/gopsutil/cpu" @@ -50,6 +52,7 @@ type cpuCollector struct { mSystemProcsRunning *metrics.Int64Metric mSystemProcsBlocked *metrics.Int64Metric mSystemInterruptsTotal *metrics.Int64Metric + mSystemCPUStat *metrics.Float64Metric // per-cpu time from /proc/stat config *ssmtypes.CPUStatsConfig @@ -63,13 +66,13 @@ func NewCPUCollectorOrDie(cpuConfig *ssmtypes.CPUStatsConfig) *cpuCollector { if err != nil { glog.Fatalf("Failed to retrieve kernel version: %v", err) } - cc.tags["kernel_version"] = kernelVersion + cc.tags[kernelVersionLabel] = kernelVersion osVersion, err := util.GetOSVersion() if err != nil { glog.Fatalf("Failed to retrieve OS version: %v", err) } - cc.tags["os_version"] = osVersion + cc.tags[osVersionLabel] = osVersion cc.mRunnableTaskCount, err = metrics.NewFloat64Metric( metrics.CPURunnableTaskCountID, @@ -170,6 +173,17 @@ func NewCPUCollectorOrDie(cpuConfig *ssmtypes.CPUStatsConfig) *cpuCollector { glog.Fatalf("Error initializing metric for %q: %v", metrics.SystemInterruptsTotal, err) } + cc.mSystemCPUStat, err = metrics.NewFloat64Metric( + metrics.SystemCPUStat, 
cpuConfig.MetricsConfigs[string(metrics.SystemCPUStat)].DisplayName, + "Cumulative time each cpu spent in various stages.", + "ns", + metrics.Sum, + []string{osVersionLabel, kernelVersionLabel, cpuLabel, stageLabel}) + if err != nil { + glog.Fatalf("Error initializing metric for %q: %v", metrics.SystemCPUStat, err) + } + cc.lastUsageTime = make(map[string]float64) return &cc @@ -238,19 +252,6 @@ func (cc *cpuCollector) recordUsage() { } func (cc *cpuCollector) recordSystemStats() { - if cc.mSystemProcessesTotal == nil { - return - } - if cc.mSystemProcsRunning == nil { - return - } - if cc.mSystemProcsBlocked == nil { - return - } - if cc.mSystemInterruptsTotal == nil { - return - } - fs, err := procfs.NewFS("/proc") stats, err := fs.Stat() if err != nil { @@ -262,6 +263,32 @@ func (cc *cpuCollector) recordSystemStats() { cc.mSystemProcsRunning.Record(cc.tags, int64(stats.ProcessesRunning)) cc.mSystemProcsBlocked.Record(cc.tags, int64(stats.ProcessesBlocked)) cc.mSystemInterruptsTotal.Record(cc.tags, int64(stats.IRQTotal)) + + for i, c := range stats.CPU { + tags := cc.tags + tags[cpuLabel] = fmt.Sprintf("cpu%d", i) + + tags[stageLabel] = "user" + cc.mSystemCPUStat.Record(tags, c.User) + tags[stageLabel] = "nice" + cc.mSystemCPUStat.Record(tags, c.Nice) + tags[stageLabel] = "system" + cc.mSystemCPUStat.Record(tags, c.System) + tags[stageLabel] = "idle" + cc.mSystemCPUStat.Record(tags, c.Idle) + tags[stageLabel] = "iowait" + cc.mSystemCPUStat.Record(tags, c.Iowait) + tags[stageLabel] = "iRQ" + cc.mSystemCPUStat.Record(tags, c.IRQ) + tags[stageLabel] = "softIRQ" + cc.mSystemCPUStat.Record(tags, c.SoftIRQ) + tags[stageLabel] = "steal" + cc.mSystemCPUStat.Record(tags, c.Steal) + tags[stageLabel] = "guest" + cc.mSystemCPUStat.Record(tags, c.Guest) + tags[stageLabel] = "guestNice" + cc.mSystemCPUStat.Record(tags, c.GuestNice) + } } func (cc *cpuCollector) collect() { diff --git a/pkg/systemstatsmonitor/labels.go b/pkg/systemstatsmonitor/labels.go index bf82a90f..a6fbd21c 
100644 --- a/pkg/systemstatsmonitor/labels.go +++ b/pkg/systemstatsmonitor/labels.go @@ -45,3 +45,9 @@ const kernelVersionLabel = "kernel_version" // interfaceNameLabel labels the network interface name const interfaceNameLabel = "interface_name" + +// cpuLabel labels the CPU (eg "cpu0") +const cpuLabel = "cpu" + +// stageLabel labels the stage according to the kernel where CPU time was spent +const stageLabel = "stage" diff --git a/pkg/util/metrics/metric.go b/pkg/util/metrics/metric.go index 3aa6d2aa..7b4ac57e 100644 --- a/pkg/util/metrics/metric.go +++ b/pkg/util/metrics/metric.go @@ -46,6 +46,7 @@ const ( SystemProcsRunning MetricID = "system/procs_running" SystemProcsBlocked MetricID = "system/procs_blocked" SystemInterruptsTotal MetricID = "system/interrupts_total" + SystemCPUStat MetricID = "system/cpu_stat" NetDevRxBytes MetricID = "net/rx_bytes" NetDevRxPackets MetricID = "net/rx_packets" NetDevRxErrors MetricID = "net/rx_errors"