diff --git a/config/system-stats-monitor.json b/config/system-stats-monitor.json index 8d15877d..158692f5 100644 --- a/config/system-stats-monitor.json +++ b/config/system-stats-monitor.json @@ -6,6 +6,15 @@ }, "cpu/usage_time": { "displayName": "cpu/usage_time" + }, + "cpu/load_1m": { + "displayName": "cpu/load_1m" + }, + "cpu/load_5m": { + "displayName": "cpu/load_5m" + }, + "cpu/load_15m": { + "displayName": "cpu/load_15m" } } }, diff --git a/pkg/exporters/stackdriver/stackdriver_exporter.go b/pkg/exporters/stackdriver/stackdriver_exporter.go index 68993863..a13b5792 100644 --- a/pkg/exporters/stackdriver/stackdriver_exporter.go +++ b/pkg/exporters/stackdriver/stackdriver_exporter.go @@ -49,6 +49,9 @@ const exporterName = "stackdriver" var NPDMetricToSDMetric = map[metrics.MetricID]string{ metrics.CPURunnableTaskCountID: "compute.googleapis.com/guest/cpu/runnable_task_count", metrics.CPUUsageTimeID: "compute.googleapis.com/guest/cpu/usage_time", + metrics.CPULoad1m: "compute.googleapis.com/guest/cpu/load_1m", + metrics.CPULoad5m: "compute.googleapis.com/guest/cpu/load_5m", + metrics.CPULoad15m: "compute.googleapis.com/guest/cpu/load_15m", metrics.DiskAvgQueueLenID: "compute.googleapis.com/guest/disk/queue_length", metrics.DiskBytesUsedID: "compute.googleapis.com/guest/disk/bytes_used", metrics.DiskIOTimeID: "compute.googleapis.com/guest/disk/io_time", diff --git a/pkg/systemstatsmonitor/README.md b/pkg/systemstatsmonitor/README.md index 30d94acb..f0aee8dc 100644 --- a/pkg/systemstatsmonitor/README.md +++ b/pkg/systemstatsmonitor/README.md @@ -25,6 +25,9 @@ Below metrics are collected from `cpu` component: * `cpu_runnable_task_count`: The average number of runnable tasks in the run-queue during the last minute. Collected from [`/proc/loadavg`][/proc doc]. * `cpu_usage_time`: CPU usage, in seconds. The [CPU state][/proc doc] for the corresponding usage is reported under the `state` metric label (e.g. `user`, `nice`, `system`...). +* `cpu_load_1m`: CPU load average over the last 1 minute. Collected from [`/proc/loadavg`][/proc doc]. +* `cpu_load_5m`: CPU load average over the last 5 minutes. Collected from [`/proc/loadavg`][/proc doc]. +* `cpu_load_15m`: CPU load average over the last 15 minutes. Collected from [`/proc/loadavg`][/proc doc]. [/proc doc]: http://man7.org/linux/man-pages/man5/proc.5.html diff --git a/pkg/systemstatsmonitor/cpu_collector.go b/pkg/systemstatsmonitor/cpu_collector.go index ba940940..bd784934 100644 --- a/pkg/systemstatsmonitor/cpu_collector.go +++ b/pkg/systemstatsmonitor/cpu_collector.go @@ -38,6 +38,9 @@ const clockTick float64 = 100.0 type cpuCollector struct { mRunnableTaskCount *metrics.Float64Metric mUsageTime *metrics.Float64Metric + mCpuLoad1m *metrics.Float64Metric + mCpuLoad5m *metrics.Float64Metric + mCpuLoad15m *metrics.Float64Metric config *ssmtypes.CPUStatsConfig @@ -71,6 +74,39 @@ func NewCPUCollectorOrDie(cpuConfig *ssmtypes.CPUStatsConfig) *cpuCollector { glog.Fatalf("Error initializing metric for %q: %v", metrics.CPUUsageTimeID, err) } + cc.mCpuLoad1m, err = metrics.NewFloat64Metric( + metrics.CPULoad1m, + cpuConfig.MetricsConfigs[string(metrics.CPULoad1m)].DisplayName, + "CPU average load (1m)", + "1", + metrics.LastValue, + []string{}) + if err != nil { + glog.Fatalf("Error initializing metric for %q: %v", metrics.CPULoad1m, err) + } + + cc.mCpuLoad5m, err = metrics.NewFloat64Metric( + metrics.CPULoad5m, + cpuConfig.MetricsConfigs[string(metrics.CPULoad5m)].DisplayName, + "CPU average load (5m)", + "1", + metrics.LastValue, + []string{}) + if err != nil { + glog.Fatalf("Error initializing metric for %q: %v", metrics.CPULoad5m, err) + } + + cc.mCpuLoad15m, err = metrics.NewFloat64Metric( + metrics.CPULoad15m, + cpuConfig.MetricsConfigs[string(metrics.CPULoad15m)].DisplayName, + "CPU average load (15m)", + "1", + metrics.LastValue, + []string{}) + if err != nil { + glog.Fatalf("Error initializing metric for %q: %v", metrics.CPULoad15m, err) + } + cc.lastUsageTime = make(map[string]float64) return &cc @@ -88,6 +124,10 @@ func (cc *cpuCollector) recordLoad() { } cc.mRunnableTaskCount.Record(map[string]string{}, loadAvg.Load1) + + cc.mCpuLoad1m.Record(map[string]string{}, loadAvg.Load1) + cc.mCpuLoad5m.Record(map[string]string{}, loadAvg.Load5) + cc.mCpuLoad15m.Record(map[string]string{}, loadAvg.Load15) } func (cc *cpuCollector) recordUsage() { diff --git a/pkg/util/metrics/metric.go b/pkg/util/metrics/metric.go index 4727ad76..55984b47 100644 --- a/pkg/util/metrics/metric.go +++ b/pkg/util/metrics/metric.go @@ -22,6 +22,9 @@ import ( const ( CPURunnableTaskCountID MetricID = "cpu/runnable_task_count" CPUUsageTimeID MetricID = "cpu/usage_time" + CPULoad1m MetricID = "cpu/load_1m" + CPULoad5m MetricID = "cpu/load_5m" + CPULoad15m MetricID = "cpu/load_15m" ProblemCounterID MetricID = "problem_counter" ProblemGaugeID MetricID = "problem_gauge" DiskIOTimeID MetricID = "disk/io_time" diff --git a/test/e2e/metriconly/metrics_test.go b/test/e2e/metriconly/metrics_test.go index 5195396a..aa382901 100644 --- a/test/e2e/metriconly/metrics_test.go +++ b/test/e2e/metriconly/metrics_test.go @@ -77,6 +77,9 @@ var _ = ginkgo.Describe("NPD should export Prometheus metrics.", func() { assertMetricExist(gotMetrics, "cpu_runnable_task_count", map[string]string{}, true) assertMetricExist(gotMetrics, "cpu_usage_time", map[string]string{}, false) + assertMetricExist(gotMetrics, "cpu_load_1m", map[string]string{}, false) + assertMetricExist(gotMetrics, "cpu_load_5m", map[string]string{}, false) + assertMetricExist(gotMetrics, "cpu_load_15m", map[string]string{}, false) assertMetricExist(gotMetrics, "disk_operation_count", map[string]string{}, false) assertMetricExist(gotMetrics, "disk_merged_operation_count", map[string]string{}, false) assertMetricExist(gotMetrics, "disk_operation_bytes_count", map[string]string{}, false)