mirror of
https://github.com/kubernetes/node-problem-detector.git
synced 2026-03-27 22:07:57 +00:00
Merge pull request #484 from karan/trial-metric
Collect CPU load averages in a separate metric
This commit is contained in:
@@ -6,6 +6,15 @@
|
||||
},
|
||||
"cpu/usage_time": {
|
||||
"displayName": "cpu/usage_time"
|
||||
},
|
||||
"cpu/load_1m": {
|
||||
"displayName": "cpu/load_1m"
|
||||
},
|
||||
"cpu/load_5m": {
|
||||
"displayName": "cpu/load_5m"
|
||||
},
|
||||
"cpu/load_15m": {
|
||||
"displayName": "cpu/load_15m"
|
||||
}
|
||||
}
|
||||
},
|
||||
|
||||
@@ -49,6 +49,9 @@ const exporterName = "stackdriver"
|
||||
var NPDMetricToSDMetric = map[metrics.MetricID]string{
|
||||
metrics.CPURunnableTaskCountID: "compute.googleapis.com/guest/cpu/runnable_task_count",
|
||||
metrics.CPUUsageTimeID: "compute.googleapis.com/guest/cpu/usage_time",
|
||||
metrics.CPULoad1m: "compute.googleapis.com/guest/cpu/load_1m",
|
||||
metrics.CPULoad5m: "compute.googleapis.com/guest/cpu/load_5m",
|
||||
metrics.CPULoad15m: "compute.googleapis.com/guest/cpu/load_15m",
|
||||
metrics.DiskAvgQueueLenID: "compute.googleapis.com/guest/disk/queue_length",
|
||||
metrics.DiskBytesUsedID: "compute.googleapis.com/guest/disk/bytes_used",
|
||||
metrics.DiskIOTimeID: "compute.googleapis.com/guest/disk/io_time",
|
||||
|
||||
@@ -25,6 +25,9 @@ Below metrics are collected from `cpu` component:
|
||||
|
||||
* `cpu_runnable_task_count`: The average number of runnable tasks in the run-queue during the last minute. Collected from [`/proc/loadavg`][/proc doc].
|
||||
* `cpu_usage_time`: CPU usage, in seconds. The [CPU state][/proc doc] for the corresponding usage is reported under the `state` metric label (e.g. `user`, `nice`, `system`...).
|
||||
* `cpu_load_1m`: CPU load average over the last minute. Collected from [`/proc/loadavg`][/proc doc].
|
||||
* `cpu_load_5m`: CPU load average over the last 5 minutes. Collected from [`/proc/loadavg`][/proc doc].
|
||||
* `cpu_load_15m`: CPU load average over the last 15 minutes. Collected from [`/proc/loadavg`][/proc doc].
|
||||
|
||||
[/proc doc]: http://man7.org/linux/man-pages/man5/proc.5.html
|
||||
|
||||
|
||||
@@ -38,6 +38,9 @@ const clockTick float64 = 100.0
|
||||
type cpuCollector struct {
|
||||
mRunnableTaskCount *metrics.Float64Metric
|
||||
mUsageTime *metrics.Float64Metric
|
||||
mCpuLoad1m *metrics.Float64Metric
|
||||
mCpuLoad5m *metrics.Float64Metric
|
||||
mCpuLoad15m *metrics.Float64Metric
|
||||
|
||||
config *ssmtypes.CPUStatsConfig
|
||||
|
||||
@@ -71,6 +74,39 @@ func NewCPUCollectorOrDie(cpuConfig *ssmtypes.CPUStatsConfig) *cpuCollector {
|
||||
glog.Fatalf("Error initializing metric for %q: %v", metrics.CPUUsageTimeID, err)
|
||||
}
|
||||
|
||||
cc.mCpuLoad1m, err = metrics.NewFloat64Metric(
|
||||
metrics.CPULoad1m,
|
||||
cpuConfig.MetricsConfigs[string(metrics.CPULoad1m)].DisplayName,
|
||||
"CPU average load (1m)",
|
||||
"1",
|
||||
metrics.LastValue,
|
||||
[]string{})
|
||||
if err != nil {
|
||||
glog.Fatalf("Error initializing metric for %q: %v", metrics.CPULoad1m, err)
|
||||
}
|
||||
|
||||
cc.mCpuLoad5m, err = metrics.NewFloat64Metric(
|
||||
metrics.CPULoad5m,
|
||||
cpuConfig.MetricsConfigs[string(metrics.CPULoad5m)].DisplayName,
|
||||
"CPU average load (5m)",
|
||||
"1",
|
||||
metrics.LastValue,
|
||||
[]string{})
|
||||
if err != nil {
|
||||
glog.Fatalf("Error initializing metric for %q: %v", metrics.CPULoad5m, err)
|
||||
}
|
||||
|
||||
cc.mCpuLoad15m, err = metrics.NewFloat64Metric(
|
||||
metrics.CPULoad15m,
|
||||
cpuConfig.MetricsConfigs[string(metrics.CPULoad15m)].DisplayName,
|
||||
"CPU average load (15m)",
|
||||
"1",
|
||||
metrics.LastValue,
|
||||
[]string{})
|
||||
if err != nil {
|
||||
glog.Fatalf("Error initializing metric for %q: %v", metrics.CPULoad15m, err)
|
||||
}
|
||||
|
||||
cc.lastUsageTime = make(map[string]float64)
|
||||
|
||||
return &cc
|
||||
@@ -88,6 +124,10 @@ func (cc *cpuCollector) recordLoad() {
|
||||
}
|
||||
|
||||
cc.mRunnableTaskCount.Record(map[string]string{}, loadAvg.Load1)
|
||||
|
||||
cc.mCpuLoad1m.Record(map[string]string{}, loadAvg.Load1)
|
||||
cc.mCpuLoad5m.Record(map[string]string{}, loadAvg.Load5)
|
||||
cc.mCpuLoad15m.Record(map[string]string{}, loadAvg.Load15)
|
||||
}
|
||||
|
||||
func (cc *cpuCollector) recordUsage() {
|
||||
|
||||
@@ -22,6 +22,9 @@ import (
|
||||
const (
|
||||
CPURunnableTaskCountID MetricID = "cpu/runnable_task_count"
|
||||
CPUUsageTimeID MetricID = "cpu/usage_time"
|
||||
CPULoad1m MetricID = "cpu/load_1m"
|
||||
CPULoad5m MetricID = "cpu/load_5m"
|
||||
CPULoad15m MetricID = "cpu/load_15m"
|
||||
ProblemCounterID MetricID = "problem_counter"
|
||||
ProblemGaugeID MetricID = "problem_gauge"
|
||||
DiskIOTimeID MetricID = "disk/io_time"
|
||||
|
||||
@@ -77,6 +77,9 @@ var _ = ginkgo.Describe("NPD should export Prometheus metrics.", func() {
|
||||
|
||||
assertMetricExist(gotMetrics, "cpu_runnable_task_count", map[string]string{}, true)
|
||||
assertMetricExist(gotMetrics, "cpu_usage_time", map[string]string{}, false)
|
||||
assertMetricExist(gotMetrics, "cpu_load_1m", map[string]string{}, false)
|
||||
assertMetricExist(gotMetrics, "cpu_load_5m", map[string]string{}, false)
|
||||
assertMetricExist(gotMetrics, "cpu_load_15m", map[string]string{}, false)
|
||||
assertMetricExist(gotMetrics, "disk_operation_count", map[string]string{}, false)
|
||||
assertMetricExist(gotMetrics, "disk_merged_operation_count", map[string]string{}, false)
|
||||
assertMetricExist(gotMetrics, "disk_operation_bytes_count", map[string]string{}, false)
|
||||
|
||||
Reference in New Issue
Block a user