mirror of
https://github.com/kubernetes/node-problem-detector.git
synced 2026-02-14 18:09:57 +00:00
Merge pull request #516 from karan/system_time
add metric for per-cpu, per-stage timing
This commit is contained in:
@@ -1,99 +1,102 @@
|
||||
{
|
||||
"cpu": {
|
||||
"metricsConfigs": {
|
||||
"cpu/runnable_task_count": {
|
||||
"displayName": "cpu/runnable_task_count"
|
||||
},
|
||||
"cpu/usage_time": {
|
||||
"displayName": "cpu/usage_time"
|
||||
},
|
||||
"cpu/load_1m": {
|
||||
"displayName": "cpu/load_1m"
|
||||
},
|
||||
"cpu/load_5m": {
|
||||
"displayName": "cpu/load_5m"
|
||||
},
|
||||
"cpu/load_15m": {
|
||||
"displayName": "cpu/load_15m"
|
||||
},
|
||||
"system/processes_total": {
|
||||
"displayName": "system/processes_total"
|
||||
},
|
||||
"system/procs_running": {
|
||||
"displayName": "system/procs_running"
|
||||
},
|
||||
"system/procs_blocked": {
|
||||
"displayName": "system/procs_blocked"
|
||||
},
|
||||
"system/interrupts_total": {
|
||||
"displayName": "system/interrupts_total"
|
||||
}
|
||||
}
|
||||
},
|
||||
"disk": {
|
||||
"metricsConfigs": {
|
||||
"disk/io_time": {
|
||||
"displayName": "disk/io_time"
|
||||
},
|
||||
"disk/weighted_io": {
|
||||
"displayName": "disk/weighted_io"
|
||||
},
|
||||
"disk/avg_queue_len": {
|
||||
"displayName": "disk/avg_queue_len"
|
||||
},
|
||||
"disk/operation_count": {
|
||||
"displayName": "disk/operation_count"
|
||||
},
|
||||
"disk/merged_operation_count": {
|
||||
"displayName": "disk/merged_operation_count"
|
||||
},
|
||||
"disk/operation_bytes_count": {
|
||||
"displayName": "disk/operation_bytes_count"
|
||||
},
|
||||
"disk/operation_time": {
|
||||
"displayName": "disk/operation_time"
|
||||
},
|
||||
"disk/bytes_used": {
|
||||
"displayName": "disk/bytes_used"
|
||||
}
|
||||
},
|
||||
"includeRootBlk": true,
|
||||
"includeAllAttachedBlk": true,
|
||||
"lsblkTimeout": "5s"
|
||||
},
|
||||
"host": {
|
||||
"metricsConfigs": {
|
||||
"host/uptime": {
|
||||
"displayName": "host/uptime"
|
||||
}
|
||||
}
|
||||
},
|
||||
"memory": {
|
||||
"metricsConfigs": {
|
||||
"memory/bytes_used": {
|
||||
"displayName": "memory/bytes_used"
|
||||
},
|
||||
"memory/anonymous_used": {
|
||||
"displayName": "memory/anonymous_used"
|
||||
},
|
||||
"memory/page_cache_used": {
|
||||
"displayName": "memory/page_cache_used"
|
||||
},
|
||||
"memory/unevictable_used": {
|
||||
"displayName": "memory/unevictable_used"
|
||||
},
|
||||
"memory/dirty_used": {
|
||||
"displayName": "memory/dirty_used"
|
||||
}
|
||||
}
|
||||
},
|
||||
"osFeature": {
|
||||
"metricsConfigs": {
|
||||
"system/os_feature": {
|
||||
"displayName": "system/os_feature"
|
||||
}
|
||||
},
|
||||
"KnownModulesConfigPath": "config/guestosconfig/known-modules.json"
|
||||
},
|
||||
"invokeInterval": "60s"
|
||||
"cpu": {
|
||||
"metricsConfigs": {
|
||||
"cpu/load_15m": {
|
||||
"displayName": "cpu/load_15m"
|
||||
},
|
||||
"cpu/load_1m": {
|
||||
"displayName": "cpu/load_1m"
|
||||
},
|
||||
"cpu/load_5m": {
|
||||
"displayName": "cpu/load_5m"
|
||||
},
|
||||
"cpu/runnable_task_count": {
|
||||
"displayName": "cpu/runnable_task_count"
|
||||
},
|
||||
"cpu/usage_time": {
|
||||
"displayName": "cpu/usage_time"
|
||||
},
|
||||
"system/cpu_stat": {
|
||||
"displayName": "system/cpu_stat"
|
||||
},
|
||||
"system/interrupts_total": {
|
||||
"displayName": "system/interrupts_total"
|
||||
},
|
||||
"system/processes_total": {
|
||||
"displayName": "system/processes_total"
|
||||
},
|
||||
"system/procs_blocked": {
|
||||
"displayName": "system/procs_blocked"
|
||||
},
|
||||
"system/procs_running": {
|
||||
"displayName": "system/procs_running"
|
||||
}
|
||||
}
|
||||
},
|
||||
"disk": {
|
||||
"includeAllAttachedBlk": true,
|
||||
"includeRootBlk": true,
|
||||
"lsblkTimeout": "5s",
|
||||
"metricsConfigs": {
|
||||
"disk/avg_queue_len": {
|
||||
"displayName": "disk/avg_queue_len"
|
||||
},
|
||||
"disk/bytes_used": {
|
||||
"displayName": "disk/bytes_used"
|
||||
},
|
||||
"disk/io_time": {
|
||||
"displayName": "disk/io_time"
|
||||
},
|
||||
"disk/merged_operation_count": {
|
||||
"displayName": "disk/merged_operation_count"
|
||||
},
|
||||
"disk/operation_bytes_count": {
|
||||
"displayName": "disk/operation_bytes_count"
|
||||
},
|
||||
"disk/operation_count": {
|
||||
"displayName": "disk/operation_count"
|
||||
},
|
||||
"disk/operation_time": {
|
||||
"displayName": "disk/operation_time"
|
||||
},
|
||||
"disk/weighted_io": {
|
||||
"displayName": "disk/weighted_io"
|
||||
}
|
||||
}
|
||||
},
|
||||
"host": {
|
||||
"metricsConfigs": {
|
||||
"host/uptime": {
|
||||
"displayName": "host/uptime"
|
||||
}
|
||||
}
|
||||
},
|
||||
"invokeInterval": "60s",
|
||||
"memory": {
|
||||
"metricsConfigs": {
|
||||
"memory/anonymous_used": {
|
||||
"displayName": "memory/anonymous_used"
|
||||
},
|
||||
"memory/bytes_used": {
|
||||
"displayName": "memory/bytes_used"
|
||||
},
|
||||
"memory/dirty_used": {
|
||||
"displayName": "memory/dirty_used"
|
||||
},
|
||||
"memory/page_cache_used": {
|
||||
"displayName": "memory/page_cache_used"
|
||||
},
|
||||
"memory/unevictable_used": {
|
||||
"displayName": "memory/unevictable_used"
|
||||
}
|
||||
}
|
||||
},
|
||||
"osFeature": {
|
||||
"KnownModulesConfigPath": "config/guestosconfig/known-modules.json",
|
||||
"metricsConfigs": {
|
||||
"system/os_feature": {
|
||||
"displayName": "system/os_feature"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -73,6 +73,7 @@ var NPDMetricToSDMetric = map[metrics.MetricID]string{
|
||||
metrics.SystemProcsRunning: "kubernetes.io/internal/node/guest/system/procs_running",
|
||||
metrics.SystemProcsBlocked: "kubernetes.io/internal/node/guest/system/procs_blocked",
|
||||
metrics.SystemInterruptsTotal: "kubernetes.io/internal/node/guest/system/interrupts_total",
|
||||
metrics.SystemCPUStat: "kubernetes.io/internal/node/guest/system/cpu_stat",
|
||||
metrics.NetDevRxBytes: "kubernetes.io/internal/node/guest/net/rx_bytes",
|
||||
metrics.NetDevRxPackets: "kubernetes.io/internal/node/guest/net/rx_packets",
|
||||
metrics.NetDevRxErrors: "kubernetes.io/internal/node/guest/net/rx_errors",
|
||||
|
||||
@@ -32,6 +32,7 @@ Below metrics are collected from `cpu` component:
|
||||
* `system/procs_running`: Number of processes currently running.
|
||||
* `system/procs_blocked`: Number of processes currently blocked.
|
||||
* `system/interrupts_total`: Total number of interrupts serviced (cumulative).
|
||||
* `system/cpu_stat`: Cumulative time each CPU spent in various stages. Collected from `/proc/stat`. Has labels for `cpu` and `stage`.
|
||||
|
||||
[/proc doc]: http://man7.org/linux/man-pages/man5/proc.5.html
|
||||
|
||||
|
||||
@@ -17,6 +17,8 @@ limitations under the License.
|
||||
package systemstatsmonitor
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"github.com/golang/glog"
|
||||
"github.com/prometheus/procfs"
|
||||
"github.com/shirou/gopsutil/cpu"
|
||||
@@ -50,6 +52,7 @@ type cpuCollector struct {
|
||||
mSystemProcsRunning *metrics.Int64Metric
|
||||
mSystemProcsBlocked *metrics.Int64Metric
|
||||
mSystemInterruptsTotal *metrics.Int64Metric
|
||||
mSystemCPUStat *metrics.Float64Metric // per-cpu time from /proc/stat
|
||||
|
||||
config *ssmtypes.CPUStatsConfig
|
||||
|
||||
@@ -63,13 +66,13 @@ func NewCPUCollectorOrDie(cpuConfig *ssmtypes.CPUStatsConfig) *cpuCollector {
|
||||
if err != nil {
|
||||
glog.Fatalf("Failed to retrieve kernel version: %v", err)
|
||||
}
|
||||
cc.tags["kernel_version"] = kernelVersion
|
||||
cc.tags[kernelVersionLabel] = kernelVersion
|
||||
|
||||
osVersion, err := util.GetOSVersion()
|
||||
if err != nil {
|
||||
glog.Fatalf("Failed to retrieve OS version: %v", err)
|
||||
}
|
||||
cc.tags["os_version"] = osVersion
|
||||
cc.tags[osVersionLabel] = osVersion
|
||||
|
||||
cc.mRunnableTaskCount, err = metrics.NewFloat64Metric(
|
||||
metrics.CPURunnableTaskCountID,
|
||||
@@ -170,6 +173,17 @@ func NewCPUCollectorOrDie(cpuConfig *ssmtypes.CPUStatsConfig) *cpuCollector {
|
||||
glog.Fatalf("Error initializing metric for %q: %v", metrics.SystemInterruptsTotal, err)
|
||||
}
|
||||
|
||||
cc.mSystemCPUStat, err = metrics.NewFloat64Metric(
|
||||
metrics.SystemCPUStat,
|
||||
cpuConfig.MetricsConfigs[string(metrics.SystemCPUStat)].DisplayName,
|
||||
"Cumulative time each cpu spent in various stages.",
|
||||
"ns",
|
||||
metrics.Sum,
|
||||
[]string{osVersionLabel, kernelVersionLabel, cpuLabel, stageLabel})
|
||||
if err != nil {
|
||||
glog.Fatalf("Error initializing metric for %q: %v", metrics.SystemCPUStat, err)
|
||||
}
|
||||
|
||||
cc.lastUsageTime = make(map[string]float64)
|
||||
|
||||
return &cc
|
||||
@@ -238,19 +252,6 @@ func (cc *cpuCollector) recordUsage() {
|
||||
}
|
||||
|
||||
func (cc *cpuCollector) recordSystemStats() {
|
||||
if cc.mSystemProcessesTotal == nil {
|
||||
return
|
||||
}
|
||||
if cc.mSystemProcsRunning == nil {
|
||||
return
|
||||
}
|
||||
if cc.mSystemProcsBlocked == nil {
|
||||
return
|
||||
}
|
||||
if cc.mSystemInterruptsTotal == nil {
|
||||
return
|
||||
}
|
||||
|
||||
fs, err := procfs.NewFS("/proc")
|
||||
stats, err := fs.Stat()
|
||||
if err != nil {
|
||||
@@ -262,6 +263,32 @@ func (cc *cpuCollector) recordSystemStats() {
|
||||
cc.mSystemProcsRunning.Record(cc.tags, int64(stats.ProcessesRunning))
|
||||
cc.mSystemProcsBlocked.Record(cc.tags, int64(stats.ProcessesBlocked))
|
||||
cc.mSystemInterruptsTotal.Record(cc.tags, int64(stats.IRQTotal))
|
||||
|
||||
for i, c := range stats.CPU {
|
||||
tags := cc.tags
|
||||
tags[cpuLabel] = fmt.Sprintf("cpu%d", i)
|
||||
|
||||
tags[stageLabel] = "user"
|
||||
cc.mSystemCPUStat.Record(tags, c.User)
|
||||
tags[stageLabel] = "nice"
|
||||
cc.mSystemCPUStat.Record(tags, c.Nice)
|
||||
tags[stageLabel] = "system"
|
||||
cc.mSystemCPUStat.Record(tags, c.System)
|
||||
tags[stageLabel] = "idle"
|
||||
cc.mSystemCPUStat.Record(tags, c.Idle)
|
||||
tags[stageLabel] = "iowait"
|
||||
cc.mSystemCPUStat.Record(tags, c.Iowait)
|
||||
tags[stageLabel] = "iRQ"
|
||||
cc.mSystemCPUStat.Record(tags, c.IRQ)
|
||||
tags[stageLabel] = "softIRQ"
|
||||
cc.mSystemCPUStat.Record(tags, c.SoftIRQ)
|
||||
tags[stageLabel] = "steal"
|
||||
cc.mSystemCPUStat.Record(tags, c.Steal)
|
||||
tags[stageLabel] = "guest"
|
||||
cc.mSystemCPUStat.Record(tags, c.Guest)
|
||||
tags[stageLabel] = "guestNice"
|
||||
cc.mSystemCPUStat.Record(tags, c.GuestNice)
|
||||
}
|
||||
}
|
||||
|
||||
func (cc *cpuCollector) collect() {
|
||||
|
||||
@@ -45,3 +45,9 @@ const kernelVersionLabel = "kernel_version"
|
||||
|
||||
// interfaceNameLabel labels the network interface name
|
||||
const interfaceNameLabel = "interface_name"
|
||||
|
||||
// cpuLabel labels the CPU (e.g. "cpu0")
|
||||
const cpuLabel = "cpu"
|
||||
|
||||
// stageLabel labels the stage, as reported by the kernel, in which CPU time was spent (e.g. "user", "idle")
|
||||
const stageLabel = "stage"
|
||||
|
||||
@@ -46,6 +46,7 @@ const (
|
||||
SystemProcsRunning MetricID = "system/procs_running"
|
||||
SystemProcsBlocked MetricID = "system/procs_blocked"
|
||||
SystemInterruptsTotal MetricID = "system/interrupts_total"
|
||||
SystemCPUStat MetricID = "system/cpu_stat"
|
||||
NetDevRxBytes MetricID = "net/rx_bytes"
|
||||
NetDevRxPackets MetricID = "net/rx_packets"
|
||||
NetDevRxErrors MetricID = "net/rx_errors"
|
||||
|
||||
Reference in New Issue
Block a user