Merge pull request #516 from karan/system_time

add metric for per-cpu, per-stage timing
This commit is contained in:
Kubernetes Prow Robot
2021-02-01 18:54:28 -08:00
committed by GitHub
6 changed files with 151 additions and 112 deletions

View File

@@ -1,99 +1,102 @@
{
"cpu": {
"metricsConfigs": {
"cpu/runnable_task_count": {
"displayName": "cpu/runnable_task_count"
},
"cpu/usage_time": {
"displayName": "cpu/usage_time"
},
"cpu/load_1m": {
"displayName": "cpu/load_1m"
},
"cpu/load_5m": {
"displayName": "cpu/load_5m"
},
"cpu/load_15m": {
"displayName": "cpu/load_15m"
},
"system/processes_total": {
"displayName": "system/processes_total"
},
"system/procs_running": {
"displayName": "system/procs_running"
},
"system/procs_blocked": {
"displayName": "system/procs_blocked"
},
"system/interrupts_total": {
"displayName": "system/interrupts_total"
}
}
},
"disk": {
"metricsConfigs": {
"disk/io_time": {
"displayName": "disk/io_time"
},
"disk/weighted_io": {
"displayName": "disk/weighted_io"
},
"disk/avg_queue_len": {
"displayName": "disk/avg_queue_len"
},
"disk/operation_count": {
"displayName": "disk/operation_count"
},
"disk/merged_operation_count": {
"displayName": "disk/merged_operation_count"
},
"disk/operation_bytes_count": {
"displayName": "disk/operation_bytes_count"
},
"disk/operation_time": {
"displayName": "disk/operation_time"
},
"disk/bytes_used": {
"displayName": "disk/bytes_used"
}
},
"includeRootBlk": true,
"includeAllAttachedBlk": true,
"lsblkTimeout": "5s"
},
"host": {
"metricsConfigs": {
"host/uptime": {
"displayName": "host/uptime"
}
}
},
"memory": {
"metricsConfigs": {
"memory/bytes_used": {
"displayName": "memory/bytes_used"
},
"memory/anonymous_used": {
"displayName": "memory/anonymous_used"
},
"memory/page_cache_used": {
"displayName": "memory/page_cache_used"
},
"memory/unevictable_used": {
"displayName": "memory/unevictable_used"
},
"memory/dirty_used": {
"displayName": "memory/dirty_used"
}
}
},
"osFeature": {
"metricsConfigs": {
"system/os_feature": {
"displayName": "system/os_feature"
}
},
"KnownModulesConfigPath": "config/guestosconfig/known-modules.json"
},
"invokeInterval": "60s"
"cpu": {
"metricsConfigs": {
"cpu/load_15m": {
"displayName": "cpu/load_15m"
},
"cpu/load_1m": {
"displayName": "cpu/load_1m"
},
"cpu/load_5m": {
"displayName": "cpu/load_5m"
},
"cpu/runnable_task_count": {
"displayName": "cpu/runnable_task_count"
},
"cpu/usage_time": {
"displayName": "cpu/usage_time"
},
"system/cpu_stat": {
"displayName": "system/cpu_stat"
},
"system/interrupts_total": {
"displayName": "system/interrupts_total"
},
"system/processes_total": {
"displayName": "system/processes_total"
},
"system/procs_blocked": {
"displayName": "system/procs_blocked"
},
"system/procs_running": {
"displayName": "system/procs_running"
}
}
},
"disk": {
"includeAllAttachedBlk": true,
"includeRootBlk": true,
"lsblkTimeout": "5s",
"metricsConfigs": {
"disk/avg_queue_len": {
"displayName": "disk/avg_queue_len"
},
"disk/bytes_used": {
"displayName": "disk/bytes_used"
},
"disk/io_time": {
"displayName": "disk/io_time"
},
"disk/merged_operation_count": {
"displayName": "disk/merged_operation_count"
},
"disk/operation_bytes_count": {
"displayName": "disk/operation_bytes_count"
},
"disk/operation_count": {
"displayName": "disk/operation_count"
},
"disk/operation_time": {
"displayName": "disk/operation_time"
},
"disk/weighted_io": {
"displayName": "disk/weighted_io"
}
}
},
"host": {
"metricsConfigs": {
"host/uptime": {
"displayName": "host/uptime"
}
}
},
"invokeInterval": "60s",
"memory": {
"metricsConfigs": {
"memory/anonymous_used": {
"displayName": "memory/anonymous_used"
},
"memory/bytes_used": {
"displayName": "memory/bytes_used"
},
"memory/dirty_used": {
"displayName": "memory/dirty_used"
},
"memory/page_cache_used": {
"displayName": "memory/page_cache_used"
},
"memory/unevictable_used": {
"displayName": "memory/unevictable_used"
}
}
},
"osFeature": {
"KnownModulesConfigPath": "config/guestosconfig/known-modules.json",
"metricsConfigs": {
"system/os_feature": {
"displayName": "system/os_feature"
}
}
}
}

View File

@@ -73,6 +73,7 @@ var NPDMetricToSDMetric = map[metrics.MetricID]string{
metrics.SystemProcsRunning: "kubernetes.io/internal/node/guest/system/procs_running",
metrics.SystemProcsBlocked: "kubernetes.io/internal/node/guest/system/procs_blocked",
metrics.SystemInterruptsTotal: "kubernetes.io/internal/node/guest/system/interrupts_total",
metrics.SystemCPUStat: "kubernetes.io/internal/node/guest/system/cpu_stat",
metrics.NetDevRxBytes: "kubernetes.io/internal/node/guest/net/rx_bytes",
metrics.NetDevRxPackets: "kubernetes.io/internal/node/guest/net/rx_packets",
metrics.NetDevRxErrors: "kubernetes.io/internal/node/guest/net/rx_errors",

View File

@@ -32,6 +32,7 @@ Below metrics are collected from `cpu` component:
* `system/procs_running`: Number of processes currently running.
* `system/procs_blocked`: Number of processes currently blocked.
* `system/interrupts_total`: Total number of interrupts serviced (cumulative).
* `system/cpu_stat`: Cumulative time each CPU spent in various stages. Collected from `/proc/stat`. Has `cpu` and `stage` labels.
[/proc doc]: http://man7.org/linux/man-pages/man5/proc.5.html

View File

@@ -17,6 +17,8 @@ limitations under the License.
package systemstatsmonitor
import (
"fmt"
"github.com/golang/glog"
"github.com/prometheus/procfs"
"github.com/shirou/gopsutil/cpu"
@@ -50,6 +52,7 @@ type cpuCollector struct {
mSystemProcsRunning *metrics.Int64Metric
mSystemProcsBlocked *metrics.Int64Metric
mSystemInterruptsTotal *metrics.Int64Metric
mSystemCPUStat *metrics.Float64Metric // per-cpu time from /proc/stats
config *ssmtypes.CPUStatsConfig
@@ -63,13 +66,13 @@ func NewCPUCollectorOrDie(cpuConfig *ssmtypes.CPUStatsConfig) *cpuCollector {
if err != nil {
glog.Fatalf("Failed to retrieve kernel version: %v", err)
}
cc.tags["kernel_version"] = kernelVersion
cc.tags[kernelVersionLabel] = kernelVersion
osVersion, err := util.GetOSVersion()
if err != nil {
glog.Fatalf("Failed to retrieve OS version: %v", err)
}
cc.tags["os_version"] = osVersion
cc.tags[osVersionLabel] = osVersion
cc.mRunnableTaskCount, err = metrics.NewFloat64Metric(
metrics.CPURunnableTaskCountID,
@@ -170,6 +173,17 @@ func NewCPUCollectorOrDie(cpuConfig *ssmtypes.CPUStatsConfig) *cpuCollector {
glog.Fatalf("Error initializing metric for %q: %v", metrics.SystemInterruptsTotal, err)
}
cc.mSystemCPUStat, err = metrics.NewFloat64Metric(
metrics.SystemCPUStat,
cpuConfig.MetricsConfigs[string(metrics.SystemCPUStat)].DisplayName,
"Cumulative time each cpu spent in various stages.",
"ns",
metrics.Sum,
[]string{osVersionLabel, kernelVersionLabel, cpuLabel, stageLabel})
if err != nil {
glog.Fatalf("Error initializing metric for %q: %v", metrics.SystemCPUStat, err)
}
cc.lastUsageTime = make(map[string]float64)
return &cc
@@ -238,19 +252,6 @@ func (cc *cpuCollector) recordUsage() {
}
func (cc *cpuCollector) recordSystemStats() {
if cc.mSystemProcessesTotal == nil {
return
}
if cc.mSystemProcsRunning == nil {
return
}
if cc.mSystemProcsBlocked == nil {
return
}
if cc.mSystemInterruptsTotal == nil {
return
}
fs, err := procfs.NewFS("/proc")
stats, err := fs.Stat()
if err != nil {
@@ -262,6 +263,32 @@ func (cc *cpuCollector) recordSystemStats() {
cc.mSystemProcsRunning.Record(cc.tags, int64(stats.ProcessesRunning))
cc.mSystemProcsBlocked.Record(cc.tags, int64(stats.ProcessesBlocked))
cc.mSystemInterruptsTotal.Record(cc.tags, int64(stats.IRQTotal))
for i, c := range stats.CPU {
tags := cc.tags
tags[cpuLabel] = fmt.Sprintf("cpu%d", i)
tags[stageLabel] = "user"
cc.mSystemCPUStat.Record(tags, c.User)
tags[stageLabel] = "nice"
cc.mSystemCPUStat.Record(tags, c.Nice)
tags[stageLabel] = "system"
cc.mSystemCPUStat.Record(tags, c.System)
tags[stageLabel] = "idle"
cc.mSystemCPUStat.Record(tags, c.Idle)
tags[stageLabel] = "iowait"
cc.mSystemCPUStat.Record(tags, c.Iowait)
tags[stageLabel] = "iRQ"
cc.mSystemCPUStat.Record(tags, c.IRQ)
tags[stageLabel] = "softIRQ"
cc.mSystemCPUStat.Record(tags, c.SoftIRQ)
tags[stageLabel] = "steal"
cc.mSystemCPUStat.Record(tags, c.Steal)
tags[stageLabel] = "guest"
cc.mSystemCPUStat.Record(tags, c.Guest)
tags[stageLabel] = "guestNice"
cc.mSystemCPUStat.Record(tags, c.GuestNice)
}
}
func (cc *cpuCollector) collect() {

View File

@@ -45,3 +45,9 @@ const kernelVersionLabel = "kernel_version"
// interfaceNameLabel labels the network interface name
const interfaceNameLabel = "interface_name"
// cpuLabel is the metric label key that identifies which CPU a sample
// belongs to. Values take the form "cpu<N>" (e.g. "cpu0").
const cpuLabel = "cpu"
// stageLabel is the metric label key that names the kernel accounting
// category in which the CPU time was spent (e.g. "user", "system", "idle",
// "iowait").
const stageLabel = "stage"

View File

@@ -46,6 +46,7 @@ const (
SystemProcsRunning MetricID = "system/procs_running"
SystemProcsBlocked MetricID = "system/procs_blocked"
SystemInterruptsTotal MetricID = "system/interrupts_total"
SystemCPUStat MetricID = "system/cpu_stat"
NetDevRxBytes MetricID = "net/rx_bytes"
NetDevRxPackets MetricID = "net/rx_packets"
NetDevRxErrors MetricID = "net/rx_errors"