diff --git a/config/system-stats-monitor.json b/config/system-stats-monitor.json index 0a64b2a7..715279e2 100644 --- a/config/system-stats-monitor.json +++ b/config/system-stats-monitor.json @@ -15,7 +15,19 @@ }, "cpu/load_15m": { "displayName": "cpu/load_15m" - } + }, + "system/processes_total": { + "displayName": "system/processes_total" + }, + "system/procs_running": { + "displayName": "system/procs_running" + }, + "system/procs_blocked": { + "displayName": "system/procs_blocked" + }, + "system/interrupts_total": { + "displayName": "system/interrupts_total" + } } }, "disk": { diff --git a/pkg/exporters/stackdriver/stackdriver_exporter.go b/pkg/exporters/stackdriver/stackdriver_exporter.go index f3d53e41..1e01c57e 100644 --- a/pkg/exporters/stackdriver/stackdriver_exporter.go +++ b/pkg/exporters/stackdriver/stackdriver_exporter.go @@ -69,6 +69,10 @@ var NPDMetricToSDMetric = map[metrics.MetricID]string{ metrics.ProblemCounterID: "compute.googleapis.com/guest/system/problem_count", metrics.ProblemGaugeID: "compute.googleapis.com/guest/system/problem_state", metrics.OSFeatureID: "compute.googleapis.com/guest/system/os_feature_enabled", + metrics.SystemProcessesTotal: "kubernetes.io/internal/node/guest/system/processes_total", + metrics.SystemProcsRunning: "kubernetes.io/internal/node/guest/system/procs_running", + metrics.SystemProcsBlocked: "kubernetes.io/internal/node/guest/system/procs_blocked", + metrics.SystemInterruptsTotal: "kubernetes.io/internal/node/guest/system/interrupts_total", } func getMetricTypeConversionFunction(customMetricPrefix string) func(*view.View) string { diff --git a/pkg/systemstatsmonitor/README.md b/pkg/systemstatsmonitor/README.md index e100fd7c..791b950d 100644 --- a/pkg/systemstatsmonitor/README.md +++ b/pkg/systemstatsmonitor/README.md @@ -28,6 +28,10 @@ Below metrics are collected from `cpu` component: * `cpu_load_1m`: CPU load average over the last 1 minute. Collected from [`/proc/loadavg`][/proc doc]. 
* `cpu_load_5m`: CPU load average over the last 5 minutes. Collected from [`/proc/loadavg`][/proc doc]. * `cpu_load_15m`: CPU load average over the last 15 minutes. Collected from [`/proc/loadavg`][/proc doc]. +* `system/processes_total`: Number of forks since boot. +* `system/procs_running`: Number of processes currently running. +* `system/procs_blocked`: Number of processes currently blocked. +* `system/interrupts_total`: Total number of interrupts serviced (cumulative). [/proc doc]: http://man7.org/linux/man-pages/man5/proc.5.html diff --git a/pkg/systemstatsmonitor/cpu_collector.go b/pkg/systemstatsmonitor/cpu_collector.go index bd784934..54d9ac03 100644 --- a/pkg/systemstatsmonitor/cpu_collector.go +++ b/pkg/systemstatsmonitor/cpu_collector.go @@ -18,10 +18,13 @@ package systemstatsmonitor import ( "github.com/golang/glog" + "github.com/prometheus/procfs" "github.com/shirou/gopsutil/cpu" + "github.com/shirou/gopsutil/host" "github.com/shirou/gopsutil/load" ssmtypes "k8s.io/node-problem-detector/pkg/systemstatsmonitor/types" + "k8s.io/node-problem-detector/pkg/util" "k8s.io/node-problem-detector/pkg/util/metrics" ) @@ -36,11 +39,17 @@ import ( const clockTick float64 = 100.0 type cpuCollector struct { - mRunnableTaskCount *metrics.Float64Metric - mUsageTime *metrics.Float64Metric - mCpuLoad1m *metrics.Float64Metric - mCpuLoad5m *metrics.Float64Metric - mCpuLoad15m *metrics.Float64Metric + tags map[string]string + + mRunnableTaskCount *metrics.Float64Metric + mUsageTime *metrics.Float64Metric + mCpuLoad1m *metrics.Float64Metric + mCpuLoad5m *metrics.Float64Metric + mCpuLoad15m *metrics.Float64Metric + mSystemProcessesTotal *metrics.Int64Metric + mSystemProcsRunning *metrics.Int64Metric + mSystemProcsBlocked *metrics.Int64Metric + mSystemInterruptsTotal *metrics.Int64Metric config *ssmtypes.CPUStatsConfig @@ -48,9 +57,19 @@ type cpuCollector struct { } func NewCPUCollectorOrDie(cpuConfig *ssmtypes.CPUStatsConfig) *cpuCollector { - cc := cpuCollector{config: 
cpuConfig} + cc := cpuCollector{tags: map[string]string{}, config: cpuConfig} - var err error + kernelVersion, err := host.KernelVersion() + if err != nil { + glog.Fatalf("Failed to retrieve kernel version: %v", err) + } + cc.tags["kernel_version"] = kernelVersion + + osVersion, err := util.GetOSVersion() + if err != nil { + glog.Fatalf("Failed to retrieve OS version: %v", err) + } + cc.tags["os_version"] = osVersion cc.mRunnableTaskCount, err = metrics.NewFloat64Metric( metrics.CPURunnableTaskCountID, @@ -107,6 +126,50 @@ func NewCPUCollectorOrDie(cpuConfig *ssmtypes.CPUStatsConfig) *cpuCollector { glog.Fatalf("Error initializing metric for %q: %v", metrics.CPULoad15m, err) } + cc.mSystemProcessesTotal, err = metrics.NewInt64Metric( + metrics.SystemProcessesTotal, + cpuConfig.MetricsConfigs[string(metrics.SystemProcessesTotal)].DisplayName, + "Number of forks since boot.", + "1", + metrics.Sum, + []string{osVersionLabel, kernelVersionLabel}) + if err != nil { + glog.Fatalf("Error initializing metric for %q: %v", metrics.SystemProcessesTotal, err) + } + + cc.mSystemProcsRunning, err = metrics.NewInt64Metric( + metrics.SystemProcsRunning, + cpuConfig.MetricsConfigs[string(metrics.SystemProcsRunning)].DisplayName, + "Number of processes currently running.", + "1", + metrics.LastValue, + []string{osVersionLabel, kernelVersionLabel}) + if err != nil { + glog.Fatalf("Error initializing metric for %q: %v", metrics.SystemProcsRunning, err) + } + + cc.mSystemProcsBlocked, err = metrics.NewInt64Metric( + metrics.SystemProcsBlocked, + cpuConfig.MetricsConfigs[string(metrics.SystemProcsBlocked)].DisplayName, + "Number of processes currently blocked.", + "1", + metrics.LastValue, + []string{osVersionLabel, kernelVersionLabel}) + if err != nil { + glog.Fatalf("Error initializing metric for %q: %v", metrics.SystemProcsBlocked, err) + } + + cc.mSystemInterruptsTotal, err = metrics.NewInt64Metric( + metrics.SystemInterruptsTotal, + 
cpuConfig.MetricsConfigs[string(metrics.SystemInterruptsTotal)].DisplayName, + "Total number of interrupts serviced (cumulative).", + "1", + metrics.Sum, + []string{osVersionLabel, kernelVersionLabel}) + if err != nil { + glog.Fatalf("Error initializing metric for %q: %v", metrics.SystemInterruptsTotal, err) + } + cc.lastUsageTime = make(map[string]float64) return &cc @@ -174,6 +237,39 @@ func (cc *cpuCollector) recordUsage() { cc.lastUsageTime["guest_nice"] = clockTick * timersStat.GuestNice } +// recordSystemStats records the fork, running/blocked process and interrupt +// counts read through procfs; it records nothing if any of the four metrics is nil. +func (cc *cpuCollector) recordSystemStats() { + if cc.mSystemProcessesTotal == nil { + return + } + if cc.mSystemProcsRunning == nil { + return + } + if cc.mSystemProcsBlocked == nil { + return + } + if cc.mSystemInterruptsTotal == nil { + return + } + + fs, err := procfs.NewFS("/proc") + if err != nil { + glog.Errorf("Failed to open procfs at /proc: %v", err) + return + } + stats, err := fs.Stat() + if err != nil { + glog.Errorf("Failed to retrieve cpu/process stats: %v", err) + return + } + + cc.mSystemProcessesTotal.Record(cc.tags, int64(stats.ProcessCreated)) + cc.mSystemProcsRunning.Record(cc.tags, int64(stats.ProcessesRunning)) + cc.mSystemProcsBlocked.Record(cc.tags, int64(stats.ProcessesBlocked)) + cc.mSystemInterruptsTotal.Record(cc.tags, int64(stats.IRQTotal)) +} + func (cc *cpuCollector) collect() { if cc == nil { return @@ -181,4 +277,5 @@ func (cc *cpuCollector) collect() { cc.recordLoad() cc.recordUsage() + cc.recordSystemStats() } diff --git a/pkg/systemstatsmonitor/labels.go b/pkg/systemstatsmonitor/labels.go index 1b49f626..e34ad3d0 100644 --- a/pkg/systemstatsmonitor/labels.go +++ b/pkg/systemstatsmonitor/labels.go @@ -36,3 +36,9 @@ const featureLabel = "os_feature" // valueLabel labels the value for the features of the guest os system if required const valueLabel = "value" + +// osVersionLabel labels the OS version +const osVersionLabel = "os_version" + +// kernelVersionLabel labels the kernel version +const kernelVersionLabel = "kernel_version" diff --git a/pkg/systemstatsmonitor/osfeature_collector.go b/pkg/systemstatsmonitor/osfeature_collector.go 
index 170affe7..1dd62c54 100644 --- a/pkg/systemstatsmonitor/osfeature_collector.go +++ b/pkg/systemstatsmonitor/osfeature_collector.go @@ -142,6 +142,9 @@ func (ofc *osFeatureCollector) recordFeaturesFromModules(modules []system.Module } func (ofc *osFeatureCollector) collect() { + if ofc.osFeature == nil { + return + } cmdlineArgs, err := system.CmdlineArgs() if err != nil { glog.Fatalf("Error retrieving cmdline args: %v", err) diff --git a/pkg/util/metrics/metric.go b/pkg/util/metrics/metric.go index cbf816cd..4d1ae847 100644 --- a/pkg/util/metrics/metric.go +++ b/pkg/util/metrics/metric.go @@ -42,6 +42,10 @@ const ( MemoryUnevictableUsedID MetricID = "memory/unevictable_used" MemoryDirtyUsedID MetricID = "memory/dirty_used" OSFeatureID MetricID = "system/os_feature" + SystemProcessesTotal MetricID = "system/processes_total" + SystemProcsRunning MetricID = "system/procs_running" + SystemProcsBlocked MetricID = "system/procs_blocked" + SystemInterruptsTotal MetricID = "system/interrupts_total" ) var MetricMap MetricMapping