mirror of
https://github.com/kubernetes/node-problem-detector.git
synced 2026-05-06 01:07:07 +00:00
added a new metric to retrieve os features like unknown modules
This commit is contained in:
@@ -72,3 +72,19 @@ Below metrics are collected from `memory` component:
|
||||
* `memory_page_cache_used`: Page cache memory usage, in Bytes. Memory usage state is reported under the `state` metric label (e.g. `active`, `inactive`). `active` means the memory has been used more recently and usually not reclaimed until needed. Summing values of all states yields the total page cache memory used.
|
||||
* `memory_unevictable_used`: [Unevictable memory][/proc doc] usage, in Bytes.
|
||||
* `memory_dirty_used`: Dirty pages usage, in Bytes. Memory usage state is reported under the `state` metric label (e.g. `dirty`, `writeback`). `dirty` means the memory is waiting to be written back to disk, and `writeback` means the memory is actively being written back to disk.
|
||||
|
||||
### OS features
|
||||
|
||||
Guest OS features, such as the KTD kernel and GPU support, are collected. Below are the OS
|
||||
features collected:
|
||||
|
||||
* `KTD`: Enabled, if KTD feature is enabled on OS
|
||||
* `UnifiedCgroupHierarchy`: Enabled, if Unified hierarchy is enabled on OS.
|
||||
* `KernelModuleIntegrity`: Enabled, if load pin security is enabled and modules are signed.
|
||||
* `GPUSupport`: Enabled, if OS has GPU drivers installed like nvidia.
|
||||
* `UnknownModules`: Enabled, if the OS has third party kernel modules installed.
|
||||
UnknownModules are derived by comparing the out-of-tree or proprietary modules listed in `/proc/modules` against the modules listed in `known-modules.json`.
|
||||
|
||||
And an option:
|
||||
`knownModulesConfigPath`: The path to the file that contains the known modules(default
|
||||
modules) can be set. By default, the path is set to `config/guestosconfig/known-modules.json`.
|
||||
@@ -30,3 +30,9 @@ const fsTypeLabel = "fs_type"
|
||||
|
||||
// mountOptionLabel labels the mount_options of the monitored disk device
|
||||
const mountOptionLabel = "mount_option"
|
||||
|
||||
// Label keys attached to the guest OS feature metric.
const (
	// featureLabel is the label key that names the guest OS feature being reported.
	featureLabel = "os_feature"

	// valueLabel is the label key that carries an optional value for a feature.
	valueLabel = "value"
)
|
||||
|
||||
155
pkg/systemstatsmonitor/osfeature_collector.go
Normal file
155
pkg/systemstatsmonitor/osfeature_collector.go
Normal file
@@ -0,0 +1,155 @@
|
||||
/*
|
||||
Copyright 2020 The Kubernetes Authors All rights reserved.
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package systemstatsmonitor
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"io/ioutil"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"github.com/golang/glog"
|
||||
ssmtypes "k8s.io/node-problem-detector/pkg/systemstatsmonitor/types"
|
||||
"k8s.io/node-problem-detector/pkg/util/metrics"
|
||||
"k8s.io/node-problem-detector/pkg/util/metrics/system"
|
||||
)
|
||||
|
||||
type osFeatureCollector struct {
|
||||
config *ssmtypes.OSFeatureStatsConfig
|
||||
osFeature *metrics.Int64Metric
|
||||
}
|
||||
|
||||
func NewOsFeatureCollectorOrDie(osFeatureConfig *ssmtypes.OSFeatureStatsConfig) *osFeatureCollector {
|
||||
oc := osFeatureCollector{config: osFeatureConfig}
|
||||
var err error
|
||||
// Use metrics.Last aggregation method to ensure the metric is a guage metric.
|
||||
if osFeatureConfig.MetricsConfigs["system/os_feature"].DisplayName != "" {
|
||||
oc.osFeature, err = metrics.NewInt64Metric(
|
||||
metrics.OSFeatureID,
|
||||
osFeatureConfig.MetricsConfigs[string(metrics.OSFeatureID)].DisplayName,
|
||||
"OS Features like GPU support, KTD kernel, third party modules as unknown modules. 1 if the feature is enabled and 0, if disabled.",
|
||||
"1",
|
||||
metrics.LastValue,
|
||||
[]string{featureLabel, valueLabel})
|
||||
if err != nil {
|
||||
glog.Fatalf("Error initializing metric for system/os_feature: %v", err)
|
||||
}
|
||||
}
|
||||
return &oc
|
||||
}
|
||||
|
||||
// recordFeaturesFromCmdline records the guest OS features that can be derived
|
||||
// from the /proc/cmdline
|
||||
// The following features are recorded:
|
||||
// 1. KTD kernel based on csm.enabled value
|
||||
// 2. UnifiedCgroupHierarchy based on systemd.unified_cgroup_hierarchy
|
||||
// 3. KernelModuleIntegrity based on the loadpin enabled and a module signed.
|
||||
func (ofc *osFeatureCollector) recordFeaturesFromCmdline(cmdlineArgs []system.CmdlineArg) {
|
||||
var featuresMap = map[string]int64{
|
||||
"KTD": 0,
|
||||
"UnifiedCgroupHierarchy": 0,
|
||||
"ModuleSigned": 0,
|
||||
"LoadPinEnabled": 0,
|
||||
}
|
||||
for _, cmdlineArg := range cmdlineArgs {
|
||||
// record KTD feature.
|
||||
if cmdlineArg.Key == "csm.enabled" {
|
||||
featuresMap["KTD"], _ = strconv.ParseInt(cmdlineArg.Value, 10, 64)
|
||||
}
|
||||
// record UnifiedCgroupHierarchy feature.
|
||||
if cmdlineArg.Key == "systemd.unified_cgroup_hierarchy" {
|
||||
featuresMap["UnifiedCgroupHierarchy"], _ = strconv.ParseInt(cmdlineArg.Value, 10, 64)
|
||||
}
|
||||
// record KernelModuleIntegrity feature.
|
||||
if cmdlineArg.Key == "module.sig_enforce" {
|
||||
featuresMap["ModuleSigned"], _ = strconv.ParseInt(cmdlineArg.Value, 10, 64)
|
||||
}
|
||||
if cmdlineArg.Key == "loadpin.enabled" {
|
||||
featuresMap["LoadPinEnabled"], _ = strconv.ParseInt(cmdlineArg.Value, 10, 64)
|
||||
}
|
||||
}
|
||||
// Record the feature values.
|
||||
ofc.osFeature.Record(map[string]string{featureLabel: "KTD"}, featuresMap["KTD"])
|
||||
ofc.osFeature.Record(map[string]string{featureLabel: "UnifiedCgroupHierarchy"}, featuresMap["UnifiedCgroupHierarchy"])
|
||||
if featuresMap["ModuleSigned"] == 1 && featuresMap["LoadPinEnabled"] == 1 {
|
||||
ofc.osFeature.Record(map[string]string{featureLabel: "KernelModuleIntegrity"}, 1)
|
||||
} else {
|
||||
ofc.osFeature.Record(map[string]string{featureLabel: "KernelModuleIntegrity"}, 0)
|
||||
}
|
||||
}
|
||||
|
||||
// recordFeaturesFromCmdline records the guest OS features that can be derived
|
||||
// from the /proc/modules
|
||||
// The following features are recorded:
|
||||
// 1. GPUSupport based on the presence of nvidia module
|
||||
// 2. UnknownModules are tracked based on the presence of thirdparty kernel modules.
|
||||
func (ofc *osFeatureCollector) recordFeaturesFromModules(modules []system.Module) {
|
||||
// Collect known modules (default modules based on guest OS present in known-modules.json)
|
||||
var knownModules []system.Module
|
||||
f, err := ioutil.ReadFile(ofc.config.KnownModulesConfigPath)
|
||||
if err != nil {
|
||||
glog.Warningf("Failed to read configuration file %s: %v",
|
||||
ofc.config.KnownModulesConfigPath, err)
|
||||
}
|
||||
// When the knownModulesConfigPath is not set
|
||||
// it should assume all the metrics are assumed to be default modules.
|
||||
if f != nil {
|
||||
err = json.Unmarshal(f, &knownModules)
|
||||
if err != nil {
|
||||
glog.Warningf("Failed to retrieve known modules %v", err)
|
||||
}
|
||||
} else {
|
||||
knownModules = []system.Module{}
|
||||
}
|
||||
|
||||
var hasGPUSupport = 0
|
||||
unknownModules := []string{}
|
||||
|
||||
// Collect UnknownModules and check GPUSupport
|
||||
for _, module := range modules {
|
||||
// if the module has nvidia modules, then the hasGPUSupport is set.
|
||||
if strings.Contains(module.ModuleName, "nvidia") {
|
||||
hasGPUSupport = 1
|
||||
} else {
|
||||
if module.OutOfTree || module.Proprietary {
|
||||
if !system.ContainsModule(module.ModuleName, knownModules) {
|
||||
unknownModules = append(unknownModules, module.ModuleName)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// record the UnknownModules and GPUSupport
|
||||
if len(unknownModules) > 0 {
|
||||
ofc.osFeature.Record(map[string]string{featureLabel: "UnknownModules",
|
||||
valueLabel: strings.Join(unknownModules, ",")}, 1)
|
||||
} else {
|
||||
ofc.osFeature.Record(map[string]string{featureLabel: "UnknownModules"},
|
||||
0)
|
||||
}
|
||||
ofc.osFeature.Record(map[string]string{featureLabel: "GPUSupport"},
|
||||
int64(hasGPUSupport))
|
||||
}
|
||||
|
||||
func (ofc *osFeatureCollector) collect() {
|
||||
cmdlineArgs, err := system.CmdlineArgs()
|
||||
if err != nil {
|
||||
glog.Fatalf("Error retrieving cmdline args: %v", err)
|
||||
}
|
||||
ofc.recordFeaturesFromCmdline(cmdlineArgs)
|
||||
modules, err := system.Modules()
|
||||
if err != nil {
|
||||
glog.Fatalf("Error retrieving kernel modules: %v", err)
|
||||
}
|
||||
ofc.recordFeaturesFromModules(modules)
|
||||
}
|
||||
@@ -38,13 +38,14 @@ func init() {
|
||||
}
|
||||
|
||||
type systemStatsMonitor struct {
|
||||
configPath string
|
||||
config ssmtypes.SystemStatsConfig
|
||||
cpuCollector *cpuCollector
|
||||
diskCollector *diskCollector
|
||||
hostCollector *hostCollector
|
||||
memoryCollector *memoryCollector
|
||||
tomb *tomb.Tomb
|
||||
configPath string
|
||||
config ssmtypes.SystemStatsConfig
|
||||
cpuCollector *cpuCollector
|
||||
diskCollector *diskCollector
|
||||
hostCollector *hostCollector
|
||||
memoryCollector *memoryCollector
|
||||
osFeatureCollector *osFeatureCollector
|
||||
tomb *tomb.Tomb
|
||||
}
|
||||
|
||||
// NewSystemStatsMonitorOrDie creates a system stats monitor.
|
||||
@@ -69,6 +70,8 @@ func NewSystemStatsMonitorOrDie(configPath string) types.Monitor {
|
||||
glog.Fatalf("Failed to apply configuration for %q: %v", configPath, err)
|
||||
}
|
||||
|
||||
glog.Infof("Error: %v", ssm.config)
|
||||
|
||||
err = ssm.config.Validate()
|
||||
if err != nil {
|
||||
glog.Fatalf("Failed to validate %s configuration %+v: %v", ssm.configPath, ssm.config, err)
|
||||
@@ -86,6 +89,9 @@ func NewSystemStatsMonitorOrDie(configPath string) types.Monitor {
|
||||
if len(ssm.config.MemoryConfig.MetricsConfigs) > 0 {
|
||||
ssm.memoryCollector = NewMemoryCollectorOrDie(&ssm.config.MemoryConfig)
|
||||
}
|
||||
if len(ssm.config.OsFeatureConfig.MetricsConfigs) > 0 {
|
||||
ssm.osFeatureCollector = NewOsFeatureCollectorOrDie(&ssm.config.OsFeatureConfig)
|
||||
}
|
||||
return &ssm
|
||||
}
|
||||
|
||||
@@ -110,6 +116,7 @@ func (ssm *systemStatsMonitor) monitorLoop() {
|
||||
ssm.diskCollector.collect()
|
||||
ssm.hostCollector.collect()
|
||||
ssm.memoryCollector.collect()
|
||||
ssm.osFeatureCollector.collect()
|
||||
}
|
||||
|
||||
for {
|
||||
@@ -119,6 +126,7 @@ func (ssm *systemStatsMonitor) monitorLoop() {
|
||||
ssm.diskCollector.collect()
|
||||
ssm.hostCollector.collect()
|
||||
ssm.memoryCollector.collect()
|
||||
ssm.osFeatureCollector.collect()
|
||||
case <-ssm.tomb.Stopping():
|
||||
glog.Infof("System stats monitor stopped: %s", ssm.configPath)
|
||||
return
|
||||
|
||||
@@ -22,8 +22,9 @@ import (
|
||||
)
|
||||
|
||||
var (
	// defaultInvokeIntervalString is the default invoke interval in
	// time.Duration string form.
	defaultInvokeIntervalString = (60 * time.Second).String()
	// defaultlsblkTimeoutString is used for DiskConfig.LsblkTimeoutString when
	// the configuration leaves it empty.
	defaultlsblkTimeoutString = (5 * time.Second).String()
	// defaultKnownModulesConfigPath is used for
	// OsFeatureConfig.KnownModulesConfigPath when the configuration leaves it
	// empty.
	defaultKnownModulesConfigPath = "config/guestosconfig/known-modules.json"
)
|
||||
|
||||
type MetricConfig struct {
|
||||
@@ -50,13 +51,19 @@ type MemoryStatsConfig struct {
|
||||
MetricsConfigs map[string]MetricConfig `json:"metricsConfigs"`
|
||||
}
|
||||
|
||||
type OSFeatureStatsConfig struct {
|
||||
MetricsConfigs map[string]MetricConfig `json:"metricsConfigs"`
|
||||
KnownModulesConfigPath string `json:"knownModulesConfigPath"`
|
||||
}
|
||||
|
||||
type SystemStatsConfig struct {
|
||||
CPUConfig CPUStatsConfig `json:"cpu"`
|
||||
DiskConfig DiskStatsConfig `json:"disk"`
|
||||
HostConfig HostStatsConfig `json:"host"`
|
||||
MemoryConfig MemoryStatsConfig `json:"memory"`
|
||||
InvokeIntervalString string `json:"invokeInterval"`
|
||||
InvokeInterval time.Duration `json:"-"`
|
||||
CPUConfig CPUStatsConfig `json:"cpu"`
|
||||
DiskConfig DiskStatsConfig `json:"disk"`
|
||||
HostConfig HostStatsConfig `json:"host"`
|
||||
MemoryConfig MemoryStatsConfig `json:"memory"`
|
||||
OsFeatureConfig OSFeatureStatsConfig `json:"osFeature"`
|
||||
InvokeIntervalString string `json:"invokeInterval"`
|
||||
InvokeInterval time.Duration `json:"-"`
|
||||
}
|
||||
|
||||
// ApplyConfiguration applies default configurations.
|
||||
@@ -67,6 +74,9 @@ func (ssc *SystemStatsConfig) ApplyConfiguration() error {
|
||||
if ssc.DiskConfig.LsblkTimeoutString == "" {
|
||||
ssc.DiskConfig.LsblkTimeoutString = defaultlsblkTimeoutString
|
||||
}
|
||||
if ssc.OsFeatureConfig.KnownModulesConfigPath == "" {
|
||||
ssc.OsFeatureConfig.KnownModulesConfigPath = defaultKnownModulesConfigPath
|
||||
}
|
||||
|
||||
var err error
|
||||
ssc.InvokeInterval, err = time.ParseDuration(ssc.InvokeIntervalString)
|
||||
|
||||
@@ -43,6 +43,9 @@ func TestApplyConfiguration(t *testing.T) {
|
||||
LsblkTimeout: 5 * time.Second,
|
||||
LsblkTimeoutString: "5s",
|
||||
},
|
||||
OsFeatureConfig: OSFeatureStatsConfig{
|
||||
KnownModulesConfigPath: "config/guestosconfig/known-modules.json",
|
||||
},
|
||||
InvokeIntervalString: "60s",
|
||||
InvokeInterval: 60 * time.Second,
|
||||
},
|
||||
@@ -58,6 +61,9 @@ func TestApplyConfiguration(t *testing.T) {
|
||||
LsblkTimeout: 5 * time.Second,
|
||||
LsblkTimeoutString: "5s",
|
||||
},
|
||||
OsFeatureConfig: OSFeatureStatsConfig{
|
||||
KnownModulesConfigPath: "config/guestosconfig/known-modules.json",
|
||||
},
|
||||
InvokeIntervalString: "1m0s",
|
||||
InvokeInterval: 60 * time.Second,
|
||||
},
|
||||
@@ -72,6 +78,9 @@ func TestApplyConfiguration(t *testing.T) {
|
||||
isError: true,
|
||||
wantedConfig: SystemStatsConfig{
|
||||
DiskConfig: DiskStatsConfig{},
|
||||
OsFeatureConfig: OSFeatureStatsConfig{
|
||||
KnownModulesConfigPath: "config/guestosconfig/known-modules.json",
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user