added a new metric to retrieve os features like unknown modules

This commit is contained in:
varsha teratipally
2020-12-15 17:31:10 +00:00
parent 4ad49bbd84
commit eb38b4b598
12 changed files with 308 additions and 15 deletions

View File

@@ -72,3 +72,19 @@ Below metrics are collected from `memory` component:
* `memory_page_cache_used`: Page cache memory usage, in Bytes. Memory usage state is reported under the `state` metric label (e.g. `active`, `inactive`). `active` means the memory has been used more recently and usually not reclaimed until needed. Summing values of all states yields the total page cache memory used.
* `memory_unevictable_used`: [Unevictable memory][/proc doc] usage, in Bytes.
* `memory_dirty_used`: Dirty pages usage, in Bytes. Memory usage state is reported under the `state` metric label (e.g. `dirty`, `writeback`). `dirty` means the memory is waiting to be written back to disk, and `writeback` means the memory is actively being written back to disk.
### OS features
Guest OS features, such as the KTD kernel and GPU support, are collected. The following OS
features are reported:
* `KTD`: Enabled, if KTD feature is enabled on OS
* `UnifiedCgroupHierarchy`: Enabled, if Unified hierarchy is enabled on OS.
* `KernelModuleIntegrity`: Enabled, if load pin security is enabled and modules are signed.
* `GPUSupport`: Enabled, if OS has GPU drivers installed like nvidia.
* `UnknownModules`: Enabled, if the OS has third party kernel modules installed.
`UnknownModules` is derived by comparing the modules listed in `/proc/modules` with the known modules listed in `known-modules.json`.
The following option is available:
`knownModulesConfigPath`: The path to the file that contains the known modules (default
modules). By default, the path is set to `config/guestosconfig/known-modules.json`.

View File

@@ -30,3 +30,9 @@ const fsTypeLabel = "fs_type"
// mountOptionLabel labels the mount_options of the monitored disk device
const mountOptionLabel = "mount_option"
// featureLabel labels the name of the guest OS feature being reported.
const featureLabel = "os_feature"
// valueLabel labels an optional value that accompanies a guest OS feature
// (e.g. the comma-separated names of unknown modules).
const valueLabel = "value"

View File

@@ -0,0 +1,155 @@
/*
Copyright 2020 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package systemstatsmonitor
import (
"encoding/json"
"io/ioutil"
"strconv"
"strings"
"github.com/golang/glog"
ssmtypes "k8s.io/node-problem-detector/pkg/systemstatsmonitor/types"
"k8s.io/node-problem-detector/pkg/util/metrics"
"k8s.io/node-problem-detector/pkg/util/metrics/system"
)
// osFeatureCollector reports guest OS features through a single int64 metric.
type osFeatureCollector struct {
	// config is the OS feature section of the system stats monitor configuration.
	config *ssmtypes.OSFeatureStatsConfig
	// osFeature is the metric the features are recorded on. It stays nil when
	// the configuration gives the metric no display name.
	osFeature *metrics.Int64Metric
}
// NewOsFeatureCollectorOrDie creates an osFeatureCollector from osFeatureConfig.
//
// The OS feature metric is only registered when the configuration gives it a
// non-empty display name; otherwise the returned collector carries a nil
// osFeature metric. The process is terminated through glog.Fatalf when the
// metric cannot be initialized.
func NewOsFeatureCollectorOrDie(osFeatureConfig *ssmtypes.OSFeatureStatsConfig) *osFeatureCollector {
	oc := osFeatureCollector{config: osFeatureConfig}

	// Look the metric up by its ID exactly once, so that the enablement check
	// and the display name always come from the same configuration entry.
	metricConfig := osFeatureConfig.MetricsConfigs[string(metrics.OSFeatureID)]
	if metricConfig.DisplayName == "" {
		return &oc
	}

	var err error
	// Use metrics.LastValue aggregation method to ensure the metric is a gauge metric.
	oc.osFeature, err = metrics.NewInt64Metric(
		metrics.OSFeatureID,
		metricConfig.DisplayName,
		"OS Features like GPU support, KTD kernel, third party modules as unknown modules. 1 if the feature is enabled and 0, if disabled.",
		"1",
		metrics.LastValue,
		[]string{featureLabel, valueLabel})
	if err != nil {
		glog.Fatalf("Error initializing metric for system/os_feature: %v", err)
	}
	return &oc
}
// recordFeaturesFromCmdline records the guest OS features that can be derived
// from the /proc/cmdline arguments passed in as cmdlineArgs.
// The following features are recorded:
//  1. KTD, taken from the value of csm.enabled.
//  2. UnifiedCgroupHierarchy, taken from the value of
//     systemd.unified_cgroup_hierarchy.
//  3. KernelModuleIntegrity, 1 only when both module.sig_enforce and
//     loadpin.enabled are 1, and 0 otherwise.
func (ofc *osFeatureCollector) recordFeaturesFromCmdline(cmdlineArgs []system.CmdlineArg) {
	// Every flag stays 0 (disabled) when its argument is absent.
	var ktd, unifiedCgroup, moduleSigned, loadPinEnabled int64

	for _, arg := range cmdlineArgs {
		// The parse error is dropped on purpose: whatever value
		// strconv.ParseInt returns is stored as is.
		switch arg.Key {
		case "csm.enabled":
			ktd, _ = strconv.ParseInt(arg.Value, 10, 64)
		case "systemd.unified_cgroup_hierarchy":
			unifiedCgroup, _ = strconv.ParseInt(arg.Value, 10, 64)
		case "module.sig_enforce":
			moduleSigned, _ = strconv.ParseInt(arg.Value, 10, 64)
		case "loadpin.enabled":
			loadPinEnabled, _ = strconv.ParseInt(arg.Value, 10, 64)
		}
	}

	// Kernel module integrity needs both module signing and load pinning.
	var moduleIntegrity int64
	if moduleSigned == 1 && loadPinEnabled == 1 {
		moduleIntegrity = 1
	}

	ofc.osFeature.Record(map[string]string{featureLabel: "KTD"}, ktd)
	ofc.osFeature.Record(map[string]string{featureLabel: "UnifiedCgroupHierarchy"}, unifiedCgroup)
	ofc.osFeature.Record(map[string]string{featureLabel: "KernelModuleIntegrity"}, moduleIntegrity)
}
// recordFeaturesFromModules records the guest OS features that can be derived
// from the /proc/modules entries passed in as modules.
// The following features are recorded:
//  1. GPUSupport, 1 when any module name contains "nvidia".
//  2. UnknownModules, 1 when at least one non-nvidia module that is out of
//     tree or proprietary is missing from the known modules file. The names
//     of those modules are reported, comma separated, under the value label.
func (ofc *osFeatureCollector) recordFeaturesFromModules(modules []system.Module) {
	// Load the known modules (default modules of the guest OS) from the
	// configured file. A read failure is only logged.
	content, err := ioutil.ReadFile(ofc.config.KnownModulesConfigPath)
	if err != nil {
		glog.Warningf("Failed to read configuration file %s: %v",
			ofc.config.KnownModulesConfigPath, err)
	}

	var knownModules []system.Module
	if content == nil {
		// Nothing could be read: continue with an empty known modules list.
		knownModules = []system.Module{}
	} else if err = json.Unmarshal(content, &knownModules); err != nil {
		glog.Warningf("Failed to retrieve known modules %v", err)
	}

	var gpuSupport int64
	thirdParty := []string{}
	for _, m := range modules {
		// nvidia modules only flag GPU support; they never count as unknown.
		if strings.Contains(m.ModuleName, "nvidia") {
			gpuSupport = 1
			continue
		}
		// Only out-of-tree or proprietary modules are candidates.
		if !m.OutOfTree && !m.Proprietary {
			continue
		}
		if system.ContainsModule(m.ModuleName, knownModules) {
			continue
		}
		thirdParty = append(thirdParty, m.ModuleName)
	}

	// Record UnknownModules first, then GPUSupport.
	if len(thirdParty) == 0 {
		ofc.osFeature.Record(map[string]string{featureLabel: "UnknownModules"}, 0)
	} else {
		ofc.osFeature.Record(map[string]string{
			featureLabel: "UnknownModules",
			valueLabel:   strings.Join(thirdParty, ","),
		}, 1)
	}
	ofc.osFeature.Record(map[string]string{featureLabel: "GPUSupport"}, gpuSupport)
}
// collect reads the kernel command line arguments and the kernel modules and
// records the OS features derived from each of them.
//
// It is a no-op on a nil collector or on a collector without a metric: the
// collector is only created when OS feature metrics are configured, and its
// metric only when a display name is set, yet the monitor loop calls collect
// unconditionally. Without this guard a nil receiver would panic as soon as
// the metric field is accessed.
//
// The process is terminated through glog.Fatalf when the command line
// arguments or the kernel modules cannot be retrieved.
func (ofc *osFeatureCollector) collect() {
	if ofc == nil || ofc.osFeature == nil {
		return
	}

	cmdlineArgs, err := system.CmdlineArgs()
	if err != nil {
		glog.Fatalf("Error retrieving cmdline args: %v", err)
	}
	ofc.recordFeaturesFromCmdline(cmdlineArgs)

	modules, err := system.Modules()
	if err != nil {
		glog.Fatalf("Error retrieving kernel modules: %v", err)
	}
	ofc.recordFeaturesFromModules(modules)
}

View File

@@ -38,13 +38,14 @@ func init() {
}
type systemStatsMonitor struct {
configPath string
config ssmtypes.SystemStatsConfig
cpuCollector *cpuCollector
diskCollector *diskCollector
hostCollector *hostCollector
memoryCollector *memoryCollector
tomb *tomb.Tomb
configPath string
config ssmtypes.SystemStatsConfig
cpuCollector *cpuCollector
diskCollector *diskCollector
hostCollector *hostCollector
memoryCollector *memoryCollector
osFeatureCollector *osFeatureCollector
tomb *tomb.Tomb
}
// NewSystemStatsMonitorOrDie creates a system stats monitor.
@@ -69,6 +70,8 @@ func NewSystemStatsMonitorOrDie(configPath string) types.Monitor {
glog.Fatalf("Failed to apply configuration for %q: %v", configPath, err)
}
glog.Infof("Error: %v", ssm.config)
err = ssm.config.Validate()
if err != nil {
glog.Fatalf("Failed to validate %s configuration %+v: %v", ssm.configPath, ssm.config, err)
@@ -86,6 +89,9 @@ func NewSystemStatsMonitorOrDie(configPath string) types.Monitor {
if len(ssm.config.MemoryConfig.MetricsConfigs) > 0 {
ssm.memoryCollector = NewMemoryCollectorOrDie(&ssm.config.MemoryConfig)
}
if len(ssm.config.OsFeatureConfig.MetricsConfigs) > 0 {
ssm.osFeatureCollector = NewOsFeatureCollectorOrDie(&ssm.config.OsFeatureConfig)
}
return &ssm
}
@@ -110,6 +116,7 @@ func (ssm *systemStatsMonitor) monitorLoop() {
ssm.diskCollector.collect()
ssm.hostCollector.collect()
ssm.memoryCollector.collect()
ssm.osFeatureCollector.collect()
}
for {
@@ -119,6 +126,7 @@ func (ssm *systemStatsMonitor) monitorLoop() {
ssm.diskCollector.collect()
ssm.hostCollector.collect()
ssm.memoryCollector.collect()
ssm.osFeatureCollector.collect()
case <-ssm.tomb.Stopping():
glog.Infof("System stats monitor stopped: %s", ssm.configPath)
return

View File

@@ -22,8 +22,9 @@ import (
)
var (
defaultInvokeIntervalString = (60 * time.Second).String()
defaultlsblkTimeoutString = (5 * time.Second).String()
defaultInvokeIntervalString = (60 * time.Second).String()
defaultlsblkTimeoutString = (5 * time.Second).String()
defaultKnownModulesConfigPath = "config/guestosconfig/known-modules.json"
)
type MetricConfig struct {
@@ -50,13 +51,19 @@ type MemoryStatsConfig struct {
MetricsConfigs map[string]MetricConfig `json:"metricsConfigs"`
}
// OSFeatureStatsConfig configures the collection of guest OS feature metrics.
type OSFeatureStatsConfig struct {
	// MetricsConfigs holds the configuration of each OS feature metric.
	MetricsConfigs map[string]MetricConfig `json:"metricsConfigs"`
	// KnownModulesConfigPath is the path to the file listing the known
	// modules. ApplyConfiguration sets it to defaultKnownModulesConfigPath
	// when it is left empty.
	KnownModulesConfigPath string `json:"knownModulesConfigPath"`
}
type SystemStatsConfig struct {
CPUConfig CPUStatsConfig `json:"cpu"`
DiskConfig DiskStatsConfig `json:"disk"`
HostConfig HostStatsConfig `json:"host"`
MemoryConfig MemoryStatsConfig `json:"memory"`
InvokeIntervalString string `json:"invokeInterval"`
InvokeInterval time.Duration `json:"-"`
CPUConfig CPUStatsConfig `json:"cpu"`
DiskConfig DiskStatsConfig `json:"disk"`
HostConfig HostStatsConfig `json:"host"`
MemoryConfig MemoryStatsConfig `json:"memory"`
OsFeatureConfig OSFeatureStatsConfig `json:"osFeature"`
InvokeIntervalString string `json:"invokeInterval"`
InvokeInterval time.Duration `json:"-"`
}
// ApplyConfiguration applies default configurations.
@@ -67,6 +74,9 @@ func (ssc *SystemStatsConfig) ApplyConfiguration() error {
if ssc.DiskConfig.LsblkTimeoutString == "" {
ssc.DiskConfig.LsblkTimeoutString = defaultlsblkTimeoutString
}
if ssc.OsFeatureConfig.KnownModulesConfigPath == "" {
ssc.OsFeatureConfig.KnownModulesConfigPath = defaultKnownModulesConfigPath
}
var err error
ssc.InvokeInterval, err = time.ParseDuration(ssc.InvokeIntervalString)

View File

@@ -43,6 +43,9 @@ func TestApplyConfiguration(t *testing.T) {
LsblkTimeout: 5 * time.Second,
LsblkTimeoutString: "5s",
},
OsFeatureConfig: OSFeatureStatsConfig{
KnownModulesConfigPath: "config/guestosconfig/known-modules.json",
},
InvokeIntervalString: "60s",
InvokeInterval: 60 * time.Second,
},
@@ -58,6 +61,9 @@ func TestApplyConfiguration(t *testing.T) {
LsblkTimeout: 5 * time.Second,
LsblkTimeoutString: "5s",
},
OsFeatureConfig: OSFeatureStatsConfig{
KnownModulesConfigPath: "config/guestosconfig/known-modules.json",
},
InvokeIntervalString: "1m0s",
InvokeInterval: 60 * time.Second,
},
@@ -72,6 +78,9 @@ func TestApplyConfiguration(t *testing.T) {
isError: true,
wantedConfig: SystemStatsConfig{
DiskConfig: DiskStatsConfig{},
OsFeatureConfig: OSFeatureStatsConfig{
KnownModulesConfigPath: "config/guestosconfig/known-modules.json",
},
},
},
}