From eb38b4b598fdf66ebb5b1f4b2ca648549a4d2883 Mon Sep 17 00:00:00 2001 From: varsha teratipally Date: Tue, 15 Dec 2020 17:31:10 +0000 Subject: [PATCH] added a new metric to retrieve os features like unknown modules --- config/guestosconfig/known-modules.json | 69 ++++++++ config/system-stats-monitor.json | 8 + .../stackdriver/stackdriver_exporter.go | 1 + pkg/systemstatsmonitor/README.md | 16 ++ pkg/systemstatsmonitor/labels.go | 6 + pkg/systemstatsmonitor/osfeature_collector.go | 155 ++++++++++++++++++ .../system_stats_monitor.go | 22 ++- pkg/systemstatsmonitor/types/config.go | 26 ++- pkg/systemstatsmonitor/types/config_test.go | 9 + pkg/util/metrics/metric.go | 1 + pkg/util/metrics/system/common.go | 9 + test/e2e/metriconly/metrics_test.go | 1 + 12 files changed, 308 insertions(+), 15 deletions(-) create mode 100644 config/guestosconfig/known-modules.json create mode 100644 pkg/systemstatsmonitor/osfeature_collector.go diff --git a/config/guestosconfig/known-modules.json b/config/guestosconfig/known-modules.json new file mode 100644 index 00000000..433873b5 --- /dev/null +++ b/config/guestosconfig/known-modules.json @@ -0,0 +1,69 @@ +[ + { "moduleName": "xt_MASQUERADE"}, + { "moduleName": "xt_addrtype"}, + { "moduleName": "iptable_nat"}, + { "moduleName": "nf_nat"}, + { "moduleName": "br_netfilter"}, + { "moduleName": "ip6table_filter"}, + { "moduleName": "ip6_tables"}, + { "moduleName": "aesni_intel"}, + { "moduleName": "glue_helper"}, + { "moduleName": "crypto_simd"}, + { "moduleName": "cryptd"}, + { "moduleName": "virtio_balloon"}, + { "moduleName": "loadpin_trigger"}, + { "moduleName":"ip6table_filter"}, + { "moduleName":"ip6_tables"}, + { "moduleName":"iptable_filter"}, + { "moduleName":"bpfilter"}, + { "moduleName":"nls_iso8859_1"}, + { "moduleName":"intel_rapl_msr"}, + { "moduleName":"intel_rapl_common"}, + { "moduleName":"sb_edac"}, + { "moduleName":"rapl"}, + { "moduleName":"input_leds"}, + { "moduleName":"serio_raw"}, + { "moduleName":"pvpanic"}, + { 
"moduleName":"mac_hid"}, + { "moduleName":"sch_fq_codel"}, + { "moduleName":"ib_iser"}, + { "moduleName":"rdma_cm"}, + { "moduleName":"iw_cm"}, + { "moduleName":"ib_cm"}, + { "moduleName":"ib_core"}, + { "moduleName":"iscsi_tcp"}, + { "moduleName":"libiscsi_tcp"}, + { "moduleName":"libiscsi"}, + { "moduleName":"scsi_transport_iscsi"}, + { "moduleName":"virtio_rng"}, + { "moduleName":"ip_tables"}, + { "moduleName":"x_tables"}, + { "moduleName":"autofs4"}, + { "moduleName":"btrfs"}, + { "moduleName":"zstd_compress"}, + { "moduleName":"raid10"}, + { "moduleName":"raid456"}, + { "moduleName":"async_raid6_recov"}, + { "moduleName":"async_memcpy"}, + { "moduleName":"async_pq"}, + { "moduleName":"async_xor"}, + { "moduleName":"async_tx"}, + { "moduleName":"xor"}, + { "moduleName":"raid6_pq"}, + { "moduleName":"raid1"}, + { "moduleName":"raid0"}, + { "moduleName":"multipath"}, + { "moduleName":"linear"}, + { "moduleName":"crct10dif_pclmul"}, + { "moduleName":"crc32_pclmul"}, + { "moduleName":"ghash_clmulni_intel"}, + { "moduleName":"aesni_intel"}, + { "moduleName":"crypto_simd"}, + { "moduleName":"cryptd"}, + { "moduleName":"glue_helper"}, + { "moduleName":"psmouse"}, + { "moduleName":"virtio_net"}, + { "moduleName":"net_failover"}, + { "moduleName": "failover"}, + { "moduleName":"i2c_piix4"} +] \ No newline at end of file diff --git a/config/system-stats-monitor.json b/config/system-stats-monitor.json index 158692f5..0a64b2a7 100644 --- a/config/system-stats-monitor.json +++ b/config/system-stats-monitor.json @@ -75,5 +75,13 @@ } } }, + "osFeature": { + "metricsConfigs": { + "system/os_feature": { + "displayName": "system/os_feature" + } + }, + "KnownModulesConfigPath": "config/guestosconfig/known-modules.json" + }, "invokeInterval": "60s" } diff --git a/pkg/exporters/stackdriver/stackdriver_exporter.go b/pkg/exporters/stackdriver/stackdriver_exporter.go index a13b5792..f3d53e41 100644 --- a/pkg/exporters/stackdriver/stackdriver_exporter.go +++ 
b/pkg/exporters/stackdriver/stackdriver_exporter.go @@ -68,6 +68,7 @@ var NPDMetricToSDMetric = map[metrics.MetricID]string{ metrics.MemoryUnevictableUsedID: "compute.googleapis.com/guest/memory/unevictable_used", metrics.ProblemCounterID: "compute.googleapis.com/guest/system/problem_count", metrics.ProblemGaugeID: "compute.googleapis.com/guest/system/problem_state", + metrics.OSFeatureID: "compute.googleapis.com/guest/system/os_feature_enabled", } func getMetricTypeConversionFunction(customMetricPrefix string) func(*view.View) string { diff --git a/pkg/systemstatsmonitor/README.md b/pkg/systemstatsmonitor/README.md index f0aee8dc..e100fd7c 100644 --- a/pkg/systemstatsmonitor/README.md +++ b/pkg/systemstatsmonitor/README.md @@ -72,3 +72,19 @@ Below metrics are collected from `memory` component: * `memory_page_cache_used`: Page cache memory usage, in Bytes. Memory usage state is reported under the `state` metric label (e.g. `active`, `inactive`). `active` means the memory has been used more recently and usually not reclaimed until needed. Summing values of all states yields the total page cache memory used. * `memory_unevictable_used`: [Unevictable memory][/proc doc] usage, in Bytes. * `memory_dirty_used`: Dirty pages usage, in Bytes. Memory usage state is reported under the `state` metric label (e.g. `dirty`, `writeback`). `dirty` means the memory is waiting to be written back to disk, and `writeback` means the memory is actively being written back to disk. + +### OS features + +The guest OS features such as KTD kernel, GPU support are collected. Below are the OS +features collected: + +* `KTD`: Enabled, if KTD feature is enabled on OS +* `UnifiedCgroupHierarchy`: Enabled, if Unified hierarchy is enabled on OS. +* `KernelModuleIntegrity`: Enabled, if load pin security is enabled and modules are signed. +* `GPUSupport`: Enabled, if OS has GPU drivers installed like nvidia. +* `UnknownModules`: Enabled, if the OS has third party kernel modules installed. 
+UnknownModules are derived by comparing the modules listed in /proc/modules against known-modules.json. + +One option is available: +`knownModulesConfigPath`: The path to the file that lists the known (default) +modules. By default, the path is set to `config/guestosconfig/known-modules.json`. \ No newline at end of file diff --git a/pkg/systemstatsmonitor/labels.go b/pkg/systemstatsmonitor/labels.go index 4c1123eb..1b49f626 100644 --- a/pkg/systemstatsmonitor/labels.go +++ b/pkg/systemstatsmonitor/labels.go @@ -30,3 +30,9 @@ const fsTypeLabel = "fs_type" // mountOptionLabel labels the mount_options of the monitored disk device const mountOptionLabel = "mount_option" + +// featureLabel labels the features of the guest os system +const featureLabel = "os_feature" + +// valueLabel labels the value for the features of the guest os system if required +const valueLabel = "value" diff --git a/pkg/systemstatsmonitor/osfeature_collector.go b/pkg/systemstatsmonitor/osfeature_collector.go new file mode 100644 index 00000000..969769c6 --- /dev/null +++ b/pkg/systemstatsmonitor/osfeature_collector.go @@ -0,0 +1,155 @@ +/* +Copyright 2020 The Kubernetes Authors All rights reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package systemstatsmonitor + +import ( + "encoding/json" + "io/ioutil" + "strconv" + "strings" + + "github.com/golang/glog" + ssmtypes "k8s.io/node-problem-detector/pkg/systemstatsmonitor/types" + "k8s.io/node-problem-detector/pkg/util/metrics" + "k8s.io/node-problem-detector/pkg/util/metrics/system" +) + +type osFeatureCollector struct { + config *ssmtypes.OSFeatureStatsConfig + osFeature *metrics.Int64Metric +} + +func NewOsFeatureCollectorOrDie(osFeatureConfig *ssmtypes.OSFeatureStatsConfig) *osFeatureCollector { + oc := osFeatureCollector{config: osFeatureConfig} + var err error + // Use metrics.LastValue aggregation method to ensure the metric is a gauge metric. + if osFeatureConfig.MetricsConfigs["system/os_feature"].DisplayName != "" { + oc.osFeature, err = metrics.NewInt64Metric( + metrics.OSFeatureID, + osFeatureConfig.MetricsConfigs[string(metrics.OSFeatureID)].DisplayName, + "OS Features like GPU support, KTD kernel, third party modules as unknown modules. 1 if the feature is enabled and 0, if disabled.", + "1", + metrics.LastValue, + []string{featureLabel, valueLabel}) + if err != nil { + glog.Fatalf("Error initializing metric for system/os_feature: %v", err) + } + } + return &oc +} + +// recordFeaturesFromCmdline records the guest OS features that can be derived +// from the /proc/cmdline +// The following features are recorded: +// 1. KTD kernel based on csm.enabled value +// 2. UnifiedCgroupHierarchy based on systemd.unified_cgroup_hierarchy +// 3. KernelModuleIntegrity based on loadpin.enabled and module.sig_enforce both being set to 1. +func (ofc *osFeatureCollector) recordFeaturesFromCmdline(cmdlineArgs []system.CmdlineArg) { + var featuresMap = map[string]int64{ + "KTD": 0, + "UnifiedCgroupHierarchy": 0, + "ModuleSigned": 0, + "LoadPinEnabled": 0, + } + for _, cmdlineArg := range cmdlineArgs { + // record KTD feature. 
+ if cmdlineArg.Key == "csm.enabled" { + featuresMap["KTD"], _ = strconv.ParseInt(cmdlineArg.Value, 10, 64) + } + // record UnifiedCgroupHierarchy feature. + if cmdlineArg.Key == "systemd.unified_cgroup_hierarchy" { + featuresMap["UnifiedCgroupHierarchy"], _ = strconv.ParseInt(cmdlineArg.Value, 10, 64) + } + // record KernelModuleIntegrity feature. + if cmdlineArg.Key == "module.sig_enforce" { + featuresMap["ModuleSigned"], _ = strconv.ParseInt(cmdlineArg.Value, 10, 64) + } + if cmdlineArg.Key == "loadpin.enabled" { + featuresMap["LoadPinEnabled"], _ = strconv.ParseInt(cmdlineArg.Value, 10, 64) + } + } + // Record the feature values. + ofc.osFeature.Record(map[string]string{featureLabel: "KTD"}, featuresMap["KTD"]) + ofc.osFeature.Record(map[string]string{featureLabel: "UnifiedCgroupHierarchy"}, featuresMap["UnifiedCgroupHierarchy"]) + if featuresMap["ModuleSigned"] == 1 && featuresMap["LoadPinEnabled"] == 1 { + ofc.osFeature.Record(map[string]string{featureLabel: "KernelModuleIntegrity"}, 1) + } else { + ofc.osFeature.Record(map[string]string{featureLabel: "KernelModuleIntegrity"}, 0) + } +} + +// recordFeaturesFromModules records the guest OS features that can be derived +// from the /proc/modules +// The following features are recorded: +// 1. GPUSupport based on the presence of nvidia module +// 2. UnknownModules are tracked based on the presence of third-party kernel modules. +func (ofc *osFeatureCollector) recordFeaturesFromModules(modules []system.Module) { + // Collect known modules (default modules based on guest OS present in known-modules.json) + var knownModules []system.Module + f, err := ioutil.ReadFile(ofc.config.KnownModulesConfigPath) + if err != nil { + glog.Warningf("Failed to read configuration file %s: %v", + ofc.config.KnownModulesConfigPath, err) + } + // When the known modules file cannot be read, the known modules list is left empty, + // so no out-of-tree or proprietary module is treated as a known (default) module. 
+ if f != nil { + err = json.Unmarshal(f, &knownModules) + if err != nil { + glog.Warningf("Failed to retrieve known modules %v", err) + } + } else { + knownModules = []system.Module{} + } + + var hasGPUSupport = 0 + unknownModules := []string{} + + // Collect UnknownModules and check GPUSupport + for _, module := range modules { + // if the module has nvidia modules, then the hasGPUSupport is set. + if strings.Contains(module.ModuleName, "nvidia") { + hasGPUSupport = 1 + } else { + if module.OutOfTree || module.Proprietary { + if !system.ContainsModule(module.ModuleName, knownModules) { + unknownModules = append(unknownModules, module.ModuleName) + } + } + } + } + // record the UnknownModules and GPUSupport + if len(unknownModules) > 0 { + ofc.osFeature.Record(map[string]string{featureLabel: "UnknownModules", + valueLabel: strings.Join(unknownModules, ",")}, 1) + } else { + ofc.osFeature.Record(map[string]string{featureLabel: "UnknownModules"}, + 0) + } + ofc.osFeature.Record(map[string]string{featureLabel: "GPUSupport"}, + int64(hasGPUSupport)) +} + +func (ofc *osFeatureCollector) collect() { + cmdlineArgs, err := system.CmdlineArgs() + if err != nil { + glog.Fatalf("Error retrieving cmdline args: %v", err) + } + ofc.recordFeaturesFromCmdline(cmdlineArgs) + modules, err := system.Modules() + if err != nil { + glog.Fatalf("Error retrieving kernel modules: %v", err) + } + ofc.recordFeaturesFromModules(modules) +} diff --git a/pkg/systemstatsmonitor/system_stats_monitor.go b/pkg/systemstatsmonitor/system_stats_monitor.go index f43576b2..10d8e2ed 100644 --- a/pkg/systemstatsmonitor/system_stats_monitor.go +++ b/pkg/systemstatsmonitor/system_stats_monitor.go @@ -38,13 +38,14 @@ func init() { } type systemStatsMonitor struct { - configPath string - config ssmtypes.SystemStatsConfig - cpuCollector *cpuCollector - diskCollector *diskCollector - hostCollector *hostCollector - memoryCollector *memoryCollector - tomb *tomb.Tomb + configPath string + config 
ssmtypes.SystemStatsConfig + cpuCollector *cpuCollector + diskCollector *diskCollector + hostCollector *hostCollector + memoryCollector *memoryCollector + osFeatureCollector *osFeatureCollector + tomb *tomb.Tomb } // NewSystemStatsMonitorOrDie creates a system stats monitor. @@ -69,6 +70,8 @@ func NewSystemStatsMonitorOrDie(configPath string) types.Monitor { glog.Fatalf("Failed to apply configuration for %q: %v", configPath, err) } + glog.Infof("Error: %v", ssm.config) + err = ssm.config.Validate() if err != nil { glog.Fatalf("Failed to validate %s configuration %+v: %v", ssm.configPath, ssm.config, err) @@ -86,6 +89,9 @@ func NewSystemStatsMonitorOrDie(configPath string) types.Monitor { if len(ssm.config.MemoryConfig.MetricsConfigs) > 0 { ssm.memoryCollector = NewMemoryCollectorOrDie(&ssm.config.MemoryConfig) } + if len(ssm.config.OsFeatureConfig.MetricsConfigs) > 0 { + ssm.osFeatureCollector = NewOsFeatureCollectorOrDie(&ssm.config.OsFeatureConfig) + } return &ssm } @@ -110,6 +116,7 @@ func (ssm *systemStatsMonitor) monitorLoop() { ssm.diskCollector.collect() ssm.hostCollector.collect() ssm.memoryCollector.collect() + ssm.osFeatureCollector.collect() } for { @@ -119,6 +126,7 @@ func (ssm *systemStatsMonitor) monitorLoop() { ssm.diskCollector.collect() ssm.hostCollector.collect() ssm.memoryCollector.collect() + ssm.osFeatureCollector.collect() case <-ssm.tomb.Stopping(): glog.Infof("System stats monitor stopped: %s", ssm.configPath) return diff --git a/pkg/systemstatsmonitor/types/config.go b/pkg/systemstatsmonitor/types/config.go index bb4496dc..7d0e338a 100644 --- a/pkg/systemstatsmonitor/types/config.go +++ b/pkg/systemstatsmonitor/types/config.go @@ -22,8 +22,9 @@ import ( ) var ( - defaultInvokeIntervalString = (60 * time.Second).String() - defaultlsblkTimeoutString = (5 * time.Second).String() + defaultInvokeIntervalString = (60 * time.Second).String() + defaultlsblkTimeoutString = (5 * time.Second).String() + defaultKnownModulesConfigPath = 
"config/guestosconfig/known-modules.json" ) type MetricConfig struct { @@ -50,13 +51,19 @@ type MemoryStatsConfig struct { MetricsConfigs map[string]MetricConfig `json:"metricsConfigs"` } +type OSFeatureStatsConfig struct { + MetricsConfigs map[string]MetricConfig `json:"metricsConfigs"` + KnownModulesConfigPath string `json:"knownModulesConfigPath"` +} + type SystemStatsConfig struct { - CPUConfig CPUStatsConfig `json:"cpu"` - DiskConfig DiskStatsConfig `json:"disk"` - HostConfig HostStatsConfig `json:"host"` - MemoryConfig MemoryStatsConfig `json:"memory"` - InvokeIntervalString string `json:"invokeInterval"` - InvokeInterval time.Duration `json:"-"` + CPUConfig CPUStatsConfig `json:"cpu"` + DiskConfig DiskStatsConfig `json:"disk"` + HostConfig HostStatsConfig `json:"host"` + MemoryConfig MemoryStatsConfig `json:"memory"` + OsFeatureConfig OSFeatureStatsConfig `json:"osFeature"` + InvokeIntervalString string `json:"invokeInterval"` + InvokeInterval time.Duration `json:"-"` } // ApplyConfiguration applies default configurations. 
@@ -67,6 +74,9 @@ func (ssc *SystemStatsConfig) ApplyConfiguration() error { if ssc.DiskConfig.LsblkTimeoutString == "" { ssc.DiskConfig.LsblkTimeoutString = defaultlsblkTimeoutString } + if ssc.OsFeatureConfig.KnownModulesConfigPath == "" { + ssc.OsFeatureConfig.KnownModulesConfigPath = defaultKnownModulesConfigPath + } var err error ssc.InvokeInterval, err = time.ParseDuration(ssc.InvokeIntervalString) diff --git a/pkg/systemstatsmonitor/types/config_test.go b/pkg/systemstatsmonitor/types/config_test.go index dc6382f8..31059b9a 100644 --- a/pkg/systemstatsmonitor/types/config_test.go +++ b/pkg/systemstatsmonitor/types/config_test.go @@ -43,6 +43,9 @@ func TestApplyConfiguration(t *testing.T) { LsblkTimeout: 5 * time.Second, LsblkTimeoutString: "5s", }, + OsFeatureConfig: OSFeatureStatsConfig{ + KnownModulesConfigPath: "config/guestosconfig/known-modules.json", + }, InvokeIntervalString: "60s", InvokeInterval: 60 * time.Second, }, @@ -58,6 +61,9 @@ func TestApplyConfiguration(t *testing.T) { LsblkTimeout: 5 * time.Second, LsblkTimeoutString: "5s", }, + OsFeatureConfig: OSFeatureStatsConfig{ + KnownModulesConfigPath: "config/guestosconfig/known-modules.json", + }, InvokeIntervalString: "1m0s", InvokeInterval: 60 * time.Second, }, @@ -72,6 +78,9 @@ func TestApplyConfiguration(t *testing.T) { isError: true, wantedConfig: SystemStatsConfig{ DiskConfig: DiskStatsConfig{}, + OsFeatureConfig: OSFeatureStatsConfig{ + KnownModulesConfigPath: "config/guestosconfig/known-modules.json", + }, }, }, } diff --git a/pkg/util/metrics/metric.go b/pkg/util/metrics/metric.go index 55984b47..cbf816cd 100644 --- a/pkg/util/metrics/metric.go +++ b/pkg/util/metrics/metric.go @@ -41,6 +41,7 @@ const ( MemoryPageCacheUsedID MetricID = "memory/page_cache_used" MemoryUnevictableUsedID MetricID = "memory/unevictable_used" MemoryDirtyUsedID MetricID = "memory/dirty_used" + OSFeatureID MetricID = "system/os_feature" ) var MetricMap MetricMapping diff --git a/pkg/util/metrics/system/common.go 
b/pkg/util/metrics/system/common.go index 6676c694..949b5d07 100644 --- a/pkg/util/metrics/system/common.go +++ b/pkg/util/metrics/system/common.go @@ -36,3 +36,12 @@ func ReadFileIntoLines(filename string) ([]string, error) { } return result, nil } + +func ContainsModule(key string, values []Module) bool { + for _, val := range values { + if val.ModuleName == key { + return true + } + } + return false +} diff --git a/test/e2e/metriconly/metrics_test.go b/test/e2e/metriconly/metrics_test.go index aa382901..4577597d 100644 --- a/test/e2e/metriconly/metrics_test.go +++ b/test/e2e/metriconly/metrics_test.go @@ -93,6 +93,7 @@ var _ = ginkgo.Describe("NPD should export Prometheus metrics.", func() { assertMetricExist(gotMetrics, "memory_unevictable_used", map[string]string{}, true) assertMetricExist(gotMetrics, "memory_dirty_used", map[string]string{}, false) assertMetricExist(gotMetrics, "host_uptime", map[string]string{}, false) + assertMetricExist(gotMetrics, "system_os_feature", map[string]string{}, false) }) ginkgo.It("NPD should not report any problem", func() {