From 83b09277f00e600bcc4cfef5da0a5b408a6a6516 Mon Sep 17 00:00:00 2001
From: Xuewei Zhang <xueweiz@google.com>
Date: Tue, 28 Jan 2020 23:59:21 -0800
Subject: [PATCH] Collect more cpu/disk/memory metrics

---
 config/system-stats-monitor.json              |  44 ++++
 .../stackdriver/stackdriver_exporter.go       |  24 +-
 pkg/systemstatsmonitor/README.md              |  52 +++-
 pkg/systemstatsmonitor/cpu_collector.go       | 144 +++++++++++
 pkg/systemstatsmonitor/disk_collector.go      | 229 ++++++++++++++----
 pkg/systemstatsmonitor/labels.go              |  26 ++
 pkg/systemstatsmonitor/memory_collector.go    | 143 +++++++++++
 .../system_stats_monitor.go                   |  22 +-
 pkg/systemstatsmonitor/types/config.go        |  18 +-
 pkg/util/metrics/metric.go                    |  24 +-
 10 files changed, 655 insertions(+), 71 deletions(-)
 create mode 100644 pkg/systemstatsmonitor/cpu_collector.go
 create mode 100644 pkg/systemstatsmonitor/labels.go
 create mode 100644 pkg/systemstatsmonitor/memory_collector.go

diff --git a/config/system-stats-monitor.json b/config/system-stats-monitor.json
index 88faa162..8d15877d 100644
--- a/config/system-stats-monitor.json
+++ b/config/system-stats-monitor.json
@@ -1,4 +1,14 @@
 {
+	"cpu": {
+		"metricsConfigs": {
+			"cpu/runnable_task_count": {
+				"displayName": "cpu/runnable_task_count"
+			},
+			"cpu/usage_time": {
+				"displayName": "cpu/usage_time"
+			}
+		}
+	},
 	"disk": {
 		"metricsConfigs": {
 			"disk/io_time": {
@@ -9,6 +19,21 @@
 			},
 			"disk/avg_queue_len": {
 				"displayName": "disk/avg_queue_len"
+			},
+			"disk/operation_count": {
+				"displayName": "disk/operation_count"
+			},
+			"disk/merged_operation_count": {
+				"displayName": "disk/merged_operation_count"
+			},
+			"disk/operation_bytes_count": {
+				"displayName": "disk/operation_bytes_count"
+			},
+			"disk/operation_time": {
+				"displayName": "disk/operation_time"
+			},
+			"disk/bytes_used": {
+				"displayName": "disk/bytes_used"
 			}
 		},
 		"includeRootBlk": true,
@@ -22,5 +47,24 @@
 			}
 		}
 	},
+	"memory": {
+		"metricsConfigs": {
+			"memory/bytes_used": {
+				"displayName": "memory/bytes_used"
+			},
+			"memory/anonymous_used": {
+				"displayName": "memory/anonymous_used"
+			},
+			"memory/page_cache_used": {
+				"displayName": "memory/page_cache_used"
+			},
+			"memory/unevictable_used": {
+				"displayName": "memory/unevictable_used"
+			},
+			"memory/dirty_used": {
+				"displayName": "memory/dirty_used"
+			}
+		}
+	},
 	"invokeInterval": "60s"
 }
diff --git a/pkg/exporters/stackdriver/stackdriver_exporter.go b/pkg/exporters/stackdriver/stackdriver_exporter.go
index 633a2c57..68993863 100644
--- a/pkg/exporters/stackdriver/stackdriver_exporter.go
+++ b/pkg/exporters/stackdriver/stackdriver_exporter.go
@@ -47,12 +47,24 @@ func init() {
 const exporterName = "stackdriver"
 
 var NPDMetricToSDMetric = map[metrics.MetricID]string{
-	metrics.HostUptimeID:      "compute.googleapis.com/guest/system/uptime",
-	metrics.ProblemCounterID:  "compute.googleapis.com/guest/system/problem_count",
-	metrics.ProblemGaugeID:    "compute.googleapis.com/guest/system/problem_state",
-	metrics.DiskAvgQueueLenID: "compute.googleapis.com/guest/disk/queue_length",
-	metrics.DiskIOTimeID:      "compute.googleapis.com/guest/disk/io_time",
-	metrics.DiskWeightedIOID:  "compute.googleapis.com/guest/disk/weighted_io_time",
+	metrics.CPURunnableTaskCountID:  "compute.googleapis.com/guest/cpu/runnable_task_count",
+	metrics.CPUUsageTimeID:          "compute.googleapis.com/guest/cpu/usage_time",
+	metrics.DiskAvgQueueLenID:       "compute.googleapis.com/guest/disk/queue_length",
+	metrics.DiskBytesUsedID:         "compute.googleapis.com/guest/disk/bytes_used",
+	metrics.DiskIOTimeID:            "compute.googleapis.com/guest/disk/io_time",
+	metrics.DiskMergedOpsCountID:    "compute.googleapis.com/guest/disk/merged_operation_count",
+	metrics.DiskOpsBytesID:          "compute.googleapis.com/guest/disk/operation_bytes_count",
+	metrics.DiskOpsCountID:          "compute.googleapis.com/guest/disk/operation_count",
+	metrics.DiskOpsTimeID:           "compute.googleapis.com/guest/disk/operation_time",
+	metrics.DiskWeightedIOID:        "compute.googleapis.com/guest/disk/weighted_io_time",
+	metrics.HostUptimeID:            "compute.googleapis.com/guest/system/uptime",
+	metrics.MemoryAnonymousUsedID:   "compute.googleapis.com/guest/memory/anonymous_used",
+	metrics.MemoryBytesUsedID:       "compute.googleapis.com/guest/memory/bytes_used",
+	metrics.MemoryDirtyUsedID:       "compute.googleapis.com/guest/memory/dirty_used",
+	metrics.MemoryPageCacheUsedID:   "compute.googleapis.com/guest/memory/page_cache_used",
+	metrics.MemoryUnevictableUsedID: "compute.googleapis.com/guest/memory/unevictable_used",
+	metrics.ProblemCounterID:        "compute.googleapis.com/guest/system/problem_count",
+	metrics.ProblemGaugeID:          "compute.googleapis.com/guest/system/problem_state",
 }
 
 func getMetricTypeConversionFunction(customMetricPrefix string) func(*view.View) string {
diff --git a/pkg/systemstatsmonitor/README.md b/pkg/systemstatsmonitor/README.md
index 2ec18b68..c378a910 100644
--- a/pkg/systemstatsmonitor/README.md
+++ b/pkg/systemstatsmonitor/README.md
@@ -4,27 +4,67 @@
 
 Currently supported components are:
 
+* cpu
 * disk
+* host
+* memory
 
 See example config file [here](https://github.com/kubernetes/node-problem-detector/blob/master/config/system-stats-monitor.json).
 
+By setting the `metricsConfigs` field and `displayName` field ([example](https://github.com/kubernetes/node-problem-detector/blob/master/config/system-stats-monitor.json)), you can specify the list of metrics to be collected, and their display names on the Prometheus scraping endpoint.
+
 ## Detailed Configuration Options
 
 ### Global Configurations
 
 Data collection period can be specified globally in the config file, see `invokeInterval` at the [example](https://github.com/kubernetes/node-problem-detector/blob/master/config/system-stats-monitor.json).
 
+### CPU
+
+Below metrics are collected from `cpu` component:
+
+* `cpu_runnable_task_count`: The average number of runnable tasks in the run-queue during the last minute. Collected from [`/proc/loadavg`][/proc doc].
+* `cpu_usage_time`: CPU usage, in seconds. The [CPU state][/proc doc] for the corresponding usage is reported under the `state` metric label (e.g. `user`, `nice`, `system`...).
+
+[/proc doc]: http://man7.org/linux/man-pages/man5/proc.5.html
+
 ### Disk
 
 Below metrics are collected from `disk` component:
 
-* `disk/io_time`: [# of milliseconds spent doing I/Os on this device](https://www.kernel.org/doc/Documentation/iostats.txt)
-* `disk/weighted_io`: [# of milliseconds spent doing I/Os on this device](https://www.kernel.org/doc/Documentation/iostats.txt)
-* `disk/avg_queue_len`: [average # of requests that was waiting in queue or being serviced during the last `invokeInterval`](https://www.xaprb.com/blog/2010/01/09/how-linux-iostat-computes-its-results/)
+* `disk_io_time`: [# of milliseconds spent doing I/Os on this device][iostat doc]
+* `disk_weighted_io`: [weighted # of milliseconds spent doing I/Os on this device][iostat doc]
+* `disk_avg_queue_len`: [average # of requests that was waiting in queue or being serviced during the last `invokeInterval`](https://www.xaprb.com/blog/2010/01/09/how-linux-iostat-computes-its-results/)
+* `disk_operation_count`: [# of reads/writes completed][iostat doc]
+* `disk_merged_operation_count`: [# of reads/writes merged][iostat doc]
+* `disk_operation_bytes_count`: # of Bytes read from or written to this device
+* `disk_operation_time`: [# of milliseconds spent reading/writing][iostat doc]
+* `disk_bytes_used`: Disk usage in Bytes. The usage state is reported under the `state` metric label (e.g. `used`, `free`). Summing values of all states yields the disk size.
 
-By setting the `metricsConfigs` field and `displayName` field ([example](https://github.com/kubernetes/node-problem-detector/blob/master/config/system-stats-monitor.json)), you can specify the list of metrics to be collected, and their display names on the Prometheus scaping endpoint. The name of the disk block device will be reported in the `device` metrics label.
+The name of the disk block device is reported in the `device_name` metric label (e.g. `sda`).
+
+For the metrics that separate read/write operations, the IO direction is reported in the `direction` metric label (e.g. `read`, `write`).
 
 And a few other options:
-* `includeRootBlk`: When set to `true`, add all block devices that's [not a slave or holder device](http://man7.org/linux/man-pages/man8/lsblk.8.html) to the list of disks that System Stats Monitor collects metrics from. When set to `false`, do not modify the list of disks that System Stats Monitor collects metrics from.
+* `includeRootBlk`: When set to `true`, add all block devices that are [not a slave or holder device][lsblk doc] to the list of disks that System Stats Monitor collects metrics from. When set to `false`, do not modify the list of disks that System Stats Monitor collects metrics from.
 * `includeAllAttachedBlk`: When set to `true`, add all currently attached block devices to the list of disks that System Stats Monitor collects metrics from. When set to `false`, do not modify the list of disks that System Stats Monitor collects metrics from.
-* `lsblkTimeout`: System Stats Monitor uses [`lsblk`](http://man7.org/linux/man-pages/man8/lsblk.8.html) to retrieve block devices information. This option sets the timeout for calling `lsblk` commands.
+* `lsblkTimeout`: System Stats Monitor uses [`lsblk`][lsblk doc] to retrieve block devices information. This option sets the timeout for calling `lsblk` commands.
+
+[iostat doc]: https://www.kernel.org/doc/Documentation/iostats.txt
+[lsblk doc]: http://man7.org/linux/man-pages/man8/lsblk.8.html
+
+### Host
+
+Below metrics are collected from `host` component:
+
+* `host_uptime`: The uptime of the operating system, in seconds. OS version and kernel version are reported under the `os_version` and `kernel_version` metric labels (e.g. `cos 73-11647.217.0`, `4.14.127+`).
+
+### Memory
+
+Below metrics are collected from `memory` component:
+
+* `memory_bytes_used`: Memory usage by each memory state, in Bytes. The memory state is reported under the `state` metric label (e.g. `free`, `used`, `buffered`...). Summing values of all states yields the total memory of the node.
+* `memory_anonymous_used`: Anonymous memory usage, in Bytes. Memory usage state is reported under the `state` metric label (e.g. `active`, `inactive`). `active` means the memory has been used more recently and usually not swapped until needed. Summing values of all states yields the total anonymous memory used.
+* `memory_page_cache_used`: Page cache memory usage, in Bytes. Memory usage state is reported under the `state` metric label (e.g. `active`, `inactive`). `active` means the memory has been used more recently and usually not reclaimed until needed. Summing values of all states yields the total page cache memory used.
+* `memory_unevictable_used`: [Unevictable memory][/proc doc] usage, in Bytes.
+* `memory_dirty_used`: Dirty pages usage, in Bytes. Memory usage state is reported under the `state` metric label (e.g. `dirty`, `writeback`). `dirty` means the memory is waiting to be written back to disk, and `writeback` means the memory is actively being written back to disk.
diff --git a/pkg/systemstatsmonitor/cpu_collector.go b/pkg/systemstatsmonitor/cpu_collector.go
new file mode 100644
index 00000000..ba940940
--- /dev/null
+++ b/pkg/systemstatsmonitor/cpu_collector.go
@@ -0,0 +1,144 @@
+/*
+Copyright 2020 The Kubernetes Authors All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package systemstatsmonitor
+
+import (
+	"github.com/golang/glog"
+	"github.com/shirou/gopsutil/cpu"
+	"github.com/shirou/gopsutil/load"
+
+	ssmtypes "k8s.io/node-problem-detector/pkg/systemstatsmonitor/types"
+	"k8s.io/node-problem-detector/pkg/util/metrics"
+)
+
+// clockTick is the ratio between 1 second and 1 USER_HZ (a clock tick).
+//
+// CLK_TCK is 100 in most architectures. If NPD ever runs on a super special architecture,
+// we can work out a way to detect the clock tick on that architecture (might require
+// cross-compilation with C library or parsing kernel ABIs). For now, it's not worth the
+// complexity.
+//
+// See documentation at http://man7.org/linux/man-pages/man5/proc.5.html
+const clockTick float64 = 100.0
+
+type cpuCollector struct {
+	mRunnableTaskCount *metrics.Float64Metric
+	mUsageTime         *metrics.Float64Metric
+
+	config *ssmtypes.CPUStatsConfig
+
+	lastUsageTime map[string]float64
+}
+
+func NewCPUCollectorOrDie(cpuConfig *ssmtypes.CPUStatsConfig) *cpuCollector {
+	cc := cpuCollector{config: cpuConfig}
+
+	var err error
+
+	cc.mRunnableTaskCount, err = metrics.NewFloat64Metric(
+		metrics.CPURunnableTaskCountID,
+		cpuConfig.MetricsConfigs[string(metrics.CPURunnableTaskCountID)].DisplayName,
+		"The average number of runnable tasks in the run-queue during the last minute",
+		"1",
+		metrics.LastValue,
+		[]string{})
+	if err != nil {
+		glog.Fatalf("Error initializing metric for %q: %v", metrics.CPURunnableTaskCountID, err)
+	}
+
+	cc.mUsageTime, err = metrics.NewFloat64Metric(
+		metrics.CPUUsageTimeID,
+		cpuConfig.MetricsConfigs[string(metrics.CPUUsageTimeID)].DisplayName,
+		"CPU usage, in seconds",
+		"s",
+		metrics.Sum,
+		[]string{stateLabel})
+	if err != nil {
+		glog.Fatalf("Error initializing metric for %q: %v", metrics.CPUUsageTimeID, err)
+	}
+
+	cc.lastUsageTime = make(map[string]float64)
+
+	return &cc
+}
+
+func (cc *cpuCollector) recordLoad() {
+	if cc.mRunnableTaskCount == nil {
+		return
+	}
+
+	loadAvg, err := load.Avg()
+	if err != nil {
+		glog.Errorf("Failed to retrieve average CPU load: %v", err)
+		return
+	}
+
+	cc.mRunnableTaskCount.Record(map[string]string{}, loadAvg.Load1)
+}
+
+func (cc *cpuCollector) recordUsage() {
+	if cc.mUsageTime == nil {
+		return
+	}
+
+	// Set percpu=false to get aggregated usage from all CPUs.
+	timersStats, err := cpu.Times(false)
+	if err != nil {
+		glog.Errorf("Failed to retrieve CPU timers stat: %v", err)
+		return
+	}
+	timersStat := timersStats[0]
+
+	cc.mUsageTime.Record(map[string]string{stateLabel: "user"}, clockTick*timersStat.User-cc.lastUsageTime["user"])
+	cc.lastUsageTime["user"] = clockTick * timersStat.User
+
+	cc.mUsageTime.Record(map[string]string{stateLabel: "system"}, clockTick*timersStat.System-cc.lastUsageTime["system"])
+	cc.lastUsageTime["system"] = clockTick * timersStat.System
+
+	cc.mUsageTime.Record(map[string]string{stateLabel: "idle"}, clockTick*timersStat.Idle-cc.lastUsageTime["idle"])
+	cc.lastUsageTime["idle"] = clockTick * timersStat.Idle
+
+	cc.mUsageTime.Record(map[string]string{stateLabel: "nice"}, clockTick*timersStat.Nice-cc.lastUsageTime["nice"])
+	cc.lastUsageTime["nice"] = clockTick * timersStat.Nice
+
+	cc.mUsageTime.Record(map[string]string{stateLabel: "iowait"}, clockTick*timersStat.Iowait-cc.lastUsageTime["iowait"])
+	cc.lastUsageTime["iowait"] = clockTick * timersStat.Iowait
+
+	cc.mUsageTime.Record(map[string]string{stateLabel: "irq"}, clockTick*timersStat.Irq-cc.lastUsageTime["irq"])
+	cc.lastUsageTime["irq"] = clockTick * timersStat.Irq
+
+	cc.mUsageTime.Record(map[string]string{stateLabel: "softirq"}, clockTick*timersStat.Softirq-cc.lastUsageTime["softirq"])
+	cc.lastUsageTime["softirq"] = clockTick * timersStat.Softirq
+
+	cc.mUsageTime.Record(map[string]string{stateLabel: "steal"}, clockTick*timersStat.Steal-cc.lastUsageTime["steal"])
+	cc.lastUsageTime["steal"] = clockTick * timersStat.Steal
+
+	cc.mUsageTime.Record(map[string]string{stateLabel: "guest"}, clockTick*timersStat.Guest-cc.lastUsageTime["guest"])
+	cc.lastUsageTime["guest"] = clockTick * timersStat.Guest
+
+	cc.mUsageTime.Record(map[string]string{stateLabel: "guest_nice"}, clockTick*timersStat.GuestNice-cc.lastUsageTime["guest_nice"])
+	cc.lastUsageTime["guest_nice"] = clockTick * timersStat.GuestNice
+}
+
+func (cc *cpuCollector) collect() {
+	if cc == nil {
+		return
+	}
+
+	cc.recordLoad()
+	cc.recordUsage()
+}
diff --git a/pkg/systemstatsmonitor/disk_collector.go b/pkg/systemstatsmonitor/disk_collector.go
index 98962348..6e4ba859 100644
--- a/pkg/systemstatsmonitor/disk_collector.go
+++ b/pkg/systemstatsmonitor/disk_collector.go
@@ -29,18 +29,30 @@ import (
 	"k8s.io/node-problem-detector/pkg/util/metrics"
 )
 
-const deviceNameLabel = "device_name"
-
 type diskCollector struct {
-	mIOTime      *metrics.Int64Metric
-	mWeightedIO  *metrics.Int64Metric
-	mAvgQueueLen *metrics.Float64Metric
+	mIOTime         *metrics.Int64Metric
+	mWeightedIO     *metrics.Int64Metric
+	mAvgQueueLen    *metrics.Float64Metric
+	mOpsCount       *metrics.Int64Metric
+	mMergedOpsCount *metrics.Int64Metric
+	mOpsBytes       *metrics.Int64Metric
+	mOpsTime        *metrics.Int64Metric
+	mBytesUsed      *metrics.Int64Metric
 
 	config *ssmtypes.DiskStatsConfig
 
-	historyIOTime     map[string]uint64
-	historyWeightedIO map[string]uint64
-	lastSampleTime    time.Time
+	lastIOTime           map[string]uint64
+	lastWeightedIO       map[string]uint64
+	lastReadCount        map[string]uint64
+	lastWriteCount       map[string]uint64
+	lastMergedReadCount  map[string]uint64
+	lastMergedWriteCount map[string]uint64
+	lastReadBytes        map[string]uint64
+	lastWriteBytes       map[string]uint64
+	lastReadTime         map[string]uint64
+	lastWriteTime        map[string]uint64
+
+	lastSampleTime time.Time
 }
 
 func NewDiskCollectorOrDie(diskConfig *ssmtypes.DiskStatsConfig) *diskCollector {
@@ -52,7 +64,7 @@ func NewDiskCollectorOrDie(diskConfig *ssmtypes.DiskStatsConfig) *diskCollector
 	dc.mIOTime, err = metrics.NewInt64Metric(
 		metrics.DiskIOTimeID,
 		diskConfig.MetricsConfigs[string(metrics.DiskIOTimeID)].DisplayName,
-		"The IO time spent on the disk",
+		"The IO time spent on the disk, in ms",
 		"ms",
 		metrics.Sum,
 		[]string{deviceNameLabel})
@@ -64,7 +76,7 @@ func NewDiskCollectorOrDie(diskConfig *ssmtypes.DiskStatsConfig) *diskCollector
 	dc.mWeightedIO, err = metrics.NewInt64Metric(
 		metrics.DiskWeightedIOID,
 		diskConfig.MetricsConfigs[string(metrics.DiskWeightedIOID)].DisplayName,
-		"The weighted IO on the disk",
+		"The weighted IO on the disk, in ms",
 		"ms",
 		metrics.Sum,
 		[]string{deviceNameLabel})
@@ -83,45 +95,86 @@ func NewDiskCollectorOrDie(diskConfig *ssmtypes.DiskStatsConfig) *diskCollector
 		glog.Fatalf("Error initializing metric for disk/avg_queue_len: %v", err)
 	}
 
-	dc.historyIOTime = make(map[string]uint64)
-	dc.historyWeightedIO = make(map[string]uint64)
+	dc.mOpsCount, err = metrics.NewInt64Metric(
+		metrics.DiskOpsCountID,
+		diskConfig.MetricsConfigs[string(metrics.DiskOpsCountID)].DisplayName,
+		"Disk operations count",
+		"1",
+		metrics.Sum,
+		[]string{deviceNameLabel, directionLabel})
+	if err != nil {
+		glog.Fatalf("Error initializing metric for %q: %v", metrics.DiskOpsCountID, err)
+	}
+
+	dc.mMergedOpsCount, err = metrics.NewInt64Metric(
+		metrics.DiskMergedOpsCountID,
+		diskConfig.MetricsConfigs[string(metrics.DiskMergedOpsCountID)].DisplayName,
+		"Disk merged operations count",
+		"1",
+		metrics.Sum,
+		[]string{deviceNameLabel, directionLabel})
+	if err != nil {
+		glog.Fatalf("Error initializing metric for %q: %v", metrics.DiskMergedOpsCountID, err)
+	}
+
+	dc.mOpsBytes, err = metrics.NewInt64Metric(
+		metrics.DiskOpsBytesID,
+		diskConfig.MetricsConfigs[string(metrics.DiskOpsBytesID)].DisplayName,
+		"Bytes transferred in disk operations",
+		"1",
+		metrics.Sum,
+		[]string{deviceNameLabel, directionLabel})
+	if err != nil {
+		glog.Fatalf("Error initializing metric for %q: %v", metrics.DiskOpsBytesID, err)
+	}
+
+	dc.mOpsTime, err = metrics.NewInt64Metric(
+		metrics.DiskOpsTimeID,
+		diskConfig.MetricsConfigs[string(metrics.DiskOpsTimeID)].DisplayName,
+		"Time spent in disk operations, in ms",
+		"ms",
+		metrics.Sum,
+		[]string{deviceNameLabel, directionLabel})
+	if err != nil {
+		glog.Fatalf("Error initializing metric for %q: %v", metrics.DiskOpsTimeID, err)
+	}
+
+	dc.mBytesUsed, err = metrics.NewInt64Metric(
+		metrics.DiskBytesUsedID,
+		diskConfig.MetricsConfigs[string(metrics.DiskBytesUsedID)].DisplayName,
+		"Disk bytes used, in Bytes",
+		"Byte",
+		metrics.LastValue,
+		[]string{deviceNameLabel, stateLabel})
+	if err != nil {
+		glog.Fatalf("Error initializing metric for %q: %v", metrics.DiskBytesUsedID, err)
+	}
+
+	dc.lastIOTime = make(map[string]uint64)
+	dc.lastWeightedIO = make(map[string]uint64)
+	dc.lastReadCount = make(map[string]uint64)
+	dc.lastWriteCount = make(map[string]uint64)
+	dc.lastMergedReadCount = make(map[string]uint64)
+	dc.lastMergedWriteCount = make(map[string]uint64)
+	dc.lastReadBytes = make(map[string]uint64)
+	dc.lastWriteBytes = make(map[string]uint64)
+	dc.lastReadTime = make(map[string]uint64)
+	dc.lastWriteTime = make(map[string]uint64)
 
 	return &dc
 }
 
-func (dc *diskCollector) collect() {
-	if dc == nil {
-		return
-	}
-
-	blks := []string{}
-	if dc.config.IncludeRootBlk {
-		blks = append(blks, listRootBlockDevices(dc.config.LsblkTimeout)...)
-	}
-	if dc.config.IncludeAllAttachedBlk {
-		blks = append(blks, listAttachedBlockDevices()...)
-	}
-
-	ioCountersStats, err := disk.IOCounters(blks...)
-	if err != nil {
-		glog.Errorf("Failed to retrieve disk IO counters: %v", err)
-		return
-	}
-
-	sampleTime := time.Now()
-
+func (dc *diskCollector) recordIOCounters(ioCountersStats map[string]disk.IOCountersStat, sampleTime time.Time) {
 	for deviceName, ioCountersStat := range ioCountersStats {
-		// Calculate average IO queue length since last measurement.
-		lastIOTime, historyExist := dc.historyIOTime[deviceName]
-		lastWeightedIO := dc.historyWeightedIO[deviceName]
-		lastSampleTime := dc.lastSampleTime
-
-		dc.historyIOTime[deviceName] = ioCountersStat.IoTime
-		dc.historyWeightedIO[deviceName] = ioCountersStat.WeightedIO
-		dc.lastSampleTime = sampleTime
-
-		// Attach label {"device_name": deviceName} to the metrics.
+		// Attach label {"device_name": deviceName} to the following metrics.
 		tags := map[string]string{deviceNameLabel: deviceName}
+
+		// Calculate average IO queue length since last measurement.
+		lastIOTime, historyExist := dc.lastIOTime[deviceName]
+		lastWeightedIO := dc.lastWeightedIO[deviceName]
+		dc.lastIOTime[deviceName] = ioCountersStat.IoTime
+		dc.lastWeightedIO[deviceName] = ioCountersStat.WeightedIO
+
 		if dc.mIOTime != nil {
 			dc.mIOTime.Record(tags, int64(ioCountersStat.IoTime-lastIOTime))
 		}
@@ -131,16 +184,104 @@ func (dc *diskCollector) collect() {
 		if historyExist {
 			avgQueueLen := float64(0.0)
 			if lastWeightedIO != ioCountersStat.WeightedIO {
-				diffSampleTimeMs := sampleTime.Sub(lastSampleTime).Seconds() * 1000
+				diffSampleTimeMs := sampleTime.Sub(dc.lastSampleTime).Seconds() * 1000
 				avgQueueLen = float64(ioCountersStat.WeightedIO-lastWeightedIO) / diffSampleTimeMs
 			}
 			if dc.mAvgQueueLen != nil {
 				dc.mAvgQueueLen.Record(tags, avgQueueLen)
 			}
 		}
+
+		// Attach label {"device_name": deviceName, "direction": "read"} to the following metrics.
+		tags = map[string]string{deviceNameLabel: deviceName, directionLabel: "read"}
+
+		if dc.mOpsCount != nil {
+			dc.mOpsCount.Record(tags, int64(ioCountersStat.ReadCount-dc.lastReadCount[deviceName]))
+			dc.lastReadCount[deviceName] = ioCountersStat.ReadCount
+		}
+		if dc.mMergedOpsCount != nil {
+			dc.mMergedOpsCount.Record(tags, int64(ioCountersStat.MergedReadCount-dc.lastMergedReadCount[deviceName]))
+			dc.lastMergedReadCount[deviceName] = ioCountersStat.MergedReadCount
+		}
+		if dc.mOpsBytes != nil {
+			dc.mOpsBytes.Record(tags, int64(ioCountersStat.ReadBytes-dc.lastReadBytes[deviceName]))
+			dc.lastReadBytes[deviceName] = ioCountersStat.ReadBytes
+		}
+		if dc.mOpsTime != nil {
+			dc.mOpsTime.Record(tags, int64(ioCountersStat.ReadTime-dc.lastReadTime[deviceName]))
+			dc.lastReadTime[deviceName] = ioCountersStat.ReadTime
+		}
+
+		// Attach label {"device_name": deviceName, "direction": "write"} to the following metrics.
+		tags = map[string]string{deviceNameLabel: deviceName, directionLabel: "write"}
+
+		if dc.mOpsCount != nil {
+			dc.mOpsCount.Record(tags, int64(ioCountersStat.WriteCount-dc.lastWriteCount[deviceName]))
+			dc.lastWriteCount[deviceName] = ioCountersStat.WriteCount
+		}
+		if dc.mMergedOpsCount != nil {
+			dc.mMergedOpsCount.Record(tags, int64(ioCountersStat.MergedWriteCount-dc.lastMergedWriteCount[deviceName]))
+			dc.lastMergedWriteCount[deviceName] = ioCountersStat.MergedWriteCount
+		}
+		if dc.mOpsBytes != nil {
+			dc.mOpsBytes.Record(tags, int64(ioCountersStat.WriteBytes-dc.lastWriteBytes[deviceName]))
+			dc.lastWriteBytes[deviceName] = ioCountersStat.WriteBytes
+		}
+		if dc.mOpsTime != nil {
+			dc.mOpsTime.Record(tags, int64(ioCountersStat.WriteTime-dc.lastWriteTime[deviceName]))
+			dc.lastWriteTime[deviceName] = ioCountersStat.WriteTime
+		}
 	}
 }
 
+// collect records disk IO metrics and, when dc.mBytesUsed is set, the free/used Bytes of each partition.
+func (dc *diskCollector) collect() {
+	if dc == nil {
+		return
+	}
+
+	// List available devices.
+	devices := []string{}
+	if dc.config.IncludeRootBlk {
+		devices = append(devices, listRootBlockDevices(dc.config.LsblkTimeout)...)
+	}
+	if dc.config.IncludeAllAttachedBlk {
+		devices = append(devices, listAttachedBlockDevices()...)
+	}
+
+	// Fetch IO metrics and take the sample time right after they are read.
+	ioCountersStats, err := disk.IOCounters(devices...)
+	if err != nil {
+		glog.Errorf("Failed to retrieve disk IO counters: %v", err)
+		return
+	}
+	sampleTime := time.Now()
+	defer func() { dc.lastSampleTime = sampleTime }()
+
+	// Record metrics regarding disk IO.
+	dc.recordIOCounters(ioCountersStats, sampleTime)
+
+	// Record disk space usage. Partitions are listed only now, so a failure cannot drop the IO metrics above.
+	if dc.mBytesUsed == nil {
+		return
+	}
+	partitions, err := disk.Partitions(false)
+	if err != nil {
+		glog.Errorf("Failed to list disk partitions: %v", err)
+		return
+	}
+	for _, partition := range partitions {
+		usageStat, err := disk.Usage(partition.Mountpoint)
+		if err != nil {
+			glog.Errorf("Failed to retrieve disk usage for %q: %v", partition.Mountpoint, err)
+			continue
+		}
+		deviceName := strings.TrimPrefix(partition.Device, "/dev/")
+		dc.mBytesUsed.Record(map[string]string{deviceNameLabel: deviceName, stateLabel: "free"}, int64(usageStat.Free))
+		dc.mBytesUsed.Record(map[string]string{deviceNameLabel: deviceName, stateLabel: "used"}, int64(usageStat.Used))
+	}
+}
+
 // listRootBlockDevices lists all block devices that's not a slave or holder.
 func listRootBlockDevices(timeout time.Duration) []string {
 	ctx, cancel := context.WithTimeout(context.Background(), timeout)
diff --git a/pkg/systemstatsmonitor/labels.go b/pkg/systemstatsmonitor/labels.go
new file mode 100644
index 00000000..3b5c1858
--- /dev/null
+++ b/pkg/systemstatsmonitor/labels.go
@@ -0,0 +1,26 @@
+/*
+Copyright 2020 The Kubernetes Authors All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package systemstatsmonitor
+
+// deviceNameLabel labels the monitored disk device, e.g.: "sda", "sda1".
+const deviceNameLabel = "device_name"
+
+// directionLabel labels the direction of the disk operations, e.g.: "read", "write".
+const directionLabel = "direction"
+
+// stateLabel labels the state of disk/memory/cpu usage, e.g.: "free", "used".
+const stateLabel = "state"
diff --git a/pkg/systemstatsmonitor/memory_collector.go b/pkg/systemstatsmonitor/memory_collector.go
new file mode 100644
index 00000000..5b8b65d7
--- /dev/null
+++ b/pkg/systemstatsmonitor/memory_collector.go
@@ -0,0 +1,143 @@
+/*
+Copyright 2020 The Kubernetes Authors All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package systemstatsmonitor
+
+import (
+	"github.com/golang/glog"
+	"github.com/prometheus/procfs"
+
+	ssmtypes "k8s.io/node-problem-detector/pkg/systemstatsmonitor/types"
+	"k8s.io/node-problem-detector/pkg/util/metrics"
+)
+
+type memoryCollector struct {
+	mBytesUsed       *metrics.Int64Metric
+	mAnonymousUsed   *metrics.Int64Metric
+	mPageCacheUsed   *metrics.Int64Metric
+	mUnevictableUsed *metrics.Int64Metric
+	mDirtyUsed       *metrics.Int64Metric
+
+	config *ssmtypes.MemoryStatsConfig
+}
+
+// NewMemoryCollectorOrDie creates a memoryCollector from memoryConfig and aborts via glog.Fatalf if a metric cannot be initialized.
+func NewMemoryCollectorOrDie(memoryConfig *ssmtypes.MemoryStatsConfig) *memoryCollector {
+	mc := memoryCollector{config: memoryConfig}
+	var err error
+
+	mc.mBytesUsed, err = metrics.NewInt64Metric(
+		metrics.MemoryBytesUsedID,
+		memoryConfig.MetricsConfigs[string(metrics.MemoryBytesUsedID)].DisplayName,
+		"Memory usage by each memory state, in Bytes. Summing values of all states yields the total memory on the node.",
+		"Byte",
+		metrics.LastValue,
+		[]string{stateLabel})
+	if err != nil {
+		glog.Fatalf("Error initializing metric for %q: %v", metrics.MemoryBytesUsedID, err)
+	}
+
+	mc.mAnonymousUsed, err = metrics.NewInt64Metric(
+		metrics.MemoryAnonymousUsedID,
+		memoryConfig.MetricsConfigs[string(metrics.MemoryAnonymousUsedID)].DisplayName,
+		"Anonymous memory usage, in Bytes. Summing values of all states yields the total anonymous memory used.",
+		"Byte",
+		metrics.LastValue,
+		[]string{stateLabel})
+	if err != nil {
+		glog.Fatalf("Error initializing metric for %q: %v", metrics.MemoryAnonymousUsedID, err)
+	}
+
+	mc.mPageCacheUsed, err = metrics.NewInt64Metric(
+		metrics.MemoryPageCacheUsedID,
+		memoryConfig.MetricsConfigs[string(metrics.MemoryPageCacheUsedID)].DisplayName,
+		"Page cache memory usage, in Bytes. Summing values of all states yields the total page cache memory used.",
+		"Byte",
+		metrics.LastValue,
+		[]string{stateLabel})
+	if err != nil {
+		glog.Fatalf("Error initializing metric for %q: %v", metrics.MemoryPageCacheUsedID, err)
+	}
+
+	mc.mUnevictableUsed, err = metrics.NewInt64Metric(
+		metrics.MemoryUnevictableUsedID,
+		memoryConfig.MetricsConfigs[string(metrics.MemoryUnevictableUsedID)].DisplayName,
+		"Unevictable memory usage, in Bytes",
+		"Byte",
+		metrics.LastValue,
+		[]string{})
+	if err != nil {
+		glog.Fatalf("Error initializing metric for %q: %v", metrics.MemoryUnevictableUsedID, err)
+	}
+
+	mc.mDirtyUsed, err = metrics.NewInt64Metric(
+		metrics.MemoryDirtyUsedID,
+		memoryConfig.MetricsConfigs[string(metrics.MemoryDirtyUsedID)].DisplayName,
+		"Dirty pages usage, in Bytes. Dirty means the memory is waiting to be written back to disk, and writeback means the memory is actively being written back to disk.",
+		"Byte",
+		metrics.LastValue,
+		[]string{stateLabel})
+	if err != nil {
+		glog.Fatalf("Error initializing metric for %q: %v", metrics.MemoryDirtyUsedID, err)
+	}
+
+	return &mc
+}
+
+// collect reads memory stats through procfs and records them on the enabled metrics, converted to Bytes.
+func (mc *memoryCollector) collect() {
+	if mc == nil {
+		return
+	}
+	proc, err := procfs.NewDefaultFS()
+	if err != nil {
+		glog.Errorf("Failed to find /proc mount point: %v", err)
+		return
+	}
+	meminfo, err := proc.Meminfo()
+	if err != nil {
+		glog.Errorf("Failed to retrieve memory stats: %v", err)
+		return
+	}
+	// /proc/meminfo reports sizes in kB, but all metrics below are declared in Bytes, hence the *1024.
+	if mc.mBytesUsed != nil {
+		memUsed := meminfo.MemTotal - meminfo.MemFree - meminfo.Buffers - meminfo.Cached - meminfo.Slab
+		mc.mBytesUsed.Record(map[string]string{stateLabel: "free"}, int64(meminfo.MemFree)*1024)
+		mc.mBytesUsed.Record(map[string]string{stateLabel: "used"}, int64(memUsed)*1024)
+		mc.mBytesUsed.Record(map[string]string{stateLabel: "buffered"}, int64(meminfo.Buffers)*1024)
+		mc.mBytesUsed.Record(map[string]string{stateLabel: "cached"}, int64(meminfo.Cached)*1024)
+		mc.mBytesUsed.Record(map[string]string{stateLabel: "slab"}, int64(meminfo.Slab)*1024)
+	}
+
+	if mc.mDirtyUsed != nil {
+		mc.mDirtyUsed.Record(map[string]string{stateLabel: "dirty"}, int64(meminfo.Dirty)*1024)
+		mc.mDirtyUsed.Record(map[string]string{stateLabel: "writeback"}, int64(meminfo.Writeback)*1024)
+	}
+
+	if mc.mAnonymousUsed != nil {
+		mc.mAnonymousUsed.Record(map[string]string{stateLabel: "active"}, int64(meminfo.ActiveAnon)*1024)
+		mc.mAnonymousUsed.Record(map[string]string{stateLabel: "inactive"}, int64(meminfo.InactiveAnon)*1024)
+	}
+
+	if mc.mPageCacheUsed != nil {
+		mc.mPageCacheUsed.Record(map[string]string{stateLabel: "active"}, int64(meminfo.ActiveFile)*1024)
+		mc.mPageCacheUsed.Record(map[string]string{stateLabel: "inactive"}, int64(meminfo.InactiveFile)*1024)
+	}
+
+	if mc.mUnevictableUsed != nil {
+		mc.mUnevictableUsed.Record(map[string]string{}, int64(meminfo.Unevictable)*1024)
+	}
+}
diff --git a/pkg/systemstatsmonitor/system_stats_monitor.go b/pkg/systemstatsmonitor/system_stats_monitor.go
index 0e7fb367..f43576b2 100644
--- a/pkg/systemstatsmonitor/system_stats_monitor.go
+++ b/pkg/systemstatsmonitor/system_stats_monitor.go
@@ -38,11 +38,13 @@ func init() {
 }
 
 type systemStatsMonitor struct {
-	configPath    string
-	config        ssmtypes.SystemStatsConfig
-	diskCollector *diskCollector
-	hostCollector *hostCollector
-	tomb          *tomb.Tomb
+	configPath      string
+	config          ssmtypes.SystemStatsConfig
+	cpuCollector    *cpuCollector    // nil unless config.CPUConfig has metricsConfigs
+	diskCollector   *diskCollector   // nil unless config.DiskConfig has metricsConfigs
+	hostCollector   *hostCollector   // nil unless config.HostConfig has metricsConfigs
+	memoryCollector *memoryCollector // nil unless config.MemoryConfig has metricsConfigs
+	tomb            *tomb.Tomb
 }
 
 // NewSystemStatsMonitorOrDie creates a system stats monitor.
@@ -72,12 +74,18 @@ func NewSystemStatsMonitorOrDie(configPath string) types.Monitor {
 		glog.Fatalf("Failed to validate %s configuration %+v: %v", ssm.configPath, ssm.config, err)
 	}
 
+	if len(ssm.config.CPUConfig.MetricsConfigs) > 0 {
+		ssm.cpuCollector = NewCPUCollectorOrDie(&ssm.config.CPUConfig)
+	}
 	if len(ssm.config.DiskConfig.MetricsConfigs) > 0 {
 		ssm.diskCollector = NewDiskCollectorOrDie(&ssm.config.DiskConfig)
 	}
 	if len(ssm.config.HostConfig.MetricsConfigs) > 0 {
 		ssm.hostCollector = NewHostCollectorOrDie(&ssm.config.HostConfig)
 	}
+	if len(ssm.config.MemoryConfig.MetricsConfigs) > 0 {
+		ssm.memoryCollector = NewMemoryCollectorOrDie(&ssm.config.MemoryConfig)
+	}
 	return &ssm
 }
 
@@ -98,15 +106,19 @@ func (ssm *systemStatsMonitor) monitorLoop() {
 		glog.Infof("System stats monitor stopped: %s", ssm.configPath)
 		return
 	default:
+		ssm.cpuCollector.collect()
 		ssm.diskCollector.collect()
 		ssm.hostCollector.collect()
+		ssm.memoryCollector.collect()
 	}
 
 	for {
 		select {
 		case <-runTicker.C:
+			ssm.cpuCollector.collect()
 			ssm.diskCollector.collect()
 			ssm.hostCollector.collect()
+			ssm.memoryCollector.collect()
 		case <-ssm.tomb.Stopping():
 			glog.Infof("System stats monitor stopped: %s", ssm.configPath)
 			return
diff --git a/pkg/systemstatsmonitor/types/config.go b/pkg/systemstatsmonitor/types/config.go
index c10fd2c2..bb4496dc 100644
--- a/pkg/systemstatsmonitor/types/config.go
+++ b/pkg/systemstatsmonitor/types/config.go
@@ -30,6 +30,10 @@ type MetricConfig struct {
 	DisplayName string `json:"displayName"`
 }
 
+type CPUStatsConfig struct { // the "cpu" section of the system stats monitor config
+	MetricsConfigs map[string]MetricConfig `json:"metricsConfigs"` // keyed by metric ID, e.g. "cpu/usage_time"
+}
+
 type DiskStatsConfig struct {
 	MetricsConfigs        map[string]MetricConfig `json:"metricsConfigs"`
 	IncludeRootBlk        bool                    `json:"includeRootBlk"`
@@ -42,11 +46,17 @@ type HostStatsConfig struct {
 	MetricsConfigs map[string]MetricConfig `json:"metricsConfigs"`
 }
 
+type MemoryStatsConfig struct { // the "memory" section of the system stats monitor config
+	MetricsConfigs map[string]MetricConfig `json:"metricsConfigs"` // presumably keyed by metric ID such as "memory/bytes_used" (confirm)
+}
+
 type SystemStatsConfig struct {
-	DiskConfig           DiskStatsConfig `json:"disk"`
-	HostConfig           HostStatsConfig `json:"host"`
-	InvokeIntervalString string          `json:"invokeInterval"`
-	InvokeInterval       time.Duration   `json:"-"`
+	CPUConfig            CPUStatsConfig    `json:"cpu"`            // settings for the CPU collector
+	DiskConfig           DiskStatsConfig   `json:"disk"`           // settings for the disk collector
+	HostConfig           HostStatsConfig   `json:"host"`           // settings for the host collector
+	MemoryConfig         MemoryStatsConfig `json:"memory"`         // settings for the memory collector
+	InvokeIntervalString string            `json:"invokeInterval"` // raw "invokeInterval" value from the JSON config
+	InvokeInterval       time.Duration     `json:"-"`              // excluded from JSON; presumably parsed from InvokeIntervalString (confirm)
 }
 
 // ApplyConfiguration applies default configurations.
diff --git a/pkg/util/metrics/metric.go b/pkg/util/metrics/metric.go
index e19f4461..4727ad76 100644
--- a/pkg/util/metrics/metric.go
+++ b/pkg/util/metrics/metric.go
@@ -20,12 +20,24 @@ import (
 )
 
 const (
-	ProblemCounterID  MetricID = "problem_counter"
-	ProblemGaugeID    MetricID = "problem_gauge"
-	DiskIOTimeID      MetricID = "disk/io_time"
-	DiskWeightedIOID  MetricID = "disk/weighted_io"
-	DiskAvgQueueLenID MetricID = "disk/avg_queue_len"
-	HostUptimeID      MetricID = "host/uptime"
+	CPURunnableTaskCountID  MetricID = "cpu/runnable_task_count" // cpu/* and disk/* IDs match metricsConfigs keys in config/system-stats-monitor.json
+	CPUUsageTimeID          MetricID = "cpu/usage_time"
+	ProblemCounterID        MetricID = "problem_counter"
+	ProblemGaugeID          MetricID = "problem_gauge"
+	DiskIOTimeID            MetricID = "disk/io_time"
+	DiskWeightedIOID        MetricID = "disk/weighted_io"
+	DiskAvgQueueLenID       MetricID = "disk/avg_queue_len"
+	DiskOpsCountID          MetricID = "disk/operation_count"
+	DiskMergedOpsCountID    MetricID = "disk/merged_operation_count"
+	DiskOpsBytesID          MetricID = "disk/operation_bytes_count"
+	DiskOpsTimeID           MetricID = "disk/operation_time"
+	DiskBytesUsedID         MetricID = "disk/bytes_used"
+	HostUptimeID            MetricID = "host/uptime"
+	MemoryBytesUsedID       MetricID = "memory/bytes_used"
+	MemoryAnonymousUsedID   MetricID = "memory/anonymous_used"
+	MemoryPageCacheUsedID   MetricID = "memory/page_cache_used"
+	MemoryUnevictableUsedID MetricID = "memory/unevictable_used"
+	MemoryDirtyUsedID       MetricID = "memory/dirty_used"
 )
 
 var MetricMap MetricMapping