mirror of
https://github.com/kubernetes/node-problem-detector.git
synced 2026-05-06 01:07:07 +00:00
1) Add lookback support in kernel monitor. After it starts, kernel monitor checks older logs to detect problems that happened before the last node reboot. 2) Add `lookback` and `startPattern` to the kernel monitor configuration. * `lookback` specifies how far back in time kernel monitor should look. * `startPattern` specifies which log line indicates that the node has started. Kernel monitor clears all current node conditions once it finds a node start log, which ensures that old problems won't change the node condition. 3) Add support for kernel panic monitoring: null pointer dereference and divide-by-zero kernel panics are surfaced as events. Kernel monitor usually reports these events during the lookback phase.
55 lines
1.4 KiB
JSON
55 lines
1.4 KiB
JSON
{
|
|
"logPath": "/log/kern.log",
|
|
"lookback": "10m",
|
|
"startPattern": "Initializing cgroup subsys cpuset",
|
|
"bufferSize": 10,
|
|
"source": "kernel-monitor",
|
|
"conditions": [
|
|
{
|
|
"type": "KernelDeadlock",
|
|
"reason": "KernelHasNoDeadlock",
|
|
"message": "kernel has no deadlock"
|
|
}
|
|
],
|
|
"rules": [
|
|
{
|
|
"type": "temporary",
|
|
"reason": "OOMKilling",
|
|
"pattern": "Kill process \\d+ (.+) score \\d+ or sacrifice child\\nKilled process \\d+ (.+) total-vm:\\d+kB, anon-rss:\\d+kB, file-rss:\\d+kB"
|
|
},
|
|
{
|
|
"type": "temporary",
|
|
"reason": "TaskHung",
|
|
"pattern": "task \\S+:\\w+ blocked for more than \\w+ seconds\\."
|
|
},
|
|
{
|
|
"type": "temporary",
|
|
"reason": "KernelPanic",
|
|
"pattern": "BUG: unable to handle kernel NULL pointer dereference at .*"
|
|
},
|
|
{
|
|
"type": "temporary",
|
|
"reason": "KernelPanic",
|
|
"pattern": "divide error: 0000 \\[#\\d+\\] SMP"
|
|
},
|
|
{
|
|
"type": "permanent",
|
|
"condition": "KernelDeadlock",
|
|
"reason": "AUFSUmountHung",
|
|
"pattern": "task umount\\.aufs:\\w+ blocked for more than \\w+ seconds\\."
|
|
},
|
|
{
|
|
"type": "permanent",
|
|
"condition": "KernelDeadlock",
|
|
"reason": "DockerHung",
|
|
"pattern": "task docker:\\w+ blocked for more than \\w+ seconds\\."
|
|
},
|
|
{
|
|
"type": "permanent",
|
|
"condition": "KernelDeadlock",
|
|
"reason": "UnregisterNetDeviceIssue",
|
|
"pattern": "unregister_netdevice: waiting for \\w+ to become free\\. Usage count = \\d+"
|
|
}
|
|
]
|
|
}
|