mirror of
https://github.com/kubernetes/node-problem-detector.git
synced 2026-02-14 18:09:57 +00:00
CPER is the format used by various ACPI tables, such as ERST, BERT, and HEST, to describe platform hardware errors.

The event severity message is printed here:
https://github.com/torvalds/linux/blob/v6.7/drivers/firmware/efi/cper.c#L639

Examples are shown below.

Corrected error:
kernel: {37}[Hardware Error]: Hardware error from APEI Generic Hardware Error Source: 162
kernel: {37}[Hardware Error]: It has been corrected by h/w and requires no further action
kernel: {37}[Hardware Error]: event severity: corrected
kernel: {37}[Hardware Error]: Error 0, type: corrected
kernel: {37}[Hardware Error]: section_type: memory error
kernel: {37}[Hardware Error]: error_status: 0x0000000000000400
kernel: {37}[Hardware Error]: physical_address: 0x000000b50c68ce80
kernel: {37}[Hardware Error]: node: 1 card: 4 module: 0 rank: 0 bank: 1 device: 14 row: 58165 column: 816
kernel: {37}[Hardware Error]: error_type: 2, single-bit ECC
kernel: {37}[Hardware Error]: DIMM location: CPU 2 DIMM 30

Recoverable error:
kernel: {3}[Hardware Error]: Hardware error from APEI Generic Hardware Error Source: 4
kernel: {3}[Hardware Error]: event severity: recoverable
kernel: {3}[Hardware Error]: Error 0, type: recoverable
kernel: {3}[Hardware Error]: fru_text: B1
kernel: {3}[Hardware Error]: section_type: memory error
kernel: {3}[Hardware Error]: error_status: 0x0000000000000400
kernel: {3}[Hardware Error]: physical_address: 0x000000393cfe5040
kernel: {3}[Hardware Error]: node: 2 card: 0 module: 0 rank: 0 bank: 3 device: 0 row: 34719 column: 320
kernel: {3}[Hardware Error]: DIMM location: not present. DMI handle: 0x0000

Fatal error:
kernel: BERT: Error records from previous boot:
kernel: [Hardware Error]: event severity: fatal
kernel: [Hardware Error]: Error 0, type: fatal
kernel: [Hardware Error]: fru_text: DIMM B5
kernel: [Hardware Error]: section_type: memory error
kernel: [Hardware Error]: error_status: 0x0000000000000400
kernel: [Hardware Error]: physical_address: 0x000000393d7e4040
kernel: [Hardware Error]: node: 2 card: 4 module: 0 rank: 0 bank: 3 device: 0 row: 34743 column: 256

Steps to test the new metrics:
$ echo "kernel: {37}[Hardware Error]: event severity: corrected" | sudo tee /dev/kmsg
$ echo "kernel: {3}[Hardware Error]: event severity: recoverable" | sudo tee /dev/kmsg
$ echo "kernel: [Hardware Error]: event severity: fatal" | sudo tee /dev/kmsg

The expected metrics are shown below:
$ curl localhost:20257/metrics
problem_counter{reason="CperHardwareErrorCorrected"} 1
problem_counter{reason="CperHardwareErrorFatal"} 1
problem_counter{reason="CperHardwareErrorRecoverable"} 1
...
problem_gauge{reason="CperHardwareErrorFatal",type="CperHardwareErrorFatal"} 1

Signed-off-by: Jian Wen <wenjianhn@gmail.com>
101 lines
2.3 KiB
JSON
101 lines
2.3 KiB
JSON
{
  "plugin": "kmsg",
  "logPath": "/dev/kmsg",
  "lookback": "5m",
  "bufferSize": 10,
  "source": "kernel-monitor",
  "metricsReporting": true,
  "conditions": [
    {
      "type": "KernelDeadlock",
      "reason": "KernelHasNoDeadlock",
      "message": "kernel has no deadlock"
    },
    {
      "type": "XfsShutdown",
      "reason": "XfsHasNotShutDown",
      "message": "XFS has not shutdown"
    },
    {
      "type": "CperHardwareErrorFatal",
      "reason": "CperHardwareHasNoFatalError",
      "message": "UEFI CPER has no fatal error"
    }
  ],
  "rules": [
    {
      "type": "temporary",
      "reason": "OOMKilling",
      "pattern": "Killed process \\d+ (.+) total-vm:\\d+kB, anon-rss:\\d+kB, file-rss:\\d+kB.*"
    },
    {
      "type": "temporary",
      "reason": "TaskHung",
      "pattern": "task [\\S ]+:\\w+ blocked for more than \\w+ seconds\\."
    },
    {
      "type": "temporary",
      "reason": "UnregisterNetDevice",
      "pattern": "unregister_netdevice: waiting for \\w+ to become free. Usage count = \\d+"
    },
    {
      "type": "temporary",
      "reason": "KernelOops",
      "pattern": "BUG: unable to handle kernel NULL pointer dereference at .*"
    },
    {
      "type": "temporary",
      "reason": "KernelOops",
      "pattern": "divide error: 0000 \\[#\\d+\\] SMP"
    },
    {
      "type": "temporary",
      "reason": "Ext4Error",
      "pattern": "EXT4-fs error .*"
    },
    {
      "type": "temporary",
      "reason": "Ext4Warning",
      "pattern": "EXT4-fs warning .*"
    },
    {
      "type": "temporary",
      "reason": "IOError",
      "pattern": "Buffer I/O error .*"
    },
    {
      "type": "permanent",
      "condition": "XfsShutdown",
      "reason": "XfsHasShutdown",
      "pattern": "XFS .* Shutting down filesystem.?"
    },
    {
      "type": "temporary",
      "reason": "MemoryReadError",
      "pattern": "CE memory read error .*"
    },
    {
      "type": "temporary",
      "reason": "CperHardwareErrorCorrected",
      "pattern": ".*\\[Hardware Error\\]: event severity: corrected$"
    },
    {
      "type": "temporary",
      "reason": "CperHardwareErrorRecoverable",
      "pattern": ".*\\[Hardware Error\\]: event severity: recoverable$"
    },
    {
      "type": "permanent",
      "condition": "CperHardwareErrorFatal",
      "reason": "CperHardwareErrorFatal",
      "pattern": ".*\\[Hardware Error\\]: event severity: fatal$"
    },
    {
      "type": "permanent",
      "condition": "KernelDeadlock",
      "reason": "DockerHung",
      "pattern": "task docker:\\w+ blocked for more than \\w+ seconds\\."
    }
  ]
}