# Mirror of https://github.com/krkn-chaos/krkn.git (synced 2026-02-14 09:59:59 +00:00)
#
# This commit:
# - Switches the rate queries' severity to critical, as the 5% threshold is
#   high for low-scale / low-density clusters and needs to be flagged.
# - Adds rate queries to the OpenShift alerts file.
# Signed-off-by: Naga Ravi Chaitanya Elluri <nelluri@redhat.com>
# etcd
# Disk/commit latency, leader stability, DB quota usage, peer health and gRPC
# health for the etcd cluster. Latency alerts average the 99th percentile over
# a 10m subquery window to smooth out single-scrape spikes.
- expr: avg_over_time(histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[2m]))[10m:]) > 0.01
  description: 10 minutes avg. 99th etcd fsync latency on {{$labels.pod}} higher than 10ms. {{$value}}s
  severity: warning

- expr: avg_over_time(histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[2m]))[10m:]) > 1
  description: 10 minutes avg. 99th etcd fsync latency on {{$labels.pod}} higher than 1s. {{$value}}s
  severity: error

- expr: avg_over_time(histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[2m]))[10m:]) > 0.03
  description: 10 minutes avg. 99th etcd commit latency on {{$labels.pod}} higher than 30ms. {{$value}}s
  severity: warning

# Any leader change during the chaos run is worth flagging.
- expr: rate(etcd_server_leader_changes_seen_total[2m]) > 0
  description: etcd leader changes observed
  severity: warning

# Database size vs. backend quota — running full risks the cluster going read-only.
- expr: (last_over_time(etcd_mvcc_db_total_size_in_bytes[5m]) / last_over_time(etcd_server_quota_backend_bytes[5m]))*100 > 95
  description: etcd cluster database is running full.
  severity: critical

# Low in-use ratio suggests the DB needs defragmentation to reclaim space.
- expr: (last_over_time(etcd_mvcc_db_total_size_in_use_in_bytes[5m]) / last_over_time(etcd_mvcc_db_total_size_in_bytes[5m])) < 0.5
  description: etcd database size in use is less than 50% of the actual allocated storage.
  severity: warning

- expr: rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5
  description: etcd cluster has high number of proposal failures.
  severity: warning

- expr: histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m])) > 0.15
  description: etcd cluster member communication is slow.
  severity: warning

# Defragment is excluded: it is expected to be slow and would cause noise.
- expr: histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_method!="Defragment", grpc_type="unary"}[5m])) without(grpc_type)) > 0.15
  description: etcd grpc requests are slow.
  severity: critical

- expr: 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code=~"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded"}[5m])) without (grpc_type, grpc_code) / sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) without (grpc_type, grpc_code) > 5
  description: etcd cluster has high number of failed grpc requests.
  severity: critical

- expr: etcd_server_has_leader{job=~".*etcd.*"} == 0
  description: etcd cluster has no leader.
  severity: warning

# Quorum check: fewer than (N+1)/2 members up means loss of majority is near.
- expr: sum(up{job=~".*etcd.*"} == bool 1) without (instance) < ((count(up{job=~".*etcd.*"}) without (instance) + 1) / 2)
  description: etcd cluster has insufficient number of members.
  severity: warning

- expr: max without (endpoint) ( sum without (instance) (up{job=~".*etcd.*"} == bool 0) or count without (To) ( sum without (instance) (rate(etcd_network_peer_sent_failures_total{job=~".*etcd.*"}[120s])) > 0.01 )) > 0
  description: etcd cluster members are down.
  severity: warning
# API server
# 99th percentile request latency, split by verb class and scope, averaged over
# a subquery window. Thresholds follow the upstream API latency SLOs (1s for
# mutating and per-resource reads, 5s namespace-scoped, 30s cluster-scoped).
# Streaming subresources (log/exec/portforward/attach/proxy) are excluded.
- expr: avg_over_time(histogram_quantile(0.99, sum(irate(apiserver_request_duration_seconds_bucket{apiserver="kube-apiserver", verb=~"POST|PUT|DELETE|PATCH", subresource!~"log|exec|portforward|attach|proxy"}[2m])) by (le, resource, verb))[10m:]) > 1
  description: 10 minutes avg. 99th mutating API call latency for {{$labels.verb}}/{{$labels.resource}} higher than 1 second. {{$value}}s
  severity: error

- expr: avg_over_time(histogram_quantile(0.99, sum(irate(apiserver_request_duration_seconds_bucket{apiserver="kube-apiserver", verb=~"LIST|GET", subresource!~"log|exec|portforward|attach|proxy", scope="resource"}[2m])) by (le, resource, verb, scope))[5m:]) > 1
  description: 5 minutes avg. 99th read-only API call latency for {{$labels.verb}}/{{$labels.resource}} in scope {{$labels.scope}} higher than 1 second. {{$value}}s
  severity: error

- expr: avg_over_time(histogram_quantile(0.99, sum(irate(apiserver_request_duration_seconds_bucket{apiserver="kube-apiserver", verb=~"LIST|GET", subresource!~"log|exec|portforward|attach|proxy", scope="namespace"}[2m])) by (le, resource, verb, scope))[5m:]) > 5
  description: 5 minutes avg. 99th read-only API call latency for {{$labels.verb}}/{{$labels.resource}} in scope {{$labels.scope}} higher than 5 seconds. {{$value}}s
  severity: error

- expr: avg_over_time(histogram_quantile(0.99, sum(irate(apiserver_request_duration_seconds_bucket{apiserver="kube-apiserver", verb=~"LIST|GET", subresource!~"log|exec|portforward|attach|proxy", scope="cluster"}[2m])) by (le, resource, verb, scope))[5m:]) > 30
  description: 5 minutes avg. 99th read-only API call latency for {{$labels.verb}}/{{$labels.resource}} in scope {{$labels.scope}} higher than 30 seconds. {{$value}}s
  severity: error
# Control plane pods
# Target-down checks (up == 0) for the API servers, etcd pods, core
# controllers/networking/DNS, node-level agents (crio/kubelet) and ovnkube-node.
- expr: up{apiserver=~"kube-apiserver|openshift-apiserver"} == 0
  description: "{{$labels.apiserver}} {{$labels.instance}} down"
  severity: warning

- expr: up{namespace=~"openshift-etcd"} == 0
  description: "{{$labels.namespace}}/{{$labels.pod}} down"
  severity: warning

- expr: up{namespace=~"openshift-.*(kube-controller-manager|scheduler|controller-manager|sdn|ovn-kubernetes|dns)"} == 0
  description: "{{$labels.namespace}}/{{$labels.pod}} down"
  severity: warning

- expr: up{job=~"crio|kubelet"} == 0
  description: "{{$labels.node}}/{{$labels.job}} down"
  severity: warning

- expr: up{job="ovnkube-node"} == 0
  description: "{{$labels.instance}}/{{$labels.pod}} {{$labels.job}} down"
  severity: warning
# Service sync latency
# 99th percentile of the time kube-proxy takes to program network rules after
# a service/endpoint change.
- expr: histogram_quantile(0.99, sum(rate(kubeproxy_network_programming_duration_seconds_bucket[2m])) by (le)) > 10
  description: 99th Kubeproxy network programming latency higher than 10 seconds. {{$value}}s
  severity: warning
# Prometheus alerts
# Surface any in-cluster Prometheus alert that is already firing at critical
# severity, so it shows up in the chaos run's report as well.
- expr: ALERTS{severity="critical", alertstate="firing"} > 0
  description: Critical prometheus alert. {{$labels.alertname}}
  severity: warning
# etcd CPU and usage increase
# etcd container CPU as a percentage of total cluster cores; critical because
# 5% is already significant on low-scale/low-density clusters.
- expr: sum(rate(container_cpu_usage_seconds_total{image!='', namespace='openshift-etcd', container='etcd'}[1m])) * 100 / sum(machine_cpu_cores) > 5
  description: Etcd CPU usage increased significantly by {{$value}}%
  severity: critical
# etcd memory usage increase
# Rate of change (deriv over 5m) of etcd container memory, as a percentage of
# total node memory — flags fast growth rather than absolute usage.
- expr: sum(deriv(container_memory_usage_bytes{image!='', namespace='openshift-etcd', container='etcd'}[5m])) * 100 / sum(node_memory_MemTotal_bytes) > 5
  description: Etcd memory usage increased significantly by {{$value}}%
  severity: critical
# Openshift API server CPU and memory usage increase
- expr: sum(rate(container_cpu_usage_seconds_total{image!='', namespace='openshift-apiserver', container='openshift-apiserver'}[1m])) * 100 / sum(machine_cpu_cores) > 5
  description: openshift apiserver cpu usage increased significantly by {{$value}}%
  severity: critical

- expr: (sum(deriv(container_memory_usage_bytes{namespace='openshift-apiserver', container='openshift-apiserver'}[5m]))) * 100 / sum(node_memory_MemTotal_bytes) > 5
  description: openshift apiserver memory usage increased significantly by {{$value}}%
  severity: critical
# Openshift kube API server CPU and memory usage increase
# NOTE(review): the original descriptions said "openshift apiserver" — a
# copy-paste from the section above; corrected to "kube apiserver" to match
# the queried container.
- expr: sum(rate(container_cpu_usage_seconds_total{image!='', namespace='openshift-kube-apiserver', container='kube-apiserver'}[1m])) * 100 / sum(machine_cpu_cores) > 5
  description: kube apiserver cpu usage increased significantly by {{$value}}%
  severity: critical

- expr: (sum(deriv(container_memory_usage_bytes{namespace='openshift-kube-apiserver', container='kube-apiserver'}[5m]))) * 100 / sum(node_memory_MemTotal_bytes) > 5
  description: kube apiserver memory usage increased significantly by {{$value}}%
  severity: critical
# Master node CPU usage increase
# Per-pod CPU usage growth (deriv over 5m), joined to its node via
# node_namespace_pod:kube_pod_info: and restricted to master-role nodes,
# expressed as a percentage of total cluster cores.
- expr: (sum((sum(deriv(pod:container_cpu_usage:sum{container="",pod!=""}[5m])) BY (namespace, pod) * on(pod, namespace) group_left(node) (node_namespace_pod:kube_pod_info:) ) * on(node) group_left(role) (max by (node) (kube_node_role{role="master"})))) * 100 / sum(machine_cpu_cores) > 5
  description: master nodes cpu usage increased significantly by {{$value}}%
  severity: critical
# Master nodes memory usage increase
# Same node-join pattern as the master CPU alert above, applied to the rate of
# change of container memory, as a percentage of total node memory.
- expr: (sum((sum(deriv(container_memory_usage_bytes{container="",pod!=""}[5m])) BY (namespace, pod) * on(pod, namespace) group_left(node) (node_namespace_pod:kube_pod_info:) ) * on(node) group_left(role) (max by (node) (kube_node_role{role="master"})))) * 100 / sum(node_memory_MemTotal_bytes) > 5
  description: master nodes memory usage increased significantly by {{$value}}%
  severity: critical