From 624f50acd1c6d29b1d58c9399af8df0f51b332b2 Mon Sep 17 00:00:00 2001 From: Naga Ravi Chaitanya Elluri Date: Wed, 31 Jul 2024 13:27:17 -0400 Subject: [PATCH] Output rate of increase for the SLO queries This commit: - Also switches the rate queries severity to critical as 5% threshold is high for low scale/density clusters and needs to be flagged. - Adds rate queries to openshift alerts file Signed-off-by: Naga Ravi Chaitanya Elluri --- config/alerts.yaml | 32 +++++++++++++++--------------- config/alerts_openshift.yaml | 38 ++++++++++++++++++++++++++++++++++++ 2 files changed, 54 insertions(+), 16 deletions(-) diff --git a/config/alerts.yaml b/config/alerts.yaml index 65a634ef..c8d53cfe 100644 --- a/config/alerts.yaml +++ b/config/alerts.yaml @@ -91,39 +91,39 @@ # etcd CPU and usage increase - expr: sum(rate(container_cpu_usage_seconds_total{image!='', namespace='openshift-etcd', container='etcd'}[1m])) * 100 / sum(machine_cpu_cores) > 5 - description: Etcd CPU usage increased significantly - severity: warning + description: Etcd CPU usage increased significantly by {{$value}}% + severity: critical # etcd memory usage increase - expr: sum(deriv(container_memory_usage_bytes{image!='', namespace='openshift-etcd', container='etcd'}[5m])) * 100 / sum(node_memory_MemTotal_bytes) > 5 - description: Etcd memory usage increased significantly - severity: warning + description: Etcd memory usage increased significantly by {{$value}}% + severity: critical # Openshift API server CPU and memory usage increase - expr: sum(rate(container_cpu_usage_seconds_total{image!='', namespace='openshift-apiserver', container='openshift-apiserver'}[1m])) * 100 / sum(machine_cpu_cores) > 5 - description: openshift apiserver cpu usage increased significantly - severity: warning + description: openshift apiserver cpu usage increased significantly by {{$value}}% + severity: critical - expr: (sum(deriv(container_memory_usage_bytes{namespace='openshift-apiserver', 
container='openshift-apiserver'}[5m]))) * 100 / sum(node_memory_MemTotal_bytes) > 5 - description: openshift apiserver memory usage increased significantly - severity: warning + description: openshift apiserver memory usage increased significantly by {{$value}}% + severity: critical # Openshift kube API server CPU and memory usage increase - expr: sum(rate(container_cpu_usage_seconds_total{image!='', namespace='openshift-kube-apiserver', container='kube-apiserver'}[1m])) * 100 / sum(machine_cpu_cores) > 5 - description: openshift apiserver cpu usage increased significantly - severity: warning + description: openshift kube apiserver cpu usage increased significantly by {{$value}}% + severity: critical - expr: (sum(deriv(container_memory_usage_bytes{namespace='openshift-kube-apiserver', container='kube-apiserver'}[5m]))) * 100 / sum(node_memory_MemTotal_bytes) > 5 - description: openshift apiserver memory usage increased significantly - severity: warning + description: openshift kube apiserver memory usage increased significantly by {{$value}}% + severity: critical # Master node CPU usage increase - expr: (sum((sum(deriv(pod:container_cpu_usage:sum{container="",pod!=""}[5m])) BY (namespace, pod) * on(pod, namespace) group_left(node) (node_namespace_pod:kube_pod_info:) ) * on(node) group_left(role) (max by (node) (kube_node_role{role="master"})))) * 100 / sum(machine_cpu_cores) > 5 - description: master nodes cpu usage increased significantly - severity: warning + description: master nodes cpu usage increased significantly by {{$value}}% + severity: critical # Master nodes memory usage increase - expr: (sum((sum(deriv(container_memory_usage_bytes{container="",pod!=""}[5m])) BY (namespace, pod) * on(pod, namespace) group_left(node) (node_namespace_pod:kube_pod_info:) ) * on(node) group_left(role) (max by (node) (kube_node_role{role="master"})))) * 100 / sum(node_memory_MemTotal_bytes) > 5 - description: master nodes memory usage increased significantly - severity: warning +
description: master nodes memory usage increased significantly by {{$value}}% + severity: critical diff --git a/config/alerts_openshift.yaml b/config/alerts_openshift.yaml index 45ff9671..42384f9b 100644 --- a/config/alerts_openshift.yaml +++ b/config/alerts_openshift.yaml @@ -99,3 +99,41 @@ - expr: ALERTS{severity="critical", alertstate="firing"} > 0 description: Critical prometheus alert. {{$labels.alertname}} severity: warning + +# etcd CPU usage increase +- expr: sum(rate(container_cpu_usage_seconds_total{image!='', namespace='openshift-etcd', container='etcd'}[1m])) * 100 / sum(machine_cpu_cores) > 5 + description: Etcd CPU usage increased significantly by {{$value}}% + severity: critical + +# etcd memory usage increase +- expr: sum(deriv(container_memory_usage_bytes{image!='', namespace='openshift-etcd', container='etcd'}[5m])) * 100 / sum(node_memory_MemTotal_bytes) > 5 + description: Etcd memory usage increased significantly by {{$value}}% + severity: critical + +# Openshift API server CPU and memory usage increase +- expr: sum(rate(container_cpu_usage_seconds_total{image!='', namespace='openshift-apiserver', container='openshift-apiserver'}[1m])) * 100 / sum(machine_cpu_cores) > 5 + description: openshift apiserver cpu usage increased significantly by {{$value}}% + severity: critical + +- expr: (sum(deriv(container_memory_usage_bytes{namespace='openshift-apiserver', container='openshift-apiserver'}[5m]))) * 100 / sum(node_memory_MemTotal_bytes) > 5 + description: openshift apiserver memory usage increased significantly by {{$value}}% + severity: critical + +# Openshift kube API server CPU and memory usage increase +- expr: sum(rate(container_cpu_usage_seconds_total{image!='', namespace='openshift-kube-apiserver', container='kube-apiserver'}[1m])) * 100 / sum(machine_cpu_cores) > 5 + description: openshift kube apiserver cpu usage increased significantly by {{$value}}% + severity: critical + +- expr:
(sum(deriv(container_memory_usage_bytes{namespace='openshift-kube-apiserver', container='kube-apiserver'}[5m]))) * 100 / sum(node_memory_MemTotal_bytes) > 5 + description: openshift kube apiserver memory usage increased significantly by {{$value}}% + severity: critical + +# Master node CPU usage increase +- expr: (sum((sum(deriv(pod:container_cpu_usage:sum{container="",pod!=""}[5m])) BY (namespace, pod) * on(pod, namespace) group_left(node) (node_namespace_pod:kube_pod_info:) ) * on(node) group_left(role) (max by (node) (kube_node_role{role="master"})))) * 100 / sum(machine_cpu_cores) > 5 + description: master nodes cpu usage increased significantly by {{$value}}% + severity: critical + +# Master nodes memory usage increase +- expr: (sum((sum(deriv(container_memory_usage_bytes{container="",pod!=""}[5m])) BY (namespace, pod) * on(pod, namespace) group_left(node) (node_namespace_pod:kube_pod_info:) ) * on(node) group_left(role) (max by (node) (kube_node_role{role="master"})))) * 100 / sum(node_memory_MemTotal_bytes) > 5 + description: master nodes memory usage increased significantly by {{$value}}% + severity: critical