Files
krkn/config/metrics.yaml
Paige Patton a0bba27edc
Some checks failed
Functional & Unit Tests / Functional & Unit Tests (push) Failing after 9m8s
Functional & Unit Tests / Generate Coverage Badge (push) Has been skipped
triming down metrics
Signed-off-by: Paige Patton <prubenda@redhat.com>
2025-03-24 10:01:50 +00:00

93 lines
4.5 KiB
YAML

metrics:
# API server
- query: irate(apiserver_request_total{verb="POST", resource="pods", subresource="binding",code="201"}[2m]) > 0
metricName: schedulingThroughput
# Containers & pod metrics
- query: sum(irate(container_cpu_usage_seconds_total{name!="",namespace=~"openshift-(etcd|oauth-apiserver|.*apiserver|ovn-kubernetes|sdn|ingress|authentication|.*controller-manager|.*scheduler|monitoring|logging|image-registry)"}[2m]) * 100) by (pod, namespace, node)
metricName: podCPU
- query: sum(container_memory_rss{name!="",namespace=~"openshift-(etcd|oauth-apiserver|.*apiserver|ovn-kubernetes|sdn|ingress|authentication|.*controller-manager|.*scheduler|monitoring|logging|image-registry)"}) by (pod, namespace, node)
metricName: podMemory
- query: (sum(rate(container_fs_writes_bytes_total{container!="",device!~".+dm.+"}[5m])) by (device, container, node) and on (node) kube_node_role{role="master"}) > 0
metricName: containerDiskUsage
# Kubelet & CRI-O metrics
- query: sum(irate(process_cpu_seconds_total{service="kubelet",job="kubelet"}[2m]) * 100) by (node) and on (node) kube_node_role{role="worker"}
metricName: kubeletCPU
- query: sum(process_resident_memory_bytes{service="kubelet",job="kubelet"}) by (node) and on (node) kube_node_role{role="worker"}
metricName: kubeletMemory
- query: sum(irate(process_cpu_seconds_total{service="kubelet",job="crio"}[2m]) * 100) by (node) and on (node) kube_node_role{role="worker"}
metricName: crioCPU
- query: sum(process_resident_memory_bytes{service="kubelet",job="crio"}) by (node) and on (node) kube_node_role{role="worker"}
metricName: crioMemory
# Node metrics
- query: (sum(irate(node_cpu_seconds_total[2m])) by (mode,instance) and on (instance) label_replace(kube_node_role{role="master"}, "instance", "$1", "node", "(.+)")) > 0
metricName: nodeCPU-Masters
- query: (avg_over_time((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes)[.elapsed:]) and on (instance) label_replace(kube_node_role{role="master"}, "instance", "$1", "node", "(.+)"))
metricName: nodeMemory-Masters
- query: (sum(irate(node_cpu_seconds_total[2m])) by (mode,instance) and on (instance) label_replace(kube_node_role{role="worker"}, "instance", "$1", "node", "(.+)")) > 0
metricName: nodeCPU-Workers
- query: (avg_over_time((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes)[2m:]) and on (instance) label_replace(kube_node_role{role="worker"}, "instance", "$1", "node", "(.+)"))
metricName: nodeMemory-Workers
- query: avg(node_memory_MemAvailable_bytes) by (instance)
metricName: nodeMemoryAvailable
- query: avg(node_memory_Active_bytes) by (instance)
metricName: nodeMemoryActive
- query: max(max_over_time((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes)[.elapsed:]) and on (instance) label_replace(kube_node_role{role="master"}, "instance", "$1", "node", "(.+)"))
metricName: maxMemory-Masters
- query: avg(node_memory_Cached_bytes) by (instance) + avg(node_memory_Buffers_bytes) by (instance)
metricName: nodeMemoryCached+nodeMemoryBuffers
- query: irate(node_network_receive_bytes_total{device=~"^(ens|eth|bond|team).*"}[2m])
metricName: rxNetworkBytes
- query: irate(node_network_transmit_bytes_total{device=~"^(ens|eth|bond|team).*"}[2m])
metricName: txNetworkBytes
- query: rate(node_disk_written_bytes_total{device!~"^(dm|rb).*"}[2m])
metricName: nodeDiskWrittenBytes
- query: rate(node_disk_read_bytes_total{device!~"^(dm|rb).*"}[2m])
metricName: nodeDiskReadBytes
- query: sum(rate(etcd_server_leader_changes_seen_total[2m]))
metricName: etcdLeaderChangesRate
# Etcd metrics
- query: etcd_server_is_leader > 0
metricName: etcdServerIsLeader
- query: histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[2m]))
metricName: 99thEtcdDiskBackendCommitDurationSeconds
- query: histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[2m]))
metricName: 99thEtcdDiskWalFsyncDurationSeconds
- query: histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket[5m]))
metricName: 99thEtcdRoundTripTimeSeconds
- query: etcd_mvcc_db_total_size_in_bytes
metricName: etcdDBPhysicalSizeBytes
- query: etcd_mvcc_db_total_size_in_use_in_bytes
metricName: etcdDBLogicalSizeBytes
- query: sum(rate(etcd_object_counts{}[5m])) by (resource) > 0
metricName: etcdObjectCount
- query: sum by (cluster_version)(etcd_cluster_version)
metricName: etcdVersion
instant: true