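# kube-burner-style metrics profile (assumption: this file follows the kube-burner
# metric-profile format). Each entry pairs a PromQL query with the metricName the
# result is indexed under; "instant: true" marks a query evaluated once as an
# instant query at the end of the run instead of as a range query.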
metrics:
  # API server
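  # Successful pod binding requests (POST pods/binding, HTTP 201) per second,
  # i.e. the scheduler's placement throughput.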
  - query: irate(apiserver_request_total{verb="POST", resource="pods", subresource="binding",code="201"}[2m]) > 0
    metricName: schedulingThroughput

# Containers & pod metrics
|
|
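  # Per-pod CPU (in percent) and RSS, scoped by the namespace regex below to core
  # control-plane and infra components (etcd, API servers, OVN/SDN, ingress,
  # authentication, controller-managers, schedulers, monitoring, logging, image
  # registry); containerDiskUsage tracks container filesystem writes on masters.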
  - query: sum(irate(container_cpu_usage_seconds_total{name!="",namespace=~"openshift-(etcd|oauth-apiserver|.*apiserver|ovn-kubernetes|sdn|ingress|authentication|.*controller-manager|.*scheduler|monitoring|logging|image-registry)"}[2m]) * 100) by (pod, namespace, node)
    metricName: podCPU

  - query: sum(container_memory_rss{name!="",namespace=~"openshift-(etcd|oauth-apiserver|.*apiserver|ovn-kubernetes|sdn|ingress|authentication|.*controller-manager|.*scheduler|monitoring|logging|image-registry)"}) by (pod, namespace, node)
    metricName: podMemory

  - query: (sum(rate(container_fs_writes_bytes_total{container!="",device!~".+dm.+"}[5m])) by (device, container, node) and on (node) kube_node_role{role="master"}) > 0
    metricName: containerDiskUsage

# Kubelet & CRI-O metrics
|
|
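  # CPU (in percent) and resident memory of the kubelet and CRI-O processes,
  # restricted to worker nodes via the join on kube_node_role{role="worker"}.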
  - query: sum(irate(process_cpu_seconds_total{service="kubelet",job="kubelet"}[2m]) * 100) by (node) and on (node) kube_node_role{role="worker"}
    metricName: kubeletCPU

  - query: sum(process_resident_memory_bytes{service="kubelet",job="kubelet"}) by (node) and on (node) kube_node_role{role="worker"}
    metricName: kubeletMemory

  - query: sum(irate(process_cpu_seconds_total{service="kubelet",job="crio"}[2m]) * 100) by (node) and on (node) kube_node_role{role="worker"}
    metricName: crioCPU

  - query: sum(process_resident_memory_bytes{service="kubelet",job="crio"}) by (node) and on (node) kube_node_role{role="worker"}
    metricName: crioMemory

  # Node metrics
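  # Node-level CPU by mode, used memory (MemTotal - MemAvailable), network and
  # disk throughput; master and worker series are separated by joining on
  # kube_node_role through label_replace.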
  - query: (sum(irate(node_cpu_seconds_total[2m])) by (mode,instance) and on (instance) label_replace(kube_node_role{role="master"}, "instance", "$1", "node", "(.+)")) > 0
    metricName: nodeCPU-Masters

  - query: (avg_over_time((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes)[{{ .elapsed }}:]) and on (instance) label_replace(kube_node_role{role="master"}, "instance", "$1", "node", "(.+)"))
    metricName: nodeMemory-Masters

  - query: (sum(irate(node_cpu_seconds_total[2m])) by (mode,instance) and on (instance) label_replace(kube_node_role{role="worker"}, "instance", "$1", "node", "(.+)")) > 0
    metricName: nodeCPU-Workers

  - query: (avg_over_time((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes)[2m:]) and on (instance) label_replace(kube_node_role{role="worker"}, "instance", "$1", "node", "(.+)"))
    metricName: nodeMemory-Workers

  - query: avg(node_memory_MemAvailable_bytes) by (instance)
    metricName: nodeMemoryAvailable

  - query: avg(node_memory_Active_bytes) by (instance)
    metricName: nodeMemoryActive

  - query: max(max_over_time((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes)[{{ .elapsed }}:]) and on (instance) label_replace(kube_node_role{role="master"}, "instance", "$1", "node", "(.+)"))
    metricName: maxMemory-Masters

  - query: avg(node_memory_Cached_bytes) by (instance) + avg(node_memory_Buffers_bytes) by (instance)
    metricName: nodeMemoryCached+nodeMemoryBuffers

  - query: irate(node_network_receive_bytes_total{device=~"^(ens|eth|bond|team).*"}[2m])
    metricName: rxNetworkBytes

  - query: irate(node_network_transmit_bytes_total{device=~"^(ens|eth|bond|team).*"}[2m])
    metricName: txNetworkBytes

  - query: rate(node_disk_written_bytes_total{device!~"^(dm|rb).*"}[2m])
    metricName: nodeDiskWrittenBytes

  - query: rate(node_disk_read_bytes_total{device!~"^(dm|rb).*"}[2m])
    metricName: nodeDiskReadBytes

  - query: sum(rate(etcd_server_leader_changes_seen_total[2m]))
    metricName: etcdLeaderChangesRate

  # Etcd metrics
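  # Leader status, 99th-percentile backend commit / WAL fsync / peer round-trip
  # latency, physical vs. logical DB size, per-resource object counts, and the
  # reported etcd cluster version.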
  - query: etcd_server_is_leader > 0
    metricName: etcdServerIsLeader

  - query: histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[2m]))
    metricName: 99thEtcdDiskBackendCommitDurationSeconds

  - query: histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[2m]))
    metricName: 99thEtcdDiskWalFsyncDurationSeconds

  - query: histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket[5m]))
    metricName: 99thEtcdRoundTripTimeSeconds

  - query: etcd_mvcc_db_total_size_in_bytes
    metricName: etcdDBPhysicalSizeBytes

  - query: etcd_mvcc_db_total_size_in_use_in_bytes
    metricName: etcdDBLogicalSizeBytes

  - query: sum(rate(etcd_object_counts{}[5m])) by (resource) > 0
    metricName: etcdObjectCount

  - query: sum by (cluster_version)(etcd_cluster_version)
    metricName: etcdVersion
    instant: true