Krkn lib prometheus client + kube_burner references removed

Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>
This commit is contained in:
Tullio Sebastiani
2024-01-09 16:28:49 +01:00
committed by Naga Ravi Chaitanya Elluri
parent 93f1f19411
commit f2d7f88cb8
19 changed files with 154 additions and 275 deletions

90
config/alerts.yaml Normal file
View File

@@ -0,0 +1,90 @@
# etcd
- expr: avg_over_time(histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[2m]))[10m:]) > 0.01
description: 10 minutes avg. 99th etcd fsync latency on {{$labels.pod}} higher than 10ms. {{$value}}s
severity: warning
- expr: avg_over_time(histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[2m]))[10m:]) > 1
description: 10 minutes avg. 99th etcd fsync latency on {{$labels.pod}} higher than 1s. {{$value}}s
severity: error
- expr: avg_over_time(histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[2m]))[10m:]) > 0.007
description: 10 minutes avg. 99th etcd commit latency on {{$labels.pod}} higher than 30ms. {{$value}}s
severity: warning
- expr: rate(etcd_server_leader_changes_seen_total[2m]) > 0
description: etcd leader changes observed
severity: warning
- expr: (last_over_time(etcd_mvcc_db_total_size_in_bytes[5m]) / last_over_time(etcd_server_quota_backend_bytes[5m]))*100 > 95
description: etcd cluster database is running full.
severity: critical
- expr: (last_over_time(etcd_mvcc_db_total_size_in_use_in_bytes[5m]) / last_over_time(etcd_mvcc_db_total_size_in_bytes[5m])) < 0.5
description: etcd database size in use is less than 50% of the actual allocated storage.
severity: warning
- expr: rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5
description: etcd cluster has high number of proposal failures.
severity: warning
- expr: histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m])) > 0.15
description: etcd cluster member communication is slow.
severity: warning
- expr: histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_method!="Defragment", grpc_type="unary"}[5m])) without(grpc_type)) > 0.15
description: etcd grpc requests are slow.
severity: critical
- expr: 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code=~"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded"}[5m])) without (grpc_type, grpc_code) / sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) without (grpc_type, grpc_code) > 5
description: etcd cluster has high number of failed grpc requests.
severity: critical
- expr: etcd_server_has_leader{job=~".*etcd.*"} == 0
description: etcd cluster has no leader.
severity: warning
- expr: sum(up{job=~".*etcd.*"} == bool 1) without (instance) < ((count(up{job=~".*etcd.*"}) without (instance) + 1) / 2)
description: etcd cluster has insufficient number of members.
severity: warning
- expr: max without (endpoint) ( sum without (instance) (up{job=~".*etcd.*"} == bool 0) or count without (To) ( sum without (instance) (rate(etcd_network_peer_sent_failures_total{job=~".*etcd.*"}[120s])) > 0.01 )) > 0
description: etcd cluster members are down.
severity: warning
# API server
- expr: avg_over_time(histogram_quantile(0.99, sum(irate(apiserver_request_duration_seconds_bucket{apiserver="kube-apiserver", verb=~"POST|PUT|DELETE|PATCH", subresource!~"log|exec|portforward|attach|proxy"}[2m])) by (le, resource, verb))[10m:]) > 1
description: 10 minutes avg. 99th mutating API call latency for {{$labels.verb}}/{{$labels.resource}} higher than 1 second. {{$value}}s
severity: error
- expr: avg_over_time(histogram_quantile(0.99, sum(irate(apiserver_request_duration_seconds_bucket{apiserver="kube-apiserver", verb=~"LIST|GET", subresource!~"log|exec|portforward|attach|proxy", scope="resource"}[2m])) by (le, resource, verb, scope))[5m:]) > 1
description: 5 minutes avg. 99th read-only API call latency for {{$labels.verb}}/{{$labels.resource}} in scope {{$labels.scope}} higher than 1 second. {{$value}}s
severity: error
- expr: avg_over_time(histogram_quantile(0.99, sum(irate(apiserver_request_duration_seconds_bucket{apiserver="kube-apiserver", verb=~"LIST|GET", subresource!~"log|exec|portforward|attach|proxy", scope="namespace"}[2m])) by (le, resource, verb, scope))[5m:]) > 5
description: 5 minutes avg. 99th read-only API call latency for {{$labels.verb}}/{{$labels.resource}} in scope {{$labels.scope}} higher than 5 seconds. {{$value}}s
severity: error
- expr: avg_over_time(histogram_quantile(0.99, sum(irate(apiserver_request_duration_seconds_bucket{apiserver="kube-apiserver", verb=~"LIST|GET", subresource!~"log|exec|portforward|attach|proxy", scope="cluster"}[2m])) by (le, resource, verb, scope))[5m:]) > 30
description: 5 minutes avg. 99th read-only API call latency for {{$labels.verb}}/{{$labels.resource}} in scope {{$labels.scope}} higher than 30 seconds. {{$value}}s
severity: error
# Control plane pods
- expr: up{job=~"crio|kubelet"} == 0
description: "{{$labels.node}}/{{$labels.job}} down"
severity: warning
- expr: up{job="ovnkube-node"} == 0
description: "{{$labels.instance}}/{{$labels.pod}} {{$labels.job}} down"
severity: warning
# Service sync latency
- expr: histogram_quantile(0.99, sum(rate(kubeproxy_network_programming_duration_seconds_bucket[2m])) by (le)) > 10
description: 99th Kubeproxy network programming latency higher than 10 seconds. {{$value}}s
severity: warning
# Prometheus alerts
- expr: ALERTS{severity="critical", alertstate="firing"} > 0
description: Critical prometheus alert. {{$labels.alertname}}
severity: warning

View File

@@ -51,15 +51,13 @@ cerberus:
performance_monitoring:
deploy_dashboards: False # Install a mutable grafana and load the performance dashboards. Enable this only when running on OpenShift
repo: "https://github.com/cloud-bulldozer/performance-dashboards.git"
kube_burner_binary_url: "https://github.com/cloud-bulldozer/kube-burner/releases/download/v1.7.0/kube-burner-1.7.0-Linux-x86_64.tar.gz"
capture_metrics: False
config_path: config/kube_burner.yaml # Define the Elasticsearch url and index name in this config
metrics_profile_path: config/metrics-aggregated.yaml
prometheus_url: # The prometheus url/route is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes.
prometheus_bearer_token: # The bearer token is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes. This is needed to authenticate with prometheus.
uuid: # uuid for the run is generated by default if not set
enable_alerts: False # Runs the queries specified in the alert profile and displays the info or exits 1 when severity=error
alert_profile: config/alerts # Path or URL to alert profile with the prometheus queries
alert_profile: config/alerts.yaml # Path or URL to alert profile with the prometheus queries
check_critical_alerts: False # When enabled will check prometheus for critical alerts firing post chaos
tunings:
wait_duration: 60 # Duration to wait between each chaos scenario
@@ -90,3 +88,6 @@ telemetry:
oc_cli_path: /usr/bin/oc # optional, if not specified will be search in $PATH
events_backup: True # enables/disables cluster events collection

View File

@@ -20,15 +20,13 @@ cerberus:
performance_monitoring:
deploy_dashboards: False # Install a mutable grafana and load the performance dashboards. Enable this only when running on OpenShift
repo: "https://github.com/cloud-bulldozer/performance-dashboards.git"
kube_burner_binary_url: "https://github.com/cloud-bulldozer/kube-burner/releases/download/v0.9.1/kube-burner-0.9.1-Linux-x86_64.tar.gz"
capture_metrics: False
config_path: config/kube_burner.yaml # Define the Elasticsearch url and index name in this config
metrics_profile_path: config/metrics-aggregated.yaml
prometheus_url: # The prometheus url/route is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes.
prometheus_bearer_token: # The bearer token is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes. This is needed to authenticate with prometheus.
uuid: # uuid for the run is generated by default if not set
enable_alerts: False # Runs the queries specified in the alert profile and displays the info or exits 1 when severity=error
alert_profile: config/alerts # Path to alert profile with the prometheus queries
alert_profile: config/alerts.yaml # Path to alert profile with the prometheus queries
tunings:
wait_duration: 60 # Duration to wait between each chaos scenario

View File

@@ -19,15 +19,13 @@ cerberus:
performance_monitoring:
deploy_dashboards: False # Install a mutable grafana and load the performance dashboards. Enable this only when running on OpenShift
repo: "https://github.com/cloud-bulldozer/performance-dashboards.git"
kube_burner_binary_url: "https://github.com/cloud-bulldozer/kube-burner/releases/download/v0.9.1/kube-burner-0.9.1-Linux-x86_64.tar.gz"
capture_metrics: False
config_path: config/kube_burner.yaml # Define the Elasticsearch url and index name in this config
metrics_profile_path: config/metrics-aggregated.yaml
prometheus_url: # The prometheus url/route is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes.
prometheus_bearer_token: # The bearer token is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes. This is needed to authenticate with prometheus.
uuid: # uuid for the run is generated by default if not set
enable_alerts: False # Runs the queries specified in the alert profile and displays the info or exits 1 when severity=error
alert_profile: config/alerts # Path to alert profile with the prometheus queries
alert_profile: config/alerts.yaml # Path to alert profile with the prometheus queries
check_critical_alerts: False # When enabled will check prometheus for critical alerts firing post chaos after soak time for the cluster to settle down
tunings:
wait_duration: 60 # Duration to wait between each chaos scenario

View File

@@ -41,15 +41,13 @@ cerberus:
performance_monitoring:
deploy_dashboards: True # Install a mutable grafana and load the performance dashboards. Enable this only when running on OpenShift
repo: "https://github.com/cloud-bulldozer/performance-dashboards.git"
kube_burner_binary_url: "https://github.com/cloud-bulldozer/kube-burner/releases/download/v0.9.1/kube-burner-0.9.1-Linux-x86_64.tar.gz"
capture_metrics: True
config_path: config/kube_burner.yaml # Define the Elasticsearch url and index name in this config
metrics_profile_path: config/metrics-aggregated.yaml
prometheus_url: # The prometheus url/route is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes.
prometheus_bearer_token: # The bearer token is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes. This is needed to authenticate with prometheus.
uuid: # uuid for the run is generated by default if not set
enable_alerts: True # Runs the queries specified in the alert profile and displays the info or exits 1 when severity=error
alert_profile: config/alerts # Path to alert profile with the prometheus queries
alert_profile: config/alerts.yaml # Path to alert profile with the prometheus queries
tunings:
wait_duration: 60 # Duration to wait between each chaos scenario

View File

@@ -1,15 +0,0 @@
---
global:
writeToFile: true
metricsDirectory: collected-metrics
measurements:
- name: podLatency
esIndex: kraken
indexerConfig:
enabled: true
esServers: [http://0.0.0.0:9200] # Please change this to the respective Elasticsearch in use if you haven't run the podman-compose command to setup the infrastructure containers
insecureSkipVerify: true
defaultIndex: kraken
type: elastic