mirror of
https://github.com/krkn-chaos/krkn.git
synced 2026-04-15 06:57:28 +00:00
Krkn lib prometheus client + kube_burner references removed
Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>
This commit is contained in:
committed by
Naga Ravi Chaitanya Elluri
parent
93f1f19411
commit
f2d7f88cb8
@@ -15,15 +15,13 @@ cerberus:
|
||||
performance_monitoring:
|
||||
deploy_dashboards: False # Install a mutable grafana and load the performance dashboards. Enable this only when running on OpenShift.
|
||||
repo: "https://github.com/cloud-bulldozer/performance-dashboards.git"
|
||||
kube_burner_binary_url: "https://github.com/cloud-bulldozer/kube-burner/releases/download/v0.9.1/kube-burner-0.9.1-Linux-x86_64.tar.gz"
|
||||
capture_metrics: False
|
||||
config_path: config/kube_burner.yaml # Define the Elasticsearch url and index name in this config.
|
||||
metrics_profile_path: config/metrics-aggregated.yaml
|
||||
prometheus_url: # The prometheus url/route is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes.
|
||||
prometheus_bearer_token: # The bearer token is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes. This is needed to authenticate with prometheus.
|
||||
uuid: # uuid for the run is generated by default if not set.
|
||||
enable_alerts: False # Runs the queries specified in the alert profile and displays the info or exits 1 when severity=error.
|
||||
alert_profile: config/alerts # Path to alert profile with the prometheus queries.
|
||||
alert_profile: config/alerts.yaml # Path to alert profile with the prometheus queries.
|
||||
|
||||
tunings:
|
||||
wait_duration: 6 # Duration to wait between each chaos scenario.
|
||||
|
||||
@@ -80,7 +80,7 @@ Scenario type | Kubernetes | OpenShift
|
||||
It is important to make sure to check if the targeted component recovered from the chaos injection and also if the Kubernetes/OpenShift cluster is healthy as failures in one component can have an adverse impact on other components. Kraken does this by:
|
||||
- Having built in checks for pod and node based scenarios to ensure the expected number of replicas and nodes are up. It also supports running custom scripts with the checks.
|
||||
- Leveraging [Cerberus](https://github.com/openshift-scale/cerberus) to monitor the cluster under test and consuming the aggregated go/no-go signal to determine pass/fail post chaos. It is highly recommended to turn on the Cerberus health check feature available in Kraken. Instructions on installing and setting up Cerberus can be found [here](https://github.com/openshift-scale/cerberus#installation) or can be installed from Kraken using the [instructions](https://github.com/redhat-chaos/krkn#setting-up-infrastructure-dependencies). Once Cerberus is up and running, set cerberus_enabled to True and cerberus_url to the url where Cerberus publishes go/no-go signal in the Kraken config file. Cerberus can monitor [application routes](https://github.com/redhat-chaos/cerberus/blob/main/docs/config.md#watch-routes) during the chaos and fails the run if it encounters downtime as it is a potential downtime in a customers, or users environment as well. It is especially important during the control plane chaos scenarios including the API server, Etcd, Ingress etc. It can be enabled by setting `check_applicaton_routes: True` in the [Kraken config](https://github.com/redhat-chaos/krkn/blob/main/config/config.yaml) provided application routes are being monitored in the [cerberus config](https://github.com/redhat-chaos/krkn/blob/main/config/cerberus.yaml).
|
||||
- Leveraging [kube-burner](docs/alerts.md) alerting feature to fail the runs in case of critical alerts.
|
||||
- Leveraging built-in alert collection feature to fail the runs in case of critical alerts.
|
||||
|
||||
### Signaling
|
||||
In CI runs or any external job it is useful to stop Kraken once a certain test or state gets reached. We created a way to signal to kraken to pause the chaos or stop it completely using a signal posted to a port of your choice.
|
||||
|
||||
90
config/alerts.yaml
Normal file
90
config/alerts.yaml
Normal file
@@ -0,0 +1,90 @@
|
||||
# etcd
|
||||
|
||||
- expr: avg_over_time(histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[2m]))[10m:]) > 0.01
|
||||
description: 10 minutes avg. 99th etcd fsync latency on {{$labels.pod}} higher than 10ms. {{$value}}s
|
||||
severity: warning
|
||||
|
||||
- expr: avg_over_time(histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[2m]))[10m:]) > 1
|
||||
description: 10 minutes avg. 99th etcd fsync latency on {{$labels.pod}} higher than 1s. {{$value}}s
|
||||
severity: error
|
||||
|
||||
- expr: avg_over_time(histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[2m]))[10m:]) > 0.007
|
||||
description: 10 minutes avg. 99th etcd commit latency on {{$labels.pod}} higher than 30ms. {{$value}}s
|
||||
severity: warning
|
||||
|
||||
- expr: rate(etcd_server_leader_changes_seen_total[2m]) > 0
|
||||
description: etcd leader changes observed
|
||||
severity: warning
|
||||
|
||||
- expr: (last_over_time(etcd_mvcc_db_total_size_in_bytes[5m]) / last_over_time(etcd_server_quota_backend_bytes[5m]))*100 > 95
|
||||
description: etcd cluster database is running full.
|
||||
severity: critical
|
||||
|
||||
- expr: (last_over_time(etcd_mvcc_db_total_size_in_use_in_bytes[5m]) / last_over_time(etcd_mvcc_db_total_size_in_bytes[5m])) < 0.5
|
||||
description: etcd database size in use is less than 50% of the actual allocated storage.
|
||||
severity: warning
|
||||
|
||||
- expr: rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5
|
||||
description: etcd cluster has high number of proposal failures.
|
||||
severity: warning
|
||||
|
||||
- expr: histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m])) > 0.15
|
||||
description: etcd cluster member communication is slow.
|
||||
severity: warning
|
||||
|
||||
- expr: histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_method!="Defragment", grpc_type="unary"}[5m])) without(grpc_type)) > 0.15
|
||||
description: etcd grpc requests are slow.
|
||||
severity: critical
|
||||
|
||||
- expr: 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code=~"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded"}[5m])) without (grpc_type, grpc_code) / sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) without (grpc_type, grpc_code) > 5
|
||||
description: etcd cluster has high number of failed grpc requests.
|
||||
severity: critical
|
||||
|
||||
- expr: etcd_server_has_leader{job=~".*etcd.*"} == 0
|
||||
description: etcd cluster has no leader.
|
||||
severity: warning
|
||||
|
||||
- expr: sum(up{job=~".*etcd.*"} == bool 1) without (instance) < ((count(up{job=~".*etcd.*"}) without (instance) + 1) / 2)
|
||||
description: etcd cluster has insufficient number of members.
|
||||
severity: warning
|
||||
|
||||
- expr: max without (endpoint) ( sum without (instance) (up{job=~".*etcd.*"} == bool 0) or count without (To) ( sum without (instance) (rate(etcd_network_peer_sent_failures_total{job=~".*etcd.*"}[120s])) > 0.01 )) > 0
|
||||
description: etcd cluster members are down.
|
||||
severity: warning
|
||||
|
||||
# API server
|
||||
- expr: avg_over_time(histogram_quantile(0.99, sum(irate(apiserver_request_duration_seconds_bucket{apiserver="kube-apiserver", verb=~"POST|PUT|DELETE|PATCH", subresource!~"log|exec|portforward|attach|proxy"}[2m])) by (le, resource, verb))[10m:]) > 1
|
||||
description: 10 minutes avg. 99th mutating API call latency for {{$labels.verb}}/{{$labels.resource}} higher than 1 second. {{$value}}s
|
||||
severity: error
|
||||
|
||||
- expr: avg_over_time(histogram_quantile(0.99, sum(irate(apiserver_request_duration_seconds_bucket{apiserver="kube-apiserver", verb=~"LIST|GET", subresource!~"log|exec|portforward|attach|proxy", scope="resource"}[2m])) by (le, resource, verb, scope))[5m:]) > 1
|
||||
description: 5 minutes avg. 99th read-only API call latency for {{$labels.verb}}/{{$labels.resource}} in scope {{$labels.scope}} higher than 1 second. {{$value}}s
|
||||
severity: error
|
||||
|
||||
- expr: avg_over_time(histogram_quantile(0.99, sum(irate(apiserver_request_duration_seconds_bucket{apiserver="kube-apiserver", verb=~"LIST|GET", subresource!~"log|exec|portforward|attach|proxy", scope="namespace"}[2m])) by (le, resource, verb, scope))[5m:]) > 5
|
||||
description: 5 minutes avg. 99th read-only API call latency for {{$labels.verb}}/{{$labels.resource}} in scope {{$labels.scope}} higher than 5 seconds. {{$value}}s
|
||||
severity: error
|
||||
|
||||
- expr: avg_over_time(histogram_quantile(0.99, sum(irate(apiserver_request_duration_seconds_bucket{apiserver="kube-apiserver", verb=~"LIST|GET", subresource!~"log|exec|portforward|attach|proxy", scope="cluster"}[2m])) by (le, resource, verb, scope))[5m:]) > 30
|
||||
description: 5 minutes avg. 99th read-only API call latency for {{$labels.verb}}/{{$labels.resource}} in scope {{$labels.scope}} higher than 30 seconds. {{$value}}s
|
||||
severity: error
|
||||
|
||||
# Control plane pods
|
||||
|
||||
- expr: up{job=~"crio|kubelet"} == 0
|
||||
description: "{{$labels.node}}/{{$labels.job}} down"
|
||||
severity: warning
|
||||
|
||||
- expr: up{job="ovnkube-node"} == 0
|
||||
description: "{{$labels.instance}}/{{$labels.pod}} {{$labels.job}} down"
|
||||
severity: warning
|
||||
|
||||
# Service sync latency
|
||||
- expr: histogram_quantile(0.99, sum(rate(kubeproxy_network_programming_duration_seconds_bucket[2m])) by (le)) > 10
|
||||
description: 99th Kubeproxy network programming latency higher than 10 seconds. {{$value}}s
|
||||
severity: warning
|
||||
|
||||
# Prometheus alerts
|
||||
- expr: ALERTS{severity="critical", alertstate="firing"} > 0
|
||||
description: Critical prometheus alert. {{$labels.alertname}}
|
||||
severity: warning
|
||||
@@ -51,15 +51,13 @@ cerberus:
|
||||
performance_monitoring:
|
||||
deploy_dashboards: False # Install a mutable grafana and load the performance dashboards. Enable this only when running on OpenShift
|
||||
repo: "https://github.com/cloud-bulldozer/performance-dashboards.git"
|
||||
kube_burner_binary_url: "https://github.com/cloud-bulldozer/kube-burner/releases/download/v1.7.0/kube-burner-1.7.0-Linux-x86_64.tar.gz"
|
||||
capture_metrics: False
|
||||
config_path: config/kube_burner.yaml # Define the Elasticsearch url and index name in this config
|
||||
metrics_profile_path: config/metrics-aggregated.yaml
|
||||
prometheus_url: # The prometheus url/route is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes.
|
||||
prometheus_bearer_token: # The bearer token is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes. This is needed to authenticate with prometheus.
|
||||
uuid: # uuid for the run is generated by default if not set
|
||||
enable_alerts: False # Runs the queries specified in the alert profile and displays the info or exits 1 when severity=error
|
||||
alert_profile: config/alerts # Path or URL to alert profile with the prometheus queries
|
||||
alert_profile: config/alerts.yaml # Path or URL to alert profile with the prometheus queries
|
||||
check_critical_alerts: False # When enabled will check prometheus for critical alerts firing post chaos
|
||||
tunings:
|
||||
wait_duration: 60 # Duration to wait between each chaos scenario
|
||||
@@ -90,3 +88,6 @@ telemetry:
|
||||
oc_cli_path: /usr/bin/oc # optional, if not specified will be search in $PATH
|
||||
events_backup: True # enables/disables cluster events collection
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -20,15 +20,13 @@ cerberus:
|
||||
performance_monitoring:
|
||||
deploy_dashboards: False # Install a mutable grafana and load the performance dashboards. Enable this only when running on OpenShift
|
||||
repo: "https://github.com/cloud-bulldozer/performance-dashboards.git"
|
||||
kube_burner_binary_url: "https://github.com/cloud-bulldozer/kube-burner/releases/download/v0.9.1/kube-burner-0.9.1-Linux-x86_64.tar.gz"
|
||||
capture_metrics: False
|
||||
config_path: config/kube_burner.yaml # Define the Elasticsearch url and index name in this config
|
||||
metrics_profile_path: config/metrics-aggregated.yaml
|
||||
prometheus_url: # The prometheus url/route is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes.
|
||||
prometheus_bearer_token: # The bearer token is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes. This is needed to authenticate with prometheus.
|
||||
uuid: # uuid for the run is generated by default if not set
|
||||
enable_alerts: False # Runs the queries specified in the alert profile and displays the info or exits 1 when severity=error
|
||||
alert_profile: config/alerts # Path to alert profile with the prometheus queries
|
||||
alert_profile: config/alerts.yaml # Path to alert profile with the prometheus queries
|
||||
|
||||
tunings:
|
||||
wait_duration: 60 # Duration to wait between each chaos scenario
|
||||
|
||||
@@ -19,15 +19,13 @@ cerberus:
|
||||
performance_monitoring:
|
||||
deploy_dashboards: False # Install a mutable grafana and load the performance dashboards. Enable this only when running on OpenShift
|
||||
repo: "https://github.com/cloud-bulldozer/performance-dashboards.git"
|
||||
kube_burner_binary_url: "https://github.com/cloud-bulldozer/kube-burner/releases/download/v0.9.1/kube-burner-0.9.1-Linux-x86_64.tar.gz"
|
||||
capture_metrics: False
|
||||
config_path: config/kube_burner.yaml # Define the Elasticsearch url and index name in this config
|
||||
metrics_profile_path: config/metrics-aggregated.yaml
|
||||
prometheus_url: # The prometheus url/route is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes.
|
||||
prometheus_bearer_token: # The bearer token is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes. This is needed to authenticate with prometheus.
|
||||
uuid: # uuid for the run is generated by default if not set
|
||||
enable_alerts: False # Runs the queries specified in the alert profile and displays the info or exits 1 when severity=error
|
||||
alert_profile: config/alerts # Path to alert profile with the prometheus queries
|
||||
alert_profile: config/alerts.yaml # Path to alert profile with the prometheus queries
|
||||
check_critical_alerts: False # When enabled will check prometheus for critical alerts firing post chaos after soak time for the cluster to settle down
|
||||
tunings:
|
||||
wait_duration: 60 # Duration to wait between each chaos scenario
|
||||
|
||||
@@ -41,15 +41,13 @@ cerberus:
|
||||
performance_monitoring:
|
||||
deploy_dashboards: True # Install a mutable grafana and load the performance dashboards. Enable this only when running on OpenShift
|
||||
repo: "https://github.com/cloud-bulldozer/performance-dashboards.git"
|
||||
kube_burner_binary_url: "https://github.com/cloud-bulldozer/kube-burner/releases/download/v0.9.1/kube-burner-0.9.1-Linux-x86_64.tar.gz"
|
||||
capture_metrics: True
|
||||
config_path: config/kube_burner.yaml # Define the Elasticsearch url and index name in this config
|
||||
metrics_profile_path: config/metrics-aggregated.yaml
|
||||
prometheus_url: # The prometheus url/route is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes.
|
||||
prometheus_bearer_token: # The bearer token is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes. This is needed to authenticate with prometheus.
|
||||
uuid: # uuid for the run is generated by default if not set
|
||||
enable_alerts: True # Runs the queries specified in the alert profile and displays the info or exits 1 when severity=error
|
||||
alert_profile: config/alerts # Path to alert profile with the prometheus queries
|
||||
alert_profile: config/alerts.yaml # Path to alert profile with the prometheus queries
|
||||
|
||||
tunings:
|
||||
wait_duration: 60 # Duration to wait between each chaos scenario
|
||||
|
||||
@@ -1,15 +0,0 @@
|
||||
---
|
||||
|
||||
global:
|
||||
writeToFile: true
|
||||
metricsDirectory: collected-metrics
|
||||
measurements:
|
||||
- name: podLatency
|
||||
esIndex: kraken
|
||||
|
||||
indexerConfig:
|
||||
enabled: true
|
||||
esServers: [http://0.0.0.0:9200] # Please change this to the respective Elasticsearch in use if you haven't run the podman-compose command to setup the infrastructure containers
|
||||
insecureSkipVerify: true
|
||||
defaultIndex: kraken
|
||||
type: elastic
|
||||
@@ -11,19 +11,18 @@ performance_monitoring:
|
||||
```
|
||||
|
||||
### Validation and alerting based on the queries defined by the user during chaos
|
||||
Takes PromQL queries as input and modifies the return code of the run to determine pass/fail. It's especially useful in case of automated runs in CI where user won't be able to monitor the system. It uses [Kube-burner](https://kube-burner.readthedocs.io/en/latest/) under the hood. This feature can be enabled in the [config](https://github.com/redhat-chaos/krkn/blob/main/config/config.yaml) by setting the following:
|
||||
Takes PromQL queries as input and modifies the return code of the run to determine pass/fail. It's especially useful in case of automated runs in CI where user won't be able to monitor the system. This feature can be enabled in the [config](https://github.com/redhat-chaos/krkn/blob/main/config/config.yaml) by setting the following:
|
||||
|
||||
```
|
||||
performance_monitoring:
|
||||
kube_burner_binary_url: "https://github.com/cloud-bulldozer/kube-burner/releases/download/v0.9.1/kube-burner-0.9.1-Linux-x86_64.tar.gz"
|
||||
prometheus_url: # The prometheus url/route is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes.
|
||||
prometheus_bearer_token: # The bearer token is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes. This is needed to authenticate with prometheus.
|
||||
enable_alerts: True # Runs the queries specified in the alert profile and displays the info or exits 1 when severity=error.
|
||||
alert_profile: config/alerts # Path to alert profile with the prometheus queries.
|
||||
alert_profile: config/alerts.yaml # Path to alert profile with the prometheus queries.
|
||||
```
|
||||
|
||||
#### Alert profile
|
||||
A couple of [alert profiles](https://github.com/redhat-chaos/krkn/tree/main/config) [alerts](https://github.com/redhat-chaos/krkn/blob/main/config/alerts) are shipped by default and can be tweaked to add more queries to alert on. User can provide a URL or path to the file in the [config](https://github.com/redhat-chaos/krkn/blob/main/config/config.yaml). The following are a few alerts examples:
|
||||
A couple of [alert profiles](https://github.com/redhat-chaos/krkn/tree/main/config) [alerts](https://github.com/redhat-chaos/krkn/blob/main/config/alerts.yaml) are shipped by default and can be tweaked to add more queries to alert on. User can provide a URL or path to the file in the [config](https://github.com/redhat-chaos/krkn/blob/main/config/config.yaml). The following are a few alerts examples:
|
||||
|
||||
```
|
||||
- expr: avg_over_time(histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[2m]))[5m:]) > 0.01
|
||||
|
||||
@@ -1,14 +1,12 @@
|
||||
## Scraping and storing metrics for the run
|
||||
|
||||
There are cases where the state of the cluster and metrics on the cluster during the chaos test run need to be stored long term to review after the cluster is terminated, for example CI and automation test runs. To help with this, Kraken supports capturing metrics for the duration of the scenarios defined in the config and indexes them into Elasticsearch. The indexed metrics can be visualized with the help of Grafana.
|
||||
There are cases where the state of the cluster and metrics on the cluster during the chaos test run need to be stored long term to review after the cluster is terminated, for example CI and automation test runs. To help with this, Kraken supports capturing metrics for the duration of the scenarios defined in the config.
|
||||
|
||||
It uses [Kube-burner](https://github.com/cloud-bulldozer/kube-burner) under the hood. The metrics to capture need to be defined in a metrics profile which Kraken consumes to query prometheus ( installed by default in OpenShift ) with the start and end timestamp of the run. Each run has a unique identifier ( uuid ) and all the metrics/documents in Elasticsearch will be associated with it. The uuid is generated automatically if not set in the config. This feature can be enabled in the [config](https://github.com/redhat-chaos/krkn/blob/main/config/config.yaml) by setting the following:
|
||||
The metrics to capture need to be defined in a metrics profile which Kraken consumes to query prometheus with the start and end timestamp of the run. Each run has a unique identifier ( uuid ). The uuid is generated automatically if not set in the config. This feature can be enabled in the [config](https://github.com/redhat-chaos/krkn/blob/main/config/config.yaml) by setting the following:
|
||||
|
||||
```
|
||||
performance_monitoring:
|
||||
kube_burner_binary_url: "https://github.com/cloud-bulldozer/kube-burner/releases/download/v0.9.1/kube-burner-0.9.1-Linux-x86_64.tar.gz"
|
||||
capture_metrics: True
|
||||
config_path: config/kube_burner.yaml # Define the Elasticsearch url and index name in this config.
|
||||
metrics_profile_path: config/metrics-aggregated.yaml
|
||||
prometheus_url: # The prometheus url/route is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes.
|
||||
prometheus_bearer_token: # The bearer token is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes. This is needed to authenticate with prometheus.
|
||||
@@ -31,21 +29,3 @@ metrics:
|
||||
metricName: APIInflightRequests
|
||||
```
|
||||
|
||||
### Indexing
|
||||
Define the Elasticsearch and index to store the metrics/documents in the kube_burner config:
|
||||
|
||||
```
|
||||
global:
|
||||
writeToFile: true
|
||||
metricsDirectory: collected-metrics
|
||||
measurements:
|
||||
- name: podLatency
|
||||
esIndex: kube-burner
|
||||
|
||||
indexerConfig:
|
||||
enabled: true
|
||||
esServers: [https://elastic.example.com:9200]
|
||||
insecureSkipVerify: true
|
||||
defaultIndex: kraken
|
||||
type: elastic
|
||||
```
|
||||
|
||||
@@ -1,116 +0,0 @@
|
||||
import subprocess
|
||||
import logging
|
||||
import urllib.request
|
||||
import shutil
|
||||
import sys
|
||||
import requests
|
||||
import tempfile
|
||||
import kraken.prometheus.client as prometheus
|
||||
from urllib.parse import urlparse
|
||||
|
||||
|
||||
def setup(url):
|
||||
"""
|
||||
Downloads and unpacks kube-burner binary
|
||||
"""
|
||||
|
||||
filename = "kube_burner.tar"
|
||||
try:
|
||||
logging.info("Fetching kube-burner binary")
|
||||
urllib.request.urlretrieve(url, filename)
|
||||
except Exception as e:
|
||||
logging.error("Failed to download kube-burner binary located at %s" % url, e)
|
||||
sys.exit(1)
|
||||
try:
|
||||
logging.info("Unpacking kube-burner tar ball")
|
||||
shutil.unpack_archive(filename)
|
||||
except Exception as e:
|
||||
logging.error("Failed to unpack the kube-burner binary tarball: %s" % e)
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
def scrape_metrics(
|
||||
distribution, uuid, prometheus_url, prometheus_bearer_token, start_time, end_time, config_path, metrics_profile
|
||||
):
|
||||
"""
|
||||
Scrapes metrics defined in the profile from Prometheus and indexes them into Elasticsearch
|
||||
"""
|
||||
|
||||
if not prometheus_url:
|
||||
if distribution == "openshift":
|
||||
logging.info("Looks like prometheus_url is not defined, trying to use the default instance on the cluster")
|
||||
prometheus_url, prometheus_bearer_token = prometheus.instance(
|
||||
distribution, prometheus_url, prometheus_bearer_token
|
||||
)
|
||||
else:
|
||||
logging.error("Looks like prometheus url is not defined, exiting")
|
||||
sys.exit(1)
|
||||
command = (
|
||||
"./kube-burner index --uuid "
|
||||
+ str(uuid)
|
||||
+ " -u "
|
||||
+ str(prometheus_url)
|
||||
+ " -t "
|
||||
+ str(prometheus_bearer_token)
|
||||
+ " -m "
|
||||
+ str(metrics_profile)
|
||||
+ " --start "
|
||||
+ str(start_time)
|
||||
+ " --end "
|
||||
+ str(end_time)
|
||||
+ " -c "
|
||||
+ str(config_path)
|
||||
)
|
||||
try:
|
||||
logging.info("Running kube-burner to capture the metrics: %s" % command)
|
||||
logging.info("UUID for the run: %s" % uuid)
|
||||
subprocess.run(command, shell=True, universal_newlines=True)
|
||||
except Exception as e:
|
||||
logging.error("Failed to run kube-burner, error: %s" % (e))
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
def alerts(distribution, prometheus_url, prometheus_bearer_token, start_time, end_time, alert_profile):
|
||||
"""
|
||||
Scrapes metrics defined in the profile from Prometheus and alerts based on the severity defined
|
||||
"""
|
||||
|
||||
is_url = urlparse(alert_profile)
|
||||
if is_url.scheme and is_url.netloc:
|
||||
response = requests.get(alert_profile)
|
||||
temp_alerts = tempfile.NamedTemporaryFile()
|
||||
temp_alerts.write(response.content)
|
||||
temp_alerts.flush()
|
||||
alert_profile = temp_alerts.name
|
||||
|
||||
if not prometheus_url:
|
||||
if distribution == "openshift":
|
||||
logging.info("Looks like prometheus_url is not defined, trying to use the default instance on the cluster")
|
||||
prometheus_url, prometheus_bearer_token = prometheus.instance(
|
||||
distribution, prometheus_url, prometheus_bearer_token
|
||||
)
|
||||
else:
|
||||
logging.error("Looks like prometheus url is not defined, exiting")
|
||||
sys.exit(1)
|
||||
command = (
|
||||
"./kube-burner check-alerts "
|
||||
+ " -u "
|
||||
+ str(prometheus_url)
|
||||
+ " -t "
|
||||
+ str(prometheus_bearer_token)
|
||||
+ " -a "
|
||||
+ str(alert_profile)
|
||||
+ " --start "
|
||||
+ str(start_time)
|
||||
+ " --end "
|
||||
+ str(end_time)
|
||||
)
|
||||
try:
|
||||
logging.info("Running kube-burner to capture the metrics: %s" % command)
|
||||
output = subprocess.run(command, shell=True, universal_newlines=True)
|
||||
if output.returncode != 0:
|
||||
logging.error("command exited with a non-zero rc, please check the logs for errors or critical alerts")
|
||||
sys.exit(output.returncode)
|
||||
except Exception as e:
|
||||
logging.error("Failed to run kube-burner, error: %s" % (e))
|
||||
sys.exit(1)
|
||||
@@ -0,0 +1 @@
|
||||
from .client import *
|
||||
@@ -1,49 +1,30 @@
|
||||
import datetime
|
||||
import os.path
|
||||
import urllib3
|
||||
import logging
|
||||
import prometheus_api_client
|
||||
import sys
|
||||
import kraken.invoke.command as runcommand
|
||||
|
||||
import yaml
|
||||
from krkn_lib.prometheus.krkn_prometheus import KrknPrometheus
|
||||
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
|
||||
def alerts(prom_cli: KrknPrometheus, start_time, end_time, alert_profile):
|
||||
|
||||
# Initialize the client
|
||||
def initialize_prom_client(distribution, prometheus_url, prometheus_bearer_token):
|
||||
global prom_cli
|
||||
prometheus_url, prometheus_bearer_token = instance(distribution, prometheus_url, prometheus_bearer_token)
|
||||
if prometheus_url and prometheus_bearer_token:
|
||||
bearer = "Bearer " + prometheus_bearer_token
|
||||
headers = {"Authorization": bearer}
|
||||
try:
|
||||
prom_cli = prometheus_api_client.PrometheusConnect(url=prometheus_url, headers=headers, disable_ssl=True)
|
||||
except Exception as e:
|
||||
logging.error("Not able to initialize the client %s" % e)
|
||||
if alert_profile is None or os.path.exists(alert_profile) is False:
|
||||
logging.error(f"{alert_profile} alert profile does not exist")
|
||||
sys.exit(1)
|
||||
|
||||
with open(alert_profile) as profile:
|
||||
profile_yaml = yaml.safe_load(profile)
|
||||
if not isinstance(profile_yaml, list):
|
||||
logging.error(f"{alert_profile} wrong file format, alert profile must be "
|
||||
f"a valid yaml file containing a list of items with 3 properties: "
|
||||
f"expr, description, severity" )
|
||||
sys.exit(1)
|
||||
else:
|
||||
prom_cli = None
|
||||
|
||||
for alert in profile_yaml:
|
||||
if list(alert.keys()).sort() != ["expr", "description", "severity"].sort():
|
||||
logging.error(f"wrong alert {alert}, skipping")
|
||||
|
||||
# Process custom prometheus query
|
||||
def process_prom_query(query):
|
||||
if prom_cli:
|
||||
try:
|
||||
return prom_cli.custom_query(query=query, params=None)
|
||||
except Exception as e:
|
||||
logging.error("Failed to get the metrics: %s" % e)
|
||||
sys.exit(1)
|
||||
else:
|
||||
logging.info("Skipping the prometheus query as the prometheus client couldn't " "be initialized\n")
|
||||
|
||||
# Get prometheus details
|
||||
def instance(distribution, prometheus_url, prometheus_bearer_token):
|
||||
if distribution == "openshift" and not prometheus_url:
|
||||
url = runcommand.invoke(
|
||||
r"""oc get routes -n openshift-monitoring -o=jsonpath='{.items[?(@.metadata.name=="prometheus-k8s")].spec.host}'""" # noqa
|
||||
)
|
||||
prometheus_url = "https://" + url
|
||||
if distribution == "openshift" and not prometheus_bearer_token:
|
||||
prometheus_bearer_token = runcommand.invoke(
|
||||
"oc create token -n openshift-monitoring prometheus-k8s --duration=12h "
|
||||
"|| oc -n openshift-monitoring sa get-token prometheus-k8s "
|
||||
"|| oc sa new-token -n openshift-monitoring prometheus-k8s"
|
||||
)
|
||||
return prometheus_url, prometheus_bearer_token
|
||||
prom_cli.process_alert(alert,
|
||||
datetime.datetime.fromtimestamp(start_time),
|
||||
datetime.datetime.fromtimestamp(end_time))
|
||||
@@ -19,14 +19,13 @@ ibm_cloud_sdk_core
|
||||
ibm_vpc
|
||||
itsdangerous==2.0.1
|
||||
jinja2==3.0.3
|
||||
krkn-lib>=1.4.5
|
||||
krkn-lib >= 1.4.5
|
||||
kubernetes
|
||||
lxml >= 4.3.0
|
||||
oauth2client>=4.1.3
|
||||
openshift-client
|
||||
paramiko
|
||||
podman-compose
|
||||
prometheus_api_client
|
||||
pyVmomi >= 6.7
|
||||
pyfiglet
|
||||
pytest
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
#!/usr/bin/env python
|
||||
import datetime
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
@@ -8,6 +9,7 @@ import optparse
|
||||
import pyfiglet
|
||||
import uuid
|
||||
import time
|
||||
from krkn_lib.prometheus.krkn_prometheus import KrknPrometheus
|
||||
import kraken.time_actions.common_time_functions as time_actions
|
||||
import kraken.performance_dashboards.setup as performance_dashboards
|
||||
import kraken.pod_scenarios.setup as pod_scenarios
|
||||
@@ -15,16 +17,14 @@ import kraken.service_disruption.common_service_disruption_functions as service_
|
||||
import kraken.shut_down.common_shut_down_func as shut_down
|
||||
import kraken.node_actions.run as nodeaction
|
||||
import kraken.managedcluster_scenarios.run as managedcluster_scenarios
|
||||
import kraken.kube_burner.client as kube_burner
|
||||
import kraken.zone_outage.actions as zone_outages
|
||||
import kraken.application_outage.actions as application_outage
|
||||
import kraken.pvc.pvc_scenario as pvc_scenario
|
||||
import kraken.network_chaos.actions as network_chaos
|
||||
import kraken.arcaflow_plugin as arcaflow_plugin
|
||||
import kraken.prometheus as prometheus_plugin
|
||||
import server as server
|
||||
import kraken.prometheus.client as promcli
|
||||
from kraken import plugins
|
||||
|
||||
from krkn_lib.k8s import KrknKubernetes
|
||||
from krkn_lib.ocp import KrknOpenshift
|
||||
from krkn_lib.telemetry.k8s import KrknTelemetryKubernetes
|
||||
@@ -33,11 +33,7 @@ from krkn_lib.models.telemetry import ChaosRunTelemetry
|
||||
from krkn_lib.utils import SafeLogger
|
||||
from krkn_lib.utils.functions import get_yaml_item_value
|
||||
|
||||
KUBE_BURNER_URL = (
|
||||
"https://github.com/cloud-bulldozer/kube-burner/"
|
||||
"releases/download/v{version}/kube-burner-{version}-Linux-x86_64.tar.gz"
|
||||
)
|
||||
KUBE_BURNER_VERSION = "1.7.0"
|
||||
|
||||
|
||||
|
||||
# Main function
|
||||
@@ -84,21 +80,7 @@ def main(cfg):
|
||||
config["performance_monitoring"], "repo",
|
||||
"https://github.com/cloud-bulldozer/performance-dashboards.git"
|
||||
)
|
||||
capture_metrics = get_yaml_item_value(
|
||||
config["performance_monitoring"], "capture_metrics", False
|
||||
)
|
||||
kube_burner_url = get_yaml_item_value(
|
||||
config["performance_monitoring"], "kube_burner_binary_url",
|
||||
KUBE_BURNER_URL.format(version=KUBE_BURNER_VERSION),
|
||||
)
|
||||
config_path = get_yaml_item_value(
|
||||
config["performance_monitoring"], "config_path",
|
||||
"config/kube_burner.yaml"
|
||||
)
|
||||
metrics_profile = get_yaml_item_value(
|
||||
config["performance_monitoring"], "metrics_profile_path",
|
||||
"config/metrics-aggregated.yaml"
|
||||
)
|
||||
|
||||
prometheus_url = config["performance_monitoring"].get("prometheus_url")
|
||||
prometheus_bearer_token = config["performance_monitoring"].get(
|
||||
"prometheus_bearer_token"
|
||||
@@ -147,9 +129,6 @@ def main(cfg):
|
||||
except:
|
||||
kubecli.initialize_clients(None)
|
||||
|
||||
# KrknTelemetry init
|
||||
telemetry_k8s = KrknTelemetryKubernetes(safe_logger, kubecli)
|
||||
telemetry_ocp = KrknTelemetryOpenshift(safe_logger, ocpcli)
|
||||
|
||||
|
||||
# find node kraken might be running on
|
||||
@@ -179,11 +158,20 @@ def main(cfg):
|
||||
cv = ""
|
||||
if config["kraken"]["distribution"] == "openshift":
|
||||
cv = ocpcli.get_clusterversion_string()
|
||||
if prometheus_url is None:
|
||||
connection_data = ocpcli.get_prometheus_api_connection_data()
|
||||
prometheus_url = connection_data.endpoint
|
||||
prometheus_bearer_token = connection_data.token
|
||||
if cv != "":
|
||||
logging.info(cv)
|
||||
else:
|
||||
logging.info("Cluster version CRD not detected, skipping")
|
||||
|
||||
# KrknTelemetry init
|
||||
telemetry_k8s = KrknTelemetryKubernetes(safe_logger, kubecli)
|
||||
telemetry_ocp = KrknTelemetryOpenshift(safe_logger, ocpcli)
|
||||
prometheus = KrknPrometheus(prometheus_url, prometheus_bearer_token)
|
||||
|
||||
logging.info("Server URL: %s" % kubecli.get_host())
|
||||
|
||||
# Deploy performance dashboards
|
||||
@@ -351,9 +339,10 @@ def main(cfg):
|
||||
# Check for critical alerts when enabled
|
||||
if check_critical_alerts:
|
||||
logging.info("Checking for critical alerts firing post choas")
|
||||
promcli.initialize_prom_client(distribution, prometheus_url, prometheus_bearer_token)
|
||||
|
||||
##PROM
|
||||
query = r"""ALERTS{severity="critical"}"""
|
||||
critical_alerts = promcli.process_prom_query(query)
|
||||
critical_alerts = prometheus.process_prom_query_in_range(query, datetime.datetime.fromtimestamp(start_time))
|
||||
critical_alerts_count = len(critical_alerts)
|
||||
if critical_alerts_count > 0:
|
||||
logging.error("Critical alerts are firing: %s", critical_alerts)
|
||||
@@ -401,33 +390,13 @@ def main(cfg):
|
||||
else:
|
||||
logging.info("telemetry collection disabled, skipping.")
|
||||
|
||||
# Capture the end time
|
||||
|
||||
|
||||
# Capture metrics for the run
|
||||
if capture_metrics:
|
||||
logging.info("Capturing metrics")
|
||||
kube_burner.setup(kube_burner_url)
|
||||
kube_burner.scrape_metrics(
|
||||
distribution,
|
||||
run_uuid,
|
||||
prometheus_url,
|
||||
prometheus_bearer_token,
|
||||
start_time,
|
||||
end_time,
|
||||
config_path,
|
||||
metrics_profile,
|
||||
)
|
||||
|
||||
# Check for the alerts specified
|
||||
if enable_alerts:
|
||||
logging.info("Alerts checking is enabled")
|
||||
kube_burner.setup(kube_burner_url)
|
||||
if alert_profile:
|
||||
kube_burner.alerts(
|
||||
distribution,
|
||||
prometheus_url,
|
||||
prometheus_bearer_token,
|
||||
prometheus_plugin.alerts(
|
||||
prometheus,
|
||||
start_time,
|
||||
end_time,
|
||||
alert_profile,
|
||||
|
||||
@@ -1,6 +1,10 @@
|
||||
---
|
||||
deployer:
|
||||
connection: {}
|
||||
connection:
|
||||
cacert: ''
|
||||
cert: ''
|
||||
host: https://api.tsebasti-lab.aws.rhperfscale.org:6443
|
||||
key: ''
|
||||
type: kubernetes
|
||||
log:
|
||||
level: debug
|
||||
|
||||
@@ -2,13 +2,7 @@ input_list:
|
||||
- cpu_count: 1
|
||||
cpu_load_percentage: 80
|
||||
cpu_method: all
|
||||
duration: 30s
|
||||
node_selector: {}
|
||||
# node selector example
|
||||
# node_selector:
|
||||
# kubernetes.io/hostname: master
|
||||
kubeconfig: ""
|
||||
duration: 1s
|
||||
kubeconfig: ''
|
||||
namespace: default
|
||||
|
||||
# duplicate this section to run simultaneous stressors in the same run
|
||||
|
||||
node_selector: {}
|
||||
|
||||
Reference in New Issue
Block a user