From 7e8f0450d6cf0ccd171119f7b0ceb110c5d7d5a7 Mon Sep 17 00:00:00 2001
From: Naga Ravi Chaitanya Elluri
Date: Sat, 29 May 2021 21:15:19 -0400
Subject: [PATCH] Add support to scrape and index metrics

This commit:
- Enables Kraken to leverage kube-burner to scrape metrics from Prometheus
  and index them into Elasticsearch. This way we can take a look at the
  metrics in Grafana long term, even after the cluster is terminated.
- Enables separation of operations based on distribution, with OpenShift as
  the default option. One of the use cases is to capture Prometheus instance
  details, as it's installed by default on OpenShift while it's optional for
  Kubernetes.
---
 README.md | 3 +
 config/config.yaml | 8 ++
 config/kube_burner.yaml | 15 +++
 config/metrics-aggregated.yaml | 184 +++++++++++++++++++++++++++++++++
 config/metrics.yaml | 117 +++++++++++++++++++++
 docs/metrics.md | 53 ++++++++++
 kraken/kube_burner/__init__.py | 0
 kraken/kube_burner/client.py | 62 +++++++++++
 kraken/prometheus/__init__.py | 0
 kraken/prometheus/client.py | 13 +++
 run_kraken.py | 44 ++++++++
 setup.cfg | 4 +
 12 files changed, 503 insertions(+)
 create mode 100644 config/kube_burner.yaml
 create mode 100644 config/metrics-aggregated.yaml
 create mode 100644 config/metrics.yaml
 create mode 100644 docs/metrics.md
 create mode 100644 kraken/kube_burner/__init__.py
 create mode 100644 kraken/kube_burner/client.py
 create mode 100644 kraken/prometheus/__init__.py
 create mode 100644 kraken/prometheus/client.py

diff --git a/README.md b/README.md
index cf8089e7..53b9ec0c 100644
--- a/README.md
+++ b/README.md
@@ -36,6 +36,9 @@ It's important to make sure to check if the targeted component recovered from th
 ### Performance monitoring
 Monitoring the Kubernetes/OpenShift cluster to observe the impact of Kraken chaos scenarios on various components is key to finding the bottlenecks, as it's important to make sure the cluster is healthy in terms of both recovery and performance during/after the failure has been injected. Instructions on enabling it can be found [here](docs/performance_dashboards.md).
 
+### Scraping and storing metrics long term
+Kraken supports capturing metrics for the duration of the scenarios defined in the config and indexes them into Elasticsearch so that the state of the runs can be stored and evaluated long term. The indexed metrics can be visualized with the help of Grafana. It uses [Kube-burner](https://github.com/cloud-bulldozer/kube-burner) under the hood. The metrics to capture need to be defined in a metrics profile, which Kraken consumes to query Prometheus (installed by default in OpenShift) with the start and end timestamp of the run. Information on enabling and leveraging this feature can be found [here](docs/metrics.md).
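+
+Under the hood, Kraken shells out to kube-burner's `index` subcommand, passing it the uuid of the run, the Prometheus connection details and the captured start/end timestamps. The invocation it builds looks roughly like this (the angle-bracket placeholders are illustrative):
+```
+./kube-burner index --uuid <uuid> -u <prometheus_url> -t <bearer_token> -m <metrics_profile> --start <start_timestamp> --end <end_timestamp> -c <kube_burner_config>
+```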
+
 ### Blogs and other useful resources
 - Blog post on introduction to Kraken: https://www.openshift.com/blog/introduction-to-kraken-a-chaos-tool-for-openshift/kubernetes
 - Discussion and demo on how Kraken can be leveraged to ensure OpenShift is reliable, performant and scalable: https://www.youtube.com/watch?v=s1PvupI5sD0&ab_channel=OpenShift
diff --git a/config/config.yaml b/config/config.yaml
index dc993391..e38f7c4c 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -1,4 +1,5 @@
 kraken:
+    distribution: openshift                 # Distribution can be kubernetes or openshift
     kubeconfig_path: /root/.kube/config     # Path to kubeconfig
     exit_on_failure: False                  # Exit when a post action scenario fails
     litmus_version: v1.10.0                 # Litmus version to install
@@ -26,6 +27,13 @@ cerberus:
 performance_monitoring:
     deploy_dashboards: False                # Install a mutable grafana and load the performance dashboards. Enable this only when running on OpenShift
    repo: "https://github.com/cloud-bulldozer/performance-dashboards.git"
+    kube_burner_binary_url: "https://github.com/cloud-bulldozer/kube-burner/releases/download/v0.9.1/kube-burner-0.9.1-Linux-x86_64.tar.gz"
+    capture_metrics: False
+    config_path: config/kube_burner.yaml    # Define the Elasticsearch url and index name in this config
+    metrics_profile_path: config/metrics-aggregated.yaml
+    prometheus_url:                         # The Prometheus url/route is automatically obtained for OpenShift; set it explicitly when the distribution is Kubernetes
+    prometheus_bearer_token:                # The bearer token is automatically obtained for OpenShift; set it explicitly when the distribution is Kubernetes. It is needed to authenticate with Prometheus
+    uuid:                                   # uuid for the run; generated by default if not set
 
 tunings:
     wait_duration: 60                       # Duration to wait between each chaos scenario
diff --git a/config/kube_burner.yaml b/config/kube_burner.yaml
new file mode 100644
index 00000000..6a02ef2d
--- /dev/null
+++ b/config/kube_burner.yaml
@@ -0,0 +1,15 @@
+---
+
+global:
+  writeToFile: true
+  metricsDirectory: collected-metrics
+  measurements:
+    - name: podLatency
+      esIndex: kube-burner
+
+  indexerConfig:
+    enabled: true
+    esServers: [https://elastic.example.com:9200]
+    insecureSkipVerify: true
+    defaultIndex: kraken
+    type: elastic
diff --git a/config/metrics-aggregated.yaml b/config/metrics-aggregated.yaml
new file mode 100644
index 00000000..417c36ef
--- /dev/null
+++ b/config/metrics-aggregated.yaml
@@ -0,0 +1,184 @@
+metrics:
+# API server
+  - query: histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{apiserver="kube-apiserver", verb!~"WATCH", subresource!="log"}[2m])) by (verb,resource,subresource,instance,le)) > 0
+    metricName: API99thLatency
+
+  - query: sum(irate(apiserver_request_total{apiserver="kube-apiserver",verb!="WATCH",subresource!="log"}[2m])) by (verb,instance,resource,code) > 0
+    metricName: APIRequestRate
+
+  - query: sum(apiserver_current_inflight_requests{}) by (request_kind) > 0
+    metricName: APIInflightRequests
+
+# Container & pod metrics
+  - query: (sum(container_memory_rss{name!="",container!="POD",namespace=~"openshift-(etcd|oauth-apiserver|.*apiserver|ovn-kubernetes|sdn|ingress|authentication|.*controller-manager|.*scheduler)"}) by (container, pod, namespace, node) and on (node) kube_node_role{role="master"}) > 0
+    metricName: containerMemory-Masters
+
+  - query: (sum(irate(container_cpu_usage_seconds_total{name!="",container!="POD",namespace=~"openshift-(etcd|oauth-apiserver|sdn|ovn-kubernetes|.*apiserver|authentication|.*controller-manager|.*scheduler)"}[2m]) * 100) by (container, pod, namespace, node) and on (node) kube_node_role{role="master"}) > 0
+    metricName: containerCPU-Masters
+
+  - query: (sum(irate(container_cpu_usage_seconds_total{pod!="",container="prometheus",namespace="openshift-monitoring"}[2m]) * 100) by (container, pod, namespace, node) and on (node) kube_node_role{role="infra"}) > 0
+    metricName: containerCPU-Prometheus
+
+  - query: (avg(irate(container_cpu_usage_seconds_total{name!="",container!="POD",namespace=~"openshift-(sdn|ovn-kubernetes|ingress)"}[2m]) * 100 and on (node) kube_node_role{role="worker"}) by (namespace, container)) > 0
+    metricName: containerCPU-AggregatedWorkers
+
+  - query: (avg(irate(container_cpu_usage_seconds_total{name!="",container!="POD",namespace=~"openshift-(sdn|ovn-kubernetes|ingress|monitoring|image-registry|logging)"}[2m]) * 100 and on (node) kube_node_role{role="infra"}) by (namespace, container)) > 0
+    metricName: containerCPU-AggregatedInfra
+
+  - query: (sum(container_memory_rss{pod!="",namespace="openshift-monitoring",name!="",container="prometheus"}) by (container, pod, namespace, node) and on (node) kube_node_role{role="infra"}) > 0
+    metricName: containerMemory-Prometheus
+
+  - query: avg(container_memory_rss{name!="",container!="POD",namespace=~"openshift-(sdn|ovn-kubernetes|ingress)"} and on (node) kube_node_role{role="worker"}) by (container, namespace)
+    metricName: containerMemory-AggregatedWorkers
+
+  - query: avg(container_memory_rss{name!="",container!="POD",namespace=~"openshift-(sdn|ovn-kubernetes|ingress|monitoring|image-registry|logging)"} and on (node) kube_node_role{role="infra"}) by (container, namespace)
+    metricName: containerMemory-AggregatedInfra
+
+# Node metrics
+  - query: (sum(irate(node_cpu_seconds_total[2m])) by (mode,instance) and on (instance) label_replace(kube_node_role{role="master"}, "instance", "$1", "node", "(.+)")) > 0
+    metricName: nodeCPU-Masters
+
+  - query: (avg((sum(irate(node_cpu_seconds_total[2m])) by (mode,instance) and on (instance) label_replace(kube_node_role{role="worker"}, "instance", "$1", "node", "(.+)"))) by (mode)) > 0
+    metricName: nodeCPU-AggregatedWorkers
+
+  - query: (avg((sum(irate(node_cpu_seconds_total[2m])) by (mode,instance) and on (instance) label_replace(kube_node_role{role="infra"}, "instance", "$1", "node", "(.+)"))) by (mode)) > 0
+    metricName: nodeCPU-AggregatedInfra
+
+  - query: avg(node_memory_MemAvailable_bytes) by (instance) and on (instance) label_replace(kube_node_role{role="master"}, "instance", "$1", "node", "(.+)")
+    metricName: nodeMemoryAvailable-Masters
+
+  - query: avg(node_memory_MemAvailable_bytes and on (instance) label_replace(kube_node_role{role="worker"}, "instance", "$1", "node", "(.+)"))
+    metricName: nodeMemoryAvailable-AggregatedWorkers
+
+  - query: avg(node_memory_MemAvailable_bytes and on (instance) label_replace(kube_node_role{role="infra"}, "instance", "$1", "node", "(.+)"))
+    metricName: nodeMemoryAvailable-AggregatedInfra
+
+  - query: avg(node_memory_Active_bytes) by (instance) and on (instance) label_replace(kube_node_role{role="master"}, "instance", "$1", "node", "(.+)")
+    metricName: nodeMemoryActive-Masters
+
+  - query: avg(node_memory_Active_bytes and on (instance) label_replace(kube_node_role{role="worker"}, "instance", "$1", "node", "(.+)"))
+    metricName: nodeMemoryActive-AggregatedWorkers
+
+  - query: avg(avg(node_memory_Active_bytes) by (instance) and on (instance) label_replace(kube_node_role{role="infra"}, "instance", "$1", "node", "(.+)"))
+    metricName: nodeMemoryActive-AggregatedInfra
+
+  - query: avg(node_memory_Cached_bytes) by (instance) + avg(node_memory_Buffers_bytes) by (instance) and on (instance) label_replace(kube_node_role{role="master"}, "instance", "$1", "node", "(.+)")
+    metricName: nodeMemoryCached+nodeMemoryBuffers-Masters
+
+  - query: avg(node_memory_Cached_bytes + node_memory_Buffers_bytes and on (instance) label_replace(kube_node_role{role="worker"}, "instance", "$1", "node", "(.+)"))
+    metricName: nodeMemoryCached+nodeMemoryBuffers-AggregatedWorkers
+
+  - query: avg(node_memory_Cached_bytes + node_memory_Buffers_bytes and on (instance) label_replace(kube_node_role{role="infra"}, "instance", "$1", "node", "(.+)"))
+    metricName: nodeMemoryCached+nodeMemoryBuffers-AggregatedInfra
+
+  - query: irate(node_network_receive_bytes_total{device=~"^(ens|eth|bond|team).*"}[2m]) and on (instance) label_replace(kube_node_role{role="master"}, "instance", "$1", "node", "(.+)")
+    metricName: rxNetworkBytes-Masters
+
+  - query: avg(irate(node_network_receive_bytes_total{device=~"^(ens|eth|bond|team).*"}[2m]) and on (instance) label_replace(kube_node_role{role="worker"}, "instance", "$1", "node", "(.+)")) by (device)
+    metricName: rxNetworkBytes-AggregatedWorkers
+
+  - query: avg(irate(node_network_receive_bytes_total{device=~"^(ens|eth|bond|team).*"}[2m]) and on (instance) label_replace(kube_node_role{role="infra"}, "instance", "$1", "node", "(.+)")) by (device)
+    metricName: rxNetworkBytes-AggregatedInfra
+
+  - query: irate(node_network_transmit_bytes_total{device=~"^(ens|eth|bond|team).*"}[2m]) and on (instance) label_replace(kube_node_role{role="master"}, "instance", "$1", "node", "(.+)")
+    metricName: txNetworkBytes-Masters
+
+  - query: avg(irate(node_network_transmit_bytes_total{device=~"^(ens|eth|bond|team).*"}[2m]) and on (instance) label_replace(kube_node_role{role="worker"}, "instance", "$1", "node", "(.+)")) by (device)
+    metricName: txNetworkBytes-AggregatedWorkers
+
+  - query: avg(irate(node_network_transmit_bytes_total{device=~"^(ens|eth|bond|team).*"}[2m]) and on (instance) label_replace(kube_node_role{role="infra"}, "instance", "$1", "node", "(.+)")) by (device)
+    metricName: txNetworkBytes-AggregatedInfra
+
+  - query: rate(node_disk_written_bytes_total{device!~"^(dm|rb).*"}[2m]) and on (instance) label_replace(kube_node_role{role="master"}, "instance", "$1", "node", "(.+)")
+    metricName: nodeDiskWrittenBytes-Masters
+
+  - query: avg(rate(node_disk_written_bytes_total{device!~"^(dm|rb).*"}[2m]) and on (instance) label_replace(kube_node_role{role="worker"}, "instance", "$1", "node", "(.+)")) by (device)
+    metricName: nodeDiskWrittenBytes-AggregatedWorkers
+
+  - query: avg(rate(node_disk_written_bytes_total{device!~"^(dm|rb).*"}[2m]) and on (instance) label_replace(kube_node_role{role="infra"}, "instance", "$1", "node", "(.+)")) by (device)
+    metricName: nodeDiskWrittenBytes-AggregatedInfra
+
+  - query: rate(node_disk_read_bytes_total{device!~"^(dm|rb).*"}[2m]) and on (instance) label_replace(kube_node_role{role="master"}, "instance", "$1", "node", "(.+)")
+    metricName: nodeDiskReadBytes-Masters
+
+  - query: avg(rate(node_disk_read_bytes_total{device!~"^(dm|rb).*"}[2m]) and on (instance) label_replace(kube_node_role{role="worker"}, "instance", "$1", "node", "(.+)")) by (device)
+    metricName: nodeDiskReadBytes-AggregatedWorkers
+
+  - query: avg(rate(node_disk_read_bytes_total{device!~"^(dm|rb).*"}[2m]) and on (instance) label_replace(kube_node_role{role="infra"}, "instance", "$1", "node", "(.+)")) by (device)
+    metricName: nodeDiskReadBytes-AggregatedInfra
+
+# Etcd metrics
+  - query: sum(rate(etcd_server_leader_changes_seen_total[2m]))
+    metricName: etcdLeaderChangesRate
+
+  - query: etcd_server_is_leader > 0
+    metricName: etcdServerIsLeader
+
+  - query: histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[2m]))
+    metricName: 99thEtcdDiskBackendCommitDurationSeconds
+
+  - query: histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[2m]))
+    metricName: 99thEtcdDiskWalFsyncDurationSeconds
+
+  - query: histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket[5m]))
+    metricName: 99thEtcdRoundTripTimeSeconds
+
+  - query: etcd_mvcc_db_total_size_in_bytes
+    metricName: etcdDBPhysicalSizeBytes
+
+  - query: etcd_mvcc_db_total_size_in_use_in_bytes
+    metricName: etcdDBLogicalSizeBytes
+
+  - query: sum by (cluster_version)(etcd_cluster_version)
+    metricName: etcdVersion
+    instant: true
+
+  - query: sum(rate(etcd_object_counts{}[5m])) by (resource) > 0
+    metricName: etcdObjectCount
+
+  - query: histogram_quantile(0.99,sum(rate(etcd_request_duration_seconds_bucket[2m])) by (le,operation,apiserver)) > 0
+    metricName: P99APIEtcdRequestLatency
+
+# Cluster metrics
+  - query: count(kube_namespace_created)
+    metricName: namespaceCount
+
+  - query: sum(kube_pod_status_phase{}) by (phase)
+    metricName: podStatusCount
+
+  - query: count(kube_secret_info{})
+    metricName: secretCount
+
+  - query: count(kube_deployment_labels{})
+    metricName: deploymentCount
+
+  - query: count(kube_configmap_info{})
+    metricName: configmapCount
+
+  - query: count(kube_service_info{})
+    metricName: serviceCount
+
+  - query: kube_node_role
+    metricName: nodeRoles
+    instant: true
+
+  - query: sum(kube_node_status_condition{status="true"}) by (condition)
+    metricName: nodeStatus
+
+  - query: (sum(rate(container_fs_writes_bytes_total{container!="",device!~".+dm.+"}[5m])) by (device, container, node) and on (node) kube_node_role{role="master"}) > 0
+    metricName: containerDiskUsage
+
+  - query: cluster_version{type="completed"}
+    metricName: clusterVersion
+    instant: true
+
+# Golang metrics
+
+  - query: go_memstats_heap_alloc_bytes{job=~"apiserver|api|etcd"}
+    metricName: goHeapAllocBytes
+
+  - query: go_memstats_heap_inuse_bytes{job=~"apiserver|api|etcd"}
+    metricName: goHeapInuseBytes
+
+  - query: go_gc_duration_seconds{job=~"apiserver|api|etcd",quantile="1"}
+    metricName: goGCDurationSeconds
diff --git a/config/metrics.yaml b/config/metrics.yaml
new file mode 100644
index 00000000..bdad2be8
--- /dev/null
+++ b/config/metrics.yaml
@@ -0,0 +1,117 @@
+metrics:
+# API server
+  - query: histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{apiserver="kube-apiserver", verb!~"WATCH", subresource!="log"}[2m])) by (verb,resource,subresource,instance,le)) > 0
+    metricName: API99thLatency
+
+  - query: sum(irate(apiserver_request_total{apiserver="kube-apiserver",verb!="WATCH",subresource!="log"}[2m])) by (verb,instance,resource,code) > 0
+    metricName: APIRequestRate
+
+  - query: sum(apiserver_current_inflight_requests{}) by (request_kind) > 0
+    metricName: APIInflightRequests
+
+# Containers & pod metrics
+  - query: sum(irate(container_cpu_usage_seconds_total{name!="",namespace=~"openshift-(etcd|oauth-apiserver|.*apiserver|ovn-kubernetes|sdn|ingress|authentication|.*controller-manager|.*scheduler|monitoring|logging|image-registry)"}[2m]) * 100) by (pod, namespace, node)
+    metricName: podCPU
+
+  - query: sum(container_memory_rss{name!="",namespace=~"openshift-(etcd|oauth-apiserver|.*apiserver|ovn-kubernetes|sdn|ingress|authentication|.*controller-manager|.*scheduler|monitoring|logging|image-registry)"}) by (pod, namespace, node)
+    metricName: podMemory
+
+  - query: (sum(rate(container_fs_writes_bytes_total{container!="",device!~".+dm.+"}[5m])) by (device, container, node) and on (node) kube_node_role{role="master"}) > 0
+    metricName: containerDiskUsage
+
+# Kubelet & CRI-O metrics
+  - query: sum(irate(process_cpu_seconds_total{service="kubelet",job="kubelet"}[2m]) * 100) by (node) and on (node) kube_node_role{role="worker"}
+    metricName: kubeletCPU
+
+  - query: sum(process_resident_memory_bytes{service="kubelet",job="kubelet"}) by (node) and on (node) kube_node_role{role="worker"}
+    metricName: kubeletMemory
+
+  - query: sum(irate(process_cpu_seconds_total{service="kubelet",job="crio"}[2m]) * 100) by (node) and on (node) kube_node_role{role="worker"}
+    metricName: crioCPU
+
+  - query: sum(process_resident_memory_bytes{service="kubelet",job="crio"}) by (node) and on (node) kube_node_role{role="worker"}
+    metricName: crioMemory
+
+# Node metrics
+  - query: sum(irate(node_cpu_seconds_total[2m])) by (mode,instance) > 0
+    metricName: nodeCPU
+
+  - query: avg(node_memory_MemAvailable_bytes) by (instance)
+    metricName: nodeMemoryAvailable
+
+  - query: avg(node_memory_Active_bytes) by (instance)
+    metricName: nodeMemoryActive
+
+  - query: avg(node_memory_Cached_bytes) by (instance) + avg(node_memory_Buffers_bytes) by (instance)
+    metricName: nodeMemoryCached+nodeMemoryBuffers
+
+  - query: irate(node_network_receive_bytes_total{device=~"^(ens|eth|bond|team).*"}[2m])
+    metricName: rxNetworkBytes
+
+  - query: irate(node_network_transmit_bytes_total{device=~"^(ens|eth|bond|team).*"}[2m])
+    metricName: txNetworkBytes
+
+  - query: rate(node_disk_written_bytes_total{device!~"^(dm|rb).*"}[2m])
+    metricName: nodeDiskWrittenBytes
+
+  - query: rate(node_disk_read_bytes_total{device!~"^(dm|rb).*"}[2m])
+    metricName: nodeDiskReadBytes
+
+# Etcd metrics
+  - query: sum(rate(etcd_server_leader_changes_seen_total[2m]))
+    metricName: etcdLeaderChangesRate
+
+  - query: etcd_server_is_leader > 0
+    metricName: etcdServerIsLeader
+
+  - query: histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[2m]))
+    metricName: 99thEtcdDiskBackendCommitDurationSeconds
+
+  - query: histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[2m]))
+    metricName: 99thEtcdDiskWalFsyncDurationSeconds
+
+  - query: histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket[5m]))
+    metricName: 99thEtcdRoundTripTimeSeconds
+
+  - query: etcd_mvcc_db_total_size_in_bytes
+    metricName: etcdDBPhysicalSizeBytes
+
+  - query: etcd_mvcc_db_total_size_in_use_in_bytes
+    metricName: etcdDBLogicalSizeBytes
+
+  - query: sum(rate(etcd_object_counts{}[5m])) by (resource) > 0
+    metricName: etcdObjectCount
+
+  - query: sum by (cluster_version)(etcd_cluster_version)
+    metricName: etcdVersion
+    instant: true
+
+# Cluster metrics
+  - query: count(kube_namespace_created)
+    metricName: namespaceCount
+
+  - query: sum(kube_pod_status_phase{}) by (phase)
+    metricName: podStatusCount
+
+  - query: count(kube_secret_info{})
+    metricName: secretCount
+
+  - query: count(kube_deployment_labels{})
+    metricName: deploymentCount
+
+  - query: count(kube_configmap_info{})
+    metricName: configmapCount
+
+  - query: count(kube_service_info{})
+    metricName: serviceCount
+
+  - query: kube_node_role
+    metricName: nodeRoles
+    instant: true
+
+  - query: sum(kube_node_status_condition{status="true"}) by (condition)
+    metricName: nodeStatus
+
+  - query: cluster_version{type="completed"}
+    metricName: clusterVersion
+    instant: true
diff --git a/docs/metrics.md b/docs/metrics.md
new file mode 100644
index 00000000..f31f97bb
--- /dev/null
+++ b/docs/metrics.md
@@ -0,0 +1,53 @@
+## Scraping and storing metrics for the run
+
+There are cases where the state of the cluster and its metrics during the chaos test run need to be stored long term for review after the cluster is terminated, for example in CI and automated test runs. To help with this, Kraken supports capturing metrics for the duration of the scenarios defined in the config and indexes them into Elasticsearch. The indexed metrics can be visualized with the help of Grafana.
+
+It uses [Kube-burner](https://github.com/cloud-bulldozer/kube-burner) under the hood. The metrics to capture need to be defined in a metrics profile, which Kraken consumes to query Prometheus (installed by default in OpenShift) with the start and end timestamp of the run. Each run has a unique identifier (uuid) and all the metrics/documents in Elasticsearch will be associated with it. The uuid is generated automatically if not set in the config. This feature can be enabled in the [config](https://github.com/cloud-bulldozer/kraken/blob/master/config/config.yaml) by setting the following:
+
+```
+performance_monitoring:
+    deploy_dashboards: False                # Install a mutable grafana and load the performance dashboards. Enable this only when running on OpenShift
+    repo: "https://github.com/cloud-bulldozer/performance-dashboards.git"
+    kube_burner_binary_url: "https://github.com/cloud-bulldozer/kube-burner/releases/download/v0.9.1/kube-burner-0.9.1-Linux-x86_64.tar.gz"
+    capture_metrics: True
+    config_path: config/kube_burner.yaml    # Define the Elasticsearch url and index name in this config
+    metrics_profile_path: config/metrics-aggregated.yaml
+    prometheus_url:                         # The Prometheus url/route is automatically obtained for OpenShift; set it explicitly when the distribution is Kubernetes
+    prometheus_bearer_token:                # The bearer token is automatically obtained for OpenShift; set it explicitly when the distribution is Kubernetes. It is needed to authenticate with Prometheus
+    uuid:                                   # uuid for the run; generated by default if not set
+```
+
+### Metrics profile
+A couple of [metrics profiles](https://github.com/cloud-bulldozer/kraken/tree/master/config) ([metrics.yaml](https://github.com/cloud-bulldozer/kraken/tree/master/config/metrics.yaml) and [metrics-aggregated.yaml](https://github.com/cloud-bulldozer/kraken/tree/master/config/metrics-aggregated.yaml)) are shipped by default, and they can be tweaked to add more metrics to capture during the run.
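+
+Each entry in a profile pairs a PromQL expression (`query`) with the `metricName` under which its results are indexed; an entry can also set `instant: true` to collect a single point-in-time value instead of sampling over the whole run window. As a sketch, a custom entry to record scrape-target availability could look like the following (the `targetUp` name is illustrative and not part of the shipped profiles):
+
+```
+metrics:
+  - query: up
+    metricName: targetUp
+    instant: true
+```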
+
+For example, these are the API server queries from the default profiles:
+
+```
+metrics:
+# API server
+  - query: histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{apiserver="kube-apiserver", verb!~"WATCH", subresource!="log"}[2m])) by (verb,resource,subresource,instance,le)) > 0
+    metricName: API99thLatency
+
+  - query: sum(irate(apiserver_request_total{apiserver="kube-apiserver",verb!="WATCH",subresource!="log"}[2m])) by (verb,instance,resource,code) > 0
+    metricName: APIRequestRate
+
+  - query: sum(apiserver_current_inflight_requests{}) by (request_kind) > 0
+    metricName: APIInflightRequests
+```
+
+### Indexing
+Define the Elasticsearch server and the index for storing the metrics/documents in the kube_burner config:
+
+```
+global:
+  writeToFile: true
+  metricsDirectory: collected-metrics
+  measurements:
+    - name: podLatency
+      esIndex: kube-burner
+
+  indexerConfig:
+    enabled: true
+    esServers: [https://elastic.example.com:9200]
+    insecureSkipVerify: true
+    defaultIndex: kraken
+    type: elastic
+```
diff --git a/kraken/kube_burner/__init__.py b/kraken/kube_burner/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/kraken/kube_burner/client.py b/kraken/kube_burner/client.py
new file mode 100644
index 00000000..0614d38d
--- /dev/null
+++ b/kraken/kube_burner/client.py
@@ -0,0 +1,62 @@
+import subprocess
+import logging
+import urllib.request
+import shutil
+import kraken.prometheus.client as prometheus
+
+
+def setup(url):
+    """
+    Downloads and unpacks the kube-burner binary
+    """
+
+    filename = "kube_burner.tar"
+    try:
+        logging.info("Fetching kube-burner binary")
+        urllib.request.urlretrieve(url, filename)
+    except Exception as e:
+        logging.error("Failed to download kube-burner binary located at %s: %s" % (url, e))
+        exit(1)
+    try:
+        logging.info("Unpacking kube-burner tar ball")
+        shutil.unpack_archive(filename)
+    except Exception as e:
+        logging.error("Failed to unpack the kube-burner binary tarball: %s" % e)
+        exit(1)
+
+
+def scrape_metrics(
+    distribution, uuid, prometheus_url, prometheus_bearer_token, start_time, end_time, config_path, metrics_profile
+):
+    """
+    Scrapes metrics defined in the profile from Prometheus and indexes them into Elasticsearch
+    """
+
+    if not prometheus_url:
+        logging.info("Looks like prometheus_url is not defined, trying to use the default instance on the cluster")
+        prometheus_url, prometheus_bearer_token = prometheus.instance(
+            distribution, prometheus_url, prometheus_bearer_token
+        )
+    command = (
+        "./kube-burner index --uuid "
+        + str(uuid)
+        + " -u "
+        + str(prometheus_url)
+        + " -t "
+        + str(prometheus_bearer_token)
+        + " -m "
+        + str(metrics_profile)
+        + " --start "
+        + str(start_time)
+        + " --end "
+        + str(end_time)
+        + " -c "
+        + str(config_path)
+    )
+    try:
+        logging.info("Running kube-burner to capture the metrics: %s" % command)
+        logging.info("UUID for the run: %s" % uuid)
+        subprocess.run(command, shell=True, universal_newlines=True)
+    except Exception as e:
+        logging.error("Failed to run kube-burner, error: %s" % (e))
+        exit(1)
diff --git a/kraken/prometheus/__init__.py b/kraken/prometheus/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/kraken/prometheus/client.py b/kraken/prometheus/client.py
new file mode 100644
index 00000000..8d06ed8c
--- /dev/null
+++ b/kraken/prometheus/client.py
@@ -0,0 +1,13 @@
+import kraken.invoke.command as runcommand
+
+
+# Get prometheus details
+def instance(distribution, prometheus_url, prometheus_bearer_token):
+    if distribution == "openshift" and not prometheus_url:
+        url = runcommand.invoke(
+            r"""oc get routes -n openshift-monitoring -o=jsonpath='{.items[?(@.metadata.name=="prometheus-k8s")].spec.host}'"""  # noqa
+        )
+        prometheus_url = "https://" + url
+    if distribution == "openshift" and not prometheus_bearer_token:
+        prometheus_bearer_token = runcommand.invoke("oc -n openshift-monitoring sa get-token prometheus-k8s")
+    return prometheus_url, prometheus_bearer_token
diff --git a/run_kraken.py b/run_kraken.py
index 0be1e7d8..70b6b408 100644
--- a/run_kraken.py
+++ b/run_kraken.py
@@ -6,6 +6,8 @@ import yaml
 import logging
 import optparse
 import pyfiglet
+import uuid
+import time
 import kraken.kubernetes.client as kubecli
 import kraken.invoke.command as runcommand
 import kraken.litmus.common_litmus as common_litmus
@@ -13,6 +15,7 @@ import kraken.time_actions.common_time_functions as time_actions
 import kraken.performance_dashboards.setup as performance_dashboards
 import kraken.pod_scenarios.setup as pod_scenarios
 import kraken.node_actions.run as nodeaction
+import kraken.kube_burner.client as kube_burner
 
 
 # Main function
@@ -26,6 +29,7 @@ def main(cfg):
     with open(cfg, "r") as f:
         config = yaml.full_load(f)
     global kubeconfig_path, wait_duration
+    distribution = config["kraken"].get("distribution", "openshift")
     kubeconfig_path = config["kraken"].get("kubeconfig_path", "")
     chaos_scenarios = config["kraken"].get("chaos_scenarios", [])
     litmus_version = config["kraken"].get("litmus_version", "v1.9.1")
@@ -37,6 +41,16 @@
     dashboard_repo = config["performance_monitoring"].get(
         "repo", "https://github.com/cloud-bulldozer/performance-dashboards.git"
     )  # noqa
+    capture_metrics = config["performance_monitoring"].get("capture_metrics", False)
+    kube_burner_url = config["performance_monitoring"].get(
+        "kube_burner_binary_url",
+        "https://github.com/cloud-bulldozer/kube-burner/releases/download/v0.9.1/kube-burner-0.9.1-Linux-x86_64.tar.gz",  # noqa
+    )
+    config_path = config["performance_monitoring"].get("config_path", "config/kube_burner.yaml")
+    metrics_profile = config["performance_monitoring"].get("metrics_profile_path", "config/metrics-aggregated.yaml")
+    prometheus_url = config["performance_monitoring"].get("prometheus_url", "")
+    prometheus_bearer_token = config["performance_monitoring"].get("prometheus_bearer_token", "")
+    run_uuid = config["performance_monitoring"].get("uuid", "")
 
     # Initialize clients
     if not os.path.isfile(kubeconfig_path):
@@ -59,6 +73,13 @@ def main(cfg):
     if deploy_performance_dashboards:
         performance_dashboards.setup(dashboard_repo)
 
+    # Generate a uuid for the run if one isn't defined by the user
+    if run_uuid:
+        logging.info("Using the uuid defined by the user for the run: %s" % run_uuid)
+    else:
+        run_uuid = str(uuid.uuid4())
+        logging.info("Generated a uuid for the run: %s" % run_uuid)
+
     # Initialize the start iteration to 0
     iteration = 0
 
@@ -75,6 +96,10 @@ def main(cfg):
     failed_post_scenarios = []
     litmus_namespaces = []
    litmus_installed = False
+
+    # Capture the start time
+    start_time = int(time.time())
+
     # Loop to run the chaos starts here
     while int(iteration) < iterations:
         # Inject chaos scenarios specified in the config
@@ -111,6 +136,25 @@ def main(cfg):
         iteration += 1
         logging.info("")
 
+    # Capture the end time
+    end_time = int(time.time())
+
+    # Capture metrics for the run
+    if capture_metrics:
+        logging.info("Capturing metrics")
+        kube_burner.setup(kube_burner_url)
+        kube_burner.scrape_metrics(
+            distribution,
+            run_uuid,
+            prometheus_url,
+            prometheus_bearer_token,
+            start_time,
+            end_time,
+            config_path,
+            metrics_profile,
+        )
+
     if litmus_uninstall and litmus_installed:
         for namespace in litmus_namespaces:
             common_litmus.delete_chaos(namespace)
diff --git a/setup.cfg b/setup.cfg
index 96270366..3d36afb2 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -36,3 +36,7 @@ dists = bdist_wheel
 [bdist_wheel]
 # Use this option if your package is pure-python
 universal = 1
+
+[flake8]
+# Ignore specified error codes
+extend-ignore = W503