From 7e8f0450d6cf0ccd171119f7b0ceb110c5d7d5a7 Mon Sep 17 00:00:00 2001
From: Naga Ravi Chaitanya Elluri
Date: Sat, 29 May 2021 21:15:19 -0400
Subject: [PATCH] Add support to scrape and index metrics

This commit:
- Enables Kraken to leverage kube-burner to scrape metrics from Prometheus
  and index them into Elasticsearch. This way we can take a look at the
  metrics in Grafana long term, even after the cluster is terminated.
- Enables separation of operations based on distribution, with OpenShift as
  the default option. One of the use cases is to capture Prometheus instance
  details, as it's installed by default on OpenShift while it's optional for
  Kubernetes.
---
 README.md | 3 +
 config/config.yaml | 8 ++
 config/kube_burner.yaml | 15 +++
 config/metrics-aggregated.yaml | 184 +++++++++++++++++++++++++++++++++
 config/metrics.yaml | 117 +++++++++++++++++++++
 docs/metrics.md | 53 ++++++++++
 kraken/kube_burner/__init__.py | 0
 kraken/kube_burner/client.py | 62 +++++++++++
 kraken/prometheus/__init__.py | 0
 kraken/prometheus/client.py | 13 +++
 run_kraken.py | 44 ++++++++
 setup.cfg | 4 +
 12 files changed, 503 insertions(+)
 create mode 100644 config/kube_burner.yaml
 create mode 100644 config/metrics-aggregated.yaml
 create mode 100644 config/metrics.yaml
 create mode 100644 docs/metrics.md
 create mode 100644 kraken/kube_burner/__init__.py
 create mode 100644 kraken/kube_burner/client.py
 create mode 100644 kraken/prometheus/__init__.py
 create mode 100644 kraken/prometheus/client.py

diff --git a/README.md b/README.md
index cf8089e7..53b9ec0c 100644
--- a/README.md
+++ b/README.md
@@ -36,6 +36,9 @@ It's important to make sure to check if the targeted component recovered from th
 ### Performance monitoring
 Monitoring the Kubernetes/OpenShift cluster to observe the impact of Kraken chaos scenarios on various components is key to finding the bottlenecks, as it's important to make sure the cluster is healthy in terms of both recovery and performance during/after the failure has been injected. Instructions on enabling it can be found [here](docs/performance_dashboards.md).
 
+### Scraping and storing metrics long term
+Kraken supports capturing metrics for the duration of the scenarios defined in the config and indexes them into Elasticsearch so that the state of the runs can be stored and evaluated long term. The indexed metrics can be visualized with the help of Grafana. It uses [Kube-burner](https://github.com/cloud-bulldozer/kube-burner) under the hood. The metrics to capture need to be defined in a metrics profile, which Kraken consumes to query Prometheus (installed by default in OpenShift) with the start and end timestamp of the run. Information on enabling and leveraging this feature can be found [here](docs/metrics.md).
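+
+Under the hood, Kraken shells out to kube-burner's `index` subcommand, passing it the uuid of the run, the Prometheus connection details and the captured start/end timestamps. The invocation it builds looks roughly like this (the angle-bracket placeholders are illustrative):
+```
+./kube-burner index --uuid <uuid> -u <prometheus_url> -t <bearer_token> -m <metrics_profile> --start <start_timestamp> --end <end_timestamp> -c <kube_burner_config>
+```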
+
 ### Blogs and other useful resources
 - Blog post on introduction to Kraken: https://www.openshift.com/blog/introduction-to-kraken-a-chaos-tool-for-openshift/kubernetes
 - Discussion and demo on how Kraken can be leveraged to ensure OpenShift is reliable, performant and scalable: https://www.youtube.com/watch?v=s1PvupI5sD0&ab_channel=OpenShift
diff --git a/config/config.yaml b/config/config.yaml
index dc993391..e38f7c4c 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -1,4 +1,5 @@
 kraken:
+    distribution: openshift                 # Distribution can be kubernetes or openshift
     kubeconfig_path: /root/.kube/config     # Path to kubeconfig
     exit_on_failure: False                  # Exit when a post action scenario fails
     litmus_version: v1.10.0                 # Litmus version to install
@@ -26,6 +27,13 @@ cerberus:
 performance_monitoring:
     deploy_dashboards: False                # Install a mutable grafana and load the performance dashboards. Enable this only when running on OpenShift
    repo: "https://github.com/cloud-bulldozer/performance-dashboards.git"
+    kube_burner_binary_url: "https://github.com/cloud-bulldozer/kube-burner/releases/download/v0.9.1/kube-burner-0.9.1-Linux-x86_64.tar.gz"
+    capture_metrics: False
+    config_path: config/kube_burner.yaml    # Define the Elasticsearch url and index name in this config
+    metrics_profile_path: config/metrics-aggregated.yaml
+    prometheus_url:                         # The Prometheus url/route is automatically obtained for OpenShift; set it explicitly when the distribution is Kubernetes
+    prometheus_bearer_token:                # The bearer token is automatically obtained for OpenShift; set it explicitly when the distribution is Kubernetes. It is needed to authenticate with Prometheus
+    uuid:                                   # uuid for the run; generated by default if not set
 
 tunings:
     wait_duration: 60                       # Duration to wait between each chaos scenario
diff --git a/config/kube_burner.yaml b/config/kube_burner.yaml
new file mode 100644
index 00000000..6a02ef2d
--- /dev/null
+++ b/config/kube_burner.yaml
@@ -0,0 +1,15 @@
+---
+
+global:
+  writeToFile: true
+  metricsDirectory: collected-metrics
+  measurements:
+    - name: podLatency
+      esIndex: kube-burner
+
+  indexerConfig:
+    enabled: true
+    esServers: [https://elastic.example.com:9200]
+    insecureSkipVerify: true
+    defaultIndex: kraken
+    type: elastic
diff --git a/config/metrics-aggregated.yaml b/config/metrics-aggregated.yaml
new file mode 100644
index 00000000..417c36ef
--- /dev/null
+++ b/config/metrics-aggregated.yaml
@@ -0,0 +1,184 @@
+metrics:
+# API server
+  - query: histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{apiserver="kube-apiserver", verb!~"WATCH", subresource!="log"}[2m])) by (verb,resource,subresource,instance,le)) > 0
+    metricName: API99thLatency
+
+  - query: sum(irate(apiserver_request_total{apiserver="kube-apiserver",verb!="WATCH",subresource!="log"}[2m])) by (verb,instance,resource,code) > 0
+    metricName: APIRequestRate
+
+  - query: sum(apiserver_current_inflight_requests{}) by (request_kind) > 0
+    metricName: APIInflightRequests
+
+# Container & pod metrics
+  - query: (sum(container_memory_rss{name!="",container!="POD",namespace=~"openshift-(etcd|oauth-apiserver|.*apiserver|ovn-kubernetes|sdn|ingress|authentication|.*controller-manager|.*scheduler)"}) by (container, pod, namespace, node) and on (node) kube_node_role{role="master"}) > 0
+    metricName: containerMemory-Masters
+
+  - query: (sum(irate(container_cpu_usage_seconds_total{name!="",container!="POD",namespace=~"openshift-(etcd|oauth-apiserver|sdn|ovn-kubernetes|.*apiserver|authentication|.*controller-manager|.*scheduler)"}[2m]) * 100) by (container, pod, namespace, node) and on (node) kube_node_role{role="master"}) > 0
+    metricName: containerCPU-Masters
+
+  - query: (sum(irate(container_cpu_usage_seconds_total{pod!="",container="prometheus",namespace="openshift-monitoring"}[2m]) * 100) by (container, pod, namespace, node) and on (node) kube_node_role{role="infra"}) > 0
+    metricName: containerCPU-Prometheus
+
+  - query: (avg(irate(container_cpu_usage_seconds_total{name!="",container!="POD",namespace=~"openshift-(sdn|ovn-kubernetes|ingress)"}[2m]) * 100 and on (node) kube_node_role{role="worker"}) by (namespace, container)) > 0
+    metricName: containerCPU-AggregatedWorkers
+
+  - query: (avg(irate(container_cpu_usage_seconds_total{name!="",container!="POD",namespace=~"openshift-(sdn|ovn-kubernetes|ingress|monitoring|image-registry|logging)"}[2m]) * 100 and on (node) kube_node_role{role="infra"}) by (namespace, container)) > 0
+    metricName: containerCPU-AggregatedInfra
+
+  - query: (sum(container_memory_rss{pod!="",namespace="openshift-monitoring",name!="",container="prometheus"}) by (container, pod, namespace, node) and on (node) kube_node_role{role="infra"}) > 0
+    metricName: containerMemory-Prometheus
+
+  - query: avg(container_memory_rss{name!="",container!="POD",namespace=~"openshift-(sdn|ovn-kubernetes|ingress)"} and on (node) kube_node_role{role="worker"}) by (container, namespace)
+    metricName: containerMemory-AggregatedWorkers
+
+  - query: avg(container_memory_rss{name!="",container!="POD",namespace=~"openshift-(sdn|ovn-kubernetes|ingress|monitoring|image-registry|logging)"} and on (node) kube_node_role{role="infra"}) by (container, namespace)
+    metricName: containerMemory-AggregatedInfra
+
+# Node metrics
+  - query: (sum(irate(node_cpu_seconds_total[2m])) by (mode,instance) and on (instance) label_replace(kube_node_role{role="master"}, "instance", "$1", "node", "(.+)")) > 0
+    metricName: nodeCPU-Masters
+
+  - query: (avg((sum(irate(node_cpu_seconds_total[2m])) by (mode,instance) and on (instance) label_replace(kube_node_role{role="worker"}, "instance", "$1", "node", "(.+)"))) by (mode)) > 0
+    metricName: nodeCPU-AggregatedWorkers
+
+  - query: (avg((sum(irate(node_cpu_seconds_total[2m])) by (mode,instance) and on (instance) label_replace(kube_node_role{role="infra"}, "instance", "$1", "node", "(.+)"))) by (mode)) > 0
+    metricName: nodeCPU-AggregatedInfra
+
+  - query: avg(node_memory_MemAvailable_bytes) by (instance) and on (instance) label_replace(kube_node_role{role="master"}, "instance", "$1", "node", "(.+)")
+    metricName: nodeMemoryAvailable-Masters
+
+  - query: avg(node_memory_MemAvailable_bytes and on (instance) label_replace(kube_node_role{role="worker"}, "instance", "$1", "node", "(.+)"))
+    metricName: nodeMemoryAvailable-AggregatedWorkers
+
+  - query: avg(node_memory_MemAvailable_bytes and on (instance) label_replace(kube_node_role{role="infra"}, "instance", "$1", "node", "(.+)"))
+    metricName: nodeMemoryAvailable-AggregatedInfra
+
+  - query: avg(node_memory_Active_bytes) by (instance) and on (instance) label_replace(kube_node_role{role="master"}, "instance", "$1", "node", "(.+)")
+    metricName: nodeMemoryActive-Masters
+
+  - query: avg(node_memory_Active_bytes and on (instance) label_replace(kube_node_role{role="worker"}, "instance", "$1", "node", "(.+)"))
+    metricName: nodeMemoryActive-AggregatedWorkers
+
+  - query: avg(avg(node_memory_Active_bytes) by (instance) and on (instance) label_replace(kube_node_role{role="infra"}, "instance", "$1", "node", "(.+)"))
+    metricName: nodeMemoryActive-AggregatedInfra
+
+  - query: avg(node_memory_Cached_bytes) by (instance) + avg(node_memory_Buffers_bytes) by (instance) and on (instance) label_replace(kube_node_role{role="master"}, "instance", "$1", "node", "(.+)")
+    metricName: nodeMemoryCached+nodeMemoryBuffers-Masters
+
+  - query: avg(node_memory_Cached_bytes + node_memory_Buffers_bytes and on (instance) label_replace(kube_node_role{role="worker"}, "instance", "$1", "node", "(.+)"))
+    metricName: nodeMemoryCached+nodeMemoryBuffers-AggregatedWorkers
+
+  - query: avg(node_memory_Cached_bytes + node_memory_Buffers_bytes and on (instance) label_replace(kube_node_role{role="infra"}, "instance", "$1", "node", "(.+)"))
+    metricName: nodeMemoryCached+nodeMemoryBuffers-AggregatedInfra
+
+  - query: irate(node_network_receive_bytes_total{device=~"^(ens|eth|bond|team).*"}[2m]) and on (instance) label_replace(kube_node_role{role="master"}, "instance", "$1", "node", "(.+)")
+    metricName: rxNetworkBytes-Masters
+
+  - query: avg(irate(node_network_receive_bytes_total{device=~"^(ens|eth|bond|team).*"}[2m]) and on (instance) label_replace(kube_node_role{role="worker"}, "instance", "$1", "node", "(.+)")) by (device)
+    metricName: rxNetworkBytes-AggregatedWorkers
+
+  - query: avg(irate(node_network_receive_bytes_total{device=~"^(ens|eth|bond|team).*"}[2m]) and on (instance) label_replace(kube_node_role{role="infra"}, "instance", "$1", "node", "(.+)")) by (device)
+    metricName: rxNetworkBytes-AggregatedInfra
+
+  - query: irate(node_network_transmit_bytes_total{device=~"^(ens|eth|bond|team).*"}[2m]) and on (instance) label_replace(kube_node_role{role="master"}, "instance", "$1", "node", "(.+)")
+    metricName: txNetworkBytes-Masters
+
+  - query: avg(irate(node_network_transmit_bytes_total{device=~"^(ens|eth|bond|team).*"}[2m]) and on (instance) label_replace(kube_node_role{role="worker"}, "instance", "$1", "node", "(.+)")) by (device)
+    metricName: txNetworkBytes-AggregatedWorkers
+
+  - query: avg(irate(node_network_transmit_bytes_total{device=~"^(ens|eth|bond|team).*"}[2m]) and on (instance) label_replace(kube_node_role{role="infra"}, "instance", "$1", "node", "(.+)")) by (device)
+    metricName: txNetworkBytes-AggregatedInfra
+
+  - query: rate(node_disk_written_bytes_total{device!~"^(dm|rb).*"}[2m]) and on (instance) label_replace(kube_node_role{role="master"}, "instance", "$1", "node", "(.+)")
+    metricName: nodeDiskWrittenBytes-Masters
+
+  - query: avg(rate(node_disk_written_bytes_total{device!~"^(dm|rb).*"}[2m]) and on (instance) label_replace(kube_node_role{role="worker"}, "instance", "$1", "node", "(.+)")) by (device)
+    metricName: nodeDiskWrittenBytes-AggregatedWorkers
+
+  - query: avg(rate(node_disk_written_bytes_total{device!~"^(dm|rb).*"}[2m]) and on (instance) label_replace(kube_node_role{role="infra"}, "instance", "$1", "node", "(.+)")) by (device)
+    metricName: nodeDiskWrittenBytes-AggregatedInfra
+
+  - query: rate(node_disk_read_bytes_total{device!~"^(dm|rb).*"}[2m]) and on (instance) label_replace(kube_node_role{role="master"}, "instance", "$1", "node", "(.+)")
+    metricName: nodeDiskReadBytes-Masters
+
+  - query: avg(rate(node_disk_read_bytes_total{device!~"^(dm|rb).*"}[2m]) and on (instance) label_replace(kube_node_role{role="worker"}, "instance", "$1", "node", "(.+)")) by (device)
+    metricName: nodeDiskReadBytes-AggregatedWorkers
+
+  - query: avg(rate(node_disk_read_bytes_total{device!~"^(dm|rb).*"}[2m]) and on (instance) label_replace(kube_node_role{role="infra"}, "instance", "$1", "node", "(.+)")) by (device)
+    metricName: nodeDiskReadBytes-AggregatedInfra
+
+# Etcd metrics
+  - query: sum(rate(etcd_server_leader_changes_seen_total[2m]))
+    metricName: etcdLeaderChangesRate
+
+  - query: etcd_server_is_leader > 0
+    metricName: etcdServerIsLeader
+
+  - query: histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[2m]))
+    metricName: 99thEtcdDiskBackendCommitDurationSeconds
+
+  - query: histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[2m]))
+    metricName: 99thEtcdDiskWalFsyncDurationSeconds
+
+  - query: histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket[5m]))
+    metricName: 99thEtcdRoundTripTimeSeconds
+
+  - query: etcd_mvcc_db_total_size_in_bytes
+    metricName: etcdDBPhysicalSizeBytes
+
+  - query: etcd_mvcc_db_total_size_in_use_in_bytes
+    metricName: etcdDBLogicalSizeBytes
+
+  - query: sum by (cluster_version)(etcd_cluster_version)
+    metricName: etcdVersion
+    instant: true
+
+  - query: sum(rate(etcd_object_counts{}[5m])) by (resource) > 0
+    metricName: etcdObjectCount
+
+  - query: histogram_quantile(0.99,sum(rate(etcd_request_duration_seconds_bucket[2m])) by (le,operation,apiserver)) > 0
+    metricName: P99APIEtcdRequestLatency
+
+# Cluster metrics
+  - query: count(kube_namespace_created)
+    metricName: namespaceCount
+
+  - query: sum(kube_pod_status_phase{}) by (phase)
+    metricName: podStatusCount
+
+  - query: count(kube_secret_info{})
+    metricName: secretCount
+
+  - query: count(kube_deployment_labels{})
+    metricName: deploymentCount
+
+  - query: count(kube_configmap_info{})
+    metricName: configmapCount
+
+  - query: count(kube_service_info{})
+    metricName: serviceCount
+
+  - query: kube_node_role
+    metricName: nodeRoles
+    instant: true
+
+  - query: sum(kube_node_status_condition{status="true"}) by (condition)
+    metricName: nodeStatus
+
+  - query: (sum(rate(container_fs_writes_bytes_total{container!="",device!~".+dm.+"}[5m])) by (device, container, node) and on (node) kube_node_role{role="master"}) > 0
+    metricName: containerDiskUsage
+
+  - query: cluster_version{type="completed"}
+    metricName: clusterVersion
+    instant: true
+
+# Golang metrics
+
+  - query: go_memstats_heap_alloc_bytes{job=~"apiserver|api|etcd"}
+    metricName: goHeapAllocBytes
+
+  - query: go_memstats_heap_inuse_bytes{job=~"apiserver|api|etcd"}
+    metricName: goHeapInuseBytes
+
+  - query: go_gc_duration_seconds{job=~"apiserver|api|etcd",quantile="1"}
+    metricName: goGCDurationSeconds
diff --git a/config/metrics.yaml b/config/metrics.yaml
new file mode 100644
index 00000000..bdad2be8
--- /dev/null
+++ b/config/metrics.yaml
@@ -0,0 +1,117 @@
+metrics:
+# API server
+  - query: histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{apiserver="kube-apiserver", verb!~"WATCH", subresource!="log"}[2m])) by (verb,resource,subresource,instance,le)) > 0
+    metricName: API99thLatency
+
+  - query: sum(irate(apiserver_request_total{apiserver="kube-apiserver",verb!="WATCH",subresource!="log"}[2m])) by (verb,instance,resource,code) > 0
+    metricName: APIRequestRate
+
+  - query: sum(apiserver_current_inflight_requests{}) by (request_kind) > 0
+    metricName: APIInflightRequests
+
+# Containers & pod metrics
+  - query: sum(irate(container_cpu_usage_seconds_total{name!="",namespace=~"openshift-(etcd|oauth-apiserver|.*apiserver|ovn-kubernetes|sdn|ingress|authentication|.*controller-manager|.*scheduler|monitoring|logging|image-registry)"}[2m]) * 100) by (pod, namespace, node)
+    metricName: podCPU
+
+  - query: sum(container_memory_rss{name!="",namespace=~"openshift-(etcd|oauth-apiserver|.*apiserver|ovn-kubernetes|sdn|ingress|authentication|.*controller-manager|.*scheduler|monitoring|logging|image-registry)"}) by (pod, namespace, node)
+    metricName: podMemory
+
+  - query: (sum(rate(container_fs_writes_bytes_total{container!="",device!~".+dm.+"}[5m])) by (device, container, node) and on (node) kube_node_role{role="master"}) > 0
+    metricName: containerDiskUsage
+
+# Kubelet & CRI-O metrics
+  - query: sum(irate(process_cpu_seconds_total{service="kubelet",job="kubelet"}[2m]) * 100) by (node) and on (node) kube_node_role{role="worker"}
+    metricName: kubeletCPU
+
+  - query: sum(process_resident_memory_bytes{service="kubelet",job="kubelet"}) by (node) and on (node) kube_node_role{role="worker"}
+    metricName: kubeletMemory
+
+  - query: sum(irate(process_cpu_seconds_total{service="kubelet",job="crio"}[2m]) * 100) by (node) and on (node) kube_node_role{role="worker"}
+    metricName: crioCPU
+
+  - query: sum(process_resident_memory_bytes{service="kubelet",job="crio"}) by (node) and on (node) kube_node_role{role="worker"}
+    metricName: crioMemory
+
+# Node metrics
+  - query: sum(irate(node_cpu_seconds_total[2m])) by (mode,instance) > 0
+    metricName: nodeCPU
+
+  - query: avg(node_memory_MemAvailable_bytes) by (instance)
+    metricName: nodeMemoryAvailable
+
+  - query: avg(node_memory_Active_bytes) by (instance)
+    metricName: nodeMemoryActive
+
+  - query: avg(node_memory_Cached_bytes) by (instance) + avg(node_memory_Buffers_bytes) by (instance)
+    metricName: nodeMemoryCached+nodeMemoryBuffers
+
+  - query: irate(node_network_receive_bytes_total{device=~"^(ens|eth|bond|team).*"}[2m])
+    metricName: rxNetworkBytes
+
+  - query: irate(node_network_transmit_bytes_total{device=~"^(ens|eth|bond|team).*"}[2m])
+    metricName: txNetworkBytes
+
+  - query: rate(node_disk_written_bytes_total{device!~"^(dm|rb).*"}[2m])
+    metricName: nodeDiskWrittenBytes
+
+  - query: rate(node_disk_read_bytes_total{device!~"^(dm|rb).*"}[2m])
+    metricName: nodeDiskReadBytes
+
+# Etcd metrics
+  - query: sum(rate(etcd_server_leader_changes_seen_total[2m]))
+    metricName: etcdLeaderChangesRate
+
+  - query: etcd_server_is_leader > 0
+    metricName: etcdServerIsLeader
+
+  - query: histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[2m]))
+    metricName: 99thEtcdDiskBackendCommitDurationSeconds
+
+  - query: histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[2m]))
+    metricName: 99thEtcdDiskWalFsyncDurationSeconds
+
+  - query: histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket[5m]))
+    metricName: 99thEtcdRoundTripTimeSeconds
+
+  - query: etcd_mvcc_db_total_size_in_bytes
+    metricName: etcdDBPhysicalSizeBytes
+
+  - query: etcd_mvcc_db_total_size_in_use_in_bytes
+    metricName: etcdDBLogicalSizeBytes
+
+  - query: sum(rate(etcd_object_counts{}[5m])) by (resource) > 0
+    metricName: etcdObjectCount
+
+  - query: sum by (cluster_version)(etcd_cluster_version)
+    metricName: etcdVersion
+    instant: true
+
+# Cluster metrics
+  - query: count(kube_namespace_created)
+    metricName: namespaceCount
+
+  - query: sum(kube_pod_status_phase{}) by (phase)
+    metricName: podStatusCount
+
+  - query: count(kube_secret_info{})
+    metricName: secretCount
+
+  - query: count(kube_deployment_labels{})
+    metricName: deploymentCount
+
+  - query: count(kube_configmap_info{})
+    metricName: configmapCount
+
+  - query: count(kube_service_info{})
+    metricName: serviceCount
+
+  - query: kube_node_role
+    metricName: nodeRoles
+    instant: true
+
+  - query: sum(kube_node_status_condition{status="true"}) by (condition)
+    metricName: nodeStatus
+
+  - query: cluster_version{type="completed"}
+    metricName: clusterVersion
+    instant: true
diff --git a/docs/metrics.md b/docs/metrics.md
new file mode 100644
index 00000000..f31f97bb
--- /dev/null
+++ b/docs/metrics.md
@@ -0,0 +1,53 @@
+## Scraping and storing metrics for the run
+
+There are cases where the state of the cluster and its metrics during the chaos test run need to be stored long term for review after the cluster is terminated, for example in CI and automated test runs. To help with this, Kraken supports capturing metrics for the duration of the scenarios defined in the config and indexes them into Elasticsearch. The indexed metrics can be visualized with the help of Grafana.
+
+It uses [Kube-burner](https://github.com/cloud-bulldozer/kube-burner) under the hood. The metrics to capture need to be defined in a metrics profile, which Kraken consumes to query Prometheus (installed by default in OpenShift) with the start and end timestamp of the run. Each run has a unique identifier (uuid) and all the metrics/documents in Elasticsearch will be associated with it. The uuid is generated automatically if not set in the config. This feature can be enabled in the [config](https://github.com/cloud-bulldozer/kraken/blob/master/config/config.yaml) by setting the following:
+
+```
+performance_monitoring:
+    deploy_dashboards: False                # Install a mutable grafana and load the performance dashboards. Enable this only when running on OpenShift
+    repo: "https://github.com/cloud-bulldozer/performance-dashboards.git"
+    kube_burner_binary_url: "https://github.com/cloud-bulldozer/kube-burner/releases/download/v0.9.1/kube-burner-0.9.1-Linux-x86_64.tar.gz"
+    capture_metrics: True
+    config_path: config/kube_burner.yaml    # Define the Elasticsearch url and index name in this config
+    metrics_profile_path: config/metrics-aggregated.yaml
+    prometheus_url:                         # The Prometheus url/route is automatically obtained for OpenShift; set it explicitly when the distribution is Kubernetes
+    prometheus_bearer_token:                # The bearer token is automatically obtained for OpenShift; set it explicitly when the distribution is Kubernetes. It is needed to authenticate with Prometheus
+    uuid:                                   # uuid for the run; generated by default if not set
+```
+
+### Metrics profile
+A couple of [metrics profiles](https://github.com/cloud-bulldozer/kraken/tree/master/config) ([metrics.yaml](https://github.com/cloud-bulldozer/kraken/tree/master/config/metrics.yaml) and [metrics-aggregated.yaml](https://github.com/cloud-bulldozer/kraken/tree/master/config/metrics-aggregated.yaml)) are shipped by default, and they can be tweaked to add more metrics to capture during the run.
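+
+Each entry in a profile pairs a PromQL expression (`query`) with the `metricName` under which its results are indexed; an entry can also set `instant: true` to collect a single point-in-time value instead of sampling over the whole run window. As a sketch, a custom entry to record scrape-target availability could look like the following (the `targetUp` name is illustrative and not part of the shipped profiles):
+
+```
+metrics:
+  - query: up
+    metricName: targetUp
+    instant: true
+```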
+
+For example, these are the API server queries from the default profiles:
+
+```
+metrics:
+# API server
+  - query: histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{apiserver="kube-apiserver", verb!~"WATCH", subresource!="log"}[2m])) by (verb,resource,subresource,instance,le)) > 0
+    metricName: API99thLatency
+
+  - query: sum(irate(apiserver_request_total{apiserver="kube-apiserver",verb!="WATCH",subresource!="log"}[2m])) by (verb,instance,resource,code) > 0
+    metricName: APIRequestRate
+
+  - query: sum(apiserver_current_inflight_requests{}) by (request_kind) > 0
+    metricName: APIInflightRequests
+```
+
+### Indexing
+Define the Elasticsearch server and the index for storing the metrics/documents in the kube_burner config:
+
+```
+global:
+  writeToFile: true
+  metricsDirectory: collected-metrics
+  measurements:
+    - name: podLatency
+      esIndex: kube-burner
+
+  indexerConfig:
+    enabled: true
+    esServers: [https://elastic.example.com:9200]
+    insecureSkipVerify: true
+    defaultIndex: kraken
+    type: elastic
+```
diff --git a/kraken/kube_burner/__init__.py b/kraken/kube_burner/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/kraken/kube_burner/client.py b/kraken/kube_burner/client.py
new file mode 100644
index 00000000..0614d38d
--- /dev/null
+++ b/kraken/kube_burner/client.py
@@ -0,0 +1,62 @@
+import subprocess
+import logging
+import urllib.request
+import shutil
+import kraken.prometheus.client as prometheus
+
+
+def setup(url):
+    """
+    Downloads and unpacks the kube-burner binary
+    """
+
+    filename = "kube_burner.tar"
+    try:
+        logging.info("Fetching kube-burner binary")
+        urllib.request.urlretrieve(url, filename)
+    except Exception as e:
+        logging.error("Failed to download kube-burner binary located at %s: %s" % (url, e))
+        exit(1)
+    try:
+        logging.info("Unpacking kube-burner tar ball")
+        shutil.unpack_archive(filename)
+    except Exception as e:
+        logging.error("Failed to unpack the kube-burner binary tarball: %s" % e)
+        exit(1)
+
+
+def scrape_metrics(
+    distribution, uuid, prometheus_url, prometheus_bearer_token, start_time, end_time, config_path, metrics_profile
+):
+    """
+    Scrapes metrics defined in the profile from Prometheus and indexes them into Elasticsearch
+    """
+
+    if not prometheus_url:
+        logging.info("Looks like prometheus_url is not defined, trying to use the default instance on the cluster")
+        prometheus_url, prometheus_bearer_token = prometheus.instance(
+            distribution, prometheus_url, prometheus_bearer_token
+        )
+    command = (
+        "./kube-burner index --uuid "
+        + str(uuid)
+        + " -u "
+        + str(prometheus_url)
+        + " -t "
+        + str(prometheus_bearer_token)
+        + " -m "
+        + str(metrics_profile)
+        + " --start "
+        + str(start_time)
+        + " --end "
+        + str(end_time)
+        + " -c "
+        + str(config_path)
+    )
+    try:
+        logging.info("Running kube-burner to capture the metrics: %s" % command)
+        logging.info("UUID for the run: %s" % uuid)
+        subprocess.run(command, shell=True, universal_newlines=True)
+    except Exception as e:
+        logging.error("Failed to run kube-burner, error: %s" % (e))
+        exit(1)
diff --git a/kraken/prometheus/__init__.py b/kraken/prometheus/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/kraken/prometheus/client.py b/kraken/prometheus/client.py
new file mode 100644
index 00000000..8d06ed8c
--- /dev/null
+++ b/kraken/prometheus/client.py
@@ -0,0 +1,13 @@
+import kraken.invoke.command as runcommand
+
+
+# Get prometheus details
+def instance(distribution, prometheus_url, prometheus_bearer_token):
+    if distribution == "openshift" and not prometheus_url:
+        url = runcommand.invoke(
+            r"""oc get routes -n openshift-monitoring -o=jsonpath='{.items[?(@.metadata.name=="prometheus-k8s")].spec.host}'"""  # noqa
+        )
+        prometheus_url = "https://" + url
+    if distribution == "openshift" and not prometheus_bearer_token:
+        prometheus_bearer_token = runcommand.invoke("oc -n openshift-monitoring sa get-token prometheus-k8s")
+    return prometheus_url, prometheus_bearer_token
diff --git a/run_kraken.py b/run_kraken.py
index 0be1e7d8..70b6b408 100644
--- a/run_kraken.py
+++ b/run_kraken.py
@@ -6,6 +6,8 @@ import yaml
 import logging
 import optparse
 import pyfiglet
+import uuid
+import time
 import kraken.kubernetes.client as kubecli
 import kraken.invoke.command as runcommand
 import kraken.litmus.common_litmus as common_litmus
@@ -13,6 +15,7 @@ import kraken.time_actions.common_time_functions as time_actions
 import kraken.performance_dashboards.setup as performance_dashboards
 import kraken.pod_scenarios.setup as pod_scenarios
 import kraken.node_actions.run as nodeaction
+import kraken.kube_burner.client as kube_burner
 
 
 # Main function
@@ -26,6 +29,7 @@ def main(cfg):
     with open(cfg, "r") as f:
         config = yaml.full_load(f)
     global kubeconfig_path, wait_duration
+    distribution = config["kraken"].get("distribution", "openshift")
     kubeconfig_path = config["kraken"].get("kubeconfig_path", "")
     chaos_scenarios = config["kraken"].get("chaos_scenarios", [])
     litmus_version = config["kraken"].get("litmus_version", "v1.9.1")
@@ -37,6 +41,16 @@
     dashboard_repo = config["performance_monitoring"].get(
         "repo", "https://github.com/cloud-bulldozer/performance-dashboards.git"
     )  # noqa
+    capture_metrics = config["performance_monitoring"].get("capture_metrics", False)
+    kube_burner_url = config["performance_monitoring"].get(
+        "kube_burner_binary_url",
+        "https://github.com/cloud-bulldozer/kube-burner/releases/download/v0.9.1/kube-burner-0.9.1-Linux-x86_64.tar.gz",  # noqa
+    )
+    config_path = config["performance_monitoring"].get("config_path", "config/kube_burner.yaml")
+    metrics_profile = config["performance_monitoring"].get("metrics_profile_path", "config/metrics-aggregated.yaml")
+    prometheus_url = config["performance_monitoring"].get("prometheus_url", "")
+    prometheus_bearer_token = config["performance_monitoring"].get("prometheus_bearer_token", "")
+    run_uuid = config["performance_monitoring"].get("uuid", "")
 
     # Initialize clients
     if not os.path.isfile(kubeconfig_path):
@@ -59,6 +73,13 @@ def main(cfg):
     if deploy_performance_dashboards:
         performance_dashboards.setup(dashboard_repo)
 
+    # Generate a uuid for the run if one isn't defined by the user
+    if run_uuid:
+        logging.info("Using the uuid defined by the user for the run: %s" % run_uuid)
+    else:
+        run_uuid = str(uuid.uuid4())
+        logging.info("Generated a uuid for the run: %s" % run_uuid)
+
     # Initialize the start iteration to 0
     iteration = 0
 
@@ -75,6 +96,10 @@ def main(cfg):
     failed_post_scenarios = []
     litmus_namespaces = []
    litmus_installed = False
+
+    # Capture the start time
+    start_time = int(time.time())
+
     # Loop to run the chaos starts here
     while int(iteration) < iterations:
         # Inject chaos scenarios specified in the config
@@ -111,6 +136,25 @@ def main(cfg):
         iteration += 1
         logging.info("")
 
+    # Capture the end time
+    end_time = int(time.time())
+
+    # Capture metrics for the run
+    if capture_metrics:
+        logging.info("Capturing metrics")
+        kube_burner.setup(kube_burner_url)
+        kube_burner.scrape_metrics(
+            distribution,
+            run_uuid,
+            prometheus_url,
+            prometheus_bearer_token,
+            start_time,
+            end_time,
+            config_path,
+            metrics_profile,
+        )
+
     if litmus_uninstall and litmus_installed:
         for namespace in litmus_namespaces:
             common_litmus.delete_chaos(namespace)
diff --git a/setup.cfg b/setup.cfg
index 96270366..3d36afb2 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -36,3 +36,7 @@ dists = bdist_wheel
 [bdist_wheel]
 # Use this option if your package is pure-python
 universal = 1
+
+[flake8]
+# Ignore specified error codes
+extend-ignore = W503