mirror of
https://github.com/krkn-chaos/krkn.git
synced 2026-02-14 18:10:00 +00:00
Add support to deploy performance dashboards
This commit enables performance monitoring on the cluster when running Kraken to be able to observe how cluster reacts to failures as it's important to make sure the cluster is healthy in terms of both recovery as well as performance.
This commit is contained in:
@@ -31,6 +31,9 @@ It's important to make sure to check if the targeted component recovered from th
|
||||
- Leveraging [Cerberus](https://github.com/openshift-scale/cerberus) to monitor the cluster under test and consuming the aggregated go/no-go signal to determine pass/fail. It is highly recommended to turn on the Cerberus health check feature available in Kraken. Instructions on installing and setting up Cerberus can be found [here](https://github.com/openshift-scale/cerberus#installation). Once Cerberus is up and running, set cerberus_enabled to True and cerberus_url to the url where Cerberus publishes the go/no-go signal in the Kraken config file.
|
||||
|
||||
|
||||
### Performance monitoring
|
||||
Monitoring the Kubernetes/OpenShift cluster to observe the impact of Kraken chaos scenarios on various components is key to finding the bottlenecks, as it's important to make sure the cluster is healthy in terms of both recovery and performance during/after the failure has been injected. Instructions on enabling it can be found [here](docs/performance_dashboards.md).
|
||||
|
||||
### Blogs and other useful resources
|
||||
- Blog post on introduction to Kraken: https://www.openshift.com/blog/introduction-to-kraken-a-chaos-tool-for-openshift/kubernetes
|
||||
- Discussion and demo on how Kraken can be leveraged to ensure OpenShift is reliable, performant and scalable: https://www.youtube.com/watch?v=s1PvupI5sD0&ab_channel=OpenShift
|
||||
|
||||
@@ -23,6 +23,10 @@ cerberus:
|
||||
cerberus_enabled: False # Enable it when cerberus is previously installed
|
||||
cerberus_url: # When cerberus_enabled is set to True, provide the url where cerberus publishes go/no-go signal
|
||||
|
||||
performance_monitoring:
|
||||
deploy_dashboards: False # Install a mutable grafana and load the performance dashboards. Enable this only when running on OpenShift
|
||||
repo: "https://github.com/cloud-bulldozer/performance-dashboards.git"
|
||||
|
||||
tunings:
|
||||
wait_duration: 60 # Duration to wait between each chaos scenario
|
||||
iterations: 1 # Number of times to execute the scenarios
|
||||
|
||||
@@ -22,8 +22,12 @@ cerberus:
|
||||
cerberus_enabled: False # Enable it when cerberus is previously installed
|
||||
cerberus_url: # When cerberus_enabled is set to True, provide the url where cerberus publishes go/no-go signal
|
||||
|
||||
performance_monitoring:
|
||||
deploy_dashboards: False # Install a mutable grafana and load the performance dashboards. Enable this only when running on OpenShift
|
||||
repo: "https://github.com/cloud-bulldozer/performance-dashboards.git"
|
||||
|
||||
tunings:
|
||||
wait_duration: 60 # Duration to wait between each chaos scenario
|
||||
iterations: 1 # Number of times to execute the scenarios
|
||||
daemon_mode: False # Iterations are set to infinity which means that the kraken will cause chaos forever
|
||||
```
|
||||
```
|
||||
|
||||
@@ -21,6 +21,8 @@ $ cd kraken
|
||||
$ pip3 install -r requirements.txt
|
||||
```
|
||||
|
||||
**NOTE**: Make sure python3-devel is installed on the system.
|
||||
|
||||
#### Run
|
||||
```
|
||||
$ python3 run_kraken.py --config <config_file_location>
|
||||
|
||||
12
docs/performance_dashboards.md
Normal file
12
docs/performance_dashboards.md
Normal file
@@ -0,0 +1,12 @@
|
||||
## Performance dashboards
|
||||
|
||||
Kraken supports installing a mutable grafana on the cluster with the dashboards loaded to help with monitoring the cluster for things like resource usage to find the outliers, API stats, Etcd health, Critical alerts etc. It can be deployed by enabling the following in the config:
|
||||
|
||||
```
|
||||
performance_monitoring:
|
||||
deploy_dashboards: True
|
||||
```
|
||||
|
||||
The route and credentials to access the dashboards will be printed on the stdout before Kraken starts creating chaos. The dashboards can be edited/modified to include your queries of interest.
|
||||
|
||||
**NOTE**: The dashboards leverage Prometheus for scraping the metrics off of the cluster and currently only support OpenShift, since Prometheus is set up on the cluster by default and the deployment leverages the routes object to expose the grafana dashboards externally.
|
||||
0
kraken/performance_dashboards/__init__.py
Normal file
0
kraken/performance_dashboards/__init__.py
Normal file
19
kraken/performance_dashboards/setup.py
Normal file
19
kraken/performance_dashboards/setup.py
Normal file
@@ -0,0 +1,19 @@
|
||||
import subprocess
|
||||
import logging
|
||||
import git
|
||||
|
||||
|
||||
# Installs a mutable grafana on the Kubernetes/OpenShift cluster and loads the performance dashboards
|
||||
def setup(repo):
    """Install a mutable grafana on the cluster and load the performance dashboards.

    Removes any existing checkout at /tmp/performance-dashboards, clones
    ``repo`` into that directory with ``git.Repo.clone_from`` and then runs
    ``./deploy.sh`` from the ``dittybopper`` directory of the clone.

    Any failure (clone error, cleanup timeout, non-zero exit of the deploy
    script) is caught and logged as an error; nothing is raised to the caller.

    Args:
        repo: URL of the git repository holding the performance dashboards.
    """
    clone_dir = "/tmp/performance-dashboards"
    command = "cd %s/dittybopper && ./deploy.sh" % clone_dir
    # "|| exit 0" keeps the cleanup best-effort: a failed rm does not abort the setup
    delete_repo = "rm -rf %s || exit 0" % clone_dir
    logging.info("Cloning, installing mutable grafana on the cluster and loading the dashboards")
    try:
        # delete repo to clone the latest copy if exists
        subprocess.run(delete_repo, shell=True, universal_newlines=True, timeout=45)
        # clone the repo
        git.Repo.clone_from(repo, clone_dir)
        # deploy performance dashboards; check=True turns a non-zero exit of
        # the deploy script into CalledProcessError so the failure is logged
        # below instead of being silently ignored
        subprocess.run(command, shell=True, universal_newlines=True, check=True)
    except Exception as e:
        logging.error("Failed to install performance-dashboards, error: %s" % (e))
|
||||
@@ -8,3 +8,4 @@ google-api-python-client
|
||||
kubernetes==12.0.0a1
|
||||
oauth2client>=4.1.3
|
||||
python-openstackclient
|
||||
gitpython
|
||||
|
||||
@@ -17,7 +17,7 @@ from kraken.node_actions.general_cloud_node_scenarios import general_node_scenar
|
||||
from kraken.node_actions.gcp_node_scenarios import gcp_node_scenarios
|
||||
from kraken.node_actions.openstack_node_scenarios import openstack_node_scenarios
|
||||
import kraken.time_actions.common_time_functions as time_actions
|
||||
|
||||
import kraken.performance_dashboards.setup as performance_dashboards
|
||||
|
||||
node_general = False
|
||||
|
||||
@@ -298,6 +298,8 @@ def main(cfg):
|
||||
wait_duration = config["tunings"].get("wait_duration", 60)
|
||||
iterations = config["tunings"].get("iterations", 1)
|
||||
daemon_mode = config["tunings"].get("daemon_mode", False)
|
||||
deploy_performance_dashboards = config["performance_monitoring"].get("deploy_dashboards", False)
|
||||
dashboard_repo = config["performance_monitoring"].get("repo", "https://github.com/cloud-bulldozer/performance-dashboards.git")
|
||||
|
||||
# Initialize clients
|
||||
if not os.path.isfile(kubeconfig_path):
|
||||
@@ -315,6 +317,10 @@ def main(cfg):
|
||||
"'s/\x1B\[([0-9]{1,3}(;[0-9]{1,2})?)?[mGK]//g'") # noqa
|
||||
logging.info("\n%s%s" % (cluster_version, cluster_info))
|
||||
|
||||
# Deploy performance dashboards
|
||||
if deploy_performance_dashboards:
|
||||
performance_dashboards.setup(dashboard_repo)
|
||||
|
||||
# Initialize the start iteration to 0
|
||||
iteration = 0
|
||||
|
||||
|
||||
Reference in New Issue
Block a user