diff --git a/README.md b/README.md index bba0943c..bb872dca 100644 --- a/README.md +++ b/README.md @@ -31,6 +31,9 @@ It's important to make sure to check if the targeted component recovered from th - Leveraging [Cerberus](https://github.com/openshift-scale/cerberus) to monitor the cluster under test and consuming the aggregated go/no-go signal to determine pass/fail. It is highly recommended to turn on the Cerberus health check feature avaliable in Kraken. Instructions on installing and setting up Cerberus can be found [here](https://github.com/openshift-scale/cerberus#installation). Once Cerberus is up and running, set cerberus_enabled to True and cerberus_url to the url where Cerberus publishes go/no-go signal in the Kraken config file. +### Performance monitoring +Monitoring the Kubernetes/OpenShift cluster to observe the impact of Kraken chaos scenarios on various components is key to find out the bottlenecks as it's important to make sure the cluster is healthy in terms of both recovery as well as performance during/after the failure has been injected. Instructions on enabling it can be found [here](docs/performance_dashboards.md). + ### Blogs and other useful resources - Blog post on introduction to Kraken: https://www.openshift.com/blog/introduction-to-kraken-a-chaos-tool-for-openshift/kubernetes - Discussion and demo on how Kraken can be leveraged to ensure OpenShift is reliable, performant and scalable: https://www.youtube.com/watch?v=s1PvupI5sD0&ab_channel=OpenShift diff --git a/config/config.yaml b/config/config.yaml index e54453cb..dc993391 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -23,6 +23,10 @@ cerberus: cerberus_enabled: False # Enable it when cerberus is previously installed cerberus_url: # When cerberus_enabled is set to True, provide the url where cerberus publishes go/no-go signal +performance_monitoring: + deploy_dashboards: False # Install a mutable grafana and load the performance dashboards. 
Enable this only when running on OpenShift + repo: "https://github.com/cloud-bulldozer/performance-dashboards.git" + tunings: wait_duration: 60 # Duration to wait between each chaos scenario iterations: 1 # Number of times to execute the scenarios diff --git a/docs/config.md b/docs/config.md index 83057776..0cff093c 100644 --- a/docs/config.md +++ b/docs/config.md @@ -22,8 +22,12 @@ cerberus: cerberus_enabled: False # Enable it when cerberus is previously installed cerberus_url: # When cerberus_enabled is set to True, provide the url where cerberus publishes go/no-go signal +performance_monitoring: + deploy_dashboards: False # Install a mutable grafana and load the performance dashboards. Enable this only when running on OpenShift + repo: "https://github.com/cloud-bulldozer/performance-dashboards.git" + tunings: wait_duration: 60 # Duration to wait between each chaos scenario iterations: 1 # Number of times to execute the scenarios daemon_mode: False # Iterations are set to infinity which means that the kraken will cause chaos forever -``` \ No newline at end of file +``` diff --git a/docs/installation.md b/docs/installation.md index 90f67899..a948fbec 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -21,6 +21,8 @@ $ cd kraken $ pip3 install -r requirements.txt ``` +**NOTE**: Make sure python3-devel is installed on the system. + #### Run ``` $ python3 run_kraken.py --config diff --git a/docs/performance_dashboards.md b/docs/performance_dashboards.md new file mode 100644 index 00000000..d447aeb2 --- /dev/null +++ b/docs/performance_dashboards.md @@ -0,0 +1,12 @@ +## Performance dashboards + +Kraken supports installing a mutable grafana on the cluster with the dashboards loaded to help with monitoring the cluster for things like resource usage to find the outliers, API stats, Etcd health, Critical alerts etc. 
It can be deployed by enabling the following in the config: + +``` +performance_monitoring: + deploy_dashboards: True +``` + +The route and credentials to access the dashboards will be printed to stdout before Kraken starts creating chaos. The dashboards can be edited/modified to include your queries of interest. + +**NOTE**: The dashboards leverage Prometheus for scraping the metrics off of the cluster and currently only support OpenShift since Prometheus is set up on the cluster by default and they leverage the routes object to expose the grafana dashboards externally. diff --git a/kraken/performance_dashboards/__init__.py b/kraken/performance_dashboards/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/kraken/performance_dashboards/setup.py b/kraken/performance_dashboards/setup.py new file mode 100644 index 00000000..57ebee47 --- /dev/null +++ b/kraken/performance_dashboards/setup.py @@ -0,0 +1,19 @@ +import subprocess +import logging +import git + + +# Installs a mutable grafana on the Kubernetes/OpenShift cluster and loads the performance dashboards +def setup(repo): + command = "cd /tmp/performance-dashboards/dittybopper && ./deploy.sh" + delete_repo = "rm -rf /tmp/performance-dashboards || exit 0" + logging.info("Cloning, installing mutable grafana on the cluster and loading the dashboards") + try: + # delete repo to clone the latest copy if exists + subprocess.run(delete_repo, shell=True, universal_newlines=True, timeout=45) + # clone the repo + git.Repo.clone_from(repo, '/tmp/performance-dashboards') + # deploy performance dashboards + subprocess.run(command, shell=True, universal_newlines=True) + except Exception as e: + logging.error("Failed to install performance-dashboards, error: %s" % (e)) diff --git a/requirements.txt b/requirements.txt index aa798eab..32d0c9d0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,3 +8,4 @@ google-api-python-client kubernetes==12.0.0a1 oauth2client>=4.1.3 python-openstackclient +gitpython diff 
--git a/run_kraken.py b/run_kraken.py index 0456c2f8..2bf94ac5 100644 --- a/run_kraken.py +++ b/run_kraken.py @@ -17,7 +17,7 @@ from kraken.node_actions.general_cloud_node_scenarios import general_node_scenar from kraken.node_actions.gcp_node_scenarios import gcp_node_scenarios from kraken.node_actions.openstack_node_scenarios import openstack_node_scenarios import kraken.time_actions.common_time_functions as time_actions - +import kraken.performance_dashboards.setup as performance_dashboards node_general = False @@ -298,6 +298,8 @@ def main(cfg): wait_duration = config["tunings"].get("wait_duration", 60) iterations = config["tunings"].get("iterations", 1) daemon_mode = config["tunings"].get("daemon_mode", False) + deploy_performance_dashboards = config["performance_monitoring"].get("deploy_dashboards", False) + dashboard_repo = config["performance_monitoring"].get("repo", "https://github.com/cloud-bulldozer/performance-dashboards.git") # Initialize clients if not os.path.isfile(kubeconfig_path): @@ -315,6 +317,10 @@ def main(cfg): "'s/\x1B\[([0-9]{1,3}(;[0-9]{1,2})?)?[mGK]//g'") # noqa logging.info("\n%s%s" % (cluster_version, cluster_info)) + # Deploy performance dashboards + if deploy_performance_dashboards: + performance_dashboards.setup(dashboard_repo) + # Initialize the start iteration to 0 iteration = 0