diff --git a/README.md b/README.md index 3c02ff17..bc8fcc2e 100644 --- a/README.md +++ b/README.md @@ -37,7 +37,7 @@ $ python3 run_kraken.py --config The report is generated in the run directory and it contains the information about each chaos scenario injection along with timestamps. #### Checking if the cluster is sane after failures injection -[Cerberus](https://github.com/openshift-scale/cerberus) can be used to monitor the cluster under test and the aggregated go/no-go signal generated by it can be consumed by Kraken to determine pass/fail i.e make sure the Kubernetes/OpenShift cluste recovered fine after the failure injetion. +[Cerberus](https://github.com/openshift-scale/cerberus) can be used to monitor the cluster under test and the aggregated go/no-go signal generated by it can be consumed by Kraken to determine pass/fail, i.e., make sure the Kubernetes/OpenShift cluster recovered fine after the failure injection. It is highly recommended to turn on the Cerberus health check feature available in Kraken after installing and setting up Cerberus. To do that, set cerberus_enabled to True and cerberus_url to the url where Cerberus publishes the go/no-go signal in the config file. ### Kubernetes/OpenShift chaos scenarios supported Following are the components of Kubernetes/OpenShift for which a basic chaos scenario config exists today. It currently just supports pod based scenarios, we will be adding more soon. Adding a new pod based scenario is as simple as adding a new config under scenarios directory and defining it in the config. 
diff --git a/config/config.yaml b/config/config.yaml index 5fad664a..c77c7006 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -5,5 +5,9 @@ kraken: - scenarios/openshift-kube-apiserver.yml - scenarios/openshift-apiserver.yml +cerberus: + cerberus_enabled: False # Enable it when cerberus is previously installed + cerberus_url: # When cerberus_enabled is set to True, provide the url where cerberus publishes go/no-go signal + tunings: - wait_duration: 60 # Duration to wait between each chaos scenario + wait_duration: 60 # Duration to wait between each chaos scenario diff --git a/run_kraken.py b/run_kraken.py index c55d1aa6..bcadc621 100644 --- a/run_kraken.py +++ b/run_kraken.py @@ -6,6 +6,7 @@ import time import optparse import logging import yaml +import requests import kraken.kubernetes.client as kubecli import kraken.invoke.command as runcommand import pyfiglet @@ -23,6 +24,7 @@ def main(cfg): config = yaml.full_load(f) kubeconfig_path = config["kraken"]["kubeconfig_path"] scenarios = config["kraken"]["scenarios"] + cerberus_enabled = config["cerberus"]["cerberus_enabled"] wait_duration = config["tunings"]["wait_duration"] # Initialize clients @@ -46,8 +48,18 @@ def main(cfg): logging.info("Scenario: %s has been successfully injected!" %(scenario)) logging.info("Waiting for the specified duration: %s" %(wait_duration)) time.sleep(wait_duration) - except: - logging.error("Failed to run scenario: %s, please check" %(scenario)) + if cerberus_enabled: + cerberus_url = config["cerberus"]["cerberus_url"] + if not cerberus_url: + logging.error("url where Cerberus publishes True/False signal is not provided.") + sys.exit(1) + cerberus_status = requests.get(cerberus_url).content + cerberus_status = True if cerberus_status == b'True' else False + if not cerberus_status: + logging.error("Received a no-go signal from Cerberus, looks like the cluster is unhealthy. Please check the Cerberus report for more details. 
Test failed.") + sys.exit(1) + except Exception as e: + logging.error("Failed to run scenario: %s. Encountered the following exception: %s" %(scenario, e)) else: logging.error("Cannot find a config at %s, please check" % (cfg)) sys.exit(1)