diff --git a/config/config.yaml b/config/config.yaml
index 92609841..e54453cb 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -1,7 +1,9 @@
 kraken:
     kubeconfig_path: /root/.kube/config                    # Path to kubeconfig
    exit_on_failure: False                                  # Exit when a post action scenario fails
-    chaos_scenarios:                                       # List of policies/chaos scenarios to load
+    litmus_version: v1.10.0                                # Litmus version to install
+    litmus_uninstall: False                                # Uninstall litmus in case of failure
+    chaos_scenarios:                                       # List of policies/chaos scenarios to load
         - pod_scenarios:                                   # List of chaos pod scenarios to load
             - - scenarios/etcd.yml
             - - scenarios/regex_openshift_pod_kill.yml
@@ -13,6 +15,9 @@ kraken:
             - - scenarios/openshift-kube-apiserver.yml
         - time_scenarios:                                  # List of chaos time scenarios to load
             - scenarios/time_scenarios_example.yml
+        - litmus_scenarios:                                # List of litmus scenarios to load
+            - - https://hub.litmuschaos.io/api/chaos/1.10.0?file=charts/generic/node-cpu-hog/rbac.yaml
+              - scenarios/node_hog_engine.yaml
 
 cerberus:
     cerberus_enabled: False                                # Enable it when cerberus is previously installed
diff --git a/docs/litmus_scenarios.md b/docs/litmus_scenarios.md
new file mode 100644
index 00000000..d3b6ed86
--- /dev/null
+++ b/docs/litmus_scenarios.md
@@ -0,0 +1,41 @@
+### Litmus Scenarios
+Kraken consumes [Litmus](https://github.com/litmuschaos/litmus) under the hood for some infrastructure, pod, and node scenarios.
+
+The official Litmus documentation, with more information on the specifics of Litmus resources, can be found [here](https://docs.litmuschaos.io/docs/next/getstarted/).
+
+#### Litmus Chaos Custom Resources
+There are 3 custom resources created during each Litmus scenario. Below is a description of each:
+* ChaosEngine: A resource to link a Kubernetes application or Kubernetes node to a ChaosExperiment. ChaosEngine is watched by Litmus' Chaos-Operator, which then invokes Chaos-Experiments.
+* ChaosExperiment: A resource to group the configuration parameters of a chaos experiment. ChaosExperiment CRs are created by the operator when experiments are invoked by ChaosEngine.
+* ChaosResult: A resource to hold the results of a chaos experiment. The Chaos-exporter reads the results and exports the metrics into a configured Prometheus server.
+
+### Understanding Litmus Scenarios
+
+To run Litmus scenarios, Kraken needs to apply 3 different resources/yaml files to the cluster:
+1. **Chaos Experiment**: contains the actual chaos details of a scenario.
+
+   i. This is installed automatically by Kraken (it does not need to be specified in the Kraken scenario configuration).
+
+2. **Service Account**: created to allow the ChaosEngine to run experiments in your application namespace. It usually grants just enough permissions in a specific namespace to run the experiment properly (see the example Service Account at the end of this document).
+
+   i. This can be defined using either a link to a yaml file or a downloaded file in the scenarios folder.
+
+3. **Chaos Engine**: connects the application instance to a Chaos Experiment. This is where you define the specifics of your scenario, i.e. the node or pod name you want to cause chaos within.
+
+   i. This is a downloaded yaml file in the scenarios folder; the full list of scenarios can be found [here](https://hub.litmuschaos.io/).
+
+**NOTE**: By default, all chaos experiments will be installed based on the version you give in the config file.
+
+Adding a new Litmus based scenario is as simple as adding references to 2 new yaml files (the Service Account and Chaos Engine files for your scenario) in the Kraken config.
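+
+For example, the node-cpu-hog scenario shipped alongside this change is wired up in `config/config.yaml` like this (the first entry is the Service Account/rbac file served by the Litmus hub, the second is the Chaos Engine file in the scenarios folder):
+
+```yaml
+kraken:
+    chaos_scenarios:
+        - litmus_scenarios:                                # List of litmus scenarios to load
+            - - https://hub.litmuschaos.io/api/chaos/1.10.0?file=charts/generic/node-cpu-hog/rbac.yaml
+              - scenarios/node_hog_engine.yaml
+```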
+
+### Current Scenarios
+
+Following is the start of the list of scenarios for which a chaos scenario config exists today.
+
+Component                | Description                                                                          | Working
+------------------------ | ------------------------------------------------------------------------------------ | -------------------------
+Node CPU Hog             | Chaos scenario that hogs up the CPU on a defined node for a specific amount of time   | :heavy_check_mark:
+
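+#### Example Service Account
+
+The Service Account and rbac rules for a scenario come from the Litmus hub (the rbac.yaml URL referenced in the config above). As an illustration only, the sketch below shows roughly what such a file defines for the node-cpu-hog scenario; the name matches the `chaosServiceAccount: node-cpu-hog-sa` used in `scenarios/node_hog_engine.yaml`, but the authoritative contents are whatever the hub URL serves for your Litmus version.
+
+```yaml
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: node-cpu-hog-sa
+  namespace: default
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: node-cpu-hog-sa
+rules:
+  - apiGroups: ["", "litmuschaos.io", "batch", "apps"]
+    resources: ["pods", "pods/log", "jobs", "events", "nodes", "chaosengines", "chaosexperiments", "chaosresults"]
+    verbs: ["create", "get", "list", "patch", "update", "delete"]
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: node-cpu-hog-sa
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: node-cpu-hog-sa
+subjects:
+  - kind: ServiceAccount
+    name: node-cpu-hog-sa
+    namespace: default
+```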
diff --git a/kraken/litmus/common_litmus.py b/kraken/litmus/common_litmus.py
new file mode 100644
index 00000000..155d8ef6
--- /dev/null
+++ b/kraken/litmus/common_litmus.py
@@ -0,0 +1,103 @@
+import kraken.invoke.command as runcommand
+import logging
+import time
+import sys
+
+
+# Install the litmus operator and wait until its deployment is available
+def install_litmus(version):
+    runcommand.invoke("kubectl apply -f "
+                      "https://litmuschaos.github.io/litmus/litmus-operator-%s.yaml" % version)
+
+    # Disable analytics reporting on the chaos operator
+    runcommand.invoke("oc patch -n litmus deployment.apps/chaos-operator-ce --type=json --patch ' "
+                      "[ { \"op\": \"add\", \"path\": \"/spec/template/spec/containers/0/env/-\", "
+                      "\"value\": { \"name\": \"ANALYTICS\", \"value\": \"FALSE\" } } ]'")
+
+    runcommand.invoke("oc wait deploy -n litmus chaos-operator-ce --for=condition=Available")
+
+
+# Install all generic chaos experiments published for the given litmus version
+def deploy_all_experiments(version_string):
+    if not version_string.startswith("v"):
+        logging.error("Incorrect version string for litmus, needs to start with 'v' "
+                      "followed by a number")
+        sys.exit(1)
+    version = version_string[1:]
+
+    runcommand.invoke("kubectl apply -f "
+                      "https://hub.litmuschaos.io/api/chaos/%s?file=charts/generic/experiments.yaml"
+                      % version)
+
+
+def delete_experiments():
+    runcommand.invoke("kubectl delete chaosengine --all")
+
+
+# Check the status of an experiment: wait for the chaos engine to start running,
+# then poll the chaos result until a verdict is reached
+def check_experiment(engine_name, experiment_name, namespace):
+    chaos_engine = runcommand.invoke("kubectl get chaosengines/%s -n %s -o jsonpath="
+                                     "'{.status.engineStatus}'" % (engine_name, namespace))
+    engine_status = chaos_engine.strip()
+    max_tries = 30
+    engine_counter = 0
+    # Poll every 10 seconds, up to 30 tries (~5 minutes)
+    while engine_status.lower() != "running" and engine_status.lower() != "completed":
+        time.sleep(10)
+        logging.info("Waiting for engine to start running.")
+        chaos_engine = runcommand.invoke("kubectl get chaosengines/%s -n %s -o jsonpath="
+                                         "'{.status.engineStatus}'" % (engine_name, namespace))
+        engine_status = chaos_engine.strip()
+        if engine_counter >= max_tries:
+            logging.error("Chaos engine took longer than 5 minutes to be running or complete")
+            return False
+        engine_counter += 1
+        # need to see if there was an error in the run
+        if "notfound" in engine_status.lower():
+            logging.info("Chaos engine was not found")
+            return False
+
+    if not chaos_engine:
+        return False
+    # The chaos result object is named <engine name>-<experiment name>
+    chaos_result = runcommand.invoke("kubectl get chaosresult %s"
+                                     "-%s -n %s -o "
+                                     "jsonpath='{.status.experimentstatus.verdict}'"
+                                     % (engine_name, experiment_name, namespace))
+    result_counter = 0
+    status = chaos_result.strip()
+    while status == "Awaited":
+        logging.info("Waiting for chaos result to finish, sleeping 10 seconds")
+        time.sleep(10)
+        chaos_result = runcommand.invoke("kubectl get chaosresult %s"
+                                         "-%s -n %s -o "
+                                         "jsonpath='{.status.experimentstatus.verdict}'"
+                                         % (engine_name, experiment_name, namespace))
+        status = chaos_result.strip()
+        if result_counter >= max_tries:
+            logging.error("Chaos results took longer than 5 minutes to get a final result")
+            return False
+        result_counter += 1
+    if "notfound" in status.lower():
+        logging.info("Chaos result was not found")
+        return False
+
+    if status == "Pass":
+        return True
+    else:
+        chaos_result = runcommand.invoke("kubectl get chaosresult %s"
+                                         "-%s -n %s -o jsonpath="
+                                         "'{.status.experimentstatus.failStep}'" %
+                                         (engine_name, experiment_name, namespace))
+        logging.info("Chaos result failure information: " + str(chaos_result))
+        return False
+
+
+# Delete all chaos engines, experiments and results in a given namespace
+def delete_chaos(namespace):
+    runcommand.invoke("kubectl delete chaosengine --all -n " + str(namespace))
+    runcommand.invoke("kubectl delete chaosexperiment --all -n " + str(namespace))
+    runcommand.invoke("kubectl delete chaosresult --all -n " + str(namespace))
+
+
+# Uninstall the litmus operator
+def uninstall_litmus(version):
+    runcommand.invoke("kubectl delete -f "
+                      "https://litmuschaos.github.io/litmus/litmus-operator-%s.yaml" % version)
diff --git a/run_kraken.py b/run_kraken.py
index e5422e78..78ee6a7b 100644
--- a/run_kraken.py
+++ b/run_kraken.py
@@ -10,6 +10,7 @@ import requests
 import pyfiglet
 import kraken.kubernetes.client as kubecli
 import kraken.invoke.command as runcommand
+import kraken.litmus.common_litmus as common_litmus
 import kraken.node_actions.common_node_functions as nodeaction
 from kraken.node_actions.aws_node_scenarios import aws_node_scenarios
 from kraken.node_actions.general_cloud_node_scenarios import general_node_scenarios
@@ -229,6 +230,53 @@ def time_scenarios(scenarios_list, config):
     publish_kraken_status(config, not_reset)
 
 
+def litmus_scenarios(scenarios_list, config, litmus_namespaces, litmus_uninstall):
+    # Loop to run the scenarios starts here
+    for l_scenario in scenarios_list:
+        try:
+            # Each scenario is a list of yaml files (Service Account, Chaos Engine) applied in order
+            for item in l_scenario:
+                runcommand.invoke("kubectl apply -f %s" % item)
+                if "http" in item:
+                    f = requests.get(item)
+                    yaml_item = list(yaml.safe_load_all(f.content))[0]
+                else:
+                    with open(item, "r") as f:
+                        logging.info("opened yaml " + str(item))
+                        yaml_item = list(yaml.safe_load_all(f))[0]
+
+                # For the chaos engine, wait for each of its experiments to report a verdict
+                if yaml_item['kind'] == "ChaosEngine":
+                    engine_name = yaml_item['metadata']['name']
+                    namespace = yaml_item['metadata']['namespace']
+                    litmus_namespaces.append(namespace)
+                    experiment_names = yaml_item['spec']['experiments']
+                    for expr in experiment_names:
+                        expr_name = expr['name']
+                        experiment_result = common_litmus.check_experiment(engine_name,
+                                                                           expr_name,
+                                                                           namespace)
+                        if experiment_result:
+                            logging.info("Scenario: %s has been successfully injected!"
+                                         % item)
+                        else:
+                            logging.info("Scenario: %s was not successfully injected!"
+                                         % item)
+                            if litmus_uninstall:
+                                for l_item in l_scenario:
+                                    logging.info('item ' + str(l_item))
+                                    runcommand.invoke("kubectl delete -f %s" % l_item)
+            if litmus_uninstall:
+                for item in l_scenario:
+                    logging.info('item ' + str(item))
+                    runcommand.invoke("kubectl delete -f %s" % item)
+            cerberus_integration(config)
+            logging.info("Waiting for the specified duration: %s" % wait_duration)
+            time.sleep(wait_duration)
+        except Exception as e:
+            logging.error("Failed to run litmus scenario: %s. Encountered "
+                          "the following exception: %s" % (item, e))
+    return litmus_namespaces
+
+
 # Main function
 def main(cfg):
     # Start kraken
@@ -242,6 +290,8 @@ def main(cfg):
     global kubeconfig_path, wait_duration
     kubeconfig_path = config["kraken"].get("kubeconfig_path", "")
     chaos_scenarios = config["kraken"].get("chaos_scenarios", [])
+    litmus_version = config['kraken'].get("litmus_version", 'v1.9.1')
+    litmus_uninstall = config['kraken'].get("litmus_uninstall", False)
     wait_duration = config["tunings"].get("wait_duration", 60)
     iterations = config["tunings"].get("iterations", 1)
     daemon_mode = config["tunings"].get("daemon_mode", False)
@@ -277,6 +327,8 @@ def main(cfg):
         iterations = int(iterations)
 
     failed_post_scenarios = []
+    litmus_namespaces = []
+    litmus_installed = False
     # Loop to run the chaos starts here
     while (int(iteration) < iterations):
         # Inject chaos scenarios specified in the config
@@ -298,9 +350,23 @@ def main(cfg):
                     # Inject time skew chaos scenarios specified in the config
                     elif scenario_type == "time_scenarios":
                         time_scenarios(scenarios_list, config)
+                    # Inject litmus based chaos scenarios specified in the config
+                    elif scenario_type == "litmus_scenarios":
+                        if not litmus_installed:
+                            common_litmus.install_litmus(litmus_version)
+                            common_litmus.deploy_all_experiments(litmus_version)
+                            litmus_installed = True
+                        litmus_namespaces = litmus_scenarios(scenarios_list, config,
+                                                             litmus_namespaces,
+                                                             litmus_uninstall)
         iteration += 1
         logging.info("")
 
+    # Clean up litmus resources once all iterations are done
+    if litmus_uninstall and litmus_installed:
+        for namespace in litmus_namespaces:
+            common_litmus.delete_chaos(namespace)
+        common_litmus.delete_experiments()
+        common_litmus.uninstall_litmus(litmus_version)
+
     if failed_post_scenarios:
         logging.error("Post scenarios are still failing at the end of all iterations")
         sys.exit(1)
diff --git a/scenarios/node_hog_engine.yaml b/scenarios/node_hog_engine.yaml
new file mode 100644
index 00000000..c505c57b
--- /dev/null
+++ b/scenarios/node_hog_engine.yaml
@@ -0,0 +1,25 @@
+apiVersion: litmuschaos.io/v1alpha1
+kind: ChaosEngine
+metadata:
+  name: nginx-chaos
+  namespace: default
+spec:
+  # It can be true/false
+  annotationCheck: 'false'
+  # It can be active/stop
+  engineState: 'active'
+  chaosServiceAccount: node-cpu-hog-sa
+  monitoring: false
+  # It can be delete/retain
+  jobCleanUpPolicy: 'delete'
+  experiments:
+    - name: node-cpu-hog
+      spec:
+        components:
+          env:
+            # Set chaos duration (in seconds) as desired
+            - name: TOTAL_CHAOS_DURATION
+              value: '60'
+            # Enter the comma-separated names of the target nodes
+            - name: TARGET_NODES
+              value: ''
\ No newline at end of file