Adding litmus scenario options

2026-04-15 06:57:28 +00:00 · 2020-10-07 16:02:34 -04:00
parent 1baff5e076
commit 1fc9683c8c
5 changed files with 241 additions and 1 deletions
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -1,7 +1,9 @@
 kraken:
    kubeconfig_path: /root/.kube/config                    # Path to kubeconfig
    exit_on_failure: False                                 # Exit when a post action scenario fails
-    chaos_scenarios:                                         # List of policies/chaos scenarios to load
+    litmus_version: v1.10.0                                # Litmus version to install
+    litmus_uninstall: False                                # Delete the litmus scenario resources and uninstall litmus once the run finishes
+    chaos_scenarios:                                       # List of policies/chaos scenarios to load
        -   pod_scenarios:                                 # List of chaos pod scenarios to load
            - -    scenarios/etcd.yml
            - -    scenarios/regex_openshift_pod_kill.yml
@@ -13,6 +15,9 @@ kraken:
            - -    scenarios/openshift-kube-apiserver.yml
        -   time_scenarios:                                # List of chaos time scenarios to load
            - scenarios/time_scenarios_example.yml
+        -   litmus_scenarios:                              # List of litmus scenarios to load
+            - - https://hub.litmuschaos.io/api/chaos/1.10.0?file=charts/generic/node-cpu-hog/rbac.yaml
+              - scenarios/node_hog_engine.yaml

 cerberus:
    cerberus_enabled: False                                # Enable it when cerberus is previously installed
--- a/docs/litmus_scenarios.md
+++ b/docs/litmus_scenarios.md
@@ -0,0 +1,41 @@
+### Litmus Scenarios
+Kraken consumes [Litmus](https://github.com/litmuschaos/litmus) under the hood for some infrastructure, pod, and node scenarios
+ 
+The official Litmus documentation, with more information on the specifics of Litmus resources, can be found [here](https://docs.litmuschaos.io/docs/next/getstarted/).
+
+
+#### Litmus Chaos Custom Resources
+There are 3 custom resources that are created during each Litmus scenario. Below is a description of the resources:
+* ChaosEngine: A resource to link a Kubernetes application or Kubernetes node to a ChaosExperiment. ChaosEngine is watched by Litmus' Chaos-Operator which then invokes Chaos-Experiments
+* ChaosExperiment: A resource to group the configuration parameters of a chaos experiment. ChaosExperiment CRs are created by the operator when experiments are invoked by ChaosEngine.
+* ChaosResult: A resource to hold the results of a chaos experiment. The Chaos-exporter reads the results and exports the metrics into a configured Prometheus server.
+
+### Understanding Litmus Scenarios  
+
+To run Litmus scenarios we need to apply 3 different resources/yaml files to our cluster
+1. **Chaos experiments** contain the actual chaos details of a scenario
+
+    i. This is installed automatically by Kraken (does not need to be specified in kraken scenario configuration)
+    
+2. **Service Account**: should be created to allow the chaosengine to run experiments in your application namespace. It usually grants just enough permissions in a specific namespace to be able to run the experiment properly.
+
+    i. This can be defined using either a link to a yaml file or a downloaded file in the scenarios folder
+    
+3. **Chaos Engine** connects the application instance to a Chaos Experiment. This is where you define the specifics of your scenario, i.e. the node or pod name you want to cause chaos within.
+
+    i. This is a downloaded yaml file in the scenarios folder, full list of scenarios can be found [here](https://hub.litmuschaos.io/)
+
+**NOTE**: By default all chaos experiments will be installed based on the version you give in the config file. 
+
+Adding a new Litmus based scenario is as simple as adding references to 2 new yaml files (the Service Account and Chaos Engine files for your scenario) in the Kraken config.
+
+### Current Scenarios
+
+The following is the initial list of scenarios for which a chaos scenario config exists today.
+
+Component                | Description                                                                                        | Working
+------------------------ | ---------------------------------------------------------------------------------------------------| ------------------------- |
+Node CPU Hog             | Chaos scenario that hogs up the CPU on a defined node for a specific amount of time                | :heavy_check_mark:        |
+
+
+
--- a/kraken/litmus/common_litmus.py
+++ b/kraken/litmus/common_litmus.py
@@ -0,0 +1,103 @@
+import kraken.invoke.command as runcommand
+import logging
+import time
+import sys
+
+
+# Install the litmus operator and wait until its deployment reports Available
+def install_litmus(version):
+    # Operator manifest published for the requested litmus release
+    operator_manifest = "https://litmuschaos.github.io/litmus/litmus-operator-%s.yaml" % version
+    runcommand.invoke("kubectl apply -f " + operator_manifest)
+
+    # JSON patch that appends ANALYTICS=FALSE to the env list of the first container
+    # of the chaos-operator-ce deployment
+    analytics_patch = (
+        " [ { \"op\": \"add\", \"path\": \"/spec/template/spec/containers/0/env/-\", "
+        "\"value\": { \"name\": \"ANALYTICS\", \"value\": \"FALSE\" } } ]"
+    )
+    runcommand.invoke("oc patch -n litmus deployment.apps/chaos-operator-ce --type=json "
+                      "--patch '" + analytics_patch + "'")
+
+    # Block until the operator deployment is available
+    wait_command = "oc wait deploy -n litmus chaos-operator-ce --for=condition=Available"
+    runcommand.invoke(wait_command)
+
+
+# Apply the generic experiments chart from the litmus hub for the given version
+# version_string must start with 'v' (e.g. "v1.10.0"); otherwise an error is logged
+# and the process exits with status 1
+def deploy_all_experiments(version_string):
+    if version_string.startswith("v"):
+        # The hub URL takes the version without its leading 'v'
+        hub_version = version_string[1:]
+    else:
+        logging.error("Incorrect version string for litmus, needs to start with 'v' "
+                      "followed by a number")
+        sys.exit(1)
+
+    experiments_url = (
+        "https://hub.litmuschaos.io/api/chaos/%s?file=charts/generic/experiments.yaml"
+        % hub_version
+    )
+    runcommand.invoke("kubectl apply -f " + experiments_url)
+
+
+# Delete all chaosengine resources; no namespace flag is passed to kubectl
+# NOTE(review): despite the name, this removes chaosengine objects rather than
+# chaosexperiment objects - confirm that this is the intended cleanup
+def delete_experiments():
+    delete_command = "kubectl delete chaosengine --all"
+    runcommand.invoke(delete_command)
+
+
+# Read one field under .status of a litmus resource through kubectl's jsonpath output
+def _get_status_field(resource, namespace, field):
+    output = runcommand.invoke("kubectl get %s -n %s -o jsonpath='{.status.%s}'"
+                               % (resource, namespace, field))
+    return output.strip()
+
+
+# Check status of experiment
+# First waits for the chaos engine to report "running" or "completed", then waits for
+# the chaos result verdict to leave "Awaited". Each wait polls every 10 seconds and
+# gives up after 30 polls (5 minutes).
+# Returns True only when the final verdict is "Pass". For any other verdict the
+# failStep of the chaos result is logged and False is returned. False is also returned
+# on timeout or when a polled status contains "notfound".
+def check_experiment(engine_name, experiment_name, namespace):
+    max_tries = 30
+    poll_interval = 10
+
+    engine_resource = "chaosengines/%s" % engine_name
+    engine_status = _get_status_field(engine_resource, namespace, "engineStatus")
+    engine_counter = 0
+    while engine_status.lower() not in ("running", "completed"):
+        # The budget is checked before sleeping so that the outcome of the last poll is
+        # still evaluated by the loop condition and the wait is capped at 30 polls
+        if engine_counter >= max_tries:
+            logging.error("Chaos engine took longer than 5 minutes to be running or complete")
+            return False
+        logging.info("Waiting for engine to start running.")
+        time.sleep(poll_interval)
+        engine_status = _get_status_field(engine_resource, namespace, "engineStatus")
+        engine_counter += 1
+        # need to see if error in run
+        if "notfound" in engine_status.lower():
+            logging.info("Chaos engine was not found")
+            return False
+
+    # The chaos result is looked up as <engine name>-<experiment name>
+    result_resource = "chaosresult %s-%s" % (engine_name, experiment_name)
+    status = _get_status_field(result_resource, namespace, "experimentstatus.verdict")
+    result_counter = 0
+    while status == "Awaited":
+        if result_counter >= max_tries:
+            logging.error("Chaos results took longer than 5 minutes to get a final result")
+            return False
+        logging.info("Waiting for chaos result to finish, sleeping 10 seconds")
+        time.sleep(poll_interval)
+        status = _get_status_field(result_resource, namespace, "experimentstatus.verdict")
+        result_counter += 1
+        if "notfound" in status.lower():
+            logging.info("Chaos result was not found")
+            return False
+
+    if status == "Pass":
+        return True
+
+    # Any other verdict counts as a failure; log the step that failed for debugging
+    fail_step = _get_status_field(result_resource, namespace, "experimentstatus.failStep")
+    logging.info("Chaos result failed information: " + str(fail_step))
+    return False
+
+
+# Delete all chaos engines, chaos experiments and chaos results in a given namespace
+def delete_chaos(namespace):
+    # Same order as before: engines first, then experiments, then results
+    for resource_kind in ("chaosengine", "chaosexperiment", "chaosresult"):
+        runcommand.invoke("kubectl delete " + resource_kind + " --all -n " + str(namespace))
+
+
+# Uninstall litmus operator
+# Deletes everything declared in the operator manifest of the given litmus version
+def uninstall_litmus(version):
+    operator_manifest = "https://litmuschaos.github.io/litmus/litmus-operator-%s.yaml" % version
+    runcommand.invoke("kubectl delete -f " + operator_manifest)
--- a/run_kraken.py
+++ b/run_kraken.py
@@ -10,6 +10,7 @@ import requests
 import pyfiglet
 import kraken.kubernetes.client as kubecli
 import kraken.invoke.command as runcommand
+import kraken.litmus.common_litmus as common_litmus
 import kraken.node_actions.common_node_functions as nodeaction
 from kraken.node_actions.aws_node_scenarios import aws_node_scenarios
 from kraken.node_actions.general_cloud_node_scenarios import general_node_scenarios
@@ -229,6 +230,53 @@ def time_scenarios(scenarios_list, config):
                publish_kraken_status(config, not_reset)


+# Run the litmus scenarios listed in the config
+# Each scenario is a list of yaml references (URLs or local paths). Every reference is
+# applied with kubectl; when the applied yaml is a ChaosEngine, each of its experiments
+# is checked through common_litmus.check_experiment.
+#   scenarios_list:    list of scenarios, each a list of yaml URLs/paths
+#   config:            loaded kraken config, passed on to cerberus_integration
+#   litmus_namespaces: namespaces of the chaos engines seen so far (extended in place)
+#   litmus_uninstall:  when true, the yamls of a scenario are deleted after it has run
+# Returns litmus_namespaces so the caller can clean those namespaces up later.
+def litmus_scenarios(scenarios_list, config, litmus_namespaces, litmus_uninstall):
+    # Loop to run the scenarios starts here
+    for l_scenario in scenarios_list:
+        # Bound before the try so the except handler can always report it, even when
+        # the failure happens before the first item of the scenario is reached
+        item = None
+        try:
+            for item in l_scenario:
+                runcommand.invoke("kubectl apply -f %s" % item)
+                # Only real URLs are fetched over the network; a local path that merely
+                # contains "http" is read from disk
+                if item.startswith(("http://", "https://")):
+                    f = requests.get(item)
+                    yaml_item = list(yaml.safe_load_all(f.content))[0]
+                else:
+                    with open(item, "r") as f:
+                        logging.info("opened yaml" + str(item))
+                        yaml_item = list(yaml.safe_load_all(f))[0]
+
+                # Only chaos engines have experiments to verify
+                if yaml_item['kind'] != "ChaosEngine":
+                    continue
+
+                engine_name = yaml_item['metadata']['name']
+                namespace = yaml_item['metadata']['namespace']
+                # Record each namespace once so the final cleanup is not repeated
+                if namespace not in litmus_namespaces:
+                    litmus_namespaces.append(namespace)
+                for expr in yaml_item['spec']['experiments']:
+                    expr_name = expr['name']
+                    experiment_result = common_litmus.check_experiment(engine_name,
+                                                                       expr_name,
+                                                                       namespace)
+                    if experiment_result:
+                        logging.info("Scenario: %s has been successfully injected!"
+                                     % item)
+                    else:
+                        logging.info("Scenario: %s was not successfully injected!"
+                                     % item)
+
+            # Resources are removed exactly once per scenario, whether or not the
+            # experiments passed; deleting them a second time would target objects
+            # that no longer exist
+            if litmus_uninstall:
+                for l_item in l_scenario:
+                    logging.info('item ' + str(l_item))
+                    runcommand.invoke("kubectl delete -f %s" % l_item)
+            cerberus_integration(config)
+            logging.info("Waiting for the specified duration: %s" % wait_duration)
+            time.sleep(wait_duration)
+        except Exception as e:
+            logging.error("Failed to run litmus scenario: %s. Encountered "
+                          "the following exception: %s" % (item, e))
+    return litmus_namespaces
+
+
 # Main function
 def main(cfg):
    # Start kraken
@@ -242,6 +290,8 @@ def main(cfg):
        global kubeconfig_path, wait_duration
        kubeconfig_path = config["kraken"].get("kubeconfig_path", "")
        chaos_scenarios = config["kraken"].get("chaos_scenarios", [])
+        litmus_version = config['kraken'].get("litmus_version", 'v1.9.1')
+        litmus_uninstall = config['kraken'].get("litmus_uninstall", False)
        wait_duration = config["tunings"].get("wait_duration", 60)
        iterations = config["tunings"].get("iterations", 1)
        daemon_mode = config["tunings"].get("daemon_mode", False)
@@ -277,6 +327,8 @@ def main(cfg):
            iterations = int(iterations)

        failed_post_scenarios = []
+        litmus_namespaces = []
+        litmus_installed = False
        # Loop to run the chaos starts here
        while (int(iteration) < iterations):
            # Inject chaos scenarios specified in the config
@@ -298,9 +350,23 @@ def main(cfg):
                        # Inject time skew chaos scenarios specified in the config
                        elif scenario_type == "time_scenarios":
                            time_scenarios(scenarios_list, config)
+                        elif scenario_type == "litmus_scenarios":
+                            if not litmus_installed:
+                                common_litmus.install_litmus(litmus_version)
+                                common_litmus.deploy_all_experiments(litmus_version)
+                                litmus_installed = True
+                            litmus_namespaces = litmus_scenarios(scenarios_list, config,
+                                                                 litmus_namespaces,
+                                                                 litmus_uninstall)

            iteration += 1
            logging.info("")
+        if litmus_uninstall and litmus_installed:
+            for namespace in litmus_namespaces:
+                common_litmus.delete_chaos(namespace)
+            common_litmus.delete_experiments()
+            common_litmus.uninstall_litmus(litmus_version)
+
        if failed_post_scenarios:
            logging.error("Post scenarios are still failing at the end of all iterations")
            sys.exit(1)
--- a/scenarios/node_hog_engine.yaml
+++ b/scenarios/node_hog_engine.yaml
@@ -0,0 +1,25 @@
+apiVersion: litmuschaos.io/v1alpha1
+kind: ChaosEngine
+metadata:
+  name: nginx-chaos
+  namespace: default
+spec:
+  # It can be true/false
+  annotationCheck: 'false'
+  # It can be active/stop
+  engineState: 'active'
+  chaosServiceAccount: node-cpu-hog-sa
+  monitoring: false
+  # It can be delete/retain
+  jobCleanUpPolicy: 'delete'
+  experiments:
+    - name: node-cpu-hog
+      spec:
+        components:
+          env:
+            # set chaos duration (in sec) as desired
+            - name: TOTAL_CHAOS_DURATION
+              value: '60'
+            # ENTER THE COMMA SEPARATED TARGET NODES NAME
+            - name: TARGET_NODES
+              value: ''