Add support to run on Kubernetes

This commit:
- Uses the distribution flag set by the user in the config to skip
  features that are only supported on OpenShift, so that scenarios
  can run on plain Kubernetes.
- Adds sample config and scenario files that work on Kubernetes.
Naga Ravi Chaitanya Elluri
2022-05-17 17:04:49 -04:00
parent 23d9a26f52
commit 9208f39e06
36 changed files with 179 additions and 90 deletions
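
The mechanism is straightforward: scenario types that depend on OpenShift-only machinery (time skew and Litmus, per the README table and run_kraken.py changes below) now check the user-set distribution flag and refuse to run when it is kubernetes. A minimal sketch of the pattern, distilled from the diffs below; the helper name is hypothetical, kraken inlines these checks:

    import logging
    import sys

    # Scenario types this commit gates behind distribution == "openshift".
    OPENSHIFT_ONLY = {"time_scenarios", "litmus_scenarios"}

    def ensure_supported(distribution: str, scenario_type: str) -> None:
        # Exit up front rather than failing mid-run on an unsupported cluster.
        if distribution != "openshift" and scenario_type in OPENSHIFT_ONLY:
            logging.error("%s are currently supported only on openshift", scenario_type)
            sys.exit(1)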


@@ -56,27 +56,19 @@ Instructions on how to setup the config and the options supported can be found a
 ### Kubernetes/OpenShift chaos scenarios supported
-- [Pod Scenarios](docs/pod_scenarios.md)
-- [Container Scenarios](docs/container_scenarios.md)
-- [Node Scenarios](docs/node_scenarios.md)
-- [Time Scenarios](docs/time_scenarios.md)
-- [Litmus Scenarios](docs/litmus_scenarios.md)
-- [Cluster Shut Down Scenarios](docs/cluster_shut_down_scenarios.md)
-- [Namespace Scenarios](docs/namespace_scenarios.md)
-- [Zone Outage Scenarios](docs/zone_outage.md)
-- [Application_outages](docs/application_outages.md)
-- [PVC scenario](docs/pvc_scenario.md)
-- [Network_Chaos](docs/network_chaos.md)
+Scenario type | Kubernetes | OpenShift
+------------- | ---------- | ---------
+[Pod Scenarios](docs/pod_scenarios.md) | :heavy_check_mark: | :heavy_check_mark:
+[Container Scenarios](docs/container_scenarios.md) | :heavy_check_mark: | :heavy_check_mark:
+[Node Scenarios](docs/node_scenarios.md) | :heavy_check_mark: | :heavy_check_mark:
+[Time Scenarios](docs/time_scenarios.md) | :x: | :heavy_check_mark:
+[Litmus Scenarios](docs/litmus_scenarios.md) | :x: | :heavy_check_mark:
+[Cluster Shut Down Scenarios](docs/cluster_shut_down_scenarios.md) | :heavy_check_mark: | :heavy_check_mark:
+[Namespace Scenarios](docs/namespace_scenarios.md) | :heavy_check_mark: | :heavy_check_mark:
+[Zone Outage Scenarios](docs/zone_outage.md) | :heavy_check_mark: | :heavy_check_mark:
+[Application_outages](docs/application_outages.md) | :heavy_check_mark: | :heavy_check_mark:
+[PVC scenario](docs/pvc_scenario.md) | :heavy_check_mark: | :heavy_check_mark:
+[Network_Chaos](docs/network_chaos.md) | :heavy_check_mark: | :heavy_check_mark:
 ### Kraken scenario pass/fail criteria and report


@@ -10,40 +10,40 @@ kraken:
     litmus_uninstall_before_run: True # If you want to uninstall litmus before a new run starts
     chaos_scenarios: # List of policies/chaos scenarios to load
         - container_scenarios: # List of chaos pod scenarios to load
-            - - scenarios/container_etcd.yml
+            - - scenarios/openshift/container_etcd.yml
         - pod_scenarios:
-            - - scenarios/etcd.yml
-            - - scenarios/regex_openshift_pod_kill.yml
-              - scenarios/post_action_regex.py
+            - - scenarios/openshift/etcd.yml
+            - - scenarios/openshift/regex_openshift_pod_kill.yml
+              - scenarios/openshift/post_action_regex.py
         - node_scenarios: # List of chaos node scenarios to load
-            - scenarios/node_scenarios_example.yml
+            - scenarios/openshift/node_scenarios_example.yml
         - pod_scenarios:
-            - - scenarios/openshift-apiserver.yml
-            - - scenarios/openshift-kube-apiserver.yml
+            - - scenarios/openshift/openshift-apiserver.yml
+            - - scenarios/openshift/openshift-kube-apiserver.yml
         - time_scenarios: # List of chaos time scenarios to load
-            - scenarios/time_scenarios_example.yml
+            - scenarios/openshift/time_scenarios_example.yml
         - litmus_scenarios: # List of litmus scenarios to load
-            - - scenarios/templates/litmus-rbac.yaml
-              - scenarios/node_cpu_hog_engine.yaml
-            - - scenarios/templates/litmus-rbac.yaml
-              - scenarios/node_mem_engine.yaml
-            - - scenarios/templates/litmus-rbac.yaml
-              - scenarios/node_io_engine.yaml
+            - - scenarios/openshift/templates/litmus-rbac.yaml
+              - scenarios/openshift/node_cpu_hog_engine.yaml
+            - - scenarios/openshift/templates/litmus-rbac.yaml
+              - scenarios/openshift/node_mem_engine.yaml
+            - - scenarios/openshift/templates/litmus-rbac.yaml
+              - scenarios/openshift/node_io_engine.yaml
         - cluster_shut_down_scenarios:
-            - - scenarios/cluster_shut_down_scenario.yml
-              - scenarios/post_action_shut_down.py
+            - - scenarios/openshift/cluster_shut_down_scenario.yml
+              - scenarios/openshift/post_action_shut_down.py
         - namespace_scenarios:
-            - - scenarios/regex_namespace.yaml
-            - - scenarios/ingress_namespace.yaml
-              - scenarios/post_action_namespace.py
+            - - scenarios/openshift/regex_namespace.yaml
+            - - scenarios/openshift/ingress_namespace.yaml
+              - scenarios/openshift/post_action_namespace.py
         - zone_outages:
-            - scenarios/zone_outage.yaml
+            - scenarios/openshift/zone_outage.yaml
         - application_outages:
-            - scenarios/app_outage.yaml
+            - scenarios/openshift/app_outage.yaml
         - pvc_scenarios:
-            - scenarios/pvc_scenario.yaml
+            - scenarios/openshift/pvc_scenario.yaml
         - network_chaos:
-            - scenarios/network_chaos.yaml
+            - scenarios/openshift/network_chaos.yaml
 cerberus:
     cerberus_enabled: False # Enable it when cerberus is previously installed


@@ -0,0 +1,38 @@
+kraken:
+    distribution: kubernetes # Distribution can be kubernetes or openshift
+    kubeconfig_path: /root/.kube/config # Path to kubeconfig
+    exit_on_failure: False # Exit when a post action scenario fails
+    port: 8081
+    publish_kraken_status: True # Can be accessed at http://0.0.0.0:8081
+    signal_state: RUN # Will wait for the RUN signal when set to PAUSE before running the scenarios, refer docs/signal.md for more details
+    litmus_version: v1.13.6 # Litmus version to install
+    litmus_uninstall: False # If you want to uninstall litmus if failure
+    litmus_uninstall_before_run: True # If you want to uninstall litmus before a new run starts
+    chaos_scenarios: # List of policies/chaos scenarios to load
+        - container_scenarios: # List of chaos pod scenarios to load
+            - - scenarios/kube/container_dns.yml
+        - pod_scenarios:
+            - - scenarios/kube/scheduler.yml
+cerberus:
+    cerberus_enabled: False # Enable it when cerberus is previously installed
+    cerberus_url: # When cerberus_enabled is set to True, provide the url where cerberus publishes go/no-go signal
+    check_applicaton_routes: False # When enabled will look for application unavailability using the routes specified in the cerberus config and fails the run
+performance_monitoring:
+    deploy_dashboards: False # Install a mutable grafana and load the performance dashboards. Enable this only when running on OpenShift
+    repo: "https://github.com/cloud-bulldozer/performance-dashboards.git"
+    kube_burner_binary_url: "https://github.com/cloud-bulldozer/kube-burner/releases/download/v0.9.1/kube-burner-0.9.1-Linux-x86_64.tar.gz"
+    capture_metrics: False
+    config_path: config/kube_burner.yaml # Define the Elasticsearch url and index name in this config
+    metrics_profile_path: config/metrics-aggregated.yaml
+    prometheus_url: # The prometheus url/route is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes.
+    prometheus_bearer_token: # The bearer token is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes. This is needed to authenticate with prometheus.
+    uuid: # uuid for the run is generated by default if not set
+    enable_alerts: False # Runs the queries specified in the alert profile and displays the info or exits 1 when severity=error
+    alert_profile: config/alerts # Path to alert profile with the prometheus queries
+tunings:
+    wait_duration: 60 # Duration to wait between each chaos scenario
+    iterations: 1 # Number of times to execute the scenarios
+    daemon_mode: False # Iterations are set to infinity which means that the kraken will cause chaos forever
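
A quick way to sanity-check a sample config like the one above is to load it the way kraken would and inspect the flag. The path below is a hypothetical placeholder, since the viewer dropped the new file's name:

    import yaml

    # Hypothetical path for the Kubernetes sample config added above.
    with open("config/config_kube.yaml") as f:
        cfg = yaml.safe_load(f)

    assert cfg["kraken"]["distribution"] in ("kubernetes", "openshift")
    print(cfg["kraken"]["distribution"])  # -> kubernetes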


@@ -10,33 +10,34 @@ kraken:
     litmus_uninstall_before_run: True # If you want to uninstall litmus before a new run starts
     chaos_scenarios: # List of policies/chaos scenarios to load
         - pod_scenarios: # List of chaos pod scenarios to load
-            - - scenarios/etcd.yml
-            - - scenarios/regex_openshift_pod_kill.yml
-              - scenarios/post_action_regex.py
+            - - scenarios/openshift/etcd.yml
+            - - scenarios/openshift/regex_openshift_pod_kill.yml
+              - scenarios/openshift/post_action_regex.py
         - node_scenarios: # List of chaos node scenarios to load
-            - scenarios/node_scenarios_example.yml
+            - scenarios/openshift/node_scenarios_example.yml
         - pod_scenarios:
-            - - scenarios/openshift-apiserver.yml
-            - - scenarios/openshift-kube-apiserver.yml
+            - - scenarios/openshift/openshift-apiserver.yml
+            - - scenarios/openshift/openshift-kube-apiserver.yml
         - time_scenarios: # List of chaos time scenarios to load
-            - scenarios/time_scenarios_example.yml
+            - scenarios/openshift/time_scenarios_example.yml
         - litmus_scenarios: # List of litmus scenarios to load
             - - https://hub.litmuschaos.io/api/chaos/1.10.0?file=charts/generic/node-cpu-hog/rbac.yaml
-              - scenarios/node_cpu_hog_engine.yaml
+              - scenarios/openshift/node_cpu_hog_engine.yaml
         - cluster_shut_down_scenarios:
-            - - scenarios/cluster_shut_down_scenario.yml
-              - scenarios/post_action_shut_down.py
+            - - scenarios/openshift/cluster_shut_down_scenario.yml
+              - scenarios/openshift/post_action_shut_down.py
         - namespace_scenarios:
-            - scenarios/regex_namespace.yaml
-            - scenarios/ingress_namespace.yaml
+            - scenarios/openshift/regex_namespace.yaml
+            - scenarios/openshift/ingress_namespace.yaml
         - zone_outages:
-            - scenarios/zone_outage.yaml
+            - scenarios/openshift/zone_outage.yaml
         - application_outages:
-            - scenarios/app_outage.yaml
+            - scenarios/openshift/app_outage.yaml
         - pvc_scenarios:
-            - scenarios/pvc_scenario.yaml
+            - scenarios/openshift/pvc_scenario.yaml
         - network_chaos:
-            - scenarios/network_chaos.yaml
+            - scenarios/openshift/network_chaos.yaml
 cerberus:
     cerberus_enabled: True # Enable it when cerberus is previously installed
     cerberus_url: http://0.0.0.0:8080 # When cerberus_enabled is set to True, provide the url where cerberus publishes go/no-go signal


@@ -34,10 +34,14 @@ def scrape_metrics(
"""
if not prometheus_url:
logging.info("Looks like prometheus_url is not defined, trying to use the default instance on the cluster")
prometheus_url, prometheus_bearer_token = prometheus.instance(
distribution, prometheus_url, prometheus_bearer_token
)
if distribution == "openshift":
logging.info("Looks like prometheus_url is not defined, trying to use the default instance on the cluster")
prometheus_url, prometheus_bearer_token = prometheus.instance(
distribution, prometheus_url, prometheus_bearer_token
)
else:
logging.error("Looks like proemtheus url is not defined, exiting")
sys.exit(1)
command = (
"./kube-burner index --uuid "
+ str(uuid)
@@ -69,10 +73,14 @@ def alerts(distribution, prometheus_url, prometheus_bearer_token, start_time, en
"""
if not prometheus_url:
logging.info("Looks like prometheus_url is not defined, trying to use the default instance on the cluster")
prometheus_url, prometheus_bearer_token = prometheus.instance(
distribution, prometheus_url, prometheus_bearer_token
)
if distribution == "openshift":
logging.info("Looks like prometheus_url is not defined, trying to use the default instance on the cluster")
prometheus_url, prometheus_bearer_token = prometheus.instance(
distribution, prometheus_url, prometheus_bearer_token
)
else:
logging.error("Looks like proemtheus url is not defined, exiting")
sys.exit(1)
command = (
"./kube-burner check-alerts "
+ " -u "


@@ -120,7 +120,7 @@ def container_killing_in_pod(cont_scenario):
     for pod in pods:
         if type(pod) == list:
             container_names = runcommand.invoke(
-                'oc get pods %s -n %s -o jsonpath="{.spec.containers[*].name}"' % (pod[0], pod[1])
+                'kubectl get pods %s -n %s -o jsonpath="{.spec.containers[*].name}"' % (pod[0], pod[1])
             ).split(" ")
             container_pod_list.append([pod[0], pod[1], container_names])
         else:
@@ -176,7 +176,9 @@ def check_failed_containers(killed_container_list, wait_time):
     while timer <= wait_time:
         for killed_container in killed_container_list:
             # killed_container holds [pod, namespace, container name]
-            pod_output = runcommand.invoke("oc get pods %s -n %s -o yaml" % (killed_container[0], killed_container[1]))
+            pod_output = runcommand.invoke(
+                "kubectl get pods %s -n %s -o yaml" % (killed_container[0], killed_container[1])
+            )
             pod_output_yaml = yaml.full_load(pod_output)
             for statuses in pod_output_yaml["status"]["containerStatuses"]:
                 if statuses["name"] == killed_container[2]:


@@ -175,29 +175,37 @@ def main(cfg):
         # Inject time skew chaos scenarios specified in the config
         elif scenario_type == "time_scenarios":
-            logging.info("Running time skew scenarios")
-            time_actions.run(scenarios_list, config, wait_duration)
+            if distribution == "openshift":
+                logging.info("Running time skew scenarios")
+                time_actions.run(scenarios_list, config, wait_duration)
+            else:
+                logging.error("Time scenarios are currently supported only on openshift")
+                sys.exit(1)
         # Inject litmus based chaos scenarios
         elif scenario_type == "litmus_scenarios":
-            logging.info("Running litmus scenarios")
-            litmus_namespace = "litmus"
-            if not litmus_installed:
-                # Remove Litmus resources before running the scenarios
-                common_litmus.delete_chaos(litmus_namespace)
-                common_litmus.delete_chaos_experiments(litmus_namespace)
-                if litmus_uninstall_before_run:
-                    common_litmus.uninstall_litmus(litmus_version, litmus_namespace)
-                common_litmus.install_litmus(litmus_version, litmus_namespace)
-                common_litmus.deploy_all_experiments(litmus_version, litmus_namespace)
-                litmus_installed = True
-            common_litmus.run(
-                scenarios_list,
-                config,
-                litmus_uninstall,
-                wait_duration,
-                litmus_namespace,
-            )
+            if distribution == "openshift":
+                logging.info("Running litmus scenarios")
+                litmus_namespace = "litmus"
+                if not litmus_installed:
+                    # Remove Litmus resources before running the scenarios
+                    common_litmus.delete_chaos(litmus_namespace)
+                    common_litmus.delete_chaos_experiments(litmus_namespace)
+                    if litmus_uninstall_before_run:
+                        common_litmus.uninstall_litmus(litmus_version, litmus_namespace)
+                    common_litmus.install_litmus(litmus_version, litmus_namespace)
+                    common_litmus.deploy_all_experiments(litmus_version, litmus_namespace)
+                    litmus_installed = True
+                common_litmus.run(
+                    scenarios_list,
+                    config,
+                    litmus_uninstall,
+                    wait_duration,
+                    litmus_namespace,
+                )
+            else:
+                logging.error("Litmus scenarios are currently only supported on openshift")
+                sys.exit(1)
         # Inject cluster shutdown scenarios
         elif scenario_type == "cluster_shut_down_scenarios":


@@ -0,0 +1,8 @@
+scenarios:
+  - name: "kill dns container"
+    namespace: "kube-system"
+    label_selector: "k8s-app=kube-dns"
+    container_name: ""
+    action: "kill 1"
+    count: 1
+    retry_wait: 60
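
This scenario matches pods labeled k8s-app=kube-dns in kube-system, runs kill 1 inside a matched container (terminating PID 1, the entrypoint, so the container runtime restarts it), and waits retry_wait seconds before checking recovery. Done by hand, the injection step is roughly the following, assuming the image ships a kill binary; the pod and container names are hypothetical placeholders:

    import subprocess

    # Kill PID 1 (the container entrypoint) so the runtime restarts the container.
    subprocess.run(
        ["kubectl", "exec", "-n", "kube-system", "kube-dns-0000000000-00000",
         "-c", "kubedns", "--", "kill", "1"],
        check=True,
    )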

scenarios/kube/scheduler.yml (new executable file)

@@ -0,0 +1,32 @@
+config:
+  runStrategy:
+    runs: 1
+    maxSecondsBetweenRuns: 30
+    minSecondsBetweenRuns: 1
+scenarios:
+  - name: "delete scheduler pods"
+    steps:
+      - podAction:
+          matches:
+            - labels:
+                namespace: "kube-system"
+                selector: "k8s-app=kube-scheduler"
+          filters:
+            - randomSample:
+                size: 1
+          actions:
+            - kill:
+                probability: 1
+                force: true
+      - podAction:
+          matches:
+            - labels:
+                namespace: "kube-system"
+                selector: "k8s-app=kube-scheduler"
+          retries:
+            retriesTimeout:
+              timeout: 180
+          actions:
+            - checkPodCount:
+                count: 3
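
The policy kills one randomly sampled kube-scheduler pod, then the second podAction asserts the pod count is back to 3 (i.e. a cluster with three control-plane nodes) within the 180-second retry window. The check step is equivalent to polling along these lines; a hand-rolled sketch, not kraken code:

    import subprocess
    import sys
    import time

    EXPECTED_PODS, TIMEOUT_S = 3, 180  # mirrors count: 3 and timeout: 180 above

    deadline = time.time() + TIMEOUT_S
    while time.time() < deadline:
        out = subprocess.run(
            ["kubectl", "get", "pods", "-n", "kube-system",
             "-l", "k8s-app=kube-scheduler", "--no-headers"],
            capture_output=True, text=True, check=True,
        ).stdout
        if len(out.strip().splitlines()) == EXPECTED_PODS:
            print("kube-scheduler pods recovered")
            sys.exit(0)
        time.sleep(5)
    sys.exit("kube-scheduler pods did not recover in time")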