Merge pull request #21 from paigerube14/component_recovered

Check if killed component(s) recovered
This commit is contained in:
Mike Fiedler
2020-08-20 15:18:12 -04:00
committed by GitHub
16 changed files with 321 additions and 35 deletions

View File

@@ -1,11 +1,15 @@
kraken:
kubeconfig_path: /root/.kube/config # Path to kubeconfig
exit_on_failure: False # Exit when a post action scenario fails
scenarios: # List of policies/chaos scenarios to load
- scenarios/etcd.yml
- scenarios/openshift-kube-apiserver.yml
- scenarios/openshift-apiserver.yml
- scenarios/regex_openshift_pod_kill.yml
- - scenarios/etcd.yml
- scenarios/post_action_etcd_example.sh
- - scenarios/openshift-kube-apiserver.yml
- scenarios/post_action_openshift-kube-apiserver.yml
- - scenarios/openshift-apiserver.yml
- scenarios/post_action_openshift-apiserver.yml
- - scenarios/regex_openshift_pod_kill.yml
- scenarios/post_action_regex.py
cerberus:
cerberus_enabled: False # Enable it when cerberus is previously installed
cerberus_url: # When cerberus_enabled is set to True, provide the url where cerberus publishes go/no-go signal

View File

@@ -5,8 +5,10 @@ import logging
# Invokes a given command and returns the stdout
def invoke(command):
    """Run *command* in a shell and return its combined stdout/stderr text.

    stderr is merged into stdout (stderr=subprocess.STDOUT) so callers see a
    single output stream. On failure the error is logged and an empty string
    is returned instead of raising.
    """
    # Initialize so a Popen failure returns "" rather than hitting a
    # NameError on the return below.
    out = ""
    try:
        proc = subprocess.Popen(command, shell=True,
                                universal_newlines=True, stdout=subprocess.PIPE,
                                stderr=subprocess.STDOUT)
        (out, err) = proc.communicate()
    except Exception as e:
        logging.error("Failed to run %s, error: %s" % (command, e))
    return out

View File

@@ -4,6 +4,7 @@ import logging
import kraken.invoke.command as runcommand
import json
kraken_node_name = ""

View File

@@ -12,6 +12,109 @@ import kraken.invoke.command as runcommand
import pyfiglet
# Get cerberus status
def cerberus_integration(config):
    """Poll the Cerberus go/no-go signal if cerberus is enabled.

    Returns True when cerberus is disabled or reports a healthy cluster.
    Exits the process (status 1) when the url is missing or cerberus reports
    an unhealthy cluster.
    """
    cerberus_status = True
    if config["cerberus"]["cerberus_enabled"]:
        cerberus_url = config["cerberus"]["cerberus_url"]
        if not cerberus_url:
            logging.error("url where Cerberus publishes True/False signal is not provided.")
            sys.exit(1)
        # Cerberus publishes the literal bytes b'True' / b'False'.
        cerberus_status = requests.get(cerberus_url).content
        cerberus_status = True if cerberus_status == b'True' else False
        if not cerberus_status:
            logging.error("Received a no-go signal from Cerberus, looks like "
                          "the cluster is unhealthy. Please check the Cerberus "
                          "report for more details. Test failed.")
            sys.exit(1)
        else:
            # Fixed typo: "Ceberus" -> "Cerberus".
            logging.info("Received a go signal from Cerberus, the cluster is healthy. "
                         "Test passed.")
    return cerberus_status
# Function to publish kraken status to cerberus
def publish_kraken_status(config, failed_post_scenarios):
    """Log the combined cerberus/post-scenario health state.

    Exits the process (status 1) when post action scenarios are failing and
    the config asks for exit_on_failure.
    """
    cerberus_status = cerberus_integration(config)
    if not failed_post_scenarios:
        # Nothing failing on the kraken side; nothing to report.
        return
    if cerberus_status:
        if config['kraken']['exit_on_failure']:
            logging.info("Cerberus status is healthy but post action scenarios "
                         "are still failing, exiting kraken run")
            sys.exit(1)
        logging.info("Cerberus status is healthy but post action scenarios "
                     "are still failing")
    else:
        if config['kraken']['exit_on_failure']:
            logging.info("Cerberus status is not healthy and post action scenarios "
                         "are still failing, exiting kraken run")
            sys.exit(1)
        logging.info("Cerberus status is not healthy and post action scenarios "
                     "are still failing")
def run_post_action(kubeconfig_path, scenario, pre_action_output=""):
    """Run a post-chaos check scenario and report whether it passed.

    scenario may be a powerfulseal policy (.yml/.yaml), a python script (.py)
    or any other executable (treated as a bash script). For .py and bash
    scenarios the output is compared against pre_action_output when given.

    Returns False on failure, otherwise the scenario's output (used later as
    the pre-check baseline).
    """
    if scenario.endswith(".yaml") or scenario.endswith(".yml"):
        action_output = runcommand.invoke("powerfulseal autonomous "
                                          "--use-pod-delete-instead-of-ssh-kill"
                                          " --policy-file %s --kubeconfig %s --no-cloud"
                                          " --inventory-kubernetes --headless"
                                          % (scenario, kubeconfig_path))
        # read output to make sure no error
        if "ERROR" in action_output:
            # Removed a no-op line that split out the first error message and
            # then discarded it.
            if not pre_action_output:
                logging.info("Powerful seal pre action check failed for " + str(scenario))
            return False
        else:
            logging.info(scenario + " post action checks passed")
    elif scenario.endswith(".py"):
        action_output = runcommand.invoke("python3 " + scenario).strip()
        if pre_action_output:
            if pre_action_output == action_output:
                logging.info(scenario + " post action checks passed")
            else:
                logging.info(scenario + ' post action response did not match pre check output')
                return False
    else:
        # invoke custom bash script
        action_output = runcommand.invoke(scenario).strip()
        if pre_action_output:
            if pre_action_output == action_output:
                logging.info(scenario + " post action checks passed")
            else:
                logging.info(scenario + ' post action response did not match pre check output')
                return False
    return action_output
# Perform the post scenario actions to see if components recovered
def post_actions(kubeconfig_path, scenario, failed_post_scenarios, pre_action_output):
    """Re-check previously failed post scenarios, then run the current one.

    failed_post_scenarios holds [scenario, pre_action_output] pairs; entries
    that now pass are removed. If the current scenario (scenario[1], when
    present) fails, it is appended. Returns the updated list.
    """
    # Iterate over a snapshot: the original code removed elements from the
    # list it was iterating, which silently skips the following element.
    for failed_scenario in list(failed_post_scenarios):
        post_action_output = run_post_action(kubeconfig_path,
                                             failed_scenario[0], failed_scenario[1])
        if post_action_output is not False:
            failed_post_scenarios.remove(failed_scenario)
        else:
            # Added the missing space before "is still failing".
            logging.info('Post action scenario ' + str(failed_scenario) + " is still failing")
    # check post actions
    if len(scenario) > 1:
        post_action_output = run_post_action(kubeconfig_path, scenario[1], pre_action_output)
        if post_action_output is False:
            failed_post_scenarios.append([scenario[1], pre_action_output])
    return failed_post_scenarios
# Main function
def main(cfg):
# Start kraken
# NOTE(review): the remainder of this block is a rendered unified diff.
# Hunk headers (@@ ... @@) separate regions, and removed/added lines appear
# interleaved WITHOUT +/- markers, so this span is not runnable as shown.
@@ -24,7 +127,6 @@ def main(cfg):
config = yaml.full_load(f)
kubeconfig_path = config["kraken"]["kubeconfig_path"]
scenarios = config["kraken"]["scenarios"]
# NOTE(review): the next line appears to be the pre-change (removed) read of
# cerberus_enabled — the inline cerberus block below was replaced by
# cerberus_integration()/publish_kraken_status() in this commit. TODO confirm.
cerberus_enabled = config["cerberus"]["cerberus_enabled"]
wait_duration = config["tunings"]["wait_duration"]
iterations = config["tunings"]["iterations"]
daemon_mode = config["tunings"]['daemon_mode']
@@ -59,41 +161,34 @@ def main(cfg):
% str(iterations))
iterations = int(iterations)
failed_post_scenarios = []
# Loop to run the chaos starts here
while (int(iteration) < iterations):
# Inject chaos scenarios specified in the config
logging.info("Executing scenarios for iteration " + str(iteration))
try:
# Loop to run the scenarios starts here
for scenario in scenarios:
logging.info("Injecting scenario: %s" % (scenario))
# NOTE(review): run a pre-chaos baseline of the post action check; its
# output is compared after the chaos injection (see post_actions()).
pre_action_output = run_post_action(kubeconfig_path, scenario[1])
runcommand.invoke("powerfulseal autonomous --use-pod-delete-instead-of-ssh-kill"
" --policy-file %s --kubeconfig %s --no-cloud"
" --inventory-kubernetes --headless"
# NOTE(review): the next two "%" lines are the old (scenario) vs new
# (scenario[0]) format-argument variants from the diff.
% (scenario, kubeconfig_path))
logging.info("Scenario: %s has been successfully injected!" % (scenario))
% (scenario[0], kubeconfig_path))
# NOTE(review): the following if-block is the PRE-change inline cerberus
# check (removed lines); it is superseded by cerberus_integration().
if cerberus_enabled:
cerberus_url = config["cerberus"]["cerberus_url"]
if not cerberus_url:
logging.error("url where Cerberus publishes True/False signal "
"is not provided.")
sys.exit(1)
cerberus_status = requests.get(cerberus_url).content
cerberus_status = True if cerberus_status == b'True' else False
if not cerberus_status:
logging.error("Received a no-go signal from Cerberus, looks like the"
" cluster is unhealthy. Please check the Cerberus report"
" for more details. Test failed.")
sys.exit(1)
else:
logging.info("Received a go signal from Ceberus, the cluster is "
"healthy. Test passed.")
logging.info("Scenario: %s has been successfully injected!" % (scenario[0]))
logging.info("Waiting for the specified duration: %s" % (wait_duration))
time.sleep(wait_duration)
# NOTE(review): post-change flow — verify component recovery and publish
# the combined status to cerberus.
failed_post_scenarios = post_actions(kubeconfig_path, scenario,
failed_post_scenarios, pre_action_output)
publish_kraken_status(config, failed_post_scenarios)
except Exception as e:
logging.error("Failed to run scenario: %s. Encountered the following exception: %s"
% (scenario, e))
% (scenario[0], e))
iteration += 1
logging.info("")
# NOTE(review): fail the run if any post action scenario never recovered.
if failed_post_scenarios:
logging.error("Post scenarios are still failing at the end of all iterations")
sys.exit(1)
else:
logging.error("Cannot find a config at %s, please check" % (cfg))
sys.exit(1)

3
scenarios/etcd.yml Normal file → Executable file
View File

@@ -11,12 +11,9 @@ scenarios:
- labels:
namespace: "openshift-etcd"
selector: "k8s-app=etcd"
filters:
- randomSample:
size: 1
# The actions will be executed in the order specified
actions:
- kill:
probability: 1

0
scenarios/openshift-apiserver.yml Normal file → Executable file
View File

1
scenarios/openshift-kube-apiserver.yml Normal file → Executable file
View File

@@ -11,7 +11,6 @@ scenarios:
- labels:
namespace: "openshift-kube-apiserver"
selector: "app=openshift-kube-apiserver"
filters:
- randomSample:
size: 1

21
scenarios/post_action_etcd.yml Executable file
View File

@@ -0,0 +1,21 @@
config:
runStrategy:
runs: 1
maxSecondsBetweenRuns: 10
minSecondsBetweenRuns: 1
scenarios:
- name: "check 3 pods are in namespace with selector: etcd"
steps:
- podAction:
matches:
- labels:
namespace: "openshift-etcd"
selector: "k8s-app=etcd"
filters:
- property:
name: "state"
value: "Running"
# The actions will be executed in the order specified
actions:
- checkPodCount:
count: 3

View File

@@ -0,0 +1,3 @@
#!/bin/bash
# Print the number of Running pods in the openshift-etcd namespace.
# NOTE(review): run_kraken compares this script's output before and after the
# chaos scenario (see run_post_action's bash branch) — keep the output stable.
pods="$(oc get pods -n openshift-etcd | grep -c Running)"
echo "$pods"

View File

@@ -0,0 +1,23 @@
#!/usr/bin/env python3
import subprocess
import logging
def run(cmd):
    """Run *cmd* in a shell and return its combined stdout/stderr text.

    On failure the error is logged and an empty string is returned; the
    original fell through to `return out` with `out` undefined (NameError).
    """
    out = ""
    try:
        proc = subprocess.Popen(cmd, shell=True,
                                universal_newlines=True, stdout=subprocess.PIPE,
                                stderr=subprocess.STDOUT)
        (out, err) = proc.communicate()
        logging.info("out " + str(out))
    except Exception as e:
        logging.error("Failed to run %s, error: %s" % (cmd, e))
    return out
# Count Running pods in openshift-etcd and report whether the expected 3 are up.
# The "ERROR" prefix is what run_kraken greps for to detect a failed check.
running_count = run("oc get pods -n openshift-etcd | grep -c Running").rstrip()
if running_count == str(3):
    print("There were 3 pods running properly")
else:
    print("ERROR there were " + str(running_count) + " pods running instead of 3")

View File

@@ -0,0 +1,23 @@
config:
runStrategy:
runs: 1
maxSecondsBetweenRuns: 30
minSecondsBetweenRuns: 1
scenarios:
- name: "check 3 pods are in namespace with selector: openshift-apiserver"
steps:
- podAction:
matches:
- labels:
namespace: "openshift-apiserver"
selector: "app=openshift-apiserver"
filters:
- property:
name: "state"
value: "Running"
# The actions will be executed in the order specified
actions:
- checkPodCount:
count: 3

View File

@@ -0,0 +1,21 @@
config:
runStrategy:
runs: 1
maxSecondsBetweenRuns: 30
minSecondsBetweenRuns: 1
scenarios:
- name: "check 3 pods are in namespace with selector: openshift-kube-apiserver"
steps:
- podAction:
matches:
- labels:
namespace: "openshift-kube-apiserver"
selector: "app=openshift-kube-apiserver"
filters:
- property:
name: "state"
value: "Running"
# The actions will be executed in the order specified
actions:
- checkPodCount:
count: 3

68
scenarios/post_action_regex.py Executable file
View File

@@ -0,0 +1,68 @@
#!/usr/bin/env python3
import subprocess
import re
import sys
from kubernetes import client, config
from kubernetes.client.rest import ApiException
import logging
# List all namespaces
def list_namespaces():
    """Return the names of every namespace in the cluster.

    Loads kubeconfig from the default location. Re-raises ApiException after
    logging it; the original logged and fell through to `ret.items` with
    `ret` undefined, masking the real error with a NameError.
    """
    namespaces = []
    try:
        config.load_kube_config()
        cli = client.CoreV1Api()
        ret = cli.list_namespace(pretty=True)
    except ApiException as e:
        logging.error("Exception when calling \
                       CoreV1Api->list_namespaced_pod: %s\n" % e)
        raise
    for namespace in ret.items:
        namespaces.append(namespace.metadata.name)
    return namespaces
# Check if all the watch_namespaces are valid
def check_namespaces(namespaces):
    """Resolve a mix of literal names and regexes to real cluster namespaces.

    Entries that are not exact namespace names are treated as regexes and
    matched against the cluster's namespaces. Exits the process if any entry
    matches nothing.
    """
    try:
        existing = list_namespaces()
        # Entries not present verbatim are interpreted as regex patterns.
        patterns = set(namespaces) - set(existing)
        matched = set(namespaces) - patterns
        used_patterns = set()
        for candidate in existing:
            for pattern in patterns:
                if re.search(pattern, candidate):
                    matched.add(candidate)
                    used_patterns.add(pattern)
                    break
        unused = patterns - used_patterns
        if unused:
            raise Exception("There exists no namespaces matching: %s" % (unused))
        return list(matched)
    except Exception as e:
        logging.error("%s" % (e))
        sys.exit(1)
def run(cmd):
    """Run *cmd* in a shell and return its combined stdout/stderr text.

    On failure the error is logged and an empty string is returned; the
    original fell through to `return out` with `out` undefined (NameError).
    """
    out = ""
    try:
        proc = subprocess.Popen(cmd, shell=True,
                                universal_newlines=True, stdout=subprocess.PIPE,
                                stderr=subprocess.STDOUT)
        (out, err) = proc.communicate()
    except Exception as e:
        logging.error("Failed to run %s, error: %s" % (cmd, e))
    return out
# Total the Running pods across every namespace matching openshift-.* and
# print the count; run_kraken compares this output before/after the scenario.
namespace_patterns = ["openshift-.*"]
matched_namespaces = check_namespaces(namespace_patterns)
total_running = 0
for ns in matched_namespaces:
    count_output = run("oc get pods -n " + ns + " | grep -c Running").rstrip()
    try:
        total_running += int(count_output)
    except Exception:
        # Non-numeric output (e.g. an oc error) — skip this namespace.
        continue
print(total_running)

11
scenarios/post_action_regex.sh Executable file
View File

@@ -0,0 +1,11 @@
#!/bin/bash
# Post action check: verify the openshift-etcd namespace still has 3 Running
# pods. Prints the count, then a pass message or an ERROR line.
pods="$(oc get pods -n openshift-etcd | grep -c Running)"
echo "$pods"
if [ "$pods" -eq 3 ]
then
    echo "Pods Pass"
else
    # need capital error for proper error catching in run_kraken
    # Fixed typo in the message: "doesnt" -> "doesn't".
    echo "ERROR pod count $pods doesn't match 3 expected pods"
fi

View File

@@ -0,0 +1,18 @@
config:
runStrategy:
runs: 1
maxSecondsBetweenRuns: 30
minSecondsBetweenRuns: 1
scenarios:
- name: kill up to 3 pods in any openshift namespace
steps:
- podAction:
matches:
- namespace: "openshift-.*"
filters:
- property:
name: "state"
value: "Running"
actions:
- checkPodCount:
count: 146

0
scenarios/regex_openshift_pod_kill.yml Normal file → Executable file
View File