From 0fc82090f2c564f8b513aa3cbd9f321d11069107 Mon Sep 17 00:00:00 2001
From: prubenda <prubenda@redhat.com>
Date: Wed, 10 Jun 2020 09:09:28 -0400
Subject: [PATCH] Adding watch to see if components recovered

---
 config/config.yaml                            |  14 +-
 kraken/invoke/command.py                      |  12 +-
 kraken/kubernetes/client.py                   |   1 +
 run_kraken.py                                 | 137 +++++++++++++++---
 scenarios/etcd.yml                            |   3 -
 scenarios/openshift-apiserver.yml             |   0
 scenarios/openshift-kube-apiserver.yml        |   1 -
 scenarios/post_action_etcd.yml                |  21 +++
 scenarios/post_action_etcd_example.sh         |   3 +
 scenarios/post_action_etcd_example_py.py      |  23 +++
 scenarios/post_action_openshift-apiserver.yml |  23 +++
 .../post_action_openshift-kube-apiserver.yml  |  21 +++
 scenarios/post_action_regex.py                |  68 +++++++++
 scenarios/post_action_regex.sh                |  11 ++
 .../post_action_regex_openshift_pod_kill.yml  |  18 +++
 scenarios/regex_openshift_pod_kill.yml        |   0
 16 files changed, 321 insertions(+), 35 deletions(-)
 mode change 100644 => 100755 scenarios/etcd.yml
 mode change 100644 => 100755 scenarios/openshift-apiserver.yml
 mode change 100644 => 100755 scenarios/openshift-kube-apiserver.yml
 create mode 100755 scenarios/post_action_etcd.yml
 create mode 100755 scenarios/post_action_etcd_example.sh
 create mode 100755 scenarios/post_action_etcd_example_py.py
 create mode 100755 scenarios/post_action_openshift-apiserver.yml
 create mode 100755 scenarios/post_action_openshift-kube-apiserver.yml
 create mode 100755 scenarios/post_action_regex.py
 create mode 100755 scenarios/post_action_regex.sh
 create mode 100755 scenarios/post_action_regex_openshift_pod_kill.yml
 mode change 100644 => 100755 scenarios/regex_openshift_pod_kill.yml

diff --git a/config/config.yaml b/config/config.yaml
index 8ee29990..38528810 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -1,11 +1,15 @@
 kraken:
     kubeconfig_path: /root/.kube/config                    # Path to kubeconfig
+    exit_on_failure: False                                 # Exit when a post action scenario fails
     scenarios:                                             # List of policies/chaos scenarios to load
-        -    scenarios/etcd.yml
-        -    scenarios/openshift-kube-apiserver.yml
-        -    scenarios/openshift-apiserver.yml
-        -    scenarios/regex_openshift_pod_kill.yml
-
+        - -    scenarios/etcd.yml
+          -    scenarios/post_action_etcd_example.sh
+        - -    scenarios/openshift-kube-apiserver.yml
+          -    scenarios/post_action_openshift-kube-apiserver.yml
+        - -    scenarios/openshift-apiserver.yml
+          -    scenarios/post_action_openshift-apiserver.yml
+        - -    scenarios/regex_openshift_pod_kill.yml
+          -    scenarios/post_action_regex.py
 cerberus:
     cerberus_enabled: False                                # Enable it when cerberus is previously installed
     cerberus_url:                                          # When cerberus_enabled is set to True, provide the url where cerberus publishes go/no-go signal
diff --git a/kraken/invoke/command.py b/kraken/invoke/command.py
index 152875ff..c624a9e1 100644
--- a/kraken/invoke/command.py
+++ b/kraken/invoke/command.py
@@ -5,8 +5,10 @@ import logging
 # Invokes a given command and returns the stdout
 def invoke(command):
+    out = ""  # returned as-is when the command cannot be started at all
     try:
-        output = subprocess.check_output(command, shell=True,
-                                         universal_newlines=True)
-    except Exception:
-        logging.error("Failed to run %s" % (command))
-    return output
+        process = subprocess.Popen(command, shell=True, universal_newlines=True,
+                                   stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
+        out, _ = process.communicate()  # stderr is folded into stdout above
+    except Exception as e:
+        logging.error("Failed to run %s, error: %s" % (command, e))
+    return out
diff --git a/kraken/kubernetes/client.py b/kraken/kubernetes/client.py
index 2a180843..2bc805ed 100644
--- a/kraken/kubernetes/client.py
+++ b/kraken/kubernetes/client.py
@@ -4,6 +4,7 @@ import logging
 import kraken.invoke.command as runcommand
 import json
 
+
 kraken_node_name = ""
 
 
diff --git a/run_kraken.py b/run_kraken.py
index 9be279c6..12eee900 100644
--- a/run_kraken.py
+++ b/run_kraken.py
@@ -12,6 +12,109 @@ import kraken.invoke.command as runcommand
 import pyfiglet
 
 
+# Get cerberus status: True when Cerberus is disabled or reports go; exits on no-go
+def cerberus_integration(config):
+    cerberus_status = True
+    if config["cerberus"]["cerberus_enabled"]:
+        cerberus_url = config["cerberus"]["cerberus_url"]
+        if not cerberus_url:
+            logging.error("url where Cerberus publishes True/False signal is not provided.")
+            sys.exit(1)
+        # Any response body other than the bytes b'True' is treated as a no-go
+        cerberus_status = requests.get(cerberus_url).content == b'True'
+        if not cerberus_status:
+            logging.error("Received a no-go signal from Cerberus, looks like "
+                          "the cluster is unhealthy. Please check the Cerberus "
+                          "report for more details. Test failed.")
+            sys.exit(1)
+        else:
+            logging.info("Received a go signal from Cerberus, the cluster is healthy. "
+                         "Test passed.")
+    return cerberus_status
+
+
+# Report post action scenarios that are still failing, alongside the Cerberus status
+def publish_kraken_status(config, failed_post_scenarios):
+    cerberus_healthy = cerberus_integration(config)
+    # Nothing is logged while every post action scenario passes
+    if not failed_post_scenarios:
+        return
+    # exit_on_failure turns a remaining failure into a non-zero exit of the run
+    exit_on_failure = config['kraken']['exit_on_failure']
+    if not cerberus_healthy:
+        if exit_on_failure:
+            logging.info("Cerberus status is not healthy and post action scenarios "
+                         "are still failing, exiting kraken run")
+            sys.exit(1)
+        logging.info("Cerberus status is not healthy and post action scenarios "
+                     "are still failing")
+    else:
+        if exit_on_failure:
+            logging.info("Cerberus status is healthy but post action scenarios "
+                         "are still failing, exiting kraken run")
+            sys.exit(1)
+        logging.info("Cerberus status is healthy but post action scenarios "
+                     "are still failing")
+
+
+def run_post_action(kubeconfig_path, scenario, pre_action_output=""):
+    """Run a check policy or script and report whether the check passed.
+
+    kubeconfig_path is only handed to powerfulseal for .yaml/.yml policies.
+    pre_action_output is the output captured before chaos was injected; it is
+    empty (the default) when this call is that baseline run itself.
+
+    Returns the command output, or False when a policy run prints ERROR or a
+    script's output differs from the baseline.
+    """
+    if scenario.endswith((".yaml", ".yml")):
+        action_output = runcommand.invoke("powerfulseal autonomous "
+                                          "--use-pod-delete-instead-of-ssh-kill"
+                                          " --policy-file %s --kubeconfig %s --no-cloud"
+                                          " --inventory-kubernetes --headless"
+                                          % (scenario, kubeconfig_path))
+        # read output to make sure no error
+        if "ERROR" in action_output:
+            # Surface the first reported error instead of discarding it
+            logging.error(action_output.split("ERROR")[1].split('\n')[0].strip())
+            if not pre_action_output:
+                logging.info("Powerful seal pre action check failed for " + str(scenario))
+            return False
+        logging.info(scenario + " post action checks passed")
+        return action_output
+
+    # Python checks run through python3; anything else is invoked as a custom script
+    command = "python3 " + scenario if scenario.endswith(".py") else scenario
+    action_output = runcommand.invoke(command).strip()
+    # With a baseline available, the check passes only when the output is unchanged
+    if pre_action_output:
+        if pre_action_output != action_output:
+            logging.info(scenario + ' post action response did not match pre check output')
+            return False
+        logging.info(scenario + " post action checks passed")
+    return action_output
+
+
+# Re-check previously failed post actions, then run this scenario's post action
+def post_actions(kubeconfig_path, scenario, failed_post_scenarios, pre_action_output):
+    # Loop over a copy: removing from the list being iterated would skip entries
+    for failed_scenario in list(failed_post_scenarios):
+        post_action_output = run_post_action(kubeconfig_path,
+                                             failed_scenario[0], failed_scenario[1])
+        if post_action_output is not False:
+            failed_post_scenarios.remove(failed_scenario)
+        else:
+            logging.info('Post action scenario ' + str(failed_scenario) + " is still failing")
+
+    # check post actions
+    if len(scenario) > 1:
+        post_action_output = run_post_action(kubeconfig_path, scenario[1], pre_action_output)
+        if post_action_output is False:
+            failed_post_scenarios.append([scenario[1], pre_action_output])
+
+    return failed_post_scenarios
+
+
 # Main function
 def main(cfg):
     # Start kraken
@@ -24,7 +127,6 @@ def main(cfg):
             config = yaml.full_load(f)
         kubeconfig_path = config["kraken"]["kubeconfig_path"]
         scenarios = config["kraken"]["scenarios"]
-        cerberus_enabled = config["cerberus"]["cerberus_enabled"]
         wait_duration = config["tunings"]["wait_duration"]
         iterations = config["tunings"]["iterations"]
         daemon_mode = config["tunings"]['daemon_mode']
@@ -59,41 +161,34 @@ def main(cfg):
                          % str(iterations))
             iterations = int(iterations)
 
+        failed_post_scenarios = []
         # Loop to run the chaos starts here
         while (int(iteration) < iterations):
             # Inject chaos scenarios specified in the config
+            logging.info("Executing scenarios for iteration " + str(iteration))
             try:
                 # Loop to run the scenarios starts here
                 for scenario in scenarios:
-                    logging.info("Injecting scenario: %s" % (scenario))
+                    pre_action_output = run_post_action(kubeconfig_path, scenario[1])
                     runcommand.invoke("powerfulseal autonomous --use-pod-delete-instead-of-ssh-kill"
                                       " --policy-file %s --kubeconfig %s --no-cloud"
                                       " --inventory-kubernetes --headless"
-                                      % (scenario, kubeconfig_path))
-                    logging.info("Scenario: %s has been successfully injected!" % (scenario))
+                                      % (scenario[0], kubeconfig_path))
 
-                    if cerberus_enabled:
-                        cerberus_url = config["cerberus"]["cerberus_url"]
-                        if not cerberus_url:
-                            logging.error("url where Cerberus publishes True/False signal "
-                                          "is not provided.")
-                            sys.exit(1)
-                        cerberus_status = requests.get(cerberus_url).content
-                        cerberus_status = True if cerberus_status == b'True' else False
-                        if not cerberus_status:
-                            logging.error("Received a no-go signal from Cerberus, looks like the"
-                                          " cluster is unhealthy. Please check the Cerberus report"
-                                          " for more details. Test failed.")
-                            sys.exit(1)
-                        else:
-                            logging.info("Received a go signal from Ceberus, the cluster is "
-                                         "healthy. Test passed.")
+                    logging.info("Scenario: %s has been successfully injected!" % (scenario[0]))
                     logging.info("Waiting for the specified duration: %s" % (wait_duration))
                     time.sleep(wait_duration)
+                    failed_post_scenarios = post_actions(kubeconfig_path, scenario,
+                                                         failed_post_scenarios, pre_action_output)
+                    publish_kraken_status(config, failed_post_scenarios)
             except Exception as e:
                 logging.error("Failed to run scenario: %s. Encountered the following exception: %s"
-                              % (scenario, e))
+                              % (scenario[0], e))
             iteration += 1
+            logging.info("")
+        if failed_post_scenarios:
+            logging.error("Post scenarios are still failing at the end of all iterations")
+            sys.exit(1)
     else:
         logging.error("Cannot find a config at %s, please check" % (cfg))
         sys.exit(1)
diff --git a/scenarios/etcd.yml b/scenarios/etcd.yml
old mode 100644
new mode 100755
index 2eb303ae..a0e579b4
--- a/scenarios/etcd.yml
+++ b/scenarios/etcd.yml
@@ -11,12 +11,9 @@ scenarios:
           - labels:
               namespace: "openshift-etcd"
               selector: "k8s-app=etcd"
-
         filters:
           - randomSample:
               size: 1
-
-        # The actions will be executed in the order specified
         actions:
           - kill:
               probability: 1
diff --git a/scenarios/openshift-apiserver.yml b/scenarios/openshift-apiserver.yml
old mode 100644
new mode 100755
diff --git a/scenarios/openshift-kube-apiserver.yml b/scenarios/openshift-kube-apiserver.yml
old mode 100644
new mode 100755
index 7bd4039c..94c72c5d
--- a/scenarios/openshift-kube-apiserver.yml
+++ b/scenarios/openshift-kube-apiserver.yml
@@ -11,7 +11,6 @@ scenarios:
           - labels:
               namespace: "openshift-kube-apiserver"
               selector: "app=openshift-kube-apiserver"
-
         filters:
           - randomSample:
               size: 1
diff --git a/scenarios/post_action_etcd.yml b/scenarios/post_action_etcd.yml
new file mode 100755
index 00000000..6a472e26
--- /dev/null
+++ b/scenarios/post_action_etcd.yml
@@ -0,0 +1,21 @@
+config:
+  runStrategy:
+    runs: 1
+    maxSecondsBetweenRuns: 10
+    minSecondsBetweenRuns: 1
+scenarios:
+  - name: "check 3 pods are in namespace with selector: etcd"
+    steps:
+    - podAction:
+        matches:
+          - labels:
+              namespace: "openshift-etcd"
+              selector: "k8s-app=etcd"
+        filters:
+          - property:
+              name: "state"
+              value: "Running"
+        # The actions will be executed in the order specified; the only action here is checkPodCount with count 3
+        actions:
+          - checkPodCount:
+              count: 3
\ No newline at end of file
diff --git a/scenarios/post_action_etcd_example.sh b/scenarios/post_action_etcd_example.sh
new file mode 100755
index 00000000..f122291b
--- /dev/null
+++ b/scenarios/post_action_etcd_example.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+running_pods=$(oc get pods -n openshift-etcd | grep -c Running)  # count of output lines containing "Running"
+printf '%s\n' "$running_pods"
diff --git a/scenarios/post_action_etcd_example_py.py b/scenarios/post_action_etcd_example_py.py
new file mode 100755
index 00000000..1d840625
--- /dev/null
+++ b/scenarios/post_action_etcd_example_py.py
@@ -0,0 +1,23 @@
+#!/usr/bin/env python3
+import subprocess
+import logging
+
+
+def run(cmd):
+    out = ""  # fallback so a failed launch returns an empty string instead of raising
+    try:
+        process = subprocess.Popen(cmd, shell=True, universal_newlines=True,
+                                   stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
+        out, _ = process.communicate()  # stderr is folded into stdout above
+        logging.info("out " + str(out))
+    except Exception as e:
+        logging.error("Failed to run %s, error: %s" % (cmd, e))
+    return out
+
+
+# Count the openshift-etcd pod lines containing "Running"; 3 is the expected count
+running_count = run("oc get pods -n openshift-etcd | grep -c Running").rstrip()
+if running_count != "3":
+    print("ERROR there were " + str(running_count) + " pods running instead of 3")
+else:
+    print("There were 3 pods running properly")
diff --git a/scenarios/post_action_openshift-apiserver.yml b/scenarios/post_action_openshift-apiserver.yml
new file mode 100755
index 00000000..938b74c2
--- /dev/null
+++ b/scenarios/post_action_openshift-apiserver.yml
@@ -0,0 +1,23 @@
+config:
+  runStrategy:
+    runs: 1
+    maxSecondsBetweenRuns: 30
+    minSecondsBetweenRuns: 1
+scenarios:
+  - name: "check 3 pods are in namespace with selector: openshift-apiserver"
+    steps:
+    - podAction:
+        matches:
+          - labels:
+              namespace: "openshift-apiserver"
+              selector: "app=openshift-apiserver"
+
+        filters:
+          - property:
+              name: "state"
+              value: "Running"
+
+        # The actions will be executed in the order specified; the only action here is checkPodCount with count 3
+        actions:
+          - checkPodCount:
+              count: 3
diff --git a/scenarios/post_action_openshift-kube-apiserver.yml b/scenarios/post_action_openshift-kube-apiserver.yml
new file mode 100755
index 00000000..7487661b
--- /dev/null
+++ b/scenarios/post_action_openshift-kube-apiserver.yml
@@ -0,0 +1,21 @@
+config:
+  runStrategy:
+    runs: 1
+    maxSecondsBetweenRuns: 30
+    minSecondsBetweenRuns: 1
+scenarios:
+  - name: "check 3 pods are in namespace with selector: openshift-kube-apiserver"
+    steps:
+    - podAction:
+        matches:
+          - labels:
+              namespace: "openshift-kube-apiserver"
+              selector: "app=openshift-kube-apiserver"
+        filters:
+          - property:
+              name: "state"
+              value: "Running"
+        # The actions will be executed in the order specified; the only action here is checkPodCount with count 3
+        actions:
+          - checkPodCount:
+              count: 3
diff --git a/scenarios/post_action_regex.py b/scenarios/post_action_regex.py
new file mode 100755
index 00000000..ce12ab8c
--- /dev/null
+++ b/scenarios/post_action_regex.py
@@ -0,0 +1,68 @@
+#!/usr/bin/env python3
+import subprocess
+import re
+import sys
+from kubernetes import client, config
+from kubernetes.client.rest import ApiException
+import logging
+
+
+# List the names of all namespaces returned by the cluster API;
+# returns an empty list when the API call fails
+def list_namespaces():
+    try:
+        config.load_kube_config()
+        cli = client.CoreV1Api()
+        ret = cli.list_namespace(pretty=True)
+    except ApiException as e:
+        # The call made above is list_namespace, so name it correctly in the log
+        logging.error("Exception when calling CoreV1Api->list_namespace: %s\n" % e)
+        # No response object exists on failure, so there is nothing to iterate
+        return []
+    return [namespace.metadata.name for namespace in ret.items]
+
+
+# Expand the requested names/regexes into existing namespaces; exit(1) if any match nothing
+def check_namespaces(namespaces):
+    try:
+        existing = list_namespaces()
+        # Requested names that are not literal namespaces are tried as regexes
+        patterns = set(namespaces) - set(existing)
+        resolved = set(namespaces) - patterns
+        matched_patterns = set()
+        for name in existing:
+            # Only the first pattern matching a namespace is recorded as valid
+            hit = next((p for p in patterns if re.search(p, name)), None)
+            if hit is not None:
+                resolved.add(name)
+                matched_patterns.add(hit)
+        unmatched = patterns - matched_patterns
+        if unmatched:
+            raise Exception("There exists no namespaces matching: %s" % (unmatched))
+        return list(resolved)
+    except Exception as e:
+        logging.error("%s" % (e))
+        sys.exit(1)
+
+
+def run(cmd):
+    out = ""  # fallback so a failed launch returns an empty string instead of raising
+    try:
+        process = subprocess.Popen(cmd, shell=True, universal_newlines=True,
+                                   stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
+        out, _ = process.communicate()  # stderr is folded into stdout above
+    except Exception as e:
+        logging.error("Failed to run %s, error: %s" % (cmd, e))
+    return out
+
+
+# Sum the Running pod counts over every namespace matching the pattern below
+regex_namespace = ["openshift-.*"]
+pods_running = 0
+for namespace in check_namespaces(regex_namespace):
+    count_text = run("oc get pods -n " + namespace + " | grep -c Running").rstrip()
+    try:
+        pods_running += int(count_text)
+    except Exception:
+        continue  # output that is not a bare integer is skipped, not fatal
+print(pods_running)
diff --git a/scenarios/post_action_regex.sh b/scenarios/post_action_regex.sh
new file mode 100755
index 00000000..10626cc0
--- /dev/null
+++ b/scenarios/post_action_regex.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+pods="$(oc get pods -n openshift-etcd | grep -c Running)"
+echo "$pods"
+# Expect exactly 3 Running lines for openshift-etcd; report pass or ERROR below
+if [ "$pods" -eq 3 ]
+then
+  echo "Pods Pass"
+else
+  # Keep ERROR upper-case for error catching in run_kraken (NOTE(review): confirm it scans script output)
+  echo "ERROR pod count $pods doesnt match 3 expected pods"
+fi
diff --git a/scenarios/post_action_regex_openshift_pod_kill.yml b/scenarios/post_action_regex_openshift_pod_kill.yml
new file mode 100755
index 00000000..ba011b75
--- /dev/null
+++ b/scenarios/post_action_regex_openshift_pod_kill.yml
@@ -0,0 +1,18 @@
+config:
+  runStrategy:
+    runs: 1
+    maxSecondsBetweenRuns: 30
+    minSecondsBetweenRuns: 1
+scenarios:
+  - name: "check 146 pods are running in namespaces matching openshift-.*"
+    steps:
+    - podAction:
+        matches:
+          - namespace: "openshift-.*"
+        filters:
+          - property:
+              name: "state"
+              value: "Running"
+        actions:
+          - checkPodCount:
+              count: 146  # NOTE(review): presumably the pod total of one specific cluster — confirm per target
diff --git a/scenarios/regex_openshift_pod_kill.yml b/scenarios/regex_openshift_pod_kill.yml
old mode 100644
new mode 100755