From 22fcab57f59ff78a0856fd940d9caededf0af5f1 Mon Sep 17 00:00:00 2001
From: Paige Rubendall
Date: Thu, 19 Aug 2021 15:47:16 -0400
Subject: [PATCH] container checking in pod

---
 docs/container_scenarios.md             | 21 +++++++++
 kraken/pod_scenarios/setup.py           | 57 ++++++++++++++++++++-----
 scenarios/container_etcd.yml            |  1 +
 scenarios/post_action_etcd_container.py | 29 +++++++++++++
 4 files changed, 98 insertions(+), 10 deletions(-)
 create mode 100755 scenarios/post_action_etcd_container.py

diff --git a/docs/container_scenarios.md b/docs/container_scenarios.md
index 1efd550a..0922cf81 100644
--- a/docs/container_scenarios.md
+++ b/docs/container_scenarios.md
@@ -14,4 +14,25 @@ scenarios:
     container_name: ""    # This is optional, can take out and will kill all containers in all pods found under namespace and label
     pod_names:  # This is optional, can take out and will select all pods with given namespace and label
       -
+    retry_wait:    # This is optional, wait time for the killed containers to become ready again (defaults to 120 seconds)
+```
+
+#### Post Action
+In all scenarios we run a post chaos check to wait for and verify that the specific component recovers.
+
+There are two options:
+1. Pass a custom script in the main config scenario list that will run before the chaos and verify that its output matches the post chaos output.
+
+See [scenarios/post_action_etcd_container.py](https://github.com/cloud-bulldozer/kraken/tree/master/scenarios/post_action_etcd_container.py) for an example.
+```
+-   container_scenarios:                                 # List of chaos pod scenarios to load
+    - -    scenarios/container_etcd.yml
+      -    scenarios/post_action_etcd_container.py
+```
+
+2. Allow kraken to wait and check that the killed containers become ready again. Kraken keeps a list of the specific
+containers that were killed, as well as their namespaces and pods, to verify that all affected containers recover properly.
+
+```
+retry_wait: 120    # seconds to wait for the killed containers to become ready again
+```
diff --git a/kraken/pod_scenarios/setup.py b/kraken/pod_scenarios/setup.py
index 40ae9e01..7eceb3ac 100644
--- a/kraken/pod_scenarios/setup.py
+++ b/kraken/pod_scenarios/setup.py
@@ -56,21 +56,33 @@ def run(kubeconfig_path, scenarios_list, config, failed_post_scenarios, wait_dur

 def container_run(kubeconfig_path, scenarios_list, config, failed_post_scenarios, wait_duration):
     for container_scenario_config in scenarios_list:
+        if len(container_scenario_config) > 1:
+            pre_action_output = post_actions.run(kubeconfig_path, container_scenario_config[1])
+        else:
+            pre_action_output = ""
         with open(container_scenario_config[0], "r") as f:
             cont_scenario_config = yaml.full_load(f)
             for cont_scenario in cont_scenario_config["scenarios"]:
-                if len(container_scenario_config) > 1:
-                    pre_action_output = post_actions.run(kubeconfig_path, container_scenario_config[1])
-                else:
-                    pre_action_output = ""
                 # capture start time
                 start_time = int(time.time())
-                container_killing_in_pod(cont_scenario)
+                killed_containers = container_killing_in_pod(cont_scenario)
+
+                if len(container_scenario_config) > 1:
+                    try:
+                        failed_post_scenarios = post_actions.check_recovery(
+                            kubeconfig_path, container_scenario_config, failed_post_scenarios, pre_action_output
+                        )
+                    except Exception as e:
+                        logging.error("Failed to run post action checks: %s" % e)
+                        sys.exit(1)
+                else:
+                    failed_post_scenarios = check_failed_containers(
+                        killed_containers, cont_scenario.get("retry_wait", 120)
+                    )
+
                 logging.info("Waiting for the specified duration: %s" % (wait_duration))
                 time.sleep(wait_duration)
-                failed_post_scenarios = post_actions.check_recovery(
-                    kubeconfig_path, container_scenario_config, failed_post_scenarios, pre_action_output
-                )
+
                 # capture end time
                 end_time = int(time.time())
@@ -107,7 +119,6 @@ def container_killing_in_pod(cont_scenario):
     container_pod_list = []
     for pod in pods:
         if type(pod) == list:
-
             container_names = runcommand.invoke(
                 'oc get pods %s -n %s -o jsonpath="{.spec.containers[*].name}"' % (pod[0], pod[1])
             ).split(" ")
@@ -119,7 +130,7 @@ def container_killing_in_pod(cont_scenario):
             container_pod_list.append([pod, namespace, container_names])

     killed_count = 0
-
+    killed_container_list = []
     while killed_count < kill_count:
         if len(container_pod_list) == 0:
             logging.error("Trying to kill more containers than were found, try lowering kill count")
@@ -129,14 +140,17 @@ def container_killing_in_pod(cont_scenario):
         for c_name in selected_container_pod[2]:
             if container_name != "":
                 if c_name == container_name:
+                    killed_container_list.append([selected_container_pod[0], selected_container_pod[1], c_name])
                     retry_container_killing(kill_action, selected_container_pod[0], selected_container_pod[1], c_name)
                     break
             else:
+                killed_container_list.append([selected_container_pod[0], selected_container_pod[1], c_name])
                 retry_container_killing(kill_action, selected_container_pod[0], selected_container_pod[1], c_name)
                 break
         container_pod_list.remove(selected_container_pod)
         killed_count += 1
     logging.info("Scenario " + scenario_name + " successfully injected")
+    return killed_container_list


 def retry_container_killing(kill_action, podname, namespace, container_name):
@@ -153,3 +167,26 @@ def retry_container_killing(kill_action, podname, namespace, container_name):
             continue
         else:
             continue
+
+
+def check_failed_containers(killed_container_list, wait_time):
+
+    timer = 0
+    while timer <= wait_time:
+        container_ready = []
+        for killed_container in killed_container_list:
+            # killed_container is [pod_name, namespace, container_name]
+            pod_output = runcommand.invoke("oc get pods %s -n %s -o yaml" % (killed_container[0], killed_container[1]))
+            pod_output_yaml = yaml.full_load(pod_output)
+            for statuses in pod_output_yaml["status"]["containerStatuses"]:
+                if statuses["name"] == killed_container[2]:
+                    if str(statuses["ready"]).lower() == "true":
+                        container_ready.append(killed_container)
+        for item in container_ready:
+            killed_container_list.remove(item)
+        if len(killed_container_list) == 0:
+            return []
+        timer += 5
+        logging.info("Waiting 5 seconds for containers to become ready")
+        time.sleep(5)
+    return killed_container_list
diff --git a/scenarios/container_etcd.yml b/scenarios/container_etcd.yml
index c896f658..bf93833e 100755
--- a/scenarios/container_etcd.yml
+++ b/scenarios/container_etcd.yml
@@ -5,3 +5,4 @@ scenarios:
   container_name: "etcd"
   action: "kill 1"
   count: 1
+  retry_wait: 60
diff --git a/scenarios/post_action_etcd_container.py b/scenarios/post_action_etcd_container.py
new file mode 100755
index 00000000..ff39723f
--- /dev/null
+++ b/scenarios/post_action_etcd_container.py
@@ -0,0 +1,29 @@
+#!/usr/bin/env python3
+import subprocess
+import logging
+import time
+
+
+def run(cmd):
+    try:
+        output = subprocess.Popen(
+            cmd, shell=True, universal_newlines=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT
+        )
+        (out, err) = output.communicate()
+    except Exception as e:
+        logging.error("Failed to run %s, error: %s" % (cmd, e))
+        return ""
+    return out
+
+
+i = 0
+while i < 100:
+    pods_running = run("oc get pods -n openshift-etcd -l app=etcd | grep -c '4/4'").rstrip()
+    if pods_running == "3":
+        break
+    time.sleep(5)
+    i += 1
+
+if pods_running == "3":
+    print("There were 3 pods running properly")
+else:
+    print("ERROR there were " + str(pods_running) + " pods running instead of 3")
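
Note: the recovery check added in `check_failed_containers` above can also be exercised on its own. The sketch below is a minimal, standalone version of the same readiness poll under stated assumptions: `oc` is on the PATH, PyYAML is installed, and the helper names (`pod_yaml`, `wait_for_ready`) and the example pod name `etcd-master-0` are illustrative only; it is not part of the patch or of kraken's API.

```
#!/usr/bin/env python3
# Standalone sketch of the readiness poll that check_failed_containers performs.
# Assumes `oc` is on the PATH and PyYAML is installed; names below are illustrative.
import subprocess
import time

import yaml


def pod_yaml(pod, namespace):
    # Fetch the pod definition the same way the scenario code does: `oc get pods <pod> -n <ns> -o yaml`.
    result = subprocess.run(
        ["oc", "get", "pods", pod, "-n", namespace, "-o", "yaml"],
        capture_output=True, text=True, check=True,
    )
    return yaml.safe_load(result.stdout)


def wait_for_ready(killed_containers, wait_time=120, poll=5):
    # killed_containers holds [pod_name, namespace, container_name] triples,
    # mirroring the list returned by container_killing_in_pod in the patch.
    deadline = time.time() + wait_time
    pending = [list(entry) for entry in killed_containers]
    while pending and time.time() < deadline:
        still_pending = []
        for pod, namespace, container in pending:
            statuses = pod_yaml(pod, namespace)["status"].get("containerStatuses", [])
            ready = any(s["name"] == container and s.get("ready") for s in statuses)
            if not ready:
                still_pending.append([pod, namespace, container])
        pending = still_pending
        if pending:
            time.sleep(poll)
    # An empty list means every killed container reported ready again in time.
    return pending


if __name__ == "__main__":
    # Hypothetical pod name; on a real cluster use a pod that exists in openshift-etcd.
    print(wait_for_ready([["etcd-master-0", "openshift-etcd", "etcd"]], wait_time=60))
```

Unlike the in-tree version, this sketch tracks a deadline instead of a 5-second counter, but the observable behavior is the same: it returns whichever containers never became ready within the wait time.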