container checking in pod

This commit is contained in:
Paige Rubendall
2021-08-19 15:47:16 -04:00
committed by Naga Ravi Chaitanya Elluri
parent 07ccfbf0aa
commit 22fcab57f5
4 changed files with 98 additions and 10 deletions

View File

@@ -14,4 +14,25 @@ scenarios:
container_name: "<specific container name>" # This is optional, can take out and will kill all containers in all pods found under namespace and label
pod_names: # This is optional, can take out and will select all pods with given namespace and label
- <pod_name>
retry_wait: <number of seconds to wait for the container to be running again> (defaults to 120 seconds)
```
#### Post Action
In all scenarios we do a post chaos check to wait for and verify that the specific component recovers.
Here there are two options:
1. Pass a custom script in the main config scenario list, that will run before the chaos and verify the output matches post chaos scenario
See [scenarios/post_action_etcd_container.py](https://github.com/cloud-bulldozer/kraken/tree/master/scenarios/post_action_etcd_container.py) for an example
```
- container_scenarios: # List of chaos pod scenarios to load
- - scenarios/container_etcd.yml
- scenarios/post_action_etcd_container.py
```
2. Allow kraken to wait and check the killed containers become ready again. Kraken keeps a list of the specific
containers that were killed as well as the namespaces and pods to verify all containers that were affected recover properly
```
retry_wait: <seconds to wait for container to recover>
```

View File

@@ -56,21 +56,33 @@ def run(kubeconfig_path, scenarios_list, config, failed_post_scenarios, wait_dur
def container_run(kubeconfig_path, scenarios_list, config, failed_post_scenarios, wait_duration):
for container_scenario_config in scenarios_list:
if len(container_scenario_config) > 1:
pre_action_output = post_actions.run(kubeconfig_path, container_scenario_config[1])
else:
pre_action_output = ""
with open(container_scenario_config[0], "r") as f:
cont_scenario_config = yaml.full_load(f)
for cont_scenario in cont_scenario_config["scenarios"]:
if len(container_scenario_config) > 1:
pre_action_output = post_actions.run(kubeconfig_path, container_scenario_config[1])
else:
pre_action_output = ""
# capture start time
start_time = int(time.time())
container_killing_in_pod(cont_scenario)
killed_containers = container_killing_in_pod(cont_scenario)
if len(container_scenario_config) > 1:
try:
failed_post_scenarios = post_actions.check_recovery(
kubeconfig_path, container_scenario_config, failed_post_scenarios, pre_action_output
)
except Exception as e:
logging.error("Failed to run post action checks: %s" % e)
sys.exit(1)
else:
failed_post_scenarios = check_failed_containers(
killed_containers, cont_scenario.get("retry_wait", 120)
)
logging.info("Waiting for the specified duration: %s" % (wait_duration))
time.sleep(wait_duration)
failed_post_scenarios = post_actions.check_recovery(
kubeconfig_path, container_scenario_config, failed_post_scenarios, pre_action_output
)
# capture end time
end_time = int(time.time())
@@ -107,7 +119,6 @@ def container_killing_in_pod(cont_scenario):
container_pod_list = []
for pod in pods:
if type(pod) == list:
container_names = runcommand.invoke(
'oc get pods %s -n %s -o jsonpath="{.spec.containers[*].name}"' % (pod[0], pod[1])
).split(" ")
@@ -119,7 +130,7 @@ def container_killing_in_pod(cont_scenario):
container_pod_list.append([pod, namespace, container_names])
killed_count = 0
killed_container_list = []
while killed_count < kill_count:
if len(container_pod_list) == 0:
logging.error("Trying to kill more containers than were found, try lowering kill count")
@@ -129,14 +140,17 @@ def container_killing_in_pod(cont_scenario):
for c_name in selected_container_pod[2]:
if container_name != "":
if c_name == container_name:
killed_container_list.append([selected_container_pod[0], selected_container_pod[1], c_name])
retry_container_killing(kill_action, selected_container_pod[0], selected_container_pod[1], c_name)
break
else:
killed_container_list.append([selected_container_pod[0], selected_container_pod[1], c_name])
retry_container_killing(kill_action, selected_container_pod[0], selected_container_pod[1], c_name)
break
container_pod_list.remove(selected_container_pod)
killed_count += 1
logging.info("Scenario " + scenario_name + " successfully injected")
return killed_container_list
def retry_container_killing(kill_action, podname, namespace, container_name):
@@ -153,3 +167,26 @@ def retry_container_killing(kill_action, podname, namespace, container_name):
continue
else:
continue
def check_failed_containers(killed_container_list, wait_time):
    """Wait for killed containers to report ready again.

    Polls each entry in killed_container_list (entries are
    [pod_name, namespace, container_name]) every 5 seconds via
    ``oc get pods``, until either every container reports ready or
    wait_time seconds have elapsed.

    Returns the list of entries that never became ready (an empty list
    on full recovery).
    """
    # Work on a copy so the caller's list is not mutated in place.
    remaining = list(killed_container_list)
    timer = 0
    while timer <= wait_time:
        # Reset per poll iteration so an already-removed entry is never
        # removed twice.
        recovered = []
        for killed_container in remaining:
            # killed_container: [pod name, namespace, container name]
            pod_output = runcommand.invoke(
                "oc get pods %s -n %s -o yaml" % (killed_container[0], killed_container[1])
            )
            pod_output_yaml = yaml.full_load(pod_output)
            for statuses in pod_output_yaml["status"]["containerStatuses"]:
                if statuses["name"] == killed_container[2]:
                    if str(statuses["ready"]).lower() == "true":
                        recovered.append(killed_container)
        # NOTE: list.remove() returns None — never reassign from it
        # (the original did, which wiped the list after one recovery).
        for item in recovered:
            remaining.remove(item)
        if not remaining:
            return []
        timer += 5
        logging.info("Waiting 5 seconds for containers to become ready")
        time.sleep(5)
    return remaining

View File

@@ -5,3 +5,4 @@ scenarios:
container_name: "etcd"
action: "kill 1"
count: 1
retry_wait: 60

View File

@@ -0,0 +1,29 @@
#!/usr/bin/env python3
import subprocess
import logging
import time
def run(cmd):
    """Run cmd through a shell and return its combined stdout+stderr text.

    Returns an empty string if the command could not be launched, instead
    of raising UnboundLocalError as the original did when Popen failed
    before ``out`` was assigned.
    """
    out = ""
    try:
        process = subprocess.Popen(
            cmd, shell=True, universal_newlines=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT
        )
        (out, err) = process.communicate()
    except Exception as e:
        # Best-effort: log and fall through to return the empty default.
        logging.error("Failed to run %s, error: %s" % (cmd, e))
    return out
# Post-chaos check: poll the etcd pods in the openshift-etcd namespace
# until all three report every container running ("4/4"), retrying up to
# 100 times with a 5 second pause between attempts.
attempts = 0
pods_running = ""
while attempts < 100:
    pods_running = run("oc get pods -n openshift-etcd -l app=etcd | grep -c '4/4'").rstrip()
    if pods_running == "3":
        break
    time.sleep(5)
    attempts += 1
# Report the final state; post_actions compares this output pre/post chaos.
if pods_running == "3":
    print("There were 3 pods running properly")
else:
    print("ERROR there were " + str(pods_running) + " pods running instead of 3")