From 22fcab57f59ff78a0856fd940d9caededf0af5f1 Mon Sep 17 00:00:00 2001
From: Paige Rubendall
Date: Thu, 19 Aug 2021 15:47:16 -0400
Subject: [PATCH] container checking in pod

---
 docs/container_scenarios.md             | 21 +++++++++
 kraken/pod_scenarios/setup.py           | 57 ++++++++++++++++++++-----
 scenarios/container_etcd.yml            |  1 +
 scenarios/post_action_etcd_container.py | 29 +++++++++++++
 4 files changed, 98 insertions(+), 10 deletions(-)
 create mode 100755 scenarios/post_action_etcd_container.py

diff --git a/docs/container_scenarios.md b/docs/container_scenarios.md
index 1efd550a..0922cf81 100644
--- a/docs/container_scenarios.md
+++ b/docs/container_scenarios.md
@@ -14,4 +14,25 @@ scenarios:
     container_name: ""    # This is optional, can take out and will kill all containers in all pods found under namespace and label
     pod_names:  # This is optional, can take out and will select all pods with given namespace and label
       -
+    retry_wait:    # This is optional, wait time for the killed containers to become ready again (defaults to 120 seconds)
+```
+
+#### Post Action
+In all scenarios we run a post chaos check to wait for and verify that the specific component recovers.
+
+There are two options:
+1. Pass a custom script in the main config scenario list that will run before the chaos and verify that its output matches the post chaos output.
+
+See [scenarios/post_action_etcd_container.py](https://github.com/cloud-bulldozer/kraken/tree/master/scenarios/post_action_etcd_container.py) for an example.
+```
+-   container_scenarios:                                 # List of chaos pod scenarios to load
+    - -    scenarios/container_etcd.yml
+      -    scenarios/post_action_etcd_container.py
+```
+
+2. Allow kraken to wait and check that the killed containers become ready again. Kraken keeps a list of the specific
+containers that were killed, as well as their namespaces and pods, to verify that all affected containers recover properly.
+
+```
+retry_wait: 120    # seconds to wait for the killed containers to become ready again
+```
diff --git a/kraken/pod_scenarios/setup.py b/kraken/pod_scenarios/setup.py
index 40ae9e01..7eceb3ac 100644
--- a/kraken/pod_scenarios/setup.py
+++ b/kraken/pod_scenarios/setup.py
@@ -56,21 +56,33 @@ def run(kubeconfig_path, scenarios_list, config, failed_post_scenarios, wait_dur

 def container_run(kubeconfig_path, scenarios_list, config, failed_post_scenarios, wait_duration):
     for container_scenario_config in scenarios_list:
+        if len(container_scenario_config) > 1:
+            pre_action_output = post_actions.run(kubeconfig_path, container_scenario_config[1])
+        else:
+            pre_action_output = ""
         with open(container_scenario_config[0], "r") as f:
             cont_scenario_config = yaml.full_load(f)
             for cont_scenario in cont_scenario_config["scenarios"]:
-                if len(container_scenario_config) > 1:
-                    pre_action_output = post_actions.run(kubeconfig_path, container_scenario_config[1])
-                else:
-                    pre_action_output = ""
                 # capture start time
                 start_time = int(time.time())
-                container_killing_in_pod(cont_scenario)
+                killed_containers = container_killing_in_pod(cont_scenario)
+
+                if len(container_scenario_config) > 1:
+                    try:
+                        failed_post_scenarios = post_actions.check_recovery(
+                            kubeconfig_path, container_scenario_config, failed_post_scenarios, pre_action_output
+                        )
+                    except Exception as e:
+                        logging.error("Failed to run post action checks: %s" % e)
+                        sys.exit(1)
+                else:
+                    failed_post_scenarios = check_failed_containers(
+                        killed_containers, cont_scenario.get("retry_wait", 120)
+                    )
+
                 logging.info("Waiting for the specified duration: %s" % (wait_duration))
                 time.sleep(wait_duration)
-                failed_post_scenarios = post_actions.check_recovery(
-                    kubeconfig_path, container_scenario_config, failed_post_scenarios, pre_action_output
-                )
+
                 # capture end time
                 end_time = int(time.time())
@@ -107,7 +119,6 @@ def container_killing_in_pod(cont_scenario):
     container_pod_list = []
     for pod in pods:
         if type(pod) == list:
-
             container_names = runcommand.invoke(
                 'oc get pods %s -n %s -o jsonpath="{.spec.containers[*].name}"' % (pod[0], pod[1])
             ).split(" ")
@@ -119,7 +130,7 @@ def container_killing_in_pod(cont_scenario):
             container_pod_list.append([pod, namespace, container_names])

     killed_count = 0
-
+    killed_container_list = []
     while killed_count < kill_count:
         if len(container_pod_list) == 0:
             logging.error("Trying to kill more containers than were found, try lowering kill count")
@@ -129,14 +140,17 @@ def container_killing_in_pod(cont_scenario):
         for c_name in selected_container_pod[2]:
             if container_name != "":
                 if c_name == container_name:
+                    killed_container_list.append([selected_container_pod[0], selected_container_pod[1], c_name])
                     retry_container_killing(kill_action, selected_container_pod[0], selected_container_pod[1], c_name)
                     break
             else:
+                killed_container_list.append([selected_container_pod[0], selected_container_pod[1], c_name])
                 retry_container_killing(kill_action, selected_container_pod[0], selected_container_pod[1], c_name)
                 break
         container_pod_list.remove(selected_container_pod)
         killed_count += 1
     logging.info("Scenario " + scenario_name + " successfully injected")
+    return killed_container_list


 def retry_container_killing(kill_action, podname, namespace, container_name):
@@ -153,3 +167,26 @@ def retry_container_killing(kill_action, podname, namespace, container_name):
             continue
         else:
             continue
+
+
+def check_failed_containers(killed_container_list, wait_time):
+
+    timer = 0
+    while timer <= wait_time:
+        container_ready = []
+        for killed_container in killed_container_list:
+            # killed_container is [pod_name, namespace, container_name]
+            pod_output = runcommand.invoke("oc get pods %s -n %s -o yaml" % (killed_container[0], killed_container[1]))
+            pod_output_yaml = yaml.full_load(pod_output)
+            for statuses in pod_output_yaml["status"]["containerStatuses"]:
+                if statuses["name"] == killed_container[2]:
+                    if str(statuses["ready"]).lower() == "true":
+                        container_ready.append(killed_container)
+        for item in container_ready:
+            killed_container_list.remove(item)
+        if len(killed_container_list) == 0:
+            return []
+        timer += 5
+        logging.info("Waiting 5 seconds for containers to become ready")
+        time.sleep(5)
+    return killed_container_list
diff --git a/scenarios/container_etcd.yml b/scenarios/container_etcd.yml
index c896f658..bf93833e 100755
--- a/scenarios/container_etcd.yml
+++ b/scenarios/container_etcd.yml
@@ -5,3 +5,4 @@ scenarios:
   container_name: "etcd"
   action: "kill 1"
   count: 1
+  retry_wait: 60
diff --git a/scenarios/post_action_etcd_container.py b/scenarios/post_action_etcd_container.py
new file mode 100755
index 00000000..ff39723f
--- /dev/null
+++ b/scenarios/post_action_etcd_container.py
@@ -0,0 +1,29 @@
+#!/usr/bin/env python3
+import subprocess
+import logging
+import time
+
+
+def run(cmd):
+    try:
+        output = subprocess.Popen(
+            cmd, shell=True, universal_newlines=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT
+        )
+        (out, err) = output.communicate()
+    except Exception as e:
+        logging.error("Failed to run %s, error: %s" % (cmd, e))
+        return ""
+    return out
+
+
+i = 0
+while i < 100:
+    pods_running = run("oc get pods -n openshift-etcd -l app=etcd | grep -c '4/4'").rstrip()
+    if pods_running == "3":
+        break
+    time.sleep(5)
+    i += 1
+
+if pods_running == "3":
+    print("There were 3 pods running properly")
+else:
+    print("ERROR there were " + str(pods_running) + " pods running instead of 3")
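
Note: the recovery check added in `check_failed_containers` above can also be exercised on its own. The sketch below is a minimal, standalone version of the same readiness poll under stated assumptions: `oc` is on the PATH, PyYAML is installed, and the helper names (`pod_yaml`, `wait_for_ready`) and the example pod name `etcd-master-0` are illustrative only; it is not part of the patch or of kraken's API.

```
#!/usr/bin/env python3
# Standalone sketch of the readiness poll that check_failed_containers performs.
# Assumes `oc` is on the PATH and PyYAML is installed; names below are illustrative.
import subprocess
import time

import yaml


def pod_yaml(pod, namespace):
    # Fetch the pod definition the same way the scenario code does: `oc get pods <pod> -n <ns> -o yaml`.
    result = subprocess.run(
        ["oc", "get", "pods", pod, "-n", namespace, "-o", "yaml"],
        capture_output=True, text=True, check=True,
    )
    return yaml.safe_load(result.stdout)


def wait_for_ready(killed_containers, wait_time=120, poll=5):
    # killed_containers holds [pod_name, namespace, container_name] triples,
    # mirroring the list returned by container_killing_in_pod in the patch.
    deadline = time.time() + wait_time
    pending = [list(entry) for entry in killed_containers]
    while pending and time.time() < deadline:
        still_pending = []
        for pod, namespace, container in pending:
            statuses = pod_yaml(pod, namespace)["status"].get("containerStatuses", [])
            ready = any(s["name"] == container and s.get("ready") for s in statuses)
            if not ready:
                still_pending.append([pod, namespace, container])
        pending = still_pending
        if pending:
            time.sleep(poll)
    # An empty list means every killed container reported ready again in time.
    return pending


if __name__ == "__main__":
    # Hypothetical pod name; on a real cluster use a pod that exists in openshift-etcd.
    print(wait_for_ready([["etcd-master-0", "openshift-etcd", "etcd"]], wait_time=60))
```

Unlike the in-tree version, this sketch tracks a deadline instead of a 5-second counter, but the observable behavior is the same: it returns whichever containers never became ready within the wait time.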