container checking in pod

This commit is contained in:
Paige Rubendall
2021-08-19 15:47:16 -04:00
committed by Naga Ravi Chaitanya Elluri
parent 07ccfbf0aa
commit 22fcab57f5
4 changed files with 98 additions and 10 deletions

View File

@@ -14,4 +14,25 @@ scenarios:
container_name: "<specific container name>" # This is optional, can take out and will kill all containers in all pods found under namespace and label
pod_names: # This is optional, can take out and will select all pods with given namespace and label
- <pod_name>
retry_wait: <number of seconds to wait for the container to be running again> (defaults to 120 seconds)
```
#### Post Action
In all scenarios we do a post chaos check to wait for and verify that the specific component recovers.
Here there are two options:
1. Pass a custom script in the main config scenario list, that will run before the chaos and verify the output matches post chaos scenario
See [scenarios/post_action_etcd_container.py](https://github.com/cloud-bulldozer/kraken/tree/master/scenarios/post_action_etcd_container.py) for an example
```
- container_scenarios: # List of chaos pod scenarios to load
- - scenarios/container_etcd.yml
- scenarios/post_action_etcd_container.py
```
2. Allow kraken to wait and check the killed containers become ready again. Kraken keeps a list of the specific
containers that were killed as well as the namespaces and pods to verify all containers that were affected recover properly
```
retry_wait: <seconds to wait for container to recover>
```

View File

@@ -56,21 +56,33 @@ def run(kubeconfig_path, scenarios_list, config, failed_post_scenarios, wait_dur
def container_run(kubeconfig_path, scenarios_list, config, failed_post_scenarios, wait_duration):
for container_scenario_config in scenarios_list:
if len(container_scenario_config) > 1:
pre_action_output = post_actions.run(kubeconfig_path, container_scenario_config[1])
else:
pre_action_output = ""
with open(container_scenario_config[0], "r") as f:
cont_scenario_config = yaml.full_load(f)
for cont_scenario in cont_scenario_config["scenarios"]:
if len(container_scenario_config) > 1:
pre_action_output = post_actions.run(kubeconfig_path, container_scenario_config[1])
else:
pre_action_output = ""
# capture start time
start_time = int(time.time())
container_killing_in_pod(cont_scenario)
killed_containers = container_killing_in_pod(cont_scenario)
if len(container_scenario_config) > 1:
try:
failed_post_scenarios = post_actions.check_recovery(
kubeconfig_path, container_scenario_config, failed_post_scenarios, pre_action_output
)
except Exception as e:
logging.error("Failed to run post action checks: %s" % e)
sys.exit(1)
else:
failed_post_scenarios = check_failed_containers(
killed_containers, cont_scenario.get("retry_wait", 120)
)
logging.info("Waiting for the specified duration: %s" % (wait_duration))
time.sleep(wait_duration)
failed_post_scenarios = post_actions.check_recovery(
kubeconfig_path, container_scenario_config, failed_post_scenarios, pre_action_output
)
# capture end time
end_time = int(time.time())
@@ -107,7 +119,6 @@ def container_killing_in_pod(cont_scenario):
container_pod_list = []
for pod in pods:
if type(pod) == list:
container_names = runcommand.invoke(
'oc get pods %s -n %s -o jsonpath="{.spec.containers[*].name}"' % (pod[0], pod[1])
).split(" ")
@@ -119,7 +130,7 @@ def container_killing_in_pod(cont_scenario):
container_pod_list.append([pod, namespace, container_names])
killed_count = 0
killed_container_list = []
while killed_count < kill_count:
if len(container_pod_list) == 0:
logging.error("Trying to kill more containers than were found, try lowering kill count")
@@ -129,14 +140,17 @@ def container_killing_in_pod(cont_scenario):
for c_name in selected_container_pod[2]:
if container_name != "":
if c_name == container_name:
killed_container_list.append([selected_container_pod[0], selected_container_pod[1], c_name])
retry_container_killing(kill_action, selected_container_pod[0], selected_container_pod[1], c_name)
break
else:
killed_container_list.append([selected_container_pod[0], selected_container_pod[1], c_name])
retry_container_killing(kill_action, selected_container_pod[0], selected_container_pod[1], c_name)
break
container_pod_list.remove(selected_container_pod)
killed_count += 1
logging.info("Scenario " + scenario_name + " successfully injected")
return killed_container_list
def retry_container_killing(kill_action, podname, namespace, container_name):
@@ -153,3 +167,26 @@ def retry_container_killing(kill_action, podname, namespace, container_name):
continue
else:
continue
def check_failed_containers(killed_container_list, wait_time):
    """Wait for killed containers to report ready again.

    Polls each entry in killed_container_list (entries are
    [pod_name, namespace, container_name]) every 5 seconds via
    ``oc get pods``, until either every container reports ready or
    wait_time seconds have elapsed.

    Returns the list of entries that never became ready (an empty list
    on full recovery).
    """
    # Work on a copy so the caller's list is not mutated in place.
    remaining = list(killed_container_list)
    timer = 0
    while timer <= wait_time:
        # Reset per poll iteration so an already-removed entry is never
        # removed twice.
        recovered = []
        for killed_container in remaining:
            # killed_container: [pod name, namespace, container name]
            pod_output = runcommand.invoke(
                "oc get pods %s -n %s -o yaml" % (killed_container[0], killed_container[1])
            )
            pod_output_yaml = yaml.full_load(pod_output)
            for statuses in pod_output_yaml["status"]["containerStatuses"]:
                if statuses["name"] == killed_container[2]:
                    if str(statuses["ready"]).lower() == "true":
                        recovered.append(killed_container)
        # NOTE: list.remove() returns None — never reassign from it
        # (the original did, which wiped the list after one recovery).
        for item in recovered:
            remaining.remove(item)
        if not remaining:
            return []
        timer += 5
        logging.info("Waiting 5 seconds for containers to become ready")
        time.sleep(5)
    return remaining

View File

@@ -5,3 +5,4 @@ scenarios:
container_name: "etcd"
action: "kill 1"
count: 1
retry_wait: 60

View File

@@ -0,0 +1,29 @@
#!/usr/bin/env python3
import subprocess
import logging
import time
def run(cmd):
    """Run cmd through a shell and return its combined stdout+stderr text.

    Returns an empty string if the command could not be launched, instead
    of raising UnboundLocalError as the original did when Popen failed
    before ``out`` was assigned.
    """
    out = ""
    try:
        process = subprocess.Popen(
            cmd, shell=True, universal_newlines=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT
        )
        (out, err) = process.communicate()
    except Exception as e:
        # Best-effort: log and fall through to return the empty default.
        logging.error("Failed to run %s, error: %s" % (cmd, e))
    return out
# Post-chaos check: poll the etcd pods in the openshift-etcd namespace
# until all three report every container running ("4/4"), retrying up to
# 100 times with a 5 second pause between attempts.
attempts = 0
pods_running = ""
while attempts < 100:
    pods_running = run("oc get pods -n openshift-etcd -l app=etcd | grep -c '4/4'").rstrip()
    if pods_running == "3":
        break
    time.sleep(5)
    attempts += 1
# Report the final state; post_actions compares this output pre/post chaos.
if pods_running == "3":
    print("There were 3 pods running properly")
else:
    print("ERROR there were " + str(pods_running) + " pods running instead of 3")