mirror of
https://github.com/krkn-chaos/krkn.git
synced 2026-04-15 06:57:28 +00:00
container checking in pod
This commit is contained in:
committed by
Naga Ravi Chaitanya Elluri
parent
07ccfbf0aa
commit
22fcab57f5
@@ -14,4 +14,25 @@ scenarios:
|
||||
container_name: "<specific container name>" # This is optional, can take out and will kill all containers in all pods found under namespace and label
|
||||
pod_names: # This is optional, can take out and will select all pods with given namespace and label
|
||||
- <pod_name>
|
||||
retry_wait: <number of seconds to wait for container to be running again> (defaults to 120 seconds)
|
||||
```
|
||||
|
||||
#### Post Action
|
||||
In all scenarios we do a post chaos check to wait for and verify that the specific component recovers
|
||||
|
||||
Here there are two options:
|
||||
1. Pass a custom script in the main config scenario list, that will run before the chaos and verify the output matches post chaos scenario
|
||||
|
||||
See [scenarios/post_action_etcd_container.py](https://github.com/cloud-bulldozer/kraken/tree/master/scenarios/post_action_etcd_container.py) for an example
|
||||
```
|
||||
- container_scenarios: # List of chaos pod scenarios to load
|
||||
- - scenarios/container_etcd.yml
|
||||
- scenarios/post_action_etcd_container.py
|
||||
```
|
||||
|
||||
2. Allow kraken to wait and check the killed containers become ready again. Kraken keeps a list of the specific
|
||||
containers that were killed as well as the namespaces and pods to verify all containers that were affected recover properly
|
||||
|
||||
```
|
||||
retry_wait: <seconds to wait for container to recover>
|
||||
```
|
||||
|
||||
@@ -56,21 +56,33 @@ def run(kubeconfig_path, scenarios_list, config, failed_post_scenarios, wait_dur
|
||||
|
||||
def container_run(kubeconfig_path, scenarios_list, config, failed_post_scenarios, wait_duration):
    """Run each container chaos scenario in scenarios_list.

    For every scenario config file, kill the configured containers, then verify
    recovery either via a user-supplied post-action script (second element of the
    scenario entry) or by waiting for the killed containers to become ready again.

    NOTE(review): this span was reconstructed from a merged diff; the duplicated
    pre-action computation and the superseded plain `container_killing_in_pod`
    call / post-sleep `check_recovery` were removed so the pre-action script runs
    exactly once per scenario (fresh pre-chaos state for each comparison).
    """
    for container_scenario_config in scenarios_list:
        with open(container_scenario_config[0], "r") as f:
            cont_scenario_config = yaml.full_load(f)
            for cont_scenario in cont_scenario_config["scenarios"]:
                # Capture pre-chaos state per scenario so each post-action
                # comparison is against a fresh baseline.
                if len(container_scenario_config) > 1:
                    pre_action_output = post_actions.run(kubeconfig_path, container_scenario_config[1])
                else:
                    pre_action_output = ""
                # capture start time
                start_time = int(time.time())
                killed_containers = container_killing_in_pod(cont_scenario)

                if len(container_scenario_config) > 1:
                    # A custom post-action script was supplied: its output must
                    # match the pre-chaos output for the scenario to pass.
                    try:
                        failed_post_scenarios = post_actions.check_recovery(
                            kubeconfig_path, container_scenario_config, failed_post_scenarios, pre_action_output
                        )
                    except Exception as e:
                        logging.error("Failed to run post action checks: %s" % e)
                        sys.exit(1)
                else:
                    # Default recovery check: wait for the killed containers to
                    # report ready again (retry_wait defaults to 120 seconds).
                    failed_post_scenarios = check_failed_containers(
                        killed_containers, cont_scenario.get("retry_wait", 120)
                    )

                logging.info("Waiting for the specified duration: %s" % (wait_duration))
                time.sleep(wait_duration)

                # capture end time
                end_time = int(time.time())
|
||||
|
||||
@@ -107,7 +119,6 @@ def container_killing_in_pod(cont_scenario):
|
||||
container_pod_list = []
|
||||
for pod in pods:
|
||||
if type(pod) == list:
|
||||
|
||||
container_names = runcommand.invoke(
|
||||
'oc get pods %s -n %s -o jsonpath="{.spec.containers[*].name}"' % (pod[0], pod[1])
|
||||
).split(" ")
|
||||
@@ -119,7 +130,7 @@ def container_killing_in_pod(cont_scenario):
|
||||
container_pod_list.append([pod, namespace, container_names])
|
||||
|
||||
killed_count = 0
|
||||
|
||||
killed_container_list = []
|
||||
while killed_count < kill_count:
|
||||
if len(container_pod_list) == 0:
|
||||
logging.error("Trying to kill more containers than were found, try lowering kill count")
|
||||
@@ -129,14 +140,17 @@ def container_killing_in_pod(cont_scenario):
|
||||
for c_name in selected_container_pod[2]:
|
||||
if container_name != "":
|
||||
if c_name == container_name:
|
||||
killed_container_list.append([selected_container_pod[0], selected_container_pod[1], c_name])
|
||||
retry_container_killing(kill_action, selected_container_pod[0], selected_container_pod[1], c_name)
|
||||
break
|
||||
else:
|
||||
killed_container_list.append([selected_container_pod[0], selected_container_pod[1], c_name])
|
||||
retry_container_killing(kill_action, selected_container_pod[0], selected_container_pod[1], c_name)
|
||||
break
|
||||
container_pod_list.remove(selected_container_pod)
|
||||
killed_count += 1
|
||||
logging.info("Scenario " + scenario_name + " successfully injected")
|
||||
return killed_container_list
|
||||
|
||||
|
||||
def retry_container_killing(kill_action, podname, namespace, container_name):
|
||||
@@ -153,3 +167,26 @@ def retry_container_killing(kill_action, podname, namespace, container_name):
|
||||
continue
|
||||
else:
|
||||
continue
|
||||
|
||||
|
||||
def check_failed_containers(killed_container_list, wait_time):
    """Wait up to *wait_time* seconds for killed containers to become ready.

    Each entry of killed_container_list is [pod_name, namespace, container_name].
    Polls the pod status every 5 seconds and removes entries whose container
    reports ready. Returns the list of containers that never recovered
    (empty list means full recovery).

    Fixes over the original:
    - `killed_container_list = killed_container_list.remove(item)` assigned
      None (list.remove returns None), crashing on the second removal and
      falsely reporting success after the first.
    - `len(killed_container)` tested the loop variable, not the list —
      NameError when called with an empty list.
    - `container_ready` now resets each poll cycle, so an already-removed
      entry can't be removed twice (ValueError).
    """
    timer = 0
    while timer <= wait_time:
        container_ready = []  # reset per poll cycle
        for killed_container in killed_container_list:
            # killed_container = [pod name, namespace, container name]
            pod_output = runcommand.invoke("oc get pods %s -n %s -o yaml" % (killed_container[0], killed_container[1]))
            pod_output_yaml = yaml.full_load(pod_output)
            for statuses in pod_output_yaml["status"]["containerStatuses"]:
                if statuses["name"] == killed_container[2]:
                    if str(statuses["ready"]).lower() == "true":
                        container_ready.append(killed_container)
        for item in container_ready:
            # list.remove mutates in place and returns None — do not reassign.
            killed_container_list.remove(item)
        if len(killed_container_list) == 0:
            return []
        timer += 5
        logging.info("Waiting 5 seconds for containers to become ready")
        time.sleep(5)
    return killed_container_list
|
||||
|
||||
@@ -5,3 +5,4 @@ scenarios:
|
||||
container_name: "etcd"
|
||||
action: "kill 1"
|
||||
count: 1
|
||||
retry_wait: 60
|
||||
|
||||
29
scenarios/post_action_etcd_container.py
Executable file
29
scenarios/post_action_etcd_container.py
Executable file
@@ -0,0 +1,29 @@
|
||||
#!/usr/bin/env python3
|
||||
import subprocess
|
||||
import logging
|
||||
import time
|
||||
|
||||
|
||||
def run(cmd):
    """Run *cmd* through the shell and return its combined stdout+stderr.

    Returns None if the command could not be launched. The original left
    `out` unbound when Popen raised, so `return out` produced a NameError
    that masked the real failure — we now log and return None explicitly.
    """
    try:
        proc = subprocess.Popen(
            cmd, shell=True, universal_newlines=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT
        )
        # communicate() waits for completion and reaps the child process.
        out, _ = proc.communicate()
    except Exception as e:
        logging.error("Failed to run %s, error: %s" % (cmd, e))
        return None
    return out
||||
|
||||
|
||||
# Poll the etcd pods in the openshift-etcd namespace until all three report
# 4/4 containers ready, checking every 5 seconds for up to 100 attempts.
attempt = 0
while attempt < 100:
    pods_running = run("oc get pods -n openshift-etcd -l app=etcd | grep -c '4/4'").rstrip()
    if pods_running == "3":
        break
    time.sleep(5)
    attempt += 1

# Report the final state so the chaos runner can compare pre/post output.
if pods_running == "3":
    print("There were 3 pods running properly")
else:
    print("ERROR there were " + str(pods_running) + " pods running instead of 3")
|
||||
Reference in New Issue
Block a user