Mirror of https://github.com/krkn-chaos/krkn.git

adding validation that namespace becomes active

Commit 22df024312 (parent 4a4033605b), committed by Naga Ravi Chaitanya Elluri

@@ -29,8 +29,9 @@ kraken:
            - - scenarios/cluster_shut_down_scenario.yml
              - scenarios/post_action_shut_down.py
        - namespace_scenarios:
-           - scenarios/regex_namespace.yaml
-           - scenarios/ingress_namespace.yaml
+           - - scenarios/regex_namespace.yaml
+           - - scenarios/ingress_namespace.yaml
+             - scenarios/post_action_namespace.py
        - zone_outages:
            - scenarios/zone_outage.yaml

@@ -26,3 +26,30 @@ scenarios:
  runs: 1
  sleep: 15
```

**NOTE:** Many OpenShift namespaces have finalizers that protect the namespace from being fully deleted; see the documentation [here](https://kubernetes.io/blog/2021/05/14/using-finalizers-to-control-deletion/).
Namespaces that do have finalizers enabled will be left in a Terminating state, but all the pods running in those namespaces will be deleted.
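
To see that state for yourself, the phase and finalizers of a namespace can be read with the upstream `kubernetes` Python client. This is illustrative only and not part of this commit; the namespace name is just an example:

```
# Illustrative only: inspect why a deleted namespace may be stuck Terminating,
# using the upstream kubernetes Python client rather than kraken's kubecli wrapper.
from kubernetes import client, config

config.load_kube_config()          # or config.load_incluster_config() when run in-cluster
v1 = client.CoreV1Api()

ns = v1.read_namespace("openshift-ingress")   # example namespace name
print("phase:", ns.status.phase)              # "Terminating" while deletion is blocked
print("finalizers:", ns.spec.finalizers)      # finalizers that must clear before removal
```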

#### Post Action

In all scenarios we run a post-chaos check that waits for and verifies the recovery of the specific component.

Here there are two options:

1. Pass a custom script in the main config's scenario list; it runs before the chaos is injected, and its output is compared against the output of the same script after the chaos scenario to verify they match.

See [scenarios/post_action_namespace.py](https://github.com/cloud-bulldozer/kraken/tree/master/scenarios/post_action_namespace.py) for an example.

```
- namespace_scenarios:
    - - scenarios/regex_namespace.yaml
      - scenarios/post_action_namespace.py
```
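
In other words, the script is executed once before the chaos is injected (stored as `pre_action_output` in the diff below) and once again afterwards, and the scenario only passes if the two outputs match. A minimal sketch of that comparison, assuming a hypothetical `run_script` helper rather than the actual `kraken.post_actions` API:

```
# Simplified sketch of the pre/post comparison; not kraken's real implementation.
# run_script is a hypothetical helper that executes the script and returns its stdout.
def passed_post_action(script_path, pre_action_output, run_script):
    post_action_output = run_script(script_path)
    # The scenario counts as recovered only when the script prints the same
    # output after the chaos as it did before it.
    return post_action_output == pre_action_output
```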

2. Allow Kraken to wait and check that the killed namespaces become 'Active' again. Kraken keeps a list of the specific namespaces that were killed so it can verify that all affected namespaces recover properly.

```
wait_time: <seconds to wait for namespace to recover>
```
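
This recovery wait is implemented by the new `check_active_namespace` helper shown in the diff below. As a condensed illustration of the same idea (with `get_phase` standing in for `kubecli.get_namespace_status`; these names are assumptions, not taken from this commit):

```
# Condensed sketch of the recovery wait; get_phase is a stand-in for
# kraken.kubernetes.client.get_namespace_status.
import time

def wait_for_active(killed_namespaces, wait_time, get_phase, poll_interval=5):
    pending = set(killed_namespaces)
    waited = 0
    while pending and waited < wait_time:
        pending = {ns for ns in pending if get_phase(ns) != "Active"}
        if pending:
            time.sleep(poll_interval)
            waited += poll_interval
    return sorted(pending)   # empty when every namespace became Active in time
```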

@@ -35,6 +35,16 @@ def list_namespaces(label_selector=None):
    return namespaces


# Get namespace status
def get_namespace_status(namespace_name):
    ret = ""
    try:
        ret = cli.read_namespace_status(namespace_name)
    except ApiException as e:
        logging.error("Exception when calling CoreV1Api->read_namespace_status: %s\n" % e)
    return ret.status.phase


# Check if all the watch_namespaces are valid
def check_namespaces(namespaces, label_selectors=None):
    try:

@@ -4,20 +4,27 @@ import logging
import kraken.invoke.command as runcommand
import kraken.kubernetes.client as kubecli
import kraken.cerberus.setup as cerberus
import kraken.post_actions.actions as post_actions
import yaml
import sys


-def run(scenarios_list, config, wait_duration):
+def run(scenarios_list, config, wait_duration, failed_post_scenarios, kubeconfig_path):
    for scenario_config in scenarios_list:
-        with open(scenario_config, "r") as f:
-            scenario_config = yaml.full_load(f)
-        for scenario in scenario_config["scenarios"]:
+        if len(scenario_config) > 1:
+            pre_action_output = post_actions.run(kubeconfig_path, scenario_config[1])
+        else:
+            pre_action_output = ""
+        with open(scenario_config[0], "r") as f:
+            scenario_config_yaml = yaml.full_load(f)
+        for scenario in scenario_config_yaml["scenarios"]:
            scenario_namespace = scenario.get("namespace", "^.*$")
            scenario_label = scenario.get("label_selector", None)
            run_count = scenario.get("runs", 1)
            namespace_action = scenario.get("action", "delete")
            run_sleep = scenario.get("sleep", 10)
            wait_time = scenario.get("wait_time", 30)
            killed_namespaces = []
            namespaces = kubecli.check_namespaces([scenario_namespace], scenario_label)
            start_time = int(time.time())
            for i in range(run_count):

@@ -28,6 +35,7 @@ def run(scenarios_list, config, wait_duration):
                    )
                    sys.exit(1)
                selected_namespace = namespaces[random.randint(0, len(namespaces) - 1)]
                killed_namespaces.append(selected_namespace)
                try:
                    runcommand.invoke("oc %s project %s" % (namespace_action, selected_namespace))
                    logging.info(namespace_action + " on namespace " + str(selected_namespace) + " was successful")

@@ -43,5 +51,38 @@ def run(scenarios_list, config, wait_duration):

                logging.info("Waiting for the specified duration: %s" % wait_duration)
                time.sleep(wait_duration)
            if len(scenario_config) > 1:
                try:
                    failed_post_scenarios = post_actions.check_recovery(
                        kubeconfig_path, scenario_config, failed_post_scenarios, pre_action_output
                    )
                except Exception as e:
                    logging.error("Failed to run post action checks: %s" % e)
                    sys.exit(1)
            else:
                failed_post_scenarios = check_active_namespace(killed_namespaces, wait_time)
            end_time = int(time.time())
            cerberus.get_status(config, start_time, end_time)
            cerberus.publish_kraken_status(config, failed_post_scenarios, start_time, end_time)


def check_active_namespace(killed_namespaces, wait_time):
    active_namespace = []
    timer = 0
    while timer < wait_time and killed_namespaces:
        for namespace_name in killed_namespaces:
            response = kubecli.get_namespace_status(namespace_name).strip()
            if response != "Active":
                continue
            else:
                active_namespace.append(namespace_name)
        killed_namespaces = set(killed_namespaces) - set(active_namespace)
        if len(killed_namespaces) == 0:
            return []

        timer += 5
        time.sleep(5)
        logging.info("Waiting 5 seconds for namespaces to become active")

    logging.error("Namespaces are still not active after waiting " + str(wait_time) + " seconds")
    logging.error("Non active namespaces " + str(killed_namespaces))
    return killed_namespaces

@@ -160,7 +160,9 @@ def main(cfg):
        # Inject namespace chaos scenarios
        elif scenario_type == "namespace_scenarios":
            logging.info("Running namespace scenarios")
-           namespace_actions.run(scenarios_list, config, wait_duration)
+           namespace_actions.run(
+               scenarios_list, config, wait_duration, failed_post_scenarios, kubeconfig_path
+           )

        # Inject zone failures
        elif scenario_type == "zone_outages":

@@ -3,3 +3,4 @@ scenarios:
  namespace: "^.*ingress.*$"
  runs: 1
  sleep: 15
  wait_time: 300

scenarios/post_action_namespace.py (new executable file, 28 lines)
@@ -0,0 +1,28 @@
#!/usr/bin/env python3
import subprocess
import time


def run(cmd):
    try:
        output = subprocess.Popen(
            cmd, shell=True, universal_newlines=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT
        )
        (out, err) = output.communicate()
    except Exception as e:
        print("Failed to run %s, error: %s" % (cmd, e))
    return out


i = 0
while i < 100:
    projects_active = run("oc get project | grep 'ingress' | grep -c Active").rstrip()
    if projects_active == "3":
        break
    i += 1
    time.sleep(5)

if projects_active == str(3):
    print("There were 3 projects running properly")
else:
    print("ERROR there were " + str(projects_active) + " projects running instead of 3")

@@ -3,3 +3,4 @@ scenarios:
  namespace: "^.*$"
  runs: 2
  sleep: 15
  wait_time: 300