adding validation that namespace becomes active

This commit is contained in:
Paige Rubendall
2021-08-20 14:59:14 -04:00
committed by Naga Ravi Chaitanya Elluri
parent 4a4033605b
commit 22df024312
8 changed files with 119 additions and 8 deletions

View File

@@ -29,8 +29,9 @@ kraken:
             - - scenarios/cluster_shut_down_scenario.yml
               - scenarios/post_action_shut_down.py
         - namespace_scenarios:
-            - scenarios/regex_namespace.yaml
-            - scenarios/ingress_namespace.yaml
+            - - scenarios/regex_namespace.yaml
+            - - scenarios/ingress_namespace.yaml
+              - scenarios/post_action_namespace.py
         - zone_outages:
             - scenarios/zone_outage.yaml

View File

@@ -26,3 +26,30 @@ scenarios:
runs: 1
sleep: 15
```
**NOTE:** Many OpenShift namespaces have finalizers built in that protect the namespace from being fully deleted; see the documentation [here](https://kubernetes.io/blog/2021/05/14/using-finalizers-to-control-deletion/).
Namespaces that have finalizers enabled will be left in a Terminating state, but all the pods running in those namespaces will still be deleted.
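For illustration only (this snippet is not part of the commit), a namespace left in Terminating can be inspected with the standard Kubernetes Python client to see its phase and remaining finalizers; the namespace name below is just a placeholder:
```
from kubernetes import client, config

# Load credentials from the local kubeconfig, then read one namespace.
config.load_kube_config()
v1 = client.CoreV1Api()

ns = v1.read_namespace("openshift-monitoring")  # placeholder namespace for this sketch
print("phase:", ns.status.phase)          # "Active", or "Terminating" while finalizers are pending
print("finalizers:", ns.spec.finalizers)  # a non-empty list keeps the namespace from being fully deleted
```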
#### Post Action
In all scenarios we run a post-chaos check that waits for and verifies the recovery of the affected component.
There are two options:
1. Pass a custom script in the main config's scenario list; it runs before the chaos is injected, and its output is compared to the output of the same script run after the chaos to verify that the component recovered.
See [scenarios/post_action_namespace.py](https://github.com/cloud-bulldozer/kraken/tree/master/scenarios/post_action_namespace.py) for an example.
```
-   namespace_scenarios:
    - - scenarios/regex_namespace.yaml
      - scenarios/post_action_namespace.py
```
2. Allow kraken to wait and check that the killed namespaces become 'Active' again. Kraken keeps a list of the specific
namespaces that were killed so it can verify that everything affected recovers properly; a simplified sketch of this check follows the snippet below.
```
wait_time: <seconds to wait for namespace to recover>
```
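A simplified sketch of this wait-for-Active check, written against the standard Kubernetes Python client rather than kraken's internal `kubecli` wrapper; the namespace list and `wait_time` value are placeholders:
```
import time

from kubernetes import client, config
from kubernetes.client.rest import ApiException

config.load_kube_config()
v1 = client.CoreV1Api()


def is_active(name):
    # A namespace that is missing or errors out counts as "not recovered yet".
    try:
        return v1.read_namespace_status(name).status.phase == "Active"
    except ApiException:
        return False


def wait_for_active(namespaces, wait_time):
    # Poll every 5 seconds until every namespace is Active or wait_time expires.
    remaining = set(namespaces)
    elapsed = 0
    while remaining and elapsed < wait_time:
        remaining = {ns for ns in remaining if not is_active(ns)}
        if remaining:
            time.sleep(5)
            elapsed += 5
    return sorted(remaining)


# Placeholder namespace and wait time for this sketch.
print(wait_for_active(["openshift-monitoring"], wait_time=300))
```
An empty list back from the check means every killed namespace recovered within the wait time; anything left over is reported as a failure.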

View File

@@ -35,6 +35,16 @@ def list_namespaces(label_selector=None):
     return namespaces


+# Get namespace status
+def get_namespace_status(namespace_name):
+    ret = ""
+    try:
+        ret = cli.read_namespace_status(namespace_name)
+    except ApiException as e:
+        logging.error("Exception when calling CoreV1Api->read_namespace_status: %s\n" % e)
+    return ret.status.phase
+
+
 # Check if all the watch_namespaces are valid
 def check_namespaces(namespaces, label_selectors=None):
     try:
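As an aside, and not something this diff adds: if the API call above fails, `ret` stays an empty string and the final `ret.status.phase` would itself raise. A more defensive variant might look like the sketch below (`cli`, `ApiException`, and `logging` come from the surrounding module):
```
# Hypothetical variant, not part of this commit: return an empty phase on API errors.
def get_namespace_status(namespace_name):
    try:
        return cli.read_namespace_status(namespace_name).status.phase
    except ApiException as e:
        logging.error("Exception when calling CoreV1Api->read_namespace_status: %s\n" % e)
        return ""
```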

View File

@@ -4,20 +4,27 @@ import logging
 import kraken.invoke.command as runcommand
 import kraken.kubernetes.client as kubecli
 import kraken.cerberus.setup as cerberus
+import kraken.post_actions.actions as post_actions
 import yaml
 import sys


-def run(scenarios_list, config, wait_duration):
+def run(scenarios_list, config, wait_duration, failed_post_scenarios, kubeconfig_path):
     for scenario_config in scenarios_list:
-        with open(scenario_config, "r") as f:
-            scenario_config = yaml.full_load(f)
-            for scenario in scenario_config["scenarios"]:
+        if len(scenario_config) > 1:
+            pre_action_output = post_actions.run(kubeconfig_path, scenario_config[1])
+        else:
+            pre_action_output = ""
+        with open(scenario_config[0], "r") as f:
+            scenario_config_yaml = yaml.full_load(f)
+            for scenario in scenario_config_yaml["scenarios"]:
                 scenario_namespace = scenario.get("namespace", "^.*$")
                 scenario_label = scenario.get("label_selector", None)
                 run_count = scenario.get("runs", 1)
                 namespace_action = scenario.get("action", "delete")
                 run_sleep = scenario.get("sleep", 10)
+                wait_time = scenario.get("wait_time", 30)
+                killed_namespaces = []
                 namespaces = kubecli.check_namespaces([scenario_namespace], scenario_label)
                 start_time = int(time.time())
                 for i in range(run_count):
@@ -28,6 +35,7 @@ def run(scenarios_list, config, wait_duration):
                         )
                         sys.exit(1)
                     selected_namespace = namespaces[random.randint(0, len(namespaces) - 1)]
+                    killed_namespaces.append(selected_namespace)
                     try:
                         runcommand.invoke("oc %s project %s" % (namespace_action, selected_namespace))
                         logging.info(namespace_action + " on namespace " + str(selected_namespace) + " was successful")
@@ -43,5 +51,38 @@ def run(scenarios_list, config, wait_duration):
logging.info("Waiting for the specified duration: %s" % wait_duration)
time.sleep(wait_duration)
if len(scenario_config) > 1:
try:
failed_post_scenarios = post_actions.check_recovery(
kubeconfig_path, scenario_config, failed_post_scenarios, pre_action_output
)
except Exception as e:
logging.error("Failed to run post action checks: %s" % e)
sys.exit(1)
else:
failed_post_scenarios = check_active_namespace(killed_namespaces, wait_time)
end_time = int(time.time())
cerberus.get_status(config, start_time, end_time)
cerberus.publish_kraken_status(config, failed_post_scenarios, start_time, end_time)
def check_active_namespace(killed_namespaces, wait_time):
active_namespace = []
timer = 0
while timer < wait_time and killed_namespaces:
for namespace_name in killed_namespaces:
response = kubecli.get_namespace_status(namespace_name).strip()
if response != "Active":
continue
else:
active_namespace.append(namespace_name)
killed_namespaces = set(killed_namespaces) - set(active_namespace)
if len(killed_namespaces) == 0:
return []
timer += 5
time.sleep(5)
logging.info("Waiting 5 seconds for namespaces to become active")
logging.error("Namespaces are still not active after waiting " + str(wait_time) + "seconds")
logging.error("Non active namespaces " + str(killed_namespaces))
return killed_namespaces

View File

@@ -160,7 +160,9 @@ def main(cfg):
         # Inject namespace chaos scenarios
         elif scenario_type == "namespace_scenarios":
             logging.info("Running namespace scenarios")
-            namespace_actions.run(scenarios_list, config, wait_duration)
+            namespace_actions.run(
+                scenarios_list, config, wait_duration, failed_post_scenarios, kubeconfig_path
+            )

         # Inject zone failures
         elif scenario_type == "zone_outages":

View File

@@ -3,3 +3,4 @@ scenarios:
namespace: "^.*ingress.*$"
runs: 1
sleep: 15
wait_time: 300

View File

@@ -0,0 +1,28 @@
#!/usr/bin/env python3
import subprocess
import time


def run(cmd):
    try:
        output = subprocess.Popen(
            cmd, shell=True, universal_newlines=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT
        )
        (out, err) = output.communicate()
    except Exception as e:
        print("Failed to run %s, error: %s" % (cmd, e))
    return out


i = 0
while i < 100:
    projects_active = run("oc get project | grep 'ingress' | grep -c Active").rstrip()
    if projects_active == "3":
        break
    i += 1
    time.sleep(5)

if projects_active == str(3):
    print("There were 3 projects running properly")
else:
    print("ERROR there were " + str(projects_active) + " projects running instead of 3")

View File

@@ -3,3 +3,4 @@ scenarios:
namespace: "^.*$"
runs: 2
sleep: 15
wait_time: 300