Improve error handling

This commit:
- Adds timeout to avoid operations hanging for long durations.
- Improves exception handling and exits wherever needed.
- Sets KUBECONFIG env var globoally to access the cluster.
This commit is contained in:
Naga Ravi Chaitanya Elluri
2021-07-16 23:49:47 -04:00
parent f051c1c30f
commit c0b9cb46da
7 changed files with 62 additions and 44 deletions

View File

@@ -20,8 +20,8 @@ Kraken indexes the metrics specified in the profile into Elasticsearch in additi
```
$ cd kraken
$ podman-compose up or $ docker-compose up # Spins up the containers specified in the docker-compose.yml file present in the run directory
$ podman-compose down or $ docker-compose up # Delete the containers installed
$ podman-compose up or $ docker-compose up # Spins up the containers specified in the docker-compose.yml file present in the run directory
$ podman-compose down or $ docker-compose down # Delete the containers installed
```
This will manage the Cerberus and Elasticsearch containers on the host on which you are running Kraken.

View File

@@ -1,17 +1,16 @@
import subprocess
import logging
import sys
# Invokes a given command and returns the stdout
def invoke(command):
def invoke(command, timeout=None):
try:
output = subprocess.Popen(
command, shell=True, universal_newlines=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT
)
(out, err) = output.communicate()
output = subprocess.check_output(command, shell=True, universal_newlines=True, timeout=timeout)
except Exception as e:
logging.error("Failed to run %s, error: %s" % (command, e))
return out
sys.exit(1)
return output
def run(command):

View File

@@ -13,8 +13,12 @@ kraken_node_name = ""
# Load kubeconfig and initialize kubernetes python client
def initialize_clients(kubeconfig_path):
global cli
config.load_kube_config(kubeconfig_path)
cli = client.CoreV1Api()
try:
config.load_kube_config(kubeconfig_path)
cli = client.CoreV1Api()
except ApiException as e:
logging.error("Failed to initialize kubernetes client: %s\n" % e)
sys.exit(1)
# List all namespaces
@@ -241,13 +245,17 @@ def find_kraken_node():
if kraken_pod_name:
# get kraken-deployment pod, find node name
runcommand.invoke("kubectl config set-context --current --namespace=" + str(kraken_project))
pod_json_str = runcommand.invoke("kubectl get pods/" + str(kraken_pod_name) + " -o json")
pod_json = json.loads(pod_json_str)
node_name = pod_json["spec"]["nodeName"]
try:
runcommand.invoke("kubectl config set-context --current --namespace=" + str(kraken_project))
pod_json_str = runcommand.invoke("kubectl get pods/" + str(kraken_pod_name) + " -o json")
pod_json = json.loads(pod_json_str)
node_name = pod_json["spec"]["nodeName"]
# Reset to the default project
runcommand.invoke("kubectl config set-context --current --namespace=default")
# Reset to the default project
runcommand.invoke("kubectl config set-context --current --namespace=default")
global kraken_node_name
kraken_node_name = node_name
global kraken_node_name
kraken_node_name = node_name
except Exception as e:
logging.info("%s" % (e))
sys.exit(1)

View File

@@ -33,11 +33,12 @@ def run(scenarios_list, config, litmus_namespaces, litmus_uninstall, wait_durati
if experiment_result:
logging.info("Scenario: %s has been successfully injected!" % item)
else:
logging.info("Scenario: %s was not successfully injected!" % item)
logging.info("Scenario: %s was not successfully injected, please check" % item)
if litmus_uninstall:
for l_item in l_scenario:
logging.info("item " + str(l_item))
runcommand.invoke("kubectl delete -f %s" % l_item)
sys.exit(1)
if litmus_uninstall:
for item in l_scenario:
logging.info("item " + str(item))
@@ -47,6 +48,7 @@ def run(scenarios_list, config, litmus_namespaces, litmus_uninstall, wait_durati
cerberus.get_status(config)
except Exception as e:
logging.error("Failed to run litmus scenario: %s. Encountered " "the following exception: %s" % (item, e))
sys.exit(1)
return litmus_namespaces

View File

@@ -11,32 +11,39 @@ import random
# Run pod based scenarios
def run(kubeconfig_path, scenarios_list, config, failed_post_scenarios, wait_duration):
try:
# Loop to run the scenarios starts here
for pod_scenario in scenarios_list:
if len(pod_scenario) > 1:
pre_action_output = post_actions.run(kubeconfig_path, pod_scenario[1])
else:
pre_action_output = ""
# Loop to run the scenarios starts here
for pod_scenario in scenarios_list:
if len(pod_scenario) > 1:
pre_action_output = post_actions.run(kubeconfig_path, pod_scenario[1])
else:
pre_action_output = ""
try:
scenario_logs = runcommand.invoke(
"powerfulseal autonomous --use-pod-delete-instead-"
"of-ssh-kill --policy-file %s --kubeconfig %s "
"--no-cloud --inventory-kubernetes --headless" % (pod_scenario[0], kubeconfig_path)
)
except Exception as e:
logging.error(
"Failed to run scenario: %s. Encountered the following " "exception: %s" % (pod_scenario[0], e)
)
sys.exit(1)
# Display pod scenario logs/actions
print(scenario_logs)
# Display pod scenario logs/actions
print(scenario_logs)
logging.info("Scenario: %s has been successfully injected!" % (pod_scenario[0]))
logging.info("Waiting for the specified duration: %s" % (wait_duration))
time.sleep(wait_duration)
logging.info("Scenario: %s has been successfully injected!" % (pod_scenario[0]))
logging.info("Waiting for the specified duration: %s" % (wait_duration))
time.sleep(wait_duration)
try:
failed_post_scenarios = post_actions.check_recovery(
kubeconfig_path, pod_scenario, failed_post_scenarios, pre_action_output
)
cerberus.publish_kraken_status(config, failed_post_scenarios)
except Exception as e:
logging.error("Failed to run scenario: %s. Encountered the following " "exception: %s" % (pod_scenario[0], e))
except Exception as e:
logging.error("Failed to run post action checks: %s" % e)
sys.exit(1)
return failed_post_scenarios

View File

@@ -45,13 +45,13 @@ def run(kubeconfig_path, scenario, pre_action_output=""):
# Perform the post scenario actions to see if components recovered
def check_recovery(kubeconfig_path, scenario, failed_post_scenarios, pre_action_output):
for failed_scenario in failed_post_scenarios:
post_action_output = run(kubeconfig_path, failed_scenario[0], failed_scenario[1])
if post_action_output is not False:
failed_post_scenarios.remove(failed_scenario)
else:
logging.info("Post action scenario " + str(failed_scenario) + "is still failing")
if failed_post_scenarios:
for failed_scenario in failed_post_scenarios:
post_action_output = run(kubeconfig_path, failed_scenario[0], failed_scenario[1])
if post_action_output is not False:
failed_post_scenarios.remove(failed_scenario)
else:
logging.info("Post action scenario " + str(failed_scenario) + "is still failing")
# check post actions
if len(scenario) > 1:

View File

@@ -58,8 +58,10 @@ def main(cfg):
# Initialize clients
if not os.path.isfile(kubeconfig_path):
kubeconfig_path = None
logging.error("Cannot read the kubeconfig file at %s, please check" % kubeconfig_path)
sys.exit(1)
logging.info("Initializing client to talk to the Kubernetes cluster")
os.environ["KUBECONFIG"] = str(kubeconfig_path)
kubecli.initialize_clients(kubeconfig_path)
# find node kraken might be running on
@@ -67,9 +69,9 @@ def main(cfg):
# Cluster info
logging.info("Fetching cluster info")
cluster_version = runcommand.invoke("kubectl get clusterversion")
cluster_version = runcommand.invoke("kubectl get clusterversion", 60)
cluster_info = runcommand.invoke(
"kubectl cluster-info | awk 'NR==1' | sed -r " "'s/\x1B\[([0-9]{1,3}(;[0-9]{1,2})?)?[mGK]//g'"
"kubectl cluster-info | awk 'NR==1' | sed -r " "'s/\x1B\[([0-9]{1,3}(;[0-9]{1,2})?)?[mGK]//g'", 60
) # noqa
logging.info("\n%s%s" % (cluster_version, cluster_info))
@@ -140,7 +142,7 @@ def main(cfg):
common_litmus.deploy_all_experiments(litmus_version)
litmus_installed = True
litmus_namespaces = common_litmus.run(
scenarios_list, config, litmus_namespaces, litmus_uninstall, wait_duration
scenarios_list, config, litmus_namespaces, litmus_uninstall, wait_duration,
)
elif scenario_type == "cluster_shut_down_scenarios":
shut_down.run(scenarios_list, config, wait_duration)