Improve error handling

This commit: - Adds timeout to avoid operations hanging for long durations. - Improves exception handling and exits wherever needed. - Sets KUBECONFIG env var globoally to access the cluster.
2026-04-15 06:57:28 +00:00 · 2021-07-16 23:49:47 -04:00
parent f051c1c30f
commit c0b9cb46da
7 changed files with 62 additions and 44 deletions
--- a/README.md
+++ b/README.md
@@ -20,8 +20,8 @@ Kraken indexes the metrics specified in the profile into Elasticsearch in additi

 ```
 $ cd kraken
-$ podman-compose up or $ docker-compose up    # Spins up the containers specified in the docker-compose.yml file present in the run directory
-$ podman-compose down or $ docker-compose up  # Delete the containers installed
+$ podman-compose up or $ docker-compose up      # Spins up the containers specified in the docker-compose.yml file present in the run directory
+$ podman-compose down or $ docker-compose down  # Delete the containers installed
 ```
 This will manage the Cerberus and Elasticsearch containers on the host on which you are running Kraken.

--- a/kraken/invoke/command.py
+++ b/kraken/invoke/command.py
@@ -1,17 +1,16 @@
 import subprocess
 import logging
+import sys


 # Invokes a given command and returns the stdout
-def invoke(command):
+def invoke(command, timeout=None):
    try:
-        output = subprocess.Popen(
-            command, shell=True, universal_newlines=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT
-        )
-        (out, err) = output.communicate()
+        output = subprocess.check_output(command, shell=True, universal_newlines=True, timeout=timeout)
    except Exception as e:
        logging.error("Failed to run %s, error: %s" % (command, e))
-    return out
+        sys.exit(1)
+    return output


 def run(command):
--- a/kraken/kubernetes/client.py
+++ b/kraken/kubernetes/client.py
@@ -13,8 +13,12 @@ kraken_node_name = ""
 # Load kubeconfig and initialize kubernetes python client
 def initialize_clients(kubeconfig_path):
    global cli
-    config.load_kube_config(kubeconfig_path)
-    cli = client.CoreV1Api()
+    try:
+        config.load_kube_config(kubeconfig_path)
+        cli = client.CoreV1Api()
+    except ApiException as e:
+        logging.error("Failed to initialize kubernetes client: %s\n" % e)
+        sys.exit(1)


 # List all namespaces
@@ -241,13 +245,17 @@ def find_kraken_node():

    if kraken_pod_name:
        # get kraken-deployment pod, find node name
-        runcommand.invoke("kubectl config set-context --current --namespace=" + str(kraken_project))
-        pod_json_str = runcommand.invoke("kubectl get pods/" + str(kraken_pod_name) + " -o json")
-        pod_json = json.loads(pod_json_str)
-        node_name = pod_json["spec"]["nodeName"]
+        try:
+            runcommand.invoke("kubectl config set-context --current --namespace=" + str(kraken_project))
+            pod_json_str = runcommand.invoke("kubectl get pods/" + str(kraken_pod_name) + " -o json")
+            pod_json = json.loads(pod_json_str)
+            node_name = pod_json["spec"]["nodeName"]

-        # Reset to the default project
-        runcommand.invoke("kubectl config set-context --current --namespace=default")
+            # Reset to the default project
+            runcommand.invoke("kubectl config set-context --current --namespace=default")

-        global kraken_node_name
-        kraken_node_name = node_name
+            global kraken_node_name
+            kraken_node_name = node_name
+        except Exception as e:
+            logging.info("%s" % (e))
+            sys.exit(1)
--- a/kraken/litmus/common_litmus.py
+++ b/kraken/litmus/common_litmus.py
@@ -33,11 +33,12 @@ def run(scenarios_list, config, litmus_namespaces, litmus_uninstall, wait_durati
                        if experiment_result:
                            logging.info("Scenario: %s has been successfully injected!" % item)
                        else:
-                            logging.info("Scenario: %s was not successfully injected!" % item)
+                            logging.info("Scenario: %s was not successfully injected, please check" % item)
                            if litmus_uninstall:
                                for l_item in l_scenario:
                                    logging.info("item " + str(l_item))
                                    runcommand.invoke("kubectl delete -f %s" % l_item)
+                            sys.exit(1)
            if litmus_uninstall:
                for item in l_scenario:
                    logging.info("item " + str(item))
@@ -47,6 +48,7 @@ def run(scenarios_list, config, litmus_namespaces, litmus_uninstall, wait_durati
            cerberus.get_status(config)
        except Exception as e:
            logging.error("Failed to run litmus scenario: %s. Encountered " "the following exception: %s" % (item, e))
+            sys.exit(1)
    return litmus_namespaces


--- a/kraken/pod_scenarios/setup.py
+++ b/kraken/pod_scenarios/setup.py
@@ -11,32 +11,39 @@ import random

 # Run pod based scenarios
 def run(kubeconfig_path, scenarios_list, config, failed_post_scenarios, wait_duration):
-    try:
-        # Loop to run the scenarios starts here
-        for pod_scenario in scenarios_list:
-            if len(pod_scenario) > 1:
-                pre_action_output = post_actions.run(kubeconfig_path, pod_scenario[1])
-            else:
-                pre_action_output = ""
+    # Loop to run the scenarios starts here
+    for pod_scenario in scenarios_list:
+        if len(pod_scenario) > 1:
+            pre_action_output = post_actions.run(kubeconfig_path, pod_scenario[1])
+        else:
+            pre_action_output = ""
+        try:
            scenario_logs = runcommand.invoke(
                "powerfulseal autonomous --use-pod-delete-instead-"
                "of-ssh-kill --policy-file %s --kubeconfig %s "
                "--no-cloud --inventory-kubernetes --headless" % (pod_scenario[0], kubeconfig_path)
            )
+        except Exception as e:
+            logging.error(
+                "Failed to run scenario: %s. Encountered the following " "exception: %s" % (pod_scenario[0], e)
+            )
+            sys.exit(1)

-            # Display pod scenario logs/actions
-            print(scenario_logs)
+        # Display pod scenario logs/actions
+        print(scenario_logs)

-            logging.info("Scenario: %s has been successfully injected!" % (pod_scenario[0]))
-            logging.info("Waiting for the specified duration: %s" % (wait_duration))
-            time.sleep(wait_duration)
+        logging.info("Scenario: %s has been successfully injected!" % (pod_scenario[0]))
+        logging.info("Waiting for the specified duration: %s" % (wait_duration))
+        time.sleep(wait_duration)

+        try:
            failed_post_scenarios = post_actions.check_recovery(
                kubeconfig_path, pod_scenario, failed_post_scenarios, pre_action_output
            )
            cerberus.publish_kraken_status(config, failed_post_scenarios)
-    except Exception as e:
-        logging.error("Failed to run scenario: %s. Encountered the following " "exception: %s" % (pod_scenario[0], e))
+        except Exception as e:
+            logging.error("Failed to run post action checks: %s" % e)
+            sys.exit(1)
    return failed_post_scenarios


--- a/kraken/post_actions/actions.py
+++ b/kraken/post_actions/actions.py
@@ -45,13 +45,13 @@ def run(kubeconfig_path, scenario, pre_action_output=""):

 # Perform the post scenario actions to see if components recovered
 def check_recovery(kubeconfig_path, scenario, failed_post_scenarios, pre_action_output):
-
-    for failed_scenario in failed_post_scenarios:
-        post_action_output = run(kubeconfig_path, failed_scenario[0], failed_scenario[1])
-        if post_action_output is not False:
-            failed_post_scenarios.remove(failed_scenario)
-        else:
-            logging.info("Post action scenario " + str(failed_scenario) + "is still failing")
+    if failed_post_scenarios:
+        for failed_scenario in failed_post_scenarios:
+            post_action_output = run(kubeconfig_path, failed_scenario[0], failed_scenario[1])
+            if post_action_output is not False:
+                failed_post_scenarios.remove(failed_scenario)
+            else:
+                logging.info("Post action scenario " + str(failed_scenario) + "is still failing")

    # check post actions
    if len(scenario) > 1:
--- a/run_kraken.py
+++ b/run_kraken.py
@@ -58,8 +58,10 @@ def main(cfg):

        # Initialize clients
        if not os.path.isfile(kubeconfig_path):
-            kubeconfig_path = None
+            logging.error("Cannot read the kubeconfig file at %s, please check" % kubeconfig_path)
+            sys.exit(1)
        logging.info("Initializing client to talk to the Kubernetes cluster")
+        os.environ["KUBECONFIG"] = str(kubeconfig_path)
        kubecli.initialize_clients(kubeconfig_path)

        # find node kraken might be running on
@@ -67,9 +69,9 @@ def main(cfg):

        # Cluster info
        logging.info("Fetching cluster info")
-        cluster_version = runcommand.invoke("kubectl get clusterversion")
+        cluster_version = runcommand.invoke("kubectl get clusterversion", 60)
        cluster_info = runcommand.invoke(
-            "kubectl cluster-info | awk 'NR==1' | sed -r " "'s/\x1B\[([0-9]{1,3}(;[0-9]{1,2})?)?[mGK]//g'"
+            "kubectl cluster-info | awk 'NR==1' | sed -r " "'s/\x1B\[([0-9]{1,3}(;[0-9]{1,2})?)?[mGK]//g'", 60
        )  # noqa
        logging.info("\n%s%s" % (cluster_version, cluster_info))

@@ -140,7 +142,7 @@ def main(cfg):
                                common_litmus.deploy_all_experiments(litmus_version)
                                litmus_installed = True
                            litmus_namespaces = common_litmus.run(
-                                scenarios_list, config, litmus_namespaces, litmus_uninstall, wait_duration
+                                scenarios_list, config, litmus_namespaces, litmus_uninstall, wait_duration,
                            )
                        elif scenario_type == "cluster_shut_down_scenarios":
                            shut_down.run(scenarios_list, config, wait_duration)