mirror of
https://github.com/krkn-chaos/krkn.git
synced 2026-04-15 06:57:28 +00:00
Improve error handling
This commit: - Adds timeout to avoid operations hanging for long durations. - Improves exception handling and exits wherever needed. - Sets KUBECONFIG env var globoally to access the cluster.
This commit is contained in:
@@ -20,8 +20,8 @@ Kraken indexes the metrics specified in the profile into Elasticsearch in additi
|
||||
|
||||
```
|
||||
$ cd kraken
|
||||
$ podman-compose up or $ docker-compose up # Spins up the containers specified in the docker-compose.yml file present in the run directory
|
||||
$ podman-compose down or $ docker-compose up # Delete the containers installed
|
||||
$ podman-compose up or $ docker-compose up # Spins up the containers specified in the docker-compose.yml file present in the run directory
|
||||
$ podman-compose down or $ docker-compose down # Delete the containers installed
|
||||
```
|
||||
This will manage the Cerberus and Elasticsearch containers on the host on which you are running Kraken.
|
||||
|
||||
|
||||
@@ -1,17 +1,16 @@
|
||||
import subprocess
|
||||
import logging
|
||||
import sys
|
||||
|
||||
|
||||
# Invokes a given command and returns the stdout
|
||||
def invoke(command):
|
||||
def invoke(command, timeout=None):
|
||||
try:
|
||||
output = subprocess.Popen(
|
||||
command, shell=True, universal_newlines=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT
|
||||
)
|
||||
(out, err) = output.communicate()
|
||||
output = subprocess.check_output(command, shell=True, universal_newlines=True, timeout=timeout)
|
||||
except Exception as e:
|
||||
logging.error("Failed to run %s, error: %s" % (command, e))
|
||||
return out
|
||||
sys.exit(1)
|
||||
return output
|
||||
|
||||
|
||||
def run(command):
|
||||
|
||||
@@ -13,8 +13,12 @@ kraken_node_name = ""
|
||||
# Load kubeconfig and initialize kubernetes python client
|
||||
def initialize_clients(kubeconfig_path):
|
||||
global cli
|
||||
config.load_kube_config(kubeconfig_path)
|
||||
cli = client.CoreV1Api()
|
||||
try:
|
||||
config.load_kube_config(kubeconfig_path)
|
||||
cli = client.CoreV1Api()
|
||||
except ApiException as e:
|
||||
logging.error("Failed to initialize kubernetes client: %s\n" % e)
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
# List all namespaces
|
||||
@@ -241,13 +245,17 @@ def find_kraken_node():
|
||||
|
||||
if kraken_pod_name:
|
||||
# get kraken-deployment pod, find node name
|
||||
runcommand.invoke("kubectl config set-context --current --namespace=" + str(kraken_project))
|
||||
pod_json_str = runcommand.invoke("kubectl get pods/" + str(kraken_pod_name) + " -o json")
|
||||
pod_json = json.loads(pod_json_str)
|
||||
node_name = pod_json["spec"]["nodeName"]
|
||||
try:
|
||||
runcommand.invoke("kubectl config set-context --current --namespace=" + str(kraken_project))
|
||||
pod_json_str = runcommand.invoke("kubectl get pods/" + str(kraken_pod_name) + " -o json")
|
||||
pod_json = json.loads(pod_json_str)
|
||||
node_name = pod_json["spec"]["nodeName"]
|
||||
|
||||
# Reset to the default project
|
||||
runcommand.invoke("kubectl config set-context --current --namespace=default")
|
||||
# Reset to the default project
|
||||
runcommand.invoke("kubectl config set-context --current --namespace=default")
|
||||
|
||||
global kraken_node_name
|
||||
kraken_node_name = node_name
|
||||
global kraken_node_name
|
||||
kraken_node_name = node_name
|
||||
except Exception as e:
|
||||
logging.info("%s" % (e))
|
||||
sys.exit(1)
|
||||
|
||||
@@ -33,11 +33,12 @@ def run(scenarios_list, config, litmus_namespaces, litmus_uninstall, wait_durati
|
||||
if experiment_result:
|
||||
logging.info("Scenario: %s has been successfully injected!" % item)
|
||||
else:
|
||||
logging.info("Scenario: %s was not successfully injected!" % item)
|
||||
logging.info("Scenario: %s was not successfully injected, please check" % item)
|
||||
if litmus_uninstall:
|
||||
for l_item in l_scenario:
|
||||
logging.info("item " + str(l_item))
|
||||
runcommand.invoke("kubectl delete -f %s" % l_item)
|
||||
sys.exit(1)
|
||||
if litmus_uninstall:
|
||||
for item in l_scenario:
|
||||
logging.info("item " + str(item))
|
||||
@@ -47,6 +48,7 @@ def run(scenarios_list, config, litmus_namespaces, litmus_uninstall, wait_durati
|
||||
cerberus.get_status(config)
|
||||
except Exception as e:
|
||||
logging.error("Failed to run litmus scenario: %s. Encountered " "the following exception: %s" % (item, e))
|
||||
sys.exit(1)
|
||||
return litmus_namespaces
|
||||
|
||||
|
||||
|
||||
@@ -11,32 +11,39 @@ import random
|
||||
|
||||
# Run pod based scenarios
|
||||
def run(kubeconfig_path, scenarios_list, config, failed_post_scenarios, wait_duration):
|
||||
try:
|
||||
# Loop to run the scenarios starts here
|
||||
for pod_scenario in scenarios_list:
|
||||
if len(pod_scenario) > 1:
|
||||
pre_action_output = post_actions.run(kubeconfig_path, pod_scenario[1])
|
||||
else:
|
||||
pre_action_output = ""
|
||||
# Loop to run the scenarios starts here
|
||||
for pod_scenario in scenarios_list:
|
||||
if len(pod_scenario) > 1:
|
||||
pre_action_output = post_actions.run(kubeconfig_path, pod_scenario[1])
|
||||
else:
|
||||
pre_action_output = ""
|
||||
try:
|
||||
scenario_logs = runcommand.invoke(
|
||||
"powerfulseal autonomous --use-pod-delete-instead-"
|
||||
"of-ssh-kill --policy-file %s --kubeconfig %s "
|
||||
"--no-cloud --inventory-kubernetes --headless" % (pod_scenario[0], kubeconfig_path)
|
||||
)
|
||||
except Exception as e:
|
||||
logging.error(
|
||||
"Failed to run scenario: %s. Encountered the following " "exception: %s" % (pod_scenario[0], e)
|
||||
)
|
||||
sys.exit(1)
|
||||
|
||||
# Display pod scenario logs/actions
|
||||
print(scenario_logs)
|
||||
# Display pod scenario logs/actions
|
||||
print(scenario_logs)
|
||||
|
||||
logging.info("Scenario: %s has been successfully injected!" % (pod_scenario[0]))
|
||||
logging.info("Waiting for the specified duration: %s" % (wait_duration))
|
||||
time.sleep(wait_duration)
|
||||
logging.info("Scenario: %s has been successfully injected!" % (pod_scenario[0]))
|
||||
logging.info("Waiting for the specified duration: %s" % (wait_duration))
|
||||
time.sleep(wait_duration)
|
||||
|
||||
try:
|
||||
failed_post_scenarios = post_actions.check_recovery(
|
||||
kubeconfig_path, pod_scenario, failed_post_scenarios, pre_action_output
|
||||
)
|
||||
cerberus.publish_kraken_status(config, failed_post_scenarios)
|
||||
except Exception as e:
|
||||
logging.error("Failed to run scenario: %s. Encountered the following " "exception: %s" % (pod_scenario[0], e))
|
||||
except Exception as e:
|
||||
logging.error("Failed to run post action checks: %s" % e)
|
||||
sys.exit(1)
|
||||
return failed_post_scenarios
|
||||
|
||||
|
||||
|
||||
@@ -45,13 +45,13 @@ def run(kubeconfig_path, scenario, pre_action_output=""):
|
||||
|
||||
# Perform the post scenario actions to see if components recovered
|
||||
def check_recovery(kubeconfig_path, scenario, failed_post_scenarios, pre_action_output):
|
||||
|
||||
for failed_scenario in failed_post_scenarios:
|
||||
post_action_output = run(kubeconfig_path, failed_scenario[0], failed_scenario[1])
|
||||
if post_action_output is not False:
|
||||
failed_post_scenarios.remove(failed_scenario)
|
||||
else:
|
||||
logging.info("Post action scenario " + str(failed_scenario) + "is still failing")
|
||||
if failed_post_scenarios:
|
||||
for failed_scenario in failed_post_scenarios:
|
||||
post_action_output = run(kubeconfig_path, failed_scenario[0], failed_scenario[1])
|
||||
if post_action_output is not False:
|
||||
failed_post_scenarios.remove(failed_scenario)
|
||||
else:
|
||||
logging.info("Post action scenario " + str(failed_scenario) + "is still failing")
|
||||
|
||||
# check post actions
|
||||
if len(scenario) > 1:
|
||||
|
||||
@@ -58,8 +58,10 @@ def main(cfg):
|
||||
|
||||
# Initialize clients
|
||||
if not os.path.isfile(kubeconfig_path):
|
||||
kubeconfig_path = None
|
||||
logging.error("Cannot read the kubeconfig file at %s, please check" % kubeconfig_path)
|
||||
sys.exit(1)
|
||||
logging.info("Initializing client to talk to the Kubernetes cluster")
|
||||
os.environ["KUBECONFIG"] = str(kubeconfig_path)
|
||||
kubecli.initialize_clients(kubeconfig_path)
|
||||
|
||||
# find node kraken might be running on
|
||||
@@ -67,9 +69,9 @@ def main(cfg):
|
||||
|
||||
# Cluster info
|
||||
logging.info("Fetching cluster info")
|
||||
cluster_version = runcommand.invoke("kubectl get clusterversion")
|
||||
cluster_version = runcommand.invoke("kubectl get clusterversion", 60)
|
||||
cluster_info = runcommand.invoke(
|
||||
"kubectl cluster-info | awk 'NR==1' | sed -r " "'s/\x1B\[([0-9]{1,3}(;[0-9]{1,2})?)?[mGK]//g'"
|
||||
"kubectl cluster-info | awk 'NR==1' | sed -r " "'s/\x1B\[([0-9]{1,3}(;[0-9]{1,2})?)?[mGK]//g'", 60
|
||||
) # noqa
|
||||
logging.info("\n%s%s" % (cluster_version, cluster_info))
|
||||
|
||||
@@ -140,7 +142,7 @@ def main(cfg):
|
||||
common_litmus.deploy_all_experiments(litmus_version)
|
||||
litmus_installed = True
|
||||
litmus_namespaces = common_litmus.run(
|
||||
scenarios_list, config, litmus_namespaces, litmus_uninstall, wait_duration
|
||||
scenarios_list, config, litmus_namespaces, litmus_uninstall, wait_duration,
|
||||
)
|
||||
elif scenario_type == "cluster_shut_down_scenarios":
|
||||
shut_down.run(scenarios_list, config, wait_duration)
|
||||
|
||||
Reference in New Issue
Block a user