From 0fc82090f2c564f8b513aa3cbd9f321d11069107 Mon Sep 17 00:00:00 2001 From: prubenda Date: Wed, 10 Jun 2020 09:09:28 -0400 Subject: [PATCH] Adding watch to see if components recovered --- config/config.yaml | 14 +- kraken/invoke/command.py | 12 +- kraken/kubernetes/client.py | 1 + run_kraken.py | 137 +++++++++++++++--- scenarios/etcd.yml | 3 - scenarios/openshift-apiserver.yml | 0 scenarios/openshift-kube-apiserver.yml | 1 - scenarios/post_action_etcd.yml | 21 +++ scenarios/post_action_etcd_example.sh | 3 + scenarios/post_action_etcd_example_py.py | 23 +++ scenarios/post_action_openshift-apiserver.yml | 23 +++ .../post_action_openshift-kube-apiserver.yml | 21 +++ scenarios/post_action_regex.py | 68 +++++++++ scenarios/post_action_regex.sh | 11 ++ .../post_action_regex_openshift_pod_kill.yml | 18 +++ scenarios/regex_openshift_pod_kill.yml | 0 16 files changed, 321 insertions(+), 35 deletions(-) mode change 100644 => 100755 scenarios/etcd.yml mode change 100644 => 100755 scenarios/openshift-apiserver.yml mode change 100644 => 100755 scenarios/openshift-kube-apiserver.yml create mode 100755 scenarios/post_action_etcd.yml create mode 100755 scenarios/post_action_etcd_example.sh create mode 100755 scenarios/post_action_etcd_example_py.py create mode 100755 scenarios/post_action_openshift-apiserver.yml create mode 100755 scenarios/post_action_openshift-kube-apiserver.yml create mode 100755 scenarios/post_action_regex.py create mode 100755 scenarios/post_action_regex.sh create mode 100755 scenarios/post_action_regex_openshift_pod_kill.yml mode change 100644 => 100755 scenarios/regex_openshift_pod_kill.yml diff --git a/config/config.yaml b/config/config.yaml index 8ee29990..38528810 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -1,11 +1,15 @@ kraken: kubeconfig_path: /root/.kube/config # Path to kubeconfig + exit_on_failure: False # Exit when a post action scenario fails scenarios: # List of policies/chaos scenarios to load - - scenarios/etcd.yml - - scenarios/openshift-kube-apiserver.yml - - scenarios/openshift-apiserver.yml - - scenarios/regex_openshift_pod_kill.yml - + - - scenarios/etcd.yml + - scenarios/post_action_etcd_example.sh + - - scenarios/openshift-kube-apiserver.yml + - scenarios/post_action_openshift-kube-apiserver.yml + - - scenarios/openshift-apiserver.yml + - scenarios/post_action_openshift-apiserver.yml + - - scenarios/regex_openshift_pod_kill.yml + - scenarios/post_action_regex.py cerberus: cerberus_enabled: False # Enable it when cerberus is previously installed cerberus_url: # When cerberus_enabled is set to True, provide the url where cerberus publishes go/no-go signal diff --git a/kraken/invoke/command.py b/kraken/invoke/command.py index 152875ff..c624a9e1 100644 --- a/kraken/invoke/command.py +++ b/kraken/invoke/command.py @@ -5,8 +5,10 @@ import logging # Invokes a given command and returns the stdout def invoke(command): try: - output = subprocess.check_output(command, shell=True, - universal_newlines=True) - except Exception: - logging.error("Failed to run %s" % (command)) - return output + output = subprocess.Popen(command, shell=True, + universal_newlines=True, stdout=subprocess.PIPE, + stderr=subprocess.STDOUT) + (out, err) = output.communicate() + except Exception as e: + logging.error("Failed to run %s, error: %s" % (command, e)) + return out diff --git a/kraken/kubernetes/client.py b/kraken/kubernetes/client.py index 2a180843..2bc805ed 100644 --- a/kraken/kubernetes/client.py +++ b/kraken/kubernetes/client.py @@ -4,6 +4,7 @@ import logging import kraken.invoke.command as runcommand import json + kraken_node_name = "" diff --git a/run_kraken.py b/run_kraken.py index 9be279c6..12eee900 100644 --- a/run_kraken.py +++ b/run_kraken.py @@ -12,6 +12,109 @@ import kraken.invoke.command as runcommand import pyfiglet +# Get cerberus status +def cerberus_integration(config): + cerberus_status = True + if config["cerberus"]["cerberus_enabled"]: + cerberus_url = config["cerberus"]["cerberus_url"] + if not cerberus_url: + logging.error("url where Cerberus publishes True/False signal is not provided.") + sys.exit(1) + cerberus_status = requests.get(cerberus_url).content + cerberus_status = True if cerberus_status == b'True' else False + if not cerberus_status: + logging.error("Received a no-go signal from Cerberus, looks like " + "the cluster is unhealthy. Please check the Cerberus " + "report for more details. Test failed.") + sys.exit(1) + else: + logging.info("Received a go signal from Ceberus, the cluster is healthy. " + "Test passed.") + return cerberus_status + + +# Function to publish kraken status to cerberus +def publish_kraken_status(config, failed_post_scenarios): + cerberus_status = cerberus_integration(config) + if not cerberus_status: + if failed_post_scenarios: + if config['kraken']['exit_on_failure']: + logging.info("Cerberus status is not healthy and post action scenarios " + "are still failing, exiting kraken run") + sys.exit(1) + else: + logging.info("Cerberus status is not healthy and post action scenarios " + "are still failing") + else: + + if failed_post_scenarios: + if config['kraken']['exit_on_failure']: + logging.info("Cerberus status is healthy but post action scenarios " + "are still failing, exiting kraken run") + sys.exit(1) + else: + logging.info("Cerberus status is healthy but post action scenarios " + "are still failing") + + +def run_post_action(kubeconfig_path, scenario, pre_action_output=""): + + if scenario.endswith(".yaml") or scenario.endswith(".yml"): + action_output = runcommand.invoke("powerfulseal autonomous " + "--use-pod-delete-instead-of-ssh-kill" + " --policy-file %s --kubeconfig %s --no-cloud" + " --inventory-kubernetes --headless" + % (scenario, kubeconfig_path)) + # read output to make sure no error + if "ERROR" in action_output: + action_output.split("ERROR")[1].split('\n')[0] + if not pre_action_output: + logging.info("Powerful seal pre action check failed for " + str(scenario)) + return False + else: + logging.info(scenario + " post action checks passed") + + elif scenario.endswith(".py"): + action_output = runcommand.invoke("python3 " + scenario).strip() + if pre_action_output: + if pre_action_output == action_output: + logging.info(scenario + " post action checks passed") + else: + logging.info(scenario + ' post action response did not match pre check output') + return False + else: + # invoke custom bash script + action_output = runcommand.invoke(scenario).strip() + if pre_action_output: + if pre_action_output == action_output: + logging.info(scenario + " post action checks passed") + else: + logging.info(scenario + ' post action response did not match pre check output') + return False + + return action_output + + +# Perform the post scenario actions to see if components recovered +def post_actions(kubeconfig_path, scenario, failed_post_scenarios, pre_action_output): + + for failed_scenario in failed_post_scenarios: + post_action_output = run_post_action(kubeconfig_path, + failed_scenario[0], failed_scenario[1]) + if post_action_output is not False: + failed_post_scenarios.remove(failed_scenario) + else: + logging.info('Post action scenario ' + str(failed_scenario) + "is still failing") + + # check post actions + if len(scenario) > 1: + post_action_output = run_post_action(kubeconfig_path, scenario[1], pre_action_output) + if post_action_output is False: + failed_post_scenarios.append([scenario[1], pre_action_output]) + + return failed_post_scenarios + + # Main function def main(cfg): # Start kraken @@ -24,7 +127,6 @@ def main(cfg): config = yaml.full_load(f) kubeconfig_path = config["kraken"]["kubeconfig_path"] scenarios = config["kraken"]["scenarios"] - cerberus_enabled = config["cerberus"]["cerberus_enabled"] wait_duration = config["tunings"]["wait_duration"] iterations = config["tunings"]["iterations"] daemon_mode = config["tunings"]['daemon_mode'] @@ -59,41 +161,34 @@ def main(cfg): % str(iterations)) iterations = int(iterations) + failed_post_scenarios = [] # Loop to run the chaos starts here while (int(iteration) < iterations): # Inject chaos scenarios specified in the config + logging.info("Executing scenarios for iteration " + str(iteration)) try: # Loop to run the scenarios starts here for scenario in scenarios: - logging.info("Injecting scenario: %s" % (scenario)) + pre_action_output = run_post_action(kubeconfig_path, scenario[1]) runcommand.invoke("powerfulseal autonomous --use-pod-delete-instead-of-ssh-kill" " --policy-file %s --kubeconfig %s --no-cloud" " --inventory-kubernetes --headless" - % (scenario, kubeconfig_path)) - logging.info("Scenario: %s has been successfully injected!" % (scenario)) + % (scenario[0], kubeconfig_path)) - if cerberus_enabled: - cerberus_url = config["cerberus"]["cerberus_url"] - if not cerberus_url: - logging.error("url where Cerberus publishes True/False signal " - "is not provided.") - sys.exit(1) - cerberus_status = requests.get(cerberus_url).content - cerberus_status = True if cerberus_status == b'True' else False - if not cerberus_status: - logging.error("Received a no-go signal from Cerberus, looks like the" - " cluster is unhealthy. Please check the Cerberus report" - " for more details. Test failed.") - sys.exit(1) - else: - logging.info("Received a go signal from Ceberus, the cluster is " - "healthy. Test passed.") + logging.info("Scenario: %s has been successfully injected!" % (scenario[0])) logging.info("Waiting for the specified duration: %s" % (wait_duration)) time.sleep(wait_duration) + failed_post_scenarios = post_actions(kubeconfig_path, scenario, + failed_post_scenarios, pre_action_output) + publish_kraken_status(config, failed_post_scenarios) except Exception as e: logging.error("Failed to run scenario: %s. Encountered the following exception: %s" - % (scenario, e)) + % (scenario[0], e)) iteration += 1 + logging.info("") + if failed_post_scenarios: + logging.error("Post scenarios are still failing at the end of all iterations") + sys.exit(1) else: logging.error("Cannot find a config at %s, please check" % (cfg)) sys.exit(1) diff --git a/scenarios/etcd.yml b/scenarios/etcd.yml old mode 100644 new mode 100755 index 2eb303ae..a0e579b4 --- a/scenarios/etcd.yml +++ b/scenarios/etcd.yml @@ -11,12 +11,9 @@ scenarios: - labels: namespace: "openshift-etcd" selector: "k8s-app=etcd" - filters: - randomSample: size: 1 - - # The actions will be executed in the order specified actions: - kill: probability: 1 diff --git a/scenarios/openshift-apiserver.yml b/scenarios/openshift-apiserver.yml old mode 100644 new mode 100755 diff --git a/scenarios/openshift-kube-apiserver.yml b/scenarios/openshift-kube-apiserver.yml old mode 100644 new mode 100755 index 7bd4039c..94c72c5d --- a/scenarios/openshift-kube-apiserver.yml +++ b/scenarios/openshift-kube-apiserver.yml @@ -11,7 +11,6 @@ scenarios: - labels: namespace: "openshift-kube-apiserver" selector: "app=openshift-kube-apiserver" - filters: - randomSample: size: 1 diff --git a/scenarios/post_action_etcd.yml b/scenarios/post_action_etcd.yml new file mode 100755 index 00000000..6a472e26 --- /dev/null +++ b/scenarios/post_action_etcd.yml @@ -0,0 +1,21 @@ +config: + runStrategy: + runs: 1 + maxSecondsBetweenRuns: 10 + minSecondsBetweenRuns: 1 +scenarios: + - name: "check 3 pods are in namespace with selector: etcd" + steps: + - podAction: + matches: + - labels: + namespace: "openshift-etcd" + selector: "k8s-app=etcd" + filters: + - property: + name: "state" + value: "Running" + # The actions will be executed in the order specified + actions: + - checkPodCount: + count: 3 \ No newline at end of file diff --git a/scenarios/post_action_etcd_example.sh b/scenarios/post_action_etcd_example.sh new file mode 100755 index 00000000..f122291b --- /dev/null +++ b/scenarios/post_action_etcd_example.sh @@ -0,0 +1,3 @@ +#!/bin/bash +pods="$(oc get pods -n openshift-etcd | grep -c Running)" +echo "$pods" diff --git a/scenarios/post_action_etcd_example_py.py b/scenarios/post_action_etcd_example_py.py new file mode 100755 index 00000000..1d840625 --- /dev/null +++ b/scenarios/post_action_etcd_example_py.py @@ -0,0 +1,23 @@ +#!/usr/bin/env python3 +import subprocess +import logging + + +def run(cmd): + try: + output = subprocess.Popen(cmd, shell=True, + universal_newlines=True, stdout=subprocess.PIPE, + stderr=subprocess.STDOUT) + (out, err) = output.communicate() + logging.info("out " + str(out)) + except Exception as e: + logging.error("Failed to run %s, error: %s" % (cmd, e)) + return out + + +pods_running = run("oc get pods -n openshift-etcd | grep -c Running").rstrip() + +if pods_running == str(3): + print("There were 3 pods running properly") +else: + print("ERROR there were " + str(pods_running) + " pods running instead of 3") diff --git a/scenarios/post_action_openshift-apiserver.yml b/scenarios/post_action_openshift-apiserver.yml new file mode 100755 index 00000000..938b74c2 --- /dev/null +++ b/scenarios/post_action_openshift-apiserver.yml @@ -0,0 +1,23 @@ +config: + runStrategy: + runs: 1 + maxSecondsBetweenRuns: 30 + minSecondsBetweenRuns: 1 +scenarios: + - name: "check 3 pods are in namespace with selector: openshift-apiserver" + steps: + - podAction: + matches: + - labels: + namespace: "openshift-apiserver" + selector: "app=openshift-apiserver" + + filters: + - property: + name: "state" + value: "Running" + + # The actions will be executed in the order specified + actions: + - checkPodCount: + count: 3 diff --git a/scenarios/post_action_openshift-kube-apiserver.yml b/scenarios/post_action_openshift-kube-apiserver.yml new file mode 100755 index 00000000..7487661b --- /dev/null +++ b/scenarios/post_action_openshift-kube-apiserver.yml @@ -0,0 +1,21 @@ +config: + runStrategy: + runs: 1 + maxSecondsBetweenRuns: 30 + minSecondsBetweenRuns: 1 +scenarios: + - name: "check 3 pods are in namespace with selector: openshift-kube-apiserver" + steps: + - podAction: + matches: + - labels: + namespace: "openshift-kube-apiserver" + selector: "app=openshift-kube-apiserver" + filters: + - property: + name: "state" + value: "Running" + # The actions will be executed in the order specified + actions: + - checkPodCount: + count: 3 diff --git a/scenarios/post_action_regex.py b/scenarios/post_action_regex.py new file mode 100755 index 00000000..ce12ab8c --- /dev/null +++ b/scenarios/post_action_regex.py @@ -0,0 +1,68 @@ +#!/usr/bin/env python3 +import subprocess +import re +import sys +from kubernetes import client, config +from kubernetes.client.rest import ApiException +import logging + + +# List all namespaces +def list_namespaces(): + namespaces = [] + try: + config.load_kube_config() + cli = client.CoreV1Api() + ret = cli.list_namespace(pretty=True) + except ApiException as e: + logging.error("Exception when calling \ + CoreV1Api->list_namespaced_pod: %s\n" % e) + for namespace in ret.items: + namespaces.append(namespace.metadata.name) + return namespaces + + +# Check if all the watch_namespaces are valid +def check_namespaces(namespaces): + try: + valid_namespaces = list_namespaces() + regex_namespaces = set(namespaces) - set(valid_namespaces) + final_namespaces = set(namespaces) - set(regex_namespaces) + valid_regex = set() + if regex_namespaces: + for namespace in valid_namespaces: + for regex_namespace in regex_namespaces: + if re.search(regex_namespace, namespace): + final_namespaces.add(namespace) + valid_regex.add(regex_namespace) + break + invalid_namespaces = regex_namespaces - valid_regex + if invalid_namespaces: + raise Exception("There exists no namespaces matching: %s" % (invalid_namespaces)) + return list(final_namespaces) + except Exception as e: + logging.error("%s" % (e)) + sys.exit(1) + + +def run(cmd): + try: + output = subprocess.Popen(cmd, shell=True, + universal_newlines=True, stdout=subprocess.PIPE, + stderr=subprocess.STDOUT) + (out, err) = output.communicate() + except Exception as e: + logging.error("Failed to run %s, error: %s" % (cmd, e)) + return out + + +regex_namespace = ["openshift-.*"] +namespaces = check_namespaces(regex_namespace) +pods_running = 0 +for namespace in namespaces: + new_pods_running = run("oc get pods -n " + namespace + " | grep -c Running").rstrip() + try: + pods_running += int(new_pods_running) + except Exception: + continue +print(pods_running) diff --git a/scenarios/post_action_regex.sh b/scenarios/post_action_regex.sh new file mode 100755 index 00000000..10626cc0 --- /dev/null +++ b/scenarios/post_action_regex.sh @@ -0,0 +1,11 @@ +#!/bin/bash +pods="$(oc get pods -n openshift-etcd | grep -c Running)" +echo "$pods" + +if [ "$pods" -eq 3 ] +then + echo "Pods Pass" +else + # need capital error for proper error catching in run_kraken + echo "ERROR pod count $pods doesnt match 3 expected pods" +fi diff --git a/scenarios/post_action_regex_openshift_pod_kill.yml b/scenarios/post_action_regex_openshift_pod_kill.yml new file mode 100755 index 00000000..ba011b75 --- /dev/null +++ b/scenarios/post_action_regex_openshift_pod_kill.yml @@ -0,0 +1,18 @@ +config: + runStrategy: + runs: 1 + maxSecondsBetweenRuns: 30 + minSecondsBetweenRuns: 1 +scenarios: + - name: kill up to 3 pods in any openshift namespace + steps: + - podAction: + matches: + - namespace: "openshift-.*" + filters: + - property: + name: "state" + value: "Running" + actions: + - checkPodCount: + count: 146 diff --git a/scenarios/regex_openshift_pod_kill.yml b/scenarios/regex_openshift_pod_kill.yml old mode 100644 new mode 100755