From cdf3bc03d2691c6960401e52cc7d853994e30ad3 Mon Sep 17 00:00:00 2001
From: Naga Ravi Chaitanya Elluri
Date: Tue, 28 Sep 2021 14:19:04 -0400
Subject: [PATCH] Add support to block traffic to an application

This commit enables users to simulate an application outage by blocking
its traffic for the specified duration, in order to see how the
application and the components communicating with it behave during the
downtime.
---
 README.md                             |  2 +
 config/config.yaml                    |  2 +
 config/config_performance.yaml        |  2 +
 docs/application_outages.md           | 17 +++++++++
 kraken/application_outage/__init__.py |  0
 kraken/application_outage/actions.py  | 56 ++++++++++++++++++++++++++++
 kraken/invoke/command.py              |  2 +
 requirements.txt                      |  1 +
 run_kraken.py                         |  6 +++
 scenarios/app_outage.yaml             |  5 +++
 10 files changed, 93 insertions(+)
 create mode 100644 docs/application_outages.md
 create mode 100644 kraken/application_outage/__init__.py
 create mode 100644 kraken/application_outage/actions.py
 create mode 100644 scenarios/app_outage.yaml

diff --git a/README.md b/README.md
index 702056d1..3aebb8e8 100644
--- a/README.md
+++ b/README.md
@@ -56,6 +56,8 @@ Instructions on how to setup the config and the options supported can be found a
 
 - [Zone Outage Scenarios](docs/zone_outage.md)
 
+- [Application Outage Scenarios](docs/application_outages.md)
+
 ### Kraken scenario pass/fail criteria and report
 
 It's important to make sure to check if the targeted component recovered from the chaos injection and also if the Kubernetes/OpenShift cluster is healthy as failures in one component can have an adverse impact on other components. Kraken does this by:
diff --git a/config/config.yaml b/config/config.yaml
index 25c92fb2..e8c4823c 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -34,6 +34,8 @@ kraken:
             - scenarios/post_action_namespace.py
         - zone_outages:
             - scenarios/zone_outage.yaml
+        - application_outages:
+            - scenarios/app_outage.yaml
 
 cerberus:
     cerberus_enabled: False                          # Enable it when cerberus is previously installed
diff --git a/config/config_performance.yaml b/config/config_performance.yaml
index 02570cea..27230ac9 100644
--- a/config/config_performance.yaml
+++ b/config/config_performance.yaml
@@ -27,6 +27,8 @@ kraken:
             - scenarios/ingress_namespace.yaml
         - zone_outages:
             - scenarios/zone_outage.yaml
+        - application_outages:
+            - scenarios/app_outage.yaml
 cerberus:
     cerberus_enabled: True                           # Enable it when cerberus is previously installed
     cerberus_url: http://0.0.0.0:8080                # When cerberus_enabled is set to True, provide the url where cerberus publishes go/no-go signal
diff --git a/docs/application_outages.md b/docs/application_outages.md
new file mode 100644
index 00000000..9f460ae4
--- /dev/null
+++ b/docs/application_outages.md
@@ -0,0 +1,17 @@
+### Application outages
+Scenario to block the traffic (Ingress/Egress) of an application matching the specified labels for the configured duration, to understand the behavior of the service and of the other services that depend on it during the downtime. This helps with planning the requirements accordingly, be it improving the timeouts or tweaking the alerts.
+
+##### Sample scenario config
+```
+application_outage:                                  # Scenario to create an outage of an application by blocking traffic
+  duration: 600                                      # Duration in seconds for which the traffic will be blocked
+  namespace:                                         # Namespace to target - all application routes will become inaccessible if the pod selector is empty
+  pod_selector: {app: foo}                           # Pods to target
+  block: [Ingress, Egress]                           # It can be Ingress, Egress, or both
+```
+
+##### Debugging steps in case of failures
+Kraken creates a network policy that blocks the ingress/egress traffic to create the outage. If Kraken fails before reverting the network policy, you can delete it manually with the following command to stop the outage:
+```
+$ oc delete networkpolicy/kraken-deny -n <namespace>
+```
diff --git a/kraken/application_outage/__init__.py b/kraken/application_outage/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/kraken/application_outage/actions.py b/kraken/application_outage/actions.py
new file mode 100644
index 00000000..717af97a
--- /dev/null
+++ b/kraken/application_outage/actions.py
@@ -0,0 +1,56 @@
+import yaml
+import logging
+import time
+import kraken.cerberus.setup as cerberus
+from jinja2 import Template
+import kraken.invoke.command as runcommand
+
+
+# Reads the scenario config, applies and deletes a network policy to
+# block the traffic for the specified duration
+def run(scenarios_list, config, wait_duration):
+    failed_post_scenarios = ""
+    for app_outage_config in scenarios_list:
+        if len(app_outage_config) > 1:
+            with open(app_outage_config, "r") as f:
+                app_outage_config_yaml = yaml.full_load(f)
+                scenario_config = app_outage_config_yaml["application_outage"]
+                pod_selector = scenario_config.get("pod_selector", "{}")
+                traffic_type = scenario_config.get("block", "[Ingress, Egress]")
+                namespace = scenario_config.get("namespace", "")
+                duration = scenario_config.get("duration", 60)
+
+                start_time = int(time.time())
+
+                network_policy_template = """---
+apiVersion: networking.k8s.io/v1
+kind: NetworkPolicy
+metadata:
+  name: kraken-deny
+spec:
+  podSelector:
+    matchLabels: {{ pod_selector }}
+  policyTypes: {{ traffic_type }}
+"""
+                t = Template(network_policy_template)
+                rendered_spec = t.render(pod_selector=pod_selector, traffic_type=traffic_type)
+                # Write the rendered template to a file
+                with open("kraken_network_policy.yaml", "w") as f:
+                    f.write(rendered_spec)
+                # Block the traffic by creating the network policy
+                logging.info("Creating the network policy")
+                runcommand.invoke("kubectl create -f %s -n %s" % ("kraken_network_policy.yaml", namespace))
+
+                # wait for the specified duration
+                logging.info("Waiting for the specified duration in the config: %s" % (duration))
+                time.sleep(duration)
+
+                # unblock the traffic by deleting the network policy
+                logging.info("Deleting the network policy")
+                runcommand.invoke("kubectl delete -f %s -n %s" % ("kraken_network_policy.yaml", namespace))
+
+                logging.info("End of scenario. Waiting for the specified duration: %s" % (wait_duration))
+                time.sleep(wait_duration)
+
+                end_time = int(time.time())
+                cerberus.publish_kraken_status(config, failed_post_scenarios, start_time, end_time)
diff --git a/kraken/invoke/command.py b/kraken/invoke/command.py
index c367671a..2dc99e53 100644
--- a/kraken/invoke/command.py
+++ b/kraken/invoke/command.py
@@ -1,5 +1,6 @@
 import subprocess
 import logging
+import sys
 
 
 # Invokes a given command and returns the stdout
@@ -9,6 +10,7 @@ def invoke(command, timeout=None):
         output = subprocess.check_output(command, shell=True, universal_newlines=True, timeout=timeout)
     except Exception as e:
         logging.error("Failed to run %s, error: %s" % (command, e))
+        sys.exit(1)
     return output
 
 
diff --git a/requirements.txt b/requirements.txt
index 6f98bec5..bc6b3482 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -18,3 +18,4 @@ openshift-client
 python-ipmi
 podman-compose
 docker-compose
+jinja2
diff --git a/run_kraken.py b/run_kraken.py
index 9b2349e1..b034cc2d 100644
--- a/run_kraken.py
+++ b/run_kraken.py
@@ -19,6 +19,7 @@ import kraken.shut_down.common_shut_down_func as shut_down
 import kraken.node_actions.run as nodeaction
 import kraken.kube_burner.client as kube_burner
 import kraken.zone_outage.actions as zone_outages
+import kraken.application_outage.actions as application_outage
 
 
 # Main function
@@ -169,6 +170,11 @@ def main(cfg):
                     logging.info("Inject zone outages")
                     zone_outages.run(scenarios_list, config, wait_duration)
 
+                # Application outages
+                elif scenario_type == "application_outages":
+                    logging.info("Injecting application outage")
+                    application_outage.run(scenarios_list, config, wait_duration)
+
         iteration += 1
         logging.info("")
 
diff --git a/scenarios/app_outage.yaml b/scenarios/app_outage.yaml
new file mode 100644
index 00000000..4a5d39ea
--- /dev/null
+++ b/scenarios/app_outage.yaml
@@ -0,0 +1,5 @@
+application_outage:                                  # Scenario to create an outage of an application by blocking traffic
+  duration: 600                                      # Duration in seconds for which the traffic will be blocked
+  namespace:                                         # Namespace to target - all application routes will become inaccessible if the pod selector is empty
+  pod_selector: {app: foo}                           # Pods to target
+  block: [Ingress, Egress]                           # It can be Ingress, Egress, or both
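
For reference, with the sample values above (`pod_selector: {app: foo}`, `block: [Ingress, Egress]`), the Jinja template in `kraken/application_outage/actions.py` would render roughly the following NetworkPolicy. This is a sketch of the object that `kubectl create -f kraken_network_policy.yaml -n <namespace>` applies, not output captured from an actual run; the quoted flow-style values come from Jinja stringifying the parsed Python dict and list.
```
---
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
  name: kraken-deny
spec:
  # Selects the pods to isolate; an empty matchLabels ({}) would select
  # every pod in the target namespace
  podSelector:
    matchLabels: {'app': 'foo'}
  # Both directions are blocked here; a single entry blocks only that direction
  policyTypes: ['Ingress', 'Egress']
```
Deleting this object, which Kraken does automatically after `duration` seconds (or which can be done manually with the `oc delete networkpolicy/kraken-deny` command documented above), restores the traffic.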
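
As a usage note, the scenario file can also target a single traffic direction. A minimal egress-only variant might look like the sketch below; the namespace and label values are hypothetical placeholders, not values taken from the patch.
```
application_outage:
  duration: 300                  # Block traffic for 5 minutes
  namespace: example-namespace   # Hypothetical namespace containing the application
  pod_selector: {app: foo}       # Label selector for the pods to isolate
  block: [Egress]                # Block only outgoing traffic; use [Ingress] or [Ingress, Egress] otherwise
```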