From 6456eec76af66b5acfe240a5aa06f83ff1ca5a96 Mon Sep 17 00:00:00 2001
From: Naga Ravi Chaitanya Elluri <nelluri@redhat.com>
Date: Mon, 9 Aug 2021 20:44:29 -0400
Subject: [PATCH] Add zone outage scenarios

This commit adds support to create a zone outage in AWS by denying both
ingress and egress traffic to the instances in the subnet of the targeted
zone by tweaking the network acl. This creates an outage of all the nodes
in the zone - both masters and workers.
---
 README.md                                 |  2 +
 config/config.yaml                        |  2 +
 config/config_performance.yaml            |  6 ++-
 docs/zone_outage.md                       | 25 ++++++++++
 kraken/node_actions/aws_node_scenarios.py | 60 +++++++++++++++++++++++
 kraken/zone_outage/__init__.py            |  0
 kraken/zone_outage/actions.py             | 57 +++++++++++++++++++++
 run_kraken.py                             |  6 +++
 scenarios/zone_outage.yaml                |  5 ++
 9 files changed, 161 insertions(+), 2 deletions(-)
 create mode 100644 docs/zone_outage.md
 create mode 100644 kraken/zone_outage/__init__.py
 create mode 100644 kraken/zone_outage/actions.py
 create mode 100644 scenarios/zone_outage.yaml

diff --git a/README.md b/README.md
index 2b12dc52..fcc95ec7 100644
--- a/README.md
+++ b/README.md
@@ -49,6 +49,8 @@ Kraken supports pod, node, time/date and [litmus](https://github.com/litmuschaos
 
 - [Namespace Scenarios](docs/namespace_scenarios.md)
 
+- [Zone Outage Scenarios](docs/zone_outage.md)
+
 
 ### Kraken scenario pass/fail criteria and report
 It's important to make sure to check if the targeted component recovered from the chaos injection and also if the Kubernetes/OpenShift cluster is healthy as failures in one component can have an adverse impact on other components. Kraken does this by:
diff --git a/config/config.yaml b/config/config.yaml
index f6f4c186..273f5c83 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -27,6 +27,8 @@ kraken:
         -   namespace_scenarios:
             - scenarios/regex_namespace.yaml
             - scenarios/ingress_namespace.yaml
+        -   zone_outages:
+            - scenarios/zone_outage.yaml
 
 cerberus:
     cerberus_enabled: False                                # Enable it when cerberus is previously installed
diff --git a/config/config_performance.yaml b/config/config_performance.yaml
index c6e35fed..02570cea 100644
--- a/config/config_performance.yaml
+++ b/config/config_performance.yaml
@@ -22,13 +22,15 @@ kraken:
         -   cluster_shut_down_scenarios:
             - - scenarios/cluster_shut_down_scenario.yml
               - scenarios/post_action_shut_down.py
-        - namespace_scenarios:
+        -   namespace_scenarios:
             - scenarios/regex_namespace.yaml
             - scenarios/ingress_namespace.yaml
+        -   zone_outages:
+            - scenarios/zone_outage.yaml
 cerberus:
     cerberus_enabled: True                                # Enable it when cerberus is previously installed
     cerberus_url: http://0.0.0.0:8080                     # When cerberus_enabled is set to True, provide the url where cerberus publishes go/no-go signal
-    check_applicaton_routes: False                         # When enabled will look for application unavailability using the routes specified in the cerberus config and fails the run
+    check_applicaton_routes: False                        # When enabled will look for application unavailability using the routes specified in the cerberus config and fails the run
 
 performance_monitoring:
     deploy_dashboards: True                               # Install a mutable grafana and load the performance dashboards. Enable this only when running on OpenShift
diff --git a/docs/zone_outage.md b/docs/zone_outage.md
new file mode 100644
index 00000000..4bea5feb
--- /dev/null
+++ b/docs/zone_outage.md
@@ -0,0 +1,25 @@
+### Zone outage scenario
+Scenario to create an outage in a targeted zone in the public cloud to understand the impact on both the Kubernetes/OpenShift platform and the applications running on the worker nodes in that zone. It tweaks the network acl of the zone to simulate the failure, which stops both ingress and egress traffic for all the nodes in that particular zone for the specified duration, and then reverts it back to the previous state. Zone outage can be injected by placing the zone_outage config file under the zone_outages option in the [kraken config](https://github.com/cloud-bulldozer/kraken/blob/master/config/config.yaml). Refer to the [zone_outage_scenario](https://github.com/cloud-bulldozer/kraken/blob/master/scenarios/zone_outage.yaml) config file for the parameters that need to be defined.
+
+Refer to [cloud setup](cloud_setup.md) to configure your cli properly for the cloud provider of the cluster you want to target.
+
+##### Current accepted cloud types:
+* [AWS](cloud_setup.md#aws)
+
+##### Sample scenario config
+```
+zone_outage:                                         # Scenario to create an outage of a zone by tweaking network ACL
+  cloud_type: aws                                    # cloud type on which Kubernetes/OpenShift runs. aws is the only platform supported currently for this scenario.
+  duration: 600                                      # duration in seconds after which the zone will be back online
+  vpc_id:                                            # cluster virtual private network to target
+  subnet_id:                                         # subnet-id to deny both ingress and egress traffic
+```
+
+**NOTE**: vpc_id and subnet_id can be obtained from the cloud web console by selecting one of the instances in the targeted zone ( us-west-2a for example ).
+
+##### Debugging steps in case of failures
+In case of failures during the steps which revert back the network acl to allow traffic and bring back the cluster nodes in the zone, the nodes in the particular zone will be in `NotReady` condition. Here is how to fix it:
+- OpenShift by default deploys the nodes in different zones for fault tolerance, for example us-west-2a, us-west-2b, us-west-2c. The cluster is associated with a virtual private network and each zone has its own subnet with a network acl which defines the ingress and egress traffic rules at the zone level, unlike security groups which are at an instance level.
+- From the cloud web console, select one of the instances in the zone which is down and go to the subnet_id specified in the config.
+- Look at the network acl associated with the subnet and you will see both ingress and egress traffic being denied, which is expected as Kraken deliberately injects it.
+- Kraken just switches the network acl while still keeping the original or default network acl around. Switching back to the default network acl from the drop-down menu will bring the nodes in the targeted zone back into the Ready state.
diff --git a/kraken/node_actions/aws_node_scenarios.py b/kraken/node_actions/aws_node_scenarios.py
index e8c37f8b..46225ace 100644
--- a/kraken/node_actions/aws_node_scenarios.py
+++ b/kraken/node_actions/aws_node_scenarios.py
@@ -90,6 +90,66 @@ class AWS:
             logging.error("Failed to get status waiting for %s to be terminated %s" % (instance_id, e))
             return False
 
+    # Creates a network acl in vpc_id without adding any rules and returns its id; exits the run on failure
+    def create_default_network_acl(self, vpc_id):
+        try:
+            logging.info("Trying to create a default deny network acl")
+            response = self.boto_client.create_network_acl(VpcId=vpc_id)
+            acl_id = response["NetworkAcl"]["NetworkAclId"]
+            logging.info("Created a network acl, id=%s" % acl_id)
+        except Exception as e:
+            logging.error(
+                "Failed to create the default network_acl: %s. "
+                "Make sure you have aws cli configured on the host and set for the region of your vpc/subnet" % (e)
+            )
+            sys.exit(1)
+        return acl_id
+
+    # Re-points the given subnet association at the network acl acl_id and returns the
+    # id of the new association; logs the error and exits the run if the call fails
+    def replace_network_acl_association(self, association_id, acl_id):
+        try:
+            logging.info("Replacing the network acl associated with the subnet")
+            result = self.boto_client.replace_network_acl_association(AssociationId=association_id, NetworkAclId=acl_id)
+            logging.info(result)
+            return result["NewAssociationId"]
+        except Exception as err:
+            logging.error("Failed to replace network acl association: %s" % (err))
+            sys.exit(1)
+
+    # Looks up the network acl associated with subnet_id in vpc_id; returns (its associations, its acl id)
+    def describe_network_acls(self, vpc_id, subnet_id):
+        filters = [{"Name": "vpc-id", "Values": [vpc_id]}, {"Name": "association.subnet-id", "Values": [subnet_id]}]
+        try:
+            response = self.boto_client.describe_network_acls(Filters=filters)
+        except Exception as e:
+            logging.error(
+                "Failed to describe network acl: %s. "
+                "Make sure you have aws cli configured on the host and set for the region of your vpc/subnet" % (e)
+            )
+            sys.exit(1)
+        # an empty result means no network acl matched the vpc/subnet filters, so there is nothing to index into
+        if not response["NetworkAcls"]:
+            logging.error("No network acl found for subnet %s in vpc %s" % (subnet_id, vpc_id))
+            sys.exit(1)
+        network_acl = response["NetworkAcls"][0]
+        # grab the current network_acl in use
+        original_acl_id = network_acl["Associations"][0]["NetworkAclId"]
+        return network_acl["Associations"], original_acl_id
+
+    # Deletes the network acl acl_id; logs the error and exits the run if the call fails
+    def delete_network_acl(self, acl_id):
+        try:
+            logging.info("Deleting the network acl: %s" % (acl_id))
+            self.boto_client.delete_network_acl(NetworkAclId=acl_id)
+        except Exception as e:
+            logging.error(
+                "Failed to delete network_acl %s: %s. "
+                "Make sure you have aws cli configured on the host and set for the region of your vpc/subnet"
+                % (acl_id, e)
+            )
+            sys.exit(1)
+
 
 class aws_node_scenarios(abstract_node_scenarios):
     def __init__(self):
diff --git a/kraken/zone_outage/__init__.py b/kraken/zone_outage/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/kraken/zone_outage/actions.py b/kraken/zone_outage/actions.py
new file mode 100644
index 00000000..700e280f
--- /dev/null
+++ b/kraken/zone_outage/actions.py
@@ -0,0 +1,57 @@
+import yaml
+import sys
+import logging
+import time
+from kraken.node_actions.aws_node_scenarios import AWS
+import kraken.cerberus.setup as cerberus
+
+
+# Creates a zone outage per scenario config by swapping the subnet's network acl for a newly created one,
+# waits for the configured duration, restores the original acl and publishes the run status to cerberus
+def run(scenarios_list, config, wait_duration):
+    failed_post_scenarios = ""
+    for zone_outage_config in scenarios_list:
+        if len(zone_outage_config) > 1:
+            # only keep the scenario file open while parsing it, not for the whole outage
+            with open(zone_outage_config, "r") as f:
+                zone_outage_config_yaml = yaml.full_load(f)
+            scenario_config = zone_outage_config_yaml["zone_outage"]
+            vpc_id = scenario_config["vpc_id"]
+            subnet_id = scenario_config["subnet_id"]
+            duration = scenario_config["duration"]
+            cloud_type = scenario_config["cloud_type"]
+
+            if cloud_type.lower() == "aws":
+                cloud_object = AWS()
+            else:
+                logging.error("Cloud type " + cloud_type + " is not currently supported for zone outage scenarios")
+                sys.exit(1)
+
+            start_time = int(time.time())
+
+            associations, original_acl_id = cloud_object.describe_network_acls(vpc_id, subnet_id)
+            network_association_ids = [
+                entry["NetworkAclAssociationId"] for entry in associations if entry["SubnetId"] == subnet_id
+            ]
+            logging.info(
+                "Network association ids associated with the subnet %s: %s" % (subnet_id, network_association_ids)
+            )
+            acl_id = cloud_object.create_default_network_acl(vpc_id)
+            new_association_id = cloud_object.replace_network_acl_association(network_association_ids[0], acl_id)
+
+            try:
+                logging.info("Waiting for the specified duration: %s" % (duration))
+                time.sleep(duration)
+            finally:
+                # always revert to the previous acl, even if the wait is interrupted, so the zone is not left cut off
+                logging.info("Replacing the applied acl with the original acl: %s" % (original_acl_id))
+                cloud_object.replace_network_acl_association(new_association_id, original_acl_id)
+                # delete the network acl created for the run
+                logging.info("Deleting the network acl created for the run: %s" % (acl_id))
+                cloud_object.delete_network_acl(acl_id)
+
+            logging.info("Waiting for the specified duration: %s" % (wait_duration))
+            time.sleep(wait_duration)
+
+            end_time = int(time.time())
+            cerberus.publish_kraken_status(config, failed_post_scenarios, start_time, end_time)
diff --git a/run_kraken.py b/run_kraken.py
index baca75cd..69351837 100644
--- a/run_kraken.py
+++ b/run_kraken.py
@@ -18,6 +18,7 @@ import kraken.namespace_actions.common_namespace_functions as namespace_actions
 import kraken.shut_down.common_shut_down_func as shut_down
 import kraken.node_actions.run as nodeaction
 import kraken.kube_burner.client as kube_burner
+import kraken.zone_outage.actions as zone_outages
 
 
 # Main function
@@ -157,6 +158,11 @@ def main(cfg):
                             logging.info("Running namespace scenarios")
                             namespace_actions.run(scenarios_list, config, wait_duration)
 
+                        # Inject zone failures
+                        elif scenario_type == "zone_outages":
+                            logging.info("Inject zone outages")
+                            zone_outages.run(scenarios_list, config, wait_duration)
+
             iteration += 1
             logging.info("")
 
diff --git a/scenarios/zone_outage.yaml b/scenarios/zone_outage.yaml
new file mode 100644
index 00000000..8ca7ffbf
--- /dev/null
+++ b/scenarios/zone_outage.yaml
@@ -0,0 +1,5 @@
+zone_outage:                                         # Scenario to create an outage of a zone by tweaking network ACL
+  cloud_type: aws                                    # cloud type on which Kubernetes/OpenShift runs. aws is the only platform supported currently for this scenario.
+  duration: 600                                      # duration in seconds after which the zone will be back online
+  vpc_id:                                            # cluster virtual private network to target
+  subnet_id:                                         # subnet-id to deny both ingress and egress traffic