two types of zone outage

Signed-off-by: Paige Patton <prubenda@redhat.com>
2026-02-14 09:59:59 +00:00 · 2025-03-17 09:09:15 -04:00
parent 2624102d65
commit 83d99bbb02
3 changed files with 133 additions and 78 deletions
--- a/krkn/scenario_plugins/node_actions/gcp_node_scenarios.py
+++ b/krkn/scenario_plugins/node_actions/gcp_node_scenarios.py
@@ -224,7 +224,6 @@ class gcp_node_scenarios(abstract_node_scenarios):
    def __init__(self, kubecli: KrknKubernetes, affected_nodes_status: AffectedNodeStatus):
        super().__init__(kubecli, affected_nodes_status)
        self.gcp = GCP()
-        print("selfkeys" + str(vars(self)))

    # Node scenario to start the node
    def node_start_scenario(self, instance_kill_count, node, timeout):
--- a/krkn/scenario_plugins/zone_outage/zone_outage_scenario_plugin.py
+++ b/krkn/scenario_plugins/zone_outage/zone_outage_scenario_plugin.py
@@ -2,15 +2,21 @@ import logging
 import time

 import yaml
+
+from multiprocessing.pool import ThreadPool
+from itertools import repeat
+
+from krkn_lib.k8s import KrknKubernetes
+from krkn_lib.models.k8s import AffectedNodeStatus
 from krkn_lib.models.telemetry import ScenarioTelemetry
 from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift
-from krkn_lib.utils import log_exception

-from krkn import utils
+from krkn_lib.utils import get_yaml_item_value
 from krkn.scenario_plugins.abstract_scenario_plugin import AbstractScenarioPlugin
 from krkn.scenario_plugins.native.network import cerberus
-from krkn.scenario_plugins.node_actions.aws_node_scenarios import AWS

+from krkn.scenario_plugins.node_actions.aws_node_scenarios import AWS
+from krkn.scenario_plugins.node_actions.gcp_node_scenarios import gcp_node_scenarios

 class ZoneOutageScenarioPlugin(AbstractScenarioPlugin):
    def run(
@@ -25,92 +31,138 @@ class ZoneOutageScenarioPlugin(AbstractScenarioPlugin):
            with open(scenario, "r") as f:
                zone_outage_config_yaml = yaml.full_load(f)
                scenario_config = zone_outage_config_yaml["zone_outage"]
-                vpc_id = scenario_config["vpc_id"]
-                subnet_ids = scenario_config["subnet_id"]
-                duration = scenario_config["duration"]
                cloud_type = scenario_config["cloud_type"]
-                # Add support for user-provided default network ACL
-                default_acl_id = scenario_config.get("default_acl_id")
-                ids = {}
-                acl_ids_created = []
-
-                if cloud_type.lower() == "aws":
-                    cloud_object = AWS()
-                else:
-                    logging.error(
-                        "ZoneOutageScenarioPlugin Cloud type %s is not currently supported for "
-                        "zone outage scenarios" % cloud_type
-                    )
-                    return 1
-
                start_time = int(time.time())
-
-                for subnet_id in subnet_ids:
-                    logging.info("Targeting subnet_id")
-                    network_association_ids = []
-                    associations, original_acl_id = cloud_object.describe_network_acls(
-                        vpc_id, subnet_id
-                    )
-                    for entry in associations:
-                        if entry["SubnetId"] == subnet_id:
-                            network_association_ids.append(
-                                entry["NetworkAclAssociationId"]
-                            )
-                    logging.info(
-                        "Network association ids associated with "
-                        "the subnet %s: %s" % (subnet_id, network_association_ids)
-                    )
-                    
-                    # Use provided default ACL if available, otherwise create a new one
-                    if default_acl_id:
-                        acl_id = default_acl_id
-                        logging.info(
-                            "Using provided default ACL ID %s - this ACL will not be deleted after the scenario", 
-                            default_acl_id
-                        )
-                        # Don't add to acl_ids_created since we don't want to delete user-provided ACLs at cleanup
+                if cloud_type.lower() == "aws":
+                    self.cloud_object = AWS()
+                    self.network_based_zone(scenario_config)
+                else:
+                    kubecli = lib_telemetry.get_lib_kubernetes()
+                    if cloud_type.lower() == "gcp":
+                        affected_nodes_status = AffectedNodeStatus()
+                        self.cloud_object = gcp_node_scenarios(kubecli, affected_nodes_status)
+                        self.node_based_zone(scenario_config, kubecli)
+                        affected_nodes_status = self.cloud_object.affected_nodes_status
+                        scenario_telemetry.affected_nodes.extend(affected_nodes_status.affected_nodes)
                    else:
-                        acl_id = cloud_object.create_default_network_acl(vpc_id)
-                        logging.info("Created new default ACL %s", acl_id)
-                        acl_ids_created.append(acl_id)
-
-                    new_association_id = cloud_object.replace_network_acl_association(
-                        network_association_ids[0], acl_id
-                    )
-
-                    # capture the orginal_acl_id, created_acl_id and
-                    # new association_id to use during the recovery
-                    ids[new_association_id] = original_acl_id
-
-                # wait for the specified duration
-                logging.info(
-                    "Waiting for the specified duration " "in the config: %s" % duration
-                )
-                time.sleep(duration)
-
-                # replace the applied acl with the previous acl in use
-                for new_association_id, original_acl_id in ids.items():
-                    cloud_object.replace_network_acl_association(
-                        new_association_id, original_acl_id
-                    )
-                logging.info(
-                    "Wating for 60 seconds to make sure " "the changes are in place"
-                )
-                time.sleep(60)
-
-                # delete the network acl created for the run
-                for acl_id in acl_ids_created:
-                    cloud_object.delete_network_acl(acl_id)
+                        logging.error(
+                            "ZoneOutageScenarioPlugin Cloud type %s is not currently supported for "
+                            "zone outage scenarios" % cloud_type
+                        )
+                        return 1

                end_time = int(time.time())
                cerberus.publish_kraken_status(krkn_config, [], start_time, end_time)
-        except (RuntimeError, Exception):
+        except (RuntimeError, Exception) as e:
            logging.error(
                f"ZoneOutageScenarioPlugin scenario {scenario} failed with exception: {e}"
            )
            return 1
        else:
            return 0
+        
+    def node_based_zone(self, scenario_config: dict[str, any], kubecli: KrknKubernetes ):
+        zone = scenario_config["zone"]
+        duration = get_yaml_item_value(scenario_config, "duration", 60)
+        timeout = get_yaml_item_value(scenario_config, "timeout", 180)
+        label_selector = f"topology.kubernetes.io/zone={zone}"
+        try: 
+            # get list of nodes in zone/region
+            nodes = kubecli.list_killable_nodes(label_selector)
+            # stop nodes in parallel 
+            pool = ThreadPool(processes=len(nodes))
+    
+            pool.starmap(
+                self.cloud_object.node_stop_scenario,zip(repeat(1), nodes, repeat(timeout))
+            )
+
+            pool.close()
+
+            logging.info(
+                "Waiting for the specified duration " "in the config: %s" % duration
+            )
+            time.sleep(duration)
+
+            # start nodes in parallel 
+            pool = ThreadPool(processes=len(nodes))
+            pool.starmap(
+                self.cloud_object.node_start_scenario,zip(repeat(1), nodes, repeat(timeout))
+            )
+            pool.close()
+        except Exception as e:
+            logging.info(
+                f"Node based zone outage scenario failed with exception: {e}"
+            )
+            return 1
+        else:
+            return 0
+
+    def network_based_zone(self, scenario_config: dict[str, any]):
+
+        vpc_id = scenario_config["vpc_id"]
+        subnet_ids = scenario_config["subnet_id"]
+        duration = scenario_config["duration"]
+        # Add support for user-provided default network ACL
+        default_acl_id = scenario_config.get("default_acl_id")
+        ids = {}
+        acl_ids_created = []
+        for subnet_id in subnet_ids:
+            logging.info("Targeting subnet_id")
+            network_association_ids = []
+            associations, original_acl_id = self.cloud_object.describe_network_acls(
+                vpc_id, subnet_id
+            )
+            for entry in associations:
+                if entry["SubnetId"] == subnet_id:
+                    network_association_ids.append(
+                        entry["NetworkAclAssociationId"]
+                    )
+            logging.info(
+                "Network association ids associated with "
+                "the subnet %s: %s" % (subnet_id, network_association_ids)
+            )
+            
+            # Use provided default ACL if available, otherwise create a new one
+            if default_acl_id:
+                acl_id = default_acl_id
+                logging.info(
+                    "Using provided default ACL ID %s - this ACL will not be deleted after the scenario", 
+                    default_acl_id
+                )
+                # Don't add to acl_ids_created since we don't want to delete user-provided ACLs at cleanup
+            else:
+                acl_id = self.cloud_object.create_default_network_acl(vpc_id)
+                logging.info("Created new default ACL %s", acl_id)
+                acl_ids_created.append(acl_id)
+
+            new_association_id = self.cloud_object.replace_network_acl_association(
+                network_association_ids[0], acl_id
+            )
+
+            # capture the original_acl_id, created_acl_id and
+            # new association_id to use during the recovery
+            ids[new_association_id] = original_acl_id
+
+        # wait for the specified duration
+        logging.info(
+            "Waiting for the specified duration " "in the config: %s" % duration
+        )
+        time.sleep(duration)
+
+        # replace the applied acl with the previous acl in use
+        for new_association_id, original_acl_id in ids.items():
+            self.cloud_object.replace_network_acl_association(
+                new_association_id, original_acl_id
+            )
+        logging.info(
+            "Wating for 60 seconds to make sure " "the changes are in place"
+        )
+        time.sleep(60)
+
+        # delete the network acl created for the run
+        for acl_id in acl_ids_created:
+            self.cloud_object.delete_network_acl(acl_id)
+

    def get_scenario_types(self) -> list[str]:
        return ["zone_outages_scenarios"]
--- a/scenarios/openshift/zone_outage_gcp.yaml
+++ b/scenarios/openshift/zone_outage_gcp.yaml
@@ -0,0 +1,4 @@
+zone_outage:                                         # Scenario to create an outage of a zone by stopping the nodes in that zone and starting them again after the duration
+  cloud_type: gcp                                    # cloud type on which Kubernetes/OpenShift runs. aws and gcp are the platforms currently supported for this scenario.
+  duration: 600                                      # duration in seconds after which the zone will be back online
+  zone: <zone>                    # (Required) zone to take down; nodes are selected with the label topology.kubernetes.io/zone=<zone>