From 83d99bbb02ee3ff923555a87c386fb0df0196e56 Mon Sep 17 00:00:00 2001
From: Paige Patton
Date: Mon, 17 Mar 2025 09:09:15 -0400
Subject: [PATCH] two types of zone outage

Signed-off-by: Paige Patton
---
 .../node_actions/gcp_node_scenarios.py   |   1 -
 .../zone_outage_scenario_plugin.py        | 206 +++++++++++-------
 scenarios/openshift/zone_outage_gcp.yaml  |   4 +
 3 files changed, 133 insertions(+), 78 deletions(-)
 create mode 100644 scenarios/openshift/zone_outage_gcp.yaml

diff --git a/krkn/scenario_plugins/node_actions/gcp_node_scenarios.py b/krkn/scenario_plugins/node_actions/gcp_node_scenarios.py
index 540cd009..ca8ee5b8 100644
--- a/krkn/scenario_plugins/node_actions/gcp_node_scenarios.py
+++ b/krkn/scenario_plugins/node_actions/gcp_node_scenarios.py
@@ -224,7 +224,6 @@ class gcp_node_scenarios(abstract_node_scenarios):
     def __init__(self, kubecli: KrknKubernetes, affected_nodes_status: AffectedNodeStatus):
         super().__init__(kubecli, affected_nodes_status)
         self.gcp = GCP()
-        print("selfkeys" + str(vars(self)))
 
     # Node scenario to start the node
     def node_start_scenario(self, instance_kill_count, node, timeout):
diff --git a/krkn/scenario_plugins/zone_outage/zone_outage_scenario_plugin.py b/krkn/scenario_plugins/zone_outage/zone_outage_scenario_plugin.py
index bce7d051..1146fe8f 100644
--- a/krkn/scenario_plugins/zone_outage/zone_outage_scenario_plugin.py
+++ b/krkn/scenario_plugins/zone_outage/zone_outage_scenario_plugin.py
@@ -2,15 +2,21 @@
 import logging
 import time
 import yaml
+
+from multiprocessing.pool import ThreadPool
+from itertools import repeat
+
+from krkn_lib.k8s import KrknKubernetes
+from krkn_lib.models.k8s import AffectedNodeStatus
 from krkn_lib.models.telemetry import ScenarioTelemetry
 from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift
-from krkn_lib.utils import log_exception
-from krkn import utils
+from krkn_lib.utils import get_yaml_item_value
 from krkn.scenario_plugins.abstract_scenario_plugin import AbstractScenarioPlugin
 from krkn.scenario_plugins.native.network import cerberus
-from krkn.scenario_plugins.node_actions.aws_node_scenarios import AWS
+from krkn.scenario_plugins.node_actions.aws_node_scenarios import AWS
+from krkn.scenario_plugins.node_actions.gcp_node_scenarios import gcp_node_scenarios
 
 
 class ZoneOutageScenarioPlugin(AbstractScenarioPlugin):
     def run(
@@ -25,92 +31,138 @@ class ZoneOutageScenarioPlugin(AbstractScenarioPlugin):
             with open(scenario, "r") as f:
                 zone_outage_config_yaml = yaml.full_load(f)
                 scenario_config = zone_outage_config_yaml["zone_outage"]
-                vpc_id = scenario_config["vpc_id"]
-                subnet_ids = scenario_config["subnet_id"]
-                duration = scenario_config["duration"]
                 cloud_type = scenario_config["cloud_type"]
-                # Add support for user-provided default network ACL
-                default_acl_id = scenario_config.get("default_acl_id")
-                ids = {}
-                acl_ids_created = []
-
-                if cloud_type.lower() == "aws":
-                    cloud_object = AWS()
-                else:
-                    logging.error(
-                        "ZoneOutageScenarioPlugin Cloud type %s is not currently supported for "
-                        "zone outage scenarios" % cloud_type
-                    )
-                    return 1
                 start_time = int(time.time())
-
-                for subnet_id in subnet_ids:
-                    logging.info("Targeting subnet_id")
-                    network_association_ids = []
-                    associations, original_acl_id = cloud_object.describe_network_acls(
-                        vpc_id, subnet_id
-                    )
-                    for entry in associations:
-                        if entry["SubnetId"] == subnet_id:
-                            network_association_ids.append(
-                                entry["NetworkAclAssociationId"]
-                            )
-                    logging.info(
-                        "Network association ids associated with "
-                        "the subnet %s: %s" % (subnet_id, network_association_ids)
-                    )
-
-                    # Use provided default ACL if available, otherwise create a new one
-                    if default_acl_id:
-                        acl_id = default_acl_id
-                        logging.info(
-                            "Using provided default ACL ID %s - this ACL will not be deleted after the scenario",
-                            default_acl_id
-                        )
-                        # Don't add to acl_ids_created since we don't want to delete user-provided ACLs at cleanup
+                if cloud_type.lower() == "aws":
+                    self.cloud_object = AWS()
+                    self.network_based_zone(scenario_config)
+                else:
+                    kubecli = lib_telemetry.get_lib_kubernetes()
+                    if cloud_type.lower() == "gcp":
+                        affected_nodes_status = AffectedNodeStatus()
+                        self.cloud_object = gcp_node_scenarios(kubecli, affected_nodes_status)
+                        self.node_based_zone(scenario_config, kubecli)
+                        affected_nodes_status = self.cloud_object.affected_nodes_status
+                        scenario_telemetry.affected_nodes.extend(affected_nodes_status.affected_nodes)
                     else:
-                        acl_id = cloud_object.create_default_network_acl(vpc_id)
-                        logging.info("Created new default ACL %s", acl_id)
-                        acl_ids_created.append(acl_id)
-
-                    new_association_id = cloud_object.replace_network_acl_association(
-                        network_association_ids[0], acl_id
-                    )
-
-                    # capture the orginal_acl_id, created_acl_id and
-                    # new association_id to use during the recovery
-                    ids[new_association_id] = original_acl_id
-
-                # wait for the specified duration
-                logging.info(
-                    "Waiting for the specified duration " "in the config: %s" % duration
-                )
-                time.sleep(duration)
-
-                # replace the applied acl with the previous acl in use
-                for new_association_id, original_acl_id in ids.items():
-                    cloud_object.replace_network_acl_association(
-                        new_association_id, original_acl_id
-                    )
-                logging.info(
-                    "Wating for 60 seconds to make sure " "the changes are in place"
-                )
-                time.sleep(60)
-
-                # delete the network acl created for the run
-                for acl_id in acl_ids_created:
-                    cloud_object.delete_network_acl(acl_id)
+                        logging.error(
+                            "ZoneOutageScenarioPlugin Cloud type %s is not currently supported for "
+                            "zone outage scenarios" % cloud_type
+                        )
+                        return 1
                 end_time = int(time.time())
                 cerberus.publish_kraken_status(krkn_config, [], start_time, end_time)
-        except (RuntimeError, Exception):
+        except Exception as e:
             logging.error(
                 f"ZoneOutageScenarioPlugin scenario {scenario} failed with exception: {e}"
             )
             return 1
         else:
             return 0
+
+    def node_based_zone(self, scenario_config: dict[str, any], kubecli: KrknKubernetes):
+        zone = scenario_config["zone"]
+        duration = get_yaml_item_value(scenario_config, "duration", 60)
+        timeout = get_yaml_item_value(scenario_config, "timeout", 180)
+        label_selector = f"topology.kubernetes.io/zone={zone}"
+        try:
+            # get the list of nodes in the target zone
+            nodes = kubecli.list_killable_nodes(label_selector)
+            # stop nodes in parallel
+            pool = ThreadPool(processes=len(nodes))
+            pool.starmap(
+                self.cloud_object.node_stop_scenario, zip(repeat(1), nodes, repeat(timeout))
+            )
+            pool.close()
+
+            logging.info(
+                "Waiting for the specified duration " "in the config: %s" % duration
+            )
+            time.sleep(duration)
+
+            # start nodes in parallel
+            pool = ThreadPool(processes=len(nodes))
+            pool.starmap(
+                self.cloud_object.node_start_scenario, zip(repeat(1), nodes, repeat(timeout))
+            )
+            pool.close()
+        except Exception as e:
+            logging.error(
+                f"Node based zone outage scenario failed with exception: {e}"
+            )
+            return 1
+        else:
+            return 0
+
+    def network_based_zone(self, scenario_config: dict[str, any]):
+        vpc_id = scenario_config["vpc_id"]
+        subnet_ids = scenario_config["subnet_id"]
+        duration = scenario_config["duration"]
+        # Add support for user-provided default network ACL
+        default_acl_id = scenario_config.get("default_acl_id")
+        ids = {}
+        acl_ids_created = []
+        for subnet_id in subnet_ids:
+            logging.info("Targeting subnet_id %s" % subnet_id)
+            network_association_ids = []
+            associations, original_acl_id = self.cloud_object.describe_network_acls(
+                vpc_id, subnet_id
+            )
+            for entry in associations:
+                if entry["SubnetId"] == subnet_id:
+                    network_association_ids.append(
+                        entry["NetworkAclAssociationId"]
+                    )
+            logging.info(
+                "Network association ids associated with "
+                "the subnet %s: %s" % (subnet_id, network_association_ids)
+            )
+
+            # Use provided default ACL if available, otherwise create a new one
+            if default_acl_id:
+                acl_id = default_acl_id
+                logging.info(
+                    "Using provided default ACL ID %s - this ACL will not be deleted after the scenario",
+                    default_acl_id
+                )
+                # Don't add to acl_ids_created since we don't want to delete user-provided ACLs at cleanup
+            else:
+                acl_id = self.cloud_object.create_default_network_acl(vpc_id)
+                logging.info("Created new default ACL %s", acl_id)
+                acl_ids_created.append(acl_id)
+
+            new_association_id = self.cloud_object.replace_network_acl_association(
+                network_association_ids[0], acl_id
+            )
+
+            # capture the original_acl_id, created_acl_id and
+            # new association_id to use during the recovery
+            ids[new_association_id] = original_acl_id
+
+        # wait for the specified duration
+        logging.info(
+            "Waiting for the specified duration " "in the config: %s" % duration
+        )
+        time.sleep(duration)
+
+        # replace the applied acl with the previous acl in use
+        for new_association_id, original_acl_id in ids.items():
+            self.cloud_object.replace_network_acl_association(
+                new_association_id, original_acl_id
+            )
+        logging.info(
+            "Waiting for 60 seconds to make sure " "the changes are in place"
+        )
+        time.sleep(60)
+
+        # delete the network acl created for the run
+        for acl_id in acl_ids_created:
+            self.cloud_object.delete_network_acl(acl_id)
+
     def get_scenario_types(self) -> list[str]:
         return ["zone_outages_scenarios"]
diff --git a/scenarios/openshift/zone_outage_gcp.yaml b/scenarios/openshift/zone_outage_gcp.yaml
new file mode 100644
index 00000000..bc56d70b
--- /dev/null
+++ b/scenarios/openshift/zone_outage_gcp.yaml
@@ -0,0 +1,4 @@
+zone_outage:      # Scenario to create an outage of a zone by stopping all of the zone's nodes
+  cloud_type: gcp # cloud type on which Kubernetes/OpenShift runs; aws and gcp are currently supported for this scenario
+  duration: 600   # duration in seconds after which the zone will be back online
+  zone:           # zone to target; nodes are selected via the topology.kubernetes.io/zone node label
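
Note: the shipped zone_outage_gcp.yaml above omits the optional "timeout" key that
node_based_zone() reads via get_yaml_item_value (default 180 seconds, bounding each
node stop/start operation); "duration" likewise falls back to 60 seconds when absent.
A fuller node-based config could look like the sketch below; the zone value is
illustrative only, not a required value:

zone_outage:
  cloud_type: gcp
  duration: 600      # seconds the zone's nodes stay stopped before being restarted
  timeout: 180       # per-node wait for each stop/start operation to complete
  zone: us-east1-b   # illustrative; matched against the topology.kubernetes.io/zone node label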
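For comparison, the network-based aws path (network_based_zone()) is driven by
vpc_id, subnet_id (a list), duration, and the optional default_acl_id read in the
code above; a sketch with placeholder resource IDs, not real values:

zone_outage:
  cloud_type: aws
  duration: 600        # seconds before the original ACL associations are restored
  vpc_id: vpc-0example # placeholder: VPC containing the target subnets
  subnet_id:
    - subnet-0example  # placeholder: subnets whose ACL association is replaced with a deny-all ACL
  default_acl_id:      # optional: existing default ACL to apply; if set, it is not deleted after the run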