two types of zone outage
Some checks failed
Functional & Unit Tests / Functional & Unit Tests (push) Has been cancelled
Functional & Unit Tests / Generate Coverage Badge (push) Has been cancelled

Signed-off-by: Paige Patton <prubenda@redhat.com>
This commit is contained in:
Paige Patton
2025-03-17 09:09:15 -04:00
committed by Naga Ravi Chaitanya Elluri
parent 2624102d65
commit 83d99bbb02
3 changed files with 133 additions and 78 deletions

View File

@@ -224,7 +224,6 @@ class gcp_node_scenarios(abstract_node_scenarios):
def __init__(self, kubecli: KrknKubernetes, affected_nodes_status: AffectedNodeStatus):
    """Initialise the GCP node scenario helper.

    Args:
        kubecli: Kubernetes client wrapper, forwarded to the parent
            initialiser.
        affected_nodes_status: Collector for per-node status, forwarded to
            the parent initialiser.
    """
    super().__init__(kubecli, affected_nodes_status)
    # Cloud helper used by the node scenarios of this class.
    self.gcp = GCP()
# Node scenario to start the node
def node_start_scenario(self, instance_kill_count, node, timeout):

View File

@@ -2,15 +2,21 @@ import logging
import time
import yaml
from multiprocessing.pool import ThreadPool
from itertools import repeat
from krkn_lib.k8s import KrknKubernetes
from krkn_lib.models.k8s import AffectedNodeStatus
from krkn_lib.models.telemetry import ScenarioTelemetry
from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift
from krkn_lib.utils import log_exception
from krkn import utils
from krkn_lib.utils import get_yaml_item_value
from krkn.scenario_plugins.abstract_scenario_plugin import AbstractScenarioPlugin
from krkn.scenario_plugins.native.network import cerberus
from krkn.scenario_plugins.node_actions.aws_node_scenarios import AWS
from krkn.scenario_plugins.node_actions.aws_node_scenarios import AWS
from krkn.scenario_plugins.node_actions.gcp_node_scenarios import gcp_node_scenarios
class ZoneOutageScenarioPlugin(AbstractScenarioPlugin):
def run(
@@ -25,92 +31,138 @@ class ZoneOutageScenarioPlugin(AbstractScenarioPlugin):
with open(scenario, "r") as f:
zone_outage_config_yaml = yaml.full_load(f)
scenario_config = zone_outage_config_yaml["zone_outage"]
vpc_id = scenario_config["vpc_id"]
subnet_ids = scenario_config["subnet_id"]
duration = scenario_config["duration"]
cloud_type = scenario_config["cloud_type"]
# Add support for user-provided default network ACL
default_acl_id = scenario_config.get("default_acl_id")
ids = {}
acl_ids_created = []
if cloud_type.lower() == "aws":
cloud_object = AWS()
else:
logging.error(
"ZoneOutageScenarioPlugin Cloud type %s is not currently supported for "
"zone outage scenarios" % cloud_type
)
return 1
start_time = int(time.time())
for subnet_id in subnet_ids:
logging.info("Targeting subnet_id")
network_association_ids = []
associations, original_acl_id = cloud_object.describe_network_acls(
vpc_id, subnet_id
)
for entry in associations:
if entry["SubnetId"] == subnet_id:
network_association_ids.append(
entry["NetworkAclAssociationId"]
)
logging.info(
"Network association ids associated with "
"the subnet %s: %s" % (subnet_id, network_association_ids)
)
# Use provided default ACL if available, otherwise create a new one
if default_acl_id:
acl_id = default_acl_id
logging.info(
"Using provided default ACL ID %s - this ACL will not be deleted after the scenario",
default_acl_id
)
# Don't add to acl_ids_created since we don't want to delete user-provided ACLs at cleanup
if cloud_type.lower() == "aws":
self.cloud_object = AWS()
self.network_based_zone(scenario_config)
else:
kubecli = lib_telemetry.get_lib_kubernetes()
if cloud_type.lower() == "gcp":
affected_nodes_status = AffectedNodeStatus()
self.cloud_object = gcp_node_scenarios(kubecli, affected_nodes_status)
self.node_based_zone(scenario_config, kubecli)
affected_nodes_status = self.cloud_object.affected_nodes_status
scenario_telemetry.affected_nodes.extend(affected_nodes_status.affected_nodes)
else:
acl_id = cloud_object.create_default_network_acl(vpc_id)
logging.info("Created new default ACL %s", acl_id)
acl_ids_created.append(acl_id)
new_association_id = cloud_object.replace_network_acl_association(
network_association_ids[0], acl_id
)
# capture the orginal_acl_id, created_acl_id and
# new association_id to use during the recovery
ids[new_association_id] = original_acl_id
# wait for the specified duration
logging.info(
"Waiting for the specified duration " "in the config: %s" % duration
)
time.sleep(duration)
# replace the applied acl with the previous acl in use
for new_association_id, original_acl_id in ids.items():
cloud_object.replace_network_acl_association(
new_association_id, original_acl_id
)
logging.info(
"Wating for 60 seconds to make sure " "the changes are in place"
)
time.sleep(60)
# delete the network acl created for the run
for acl_id in acl_ids_created:
cloud_object.delete_network_acl(acl_id)
logging.error(
"ZoneOutageScenarioPlugin Cloud type %s is not currently supported for "
"zone outage scenarios" % cloud_type
)
return 1
end_time = int(time.time())
cerberus.publish_kraken_status(krkn_config, [], start_time, end_time)
except (RuntimeError, Exception):
except (RuntimeError, Exception) as e:
logging.error(
f"ZoneOutageScenarioPlugin scenario {scenario} failed with exception: {e}"
)
return 1
else:
return 0
def node_based_zone(self, scenario_config: dict[str, any], kubecli: KrknKubernetes):
    """Simulate a zone outage by stopping every killable node in a zone.

    The nodes labelled ``topology.kubernetes.io/zone=<zone>`` are stopped in
    parallel through ``self.cloud_object.node_stop_scenario``, kept down for
    ``duration`` seconds and then started again in parallel through
    ``self.cloud_object.node_start_scenario``.

    Args:
        scenario_config: Parsed ``zone_outage`` section. ``zone`` is
            required; ``duration`` (default 60) and ``timeout`` (default 180)
            are optional.
        kubecli: Kubernetes client used to list the nodes of the zone.

    Returns:
        0 on success, 1 when no node matched the zone label or when any step
        raised. Failures are reported through the return value rather than
        an exception, so callers must check it.
    """
    zone = scenario_config["zone"]
    duration = get_yaml_item_value(scenario_config, "duration", 60)
    timeout = get_yaml_item_value(scenario_config, "timeout", 180)
    label_selector = f"topology.kubernetes.io/zone={zone}"
    try:
        # get list of nodes in zone/region
        nodes = kubecli.list_killable_nodes(label_selector)
        if not nodes:
            # ThreadPool(processes=0) would raise an opaque ValueError, so
            # report the real cause instead.
            logging.error(
                "Node based zone outage scenario failed: no killable nodes "
                "found with label %s",
                label_selector,
            )
            return 1

        # One call per node with arguments (1, node, timeout); the leading 1
        # is presumably the instance_kill_count -- confirm against the
        # cloud object's node_stop_scenario signature.
        # stop nodes in parallel; starmap blocks until every call returned,
        # and the context manager releases the worker threads even on error
        with ThreadPool(processes=len(nodes)) as pool:
            pool.starmap(
                self.cloud_object.node_stop_scenario,
                zip(repeat(1), nodes, repeat(timeout)),
            )

        logging.info(
            "Waiting for the specified duration in the config: %s", duration
        )
        time.sleep(duration)

        # start nodes in parallel
        with ThreadPool(processes=len(nodes)) as pool:
            pool.starmap(
                self.cloud_object.node_start_scenario,
                zip(repeat(1), nodes, repeat(timeout)),
            )
    except Exception as e:
        logging.error(
            f"Node based zone outage scenario failed with exception: {e}"
        )
        return 1
    return 0
def network_based_zone(self, scenario_config: dict[str, any], recovery_wait: int = 60):
    """Simulate a zone outage by swapping subnet network ACLs, then restore.

    For every subnet in ``scenario_config["subnet_id"]`` the first network
    ACL association of that subnet is pointed at either the user-provided
    ``default_acl_id`` or an ACL created for this run through
    ``self.cloud_object.create_default_network_acl``. After ``duration``
    seconds the original ACLs are re-associated and the ACLs created by this
    run are deleted. A user-provided ACL is never deleted.

    Restoration and cleanup run in a ``finally`` block, so subnets that were
    already switched are put back even when a later step fails.

    Args:
        scenario_config: Parsed ``zone_outage`` section with ``vpc_id``,
            ``subnet_id`` (iterable of subnet ids), ``duration`` (seconds)
            and optionally ``default_acl_id``.
        recovery_wait: Seconds to wait after re-associating the original
            ACLs before deleting the ACLs created for the run. Skipped when
            no association was changed.

    Raises:
        RuntimeError: If a targeted subnet has no network ACL association.
        Exception: Anything raised by the cloud object propagates after the
            restoration has been attempted.
    """
    vpc_id = scenario_config["vpc_id"]
    subnet_ids = scenario_config["subnet_id"]
    duration = scenario_config["duration"]
    # Add support for user-provided default network ACL
    default_acl_id = scenario_config.get("default_acl_id")
    # new association id -> ACL id that was originally in use
    ids = {}
    acl_ids_created = []
    try:
        for subnet_id in subnet_ids:
            logging.info("Targeting subnet_id %s", subnet_id)
            associations, original_acl_id = self.cloud_object.describe_network_acls(
                vpc_id, subnet_id
            )
            network_association_ids = [
                entry["NetworkAclAssociationId"]
                for entry in associations
                if entry["SubnetId"] == subnet_id
            ]
            logging.info(
                "Network association ids associated with "
                "the subnet %s: %s",
                subnet_id,
                network_association_ids,
            )
            # Fail before creating an ACL so nothing is leaked for a subnet
            # that cannot be switched anyway.
            if not network_association_ids:
                raise RuntimeError(
                    "No network ACL association found for subnet %s in VPC %s"
                    % (subnet_id, vpc_id)
                )

            # Use provided default ACL if available, otherwise create a new one
            if default_acl_id:
                acl_id = default_acl_id
                logging.info(
                    "Using provided default ACL ID %s - this ACL will not be deleted after the scenario",
                    default_acl_id
                )
                # Not tracked in acl_ids_created: user-provided ACLs must
                # survive the cleanup.
            else:
                acl_id = self.cloud_object.create_default_network_acl(vpc_id)
                logging.info("Created new default ACL %s", acl_id)
                acl_ids_created.append(acl_id)

            new_association_id = self.cloud_object.replace_network_acl_association(
                network_association_ids[0], acl_id
            )
            # capture the original_acl_id and the new association_id
            # to use during the recovery
            ids[new_association_id] = original_acl_id

        # wait for the specified duration
        logging.info(
            "Waiting for the specified duration in the config: %s", duration
        )
        time.sleep(duration)
    finally:
        # replace the applied acl with the previous acl in use
        for new_association_id, original_acl_id in ids.items():
            self.cloud_object.replace_network_acl_association(
                new_association_id, original_acl_id
            )
        if ids:
            logging.info(
                "Waiting for %s seconds to make sure the changes are in place",
                recovery_wait,
            )
            time.sleep(recovery_wait)
        # delete the network acl created for the run
        for acl_id in acl_ids_created:
            self.cloud_object.delete_network_acl(acl_id)
def get_scenario_types(self) -> list[str]:
    """Return the scenario type identifiers handled by this plugin."""
    scenario_types = ["zone_outages_scenarios"]
    return scenario_types

View File

@@ -0,0 +1,4 @@
zone_outage: # Scenario to create an outage of a zone; on gcp this is done by stopping the nodes of the zone and starting them again after the duration
cloud_type: gcp # cloud type on which Kubernetes/OpenShift runs. Supported for this scenario: aws (network ACL based) and gcp (node based).
duration: 600 # duration in seconds after which the zone will be back online
zone: <zone> # (Required for gcp) zone to take down; nodes are selected by the label topology.kubernetes.io/zone=<zone>