Add node_disk_detach_attach_scenario for aws under node scenarios

Resolves #678

Signed-off-by: jtydlack <139967002+jtydlack@users.noreply.github.com>

Add functions for aws detach disk scenario

Signed-off-by: jtydlack <139967002+jtydlack@users.noreply.github.com>

Add detach disk scenario in node scenario

Signed-off-by: jtydlack <139967002+jtydlack@users.noreply.github.com>

Add disk_deatch_attach_scenario in docs

Signed-off-by: jtydlack <139967002+jtydlack@users.noreply.github.com>
This commit is contained in:
jtydlack
2024-08-28 10:20:17 +02:00
committed by Naga Ravi Chaitanya Elluri
parent 2ba20fa483
commit 0c30d89a1b
5 changed files with 143 additions and 3 deletions

View File

@@ -4,7 +4,7 @@ The following node chaos scenarios are supported:
1. **node_start_scenario**: Scenario to stop the node instance.
2. **node_stop_scenario**: Scenario to stop the node instance.
3. **node_stop_start_scenario**: Scenario to stop and then start the node instance. Not supported on VMware.
3. **node_stop_start_scenario**: Scenario to stop the node instance for specified duration and then start the node instance. Not supported on VMware.
4. **node_termination_scenario**: Scenario to terminate the node instance.
5. **node_reboot_scenario**: Scenario to reboot the node instance.
6. **stop_kubelet_scenario**: Scenario to stop the kubelet of the node instance.
@@ -12,6 +12,7 @@ The following node chaos scenarios are supported:
8. **restart_kubelet_scenario**: Scenario to restart the kubelet of the node instance.
9. **node_crash_scenario**: Scenario to crash the node instance.
10. **stop_start_helper_node_scenario**: Scenario to stop and start the helper node and check service status.
11. **node_disk_detach_attach_scenario**: Scenario to detach node disk for specified duration.
**NOTE**: If the node does not recover from the node_crash_scenario injection, reboot the node to get it back to Ready state.
@@ -20,6 +21,8 @@ The following node chaos scenarios are supported:
, node_reboot_scenario and stop_start_kubelet_scenario are supported on AWS, Azure, OpenStack, BareMetal, GCP
, VMware and Alibaba.
**NOTE**: node_disk_detach_attach_scenario is supported only on AWS and cannot detach root disk.
#### AWS

View File

@@ -36,6 +36,20 @@ class abstract_node_scenarios:
self.helper_node_start_scenario(instance_kill_count, node, timeout)
logging.info("helper_node_stop_start_scenario has been successfully injected!")
# Node scenario to detach and attach the disk
def node_disk_detach_attach_scenario(self, instance_kill_count, node, timeout, duration):
logging.info("Starting disk_detach_attach_scenario injection")
disk_attachment_details = self.get_disk_attachment_info(instance_kill_count, node)
if disk_attachment_details:
self.disk_detach_scenario(instance_kill_count, node, timeout)
logging.info("Waiting for %s seconds before attaching the disk" % (duration))
time.sleep(duration)
self.disk_attach_scenario(instance_kill_count, disk_attachment_details, timeout)
logging.info("node_disk_detach_attach_scenario has been successfully injected!")
else:
logging.error("Node %s has only root disk attached" % (node))
logging.error("node_disk_detach_attach_scenario failed!")
# Node scenario to terminate the node
def node_termination_scenario(self, instance_kill_count, node, timeout):
pass

View File

@@ -12,7 +12,8 @@ from krkn_lib.k8s import KrknKubernetes
class AWS:
def __init__(self):
self.boto_client = boto3.client("ec2")
self.boto_instance = boto3.resource("ec2").Instance("id")
self.boto_resource = boto3.resource("ec2")
self.boto_instance = self.boto_resource.Instance("id")
# Get the instance ID of the node
def get_instance_id(self, node):
@@ -179,6 +180,72 @@ class AWS:
raise RuntimeError()
# Detach volume
def detach_volumes(self, volumes_ids: list):
for volume in volumes_ids:
try:
self.boto_client.detach_volume(VolumeId=volume, Force=True)
except Exception as e:
logging.error(
"Detaching volume %s failed with exception: %s"
% (volume, e)
)
# Attach volume
def attach_volume(self, attachment: dict):
try:
if self.get_volume_state(attachment["VolumeId"]) == "in-use":
logging.info(
"Volume %s is already in use." % attachment["VolumeId"]
)
return
logging.info(
"Attaching the %s volumes to instance %s."
% (attachment["VolumeId"], attachment["InstanceId"])
)
self.boto_client.attach_volume(
InstanceId=attachment["InstanceId"],
Device=attachment["Device"],
VolumeId=attachment["VolumeId"]
)
except Exception as e:
logging.error(
"Failed attaching disk %s to the %s instance. "
"Encountered following exception: %s"
% (attachment['VolumeId'], attachment['InstanceId'], e)
)
raise RuntimeError()
# Get IDs of node volumes
def get_volumes_ids(self, instance_id: list):
response = self.boto_client.describe_instances(InstanceIds=instance_id)
instance_attachment_details = response["Reservations"][0]["Instances"][0]["BlockDeviceMappings"]
root_volume_device_name = self.get_root_volume_id(instance_id)
volume_ids = []
for device in instance_attachment_details:
if device["DeviceName"] != root_volume_device_name:
volume_id = device["Ebs"]["VolumeId"]
volume_ids.append(volume_id)
return volume_ids
# Get volumes attachment details
def get_volume_attachment_details(self, volume_ids: list):
response = self.boto_client.describe_volumes(VolumeIds=volume_ids)
volumes_details = response["Volumes"]
return volumes_details
# Get root volume
def get_root_volume_id(self, instance_id):
instance_id = instance_id[0]
instance = self.boto_resource.Instance(instance_id)
root_volume_id = instance.root_device_name
return root_volume_id
# Get volume state
def get_volume_state(self, volume_id: str):
volume = self.boto_resource.Volume(volume_id)
state = volume.state
return state
# krkn_lib
class aws_node_scenarios(abstract_node_scenarios):
@@ -290,3 +357,49 @@ class aws_node_scenarios(abstract_node_scenarios):
logging.error("node_reboot_scenario injection failed!")
raise RuntimeError()
# Get volume attachment info
def get_disk_attachment_info(self, instance_kill_count, node):
for _ in range(instance_kill_count):
try:
logging.info("Obtaining disk attachment information")
instance_id = (self.aws.get_instance_id(node)).split()
volumes_ids = self.aws.get_volumes_ids(instance_id)
if volumes_ids:
vol_attachment_details = self.aws.get_volume_attachment_details(
volumes_ids
)
return vol_attachment_details
return
except Exception as e:
logging.error(
"Failed to obtain disk attachment information of %s node. "
"Encounteres following exception: %s." % (node, e)
)
raise RuntimeError()
# Node scenario to detach the volume
def disk_detach_scenario(self, instance_kill_count, node, timeout):
for _ in range(instance_kill_count):
try:
logging.info("Starting disk_detach_scenario injection")
instance_id = (self.aws.get_instance_id(node)).split()
volumes_ids = self.aws.get_volumes_ids(instance_id)
logging.info(
"Detaching the %s volumes from instance %s "
% (volumes_ids, node)
)
self.aws.detach_volumes(volumes_ids)
except Exception as e:
logging.error(
"Failed to detach disk from %s node. Encountered following"
"exception: %s." % (node, e)
)
logging.debug("")
raise RuntimeError()
# Node scenario to attach the volume
def disk_attach_scenario(self, instance_kill_count, attachment_details, timeout):
for _ in range(instance_kill_count):
for attachment in attachment_details:
self.aws.attach_volume(attachment["Attachments"][0])

View File

@@ -163,7 +163,7 @@ class NodeActionsScenarioPlugin(AbstractScenarioPlugin):
logging.info("action" + str(action))
# Get the scenario specifics for running action nodes
run_kill_count = get_yaml_item_value(node_scenario, "runs", 1)
if action == "node_stop_start_scenario":
if action in ("node_stop_start_scenario", "node_disk_detach_attach_scenario"):
duration = get_yaml_item_value(node_scenario, "duration", 120)
timeout = get_yaml_item_value(node_scenario, "timeout", 120)
@@ -200,6 +200,9 @@ class NodeActionsScenarioPlugin(AbstractScenarioPlugin):
node_scenario_object.node_reboot_scenario(
run_kill_count, single_node, timeout
)
elif action == "node_disk_detach_attach_scenario":
node_scenario_object.node_disk_detach_attach_scenario(
run_kill_count, single_node, timeout, duration)
elif action == "stop_start_kubelet_scenario":
node_scenario_object.stop_start_kubelet_scenario(
run_kill_count, single_node, timeout

View File

@@ -16,3 +16,10 @@ node_scenarios:
instance_count: 1
timeout: 120
cloud_type: aws
- actions:
- node_disk_detach_attach_scenario
node_name:
label_selector:
instance_count: 1
timeout: 120
cloud_type: aws