mirror of
https://github.com/krkn-chaos/krkn.git
synced 2026-02-14 18:10:00 +00:00
Add node_disk_detach_attach_scenario for aws under node scenarios
Resolves #678 Signed-off-by: jtydlack <139967002+jtydlack@users.noreply.github.com> Add functions for aws detach disk scenario Signed-off-by: jtydlack <139967002+jtydlack@users.noreply.github.com> Add detach disk scenario in node scenario Signed-off-by: jtydlack <139967002+jtydlack@users.noreply.github.com> Add disk_deatch_attach_scenario in docs Signed-off-by: jtydlack <139967002+jtydlack@users.noreply.github.com>
This commit is contained in:
committed by
Naga Ravi Chaitanya Elluri
parent
2ba20fa483
commit
0c30d89a1b
@@ -4,7 +4,7 @@ The following node chaos scenarios are supported:
|
||||
|
||||
1. **node_start_scenario**: Scenario to stop the node instance.
|
||||
2. **node_stop_scenario**: Scenario to stop the node instance.
|
||||
3. **node_stop_start_scenario**: Scenario to stop and then start the node instance. Not supported on VMware.
|
||||
3. **node_stop_start_scenario**: Scenario to stop the node instance for specified duration and then start the node instance. Not supported on VMware.
|
||||
4. **node_termination_scenario**: Scenario to terminate the node instance.
|
||||
5. **node_reboot_scenario**: Scenario to reboot the node instance.
|
||||
6. **stop_kubelet_scenario**: Scenario to stop the kubelet of the node instance.
|
||||
@@ -12,6 +12,7 @@ The following node chaos scenarios are supported:
|
||||
8. **restart_kubelet_scenario**: Scenario to restart the kubelet of the node instance.
|
||||
9. **node_crash_scenario**: Scenario to crash the node instance.
|
||||
10. **stop_start_helper_node_scenario**: Scenario to stop and start the helper node and check service status.
|
||||
11. **node_disk_detach_attach_scenario**: Scenario to detach node disk for specified duration.
|
||||
|
||||
|
||||
**NOTE**: If the node does not recover from the node_crash_scenario injection, reboot the node to get it back to Ready state.
|
||||
@@ -20,6 +21,8 @@ The following node chaos scenarios are supported:
|
||||
, node_reboot_scenario and stop_start_kubelet_scenario are supported on AWS, Azure, OpenStack, BareMetal, GCP
|
||||
, VMware and Alibaba.
|
||||
|
||||
**NOTE**: node_disk_detach_attach_scenario is supported only on AWS and cannot detach root disk.
|
||||
|
||||
|
||||
#### AWS
|
||||
|
||||
|
||||
@@ -36,6 +36,20 @@ class abstract_node_scenarios:
|
||||
self.helper_node_start_scenario(instance_kill_count, node, timeout)
|
||||
logging.info("helper_node_stop_start_scenario has been successfully injected!")
|
||||
|
||||
# Node scenario to detach and attach the disk
|
||||
def node_disk_detach_attach_scenario(self, instance_kill_count, node, timeout, duration):
|
||||
logging.info("Starting disk_detach_attach_scenario injection")
|
||||
disk_attachment_details = self.get_disk_attachment_info(instance_kill_count, node)
|
||||
if disk_attachment_details:
|
||||
self.disk_detach_scenario(instance_kill_count, node, timeout)
|
||||
logging.info("Waiting for %s seconds before attaching the disk" % (duration))
|
||||
time.sleep(duration)
|
||||
self.disk_attach_scenario(instance_kill_count, disk_attachment_details, timeout)
|
||||
logging.info("node_disk_detach_attach_scenario has been successfully injected!")
|
||||
else:
|
||||
logging.error("Node %s has only root disk attached" % (node))
|
||||
logging.error("node_disk_detach_attach_scenario failed!")
|
||||
|
||||
# Node scenario to terminate the node
|
||||
def node_termination_scenario(self, instance_kill_count, node, timeout):
|
||||
pass
|
||||
|
||||
@@ -12,7 +12,8 @@ from krkn_lib.k8s import KrknKubernetes
|
||||
class AWS:
|
||||
def __init__(self):
|
||||
self.boto_client = boto3.client("ec2")
|
||||
self.boto_instance = boto3.resource("ec2").Instance("id")
|
||||
self.boto_resource = boto3.resource("ec2")
|
||||
self.boto_instance = self.boto_resource.Instance("id")
|
||||
|
||||
# Get the instance ID of the node
|
||||
def get_instance_id(self, node):
|
||||
@@ -179,6 +180,72 @@ class AWS:
|
||||
|
||||
raise RuntimeError()
|
||||
|
||||
# Detach volume
|
||||
def detach_volumes(self, volumes_ids: list):
|
||||
for volume in volumes_ids:
|
||||
try:
|
||||
self.boto_client.detach_volume(VolumeId=volume, Force=True)
|
||||
except Exception as e:
|
||||
logging.error(
|
||||
"Detaching volume %s failed with exception: %s"
|
||||
% (volume, e)
|
||||
)
|
||||
|
||||
# Attach volume
|
||||
def attach_volume(self, attachment: dict):
|
||||
try:
|
||||
if self.get_volume_state(attachment["VolumeId"]) == "in-use":
|
||||
logging.info(
|
||||
"Volume %s is already in use." % attachment["VolumeId"]
|
||||
)
|
||||
return
|
||||
logging.info(
|
||||
"Attaching the %s volumes to instance %s."
|
||||
% (attachment["VolumeId"], attachment["InstanceId"])
|
||||
)
|
||||
self.boto_client.attach_volume(
|
||||
InstanceId=attachment["InstanceId"],
|
||||
Device=attachment["Device"],
|
||||
VolumeId=attachment["VolumeId"]
|
||||
)
|
||||
except Exception as e:
|
||||
logging.error(
|
||||
"Failed attaching disk %s to the %s instance. "
|
||||
"Encountered following exception: %s"
|
||||
% (attachment['VolumeId'], attachment['InstanceId'], e)
|
||||
)
|
||||
raise RuntimeError()
|
||||
|
||||
# Get IDs of node volumes
|
||||
def get_volumes_ids(self, instance_id: list):
|
||||
response = self.boto_client.describe_instances(InstanceIds=instance_id)
|
||||
instance_attachment_details = response["Reservations"][0]["Instances"][0]["BlockDeviceMappings"]
|
||||
root_volume_device_name = self.get_root_volume_id(instance_id)
|
||||
volume_ids = []
|
||||
for device in instance_attachment_details:
|
||||
if device["DeviceName"] != root_volume_device_name:
|
||||
volume_id = device["Ebs"]["VolumeId"]
|
||||
volume_ids.append(volume_id)
|
||||
return volume_ids
|
||||
|
||||
# Get volumes attachment details
|
||||
def get_volume_attachment_details(self, volume_ids: list):
|
||||
response = self.boto_client.describe_volumes(VolumeIds=volume_ids)
|
||||
volumes_details = response["Volumes"]
|
||||
return volumes_details
|
||||
|
||||
# Get root volume
|
||||
def get_root_volume_id(self, instance_id):
|
||||
instance_id = instance_id[0]
|
||||
instance = self.boto_resource.Instance(instance_id)
|
||||
root_volume_id = instance.root_device_name
|
||||
return root_volume_id
|
||||
|
||||
# Get volume state
|
||||
def get_volume_state(self, volume_id: str):
|
||||
volume = self.boto_resource.Volume(volume_id)
|
||||
state = volume.state
|
||||
return state
|
||||
|
||||
# krkn_lib
|
||||
class aws_node_scenarios(abstract_node_scenarios):
|
||||
@@ -290,3 +357,49 @@ class aws_node_scenarios(abstract_node_scenarios):
|
||||
logging.error("node_reboot_scenario injection failed!")
|
||||
|
||||
raise RuntimeError()
|
||||
|
||||
# Get volume attachment info
|
||||
def get_disk_attachment_info(self, instance_kill_count, node):
|
||||
for _ in range(instance_kill_count):
|
||||
try:
|
||||
logging.info("Obtaining disk attachment information")
|
||||
instance_id = (self.aws.get_instance_id(node)).split()
|
||||
volumes_ids = self.aws.get_volumes_ids(instance_id)
|
||||
if volumes_ids:
|
||||
vol_attachment_details = self.aws.get_volume_attachment_details(
|
||||
volumes_ids
|
||||
)
|
||||
return vol_attachment_details
|
||||
return
|
||||
except Exception as e:
|
||||
logging.error(
|
||||
"Failed to obtain disk attachment information of %s node. "
|
||||
"Encounteres following exception: %s." % (node, e)
|
||||
)
|
||||
raise RuntimeError()
|
||||
|
||||
# Node scenario to detach the volume
|
||||
def disk_detach_scenario(self, instance_kill_count, node, timeout):
|
||||
for _ in range(instance_kill_count):
|
||||
try:
|
||||
logging.info("Starting disk_detach_scenario injection")
|
||||
instance_id = (self.aws.get_instance_id(node)).split()
|
||||
volumes_ids = self.aws.get_volumes_ids(instance_id)
|
||||
logging.info(
|
||||
"Detaching the %s volumes from instance %s "
|
||||
% (volumes_ids, node)
|
||||
)
|
||||
self.aws.detach_volumes(volumes_ids)
|
||||
except Exception as e:
|
||||
logging.error(
|
||||
"Failed to detach disk from %s node. Encountered following"
|
||||
"exception: %s." % (node, e)
|
||||
)
|
||||
logging.debug("")
|
||||
raise RuntimeError()
|
||||
|
||||
# Node scenario to attach the volume
|
||||
def disk_attach_scenario(self, instance_kill_count, attachment_details, timeout):
|
||||
for _ in range(instance_kill_count):
|
||||
for attachment in attachment_details:
|
||||
self.aws.attach_volume(attachment["Attachments"][0])
|
||||
|
||||
@@ -163,7 +163,7 @@ class NodeActionsScenarioPlugin(AbstractScenarioPlugin):
|
||||
logging.info("action" + str(action))
|
||||
# Get the scenario specifics for running action nodes
|
||||
run_kill_count = get_yaml_item_value(node_scenario, "runs", 1)
|
||||
if action == "node_stop_start_scenario":
|
||||
if action in ("node_stop_start_scenario", "node_disk_detach_attach_scenario"):
|
||||
duration = get_yaml_item_value(node_scenario, "duration", 120)
|
||||
|
||||
timeout = get_yaml_item_value(node_scenario, "timeout", 120)
|
||||
@@ -200,6 +200,9 @@ class NodeActionsScenarioPlugin(AbstractScenarioPlugin):
|
||||
node_scenario_object.node_reboot_scenario(
|
||||
run_kill_count, single_node, timeout
|
||||
)
|
||||
elif action == "node_disk_detach_attach_scenario":
|
||||
node_scenario_object.node_disk_detach_attach_scenario(
|
||||
run_kill_count, single_node, timeout, duration)
|
||||
elif action == "stop_start_kubelet_scenario":
|
||||
node_scenario_object.stop_start_kubelet_scenario(
|
||||
run_kill_count, single_node, timeout
|
||||
|
||||
@@ -16,3 +16,10 @@ node_scenarios:
|
||||
instance_count: 1
|
||||
timeout: 120
|
||||
cloud_type: aws
|
||||
- actions:
|
||||
- node_disk_detach_attach_scenario
|
||||
node_name:
|
||||
label_selector:
|
||||
instance_count: 1
|
||||
timeout: 120
|
||||
cloud_type: aws
|
||||
Reference in New Issue
Block a user