mirror of
https://github.com/krkn-chaos/krkn.git
synced 2026-03-16 00:20:34 +00:00
Compare commits
3 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| | 440890d252 | | |
| | 69bf20fc76 | | |
| | 2a42a2dc31 | | |
@@ -2,7 +2,7 @@
|
||||
|
||||
The following node chaos scenarios are supported:
|
||||
|
||||
1. **node_start_scenario**: Scenario to stop the node instance.
|
||||
1. **node_start_scenario**: Scenario to start the node instance.
|
||||
2. **node_stop_scenario**: Scenario to stop the node instance.
|
||||
3. **node_stop_start_scenario**: Scenario to stop the node instance for specified duration and then start the node instance. Not supported on VMware.
|
||||
4. **node_termination_scenario**: Scenario to terminate the node instance.
|
||||
|
||||
@@ -239,6 +239,7 @@ class alibaba_node_scenarios(abstract_node_scenarios):
|
||||
try:
|
||||
logging.info("Starting node_start_scenario injection")
|
||||
vm_id = self.alibaba.get_instance_id(node)
|
||||
affected_node.node_id = vm_id
|
||||
logging.info(
|
||||
"Starting the node %s with instance ID: %s " % (node, vm_id)
|
||||
)
|
||||
@@ -263,6 +264,7 @@ class alibaba_node_scenarios(abstract_node_scenarios):
|
||||
try:
|
||||
logging.info("Starting node_stop_scenario injection")
|
||||
vm_id = self.alibaba.get_instance_id(node)
|
||||
affected_node.node_id = vm_id
|
||||
logging.info(
|
||||
"Stopping the node %s with instance ID: %s " % (node, vm_id)
|
||||
)
|
||||
@@ -289,6 +291,7 @@ class alibaba_node_scenarios(abstract_node_scenarios):
|
||||
"Starting node_termination_scenario injection by first stopping instance"
|
||||
)
|
||||
vm_id = self.alibaba.get_instance_id(node)
|
||||
affected_node.node_id = vm_id
|
||||
self.alibaba.stop_instances(vm_id)
|
||||
self.alibaba.wait_until_stopped(vm_id, timeout, affected_node)
|
||||
logging.info(
|
||||
@@ -316,6 +319,7 @@ class alibaba_node_scenarios(abstract_node_scenarios):
|
||||
try:
|
||||
logging.info("Starting node_reboot_scenario injection")
|
||||
instance_id = self.alibaba.get_instance_id(node)
|
||||
affected_node.node_id = instance_id
|
||||
logging.info("Rebooting the node with instance ID: %s " % (instance_id))
|
||||
self.alibaba.reboot_instances(instance_id)
|
||||
nodeaction.wait_for_unknown_status(node, timeout, self.kubecli, affected_node)
|
||||
|
||||
@@ -272,6 +272,7 @@ class aws_node_scenarios(abstract_node_scenarios):
|
||||
try:
|
||||
logging.info("Starting node_start_scenario injection")
|
||||
instance_id = self.aws.get_instance_id(node)
|
||||
affected_node.node_id = instance_id
|
||||
logging.info(
|
||||
"Starting the node %s with instance ID: %s " % (node, instance_id)
|
||||
)
|
||||
@@ -299,6 +300,7 @@ class aws_node_scenarios(abstract_node_scenarios):
|
||||
try:
|
||||
logging.info("Starting node_stop_scenario injection")
|
||||
instance_id = self.aws.get_instance_id(node)
|
||||
affected_node.node_id = instance_id
|
||||
logging.info(
|
||||
"Stopping the node %s with instance ID: %s " % (node, instance_id)
|
||||
)
|
||||
@@ -325,6 +327,7 @@ class aws_node_scenarios(abstract_node_scenarios):
|
||||
try:
|
||||
logging.info("Starting node_termination_scenario injection")
|
||||
instance_id = self.aws.get_instance_id(node)
|
||||
affected_node.node_id = instance_id
|
||||
logging.info(
|
||||
"Terminating the node %s with instance ID: %s "
|
||||
% (node, instance_id)
|
||||
@@ -358,6 +361,7 @@ class aws_node_scenarios(abstract_node_scenarios):
|
||||
try:
|
||||
logging.info("Starting node_reboot_scenario injection" + str(node))
|
||||
instance_id = self.aws.get_instance_id(node)
|
||||
affected_node.node_id = instance_id
|
||||
logging.info(
|
||||
"Rebooting the node %s with instance ID: %s " % (node, instance_id)
|
||||
)
|
||||
|
||||
@@ -170,7 +170,7 @@ class azure_node_scenarios(abstract_node_scenarios):
|
||||
try:
|
||||
logging.info("Starting node_start_scenario injection")
|
||||
vm_name, resource_group = self.azure.get_instance_id(node)
|
||||
|
||||
affected_node.node_id = vm_name
|
||||
logging.info(
|
||||
"Starting the node %s with instance ID: %s "
|
||||
% (vm_name, resource_group)
|
||||
@@ -197,6 +197,7 @@ class azure_node_scenarios(abstract_node_scenarios):
|
||||
try:
|
||||
logging.info("Starting node_stop_scenario injection")
|
||||
vm_name, resource_group = self.azure.get_instance_id(node)
|
||||
affected_node.node_id = vm_name
|
||||
logging.info(
|
||||
"Stopping the node %s with instance ID: %s "
|
||||
% (vm_name, resource_group)
|
||||
@@ -221,8 +222,8 @@ class azure_node_scenarios(abstract_node_scenarios):
|
||||
affected_node = AffectedNode(node)
|
||||
try:
|
||||
logging.info("Starting node_termination_scenario injection")
|
||||
affected_node = AffectedNode(node)
|
||||
vm_name, resource_group = self.azure.get_instance_id(node)
|
||||
affected_node.node_id = vm_name
|
||||
logging.info(
|
||||
"Terminating the node %s with instance ID: %s "
|
||||
% (vm_name, resource_group)
|
||||
@@ -257,6 +258,7 @@ class azure_node_scenarios(abstract_node_scenarios):
|
||||
try:
|
||||
logging.info("Starting node_reboot_scenario injection")
|
||||
vm_name, resource_group = self.azure.get_instance_id(node)
|
||||
affected_node.node_id = vm_name
|
||||
logging.info(
|
||||
"Rebooting the node %s with instance ID: %s "
|
||||
% (vm_name, resource_group)
|
||||
|
||||
@@ -109,20 +109,28 @@ class BM:
|
||||
self.get_ipmi_connection(bmc_addr, node_name).chassis_control_power_cycle()
|
||||
|
||||
# Wait until the node instance is running
|
||||
def wait_until_running(self, bmc_addr, node_name):
|
||||
def wait_until_running(self, bmc_addr, node_name, affected_node):
|
||||
start_time = time.time()
|
||||
while (
|
||||
not self.get_ipmi_connection(bmc_addr, node_name)
|
||||
.get_chassis_status()
|
||||
.power_on
|
||||
):
|
||||
time.sleep(1)
|
||||
end_time = time.time()
|
||||
if affected_node:
|
||||
affected_node.set_affected_node_status("running", end_time - start_time)
|
||||
|
||||
# Wait until the node instance is stopped
|
||||
def wait_until_stopped(self, bmc_addr, node_name):
|
||||
def wait_until_stopped(self, bmc_addr, node_name, affected_node):
|
||||
start_time = time.time()
|
||||
while (
|
||||
self.get_ipmi_connection(bmc_addr, node_name).get_chassis_status().power_on
|
||||
):
|
||||
time.sleep(1)
|
||||
end_time = time.time()
|
||||
if affected_node:
|
||||
affected_node.set_affected_node_status("stopped", end_time - start_time)
|
||||
|
||||
|
||||
# krkn_lib
|
||||
@@ -134,15 +142,17 @@ class bm_node_scenarios(abstract_node_scenarios):
|
||||
# Node scenario to start the node
|
||||
def node_start_scenario(self, instance_kill_count, node, timeout):
|
||||
for _ in range(instance_kill_count):
|
||||
affected_node = AffectedNode(node)
|
||||
try:
|
||||
logging.info("Starting node_start_scenario injection")
|
||||
bmc_addr = self.bm.get_bmc_addr(node)
|
||||
affected_node.node_id = bmc_addr
|
||||
logging.info(
|
||||
"Starting the node %s with bmc address: %s " % (node, bmc_addr)
|
||||
)
|
||||
self.bm.start_instances(bmc_addr, node)
|
||||
self.bm.wait_until_running(bmc_addr, node)
|
||||
nodeaction.wait_for_ready_status(node, timeout, self.kubecli)
|
||||
self.bm.wait_until_running(bmc_addr, node, affected_node)
|
||||
nodeaction.wait_for_ready_status(node, timeout, self.kubecli, affected_node)
|
||||
logging.info(
|
||||
"Node with bmc address: %s is in running state" % (bmc_addr)
|
||||
)
|
||||
@@ -155,6 +165,7 @@ class bm_node_scenarios(abstract_node_scenarios):
|
||||
)
|
||||
logging.error("node_start_scenario injection failed!")
|
||||
raise e
|
||||
self.affected_nodes_status.affected_nodes.append(affected_node)
|
||||
|
||||
# Node scenario to stop the node
|
||||
def node_stop_scenario(self, instance_kill_count, node, timeout):
|
||||
@@ -163,6 +174,7 @@ class bm_node_scenarios(abstract_node_scenarios):
|
||||
try:
|
||||
logging.info("Starting node_stop_scenario injection")
|
||||
bmc_addr = self.bm.get_bmc_addr(node)
|
||||
affected_node.node_id = bmc_addr
|
||||
logging.info(
|
||||
"Stopping the node %s with bmc address: %s " % (node, bmc_addr)
|
||||
)
|
||||
|
||||
@@ -49,6 +49,7 @@ class docker_node_scenarios(abstract_node_scenarios):
|
||||
try:
|
||||
logging.info("Starting node_start_scenario injection")
|
||||
container_id = self.docker.get_container_id(node)
|
||||
affected_node.node_id = container_id
|
||||
logging.info(
|
||||
"Starting the node %s with container ID: %s " % (node, container_id)
|
||||
)
|
||||
@@ -74,6 +75,7 @@ class docker_node_scenarios(abstract_node_scenarios):
|
||||
try:
|
||||
logging.info("Starting node_stop_scenario injection")
|
||||
container_id = self.docker.get_container_id(node)
|
||||
affected_node.node_id = container_id
|
||||
logging.info(
|
||||
"Stopping the node %s with container ID: %s " % (node, container_id)
|
||||
)
|
||||
|
||||
@@ -234,6 +234,7 @@ class gcp_node_scenarios(abstract_node_scenarios):
|
||||
logging.info("Starting node_start_scenario injection")
|
||||
instance = self.gcp.get_node_instance(node)
|
||||
instance_id = self.gcp.get_instance_name(instance)
|
||||
affected_node.node_id = instance_id
|
||||
logging.info(
|
||||
"Starting the node %s with instance ID: %s " % (node, instance_id)
|
||||
)
|
||||
@@ -252,7 +253,6 @@ class gcp_node_scenarios(abstract_node_scenarios):
|
||||
logging.error("node_start_scenario injection failed!")
|
||||
|
||||
raise RuntimeError()
|
||||
logging.info("started affected node" + str(affected_node.to_json()))
|
||||
self.affected_nodes_status.affected_nodes.append(affected_node)
|
||||
|
||||
# Node scenario to stop the node
|
||||
@@ -263,6 +263,7 @@ class gcp_node_scenarios(abstract_node_scenarios):
|
||||
logging.info("Starting node_stop_scenario injection")
|
||||
instance = self.gcp.get_node_instance(node)
|
||||
instance_id = self.gcp.get_instance_name(instance)
|
||||
affected_node.node_id = instance_id
|
||||
logging.info(
|
||||
"Stopping the node %s with instance ID: %s " % (node, instance_id)
|
||||
)
|
||||
@@ -280,7 +281,6 @@ class gcp_node_scenarios(abstract_node_scenarios):
|
||||
logging.error("node_stop_scenario injection failed!")
|
||||
|
||||
raise RuntimeError()
|
||||
logging.info("stopedd affected node" + str(affected_node.to_json()))
|
||||
self.affected_nodes_status.affected_nodes.append(affected_node)
|
||||
|
||||
# Node scenario to terminate the node
|
||||
@@ -291,6 +291,7 @@ class gcp_node_scenarios(abstract_node_scenarios):
|
||||
logging.info("Starting node_termination_scenario injection")
|
||||
instance = self.gcp.get_node_instance(node)
|
||||
instance_id = self.gcp.get_instance_name(instance)
|
||||
affected_node.node_id = instance_id
|
||||
logging.info(
|
||||
"Terminating the node %s with instance ID: %s "
|
||||
% (node, instance_id)
|
||||
@@ -325,6 +326,7 @@ class gcp_node_scenarios(abstract_node_scenarios):
|
||||
logging.info("Starting node_reboot_scenario injection")
|
||||
instance = self.gcp.get_node_instance(node)
|
||||
instance_id = self.gcp.get_instance_name(instance)
|
||||
affected_node.node_id = instance_id
|
||||
logging.info(
|
||||
"Rebooting the node %s with instance ID: %s " % (node, instance_id)
|
||||
)
|
||||
|
||||
@@ -15,6 +15,8 @@ from krkn.scenario_plugins.node_actions.gcp_node_scenarios import GCP
|
||||
from krkn.scenario_plugins.node_actions.openstack_node_scenarios import OPENSTACKCLOUD
|
||||
from krkn.scenario_plugins.native.node_scenarios.ibmcloud_plugin import IbmCloud
|
||||
|
||||
import krkn.scenario_plugins.node_actions.common_node_functions as nodeaction
|
||||
|
||||
from krkn_lib.models.k8s import AffectedNodeStatus, AffectedNode
|
||||
|
||||
class ShutDownScenarioPlugin(AbstractScenarioPlugin):
|
||||
@@ -38,7 +40,7 @@ class ShutDownScenarioPlugin(AbstractScenarioPlugin):
|
||||
shut_down_config_scenario, lib_telemetry.get_lib_kubernetes(), affected_nodes_status
|
||||
)
|
||||
|
||||
scenario_telemetry.affected_nodes = affected_nodes_status
|
||||
scenario_telemetry.affected_nodes = affected_nodes_status.affected_nodes
|
||||
end_time = int(time.time())
|
||||
cerberus.publish_kraken_status(krkn_config, [], start_time, end_time)
|
||||
return 0
|
||||
@@ -56,7 +58,6 @@ class ShutDownScenarioPlugin(AbstractScenarioPlugin):
|
||||
pool = ThreadPool(processes=len(nodes))
|
||||
else:
|
||||
pool = ThreadPool(processes=processes)
|
||||
logging.info("nodes type " + str(type(nodes[0])))
|
||||
if type(nodes[0]) is tuple:
|
||||
node_id = []
|
||||
node_info = []
|
||||
@@ -105,9 +106,8 @@ class ShutDownScenarioPlugin(AbstractScenarioPlugin):
|
||||
node_id = []
|
||||
for node in nodes:
|
||||
instance_id = cloud_object.get_instance_id(node)
|
||||
affected_nodes_status.affected_nodes.append(AffectedNode(node))
|
||||
affected_nodes_status.affected_nodes.append(AffectedNode(node, node_id=instance_id))
|
||||
node_id.append(instance_id)
|
||||
logging.info("node id list " + str(node_id))
|
||||
for _ in range(runs):
|
||||
logging.info("Starting cluster_shut_down scenario injection")
|
||||
stopping_nodes = set(node_id)
|
||||
@@ -117,8 +117,7 @@ class ShutDownScenarioPlugin(AbstractScenarioPlugin):
|
||||
while len(stopping_nodes) > 0:
|
||||
for node in stopping_nodes:
|
||||
affected_node = affected_nodes_status.get_affected_node_index(node)
|
||||
# need to add in time that is passing while waiting for other nodes to be stopped
|
||||
affected_node.set_cloud_stopping_time(time.time() - start_time)
|
||||
|
||||
if type(node) is tuple:
|
||||
node_status = cloud_object.wait_until_stopped(
|
||||
node[1], node[0], timeout, affected_node
|
||||
@@ -129,6 +128,8 @@ class ShutDownScenarioPlugin(AbstractScenarioPlugin):
|
||||
# Only want to remove node from stopping list
|
||||
# when fully stopped/no error
|
||||
if node_status:
|
||||
# need to add in time that is passing while waiting for other nodes to be stopped
|
||||
affected_node.set_cloud_stopping_time(time.time() - start_time)
|
||||
stopped_nodes.remove(node)
|
||||
|
||||
stopping_nodes = stopped_nodes.copy()
|
||||
@@ -148,7 +149,7 @@ class ShutDownScenarioPlugin(AbstractScenarioPlugin):
|
||||
for node in not_running_nodes:
|
||||
affected_node = affected_nodes_status.get_affected_node_index(node)
|
||||
# need to add in time that is passing while waiting for other nodes to be running
|
||||
affected_node.set_cloud_running_time(time.time() - start_time)
|
||||
|
||||
if type(node) is tuple:
|
||||
node_status = cloud_object.wait_until_running(
|
||||
node[1], node[0], timeout, affected_node
|
||||
@@ -156,8 +157,10 @@ class ShutDownScenarioPlugin(AbstractScenarioPlugin):
|
||||
else:
|
||||
node_status = cloud_object.wait_until_running(node, timeout, affected_node)
|
||||
if node_status:
|
||||
affected_node.set_cloud_running_time(time.time() - start_time)
|
||||
restarted_nodes.remove(node)
|
||||
not_running_nodes = restarted_nodes.copy()
|
||||
|
||||
logging.info("Waiting for 150s to allow cluster component initialization")
|
||||
time.sleep(150)
|
||||
|
||||
|
||||
@@ -15,7 +15,7 @@ google-cloud-compute==1.22.0
|
||||
ibm_cloud_sdk_core==3.18.0
|
||||
ibm_vpc==0.20.0
|
||||
jinja2==3.1.5
|
||||
krkn-lib==4.0.7
|
||||
krkn-lib==4.0.8
|
||||
lxml==5.1.0
|
||||
kubernetes==28.1.0
|
||||
numpy==1.26.4
|
||||
|
||||
Reference in New Issue
Block a user