Compare commits

...

3 Commits

Author SHA1 Message Date
Paige Patton
440890d252 adding v4.0.8 version (#756)
Some checks failed
Functional & Unit Tests / Functional & Unit Tests (push) Failing after 3m50s
Functional & Unit Tests / Generate Coverage Badge (push) Has been skipped
Signed-off-by: Paige Patton <prubenda@redhat.com>
2025-02-05 13:46:58 -05:00
Meghana Katta
69bf20fc76 Fixed the spelling mistake
Signed-off-by: Meghana Katta <mkatta@mkatta-thinkpadt14gen4.bengluru.csb>
2025-02-05 12:53:30 -05:00
Paige Patton
2a42a2dc31 adding node id to affected node
Some checks failed
Functional & Unit Tests / Functional & Unit Tests (push) Failing after 9m9s
Functional & Unit Tests / Generate Coverage Badge (push) Has been skipped
2025-02-03 19:30:52 -05:00
9 changed files with 46 additions and 17 deletions

View File

@@ -2,7 +2,7 @@
The following node chaos scenarios are supported:
1. **node_start_scenario**: Scenario to stop the node instance.
1. **node_start_scenario**: Scenario to start the node instance.
2. **node_stop_scenario**: Scenario to stop the node instance.
3. **node_stop_start_scenario**: Scenario to stop the node instance for specified duration and then start the node instance. Not supported on VMware.
4. **node_termination_scenario**: Scenario to terminate the node instance.

View File

@@ -239,6 +239,7 @@ class alibaba_node_scenarios(abstract_node_scenarios):
try:
logging.info("Starting node_start_scenario injection")
vm_id = self.alibaba.get_instance_id(node)
affected_node.node_id = vm_id
logging.info(
"Starting the node %s with instance ID: %s " % (node, vm_id)
)
@@ -263,6 +264,7 @@ class alibaba_node_scenarios(abstract_node_scenarios):
try:
logging.info("Starting node_stop_scenario injection")
vm_id = self.alibaba.get_instance_id(node)
affected_node.node_id = vm_id
logging.info(
"Stopping the node %s with instance ID: %s " % (node, vm_id)
)
@@ -289,6 +291,7 @@ class alibaba_node_scenarios(abstract_node_scenarios):
"Starting node_termination_scenario injection by first stopping instance"
)
vm_id = self.alibaba.get_instance_id(node)
affected_node.node_id = vm_id
self.alibaba.stop_instances(vm_id)
self.alibaba.wait_until_stopped(vm_id, timeout, affected_node)
logging.info(
@@ -316,6 +319,7 @@ class alibaba_node_scenarios(abstract_node_scenarios):
try:
logging.info("Starting node_reboot_scenario injection")
instance_id = self.alibaba.get_instance_id(node)
affected_node.node_id = instance_id
logging.info("Rebooting the node with instance ID: %s " % (instance_id))
self.alibaba.reboot_instances(instance_id)
nodeaction.wait_for_unknown_status(node, timeout, self.kubecli, affected_node)

View File

@@ -272,6 +272,7 @@ class aws_node_scenarios(abstract_node_scenarios):
try:
logging.info("Starting node_start_scenario injection")
instance_id = self.aws.get_instance_id(node)
affected_node.node_id = instance_id
logging.info(
"Starting the node %s with instance ID: %s " % (node, instance_id)
)
@@ -299,6 +300,7 @@ class aws_node_scenarios(abstract_node_scenarios):
try:
logging.info("Starting node_stop_scenario injection")
instance_id = self.aws.get_instance_id(node)
affected_node.node_id = instance_id
logging.info(
"Stopping the node %s with instance ID: %s " % (node, instance_id)
)
@@ -325,6 +327,7 @@ class aws_node_scenarios(abstract_node_scenarios):
try:
logging.info("Starting node_termination_scenario injection")
instance_id = self.aws.get_instance_id(node)
affected_node.node_id = instance_id
logging.info(
"Terminating the node %s with instance ID: %s "
% (node, instance_id)
@@ -358,6 +361,7 @@ class aws_node_scenarios(abstract_node_scenarios):
try:
logging.info("Starting node_reboot_scenario injection" + str(node))
instance_id = self.aws.get_instance_id(node)
affected_node.node_id = instance_id
logging.info(
"Rebooting the node %s with instance ID: %s " % (node, instance_id)
)

View File

@@ -170,7 +170,7 @@ class azure_node_scenarios(abstract_node_scenarios):
try:
logging.info("Starting node_start_scenario injection")
vm_name, resource_group = self.azure.get_instance_id(node)
affected_node.node_id = vm_name
logging.info(
"Starting the node %s with instance ID: %s "
% (vm_name, resource_group)
@@ -197,6 +197,7 @@ class azure_node_scenarios(abstract_node_scenarios):
try:
logging.info("Starting node_stop_scenario injection")
vm_name, resource_group = self.azure.get_instance_id(node)
affected_node.node_id = vm_name
logging.info(
"Stopping the node %s with instance ID: %s "
% (vm_name, resource_group)
@@ -221,8 +222,8 @@ class azure_node_scenarios(abstract_node_scenarios):
affected_node = AffectedNode(node)
try:
logging.info("Starting node_termination_scenario injection")
affected_node = AffectedNode(node)
vm_name, resource_group = self.azure.get_instance_id(node)
affected_node.node_id = vm_name
logging.info(
"Terminating the node %s with instance ID: %s "
% (vm_name, resource_group)
@@ -257,6 +258,7 @@ class azure_node_scenarios(abstract_node_scenarios):
try:
logging.info("Starting node_reboot_scenario injection")
vm_name, resource_group = self.azure.get_instance_id(node)
affected_node.node_id = vm_name
logging.info(
"Rebooting the node %s with instance ID: %s "
% (vm_name, resource_group)

View File

@@ -109,20 +109,28 @@ class BM:
self.get_ipmi_connection(bmc_addr, node_name).chassis_control_power_cycle()
# Wait until the node instance is running
def wait_until_running(self, bmc_addr, node_name):
def wait_until_running(self, bmc_addr, node_name, affected_node):
start_time = time.time()
while (
not self.get_ipmi_connection(bmc_addr, node_name)
.get_chassis_status()
.power_on
):
time.sleep(1)
end_time = time.time()
if affected_node:
affected_node.set_affected_node_status("running", end_time - start_time)
# Wait until the node instance is stopped
def wait_until_stopped(self, bmc_addr, node_name):
def wait_until_stopped(self, bmc_addr, node_name, affected_node):
start_time = time.time()
while (
self.get_ipmi_connection(bmc_addr, node_name).get_chassis_status().power_on
):
time.sleep(1)
end_time = time.time()
if affected_node:
affected_node.set_affected_node_status("stopped", end_time - start_time)
# krkn_lib
@@ -134,15 +142,17 @@ class bm_node_scenarios(abstract_node_scenarios):
# Node scenario to start the node
def node_start_scenario(self, instance_kill_count, node, timeout):
for _ in range(instance_kill_count):
affected_node = AffectedNode(node)
try:
logging.info("Starting node_start_scenario injection")
bmc_addr = self.bm.get_bmc_addr(node)
affected_node.node_id = bmc_addr
logging.info(
"Starting the node %s with bmc address: %s " % (node, bmc_addr)
)
self.bm.start_instances(bmc_addr, node)
self.bm.wait_until_running(bmc_addr, node)
nodeaction.wait_for_ready_status(node, timeout, self.kubecli)
self.bm.wait_until_running(bmc_addr, node, affected_node)
nodeaction.wait_for_ready_status(node, timeout, self.kubecli, affected_node)
logging.info(
"Node with bmc address: %s is in running state" % (bmc_addr)
)
@@ -155,6 +165,7 @@ class bm_node_scenarios(abstract_node_scenarios):
)
logging.error("node_start_scenario injection failed!")
raise e
self.affected_nodes_status.affected_nodes.append(affected_node)
# Node scenario to stop the node
def node_stop_scenario(self, instance_kill_count, node, timeout):
@@ -163,6 +174,7 @@ class bm_node_scenarios(abstract_node_scenarios):
try:
logging.info("Starting node_stop_scenario injection")
bmc_addr = self.bm.get_bmc_addr(node)
affected_node.node_id = bmc_addr
logging.info(
"Stopping the node %s with bmc address: %s " % (node, bmc_addr)
)

View File

@@ -49,6 +49,7 @@ class docker_node_scenarios(abstract_node_scenarios):
try:
logging.info("Starting node_start_scenario injection")
container_id = self.docker.get_container_id(node)
affected_node.node_id = container_id
logging.info(
"Starting the node %s with container ID: %s " % (node, container_id)
)
@@ -74,6 +75,7 @@ class docker_node_scenarios(abstract_node_scenarios):
try:
logging.info("Starting node_stop_scenario injection")
container_id = self.docker.get_container_id(node)
affected_node.node_id = container_id
logging.info(
"Stopping the node %s with container ID: %s " % (node, container_id)
)

View File

@@ -234,6 +234,7 @@ class gcp_node_scenarios(abstract_node_scenarios):
logging.info("Starting node_start_scenario injection")
instance = self.gcp.get_node_instance(node)
instance_id = self.gcp.get_instance_name(instance)
affected_node.node_id = instance_id
logging.info(
"Starting the node %s with instance ID: %s " % (node, instance_id)
)
@@ -252,7 +253,6 @@ class gcp_node_scenarios(abstract_node_scenarios):
logging.error("node_start_scenario injection failed!")
raise RuntimeError()
logging.info("started affected node" + str(affected_node.to_json()))
self.affected_nodes_status.affected_nodes.append(affected_node)
# Node scenario to stop the node
@@ -263,6 +263,7 @@ class gcp_node_scenarios(abstract_node_scenarios):
logging.info("Starting node_stop_scenario injection")
instance = self.gcp.get_node_instance(node)
instance_id = self.gcp.get_instance_name(instance)
affected_node.node_id = instance_id
logging.info(
"Stopping the node %s with instance ID: %s " % (node, instance_id)
)
@@ -280,7 +281,6 @@ class gcp_node_scenarios(abstract_node_scenarios):
logging.error("node_stop_scenario injection failed!")
raise RuntimeError()
logging.info("stopedd affected node" + str(affected_node.to_json()))
self.affected_nodes_status.affected_nodes.append(affected_node)
# Node scenario to terminate the node
@@ -291,6 +291,7 @@ class gcp_node_scenarios(abstract_node_scenarios):
logging.info("Starting node_termination_scenario injection")
instance = self.gcp.get_node_instance(node)
instance_id = self.gcp.get_instance_name(instance)
affected_node.node_id = instance_id
logging.info(
"Terminating the node %s with instance ID: %s "
% (node, instance_id)
@@ -325,6 +326,7 @@ class gcp_node_scenarios(abstract_node_scenarios):
logging.info("Starting node_reboot_scenario injection")
instance = self.gcp.get_node_instance(node)
instance_id = self.gcp.get_instance_name(instance)
affected_node.node_id = instance_id
logging.info(
"Rebooting the node %s with instance ID: %s " % (node, instance_id)
)

View File

@@ -15,6 +15,8 @@ from krkn.scenario_plugins.node_actions.gcp_node_scenarios import GCP
from krkn.scenario_plugins.node_actions.openstack_node_scenarios import OPENSTACKCLOUD
from krkn.scenario_plugins.native.node_scenarios.ibmcloud_plugin import IbmCloud
import krkn.scenario_plugins.node_actions.common_node_functions as nodeaction
from krkn_lib.models.k8s import AffectedNodeStatus, AffectedNode
class ShutDownScenarioPlugin(AbstractScenarioPlugin):
@@ -38,7 +40,7 @@ class ShutDownScenarioPlugin(AbstractScenarioPlugin):
shut_down_config_scenario, lib_telemetry.get_lib_kubernetes(), affected_nodes_status
)
scenario_telemetry.affected_nodes = affected_nodes_status
scenario_telemetry.affected_nodes = affected_nodes_status.affected_nodes
end_time = int(time.time())
cerberus.publish_kraken_status(krkn_config, [], start_time, end_time)
return 0
@@ -56,7 +58,6 @@ class ShutDownScenarioPlugin(AbstractScenarioPlugin):
pool = ThreadPool(processes=len(nodes))
else:
pool = ThreadPool(processes=processes)
logging.info("nodes type " + str(type(nodes[0])))
if type(nodes[0]) is tuple:
node_id = []
node_info = []
@@ -105,9 +106,8 @@ class ShutDownScenarioPlugin(AbstractScenarioPlugin):
node_id = []
for node in nodes:
instance_id = cloud_object.get_instance_id(node)
affected_nodes_status.affected_nodes.append(AffectedNode(node))
affected_nodes_status.affected_nodes.append(AffectedNode(node, node_id=instance_id))
node_id.append(instance_id)
logging.info("node id list " + str(node_id))
for _ in range(runs):
logging.info("Starting cluster_shut_down scenario injection")
stopping_nodes = set(node_id)
@@ -117,8 +117,7 @@ class ShutDownScenarioPlugin(AbstractScenarioPlugin):
while len(stopping_nodes) > 0:
for node in stopping_nodes:
affected_node = affected_nodes_status.get_affected_node_index(node)
# need to add in time that is passing while waiting for other nodes to be stopped
affected_node.set_cloud_stopping_time(time.time() - start_time)
if type(node) is tuple:
node_status = cloud_object.wait_until_stopped(
node[1], node[0], timeout, affected_node
@@ -129,6 +128,8 @@ class ShutDownScenarioPlugin(AbstractScenarioPlugin):
# Only want to remove node from stopping list
# when fully stopped/no error
if node_status:
# need to add in time that is passing while waiting for other nodes to be stopped
affected_node.set_cloud_stopping_time(time.time() - start_time)
stopped_nodes.remove(node)
stopping_nodes = stopped_nodes.copy()
@@ -148,7 +149,7 @@ class ShutDownScenarioPlugin(AbstractScenarioPlugin):
for node in not_running_nodes:
affected_node = affected_nodes_status.get_affected_node_index(node)
# need to add in time that is passing while waiting for other nodes to be running
affected_node.set_cloud_running_time(time.time() - start_time)
if type(node) is tuple:
node_status = cloud_object.wait_until_running(
node[1], node[0], timeout, affected_node
@@ -156,8 +157,10 @@ class ShutDownScenarioPlugin(AbstractScenarioPlugin):
else:
node_status = cloud_object.wait_until_running(node, timeout, affected_node)
if node_status:
affected_node.set_cloud_running_time(time.time() - start_time)
restarted_nodes.remove(node)
not_running_nodes = restarted_nodes.copy()
logging.info("Waiting for 150s to allow cluster component initialization")
time.sleep(150)

View File

@@ -15,7 +15,7 @@ google-cloud-compute==1.22.0
ibm_cloud_sdk_core==3.18.0
ibm_vpc==0.20.0
jinja2==3.1.5
krkn-lib==4.0.7
krkn-lib==4.0.8
lxml==5.1.0
kubernetes==28.1.0
numpy==1.26.4