adding v4.0.8 version (#756)

Signed-off-by: Paige Patton <prubenda@redhat.com>
Fixed the spelling mistake
2026-03-16 00:20:34 +00:00 · 2025-02-05 13:46:58 -05:00 · 2025-02-05 12:53:30 -05:00 · 2025-02-03 19:30:52 -05:00
9 changed files with 46 additions and 17 deletions
--- a/docs/node_scenarios.md
+++ b/docs/node_scenarios.md
@@ -2,7 +2,7 @@

 The following node chaos scenarios are supported:

-1. **node_start_scenario**: Scenario to stop the node instance.
+1. **node_start_scenario**: Scenario to start the node instance.
 2. **node_stop_scenario**: Scenario to stop the node instance.
 3. **node_stop_start_scenario**: Scenario to stop the node instance for specified duration and then start the node instance. Not supported on VMware.
 4. **node_termination_scenario**: Scenario to terminate the node instance.
--- a/krkn/scenario_plugins/node_actions/alibaba_node_scenarios.py
+++ b/krkn/scenario_plugins/node_actions/alibaba_node_scenarios.py
@@ -239,6 +239,7 @@ class alibaba_node_scenarios(abstract_node_scenarios):
            try:
                logging.info("Starting node_start_scenario injection")
                vm_id = self.alibaba.get_instance_id(node)
+                affected_node.node_id = vm_id
                logging.info(
                    "Starting the node %s with instance ID: %s " % (node, vm_id)
                )
@@ -263,6 +264,7 @@ class alibaba_node_scenarios(abstract_node_scenarios):
            try:
                logging.info("Starting node_stop_scenario injection")
                vm_id = self.alibaba.get_instance_id(node)
+                affected_node.node_id = vm_id
                logging.info(
                    "Stopping the node %s with instance ID: %s " % (node, vm_id)
                )
@@ -289,6 +291,7 @@ class alibaba_node_scenarios(abstract_node_scenarios):
                    "Starting node_termination_scenario injection by first stopping instance"
                )
                vm_id = self.alibaba.get_instance_id(node)
+                affected_node.node_id = vm_id
                self.alibaba.stop_instances(vm_id)
                self.alibaba.wait_until_stopped(vm_id, timeout, affected_node)
                logging.info(
@@ -316,6 +319,7 @@ class alibaba_node_scenarios(abstract_node_scenarios):
            try:
                logging.info("Starting node_reboot_scenario injection")
                instance_id = self.alibaba.get_instance_id(node)
+                affected_node.node_id = instance_id
                logging.info("Rebooting the node with instance ID: %s " % (instance_id))
                self.alibaba.reboot_instances(instance_id)
                nodeaction.wait_for_unknown_status(node, timeout, self.kubecli, affected_node)
--- a/krkn/scenario_plugins/node_actions/aws_node_scenarios.py
+++ b/krkn/scenario_plugins/node_actions/aws_node_scenarios.py
@@ -272,6 +272,7 @@ class aws_node_scenarios(abstract_node_scenarios):
            try:
                logging.info("Starting node_start_scenario injection")
                instance_id = self.aws.get_instance_id(node)
+                affected_node.node_id = instance_id
                logging.info(
                    "Starting the node %s with instance ID: %s " % (node, instance_id)
                )
@@ -299,6 +300,7 @@ class aws_node_scenarios(abstract_node_scenarios):
            try:
                logging.info("Starting node_stop_scenario injection")
                instance_id = self.aws.get_instance_id(node)
+                affected_node.node_id = instance_id
                logging.info(
                    "Stopping the node %s with instance ID: %s " % (node, instance_id)
                )
@@ -325,6 +327,7 @@ class aws_node_scenarios(abstract_node_scenarios):
            try:
                logging.info("Starting node_termination_scenario injection")
                instance_id = self.aws.get_instance_id(node)
+                affected_node.node_id = instance_id
                logging.info(
                    "Terminating the node %s with instance ID: %s "
                    % (node, instance_id)
@@ -358,6 +361,7 @@ class aws_node_scenarios(abstract_node_scenarios):
            try:
                logging.info("Starting node_reboot_scenario injection" + str(node))
                instance_id = self.aws.get_instance_id(node)
+                affected_node.node_id = instance_id
                logging.info(
                    "Rebooting the node %s with instance ID: %s " % (node, instance_id)
                )
--- a/krkn/scenario_plugins/node_actions/az_node_scenarios.py
+++ b/krkn/scenario_plugins/node_actions/az_node_scenarios.py
@@ -170,7 +170,7 @@ class azure_node_scenarios(abstract_node_scenarios):
            try:
                logging.info("Starting node_start_scenario injection")
                vm_name, resource_group = self.azure.get_instance_id(node)
-                
+                affected_node.node_id = vm_name
                logging.info(
                    "Starting the node %s with instance ID: %s "
                    % (vm_name, resource_group)
@@ -197,6 +197,7 @@ class azure_node_scenarios(abstract_node_scenarios):
            try:
                logging.info("Starting node_stop_scenario injection")
                vm_name, resource_group = self.azure.get_instance_id(node)
+                affected_node.node_id = vm_name
                logging.info(
                    "Stopping the node %s with instance ID: %s "
                    % (vm_name, resource_group)
@@ -221,8 +222,8 @@ class azure_node_scenarios(abstract_node_scenarios):
            affected_node = AffectedNode(node)
            try:
                logging.info("Starting node_termination_scenario injection")
-                affected_node = AffectedNode(node)
                vm_name, resource_group = self.azure.get_instance_id(node)
+                affected_node.node_id = vm_name
                logging.info(
                    "Terminating the node %s with instance ID: %s "
                    % (vm_name, resource_group)
@@ -257,6 +258,7 @@ class azure_node_scenarios(abstract_node_scenarios):
            try:
                logging.info("Starting node_reboot_scenario injection")
                vm_name, resource_group = self.azure.get_instance_id(node)
+                affected_node.node_id = vm_name
                logging.info(
                    "Rebooting the node %s with instance ID: %s "
                    % (vm_name, resource_group)
--- a/krkn/scenario_plugins/node_actions/bm_node_scenarios.py
+++ b/krkn/scenario_plugins/node_actions/bm_node_scenarios.py
@@ -109,20 +109,28 @@ class BM:
        self.get_ipmi_connection(bmc_addr, node_name).chassis_control_power_cycle()

    # Wait until the node instance is running
-    def wait_until_running(self, bmc_addr, node_name):
+    def wait_until_running(self, bmc_addr, node_name, affected_node):
+        start_time = time.time()
        while (
            not self.get_ipmi_connection(bmc_addr, node_name)
            .get_chassis_status()
            .power_on
        ):
            time.sleep(1)
+        end_time = time.time()
+        if affected_node:
+            affected_node.set_affected_node_status("running", end_time - start_time)

    # Wait until the node instance is stopped
-    def wait_until_stopped(self, bmc_addr, node_name):
+    def wait_until_stopped(self, bmc_addr, node_name, affected_node):
+        start_time = time.time()
        while (
            self.get_ipmi_connection(bmc_addr, node_name).get_chassis_status().power_on
        ):
            time.sleep(1)
+        end_time = time.time()
+        if affected_node:
+            affected_node.set_affected_node_status("stopped", end_time - start_time)


 # krkn_lib
@@ -134,15 +142,17 @@ class bm_node_scenarios(abstract_node_scenarios):
    # Node scenario to start the node
    def node_start_scenario(self, instance_kill_count, node, timeout):
        for _ in range(instance_kill_count):
+            affected_node = AffectedNode(node)
            try:
                logging.info("Starting node_start_scenario injection")
                bmc_addr = self.bm.get_bmc_addr(node)
+                affected_node.node_id = bmc_addr
                logging.info(
                    "Starting the node %s with bmc address: %s " % (node, bmc_addr)
                )
                self.bm.start_instances(bmc_addr, node)
-                self.bm.wait_until_running(bmc_addr, node)
-                nodeaction.wait_for_ready_status(node, timeout, self.kubecli)
+                self.bm.wait_until_running(bmc_addr, node, affected_node)
+                nodeaction.wait_for_ready_status(node, timeout, self.kubecli, affected_node)
                logging.info(
                    "Node with bmc address: %s is in running state" % (bmc_addr)
                )
@@ -155,6 +165,7 @@ class bm_node_scenarios(abstract_node_scenarios):
                )
                logging.error("node_start_scenario injection failed!")
                raise e
+            self.affected_nodes_status.affected_nodes.append(affected_node)

    # Node scenario to stop the node
    def node_stop_scenario(self, instance_kill_count, node, timeout):
@@ -163,6 +174,7 @@ class bm_node_scenarios(abstract_node_scenarios):
            try:
                logging.info("Starting node_stop_scenario injection")
                bmc_addr = self.bm.get_bmc_addr(node)
+                affected_node.node_id = bmc_addr
                logging.info(
                    "Stopping the node %s with bmc address: %s " % (node, bmc_addr)
                )
--- a/krkn/scenario_plugins/node_actions/docker_node_scenarios.py
+++ b/krkn/scenario_plugins/node_actions/docker_node_scenarios.py
@@ -49,6 +49,7 @@ class docker_node_scenarios(abstract_node_scenarios):
            try:
                logging.info("Starting node_start_scenario injection")
                container_id = self.docker.get_container_id(node)
+                affected_node.node_id = container_id
                logging.info(
                    "Starting the node %s with container ID: %s " % (node, container_id)
                )
@@ -74,6 +75,7 @@ class docker_node_scenarios(abstract_node_scenarios):
            try:
                logging.info("Starting node_stop_scenario injection")
                container_id = self.docker.get_container_id(node)
+                affected_node.node_id = container_id
                logging.info(
                    "Stopping the node %s with container ID: %s " % (node, container_id)
                )
--- a/krkn/scenario_plugins/node_actions/gcp_node_scenarios.py
+++ b/krkn/scenario_plugins/node_actions/gcp_node_scenarios.py
@@ -234,6 +234,7 @@ class gcp_node_scenarios(abstract_node_scenarios):
                logging.info("Starting node_start_scenario injection")
                instance = self.gcp.get_node_instance(node)
                instance_id = self.gcp.get_instance_name(instance)
+                affected_node.node_id = instance_id
                logging.info(
                    "Starting the node %s with instance ID: %s " % (node, instance_id)
                )
@@ -252,7 +253,6 @@ class gcp_node_scenarios(abstract_node_scenarios):
                logging.error("node_start_scenario injection failed!")

                raise RuntimeError()
-            logging.info("started affected node" + str(affected_node.to_json()))
            self.affected_nodes_status.affected_nodes.append(affected_node)

    # Node scenario to stop the node
@@ -263,6 +263,7 @@ class gcp_node_scenarios(abstract_node_scenarios):
                logging.info("Starting node_stop_scenario injection")
                instance = self.gcp.get_node_instance(node)
                instance_id = self.gcp.get_instance_name(instance)
+                affected_node.node_id = instance_id
                logging.info(
                    "Stopping the node %s with instance ID: %s " % (node, instance_id)
                )
@@ -280,7 +281,6 @@ class gcp_node_scenarios(abstract_node_scenarios):
                logging.error("node_stop_scenario injection failed!")

                raise RuntimeError()
-            logging.info("stopedd affected node" + str(affected_node.to_json()))
            self.affected_nodes_status.affected_nodes.append(affected_node)

    # Node scenario to terminate the node
@@ -291,6 +291,7 @@ class gcp_node_scenarios(abstract_node_scenarios):
                logging.info("Starting node_termination_scenario injection")
                instance = self.gcp.get_node_instance(node)
                instance_id = self.gcp.get_instance_name(instance)
+                affected_node.node_id = instance_id
                logging.info(
                    "Terminating the node %s with instance ID: %s "
                    % (node, instance_id)
@@ -325,6 +326,7 @@ class gcp_node_scenarios(abstract_node_scenarios):
                logging.info("Starting node_reboot_scenario injection")
                instance = self.gcp.get_node_instance(node)
                instance_id = self.gcp.get_instance_name(instance)
+                affected_node.node_id = instance_id
                logging.info(
                    "Rebooting the node %s with instance ID: %s " % (node, instance_id)
                )
--- a/krkn/scenario_plugins/shut_down/shut_down_scenario_plugin.py
+++ b/krkn/scenario_plugins/shut_down/shut_down_scenario_plugin.py
@@ -15,6 +15,8 @@ from krkn.scenario_plugins.node_actions.gcp_node_scenarios import GCP
 from krkn.scenario_plugins.node_actions.openstack_node_scenarios import OPENSTACKCLOUD
 from krkn.scenario_plugins.native.node_scenarios.ibmcloud_plugin import IbmCloud

+import krkn.scenario_plugins.node_actions.common_node_functions as nodeaction
+
 from krkn_lib.models.k8s import AffectedNodeStatus, AffectedNode

 class ShutDownScenarioPlugin(AbstractScenarioPlugin):
@@ -38,7 +40,7 @@ class ShutDownScenarioPlugin(AbstractScenarioPlugin):
                    shut_down_config_scenario, lib_telemetry.get_lib_kubernetes(), affected_nodes_status
                )

-                scenario_telemetry.affected_nodes = affected_nodes_status
+                scenario_telemetry.affected_nodes = affected_nodes_status.affected_nodes
                end_time = int(time.time())
                cerberus.publish_kraken_status(krkn_config, [], start_time, end_time)
                return 0
@@ -56,7 +58,6 @@ class ShutDownScenarioPlugin(AbstractScenarioPlugin):
                pool = ThreadPool(processes=len(nodes))
            else:
                pool = ThreadPool(processes=processes)
-            logging.info("nodes type " + str(type(nodes[0])))
            if type(nodes[0]) is tuple:
                node_id = []
                node_info = []
@@ -105,9 +106,8 @@ class ShutDownScenarioPlugin(AbstractScenarioPlugin):
        node_id = []
        for node in nodes:
            instance_id = cloud_object.get_instance_id(node)
-            affected_nodes_status.affected_nodes.append(AffectedNode(node))
+            affected_nodes_status.affected_nodes.append(AffectedNode(node, node_id=instance_id))
            node_id.append(instance_id)
-        logging.info("node id list " + str(node_id))
        for _ in range(runs):
            logging.info("Starting cluster_shut_down scenario injection")
            stopping_nodes = set(node_id)
@@ -117,8 +117,7 @@ class ShutDownScenarioPlugin(AbstractScenarioPlugin):
            while len(stopping_nodes) > 0:
                for node in stopping_nodes:
                    affected_node = affected_nodes_status.get_affected_node_index(node)
-                    # need to add in time that is passing while waiting for other nodes to be stopped
-                    affected_node.set_cloud_stopping_time(time.time() - start_time)
+                    
                    if type(node) is tuple:
                        node_status = cloud_object.wait_until_stopped(
                            node[1], node[0], timeout, affected_node
@@ -129,6 +128,8 @@ class ShutDownScenarioPlugin(AbstractScenarioPlugin):
                    # Only want to remove node from stopping list
                    # when fully stopped/no error
                    if node_status:
+                        # need to add in time that is passing while waiting for other nodes to be stopped
+                        affected_node.set_cloud_stopping_time(time.time() - start_time)
                        stopped_nodes.remove(node)

                stopping_nodes = stopped_nodes.copy()
@@ -148,7 +149,7 @@ class ShutDownScenarioPlugin(AbstractScenarioPlugin):
                for node in not_running_nodes:
                    affected_node = affected_nodes_status.get_affected_node_index(node)
                    # need to add in time that is passing while waiting for other nodes to be running
-                    affected_node.set_cloud_running_time(time.time() - start_time)
+                    
                    if type(node) is tuple:
                        node_status = cloud_object.wait_until_running(
                            node[1], node[0], timeout, affected_node
@@ -156,8 +157,10 @@ class ShutDownScenarioPlugin(AbstractScenarioPlugin):
                    else:
                        node_status = cloud_object.wait_until_running(node, timeout, affected_node)
                    if node_status:
+                        affected_node.set_cloud_running_time(time.time() - start_time)
                        restarted_nodes.remove(node)
                not_running_nodes = restarted_nodes.copy()
+
            logging.info("Waiting for 150s to allow cluster component initialization")
            time.sleep(150)

--- a/requirements.txt
+++ b/requirements.txt
@@ -15,7 +15,7 @@ google-cloud-compute==1.22.0
 ibm_cloud_sdk_core==3.18.0
 ibm_vpc==0.20.0
 jinja2==3.1.5
-krkn-lib==4.0.7
+krkn-lib==4.0.8
 lxml==5.1.0
 kubernetes==28.1.0
 numpy==1.26.4
Author	SHA1	Message	Date
Paige Patton	440890d252	adding v4.0.8 version (#756) Some checks failed Functional & Unit Tests / Functional & Unit Tests (push) Failing after 3m50s Functional & Unit Tests / Generate Coverage Badge (push) Has been skipped Signed-off-by: Paige Patton <prubenda@redhat.com>	2025-02-05 13:46:58 -05:00
Meghana Katta	69bf20fc76	Fixed the spelling mistake Signed-off-by: Meghana Katta <mkatta@mkatta-thinkpadt14gen4.bengluru.csb>	2025-02-05 12:53:30 -05:00
Paige Patton	2a42a2dc31	adding node id to affected node Some checks failed Functional & Unit Tests / Functional & Unit Tests (push) Failing after 9m9s Functional & Unit Tests / Generate Coverage Badge (push) Has been skipped	2025-02-03 19:30:52 -05:00