Add duration parameter for node scenarios

This option applies only to the node_stop_start_scenario, where the user wants to keep the node stopped for a certain duration to understand the impact before starting it back up. This commit also bumps the timeout in the scenario config files from 120 seconds to 360 seconds to make sure there is enough time for the node to reach the Ready state on the Kubernetes side after the node is started on the infra side. Signed-off-by: Naga Ravi Chaitanya Elluri <nelluri@redhat.com>
2026-02-14 18:10:00 +00:00 · 2024-08-11 19:23:02 -04:00
parent 5484828b67
commit 1057917731
7 changed files with 32 additions and 10 deletions
--- a/kraken/node_actions/abstract_node_scenarios.py
+++ b/kraken/node_actions/abstract_node_scenarios.py
@@ -1,5 +1,6 @@
 import sys
 import logging
+import time
 import kraken.invoke.command as runcommand
 import kraken.node_actions.common_node_functions as nodeaction
 from krkn_lib.k8s import KrknKubernetes
@@ -18,9 +19,11 @@ class abstract_node_scenarios:
        pass

    # Node scenario to stop and then start the node
-    def node_stop_start_scenario(self, instance_kill_count, node, timeout):
+    def node_stop_start_scenario(self, instance_kill_count, node, timeout, duration):
        logging.info("Starting node_stop_start_scenario injection")
        self.node_stop_scenario(instance_kill_count, node, timeout)
+        logging.info("Waiting for %s seconds before starting the node" % (duration))
+        time.sleep(duration)
        self.node_start_scenario(instance_kill_count, node, timeout)
        logging.info("node_stop_start_scenario has been successfully injected!")

--- a/kraken/node_actions/run.py
+++ b/kraken/node_actions/run.py
@@ -100,6 +100,8 @@ def inject_node_scenario(action, node_scenario, node_scenario_object, kubecli: K
    )
    node_name = get_yaml_item_value(node_scenario, "node_name", "")
    label_selector = get_yaml_item_value(node_scenario, "label_selector", "")
+    if action == "node_stop_start_scenario":
+        duration = get_yaml_item_value(node_scenario, "duration", 120)
    timeout = get_yaml_item_value(node_scenario, "timeout", 120)
    service = get_yaml_item_value(node_scenario, "service", "")
    ssh_private_key = get_yaml_item_value(
@@ -121,7 +123,7 @@ def inject_node_scenario(action, node_scenario, node_scenario_object, kubecli: K
                elif action == "node_stop_scenario":
                    node_scenario_object.node_stop_scenario(run_kill_count, single_node, timeout)
                elif action == "node_stop_start_scenario":
-                    node_scenario_object.node_stop_start_scenario(run_kill_count, single_node, timeout)
+                    node_scenario_object.node_stop_start_scenario(run_kill_count, single_node, timeout, duration)
                elif action == "node_termination_scenario":
                    node_scenario_object.node_termination_scenario(run_kill_count, single_node, timeout)
                elif action == "node_reboot_scenario":
--- a/scenarios/openshift/aws_node_scenarios.yml
+++ b/scenarios/openshift/aws_node_scenarios.yml
@@ -1,14 +1,13 @@
 node_scenarios:
  - actions:                                                        # node chaos scenarios to be injected
    - node_stop_start_scenario
-    - stop_start_kubelet_scenario
-    - node_crash_scenario
    node_name:                                                      # node on which scenario has to be injected; can set multiple names separated by comma
    label_selector: node-role.kubernetes.io/worker                  # when node_name is not specified, a node with matching label_selector is selected for node chaos scenario injection
    instance_count: 1                                               # Number of nodes to perform action/select that match the label selector
    runs: 1                                                         # number of times to inject each scenario under actions (will perform on same node each time)
-    timeout: 120                                                    # duration to wait for completion of node scenario injection
-    cloud_type: aws                                                 # cloud type on which Kubernetes/OpenShift runs
+    timeout: 360                                                    # duration to wait for completion of node scenario injection
+    duration: 120                                                   # duration to stop the node before running the start action
+    cloud_type: aws                                                 # cloud type on which Kubernetes/OpenShift runs  
  - actions:
    - node_reboot_scenario
    node_name:
--- a/scenarios/openshift/azure_node_scenarios.yml
+++ b/scenarios/openshift/azure_node_scenarios.yml
@@ -6,3 +6,11 @@ node_scenarios:
    instance_count: 1
    timeout: 120
    cloud_type: azure
+  - actions:
+    - node_stop_start_scenario
+    node_name:
+    label_selector: node-role.kubernetes.io/infra
+    instance_count: 1
+    timeout: 360
+    duration: 120
+    cloud_type: azure
--- a/scenarios/openshift/baremetal_node_scenarios.yml
+++ b/scenarios/openshift/baremetal_node_scenarios.yml
@@ -5,8 +5,9 @@ node_scenarios:
    label_selector: node-role.kubernetes.io/worker                  # When node_name is not specified, a node with matching label_selector is selected for node chaos scenario injection.
    instance_count: 1                                               # Number of nodes to perform action/select that match the label selector.
    runs: 1                                                         # Number of times to inject each scenario under actions (will perform on same node each time).
-    timeout: 120                                                    # Duration to wait for completion of node scenario injection.
-    cloud_type: bm                                                 # Cloud type on which Kubernetes/OpenShift runs.
+    timeout: 360                                                    # Duration to wait for completion of node scenario injection.
+    duration: 120                                                   # Duration to stop the node before running the start action
+    cloud_type: bm                                                  # Cloud type on which Kubernetes/OpenShift runs.
    bmc_user: defaultuser                                           # For baremetal (bm) cloud type. The default IPMI username. Optional if specified for all machines.
    bmc_password: defaultpass                                       # For baremetal (bm) cloud type. The default IPMI password. Optional if specified for all machines.
    bmc_info:                                                       # This section is here to specify baremetal per-machine info, so it is optional if there is no per-machine info.
--- a/scenarios/openshift/gcp_node_scenarios.yml
+++ b/scenarios/openshift/gcp_node_scenarios.yml
@@ -6,3 +6,11 @@ node_scenarios:
    instance_count: 1
    timeout: 120
    cloud_type: gcp
+  - actions:
+    - node_stop_start_scenario
+    node_name:
+    label_selector: node-role.kubernetes.io/worker
+    instance_count: 1
+    timeout: 360
+    duration: 120
+    cloud_type: gcp
--- a/scenarios/openshift/ibmcloud_node_scenarios.yml
+++ b/scenarios/openshift/ibmcloud_node_scenarios.yml
@@ -5,5 +5,6 @@
    label_selector: "node-role.kubernetes.io/worker"    # When node_name is not specified, a node with matching label_selector is selected for node chaos scenario injection 
    runs: 1                             # Number of times to inject each scenario under actions (will perform on same node each time)                                                           
    instance_count: 1                   # Number of nodes to perform action/select that match the label selector                                             
-    timeout: 30                         # Duration to wait for completion of node scenario injection
-    skip_openshift_checks: False        # Set to True if you don't want to wait for the status of the nodes to change on OpenShift before passing the scenario 
+    timeout: 360                         # Duration to wait for completion of node scenario injection
+    duration: 120                       # Duration to stop the node before running the start action 
+    skip_openshift_checks: False        # Set to True if you don't want to wait for the status of the nodes to change on OpenShift before passing the scenario