diff --git a/krkn/scenario_plugins/node_actions/abstract_node_scenarios.py b/krkn/scenario_plugins/node_actions/abstract_node_scenarios.py index 8fed4c78..677892d3 100644 --- a/krkn/scenario_plugins/node_actions/abstract_node_scenarios.py +++ b/krkn/scenario_plugins/node_actions/abstract_node_scenarios.py @@ -18,20 +18,20 @@ class abstract_node_scenarios: self.node_action_kube_check = node_action_kube_check # Node scenario to start the node - def node_start_scenario(self, instance_kill_count, node, timeout): + def node_start_scenario(self, instance_kill_count, node, timeout, poll_interval): pass # Node scenario to stop the node - def node_stop_scenario(self, instance_kill_count, node, timeout): + def node_stop_scenario(self, instance_kill_count, node, timeout, poll_interval): pass # Node scenario to stop and then start the node - def node_stop_start_scenario(self, instance_kill_count, node, timeout, duration): + def node_stop_start_scenario(self, instance_kill_count, node, timeout, duration, poll_interval): logging.info("Starting node_stop_start_scenario injection") - self.node_stop_scenario(instance_kill_count, node, timeout) + self.node_stop_scenario(instance_kill_count, node, timeout, poll_interval) logging.info("Waiting for %s seconds before starting the node" % (duration)) time.sleep(duration) - self.node_start_scenario(instance_kill_count, node, timeout) + self.node_start_scenario(instance_kill_count, node, timeout, poll_interval) self.affected_nodes_status.merge_affected_nodes() logging.info("node_stop_start_scenario has been successfully injected!") @@ -56,7 +56,7 @@ class abstract_node_scenarios: logging.error("node_disk_detach_attach_scenario failed!") # Node scenario to terminate the node - def node_termination_scenario(self, instance_kill_count, node, timeout): + def node_termination_scenario(self, instance_kill_count, node, timeout, poll_interval): pass # Node scenario to reboot the node diff --git 
a/krkn/scenario_plugins/node_actions/alibaba_node_scenarios.py b/krkn/scenario_plugins/node_actions/alibaba_node_scenarios.py index 610b1ff1..39889844 100644 --- a/krkn/scenario_plugins/node_actions/alibaba_node_scenarios.py +++ b/krkn/scenario_plugins/node_actions/alibaba_node_scenarios.py @@ -234,7 +234,7 @@ class alibaba_node_scenarios(abstract_node_scenarios): # Node scenario to start the node - def node_start_scenario(self, instance_kill_count, node, timeout): + def node_start_scenario(self, instance_kill_count, node, timeout, poll_interval): for _ in range(instance_kill_count): affected_node = AffectedNode(node) try: @@ -260,7 +260,7 @@ class alibaba_node_scenarios(abstract_node_scenarios): self.affected_nodes_status.affected_nodes.append(affected_node) # Node scenario to stop the node - def node_stop_scenario(self, instance_kill_count, node, timeout): + def node_stop_scenario(self, instance_kill_count, node, timeout, poll_interval): for _ in range(instance_kill_count): affected_node = AffectedNode(node) try: @@ -286,7 +286,7 @@ class alibaba_node_scenarios(abstract_node_scenarios): # Might need to stop and then release the instance # Node scenario to terminate the node - def node_termination_scenario(self, instance_kill_count, node, timeout): + def node_termination_scenario(self, instance_kill_count, node, timeout, poll_interval): for _ in range(instance_kill_count): affected_node = AffectedNode(node) try: diff --git a/krkn/scenario_plugins/node_actions/aws_node_scenarios.py b/krkn/scenario_plugins/node_actions/aws_node_scenarios.py index 50dc8488..36704cdd 100644 --- a/krkn/scenario_plugins/node_actions/aws_node_scenarios.py +++ b/krkn/scenario_plugins/node_actions/aws_node_scenarios.py @@ -77,10 +77,21 @@ class AWS: # until a successful state is reached. 
An error is returned after 40 failed checks # Setting timeout for consistency with other cloud functions # Wait until the node instance is running - def wait_until_running(self, instance_id, timeout=600, affected_node=None): + def wait_until_running(self, instance_id, timeout=600, affected_node=None, poll_interval=15): try: start_time = time.time() - self.boto_instance.wait_until_running(InstanceIds=[instance_id]) + if timeout > 0: + max_attempts = max(1, int(timeout / poll_interval)) + else: + max_attempts = 40 + + self.boto_instance.wait_until_running( + InstanceIds=[instance_id], + WaiterConfig={ + 'Delay': poll_interval, + 'MaxAttempts': max_attempts + } + ) end_time = time.time() if affected_node: affected_node.set_affected_node_status("running", end_time - start_time) @@ -93,10 +104,21 @@ class AWS: return False # Wait until the node instance is stopped - def wait_until_stopped(self, instance_id, timeout=600, affected_node= None): + def wait_until_stopped(self, instance_id, timeout=600, affected_node= None, poll_interval=15): try: start_time = time.time() - self.boto_instance.wait_until_stopped(InstanceIds=[instance_id]) + if timeout > 0: + max_attempts = max(1, int(timeout / poll_interval)) + else: + max_attempts = 40 + + self.boto_instance.wait_until_stopped( + InstanceIds=[instance_id], + WaiterConfig={ + 'Delay': poll_interval, + 'MaxAttempts': max_attempts + } + ) end_time = time.time() if affected_node: affected_node.set_affected_node_status("stopped", end_time - start_time) @@ -109,10 +131,21 @@ class AWS: return False # Wait until the node instance is terminated - def wait_until_terminated(self, instance_id, timeout=600, affected_node= None): + def wait_until_terminated(self, instance_id, timeout=600, affected_node= None, poll_interval=15): try: start_time = time.time() - self.boto_instance.wait_until_terminated(InstanceIds=[instance_id]) + if timeout > 0: + max_attempts = max(1, int(timeout / poll_interval)) + else: + max_attempts = 40 + + 
self.boto_instance.wait_until_terminated( + InstanceIds=[instance_id], + WaiterConfig={ + 'Delay': poll_interval, + 'MaxAttempts': max_attempts + } + ) end_time = time.time() if affected_node: affected_node.set_affected_node_status("terminated", end_time - start_time) @@ -267,7 +300,7 @@ class aws_node_scenarios(abstract_node_scenarios): self.node_action_kube_check = node_action_kube_check # Node scenario to start the node - def node_start_scenario(self, instance_kill_count, node, timeout): + def node_start_scenario(self, instance_kill_count, node, timeout, poll_interval): for _ in range(instance_kill_count): affected_node = AffectedNode(node) try: @@ -278,7 +311,7 @@ class aws_node_scenarios(abstract_node_scenarios): "Starting the node %s with instance ID: %s " % (node, instance_id) ) self.aws.start_instances(instance_id) - self.aws.wait_until_running(instance_id, affected_node=affected_node) + self.aws.wait_until_running(instance_id, timeout=timeout, affected_node=affected_node, poll_interval=poll_interval) if self.node_action_kube_check: nodeaction.wait_for_ready_status(node, timeout, self.kubecli, affected_node) logging.info( @@ -296,7 +329,7 @@ class aws_node_scenarios(abstract_node_scenarios): self.affected_nodes_status.affected_nodes.append(affected_node) # Node scenario to stop the node - def node_stop_scenario(self, instance_kill_count, node, timeout): + def node_stop_scenario(self, instance_kill_count, node, timeout, poll_interval): for _ in range(instance_kill_count): affected_node = AffectedNode(node) try: @@ -307,7 +340,7 @@ class aws_node_scenarios(abstract_node_scenarios): "Stopping the node %s with instance ID: %s " % (node, instance_id) ) self.aws.stop_instances(instance_id) - self.aws.wait_until_stopped(instance_id, affected_node=affected_node) + self.aws.wait_until_stopped(instance_id, timeout=timeout, affected_node=affected_node, poll_interval=poll_interval) logging.info( "Node with instance ID: %s is in stopped state" % (instance_id) ) @@ 
-324,7 +357,7 @@ class aws_node_scenarios(abstract_node_scenarios): self.affected_nodes_status.affected_nodes.append(affected_node) # Node scenario to terminate the node - def node_termination_scenario(self, instance_kill_count, node, timeout): + def node_termination_scenario(self, instance_kill_count, node, timeout, poll_interval): for _ in range(instance_kill_count): affected_node = AffectedNode(node) try: @@ -336,7 +369,7 @@ class aws_node_scenarios(abstract_node_scenarios): % (node, instance_id) ) self.aws.terminate_instances(instance_id) - self.aws.wait_until_terminated(instance_id, affected_node=affected_node) + self.aws.wait_until_terminated(instance_id, timeout=timeout, affected_node=affected_node, poll_interval=poll_interval) for _ in range(timeout): if node not in self.kubecli.list_nodes(): break diff --git a/krkn/scenario_plugins/node_actions/az_node_scenarios.py b/krkn/scenario_plugins/node_actions/az_node_scenarios.py index e9f28946..2ef26ef1 100644 --- a/krkn/scenario_plugins/node_actions/az_node_scenarios.py +++ b/krkn/scenario_plugins/node_actions/az_node_scenarios.py @@ -218,7 +218,7 @@ class azure_node_scenarios(abstract_node_scenarios): # Node scenario to start the node - def node_start_scenario(self, instance_kill_count, node, timeout): + def node_start_scenario(self, instance_kill_count, node, timeout, poll_interval): for _ in range(instance_kill_count): affected_node = AffectedNode(node) try: @@ -246,7 +246,7 @@ class azure_node_scenarios(abstract_node_scenarios): self.affected_nodes_status.affected_nodes.append(affected_node) # Node scenario to stop the node - def node_stop_scenario(self, instance_kill_count, node, timeout): + def node_stop_scenario(self, instance_kill_count, node, timeout, poll_interval): for _ in range(instance_kill_count): affected_node = AffectedNode(node) try: @@ -273,7 +273,7 @@ class azure_node_scenarios(abstract_node_scenarios): self.affected_nodes_status.affected_nodes.append(affected_node) # Node scenario to 
terminate the node - def node_termination_scenario(self, instance_kill_count, node, timeout): + def node_termination_scenario(self, instance_kill_count, node, timeout, poll_interval): for _ in range(instance_kill_count): affected_node = AffectedNode(node) try: diff --git a/krkn/scenario_plugins/node_actions/bm_node_scenarios.py b/krkn/scenario_plugins/node_actions/bm_node_scenarios.py index f7ca37f8..8d80b075 100644 --- a/krkn/scenario_plugins/node_actions/bm_node_scenarios.py +++ b/krkn/scenario_plugins/node_actions/bm_node_scenarios.py @@ -153,7 +153,7 @@ class bm_node_scenarios(abstract_node_scenarios): self.node_action_kube_check = node_action_kube_check # Node scenario to start the node - def node_start_scenario(self, instance_kill_count, node, timeout): + def node_start_scenario(self, instance_kill_count, node, timeout, poll_interval): for _ in range(instance_kill_count): affected_node = AffectedNode(node) try: @@ -182,7 +182,7 @@ class bm_node_scenarios(abstract_node_scenarios): self.affected_nodes_status.affected_nodes.append(affected_node) # Node scenario to stop the node - def node_stop_scenario(self, instance_kill_count, node, timeout): + def node_stop_scenario(self, instance_kill_count, node, timeout, poll_interval): for _ in range(instance_kill_count): affected_node = AffectedNode(node) try: @@ -210,7 +210,7 @@ class bm_node_scenarios(abstract_node_scenarios): self.affected_nodes_status.affected_nodes.append(affected_node) # Node scenario to terminate the node - def node_termination_scenario(self, instance_kill_count, node, timeout): + def node_termination_scenario(self, instance_kill_count, node, timeout, poll_interval): logging.info("Node termination scenario is not supported on baremetal") # Node scenario to reboot the node diff --git a/krkn/scenario_plugins/node_actions/docker_node_scenarios.py b/krkn/scenario_plugins/node_actions/docker_node_scenarios.py index 10724f10..04ad20fc 100644 --- 
a/krkn/scenario_plugins/node_actions/docker_node_scenarios.py +++ b/krkn/scenario_plugins/node_actions/docker_node_scenarios.py @@ -44,7 +44,7 @@ class docker_node_scenarios(abstract_node_scenarios): self.node_action_kube_check = node_action_kube_check # Node scenario to start the node - def node_start_scenario(self, instance_kill_count, node, timeout): + def node_start_scenario(self, instance_kill_count, node, timeout, poll_interval): for _ in range(instance_kill_count): affected_node = AffectedNode(node) try: @@ -71,7 +71,7 @@ class docker_node_scenarios(abstract_node_scenarios): self.affected_nodes_status.affected_nodes.append(affected_node) # Node scenario to stop the node - def node_stop_scenario(self, instance_kill_count, node, timeout): + def node_stop_scenario(self, instance_kill_count, node, timeout, poll_interval): for _ in range(instance_kill_count): affected_node = AffectedNode(node) try: @@ -97,7 +97,7 @@ class docker_node_scenarios(abstract_node_scenarios): self.affected_nodes_status.affected_nodes.append(affected_node) # Node scenario to terminate the node - def node_termination_scenario(self, instance_kill_count, node, timeout): + def node_termination_scenario(self, instance_kill_count, node, timeout, poll_interval): for _ in range(instance_kill_count): try: logging.info("Starting node_termination_scenario injection") diff --git a/krkn/scenario_plugins/node_actions/gcp_node_scenarios.py b/krkn/scenario_plugins/node_actions/gcp_node_scenarios.py index 7d07bf80..78eec21d 100644 --- a/krkn/scenario_plugins/node_actions/gcp_node_scenarios.py +++ b/krkn/scenario_plugins/node_actions/gcp_node_scenarios.py @@ -227,7 +227,7 @@ class gcp_node_scenarios(abstract_node_scenarios): self.node_action_kube_check = node_action_kube_check # Node scenario to start the node - def node_start_scenario(self, instance_kill_count, node, timeout): + def node_start_scenario(self, instance_kill_count, node, timeout, poll_interval): for _ in range(instance_kill_count): 
affected_node = AffectedNode(node) try: @@ -257,7 +257,7 @@ class gcp_node_scenarios(abstract_node_scenarios): self.affected_nodes_status.affected_nodes.append(affected_node) # Node scenario to stop the node - def node_stop_scenario(self, instance_kill_count, node, timeout): + def node_stop_scenario(self, instance_kill_count, node, timeout, poll_interval): for _ in range(instance_kill_count): affected_node = AffectedNode(node) try: @@ -286,7 +286,7 @@ class gcp_node_scenarios(abstract_node_scenarios): self.affected_nodes_status.affected_nodes.append(affected_node) # Node scenario to terminate the node - def node_termination_scenario(self, instance_kill_count, node, timeout): + def node_termination_scenario(self, instance_kill_count, node, timeout, poll_interval): for _ in range(instance_kill_count): affected_node = AffectedNode(node) try: diff --git a/krkn/scenario_plugins/node_actions/general_cloud_node_scenarios.py b/krkn/scenario_plugins/node_actions/general_cloud_node_scenarios.py index 6bfa1af6..830b7d35 100644 --- a/krkn/scenario_plugins/node_actions/general_cloud_node_scenarios.py +++ b/krkn/scenario_plugins/node_actions/general_cloud_node_scenarios.py @@ -18,21 +18,21 @@ class general_node_scenarios(abstract_node_scenarios): self.node_action_kube_check = node_action_kube_check # Node scenario to start the node - def node_start_scenario(self, instance_kill_count, node, timeout): + def node_start_scenario(self, instance_kill_count, node, timeout, poll_interval): logging.info( "Node start is not set up yet for this cloud type, " "no action is going to be taken" ) # Node scenario to stop the node - def node_stop_scenario(self, instance_kill_count, node, timeout): + def node_stop_scenario(self, instance_kill_count, node, timeout, poll_interval): logging.info( "Node stop is not set up yet for this cloud type," " no action is going to be taken" ) # Node scenario to terminate the node - def node_termination_scenario(self, instance_kill_count, node, timeout): + def 
node_termination_scenario(self, instance_kill_count, node, timeout, poll_interval): logging.info( "Node termination is not set up yet for this cloud type, " "no action is going to be taken" diff --git a/krkn/scenario_plugins/node_actions/ibmcloud_node_scenarios.py b/krkn/scenario_plugins/node_actions/ibmcloud_node_scenarios.py index 5c33b12a..30bca718 100644 --- a/krkn/scenario_plugins/node_actions/ibmcloud_node_scenarios.py +++ b/krkn/scenario_plugins/node_actions/ibmcloud_node_scenarios.py @@ -284,7 +284,7 @@ class ibm_node_scenarios(abstract_node_scenarios): self.node_action_kube_check = node_action_kube_check - def node_start_scenario(self, instance_kill_count, node, timeout): + def node_start_scenario(self, instance_kill_count, node, timeout, poll_interval): try: instance_id = self.ibmcloud.get_instance_id( node) affected_node = AffectedNode(node, node_id=instance_id) @@ -317,7 +317,7 @@ class ibm_node_scenarios(abstract_node_scenarios): self.affected_nodes_status.affected_nodes.append(affected_node) - def node_stop_scenario(self, instance_kill_count, node, timeout): + def node_stop_scenario(self, instance_kill_count, node, timeout, poll_interval): try: instance_id = self.ibmcloud.get_instance_id(node) for _ in range(instance_kill_count): @@ -366,7 +366,7 @@ class ibm_node_scenarios(abstract_node_scenarios): logging.error("node_reboot_scenario injection failed!") - def node_terminate_scenario(self, instance_kill_count, node, timeout): + def node_terminate_scenario(self, instance_kill_count, node, timeout, poll_interval): try: instance_id = self.ibmcloud.get_instance_id(node) for _ in range(instance_kill_count): diff --git a/krkn/scenario_plugins/node_actions/ibmcloud_power_node_scenarios.py b/krkn/scenario_plugins/node_actions/ibmcloud_power_node_scenarios.py index b75bb25b..80c478be 100644 --- a/krkn/scenario_plugins/node_actions/ibmcloud_power_node_scenarios.py +++ b/krkn/scenario_plugins/node_actions/ibmcloud_power_node_scenarios.py @@ -298,7 +298,7 @@ 
class ibmcloud_power_node_scenarios(abstract_node_scenarios): self.node_action_kube_check = node_action_kube_check - def node_start_scenario(self, instance_kill_count, node, timeout): + def node_start_scenario(self, instance_kill_count, node, timeout, poll_interval): try: instance_id = self.ibmcloud_power.get_instance_id( node) affected_node = AffectedNode(node, node_id=instance_id) @@ -331,7 +331,7 @@ class ibmcloud_power_node_scenarios(abstract_node_scenarios): self.affected_nodes_status.affected_nodes.append(affected_node) - def node_stop_scenario(self, instance_kill_count, node, timeout): + def node_stop_scenario(self, instance_kill_count, node, timeout, poll_interval): try: instance_id = self.ibmcloud_power.get_instance_id(node) for _ in range(instance_kill_count): @@ -380,7 +380,7 @@ class ibmcloud_power_node_scenarios(abstract_node_scenarios): logging.error("node_reboot_scenario injection failed!") - def node_terminate_scenario(self, instance_kill_count, node, timeout): + def node_terminate_scenario(self, instance_kill_count, node, timeout, poll_interval): try: instance_id = self.ibmcloud_power.get_instance_id(node) for _ in range(instance_kill_count): diff --git a/krkn/scenario_plugins/node_actions/node_actions_scenario_plugin.py b/krkn/scenario_plugins/node_actions/node_actions_scenario_plugin.py index 857dad08..cc866f5d 100644 --- a/krkn/scenario_plugins/node_actions/node_actions_scenario_plugin.py +++ b/krkn/scenario_plugins/node_actions/node_actions_scenario_plugin.py @@ -236,7 +236,7 @@ class NodeActionsScenarioPlugin(AbstractScenarioPlugin): # Get the scenario specifics for running action nodes run_kill_count = get_yaml_item_value(node_scenario, "runs", 1) duration = get_yaml_item_value(node_scenario, "duration", 120) - + poll_interval = get_yaml_item_value(node_scenario, "poll_interval", 15) timeout = get_yaml_item_value(node_scenario, "timeout", 120) service = get_yaml_item_value(node_scenario, "service", "") soft_reboot = 
get_yaml_item_value(node_scenario, "soft_reboot", False) @@ -254,19 +254,19 @@ class NodeActionsScenarioPlugin(AbstractScenarioPlugin): else: if action == "node_start_scenario": node_scenario_object.node_start_scenario( - run_kill_count, single_node, timeout + run_kill_count, single_node, timeout, poll_interval ) elif action == "node_stop_scenario": node_scenario_object.node_stop_scenario( - run_kill_count, single_node, timeout + run_kill_count, single_node, timeout, poll_interval ) elif action == "node_stop_start_scenario": node_scenario_object.node_stop_start_scenario( - run_kill_count, single_node, timeout, duration + run_kill_count, single_node, timeout, duration, poll_interval ) elif action == "node_termination_scenario": node_scenario_object.node_termination_scenario( - run_kill_count, single_node, timeout + run_kill_count, single_node, timeout, poll_interval ) elif action == "node_reboot_scenario": node_scenario_object.node_reboot_scenario( diff --git a/krkn/scenario_plugins/node_actions/openstack_node_scenarios.py b/krkn/scenario_plugins/node_actions/openstack_node_scenarios.py index 2971450b..e3840e0a 100644 --- a/krkn/scenario_plugins/node_actions/openstack_node_scenarios.py +++ b/krkn/scenario_plugins/node_actions/openstack_node_scenarios.py @@ -122,7 +122,7 @@ class openstack_node_scenarios(abstract_node_scenarios): self.node_action_kube_check = node_action_kube_check # Node scenario to start the node - def node_start_scenario(self, instance_kill_count, node, timeout): + def node_start_scenario(self, instance_kill_count, node, timeout, poll_interval): for _ in range(instance_kill_count): affected_node = AffectedNode(node) try: @@ -147,7 +147,7 @@ class openstack_node_scenarios(abstract_node_scenarios): self.affected_nodes_status.affected_nodes.append(affected_node) # Node scenario to stop the node - def node_stop_scenario(self, instance_kill_count, node, timeout): + def node_stop_scenario(self, instance_kill_count, node, timeout, poll_interval): for _ 
in range(instance_kill_count): affected_node = AffectedNode(node) try: diff --git a/krkn/scenario_plugins/node_actions/vmware_node_scenarios.py b/krkn/scenario_plugins/node_actions/vmware_node_scenarios.py index ee32df38..20ed23c3 100644 --- a/krkn/scenario_plugins/node_actions/vmware_node_scenarios.py +++ b/krkn/scenario_plugins/node_actions/vmware_node_scenarios.py @@ -389,7 +389,7 @@ class vmware_node_scenarios(abstract_node_scenarios): self.vsphere = vSphere() self.node_action_kube_check = node_action_kube_check - def node_start_scenario(self, instance_kill_count, node, timeout): + def node_start_scenario(self, instance_kill_count, node, timeout, poll_interval): try: for _ in range(instance_kill_count): affected_node = AffectedNode(node) @@ -409,7 +409,7 @@ class vmware_node_scenarios(abstract_node_scenarios): f"node_start_scenario injection failed! " f"Error was: {str(e)}" ) - def node_stop_scenario(self, instance_kill_count, node, timeout): + def node_stop_scenario(self, instance_kill_count, node, timeout, poll_interval): try: for _ in range(instance_kill_count): affected_node = AffectedNode(node) @@ -456,7 +456,7 @@ class vmware_node_scenarios(abstract_node_scenarios): ) - def node_terminate_scenario(self, instance_kill_count, node, timeout): + def node_terminate_scenario(self, instance_kill_count, node, timeout, poll_interval): try: for _ in range(instance_kill_count): affected_node = AffectedNode(node) diff --git a/scenarios/openshift/aws_node_scenarios.yml b/scenarios/openshift/aws_node_scenarios.yml index cf8da497..ebc729b2 100644 --- a/scenarios/openshift/aws_node_scenarios.yml +++ b/scenarios/openshift/aws_node_scenarios.yml @@ -10,6 +10,7 @@ node_scenarios: cloud_type: aws # cloud type on which Kubernetes/OpenShift runs parallel: true # Run action on label or node name in parallel or sequential, defaults to sequential kube_check: true # Run the kubernetes api calls to see if the node gets to a certain state during the node scenario + poll_interval: 15 
# Time interval (in seconds) to periodically check the node's status - actions: - node_reboot_scenario node_name: