From d3e01db574df8930ff330f5bc6c071ff62894f40 Mon Sep 17 00:00:00 2001 From: prubenda Date: Wed, 11 Nov 2020 10:44:27 -0500 Subject: [PATCH 1/2] adding start to fix for all other cloud types --- docs/node_scenarios.md | 10 +++- kraken/node_actions/gcp_node_scenarios.py | 39 +++++++++------- .../general_cloud_node_scenarios.py | 32 +++++++++++++ run_kraken.py | 46 ++++++++++++++++--- 4 files changed, 103 insertions(+), 24 deletions(-) create mode 100644 kraken/node_actions/general_cloud_node_scenarios.py diff --git a/docs/node_scenarios.md b/docs/node_scenarios.md index b94416dc..8b1e01c0 100644 --- a/docs/node_scenarios.md +++ b/docs/node_scenarios.md @@ -28,7 +28,9 @@ A google service account is required to give proper authentication to GCP for no After creating the service account you'll need to enable the account using the following: ```export GOOGLE_APPLICATION_CREDENTIALS=""``` -**NOTE**: The `stop_start_kubelet_scenario` and `node_crash_scenario` scenarios are supported as they are independent of the cloud platform. +**NOTE**: The `node_crash_scenario` and `stop_kubelet_scenario` scenarios are supported independently of the cloud platform. + +Use 'generic' or do not add the 'cloud_type' key to your scenario if your cluster is not set up using one of the currently supported cloud types. Node scenarios can be injected by placing the node scenarios config files under node_scenarios option in the kraken config. Refer to [node_scenarios_example](https://github.com/openshift-scale/kraken/blob/master/scenarios/node_scenarios_example.yml) config file. 
@@ -51,4 +53,10 @@ node_scenarios: instance_kill_count: 1 timeout: 120 cloud_type: aws + - actions: + - node_crash_scenario + node_name: + label_selector: node-role.kubernetes.io/infra + instance_kill_count: 1 + timeout: 120 ``` diff --git a/kraken/node_actions/gcp_node_scenarios.py b/kraken/node_actions/gcp_node_scenarios.py index 5b9c7951..9bc2b432 100644 --- a/kraken/node_actions/gcp_node_scenarios.py +++ b/kraken/node_actions/gcp_node_scenarios.py @@ -15,7 +15,8 @@ class GCP: self.project = runcommand.invoke('gcloud config get-value project').split('/n')[0].strip() logging.info("project " + str(self.project) + "!") credentials = GoogleCredentials.get_application_default() - self.client = discovery.build('compute', 'v1', credentials=credentials, cache_discovery=False) + self.client = discovery.build('compute', 'v1', credentials=credentials, + cache_discovery=False) # Get the instance ID of the node def get_instance_id(self, node): @@ -23,50 +24,56 @@ class GCP: while zone_request is not None: zone_response = zone_request.execute() for zone in zone_response['items']: - instances_request = self.client.instances().list(project=self.project, zone=zone['name']) + instances_request = self.client.instances().list(project=self.project, + zone=zone['name']) while instances_request is not None: instance_response = instances_request.execute() if "items" in instance_response.keys(): for instance in instance_response['items']: if instance['name'] in node: return instance['name'], zone['name'] - instances_request = self.client.zones().list_next(previous_request=instances_request, - previous_response=instance_response) - zone_request = self.client.zones().list_next(previous_request=zone_request, previous_response=zone_response) + instances_request = self.client.zones().list_next( + previous_request=instances_request, + previous_response=instance_response) + zone_request = self.client.zones().list_next(previous_request=zone_request, + previous_response=zone_response) 
logging.info('no instances ') # Start the node instance def start_instances(self, zone, instance_id): - self.client.instances().start(project=self.project, zone=zone, instance=instance_id).execute() + self.client.instances().start(project=self.project, zone=zone, instance=instance_id)\ + .execute() # Stop the node instance def stop_instances(self, zone, instance_id): - self.client.instances().stop(project=self.project, zone=zone, instance=instance_id).execute() + self.client.instances().stop(project=self.project, zone=zone, instance=instance_id)\ + .execute() # Start the node instance def suspend_instances(self, zone, instance_id): - self.client.instances().suspend(project=self.project, zone=zone, instance=instance_id).execute() - - # Stop the node instance - def stop_instances(self, zone, instance_id): - self.client.instances().stop(project=self.project, zone=zone, instance=instance_id).execute() + self.client.instances().suspend(project=self.project, zone=zone, instance=instance_id)\ + .execute() # Terminate the node instance def terminate_instances(self, zone, instance_id): - self.client.instances().delete(project=self.project, zone=zone, instance=instance_id).execute() + self.client.instances().delete(project=self.project, zone=zone, instance=instance_id)\ + .execute() # Reboot the node instance def reboot_instances(self, zone, instance_id): - response = self.client.instances().reset(project=self.project, zone=zone, instance=instance_id).execute() + response = self.client.instances().reset(project=self.project, zone=zone, + instance=instance_id).execute() logging.info('response reboot ' + str(response)) # Get instance status def get_instance_status(self, zone, instance_id, expected_status, timeout): - # statuses: PROVISIONING, STAGING, RUNNING, STOPPING, SUSPENDING, SUSPENDED, REPAIRING, and TERMINATED. + # statuses: PROVISIONING, STAGING, RUNNING, STOPPING, SUSPENDING, SUSPENDED, REPAIRING, + # and TERMINATED. 
i = 0 sleeper = 5 while i <= timeout: - instStatus = self.client.instances().get(project=self.project, zone=zone, instance=instance_id).execute() + instStatus = self.client.instances().get(project=self.project, zone=zone, + instance=instance_id).execute() logging.info("Status of vm " + str(instStatus['status'])) if instStatus['status'] == expected_status: return True diff --git a/kraken/node_actions/general_cloud_node_scenarios.py b/kraken/node_actions/general_cloud_node_scenarios.py new file mode 100644 index 00000000..cb4399f3 --- /dev/null +++ b/kraken/node_actions/general_cloud_node_scenarios.py @@ -0,0 +1,32 @@ +import logging +from kraken.node_actions.abstract_node_scenarios import abstract_node_scenarios + + +class GENERAL: + def __init__(self): + pass + + +class general_node_scenarios(abstract_node_scenarios): + def __init__(self): + self.general = GENERAL() + + # Node scenario to start the node + def node_start_scenario(self, instance_kill_count, node, timeout): + logging.info("Node start is not set up yet for this cloud type, " + "no action is going to be taken") + + # Node scenario to stop the node + def node_stop_scenario(self, instance_kill_count, node, timeout): + logging.info("Node stop is not set up yet for this cloud type," + " no action is going to be taken") + + # Node scenario to terminate the node + def node_termination_scenario(self, instance_kill_count, node, timeout): + logging.info("Node termination is not set up yet for this cloud type, " + "no action is going to be taken") + + # Node scenario to reboot the node + def node_reboot_scenario(self, instance_kill_count, node, timeout): + logging.info("Node reboot is not set up yet for this cloud type," + " no action is going to be taken") diff --git a/run_kraken.py b/run_kraken.py index f7862741..77585891 100644 --- a/run_kraken.py +++ b/run_kraken.py @@ -12,16 +12,29 @@ import kraken.kubernetes.client as kubecli import kraken.invoke.command as runcommand import 
kraken.node_actions.common_node_functions as nodeaction from kraken.node_actions.aws_node_scenarios import aws_node_scenarios +from kraken.node_actions.general_cloud_node_scenarios import general_node_scenarios from kraken.node_actions.gcp_node_scenarios import gcp_node_scenarios import kraken.time_actions.common_time_functions as time_actions +node_general = False + + # Get the node scenarios object of specfied cloud type def get_node_scenario_object(node_scenario): + if "cloud_type" not in node_scenario.keys() or node_scenario['cloud_type'] == "generic": + global node_general + node_general = True + return general_node_scenarios() if node_scenario['cloud_type'] == 'aws': return aws_node_scenarios() elif node_scenario['cloud_type'] == 'gcp': return gcp_node_scenarios() + else: + logging.error("Cloud type " + node_scenario['cloud_type'] + " is not currently supported; " + "try using 'generic' if wanting to stop/start kubelet or fork bomb on any " + "cluster") + sys.exit(1) # Inject the specified node scenario @@ -34,19 +47,38 @@ def inject_node_scenario(action, node_scenario, node_scenario_object): # Get the node to apply the scenario node = nodeaction.get_node(node_name, label_selector) if action == "node_start_scenario": - node_scenario_object.node_start_scenario(instance_kill_count, node, timeout) + if node_general: + logging.info("Node start is not set up for generic cloud type, skipping action") + else: + node_scenario_object.node_start_scenario(instance_kill_count, node, timeout) elif action == "node_stop_scenario": - node_scenario_object.node_stop_scenario(instance_kill_count, node, timeout) + if node_general: + logging.info("Node stop is not set up for generic cloud type, skipping action") + else: + node_scenario_object.node_stop_scenario(instance_kill_count, node, timeout) elif action == "node_stop_start_scenario": - node_scenario_object.node_stop_start_scenario(instance_kill_count, node, timeout) + if node_general: + logging.info("Node stop/start is not 
set up for generic cloud type, skipping action") + else: + node_scenario_object.node_stop_start_scenario(instance_kill_count, node, timeout) elif action == "node_termination_scenario": - node_scenario_object.node_termination_scenario(instance_kill_count, node, timeout) + if node_general: + logging.info("Node termination is not set up for generic cloud type, skipping action") + else: + node_scenario_object.node_termination_scenario(instance_kill_count, node, timeout) elif action == "node_reboot_scenario": - node_scenario_object.node_reboot_scenario(instance_kill_count, node, timeout) + if node_general: + logging.info("Node reboot is not set up for generic cloud type, skipping action") + else: + node_scenario_object.node_reboot_scenario(instance_kill_count, node, timeout) + elif action == "stop_start_kubelet_scenario": + if node_general: + logging.info("Node stop/start kubelet is not set up for generic cloud type, " + "skipping action") + else: + node_scenario_object.stop_start_kubelet_scenario(instance_kill_count, node, timeout) elif action == "stop_kubelet_scenario": node_scenario_object.stop_kubelet_scenario(instance_kill_count, node, timeout) - elif action == "stop_start_kubelet_scenario": - node_scenario_object.stop_start_kubelet_scenario(instance_kill_count, node, timeout) elif action == "node_crash_scenario": node_scenario_object.node_crash_scenario(instance_kill_count, node, timeout) From 4d4ffdccbcbecc0c5de6a1ec8fce0d934bcca7e1 Mon Sep 17 00:00:00 2001 From: Mike Fiedler Date: Tue, 24 Nov 2020 20:43:11 -0500 Subject: [PATCH 2/2] Reorganize scenario selection/injection --- run_kraken.py | 44 +++++++++++++++----------------------------- 1 file changed, 15 insertions(+), 29 deletions(-) diff --git a/run_kraken.py b/run_kraken.py index 77585891..e5422e78 100644 --- a/run_kraken.py +++ b/run_kraken.py @@ -39,6 +39,7 @@ def get_node_scenario_object(node_scenario): # Inject the specified node scenario def inject_node_scenario(action, node_scenario, 
node_scenario_object): + generic_cloud_scenarios = ("stop_kubelet_scenario", "node_crash_scenario") # Get the node scenario configurations instance_kill_count = node_scenario.get("instance_kill_count", 1) node_name = node_scenario.get("node_name", "") @@ -46,41 +47,26 @@ def inject_node_scenario(action, node_scenario, node_scenario_object): timeout = node_scenario.get("timeout", 120) # Get the node to apply the scenario node = nodeaction.get_node(node_name, label_selector) - if action == "node_start_scenario": - if node_general: - logging.info("Node start is not set up for generic cloud type, skipping action") - else: + + if node_general and action not in generic_cloud_scenarios: + logging.info("Scenario: " + action + " is not set up for generic cloud type, skipping action") + else: + if action == "node_start_scenario": node_scenario_object.node_start_scenario(instance_kill_count, node, timeout) - elif action == "node_stop_scenario": - if node_general: - logging.info("Node stop is not set up for generic cloud type, skipping action") - else: + elif action == "node_stop_scenario": node_scenario_object.node_stop_scenario(instance_kill_count, node, timeout) - elif action == "node_stop_start_scenario": - if node_general: - logging.info("Node stop/start is not set up for generic cloud type, skipping action") - else: + elif action == "node_stop_start_scenario": node_scenario_object.node_stop_start_scenario(instance_kill_count, node, timeout) - elif action == "node_termination_scenario": - if node_general: - logging.info("Node termination is not set up for generic cloud type, skipping action") - else: + elif action == "node_termination_scenario": node_scenario_object.node_termination_scenario(instance_kill_count, node, timeout) - elif action == "node_reboot_scenario": - if node_general: - logging.info("Node reboot is not set up for generic cloud type, skipping action") - else: + elif action == "node_reboot_scenario": 
node_scenario_object.node_reboot_scenario(instance_kill_count, node, timeout) - elif action == "stop_start_kubelet_scenario": - if node_general: - logging.info("Node stop/start kubelet is not set up for generic cloud type, " - "skipping action") - else: + elif action == "stop_start_kubelet_scenario": node_scenario_object.stop_start_kubelet_scenario(instance_kill_count, node, timeout) - elif action == "stop_kubelet_scenario": - node_scenario_object.stop_kubelet_scenario(instance_kill_count, node, timeout) - elif action == "node_crash_scenario": - node_scenario_object.node_crash_scenario(instance_kill_count, node, timeout) + elif action == "stop_kubelet_scenario": + node_scenario_object.stop_kubelet_scenario(instance_kill_count, node, timeout) + elif action == "node_crash_scenario": + node_scenario_object.node_crash_scenario(instance_kill_count, node, timeout) # Get cerberus status