diff --git a/docs/node_scenarios.md b/docs/node_scenarios.md index b94416dc..8b1e01c0 100644 --- a/docs/node_scenarios.md +++ b/docs/node_scenarios.md @@ -28,7 +28,9 @@ A google service account is required to give proper authentication to GCP for no After creating the service account you'll need to enable the account using the following: ```export GOOGLE_APPLICATION_CREDENTIALS=""``` -**NOTE**: The `stop_start_kubelet_scenario` and `node_crash_scenario` scenarios are supported as they are independent of the cloud platform. +**NOTE**: The `node_crash_scenario` and `stop_kubelet_scenario` scenarios are supported independent of the cloud platform. + +Use 'generic' or do not add the 'cloud_type' key to your scenario if your cluster is not set up using one of the currently supported cloud types. Node scenarios can be injected by placing the node scenarios config files under node_scenarios option in the kraken config. Refer to [node_scenarios_example](https://github.com/openshift-scale/kraken/blob/master/scenarios/node_scenarios_example.yml) config file. 
@@ -51,4 +53,10 @@ node_scenarios: instance_kill_count: 1 timeout: 120 cloud_type: aws + - actions: + - node_crash_scenario + node_name: + label_selector: node-role.kubernetes.io/infra + instance_kill_count: 1 + timeout: 120 ``` diff --git a/kraken/node_actions/gcp_node_scenarios.py b/kraken/node_actions/gcp_node_scenarios.py index 5b9c7951..9bc2b432 100644 --- a/kraken/node_actions/gcp_node_scenarios.py +++ b/kraken/node_actions/gcp_node_scenarios.py @@ -15,7 +15,8 @@ class GCP: self.project = runcommand.invoke('gcloud config get-value project').split('/n')[0].strip() logging.info("project " + str(self.project) + "!") credentials = GoogleCredentials.get_application_default() - self.client = discovery.build('compute', 'v1', credentials=credentials, cache_discovery=False) + self.client = discovery.build('compute', 'v1', credentials=credentials, + cache_discovery=False) # Get the instance ID of the node def get_instance_id(self, node): @@ -23,50 +24,56 @@ class GCP: while zone_request is not None: zone_response = zone_request.execute() for zone in zone_response['items']: - instances_request = self.client.instances().list(project=self.project, zone=zone['name']) + instances_request = self.client.instances().list(project=self.project, + zone=zone['name']) while instances_request is not None: instance_response = instances_request.execute() if "items" in instance_response.keys(): for instance in instance_response['items']: if instance['name'] in node: return instance['name'], zone['name'] - instances_request = self.client.zones().list_next(previous_request=instances_request, - previous_response=instance_response) - zone_request = self.client.zones().list_next(previous_request=zone_request, previous_response=zone_response) + instances_request = self.client.zones().list_next( + previous_request=instances_request, + previous_response=instance_response) + zone_request = self.client.zones().list_next(previous_request=zone_request, + previous_response=zone_response) 
logging.info('no instances ') # Start the node instance def start_instances(self, zone, instance_id): - self.client.instances().start(project=self.project, zone=zone, instance=instance_id).execute() + self.client.instances().start(project=self.project, zone=zone, instance=instance_id)\ + .execute() # Stop the node instance def stop_instances(self, zone, instance_id): - self.client.instances().stop(project=self.project, zone=zone, instance=instance_id).execute() + self.client.instances().stop(project=self.project, zone=zone, instance=instance_id)\ + .execute() # Start the node instance def suspend_instances(self, zone, instance_id): - self.client.instances().suspend(project=self.project, zone=zone, instance=instance_id).execute() - - # Stop the node instance - def stop_instances(self, zone, instance_id): - self.client.instances().stop(project=self.project, zone=zone, instance=instance_id).execute() + self.client.instances().suspend(project=self.project, zone=zone, instance=instance_id)\ + .execute() # Terminate the node instance def terminate_instances(self, zone, instance_id): - self.client.instances().delete(project=self.project, zone=zone, instance=instance_id).execute() + self.client.instances().delete(project=self.project, zone=zone, instance=instance_id)\ + .execute() # Reboot the node instance def reboot_instances(self, zone, instance_id): - response = self.client.instances().reset(project=self.project, zone=zone, instance=instance_id).execute() + response = self.client.instances().reset(project=self.project, zone=zone, + instance=instance_id).execute() logging.info('response reboot ' + str(response)) # Get instance status def get_instance_status(self, zone, instance_id, expected_status, timeout): - # statuses: PROVISIONING, STAGING, RUNNING, STOPPING, SUSPENDING, SUSPENDED, REPAIRING, and TERMINATED. + # statuses: PROVISIONING, STAGING, RUNNING, STOPPING, SUSPENDING, SUSPENDED, REPAIRING, + # and TERMINATED. 
i = 0 sleeper = 5 while i <= timeout: - instStatus = self.client.instances().get(project=self.project, zone=zone, instance=instance_id).execute() + instStatus = self.client.instances().get(project=self.project, zone=zone, + instance=instance_id).execute() logging.info("Status of vm " + str(instStatus['status'])) if instStatus['status'] == expected_status: return True diff --git a/kraken/node_actions/general_cloud_node_scenarios.py b/kraken/node_actions/general_cloud_node_scenarios.py new file mode 100644 index 00000000..cb4399f3 --- /dev/null +++ b/kraken/node_actions/general_cloud_node_scenarios.py @@ -0,0 +1,32 @@ +import logging +from kraken.node_actions.abstract_node_scenarios import abstract_node_scenarios + + +class GENERAL: + def __init__(self): + pass + + +class general_node_scenarios(abstract_node_scenarios): + def __init__(self): + self.general = GENERAL() + + # Node scenario to start the node + def node_start_scenario(self, instance_kill_count, node, timeout): + logging.info("Node start is not set up yet for this cloud type, " + "no action is going to be taken") + + # Node scenario to stop the node + def node_stop_scenario(self, instance_kill_count, node, timeout): + logging.info("Node stop is not set up yet for this cloud type," + " no action is going to be taken") + + # Node scenario to terminate the node + def node_termination_scenario(self, instance_kill_count, node, timeout): + logging.info("Node termination is not set up yet for this cloud type, " + "no action is going to be taken") + + # Node scenario to reboot the node + def node_reboot_scenario(self, instance_kill_count, node, timeout): + logging.info("Node reboot is not set up yet for this cloud type," + " no action is going to be taken") diff --git a/run_kraken.py b/run_kraken.py index f7862741..e5422e78 100644 --- a/run_kraken.py +++ b/run_kraken.py @@ -12,20 +12,34 @@ import kraken.kubernetes.client as kubecli import kraken.invoke.command as runcommand import 
kraken.node_actions.common_node_functions as nodeaction from kraken.node_actions.aws_node_scenarios import aws_node_scenarios +from kraken.node_actions.general_cloud_node_scenarios import general_node_scenarios from kraken.node_actions.gcp_node_scenarios import gcp_node_scenarios import kraken.time_actions.common_time_functions as time_actions +node_general = False + + # Get the node scenarios object of specfied cloud type def get_node_scenario_object(node_scenario): + if "cloud_type" not in node_scenario.keys() or node_scenario['cloud_type'] == "generic": + global node_general + node_general = True + return general_node_scenarios() if node_scenario['cloud_type'] == 'aws': return aws_node_scenarios() elif node_scenario['cloud_type'] == 'gcp': return gcp_node_scenarios() + else: + logging.error("Cloud type " + node_scenario['cloud_type'] + " is not currently supported; " + "try using 'generic' if wanting to stop/start kubelet or fork bomb on any " + "cluster") + sys.exit(1) # Inject the specified node scenario def inject_node_scenario(action, node_scenario, node_scenario_object): + generic_cloud_scenarios = ("stop_kubelet_scenario", "node_crash_scenario") # Get the node scenario configurations instance_kill_count = node_scenario.get("instance_kill_count", 1) node_name = node_scenario.get("node_name", "") @@ -33,22 +47,26 @@ def inject_node_scenario(action, node_scenario, node_scenario_object): timeout = node_scenario.get("timeout", 120) # Get the node to apply the scenario node = nodeaction.get_node(node_name, label_selector) - if action == "node_start_scenario": - node_scenario_object.node_start_scenario(instance_kill_count, node, timeout) - elif action == "node_stop_scenario": - node_scenario_object.node_stop_scenario(instance_kill_count, node, timeout) - elif action == "node_stop_start_scenario": - node_scenario_object.node_stop_start_scenario(instance_kill_count, node, timeout) - elif action == "node_termination_scenario": - 
node_scenario_object.node_termination_scenario(instance_kill_count, node, timeout) - elif action == "node_reboot_scenario": - node_scenario_object.node_reboot_scenario(instance_kill_count, node, timeout) - elif action == "stop_kubelet_scenario": - node_scenario_object.stop_kubelet_scenario(instance_kill_count, node, timeout) - elif action == "stop_start_kubelet_scenario": - node_scenario_object.stop_start_kubelet_scenario(instance_kill_count, node, timeout) - elif action == "node_crash_scenario": - node_scenario_object.node_crash_scenario(instance_kill_count, node, timeout) + + if node_general and action not in generic_cloud_scenarios: + logging.info("Scenario: " + action + " is not set up for generic cloud type, skipping action") + else: + if action == "node_start_scenario": + node_scenario_object.node_start_scenario(instance_kill_count, node, timeout) + elif action == "node_stop_scenario": + node_scenario_object.node_stop_scenario(instance_kill_count, node, timeout) + elif action == "node_stop_start_scenario": + node_scenario_object.node_stop_start_scenario(instance_kill_count, node, timeout) + elif action == "node_termination_scenario": + node_scenario_object.node_termination_scenario(instance_kill_count, node, timeout) + elif action == "node_reboot_scenario": + node_scenario_object.node_reboot_scenario(instance_kill_count, node, timeout) + elif action == "stop_start_kubelet_scenario": + node_scenario_object.stop_start_kubelet_scenario(instance_kill_count, node, timeout) + elif action == "stop_kubelet_scenario": + node_scenario_object.stop_kubelet_scenario(instance_kill_count, node, timeout) + elif action == "node_crash_scenario": + node_scenario_object.node_crash_scenario(instance_kill_count, node, timeout) # Get cerberus status