Merge pull request #50 from paigerube14/node_clouds

Adding a general cloud provider file for kubelet reset and fork bomb on all clouds
This commit is contained in:
Mike Fiedler
2020-11-30 08:58:21 -05:00
committed by GitHub
4 changed files with 98 additions and 33 deletions

View File

@@ -28,7 +28,9 @@ A google service account is required to give proper authentication to GCP for no
After creating the service account you'll need to enable the account using the following: ```export GOOGLE_APPLICATION_CREDENTIALS="<serviceaccount.json>"```
**NOTE**: The `stop_start_kubelet_scenario` and `node_crash_scenario` scenarios are supported as they are independent of the cloud platform.
**NOTE**: The `node_crash_scenario` and `stop_kubelet_scenario` scenarios are supported independent of the cloud platform.
Use 'generic' or do not add the 'cloud_type' key to your scenario if your cluster is not set up using one of the currently supported cloud types.
Node scenarios can be injected by placing the node scenarios config files under node_scenarios option in the kraken config. Refer to [node_scenarios_example](https://github.com/openshift-scale/kraken/blob/master/scenarios/node_scenarios_example.yml) config file.
@@ -51,4 +53,10 @@ node_scenarios:
instance_kill_count: 1
timeout: 120
cloud_type: aws
- actions:
- node_crash_scenario
node_name:
label_selector: node-role.kubernetes.io/infra
instance_kill_count: 1
timeout: 120
```

View File

@@ -15,7 +15,8 @@ class GCP:
self.project = runcommand.invoke('gcloud config get-value project').split('/n')[0].strip()
logging.info("project " + str(self.project) + "!")
credentials = GoogleCredentials.get_application_default()
self.client = discovery.build('compute', 'v1', credentials=credentials, cache_discovery=False)
self.client = discovery.build('compute', 'v1', credentials=credentials,
cache_discovery=False)
# Get the instance ID of the node
def get_instance_id(self, node):
@@ -23,50 +24,56 @@ class GCP:
while zone_request is not None:
zone_response = zone_request.execute()
for zone in zone_response['items']:
instances_request = self.client.instances().list(project=self.project, zone=zone['name'])
instances_request = self.client.instances().list(project=self.project,
zone=zone['name'])
while instances_request is not None:
instance_response = instances_request.execute()
if "items" in instance_response.keys():
for instance in instance_response['items']:
if instance['name'] in node:
return instance['name'], zone['name']
instances_request = self.client.zones().list_next(previous_request=instances_request,
previous_response=instance_response)
zone_request = self.client.zones().list_next(previous_request=zone_request, previous_response=zone_response)
instances_request = self.client.zones().list_next(
previous_request=instances_request,
previous_response=instance_response)
zone_request = self.client.zones().list_next(previous_request=zone_request,
previous_response=zone_response)
logging.info('no instances ')
# Start the node instance.
#
# zone: GCE zone the instance lives in.
# instance_id: instance name to start.
# NOTE(review): the pasted diff contained this call twice (pre- and
# post-image lines), which would fire the start request twice; a single
# request is issued here. The call returns without waiting — callers poll
# get_instance_status for completion.
def start_instances(self, zone, instance_id):
    self.client.instances().start(project=self.project, zone=zone,
                                  instance=instance_id).execute()
# Stop the node instance.
#
# zone: GCE zone the instance lives in.
# instance_id: instance name to stop.
# NOTE(review): the pasted diff contained this call twice; only one stop
# request is issued here.
def stop_instances(self, zone, instance_id):
    self.client.instances().stop(project=self.project, zone=zone,
                                 instance=instance_id).execute()
# Suspend the node instance.
# (Comment fixed: the original said "Start"; the call is suspend.)
#
# zone: GCE zone the instance lives in.
# instance_id: instance name to suspend.
# NOTE(review): the pasted diff also carried a stray duplicate
# stop_instances definition here (pre-image lines); it is already defined
# above, so only suspend_instances belongs in this span.
def suspend_instances(self, zone, instance_id):
    self.client.instances().suspend(project=self.project, zone=zone,
                                    instance=instance_id).execute()
# Terminate (delete) the node instance.
#
# zone: GCE zone the instance lives in.
# instance_id: instance name to delete.
# NOTE(review): the pasted diff contained this call twice; a double delete
# would make the second request fail — only one is issued here.
def terminate_instances(self, zone, instance_id):
    self.client.instances().delete(project=self.project, zone=zone,
                                   instance=instance_id).execute()
# Reboot the node instance.
#
# zone: GCE zone the instance lives in.
# instance_id: instance name to reset.
# NOTE(review): the pasted diff contained the reset call twice, which
# would reboot the node twice; a single reset is issued and its API
# response logged.
def reboot_instances(self, zone, instance_id):
    response = self.client.instances().reset(project=self.project, zone=zone,
                                             instance=instance_id).execute()
    logging.info('response reboot ' + str(response))
# Get instance status
def get_instance_status(self, zone, instance_id, expected_status, timeout):
# statuses: PROVISIONING, STAGING, RUNNING, STOPPING, SUSPENDING, SUSPENDED, REPAIRING, and TERMINATED.
# statuses: PROVISIONING, STAGING, RUNNING, STOPPING, SUSPENDING, SUSPENDED, REPAIRING,
# and TERMINATED.
i = 0
sleeper = 5
while i <= timeout:
instStatus = self.client.instances().get(project=self.project, zone=zone, instance=instance_id).execute()
instStatus = self.client.instances().get(project=self.project, zone=zone,
instance=instance_id).execute()
logging.info("Status of vm " + str(instStatus['status']))
if instStatus['status'] == expected_status:
return True

View File

@@ -0,0 +1,32 @@
import logging
from kraken.node_actions.abstract_node_scenarios import abstract_node_scenarios
class GENERAL:
    """Placeholder cloud handle for the generic (no-API) cloud type.

    Generic clusters have no cloud client to configure, so construction
    is deliberately a no-op.
    """

    def __init__(self):
        # Nothing to set up: there is no cloud API for the generic type.
        pass
class general_node_scenarios(abstract_node_scenarios):
    """Node scenarios for clusters on an unsupported/generic cloud.

    Cloud-API-dependent actions (start/stop/terminate/reboot) cannot run
    without a provider client, so each one only logs that it is skipped.
    Kubelet and node-crash scenarios are inherited from the base class,
    as they act through the node OS rather than a cloud API.
    """

    def __init__(self):
        self.general = GENERAL()

    def _log_unsupported(self, verb):
        # Single place for the "not set up for this cloud" message; the
        # final string is identical to the per-method originals.
        logging.info("Node " + verb + " is not set up yet for this cloud "
                     "type, no action is going to be taken")

    # Node scenario to start the node
    def node_start_scenario(self, instance_kill_count, node, timeout):
        self._log_unsupported("start")

    # Node scenario to stop the node
    def node_stop_scenario(self, instance_kill_count, node, timeout):
        self._log_unsupported("stop")

    # Node scenario to terminate the node
    def node_termination_scenario(self, instance_kill_count, node, timeout):
        self._log_unsupported("termination")

    # Node scenario to reboot the node
    def node_reboot_scenario(self, instance_kill_count, node, timeout):
        self._log_unsupported("reboot")

View File

@@ -12,20 +12,34 @@ import kraken.kubernetes.client as kubecli
import kraken.invoke.command as runcommand
import kraken.node_actions.common_node_functions as nodeaction
from kraken.node_actions.aws_node_scenarios import aws_node_scenarios
from kraken.node_actions.general_cloud_node_scenarios import general_node_scenarios
from kraken.node_actions.gcp_node_scenarios import gcp_node_scenarios
import kraken.time_actions.common_time_functions as time_actions
node_general = False


# Get the node scenarios object of the specified cloud type
def get_node_scenario_object(node_scenario):
    """Return the cloud-specific scenarios object for a scenario config.

    Falls back to the generic (no-op cloud) scenarios when 'cloud_type'
    is absent or set to "generic", and records that choice in the
    module-level node_general flag so injection can skip cloud-only
    actions. Exits the process for unknown cloud types.
    """
    cloud_type = node_scenario.get("cloud_type", "generic")
    if cloud_type == "generic":
        global node_general
        node_general = True
        return general_node_scenarios()
    if cloud_type == "aws":
        return aws_node_scenarios()
    if cloud_type == "gcp":
        return gcp_node_scenarios()
    logging.error("Cloud type " + cloud_type + " is not currently supported; "
                  "try using 'generic' if wanting to stop/start kubelet or fork bomb on any "
                  "cluster")
    sys.exit(1)
# Inject the specified node scenario
def inject_node_scenario(action, node_scenario, node_scenario_object):
    """Run one node-chaos action against a node chosen from the config.

    action: scenario name string (e.g. "node_reboot_scenario").
    node_scenario: dict of scenario settings from the kraken config.
    node_scenario_object: cloud-specific scenarios object returned by
        get_node_scenario_object().

    Fix: the pasted diff left the entire pre-image if/elif action chain in
    place ahead of the post-image `if node_general ... else` block, so
    every action would have executed twice — and even on generic clouds,
    where only the generic_cloud_scenarios are valid. Only the guarded
    post-image chain is kept.
    """
    # Scenarios that work on any cloud: they act via the node OS, not a
    # cloud API.
    generic_cloud_scenarios = ("stop_kubelet_scenario", "node_crash_scenario")
    # Get the node scenario configurations
    instance_kill_count = node_scenario.get("instance_kill_count", 1)
    node_name = node_scenario.get("node_name", "")
    # NOTE(review): this line fell between the visible diff hunks;
    # reconstructed from its use in get_node below — confirm against the
    # repository.
    label_selector = node_scenario.get("label_selector", "")
    timeout = node_scenario.get("timeout", 120)
    # Get the node to apply the scenario
    node = nodeaction.get_node(node_name, label_selector)
    if node_general and action not in generic_cloud_scenarios:
        logging.info("Scenario: " + action + " is not set up for generic cloud type, skipping action")
    else:
        if action == "node_start_scenario":
            node_scenario_object.node_start_scenario(instance_kill_count, node, timeout)
        elif action == "node_stop_scenario":
            node_scenario_object.node_stop_scenario(instance_kill_count, node, timeout)
        elif action == "node_stop_start_scenario":
            node_scenario_object.node_stop_start_scenario(instance_kill_count, node, timeout)
        elif action == "node_termination_scenario":
            node_scenario_object.node_termination_scenario(instance_kill_count, node, timeout)
        elif action == "node_reboot_scenario":
            node_scenario_object.node_reboot_scenario(instance_kill_count, node, timeout)
        elif action == "stop_start_kubelet_scenario":
            node_scenario_object.stop_start_kubelet_scenario(instance_kill_count, node, timeout)
        elif action == "stop_kubelet_scenario":
            node_scenario_object.stop_kubelet_scenario(instance_kill_count, node, timeout)
        elif action == "node_crash_scenario":
            node_scenario_object.node_crash_scenario(instance_kill_count, node, timeout)
# Get cerberus status