Merge pull request #50 from paigerube14/node_clouds

Adding a general cloud provider file for kubelet reset and fork bomb on all clouds
This commit is contained in:
Mike Fiedler
2020-11-30 08:58:21 -05:00
committed by GitHub
4 changed files with 98 additions and 33 deletions

View File

@@ -28,7 +28,9 @@ A google service account is required to give proper authentication to GCP for no
After creating the service account you'll need to enable the account using the following: ```export GOOGLE_APPLICATION_CREDENTIALS="<serviceaccount.json>"```
**NOTE**: The `stop_start_kubelet_scenario` and `node_crash_scenario` scenarios are supported as they are independent of the cloud platform.
**NOTE**: The `node_crash_scenario` and `stop_kubelet_scenario` scenarios are supported independent of the cloud platform.
Use 'generic' or do not add the 'cloud_type' key to your scenario if your cluster is not set up using one of the currently supported cloud types.
Node scenarios can be injected by placing the node scenarios config files under node_scenarios option in the kraken config. Refer to [node_scenarios_example](https://github.com/openshift-scale/kraken/blob/master/scenarios/node_scenarios_example.yml) config file.
@@ -51,4 +53,10 @@ node_scenarios:
instance_kill_count: 1
timeout: 120
cloud_type: aws
- actions:
- node_crash_scenario
node_name:
label_selector: node-role.kubernetes.io/infra
instance_kill_count: 1
timeout: 120
```

View File

@@ -15,7 +15,8 @@ class GCP:
self.project = runcommand.invoke('gcloud config get-value project').split('/n')[0].strip()
logging.info("project " + str(self.project) + "!")
credentials = GoogleCredentials.get_application_default()
self.client = discovery.build('compute', 'v1', credentials=credentials, cache_discovery=False)
self.client = discovery.build('compute', 'v1', credentials=credentials,
cache_discovery=False)
# Get the instance ID of the node
def get_instance_id(self, node):
@@ -23,50 +24,56 @@ class GCP:
while zone_request is not None:
zone_response = zone_request.execute()
for zone in zone_response['items']:
instances_request = self.client.instances().list(project=self.project, zone=zone['name'])
instances_request = self.client.instances().list(project=self.project,
zone=zone['name'])
while instances_request is not None:
instance_response = instances_request.execute()
if "items" in instance_response.keys():
for instance in instance_response['items']:
if instance['name'] in node:
return instance['name'], zone['name']
instances_request = self.client.zones().list_next(previous_request=instances_request,
previous_response=instance_response)
zone_request = self.client.zones().list_next(previous_request=zone_request, previous_response=zone_response)
instances_request = self.client.zones().list_next(
previous_request=instances_request,
previous_response=instance_response)
zone_request = self.client.zones().list_next(previous_request=zone_request,
previous_response=zone_response)
logging.info('no instances ')
# Start the node instance.
#
# zone: GCE zone the instance lives in.
# instance_id: instance name to start.
# NOTE(review): the pasted diff contained this call twice (pre- and
# post-image lines), which would fire the start request twice; a single
# request is issued here. The call returns without waiting — callers poll
# get_instance_status for completion.
def start_instances(self, zone, instance_id):
    self.client.instances().start(project=self.project, zone=zone,
                                  instance=instance_id).execute()
# Stop the node instance.
#
# zone: GCE zone the instance lives in.
# instance_id: instance name to stop.
# NOTE(review): the pasted diff contained this call twice; only one stop
# request is issued here.
def stop_instances(self, zone, instance_id):
    self.client.instances().stop(project=self.project, zone=zone,
                                 instance=instance_id).execute()
# Suspend the node instance.
# (Comment fixed: the original said "Start"; the call is suspend.)
#
# zone: GCE zone the instance lives in.
# instance_id: instance name to suspend.
# NOTE(review): the pasted diff also carried a stray duplicate
# stop_instances definition here (pre-image lines); it is already defined
# above, so only suspend_instances belongs in this span.
def suspend_instances(self, zone, instance_id):
    self.client.instances().suspend(project=self.project, zone=zone,
                                    instance=instance_id).execute()
# Terminate (delete) the node instance.
#
# zone: GCE zone the instance lives in.
# instance_id: instance name to delete.
# NOTE(review): the pasted diff contained this call twice; a double delete
# would make the second request fail — only one is issued here.
def terminate_instances(self, zone, instance_id):
    self.client.instances().delete(project=self.project, zone=zone,
                                   instance=instance_id).execute()
# Reboot the node instance.
#
# zone: GCE zone the instance lives in.
# instance_id: instance name to reset.
# NOTE(review): the pasted diff contained the reset call twice, which
# would reboot the node twice; a single reset is issued and its API
# response logged.
def reboot_instances(self, zone, instance_id):
    response = self.client.instances().reset(project=self.project, zone=zone,
                                             instance=instance_id).execute()
    logging.info('response reboot ' + str(response))
# Get instance status
def get_instance_status(self, zone, instance_id, expected_status, timeout):
# statuses: PROVISIONING, STAGING, RUNNING, STOPPING, SUSPENDING, SUSPENDED, REPAIRING, and TERMINATED.
# statuses: PROVISIONING, STAGING, RUNNING, STOPPING, SUSPENDING, SUSPENDED, REPAIRING,
# and TERMINATED.
i = 0
sleeper = 5
while i <= timeout:
instStatus = self.client.instances().get(project=self.project, zone=zone, instance=instance_id).execute()
instStatus = self.client.instances().get(project=self.project, zone=zone,
instance=instance_id).execute()
logging.info("Status of vm " + str(instStatus['status']))
if instStatus['status'] == expected_status:
return True

View File

@@ -0,0 +1,32 @@
import logging
from kraken.node_actions.abstract_node_scenarios import abstract_node_scenarios
class GENERAL:
    """Placeholder cloud handle for the generic (no-API) cloud type.

    Generic clusters have no cloud client to configure, so construction
    is deliberately a no-op.
    """

    def __init__(self):
        # Nothing to set up: there is no cloud API for the generic type.
        pass
class general_node_scenarios(abstract_node_scenarios):
    """Node scenarios for clusters on an unsupported/generic cloud.

    Cloud-API-dependent actions (start/stop/terminate/reboot) cannot run
    without a provider client, so each one only logs that it is skipped.
    Kubelet and node-crash scenarios are inherited from the base class,
    as they act through the node OS rather than a cloud API.
    """

    def __init__(self):
        self.general = GENERAL()

    def _log_unsupported(self, verb):
        # Single place for the "not set up for this cloud" message; the
        # final string is identical to the per-method originals.
        logging.info("Node " + verb + " is not set up yet for this cloud "
                     "type, no action is going to be taken")

    # Node scenario to start the node
    def node_start_scenario(self, instance_kill_count, node, timeout):
        self._log_unsupported("start")

    # Node scenario to stop the node
    def node_stop_scenario(self, instance_kill_count, node, timeout):
        self._log_unsupported("stop")

    # Node scenario to terminate the node
    def node_termination_scenario(self, instance_kill_count, node, timeout):
        self._log_unsupported("termination")

    # Node scenario to reboot the node
    def node_reboot_scenario(self, instance_kill_count, node, timeout):
        self._log_unsupported("reboot")

View File

@@ -12,20 +12,34 @@ import kraken.kubernetes.client as kubecli
import kraken.invoke.command as runcommand
import kraken.node_actions.common_node_functions as nodeaction
from kraken.node_actions.aws_node_scenarios import aws_node_scenarios
from kraken.node_actions.general_cloud_node_scenarios import general_node_scenarios
from kraken.node_actions.gcp_node_scenarios import gcp_node_scenarios
import kraken.time_actions.common_time_functions as time_actions
node_general = False


# Get the node scenarios object of the specified cloud type
def get_node_scenario_object(node_scenario):
    """Return the cloud-specific scenarios object for a scenario config.

    Falls back to the generic (no-op cloud) scenarios when 'cloud_type'
    is absent or set to "generic", and records that choice in the
    module-level node_general flag so injection can skip cloud-only
    actions. Exits the process for unknown cloud types.
    """
    cloud_type = node_scenario.get("cloud_type", "generic")
    if cloud_type == "generic":
        global node_general
        node_general = True
        return general_node_scenarios()
    if cloud_type == "aws":
        return aws_node_scenarios()
    if cloud_type == "gcp":
        return gcp_node_scenarios()
    logging.error("Cloud type " + cloud_type + " is not currently supported; "
                  "try using 'generic' if wanting to stop/start kubelet or fork bomb on any "
                  "cluster")
    sys.exit(1)
# Inject the specified node scenario
def inject_node_scenario(action, node_scenario, node_scenario_object):
    """Run one node-chaos action against a node chosen from the config.

    action: scenario name string (e.g. "node_reboot_scenario").
    node_scenario: dict of scenario settings from the kraken config.
    node_scenario_object: cloud-specific scenarios object returned by
        get_node_scenario_object().

    Fix: the pasted diff left the entire pre-image if/elif action chain in
    place ahead of the post-image `if node_general ... else` block, so
    every action would have executed twice — and even on generic clouds,
    where only the generic_cloud_scenarios are valid. Only the guarded
    post-image chain is kept.
    """
    # Scenarios that work on any cloud: they act via the node OS, not a
    # cloud API.
    generic_cloud_scenarios = ("stop_kubelet_scenario", "node_crash_scenario")
    # Get the node scenario configurations
    instance_kill_count = node_scenario.get("instance_kill_count", 1)
    node_name = node_scenario.get("node_name", "")
    # NOTE(review): this line fell between the visible diff hunks;
    # reconstructed from its use in get_node below — confirm against the
    # repository.
    label_selector = node_scenario.get("label_selector", "")
    timeout = node_scenario.get("timeout", 120)
    # Get the node to apply the scenario
    node = nodeaction.get_node(node_name, label_selector)
    if node_general and action not in generic_cloud_scenarios:
        logging.info("Scenario: " + action + " is not set up for generic cloud type, skipping action")
    else:
        if action == "node_start_scenario":
            node_scenario_object.node_start_scenario(instance_kill_count, node, timeout)
        elif action == "node_stop_scenario":
            node_scenario_object.node_stop_scenario(instance_kill_count, node, timeout)
        elif action == "node_stop_start_scenario":
            node_scenario_object.node_stop_start_scenario(instance_kill_count, node, timeout)
        elif action == "node_termination_scenario":
            node_scenario_object.node_termination_scenario(instance_kill_count, node, timeout)
        elif action == "node_reboot_scenario":
            node_scenario_object.node_reboot_scenario(instance_kill_count, node, timeout)
        elif action == "stop_start_kubelet_scenario":
            node_scenario_object.stop_start_kubelet_scenario(instance_kill_count, node, timeout)
        elif action == "stop_kubelet_scenario":
            node_scenario_object.stop_kubelet_scenario(instance_kill_count, node, timeout)
        elif action == "node_crash_scenario":
            node_scenario_object.node_crash_scenario(instance_kill_count, node, timeout)
# Get cerberus status