From c7bb32f633e5b3f020c5182b46e5ae4778995dcc Mon Sep 17 00:00:00 2001
From: prubenda
Date: Fri, 23 Oct 2020 09:15:47 -0400
Subject: [PATCH] Adding azure to node scenarios

---
 docs/node_scenarios.md                   |  20 ++-
 kraken/node_actions/az_node_scenarios.py | 179 +++++++++++++++++++++++
 requirements.txt                         |   3 +
 run_kraken.py                            |   5 +
 4 files changed, 206 insertions(+), 1 deletion(-)
 create mode 100644 kraken/node_actions/az_node_scenarios.py

diff --git a/docs/node_scenarios.md b/docs/node_scenarios.md
index 9fa8ab50..864970b2 100644
--- a/docs/node_scenarios.md
+++ b/docs/node_scenarios.md
@@ -39,6 +39,24 @@ The supported node level chaos scenarios on an OPENSTACK cloud are `node_stop_st
 To execute the scenario, ensure the value for `ssh_private_key` in the node scenarios config file is set with the correct private key file path for ssh connection to the helper node. Ensure passwordless ssh is configured on the host running Kraken and the helper node to avoid connection errors.
 
+#### Azure
+
+**NOTE**: For Azure node killing scenarios, make sure [Azure CLI](https://docs.microsoft.com/en-us/cli/azure/install-azure-cli?view=azure-cli-latest) is installed
+
+You will also need to create a service principal and give it the correct access, see [here](https://docs.openshift.com/container-platform/4.5/installing/installing_azure/installing-azure-account.html) for creating the service principal and setting the proper permissions
+
+To properly run the service principal requires “Azure Active Directory Graph/Application.ReadWrite.OwnedBy” api permission granted and “User Access Administrator”
+
+Before running you'll need to set the following:
+1. Login using ```az login```
+
+2. ```export AZURE_TENANT_ID=```
+
+3. ```export AZURE_CLIENT_SECRET=```
+
+4. ```export AZURE_CLIENT_ID=```
+
+
 **NOTE**: The `node_crash_scenario` and `stop_kubelet_scenario` scenario is supported independent of the cloud platform.
 
 Use 'generic' or do not add the 'cloud_type' key to your scenario if your cluster is not set up using one of the current supported cloud types
 
@@ -62,7 +80,7 @@ node_scenarios:
     label_selector: node-role.kubernetes.io/infra
     instance_kill_count: 1
     timeout: 120
-    cloud_type: aws
+    cloud_type: azure
   - actions:
     - node_crash_scenario
     node_name:
diff --git a/kraken/node_actions/az_node_scenarios.py b/kraken/node_actions/az_node_scenarios.py
new file mode 100644
index 00000000..f873d48e
--- /dev/null
+++ b/kraken/node_actions/az_node_scenarios.py
@@ -0,0 +1,179 @@
+import sys
+import time
+from azure.mgmt.compute import ComputeManagementClient
+from azure.identity import DefaultAzureCredential
+import logging
+import kraken.kubernetes.client as kubecli
+import kraken.node_actions.common_node_functions as nodeaction
+from kraken.node_actions.abstract_node_scenarios import abstract_node_scenarios
+import kraken.invoke.command as runcommand
+import yaml
+
+
+class Azure:
+    def __init__(self):
+        logging.info('azure ' + str(self))
+        # Acquire a credential object using CLI-based authentication.
+        credentials = DefaultAzureCredential()
+        logging.info("credential " + str(credentials))
+        az_account = runcommand.invoke("az account list -o yaml")
+        az_account_yaml = yaml.load(az_account, Loader=yaml.FullLoader)
+        subscription_id = az_account_yaml[0]['id']
+        self.compute_client = ComputeManagementClient(credentials, subscription_id)
+
+    # Get the instance ID of the node
+    def get_instance_id(self, node_name):
+        vm_list = self.compute_client.virtual_machines.list_all()
+        for vm in vm_list:
+            array = vm.id.split("/")
+            resource_group = array[4]
+            vm_name = array[-1]
+            if node_name == vm_name:
+                return resource_group
+        logging.error("Couldn't find vm with name " + str(node_name))
+
+    # Start the node instance
+    def start_instances(self, group_name, vm_name):
+        self.compute_client.virtual_machines.begin_start(group_name, vm_name)
+
+    # Stop the node instance
+    def stop_instances(self, group_name, vm_name):
+        self.compute_client.virtual_machines.begin_power_off(group_name, vm_name)
+
+    # Terminate the node instance
+    def terminate_instances(self, group_name, vm_name):
+        self.compute_client.virtual_machines.begin_delete(group_name, vm_name)
+
+    # Reboot the node instance
+    def reboot_instances(self, group_name, vm_name):
+        self.compute_client.virtual_machines.begin_restart(group_name, vm_name)
+
+    def get_vm_status(self, resource_group, vm_name):
+        statuses = self.compute_client.virtual_machines.instance_view(resource_group, vm_name)\
+            .statuses
+        status = len(statuses) >= 2 and statuses[1]
+        return status
+
+    # Wait until the node instance is running
+    def wait_until_running(self, resource_group, vm_name, timeout):
+        time_counter = 0
+        status = self.get_vm_status(resource_group, vm_name)
+        while status and status.code != 'PowerState/running':
+            status = self.get_vm_status(resource_group, vm_name)
+            logging.info("Vm %s is still not running, sleeping for 5 seconds" % vm_name)
+            time.sleep(5)
+            time_counter += 5
+            if time_counter >= timeout:
+                logging.info("Vm %s is still not ready in allotted time" % vm_name)
+                break
+
+    # Wait until the node instance is stopped
+    def wait_until_stopped(self, resource_group, vm_name, timeout):
+        time_counter = 0
+        status = self.get_vm_status(resource_group, vm_name)
+        while status and status.code != 'PowerState/stopped':
+            status = self.get_vm_status(resource_group, vm_name)
+            logging.info("Vm %s is still stopping, sleeping for 5 seconds" % vm_name)
+            time.sleep(5)
+            time_counter += 5
+            if time_counter >= timeout:
+                logging.info("Vm %s is still not stopped in allotted time" % vm_name)
+                break
+
+    # Wait until the node instance is terminated
+    def wait_until_terminated(self, resource_group, vm_name):
+        statuses = self.compute_client.virtual_machines.instance_view(resource_group,
+                                                                      vm_name).statuses[0]
+        logging.info("vm status " + str(statuses))
+        while statuses.code == "ProvisioningState/deleting":
+            try:
+                statuses = self.compute_client.virtual_machines.instance_view(resource_group,
+                                                                              vm_name).statuses[0]
+                logging.info("Vm %s is still deleting, waiting 10 seconds" % vm_name)
+                time.sleep(10)
+            except Exception:
+                logging.info("Vm %s is terminated" % vm_name)
+                break
+
+
+class azure_node_scenarios(abstract_node_scenarios):
+    def __init__(self):
+        logging.info("init in azure")
+        self.azure = Azure()
+
+    # Node scenario to start the node
+    def node_start_scenario(self, instance_kill_count, node, timeout):
+        for _ in range(instance_kill_count):
+            try:
+                logging.info("Starting node_start_scenario injection")
+                resource_group = self.azure.get_instance_id(node)
+                logging.info("Starting the node %s with instance ID: %s " % (node, resource_group))
+                self.azure.start_instances(resource_group, node)
+                self.azure.wait_until_running(resource_group, node, timeout)
+                nodeaction.wait_for_ready_status(node, timeout)
+                logging.info("Node with instance ID: %s is in running state" % node)
+                logging.info("node_start_scenario has been successfully injected!")
+            except Exception as e:
+                logging.error("Failed to start node instance. Encountered following "
+                              "exception: %s. Test Failed" % (e))
+                logging.error("node_start_scenario injection failed!")
+                sys.exit(1)
+
+    # Node scenario to stop the node
+    def node_stop_scenario(self, instance_kill_count, node, timeout):
+        for _ in range(instance_kill_count):
+            try:
+                logging.info("Starting node_stop_scenario injection")
+                resource_group = self.azure.get_instance_id(node)
+                logging.info("Stopping the node %s with instance ID: %s " % (node, resource_group))
+                self.azure.stop_instances(resource_group, node)
+                self.azure.wait_until_stopped(resource_group, node, timeout)
+                logging.info("Node with instance ID: %s is in stopped state" % node)
+                nodeaction.wait_for_unknown_status(node, timeout)
+            except Exception as e:
+                logging.error("Failed to stop node instance. Encountered following exception: %s. "
+                              "Test Failed" % e)
+                logging.error("node_stop_scenario injection failed!")
+                sys.exit(1)
+
+    # Node scenario to terminate the node
+    def node_termination_scenario(self, instance_kill_count, node, timeout):
+        for _ in range(instance_kill_count):
+            try:
+                logging.info("Starting node_termination_scenario injection")
+                resource_group = self.azure.get_instance_id(node)
+                logging.info("Terminating the node %s with instance ID: %s "
+                             % (node, resource_group))
+                self.azure.terminate_instances(resource_group, node)
+                self.azure.wait_until_terminated(resource_group, node)
+                for _ in range(timeout):
+                    if node not in kubecli.list_nodes():
+                        break
+                    time.sleep(1)
+                if node in kubecli.list_nodes():
+                    raise Exception("Node could not be terminated")
+                logging.info("Node with instance ID: %s has been terminated" % node)
+                logging.info("node_termination_scenario has been successfully injected!")
+            except Exception as e:
+                logging.error("Failed to terminate node instance. Encountered following exception:"
+                              " %s. Test Failed" % (e))
+                logging.error("node_termination_scenario injection failed!")
+                sys.exit(1)
+
+    # Node scenario to reboot the node
+    def node_reboot_scenario(self, instance_kill_count, node, timeout):
+        for _ in range(instance_kill_count):
+            try:
+                logging.info("Starting node_reboot_scenario injection")
+                resource_group = self.azure.get_instance_id(node)
+                logging.info("Rebooting the node %s with instance ID: %s " % (node, resource_group))
+                self.azure.reboot_instances(resource_group, node)
+                nodeaction.wait_for_unknown_status(node, timeout)
+                nodeaction.wait_for_ready_status(node, timeout)
+                logging.info("Node with instance ID: %s has been rebooted" % (node))
+                logging.info("node_reboot_scenario has been successfully injected!")
+            except Exception as e:
+                logging.error("Failed to reboot node instance. Encountered following exception:"
+                              " %s. Test Failed" % (e))
+                logging.error("node_reboot_scenario injection failed!")
+                sys.exit(1)
diff --git a/requirements.txt b/requirements.txt
index 544680d7..f0011a49 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,6 +5,9 @@ git+https://github.com/powerfulseal/powerfulseal.git
 requests
 boto3
 google-api-python-client
+azure-mgmt-compute
+azure-keyvault
+azure-identity
 kubernetes==12.0.0a1
 oauth2client>=4.1.3
 python-openstackclient
diff --git a/run_kraken.py b/run_kraken.py
index 1dd80b40..80eaf9ef 100644
--- a/run_kraken.py
+++ b/run_kraken.py
@@ -14,6 +14,7 @@ import kraken.litmus.common_litmus as common_litmus
 import kraken.node_actions.common_node_functions as nodeaction
 from kraken.node_actions.aws_node_scenarios import aws_node_scenarios
 from kraken.node_actions.general_cloud_node_scenarios import general_node_scenarios
+from kraken.node_actions.az_node_scenarios import azure_node_scenarios
 from kraken.node_actions.gcp_node_scenarios import gcp_node_scenarios
 from kraken.node_actions.openstack_node_scenarios import openstack_node_scenarios
 import kraken.time_actions.common_time_functions as time_actions
@@ -34,6 +35,8 @@ def get_node_scenario_object(node_scenario):
         return gcp_node_scenarios()
     elif node_scenario['cloud_type'] == 'openstack':
         return openstack_node_scenarios()
+    elif node_scenario['cloud_type'] == 'azure' or node_scenario['cloud_type'] == 'az':
+        return azure_node_scenarios()
     else:
         logging.error("Cloud type " + node_scenario['cloud_type'] + " is not currently supported; "
                       "try using 'generic' if wanting to stop/start kubelet or fork bomb on any "
@@ -85,6 +88,8 @@
                 instance_kill_count, node_scenario['helper_node_ip'], timeout)
             node_scenario_object.helper_node_service_status(
                 node_scenario['helper_node_ip'], service, ssh_private_key, timeout)
+        else:
+            logging.info('There is no node action that matches %s, skipping scenario' % action)
 
 
 # Get cerberus status