diff --git a/docs/node_scenarios.md b/docs/node_scenarios.md index 5bc9dd2f..9fa8ab50 100644 --- a/docs/node_scenarios.md +++ b/docs/node_scenarios.md @@ -10,6 +10,7 @@ Following node chaos scenarios are supported: 6. **stop_kubelet_scenario**: scenario to stop the kubelet of the node instance. 7. **stop_start_kubelet_scenario**: scenario to stop and start the kubelet of the node instance. 8. **node_crash_scenario**: scenario to crash the node instance. +9. **stop_start_helper_node_scenario**: scenario to stop and start the helper node and then check the status of its services. **NOTE**: If the node doesn't recover from the node_crash_scenario injection, reboot the node to get it back to Ready state. @@ -34,6 +35,10 @@ After creating the service account you'll need to enable the account using the f The supported node level chaos scenarios on an OPENSTACK cloud are `node_stop_start_scenario`, `stop_start_kubelet_scenario` and `node_reboot_scenario`. +**NOTE**: For `stop_start_helper_node_scenario`, visit [here](https://github.com/RedHatOfficial/ocp4-helpernode) to learn more about the helper node and its usage. + +To execute the scenario, ensure the value of `ssh_private_key` in the node scenarios config file is set to the path of the private key file used for the SSH connection to the helper node. Ensure passwordless SSH is configured between the host running Kraken and the helper node to avoid connection errors. + **NOTE**: The `node_crash_scenario` and `stop_kubelet_scenario` scenario is supported independent of the cloud platform. 
Use 'generic' or do not add the 'cloud_type' key to your scenario if your cluster is not set up using one of the current supported cloud types @@ -64,4 +69,15 @@ node_scenarios: label_selector: node-role.kubernetes.io/infra instance_kill_count: 1 timeout: 120 + - actions: + - stop_start_helper_node_scenario # node chaos scenario for helper node + instance_kill_count: 1 + timeout: 120 + helper_node_ip: # ip address of the helper node + service: # check status of the services on the helper node + - haproxy + - dhcpd + - named + ssh_private_key: /root/.ssh/id_rsa # ssh key to access the helper node + cloud_type: openstack ``` diff --git a/kraken/node_actions/abstract_node_scenarios.py b/kraken/node_actions/abstract_node_scenarios.py index a9d81f6b..eb645283 100644 --- a/kraken/node_actions/abstract_node_scenarios.py +++ b/kraken/node_actions/abstract_node_scenarios.py @@ -21,6 +21,12 @@ class abstract_node_scenarios: self.node_start_scenario(instance_kill_count, node, timeout) logging.info("node_stop_start_scenario has been successfully injected!") + def helper_node_stop_start_scenario(self, instance_kill_count, node, timeout): + logging.info("Starting helper_node_stop_start_scenario injection") + self.helper_node_stop_scenario(instance_kill_count, node, timeout) + self.helper_node_start_scenario(instance_kill_count, node, timeout) + logging.info("helper_node_stop_start_scenario has been successfully injected!") + # Node scenario to terminate the node def node_termination_scenario(self, instance_kill_count, node, timeout): pass @@ -66,3 +72,7 @@ class abstract_node_scenarios: "Test Failed" % (e)) logging.error("node_crash_scenario injection failed!") sys.exit(1) + + # Node scenario to check service status on helper node + def node_service_status(self, node, service, ssh_private_key, timeout): + pass diff --git a/kraken/node_actions/common_node_functions.py b/kraken/node_actions/common_node_functions.py index bbe66f94..37bd573a 100644 --- 
# Get the ip of the cluster node
def get_node_ip(node):
    """Return the result of the kubectl jsonpath query for the node's InternalIP address."""
    return runcommand.invoke("kubectl get node %s -o "
                             "jsonpath='{.status.addresses[?(@.type==\"InternalIP\")].address}'"
                             % (node))


def _connect_with_retry(ssh, node, ssh_private_key, timeout):
    """Connect `ssh` to `node` as root, retrying once per second.

    One attempt is made after each one-second sleep until the accumulated
    sleep time exceeds `timeout`. Raises RuntimeError (chained to the last
    connection error) when no attempt succeeds.
    """
    waited = 0
    sleeper = 1
    last_error = None
    while waited <= timeout:
        # Sleep first: the node may have only just been started.
        time.sleep(sleeper)
        waited += sleeper
        logging.info("Trying to ssh to instance: %s", node)
        try:
            # connect() returns None on success and raises on failure, so
            # simply reaching the return statement means we are connected.
            ssh.connect(node, username='root', key_filename=ssh_private_key,
                        timeout=800, banner_timeout=400)
            return
        except Exception as e:
            # Retrying is deliberately best-effort, but the reason is kept
            # and logged instead of being silently discarded.
            last_error = e
            logging.info("ssh connection to %s failed, retrying: %s", node, e)
    raise RuntimeError("Unable to ssh to %s within %s seconds"
                       % (node, timeout)) from last_error


# Check the status of the given services on the node over ssh
def check_service_status(node, service, ssh_private_key, timeout):
    """Log and return the systemd state of each service on `node`.

    Args:
        node: host name or IP address to ssh to (as user root).
        service: iterable of systemd unit names; a single name given as a
            string is treated as a one-element list, None or "" as empty.
        ssh_private_key: path of the private key file used to authenticate.
        timeout: retry budget, in seconds of sleep, for opening the connection.

    Returns:
        dict mapping each service name to the state word reported for it
        ("unknown" when the remote command printed nothing).

    Raises:
        RuntimeError: if the ssh connection cannot be established in time.
    """
    # Iterating a bare string would check one "service" per character.
    if service is None:
        service = []
    elif isinstance(service, str):
        service = [service] if service else []

    ssh = paramiko.SSHClient()
    # NOTE(review): AutoAddPolicy accepts unknown host keys without
    # verification; confirm this is acceptable for the target environment.
    ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
    statuses = {}
    try:
        _connect_with_retry(ssh, node, ssh_private_key, timeout)
        for service_name in service:
            logging.info("Checking status of Service: %s", service_name)
            # `systemctl is-active` prints only the state word (active,
            # inactive, failed, ...), so no grep/awk parsing is needed.
            _, stdout, _ = ssh.exec_command("systemctl is-active %s" % (service_name))
            output = stdout.readlines()
            service_status = output[0].strip() if output else "unknown"
            logging.info("Status of service %s is %s \n", service_name, service_status)
            if service_status != "active":
                logging.error("Service %s is in %s state", service_name, service_status)
            statuses[service_name] = service_status
    finally:
        # Release the connection even when connecting or a command fails.
        ssh.close()
    return statuses
import time import logging -import subprocess -import requests -import kraken.kubernetes.client as kubecli import kraken.invoke.command as runcommand import kraken.node_actions.common_node_functions as nodeaction from kraken.node_actions.abstract_node_scenarios import abstract_node_scenarios @@ -14,21 +11,20 @@ class OPENSTACKCLOUD: self.Wait = 30 # Start the node instance - def start_instances(self,node): + def start_instances(self, node): runcommand.invoke("openstack server start %s" % (node)) - logging.info("OPENSTACKCLOUD CLI INFO: Completed instance start action for node %s" % (node)) + logging.info("OPENSTACK CLI INFO: Completed instance start action for node %s" % (node)) # Stop the node instance def stop_instances(self, node): runcommand.invoke("openstack server stop %s" % (node)) - logging.info("OPENSTACKCLOUD CLI INFO: Completed instance stop action for node %s" % (node)) - #return action_output - + logging.info("OPENSTACK CLI INFO: Completed instance stop action for node %s" % (node)) + # return action_output # Reboot the node instance - def reboot_instances(self,node): + def reboot_instances(self, node): runcommand.invoke("openstack server reboot --soft %s" % (node)) - logging.info("OPENSTACKCLOUD CLI INFO: Completed instance reboot action for node %s" % (node)) + logging.info("OPENSTACK CLI INFO: Completed instance reboot action for node %s" % (node)) # Wait until the node instance is running def wait_until_running(self, node): @@ -43,7 +39,9 @@ class OPENSTACKCLOUD: i = 0 sleeper = 1 while i <= timeout: - instStatus = runcommand.invoke("openstack server show %s | tr -d ' ' | grep '^|status' | cut -d '|' -f3 | tr -d '\n'" % (node)) + instStatus = runcommand.invoke("openstack server show %s | tr -d ' ' |" + "grep '^|status' |" + "cut -d '|' -f3 | tr -d '\n'" % (node)) logging.info("instance status is %s" % (instStatus)) logging.info("expected status is %s" % (expected_status)) if (instStatus.strip() == expected_status): @@ -51,9 +49,9 @@ class 
# Node scenario to start the helper node
def helper_node_start_scenario(self, instance_kill_count, node_ip, timeout):
    """Start the openstack instance that owns `node_ip`, `instance_kill_count` times.

    The instance name is looked up from the IP through `self.openstackcloud`.
    `timeout` is accepted for signature parity with the other scenarios but
    is not used here. Any failure is logged and ends the process with exit
    status 1.
    """
    for _ in range(instance_kill_count):
        try:
            logging.info("Starting helper_node_start_scenario injection")
            helper_ip = node_ip.strip()
            openstack_node_name = self.openstackcloud.get_openstack_nodename(helper_ip)
            # The lookup yields nothing when no server matches the IP; fail
            # here rather than asking openstack to start "None".
            if not openstack_node_name:
                raise Exception("No openstack instance found with IP %s" % (helper_ip))
            logging.info("Starting the helper node %s", openstack_node_name)
            self.openstackcloud.start_instances(openstack_node_name)
            self.openstackcloud.wait_until_running(openstack_node_name)
            logging.info("Helper node with IP: %s is in running state", helper_ip)
            logging.info("helper_node_start_scenario has been successfully injected!")
        except Exception as e:
            logging.error("Failed to start node instance. Encountered following "
                          "exception: %s. Test Failed", e)
            logging.error("helper_node_start_scenario injection failed!")
            sys.exit(1)


# Node scenario to stop the helper node
def helper_node_stop_scenario(self, instance_kill_count, node_ip, timeout):
    """Stop the openstack instance that owns `node_ip`, `instance_kill_count` times.

    The instance name is looked up from the IP through `self.openstackcloud`.
    `timeout` is accepted for signature parity with the other scenarios but
    is not used here. Any failure is logged and ends the process with exit
    status 1.
    """
    for _ in range(instance_kill_count):
        try:
            logging.info("Starting helper_node_stop_scenario injection")
            helper_ip = node_ip.strip()
            openstack_node_name = self.openstackcloud.get_openstack_nodename(helper_ip)
            # Same guard as the start scenario: never act on a missing name.
            if not openstack_node_name:
                raise Exception("No openstack instance found with IP %s" % (helper_ip))
            logging.info("Stopping the helper node %s ", openstack_node_name)
            self.openstackcloud.stop_instances(openstack_node_name)
            self.openstackcloud.wait_until_stopped(openstack_node_name)
            logging.info("Helper node with IP: %s is in stopped state", helper_ip)
        except Exception as e:
            logging.error("Failed to stop node instance. Encountered following exception: %s. "
                          "Test Failed", e)
            logging.error("helper_node_stop_scenario injection failed!")
            sys.exit(1)


# Node scenario to check service status on the helper node
def helper_node_service_status(self, node_ip, service, ssh_private_key, timeout):
    """Check the given services on the helper node through nodeaction.check_service_status.

    All arguments are passed through (with `node_ip` stripped of surrounding
    whitespace). Any failure is logged and ends the process with exit
    status 1.
    """
    try:
        logging.info("Checking service status on the helper node")
        nodeaction.check_service_status(node_ip.strip(), service, ssh_private_key, timeout)
        logging.info("Service status checked on %s", node_ip)
        logging.info("Check service status is successfully injected!")
    except Exception as e:
        logging.error("Failed to check service status. Encountered following exception:"
                      " %s. Test Failed", e)
        logging.error("helper_node_service_status injection failed!")
        sys.exit(1)