Merge pull request #68 from pravin-dsilva/bastion_scenario

Add node level chaos scenarios for bastion node
This commit is contained in:
Mike Fiedler
2021-02-16 15:31:23 -05:00
committed by GitHub
6 changed files with 135 additions and 21 deletions

View File

@@ -10,6 +10,7 @@ Following node chaos scenarios are supported:
6. **stop_kubelet_scenario**: scenario to stop the kubelet of the node instance.
7. **stop_start_kubelet_scenario**: scenario to stop and start the kubelet of the node instance.
8. **node_crash_scenario**: scenario to crash the node instance.
9. **stop_start_helper_node_scenario**: scenario to stop and start the helper node and check service status.
**NOTE**: If the node doesn't recover from the node_crash_scenario injection, reboot the node to get it back to Ready state.
@@ -34,6 +35,10 @@ After creating the service account you'll need to enable the account using the f
The supported node level chaos scenarios on an OPENSTACK cloud are `node_stop_start_scenario`, `stop_start_kubelet_scenario` and `node_reboot_scenario`.
**NOTE**: For `stop_start_helper_node_scenario`, visit [here](https://github.com/RedHatOfficial/ocp4-helpernode) to learn more about the helper node and its usage.
To execute the scenario, ensure the value of `ssh_private_key` in the node scenarios config file is set to the path of the private key file used for the ssh connection to the helper node. Ensure passwordless ssh is configured between the host running Kraken and the helper node to avoid connection errors.
**NOTE**: The `node_crash_scenario` and `stop_kubelet_scenario` scenarios are supported independently of the cloud platform.
Use 'generic' or do not add the 'cloud_type' key to your scenario if your cluster is not set up using one of the currently supported cloud types.
@@ -64,4 +69,15 @@ node_scenarios:
label_selector: node-role.kubernetes.io/infra
instance_kill_count: 1
timeout: 120
- actions:
- stop_start_helper_node_scenario # node chaos scenario for helper node
instance_kill_count: 1
timeout: 120
helper_node_ip: # ip address of the helper node
service: # check status of the services on the helper node
- haproxy
- dhcpd
- named
ssh_private_key: /root/.ssh/id_rsa # ssh key to access the helper node
cloud_type: openstack
```

View File

@@ -21,6 +21,12 @@ class abstract_node_scenarios:
self.node_start_scenario(instance_kill_count, node, timeout)
logging.info("node_stop_start_scenario has been successfully injected!")
def helper_node_stop_start_scenario(self, instance_kill_count, node, timeout):
    """Stop the helper node, then start it again.

    The same three arguments are forwarded unchanged, first to
    ``helper_node_stop_scenario`` and then to ``helper_node_start_scenario``.
    A log line is written before the stop and after the start returns.
    """
    scenario_args = (instance_kill_count, node, timeout)
    logging.info("Starting helper_node_stop_start_scenario injection")
    self.helper_node_stop_scenario(*scenario_args)
    self.helper_node_start_scenario(*scenario_args)
    logging.info("helper_node_stop_start_scenario has been successfully injected!")
# Node scenario to terminate the node
def node_termination_scenario(self, instance_kill_count, node, timeout):
    """Placeholder for the node termination scenario; does nothing here.

    NOTE(review): presumably overridden by cloud-specific scenario classes —
    confirm against the subclasses.
    """
    pass
@@ -66,3 +72,7 @@ class abstract_node_scenarios:
"Test Failed" % (e))
logging.error("node_crash_scenario injection failed!")
sys.exit(1)
# Node scenario to check service status on helper node
def node_service_status(self, node, service, ssh_private_key, timeout):
    """Placeholder for the helper-node service status check; does nothing here.

    NOTE(review): the OpenStack scenario class and the scenario dispatcher use
    the name ``helper_node_service_status`` for this check, so this stub is not
    the method they call — confirm which name is intended.
    """
    pass

View File

@@ -1,6 +1,7 @@
import time
import random
import logging
import paramiko
import kraken.kubernetes.client as kubecli
import kraken.invoke.command as runcommand
@@ -36,6 +37,36 @@ def wait_for_unknown_status(node, timeout):
if kubecli.get_node_status(node) != "Unknown":
raise Exception("Node condition status isn't Unknown")
# Get the ip of the cluster node
def get_node_ip(node):
    """Query the InternalIP address of a cluster node through kubectl.

    Args:
        node: name of the node as known to the cluster.

    Returns:
        Whatever ``runcommand.invoke`` returns for the kubectl jsonpath query
        selecting the address whose type is ``InternalIP``.
    """
    # A second, unreachable copy of this return statement was removed.
    return runcommand.invoke("kubectl get node %s -o "
                             "jsonpath='{.status.addresses[?(@.type==\"InternalIP\")].address}'"
                             % (node))
def check_service_status(node, service, ssh_private_key, timeout):
    """Log the systemd state of each given service on a remote host over ssh.

    Args:
        node: hostname or IP address to connect to (as user ``root``).
        service: iterable of systemd unit names; a single name given as a
            plain string is treated as a one-element list.
        ssh_private_key: path of the private key file; ``~`` is expanded.
        timeout: retry budget, in seconds of waiting between attempts, for
            establishing the ssh connection.

    Raises:
        Exception: if no ssh connection could be established within the
            retry budget.

    A service that is not ``active`` is reported with ``logging.error`` but
    does not raise.
    """
    import os  # local import so the block does not depend on file-level imports

    # A bare string would otherwise be iterated character by character.
    if isinstance(service, str):
        service = [service] if service else []

    # A literal "~" in key_filename is not a usable path; expand it up front.
    key_path = os.path.expanduser(ssh_private_key) if ssh_private_key else ssh_private_key

    ssh = paramiko.SSHClient()
    ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
    sleeper = 1
    waited = 0
    connected = False
    try:
        while waited <= timeout:
            logging.info("Trying to ssh to instance: %s" % (node))
            try:
                ssh.connect(node, username='root', key_filename=key_path,
                            timeout=800, banner_timeout=400)
                connected = True
                break
            except Exception as e:
                # The host may still be booting: record the reason and retry.
                logging.info("ssh connection to %s failed: %s. Retrying" % (node, e))
            time.sleep(sleeper)
            waited += sleeper

        if not connected:
            # Fail loudly instead of running commands on a dead connection.
            raise Exception("Unable to ssh to %s within %s seconds" % (node, timeout))

        for service_name in service:
            logging.info("Checking status of Service: %s" % (service_name))
            # `systemctl is-active` prints just the unit state (active,
            # inactive, failed, ...), so no whitespace-sensitive parsing of
            # `systemctl status` output is needed.
            stdin, stdout, stderr = ssh.exec_command("systemctl is-active %s" % (service_name))
            lines = stdout.read().decode().split()
            service_status = lines[0] if lines else "unknown"
            logging.info("Status of service %s is %s \n" % (service_name, service_status))
            if service_status != "active":
                logging.error("Service %s is in %s state" % (service_name, service_status))
    finally:
        # Always release the client, including on connection/command errors.
        ssh.close()

View File

@@ -1,9 +1,6 @@
import sys
import time
import logging
import subprocess
import requests
import kraken.kubernetes.client as kubecli
import kraken.invoke.command as runcommand
import kraken.node_actions.common_node_functions as nodeaction
from kraken.node_actions.abstract_node_scenarios import abstract_node_scenarios
@@ -14,21 +11,20 @@ class OPENSTACKCLOUD:
self.Wait = 30
# Start the node instance
def start_instances(self, node):
    """Run ``openstack server start`` for ``node`` and log the action.

    Args:
        node: OpenStack server name passed to the CLI.
    """
    # The stale duplicate header and the second, differently prefixed log
    # line were dropped; only the updated form remains.
    runcommand.invoke("openstack server start %s" % (node))
    logging.info("OPENSTACK CLI INFO: Completed instance start action for node %s" % (node))
# Stop the node instance
def stop_instances(self, node):
    """Run ``openstack server stop`` for ``node`` and log the action.

    Args:
        node: OpenStack server name passed to the CLI.
    """
    # The duplicated log line and the commented-out return of an undefined
    # name were dropped; only the updated form remains.
    runcommand.invoke("openstack server stop %s" % (node))
    logging.info("OPENSTACK CLI INFO: Completed instance stop action for node %s" % (node))
# Reboot the node instance
def reboot_instances(self, node):
    """Run ``openstack server reboot --soft`` for ``node`` and log the action.

    Args:
        node: OpenStack server name passed to the CLI.
    """
    # The stale duplicate header and the second, differently prefixed log
    # line were dropped; only the updated form remains.
    runcommand.invoke("openstack server reboot --soft %s" % (node))
    logging.info("OPENSTACK CLI INFO: Completed instance reboot action for node %s" % (node))
# Wait until the node instance is running
def wait_until_running(self, node):
@@ -43,7 +39,9 @@ class OPENSTACKCLOUD:
i = 0
sleeper = 1
while i <= timeout:
instStatus = runcommand.invoke("openstack server show %s | tr -d ' ' | grep '^|status' | cut -d '|' -f3 | tr -d '\n'" % (node))
instStatus = runcommand.invoke("openstack server show %s | tr -d ' ' |"
"grep '^|status' |"
"cut -d '|' -f3 | tr -d '\n'" % (node))
logging.info("instance status is %s" % (instStatus))
logging.info("expected status is %s" % (expected_status))
if (instStatus.strip() == expected_status):
@@ -51,9 +49,9 @@ class OPENSTACKCLOUD:
return True
time.sleep(sleeper)
i += sleeper
# Get the openstack instance name
def get_openstack_nodename(self,os_node_ip):
def get_openstack_nodename(self, os_node_ip):
server_list = runcommand.invoke("openstack server list | grep %s" % (os_node_ip))
list_of_servers = server_list.split('\n')
for item in list_of_servers:
@@ -61,15 +59,16 @@ class OPENSTACKCLOUD:
counter = 0
for i in items:
if i.strip() != "" and counter == 2:
node_name = i.strip()
logging.info("Openstack node name is %s " % (node_name))
counter += 1
continue
node_name = i.strip()
logging.info("Openstack node name is %s " % (node_name))
counter += 1
continue
item_list = i.split('=')
if len(item_list) == 2 and item_list[-1].strip() == os_node_ip:
return node_name
return node_name
counter += 1
class openstack_node_scenarios(abstract_node_scenarios):
def __init__(self):
    """Create the OPENSTACKCLOUD helper that the scenario methods call."""
    self.openstackcloud = OPENSTACKCLOUD()
@@ -111,7 +110,6 @@ class openstack_node_scenarios(abstract_node_scenarios):
logging.error("node_stop_scenario injection failed!")
sys.exit(1)
# Node scenario to reboot the node
def node_reboot_scenario(self, instance_kill_count, node, timeout):
for _ in range(instance_kill_count):
@@ -131,3 +129,47 @@ class openstack_node_scenarios(abstract_node_scenarios):
logging.error("node_reboot_scenario injection failed!")
sys.exit(1)
# Node scenario to start the helper node
def helper_node_start_scenario(self, instance_kill_count, node_ip, timeout):
    """Start the OpenStack instance that has the given helper node IP.

    Args:
        instance_kill_count: number of times the start sequence is run.
        node_ip: IP address of the helper node; surrounding whitespace is
            stripped before the instance name lookup.
        timeout: accepted for signature parity with the other scenarios;
            not used by this method.

    Any exception raised during the sequence is logged and the process
    exits with status 1.
    """
    for _ in range(instance_kill_count):
        try:
            logging.info("Starting helper_node_start_scenario injection")
            openstack_node_name = self.openstackcloud.get_openstack_nodename(node_ip.strip())
            logging.info("Starting the helper node %s" % (openstack_node_name))
            self.openstackcloud.start_instances(openstack_node_name)
            self.openstackcloud.wait_until_running(openstack_node_name)
            logging.info("Helper node with IP: %s is in running state" % (node_ip))
            # The success message previously named node_start_scenario, which
            # did not match the scenario actually being run.
            logging.info("helper_node_start_scenario has been successfully injected!")
        except Exception as e:
            logging.error("Failed to start node instance. Encountered following "
                          "exception: %s. Test Failed" % (e))
            logging.error("helper_node_start_scenario injection failed!")
            sys.exit(1)
# Node scenario to stop the node
def helper_node_stop_scenario(self, instance_kill_count, node_ip, timeout):
    """Stop the OpenStack instance that has the given helper node IP.

    The sequence (name lookup, stop, wait until stopped) is run
    ``instance_kill_count`` times. ``timeout`` is accepted but not used.
    Any exception is logged and the process exits with status 1.
    """
    for _round in range(instance_kill_count):
        try:
            logging.info("Starting helper_node_stop_scenario injection")
            cloud = self.openstackcloud
            instance_name = cloud.get_openstack_nodename(node_ip.strip())
            logging.info("Stopping the helper node %s " % (instance_name))
            cloud.stop_instances(instance_name)
            cloud.wait_until_stopped(instance_name)
            logging.info("Helper node with IP: %s is in stopped state" % (node_ip))
        except Exception as err:
            failure = ("Failed to stop node instance. "
                       "Encountered following exception: %s. Test Failed" % (err))
            logging.error(failure)
            logging.error("helper_node_stop_scenario injection failed!")
            sys.exit(1)
def helper_node_service_status(self, node_ip, service, ssh_private_key, timeout):
    """Check the given services on the helper node.

    Delegates to ``nodeaction.check_service_status`` with the stripped
    ``node_ip`` and the remaining arguments unchanged. Any exception is
    logged and the process exits with status 1.
    """
    try:
        logging.info("Checking service status on the helper node")
        check = nodeaction.check_service_status
        check(node_ip.strip(), service, ssh_private_key, timeout)
        for message in ("Service status checked on %s" % (node_ip),
                        "Check service status is successfuly injected!"):
            logging.info(message)
    except Exception as err:
        failure = ("Failed to check service status. "
                   "Encountered following exception: %s. Test Failed" % (err))
        logging.error(failure)
        logging.error("helper_node_service_status injection failed!")
        sys.exit(1)

View File

@@ -9,3 +9,4 @@ kubernetes==12.0.0a1
oauth2client>=4.1.3
python-openstackclient
gitpython
paramiko

View File

@@ -49,6 +49,8 @@ def inject_node_scenario(action, node_scenario, node_scenario_object):
node_name = node_scenario.get("node_name", "")
label_selector = node_scenario.get("label_selector", "")
timeout = node_scenario.get("timeout", 120)
service = node_scenario.get("service", "")
ssh_private_key = node_scenario.get("ssh_private_key", "~/.ssh/id_rsa")
# Get the node to apply the scenario
node = nodeaction.get_node(node_name, label_selector)
@@ -71,6 +73,18 @@ def inject_node_scenario(action, node_scenario, node_scenario_object):
node_scenario_object.stop_kubelet_scenario(instance_kill_count, node, timeout)
elif action == "node_crash_scenario":
node_scenario_object.node_crash_scenario(instance_kill_count, node, timeout)
elif action == "stop_start_helper_node_scenario":
if node_scenario['cloud_type'] != "openstack":
logging.error("Scenario: " + action + " is not supported for "
"cloud type " + node_scenario['cloud_type'] + ", skipping action")
else:
if not node_scenario['helper_node_ip']:
logging.error("Helper node IP address is not provided")
sys.exit(1)
node_scenario_object.helper_node_stop_start_scenario(
instance_kill_count, node_scenario['helper_node_ip'], timeout)
node_scenario_object.helper_node_service_status(
node_scenario['helper_node_ip'], service, ssh_private_key, timeout)
# Get cerberus status