mirror of
https://github.com/krkn-chaos/krkn.git
synced 2026-04-15 06:57:28 +00:00
Merge pull request #68 from pravin-dsilva/bastion_scenario
Add node level chaos scenarios for bastion node
This commit is contained in:
@@ -10,6 +10,7 @@ Following node chaos scenarios are supported:
|
||||
6. **stop_kubelet_scenario**: scenario to stop the kubelet of the node instance.
|
||||
7. **stop_start_kubelet_scenario**: scenario to stop and start the kubelet of the node instance.
|
||||
8. **node_crash_scenario**: scenario to crash the node instance.
|
||||
9. **stop_start_helper_node_scenario**: scenario to stop and start the helper node and check service status.
|
||||
|
||||
**NOTE**: If the node doesn't recover from the node_crash_scenario injection, reboot the node to get it back to Ready state.
|
||||
|
||||
@@ -34,6 +35,10 @@ After creating the service account you'll need to enable the account using the f
|
||||
|
||||
The supported node level chaos scenarios on an OPENSTACK cloud are `node_stop_start_scenario`, `stop_start_kubelet_scenario` and `node_reboot_scenario`.
|
||||
|
||||
**NOTE**: For `stop_start_helper_node_scenario`, visit [here](https://github.com/RedHatOfficial/ocp4-helpernode) to learn more about the helper node and its usage.
|
||||
|
||||
To execute the scenario, ensure the value for `ssh_private_key` in the node scenarios config file is set to the correct private key file path for the ssh connection to the helper node. Ensure passwordless ssh is configured between the host running Kraken and the helper node to avoid connection errors.
|
||||
|
||||
**NOTE**: The `node_crash_scenario` and `stop_kubelet_scenario` scenarios are supported independent of the cloud platform.
|
||||
|
||||
Use 'generic' or do not add the 'cloud_type' key to your scenario if your cluster is not set up using one of the currently supported cloud types.
|
||||
@@ -64,4 +69,15 @@ node_scenarios:
|
||||
label_selector: node-role.kubernetes.io/infra
|
||||
instance_kill_count: 1
|
||||
timeout: 120
|
||||
- actions:
|
||||
- stop_start_helper_node_scenario # node chaos scenario for helper node
|
||||
instance_kill_count: 1
|
||||
timeout: 120
|
||||
helper_node_ip: # ip address of the helper node
|
||||
service: # check status of the services on the helper node
|
||||
- haproxy
|
||||
- dhcpd
|
||||
- named
|
||||
ssh_private_key: /root/.ssh/id_rsa # ssh key to access the helper node
|
||||
cloud_type: openstack
|
||||
```
|
||||
|
||||
@@ -21,6 +21,12 @@ class abstract_node_scenarios:
|
||||
self.node_start_scenario(instance_kill_count, node, timeout)
|
||||
logging.info("node_stop_start_scenario has been successfully injected!")
|
||||
|
||||
def helper_node_stop_start_scenario(self, instance_kill_count, node, timeout):
    """Stop the helper node and then start it again.

    Calls ``helper_node_stop_scenario`` followed by
    ``helper_node_start_scenario`` on this object, forwarding
    ``instance_kill_count``, ``node`` and ``timeout`` unchanged to both.
    """
    logging.info("Starting helper_node_stop_start_scenario injection")
    # Look each step up only when it is about to run, so the stop step
    # always executes before the start step is resolved.
    for step_name in ("helper_node_stop_scenario", "helper_node_start_scenario"):
        getattr(self, step_name)(instance_kill_count, node, timeout)
    logging.info("helper_node_stop_start_scenario has been successfully injected!")
|
||||
|
||||
# Node scenario to terminate the node
|
||||
def node_termination_scenario(self, instance_kill_count, node, timeout):
|
||||
pass
|
||||
@@ -66,3 +72,7 @@ class abstract_node_scenarios:
|
||||
"Test Failed" % (e))
|
||||
logging.error("node_crash_scenario injection failed!")
|
||||
sys.exit(1)
|
||||
|
||||
# Node scenario to check service status on helper node
|
||||
def node_service_status(self, node, service, ssh_private_key, timeout):
    """Placeholder for checking service status on the helper node.

    This implementation does nothing and returns ``None``.
    """
    return None
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
import time
|
||||
import random
|
||||
import logging
|
||||
import paramiko
|
||||
import kraken.kubernetes.client as kubecli
|
||||
import kraken.invoke.command as runcommand
|
||||
|
||||
@@ -36,6 +37,36 @@ def wait_for_unknown_status(node, timeout):
|
||||
if kubecli.get_node_status(node) != "Unknown":
|
||||
raise Exception("Node condition status isn't Unknown")
|
||||
|
||||
|
||||
# Get the ip of the cluster node
|
||||
def get_node_ip(node):
    """Return the output of a kubectl query for the node's InternalIP address.

    Args:
        node: Name of the cluster node to look up.

    Returns:
        Whatever ``runcommand.invoke`` returns for the kubectl command.
    """
    # A single return statement: the stale single-line variant of this same
    # command that preceded it made the wrapped version unreachable.
    return runcommand.invoke("kubectl get node %s -o "
                             "jsonpath='{.status.addresses[?(@.type==\"InternalIP\")].address}'"
                             % (node))
|
||||
|
||||
|
||||
def check_service_status(node, service, ssh_private_key, timeout):
    """Log the systemd state of each given service on a remote host over ssh.

    Connects as ``root`` using the given private key, retrying roughly once
    per second until ``timeout`` is used up, then queries every service and
    logs an error for each one that is not ``active``.

    Args:
        node: Host name or IP address to ssh to.
        service: Iterable of systemd unit names; a single name given as a
            plain string is treated as a one-element list.
        ssh_private_key: Path to the private key file used to authenticate.
        timeout: Retry budget for establishing the connection, counted in
            sleep seconds between attempts.

    Raises:
        Exception: If no ssh connection could be established.
    """
    # Iterating a bare string would check one "service" per character.
    if isinstance(service, str):
        service = [service] if service else []

    ssh = paramiko.SSHClient()
    ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
    waited = 0
    sleeper = 1
    connected = False
    try:
        while waited <= timeout:
            time.sleep(sleeper)
            waited += sleeper
            logging.info("Trying to ssh to instance: %s" % (node))
            try:
                # connect() returns None and signals failure by raising, so
                # success is "no exception", not a check on the return value.
                ssh.connect(node, username='root', key_filename=ssh_private_key,
                            timeout=800, banner_timeout=400)
            except Exception as e:
                logging.info("ssh connection to %s failed: %s. Retrying" % (node, e))
                continue
            connected = True
            break

        if not connected:
            raise Exception("Unable to ssh to instance %s within %s seconds"
                            % (node, timeout))

        for service_name in service:
            logging.info("Checking status of Service: %s" % (service_name))
            # `systemctl is-active` prints just the state word, which avoids
            # parsing the whitespace-sensitive `systemctl status` layout.
            stdin, stdout, stderr = ssh.exec_command("systemctl is-active %s" % (service_name))
            output = stdout.readlines()
            # Empty output must not abort the remaining checks.
            service_status = output[0].strip() if output else "unknown"
            logging.info("Status of service %s is %s \n" % (service_name, service_status))
            if service_status != "active":
                logging.error("Service %s is in %s state" % (service_name, service_status))
    finally:
        # Always release the connection, including on failure paths.
        ssh.close()
|
||||
|
||||
@@ -1,9 +1,6 @@
|
||||
import sys
|
||||
import time
|
||||
import logging
|
||||
import subprocess
|
||||
import requests
|
||||
import kraken.kubernetes.client as kubecli
|
||||
import kraken.invoke.command as runcommand
|
||||
import kraken.node_actions.common_node_functions as nodeaction
|
||||
from kraken.node_actions.abstract_node_scenarios import abstract_node_scenarios
|
||||
@@ -14,21 +11,20 @@ class OPENSTACKCLOUD:
|
||||
self.Wait = 30
|
||||
|
||||
# Start the node instance
|
||||
def start_instances(self, node):
    """Start the OpenStack server ``node`` through the openstack CLI.

    Args:
        node: Name of the server, interpolated into
            ``openstack server start``.
    """
    # One definition and one log line only: the stale duplicate signature
    # and the older "OPENSTACKCLOUD" log line are dropped.
    runcommand.invoke("openstack server start %s" % (node))
    logging.info("OPENSTACK CLI INFO: Completed instance start action for node %s" % (node))
|
||||
|
||||
# Stop the node instance
|
||||
def stop_instances(self, node):
    """Stop the OpenStack server ``node`` through the openstack CLI.

    Args:
        node: Name of the server, interpolated into
            ``openstack server stop``.
    """
    # Only the current log line is kept; the older "OPENSTACKCLOUD" duplicate
    # and the commented-out return are dropped.
    runcommand.invoke("openstack server stop %s" % (node))
    logging.info("OPENSTACK CLI INFO: Completed instance stop action for node %s" % (node))
|
||||
|
||||
# Reboot the node instance
|
||||
def reboot_instances(self, node):
    """Soft-reboot the OpenStack server ``node`` through the openstack CLI.

    Args:
        node: Name of the server, interpolated into
            ``openstack server reboot --soft``.
    """
    # One definition and one log line only: the stale duplicate signature
    # and the older "OPENSTACKCLOUD" log line are dropped.
    runcommand.invoke("openstack server reboot --soft %s" % (node))
    logging.info("OPENSTACK CLI INFO: Completed instance reboot action for node %s" % (node))
|
||||
|
||||
# Wait until the node instance is running
|
||||
def wait_until_running(self, node):
|
||||
@@ -43,7 +39,9 @@ class OPENSTACKCLOUD:
|
||||
i = 0
|
||||
sleeper = 1
|
||||
while i <= timeout:
|
||||
instStatus = runcommand.invoke("openstack server show %s | tr -d ' ' | grep '^|status' | cut -d '|' -f3 | tr -d '\n'" % (node))
|
||||
instStatus = runcommand.invoke("openstack server show %s | tr -d ' ' |"
|
||||
"grep '^|status' |"
|
||||
"cut -d '|' -f3 | tr -d '\n'" % (node))
|
||||
logging.info("instance status is %s" % (instStatus))
|
||||
logging.info("expected status is %s" % (expected_status))
|
||||
if (instStatus.strip() == expected_status):
|
||||
@@ -51,9 +49,9 @@ class OPENSTACKCLOUD:
|
||||
return True
|
||||
time.sleep(sleeper)
|
||||
i += sleeper
|
||||
|
||||
|
||||
# Get the openstack instance name
|
||||
def get_openstack_nodename(self,os_node_ip):
|
||||
def get_openstack_nodename(self, os_node_ip):
|
||||
server_list = runcommand.invoke("openstack server list | grep %s" % (os_node_ip))
|
||||
list_of_servers = server_list.split('\n')
|
||||
for item in list_of_servers:
|
||||
@@ -61,15 +59,16 @@ class OPENSTACKCLOUD:
|
||||
counter = 0
|
||||
for i in items:
|
||||
if i.strip() != "" and counter == 2:
|
||||
node_name = i.strip()
|
||||
logging.info("Openstack node name is %s " % (node_name))
|
||||
counter += 1
|
||||
continue
|
||||
node_name = i.strip()
|
||||
logging.info("Openstack node name is %s " % (node_name))
|
||||
counter += 1
|
||||
continue
|
||||
item_list = i.split('=')
|
||||
if len(item_list) == 2 and item_list[-1].strip() == os_node_ip:
|
||||
return node_name
|
||||
return node_name
|
||||
counter += 1
|
||||
|
||||
|
||||
|
||||
class openstack_node_scenarios(abstract_node_scenarios):
|
||||
def __init__(self):
|
||||
self.openstackcloud = OPENSTACKCLOUD()
|
||||
@@ -111,7 +110,6 @@ class openstack_node_scenarios(abstract_node_scenarios):
|
||||
logging.error("node_stop_scenario injection failed!")
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
# Node scenario to reboot the node
|
||||
def node_reboot_scenario(self, instance_kill_count, node, timeout):
|
||||
for _ in range(instance_kill_count):
|
||||
@@ -131,3 +129,47 @@ class openstack_node_scenarios(abstract_node_scenarios):
|
||||
logging.error("node_reboot_scenario injection failed!")
|
||||
sys.exit(1)
|
||||
|
||||
# Node scenario to start the node
|
||||
def helper_node_start_scenario(self, instance_kill_count, node_ip, timeout):
    """Start the helper node instance that owns ``node_ip``.

    Repeats ``instance_kill_count`` times: resolve the OpenStack instance
    name for the (stripped) IP, start it and wait until it is running.
    Any failure is logged and terminates the process with exit status 1.

    Args:
        instance_kill_count: Number of times to run the start sequence.
        node_ip: IP address of the helper node.
        timeout: Accepted for signature parity with the other scenarios;
            not used by this method.
    """
    for _ in range(instance_kill_count):
        try:
            logging.info("Starting helper_node_start_scenario injection")
            openstack_node_name = self.openstackcloud.get_openstack_nodename(node_ip.strip())
            # Fail here rather than handing an empty name to the CLI.
            if not openstack_node_name:
                raise Exception("No openstack instance found for IP %s" % (node_ip))
            logging.info("Starting the helper node %s" % (openstack_node_name))
            self.openstackcloud.start_instances(openstack_node_name)
            self.openstackcloud.wait_until_running(openstack_node_name)
            logging.info("Helper node with IP: %s is in running state" % (node_ip))
            # Name the scenario that actually ran, matching the failure log.
            logging.info("helper_node_start_scenario has been successfully injected!")
        except Exception as e:
            logging.error("Failed to start node instance. Encountered following "
                          "exception: %s. Test Failed" % (e))
            logging.error("helper_node_start_scenario injection failed!")
            sys.exit(1)
|
||||
|
||||
# Node scenario to stop the node
|
||||
def helper_node_stop_scenario(self, instance_kill_count, node_ip, timeout):
    """Stop the helper node instance that owns ``node_ip``.

    Runs ``instance_kill_count`` rounds of: resolve the OpenStack instance
    name for the stripped IP, stop it, and wait until it is stopped. On any
    exception the failure is logged and the process exits with status 1.
    ``timeout`` is not used by this method.
    """
    for _round in range(instance_kill_count):
        try:
            logging.info("Starting helper_node_stop_scenario injection")
            cloud = self.openstackcloud
            instance_name = cloud.get_openstack_nodename(node_ip.strip())
            logging.info("Stopping the helper node %s " % (instance_name))
            cloud.stop_instances(instance_name)
            cloud.wait_until_stopped(instance_name)
            logging.info("Helper node with IP: %s is in stopped state" % (node_ip))
        except Exception as err:
            failure = ("Failed to stop node instance. Encountered following exception: %s. "
                       "Test Failed" % (err))
            logging.error(failure)
            logging.error("helper_node_stop_scenario injection failed!")
            sys.exit(1)
|
||||
|
||||
def helper_node_service_status(self, node_ip, service, ssh_private_key, timeout):
    """Check the status of the given services on the helper node.

    Delegates to ``nodeaction.check_service_status`` with the stripped IP.
    Any exception is logged and terminates the process with exit status 1.

    Args:
        node_ip: IP address of the helper node.
        service: Service names forwarded to the status check.
        ssh_private_key: Path to the ssh private key forwarded to the check.
        timeout: Timeout forwarded to the status check.
    """
    try:
        logging.info("Checking service status on the helper node")
        nodeaction.check_service_status(node_ip.strip(), service, ssh_private_key, timeout)
        logging.info("Service status checked on %s" % (node_ip))
        # Wording fixed: was "Check service status is successfuly injected!"
        logging.info("Check service status has been successfully injected!")
    except Exception as e:
        logging.error("Failed to check service status. Encountered following exception:"
                      " %s. Test Failed" % (e))
        logging.error("helper_node_service_status injection failed!")
        sys.exit(1)
|
||||
|
||||
@@ -9,3 +9,4 @@ kubernetes==12.0.0a1
|
||||
oauth2client>=4.1.3
|
||||
python-openstackclient
|
||||
gitpython
|
||||
paramiko
|
||||
|
||||
@@ -49,6 +49,8 @@ def inject_node_scenario(action, node_scenario, node_scenario_object):
|
||||
node_name = node_scenario.get("node_name", "")
|
||||
label_selector = node_scenario.get("label_selector", "")
|
||||
timeout = node_scenario.get("timeout", 120)
|
||||
service = node_scenario.get("service", "")
|
||||
ssh_private_key = node_scenario.get("ssh_private_key", "~/.ssh/id_rsa")
|
||||
# Get the node to apply the scenario
|
||||
node = nodeaction.get_node(node_name, label_selector)
|
||||
|
||||
@@ -71,6 +73,18 @@ def inject_node_scenario(action, node_scenario, node_scenario_object):
|
||||
node_scenario_object.stop_kubelet_scenario(instance_kill_count, node, timeout)
|
||||
elif action == "node_crash_scenario":
|
||||
node_scenario_object.node_crash_scenario(instance_kill_count, node, timeout)
|
||||
elif action == "stop_start_helper_node_scenario":
|
||||
if node_scenario['cloud_type'] != "openstack":
|
||||
logging.error("Scenario: " + action + " is not supported for "
|
||||
"cloud type " + node_scenario['cloud_type'] + ", skipping action")
|
||||
else:
|
||||
if not node_scenario['helper_node_ip']:
|
||||
logging.error("Helper node IP address is not provided")
|
||||
sys.exit(1)
|
||||
node_scenario_object.helper_node_stop_start_scenario(
|
||||
instance_kill_count, node_scenario['helper_node_ip'], timeout)
|
||||
node_scenario_object.helper_node_service_status(
|
||||
node_scenario['helper_node_ip'], service, ssh_private_key, timeout)
|
||||
|
||||
|
||||
# Get cerberus status
|
||||
|
||||
Reference in New Issue
Block a user