Files
krkn/kraken/node_actions/run.py
Naga Ravi Chaitanya Elluri 716057eab6 Monitor user application availability during chaos
The current Kraken integration with Cerberus monitors the cluster as well as the
application health post chaos and passes or fails the run depending on whether they
are healthy after chaos. This commit adds the ability to monitor the user application
health during the chaos and fails the run in case of downtime, as it is potentially
downtime in a customer's environment as well. It is especially useful in case of
control plane failure scenarios including API server, Etcd, Ingress etc.
2021-07-27 13:15:57 -04:00

113 lines
5.5 KiB
Python

import yaml
import logging
import sys
import time
from kraken.node_actions.aws_node_scenarios import aws_node_scenarios
from kraken.node_actions.general_cloud_node_scenarios import general_node_scenarios
from kraken.node_actions.az_node_scenarios import azure_node_scenarios
from kraken.node_actions.gcp_node_scenarios import gcp_node_scenarios
from kraken.node_actions.openstack_node_scenarios import openstack_node_scenarios
from kraken.node_actions.bm_node_scenarios import bm_node_scenarios
import kraken.node_actions.common_node_functions as common_node_functions
import kraken.cerberus.setup as cerberus
# Module-level flag written by get_node_scenario_object when a scenario resolves
# to the generic cloud type, and read by inject_node_scenario to skip actions
# that are not in its generic-cloud allow-list.
node_general = False
# Get the node scenarios object of the specified cloud type
def get_node_scenario_object(node_scenario):
    """Return the node scenarios object matching the scenario's cloud type.

    Args:
        node_scenario: Mapping describing one node scenario. The optional
            "cloud_type" key selects the implementation; a missing key or the
            value "generic" selects the generic implementation. For "bm" the
            "bmc_info", "bmc_user" and "bmc_password" entries are forwarded
            (None when absent).

    Returns:
        A new instance of the scenarios class for the cloud type.

    Side effects:
        Sets the module-level ``node_general`` flag on every call so that it
        describes the scenario being resolved. For an unsupported cloud type
        an error is logged and the process exits with status 1.
    """
    global node_general
    cloud_type = node_scenario.get("cloud_type", "generic")
    # The flag has to be recomputed on every call: if it were only ever set
    # to True, a single generic scenario would make the non-generic actions of
    # every later cloud-specific scenario get skipped.
    node_general = cloud_type == "generic"
    if node_general:
        return general_node_scenarios()
    if cloud_type == "aws":
        return aws_node_scenarios()
    if cloud_type == "gcp":
        return gcp_node_scenarios()
    if cloud_type == "openstack":
        return openstack_node_scenarios()
    if cloud_type in ("azure", "az"):
        return azure_node_scenarios()
    if cloud_type == "bm":
        return bm_node_scenarios(
            node_scenario.get("bmc_info"),
            node_scenario.get("bmc_user", None),
            node_scenario.get("bmc_password", None),
        )
    # %-style arguments keep the message intact even when cloud_type is not a
    # string (e.g. a YAML null), where string concatenation would raise.
    logging.error(
        "Cloud type %s is not currently supported; "
        "try using 'generic' if wanting to stop/start kubelet or fork bomb on any "
        "cluster",
        cloud_type,
    )
    sys.exit(1)
# Run defined scenarios
def run(scenarios_list, config, wait_duration):
    """Run every action of every node scenario found in the given files.

    For each action the scenario is injected, the function sleeps for
    ``wait_duration`` and then calls ``cerberus.get_status`` with the start
    and end timestamps of that action.

    Args:
        scenarios_list: Iterable of paths to YAML scenario files. Each file
            must contain a top-level "node_scenarios" list whose entries have
            an "actions" key.
        config: Passed through unchanged to ``cerberus.get_status``.
        wait_duration: Number of seconds to sleep after each injected action.
    """
    for scenario_path in scenarios_list:
        # Parse the file and release the handle immediately instead of keeping
        # it open for the whole (potentially long) chaos run.
        # NOTE(review): yaml.full_load is not recommended for untrusted input;
        # confirm scenario files are trusted or switch to yaml.safe_load.
        with open(scenario_path, "r") as f:
            scenario_config = yaml.full_load(f)
        for node_scenario in scenario_config["node_scenarios"]:
            node_scenario_object = get_node_scenario_object(node_scenario)
            # An empty or null "actions" entry means there is nothing to inject.
            for action in node_scenario["actions"] or ():
                start_time = int(time.time())
                inject_node_scenario(action, node_scenario, node_scenario_object)
                logging.info("Waiting for the specified duration: %s", wait_duration)
                time.sleep(wait_duration)
                end_time = int(time.time())
                cerberus.get_status(config, start_time, end_time)
                logging.info("")
# Inject the specified node scenario
def inject_node_scenario(action, node_scenario, node_scenario_object):
    """Execute one named action of a node scenario.

    Args:
        action: Name of the action to run, e.g. "node_stop_scenario".
        node_scenario: Mapping with the scenario configuration. Optional keys
            and their defaults: "instance_kill_count" (1), "node_name" (""),
            "label_selector" (""), "timeout" (120), "service" ("") and
            "ssh_private_key" ("~/.ssh/id_rsa"). The helper node action also
            reads "cloud_type" and "helper_node_ip".
        node_scenario_object: Object whose method named after the action
            performs the scenario.

    Side effects:
        Logs and returns without acting when the action is unknown, when it is
        not allowed for the generic cloud type, or when the helper node action
        is requested for a cloud type other than "openstack". Exits the
        process with status 1 when the helper node action is requested without
        a helper node IP.
    """
    generic_cloud_scenarios = ("stop_kubelet_scenario", "node_crash_scenario")
    # Actions whose method on the scenario object has the same name as the
    # action and takes (instance_kill_count, node, timeout).
    standard_actions = (
        "node_start_scenario",
        "node_stop_scenario",
        "node_stop_start_scenario",
        "node_termination_scenario",
        "node_reboot_scenario",
        "stop_start_kubelet_scenario",
        "stop_kubelet_scenario",
        "node_crash_scenario",
    )
    # Get the node scenario configurations
    instance_kill_count = node_scenario.get("instance_kill_count", 1)
    node_name = node_scenario.get("node_name", "")
    label_selector = node_scenario.get("label_selector", "")
    timeout = node_scenario.get("timeout", 120)
    service = node_scenario.get("service", "")
    ssh_private_key = node_scenario.get("ssh_private_key", "~/.ssh/id_rsa")
    # Get the node to apply the scenario
    node = common_node_functions.get_node(node_name, label_selector)

    if node_general and action not in generic_cloud_scenarios:
        logging.info("Scenario: %s is not set up for generic cloud type, skipping action", action)
        return

    if action in standard_actions:
        getattr(node_scenario_object, action)(instance_kill_count, node, timeout)
    elif action == "stop_start_helper_node_scenario":
        cloud_type = node_scenario.get("cloud_type")
        if cloud_type != "openstack":
            logging.error(
                "Scenario: %s is not supported for cloud type %s, skipping action",
                action,
                cloud_type,
            )
            return
        # .get() so that a missing key reaches the explicit error below
        # instead of surfacing as a KeyError.
        helper_node_ip = node_scenario.get("helper_node_ip")
        if not helper_node_ip:
            logging.error("Helper node IP address is not provided")
            sys.exit(1)
        node_scenario_object.helper_node_stop_start_scenario(instance_kill_count, helper_node_ip, timeout)
        node_scenario_object.helper_node_service_status(helper_node_ip, service, ssh_private_key, timeout)
    else:
        logging.info("There is no node action that matches %s, skipping scenario", action)