mirror of
https://github.com/krkn-chaos/krkn.git
synced 2026-04-15 06:57:28 +00:00
The current Kraken integration with Cerberus monitors the health of the cluster and of the application after chaos injection, and passes or fails the run depending on whether they are healthy. This commit adds the ability to monitor the user application's health during the chaos and fails the run in case of downtime, since that would potentially mean downtime in a customer's environment as well. It is especially useful for control plane failure scenarios, including the API server, etcd, ingress, etc.
113 lines
5.5 KiB
Python
113 lines
5.5 KiB
Python
import yaml
|
|
import logging
|
|
import sys
|
|
import time
|
|
from kraken.node_actions.aws_node_scenarios import aws_node_scenarios
|
|
from kraken.node_actions.general_cloud_node_scenarios import general_node_scenarios
|
|
from kraken.node_actions.az_node_scenarios import azure_node_scenarios
|
|
from kraken.node_actions.gcp_node_scenarios import gcp_node_scenarios
|
|
from kraken.node_actions.openstack_node_scenarios import openstack_node_scenarios
|
|
from kraken.node_actions.bm_node_scenarios import bm_node_scenarios
|
|
import kraken.node_actions.common_node_functions as common_node_functions
|
|
import kraken.cerberus.setup as cerberus
|
|
|
|
|
|
# Module-level flag: set to True by get_node_scenario_object() when the
# generic cloud type is selected; inject_node_scenario() consults it to skip
# actions that are not in its generic-capable list.
node_general = False
|
|
|
|
|
|
# Get the node scenarios object of the specified cloud type
|
|
def get_node_scenario_object(node_scenario):
    """Return the node scenarios object matching the scenario's cloud type.

    Args:
        node_scenario: Mapping describing one node scenario. The optional
            "cloud_type" key selects the implementation; when the key is
            absent or its value is "generic", the generic implementation is
            used. For "bm", the "bmc_info", "bmc_user" and "bmc_password"
            entries are forwarded to the bare-metal implementation.

    Returns:
        A new instance of the node scenarios class for the cloud type.

    Raises:
        SystemExit: With exit code 1 when the cloud type is not supported.

    Side effects:
        Updates the module-level ``node_general`` flag so that it is True
        only while a generic scenario is the one most recently set up.
    """
    global node_general

    cloud_type = node_scenario.get("cloud_type", "generic")
    # Refresh the flag on every call. Only ever setting it to True would make
    # a single generic scenario cause all later cloud-specific scenarios in
    # the same run to be treated as generic and have their actions skipped.
    node_general = cloud_type == "generic"

    if node_general:
        return general_node_scenarios()
    if cloud_type == "aws":
        return aws_node_scenarios()
    if cloud_type == "gcp":
        return gcp_node_scenarios()
    if cloud_type == "openstack":
        return openstack_node_scenarios()
    if cloud_type in ("azure", "az"):
        return azure_node_scenarios()
    if cloud_type == "bm":
        return bm_node_scenarios(
            node_scenario.get("bmc_info"),
            node_scenario.get("bmc_user", None),
            node_scenario.get("bmc_password", None),
        )

    # Lazy %-formatting keeps this path working for non-string values (for
    # example an empty YAML value, which loads as None) instead of raising
    # TypeError while building the message.
    logging.error(
        "Cloud type %s is not currently supported; "
        "try using 'generic' if wanting to stop/start kubelet or fork bomb on any "
        "cluster",
        cloud_type,
    )
    sys.exit(1)
|
|
|
|
|
|
# Run defined scenarios
|
|
def run(scenarios_list, config, wait_duration):
    """Run every node scenario described by the given scenario files.

    For each action of each scenario, the action is injected, the function
    sleeps for ``wait_duration``, and the start/end timestamps of that window
    are handed to ``cerberus.get_status`` together with ``config``.

    Args:
        scenarios_list: Iterable of paths to YAML files, each holding a
            top-level "node_scenarios" list.
        config: Passed through unchanged to ``cerberus.get_status``.
        wait_duration: Seconds to sleep after each injected action.
    """
    for scenario_path in scenarios_list:
        # Parse the file and release the handle immediately; the scenarios
        # below can run (and sleep) for a long time.
        # NOTE(review): yaml.full_load is kept as-is; scenario files are
        # presumably trusted input - consider yaml.safe_load if they are not.
        with open(scenario_path, "r") as f:
            scenario_config = yaml.full_load(f)

        for node_scenario in scenario_config["node_scenarios"]:
            node_scenario_object = get_node_scenario_object(node_scenario)
            # A missing or empty "actions" entry means there is nothing to
            # inject for this scenario.
            for action in node_scenario.get("actions") or []:
                start_time = int(time.time())
                inject_node_scenario(action, node_scenario, node_scenario_object)
                logging.info("Waiting for the specified duration: %s", wait_duration)
                time.sleep(wait_duration)
                end_time = int(time.time())
                cerberus.get_status(config, start_time, end_time)
                logging.info("")
|
|
|
|
|
|
# Inject the specified node scenario
|
|
def inject_node_scenario(action, node_scenario, node_scenario_object):
    """Inject a single node action using the given node scenarios object.

    Args:
        action: Name of the action to run, e.g. "node_stop_scenario".
        node_scenario: Mapping with the scenario settings. Optional keys and
            their defaults: "instance_kill_count" (1), "node_name" (""),
            "label_selector" (""), "timeout" (120), "service" ("") and
            "ssh_private_key" ("~/.ssh/id_rsa"). The helper node action also
            reads "cloud_type" and "helper_node_ip".
        node_scenario_object: Object exposing one method per supported
            action; see get_node_scenario_object().

    Raises:
        SystemExit: With exit code 1 when the helper node action is requested
            on openstack without a "helper_node_ip".
    """
    generic_cloud_scenarios = ("stop_kubelet_scenario", "node_crash_scenario")
    # Actions implemented by a method of the same name that takes
    # (instance_kill_count, node, timeout).
    node_actions = (
        "node_start_scenario",
        "node_stop_scenario",
        "node_stop_start_scenario",
        "node_termination_scenario",
        "node_reboot_scenario",
        "stop_start_kubelet_scenario",
        "stop_kubelet_scenario",
        "node_crash_scenario",
    )

    # Get the node scenario configurations
    instance_kill_count = node_scenario.get("instance_kill_count", 1)
    node_name = node_scenario.get("node_name", "")
    label_selector = node_scenario.get("label_selector", "")
    timeout = node_scenario.get("timeout", 120)
    service = node_scenario.get("service", "")
    ssh_private_key = node_scenario.get("ssh_private_key", "~/.ssh/id_rsa")

    # Get the node to apply the scenario
    node = common_node_functions.get_node(node_name, label_selector)

    if node_general and action not in generic_cloud_scenarios:
        logging.info("Scenario: %s is not set up for generic cloud type, skipping action", action)
        return

    if action in node_actions:
        getattr(node_scenario_object, action)(instance_kill_count, node, timeout)
    elif action == "stop_start_helper_node_scenario":
        # .get() so that a missing key is reported through the log messages
        # below rather than surfacing as a KeyError.
        cloud_type = node_scenario.get("cloud_type")
        if cloud_type != "openstack":
            logging.error(
                "Scenario: %s is not supported for cloud type %s, skipping action",
                action,
                cloud_type,
            )
            return

        helper_node_ip = node_scenario.get("helper_node_ip")
        if not helper_node_ip:
            logging.error("Helper node IP address is not provided")
            sys.exit(1)

        node_scenario_object.helper_node_stop_start_scenario(instance_kill_count, helper_node_ip, timeout)
        node_scenario_object.helper_node_service_status(helper_node_ip, service, ssh_private_key, timeout)
    else:
        logging.info("There is no node action that matches %s, skipping scenario", action)
|