Krkn telemetry integration (#435)

* adapted config.yaml to the new feature

* temporarily pointing requirements.txt to the lib feature branch

* run_kraken.py + arcaflow scenarios refactoring


typo

* plugin scenario

* node scenarios


return failed scenarios

* container scenarios


fix

* time scenarios

* cluster shutdown scenarios

* namespace scenarios

* zone outage scenarios

* app outage scenarios

* pvc scenarios

* network chaos scenarios

* run_kraken.py adaptation to telemetry

* prometheus telemetry upload + config.yaml


some fixes


typos and logs


max retries in config


telemetry id with run_uuid


safe_logger

* catch send_telemetry exception

* scenario collection bug fixes

* telemetry enabled check

* telemetry run tag

* requirements pointing to main + archive_size

* requirements.txt and config.yaml update

* added telemetry config to common config

* fixed scenario array elements for telemetry
Tullio Sebastiani
2023-08-10 20:42:53 +02:00
committed by GitHub
parent 491dc17267
commit 39c0152b7b
19 changed files with 964 additions and 583 deletions
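
The diffs below apply the same wrapper to every scenario module: run() gains a KrknTelemetry argument, builds a ScenarioTelemetry per scenario file, replaces sys.exit(1) calls with raised exceptions, and returns the failed scenarios together with the collected telemetry records. A condensed sketch of that recurring pattern, assuming a per-module inject_scenario() placeholder for the actual chaos logic (class names, attributes, and method calls follow the diffs; per-scenario details and cerberus reporting are omitted):

import time
from krkn_lib_kubernetes import ScenarioTelemetry, KrknTelemetry

def inject_scenario(scenario_file: str) -> None:
    # placeholder for the module-specific chaos logic; failures raise instead of sys.exit(1)
    ...

def run(scenarios_list: list[str], telemetry: KrknTelemetry) -> (list[str], list[ScenarioTelemetry]):
    failed_scenarios: list[str] = []
    scenario_telemetries: list[ScenarioTelemetry] = []
    for scenario_file in scenarios_list:
        scenario_telemetry = ScenarioTelemetry()
        scenario_telemetry.scenario = scenario_file
        scenario_telemetry.startTimeStamp = time.time()
        telemetry.set_parameters_base64(scenario_telemetry, scenario_file)
        try:
            inject_scenario(scenario_file)
        except Exception:
            scenario_telemetry.exitStatus = 1
            failed_scenarios.append(scenario_file)
            telemetry.log_exception(scenario_file)
        else:
            scenario_telemetry.exitStatus = 0
        scenario_telemetry.endTimeStamp = time.time()
        scenario_telemetries.append(scenario_telemetry)
    return failed_scenarios, scenario_telemetries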

View File

@@ -29,3 +29,15 @@ tunings:
wait_duration: 6 # Duration to wait between each chaos scenario.
iterations: 1 # Number of times to execute the scenarios.
daemon_mode: False # Iterations are set to infinity which means that the kraken will cause chaos forever.
telemetry:
enabled: False # enables/disables the telemetry collection feature
api_url: https://ulnmf9xv7j.execute-api.us-west-2.amazonaws.com/production # telemetry service endpoint
username: username # telemetry service username
password: password # telemetry service password
prometheus_backup: True # enables/disables prometheus data collection
full_prometheus_backup: False # if set to False, only the /prometheus/wal folder will be downloaded.
backup_threads: 5 # number of telemetry download/upload threads
archive_path: /tmp # local path where the archive files will be temporarily stored
max_retries: 0 # maximum number of upload retries (if 0, it will retry forever)
run_tag: '' # if set, this will be appended to the run folder in the bucket (useful to group the runs)
archive_size: 10000 # the size of each prometheus data archive file in KB. The lower the archive size,
# the higher the number of archive files produced and uploaded (and processed by backup_threads
# simultaneously).
# For unstable/slow connections it is better to keep this value low and increase the number of
# backup_threads; this way, on upload failure, the retry affects only the failed chunk rather than
# the whole upload.

View File

@@ -7,41 +7,8 @@ kraken:
signal_address: 0.0.0.0 # Signal listening address
port: 8081 # Signal port
chaos_scenarios: # List of policies/chaos scenarios to load
- arcaflow_scenarios:
- scenarios/arcaflow/cpu-hog/input.yaml
- scenarios/arcaflow/memory-hog/input.yaml
- container_scenarios: # List of chaos pod scenarios to load
- - scenarios/openshift/container_etcd.yml
- plugin_scenarios:
- scenarios/openshift/etcd.yml
- scenarios/openshift/regex_openshift_pod_kill.yml
- scenarios/openshift/vmware_node_scenarios.yml
- scenarios/openshift/ibmcloud_node_scenarios.yml
- scenarios/openshift/network_chaos_ingress.yml
- scenarios/openshift/pod_network_outage.yml
- scenarios/openshift/pod_network_shaping.yml
- node_scenarios: # List of chaos node scenarios to load
- scenarios/openshift/node_scenarios_example.yml
- plugin_scenarios:
- scenarios/openshift/openshift-apiserver.yml
- scenarios/openshift/openshift-kube-apiserver.yml
- time_scenarios: # List of chaos time scenarios to load
- scenarios/openshift/time_scenarios_example.yml
- cluster_shut_down_scenarios:
- - scenarios/openshift/cluster_shut_down_scenario.yml
- scenarios/openshift/post_action_shut_down.py
- namespace_scenarios:
- - scenarios/openshift/regex_namespace.yaml
- - scenarios/openshift/ingress_namespace.yaml
- scenarios/openshift/post_action_namespace.py
- zone_outages:
- scenarios/openshift/zone_outage.yaml
- application_outages:
- scenarios/openshift/app_outage.yaml
- pvc_scenarios:
- scenarios/openshift/pvc_scenario.yaml
- network_chaos:
- scenarios/openshift/network_chaos.yaml
- application_outages:
- scenarios/openshift/app_outage.yaml
cerberus:
cerberus_enabled: False # Enable it when cerberus is previously installed
@@ -65,3 +32,20 @@ tunings:
wait_duration: 60 # Duration to wait between each chaos scenario
iterations: 1 # Number of times to execute the scenarios
daemon_mode: False # Iterations are set to infinity which means that the kraken will cause chaos forever
telemetry:
enabled: False # enables/disables the telemetry collection feature
api_url: https://ulnmf9xv7j.execute-api.us-west-2.amazonaws.com/production # telemetry service endpoint
username: username # telemetry service username
password: password # telemetry service password
prometheus_backup: True # enables/disables prometheus data collection
full_prometheus_backup: False # if set to False, only the /prometheus/wal folder will be downloaded.
backup_threads: 5 # number of telemetry download/upload threads
archive_path: /tmp # local path where the archive files will be temporarily stored
max_retries: 0 # maximum number of upload retries (if 0, it will retry forever)
run_tag: '' # if set, this will be appended to the run folder in the bucket (useful to group the runs)
archive_size: 10000 # the size of each prometheus data archive file in KB. The lower the archive size,
# the higher the number of archive files produced and uploaded (and processed by backup_threads
# simultaneously).
# For unstable/slow connections it is better to keep this value low and increase the number of
# backup_threads; this way, on upload failure, the retry affects only the failed chunk rather than
# the whole upload.
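
To make the archive_size / backup_threads tradeoff concrete, a small illustrative sketch; the estimate_chunks helper is hypothetical and not part of this commit, only the config semantics described in the comments above are:

import math

def estimate_chunks(backup_size_kb: int, archive_size_kb: int = 10000) -> int:
    # hypothetical helper: number of archive files a prometheus backup is split into
    return math.ceil(backup_size_kb / archive_size_kb)

# Example: a ~200 MB backup with the default 10000 KB archive_size gives
# ceil(204800 / 10000) = 21 chunks, uploaded by up to backup_threads (5) workers;
# if one upload fails, only that chunk is retried (up to max_retries, forever when 0).
print(estimate_chunks(200 * 1024))  # 21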

View File

@@ -4,25 +4,32 @@ import time
import kraken.cerberus.setup as cerberus
from jinja2 import Template
import kraken.invoke.command as runcommand
from krkn_lib_kubernetes import ScenarioTelemetry, KrknTelemetry
# Reads the scenario config, applies and deletes a network policy to
# block the traffic for the specified duration
def run(scenarios_list, config, wait_duration):
def run(scenarios_list, config, wait_duration, telemetry: KrknTelemetry) -> (list[str], list[ScenarioTelemetry]):
failed_post_scenarios = ""
scenario_telemetries: list[ScenarioTelemetry] = []
failed_scenarios = []
for app_outage_config in scenarios_list:
scenario_telemetry = ScenarioTelemetry()
scenario_telemetry.scenario = app_outage_config
scenario_telemetry.startTimeStamp = time.time()
telemetry.set_parameters_base64(scenario_telemetry, app_outage_config)
if len(app_outage_config) > 1:
with open(app_outage_config, "r") as f:
app_outage_config_yaml = yaml.full_load(f)
scenario_config = app_outage_config_yaml["application_outage"]
pod_selector = scenario_config.get("pod_selector", "{}")
traffic_type = scenario_config.get("block", "[Ingress, Egress]")
namespace = scenario_config.get("namespace", "")
duration = scenario_config.get("duration", 60)
try:
with open(app_outage_config, "r") as f:
app_outage_config_yaml = yaml.full_load(f)
scenario_config = app_outage_config_yaml["application_outage"]
pod_selector = scenario_config.get("pod_selector", "{}")
traffic_type = scenario_config.get("block", "[Ingress, Egress]")
namespace = scenario_config.get("namespace", "")
duration = scenario_config.get("duration", 60)
start_time = int(time.time())
start_time = int(time.time())
network_policy_template = """---
network_policy_template = """---
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
@@ -31,28 +38,38 @@ spec:
podSelector:
matchLabels: {{ pod_selector }}
policyTypes: {{ traffic_type }}
"""
t = Template(network_policy_template)
rendered_spec = t.render(pod_selector=pod_selector, traffic_type=traffic_type)
# Write the rendered template to a file
with open("kraken_network_policy.yaml", "w") as f:
f.write(rendered_spec)
# Block the traffic by creating network policy
logging.info("Creating the network policy")
runcommand.invoke(
"kubectl create -f %s -n %s --validate=false" % ("kraken_network_policy.yaml", namespace)
)
"""
t = Template(network_policy_template)
rendered_spec = t.render(pod_selector=pod_selector, traffic_type=traffic_type)
# Write the rendered template to a file
with open("kraken_network_policy.yaml", "w") as f:
f.write(rendered_spec)
# Block the traffic by creating network policy
logging.info("Creating the network policy")
runcommand.invoke(
"kubectl create -f %s -n %s --validate=false" % ("kraken_network_policy.yaml", namespace)
)
# wait for the specified duration
logging.info("Waiting for the specified duration in the config: %s" % (duration))
time.sleep(duration)
# wait for the specified duration
logging.info("Waiting for the specified duration in the config: %s" % (duration))
time.sleep(duration)
# unblock the traffic by deleting the network policy
logging.info("Deleting the network policy")
runcommand.invoke("kubectl delete -f %s -n %s" % ("kraken_network_policy.yaml", namespace))
# unblock the traffic by deleting the network policy
logging.info("Deleting the network policy")
runcommand.invoke("kubectl delete -f %s -n %s" % ("kraken_network_policy.yaml", namespace))
logging.info("End of scenario. Waiting for the specified duration: %s" % (wait_duration))
time.sleep(wait_duration)
logging.info("End of scenario. Waiting for the specified duration: %s" % (wait_duration))
time.sleep(wait_duration)
end_time = int(time.time())
cerberus.publish_kraken_status(config, failed_post_scenarios, start_time, end_time)
except Exception as e:
scenario_telemetry.exitStatus = 1
failed_scenarios.append(app_outage_config)
telemetry.log_exception(app_outage_config)
else:
scenario_telemetry.exitStatus = 0
scenario_telemetry.endTimeStamp = time.time()
scenario_telemetries.append(scenario_telemetry)
return failed_scenarios, scenario_telemetries
end_time = int(time.time())
cerberus.publish_kraken_status(config, failed_post_scenarios, start_time, end_time)

View File

@@ -1,3 +1,5 @@
import time
import arcaflow
import os
import yaml
@@ -6,22 +8,31 @@ import sys
from pathlib import Path
from typing import List
from .context_auth import ContextAuth
from krkn_lib_kubernetes import ScenarioTelemetry, KrknTelemetry
def run(scenarios_list: List[str], kubeconfig_path: str):
def run(scenarios_list: List[str], kubeconfig_path: str, telemetry: KrknTelemetry) -> (list[str], list[ScenarioTelemetry]):
scenario_telemetries: list[ScenarioTelemetry] = []
failed_post_scenarios = []
for scenario in scenarios_list:
scenario_telemetry = ScenarioTelemetry()
scenario_telemetry.scenario = scenario
scenario_telemetry.startTimeStamp = time.time()
telemetry.set_parameters_base64(scenario_telemetry,scenario)
engine_args = build_args(scenario)
run_workflow(engine_args, kubeconfig_path)
status_code = run_workflow(engine_args, kubeconfig_path)
scenario_telemetry.endTimeStamp = time.time()
scenario_telemetry.exitStatus = status_code
scenario_telemetries.append(scenario_telemetry)
if status_code != 0:
failed_post_scenarios.append(scenario)
return failed_post_scenarios, scenario_telemetries
def run_workflow(engine_args: arcaflow.EngineArgs, kubeconfig_path: str):
def run_workflow(engine_args: arcaflow.EngineArgs, kubeconfig_path: str) -> int:
set_arca_kubeconfig(engine_args, kubeconfig_path)
exit_status = arcaflow.run(engine_args)
if exit_status != 0:
logging.error(
f"failed to run arcaflow scenario {engine_args.input}"
)
sys.exit(exit_status)
return exit_status
def build_args(input_file: str) -> arcaflow.EngineArgs:

View File

@@ -6,7 +6,7 @@ import kraken.cerberus.setup as cerberus
import kraken.post_actions.actions as post_actions
import yaml
import sys
from krkn_lib_kubernetes import ScenarioTelemetry, KrknTelemetry
# krkn_lib_kubernetes
def run(
@@ -15,72 +15,96 @@ def run(
wait_duration,
failed_post_scenarios,
kubeconfig_path,
kubecli: krkn_lib_kubernetes.KrknLibKubernetes
):
kubecli: krkn_lib_kubernetes.KrknLibKubernetes,
telemetry: KrknTelemetry
) -> (list[str], list[ScenarioTelemetry]):
scenario_telemetries: list[ScenarioTelemetry] = []
failed_scenarios = []
for scenario_config in scenarios_list:
if len(scenario_config) > 1:
pre_action_output = post_actions.run(kubeconfig_path, scenario_config[1])
else:
pre_action_output = ""
with open(scenario_config[0], "r") as f:
scenario_config_yaml = yaml.full_load(f)
for scenario in scenario_config_yaml["scenarios"]:
scenario_namespace = scenario.get("namespace", "")
scenario_label = scenario.get("label_selector", "")
if scenario_namespace is not None and scenario_namespace.strip() != "":
if scenario_label is not None and scenario_label.strip() != "":
logging.error("You can only have namespace or label set in your namespace scenario")
logging.error(
"Current scenario config has namespace '%s' and label selector '%s'"
% (scenario_namespace, scenario_label)
)
logging.error(
"Please set either namespace to blank ('') or label_selector to blank ('') to continue"
)
sys.exit(1)
delete_count = scenario.get("delete_count", 1)
run_count = scenario.get("runs", 1)
run_sleep = scenario.get("sleep", 10)
wait_time = scenario.get("wait_time", 30)
killed_namespaces = []
start_time = int(time.time())
for i in range(run_count):
namespaces = kubecli.check_namespaces([scenario_namespace], scenario_label)
for j in range(delete_count):
if len(namespaces) == 0:
scenario_telemetry = ScenarioTelemetry()
scenario_telemetry.scenario = scenario_config[0]
scenario_telemetry.startTimeStamp = time.time()
telemetry.set_parameters_base64(scenario_telemetry, scenario_config[0])
try:
if len(scenario_config) > 1:
pre_action_output = post_actions.run(kubeconfig_path, scenario_config[1])
else:
pre_action_output = ""
with open(scenario_config[0], "r") as f:
scenario_config_yaml = yaml.full_load(f)
for scenario in scenario_config_yaml["scenarios"]:
scenario_namespace = scenario.get("namespace", "")
scenario_label = scenario.get("label_selector", "")
if scenario_namespace is not None and scenario_namespace.strip() != "":
if scenario_label is not None and scenario_label.strip() != "":
logging.error("You can only have namespace or label set in your namespace scenario")
logging.error(
"Couldn't delete %s namespaces, not enough namespaces matching %s with label %s"
% (str(run_count), scenario_namespace, str(scenario_label))
"Current scenario config has namespace '%s' and label selector '%s'"
% (scenario_namespace, scenario_label)
)
sys.exit(1)
selected_namespace = namespaces[random.randint(0, len(namespaces) - 1)]
killed_namespaces.append(selected_namespace)
try:
kubecli.delete_namespace(selected_namespace)
logging.info("Delete on namespace %s was successful" % str(selected_namespace))
except Exception as e:
logging.info("Delete on namespace %s was unsuccessful" % str(selected_namespace))
logging.info("Namespace action error: " + str(e))
sys.exit(1)
namespaces.remove(selected_namespace)
logging.info("Waiting %s seconds between namespace deletions" % str(run_sleep))
time.sleep(run_sleep)
logging.info("Waiting for the specified duration: %s" % wait_duration)
time.sleep(wait_duration)
if len(scenario_config) > 1:
try:
failed_post_scenarios = post_actions.check_recovery(
kubeconfig_path, scenario_config, failed_post_scenarios, pre_action_output
logging.error(
"Please set either namespace to blank ('') or label_selector to blank ('') to continue"
)
# removed_exit
# sys.exit(1)
raise RuntimeError()
delete_count = scenario.get("delete_count", 1)
run_count = scenario.get("runs", 1)
run_sleep = scenario.get("sleep", 10)
wait_time = scenario.get("wait_time", 30)
killed_namespaces = []
start_time = int(time.time())
for i in range(run_count):
namespaces = kubecli.check_namespaces([scenario_namespace], scenario_label)
for j in range(delete_count):
if len(namespaces) == 0:
logging.error(
"Couldn't delete %s namespaces, not enough namespaces matching %s with label %s"
% (str(run_count), scenario_namespace, str(scenario_label))
)
# removed_exit
# sys.exit(1)
raise RuntimeError()
selected_namespace = namespaces[random.randint(0, len(namespaces) - 1)]
killed_namespaces.append(selected_namespace)
try:
kubecli.delete_namespace(selected_namespace)
logging.info("Delete on namespace %s was successful" % str(selected_namespace))
except Exception as e:
logging.error("Failed to run post action checks: %s" % e)
sys.exit(1)
else:
failed_post_scenarios = check_active_namespace(killed_namespaces, wait_time, kubecli)
end_time = int(time.time())
cerberus.publish_kraken_status(config, failed_post_scenarios, start_time, end_time)
logging.info("Delete on namespace %s was unsuccessful" % str(selected_namespace))
logging.info("Namespace action error: " + str(e))
# removed_exit
# sys.exit(1)
raise RuntimeError()
namespaces.remove(selected_namespace)
logging.info("Waiting %s seconds between namespace deletions" % str(run_sleep))
time.sleep(run_sleep)
logging.info("Waiting for the specified duration: %s" % wait_duration)
time.sleep(wait_duration)
if len(scenario_config) > 1:
try:
failed_post_scenarios = post_actions.check_recovery(
kubeconfig_path, scenario_config, failed_post_scenarios, pre_action_output
)
except Exception as e:
logging.error("Failed to run post action checks: %s" % e)
# removed_exit
# sys.exit(1)
raise RuntimeError()
else:
failed_post_scenarios = check_active_namespace(killed_namespaces, wait_time, kubecli)
end_time = int(time.time())
cerberus.publish_kraken_status(config, failed_post_scenarios, start_time, end_time)
except (Exception, RuntimeError):
scenario_telemetry.exitStatus = 1
failed_scenarios.append(scenario_config[0])
telemetry.log_exception(scenario_config[0])
else:
scenario_telemetry.exitStatus = 0
scenario_telemetry.endTimeStamp = time.time()
scenario_telemetries.append(scenario_telemetry)
return failed_scenarios, scenario_telemetries
# krkn_lib_kubernetes
def check_active_namespace(killed_namespaces, wait_time, kubecli: krkn_lib_kubernetes.KrknLibKubernetes):

View File

@@ -8,88 +8,104 @@ import krkn_lib_kubernetes
from jinja2 import Environment, FileSystemLoader
import kraken.cerberus.setup as cerberus
import kraken.node_actions.common_node_functions as common_node_functions
from krkn_lib_kubernetes import ScenarioTelemetry, KrknTelemetry
# krkn_lib_kubernetes
# Reads the scenario config and introduces traffic variations in Node's host network interface.
def run(scenarios_list, config, wait_duration, kubecli: krkn_lib_kubernetes.KrknLibKubernetes):
def run(scenarios_list, config, wait_duration, kubecli: krkn_lib_kubernetes.KrknLibKubernetes, telemetry: KrknTelemetry) -> (list[str], list[ScenarioTelemetry]):
failed_post_scenarios = ""
logging.info("Runing the Network Chaos tests")
failed_post_scenarios = ""
scenario_telemetries: list[ScenarioTelemetry] = []
failed_scenarios = []
for net_config in scenarios_list:
with open(net_config, "r") as file:
param_lst = ["latency", "loss", "bandwidth"]
test_config = yaml.safe_load(file)
test_dict = test_config["network_chaos"]
test_duration = int(test_dict.get("duration", 300))
test_interface = test_dict.get("interfaces", [])
test_node = test_dict.get("node_name", "")
test_node_label = test_dict.get("label_selector", "node-role.kubernetes.io/master")
test_execution = test_dict.get("execution", "serial")
test_instance_count = test_dict.get("instance_count", 1)
test_egress = test_dict.get("egress", {"bandwidth": "100mbit"})
if test_node:
node_name_list = test_node.split(",")
else:
node_name_list = [test_node]
nodelst = []
for single_node_name in node_name_list:
nodelst.extend(common_node_functions.get_node(single_node_name, test_node_label, test_instance_count, kubecli))
file_loader = FileSystemLoader(os.path.abspath(os.path.dirname(__file__)))
env = Environment(loader=file_loader, autoescape=True)
pod_template = env.get_template("pod.j2")
test_interface = verify_interface(test_interface, nodelst, pod_template, kubecli)
joblst = []
egress_lst = [i for i in param_lst if i in test_egress]
chaos_config = {
"network_chaos": {
"duration": test_duration,
"interfaces": test_interface,
"node_name": ",".join(nodelst),
"execution": test_execution,
"instance_count": test_instance_count,
"egress": test_egress,
scenario_telemetry = ScenarioTelemetry()
scenario_telemetry.scenario = net_config
scenario_telemetry.startTimeStamp = time.time()
telemetry.set_parameters_base64(scenario_telemetry, net_config)
try:
with open(net_config, "r") as file:
param_lst = ["latency", "loss", "bandwidth"]
test_config = yaml.safe_load(file)
test_dict = test_config["network_chaos"]
test_duration = int(test_dict.get("duration", 300))
test_interface = test_dict.get("interfaces", [])
test_node = test_dict.get("node_name", "")
test_node_label = test_dict.get("label_selector", "node-role.kubernetes.io/master")
test_execution = test_dict.get("execution", "serial")
test_instance_count = test_dict.get("instance_count", 1)
test_egress = test_dict.get("egress", {"bandwidth": "100mbit"})
if test_node:
node_name_list = test_node.split(",")
else:
node_name_list = [test_node]
nodelst = []
for single_node_name in node_name_list:
nodelst.extend(common_node_functions.get_node(single_node_name, test_node_label, test_instance_count, kubecli))
file_loader = FileSystemLoader(os.path.abspath(os.path.dirname(__file__)))
env = Environment(loader=file_loader, autoescape=True)
pod_template = env.get_template("pod.j2")
test_interface = verify_interface(test_interface, nodelst, pod_template, kubecli)
joblst = []
egress_lst = [i for i in param_lst if i in test_egress]
chaos_config = {
"network_chaos": {
"duration": test_duration,
"interfaces": test_interface,
"node_name": ",".join(nodelst),
"execution": test_execution,
"instance_count": test_instance_count,
"egress": test_egress,
}
}
}
logging.info("Executing network chaos with config \n %s" % yaml.dump(chaos_config))
job_template = env.get_template("job.j2")
try:
for i in egress_lst:
for node in nodelst:
exec_cmd = get_egress_cmd(
test_execution, test_interface, i, test_dict["egress"], duration=test_duration
)
logging.info("Executing %s on node %s" % (exec_cmd, node))
job_body = yaml.safe_load(
job_template.render(jobname=i + str(hash(node))[:5], nodename=node, cmd=exec_cmd)
)
joblst.append(job_body["metadata"]["name"])
api_response = kubecli.create_job(job_body)
if api_response is None:
raise Exception("Error creating job")
if test_execution == "serial":
logging.info("Waiting for serial job to finish")
logging.info("Executing network chaos with config \n %s" % yaml.dump(chaos_config))
job_template = env.get_template("job.j2")
try:
for i in egress_lst:
for node in nodelst:
exec_cmd = get_egress_cmd(
test_execution, test_interface, i, test_dict["egress"], duration=test_duration
)
logging.info("Executing %s on node %s" % (exec_cmd, node))
job_body = yaml.safe_load(
job_template.render(jobname=i + str(hash(node))[:5], nodename=node, cmd=exec_cmd)
)
joblst.append(job_body["metadata"]["name"])
api_response = kubecli.create_job(job_body)
if api_response is None:
raise Exception("Error creating job")
if test_execution == "serial":
logging.info("Waiting for serial job to finish")
start_time = int(time.time())
wait_for_job(joblst[:], kubecli, test_duration + 300)
logging.info("Waiting for wait_duration %s" % wait_duration)
time.sleep(wait_duration)
end_time = int(time.time())
cerberus.publish_kraken_status(config, failed_post_scenarios, start_time, end_time)
if test_execution == "parallel":
break
if test_execution == "parallel":
logging.info("Waiting for parallel job to finish")
start_time = int(time.time())
wait_for_job(joblst[:], kubecli, test_duration + 300)
logging.info("Waiting for wait_duration %s" % wait_duration)
time.sleep(wait_duration)
end_time = int(time.time())
cerberus.publish_kraken_status(config, failed_post_scenarios, start_time, end_time)
if test_execution == "parallel":
break
if test_execution == "parallel":
logging.info("Waiting for parallel job to finish")
start_time = int(time.time())
wait_for_job(joblst[:], kubecli, test_duration + 300)
logging.info("Waiting for wait_duration %s" % wait_duration)
time.sleep(wait_duration)
end_time = int(time.time())
cerberus.publish_kraken_status(config, failed_post_scenarios, start_time, end_time)
except Exception as e:
logging.error("Network Chaos exiting due to Exception %s" % e)
sys.exit(1)
finally:
logging.info("Deleting jobs")
delete_job(joblst[:], kubecli)
except Exception as e:
logging.error("Network Chaos exiting due to Exception %s" % e)
raise RuntimeError()
finally:
logging.info("Deleting jobs")
delete_job(joblst[:], kubecli)
except (RuntimeError, Exception):
scenario_telemetry.exitStatus = 1
failed_scenarios.append(net_config)
telemetry.log_exception(net_config)
else:
scenario_telemetry.exitStatus = 0
scenario_telemetries.append(scenario_telemetry)
return failed_scenarios, scenario_telemetries
# krkn_lib_kubernetes
@@ -110,7 +126,8 @@ def verify_interface(test_interface, nodelst, template, kubecli: krkn_lib_kubern
for interface in test_interface:
if interface not in interface_lst:
logging.error("Interface %s not found in node %s interface list %s" % (interface, nodelst[pod_index], interface_lst))
sys.exit(1)
#sys.exit(1)
raise RuntimeError()
return test_interface
finally:
logging.info("Deleteing pod to query interface on node")
@@ -158,7 +175,7 @@ def delete_job(joblst, kubecli: krkn_lib_kubernetes.KrknLibKubernetes):
logging.error(pod_log)
except Exception:
logging.warning("Exception in getting job status")
api_response = kubecli.delete_job(name=jobname, namespace="default")
kubecli.delete_job(name=jobname, namespace="default")
def get_egress_cmd(execution, test_interface, mod, vallst, duration=30):

View File

@@ -27,7 +27,9 @@ class AWS:
logging.error(
"Failed to start node instance %s. Encountered following " "exception: %s." % (instance_id, e)
)
sys.exit(1)
# removed_exit
# sys.exit(1)
raise RuntimeError()
# Stop the node instance
def stop_instances(self, instance_id):
@@ -36,7 +38,9 @@ class AWS:
logging.info("EC2 instance: " + str(instance_id) + " stopped")
except Exception as e:
logging.error("Failed to stop node instance %s. Encountered following " "exception: %s." % (instance_id, e))
sys.exit(1)
# removed_exit
# sys.exit(1)
raise RuntimeError()
# Terminate the node instance
def terminate_instances(self, instance_id):
@@ -47,7 +51,9 @@ class AWS:
logging.error(
"Failed to terminate node instance %s. Encountered following " "exception: %s." % (instance_id, e)
)
sys.exit(1)
# removed_exit
# sys.exit(1)
raise RuntimeError()
# Reboot the node instance
def reboot_instances(self, instance_id):
@@ -58,7 +64,9 @@ class AWS:
logging.error(
"Failed to reboot node instance %s. Encountered following " "exception: %s." % (instance_id, e)
)
sys.exit(1)
# removed_exit
# sys.exit(1)
raise RuntimeError()
# Below functions poll EC2.Client.describe_instances() every 15 seconds
# until a successful state is reached. An error is returned after 40 failed checks
@@ -102,7 +110,9 @@ class AWS:
"Failed to create the default network_acl: %s"
"Make sure you have aws cli configured on the host and set for the region of your vpc/subnet" % (e)
)
sys.exit(1)
# removed_exit
# sys.exit(1)
raise RuntimeError()
return acl_id
# Replace network acl association
@@ -114,7 +124,9 @@ class AWS:
new_association_id = status["NewAssociationId"]
except Exception as e:
logging.error("Failed to replace network acl association: %s" % (e))
sys.exit(1)
# removed_exit
# sys.exit(1)
raise RuntimeError()
return new_association_id
# Describe network acl
@@ -131,7 +143,9 @@ class AWS:
"Failed to describe network acl: %s."
"Make sure you have aws cli configured on the host and set for the region of your vpc/subnet" % (e)
)
sys.exit(1)
# removed_exit
# sys.exit(1)
raise RuntimeError()
associations = response["NetworkAcls"][0]["Associations"]
# grab the current network_acl in use
original_acl_id = response["NetworkAcls"][0]["Associations"][0]["NetworkAclId"]
@@ -148,7 +162,9 @@ class AWS:
"Make sure you have aws cli configured on the host and set for the region of your vpc/subnet"
% (acl_id, e)
)
sys.exit(1)
# removed_exit
# sys.exit(1)
raise RuntimeError()
# krkn_lib_kubernetes
class aws_node_scenarios(abstract_node_scenarios):
@@ -173,7 +189,9 @@ class aws_node_scenarios(abstract_node_scenarios):
"Failed to start node instance. Encountered following " "exception: %s. Test Failed" % (e)
)
logging.error("node_start_scenario injection failed!")
sys.exit(1)
# removed_exit
# sys.exit(1)
raise RuntimeError()
# Node scenario to stop the node
def node_stop_scenario(self, instance_kill_count, node, timeout):
@@ -189,7 +207,9 @@ class aws_node_scenarios(abstract_node_scenarios):
except Exception as e:
logging.error("Failed to stop node instance. Encountered following exception: %s. " "Test Failed" % (e))
logging.error("node_stop_scenario injection failed!")
sys.exit(1)
# removed_exit
# sys.exit(1)
raise RuntimeError()
# Node scenario to terminate the node
def node_termination_scenario(self, instance_kill_count, node, timeout):
@@ -213,7 +233,9 @@ class aws_node_scenarios(abstract_node_scenarios):
"Failed to terminate node instance. Encountered following exception:" " %s. Test Failed" % (e)
)
logging.error("node_termination_scenario injection failed!")
sys.exit(1)
# removed_exit
# sys.exit(1)
raise RuntimeError()
# Node scenario to reboot the node
def node_reboot_scenario(self, instance_kill_count, node, timeout):
@@ -232,4 +254,6 @@ class aws_node_scenarios(abstract_node_scenarios):
"Failed to reboot node instance. Encountered following exception:" " %s. Test Failed" % (e)
)
logging.error("node_reboot_scenario injection failed!")
sys.exit(1)
# removed_exit
# sys.exit(1)
raise RuntimeError()

View File

@@ -39,7 +39,9 @@ class Azure:
logging.info("vm name " + str(vm_name) + " started")
except Exception as e:
logging.error("Failed to start node instance %s. Encountered following " "exception: %s." % (vm_name, e))
sys.exit(1)
# removed_exit
# sys.exit(1)
raise RuntimeError()
# Stop the node instance
def stop_instances(self, group_name, vm_name):
@@ -48,7 +50,9 @@ class Azure:
logging.info("vm name " + str(vm_name) + " stopped")
except Exception as e:
logging.error("Failed to stop node instance %s. Encountered following " "exception: %s." % (vm_name, e))
sys.exit(1)
# removed_exit
# sys.exit(1)
raise RuntimeError()
# Terminate the node instance
def terminate_instances(self, group_name, vm_name):
@@ -59,7 +63,9 @@ class Azure:
logging.error(
"Failed to terminate node instance %s. Encountered following " "exception: %s." % (vm_name, e)
)
sys.exit(1)
# removed_exit
# sys.exit(1)
raise RuntimeError()
# Reboot the node instance
def reboot_instances(self, group_name, vm_name):
@@ -68,7 +74,9 @@ class Azure:
logging.info("vm name " + str(vm_name) + " rebooted")
except Exception as e:
logging.error("Failed to reboot node instance %s. Encountered following " "exception: %s." % (vm_name, e))
sys.exit(1)
# removed_exit
# sys.exit(1)
raise RuntimeError()
def get_vm_status(self, resource_group, vm_name):
statuses = self.compute_client.virtual_machines.instance_view(resource_group, vm_name).statuses
@@ -145,7 +153,9 @@ class azure_node_scenarios(abstract_node_scenarios):
"Failed to start node instance. Encountered following " "exception: %s. Test Failed" % (e)
)
logging.error("node_start_scenario injection failed!")
sys.exit(1)
# removed_exit
# sys.exit(1)
raise RuntimeError()
# Node scenario to stop the node
def node_stop_scenario(self, instance_kill_count, node, timeout):
@@ -161,7 +171,9 @@ class azure_node_scenarios(abstract_node_scenarios):
except Exception as e:
logging.error("Failed to stop node instance. Encountered following exception: %s. " "Test Failed" % e)
logging.error("node_stop_scenario injection failed!")
sys.exit(1)
# removed_exit
# sys.exit(1)
raise RuntimeError()
# Node scenario to terminate the node
def node_termination_scenario(self, instance_kill_count, node, timeout):
@@ -185,7 +197,9 @@ class azure_node_scenarios(abstract_node_scenarios):
"Failed to terminate node instance. Encountered following exception:" " %s. Test Failed" % (e)
)
logging.error("node_termination_scenario injection failed!")
sys.exit(1)
# removed_exit
# sys.exit(1)
raise RuntimeError()
# Node scenario to reboot the node
def node_reboot_scenario(self, instance_kill_count, node, timeout):
@@ -204,4 +218,6 @@ class azure_node_scenarios(abstract_node_scenarios):
"Failed to reboot node instance. Encountered following exception:" " %s. Test Failed" % (e)
)
logging.error("node_reboot_scenario injection failed!")
sys.exit(1)
# removed_exit
# sys.exit(1)
raise RuntimeError()

View File

@@ -45,7 +45,9 @@ class GCP:
logging.error(
"Failed to start node instance %s. Encountered following " "exception: %s." % (instance_id, e)
)
sys.exit(1)
# removed_exit
# sys.exit(1)
raise RuntimeError()
# Stop the node instance
def stop_instances(self, zone, instance_id):
@@ -54,7 +56,9 @@ class GCP:
logging.info("vm name " + str(instance_id) + " stopped")
except Exception as e:
logging.error("Failed to stop node instance %s. Encountered following " "exception: %s." % (instance_id, e))
sys.exit(1)
# removed_exit
# sys.exit(1)
raise RuntimeError()
# Start the node instance
def suspend_instances(self, zone, instance_id):
@@ -65,7 +69,9 @@ class GCP:
logging.error(
"Failed to suspend node instance %s. Encountered following " "exception: %s." % (instance_id, e)
)
sys.exit(1)
# removed_exit
# sys.exit(1)
raise RuntimeError()
# Terminate the node instance
def terminate_instances(self, zone, instance_id):
@@ -76,7 +82,9 @@ class GCP:
logging.error(
"Failed to start node instance %s. Encountered following " "exception: %s." % (instance_id, e)
)
sys.exit(1)
# removed_exit
# sys.exit(1)
raise RuntimeError()
# Reboot the node instance
def reboot_instances(self, zone, instance_id):
@@ -87,7 +95,9 @@ class GCP:
logging.error(
"Failed to start node instance %s. Encountered following " "exception: %s." % (instance_id, e)
)
sys.exit(1)
# removed_exit
# sys.exit(1)
raise RuntimeError()
# Get instance status
def get_instance_status(self, zone, instance_id, expected_status, timeout):
@@ -156,7 +166,9 @@ class gcp_node_scenarios(abstract_node_scenarios):
"Failed to start node instance. Encountered following " "exception: %s. Test Failed" % (e)
)
logging.error("node_start_scenario injection failed!")
sys.exit(1)
# removed_exit
# sys.exit(1)
raise RuntimeError()
# Node scenario to stop the node
def node_stop_scenario(self, instance_kill_count, node, timeout):
@@ -173,7 +185,9 @@ class gcp_node_scenarios(abstract_node_scenarios):
except Exception as e:
logging.error("Failed to stop node instance. Encountered following exception: %s. " "Test Failed" % (e))
logging.error("node_stop_scenario injection failed!")
sys.exit(1)
# removed_exit
# sys.exit(1)
raise RuntimeError()
# Node scenario to terminate the node
def node_termination_scenario(self, instance_kill_count, node, timeout):
@@ -197,7 +211,9 @@ class gcp_node_scenarios(abstract_node_scenarios):
"Failed to terminate node instance. Encountered following exception:" " %s. Test Failed" % e
)
logging.error("node_termination_scenario injection failed!")
sys.exit(1)
# removed_exit
# sys.exit(1)
raise RuntimeError()
# Node scenario to reboot the node
def node_reboot_scenario(self, instance_kill_count, node, timeout):
@@ -215,4 +231,6 @@ class gcp_node_scenarios(abstract_node_scenarios):
"Failed to reboot node instance. Encountered following exception:" " %s. Test Failed" % (e)
)
logging.error("node_reboot_scenario injection failed!")
sys.exit(1)
# removed_exit
# sys.exit(1)
raise RuntimeError()

View File

@@ -24,7 +24,9 @@ class OPENSTACKCLOUD:
logging.info("Instance: " + str(node) + " started")
except Exception as e:
logging.error("Failed to start node instance %s. Encountered following " "exception: %s." % (node, e))
sys.exit(1)
# removed_exit
# sys.exit(1)
raise RuntimeError()
# Stop the node instance
def stop_instances(self, node):
@@ -33,7 +35,9 @@ class OPENSTACKCLOUD:
logging.info("Instance: " + str(node) + " stopped")
except Exception as e:
logging.error("Failed to stop node instance %s. Encountered following " "exception: %s." % (node, e))
sys.exit(1)
# removed_exit
# sys.exit(1)
raise RuntimeError()
# Reboot the node instance
def reboot_instances(self, node):
@@ -42,7 +46,9 @@ class OPENSTACKCLOUD:
logging.info("Instance: " + str(node) + " rebooted")
except Exception as e:
logging.error("Failed to reboot node instance %s. Encountered following " "exception: %s." % (node, e))
sys.exit(1)
# removed_exit
# sys.exit(1)
raise RuntimeError()
# Wait until the node instance is running
def wait_until_running(self, node, timeout):
@@ -109,7 +115,9 @@ class openstack_node_scenarios(abstract_node_scenarios):
"Failed to start node instance. Encountered following " "exception: %s. Test Failed" % (e)
)
logging.error("node_start_scenario injection failed!")
sys.exit(1)
# removed_exit
# sys.exit(1)
raise RuntimeError()
# Node scenario to stop the node
def node_stop_scenario(self, instance_kill_count, node, timeout):
@@ -125,7 +133,9 @@ class openstack_node_scenarios(abstract_node_scenarios):
except Exception as e:
logging.error("Failed to stop node instance. Encountered following exception: %s. " "Test Failed" % (e))
logging.error("node_stop_scenario injection failed!")
sys.exit(1)
# removed_exit
# sys.exit(1)
raise RuntimeError()
# Node scenario to reboot the node
def node_reboot_scenario(self, instance_kill_count, node, timeout):
@@ -144,7 +154,9 @@ class openstack_node_scenarios(abstract_node_scenarios):
"Failed to reboot node instance. Encountered following exception:" " %s. Test Failed" % (e)
)
logging.error("node_reboot_scenario injection failed!")
sys.exit(1)
# removed_exit
# sys.exit(1)
raise RuntimeError()
# Node scenario to start the node
def helper_node_start_scenario(self, instance_kill_count, node_ip, timeout):
@@ -162,7 +174,9 @@ class openstack_node_scenarios(abstract_node_scenarios):
"Failed to start node instance. Encountered following " "exception: %s. Test Failed" % (e)
)
logging.error("helper_node_start_scenario injection failed!")
sys.exit(1)
# removed_exit
# sys.exit(1)
raise RuntimeError()
# Node scenario to stop the node
def helper_node_stop_scenario(self, instance_kill_count, node_ip, timeout):
@@ -177,7 +191,9 @@ class openstack_node_scenarios(abstract_node_scenarios):
except Exception as e:
logging.error("Failed to stop node instance. Encountered following exception: %s. " "Test Failed" % (e))
logging.error("helper_node_stop_scenario injection failed!")
sys.exit(1)
# removed_exit
# sys.exit(1)
raise RuntimeError()
def helper_node_service_status(self, node_ip, service, ssh_private_key, timeout):
try:
@@ -188,4 +204,6 @@ class openstack_node_scenarios(abstract_node_scenarios):
except Exception as e:
logging.error("Failed to check service status. Encountered following exception:" " %s. Test Failed" % (e))
logging.error("helper_node_service_status injection failed!")
sys.exit(1)
# removed_exit
# sys.exit(1)
raise RuntimeError()

View File

@@ -13,7 +13,7 @@ from kraken.node_actions.bm_node_scenarios import bm_node_scenarios
from kraken.node_actions.docker_node_scenarios import docker_node_scenarios
import kraken.node_actions.common_node_functions as common_node_functions
import kraken.cerberus.setup as cerberus
from krkn_lib_kubernetes import ScenarioTelemetry, KrknTelemetry
node_general = False
@@ -53,8 +53,14 @@ def get_node_scenario_object(node_scenario, kubecli: krkn_lib_kubernetes.KrknLib
# Run defined scenarios
# krkn_lib_kubernetes
def run(scenarios_list, config, wait_duration, kubecli: krkn_lib_kubernetes.KrknLibKubernetes):
def run(scenarios_list, config, wait_duration, kubecli: krkn_lib_kubernetes.KrknLibKubernetes, telemetry: KrknTelemetry) -> (list[str], list[ScenarioTelemetry]):
scenario_telemetries: list[ScenarioTelemetry] = []
failed_scenarios = []
for node_scenario_config in scenarios_list:
scenario_telemetry = ScenarioTelemetry()
scenario_telemetry.scenario = node_scenario_config
scenario_telemetry.startTimeStamp = time.time()
telemetry.set_parameters_base64(scenario_telemetry, node_scenario_config)
with open(node_scenario_config, "r") as f:
node_scenario_config = yaml.full_load(f)
for node_scenario in node_scenario_config["node_scenarios"]:
@@ -62,12 +68,24 @@ def run(scenarios_list, config, wait_duration, kubecli: krkn_lib_kubernetes.Krkn
if node_scenario["actions"]:
for action in node_scenario["actions"]:
start_time = int(time.time())
inject_node_scenario(action, node_scenario, node_scenario_object, kubecli)
logging.info("Waiting for the specified duration: %s" % (wait_duration))
time.sleep(wait_duration)
end_time = int(time.time())
cerberus.get_status(config, start_time, end_time)
logging.info("")
try:
inject_node_scenario(action, node_scenario, node_scenario_object, kubecli)
logging.info("Waiting for the specified duration: %s" % (wait_duration))
time.sleep(wait_duration)
end_time = int(time.time())
cerberus.get_status(config, start_time, end_time)
logging.info("")
except (RuntimeError, Exception) as e:
scenario_telemetry.exitStatus = 1
failed_scenarios.append(node_scenario_config)
telemetry.log_exception(node_scenario_config)
else:
scenario_telemetry.exitStatus = 0
scenario_telemetry.endTimeStamp = time.time()
scenario_telemetries.append(scenario_telemetry)
return failed_scenarios, scenario_telemetries
# Inject the specified node scenario

View File

@@ -13,6 +13,7 @@ from kraken.plugins.run_python_plugin import run_python_file
from kraken.plugins.network.ingress_shaping import network_chaos
from kraken.plugins.pod_network_outage.pod_network_outage_plugin import pod_outage
from kraken.plugins.pod_network_outage.pod_network_outage_plugin import pod_egress_shaping
from krkn_lib_kubernetes import ScenarioTelemetry, KrknTelemetry
@dataclasses.dataclass
@@ -225,16 +226,25 @@ PLUGINS = Plugins(
)
def run(scenarios: List[str], kubeconfig_path: str, kraken_config: str, failed_post_scenarios: List[str], wait_duration: int) -> List[str]:
def run(scenarios: List[str], kubeconfig_path: str, kraken_config: str, failed_post_scenarios: List[str], wait_duration: int, telemetry: KrknTelemetry) -> (List[str], list[ScenarioTelemetry]):
scenario_telemetries: list[ScenarioTelemetry] = []
for scenario in scenarios:
scenario_telemetry = ScenarioTelemetry()
scenario_telemetry.scenario = scenario
scenario_telemetry.startTimeStamp = time.time()
telemetry.set_parameters_base64(scenario_telemetry, scenario)
logging.info('scenario '+ str(scenario))
try:
PLUGINS.run(scenario, kubeconfig_path, kraken_config)
except Exception as e:
scenario_telemetry.exitStatus = 1
failed_post_scenarios.append(scenario)
logging.error("Error while running {}: {}".format(scenario, e))
return failed_post_scenarios
logging.info("Waiting for the specified duration: %s" % (wait_duration))
time.sleep(wait_duration)
telemetry.log_exception(scenario)
else:
scenario_telemetry.exitStatus = 0
logging.info("Waiting for the specified duration: %s" % (wait_duration))
time.sleep(wait_duration)
scenario_telemetries.append(scenario_telemetry)
scenario_telemetry.endTimeStamp = time.time()
return failed_post_scenarios
return failed_post_scenarios, scenario_telemetries

View File

@@ -10,7 +10,7 @@ import time
import yaml
import sys
import random
from krkn_lib_kubernetes import ScenarioTelemetry, KrknTelemetry
# Run pod based scenarios
def run(kubeconfig_path, scenarios_list, config, failed_post_scenarios, wait_duration):
@@ -67,8 +67,22 @@ def run(kubeconfig_path, scenarios_list, config, failed_post_scenarios, wait_dur
return failed_post_scenarios
# krkn_lib_kubernetes
def container_run(kubeconfig_path, scenarios_list, config, failed_post_scenarios, wait_duration, kubecli: krkn_lib_kubernetes.KrknLibKubernetes):
def container_run(kubeconfig_path,
scenarios_list,
config,
failed_post_scenarios,
wait_duration,
kubecli: krkn_lib_kubernetes.KrknLibKubernetes,
telemetry: KrknTelemetry) -> (list[str], list[ScenarioTelemetry]):
failed_scenarios = []
scenario_telemetries: list[ScenarioTelemetry] = []
for container_scenario_config in scenarios_list:
scenario_telemetry = ScenarioTelemetry()
scenario_telemetry.scenario = container_scenario_config[0]
scenario_telemetry.startTimeStamp = time.time()
telemetry.set_parameters_base64(scenario_telemetry, container_scenario_config[0])
if len(container_scenario_config) > 1:
pre_action_output = post_actions.run(kubeconfig_path, container_scenario_config[1])
else:
@@ -78,30 +92,41 @@ def container_run(kubeconfig_path, scenarios_list, config, failed_post_scenarios
for cont_scenario in cont_scenario_config["scenarios"]:
# capture start time
start_time = int(time.time())
killed_containers = container_killing_in_pod(cont_scenario, kubecli)
if len(container_scenario_config) > 1:
try:
try:
killed_containers = container_killing_in_pod(cont_scenario, kubecli)
if len(container_scenario_config) > 1:
failed_post_scenarios = post_actions.check_recovery(
kubeconfig_path, container_scenario_config, failed_post_scenarios, pre_action_output
kubeconfig_path,
container_scenario_config,
failed_post_scenarios,
pre_action_output
)
except Exception as e:
logging.error("Failed to run post action checks: %s" % e)
sys.exit(1)
else:
failed_post_scenarios = check_failed_containers(
killed_containers, cont_scenario.get("retry_wait", 120), kubecli
)
logging.info("Waiting for the specified duration: %s" % (wait_duration))
time.sleep(wait_duration)
# capture end time
end_time = int(time.time())
# publish cerberus status
cerberus.publish_kraken_status(config, failed_post_scenarios, start_time, end_time)
except (RuntimeError, Exception):
failed_scenarios.append(container_scenario_config[0])
telemetry.log_exception(container_scenario_config[0])
scenario_telemetry.exitStatus = 1
# removed_exit
# sys.exit(1)
else:
failed_post_scenarios = check_failed_containers(
killed_containers, cont_scenario.get("retry_wait", 120), kubecli
)
scenario_telemetry.exitStatus = 0
scenario_telemetry.endTimeStamp = time.time()
scenario_telemetries.append(scenario_telemetry)
logging.info("Waiting for the specified duration: %s" % (wait_duration))
time.sleep(wait_duration)
return failed_scenarios, scenario_telemetries
# capture end time
end_time = int(time.time())
# publish cerberus status
cerberus.publish_kraken_status(config, failed_post_scenarios, start_time, end_time)
logging.info("")
def container_killing_in_pod(cont_scenario, kubecli: krkn_lib_kubernetes.KrknLibKubernetes):
@@ -114,7 +139,9 @@ def container_killing_in_pod(cont_scenario, kubecli: krkn_lib_kubernetes.KrknLib
kill_count = cont_scenario.get("count", 1)
if type(pod_names) != list:
logging.error("Please make sure your pod_names are in a list format")
sys.exit(1)
# removed_exit
# sys.exit(1)
raise RuntimeError()
if len(pod_names) == 0:
if namespace == "*":
# returns double array of pod name and namespace
@@ -126,7 +153,9 @@ def container_killing_in_pod(cont_scenario, kubecli: krkn_lib_kubernetes.KrknLib
if namespace == "*":
logging.error("You must specify the namespace to kill a container in a specific pod")
logging.error("Scenario " + scenario_name + " failed")
sys.exit(1)
# removed_exit
# sys.exit(1)
raise RuntimeError()
pods = pod_names
# get container and pod name
container_pod_list = []
@@ -147,7 +176,9 @@ def container_killing_in_pod(cont_scenario, kubecli: krkn_lib_kubernetes.KrknLib
if len(container_pod_list) == 0:
logging.error("Trying to kill more containers than were found, try lowering kill count")
logging.error("Scenario " + scenario_name + " failed")
sys.exit(1)
# removed_exit
# sys.exit(1)
raise RuntimeError()
selected_container_pod = container_pod_list[random.randint(0, len(container_pod_list) - 1)]
for c_name in selected_container_pod[2]:
if container_name != "":
@@ -178,6 +209,7 @@ def retry_container_killing(kill_action, podname, namespace, container_name, kub
time.sleep(2)
continue
else:
logging.warning(response)
continue

View File

@@ -7,150 +7,233 @@ import krkn_lib_kubernetes
import yaml
from ..cerberus import setup as cerberus
from krkn_lib_kubernetes import ScenarioTelemetry, KrknTelemetry
# krkn_lib_kubernetes
def run(scenarios_list, config, kubecli: krkn_lib_kubernetes.KrknLibKubernetes):
def run(scenarios_list, config, kubecli: krkn_lib_kubernetes.KrknLibKubernetes, telemetry: KrknTelemetry) -> (list[str], list[ScenarioTelemetry]):
"""
Reads the scenario config and creates a temp file to fill up the PVC
"""
failed_post_scenarios = ""
scenario_telemetries: list[ScenarioTelemetry] = []
failed_scenarios = []
for app_config in scenarios_list:
if len(app_config) > 1:
with open(app_config, "r") as f:
config_yaml = yaml.full_load(f)
scenario_config = config_yaml["pvc_scenario"]
pvc_name = scenario_config.get("pvc_name", "")
pod_name = scenario_config.get("pod_name", "")
namespace = scenario_config.get("namespace", "")
target_fill_percentage = scenario_config.get(
"fill_percentage", "50"
)
duration = scenario_config.get("duration", 60)
scenario_telemetry = ScenarioTelemetry()
scenario_telemetry.scenario = app_config
scenario_telemetry.startTimeStamp = time.time()
telemetry.set_parameters_base64(scenario_telemetry, app_config)
try:
if len(app_config) > 1:
with open(app_config, "r") as f:
config_yaml = yaml.full_load(f)
scenario_config = config_yaml["pvc_scenario"]
pvc_name = scenario_config.get("pvc_name", "")
pod_name = scenario_config.get("pod_name", "")
namespace = scenario_config.get("namespace", "")
target_fill_percentage = scenario_config.get(
"fill_percentage", "50"
)
duration = scenario_config.get("duration", 60)
logging.info(
"Input params:\n"
"pvc_name: '%s'\n"
"pod_name: '%s'\n"
"namespace: '%s'\n"
"target_fill_percentage: '%s%%'\nduration: '%ss'"
% (
str(pvc_name),
str(pod_name),
str(namespace),
str(target_fill_percentage),
str(duration)
)
)
# Check input params
if namespace is None:
logging.error(
"You must specify the namespace where the PVC is"
)
sys.exit(1)
if pvc_name is None and pod_name is None:
logging.error(
"You must specify the pvc_name or the pod_name"
)
sys.exit(1)
if pvc_name and pod_name:
logging.info(
"pod_name will be ignored, pod_name used will be "
"a retrieved from the pod used in the pvc_name"
"Input params:\n"
"pvc_name: '%s'\n"
"pod_name: '%s'\n"
"namespace: '%s'\n"
"target_fill_percentage: '%s%%'\nduration: '%ss'"
% (
str(pvc_name),
str(pod_name),
str(namespace),
str(target_fill_percentage),
str(duration)
)
)
# Get pod name
if pvc_name:
if pod_name:
logging.info(
"pod_name '%s' will be overridden with one of "
"the pods mounted in the PVC" % (str(pod_name))
)
pvc = kubecli.get_pvc_info(pvc_name, namespace)
try:
# random generator not used for
# security/cryptographic purposes.
pod_name = random.choice(pvc.podNames) # nosec
logging.info("Pod name: %s" % pod_name)
except Exception:
# Check input params
if namespace is None:
logging.error(
"Pod associated with %s PVC, on namespace %s, "
"not found" % (str(pvc_name), str(namespace))
"You must specify the namespace where the PVC is"
)
sys.exit(1)
# Get volume name
pod = kubecli.get_pod_info(name=pod_name, namespace=namespace)
if pod is None:
logging.error(
"Exiting as pod '%s' doesn't exist "
"in namespace '%s'" % (
str(pod_name),
str(namespace)
#sys.exit(1)
raise RuntimeError()
if pvc_name is None and pod_name is None:
logging.error(
"You must specify the pvc_name or the pod_name"
)
# sys.exit(1)
raise RuntimeError()
if pvc_name and pod_name:
logging.info(
"pod_name will be ignored, pod_name used will be "
"a retrieved from the pod used in the pvc_name"
)
)
sys.exit(1)
for volume in pod.volumes:
if volume.pvcName is not None:
volume_name = volume.name
pvc_name = volume.pvcName
# Get pod name
if pvc_name:
if pod_name:
logging.info(
"pod_name '%s' will be overridden with one of "
"the pods mounted in the PVC" % (str(pod_name))
)
pvc = kubecli.get_pvc_info(pvc_name, namespace)
break
if 'pvc' not in locals():
logging.error(
"Pod '%s' in namespace '%s' does not use a pvc" % (
try:
# random generator not used for
# security/cryptographic purposes.
pod_name = random.choice(pvc.podNames) # nosec
logging.info("Pod name: %s" % pod_name)
except Exception:
logging.error(
"Pod associated with %s PVC, on namespace %s, "
"not found" % (str(pvc_name), str(namespace))
)
# sys.exit(1)
raise RuntimeError()
# Get volume name
pod = kubecli.get_pod_info(name=pod_name, namespace=namespace)
if pod is None:
logging.error(
"Exiting as pod '%s' doesn't exist "
"in namespace '%s'" % (
str(pod_name),
str(namespace)
)
)
# sys.exit(1)
raise RuntimeError()
for volume in pod.volumes:
if volume.pvcName is not None:
volume_name = volume.name
pvc_name = volume.pvcName
pvc = kubecli.get_pvc_info(pvc_name, namespace)
break
if 'pvc' not in locals():
logging.error(
"Pod '%s' in namespace '%s' does not use a pvc" % (
str(pod_name),
str(namespace)
)
)
# sys.exit(1)
raise RuntimeError()
logging.info("Volume name: %s" % volume_name)
logging.info("PVC name: %s" % pvc_name)
# Get container name and mount path
for container in pod.containers:
for vol in container.volumeMounts:
if vol.name == volume_name:
mount_path = vol.mountPath
container_name = container.name
break
logging.info("Container path: %s" % container_name)
logging.info("Mount path: %s" % mount_path)
# Get PVC capacity and used bytes
command = "df %s -B 1024 | sed 1d" % (str(mount_path))
command_output = (
kubecli.exec_cmd_in_pod(
command,
pod_name,
namespace,
container_name,
"sh"
)
).split()
pvc_used_kb = int(command_output[2])
pvc_capacity_kb = pvc_used_kb + int(command_output[3])
logging.info("PVC used: %s KB" % pvc_used_kb)
logging.info("PVC capacity: %s KB" % pvc_capacity_kb)
# Check valid fill percentage
current_fill_percentage = pvc_used_kb / pvc_capacity_kb
if not (
current_fill_percentage * 100
< float(target_fill_percentage)
<= 99
):
logging.error(
"Target fill percentage (%.2f%%) is lower than "
"current fill percentage (%.2f%%) "
"or higher than 99%%" % (
target_fill_percentage,
current_fill_percentage * 100
)
)
# sys.exit(1)
raise RuntimeError()
# Calculate file size
file_size_kb = int(
(
float(
target_fill_percentage / 100
) * float(pvc_capacity_kb)
) - float(pvc_used_kb)
)
logging.debug("File size: %s KB" % file_size_kb)
file_name = "kraken.tmp"
logging.info(
"Creating %s file, %s KB size, in pod %s at %s (ns %s)"
% (
str(file_name),
str(file_size_kb),
str(pod_name),
str(mount_path),
str(namespace)
)
)
sys.exit(1)
logging.info("Volume name: %s" % volume_name)
logging.info("PVC name: %s" % pvc_name)
# Get container name and mount path
for container in pod.containers:
for vol in container.volumeMounts:
if vol.name == volume_name:
mount_path = vol.mountPath
container_name = container.name
break
logging.info("Container path: %s" % container_name)
logging.info("Mount path: %s" % mount_path)
# Get PVC capacity and used bytes
command = "df %s -B 1024 | sed 1d" % (str(mount_path))
command_output = (
start_time = int(time.time())
# Create temp file in the PVC
full_path = "%s/%s" % (str(mount_path), str(file_name))
command = "fallocate -l $((%s*1024)) %s" % (
str(file_size_kb),
str(full_path)
)
logging.debug(
"Create temp file in the PVC command:\n %s" % command
)
kubecli.exec_cmd_in_pod(
command,
pod_name,
namespace,
container_name,
)
).split()
pvc_used_kb = int(command_output[2])
pvc_capacity_kb = pvc_used_kb + int(command_output[3])
logging.info("PVC used: %s KB" % pvc_used_kb)
logging.info("PVC capacity: %s KB" % pvc_capacity_kb)
# Check valid fill percentage
current_fill_percentage = pvc_used_kb / pvc_capacity_kb
if not (
current_fill_percentage * 100
< float(target_fill_percentage)
<= 99
):
logging.error(
"Target fill percentage (%.2f%%) is lower than "
"current fill percentage (%.2f%%) "
"or higher than 99%%" % (
target_fill_percentage,
current_fill_percentage * 100
)
# Check if file is created
command = "ls -lh %s" % (str(mount_path))
logging.debug("Check file is created command:\n %s" % command)
response = kubecli.exec_cmd_in_pod(
command, pod_name, namespace, container_name, "sh"
)
sys.exit(1)
logging.info("\n" + str(response))
if str(file_name).lower() in str(response).lower():
logging.info(
"%s file successfully created" % (str(full_path))
)
else:
logging.error(
"Failed to create tmp file with %s size" % (
str(file_size_kb)
)
)
remove_temp_file(
file_name,
full_path,
pod_name,
namespace,
container_name,
mount_path,
file_size_kb,
kubecli
)
# sys.exit(1)
raise RuntimeError()
# Calculate file size
file_size_kb = int(
@@ -197,14 +280,13 @@ def run(scenarios_list, config, kubecli: krkn_lib_kubernetes.KrknLibKubernetes):
logging.info("\n" + str(response))
if str(file_name).lower() in str(response).lower():
logging.info(
"%s file successfully created" % (str(full_path))
)
else:
logging.error(
"Failed to create tmp file with %s size" % (
str(file_size_kb)
"Waiting for the specified duration in the config: %ss" % (
duration
)
)
time.sleep(duration)
logging.info("Finish waiting")
remove_temp_file(
file_name,
full_path,
@@ -215,35 +297,25 @@ def run(scenarios_list, config, kubecli: krkn_lib_kubernetes.KrknLibKubernetes):
file_size_kb,
kubecli
)
sys.exit(1)
# Wait for the specified duration
logging.info(
"Waiting for the specified duration in the config: %ss" % (
duration
end_time = int(time.time())
cerberus.publish_kraken_status(
config,
failed_post_scenarios,
start_time,
end_time
)
)
time.sleep(duration)
logging.info("Finish waiting")
except (RuntimeError, Exception):
scenario_telemetry.exitStatus = 1
failed_scenarios.append(app_config)
telemetry.log_exception(app_config)
else:
scenario_telemetry.exitStatus = 0
scenario_telemetries.append(scenario_telemetry)
return failed_scenarios, scenario_telemetries
remove_temp_file(
file_name,
full_path,
pod_name,
namespace,
container_name,
mount_path,
file_size_kb,
kubecli
)
end_time = int(time.time())
cerberus.publish_kraken_status(
config,
failed_post_scenarios,
start_time,
end_time
)
# krkn_lib_kubernetes
@@ -275,7 +347,7 @@ def remove_temp_file(
logging.error(
"Failed to delete tmp file with %s size" % (str(file_size_kb))
)
sys.exit(1)
raise RuntimeError()
def toKbytes(value):
@@ -284,7 +356,7 @@ def toKbytes(value):
"PVC capacity %s does not match expression "
"regexp '^[0-9]+[K|M|G|T]i$'"
)
sys.exit(1)
raise RuntimeError()
unit = {"K": 0, "M": 1, "G": 2, "T": 3}
base = 1024 if ("i" in value) else 1000
exp = unit[value[-2:-1]]
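For reference, a minimal standalone sketch (not part of the diff) of the conversion toKbytes performs, assuming the Kubernetes quantity format checked by the regexp above; the function name and the optional decimal suffixes are illustrative only:

import re

def to_kbytes(value: str) -> int:
    # Convert a quantity such as "10Gi" or "500M" into KB: base 1024 for the
    # binary ("i") suffixes, base 1000 otherwise; a malformed value raises
    # RuntimeError in place of the removed sys.exit(1).
    if not re.match(r"^[0-9]+[KMGT]i?$", value):
        raise RuntimeError("PVC capacity %s does not match the expected regexp" % value)
    unit = {"K": 0, "M": 1, "G": 2, "T": 3}
    base = 1024 if value.endswith("i") else 1000
    suffix = value[-2] if value.endswith("i") else value[-1]
    digits = value[:-2] if value.endswith("i") else value[:-1]
    return int(digits) * base ** unit[suffix]

# e.g. to_kbytes("10Gi") == 10 * 1024 * 1024   (10 GiB expressed in KiB)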

View File

@@ -1,5 +1,5 @@
#!/usr/bin/env python
import os
import sys
import yaml
import logging
@@ -13,7 +13,7 @@ from ..node_actions.aws_node_scenarios import AWS
from ..node_actions.openstack_node_scenarios import OPENSTACKCLOUD
from ..node_actions.az_node_scenarios import Azure
from ..node_actions.gcp_node_scenarios import GCP
from krkn_lib_kubernetes import ScenarioTelemetry, KrknTelemetry
def multiprocess_nodes(cloud_object_function, nodes):
try:
@@ -59,7 +59,9 @@ def cluster_shut_down(shut_down_config, kubecli: krkn_lib_kubernetes.KrknLibKube
"Cloud type %s is not currently supported for cluster shut down" %
cloud_type
)
sys.exit(1)
# removed_exit
# sys.exit(1)
raise RuntimeError()
nodes = kubecli.list_nodes()
node_id = []
@@ -128,9 +130,16 @@ def cluster_shut_down(shut_down_config, kubecli: krkn_lib_kubernetes.KrknLibKube
# krkn_lib_kubernetes
def run(scenarios_list, config, wait_duration, kubecli: krkn_lib_kubernetes.KrknLibKubernetes):
def run(scenarios_list, config, wait_duration, kubecli: krkn_lib_kubernetes.KrknLibKubernetes, telemetry: KrknTelemetry) -> (list[str], list[ScenarioTelemetry]):
failed_post_scenarios = []
failed_scenarios = []
scenario_telemetries: list[ScenarioTelemetry] = []
for shut_down_config in scenarios_list:
scenario_telemetry = ScenarioTelemetry()
scenario_telemetry.scenario = shut_down_config
scenario_telemetry.startTimeStamp = time.time()
telemetry.set_parameters_base64(scenario_telemetry, shut_down_config[0])
if len(shut_down_config) > 1:
pre_action_output = post_actions.run("", shut_down_config[1])
else:
@@ -140,18 +149,32 @@ def run(scenarios_list, config, wait_duration, kubecli: krkn_lib_kubernetes.Krkn
shut_down_config_scenario = \
shut_down_config_yaml["cluster_shut_down_scenario"]
start_time = int(time.time())
cluster_shut_down(shut_down_config_scenario, kubecli)
logging.info(
"Waiting for the specified duration: %s" % (wait_duration)
)
time.sleep(wait_duration)
failed_post_scenarios = post_actions.check_recovery(
"", shut_down_config, failed_post_scenarios, pre_action_output
)
end_time = int(time.time())
cerberus.publish_kraken_status(
config,
failed_post_scenarios,
start_time,
end_time
)
try:
cluster_shut_down(shut_down_config_scenario, kubecli)
logging.info(
"Waiting for the specified duration: %s" % (wait_duration)
)
time.sleep(wait_duration)
failed_post_scenarios = post_actions.check_recovery(
"", shut_down_config, failed_post_scenarios, pre_action_output
)
end_time = int(time.time())
cerberus.publish_kraken_status(
config,
failed_post_scenarios,
start_time,
end_time
)
except (RuntimeError, Exception):
telemetry.log_exception(shut_down_config[0])
failed_scenarios.append(shut_down_config[0])
scenario_telemetry.exitStatus = 1
else:
scenario_telemetry.exitStatus = 0
scenario_telemetry.endTimeStamp = time.time()
scenario_telemetries.append(scenario_telemetry)
return failed_scenarios, scenario_telemetries
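The same per-scenario telemetry wrapping recurs in every runner touched by this change (container, node, time, cluster shutdown, namespace, zone outage, application outage, PVC, network chaos). A condensed, hypothetical sketch of the pattern, using only the ScenarioTelemetry/KrknTelemetry calls already visible above; the helper name and the execute callback are illustrative, not part of the codebase:

import time
from krkn_lib_kubernetes import ScenarioTelemetry, KrknTelemetry

def run_with_telemetry(scenario_files, telemetry: KrknTelemetry, execute):
    # execute(path) runs one scenario file and raises on failure; the wrapper
    # records parameters, timing and exit status, and collects failed paths
    failed, scenario_telemetries = [], []
    for path in scenario_files:
        scenario_telemetry = ScenarioTelemetry()
        scenario_telemetry.scenario = path
        scenario_telemetry.startTimeStamp = time.time()
        telemetry.set_parameters_base64(scenario_telemetry, path)
        try:
            execute(path)
        except Exception:
            scenario_telemetry.exitStatus = 1
            failed.append(path)
            telemetry.log_exception(path)
        else:
            scenario_telemetry.exitStatus = 0
        scenario_telemetry.endTimeStamp = time.time()
        scenario_telemetries.append(scenario_telemetry)
    return failed, scenario_telemetries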

View File

@@ -8,6 +8,7 @@ import random
import krkn_lib_kubernetes
from ..cerberus import setup as cerberus
from ..invoke import command as runcommand
from krkn_lib_kubernetes import ScenarioTelemetry, KrknTelemetry
# krkn_lib_kubernetes
def pod_exec(pod_name, command, namespace, container_name, kubecli: krkn_lib_kubernetes.KrknLibKubernetes):
@@ -93,7 +94,9 @@ def skew_time(scenario, kubecli: krkn_lib_kubernetes.KrknLibKubernetes):
for name in scenario["object_name"]:
if "namespace" not in scenario.keys():
logging.error("Need to set namespace when using pod name")
sys.exit(1)
# removed_exit
# sys.exit(1)
raise RuntimeError()
pod_names.append([name, scenario["namespace"]])
elif "namespace" in scenario.keys() and scenario["namespace"]:
if "label_selector" not in scenario.keys():
@@ -127,7 +130,9 @@ def skew_time(scenario, kubecli: krkn_lib_kubernetes.KrknLibKubernetes):
"Cannot find pods matching the namespace/label_selector, "
"please check"
)
sys.exit(1)
# removed_exit
# sys.exit(1)
raise RuntimeError()
pod_counter = 0
for pod in pod_names:
if len(pod) > 1:
@@ -152,7 +157,9 @@ def skew_time(scenario, kubecli: krkn_lib_kubernetes.KrknLibKubernetes):
"in pod %s in namespace %s"
% (selected_container_name, pod[0], pod[1])
)
sys.exit(1)
# removed_exit
# sys.exit(1)
raise RuntimeError()
pod_names[pod_counter].append(selected_container_name)
else:
selected_container_name = get_container_name(
@@ -178,7 +185,9 @@ def skew_time(scenario, kubecli: krkn_lib_kubernetes.KrknLibKubernetes):
scenario["namespace"]
)
)
sys.exit(1)
# removed_exit
# sys.exit(1)
raise RuntimeError()
pod_names[pod_counter].append(selected_container_name)
logging.info("Reset date/time on pod " + str(pod[0]))
pod_counter += 1
@@ -299,24 +308,41 @@ def check_date_time(object_type, names, kubecli: krkn_lib_kubernetes.KrknLibKube
# krkn_lib_kubernetes
def run(scenarios_list, config, wait_duration, kubecli: krkn_lib_kubernetes.KrknLibKubernetes):
def run(scenarios_list, config, wait_duration, kubecli: krkn_lib_kubernetes.KrknLibKubernetes, telemetry: KrknTelemetry) -> (list[str], list[ScenarioTelemetry]):
failed_scenarios = []
scenario_telemetries: list[ScenarioTelemetry] = []
for time_scenario_config in scenarios_list:
with open(time_scenario_config, "r") as f:
scenario_config = yaml.full_load(f)
for time_scenario in scenario_config["time_scenarios"]:
start_time = int(time.time())
object_type, object_names = skew_time(time_scenario, kubecli)
not_reset = check_date_time(object_type, object_names, kubecli)
if len(not_reset) > 0:
logging.info("Object times were not reset")
logging.info(
"Waiting for the specified duration: %s" % (wait_duration)
)
time.sleep(wait_duration)
end_time = int(time.time())
cerberus.publish_kraken_status(
config,
not_reset,
start_time,
end_time
)
scenario_telemetry = ScenarioTelemetry()
scenario_telemetry.scenario = time_scenario_config
scenario_telemetry.startTimeStamp = time.time()
telemetry.set_parameters_base64(scenario_telemetry, time_scenario_config)
try:
with open(time_scenario_config, "r") as f:
scenario_config = yaml.full_load(f)
for time_scenario in scenario_config["time_scenarios"]:
start_time = int(time.time())
object_type, object_names = skew_time(time_scenario, kubecli)
not_reset = check_date_time(object_type, object_names, kubecli)
if len(not_reset) > 0:
logging.info("Object times were not reset")
logging.info(
"Waiting for the specified duration: %s" % (wait_duration)
)
time.sleep(wait_duration)
end_time = int(time.time())
cerberus.publish_kraken_status(
config,
not_reset,
start_time,
end_time
)
except (RuntimeError, Exception):
scenario_telemetry.exitStatus = 1
telemetry.log_exception(time_scenario_config)
failed_scenarios.append(time_scenario_config)
else:
scenario_telemetry.exitStatus = 0
scenario_telemetry.endTimeStamp = time.time()
scenario_telemetries.append(scenario_telemetry)
return failed_scenarios, scenario_telemetries

View File

@@ -1,100 +1,119 @@
import yaml
import sys
import logging
import time
from ..node_actions.aws_node_scenarios import AWS
from ..cerberus import setup as cerberus
from krkn_lib_kubernetes import ScenarioTelemetry, KrknTelemetry
def run(scenarios_list, config, wait_duration):
def run(scenarios_list, config, wait_duration, telemetry: KrknTelemetry) -> (list[str], list[ScenarioTelemetry]) :
"""
filters the subnet of interest and applies the network acl
to create zone outage
"""
failed_post_scenarios = ""
scenario_telemetries: list[ScenarioTelemetry] = []
failed_scenarios = []
for zone_outage_config in scenarios_list:
if len(zone_outage_config) > 1:
with open(zone_outage_config, "r") as f:
zone_outage_config_yaml = yaml.full_load(f)
scenario_config = zone_outage_config_yaml["zone_outage"]
vpc_id = scenario_config["vpc_id"]
subnet_ids = scenario_config["subnet_id"]
duration = scenario_config["duration"]
cloud_type = scenario_config["cloud_type"]
ids = {}
acl_ids_created = []
scenario_telemetry = ScenarioTelemetry()
scenario_telemetry.scenario = zone_outage_config
scenario_telemetry.startTimeStamp = time.time()
telemetry.set_parameters_base64(scenario_telemetry, zone_outage_config)
try:
if len(zone_outage_config) > 1:
with open(zone_outage_config, "r") as f:
zone_outage_config_yaml = yaml.full_load(f)
scenario_config = zone_outage_config_yaml["zone_outage"]
vpc_id = scenario_config["vpc_id"]
subnet_ids = scenario_config["subnet_id"]
duration = scenario_config["duration"]
cloud_type = scenario_config["cloud_type"]
ids = {}
acl_ids_created = []
if cloud_type.lower() == "aws":
cloud_object = AWS()
else:
logging.error(
"Cloud type %s is not currently supported for "
"zone outage scenarios"
% cloud_type
)
sys.exit(1)
start_time = int(time.time())
for subnet_id in subnet_ids:
logging.info("Targeting subnet_id")
network_association_ids = []
associations, original_acl_id = \
cloud_object.describe_network_acls(vpc_id, subnet_id)
for entry in associations:
if entry["SubnetId"] == subnet_id:
network_association_ids.append(
entry["NetworkAclAssociationId"]
)
logging.info(
"Network association ids associated with "
"the subnet %s: %s"
% (subnet_id, network_association_ids)
)
acl_id = cloud_object.create_default_network_acl(vpc_id)
new_association_id = \
cloud_object.replace_network_acl_association(
network_association_ids[0], acl_id
if cloud_type.lower() == "aws":
cloud_object = AWS()
else:
logging.error(
"Cloud type %s is not currently supported for "
"zone outage scenarios"
% cloud_type
)
# removed_exit
# sys.exit(1)
raise RuntimeError()
# capture the original_acl_id, created_acl_id and
# new association_id to use during the recovery
ids[new_association_id] = original_acl_id
acl_ids_created.append(acl_id)
start_time = int(time.time())
# wait for the specified duration
logging.info(
"Waiting for the specified duration "
"in the config: %s" % (duration)
)
time.sleep(duration)
for subnet_id in subnet_ids:
logging.info("Targeting subnet_id")
network_association_ids = []
associations, original_acl_id = \
cloud_object.describe_network_acls(vpc_id, subnet_id)
for entry in associations:
if entry["SubnetId"] == subnet_id:
network_association_ids.append(
entry["NetworkAclAssociationId"]
)
logging.info(
"Network association ids associated with "
"the subnet %s: %s"
% (subnet_id, network_association_ids)
)
acl_id = cloud_object.create_default_network_acl(vpc_id)
new_association_id = \
cloud_object.replace_network_acl_association(
network_association_ids[0], acl_id
)
# replace the applied acl with the previous acl in use
for new_association_id, original_acl_id in ids.items():
cloud_object.replace_network_acl_association(
new_association_id,
original_acl_id
# capture the original_acl_id, created_acl_id and
# new association_id to use during the recovery
ids[new_association_id] = original_acl_id
acl_ids_created.append(acl_id)
# wait for the specified duration
logging.info(
"Waiting for the specified duration "
"in the config: %s" % (duration)
)
logging.info(
"Wating for 60 seconds to make sure "
"the changes are in place"
)
time.sleep(60)
time.sleep(duration)
# delete the network acl created for the run
for acl_id in acl_ids_created:
cloud_object.delete_network_acl(acl_id)
# replace the applied acl with the previous acl in use
for new_association_id, original_acl_id in ids.items():
cloud_object.replace_network_acl_association(
new_association_id,
original_acl_id
)
logging.info(
"Wating for 60 seconds to make sure "
"the changes are in place"
)
time.sleep(60)
logging.info(
"End of scenario. "
"Waiting for the specified duration: %s" % (wait_duration)
)
time.sleep(wait_duration)
# delete the network acl created for the run
for acl_id in acl_ids_created:
cloud_object.delete_network_acl(acl_id)
logging.info(
"End of scenario. "
"Waiting for the specified duration: %s" % (wait_duration)
)
time.sleep(wait_duration)
end_time = int(time.time())
cerberus.publish_kraken_status(
config,
failed_post_scenarios,
start_time,
end_time
)
except (RuntimeError, Exception):
scenario_telemetry.exitStatus = 1
failed_scenarios.append(zone_outage_config)
telemetry.log_exception(zone_outage_config)
else:
scenario_telemetry.exitStatus = 0
scenario_telemetry.endTimeStamp = time.time()
scenario_telemetries.append(scenario_telemetry)
return failed_scenarios, scenario_telemetries
end_time = int(time.time())
cerberus.publish_kraken_status(
config,
failed_post_scenarios,
start_time,
end_time
)

View File

@@ -37,4 +37,4 @@ prometheus_api_client
ibm_cloud_sdk_core
ibm_vpc
pytest
krkn-lib-kubernetes > 0.1.1
krkn-lib-kubernetes >= 0.1.3

View File

@@ -25,7 +25,7 @@ import kraken.arcaflow_plugin as arcaflow_plugin
import server as server
import kraken.prometheus.client as promcli
from kraken import plugins
from krkn_lib_kubernetes import KrknLibKubernetes
from krkn_lib_kubernetes import KrknLibKubernetes, KrknTelemetry, ChaosRunTelemetry, SafeLogger
KUBE_BURNER_URL = (
"https://github.com/cloud-bulldozer/kube-burner/"
@@ -98,13 +98,34 @@ def main(cfg):
)
sys.exit(1)
logging.info("Initializing client to talk to the Kubernetes cluster")
# Generate uuid for the run
if run_uuid:
logging.info(
"Using the uuid defined by the user for the run: %s" % run_uuid
)
else:
run_uuid = str(uuid.uuid4())
logging.info("Generated a uuid for the run: %s" % run_uuid)
# request_id for telemetry is generated once here and used everywhere
telemetry_request_id = f"{int(time.time())}-{run_uuid}"
if config["telemetry"].get("run_tag"):
telemetry_request_id = f"{telemetry_request_id}-{config['telemetry']['run_tag']}"
telemetry_log_file = f'{config["telemetry"]["archive_path"]}/{telemetry_request_id}.log'
safe_logger = SafeLogger(filename=telemetry_log_file)
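# e.g. a run started at epoch 1691680000 with run_tag "nightly" (illustrative
# values) yields the request id "1691680000-<run_uuid>-nightly", which becomes
# the S3 bucket folder name and the local upload log
# "<archive_path>/1691680000-<run_uuid>-nightly.log"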
try:
kubeconfig_path
os.environ["KUBECONFIG"] = str(kubeconfig_path)
# krkn-lib-kubernetes init
kubecli = KrknLibKubernetes(kubeconfig_path=kubeconfig_path)
except:
kubecli.initialize_clients(None)
# KrknTelemetry init
telemetry = KrknTelemetry(safe_logger, kubecli)
# find node kraken might be running on
kubecli.find_kraken_node()
@@ -141,14 +162,7 @@ def main(cfg):
if deploy_performance_dashboards:
performance_dashboards.setup(dashboard_repo, distribution)
# Generate uuid for the run
if run_uuid:
logging.info(
"Using the uuid defined by the user for the run: %s" % run_uuid
)
else:
run_uuid = str(uuid.uuid4())
logging.info("Generated a uuid for the run: %s" % run_uuid)
# Initialize the start iteration to 0
iteration = 0
@@ -171,7 +185,8 @@ def main(cfg):
# Capture the start time
start_time = int(time.time())
litmus_installed = False
chaos_telemetry = ChaosRunTelemetry()
chaos_telemetry.run_uuid = run_uuid
# Loop to run the chaos starts here
while int(iteration) < iterations and run_signal != "STOP":
# Inject chaos scenarios specified in the config
@@ -203,36 +218,41 @@ def main(cfg):
)
sys.exit(1)
elif scenario_type == "arcaflow_scenarios":
failed_post_scenarios = arcaflow_plugin.run(
scenarios_list, kubeconfig_path
failed_post_scenarios, scenario_telemetries = arcaflow_plugin.run(
scenarios_list, kubeconfig_path, telemetry
)
chaos_telemetry.scenarios.extend(scenario_telemetries)
elif scenario_type == "plugin_scenarios":
failed_post_scenarios = plugins.run(
failed_post_scenarios, scenario_telemetries = plugins.run(
scenarios_list,
kubeconfig_path,
kraken_config,
failed_post_scenarios,
wait_duration,
telemetry
)
chaos_telemetry.scenarios.extend(scenario_telemetries)
# krkn_lib_kubernetes
elif scenario_type == "container_scenarios":
logging.info("Running container scenarios")
failed_post_scenarios = pod_scenarios.container_run(
failed_post_scenarios, scenario_telemetries = pod_scenarios.container_run(
kubeconfig_path,
scenarios_list,
config,
failed_post_scenarios,
wait_duration,
kubecli
kubecli,
telemetry
)
chaos_telemetry.scenarios.extend(scenario_telemetries)
# Inject node chaos scenarios specified in the config
# krkn_lib_kubernetes
elif scenario_type == "node_scenarios":
logging.info("Running node scenarios")
nodeaction.run(scenarios_list, config, wait_duration, kubecli)
failed_post_scenarios, scenario_telemetries = nodeaction.run(scenarios_list, config, wait_duration, kubecli, telemetry)
chaos_telemetry.scenarios.extend(scenario_telemetries)
# Inject managedcluster chaos scenarios specified in the config
# krkn_lib_kubernetes
elif scenario_type == "managedcluster_scenarios":
@@ -247,7 +267,8 @@ def main(cfg):
elif scenario_type == "time_scenarios":
if distribution == "openshift":
logging.info("Running time skew scenarios")
time_actions.run(scenarios_list, config, wait_duration, kubecli)
failed_post_scenarios, scenario_telemetries = time_actions.run(scenarios_list, config, wait_duration, kubecli, telemetry)
chaos_telemetry.scenarios.extend(scenario_telemetries)
else:
logging.error(
"Litmus scenarios are currently "
@@ -297,44 +318,48 @@ def main(cfg):
# Inject cluster shutdown scenarios
# krkn_lib_kubernetes
elif scenario_type == "cluster_shut_down_scenarios":
shut_down.run(scenarios_list, config, wait_duration, kubecli)
failed_post_scenarios, scenario_telemetries = shut_down.run(scenarios_list, config, wait_duration, kubecli, telemetry)
chaos_telemetry.scenarios.extend(scenario_telemetries)
# Inject namespace chaos scenarios
# krkn_lib_kubernetes
elif scenario_type == "namespace_scenarios":
logging.info("Running namespace scenarios")
namespace_actions.run(
failed_post_scenarios, scenario_telemetries = namespace_actions.run(
scenarios_list,
config,
wait_duration,
failed_post_scenarios,
kubeconfig_path,
kubecli
kubecli,
telemetry
)
chaos_telemetry.scenarios.extend(scenario_telemetries)
# Inject zone failures
elif scenario_type == "zone_outages":
logging.info("Inject zone outages")
zone_outages.run(scenarios_list, config, wait_duration)
failed_post_scenarios, scenario_telemetries = zone_outages.run(scenarios_list, config, wait_duration, telemetry)
chaos_telemetry.scenarios.extend(scenario_telemetries)
# Application outages
elif scenario_type == "application_outages":
logging.info("Injecting application outage")
application_outage.run(
scenarios_list, config, wait_duration
)
failed_post_scenarios, scenario_telemetries = application_outage.run(
scenarios_list, config, wait_duration, telemetry)
chaos_telemetry.scenarios.extend(scenario_telemetries)
# PVC scenarios
# krkn_lib_kubernetes
elif scenario_type == "pvc_scenarios":
logging.info("Running PVC scenario")
pvc_scenario.run(scenarios_list, config, kubecli)
failed_post_scenarios, scenario_telemetries = pvc_scenario.run(scenarios_list, config, kubecli, telemetry)
chaos_telemetry.scenarios.extend(scenario_telemetries)
# Network scenarios
# krkn_lib_kubernetes
elif scenario_type == "network_chaos":
logging.info("Running Network Chaos")
network_chaos.run(scenarios_list, config, wait_duration, kubecli)
failed_post_scenarios, scenario_telemetries = network_chaos.run(scenarios_list, config, wait_duration, kubecli, telemetry)
# Check for critical alerts when enabled
if check_critical_alerts:
@@ -353,6 +378,21 @@ def main(cfg):
iteration += 1
logging.info("")
# telemetry
if config["telemetry"]["enabled"]:
logging.info(f"telemetry data will be stored on s3 bucket folder: {telemetry_request_id}")
logging.info(f"telemetry upload log: {safe_logger.log_file_name}")
try:
telemetry.send_telemetry(config["telemetry"], telemetry_request_id, chaos_telemetry)
safe_logger.info("archives download started:")
prometheus_archive_files = telemetry.get_ocp_prometheus_data(config["telemetry"], telemetry_request_id)
safe_logger.info("archives upload started:")
telemetry.put_ocp_prometheus_data(config["telemetry"], prometheus_archive_files, telemetry_request_id)
except Exception as e:
logging.error(f"failed to send telemetry data: {str(e)}")
else:
logging.info("telemetry collection disabled, skipping.")
# Capture the end time
end_time = int(time.time())