Krkn telemetry integration (#435)

* adapted config.yaml to the new feature

* temporarily pointing requirements.txt to the lib feature branch

* run_kraken.py + arcaflow scenarios refactoring


typo

* plugin scenario

* node scenarios


return failed scenarios

* container scenarios


fix

* time scenarios

* cluster shutdown scenarios

* namespace scenarios

* zone outage scenarios

* app outage scenarios

* pvc scenarios

* network chaos scenarios

* run_kraken.py adaptation to telemetry

* prometheus telemetry upload + config.yaml


some fixes


typos and logs


max retries in config


telemetry id with run_uuid


safe_logger

* catch send_telemetry exception

* scenario collection bug fixes

* telemetry enabled check

* telemetry run tag

* requirements pointing to main + archive_size

* requirements.txt and config.yaml update

* added telemetry config to common config

* fixed scenario array elements for telemetry
Tullio Sebastiani
2023-08-10 20:42:53 +02:00
committed by GitHub
parent 491dc17267
commit 39c0152b7b
19 changed files with 964 additions and 583 deletions
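
The diffs below apply the same wrapper to every scenario module: run() gains a KrknTelemetry argument, builds a ScenarioTelemetry per scenario file, replaces sys.exit(1) calls with raised exceptions, and returns the failed scenarios together with the collected telemetry records. A condensed sketch of that recurring pattern, assuming a per-module inject_scenario() placeholder for the actual chaos logic (class names, attributes, and method calls follow the diffs; per-scenario details and cerberus reporting are omitted):

import time
from krkn_lib_kubernetes import ScenarioTelemetry, KrknTelemetry

def inject_scenario(scenario_file: str) -> None:
    # placeholder for the module-specific chaos logic; failures raise instead of sys.exit(1)
    ...

def run(scenarios_list: list[str], telemetry: KrknTelemetry) -> (list[str], list[ScenarioTelemetry]):
    failed_scenarios: list[str] = []
    scenario_telemetries: list[ScenarioTelemetry] = []
    for scenario_file in scenarios_list:
        scenario_telemetry = ScenarioTelemetry()
        scenario_telemetry.scenario = scenario_file
        scenario_telemetry.startTimeStamp = time.time()
        telemetry.set_parameters_base64(scenario_telemetry, scenario_file)
        try:
            inject_scenario(scenario_file)
        except Exception:
            scenario_telemetry.exitStatus = 1
            failed_scenarios.append(scenario_file)
            telemetry.log_exception(scenario_file)
        else:
            scenario_telemetry.exitStatus = 0
        scenario_telemetry.endTimeStamp = time.time()
        scenario_telemetries.append(scenario_telemetry)
    return failed_scenarios, scenario_telemetries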

View File

@@ -29,3 +29,15 @@ tunings:
wait_duration: 6 # Duration to wait between each chaos scenario.
iterations: 1 # Number of times to execute the scenarios.
daemon_mode: False # Iterations are set to infinity which means that the kraken will cause chaos forever.
telemetry:
enabled: False # enables/disables the telemetry collection feature
api_url: https://ulnmf9xv7j.execute-api.us-west-2.amazonaws.com/production # telemetry service endpoint
username: username # telemetry service username
password: password # telemetry service password
prometheus_backup: True # enables/disables prometheus data collection
full_prometheus_backup: False # if set to False, only the /prometheus/wal folder will be downloaded.
backup_threads: 5 # number of telemetry download/upload threads
archive_path: /tmp # local path where the archive files will be temporarily stored
max_retries: 0 # maximum number of upload retries (if 0, it will retry forever)
run_tag: '' # if set, this will be appended to the run folder in the bucket (useful to group the runs)
archive_size: 10000 # the size of each prometheus data archive file in KB. The lower the archive size,
# the higher the number of archive files produced and uploaded (and processed by backup_threads
# simultaneously).
# For unstable/slow connections it is better to keep this value low and increase the number of
# backup_threads; this way, on upload failure, the retry affects only the failed chunk rather than
# the whole upload.

View File

@@ -7,41 +7,8 @@ kraken:
signal_address: 0.0.0.0 # Signal listening address
port: 8081 # Signal port
chaos_scenarios: # List of policies/chaos scenarios to load
- arcaflow_scenarios:
- scenarios/arcaflow/cpu-hog/input.yaml
- scenarios/arcaflow/memory-hog/input.yaml
- container_scenarios: # List of chaos pod scenarios to load
- - scenarios/openshift/container_etcd.yml
- plugin_scenarios:
- scenarios/openshift/etcd.yml
- scenarios/openshift/regex_openshift_pod_kill.yml
- scenarios/openshift/vmware_node_scenarios.yml
- scenarios/openshift/ibmcloud_node_scenarios.yml
- scenarios/openshift/network_chaos_ingress.yml
- scenarios/openshift/pod_network_outage.yml
- scenarios/openshift/pod_network_shaping.yml
- node_scenarios: # List of chaos node scenarios to load
- scenarios/openshift/node_scenarios_example.yml
- plugin_scenarios:
- scenarios/openshift/openshift-apiserver.yml
- scenarios/openshift/openshift-kube-apiserver.yml
- time_scenarios: # List of chaos time scenarios to load
- scenarios/openshift/time_scenarios_example.yml
- cluster_shut_down_scenarios:
- - scenarios/openshift/cluster_shut_down_scenario.yml
- scenarios/openshift/post_action_shut_down.py
- namespace_scenarios:
- - scenarios/openshift/regex_namespace.yaml
- - scenarios/openshift/ingress_namespace.yaml
- scenarios/openshift/post_action_namespace.py
- zone_outages:
- scenarios/openshift/zone_outage.yaml
- application_outages:
- scenarios/openshift/app_outage.yaml
- pvc_scenarios:
- scenarios/openshift/pvc_scenario.yaml
- network_chaos:
- scenarios/openshift/network_chaos.yaml
- application_outages:
- scenarios/openshift/app_outage.yaml
cerberus:
cerberus_enabled: False # Enable it when cerberus is previously installed
@@ -65,3 +32,20 @@ tunings:
wait_duration: 60 # Duration to wait between each chaos scenario
iterations: 1 # Number of times to execute the scenarios
daemon_mode: False # Iterations are set to infinity which means that the kraken will cause chaos forever
telemetry:
enabled: False # enables/disables the telemetry collection feature
api_url: https://ulnmf9xv7j.execute-api.us-west-2.amazonaws.com/production # telemetry service endpoint
username: username # telemetry service username
password: password # telemetry service password
prometheus_backup: True # enables/disables prometheus data collection
full_prometheus_backup: False # if set to False, only the /prometheus/wal folder will be downloaded.
backup_threads: 5 # number of telemetry download/upload threads
archive_path: /tmp # local path where the archive files will be temporarily stored
max_retries: 0 # maximum number of upload retries (if 0, it will retry forever)
run_tag: '' # if set, this will be appended to the run folder in the bucket (useful to group the runs)
archive_size: 10000 # the size of each prometheus data archive file in KB. The lower the archive size,
# the higher the number of archive files produced and uploaded (and processed by backup_threads
# simultaneously).
# For unstable/slow connections it is better to keep this value low and increase the number of
# backup_threads; this way, on upload failure, the retry affects only the failed chunk rather than
# the whole upload.
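
To make the archive_size / backup_threads tradeoff concrete, a small illustrative sketch; the estimate_chunks helper is hypothetical and not part of this commit, only the config semantics described in the comments above are:

import math

def estimate_chunks(backup_size_kb: int, archive_size_kb: int = 10000) -> int:
    # hypothetical helper: number of archive files a prometheus backup is split into
    return math.ceil(backup_size_kb / archive_size_kb)

# Example: a ~200 MB backup with the default 10000 KB archive_size gives
# ceil(204800 / 10000) = 21 chunks, uploaded by up to backup_threads (5) workers;
# if one upload fails, only that chunk is retried (up to max_retries, forever when 0).
print(estimate_chunks(200 * 1024))  # 21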

View File

@@ -4,25 +4,32 @@ import time
import kraken.cerberus.setup as cerberus
from jinja2 import Template
import kraken.invoke.command as runcommand
from krkn_lib_kubernetes import ScenarioTelemetry, KrknTelemetry
# Reads the scenario config, applies and deletes a network policy to
# block the traffic for the specified duration
def run(scenarios_list, config, wait_duration):
def run(scenarios_list, config, wait_duration, telemetry: KrknTelemetry) -> (list[str], list[ScenarioTelemetry]):
failed_post_scenarios = ""
scenario_telemetries: list[ScenarioTelemetry] = []
failed_scenarios = []
for app_outage_config in scenarios_list:
scenario_telemetry = ScenarioTelemetry()
scenario_telemetry.scenario = app_outage_config
scenario_telemetry.startTimeStamp = time.time()
telemetry.set_parameters_base64(scenario_telemetry, app_outage_config)
if len(app_outage_config) > 1:
with open(app_outage_config, "r") as f:
app_outage_config_yaml = yaml.full_load(f)
scenario_config = app_outage_config_yaml["application_outage"]
pod_selector = scenario_config.get("pod_selector", "{}")
traffic_type = scenario_config.get("block", "[Ingress, Egress]")
namespace = scenario_config.get("namespace", "")
duration = scenario_config.get("duration", 60)
try:
with open(app_outage_config, "r") as f:
app_outage_config_yaml = yaml.full_load(f)
scenario_config = app_outage_config_yaml["application_outage"]
pod_selector = scenario_config.get("pod_selector", "{}")
traffic_type = scenario_config.get("block", "[Ingress, Egress]")
namespace = scenario_config.get("namespace", "")
duration = scenario_config.get("duration", 60)
start_time = int(time.time())
start_time = int(time.time())
network_policy_template = """---
network_policy_template = """---
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
@@ -31,28 +38,38 @@ spec:
podSelector:
matchLabels: {{ pod_selector }}
policyTypes: {{ traffic_type }}
"""
t = Template(network_policy_template)
rendered_spec = t.render(pod_selector=pod_selector, traffic_type=traffic_type)
# Write the rendered template to a file
with open("kraken_network_policy.yaml", "w") as f:
f.write(rendered_spec)
# Block the traffic by creating network policy
logging.info("Creating the network policy")
runcommand.invoke(
"kubectl create -f %s -n %s --validate=false" % ("kraken_network_policy.yaml", namespace)
)
"""
t = Template(network_policy_template)
rendered_spec = t.render(pod_selector=pod_selector, traffic_type=traffic_type)
# Write the rendered template to a file
with open("kraken_network_policy.yaml", "w") as f:
f.write(rendered_spec)
# Block the traffic by creating network policy
logging.info("Creating the network policy")
runcommand.invoke(
"kubectl create -f %s -n %s --validate=false" % ("kraken_network_policy.yaml", namespace)
)
# wait for the specified duration
logging.info("Waiting for the specified duration in the config: %s" % (duration))
time.sleep(duration)
# wait for the specified duration
logging.info("Waiting for the specified duration in the config: %s" % (duration))
time.sleep(duration)
# unblock the traffic by deleting the network policy
logging.info("Deleting the network policy")
runcommand.invoke("kubectl delete -f %s -n %s" % ("kraken_network_policy.yaml", namespace))
# unblock the traffic by deleting the network policy
logging.info("Deleting the network policy")
runcommand.invoke("kubectl delete -f %s -n %s" % ("kraken_network_policy.yaml", namespace))
logging.info("End of scenario. Waiting for the specified duration: %s" % (wait_duration))
time.sleep(wait_duration)
logging.info("End of scenario. Waiting for the specified duration: %s" % (wait_duration))
time.sleep(wait_duration)
end_time = int(time.time())
cerberus.publish_kraken_status(config, failed_post_scenarios, start_time, end_time)
except Exception as e:
scenario_telemetry.exitStatus = 1
failed_scenarios.append(app_outage_config)
telemetry.log_exception(app_outage_config)
else:
scenario_telemetry.exitStatus = 0
scenario_telemetry.endTimeStamp = time.time()
scenario_telemetries.append(scenario_telemetry)
return failed_scenarios, scenario_telemetries
end_time = int(time.time())
cerberus.publish_kraken_status(config, failed_post_scenarios, start_time, end_time)

View File

@@ -1,3 +1,5 @@
import time
import arcaflow
import os
import yaml
@@ -6,22 +8,31 @@ import sys
from pathlib import Path
from typing import List
from .context_auth import ContextAuth
from krkn_lib_kubernetes import ScenarioTelemetry, KrknTelemetry
def run(scenarios_list: List[str], kubeconfig_path: str):
def run(scenarios_list: List[str], kubeconfig_path: str, telemetry: KrknTelemetry) -> (list[str], list[ScenarioTelemetry]):
scenario_telemetries: list[ScenarioTelemetry] = []
failed_post_scenarios = []
for scenario in scenarios_list:
scenario_telemetry = ScenarioTelemetry()
scenario_telemetry.scenario = scenario
scenario_telemetry.startTimeStamp = time.time()
telemetry.set_parameters_base64(scenario_telemetry,scenario)
engine_args = build_args(scenario)
run_workflow(engine_args, kubeconfig_path)
status_code = run_workflow(engine_args, kubeconfig_path)
scenario_telemetry.endTimeStamp = time.time()
scenario_telemetry.exitStatus = status_code
scenario_telemetries.append(scenario_telemetry)
if status_code != 0:
failed_post_scenarios.append(scenario)
return failed_post_scenarios, scenario_telemetries
def run_workflow(engine_args: arcaflow.EngineArgs, kubeconfig_path: str):
def run_workflow(engine_args: arcaflow.EngineArgs, kubeconfig_path: str) -> int:
set_arca_kubeconfig(engine_args, kubeconfig_path)
exit_status = arcaflow.run(engine_args)
if exit_status != 0:
logging.error(
f"failed to run arcaflow scenario {engine_args.input}"
)
sys.exit(exit_status)
return exit_status
def build_args(input_file: str) -> arcaflow.EngineArgs:

View File

@@ -6,7 +6,7 @@ import kraken.cerberus.setup as cerberus
import kraken.post_actions.actions as post_actions
import yaml
import sys
from krkn_lib_kubernetes import ScenarioTelemetry, KrknTelemetry
# krkn_lib_kubernetes
def run(
@@ -15,72 +15,96 @@ def run(
wait_duration,
failed_post_scenarios,
kubeconfig_path,
kubecli: krkn_lib_kubernetes.KrknLibKubernetes
):
kubecli: krkn_lib_kubernetes.KrknLibKubernetes,
telemetry: KrknTelemetry
) -> (list[str], list[ScenarioTelemetry]):
scenario_telemetries: list[ScenarioTelemetry] = []
failed_scenarios = []
for scenario_config in scenarios_list:
if len(scenario_config) > 1:
pre_action_output = post_actions.run(kubeconfig_path, scenario_config[1])
else:
pre_action_output = ""
with open(scenario_config[0], "r") as f:
scenario_config_yaml = yaml.full_load(f)
for scenario in scenario_config_yaml["scenarios"]:
scenario_namespace = scenario.get("namespace", "")
scenario_label = scenario.get("label_selector", "")
if scenario_namespace is not None and scenario_namespace.strip() != "":
if scenario_label is not None and scenario_label.strip() != "":
logging.error("You can only have namespace or label set in your namespace scenario")
logging.error(
"Current scenario config has namespace '%s' and label selector '%s'"
% (scenario_namespace, scenario_label)
)
logging.error(
"Please set either namespace to blank ('') or label_selector to blank ('') to continue"
)
sys.exit(1)
delete_count = scenario.get("delete_count", 1)
run_count = scenario.get("runs", 1)
run_sleep = scenario.get("sleep", 10)
wait_time = scenario.get("wait_time", 30)
killed_namespaces = []
start_time = int(time.time())
for i in range(run_count):
namespaces = kubecli.check_namespaces([scenario_namespace], scenario_label)
for j in range(delete_count):
if len(namespaces) == 0:
scenario_telemetry = ScenarioTelemetry()
scenario_telemetry.scenario = scenario_config[0]
scenario_telemetry.startTimeStamp = time.time()
telemetry.set_parameters_base64(scenario_telemetry, scenario_config[0])
try:
if len(scenario_config) > 1:
pre_action_output = post_actions.run(kubeconfig_path, scenario_config[1])
else:
pre_action_output = ""
with open(scenario_config[0], "r") as f:
scenario_config_yaml = yaml.full_load(f)
for scenario in scenario_config_yaml["scenarios"]:
scenario_namespace = scenario.get("namespace", "")
scenario_label = scenario.get("label_selector", "")
if scenario_namespace is not None and scenario_namespace.strip() != "":
if scenario_label is not None and scenario_label.strip() != "":
logging.error("You can only have namespace or label set in your namespace scenario")
logging.error(
"Couldn't delete %s namespaces, not enough namespaces matching %s with label %s"
% (str(run_count), scenario_namespace, str(scenario_label))
"Current scenario config has namespace '%s' and label selector '%s'"
% (scenario_namespace, scenario_label)
)
sys.exit(1)
selected_namespace = namespaces[random.randint(0, len(namespaces) - 1)]
killed_namespaces.append(selected_namespace)
try:
kubecli.delete_namespace(selected_namespace)
logging.info("Delete on namespace %s was successful" % str(selected_namespace))
except Exception as e:
logging.info("Delete on namespace %s was unsuccessful" % str(selected_namespace))
logging.info("Namespace action error: " + str(e))
sys.exit(1)
namespaces.remove(selected_namespace)
logging.info("Waiting %s seconds between namespace deletions" % str(run_sleep))
time.sleep(run_sleep)
logging.info("Waiting for the specified duration: %s" % wait_duration)
time.sleep(wait_duration)
if len(scenario_config) > 1:
try:
failed_post_scenarios = post_actions.check_recovery(
kubeconfig_path, scenario_config, failed_post_scenarios, pre_action_output
logging.error(
"Please set either namespace to blank ('') or label_selector to blank ('') to continue"
)
# removed_exit
# sys.exit(1)
raise RuntimeError()
delete_count = scenario.get("delete_count", 1)
run_count = scenario.get("runs", 1)
run_sleep = scenario.get("sleep", 10)
wait_time = scenario.get("wait_time", 30)
killed_namespaces = []
start_time = int(time.time())
for i in range(run_count):
namespaces = kubecli.check_namespaces([scenario_namespace], scenario_label)
for j in range(delete_count):
if len(namespaces) == 0:
logging.error(
"Couldn't delete %s namespaces, not enough namespaces matching %s with label %s"
% (str(run_count), scenario_namespace, str(scenario_label))
)
# removed_exit
# sys.exit(1)
raise RuntimeError()
selected_namespace = namespaces[random.randint(0, len(namespaces) - 1)]
killed_namespaces.append(selected_namespace)
try:
kubecli.delete_namespace(selected_namespace)
logging.info("Delete on namespace %s was successful" % str(selected_namespace))
except Exception as e:
logging.error("Failed to run post action checks: %s" % e)
sys.exit(1)
else:
failed_post_scenarios = check_active_namespace(killed_namespaces, wait_time, kubecli)
end_time = int(time.time())
cerberus.publish_kraken_status(config, failed_post_scenarios, start_time, end_time)
logging.info("Delete on namespace %s was unsuccessful" % str(selected_namespace))
logging.info("Namespace action error: " + str(e))
# removed_exit
# sys.exit(1)
raise RuntimeError()
namespaces.remove(selected_namespace)
logging.info("Waiting %s seconds between namespace deletions" % str(run_sleep))
time.sleep(run_sleep)
logging.info("Waiting for the specified duration: %s" % wait_duration)
time.sleep(wait_duration)
if len(scenario_config) > 1:
try:
failed_post_scenarios = post_actions.check_recovery(
kubeconfig_path, scenario_config, failed_post_scenarios, pre_action_output
)
except Exception as e:
logging.error("Failed to run post action checks: %s" % e)
# removed_exit
# sys.exit(1)
raise RuntimeError()
else:
failed_post_scenarios = check_active_namespace(killed_namespaces, wait_time, kubecli)
end_time = int(time.time())
cerberus.publish_kraken_status(config, failed_post_scenarios, start_time, end_time)
except (Exception, RuntimeError):
scenario_telemetry.exitStatus = 1
failed_scenarios.append(scenario_config[0])
telemetry.log_exception(scenario_config[0])
else:
scenario_telemetry.exitStatus = 0
scenario_telemetry.endTimeStamp = time.time()
scenario_telemetries.append(scenario_telemetry)
return failed_scenarios, scenario_telemetries
# krkn_lib_kubernetes
def check_active_namespace(killed_namespaces, wait_time, kubecli: krkn_lib_kubernetes.KrknLibKubernetes):

View File

@@ -8,88 +8,104 @@ import krkn_lib_kubernetes
from jinja2 import Environment, FileSystemLoader
import kraken.cerberus.setup as cerberus
import kraken.node_actions.common_node_functions as common_node_functions
from krkn_lib_kubernetes import ScenarioTelemetry, KrknTelemetry
# krkn_lib_kubernetes
# Reads the scenario config and introduces traffic variations in Node's host network interface.
def run(scenarios_list, config, wait_duration, kubecli: krkn_lib_kubernetes.KrknLibKubernetes):
def run(scenarios_list, config, wait_duration, kubecli: krkn_lib_kubernetes.KrknLibKubernetes, telemetry: KrknTelemetry) -> (list[str], list[ScenarioTelemetry]):
failed_post_scenarios = ""
logging.info("Runing the Network Chaos tests")
failed_post_scenarios = ""
scenario_telemetries: list[ScenarioTelemetry] = []
failed_scenarios = []
for net_config in scenarios_list:
with open(net_config, "r") as file:
param_lst = ["latency", "loss", "bandwidth"]
test_config = yaml.safe_load(file)
test_dict = test_config["network_chaos"]
test_duration = int(test_dict.get("duration", 300))
test_interface = test_dict.get("interfaces", [])
test_node = test_dict.get("node_name", "")
test_node_label = test_dict.get("label_selector", "node-role.kubernetes.io/master")
test_execution = test_dict.get("execution", "serial")
test_instance_count = test_dict.get("instance_count", 1)
test_egress = test_dict.get("egress", {"bandwidth": "100mbit"})
if test_node:
node_name_list = test_node.split(",")
else:
node_name_list = [test_node]
nodelst = []
for single_node_name in node_name_list:
nodelst.extend(common_node_functions.get_node(single_node_name, test_node_label, test_instance_count, kubecli))
file_loader = FileSystemLoader(os.path.abspath(os.path.dirname(__file__)))
env = Environment(loader=file_loader, autoescape=True)
pod_template = env.get_template("pod.j2")
test_interface = verify_interface(test_interface, nodelst, pod_template, kubecli)
joblst = []
egress_lst = [i for i in param_lst if i in test_egress]
chaos_config = {
"network_chaos": {
"duration": test_duration,
"interfaces": test_interface,
"node_name": ",".join(nodelst),
"execution": test_execution,
"instance_count": test_instance_count,
"egress": test_egress,
scenario_telemetry = ScenarioTelemetry()
scenario_telemetry.scenario = net_config
scenario_telemetry.startTimeStamp = time.time()
telemetry.set_parameters_base64(scenario_telemetry, net_config)
try:
with open(net_config, "r") as file:
param_lst = ["latency", "loss", "bandwidth"]
test_config = yaml.safe_load(file)
test_dict = test_config["network_chaos"]
test_duration = int(test_dict.get("duration", 300))
test_interface = test_dict.get("interfaces", [])
test_node = test_dict.get("node_name", "")
test_node_label = test_dict.get("label_selector", "node-role.kubernetes.io/master")
test_execution = test_dict.get("execution", "serial")
test_instance_count = test_dict.get("instance_count", 1)
test_egress = test_dict.get("egress", {"bandwidth": "100mbit"})
if test_node:
node_name_list = test_node.split(",")
else:
node_name_list = [test_node]
nodelst = []
for single_node_name in node_name_list:
nodelst.extend(common_node_functions.get_node(single_node_name, test_node_label, test_instance_count, kubecli))
file_loader = FileSystemLoader(os.path.abspath(os.path.dirname(__file__)))
env = Environment(loader=file_loader, autoescape=True)
pod_template = env.get_template("pod.j2")
test_interface = verify_interface(test_interface, nodelst, pod_template, kubecli)
joblst = []
egress_lst = [i for i in param_lst if i in test_egress]
chaos_config = {
"network_chaos": {
"duration": test_duration,
"interfaces": test_interface,
"node_name": ",".join(nodelst),
"execution": test_execution,
"instance_count": test_instance_count,
"egress": test_egress,
}
}
}
logging.info("Executing network chaos with config \n %s" % yaml.dump(chaos_config))
job_template = env.get_template("job.j2")
try:
for i in egress_lst:
for node in nodelst:
exec_cmd = get_egress_cmd(
test_execution, test_interface, i, test_dict["egress"], duration=test_duration
)
logging.info("Executing %s on node %s" % (exec_cmd, node))
job_body = yaml.safe_load(
job_template.render(jobname=i + str(hash(node))[:5], nodename=node, cmd=exec_cmd)
)
joblst.append(job_body["metadata"]["name"])
api_response = kubecli.create_job(job_body)
if api_response is None:
raise Exception("Error creating job")
if test_execution == "serial":
logging.info("Waiting for serial job to finish")
logging.info("Executing network chaos with config \n %s" % yaml.dump(chaos_config))
job_template = env.get_template("job.j2")
try:
for i in egress_lst:
for node in nodelst:
exec_cmd = get_egress_cmd(
test_execution, test_interface, i, test_dict["egress"], duration=test_duration
)
logging.info("Executing %s on node %s" % (exec_cmd, node))
job_body = yaml.safe_load(
job_template.render(jobname=i + str(hash(node))[:5], nodename=node, cmd=exec_cmd)
)
joblst.append(job_body["metadata"]["name"])
api_response = kubecli.create_job(job_body)
if api_response is None:
raise Exception("Error creating job")
if test_execution == "serial":
logging.info("Waiting for serial job to finish")
start_time = int(time.time())
wait_for_job(joblst[:], kubecli, test_duration + 300)
logging.info("Waiting for wait_duration %s" % wait_duration)
time.sleep(wait_duration)
end_time = int(time.time())
cerberus.publish_kraken_status(config, failed_post_scenarios, start_time, end_time)
if test_execution == "parallel":
break
if test_execution == "parallel":
logging.info("Waiting for parallel job to finish")
start_time = int(time.time())
wait_for_job(joblst[:], kubecli, test_duration + 300)
logging.info("Waiting for wait_duration %s" % wait_duration)
time.sleep(wait_duration)
end_time = int(time.time())
cerberus.publish_kraken_status(config, failed_post_scenarios, start_time, end_time)
if test_execution == "parallel":
break
if test_execution == "parallel":
logging.info("Waiting for parallel job to finish")
start_time = int(time.time())
wait_for_job(joblst[:], kubecli, test_duration + 300)
logging.info("Waiting for wait_duration %s" % wait_duration)
time.sleep(wait_duration)
end_time = int(time.time())
cerberus.publish_kraken_status(config, failed_post_scenarios, start_time, end_time)
except Exception as e:
logging.error("Network Chaos exiting due to Exception %s" % e)
sys.exit(1)
finally:
logging.info("Deleting jobs")
delete_job(joblst[:], kubecli)
except Exception as e:
logging.error("Network Chaos exiting due to Exception %s" % e)
raise RuntimeError()
finally:
logging.info("Deleting jobs")
delete_job(joblst[:], kubecli)
except (RuntimeError, Exception):
scenario_telemetry.exitStatus = 1
failed_scenarios.append(net_config)
telemetry.log_exception(net_config)
else:
scenario_telemetry.exitStatus = 0
scenario_telemetries.append(scenario_telemetry)
return failed_scenarios, scenario_telemetries
# krkn_lib_kubernetes
@@ -110,7 +126,8 @@ def verify_interface(test_interface, nodelst, template, kubecli: krkn_lib_kubern
for interface in test_interface:
if interface not in interface_lst:
logging.error("Interface %s not found in node %s interface list %s" % (interface, nodelst[pod_index], interface_lst))
sys.exit(1)
#sys.exit(1)
raise RuntimeError()
return test_interface
finally:
logging.info("Deleteing pod to query interface on node")
@@ -158,7 +175,7 @@ def delete_job(joblst, kubecli: krkn_lib_kubernetes.KrknLibKubernetes):
logging.error(pod_log)
except Exception:
logging.warning("Exception in getting job status")
api_response = kubecli.delete_job(name=jobname, namespace="default")
kubecli.delete_job(name=jobname, namespace="default")
def get_egress_cmd(execution, test_interface, mod, vallst, duration=30):

View File

@@ -27,7 +27,9 @@ class AWS:
logging.error(
"Failed to start node instance %s. Encountered following " "exception: %s." % (instance_id, e)
)
sys.exit(1)
# removed_exit
# sys.exit(1)
raise RuntimeError()
# Stop the node instance
def stop_instances(self, instance_id):
@@ -36,7 +38,9 @@ class AWS:
logging.info("EC2 instance: " + str(instance_id) + " stopped")
except Exception as e:
logging.error("Failed to stop node instance %s. Encountered following " "exception: %s." % (instance_id, e))
sys.exit(1)
# removed_exit
# sys.exit(1)
raise RuntimeError()
# Terminate the node instance
def terminate_instances(self, instance_id):
@@ -47,7 +51,9 @@ class AWS:
logging.error(
"Failed to terminate node instance %s. Encountered following " "exception: %s." % (instance_id, e)
)
sys.exit(1)
# removed_exit
# sys.exit(1)
raise RuntimeError()
# Reboot the node instance
def reboot_instances(self, instance_id):
@@ -58,7 +64,9 @@ class AWS:
logging.error(
"Failed to reboot node instance %s. Encountered following " "exception: %s." % (instance_id, e)
)
sys.exit(1)
# removed_exit
# sys.exit(1)
raise RuntimeError()
# Below functions poll EC2.Client.describe_instances() every 15 seconds
# until a successful state is reached. An error is returned after 40 failed checks
@@ -102,7 +110,9 @@ class AWS:
"Failed to create the default network_acl: %s"
"Make sure you have aws cli configured on the host and set for the region of your vpc/subnet" % (e)
)
sys.exit(1)
# removed_exit
# sys.exit(1)
raise RuntimeError()
return acl_id
# Replace network acl association
@@ -114,7 +124,9 @@ class AWS:
new_association_id = status["NewAssociationId"]
except Exception as e:
logging.error("Failed to replace network acl association: %s" % (e))
sys.exit(1)
# removed_exit
# sys.exit(1)
raise RuntimeError()
return new_association_id
# Describe network acl
@@ -131,7 +143,9 @@ class AWS:
"Failed to describe network acl: %s."
"Make sure you have aws cli configured on the host and set for the region of your vpc/subnet" % (e)
)
sys.exit(1)
# removed_exit
# sys.exit(1)
raise RuntimeError()
associations = response["NetworkAcls"][0]["Associations"]
# grab the current network_acl in use
original_acl_id = response["NetworkAcls"][0]["Associations"][0]["NetworkAclId"]
@@ -148,7 +162,9 @@ class AWS:
"Make sure you have aws cli configured on the host and set for the region of your vpc/subnet"
% (acl_id, e)
)
sys.exit(1)
# removed_exit
# sys.exit(1)
raise RuntimeError()
# krkn_lib_kubernetes
class aws_node_scenarios(abstract_node_scenarios):
@@ -173,7 +189,9 @@ class aws_node_scenarios(abstract_node_scenarios):
"Failed to start node instance. Encountered following " "exception: %s. Test Failed" % (e)
)
logging.error("node_start_scenario injection failed!")
sys.exit(1)
# removed_exit
# sys.exit(1)
raise RuntimeError()
# Node scenario to stop the node
def node_stop_scenario(self, instance_kill_count, node, timeout):
@@ -189,7 +207,9 @@ class aws_node_scenarios(abstract_node_scenarios):
except Exception as e:
logging.error("Failed to stop node instance. Encountered following exception: %s. " "Test Failed" % (e))
logging.error("node_stop_scenario injection failed!")
sys.exit(1)
# removed_exit
# sys.exit(1)
raise RuntimeError()
# Node scenario to terminate the node
def node_termination_scenario(self, instance_kill_count, node, timeout):
@@ -213,7 +233,9 @@ class aws_node_scenarios(abstract_node_scenarios):
"Failed to terminate node instance. Encountered following exception:" " %s. Test Failed" % (e)
)
logging.error("node_termination_scenario injection failed!")
sys.exit(1)
# removed_exit
# sys.exit(1)
raise RuntimeError()
# Node scenario to reboot the node
def node_reboot_scenario(self, instance_kill_count, node, timeout):
@@ -232,4 +254,6 @@ class aws_node_scenarios(abstract_node_scenarios):
"Failed to reboot node instance. Encountered following exception:" " %s. Test Failed" % (e)
)
logging.error("node_reboot_scenario injection failed!")
sys.exit(1)
# removed_exit
# sys.exit(1)
raise RuntimeError()

View File

@@ -39,7 +39,9 @@ class Azure:
logging.info("vm name " + str(vm_name) + " started")
except Exception as e:
logging.error("Failed to start node instance %s. Encountered following " "exception: %s." % (vm_name, e))
sys.exit(1)
# removed_exit
# sys.exit(1)
raise RuntimeError()
# Stop the node instance
def stop_instances(self, group_name, vm_name):
@@ -48,7 +50,9 @@ class Azure:
logging.info("vm name " + str(vm_name) + " stopped")
except Exception as e:
logging.error("Failed to stop node instance %s. Encountered following " "exception: %s." % (vm_name, e))
sys.exit(1)
# removed_exit
# sys.exit(1)
raise RuntimeError()
# Terminate the node instance
def terminate_instances(self, group_name, vm_name):
@@ -59,7 +63,9 @@ class Azure:
logging.error(
"Failed to terminate node instance %s. Encountered following " "exception: %s." % (vm_name, e)
)
sys.exit(1)
# removed_exit
# sys.exit(1)
raise RuntimeError()
# Reboot the node instance
def reboot_instances(self, group_name, vm_name):
@@ -68,7 +74,9 @@ class Azure:
logging.info("vm name " + str(vm_name) + " rebooted")
except Exception as e:
logging.error("Failed to reboot node instance %s. Encountered following " "exception: %s." % (vm_name, e))
sys.exit(1)
# removed_exit
# sys.exit(1)
raise RuntimeError()
def get_vm_status(self, resource_group, vm_name):
statuses = self.compute_client.virtual_machines.instance_view(resource_group, vm_name).statuses
@@ -145,7 +153,9 @@ class azure_node_scenarios(abstract_node_scenarios):
"Failed to start node instance. Encountered following " "exception: %s. Test Failed" % (e)
)
logging.error("node_start_scenario injection failed!")
sys.exit(1)
# removed_exit
# sys.exit(1)
raise RuntimeError()
# Node scenario to stop the node
def node_stop_scenario(self, instance_kill_count, node, timeout):
@@ -161,7 +171,9 @@ class azure_node_scenarios(abstract_node_scenarios):
except Exception as e:
logging.error("Failed to stop node instance. Encountered following exception: %s. " "Test Failed" % e)
logging.error("node_stop_scenario injection failed!")
sys.exit(1)
# removed_exit
# sys.exit(1)
raise RuntimeError()
# Node scenario to terminate the node
def node_termination_scenario(self, instance_kill_count, node, timeout):
@@ -185,7 +197,9 @@ class azure_node_scenarios(abstract_node_scenarios):
"Failed to terminate node instance. Encountered following exception:" " %s. Test Failed" % (e)
)
logging.error("node_termination_scenario injection failed!")
sys.exit(1)
# removed_exit
# sys.exit(1)
raise RuntimeError()
# Node scenario to reboot the node
def node_reboot_scenario(self, instance_kill_count, node, timeout):
@@ -204,4 +218,6 @@ class azure_node_scenarios(abstract_node_scenarios):
"Failed to reboot node instance. Encountered following exception:" " %s. Test Failed" % (e)
)
logging.error("node_reboot_scenario injection failed!")
sys.exit(1)
# removed_exit
# sys.exit(1)
raise RuntimeError()

View File

@@ -45,7 +45,9 @@ class GCP:
logging.error(
"Failed to start node instance %s. Encountered following " "exception: %s." % (instance_id, e)
)
sys.exit(1)
# removed_exit
# sys.exit(1)
raise RuntimeError()
# Stop the node instance
def stop_instances(self, zone, instance_id):
@@ -54,7 +56,9 @@ class GCP:
logging.info("vm name " + str(instance_id) + " stopped")
except Exception as e:
logging.error("Failed to stop node instance %s. Encountered following " "exception: %s." % (instance_id, e))
sys.exit(1)
# removed_exit
# sys.exit(1)
raise RuntimeError()
# Start the node instance
def suspend_instances(self, zone, instance_id):
@@ -65,7 +69,9 @@ class GCP:
logging.error(
"Failed to suspend node instance %s. Encountered following " "exception: %s." % (instance_id, e)
)
sys.exit(1)
# removed_exit
# sys.exit(1)
raise RuntimeError()
# Terminate the node instance
def terminate_instances(self, zone, instance_id):
@@ -76,7 +82,9 @@ class GCP:
logging.error(
"Failed to start node instance %s. Encountered following " "exception: %s." % (instance_id, e)
)
sys.exit(1)
# removed_exit
# sys.exit(1)
raise RuntimeError()
# Reboot the node instance
def reboot_instances(self, zone, instance_id):
@@ -87,7 +95,9 @@ class GCP:
logging.error(
"Failed to start node instance %s. Encountered following " "exception: %s." % (instance_id, e)
)
sys.exit(1)
# removed_exit
# sys.exit(1)
raise RuntimeError()
# Get instance status
def get_instance_status(self, zone, instance_id, expected_status, timeout):
@@ -156,7 +166,9 @@ class gcp_node_scenarios(abstract_node_scenarios):
"Failed to start node instance. Encountered following " "exception: %s. Test Failed" % (e)
)
logging.error("node_start_scenario injection failed!")
sys.exit(1)
# removed_exit
# sys.exit(1)
raise RuntimeError()
# Node scenario to stop the node
def node_stop_scenario(self, instance_kill_count, node, timeout):
@@ -173,7 +185,9 @@ class gcp_node_scenarios(abstract_node_scenarios):
except Exception as e:
logging.error("Failed to stop node instance. Encountered following exception: %s. " "Test Failed" % (e))
logging.error("node_stop_scenario injection failed!")
sys.exit(1)
# removed_exit
# sys.exit(1)
raise RuntimeError()
# Node scenario to terminate the node
def node_termination_scenario(self, instance_kill_count, node, timeout):
@@ -197,7 +211,9 @@ class gcp_node_scenarios(abstract_node_scenarios):
"Failed to terminate node instance. Encountered following exception:" " %s. Test Failed" % e
)
logging.error("node_termination_scenario injection failed!")
sys.exit(1)
# removed_exit
# sys.exit(1)
raise RuntimeError()
# Node scenario to reboot the node
def node_reboot_scenario(self, instance_kill_count, node, timeout):
@@ -215,4 +231,6 @@ class gcp_node_scenarios(abstract_node_scenarios):
"Failed to reboot node instance. Encountered following exception:" " %s. Test Failed" % (e)
)
logging.error("node_reboot_scenario injection failed!")
sys.exit(1)
# removed_exit
# sys.exit(1)
raise RuntimeError()

View File

@@ -24,7 +24,9 @@ class OPENSTACKCLOUD:
logging.info("Instance: " + str(node) + " started")
except Exception as e:
logging.error("Failed to start node instance %s. Encountered following " "exception: %s." % (node, e))
sys.exit(1)
# removed_exit
# sys.exit(1)
raise RuntimeError()
# Stop the node instance
def stop_instances(self, node):
@@ -33,7 +35,9 @@ class OPENSTACKCLOUD:
logging.info("Instance: " + str(node) + " stopped")
except Exception as e:
logging.error("Failed to stop node instance %s. Encountered following " "exception: %s." % (node, e))
sys.exit(1)
# removed_exit
# sys.exit(1)
raise RuntimeError()
# Reboot the node instance
def reboot_instances(self, node):
@@ -42,7 +46,9 @@ class OPENSTACKCLOUD:
logging.info("Instance: " + str(node) + " rebooted")
except Exception as e:
logging.error("Failed to reboot node instance %s. Encountered following " "exception: %s." % (node, e))
sys.exit(1)
# removed_exit
# sys.exit(1)
raise RuntimeError()
# Wait until the node instance is running
def wait_until_running(self, node, timeout):
@@ -109,7 +115,9 @@ class openstack_node_scenarios(abstract_node_scenarios):
"Failed to start node instance. Encountered following " "exception: %s. Test Failed" % (e)
)
logging.error("node_start_scenario injection failed!")
sys.exit(1)
# removed_exit
# sys.exit(1)
raise RuntimeError()
# Node scenario to stop the node
def node_stop_scenario(self, instance_kill_count, node, timeout):
@@ -125,7 +133,9 @@ class openstack_node_scenarios(abstract_node_scenarios):
except Exception as e:
logging.error("Failed to stop node instance. Encountered following exception: %s. " "Test Failed" % (e))
logging.error("node_stop_scenario injection failed!")
sys.exit(1)
# removed_exit
# sys.exit(1)
raise RuntimeError()
# Node scenario to reboot the node
def node_reboot_scenario(self, instance_kill_count, node, timeout):
@@ -144,7 +154,9 @@ class openstack_node_scenarios(abstract_node_scenarios):
"Failed to reboot node instance. Encountered following exception:" " %s. Test Failed" % (e)
)
logging.error("node_reboot_scenario injection failed!")
sys.exit(1)
# removed_exit
# sys.exit(1)
raise RuntimeError()
# Node scenario to start the node
def helper_node_start_scenario(self, instance_kill_count, node_ip, timeout):
@@ -162,7 +174,9 @@ class openstack_node_scenarios(abstract_node_scenarios):
"Failed to start node instance. Encountered following " "exception: %s. Test Failed" % (e)
)
logging.error("helper_node_start_scenario injection failed!")
sys.exit(1)
# removed_exit
# sys.exit(1)
raise RuntimeError()
# Node scenario to stop the node
def helper_node_stop_scenario(self, instance_kill_count, node_ip, timeout):
@@ -177,7 +191,9 @@ class openstack_node_scenarios(abstract_node_scenarios):
except Exception as e:
logging.error("Failed to stop node instance. Encountered following exception: %s. " "Test Failed" % (e))
logging.error("helper_node_stop_scenario injection failed!")
sys.exit(1)
# removed_exit
# sys.exit(1)
raise RuntimeError()
def helper_node_service_status(self, node_ip, service, ssh_private_key, timeout):
try:
@@ -188,4 +204,6 @@ class openstack_node_scenarios(abstract_node_scenarios):
except Exception as e:
logging.error("Failed to check service status. Encountered following exception:" " %s. Test Failed" % (e))
logging.error("helper_node_service_status injection failed!")
sys.exit(1)
# removed_exit
# sys.exit(1)
raise RuntimeError()

View File

@@ -13,7 +13,7 @@ from kraken.node_actions.bm_node_scenarios import bm_node_scenarios
from kraken.node_actions.docker_node_scenarios import docker_node_scenarios
import kraken.node_actions.common_node_functions as common_node_functions
import kraken.cerberus.setup as cerberus
from krkn_lib_kubernetes import ScenarioTelemetry, KrknTelemetry
node_general = False
@@ -53,8 +53,14 @@ def get_node_scenario_object(node_scenario, kubecli: krkn_lib_kubernetes.KrknLib
# Run defined scenarios
# krkn_lib_kubernetes
def run(scenarios_list, config, wait_duration, kubecli: krkn_lib_kubernetes.KrknLibKubernetes):
def run(scenarios_list, config, wait_duration, kubecli: krkn_lib_kubernetes.KrknLibKubernetes, telemetry: KrknTelemetry) -> (list[str], list[ScenarioTelemetry]):
scenario_telemetries: list[ScenarioTelemetry] = []
failed_scenarios = []
for node_scenario_config in scenarios_list:
scenario_telemetry = ScenarioTelemetry()
scenario_telemetry.scenario = node_scenario_config
scenario_telemetry.startTimeStamp = time.time()
telemetry.set_parameters_base64(scenario_telemetry, node_scenario_config)
with open(node_scenario_config, "r") as f:
node_scenario_config = yaml.full_load(f)
for node_scenario in node_scenario_config["node_scenarios"]:
@@ -62,12 +68,24 @@ def run(scenarios_list, config, wait_duration, kubecli: krkn_lib_kubernetes.Krkn
if node_scenario["actions"]:
for action in node_scenario["actions"]:
start_time = int(time.time())
inject_node_scenario(action, node_scenario, node_scenario_object, kubecli)
logging.info("Waiting for the specified duration: %s" % (wait_duration))
time.sleep(wait_duration)
end_time = int(time.time())
cerberus.get_status(config, start_time, end_time)
logging.info("")
try:
inject_node_scenario(action, node_scenario, node_scenario_object, kubecli)
logging.info("Waiting for the specified duration: %s" % (wait_duration))
time.sleep(wait_duration)
end_time = int(time.time())
cerberus.get_status(config, start_time, end_time)
logging.info("")
except (RuntimeError, Exception) as e:
scenario_telemetry.exitStatus = 1
failed_scenarios.append(node_scenario_config)
telemetry.log_exception(node_scenario_config)
else:
scenario_telemetry.exitStatus = 0
scenario_telemetry.endTimeStamp = time.time()
scenario_telemetries.append(scenario_telemetry)
return failed_scenarios, scenario_telemetries
# Inject the specified node scenario

View File

@@ -13,6 +13,7 @@ from kraken.plugins.run_python_plugin import run_python_file
from kraken.plugins.network.ingress_shaping import network_chaos
from kraken.plugins.pod_network_outage.pod_network_outage_plugin import pod_outage
from kraken.plugins.pod_network_outage.pod_network_outage_plugin import pod_egress_shaping
from krkn_lib_kubernetes import ScenarioTelemetry, KrknTelemetry
@dataclasses.dataclass
@@ -225,16 +226,25 @@ PLUGINS = Plugins(
)
def run(scenarios: List[str], kubeconfig_path: str, kraken_config: str, failed_post_scenarios: List[str], wait_duration: int) -> List[str]:
def run(scenarios: List[str], kubeconfig_path: str, kraken_config: str, failed_post_scenarios: List[str], wait_duration: int, telemetry: KrknTelemetry) -> (List[str], list[ScenarioTelemetry]):
scenario_telemetries: list[ScenarioTelemetry] = []
for scenario in scenarios:
scenario_telemetry = ScenarioTelemetry()
scenario_telemetry.scenario = scenario
scenario_telemetry.startTimeStamp = time.time()
telemetry.set_parameters_base64(scenario_telemetry, scenario)
logging.info('scenario '+ str(scenario))
try:
PLUGINS.run(scenario, kubeconfig_path, kraken_config)
except Exception as e:
scenario_telemetry.exitStatus = 1
failed_post_scenarios.append(scenario)
logging.error("Error while running {}: {}".format(scenario, e))
return failed_post_scenarios
logging.info("Waiting for the specified duration: %s" % (wait_duration))
time.sleep(wait_duration)
telemetry.log_exception(scenario)
else:
scenario_telemetry.exitStatus = 0
logging.info("Waiting for the specified duration: %s" % (wait_duration))
time.sleep(wait_duration)
scenario_telemetries.append(scenario_telemetry)
scenario_telemetry.endTimeStamp = time.time()
return failed_post_scenarios
return failed_post_scenarios, scenario_telemetries

View File

@@ -10,7 +10,7 @@ import time
import yaml
import sys
import random
from krkn_lib_kubernetes import ScenarioTelemetry, KrknTelemetry
# Run pod based scenarios
def run(kubeconfig_path, scenarios_list, config, failed_post_scenarios, wait_duration):
@@ -67,8 +67,22 @@ def run(kubeconfig_path, scenarios_list, config, failed_post_scenarios, wait_dur
return failed_post_scenarios
# krkn_lib_kubernetes
def container_run(kubeconfig_path, scenarios_list, config, failed_post_scenarios, wait_duration, kubecli: krkn_lib_kubernetes.KrknLibKubernetes):
def container_run(kubeconfig_path,
scenarios_list,
config,
failed_post_scenarios,
wait_duration,
kubecli: krkn_lib_kubernetes.KrknLibKubernetes,
telemetry: KrknTelemetry) -> (list[str], list[ScenarioTelemetry]):
failed_scenarios = []
scenario_telemetries: list[ScenarioTelemetry] = []
for container_scenario_config in scenarios_list:
scenario_telemetry = ScenarioTelemetry()
scenario_telemetry.scenario = container_scenario_config[0]
scenario_telemetry.startTimeStamp = time.time()
telemetry.set_parameters_base64(scenario_telemetry, container_scenario_config[0])
if len(container_scenario_config) > 1:
pre_action_output = post_actions.run(kubeconfig_path, container_scenario_config[1])
else:
@@ -78,30 +92,41 @@ def container_run(kubeconfig_path, scenarios_list, config, failed_post_scenarios
for cont_scenario in cont_scenario_config["scenarios"]:
# capture start time
start_time = int(time.time())
killed_containers = container_killing_in_pod(cont_scenario, kubecli)
if len(container_scenario_config) > 1:
try:
try:
killed_containers = container_killing_in_pod(cont_scenario, kubecli)
if len(container_scenario_config) > 1:
failed_post_scenarios = post_actions.check_recovery(
kubeconfig_path, container_scenario_config, failed_post_scenarios, pre_action_output
kubeconfig_path,
container_scenario_config,
failed_post_scenarios,
pre_action_output
)
except Exception as e:
logging.error("Failed to run post action checks: %s" % e)
sys.exit(1)
else:
failed_post_scenarios = check_failed_containers(
killed_containers, cont_scenario.get("retry_wait", 120), kubecli
)
logging.info("Waiting for the specified duration: %s" % (wait_duration))
time.sleep(wait_duration)
# capture end time
end_time = int(time.time())
# publish cerberus status
cerberus.publish_kraken_status(config, failed_post_scenarios, start_time, end_time)
except (RuntimeError, Exception):
failed_scenarios.append(container_scenario_config[0])
telemetry.log_exception(container_scenario_config[0])
scenario_telemetry.exitStatus = 1
# removed_exit
# sys.exit(1)
else:
failed_post_scenarios = check_failed_containers(
killed_containers, cont_scenario.get("retry_wait", 120), kubecli
)
scenario_telemetry.exitStatus = 0
scenario_telemetry.endTimeStamp = time.time()
scenario_telemetries.append(scenario_telemetry)
logging.info("Waiting for the specified duration: %s" % (wait_duration))
time.sleep(wait_duration)
return failed_scenarios, scenario_telemetries
# capture end time
end_time = int(time.time())
# publish cerberus status
cerberus.publish_kraken_status(config, failed_post_scenarios, start_time, end_time)
logging.info("")
def container_killing_in_pod(cont_scenario, kubecli: krkn_lib_kubernetes.KrknLibKubernetes):
@@ -114,7 +139,9 @@ def container_killing_in_pod(cont_scenario, kubecli: krkn_lib_kubernetes.KrknLib
kill_count = cont_scenario.get("count", 1)
if type(pod_names) != list:
logging.error("Please make sure your pod_names are in a list format")
sys.exit(1)
# removed_exit
# sys.exit(1)
raise RuntimeError()
if len(pod_names) == 0:
if namespace == "*":
# returns double array of pod name and namespace
@@ -126,7 +153,9 @@ def container_killing_in_pod(cont_scenario, kubecli: krkn_lib_kubernetes.KrknLib
if namespace == "*":
logging.error("You must specify the namespace to kill a container in a specific pod")
logging.error("Scenario " + scenario_name + " failed")
sys.exit(1)
# removed_exit
# sys.exit(1)
raise RuntimeError()
pods = pod_names
# get container and pod name
container_pod_list = []
@@ -147,7 +176,9 @@ def container_killing_in_pod(cont_scenario, kubecli: krkn_lib_kubernetes.KrknLib
if len(container_pod_list) == 0:
logging.error("Trying to kill more containers than were found, try lowering kill count")
logging.error("Scenario " + scenario_name + " failed")
sys.exit(1)
# removed_exit
# sys.exit(1)
raise RuntimeError()
selected_container_pod = container_pod_list[random.randint(0, len(container_pod_list) - 1)]
for c_name in selected_container_pod[2]:
if container_name != "":
@@ -178,6 +209,7 @@ def retry_container_killing(kill_action, podname, namespace, container_name, kub
time.sleep(2)
continue
else:
logging.warning(response)
continue

View File

@@ -7,150 +7,233 @@ import krkn_lib_kubernetes
import yaml
from ..cerberus import setup as cerberus
from krkn_lib_kubernetes import ScenarioTelemetry, KrknTelemetry
# krkn_lib_kubernetes
def run(scenarios_list, config, kubecli: krkn_lib_kubernetes.KrknLibKubernetes):
def run(scenarios_list, config, kubecli: krkn_lib_kubernetes.KrknLibKubernetes, telemetry: KrknTelemetry) -> (list[str], list[ScenarioTelemetry]):
"""
Reads the scenario config and creates a temp file to fill up the PVC
"""
failed_post_scenarios = ""
scenario_telemetries: list[ScenarioTelemetry] = []
failed_scenarios = []
for app_config in scenarios_list:
if len(app_config) > 1:
with open(app_config, "r") as f:
config_yaml = yaml.full_load(f)
scenario_config = config_yaml["pvc_scenario"]
pvc_name = scenario_config.get("pvc_name", "")
pod_name = scenario_config.get("pod_name", "")
namespace = scenario_config.get("namespace", "")
target_fill_percentage = scenario_config.get(
"fill_percentage", "50"
)
duration = scenario_config.get("duration", 60)
scenario_telemetry = ScenarioTelemetry()
scenario_telemetry.scenario = app_config
scenario_telemetry.startTimeStamp = time.time()
telemetry.set_parameters_base64(scenario_telemetry, app_config)
try:
if len(app_config) > 1:
with open(app_config, "r") as f:
config_yaml = yaml.full_load(f)
scenario_config = config_yaml["pvc_scenario"]
pvc_name = scenario_config.get("pvc_name", "")
pod_name = scenario_config.get("pod_name", "")
namespace = scenario_config.get("namespace", "")
target_fill_percentage = scenario_config.get(
"fill_percentage", "50"
)
duration = scenario_config.get("duration", 60)
logging.info(
"Input params:\n"
"pvc_name: '%s'\n"
"pod_name: '%s'\n"
"namespace: '%s'\n"
"target_fill_percentage: '%s%%'\nduration: '%ss'"
% (
str(pvc_name),
str(pod_name),
str(namespace),
str(target_fill_percentage),
str(duration)
)
)
# Check input params
if namespace is None:
logging.error(
"You must specify the namespace where the PVC is"
)
sys.exit(1)
if pvc_name is None and pod_name is None:
logging.error(
"You must specify the pvc_name or the pod_name"
)
sys.exit(1)
if pvc_name and pod_name:
logging.info(
"pod_name will be ignored, pod_name used will be "
"a retrieved from the pod used in the pvc_name"
"Input params:\n"
"pvc_name: '%s'\n"
"pod_name: '%s'\n"
"namespace: '%s'\n"
"target_fill_percentage: '%s%%'\nduration: '%ss'"
% (
str(pvc_name),
str(pod_name),
str(namespace),
str(target_fill_percentage),
str(duration)
)
)
# Get pod name
if pvc_name:
if pod_name:
logging.info(
"pod_name '%s' will be overridden with one of "
"the pods mounted in the PVC" % (str(pod_name))
)
pvc = kubecli.get_pvc_info(pvc_name, namespace)
try:
# random generator not used for
# security/cryptographic purposes.
pod_name = random.choice(pvc.podNames) # nosec
logging.info("Pod name: %s" % pod_name)
except Exception:
# Check input params
if namespace is None:
logging.error(
"Pod associated with %s PVC, on namespace %s, "
"not found" % (str(pvc_name), str(namespace))
"You must specify the namespace where the PVC is"
)
sys.exit(1)
# Get volume name
pod = kubecli.get_pod_info(name=pod_name, namespace=namespace)
if pod is None:
logging.error(
"Exiting as pod '%s' doesn't exist "
"in namespace '%s'" % (
str(pod_name),
str(namespace)
#sys.exit(1)
raise RuntimeError()
if pvc_name is None and pod_name is None:
logging.error(
"You must specify the pvc_name or the pod_name"
)
# sys.exit(1)
raise RuntimeError()
if pvc_name and pod_name:
logging.info(
"pod_name will be ignored, pod_name used will be "
"a retrieved from the pod used in the pvc_name"
)
)
sys.exit(1)
for volume in pod.volumes:
if volume.pvcName is not None:
volume_name = volume.name
pvc_name = volume.pvcName
# Get pod name
if pvc_name:
if pod_name:
logging.info(
"pod_name '%s' will be overridden with one of "
"the pods mounted in the PVC" % (str(pod_name))
)
pvc = kubecli.get_pvc_info(pvc_name, namespace)
break
if 'pvc' not in locals():
logging.error(
"Pod '%s' in namespace '%s' does not use a pvc" % (
try:
# random generator not used for
# security/cryptographic purposes.
pod_name = random.choice(pvc.podNames) # nosec
logging.info("Pod name: %s" % pod_name)
except Exception:
logging.error(
"Pod associated with %s PVC, on namespace %s, "
"not found" % (str(pvc_name), str(namespace))
)
# sys.exit(1)
raise RuntimeError()
# Get volume name
pod = kubecli.get_pod_info(name=pod_name, namespace=namespace)
if pod is None:
logging.error(
"Exiting as pod '%s' doesn't exist "
"in namespace '%s'" % (
str(pod_name),
str(namespace)
)
)
# sys.exit(1)
raise RuntimeError()
for volume in pod.volumes:
if volume.pvcName is not None:
volume_name = volume.name
pvc_name = volume.pvcName
pvc = kubecli.get_pvc_info(pvc_name, namespace)
break
if 'pvc' not in locals():
logging.error(
"Pod '%s' in namespace '%s' does not use a pvc" % (
str(pod_name),
str(namespace)
)
)
# sys.exit(1)
raise RuntimeError()
logging.info("Volume name: %s" % volume_name)
logging.info("PVC name: %s" % pvc_name)
# Get container name and mount path
for container in pod.containers:
for vol in container.volumeMounts:
if vol.name == volume_name:
mount_path = vol.mountPath
container_name = container.name
break
logging.info("Container path: %s" % container_name)
logging.info("Mount path: %s" % mount_path)
# Get PVC capacity and used bytes
command = "df %s -B 1024 | sed 1d" % (str(mount_path))
command_output = (
kubecli.exec_cmd_in_pod(
command,
pod_name,
namespace,
container_name,
"sh"
)
).split()
pvc_used_kb = int(command_output[2])
pvc_capacity_kb = pvc_used_kb + int(command_output[3])
logging.info("PVC used: %s KB" % pvc_used_kb)
logging.info("PVC capacity: %s KB" % pvc_capacity_kb)
# Check valid fill percentage
current_fill_percentage = pvc_used_kb / pvc_capacity_kb
if not (
current_fill_percentage * 100
< float(target_fill_percentage)
<= 99
):
logging.error(
"Target fill percentage (%.2f%%) is lower than "
"current fill percentage (%.2f%%) "
"or higher than 99%%" % (
target_fill_percentage,
current_fill_percentage * 100
)
)
# sys.exit(1)
raise RuntimeError()
# Calculate file size
file_size_kb = int(
(
float(
target_fill_percentage / 100
) * float(pvc_capacity_kb)
) - float(pvc_used_kb)
)
logging.debug("File size: %s KB" % file_size_kb)
file_name = "kraken.tmp"
logging.info(
"Creating %s file, %s KB size, in pod %s at %s (ns %s)"
% (
str(file_name),
str(file_size_kb),
str(pod_name),
str(mount_path),
str(namespace)
)
)
sys.exit(1)
logging.info("Volume name: %s" % volume_name)
logging.info("PVC name: %s" % pvc_name)
# Get container name and mount path
for container in pod.containers:
for vol in container.volumeMounts:
if vol.name == volume_name:
mount_path = vol.mountPath
container_name = container.name
break
logging.info("Container path: %s" % container_name)
logging.info("Mount path: %s" % mount_path)
# Get PVC capacity and used bytes
command = "df %s -B 1024 | sed 1d" % (str(mount_path))
command_output = (
start_time = int(time.time())
# Create temp file in the PVC
full_path = "%s/%s" % (str(mount_path), str(file_name))
command = "fallocate -l $((%s*1024)) %s" % (
str(file_size_kb),
str(full_path)
)
logging.debug(
"Create temp file in the PVC command:\n %s" % command
)
kubecli.exec_cmd_in_pod(
command,
pod_name,
namespace,
container_name,
)
).split()
pvc_used_kb = int(command_output[2])
pvc_capacity_kb = pvc_used_kb + int(command_output[3])
logging.info("PVC used: %s KB" % pvc_used_kb)
logging.info("PVC capacity: %s KB" % pvc_capacity_kb)
# Check valid fill percentage
current_fill_percentage = pvc_used_kb / pvc_capacity_kb
if not (
current_fill_percentage * 100
< float(target_fill_percentage)
<= 99
):
logging.error(
"Target fill percentage (%.2f%%) is lower than "
"current fill percentage (%.2f%%) "
"or higher than 99%%" % (
target_fill_percentage,
current_fill_percentage * 100
)
# Check if file is created
command = "ls -lh %s" % (str(mount_path))
logging.debug("Check file is created command:\n %s" % command)
response = kubecli.exec_cmd_in_pod(
command, pod_name, namespace, container_name, "sh"
)
sys.exit(1)
logging.info("\n" + str(response))
if str(file_name).lower() in str(response).lower():
logging.info(
"%s file successfully created" % (str(full_path))
)
else:
logging.error(
"Failed to create tmp file with %s size" % (
str(file_size_kb)
)
)
remove_temp_file(
file_name,
full_path,
pod_name,
namespace,
container_name,
mount_path,
file_size_kb,
kubecli
)
# sys.exit(1)
raise RuntimeError()
# Calculate file size
file_size_kb = int(
@@ -197,14 +280,13 @@ def run(scenarios_list, config, kubecli: krkn_lib_kubernetes.KrknLibKubernetes):
logging.info("\n" + str(response))
if str(file_name).lower() in str(response).lower():
logging.info(
"%s file successfully created" % (str(full_path))
)
else:
logging.error(
"Failed to create tmp file with %s size" % (
str(file_size_kb)
"Waiting for the specified duration in the config: %ss" % (
duration
)
)
time.sleep(duration)
logging.info("Finish waiting")
remove_temp_file(
file_name,
full_path,
@@ -215,35 +297,25 @@ def run(scenarios_list, config, kubecli: krkn_lib_kubernetes.KrknLibKubernetes):
file_size_kb,
kubecli
)
sys.exit(1)
# Wait for the specified duration
logging.info(
"Waiting for the specified duration in the config: %ss" % (
duration
end_time = int(time.time())
cerberus.publish_kraken_status(
config,
failed_post_scenarios,
start_time,
end_time
)
)
time.sleep(duration)
logging.info("Finish waiting")
except (RuntimeError, Exception):
scenario_telemetry.exitStatus = 1
failed_scenarios.append(app_config)
telemetry.log_exception(app_config)
else:
scenario_telemetry.exitStatus = 0
scenario_telemetries.append(scenario_telemetry)
return failed_scenarios, scenario_telemetries
remove_temp_file(
file_name,
full_path,
pod_name,
namespace,
container_name,
mount_path,
file_size_kb,
kubecli
)
end_time = int(time.time())
cerberus.publish_kraken_status(
config,
failed_post_scenarios,
start_time,
end_time
)
# krkn_lib_kubernetes
@@ -275,7 +347,7 @@ def remove_temp_file(
logging.error(
"Failed to delete tmp file with %s size" % (str(file_size_kb))
)
sys.exit(1)
raise RuntimeError()
def toKbytes(value):
@@ -284,7 +356,7 @@ def toKbytes(value):
"PVC capacity %s does not match expression "
"regexp '^[0-9]+[K|M|G|T]i$'"
)
sys.exit(1)
raise RuntimeError()
unit = {"K": 0, "M": 1, "G": 2, "T": 3}
base = 1024 if ("i" in value) else 1000
exp = unit[value[-2:-1]]
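For reference, a minimal standalone sketch (not part of the diff) of the conversion toKbytes performs, assuming the Kubernetes quantity format checked by the regexp above; the function name and the optional decimal suffixes are illustrative only:

import re

def to_kbytes(value: str) -> int:
    # Convert a quantity such as "10Gi" or "500M" into KB: base 1024 for the
    # binary ("i") suffixes, base 1000 otherwise; a malformed value raises
    # RuntimeError in place of the removed sys.exit(1).
    if not re.match(r"^[0-9]+[KMGT]i?$", value):
        raise RuntimeError("PVC capacity %s does not match the expected regexp" % value)
    unit = {"K": 0, "M": 1, "G": 2, "T": 3}
    base = 1024 if value.endswith("i") else 1000
    suffix = value[-2] if value.endswith("i") else value[-1]
    digits = value[:-2] if value.endswith("i") else value[:-1]
    return int(digits) * base ** unit[suffix]

# e.g. to_kbytes("10Gi") == 10 * 1024 * 1024   (10 GiB expressed in KiB)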

View File

@@ -1,5 +1,5 @@
#!/usr/bin/env python
import os
import sys
import yaml
import logging
@@ -13,7 +13,7 @@ from ..node_actions.aws_node_scenarios import AWS
from ..node_actions.openstack_node_scenarios import OPENSTACKCLOUD
from ..node_actions.az_node_scenarios import Azure
from ..node_actions.gcp_node_scenarios import GCP
from krkn_lib_kubernetes import ScenarioTelemetry, KrknTelemetry
def multiprocess_nodes(cloud_object_function, nodes):
try:
@@ -59,7 +59,9 @@ def cluster_shut_down(shut_down_config, kubecli: krkn_lib_kubernetes.KrknLibKube
"Cloud type %s is not currently supported for cluster shut down" %
cloud_type
)
sys.exit(1)
# removed_exit
# sys.exit(1)
raise RuntimeError()
nodes = kubecli.list_nodes()
node_id = []
@@ -128,9 +130,16 @@ def cluster_shut_down(shut_down_config, kubecli: krkn_lib_kubernetes.KrknLibKube
# krkn_lib_kubernetes
def run(scenarios_list, config, wait_duration, kubecli: krkn_lib_kubernetes.KrknLibKubernetes):
def run(scenarios_list, config, wait_duration, kubecli: krkn_lib_kubernetes.KrknLibKubernetes, telemetry: KrknTelemetry) -> (list[str], list[ScenarioTelemetry]):
failed_post_scenarios = []
failed_scenarios = []
scenario_telemetries: list[ScenarioTelemetry] = []
for shut_down_config in scenarios_list:
scenario_telemetry = ScenarioTelemetry()
scenario_telemetry.scenario = shut_down_config
scenario_telemetry.startTimeStamp = time.time()
telemetry.set_parameters_base64(scenario_telemetry, shut_down_config[0])
if len(shut_down_config) > 1:
pre_action_output = post_actions.run("", shut_down_config[1])
else:
@@ -140,18 +149,32 @@ def run(scenarios_list, config, wait_duration, kubecli: krkn_lib_kubernetes.Krkn
shut_down_config_scenario = \
shut_down_config_yaml["cluster_shut_down_scenario"]
start_time = int(time.time())
cluster_shut_down(shut_down_config_scenario, kubecli)
logging.info(
"Waiting for the specified duration: %s" % (wait_duration)
)
time.sleep(wait_duration)
failed_post_scenarios = post_actions.check_recovery(
"", shut_down_config, failed_post_scenarios, pre_action_output
)
end_time = int(time.time())
cerberus.publish_kraken_status(
config,
failed_post_scenarios,
start_time,
end_time
)
try:
cluster_shut_down(shut_down_config_scenario, kubecli)
logging.info(
"Waiting for the specified duration: %s" % (wait_duration)
)
time.sleep(wait_duration)
failed_post_scenarios = post_actions.check_recovery(
"", shut_down_config, failed_post_scenarios, pre_action_output
)
end_time = int(time.time())
cerberus.publish_kraken_status(
config,
failed_post_scenarios,
start_time,
end_time
)
except (RuntimeError, Exception):
telemetry.log_exception(shut_down_config[0])
failed_scenarios.append(shut_down_config[0])
scenario_telemetry.exitStatus = 1
else:
scenario_telemetry.exitStatus = 0
scenario_telemetry.endTimeStamp = time.time()
scenario_telemetries.append(scenario_telemetry)
return failed_scenarios, scenario_telemetries
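The same per-scenario telemetry wrapping recurs in every runner touched by this change (container, node, time, cluster shutdown, namespace, zone outage, application outage, PVC, network chaos). A condensed, hypothetical sketch of the pattern, using only the ScenarioTelemetry/KrknTelemetry calls already visible above; the helper name and the execute callback are illustrative, not part of the codebase:

import time
from krkn_lib_kubernetes import ScenarioTelemetry, KrknTelemetry

def run_with_telemetry(scenario_files, telemetry: KrknTelemetry, execute):
    # execute(path) runs one scenario file and raises on failure; the wrapper
    # records parameters, timing and exit status, and collects failed paths
    failed, scenario_telemetries = [], []
    for path in scenario_files:
        scenario_telemetry = ScenarioTelemetry()
        scenario_telemetry.scenario = path
        scenario_telemetry.startTimeStamp = time.time()
        telemetry.set_parameters_base64(scenario_telemetry, path)
        try:
            execute(path)
        except Exception:
            scenario_telemetry.exitStatus = 1
            failed.append(path)
            telemetry.log_exception(path)
        else:
            scenario_telemetry.exitStatus = 0
        scenario_telemetry.endTimeStamp = time.time()
        scenario_telemetries.append(scenario_telemetry)
    return failed, scenario_telemetries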

View File

@@ -8,6 +8,7 @@ import random
import krkn_lib_kubernetes
from ..cerberus import setup as cerberus
from ..invoke import command as runcommand
from krkn_lib_kubernetes import ScenarioTelemetry, KrknTelemetry
# krkn_lib_kubernetes
def pod_exec(pod_name, command, namespace, container_name, kubecli: krkn_lib_kubernetes.KrknLibKubernetes):
@@ -93,7 +94,9 @@ def skew_time(scenario, kubecli: krkn_lib_kubernetes.KrknLibKubernetes):
for name in scenario["object_name"]:
if "namespace" not in scenario.keys():
logging.error("Need to set namespace when using pod name")
sys.exit(1)
# removed_exit
# sys.exit(1)
raise RuntimeError()
pod_names.append([name, scenario["namespace"]])
elif "namespace" in scenario.keys() and scenario["namespace"]:
if "label_selector" not in scenario.keys():
@@ -127,7 +130,9 @@ def skew_time(scenario, kubecli: krkn_lib_kubernetes.KrknLibKubernetes):
"Cannot find pods matching the namespace/label_selector, "
"please check"
)
sys.exit(1)
# removed_exit
# sys.exit(1)
raise RuntimeError()
pod_counter = 0
for pod in pod_names:
if len(pod) > 1:
@@ -152,7 +157,9 @@ def skew_time(scenario, kubecli: krkn_lib_kubernetes.KrknLibKubernetes):
"in pod %s in namespace %s"
% (selected_container_name, pod[0], pod[1])
)
sys.exit(1)
# removed_exit
# sys.exit(1)
raise RuntimeError()
pod_names[pod_counter].append(selected_container_name)
else:
selected_container_name = get_container_name(
@@ -178,7 +185,9 @@ def skew_time(scenario, kubecli: krkn_lib_kubernetes.KrknLibKubernetes):
scenario["namespace"]
)
)
sys.exit(1)
# removed_exit
# sys.exit(1)
raise RuntimeError()
pod_names[pod_counter].append(selected_container_name)
logging.info("Reset date/time on pod " + str(pod[0]))
pod_counter += 1
@@ -299,24 +308,41 @@ def check_date_time(object_type, names, kubecli: krkn_lib_kubernetes.KrknLibKube
# krkn_lib_kubernetes
def run(scenarios_list, config, wait_duration, kubecli: krkn_lib_kubernetes.KrknLibKubernetes):
def run(scenarios_list, config, wait_duration, kubecli: krkn_lib_kubernetes.KrknLibKubernetes, telemetry: KrknTelemetry) -> (list[str], list[ScenarioTelemetry]):
failed_scenarios = []
scenario_telemetries: list[ScenarioTelemetry] = []
for time_scenario_config in scenarios_list:
with open(time_scenario_config, "r") as f:
scenario_config = yaml.full_load(f)
for time_scenario in scenario_config["time_scenarios"]:
start_time = int(time.time())
object_type, object_names = skew_time(time_scenario, kubecli)
not_reset = check_date_time(object_type, object_names, kubecli)
if len(not_reset) > 0:
logging.info("Object times were not reset")
logging.info(
"Waiting for the specified duration: %s" % (wait_duration)
)
time.sleep(wait_duration)
end_time = int(time.time())
cerberus.publish_kraken_status(
config,
not_reset,
start_time,
end_time
)
scenario_telemetry = ScenarioTelemetry()
scenario_telemetry.scenario = time_scenario_config
scenario_telemetry.startTimeStamp = time.time()
telemetry.set_parameters_base64(scenario_telemetry, time_scenario_config)
try:
with open(time_scenario_config, "r") as f:
scenario_config = yaml.full_load(f)
for time_scenario in scenario_config["time_scenarios"]:
start_time = int(time.time())
object_type, object_names = skew_time(time_scenario, kubecli)
not_reset = check_date_time(object_type, object_names, kubecli)
if len(not_reset) > 0:
logging.info("Object times were not reset")
logging.info(
"Waiting for the specified duration: %s" % (wait_duration)
)
time.sleep(wait_duration)
end_time = int(time.time())
cerberus.publish_kraken_status(
config,
not_reset,
start_time,
end_time
)
except (RuntimeError, Exception):
scenario_telemetry.exitStatus = 1
telemetry.log_exception(time_scenario_config)
failed_scenarios.append(time_scenario_config)
else:
scenario_telemetry.exitStatus = 0
scenario_telemetry.endTimeStamp = time.time()
scenario_telemetries.append(scenario_telemetry)
return failed_scenarios, scenario_telemetries

View File

@@ -1,100 +1,119 @@
import yaml
import sys
import logging
import time
from ..node_actions.aws_node_scenarios import AWS
from ..cerberus import setup as cerberus
from krkn_lib_kubernetes import ScenarioTelemetry, KrknTelemetry
def run(scenarios_list, config, wait_duration):
def run(scenarios_list, config, wait_duration, telemetry: KrknTelemetry) -> (list[str], list[ScenarioTelemetry]) :
"""
filters the subnet of interest and applies the network acl
to create zone outage
"""
failed_post_scenarios = ""
scenario_telemetries: list[ScenarioTelemetry] = []
failed_scenarios = []
for zone_outage_config in scenarios_list:
if len(zone_outage_config) > 1:
with open(zone_outage_config, "r") as f:
zone_outage_config_yaml = yaml.full_load(f)
scenario_config = zone_outage_config_yaml["zone_outage"]
vpc_id = scenario_config["vpc_id"]
subnet_ids = scenario_config["subnet_id"]
duration = scenario_config["duration"]
cloud_type = scenario_config["cloud_type"]
ids = {}
acl_ids_created = []
scenario_telemetry = ScenarioTelemetry()
scenario_telemetry.scenario = zone_outage_config
scenario_telemetry.startTimeStamp = time.time()
telemetry.set_parameters_base64(scenario_telemetry, zone_outage_config)
try:
if len(zone_outage_config) > 1:
with open(zone_outage_config, "r") as f:
zone_outage_config_yaml = yaml.full_load(f)
scenario_config = zone_outage_config_yaml["zone_outage"]
vpc_id = scenario_config["vpc_id"]
subnet_ids = scenario_config["subnet_id"]
duration = scenario_config["duration"]
cloud_type = scenario_config["cloud_type"]
ids = {}
acl_ids_created = []
if cloud_type.lower() == "aws":
cloud_object = AWS()
else:
logging.error(
"Cloud type %s is not currently supported for "
"zone outage scenarios"
% cloud_type
)
sys.exit(1)
start_time = int(time.time())
for subnet_id in subnet_ids:
logging.info("Targeting subnet_id")
network_association_ids = []
associations, original_acl_id = \
cloud_object.describe_network_acls(vpc_id, subnet_id)
for entry in associations:
if entry["SubnetId"] == subnet_id:
network_association_ids.append(
entry["NetworkAclAssociationId"]
)
logging.info(
"Network association ids associated with "
"the subnet %s: %s"
% (subnet_id, network_association_ids)
)
acl_id = cloud_object.create_default_network_acl(vpc_id)
new_association_id = \
cloud_object.replace_network_acl_association(
network_association_ids[0], acl_id
if cloud_type.lower() == "aws":
cloud_object = AWS()
else:
logging.error(
"Cloud type %s is not currently supported for "
"zone outage scenarios"
% cloud_type
)
# removed_exit
# sys.exit(1)
raise RuntimeError()
# capture the original_acl_id, created_acl_id and
# new association_id to use during the recovery
ids[new_association_id] = original_acl_id
acl_ids_created.append(acl_id)
start_time = int(time.time())
# wait for the specified duration
logging.info(
"Waiting for the specified duration "
"in the config: %s" % (duration)
)
time.sleep(duration)
for subnet_id in subnet_ids:
logging.info("Targeting subnet_id")
network_association_ids = []
associations, original_acl_id = \
cloud_object.describe_network_acls(vpc_id, subnet_id)
for entry in associations:
if entry["SubnetId"] == subnet_id:
network_association_ids.append(
entry["NetworkAclAssociationId"]
)
logging.info(
"Network association ids associated with "
"the subnet %s: %s"
% (subnet_id, network_association_ids)
)
acl_id = cloud_object.create_default_network_acl(vpc_id)
new_association_id = \
cloud_object.replace_network_acl_association(
network_association_ids[0], acl_id
)
# replace the applied acl with the previous acl in use
for new_association_id, original_acl_id in ids.items():
cloud_object.replace_network_acl_association(
new_association_id,
original_acl_id
# capture the original_acl_id, created_acl_id and
# new association_id to use during the recovery
ids[new_association_id] = original_acl_id
acl_ids_created.append(acl_id)
# wait for the specified duration
logging.info(
"Waiting for the specified duration "
"in the config: %s" % (duration)
)
logging.info(
"Wating for 60 seconds to make sure "
"the changes are in place"
)
time.sleep(60)
time.sleep(duration)
# delete the network acl created for the run
for acl_id in acl_ids_created:
cloud_object.delete_network_acl(acl_id)
# replace the applied acl with the previous acl in use
for new_association_id, original_acl_id in ids.items():
cloud_object.replace_network_acl_association(
new_association_id,
original_acl_id
)
logging.info(
"Wating for 60 seconds to make sure "
"the changes are in place"
)
time.sleep(60)
logging.info(
"End of scenario. "
"Waiting for the specified duration: %s" % (wait_duration)
)
time.sleep(wait_duration)
# delete the network acl created for the run
for acl_id in acl_ids_created:
cloud_object.delete_network_acl(acl_id)
logging.info(
"End of scenario. "
"Waiting for the specified duration: %s" % (wait_duration)
)
time.sleep(wait_duration)
end_time = int(time.time())
cerberus.publish_kraken_status(
config,
failed_post_scenarios,
start_time,
end_time
)
except (RuntimeError, Exception):
scenario_telemetry.exitStatus = 1
failed_scenarios.append(zone_outage_config)
telemetry.log_exception(zone_outage_config)
else:
scenario_telemetry.exitStatus = 0
scenario_telemetry.endTimeStamp = time.time()
scenario_telemetries.append(scenario_telemetry)
return failed_scenarios, scenario_telemetries
end_time = int(time.time())
cerberus.publish_kraken_status(
config,
failed_post_scenarios,
start_time,
end_time
)

View File

@@ -37,4 +37,4 @@ prometheus_api_client
ibm_cloud_sdk_core
ibm_vpc
pytest
krkn-lib-kubernetes > 0.1.1
krkn-lib-kubernetes >= 0.1.3

View File

@@ -25,7 +25,7 @@ import kraken.arcaflow_plugin as arcaflow_plugin
import server as server
import kraken.prometheus.client as promcli
from kraken import plugins
from krkn_lib_kubernetes import KrknLibKubernetes
from krkn_lib_kubernetes import KrknLibKubernetes, KrknTelemetry, ChaosRunTelemetry, SafeLogger
KUBE_BURNER_URL = (
"https://github.com/cloud-bulldozer/kube-burner/"
@@ -98,13 +98,34 @@ def main(cfg):
)
sys.exit(1)
logging.info("Initializing client to talk to the Kubernetes cluster")
# Generate uuid for the run
if run_uuid:
logging.info(
"Using the uuid defined by the user for the run: %s" % run_uuid
)
else:
run_uuid = str(uuid.uuid4())
logging.info("Generated a uuid for the run: %s" % run_uuid)
# request_id for telemetry is generated once here and used everywhere
telemetry_request_id = f"{int(time.time())}-{run_uuid}"
if config["telemetry"].get("run_tag"):
telemetry_request_id = f"{telemetry_request_id}-{config['telemetry']['run_tag']}"
telemetry_log_file = f'{config["telemetry"]["archive_path"]}/{telemetry_request_id}.log'
safe_logger = SafeLogger(filename=telemetry_log_file)
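# e.g. a run started at epoch 1691680000 with run_tag "nightly" (illustrative
# values) yields the request id "1691680000-<run_uuid>-nightly", which becomes
# the S3 bucket folder name and the local upload log
# "<archive_path>/1691680000-<run_uuid>-nightly.log"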
try:
kubeconfig_path
os.environ["KUBECONFIG"] = str(kubeconfig_path)
# krkn-lib-kubernetes init
kubecli = KrknLibKubernetes(kubeconfig_path=kubeconfig_path)
except:
kubecli.initialize_clients(None)
# KrknTelemetry init
telemetry = KrknTelemetry(safe_logger, kubecli)
# find node kraken might be running on
kubecli.find_kraken_node()
@@ -141,14 +162,7 @@ def main(cfg):
if deploy_performance_dashboards:
performance_dashboards.setup(dashboard_repo, distribution)
# Generate uuid for the run
if run_uuid:
logging.info(
"Using the uuid defined by the user for the run: %s" % run_uuid
)
else:
run_uuid = str(uuid.uuid4())
logging.info("Generated a uuid for the run: %s" % run_uuid)
# Initialize the start iteration to 0
iteration = 0
@@ -171,7 +185,8 @@ def main(cfg):
# Capture the start time
start_time = int(time.time())
litmus_installed = False
chaos_telemetry = ChaosRunTelemetry()
chaos_telemetry.run_uuid = run_uuid
# Loop to run the chaos starts here
while int(iteration) < iterations and run_signal != "STOP":
# Inject chaos scenarios specified in the config
@@ -203,36 +218,41 @@ def main(cfg):
)
sys.exit(1)
elif scenario_type == "arcaflow_scenarios":
failed_post_scenarios = arcaflow_plugin.run(
scenarios_list, kubeconfig_path
failed_post_scenarios, scenario_telemetries = arcaflow_plugin.run(
scenarios_list, kubeconfig_path, telemetry
)
chaos_telemetry.scenarios.extend(scenario_telemetries)
elif scenario_type == "plugin_scenarios":
failed_post_scenarios = plugins.run(
failed_post_scenarios, scenario_telemetries = plugins.run(
scenarios_list,
kubeconfig_path,
kraken_config,
failed_post_scenarios,
wait_duration,
telemetry
)
chaos_telemetry.scenarios.extend(scenario_telemetries)
# krkn_lib_kubernetes
elif scenario_type == "container_scenarios":
logging.info("Running container scenarios")
failed_post_scenarios = pod_scenarios.container_run(
failed_post_scenarios, scenario_telemetries = pod_scenarios.container_run(
kubeconfig_path,
scenarios_list,
config,
failed_post_scenarios,
wait_duration,
kubecli
kubecli,
telemetry
)
chaos_telemetry.scenarios.extend(scenario_telemetries)
# Inject node chaos scenarios specified in the config
# krkn_lib_kubernetes
elif scenario_type == "node_scenarios":
logging.info("Running node scenarios")
nodeaction.run(scenarios_list, config, wait_duration, kubecli)
failed_post_scenarios, scenario_telemetries = nodeaction.run(scenarios_list, config, wait_duration, kubecli, telemetry)
chaos_telemetry.scenarios.extend(scenario_telemetries)
# Inject managedcluster chaos scenarios specified in the config
# krkn_lib_kubernetes
elif scenario_type == "managedcluster_scenarios":
@@ -247,7 +267,8 @@ def main(cfg):
elif scenario_type == "time_scenarios":
if distribution == "openshift":
logging.info("Running time skew scenarios")
time_actions.run(scenarios_list, config, wait_duration, kubecli)
failed_post_scenarios, scenario_telemetries = time_actions.run(scenarios_list, config, wait_duration, kubecli, telemetry)
chaos_telemetry.scenarios.extend(scenario_telemetries)
else:
logging.error(
"Litmus scenarios are currently "
@@ -297,44 +318,48 @@ def main(cfg):
# Inject cluster shutdown scenarios
# krkn_lib_kubernetes
elif scenario_type == "cluster_shut_down_scenarios":
shut_down.run(scenarios_list, config, wait_duration, kubecli)
failed_post_scenarios, scenario_telemetries = shut_down.run(scenarios_list, config, wait_duration, kubecli, telemetry)
chaos_telemetry.scenarios.extend(scenario_telemetries)
# Inject namespace chaos scenarios
# krkn_lib_kubernetes
elif scenario_type == "namespace_scenarios":
logging.info("Running namespace scenarios")
namespace_actions.run(
failed_post_scenarios, scenario_telemetries = namespace_actions.run(
scenarios_list,
config,
wait_duration,
failed_post_scenarios,
kubeconfig_path,
kubecli
kubecli,
telemetry
)
chaos_telemetry.scenarios.extend(scenario_telemetries)
# Inject zone failures
elif scenario_type == "zone_outages":
logging.info("Inject zone outages")
zone_outages.run(scenarios_list, config, wait_duration)
failed_post_scenarios, scenario_telemetries = zone_outages.run(scenarios_list, config, wait_duration, telemetry)
chaos_telemetry.scenarios.extend(scenario_telemetries)
# Application outages
elif scenario_type == "application_outages":
logging.info("Injecting application outage")
application_outage.run(
scenarios_list, config, wait_duration
)
failed_post_scenarios, scenario_telemetries = application_outage.run(
scenarios_list, config, wait_duration, telemetry)
chaos_telemetry.scenarios.extend(scenario_telemetries)
# PVC scenarios
# krkn_lib_kubernetes
elif scenario_type == "pvc_scenarios":
logging.info("Running PVC scenario")
pvc_scenario.run(scenarios_list, config, kubecli)
failed_post_scenarios, scenario_telemetries = pvc_scenario.run(scenarios_list, config, kubecli, telemetry)
chaos_telemetry.scenarios.extend(scenario_telemetries)
# Network scenarios
# krkn_lib_kubernetes
elif scenario_type == "network_chaos":
logging.info("Running Network Chaos")
network_chaos.run(scenarios_list, config, wait_duration, kubecli)
failed_post_scenarios, scenario_telemetries = network_chaos.run(scenarios_list, config, wait_duration, kubecli, telemetry)
# Check for critical alerts when enabled
if check_critical_alerts:
@@ -353,6 +378,21 @@ def main(cfg):
iteration += 1
logging.info("")
# telemetry
if config["telemetry"]["enabled"]:
logging.info(f"telemetry data will be stored on s3 bucket folder: {telemetry_request_id}")
logging.info(f"telemetry upload log: {safe_logger.log_file_name}")
try:
telemetry.send_telemetry(config["telemetry"], telemetry_request_id, chaos_telemetry)
safe_logger.info("archives download started:")
prometheus_archive_files = telemetry.get_ocp_prometheus_data(config["telemetry"], telemetry_request_id)
safe_logger.info("archives upload started:")
telemetry.put_ocp_prometheus_data(config["telemetry"], prometheus_archive_files, telemetry_request_id)
except Exception as e:
logging.error(f"failed to send telemetry data: {str(e)}")
else:
logging.info("telemetry collection disabled, skipping.")
# Capture the end time
end_time = int(time.time())