Dockerfiles update (#614 )

Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>
Bump werkzeug from 2.2.2 to 2.3.8 in /utils/chaos_ai/docker (#610 )
2026-03-17 09:00:34 +00:00 · 2024-04-24 11:40:58 -04:00 · 2024-04-23 15:26:51 +02:00 · 2024-04-23 10:49:01 +02:00 · 2024-04-16 10:41:31 -04:00 · 2024-04-09 23:32:17 -04:00
64 changed files with 1925 additions and 249 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -16,6 +16,7 @@ __pycache__/*
 *.out
 kube-burner*
 kube_burner*
+recommender_*.json

 # Project files
 .ropeproject
--- a/CI/config/common_test_config.yaml
+++ b/CI/config/common_test_config.yaml
@@ -49,3 +49,4 @@ telemetry:
        - "(\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}\\.\\d+Z).+"      # 2023-09-15T11:20:36.123425532Z log
    oc_cli_path: /usr/bin/oc                                # optional, if not specified will be search in $PATH
    events_backup: True                                     # enables/disables cluster events collection
+    telemetry_group: "funtests"
--- a/CI/tests/common.sh
+++ b/CI/tests/common.sh
@@ -1,15 +1,23 @@
 ERRORED=false

 function finish {
-    if [ $? -eq 1 ] && [ $ERRORED != "true" ]
+    if [ $? != 0 ] && [ $ERRORED != "true" ]
    then
        error
    fi
 }

 function error {
-    echo "Error caught."
-    ERRORED=true
+    exit_code=$?
+    if [ $exit_code == 1 ]
+    then
+      echo "Error caught."
+      ERRORED=true
+    elif [ $exit_code == 2 ]
+    then
+      echo "Run with exit code 2 detected, it is expected, wrapping the exit code with 0 to avoid pipeline failure"
+      exit 0
+    fi
 }

 function get_node {
--- a/CI/tests/test_container.sh
+++ b/CI/tests/test_container.sh
@@ -8,11 +8,11 @@ trap finish EXIT
 pod_file="CI/scenarios/hello_pod.yaml"

 function functional_test_container_crash {
-  yq -i '.scenarios[0].namespace="default"' scenarios/openshift/app_outage.yaml
-  yq -i '.scenarios[0].label_selector="scenario=container"' scenarios/openshift/app_outage.yaml
-  yq -i '.scenarios[0].container_name="fedtools"' scenarios/openshift/app_outage.yaml
+  yq -i '.scenarios[0].namespace="default"' scenarios/openshift/container_etcd.yml
+  yq -i '.scenarios[0].label_selector="scenario=container"' scenarios/openshift/container_etcd.yml
+  yq -i '.scenarios[0].container_name="fedtools"' scenarios/openshift/container_etcd.yml
  export scenario_type="container_scenarios"
-  export scenario_file="- scenarios/openshift/app_outage.yaml"
+  export scenario_file="- scenarios/openshift/container_etcd.yml"
  export post_config=""
  envsubst < CI/config/common_test_config.yaml > CI/config/container_config.yaml

--- a/CI/tests/test_telemetry.sh
+++ b/CI/tests/test_telemetry.sh
@@ -14,18 +14,22 @@ function functional_test_telemetry {
  export RUN_TAG="funtest-telemetry"
  yq -i '.telemetry.enabled=True' CI/config/common_test_config.yaml
  yq -i '.telemetry.full_prometheus_backup=True' CI/config/common_test_config.yaml
+  yq -i '.performance_monitoring.check_critical_alerts=True' CI/config/common_test_config.yaml
+  yq -i '.performance_monitoring.prometheus_url="http://localhost:9090"' CI/config/common_test_config.yaml
  yq -i '.telemetry.run_tag=env(RUN_TAG)' CI/config/common_test_config.yaml
+
  export scenario_type="arcaflow_scenarios"
  export scenario_file="scenarios/arcaflow/cpu-hog/input.yaml"
  export post_config=""
  envsubst < CI/config/common_test_config.yaml > CI/config/telemetry.yaml
-  python3 -m coverage run -a run_kraken.py -c CI/config/telemetry.yaml
-  RUN_FOLDER=`cat CI/out/test_telemetry.out | grep amazonaws.com | sed -rn "s#.*https:\/\/.*\/download/(.*)#\1#p"`
+  retval=$(python3 -m coverage run -a run_kraken.py -c CI/config/telemetry.yaml)
+  RUN_FOLDER=`cat CI/out/test_telemetry.out | grep amazonaws.com | sed -rn "s#.*https:\/\/.*\/files/(.*)#\1#p"`
  $AWS_CLI s3 ls "s3://$AWS_BUCKET/$RUN_FOLDER/" | awk '{ print $4 }' > s3_remote_files
  echo "checking if telemetry files are uploaded on s3"
  cat s3_remote_files | grep events-00.json || ( echo "FAILED: events-00.json not uploaded" && exit 1 )
-  cat s3_remote_files | grep prometheus-00.tar || ( echo "FAILED: prometheus backup not uploaded" && exit 1 )
-  cat s3_remote_files | grep telemetry.json || ( echo "FAILED: telemetry.json not uploaded" && exit 1 )
+  cat s3_remote_files | grep critical-alerts-00.log || ( echo "FAILED: critical-alerts-00.log not uploaded"  && exit 1 )
+  cat s3_remote_files | grep prometheus-00.tar || ( echo "FAILED: prometheus backup not uploaded"  && exit 1 )
+  cat s3_remote_files | grep telemetry.json || ( echo "FAILED: telemetry.json not uploaded"  && exit 1 )
  echo "all files uploaded!"
  echo "Telemetry Collection: Success"
 }
--- a/README.md
+++ b/README.md
@@ -1,5 +1,4 @@
 # Krkn aka Kraken
-[![Docker Repository on Quay](https://quay.io/repository/krkn-chaos/krkn/status "Docker Repository on Quay")](https://quay.io/repository/krkn-chaos/krkn?tab=tags&tag=latest)
 ![Workflow-Status](https://github.com/krkn-chaos/krkn/actions/workflows/docker-image.yml/badge.svg)

 ![Krkn logo](media/logo.png)
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -1,5 +1,5 @@
 kraken:
-    distribution: openshift                                # Distribution can be kubernetes or openshift
+    distribution: kubernetes                                # Distribution can be kubernetes or openshift
    kubeconfig_path: ~/.kube/config                        # Path to kubeconfig
    exit_on_failure: False                                 # Exit when a post action scenario fails
    publish_kraken_status: True                            # Can be accessed at http://0.0.0.0:8081
@@ -15,7 +15,7 @@ kraken:
        - application_outages:
            - scenarios/openshift/app_outage.yaml
        - container_scenarios:                             # List of chaos pod scenarios to load
-            - -    scenarios/openshift/container_etcd.yml
+            - - scenarios/openshift/container_etcd.yml
        - plugin_scenarios:
            - scenarios/openshift/etcd.yml
            - scenarios/openshift/regex_openshift_pod_kill.yml
@@ -23,7 +23,7 @@ kraken:
            - scenarios/openshift/network_chaos_ingress.yml
            - scenarios/openshift/prom_kill.yml
        - node_scenarios:                                  # List of chaos node scenarios to load
-            -   scenarios/openshift/node_scenarios_example.yml
+            - scenarios/openshift/node_scenarios_example.yml
        - plugin_scenarios:
            - scenarios/openshift/openshift-apiserver.yml
            - scenarios/openshift/openshift-kube-apiserver.yml
@@ -51,7 +51,7 @@ cerberus:
 performance_monitoring:
    deploy_dashboards: False                              # Install a mutable grafana and load the performance dashboards. Enable this only when running on OpenShift
    repo: "https://github.com/cloud-bulldozer/performance-dashboards.git"
-    prometheus_url:                                       # The prometheus url/route is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes.
+    prometheus_url:                                      # The prometheus url/route is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes.
    prometheus_bearer_token:                              # The bearer token is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes. This is needed to authenticate with prometheus.
    uuid:                                                 # uuid for the run is generated by default if not set
    enable_alerts: False                                  # Runs the queries specified in the alert profile and displays the info or exits 1 when severity=error
@@ -65,14 +65,19 @@ telemetry:
    enabled: False                                           # enable/disables the telemetry collection feature
    api_url: https://ulnmf9xv7j.execute-api.us-west-2.amazonaws.com/production #telemetry service endpoint
    username: username                                      # telemetry service username
-    password: password                                      # telemetry service password
+    password: password                                    # telemetry service password
    prometheus_backup: True                                 # enables/disables prometheus data collection
+    prometheus_namespace: ""                                # namespace where prometheus is deployed (if distribution is kubernetes)
+    prometheus_container_name: ""                           # name of the prometheus container name (if distribution is kubernetes)
+    prometheus_pod_name: ""                                 # name of the prometheus pod (if distribution is kubernetes)
    full_prometheus_backup: False                           # if is set to False only the /prometheus/wal folder will be downloaded.
    backup_threads: 5                                       # number of telemetry download/upload threads
    archive_path: /tmp                                      # local path where the archive files will be temporarly stored
    max_retries: 0                                          # maximum number of upload retries (if 0 will retry forever)
    run_tag: ''                                             # if set, this will be appended to the run folder in the bucket (useful to group the runs)
-    archive_size: 500000                                     # the size of the prometheus data archive size in KB. The lower the size of archive is
+    archive_size: 500000
+    telemetry_group: ''                                     # if set will archive the telemetry in the S3 bucket on a folder named after the value, otherwise will use "default"
+    # the size of the prometheus data archive size in KB. The lower the size of archive is
                                                            # the higher the number of archive files will be produced and uploaded (and processed by backup_threads
                                                            # simultaneously).
                                                            # For unstable/slow connection is better to keep this value low
--- a/config/recommender_config.yaml
+++ b/config/recommender_config.yaml
@@ -1,5 +1,5 @@
 application: openshift-etcd
-namespace: openshift-etcd
+namespaces: openshift-etcd
 labels: app=openshift-etcd
 kubeconfig: ~/.kube/config.yaml
 prometheus_endpoint: <Prometheus_Endpoint>
@@ -7,6 +7,8 @@ auth_token: <Auth_Token>
 scrape_duration: 10m
 chaos_library: "kraken"
 log_level: INFO
+json_output_file: False
+json_output_folder_path:

 # for output purpose only do not change if not needed
 chaos_tests:
@@ -26,4 +28,8 @@ chaos_tests:
    - pod_network_chaos
  MEM:
    - node_memory_hog
-    - pvc_disk_fill
+    - pvc_disk_fill
+
+threshold: .7
+cpu_threshold: .5
+mem_threshold: .5
--- a/containers/Dockerfile
+++ b/containers/Dockerfile
@@ -12,7 +12,7 @@ COPY --from=azure-cli /usr/local/bin/az /usr/bin/az
 # Install dependencies
 RUN yum install -y git python39 python3-pip jq gettext wget && \
    python3.9 -m pip install -U pip && \
-    git clone https://github.com/krkn-chaos/krkn.git --branch v1.5.7 /root/kraken && \
+    git clone https://github.com/krkn-chaos/krkn.git --branch v1.5.13 /root/kraken && \
    mkdir -p /root/.kube && cd /root/kraken && \
    pip3.9 install -r requirements.txt && \
    pip3.9 install virtualenv && \
@@ -20,7 +20,7 @@ RUN yum install -y git python39 python3-pip jq gettext wget && \

 # Get Kubernetes and OpenShift clients from stable releases
 WORKDIR /tmp
-RUN wget https://mirror.openshift.com/pub/openshift-v4/clients/ocp/stable/openshift-client-linux.tar.gz && tar -xvf openshift-client-linux.tar.gz && cp oc /usr/local/bin/oc && cp kubectl /usr/local/bin/kubectl
+RUN wget https://mirror.openshift.com/pub/openshift-v4/clients/ocp/stable/openshift-client-linux.tar.gz && tar -xvf openshift-client-linux.tar.gz && cp oc /usr/local/bin/oc && cp oc /usr/bin/oc && cp kubectl /usr/local/bin/kubectl && cp kubectl /usr/bin/kubectl

 WORKDIR /root/kraken

--- a/containers/Dockerfile-ppc64le
+++ b/containers/Dockerfile-ppc64le
@@ -14,7 +14,7 @@ COPY --from=azure-cli /usr/local/bin/az /usr/bin/az
 # Install dependencies
 RUN yum install -y git python39 python3-pip jq gettext wget && \
    python3.9 -m pip install -U pip && \
-    git clone https://github.com/redhat-chaos/krkn.git --branch v1.5.7 /root/kraken && \
+    git clone https://github.com/redhat-chaos/krkn.git --branch v1.5.13 /root/kraken && \
    mkdir -p /root/.kube && cd /root/kraken && \
    pip3.9 install -r requirements.txt && \
    pip3.9 install virtualenv && \
@@ -22,7 +22,7 @@ RUN yum install -y git python39 python3-pip jq gettext wget && \

 # Get Kubernetes and OpenShift clients from stable releases
 WORKDIR /tmp
-RUN wget https://mirror.openshift.com/pub/openshift-v4/clients/ocp/stable/openshift-client-linux.tar.gz && tar -xvf openshift-client-linux.tar.gz && cp oc /usr/local/bin/oc && cp kubectl /usr/local/bin/kubectl
+RUN wget https://mirror.openshift.com/pub/openshift-v4/clients/ocp/stable/openshift-client-linux.tar.gz && tar -xvf openshift-client-linux.tar.gz && cp oc /usr/local/bin/oc && cp oc /usr/bin/oc && cp kubectl /usr/local/bin/kubectl && cp kubectl /usr/bin/kubectl

 WORKDIR /root/kraken

--- a/docs/getting_started.md
+++ b/docs/getting_started.md
@@ -14,11 +14,7 @@ For example, for adding a pod level scenario for a new application, refer to the
    namespace_pattern: ^<namespace>$
    label_selector: <pod label>
    kill: <number of pods to kill>
- id: wait-for-pods
-  config:
-    namespace_pattern: ^<namespace>$
-    label_selector: <pod label>
-    count: <expected number of pods that match namespace and label>
+    krkn_pod_recovery_time: <expected time for the pod to become ready>
 ```

 #### Node Scenario Yaml Template
--- a/docs/installation.md
+++ b/docs/installation.md
@@ -22,7 +22,7 @@ the capabilities of the current supported scenarios.
 Pick the latest stable release to install [here](https://github.com/krkn-chaos/krkn/releases).
 ```
 $ git clone https://github.com/krkn-chaos/krkn.git --branch <release version>
-$ cd kraken
+$ cd krkn
 ```

 #### Install the dependencies
--- a/docs/pod_scenarios.md
+++ b/docs/pod_scenarios.md
@@ -17,11 +17,8 @@ You can then create the scenario file with the following contents:
  config:
    namespace_pattern: ^kube-system$
    label_selector: k8s-app=kube-scheduler
- id: wait-for-pods
-  config:
-    namespace_pattern: ^kube-system$
-    label_selector: k8s-app=kube-scheduler
-    count: 3
+    krkn_pod_recovery_time: 120
+    
 ```

 Please adjust the schema reference to point to the [schema file](../scenarios/plugin.schema.json). This file will give you code completion and documentation for the available options in your IDE.
--- a/kraken/application_outage/actions.py
+++ b/kraken/application_outage/actions.py
@@ -4,6 +4,7 @@ import time
 import kraken.cerberus.setup as cerberus
 from jinja2 import Template
 import kraken.invoke.command as runcommand
+from krkn_lib.k8s import KrknKubernetes
 from krkn_lib.telemetry.k8s import KrknTelemetryKubernetes
 from krkn_lib.models.telemetry import ScenarioTelemetry
 from krkn_lib.utils.functions import get_yaml_item_value, log_exception
@@ -11,7 +12,7 @@ from krkn_lib.utils.functions import get_yaml_item_value, log_exception

 # Reads the scenario config, applies and deletes a network policy to
 # block the traffic for the specified duration
-def run(scenarios_list, config, wait_duration, telemetry: KrknTelemetryKubernetes) -> (list[str], list[ScenarioTelemetry]):
+def run(scenarios_list, config, wait_duration,kubecli: KrknKubernetes, telemetry: KrknTelemetryKubernetes) -> (list[str], list[ScenarioTelemetry]):
    failed_post_scenarios = ""
    scenario_telemetries: list[ScenarioTelemetry] = []
    failed_scenarios = []
@@ -49,25 +50,22 @@ spec:
  podSelector:
    matchLabels: {{ pod_selector }}
  policyTypes: {{ traffic_type }}
-                    """
+"""
                    t = Template(network_policy_template)
                    rendered_spec = t.render(pod_selector=pod_selector, traffic_type=traffic_type)
-                    # Write the rendered template to a file
-                    with open("kraken_network_policy.yaml", "w") as f:
-                        f.write(rendered_spec)
+                    yaml_spec = yaml.safe_load(rendered_spec)
                    # Block the traffic by creating network policy
                    logging.info("Creating the network policy")
-                    runcommand.invoke(
-                        "kubectl create -f %s -n %s --validate=false" % ("kraken_network_policy.yaml", namespace)
-                    )

+                    kubecli.create_net_policy(yaml_spec, namespace)
+                   
                    # wait for the specified duration
                    logging.info("Waiting for the specified duration in the config: %s" % (duration))
                    time.sleep(duration)

                    # unblock the traffic by deleting the network policy
                    logging.info("Deleting the network policy")
-                    runcommand.invoke("kubectl delete -f %s -n %s" % ("kraken_network_policy.yaml", namespace))
+                    kubecli.delete_net_policy("kraken-deny", namespace)

                    logging.info("End of scenario. Waiting for the specified duration: %s" % (wait_duration))
                    time.sleep(wait_duration)
--- a/kraken/chaos_recommender/analysis.py
+++ b/kraken/chaos_recommender/analysis.py
@@ -4,13 +4,10 @@ import pandas as pd
 import kraken.chaos_recommender.kraken_tests as kraken_tests
 import time

-threshold = .7  # Adjust the threshold as needed
-heatmap_cpu_threshold = .5
-heatmap_mem_threshold = .5
-
 KRAKEN_TESTS_PATH = "./kraken_chaos_tests.txt"

-#Placeholder, this should be done with topology
+
+# Placeholder, this should be done with topology
 def return_critical_services():
    return ["web", "cart"]

@@ -19,15 +16,18 @@ def load_telemetry_data(file_path):
    data = pd.read_csv(file_path, delimiter=r"\s+")
    return data

+
 def calculate_zscores(data):
    zscores = pd.DataFrame()
+    zscores["Namespace"] = data["namespace"]
    zscores["Service"] = data["service"]
    zscores["CPU"] = (data["CPU"] - data["CPU"].mean()) / data["CPU"].std()
    zscores["Memory"] = (data["MEM"] - data["MEM"].mean()) / data["MEM"].std()
    zscores["Network"] = (data["NETWORK"] - data["NETWORK"].mean()) / data["NETWORK"].std()
    return zscores

-def identify_outliers(data):
+
+def identify_outliers(data, threshold):
    outliers_cpu = data[data["CPU"] > threshold]["Service"].tolist()
    outliers_memory = data[data["Memory"] > threshold]["Service"].tolist()
    outliers_network = data[data["Network"] > threshold]["Service"].tolist()
@@ -47,44 +47,85 @@ def get_services_above_heatmap_threshold(dataframe, cpu_threshold, mem_threshold
    return cpu_services, mem_services


-def analysis(file_path, chaos_tests_config):
+def analysis(file_path, namespaces, chaos_tests_config, threshold,
+             heatmap_cpu_threshold, heatmap_mem_threshold):
    # Load the telemetry data from file
+    logging.info("Fetching the Telemetry data...")
    data = load_telemetry_data(file_path)

    # Calculate Z-scores for CPU, Memory, and Network columns
    zscores = calculate_zscores(data)
+    # Dict for saving analysis data -- key is the namespace
+    analysis_data = {}

-    # Identify outliers
-    outliers_cpu, outliers_memory, outliers_network = identify_outliers(zscores)
-    cpu_services, mem_services = get_services_above_heatmap_threshold(data, heatmap_cpu_threshold, heatmap_mem_threshold)
+    # Identify outliers for each namespace
+    for namespace in namespaces:

-    # Display the identified outliers
-    logging.info("======================== Profiling ==================================")
-    logging.info(f"CPU outliers: {outliers_cpu}")
-    logging.info(f"Memory outliers: {outliers_memory}")
-    logging.info(f"Network outliers: {outliers_network}")
-    logging.info("===================== HeatMap Analysis ==============================")
+        logging.info(f"Identifying outliers for namespace {namespace}...")
+
+        namespace_zscores = zscores.loc[zscores["Namespace"] == namespace]
+        namespace_data = data.loc[data["namespace"] == namespace]
+        outliers_cpu, outliers_memory, outliers_network = identify_outliers(
+            namespace_zscores, threshold)
+        cpu_services, mem_services = get_services_above_heatmap_threshold(
+            namespace_data, heatmap_cpu_threshold, heatmap_mem_threshold)
+
+        analysis_data[namespace] = analysis_json(outliers_cpu, outliers_memory,
+                                                 outliers_network,
+                                                 cpu_services, mem_services,
+                                                 chaos_tests_config)
+
+        if cpu_services:
+            logging.info(f"These services use significant CPU compared to "
+                         f"their assigned limits: {cpu_services}")
+        else:
+            logging.info("There are no services that are using significant "
+                         "CPU compared to their assigned limits "
+                         "(infinite in case no limits are set).")
+        if mem_services:
+            logging.info(f"These services use significant MEMORY compared to "
+                         f"their assigned limits: {mem_services}")
+        else:
+            logging.info("There are no services that are using significant "
+                         "MEMORY compared to their assigned limits "
+                         "(infinite in case no limits are set).")
+        time.sleep(2)
+
+    logging.info("Please check data in utilisation.txt for further analysis")
+
+    return analysis_data
+
+
+def analysis_json(outliers_cpu, outliers_memory, outliers_network,
+                  cpu_services, mem_services, chaos_tests_config):
+
+    profiling = {
+        "cpu_outliers": outliers_cpu,
+        "memory_outliers": outliers_memory,
+        "network_outliers": outliers_network
+    }
+
+    heatmap = {
+        "services_with_cpu_heatmap_above_threshold": cpu_services,
+        "services_with_mem_heatmap_above_threshold": mem_services
+    }
+
+    recommendations = {}

    if cpu_services:
-        logging.info("Services with CPU_HEATMAP above threshold:", cpu_services)
-    else:
-        logging.info("There are no services that are using siginificant CPU compared to their assigned limits (infinite in case no limits are set).")
+        cpu_recommend = {"services": cpu_services,
+                         "tests": chaos_tests_config['CPU']}
+        recommendations["cpu_services_recommendations"] = cpu_recommend
+
    if mem_services:
-        logging.info("Services with MEM_HEATMAP above threshold:", mem_services)
-    else:
-        logging.info("There are no services that are using siginificant MEMORY compared to their assigned limits (infinite in case no limits are set).")
-    time.sleep(2)
-    logging.info("======================= Recommendations =============================")
-    if cpu_services:
-        logging.info(f"Recommended tests for {str(cpu_services)}  :\n {chaos_tests_config['CPU']}")
-        logging.info("\n")
-    if mem_services:
-        logging.info(f"Recommended tests for {str(mem_services)}  :\n {chaos_tests_config['MEM']}")
-        logging.info("\n")
+        mem_recommend = {"services": mem_services,
+                         "tests": chaos_tests_config['MEM']}
+        recommendations["mem_services_recommendations"] = mem_recommend

    if outliers_network:
-        logging.info(f"Recommended tests for  str(outliers_network)  :\n {chaos_tests_config['NETWORK']}")
-        logging.info("\n")
+        outliers_network_recommend = {"outliers_networks": outliers_network,
+                                      "tests": chaos_tests_config['NETWORK']}
+        recommendations["outliers_network_recommendations"] = (
+            outliers_network_recommend)

-    logging.info("\n")
-    logging.info("Please check data in utilisation.txt for further analysis")
+    return [profiling, heatmap, recommendations]
--- a/kraken/chaos_recommender/prometheus.py
+++ b/kraken/chaos_recommender/prometheus.py
@@ -1,6 +1,5 @@
 import logging

-import pandas
 from prometheus_api_client import PrometheusConnect
 import pandas as pd
 import urllib3
@@ -8,6 +7,7 @@ import urllib3

 saved_metrics_path = "./utilisation.txt"

+
 def convert_data_to_dataframe(data, label):
    df = pd.DataFrame()
    df['service'] = [item['metric']['pod'] for item in data]
@@ -25,21 +25,27 @@ def convert_data(data, service):
        result[pod_name] = value
    return result.get(service, '100000000000') # for those pods whose limits are not defined they can take as much resources, there assigning a very high value

-def save_utilization_to_file(cpu_data, cpu_limits_result, mem_data, mem_limits_result, network_data, filename):
-    df_cpu = convert_data_to_dataframe(cpu_data, "CPU")
-    merged_df = pd.DataFrame(columns=['service','CPU','CPU_LIMITS','MEM','MEM_LIMITS','NETWORK'])
-    services = df_cpu.service.unique()
-    logging.info(services)

-    for s in services:
-
-        new_row_df = pd.DataFrame( {"service": s, "CPU" : convert_data(cpu_data, s),
-                    "CPU_LIMITS" : convert_data(cpu_limits_result, s),
-                    "MEM" : convert_data(mem_data, s), "MEM_LIMITS" : convert_data(mem_limits_result, s),
-                    "NETWORK" : convert_data(network_data, s)}, index=[0])
-        merged_df = pd.concat([merged_df, new_row_df], ignore_index=True)
+def save_utilization_to_file(utilization, filename):
+    merged_df = pd.DataFrame(columns=['namespace', 'service', 'CPU', 'CPU_LIMITS', 'MEM', 'MEM_LIMITS', 'NETWORK'])
+    for namespace in utilization:
+        # Loading utilization_data[] for namespace
+        # indexes -- 0 CPU, 1 CPU limits, 2 mem, 3 mem limits, 4 network
+        utilization_data = utilization[namespace]
+        df_cpu = convert_data_to_dataframe(utilization_data[0], "CPU")
+        services = df_cpu.service.unique()
+        logging.info(f"Services for namespace {namespace}: {services}")

+        for s in services:

+            new_row_df = pd.DataFrame({
+                "namespace": namespace, "service": s,
+                "CPU": convert_data(utilization_data[0], s),
+                "CPU_LIMITS": convert_data(utilization_data[1], s),
+                "MEM": convert_data(utilization_data[2], s),
+                "MEM_LIMITS": convert_data(utilization_data[3], s),
+                "NETWORK": convert_data(utilization_data[4], s)}, index=[0])
+            merged_df = pd.concat([merged_df, new_row_df], ignore_index=True)

    # Convert columns to string
    merged_df['CPU'] = merged_df['CPU'].astype(str)
@@ -57,40 +63,50 @@ def save_utilization_to_file(cpu_data, cpu_limits_result, mem_data, mem_limits_r

    merged_df.to_csv(filename, sep='\t', index=False)

-def fetch_utilization_from_prometheus(prometheus_endpoint, auth_token, namespace, scrape_duration):
+
+def fetch_utilization_from_prometheus(prometheus_endpoint, auth_token,
+                                      namespaces, scrape_duration):
    urllib3.disable_warnings()
-    prometheus = PrometheusConnect(url=prometheus_endpoint, headers={'Authorization':'Bearer {}'.format(auth_token)}, disable_ssl=True)
+    prometheus = PrometheusConnect(url=prometheus_endpoint, headers={
+        'Authorization':'Bearer {}'.format(auth_token)}, disable_ssl=True)

-    # Fetch CPU utilization
-    cpu_query = 'sum (rate (container_cpu_usage_seconds_total{image!="", namespace="%s"}[%s])) by (pod) *1000' % (namespace,scrape_duration)
-    logging.info(cpu_query)
-    cpu_result = prometheus.custom_query(cpu_query)
-    cpu_data = cpu_result
-
-
-    cpu_limits_query = '(sum by (pod) (kube_pod_container_resource_limits{resource="cpu", namespace="%s"}))*1000' %(namespace)
-    logging.info(cpu_limits_query)
-    cpu_limits_result = prometheus.custom_query(cpu_limits_query)
-
-
-    mem_query = 'sum by (pod) (avg_over_time(container_memory_usage_bytes{image!="", namespace="%s"}[%s]))' % (namespace, scrape_duration)
-    logging.info(mem_query)
-    mem_result = prometheus.custom_query(mem_query)
-    mem_data = mem_result
-
-    mem_limits_query = 'sum by (pod) (kube_pod_container_resource_limits{resource="memory", namespace="%s"})  ' %(namespace)
-    logging.info(mem_limits_query)
-    mem_limits_result = prometheus.custom_query(mem_limits_query)
-
-
-    network_query = 'sum by (pod) ((avg_over_time(container_network_transmit_bytes_total{namespace="%s"}[%s])) + \
-    (avg_over_time(container_network_receive_bytes_total{namespace="%s"}[%s])))' % (namespace, scrape_duration, namespace, scrape_duration)
-    network_result = prometheus.custom_query(network_query)
-    logging.info(network_query)
-    network_data = network_result
-
-
-    save_utilization_to_file(cpu_data, cpu_limits_result, mem_data, mem_limits_result, network_data, saved_metrics_path)
-    return saved_metrics_path
+    # Dicts for saving utilisation and queries -- key is namespace
+    utilization = {}
+    queries = {}
+
+    logging.info("Fetching utilization...")
+    for namespace in namespaces:
+
+        # Fetch CPU utilization
+        cpu_query = 'sum (rate (container_cpu_usage_seconds_total{image!="", namespace="%s"}[%s])) by (pod) *1000' % (namespace,scrape_duration)
+        cpu_result = prometheus.custom_query(cpu_query)
+
+        cpu_limits_query = '(sum by (pod) (kube_pod_container_resource_limits{resource="cpu", namespace="%s"}))*1000' %(namespace)
+        cpu_limits_result = prometheus.custom_query(cpu_limits_query)
+
+        mem_query = 'sum by (pod) (avg_over_time(container_memory_usage_bytes{image!="", namespace="%s"}[%s]))' % (namespace, scrape_duration)
+        mem_result = prometheus.custom_query(mem_query)
+
+        mem_limits_query = 'sum by (pod) (kube_pod_container_resource_limits{resource="memory", namespace="%s"})  ' %(namespace)
+        mem_limits_result = prometheus.custom_query(mem_limits_query)
+
+        network_query = 'sum by (pod) ((avg_over_time(container_network_transmit_bytes_total{namespace="%s"}[%s])) + \
+        (avg_over_time(container_network_receive_bytes_total{namespace="%s"}[%s])))' % (namespace, scrape_duration, namespace, scrape_duration)
+        network_result = prometheus.custom_query(network_query)
+
+        utilization[namespace] = [cpu_result, cpu_limits_result, mem_result, mem_limits_result, network_result]
+        queries[namespace] = json_queries(cpu_query, cpu_limits_query, mem_query, mem_limits_query, network_query)
+
+    save_utilization_to_file(utilization, saved_metrics_path)
+    return saved_metrics_path, queries


+def json_queries(cpu_query, cpu_limits_query, mem_query, mem_limits_query, network_query):
+    queries = {
+        "cpu_query": cpu_query,
+        "cpu_limit_query": cpu_limits_query,
+        "memory_query": mem_query,
+        "memory_limit_query": mem_limits_query,
+        "network_query": network_query
+    }
+    return queries
--- a/kraken/plugins/init.py
+++ b/kraken/plugins/init.py
@@ -2,11 +2,14 @@ import dataclasses
 import json
 import logging
 from os.path import abspath
-from typing import List, Dict
+from typing import List, Dict, Any
 import time

 from arcaflow_plugin_sdk import schema, serialization, jsonschema
 from arcaflow_plugin_kill_pod import kill_pods, wait_for_pods
+from krkn_lib.k8s import KrknKubernetes
+from krkn_lib.k8s.pods_monitor_pool import PodsMonitorPool
+
 import kraken.plugins.node_scenarios.vmware_plugin as vmware_plugin
 import kraken.plugins.node_scenarios.ibmcloud_plugin as ibmcloud_plugin
 from kraken.plugins.run_python_plugin import run_python_file
@@ -47,11 +50,14 @@ class Plugins:
                )
            self.steps_by_id[step.schema.id] = step

+    def unserialize_scenario(self, file: str) -> Any:
+        return serialization.load_from_file(abspath(file))
+
    def run(self, file: str, kubeconfig_path: str, kraken_config: str):
        """
        Run executes a series of steps
        """
-        data = serialization.load_from_file(abspath(file))
+        data = self.unserialize_scenario(abspath(file))
        if not isinstance(data, list):
            raise Exception(
                "Invalid scenario configuration file: {} expected list, found {}".format(file, type(data).__name__)
@@ -213,6 +219,12 @@ PLUGINS = Plugins(
                "error"
            ]
        ),
+        PluginStep(
+            network_chaos,
+            [
+                "error"
+            ]
+        ),        
        PluginStep(
            pod_outage,
            [
@@ -235,7 +247,15 @@ PLUGINS = Plugins(
 )


-def run(scenarios: List[str], kubeconfig_path: str, kraken_config: str, failed_post_scenarios: List[str], wait_duration: int, telemetry: KrknTelemetryKubernetes) -> (List[str], list[ScenarioTelemetry]):
+def run(scenarios: List[str],
+        kubeconfig_path: str,
+        kraken_config: str,
+        failed_post_scenarios: List[str],
+        wait_duration: int,
+        telemetry: KrknTelemetryKubernetes,
+        kubecli: KrknKubernetes
+        ) -> (List[str], list[ScenarioTelemetry]):
+
    scenario_telemetries: list[ScenarioTelemetry] = []
    for scenario in scenarios:
        scenario_telemetry = ScenarioTelemetry()
@@ -243,10 +263,21 @@ def run(scenarios: List[str], kubeconfig_path: str, kraken_config: str, failed_p
        scenario_telemetry.startTimeStamp = time.time()
        telemetry.set_parameters_base64(scenario_telemetry, scenario)
        logging.info('scenario ' + str(scenario))
+        pool = PodsMonitorPool(kubecli)
+        kill_scenarios = [kill_scenario for kill_scenario in PLUGINS.unserialize_scenario(scenario) if kill_scenario["id"] == "kill-pods"]
+
        try:
+            start_monitoring(pool, kill_scenarios)
            PLUGINS.run(scenario, kubeconfig_path, kraken_config)
+            result = pool.join()
+            scenario_telemetry.affected_pods = result
+            if result.error:
+                raise Exception(f"unrecovered pods: {result.error}")
+
        except Exception as e:
+            logging.error(f"scenario exception: {str(e)}")
            scenario_telemetry.exitStatus = 1
+            pool.cancel()
            failed_post_scenarios.append(scenario)
            log_exception(scenario)
        else:
@@ -257,3 +288,31 @@ def run(scenarios: List[str], kubeconfig_path: str, kraken_config: str, failed_p
        scenario_telemetry.endTimeStamp = time.time()

    return failed_post_scenarios, scenario_telemetries
+
+
+def start_monitoring(pool: PodsMonitorPool, scenarios: list[Any]):
+    for kill_scenario in scenarios:
+        recovery_time = kill_scenario["config"]["krkn_pod_recovery_time"]
+        if ("namespace_pattern" in kill_scenario["config"] and
+                "label_selector" in kill_scenario["config"]):
+            namespace_pattern = kill_scenario["config"]["namespace_pattern"]
+            label_selector = kill_scenario["config"]["label_selector"]
+            pool.select_and_monitor_by_namespace_pattern_and_label(
+                namespace_pattern=namespace_pattern,
+                label_selector=label_selector,
+                max_timeout=recovery_time)
+            logging.info(
+                f"waiting {recovery_time} seconds for pod recovery, "
+                f"pod label selector: {label_selector} namespace pattern: {namespace_pattern}")
+
+        elif ("namespace_pattern" in kill_scenario["config"] and
+              "name_pattern" in kill_scenario["config"]):
+            namespace_pattern = kill_scenario["config"]["namespace_pattern"]
+            name_pattern = kill_scenario["config"]["name_pattern"]
+            pool.select_and_monitor_by_name_pattern_and_namespace_pattern(pod_name_pattern=name_pattern,
+                                                                          namespace_pattern=namespace_pattern,
+                                                                          max_timeout=recovery_time)
+            logging.info(f"waiting {recovery_time} seconds for pod recovery, "
+                         f"pod name pattern: {name_pattern} namespace pattern: {namespace_pattern}")
+        else:
+            raise Exception(f"impossible to determine monitor parameters, check {kill_scenario} configuration")
--- a/kraken/plugins/network/ingress_shaping.py
+++ b/kraken/plugins/network/ingress_shaping.py
@@ -62,7 +62,7 @@ class NetworkScenarioConfig:
        typing.Optional[int],
        validation.min(1)
    ] = field(
-        default=300,
+        default=30,
        metadata={
            "name": "Wait Duration",
            "description":
@@ -864,7 +864,7 @@ def network_chaos(cfg: NetworkScenarioConfig) -> typing.Tuple[
                )
            logging.info("Waiting for parallel job to finish")
            start_time = int(time.time())
-            wait_for_job(batch_cli, job_list[:], cfg.wait_duration)
+            wait_for_job(batch_cli, job_list[:], cfg.test_duration+100)
            end_time = int(time.time())
            if publish:
                cerberus.publish_kraken_status(
@@ -893,7 +893,7 @@ def network_chaos(cfg: NetworkScenarioConfig) -> typing.Tuple[
                    )
                logging.info("Waiting for serial job to finish")
                start_time = int(time.time())
-                wait_for_job(batch_cli, job_list[:], cfg.wait_duration)
+                wait_for_job(batch_cli, job_list[:], cfg.test_duration+100)
                logging.info("Deleting jobs")
                delete_jobs(cli, batch_cli, job_list[:])
                job_list = []
--- a/kraken/pod_scenarios/setup.py
+++ b/kraken/pod_scenarios/setup.py
@@ -1,9 +1,13 @@
 import logging
 import time
+from typing import Any
+
 import yaml
 import sys
 import random
 import arcaflow_plugin_kill_pod
+from krkn_lib.k8s.pods_monitor_pool import PodsMonitorPool
+
 import kraken.cerberus.setup as cerberus
 import kraken.post_actions.actions as post_actions
 from krkn_lib.k8s import KrknKubernetes
@@ -79,6 +83,7 @@ def container_run(kubeconfig_path,

    failed_scenarios = []
    scenario_telemetries: list[ScenarioTelemetry] = []
+    pool = PodsMonitorPool(kubecli)

    for container_scenario_config in scenarios_list:
        scenario_telemetry = ScenarioTelemetry()
@@ -91,23 +96,17 @@ def container_run(kubeconfig_path,
            pre_action_output = ""
        with open(container_scenario_config[0], "r") as f:
            cont_scenario_config = yaml.full_load(f)
+            start_monitoring(kill_scenarios=cont_scenario_config["scenarios"], pool=pool)
            for cont_scenario in cont_scenario_config["scenarios"]:
                # capture start time
                start_time = int(time.time())
                try:
                    killed_containers = container_killing_in_pod(cont_scenario, kubecli)
-                    if len(container_scenario_config) > 1:
-                        failed_post_scenarios = post_actions.check_recovery(
-                            kubeconfig_path,
-                            container_scenario_config,
-                            failed_post_scenarios,
-                            pre_action_output
-                        )
-                    else:
-                        failed_post_scenarios = check_failed_containers(
-                            killed_containers, cont_scenario.get("retry_wait", 120), kubecli
-                        )
-
+                    logging.info(f"killed containers: {str(killed_containers)}")
+                    result = pool.join()
+                    if result.error:
+                        raise Exception(f"pods failed to recovery: {result.error}")
+                    scenario_telemetry.affected_pods = result
                    logging.info("Waiting for the specified duration: %s" % (wait_duration))
                    time.sleep(wait_duration)

@@ -117,6 +116,7 @@ def container_run(kubeconfig_path,
                    # publish cerberus status
                    cerberus.publish_kraken_status(config, failed_post_scenarios, start_time, end_time)
                except (RuntimeError, Exception):
+                    pool.cancel()
                    failed_scenarios.append(container_scenario_config[0])
                    log_exception(container_scenario_config[0])
                    scenario_telemetry.exitStatus = 1
@@ -129,6 +129,16 @@ def container_run(kubeconfig_path,

    return failed_scenarios, scenario_telemetries

+def start_monitoring(kill_scenarios: list[Any], pool: PodsMonitorPool):
+    for kill_scenario in kill_scenarios:
+        namespace_pattern = f"^{kill_scenario['namespace']}$"
+        label_selector = kill_scenario["label_selector"]
+        recovery_time = kill_scenario["expected_recovery_time"]
+        pool.select_and_monitor_by_namespace_pattern_and_label(
+            namespace_pattern=namespace_pattern,
+            label_selector=label_selector,
+            max_timeout=recovery_time)
+

 def container_killing_in_pod(cont_scenario, kubecli: KrknKubernetes):
    scenario_name = get_yaml_item_value(cont_scenario, "name", "")
--- a/kraken/prometheus/client.py
+++ b/kraken/prometheus/client.py
@@ -1,10 +1,13 @@
 import datetime
 import os.path
+from typing import Optional
+
 import urllib3
 import logging
 import sys

 import yaml
+from krkn_lib.models.krkn import ChaosRunAlertSummary, ChaosRunAlert
 from krkn_lib.prometheus.krkn_prometheus import KrknPrometheus
 urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
 def alerts(prom_cli: KrknPrometheus, start_time, end_time, alert_profile):
@@ -27,4 +30,59 @@ def alerts(prom_cli: KrknPrometheus, start_time, end_time, alert_profile):

            prom_cli.process_alert(alert,
                                   datetime.datetime.fromtimestamp(start_time),
-                                   datetime.datetime.fromtimestamp(end_time))
+                                   datetime.datetime.fromtimestamp(end_time))
+
+
+def critical_alerts(prom_cli: KrknPrometheus,
+                    summary: ChaosRunAlertSummary,
+                    run_id,
+                    scenario,
+                    start_time,
+                    end_time):
+    summary.scenario = scenario
+    summary.run_id = run_id
+    query = r"""ALERTS{severity="critical"}"""
+    logging.info("Checking for critical alerts firing post chaos")
+
+    during_critical_alerts = prom_cli.process_prom_query_in_range(
+        query,
+        start_time=datetime.datetime.fromtimestamp(start_time),
+        end_time=end_time
+
+    )
+
+    for alert in during_critical_alerts:
+        if "metric" in alert:
+            alertname = alert["metric"]["alertname"] if "alertname" in alert["metric"] else "none"
+            alertstate = alert["metric"]["alertstate"] if "alertstate" in alert["metric"] else "none"
+            namespace = alert["metric"]["namespace"] if "namespace" in alert["metric"] else "none"
+            severity = alert["metric"]["severity"] if "severity" in alert["metric"] else "none"
+            alert = ChaosRunAlert(alertname, alertstate, namespace, severity)
+            summary.chaos_alerts.append(alert)
+
+
+    post_critical_alerts = prom_cli.process_query(
+        query
+    )
+
+    for alert in post_critical_alerts:
+        if "metric" in alert:
+            alertname = alert["metric"]["alertname"] if "alertname" in alert["metric"] else "none"
+            alertstate = alert["metric"]["alertstate"] if "alertstate" in alert["metric"] else "none"
+            namespace = alert["metric"]["namespace"] if "namespace" in alert["metric"] else "none"
+            severity = alert["metric"]["severity"] if "severity" in alert["metric"] else "none"
+            alert = ChaosRunAlert(alertname, alertstate, namespace, severity)
+            summary.post_chaos_alerts.append(alert)
+
+    during_critical_alerts_count = len(during_critical_alerts)
+    post_critical_alerts_count = len(post_critical_alerts)
+    firing_alerts = False
+
+    if during_critical_alerts_count > 0:
+        firing_alerts = True
+
+    if post_critical_alerts_count > 0:
+        firing_alerts = True
+
+    if not firing_alerts:
+        logging.info("No critical alerts are firing!!")
--- a/requirements.txt
+++ b/requirements.txt
@@ -15,7 +15,7 @@ google-api-python-client==2.116.0
 ibm_cloud_sdk_core==3.18.0
 ibm_vpc==0.20.0
 jinja2==3.1.3
-krkn-lib==1.4.12
+krkn-lib==2.1.2
 lxml==5.1.0
 kubernetes==26.1.0
 oauth2client==4.1.3
@@ -38,3 +38,4 @@ zope.interface==5.4.0

 git+https://github.com/krkn-chaos/arcaflow-plugin-kill-pod.git
 git+https://github.com/vmware/vsphere-automation-sdk-python.git@v8.0.0.0
+cryptography>=42.0.4 # not directly required, pinned by Snyk to avoid a vulnerability
--- a/run_kraken.py
+++ b/run_kraken.py
@@ -9,6 +9,8 @@ import optparse
 import pyfiglet
 import uuid
 import time
+
+from krkn_lib.models.krkn import ChaosRunOutput, ChaosRunAlertSummary
 from krkn_lib.prometheus.krkn_prometheus import KrknPrometheus
 import kraken.time_actions.common_time_functions as time_actions
 import kraken.performance_dashboards.setup as performance_dashboards
@@ -183,7 +185,7 @@ def main(cfg):
        telemetry_k8s = KrknTelemetryKubernetes(safe_logger, kubecli)
        telemetry_ocp = KrknTelemetryOpenshift(safe_logger, ocpcli)
        telemetry_elastic = KrknElastic(safe_logger,elastic_url)
-
+        summary = ChaosRunAlertSummary()
        if enable_alerts or check_critical_alerts:
            prometheus = KrknPrometheus(prometheus_url, prometheus_bearer_token)

@@ -215,8 +217,8 @@ def main(cfg):

        # Capture the start time
        start_time = int(time.time())
-        critical_alerts_count = 0
-
+        post_critical_alerts = 0
+        chaos_output = ChaosRunOutput()
        chaos_telemetry = ChaosRunTelemetry()
        chaos_telemetry.run_uuid = run_uuid
        # Loop to run the chaos starts here
@@ -262,7 +264,8 @@ def main(cfg):
                                kraken_config,
                                failed_post_scenarios,
                                wait_duration,
-                                telemetry_k8s
+                                telemetry_k8s,
+                                kubecli
                            )
                            chaos_telemetry.scenarios.extend(scenario_telemetries)
                        # krkn_lib
@@ -330,7 +333,7 @@ def main(cfg):
                        elif scenario_type == "application_outages":
                            logging.info("Injecting application outage")
                            failed_post_scenarios, scenario_telemetries = application_outage.run(
-                                scenarios_list, config, wait_duration, telemetry_k8s)
+                                scenarios_list, config, wait_duration, kubecli, telemetry_k8s)
                            chaos_telemetry.scenarios.extend(scenario_telemetries)

                        # PVC scenarios
@@ -347,22 +350,21 @@ def main(cfg):
                            failed_post_scenarios, scenario_telemetries = network_chaos.run(scenarios_list, config, wait_duration, kubecli, telemetry_k8s)

                        # Check for critical alerts when enabled
+                        post_critical_alerts = 0
                        if check_critical_alerts:
-                            logging.info("Checking for critical alerts firing post choas")
+                            prometheus_plugin.critical_alerts(prometheus,
+                                                              summary,
+                                                              run_uuid,
+                                                              scenario_type,
+                                                              start_time,
+                                                              datetime.datetime.now())

-                            ##PROM
-                            query = r"""ALERTS{severity="critical"}"""
-                            end_time = datetime.datetime.now()
-                            critical_alerts = prometheus.process_query(
-                                query
-                            )
-                            critical_alerts_count = len(critical_alerts)
-                            if critical_alerts_count > 0:
-                                logging.error("Critical alerts are firing: %s", critical_alerts)
-                                logging.error("Please check, exiting")
+                            chaos_output.critical_alerts = summary
+                            post_critical_alerts = len(summary.post_chaos_alerts)
+                            if post_critical_alerts > 0:
+                                logging.error("Post chaos critical alerts firing please check, exiting")
                                break
-                            else:
-                                logging.info("No critical alerts are firing!!")
+

            iteration += 1
            logging.info("")
@@ -382,14 +384,18 @@ def main(cfg):
            telemetry_k8s.collect_cluster_metadata(chaos_telemetry)

        decoded_chaos_run_telemetry = ChaosRunTelemetry(json.loads(chaos_telemetry.to_json()))
-        logging.info(f"Telemetry data:\n{decoded_chaos_run_telemetry.to_json()}")
+        chaos_output.telemetry = decoded_chaos_run_telemetry
+        logging.info(f"Chaos data:\n{chaos_output.to_json()}")
        telemetry_elastic.upload_data_to_elasticsearch(decoded_chaos_run_telemetry.to_json(), elastic_index)
        if config["telemetry"]["enabled"]:
-            logging.info(f"telemetry data will be stored on s3 bucket folder: {telemetry_api_url}/download/{telemetry_request_id}")
+            logging.info(f'telemetry data will be stored on s3 bucket folder: {telemetry_api_url}/files/'
+                         f'{(config["telemetry"]["telemetry_group"] if config["telemetry"]["telemetry_group"] else "default")}/'
+                         f'{telemetry_request_id}')
            logging.info(f"telemetry upload log: {safe_logger.log_file_name}")
            try:
                telemetry_k8s.send_telemetry(config["telemetry"], telemetry_request_id, chaos_telemetry)
                telemetry_k8s.put_cluster_events(telemetry_request_id, config["telemetry"], start_time, end_time)
+                telemetry_k8s.put_critical_alerts(telemetry_request_id, config["telemetry"], summary)
                # prometheus data collection is available only on Openshift
                if config["telemetry"]["prometheus_backup"]:
                    prometheus_archive_files = ''
@@ -439,15 +445,15 @@ def main(cfg):
                logging.error("Alert profile is not defined")
                sys.exit(1)

-        if critical_alerts_count > 0:
+        if post_critical_alerts > 0:
            logging.error("Critical alerts are firing, please check; exiting")
-            sys.exit(1)
+            sys.exit(2)

        if failed_post_scenarios:
            logging.error(
                "Post scenarios are still failing at the end of all iterations"
            )
-            sys.exit(1)
+            sys.exit(2)

        logging.info(
            "Successfully finished running Kraken. UUID for the run: "
--- a/scenarios/arcaflow/cpu-hog/config.yaml
+++ b/scenarios/arcaflow/cpu-hog/config.yaml
@@ -4,7 +4,7 @@ deployers:
    connection: {}
    deployer_name: kubernetes
 log:
-  level: debug
+  level: error
 logged_outputs:
  error:
    level: error
--- a/scenarios/arcaflow/io-hog/config.yaml
+++ b/scenarios/arcaflow/io-hog/config.yaml
@@ -3,7 +3,7 @@ deployers:
    connection: {}
    deployer_name: kubernetes
 log:
-  level: debug
+  level: error
 logged_outputs:
  error:
    level: error
--- a/scenarios/arcaflow/memory-hog/config.yaml
+++ b/scenarios/arcaflow/memory-hog/config.yaml
@@ -4,7 +4,7 @@ deployers:
    connection: {}
    deployer_name: kubernetes
 log:
-  level: debug
+  level: error
 logged_outputs:
  error:
    level: error
--- a/scenarios/kind/scheduler.yml
+++ b/scenarios/kind/scheduler.yml
@@ -3,8 +3,4 @@
  config:
    namespace_pattern: ^kube-system$
    label_selector: component=kube-scheduler
- id: wait-for-pods
-  config:
-    namespace_pattern: ^kube-system$
-    label_selector: component=kube-scheduler
-    count: 3
+    krkn_pod_recovery_time: 120
--- a/scenarios/kube/pod.yml
+++ b/scenarios/kube/pod.yml
@@ -4,3 +4,4 @@
    name_pattern: ^nginx-.*$
    namespace_pattern: ^default$
    kill: 1
+    krkn_pod_recovery_time: 120
--- a/scenarios/kube/scheduler.yml
+++ b/scenarios/kube/scheduler.yml
@@ -3,8 +3,4 @@
  config:
    namespace_pattern: ^kube-system$
    label_selector: k8s-app=kube-scheduler
- id: wait-for-pods
-  config:
-    namespace_pattern: ^kube-system$
-    label_selector: k8s-app=kube-scheduler
-    count: 3
+    krkn_pod_recovery_time: 120
--- a/scenarios/openshift/container_etcd.yml
+++ b/scenarios/openshift/container_etcd.yml
@@ -5,4 +5,4 @@ scenarios:
  container_name: "etcd"
  action: 1
  count: 1
-  expected_recovery_time: 60
+  expected_recovery_time: 120
--- a/scenarios/openshift/customapp_pod.yaml
+++ b/scenarios/openshift/customapp_pod.yaml
@@ -3,8 +3,4 @@
  config:
    namespace_pattern: ^acme-air$
    name_pattern: .*
- id: wait-for-pods
-  config:
-    namespace_pattern: ^acme-air$
-    name_pattern: .*
-    count: 8
+    krkn_pod_recovery_time: 120
--- a/scenarios/openshift/etcd.yml
+++ b/scenarios/openshift/etcd.yml
@@ -3,8 +3,4 @@
  config:
    namespace_pattern: ^openshift-etcd$
    label_selector: k8s-app=etcd
- id: wait-for-pods
-  config:
-    namespace_pattern: ^openshift-etcd$
-    label_selector: k8s-app=etcd
-    count: 3
+    krkn_pod_recovery_time: 120
--- a/scenarios/openshift/openshift-apiserver.yml
+++ b/scenarios/openshift/openshift-apiserver.yml
@@ -3,8 +3,5 @@
  config:
    namespace_pattern: ^openshift-apiserver$
    label_selector: app=openshift-apiserver-a
- id: wait-for-pods
-  config:
-    namespace_pattern: ^openshift-apiserver$
-    label_selector: app=openshift-apiserver-a
-    count: 3
+    krkn_pod_recovery_time: 120
+
--- a/scenarios/openshift/openshift-kube-apiserver.yml
+++ b/scenarios/openshift/openshift-kube-apiserver.yml
@@ -3,8 +3,5 @@
  config:
    namespace_pattern: ^openshift-kube-apiserver$
    label_selector: app=openshift-kube-apiserver
- id: wait-for-pods
-  config:
-    namespace_pattern: ^openshift-kube-apiserver$
-    label_selector: app=openshift-kube-apiserver
-    count: 3
+    krkn_pod_recovery_time: 120
+
--- a/scenarios/openshift/post_action_prometheus.yml
+++ b/scenarios/openshift/post_action_prometheus.yml
@@ -3,8 +3,4 @@
  config:
    namespace_pattern: ^openshift-monitoring$
    label_selector: app=prometheus
- id: wait-for-pods
-  config:
-    namespace_pattern: ^openshift-monitoring$
-    label_selector: app=prometheus
-    count: 2
+    krkn_pod_recovery_time: 120
--- a/scenarios/openshift/prom_kill.yml
+++ b/scenarios/openshift/prom_kill.yml
@@ -2,8 +2,4 @@
  config:
    namespace_pattern: ^openshift-monitoring$
    label_selector: statefulset.kubernetes.io/pod-name=prometheus-k8s-0
- id: wait-for-pods
-  config:
-    namespace_pattern: ^openshift-monitoring$
-    label_selector: statefulset.kubernetes.io/pod-name=prometheus-k8s-0
-    count: 1
+    krkn_pod_recovery_time: 120
--- a/scenarios/openshift/prometheus.yml
+++ b/scenarios/openshift/prometheus.yml
@@ -3,9 +3,4 @@
  config:
    namespace_pattern: ^openshift-monitoring$
    label_selector: app=prometheus
- id: wait-for-pods
-  config:
-    namespace_pattern: ^openshift-monitoring$
-    label_selector: app=prometheus
-    count: 2
-    timeout: 180
+    krkn_pod_recovery_time: 120
--- a/scenarios/openshift/regex_openshift_pod_kill.yml
+++ b/scenarios/openshift/regex_openshift_pod_kill.yml
@@ -4,3 +4,4 @@
    namespace_pattern: ^openshift-.*$
    name_pattern: .*
    kill: 3
+    krkn_pod_recovery_time: 120
--- a/scenarios/plugin.schema.json
+++ b/scenarios/plugin.schema.json
@@ -60,7 +60,14 @@
 										"default": 1,
 										"title": "Backoff",
 										"description": "How many seconds to wait between checks for the target pod status."
+									},
+									"krkn_pod_recovery_time": {
+										"type": "integer",
+										"default": 30,
+										"title": "Recovery Time",
+										"description": "The Expected Recovery time fo the pod (used by Krkn to monitor the pod lifecycle)."
 									}
+
 								},
 								"required": [
 									"namespace_pattern"
@@ -112,6 +119,12 @@
 								"default": 1,
 								"title": "Backoff",
 								"description": "How many seconds to wait between checks for the target pod status."
+							},
+							"krkn_pod_recovery_time": {
+								"type": "integer",
+								"default": 30,
+								"title": "Recovery Time",
+								"description": "The Expected Recovery time fo the pod (used by Krkn to monitor the pod lifecycle)."
 							}
 						},
 						"required": [
--- a/utils/chaos_ai/README.md
+++ b/utils/chaos_ai/README.md
@@ -0,0 +1,39 @@
+# aichaos
+Enhancing Chaos Engineering with AI-assisted fault injection for better resiliency and non-functional testing.
+
+## Generate python package wheel file
+```
+python3.9 generate_wheel_package.py sdist bdist_wheel
+```
+This creates a python package file aichaos-0.0.1-py3-none-any.whl in the dist folder. 
+
+## Build Image
+```
+cd docker
+podman build -t aichaos:1.0 .
+OR
+docker build -t aichaos:1.0 .
+```
+
+## Run Chaos AI
+```
+podman run -v aichaos-config.json:/config/aichaos-config.json --privileged=true --name aichaos -p 5001:5001 aichaos:1.0
+OR
+docker run -v aichaos-config.json:/config/aichaos-config.json --privileged -v /var/run/docker.sock:/var/run/docker.sock --name aichaos -p 5001:5001 aichaos:1.0
+```
+
+The output should look like:
+```
+$ podman run -v aichaos-config.json:/config/aichaos-config.json --privileged=true --name aichaos -p 5001:5001 aichaos:1.0
+ * Serving Flask app 'swagger_api' (lazy loading)
+ * Environment: production
+   WARNING: This is a development server. Do not use it in a production deployment.
+   Use a production WSGI server instead.
+ * Debug mode: on
+WARNING: This is a development server. Do not use it in a production deployment. Use a production WSGI server instead.
+ * Running on all addresses (0.0.0.0)
+ * Running on http://127.0.0.1:5001
+ * Running on http://172.17.0.2:5001
+```
+
+You can try out the APIs in browser at http://<server-ip>:5001/apidocs (eg. http://127.0.0.1:5001/apidocs). For testing out, you can try “GenerateChaos” api with ‘kubeconfig’ file and application URLs to test.
--- a/utils/chaos_ai/config/experiments/.gitkeep
+++ b/utils/chaos_ai/config/experiments/.gitkeep
--- a/utils/chaos_ai/docker/Dockerfile
+++ b/utils/chaos_ai/docker/Dockerfile
@@ -0,0 +1,21 @@
+FROM bitnami/kubectl:1.20.9 as kubectl
+FROM python:3.9
+WORKDIR /app
+RUN pip3 install --upgrade pip
+COPY config config/
+COPY requirements.txt .
+RUN mkdir -p /app/logs
+RUN pip3 install -r requirements.txt
+
+COPY --from=kubectl /opt/bitnami/kubectl/bin/kubectl /usr/local/bin/
+
+COPY swagger_api.py .
+ENV PYTHONUNBUFFERED=1
+
+RUN curl -fsSLO https://get.docker.com/builds/Linux/x86_64/docker-17.03.1-ce.tgz && tar --strip-components=1 -xvzf docker-17.03.1-ce.tgz -C /usr/local/bin
+
+RUN apt-get update && apt-get install -y podman
+
+COPY aichaos-0.0.1-py3-none-any.whl .
+RUN pip3 install aichaos-0.0.1-py3-none-any.whl
+CMD ["python3", "swagger_api.py"]
--- a/utils/chaos_ai/docker/aichaos-config.json
+++ b/utils/chaos_ai/docker/aichaos-config.json
@@ -0,0 +1,7 @@
+{
+  "command": "podman",
+  "chaosengine": "kraken",
+  "faults": "pod-delete",
+  "iterations": 1,
+  "maxfaults": 5
+}
--- a/utils/chaos_ai/docker/config/experiments/log.yml
+++ b/utils/chaos_ai/docker/config/experiments/log.yml
@@ -0,0 +1,15 @@
+
+    Get Log from the Chaos ID.---
+    tags:
+      - ChaosAI API Results
+    parameters:
+      - name: chaosid
+        in: path
+        type: string
+        required: true
+        description: Chaos-ID
+    responses:
+      500:
+        description: Error!
+      200:
+        description: Results for the given Chaos ID.
--- a/utils/chaos_ai/docker/config/pod-delete.json
+++ b/utils/chaos_ai/docker/config/pod-delete.json
@@ -0,0 +1,36 @@
+{
+  "apiVersion": "1.0",
+  "kind": "ChaosEngine",
+  "metadata": {
+    "name": "engine-cartns3"
+  },
+  "spec": {
+    "engineState": "active",
+    "annotationCheck": "false",
+    "appinfo": {
+      "appns": "robot-shop",
+      "applabel": "service=payment",
+      "appkind": "deployment"
+    },
+    "chaosServiceAccount": "pod-delete-sa",
+    "experiments": [
+      {
+        "name": "pod-delete",
+        "spec": {
+          "components": {
+            "env": [
+              {
+                "name": "FORCE",
+                "value": "true"
+              },
+              {
+                "name": "TOTAL_CHAOS_DURATION",
+                "value": "120"
+              }
+            ]
+          }
+        }
+      }
+    ]
+  }
+}
--- a/utils/chaos_ai/docker/config/yml/chaosGen.yml
+++ b/utils/chaos_ai/docker/config/yml/chaosGen.yml
@@ -0,0 +1,40 @@
+
+Generate chaos on an application deployed on a cluster.
+---
+    tags:
+      - ChaosAI API
+    parameters:
+      - name: file
+        in: formData
+        type: file
+        required: true
+        description: Kube-config file
+      - name: namespace
+        in: formData
+        type: string
+        default: robot-shop
+        required: true
+        description: Namespace to test
+      - name: podlabels
+        in: formData
+        type: string
+        default: service=cart,service=payment
+        required: true
+        description: Pod labels to test
+      - name: nodelabels
+        in: formData
+        type: string
+        required: false
+        description: Node labels to test
+      - name: urls
+        in: formData
+        type: string
+        default: http://<application-url>:8097/api/cart/health,http://<application-url>:8097/api/payment/health
+        required: true
+        description: Application URLs to test
+
+    responses:
+      500:
+        description: Error!
+      200:
+        description: Chaos ID for the initiated chaos.
--- a/utils/chaos_ai/docker/config/yml/episodes.yml
+++ b/utils/chaos_ai/docker/config/yml/episodes.yml
@@ -0,0 +1,15 @@
+
+    Get Episodes from the Chaos ID.---
+    tags:
+      - ChaosAI API Results
+    parameters:
+      - name: chaosid
+        in: path
+        type: string
+        required: true
+        description: Chaos-ID
+    responses:
+      500:
+        description: Error!
+      200:
+        description: Results for the given Chaos ID.
--- a/utils/chaos_ai/docker/config/yml/log.yml
+++ b/utils/chaos_ai/docker/config/yml/log.yml
@@ -0,0 +1,15 @@
+
+    Get Log from the Chaos ID.---
+    tags:
+      - ChaosAI API Results
+    parameters:
+      - name: chaosid
+        in: path
+        type: string
+        required: true
+        description: Chaos-ID
+    responses:
+      500:
+        description: Error!
+      200:
+        description: Results for the given Chaos ID.
--- a/utils/chaos_ai/docker/config/yml/qtable.yml
+++ b/utils/chaos_ai/docker/config/yml/qtable.yml
@@ -0,0 +1,15 @@
+
+    Get QTable from the Chaos ID.---
+    tags:
+      - ChaosAI API Results
+    parameters:
+      - name: chaosid
+        in: path
+        type: string
+        required: true
+        description: Chaos-ID
+    responses:
+      500:
+        description: Error!
+      200:
+        description: Results for the given Chaos ID.
--- a/utils/chaos_ai/docker/config/yml/status.yml
+++ b/utils/chaos_ai/docker/config/yml/status.yml
@@ -0,0 +1,15 @@
+
+     Get status of the Constraints ID.---
+    tags:
+      - ChaosAI API
+    parameters:
+      - name: chaosid
+        in: path
+        type: string
+        required: true
+        description: Chaos-ID
+    responses:
+      500:
+        description: Error!
+      200:
+        description: Chaos for the given ID.
--- a/utils/chaos_ai/docker/requirements.txt
+++ b/utils/chaos_ai/docker/requirements.txt
@@ -0,0 +1,6 @@
+numpy
+pandas
+requests
+Flask==2.1.0
+Werkzeug==2.3.8
+flasgger==0.9.5
--- a/utils/chaos_ai/docker/swagger_api.py
+++ b/utils/chaos_ai/docker/swagger_api.py
@@ -0,0 +1,186 @@
+import json, os
+import logging
+# import numpy as np
+# import pandas as pd
+import threading
+from datetime import datetime
+from flask import Flask, request
+from flasgger import Swagger
+from flasgger.utils import swag_from
+# import zipfile
+import sys
+
+# sys.path.append("..")
+from src.aichaos_main import AIChaos
+
+app = Flask(__name__)
+Swagger(app)
+flaskdir = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "app", "logs") + '/'
+
+
+class AIChaosSwagger:
+    def __init__(self, flaskdir=''):
+        self.flaskdir = flaskdir
+
+    @app.route("/")
+    def empty(params=''):
+        return "AI Chaos Repository!"
+
+    def startchaos(self, kubeconfigfile, file_id, params):
+        print('[StartChaos]', file_id, kubeconfigfile)
+        dir = flaskdir
+        outfile = ''.join([dir, 'out-', file_id])
+        initfile = ''.join([dir, 'init-', file_id])
+        with open(initfile, 'w'):
+            pass
+        if os.path.exists(outfile):
+            os.remove(outfile)
+        # kubeconfigfile = params['file']
+        os.environ["KUBECONFIG"] = kubeconfigfile
+        os.system("export KUBECONFIG="+kubeconfigfile)
+        os.system("echo $KUBECONFIG")
+        print('setting kubeconfig')
+        params['command'] = 'podman'
+        params['chaosengine'] = 'kraken'
+        params['faults'] = 'pod-delete'
+        params['iterations'] = 1
+        params['maxfaults'] = 5
+        if os.path.isfile('/config/aichaos-config.json'):
+            with open('/config/aichaos-config.json') as f:
+                config_params = json.load(f)
+                params['command'] = config_params['command']
+                params['chaosengine'] = config_params['chaosengine']
+                params['faults']= config_params['faults']
+                params['iterations'] = config_params['iterations']
+                params['maxfaults'] = config_params['maxfaults']
+        # faults = [f + ':' + p for f in params['faults'].split(',') for p in params['podlabels'].split(',')]
+        faults = []
+        for f in params['faults'].split(','):
+            if f in ['pod-delete']:
+                for p in params['podlabels'].split(','):
+                    faults.append(f + ':' + p)
+            elif f in ['network-chaos', 'node-memory-hog', 'node-cpu-hog']:
+                for p in params['nodelabels'].split(','):
+                    faults.append(f + ':' + p)
+            else:
+                pass
+
+        print('#faults:', len(faults), faults)
+        states = {'200': 0, '500': 1, '501': 2, '502': 3, '503': 4, '504': 5,
+                  '401': 6,  '403': 7,  '404': 8,  '429': 9,
+                  'Timeout': 10, 'Other': 11}
+        rewards = {'200': -1, '500': 0.8, '501': 0.8, '502': 0.8, '503': 0.8, '504': 0.8,
+                   '401': 1,  '403': 1,  '404': 1,  '429': 1,
+                   'Timeout': 1, 'Other': 1}
+        logfile = self.flaskdir + 'log_' + str(file_id)
+        qfile = self.flaskdir + 'qfile_' + str(file_id) + '.csv'
+        efile = self.flaskdir + 'efile_' + str(file_id)
+        epfile = self.flaskdir + 'episodes_' + str(file_id) + '.json'
+        # probe_url = params['probeurl']
+        cexp = {'pod-delete': 'pod-delete.json', 'cpu-hog': 'pod-cpu-hog.json',
+                'disk-fill': 'disk-fill.json', 'network-loss': 'network-loss.json',
+                'network-corruption': 'network-corruption.json', 'io-stress': 'io-stress.json'}
+        aichaos = AIChaos(states=states, faults=faults, rewards=rewards,
+                          logfile=logfile, qfile=qfile, efile=efile, epfile=epfile,
+                          urls=params['urls'].split(','), namespace=params['namespace'],
+                          max_faults=int(params['maxfaults']),
+                          num_requests=10, timeout=2,
+                          chaos_engine=params['chaosengine'],
+                          chaos_dir='config/', kubeconfig=kubeconfigfile,
+                          loglevel=logging.DEBUG, chaos_experiment=cexp, iterations=int(params['iterations']),
+                          command=params['command'])
+        print('checking kubeconfig')
+        os.system("echo $KUBECONFIG")
+        aichaos.start_chaos()
+
+        file = open(outfile, "w")
+        file.write('done')
+        file.close()
+        os.remove(initfile)
+        # os.remove(csvfile)
+        # ConstraintsInference().remove_temp_files(dir, file_id)
+        return 'WRITE'
+
+    @app.route('/GenerateChaos/', methods=['POST'])
+    @swag_from('config/yml/chaosGen.yml')
+    def chaos_gen():
+        dir = flaskdir
+        sw = AIChaosSwagger(flaskdir=dir)
+        f = request.files['file']
+        list = os.listdir(dir)
+        for i in range(10000):
+            fname = 'kubeconfig-'+str(i)
+            if fname not in list:
+                break
+        kubeconfigfile = ''.join([dir, 'kubeconfig-', str(i)])
+        f.save(kubeconfigfile)
+        # creating empty file
+        open(kubeconfigfile, 'a').close()
+        # print('HEADER:', f.headers)
+        print('[GenerateChaos] reqs:', request.form.to_dict())
+        # print('[GenerateChaos]', f.filename, datetime.now())
+        thread = threading.Thread(target=sw.startchaos, args=(kubeconfigfile, str(i), request.form.to_dict()))
+        thread.daemon = True
+        print(thread.getName())
+        thread.start()
+        return 'Chaos ID: ' + str(i)
+
+    @app.route('/GetStatus/<chaosid>', methods=['GET'])
+    @swag_from('config/yml/status.yml')
+    def get_status(chaosid):
+        print('[GetStatus]', chaosid, flaskdir)
+        epfile = flaskdir + 'episodes_' + str(chaosid) + '.json'
+        initfile = ''.join([flaskdir, 'init-', chaosid])
+        if os.path.exists(epfile):
+            return 'Completed'
+        elif os.path.exists(initfile):
+            return 'Running'
+        else:
+            return 'Does not exist'
+
+    @app.route('/GetQTable/<chaosid>', methods=['GET'])
+    @swag_from('config/yml/qtable.yml')
+    def get_qtable(chaosid):
+        print('[GetQTable]', chaosid)
+        qfile = flaskdir + 'qfile_' + str(chaosid) + '.csv'
+        initfile = ''.join([flaskdir, 'init-', chaosid])
+        if os.path.exists(qfile):
+            f = open(qfile, "r")
+            return f.read()
+        elif os.path.exists(initfile):
+            return 'Running'
+        else:
+            return 'Invalid Chaos ID: ' + chaosid
+
+    @app.route('/GetEpisodes/<chaosid>', methods=['GET'])
+    @swag_from('config/yml/episodes.yml')
+    def get_episodes(chaosid):
+        print('[GetEpisodes]', chaosid)
+        epfile = flaskdir + 'episodes_' + str(chaosid) + '.json'
+        initfile = ''.join([flaskdir, 'init-', chaosid])
+        if os.path.exists(epfile):
+            f = open(epfile, "r")
+            return f.read()
+        elif os.path.exists(initfile):
+            return 'Running'
+        else:
+            return 'Invalid Chaos ID: ' + chaosid
+
+
+    @app.route('/GetLog/<chaosid>', methods=['GET'])
+    @swag_from('config/yml/log.yml')
+    def get_log(chaosid):
+        print('[GetLog]', chaosid)
+        epfile = flaskdir + 'log_' + str(chaosid)
+        initfile = ''.join([flaskdir, 'init-', chaosid])
+        if os.path.exists(epfile):
+            f = open(epfile, "r")
+            return f.read()
+        elif os.path.exists(initfile):
+            return 'Running'
+        else:
+            return 'Invalid Chaos ID: ' + chaosid
+
+
+if __name__ == '__main__':
+    app.run(debug=True, host='0.0.0.0', port='5001')
--- a/utils/chaos_ai/generate_wheel_package.py
+++ b/utils/chaos_ai/generate_wheel_package.py
@@ -0,0 +1,21 @@
+import setuptools
+# from setuptools_cythonize import get_cmdclass
+
+setuptools.setup(
+    # cmdclass=get_cmdclass(),
+    name="aichaos",
+    version="0.0.1",
+    author="Sandeep Hans",
+    author_email="shans001@in.ibm.com",
+    description="Chaos AI",
+    long_description="Chaos Engineering using AI",
+    long_description_content_type="text/markdown",
+    url="",
+    packages=setuptools.find_packages(),
+    classifiers=[
+        "Programming Language :: Python :: 3",
+        "License :: OSI Approved :: MIT License",
+        "Operating System :: OS Independent",
+    ],
+    python_requires='>=3.9',
+)
--- a/utils/chaos_ai/requirements.txt
+++ b/utils/chaos_ai/requirements.txt
@@ -0,0 +1,10 @@
+numpy
+pandas
+notebook
+jupyterlab
+jupyter
+seaborn
+requests
+wheel
+Flask==2.1.0
+flasgger==0.9.5
--- a/utils/chaos_ai/src/init.py
+++ b/utils/chaos_ai/src/init.py
--- a/utils/chaos_ai/src/aichaos.py
+++ b/utils/chaos_ai/src/aichaos.py
@@ -0,0 +1,213 @@
+import json
+import os
+import random
+import sys
+
+import numpy as np
+import logging
+
+
+class AIChaos:
+    def __init__(self, states=None, faults=None, rewards=None, pod_names=[], chaos_dir=None,
+                 chaos_experiment='experiment.json',
+                 chaos_journal='journal.json', iterations=1000, static_run=False):
+        self.faults = faults
+        self.pod_names = pod_names
+        self.states = states
+        self.rewards = rewards
+        self.episodes = []
+
+        self.chaos_dir = chaos_dir
+        self.chaos_experiment = chaos_experiment
+        self.chaos_journal = chaos_journal
+
+        self.iterations = iterations
+        # Initialize parameters
+        self.gamma = 0.75  # Discount factor
+        self.alpha = 0.9  # Learning rate
+
+        # Initializing Q-Values
+        # self.Q = np.array(np.zeros([9, 9]))
+        # self.Q = np.array(np.zeros([len(faults), len(faults)]))
+        # currently action is a single fault, later on we will do multiple faults together
+        # For multiple faults, the no of cols in q-matrix will be all combinations of faults (infinite)
+        # eg. {f1,f2},f3,f4,{f4,f5} - f1,f2  in parallel, then f3, then f4,  then f4,f5 in parallel produces end state
+        # self.Q = np.array(np.zeros([len(states), len(states)]))
+        self.Q = np.array(np.zeros([len(states), len(faults)]))
+        self.state_matrix = np.array(np.zeros([len(states), len(states)]))
+
+        # may be Q is a dictionary of dictionaries, for each state there is a dictionary of faults
+        # Q = {'500' = {'f1f2f4': 0.3, 'f1':  0.5}, '404' = {'f2': 0.22}}
+
+        self.logger = logging.getLogger()
+        # run from old static experiment and journal files
+        self.static_run = static_run
+
+    # End state is reached when system is down or return error code like '500','404'
+    def get_next_state(self):
+        self.logger.info('[GET_NEXT_STATE]')
+        f = open(self.chaos_dir + self.chaos_journal)
+        data = json.load(f)
+
+        # before the experiment (if before steady state is false, after is null?)
+        for probe in data['steady_states']['before']['probes']:
+            if not probe['tolerance_met']:
+                # start_state = probe['activity']['tolerance']
+                # end_state = probe['status']
+                start_state, end_state = None, None
+                return start_state, end_state
+
+        # after the experiment
+        for probe in data['steady_states']['after']['probes']:
+            # if probe['output']['status'] == probe['activity']['tolerance']:
+            if not probe['tolerance_met']:
+                # print(probe)
+                start_state = probe['activity']['tolerance']
+                end_state = probe['output']['status']
+                # end_state = probe['status']
+                return start_state, end_state
+        # if tolerances for all probes are met
+        start_state = probe['activity']['tolerance']
+        end_state = probe['activity']['tolerance']
+        return start_state, end_state
+
+    def inject_faults(self, fault, pod_name):
+        self.logger.info('[INJECT_FAULT] ' + fault)
+        f = open(self.chaos_dir + self.chaos_experiment)
+        data = json.load(f)
+        for m in data['method']:
+            if 'provider' in m:
+                if fault == 'kill_microservice':
+                    m['name'] = 'kill-microservice'
+                    m['provider']['module'] = 'chaosk8s.actions'
+                    m['provider']['arguments']['name'] = pod_name
+                else:
+                    m['provider']['arguments']['name_pattern'] = pod_name
+                m['provider']['func'] = fault
+
+                print('[INJECT_FAULT] method:', m)
+                # self.logger.info('[INJECT_FAULT] ' + m['provider']['arguments']['name_pattern'])
+                # self.logger.info('[INJECT_FAULT] ' + str(m))
+
+        exp_file = self.chaos_dir + 'experiment_' + str(random.randint(1, 10)) + '.json'
+        with open(exp_file, 'w') as f:
+            json.dump(data, f)
+        exp_file = self.chaos_dir + 'experiment.json'
+        # execute faults
+        # cmd = 'cd ' + self.chaos_dir + ';chaos run ' + self.chaos_experiment
+        cmd = 'cd ' + self.chaos_dir + ';chaos run ' + exp_file
+        if not self.static_run:
+            os.system(cmd)
+
+    def create_episode(self):
+        self.logger.info('[CREATE_EPISODE]')
+        episode = []
+        while True:
+            # inject more faults
+            # TODO: model - choose faults based on q-learning ...
+            fault_pod = random.choice(self.faults)
+            fault = fault_pod.split(':')[0]
+            pod_name = fault_pod.split(':')[1]
+            # fault = random.choice(self.faults)
+            # pod_name = random.choice(self.pod_names)
+            # fault = lstm_model.get_next_fault(episode)
+            # fault = get_max_prob_fault(episode)
+
+            self.inject_faults(fault, pod_name)
+            start_state, next_state = self.get_next_state()
+            print('[CREATE EPISODE]', start_state, next_state)
+            # if before state tolerance is not met
+            if start_state is None and next_state is None:
+                continue
+
+            episode.append({'fault': fault, 'pod_name': pod_name})
+            self.update_q_fault(fault_pod, episode, start_state, next_state)
+            # self.update_q_fault(fault, episode, start_state, next_state)
+            # if an end_state is reached
+            # if next_state is not None:
+            if start_state != next_state:
+                self.logger.info('[CREATE_EPISODE] EPISODE CREATED:' + str(episode))
+                self.logger.info('[CREATE_EPISODE] END STATE:' + str(next_state))
+                return episode, start_state, next_state
+
+    def update_q_fault(self, fault, episode, start_state, end_state):
+        self.logger.info('[UPDATE_Q]')
+        print('[UPDATE_Q] ', str(start_state), str(end_state))
+        if end_state is None:
+            end_state = start_state
+
+        # reward is dependent on the error response (eg. '404') and length of episode
+        reward = self.rewards[str(end_state)] / len(episode)
+        current_state = self.states[str(start_state)]
+        next_state = self.states[str(end_state)]
+        fault_index = self.faults.index(fault)
+
+        TD = reward + \
+             self.gamma * self.Q[next_state, np.argmax(self.Q[next_state,])] - \
+             self.Q[current_state, fault_index]
+        self.Q[current_state, fault_index] += self.alpha * TD
+
+        # update state matrix
+        TD_state = reward + \
+                   self.gamma * self.state_matrix[next_state, np.argmax(self.state_matrix[next_state,])] - \
+                   self.state_matrix[current_state, next_state]
+        self.state_matrix[current_state, next_state] += self.alpha * TD_state
+
+    # def update_q(self, episode, start_state, end_state):
+    #     self.logger.info('[UPDATE_Q]')
+    #     if end_state is None:
+    #         end_state = start_state
+    #
+    #     # reward is dependent on the error response (eg. '404') and length of episode
+    #     reward = self.rewards[str(end_state)] / len(episode)
+    #     current_state = self.states[str(start_state)]
+    #     next_state = self.states[str(end_state)]
+    #     TD = reward + \
+    #          self.gamma * self.Q[next_state, np.argmax(self.Q[next_state,])] - \
+    #          self.Q[current_state, next_state]
+    #     self.Q[current_state, next_state] += self.alpha * TD
+
+    def start_chaos(self):
+        for i in range(self.iterations):
+            episode, start_state, end_state = self.create_episode()
+            # update Q matrix
+            # will do it with each fault injection
+            # self.update_q(episode, start_state, end_state)
+            print(self.Q)
+            print(self.state_matrix)
+
+
+def test_chaos():
+    svc_list = ['cart', 'catalogue', 'dispatch', 'mongodb', 'mysql', 'payment', 'rabbitmq', 'ratings', 'redis',
+                'shipping', 'user', 'web']
+    # Define faults
+    # faults = ['terminate_pods']
+    #     faults = ['terminate_pods:' + x for x in pod_names]
+    faults = ['kill_microservice:' + x for x in svc_list]
+    # Define the states
+    states = {
+        '200': 0,
+        '500': 1,
+        '404': 2
+    }
+    # Define rewards, currently not used
+    rewards = {
+        '200': 0,
+        '500': 0.8,
+        '404': 1
+    }
+
+    # cdir = '/Users/sandeephans/Downloads/chaos/chaostoolkit-samples-master/service-down-not-visible-to-users/'
+    cdir = '/Users/sandeephans/Downloads/openshift/'
+    cexp = 'experiment.json'
+    cjournal = 'journal.json'
+
+    aichaos = AIChaos(states=states, faults=faults, rewards=rewards,
+                      chaos_dir=cdir, chaos_experiment=cexp, chaos_journal=cjournal,
+                      static_run=False)
+    aichaos.start_chaos()
+
+
+if __name__ == '__main__':
+    logging.basicConfig(stream=sys.stdout, level=logging.INFO)
+    test_chaos()
--- a/utils/chaos_ai/src/aichaos_main.py
+++ b/utils/chaos_ai/src/aichaos_main.py
@@ -0,0 +1,248 @@
+import json
+import os
+import random
+
+import numpy as np
+import pandas as pd
+import logging
+
+# sys.path.insert(1, os.path.join(sys.path[0], '..'))
+import src.utils as utils
+from src.kraken_utils import KrakenUtils
+from src.qlearning import QLearning
+from src.test_application import TestApplication
+
+
+class AIChaos:
+    def __init__(self, namespace='robot-shop', states=None, faults=None, rewards=None, urls=[], max_faults=5,
+                 service_weights=None, ctd_subsets=None, pod_names=[], chaos_dir='../config/', kubeconfig='~/.kube/config',
+                 chaos_experiment='experiment.json', logfile='log', qfile='qfile.csv', efile='efile', epfile='episodes.json',
+                 loglevel=logging.INFO,
+                 chaos_journal='journal.json', iterations=10, alpha=0.9, gamma=0.2, epsilon=0.3,
+                 num_requests=10, sleep_time=1, timeout=2, chaos_engine='kraken', dstk_probes=None,
+                 static_run=False, all_faults=False, command='podman'):
+        self.namespace = namespace
+        self.faults = faults
+        self.unused_faults = faults.copy()
+        self.all_faults = all_faults
+        self.pod_names = pod_names
+        self.states = states
+        self.rewards = rewards
+        self.urls = urls
+        self.max_faults = max_faults
+        self.episodes = []
+        self.service_weights = service_weights
+        self.ctd_subsets = ctd_subsets
+
+        self.kubeconfig = kubeconfig
+        self.chaos_dir = chaos_dir
+        self.chaos_experiment = chaos_experiment
+        self.chaos_journal = chaos_journal
+        self.command = command
+
+        if chaos_engine == 'kraken':
+            self.chaos_engine = KrakenUtils(namespace, kubeconfig=kubeconfig, chaos_dir=chaos_dir, chaos_experiment=chaos_experiment, command=self.command)
+        else:
+            self.chaos_engine = None
+
+        self.iterations = iterations
+        # Initialize RL parameters
+        self.epsilon = epsilon  # epsilon decay policy
+        # self.epsdecay = 0
+
+        # log files
+        self.logfile = logfile
+        self.qfile = qfile
+        self.efile = efile
+        self.epfile = epfile
+        open(efile, 'w+').close()
+        open(logfile, 'w+').close()
+        open(logfile, 'r+').truncate(0)
+        logging.getLogger("requests").setLevel(logging.WARNING)
+        logging.getLogger("urllib3").setLevel(logging.WARNING)
+        logging.basicConfig(filename=logfile, filemode='w+', level=loglevel)
+        self.logger = logging.getLogger(logfile.replace('/',''))
+        self.logger.addHandler(logging.FileHandler(logfile))
+
+        self.testapp = TestApplication(num_requests, timeout, sleep_time)
+        self.ql = QLearning(gamma, alpha, faults, states, rewards, urls)
+
+        # run from old static experiment and journal files
+        self.static_run = static_run
+
+    def realistic(self, faults_pods):
+        self.logger.debug('[Realistic] ' + str(faults_pods))
+        fp = faults_pods.copy()
+        for f1 in faults_pods:
+            for f2 in faults_pods:
+                if f1 == f2:
+                    continue
+                if f1 in fp and f2 in fp:
+                    f1_fault, load_1 = utils.get_load(f1.split(':')[0])
+                    f1_pod = f1.split(':')[1]
+                    f2_fault, load_2 = utils.get_load(f2.split(':')[0])
+                    f2_pod = f2.split(':')[1]
+                    if f1_pod == f2_pod:
+                        if f1_fault == 'pod-delete':
+                            fp.remove(f2)
+                        if f1_fault == f2_fault:
+                            # if int(load_1) > int(load_2):
+                            # randomly remove one fault from same faults with different params
+                            fp.remove(f2)
+        if self.service_weights is None:
+            return fp
+
+        fp_copy = fp.copy()
+        for f in fp:
+            f_fault = f.split(':')[0]
+            f_pod = f.split(':')[1].replace('service=', '')
+            self.logger.debug('[ServiceWeights] ' + f + ' ' + str(self.service_weights[f_pod][f_fault]))
+            if self.service_weights[f_pod][f_fault] == 0:
+                fp_copy.remove(f)
+
+        self.logger.debug('[Realistic] ' + str(fp_copy))
+        return fp_copy
+
+    def select_faults(self):
+        max_faults = min(self.max_faults, len(self.unused_faults))
+        num_faults = random.randint(1, max_faults)
+        if self.all_faults:
+            num_faults = len(self.unused_faults)
+        if random.random() > self.epsilon:
+            self.logger.info('[Exploration]')
+            # faults_pods = random.sample(self.faults, k=num_faults)
+            # using used faults list to avoid starvation
+            faults_pods = random.sample(self.unused_faults, k=num_faults)
+            faults_pods = self.realistic(faults_pods)
+            for f in faults_pods:
+                self.unused_faults.remove(f)
+            if len(self.unused_faults) == 0:
+                self.unused_faults = self.faults.copy()
+        else:
+            self.logger.info('[Exploitation]')
+            first_row = self.ql.Q[:, 0, :][0]
+            top_k_indices = np.argpartition(first_row, -num_faults)[-num_faults:]
+            faults_pods = [self.faults[i] for i in top_k_indices]
+            faults_pods = self.realistic(faults_pods)
+
+        return faults_pods
+
+    def create_episode(self, ctd_subset=None):
+        self.logger.debug('[CREATE_EPISODE]')
+        episode = []
+
+        if ctd_subset is None:
+            faults_pods = self.select_faults()
+        else:
+            faults_pods = ctd_subset
+            self.logger.info('CTD Subset: ' + str(faults_pods))
+
+        # faults_pods = self.realistic(faults_pods)
+        if len(faults_pods) == 0:
+            return [], 200, 200
+
+        engines = []
+        for fp in faults_pods:
+            fault = fp.split(':')[0]
+            pod_name = fp.split(':')[1]
+            engine = self.chaos_engine.inject_faults(fault, pod_name)
+            engines.append(engine)
+            episode.append({'fault': fault, 'pod_name': pod_name})
+        self.logger.info('[create_episode]' + str(faults_pods))
+        engines_running = self.chaos_engine.wait_engines(engines)
+        self.logger.info('[create_episode] engines_running' + str(engines_running))
+        if not engines_running:
+            return None, None, None
+
+        # randomly shuffling urls 
+        urls = random.sample(self.urls, len(self.urls))
+        ep_json = []
+        for url in urls:
+            start_state, next_state = self.testapp.test_load(url)
+            self.logger.info('[CREATE EPISODE]' + str(start_state) + ',' + str(next_state))
+            # if before state tolerance is not met
+            if start_state is None and next_state is None:
+                # self.cleanup()
+                self.chaos_engine.stop_engines()
+                continue
+
+                ### episode.append({'fault': fault, 'pod_name': pod_name})
+                # self.update_q_fault(fault_pod, episode, start_state, next_state)
+            url_index = self.urls.index(url)
+            self.logger.info('[CREATEEPISODE]' + str(url) + ':' + str(url_index))
+            for fp in faults_pods:
+                self.ql.update_q_fault(fp, episode, start_state, next_state, self.urls.index(url))
+            ep_json.append({'start_state': start_state, 'next_state': next_state, 'url': url, 'faults': episode})
+
+        self.logger.debug('[CREATE_EPISODE] EPISODE CREATED:' + str(episode))
+        self.logger.debug('[CREATE_EPISODE] END STATE:' + str(next_state))
+
+        self.chaos_engine.print_result(engines)
+        self.chaos_engine.stop_engines(episode=episode)
+        # ep_json = {'start_state': start_state, 'next_state': next_state, 'faults': episode}
+
+        return ep_json, start_state, next_state
+
+    def start_chaos(self):
+        self.logger.info('[INITIALIZING]')
+        self.logger.info('Logfile: '+self.logfile)
+        self.logger.info('Loggerfile: '+self.logger.handlers[0].stream.name)
+        self.logger.info('Chaos Engine: ' + self.chaos_engine.get_name())
+        self.logger.debug('Faults:' + str(self.faults))
+
+        self.chaos_engine.cleanup()
+        if self.ctd_subsets is None:
+            for i in range(self.iterations):
+                episode, start_state, end_state = self.create_episode()
+                self.logger.debug('[start_chaos]' + str(i) + ' ' + str(episode))
+                if episode is None:
+                    continue
+                # update Q matrix
+                # will do it with each fault injection
+                # self.update_q(episode, start_state, end_state)
+                # if episode['next_state'] != '200':
+                self.episodes.extend(episode)
+                self.logger.info(str(i) + ' ' + str(self.ql.Q[:, 0]))
+                # print(i, self.state_matrix)
+                self.write_q()
+                self.write_episode(episode)
+        else:
+            for i, subset in enumerate(self.ctd_subsets):
+                episode, start_state, end_state = self.create_episode(subset)
+                self.logger.debug('[start_chaos]' + str(episode))
+                if episode is None:
+                    continue
+                self.episodes.append(episode)
+                self.logger.info(str(i) + ' ' + str(self.ql.Q[:, 0]))
+                self.write_q()
+                self.write_episode(episode)
+
+        self.chaos_engine.cleanup()
+        # self.remove_temp_file()
+        with open(self.epfile, 'w', encoding='utf-8') as f:
+            json.dump(self.episodes, f, ensure_ascii=False, indent=4)
+        self.logger.info('COMPLETE!!!')
+
+    def write_q(self):
+        df = pd.DataFrame(self.ql.Q[:, 0, :], index=self.urls, columns=self.faults)
+        df.to_csv(self.qfile)
+        return df
+
+    def write_episode(self, episode):
+        for ep in episode:
+            with open(self.efile, "a") as outfile:
+                x = [e['fault'] + ':' + e['pod_name'] for e in ep['faults']]
+                x.append(ep['url'])
+                x.append(str(ep['next_state']))
+                outfile.write(','.join(x) + '\n')
+
+    def remove_temp_file(self):
+        mydir = self.chaos_dir + 'experiments'
+        print('Removing temp files from: '+mydir)
+        self.logger.debug('Removing temp files: '+mydir)
+        if os.path.exists(mydir):
+            return
+        filelist = [f for f in os.listdir(mydir) if f.endswith(".json")]
+        for f in filelist:
+            print(f)
+            os.remove(os.path.join(mydir, f))
--- a/utils/chaos_ai/src/experiments.py
+++ b/utils/chaos_ai/src/experiments.py
@@ -0,0 +1,56 @@
+import random
+
+
+class Experiments:
+    def __init__(self):
+        self.k = 0
+
+    def monotonic(self, aichaos, num_sets=3):
+        for i in range(num_sets):
+            faults_pods = random.sample(aichaos.faults, k=2)
+            faults_set = [[faults_pods[0]], [faults_pods[1]], [faults_pods[0], faults_pods[1]]]
+
+            resp1, resp2, resp_both = 0, 0, 0
+            for fl in faults_set:
+                engines = []
+                for fp in fl:
+                    fault = fp.split(':')[0]
+                    pod_name = fp.split(':')[1]
+                    engine = aichaos.inject_faults_litmus(fault, pod_name)
+                    engines.append(engine)
+                aichaos.litmus.wait_engines(engines)
+
+                for index, url in enumerate(aichaos.urls):
+                    start_state, next_state = aichaos.test_load(url)
+                    print(i, fl, next_state)
+                    # self.write(str(fl), next_state)
+                    if resp1 == 0:
+                        resp1 = next_state
+                    elif resp2 == 0:
+                        resp2 = next_state
+                    else:
+                        resp_both = next_state
+
+                aichaos.litmus.stop_engines()
+            self.write_resp(str(faults_set[2]), resp1, resp2, resp_both)
+        print('Experiment Complete!!!')
+
+    @staticmethod
+    def write(fault, next_state):
+        with open("experiment", "a") as outfile:
+            outfile.write(fault + ',' + str(next_state) + ',' + '\n')
+
+
+    @staticmethod
+    def write_resp(faults, resp1, resp2, resp3):
+        monotonic = True
+        if resp3 == 200:
+            if resp1 != 200 or resp2 != 200:
+                monotonic = False
+        else:
+            if resp1 == 200 and resp2 == 200:
+                monotonic = False
+
+        with open("experiment", "a") as outfile:
+            # outfile.write(faults + ',' + str(resp1) + ',' + '\n')
+            outfile.write(faults + ',' + str(resp1) + ',' + str(resp2) + ',' + str(resp3) + ',' + str(monotonic) + '\n')
--- a/utils/chaos_ai/src/kraken_utils.py
+++ b/utils/chaos_ai/src/kraken_utils.py
@@ -0,0 +1,99 @@
+import json
+import os
+import time
+import logging
+
+import src.utils as utils
+
+
+class KrakenUtils:
+    def __init__(self, namespace='robot-shop', chaos_dir='../config/',
+                 chaos_experiment='experiment.json', kubeconfig='~/.kube/config', wait_checks=60, command='podman'):
+        self.chaos_dir = chaos_dir
+        self.chaos_experiment = chaos_experiment
+        self.namespace = namespace
+        self.kubeconfig = kubeconfig
+        self.logger = logging.getLogger()
+        self.engines = []
+        self.wait_checks = wait_checks
+        self.command = command
+
+    def exp_status(self, engine='engine-cartns3'):
+        substring_list = ['Waiting for the specified duration','Waiting for wait_duration', 'Step workload started, waiting for response']
+        substr = '|'.join(substring_list)
+        # cmd = "docker logs "+engine+" 2>&1 | grep Waiting"
+        # cmd = "docker logs "+engine+" 2>&1 | grep -E '"+substr+"'"
+        cmd = self.command +" logs "+engine+" 2>&1 | grep -E '"+substr+"'"
+        line = os.popen(cmd).read()
+        self.logger.debug('[exp_status]'+line)
+        # if 'Waiting for the specified duration' in line:
+        # if 'Waiting for' in line or 'waiting for' in line:
+        # if 'Waiting for the specified duration' in line or 'Waiting for wait_duration' in line or 'Step workload started, waiting for response' in line:
+        if any(map(line.__contains__, substring_list)):
+            return 'Running'
+        return 'Not Running'
+ 
+    # print chaos result, check if litmus showed any error
+    def print_result(self, engines):
+        # self.logger.debug('')
+        for e in engines:
+            # cmd = 'kubectl describe chaosresult ' + e + ' -n ' + self.namespace + ' | grep "Fail Step:"'
+            # line = os.popen(cmd).read()
+            # self.logger.debug('[Chaos Result] '+e+' : '+line)
+            self.logger.debug('[KRAKEN][Chaos Result] '+e)
+
+    def wait_engines(self, engines=[]):
+        status = 'Completed'
+        max_checks = self.wait_checks
+        for e in engines:
+            self.logger.info('[Wait Engines] ' + e)
+            for i in range(max_checks):
+                status = self.exp_status(e)
+                if status == 'Running':
+                    break
+                time.sleep(1)
+            # return False, if even one engine is not running
+            if status != 'Running':
+                return False
+
+        self.engines = engines
+        # return True if all engines are running
+        return True
+
+
+    def cleanup(self):
+        self.logger.debug('Removing previous engines')
+        # cmd = "docker rm $(docker ps -q -f 'status=exited')"
+        if len(self.engines) > 0:
+            cmd = self.command+" stop " + " ".join(self.engines) + " >> temp"
+            os.system(cmd)
+        self.engines = []
+
+        cmd = self.command+" container prune -f >> temp"
+        os.system(cmd)
+        self.logger.debug('Engines removed')
+
+    def stop_engines(self, episode=[]):
+        self.cleanup()
+
+    def get_name(self):
+        return 'kraken'
+
+    def inject_faults(self, fault, pod_name):
+        self.logger.debug('[KRAKEN][INJECT_FAULT] ' + fault + ':' + pod_name)
+        fault, load = utils.get_load(fault)
+        engine = 'engine-' + pod_name.replace('=', '-').replace('/','-') + '-' + fault
+        if fault == 'pod-delete':
+            cmd = self.command+' run  -d -e NAMESPACE='+self.namespace+' -e POD_LABEL='+pod_name+' --name='+engine+' --net=host -v '+self.kubeconfig+':/root/.kube/config:Z quay.io/redhat-chaos/krkn-hub:pod-scenarios >> temp'
+        elif fault == 'network-chaos':
+            # 'docker run -e NODE_NAME=minikube-m03 -e DURATION=10  --name=knetwork --net=host -v /home/chaos/.kube/kube-config-raw:/root/.kube/config:Z -d quay.io/redhat-chaos/krkn-hub:network-chaos >> temp'        
+            cmd = self.command+' run -d -e NODE_NAME='+pod_name+' -e DURATION=120  --name='+engine+' --net=host -v '+self.kubeconfig+':/root/.kube/config:Z -d quay.io/redhat-chaos/krkn-hub:network-chaos >> temp'
+        elif fault == 'node-memory-hog':
+            cmd = self.command+' run -d -e NODE_NAME='+pod_name+' -e DURATION=120 -e NODES_AFFECTED_PERC=100 --name='+engine+' --net=host -v '+self.kubeconfig+':/root/.kube/config:Z -d quay.io/redhat-chaos/krkn-hub:node-memory-hog >> temp'
+        elif fault == 'node-cpu-hog':
+            cmd = self.command+'  run -e NODE_SELECTORS='+pod_name+' -e NODE_CPU_PERCENTAGE=100 -e NAMESPACE='+self.namespace+' -e TOTAL_CHAOS_DURATION=120 -e NODE_CPU_CORE=100 --name='+engine+' --net=host -env-host=true -v '+self.kubeconfig+':/root/.kube/config:Z -d quay.io/redhat-chaos/krkn-hub:node-cpu-hog'
+        else:
+            cmd = 'echo'
+        self.logger.debug('[KRAKEN][INJECT_FAULT] ' + cmd)
+        os.system(cmd)
+        return engine
--- a/utils/chaos_ai/src/qlearning.py
+++ b/utils/chaos_ai/src/qlearning.py
@@ -0,0 +1,62 @@
+import logging
+
+import numpy as np
+
+
+class QLearning:
+    def __init__(self, gamma=None, alpha=None, faults=None, states=None, rewards=None, urls=None):
+        self.gamma = gamma  # Discount factor
+        self.alpha = alpha  # Learning rate
+        self.faults = faults
+        self.states = states
+        self.rewards = rewards
+
+        # Initializing Q-Values
+        # self.Q = np.array(np.zeros([len(states), len(states)]))
+        self.Q = np.array(np.zeros([len(urls), len(states), len(faults)]))
+        self.state_matrix = np.array(np.zeros([len(states), len(states)]))
+
+        self.logger = logging.getLogger()
+
+    def update_q_fault(self, fault, episode, start_state, end_state, url_index):
+        self.logger.info('[UPDATE_Q] ' + str(url_index) + ' ' + fault + ' ' + str(start_state) + '->' + str(end_state))
+        if end_state is None:
+            end_state = start_state
+        if end_state not in self.states:
+            end_state = 'Other'
+        # reward is dependent on the error response (eg. '404') and length of episode
+        reward = self.rewards[str(end_state)] / len(episode)
+        current_state = self.states[str(start_state)]
+        next_state = self.states[str(end_state)]
+        fault_index = self.faults.index(fault)
+        # self.logger.debug('[update_q]' + fault + ' ' + str(fault_index) + ' ' + str(reward))
+        # self.logger.debug('reward, gamma: ' + str(reward) + ' ' + str(self.gamma))
+        # self.logger.debug(
+        #     'gamma*val' + str(self.gamma * self.Q[url_index, next_state, np.argmax(self.Q[url_index, next_state,])]))
+        # self.logger.debug('current state val:' + str(self.Q[url_index, current_state, fault_index]))
+
+        TD = reward + \
+             self.gamma * self.Q[url_index, next_state, np.argmax(self.Q[url_index, next_state,])] - \
+             self.Q[url_index, current_state, fault_index]
+        self.Q[url_index, current_state, fault_index] += self.alpha * TD
+
+        # update state matrix
+        TD_state = reward + \
+                   self.gamma * self.state_matrix[next_state, np.argmax(self.state_matrix[next_state,])] - \
+                   self.state_matrix[current_state, next_state]
+        self.state_matrix[current_state, next_state] += self.alpha * TD_state
+        # self.logger.debug('updated Q' + str(self.Q[url_index, current_state, fault_index]))
+
+    # def update_q(self, episode, start_state, end_state):
+    #     self.logger.info('[UPDATE_Q]')
+    #     if end_state is None:
+    #         end_state = start_state
+    #
+    #     # reward is dependent on the error response (eg. '404') and length of episode
+    #     reward = self.rewards[str(end_state)] / len(episode)
+    #     current_state = self.states[str(start_state)]
+    #     next_state = self.states[str(end_state)]
+    #     TD = reward + \
+    #          self.gamma * self.Q[next_state, np.argmax(self.Q[next_state,])] - \
+    #          self.Q[current_state, next_state]
+    #     self.Q[current_state, next_state] += self.alpha * TD
--- a/utils/chaos_ai/src/swagger_api.py
+++ b/utils/chaos_ai/src/swagger_api.py
@@ -0,0 +1,171 @@
+import json, os
+import logging
+# import numpy as np
+# import pandas as pd
+import threading
+from datetime import datetime
+from flask import Flask, request
+from flasgger import Swagger
+from flasgger.utils import swag_from
+# import zipfile
+import sys
+
+sys.path.append("..")
+from aichaos_main import AIChaos
+
+app = Flask(__name__)
+Swagger(app)
+flaskdir = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "config", "experiments",
+                        "flask") + '/'
+
+
+class AIChaosSwagger:
+    def __init__(self, flaskdir=''):
+        self.flaskdir = flaskdir
+
+    @app.route("/")
+    def empty(params=''):
+        return "AI Chaos Repository!"
+
+    def startchaos(self, kubeconfigfile, file_id, params):
+        print('[StartChaos]', file_id, kubeconfigfile)
+        dir = flaskdir
+        outfile = ''.join([dir, 'out-', file_id])
+        initfile = ''.join([dir, 'init-', file_id])
+        with open(initfile, 'w'):
+            pass
+        if os.path.exists(outfile):
+            os.remove(outfile)
+        # cons = ConstraintsInference(outdir=dir).get_constraints(csvfile, file_id, params, verbose=False,
+        #                                                         write_local=False)
+        os.environ["KUBECONFIG"] = kubeconfigfile
+        params['command'] = 'podman'
+        params['chaos_engine'] = 'kraken'
+        params['faults'] = 'pod-delete'
+        params['iterations'] = 1
+        params['maxfaults'] = 5
+        if os.path.isfile('/config/aichaos-config.json'):
+            with open('/config/aichaos-config.json') as f:
+                config_params = json.load(f)
+                params['command'] = config_params['command']
+                params['chaos_engine'] = config_params['chaos_engine']
+                params['faults']= config_params['faults']
+                params['iterations'] = config_params['iterations']
+                params['maxfaults'] = config_params['maxfaults']
+        faults = [f + ':' + p for f in params['faults'].split(',') for p in params['podlabels'].split(',')]
+        print('#faults:', len(faults), faults)
+        states = {'200': 0, '500': 1, '502': 2, '503': 3, '404': 4, 'Timeout': 5}
+        rewards = {'200': -1, '500': 0.8, '502': 0.8, '503': 0.8, '404': 1, 'Timeout': 1}
+        logfile = self.flaskdir + 'log_' + str(file_id)
+        qfile = self.flaskdir + 'qfile_' + str(file_id) + '.csv'
+        efile = self.flaskdir + 'efile_' + str(file_id)
+        epfile = self.flaskdir + 'episodes_' + str(file_id) + '.json'
+        probe_url = params['probeurl']
+        probes = {'pod-delete': 'executeprobe', 'cpu-hog': 'wolffi/cpu_load', 'disk-fill': 'wolffi/memory_load',
+                  'io_load': 'wolffi/io_load', 'http_delay': 'wolffi/http_delay', 'packet_delay': 'wolffi/packet_delay',
+                  'packet_duplication': 'wolffi/packet_duplication', 'packet_loss': 'wolffi/packet_loss',
+                  'packet_corruption': 'wolffi/packet_corruption',
+                  'packet_reordering': 'wolffi/packet_reordering', 'network_load': 'wolffi/network_load',
+                  'http_bad_request': 'wolffi/http_bad_request',
+                  'http_unauthorized': 'wolffi/http_unauthorized', 'http_forbidden': 'wolffi/http_forbidden',
+                  'http_not_found': 'wolffi/http_not_found',
+                  'http_method_not_allowed': 'wolffi/http_method_not_allowed',
+                  'http_not_acceptable': 'wolffi/http_not_acceptable',
+                  'http_request_timeout': 'wolffi/http_request_timeout',
+                  'http_unprocessable_entity': 'wolffi/http_unprocessable_entity',
+                  'http_internal_server_error': 'wolffi/http_internal_server_error',
+                  'http_not_implemented': 'wolffi/http_not_implemented',
+                  'http_bad_gateway': 'wolffi/http_bad_gateway',
+                  'http_service_unavailable': 'wolffi/http_service_unavailable',
+                  'bandwidth_restrict': 'wolffi/bandwidth_restrict',
+                  'pod_cpu_load': 'wolffi/pod_cpu_load', 'pod_memory_load': 'wolffi/pod_memory_load',
+                  'pod_io_load': 'wolffi/pod_io_load',
+                  'pod_network_load': 'wolffi/pod_network_load'
+                  }
+        dstk_probes = {k: probe_url + v for k, v in probes.items()}
+        cexp = {'pod-delete': 'pod-delete.json', 'cpu-hog': 'pod-cpu-hog.json',
+                'disk-fill': 'disk-fill.json', 'network-loss': 'network-loss.json',
+                'network-corruption': 'network-corruption.json', 'io-stress': 'io-stress.json'}
+        aichaos = AIChaos(states=states, faults=faults, rewards=rewards,
+                          logfile=logfile, qfile=qfile, efile=efile, epfile=epfile,
+                          urls=params['urls'].split(','), namespace=params['namespace'],
+                          max_faults=params['maxfaults'],
+                          num_requests=10, timeout=2,
+                          chaos_engine=params['chaos_engine'], dstk_probes=dstk_probes, command=params['command'],
+                          loglevel=logging.DEBUG, chaos_experiment=cexp, iterations=params['iterations'])
+        aichaos.start_chaos()
+
+        file = open(outfile, "w")
+        file.write('done')
+        file.close()
+        os.remove(initfile)
+        # os.remove(csvfile)
+        # ConstraintsInference().remove_temp_files(dir, file_id)
+        return 'WRITE'
+
+    @app.route('/GenerateChaos/', methods=['POST'])
+    @swag_from('../config/yml/chaosGen.yml')
+    def chaos_gen():
+        dir = flaskdir
+        sw = AIChaosSwagger(flaskdir=dir)
+        f = request.files['file']
+        list = os.listdir(dir)
+        for i in range(10000):
+            if str(i) not in list:
+                break
+        kubeconfigfile = ''.join([dir, str(i)])
+        f.save(kubeconfigfile)
+        print('HEADER:', f.headers)
+        print('[GenerateChaos] reqs:', request.form.to_dict())
+        print('[GenerateChaos]', f.filename, datetime.now())
+        # thread = threading.Thread(target=sw.write_constraints, args=(csvfile, str(i), parameters))
+        thread = threading.Thread(target=sw.startchaos, args=(kubeconfigfile, str(i), request.form.to_dict()))
+        thread.daemon = True
+        print(thread.getName())
+        thread.start()
+        return 'Chaos ID: ' + str(i)
+
+    @app.route('/GetStatus/<chaosid>', methods=['GET'])
+    @swag_from('../config/yml/status.yml')
+    def get_status(chaosid):
+        print('[GetStatus]', chaosid, flaskdir)
+        epfile = flaskdir + 'episodes_' + str(chaosid) + '.json'
+        initfile = ''.join([flaskdir, 'init-', chaosid])
+        if os.path.exists(epfile):
+            return 'Completed'
+        elif os.path.exists(initfile):
+            return 'Running'
+        else:
+            return 'Does not exist'
+
+    @app.route('/GetQTable/<chaosid>', methods=['GET'])
+    @swag_from('../config/yml/qtable.yml')
+    def get_qtable(chaosid):
+        print('[GetQTable]', chaosid)
+        qfile = flaskdir + 'qfile_' + str(chaosid) + '.csv'
+        initfile = ''.join([flaskdir, 'init-', chaosid])
+        if os.path.exists(qfile):
+            f = open(qfile, "r")
+            return f.read()
+        elif os.path.exists(initfile):
+            return 'Running'
+        else:
+            return 'Invalid Chaos ID: ' + chaosid
+
+    @app.route('/GetEpisodes/<chaosid>', methods=['GET'])
+    @swag_from('../config/yml/episodes.yml')
+    def get_episodes(chaosid):
+        print('[GetEpisodes]', chaosid)
+        epfile = flaskdir + 'episodes_' + str(chaosid) + '.json'
+        initfile = ''.join([flaskdir, 'init-', chaosid])
+        if os.path.exists(epfile):
+            f = open(epfile, "r")
+            return f.read()
+        elif os.path.exists(initfile):
+            return 'Running'
+        else:
+            return 'Invalid Chaos ID: ' + chaosid
+
+
+if __name__ == '__main__':
+    app.run(debug=True, host='0.0.0.0', port='5001')
--- a/utils/chaos_ai/src/test_application.py
+++ b/utils/chaos_ai/src/test_application.py
@@ -0,0 +1,83 @@
+import json
+import logging
+import time
+import requests
+
+
+class TestApplication:
+    def __init__(self, num_requests=10, timeout=2, sleep_time=1):
+        self.num_requests = num_requests
+        self.timeout = timeout
+        self.sleep_time = sleep_time
+        self.logger = logging.getLogger()
+
+    def test_load(self, url=''):
+        # url = 'http://192.168.49.2:31902/api/cart/health'
+        timeout_count = 0
+        avg_lat = 0
+        for i in range(self.num_requests):
+            try:
+                r = requests.get(url, verify=False, timeout=self.timeout)
+                avg_lat += r.elapsed.total_seconds()
+                self.logger.info(
+                    url + ' ' + str(i) + ':' + str(r.status_code) + " {:.2f}".format(r.elapsed.total_seconds())
+                    + " {:.2f}".format(avg_lat))
+                if r.status_code != 200:
+                    return '200', r.status_code
+            # except requests.exceptions.Timeout as toe:
+            except Exception as toe:
+                self.logger.info(url + ' ' + str(i) + ':' + 'Timeout Exception!')
+                timeout_count += 1
+                if timeout_count > 3:
+                    return '200', 'Timeout'
+            # except Exception as e:
+            #   self.logger.debug('Connection refused!'+str(e))
+            time.sleep(self.sleep_time)
+        self.logger.info(url + "Avg: {:.2f}".format(avg_lat/self.num_requests))
+        return '200', '200'
+
+    # def test_load_hey(self):
+    #     cmd = 'hey -c 2 -z 20s http://192.168.49.2:31902/api/cart/health > temp'
+    #     os.system(cmd)
+    #     with open('temp') as f:
+    #         datafile = f.readlines()
+    #     found = False
+    #     for line in datafile:
+    #         if 'Status code distribution:' in line:
+    #             found = True
+    #         if found:
+    #             print('[test_load]', line)
+    #             m = re.search(r"\[([A-Za-z0-9_]+)\]", line)
+    #             if m is not None:
+    #                 resp_code = m.group(1)
+    #                 if resp_code != 200:
+    #                     return '200', resp_code
+    #     return '200', '200'
+
+    # # End state is reached when system is down or return error code like '500','404'
+    # def get_next_state(self):
+    #     self.logger.info('[GET_NEXT_STATE]')
+    #     f = open(self.chaos_dir + self.chaos_journal)
+    #     data = json.load(f)
+    #
+    #     # before the experiment (if before steady state is false, after is null?)
+    #     for probe in data['steady_states']['before']['probes']:
+    #         if not probe['tolerance_met']:
+    #             # start_state = probe['activity']['tolerance']
+    #             # end_state = probe['status']
+    #             start_state, end_state = None, None
+    #             return start_state, end_state
+    #
+    #     # after the experiment
+    #     for probe in data['steady_states']['after']['probes']:
+    #         # if probe['output']['status'] == probe['activity']['tolerance']:
+    #         if not probe['tolerance_met']:
+    #             # print(probe)
+    #             start_state = probe['activity']['tolerance']
+    #             end_state = probe['output']['status']
+    #             # end_state = probe['status']
+    #             return start_state, end_state
+    #     # if tolerances for all probes are met
+    #     start_state = probe['activity']['tolerance']
+    #     end_state = probe['activity']['tolerance']
+    #     return start_state, end_state
--- a/utils/chaos_ai/src/utils.py
+++ b/utils/chaos_ai/src/utils.py
@@ -0,0 +1,10 @@
+import re
+
+
+def get_load(fault):
+    params = re.findall(r'\(.*?\)', fault)
+    load = 100
+    if len(params) > 0:
+        load = params[0].strip('()')
+        fault = fault.strip(params[0])
+    return fault, load
--- a/utils/chaos_recommender/README.md
+++ b/utils/chaos_recommender/README.md
@@ -20,6 +20,8 @@ This tool profiles an application and gathers telemetry data such as CPU, Memory
    $ git clone https://github.com/krkn-chaos/krkn.git 
    $ cd krkn
    $ pip3 install -r requirements.txt
+    Edit configuration file:
+    $ vi config/recommender_config.yaml 
    $ python3.9 utils/chaos_recommender/chaos_recommender.py
    ```

@@ -30,18 +32,23 @@ To run the recommender with a config file specify the config file path with the
 You can customize the default values by editing the `krkn/config/recommender_config.yaml` file. The configuration file contains the following options:

  - `application`: Specify the application name.
-  - `namespace`: Specify the namespace name. If you want to profile
+  - `namespaces`: Specify the namespaces names (separated by coma or space). If you want to profile
  - `labels`: Specify the labels (not used).
  - `kubeconfig`: Specify the location of the kubeconfig file (not used).
  - `prometheus_endpoint`: Specify the prometheus endpoint (must).
  - `auth_token`: Auth token to connect to prometheus endpoint (must).
  - `scrape_duration`: For how long data should be fetched, e.g., '1m' (must).
  - `chaos_library`: "kraken" (currently it only supports kraken).
+  - `json_output_file`: True or False (by default False).
+  - `json_output_folder_path`: Specify folder path where output should be saved. If empty the default path is used.
  - `chaos_tests`: (for output purpose only do not change if not needed)
    - `GENERAL`: list of general purpose tests available in Krkn
    - `MEM`: list of memory related tests available in Krkn
    - `NETWORK`: list of network related tests available in Krkn
    - `CPU`: list of memory related tests available in Krkn
+  - `threshold`: Specify the threshold to use for comparison and identifying outliers
+  - `cpu_threshold`: Specify the cpu threshold to compare with the cpu limits set on the pods and identify outliers
+  - `mem_threshold`: Specify the memory threshold to compare with the memory limits set on the pods and identify outliers

 *TIP:* to collect prometheus endpoint and token from your OpenShift cluster you can run the following commands:
        ```
@@ -58,8 +65,8 @@ You can also provide the input values through command-line arguments launching t
  -o, --options         Evaluate command line options
  -a APPLICATION, --application APPLICATION
                        Kubernetes application name
-  -n NAMESPACE, --namespace NAMESPACE
-                        Kubernetes application namespace
+  -n NAMESPACES, --namespaces NAMESPACE
+                        Kubernetes application namespaces separated by space
  -l LABELS, --labels LABELS
                        Kubernetes application labels
  -p PROMETHEUS_ENDPOINT, --prometheus-endpoint PROMETHEUS_ENDPOINT
@@ -74,6 +81,8 @@ You can also provide the input values through command-line arguments launching t
                        Chaos library
  -L LOG_LEVEL, --log-level LOG_LEVEL
                        log level (DEBUG, INFO, WARNING, ERROR, CRITICAL
+  -J [FOLDER_PATH], --json-output-file [FOLDER_PATH]
+                        Create output file, the path to the folder can be specified, if not specified the default folder is used.
  -M MEM [MEM ...], --MEM MEM [MEM ...]
                        Memory related chaos tests (space separated list)
  -C CPU [CPU ...], --CPU CPU [CPU ...]
@@ -82,7 +91,12 @@ You can also provide the input values through command-line arguments launching t
                        Network related chaos tests (space separated list)
  -G GENERIC [GENERIC ...], --GENERIC GENERIC [GENERIC ...]
                        Memory related chaos tests (space separated list)
-
+  --threshold THRESHOLD
+                        Threshold
+  --cpu_threshold CPU_THRESHOLD
+                        CPU threshold to compare with the cpu limits
+  --mem_threshold MEM_THRESHOLD
+                        Memory threshold to compare with the memory limits
 ```

 If you provide the input values through command-line arguments, the corresponding config file inputs would be ignored.
@@ -97,7 +111,7 @@ After obtaining telemetry data, sourced either locally or from Prometheus, the t

 ## Customizing Thresholds and Options

-You can customize the thresholds and options used for data analysis by modifying the `krkn/kraken/chaos_recommender/analysis.py` file. For example, you can adjust the threshold for identifying outliers by changing the value of the `threshold` variable in the `identify_outliers` function.
+You can customize the thresholds and options used for data analysis and identifying the outliers by setting the threshold, cpu_threshold and mem_threshold parameters in the config.

 ## Additional Files

--- a/utils/chaos_recommender/chaos_recommender.py
+++ b/utils/chaos_recommender/chaos_recommender.py
@@ -1,7 +1,10 @@
 import argparse
+import json
 import logging
 import os.path
+import re
 import sys
+import time
 import yaml
 # kraken module import for running the recommender
 # both from the root directory and the recommender
@@ -9,24 +12,28 @@ import yaml
 sys.path.insert(0, './')
 sys.path.insert(0, '../../')

+from krkn_lib.utils import get_yaml_item_value
+
 import kraken.chaos_recommender.analysis as analysis
 import kraken.chaos_recommender.prometheus as prometheus
 from kubernetes import config as kube_config


-
 def parse_arguments(parser):

    # command line options
    parser.add_argument("-c", "--config-file", action="store", help="Config file path")
    parser.add_argument("-o", "--options", action="store_true", help="Evaluate command line options")
-    parser.add_argument("-n", "--namespace", action="store", default="", help="Kubernetes application namespace")
+    parser.add_argument("-n", "--namespaces", action="store", default="", nargs="+", help="Kubernetes application namespaces separated by space")
    parser.add_argument("-p", "--prometheus-endpoint", action="store", default="", help="Prometheus endpoint URI")
    parser.add_argument("-k", "--kubeconfig", action="store", default=kube_config.KUBE_CONFIG_DEFAULT_LOCATION, help="Kubeconfig path")
    parser.add_argument("-t", "--token", action="store", default="", help="Kubernetes authentication token")
    parser.add_argument("-s", "--scrape-duration", action="store", default="10m", help="Prometheus scrape duration")
    parser.add_argument("-L", "--log-level", action="store", default="INFO", help="log level (DEBUG, INFO, WARNING, ERROR, CRITICAL")

+    parser.add_argument("-J", "--json-output-file", default=False, nargs="?", action="store",
+                        help="Create output file, the path to the folder can be specified, if not specified the default folder is used")
+
    parser.add_argument("-M", "--MEM", nargs='+', action="store", default=[],
                        help="Memory related chaos tests (space separated list)")
    parser.add_argument("-C", "--CPU", nargs='+', action="store", default=[],
@@ -35,10 +42,13 @@ def parse_arguments(parser):
                        help="Network related chaos tests (space separated list)")
    parser.add_argument("-G", "--GENERIC", nargs='+', action="store", default=[],
                        help="Memory related chaos tests (space separated list)")
-
+    parser.add_argument("--threshold", action="store", default="", help="Threshold")
+    parser.add_argument("--cpu-threshold", action="store", default="", help="CPU threshold")
+    parser.add_argument("--mem-threshold", action="store", default="", help="Memory threshold")

    return parser.parse_args()

+
 def read_configuration(config_file_path):
    if not os.path.exists(config_file_path):
        logging.error(f"Config file not found: {config_file_path}")
@@ -48,15 +58,26 @@ def read_configuration(config_file_path):
        config = yaml.safe_load(config_file)

    log_level = config.get("log level", "INFO")
-    namespace = config.get("namespace", "")
-    kubeconfig = config.get("kubeconfig", kube_config.KUBE_CONFIG_DEFAULT_LOCATION)
+    namespaces = config.get("namespaces")
+    namespaces = re.split(r",+\s+|,+|\s+", namespaces)
+    kubeconfig = get_yaml_item_value(config, "kubeconfig", kube_config.KUBE_CONFIG_DEFAULT_LOCATION)
+
+    prometheus_endpoint = config.get("prometheus_endpoint")
+    auth_token = config.get("auth_token")
+    scrape_duration = get_yaml_item_value(config, "scrape_duration", "10m")
+    threshold = get_yaml_item_value(config, "threshold", ".7")
+    heatmap_cpu_threshold = get_yaml_item_value(config, "cpu_threshold", ".5")
+    heatmap_mem_threshold = get_yaml_item_value(config, "mem_threshold", ".3")
+    output_file = config.get("json_output_file", False)
+    if output_file is True:
+        output_path = config.get("json_output_folder_path")
+    else:
+        output_path = False
+    chaos_tests = config.get("chaos_tests", {})
+    return (namespaces, kubeconfig, prometheus_endpoint, auth_token,
+            scrape_duration, chaos_tests, log_level, threshold,
+            heatmap_cpu_threshold, heatmap_mem_threshold, output_path)

-    prometheus_endpoint = config.get("prometheus_endpoint", "")
-    auth_token = config.get("auth_token", "")
-    scrape_duration = config.get("scrape_duration", "10m")
-    chaos_tests = config.get("chaos_tests" , {})
-    return (namespace, kubeconfig, prometheus_endpoint, auth_token, scrape_duration,
-            chaos_tests, log_level)

 def prompt_input(prompt, default_value):
    user_input = input(f"{prompt} [{default_value}]: ")
@@ -64,6 +85,54 @@ def prompt_input(prompt, default_value):
        return user_input
    return default_value

+
+def make_json_output(inputs, namespace_data, output_path):
+    time_str = time.strftime("%Y-%m-%d_%H-%M-%S", time.localtime())
+
+    data = {
+        "inputs": inputs,
+        "analysis_outputs": namespace_data
+    }
+
+    logging.info(f"Summary\n{json.dumps(data, indent=4)}")
+
+    if output_path is not False:
+        file = f"recommender_{time_str}.json"
+        path = f"{os.path.expanduser(output_path)}/{file}"
+
+        with open(path, "w") as json_output:
+            logging.info(f"Saving output file in {output_path} folder...")
+            json_output.write(json.dumps(data, indent=4))
+            logging.info(f"Recommendation output saved in {file}.")
+
+
+def json_inputs(namespaces, kubeconfig, prometheus_endpoint, scrape_duration,
+                chaos_tests, threshold, heatmap_cpu_threshold,
+                heatmap_mem_threshold):
+    inputs = {
+        "namespaces": namespaces,
+        "kubeconfig": kubeconfig,
+        "prometheus_endpoint": prometheus_endpoint,
+        "scrape_duration": scrape_duration,
+        "chaos_tests": chaos_tests,
+        "threshold": threshold,
+        "heatmap_cpu_threshold": heatmap_cpu_threshold,
+        "heatmap_mem_threshold": heatmap_mem_threshold
+    }
+    return inputs
+
+
+def json_namespace(namespace, queries, analysis_data):
+    data = {
+        "namespace": namespace,
+        "queries": queries,
+        "profiling": analysis_data[0],
+        "heatmap_analysis": analysis_data[1],
+        "recommendations": analysis_data[2]
+    }
+    return data
+
+
 def main():
    parser = argparse.ArgumentParser(description="Krkn Chaos Recommender Command-Line tool")
    args = parse_arguments(parser)
@@ -75,43 +144,67 @@ def main():

    if args.config_file is not None:
        (
-         namespace,
+         namespaces,
         kubeconfig,
         prometheus_endpoint,
         auth_token,
         scrape_duration,
         chaos_tests,
-         log_level
+         log_level,
+         threshold,
+         heatmap_cpu_threshold,
+         heatmap_mem_threshold,
+         output_path
         ) = read_configuration(args.config_file)

    if args.options:
-        namespace = args.namespace
+        namespaces = args.namespaces
        kubeconfig = args.kubeconfig
        auth_token = args.token
        scrape_duration = args.scrape_duration
        log_level = args.log_level
        prometheus_endpoint = args.prometheus_endpoint
+        output_path = args.json_output_file
        chaos_tests = {"MEM": args.MEM, "GENERIC": args.GENERIC, "CPU": args.CPU, "NETWORK": args.NETWORK}
+        threshold = args.threshold
+        heatmap_mem_threshold = args.mem_threshold
+        heatmap_cpu_threshold = args.cpu_threshold

-    if log_level not in ["DEBUG","INFO", "WARNING", "ERROR","CRITICAL"]:
+    if log_level not in ["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"]:
        logging.error(f"{log_level} not a valid log level")
        sys.exit(1)

    logging.basicConfig(level=log_level)

-    logging.info("============================INPUTS===================================")
-    logging.info(f"Namespace: {namespace}")
-    logging.info(f"Kubeconfig: {kubeconfig}")
-    logging.info(f"Prometheus endpoint: {prometheus_endpoint}")
-    logging.info(f"Scrape duration: {scrape_duration}")
-    for test in chaos_tests.keys():
-        logging.info(f"Chaos tests {test}: {chaos_tests[test]}")
-    logging.info("=====================================================================")
-    logging.info("Starting Analysis ...")
-    logging.info("Fetching the Telemetry data")
+    if output_path is not False:
+        if output_path is None:
+            output_path = "./recommender_output"
+            logging.info(f"Path for output file not specified. "
+                         f"Using default folder {output_path}")
+        if not os.path.exists(os.path.expanduser(output_path)):
+            logging.error(f"Folder {output_path} for output not found.")
+            sys.exit(1)
+
+    logging.info("Loading inputs...")
+    inputs = json_inputs(namespaces, kubeconfig, prometheus_endpoint,
+                         scrape_duration, chaos_tests, threshold,
+                         heatmap_cpu_threshold, heatmap_mem_threshold)
+    namespaces_data = []
+
+    logging.info("Starting Analysis...")
+
+    file_path, queries = prometheus.fetch_utilization_from_prometheus(
+        prometheus_endpoint, auth_token, namespaces, scrape_duration)
+
+    analysis_data = analysis(file_path, namespaces, chaos_tests, threshold,
+                             heatmap_cpu_threshold, heatmap_mem_threshold)
+
+    for namespace in namespaces:
+        namespace_data = json_namespace(namespace, queries[namespace],
+                                        analysis_data[namespace])
+        namespaces_data.append(namespace_data)
+    make_json_output(inputs, namespaces_data, output_path)

-    file_path = prometheus.fetch_utilization_from_prometheus(prometheus_endpoint, auth_token, namespace, scrape_duration)
-    analysis(file_path, chaos_tests)

 if __name__ == "__main__":
    main()
Author	SHA1	Message	Date
Tullio Sebastiani	2327531e46	Dockerfiles update (#614 ) Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>	2024-04-24 11:40:58 -04:00
dependabot[bot]	2c14c48a63	Bump werkzeug from 2.2.2 to 2.3.8 in /utils/chaos_ai/docker (#610 ) Bumps [werkzeug](https://github.com/pallets/werkzeug) from 2.2.2 to 2.3.8. - [Release notes](https://github.com/pallets/werkzeug/releases) - [Changelog](https://github.com/pallets/werkzeug/blob/main/CHANGES.rst) - [Commits](https://github.com/pallets/werkzeug/compare/2.2.2...2.3.8) --- updated-dependencies: - dependency-name: werkzeug dependency-type: direct:production ... Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>	2024-04-23 15:26:51 +02:00
Tullio Sebastiani	ab98e416a6	Integration of the new pod recovery monitoring strategy implemented in krkn-lib (#609 ) * pod monitoring integration in plugin scenario Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com> * pod monitoring integration in container scenario Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com> * removed wait-for-pod step from plugin scenario config files Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com> * introduced global pod recovery time Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com> nit Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com> * introduced krkn_pod_recovery_time in plugin scenario and removed all the references to wait-for-pods Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com> fix Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com> * functional test fix Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com> * main branch functional test fix Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com> * increased recovery times Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com> --------- Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>	2024-04-23 10:49:01 +02:00
Sandeep Hans	19ad2d1a3d	initial version of Chaos AI (#606 ) * init push Signed-off-by: Sandeep Hans <shans001@in.ibm.com> * remove litmus + updated readme Signed-off-by: Sandeep Hans <shans001@in.ibm.com> * remove redundant files Signed-off-by: Sandeep Hans <shans001@in.ibm.com> * removed generated file+unused reference --------- Signed-off-by: Sandeep Hans <shans001@in.ibm.com> Co-authored-by: Naga Ravi Chaitanya Elluri <nelluri@redhat.com>	2024-04-16 10:41:31 -04:00
jtydlcak	804d7cbf58	Accept list of namespaces in chaos recommender Signed-off-by: jtydlack <139967002+jtydlack@users.noreply.github.com>	2024-04-09 23:32:17 -04:00
Paige Rubendall	54af2fc6ff	adding v1.5.12 tag Signed-off-by: Paige Rubendall <prubenda@redhat.com>	2024-03-29 18:45:52 -04:00
Paige Rubendall	b79e526cfd	adding app outage not creating file (#605 ) Signed-off-by: Paige Rubendall <prubenda@redhat.com>	2024-03-29 14:35:14 -04:00
Naga Ravi Chaitanya Elluri	a5efd7d06c	Bump release version to v1.5.11 Signed-off-by: Naga Ravi Chaitanya Elluri <nelluri@redhat.com>	2024-03-22 15:24:04 -04:00
yogananth	a1b81bd382	Fix: Reslove ingress network chaos plugin issue Added network_chaos to plugin step and job wait time to be based on the test duration and set the default wait_time to 30s Signed-off-by: yogananth subramanian <ysubrama@redhat.com>	2024-03-22 14:48:17 -04:00
Naga Ravi Chaitanya Elluri	782440c8c4	Copy oc and kubectl clients to additional paths This will make sure oc and kubectl clients are accessible for users with both /usr/bin and /usr/local/bin paths set on the host. Signed-off-by: Naga Ravi Chaitanya Elluri <nelluri@redhat.com>	2024-03-21 11:29:50 -04:00
Naga Ravi Chaitanya Elluri	7e2755cbb7	Remove container status badge Quay is no longer exposing it correctly: https://quay.io/repository/krkn-chaos/krkn/status Signed-off-by: Naga Ravi Chaitanya Elluri <nelluri@redhat.com>	2024-03-19 15:33:25 -04:00
Naga Ravi Chaitanya Elluri	2babb53d6e	Bump cryptography version This is need to fix the security vulnerability: https://nvd.nist.gov/vuln/detail/CVE-2024-26130. Note: Reported by FOSSA. Signed-off-by: Naga Ravi Chaitanya Elluri <nelluri@redhat.com>	2024-03-19 14:44:47 -04:00
Tullio Sebastiani	85f76e9193	do not consider exit code 2 as an error in funtests Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>	2024-03-17 23:07:46 -04:00
Liangquan Li	8bf21392f1	fix doc's nit Signed-off-by: Liangquan Li <liangli@redhat.com>	2024-03-13 15:21:57 -04:00
Tullio Sebastiani	606fb60811	changed exit codes on post chaos alerts and post_scenario failure (#592 ) Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>	2024-03-07 16:31:55 +01:00
Tullio Sebastiani	fac7c3c6fb	lowered arcaflow log level to error (#591 ) Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>	2024-03-07 15:32:53 +01:00
Paige Rubendall	8dd9b30030	updating tag (#589 ) Signed-off-by: Paige Rubendall <prubenda@redhat.com>	2024-03-06 13:11:44 -05:00
Naga Ravi Chaitanya Elluri	2d99f17aaf	fix: requirements.txt to reduce vulnerabilities (#587 ) The following vulnerabilities are fixed by pinning transitive dependencies: - https://snyk.io/vuln/SNYK-PYTHON-CRYPTOGRAPHY-3172287 - https://snyk.io/vuln/SNYK-PYTHON-CRYPTOGRAPHY-3314966 - https://snyk.io/vuln/SNYK-PYTHON-CRYPTOGRAPHY-3315324 - https://snyk.io/vuln/SNYK-PYTHON-CRYPTOGRAPHY-3315328 - https://snyk.io/vuln/SNYK-PYTHON-CRYPTOGRAPHY-3315331 - https://snyk.io/vuln/SNYK-PYTHON-CRYPTOGRAPHY-3315452 - https://snyk.io/vuln/SNYK-PYTHON-CRYPTOGRAPHY-3315972 - https://snyk.io/vuln/SNYK-PYTHON-CRYPTOGRAPHY-3315975 - https://snyk.io/vuln/SNYK-PYTHON-CRYPTOGRAPHY-3316038 - https://snyk.io/vuln/SNYK-PYTHON-CRYPTOGRAPHY-3316211 - https://snyk.io/vuln/SNYK-PYTHON-CRYPTOGRAPHY-5663682 - https://snyk.io/vuln/SNYK-PYTHON-CRYPTOGRAPHY-5777683 - https://snyk.io/vuln/SNYK-PYTHON-CRYPTOGRAPHY-5813745 - https://snyk.io/vuln/SNYK-PYTHON-CRYPTOGRAPHY-5813746 - https://snyk.io/vuln/SNYK-PYTHON-CRYPTOGRAPHY-5813750 - https://snyk.io/vuln/SNYK-PYTHON-CRYPTOGRAPHY-5914629 - https://snyk.io/vuln/SNYK-PYTHON-CRYPTOGRAPHY-6036192 - https://snyk.io/vuln/SNYK-PYTHON-CRYPTOGRAPHY-6050294 - https://snyk.io/vuln/SNYK-PYTHON-CRYPTOGRAPHY-6092044 - https://snyk.io/vuln/SNYK-PYTHON-CRYPTOGRAPHY-6126975 - https://snyk.io/vuln/SNYK-PYTHON-CRYPTOGRAPHY-6210214 - https://snyk.io/vuln/SNYK-PYTHON-SETUPTOOLS-3180412 - https://snyk.io/vuln/SNYK-PYTHON-WHEEL-3180413 Co-authored-by: snyk-bot <snyk-bot@snyk.io>	2024-03-06 12:54:30 -05:00
Tullio Sebastiani	50742a793c	updated krkn-lib to 2.1.0 (#588 ) Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>	2024-03-06 11:30:01 -05:00
Naga Ravi Chaitanya Elluri	ba6a844544	Add /usr/local/bin to the path for krkn images This is needed to ensure oc and kubectl binaries under /usr/local/bin are accessible. Signed-off-by: Naga Ravi Chaitanya Elluri <nelluri@redhat.com>	2024-03-04 16:03:40 -05:00
Tullio Sebastiani	7e7a917dba	dockerfiles update (#585 ) Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>	2024-03-04 15:59:53 +01:00
Tullio Sebastiani	b9c0bb39c7	checking post run alerts properties presence (#584 ) added metric check Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>	2024-03-01 18:30:54 +01:00
Tullio Sebastiani	706a886151	checking alert properties presence (#583 ) typo fix Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>	2024-03-01 17:58:21 +01:00
Tullio Sebastiani	a1cf9e2c00	fixed typo on funtests (#582 ) Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>	2024-03-01 17:09:19 +01:00
Tullio Sebastiani	0f5dfcb823	fixed the telemetry funtest according to the new telemetry API Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>	2024-03-01 09:48:56 -05:00
Tullio Sebastiani	1e1015e6e7	added new WS configuration to funtests Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>	2024-02-29 11:35:00 -05:00
Tullio Sebastiani	c71ce31779	integrated new telemetry library for WS 2.0 Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com> updated krkn-lib version Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>	2024-02-28 22:58:54 -05:00
Tullio Sebastiani	1298f220a6	Critical alerts collection and upload (#577 ) * added prometheus client method for critical alerts Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com> * adapted run_kraken to the new plugin method for critical_alerts collection + telemetry upload Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com> * requirements.txt pointing temporarly to git Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com> * fixed severity level Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com> * added functional tests Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com> * exit on post chaos critical alerts Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com> log moved Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com> * removed noisy log Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com> fixed log Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com> * updated requirements.txt to krkn-lib 1.4.13 Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com> * krkn lib * added check on variable that makes kraken return 1 whether post critical alerts are > 0 Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com> --------- Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>	2024-02-28 09:48:29 -05:00
jtydlcak	24059fb731	Add json output file option for recommender (#511 ) Output in terminal changed to use json structure. The json output file names are in format recommender_namespace_YYYY-MM-DD_HH-MM-SS. The path to the json file can be specified. Default path is in kraken/utils/chaos_recommender/recommender_output. Signed-off-by: jtydlcak <139967002+jtydlack@users.noreply.github.com>	2024-02-27 11:09:00 -05:00
Naga Ravi Chaitanya Elluri	ab951adb78	Expose thresholds config options (#574 ) This commit allows users to edit the thresholds in the chaos-recommender config to be able to identify outliers based on their use case. Fixes https://github.com/krkn-chaos/krkn/issues/509 Signed-off-by: Naga Ravi Chaitanya Elluri <nelluri@redhat.com>	2024-02-26 09:43:34 -05:00
Paige Rubendall	a9a7fb7e51	updating release version in dockerfiles (#578 ) Signed-off-by: Paige Rubendall <prubenda@redhat.com>	2024-02-21 10:17:02 -05:00