Dockerfile update

Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>
Bump arcaflow version to 0.17.2 (#648 )
2026-02-19 20:40:33 +00:00 · 2024-06-12 14:36:38 -04:00 · 2024-06-12 20:29:32 +02:00 · 2024-06-12 09:17:14 -04:00 · 2024-06-11 12:07:28 -04:00 · 2024-06-10 14:26:03 -04:00
92 changed files with 2653 additions and 618 deletions
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -61,6 +61,8 @@ jobs:
          kubectl create namespace namespace-scenario
          kubectl apply -f CI/templates/time_pod.yaml
          kubectl wait --for=condition=ready pod -l scenario=time-skew --timeout=300s
+          kubectl apply -f CI/templates/service_hijacking.yaml
+          kubectl wait --for=condition=ready pod -l "app.kubernetes.io/name=proxy" --timeout=300s
      - name: Get Kind nodes
        run: |
          kubectl get nodes --show-labels=true
@@ -70,12 +72,14 @@ jobs:
        run: python -m coverage run -a -m unittest discover -s tests -v

      - name: Setup Pull Request Functional Tests
-        if: github.event_name == 'pull_request'
+        if: |
+          github.event_name == 'pull_request'
        run: |
            yq -i '.kraken.port="8081"' CI/config/common_test_config.yaml
            yq -i '.kraken.signal_address="0.0.0.0"' CI/config/common_test_config.yaml
            yq -i '.kraken.performance_monitoring="localhost:9090"' CI/config/common_test_config.yaml
-            echo "test_app_outages" > ./CI/tests/functional_tests
+            echo "test_service_hijacking" > ./CI/tests/functional_tests
+            echo "test_app_outages" >> ./CI/tests/functional_tests
            echo "test_container"      >> ./CI/tests/functional_tests
            echo "test_namespace"      >> ./CI/tests/functional_tests
            echo "test_net_chaos"      >> ./CI/tests/functional_tests
@@ -84,7 +88,9 @@ jobs:
            echo "test_arca_memory_hog" >> ./CI/tests/functional_tests
            echo "test_arca_io_hog" >> ./CI/tests/functional_tests

-      # Push on main only steps
+
+      # Push on main only steps + all other functional tests to collect coverage
+      # for the badge
      - name: Configure AWS Credentials
        if: github.ref == 'refs/heads/main' && github.event_name == 'push'
        uses: aws-actions/configure-aws-credentials@v4
@@ -101,6 +107,15 @@ jobs:
          yq -i '.telemetry.username="${{secrets.TELEMETRY_USERNAME}}"' CI/config/common_test_config.yaml
          yq -i '.telemetry.password="${{secrets.TELEMETRY_PASSWORD}}"' CI/config/common_test_config.yaml
          echo "test_telemetry" > ./CI/tests/functional_tests
+          echo "test_service_hijacking" >> ./CI/tests/functional_tests
+          echo "test_app_outages" >> ./CI/tests/functional_tests
+          echo "test_container"      >> ./CI/tests/functional_tests
+          echo "test_namespace"      >> ./CI/tests/functional_tests
+          echo "test_net_chaos"      >> ./CI/tests/functional_tests
+          echo "test_time"           >> ./CI/tests/functional_tests
+          echo "test_arca_cpu_hog" >> ./CI/tests/functional_tests
+          echo "test_arca_memory_hog" >> ./CI/tests/functional_tests
+          echo "test_arca_io_hog" >> ./CI/tests/functional_tests

      # Final common steps
      - name: Run Functional tests
@@ -119,6 +134,7 @@ jobs:
      - name: Collect coverage report
        run: |
          python -m coverage html
+          python -m coverage json
      - name: Publish coverage report to job summary
        run: |
          pip install html2text
@@ -129,6 +145,54 @@ jobs:
          name: coverage
          path: htmlcov
          if-no-files-found: error
+      - name: Upload json coverage
+        uses: actions/upload-artifact@v3
+        with:
+          name: coverage.json
+          path: coverage.json
+          if-no-files-found: error
      - name: Check CI results
        run: grep Fail CI/results.markdown && false || true
+  badge:
+    permissions:
+      contents: write
+    name: Generate Coverage Badge
+    runs-on: ubuntu-latest
+    needs:
+      - tests
+    if: github.ref == 'refs/heads/main' && github.event_name == 'push'
+    steps:
+        - name: Check out doc repo
+          uses: actions/checkout@v4  # pinned to a released major tag; a branch ref such as @master can change without notice
+          with:
+            repository: krkn-chaos/krkn-lib-docs
+            path: krkn-lib-docs
+            ssh-key: ${{ secrets.KRKN_LIB_DOCS_PRIV_KEY }}
+        - name: Download json coverage
+          uses: actions/download-artifact@v3
+          with:
+            name: coverage.json
+        - name: Set up Python
+          uses: actions/setup-python@v4
+          with:
+            python-version: 3.9
+        - name: Copy badge on GitHub Page Repo
+          env:
+            COLOR: yellow
+          run: |
+            # Build the coverage badge from the total stored in coverage.json and copy it into the docs repo.
+            # The threshold is compared numerically (-gt): ">" inside [[ ]] sorts strings, so "100" < "40".
+            TOTAL=$(python -c "import json;print(json.load(open('coverage.json'))['totals']['percent_covered_display'])")
+            [[ ${TOTAL%.*} -gt 40 ]] && COLOR=green
+            echo "TOTAL: $TOTAL"
+            echo "COLOR: $COLOR"
+            curl -fsS "https://img.shields.io/badge/coverage-$TOTAL%25-$COLOR" > ./krkn-lib-docs/coverage_badge_krkn.svg
+        - name: Push updated Coverage Badge
+          run: |
+            cd krkn-lib-docs
+            git add .
+            git config user.name "krkn-chaos"
+            git config user.email "<>"
+            git commit -m "[KRKN] Coverage Badge ${GITHUB_REF##*/}" || echo "no changes to commit"
+            git push
      
--- a/.gitignore
+++ b/.gitignore
@@ -16,6 +16,7 @@ __pycache__/*
 *.out
 kube-burner*
 kube_burner*
+recommender_*.json

 # Project files
 .ropeproject
--- a/CI/config/common_test_config.yaml
+++ b/CI/config/common_test_config.yaml
@@ -29,7 +29,7 @@ tunings:
    daemon_mode: False                                     # Iterations are set to infinity which means that the kraken will cause chaos forever.
 telemetry:
    enabled: False                                           # enable/disables the telemetry collection feature
-    api_url: https://ulnmf9xv7j.execute-api.us-west-2.amazonaws.com/production #telemetry service endpoint
+    api_url: https://yvnn4rfoi7.execute-api.us-west-2.amazonaws.com/test #telemetry service endpoint
    username: $TELEMETRY_USERNAME                                      # telemetry service username
    password: $TELEMETRY_PASSWORD                                      # telemetry service password
    prometheus_namespace: 'prometheus-k8s'                                # prometheus namespace
@@ -49,3 +49,4 @@ telemetry:
        - "(\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}\\.\\d+Z).+"      # 2023-09-15T11:20:36.123425532Z log
    oc_cli_path: /usr/bin/oc                                # optional, if not specified will be search in $PATH
    events_backup: True                                     # enables/disables cluster events collection
+    telemetry_group: "funtests"
--- a/CI/templates/service_hijacking.yaml
+++ b/CI/templates/service_hijacking.yaml
@@ -0,0 +1,29 @@
+apiVersion: v1
+kind: Pod
+metadata:
+  name: nginx
+  labels:
+    app.kubernetes.io/name: proxy
+spec:
+  containers:
+  - name: nginx
+    image: nginx:stable
+    ports:
+      - containerPort: 80
+        name: http-web-svc
+
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: nginx-service
+spec:
+  selector:
+    app.kubernetes.io/name: proxy
+  type: NodePort
+  ports:
+  - name: name-of-service-port
+    protocol: TCP
+    port: 80
+    targetPort: http-web-svc
+    nodePort: 30036
--- a/CI/tests/common.sh
+++ b/CI/tests/common.sh
@@ -1,15 +1,23 @@
 ERRORED=false

 function finish {
-    if [ $? -eq 1 ] && [ $ERRORED != "true" ]
+    if [ $? != 0 ] && [ $ERRORED != "true" ]
    then
        error
    fi
 }

 function error {
-    echo "Error caught."
-    ERRORED=true
+    exit_code=$?
+    if [ $exit_code == 1 ]
+    then
+      echo "Error caught."
+      ERRORED=true
+    elif [ $exit_code == 2 ]
+    then
+      echo "Run with exit code 2 detected, it is expected, wrapping the exit code with 0 to avoid pipeline failure"
+      exit 0
+    fi
 }

 function get_node {
--- a/CI/tests/test_container.sh
+++ b/CI/tests/test_container.sh
@@ -8,11 +8,11 @@ trap finish EXIT
 pod_file="CI/scenarios/hello_pod.yaml"

 function functional_test_container_crash {
-  yq -i '.scenarios[0].namespace="default"' scenarios/openshift/app_outage.yaml
-  yq -i '.scenarios[0].label_selector="scenario=container"' scenarios/openshift/app_outage.yaml
-  yq -i '.scenarios[0].container_name="fedtools"' scenarios/openshift/app_outage.yaml
+  yq -i '.scenarios[0].namespace="default"' scenarios/openshift/container_etcd.yml
+  yq -i '.scenarios[0].label_selector="scenario=container"' scenarios/openshift/container_etcd.yml
+  yq -i '.scenarios[0].container_name="fedtools"' scenarios/openshift/container_etcd.yml
  export scenario_type="container_scenarios"
-  export scenario_file="- scenarios/openshift/app_outage.yaml"
+  export scenario_file="- scenarios/openshift/container_etcd.yml"
  export post_config=""
  envsubst < CI/config/common_test_config.yaml > CI/config/container_config.yaml

--- a/CI/tests/test_service_hijacking.sh
+++ b/CI/tests/test_service_hijacking.sh
@@ -0,0 +1,107 @@
+set -xeEo pipefail
+
+source CI/tests/common.sh
+
+trap error ERR
+trap finish EXIT
+# port mapping has been configured in kind-config.yml
+SERVICE_URL=http://localhost:8888
+PAYLOAD_GET_1="{ \
+  \"status\":\"internal server error\" \
+}"
+STATUS_CODE_GET_1=500
+
+PAYLOAD_PATCH_1="resource patched"
+STATUS_CODE_PATCH_1=201
+
+PAYLOAD_POST_1="{ \
+  \"status\": \"unauthorized\" \
+}"
+STATUS_CODE_POST_1=401
+
+PAYLOAD_GET_2="{ \
+  \"status\":\"resource created\" \
+}"
+STATUS_CODE_GET_2=201
+
+PAYLOAD_PATCH_2="bad request"
+STATUS_CODE_PATCH_2=400
+
+PAYLOAD_POST_2="not found"
+STATUS_CODE_POST_2=404
+
+JSON_MIME="application/json"
+TEXT_MIME="text/plain; charset=utf-8"
+
+function functional_test_service_hijacking {
+
+  export scenario_type="service_hijacking"
+  export scenario_file="scenarios/kube/service_hijacking.yaml"
+  export post_config=""
+  envsubst < CI/config/common_test_config.yaml > CI/config/service_hijacking.yaml
+  python3 -m coverage run -a run_kraken.py -c CI/config/service_hijacking.yaml  > /dev/null 2>&1 &
+  PID=$!
+  #Waiting the hijacking to have effect (bounded to 300 attempts so a scenario that never starts cannot hang the CI job; the Step 1 checks below then fail the test)
+  for _ in $(seq 1 300); do [ "$(curl -X GET -s -o /dev/null -I -w "%{http_code}" $SERVICE_URL/list/index.php)" != 404 ] && break; echo "waiting scenario to kick in."; sleep 1; done
+
+  #Checking Step 1 GET on /list/index.php
+  OUT_GET="`curl -X GET -s $SERVICE_URL/list/index.php`"
+  OUT_CONTENT=`curl -X GET -s -o /dev/null -I -w "%{content_type}" $SERVICE_URL/list/index.php`
+  OUT_STATUS_CODE=`curl -X GET -s -o /dev/null -I -w "%{http_code}" $SERVICE_URL/list/index.php`
+  [ "${PAYLOAD_GET_1//[$'\t\r\n ']}" == "${OUT_GET//[$'\t\r\n ']}" ] && echo "Step 1 GET Payload OK" || (echo "Payload did not match. Test failed." && exit 1)
+  [ "$OUT_STATUS_CODE" == "$STATUS_CODE_GET_1" ] && echo "Step 1 GET Status Code OK" || (echo " Step 1 GET status code did not match. Test failed." && exit 1)
+  [ "$OUT_CONTENT" == "$JSON_MIME" ] && echo "Step 1 GET MIME OK" || (echo " Step 1 GET MIME did not match. Test failed." && exit 1)
+
+  #Checking Step 1 POST on /list/index.php
+  OUT_POST="`curl -s -X POST $SERVICE_URL/list/index.php`"
+  OUT_STATUS_CODE=`curl -X POST -s -o /dev/null -I -w "%{http_code}" $SERVICE_URL/list/index.php`
+  OUT_CONTENT=`curl -X POST -s -o /dev/null -I -w "%{content_type}" $SERVICE_URL/list/index.php`
+  [ "${PAYLOAD_POST_1//[$'\t\r\n ']}" == "${OUT_POST//[$'\t\r\n ']}" ] && echo "Step 1 POST Payload OK" || (echo "Payload did not match. Test failed." && exit 1)
+  [ "$OUT_STATUS_CODE" == "$STATUS_CODE_POST_1" ] && echo "Step 1 POST Status Code OK" || (echo "Step 1 POST status code did not match. Test failed." && exit 1)
+  [ "$OUT_CONTENT" == "$JSON_MIME" ] && echo "Step 1 POST MIME OK" || (echo " Step 1 POST MIME did not match. Test failed." && exit 1)
+
+  #Checking Step 1 PATCH on /patch
+  OUT_PATCH="`curl -s -X PATCH $SERVICE_URL/patch`"
+  OUT_STATUS_CODE=`curl -X PATCH -s -o /dev/null -I -w "%{http_code}" $SERVICE_URL/patch`
+  OUT_CONTENT=`curl -X PATCH -s -o /dev/null -I -w "%{content_type}" $SERVICE_URL/patch`
+  [ "${PAYLOAD_PATCH_1//[$'\t\r\n ']}" == "${OUT_PATCH//[$'\t\r\n ']}" ] && echo "Step 1 PATCH Payload OK" || (echo "Payload did not match. Test failed." && exit 1)
+  [ "$OUT_STATUS_CODE" == "$STATUS_CODE_PATCH_1" ] && echo "Step 1 PATCH Status Code OK" || (echo "Step 1 PATCH status code did not match. Test failed." && exit 1)
+  [ "$OUT_CONTENT" == "$TEXT_MIME" ] && echo "Step 1 PATCH MIME OK" || (echo " Step 1 PATCH MIME did not match. Test failed." && exit 1)
+  # wait for the next step
+  sleep 16
+
+  #Checking Step 2 GET on /list/index.php
+  OUT_GET="`curl -X GET -s $SERVICE_URL/list/index.php`"
+  OUT_CONTENT=`curl -X GET -s -o /dev/null -I -w "%{content_type}" $SERVICE_URL/list/index.php`
+  OUT_STATUS_CODE=`curl -X GET -s -o /dev/null -I -w "%{http_code}" $SERVICE_URL/list/index.php`
+  [ "${PAYLOAD_GET_2//[$'\t\r\n ']}" == "${OUT_GET//[$'\t\r\n ']}" ] && echo "Step 2 GET Payload OK" || (echo "Step 2 GET Payload did not match. Test failed." && exit 1)
+  [ "$OUT_STATUS_CODE" == "$STATUS_CODE_GET_2" ] && echo "Step 2 GET Status Code OK" || (echo "Step 2 GET status code did not match. Test failed." && exit 1)
+  [ "$OUT_CONTENT" == "$JSON_MIME" ] && echo "Step 2 GET MIME OK" || (echo " Step 2 GET MIME did not match. Test failed." && exit 1)
+
+  #Checking Step 2 POST on /list/index.php
+  OUT_POST="`curl -s -X POST $SERVICE_URL/list/index.php`"
+  OUT_CONTENT=`curl -X POST -s -o /dev/null -I -w "%{content_type}" $SERVICE_URL/list/index.php`
+  OUT_STATUS_CODE=`curl -X POST -s -o /dev/null -I -w "%{http_code}" $SERVICE_URL/list/index.php`
+  [ "${PAYLOAD_POST_2//[$'\t\r\n ']}" == "${OUT_POST//[$'\t\r\n ']}" ] && echo "Step 2 POST Payload OK" || (echo "Step 2 POST Payload did not match. Test failed." && exit 1)
+  [ "$OUT_STATUS_CODE" == "$STATUS_CODE_POST_2" ] && echo "Step 2 POST Status Code OK" || (echo "Step 2 POST status code did not match. Test failed." && exit 1)
+  [ "$OUT_CONTENT" == "$TEXT_MIME" ] && echo "Step 2 POST MIME OK" || (echo " Step 2 POST MIME did not match. Test failed." && exit 1)
+
+  #Checking Step 2 PATCH on /patch
+  OUT_PATCH="`curl -s -X PATCH $SERVICE_URL/patch`"
+  OUT_CONTENT=`curl -X PATCH -s -o /dev/null -I -w "%{content_type}" $SERVICE_URL/patch`
+  OUT_STATUS_CODE=`curl -X PATCH -s -o /dev/null -I -w "%{http_code}" $SERVICE_URL/patch`
+  [ "${PAYLOAD_PATCH_2//[$'\t\r\n ']}" == "${OUT_PATCH//[$'\t\r\n ']}" ] && echo "Step 2 PATCH Payload OK" || (echo "Step 2 PATCH Payload did not match. Test failed." && exit 1)
+  [ "$OUT_STATUS_CODE" == "$STATUS_CODE_PATCH_2" ] && echo "Step 2 PATCH Status Code OK" || (echo "Step 2 PATCH status code did not match. Test failed." && exit 1)
+  [ "$OUT_CONTENT" == "$TEXT_MIME" ] && echo "Step 2 PATCH MIME OK" || (echo " Step 2 PATCH MIME did not match. Test failed." && exit 1)
+  wait $PID
+
+  # now checking  if service has been restore correctly and nginx responds correctly
+  curl -s  $SERVICE_URL | grep nginx! && echo "BODY: Service restored!" || (echo "BODY: failed to restore service" && exit 1)
+  OUT_STATUS_CODE=`curl -X GET -s -o /dev/null -I -w "%{http_code}" $SERVICE_URL`
+  [ "$OUT_STATUS_CODE" == "200" ] && echo "STATUS_CODE: Service restored!" || (echo "STATUS_CODE: failed to restore service" && exit 1)
+
+  echo "Service Hijacking Chaos test: Success"
+}
+
+
+functional_test_service_hijacking
--- a/CI/tests/test_telemetry.sh
+++ b/CI/tests/test_telemetry.sh
@@ -14,18 +14,22 @@ function functional_test_telemetry {
  export RUN_TAG="funtest-telemetry"
  yq -i '.telemetry.enabled=True' CI/config/common_test_config.yaml
  yq -i '.telemetry.full_prometheus_backup=True' CI/config/common_test_config.yaml
+  yq -i '.performance_monitoring.check_critical_alerts=True' CI/config/common_test_config.yaml
+  yq -i '.performance_monitoring.prometheus_url="http://localhost:9090"' CI/config/common_test_config.yaml
  yq -i '.telemetry.run_tag=env(RUN_TAG)' CI/config/common_test_config.yaml
+
  export scenario_type="arcaflow_scenarios"
  export scenario_file="scenarios/arcaflow/cpu-hog/input.yaml"
  export post_config=""
  envsubst < CI/config/common_test_config.yaml > CI/config/telemetry.yaml
-  python3 -m coverage run -a run_kraken.py -c CI/config/telemetry.yaml
-  RUN_FOLDER=`cat CI/out/test_telemetry.out | grep amazonaws.com | sed -rn "s#.*https:\/\/.*\/download/(.*)#\1#p"`
+  retval=$(python3 -m coverage run -a run_kraken.py -c CI/config/telemetry.yaml)
+  RUN_FOLDER=`cat CI/out/test_telemetry.out | grep amazonaws.com | sed -rn "s#.*https:\/\/.*\/files/(.*)#\1#p"`
  $AWS_CLI s3 ls "s3://$AWS_BUCKET/$RUN_FOLDER/" | awk '{ print $4 }' > s3_remote_files
  echo "checking if telemetry files are uploaded on s3"
  cat s3_remote_files | grep events-00.json || ( echo "FAILED: events-00.json not uploaded" && exit 1 )
-  cat s3_remote_files | grep prometheus-00.tar || ( echo "FAILED: prometheus backup not uploaded" && exit 1 )
-  cat s3_remote_files | grep telemetry.json || ( echo "FAILED: telemetry.json not uploaded" && exit 1 )
+  cat s3_remote_files | grep critical-alerts-00.log || ( echo "FAILED: critical-alerts-00.log not uploaded"  && exit 1 )
+  cat s3_remote_files | grep prometheus-00.tar || ( echo "FAILED: prometheus backup not uploaded"  && exit 1 )
+  cat s3_remote_files | grep telemetry.json || ( echo "FAILED: telemetry.json not uploaded"  && exit 1 )
  echo "all files uploaded!"
  echo "Telemetry Collection: Success"
 }
--- a/README.md
+++ b/README.md
@@ -1,6 +1,7 @@
 # Krkn aka Kraken
-[![Docker Repository on Quay](https://quay.io/repository/krkn-chaos/krkn/status "Docker Repository on Quay")](https://quay.io/repository/krkn-chaos/krkn?tab=tags&tag=latest)
 ![Workflow-Status](https://github.com/krkn-chaos/krkn/actions/workflows/docker-image.yml/badge.svg)
+![coverage](https://krkn-chaos.github.io/krkn-lib-docs/coverage_badge_krkn.svg)
+![action](https://github.com/krkn-chaos/krkn/actions/workflows/tests.yml/badge.svg)

 ![Krkn logo](media/logo.png)

@@ -74,6 +75,7 @@ Scenario type               | Kubernetes
 [PVC scenario](docs/pvc_scenario.md) | :heavy_check_mark: |
 [Network_Chaos](docs/network_chaos.md) | :heavy_check_mark: |
 [ManagedCluster Scenarios](docs/managedcluster_scenarios.md) | :heavy_check_mark: |
+[Service Hijacking Scenarios](docs/service_hijacking_scenarios.md) | :heavy_check_mark: |


 ### Kraken scenario pass/fail criteria and report
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -1,5 +1,5 @@
 kraken:
-    distribution: openshift                                # Distribution can be kubernetes or openshift
+    distribution: kubernetes                                # Distribution can be kubernetes or openshift
    kubeconfig_path: ~/.kube/config                        # Path to kubeconfig
    exit_on_failure: False                                 # Exit when a post action scenario fails
    publish_kraken_status: True                            # Can be accessed at http://0.0.0.0:8081
@@ -15,7 +15,7 @@ kraken:
        - application_outages:
            - scenarios/openshift/app_outage.yaml
        - container_scenarios:                             # List of chaos pod scenarios to load
-            - -    scenarios/openshift/container_etcd.yml
+            - - scenarios/openshift/container_etcd.yml
        - plugin_scenarios:
            - scenarios/openshift/etcd.yml
            - scenarios/openshift/regex_openshift_pod_kill.yml
@@ -23,7 +23,7 @@ kraken:
            - scenarios/openshift/network_chaos_ingress.yml
            - scenarios/openshift/prom_kill.yml
        - node_scenarios:                                  # List of chaos node scenarios to load
-            -   scenarios/openshift/node_scenarios_example.yml
+            - scenarios/openshift/node_scenarios_example.yml
        - plugin_scenarios:
            - scenarios/openshift/openshift-apiserver.yml
            - scenarios/openshift/openshift-kube-apiserver.yml
@@ -42,6 +42,8 @@ kraken:
            - scenarios/openshift/pvc_scenario.yaml
        - network_chaos:
            - scenarios/openshift/network_chaos.yaml
+        - service_hijacking:
+              - scenarios/kube/service_hijacking.yaml

 cerberus:
    cerberus_enabled: False                                # Enable it when cerberus is previously installed
@@ -51,7 +53,7 @@ cerberus:
 performance_monitoring:
    deploy_dashboards: False                              # Install a mutable grafana and load the performance dashboards. Enable this only when running on OpenShift
    repo: "https://github.com/cloud-bulldozer/performance-dashboards.git"
-    prometheus_url:                                       # The prometheus url/route is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes.
+    prometheus_url:                                      # The prometheus url/route is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes.
    prometheus_bearer_token:                              # The bearer token is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes. This is needed to authenticate with prometheus.
    uuid:                                                 # uuid for the run is generated by default if not set
    enable_alerts: False                                  # Runs the queries specified in the alert profile and displays the info or exits 1 when severity=error
@@ -65,14 +67,19 @@ telemetry:
    enabled: False                                           # enable/disables the telemetry collection feature
    api_url: https://ulnmf9xv7j.execute-api.us-west-2.amazonaws.com/production #telemetry service endpoint
    username: username                                      # telemetry service username
-    password: password                                      # telemetry service password
+    password: password                                    # telemetry service password
    prometheus_backup: True                                 # enables/disables prometheus data collection
+    prometheus_namespace: ""                                # namespace where prometheus is deployed (if distribution is kubernetes)
+    prometheus_container_name: ""                           # name of the prometheus container (if distribution is kubernetes)
+    prometheus_pod_name: ""                                 # name of the prometheus pod (if distribution is kubernetes)
    full_prometheus_backup: False                           # if is set to False only the /prometheus/wal folder will be downloaded.
    backup_threads: 5                                       # number of telemetry download/upload threads
    archive_path: /tmp                                      # local path where the archive files will be temporarly stored
    max_retries: 0                                          # maximum number of upload retries (if 0 will retry forever)
    run_tag: ''                                             # if set, this will be appended to the run folder in the bucket (useful to group the runs)
-    archive_size: 500000                                     # the size of the prometheus data archive size in KB. The lower the size of archive is
+    telemetry_group: ''                                     # if set will archive the telemetry in the S3 bucket on a folder named after the value, otherwise will use "default"
+    archive_size: 500000
+    # the size of the prometheus data archive size in KB. The lower the size of archive is
                                                            # the higher the number of archive files will be produced and uploaded (and processed by backup_threads
                                                            # simultaneously).
                                                            # For unstable/slow connection is better to keep this value low
@@ -85,6 +92,9 @@ telemetry:
     - "(\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}\\.\\d+Z).+"      # 2023-09-15T11:20:36.123425532Z log
    oc_cli_path: /usr/bin/oc                                # optional, if not specified will be search in $PATH
    events_backup: True                                     # enables/disables cluster events collection
+elastic: 
+    elastic_url: ""                                         # To track results in elasticsearch, give url to server here; will post telemetry details when url and index not blank
+    elastic_index: ""                                       # Elastic search index pattern to post results to



--- a/config/config_performance.yaml
+++ b/config/config_performance.yaml
@@ -77,3 +77,8 @@ telemetry:
     - "kinit (\\d+/\\d+/\\d+\\s\\d{2}:\\d{2}:\\d{2})\\s+"          # kinit 2023/09/15 11:20:36 log
     - "(\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}\\.\\d+Z).+"      # 2023-09-15T11:20:36.123425532Z log
    oc_cli_path: /usr/bin/oc                                # optional, if not specified will be search in $PATH
+elastic: 
+    elastic_url: ""                                         # To track results in elasticsearch, give url to server here; will post telemetry details when url and index not blank
+    elastic_index: ""                                       # Elastic search index pattern to post results to
+
+
--- a/config/recommender_config.yaml
+++ b/config/recommender_config.yaml
@@ -1,5 +1,5 @@
 application: openshift-etcd
-namespace: openshift-etcd
+namespaces: openshift-etcd
 labels: app=openshift-etcd
 kubeconfig: ~/.kube/config.yaml
 prometheus_endpoint: <Prometheus_Endpoint>
@@ -7,6 +7,8 @@ auth_token: <Auth_Token>
 scrape_duration: 10m
 chaos_library: "kraken"
 log_level: INFO
+json_output_file: False
+json_output_folder_path:

 # for output purpose only do not change if not needed
 chaos_tests:
@@ -26,4 +28,8 @@ chaos_tests:
    - pod_network_chaos
  MEM:
    - node_memory_hog
-    - pvc_disk_fill
+    - pvc_disk_fill
+
+threshold: .7
+cpu_threshold: .5
+mem_threshold: .5
--- a/containers/Dockerfile
+++ b/containers/Dockerfile
@@ -1,28 +1,50 @@
-# Dockerfile for kraken
-
+# azure-client
 FROM mcr.microsoft.com/azure-cli:latest as azure-cli

-FROM registry.access.redhat.com/ubi8/ubi:latest
+# oc build
+FROM golang:1.22.4 AS oc-build
+RUN apt-get update && apt-get install -y libkrb5-dev
+WORKDIR /tmp
+RUN git clone --branch release-4.18 https://github.com/openshift/oc.git
+WORKDIR /tmp/oc
+RUN go mod edit -go 1.22.3 &&\
+    go get github.com/moby/buildkit@v0.12.5 &&\
+    go get github.com/containerd/containerd@v1.7.11&&\
+    go get github.com/docker/docker@v25.0.5&&\
+    go mod tidy && go mod vendor
+RUN make GO_REQUIRED_MIN_VERSION:= oc

-ENV KUBECONFIG /root/.kube/config
+FROM fedora:40
+RUN groupadd -g 1001 krkn && useradd -m -u 1001 -g krkn krkn
+RUN dnf update -y

-# Copy azure client binary from azure-cli image
+# krkn version that will be built (key=value form; the space-separated ENV syntax is legacy)
+ENV KRKN_VERSION=v1.6.1
+
+ENV KUBECONFIG=/home/krkn/.kube/config
+
+# install kubectl
+RUN curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl" &&\
+    cp kubectl /usr/local/bin/kubectl && chmod +x /usr/local/bin/kubectl &&\
+    cp kubectl /usr/bin/kubectl && chmod +x /usr/bin/kubectl
+
+# install the tooling krkn needs; every dnf call passes -y because the image build is non-interactive
+RUN dnf update -y && dnf install -y git python39 jq yq gettext wget which
+# copy azure client binary from azure-cli image
 COPY --from=azure-cli /usr/local/bin/az /usr/bin/az

-# Install dependencies
-RUN yum install -y git python39 python3-pip jq gettext wget && \
-    python3.9 -m pip install -U pip && \
-    git clone https://github.com/krkn-chaos/krkn.git --branch v1.5.7 /root/kraken && \
-    mkdir -p /root/.kube && cd /root/kraken && \
-    pip3.9 install -r requirements.txt && \
-    pip3.9 install virtualenv && \
-    wget https://github.com/mikefarah/yq/releases/latest/download/yq_linux_amd64 -O /usr/bin/yq && chmod +x /usr/bin/yq
+# copy oc client binary from oc-build image
+COPY --from=oc-build /tmp/oc/oc /usr/bin/oc

-# Get Kubernetes and OpenShift clients from stable releases
-WORKDIR /tmp
-RUN wget https://mirror.openshift.com/pub/openshift-v4/clients/ocp/stable/openshift-client-linux.tar.gz && tar -xvf openshift-client-linux.tar.gz && cp oc /usr/local/bin/oc && cp kubectl /usr/local/bin/kubectl
-
-WORKDIR /root/kraken
+# krkn build
+RUN git clone https://github.com/krkn-chaos/krkn.git --branch $KRKN_VERSION /home/krkn/kraken && \
+    mkdir -p /home/krkn/.kube
+WORKDIR /home/krkn/kraken
+RUN python3.9 -m ensurepip
+RUN pip3.9 install -r requirements.txt
+RUN pip3.9 install jsonschema

+RUN chown -R krkn:krkn /home/krkn
+USER krkn
 ENTRYPOINT ["python3.9", "run_kraken.py"]
-CMD ["--config=config/config.yaml"]
+CMD ["--config=config/config.yaml"]
--- a/containers/Dockerfile-ppc64le
+++ b/containers/Dockerfile-ppc64le
@@ -14,7 +14,7 @@ COPY --from=azure-cli /usr/local/bin/az /usr/bin/az
 # Install dependencies
 RUN yum install -y git python39 python3-pip jq gettext wget && \
    python3.9 -m pip install -U pip && \
-    git clone https://github.com/redhat-chaos/krkn.git --branch v1.5.7 /root/kraken && \
+    git clone https://github.com/redhat-chaos/krkn.git --branch v1.5.14 /root/kraken && \
    mkdir -p /root/.kube && cd /root/kraken && \
    pip3.9 install -r requirements.txt && \
    pip3.9 install virtualenv && \
@@ -22,7 +22,7 @@ RUN yum install -y git python39 python3-pip jq gettext wget && \

 # Get Kubernetes and OpenShift clients from stable releases
 WORKDIR /tmp
-RUN wget https://mirror.openshift.com/pub/openshift-v4/clients/ocp/stable/openshift-client-linux.tar.gz && tar -xvf openshift-client-linux.tar.gz && cp oc /usr/local/bin/oc && cp kubectl /usr/local/bin/kubectl
+RUN wget https://mirror.openshift.com/pub/openshift-v4/clients/ocp/stable/openshift-client-linux.tar.gz && tar -xvf openshift-client-linux.tar.gz && cp oc /usr/local/bin/oc && cp oc /usr/bin/oc && cp kubectl /usr/local/bin/kubectl && cp kubectl /usr/bin/kubectl

 WORKDIR /root/kraken

--- a/docs/getting_started.md
+++ b/docs/getting_started.md
@@ -14,11 +14,7 @@ For example, for adding a pod level scenario for a new application, refer to the
    namespace_pattern: ^<namespace>$
    label_selector: <pod label>
    kill: <number of pods to kill>
- id: wait-for-pods
-  config:
-    namespace_pattern: ^<namespace>$
-    label_selector: <pod label>
-    count: <expected number of pods that match namespace and label>
+    krkn_pod_recovery_time: <expected time for the pod to become ready>
 ```

 #### Node Scenario Yaml Template
--- a/docs/installation.md
+++ b/docs/installation.md
@@ -22,7 +22,7 @@ the capabilities of the current supported scenarios.
 Pick the latest stable release to install [here](https://github.com/krkn-chaos/krkn/releases).
 ```
 $ git clone https://github.com/krkn-chaos/krkn.git --branch <release version>
-$ cd kraken
+$ cd krkn
 ```

 #### Install the dependencies
--- a/docs/pod_scenarios.md
+++ b/docs/pod_scenarios.md
@@ -17,11 +17,8 @@ You can then create the scenario file with the following contents:
  config:
    namespace_pattern: ^kube-system$
    label_selector: k8s-app=kube-scheduler
- id: wait-for-pods
-  config:
-    namespace_pattern: ^kube-system$
-    label_selector: k8s-app=kube-scheduler
-    count: 3
+    krkn_pod_recovery_time: 120
+    
 ```

 Please adjust the schema reference to point to the [schema file](../scenarios/plugin.schema.json). This file will give you code completion and documentation for the available options in your IDE.
--- a/docs/service_hijacking_scenarios.md
+++ b/docs/service_hijacking_scenarios.md
@@ -0,0 +1,80 @@
+### Service Hijacking Scenarios
+
+Service Hijacking Scenarios aim to simulate fake HTTP responses from a workload targeted by a 
+`Service` already deployed in the cluster. 
+This scenario is executed by deploying a custom-made web service and modifying the target `Service`
+selector to direct traffic to this web service for a specified duration.
+
+The web service's source code is available [here](https://github.com/krkn-chaos/krkn-service-hijacking). 
+It employs a time-based test plan from the scenario configuration file, which specifies the behavior of resources during the chaos scenario as follows:
+
+```yaml
+service_target_port: http-web-svc # The port of the service to be hijacked (can be named or numeric, based on the workload and service configuration).
+service_name: nginx-service # The name of the service that will be hijacked.
+service_namespace: default # The namespace where the target service is located.
+image: quay.io/krkn-chaos/krkn-service-hijacking:v0.1.3 # Image of the krkn web service to be deployed to receive traffic.
+chaos_duration: 30 # Total duration of the chaos scenario in seconds.
+plan:
+  - resource: "/list/index.php" # Specifies the resource or path to respond to in the scenario. For paths, both the path and query parameters are captured but ignored. For resources, only query parameters are captured.
+
+    steps:                      # A time-based plan consisting of steps can be defined for each resource.
+      GET:                      # One or more HTTP methods can be specified for each step. Note: Non-standard methods are supported for fully custom web services (e.g., using NONEXISTENT instead of POST).
+
+        - duration: 15          # Duration in seconds for this step before moving to the next one, if defined. Otherwise, this step will continue until the chaos scenario ends.
+
+          status: 500           # HTTP status code to be returned in this step.
+          mime_type: "application/json" # MIME type of the response for this step.
+          payload: |            # The response payload for this step.
+            {
+              "status":"internal server error"
+            }
+        - duration: 15
+          status: 201
+          mime_type: "application/json"
+          payload: |
+            {
+              "status":"resource created"
+            }
+      POST:
+        - duration: 15
+          status: 401
+          mime_type: "application/json"
+          payload: |
+            {
+               "status": "unauthorized"
+            }
+        - duration: 15
+          status: 404
+          mime_type: "text/plain"
+          payload: "not found"
+
+
+```
+The scenario will focus on the `service_name` within the `service_namespace`, 
+substituting the selector with a randomly generated one, which is added as a label in the mock service manifest.
+This allows multiple scenarios to be executed in the same namespace, each targeting different services without 
+causing conflicts.
+
+The newly deployed mock web service will expose a `service_target_port`, 
+which can be either a named or numeric port based on the service configuration. 
+This ensures that the Service correctly routes HTTP traffic to the mock web service during the chaos run.
+
+Steps run one after another, starting from the deployment of the mock web service in the cluster, and each one lasts for its own `duration` seconds.
+For each HTTP resource, declared as an entry of the `plan` list through its `resource` key
+(it could be a specific resource, e.g., /list/index.php, or a path-based resource typical in MVC frameworks), 
+one or more HTTP request methods can be specified. Both standard and custom request methods are supported.
+
+During this time frame, the web service will respond with:
+
+- `status`: The [HTTP status code](https://datatracker.ietf.org/doc/html/rfc7231#section-6) (can be standard or custom).
+- `mime_type`: The [MIME type](https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/MIME_types) (can be standard or custom).
+- `payload`: The response body to be returned to the client.
+
+At the end of the step `duration`, the web service will proceed to the next step (if available) until 
+the global `chaos_duration` concludes. At this point, the original service will be restored, 
+and the custom web service and its resources will be undeployed.
+
+__NOTE__: Some clients (e.g., cURL, jQuery) may optimize queries using lightweight methods (like HEAD or OPTIONS) 
+to probe API behavior. If these methods are not defined in the test plan, the web service may respond with 
+a `405` or `404` status code. If you encounter unexpected behavior, consider this use case.
+
--- a/kind-config.yml
+++ b/kind-config.yml
@@ -2,6 +2,9 @@ kind: Cluster
 apiVersion: kind.x-k8s.io/v1alpha4
 nodes:
  - role: control-plane
+    extraPortMappings:
+      - containerPort: 30036
+        hostPort: 8888
  - role: control-plane
  - role: control-plane
  - role: worker
--- a/kraken/application_outage/actions.py
+++ b/kraken/application_outage/actions.py
@@ -4,6 +4,7 @@ import time
 import kraken.cerberus.setup as cerberus
 from jinja2 import Template
 import kraken.invoke.command as runcommand
+from krkn_lib.k8s import KrknKubernetes
 from krkn_lib.telemetry.k8s import KrknTelemetryKubernetes
 from krkn_lib.models.telemetry import ScenarioTelemetry
 from krkn_lib.utils.functions import get_yaml_item_value, log_exception
@@ -11,14 +12,14 @@ from krkn_lib.utils.functions import get_yaml_item_value, log_exception

 # Reads the scenario config, applies and deletes a network policy to
 # block the traffic for the specified duration
-def run(scenarios_list, config, wait_duration, telemetry: KrknTelemetryKubernetes) -> (list[str], list[ScenarioTelemetry]):
+def run(scenarios_list, config, wait_duration,kubecli: KrknKubernetes, telemetry: KrknTelemetryKubernetes) -> (list[str], list[ScenarioTelemetry]):
    failed_post_scenarios = ""
    scenario_telemetries: list[ScenarioTelemetry] = []
    failed_scenarios = []
    for app_outage_config in scenarios_list:
        scenario_telemetry = ScenarioTelemetry()
        scenario_telemetry.scenario = app_outage_config
-        scenario_telemetry.startTimeStamp = time.time()
+        scenario_telemetry.start_timestamp = time.time()
        telemetry.set_parameters_base64(scenario_telemetry, app_outage_config)
        if len(app_outage_config) > 1:
            try:
@@ -49,25 +50,22 @@ spec:
  podSelector:
    matchLabels: {{ pod_selector }}
  policyTypes: {{ traffic_type }}
-                    """
+"""
                    t = Template(network_policy_template)
                    rendered_spec = t.render(pod_selector=pod_selector, traffic_type=traffic_type)
-                    # Write the rendered template to a file
-                    with open("kraken_network_policy.yaml", "w") as f:
-                        f.write(rendered_spec)
+                    yaml_spec = yaml.safe_load(rendered_spec)
                    # Block the traffic by creating network policy
                    logging.info("Creating the network policy")
-                    runcommand.invoke(
-                        "kubectl create -f %s -n %s --validate=false" % ("kraken_network_policy.yaml", namespace)
-                    )

+                    kubecli.create_net_policy(yaml_spec, namespace)
+                   
                    # wait for the specified duration
                    logging.info("Waiting for the specified duration in the config: %s" % (duration))
                    time.sleep(duration)

                    # unblock the traffic by deleting the network policy
                    logging.info("Deleting the network policy")
-                    runcommand.invoke("kubectl delete -f %s -n %s" % ("kraken_network_policy.yaml", namespace))
+                    kubecli.delete_net_policy("kraken-deny", namespace)

                    logging.info("End of scenario. Waiting for the specified duration: %s" % (wait_duration))
                    time.sleep(wait_duration)
@@ -75,12 +73,12 @@ spec:
                    end_time = int(time.time())
                    cerberus.publish_kraken_status(config, failed_post_scenarios, start_time, end_time)
            except Exception as e :
-                scenario_telemetry.exitStatus = 1
+                scenario_telemetry.exit_status = 1
                failed_scenarios.append(app_outage_config)
                log_exception(app_outage_config)
            else:
-                scenario_telemetry.exitStatus = 0
-            scenario_telemetry.endTimeStamp = time.time()
+                scenario_telemetry.exit_status = 0
+            scenario_telemetry.end_timestamp = time.time()
            scenario_telemetries.append(scenario_telemetry)
    return failed_scenarios, scenario_telemetries

--- a/kraken/arcaflow_plugin/arcaflow_plugin.py
+++ b/kraken/arcaflow_plugin/arcaflow_plugin.py
@@ -16,12 +16,12 @@ def run(scenarios_list: List[str], kubeconfig_path: str, telemetry: KrknTelemetr
    for scenario in scenarios_list:
        scenario_telemetry = ScenarioTelemetry()
        scenario_telemetry.scenario = scenario
-        scenario_telemetry.startTimeStamp = time.time()
+        scenario_telemetry.start_timestamp = time.time()
        telemetry.set_parameters_base64(scenario_telemetry,scenario)
        engine_args = build_args(scenario)
        status_code = run_workflow(engine_args, kubeconfig_path)
-        scenario_telemetry.endTimeStamp = time.time()
-        scenario_telemetry.exitStatus = status_code
+        scenario_telemetry.end_timestamp = time.time()
+        scenario_telemetry.exit_status = status_code
        scenario_telemetries.append(scenario_telemetry)
        if status_code != 0:
            failed_post_scenarios.append(scenario)
@@ -36,9 +36,10 @@ def run_workflow(engine_args: arcaflow.EngineArgs, kubeconfig_path: str) -> int:

 def build_args(input_file: str) -> arcaflow.EngineArgs:
    """sets the kubeconfig parsed by setArcaKubeConfig as an input to the arcaflow workflow"""
-    context = Path(input_file).parent
-    workflow = "{}/workflow.yaml".format(context)
-    config = "{}/config.yaml".format(context)
+    current_path = Path().resolve()
+    context = f"{current_path}/{Path(input_file).parent}"
+    workflow = f"{context}/workflow.yaml"
+    config = f"{context}/config.yaml"
    if not os.path.exists(context):
        raise Exception(
            "context folder for arcaflow workflow not found: {}".format(
@@ -61,7 +62,8 @@ def build_args(input_file: str) -> arcaflow.EngineArgs:
    engine_args = arcaflow.EngineArgs()
    engine_args.context = context
    engine_args.config = config
-    engine_args.input = input_file
+    engine_args.workflow = workflow
+    engine_args.input = f"{current_path}/{input_file}"
    return engine_args


--- a/kraken/chaos_recommender/analysis.py
+++ b/kraken/chaos_recommender/analysis.py
@@ -4,13 +4,10 @@ import pandas as pd
 import kraken.chaos_recommender.kraken_tests as kraken_tests
 import time

-threshold = .7  # Adjust the threshold as needed
-heatmap_cpu_threshold = .5
-heatmap_mem_threshold = .5
-
 KRAKEN_TESTS_PATH = "./kraken_chaos_tests.txt"

-#Placeholder, this should be done with topology
+
+# Placeholder, this should be done with topology
 def return_critical_services():
    return ["web", "cart"]

@@ -19,15 +16,18 @@ def load_telemetry_data(file_path):
    data = pd.read_csv(file_path, delimiter=r"\s+")
    return data

+
 def calculate_zscores(data):
    zscores = pd.DataFrame()
+    zscores["Namespace"] = data["namespace"]
    zscores["Service"] = data["service"]
    zscores["CPU"] = (data["CPU"] - data["CPU"].mean()) / data["CPU"].std()
    zscores["Memory"] = (data["MEM"] - data["MEM"].mean()) / data["MEM"].std()
    zscores["Network"] = (data["NETWORK"] - data["NETWORK"].mean()) / data["NETWORK"].std()
    return zscores

-def identify_outliers(data):
+
+def identify_outliers(data, threshold):
    outliers_cpu = data[data["CPU"] > threshold]["Service"].tolist()
    outliers_memory = data[data["Memory"] > threshold]["Service"].tolist()
    outliers_network = data[data["Network"] > threshold]["Service"].tolist()
@@ -47,44 +47,85 @@ def get_services_above_heatmap_threshold(dataframe, cpu_threshold, mem_threshold
    return cpu_services, mem_services


-def analysis(file_path, chaos_tests_config):
+def analysis(file_path, namespaces, chaos_tests_config, threshold,
+             heatmap_cpu_threshold, heatmap_mem_threshold):
    # Load the telemetry data from file
+    logging.info("Fetching the Telemetry data...")
    data = load_telemetry_data(file_path)

    # Calculate Z-scores for CPU, Memory, and Network columns
    zscores = calculate_zscores(data)
+    # Dict for saving analysis data -- key is the namespace
+    analysis_data = {}

-    # Identify outliers
-    outliers_cpu, outliers_memory, outliers_network = identify_outliers(zscores)
-    cpu_services, mem_services = get_services_above_heatmap_threshold(data, heatmap_cpu_threshold, heatmap_mem_threshold)
+    # Identify outliers for each namespace
+    for namespace in namespaces:

-    # Display the identified outliers
-    logging.info("======================== Profiling ==================================")
-    logging.info(f"CPU outliers: {outliers_cpu}")
-    logging.info(f"Memory outliers: {outliers_memory}")
-    logging.info(f"Network outliers: {outliers_network}")
-    logging.info("===================== HeatMap Analysis ==============================")
+        logging.info(f"Identifying outliers for namespace {namespace}...")
+
+        namespace_zscores = zscores.loc[zscores["Namespace"] == namespace]
+        namespace_data = data.loc[data["namespace"] == namespace]
+        outliers_cpu, outliers_memory, outliers_network = identify_outliers(
+            namespace_zscores, threshold)
+        cpu_services, mem_services = get_services_above_heatmap_threshold(
+            namespace_data, heatmap_cpu_threshold, heatmap_mem_threshold)
+
+        analysis_data[namespace] = analysis_json(outliers_cpu, outliers_memory,
+                                                 outliers_network,
+                                                 cpu_services, mem_services,
+                                                 chaos_tests_config)
+
+        if cpu_services:
+            logging.info(f"These services use significant CPU compared to "
+                         f"their assigned limits: {cpu_services}")
+        else:
+            logging.info("There are no services that are using significant "
+                         "CPU compared to their assigned limits "
+                         "(infinite in case no limits are set).")
+        if mem_services:
+            logging.info(f"These services use significant MEMORY compared to "
+                         f"their assigned limits: {mem_services}")
+        else:
+            logging.info("There are no services that are using significant "
+                         "MEMORY compared to their assigned limits "
+                         "(infinite in case no limits are set).")
+        time.sleep(2)
+
+    logging.info("Please check data in utilisation.txt for further analysis")
+
+    return analysis_data
+
+
+def analysis_json(outliers_cpu, outliers_memory, outliers_network,
+                  cpu_services, mem_services, chaos_tests_config):
+
+    profiling = {
+        "cpu_outliers": outliers_cpu,
+        "memory_outliers": outliers_memory,
+        "network_outliers": outliers_network
+    }
+
+    heatmap = {
+        "services_with_cpu_heatmap_above_threshold": cpu_services,
+        "services_with_mem_heatmap_above_threshold": mem_services
+    }
+
+    recommendations = {}

    if cpu_services:
-        logging.info("Services with CPU_HEATMAP above threshold:", cpu_services)
-    else:
-        logging.info("There are no services that are using siginificant CPU compared to their assigned limits (infinite in case no limits are set).")
+        cpu_recommend = {"services": cpu_services,
+                         "tests": chaos_tests_config['CPU']}
+        recommendations["cpu_services_recommendations"] = cpu_recommend
+
    if mem_services:
-        logging.info("Services with MEM_HEATMAP above threshold:", mem_services)
-    else:
-        logging.info("There are no services that are using siginificant MEMORY compared to their assigned limits (infinite in case no limits are set).")
-    time.sleep(2)
-    logging.info("======================= Recommendations =============================")
-    if cpu_services:
-        logging.info(f"Recommended tests for {str(cpu_services)}  :\n {chaos_tests_config['CPU']}")
-        logging.info("\n")
-    if mem_services:
-        logging.info(f"Recommended tests for {str(mem_services)}  :\n {chaos_tests_config['MEM']}")
-        logging.info("\n")
+        mem_recommend = {"services": mem_services,
+                         "tests": chaos_tests_config['MEM']}
+        recommendations["mem_services_recommendations"] = mem_recommend

    if outliers_network:
-        logging.info(f"Recommended tests for  str(outliers_network)  :\n {chaos_tests_config['NETWORK']}")
-        logging.info("\n")
+        outliers_network_recommend = {"outliers_networks": outliers_network,
+                                      "tests": chaos_tests_config['NETWORK']}
+        recommendations["outliers_network_recommendations"] = (
+            outliers_network_recommend)

-    logging.info("\n")
-    logging.info("Please check data in utilisation.txt for further analysis")
+    return [profiling, heatmap, recommendations]
--- a/kraken/chaos_recommender/prometheus.py
+++ b/kraken/chaos_recommender/prometheus.py
@@ -1,6 +1,5 @@
 import logging

-import pandas
 from prometheus_api_client import PrometheusConnect
 import pandas as pd
 import urllib3
@@ -8,6 +7,7 @@ import urllib3

 saved_metrics_path = "./utilisation.txt"

+
 def convert_data_to_dataframe(data, label):
    df = pd.DataFrame()
    df['service'] = [item['metric']['pod'] for item in data]
@@ -17,29 +17,60 @@ def convert_data_to_dataframe(data, label):


 def convert_data(data, service):
-
    result = {}
    for entry in data:
        pod_name = entry['metric']['pod']
        value = entry['value'][1]
        result[pod_name] = value
-    return result.get(service, '100000000000') # for those pods whose limits are not defined they can take as much resources, there assigning a very high value
-
-def save_utilization_to_file(cpu_data, cpu_limits_result, mem_data, mem_limits_result, network_data, filename):
-    df_cpu = convert_data_to_dataframe(cpu_data, "CPU")
-    merged_df = pd.DataFrame(columns=['service','CPU','CPU_LIMITS','MEM','MEM_LIMITS','NETWORK'])
-    services = df_cpu.service.unique()
-    logging.info(services)
-
-    for s in services:
-
-        new_row_df = pd.DataFrame( {"service": s, "CPU" : convert_data(cpu_data, s),
-                    "CPU_LIMITS" : convert_data(cpu_limits_result, s),
-                    "MEM" : convert_data(mem_data, s), "MEM_LIMITS" : convert_data(mem_limits_result, s),
-                    "NETWORK" : convert_data(network_data, s)}, index=[0])
-        merged_df = pd.concat([merged_df, new_row_df], ignore_index=True)
+    return result.get(service)  # None when `data` holds no sample for this pod


+def convert_data_limits(data, node_data, service, prometheus):
+    """Return the limit sample reported for pod `service`, or the get_node_capacity() fallback when it has none."""
+    result = {entry['metric']['pod']: entry['value'][1] for entry in data}
+    if service in result:
+        return result[service]
+    # Resolve the fallback only when needed: get_node_capacity() issues a Prometheus query on every call.
+    return get_node_capacity(node_data, service, prometheus)
+
+def get_node_capacity(node_data, pod_name, prometheus):
+    """Look up the capacity sample of the node that hosts ``pod_name``.
+
+    ``kube_pod_info`` is queried through ``prometheus`` to learn the node the
+    pod runs on; ``node_data`` is then scanned for the entry whose ``node``
+    label matches. Returns None when the query yields nothing, and the
+    placeholder '1000000000' when no entry in ``node_data`` matches.
+    """
+    pod_info = prometheus.custom_query(f'kube_pod_info{{pod="{pod_name}"}}')
+    if not pod_info:
+        return None
+    hosting_node = pod_info[0]['metric']['node']
+    # Lazily scan the entries so the first match wins, as in a plain loop.
+    matches = (entry['value'][1] for entry in node_data if entry['metric']['node'] == hosting_node)
+    return next(matches, '1000000000')
+
+
+def save_utilization_to_file(utilization, filename, prometheus):
+
+    merged_df = pd.DataFrame(columns=['namespace', 'service', 'CPU', 'CPU_LIMITS', 'MEM', 'MEM_LIMITS', 'NETWORK'])
+    for namespace in utilization:
+        # Loading utilization_data[] for namespace
+        # indexes -- 0 CPU, 1 CPU limits, 2 mem, 3 mem limits, 4 network, 5 node CPU capacity, 6 node mem capacity
+        utilization_data = utilization[namespace]
+        df_cpu = convert_data_to_dataframe(utilization_data[0], "CPU")
+        services = df_cpu.service.unique()
+        logging.info(f"Services for namespace {namespace}: {services}")
+
+        for s in services:
+
+            new_row_df = pd.DataFrame({
+                "namespace": namespace, "service": s,
+                "CPU": convert_data(utilization_data[0], s),
+                "CPU_LIMITS": convert_data_limits(utilization_data[1],utilization_data[5], s, prometheus),
+                "MEM": convert_data(utilization_data[2], s),
+                "MEM_LIMITS": convert_data_limits(utilization_data[3], utilization_data[6], s, prometheus),
+                "NETWORK": convert_data(utilization_data[4], s)}, index=[0])
+            merged_df = pd.concat([merged_df, new_row_df], ignore_index=True)

    # Convert columns to string
    merged_df['CPU'] = merged_df['CPU'].astype(str)
@@ -49,48 +80,65 @@ def save_utilization_to_file(cpu_data, cpu_limits_result, mem_data, mem_limits_r
    merged_df['NETWORK'] = merged_df['NETWORK'].astype(str)

    # Extract integer part before the decimal point
-    merged_df['CPU'] = merged_df['CPU'].str.split('.').str[0]
-    merged_df['MEM'] = merged_df['MEM'].str.split('.').str[0]
-    merged_df['CPU_LIMITS'] = merged_df['CPU_LIMITS'].str.split('.').str[0]
-    merged_df['MEM_LIMITS'] = merged_df['MEM_LIMITS'].str.split('.').str[0]
-    merged_df['NETWORK'] = merged_df['NETWORK'].str.split('.').str[0]
+    #merged_df['CPU'] = merged_df['CPU'].str.split('.').str[0]
+    #merged_df['MEM'] = merged_df['MEM'].str.split('.').str[0]
+    #merged_df['CPU_LIMITS'] = merged_df['CPU_LIMITS'].str.split('.').str[0]
+    #merged_df['MEM_LIMITS'] = merged_df['MEM_LIMITS'].str.split('.').str[0]
+    #merged_df['NETWORK'] = merged_df['NETWORK'].str.split('.').str[0]

    merged_df.to_csv(filename, sep='\t', index=False)

-def fetch_utilization_from_prometheus(prometheus_endpoint, auth_token, namespace, scrape_duration):
+
+def fetch_utilization_from_prometheus(prometheus_endpoint, auth_token,
+                                      namespaces, scrape_duration):
    urllib3.disable_warnings()
-    prometheus = PrometheusConnect(url=prometheus_endpoint, headers={'Authorization':'Bearer {}'.format(auth_token)}, disable_ssl=True)
+    prometheus = PrometheusConnect(url=prometheus_endpoint, headers={
+        'Authorization':'Bearer {}'.format(auth_token)}, disable_ssl=True)

-    # Fetch CPU utilization
-    cpu_query = 'sum (rate (container_cpu_usage_seconds_total{image!="", namespace="%s"}[%s])) by (pod) *1000' % (namespace,scrape_duration)
-    logging.info(cpu_query)
-    cpu_result = prometheus.custom_query(cpu_query)
-    cpu_data = cpu_result
-
-
-    cpu_limits_query = '(sum by (pod) (kube_pod_container_resource_limits{resource="cpu", namespace="%s"}))*1000' %(namespace)
-    logging.info(cpu_limits_query)
-    cpu_limits_result = prometheus.custom_query(cpu_limits_query)
-
-
-    mem_query = 'sum by (pod) (avg_over_time(container_memory_usage_bytes{image!="", namespace="%s"}[%s]))' % (namespace, scrape_duration)
-    logging.info(mem_query)
-    mem_result = prometheus.custom_query(mem_query)
-    mem_data = mem_result
-
-    mem_limits_query = 'sum by (pod) (kube_pod_container_resource_limits{resource="memory", namespace="%s"})  ' %(namespace)
-    logging.info(mem_limits_query)
-    mem_limits_result = prometheus.custom_query(mem_limits_query)
-
-
-    network_query = 'sum by (pod) ((avg_over_time(container_network_transmit_bytes_total{namespace="%s"}[%s])) + \
-    (avg_over_time(container_network_receive_bytes_total{namespace="%s"}[%s])))' % (namespace, scrape_duration, namespace, scrape_duration)
-    network_result = prometheus.custom_query(network_query)
-    logging.info(network_query)
-    network_data = network_result
-
-
-    save_utilization_to_file(cpu_data, cpu_limits_result, mem_data, mem_limits_result, network_data, saved_metrics_path)
-    return saved_metrics_path
+    # Dicts for saving utilisation and queries -- key is namespace
+    utilization = {}
+    queries = {}
+
+    logging.info("Fetching utilization...")
+    for namespace in namespaces:
+
+        # Fetch CPU utilization
+        cpu_query = 'sum (rate (container_cpu_usage_seconds_total{image!="", namespace="%s"}[%s])) by (pod) *1000' % (namespace,scrape_duration)
+        cpu_result = prometheus.custom_query(cpu_query)
+
+        cpu_limits_query = '(sum by (pod) (kube_pod_container_resource_limits{resource="cpu", namespace="%s"}))*1000' %(namespace)
+        cpu_limits_result = prometheus.custom_query(cpu_limits_query)
+
+        node_cpu_limits_query = 'kube_node_status_capacity{resource="cpu", unit="core"}*1000'
+        node_cpu_limits_result = prometheus.custom_query(node_cpu_limits_query)
+
+        mem_query = 'sum by (pod) (avg_over_time(container_memory_usage_bytes{image!="", namespace="%s"}[%s]))' % (namespace, scrape_duration)
+        mem_result = prometheus.custom_query(mem_query)
+
+        mem_limits_query = 'sum by (pod) (kube_pod_container_resource_limits{resource="memory", namespace="%s"})  ' %(namespace)
+        mem_limits_result = prometheus.custom_query(mem_limits_query)
+
+        node_mem_limits_query = 'kube_node_status_capacity{resource="memory", unit="byte"}'
+        node_mem_limits_result = prometheus.custom_query(node_mem_limits_query)
+
+        network_query = 'sum by (pod) ((avg_over_time(container_network_transmit_bytes_total{namespace="%s"}[%s])) + \
+        (avg_over_time(container_network_receive_bytes_total{namespace="%s"}[%s])))' % (namespace, scrape_duration, namespace, scrape_duration)
+        network_result = prometheus.custom_query(network_query)
+
+        utilization[namespace] = [cpu_result, cpu_limits_result, mem_result, mem_limits_result, network_result, node_cpu_limits_result, node_mem_limits_result ]
+        queries[namespace] = json_queries(cpu_query, cpu_limits_query, mem_query, mem_limits_query, network_query)
+
+    save_utilization_to_file(utilization, saved_metrics_path, prometheus)
+
+    return saved_metrics_path, queries


+def json_queries(cpu_query, cpu_limits_query, mem_query, mem_limits_query, network_query):
+    """Bundle the five query strings into one dict, keyed by the metric each query fetches."""
+    return {
+        "cpu_query": cpu_query,
+        "cpu_limit_query": cpu_limits_query,
+        "memory_query": mem_query,
+        "memory_limit_query": mem_limits_query,
+        "network_query": network_query,
+    }
--- a/kraken/network_chaos/actions.py
+++ b/kraken/network_chaos/actions.py
@@ -23,7 +23,7 @@ def run(scenarios_list, config, wait_duration, kubecli: KrknKubernetes, telemetr
    for net_config in scenarios_list:
        scenario_telemetry = ScenarioTelemetry()
        scenario_telemetry.scenario = net_config
-        scenario_telemetry.startTimeStamp = time.time()
+        scenario_telemetry.start_timestamp = time.time()
        telemetry.set_parameters_base64(scenario_telemetry, net_config)
        try:
            with open(net_config, "r") as file:
@@ -114,11 +114,11 @@ def run(scenarios_list, config, wait_duration, kubecli: KrknKubernetes, telemetr
                    logging.info("Deleting jobs")
                    delete_job(joblst[:], kubecli)
        except (RuntimeError, Exception):
-            scenario_telemetry.exitStatus = 1
+            scenario_telemetry.exit_status = 1
            failed_scenarios.append(net_config)
            log_exception(net_config)
        else:
-            scenario_telemetry.exitStatus = 0
+            scenario_telemetry.exit_status = 0
        scenario_telemetries.append(scenario_telemetry)
    return failed_scenarios, scenario_telemetries

--- a/kraken/node_actions/run.py
+++ b/kraken/node_actions/run.py
@@ -15,7 +15,7 @@ import kraken.cerberus.setup as cerberus
 from krkn_lib.k8s import KrknKubernetes
 from krkn_lib.telemetry.k8s import KrknTelemetryKubernetes
 from krkn_lib.models.telemetry import ScenarioTelemetry
-from krkn_lib.utils.functions import get_yaml_item_value
+from krkn_lib.utils.functions import get_yaml_item_value, log_exception

 node_general = False

@@ -61,7 +61,7 @@ def run(scenarios_list, config, wait_duration, kubecli: KrknKubernetes, telemetr
    for node_scenario_config in scenarios_list:
        scenario_telemetry = ScenarioTelemetry()
        scenario_telemetry.scenario = node_scenario_config
-        scenario_telemetry.startTimeStamp = time.time()
+        scenario_telemetry.start_timestamp = time.time()
        telemetry.set_parameters_base64(scenario_telemetry, node_scenario_config)
        with open(node_scenario_config, "r") as f:
            node_scenario_config = yaml.full_load(f)
@@ -78,13 +78,13 @@ def run(scenarios_list, config, wait_duration, kubecli: KrknKubernetes, telemetr
                            cerberus.get_status(config, start_time, end_time)
                            logging.info("")
                        except (RuntimeError, Exception) as e:
-                            scenario_telemetry.exitStatus = 1
+                            scenario_telemetry.exit_status = 1
                            failed_scenarios.append(node_scenario_config)
                            log_exception(node_scenario_config)
                        else:
-                            scenario_telemetry.exitStatus = 0
+                            scenario_telemetry.exit_status = 0

-                        scenario_telemetry.endTimeStamp = time.time()
+                        scenario_telemetry.end_timestamp = time.time()
                        scenario_telemetries.append(scenario_telemetry)

    return failed_scenarios, scenario_telemetries
--- a/kraken/plugins/init.py
+++ b/kraken/plugins/init.py
@@ -2,11 +2,14 @@ import dataclasses
 import json
 import logging
 from os.path import abspath
-from typing import List, Dict
+from typing import List, Dict, Any
 import time

 from arcaflow_plugin_sdk import schema, serialization, jsonschema
 from arcaflow_plugin_kill_pod import kill_pods, wait_for_pods
+from krkn_lib.k8s import KrknKubernetes
+from krkn_lib.k8s.pods_monitor_pool import PodsMonitorPool
+
 import kraken.plugins.node_scenarios.vmware_plugin as vmware_plugin
 import kraken.plugins.node_scenarios.ibmcloud_plugin as ibmcloud_plugin
 from kraken.plugins.run_python_plugin import run_python_file
@@ -47,11 +50,14 @@ class Plugins:
                )
            self.steps_by_id[step.schema.id] = step

+    def unserialize_scenario(self, file: str) -> Any:
+        return serialization.load_from_file(abspath(file))  # load via the arcaflow SDK, resolving `file` to an absolute path first
+
    def run(self, file: str, kubeconfig_path: str, kraken_config: str):
        """
        Run executes a series of steps
        """
-        data = serialization.load_from_file(abspath(file))
+        data = self.unserialize_scenario(abspath(file))
        if not isinstance(data, list):
            raise Exception(
                "Invalid scenario configuration file: {} expected list, found {}".format(file, type(data).__name__)
@@ -213,6 +219,12 @@ PLUGINS = Plugins(
                "error"
            ]
        ),
+        PluginStep(
+            network_chaos,
+            [
+                "error"
+            ]
+        ),        
        PluginStep(
            pod_outage,
            [
@@ -235,25 +247,99 @@ PLUGINS = Plugins(
 )


-def run(scenarios: List[str], kubeconfig_path: str, kraken_config: str, failed_post_scenarios: List[str], wait_duration: int, telemetry: KrknTelemetryKubernetes) -> (List[str], list[ScenarioTelemetry]):
+def run(scenarios: List[str],
+        kubeconfig_path: str,
+        kraken_config: str,
+        failed_post_scenarios: List[str],
+        wait_duration: int,
+        telemetry: KrknTelemetryKubernetes,
+        kubecli: KrknKubernetes
+        ) -> (List[str], list[ScenarioTelemetry]):
+    """
+    Runs each plugin scenario file while monitoring the pods targeted by its
+    "kill-pods" steps.
+
+    :param scenarios: paths of the scenario files
+    :param kubeconfig_path: kubeconfig path forwarded to PLUGINS.run
+    :param kraken_config: kraken configuration forwarded to PLUGINS.run
+    :param failed_post_scenarios: list the failed scenarios are appended to
+    :param wait_duration: seconds to sleep after each successful scenario
+    :param telemetry: client used to attach the scenario parameters to the
+        scenario telemetry
+    :param kubecli: Kubernetes client handed to the pods monitor pool
+    :return: failed_post_scenarios and the telemetry of every scenario
+    """
    scenario_telemetries: list[ScenarioTelemetry] = []
    for scenario in scenarios:
        scenario_telemetry = ScenarioTelemetry()
        scenario_telemetry.scenario = scenario
-        scenario_telemetry.startTimeStamp = time.time()
+        scenario_telemetry.start_timestamp = time.time()
        telemetry.set_parameters_base64(scenario_telemetry, scenario)
        logging.info('scenario ' + str(scenario))
+        pool = PodsMonitorPool(kubecli)
+
        try:
+            # parsed inside the try so that an unreadable or malformed scenario
+            # file is recorded as a failed scenario instead of aborting the run
+            kill_scenarios = [kill_scenario for kill_scenario in PLUGINS.unserialize_scenario(scenario) if kill_scenario["id"] == "kill-pods"]
+            start_monitoring(pool, kill_scenarios)
            PLUGINS.run(scenario, kubeconfig_path, kraken_config)
+            result = pool.join()
+            scenario_telemetry.affected_pods = result
+            if result.error:
+                raise Exception(f"unrecovered pods: {result.error}")
+
        except Exception as e:
-            scenario_telemetry.exitStatus = 1
+            logging.error(f"scenario exception: {str(e)}")
+            scenario_telemetry.exit_status = 1
+            pool.cancel()
            failed_post_scenarios.append(scenario)
            log_exception(scenario)
        else:
-            scenario_telemetry.exitStatus = 0
+            scenario_telemetry.exit_status = 0
            logging.info("Waiting for the specified duration: %s" % (wait_duration))
            time.sleep(wait_duration)
        scenario_telemetries.append(scenario_telemetry)
-        scenario_telemetry.endTimeStamp = time.time()
+        scenario_telemetry.end_timestamp = time.time()

    return failed_post_scenarios, scenario_telemetries
+
+
+def start_monitoring(pool: PodsMonitorPool, scenarios: list[Any]):
+    """
+    Starts monitoring the pods targeted by each kill-pods step.
+
+    Pods are selected by namespace pattern plus label selector when both are
+    configured, otherwise by namespace pattern plus pod name pattern.
+
+    :param pool: monitor pool the pod selections are submitted to
+    :param scenarios: kill-pods steps; each must provide a "config" mapping
+        containing "krkn_pod_recovery_time"
+    :raises Exception: if a step provides neither of the supported selector
+        combinations
+    """
+    for kill_scenario in scenarios:
+        config = kill_scenario["config"]
+        recovery_time = config["krkn_pod_recovery_time"]
+        if "namespace_pattern" in config and "label_selector" in config:
+            namespace_pattern = config["namespace_pattern"]
+            label_selector = config["label_selector"]
+            pool.select_and_monitor_by_namespace_pattern_and_label(
+                namespace_pattern=namespace_pattern,
+                label_selector=label_selector,
+                max_timeout=recovery_time)
+            logging.info(
+                f"waiting {recovery_time} seconds for pod recovery, "
+                f"pod label selector: {label_selector} namespace pattern: {namespace_pattern}")
+        elif "namespace_pattern" in config and "name_pattern" in config:
+            namespace_pattern = config["namespace_pattern"]
+            name_pattern = config["name_pattern"]
+            pool.select_and_monitor_by_name_pattern_and_namespace_pattern(
+                pod_name_pattern=name_pattern,
+                namespace_pattern=namespace_pattern,
+                max_timeout=recovery_time)
+            logging.info(
+                f"waiting {recovery_time} seconds for pod recovery, "
+                f"pod name pattern: {name_pattern} namespace pattern: {namespace_pattern}")
+        else:
+            raise Exception(f"impossible to determine monitor parameters, check {kill_scenario} configuration")
--- a/kraken/plugins/network/ingress_shaping.py
+++ b/kraken/plugins/network/ingress_shaping.py
@@ -62,7 +62,7 @@ class NetworkScenarioConfig:
        typing.Optional[int],
        validation.min(1)
    ] = field(
-        default=300,
+        default=30,
        metadata={
            "name": "Wait Duration",
            "description":
@@ -864,7 +864,7 @@ def network_chaos(cfg: NetworkScenarioConfig) -> typing.Tuple[
                )
            logging.info("Waiting for parallel job to finish")
            start_time = int(time.time())
-            wait_for_job(batch_cli, job_list[:], cfg.wait_duration)
+            wait_for_job(batch_cli, job_list[:], cfg.test_duration+100)
            end_time = int(time.time())
            if publish:
                cerberus.publish_kraken_status(
@@ -893,7 +893,7 @@ def network_chaos(cfg: NetworkScenarioConfig) -> typing.Tuple[
                    )
                logging.info("Waiting for serial job to finish")
                start_time = int(time.time())
-                wait_for_job(batch_cli, job_list[:], cfg.wait_duration)
+                wait_for_job(batch_cli, job_list[:], cfg.test_duration+100)
                logging.info("Deleting jobs")
                delete_jobs(cli, batch_cli, job_list[:])
                job_list = []
--- a/kraken/plugins/node_scenarios/vmware_plugin.py
+++ b/kraken/plugins/node_scenarios/vmware_plugin.py
@@ -119,11 +119,11 @@ class vSphere:
        vm = self.get_vm(instance_id)
        try:
            self.client.vcenter.vm.Power.stop(vm)
-            logging.info("Stopped VM -- '{}-({})'", instance_id, vm)
+            logging.info(f"Stopped VM -- '{instance_id}-({vm})'")
            return True
        except AlreadyInDesiredState:
            logging.info(
-                "VM '{}'-'({})' is already Powered Off", instance_id, vm
+                f"VM '{instance_id}'-'({vm})' is already Powered Off"
            )
            return False

@@ -136,11 +136,11 @@ class vSphere:
        vm = self.get_vm(instance_id)
        try:
            self.client.vcenter.vm.Power.start(vm)
-            logging.info("Started VM -- '{}-({})'", instance_id, vm)
+            logging.info(f"Started VM -- '{instance_id}-({vm})'")
            return True
        except AlreadyInDesiredState:
            logging.info(
-                "VM '{}'-'({})' is already Powered On", instance_id, vm
+                f"VM '{instance_id}'-'({vm})' is already Powered On"
            )
            return False

@@ -318,12 +318,12 @@ class vSphere:
        try:
            vm = self.get_vm(instance_id)
            state = self.client.vcenter.vm.Power.get(vm).state
-            logging.info("Check instance %s status", instance_id)
+            logging.info(f"Check instance {instance_id} status")
            return state
        except Exception as e:
            logging.error(
-                "Failed to get node instance status %s. Encountered following "
-                "exception: %s.", instance_id, e
+                f"Failed to get node instance status {instance_id}. Encountered following "
+                f"exception: {str(e)}. "
            )
            return None

@@ -338,16 +338,14 @@ class vSphere:
        while vm is not None:
            vm = self.get_vm(instance_id)
            logging.info(
-                "VM %s is still being deleted, "
-                "sleeping for 5 seconds",
-                instance_id
+                f"VM {instance_id} is still being deleted, "
+                f"sleeping for 5 seconds"
            )
            time.sleep(5)
            time_counter += 5
            if time_counter >= timeout:
                logging.info(
-                    "VM %s is still not deleted in allotted time",
-                    instance_id
+                    f"VM {instance_id} is still not deleted in allotted time"
                )
                return False
        return True
@@ -371,8 +369,7 @@ class vSphere:
            time_counter += 5
            if time_counter >= timeout:
                logging.info(
-                    "VM %s is still not ready in allotted time",
-                    instance_id
+                    f"VM {instance_id} is still not ready in allotted time"
                )
                return False
        return True
@@ -388,16 +385,14 @@ class vSphere:
        while status != Power.State.POWERED_OFF:
            status = self.get_vm_status(instance_id)
            logging.info(
-                "VM %s is still not running, "
-                "sleeping for 5 seconds",
-                instance_id
+                f"VM {instance_id} is still not running, "
+                f"sleeping for 5 seconds"
            )
            time.sleep(5)
            time_counter += 5
            if time_counter >= timeout:
                logging.info(
-                    "VM %s is still not ready in allotted time",
-                    instance_id
+                    f"VM {instance_id} is still not ready in allotted time"
                )
                return False
        return True
@@ -561,7 +556,7 @@ def node_start(
            try:
                for _ in range(cfg.runs):
                    logging.info("Starting node_start_scenario injection")
-                    logging.info("Starting the node %s ", name)
+                    logging.info(f"Starting the node {name} ")
                    vm_started = vsphere.start_instances(name)
                    if vm_started:
                        vsphere.wait_until_running(name, cfg.timeout)
@@ -571,7 +566,7 @@ def node_start(
                            )
                        nodes_started[int(time.time_ns())] = Node(name=name)
                    logging.info(
-                        "Node with instance ID: %s is in running state", name
+                        f"Node with instance ID: {name} is in running state"
                    )
                    logging.info(
                        "node_start_scenario has been successfully injected!"
@@ -579,8 +574,8 @@ def node_start(
            except Exception as e:
                logging.error("Failed to start node instance. Test Failed")
                logging.error(
-                    "node_start_scenario injection failed! "
-                    "Error was: %s", str(e)
+                    f"node_start_scenario injection failed! "
+                    f"Error was: {str(e)}"
                )
                return "error", NodeScenarioErrorOutput(
                    format_exc(), kube_helper.Actions.START
@@ -620,7 +615,7 @@ def node_stop(
            try:
                for _ in range(cfg.runs):
                    logging.info("Starting node_stop_scenario injection")
-                    logging.info("Stopping the node %s ", name)
+                    logging.info(f"Stopping the node {name} ")
                    vm_stopped = vsphere.stop_instances(name)
                    if vm_stopped:
                        vsphere.wait_until_stopped(name, cfg.timeout)
@@ -630,7 +625,7 @@ def node_stop(
                            )
                        nodes_stopped[int(time.time_ns())] = Node(name=name)
                    logging.info(
-                        "Node with instance ID: %s is in stopped state", name
+                        f"Node with instance ID: {name} is in stopped state"
                    )
                    logging.info(
                        "node_stop_scenario has been successfully injected!"
@@ -638,8 +633,8 @@ def node_stop(
            except Exception as e:
                logging.error("Failed to stop node instance. Test Failed")
                logging.error(
-                    "node_stop_scenario injection failed! "
-                    "Error was: %s", str(e)
+                    f"node_stop_scenario injection failed! "
+                    f"Error was: {str(e)}"
                )
                return "error", NodeScenarioErrorOutput(
                    format_exc(), kube_helper.Actions.STOP
@@ -679,7 +674,7 @@ def node_reboot(
            try:
                for _ in range(cfg.runs):
                    logging.info("Starting node_reboot_scenario injection")
-                    logging.info("Rebooting the node %s ", name)
+                    logging.info(f"Rebooting the node {name} ")
                    vsphere.reboot_instances(name)
                    if not cfg.skip_openshift_checks:
                        kube_helper.wait_for_unknown_status(
@@ -690,8 +685,8 @@ def node_reboot(
                        )
                    nodes_rebooted[int(time.time_ns())] = Node(name=name)
                    logging.info(
-                        "Node with instance ID: %s has rebooted "
-                        "successfully", name
+                        f"Node with instance ID: {name} has rebooted "
+                        "successfully"
                    )
                    logging.info(
                        "node_reboot_scenario has been successfully injected!"
@@ -699,8 +694,8 @@ def node_reboot(
            except Exception as e:
                logging.error("Failed to reboot node instance. Test Failed")
                logging.error(
-                    "node_reboot_scenario injection failed! "
-                    "Error was: %s", str(e)
+                    f"node_reboot_scenario injection failed! "
+                    f"Error was: {str(e)}"
                )
                return "error", NodeScenarioErrorOutput(
                    format_exc(), kube_helper.Actions.REBOOT
@@ -739,13 +734,13 @@ def node_terminate(
                    vsphere.stop_instances(name)
                    vsphere.wait_until_stopped(name, cfg.timeout)
                    logging.info(
-                        "Releasing the node with instance ID: %s ", name
+                        f"Releasing the node with instance ID: {name} "
                    )
                    vsphere.release_instances(name)
                    vsphere.wait_until_released(name, cfg.timeout)
                    nodes_terminated[int(time.time_ns())] = Node(name=name)
                    logging.info(
-                        "Node with instance ID: %s has been released", name
+                        f"Node with instance ID: {name} has been released"
                    )
                    logging.info(
                        "node_terminate_scenario has been "
@@ -754,8 +749,8 @@ def node_terminate(
            except Exception as e:
                logging.error("Failed to terminate node instance. Test Failed")
                logging.error(
-                    "node_terminate_scenario injection failed! "
-                    "Error was: %s", str(e)
+                    f"node_terminate_scenario injection failed! "
+                    f"Error was: {str(e)}"
                )
                return "error", NodeScenarioErrorOutput(
                    format_exc(), kube_helper.Actions.TERMINATE
--- a/kraken/pod_scenarios/setup.py
+++ b/kraken/pod_scenarios/setup.py
@@ -1,9 +1,13 @@
 import logging
 import time
+from typing import Any
+
 import yaml
 import sys
 import random
 import arcaflow_plugin_kill_pod
+from krkn_lib.k8s.pods_monitor_pool import PodsMonitorPool
+
 import kraken.cerberus.setup as cerberus
 import kraken.post_actions.actions as post_actions
 from krkn_lib.k8s import KrknKubernetes
@@ -79,11 +83,12 @@ def container_run(kubeconfig_path,

    failed_scenarios = []
    scenario_telemetries: list[ScenarioTelemetry] = []
+    pool = PodsMonitorPool(kubecli)

    for container_scenario_config in scenarios_list:
        scenario_telemetry = ScenarioTelemetry()
        scenario_telemetry.scenario = container_scenario_config[0]
-        scenario_telemetry.startTimeStamp = time.time()
+        scenario_telemetry.start_timestamp = time.time()
        telemetry.set_parameters_base64(scenario_telemetry, container_scenario_config[0])
        if len(container_scenario_config) > 1:
            pre_action_output = post_actions.run(kubeconfig_path, container_scenario_config[1])
@@ -91,23 +96,17 @@ def container_run(kubeconfig_path,
            pre_action_output = ""
        with open(container_scenario_config[0], "r") as f:
            cont_scenario_config = yaml.full_load(f)
+            start_monitoring(kill_scenarios=cont_scenario_config["scenarios"], pool=pool)
            for cont_scenario in cont_scenario_config["scenarios"]:
                # capture start time
                start_time = int(time.time())
                try:
                    killed_containers = container_killing_in_pod(cont_scenario, kubecli)
-                    if len(container_scenario_config) > 1:
-                        failed_post_scenarios = post_actions.check_recovery(
-                            kubeconfig_path,
-                            container_scenario_config,
-                            failed_post_scenarios,
-                            pre_action_output
-                        )
-                    else:
-                        failed_post_scenarios = check_failed_containers(
-                            killed_containers, cont_scenario.get("retry_wait", 120), kubecli
-                        )
-
+                    logging.info(f"killed containers: {str(killed_containers)}")
+                    result = pool.join()
+                    if result.error:
+                        raise Exception(f"pods failed to recovery: {result.error}")
+                    scenario_telemetry.affected_pods = result
                    logging.info("Waiting for the specified duration: %s" % (wait_duration))
                    time.sleep(wait_duration)

@@ -117,18 +116,29 @@ def container_run(kubeconfig_path,
                    # publish cerberus status
                    cerberus.publish_kraken_status(config, failed_post_scenarios, start_time, end_time)
                except (RuntimeError, Exception):
+                    pool.cancel()
                    failed_scenarios.append(container_scenario_config[0])
                    log_exception(container_scenario_config[0])
-                    scenario_telemetry.exitStatus = 1
+                    scenario_telemetry.exit_status = 1
                    # removed_exit
                    # sys.exit(1)
                else:
-                    scenario_telemetry.exitStatus = 0
-                scenario_telemetry.endTimeStamp = time.time()
+                    scenario_telemetry.exit_status = 0
+                scenario_telemetry.end_timestamp = time.time()
                scenario_telemetries.append(scenario_telemetry)

    return failed_scenarios, scenario_telemetries

+def start_monitoring(kill_scenarios: list[Any], pool: PodsMonitorPool):
+    """Starts monitoring the pods selected by each scenario's namespace and label selector."""
+    for kill_scenario in kill_scenarios:
+        # anchored with ^ and $ so the pattern covers the whole namespace name
+        exact_namespace = f"^{kill_scenario['namespace']}$"
+        selector, timeout = kill_scenario["label_selector"], kill_scenario["expected_recovery_time"]
+        pool.select_and_monitor_by_namespace_pattern_and_label(
+            namespace_pattern=exact_namespace, label_selector=selector,
+            max_timeout=timeout)
+

 def container_killing_in_pod(cont_scenario, kubecli: KrknKubernetes):
    scenario_name = get_yaml_item_value(cont_scenario, "name", "")
--- a/kraken/prometheus/client.py
+++ b/kraken/prometheus/client.py
@@ -1,10 +1,13 @@
 import datetime
 import os.path
+from typing import Optional
+
 import urllib3
 import logging
 import sys

 import yaml
+from krkn_lib.models.krkn import ChaosRunAlertSummary, ChaosRunAlert
 from krkn_lib.prometheus.krkn_prometheus import KrknPrometheus
 urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
 def alerts(prom_cli: KrknPrometheus, start_time, end_time, alert_profile):
@@ -27,4 +30,64 @@ def alerts(prom_cli: KrknPrometheus, start_time, end_time, alert_profile):

            prom_cli.process_alert(alert,
                                   datetime.datetime.fromtimestamp(start_time),
-                                   datetime.datetime.fromtimestamp(end_time))
+                                   datetime.datetime.fromtimestamp(end_time))
+
+
+def critical_alerts(prom_cli: KrknPrometheus,
+                    summary: ChaosRunAlertSummary,
+                    run_id,
+                    scenario,
+                    start_time,
+                    end_time):
+    """
+    Fills summary with the critical alerts found during and after a scenario.
+
+    The ALERTS{severity="critical"} query is run over the
+    start_time/end_time range and once more through process_query; the
+    results go to summary.chaos_alerts and summary.post_chaos_alerts.
+
+    :param prom_cli: Prometheus client running the queries
+    :param summary: summary updated in place with run id, scenario and alerts
+    :param run_id: identifier stored in summary.run_id
+    :param scenario: scenario stored in summary.scenario
+    :param start_time: timestamp, converted with datetime.fromtimestamp
+    :param end_time: end of the range, forwarded to the client unchanged
+    """
+    summary.scenario = scenario
+    summary.run_id = run_id
+    query = r"""ALERTS{severity="critical"}"""
+    logging.info("Checking for critical alerts firing post chaos")
+
+    # NOTE(review): unlike start_time, end_time is not converted here -
+    # presumably callers already pass a datetime; confirm against them
+    alerts_during_chaos = prom_cli.process_prom_query_in_range(
+        query,
+        start_time=datetime.datetime.fromtimestamp(start_time),
+        end_time=end_time
+    )
+    _append_alerts(alerts_during_chaos, summary.chaos_alerts)
+
+    alerts_after_chaos = prom_cli.process_query(query)
+    _append_alerts(alerts_after_chaos, summary.post_chaos_alerts)
+
+    if len(alerts_during_chaos) == 0 and len(alerts_after_chaos) == 0:
+        logging.info("No critical alerts are firing!!")
+
+
+def _append_alerts(query_result, destination):
+    """
+    Appends a ChaosRunAlert to destination for each item of query_result
+    that has a "metric" entry; labels missing from it are set to "none".
+    """
+    for item in query_result:
+        if "metric" not in item:
+            continue
+        metric = item["metric"]
+        destination.append(
+            ChaosRunAlert(
+                metric.get("alertname", "none"),
+                metric.get("alertstate", "none"),
+                metric.get("namespace", "none"),
+                metric.get("severity", "none"),
+            )
+        )
--- a/kraken/pvc/pvc_scenario.py
+++ b/kraken/pvc/pvc_scenario.py
@@ -11,7 +11,7 @@ from krkn_lib.utils.functions import get_yaml_item_value, log_exception


 # krkn_lib
-def run(scenarios_list, config, kubecli: KrknKubernetes, telemetry: KrknTelemetryKubernetes) -> (list[str], list[ScenarioTelemetry]):
+def run(scenarios_list, config, wait_duration, kubecli: KrknKubernetes, telemetry: KrknTelemetryKubernetes) -> (list[str], list[ScenarioTelemetry]):
    """
    Reads the scenario config and creates a temp file to fill up the PVC
    """
@@ -21,7 +21,7 @@ def run(scenarios_list, config, kubecli: KrknKubernetes, telemetry: KrknTelemetr
    for app_config in scenarios_list:
        scenario_telemetry = ScenarioTelemetry()
        scenario_telemetry.scenario = app_config
-        scenario_telemetry.startTimeStamp = time.time()
+        scenario_telemetry.start_timestamp = time.time()
        telemetry.set_parameters_base64(scenario_telemetry, app_config)
        try:
            if len(app_config) > 1:
@@ -305,7 +305,9 @@ def run(scenarios_list, config, kubecli: KrknKubernetes, telemetry: KrknTelemetr
                        file_size_kb,
                        kubecli
                    )
-
+                    logging.info("End of scenario. Waiting for the specified duration: %s" % (wait_duration))
+                    time.sleep(wait_duration)
+                    
                    end_time = int(time.time())
                    cerberus.publish_kraken_status(
                        config,
@@ -314,11 +316,11 @@ def run(scenarios_list, config, kubecli: KrknKubernetes, telemetry: KrknTelemetr
                        end_time
                    )
        except (RuntimeError, Exception):
-            scenario_telemetry.exitStatus = 1
+            scenario_telemetry.exit_status = 1
            failed_scenarios.append(app_config)
            log_exception(app_config)
        else:
-            scenario_telemetry.exitStatus = 0
+            scenario_telemetry.exit_status = 0
        scenario_telemetries.append(scenario_telemetry)

    return failed_scenarios, scenario_telemetries
--- a/kraken/service_disruption/common_service_disruption_functions.py
+++ b/kraken/service_disruption/common_service_disruption_functions.py
@@ -165,7 +165,7 @@ def run(
    for scenario_config in scenarios_list:
        scenario_telemetry = ScenarioTelemetry()
        scenario_telemetry.scenario = scenario_config[0]
-        scenario_telemetry.startTimeStamp = time.time()
+        scenario_telemetry.start_timestamp = time.time()
        telemetry.set_parameters_base64(scenario_telemetry, scenario_config[0])
        try:
            if len(scenario_config) > 1:
@@ -249,12 +249,12 @@ def run(
                    end_time = int(time.time())
                    cerberus.publish_kraken_status(config, failed_post_scenarios, start_time, end_time)
        except (Exception, RuntimeError):
-            scenario_telemetry.exitStatus = 1
+            scenario_telemetry.exit_status = 1
            failed_scenarios.append(scenario_config[0])
            log_exception(scenario_config[0])
        else:
-            scenario_telemetry.exitStatus = 0
-        scenario_telemetry.endTimeStamp = time.time()
+            scenario_telemetry.exit_status = 0
+        scenario_telemetry.end_timestamp = time.time()
        scenario_telemetries.append(scenario_telemetry)
    return failed_scenarios, scenario_telemetries

--- a/kraken/service_hijacking/init.py
+++ b/kraken/service_hijacking/init.py
--- a/kraken/service_hijacking/service_hijacking.py
+++ b/kraken/service_hijacking/service_hijacking.py
@@ -0,0 +1,133 @@
+import logging
+import time
+
+import yaml
+from krkn_lib.k8s import KrknKubernetes
+from krkn_lib.models.telemetry import ScenarioTelemetry
+from krkn_lib.telemetry.k8s import KrknTelemetryKubernetes
+
+
+def run(scenarios_list: list[str], wait_duration: int, krkn_lib: KrknKubernetes, telemetry: KrknTelemetryKubernetes) -> (list[str], list[ScenarioTelemetry]):
+    """
+    Runs the service hijacking scenarios one after the other.
+
+    For each scenario file a webservice is deployed, the selector of the
+    target service is replaced so that traffic reaches the webservice for
+    `chaos_duration` seconds, then the original selectors are restored and
+    the webservice is removed. A failing scenario is recorded and the
+    remaining scenarios are still executed.
+
+    :param scenarios_list: paths of the scenario YAML files
+    :param wait_duration: seconds to sleep after each successful scenario
+    :param krkn_lib: Kubernetes client used to deploy and patch resources
+    :param telemetry: client used to attach the scenario parameters to the
+        scenario telemetry
+    :return: the paths of the failed scenarios and the telemetry collected
+        for every scenario
+    """
+    scenario_telemetries: list[ScenarioTelemetry] = []
+    failed_post_scenarios = []
+    for scenario in scenarios_list:
+        scenario_telemetry = ScenarioTelemetry()
+        scenario_telemetry.scenario = scenario
+        scenario_telemetry.start_timestamp = time.time()
+        telemetry.set_parameters_base64(scenario_telemetry, scenario)
+        try:
+            succeeded = _hijack_service(scenario, krkn_lib)
+            if succeeded:
+                logging.info("End of scenario. Waiting for the specified duration: %s" % (wait_duration))
+                time.sleep(wait_duration)
+        except Exception as e:
+            logging.error(f"scenario {scenario} failed with exception: {e}")
+            succeeded = False
+
+        if succeeded:
+            scenario_telemetry.exit_status = 0
+            scenario_telemetry.end_timestamp = time.time()
+            scenario_telemetries.append(scenario_telemetry)
+            logging.info("success")
+        else:
+            # a failed scenario is reported but does not prevent the
+            # following ones from running
+            fail(scenario_telemetry, scenario_telemetries)
+            failed_post_scenarios.append(scenario)
+
+    return failed_post_scenarios, scenario_telemetries
+
+
+def _hijack_service(scenario: str, krkn_lib: KrknKubernetes) -> bool:
+    """
+    Executes a single service hijacking scenario.
+
+    The scenario file is parsed here so that a malformed file fails only
+    this scenario. Once the webservice has been deployed it is always
+    undeployed before returning, even when a later step fails.
+
+    :param scenario: path of the scenario YAML file
+    :param krkn_lib: Kubernetes client used to deploy and patch resources
+    :return: True if the service has been hijacked and restored, False if
+        one of the steps failed (the reason is logged)
+    """
+    with open(scenario) as stream:
+        scenario_config = yaml.safe_load(stream)
+
+    service_name = scenario_config["service_name"]
+    service_namespace = scenario_config["service_namespace"]
+    plan = scenario_config["plan"]
+    image = scenario_config["image"]
+    target_port = scenario_config["service_target_port"]
+    chaos_duration = scenario_config["chaos_duration"]
+
+    logging.info(f"checking service {service_name} in namespace: {service_namespace}")
+    if not krkn_lib.service_exists(service_name, service_namespace):
+        logging.error(f"service: {service_name} not found in namespace: {service_namespace}, failed to run scenario.")
+        return False
+
+    logging.info(f"service: {service_name} found in namespace: {service_namespace}")
+    logging.info("creating webservice and initializing test plan...")
+    # both named ports and port numbers can be used
+    if isinstance(target_port, int):
+        logging.info(f"webservice will listen on port {target_port}")
+        webservice = krkn_lib.deploy_service_hijacking(service_namespace, plan, image, port_number=target_port)
+    else:
+        logging.info(f"traffic will be redirected to named port: {target_port}")
+        webservice = krkn_lib.deploy_service_hijacking(service_namespace, plan, image, port_name=target_port)
+
+    try:
+        logging.info(f"successfully deployed pod: {webservice.pod_name} "
+                     f"in namespace:{service_namespace} with selector {webservice.selector}!"
+                     )
+        logging.info(f"patching service: {service_name} to hijack traffic towards: {webservice.pod_name}")
+        original_service = krkn_lib.replace_service_selector([webservice.selector], service_name, service_namespace)
+        if original_service is None:
+            logging.error(f"failed to patch service: {service_name}, namespace: {service_namespace} with selector {webservice.selector}")
+            return False
+
+        logging.info(f"service: {service_name} successfully patched!")
+        logging.info(f"original service manifest:\n\n{yaml.dump(original_service)}")
+        logging.info(f"waiting {chaos_duration} before restoring the service")
+        time.sleep(chaos_duration)
+        selectors = [f"{key}={value}" for key, value in original_service["spec"]["selector"].items()]
+        logging.info(f"restoring the service selectors {selectors}")
+        restored_service = krkn_lib.replace_service_selector(selectors, service_name, service_namespace)
+        if restored_service is None:
+            logging.error(f"failed to restore original service: {service_name}, namespace: {service_namespace} with selectors: {selectors}")
+            return False
+        logging.info("selectors successfully restored")
+        return True
+    finally:
+        # runs on every exit path so the webservice is never left behind
+        logging.info("undeploying service-hijacking resources...")
+        krkn_lib.undeploy_service_hijacking(webservice)
+
+
+def fail(scenario_telemetry: ScenarioTelemetry, scenario_telemetries: list[ScenarioTelemetry]):
+    """
+    Marks the scenario as failed and stores its telemetry.
+
+    :param scenario_telemetry: telemetry of the failed scenario
+    :param scenario_telemetries: list the telemetry is appended to
+    """
+    scenario_telemetry.exit_status = 1
+    scenario_telemetry.end_timestamp = time.time()
+    scenario_telemetries.append(scenario_telemetry)
--- a/kraken/shut_down/common_shut_down_func.py
+++ b/kraken/shut_down/common_shut_down_func.py
@@ -147,7 +147,7 @@ def run(scenarios_list, config, wait_duration, kubecli: KrknKubernetes, telemetr

        scenario_telemetry = ScenarioTelemetry()
        scenario_telemetry.scenario = config_path
-        scenario_telemetry.startTimeStamp = time.time()
+        scenario_telemetry.start_timestamp = time.time()
        telemetry.set_parameters_base64(scenario_telemetry, config_path)

        with open(config_path, "r") as f:
@@ -175,11 +175,11 @@ def run(scenarios_list, config, wait_duration, kubecli: KrknKubernetes, telemetr
            except (RuntimeError, Exception):
                log_exception(config_path)
                failed_scenarios.append(config_path)
-                scenario_telemetry.exitStatus = 1
+                scenario_telemetry.exit_status = 1
            else:
-                scenario_telemetry.exitStatus = 0
+                scenario_telemetry.exit_status = 0

-            scenario_telemetry.endTimeStamp = time.time()
+            scenario_telemetry.end_timestamp = time.time()
            scenario_telemetries.append(scenario_telemetry)

    return failed_scenarios, scenario_telemetries
--- a/kraken/time_actions/common_time_functions.py
+++ b/kraken/time_actions/common_time_functions.py
@@ -354,7 +354,7 @@ def run(scenarios_list, config, wait_duration, kubecli:KrknKubernetes, telemetry
    for time_scenario_config in scenarios_list:
        scenario_telemetry = ScenarioTelemetry()
        scenario_telemetry.scenario = time_scenario_config
-        scenario_telemetry.startTimeStamp = time.time()
+        scenario_telemetry.start_timestamp = time.time()
        telemetry.set_parameters_base64(scenario_telemetry, time_scenario_config)
        try:
            with open(time_scenario_config, "r") as f:
@@ -377,12 +377,12 @@ def run(scenarios_list, config, wait_duration, kubecli:KrknKubernetes, telemetry
                        end_time
                    )
        except (RuntimeError, Exception):
-            scenario_telemetry.exitStatus = 1
+            scenario_telemetry.exit_status = 1
            log_exception(time_scenario_config)
            failed_scenarios.append(time_scenario_config)
        else:
-            scenario_telemetry.exitStatus = 0
-        scenario_telemetry.endTimeStamp = time.time()
+            scenario_telemetry.exit_status = 0
+        scenario_telemetry.end_timestamp = time.time()
        scenario_telemetries.append(scenario_telemetry)

    return failed_scenarios, scenario_telemetries
--- a/kraken/zone_outage/actions.py
+++ b/kraken/zone_outage/actions.py
@@ -19,7 +19,7 @@ def run(scenarios_list, config, wait_duration, telemetry: KrknTelemetryKubernete
    for zone_outage_config in scenarios_list:
        scenario_telemetry = ScenarioTelemetry()
        scenario_telemetry.scenario = zone_outage_config
-        scenario_telemetry.startTimeStamp = time.time()
+        scenario_telemetry.start_timestamp = time.time()
        telemetry.set_parameters_base64(scenario_telemetry, zone_outage_config)
        try:
            if len(zone_outage_config) > 1:
@@ -110,12 +110,12 @@ def run(scenarios_list, config, wait_duration, telemetry: KrknTelemetryKubernete
                        end_time
                    )
        except (RuntimeError, Exception):
-            scenario_telemetry.exitStatus = 1
+            scenario_telemetry.exit_status = 1
            failed_scenarios.append(zone_outage_config)
            log_exception(zone_outage_config)
        else:
-            scenario_telemetry.exitStatus = 0
-        scenario_telemetry.endTimeStamp = time.time()
+            scenario_telemetry.exit_status = 0
+        scenario_telemetry.end_timestamp = time.time()
        scenario_telemetries.append(scenario_telemetry)
    return failed_scenarios, scenario_telemetries

--- a/requirements.txt
+++ b/requirements.txt
@@ -1,22 +1,21 @@
 aliyun-python-sdk-core==2.13.36
 aliyun-python-sdk-ecs==4.24.25
-arcaflow==0.9.0
+arcaflow==0.17.2
 arcaflow-plugin-sdk==0.10.0
 boto3==1.28.61
-azure-identity==1.15.0
+azure-identity==1.16.1
 azure-keyvault==4.2.0
 azure-mgmt-compute==30.5.0
 itsdangerous==2.0.1
 coverage==7.4.1
 datetime==5.4
 docker==7.0.0
-docker-compose==1.29.2
 gitpython==3.1.41
 google-api-python-client==2.116.0
 ibm_cloud_sdk_core==3.18.0
 ibm_vpc==0.20.0
-jinja2==3.1.3
-krkn-lib==1.4.9
+jinja2==3.1.4
+krkn-lib==2.1.3
 lxml==5.1.0
 kubernetes==26.1.0
 oauth2client==4.1.3
@@ -29,13 +28,14 @@ pyfiglet==1.0.2
 pytest==8.0.0
 python-ipmi==0.5.4
 python-openstackclient==6.5.0
-requests==2.31.0
+requests==2.32.0
 service_identity==24.1.0
-PyYAML==5.4.1
+PyYAML==6.0
 setuptools==65.5.1
-werkzeug==3.0.1
+werkzeug==3.0.3
 wheel==0.42.0
 zope.interface==5.4.0

 git+https://github.com/krkn-chaos/arcaflow-plugin-kill-pod.git
-git+https://github.com/vmware/vsphere-automation-sdk-python.git@v8.0.0.0
+git+https://github.com/vmware/vsphere-automation-sdk-python.git@v8.0.0.0
+cryptography>=42.0.4 # not directly required, pinned by Snyk to avoid a vulnerability
--- a/run_kraken.py
+++ b/run_kraken.py
@@ -9,6 +9,8 @@ import optparse
 import pyfiglet
 import uuid
 import time
+
+from krkn_lib.models.krkn import ChaosRunOutput, ChaosRunAlertSummary
 from krkn_lib.prometheus.krkn_prometheus import KrknPrometheus
 import kraken.time_actions.common_time_functions as time_actions
 import kraken.performance_dashboards.setup as performance_dashboards
@@ -23,10 +25,12 @@ import kraken.pvc.pvc_scenario as pvc_scenario
 import kraken.network_chaos.actions as network_chaos
 import kraken.arcaflow_plugin as arcaflow_plugin
 import kraken.prometheus as prometheus_plugin
+import kraken.service_hijacking.service_hijacking as service_hijacking_plugin
 import server as server
 from kraken import plugins
 from krkn_lib.k8s import KrknKubernetes
 from krkn_lib.ocp import KrknOpenshift
+from krkn_lib.telemetry.elastic import KrknElastic
 from krkn_lib.telemetry.k8s import KrknTelemetryKubernetes
 from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift
 from krkn_lib.models.telemetry import ChaosRunTelemetry
@@ -94,6 +98,9 @@ def main(cfg):
            config["performance_monitoring"], "check_critical_alerts", False
        )
        telemetry_api_url = config["telemetry"].get("api_url")
+        elastic_config = get_yaml_item_value(config,"elastic",{})
+        elastic_url = get_yaml_item_value(elastic_config,"elastic_url","")
+        elastic_index = get_yaml_item_value(elastic_config,"elastic_index","")
        
        # Initialize clients
        if (not os.path.isfile(kubeconfig_path) and
@@ -129,8 +136,6 @@ def main(cfg):
        except:
            kubecli.initialize_clients(None)

-
-
        # find node kraken might be running on
        kubecli.find_kraken_node()

@@ -161,8 +166,13 @@ def main(cfg):
            if prometheus_url is None:
                try:
                    connection_data = ocpcli.get_prometheus_api_connection_data()
-                    prometheus_url = connection_data.endpoint
-                    prometheus_bearer_token = connection_data.token
+                    if connection_data:
+                        prometheus_url = connection_data.endpoint
+                        prometheus_bearer_token = connection_data.token
+                    else: 
+                        # If can't make a connection, set alerts to false
+                        enable_alerts = False
+                        critical_alerts = False
                except Exception:
                    logging.error("invalid distribution selected, running openshift scenarios against kubernetes cluster."
                                  "Please set 'kubernetes' in config.yaml krkn.platform and try again")
@@ -175,9 +185,9 @@ def main(cfg):
        # KrknTelemetry init
        telemetry_k8s = KrknTelemetryKubernetes(safe_logger, kubecli)
        telemetry_ocp = KrknTelemetryOpenshift(safe_logger, ocpcli)
-
-
-        if enable_alerts:
+        telemetry_elastic = KrknElastic(safe_logger,elastic_url)
+        summary = ChaosRunAlertSummary()
+        if enable_alerts or check_critical_alerts:
            prometheus = KrknPrometheus(prometheus_url, prometheus_bearer_token)

        logging.info("Server URL: %s" % kubecli.get_host())
@@ -208,7 +218,8 @@ def main(cfg):

        # Capture the start time
        start_time = int(time.time())
-
+        post_critical_alerts = 0
+        chaos_output = ChaosRunOutput()
        chaos_telemetry = ChaosRunTelemetry()
        chaos_telemetry.run_uuid = run_uuid
        # Loop to run the chaos starts here
@@ -254,7 +265,8 @@ def main(cfg):
                                kraken_config,
                                failed_post_scenarios,
                                wait_duration,
-                                telemetry_k8s
+                                telemetry_k8s,
+                                kubecli
                            )
                            chaos_telemetry.scenarios.extend(scenario_telemetries)
                        # krkn_lib
@@ -322,14 +334,14 @@ def main(cfg):
                        elif scenario_type == "application_outages":
                            logging.info("Injecting application outage")
                            failed_post_scenarios, scenario_telemetries = application_outage.run(
-                                scenarios_list, config, wait_duration, telemetry_k8s)
+                                scenarios_list, config, wait_duration, kubecli, telemetry_k8s)
                            chaos_telemetry.scenarios.extend(scenario_telemetries)

                        # PVC scenarios
                        # krkn_lib
                        elif scenario_type == "pvc_scenarios":
                            logging.info("Running PVC scenario")
-                            failed_post_scenarios, scenario_telemetries = pvc_scenario.run(scenarios_list, config, kubecli, telemetry_k8s)
+                            failed_post_scenarios, scenario_telemetries = pvc_scenario.run(scenarios_list, config, wait_duration, kubecli, telemetry_k8s)
                            chaos_telemetry.scenarios.extend(scenario_telemetries)

                        # Network scenarios
@@ -337,27 +349,27 @@ def main(cfg):
                        elif scenario_type == "network_chaos":
                            logging.info("Running Network Chaos")
                            failed_post_scenarios, scenario_telemetries = network_chaos.run(scenarios_list, config, wait_duration, kubecli, telemetry_k8s)
+                        elif scenario_type == "service_hijacking":
+                            logging.info("Running Service Hijacking Chaos")
+                            failed_post_scenarios, scenario_telemetries = service_hijacking_plugin.run(scenarios_list, wait_duration, kubecli, telemetry_k8s)
+                            chaos_telemetry.scenarios.extend(scenario_telemetries)

                        # Check for critical alerts when enabled
-                        if enable_alerts and check_critical_alerts :
-                            logging.info("Checking for critical alerts firing post choas")
+                        post_critical_alerts = 0
+                        if check_critical_alerts:
+                            prometheus_plugin.critical_alerts(prometheus,
+                                                              summary,
+                                                              run_uuid,
+                                                              scenario_type,
+                                                              start_time,
+                                                              datetime.datetime.now())

-                            ##PROM
-                            query = r"""ALERTS{severity="critical"}"""
-                            end_time = datetime.datetime.now()
-                            critical_alerts = prometheus.process_prom_query_in_range(
-                                query,
-                                start_time = datetime.datetime.fromtimestamp(start_time),
-                                end_time = end_time
+                            chaos_output.critical_alerts = summary
+                            post_critical_alerts = len(summary.post_chaos_alerts)
+                            if post_critical_alerts > 0:
+                                logging.error("Post chaos critical alerts firing please check, exiting")
+                                break

-                            )
-                            critical_alerts_count = len(critical_alerts)
-                            if critical_alerts_count > 0:
-                                logging.error("Critical alerts are firing: %s", critical_alerts)
-                                logging.error("Please check, exiting")
-                                sys.exit(1)
-                            else:
-                                logging.info("No critical alerts are firing!!")

            iteration += 1
            logging.info("")
@@ -377,14 +389,18 @@ def main(cfg):
            telemetry_k8s.collect_cluster_metadata(chaos_telemetry)

        decoded_chaos_run_telemetry = ChaosRunTelemetry(json.loads(chaos_telemetry.to_json()))
-        logging.info(f"Telemetry data:\n{decoded_chaos_run_telemetry.to_json()}")
-
+        chaos_output.telemetry = decoded_chaos_run_telemetry
+        logging.info(f"Chaos data:\n{chaos_output.to_json()}")
+        telemetry_elastic.upload_data_to_elasticsearch(decoded_chaos_run_telemetry.to_json(), elastic_index)
        if config["telemetry"]["enabled"]:
-            logging.info(f"telemetry data will be stored on s3 bucket folder: {telemetry_api_url}/download/{telemetry_request_id}")
+            logging.info(f'telemetry data will be stored on s3 bucket folder: {telemetry_api_url}/files/'
+                         f'{(config["telemetry"]["telemetry_group"] if config["telemetry"]["telemetry_group"] else "default")}/'
+                         f'{telemetry_request_id}')
            logging.info(f"telemetry upload log: {safe_logger.log_file_name}")
            try:
                telemetry_k8s.send_telemetry(config["telemetry"], telemetry_request_id, chaos_telemetry)
                telemetry_k8s.put_cluster_events(telemetry_request_id, config["telemetry"], start_time, end_time)
+                telemetry_k8s.put_critical_alerts(telemetry_request_id, config["telemetry"], summary)
                # prometheus data collection is available only on Openshift
                if config["telemetry"]["prometheus_backup"]:
                    prometheus_archive_files = ''
@@ -434,11 +450,15 @@ def main(cfg):
                logging.error("Alert profile is not defined")
                sys.exit(1)

+        if post_critical_alerts > 0:
+            logging.error("Critical alerts are firing, please check; exiting")
+            sys.exit(2)
+
        if failed_post_scenarios:
            logging.error(
                "Post scenarios are still failing at the end of all iterations"
            )
-            sys.exit(1)
+            sys.exit(2)

        logging.info(
            "Successfully finished running Kraken. UUID for the run: "
--- a/scenarios/arcaflow/cpu-hog/config.yaml
+++ b/scenarios/arcaflow/cpu-hog/config.yaml
@@ -4,7 +4,7 @@ deployers:
    connection: {}
    deployer_name: kubernetes
 log:
-  level: debug
+  level: error
 logged_outputs:
  error:
    level: error
--- a/scenarios/arcaflow/cpu-hog/input.yaml
+++ b/scenarios/arcaflow/cpu-hog/input.yaml
@@ -2,7 +2,7 @@ input_list:
  - cpu_count: 1
    cpu_load_percentage: 80
    cpu_method: all
-    duration: 1s
+    duration: 30
    kubeconfig: ''
    namespace: default
    # set the node selector as a key-value pair eg.
--- a/scenarios/arcaflow/cpu-hog/sub-workflow.yaml
+++ b/scenarios/arcaflow/cpu-hog/sub-workflow.yaml
@@ -1,9 +1,9 @@
 version: v0.2.0
 input:
-  root: RootObject
+  root: SubRootObject
  objects:
-    RootObject:
-      id: input_item
+    SubRootObject:
+      id: SubRootObject
      properties:
        kubeconfig:
          display:
@@ -35,7 +35,7 @@ input:
            description: stop stress test after T seconds. One can also specify the units of time in
              seconds, minutes, hours, days or years with the suffix s, m, h, d or y
          type:
-            type_id: string
+            type_id: integer
          required: true
        cpu_count:
          display:
@@ -68,18 +68,18 @@ steps:
      kubeconfig: !expr $.input.kubeconfig
  stressng:
    plugin: 
-      src: quay.io/arcalot/arcaflow-plugin-stressng:0.5.0
+      src: quay.io/arcalot/arcaflow-plugin-stressng:0.6.0
      deployment_type: image
    step: workload
    input:
      cleanup: "true"
-      StressNGParams:
-        timeout: !expr $.input.duration
-        stressors:
-          - stressor: cpu
-            cpu_count: !expr $.input.cpu_count
-            cpu_method: !expr $.input.cpu_method
-            cpu_load: !expr $.input.cpu_load_percentage
+
+      timeout: !expr $.input.duration
+      stressors:
+        - stressor: cpu
+          workers: !expr $.input.cpu_count
+          cpu-method: "all"
+          cpu-load: !expr $.input.cpu_load_percentage
    deploy:
      deployer_name: kubernetes
      connection: !expr $.steps.kubeconfig.outputs.success.connection
--- a/scenarios/arcaflow/cpu-hog/workflow.yaml
+++ b/scenarios/arcaflow/cpu-hog/workflow.yaml
@@ -9,62 +9,10 @@ input:
          type:
            type_id: list
            items:
-              id: input_item
-              type_id: object
-              properties:
-                kubeconfig:
-                  display:
-                    description: The complete kubeconfig file as a string
-                    name: Kubeconfig file contents
-                  type:
-                    type_id: string
-                  required: true
-                namespace:
-                    display:
-                      description: The namespace where the container will be deployed
-                      name: Namespace
-                    type:
-                      type_id: string
-                    required: true
-                node_selector:
-                    display:
-                      description: kubernetes node name where the plugin must be deployed
-                    type:
-                      type_id: map
-                      values:
-                        type_id: string
-                      keys:
-                        type_id: string
-                    required: true
-                duration:
-                  display:
-                    name: duration the scenario expressed in seconds
-                    description: stop stress test after T seconds. One can also specify the units of time in
-                      seconds, minutes, hours, days or years with the suffix s, m, h, d or y
-                  type:
-                    type_id: string
-                  required: true
-                cpu_count:
-                  display:
-                    description: Number of CPU cores to be used (0 means all)
-                    name: number of CPUs
-                  type:
-                    type_id: integer
-                  required: true
-                cpu_method:
-                  display:
-                    description: CPU stress method
-                    name: fine grained control of which cpu stressors to use (ackermann, cfloat etc.)
-                  type:
-                    type_id: string
-                  required: true
-                cpu_load_percentage:
-                  display:
-                    description: load CPU by percentage
-                    name: CPU load
-                  type:
-                    type_id: integer
-                  required: true
+              id: SubRootObject
+              type_id: ref
+              namespace: $.steps.workload_loop.execute.inputs.items
+
 steps:
  workload_loop:
    kind: foreach
--- a/scenarios/arcaflow/io-hog/config.yaml
+++ b/scenarios/arcaflow/io-hog/config.yaml
@@ -3,7 +3,7 @@ deployers:
    connection: {}
    deployer_name: kubernetes
 log:
-  level: debug
+  level: error
 logged_outputs:
  error:
    level: error
--- a/scenarios/arcaflow/io-hog/input.yaml
+++ b/scenarios/arcaflow/io-hog/input.yaml
@@ -1,5 +1,5 @@
 input_list:
- duration: 30s
+- duration: 30
  io_block_size: 1m
  io_workers: 1
  io_write_bytes: 10m
--- a/scenarios/arcaflow/io-hog/sub-workflow.yaml
+++ b/scenarios/arcaflow/io-hog/sub-workflow.yaml
@@ -1,6 +1,6 @@
 version: v0.2.0
 input:
-  root: RootObject
+  root: SubRootObject
  objects:
    hostPath:
      id: HostPathVolumeSource
@@ -18,8 +18,8 @@ input:
          type:
            id: hostPath
            type_id: ref
-    RootObject:
-      id: input_item
+    SubRootObject:
+      id: SubRootObject
      properties:
        kubeconfig:
          display:
@@ -51,7 +51,7 @@ input:
            description: stop  stress  test  after  T  seconds.  One  can  also specify the units of time in
              seconds, minutes, hours, days or years with the suffix s, m, h, d or  y
          type:
-            type_id: string
+            type_id: integer
          required: true
        io_workers:
          display:
@@ -102,19 +102,18 @@ steps:
      kubeconfig: !expr $.input.kubeconfig
  stressng:
    plugin: 
-      src: quay.io/arcalot/arcaflow-plugin-stressng:0.5.0
+      src: quay.io/arcalot/arcaflow-plugin-stressng:0.6.0
      deployment_type: image
    step: workload
    input:
      cleanup: "true"
-      StressNGParams:
-        timeout: !expr $.input.duration
-        workdir: !expr $.input.target_pod_folder
-        stressors:
-          - stressor: hdd
-            hdd: !expr $.input.io_workers
-            hdd_bytes: !expr $.input.io_write_bytes
-            hdd_write_size: !expr $.input.io_block_size
+      timeout: !expr $.input.duration
+      workdir: !expr $.input.target_pod_folder
+      stressors:
+        - stressor: hdd
+          workers: !expr $.input.io_workers
+          hdd-bytes: !expr $.input.io_write_bytes
+          hdd-write-size: !expr $.input.io_block_size

    deploy:
      deployer_name: kubernetes
--- a/scenarios/arcaflow/io-hog/workflow.yaml
+++ b/scenarios/arcaflow/io-hog/workflow.yaml
@@ -2,22 +2,6 @@ version: v0.2.0
 input:
  root: RootObject
  objects:
-    hostPath:
-      id: HostPathVolumeSource
-      properties:
-        path:
-          type:
-            type_id: string
-    Volume:
-      id: Volume
-      properties:
-        name:
-          type:
-            type_id: string
-        hostPath:
-          type:
-            id: hostPath
-            type_id: ref
    RootObject:
      id: RootObject
      properties:
@@ -25,80 +9,9 @@ input:
          type:
            type_id: list
            items:
-              id: input_item
-              type_id: object
-              properties:
-                kubeconfig:
-                  display:
-                    description: The complete kubeconfig file as a string
-                    name: Kubeconfig file contents
-                  type:
-                    type_id: string
-                  required: true
-                namespace:
-                  display:
-                    description: The namespace where the container will be deployed
-                    name: Namespace
-                  type:
-                    type_id: string
-                  required: true
-                node_selector:
-                  display:
-                    description: kubernetes node name where the plugin must be deployed
-                  type:
-                    type_id: map
-                    values:
-                      type_id: string
-                    keys:
-                      type_id: string
-                  required: true
-                duration:
-                  display:
-                    name: duration the scenario expressed in seconds
-                    description: stop  stress  test  after  T  seconds.  One  can  also specify the units of time in
-                      seconds, minutes, hours, days or years with the suffix s, m, h, d or  y
-                  type:
-                    type_id: string
-                  required: true
-                io_workers:
-                  display:
-                    description: number of workers
-                    name: start N workers continually writing, reading  and  removing  temporary  files
-                  type:
-                    type_id: integer
-                  required: true
-                io_block_size:
-                  display:
-                    description: single write size
-                    name: specify size of each write in bytes. Size can be from 1 byte to 4MB.
-                  type:
-                    type_id: string
-                  required: true
-                io_write_bytes:
-                  display:
-                    description: Total number of bytes written
-                    name: write  N  bytes for each hdd process, the default is 1 GB. One can specify the size
-                      as % of free space on the file system or in units  of  Bytes,  KBytes,  MBytes  and
-                      GBytes using the suffix b, k, m or g
-                  type:
-                    type_id: string
-                  required: true
-                target_pod_folder:
-                  display:
-                    description: Target Folder
-                    name: Folder in the pod where the test will be executed and the test files will be written
-                  type:
-                    type_id: string
-                  required: true
-                target_pod_volume:
-                  display:
-                    name: kubernetes volume definition
-                    description: the volume that will be attached to the pod. In order to stress
-                      the node storage only hosPath mode is currently supported
-                  type:
-                    type_id: ref
-                    id: Volume
-                  required: true
+              id: SubRootObject
+              type_id: ref
+              namespace: $.steps.workload_loop.execute.inputs.items
 steps:
  workload_loop:
    kind: foreach
--- a/scenarios/arcaflow/memory-hog/config.yaml
+++ b/scenarios/arcaflow/memory-hog/config.yaml
@@ -4,7 +4,7 @@ deployers:
    connection: {}
    deployer_name: kubernetes
 log:
-  level: debug
+  level: error
 logged_outputs:
  error:
    level: error
--- a/scenarios/arcaflow/memory-hog/input.yaml
+++ b/scenarios/arcaflow/memory-hog/input.yaml
@@ -1,5 +1,5 @@
 input_list:
- duration: 30s
+- duration: 30
  vm_bytes: 10%
  vm_workers: 2
  # set the node selector as a key-value pair eg.
--- a/scenarios/arcaflow/memory-hog/sub-workflow.yaml
+++ b/scenarios/arcaflow/memory-hog/sub-workflow.yaml
@@ -1,9 +1,9 @@
 version: v0.2.0
 input:
-  root: RootObject
+  root: SubRootObject
  objects:
-    RootObject:
-      id: input_item
+    SubRootObject:
+      id: SubRootObject
      properties:
        kubeconfig:
          display:
@@ -34,7 +34,7 @@ input:
            name: duration the scenario expressed in seconds
            description: stop stress test after T seconds. One can also specify the units of time in seconds, minutes, hours, days or years with the suffix s, m, h, d or  y
          type:
-            type_id: string
+            type_id: integer
          required: true
        vm_workers:
          display:
@@ -60,17 +60,16 @@ steps:
      kubeconfig: !expr $.input.kubeconfig
  stressng:
    plugin: 
-      src: quay.io/arcalot/arcaflow-plugin-stressng:0.5.0
+      src: quay.io/arcalot/arcaflow-plugin-stressng:0.6.0
      deployment_type: image
    step: workload
    input:
      cleanup: "true"
-      StressNGParams:
-        timeout: !expr $.input.duration
-        stressors:
-          - stressor: vm
-            vm: !expr $.input.vm_workers
-            vm_bytes: !expr $.input.vm_bytes
+      timeout: !expr $.input.duration
+      stressors:
+        - stressor: vm
+          workers: !expr $.input.vm_workers
+          vm-bytes: !expr $.input.vm_bytes
    deploy:
      deployer_name: kubernetes
      connection: !expr $.steps.kubeconfig.outputs.success.connection
--- a/scenarios/arcaflow/memory-hog/workflow.yaml
+++ b/scenarios/arcaflow/memory-hog/workflow.yaml
@@ -9,54 +9,10 @@ input:
          type:
            type_id: list
            items:
-              id: input_item
-              type_id: object
-              properties:
-                kubeconfig:
-                  display:
-                    description: The complete kubeconfig file as a string
-                    name: Kubeconfig file contents
-                  type:
-                    type_id: string
-                  required: true
-                namespace:
-                    display:
-                      description: The namespace where the container will be deployed
-                      name: Namespace
-                    type:
-                      type_id: string
-                    required: true
-                node_selector:
-                  display:
-                    description: kubernetes node name where the plugin must be deployed
-                  type:
-                    type_id: map
-                    values:
-                      type_id: string
-                    keys:
-                      type_id: string
-                  required: true
-                duration:
-                  display:
-                    name: duration the scenario expressed in seconds
-                    description: stop stress test after T seconds. One can also specify the units of time in seconds, minutes, hours, days or years with the suffix s, m, h, d or  y
-                  type:
-                    type_id: string
-                  required: true
-                vm_workers:
-                  display:
-                    description: Number of VM stressors to be run (0 means 1 stressor per CPU)
-                    name: Number of VM stressors
-                  type:
-                    type_id: integer
-                  required: true
-                vm_bytes:
-                  display:
-                    description: N bytes per vm process, the default is 256MB. The size can be expressed in units of Bytes, KBytes, MBytes and GBytes using the suffix b, k, m or g.
-                    name: Kubeconfig file contents
-                  type:
-                    type_id: string
-                  required: true
+              id: SubRootObject
+              type_id: ref
+              namespace: $.steps.workload_loop.execute.inputs.items
+
 steps:
  workload_loop:
    kind: foreach
--- a/scenarios/kind/scheduler.yml
+++ b/scenarios/kind/scheduler.yml
@@ -3,8 +3,4 @@
  config:
    namespace_pattern: ^kube-system$
    label_selector: component=kube-scheduler
- id: wait-for-pods
-  config:
-    namespace_pattern: ^kube-system$
-    label_selector: component=kube-scheduler
-    count: 3
+    krkn_pod_recovery_time: 120
--- a/scenarios/kube/pod.yml
+++ b/scenarios/kube/pod.yml
@@ -4,3 +4,4 @@
    name_pattern: ^nginx-.*$
    namespace_pattern: ^default$
    kill: 1
+    krkn_pod_recovery_time: 120
--- a/scenarios/kube/scheduler.yml
+++ b/scenarios/kube/scheduler.yml
@@ -3,8 +3,4 @@
  config:
    namespace_pattern: ^kube-system$
    label_selector: k8s-app=kube-scheduler
- id: wait-for-pods
-  config:
-    namespace_pattern: ^kube-system$
-    label_selector: k8s-app=kube-scheduler
-    count: 3
+    krkn_pod_recovery_time: 120
--- a/scenarios/kube/service_hijacking.yaml
+++ b/scenarios/kube/service_hijacking.yaml
@@ -0,0 +1,56 @@
+# Refer to the documentation for further information: https://github.com/krkn-chaos/krkn/blob/main/docs/service_hijacking.md
+
+service_target_port: http-web-svc # The port of the service to be hijacked (can be named or numeric, based on the workload and service configuration).
+service_name: nginx-service # name of the service to be hijacked
+service_namespace: default # The namespace where the target service is located
+image: quay.io/krkn-chaos/krkn-service-hijacking:v0.1.3 # Image of the krkn web service to be deployed to receive traffic.
+chaos_duration: 30 # Total duration of the chaos scenario in seconds.
+plan:
+  - resource: "/list/index.php" # Specifies the resource or path to respond to in the scenario. For paths, both the path and query parameters are captured but ignored.
+                                # For resources, only query parameters are captured.
+
+    steps:                      # A time-based plan consisting of steps can be defined for each resource.
+      GET:                      # One or more HTTP methods can be specified for each step.
+                                # Note: Non-standard methods are supported
+                                # for fully custom web services (e.g., using NONEXISTENT instead of POST).
+
+        - duration: 15          # Duration in seconds for this step before moving to the next one, if defined. Otherwise,
+                                # this step will continue until the chaos scenario ends.
+
+          status: 500           # HTTP status code to be returned in this step.
+          mime_type: "application/json" # MIME type of the response for this step.
+          payload: |            # The response payload for this step.
+            {
+              "status":"internal server error"
+            }
+        - duration: 15
+          status: 201
+          mime_type: "application/json"
+          payload: |
+            {
+              "status":"resource created"
+            }
+      POST:
+        - duration: 15
+          status: 401
+          mime_type: "application/json"
+          payload: |
+            {
+               "status": "unauthorized"
+            }
+        - duration: 15
+          status: 404
+          mime_type: "text/plain"
+          payload: "not found"
+
+  - resource: "/patch"
+    steps:
+      PATCH:
+        - duration: 15
+          status: 201
+          mime_type: "text/plain"
+          payload: "resource patched"
+        - duration: 15
+          status: 400
+          mime_type: "text/plain"
+          payload: "bad request"
--- a/scenarios/openshift/container_etcd.yml
+++ b/scenarios/openshift/container_etcd.yml
@@ -5,4 +5,4 @@ scenarios:
  container_name: "etcd"
  action: 1
  count: 1
-  expected_recovery_time: 60
+  expected_recovery_time: 120
--- a/scenarios/openshift/customapp_pod.yaml
+++ b/scenarios/openshift/customapp_pod.yaml
@@ -3,8 +3,4 @@
  config:
    namespace_pattern: ^acme-air$
    name_pattern: .*
- id: wait-for-pods
-  config:
-    namespace_pattern: ^acme-air$
-    name_pattern: .*
-    count: 8
+    krkn_pod_recovery_time: 120
--- a/scenarios/openshift/etcd.yml
+++ b/scenarios/openshift/etcd.yml
@@ -3,8 +3,4 @@
  config:
    namespace_pattern: ^openshift-etcd$
    label_selector: k8s-app=etcd
- id: wait-for-pods
-  config:
-    namespace_pattern: ^openshift-etcd$
-    label_selector: k8s-app=etcd
-    count: 3
+    krkn_pod_recovery_time: 120
--- a/scenarios/openshift/openshift-apiserver.yml
+++ b/scenarios/openshift/openshift-apiserver.yml
@@ -3,8 +3,5 @@
  config:
    namespace_pattern: ^openshift-apiserver$
    label_selector: app=openshift-apiserver-a
- id: wait-for-pods
-  config:
-    namespace_pattern: ^openshift-apiserver$
-    label_selector: app=openshift-apiserver-a
-    count: 3
+    krkn_pod_recovery_time: 120
+
--- a/scenarios/openshift/openshift-kube-apiserver.yml
+++ b/scenarios/openshift/openshift-kube-apiserver.yml
@@ -3,8 +3,5 @@
  config:
    namespace_pattern: ^openshift-kube-apiserver$
    label_selector: app=openshift-kube-apiserver
- id: wait-for-pods
-  config:
-    namespace_pattern: ^openshift-kube-apiserver$
-    label_selector: app=openshift-kube-apiserver
-    count: 3
+    krkn_pod_recovery_time: 120
+
--- a/scenarios/openshift/post_action_prometheus.yml
+++ b/scenarios/openshift/post_action_prometheus.yml
@@ -3,8 +3,4 @@
  config:
    namespace_pattern: ^openshift-monitoring$
    label_selector: app=prometheus
- id: wait-for-pods
-  config:
-    namespace_pattern: ^openshift-monitoring$
-    label_selector: app=prometheus
-    count: 2
+    krkn_pod_recovery_time: 120
--- a/scenarios/openshift/prom_kill.yml
+++ b/scenarios/openshift/prom_kill.yml
@@ -2,8 +2,4 @@
  config:
    namespace_pattern: ^openshift-monitoring$
    label_selector: statefulset.kubernetes.io/pod-name=prometheus-k8s-0
- id: wait-for-pods
-  config:
-    namespace_pattern: ^openshift-monitoring$
-    label_selector: statefulset.kubernetes.io/pod-name=prometheus-k8s-0
-    count: 1
+    krkn_pod_recovery_time: 120
--- a/scenarios/openshift/prometheus.yml
+++ b/scenarios/openshift/prometheus.yml
@@ -3,9 +3,4 @@
  config:
    namespace_pattern: ^openshift-monitoring$
    label_selector: app=prometheus
- id: wait-for-pods
-  config:
-    namespace_pattern: ^openshift-monitoring$
-    label_selector: app=prometheus
-    count: 2
-    timeout: 180
+    krkn_pod_recovery_time: 120
--- a/scenarios/openshift/regex_openshift_pod_kill.yml
+++ b/scenarios/openshift/regex_openshift_pod_kill.yml
@@ -4,3 +4,4 @@
    namespace_pattern: ^openshift-.*$
    name_pattern: .*
    kill: 3
+    krkn_pod_recovery_time: 120
--- a/scenarios/plugin.schema.json
+++ b/scenarios/plugin.schema.json
@@ -60,7 +60,14 @@
 										"default": 1,
 										"title": "Backoff",
 										"description": "How many seconds to wait between checks for the target pod status."
+									},
+									"krkn_pod_recovery_time": {
+										"type": "integer",
+										"default": 30,
+										"title": "Recovery Time",
+										"description": "The expected recovery time of the pod (used by Krkn to monitor the pod lifecycle)."
 									}
+
 								},
 								"required": [
 									"namespace_pattern"
@@ -112,6 +119,12 @@
 								"default": 1,
 								"title": "Backoff",
 								"description": "How many seconds to wait between checks for the target pod status."
+							},
+							"krkn_pod_recovery_time": {
+								"type": "integer",
+								"default": 30,
+								"title": "Recovery Time",
+								"description": "The expected recovery time of the pod (used by Krkn to monitor the pod lifecycle)."
 							}
 						},
 						"required": [
--- a/utils/chaos_ai/README.md
+++ b/utils/chaos_ai/README.md
@@ -0,0 +1,40 @@
+# aichaos
+Enhancing Chaos Engineering with AI-assisted fault injection for better resiliency and non-functional testing.
+
+## Generate python package wheel file
+```
+$ python3.9 generate_wheel_package.py sdist bdist_wheel
+$ cp dist/aichaos-0.0.1-py3-none-any.whl docker/
+```
+This creates a python package file aichaos-0.0.1-py3-none-any.whl in the dist folder. 
+
+## Build Image
+```
+$ cd docker
+$ podman build -t aichaos:1.0 .
+OR
+$ docker build -t aichaos:1.0 .
+```
+
+## Run Chaos AI
+```
+$ podman run -v "$(pwd)"/aichaos-config.json:/config/aichaos-config.json --privileged=true --name aichaos -p 5001:5001 aichaos:1.0
+OR
+$ docker run -v "$(pwd)"/aichaos-config.json:/config/aichaos-config.json --privileged -v /var/run/docker.sock:/var/run/docker.sock --name aichaos -p 5001:5001 aichaos:1.0
+```
+
+The output should look like:
+```
+$ podman run -v "$(pwd)"/aichaos-config.json:/config/aichaos-config.json --privileged=true --name aichaos -p 5001:5001 aichaos:1.0
+ * Serving Flask app 'swagger_api' (lazy loading)
+ * Environment: production
+   WARNING: This is a development server. Do not use it in a production deployment.
+   Use a production WSGI server instead.
+ * Debug mode: on
+WARNING: This is a development server. Do not use it in a production deployment. Use a production WSGI server instead.
+ * Running on all addresses (0.0.0.0)
+ * Running on http://127.0.0.1:5001
+ * Running on http://172.17.0.2:5001
+```
+
+You can try out the APIs in a browser at http://<server-ip>:5001/apidocs (e.g. http://127.0.0.1:5001/apidocs). To test, call the “GenerateChaos” API with a ‘kubeconfig’ file and the application URLs to probe.
--- a/utils/chaos_ai/config/experiments/.gitkeep
+++ b/utils/chaos_ai/config/experiments/.gitkeep
--- a/utils/chaos_ai/docker/Dockerfile
+++ b/utils/chaos_ai/docker/Dockerfile
@@ -0,0 +1,21 @@
+# Stage 1: only used as the source of a pinned kubectl binary.
+FROM bitnami/kubectl:1.20.9 AS kubectl
+
+# Stage 2: runtime image for the Chaos AI Flask/Swagger API (swagger_api.py).
+FROM python:3.9
+WORKDIR /app
+# --no-cache-dir keeps pip's download cache out of the image layers.
+RUN pip3 install --no-cache-dir --upgrade pip
+COPY config config/
+COPY requirements.txt .
+# swagger_api.py stores uploaded kubeconfigs, marker files and results in /app/logs.
+RUN mkdir -p /app/logs
+RUN pip3 install --no-cache-dir -r requirements.txt
+
+COPY --from=kubectl /opt/bitnami/kubectl/bin/kubectl /usr/local/bin/
+
+COPY swagger_api.py .
+ENV PYTHONUNBUFFERED=1
+
+# Docker binaries; the tarball is removed in the same layer so it does not stay in the image.
+RUN curl -fsSLO https://get.docker.com/builds/Linux/x86_64/docker-17.03.1-ce.tgz \
+    && tar --strip-components=1 -xvzf docker-17.03.1-ce.tgz -C /usr/local/bin \
+    && rm docker-17.03.1-ce.tgz
+
+# Podman; the apt package lists are removed in the same layer to keep the image small.
+RUN apt-get update \
+    && apt-get install -y podman \
+    && rm -rf /var/lib/apt/lists/*
+
+COPY aichaos-0.0.1-py3-none-any.whl .
+RUN pip3 install --no-cache-dir aichaos-0.0.1-py3-none-any.whl
+CMD ["python3", "swagger_api.py"]
--- a/utils/chaos_ai/docker/aichaos-config.json
+++ b/utils/chaos_ai/docker/aichaos-config.json
@@ -0,0 +1,7 @@
+{
+  "command": "podman",
+  "chaosengine": "kraken",
+  "faults": "pod-delete",
+  "iterations": 1,
+  "maxfaults": 5
+}
--- a/utils/chaos_ai/docker/config/experiments/log.yml
+++ b/utils/chaos_ai/docker/config/experiments/log.yml
@@ -0,0 +1,15 @@
+
+    Get Log from the Chaos ID.---
+    tags:
+      - ChaosAI API Results
+    parameters:
+      - name: chaosid
+        in: path
+        type: string
+        required: true
+        description: Chaos-ID
+    responses:
+      500:
+        description: Error!
+      200:
+        description: Results for the given Chaos ID.
--- a/utils/chaos_ai/docker/config/pod-delete.json
+++ b/utils/chaos_ai/docker/config/pod-delete.json
@@ -0,0 +1,36 @@
+{
+  "apiVersion": "1.0",
+  "kind": "ChaosEngine",
+  "metadata": {
+    "name": "engine-cartns3"
+  },
+  "spec": {
+    "engineState": "active",
+    "annotationCheck": "false",
+    "appinfo": {
+      "appns": "robot-shop",
+      "applabel": "service=payment",
+      "appkind": "deployment"
+    },
+    "chaosServiceAccount": "pod-delete-sa",
+    "experiments": [
+      {
+        "name": "pod-delete",
+        "spec": {
+          "components": {
+            "env": [
+              {
+                "name": "FORCE",
+                "value": "true"
+              },
+              {
+                "name": "TOTAL_CHAOS_DURATION",
+                "value": "120"
+              }
+            ]
+          }
+        }
+      }
+    ]
+  }
+}
--- a/utils/chaos_ai/docker/config/yml/chaosGen.yml
+++ b/utils/chaos_ai/docker/config/yml/chaosGen.yml
@@ -0,0 +1,40 @@
+
+Generate chaos on an application deployed on a cluster.
+---
+    tags:
+      - ChaosAI API
+    parameters:
+      - name: file
+        in: formData
+        type: file
+        required: true
+        description: Kube-config file
+      - name: namespace
+        in: formData
+        type: string
+        default: robot-shop
+        required: true
+        description: Namespace to test
+      - name: podlabels
+        in: formData
+        type: string
+        default: service=cart,service=payment
+        required: true
+        description: Pod labels to test
+      - name: nodelabels
+        in: formData
+        type: string
+        required: false
+        description: Node labels to test
+      - name: urls
+        in: formData
+        type: string
+        default: http://<application-url>:8097/api/cart/health,http://<application-url>:8097/api/payment/health
+        required: true
+        description: Application URLs to test
+
+    responses:
+      500:
+        description: Error!
+      200:
+        description: Chaos ID for the initiated chaos.
--- a/utils/chaos_ai/docker/config/yml/episodes.yml
+++ b/utils/chaos_ai/docker/config/yml/episodes.yml
@@ -0,0 +1,15 @@
+
+    Get Episodes from the Chaos ID.---
+    tags:
+      - ChaosAI API Results
+    parameters:
+      - name: chaosid
+        in: path
+        type: string
+        required: true
+        description: Chaos-ID
+    responses:
+      500:
+        description: Error!
+      200:
+        description: Results for the given Chaos ID.
--- a/utils/chaos_ai/docker/config/yml/log.yml
+++ b/utils/chaos_ai/docker/config/yml/log.yml
@@ -0,0 +1,15 @@
+
+    Get Log from the Chaos ID.---
+    tags:
+      - ChaosAI API Results
+    parameters:
+      - name: chaosid
+        in: path
+        type: string
+        required: true
+        description: Chaos-ID
+    responses:
+      500:
+        description: Error!
+      200:
+        description: Results for the given Chaos ID.
--- a/utils/chaos_ai/docker/config/yml/qtable.yml
+++ b/utils/chaos_ai/docker/config/yml/qtable.yml
@@ -0,0 +1,15 @@
+
+    Get QTable from the Chaos ID.---
+    tags:
+      - ChaosAI API Results
+    parameters:
+      - name: chaosid
+        in: path
+        type: string
+        required: true
+        description: Chaos-ID
+    responses:
+      500:
+        description: Error!
+      200:
+        description: Results for the given Chaos ID.
--- a/utils/chaos_ai/docker/config/yml/status.yml
+++ b/utils/chaos_ai/docker/config/yml/status.yml
@@ -0,0 +1,15 @@
+
+    Get status of the Chaos ID.---
+    tags:
+      - ChaosAI API
+    parameters:
+      - name: chaosid
+        in: path
+        type: string
+        required: true
+        description: Chaos-ID
+    responses:
+      500:
+        description: Error!
+      200:
+        description: Chaos for the given ID.
--- a/utils/chaos_ai/docker/requirements.txt
+++ b/utils/chaos_ai/docker/requirements.txt
@@ -0,0 +1,6 @@
+numpy
+pandas
+requests
+# NOTE(review): Flask 2.2.x is believed to import werkzeug.urls.url_quote, which Werkzeug 3.0 removed;
+# confirm that this Flask/Werkzeug pair imports cleanly, otherwise align the two pins.
+Flask==2.2.5
+Werkzeug==3.0.3
+flasgger==0.9.5
--- a/utils/chaos_ai/docker/swagger_api.py
+++ b/utils/chaos_ai/docker/swagger_api.py
@@ -0,0 +1,186 @@
+import json, os
+import logging
+# import numpy as np
+# import pandas as pd
+import threading
+from datetime import datetime
+from flask import Flask, request
+from flasgger import Swagger
+from flasgger.utils import swag_from
+# import zipfile
+import sys
+
+# sys.path.append("..")
+from src.aichaos_main import AIChaos
+
+# Module-level Flask application; the route handlers below register themselves on it,
+# and Swagger(app) attaches flasgger to the same application.
+app = Flask(__name__)
+Swagger(app)
+# Directory (with trailing '/') holding uploaded kubeconfigs, marker files and result files:
+# two directory levels above this file, then "app/logs". With this file at /app/swagger_api.py
+# (the Dockerfile's WORKDIR) that resolves to /app/logs/.
+flaskdir = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "app", "logs") + '/'
+
+
+class AIChaosSwagger:
+    """Flask/flasgger endpoints that start AIChaos runs and expose their results.
+
+    The route handlers are declared in the class body without ``self``:
+    ``@app.route`` registers each plain function on the module-level ``app``,
+    so Flask calls them directly. Only ``startchaos`` is a real instance
+    method; ``chaos_gen`` creates an instance and runs it in a worker thread.
+
+    All files of a run (uploaded kubeconfig, ``init-<id>``/``out-<id>``
+    markers, log, Q-table, episodes) live in the module-level ``flaskdir``.
+    """
+
+    def __init__(self, flaskdir=''):
+        # Directory prefix (expected to end with '/') for the per-run result files.
+        self.flaskdir = flaskdir
+
+    @app.route("/")
+    def empty(params=''):
+        """Landing page; returns a fixed banner string."""
+        return "AI Chaos Repository!"
+
+    def startchaos(self, kubeconfigfile, file_id, params):
+        """Run one chaos session to completion (blocking; meant for a worker thread).
+
+        Args:
+            kubeconfigfile: Path of the saved kubeconfig to run against.
+            file_id: Chaos ID (string) used to name every file of this run.
+            params: Form fields of the request (``urls``, ``namespace``,
+                ``podlabels`` and optionally ``nodelabels``). Mutated in
+                place with the engine settings.
+
+        Returns:
+            The string ``'WRITE'`` once the run has finished.
+        """
+        print('[StartChaos]', file_id, kubeconfigfile)
+        outfile = ''.join([flaskdir, 'out-', file_id])
+        initfile = ''.join([flaskdir, 'init-', file_id])
+        # The init marker is what the Get* endpoints report as 'Running'.
+        with open(initfile, 'w'):
+            pass
+        if os.path.exists(outfile):
+            os.remove(outfile)
+        try:
+            # os.environ is inherited by child processes. A separate
+            # os.system("export ...") would only affect its own throw-away shell.
+            os.environ["KUBECONFIG"] = kubeconfigfile
+            print('setting kubeconfig', os.environ["KUBECONFIG"])
+
+            # Built-in engine settings, optionally overridden by the mounted config file.
+            settings = {'command': 'podman', 'chaosengine': 'kraken', 'faults': 'pod-delete',
+                        'iterations': 1, 'maxfaults': 5}
+            if os.path.isfile('/config/aichaos-config.json'):
+                with open('/config/aichaos-config.json') as f:
+                    config_params = json.load(f)
+                # Keys missing from the file keep their defaults instead of raising KeyError.
+                overrides = {key: config_params[key] for key in settings if key in config_params}
+                settings.update(overrides)
+            params.update(settings)
+
+            # Expand every fault into '<fault>:<label>' entries: pod faults are paired with
+            # the pod labels, node faults with the (optional) node labels.
+            pod_faults = ('pod-delete',)
+            node_faults = ('network-chaos', 'node-memory-hog', 'node-cpu-hog')
+            faults = []
+            for fault in params['faults'].split(','):
+                if fault in pod_faults:
+                    labels = params['podlabels']
+                elif fault in node_faults:
+                    # 'nodelabels' is an optional form field, so it may be absent.
+                    labels = params.get('nodelabels', '')
+                else:
+                    continue
+                faults.extend(fault + ':' + label for label in labels.split(',') if label)
+
+            print('#faults:', len(faults), faults)
+            states = {'200': 0, '500': 1, '501': 2, '502': 3, '503': 4, '504': 5,
+                      '401': 6,  '403': 7,  '404': 8,  '429': 9,
+                      'Timeout': 10, 'Other': 11}
+            rewards = {'200': -1, '500': 0.8, '501': 0.8, '502': 0.8, '503': 0.8, '504': 0.8,
+                       '401': 1,  '403': 1,  '404': 1,  '429': 1,
+                       'Timeout': 1, 'Other': 1}
+            logfile = self.flaskdir + 'log_' + str(file_id)
+            qfile = self.flaskdir + 'qfile_' + str(file_id) + '.csv'
+            efile = self.flaskdir + 'efile_' + str(file_id)
+            epfile = self.flaskdir + 'episodes_' + str(file_id) + '.json'
+            cexp = {'pod-delete': 'pod-delete.json', 'cpu-hog': 'pod-cpu-hog.json',
+                    'disk-fill': 'disk-fill.json', 'network-loss': 'network-loss.json',
+                    'network-corruption': 'network-corruption.json', 'io-stress': 'io-stress.json'}
+            aichaos = AIChaos(states=states, faults=faults, rewards=rewards,
+                              logfile=logfile, qfile=qfile, efile=efile, epfile=epfile,
+                              urls=params['urls'].split(','), namespace=params['namespace'],
+                              max_faults=int(params['maxfaults']),
+                              num_requests=10, timeout=2,
+                              chaos_engine=params['chaosengine'],
+                              chaos_dir='config/', kubeconfig=kubeconfigfile,
+                              loglevel=logging.DEBUG, chaos_experiment=cexp, iterations=int(params['iterations']),
+                              command=params['command'])
+            print('checking kubeconfig', os.environ["KUBECONFIG"])
+            aichaos.start_chaos()
+
+            with open(outfile, "w") as done_marker:
+                done_marker.write('done')
+        finally:
+            # Always clear the 'Running' marker, even when the run raised; otherwise
+            # the status endpoints would report 'Running' forever.
+            if os.path.exists(initfile):
+                os.remove(initfile)
+        return 'WRITE'
+
+    @app.route('/GenerateChaos/', methods=['POST'])
+    @swag_from('config/yml/chaosGen.yml')
+    def chaos_gen():
+        """Store the uploaded kubeconfig and start a chaos run in a daemon thread.
+
+        Returns:
+            ``'Chaos ID: <n>'`` for the started run, or an error message with
+            HTTP status 500 when all 10000 IDs are already taken.
+        """
+        sw = AIChaosSwagger(flaskdir=flaskdir)
+        upload = request.files['file']
+        existing = set(os.listdir(flaskdir))
+        # The lowest ID without a stored kubeconfig becomes the new Chaos ID.
+        chaos_id = next((i for i in range(10000) if 'kubeconfig-' + str(i) not in existing), None)
+        if chaos_id is None:
+            # Do not silently overwrite the files of an existing run.
+            return 'No free Chaos ID available', 500
+        kubeconfigfile = ''.join([flaskdir, 'kubeconfig-', str(chaos_id)])
+        upload.save(kubeconfigfile)
+        form = request.form.to_dict()
+        print('[GenerateChaos] reqs:', form)
+        thread = threading.Thread(target=sw.startchaos, args=(kubeconfigfile, str(chaos_id), form))
+        thread.daemon = True
+        print(thread.name)
+        thread.start()
+        return 'Chaos ID: ' + str(chaos_id)
+
+    @app.route('/GetStatus/<chaosid>', methods=['GET'])
+    @swag_from('config/yml/status.yml')
+    def get_status(chaosid):
+        """Report 'Completed' (episodes file exists), 'Running' (init marker exists) or 'Does not exist'."""
+        print('[GetStatus]', chaosid, flaskdir)
+        epfile = flaskdir + 'episodes_' + str(chaosid) + '.json'
+        initfile = ''.join([flaskdir, 'init-', chaosid])
+        if os.path.exists(epfile):
+            return 'Completed'
+        if os.path.exists(initfile):
+            return 'Running'
+        return 'Does not exist'
+
+    @app.route('/GetQTable/<chaosid>', methods=['GET'])
+    @swag_from('config/yml/qtable.yml')
+    def get_qtable(chaosid):
+        """Return the Q-table CSV of a run, 'Running' while it is in progress, or an error text."""
+        print('[GetQTable]', chaosid)
+        qfile = flaskdir + 'qfile_' + str(chaosid) + '.csv'
+        initfile = ''.join([flaskdir, 'init-', chaosid])
+        if os.path.exists(qfile):
+            # Context manager so the file handle is closed after every request.
+            with open(qfile, "r") as f:
+                return f.read()
+        if os.path.exists(initfile):
+            return 'Running'
+        return 'Invalid Chaos ID: ' + chaosid
+
+    @app.route('/GetEpisodes/<chaosid>', methods=['GET'])
+    @swag_from('config/yml/episodes.yml')
+    def get_episodes(chaosid):
+        """Return the episodes JSON of a run, 'Running' while it is in progress, or an error text."""
+        print('[GetEpisodes]', chaosid)
+        epfile = flaskdir + 'episodes_' + str(chaosid) + '.json'
+        initfile = ''.join([flaskdir, 'init-', chaosid])
+        if os.path.exists(epfile):
+            with open(epfile, "r") as f:
+                return f.read()
+        if os.path.exists(initfile):
+            return 'Running'
+        return 'Invalid Chaos ID: ' + chaosid
+
+    @app.route('/GetLog/<chaosid>', methods=['GET'])
+    @swag_from('config/yml/log.yml')
+    def get_log(chaosid):
+        """Return the log file of a run, 'Running' while it is in progress, or an error text."""
+        print('[GetLog]', chaosid)
+        logfile = flaskdir + 'log_' + str(chaosid)
+        initfile = ''.join([flaskdir, 'init-', chaosid])
+        if os.path.exists(logfile):
+            with open(logfile, "r") as f:
+                return f.read()
+        if os.path.exists(initfile):
+            return 'Running'
+        return 'Invalid Chaos ID: ' + chaosid
+
+
+if __name__ == '__main__':
+    # The Werkzeug debugger lets a browser client execute arbitrary code, so it must not be
+    # on by default for a server bound to all interfaces. Opt in with AICHAOS_DEBUG=1.
+    debug = os.environ.get('AICHAOS_DEBUG', '').lower() in ('1', 'true', 'yes')
+    app.run(debug=debug, host='0.0.0.0', port=5001)
--- a/utils/chaos_ai/generate_wheel_package.py
+++ b/utils/chaos_ai/generate_wheel_package.py
@@ -0,0 +1,21 @@
+import setuptools
+# from setuptools_cythonize import get_cmdclass
+
+# Packaging script for the "aichaos" distribution. The README runs it as
+# `python3.9 generate_wheel_package.py sdist bdist_wheel`, which produces
+# dist/aichaos-0.0.1-py3-none-any.whl (the wheel the Dockerfile installs).
+setuptools.setup(
+    # cmdclass=get_cmdclass(),
+    name="aichaos",
+    version="0.0.1",
+    author="Sandeep Hans",
+    author_email="shans001@in.ibm.com",
+    description="Chaos AI",
+    long_description="Chaos Engineering using AI",
+    long_description_content_type="text/markdown",
+    url="",
+    # Packages are discovered automatically relative to the directory the script is run from.
+    packages=setuptools.find_packages(),
+    classifiers=[
+        "Programming Language :: Python :: 3",
+        "License :: OSI Approved :: MIT License",
+        "Operating System :: OS Independent",
+    ],
+    python_requires='>=3.9',
+)
--- a/utils/chaos_ai/requirements.txt
+++ b/utils/chaos_ai/requirements.txt
@@ -0,0 +1,10 @@
+numpy
+pandas
+notebook
+jupyterlab
+jupyter
+seaborn
+requests
+wheel
+Flask==2.1.0
+flasgger==0.9.5
--- a/utils/chaos_ai/src/init.py
+++ b/utils/chaos_ai/src/init.py
--- a/utils/chaos_ai/src/aichaos.py
+++ b/utils/chaos_ai/src/aichaos.py
@@ -0,0 +1,213 @@
+import json
+import os
+import random
+import sys
+
+import numpy as np
+import logging
+
+
+class AIChaos:
+    """Prototype Q-learning chaos driver built around `chaos run` experiment/journal JSON files.
+
+    Each entry of ``faults`` is a string ``'<fault>:<pod name>'``. For every
+    injected fault the resulting state transition is read from the journal
+    file and used to update ``Q`` (states x faults) and ``state_matrix``
+    (states x states).
+    """
+
+    def __init__(self, states=None, faults=None, rewards=None, pod_names=None, chaos_dir=None,
+                 chaos_experiment='experiment.json',
+                 chaos_journal='journal.json', iterations=1000, static_run=False):
+        """
+        Args:
+            states: Mapping from state name (e.g. ``'200'``) to row index.
+            faults: List of ``'<fault>:<pod name>'`` strings (the actions).
+            rewards: Mapping from state name to reward.
+            pod_names: Optional list of pod names; defaults to a new empty list.
+            chaos_dir: Directory prefix (with trailing separator) of the
+                experiment and journal files.
+            chaos_experiment: File name of the experiment template.
+            chaos_journal: File name of the journal written by the run.
+            iterations: Number of episodes ``start_chaos`` creates.
+            static_run: When true, no command is executed and the existing
+                journal file is read as-is.
+        """
+        self.faults = faults
+        # A fresh list per instance: a mutable default argument would be shared by all instances.
+        self.pod_names = [] if pod_names is None else pod_names
+        self.states = states
+        self.rewards = rewards
+        self.episodes = []
+
+        self.chaos_dir = chaos_dir
+        self.chaos_experiment = chaos_experiment
+        self.chaos_journal = chaos_journal
+
+        self.iterations = iterations
+        # Initialize parameters
+        self.gamma = 0.75  # Discount factor
+        self.alpha = 0.9  # Learning rate
+
+        # Q-values: one row per state, one column per fault. An action is currently a
+        # single fault; combinations of parallel faults would need a column per combination.
+        self.Q = np.array(np.zeros([len(states), len(faults)]))
+        self.state_matrix = np.array(np.zeros([len(states), len(states)]))
+
+        self.logger = logging.getLogger()
+        # run from old static experiment and journal files
+        self.static_run = static_run
+
+    def get_next_state(self):
+        """Read the journal and derive the (start_state, end_state) transition.
+
+        Returns:
+            ``(None, None)`` when a "before" probe already misses its
+            tolerance; ``(tolerance, observed status)`` of the first failing
+            "after" probe; otherwise ``(tolerance, tolerance)`` of the last
+            probe seen, i.e. the state did not change.
+
+        Raises:
+            ValueError: If the journal contains no probes at all.
+        """
+        self.logger.info('[GET_NEXT_STATE]')
+        with open(self.chaos_dir + self.chaos_journal) as f:
+            data = json.load(f)
+        steady = data['steady_states']
+
+        # before the experiment: an unhealthy baseline makes the run unusable
+        before_probes = steady['before']['probes']
+        for probe in before_probes:
+            if not probe['tolerance_met']:
+                return None, None
+
+        # after the experiment: the first failing probe defines the end state
+        after_probes = steady['after']['probes']
+        for probe in after_probes:
+            if not probe['tolerance_met']:
+                return probe['activity']['tolerance'], probe['output']['status']
+
+        # tolerances for all probes are met: report the last probe's tolerance twice
+        all_probes = after_probes or before_probes
+        if not all_probes:
+            raise ValueError('journal contains no steady-state probes')
+        tolerance = all_probes[-1]['activity']['tolerance']
+        return tolerance, tolerance
+
+    def inject_faults(self, fault, pod_name):
+        """Write an experiment file for ``fault`` on ``pod_name`` and run it.
+
+        The experiment template is loaded, every method entry that has a
+        ``provider`` is pointed at the requested fault and pod, and the result
+        is saved as ``experiment_<1..10>.json`` in ``chaos_dir``. Unless
+        ``static_run`` is set, that generated file is executed with
+        ``chaos run``.
+        """
+        self.logger.info('[INJECT_FAULT] ' + fault)
+        with open(self.chaos_dir + self.chaos_experiment) as f:
+            data = json.load(f)
+        for m in data['method']:
+            if 'provider' in m:
+                if fault == 'kill_microservice':
+                    m['name'] = 'kill-microservice'
+                    m['provider']['module'] = 'chaosk8s.actions'
+                    m['provider']['arguments']['name'] = pod_name
+                else:
+                    m['provider']['arguments']['name_pattern'] = pod_name
+                m['provider']['func'] = fault
+
+                print('[INJECT_FAULT] method:', m)
+
+        exp_name = 'experiment_' + str(random.randint(1, 10)) + '.json'
+        with open(self.chaos_dir + exp_name, 'w') as f:
+            json.dump(data, f)
+        # Execute the experiment that was just generated (not the unmodified template).
+        # The file name is relative because the command first changes into chaos_dir.
+        cmd = 'cd ' + self.chaos_dir + ';chaos run ' + exp_name
+        if not self.static_run:
+            os.system(cmd)
+
+    def create_episode(self):
+        """Inject random faults until the state changes.
+
+        Returns:
+            ``(episode, start_state, next_state)`` where ``episode`` is the
+            list of ``{'fault', 'pod_name'}`` dicts injected so far.
+        """
+        self.logger.info('[CREATE_EPISODE]')
+        episode = []
+        while True:
+            # TODO: model - choose faults based on q-learning instead of at random
+            fault_pod = random.choice(self.faults)
+            fault, pod_name = fault_pod.split(':', 1)
+
+            self.inject_faults(fault, pod_name)
+            start_state, next_state = self.get_next_state()
+            print('[CREATE EPISODE]', start_state, next_state)
+            # the before-state tolerance was not met: this injection does not count
+            if start_state is None and next_state is None:
+                continue
+
+            episode.append({'fault': fault, 'pod_name': pod_name})
+            self.update_q_fault(fault_pod, episode, start_state, next_state)
+            # an end state is reached as soon as the state changed
+            if start_state != next_state:
+                self.logger.info('[CREATE_EPISODE] EPISODE CREATED:' + str(episode))
+                self.logger.info('[CREATE_EPISODE] END STATE:' + str(next_state))
+                return episode, start_state, next_state
+
+    def update_q_fault(self, fault, episode, start_state, end_state):
+        """Apply one temporal-difference update to ``Q`` and ``state_matrix``.
+
+        Args:
+            fault: The ``'<fault>:<pod name>'`` entry of ``self.faults`` that was injected.
+            episode: Faults injected so far; its length scales the reward down.
+            start_state: State before the fault.
+            end_state: State after the fault; ``None`` means "unchanged".
+        """
+        self.logger.info('[UPDATE_Q]')
+        print('[UPDATE_Q] ', str(start_state), str(end_state))
+        if end_state is None:
+            end_state = start_state
+
+        # reward is dependent on the error response (eg. '404') and length of episode
+        reward = self.rewards[str(end_state)] / len(episode)
+        current_state = self.states[str(start_state)]
+        next_state = self.states[str(end_state)]
+        fault_index = self.faults.index(fault)
+
+        # TD = reward + gamma * (best value reachable from next_state) - current estimate
+        td = reward + self.gamma * self.Q[next_state].max() - self.Q[current_state, fault_index]
+        self.Q[current_state, fault_index] += self.alpha * td
+
+        # same update rule for the state-to-state matrix
+        td_state = reward + self.gamma * self.state_matrix[next_state].max() - \
+            self.state_matrix[current_state, next_state]
+        self.state_matrix[current_state, next_state] += self.alpha * td_state
+
+    def start_chaos(self):
+        """Create ``iterations`` episodes, printing both matrices after each one."""
+        for _ in range(self.iterations):
+            # Q is updated inside create_episode after every single fault injection.
+            self.create_episode()
+            print(self.Q)
+            print(self.state_matrix)
+
+
+def test_chaos(cdir='/Users/sandeephans/Downloads/openshift/'):
+    """Manual smoke run of AIChaos against a local chaos experiment directory.
+
+    Args:
+        cdir: Directory (with trailing separator) that contains
+            ``experiment.json`` and ``journal.json``. The default is the
+            original author's local path; pass your own directory to run this
+            elsewhere.
+    """
+    svc_list = ['cart', 'catalogue', 'dispatch', 'mongodb', 'mysql', 'payment', 'rabbitmq', 'ratings', 'redis',
+                'shipping', 'user', 'web']
+    # One fault per service, encoded as '<fault>:<service>'.
+    faults = ['kill_microservice:' + x for x in svc_list]
+    # Define the states
+    states = {
+        '200': 0,
+        '500': 1,
+        '404': 2
+    }
+    # Reward per end state; update_q_fault divides it by the episode length.
+    rewards = {
+        '200': 0,
+        '500': 0.8,
+        '404': 1
+    }
+
+    cexp = 'experiment.json'
+    cjournal = 'journal.json'
+
+    aichaos = AIChaos(states=states, faults=faults, rewards=rewards,
+                      chaos_dir=cdir, chaos_experiment=cexp, chaos_journal=cjournal,
+                      static_run=False)
+    aichaos.start_chaos()
+
+
+if __name__ == '__main__':
+    # Send INFO-level log records to stdout, then start the manual smoke run.
+    logging.basicConfig(stream=sys.stdout, level=logging.INFO)
+    test_chaos()
--- a/utils/chaos_ai/src/aichaos_main.py
+++ b/utils/chaos_ai/src/aichaos_main.py
@@ -0,0 +1,248 @@
+import json
+import os
+import random
+
+import numpy as np
+import pandas as pd
+import logging
+
+# sys.path.insert(1, os.path.join(sys.path[0], '..'))
+import src.utils as utils
+from src.kraken_utils import KrakenUtils
+from src.qlearning import QLearning
+from src.test_application import TestApplication
+
+
+class AIChaos:
+    def __init__(self, namespace='robot-shop', states=None, faults=None, rewards=None, urls=[], max_faults=5,
+                 service_weights=None, ctd_subsets=None, pod_names=[], chaos_dir='../config/', kubeconfig='~/.kube/config',
+                 chaos_experiment='experiment.json', logfile='log', qfile='qfile.csv', efile='efile', epfile='episodes.json',
+                 loglevel=logging.INFO,
+                 chaos_journal='journal.json', iterations=10, alpha=0.9, gamma=0.2, epsilon=0.3,
+                 num_requests=10, sleep_time=1, timeout=2, chaos_engine='kraken', dstk_probes=None,
+                 static_run=False, all_faults=False, command='podman'):
+        self.namespace = namespace
+        self.faults = faults
+        self.unused_faults = faults.copy()
+        self.all_faults = all_faults
+        self.pod_names = pod_names
+        self.states = states
+        self.rewards = rewards
+        self.urls = urls
+        self.max_faults = max_faults
+        self.episodes = []
+        self.service_weights = service_weights
+        self.ctd_subsets = ctd_subsets
+
+        self.kubeconfig = kubeconfig
+        self.chaos_dir = chaos_dir
+        self.chaos_experiment = chaos_experiment
+        self.chaos_journal = chaos_journal
+        self.command = command
+
+        if chaos_engine == 'kraken':
+            self.chaos_engine = KrakenUtils(namespace, kubeconfig=kubeconfig, chaos_dir=chaos_dir, chaos_experiment=chaos_experiment, command=self.command)
+        else:
+            self.chaos_engine = None
+
+        self.iterations = iterations
+        # Initialize RL parameters
+        self.epsilon = epsilon  # epsilon decay policy
+        # self.epsdecay = 0
+
+        # log files
+        self.logfile = logfile
+        self.qfile = qfile
+        self.efile = efile
+        self.epfile = epfile
+        open(efile, 'w+').close()
+        open(logfile, 'w+').close()
+        open(logfile, 'r+').truncate(0)
+        logging.getLogger("requests").setLevel(logging.WARNING)
+        logging.getLogger("urllib3").setLevel(logging.WARNING)
+        logging.basicConfig(filename=logfile, filemode='w+', level=loglevel)
+        self.logger = logging.getLogger(logfile.replace('/',''))
+        self.logger.addHandler(logging.FileHandler(logfile))
+
+        self.testapp = TestApplication(num_requests, timeout, sleep_time)
+        self.ql = QLearning(gamma, alpha, faults, states, rewards, urls)
+
+        # run from old static experiment and journal files
+        self.static_run = static_run
+
+    def realistic(self, faults_pods):
+        self.logger.debug('[Realistic] ' + str(faults_pods))
+        fp = faults_pods.copy()
+        for f1 in faults_pods:
+            for f2 in faults_pods:
+                if f1 == f2:
+                    continue
+                if f1 in fp and f2 in fp:
+                    f1_fault, load_1 = utils.get_load(f1.split(':')[0])
+                    f1_pod = f1.split(':')[1]
+                    f2_fault, load_2 = utils.get_load(f2.split(':')[0])
+                    f2_pod = f2.split(':')[1]
+                    if f1_pod == f2_pod:
+                        if f1_fault == 'pod-delete':
+                            fp.remove(f2)
+                        if f1_fault == f2_fault:
+                            # if int(load_1) > int(load_2):
+                            # randomly remove one fault from same faults with different params
+                            fp.remove(f2)
+        if self.service_weights is None:
+            return fp
+
+        fp_copy = fp.copy()
+        for f in fp:
+            f_fault = f.split(':')[0]
+            f_pod = f.split(':')[1].replace('service=', '')
+            self.logger.debug('[ServiceWeights] ' + f + ' ' + str(self.service_weights[f_pod][f_fault]))
+            if self.service_weights[f_pod][f_fault] == 0:
+                fp_copy.remove(f)
+
+        self.logger.debug('[Realistic] ' + str(fp_copy))
+        return fp_copy
+
+    def select_faults(self):
+        max_faults = min(self.max_faults, len(self.unused_faults))
+        num_faults = random.randint(1, max_faults)
+        if self.all_faults:
+            num_faults = len(self.unused_faults)
+        if random.random() > self.epsilon:
+            self.logger.info('[Exploration]')
+            # faults_pods = random.sample(self.faults, k=num_faults)
+            # using used faults list to avoid starvation
+            faults_pods = random.sample(self.unused_faults, k=num_faults)
+            faults_pods = self.realistic(faults_pods)
+            for f in faults_pods:
+                self.unused_faults.remove(f)
+            if len(self.unused_faults) == 0:
+                self.unused_faults = self.faults.copy()
+        else:
+            self.logger.info('[Exploitation]')
+            first_row = self.ql.Q[:, 0, :][0]
+            top_k_indices = np.argpartition(first_row, -num_faults)[-num_faults:]
+            faults_pods = [self.faults[i] for i in top_k_indices]
+            faults_pods = self.realistic(faults_pods)
+
+        return faults_pods
+
+    def create_episode(self, ctd_subset=None):
+        self.logger.debug('[CREATE_EPISODE]')
+        episode = []
+
+        if ctd_subset is None:
+            faults_pods = self.select_faults()
+        else:
+            faults_pods = ctd_subset
+            self.logger.info('CTD Subset: ' + str(faults_pods))
+
+        # faults_pods = self.realistic(faults_pods)
+        if len(faults_pods) == 0:
+            return [], 200, 200
+
+        engines = []
+        for fp in faults_pods:
+            fault = fp.split(':')[0]
+            pod_name = fp.split(':')[1]
+            engine = self.chaos_engine.inject_faults(fault, pod_name)
+            engines.append(engine)
+            episode.append({'fault': fault, 'pod_name': pod_name})
+        self.logger.info('[create_episode]' + str(faults_pods))
+        engines_running = self.chaos_engine.wait_engines(engines)
+        self.logger.info('[create_episode] engines_running' + str(engines_running))
+        if not engines_running:
+            return None, None, None
+
+        # randomly shuffling urls 
+        urls = random.sample(self.urls, len(self.urls))
+        ep_json = []
+        for url in urls:
+            start_state, next_state = self.testapp.test_load(url)
+            self.logger.info('[CREATE EPISODE]' + str(start_state) + ',' + str(next_state))
+            # if before state tolerance is not met
+            if start_state is None and next_state is None:
+                # self.cleanup()
+                self.chaos_engine.stop_engines()
+                continue
+
+                ### episode.append({'fault': fault, 'pod_name': pod_name})
+                # self.update_q_fault(fault_pod, episode, start_state, next_state)
+            url_index = self.urls.index(url)
+            self.logger.info('[CREATEEPISODE]' + str(url) + ':' + str(url_index))
+            for fp in faults_pods:
+                self.ql.update_q_fault(fp, episode, start_state, next_state, self.urls.index(url))
+            ep_json.append({'start_state': start_state, 'next_state': next_state, 'url': url, 'faults': episode})
+
+        self.logger.debug('[CREATE_EPISODE] EPISODE CREATED:' + str(episode))
+        self.logger.debug('[CREATE_EPISODE] END STATE:' + str(next_state))
+
+        self.chaos_engine.print_result(engines)
+        self.chaos_engine.stop_engines(episode=episode)
+        # ep_json = {'start_state': start_state, 'next_state': next_state, 'faults': episode}
+
+        return ep_json, start_state, next_state
+
+    def start_chaos(self):
+        """Run the whole experiment and persist its results.
+
+        Cleans up the chaos engine, then either runs ``self.iterations``
+        sampled episodes or, when ``self.ctd_subsets`` is set, one episode per
+        predefined subset. After every successful episode the Q table and the
+        episode records are written out; at the end all collected episodes are
+        dumped as JSON to ``self.epfile``.
+        """
+        self.logger.info('[INITIALIZING]')
+        self.logger.info('Logfile: '+self.logfile)
+        # handlers[0] is the FileHandler attached to this logger in __init__.
+        self.logger.info('Loggerfile: '+self.logger.handlers[0].stream.name)
+        self.logger.info('Chaos Engine: ' + self.chaos_engine.get_name())
+        self.logger.debug('Faults:' + str(self.faults))
+
+        self.chaos_engine.cleanup()
+        if self.ctd_subsets is None:
+            for i in range(self.iterations):
+                episode, start_state, end_state = self.create_episode()
+                self.logger.debug('[start_chaos]' + str(i) + ' ' + str(episode))
+                # create_episode() returns None when the engines never started.
+                if episode is None:
+                    continue
+                # update Q matrix
+                # will do it with each fault injection
+                # self.update_q(episode, start_state, end_state)
+                # if episode['next_state'] != '200':
+                self.episodes.extend(episode)
+                self.logger.info(str(i) + ' ' + str(self.ql.Q[:, 0]))
+                # print(i, self.state_matrix)
+                self.write_q()
+                self.write_episode(episode)
+        else:
+            for i, subset in enumerate(self.ctd_subsets):
+                episode, start_state, end_state = self.create_episode(subset)
+                self.logger.debug('[start_chaos]' + str(episode))
+                if episode is None:
+                    continue
+                # NOTE(review): this branch appends the per-URL list as a single
+                # element, while the iteration branch above extends with its
+                # items, so the JSON written to epfile is nested here and flat
+                # there - confirm which shape is intended.
+                self.episodes.append(episode)
+                self.logger.info(str(i) + ' ' + str(self.ql.Q[:, 0]))
+                self.write_q()
+                self.write_episode(episode)
+
+        self.chaos_engine.cleanup()
+        # self.remove_temp_file()
+        with open(self.epfile, 'w', encoding='utf-8') as f:
+            json.dump(self.episodes, f, ensure_ascii=False, indent=4)
+        self.logger.info('COMPLETE!!!')
+
+    def write_q(self):
+        df = pd.DataFrame(self.ql.Q[:, 0, :], index=self.urls, columns=self.faults)
+        df.to_csv(self.qfile)
+        return df
+
+    def write_episode(self, episode):
+        for ep in episode:
+            with open(self.efile, "a") as outfile:
+                x = [e['fault'] + ':' + e['pod_name'] for e in ep['faults']]
+                x.append(ep['url'])
+                x.append(str(ep['next_state']))
+                outfile.write(','.join(x) + '\n')
+
+    def remove_temp_file(self):
+        mydir = self.chaos_dir + 'experiments'
+        print('Removing temp files from: '+mydir)
+        self.logger.debug('Removing temp files: '+mydir)
+        if os.path.exists(mydir):
+            return
+        filelist = [f for f in os.listdir(mydir) if f.endswith(".json")]
+        for f in filelist:
+            print(f)
+            os.remove(os.path.join(mydir, f))
--- a/utils/chaos_ai/src/experiments.py
+++ b/utils/chaos_ai/src/experiments.py
@@ -0,0 +1,56 @@
+import random
+
+
+class Experiments:
+    """Ad-hoc experiment that checks whether two faults combine monotonically.
+
+    Results are appended to a file named ``experiment`` in the current
+    working directory.
+    """
+
+    def __init__(self):
+        # Not read anywhere in this class.
+        self.k = 0
+
+    def monotonic(self, aichaos, num_sets=3):
+        """Inject two random faults separately and together, recording the responses.
+
+        For each of ``num_sets`` rounds, two faults are sampled from
+        ``aichaos.faults`` and three fault sets are run: the first alone, the
+        second alone, and both together.
+
+        NOTE(review): this relies on ``aichaos.inject_faults_litmus``,
+        ``aichaos.litmus`` and ``aichaos.test_load``, none of which are defined
+        on the ``AIChaos`` class in ``aichaos_main.py`` - confirm which object
+        is expected here.
+        """
+        for i in range(num_sets):
+            faults_pods = random.sample(aichaos.faults, k=2)
+            faults_set = [[faults_pods[0]], [faults_pods[1]], [faults_pods[0], faults_pods[1]]]
+
+            # 0 is used as the "not recorded yet" sentinel for each slot.
+            resp1, resp2, resp_both = 0, 0, 0
+            for fl in faults_set:
+                engines = []
+                for fp in fl:
+                    fault = fp.split(':')[0]
+                    pod_name = fp.split(':')[1]
+                    engine = aichaos.inject_faults_litmus(fault, pod_name)
+                    engines.append(engine)
+                aichaos.litmus.wait_engines(engines)
+
+                for index, url in enumerate(aichaos.urls):
+                    start_state, next_state = aichaos.test_load(url)
+                    print(i, fl, next_state)
+                    # self.write(str(fl), next_state)
+                    # Slots are filled in arrival order, one per response.
+                    # NOTE(review): with more than one URL the slots fill up per
+                    # URL rather than per fault set - confirm a single URL is
+                    # assumed.
+                    if resp1 == 0:
+                        resp1 = next_state
+                    elif resp2 == 0:
+                        resp2 = next_state
+                    else:
+                        resp_both = next_state
+
+                aichaos.litmus.stop_engines()
+            self.write_resp(str(faults_set[2]), resp1, resp2, resp_both)
+        print('Experiment Complete!!!')
+
+    @staticmethod
+    def write(fault, next_state):
+        """Append a ``fault,next_state,`` line to the ``experiment`` file."""
+        with open("experiment", "a") as outfile:
+            outfile.write(fault + ',' + str(next_state) + ',' + '\n')
+
+
+    @staticmethod
+    def write_resp(faults, resp1, resp2, resp3):
+        """Append one result line, including the computed monotonicity flag.
+
+        The combination counts as monotonic when the combined response
+        (``resp3``) is 200 only if both individual responses are 200, and is
+        non-200 only if at least one individual response is non-200. The
+        comparison is against the integer ``200``.
+        """
+        monotonic = True
+        if resp3 == 200:
+            if resp1 != 200 or resp2 != 200:
+                monotonic = False
+        else:
+            if resp1 == 200 and resp2 == 200:
+                monotonic = False
+
+        with open("experiment", "a") as outfile:
+            # outfile.write(faults + ',' + str(resp1) + ',' + '\n')
+            outfile.write(faults + ',' + str(resp1) + ',' + str(resp2) + ',' + str(resp3) + ',' + str(monotonic) + '\n')
--- a/utils/chaos_ai/src/kraken_utils.py
+++ b/utils/chaos_ai/src/kraken_utils.py
@@ -0,0 +1,99 @@
+import json
+import os
+import time
+import logging
+
+import src.utils as utils
+
+
+class KrakenUtils:
+    def __init__(self, namespace='robot-shop', chaos_dir='../config/',
+                 chaos_experiment='experiment.json', kubeconfig='~/.kube/config', wait_checks=60, command='podman'):
+        """Store the settings used to launch and monitor chaos containers.
+
+        :param namespace: namespace passed to the containers as ``NAMESPACE``.
+        :param kubeconfig: host path mounted into each container as
+            ``/root/.kube/config``.
+        :param wait_checks: maximum number of one-second status polls per
+            engine in :meth:`wait_engines`.
+        :param command: container CLI used to build the shell commands.
+
+        ``chaos_dir`` and ``chaos_experiment`` are stored but not read by the
+        methods of this class.
+        """
+        self.chaos_dir = chaos_dir
+        self.chaos_experiment = chaos_experiment
+        self.namespace = namespace
+        self.kubeconfig = kubeconfig
+        # Root logger: output goes wherever logging.basicConfig() was pointed.
+        self.logger = logging.getLogger()
+        # Names of the engines last confirmed running; stopped by cleanup().
+        self.engines = []
+        self.wait_checks = wait_checks
+        self.command = command
+
+    def exp_status(self, engine='engine-cartns3'):
+        substring_list = ['Waiting for the specified duration','Waiting for wait_duration', 'Step workload started, waiting for response']
+        substr = '|'.join(substring_list)
+        # cmd = "docker logs "+engine+" 2>&1 | grep Waiting"
+        # cmd = "docker logs "+engine+" 2>&1 | grep -E '"+substr+"'"
+        cmd = self.command +" logs "+engine+" 2>&1 | grep -E '"+substr+"'"
+        line = os.popen(cmd).read()
+        self.logger.debug('[exp_status]'+line)
+        # if 'Waiting for the specified duration' in line:
+        # if 'Waiting for' in line or 'waiting for' in line:
+        # if 'Waiting for the specified duration' in line or 'Waiting for wait_duration' in line or 'Step workload started, waiting for response' in line:
+        if any(map(line.__contains__, substring_list)):
+            return 'Running'
+        return 'Not Running'
+ 
+    # print chaos result, check if litmus showed any error
+    def print_result(self, engines):
+        # self.logger.debug('')
+        for e in engines:
+            # cmd = 'kubectl describe chaosresult ' + e + ' -n ' + self.namespace + ' | grep "Fail Step:"'
+            # line = os.popen(cmd).read()
+            # self.logger.debug('[Chaos Result] '+e+' : '+line)
+            self.logger.debug('[KRAKEN][Chaos Result] '+e)
+
+    def wait_engines(self, engines=[]):
+        status = 'Completed'
+        max_checks = self.wait_checks
+        for e in engines:
+            self.logger.info('[Wait Engines] ' + e)
+            for i in range(max_checks):
+                status = self.exp_status(e)
+                if status == 'Running':
+                    break
+                time.sleep(1)
+            # return False, if even one engine is not running
+            if status != 'Running':
+                return False
+
+        self.engines = engines
+        # return True if all engines are running
+        return True
+
+
+    def cleanup(self):
+        self.logger.debug('Removing previous engines')
+        # cmd = "docker rm $(docker ps -q -f 'status=exited')"
+        if len(self.engines) > 0:
+            cmd = self.command+" stop " + " ".join(self.engines) + " >> temp"
+            os.system(cmd)
+        self.engines = []
+
+        cmd = self.command+" container prune -f >> temp"
+        os.system(cmd)
+        self.logger.debug('Engines removed')
+
+    def stop_engines(self, episode=[]):
+        """Stop all tracked engines by delegating to :meth:`cleanup`; ``episode`` is ignored."""
+        self.cleanup()
+
+    def get_name(self):
+        """Return the identifier of this chaos engine backend."""
+        return 'kraken'
+
+    def inject_faults(self, fault, pod_name):
+        """Start a krkn-hub container that injects ``fault`` against ``pod_name``.
+
+        :param fault: fault name, optionally carrying a ``(...)`` parameter that
+            ``utils.get_load`` splits off (the extracted load is not used here).
+        :param pod_name: target; passed as ``POD_LABEL``, ``NODE_NAME`` or
+            ``NODE_SELECTORS`` depending on the fault.
+        :return: the container name, derived from target and fault. Unknown
+            faults run a no-op ``echo`` and still return a name.
+        """
+        self.logger.debug('[KRAKEN][INJECT_FAULT] ' + fault + ':' + pod_name)
+        fault, load = utils.get_load(fault)
+        # '=' and '/' in the target are replaced with '-' when building the name.
+        engine = 'engine-' + pod_name.replace('=', '-').replace('/','-') + '-' + fault
+        if fault == 'pod-delete':
+            cmd = self.command+' run  -d -e NAMESPACE='+self.namespace+' -e POD_LABEL='+pod_name+' --name='+engine+' --net=host -v '+self.kubeconfig+':/root/.kube/config:Z quay.io/redhat-chaos/krkn-hub:pod-scenarios >> temp'
+        elif fault == 'network-chaos':
+            # 'docker run -e NODE_NAME=minikube-m03 -e DURATION=10  --name=knetwork --net=host -v /home/chaos/.kube/kube-config-raw:/root/.kube/config:Z -d quay.io/redhat-chaos/krkn-hub:network-chaos >> temp'
+            cmd = self.command+' run -d -e NODE_NAME='+pod_name+' -e DURATION=120  --name='+engine+' --net=host -v '+self.kubeconfig+':/root/.kube/config:Z -d quay.io/redhat-chaos/krkn-hub:network-chaos >> temp'
+        elif fault == 'node-memory-hog':
+            cmd = self.command+' run -d -e NODE_NAME='+pod_name+' -e DURATION=120 -e NODES_AFFECTED_PERC=100 --name='+engine+' --net=host -v '+self.kubeconfig+':/root/.kube/config:Z -d quay.io/redhat-chaos/krkn-hub:node-memory-hog >> temp'
+        elif fault == 'node-cpu-hog':
+            # NOTE(review): '-env-host=true' has a single leading dash, while the
+            # podman option is spelled '--env-host' - confirm what is intended.
+            # Unlike the other branches, output is not redirected to 'temp' here.
+            cmd = self.command+'  run -e NODE_SELECTORS='+pod_name+' -e NODE_CPU_PERCENTAGE=100 -e NAMESPACE='+self.namespace+' -e TOTAL_CHAOS_DURATION=120 -e NODE_CPU_CORE=100 --name='+engine+' --net=host -env-host=true -v '+self.kubeconfig+':/root/.kube/config:Z -d quay.io/redhat-chaos/krkn-hub:node-cpu-hog'
+        else:
+            cmd = 'echo'
+        self.logger.debug('[KRAKEN][INJECT_FAULT] ' + cmd)
+        # NOTE(review): the command is assembled by string concatenation and run
+        # through the shell; namespace, pod_name and kubeconfig must be trusted.
+        os.system(cmd)
+        return engine
--- a/utils/chaos_ai/src/qlearning.py
+++ b/utils/chaos_ai/src/qlearning.py
@@ -0,0 +1,62 @@
+import logging
+
+import numpy as np
+
+
+class QLearning:
+    def __init__(self, gamma=None, alpha=None, faults=None, states=None, rewards=None, urls=None):
+        self.gamma = gamma  # Discount factor
+        self.alpha = alpha  # Learning rate
+        self.faults = faults
+        self.states = states
+        self.rewards = rewards
+
+        # Initializing Q-Values
+        # self.Q = np.array(np.zeros([len(states), len(states)]))
+        self.Q = np.array(np.zeros([len(urls), len(states), len(faults)]))
+        self.state_matrix = np.array(np.zeros([len(states), len(states)]))
+
+        self.logger = logging.getLogger()
+
+    def update_q_fault(self, fault, episode, start_state, end_state, url_index):
+        self.logger.info('[UPDATE_Q] ' + str(url_index) + ' ' + fault + ' ' + str(start_state) + '->' + str(end_state))
+        if end_state is None:
+            end_state = start_state
+        if end_state not in self.states:
+            end_state = 'Other'
+        # reward is dependent on the error response (eg. '404') and length of episode
+        reward = self.rewards[str(end_state)] / len(episode)
+        current_state = self.states[str(start_state)]
+        next_state = self.states[str(end_state)]
+        fault_index = self.faults.index(fault)
+        # self.logger.debug('[update_q]' + fault + ' ' + str(fault_index) + ' ' + str(reward))
+        # self.logger.debug('reward, gamma: ' + str(reward) + ' ' + str(self.gamma))
+        # self.logger.debug(
+        #     'gamma*val' + str(self.gamma * self.Q[url_index, next_state, np.argmax(self.Q[url_index, next_state,])]))
+        # self.logger.debug('current state val:' + str(self.Q[url_index, current_state, fault_index]))
+
+        TD = reward + \
+             self.gamma * self.Q[url_index, next_state, np.argmax(self.Q[url_index, next_state,])] - \
+             self.Q[url_index, current_state, fault_index]
+        self.Q[url_index, current_state, fault_index] += self.alpha * TD
+
+        # update state matrix
+        TD_state = reward + \
+                   self.gamma * self.state_matrix[next_state, np.argmax(self.state_matrix[next_state,])] - \
+                   self.state_matrix[current_state, next_state]
+        self.state_matrix[current_state, next_state] += self.alpha * TD_state
+        # self.logger.debug('updated Q' + str(self.Q[url_index, current_state, fault_index]))
+
+    # def update_q(self, episode, start_state, end_state):
+    #     self.logger.info('[UPDATE_Q]')
+    #     if end_state is None:
+    #         end_state = start_state
+    #
+    #     # reward is dependent on the error response (eg. '404') and length of episode
+    #     reward = self.rewards[str(end_state)] / len(episode)
+    #     current_state = self.states[str(start_state)]
+    #     next_state = self.states[str(end_state)]
+    #     TD = reward + \
+    #          self.gamma * self.Q[next_state, np.argmax(self.Q[next_state,])] - \
+    #          self.Q[current_state, next_state]
+    #     self.Q[current_state, next_state] += self.alpha * TD
--- a/utils/chaos_ai/src/swagger_api.py
+++ b/utils/chaos_ai/src/swagger_api.py
@@ -0,0 +1,171 @@
+import json, os
+import logging
+# import numpy as np
+# import pandas as pd
+import threading
+from datetime import datetime
+from flask import Flask, request
+from flasgger import Swagger
+from flasgger.utils import swag_from
+# import zipfile
+import sys
+
+sys.path.append("..")
+from aichaos_main import AIChaos
+
+# Flask application with flasgger's Swagger integration attached.
+app = Flask(__name__)
+Swagger(app)
+# Working directory for uploaded kubeconfigs and per-run output files:
+# <parent of this file's directory>/config/experiments/flask/ (trailing slash
+# included because paths below are built by string concatenation).
+flaskdir = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "config", "experiments",
+                        "flask") + '/'
+
+
+class AIChaosSwagger:
+    def __init__(self, flaskdir=''):
+        # Directory prefix used by startchaos() for the log, Q-table and
+        # episode output files.
+        self.flaskdir = flaskdir
+
+    @app.route("/")
+    def empty(params=''):
+        # Root route: returns a fixed banner string. Registered as a plain
+        # function by the route decorator, hence no `self` parameter.
+        return "AI Chaos Repository!"
+
+    def startchaos(self, kubeconfigfile, file_id, params):
+        """Run one AIChaos experiment; intended to be the target of a worker thread.
+
+        Creates an ``init-<file_id>`` marker, runs the experiment, then writes
+        ``out-<file_id>`` and removes the marker.
+
+        :param kubeconfigfile: path of the uploaded kubeconfig; exported as
+            ``KUBECONFIG``.
+        :param file_id: chaos ID used to name all per-run files.
+        :param params: request form values; must contain ``podlabels``,
+            ``probeurl``, ``urls`` and ``namespace``.
+        :return: the string ``'WRITE'``.
+        """
+        print('[StartChaos]', file_id, kubeconfigfile)
+        dir = flaskdir
+        outfile = ''.join([dir, 'out-', file_id])
+        initfile = ''.join([dir, 'init-', file_id])
+        # Touch the marker file that the status endpoints report as 'Running'.
+        with open(initfile, 'w'):
+            pass
+        if os.path.exists(outfile):
+            os.remove(outfile)
+        # cons = ConstraintsInference(outdir=dir).get_constraints(csvfile, file_id, params, verbose=False,
+        #                                                         write_local=False)
+        os.environ["KUBECONFIG"] = kubeconfigfile
+        # Built-in defaults; these overwrite any same-named values sent by the
+        # client and are themselves replaced by the config file below if present.
+        params['command'] = 'podman'
+        params['chaos_engine'] = 'kraken'
+        params['faults'] = 'pod-delete'
+        params['iterations'] = 1
+        params['maxfaults'] = 5
+        if os.path.isfile('/config/aichaos-config.json'):
+            with open('/config/aichaos-config.json') as f:
+                config_params = json.load(f)
+                params['command'] = config_params['command']
+                params['chaos_engine'] = config_params['chaos_engine']
+                params['faults']= config_params['faults']
+                params['iterations'] = config_params['iterations']
+                params['maxfaults'] = config_params['maxfaults']
+        # Cartesian product: every fault against every pod label.
+        faults = [f + ':' + p for f in params['faults'].split(',') for p in params['podlabels'].split(',')]
+        print('#faults:', len(faults), faults)
+        states = {'200': 0, '500': 1, '502': 2, '503': 3, '404': 4, 'Timeout': 5}
+        rewards = {'200': -1, '500': 0.8, '502': 0.8, '503': 0.8, '404': 1, 'Timeout': 1}
+        logfile = self.flaskdir + 'log_' + str(file_id)
+        qfile = self.flaskdir + 'qfile_' + str(file_id) + '.csv'
+        efile = self.flaskdir + 'efile_' + str(file_id)
+        epfile = self.flaskdir + 'episodes_' + str(file_id) + '.json'
+        probe_url = params['probeurl']
+        probes = {'pod-delete': 'executeprobe', 'cpu-hog': 'wolffi/cpu_load', 'disk-fill': 'wolffi/memory_load',
+                  'io_load': 'wolffi/io_load', 'http_delay': 'wolffi/http_delay', 'packet_delay': 'wolffi/packet_delay',
+                  'packet_duplication': 'wolffi/packet_duplication', 'packet_loss': 'wolffi/packet_loss',
+                  'packet_corruption': 'wolffi/packet_corruption',
+                  'packet_reordering': 'wolffi/packet_reordering', 'network_load': 'wolffi/network_load',
+                  'http_bad_request': 'wolffi/http_bad_request',
+                  'http_unauthorized': 'wolffi/http_unauthorized', 'http_forbidden': 'wolffi/http_forbidden',
+                  'http_not_found': 'wolffi/http_not_found',
+                  'http_method_not_allowed': 'wolffi/http_method_not_allowed',
+                  'http_not_acceptable': 'wolffi/http_not_acceptable',
+                  'http_request_timeout': 'wolffi/http_request_timeout',
+                  'http_unprocessable_entity': 'wolffi/http_unprocessable_entity',
+                  'http_internal_server_error': 'wolffi/http_internal_server_error',
+                  'http_not_implemented': 'wolffi/http_not_implemented',
+                  'http_bad_gateway': 'wolffi/http_bad_gateway',
+                  'http_service_unavailable': 'wolffi/http_service_unavailable',
+                  'bandwidth_restrict': 'wolffi/bandwidth_restrict',
+                  'pod_cpu_load': 'wolffi/pod_cpu_load', 'pod_memory_load': 'wolffi/pod_memory_load',
+                  'pod_io_load': 'wolffi/pod_io_load',
+                  'pod_network_load': 'wolffi/pod_network_load'
+                  }
+        # Probe paths are appended directly to the probe URL (no separator added).
+        dstk_probes = {k: probe_url + v for k, v in probes.items()}
+        cexp = {'pod-delete': 'pod-delete.json', 'cpu-hog': 'pod-cpu-hog.json',
+                'disk-fill': 'disk-fill.json', 'network-loss': 'network-loss.json',
+                'network-corruption': 'network-corruption.json', 'io-stress': 'io-stress.json'}
+        # NOTE(review): kubeconfigfile is only exported via KUBECONFIG and is not
+        # passed as `kubeconfig=`, so AIChaos falls back to its own default
+        # path - confirm this is intended.
+        aichaos = AIChaos(states=states, faults=faults, rewards=rewards,
+                          logfile=logfile, qfile=qfile, efile=efile, epfile=epfile,
+                          urls=params['urls'].split(','), namespace=params['namespace'],
+                          max_faults=params['maxfaults'],
+                          num_requests=10, timeout=2,
+                          chaos_engine=params['chaos_engine'], dstk_probes=dstk_probes, command=params['command'],
+                          loglevel=logging.DEBUG, chaos_experiment=cexp, iterations=params['iterations'])
+        aichaos.start_chaos()
+
+        # NOTE(review): the marker is removed only on this success path; if
+        # start_chaos() raises, init-<file_id> stays behind and the status
+        # endpoints keep answering 'Running'.
+        file = open(outfile, "w")
+        file.write('done')
+        file.close()
+        os.remove(initfile)
+        # os.remove(csvfile)
+        # ConstraintsInference().remove_temp_files(dir, file_id)
+        return 'WRITE'
+
+    @app.route('/GenerateChaos/', methods=['POST'])
+    @swag_from('../config/yml/chaosGen.yml')
+    def chaos_gen():
+        dir = flaskdir
+        sw = AIChaosSwagger(flaskdir=dir)
+        f = request.files['file']
+        list = os.listdir(dir)
+        for i in range(10000):
+            if str(i) not in list:
+                break
+        kubeconfigfile = ''.join([dir, str(i)])
+        f.save(kubeconfigfile)
+        print('HEADER:', f.headers)
+        print('[GenerateChaos] reqs:', request.form.to_dict())
+        print('[GenerateChaos]', f.filename, datetime.now())
+        # thread = threading.Thread(target=sw.write_constraints, args=(csvfile, str(i), parameters))
+        thread = threading.Thread(target=sw.startchaos, args=(kubeconfigfile, str(i), request.form.to_dict()))
+        thread.daemon = True
+        print(thread.getName())
+        thread.start()
+        return 'Chaos ID: ' + str(i)
+
+    @app.route('/GetStatus/<chaosid>', methods=['GET'])
+    @swag_from('../config/yml/status.yml')
+    def get_status(chaosid):
+        print('[GetStatus]', chaosid, flaskdir)
+        epfile = flaskdir + 'episodes_' + str(chaosid) + '.json'
+        initfile = ''.join([flaskdir, 'init-', chaosid])
+        if os.path.exists(epfile):
+            return 'Completed'
+        elif os.path.exists(initfile):
+            return 'Running'
+        else:
+            return 'Does not exist'
+
+    @app.route('/GetQTable/<chaosid>', methods=['GET'])
+    @swag_from('../config/yml/qtable.yml')
+    def get_qtable(chaosid):
+        print('[GetQTable]', chaosid)
+        qfile = flaskdir + 'qfile_' + str(chaosid) + '.csv'
+        initfile = ''.join([flaskdir, 'init-', chaosid])
+        if os.path.exists(qfile):
+            f = open(qfile, "r")
+            return f.read()
+        elif os.path.exists(initfile):
+            return 'Running'
+        else:
+            return 'Invalid Chaos ID: ' + chaosid
+
+    @app.route('/GetEpisodes/<chaosid>', methods=['GET'])
+    @swag_from('../config/yml/episodes.yml')
+    def get_episodes(chaosid):
+        print('[GetEpisodes]', chaosid)
+        epfile = flaskdir + 'episodes_' + str(chaosid) + '.json'
+        initfile = ''.join([flaskdir, 'init-', chaosid])
+        if os.path.exists(epfile):
+            f = open(epfile, "r")
+            return f.read()
+        elif os.path.exists(initfile):
+            return 'Running'
+        else:
+            return 'Invalid Chaos ID: ' + chaosid
+
+
+if __name__ == '__main__':
+    app.run(debug=True, host='0.0.0.0', port='5001')
--- a/utils/chaos_ai/src/test_application.py
+++ b/utils/chaos_ai/src/test_application.py
@@ -0,0 +1,83 @@
+import json
+import logging
+import time
+import requests
+
+
+class TestApplication:
+    def __init__(self, num_requests=10, timeout=2, sleep_time=1):
+        """Configure the HTTP probe.
+
+        :param num_requests: number of GET requests sent per URL.
+        :param timeout: per-request timeout handed to ``requests.get``.
+        :param sleep_time: seconds slept after each request.
+        """
+        self.num_requests = num_requests
+        self.timeout = timeout
+        self.sleep_time = sleep_time
+        self.logger = logging.getLogger()
+
+    def test_load(self, url=''):
+        # url = 'http://192.168.49.2:31902/api/cart/health'
+        timeout_count = 0
+        avg_lat = 0
+        for i in range(self.num_requests):
+            try:
+                r = requests.get(url, verify=False, timeout=self.timeout)
+                avg_lat += r.elapsed.total_seconds()
+                self.logger.info(
+                    url + ' ' + str(i) + ':' + str(r.status_code) + " {:.2f}".format(r.elapsed.total_seconds())
+                    + " {:.2f}".format(avg_lat))
+                if r.status_code != 200:
+                    return '200', r.status_code
+            # except requests.exceptions.Timeout as toe:
+            except Exception as toe:
+                self.logger.info(url + ' ' + str(i) + ':' + 'Timeout Exception!')
+                timeout_count += 1
+                if timeout_count > 3:
+                    return '200', 'Timeout'
+            # except Exception as e:
+            #   self.logger.debug('Connection refused!'+str(e))
+            time.sleep(self.sleep_time)
+        self.logger.info(url + "Avg: {:.2f}".format(avg_lat/self.num_requests))
+        return '200', '200'
+
+    # def test_load_hey(self):
+    #     cmd = 'hey -c 2 -z 20s http://192.168.49.2:31902/api/cart/health > temp'
+    #     os.system(cmd)
+    #     with open('temp') as f:
+    #         datafile = f.readlines()
+    #     found = False
+    #     for line in datafile:
+    #         if 'Status code distribution:' in line:
+    #             found = True
+    #         if found:
+    #             print('[test_load]', line)
+    #             m = re.search(r"\[([A-Za-z0-9_]+)\]", line)
+    #             if m is not None:
+    #                 resp_code = m.group(1)
+    #                 if resp_code != 200:
+    #                     return '200', resp_code
+    #     return '200', '200'
+
+    # # End state is reached when system is down or return error code like '500','404'
+    # def get_next_state(self):
+    #     self.logger.info('[GET_NEXT_STATE]')
+    #     f = open(self.chaos_dir + self.chaos_journal)
+    #     data = json.load(f)
+    #
+    #     # before the experiment (if before steady state is false, after is null?)
+    #     for probe in data['steady_states']['before']['probes']:
+    #         if not probe['tolerance_met']:
+    #             # start_state = probe['activity']['tolerance']
+    #             # end_state = probe['status']
+    #             start_state, end_state = None, None
+    #             return start_state, end_state
+    #
+    #     # after the experiment
+    #     for probe in data['steady_states']['after']['probes']:
+    #         # if probe['output']['status'] == probe['activity']['tolerance']:
+    #         if not probe['tolerance_met']:
+    #             # print(probe)
+    #             start_state = probe['activity']['tolerance']
+    #             end_state = probe['output']['status']
+    #             # end_state = probe['status']
+    #             return start_state, end_state
+    #     # if tolerances for all probes are met
+    #     start_state = probe['activity']['tolerance']
+    #     end_state = probe['activity']['tolerance']
+    #     return start_state, end_state
--- a/utils/chaos_ai/src/utils.py
+++ b/utils/chaos_ai/src/utils.py
@@ -0,0 +1,27 @@
+import re
+
+
+def get_load(fault):
+    """Split a fault specification into its name and load value.
+
+    A fault may carry a parenthesised parameter, e.g. ``cpu(50)``. The
+    first such group is taken as the load and removed from the name.
+
+    Args:
+        fault: Fault specification string, with or without ``(...)``.
+
+    Returns:
+        A ``(fault, load)`` tuple. ``load`` is the text found inside the
+        first pair of parentheses (a ``str``), or the integer ``100`` when
+        the fault has no parenthesised parameter.
+    """
+    params = re.findall(r'\(.*?\)', fault)
+    load = 100
+    if params:
+        load = params[0].strip('()')
+        # str.strip() treats its argument as a set of characters, so
+        # fault.strip('(50)') would also eat a trailing '0' or '5' of the
+        # name itself (e.g. 'pod0(50)' -> 'pod'). Remove the exact
+        # parameter substring instead.
+        fault = fault.replace(params[0], '', 1)
+    return fault, load
--- a/utils/chaos_recommender/README.md
+++ b/utils/chaos_recommender/README.md
@@ -7,8 +7,8 @@ This tool profiles an application and gathers telemetry data such as CPU, Memory
 ## Pre-requisites

 - Openshift Or Kubernetes Environment where the application is hosted
- Access to the telemetry data via the exposed Prometheus endpoint
- Python3
+- Access to the metrics via the exposed Prometheus endpoint
+- Python3.9

 ## Usage

@@ -20,28 +20,35 @@ This tool profiles an application and gathers telemetry data such as CPU, Memory
    $ git clone https://github.com/krkn-chaos/krkn.git 
    $ cd krkn
    $ pip3 install -r requirements.txt
-    $ python3.9 utils/chaos_recommender/chaos_recommender.py
+    Edit configuration file:
+    $ vi utils/chaos_recommender/recommender_config.yaml
+    $ python3.9 utils/chaos_recommender/chaos_recommender.py -c utils/chaos_recommender/recommender_config.yaml
    ```

 2. Follow the prompts to provide the required information.

 ## Configuration
 To run the recommender with a config file specify the config file path with the `-c` argument.
-You can customize the default values by editing the `krkn/config/recommender_config.yaml` file. The configuration file contains the following options:
+You can customize the default values by editing the `recommender_config.yaml` file. The configuration file contains the following options:

  - `application`: Specify the application name.
-  - `namespace`: Specify the namespace name. If you want to profile
+  - `namespaces`: Specify the namespace names (separated by comma or space). If you want to profile
  - `labels`: Specify the labels (not used).
  - `kubeconfig`: Specify the location of the kubeconfig file (not used).
  - `prometheus_endpoint`: Specify the prometheus endpoint (must).
  - `auth_token`: Auth token to connect to prometheus endpoint (must).
  - `scrape_duration`: For how long data should be fetched, e.g., '1m' (must).
  - `chaos_library`: "kraken" (currently it only supports kraken).
+  - `json_output_file`: True or False (by default False).
+  - `json_output_folder_path`: Specify folder path where output should be saved. If empty the default path is used.
  - `chaos_tests`: (for output purpose only do not change if not needed)
    - `GENERAL`: list of general purpose tests available in Krkn
    - `MEM`: list of memory related tests available in Krkn
    - `NETWORK`: list of network related tests available in Krkn
    - `CPU`: list of memory related tests available in Krkn
+  - `threshold`: Specify the threshold to use for comparison and identifying outliers
+  - `cpu_threshold`: Specify the cpu threshold to compare with the cpu limits set on the pods and identify outliers
+  - `mem_threshold`: Specify the memory threshold to compare with the memory limits set on the pods and identify outliers

 *TIP:* to collect prometheus endpoint and token from your OpenShift cluster you can run the following commands:
        ```
@@ -58,8 +65,8 @@ You can also provide the input values through command-line arguments launching t
  -o, --options         Evaluate command line options
  -a APPLICATION, --application APPLICATION
                        Kubernetes application name
-  -n NAMESPACE, --namespace NAMESPACE
-                        Kubernetes application namespace
+  -n NAMESPACES [NAMESPACES ...], --namespaces NAMESPACES [NAMESPACES ...]
+                        Kubernetes application namespaces separated by space
  -l LABELS, --labels LABELS
                        Kubernetes application labels
  -p PROMETHEUS_ENDPOINT, --prometheus-endpoint PROMETHEUS_ENDPOINT
@@ -74,6 +81,8 @@ You can also provide the input values through command-line arguments launching t
                        Chaos library
  -L LOG_LEVEL, --log-level LOG_LEVEL
                        log level (DEBUG, INFO, WARNING, ERROR, CRITICAL
+  -J [FOLDER_PATH], --json-output-file [FOLDER_PATH]
+                        Create output file, the path to the folder can be specified, if not specified the default folder is used.
  -M MEM [MEM ...], --MEM MEM [MEM ...]
                        Memory related chaos tests (space separated list)
  -C CPU [CPU ...], --CPU CPU [CPU ...]
@@ -82,7 +91,12 @@ You can also provide the input values through command-line arguments launching t
                        Network related chaos tests (space separated list)
  -G GENERIC [GENERIC ...], --GENERIC GENERIC [GENERIC ...]
                        Memory related chaos tests (space separated list)
-
+  --threshold THRESHOLD
+                        Threshold
+  --cpu-threshold CPU_THRESHOLD
+                        CPU threshold to compare with the cpu limits
+  --mem-threshold MEM_THRESHOLD
+                        Memory threshold to compare with the memory limits
 ```

 If you provide the input values through command-line arguments, the corresponding config file inputs would be ignored.
@@ -97,10 +111,10 @@ After obtaining telemetry data, sourced either locally or from Prometheus, the t

 ## Customizing Thresholds and Options

-You can customize the thresholds and options used for data analysis by modifying the `krkn/kraken/chaos_recommender/analysis.py` file. For example, you can adjust the threshold for identifying outliers by changing the value of the `threshold` variable in the `identify_outliers` function.
+You can customize the thresholds and options used for data analysis and for identifying outliers by setting the `threshold`, `cpu_threshold` and `mem_threshold` parameters in the config file.

 ## Additional Files

- `config/recommender_config.yaml`: The configuration file containing default values for application, namespace, labels, and kubeconfig.
+- `recommender_config.yaml`: The configuration file containing default values for application, namespace, labels, and kubeconfig.

 Happy Chaos!
--- a/utils/chaos_recommender/chaos_recommender.py
+++ b/utils/chaos_recommender/chaos_recommender.py
@@ -1,7 +1,10 @@
 import argparse
+import json
 import logging
 import os.path
+import re
 import sys
+import time
 import yaml
 # kraken module import for running the recommender
 # both from the root directory and the recommender
@@ -9,24 +12,28 @@ import yaml
 sys.path.insert(0, './')
 sys.path.insert(0, '../../')

+from krkn_lib.utils import get_yaml_item_value
+
 import kraken.chaos_recommender.analysis as analysis
 import kraken.chaos_recommender.prometheus as prometheus
 from kubernetes import config as kube_config


-
 def parse_arguments(parser):

    # command line options
    parser.add_argument("-c", "--config-file", action="store", help="Config file path")
    parser.add_argument("-o", "--options", action="store_true", help="Evaluate command line options")
-    parser.add_argument("-n", "--namespace", action="store", default="", help="Kubernetes application namespace")
+    parser.add_argument("-n", "--namespaces", action="store", default="", nargs="+", help="Kubernetes application namespaces separated by space")
    parser.add_argument("-p", "--prometheus-endpoint", action="store", default="", help="Prometheus endpoint URI")
    parser.add_argument("-k", "--kubeconfig", action="store", default=kube_config.KUBE_CONFIG_DEFAULT_LOCATION, help="Kubeconfig path")
    parser.add_argument("-t", "--token", action="store", default="", help="Kubernetes authentication token")
    parser.add_argument("-s", "--scrape-duration", action="store", default="10m", help="Prometheus scrape duration")
    parser.add_argument("-L", "--log-level", action="store", default="INFO", help="log level (DEBUG, INFO, WARNING, ERROR, CRITICAL")

+    parser.add_argument("-J", "--json-output-file", default=False, nargs="?", action="store",
+                        help="Create output file, the path to the folder can be specified, if not specified the default folder is used")
+
    parser.add_argument("-M", "--MEM", nargs='+', action="store", default=[],
                        help="Memory related chaos tests (space separated list)")
    parser.add_argument("-C", "--CPU", nargs='+', action="store", default=[],
@@ -35,10 +42,13 @@ def parse_arguments(parser):
                        help="Network related chaos tests (space separated list)")
    parser.add_argument("-G", "--GENERIC", nargs='+', action="store", default=[],
                        help="Memory related chaos tests (space separated list)")
-
+    parser.add_argument("--threshold", action="store", default="", help="Threshold")
+    parser.add_argument("--cpu-threshold", action="store", default="", help="CPU threshold")
+    parser.add_argument("--mem-threshold", action="store", default="", help="Memory threshold")

    return parser.parse_args()

+
 def read_configuration(config_file_path):
    if not os.path.exists(config_file_path):
        logging.error(f"Config file not found: {config_file_path}")
@@ -48,15 +58,26 @@ def read_configuration(config_file_path):
        config = yaml.safe_load(config_file)

    log_level = config.get("log level", "INFO")
-    namespace = config.get("namespace", "")
-    kubeconfig = config.get("kubeconfig", kube_config.KUBE_CONFIG_DEFAULT_LOCATION)
+    namespaces = config.get("namespaces")
+    namespaces = re.split(r",+\s+|,+|\s+", namespaces)
+    kubeconfig = get_yaml_item_value(config, "kubeconfig", kube_config.KUBE_CONFIG_DEFAULT_LOCATION)
+
+    prometheus_endpoint = config.get("prometheus_endpoint")
+    auth_token = config.get("auth_token")
+    scrape_duration = get_yaml_item_value(config, "scrape_duration", "10m")
+    threshold = get_yaml_item_value(config, "threshold", ".7")
+    heatmap_cpu_threshold = get_yaml_item_value(config, "cpu_threshold", ".5")
+    heatmap_mem_threshold = get_yaml_item_value(config, "mem_threshold", ".3")
+    output_file = config.get("json_output_file", False)
+    if output_file is True:
+        output_path = config.get("json_output_folder_path")
+    else:
+        output_path = False
+    chaos_tests = config.get("chaos_tests", {})
+    return (namespaces, kubeconfig, prometheus_endpoint, auth_token,
+            scrape_duration, chaos_tests, log_level, threshold,
+            heatmap_cpu_threshold, heatmap_mem_threshold, output_path)

-    prometheus_endpoint = config.get("prometheus_endpoint", "")
-    auth_token = config.get("auth_token", "")
-    scrape_duration = config.get("scrape_duration", "10m")
-    chaos_tests = config.get("chaos_tests" , {})
-    return (namespace, kubeconfig, prometheus_endpoint, auth_token, scrape_duration,
-            chaos_tests, log_level)

 def prompt_input(prompt, default_value):
    user_input = input(f"{prompt} [{default_value}]: ")
@@ -64,6 +85,76 @@ def prompt_input(prompt, default_value):
         return user_input
     return default_value

+
+def make_json_output(inputs, namespace_data, output_path):
+    """Log the run summary and optionally save it as a JSON file.
+
+    Args:
+        inputs: Input parameters of the run (see ``json_inputs``).
+        namespace_data: Per-namespace analysis entries.
+        output_path: Folder where the report is written, or ``False``
+            to only log the summary without creating a file.
+    """
+    time_str = time.strftime("%Y-%m-%d_%H-%M-%S", time.localtime())
+
+    data = {
+        "inputs": inputs,
+        "analysis_outputs": namespace_data
+    }
+    # Serialize once and reuse the text for both the log and the file.
+    summary = json.dumps(data, indent=4)
+
+    logging.info(f"Summary\n{summary}")
+
+    if output_path is not False:
+        file = f"recommender_{time_str}.json"
+        # os.path.join avoids a doubled separator when output_path
+        # already ends with one.
+        path = os.path.join(os.path.expanduser(output_path), file)
+
+        with open(path, "w") as json_output:
+            logging.info(f"Saving output file in {output_path} folder...")
+            json_output.write(summary)
+            logging.info(f"Recommendation output saved in {file}.")
+
+
+def json_inputs(namespaces, kubeconfig, prometheus_endpoint, scrape_duration,
+                chaos_tests, threshold, heatmap_cpu_threshold,
+                heatmap_mem_threshold):
+    """Collect the run's input parameters into a dict for the JSON report.
+
+    The Prometheus auth token is not a parameter here, so it never ends
+    up in the report.
+    """
+    inputs = {
+        "namespaces": namespaces,
+        "kubeconfig": kubeconfig,
+        "prometheus_endpoint": prometheus_endpoint,
+        "scrape_duration": scrape_duration,
+        "chaos_tests": chaos_tests,
+        "threshold": threshold,
+        "heatmap_cpu_threshold": heatmap_cpu_threshold,
+        "heatmap_mem_threshold": heatmap_mem_threshold
+    }
+    return inputs
+
+
+def json_namespace(namespace, queries, analysis_data):
+    """Build the report entry for a single namespace.
+
+    ``analysis_data`` is indexed positionally: items 0, 1 and 2 are stored
+    as the profiling, heatmap analysis and recommendations respectively.
+    """
+    data = {
+        "namespace": namespace,
+        "queries": queries,
+        "profiling": analysis_data[0],
+        "heatmap_analysis": analysis_data[1],
+        "recommendations": analysis_data[2]
+    }
+    return data
+
+
 def main():
    parser = argparse.ArgumentParser(description="Krkn Chaos Recommender Command-Line tool")
    args = parse_arguments(parser)
@@ -75,43 +144,67 @@ def main():

    if args.config_file is not None:
        (
-         namespace,
+         namespaces,
         kubeconfig,
         prometheus_endpoint,
         auth_token,
         scrape_duration,
         chaos_tests,
-         log_level
+         log_level,
+         threshold,
+         heatmap_cpu_threshold,
+         heatmap_mem_threshold,
+         output_path
         ) = read_configuration(args.config_file)

    if args.options:
-        namespace = args.namespace
+        namespaces = args.namespaces
        kubeconfig = args.kubeconfig
        auth_token = args.token
        scrape_duration = args.scrape_duration
        log_level = args.log_level
        prometheus_endpoint = args.prometheus_endpoint
+        output_path = args.json_output_file
        chaos_tests = {"MEM": args.MEM, "GENERIC": args.GENERIC, "CPU": args.CPU, "NETWORK": args.NETWORK}
+        threshold = args.threshold
+        heatmap_mem_threshold = args.mem_threshold
+        heatmap_cpu_threshold = args.cpu_threshold

-    if log_level not in ["DEBUG","INFO", "WARNING", "ERROR","CRITICAL"]:
+    if log_level not in ["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"]:
        logging.error(f"{log_level} not a valid log level")
        sys.exit(1)

    logging.basicConfig(level=log_level)

-    logging.info("============================INPUTS===================================")
-    logging.info(f"Namespace: {namespace}")
-    logging.info(f"Kubeconfig: {kubeconfig}")
-    logging.info(f"Prometheus endpoint: {prometheus_endpoint}")
-    logging.info(f"Scrape duration: {scrape_duration}")
-    for test in chaos_tests.keys():
-        logging.info(f"Chaos tests {test}: {chaos_tests[test]}")
-    logging.info("=====================================================================")
-    logging.info("Starting Analysis ...")
-    logging.info("Fetching the Telemetry data")
+    if output_path is not False:
+        if output_path is None:
+            output_path = "./recommender_output"
+            logging.info(f"Path for output file not specified. "
+                         f"Using default folder {output_path}")
+        if not os.path.exists(os.path.expanduser(output_path)):
+            logging.error(f"Folder {output_path} for output not found.")
+            sys.exit(1)
+
+    logging.info("Loading inputs...")
+    inputs = json_inputs(namespaces, kubeconfig, prometheus_endpoint,
+                         scrape_duration, chaos_tests, threshold,
+                         heatmap_cpu_threshold, heatmap_mem_threshold)
+    namespaces_data = []
+
+    logging.info("Starting Analysis...")
+
+    file_path, queries = prometheus.fetch_utilization_from_prometheus(
+        prometheus_endpoint, auth_token, namespaces, scrape_duration)
+
+    analysis_data = analysis(file_path, namespaces, chaos_tests, threshold,
+                             heatmap_cpu_threshold, heatmap_mem_threshold)
+
+    for namespace in namespaces:
+        namespace_data = json_namespace(namespace, queries[namespace],
+                                        analysis_data[namespace])
+        namespaces_data.append(namespace_data)
+    make_json_output(inputs, namespaces_data, output_path)

-    file_path = prometheus.fetch_utilization_from_prometheus(prometheus_endpoint, auth_token, namespace, scrape_duration)
-    analysis(file_path, chaos_tests)

 if __name__ == "__main__":
    main()
--- a/utils/chaos_recommender/recommender_config.yaml
+++ b/utils/chaos_recommender/recommender_config.yaml
@@ -0,0 +1,46 @@
+---
+# Default configuration for the Krkn chaos recommender.
+# Pass it with: chaos_recommender.py -c <path to this file>
+application: openshift-etcd
+# One or more namespaces to profile, separated by comma or space.
+namespaces: openshift-etcd
+labels: app=openshift-etcd
+kubeconfig: ~/.kube/config.yaml
+# Replace both placeholders before running.
+prometheus_endpoint: <Prometheus_Endpoint>
+auth_token: <Auth_Token>
+scrape_duration: 10m
+chaos_library: "kraken"
+# NOTE(review): read_configuration() looks up the key "log level" (with a
+# space), so this entry is presumably not picked up -- confirm.
+log_level: INFO
+# Set to true to save the analysis as a JSON file.
+json_output_file: false
+# Folder for the JSON file; when left null, ./recommender_output is used.
+json_output_folder_path: null
+
+# for output purpose only do not change if not needed
+chaos_tests:
+  GENERIC:
+    - pod_failure
+    - container_failure
+    - node_failure
+    - zone_outage
+    - time_skew
+    - namespace_failure
+    - power_outage
+  CPU:
+    - node_cpu_hog
+  NETWORK:
+    - application_outage
+    - node_network_chaos
+    - pod_network_chaos
+  MEM:
+    - node_memory_hog
+    - pvc_disk_fill
+
+# Threshold used for comparison and identifying outliers.
+threshold: 0.7
+# Thresholds compared with the cpu / memory limits set on the pods.
+cpu_threshold: 0.5
+mem_threshold: 0.5