Compare commits

...

27 Commits

Author SHA1 Message Date
Paige Patton
b9d7c8ba12 if no scenarios run 2026-03-11 13:58:50 -04:00
Paige Patton
e8075743ab adding resiliency config default 2026-03-11 13:58:50 -04:00
Paige Patton
ec5511b2db custom weight 2026-03-11 13:58:50 -04:00
Abhinav Sharma
4e7dca9474 fix: check prometheus url after openshift prometheus check
Signed-off-by: Abhinav Sharma <abhinavs1920bpl@gmail.com>
Signed-off-by: Paige Patton <prubenda@redhat.com>
2026-03-11 13:58:49 -04:00
Abhinav Sharma
edf0f3d1c9 feat(resiliency): implement comprehensive resiliency scoring system
- Added resiliency scoring engine
- Implemented scenario-wise scoring with telemetry
- Added configurable SLOs and detailed reporting

Signed-off-by: Abhinav Sharma <abhinavs1920bpl@gmail.com>
Signed-off-by: Paige Patton <prubenda@redhat.com>
2026-03-11 13:58:34 -04:00
Paige Patton
8c9bce6987 sed change (#1186)
Signed-off-by: Paige Patton <prubenda@redhat.com>
2026-03-11 12:34:12 -04:00
Arpit Raj
5608482f1b fix: use sorted() instead of .sort() for key validation (#1182) (#1184)
Signed-off-by: Arpit Raj <vrxn.arp1traj@gmail.com>
2026-03-10 10:58:12 -04:00
Darshan Jain
a14d3955a6 feat(ci): add pytest-based CI test framework v2 with ephemeral namespace isolation (#1172) (#1171)
* feat: add pytest-based CI test framework v2 with ephemeral namespace isolation

Signed-off-by: ddjain <darjain@redhat.com>

* feat(ci): add tests_v2 pytest functional test framework

Signed-off-by: ddjain <darjain@redhat.com>
Co-authored-by: Cursor <cursoragent@cursor.com>

* feat: improve naming convention

Signed-off-by: ddjain <darjain@redhat.com>

* improve local setup script.

Signed-off-by: ddjain <darjain@redhat.com>

* added CI job for v2 test

Signed-off-by: ddjain <darjain@redhat.com>

* disabled broken test

Signed-off-by: ddjain <darjain@redhat.com>

* improved CI pipeline execution time

Signed-off-by: ddjain <darjain@redhat.com>

* chore: remove unwanted/generated files from PR

Signed-off-by: ddjain <darjain@redhat.com>

* clean up gitignore file

Signed-off-by: ddjain <darjain@redhat.com>

* fix copilot comments

Signed-off-by: ddjain <darjain@redhat.com>

* fixed copilot suggestion

Signed-off-by: ddjain <darjain@redhat.com>

* uncommented out test upload stage

Signed-off-by: ddjain <darjain@redhat.com>

* exclude CI/tests_v2 from test coverage reporting

Signed-off-by: ddjain <darjain@redhat.com>

* uploading style.css to fix broken report artifacts

Signed-off-by: ddjain <darjain@redhat.com>

* added openshift supported labels in namespace creation api

Signed-off-by: ddjain <darjain@redhat.com>

---------

Signed-off-by: ddjain <darjain@redhat.com>
Co-authored-by: Cursor <cursoragent@cursor.com>
2026-03-06 08:44:07 -05:00
Arpit Raj
f655ec1a73 fix: accumulate failed scenarios across all scenario types instead of overwriting (#1178)
Signed-off-by: Arpit Raj <vrxn.arp1traj@gmail.com>
2026-03-05 14:06:56 -05:00
Paige Patton
dfc350ac03 adding set run tag (#1174)
Signed-off-by: Paige Patton <prubenda@redhat.com>
2026-02-27 15:05:05 -05:00
Paige Patton
c474b810b2 updating to use krkn-lib virt functions (#989)
Assisted By: Claude Code

Signed-off-by: Paige Patton <prubenda@redhat.com>
2026-02-27 14:45:31 -05:00
Paige Patton
072e8d0e87 changing pod (#1175)
Signed-off-by: Paige Patton <prubenda@redhat.com>
2026-02-27 14:40:49 -05:00
Nesar Kavri
aee61061ac Fix: make entrypoint fail fast if setup-ssh.sh fails (#1170)
Signed-off-by: Nesar976 <kavrinesar@gmail.com>
2026-02-27 14:18:01 -05:00
Paige Patton
544cac8bbb merge (#710)
Signed-off-by: Paige Patton <prubenda@redhat.com>
2026-02-27 14:10:08 -05:00
SurbhiAgarwal
49b1affdb8 Improve error message clarity for setuptools version requirement (#1162)
Fixes #1143 - Updated error message to clearly state that version 38.3 or newer is required

Signed-off-by: Surbhi <agarwalsurbhi1807@gmail.com>
Co-authored-by: Paige Patton <64206430+paigerube14@users.noreply.github.com>
2026-02-24 10:59:22 -05:00
Darshan Jain
c1dd43fe87 DevConf Pune 2026 feedback (#1169)
Signed-off-by: ddjain <darjain@redhat.com>
2026-02-23 19:54:06 +05:30
Ashish Mahajan
8dad2a3996 fix: use per-URL status_code in HealthChecker telemetry (#1091)
Signed-off-by: AR21SM <mahajanashishar21sm@gmail.com>
Co-authored-by: Paige Patton <64206430+paigerube14@users.noreply.github.com>
2026-02-19 09:25:03 -05:00
Tullio Sebastiani
cebc60f5a8 Network chaos NG porting - pod network chaos node network chaos (#991)
* fix ibm

Signed-off-by: Paige Patton <prubenda@redhat.com>

* type hint fix

Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>

* pod network chaos plugin structure + utils method refactoring

Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>

* Pod network chaos plugin

Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>

* Node network chaos plugin

Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>

* default config files

Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>

* config.yaml

Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>

* all field optional

Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>

* minor fixes

Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>

* minor nit on config

Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>

* utils unit tests

Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>

* PodNetworkChaos unit tests

Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>

* NodeNetworkChaos unit test

Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>

* PodNetworkChaos functional test

Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>

* NodeNetworkChaos functional test

Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>

* added funtests to the gh action

Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>

* unit test fix

Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>

* changed test order + resource rename

* functional tests fix

smallchange

Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>

fix requirements

Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>

* changed pod test target

Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>

* added kind port mapping and removed portforwarding

Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>

fix

Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>

test fixes

Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>

test fixes

Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>

---------

Signed-off-by: Paige Patton <prubenda@redhat.com>
Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>
Co-authored-by: Paige Patton <prubenda@redhat.com>
2026-02-18 16:20:16 +01:00
Darshan Jain
2065443622 collect ERROR and CRITICAL logs and send to elastic search (#1147) (#1150)
* collect ERROR and CRITICAL logs and send to elastic search

Signed-off-by: ddjain <darjain@redhat.com>

* bump up krkn-lib to 6.0.3

Signed-off-by: ddjain <darjain@redhat.com>

---------

Signed-off-by: ddjain <darjain@redhat.com>
2026-02-18 18:26:14 +05:30
Ashish Mahajan
b6ef7fa052 fix: use list comprehension to avoid skipping nodes during exclusion (#1059)
Fixes #1058

Signed-off-by: AR21SM <mahajanashishar21sm@gmail.com>
Co-authored-by: Paige Patton <64206430+paigerube14@users.noreply.github.com>
2026-02-17 15:20:10 -05:00
Paige Patton
4f305e78aa remove chaos ai
Signed-off-by: Paige Patton <prubenda@redhat.com>
2026-02-11 13:44:13 -05:00
dependabot[bot]
b17e933134 Bump pillow from 10.3.0 to 12.1.1 in /utils/chaos_ai (#1157)
Bumps [pillow](https://github.com/python-pillow/Pillow) from 10.3.0 to 12.1.1.
- [Release notes](https://github.com/python-pillow/Pillow/releases)
- [Changelog](https://github.com/python-pillow/Pillow/blob/main/CHANGES.rst)
- [Commits](https://github.com/python-pillow/Pillow/compare/10.3.0...12.1.1)

---
updated-dependencies:
- dependency-name: pillow
  dependency-version: 12.1.1
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2026-02-11 10:08:42 -05:00
Paige Patton
beea484597 adding vm ware tests (#1133)
Signed-off-by: Paige Patton <paigepatton@Paiges-MacBook-Air.local>
Signed-off-by: Paige Patton <prubenda@redhat.com>
Co-authored-by: Paige Patton <paigepatton@Paiges-MacBook-Air.local>
2026-02-10 16:24:26 -05:00
Paige Patton
0222b0f161 fix ibm (#1155)
Signed-off-by: Paige Patton <prubenda@redhat.com>
2026-02-10 10:09:28 -05:00
Ashish Mahajan
f7e674d5ad docs: fix typos in logs, comments, and documentation (#1079)
Signed-off-by: AR21SM <mahajanashishar21sm@gmail.com>
2026-02-09 09:48:51 -05:00
Ashish Mahajan
7aea12ce6c fix(VirtChecker): handle empty VMI interfaces list (#1072)
Signed-off-by: AR21SM <mahajanashishar21sm@gmail.com>
Co-authored-by: Paige Patton <64206430+paigerube14@users.noreply.github.com>
2026-02-09 08:29:48 -05:00
Darshan Jain
625e1e90cf feat: add color-coded console logging (#1122) (#1146)
Signed-off-by: ddjain <darjain@redhat.com>
2026-02-05 14:27:52 +05:30
141 changed files with 8334 additions and 2689 deletions

View File

@@ -2,3 +2,4 @@
omit =
tests/*
krkn/tests/**
CI/tests_v2/*

View File

@@ -43,11 +43,11 @@ jobs:
- name: Deploy test workloads
run: |
es_pod_name=$(kubectl get pods -l "app=elasticsearch-master" -o name)
echo "POD_NAME: $es_pod_name"
kubectl --namespace default port-forward $es_pod_name 9200 &
prom_name=$(kubectl get pods -n monitoring -l "app.kubernetes.io/name=prometheus" -o name)
kubectl --namespace monitoring port-forward $prom_name 9090 &
# es_pod_name=$(kubectl get pods -l "app=elasticsearch-master" -o name)
# echo "POD_NAME: $es_pod_name"
# kubectl --namespace default port-forward $es_pod_name 9200 &
# prom_name=$(kubectl get pods -n monitoring -l "app.kubernetes.io/name=prometheus" -o name)
# kubectl --namespace monitoring port-forward $prom_name 9090 &
# Wait for Elasticsearch to be ready
echo "Waiting for Elasticsearch to be ready..."
@@ -85,7 +85,7 @@ jobs:
yq -i '.elastic.enable_elastic=False' CI/config/common_test_config.yaml
yq -i '.elastic.password="${{env.ELASTIC_PASSWORD}}"' CI/config/common_test_config.yaml
yq -i '.performance_monitoring.prometheus_url="http://localhost:9090"' CI/config/common_test_config.yaml
echo "test_app_outages" >> ./CI/tests/functional_tests
echo "test_app_outages" > ./CI/tests/functional_tests
echo "test_container" >> ./CI/tests/functional_tests
echo "test_cpu_hog" >> ./CI/tests/functional_tests
echo "test_customapp_pod" >> ./CI/tests/functional_tests
@@ -94,13 +94,17 @@ jobs:
echo "test_namespace" >> ./CI/tests/functional_tests
echo "test_net_chaos" >> ./CI/tests/functional_tests
echo "test_node" >> ./CI/tests/functional_tests
echo "test_pod" >> ./CI/tests/functional_tests
echo "test_pod_error" >> ./CI/tests/functional_tests
echo "test_service_hijacking" >> ./CI/tests/functional_tests
echo "test_pod_network_filter" >> ./CI/tests/functional_tests
echo "test_pod_server" >> ./CI/tests/functional_tests
echo "test_time" >> ./CI/tests/functional_tests
echo "test_node_network_chaos" >> ./CI/tests/functional_tests
echo "test_pod_network_chaos" >> ./CI/tests/functional_tests
echo "test_cerberus_unhealthy" >> ./CI/tests/functional_tests
echo "test_pod_error" >> ./CI/tests/functional_tests
echo "test_pod" >> ./CI/tests/functional_tests
# echo "test_pvc" >> ./CI/tests/functional_tests
# Push on main only steps + all other functional to collect coverage
# for the badge

53
.github/workflows/tests_v2.yml vendored Normal file
View File

@@ -0,0 +1,53 @@
name: Tests v2 (pytest functional)
on:
pull_request:
push:
branches:
- main
jobs:
tests-v2:
name: Tests v2 (pytest functional)
runs-on: ubuntu-latest
steps:
- name: Check out code
uses: actions/checkout@v3
- name: Create KinD cluster
uses: redhat-chaos/actions/kind@main
- name: Pre-load test images into KinD
run: |
docker pull nginx:alpine
kind load docker-image nginx:alpine
docker pull quay.io/krkn-chaos/krkn:tools
kind load docker-image quay.io/krkn-chaos/krkn:tools
- name: Install Python
uses: actions/setup-python@v4
with:
python-version: '3.11'
architecture: 'x64'
cache: 'pip'
- name: Install dependencies
run: |
sudo apt-get install -y build-essential python3-dev
pip install --upgrade pip
pip install -r requirements.txt
pip install -r CI/tests_v2/requirements.txt
- name: Run tests_v2
run: |
KRKN_TEST_COVERAGE=1 python -m pytest CI/tests_v2/ -v --timeout=300 --reruns=1 --reruns-delay=5 \
--html=CI/tests_v2/report.html -n auto --junitxml=CI/tests_v2/results.xml
- name: Upload tests_v2 artifacts
if: always()
uses: actions/upload-artifact@v4
with:
name: tests-v2-results
path: |
CI/tests_v2/report.html
CI/tests_v2/results.xml
CI/tests_v2/assets/
if-no-files-found: ignore

5
.gitignore vendored
View File

@@ -17,6 +17,7 @@ __pycache__/*
kube-burner*
kube_burner*
recommender_*.json
resiliency*.json
# Project files
.ropeproject
@@ -64,6 +65,10 @@ CI/out/*
CI/ci_results
CI/legacy/*node.yaml
CI/results.markdown
# CI tests_v2 (pytest-html / pytest outputs)
CI/tests_v2/results.xml
CI/tests_v2/report.html
CI/tests_v2/assets/
#env
chaos/*

View File

@@ -42,7 +42,7 @@ telemetry:
prometheus_backup: True # enables/disables prometheus data collection
full_prometheus_backup: False # if is set to False only the /prometheus/wal folder will be downloaded.
backup_threads: 5 # number of telemetry download/upload threads
archive_path: /tmp # local path where the archive files will be temporarly stored
archive_path: /tmp # local path where the archive files will be temporarily stored
max_retries: 0 # maximum number of upload retries (if 0 will retry forever)
run_tag: '' # if set, this will be appended to the run folder in the bucket (useful to group the runs)
archive_size: 10000 # the size of the prometheus data archive size in KB. The lower the size of archive is

View File

@@ -0,0 +1,79 @@
apiVersion: v1
kind: ConfigMap
metadata:
name: mock-cerberus-server
namespace: default
data:
server.py: |
#!/usr/bin/env python3
from http.server import HTTPServer, BaseHTTPRequestHandler
import json
class MockCerberusHandler(BaseHTTPRequestHandler):
def do_GET(self):
if self.path == '/':
# Return True to indicate cluster is healthy
self.send_response(200)
self.send_header('Content-type', 'text/plain')
self.end_headers()
self.wfile.write(b'True')
elif self.path.startswith('/history'):
# Return empty history (no failures)
self.send_response(200)
self.send_header('Content-type', 'application/json')
self.end_headers()
response = {
"history": {
"failures": []
}
}
self.wfile.write(json.dumps(response).encode())
else:
self.send_response(404)
self.end_headers()
def log_message(self, format, *args):
print(f"[MockCerberus] {format % args}")
if __name__ == '__main__':
server = HTTPServer(('0.0.0.0', 8080), MockCerberusHandler)
print("[MockCerberus] Starting mock cerberus server on port 8080...")
server.serve_forever()
---
apiVersion: v1
kind: Pod
metadata:
name: mock-cerberus
namespace: default
labels:
app: mock-cerberus
spec:
containers:
- name: mock-cerberus
image: python:3.9-slim
command: ["python3", "/app/server.py"]
ports:
- containerPort: 8080
name: http
volumeMounts:
- name: server-script
mountPath: /app
volumes:
- name: server-script
configMap:
name: mock-cerberus-server
defaultMode: 0755
---
apiVersion: v1
kind: Service
metadata:
name: mock-cerberus
namespace: default
spec:
selector:
app: mock-cerberus
ports:
- protocol: TCP
port: 8080
targetPort: 8080
type: ClusterIP

View File

@@ -0,0 +1,85 @@
apiVersion: v1
kind: ConfigMap
metadata:
name: mock-cerberus-unhealthy-server
namespace: default
data:
server.py: |
#!/usr/bin/env python3
from http.server import HTTPServer, BaseHTTPRequestHandler
import json
class MockCerberusUnhealthyHandler(BaseHTTPRequestHandler):
def do_GET(self):
if self.path == '/':
# Return False to indicate cluster is unhealthy
self.send_response(200)
self.send_header('Content-type', 'text/plain')
self.end_headers()
self.wfile.write(b'False')
elif self.path.startswith('/history'):
# Return history with failures
self.send_response(200)
self.send_header('Content-type', 'application/json')
self.end_headers()
response = {
"history": {
"failures": [
{
"component": "node",
"name": "test-node",
"timestamp": "2024-01-01T00:00:00Z"
}
]
}
}
self.wfile.write(json.dumps(response).encode())
else:
self.send_response(404)
self.end_headers()
def log_message(self, format, *args):
print(f"[MockCerberusUnhealthy] {format % args}")
if __name__ == '__main__':
server = HTTPServer(('0.0.0.0', 8080), MockCerberusUnhealthyHandler)
print("[MockCerberusUnhealthy] Starting mock cerberus unhealthy server on port 8080...")
server.serve_forever()
---
apiVersion: v1
kind: Pod
metadata:
name: mock-cerberus-unhealthy
namespace: default
labels:
app: mock-cerberus-unhealthy
spec:
containers:
- name: mock-cerberus-unhealthy
image: python:3.9-slim
command: ["python3", "/app/server.py"]
ports:
- containerPort: 8080
name: http
volumeMounts:
- name: server-script
mountPath: /app
volumes:
- name: server-script
configMap:
name: mock-cerberus-unhealthy-server
defaultMode: 0755
---
apiVersion: v1
kind: Service
metadata:
name: mock-cerberus-unhealthy
namespace: default
spec:
selector:
app: mock-cerberus-unhealthy
ports:
- protocol: TCP
port: 8080
targetPort: 8080
type: ClusterIP

View File

@@ -0,0 +1,79 @@
set -xeEo pipefail
source CI/tests/common.sh
trap error ERR
trap finish EXIT
function functional_test_cerberus_unhealthy {
echo "========================================"
echo "Starting Cerberus Unhealthy Test"
echo "========================================"
# Deploy mock cerberus unhealthy server
echo "Deploying mock cerberus unhealthy server..."
kubectl apply -f CI/templates/mock_cerberus_unhealthy.yaml
# Wait for mock cerberus unhealthy pod to be ready
echo "Waiting for mock cerberus unhealthy to be ready..."
kubectl wait --for=condition=ready pod -l app=mock-cerberus-unhealthy --timeout=300s
# Verify mock cerberus service is accessible
echo "Verifying mock cerberus unhealthy service..."
mock_cerberus_ip=$(kubectl get service mock-cerberus-unhealthy -o jsonpath='{.spec.clusterIP}')
echo "Mock Cerberus Unhealthy IP: $mock_cerberus_ip"
# Test cerberus endpoint from within the cluster (should return False)
kubectl run cerberus-unhealthy-test --image=curlimages/curl:latest --rm -i --restart=Never -- \
curl -s http://mock-cerberus-unhealthy.default.svc.cluster.local:8080/ || echo "Cerberus unhealthy test curl completed"
# Configure scenario for pod disruption with cerberus enabled
export scenario_type="pod_disruption_scenarios"
export scenario_file="scenarios/kind/pod_etcd.yml"
export post_config=""
# Generate config with cerberus enabled
envsubst < CI/config/common_test_config.yaml > CI/config/cerberus_unhealthy_test_config.yaml
# Enable cerberus in the config but DON'T exit_on_failure (so the test can verify the behavior)
# Using yq jq-wrapper syntax with -i -y
yq -i '.cerberus.cerberus_enabled = true' CI/config/cerberus_unhealthy_test_config.yaml
yq -i ".cerberus.cerberus_url = \"http://${mock_cerberus_ip}:8080\"" CI/config/cerberus_unhealthy_test_config.yaml
yq -i '.kraken.exit_on_failure = false' CI/config/cerberus_unhealthy_test_config.yaml
echo "========================================"
echo "Cerberus Unhealthy Configuration:"
yq '.cerberus' CI/config/cerberus_unhealthy_test_config.yaml
echo "exit_on_failure:"
yq '.kraken.exit_on_failure' CI/config/cerberus_unhealthy_test_config.yaml
echo "========================================"
# Run kraken with cerberus unhealthy (should detect unhealthy but not exit due to exit_on_failure=false)
echo "Running kraken with cerberus unhealthy integration..."
# We expect this to complete (not exit 1) because exit_on_failure is false
# But cerberus should log that the cluster is unhealthy
python3 -m coverage run -a run_kraken.py -c CI/config/cerberus_unhealthy_test_config.yaml || {
exit_code=$?
echo "Kraken exited with code: $exit_code"
# If exit_code is 1, that's expected when cerberus reports unhealthy and exit_on_failure would be true
# But since we set exit_on_failure=false, it should not exit
if [ $exit_code -eq 1 ]; then
echo "WARNING: Kraken exited with 1, which may indicate cerberus detected unhealthy cluster"
fi
}
# Verify cerberus was called by checking mock cerberus logs
echo "Checking mock cerberus unhealthy logs..."
kubectl logs -l app=mock-cerberus-unhealthy --tail=50
# Cleanup
echo "Cleaning up mock cerberus unhealthy..."
kubectl delete -f CI/templates/mock_cerberus_unhealthy.yaml || true
echo "========================================"
echo "Cerberus unhealthy functional test: Success"
echo "========================================"
}
functional_test_cerberus_unhealthy

View File

@@ -0,0 +1,165 @@
set -xeEo pipefail
source CI/tests/common.sh
trap error ERR
trap finish EXIT
function functional_test_node_network_chaos {
echo "Starting node network chaos functional test"
# Get a worker node
get_node
export TARGET_NODE=$(echo $WORKER_NODE | awk '{print $1}')
echo "Target node: $TARGET_NODE"
# Deploy nginx workload on the target node
echo "Deploying nginx workload on $TARGET_NODE..."
kubectl create deployment nginx-node-net-chaos --image=nginx:latest
# Add node selector to ensure pod runs on target node
kubectl patch deployment nginx-node-net-chaos -p '{"spec":{"template":{"spec":{"nodeSelector":{"kubernetes.io/hostname":"'$TARGET_NODE'"}}}}}'
# Expose service
kubectl expose deployment nginx-node-net-chaos --port=80 --target-port=80 --name=nginx-node-net-chaos-svc
# Wait for nginx to be ready
echo "Waiting for nginx pod to be ready on $TARGET_NODE..."
kubectl wait --for=condition=ready pod -l app=nginx-node-net-chaos --timeout=120s
# Verify pod is on correct node
export POD_NAME=$(kubectl get pods -l app=nginx-node-net-chaos -o jsonpath='{.items[0].metadata.name}')
export POD_NODE=$(kubectl get pod $POD_NAME -o jsonpath='{.spec.nodeName}')
echo "Pod $POD_NAME is running on node $POD_NODE"
if [ "$POD_NODE" != "$TARGET_NODE" ]; then
echo "ERROR: Pod is not on target node (expected $TARGET_NODE, got $POD_NODE)"
kubectl get pods -l app=nginx-node-net-chaos -o wide
exit 1
fi
# Setup port-forward to access nginx
echo "Setting up port-forward to nginx service..."
kubectl port-forward service/nginx-node-net-chaos-svc 8091:80 &
PORT_FORWARD_PID=$!
sleep 3 # Give port-forward time to start
# Test baseline connectivity
echo "Testing baseline connectivity..."
response=$(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://localhost:8091 || echo "000")
if [ "$response" != "200" ]; then
echo "ERROR: Nginx not responding correctly (got $response, expected 200)"
kubectl get pods -l app=nginx-node-net-chaos
kubectl describe pod $POD_NAME
exit 1
fi
echo "Baseline test passed: nginx responding with 200"
# Measure baseline latency
echo "Measuring baseline latency..."
baseline_start=$(date +%s%3N)
curl -s http://localhost:8091 > /dev/null || true
baseline_end=$(date +%s%3N)
baseline_latency=$((baseline_end - baseline_start))
echo "Baseline latency: ${baseline_latency}ms"
# Configure node network chaos scenario
echo "Configuring node network chaos scenario..."
yq -i '.[0].config.target="'$TARGET_NODE'"' scenarios/kube/node-network-chaos.yml
yq -i '.[0].config.namespace="default"' scenarios/kube/node-network-chaos.yml
yq -i '.[0].config.test_duration=20' scenarios/kube/node-network-chaos.yml
yq -i '.[0].config.latency="200ms"' scenarios/kube/node-network-chaos.yml
yq -i '.[0].config.loss=15' scenarios/kube/node-network-chaos.yml
yq -i '.[0].config.bandwidth="10mbit"' scenarios/kube/node-network-chaos.yml
yq -i '.[0].config.ingress=true' scenarios/kube/node-network-chaos.yml
yq -i '.[0].config.egress=true' scenarios/kube/node-network-chaos.yml
yq -i '.[0].config.force=false' scenarios/kube/node-network-chaos.yml
yq -i 'del(.[0].config.interfaces)' scenarios/kube/node-network-chaos.yml
# Prepare krkn config
export scenario_type="network_chaos_ng_scenarios"
export scenario_file="scenarios/kube/node-network-chaos.yml"
export post_config=""
envsubst < CI/config/common_test_config.yaml > CI/config/node_network_chaos_config.yaml
# Run krkn in background
echo "Starting krkn with node network chaos scenario..."
python3 -m coverage run -a run_kraken.py -c CI/config/node_network_chaos_config.yaml &
KRKN_PID=$!
echo "Krkn started with PID: $KRKN_PID"
# Wait for chaos to start (give it time to inject chaos)
echo "Waiting for chaos injection to begin..."
sleep 10
# Test during chaos - check for increased latency or packet loss effects
echo "Testing network behavior during chaos..."
chaos_test_count=0
chaos_success=0
for i in {1..5}; do
chaos_test_count=$((chaos_test_count + 1))
chaos_start=$(date +%s%3N)
response=$(curl -s -o /dev/null -w "%{http_code}" --max-time 10 http://localhost:8091 || echo "000")
chaos_end=$(date +%s%3N)
chaos_latency=$((chaos_end - chaos_start))
echo "Attempt $i: HTTP $response, latency: ${chaos_latency}ms"
# We expect either increased latency or some failures due to packet loss
if [ "$response" == "200" ] || [ "$response" == "000" ]; then
chaos_success=$((chaos_success + 1))
fi
sleep 2
done
echo "Chaos test results: $chaos_success/$chaos_test_count requests processed"
# Verify node-level chaos affects pod
echo "Verifying node-level chaos affects pod on $TARGET_NODE..."
# The node chaos should affect all pods on the node
# Wait for krkn to complete
echo "Waiting for krkn to complete..."
wait $KRKN_PID || true
echo "Krkn completed"
# Wait a bit for cleanup
sleep 5
# Verify recovery - nginx should respond normally again
echo "Verifying service recovery..."
recovery_attempts=0
max_recovery_attempts=10
while [ $recovery_attempts -lt $max_recovery_attempts ]; do
recovery_attempts=$((recovery_attempts + 1))
response=$(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://localhost:8091 || echo "000")
if [ "$response" == "200" ]; then
echo "Recovery verified: nginx responding normally (attempt $recovery_attempts)"
break
fi
echo "Recovery attempt $recovery_attempts/$max_recovery_attempts: got $response, retrying..."
sleep 3
done
if [ "$response" != "200" ]; then
echo "ERROR: Service did not recover after chaos (got $response)"
kubectl get pods -l app=nginx-node-net-chaos
kubectl describe pod $POD_NAME
exit 1
fi
# Cleanup
echo "Cleaning up test resources..."
kill $PORT_FORWARD_PID 2>/dev/null || true
kubectl delete deployment nginx-node-net-chaos --ignore-not-found=true
kubectl delete service nginx-node-net-chaos-svc --ignore-not-found=true
echo "Node network chaos test: Success"
}
functional_test_node_network_chaos

View File

@@ -7,14 +7,15 @@ trap finish EXIT
function functional_test_pod_crash {
export scenario_type="pod_disruption_scenarios"
export scenario_file="scenarios/kind/pod_etcd.yml"
export scenario_file="scenarios/kind/pod_path_provisioner.yml"
export post_config=""
envsubst < CI/config/common_test_config.yaml > CI/config/pod_config.yaml
python3 -m coverage run -a run_kraken.py -c CI/config/pod_config.yaml
echo "Pod disruption scenario test: Success"
date
kubectl get pods -n kube-system -l component=etcd -o yaml
kubectl get pods -n local-path-storage -l app=local-path-provisioner -o yaml
}
functional_test_pod_crash

View File

@@ -1,4 +1,5 @@
source CI/tests/common.sh
trap error ERR
@@ -8,7 +9,9 @@ function functional_test_pod_error {
export scenario_type="pod_disruption_scenarios"
export scenario_file="scenarios/kind/pod_etcd.yml"
export post_config=""
# this test will check if krkn exits with an error when too many pods are targeted
yq -i '.[0].config.kill=5' scenarios/kind/pod_etcd.yml
yq -i '.[0].config.krkn_pod_recovery_time=1' scenarios/kind/pod_etcd.yml
envsubst < CI/config/common_test_config.yaml > CI/config/pod_config.yaml
cat CI/config/pod_config.yaml

View File

@@ -0,0 +1,143 @@
set -xeEo pipefail
source CI/tests/common.sh
trap error ERR
trap finish EXIT
function functional_test_pod_network_chaos {
echo "Starting pod network chaos functional test"
# Deploy nginx workload
echo "Deploying nginx workload..."
kubectl create deployment nginx-pod-net-chaos --image=nginx:latest
kubectl expose deployment nginx-pod-net-chaos --port=80 --target-port=80 --name=nginx-pod-net-chaos-svc
# Wait for nginx to be ready
echo "Waiting for nginx pod to be ready..."
kubectl wait --for=condition=ready pod -l app=nginx-pod-net-chaos --timeout=120s
# Get pod name
export POD_NAME=$(kubectl get pods -l app=nginx-pod-net-chaos -o jsonpath='{.items[0].metadata.name}')
echo "Target pod: $POD_NAME"
# Setup port-forward to access nginx
echo "Setting up port-forward to nginx service..."
kubectl port-forward service/nginx-pod-net-chaos-svc 8090:80 &
PORT_FORWARD_PID=$!
sleep 3 # Give port-forward time to start
# Test baseline connectivity
echo "Testing baseline connectivity..."
response=$(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://localhost:8090 || echo "000")
if [ "$response" != "200" ]; then
echo "ERROR: Nginx not responding correctly (got $response, expected 200)"
kubectl get pods -l app=nginx-pod-net-chaos
kubectl describe pod $POD_NAME
exit 1
fi
echo "Baseline test passed: nginx responding with 200"
# Measure baseline latency
echo "Measuring baseline latency..."
baseline_start=$(date +%s%3N)
curl -s http://localhost:8090 > /dev/null || true
baseline_end=$(date +%s%3N)
baseline_latency=$((baseline_end - baseline_start))
echo "Baseline latency: ${baseline_latency}ms"
# Configure pod network chaos scenario
echo "Configuring pod network chaos scenario..."
yq -i '.[0].config.target="'$POD_NAME'"' scenarios/kube/pod-network-chaos.yml
yq -i '.[0].config.namespace="default"' scenarios/kube/pod-network-chaos.yml
yq -i '.[0].config.test_duration=20' scenarios/kube/pod-network-chaos.yml
yq -i '.[0].config.latency="200ms"' scenarios/kube/pod-network-chaos.yml
yq -i '.[0].config.loss=15' scenarios/kube/pod-network-chaos.yml
yq -i '.[0].config.bandwidth="10mbit"' scenarios/kube/pod-network-chaos.yml
yq -i '.[0].config.ingress=true' scenarios/kube/pod-network-chaos.yml
yq -i '.[0].config.egress=true' scenarios/kube/pod-network-chaos.yml
yq -i 'del(.[0].config.interfaces)' scenarios/kube/pod-network-chaos.yml
# Prepare krkn config
export scenario_type="network_chaos_ng_scenarios"
export scenario_file="scenarios/kube/pod-network-chaos.yml"
export post_config=""
envsubst < CI/config/common_test_config.yaml > CI/config/pod_network_chaos_config.yaml
# Run krkn in background
echo "Starting krkn with pod network chaos scenario..."
python3 -m coverage run -a run_kraken.py -c CI/config/pod_network_chaos_config.yaml &
KRKN_PID=$!
echo "Krkn started with PID: $KRKN_PID"
# Wait for chaos to start (give it time to inject chaos)
echo "Waiting for chaos injection to begin..."
sleep 10
# Test during chaos - check for increased latency or packet loss effects
echo "Testing network behavior during chaos..."
chaos_test_count=0
chaos_success=0
for i in {1..5}; do
chaos_test_count=$((chaos_test_count + 1))
chaos_start=$(date +%s%3N)
response=$(curl -s -o /dev/null -w "%{http_code}" --max-time 10 http://localhost:8090 || echo "000")
chaos_end=$(date +%s%3N)
chaos_latency=$((chaos_end - chaos_start))
echo "Attempt $i: HTTP $response, latency: ${chaos_latency}ms"
# We expect either increased latency or some failures due to packet loss
if [ "$response" == "200" ] || [ "$response" == "000" ]; then
chaos_success=$((chaos_success + 1))
fi
sleep 2
done
echo "Chaos test results: $chaos_success/$chaos_test_count requests processed"
# Wait for krkn to complete
echo "Waiting for krkn to complete..."
wait $KRKN_PID || true
echo "Krkn completed"
# Wait a bit for cleanup
sleep 5
# Verify recovery - nginx should respond normally again
echo "Verifying service recovery..."
recovery_attempts=0
max_recovery_attempts=10
while [ $recovery_attempts -lt $max_recovery_attempts ]; do
recovery_attempts=$((recovery_attempts + 1))
response=$(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://localhost:8090 || echo "000")
if [ "$response" == "200" ]; then
echo "Recovery verified: nginx responding normally (attempt $recovery_attempts)"
break
fi
echo "Recovery attempt $recovery_attempts/$max_recovery_attempts: got $response, retrying..."
sleep 3
done
if [ "$response" != "200" ]; then
echo "ERROR: Service did not recover after chaos (got $response)"
kubectl get pods -l app=nginx-pod-net-chaos
kubectl describe pod $POD_NAME
exit 1
fi
# Cleanup
echo "Cleaning up test resources..."
kill $PORT_FORWARD_PID 2>/dev/null || true
kubectl delete deployment nginx-pod-net-chaos --ignore-not-found=true
kubectl delete service nginx-pod-net-chaos-svc --ignore-not-found=true
echo "Pod network chaos test: Success"
}
functional_test_pod_network_chaos

View File

@@ -19,12 +19,12 @@ function functional_test_telemetry {
yq -i '.telemetry.run_tag=env(RUN_TAG)' CI/config/common_test_config.yaml
export scenario_type="pod_disruption_scenarios"
export scenario_file="scenarios/kind/pod_etcd.yml"
export scenario_file="scenarios/kind/pod_path_provisioner.yml"
export post_config=""
envsubst < CI/config/common_test_config.yaml > CI/config/telemetry.yaml
retval=$(python3 -m coverage run -a run_kraken.py -c CI/config/telemetry.yaml)
RUN_FOLDER=`cat CI/out/test_telemetry.out | grep amazonaws.com | sed -rn "s#.*https:\/\/.*\/files/(.*)#\1#p"`
RUN_FOLDER=`cat CI/out/test_telemetry.out | grep amazonaws.com | sed -rn "s#.*https:\/\/.*\/files/(.*)#\1#p" | sed 's/\x1b\[[0-9;]*m//g'`
$AWS_CLI s3 ls "s3://$AWS_BUCKET/$RUN_FOLDER/" | awk '{ print $4 }' > s3_remote_files
echo "checking if telemetry files are uploaded on s3"
cat s3_remote_files | grep critical-alerts-00.log || ( echo "FAILED: critical-alerts-00.log not uploaded" && exit 1 )

View File

@@ -0,0 +1,175 @@
# Adding a New Scenario Test (CI/tests_v2)
This guide explains how to add a new chaos scenario test to the v2 pytest framework. The layout is **folder-per-scenario**: each scenario has its own directory under `scenarios/<scenario_name>/` containing the test file, Kubernetes resources, and the Krkn scenario base YAML.
## Option 1: Scaffold script (recommended)
From the **repository root**:
```bash
python CI/tests_v2/scaffold.py --scenario service_hijacking
```
This creates:
- `CI/tests_v2/scenarios/service_hijacking/test_service_hijacking.py` — A test class extending `BaseScenarioTest` with a stub `test_happy_path` and `WORKLOAD_MANIFEST` pointing to the folder's `resource.yaml`.
- `CI/tests_v2/scenarios/service_hijacking/resource.yaml` — A placeholder Deployment (namespace is patched at deploy time).
- `CI/tests_v2/scenarios/service_hijacking/scenario_base.yaml` — A placeholder Krkn scenario; edit this with the structure expected by your scenario type.
The script automatically registers the marker in `CI/tests_v2/pytest.ini`. For example, it adds:
```
service_hijacking: marks a test as a service_hijacking scenario test
```
**Next steps after scaffolding:**
1. Verify the marker was added to `pytest.ini` (the scaffold does this automatically).
2. Edit `scenario_base.yaml` with the structure your Krkn scenario type expects (see `scenarios/application_outage/scenario_base.yaml` and `scenarios/pod_disruption/scenario_base.yaml` for examples). The top-level key should match `SCENARIO_NAME`.
3. If your scenario uses a **list** structure (like pod_disruption) instead of a **dict** with a top-level key, set `NAMESPACE_KEY_PATH` (e.g. `[0, "config", "namespace_pattern"]`) and `NAMESPACE_IS_REGEX = True` if the namespace is a regex pattern; a sketch of this key-path addressing follows after this list.
4. The generated `test_happy_path` already uses `self.run_scenario(self.tmp_path, ns)` and assertions. Add more test methods (e.g. negative tests with `@pytest.mark.no_workload`) as needed.
5. Adjust `resource.yaml` if your scenario needs a different workload (e.g. specific image or labels).
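For the list-structured case in step 3, `NAMESPACE_KEY_PATH` is simply a sequence of list indices and dict keys walked from the document root. A minimal, hypothetical sketch of that addressing (the `set_by_key_path` helper and the example YAML are illustrative only, not the framework's internals; `load_and_patch_scenario` does the real work):
```python
# Hypothetical illustration of how a key path such as [0, "config", "namespace_pattern"]
# addresses a field in a list-structured scenario_base.yaml. Not the framework's code.
import yaml

def set_by_key_path(doc, key_path, value):
    """Walk dict keys / list indices and set the final field to `value`."""
    node = doc
    for key in key_path[:-1]:
        node = node[key]
    node[key_path[-1]] = value
    return doc

scenario_text = """
- config:
    namespace_pattern: ^default$
    krkn_pod_recovery_time: 1
"""
doc = yaml.safe_load(scenario_text)
# NAMESPACE_IS_REGEX = True: the patched value is a regex anchored to the test namespace.
set_by_key_path(doc, [0, "config", "namespace_pattern"], "^krkn-test-a1b2c3d4$")
print(yaml.safe_dump(doc))
```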
If your Kraken scenario type string is not `<scenario>_scenarios`, pass it explicitly:
```bash
python CI/tests_v2/scaffold.py --scenario node_disruption --scenario-type node_scenarios
```
## Option 2: Manual setup
1. **Create the scenario folder**
`CI/tests_v2/scenarios/<scenario_name>/`.
2. **Add resource.yaml**
Kubernetes manifest(s) for the workload (Deployment or Pod). Use a distinct label (e.g. `app: <scenario>-target`). Omit `metadata.namespace` or leave it unset; the framework patches it at deploy time.
3. **Add scenario_base.yaml**
The canonical Krkn scenario structure. Tests will load this, patch namespace (and any overrides), write to `tmp_path`, and pass to `build_config`. See existing scenarios for the format your scenario type expects.
4. **Add test_<scenario>.py**
- Import `BaseScenarioTest` from `lib.base` and helpers from `lib.utils` (e.g. `assert_kraken_success`, `get_pods_list`, `scenario_dir` if needed).
- Define a class extending `BaseScenarioTest` with:
- `WORKLOAD_MANIFEST = "CI/tests_v2/scenarios/<scenario_name>/resource.yaml"`
- `WORKLOAD_IS_PATH = True`
- `LABEL_SELECTOR = "app=<label>"`
- `SCENARIO_NAME = "<scenario_name>"`
- `SCENARIO_TYPE = "<scenario_type>"` (e.g. `application_outages_scenarios`)
- `NAMESPACE_KEY_PATH`: path to the namespace field (e.g. `["application_outage", "namespace"]` for dict-based, or `[0, "config", "namespace_pattern"]` for list-based)
- `NAMESPACE_IS_REGEX = False` (or `True` for regex patterns like pod_disruption)
- `OVERRIDES_KEY_PATH = ["<top-level key>"]` if the scenario supports overrides (e.g. duration, block).
- Add `@pytest.mark.functional` and `@pytest.mark.<scenario>` on the class.
- In at least one test, call `self.run_scenario(self.tmp_path, self.ns)` and assert with `assert_kraken_success`, `assert_pod_count_unchanged`, and `assert_all_pods_running_and_ready`. Use `self.k8s_core`, `self.tmp_path`, etc. (injected by the base class). A hedged class sketch follows after this list.
5. **Register the marker**
In `CI/tests_v2/pytest.ini`, under `markers`:
```
<scenario>: marks a test as a <scenario> scenario test
```
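Pulling steps 2 through 4 together, a class for the scaffolded `service_hijacking` example from Option 1 might look roughly like this. This is a hedged sketch, not the framework's actual test: the label, key path, scenario type string, and helper argument order are placeholders to adapt to your scenario.
```python
# Hedged sketch of a new scenario test class; attribute values marked below are
# placeholders, and helper usage follows the examples given in this guide.
import pytest

from lib.base import BaseScenarioTest
from lib.utils import assert_all_pods_running_and_ready, assert_kraken_success


@pytest.mark.functional
@pytest.mark.service_hijacking
class TestServiceHijacking(BaseScenarioTest):
    WORKLOAD_MANIFEST = "CI/tests_v2/scenarios/service_hijacking/resource.yaml"
    WORKLOAD_IS_PATH = True
    LABEL_SELECTOR = "app=service-hijacking-target"          # placeholder label
    SCENARIO_NAME = "service_hijacking"
    SCENARIO_TYPE = "service_hijacking_scenarios"            # adapt if the type differs
    NAMESPACE_KEY_PATH = ["service_hijacking", "namespace"]  # placeholder key path
    NAMESPACE_IS_REGEX = False

    def test_happy_path(self):
        # The base class has already created the ephemeral namespace (self.ns)
        # and deployed resource.yaml into it before this method runs.
        result = self.run_scenario(self.tmp_path, self.ns)
        assert_kraken_success(result, context=f"namespace={self.ns}", tmp_path=self.tmp_path)
        # Argument order here is illustrative; see lib/utils.py for the real signature.
        assert_all_pods_running_and_ready(self.k8s_core, self.ns, self.LABEL_SELECTOR)
```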
## Conventions
- **Folder-per-scenario**: One directory per scenario under `scenarios/`. All assets (test, resource.yaml, scenario_base.yaml, and any extra YAMLs) live there for easy tracking and onboarding.
- **Ephemeral namespace**: Every test gets a unique `krkn-test-<uuid>` namespace. The base class deploys the workload into it before the test; no manual deploy is required.
- **Negative tests**: For tests that don't need a workload (e.g. invalid scenario, bad namespace), use `@pytest.mark.no_workload`. The test will still get a namespace, but no workload will be deployed.
- **Scenario type**: `SCENARIO_TYPE` must match the key in Kraken's config (e.g. `application_outages_scenarios`, `pod_disruption_scenarios`). See `CI/tests_v2/config/common_test_config.yaml` and the scenario plugin's `get_scenario_types()`.
- **Assertions**: Use `assert_kraken_success(result, context=f"namespace={ns}", tmp_path=self.tmp_path)` so failures include stdout/stderr and optional log files.
- **Timeouts**: Use constants from `lib.base` (`READINESS_TIMEOUT`, `POLICY_WAIT_TIMEOUT`, etc.) instead of magic numbers.
## Exit Code Handling
Kraken uses the following exit codes: **0** = success; **1** = scenario failure (e.g. post scenarios still failing); **2** = critical alerts fired; **3+** = health check / KubeVirt check failures; **-1** = infrastructure error (bad config, no kubeconfig).
- **Happy-path tests**: Use `assert_kraken_success(result, ...)`. By default only exit code 0 is accepted.
- **Alert-aware tests**: If you enable `check_critical_alerts` and expect alerts, use `assert_kraken_success(result, allowed_codes=(0, 2), ...)` so exit code 2 is treated as acceptable.
- **Expected-failure tests**: Use `assert_kraken_failure(result, context=..., tmp_path=self.tmp_path)` for negative tests (invalid scenario, bad namespace, etc.). This gives the same diagnostic quality (log dump, tmp_path hint) as success assertions. Prefer this over a bare `assert result.returncode != 0`.
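As hedged method sketches (keyword names follow the examples in this section; in practice these methods live on your scenario class with its attributes set), the three styles look like:
```python
# Hedged sketches of the three assertion styles; illustrative only.
import pytest

from lib.base import BaseScenarioTest
from lib.utils import assert_kraken_failure, assert_kraken_success


class TestExitCodeStyles(BaseScenarioTest):  # illustrative; reuse your scenario class

    def test_happy_path(self):
        result = self.run_scenario(self.tmp_path, self.ns)
        # Default: only exit code 0 is accepted.
        assert_kraken_success(result, context=f"namespace={self.ns}", tmp_path=self.tmp_path)

    def test_alerts_expected(self):
        # With check_critical_alerts enabled and alerts expected, exit code 2 is acceptable.
        result = self.run_scenario(self.tmp_path, self.ns)
        assert_kraken_success(
            result, allowed_codes=(0, 2),
            context=f"namespace={self.ns}", tmp_path=self.tmp_path,
        )

    @pytest.mark.no_workload
    def test_bad_namespace_fails(self):
        # Expected failure: prefer assert_kraken_failure over a bare returncode check.
        result = self.run_scenario(self.tmp_path, "does-not-exist")
        assert_kraken_failure(result, context="namespace=does-not-exist", tmp_path=self.tmp_path)
```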
## Running your new tests
```bash
pytest CI/tests_v2/ -v -m <scenario>
```
For debugging with logs and keeping failed namespaces:
```bash
pytest CI/tests_v2/ -v -m <scenario> --log-cli-level=DEBUG --keep-ns-on-fail
```
---
## Naming Conventions
Follow these conventions so the framework stays consistent as new scenarios are added.
### Quick Reference
| Element | Pattern | Example |
|---|---|---|
| Scenario folder | `scenarios/<snake_case>/` | `scenarios/node_disruption/` |
| Test file | `test_<scenario>.py` | `test_node_disruption.py` |
| Test class | `Test<CamelCase>(BaseScenarioTest)` | `TestNodeDisruption` |
| Pytest marker | `@pytest.mark.<scenario>` (matches folder) | `@pytest.mark.node_disruption` |
| Scenario YAML | `scenario_base.yaml` | — |
| Workload YAML | `resource.yaml` | — |
| Extra YAMLs | `<descriptive_name>.yaml` | `nginx_http.yaml` |
| Lib modules | `lib/<concern>.py` | `lib/deploy.py` |
| Public fixtures | `<verb>_<noun>` or `<noun>` | `run_kraken`, `test_namespace` |
| Private/autouse fixtures | `_<descriptive>` | `_cleanup_stale_namespaces` |
| Assertion helpers | `assert_<condition>` | `assert_pod_count_unchanged` |
| Query helpers | `get_<resource>` or `find_<resource>_by_<criteria>` | `get_pods_list`, `find_network_policy_by_prefix` |
| Env var overrides | `KRKN_TEST_<NAME>` | `KRKN_TEST_READINESS_TIMEOUT` |
### Folders
- One folder per scenario under `scenarios/`. The folder name is `snake_case` and must match the `SCENARIO_NAME` class attribute in the test.
- Shared framework code lives in `lib/`. Each module covers a single concern (`k8s`, `namespace`, `deploy`, `kraken`, `utils`, `base`, `preflight`).
- Do **not** add scenario-specific code to `lib/`; keep it in the scenario folder as module-level helpers.
### Files
- Test files: `test_<scenario>.py`. This is required for pytest discovery (`test_*.py`).
- Workload manifests: always `resource.yaml`. If a scenario needs additional K8s resources (e.g. a Service for traffic testing), use a descriptive name like `nginx_http.yaml`.
- Scenario config: always `scenario_base.yaml`. This is the template that `load_and_patch_scenario` loads and patches.
### Classes
- One test class per file: `Test<CamelCase>` extending `BaseScenarioTest`.
- The name after `Test` is the PascalCase equivalent of the folder name (e.g. `pod_disruption` -> `TestPodDisruption`).
### Test Methods
- Prefix: `test_` (pytest requirement).
- Use descriptive names that convey **what is being verified**, not implementation details.
- Good: `test_pod_crash_and_recovery`, `test_traffic_blocked_during_outage`, `test_invalid_scenario_fails`.
- Avoid: `test_run_1`, `test_scenario`, `test_it_works`.
### Fixtures
- **Public fixtures** (intended for use in tests): use `<verb>_<noun>` or plain `<noun>`. Examples: `run_kraken`, `deploy_workload`, `test_namespace`, `kubectl`.
- **Private/autouse fixtures** (framework internals): prefix with `_`. Examples: `_kube_config_loaded`, `_preflight_checks`, `_inject_common_fixtures`.
- K8s client fixtures use the `k8s_` prefix: `k8s_core`, `k8s_apps`, `k8s_networking`, `k8s_client`.
### Helpers and Utilities
- **Assertions**: `assert_<what_is_expected>`. Always raise `AssertionError` with a message that includes the namespace.
- **K8s queries**: `get_<resource>_list` for direct API calls, `find_<resource>_by_<criteria>` for filtered lookups.
- **Private helpers**: prefix with `_` for module-internal functions (e.g. `_pods`, `_policies`, `_get_nested`).
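A hedged sketch of an assertion helper following these conventions (the real helpers live in `lib/utils.py`; this signature and message format are illustrative):
```python
# Hedged sketch of an assertion helper; signature and message format are illustrative.
def assert_pod_count_unchanged(before: int, after: int, namespace: str) -> None:
    """Raise AssertionError with a namespace-bearing message if the counts differ."""
    assert before == after, (
        f"Pod count changed in namespace={namespace}: before={before}, after={after}"
    )
```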
### Constants and Environment Variables
- Timeout constants: `UPPER_CASE` in `lib/base.py`. Each is overridable via an env var prefixed `KRKN_TEST_`.
- Feature flags: `KRKN_TEST_DRY_RUN`, `KRKN_TEST_COVERAGE`. Always use the `KRKN_TEST_` prefix so all tunables are discoverable with `grep KRKN_TEST_`.
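A minimal sketch of what an env-overridable constant can look like (the actual definitions live in `lib/base.py`; the default values here are illustrative):
```python
# Hedged sketch: env-overridable timeout constants. Defaults are illustrative;
# the real values are defined in lib/base.py.
import os

def _int_from_env(name: str, default: int) -> int:
    """Return the integer override from the environment, or the default."""
    return int(os.environ.get(name, default))

# Each tunable stays discoverable with `grep KRKN_TEST_`.
READINESS_TIMEOUT = _int_from_env("KRKN_TEST_READINESS_TIMEOUT", 120)
POLICY_WAIT_TIMEOUT = _int_from_env("KRKN_TEST_POLICY_WAIT_TIMEOUT", 60)
```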
### Markers
- Every test class gets `@pytest.mark.functional` (framework-wide) and `@pytest.mark.<scenario>` (scenario-specific).
- The scenario marker name matches the folder name exactly.
- Behavioral modifiers use plain descriptive names: `no_workload`, `order`.
- Register all custom markers in `pytest.ini` to avoid warnings.
## Adding Dependencies
- **Runtime (Kraken needs it)**: Add to the **root** `requirements.txt`. Pin a version (e.g. `package==1.2.3` or `package>=1.2,<2`).
- **Test-only (only CI/tests_v2 needs it)**: Add to **`CI/tests_v2/requirements.txt`**. Pin a version there as well.
- After changing either file, run `make setup` (or `make -f CI/tests_v2/Makefile setup`) from the repo root to verify both files install cleanly together.

97
CI/tests_v2/Makefile Normal file
View File

@@ -0,0 +1,97 @@
# CI/tests_v2 functional tests - single entry point.
# Run from repo root: make -f CI/tests_v2/Makefile <target>
# Or from CI/tests_v2: make <target> (REPO_ROOT is resolved automatically).
# Resolve repo root: go to Makefile dir then up two levels (CI/tests_v2 -> repo root)
REPO_ROOT := $(shell cd "$(dir $(firstword $(MAKEFILE_LIST)))" && cd ../.. && pwd)
VENV := $(REPO_ROOT)/venv
PYTHON := $(VENV)/bin/python
PIP := $(VENV)/bin/pip
CLUSTER_NAME ?= ci-krkn
TESTS_DIR := $(REPO_ROOT)/CI/tests_v2
.PHONY: setup preflight test test-fast test-debug test-scenario test-dry-run clean help
help:
@echo "CI/tests_v2 functional tests - usage: make [target]"
@echo ""
@echo "Targets:"
@echo " setup Create venv (if missing), install Python deps, create KinD cluster (kind-config-dev.yml)."
@echo " Run once before first test. Override cluster config: KIND_CONFIG=path make setup"
@echo ""
@echo " preflight Check Python 3.9+, kind, kubectl, Docker, cluster reachability, test deps."
@echo " Invoked automatically by test targets; run standalone to validate environment."
@echo ""
@echo " test Full run: retries (2), timeout 300s, HTML report, JUnit XML, coverage."
@echo " Use for CI or final verification. Output: report.html, results.xml"
@echo ""
@echo " test-fast Quick run: no retries, 120s timeout, no report. For fast local iteration."
@echo ""
@echo " test-debug Debug run: verbose (-s), keep failed namespaces (--keep-ns-on-fail), DEBUG logging."
@echo " Use when investigating failures; inspect kept namespaces with kubectl."
@echo ""
@echo " test-scenario Run only one scenario. Requires SCENARIO=<marker>."
@echo " Example: make test-scenario SCENARIO=pod_disruption"
@echo ""
@echo " test-dry-run Validate scenario plumbing only (no Kraken execution). Sets KRKN_TEST_DRY_RUN=1."
@echo ""
@echo " clean Delete KinD cluster $(CLUSTER_NAME) and remove report.html, results.xml."
@echo ""
@echo " help Show this help."
@echo ""
@echo "Run from repo root: make -f CI/tests_v2/Makefile <target>"
@echo "Or from CI/tests_v2: make <target>"
setup: $(VENV)/.installed
@echo "Running cluster setup..."
$(MAKE) -f $(TESTS_DIR)/Makefile preflight
cd $(REPO_ROOT) && ./CI/tests_v2/setup_env.sh
@echo "Setup complete. Run 'make test' or 'make -f CI/tests_v2/Makefile test' from repo root."
$(VENV)/.installed: $(REPO_ROOT)/requirements.txt $(TESTS_DIR)/requirements.txt
@if [ ! -d "$(VENV)" ]; then python3 -m venv $(VENV); echo "Created venv at $(VENV)"; fi
$(PYTHON) -m pip install -q --upgrade pip
# Root = Kraken runtime; tests_v2 = test-only plugins; both required for functional tests.
$(PIP) install -q -r $(REPO_ROOT)/requirements.txt
$(PIP) install -q -r $(TESTS_DIR)/requirements.txt
@touch $(VENV)/.installed
@echo "Python deps installed."
preflight:
@echo "Preflight: checking Python, tools, and cluster..."
@command -v python3 >/dev/null 2>&1 || { echo "Error: python3 not found."; exit 1; }
@python3 -c "import sys; exit(0 if sys.version_info >= (3, 9) else 1)" || { echo "Error: Python 3.9+ required."; exit 1; }
@command -v kind >/dev/null 2>&1 || { echo "Error: kind not installed."; exit 1; }
@command -v kubectl >/dev/null 2>&1 || { echo "Error: kubectl not installed."; exit 1; }
@docker info >/dev/null 2>&1 || { echo "Error: Docker not running (required for KinD)."; exit 1; }
@if kind get clusters 2>/dev/null | grep -qx "$(CLUSTER_NAME)"; then \
kubectl cluster-info >/dev/null 2>&1 || { echo "Error: Cluster $(CLUSTER_NAME) exists but cluster-info failed."; exit 1; }; \
else \
echo "Note: Cluster $(CLUSTER_NAME) not found. Run 'make setup' to create it."; \
fi
@$(PYTHON) -c "import pytest_rerunfailures, pytest_html, pytest_timeout, pytest_order" 2>/dev/null || \
{ echo "Error: Install test deps with 'make setup' or pip install -r CI/tests_v2/requirements.txt"; exit 1; }
@echo "Preflight OK."
test: preflight
cd $(REPO_ROOT) && KRKN_TEST_COVERAGE=1 $(PYTHON) -m pytest $(TESTS_DIR)/ -v --timeout=300 --reruns=2 --reruns-delay=10 \
--html=$(TESTS_DIR)/report.html -n auto --junitxml=$(TESTS_DIR)/results.xml
test-fast: preflight
cd $(REPO_ROOT) && $(PYTHON) -m pytest $(TESTS_DIR)/ -v -p no:rerunfailures -n auto --timeout=120
test-debug: preflight
cd $(REPO_ROOT) && $(PYTHON) -m pytest $(TESTS_DIR)/ -v -s -p no:rerunfailures --timeout=300 \
--keep-ns-on-fail --log-cli-level=DEBUG
test-scenario: preflight
@if [ -z "$(SCENARIO)" ]; then echo "Error: set SCENARIO=pod_disruption (or application_outage, etc.)"; exit 1; fi
cd $(REPO_ROOT) && $(PYTHON) -m pytest $(TESTS_DIR)/ -v -m "$(SCENARIO)" --timeout=300 --reruns=2 --reruns-delay=10
test-dry-run: preflight
cd $(REPO_ROOT) && KRKN_TEST_DRY_RUN=1 $(PYTHON) -m pytest $(TESTS_DIR)/ -v
clean:
@kind delete cluster --name $(CLUSTER_NAME) 2>/dev/null || true
@rm -f $(TESTS_DIR)/report.html $(TESTS_DIR)/results.xml
@echo "Cleaned cluster and report artifacts."

198
CI/tests_v2/README.md Normal file
View File

@@ -0,0 +1,198 @@
# Pytest Functional Tests (tests_v2)
This directory contains a pytest-based functional test framework that runs **alongside** the existing bash tests in `CI/tests/`. It covers the **pod disruption** and **application outage** scenarios with proper assertions, retries, and reporting.
Each test runs in its **own ephemeral Kubernetes namespace** (`krkn-test-<uuid>`). Before the test, the framework creates the namespace, deploys the target workload, and waits for pods to be ready. After the test, the namespace is deleted (cascading all resources). **You do not need to deploy any workloads manually.**
## Prerequisites
- **KinD cluster** (or any Kubernetes cluster) running with `kubectl` configured (e.g. `KUBECONFIG` or default `~/.kube/config`).
- **Python 3.9+** and main repo deps: `pip install -r requirements.txt`.
Without a cluster, tests that need one will **skip** with a clear message (e.g. *"Could not load kube config"*). No manual workload deployment is required; workloads are deployed automatically into ephemeral namespaces per test.
### Supported clusters
- **KinD** (recommended): Use `make -f CI/tests_v2/Makefile setup` from the repo root. Fastest for local dev; uses a 2-node dev config by default. Override with `KIND_CONFIG=/path/to/kind-config.yml` for a larger cluster.
- **Minikube**: Should work; ensure `kubectl` context is set. Not tested in CI.
- **Remote/cloud cluster**: Tests create and delete namespaces; use with caution. Use `--require-kind` to avoid accidentally running against production (tests will skip unless context is kind/minikube).
### Setting up the cluster
**Option A: Use the setup script (recommended)**
From the repository root, with `kind` and `kubectl` installed:
```bash
# Create KinD cluster (defaults to CI/tests_v2/kind-config-dev.yml; override with KIND_CONFIG=...)
./CI/tests_v2/setup_env.sh
```
Then in the same shell (or after `export KUBECONFIG=~/.kube/config` in another terminal), activate your venv and install Python deps:
```bash
python3 -m venv venv
source venv/bin/activate # or: source venv/Scripts/activate on Windows
pip install -r requirements.txt
pip install -r CI/tests_v2/requirements.txt
```
**Option B: Manual setup**
1. Install [kind](https://kind.sigs.k8s.io/docs/user/quick-start/) and [kubectl](https://kubernetes.io/docs/tasks/tools/).
2. Create a cluster (from repo root):
```bash
kind create cluster --name kind --config kind-config.yml
```
3. Wait for the cluster:
```bash
kubectl wait --for=condition=Ready nodes --all --timeout=120s
```
4. Create a virtualenv, activate it, and install dependencies (as in Option A).
5. Run tests from repo root: `pytest CI/tests_v2/ -v ...`
## Install test dependencies
From the repository root:
```bash
pip install -r CI/tests_v2/requirements.txt
```
This adds `pytest-rerunfailures`, `pytest-html`, `pytest-timeout`, `pytest-order`, and `pytest-xdist` (pytest and coverage come from the main `requirements.txt`).
## Dependency Management
Dependencies are split into two files:
- **Root `requirements.txt`** — Kraken runtime (cloud SDKs, Kubernetes client, krkn-lib, pytest, coverage, etc.). Required to run Kraken.
- **`CI/tests_v2/requirements.txt`** — Test-only pytest plugins (rerunfailures, html, timeout, order, xdist). Not needed by Kraken itself.
**Rule of thumb:** If Kraken needs it at runtime, add to root. If only the functional tests need it, add to `CI/tests_v2/requirements.txt`.
Running `make -f CI/tests_v2/Makefile setup` (or `make setup` from `CI/tests_v2`) creates the venv and installs **both** files automatically; you do not need to install them separately. The Makefile re-installs when either file changes (via the `.installed` sentinel).
## Run tests
All commands below are from the **repository root**.
### Basic run (with retries and HTML report)
```bash
pytest CI/tests_v2/ -v --timeout=300 --reruns=2 --reruns-delay=10 --html=CI/tests_v2/report.html --junitxml=CI/tests_v2/results.xml
```
- Failed tests are **retried up to 2 times** with a 10s delay (configurable in `CI/tests_v2/pytest.ini`).
- Each test has a **5-minute timeout**.
- Open `CI/tests_v2/report.html` in a browser for a detailed report.
### Run in parallel (faster suite)
```bash
pytest CI/tests_v2/ -v -n 4 --timeout=300
```
Ephemeral namespaces make tests parallel-safe; use `-n` with the number of workers (e.g. 4).
### Run without retries (for debugging)
```bash
pytest CI/tests_v2/ -v -p no:rerunfailures
```
### Run with coverage
```bash
python -m coverage run -m pytest CI/tests_v2/ -v
python -m coverage report
```
To append to existing coverage from unit tests, ensure coverage was started with `coverage run -a` for earlier runs, or run the full test suite in one go.
### Run only pod disruption tests
```bash
pytest CI/tests_v2/ -v -m pod_disruption
```
### Run only application outage tests
```bash
pytest CI/tests_v2/ -v -m application_outage
```
### Run with verbose output and no capture
```bash
pytest CI/tests_v2/ -v -s
```
### Keep failed test namespaces for debugging
When a test fails, its ephemeral namespace is normally deleted. To **keep** the namespace so you can inspect pods, logs, and network policies:
```bash
pytest CI/tests_v2/ -v --keep-ns-on-fail
```
On failure, the namespace name is printed (e.g. `[keep-ns-on-fail] Keeping namespace krkn-test-a1b2c3d4 for debugging`). Use `kubectl get pods -n krkn-test-a1b2c3d4` (and similar) to debug, then delete the namespace manually when done.
### Logging and cluster options
- **Structured logging**: Use `--log-cli-level=DEBUG` to see namespace creation, workload deploy, and readiness in the console. Use `--log-file=test.log` to capture logs to a file.
- **Require dev cluster**: To avoid running against the wrong cluster, use `--require-kind`. Tests will skip unless the current kube context cluster name contains "kind" or "minikube".
- **Stale namespace cleanup**: At session start, namespaces matching `krkn-test-*` that are older than 30 minutes are deleted (e.g. from a previous crashed run).
- **Timeout overrides**: Set env vars to tune timeouts (e.g. in CI): `KRKN_TEST_READINESS_TIMEOUT`, `KRKN_TEST_DEPLOY_TIMEOUT`, `KRKN_TEST_NS_CLEANUP_TIMEOUT`, `KRKN_TEST_POLICY_WAIT_TIMEOUT`, `KRKN_TEST_KRAKEN_PROC_WAIT_TIMEOUT`, `KRKN_TEST_TIMEOUT_BUDGET`.
## Architecture
- **Folder-per-scenario**: Each scenario lives under `scenarios/<scenario_name>/` with:
- **test_<scenario>.py** — Test class extending `BaseScenarioTest`; sets `WORKLOAD_MANIFEST`, `SCENARIO_NAME`, `SCENARIO_TYPE`, `NAMESPACE_KEY_PATH`, and optionally `OVERRIDES_KEY_PATH`.
- **resource.yaml** — Kubernetes resources (Deployment/Pod) for the scenario; namespace is patched at deploy time.
- **scenario_base.yaml** — Canonical Krkn scenario; the base class loads it, patches namespace (and overrides), and passes it to Kraken via `run_scenario()`. Optional extra YAMLs (e.g. `nginx_http.yaml` for application_outage) can live in the same folder.
- **lib/**: Shared framework — `lib/base.py` defines `BaseScenarioTest`, timeout constants (env-overridable), and scenario helpers (`load_and_patch_scenario`, `run_scenario`); `lib/utils.py` provides assertion and K8s helpers; `lib/k8s.py` provides K8s client fixtures; `lib/namespace.py` provides namespace lifecycle; `lib/deploy.py` provides `deploy_workload`, `wait_for_pods_running`, `wait_for_deployment_replicas`; `lib/kraken.py` provides `run_kraken`, `build_config` (using `CI/tests_v2/config/common_test_config.yaml`).
- **conftest.py**: Re-exports fixtures from the lib modules and defines `pytest_addoption`, logging, and `repo_root`.
- **Adding a new scenario**: Use the scaffold script (see [CONTRIBUTING_TESTS.md](CONTRIBUTING_TESTS.md)) to create `scenarios/<name>/` with test file, `resource.yaml`, and `scenario_base.yaml`, or copy an existing scenario folder and adapt.
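For example, from the repository root:
```bash
# Dict-based scenario (top-level key in scenario_base.yaml matches the scenario name)
python CI/tests_v2/scaffold.py --scenario service_hijacking

# List-based scenario with an explicit Kraken scenario_type
python CI/tests_v2/scaffold.py --scenario node_disruption --scenario-type node_scenarios --list-based
```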
## What is tested
Each test runs in an isolated ephemeral namespace; workloads are deployed automatically before the test and the namespace is deleted after (unless `--keep-ns-on-fail` is set and the test failed).
- **scenarios/pod_disruption/**
Pod disruption scenario. `resource.yaml` is a deployment with label `app=krkn-pod-disruption-target`; `scenario_base.yaml` is loaded and `namespace_pattern` is patched to the test namespace. The test:
1. Records baseline pod UIDs and restart counts.
2. Runs Kraken with the pod disruption scenario.
3. Asserts that chaos had an effect (UIDs changed or restart count increased).
4. Waits for pods to be Running and all containers Ready.
5. Asserts pod count is unchanged and all pods are healthy.
- **scenarios/application_outage/**
Application outage scenario (block Ingress/Egress to target pods, then restore). `resource.yaml` is the main workload (outage pod); `scenario_base.yaml` is loaded and patched with namespace (and duration/block as needed). Optional `nginx_http.yaml` is used by the traffic test. Tests include:
- **test_app_outage_block_restore_and_variants**: Happy path with default, exclude_label, and block variants (Ingress, Egress, both); Krkn exit 0, pods still Running/Ready.
- **test_network_policy_created_then_deleted**: Policy with prefix `krkn-deny-` appears during run and is gone after.
- **test_traffic_blocked_during_outage** (disabled, planned): Deploys nginx with label `scenario=outage`, port-forwards; during outage curl fails, after run curl succeeds.
- **test_invalid_scenario_fails**: Invalid scenario file (missing `application_outage` key) causes Kraken to exit non-zero.
- **test_bad_namespace_fails**: Scenario targeting a non-existent namespace causes Kraken to exit non-zero.
## Configuration
- **pytest.ini**: Markers (`functional`, `pod_disruption`, `application_outage`, `no_workload`). Use `--timeout=300`, `--reruns=2`, `--reruns-delay=10` on the command line for full runs.
- **conftest.py**: Re-exports fixtures from `lib/k8s.py`, `lib/namespace.py`, `lib/deploy.py`, `lib/kraken.py` (e.g. `test_namespace`, `deploy_workload`, `k8s_core`, `wait_for_pods_running`, `run_kraken`, `build_config`). Configs are built from `CI/tests_v2/config/common_test_config.yaml` with monitoring disabled for local runs. Timeout constants in `lib/base.py` can be overridden via env vars.
- **Cluster access**: Reads and applies use the Kubernetes Python client; `kubectl` is still used for `port-forward` and the preflight `cluster-info` check, while Kraken itself is launched as a Python subprocess of the test run.
- **utils.py**: Pod/network policy helpers and assertion helpers (`assert_all_pods_running_and_ready`, `assert_pod_count_unchanged`, `assert_kraken_success`, `assert_kraken_failure`, `patch_namespace_in_docs`).
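Put together, a minimal scenario test looks roughly like the sketch below (the class name is illustrative; see `scenarios/pod_disruption/test_pod_disruption.py` for the real version):
```python
import pytest

from lib.base import BaseScenarioTest
from lib.utils import (
    assert_all_pods_running_and_ready,
    assert_kraken_success,
    assert_pod_count_unchanged,
    get_pods_list,
)


@pytest.mark.functional
@pytest.mark.pod_disruption
class TestExamplePodDisruption(BaseScenarioTest):
    WORKLOAD_MANIFEST = "CI/tests_v2/scenarios/pod_disruption/resource.yaml"
    LABEL_SELECTOR = "app=krkn-pod-disruption-target"
    SCENARIO_NAME = "pod_disruption"
    SCENARIO_TYPE = "pod_disruption_scenarios"
    NAMESPACE_KEY_PATH = [0, "config", "namespace_pattern"]
    NAMESPACE_IS_REGEX = True

    def test_happy_path(self):
        # self.ns is the ephemeral namespace created and populated by the base class
        before = get_pods_list(self.k8s_core, self.ns, self.LABEL_SELECTOR)
        result = self.run_scenario(self.tmp_path, self.ns)
        assert_kraken_success(result, context=f"namespace={self.ns}", tmp_path=self.tmp_path)
        after = get_pods_list(self.k8s_core, self.ns, self.LABEL_SELECTOR)
        assert_pod_count_unchanged(before, after, namespace=self.ns)
        assert_all_pods_running_and_ready(after, namespace=self.ns)
```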
## Relationship to existing CI
- The **existing** bash tests in `CI/tests/` and `CI/run.sh` are **unchanged**. They continue to run as before in GitHub Actions.
- This framework is **additive**. To run it in CI later, add a separate job or step that runs `pytest CI/tests_v2/ ...` from the repo root.
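A minimal GitHub Actions job might look like the sketch below (the job name, action versions, and Python version are assumptions, not part of this change; the cluster is created by the setup script added here):
```yaml
  functional-tests-v2:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: "3.11"
      - name: Install runtime and test dependencies
        run: pip install -r requirements.txt -r CI/tests_v2/requirements.txt
      - name: Create KinD cluster
        run: ./CI/tests_v2/setup_env.sh
      - name: Run functional tests
        run: pytest CI/tests_v2/ -v --timeout=300 --reruns=2 --reruns-delay=10 --junitxml=CI/tests_v2/results.xml
```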
## Troubleshooting
- **`pytest.skip: Could not load kube config`** — No cluster or bad KUBECONFIG. Run `make -f CI/tests_v2/Makefile setup` (or `make setup` from `CI/tests_v2`) or check `kubectl cluster-info`.
- **KinD cluster creation hangs** — Docker is not running. Start Docker Desktop or run `systemctl start docker`.
- **`Bind for 0.0.0.0:9090 failed: port is already allocated`** — Another process (e.g. Prometheus) is using the port. The default dev config (`kind-config-dev.yml`) no longer maps host ports; if you use `KIND_CONFIG=kind-config.yml` or a custom config with `extraPortMappings`, free the port or switch to `kind-config-dev.yml`.
- **`TimeoutError: Pods did not become ready`** — Slow image pull or node resource limits. Increase `KRKN_TEST_READINESS_TIMEOUT` or check node resources.
- **`ModuleNotFoundError: pytest_rerunfailures`** — Missing test deps. Run `pip install -r CI/tests_v2/requirements.txt` (or `make setup`).
- **Stale `krkn-test-*` namespaces** — Left over from a previous crashed run. They are auto-cleaned at session start (older than 30 min). To remove cluster and reports: `make -f CI/tests_v2/Makefile clean`.
- **Wrong cluster targeted** — Multiple kube contexts. Use `--require-kind` to skip unless context is kind/minikube, or set context explicitly: `kubectl config use-context kind-ci-krkn`.
- **`OSError: [Errno 48] Address already in use` when running tests in parallel** — Kraken normally starts an HTTP status server on port 8081. With `-n auto` (pytest-xdist), multiple Kraken processes would all try to bind to 8081. The test framework disables this server (`publish_kraken_status: False`) in the generated config, so parallel runs should not hit this. If you see it, ensure you're using the framework's `build_config` and not a config that has `publish_kraken_status: True`.
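For reference, `build_config` (see `lib/kraken.py`) applies roughly these overrides on top of `common_test_config.yaml` when generating the per-test config:
```yaml
kraken:
  publish_kraken_status: False   # no status server, so parallel workers don't fight over port 8081
performance_monitoring:
  check_critical_alerts: False
  enable_alerts: False
  enable_metrics: False
elastic:
  enable_elastic: False
tunings:
  wait_duration: 1               # keep the wait between scenarios short for fast test runs
```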


@@ -0,0 +1,74 @@
kraken:
distribution: kubernetes # Distribution can be kubernetes or openshift.
kubeconfig_path: ~/.kube/config # Path to kubeconfig.
exit_on_failure: False # Exit when a post action scenario fails.
publish_kraken_status: True # Can be accessed at http://0.0.0.0:8081
signal_state: RUN # When set to PAUSE, Kraken waits for the RUN signal before running the scenarios; refer to docs/signal.md for more details
signal_address: 0.0.0.0 # Signal listening address
port: 8081 # Signal port
auto_rollback: True # Enable auto rollback for scenarios.
rollback_versions_directory: /tmp/kraken-rollback # Directory to store rollback version files.
chaos_scenarios: # List of policies/chaos scenarios to load.
- $scenario_type: # List of chaos pod scenarios to load.
- $scenario_file
cerberus:
cerberus_enabled: False # Enable it when cerberus is previously installed.
cerberus_url: # When cerberus_enabled is set to True, provide the url where cerberus publishes go/no-go signal.
performance_monitoring:
capture_metrics: False
metrics_profile_path: config/metrics-aggregated.yaml
prometheus_url: # The prometheus url/route is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes.
prometheus_bearer_token: # The bearer token is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes. This is needed to authenticate with prometheus.
uuid: # uuid for the run is generated by default if not set.
enable_alerts: True # Runs the queries specified in the alert profile and displays the info or exits 1 when severity=error
enable_metrics: True
alert_profile: config/alerts.yaml # Path or URL to alert profile with the prometheus queries
metrics_profile: config/metrics-report.yaml
check_critical_alerts: True # When enabled, checks for critical alerts firing post chaos and fails the run if any are found.
tunings:
wait_duration: 6 # Duration to wait between each chaos scenario.
iterations: 1 # Number of times to execute the scenarios.
daemon_mode: False # Iterations are set to infinity which means that the kraken will cause chaos forever.
telemetry:
enabled: False # enables/disables the telemetry collection feature
api_url: https://yvnn4rfoi7.execute-api.us-west-2.amazonaws.com/test #telemetry service endpoint
username: $TELEMETRY_USERNAME # telemetry service username
password: $TELEMETRY_PASSWORD # telemetry service password
prometheus_namespace: 'monitoring' # prometheus namespace
prometheus_pod_name: 'prometheus-kind-prometheus-kube-prome-prometheus-0' # prometheus pod_name
prometheus_container_name: 'prometheus'
prometheus_backup: True # enables/disables prometheus data collection
full_prometheus_backup: False # if set to False, only the /prometheus/wal folder will be downloaded.
backup_threads: 5 # number of telemetry download/upload threads
archive_path: /tmp # local path where the archive files will be temporarily stored
max_retries: 0 # maximum number of upload retries (if 0 will retry forever)
run_tag: '' # if set, this will be appended to the run folder in the bucket (useful to group the runs)
archive_size: 10000 # the size of each prometheus data archive in KB. The lower the archive size, the more archive files will be produced and uploaded.
logs_backup: True
logs_filter_patterns:
- "(\\w{3}\\s\\d{1,2}\\s\\d{2}:\\d{2}:\\d{2}\\.\\d+).+" # Sep 9 11:20:36.123425532
- "kinit (\\d+/\\d+/\\d+\\s\\d{2}:\\d{2}:\\d{2})\\s+" # kinit 2023/09/15 11:20:36 log
- "(\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}\\.\\d+Z).+" # 2023-09-15T11:20:36.123425532Z log
oc_cli_path: /usr/bin/oc # optional, if not specified it will be searched for in $PATH
events_backup: True # enables/disables cluster events collection
telemetry_group: "funtests"
elastic:
enable_elastic: False
verify_certs: False
elastic_url: "https://192.168.39.196" # To track results in elasticsearch, give url to server here; will post telemetry details when url and index not blank
elastic_port: 32766
username: "elastic"
password: "test"
metrics_index: "krkn-metrics"
alerts_index: "krkn-alerts"
telemetry_index: "krkn-telemetry"
health_checks: # Utilizing health check endpoints to observe application behavior during chaos injection.
interval: # Interval in seconds to perform health checks, default value is 2 seconds
config: # Provide list of health check configurations for applications
- url: # Provide application endpoint
bearer_token: # Bearer token for authentication if any
auth: # Provide authentication credentials (username, password) in tuple format if any, e.g. ("admin","secretpassword")
exit_on_failure: # If True, exits when a health check fails for an application; values can be True/False

CI/tests_v2/conftest.py Normal file

@@ -0,0 +1,67 @@
"""
Shared fixtures for pytest functional tests (CI/tests_v2).
Tests must be run from the repository root so run_kraken.py and config paths resolve.
"""
import logging
from pathlib import Path
import pytest
def pytest_addoption(parser):
parser.addoption(
"--keep-ns-on-fail",
action="store_true",
default=False,
help="Don't delete test namespaces on failure (for debugging)",
)
parser.addoption(
"--require-kind",
action="store_true",
default=False,
help="Skip tests unless current context is a known dev cluster (kind, minikube)",
)
@pytest.hookimpl(tryfirst=True, hookwrapper=True)
def pytest_runtest_makereport(item, call):
outcome = yield
rep = outcome.get_result()
setattr(item, f"rep_{rep.when}", rep)
def _repo_root() -> Path:
"""Repository root (directory containing run_kraken.py and CI/)."""
return Path(__file__).resolve().parent.parent.parent
@pytest.fixture(scope="session")
def repo_root():
return _repo_root()
@pytest.fixture(scope="session", autouse=True)
def _configure_logging():
"""Set log format with timestamps for test runs."""
logging.basicConfig(
format="%(asctime)s %(levelname)s [%(name)s] %(message)s",
datefmt="%Y-%m-%d %H:%M:%S",
level=logging.INFO,
)
# Re-export fixtures from lib modules so pytest discovers them
from lib.deploy import deploy_workload, wait_for_pods_running # noqa: E402, F401
from lib.kraken import build_config, run_kraken, run_kraken_background # noqa: E402, F401
from lib.k8s import ( # noqa: E402, F401
_kube_config_loaded,
_log_cluster_context,
k8s_apps,
k8s_client,
k8s_core,
k8s_networking,
kubectl,
)
from lib.namespace import _cleanup_stale_namespaces, test_namespace # noqa: E402, F401
from lib.preflight import _preflight_checks # noqa: E402, F401


@@ -0,0 +1,8 @@
# Lean KinD config for local dev (faster than full 5-node). Use KIND_CONFIG to override.
# No extraPortMappings so setup works when 9090/30080 are in use (e.g. local Prometheus).
# For Prometheus/ES port mapping, use the repo root kind-config.yml.
kind: Cluster
apiVersion: kind.x-k8s.io/v1alpha4
nodes:
- role: control-plane
- role: worker


@@ -0,0 +1,7 @@
# Shared framework for CI/tests_v2 functional tests.
# base: BaseScenarioTest, timeout constants
# utils: assertions, K8s helpers, patch_namespace_in_docs
# k8s: K8s client fixtures, cluster context checks
# namespace: test_namespace, stale namespace cleanup
# deploy: deploy_workload, wait_for_pods_running, wait_for_deployment_replicas
# kraken: run_kraken, run_kraken_background, build_config

CI/tests_v2/lib/base.py Normal file

@@ -0,0 +1,155 @@
"""
Base class for CI/tests_v2 scenario tests.
Encapsulates the shared lifecycle: ephemeral namespace, optional workload deploy, teardown.
"""
import copy
import logging
import os
import subprocess
from pathlib import Path
import pytest
import yaml
from lib.utils import load_scenario_base
logger = logging.getLogger(__name__)
def _get_nested(obj, path):
"""Walk path (list of keys/indices) and return the value. Supports list and dict."""
for key in path:
obj = obj[key]
return obj
def _set_nested(obj, path, value):
"""Walk path to the parent and set the last key to value."""
if not path:
return
parent_path, last_key = path[:-1], path[-1]
parent = obj
for key in parent_path:
parent = parent[key]
parent[last_key] = value
# Timeout constants (seconds). Override via env vars (e.g. KRKN_TEST_READINESS_TIMEOUT).
# Coordinate with pytest-timeout budget (e.g. 300s).
TIMEOUT_BUDGET = int(os.environ.get("KRKN_TEST_TIMEOUT_BUDGET", "300"))
DEPLOY_TIMEOUT = int(os.environ.get("KRKN_TEST_DEPLOY_TIMEOUT", "90"))
READINESS_TIMEOUT = int(os.environ.get("KRKN_TEST_READINESS_TIMEOUT", "90"))
NS_CLEANUP_TIMEOUT = int(os.environ.get("KRKN_TEST_NS_CLEANUP_TIMEOUT", "60"))
POLICY_WAIT_TIMEOUT = int(os.environ.get("KRKN_TEST_POLICY_WAIT_TIMEOUT", "30"))
KRAKEN_PROC_WAIT_TIMEOUT = int(os.environ.get("KRKN_TEST_KRAKEN_PROC_WAIT_TIMEOUT", "60"))
class BaseScenarioTest:
"""
Base class for scenario tests. Subclasses set:
- WORKLOAD_MANIFEST: path (str), or callable(namespace) -> YAML str for inline manifest
- WORKLOAD_IS_PATH: True if WORKLOAD_MANIFEST is a file path, False if inline YAML
- LABEL_SELECTOR: label selector for pods to wait on (e.g. "app=my-target")
- SCENARIO_NAME: e.g. "pod_disruption", "application_outage"
- SCENARIO_TYPE: e.g. "pod_disruption_scenarios", "application_outages_scenarios"
- NAMESPACE_KEY_PATH: path to namespace field, e.g. [0, "config", "namespace_pattern"] or ["application_outage", "namespace"]
- NAMESPACE_IS_REGEX: True to wrap namespace in ^...$
- OVERRIDES_KEY_PATH: path to dict for **overrides (e.g. ["application_outage"]), or [] if none
"""
WORKLOAD_MANIFEST = None
WORKLOAD_IS_PATH = True
LABEL_SELECTOR = None
SCENARIO_NAME = ""
SCENARIO_TYPE = ""
NAMESPACE_KEY_PATH = []
NAMESPACE_IS_REGEX = False
OVERRIDES_KEY_PATH = []
@pytest.fixture(autouse=True)
def _inject_common_fixtures(
self,
repo_root,
tmp_path,
build_config,
run_kraken,
run_kraken_background,
k8s_core,
k8s_apps,
k8s_networking,
k8s_client,
):
"""Inject common fixtures onto self so test methods don't need to declare them."""
self.repo_root = repo_root
self.tmp_path = tmp_path
self.build_config = build_config
self.run_kraken = run_kraken
self.run_kraken_background = run_kraken_background
self.k8s_core = k8s_core
self.k8s_apps = k8s_apps
self.k8s_networking = k8s_networking
self.k8s_client = k8s_client
yield
@pytest.fixture(autouse=True)
def _setup_workload(self, request, repo_root):
if "no_workload" in request.keywords:
request.instance.ns = request.getfixturevalue("test_namespace")
logger.debug("no_workload marker: skipping workload deploy, ns=%s", request.instance.ns)
yield
return
deploy = request.getfixturevalue("deploy_workload")
test_namespace = request.getfixturevalue("test_namespace")
manifest = self.WORKLOAD_MANIFEST
if callable(manifest):
manifest = manifest(test_namespace)
is_path = False
logger.info("Deploying inline workload in ns=%s, label_selector=%s", test_namespace, self.LABEL_SELECTOR)
else:
is_path = self.WORKLOAD_IS_PATH
if is_path and manifest and not Path(manifest).is_absolute():
manifest = repo_root / manifest
logger.info("Deploying workload from %s in ns=%s, label_selector=%s", manifest, test_namespace, self.LABEL_SELECTOR)
ns = deploy(manifest, self.LABEL_SELECTOR, is_path=is_path, timeout=DEPLOY_TIMEOUT)
request.instance.ns = ns
yield
def load_and_patch_scenario(self, repo_root, namespace, **overrides):
"""Load scenario_base.yaml and patch namespace (and overrides). Returns the scenario structure."""
scenario = copy.deepcopy(load_scenario_base(repo_root, self.SCENARIO_NAME))
ns_value = f"^{namespace}$" if self.NAMESPACE_IS_REGEX else namespace
if self.NAMESPACE_KEY_PATH:
_set_nested(scenario, self.NAMESPACE_KEY_PATH, ns_value)
if overrides and self.OVERRIDES_KEY_PATH:
target = _get_nested(scenario, self.OVERRIDES_KEY_PATH)
for key, value in overrides.items():
target[key] = value
return scenario
def write_scenario(self, tmp_path, scenario_data, suffix=""):
"""Write scenario data to a YAML file in tmp_path. Returns the path."""
filename = f"{self.SCENARIO_NAME}_scenario{suffix}.yaml"
path = tmp_path / filename
path.write_text(yaml.dump(scenario_data, default_flow_style=False, sort_keys=False))
return path
def run_scenario(self, tmp_path, namespace, *, overrides=None, config_filename=None):
"""Load, patch, write scenario; build config; run Kraken. Returns CompletedProcess."""
scenario = self.load_and_patch_scenario(self.repo_root, namespace, **(overrides or {}))
scenario_path = self.write_scenario(tmp_path, scenario)
config_path = self.build_config(
self.SCENARIO_TYPE,
str(scenario_path),
filename=config_filename or "test_config.yaml",
)
if os.environ.get("KRKN_TEST_DRY_RUN", "0") == "1":
logger.info(
"[dry-run] Would run Kraken with config=%s, scenario=%s",
config_path,
scenario_path,
)
return subprocess.CompletedProcess(
args=[], returncode=0, stdout="[dry-run] skipped", stderr=""
)
return self.run_kraken(config_path)

CI/tests_v2/lib/deploy.py Normal file

@@ -0,0 +1,145 @@
"""
Workload deploy and pod/deployment readiness fixtures for CI/tests_v2.
"""
import logging
import time
from pathlib import Path
import pytest
import yaml
from kubernetes import utils as k8s_utils
from lib.base import READINESS_TIMEOUT
from lib.utils import patch_namespace_in_docs
logger = logging.getLogger(__name__)
def wait_for_deployment_replicas(k8s_apps, namespace: str, name: str, timeout: int = 120) -> None:
"""
Poll until the deployment has ready_replicas >= spec.replicas.
Raises TimeoutError with diagnostic details on failure.
"""
deadline = time.monotonic() + timeout
last_dep = None
attempts = 0
while time.monotonic() < deadline:
try:
dep = k8s_apps.read_namespaced_deployment(name=name, namespace=namespace)
except Exception as e:
logger.debug("Deployment %s/%s poll attempt %s failed: %s", namespace, name, attempts, e)
time.sleep(2)
attempts += 1
continue
last_dep = dep
ready = dep.status.ready_replicas or 0
desired = dep.spec.replicas or 1
if ready >= desired:
logger.debug("Deployment %s/%s ready (%s/%s)", namespace, name, ready, desired)
return
logger.debug("Deployment %s/%s not ready yet: %s/%s", namespace, name, ready, desired)
time.sleep(2)
attempts += 1
diag = ""
if last_dep is not None and last_dep.status:
diag = f" ready_replicas={last_dep.status.ready_replicas}, desired={last_dep.spec.replicas}"
raise TimeoutError(
f"Deployment {namespace}/{name} did not become ready within {timeout}s.{diag}"
)
@pytest.fixture
def wait_for_pods_running(k8s_core):
"""
Poll until all matching pods are Running and all containers ready.
Uses exponential backoff: 1s, 2s, 4s, ... capped at 10s.
Raises TimeoutError with diagnostic details on failure.
"""
def _wait(namespace: str, label_selector: str, timeout: int = READINESS_TIMEOUT):
deadline = time.monotonic() + timeout
interval = 1.0
max_interval = 10.0
last_list = None
while time.monotonic() < deadline:
try:
pod_list = k8s_core.list_namespaced_pod(
namespace=namespace,
label_selector=label_selector,
)
except Exception:
time.sleep(min(interval, max_interval))
interval = min(interval * 2, max_interval)
continue
last_list = pod_list
items = pod_list.items or []
if not items:
time.sleep(min(interval, max_interval))
interval = min(interval * 2, max_interval)
continue
all_running = all(
(p.status and p.status.phase == "Running") for p in items
)
if not all_running:
time.sleep(min(interval, max_interval))
interval = min(interval * 2, max_interval)
continue
all_ready = True
for p in items:
if not p.status or not p.status.container_statuses:
all_ready = False
break
for cs in p.status.container_statuses:
if not getattr(cs, "ready", False):
all_ready = False
break
if all_ready:
return
time.sleep(min(interval, max_interval))
interval = min(interval * 2, max_interval)
diag = ""
if last_list and last_list.items:
p = last_list.items[0]
diag = f" e.g. pod {p.metadata.name}: phase={getattr(p.status, 'phase', None)}"
raise TimeoutError(
f"Pods in {namespace} with label {label_selector} did not become ready within {timeout}s.{diag}"
)
return _wait
@pytest.fixture(scope="function")
def deploy_workload(test_namespace, k8s_client, wait_for_pods_running, repo_root, tmp_path):
"""
Helper that applies a manifest into the test namespace and waits for pods.
Yields a callable: deploy(manifest_path_or_content, label_selector, *, is_path=True)
which applies the manifest, waits for readiness, and returns the namespace name.
"""
def _deploy(manifest_path_or_content, label_selector, *, is_path=True, timeout=READINESS_TIMEOUT):
try:
if is_path:
path = Path(manifest_path_or_content)
if not path.is_absolute():
path = repo_root / path
with open(path) as f:
docs = list(yaml.safe_load_all(f))
else:
docs = list(yaml.safe_load_all(manifest_path_or_content))
docs = patch_namespace_in_docs(docs, test_namespace)
k8s_utils.create_from_yaml(
k8s_client,
yaml_objects=docs,
namespace=test_namespace,
)
except k8s_utils.FailToCreateError as e:
msgs = [str(exc) for exc in e.api_exceptions]
raise RuntimeError(f"Failed to create resources: {'; '.join(msgs)}") from e
logger.info("Workload applied in namespace=%s, waiting for pods with selector=%s", test_namespace, label_selector)
wait_for_pods_running(test_namespace, label_selector, timeout=timeout)
logger.info("Pods ready in namespace=%s", test_namespace)
return test_namespace
return _deploy

CI/tests_v2/lib/k8s.py Normal file

@@ -0,0 +1,88 @@
"""
Kubernetes client fixtures and cluster context checks for CI/tests_v2.
"""
import logging
import subprocess
from pathlib import Path
import pytest
from kubernetes import client, config
logger = logging.getLogger(__name__)
@pytest.fixture(scope="session")
def _kube_config_loaded():
"""Load kubeconfig once per session. Skips if cluster unreachable."""
try:
config.load_kube_config()
logger.info("Kube config loaded successfully")
except config.ConfigException as e:
logger.warning("Could not load kube config: %s", e)
pytest.skip(f"Could not load kube config (is a cluster running?): {e}")
@pytest.fixture(scope="session")
def k8s_core(_kube_config_loaded):
"""Kubernetes CoreV1Api for pods, etc. Uses default kubeconfig."""
return client.CoreV1Api()
@pytest.fixture(scope="session")
def k8s_networking(_kube_config_loaded):
"""Kubernetes NetworkingV1Api for network policies."""
return client.NetworkingV1Api()
@pytest.fixture(scope="session")
def k8s_client(_kube_config_loaded):
"""Kubernetes ApiClient for create_from_yaml and other generic API calls."""
return client.ApiClient()
@pytest.fixture(scope="session")
def k8s_apps(_kube_config_loaded):
"""Kubernetes AppsV1Api for deployment status polling."""
return client.AppsV1Api()
@pytest.fixture(scope="session", autouse=True)
def _log_cluster_context(request):
"""Log current cluster context at session start; skip if --require-kind and not a dev cluster."""
try:
contexts, active = config.list_kube_config_contexts()
except Exception as e:
logger.warning("Could not list kube config contexts: %s", e)
return
if not active:
return
context_name = active.get("name", "?")
cluster = (active.get("context") or {}).get("cluster", "?")
logger.info("Running tests against cluster: context=%s cluster=%s", context_name, cluster)
if not request.config.getoption("--require-kind", False):
return
cluster_lower = (cluster or "").lower()
if "kind" in cluster_lower or "minikube" in cluster_lower:
return
pytest.skip(
f"Cluster '{cluster}' does not look like kind/minikube. "
"Use default kubeconfig or pass --require-kind only on dev clusters."
)
@pytest.fixture
def kubectl(repo_root):
"""Run kubectl with given args from repo root. Returns CompletedProcess."""
def run(args, timeout=120):
cmd = ["kubectl"] + (args if isinstance(args, list) else list(args))
return subprocess.run(
cmd,
cwd=repo_root,
capture_output=True,
text=True,
timeout=timeout,
)
return run

CI/tests_v2/lib/kraken.py Normal file

@@ -0,0 +1,94 @@
"""
Kraken execution and config building fixtures for CI/tests_v2.
"""
import os
import subprocess
import sys
from pathlib import Path
import pytest
import yaml
def _kraken_cmd(config_path: str, repo_root: Path):
"""Use the same Python as the test process so venv/.venv and coverage match."""
python = sys.executable
if os.environ.get("KRKN_TEST_COVERAGE", "0") == "1":
return [
python, "-m", "coverage", "run", "-a",
"run_kraken.py", "-c", str(config_path),
]
return [python, "run_kraken.py", "-c", str(config_path)]
@pytest.fixture
def run_kraken(repo_root):
"""Run Kraken with the given config path. Returns CompletedProcess. Default timeout 300s."""
def run(config_path, timeout=300, extra_args=None):
cmd = _kraken_cmd(config_path, repo_root)
if extra_args:
cmd.extend(extra_args)
return subprocess.run(
cmd,
cwd=repo_root,
capture_output=True,
text=True,
timeout=timeout,
)
return run
@pytest.fixture
def run_kraken_background(repo_root):
"""Start Kraken in background. Returns Popen. Call proc.terminate() or proc.wait() to stop."""
def start(config_path):
cmd = _kraken_cmd(config_path, repo_root)
return subprocess.Popen(
cmd,
cwd=repo_root,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
)
return start
@pytest.fixture
def build_config(repo_root, tmp_path):
"""
Build a Kraken config from tests_v2's common_test_config.yaml with scenario_type and scenario_file
substituted. Disables Prometheus/Elastic checks for local runs.
Returns the path to the written config file.
"""
common_path = repo_root / "CI" / "tests_v2" / "config" / "common_test_config.yaml"
def _build(scenario_type: str, scenario_file: str, filename: str = "test_config.yaml"):
content = common_path.read_text()
content = content.replace("$scenario_type", scenario_type)
content = content.replace("$scenario_file", scenario_file)
content = content.replace("$post_config", "")
config = yaml.safe_load(content)
if "kraken" in config:
# Disable status server so parallel test workers don't all bind to port 8081
config["kraken"]["publish_kraken_status"] = False
if "performance_monitoring" in config:
config["performance_monitoring"]["check_critical_alerts"] = False
config["performance_monitoring"]["enable_alerts"] = False
config["performance_monitoring"]["enable_metrics"] = False
if "elastic" in config:
config["elastic"]["enable_elastic"] = False
if "tunings" in config:
config["tunings"]["wait_duration"] = 1
out_path = tmp_path / filename
with open(out_path, "w") as f:
yaml.dump(config, f, default_flow_style=False, sort_keys=False)
return str(out_path)
return _build


@@ -0,0 +1,114 @@
"""
Namespace lifecycle fixtures for CI/tests_v2: create, delete, stale cleanup.
"""
import logging
import os
import time
import uuid
from datetime import datetime
import pytest
from kubernetes import client
from kubernetes.client.rest import ApiException
logger = logging.getLogger(__name__)
STALE_NS_AGE_MINUTES = 30
def _namespace_age_minutes(metadata) -> float:
"""Return age of namespace in minutes from its creation_timestamp."""
if not metadata or not metadata.creation_timestamp:
return 0.0
created = metadata.creation_timestamp
if hasattr(created, "timestamp"):
created_ts = created.timestamp()
else:
try:
dt = datetime.fromisoformat(created.replace("Z", "+00:00"))
created_ts = dt.timestamp()
except Exception:
return 0.0
return (time.time() - created_ts) / 60.0
def _wait_for_namespace_gone(k8s_core, name: str, timeout: int = 60):
"""Poll until the namespace no longer exists."""
deadline = time.monotonic() + timeout
while time.monotonic() < deadline:
try:
k8s_core.read_namespace(name=name)
except ApiException as e:
if e.status == 404:
return
raise
time.sleep(1)
raise TimeoutError(f"Namespace {name} did not disappear within {timeout}s")
@pytest.fixture(scope="function")
def test_namespace(request, k8s_core):
"""
Create an ephemeral namespace for the test. Deleted after the test unless
--keep-ns-on-fail is set and the test failed.
"""
name = f"krkn-test-{uuid.uuid4().hex[:8]}"
ns = client.V1Namespace(
metadata=client.V1ObjectMeta(
name=name,
labels={
"pod-security.kubernetes.io/audit": "privileged",
"pod-security.kubernetes.io/enforce": "privileged",
"pod-security.kubernetes.io/enforce-version": "v1.24",
"pod-security.kubernetes.io/warn": "privileged",
"security.openshift.io/scc.podSecurityLabelSync": "false",
},
)
)
k8s_core.create_namespace(body=ns)
logger.info("Created test namespace: %s", name)
yield name
keep_on_fail = request.config.getoption("--keep-ns-on-fail", False)
rep_call = getattr(request.node, "rep_call", None)
failed = rep_call is not None and rep_call.failed
if keep_on_fail and failed:
logger.info("[keep-ns-on-fail] Keeping namespace %s for debugging", name)
return
try:
k8s_core.delete_namespace(
name=name,
body=client.V1DeleteOptions(propagation_policy="Background"),
)
logger.debug("Scheduled background deletion for namespace: %s", name)
except Exception as e:
logger.warning("Failed to delete namespace %s: %s", name, e)
@pytest.fixture(scope="session", autouse=True)
def _cleanup_stale_namespaces(k8s_core):
"""Delete krkn-test-* namespaces older than STALE_NS_AGE_MINUTES at session start."""
if os.environ.get("PYTEST_XDIST_WORKER"):
return
try:
namespaces = k8s_core.list_namespace()
except Exception as e:
logger.warning("Could not list namespaces for stale cleanup: %s", e)
return
for ns in namespaces.items or []:
name = ns.metadata.name if ns.metadata else ""
if not name.startswith("krkn-test-"):
continue
if _namespace_age_minutes(ns.metadata) <= STALE_NS_AGE_MINUTES:
continue
try:
logger.warning("Deleting stale namespace: %s", name)
k8s_core.delete_namespace(
name=name,
body=client.V1DeleteOptions(propagation_policy="Background"),
)
except Exception as e:
logger.warning("Failed to delete stale namespace %s: %s", name, e)


@@ -0,0 +1,48 @@
"""
Preflight checks for CI/tests_v2: cluster reachability and test deps at session start.
"""
import logging
import subprocess
import pytest
logger = logging.getLogger(__name__)
@pytest.fixture(scope="session", autouse=True)
def _preflight_checks(repo_root):
"""
Verify cluster is reachable and test deps are importable at session start.
Skips the session if cluster-info fails or required plugins are missing.
"""
# Check test deps (pytest plugins)
try:
import pytest_rerunfailures # noqa: F401
import pytest_html # noqa: F401
import pytest_timeout # noqa: F401
import pytest_order # noqa: F401
import xdist # noqa: F401
except ImportError as e:
pytest.skip(
f"Missing test dependency: {e}. "
"Run: pip install -r CI/tests_v2/requirements.txt"
)
# Check cluster reachable and log server URL
result = subprocess.run(
["kubectl", "cluster-info"],
cwd=repo_root,
capture_output=True,
text=True,
timeout=10,
)
if result.returncode != 0:
pytest.skip(
f"Cluster not reachable (kubectl cluster-info failed). "
f"Start a cluster (e.g. make setup) or check KUBECONFIG. stderr: {result.stderr or '(none)'}"
)
# Log first line of cluster-info (server URL) for debugging
if result.stdout:
first_line = result.stdout.strip().split("\n")[0]
logger.info("Preflight: %s", first_line)

CI/tests_v2/lib/utils.py Normal file

@@ -0,0 +1,212 @@
"""
Shared helpers for CI/tests_v2 functional tests.
"""
import logging
import time
from pathlib import Path
from typing import List, Optional, Union
import pytest
import yaml
from kubernetes.client import V1NetworkPolicy, V1NetworkPolicyList, V1Pod, V1PodList
logger = logging.getLogger(__name__)
def _pods(pod_list: Union[V1PodList, List[V1Pod]]) -> List[V1Pod]:
"""Normalize V1PodList or list of V1Pod to list of V1Pod."""
return pod_list.items if hasattr(pod_list, "items") else pod_list
def _policies(
policy_list: Union[V1NetworkPolicyList, List[V1NetworkPolicy]],
) -> List[V1NetworkPolicy]:
"""Normalize V1NetworkPolicyList or list to list of V1NetworkPolicy."""
return policy_list.items if hasattr(policy_list, "items") else policy_list
def scenario_dir(repo_root: Path, scenario_name: str) -> Path:
"""Return the path to a scenario folder under CI/tests_v2/scenarios/."""
return repo_root / "CI" / "tests_v2" / "scenarios" / scenario_name
def load_scenario_base(
repo_root: Path,
scenario_name: str,
filename: str = "scenario_base.yaml",
) -> Union[dict, list]:
"""
Load and parse the scenario base YAML for a scenario.
Returns dict or list depending on the YAML structure.
"""
path = scenario_dir(repo_root, scenario_name) / filename
text = path.read_text()
data = yaml.safe_load(text)
if data is None:
raise ValueError(f"Empty or invalid YAML in {path}")
return data
def patch_namespace_in_docs(docs: list, namespace: str) -> list:
"""Override metadata.namespace in each doc so create_from_yaml respects target namespace."""
for doc in docs:
if isinstance(doc, dict) and doc.get("metadata") is not None:
doc["metadata"]["namespace"] = namespace
return docs
def get_pods_list(k8s_core, namespace: str, label_selector: str) -> V1PodList:
"""Return V1PodList from the Kubernetes API."""
return k8s_core.list_namespaced_pod(
namespace=namespace,
label_selector=label_selector,
)
def get_pods_or_skip(
k8s_core,
namespace: str,
label_selector: str,
no_pods_reason: Optional[str] = None,
) -> V1PodList:
"""
Get pods via Kubernetes API or skip if cluster unreachable or no matching pods.
Use at test start when prerequisites may be missing.
no_pods_reason: message when no pods match; if None, a default message is used.
"""
try:
pod_list = k8s_core.list_namespaced_pod(
namespace=namespace,
label_selector=label_selector,
)
except Exception as e:
pytest.skip(f"Cluster unreachable: {e}")
if not pod_list.items or len(pod_list.items) == 0:
reason = (
no_pods_reason
if no_pods_reason
else f"No pods in {namespace} with label {label_selector}. "
"Start a KinD cluster with default storage (local-path-provisioner)."
)
pytest.skip(reason)
return pod_list
def pod_uids(pod_list: Union[V1PodList, List[V1Pod]]) -> list:
"""Return list of pod UIDs from V1PodList or list of V1Pod."""
return [p.metadata.uid for p in _pods(pod_list)]
def restart_counts(pod_list: Union[V1PodList, List[V1Pod]]) -> int:
"""Return total restart count across all containers in V1PodList or list of V1Pod."""
total = 0
for p in _pods(pod_list):
if not p.status or not p.status.container_statuses:
continue
for cs in p.status.container_statuses:
total += getattr(cs, "restart_count", 0)
return total
def get_network_policies_list(k8s_networking, namespace: str) -> V1NetworkPolicyList:
"""Return V1NetworkPolicyList from the Kubernetes API."""
return k8s_networking.list_namespaced_network_policy(namespace=namespace)
def find_network_policy_by_prefix(
policy_list: Union[V1NetworkPolicyList, List[V1NetworkPolicy]],
name_prefix: str,
) -> Optional[V1NetworkPolicy]:
"""Return the first NetworkPolicy whose name starts with name_prefix, or None."""
for policy in _policies(policy_list):
if (
policy.metadata
and policy.metadata.name
and policy.metadata.name.startswith(name_prefix)
):
return policy
return None
def assert_all_pods_running_and_ready(
pod_list: Union[V1PodList, List[V1Pod]],
namespace: str = "",
) -> None:
"""
Assert all pods are Running and all containers Ready.
Include namespace in assertion messages for debugging.
"""
ns_suffix = f" (namespace={namespace})" if namespace else ""
for pod in _pods(pod_list):
assert pod.status and pod.status.phase == "Running", (
f"Pod {pod.metadata.name} not Running after scenario: {pod.status}{ns_suffix}"
)
if pod.status.container_statuses:
for cs in pod.status.container_statuses:
assert getattr(cs, "ready", False) is True, (
f"Container {getattr(cs, 'name', '?')} not ready in pod {pod.metadata.name}{ns_suffix}"
)
def assert_pod_count_unchanged(
before: Union[V1PodList, List[V1Pod]],
after: Union[V1PodList, List[V1Pod]],
namespace: str = "",
) -> None:
"""Assert pod count is unchanged; include namespace in failure message."""
before_items = _pods(before)
after_items = _pods(after)
ns_suffix = f" (namespace={namespace})" if namespace else ""
assert len(after_items) == len(before_items), (
f"Pod count changed after scenario: expected {len(before_items)}, got {len(after_items)}.{ns_suffix}"
)
def assert_kraken_success(result, context: str = "", tmp_path=None, allowed_codes=(0,)) -> None:
"""
Assert Kraken run succeeded (returncode in allowed_codes). On failure, include stdout and stderr
in the assertion message and optionally write full output to tmp_path.
Default allowed_codes=(0,). For alert-aware tests, use allowed_codes=(0, 2).
"""
if result.returncode in allowed_codes:
return
if tmp_path is not None:
try:
(tmp_path / "kraken_stdout.log").write_text(result.stdout or "")
(tmp_path / "kraken_stderr.log").write_text(result.stderr or "")
except Exception as e:
logger.warning("Could not write Kraken logs to tmp_path: %s", e)
lines = (result.stdout or "").splitlines()
tail_stdout = "\n".join(lines[-20:]) if lines else "(empty)"
context_str = f" {context}" if context else ""
path_hint = f"\nFull logs: {tmp_path}/kraken_stdout.log, {tmp_path}/kraken_stderr.log" if tmp_path else ""
raise AssertionError(
f"Krkn failed (rc={result.returncode}){context_str}.{path_hint}\n"
f"--- stderr ---\n{result.stderr or '(empty)'}\n"
f"--- stdout (last 20 lines) ---\n{tail_stdout}"
)
def assert_kraken_failure(result, context: str = "", tmp_path=None) -> None:
"""
Assert Kraken run failed (returncode != 0). On failure (Kraken unexpectedly succeeded),
raise AssertionError with stdout/stderr and optional tmp_path log files for diagnostics.
"""
if result.returncode != 0:
return
if tmp_path is not None:
try:
(tmp_path / "kraken_stdout.log").write_text(result.stdout or "")
(tmp_path / "kraken_stderr.log").write_text(result.stderr or "")
except Exception as e:
logger.warning("Could not write Kraken logs to tmp_path: %s", e)
lines = (result.stdout or "").splitlines()
tail_stdout = "\n".join(lines[-20:]) if lines else "(empty)"
context_str = f" {context}" if context else ""
path_hint = f"\nFull logs: {tmp_path}/kraken_stdout.log, {tmp_path}/kraken_stderr.log" if tmp_path else ""
raise AssertionError(
f"Expected Krkn to fail but it succeeded (rc=0){context_str}.{path_hint}\n"
f"--- stderr ---\n{result.stderr or '(empty)'}\n"
f"--- stdout (last 20 lines) ---\n{tail_stdout}"
)

CI/tests_v2/pytest.ini Normal file

@@ -0,0 +1,14 @@
[pytest]
testpaths = .
python_files = test_*.py
python_functions = test_*
# Install CI/tests_v2/requirements.txt for --timeout, --reruns, --reruns-delay.
# Example full run: pytest CI/tests_v2/ -v --timeout=300 --reruns=2 --reruns-delay=10 --html=... --junitxml=...
addopts = -v
markers =
functional: marks a test as a functional test (deselect with '-m "not functional"')
pod_disruption: marks a test as a pod disruption scenario test
application_outage: marks a test as an application outage scenario test
no_workload: skip workload deployment for this test (e.g. negative tests)
order: set test order (pytest-order)
junit_family = xunit2


@@ -0,0 +1,15 @@
# Pytest plugin deps for CI/tests_v2 functional tests.
#
# Kept separate from the root requirements.txt because:
# - Root deps are Kraken runtime (cloud SDKs, K8s client, etc.)
# - These are test-only plugins not needed by Kraken itself
# - Merging would bloat installs for users who don't run functional tests
# - Separate files reduce version-conflict risk between test and runtime deps
#
# pytest and coverage are already in root requirements.txt; do NOT duplicate here.
# The Makefile installs both files automatically via `make setup`.
pytest-rerunfailures>=14.0
pytest-html>=4.1.0
pytest-timeout>=2.2.0
pytest-order>=1.2.0
pytest-xdist>=3.5.0

CI/tests_v2/scaffold.py Normal file

@@ -0,0 +1,230 @@
#!/usr/bin/env python3
"""
Generate boilerplate for a new scenario test in CI/tests_v2.
Usage (from repository root):
python CI/tests_v2/scaffold.py --scenario service_hijacking
python CI/tests_v2/scaffold.py --scenario node_disruption --scenario-type node_scenarios
Creates (folder-per-scenario layout):
- CI/tests_v2/scenarios/<scenario>/test_<scenario>.py (BaseScenarioTest subclass + stub test)
- CI/tests_v2/scenarios/<scenario>/resource.yaml (placeholder workload)
- CI/tests_v2/scenarios/<scenario>/scenario_base.yaml (placeholder Krkn scenario; edit for your scenario_type)
- Adds the scenario marker to pytest.ini (if not already present)
"""
import argparse
import re
import sys
from pathlib import Path
def snake_to_camel(snake: str) -> str:
"""Convert snake_case to CamelCase."""
return "".join(word.capitalize() for word in snake.split("_"))
def scenario_type_default(scenario: str) -> str:
"""Default scenario_type for build_config (e.g. service_hijacking -> service_hijacking_scenarios)."""
return f"{scenario}_scenarios"
TEST_FILE_TEMPLATE = '''"""
Functional test for {scenario} scenario.
Each test runs in its own ephemeral namespace with workload deployed automatically.
"""
import pytest
from lib.base import BaseScenarioTest
from lib.utils import (
assert_all_pods_running_and_ready,
assert_kraken_failure,
assert_kraken_success,
assert_pod_count_unchanged,
get_pods_list,
)
@pytest.mark.functional
@pytest.mark.{marker}
class Test{class_name}(BaseScenarioTest):
"""{scenario} scenario."""
WORKLOAD_MANIFEST = "CI/tests_v2/scenarios/{scenario}/resource.yaml"
WORKLOAD_IS_PATH = True
LABEL_SELECTOR = "app={app_label}"
SCENARIO_NAME = "{scenario}"
SCENARIO_TYPE = "{scenario_type}"
NAMESPACE_KEY_PATH = {namespace_key_path}
NAMESPACE_IS_REGEX = {namespace_is_regex}
OVERRIDES_KEY_PATH = {overrides_key_path}
@pytest.mark.order(1)
def test_happy_path(self):
"""Run {scenario} scenario and assert pods remain healthy."""
ns = self.ns
before = get_pods_list(self.k8s_core, ns, self.LABEL_SELECTOR)
result = self.run_scenario(self.tmp_path, ns)
assert_kraken_success(result, context=f"namespace={{ns}}", tmp_path=self.tmp_path)
after = get_pods_list(self.k8s_core, ns, self.LABEL_SELECTOR)
assert_pod_count_unchanged(before, after, namespace=ns)
assert_all_pods_running_and_ready(after, namespace=ns)
'''
RESOURCE_YAML_TEMPLATE = '''# Target workload for {scenario} scenario tests.
# Namespace is patched at deploy time by the test framework.
apiVersion: apps/v1
kind: Deployment
metadata:
name: {app_label}
spec:
replicas: 1
selector:
matchLabels:
app: {app_label}
template:
metadata:
labels:
app: {app_label}
spec:
containers:
- name: app
image: nginx:alpine
ports:
- containerPort: 80
'''
SCENARIO_BASE_DICT_TEMPLATE = '''# Base scenario for {scenario} (used by build_config with scenario_type: {scenario_type}).
# Edit this file with the structure expected by Krkn. Top-level key must match SCENARIO_NAME.
# See scenarios/application_outage/scenario_base.yaml and scenarios/pod_disruption/scenario_base.yaml for examples.
{scenario}:
namespace: default
# Add fields required by your scenario plugin.
'''
SCENARIO_BASE_LIST_TEMPLATE = '''# Base scenario for {scenario} (list format). Tests patch config.namespace_pattern with ^<ns>$.
# Edit with the structure expected by your scenario plugin. See scenarios/pod_disruption/scenario_base.yaml.
- id: {scenario}-default
config:
namespace_pattern: "^default$"
# Add fields required by your scenario plugin.
'''
def main() -> int:
parser = argparse.ArgumentParser(description="Scaffold a new scenario test in CI/tests_v2 (folder-per-scenario)")
parser.add_argument(
"--scenario",
required=True,
help="Scenario name in snake_case (e.g. service_hijacking)",
)
parser.add_argument(
"--scenario-type",
default=None,
help="Kraken scenario_type for build_config (default: <scenario>_scenarios)",
)
parser.add_argument(
"--list-based",
action="store_true",
help="Use list-based scenario (NAMESPACE_KEY_PATH [0, 'config', 'namespace_pattern'], OVERRIDES_KEY_PATH [0, 'config'])",
)
parser.add_argument(
"--regex-namespace",
action="store_true",
help="Set NAMESPACE_IS_REGEX = True (namespace wrapped in ^...$)",
)
args = parser.parse_args()
scenario = args.scenario.strip().lower()
if not re.match(r"^[a-z][a-z0-9_]*$", scenario):
print("Error: --scenario must be snake_case (e.g. service_hijacking)", file=sys.stderr)
return 1
scenario_type = args.scenario_type or scenario_type_default(scenario)
class_name = snake_to_camel(scenario)
marker = scenario
app_label = scenario.replace("_", "-")
if args.list_based:
namespace_key_path = [0, "config", "namespace_pattern"]
namespace_is_regex = True
overrides_key_path = [0, "config"]
scenario_base_template = SCENARIO_BASE_LIST_TEMPLATE
else:
namespace_key_path = [scenario, "namespace"]
namespace_is_regex = args.regex_namespace
overrides_key_path = [scenario]
scenario_base_template = SCENARIO_BASE_DICT_TEMPLATE
repo_root = Path(__file__).resolve().parent.parent.parent
scenario_dir_path = repo_root / "CI" / "tests_v2" / "scenarios" / scenario
test_path = scenario_dir_path / f"test_{scenario}.py"
resource_path = scenario_dir_path / "resource.yaml"
scenario_base_path = scenario_dir_path / "scenario_base.yaml"
if scenario_dir_path.exists() and any(scenario_dir_path.iterdir()):
print(f"Error: scenario directory already exists and is non-empty: {scenario_dir_path}", file=sys.stderr)
return 1
if test_path.exists():
print(f"Error: {test_path} already exists", file=sys.stderr)
return 1
scenario_dir_path.mkdir(parents=True, exist_ok=True)
test_content = TEST_FILE_TEMPLATE.format(
scenario=scenario,
marker=marker,
class_name=class_name,
app_label=app_label,
scenario_type=scenario_type,
namespace_key_path=repr(namespace_key_path),
namespace_is_regex=namespace_is_regex,
overrides_key_path=repr(overrides_key_path),
)
resource_content = RESOURCE_YAML_TEMPLATE.format(scenario=scenario, app_label=app_label)
scenario_base_content = scenario_base_template.format(
scenario=scenario,
scenario_type=scenario_type,
)
test_path.write_text(test_content, encoding="utf-8")
resource_path.write_text(resource_content, encoding="utf-8")
scenario_base_path.write_text(scenario_base_content, encoding="utf-8")
# Auto-add marker to pytest.ini if not already present
pytest_ini_path = repo_root / "CI" / "tests_v2" / "pytest.ini"
marker_line = f" {marker}: marks a test as a {scenario} scenario test"
if pytest_ini_path.exists():
content = pytest_ini_path.read_text(encoding="utf-8")
if f" {marker}:" not in content and f"{marker}: marks" not in content:
lines = content.splitlines(keepends=True)
insert_at = None
for i, line in enumerate(lines):
if re.match(r"^ \w+:\s*.+", line):
insert_at = i + 1
if insert_at is not None:
lines.insert(insert_at, marker_line + "\n")
pytest_ini_path.write_text("".join(lines), encoding="utf-8")
print("Added marker to pytest.ini")
else:
print("Could not find markers block in pytest.ini; add manually:")
print(marker_line)
else:
print("Marker already in pytest.ini")
else:
print("pytest.ini not found; add this marker under 'markers':")
print(marker_line)
print(f"Created: {test_path}")
print(f"Created: {resource_path}")
print(f"Created: {scenario_base_path}")
print()
print("Then edit scenario_base.yaml with your scenario structure (top-level key should match SCENARIO_NAME).")
return 0
if __name__ == "__main__":
sys.exit(main())


@@ -0,0 +1,34 @@
# Nginx Deployment + Service for application outage traffic test.
# Namespace is patched at deploy time by the test framework.
apiVersion: apps/v1
kind: Deployment
metadata:
name: nginx-outage-http
spec:
replicas: 1
selector:
matchLabels:
app: nginx-outage-http
scenario: outage
template:
metadata:
labels:
app: nginx-outage-http
scenario: outage
spec:
containers:
- name: nginx
image: nginx:alpine
ports:
- containerPort: 80
---
apiVersion: v1
kind: Service
metadata:
name: nginx-outage-http
spec:
selector:
app: nginx-outage-http
ports:
- port: 80
targetPort: 80


@@ -0,0 +1,15 @@
apiVersion: v1
kind: Pod
metadata:
name: outage
labels:
scenario: outage
spec:
containers:
- name: fedtools
image: quay.io/krkn-chaos/krkn:tools
command:
- /bin/sh
- -c
- |
sleep infinity


@@ -0,0 +1,10 @@
# Base application_outage scenario. Tests load this and patch namespace (and optionally duration, block, exclude_label).
application_outage:
duration: 10
namespace: default
pod_selector:
scenario: outage
block:
- Ingress
- Egress
exclude_label: ""


@@ -0,0 +1,229 @@
"""
Functional test for application outage scenario (block network to target pods, then restore).
Equivalent to CI/tests/test_app_outages.sh with proper assertions.
The main happy-path test reuses one namespace and workload for multiple scenario runs (default, exclude_label, block variants); other tests use their own ephemeral namespace as needed.
"""
import time
import pytest
from lib.base import (
BaseScenarioTest,
KRAKEN_PROC_WAIT_TIMEOUT,
POLICY_WAIT_TIMEOUT,
)
from lib.utils import (
assert_all_pods_running_and_ready,
assert_kraken_failure,
assert_kraken_success,
assert_pod_count_unchanged,
find_network_policy_by_prefix,
get_network_policies_list,
get_pods_list,
)
def _wait_for_network_policy(k8s_networking, namespace: str, prefix: str, timeout: int = 30):
"""Poll until a NetworkPolicy with name starting with prefix exists. Return its name."""
deadline = time.monotonic() + timeout
while time.monotonic() < deadline:
policy_list = get_network_policies_list(k8s_networking, namespace)
policy = find_network_policy_by_prefix(policy_list, prefix)
if policy:
return policy.metadata.name
time.sleep(1)
raise TimeoutError(f"No NetworkPolicy with prefix {prefix!r} in {namespace} within {timeout}s")
def _assert_no_network_policy_with_prefix(k8s_networking, namespace: str, prefix: str):
policy_list = get_network_policies_list(k8s_networking, namespace)
policy = find_network_policy_by_prefix(policy_list, prefix)
name = policy.metadata.name if policy and policy.metadata else "?"
assert policy is None, (
f"Expected no NetworkPolicy with prefix {prefix!r} in namespace={namespace}, found {name}"
)
@pytest.mark.functional
@pytest.mark.application_outage
class TestApplicationOutage(BaseScenarioTest):
"""Application outage scenario: block network to target pods, then restore."""
WORKLOAD_MANIFEST = "CI/tests_v2/scenarios/application_outage/resource.yaml"
WORKLOAD_IS_PATH = True
LABEL_SELECTOR = "scenario=outage"
POLICY_PREFIX = "krkn-deny-"
SCENARIO_NAME = "application_outage"
SCENARIO_TYPE = "application_outages_scenarios"
NAMESPACE_KEY_PATH = ["application_outage", "namespace"]
NAMESPACE_IS_REGEX = False
OVERRIDES_KEY_PATH = ["application_outage"]
@pytest.mark.order(1)
def test_app_outage_block_restore_and_variants(self):
"""Default, exclude_label, and block-type variants (Ingress, Egress, both) run successfully in one namespace; each run restores and pods stay ready."""
ns = self.ns
before = get_pods_list(self.k8s_core, ns, self.LABEL_SELECTOR)
cases = [
("default", {}, "app_outage_config.yaml"),
("exclude_label", {"exclude_label": {"env": "prod"}}, "app_outage_exclude_config.yaml"),
("block=Ingress", {"block": ["Ingress"]}, "app_outage_block_ingress_config.yaml"),
("block=Egress", {"block": ["Egress"]}, "app_outage_block_egress_config.yaml"),
("block=Ingress,Egress", {"block": ["Ingress", "Egress"]}, "app_outage_block_ingress_egress_config.yaml"),
]
for context_name, overrides, config_filename in cases:
result = self.run_scenario(
self.tmp_path, ns,
overrides=overrides if overrides else None,
config_filename=config_filename,
)
assert_kraken_success(
result, context=f"{context_name} namespace={ns}", tmp_path=self.tmp_path
)
after = get_pods_list(self.k8s_core, ns, self.LABEL_SELECTOR)
assert_pod_count_unchanged(before, after, namespace=ns)
assert_all_pods_running_and_ready(after, namespace=ns)
def test_network_policy_created_then_deleted(self):
"""NetworkPolicy with prefix krkn-deny- is created during run and deleted after."""
ns = self.ns
scenario = self.load_and_patch_scenario(self.repo_root, ns, duration=12)
scenario_path = self.write_scenario(self.tmp_path, scenario, suffix="_np_lifecycle")
config_path = self.build_config(
self.SCENARIO_TYPE, str(scenario_path),
filename="app_outage_np_lifecycle.yaml",
)
proc = self.run_kraken_background(config_path)
try:
policy_name = _wait_for_network_policy(
self.k8s_networking, ns, self.POLICY_PREFIX, timeout=POLICY_WAIT_TIMEOUT
)
assert policy_name.startswith(self.POLICY_PREFIX), (
f"Policy name {policy_name!r} should start with {self.POLICY_PREFIX!r} (namespace={ns})"
)
policy_list = get_network_policies_list(self.k8s_networking, ns)
policy = find_network_policy_by_prefix(policy_list, self.POLICY_PREFIX)
assert policy is not None and policy.spec is not None, (
f"Expected NetworkPolicy with spec (namespace={ns})"
)
assert policy.spec.pod_selector is not None, f"Policy should have pod_selector (namespace={ns})"
assert policy.spec.policy_types is not None, f"Policy should have policy_types (namespace={ns})"
finally:
proc.wait(timeout=KRAKEN_PROC_WAIT_TIMEOUT)
_assert_no_network_policy_with_prefix(self.k8s_networking, ns, self.POLICY_PREFIX)
# def test_traffic_blocked_during_outage(self, request):
# """During outage, ingress to target pods is blocked; after run, traffic is restored."""
# ns = self.ns
# nginx_path = scenario_dir(self.repo_root, "application_outage") / "nginx_http.yaml"
# docs = list(yaml.safe_load_all(nginx_path.read_text()))
# docs = patch_namespace_in_docs(docs, ns)
# try:
# k8s_utils.create_from_yaml(
# self.k8s_client,
# yaml_objects=docs,
# namespace=ns,
# )
# except k8s_utils.FailToCreateError as e:
# msgs = [str(exc) for exc in e.api_exceptions]
# raise AssertionError(
# f"Failed to create nginx resources (namespace={ns}): {'; '.join(msgs)}"
# ) from e
# wait_for_deployment_replicas(self.k8s_apps, ns, "nginx-outage-http", timeout=READINESS_TIMEOUT)
# port = _get_free_port()
# pf_ref = []
# def _kill_port_forward():
# if pf_ref and pf_ref[0].poll() is None:
# pf_ref[0].terminate()
# try:
# pf_ref[0].wait(timeout=5)
# except subprocess.TimeoutExpired:
# pf_ref[0].kill()
# request.addfinalizer(_kill_port_forward)
# pf = subprocess.Popen(
# ["kubectl", "port-forward", "-n", ns, "service/nginx-outage-http", f"{port}:80"],
# cwd=self.repo_root,
# stdout=subprocess.DEVNULL,
# stderr=subprocess.DEVNULL,
# )
# pf_ref.append(pf)
# url = f"http://127.0.0.1:{port}/"
# try:
# time.sleep(2)
# baseline_ok = False
# for _ in range(10):
# try:
# resp = requests.get(url, timeout=3)
# if resp.ok:
# baseline_ok = True
# break
# except (requests.ConnectionError, requests.Timeout):
# pass
# time.sleep(1)
# assert baseline_ok, f"Baseline: HTTP request to nginx should succeed (namespace={ns})"
# scenario = self.load_and_patch_scenario(self.repo_root, ns, duration=15)
# scenario_path = self.write_scenario(self.tmp_path, scenario, suffix="_traffic")
# config_path = self.build_config(
# self.SCENARIO_TYPE, str(scenario_path),
# filename="app_outage_traffic_config.yaml",
# )
# proc = self.run_kraken_background(config_path)
# policy_name = _wait_for_network_policy(
# self.k8s_networking, ns, self.POLICY_PREFIX, timeout=POLICY_WAIT_TIMEOUT
# )
# assert policy_name, f"Expected policy to exist (namespace={ns})"
# time.sleep(2)
# failed = False
# for _ in range(5):
# try:
# resp = requests.get(url, timeout=2)
# if not resp.ok:
# failed = True
# break
# except (requests.ConnectionError, requests.Timeout):
# failed = True
# break
# time.sleep(1)
# assert failed, f"During outage, HTTP request to nginx should fail (namespace={ns})"
# proc.wait(timeout=KRAKEN_PROC_WAIT_TIMEOUT)
# time.sleep(1)
# resp = requests.get(url, timeout=5)
# assert resp.ok, f"After scenario, HTTP request to nginx should succeed (namespace={ns})"
# finally:
# pf.terminate()
# pf.wait(timeout=5)
@pytest.mark.no_workload
def test_invalid_scenario_fails(self):
"""Invalid scenario file (missing application_outage) causes Kraken to exit non-zero."""
invalid_scenario_path = self.tmp_path / "invalid_scenario.yaml"
invalid_scenario_path.write_text("foo: bar\n")
config_path = self.build_config(
self.SCENARIO_TYPE, str(invalid_scenario_path),
filename="invalid_config.yaml",
)
result = self.run_kraken(config_path)
assert_kraken_failure(
result, context=f"namespace={self.ns}", tmp_path=self.tmp_path
)
@pytest.mark.no_workload
def test_bad_namespace_fails(self):
"""Scenario targeting non-existent namespace causes Kraken to exit non-zero."""
scenario = self.load_and_patch_scenario(self.repo_root, "nonexistent-namespace-xyz-12345")
scenario_path = self.write_scenario(self.tmp_path, scenario, suffix="_bad_ns")
config_path = self.build_config(
self.SCENARIO_TYPE, str(scenario_path),
filename="app_outage_bad_ns_config.yaml",
)
result = self.run_kraken(config_path)
assert_kraken_failure(
result,
context=f"test namespace={self.ns}",
tmp_path=self.tmp_path,
)

View File

@@ -0,0 +1,21 @@
# Single-pod deployment targeted by pod disruption scenario.
# Namespace is patched at deploy time by the test framework.
apiVersion: apps/v1
kind: Deployment
metadata:
name: krkn-pod-disruption-target
spec:
replicas: 1
selector:
matchLabels:
app: krkn-pod-disruption-target
template:
metadata:
labels:
app: krkn-pod-disruption-target
spec:
containers:
- name: app
image: nginx:alpine
ports:
- containerPort: 80

View File

@@ -0,0 +1,7 @@
# Base pod_disruption scenario (list). Tests load this and patch namespace_pattern with ^<ns>$.
- id: kill-pods
config:
namespace_pattern: "^default$"
label_selector: app=krkn-pod-disruption-target
krkn_pod_recovery_time: 5
kill: 1
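For illustration, a minimal sketch of how a test could apply that namespace patch before writing the scenario back out. patch_scenario_namespace is a hypothetical helper written for this example (the framework's real entry point is load_and_patch_scenario on the base test class); the key path and regex flag mirror the NAMESPACE_KEY_PATH and NAMESPACE_IS_REGEX attributes used by the pod disruption test further below.

import copy

def patch_scenario_namespace(scenario_docs, key_path, namespace, is_regex=True):
    """Return a copy of the loaded scenario with the namespace patched in.

    key_path is a list of indices/keys, e.g. [0, "config", "namespace_pattern"];
    when is_regex is True the namespace is anchored as ^<ns>$.
    """
    patched = copy.deepcopy(scenario_docs)
    target = patched
    for key in key_path[:-1]:
        target = target[key]
    target[key_path[-1]] = f"^{namespace}$" if is_regex else namespace
    return patched

# The base scenario above, loaded via yaml.safe_load, is a one-element list:
base = [
    {
        "id": "kill-pods",
        "config": {
            "namespace_pattern": "^default$",
            "label_selector": "app=krkn-pod-disruption-target",
            "krkn_pod_recovery_time": 5,
            "kill": 1,
        },
    }
]
patched = patch_scenario_namespace(base, [0, "config", "namespace_pattern"], "krkn-test-a1b2c")
assert patched[0]["config"]["namespace_pattern"] == "^krkn-test-a1b2c$"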

View File

@@ -0,0 +1,58 @@
"""
Functional test for pod disruption scenario (pod crash and recovery).
Equivalent to CI/tests/test_pod.sh with proper before/after assertions.
Each test runs in its own ephemeral namespace with workload deployed automatically.
"""
import pytest
from lib.base import BaseScenarioTest, READINESS_TIMEOUT
from lib.utils import (
assert_all_pods_running_and_ready,
assert_kraken_success,
assert_pod_count_unchanged,
get_pods_list,
pod_uids,
restart_counts,
)
@pytest.mark.functional
@pytest.mark.pod_disruption
class TestPodDisruption(BaseScenarioTest):
"""Pod disruption scenario: kill pods and verify recovery."""
WORKLOAD_MANIFEST = "CI/tests_v2/scenarios/pod_disruption/resource.yaml"
WORKLOAD_IS_PATH = True
LABEL_SELECTOR = "app=krkn-pod-disruption-target"
SCENARIO_NAME = "pod_disruption"
SCENARIO_TYPE = "pod_disruption_scenarios"
NAMESPACE_KEY_PATH = [0, "config", "namespace_pattern"]
NAMESPACE_IS_REGEX = True
@pytest.mark.order(1)
def test_pod_crash_and_recovery(self, wait_for_pods_running):
ns = self.ns
before = get_pods_list(self.k8s_core, ns, self.LABEL_SELECTOR)
before_uids = pod_uids(before)
before_restarts = restart_counts(before)
result = self.run_scenario(self.tmp_path, ns)
assert_kraken_success(result, context=f"namespace={ns}", tmp_path=self.tmp_path)
after = get_pods_list(self.k8s_core, ns, self.LABEL_SELECTOR)
after_uids = pod_uids(after)
after_restarts = restart_counts(after)
uids_changed = set(after_uids) != set(before_uids)
restarts_increased = after_restarts > before_restarts
assert uids_changed or restarts_increased, (
f"Chaos had no effect in namespace={ns}: pod UIDs unchanged and restart count did not increase. "
f"Before UIDs: {before_uids}, restarts: {before_restarts}. "
f"After UIDs: {after_uids}, restarts: {after_restarts}."
)
wait_for_pods_running(ns, self.LABEL_SELECTOR, timeout=READINESS_TIMEOUT)
after_final = get_pods_list(self.k8s_core, ns, self.LABEL_SELECTOR)
assert_pod_count_unchanged(before, after_final, namespace=ns)
assert_all_pods_running_and_ready(after_final, namespace=ns)

74
CI/tests_v2/setup_env.sh Executable file
View File

@@ -0,0 +1,74 @@
#!/usr/bin/env bash
# Setup environment for CI/tests_v2 pytest functional tests.
# Run from the repository root: ./CI/tests_v2/setup_env.sh
#
# - Creates a KinD cluster using kind-config-dev.yml (override with KIND_CONFIG=...).
# - Waits for the cluster and for local-path-provisioner pods (required by pod disruption test).
# - Does not install Python deps; create a venv and run pip install -r requirements.txt and -r CI/tests_v2/requirements.txt yourself.
set -e
REPO_ROOT="$(cd "$(dirname "$0")/../.." && pwd)"
KIND_CONFIG="${KIND_CONFIG:-${REPO_ROOT}/CI/tests_v2/kind-config-dev.yml}"
CLUSTER_NAME="${KIND_CLUSTER_NAME:-ci-krkn}"
echo "Repository root: $REPO_ROOT"
cd "$REPO_ROOT"
# Check required tools
command -v kind >/dev/null 2>&1 || { echo "Error: kind is not installed. Install from https://kind.sigs.k8s.io/docs/user/quick-start/"; exit 1; }
command -v kubectl >/dev/null 2>&1 || { echo "Error: kubectl is not installed."; exit 1; }
# Python 3.9+
python3 -c "import sys; exit(0 if sys.version_info >= (3, 9) else 1)" 2>/dev/null || { echo "Error: Python 3.9+ required. Check: python3 --version"; exit 1; }
# Docker running (required for KinD)
docker info >/dev/null 2>&1 || { echo "Error: Docker is not running. Start Docker Desktop or run: systemctl start docker"; exit 1; }
# Tool versions for reproducibility
echo "kind: $(kind --version 2>/dev/null || kind version 2>/dev/null)"
echo "kubectl: $(kubectl version --client --short 2>/dev/null || kubectl version --client 2>/dev/null)"
# Create cluster if it doesn't exist (use "kind get clusters" so we skip when nodes exist even if kubeconfig check would fail)
if kind get clusters 2>/dev/null | grep -qx "$CLUSTER_NAME"; then
echo "KinD cluster '$CLUSTER_NAME' already exists, skipping creation."
else
echo "Creating KinD cluster '$CLUSTER_NAME' from $KIND_CONFIG ..."
kind create cluster --name "$CLUSTER_NAME" --config "$KIND_CONFIG"
fi
# echo "Pre-pulling test workload images into KinD cluster..."
# docker pull nginx:alpine
# kind load docker-image nginx:alpine --name "$CLUSTER_NAME"
# kind merges into default kubeconfig (~/.kube/config), so kubectl should work in this shell.
# If you need to use this cluster from another terminal: export KUBECONFIG=~/.kube/config
# and ensure context: kubectl config use-context kind-$CLUSTER_NAME
echo "Waiting for cluster nodes to be Ready..."
kubectl wait --for=condition=Ready nodes --all --timeout=120s 2>/dev/null || true
echo "Waiting for local-path-provisioner pods (namespace local-path-storage, label app=local-path-provisioner)..."
for i in {1..60}; do
if kubectl get pods -n local-path-storage -l app=local-path-provisioner -o name 2>/dev/null | grep -q .; then
echo "Found local-path-provisioner pod(s). Waiting for Ready..."
kubectl wait --for=condition=ready pod -l app=local-path-provisioner -n local-path-storage --timeout=120s 2>/dev/null && break
fi
echo "Attempt $i: local-path-provisioner not ready yet..."
sleep 3
done
if ! kubectl get pods -n local-path-storage -l app=local-path-provisioner -o name 2>/dev/null | grep -q .; then
echo "Warning: No pods with label app=local-path-provisioner in local-path-storage."
echo "KinD usually deploys this by default. Check: kubectl get pods -n local-path-storage"
exit 1
fi
echo ""
echo "Cluster is ready for CI/tests_v2."
echo " kubectl uses the default kubeconfig (kind merged it). For another terminal: export KUBECONFIG=~/.kube/config"
echo ""
echo "Next: activate your venv, install deps, and run tests from repo root:"
echo " pip install -r requirements.txt"
echo " pip install -r CI/tests_v2/requirements.txt"
echo " pytest CI/tests_v2/ -v --timeout=300 --reruns=2 --reruns-delay=10"

View File

@@ -26,7 +26,7 @@ Here is an excerpt:
## Maintainer Levels
### Contributor
Contributors contributor to the community. Anyone can become a contributor by participating in discussions, reporting bugs, or contributing code or documentation.
Contributors contribute to the community. Anyone can become a contributor by participating in discussions, reporting bugs, or contributing code or documentation.
#### Responsibilities:
@@ -80,4 +80,4 @@ Represent the project in the broader open-source community.
# Credits
Sections of this documents have been borrowed from [Kubernetes governance](https://github.com/kubernetes/community/blob/master/governance.md)
Sections of this document have been borrowed from [Kubernetes governance](https://github.com/kubernetes/community/blob/master/governance.md)

View File

@@ -16,5 +16,11 @@ Following are a list of enhancements that we are planning to work on adding supp
- [x] [Krknctl - client for running Krkn scenarios with ease](https://github.com/krkn-chaos/krknctl)
- [x] [AI Chat bot to help get started with Krkn and commands](https://github.com/krkn-chaos/krkn-lightspeed)
- [ ] [Ability to roll back cluster to original state if chaos fails](https://github.com/krkn-chaos/krkn/issues/804)
- [ ] Add recovery time metrics to each scenario for each better regression analysis
- [ ] [Add resiliency scoring to chaos scenarios ran on cluster](https://github.com/krkn-chaos/krkn/issues/125)
- [ ] Add recovery time metrics to each scenario for better regression analysis
- [ ] [Add resiliency scoring to chaos scenarios ran on cluster](https://github.com/krkn-chaos/krkn/issues/125)
- [ ] [Add AI-based Chaos Configuration Generator](https://github.com/krkn-chaos/krkn/issues/1166)
- [ ] [Introduce Security Chaos Engineering Scenarios](https://github.com/krkn-chaos/krkn/issues/1165)
- [ ] [Add AWS-native Chaos Scenarios (S3, Lambda, Networking)](https://github.com/krkn-chaos/krkn/issues/1164)
- [ ] [Unify Krkn Ecosystem under krknctl for Enhanced UX](https://github.com/krkn-chaos/krknctl/issues/113)
- [ ] [Build Web UI for Creating, Monitoring, and Reviewing Chaos Scenarios](https://github.com/krkn-chaos/krkn/issues/1167)
- [ ] [Add Predefined Chaos Scenario Templates (KRKN Chaos Library)](https://github.com/krkn-chaos/krkn/issues/1168)

View File

@@ -40,4 +40,4 @@ The security team currently consists of the [Maintainers of Krkn](https://github
## Process and Supported Releases
The Krkn security team will investigate and provide a fix in a timely mannner depending on the severity. The fix will be included in the new release of Krkn and details will be included in the release notes.
The Krkn security team will investigate and provide a fix in a timely manner depending on the severity. The fix will be included in the new release of Krkn and details will be included in the release notes.

View File

@@ -39,7 +39,7 @@ cerberus:
Sunday:
slack_team_alias: # The slack team alias to be tagged while reporting failures in the slack channel when no watcher is assigned
custom_checks: # Relative paths of files conataining additional user defined checks
custom_checks: # Relative paths of files containing additional user defined checks
tunings:
timeout: 3 # Number of seconds before requests fail

View File

@@ -50,6 +50,8 @@ kraken:
- network_chaos_ng_scenarios:
- scenarios/kube/pod-network-filter.yml
- scenarios/kube/node-network-filter.yml
- scenarios/kube/node-network-chaos.yml
- scenarios/kube/pod-network-chaos.yml
- kubevirt_vm_outage:
- scenarios/kubevirt/kubevirt-vm-outage.yaml
@@ -77,6 +79,7 @@ elastic:
metrics_index: "krkn-metrics"
alerts_index: "krkn-alerts"
telemetry_index: "krkn-telemetry"
run_tag: ""
tunings:
wait_duration: 1 # Duration to wait between each chaos scenario
@@ -93,7 +96,7 @@ telemetry:
prometheus_pod_name: "" # name of the prometheus pod (if distribution is kubernetes)
full_prometheus_backup: False # if is set to False only the /prometheus/wal folder will be downloaded.
backup_threads: 5 # number of telemetry download/upload threads
archive_path: /tmp # local path where the archive files will be temporarly stored
archive_path: /tmp # local path where the archive files will be temporarily stored
max_retries: 0 # maximum number of upload retries (if 0 will retry forever)
run_tag: '' # if set, this will be appended to the run folder in the bucket (useful to group the runs)
archive_size: 500000
@@ -128,4 +131,5 @@ kubevirt_checks: # Utilizing virt che
disconnected: False # Boolean of how to try to connect to the VMIs; if True will use the ip_address to try ssh from within a node, if false will use the name and uses virtctl to try to connect; Default is False
ssh_node: "" # If set, will be a backup way to ssh to a node. Will want to set to a node that isn't targeted in chaos
node_names: ""
exit_on_failure: # If value is True and VMI's are failing post chaos returns failure, values can be True/False
exit_on_failure: # If value is True and VMI's are failing post chaos returns failure, values can be True/False

View File

@@ -32,7 +32,7 @@ tunings:
telemetry:
enabled: False # enable/disables the telemetry collection feature
archive_path: /tmp # local path where the archive files will be temporarly stored
archive_path: /tmp # local path where the archive files will be temporarily stored
events_backup: False # enables/disables cluster events collection
logs_backup: False

View File

@@ -61,7 +61,7 @@ telemetry:
prometheus_backup: True # enables/disables prometheus data collection
full_prometheus_backup: False # if is set to False only the /prometheus/wal folder will be downloaded.
backup_threads: 5 # number of telemetry download/upload threads
archive_path: /tmp # local path where the archive files will be temporarly stored
archive_path: /tmp # local path where the archive files will be temporarily stored
max_retries: 0 # maximum number of upload retries (if 0 will retry forever)
run_tag: '' # if set, this will be appended to the run folder in the bucket (useful to group the runs)
archive_size: 500000 # the size of the prometheus data archive size in KB. The lower the size of archive is

View File

@@ -1,4 +1,5 @@
#!/bin/bash
set -e
# Run SSH setup
./containers/setup-ssh.sh
# Change to kraken directory

View File

@@ -3,10 +3,16 @@ apiVersion: kind.x-k8s.io/v1alpha4
nodes:
- role: control-plane
extraPortMappings:
- containerPort: 30000
hostPort: 9090
- containerPort: 32766
hostPort: 9200
- containerPort: 30036
hostPort: 8888
- containerPort: 30037
hostPort: 8889
- containerPort: 30080
hostPort: 30080
- role: control-plane
- role: control-plane
- role: worker

View File

@@ -2,19 +2,33 @@ import logging
import requests
import sys
import json
from krkn_lib.utils.functions import get_yaml_item_value
check_application_routes = ""
cerberus_url = None
exit_on_failure = False
cerberus_enabled = False
def get_status(config, start_time, end_time):
def set_url(config):
global exit_on_failure
exit_on_failure = get_yaml_item_value(config["kraken"], "exit_on_failure", False)
global cerberus_enabled
cerberus_enabled = get_yaml_item_value(config["cerberus"],"cerberus_enabled", False)
if cerberus_enabled:
global cerberus_url
cerberus_url = get_yaml_item_value(config["cerberus"],"cerberus_url", "")
global check_application_routes
check_application_routes = \
get_yaml_item_value(config["cerberus"],"check_applicaton_routes","")
def get_status(start_time, end_time):
"""
Get cerberus status
"""
cerberus_status = True
check_application_routes = False
application_routes_status = True
if config["cerberus"]["cerberus_enabled"]:
cerberus_url = config["cerberus"]["cerberus_url"]
check_application_routes = \
config["cerberus"]["check_application_routes"]
if cerberus_enabled:
if not cerberus_url:
logging.error(
"url where Cerberus publishes True/False signal "
@@ -61,40 +75,38 @@ def get_status(config, start_time, end_time):
return cerberus_status
def publish_kraken_status(config, failed_post_scenarios, start_time, end_time):
def publish_kraken_status(start_time, end_time):
"""
Publish kraken status to cerberus
"""
cerberus_status = get_status(config, start_time, end_time)
cerberus_status = get_status(start_time, end_time)
if not cerberus_status:
if failed_post_scenarios:
if config["kraken"]["exit_on_failure"]:
logging.info(
"Cerberus status is not healthy and post action scenarios "
"are still failing, exiting kraken run"
)
sys.exit(1)
else:
logging.info(
"Cerberus status is not healthy and post action scenarios "
"are still failing"
)
if exit_on_failure:
logging.info(
"Cerberus status is not healthy and post action scenarios "
"are still failing, exiting kraken run"
)
sys.exit(1)
else:
logging.info(
"Cerberus status is not healthy and post action scenarios "
"are still failing"
)
else:
if failed_post_scenarios:
if config["kraken"]["exit_on_failure"]:
logging.info(
"Cerberus status is healthy but post action scenarios "
"are still failing, exiting kraken run"
)
sys.exit(1)
else:
logging.info(
"Cerberus status is healthy but post action scenarios "
"are still failing"
)
if exit_on_failure:
logging.info(
"Cerberus status is healthy but post action scenarios "
"are still failing, exiting kraken run"
)
sys.exit(1)
else:
logging.info(
"Cerberus status is healthy but post action scenarios "
"are still failing"
)
def application_status(cerberus_url, start_time, end_time):
def application_status(start_time, end_time):
"""
Check application availability
"""

View File

@@ -46,7 +46,7 @@ def alerts(
sys.exit(1)
for alert in profile_yaml:
if list(alert.keys()).sort() != ["expr", "description", "severity"].sort():
if sorted(alert.keys()) != sorted(["expr", "description", "severity"]):
logging.error(f"wrong alert {alert}, skipping")
continue
@@ -205,8 +205,8 @@ def metrics(
query
)
elif (
list(metric_query.keys()).sort()
== ["query", "metricName"].sort()
sorted(metric_query.keys())
== sorted(["query", "metricName"])
):
metrics_result = prom_cli.process_prom_query_in_range(
query,
@@ -214,7 +214,7 @@ def metrics(
end_time=datetime.datetime.fromtimestamp(end_time), granularity=30
)
else:
logging.info('didnt match keys')
logging.info("didn't match keys")
continue
for returned_metric in metrics_result:

View File

@@ -0,0 +1,79 @@
from __future__ import annotations
import datetime
import logging
from typing import Dict, Any, List, Optional
from krkn_lib.prometheus.krkn_prometheus import KrknPrometheus
# -----------------------------------------------------------------------------
# SLO evaluation helpers (used by krkn.resiliency)
# -----------------------------------------------------------------------------
def slo_passed(prometheus_result: List[Any]) -> Optional[bool]:
if not prometheus_result:
return None
has_samples = False
for series in prometheus_result:
if "values" in series:
has_samples = True
for _ts, val in series["values"]:
try:
if float(val) > 0:
return False
except (TypeError, ValueError):
continue
elif "value" in series:
has_samples = True
try:
return float(series["value"][1]) == 0
except (TypeError, ValueError):
return False
# If we reached here and never saw any samples, skip
return None if not has_samples else True
def evaluate_slos(
prom_cli: KrknPrometheus,
slo_list: List[Dict[str, Any]],
start_time: datetime.datetime,
end_time: datetime.datetime,
) -> Dict[str, bool]:
"""Evaluate a list of SLO expressions against Prometheus.
Args:
prom_cli: Configured Prometheus client.
slo_list: List of dicts with keys ``name``, ``expr``.
start_time: Start timestamp.
end_time: End timestamp.
Returns:
Mapping name -> bool indicating pass status.
True means the SLO passed; False means it failed.
"""
results: Dict[str, bool] = {}
logging.info("Evaluating %d SLOs over window %s %s", len(slo_list), start_time, end_time)
for slo in slo_list:
expr = slo["expr"]
name = slo["name"]
try:
response = prom_cli.process_prom_query_in_range(
expr,
start_time=start_time,
end_time=end_time,
)
passed = slo_passed(response)
if passed is None:
# Absence of data indicates the condition did not trigger; treat as pass.
logging.debug("SLO '%s' query returned no data; assuming pass.", name)
results[name] = True
else:
results[name] = passed
except Exception as exc:
logging.error("PromQL query failed for SLO '%s': %s", name, exc)
results[name] = False
return results
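To make the pass/fail semantics concrete, a small self-contained sketch with synthetic Prometheus responses; the dict shapes mirror range-query ("values") and instant-query ("value") results, so no live Prometheus is needed.

from krkn.prometheus.collector import slo_passed

# Range-query result where a sample is > 0: the alert expression fired, so the SLO fails.
firing = [{"metric": {"alertname": "EtcdDown"}, "values": [[1700000000, "0"], [1700000030, "2"]]}]
assert slo_passed(firing) is False

# Instant-query result with value 0: nothing fired, the SLO passes.
quiet = [{"metric": {"alertname": "EtcdDown"}, "value": [1700000000, "0"]}]
assert slo_passed(quiet) is True

# Empty result: no data at all; evaluate_slos() treats this as "did not trigger" (pass).
assert slo_passed([]) is None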

View File

@@ -0,0 +1,4 @@
"""krkn.resiliency package public interface."""
from .resiliency import Resiliency # noqa: F401
from .score import calculate_resiliency_score # noqa: F401

View File

@@ -0,0 +1,366 @@
"""Resiliency evaluation orchestrator for Krkn chaos runs.
This module provides the `Resiliency` class which loads the canonical
`alerts.yaml`, executes every SLO expression against Prometheus in the
chaos-test time window, determines pass/fail status and calculates an
overall resiliency score using the generic weighted model implemented
in `krkn.resiliency.score`.
"""
from __future__ import annotations
import datetime
import logging
import os
from typing import Dict, List, Any, Optional, Tuple
import yaml
import json
import dataclasses
from krkn_lib.models.telemetry import ChaosRunTelemetry
from krkn_lib.prometheus.krkn_prometheus import KrknPrometheus
from krkn.prometheus.collector import evaluate_slos
from krkn.resiliency.score import calculate_resiliency_score
class Resiliency:
"""Central orchestrator for resiliency scoring."""
def __init__(self, alerts_yaml_path: str):
if not os.path.exists(alerts_yaml_path):
raise FileNotFoundError(f"alerts file not found: {alerts_yaml_path}")
with open(alerts_yaml_path, "r", encoding="utf-8") as fp:
raw_yaml_data = yaml.safe_load(fp)
logging.info("Loaded SLO configuration from %s", alerts_yaml_path)
self._slos = self._normalise_alerts(raw_yaml_data)
self._results: Dict[str, bool] = {}
self._score: Optional[int] = None
self._breakdown: Optional[Dict[str, int]] = None
self._health_check_results: Dict[str, bool] = {}
self.scenario_reports: List[Dict[str, Any]] = []
self.summary: Optional[Dict[str, Any]] = None
self.detailed_report: Optional[Dict[str, Any]] = None
# ---------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------
def calculate_score(
self,
*,
health_check_results: Optional[Dict[str, bool]] = None,
) -> int:
"""Calculate the resiliency score using collected SLO results."""
slo_defs = {slo["name"]: {"severity": slo["severity"], "weight": slo.get("weight")} for slo in self._slos}
score, breakdown = calculate_resiliency_score(
slo_definitions=slo_defs,
prometheus_results=self._results,
health_check_results=health_check_results or {},
)
self._score = score
self._breakdown = breakdown
self._health_check_results = health_check_results or {}
return score
def to_dict(self) -> Dict[str, Any]:
"""Return a dictionary ready for telemetry output."""
if self._score is None:
raise RuntimeError("calculate_score() must be called before to_dict()")
return {
"score": self._score,
"breakdown": self._breakdown,
"slo_results": self._results,
"health_check_results": getattr(self, "_health_check_results", {}),
}
# ------------------------------------------------------------------
# Scenario-based resiliency evaluation
# ------------------------------------------------------------------
def add_scenario_report(
self,
*,
scenario_name: str,
prom_cli: KrknPrometheus,
start_time: datetime.datetime,
end_time: datetime.datetime,
weight: float | int = 1,
health_check_results: Optional[Dict[str, bool]] = None,
) -> int:
"""
Evaluate SLOs for a single scenario window and store the result.
Args:
scenario_name: Human-friendly scenario identifier.
prom_cli: Initialized KrknPrometheus instance.
start_time: Window start.
end_time: Window end.
weight: Weight to use for the final weighted average calculation.
health_check_results: Optional mapping of custom health-check name -> bool.
Returns:
The calculated integer resiliency score (0-100) for this scenario.
"""
slo_results = evaluate_slos(
prom_cli=prom_cli,
slo_list=self._slos,
start_time=start_time,
end_time=end_time,
)
slo_defs = {slo["name"]: {"severity": slo["severity"], "weight": slo.get("weight")} for slo in self._slos}
score, breakdown = calculate_resiliency_score(
slo_definitions=slo_defs,
prometheus_results=slo_results,
health_check_results=health_check_results or {},
)
self.scenario_reports.append(
{
"name": scenario_name,
"window": {
"start": start_time.isoformat(),
"end": end_time.isoformat(),
},
"score": score,
"weight": weight,
"breakdown": breakdown,
"slo_results": slo_results,
"health_check_results": health_check_results or {},
}
)
return score
def finalize_report(
self,
*,
prom_cli: KrknPrometheus,
total_start_time: datetime.datetime,
total_end_time: datetime.datetime,
) -> None:
if not self.scenario_reports:
raise RuntimeError("No scenario reports added nothing to finalize")
# ---------------- Weighted average (primary resiliency_score) ----------
total_weight = sum(rep["weight"] for rep in self.scenario_reports)
resiliency_score = int(
sum(rep["score"] * rep["weight"] for rep in self.scenario_reports) / total_weight
)
# ---------------- Overall SLO evaluation across full test window -----------------------------
full_slo_results = evaluate_slos(
prom_cli=prom_cli,
slo_list=self._slos,
start_time=total_start_time,
end_time=total_end_time,
)
slo_defs = {slo["name"]: {"severity": slo["severity"], "weight": slo.get("weight")} for slo in self._slos}
_overall_score, full_breakdown = calculate_resiliency_score(
slo_definitions=slo_defs,
prometheus_results=full_slo_results,
health_check_results={},
)
self.summary = {
"scenarios": {rep["name"]: rep["score"] for rep in self.scenario_reports},
"resiliency_score": resiliency_score,
"passed_slos": full_breakdown.get("passed", 0),
"total_slos": full_breakdown.get("passed", 0) + full_breakdown.get("failed", 0),
}
# Detailed report currently limited to per-scenario information; system stability section removed
self.detailed_report = {
"scenarios": self.scenario_reports,
}
def get_summary(self) -> Dict[str, Any]:
"""Return the concise resiliency_summary structure."""
if not hasattr(self, "summary") or self.summary is None:
raise RuntimeError("finalize_report() must be called first")
return self.summary
def get_detailed_report(self) -> Dict[str, Any]:
"""Return the full resiliency-report structure."""
if not hasattr(self, "detailed_report") or self.detailed_report is None:
raise RuntimeError("finalize_report() must be called first")
return self.detailed_report
@staticmethod
def compact_breakdown(report: Dict[str, Any]) -> Dict[str, int]:
"""Return a compact summary dict for a single scenario report."""
try:
passed = report["breakdown"]["passed"]
failed = report["breakdown"]["failed"]
score_val = report["score"]
except Exception:
passed = report.get("breakdown", {}).get("passed", 0)
failed = report.get("breakdown", {}).get("failed", 0)
score_val = report.get("score", 0)
return {
"resiliency_score": score_val,
"passed_slos": passed,
"total_slos": passed + failed,
}
def attach_compact_to_telemetry(self, chaos_telemetry: ChaosRunTelemetry) -> None:
"""Embed per-scenario compact resiliency reports into a ChaosRunTelemetry instance."""
score_map = {
rep["name"]: self.compact_breakdown(rep) for rep in self.scenario_reports
}
new_scenarios = []
for item in getattr(chaos_telemetry, "scenarios", []):
if isinstance(item, dict):
name = item.get("scenario")
if name in score_map:
item["resiliency_report"] = score_map[name]
new_scenarios.append(item)
else:
name = getattr(item, "scenario", None)
try:
item_dict = dataclasses.asdict(item)
except Exception:
item_dict = {
k: getattr(item, k)
for k in dir(item)
if not k.startswith("__") and not callable(getattr(item, k))
}
if name in score_map:
item_dict["resiliency_report"] = score_map[name]
new_scenarios.append(item_dict)
chaos_telemetry.scenarios = new_scenarios
def add_scenario_reports(
self,
*,
scenario_telemetries,
prom_cli: KrknPrometheus,
scenario_type: str,
batch_start_dt: datetime.datetime,
batch_end_dt: datetime.datetime,
weight: int | float = 1,
) -> None:
"""Evaluate SLOs for every telemetry item belonging to a scenario window,
store the result and enrich the telemetry list with a compact resiliency breakdown.
Args:
scenario_telemetries: Iterable with telemetry objects/dicts for the
current scenario batch window.
prom_cli: Pre-configured :class:`KrknPrometheus` instance.
scenario_type: Fallback scenario identifier in case individual
telemetry items do not provide one.
batch_start_dt: Fallback start timestamp for the batch window.
batch_end_dt: Fallback end timestamp for the batch window.
weight: Weight to assign to every scenario when calculating the final
weighted average.
"""
for tel in scenario_telemetries:
try:
# -------- Extract timestamps & scenario name --------------------
if isinstance(tel, dict):
st_ts = tel.get("start_timestamp")
en_ts = tel.get("end_timestamp")
scen_name = tel.get("scenario", scenario_type)
else:
st_ts = getattr(tel, "start_timestamp", None)
en_ts = getattr(tel, "end_timestamp", None)
scen_name = getattr(tel, "scenario", scenario_type)
if st_ts and en_ts:
st_dt = datetime.datetime.fromtimestamp(int(st_ts))
en_dt = datetime.datetime.fromtimestamp(int(en_ts))
else:
st_dt = batch_start_dt
en_dt = batch_end_dt
# -------- Calculate resiliency score for the scenario -----------
self.add_scenario_report(
scenario_name=str(scen_name),
prom_cli=prom_cli,
start_time=st_dt,
end_time=en_dt,
weight=weight,
health_check_results=None,
)
compact = self.compact_breakdown(self.scenario_reports[-1])
if isinstance(tel, dict):
tel["resiliency_report"] = compact
else:
setattr(tel, "resiliency_report", compact)
except Exception as exc:
logging.error("Resiliency per-scenario evaluation failed: %s", exc)
def finalize_and_save(
self,
*,
prom_cli: KrknPrometheus,
total_start_time: datetime.datetime,
total_end_time: datetime.datetime,
run_mode: str = "standalone",
detailed_path: str = "resiliency-report.json",
) -> Tuple[Dict[str, Any], Dict[str, Any]]:
"""Finalize resiliency scoring, persist reports and return them.
Args:
prom_cli: Pre-configured KrknPrometheus instance.
total_start_time: Start time for the full test window.
total_end_time: End time for the full test window.
run_mode: "controller" or "standalone" mode.
Returns:
(detailed_report)
"""
try:
self.finalize_report(
prom_cli=prom_cli,
total_start_time=total_start_time,
total_end_time=total_end_time,
)
detailed = self.get_detailed_report()
if run_mode == "controller":
# krknctl expects the detailed report on stdout in a special format
try:
detailed_json = json.dumps(detailed)
print(f"KRKN_RESILIENCY_REPORT_JSON:{detailed_json}")
logging.info("Resiliency report logged to stdout for krknctl.")
except Exception as exc:
logging.error("Failed to serialize and log detailed resiliency report: %s", exc)
else:
# Stand-alone mode write to files for post-run consumption
try:
with open(detailed_path, "w", encoding="utf-8") as fp:
json.dump(detailed, fp, indent=2)
logging.info("Resiliency report written: %s", detailed_path)
except Exception as io_exc:
logging.error("Failed to write resiliency report files: %s", io_exc)
except Exception as exc:
logging.error("Failed to finalize resiliency scoring: %s", exc)
# ------------------------------------------------------------------
# Internal helpers
# ------------------------------------------------------------------
@staticmethod
def _normalise_alerts(raw_alerts: Any) -> List[Dict[str, Any]]:
"""Convert raw YAML alerts data into internal SLO list structure."""
if not isinstance(raw_alerts, list):
raise ValueError("SLO configuration must be a list under key 'slos' or top-level list")
slos: List[Dict[str, Any]] = []
for idx, alert in enumerate(raw_alerts):
if not (isinstance(alert, dict) and "expr" in alert and "severity" in alert):
logging.warning("Skipping invalid alert entry at index %d: %s", idx, alert)
continue
name = alert.get("description") or f"slo_{idx}"
slos.append(
{
"name": name,
"expr": alert["expr"],
"severity": str(alert["severity"]).lower(),
"weight": alert.get("weight")
}
)
return slos
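Putting the class together end to end, a minimal sketch of the intended lifecycle. The alerts.yaml path, Prometheus URL, and the KrknPrometheus constructor arguments are assumptions for illustration; check your krkn-lib version and deployment for the real values.

import datetime
from krkn_lib.prometheus.krkn_prometheus import KrknPrometheus
from krkn.resiliency import Resiliency

# Constructor arguments are an assumption; adjust to your krkn-lib version.
prom_cli = KrknPrometheus("http://localhost:9090")

resiliency = Resiliency("config/alerts.yaml")        # path is illustrative

run_start = datetime.datetime.now() - datetime.timedelta(minutes=30)
resiliency.add_scenario_report(
    scenario_name="pod_disruption_scenarios",
    prom_cli=prom_cli,
    start_time=run_start,
    end_time=run_start + datetime.timedelta(minutes=10),
    weight=2,                                         # custom weight for the weighted average
)

# Standalone mode writes resiliency-report.json; controller mode prints it for krknctl.
resiliency.finalize_and_save(
    prom_cli=prom_cli,
    total_start_time=run_start,
    total_end_time=datetime.datetime.now(),
    run_mode="standalone",
)
print(resiliency.get_summary()["resiliency_score"])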

76
krkn/resiliency/score.py Normal file
View File

@@ -0,0 +1,76 @@
from __future__ import annotations
from typing import Dict, List, Tuple
DEFAULT_WEIGHTS = {"critical": 3, "warning": 1}
class SLOResult:
"""Simple container representing evaluation outcome for a single SLO."""
def __init__(self, name: str, severity: str, passed: bool, weight: int | None = None):
self.name = name
self.severity = severity
self.passed = passed
self._custom_weight = weight
def weight(self, severity_weights: Dict[str, int]) -> int:
"""Return the weight for this SLO. Uses custom weight if set, otherwise uses severity-based weight."""
if self._custom_weight is not None:
return self._custom_weight
return severity_weights.get(self.severity, severity_weights.get("warning", 1))
def calculate_resiliency_score(
slo_definitions: Dict[str, str] | Dict[str, Dict[str, int | str | None]],
prometheus_results: Dict[str, bool],
health_check_results: Dict[str, bool],
) -> Tuple[int, Dict[str, int]]:
"""Compute a resiliency score between 0-100 based on SLO pass/fail results.
Args:
slo_definitions: Mapping of SLO name -> severity ("critical" | "warning") OR
SLO name -> {"severity": str, "weight": int | None}.
prometheus_results: Mapping of SLO name -> bool indicating whether the SLO
passed. Any SLO missing in this mapping is treated as failed.
health_check_results: Mapping of custom health-check name -> bool pass flag.
These checks are always treated as *critical*.
Returns:
Tuple containing (final_score, breakdown) where *breakdown* is a dict with
the counts of passed/failed SLOs per severity.
"""
slo_objects: List[SLOResult] = []
for slo_name, slo_def in slo_definitions.items():
# Exclude SLOs that were not evaluated (query returned no data)
if slo_name not in prometheus_results:
continue
passed = bool(prometheus_results[slo_name])
# Support both old format (str) and new format (dict)
if isinstance(slo_def, str):
severity = slo_def
slo_weight = None
else:
severity = slo_def.get("severity", "warning")
slo_weight = slo_def.get("weight")
slo_objects.append(SLOResult(slo_name, severity, passed, weight=slo_weight))
# Health-check SLOs (by default keeping them critical)
for hc_name, hc_passed in health_check_results.items():
slo_objects.append(SLOResult(hc_name, "critical", bool(hc_passed)))
total_points = sum(slo.weight(DEFAULT_WEIGHTS) for slo in slo_objects)
points_lost = sum(slo.weight(DEFAULT_WEIGHTS) for slo in slo_objects if not slo.passed)
score = 0 if total_points == 0 else int(((total_points - points_lost) / total_points) * 100)
breakdown = {
"total_points": total_points,
"points_lost": points_lost,
"passed": len([s for s in slo_objects if s.passed]),
"failed": len([s for s in slo_objects if not s.passed]),
}
return score, breakdown
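A worked example of the weighted model with the default weights (critical = 3, warning = 1): one failed warning-level SLO out of one critical and two warnings loses 1 of 5 points, so the score is int(4 / 5 * 100) = 80. The SLO names are illustrative.

from krkn.resiliency.score import calculate_resiliency_score

slo_definitions = {
    "etcd leader changes": {"severity": "critical"},
    "API p99 latency": {"severity": "warning"},
    "node readiness flaps": {"severity": "warning"},
}
prometheus_results = {
    "etcd leader changes": True,     # passed, keeps its 3 points
    "API p99 latency": False,        # failed, loses 1 point
    "node readiness flaps": True,    # passed
}

score, breakdown = calculate_resiliency_score(
    slo_definitions=slo_definitions,
    prometheus_results=prometheus_results,
    health_check_results={},
)
assert score == 80
assert breakdown == {"total_points": 5, "points_lost": 1, "passed": 2, "failed": 1}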

View File

@@ -4,7 +4,7 @@ from abc import ABC, abstractmethod
from krkn_lib.models.telemetry import ScenarioTelemetry
from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift
from krkn import utils
from krkn import utils, cerberus
from krkn.rollback.handler import (
RollbackHandler,
execute_rollback_version_files,
@@ -30,7 +30,6 @@ class AbstractScenarioPlugin(ABC):
self,
run_uuid: str,
scenario: str,
krkn_config: dict[str, any],
lib_telemetry: KrknTelemetryOpenshift,
scenario_telemetry: ScenarioTelemetry,
) -> int:
@@ -104,7 +103,6 @@ class AbstractScenarioPlugin(ABC):
return_value = self.run(
run_uuid=run_uuid,
scenario=scenario_config,
krkn_config=krkn_config,
lib_telemetry=telemetry,
scenario_telemetry=scenario_telemetry,
)
@@ -126,12 +124,14 @@ class AbstractScenarioPlugin(ABC):
)
scenario_telemetry.exit_status = return_value
scenario_telemetry.end_timestamp = time.time()
start_time = int(scenario_telemetry.start_timestamp)
end_time = int(scenario_telemetry.end_timestamp)
utils.collect_and_put_ocp_logs(
telemetry,
parsed_scenario_config,
telemetry.get_telemetry_request_id(),
int(scenario_telemetry.start_timestamp),
int(scenario_telemetry.end_timestamp),
start_time,
end_time
)
if events_backup:
@@ -139,15 +139,17 @@ class AbstractScenarioPlugin(ABC):
krkn_config,
parsed_scenario_config,
telemetry.get_lib_kubernetes(),
int(scenario_telemetry.start_timestamp),
int(scenario_telemetry.end_timestamp),
start_time,
end_time
)
if scenario_telemetry.exit_status != 0:
failed_scenarios.append(scenario_config)
scenario_telemetries.append(scenario_telemetry)
logging.info(f"waiting {wait_duration} before running the next scenario")
cerberus.publish_kraken_status(start_time, end_time)
logging.info(f"waiting {wait_duration} before running the next scenario")
time.sleep(wait_duration)
return failed_scenarios, scenario_telemetries

View File

@@ -5,7 +5,6 @@ from krkn_lib.models.telemetry import ScenarioTelemetry
from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift
from krkn_lib.utils import get_yaml_item_value, get_random_string
from jinja2 import Template
from krkn import cerberus
from krkn.scenario_plugins.abstract_scenario_plugin import AbstractScenarioPlugin
from krkn.rollback.config import RollbackContent
from krkn.rollback.handler import set_rollback_context_decorator
@@ -17,11 +16,9 @@ class ApplicationOutageScenarioPlugin(AbstractScenarioPlugin):
self,
run_uuid: str,
scenario: str,
krkn_config: dict[str, any],
lib_telemetry: KrknTelemetryOpenshift,
scenario_telemetry: ScenarioTelemetry,
) -> int:
wait_duration = krkn_config["tunings"]["wait_duration"]
try:
with open(scenario, "r") as f:
app_outage_config_yaml = yaml.full_load(f)
@@ -110,14 +107,8 @@ class ApplicationOutageScenarioPlugin(AbstractScenarioPlugin):
policy_name, namespace
)
logging.info(
"End of scenario. Waiting for the specified duration: %s"
% wait_duration
)
time.sleep(wait_duration)
end_time = int(time.time())
cerberus.publish_kraken_status(krkn_config, [], start_time, end_time)
except Exception as e:
logging.error(
"ApplicationOutageScenarioPlugin exiting due to Exception %s" % e

View File

@@ -10,7 +10,6 @@ from krkn_lib.models.telemetry import ScenarioTelemetry
from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift
from krkn_lib.utils import get_yaml_item_value
from krkn.scenario_plugins.abstract_scenario_plugin import AbstractScenarioPlugin
@@ -19,7 +18,6 @@ class ContainerScenarioPlugin(AbstractScenarioPlugin):
self,
run_uuid: str,
scenario: str,
krkn_config: dict[str, any],
lib_telemetry: KrknTelemetryOpenshift,
scenario_telemetry: ScenarioTelemetry,
) -> int:

View File

@@ -23,7 +23,7 @@ from krkn.rollback.handler import set_rollback_context_decorator
class HogsScenarioPlugin(AbstractScenarioPlugin):
@set_rollback_context_decorator
def run(self, run_uuid: str, scenario: str, krkn_config: dict[str, any], lib_telemetry: KrknTelemetryOpenshift,
def run(self, run_uuid: str, scenario: str, lib_telemetry: KrknTelemetryOpenshift,
scenario_telemetry: ScenarioTelemetry) -> int:
try:
with open(scenario, "r") as f:

View File

@@ -46,7 +46,7 @@ class KubevirtVmOutageScenarioPlugin(AbstractScenarioPlugin):
try:
with open(scenario, "r") as f:
scenario_config = yaml.full_load(f)
self.init_clients(lib_telemetry.get_lib_kubernetes())
pods_status = PodsStatus()
for config in scenario_config["scenarios"]:
@@ -71,75 +71,14 @@ class KubevirtVmOutageScenarioPlugin(AbstractScenarioPlugin):
self.custom_object_client = k8s_client.custom_object_client
logging.info("Successfully initialized Kubernetes client for KubeVirt operations")
def get_vmi(self, name: str, namespace: str) -> Optional[Dict]:
"""
Get a Virtual Machine Instance by name and namespace.
:param name: Name of the VMI to retrieve
:param namespace: Namespace of the VMI
:return: The VMI object if found, None otherwise
"""
try:
vmi = self.custom_object_client.get_namespaced_custom_object(
group="kubevirt.io",
version="v1",
namespace=namespace,
plural="virtualmachineinstances",
name=name
)
return vmi
except ApiException as e:
if e.status == 404:
logging.warning(f"VMI {name} not found in namespace {namespace}")
return None
else:
logging.error(f"Error getting VMI {name}: {e}")
raise
except Exception as e:
logging.error(f"Unexpected error getting VMI {name}: {e}")
raise
def get_vmis(self, regex_name: str, namespace: str) -> Optional[Dict]:
"""
Get a Virtual Machine Instance by name and namespace.
:param name: Name of the VMI to retrieve
:param namespace: Namespace of the VMI
:return: The VMI object if found, None otherwise
"""
try:
namespaces = self.k8s_client.list_namespaces_by_regex(namespace)
for namespace in namespaces:
vmis = self.custom_object_client.list_namespaced_custom_object(
group="kubevirt.io",
version="v1",
namespace=namespace,
plural="virtualmachineinstances",
)
for vmi in vmis.get("items"):
vmi_name = vmi.get("metadata",{}).get("name")
match = re.match(regex_name, vmi_name)
if match:
self.vmis_list.append(vmi)
except ApiException as e:
if e.status == 404:
logging.warning(f"VMI {regex_name} not found in namespace {namespace}")
return []
else:
logging.error(f"Error getting VMI {regex_name}: {e}")
raise
except Exception as e:
logging.error(f"Unexpected error getting VMI {regex_name}: {e}")
raise
def execute_scenario(self, config: Dict[str, Any], scenario_telemetry: ScenarioTelemetry) -> int:
def execute_scenario(self, config: Dict[str, Any], scenario_telemetry: ScenarioTelemetry) -> PodsStatus:
"""
Execute a KubeVirt VM outage scenario based on the provided configuration.
:param config: The scenario configuration
:param scenario_telemetry: The telemetry object for recording metrics
:return: 0 for success, 1 for failure
:return: PodsStatus object containing recovered and unrecovered pods
"""
self.pods_status = PodsStatus()
try:
@@ -149,12 +88,12 @@ class KubevirtVmOutageScenarioPlugin(AbstractScenarioPlugin):
timeout = params.get("timeout", 60)
kill_count = params.get("kill_count", 1)
disable_auto_restart = params.get("disable_auto_restart", False)
if not vm_name:
logging.error("vm_name parameter is required")
return 1
return self.pods_status
self.pods_status = PodsStatus()
self.get_vmis(vm_name,namespace)
self.vmis_list = self.k8s_client.get_vmis(vm_name,namespace)
for _ in range(kill_count):
rand_int = random.randint(0, len(self.vmis_list) - 1)
@@ -163,17 +102,22 @@ class KubevirtVmOutageScenarioPlugin(AbstractScenarioPlugin):
logging.info(f"Starting KubeVirt VM outage scenario for VM: {vm_name} in namespace: {namespace}")
vmi_name = vmi.get("metadata").get("name")
vmi_namespace = vmi.get("metadata").get("namespace")
if not self.validate_environment(vmi_name, vmi_namespace):
return 1
vmi = self.get_vmi(vmi_name, vmi_namespace)
# Create affected_pod early so we can track failures
self.affected_pod = AffectedPod(
pod_name=vmi_name,
namespace=vmi_namespace,
)
if not self.validate_environment(vmi_name, vmi_namespace):
self.pods_status.unrecovered.append(self.affected_pod)
continue
vmi = self.k8s_client.get_vmi(vmi_name, vmi_namespace)
if not vmi:
logging.error(f"VMI {vm_name} not found in namespace {namespace}")
return 1
self.pods_status.unrecovered.append(self.affected_pod)
continue
self.original_vmi = vmi
logging.info(f"Captured initial state of VMI: {vm_name}")
@@ -212,15 +156,13 @@ class KubevirtVmOutageScenarioPlugin(AbstractScenarioPlugin):
"""
try:
# Check if KubeVirt CRDs exist
crd_list = self.custom_object_client.list_namespaced_custom_object("kubevirt.io","v1",namespace,"virtualmachines")
kubevirt_crds = [crd for crd in crd_list.items() ]
kubevirt_crds = self.k8s_client.get_vms(vm_name, namespace)
if not kubevirt_crds:
logging.error("KubeVirt CRDs not found. Ensure KubeVirt/CNV is installed in the cluster")
return False
# Check if VMI exists
vmi = self.get_vmi(vm_name, namespace)
vmi = self.k8s_client.get_vmi(vm_name, namespace)
if not vmi:
logging.error(f"VMI {vm_name} not found in namespace {namespace}")
return False
@@ -243,13 +185,7 @@ class KubevirtVmOutageScenarioPlugin(AbstractScenarioPlugin):
"""
try:
# Get the VM object first to get its current spec
vm = self.custom_object_client.get_namespaced_custom_object(
group="kubevirt.io",
version="v1",
namespace=namespace,
plural="virtualmachines",
name=vm_name
)
vm = self.k8s_client.get_vm(vm_name, namespace)
# Update the running state
if 'spec' not in vm:
@@ -257,14 +193,7 @@ class KubevirtVmOutageScenarioPlugin(AbstractScenarioPlugin):
vm['spec']['running'] = running
# Apply the patch
self.custom_object_client.patch_namespaced_custom_object(
group="kubevirt.io",
version="v1",
namespace=namespace,
plural="virtualmachines",
name=vm_name,
body=vm
)
self.k8s_client.patch_vm(vm_name,namespace,vm)
return True
except ApiException as e:
@@ -293,26 +222,12 @@ class KubevirtVmOutageScenarioPlugin(AbstractScenarioPlugin):
" - proceeding with deletion but VM may auto-restart")
start_creation_time = self.original_vmi.get('metadata', {}).get('creationTimestamp')
start_time = time.time()
try:
self.custom_object_client.delete_namespaced_custom_object(
group="kubevirt.io",
version="v1",
namespace=namespace,
plural="virtualmachineinstances",
name=vm_name
)
except ApiException as e:
if e.status == 404:
logging.warning(f"VMI {vm_name} not found during deletion")
return 1
else:
logging.error(f"API error during VMI deletion: {e}")
return 1
self.k8s_client.delete_vmi(vm_name, namespace)
# Wait for the VMI to be deleted
while time.time() - start_time < timeout:
deleted_vmi = self.get_vmi(vm_name, namespace)
deleted_vmi = self.k8s_client.get_vmi(vm_name, namespace)
if deleted_vmi:
if start_creation_time != deleted_vmi.get('metadata', {}).get('creationTimestamp'):
logging.info(f"VMI {vm_name} successfully recreated")
@@ -337,7 +252,7 @@ class KubevirtVmOutageScenarioPlugin(AbstractScenarioPlugin):
while time.time() - start_time < timeout:
# Check current state once since we've already waited for the duration
vmi = self.get_vmi(vm_name, namespace)
vmi = self.k8s_client.get_vmi(vm_name, namespace)
if vmi:
if vmi.get('status', {}).get('phase') == "Running":
@@ -378,13 +293,7 @@ class KubevirtVmOutageScenarioPlugin(AbstractScenarioPlugin):
del metadata[field]
# Create the VMI
self.custom_object_client.create_namespaced_custom_object(
group="kubevirt.io",
version="v1",
namespace=namespace,
plural="virtualmachineinstances",
body=vmi_dict
)
self.k8s_client.create_vmi(vm_name, namespace, vmi_dict)
logging.info(f"Successfully recreated VMI {vm_name}")
# Wait for VMI to start running

View File

@@ -7,7 +7,6 @@ from krkn_lib.models.telemetry import ScenarioTelemetry
from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift
from krkn_lib.utils import get_yaml_item_value
from krkn import cerberus, utils
from krkn.scenario_plugins.abstract_scenario_plugin import AbstractScenarioPlugin
from krkn.scenario_plugins.managed_cluster.common_functions import get_managedcluster
from krkn.scenario_plugins.managed_cluster.scenarios import Scenarios
@@ -18,7 +17,6 @@ class ManagedClusterScenarioPlugin(AbstractScenarioPlugin):
self,
run_uuid: str,
scenario: str,
krkn_config: dict[str, any],
lib_telemetry: KrknTelemetryOpenshift,
scenario_telemetry: ScenarioTelemetry,
) -> int:
@@ -38,8 +36,6 @@ class ManagedClusterScenarioPlugin(AbstractScenarioPlugin):
managedcluster_scenario_object,
lib_telemetry.get_lib_kubernetes(),
)
end_time = int(time.time())
cerberus.get_status(krkn_config, start_time, end_time)
except Exception as e:
logging.error(
"ManagedClusterScenarioPlugin exiting due to Exception %s"

View File

@@ -12,7 +12,6 @@ class NativeScenarioPlugin(AbstractScenarioPlugin):
self,
run_uuid: str,
scenario: str,
krkn_config: dict[str, any],
lib_telemetry: KrknTelemetryOpenshift,
scenario_telemetry: ScenarioTelemetry,
) -> int:
@@ -21,7 +20,6 @@ class NativeScenarioPlugin(AbstractScenarioPlugin):
PLUGINS.run(
scenario,
lib_telemetry.get_lib_kubernetes().get_kubeconfig_path(),
krkn_config,
run_uuid,
)

View File

@@ -1,141 +0,0 @@
import logging
import requests
import sys
import json
def get_status(config, start_time, end_time):
"""
Function to get Cerberus status
Args:
config
- Kraken config dictionary
start_time
- The time when chaos is injected
end_time
- The time when chaos is removed
Returns:
Cerberus status
"""
cerberus_status = True
check_application_routes = False
application_routes_status = True
if config["cerberus"]["cerberus_enabled"]:
cerberus_url = config["cerberus"]["cerberus_url"]
check_application_routes = config["cerberus"]["check_application_routes"]
if not cerberus_url:
logging.error("url where Cerberus publishes True/False signal is not provided.")
sys.exit(1)
cerberus_status = requests.get(cerberus_url, timeout=60).content
cerberus_status = True if cerberus_status == b"True" else False
# Fail if the application routes monitored by cerberus experience downtime during the chaos
if check_application_routes:
application_routes_status, unavailable_routes = application_status(cerberus_url, start_time, end_time)
if not application_routes_status:
logging.error(
"Application routes: %s monitored by cerberus encountered downtime during the run, failing"
% unavailable_routes
)
else:
logging.info("Application routes being monitored didn't encounter any downtime during the run!")
if not cerberus_status:
logging.error(
"Received a no-go signal from Cerberus, looks like "
"the cluster is unhealthy. Please check the Cerberus "
"report for more details. Test failed."
)
if not application_routes_status or not cerberus_status:
sys.exit(1)
else:
logging.info("Received a go signal from Ceberus, the cluster is healthy. " "Test passed.")
return cerberus_status
def publish_kraken_status(config, failed_post_scenarios, start_time, end_time):
"""
Function to publish Kraken status to Cerberus
Args:
config
- Kraken config dictionary
failed_post_scenarios
- String containing the failed post scenarios
start_time
- The time when chaos is injected
end_time
- The time when chaos is removed
"""
cerberus_status = get_status(config, start_time, end_time)
if not cerberus_status:
if failed_post_scenarios:
if config["kraken"]["exit_on_failure"]:
logging.info(
"Cerberus status is not healthy and post action scenarios " "are still failing, exiting kraken run"
)
sys.exit(1)
else:
logging.info("Cerberus status is not healthy and post action scenarios " "are still failing")
else:
if failed_post_scenarios:
if config["kraken"]["exit_on_failure"]:
logging.info(
"Cerberus status is healthy but post action scenarios " "are still failing, exiting kraken run"
)
sys.exit(1)
else:
logging.info("Cerberus status is healthy but post action scenarios " "are still failing")
def application_status(cerberus_url, start_time, end_time):
"""
Function to check application availability
Args:
cerberus_url
- url where Cerberus publishes True/False signal
start_time
- The time when chaos is injected
end_time
- The time when chaos is removed
Returns:
Application status and failed routes
"""
if not cerberus_url:
logging.error("url where Cerberus publishes True/False signal is not provided.")
sys.exit(1)
else:
duration = (end_time - start_time) / 60
url = cerberus_url + "/" + "history" + "?" + "loopback=" + str(duration)
logging.info("Scraping the metrics for the test duration from cerberus url: %s" % url)
try:
failed_routes = []
status = True
metrics = requests.get(url, timeout=60).content
metrics_json = json.loads(metrics)
for entry in metrics_json["history"]["failures"]:
if entry["component"] == "route":
name = entry["name"]
failed_routes.append(name)
status = False
else:
continue
except Exception as e:
logging.error("Failed to scrape metrics from cerberus API at %s: %s" % (url, e))
sys.exit(1)
return status, set(failed_routes)

View File

@@ -9,7 +9,6 @@ import random
from traceback import format_exc
from jinja2 import Environment, FileSystemLoader
from . import kubernetes_functions as kube_helper
from . import cerberus
import typing
from arcaflow_plugin_sdk import validation, plugin
from kubernetes.client.api.core_v1_api import CoreV1Api as CoreV1Api
@@ -100,13 +99,13 @@ class NetworkScenarioConfig:
default=None,
metadata={
"name": "Network Parameters",
"description": "The network filters that are applied on the interface. "
"The currently supported filters are latency, "
"loss and bandwidth",
},
"description":
"The network filters that are applied on the interface. "
"The currently supported filters are latency, "
"loss and bandwidth"
}
)
@dataclass
class NetworkScenarioSuccessOutput:
filter_direction: str = field(
@@ -773,8 +772,7 @@ def network_chaos(
logging.info("Deleting jobs")
delete_jobs(cli, batch_cli, job_list[:])
job_list = []
logging.info("Waiting for wait_duration : %ss" % cfg.wait_duration)
time.sleep(cfg.wait_duration)
create_interfaces = False
else:

View File

@@ -49,7 +49,7 @@ class Plugins:
def unserialize_scenario(self, file: str) -> Any:
return serialization.load_from_file(abspath(file))
def run(self, file: str, kubeconfig_path: str, kraken_config: str, run_uuid: str):
def run(self, file: str, kubeconfig_path: str, run_uuid: str):
"""
Run executes a series of steps
"""
@@ -93,8 +93,6 @@ class Plugins:
unserialized_input = step.schema.input.unserialize(entry["config"])
if "kubeconfig_path" in step.schema.input.properties:
unserialized_input.kubeconfig_path = kubeconfig_path
if "kraken_config" in step.schema.input.properties:
unserialized_input.kraken_config = kraken_config
output_id, output_data = step.schema(
params=unserialized_input, run_id=run_uuid
)

View File

@@ -1,157 +0,0 @@
import logging
import requests
import sys
import json
def get_status(config, start_time, end_time):
"""
Function to get Cerberus status
Args:
config
- Kraken config dictionary
start_time
- The time when chaos is injected
end_time
- The time when chaos is removed
Returns:
Cerberus status
"""
cerberus_status = True
check_application_routes = False
application_routes_status = True
if config["cerberus"]["cerberus_enabled"]:
cerberus_url = config["cerberus"]["cerberus_url"]
check_application_routes = config["cerberus"]["check_application_routes"]
if not cerberus_url:
logging.error(
"url where Cerberus publishes True/False signal is not provided.")
sys.exit(1)
cerberus_status = requests.get(cerberus_url, timeout=60).content
cerberus_status = True if cerberus_status == b"True" else False
# Fail if the application routes monitored by cerberus experience
# downtime during the chaos
if check_application_routes:
application_routes_status, unavailable_routes = application_status(
cerberus_url, start_time, end_time)
if not application_routes_status:
logging.error(
"Application routes: %s monitored by cerberus encountered downtime during the run, failing"
% unavailable_routes
)
else:
logging.info(
"Application routes being monitored didn't encounter any downtime during the run!")
if not cerberus_status:
logging.error(
"Received a no-go signal from Cerberus, looks like "
"the cluster is unhealthy. Please check the Cerberus "
"report for more details. Test failed."
)
if not application_routes_status or not cerberus_status:
sys.exit(1)
else:
logging.info(
"Received a go signal from Ceberus, the cluster is healthy. "
"Test passed.")
return cerberus_status
def publish_kraken_status(config, failed_post_scenarios, start_time, end_time):
"""
Function to publish Kraken status to Cerberus
Args:
config
- Kraken config dictionary
failed_post_scenarios
- String containing the failed post scenarios
start_time
- The time when chaos is injected
end_time
- The time when chaos is removed
"""
cerberus_status = get_status(config, start_time, end_time)
if not cerberus_status:
if failed_post_scenarios:
if config["kraken"]["exit_on_failure"]:
logging.info(
"Cerberus status is not healthy and post action scenarios " "are still failing, exiting kraken run"
)
sys.exit(1)
else:
logging.info(
"Cerberus status is not healthy and post action scenarios "
"are still failing")
else:
if failed_post_scenarios:
if config["kraken"]["exit_on_failure"]:
logging.info(
"Cerberus status is healthy but post action scenarios " "are still failing, exiting kraken run"
)
sys.exit(1)
else:
logging.info(
"Cerberus status is healthy but post action scenarios "
"are still failing")
def application_status(cerberus_url, start_time, end_time):
"""
Function to check application availability
Args:
cerberus_url
- url where Cerberus publishes True/False signal
start_time
- The time when chaos is injected
end_time
- The time when chaos is removed
Returns:
Application status and failed routes
"""
if not cerberus_url:
logging.error(
"url where Cerberus publishes True/False signal is not provided.")
sys.exit(1)
else:
duration = (end_time - start_time) / 60
url = cerberus_url + "/" + "history" + \
"?" + "loopback=" + str(duration)
logging.info(
"Scraping the metrics for the test duration from cerberus url: %s" %
url)
try:
failed_routes = []
status = True
metrics = requests.get(url, timeout=60).content
metrics_json = json.loads(metrics)
for entry in metrics_json["history"]["failures"]:
if entry["component"] == "route":
name = entry["name"]
failed_routes.append(name)
status = False
else:
continue
except Exception as e:
logging.error(
"Failed to scrape metrics from cerberus API at %s: %s" %
(url, e))
sys.exit(1)
return status, set(failed_routes)

View File

@@ -15,7 +15,6 @@ from arcaflow_plugin_sdk import plugin, validation
from kubernetes import client
from kubernetes.client.api.apiextensions_v1_api import ApiextensionsV1Api
from kubernetes.client.api.custom_objects_api import CustomObjectsApi
from . import cerberus
def get_test_pods(
@@ -36,7 +35,7 @@ def get_test_pods(
- pods matching the label on which network policy
need to be applied
namepsace (string)
namespace (string)
- namespace in which the pod is present
kubecli (KrknKubernetes)
@@ -1079,9 +1078,6 @@ def pod_outage(
job_list = []
publish = False
if params.kraken_config:
publish = True
for i in params.direction:
filter_dict[i] = eval(f"params.{i}_ports")
@@ -1137,11 +1133,6 @@ def pod_outage(
start_time = int(time.time())
logging.info("Waiting for job to finish")
wait_for_job(job_list[:], kubecli, params.test_duration + 300)
end_time = int(time.time())
if publish:
cerberus.publish_kraken_status(
params.kraken_config, "", start_time, end_time
)
return "success", PodOutageSuccessOutput(
test_pods=pods_list,
@@ -1412,24 +1403,13 @@ def pod_egress_shaping(
wait_for_job(job_list[:], kubecli, params.test_duration + 20)
logging.info("Waiting for wait_duration %s" % params.test_duration)
time.sleep(params.test_duration)
end_time = int(time.time())
if publish:
cerberus.publish_kraken_status(
params.kraken_config, "", start_time, end_time
)
if params.execution_type == "parallel":
break
if params.execution_type == "parallel":
logging.info("Waiting for parallel job to finish")
start_time = int(time.time())
wait_for_job(job_list[:], kubecli, params.test_duration + 300)
logging.info("Waiting for wait_duration %s" % params.test_duration)
time.sleep(params.test_duration)
end_time = int(time.time())
if publish:
cerberus.publish_kraken_status(
params.kraken_config, "", start_time, end_time
)
return "success", PodEgressNetShapingSuccessOutput(
test_pods=pods_list,
@@ -1696,15 +1676,12 @@ def pod_ingress_shaping(
)
if params.execution_type == "serial":
logging.info("Waiting for serial job to finish")
start_time = int(time.time())
wait_for_job(job_list[:], kubecli, params.test_duration + 20)
logging.info("Waiting for wait_duration %s" % params.test_duration)
wait_for_job(job_list[:], kubecli,
params.test_duration + 20)
logging.info("Waiting for wait_duration %s" %
params.test_duration)
time.sleep(params.test_duration)
end_time = int(time.time())
if publish:
cerberus.publish_kraken_status(
params.kraken_config, "", start_time, end_time
)
if params.execution_type == "parallel":
break
if params.execution_type == "parallel":
@@ -1713,11 +1690,6 @@ def pod_ingress_shaping(
wait_for_job(job_list[:], kubecli, params.test_duration + 300)
logging.info("Waiting for wait_duration %s" % params.test_duration)
time.sleep(params.test_duration)
end_time = int(time.time())
if publish:
cerberus.publish_kraken_status(
params.kraken_config, "", start_time, end_time
)
return "success", PodIngressNetShapingSuccessOutput(
test_pods=pods_list,

View File

@@ -10,7 +10,6 @@ from krkn_lib.models.telemetry import ScenarioTelemetry
from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift
from krkn_lib.utils import get_yaml_item_value, log_exception
from krkn import cerberus, utils
from krkn.scenario_plugins.node_actions import common_node_functions
from krkn.scenario_plugins.abstract_scenario_plugin import AbstractScenarioPlugin
@@ -20,7 +19,6 @@ class NetworkChaosScenarioPlugin(AbstractScenarioPlugin):
self,
run_uuid: str,
scenario: str,
krkn_config: dict[str, any],
lib_telemetry: KrknTelemetryOpenshift,
scenario_telemetry: ScenarioTelemetry,
) -> int:
@@ -112,34 +110,21 @@ class NetworkChaosScenarioPlugin(AbstractScenarioPlugin):
return 1
if test_execution == "serial":
logging.info("Waiting for serial job to finish")
start_time = int(time.time())
self.wait_for_job(
joblst[:],
lib_telemetry.get_lib_kubernetes(),
test_duration + 300,
)
end_time = int(time.time())
cerberus.publish_kraken_status(
krkn_config,
None,
start_time,
end_time,
)
if test_execution == "parallel":
break
if test_execution == "parallel":
logging.info("Waiting for parallel job to finish")
start_time = int(time.time())
self.wait_for_job(
joblst[:],
lib_telemetry.get_lib_kubernetes(),
test_duration + 300,
)
end_time = int(time.time())
cerberus.publish_kraken_status(
krkn_config, [], start_time, end_time
)
except Exception as e:
logging.error(
"NetworkChaosScenarioPlugin exiting due to Exception %s" % e

View File

@@ -1,5 +1,7 @@
import re
from dataclasses import dataclass
from enum import Enum
from typing import TypeVar, Optional
class NetworkChaosScenarioType(Enum):
@@ -9,16 +11,21 @@ class NetworkChaosScenarioType(Enum):
@dataclass
class BaseNetworkChaosConfig:
supported_execution = ["serial", "parallel"]
id: str
image: str
wait_duration: int
test_duration: int
label_selector: str
service_account: str
taints: list[str]
namespace: str
instance_count: int
execution: str
namespace: str
taints: list[str]
supported_execution = ["serial", "parallel"]
interfaces: list[str]
target: str
ingress: bool
egress: bool
def validate(self) -> list[str]:
errors = []
@@ -41,12 +48,7 @@ class BaseNetworkChaosConfig:
@dataclass
class NetworkFilterConfig(BaseNetworkChaosConfig):
ingress: bool
egress: bool
interfaces: list[str]
target: str
ports: list[int]
image: str
protocols: list[str]
def validate(self) -> list[str]:
@@ -58,3 +60,30 @@ class NetworkFilterConfig(BaseNetworkChaosConfig):
f"{self.protocols} contains not allowed protocols only tcp and udp is allowed"
)
return errors
@dataclass
class NetworkChaosConfig(BaseNetworkChaosConfig):
latency: Optional[str] = None
loss: Optional[str] = None
bandwidth: Optional[str] = None
force: Optional[bool] = None
def validate(self) -> list[str]:
errors = super().validate()
latency_regex = re.compile(r"^(\d+)(us|ms|s)$")
bandwidth_regex = re.compile(r"^(\d+)(bit|kbit|mbit|gbit|tbit)$")
if self.latency:
if not (latency_regex.match(self.latency)):
errors.append(
"latency must be a number followed by `us` (microseconds) or `ms` (milliseconds), or `s` (seconds)"
)
if self.bandwidth:
if not (bandwidth_regex.match(self.bandwidth)):
errors.append(
"bandwidth must be a number followed by `bit` `kbit` or `mbit` or `tbit`"
)
if self.loss:
if "%" in self.loss or not self.loss.isdigit():
errors.append("loss must be a number followed without the `%` symbol")
return errors
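As a quick illustration of the formats accepted by the validation above, the snippet below re-uses the same regular expressions and the same loss check on some made-up values; it is a standalone sketch, not part of the plugin.

import re

# Same patterns as NetworkChaosConfig.validate above; the tested values are illustrative.
latency_regex = re.compile(r"^(\d+)(us|ms|s)$")
bandwidth_regex = re.compile(r"^(\d+)(bit|kbit|mbit|gbit|tbit)$")

assert latency_regex.match("200ms") and latency_regex.match("50us")
assert not latency_regex.match("200")        # missing unit is rejected
assert bandwidth_regex.match("1gbit")
assert not bandwidth_regex.match("1GB")      # units are lower case
loss = "10"
assert "%" not in loss and loss.isdigit()    # "10%" or "ten" would fail validation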

View File

@@ -1,6 +1,7 @@
import abc
import logging
import queue
from typing import Tuple
from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift
from krkn.scenario_plugins.network_chaos_ng.models import (
@@ -27,7 +28,7 @@ class AbstractNetworkChaosModule(abc.ABC):
pass
@abc.abstractmethod
def get_config(self) -> (NetworkChaosScenarioType, BaseNetworkChaosConfig):
def get_config(self) -> Tuple[NetworkChaosScenarioType, BaseNetworkChaosConfig]:
"""
returns the common subset of settings shared by all the scenarios `BaseNetworkChaosConfig` and the type of Network
Chaos Scenario that is running (Pod Scenario or Node Scenario)
@@ -41,6 +42,42 @@ class AbstractNetworkChaosModule(abc.ABC):
pass
def get_node_targets(self, config: BaseNetworkChaosConfig):
if self.base_network_config.label_selector:
return self.kubecli.get_lib_kubernetes().list_nodes(
self.base_network_config.label_selector
)
else:
if not config.target:
raise Exception(
"neither node selector nor node_name (target) specified, aborting."
)
node_info = self.kubecli.get_lib_kubernetes().list_nodes()
if config.target not in node_info:
raise Exception(f"node {config.target} not found, aborting")
return [config.target]
def get_pod_targets(self, config: BaseNetworkChaosConfig):
if not config.namespace:
raise Exception("namespace not specified, aborting")
if self.base_network_config.label_selector:
return self.kubecli.get_lib_kubernetes().list_pods(
config.namespace, config.label_selector
)
else:
if not config.target:
raise Exception(
"neither node selector nor node_name (target) specified, aborting."
)
if not self.kubecli.get_lib_kubernetes().check_if_pod_exists(
config.target, config.namespace
):
raise Exception(
f"pod {config.target} not found in namespace {config.namespace}"
)
return [config.target]
def __init__(
self,
base_network_config: BaseNetworkChaosConfig,

View File

@@ -0,0 +1,156 @@
import queue
import time
from typing import Tuple
from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift
from krkn_lib.utils import get_random_string
from krkn.scenario_plugins.network_chaos_ng.models import (
NetworkChaosScenarioType,
BaseNetworkChaosConfig,
NetworkChaosConfig,
)
from krkn.scenario_plugins.network_chaos_ng.modules.abstract_network_chaos_module import (
AbstractNetworkChaosModule,
)
from krkn.scenario_plugins.network_chaos_ng.modules.utils import (
log_info,
setup_network_chaos_ng_scenario,
log_error,
log_warning,
)
from krkn.scenario_plugins.network_chaos_ng.modules.utils_network_chaos import (
common_set_limit_rules,
common_delete_limit_rules,
node_qdisc_is_simple,
)
class NodeNetworkChaosModule(AbstractNetworkChaosModule):
def __init__(self, config: NetworkChaosConfig, kubecli: KrknTelemetryOpenshift):
super().__init__(config, kubecli)
self.config = config
def run(self, target: str, error_queue: queue.Queue = None):
parallel = False
if error_queue:
parallel = True
try:
network_chaos_pod_name = f"node-network-chaos-{get_random_string(5)}"
container_name = f"fedora-container-{get_random_string(5)}"
log_info(
f"creating workload to inject network chaos in node {target} network"
f"latency:{str(self.config.latency) if self.config.latency else '0'}, "
f"packet drop:{str(self.config.loss) if self.config.loss else '0'} "
f"bandwidth restriction:{str(self.config.bandwidth) if self.config.bandwidth else '0'} ",
parallel,
network_chaos_pod_name,
)
_, interfaces = setup_network_chaos_ng_scenario(
self.config,
target,
network_chaos_pod_name,
container_name,
self.kubecli.get_lib_kubernetes(),
target,
parallel,
True,
)
if len(self.config.interfaces) == 0:
if len(interfaces) == 0:
log_error(
"no network interface found in pod, impossible to execute the network chaos scenario",
parallel,
network_chaos_pod_name,
)
return
log_info(
f"detected network interfaces: {','.join(interfaces)}",
parallel,
network_chaos_pod_name,
)
else:
interfaces = self.config.interfaces
log_info(
f"targeting node {target}",
parallel,
network_chaos_pod_name,
)
complex_config_interfaces = []
for interface in interfaces:
is_simple = node_qdisc_is_simple(
self.kubecli.get_lib_kubernetes(),
network_chaos_pod_name,
self.config.namespace,
interface,
)
if not is_simple:
complex_config_interfaces.append(interface)
if len(complex_config_interfaces) > 0 and not self.config.force:
log_warning(
f"node already has tc rules set for {','.join(complex_config_interfaces)}, this action might damage the cluster,"
"if you want to continue set `force` to True in the node network "
"chaos scenario config file and try again"
)
else:
if len(complex_config_interfaces) > 0 and self.config.force:
log_warning(
f"you are forcing node network configuration override for {','.join(complex_config_interfaces)},"
"this action might lead to unpredictable node behaviour, "
"you're doing it in your own responsibility"
"waiting 10 seconds before continuing"
)
time.sleep(10)
common_set_limit_rules(
self.config.egress,
self.config.ingress,
interfaces,
self.config.bandwidth,
self.config.latency,
self.config.loss,
parallel,
network_chaos_pod_name,
self.kubecli.get_lib_kubernetes(),
network_chaos_pod_name,
self.config.namespace,
None,
)
time.sleep(self.config.test_duration)
log_info("removing tc rules", parallel, network_chaos_pod_name)
common_delete_limit_rules(
self.config.egress,
self.config.ingress,
interfaces,
network_chaos_pod_name,
self.config.namespace,
self.kubecli.get_lib_kubernetes(),
None,
parallel,
network_chaos_pod_name,
)
self.kubecli.get_lib_kubernetes().delete_pod(
network_chaos_pod_name, self.config.namespace
)
except Exception as e:
if error_queue is None:
raise e
else:
error_queue.put(str(e))
def get_config(self) -> Tuple[NetworkChaosScenarioType, BaseNetworkChaosConfig]:
return NetworkChaosScenarioType.Node, self.config
def get_targets(self) -> list[str]:
return self.get_node_targets(self.config)

View File

@@ -1,5 +1,6 @@
import queue
import time
from typing import Tuple
from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift
from krkn_lib.utils import get_random_string
@@ -11,14 +12,16 @@ from krkn.scenario_plugins.network_chaos_ng.models import (
from krkn.scenario_plugins.network_chaos_ng.modules.abstract_network_chaos_module import (
AbstractNetworkChaosModule,
)
from krkn.scenario_plugins.network_chaos_ng.modules.utils import log_info
from krkn.scenario_plugins.network_chaos_ng.modules.utils import (
log_info,
deploy_network_chaos_ng_pod,
get_pod_default_interface,
)
from krkn.scenario_plugins.network_chaos_ng.modules.utils_network_filter import (
deploy_network_filter_pod,
apply_network_rules,
clean_network_rules,
generate_rules,
get_default_interface,
)
@@ -41,7 +44,7 @@ class NodeNetworkFilterModule(AbstractNetworkChaosModule):
)
pod_name = f"node-filter-{get_random_string(5)}"
deploy_network_filter_pod(
deploy_network_chaos_ng_pod(
self.config,
target,
pod_name,
@@ -50,7 +53,7 @@ class NodeNetworkFilterModule(AbstractNetworkChaosModule):
if len(self.config.interfaces) == 0:
interfaces = [
get_default_interface(
get_pod_default_interface(
pod_name,
self.config.namespace,
self.kubecli.get_lib_kubernetes(),
@@ -108,21 +111,8 @@ class NodeNetworkFilterModule(AbstractNetworkChaosModule):
super().__init__(config, kubecli)
self.config = config
def get_config(self) -> (NetworkChaosScenarioType, BaseNetworkChaosConfig):
def get_config(self) -> Tuple[NetworkChaosScenarioType, BaseNetworkChaosConfig]:
return NetworkChaosScenarioType.Node, self.config
def get_targets(self) -> list[str]:
if self.base_network_config.label_selector:
return self.kubecli.get_lib_kubernetes().list_nodes(
self.base_network_config.label_selector
)
else:
if not self.config.target:
raise Exception(
"neither node selector nor node_name (target) specified, aborting."
)
node_info = self.kubecli.get_lib_kubernetes().list_nodes()
if self.config.target not in node_info:
raise Exception(f"node {self.config.target} not found, aborting")
return [self.config.target]
return self.get_node_targets(self.config)

View File

@@ -0,0 +1,159 @@
import queue
import time
from typing import Tuple
from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift
from krkn_lib.utils import get_random_string
from krkn.scenario_plugins.network_chaos_ng.models import (
NetworkChaosScenarioType,
BaseNetworkChaosConfig,
NetworkChaosConfig,
)
from krkn.scenario_plugins.network_chaos_ng.modules.abstract_network_chaos_module import (
AbstractNetworkChaosModule,
)
from krkn.scenario_plugins.network_chaos_ng.modules.utils import (
log_info,
setup_network_chaos_ng_scenario,
log_error,
)
from krkn.scenario_plugins.network_chaos_ng.modules.utils_network_chaos import (
common_set_limit_rules,
common_delete_limit_rules,
)
class PodNetworkChaosModule(AbstractNetworkChaosModule):
def __init__(self, config: NetworkChaosConfig, kubecli: KrknTelemetryOpenshift):
super().__init__(config, kubecli)
self.config = config
def run(self, target: str, error_queue: queue.Queue = None):
parallel = False
if error_queue:
parallel = True
try:
network_chaos_pod_name = f"pod-network-chaos-{get_random_string(5)}"
container_name = f"fedora-container-{get_random_string(5)}"
pod_info = self.kubecli.get_lib_kubernetes().get_pod_info(
target, self.config.namespace
)
log_info(
f"creating workload to inject network chaos in pod {target} network"
f"latency:{str(self.config.latency) if self.config.latency else '0'}, "
f"packet drop:{str(self.config.loss) if self.config.loss else '0'} "
f"bandwidth restriction:{str(self.config.bandwidth) if self.config.bandwidth else '0'} ",
parallel,
network_chaos_pod_name,
)
if not pod_info:
raise Exception(
f"impossible to retrieve infos for pod {target} namespace {self.config.namespace}"
)
container_ids, interfaces = setup_network_chaos_ng_scenario(
self.config,
pod_info.nodeName,
network_chaos_pod_name,
container_name,
self.kubecli.get_lib_kubernetes(),
target,
parallel,
False,
)
if len(self.config.interfaces) == 0:
if len(interfaces) == 0:
log_error(
"no network interface found in pod, impossible to execute the network chaos scenario",
parallel,
network_chaos_pod_name,
)
return
log_info(
f"detected network interfaces: {','.join(interfaces)}",
parallel,
network_chaos_pod_name,
)
else:
interfaces = self.config.interfaces
if len(container_ids) == 0:
raise Exception(
f"impossible to resolve container id for pod {target} namespace {self.config.namespace}"
)
log_info(
f"targeting container {container_ids[0]}",
parallel,
network_chaos_pod_name,
)
pids = self.kubecli.get_lib_kubernetes().get_pod_pids(
base_pod_name=network_chaos_pod_name,
base_pod_namespace=self.config.namespace,
base_pod_container_name=container_name,
pod_name=target,
pod_namespace=self.config.namespace,
pod_container_id=container_ids[0],
)
if not pids:
raise Exception(f"impossible to resolve pid for pod {target}")
log_info(
f"resolved pids {pids} in node {pod_info.nodeName} for pod {target}",
parallel,
network_chaos_pod_name,
)
common_set_limit_rules(
self.config.egress,
self.config.ingress,
interfaces,
self.config.bandwidth,
self.config.latency,
self.config.loss,
parallel,
network_chaos_pod_name,
self.kubecli.get_lib_kubernetes(),
network_chaos_pod_name,
self.config.namespace,
pids,
)
time.sleep(self.config.test_duration)
log_info("removing tc rules", parallel, network_chaos_pod_name)
common_delete_limit_rules(
self.config.egress,
self.config.ingress,
interfaces,
network_chaos_pod_name,
self.config.namespace,
self.kubecli.get_lib_kubernetes(),
pids,
parallel,
network_chaos_pod_name,
)
self.kubecli.get_lib_kubernetes().delete_pod(
network_chaos_pod_name, self.config.namespace
)
except Exception as e:
if error_queue is None:
raise e
else:
error_queue.put(str(e))
def get_config(self) -> Tuple[NetworkChaosScenarioType, BaseNetworkChaosConfig]:
return NetworkChaosScenarioType.Pod, self.config
def get_targets(self) -> list[str]:
return self.get_pod_targets(self.config)

View File

@@ -1,6 +1,6 @@
import logging
import queue
import time
from typing import Tuple
from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift
from krkn_lib.utils import get_random_string
@@ -13,12 +13,17 @@ from krkn.scenario_plugins.network_chaos_ng.models import (
from krkn.scenario_plugins.network_chaos_ng.modules.abstract_network_chaos_module import (
AbstractNetworkChaosModule,
)
from krkn.scenario_plugins.network_chaos_ng.modules.utils import log_info, log_error
from krkn.scenario_plugins.network_chaos_ng.modules.utils import (
log_info,
log_error,
deploy_network_chaos_ng_pod,
get_pod_default_interface,
setup_network_chaos_ng_scenario,
)
from krkn.scenario_plugins.network_chaos_ng.modules.utils_network_filter import (
deploy_network_filter_pod,
generate_namespaced_rules,
apply_network_rules,
clean_network_rules_namespaced,
generate_namespaced_rules,
)
@@ -50,22 +55,18 @@ class PodNetworkFilterModule(AbstractNetworkChaosModule):
f"impossible to retrieve infos for pod {self.config.target} namespace {self.config.namespace}"
)
deploy_network_filter_pod(
container_ids, interfaces = setup_network_chaos_ng_scenario(
self.config,
pod_info.nodeName,
pod_name,
self.kubecli.get_lib_kubernetes(),
container_name,
host_network=False,
self.kubecli.get_lib_kubernetes(),
target,
parallel,
False,
)
if len(self.config.interfaces) == 0:
interfaces = (
self.kubecli.get_lib_kubernetes().list_pod_network_interfaces(
target, self.config.namespace
)
)
if len(interfaces) == 0:
log_error(
"no network interface found in pod, impossible to execute the network filter scenario",
@@ -157,26 +158,8 @@ class PodNetworkFilterModule(AbstractNetworkChaosModule):
super().__init__(config, kubecli)
self.config = config
def get_config(self) -> (NetworkChaosScenarioType, BaseNetworkChaosConfig):
def get_config(self) -> Tuple[NetworkChaosScenarioType, BaseNetworkChaosConfig]:
return NetworkChaosScenarioType.Pod, self.config
def get_targets(self) -> list[str]:
if not self.config.namespace:
raise Exception("namespace not specified, aborting")
if self.base_network_config.label_selector:
return self.kubecli.get_lib_kubernetes().list_pods(
self.config.namespace, self.config.label_selector
)
else:
if not self.config.target:
raise Exception(
"neither node selector nor node_name (target) specified, aborting."
)
if not self.kubecli.get_lib_kubernetes().check_if_pod_exists(
self.config.target, self.config.namespace
):
raise Exception(
f"pod {self.config.target} not found in namespace {self.config.namespace}"
)
return [self.config.target]
return self.get_pod_targets(self.config)

View File

@@ -1,4 +1,15 @@
import logging
import os
from typing import Tuple
import yaml
from jinja2 import FileSystemLoader, Environment
from krkn_lib.k8s import KrknKubernetes
from krkn_lib.models.k8s import Pod
from krkn.scenario_plugins.network_chaos_ng.models import (
BaseNetworkChaosConfig,
)
def log_info(message: str, parallel: bool = False, node_name: str = ""):
@@ -29,3 +40,101 @@ def log_warning(message: str, parallel: bool = False, node_name: str = ""):
logging.warning(f"[{node_name}]: {message}")
else:
logging.warning(message)
def deploy_network_chaos_ng_pod(
config: BaseNetworkChaosConfig,
target_node: str,
pod_name: str,
kubecli: KrknKubernetes,
container_name: str = "fedora",
host_network: bool = True,
):
file_loader = FileSystemLoader(os.path.abspath(os.path.dirname(__file__)))
env = Environment(loader=file_loader, autoescape=True)
pod_template = env.get_template("templates/network-chaos.j2")
tolerations = []
for taint in config.taints:
key_value_part, effect = taint.split(":", 1)
if "=" in key_value_part:
key, value = key_value_part.split("=", 1)
operator = "Equal"
else:
key = key_value_part
value = None
operator = "Exists"
toleration = {
"key": key,
"operator": operator,
"effect": effect,
}
if value is not None:
toleration["value"] = value
tolerations.append(toleration)
pod_body = yaml.safe_load(
pod_template.render(
pod_name=pod_name,
namespace=config.namespace,
host_network=host_network,
target=target_node,
container_name=container_name,
workload_image=config.image,
taints=tolerations,
service_account=config.service_account,
)
)
kubecli.create_pod(pod_body, config.namespace, 300)
def get_pod_default_interface(
pod_name: str, namespace: str, kubecli: KrknKubernetes
) -> str:
cmd = "ip r | grep default | awk '/default/ {print $5}'"
output = kubecli.exec_cmd_in_pod([cmd], pod_name, namespace)
return output.replace("\n", "")
def setup_network_chaos_ng_scenario(
config: BaseNetworkChaosConfig,
node_name: str,
pod_name: str,
container_name: str,
kubecli: KrknKubernetes,
target: str,
parallel: bool,
host_network: bool,
) -> Tuple[list[str], list[str]]:
deploy_network_chaos_ng_pod(
config,
node_name,
pod_name,
kubecli,
container_name,
host_network=host_network,
)
if len(config.interfaces) == 0:
interfaces = [
get_pod_default_interface(
pod_name,
config.namespace,
kubecli,
)
]
log_info(f"detected default interface {interfaces[0]}", parallel, target)
else:
interfaces = config.interfaces
# if host_network is False the target is a pod, so container ids need to be
# resolved; otherwise it is not needed
if not host_network:
container_ids = kubecli.get_container_ids(target, config.namespace)
else:
container_ids = []
return container_ids, interfaces
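The toleration handling in deploy_network_chaos_ng_pod above splits each taint string on the first `:` and, when a `=` is present, on the first `=`. Below is a minimal standalone sketch of that mapping; the two taint strings are hypothetical examples, not values shipped with the scenario.

def taint_to_toleration(taint: str) -> dict:
    # same parsing as deploy_network_chaos_ng_pod above
    key_value_part, effect = taint.split(":", 1)
    if "=" in key_value_part:
        key, value = key_value_part.split("=", 1)
        operator = "Equal"
    else:
        key, value, operator = key_value_part, None, "Exists"
    toleration = {"key": key, "operator": operator, "effect": effect}
    if value is not None:
        toleration["value"] = value
    return toleration

print(taint_to_toleration("node-role.kubernetes.io/master:NoSchedule"))
# {'key': 'node-role.kubernetes.io/master', 'operator': 'Exists', 'effect': 'NoSchedule'}
print(taint_to_toleration("dedicated=gpu:NoExecute"))
# {'key': 'dedicated', 'operator': 'Equal', 'effect': 'NoExecute', 'value': 'gpu'}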

View File

@@ -0,0 +1,263 @@
import subprocess
import logging
from typing import Optional
from krkn_lib.k8s import KrknKubernetes
from krkn.scenario_plugins.network_chaos_ng.modules.utils import (
log_info,
log_warning,
log_error,
)
ROOT_HANDLE = "100:"
CLASS_ID = "100:1"
NETEM_HANDLE = "101:"
def run(cmd: list[str], check: bool = True) -> subprocess.CompletedProcess:
return subprocess.run(cmd, check=check, text=True, capture_output=True)
def tc_node(args: list[str]) -> subprocess.CompletedProcess:
return run(["tc"] + args)
def get_build_tc_tree_commands(devs: list[str]) -> list[str]:
tree = []
for dev in devs:
tree.append(f"tc qdisc add dev {dev} root handle {ROOT_HANDLE} htb default 1")
tree.append(
f"tc class add dev {dev} parent {ROOT_HANDLE} classid {CLASS_ID} htb rate 1gbit",
)
tree.append(
f"tc qdisc add dev {dev} parent {CLASS_ID} handle {NETEM_HANDLE} netem delay 0ms loss 0%",
)
return tree
def namespaced_tc_commands(pids: list[str], commands: list[str]) -> list[str]:
return [
f"nsenter --target {pid} --net -- {rule}" for pid in pids for rule in commands
]
def get_egress_shaping_comand(
devices: list[str],
rate_mbit: Optional[str],
delay_ms: Optional[str],
loss_pct: Optional[str],
) -> list[str]:
rate_commands = []
rate = f"{rate_mbit}mbit" if rate_mbit is not None else "1gbit"
d = delay_ms if delay_ms is not None else 0
l = loss_pct if loss_pct is not None else 0
for dev in devices:
rate_commands.append(
f"tc class change dev {dev} parent {ROOT_HANDLE} classid {CLASS_ID} htb rate {rate}"
)
rate_commands.append(
f"tc qdisc change dev {dev} parent {CLASS_ID} handle {NETEM_HANDLE} netem delay {d}ms loss {l}%"
)
return rate_commands
def get_clear_egress_shaping_commands(devices: list[str]) -> list[str]:
return [f"tc qdisc del dev {dev} root handle {ROOT_HANDLE}" for dev in devices]
def get_ingress_shaping_commands(
devs: list[str],
rate_mbit: Optional[str],
delay_ms: Optional[str],
loss_pct: Optional[str],
ifb_dev: str = "ifb0",
) -> list[str]:
rate_commands = [
f"modprobe ifb || true",
f"ip link add {ifb_dev} type ifb || true",
f"ip link set {ifb_dev} up || true",
]
for dev in devs:
rate_commands.append(f"tc qdisc add dev {dev} handle ffff: ingress || true")
rate_commands.append(
f"tc filter add dev {dev} parent ffff: protocol all prio 1 "
f"matchall action mirred egress redirect dev {ifb_dev} || true"
)
rate_commands.append(
f"tc qdisc add dev {ifb_dev} root handle {ROOT_HANDLE} htb default 1 || true"
)
rate_commands.append(
f"tc class add dev {ifb_dev} parent {ROOT_HANDLE} classid {CLASS_ID} "
f"htb rate {rate_mbit if rate_mbit else '1gbit'} || true"
)
rate_commands.append(
f"tc qdisc add dev {ifb_dev} parent {CLASS_ID} handle {NETEM_HANDLE} "
f"netem delay {delay_ms if delay_ms else '0ms'} "
f"loss {loss_pct if loss_pct else '0'}% || true"
)
return rate_commands
def get_clear_ingress_shaping_commands(
devs: list[str],
ifb_dev: str = "ifb0",
) -> list[str]:
cmds: list[str] = []
for dev in devs:
cmds.append(f"tc qdisc del dev {dev} ingress || true")
cmds.append(f"tc qdisc del dev {ifb_dev} root handle {ROOT_HANDLE} || true")
cmds.append(f"ip link set {ifb_dev} down || true")
cmds.append(f"ip link del {ifb_dev} || true")
return cmds
def node_qdisc_is_simple(
kubecli: KrknKubernetes, pod_name, namespace: str, interface: str
) -> bool:
result = kubecli.exec_cmd_in_pod(
[f"tc qdisc show dev {interface}"], pod_name, namespace
)
lines = [l for l in result.splitlines() if l.strip()]
if len(lines) != 1:
return False
line = lines[0].lower()
if "htb" in line or "netem" in line or "clsact" in line:
return False
return True
def common_set_limit_rules(
egress: bool,
ingress: bool,
interfaces: list[str],
bandwidth: str,
latency: str,
loss: str,
parallel: bool,
target: str,
kubecli: KrknKubernetes,
network_chaos_pod_name: str,
namespace: str,
pids: Optional[list[str]] = None,
):
if egress:
build_tree_commands = get_build_tc_tree_commands(interfaces)
if pids:
build_tree_commands = namespaced_tc_commands(pids, build_tree_commands)
egress_shaping_commands = get_egress_shaping_comand(
interfaces,
bandwidth,
latency,
loss,
)
if pids:
egress_shaping_commands = namespaced_tc_commands(
pids, egress_shaping_commands
)
error_counter = 0
for rule in build_tree_commands:
result = kubecli.exec_cmd_in_pod([rule], network_chaos_pod_name, namespace)
if not result:
log_info(f"created tc tree in pod: {rule}", parallel, target)
else:
error_counter += 1
if len(build_tree_commands) == error_counter:
log_error(
"failed to apply egress shaping rules on cluster", parallel, target
)
for rule in egress_shaping_commands:
result = kubecli.exec_cmd_in_pod([rule], network_chaos_pod_name, namespace)
if not result:
log_info(f"applied egress shaping rules: {rule}", parallel, target)
if ingress:
ingress_shaping_commands = get_ingress_shaping_commands(
interfaces,
bandwidth,
latency,
loss,
)
if pids:
ingress_shaping_commands = namespaced_tc_commands(
pids, ingress_shaping_commands
)
error_counter = 0
for rule in ingress_shaping_commands:
result = kubecli.exec_cmd_in_pod([rule], network_chaos_pod_name, namespace)
if not result:
log_info(
f"applied ingress shaping rule: {rule}",
parallel,
network_chaos_pod_name,
)
else:
error_counter += 1
if len(ingress_shaping_commands) == error_counter:
log_error(
"failed to apply ingress shaping rules on cluster", parallel, target
)
def common_delete_limit_rules(
egress: bool,
ingress: bool,
interfaces: list[str],
network_chaos_pod_name: str,
network_chaos_namespace: str,
kubecli: KrknKubernetes,
pids: Optional[list[str]],
parallel: bool,
target: str,
):
if egress:
clear_commands = get_clear_egress_shaping_commands(interfaces)
if pids:
clear_commands = namespaced_tc_commands(pids, clear_commands)
error_counter = 0
for rule in clear_commands:
result = kubecli.exec_cmd_in_pod(
[rule], network_chaos_pod_name, network_chaos_namespace
)
if not result:
log_info(f"removed egress shaping rule : {rule}", parallel, target)
else:
error_counter += 1
if len(clear_commands) == error_counter:
log_error(
"failed to remove egress shaping rules on cluster", parallel, target
)
if ingress:
clear_commands = get_clear_ingress_shaping_commands(interfaces)
if pids:
clear_commands = namespaced_tc_commands(pids, clear_commands)
error_counter = 0
for rule in clear_commands:
result = kubecli.exec_cmd_in_pod(
[rule], network_chaos_pod_name, network_chaos_namespace
)
if not result:
log_info(f"removed ingress shaping rule: {rule}", parallel, target)
else:
error_counter += 1
if len(clear_commands) == error_counter:
log_error(
"failed to remove ingress shaping rules on cluster", parallel, target
)
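A minimal sketch of what the helpers above generate before they are executed in the network-chaos pod, assuming this file is modules/utils_network_chaos.py as the imports elsewhere in this diff suggest; the device name and pid are hypothetical.

from krkn.scenario_plugins.network_chaos_ng.modules.utils_network_chaos import (
    get_build_tc_tree_commands,
    namespaced_tc_commands,
)

tree = get_build_tc_tree_commands(["eth0"])
# ['tc qdisc add dev eth0 root handle 100: htb default 1',
#  'tc class add dev eth0 parent 100: classid 100:1 htb rate 1gbit',
#  'tc qdisc add dev eth0 parent 100:1 handle 101: netem delay 0ms loss 0%']

# When pids are supplied (pod-level chaos) every rule is wrapped with nsenter so
# it runs inside the target pod's network namespace:
for cmd in namespaced_tc_commands(["4242"], tree):
    print(cmd)
# nsenter --target 4242 --net -- tc qdisc add dev eth0 root handle 100: htb default 1
# ...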

View File

@@ -1,7 +1,5 @@
import os
from typing import Tuple
import yaml
from jinja2 import FileSystemLoader, Environment
from krkn_lib.k8s import KrknKubernetes
from krkn.scenario_plugins.network_chaos_ng.models import NetworkFilterConfig
@@ -10,7 +8,7 @@ from krkn.scenario_plugins.network_chaos_ng.modules.utils import log_info
def generate_rules(
interfaces: list[str], config: NetworkFilterConfig
) -> (list[str], list[str]):
) -> Tuple[list[str], list[str]]:
input_rules = []
output_rules = []
for interface in interfaces:
@@ -29,72 +27,6 @@ def generate_rules(
return input_rules, output_rules
def generate_namespaced_rules(
interfaces: list[str], config: NetworkFilterConfig, pids: list[str]
) -> (list[str], list[str]):
namespaced_input_rules: list[str] = []
namespaced_output_rules: list[str] = []
input_rules, output_rules = generate_rules(interfaces, config)
for pid in pids:
ns_input_rules = [
f"nsenter --target {pid} --net -- {rule}" for rule in input_rules
]
ns_output_rules = [
f"nsenter --target {pid} --net -- {rule}" for rule in output_rules
]
namespaced_input_rules.extend(ns_input_rules)
namespaced_output_rules.extend(ns_output_rules)
return namespaced_input_rules, namespaced_output_rules
def deploy_network_filter_pod(
config: NetworkFilterConfig,
target_node: str,
pod_name: str,
kubecli: KrknKubernetes,
container_name: str = "fedora",
host_network: bool = True,
):
file_loader = FileSystemLoader(os.path.abspath(os.path.dirname(__file__)))
env = Environment(loader=file_loader, autoescape=True)
pod_template = env.get_template("templates/network-chaos.j2")
tolerations = []
for taint in config.taints:
key_value_part, effect = taint.split(":", 1)
if "=" in key_value_part:
key, value = key_value_part.split("=", 1)
operator = "Equal"
else:
key = key_value_part
value = None
operator = "Exists"
toleration = {
"key": key,
"operator": operator,
"effect": effect,
}
if value is not None:
toleration["value"] = value
tolerations.append(toleration)
pod_body = yaml.safe_load(
pod_template.render(
pod_name=pod_name,
namespace=config.namespace,
host_network=host_network,
target=target_node,
container_name=container_name,
workload_image=config.image,
taints=tolerations,
service_account=config.service_account,
)
)
kubecli.create_pod(pod_body, config.namespace, 300)
def apply_network_rules(
kubecli: KrknKubernetes,
input_rules: list[str],
@@ -153,9 +85,20 @@ def clean_network_rules_namespaced(
)
def get_default_interface(
pod_name: str, namespace: str, kubecli: KrknKubernetes
) -> str:
cmd = "ip r | grep default | awk '/default/ {print $5}'"
output = kubecli.exec_cmd_in_pod([cmd], pod_name, namespace)
return output.replace("\n", "")
def generate_namespaced_rules(
interfaces: list[str], config: NetworkFilterConfig, pids: list[str]
) -> Tuple[list[str], list[str]]:
namespaced_input_rules: list[str] = []
namespaced_output_rules: list[str] = []
input_rules, output_rules = generate_rules(interfaces, config)
for pid in pids:
ns_input_rules = [
f"nsenter --target {pid} --net -- {rule}" for rule in input_rules
]
ns_output_rules = [
f"nsenter --target {pid} --net -- {rule}" for rule in output_rules
]
namespaced_input_rules.extend(ns_input_rules)
namespaced_output_rules.extend(ns_output_rules)
return namespaced_input_rules, namespaced_output_rules

View File

@@ -1,17 +1,31 @@
from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift
from krkn.scenario_plugins.network_chaos_ng.models import NetworkFilterConfig
from krkn.scenario_plugins.network_chaos_ng.models import (
NetworkFilterConfig,
NetworkChaosConfig,
)
from krkn.scenario_plugins.network_chaos_ng.modules.abstract_network_chaos_module import (
AbstractNetworkChaosModule,
)
from krkn.scenario_plugins.network_chaos_ng.modules.node_network_chaos import (
NodeNetworkChaosModule,
)
from krkn.scenario_plugins.network_chaos_ng.modules.node_network_filter import (
NodeNetworkFilterModule,
)
from krkn.scenario_plugins.network_chaos_ng.modules.pod_network_chaos import (
PodNetworkChaosModule,
)
from krkn.scenario_plugins.network_chaos_ng.modules.pod_network_filter import (
PodNetworkFilterModule,
)
supported_modules = ["node_network_filter", "pod_network_filter"]
supported_modules = [
"node_network_filter",
"pod_network_filter",
"pod_network_chaos",
"node_network_chaos",
]
class NetworkChaosFactory:
@@ -26,14 +40,28 @@ class NetworkChaosFactory:
raise Exception(f"{config['id']} is not a supported network chaos module")
if config["id"] == "node_network_filter":
config = NetworkFilterConfig(**config)
errors = config.validate()
scenario_config = NetworkFilterConfig(**config)
errors = scenario_config.validate()
if len(errors) > 0:
raise Exception(f"config validation errors: [{';'.join(errors)}]")
return NodeNetworkFilterModule(config, kubecli)
return NodeNetworkFilterModule(scenario_config, kubecli)
if config["id"] == "pod_network_filter":
config = NetworkFilterConfig(**config)
errors = config.validate()
scenario_config = NetworkFilterConfig(**config)
errors = scenario_config.validate()
if len(errors) > 0:
raise Exception(f"config validation errors: [{';'.join(errors)}]")
return PodNetworkFilterModule(config, kubecli)
return PodNetworkFilterModule(scenario_config, kubecli)
if config["id"] == "pod_network_chaos":
scenario_config = NetworkChaosConfig(**config)
errors = scenario_config.validate()
if len(errors) > 0:
raise Exception(f"config validation errors: [{';'.join(errors)}]")
return PodNetworkChaosModule(scenario_config, kubecli)
if config["id"] == "node_network_chaos":
scenario_config = NetworkChaosConfig(**config)
errors = scenario_config.validate()
if len(errors) > 0:
raise Exception(f"config validation errors: [{';'.join(errors)}]")
return NodeNetworkChaosModule(scenario_config, kubecli)
else:
raise Exception(f"invalid network chaos id {config['id']}")

View File

@@ -22,7 +22,6 @@ class NetworkChaosNgScenarioPlugin(AbstractScenarioPlugin):
self,
run_uuid: str,
scenario: str,
krkn_config: dict[str, any],
lib_telemetry: KrknTelemetryOpenshift,
scenario_telemetry: ScenarioTelemetry,
) -> int:

View File

@@ -11,7 +11,7 @@ def get_node_by_name(node_name_list, kubecli: KrknKubernetes):
for node_name in node_name_list:
if node_name not in killable_nodes:
logging.info(
f"Node with provided ${node_name} does not exist or the node might "
f"Node with provided {node_name} does not exist or the node might "
"be in NotReady state."
)
return

View File

@@ -40,7 +40,6 @@ class NodeActionsScenarioPlugin(AbstractScenarioPlugin):
self,
run_uuid: str,
scenario: str,
krkn_config: dict[str, any],
lib_telemetry: KrknTelemetryOpenshift,
scenario_telemetry: ScenarioTelemetry,
) -> int:
@@ -62,7 +61,7 @@ class NodeActionsScenarioPlugin(AbstractScenarioPlugin):
scenario_telemetry,
)
end_time = int(time.time())
cerberus.get_status(krkn_config, start_time, end_time)
cerberus.get_status(start_time, end_time)
except (RuntimeError, Exception) as e:
logging.error("Node Actions exiting due to Exception %s" % e)
return 1
@@ -196,13 +195,11 @@ class NodeActionsScenarioPlugin(AbstractScenarioPlugin):
exclude_nodes = common_node_functions.get_node(
exclude_label, 0, kubecli
)
for node in nodes:
if node in exclude_nodes:
logging.info(
f"excluding node {node} with exclude label {exclude_nodes}"
)
nodes.remove(node)
if exclude_nodes:
logging.info(
f"excluding nodes {exclude_nodes} with exclude label {exclude_label}"
)
nodes = [node for node in nodes if node not in exclude_nodes]
# GCP api doesn't support multiprocessing calls, will only actually run 1
if parallel_nodes:

View File

@@ -28,7 +28,6 @@ class PodDisruptionScenarioPlugin(AbstractScenarioPlugin):
self,
run_uuid: str,
scenario: str,
krkn_config: dict[str, any],
lib_telemetry: KrknTelemetryOpenshift,
scenario_telemetry: ScenarioTelemetry,
) -> int:

View File

@@ -9,9 +9,8 @@ import yaml
from krkn_lib.k8s import KrknKubernetes
from krkn_lib.models.telemetry import ScenarioTelemetry
from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift
from krkn_lib.utils import get_yaml_item_value, log_exception
from krkn_lib.utils import get_yaml_item_value
from krkn import cerberus, utils
from krkn.scenario_plugins.abstract_scenario_plugin import AbstractScenarioPlugin
from krkn.rollback.config import RollbackContent
from krkn.rollback.handler import set_rollback_context_decorator
@@ -23,7 +22,6 @@ class PvcScenarioPlugin(AbstractScenarioPlugin):
self,
run_uuid: str,
scenario: str,
krkn_config: dict[str, any],
lib_telemetry: KrknTelemetryOpenshift,
scenario_telemetry: ScenarioTelemetry,
) -> int:
@@ -181,7 +179,6 @@ class PvcScenarioPlugin(AbstractScenarioPlugin):
)
)
start_time = int(time.time())
# Create temp file in the PVC
full_path = "%s/%s" % (str(mount_path), str(file_name))
@@ -285,8 +282,6 @@ class PvcScenarioPlugin(AbstractScenarioPlugin):
file_size_kb,
lib_telemetry.get_lib_kubernetes(),
)
end_time = int(time.time())
cerberus.publish_kraken_status(krkn_config, [], start_time, end_time)
except (RuntimeError, Exception) as e:
logging.error("PvcScenarioPlugin exiting due to Exception %s" % e)
return 1

View File

@@ -1,7 +1,7 @@
import importlib
import inspect
import pkgutil
from typing import Type, Tuple, Optional
from typing import Type, Tuple, Optional, Any
from krkn.scenario_plugins.abstract_scenario_plugin import AbstractScenarioPlugin
@@ -11,7 +11,7 @@ class ScenarioPluginNotFound(Exception):
class ScenarioPluginFactory:
loaded_plugins: dict[str, any] = {}
loaded_plugins: dict[str, Any] = {}
failed_plugins: list[Tuple[str, str, str]] = []
package_name = None

View File

@@ -6,9 +6,8 @@ import yaml
from krkn_lib.k8s import KrknKubernetes
from krkn_lib.models.telemetry import ScenarioTelemetry
from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift
from krkn_lib.utils import get_yaml_item_value, log_exception
from krkn_lib.utils import get_yaml_item_value
from krkn import cerberus, utils
from krkn.scenario_plugins.abstract_scenario_plugin import AbstractScenarioPlugin
@@ -17,7 +16,6 @@ class ServiceDisruptionScenarioPlugin(AbstractScenarioPlugin):
self,
run_uuid: str,
scenario: str,
krkn_config: dict[str, any],
lib_telemetry: KrknTelemetryOpenshift,
scenario_telemetry: ScenarioTelemetry,
) -> int:
@@ -59,8 +57,6 @@ class ServiceDisruptionScenarioPlugin(AbstractScenarioPlugin):
+ str(run_sleep)
+ str(wait_time)
)
logging.info("done")
start_time = int(time.time())
for i in range(run_count):
killed_namespaces = {}
namespaces = (
@@ -114,10 +110,6 @@ class ServiceDisruptionScenarioPlugin(AbstractScenarioPlugin):
)
time.sleep(run_sleep)
end_time = int(time.time())
cerberus.publish_kraken_status(
krkn_config, [], start_time, end_time
)
except (Exception, RuntimeError) as e:
logging.error(
"ServiceDisruptionScenarioPlugin exiting due to Exception %s" % e

View File

@@ -16,7 +16,6 @@ class ServiceHijackingScenarioPlugin(AbstractScenarioPlugin):
self,
run_uuid: str,
scenario: str,
krkn_config: dict[str, any],
lib_telemetry: KrknTelemetryOpenshift,
scenario_telemetry: ScenarioTelemetry,
) -> int:

View File

@@ -7,7 +7,6 @@ from krkn_lib.k8s import KrknKubernetes
from krkn_lib.models.telemetry import ScenarioTelemetry
from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift
from krkn import cerberus
from krkn.scenario_plugins.abstract_scenario_plugin import AbstractScenarioPlugin
from krkn.scenario_plugins.node_actions.aws_node_scenarios import AWS
from krkn.scenario_plugins.node_actions.az_node_scenarios import Azure
@@ -24,7 +23,6 @@ class ShutDownScenarioPlugin(AbstractScenarioPlugin):
self,
run_uuid: str,
scenario: str,
krkn_config: dict[str, any],
lib_telemetry: KrknTelemetryOpenshift,
scenario_telemetry: ScenarioTelemetry,
) -> int:
@@ -34,15 +32,12 @@ class ShutDownScenarioPlugin(AbstractScenarioPlugin):
shut_down_config_scenario = shut_down_config_yaml[
"cluster_shut_down_scenario"
]
start_time = int(time.time())
affected_nodes_status = AffectedNodeStatus()
self.cluster_shut_down(
shut_down_config_scenario, lib_telemetry.get_lib_kubernetes(), affected_nodes_status
)
scenario_telemetry.affected_nodes = affected_nodes_status.affected_nodes
end_time = int(time.time())
cerberus.publish_kraken_status(krkn_config, [], start_time, end_time)
return 0
except Exception as e:
logging.error(

View File

@@ -19,7 +19,6 @@ class SynFloodScenarioPlugin(AbstractScenarioPlugin):
self,
run_uuid: str,
scenario: str,
krkn_config: dict[str, any],
lib_telemetry: KrknTelemetryOpenshift,
scenario_telemetry: ScenarioTelemetry,
) -> int:

View File

@@ -11,7 +11,6 @@ from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift
from krkn_lib.utils import get_random_string, get_yaml_item_value, log_exception
from kubernetes.client import ApiException
from krkn import cerberus, utils
from krkn.scenario_plugins.abstract_scenario_plugin import AbstractScenarioPlugin
@@ -20,7 +19,6 @@ class TimeActionsScenarioPlugin(AbstractScenarioPlugin):
self,
run_uuid: str,
scenario: str,
krkn_config: dict[str, any],
lib_telemetry: KrknTelemetryOpenshift,
scenario_telemetry: ScenarioTelemetry,
) -> int:
@@ -28,7 +26,6 @@ class TimeActionsScenarioPlugin(AbstractScenarioPlugin):
with open(scenario, "r") as f:
scenario_config = yaml.full_load(f)
for time_scenario in scenario_config["time_scenarios"]:
start_time = int(time.time())
object_type, object_names = self.skew_time(
time_scenario, lib_telemetry.get_lib_kubernetes()
)
@@ -39,11 +36,7 @@ class TimeActionsScenarioPlugin(AbstractScenarioPlugin):
)
if len(not_reset) > 0:
logging.info("Object times were not reset")
end_time = int(time.time())
cerberus.publish_kraken_status(
krkn_config, not_reset, start_time, end_time
)
except (RuntimeError, Exception) as e:
except (RuntimeError, Exception) as e:
logging.error(
f"TimeActionsScenarioPlugin scenario {scenario} failed with exception: {e}"
)

View File

@@ -11,9 +11,8 @@ from krkn_lib.models.k8s import AffectedNodeStatus
from krkn_lib.models.telemetry import ScenarioTelemetry
from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift
from krkn_lib.utils import get_yaml_item_value
from krkn.scenario_plugins.abstract_scenario_plugin import AbstractScenarioPlugin
from krkn.scenario_plugins.native.network import cerberus
from krkn_lib.utils import get_yaml_item_value
from krkn.scenario_plugins.node_actions.aws_node_scenarios import AWS
from krkn.scenario_plugins.node_actions.gcp_node_scenarios import gcp_node_scenarios
@@ -23,7 +22,6 @@ class ZoneOutageScenarioPlugin(AbstractScenarioPlugin):
self,
run_uuid: str,
scenario: str,
krkn_config: dict[str, any],
lib_telemetry: KrknTelemetryOpenshift,
scenario_telemetry: ScenarioTelemetry,
) -> int:
@@ -52,8 +50,6 @@ class ZoneOutageScenarioPlugin(AbstractScenarioPlugin):
)
return 1
end_time = int(time.time())
cerberus.publish_kraken_status(krkn_config, [], start_time, end_time)
except (RuntimeError, Exception) as e:
logging.error(
f"ZoneOutageScenarioPlugin scenario {scenario} failed with exception: {e}"

View File

@@ -0,0 +1,71 @@
import logging
import threading
from datetime import datetime, timezone
from krkn.utils.ErrorLog import ErrorLog
class ErrorCollectionHandler(logging.Handler):
"""
Custom logging handler that captures ERROR and CRITICAL level logs
in structured format for telemetry collection.
Stores logs in memory as ErrorLog objects for later retrieval.
Thread-safe for concurrent logging operations.
"""
def __init__(self, level=logging.ERROR):
"""
Initialize the error collection handler.
Args:
level: Minimum log level to capture (default: ERROR)
"""
super().__init__(level)
self.error_logs: list[ErrorLog] = []
self._lock = threading.Lock()
def emit(self, record: logging.LogRecord):
"""
Capture ERROR and CRITICAL logs and store as ErrorLog objects.
Args:
record: LogRecord from Python logging framework
"""
try:
# Only capture ERROR (40) and CRITICAL (50) levels
if record.levelno < logging.ERROR:
return
# Format timestamp as ISO 8601 UTC
timestamp = datetime.fromtimestamp(
record.created, tz=timezone.utc
).strftime("%Y-%m-%dT%H:%M:%S.%f")[:-3] + "Z"
# Create ErrorLog object
error_log = ErrorLog(
timestamp=timestamp,
message=record.getMessage()
)
# Thread-safe append
with self._lock:
self.error_logs.append(error_log)
except Exception:
# Handler should never raise exceptions (logging best practice)
self.handleError(record)
def get_error_logs(self) -> list[dict]:
"""
Retrieve all collected error logs as list of dictionaries.
Returns:
List of error log dictionaries with timestamp and message
"""
with self._lock:
return [log.to_dict() for log in self.error_logs]
def clear(self):
"""Clear all collected error logs (useful for testing)"""
with self._lock:
self.error_logs.clear()
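A short usage sketch of the handler above outside of run_kraken.py; the logged message and the printed timestamp are illustrative only.

import logging
from krkn.utils import ErrorCollectionHandler

handler = ErrorCollectionHandler()        # captures ERROR and CRITICAL only
logging.getLogger().addHandler(handler)

logging.warning("this is not captured")   # below the handler's ERROR threshold
logging.error("node worker-0 failed to reboot")

print(handler.get_error_logs())
# [{'timestamp': '2026-03-11T18:00:00.000Z', 'message': 'node worker-0 failed to reboot'}]
# (timestamp shown is illustrative)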

krkn/utils/ErrorLog.py Normal file
View File

@@ -0,0 +1,18 @@
from dataclasses import dataclass, asdict
@dataclass
class ErrorLog:
"""
Represents a single error log entry for telemetry collection.
Attributes:
timestamp: ISO 8601 formatted timestamp (UTC)
message: Full error message text
"""
timestamp: str
message: str
def to_dict(self) -> dict:
"""Convert to dictionary for JSON serialization"""
return asdict(self)

View File

@@ -77,7 +77,7 @@ class HealthChecker:
success_response = {
"url": url,
"status": True,
"status_code": response["status_code"],
"status_code": health_check_tracker[url]["status_code"],
"start_timestamp": health_check_tracker[url]["start_timestamp"].isoformat(),
"end_timestamp": health_check_end_time_stamp.isoformat(),
"duration": duration

View File

@@ -40,16 +40,20 @@ class VirtChecker:
self.kube_vm_plugin = KubevirtVmOutageScenarioPlugin()
self.kube_vm_plugin.init_clients(k8s_client=krkn_lib)
self.kube_vm_plugin.get_vmis(vmi_name_match,self.namespace)
self.vmis_list = self.kube_vm_plugin.k8s_client.get_vmis(vmi_name_match,self.namespace)
except Exception as e:
logging.error('Virt Check init exception: ' + str(e))
return
# See if multiple node names exist
node_name_list = [node_name for node_name in self.node_names.split(',') if node_name]
for vmi in self.kube_vm_plugin.vmis_list:
for vmi in self.vmis_list:
node_name = vmi.get("status",{}).get("nodeName")
vmi_name = vmi.get("metadata",{}).get("name")
ip_address = vmi.get("status",{}).get("interfaces",[])[0].get("ipAddress")
interfaces = vmi.get("status",{}).get("interfaces",[])
if not interfaces:
logging.warning(f"VMI {vmi_name} has no network interfaces, skipping")
continue
ip_address = interfaces[0].get("ipAddress")
namespace = vmi.get("metadata",{}).get("namespace")
# If node_name_list exists, only add if node name is in list
@@ -74,7 +78,8 @@ class VirtChecker:
else:
logging.debug(f"Disconnected access for {ip_address} on {worker_name} is failed: {output}")
vmi = self.kube_vm_plugin.get_vmi(vmi_name,self.namespace)
new_ip_address = vmi.get("status",{}).get("interfaces",[])[0].get("ipAddress")
interfaces = vmi.get("status",{}).get("interfaces",[])
new_ip_address = interfaces[0].get("ipAddress") if interfaces else None
new_node_name = vmi.get("status",{}).get("nodeName")
# if vm gets deleted, it'll start up with a new ip address
if new_ip_address != ip_address:
@@ -102,7 +107,7 @@ class VirtChecker:
def get_vm_access(self, vm_name: str = '', namespace: str = ''):
"""
This method returns True when the VM is access and an error message when it is not, using virtctl protocol
This method returns True when the VM is accessible and an error message when it is not, using virtctl protocol
:param vm_name:
:param namespace:
:return: virtctl_status 'True' if successful, or an error message if it fails.

View File

@@ -1,2 +1,4 @@
from .TeeLogHandler import TeeLogHandler
from .ErrorLog import ErrorLog
from .ErrorCollectionHandler import ErrorCollectionHandler
from .functions import *

View File

@@ -1,23 +1,23 @@
aliyun-python-sdk-core==2.13.36
aliyun-python-sdk-ecs==4.24.25
arcaflow-plugin-sdk==0.14.0
boto3==1.28.61
boto3>=1.34.0 # Updated to support urllib3 2.x
azure-identity==1.16.1
azure-keyvault==4.2.0
azure-mgmt-compute==30.5.0
azure-mgmt-network==27.0.0
coverage==7.6.12
datetime==5.4
docker>=6.0,<7.0 # docker 7.0+ has breaking changes with Unix sockets
docker>=6.0,<7.0 # docker 7.0+ has breaking changes; works with requests<2.32
gitpython==3.1.41
google-auth==2.37.0
google-cloud-compute==1.22.0
ibm_cloud_sdk_core==3.18.0
ibm_vpc==0.20.0
ibm_cloud_sdk_core>=3.20.0 # Requires urllib3>=2.1.0 (compatible with updated boto3)
ibm_vpc==0.26.3 # Requires ibm_cloud_sdk_core
jinja2==3.1.6
krkn-lib==6.0.1
lxml==5.1.0
kubernetes==34.1.0
krkn-lib==6.0.5
numpy==1.26.4
pandas==2.2.0
openshift-client==1.0.21
@@ -29,11 +29,13 @@ python-ipmi==0.5.4
python-openstackclient==6.5.0
requests<2.32 # requests 2.32+ breaks Unix socket support (http+docker scheme)
requests-unixsocket>=0.4.0 # Required for Docker Unix socket support
urllib3>=2.1.0,<2.4.0 # Compatible with all dependencies
service_identity==24.1.0
PyYAML==6.0.1
setuptools==78.1.1
wheel>=0.44.0
zope.interface==6.1
colorlog==6.10.1
git+https://github.com/vmware/vsphere-automation-sdk-python.git@v8.0.0.0
cryptography>=42.0.4 # not directly required, pinned by Snyk to avoid a vulnerability

View File

@@ -6,28 +6,34 @@ import sys
import yaml
import logging
import optparse
from colorlog import ColoredFormatter
import pyfiglet
import uuid
import time
import queue
import threading
from typing import Optional
from typing import Optional, Dict
from krkn import cerberus
from krkn_lib.elastic.krkn_elastic import KrknElastic
from krkn_lib.models.elastic import ElasticChaosRunTelemetry
from krkn_lib.models.krkn import ChaosRunOutput, ChaosRunAlertSummary
from krkn_lib.prometheus.krkn_prometheus import KrknPrometheus
import krkn.prometheus as prometheus_plugin
import server as server
from krkn.resiliency.resiliency import (
Resiliency
)
from krkn_lib.k8s import KrknKubernetes
from krkn_lib.ocp import KrknOpenshift
from krkn_lib.telemetry.k8s import KrknTelemetryKubernetes
from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift
from krkn_lib.models.telemetry import ChaosRunTelemetry
from krkn_lib.models.k8s import ResiliencyReport
from krkn_lib.utils import SafeLogger
from krkn_lib.utils.functions import get_yaml_item_value, get_junit_test_case
from krkn.utils import TeeLogHandler
from krkn.utils import TeeLogHandler, ErrorCollectionHandler
from krkn.utils.HealthChecker import HealthChecker
from krkn.utils.VirtChecker import VirtChecker
from krkn.scenario_plugins.scenario_plugin_factory import (
@@ -52,6 +58,8 @@ def main(options, command: Optional[str]) -> int:
print(pyfiglet.figlet_format("kraken"))
logging.info("Starting kraken")
cfg = options.cfg
# Parse and read the config
if os.path.isfile(cfg):
@@ -63,6 +71,7 @@ def main(options, command: Optional[str]) -> int:
get_yaml_item_value(config["kraken"], "kubeconfig_path", "")
)
kraken_config = cfg
chaos_scenarios = get_yaml_item_value(config["kraken"], "chaos_scenarios", [])
publish_running_status = get_yaml_item_value(
config["kraken"], "publish_kraken_status", False
@@ -84,14 +93,20 @@ def main(options, command: Optional[str]) -> int:
config["kraken"], "signal_address", "0.0.0.0"
)
run_signal = get_yaml_item_value(config["kraken"], "signal_state", "RUN")
resiliency_config = get_yaml_item_value(config,"resiliency",{})
# Determine execution mode (standalone, controller, or disabled)
run_mode = get_yaml_item_value(resiliency_config, "resiliency_run_mode", "standalone")
valid_run_modes = {"standalone", "detailed", "disabled"}
if run_mode not in valid_run_modes:
logging.warning("Unknown resiliency_run_mode '%s'. Defaulting to 'standalone'", run_mode)
run_mode = "standalone"
wait_duration = get_yaml_item_value(config["tunings"], "wait_duration", 60)
iterations = get_yaml_item_value(config["tunings"], "iterations", 1)
daemon_mode = get_yaml_item_value(config["tunings"], "daemon_mode", False)
prometheus_url = config["performance_monitoring"].get("prometheus_url")
prometheus_bearer_token = config["performance_monitoring"].get(
"prometheus_bearer_token"
)
prometheus_bearer_token = config["performance_monitoring"].get("prometheus_bearer_token")
run_uuid = config["performance_monitoring"].get("uuid")
enable_alerts = get_yaml_item_value(
config["performance_monitoring"], "enable_alerts", False
@@ -99,9 +114,13 @@ def main(options, command: Optional[str]) -> int:
enable_metrics = get_yaml_item_value(
config["performance_monitoring"], "enable_metrics", False
)
# Default placeholder; will be overridden if a Prometheus URL is available
prometheus = None
# elastic search
enable_elastic = get_yaml_item_value(config["elastic"], "enable_elastic", False)
elastic_run_tag = get_yaml_item_value(config["elastic"], "run_tag", "")
elastic_url = get_yaml_item_value(config["elastic"], "elastic_url", "")
elastic_verify_certs = get_yaml_item_value(
@@ -144,6 +163,9 @@ def main(options, command: Optional[str]) -> int:
return -1
logging.info("Initializing client to talk to the Kubernetes cluster")
# Set Cerberus url if enabled
cerberus.set_url(config)
# Generate uuid for the run
if run_uuid:
logging.info(
@@ -226,6 +248,11 @@ def main(options, command: Optional[str]) -> int:
else:
logging.info("Cluster version CRD not detected, skipping")
# Final check: ensure Prometheus URL is available; disable resiliency if not
if (not prometheus_url or prometheus_url.strip() == "") and run_mode != "disabled":
logging.warning("Prometheus URL not provided; disabling resiliency score features.")
run_mode = "disabled"
# KrknTelemetry init
telemetry_k8s = KrknTelemetryKubernetes(
safe_logger, kubecli, config["telemetry"]
@@ -246,9 +273,18 @@ def main(options, command: Optional[str]) -> int:
else:
elastic_search = None
summary = ChaosRunAlertSummary()
if enable_metrics or enable_alerts or check_critical_alerts:
if enable_metrics or enable_alerts or check_critical_alerts or run_mode != "disabled":
prometheus = KrknPrometheus(prometheus_url, prometheus_bearer_token)
# Quick connectivity probe for Prometheus; disable resiliency if unreachable
try:
prometheus.process_prom_query_in_range(
"up", datetime.datetime.utcnow() - datetime.timedelta(seconds=60), datetime.datetime.utcnow(), granularity=60
)
except Exception as prom_exc:
logging.error("Prometheus connectivity test failed: %s. Disabling resiliency features as Prometheus is required for SLO evaluation.", prom_exc)
run_mode = "disabled"
resiliency_alerts = get_yaml_item_value(resiliency_config, "resiliency_file", get_yaml_item_value(config['performance_monitoring'],"alert_profile", "config/alerts.yaml"))
resiliency_obj = Resiliency(resiliency_alerts) if run_mode != "disabled" else None # Initialize resiliency orchestrator
logging.info("Server URL: %s" % kubecli.get_host())
if command == "list-rollback":
@@ -288,6 +324,7 @@ def main(options, command: Optional[str]) -> int:
chaos_output = ChaosRunOutput()
chaos_telemetry = ChaosRunTelemetry()
chaos_telemetry.run_uuid = run_uuid
chaos_telemetry.tag = elastic_run_tag
scenario_plugin_factory = ScenarioPluginFactory()
classes_and_types: dict[str, list[str]] = {}
for loaded in scenario_plugin_factory.loaded_plugins.keys():
@@ -363,12 +400,24 @@ def main(options, command: Optional[str]) -> int:
)
sys.exit(-1)
failed_post_scenarios, scenario_telemetries = (
batch_window_start_dt = datetime.datetime.utcnow()
failed_scenarios_current, scenario_telemetries = (
scenario_plugin.run_scenarios(
run_uuid, scenarios_list, config, telemetry_ocp
)
)
failed_post_scenarios.extend(failed_scenarios_current)
chaos_telemetry.scenarios.extend(scenario_telemetries)
batch_window_end_dt = datetime.datetime.utcnow()
if resiliency_obj:
resiliency_obj.add_scenario_reports(
scenario_telemetries=scenario_telemetries,
prom_cli=prometheus,
scenario_type=scenario_type,
batch_start_dt=batch_window_start_dt,
batch_end_dt=batch_window_end_dt,
)
post_critical_alerts = 0
if check_critical_alerts:
@@ -425,16 +474,51 @@ def main(options, command: Optional[str]) -> int:
logging.info("collecting Kubernetes cluster metadata....")
telemetry_k8s.collect_cluster_metadata(chaos_telemetry)
# Collect error logs from handler
error_logs = error_collection_handler.get_error_logs()
if error_logs:
logging.info(f"Collected {len(error_logs)} error logs for telemetry")
chaos_telemetry.error_logs = error_logs
else:
logging.info("No error logs collected during chaos run")
chaos_telemetry.error_logs = []
if resiliency_obj:
try:
resiliency_obj.attach_compact_to_telemetry(chaos_telemetry)
except Exception as exc:
logging.error("Failed to embed per-scenario resiliency in telemetry: %s", exc)
if resiliency_obj:
try:
resiliency_obj.finalize_and_save(
prom_cli=prometheus,
total_start_time=datetime.datetime.fromtimestamp(start_time),
total_end_time=datetime.datetime.fromtimestamp(end_time),
run_mode=run_mode,
)
except Exception as e:
logging.error("Failed to finalize resiliency scoring: %s", e)
telemetry_json = chaos_telemetry.to_json()
decoded_chaos_run_telemetry = ChaosRunTelemetry(json.loads(telemetry_json))
if resiliency_obj and hasattr(resiliency_obj, "summary") and resiliency_obj.summary is not None:
summary_dict = resiliency_obj.get_summary()
decoded_chaos_run_telemetry.overall_resiliency_report = ResiliencyReport(
json_object=summary_dict,
resiliency_score=summary_dict.get("resiliency_score", 0),
passed_slos=summary_dict.get("passed_slos", 0),
total_slos=summary_dict.get("total_slos", 0)
)
chaos_output.telemetry = decoded_chaos_run_telemetry
logging.info(f"Chaos data:\n{chaos_output.to_json()}")
if enable_elastic:
elastic_telemetry = ElasticChaosRunTelemetry(
chaos_run_telemetry=decoded_chaos_run_telemetry
)
result = elastic_search.push_telemetry(
elastic_telemetry, elastic_telemetry_index
decoded_chaos_run_telemetry, elastic_telemetry_index
)
if result == -1:
safe_logger.error(
@@ -646,15 +730,30 @@ if __name__ == "__main__":
# If no command or regular execution, continue with existing logic
report_file = options.output
tee_handler = TeeLogHandler()
fmt = "%(asctime)s [%(levelname)s] %(message)s"
plain = logging.Formatter(fmt)
colored = ColoredFormatter(
"%(asctime)s [%(log_color)s%(levelname)s%(reset)s] %(message)s",
log_colors={'DEBUG': 'white', 'INFO': 'white', 'WARNING': 'yellow', 'ERROR': 'red', 'CRITICAL': 'bold_red'},
reset=True, style='%'
)
file_handler = logging.FileHandler(report_file, mode="w")
file_handler.setFormatter(plain)
stream_handler = logging.StreamHandler()
stream_handler.setFormatter(colored)
tee_handler.setFormatter(plain)
error_collection_handler = ErrorCollectionHandler(level=logging.ERROR)
handlers = [
logging.FileHandler(report_file, mode="w"),
logging.StreamHandler(),
file_handler,
stream_handler,
tee_handler,
error_collection_handler,
]
logging.basicConfig(
level=logging.DEBUG if options.debug else logging.INFO,
format="%(asctime)s [%(levelname)s] %(message)s",
handlers=handlers,
)
option_error = False

View File

@@ -0,0 +1,6 @@
- id: kill-pods
config:
namespace_pattern: "local-path-storage"
label_selector: "app=local-path-provisioner"
krkn_pod_recovery_time: 20
kill: 1
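For orientation, a pod-disruption scenario file like the one above is normally wired into the main krkn config under kraken.chaos_scenarios. The sketch below is illustrative only: the pod_disruption_scenarios key reflects the usual krkn scenario type, and the file path is a hypothetical placeholder rather than a value taken from this diff.

kraken:
  chaos_scenarios:
    - pod_disruption_scenarios:
        - scenarios/kube/kill-pods.yml   # hypothetical path to the scenario file above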

View File

@@ -0,0 +1,18 @@
- id: node_network_chaos
image: "quay.io/krkn-chaos/krkn-network-chaos:latest"
wait_duration: 1
test_duration: 60
label_selector: ""
service_account: ""
taints: []
namespace: 'default'
instance_count: 1
target: "<node_name>"
execution: parallel
interfaces: []
ingress: true
egress: true
latency: 0s # supported units are us (microseconds), ms, s
loss: 10 # percentage
bandwidth: 1gbit # supported units are bit, kbit, mbit, gbit, tbit
force: false
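As a usage illustration only, a filled-in copy of this node network chaos template might look like the following; the node name, interface, and traffic-shaping values are hypothetical, and fields not shown are assumed to keep the template defaults above.

- id: node_network_chaos
  image: "quay.io/krkn-chaos/krkn-network-chaos:latest"
  wait_duration: 1
  test_duration: 60
  namespace: 'default'
  instance_count: 1
  target: "worker-0"        # hypothetical node name
  execution: parallel
  interfaces: ["eth0"]      # hypothetical interface to shape
  ingress: true
  egress: true
  latency: 100ms            # using the supported units listed above
  loss: 5                   # percentage
  bandwidth: 100mbit
  force: false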

View File

@@ -4,7 +4,7 @@
test_duration: 10
label_selector: "<node_selector>"
service_account: ""
taints: [] # example ["node-role.kubernetes.io/master:NoSchedule"]
taints: []
namespace: 'default'
instance_count: 1
execution: parallel

View File

@@ -0,0 +1,17 @@
- id: pod_network_chaos
image: "quay.io/krkn-chaos/krkn-network-chaos:latest"
wait_duration: 1
test_duration: 60
label_selector: ""
service_account: ""
taints: []
namespace: 'default'
instance_count: 1
target: "<pod_name>"
execution: parallel
interfaces: []
ingress: true
egress: true
latency: 0s # supported units are us (microseconds), ms, s
loss: 10 # percentage
bandwidth: 1gbit # supported units are bit, kbit, mbit, gbit, tbit

Some files were not shown because too many files have changed in this diff.