Integration of the new pod recovery monitoring strategy implemented in krkn-lib (#609)

* pod monitoring integration in plugin scenario

Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>

* pod monitoring integration in container scenario

Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>

* removed wait-for-pod step from plugin scenario config files

Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>

* introduced global pod recovery time

Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>

nit

Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>

* introduced krkn_pod_recovery_time in plugin scenario and removed all the references to wait-for-pods

Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>

fix

Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>

* functional test fix

Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>

* main branch functional test fix

Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>

* increased recovery times

Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>

---------

Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>
Author: Tullio Sebastiani
Date: 2024-04-23 10:49:01 +02:00
Committed by: GitHub
parent 19ad2d1a3d
commit ab98e416a6
22 changed files with 124 additions and 85 deletions
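In practice, the change replaces the separate wait-for-pods step in plugin scenario files with a single krkn_pod_recovery_time field on the kill step, delegating recovery monitoring to krkn-lib. A before/after sketch of that migration (the namespace and label values are illustrative, taken from the scheduler example in the documentation changes below):

```yaml
# Before: recovery tracked by an explicit wait-for-pods step
- id: kill-pods
  config:
    namespace_pattern: ^kube-system$
    label_selector: k8s-app=kube-scheduler
- id: wait-for-pods
  config:
    namespace_pattern: ^kube-system$
    label_selector: k8s-app=kube-scheduler
    count: 3

# After: recovery monitored by krkn-lib via a single field on the kill step
- id: kill-pods
  config:
    namespace_pattern: ^kube-system$
    label_selector: k8s-app=kube-scheduler
    krkn_pod_recovery_time: 120
```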

@@ -1,7 +1,7 @@
ERRORED=false
function finish {
if [ $? -eq 1 ] && [ $ERRORED != "true" ]
if [ $? != 0 ] && [ $ERRORED != "true" ]
then
error
fi
@@ -13,8 +13,10 @@ function error {
then
echo "Error caught."
ERRORED=true
else
echo "Exit code greater than zero detected: $exit_code"
elif [ $exit_code == 2 ]
then
echo "Run with exit code 2 detected, it is expected, wrapping the exit code with 0 to avoid pipeline failure"
exit 0
fi
}

@@ -8,11 +8,11 @@ trap finish EXIT
pod_file="CI/scenarios/hello_pod.yaml"
function functional_test_container_crash {
yq -i '.scenarios[0].namespace="default"' scenarios/openshift/app_outage.yaml
yq -i '.scenarios[0].label_selector="scenario=container"' scenarios/openshift/app_outage.yaml
yq -i '.scenarios[0].container_name="fedtools"' scenarios/openshift/app_outage.yaml
yq -i '.scenarios[0].namespace="default"' scenarios/openshift/container_etcd.yml
yq -i '.scenarios[0].label_selector="scenario=container"' scenarios/openshift/container_etcd.yml
yq -i '.scenarios[0].container_name="fedtools"' scenarios/openshift/container_etcd.yml
export scenario_type="container_scenarios"
export scenario_file="- scenarios/openshift/app_outage.yaml"
export scenario_file="- scenarios/openshift/container_etcd.yml"
export post_config=""
envsubst < CI/config/common_test_config.yaml > CI/config/container_config.yaml

@@ -22,14 +22,14 @@ function functional_test_telemetry {
export scenario_file="scenarios/arcaflow/cpu-hog/input.yaml"
export post_config=""
envsubst < CI/config/common_test_config.yaml > CI/config/telemetry.yaml
python3 -m coverage run -a run_kraken.py -c CI/config/telemetry.yaml
retval=$(python3 -m coverage run -a run_kraken.py -c CI/config/telemetry.yaml)
RUN_FOLDER=`cat CI/out/test_telemetry.out | grep amazonaws.com | sed -rn "s#.*https:\/\/.*\/files/(.*)#\1#p"`
$AWS_CLI s3 ls "s3://$AWS_BUCKET/$RUN_FOLDER/" | awk '{ print $4 }' > s3_remote_files
echo "checking if telemetry files are uploaded on s3"
cat s3_remote_files | grep events-00.json || ( echo "FAILED: events-00.json not uploaded" && exit 1 )
cat s3_remote_files | grep critical-alerts-00.json || ( echo "FAILED: critical-alerts-00.json not uploaded" && exit 1 )
cat s3_remote_files | grep prometheus-00.tar || ( echo "FAILED: prometheus backup not uploaded" && exit 1 )
cat s3_remote_files | grep telemetry.json || ( echo "FAILED: telemetry.json not uploaded" && exit 1 )
cat s3_remote_files | grep critical-alerts-00.log || ( echo "FAILED: critical-alerts-00.log not uploaded" && exit 1 )
cat s3_remote_files | grep prometheus-00.tar || ( echo "FAILED: prometheus backup not uploaded" && exit 1 )
cat s3_remote_files | grep telemetry.json || ( echo "FAILED: telemetry.json not uploaded" && exit 1 )
echo "all files uploaded!"
echo "Telemetry Collection: Success"
}

@@ -14,11 +14,7 @@ For example, for adding a pod level scenario for a new application, refer to the
namespace_pattern: ^<namespace>$
label_selector: <pod label>
kill: <number of pods to kill>
- id: wait-for-pods
config:
namespace_pattern: ^<namespace>$
label_selector: <pod label>
count: <expected number of pods that match namespace and label>
krkn_pod_recovery_time: <expected time for the pod to become ready>
```
#### Node Scenario Yaml Template

@@ -17,11 +17,8 @@ You can then create the scenario file with the following contents:
config:
namespace_pattern: ^kube-system$
label_selector: k8s-app=kube-scheduler
- id: wait-for-pods
config:
namespace_pattern: ^kube-system$
label_selector: k8s-app=kube-scheduler
count: 3
krkn_pod_recovery_time: 120
```
Please adjust the schema reference to point to the [schema file](../scenarios/plugin.schema.json). This file will give you code completion and documentation for the available options in your IDE.

@@ -2,11 +2,14 @@ import dataclasses
import json
import logging
from os.path import abspath
from typing import List, Dict
from typing import List, Dict, Any
import time
from arcaflow_plugin_sdk import schema, serialization, jsonschema
from arcaflow_plugin_kill_pod import kill_pods, wait_for_pods
from krkn_lib.k8s import KrknKubernetes
from krkn_lib.k8s.pods_monitor_pool import PodsMonitorPool
import kraken.plugins.node_scenarios.vmware_plugin as vmware_plugin
import kraken.plugins.node_scenarios.ibmcloud_plugin as ibmcloud_plugin
from kraken.plugins.run_python_plugin import run_python_file
@@ -47,11 +50,14 @@ class Plugins:
)
self.steps_by_id[step.schema.id] = step
def unserialize_scenario(self, file: str) -> Any:
return serialization.load_from_file(abspath(file))
def run(self, file: str, kubeconfig_path: str, kraken_config: str):
"""
Run executes a series of steps
"""
data = serialization.load_from_file(abspath(file))
data = self.unserialize_scenario(abspath(file))
if not isinstance(data, list):
raise Exception(
"Invalid scenario configuration file: {} expected list, found {}".format(file, type(data).__name__)
@@ -241,7 +247,15 @@ PLUGINS = Plugins(
)
def run(scenarios: List[str], kubeconfig_path: str, kraken_config: str, failed_post_scenarios: List[str], wait_duration: int, telemetry: KrknTelemetryKubernetes) -> (List[str], list[ScenarioTelemetry]):
def run(scenarios: List[str],
kubeconfig_path: str,
kraken_config: str,
failed_post_scenarios: List[str],
wait_duration: int,
telemetry: KrknTelemetryKubernetes,
kubecli: KrknKubernetes
) -> (List[str], list[ScenarioTelemetry]):
scenario_telemetries: list[ScenarioTelemetry] = []
for scenario in scenarios:
scenario_telemetry = ScenarioTelemetry()
@@ -249,10 +263,21 @@ def run(scenarios: List[str], kubeconfig_path: str, kraken_config: str, failed_p
scenario_telemetry.startTimeStamp = time.time()
telemetry.set_parameters_base64(scenario_telemetry, scenario)
logging.info('scenario ' + str(scenario))
pool = PodsMonitorPool(kubecli)
kill_scenarios = [kill_scenario for kill_scenario in PLUGINS.unserialize_scenario(scenario) if kill_scenario["id"] == "kill-pods"]
try:
start_monitoring(pool, kill_scenarios)
PLUGINS.run(scenario, kubeconfig_path, kraken_config)
result = pool.join()
scenario_telemetry.affected_pods = result
if result.error:
raise Exception(f"unrecovered pods: {result.error}")
except Exception as e:
logging.error(f"scenario exception: {str(e)}")
scenario_telemetry.exitStatus = 1
pool.cancel()
failed_post_scenarios.append(scenario)
log_exception(scenario)
else:
@@ -263,3 +288,31 @@ def run(scenarios: List[str], kubeconfig_path: str, kraken_config: str, failed_p
scenario_telemetry.endTimeStamp = time.time()
return failed_post_scenarios, scenario_telemetries
def start_monitoring(pool: PodsMonitorPool, scenarios: list[Any]):
for kill_scenario in scenarios:
recovery_time = kill_scenario["config"]["krkn_pod_recovery_time"]
if ("namespace_pattern" in kill_scenario["config"] and
"label_selector" in kill_scenario["config"]):
namespace_pattern = kill_scenario["config"]["namespace_pattern"]
label_selector = kill_scenario["config"]["label_selector"]
pool.select_and_monitor_by_namespace_pattern_and_label(
namespace_pattern=namespace_pattern,
label_selector=label_selector,
max_timeout=recovery_time)
logging.info(
f"waiting {recovery_time} seconds for pod recovery, "
f"pod label selector: {label_selector} namespace pattern: {namespace_pattern}")
elif ("namespace_pattern" in kill_scenario["config"] and
"name_pattern" in kill_scenario["config"]):
namespace_pattern = kill_scenario["config"]["namespace_pattern"]
name_pattern = kill_scenario["config"]["name_pattern"]
pool.select_and_monitor_by_name_pattern_and_namespace_pattern(pod_name_pattern=name_pattern,
namespace_pattern=namespace_pattern,
max_timeout=recovery_time)
logging.info(f"waiting {recovery_time} seconds for pod recovery, "
f"pod name pattern: {name_pattern} namespace pattern: {namespace_pattern}")
else:
raise Exception(f"impossible to determine monitor parameters, check {kill_scenario} configuration")

@@ -1,9 +1,13 @@
import logging
import time
from typing import Any
import yaml
import sys
import random
import arcaflow_plugin_kill_pod
from krkn_lib.k8s.pods_monitor_pool import PodsMonitorPool
import kraken.cerberus.setup as cerberus
import kraken.post_actions.actions as post_actions
from krkn_lib.k8s import KrknKubernetes
@@ -79,6 +83,7 @@ def container_run(kubeconfig_path,
failed_scenarios = []
scenario_telemetries: list[ScenarioTelemetry] = []
pool = PodsMonitorPool(kubecli)
for container_scenario_config in scenarios_list:
scenario_telemetry = ScenarioTelemetry()
@@ -91,23 +96,17 @@ def container_run(kubeconfig_path,
pre_action_output = ""
with open(container_scenario_config[0], "r") as f:
cont_scenario_config = yaml.full_load(f)
start_monitoring(kill_scenarios=cont_scenario_config["scenarios"], pool=pool)
for cont_scenario in cont_scenario_config["scenarios"]:
# capture start time
start_time = int(time.time())
try:
killed_containers = container_killing_in_pod(cont_scenario, kubecli)
if len(container_scenario_config) > 1:
failed_post_scenarios = post_actions.check_recovery(
kubeconfig_path,
container_scenario_config,
failed_post_scenarios,
pre_action_output
)
else:
failed_post_scenarios = check_failed_containers(
killed_containers, cont_scenario.get("retry_wait", 120), kubecli
)
logging.info(f"killed containers: {str(killed_containers)}")
result = pool.join()
if result.error:
raise Exception(f"pods failed to recovery: {result.error}")
scenario_telemetry.affected_pods = result
logging.info("Waiting for the specified duration: %s" % (wait_duration))
time.sleep(wait_duration)
@@ -117,6 +116,7 @@ def container_run(kubeconfig_path,
# publish cerberus status
cerberus.publish_kraken_status(config, failed_post_scenarios, start_time, end_time)
except (RuntimeError, Exception):
pool.cancel()
failed_scenarios.append(container_scenario_config[0])
log_exception(container_scenario_config[0])
scenario_telemetry.exitStatus = 1
@@ -129,6 +129,16 @@ def container_run(kubeconfig_path,
return failed_scenarios, scenario_telemetries
def start_monitoring(kill_scenarios: list[Any], pool: PodsMonitorPool):
for kill_scenario in kill_scenarios:
namespace_pattern = f"^{kill_scenario['namespace']}$"
label_selector = kill_scenario["label_selector"]
recovery_time = kill_scenario["expected_recovery_time"]
pool.select_and_monitor_by_namespace_pattern_and_label(
namespace_pattern=namespace_pattern,
label_selector=label_selector,
max_timeout=recovery_time)
def container_killing_in_pod(cont_scenario, kubecli: KrknKubernetes):
scenario_name = get_yaml_item_value(cont_scenario, "name", "")

@@ -15,7 +15,7 @@ google-api-python-client==2.116.0
ibm_cloud_sdk_core==3.18.0
ibm_vpc==0.20.0
jinja2==3.1.3
krkn-lib==2.1.1
krkn-lib==2.1.2
lxml==5.1.0
kubernetes==26.1.0
oauth2client==4.1.3

@@ -264,7 +264,8 @@ def main(cfg):
kraken_config,
failed_post_scenarios,
wait_duration,
telemetry_k8s
telemetry_k8s,
kubecli
)
chaos_telemetry.scenarios.extend(scenario_telemetries)
# krkn_lib

@@ -3,8 +3,4 @@
config:
namespace_pattern: ^kube-system$
label_selector: component=kube-scheduler
- id: wait-for-pods
config:
namespace_pattern: ^kube-system$
label_selector: component=kube-scheduler
count: 3
krkn_pod_recovery_time: 120

@@ -4,3 +4,4 @@
name_pattern: ^nginx-.*$
namespace_pattern: ^default$
kill: 1
krkn_pod_recovery_time: 120

@@ -3,8 +3,4 @@
config:
namespace_pattern: ^kube-system$
label_selector: k8s-app=kube-scheduler
- id: wait-for-pods
config:
namespace_pattern: ^kube-system$
label_selector: k8s-app=kube-scheduler
count: 3
krkn_pod_recovery_time: 120

@@ -5,4 +5,4 @@ scenarios:
container_name: "etcd"
action: 1
count: 1
expected_recovery_time: 60
expected_recovery_time: 120
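For reference, a hedged sketch of a complete container scenario entry using the new field; the keys match what container_run/start_monitoring read (name, namespace, label_selector, expected_recovery_time) and what the CI script sets above, while the concrete values are illustrative:

```yaml
scenarios:
  - name: "kill etcd container"        # illustrative scenario name
    namespace: "default"               # monitored by start_monitoring as ^default$
    label_selector: "scenario=container"
    container_name: "fedtools"
    action: 1                          # kill action, as in the container_etcd scenario
    count: 1
    expected_recovery_time: 120        # max seconds to wait for the pod to recover
```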

@@ -3,8 +3,4 @@
config:
namespace_pattern: ^acme-air$
name_pattern: .*
- id: wait-for-pods
config:
namespace_pattern: ^acme-air$
name_pattern: .*
count: 8
krkn_pod_recovery_time: 120

@@ -3,8 +3,4 @@
config:
namespace_pattern: ^openshift-etcd$
label_selector: k8s-app=etcd
- id: wait-for-pods
config:
namespace_pattern: ^openshift-etcd$
label_selector: k8s-app=etcd
count: 3
krkn_pod_recovery_time: 120

@@ -3,8 +3,5 @@
config:
namespace_pattern: ^openshift-apiserver$
label_selector: app=openshift-apiserver-a
- id: wait-for-pods
config:
namespace_pattern: ^openshift-apiserver$
label_selector: app=openshift-apiserver-a
count: 3
krkn_pod_recovery_time: 120

@@ -3,8 +3,5 @@
config:
namespace_pattern: ^openshift-kube-apiserver$
label_selector: app=openshift-kube-apiserver
- id: wait-for-pods
config:
namespace_pattern: ^openshift-kube-apiserver$
label_selector: app=openshift-kube-apiserver
count: 3
krkn_pod_recovery_time: 120

@@ -3,8 +3,4 @@
config:
namespace_pattern: ^openshift-monitoring$
label_selector: app=prometheus
- id: wait-for-pods
config:
namespace_pattern: ^openshift-monitoring$
label_selector: app=prometheus
count: 2
krkn_pod_recovery_time: 120

@@ -2,8 +2,4 @@
config:
namespace_pattern: ^openshift-monitoring$
label_selector: statefulset.kubernetes.io/pod-name=prometheus-k8s-0
- id: wait-for-pods
config:
namespace_pattern: ^openshift-monitoring$
label_selector: statefulset.kubernetes.io/pod-name=prometheus-k8s-0
count: 1
krkn_pod_recovery_time: 120

@@ -3,9 +3,4 @@
config:
namespace_pattern: ^openshift-monitoring$
label_selector: app=prometheus
- id: wait-for-pods
config:
namespace_pattern: ^openshift-monitoring$
label_selector: app=prometheus
count: 2
timeout: 180
krkn_pod_recovery_time: 120

@@ -4,3 +4,4 @@
namespace_pattern: ^openshift-.*$
name_pattern: .*
kill: 3
krkn_pod_recovery_time: 120

@@ -60,7 +60,14 @@
"default": 1,
"title": "Backoff",
"description": "How many seconds to wait between checks for the target pod status."
},
"krkn_pod_recovery_time": {
"type": "integer",
"default": 30,
"title": "Recovery Time",
"description": "The Expected Recovery time fo the pod (used by Krkn to monitor the pod lifecycle)."
}
},
"required": [
"namespace_pattern"
@@ -112,6 +119,12 @@
"default": 1,
"title": "Backoff",
"description": "How many seconds to wait between checks for the target pod status."
},
"krkn_pod_recovery_time": {
"type": "integer",
"default": 30,
"title": "Recovery Time",
"description": "The Expected Recovery time fo the pod (used by Krkn to monitor the pod lifecycle)."
}
},
"required": [