checking chunk error in ci tests (#937)

Signed-off-by: Paige Patton <prubenda@redhat.com>
Author: Paige Patton
Date: 2025-12-17 15:09:15 -05:00
Committed by: GitHub
Parent: c3f6b1a7ff
Commit: e7fa6bdebc
8 changed files with 57 additions and 52 deletions

View File

@@ -102,7 +102,7 @@ jobs:
echo "test_pod_network_filter" >> ./CI/tests/functional_tests
echo "test_pod_server" >> ./CI/tests/functional_tests
echo "test_node" >> ./CI/tests/functional_tests
echo "test_pvc" >> ./CI/tests/functional_tests
# echo "test_pvc" >> ./CI/tests/functional_tests
# Push on main only steps + all other functional to collect coverage
# for the badge
@@ -140,7 +140,7 @@ jobs:
echo "test_pod_network_filter" >> ./CI/tests/functional_tests
echo "test_pod_server" >> ./CI/tests/functional_tests
echo "test_node" >> ./CI/tests/functional_tests
echo "test_pvc" >> ./CI/tests/functional_tests
# echo "test_pvc" >> ./CI/tests/functional_tests
# Final common steps
- name: Run Functional tests
env:

View File

@@ -16,8 +16,10 @@ function functional_test_container_crash {
export post_config=""
envsubst < CI/config/common_test_config.yaml > CI/config/container_config.yaml
- python3 -m coverage run -a run_kraken.py -c CI/config/container_config.yaml
+ python3 -m coverage run -a run_kraken.py -c CI/config/container_config.yaml -d True
echo "Container scenario test: Success"
+ kubectl get pods -n kube-system -l component=etcd
}
functional_test_container_crash
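For context, the kubectl call added above dumps the etcd pods after the container scenario so the CI logs show whether etcd recovered. Below is a minimal sketch of the same check with the official kubernetes Python client; the helper name check_etcd_pods is hypothetical, while the namespace and label come from the script above.

# Sketch: list etcd pods in kube-system, mirroring the added kubectl call.
from kubernetes import client, config

def check_etcd_pods():
    config.load_kube_config()  # assumes a reachable kubeconfig, as in the CI kind cluster
    v1 = client.CoreV1Api()
    pods = v1.list_namespaced_pod("kube-system", label_selector="component=etcd")
    for pod in pods.items:
        print(pod.metadata.name, pod.status.phase)

if __name__ == "__main__":
    check_etcd_pods()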

View File

@@ -11,7 +11,7 @@ function functional_test_customapp_pod_node_selector {
export post_config=""
envsubst < CI/config/common_test_config.yaml > CI/config/customapp_pod_config.yaml
- python3 -m coverage run -a run_kraken.py -c CI/config/customapp_pod_config.yaml
+ python3 -m coverage run -a run_kraken.py -c CI/config/customapp_pod_config.yaml -d True
echo "Pod disruption with node_label_selector test: Success"
}

View File

@@ -10,9 +10,11 @@ function functional_test_pod_crash {
export scenario_file="scenarios/kind/pod_etcd.yml"
export post_config=""
envsubst < CI/config/common_test_config.yaml > CI/config/pod_config.yaml
+ cat CI/config/pod_config.yaml
python3 -m coverage run -a run_kraken.py -c CI/config/pod_config.yaml
echo "Pod disruption scenario test: Success"
+ date
+ kubectl get pods -n kube-system -l component=etcd -o yaml
}
functional_test_pod_crash

View File

@@ -1,6 +1,7 @@
import logging
import random
import time
+ import traceback
from asyncio import Future
import yaml
from krkn_lib.k8s import KrknKubernetes
@@ -41,6 +42,7 @@ class ContainerScenarioPlugin(AbstractScenarioPlugin):
logging.info("ContainerScenarioPlugin failed with unrecovered containers")
return 1
except (RuntimeError, Exception) as e:
logging.error("Stack trace:\n%s", traceback.format_exc())
logging.error("ContainerScenarioPlugin exiting due to Exception %s" % e)
return 1
else:
@@ -50,7 +52,6 @@ class ContainerScenarioPlugin(AbstractScenarioPlugin):
return ["container_scenarios"]
def start_monitoring(self, kill_scenario: dict, lib_telemetry: KrknTelemetryOpenshift) -> Future:
namespace_pattern = f"^{kill_scenario['namespace']}$"
label_selector = kill_scenario["label_selector"]
recovery_time = kill_scenario["expected_recovery_time"]
@@ -232,4 +233,5 @@ class ContainerScenarioPlugin(AbstractScenarioPlugin):
timer += 5
logging.info("Waiting 5 seconds for containers to become ready")
time.sleep(5)
+ return killed_container_list
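The two lines added to the except block above follow the standard traceback idiom: log the full stack trace before the human-readable message so CI failures are diagnosable from the logs alone. A small self-contained sketch of that pattern (the function name and failure are illustrative, not the plugin's):

import logging
import traceback

def run_step():
    try:
        raise RuntimeError("containers did not recover")  # placeholder failure
    except Exception as e:
        # format_exc() captures the whole stack of the active exception
        logging.error("Stack trace:\n%s", traceback.format_exc())
        logging.error("Step exiting due to Exception %s", e)
        return 1
    return 0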

View File

@@ -2,7 +2,7 @@ import logging
import random
import time
- from asyncio import Future
+ import traceback
import yaml
from krkn_lib.k8s import KrknKubernetes
from krkn_lib.k8s.pod_monitor import select_and_monitor_by_namespace_pattern_and_label, \
@@ -74,6 +74,7 @@ class PodDisruptionScenarioPlugin(AbstractScenarioPlugin):
return 1
except (RuntimeError, Exception) as e:
logging.error("Stack trace:\n%s", traceback.format_exc())
logging.error("PodDisruptionScenariosPlugin exiting due to Exception %s" % e)
return 1
else:
@@ -150,7 +151,7 @@ class PodDisruptionScenarioPlugin(AbstractScenarioPlugin):
field_selector=combined_field_selector
)
- def get_pods(self, name_pattern, label_selector, namespace, kubecli: KrknKubernetes, field_selector: str = None, node_label_selector: str = None, node_names: list = None, quiet: bool = False):
+ def get_pods(self, name_pattern, label_selector, namespace, kubecli: KrknKubernetes, field_selector: str = None, node_label_selector: str = None, node_names: list = None):
if label_selector and name_pattern:
logging.error('Only, one of name pattern or label pattern can be specified')
return []
@@ -161,8 +162,7 @@ class PodDisruptionScenarioPlugin(AbstractScenarioPlugin):
# If specific node names are provided, make multiple calls with field selector
if node_names:
- if not quiet:
- logging.info(f"Targeting pods on {len(node_names)} specific nodes")
+ logging.debug(f"Targeting pods on {len(node_names)} specific nodes")
all_pods = []
for node_name in node_names:
pods = self._select_pods_with_field_selector(
@@ -172,8 +172,7 @@ class PodDisruptionScenarioPlugin(AbstractScenarioPlugin):
if pods:
all_pods.extend(pods)
- if not quiet:
- logging.info(f"Found {len(all_pods)} target pods across {len(node_names)} nodes")
+ logging.debug(f"Found {len(all_pods)} target pods across {len(node_names)} nodes")
return all_pods
# Node label selector approach - use field selectors
@@ -181,11 +180,10 @@ class PodDisruptionScenarioPlugin(AbstractScenarioPlugin):
# Get nodes matching the label selector first
nodes_with_label = kubecli.list_nodes(label_selector=node_label_selector)
if not nodes_with_label:
logging.info(f"No nodes found with label selector: {node_label_selector}")
logging.debug(f"No nodes found with label selector: {node_label_selector}")
return []
- if not quiet:
- logging.info(f"Targeting pods on {len(nodes_with_label)} nodes with label: {node_label_selector}")
+ logging.debug(f"Targeting pods on {len(nodes_with_label)} nodes with label: {node_label_selector}")
# Use field selector for each node
all_pods = []
for node_name in nodes_with_label:
@@ -196,8 +194,7 @@ class PodDisruptionScenarioPlugin(AbstractScenarioPlugin):
if pods:
all_pods.extend(pods)
- if not quiet:
- logging.info(f"Found {len(all_pods)} target pods across {len(nodes_with_label)} nodes")
+ logging.debug(f"Found {len(all_pods)} target pods across {len(nodes_with_label)} nodes")
return all_pods
# Standard pod selection (no node targeting)
@@ -207,37 +204,40 @@ class PodDisruptionScenarioPlugin(AbstractScenarioPlugin):
def killing_pods(self, config: InputParams, kubecli: KrknKubernetes):
# region Select target pods
+ try:
- namespace = config.namespace_pattern
- if not namespace:
- logging.error('Namespace pattern must be specified')
+ namespace = config.namespace_pattern
+ if not namespace:
+ logging.error('Namespace pattern must be specified')
return 2
- pods = self.get_pods(config.name_pattern,config.label_selector,config.namespace_pattern, kubecli, field_selector="status.phase=Running", node_label_selector=config.node_label_selector, node_names=config.node_names)
- exclude_pods = set()
- if config.exclude_label:
- _exclude_pods = self.get_pods("",config.exclude_label,config.namespace_pattern, kubecli, field_selector="status.phase=Running", node_label_selector=config.node_label_selector, node_names=config.node_names)
- for pod in _exclude_pods:
- exclude_pods.add(pod[0])
+ pods = self.get_pods(config.name_pattern,config.label_selector,config.namespace_pattern, kubecli, field_selector="status.phase=Running", node_label_selector=config.node_label_selector, node_names=config.node_names)
+ exclude_pods = set()
+ if config.exclude_label:
+ _exclude_pods = self.get_pods("",config.exclude_label,config.namespace_pattern, kubecli, field_selector="status.phase=Running", node_label_selector=config.node_label_selector, node_names=config.node_names)
+ for pod in _exclude_pods:
+ exclude_pods.add(pod[0])
- pods_count = len(pods)
- if len(pods) < config.kill:
- logging.error("Not enough pods match the criteria, expected {} but found only {} pods".format(
- config.kill, len(pods)))
- return 2
+ pods_count = len(pods)
+ if len(pods) < config.kill:
+ logging.error("Not enough pods match the criteria, expected {} but found only {} pods".format(
+ config.kill, len(pods)))
+ return 1
- random.shuffle(pods)
- for i in range(config.kill):
- pod = pods[i]
- logging.info(pod)
- if pod[0] in exclude_pods:
- logging.info(f"Excluding {pod[0]} from chaos")
- else:
- logging.info(f'Deleting pod {pod[0]}')
- kubecli.delete_pod(pod[0], pod[1])
+ random.shuffle(pods)
+ for i in range(config.kill):
+ pod = pods[i]
+ logging.info(pod)
+ if pod[0] in exclude_pods:
+ logging.info(f"Excluding {pod[0]} from chaos")
+ else:
+ logging.info(f'Deleting pod {pod[0]}')
+ kubecli.delete_pod(pod[0], pod[1])
- ret = self.wait_for_pods(config.label_selector,config.name_pattern,config.namespace_pattern, pods_count, config.duration, config.timeout, kubecli, config.node_label_selector, config.node_names)
- return ret
+ return_val = self.wait_for_pods(config.label_selector,config.name_pattern,config.namespace_pattern, pods_count, config.duration, config.timeout, kubecli, config.node_label_selector, config.node_names)
+ except Exception as e:
+ raise(e)
+ return return_val
def wait_for_pods(
self, label_selector, pod_name, namespace, pod_count, duration, wait_timeout, kubecli: KrknKubernetes, node_label_selector, node_names
@@ -246,7 +246,7 @@ class PodDisruptionScenarioPlugin(AbstractScenarioPlugin):
start_time = datetime.now()
while not timeout:
- pods = self.get_pods(name_pattern=pod_name, label_selector=label_selector,namespace=namespace, field_selector="status.phase=Running", kubecli=kubecli, node_label_selector=node_label_selector, node_names=node_names, quiet=True)
+ pods = self.get_pods(name_pattern=pod_name, label_selector=label_selector,namespace=namespace, field_selector="status.phase=Running", kubecli=kubecli, node_label_selector=node_label_selector, node_names=node_names)
if pod_count == len(pods):
return 0
@@ -259,5 +259,4 @@ class PodDisruptionScenarioPlugin(AbstractScenarioPlugin):
logging.error("timeout while waiting for pods to come up")
return 1
# should never get to this return
return 0
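The comments in the hunk above describe the per-node selection strategy: resolve node names first, then query pods one node at a time with a field selector. Below is a hedged sketch of that approach with the kubernetes Python client, assuming the spec.nodeName and status.phase field selectors; the helper name is hypothetical and not the plugin's API.

from kubernetes import client, config

def running_pods_on_nodes(namespace: str, node_names: list) -> list:
    # One API call per node: spec.nodeName narrows to that node, and
    # status.phase=Running mirrors the filter used by the plugin above.
    config.load_kube_config()
    v1 = client.CoreV1Api()
    all_pods = []
    for node_name in node_names:
        selector = f"spec.nodeName={node_name},status.phase=Running"
        pods = v1.list_namespaced_pod(namespace, field_selector=selector)
        all_pods.extend(p.metadata.name for p in pods.items)
    return all_pods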

View File

@@ -16,7 +16,7 @@ google-cloud-compute==1.22.0
ibm_cloud_sdk_core==3.18.0
ibm_vpc==0.20.0
jinja2==3.1.6
- krkn-lib==5.1.12
+ krkn-lib==5.1.13
lxml==5.1.0
kubernetes==34.1.0
numpy==1.26.4

View File

@@ -2,6 +2,6 @@ pvc_scenario:
pvc_name: kraken-test-pvc # Name of the target PVC
pod_name: kraken-test-pod # Name of the pod where the PVC is mounted, it will be ignored if the pvc_name is defined
namespace: kraken # Namespace where the PVC is
- fill_percentage: 38 # Target percentage to fill up the cluster, value must be higher than current percentage, valid values are between 0 and 99
+ fill_percentage: 98 # Target percentage to fill up the cluster, value must be higher than current percentage, valid values are between 0 and 99
duration: 10 # Duration in seconds for the fault
block_size: 102400 # used only by dd if fallocate not present in the container
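For reference, fill_percentage translates into a file written inside the PVC until usage reaches the target. The sketch below shows one way to compute that size and build the fallocate/dd command described by the comments above; it is illustrative only, assumes capacity and current usage are already known, and is not how the scenario itself is implemented.

def bytes_to_fill(capacity_bytes: int, used_bytes: int, fill_percentage: int) -> int:
    # Target usage is a fraction of total capacity; never return a negative size.
    target_used = capacity_bytes * fill_percentage // 100
    return max(target_used - used_bytes, 0)

def fill_command(path: str, size_bytes: int, block_size: int = 102400) -> str:
    # Prefer fallocate; fall back to dd writing block_size chunks, as the
    # block_size comment above suggests.
    blocks = size_bytes // block_size
    return (f"fallocate -l {size_bytes} {path} || "
            f"dd if=/dev/urandom of={path} bs={block_size} count={blocks}")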