checking chunk error in ci tests (#937)

Signed-off-by: Paige Patton <prubenda@redhat.com>
Author: Paige Patton
Date: 2025-12-17 15:09:15 -05:00
Committed by: GitHub
Parent: c3f6b1a7ff
Commit: e7fa6bdebc
8 changed files with 57 additions and 52 deletions

View File

@@ -102,7 +102,7 @@ jobs:
echo "test_pod_network_filter" >> ./CI/tests/functional_tests
echo "test_pod_server" >> ./CI/tests/functional_tests
echo "test_node" >> ./CI/tests/functional_tests
echo "test_pvc" >> ./CI/tests/functional_tests
# echo "test_pvc" >> ./CI/tests/functional_tests
# Push on main only steps + all other functional to collect coverage
# for the badge
@@ -140,7 +140,7 @@ jobs:
echo "test_pod_network_filter" >> ./CI/tests/functional_tests
echo "test_pod_server" >> ./CI/tests/functional_tests
echo "test_node" >> ./CI/tests/functional_tests
echo "test_pvc" >> ./CI/tests/functional_tests
# echo "test_pvc" >> ./CI/tests/functional_tests
# Final common steps
- name: Run Functional tests
env:

View File

@@ -16,8 +16,10 @@ function functional_test_container_crash {
export post_config=""
envsubst < CI/config/common_test_config.yaml > CI/config/container_config.yaml
- python3 -m coverage run -a run_kraken.py -c CI/config/container_config.yaml
+ python3 -m coverage run -a run_kraken.py -c CI/config/container_config.yaml -d True
echo "Container scenario test: Success"
+ kubectl get pods -n kube-system -l component=etcd
}
functional_test_container_crash
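For context, the kubectl call added above dumps the etcd pods after the container scenario so the CI logs show whether etcd recovered. Below is a minimal sketch of the same check with the official kubernetes Python client; the helper name check_etcd_pods is hypothetical, while the namespace and label come from the script above.

# Sketch: list etcd pods in kube-system, mirroring the added kubectl call.
from kubernetes import client, config

def check_etcd_pods():
    config.load_kube_config()  # assumes a reachable kubeconfig, as in the CI kind cluster
    v1 = client.CoreV1Api()
    pods = v1.list_namespaced_pod("kube-system", label_selector="component=etcd")
    for pod in pods.items:
        print(pod.metadata.name, pod.status.phase)

if __name__ == "__main__":
    check_etcd_pods()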

View File

@@ -11,7 +11,7 @@ function functional_test_customapp_pod_node_selector {
export post_config=""
envsubst < CI/config/common_test_config.yaml > CI/config/customapp_pod_config.yaml
- python3 -m coverage run -a run_kraken.py -c CI/config/customapp_pod_config.yaml
+ python3 -m coverage run -a run_kraken.py -c CI/config/customapp_pod_config.yaml -d True
echo "Pod disruption with node_label_selector test: Success"
}

View File

@@ -10,9 +10,11 @@ function functional_test_pod_crash {
export scenario_file="scenarios/kind/pod_etcd.yml"
export post_config=""
envsubst < CI/config/common_test_config.yaml > CI/config/pod_config.yaml
+ cat CI/config/pod_config.yaml
python3 -m coverage run -a run_kraken.py -c CI/config/pod_config.yaml
echo "Pod disruption scenario test: Success"
+ date
+ kubectl get pods -n kube-system -l component=etcd -o yaml
}
functional_test_pod_crash

View File

@@ -1,6 +1,7 @@
import logging
import random
import time
+ import traceback
from asyncio import Future
import yaml
from krkn_lib.k8s import KrknKubernetes
@@ -41,6 +42,7 @@ class ContainerScenarioPlugin(AbstractScenarioPlugin):
logging.info("ContainerScenarioPlugin failed with unrecovered containers")
return 1
except (RuntimeError, Exception) as e:
logging.error("Stack trace:\n%s", traceback.format_exc())
logging.error("ContainerScenarioPlugin exiting due to Exception %s" % e)
return 1
else:
@@ -50,7 +52,6 @@ class ContainerScenarioPlugin(AbstractScenarioPlugin):
return ["container_scenarios"]
def start_monitoring(self, kill_scenario: dict, lib_telemetry: KrknTelemetryOpenshift) -> Future:
namespace_pattern = f"^{kill_scenario['namespace']}$"
label_selector = kill_scenario["label_selector"]
recovery_time = kill_scenario["expected_recovery_time"]
@@ -232,4 +233,5 @@ class ContainerScenarioPlugin(AbstractScenarioPlugin):
timer += 5
logging.info("Waiting 5 seconds for containers to become ready")
time.sleep(5)
+ return killed_container_list
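The two lines added to the except block above follow the standard traceback idiom: log the full stack trace before the human-readable message so CI failures are diagnosable from the logs alone. A small self-contained sketch of that pattern (the function name and failure are illustrative, not the plugin's):

import logging
import traceback

def run_step():
    try:
        raise RuntimeError("containers did not recover")  # placeholder failure
    except Exception as e:
        # format_exc() captures the whole stack of the active exception
        logging.error("Stack trace:\n%s", traceback.format_exc())
        logging.error("Step exiting due to Exception %s", e)
        return 1
    return 0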

View File

@@ -2,7 +2,7 @@ import logging
import random
import time
- from asyncio import Future
+ import traceback
import yaml
from krkn_lib.k8s import KrknKubernetes
from krkn_lib.k8s.pod_monitor import select_and_monitor_by_namespace_pattern_and_label, \
@@ -74,6 +74,7 @@ class PodDisruptionScenarioPlugin(AbstractScenarioPlugin):
return 1
except (RuntimeError, Exception) as e:
logging.error("Stack trace:\n%s", traceback.format_exc())
logging.error("PodDisruptionScenariosPlugin exiting due to Exception %s" % e)
return 1
else:
@@ -150,7 +151,7 @@ class PodDisruptionScenarioPlugin(AbstractScenarioPlugin):
field_selector=combined_field_selector
)
- def get_pods(self, name_pattern, label_selector, namespace, kubecli: KrknKubernetes, field_selector: str = None, node_label_selector: str = None, node_names: list = None, quiet: bool = False):
+ def get_pods(self, name_pattern, label_selector, namespace, kubecli: KrknKubernetes, field_selector: str = None, node_label_selector: str = None, node_names: list = None):
if label_selector and name_pattern:
logging.error('Only, one of name pattern or label pattern can be specified')
return []
@@ -161,8 +162,7 @@ class PodDisruptionScenarioPlugin(AbstractScenarioPlugin):
# If specific node names are provided, make multiple calls with field selector
if node_names:
- if not quiet:
- logging.info(f"Targeting pods on {len(node_names)} specific nodes")
+ logging.debug(f"Targeting pods on {len(node_names)} specific nodes")
all_pods = []
for node_name in node_names:
pods = self._select_pods_with_field_selector(
@@ -172,8 +172,7 @@ class PodDisruptionScenarioPlugin(AbstractScenarioPlugin):
if pods:
all_pods.extend(pods)
- if not quiet:
- logging.info(f"Found {len(all_pods)} target pods across {len(node_names)} nodes")
+ logging.debug(f"Found {len(all_pods)} target pods across {len(node_names)} nodes")
return all_pods
# Node label selector approach - use field selectors
@@ -181,11 +180,10 @@ class PodDisruptionScenarioPlugin(AbstractScenarioPlugin):
# Get nodes matching the label selector first
nodes_with_label = kubecli.list_nodes(label_selector=node_label_selector)
if not nodes_with_label:
logging.info(f"No nodes found with label selector: {node_label_selector}")
logging.debug(f"No nodes found with label selector: {node_label_selector}")
return []
- if not quiet:
- logging.info(f"Targeting pods on {len(nodes_with_label)} nodes with label: {node_label_selector}")
+ logging.debug(f"Targeting pods on {len(nodes_with_label)} nodes with label: {node_label_selector}")
# Use field selector for each node
all_pods = []
for node_name in nodes_with_label:
@@ -196,8 +194,7 @@ class PodDisruptionScenarioPlugin(AbstractScenarioPlugin):
if pods:
all_pods.extend(pods)
- if not quiet:
- logging.info(f"Found {len(all_pods)} target pods across {len(nodes_with_label)} nodes")
+ logging.debug(f"Found {len(all_pods)} target pods across {len(nodes_with_label)} nodes")
return all_pods
# Standard pod selection (no node targeting)
@@ -207,37 +204,40 @@ class PodDisruptionScenarioPlugin(AbstractScenarioPlugin):
def killing_pods(self, config: InputParams, kubecli: KrknKubernetes):
# region Select target pods
+ try:
- namespace = config.namespace_pattern
- if not namespace:
- logging.error('Namespace pattern must be specified')
+ namespace = config.namespace_pattern
+ if not namespace:
+ logging.error('Namespace pattern must be specified')
return 2
- pods = self.get_pods(config.name_pattern,config.label_selector,config.namespace_pattern, kubecli, field_selector="status.phase=Running", node_label_selector=config.node_label_selector, node_names=config.node_names)
- exclude_pods = set()
- if config.exclude_label:
- _exclude_pods = self.get_pods("",config.exclude_label,config.namespace_pattern, kubecli, field_selector="status.phase=Running", node_label_selector=config.node_label_selector, node_names=config.node_names)
- for pod in _exclude_pods:
- exclude_pods.add(pod[0])
+ pods = self.get_pods(config.name_pattern,config.label_selector,config.namespace_pattern, kubecli, field_selector="status.phase=Running", node_label_selector=config.node_label_selector, node_names=config.node_names)
+ exclude_pods = set()
+ if config.exclude_label:
+ _exclude_pods = self.get_pods("",config.exclude_label,config.namespace_pattern, kubecli, field_selector="status.phase=Running", node_label_selector=config.node_label_selector, node_names=config.node_names)
+ for pod in _exclude_pods:
+ exclude_pods.add(pod[0])
- pods_count = len(pods)
- if len(pods) < config.kill:
- logging.error("Not enough pods match the criteria, expected {} but found only {} pods".format(
- config.kill, len(pods)))
- return 2
+ pods_count = len(pods)
+ if len(pods) < config.kill:
+ logging.error("Not enough pods match the criteria, expected {} but found only {} pods".format(
+ config.kill, len(pods)))
+ return 1
- random.shuffle(pods)
- for i in range(config.kill):
- pod = pods[i]
- logging.info(pod)
- if pod[0] in exclude_pods:
- logging.info(f"Excluding {pod[0]} from chaos")
- else:
- logging.info(f'Deleting pod {pod[0]}')
- kubecli.delete_pod(pod[0], pod[1])
+ random.shuffle(pods)
+ for i in range(config.kill):
+ pod = pods[i]
+ logging.info(pod)
+ if pod[0] in exclude_pods:
+ logging.info(f"Excluding {pod[0]} from chaos")
+ else:
+ logging.info(f'Deleting pod {pod[0]}')
+ kubecli.delete_pod(pod[0], pod[1])
- ret = self.wait_for_pods(config.label_selector,config.name_pattern,config.namespace_pattern, pods_count, config.duration, config.timeout, kubecli, config.node_label_selector, config.node_names)
- return ret
+ return_val = self.wait_for_pods(config.label_selector,config.name_pattern,config.namespace_pattern, pods_count, config.duration, config.timeout, kubecli, config.node_label_selector, config.node_names)
+ except Exception as e:
+ raise(e)
+ return return_val
def wait_for_pods(
self, label_selector, pod_name, namespace, pod_count, duration, wait_timeout, kubecli: KrknKubernetes, node_label_selector, node_names
@@ -246,7 +246,7 @@ class PodDisruptionScenarioPlugin(AbstractScenarioPlugin):
start_time = datetime.now()
while not timeout:
- pods = self.get_pods(name_pattern=pod_name, label_selector=label_selector,namespace=namespace, field_selector="status.phase=Running", kubecli=kubecli, node_label_selector=node_label_selector, node_names=node_names, quiet=True)
+ pods = self.get_pods(name_pattern=pod_name, label_selector=label_selector,namespace=namespace, field_selector="status.phase=Running", kubecli=kubecli, node_label_selector=node_label_selector, node_names=node_names)
if pod_count == len(pods):
return 0
@@ -259,5 +259,4 @@ class PodDisruptionScenarioPlugin(AbstractScenarioPlugin):
logging.error("timeout while waiting for pods to come up")
return 1
# should never get to this return
return 0
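The comments in the hunk above describe the per-node selection strategy: resolve node names first, then query pods one node at a time with a field selector. Below is a hedged sketch of that approach with the kubernetes Python client, assuming the spec.nodeName and status.phase field selectors; the helper name is hypothetical and not the plugin's API.

from kubernetes import client, config

def running_pods_on_nodes(namespace: str, node_names: list) -> list:
    # One API call per node: spec.nodeName narrows to that node, and
    # status.phase=Running mirrors the filter used by the plugin above.
    config.load_kube_config()
    v1 = client.CoreV1Api()
    all_pods = []
    for node_name in node_names:
        selector = f"spec.nodeName={node_name},status.phase=Running"
        pods = v1.list_namespaced_pod(namespace, field_selector=selector)
        all_pods.extend(p.metadata.name for p in pods.items)
    return all_pods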

View File

@@ -16,7 +16,7 @@ google-cloud-compute==1.22.0
ibm_cloud_sdk_core==3.18.0
ibm_vpc==0.20.0
jinja2==3.1.6
- krkn-lib==5.1.12
+ krkn-lib==5.1.13
lxml==5.1.0
kubernetes==34.1.0
numpy==1.26.4

View File

@@ -2,6 +2,6 @@ pvc_scenario:
pvc_name: kraken-test-pvc # Name of the target PVC
pod_name: kraken-test-pod # Name of the pod where the PVC is mounted, it will be ignored if the pvc_name is defined
namespace: kraken # Namespace where the PVC is
- fill_percentage: 38 # Target percentage to fill up the cluster, value must be higher than current percentage, valid values are between 0 and 99
+ fill_percentage: 98 # Target percentage to fill up the cluster, value must be higher than current percentage, valid values are between 0 and 99
duration: 10 # Duration in seconds for the fault
block_size: 102400 # used only by dd if fallocate not present in the container
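For reference, fill_percentage translates into a file written inside the PVC until usage reaches the target. The sketch below shows one way to compute that size and build the fallocate/dd command described by the comments above; it is illustrative only, assumes capacity and current usage are already known, and is not how the scenario itself is implemented.

def bytes_to_fill(capacity_bytes: int, used_bytes: int, fill_percentage: int) -> int:
    # Target usage is a fraction of total capacity; never return a negative size.
    target_used = capacity_bytes * fill_percentage // 100
    return max(target_used - used_bytes, 0)

def fill_command(path: str, size_bytes: int, block_size: int = 102400) -> str:
    # Prefer fallocate; fall back to dd writing block_size chunks, as the
    # block_size comment above suggests.
    blocks = size_bytes // block_size
    return (f"fallocate -l {size_bytes} {path} || "
            f"dd if=/dev/urandom of={path} bs={block_size} count={blocks}")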