diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index a8498bb1..7053c282 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -102,7 +102,7 @@ jobs:
           echo "test_pod_network_filter" >> ./CI/tests/functional_tests
           echo "test_pod_server" >> ./CI/tests/functional_tests
           echo "test_node" >> ./CI/tests/functional_tests
-          echo "test_pvc" >> ./CI/tests/functional_tests
+          # echo "test_pvc" >> ./CI/tests/functional_tests
 
           # Push on main only steps + all other functional to collect coverage
           # for the badge
@@ -140,7 +140,7 @@ jobs:
           echo "test_pod_network_filter" >> ./CI/tests/functional_tests
           echo "test_pod_server" >> ./CI/tests/functional_tests
           echo "test_node" >> ./CI/tests/functional_tests
-          echo "test_pvc" >> ./CI/tests/functional_tests
+          # echo "test_pvc" >> ./CI/tests/functional_tests
       # Final common steps
       - name: Run Functional tests
         env:
diff --git a/CI/tests/test_container.sh b/CI/tests/test_container.sh
index 9042b021..271b43fa 100755
--- a/CI/tests/test_container.sh
+++ b/CI/tests/test_container.sh
@@ -16,8 +16,10 @@ function functional_test_container_crash {
   export post_config=""
   envsubst < CI/config/common_test_config.yaml > CI/config/container_config.yaml
 
-  python3 -m coverage run -a run_kraken.py -c CI/config/container_config.yaml
+  python3 -m coverage run -a run_kraken.py -c CI/config/container_config.yaml -d True
   echo "Container scenario test: Success"
+
+  kubectl get pods -n kube-system -l component=etcd
 }
 
 functional_test_container_crash
diff --git a/CI/tests/test_customapp_pod.sh b/CI/tests/test_customapp_pod.sh
index c07869c8..6ae39230 100755
--- a/CI/tests/test_customapp_pod.sh
+++ b/CI/tests/test_customapp_pod.sh
@@ -11,7 +11,7 @@ function functional_test_customapp_pod_node_selector {
   export post_config=""
   envsubst < CI/config/common_test_config.yaml > CI/config/customapp_pod_config.yaml
 
-  python3 -m coverage run -a run_kraken.py -c CI/config/customapp_pod_config.yaml
+  python3 -m coverage run -a run_kraken.py -c CI/config/customapp_pod_config.yaml -d True
   echo "Pod disruption with node_label_selector test: Success"
 }
 
diff --git a/CI/tests/test_pod.sh b/CI/tests/test_pod.sh
index 97df491d..e09356ef 100755
--- a/CI/tests/test_pod.sh
+++ b/CI/tests/test_pod.sh
@@ -10,9 +10,11 @@ function functional_test_pod_crash {
   export scenario_file="scenarios/kind/pod_etcd.yml"
   export post_config=""
   envsubst < CI/config/common_test_config.yaml > CI/config/pod_config.yaml
-  cat CI/config/pod_config.yaml
+
   python3 -m coverage run -a run_kraken.py -c CI/config/pod_config.yaml
   echo "Pod disruption scenario test: Success"
+  date
+  kubectl get pods -n kube-system -l component=etcd -o yaml
 }
 
 functional_test_pod_crash
diff --git a/krkn/scenario_plugins/container/container_scenario_plugin.py b/krkn/scenario_plugins/container/container_scenario_plugin.py
index 1c8e6ec2..21d67dcb 100644
--- a/krkn/scenario_plugins/container/container_scenario_plugin.py
+++ b/krkn/scenario_plugins/container/container_scenario_plugin.py
@@ -1,6 +1,7 @@
 import logging
 import random
 import time
+import traceback
 from asyncio import Future
 import yaml
 from krkn_lib.k8s import KrknKubernetes
@@ -41,6 +42,7 @@ class ContainerScenarioPlugin(AbstractScenarioPlugin):
                 logging.info("ContainerScenarioPlugin failed with unrecovered containers")
                 return 1
         except (RuntimeError, Exception) as e:
+            logging.error("Stack trace:\n%s", traceback.format_exc())
            logging.error("ContainerScenarioPlugin exiting due to Exception %s" % e)
            return 1
        else:
@@ -50,7 +52,6 @@ class ContainerScenarioPlugin(AbstractScenarioPlugin):
        return ["container_scenarios"]
 
     def start_monitoring(self, kill_scenario: dict, lib_telemetry: KrknTelemetryOpenshift) -> Future:
-
        namespace_pattern = f"^{kill_scenario['namespace']}$"
        label_selector = kill_scenario["label_selector"]
        recovery_time = kill_scenario["expected_recovery_time"]
@@ -232,4 +233,5 @@ class ContainerScenarioPlugin(AbstractScenarioPlugin):
                timer += 5
                logging.info("Waiting 5 seconds for containers to become ready")
                time.sleep(5)
+
        return killed_container_list
diff --git a/krkn/scenario_plugins/pod_disruption/pod_disruption_scenario_plugin.py b/krkn/scenario_plugins/pod_disruption/pod_disruption_scenario_plugin.py
index 736f6d5f..df309cc9 100644
--- a/krkn/scenario_plugins/pod_disruption/pod_disruption_scenario_plugin.py
+++ b/krkn/scenario_plugins/pod_disruption/pod_disruption_scenario_plugin.py
@@ -2,7 +2,7 @@ import logging
 import random
 import time
 from asyncio import Future
-
+import traceback
 import yaml
 from krkn_lib.k8s import KrknKubernetes
 from krkn_lib.k8s.pod_monitor import select_and_monitor_by_namespace_pattern_and_label, \
@@ -74,6 +74,7 @@ class PodDisruptionScenarioPlugin(AbstractScenarioPlugin):
 
                return 1
        except (RuntimeError, Exception) as e:
+            logging.error("Stack trace:\n%s", traceback.format_exc())
            logging.error("PodDisruptionScenariosPlugin exiting due to Exception %s" % e)
            return 1
        else:
@@ -150,7 +151,7 @@ class PodDisruptionScenarioPlugin(AbstractScenarioPlugin):
            field_selector=combined_field_selector
        )
 
-    def get_pods(self, name_pattern, label_selector, namespace, kubecli: KrknKubernetes, field_selector: str = None, node_label_selector: str = None, node_names: list = None, quiet: bool = False):
+    def get_pods(self, name_pattern, label_selector, namespace, kubecli: KrknKubernetes, field_selector: str = None, node_label_selector: str = None, node_names: list = None):
        if label_selector and name_pattern:
            logging.error('Only, one of name pattern or label pattern can be specified')
            return []
@@ -161,8 +162,7 @@ class PodDisruptionScenarioPlugin(AbstractScenarioPlugin):
 
        # If specific node names are provided, make multiple calls with field selector
        if node_names:
-            if not quiet:
-                logging.info(f"Targeting pods on {len(node_names)} specific nodes")
+            logging.debug(f"Targeting pods on {len(node_names)} specific nodes")
            all_pods = []
            for node_name in node_names:
                pods = self._select_pods_with_field_selector(
@@ -172,8 +172,7 @@ class PodDisruptionScenarioPlugin(AbstractScenarioPlugin):
                if pods:
                    all_pods.extend(pods)
 
-            if not quiet:
-                logging.info(f"Found {len(all_pods)} target pods across {len(node_names)} nodes")
+            logging.debug(f"Found {len(all_pods)} target pods across {len(node_names)} nodes")
            return all_pods
 
        # Node label selector approach - use field selectors
@@ -181,11 +180,10 @@ class PodDisruptionScenarioPlugin(AbstractScenarioPlugin):
            # Get nodes matching the label selector first
            nodes_with_label = kubecli.list_nodes(label_selector=node_label_selector)
            if not nodes_with_label:
-                logging.info(f"No nodes found with label selector: {node_label_selector}")
+                logging.debug(f"No nodes found with label selector: {node_label_selector}")
                return []
 
-            if not quiet:
-                logging.info(f"Targeting pods on {len(nodes_with_label)} nodes with label: {node_label_selector}")
+            logging.debug(f"Targeting pods on {len(nodes_with_label)} nodes with label: {node_label_selector}")
            # Use field selector for each node
            all_pods = []
            for node_name in nodes_with_label:
@@ -196,8 +194,7 @@ class PodDisruptionScenarioPlugin(AbstractScenarioPlugin):
                if pods:
                    all_pods.extend(pods)
 
-            if not quiet:
-                logging.info(f"Found {len(all_pods)} target pods across {len(nodes_with_label)} nodes")
+            logging.debug(f"Found {len(all_pods)} target pods across {len(nodes_with_label)} nodes")
            return all_pods
 
        # Standard pod selection (no node targeting)
@@ -207,37 +204,40 @@ class PodDisruptionScenarioPlugin(AbstractScenarioPlugin):
 
    def killing_pods(self, config: InputParams, kubecli: KrknKubernetes):
        # region Select target pods
+        try:
+            namespace = config.namespace_pattern
+            if not namespace:
+                logging.error('Namespace pattern must be specified')
+
+            pods = self.get_pods(config.name_pattern,config.label_selector,config.namespace_pattern, kubecli, field_selector="status.phase=Running", node_label_selector=config.node_label_selector, node_names=config.node_names)
+            exclude_pods = set()
+            if config.exclude_label:
+                _exclude_pods = self.get_pods("",config.exclude_label,config.namespace_pattern, kubecli, field_selector="status.phase=Running", node_label_selector=config.node_label_selector, node_names=config.node_names)
+                for pod in _exclude_pods:
+                    exclude_pods.add(pod[0])
+
+
+            pods_count = len(pods)
+            if len(pods) < config.kill:
+                logging.error("Not enough pods match the criteria, expected {} but found only {} pods".format(
+                    config.kill, len(pods)))
+                return 1
 
-        namespace = config.namespace_pattern
-        if not namespace:
-            logging.error('Namespace pattern must be specified')
-            return 2
+            random.shuffle(pods)
+            for i in range(config.kill):
+                pod = pods[i]
+                logging.info(pod)
+                if pod[0] in exclude_pods:
+                    logging.info(f"Excluding {pod[0]} from chaos")
+                else:
+                    logging.info(f'Deleting pod {pod[0]}')
+                    kubecli.delete_pod(pod[0], pod[1])
+
+            return_val = self.wait_for_pods(config.label_selector,config.name_pattern,config.namespace_pattern, pods_count, config.duration, config.timeout, kubecli, config.node_label_selector, config.node_names)
+        except Exception as e:
+            raise(e)
 
-        pods = self.get_pods(config.name_pattern,config.label_selector,config.namespace_pattern, kubecli, field_selector="status.phase=Running", node_label_selector=config.node_label_selector, node_names=config.node_names)
-        exclude_pods = set()
-        if config.exclude_label:
-            _exclude_pods = self.get_pods("",config.exclude_label,config.namespace_pattern, kubecli, field_selector="status.phase=Running", node_label_selector=config.node_label_selector, node_names=config.node_names)
-            for pod in _exclude_pods:
-                exclude_pods.add(pod[0])
-
-        pods_count = len(pods)
-        if len(pods) < config.kill:
-            logging.error("Not enough pods match the criteria, expected {} but found only {} pods".format(
-                config.kill, len(pods)))
-            return 2
-
-        random.shuffle(pods)
-        for i in range(config.kill):
-            pod = pods[i]
-            logging.info(pod)
-            if pod[0] in exclude_pods:
-                logging.info(f"Excluding {pod[0]} from chaos")
-            else:
-                logging.info(f'Deleting pod {pod[0]}')
-                kubecli.delete_pod(pod[0], pod[1])
-
-        ret = self.wait_for_pods(config.label_selector,config.name_pattern,config.namespace_pattern, pods_count, config.duration, config.timeout, kubecli, config.node_label_selector, config.node_names)
-        return ret
+        return return_val
 
    def wait_for_pods(
        self, label_selector, pod_name, namespace, pod_count, duration, wait_timeout, kubecli: KrknKubernetes, node_label_selector, node_names
@@ -246,10 +246,10 @@ class PodDisruptionScenarioPlugin(AbstractScenarioPlugin):
 
        start_time = datetime.now()
        while not timeout:
-            pods = self.get_pods(name_pattern=pod_name, label_selector=label_selector,namespace=namespace, field_selector="status.phase=Running", kubecli=kubecli, node_label_selector=node_label_selector, node_names=node_names, quiet=True)
+            pods = self.get_pods(name_pattern=pod_name, label_selector=label_selector,namespace=namespace, field_selector="status.phase=Running", kubecli=kubecli, node_label_selector=node_label_selector, node_names=node_names)
            if pod_count == len(pods):
                return 0
-
+
            time.sleep(duration)
 
            now_time = datetime.now()
@@ -258,6 +258,5 @@ class PodDisruptionScenarioPlugin(AbstractScenarioPlugin):
        if time_diff.seconds > wait_timeout:
            logging.error("timeout while waiting for pods to come up")
            return 1
-
-        # should never get to this return
+
        return 0
diff --git a/requirements.txt b/requirements.txt
index 2b0f577f..494eb303 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -16,7 +16,7 @@ google-cloud-compute==1.22.0
 ibm_cloud_sdk_core==3.18.0
 ibm_vpc==0.20.0
 jinja2==3.1.6
-krkn-lib==5.1.12
+krkn-lib==5.1.13
 lxml==5.1.0
 kubernetes==34.1.0
 numpy==1.26.4
diff --git a/scenarios/kind/pvc_scenario.yaml b/scenarios/kind/pvc_scenario.yaml
index 9385ced9..28c8d272 100644
--- a/scenarios/kind/pvc_scenario.yaml
+++ b/scenarios/kind/pvc_scenario.yaml
@@ -2,6 +2,6 @@ pvc_scenario:
   pvc_name: kraken-test-pvc # Name of the target PVC
   pod_name: kraken-test-pod # Name of the pod where the PVC is mounted, it will be ignored if the pvc_name is defined
   namespace: kraken # Namespace where the PVC is
-  fill_percentage: 38 # Target percentage to fill up the cluster, value must be higher than current percentage, valid values are between 0 and 99
+  fill_percentage: 98 # Target percentage to fill up the cluster, value must be higher than current percentage, valid values are between 0 and 99
   duration: 10 # Duration in seconds for the fault
   block_size: 102400 # used only by dd if fallocate not present in the container