From 0777ef924fb751403102bbe0aed44765b24ba482 Mon Sep 17 00:00:00 2001
From: Paige Patton
Date: Wed, 8 Apr 2026 14:21:49 -0400
Subject: [PATCH] changing pod recovery to vmi recovery

Signed-off-by: Paige Patton
---
 krkn/prometheus/client.py                     | 10 ++++
 .../kubevirt_vm_outage_scenario_plugin.py     | 56 +++++++++----------
 requirements.txt                              |  2 +-
 tests/test_kubevirt_vm_outage.py              | 37 ++++++------
 tests/test_logging_and_code_quality.py        |  3 +
 5 files changed, 60 insertions(+), 48 deletions(-)

diff --git a/krkn/prometheus/client.py b/krkn/prometheus/client.py
index 41727e64..55f5dfac 100644
--- a/krkn/prometheus/client.py
+++ b/krkn/prometheus/client.py
@@ -266,6 +266,16 @@ def metrics(
                     metric['timestamp'] = str(datetime.datetime.now())
                     logging.debug("adding pod %s", metric)
                     metrics_list.append(metric.copy())
+        for k, v in scenario.get("affected_vmis", {}).items():
+            metric_name = "affected_vmis_recovery"
+            metric = {"metricName": metric_name, "type": k}
+            if isinstance(v, list):
+                for vmi in v:
+                    for vmi_key, vmi_value in vmi.items():
+                        metric[vmi_key] = vmi_value
+                    metric['timestamp'] = str(datetime.datetime.now())
+                    logging.debug("adding vmi %s", metric)
+                    metrics_list.append(metric.copy())
         for affected_node in scenario["affected_nodes"]:
             metric_name = "affected_nodes_recovery"
             metric = {"metricName": metric_name}
diff --git a/krkn/scenario_plugins/kubevirt_vm_outage/kubevirt_vm_outage_scenario_plugin.py b/krkn/scenario_plugins/kubevirt_vm_outage/kubevirt_vm_outage_scenario_plugin.py
index a9c1bf67..cbdedfd1 100644
--- a/krkn/scenario_plugins/kubevirt_vm_outage/kubevirt_vm_outage_scenario_plugin.py
+++ b/krkn/scenario_plugins/kubevirt_vm_outage/kubevirt_vm_outage_scenario_plugin.py
@@ -21,7 +21,7 @@ from krkn_lib.k8s import KrknKubernetes
 from krkn_lib.models.telemetry import ScenarioTelemetry
 from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift
 from krkn_lib.utils import log_exception
-from krkn_lib.models.k8s import AffectedPod, PodsStatus
+from krkn_lib.models.k8s import AffectedVMI, VmisStatus
 
 from krkn.scenario_plugins.abstract_scenario_plugin import AbstractScenarioPlugin
 
@@ -59,14 +59,14 @@ class KubevirtVmOutageScenarioPlugin(AbstractScenarioPlugin):
                 scenario_config = yaml.full_load(f)
 
             self.init_clients(lib_telemetry.get_lib_kubernetes())
-            pods_status = PodsStatus()
+            vmis_status = VmisStatus()
             for config in scenario_config["scenarios"]:
                 if config.get("scenario") == "kubevirt_vm_outage":
-                    single_pods_status = self.execute_scenario(config, scenario_telemetry)
-                    pods_status.merge(single_pods_status)
+                    single_vmis_status = self.execute_scenario(config, scenario_telemetry)
+                    vmis_status.merge(single_vmis_status)
 
-            scenario_telemetry.affected_pods = pods_status
-            if len(scenario_telemetry.affected_pods.unrecovered) > 0:
+            scenario_telemetry.affected_vmis = vmis_status
+            if len(scenario_telemetry.affected_vmis.unrecovered) > 0:
                 return 1
             return 0
         except Exception as e:
@@ -83,15 +83,15 @@ class KubevirtVmOutageScenarioPlugin(AbstractScenarioPlugin):
 
         logging.info("Successfully initialized Kubernetes client for KubeVirt operations")
 
-    def execute_scenario(self, config: Dict[str, Any], scenario_telemetry: ScenarioTelemetry) -> PodsStatus:
+    def execute_scenario(self, config: Dict[str, Any], scenario_telemetry: ScenarioTelemetry) -> VmisStatus:
         """
         Execute a KubeVirt VM outage scenario based on the provided configuration.
 
         :param config: The scenario configuration
         :param scenario_telemetry: The telemetry object for recording metrics
-        :return: PodsStatus object containing recovered and unrecovered pods
+        :return: VmisStatus object containing recovered and unrecovered VMIs
         """
-        self.pods_status = PodsStatus()
+        self.vmis_status = VmisStatus()
         try:
             params = config.get("parameters", {})
             vm_name = params.get("vm_name")
@@ -102,8 +102,8 @@ class KubevirtVmOutageScenarioPlugin(AbstractScenarioPlugin):
 
             if not vm_name:
                 logging.error("vm_name parameter is required")
-                return self.pods_status
-            self.pods_status = PodsStatus()
+                return self.vmis_status
+            self.vmis_status = VmisStatus()
 
             self.vmis_list = self.k8s_client.get_vmis(vm_name,namespace)
             for _ in range(kill_count):
@@ -114,48 +114,48 @@ class KubevirtVmOutageScenarioPlugin(AbstractScenarioPlugin):
                 vmi_name = vmi.get("metadata").get("name")
                 vmi_namespace = vmi.get("metadata").get("namespace")
 
-                # Create affected_pod early so we can track failures
-                self.affected_pod = AffectedPod(
-                    pod_name=vmi_name,
+                # Create affected_vmi early so we can track failures
+                self.affected_vmi = AffectedVMI(
+                    vmi_name=vmi_name,
                     namespace=vmi_namespace,
                 )
 
                 if not self.validate_environment(vmi_name, vmi_namespace):
-                    self.pods_status.unrecovered.append(self.affected_pod)
+                    self.vmis_status.unrecovered.append(self.affected_vmi)
                     continue
 
                 vmi = self.k8s_client.get_vmi(vmi_name, vmi_namespace)
                 if not vmi:
                     logging.error(f"VMI {vm_name} not found in namespace {namespace}")
-                    self.pods_status.unrecovered.append(self.affected_pod)
+                    self.vmis_status.unrecovered.append(self.affected_vmi)
                     continue
 
                 self.original_vmi = vmi
                 logging.info(f"Captured initial state of VMI: {vm_name}")
 
                 result = self.delete_vmi(vmi_name, vmi_namespace, disable_auto_restart)
                 if result != 0:
-                    self.pods_status.unrecovered.append(self.affected_pod)
+                    self.vmis_status.unrecovered.append(self.affected_vmi)
                     continue
 
                 result = self.wait_for_running(vmi_name,vmi_namespace, timeout)
                 if result != 0:
-                    self.pods_status.unrecovered.append(self.affected_pod)
+                    self.vmis_status.unrecovered.append(self.affected_vmi)
                     continue
 
-                self.affected_pod.total_recovery_time = (
-                    self.affected_pod.pod_readiness_time
-                    + self.affected_pod.pod_rescheduling_time
+                self.affected_vmi.total_recovery_time = (
+                    self.affected_vmi.vmi_readiness_time
+                    + self.affected_vmi.vmi_rescheduling_time
                 )
-                self.pods_status.recovered.append(self.affected_pod)
+                self.vmis_status.recovered.append(self.affected_vmi)
 
             logging.info(f"Successfully completed KubeVirt VM outage scenario for VM: {vm_name}")
-            return self.pods_status
+            return self.vmis_status
 
         except Exception as e:
             logging.error(f"Error executing KubeVirt VM outage scenario: {e}")
             log_exception(str(e))
-            return self.pods_status
+            return self.vmis_status
 
     def validate_environment(self, vm_name: str, namespace: str) -> bool:
         """
@@ -242,20 +242,20 @@ class KubevirtVmOutageScenarioPlugin(AbstractScenarioPlugin):
                 if deleted_vmi:
                     if start_creation_time != deleted_vmi.get('metadata', {}).get('creationTimestamp'):
                         logging.info(f"VMI {vm_name} successfully recreated")
-                        self.affected_pod.pod_rescheduling_time = time.time() - start_time
+                        self.affected_vmi.vmi_rescheduling_time = time.time() - start_time
                         return 0
                 else:
                     logging.info(f"VMI {vm_name} successfully deleted")
                 time.sleep(1)
 
             logging.error(f"Timed out waiting for VMI {vm_name} to be deleted")
-            self.pods_status.unrecovered.append(self.affected_pod)
+            self.vmis_status.unrecovered.append(self.affected_vmi)
             return 1
 
         except Exception as e:
             logging.error(f"Error deleting VMI {vm_name}: {e}")
             log_exception(str(e))
-            self.pods_status.unrecovered.append(self.affected_pod)
+            self.vmis_status.unrecovered.append(self.affected_vmi)
             return 1
 
     def wait_for_running(self, vm_name: str, namespace: str, timeout: int = 120) -> int:
@@ -268,7 +268,7 @@ class KubevirtVmOutageScenarioPlugin(AbstractScenarioPlugin):
 
             if vmi:
                 if vmi.get('status', {}).get('phase') == "Running":
                     end_time = time.time()
-                    self.affected_pod.pod_readiness_time = end_time - start_time
+                    self.affected_vmi.vmi_readiness_time = end_time - start_time
                     logging.info(f"VMI {vm_name} is already running")
                     return 0
diff --git a/requirements.txt b/requirements.txt
index 97ec0089..97ba8ecc 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -17,7 +17,7 @@ ibm_vpc==0.26.3 # Requires ibm_cloud_sdk_core
 jinja2==3.1.6
 lxml==5.1.0
 kubernetes==34.1.0
-krkn-lib==6.0.5
+krkn-lib==6.0.6
 numpy==1.26.4
 pandas==2.2.0
 openshift-client==1.0.21
diff --git a/tests/test_kubevirt_vm_outage.py b/tests/test_kubevirt_vm_outage.py
index d980e45f..636f48ec 100644
--- a/tests/test_kubevirt_vm_outage.py
+++ b/tests/test_kubevirt_vm_outage.py
@@ -38,7 +38,7 @@ from unittest.mock import MagicMock, patch
 import yaml
 
 from krkn_lib.k8s import KrknKubernetes
-from krkn_lib.models.k8s import AffectedPod, PodsStatus
+from krkn_lib.models.k8s import AffectedVMI, VmisStatus
 from krkn_lib.models.telemetry import ScenarioTelemetry
 from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift
 from kubernetes.client.rest import ApiException
@@ -137,14 +137,14 @@ class TestKubevirtVmOutageScenarioPlugin(unittest.TestCase):
     def mock_delete(self, *args, **kwargs):
-        """Reusable mock for delete_vmi that tracks calls and sets up affected_pod"""
+        """Reusable mock for delete_vmi that tracks calls and sets up affected_vmi"""
         self.delete_count += 1
-        self.plugin.affected_pod = AffectedPod(pod_name=f"test-vm-{self.delete_count}", namespace="default")
-        self.plugin.affected_pod.pod_rescheduling_time = 5.0
+        self.plugin.affected_vmi = AffectedVMI(vmi_name=f"test-vm-{self.delete_count}", namespace="default")
+        self.plugin.affected_vmi.vmi_rescheduling_time = 5.0
         return 0
 
     def mock_wait(self, *args, **kwargs):
-        """Reusable mock for wait_for_running that tracks calls and sets pod_readiness_time"""
+        """Reusable mock for wait_for_running that tracks calls and sets vmi_readiness_time"""
         self.wait_count += 1
-        self.plugin.affected_pod.pod_readiness_time = 3.0
+        self.plugin.affected_vmi.vmi_readiness_time = 3.0
         return 0
 
     # ==================== Core Scenario Tests ====================
@@ -293,10 +293,9 @@ class TestKubevirtVmOutageScenarioPlugin(unittest.TestCase):
         self.plugin.original_vmi = copy.deepcopy(self.mock_vmi)
 
-        # Initialize pods_status which delete_vmi needs
-        from krkn_lib.models.k8s import PodsStatus, AffectedPod
-        self.plugin.pods_status = PodsStatus()
-        self.plugin.affected_pod = AffectedPod(pod_name="test-vm", namespace="default")
-        self.plugin.pods_status = PodsStatus()
+        # Initialize vmis_status which delete_vmi needs
+        self.plugin.vmis_status = VmisStatus()
+        self.plugin.affected_vmi = AffectedVMI(vmi_name="test-vm", namespace="default")
+        self.plugin.vmis_status = VmisStatus()
 
         # Mock successful delete operation
         self.k8s_client.delete_vmi.return_value = None
@@ -355,8 +354,8 @@ class TestKubevirtVmOutageScenarioPlugin(unittest.TestCase):
         """
 
         # Initialize required attributes - use deepcopy to avoid shared references
         self.plugin.original_vmi = copy.deepcopy(self.mock_vmi)
-        self.plugin.pods_status = PodsStatus()
-        self.plugin.affected_pod = AffectedPod(pod_name="test-vm", namespace="default")
+        self.plugin.vmis_status = VmisStatus()
+        self.plugin.affected_vmi = AffectedVMI(vmi_name="test-vm", namespace="default")
 
         self.k8s_client.delete_vmi.return_value = None
@@ -374,7 +373,7 @@ class TestKubevirtVmOutageScenarioPlugin(unittest.TestCase):
 
         result = self.plugin.delete_vmi("test-vm", "default", False)
         self.assertEqual(result, 0)
-        self.assertIsNotNone(self.plugin.affected_pod.pod_rescheduling_time)
+        self.assertIsNotNone(self.plugin.affected_vmi.vmi_rescheduling_time)
 
     def test_delete_vmi_with_disable_auto_restart_failure(self):
         """
@@ -382,8 +381,8 @@ class TestKubevirtVmOutageScenarioPlugin(unittest.TestCase):
         """
         # Initialize required attributes
         self.plugin.original_vmi = copy.deepcopy(self.mock_vmi)
-        self.plugin.pods_status = PodsStatus()
-        self.plugin.affected_pod = AffectedPod(pod_name="test-vm", namespace="default")
+        self.plugin.vmis_status = VmisStatus()
+        self.plugin.affected_vmi = AffectedVMI(vmi_name="test-vm", namespace="default")
 
         # Mock patch_vm_spec to fail
         with patch.object(self.plugin, 'patch_vm_spec', return_value=False):
@@ -406,7 +405,7 @@ class TestKubevirtVmOutageScenarioPlugin(unittest.TestCase):
         """
         Test wait_for_running times out when VMI doesn't reach Running state
         """
-        self.plugin.affected_pod = AffectedPod(pod_name="test-vm", namespace="default")
+        self.plugin.affected_vmi = AffectedVMI(vmi_name="test-vm", namespace="default")
 
         # Mock VMI in Pending state
         pending_vmi = copy.deepcopy(self.mock_vmi)
@@ -424,7 +423,7 @@ class TestKubevirtVmOutageScenarioPlugin(unittest.TestCase):
         """
         Test wait_for_running when VMI doesn't exist yet
         """
-        self.plugin.affected_pod = AffectedPod(pod_name="test-vm", namespace="default")
+        self.plugin.affected_vmi = AffectedVMI(vmi_name="test-vm", namespace="default")
 
         # First return None (not exists), then return running VMI
         running_vmi = copy.deepcopy(self.mock_vmi)
@@ -438,7 +437,7 @@ class TestKubevirtVmOutageScenarioPlugin(unittest.TestCase):
         result = self.plugin.wait_for_running("test-vm", "default", 120)
 
         self.assertEqual(result, 0)
-        self.assertIsNotNone(self.plugin.affected_pod.pod_readiness_time)
+        self.assertIsNotNone(self.plugin.affected_vmi.vmi_readiness_time)
 
     # ==================== Recovery Tests ====================
 
@@ -482,7 +481,7 @@ class TestKubevirtVmOutageScenarioPlugin(unittest.TestCase):
 
         result = self.plugin.execute_scenario(config, self.scenario_telemetry)
 
-        # Should return empty PodsStatus when vm_name is missing
-        self.assertIsInstance(result, PodsStatus)
+        # Should return empty VmisStatus when vm_name is missing
+        self.assertIsInstance(result, VmisStatus)
         self.assertEqual(len(result.recovered), 0)
         self.assertEqual(len(result.unrecovered), 0)
@@ -505,7 +504,7 @@ class TestKubevirtVmOutageScenarioPlugin(unittest.TestCase):
         result = self.plugin.execute_scenario(config, self.scenario_telemetry)
 
-        # Should be PodsStatus with unrecovered pod when VMI not found
-        self.assertIsInstance(result, PodsStatus)
+        # Should be VmisStatus with an unrecovered VMI when the VMI is not found
+        self.assertIsInstance(result, VmisStatus)
         self.assertEqual(len(result.unrecovered), 1)
 
     def test_execute_scenario_with_kill_count(self):
diff --git a/tests/test_logging_and_code_quality.py b/tests/test_logging_and_code_quality.py
index d7b6b3d2..c5adf0de 100644
--- a/tests/test_logging_and_code_quality.py
+++ b/tests/test_logging_and_code_quality.py
@@ -148,6 +148,9 @@ class TestIssue25NoPrintInClient(unittest.TestCase):
             "affected_pods": {
                 "disrupted": [{"name": "pod-1", "namespace": "default"}]
             },
+            "affected_vmis": {
+                "recovered": [{"vmi_name": "vm-1", "namespace": "default"}]
+            },
             "affected_nodes": [],
         }],
         "health_checks": [],
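
The new "affected_vmis" block in krkn/prometheus/client.py flattens each VMI
entry from the scenario telemetry into one metric dict per VMI, tagged with its
recovery bucket. Below is a minimal standalone sketch of that transformation,
assuming a payload shaped like the fixture added to
tests/test_logging_and_code_quality.py; the helper name build_vmi_metrics is
illustrative only and not part of krkn:

    import datetime
    import logging

    def build_vmi_metrics(scenario: dict) -> list:
        """Flatten scenario["affected_vmis"] into one metric dict per VMI."""
        metrics_list = []
        for bucket, vmis in scenario.get("affected_vmis", {}).items():
            # bucket is the recovery outcome key, e.g. "recovered"
            metric = {"metricName": "affected_vmis_recovery", "type": bucket}
            if isinstance(vmis, list):
                for vmi in vmis:
                    # copy the per-VMI fields (vmi_name, namespace, ...) into the metric
                    for key, value in vmi.items():
                        metric[key] = value
                    metric["timestamp"] = str(datetime.datetime.now())
                    logging.debug("adding vmi %s", metric)
                    metrics_list.append(metric.copy())
        return metrics_list

    sample = {"affected_vmis": {"recovered": [{"vmi_name": "vm-1", "namespace": "default"}]}}
    print(build_vmi_metrics(sample))
    # [{'metricName': 'affected_vmis_recovery', 'type': 'recovered',
    #   'vmi_name': 'vm-1', 'namespace': 'default', 'timestamp': '...'}]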