Change pod recovery to VMI recovery

Signed-off-by: Paige Patton <prubenda@redhat.com>
This commit is contained in:
Paige Patton
2026-04-08 14:21:49 -04:00
committed by Sahil Shah
parent 1623dbac53
commit 0777ef924f
5 changed files with 60 additions and 48 deletions

View File

@@ -266,6 +266,16 @@ def metrics(
metric['timestamp'] = str(datetime.datetime.now())
logging.debug("adding pod %s", metric)
metrics_list.append(metric.copy())
for k,v in scenario.get("affected_vmis", {}).items():
metric_name = "affected_vmis_recovery"
metric = {"metricName": metric_name, "type": k}
if type(v) is list:
for vmi in v:
for k,v in vmi.items():
metric[k] = v
metric['timestamp'] = str(datetime.datetime.now())
logging.debug("adding vmi %s", metric)
metrics_list.append(metric.copy())
for affected_node in scenario["affected_nodes"]:
metric_name = "affected_nodes_recovery"
metric = {"metricName": metric_name}

View File

@@ -21,7 +21,7 @@ from krkn_lib.k8s import KrknKubernetes
from krkn_lib.models.telemetry import ScenarioTelemetry
from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift
from krkn_lib.utils import log_exception
from krkn_lib.models.k8s import AffectedPod, PodsStatus
from krkn_lib.models.k8s import AffectedVMI, VmisStatus
from krkn.scenario_plugins.abstract_scenario_plugin import AbstractScenarioPlugin
@@ -59,14 +59,14 @@ class KubevirtVmOutageScenarioPlugin(AbstractScenarioPlugin):
scenario_config = yaml.full_load(f)
self.init_clients(lib_telemetry.get_lib_kubernetes())
pods_status = PodsStatus()
vmis_status = VmisStatus()
for config in scenario_config["scenarios"]:
if config.get("scenario") == "kubevirt_vm_outage":
single_pods_status = self.execute_scenario(config, scenario_telemetry)
pods_status.merge(single_pods_status)
single_vmis_status = self.execute_scenario(config, scenario_telemetry)
vmis_status.merge(single_vmis_status)
scenario_telemetry.affected_pods = pods_status
if len(scenario_telemetry.affected_pods.unrecovered) > 0:
scenario_telemetry.affected_vmis = vmis_status
if len(scenario_telemetry.affected_vmis.unrecovered) > 0:
return 1
return 0
except Exception as e:
@@ -83,15 +83,15 @@ class KubevirtVmOutageScenarioPlugin(AbstractScenarioPlugin):
logging.info("Successfully initialized Kubernetes client for KubeVirt operations")
def execute_scenario(self, config: Dict[str, Any], scenario_telemetry: ScenarioTelemetry) -> PodsStatus:
def execute_scenario(self, config: Dict[str, Any], scenario_telemetry: ScenarioTelemetry) -> VmisStatus:
"""
Execute a KubeVirt VM outage scenario based on the provided configuration.
:param config: The scenario configuration
:param scenario_telemetry: The telemetry object for recording metrics
:return: PodsStatus object containing recovered and unrecovered pods
:return: VmisStatus object containing recovered and unrecovered VMIs
"""
self.pods_status = PodsStatus()
self.vmis_status = VmisStatus()
try:
params = config.get("parameters", {})
vm_name = params.get("vm_name")
@@ -102,8 +102,8 @@ class KubevirtVmOutageScenarioPlugin(AbstractScenarioPlugin):
if not vm_name:
logging.error("vm_name parameter is required")
return self.pods_status
self.pods_status = PodsStatus()
return self.vmis_status
self.vmis_status = VmisStatus()
self.vmis_list = self.k8s_client.get_vmis(vm_name,namespace)
for _ in range(kill_count):
@@ -114,48 +114,48 @@ class KubevirtVmOutageScenarioPlugin(AbstractScenarioPlugin):
vmi_name = vmi.get("metadata").get("name")
vmi_namespace = vmi.get("metadata").get("namespace")
# Create affected_pod early so we can track failures
self.affected_pod = AffectedPod(
pod_name=vmi_name,
# Create affected_vmi early so we can track failures
self.affected_vmi = AffectedVMI(
vmi_name=vmi_name,
namespace=vmi_namespace,
)
if not self.validate_environment(vmi_name, vmi_namespace):
self.pods_status.unrecovered.append(self.affected_pod)
self.vmis_status.unrecovered.append(self.affected_vmi)
continue
vmi = self.k8s_client.get_vmi(vmi_name, vmi_namespace)
if not vmi:
logging.error(f"VMI {vm_name} not found in namespace {namespace}")
self.pods_status.unrecovered.append(self.affected_pod)
self.vmis_status.unrecovered.append(self.affected_vmi)
continue
self.original_vmi = vmi
logging.info(f"Captured initial state of VMI: {vm_name}")
result = self.delete_vmi(vmi_name, vmi_namespace, disable_auto_restart)
if result != 0:
self.pods_status.unrecovered.append(self.affected_pod)
self.vmis_status.unrecovered.append(self.affected_vmi)
continue
result = self.wait_for_running(vmi_name,vmi_namespace, timeout)
if result != 0:
self.pods_status.unrecovered.append(self.affected_pod)
self.vmis_status.unrecovered.append(self.affected_vmi)
continue
self.affected_pod.total_recovery_time = (
self.affected_pod.pod_readiness_time
+ self.affected_pod.pod_rescheduling_time
self.affected_vmi.total_recovery_time = (
self.affected_vmi.vmi_readiness_time
+ self.affected_vmi.vmi_rescheduling_time
)
self.pods_status.recovered.append(self.affected_pod)
self.vmis_status.recovered.append(self.affected_vmi)
logging.info(f"Successfully completed KubeVirt VM outage scenario for VM: {vm_name}")
return self.pods_status
return self.vmis_status
except Exception as e:
logging.error(f"Error executing KubeVirt VM outage scenario: {e}")
log_exception(str(e))
return self.pods_status
return self.vmis_status
def validate_environment(self, vm_name: str, namespace: str) -> bool:
"""
@@ -242,20 +242,20 @@ class KubevirtVmOutageScenarioPlugin(AbstractScenarioPlugin):
if deleted_vmi:
if start_creation_time != deleted_vmi.get('metadata', {}).get('creationTimestamp'):
logging.info(f"VMI {vm_name} successfully recreated")
self.affected_pod.pod_rescheduling_time = time.time() - start_time
self.affected_vmi.vmi_rescheduling_time = time.time() - start_time
return 0
else:
logging.info(f"VMI {vm_name} successfully deleted")
time.sleep(1)
logging.error(f"Timed out waiting for VMI {vm_name} to be deleted")
self.pods_status.unrecovered.append(self.affected_pod)
self.vmis_status.unrecovered.append(self.affected_vmi)
return 1
except Exception as e:
logging.error(f"Error deleting VMI {vm_name}: {e}")
log_exception(str(e))
self.pods_status.unrecovered.append(self.affected_pod)
self.vmis_status.unrecovered.append(self.affected_vmi)
return 1
def wait_for_running(self, vm_name: str, namespace: str, timeout: int = 120) -> int:
@@ -268,7 +268,7 @@ class KubevirtVmOutageScenarioPlugin(AbstractScenarioPlugin):
if vmi:
if vmi.get('status', {}).get('phase') == "Running":
end_time = time.time()
self.affected_pod.pod_readiness_time = end_time - start_time
self.affected_vmi.vmi_readiness_time = end_time - start_time
logging.info(f"VMI {vm_name} is already running")
return 0

View File

@@ -17,7 +17,7 @@ ibm_vpc==0.26.3 # Requires ibm_cloud_sdk_core
jinja2==3.1.6
lxml==5.1.0
kubernetes==34.1.0
krkn-lib==6.0.5
krkn-lib==6.0.6
numpy==1.26.4
pandas==2.2.0
openshift-client==1.0.21

View File

@@ -38,7 +38,7 @@ from unittest.mock import MagicMock, patch
import yaml
from krkn_lib.k8s import KrknKubernetes
from krkn_lib.models.k8s import AffectedPod, PodsStatus
from krkn_lib.models.k8s import AffectedVMI, VmisStatus
from krkn_lib.models.telemetry import ScenarioTelemetry
from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift
from kubernetes.client.rest import ApiException
@@ -137,14 +137,14 @@ class TestKubevirtVmOutageScenarioPlugin(unittest.TestCase):
def mock_delete(self, *args, **kwargs):
"""Reusable mock for delete_vmi that tracks calls and sets up affected_vmi"""
self.delete_count += 1
self.plugin.affected_pod = AffectedPod(pod_name=f"test-vm-{self.delete_count}", namespace="default")
self.plugin.affected_pod.pod_rescheduling_time = 5.0
self.plugin.affected_vmi = AffectedVMI(vmi_name=f"test-vm-{self.delete_count}", namespace="default")
self.plugin.affected_vmi.vmi_rescheduling_time = 5.0
return 0
def mock_wait(self, *args, **kwargs):
"""Reusable mock for wait_for_running that tracks calls and sets pod_readiness_time"""
"""Reusable mock for wait_for_running that tracks calls and sets vmi_readiness_time"""
self.wait_count += 1
self.plugin.affected_pod.pod_readiness_time = 3.0
self.plugin.affected_vmi.vmi_readiness_time = 3.0
return 0
# ==================== Core Scenario Tests ====================
@@ -293,10 +293,9 @@ class TestKubevirtVmOutageScenarioPlugin(unittest.TestCase):
self.plugin.original_vmi = copy.deepcopy(self.mock_vmi)
# Initialize vmis_status which delete_vmi needs
from krkn_lib.models.k8s import PodsStatus, AffectedPod
self.plugin.pods_status = PodsStatus()
self.plugin.affected_pod = AffectedPod(pod_name="test-vm", namespace="default")
self.plugin.pods_status = PodsStatus()
self.plugin.vmis_status = VmisStatus()
self.plugin.affected_vmi = AffectedVMI(vmi_name="test-vm", namespace="default")
self.plugin.vmis_status = VmisStatus()
# Mock successful delete operation
self.k8s_client.delete_vmi.return_value = None
@@ -355,8 +354,8 @@ class TestKubevirtVmOutageScenarioPlugin(unittest.TestCase):
"""
# Initialize required attributes - use deepcopy to avoid shared references
self.plugin.original_vmi = copy.deepcopy(self.mock_vmi)
self.plugin.pods_status = PodsStatus()
self.plugin.affected_pod = AffectedPod(pod_name="test-vm", namespace="default")
self.plugin.vmis_status = VmisStatus()
self.plugin.affected_vmi = AffectedVMI(vmi_name="test-vm", namespace="default")
self.k8s_client.delete_vmi.return_value = None
@@ -374,7 +373,7 @@ class TestKubevirtVmOutageScenarioPlugin(unittest.TestCase):
result = self.plugin.delete_vmi("test-vm", "default", False)
self.assertEqual(result, 0)
self.assertIsNotNone(self.plugin.affected_pod.pod_rescheduling_time)
self.assertIsNotNone(self.plugin.affected_vmi.vmi_rescheduling_time)
def test_delete_vmi_with_disable_auto_restart_failure(self):
"""
@@ -382,8 +381,8 @@ class TestKubevirtVmOutageScenarioPlugin(unittest.TestCase):
"""
# Initialize required attributes
self.plugin.original_vmi = copy.deepcopy(self.mock_vmi)
self.plugin.pods_status = PodsStatus()
self.plugin.affected_pod = AffectedPod(pod_name="test-vm", namespace="default")
self.plugin.vmis_status = VmisStatus()
self.plugin.affected_vmi = AffectedVMI(vmi_name="test-vm", namespace="default")
# Mock patch_vm_spec to fail
with patch.object(self.plugin, 'patch_vm_spec', return_value=False):
@@ -406,7 +405,7 @@ class TestKubevirtVmOutageScenarioPlugin(unittest.TestCase):
"""
Test wait_for_running times out when VMI doesn't reach Running state
"""
self.plugin.affected_pod = AffectedPod(pod_name="test-vm", namespace="default")
self.plugin.affected_vmi = AffectedVMI(vmi_name="test-vm", namespace="default")
# Mock VMI in Pending state
pending_vmi = copy.deepcopy(self.mock_vmi)
@@ -424,7 +423,7 @@ class TestKubevirtVmOutageScenarioPlugin(unittest.TestCase):
"""
Test wait_for_running when VMI doesn't exist yet
"""
self.plugin.affected_pod = AffectedPod(pod_name="test-vm", namespace="default")
self.plugin.affected_vmi = AffectedVMI(vmi_name="test-vm", namespace="default")
# First return None (not exists), then return running VMI
running_vmi = copy.deepcopy(self.mock_vmi)
@@ -438,7 +437,7 @@ class TestKubevirtVmOutageScenarioPlugin(unittest.TestCase):
result = self.plugin.wait_for_running("test-vm", "default", 120)
self.assertEqual(result, 0)
self.assertIsNotNone(self.plugin.affected_pod.pod_readiness_time)
self.assertIsNotNone(self.plugin.affected_vmi.vmi_readiness_time)
# ==================== Recovery Tests ====================
@@ -482,7 +481,7 @@ class TestKubevirtVmOutageScenarioPlugin(unittest.TestCase):
result = self.plugin.execute_scenario(config, self.scenario_telemetry)
# Should return empty VmisStatus when vm_name is missing
self.assertIsInstance(result, PodsStatus)
self.assertIsInstance(result, VmisStatus)
self.assertEqual(len(result.recovered), 0)
self.assertEqual(len(result.unrecovered), 0)
@@ -505,7 +504,7 @@ class TestKubevirtVmOutageScenarioPlugin(unittest.TestCase):
result = self.plugin.execute_scenario(config, self.scenario_telemetry)
# Should be VmisStatus with unrecovered VMI when VMI not found
self.assertIsInstance(result, PodsStatus)
self.assertIsInstance(result, VmisStatus)
self.assertEqual(len(result.unrecovered), 1)
def test_execute_scenario_with_kill_count(self):

View File

@@ -148,6 +148,9 @@ class TestIssue25NoPrintInClient(unittest.TestCase):
"affected_pods": {
"disrupted": [{"name": "pod-1", "namespace": "default"}]
},
"affected_vmis": {
"recovered": [{"vmi_name": "vm-1", "namespace": "default"}]
},
"affected_nodes": [],
}],
"health_checks": [],