mirror of
https://github.com/krkn-chaos/krkn.git
synced 2026-04-10 04:37:13 +00:00
changing pod recovery to vmi recovery
Signed-off-by: Paige Patton <prubenda@redhat.com>
This commit is contained in:
@@ -266,6 +266,16 @@ def metrics(
|
||||
metric['timestamp'] = str(datetime.datetime.now())
|
||||
logging.debug("adding pod %s", metric)
|
||||
metrics_list.append(metric.copy())
|
||||
for k,v in scenario.get("affected_vmis", {}).items():
|
||||
metric_name = "affected_vmis_recovery"
|
||||
metric = {"metricName": metric_name, "type": k}
|
||||
if type(v) is list:
|
||||
for vmi in v:
|
||||
for k,v in vmi.items():
|
||||
metric[k] = v
|
||||
metric['timestamp'] = str(datetime.datetime.now())
|
||||
logging.debug("adding vmi %s", metric)
|
||||
metrics_list.append(metric.copy())
|
||||
for affected_node in scenario["affected_nodes"]:
|
||||
metric_name = "affected_nodes_recovery"
|
||||
metric = {"metricName": metric_name}
|
||||
|
||||
@@ -21,7 +21,7 @@ from krkn_lib.k8s import KrknKubernetes
|
||||
from krkn_lib.models.telemetry import ScenarioTelemetry
|
||||
from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift
|
||||
from krkn_lib.utils import log_exception
|
||||
from krkn_lib.models.k8s import AffectedPod, PodsStatus
|
||||
from krkn_lib.models.k8s import AffectedVMI, VmisStatus
|
||||
|
||||
from krkn.scenario_plugins.abstract_scenario_plugin import AbstractScenarioPlugin
|
||||
|
||||
@@ -59,14 +59,14 @@ class KubevirtVmOutageScenarioPlugin(AbstractScenarioPlugin):
|
||||
scenario_config = yaml.full_load(f)
|
||||
|
||||
self.init_clients(lib_telemetry.get_lib_kubernetes())
|
||||
pods_status = PodsStatus()
|
||||
vmis_status = VmisStatus()
|
||||
for config in scenario_config["scenarios"]:
|
||||
if config.get("scenario") == "kubevirt_vm_outage":
|
||||
single_pods_status = self.execute_scenario(config, scenario_telemetry)
|
||||
pods_status.merge(single_pods_status)
|
||||
single_vmis_status = self.execute_scenario(config, scenario_telemetry)
|
||||
vmis_status.merge(single_vmis_status)
|
||||
|
||||
scenario_telemetry.affected_pods = pods_status
|
||||
if len(scenario_telemetry.affected_pods.unrecovered) > 0:
|
||||
scenario_telemetry.affected_vmis = vmis_status
|
||||
if len(scenario_telemetry.affected_vmis.unrecovered) > 0:
|
||||
return 1
|
||||
return 0
|
||||
except Exception as e:
|
||||
@@ -83,15 +83,15 @@ class KubevirtVmOutageScenarioPlugin(AbstractScenarioPlugin):
|
||||
logging.info("Successfully initialized Kubernetes client for KubeVirt operations")
|
||||
|
||||
|
||||
def execute_scenario(self, config: Dict[str, Any], scenario_telemetry: ScenarioTelemetry) -> PodsStatus:
|
||||
def execute_scenario(self, config: Dict[str, Any], scenario_telemetry: ScenarioTelemetry) -> VmisStatus:
|
||||
"""
|
||||
Execute a KubeVirt VM outage scenario based on the provided configuration.
|
||||
|
||||
:param config: The scenario configuration
|
||||
:param scenario_telemetry: The telemetry object for recording metrics
|
||||
:return: PodsStatus object containing recovered and unrecovered pods
|
||||
:return: VmisStatus object containing recovered and unrecovered VMIs
|
||||
"""
|
||||
self.pods_status = PodsStatus()
|
||||
self.vmis_status = VmisStatus()
|
||||
try:
|
||||
params = config.get("parameters", {})
|
||||
vm_name = params.get("vm_name")
|
||||
@@ -102,8 +102,8 @@ class KubevirtVmOutageScenarioPlugin(AbstractScenarioPlugin):
|
||||
|
||||
if not vm_name:
|
||||
logging.error("vm_name parameter is required")
|
||||
return self.pods_status
|
||||
self.pods_status = PodsStatus()
|
||||
return self.vmis_status
|
||||
self.vmis_status = VmisStatus()
|
||||
self.vmis_list = self.k8s_client.get_vmis(vm_name,namespace)
|
||||
for _ in range(kill_count):
|
||||
|
||||
@@ -114,48 +114,48 @@ class KubevirtVmOutageScenarioPlugin(AbstractScenarioPlugin):
|
||||
vmi_name = vmi.get("metadata").get("name")
|
||||
vmi_namespace = vmi.get("metadata").get("namespace")
|
||||
|
||||
# Create affected_pod early so we can track failures
|
||||
self.affected_pod = AffectedPod(
|
||||
pod_name=vmi_name,
|
||||
# Create affected_vmi early so we can track failures
|
||||
self.affected_vmi = AffectedVMI(
|
||||
vmi_name=vmi_name,
|
||||
namespace=vmi_namespace,
|
||||
)
|
||||
|
||||
if not self.validate_environment(vmi_name, vmi_namespace):
|
||||
self.pods_status.unrecovered.append(self.affected_pod)
|
||||
self.vmis_status.unrecovered.append(self.affected_vmi)
|
||||
continue
|
||||
|
||||
vmi = self.k8s_client.get_vmi(vmi_name, vmi_namespace)
|
||||
if not vmi:
|
||||
logging.error(f"VMI {vm_name} not found in namespace {namespace}")
|
||||
self.pods_status.unrecovered.append(self.affected_pod)
|
||||
self.vmis_status.unrecovered.append(self.affected_vmi)
|
||||
continue
|
||||
|
||||
self.original_vmi = vmi
|
||||
logging.info(f"Captured initial state of VMI: {vm_name}")
|
||||
result = self.delete_vmi(vmi_name, vmi_namespace, disable_auto_restart)
|
||||
if result != 0:
|
||||
self.pods_status.unrecovered.append(self.affected_pod)
|
||||
self.vmis_status.unrecovered.append(self.affected_vmi)
|
||||
continue
|
||||
|
||||
result = self.wait_for_running(vmi_name,vmi_namespace, timeout)
|
||||
if result != 0:
|
||||
self.pods_status.unrecovered.append(self.affected_pod)
|
||||
self.vmis_status.unrecovered.append(self.affected_vmi)
|
||||
continue
|
||||
|
||||
self.affected_pod.total_recovery_time = (
|
||||
self.affected_pod.pod_readiness_time
|
||||
+ self.affected_pod.pod_rescheduling_time
|
||||
self.affected_vmi.total_recovery_time = (
|
||||
self.affected_vmi.vmi_readiness_time
|
||||
+ self.affected_vmi.vmi_rescheduling_time
|
||||
)
|
||||
|
||||
self.pods_status.recovered.append(self.affected_pod)
|
||||
self.vmis_status.recovered.append(self.affected_vmi)
|
||||
logging.info(f"Successfully completed KubeVirt VM outage scenario for VM: {vm_name}")
|
||||
|
||||
return self.pods_status
|
||||
return self.vmis_status
|
||||
|
||||
except Exception as e:
|
||||
logging.error(f"Error executing KubeVirt VM outage scenario: {e}")
|
||||
log_exception(str(e))
|
||||
return self.pods_status
|
||||
return self.vmis_status
|
||||
|
||||
def validate_environment(self, vm_name: str, namespace: str) -> bool:
|
||||
"""
|
||||
@@ -242,20 +242,20 @@ class KubevirtVmOutageScenarioPlugin(AbstractScenarioPlugin):
|
||||
if deleted_vmi:
|
||||
if start_creation_time != deleted_vmi.get('metadata', {}).get('creationTimestamp'):
|
||||
logging.info(f"VMI {vm_name} successfully recreated")
|
||||
self.affected_pod.pod_rescheduling_time = time.time() - start_time
|
||||
self.affected_vmi.vmi_rescheduling_time = time.time() - start_time
|
||||
return 0
|
||||
else:
|
||||
logging.info(f"VMI {vm_name} successfully deleted")
|
||||
time.sleep(1)
|
||||
|
||||
logging.error(f"Timed out waiting for VMI {vm_name} to be deleted")
|
||||
self.pods_status.unrecovered.append(self.affected_pod)
|
||||
self.vmis_status.unrecovered.append(self.affected_vmi)
|
||||
return 1
|
||||
|
||||
except Exception as e:
|
||||
logging.error(f"Error deleting VMI {vm_name}: {e}")
|
||||
log_exception(str(e))
|
||||
self.pods_status.unrecovered.append(self.affected_pod)
|
||||
self.vmis_status.unrecovered.append(self.affected_vmi)
|
||||
return 1
|
||||
|
||||
def wait_for_running(self, vm_name: str, namespace: str, timeout: int = 120) -> int:
|
||||
@@ -268,7 +268,7 @@ class KubevirtVmOutageScenarioPlugin(AbstractScenarioPlugin):
|
||||
if vmi:
|
||||
if vmi.get('status', {}).get('phase') == "Running":
|
||||
end_time = time.time()
|
||||
self.affected_pod.pod_readiness_time = end_time - start_time
|
||||
self.affected_vmi.vmi_readiness_time = end_time - start_time
|
||||
|
||||
logging.info(f"VMI {vm_name} is already running")
|
||||
return 0
|
||||
|
||||
@@ -17,7 +17,7 @@ ibm_vpc==0.26.3 # Requires ibm_cloud_sdk_core
|
||||
jinja2==3.1.6
|
||||
lxml==5.1.0
|
||||
kubernetes==34.1.0
|
||||
krkn-lib==6.0.5
|
||||
krkn-lib==6.0.6
|
||||
numpy==1.26.4
|
||||
pandas==2.2.0
|
||||
openshift-client==1.0.21
|
||||
|
||||
@@ -38,7 +38,7 @@ from unittest.mock import MagicMock, patch
|
||||
|
||||
import yaml
|
||||
from krkn_lib.k8s import KrknKubernetes
|
||||
from krkn_lib.models.k8s import AffectedPod, PodsStatus
|
||||
from krkn_lib.models.k8s import AffectedVMI, VmisStatus
|
||||
from krkn_lib.models.telemetry import ScenarioTelemetry
|
||||
from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift
|
||||
from kubernetes.client.rest import ApiException
|
||||
@@ -137,14 +137,14 @@ class TestKubevirtVmOutageScenarioPlugin(unittest.TestCase):
|
||||
def mock_delete(self, *args, **kwargs):
|
||||
"""Reusable mock for delete_vmi that tracks calls and sets up affected_vmi"""
|
||||
self.delete_count += 1
|
||||
self.plugin.affected_pod = AffectedPod(pod_name=f"test-vm-{self.delete_count}", namespace="default")
|
||||
self.plugin.affected_pod.pod_rescheduling_time = 5.0
|
||||
self.plugin.affected_vmi = AffectedVMI(vmi_name=f"test-vm-{self.delete_count}", namespace="default")
|
||||
self.plugin.affected_vmi.vmi_rescheduling_time = 5.0
|
||||
return 0
|
||||
|
||||
def mock_wait(self, *args, **kwargs):
|
||||
"""Reusable mock for wait_for_running that tracks calls and sets pod_readiness_time"""
|
||||
"""Reusable mock for wait_for_running that tracks calls and sets vmi_readiness_time"""
|
||||
self.wait_count += 1
|
||||
self.plugin.affected_pod.pod_readiness_time = 3.0
|
||||
self.plugin.affected_vmi.vmi_readiness_time = 3.0
|
||||
return 0
|
||||
|
||||
# ==================== Core Scenario Tests ====================
|
||||
@@ -293,10 +293,9 @@ class TestKubevirtVmOutageScenarioPlugin(unittest.TestCase):
|
||||
self.plugin.original_vmi = copy.deepcopy(self.mock_vmi)
|
||||
|
||||
# Initialize vmis_status which delete_vmi needs
|
||||
from krkn_lib.models.k8s import PodsStatus, AffectedPod
|
||||
self.plugin.pods_status = PodsStatus()
|
||||
self.plugin.affected_pod = AffectedPod(pod_name="test-vm", namespace="default")
|
||||
self.plugin.pods_status = PodsStatus()
|
||||
self.plugin.vmis_status = VmisStatus()
|
||||
self.plugin.affected_vmi = AffectedVMI(vmi_name="test-vm", namespace="default")
|
||||
self.plugin.vmis_status = VmisStatus()
|
||||
|
||||
# Mock successful delete operation
|
||||
self.k8s_client.delete_vmi.return_value = None
|
||||
@@ -355,8 +354,8 @@ class TestKubevirtVmOutageScenarioPlugin(unittest.TestCase):
|
||||
"""
|
||||
# Initialize required attributes - use deepcopy to avoid shared references
|
||||
self.plugin.original_vmi = copy.deepcopy(self.mock_vmi)
|
||||
self.plugin.pods_status = PodsStatus()
|
||||
self.plugin.affected_pod = AffectedPod(pod_name="test-vm", namespace="default")
|
||||
self.plugin.vmis_status = VmisStatus()
|
||||
self.plugin.affected_vmi = AffectedVMI(vmi_name="test-vm", namespace="default")
|
||||
|
||||
self.k8s_client.delete_vmi.return_value = None
|
||||
|
||||
@@ -374,7 +373,7 @@ class TestKubevirtVmOutageScenarioPlugin(unittest.TestCase):
|
||||
result = self.plugin.delete_vmi("test-vm", "default", False)
|
||||
|
||||
self.assertEqual(result, 0)
|
||||
self.assertIsNotNone(self.plugin.affected_pod.pod_rescheduling_time)
|
||||
self.assertIsNotNone(self.plugin.affected_vmi.vmi_rescheduling_time)
|
||||
|
||||
def test_delete_vmi_with_disable_auto_restart_failure(self):
|
||||
"""
|
||||
@@ -382,8 +381,8 @@ class TestKubevirtVmOutageScenarioPlugin(unittest.TestCase):
|
||||
"""
|
||||
# Initialize required attributes
|
||||
self.plugin.original_vmi = copy.deepcopy(self.mock_vmi)
|
||||
self.plugin.pods_status = PodsStatus()
|
||||
self.plugin.affected_pod = AffectedPod(pod_name="test-vm", namespace="default")
|
||||
self.plugin.vmis_status = VmisStatus()
|
||||
self.plugin.affected_vmi = AffectedVMI(vmi_name="test-vm", namespace="default")
|
||||
|
||||
# Mock patch_vm_spec to fail
|
||||
with patch.object(self.plugin, 'patch_vm_spec', return_value=False):
|
||||
@@ -406,7 +405,7 @@ class TestKubevirtVmOutageScenarioPlugin(unittest.TestCase):
|
||||
"""
|
||||
Test wait_for_running times out when VMI doesn't reach Running state
|
||||
"""
|
||||
self.plugin.affected_pod = AffectedPod(pod_name="test-vm", namespace="default")
|
||||
self.plugin.affected_vmi = AffectedVMI(vmi_name="test-vm", namespace="default")
|
||||
|
||||
# Mock VMI in Pending state
|
||||
pending_vmi = copy.deepcopy(self.mock_vmi)
|
||||
@@ -424,7 +423,7 @@ class TestKubevirtVmOutageScenarioPlugin(unittest.TestCase):
|
||||
"""
|
||||
Test wait_for_running when VMI doesn't exist yet
|
||||
"""
|
||||
self.plugin.affected_pod = AffectedPod(pod_name="test-vm", namespace="default")
|
||||
self.plugin.affected_vmi = AffectedVMI(vmi_name="test-vm", namespace="default")
|
||||
|
||||
# First return None (not exists), then return running VMI
|
||||
running_vmi = copy.deepcopy(self.mock_vmi)
|
||||
@@ -438,7 +437,7 @@ class TestKubevirtVmOutageScenarioPlugin(unittest.TestCase):
|
||||
result = self.plugin.wait_for_running("test-vm", "default", 120)
|
||||
|
||||
self.assertEqual(result, 0)
|
||||
self.assertIsNotNone(self.plugin.affected_pod.pod_readiness_time)
|
||||
self.assertIsNotNone(self.plugin.affected_vmi.vmi_readiness_time)
|
||||
|
||||
# ==================== Recovery Tests ====================
|
||||
|
||||
@@ -482,7 +481,7 @@ class TestKubevirtVmOutageScenarioPlugin(unittest.TestCase):
|
||||
result = self.plugin.execute_scenario(config, self.scenario_telemetry)
|
||||
|
||||
# Should return empty VmisStatus when vm_name is missing
|
||||
self.assertIsInstance(result, PodsStatus)
|
||||
self.assertIsInstance(result, VmisStatus)
|
||||
self.assertEqual(len(result.recovered), 0)
|
||||
self.assertEqual(len(result.unrecovered), 0)
|
||||
|
||||
@@ -505,7 +504,7 @@ class TestKubevirtVmOutageScenarioPlugin(unittest.TestCase):
|
||||
result = self.plugin.execute_scenario(config, self.scenario_telemetry)
|
||||
|
||||
# Should be VmisStatus with unrecovered VMI when VMI not found
|
||||
self.assertIsInstance(result, PodsStatus)
|
||||
self.assertIsInstance(result, VmisStatus)
|
||||
self.assertEqual(len(result.unrecovered), 1)
|
||||
|
||||
def test_execute_scenario_with_kill_count(self):
|
||||
|
||||
@@ -148,6 +148,9 @@ class TestIssue25NoPrintInClient(unittest.TestCase):
|
||||
"affected_pods": {
|
||||
"disrupted": [{"name": "pod-1", "namespace": "default"}]
|
||||
},
|
||||
"affected_vmis": {
|
||||
"recovered": [{"vmi_name": "vm-1", "namespace": "default"}]
|
||||
},
|
||||
"affected_nodes": [],
|
||||
}],
|
||||
"health_checks": [],
|
||||
|
||||
Reference in New Issue
Block a user