From 2119baee3d75c6b42dd2c88f0b9982fbfcdceb02 Mon Sep 17 00:00:00 2001 From: Amir Alavi Date: Tue, 19 May 2026 15:05:04 -0400 Subject: [PATCH] revert: ensureVMIsLiveMigratable retry helper Reverts b767b9c0f. The helper was added to work around what looked like the virt-handler containerdisk-socket race on k8s 1.36, but the actual root cause is unrelated: k8s 1.36's stricter CRD numeric format validation (kubernetes/kubernetes#136582) rejects VMI status updates with the pre-fix uint32 Checksum schema. See https://github.com/kubevirt/kubevirt/issues/17858 for the upstream context and kubevirt/kubevirt#17469 for the upstream fix (merged to main, included in v1.9.0-alpha.0, not in v1.8.x). The follow-up commit bumps KUBEVIRT_VERSION so the test consumes a KubeVirt release that contains the fix, which removes the need for any test-side retry. --- test/e2e/e2e_evictioninbackground_test.go | 77 ----------------------- 1 file changed, 77 deletions(-) diff --git a/test/e2e/e2e_evictioninbackground_test.go b/test/e2e/e2e_evictioninbackground_test.go index 7e024e22a..e469db5a7 100644 --- a/test/e2e/e2e_evictioninbackground_test.go +++ b/test/e2e/e2e_evictioninbackground_test.go @@ -110,71 +110,6 @@ ethernets: } } -// ensureVMIsLiveMigratable waits until every VMI reports the LiveMigratable -// condition with status True. If a VMI fails to become migratable within the -// per-attempt timeout, it is deleted and recreated. This works around an -// upstream KubeVirt race where virt-handler computes the containerdisk -// checksum before the disk socket is ready, fails, and never retries; the -// recreated VMI lands on a node that already has the containerdisk image -// cached, so the socket comes up before virt-handler's first attempt. -// See https://github.com/kubernetes-sigs/descheduler/pull/1874 for context. -func ensureVMIsLiveMigratable(t *testing.T, ctx context.Context, kvClient kubevirtclient.Interface) { - t.Helper() - const ( - maxAttempts = 3 - perAttemptWait = 120 * time.Second - deleteWait = 60 * time.Second - ) - - isLiveMigratable := func(vmi *kvcorev1.VirtualMachineInstance) bool { - for _, c := range vmi.Status.Conditions { - if c.Type == kvcorev1.VirtualMachineInstanceIsMigratable && c.Status == corev1.ConditionTrue { - return true - } - } - return false - } - - for i := 1; i <= vmiCount; i++ { - name := fmt.Sprintf("kubevirtvmi-%v", i) - var lastVMI *kvcorev1.VirtualMachineInstance - for attempt := 1; attempt <= maxAttempts; attempt++ { - err := wait.PollUntilContextTimeout(ctx, 5*time.Second, perAttemptWait, true, func(ctx context.Context) (bool, error) { - vmi, err := kvClient.KubevirtV1().VirtualMachineInstances("default").Get(ctx, name, metav1.GetOptions{}) - if err != nil { - klog.Infof("Unable to get vmi %v: %v", name, err) - return false, nil - } - lastVMI = vmi - return isLiveMigratable(vmi), nil - }) - if err == nil { - klog.Infof("vmi %v is LiveMigratable (attempt %d/%d)", name, attempt, maxAttempts) - break - } - if attempt == maxAttempts { - if lastVMI != nil { - klog.Infof("Final vmi %v status: phase=%v, conditions=%#v", name, lastVMI.Status.Phase, lastVMI.Status.Conditions) - } - t.Fatalf("vmi %v never became LiveMigratable after %d attempts", name, maxAttempts) - } - klog.Warningf("vmi %v not LiveMigratable after %v, recreating (attempt %d/%d) to work around virt-handler containerdisk-socket race", name, perAttemptWait, attempt, maxAttempts) - if err := kvClient.KubevirtV1().VirtualMachineInstances("default").Delete(ctx, name, metav1.DeleteOptions{}); err != nil && !apierrors.IsNotFound(err) { - t.Fatalf("Unable to delete vmi %v for retry: %v", name, err) - } - if err := wait.PollUntilContextTimeout(ctx, 2*time.Second, deleteWait, true, func(ctx context.Context) (bool, error) { - _, err := kvClient.KubevirtV1().VirtualMachineInstances("default").Get(ctx, name, metav1.GetOptions{}) - return apierrors.IsNotFound(err), nil - }); err != nil { - t.Fatalf("Timed out waiting for vmi %v to be deleted: %v", name, err) - } - if _, err := kvClient.KubevirtV1().VirtualMachineInstances("default").Create(ctx, virtualMachineInstance(i), metav1.CreateOptions{}); err != nil { - t.Fatalf("Unable to recreate vmi %v: %v", name, err) - } - } - } -} - func waitForKubevirtReady(t *testing.T, ctx context.Context, kvClient kubevirtclient.Interface) { obj, err := kvClient.KubevirtV1().KubeVirts("kubevirt").Get(ctx, "kubevirt", metav1.GetOptions{}) if err != nil { @@ -506,18 +441,6 @@ func TestLiveMigrationInBackground(t *testing.T) { t.Fatalf("Error waiting for all vmi active pods to be running: %v", err) } - // Ensure VMIs are actually live-migratable before starting the descheduler; - // recreates any VMI that lost the virt-handler containerdisk-socket race. - ensureVMIsLiveMigratable(t, ctx, kvClient) - - // VMIs may have been recreated above, so re-check that pods are Running - // before snapshotting the active virt-launcher pod names below. - if err := wait.PollUntilContextTimeout(ctx, 5*time.Second, 300*time.Second, true, func(ctx context.Context) (bool, error) { - return allVMIsHaveRunningPods(t, ctx, kubeClient, kvClient) - }); err != nil { - t.Fatalf("Error waiting for all vmi active pods to be running after recreate: %v", err) - } - usedRunningPodNames := make(map[string]struct{}) // vmiCount number of names is expected names := kVirtRunningPodNames(t, ctx, kubeClient)