From a7ac072c7dcb7fd4ab5b233d7e24518683b89a16 Mon Sep 17 00:00:00 2001 From: Simone Tiraboschi Date: Thu, 14 May 2026 17:21:47 +0200 Subject: [PATCH] evictions: fix assumePod silently dropping success metric on informer race When KubeVirt sets EvictionInProgressAnnotationKey before returning TooManyRequests, the informer's UpdateFunc can call addPod (evictionAssumed=false) before evictPod's assumePod call arrives. assumePod found the entry already present and returned early, leaving evictionAssumed=false. DeleteFunc then skipped the "success" metric. Fix: if the existing entry has evictionAssumed=false (added by addPod), upgrade it in place without double-counting the pod in the counters. Adds TestEvictionInBackgroundMetrics_InformerRace to reproduce the race deterministically. Signed-off-by: Simone Tiraboschi --- pkg/descheduler/evictions/evictions.go | 19 +++- pkg/descheduler/evictions/evictions_test.go | 110 ++++++++++++++++++++ 2 files changed, 127 insertions(+), 2 deletions(-) diff --git a/pkg/descheduler/evictions/evictions.go b/pkg/descheduler/evictions/evictions.go index 779ef595c..8052630a4 100644 --- a/pkg/descheduler/evictions/evictions.go +++ b/pkg/descheduler/evictions/evictions.go @@ -154,8 +154,23 @@ func (erc *evictionRequestsCache) assumePod(pod *v1.Pod, profileName, strategyNa erc.mu.Lock() defer erc.mu.Unlock() uid := getPodKey(pod) - if _, exists := erc.requests[uid]; exists { - // Pod already assumed by a previous strategy/profile; first one wins. + if item, exists := erc.requests[uid]; exists { + if item.evictionAssumed { + // Pod already assumed by a previous strategy/profile; first one wins. + return + } + // The informer's UpdateFunc called addPod (evictionAssumed=false) before + // the TooManyRequests response arrived. Upgrade the entry in place without + // bumping the counters a second time. + erc.requests[uid] = evictionRequestItem{ + podNamespace: item.podNamespace, + podName: item.podName, + podNodeName: item.podNodeName, + strategyName: strategyName, + profileName: profileName, + evictionAssumed: true, + assumedTimestamp: metav1.NewTime(time.Now()), + } return } erc.requests[uid] = evictionRequestItem{ diff --git a/pkg/descheduler/evictions/evictions_test.go b/pkg/descheduler/evictions/evictions_test.go index e99d2d0cb..0db24ff1f 100644 --- a/pkg/descheduler/evictions/evictions_test.go +++ b/pkg/descheduler/evictions/evictions_test.go @@ -573,6 +573,116 @@ func TestEvictionInBackgroundMetrics(t *testing.T) { metricstest.AssertVectorCount(t, "descheduler_pods_evicted_total", map[string]string{"result": "background"}, 2) } +// TestEvictionInBackgroundMetrics_InformerRace reproduces the race where the +// informer's UpdateFunc calls addPod (evictionAssumed=false) before evictPod's +// assumePod call, causing the "success" metric to be silently dropped when the +// pod is later deleted. +// +// The race: evictPod sends the eviction request; the KubeVirt handler sets +// EvictionInProgressAnnotationKey before returning TooManyRequests, so the +// informer's UpdateFunc fires and calls addPod (evictionAssumed=false) while +// evictPod is still waiting for the HTTP response. When the response arrives, +// assumePod finds the entry already present and returns early, leaving +// evictionAssumed=false. DeleteFunc then skips the "success" metric. +func TestEvictionInBackgroundMetrics_InformerRace(t *testing.T) { + ctx, cancel := context.WithTimeout(context.Background(), wait.ForeverTestTimeout) + defer cancel() + + deschedulermetrics.Register() + deschedulermetrics.PodsEvicted.Reset() + deschedulermetrics.PodsEvictedTotal.Reset() + + node1 := test.BuildTestNode("n1", 2000, 3000, 10, nil) + + ownerRef1 := test.GetReplicaSetOwnerRefList() + p1 := test.BuildTestPod("p1", 100, 0, node1.Name, func(pod *v1.Pod) { + pod.Namespace = "dev" + pod.ObjectMeta.OwnerReferences = ownerRef1 + pod.Annotations = map[string]string{ + EvictionRequestAnnotationKey: "", + } + }) + + client := fakeclientset.NewSimpleClientset(node1, p1) + sharedInformerFactory := informers.NewSharedInformerFactory(client, 0) + _, eventRecorder := utils.GetRecorderAndBroadcaster(ctx, client) + + podEvictor, err := NewPodEvictor( + ctx, + client, + eventRecorder, + sharedInformerFactory.Core().V1().Pods().Informer(), + initFeatureGates(), + NewOptions().WithMetricsEnabled(true), + ) + if err != nil { + t.Fatalf("Unexpected error when creating a pod evictor: %v", err) + } + + // calling erCache.addPod from a goroutine. + triggerAddPod := make(chan struct{}) + addPodDone := make(chan struct{}) + + go func() { + <-triggerAddPod + // Simulate what UpdateFunc does when KubeVirt sets + // EvictionInProgressAnnotationKey before the TooManyRequests response + // arrives: addPod inserts the entry with evictionAssumed=false. + podEvictor.erCache.addPod(p1) + close(addPodDone) + }() + + client.PrependReactor("create", "pods", func(action core.Action) (bool, runtime.Object, error) { + if action.GetSubresource() != "eviction" { + return false, nil, nil + } + createAct, ok := action.(core.CreateActionImpl) + if !ok { + return false, nil, fmt.Errorf("unable to convert action to core.CreateActionImpl") + } + eviction, ok := createAct.Object.(*policy.Eviction) + if !ok || eviction.GetName() != "p1" { + return false, nil, nil + } + + // Trigger addPod and wait for it to complete before returning + // TooManyRequests, ensuring assumePod will find the entry already present. + close(triggerAddPod) + <-addPodDone + + return true, nil, &apierrors.StatusError{ + ErrStatus: metav1.Status{ + Reason: metav1.StatusReasonTooManyRequests, + Message: "Eviction triggered evacuation", + }, + } + }) + + sharedInformerFactory.Start(ctx.Done()) + sharedInformerFactory.WaitForCacheSync(ctx.Done()) + + evictOpts := EvictOptions{StrategyName: "TestStrategy", ProfileName: "TestProfile"} + podEvictor.EvictPod(ctx, p1, evictOpts) + + // The "background" metric must have been emitted. + metricstest.AssertVectorCount(t, "descheduler_pods_evicted_total", map[string]string{"result": "background"}, 1) + + // Simulate the background eviction completing: the pod is deleted. + client.CoreV1().Pods(p1.Namespace).Delete(ctx, p1.Name, metav1.DeleteOptions{}) + + // Wait until DeleteFunc has fired and removed the pod from the cache. + if err := wait.PollUntilContextTimeout(ctx, 100*time.Millisecond, wait.ForeverTestTimeout, true, func(ctx context.Context) (bool, error) { + return !podEvictor.erCache.hasPod(p1), nil + }); err != nil { + t.Fatalf("Timed out waiting for background eviction to complete: %v", err) + } + + // "success" must also be 1: the background eviction completed. + // Without the fix this will be 0 because assumePod returned early (entry + // already present from addPod) leaving evictionAssumed=false. + metricstest.AssertVectorCount(t, "descheduler_pods_evicted_total", map[string]string{"result": "success"}, 1) +} + func assertEqualEvents(t *testing.T, expected []string, actual <-chan string) { t.Logf("Assert for events: %v", expected) c := time.After(wait.ForeverTestTimeout)