Mirror of https://github.com/kubernetes-sigs/descheduler.git, synced 2026-05-09 10:47:18 +00:00.
Merge pull request #1856 from tiraboschi/background_eviction_metrics
evictions: fix missing observability for background evictions
This commit is contained in:
@@ -57,6 +57,7 @@ var (
|
||||
|
||||
// evictionRequestItem is one entry of the eviction-requests cache. It records
// enough pod identity and provenance to emit eviction metrics later, when the
// pod's background eviction completes (i.e. the pod gets deleted).
type evictionRequestItem struct {
	// Identity of the targeted pod; used as metric label values
	// ("namespace" and "node") when the eviction completes.
	podName, podNamespace, podNodeName string
	// Strategy and profile that requested the eviction; used as the
	// "strategy" and "profile" metric label values.
	strategyName, profileName string
	// evictionAssumed is set when assumePod registered this entry, and gates
	// whether a "success" metric is emitted on pod deletion.
	evictionAssumed bool
	// assumedTimestamp is the wall-clock time at which the eviction
	// was assumed (set via metav1.NewTime(time.Now()) in assumePod).
	assumedTimestamp metav1.Time
}
|
||||
@@ -149,17 +150,20 @@ func (erc *evictionRequestsCache) addPod(pod *v1.Pod) {
|
||||
erc.requestsTotal++
|
||||
}
|
||||
|
||||
func (erc *evictionRequestsCache) assumePod(pod *v1.Pod) {
|
||||
func (erc *evictionRequestsCache) assumePod(pod *v1.Pod, profileName, strategyName string) {
|
||||
erc.mu.Lock()
|
||||
defer erc.mu.Unlock()
|
||||
uid := getPodKey(pod)
|
||||
if _, exists := erc.requests[uid]; exists {
|
||||
// Pod already assumed by a previous strategy/profile; first one wins.
|
||||
return
|
||||
}
|
||||
erc.requests[uid] = evictionRequestItem{
|
||||
podNamespace: pod.Namespace,
|
||||
podName: pod.Name,
|
||||
podNodeName: pod.Spec.NodeName,
|
||||
strategyName: strategyName,
|
||||
profileName: profileName,
|
||||
evictionAssumed: true,
|
||||
assumedTimestamp: metav1.NewTime(time.Now()),
|
||||
}
|
||||
@@ -191,6 +195,14 @@ func (erc *evictionRequestsCache) deletePod(pod *v1.Pod) {
|
||||
}
|
||||
}
|
||||
|
||||
func (erc *evictionRequestsCache) getPod(pod *v1.Pod) (evictionRequestItem, bool) {
|
||||
erc.mu.RLock()
|
||||
defer erc.mu.RUnlock()
|
||||
uid := getPodKey(pod)
|
||||
item, exists := erc.requests[uid]
|
||||
return item, exists
|
||||
}
|
||||
|
||||
func (erc *evictionRequestsCache) hasPod(pod *v1.Pod) bool {
|
||||
erc.mu.RLock()
|
||||
defer erc.mu.RUnlock()
|
||||
@@ -347,8 +359,12 @@ func NewPodEvictor(
|
||||
klog.ErrorS(nil, "Cannot convert to *v1.Pod", "obj", t)
|
||||
return
|
||||
}
|
||||
if erCache.hasPod(pod) {
|
||||
if item, exists := erCache.getPod(pod); exists {
|
||||
klog.V(3).InfoS("Pod with eviction in background deleted/evicted. Removing pod from the cache.", "pod", klog.KObj(pod))
|
||||
if item.evictionAssumed && podEvictor.metricsEnabled {
|
||||
metrics.PodsEvicted.With(map[string]string{"result": "success", "strategy": item.strategyName, "namespace": item.podNamespace, "node": item.podNodeName, "profile": item.profileName}).Inc()
|
||||
metrics.PodsEvictedTotal.With(map[string]string{"result": "success", "strategy": item.strategyName, "namespace": item.podNamespace, "node": item.podNodeName, "profile": item.profileName}).Inc()
|
||||
}
|
||||
}
|
||||
erCache.deletePod(pod)
|
||||
},
|
||||
@@ -608,7 +624,11 @@ func (pe *PodEvictor) evictPod(ctx context.Context, pod *v1.Pod, opts EvictOptio
|
||||
return true, nil
|
||||
}
|
||||
klog.V(3).InfoS("Eviction in background assumed", "pod", klog.KObj(pod))
|
||||
pe.erCache.assumePod(pod)
|
||||
pe.erCache.assumePod(pod, opts.ProfileName, opts.StrategyName)
|
||||
if pe.metricsEnabled {
|
||||
metrics.PodsEvicted.With(map[string]string{"result": "background", "strategy": opts.StrategyName, "namespace": pod.Namespace, "node": pod.Spec.NodeName, "profile": opts.ProfileName}).Inc()
|
||||
metrics.PodsEvictedTotal.With(map[string]string{"result": "background", "strategy": opts.StrategyName, "namespace": pod.Namespace, "node": pod.Spec.NodeName, "profile": opts.ProfileName}).Inc()
|
||||
}
|
||||
return true, nil
|
||||
}
|
||||
}
|
||||
|
||||
@@ -39,6 +39,9 @@ import (
|
||||
"k8s.io/klog/v2"
|
||||
utilptr "k8s.io/utils/ptr"
|
||||
|
||||
metricstest "k8s.io/component-base/metrics/testutil"
|
||||
|
||||
deschedulermetrics "sigs.k8s.io/descheduler/metrics"
|
||||
podutil "sigs.k8s.io/descheduler/pkg/descheduler/pod"
|
||||
"sigs.k8s.io/descheduler/pkg/features"
|
||||
"sigs.k8s.io/descheduler/pkg/utils"
|
||||
@@ -463,6 +466,113 @@ func TestEvictionRequestsCacheCleanup(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// TestEvictionInBackgroundMetrics verifies the observability of evictions in
// background: an assumed pod is counted once with result="background", a
// re-attempt while the eviction is still in progress adds nothing, and the
// eventual pod deletion adds exactly one result="success" sample per pod.
func TestEvictionInBackgroundMetrics(t *testing.T) {
	ctx, cancel := context.WithTimeout(context.Background(), wait.ForeverTestTimeout)
	defer cancel()

	// The metric vectors are package-level/shared; reset them so counts from
	// other tests cannot leak into the assertions below.
	deschedulermetrics.Register()
	deschedulermetrics.PodsEvicted.Reset()
	deschedulermetrics.PodsEvictedTotal.Reset()

	node1 := test.BuildTestNode("n1", 2000, 3000, 10, nil)

	ownerRef1 := test.GetReplicaSetOwnerRefList()
	updatePod := func(pod *v1.Pod) {
		pod.Namespace = "dev"
		pod.ObjectMeta.OwnerReferences = ownerRef1
	}
	// Pods carrying the eviction-request annotation take the
	// eviction-in-background path rather than synchronous eviction.
	updatePodWithEvictionAnnotation := func(pod *v1.Pod) {
		updatePod(pod)
		pod.Annotations = map[string]string{
			EvictionRequestAnnotationKey: "",
		}
	}

	// p1, p2: will be evicted in background
	// p3, p4: will be evicted normally
	p1 := test.BuildTestPod("p1", 100, 0, node1.Name, updatePodWithEvictionAnnotation)
	p2 := test.BuildTestPod("p2", 100, 0, node1.Name, updatePodWithEvictionAnnotation)
	p3 := test.BuildTestPod("p3", 100, 0, node1.Name, updatePod)
	p4 := test.BuildTestPod("p4", 100, 0, node1.Name, updatePod)

	client := fakeclientset.NewSimpleClientset(node1, p1, p2, p3, p4)
	sharedInformerFactory := informers.NewSharedInformerFactory(client, 0)
	_, eventRecorder := utils.GetRecorderAndBroadcaster(ctx, client)

	// Metrics must be enabled, otherwise the evictor skips all counter updates.
	podEvictor, err := NewPodEvictor(
		ctx,
		client,
		eventRecorder,
		sharedInformerFactory.Core().V1().Pods().Informer(),
		initFeatureGates(),
		NewOptions().WithMetricsEnabled(true),
	)
	if err != nil {
		t.Fatalf("Unexpected error when creating a pod evictor: %v", err)
	}

	// Reactor for the pods/eviction subresource: answering p1/p2 with a
	// TooManyRequests status simulates an API server that defers the eviction
	// (e.g. an evacuation controller), which the evictor treats as
	// "eviction in background". Other pods are evicted successfully.
	client.PrependReactor("create", "pods", func(action core.Action) (bool, runtime.Object, error) {
		if action.GetSubresource() == "eviction" {
			createAct, matched := action.(core.CreateActionImpl)
			if !matched {
				return false, nil, fmt.Errorf("unable to convert action to core.CreateActionImpl")
			}
			if eviction, matched := createAct.Object.(*policy.Eviction); matched {
				if eviction.GetName() == "p1" || eviction.GetName() == "p2" {
					return true, nil, &apierrors.StatusError{
						ErrStatus: metav1.Status{
							Reason:  metav1.StatusReasonTooManyRequests,
							Message: "Eviction triggered evacuation",
						},
					}
				}
				return true, nil, nil
			}
		}
		return false, nil, nil
	})

	sharedInformerFactory.Start(ctx.Done())
	sharedInformerFactory.WaitForCacheSync(ctx.Done())

	evictOpts := EvictOptions{StrategyName: "TestStrategy", ProfileName: "TestProfile"}

	// Cycle 1: p1/p2 go to background, p3/p4 are evicted normally.
	podEvictor.EvictPod(ctx, p1, evictOpts)
	podEvictor.EvictPod(ctx, p2, evictOpts)
	podEvictor.EvictPod(ctx, p3, evictOpts)
	podEvictor.EvictPod(ctx, p4, evictOpts)

	// After cycle 1: 2 "background" (p1, p2), 2 "success" (p3, p4).
	metricstest.AssertVectorCount(t, "descheduler_pods_evicted_total", map[string]string{"result": "background"}, 2)
	metricstest.AssertVectorCount(t, "descheduler_pods_evicted_total", map[string]string{"result": "success"}, 2)

	// Cycle 2: p1/p2 are still running (background eviction in progress).
	// Re-attempting them must not emit any additional metrics.
	podEvictor.EvictPod(ctx, p1, evictOpts)
	podEvictor.EvictPod(ctx, p2, evictOpts)

	// Counts must be unchanged after cycle 2.
	metricstest.AssertVectorCount(t, "descheduler_pods_evicted_total", map[string]string{"result": "background"}, 2)
	metricstest.AssertVectorCount(t, "descheduler_pods_evicted_total", map[string]string{"result": "success"}, 2)

	// The background evictions complete: p1 and p2 are deleted.
	// The informer's DeleteFunc fires and emits a "success" metric for each assumed pod.
	client.CoreV1().Pods(p1.Namespace).Delete(ctx, p1.Name, metav1.DeleteOptions{})
	client.CoreV1().Pods(p2.Namespace).Delete(ctx, p2.Name, metav1.DeleteOptions{})

	// Poll until DeleteFunc has fired for both pods (informer is async).
	// Cache removal happens in the same handler as the metric emission, so an
	// empty cache implies the success counters have been bumped.
	if err := wait.PollUntilContextTimeout(ctx, 100*time.Millisecond, wait.ForeverTestTimeout, true, func(ctx context.Context) (bool, error) {
		return !podEvictor.erCache.hasPod(p1) && !podEvictor.erCache.hasPod(p2), nil
	}); err != nil {
		t.Fatalf("Timed out waiting for background evictions to complete: %v", err)
	}

	// After deletion: "success" grows by 2 (p1, p2 completed), "background" stays at 2.
	metricstest.AssertVectorCount(t, "descheduler_pods_evicted_total", map[string]string{"result": "success"}, 4)
	metricstest.AssertVectorCount(t, "descheduler_pods_evicted_total", map[string]string{"result": "background"}, 2)
}
|
||||
|
||||
func assertEqualEvents(t *testing.T, expected []string, actual <-chan string) {
|
||||
t.Logf("Assert for events: %v", expected)
|
||||
c := time.After(wait.ForeverTestTimeout)
|
||||
|
||||
Reference in New Issue
Block a user