Merge pull request #1866 from ingvagabund/automated-cherry-pick-of-#1862-upstream-release-1.35

Automated cherry pick of #1862: fix(descheduler): reset prometheus usage client at each
Jan Chaloupka authored this change on 2026-05-06 16:53:19 +02:00; committed by GitHub.
2 changed files with 105 additions and 9 deletions
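The gist of the fix, as a minimal generic sketch (illustrative names only; none of these are descheduler APIs): rather than building the metrics-backed usage client once at plugin construction, the plugin keeps the inputs needed to build it and rebuilds the client at the start of every descheduling pass, so rotated credentials are picked up without restarting the process.

    package main

    import "fmt"

    // tokenSource stands in for whatever yields the current SA token
    // (illustrative only, not a descheduler type).
    type tokenSource func() string

    type client struct{ token string }

    type plugin struct {
        tokens tokenSource // keep the source, not a pre-built client
        c      *client
    }

    // resetClient mirrors the role of resetUsageClient in the diff below:
    // rebuild the client from the current source at the start of every cycle.
    func (p *plugin) resetClient() {
        p.c = &client{token: p.tokens()}
    }

    func (p *plugin) balance() {
        p.resetClient() // always use the latest credentials
        fmt.Println("balancing with", p.c.token)
    }

    func main() {
        tok := "token-v1"
        p := &plugin{tokens: func() string { return tok }}
        p.balance()      // uses token-v1
        tok = "token-v2" // simulate SA token rotation
        p.balance()      // picks up token-v2 without recreating the plugin
    }

The diff below applies this pattern concretely through a new resetUsageClient method called at the top of Balance.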


@@ -103,18 +103,13 @@ func NewLowNodeUtilization(
     }
     // this plugins supports different ways of collecting usage data. each
-    // different way provides its own "usageClient". here we make sure we
-    // have the correct one or an error is triggered. XXX MetricsServer is
-    // deprecated, removed once dropped.
+    // different way provides its own "usageClient". the metrics-based client
+    // is reset at every extension point so we always get the latest prometheus
+    // client (whose SA token can be rotated after plugin creation).
+    // XXX MetricsServer is deprecated, removed once dropped.
     var usageClient usageClient = newRequestedUsageClient(
         extendedResourceNames, handle.GetPodsAssignedToNodeFunc(),
     )
-    if metrics != nil {
-        usageClient, err = usageClientForMetrics(args, handle, extendedResourceNames)
-        if err != nil {
-            return nil, err
-        }
-    }
     return &LowNodeUtilization{
         logger: logger,
@@ -134,12 +129,33 @@ func (l *LowNodeUtilization) Name() string {
     return LowNodeUtilizationPluginName
 }
+// resetUsageClient refreshes the usageClient field from the current handle
+// state. It must be called at the start of every extension point so that a
+// rotated prometheus SA token is picked up without restarting the process.
+func (l *LowNodeUtilization) resetUsageClient() error {
+    if l.args.MetricsUtilization == nil {
+        return nil
+    }
+    client, err := usageClientForMetrics(l.args, l.handle, l.extendedResourceNames)
+    if err != nil {
+        return err
+    }
+    l.usageClient = client
+    return nil
+}
 // Balance holds the main logic of the plugin. It evicts pods from over
 // utilized nodes to under utilized nodes. The goal here is to evenly
 // distribute pods across nodes.
 func (l *LowNodeUtilization) Balance(ctx context.Context, nodes []*v1.Node) *frameworktypes.Status {
     logger := klog.FromContext(klog.NewContext(ctx, l.logger)).WithValues("ExtensionPoint", frameworktypes.BalanceExtensionPoint)
+    if err := l.resetUsageClient(); err != nil {
+        return &frameworktypes.Status{
+            Err: fmt.Errorf("error initializing usage client: %v", err),
+        }
+    }
     if err := l.usageClient.sync(ctx, nodes); err != nil {
         return &frameworktypes.Status{
             Err: fmt.Errorf("error getting node usage: %v", err),


@@ -1661,3 +1661,83 @@ func TestLowNodeUtilizationWithPrometheusMetrics(t *testing.T) {
         t.Run(tc.name, testFnc(false, tc.expectedPodsEvicted))
     }
 }
+// TestLowNodeUtilizationPrometheusClientNilAtCreation ensures the plugin
+// tolerates a nil prometheus client at creation time (e.g. in-cluster SA token
+// not yet reconciled) and correctly picks up the client exposed by the handle
+// at Balance call time.
+func TestLowNodeUtilizationPrometheusClientNilAtCreation(t *testing.T) {
+    ctx, cancel := context.WithCancel(context.Background())
+    defer cancel()
+    n1NodeName := "n1"
+    n2NodeName := "n2"
+    n3NodeName := "n3"
+    nodes := []*v1.Node{
+        test.BuildTestNode(n1NodeName, 4000, 3000, 9, nil),
+        test.BuildTestNode(n2NodeName, 4000, 3000, 10, nil),
+        test.BuildTestNode(n3NodeName, 4000, 3000, 10, nil),
+    }
+    pods := []*v1.Pod{
+        test.BuildTestPod("p1", 400, 0, n1NodeName, test.SetRSOwnerRef),
+        test.BuildTestPod("p2", 400, 0, n1NodeName, test.SetRSOwnerRef),
+        test.BuildTestPod("p3", 400, 0, n1NodeName, test.SetRSOwnerRef),
+        test.BuildTestPod("p4", 400, 0, n1NodeName, test.SetRSOwnerRef),
+        test.BuildTestPod("p5", 400, 0, n1NodeName, test.SetRSOwnerRef),
+        test.BuildTestPod("p6", 400, 0, n2NodeName, test.SetRSOwnerRef),
+    }
+    var objs []runtime.Object
+    for _, node := range nodes {
+        objs = append(objs, node)
+    }
+    for _, pod := range pods {
+        objs = append(objs, pod)
+    }
+    fakeClient := fake.NewSimpleClientset(objs...)
+    handle, podEvictor, err := frameworktesting.InitFrameworkHandle(
+        ctx, fakeClient, nil, defaultevictor.DefaultEvictorArgs{NodeFit: true}, nil,
+    )
+    if err != nil {
+        t.Fatalf("Unable to initialize a framework handle: %v", err)
+    }
+    // PrometheusClientImpl is intentionally nil here — the in-cluster SA token
+    // has not been reconciled yet at plugin-creation time.
+    args := &LowNodeUtilizationArgs{
+        Thresholds:       api.ResourceThresholds{MetricResource: 30},
+        TargetThresholds: api.ResourceThresholds{MetricResource: 50},
+        MetricsUtilization: &MetricsUtilization{
+            Source: api.PrometheusMetrics,
+            Prometheus: &Prometheus{
+                Query: "instance:node_cpu:rate:sum",
+            },
+        },
+    }
+    plugin, err := NewLowNodeUtilization(ctx, args, handle)
+    if err != nil {
+        t.Fatalf("plugin creation must succeed even when prometheus client is nil: %v", err)
+    }
+    // Simulate the SA token becoming available before the first descheduling cycle.
+    handle.PrometheusClientImpl = &fakePromClient{
+        result: model.Vector{
+            sample("instance:node_cpu:rate:sum", n1NodeName, 0.57), // over target
+            sample("instance:node_cpu:rate:sum", n2NodeName, 0.42), // over target
+            sample("instance:node_cpu:rate:sum", n3NodeName, 0.20), // under threshold
+        },
+        dataType: model.ValVector,
+    }
+    status := plugin.(frameworktypes.BalancePlugin).Balance(ctx, nodes)
+    if status != nil {
+        t.Fatalf("Balance failed: %v", status.Err)
+    }
+    if podEvictor.TotalEvicted() == 0 {
+        t.Error("expected at least one pod to be evicted")
+    }
+}
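For context, the fakePromClient and sample helpers used above are defined elsewhere in this test file (they came in with the earlier prometheus test). Below is a rough sketch of what such a test double could look like, assuming the usage client only issues instant queries through the prometheus v1 HTTP API (promv1.API); the embedded-interface shortcut and field layout are an assumption, not the actual helpers.

    import (
        "context"
        "time"

        promv1 "github.com/prometheus/client_golang/api/prometheus/v1"
        "github.com/prometheus/common/model"
    )

    // Hypothetical test double; the real fakePromClient lives elsewhere in this file.
    type fakePromClient struct {
        promv1.API                // embedded so unused API methods need no stubs
        result   model.Value      // value returned by Query
        dataType model.ValueType  // mirrors the construction in the test above
    }

    // Query returns the canned result regardless of the query or timestamp.
    func (f *fakePromClient) Query(ctx context.Context, query string, ts time.Time, opts ...promv1.Option) (model.Value, promv1.Warnings, error) {
        return f.result, nil, nil
    }

    // sample builds one vector sample carrying the node name in the "instance" label.
    func sample(metricName, nodeName string, value float64) *model.Sample {
        return &model.Sample{
            Metric: model.Metric{
                "__name__": model.LabelValue(metricName),
                "instance": model.LabelValue(nodeName),
            },
            Value:     model.SampleValue(value),
            Timestamp: model.TimeFromUnixNano(time.Now().UnixNano()),
        }
    }

Embedding promv1.API keeps the double small: only Query needs a real body in this sketch, and any other method call would panic on the nil embedded interface, which is acceptable for a focused unit test.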