fix(descheduler): reset prometheus usage client at each extension point

Profile creation was moved outside the descheduling cycle in b214c147,
but reconcileInClusterSAToken() still runs only in runFnc(), after
newDescheduler() returns. This leaves the prometheus client nil when
LowNodeUtilization's New() runs, causing a "prometheus client not
initialized" error at startup.

Avoid failing at plugin creation time if the prometheus
client is not yet available. Instead, usageClientForMetrics() is now
called at the start of every extension point via a resetUsageClient()
helper, so each descheduling cycle picks up the latest client regardless
of when the SA token is reconciled or rotated.
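
To see the shape of the fix in isolation, here is a minimal, self-contained
Go sketch of the pattern (all type and function names below are illustrative
stand-ins, not the descheduler's actual API):

	package main

	import (
		"errors"
		"fmt"
	)

	// fakeHandle stands in for the framework handle: its prometheus client
	// starts out nil and is only filled in later by the SA-token reconciler.
	type fakeHandle struct{ promClient *string }

	type fakePlugin struct {
		handle *fakeHandle
		client *string
	}

	// newFakePlugin no longer fails when the client is still nil.
	func newFakePlugin(h *fakeHandle) *fakePlugin { return &fakePlugin{handle: h} }

	// resetClient re-reads the client from the handle; calling it at the start
	// of every cycle picks up a client that appeared (or rotated) after creation.
	func (p *fakePlugin) resetClient() error {
		if p.handle.promClient == nil {
			return errors.New("prometheus client not initialized")
		}
		p.client = p.handle.promClient
		return nil
	}

	func main() {
		h := &fakeHandle{}
		p := newFakePlugin(h) // plugin creation succeeds despite the nil client

		fmt.Println("cycle 1:", p.resetClient()) // errors: token not reconciled yet

		token := "client-backed-by-fresh-token"
		h.promClient = &token // the reconciler fills the client in later

		fmt.Println("cycle 2:", p.resetClient()) // <nil>: the new client is picked up
	}

The same idea maps onto the real plugin below: the error is no longer returned
from the constructor, only from each descheduling cycle that still finds no
client.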

Fixes: https://github.com/kubernetes-sigs/descheduler/issues/1840

Signed-off-by: Simone Tiraboschi <stirabos@redhat.com>
Author: Simone Tiraboschi
Date: 2026-04-28 18:52:08 +02:00
Committed-by: Jan Chaloupka
Parent: 91d7f9ffb9
Commit: 4782463ebb
2 changed files with 105 additions and 9 deletions

@@ -103,18 +103,13 @@ func NewLowNodeUtilization(
 	}
 
 	// this plugins supports different ways of collecting usage data. each
-	// different way provides its own "usageClient". here we make sure we
-	// have the correct one or an error is triggered. XXX MetricsServer is
-	// deprecated, removed once dropped.
+	// different way provides its own "usageClient". the metrics-based client
+	// is reset at every extension point so we always get the latest prometheus
+	// client (whose SA token can be rotated after plugin creation).
+	// XXX MetricsServer is deprecated, removed once dropped.
 	var usageClient usageClient = newRequestedUsageClient(
 		extendedResourceNames, handle.GetPodsAssignedToNodeFunc(),
 	)
-	if metrics != nil {
-		usageClient, err = usageClientForMetrics(args, handle, extendedResourceNames)
-		if err != nil {
-			return nil, err
-		}
-	}
 
 	return &LowNodeUtilization{
 		logger: logger,
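
Note that the first hunk is cut off inside the returned struct literal. For
the resetUsageClient() helper in the next hunk to compile, the constructor
presumably also stores the args, handle and extended resource names on the
plugin; a sketch of the assumed fields (inferred from the l.args / l.handle /
l.extendedResourceNames references below, not copied from the file):

	type LowNodeUtilization struct {
		logger                klog.Logger
		args                  *LowNodeUtilizationArgs
		handle                frameworktypes.Handle
		extendedResourceNames []v1.ResourceName
		usageClient           usageClient
		// ... remaining fields elided from this diff view
	}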
@@ -134,12 +129,33 @@ func (l *LowNodeUtilization) Name() string {
 	return LowNodeUtilizationPluginName
 }
 
+// resetUsageClient refreshes the usageClient field from the current handle
+// state. It must be called at the start of every extension point so that a
+// rotated prometheus SA token is picked up without restarting the process.
+func (l *LowNodeUtilization) resetUsageClient() error {
+	if l.args.MetricsUtilization == nil {
+		return nil
+	}
+	client, err := usageClientForMetrics(l.args, l.handle, l.extendedResourceNames)
+	if err != nil {
+		return err
+	}
+	l.usageClient = client
+	return nil
+}
+
 // Balance holds the main logic of the plugin. It evicts pods from over
 // utilized nodes to under utilized nodes. The goal here is to evenly
 // distribute pods across nodes.
 func (l *LowNodeUtilization) Balance(ctx context.Context, nodes []*v1.Node) *frameworktypes.Status {
 	logger := klog.FromContext(klog.NewContext(ctx, l.logger)).WithValues("ExtensionPoint", frameworktypes.BalanceExtensionPoint)
 
+	if err := l.resetUsageClient(); err != nil {
+		return &frameworktypes.Status{
+			Err: fmt.Errorf("error initializing usage client: %v", err),
+		}
+	}
+
 	if err := l.usageClient.sync(ctx, nodes); err != nil {
 		return &frameworktypes.Status{
 			Err: fmt.Errorf("error getting node usage: %v", err),

@@ -1661,3 +1661,83 @@ func TestLowNodeUtilizationWithPrometheusMetrics(t *testing.T) {
 		t.Run(tc.name, testFnc(false, tc.expectedPodsEvicted))
 	}
 }
+
+// TestLowNodeUtilizationPrometheusClientNilAtCreation ensures the plugin
+// tolerates a nil prometheus client at creation time (e.g. in-cluster SA token
+// not yet reconciled) and correctly picks up the client exposed by the handle
+// at Balance call time.
+func TestLowNodeUtilizationPrometheusClientNilAtCreation(t *testing.T) {
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+
+	n1NodeName := "n1"
+	n2NodeName := "n2"
+	n3NodeName := "n3"
+	nodes := []*v1.Node{
+		test.BuildTestNode(n1NodeName, 4000, 3000, 9, nil),
+		test.BuildTestNode(n2NodeName, 4000, 3000, 10, nil),
+		test.BuildTestNode(n3NodeName, 4000, 3000, 10, nil),
+	}
+
+	pods := []*v1.Pod{
+		test.BuildTestPod("p1", 400, 0, n1NodeName, test.SetRSOwnerRef),
+		test.BuildTestPod("p2", 400, 0, n1NodeName, test.SetRSOwnerRef),
+		test.BuildTestPod("p3", 400, 0, n1NodeName, test.SetRSOwnerRef),
+		test.BuildTestPod("p4", 400, 0, n1NodeName, test.SetRSOwnerRef),
+		test.BuildTestPod("p5", 400, 0, n1NodeName, test.SetRSOwnerRef),
+		test.BuildTestPod("p6", 400, 0, n2NodeName, test.SetRSOwnerRef),
+	}
+
+	var objs []runtime.Object
+	for _, node := range nodes {
+		objs = append(objs, node)
+	}
+	for _, pod := range pods {
+		objs = append(objs, pod)
+	}
+
+	fakeClient := fake.NewSimpleClientset(objs...)
+	handle, podEvictor, err := frameworktesting.InitFrameworkHandle(
+		ctx, fakeClient, nil, defaultevictor.DefaultEvictorArgs{NodeFit: true}, nil,
+	)
+	if err != nil {
+		t.Fatalf("Unable to initialize a framework handle: %v", err)
+	}
+
+	// PrometheusClientImpl is intentionally nil here: the in-cluster SA token
+	// has not been reconciled yet at plugin-creation time.
+	args := &LowNodeUtilizationArgs{
+		Thresholds:       api.ResourceThresholds{MetricResource: 30},
+		TargetThresholds: api.ResourceThresholds{MetricResource: 50},
+		MetricsUtilization: &MetricsUtilization{
+			Source: api.PrometheusMetrics,
+			Prometheus: &Prometheus{
+				Query: "instance:node_cpu:rate:sum",
+			},
+		},
+	}
+
+	plugin, err := NewLowNodeUtilization(ctx, args, handle)
+	if err != nil {
+		t.Fatalf("plugin creation must succeed even when prometheus client is nil: %v", err)
+	}
+
+	// Simulate the SA token becoming available before the first descheduling cycle.
+	handle.PrometheusClientImpl = &fakePromClient{
+		result: model.Vector{
+			sample("instance:node_cpu:rate:sum", n1NodeName, 0.57), // over target threshold
+			sample("instance:node_cpu:rate:sum", n2NodeName, 0.42), // between thresholds (appropriately utilized)
+			sample("instance:node_cpu:rate:sum", n3NodeName, 0.20), // under low threshold
+		},
+		dataType: model.ValVector,
+	}
+
+	status := plugin.(frameworktypes.BalancePlugin).Balance(ctx, nodes)
+	if status != nil {
+		t.Fatalf("Balance failed: %v", status.Err)
+	}
+
+	if podEvictor.TotalEvicted() == 0 {
+		t.Error("expected at least one pod to be evicted")
+	}
+}
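
Assuming the usual upstream layout, where this plugin and its tests live
under pkg/framework/plugins/nodeutilization, the new test can be run on its
own with:

	go test ./pkg/framework/plugins/nodeutilization/... -run TestLowNodeUtilizationPrometheusClientNilAtCreation -v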