mirror of
https://github.com/kubernetes-sigs/descheduler.git
synced 2026-05-15 13:48:05 +00:00
Merge pull request #1866 from ingvagabund/automated-cherry-pick-of-#1862-upstream-release-1.35
Automated cherry pick of #1862: fix(descheduler): reset prometheus usage client at each
This commit is contained in:
@@ -103,18 +103,13 @@ func NewLowNodeUtilization(
|
||||
}
|
||||
|
||||
// this plugins supports different ways of collecting usage data. each
|
||||
// different way provides its own "usageClient". here we make sure we
|
||||
// have the correct one or an error is triggered. XXX MetricsServer is
|
||||
// deprecated, removed once dropped.
|
||||
// different way provides its own "usageClient". the metrics-based client
|
||||
// is reset at every extension point so we always get the latest prometheus
|
||||
// client (whose SA token can be rotated after plugin creation).
|
||||
// XXX MetricsServer is deprecated, removed once dropped.
|
||||
var usageClient usageClient = newRequestedUsageClient(
|
||||
extendedResourceNames, handle.GetPodsAssignedToNodeFunc(),
|
||||
)
|
||||
if metrics != nil {
|
||||
usageClient, err = usageClientForMetrics(args, handle, extendedResourceNames)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
|
||||
return &LowNodeUtilization{
|
||||
logger: logger,
|
||||
@@ -134,12 +129,33 @@ func (l *LowNodeUtilization) Name() string {
|
||||
return LowNodeUtilizationPluginName
|
||||
}
|
||||
|
||||
// resetUsageClient refreshes the usageClient field from the current handle
|
||||
// state. It must be called at the start of every extension point so that a
|
||||
// rotated prometheus SA token is picked up without restarting the process.
|
||||
func (l *LowNodeUtilization) resetUsageClient() error {
|
||||
if l.args.MetricsUtilization == nil {
|
||||
return nil
|
||||
}
|
||||
client, err := usageClientForMetrics(l.args, l.handle, l.extendedResourceNames)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
l.usageClient = client
|
||||
return nil
|
||||
}
|
||||
|
||||
// Balance holds the main logic of the plugin. It evicts pods from over
|
||||
// utilized nodes to under utilized nodes. The goal here is to evenly
|
||||
// distribute pods across nodes.
|
||||
func (l *LowNodeUtilization) Balance(ctx context.Context, nodes []*v1.Node) *frameworktypes.Status {
|
||||
logger := klog.FromContext(klog.NewContext(ctx, l.logger)).WithValues("ExtensionPoint", frameworktypes.BalanceExtensionPoint)
|
||||
|
||||
if err := l.resetUsageClient(); err != nil {
|
||||
return &frameworktypes.Status{
|
||||
Err: fmt.Errorf("error initializing usage client: %v", err),
|
||||
}
|
||||
}
|
||||
|
||||
if err := l.usageClient.sync(ctx, nodes); err != nil {
|
||||
return &frameworktypes.Status{
|
||||
Err: fmt.Errorf("error getting node usage: %v", err),
|
||||
|
||||
@@ -1661,3 +1661,83 @@ func TestLowNodeUtilizationWithPrometheusMetrics(t *testing.T) {
|
||||
t.Run(tc.name, testFnc(false, tc.expectedPodsEvicted))
|
||||
}
|
||||
}
|
||||
|
||||
// TestLowNodeUtilizationPrometheusClientNilAtCreation ensures the plugin
|
||||
// tolerates a nil prometheus client at creation time (e.g. in-cluster SA token
|
||||
// not yet reconciled) and correctly picks up the client exposed by the handle
|
||||
// at Balance call time.
|
||||
func TestLowNodeUtilizationPrometheusClientNilAtCreation(t *testing.T) {
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
defer cancel()
|
||||
|
||||
n1NodeName := "n1"
|
||||
n2NodeName := "n2"
|
||||
n3NodeName := "n3"
|
||||
|
||||
nodes := []*v1.Node{
|
||||
test.BuildTestNode(n1NodeName, 4000, 3000, 9, nil),
|
||||
test.BuildTestNode(n2NodeName, 4000, 3000, 10, nil),
|
||||
test.BuildTestNode(n3NodeName, 4000, 3000, 10, nil),
|
||||
}
|
||||
pods := []*v1.Pod{
|
||||
test.BuildTestPod("p1", 400, 0, n1NodeName, test.SetRSOwnerRef),
|
||||
test.BuildTestPod("p2", 400, 0, n1NodeName, test.SetRSOwnerRef),
|
||||
test.BuildTestPod("p3", 400, 0, n1NodeName, test.SetRSOwnerRef),
|
||||
test.BuildTestPod("p4", 400, 0, n1NodeName, test.SetRSOwnerRef),
|
||||
test.BuildTestPod("p5", 400, 0, n1NodeName, test.SetRSOwnerRef),
|
||||
test.BuildTestPod("p6", 400, 0, n2NodeName, test.SetRSOwnerRef),
|
||||
}
|
||||
|
||||
var objs []runtime.Object
|
||||
for _, node := range nodes {
|
||||
objs = append(objs, node)
|
||||
}
|
||||
for _, pod := range pods {
|
||||
objs = append(objs, pod)
|
||||
}
|
||||
fakeClient := fake.NewSimpleClientset(objs...)
|
||||
|
||||
handle, podEvictor, err := frameworktesting.InitFrameworkHandle(
|
||||
ctx, fakeClient, nil, defaultevictor.DefaultEvictorArgs{NodeFit: true}, nil,
|
||||
)
|
||||
if err != nil {
|
||||
t.Fatalf("Unable to initialize a framework handle: %v", err)
|
||||
}
|
||||
|
||||
// PrometheusClientImpl is intentionally nil here — the in-cluster SA token
|
||||
// has not been reconciled yet at plugin-creation time.
|
||||
args := &LowNodeUtilizationArgs{
|
||||
Thresholds: api.ResourceThresholds{MetricResource: 30},
|
||||
TargetThresholds: api.ResourceThresholds{MetricResource: 50},
|
||||
MetricsUtilization: &MetricsUtilization{
|
||||
Source: api.PrometheusMetrics,
|
||||
Prometheus: &Prometheus{
|
||||
Query: "instance:node_cpu:rate:sum",
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
plugin, err := NewLowNodeUtilization(ctx, args, handle)
|
||||
if err != nil {
|
||||
t.Fatalf("plugin creation must succeed even when prometheus client is nil: %v", err)
|
||||
}
|
||||
|
||||
// Simulate the SA token becoming available before the first descheduling cycle.
|
||||
handle.PrometheusClientImpl = &fakePromClient{
|
||||
result: model.Vector{
|
||||
sample("instance:node_cpu:rate:sum", n1NodeName, 0.57), // over target
|
||||
sample("instance:node_cpu:rate:sum", n2NodeName, 0.42), // over target
|
||||
sample("instance:node_cpu:rate:sum", n3NodeName, 0.20), // under threshold
|
||||
},
|
||||
dataType: model.ValVector,
|
||||
}
|
||||
|
||||
status := plugin.(frameworktypes.BalancePlugin).Balance(ctx, nodes)
|
||||
if status != nil {
|
||||
t.Fatalf("Balance failed: %v", status.Err)
|
||||
}
|
||||
|
||||
if podEvictor.TotalEvicted() == 0 {
|
||||
t.Error("expected at least one pod to be evicted")
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user