mirror of
https://github.com/open-cluster-management-io/ocm.git
synced 2026-02-14 18:09:57 +00:00
fix: ensure immediate eviction after grace period expires (#1330)
Some checks failed
Scorecard supply-chain security / Scorecard analysis (push) Failing after 1m3s
Post / images (amd64, addon-manager) (push) Failing after 7m31s
Post / coverage (push) Failing after 9m30s
Post / images (amd64, registration-operator) (push) Failing after 57s
Post / images (amd64, work) (push) Failing after 52s
Post / images (arm64, addon-manager) (push) Failing after 50s
Post / images (arm64, placement) (push) Failing after 52s
Post / images (arm64, registration) (push) Failing after 50s
Post / images (arm64, registration-operator) (push) Failing after 52s
Post / images (arm64, work) (push) Failing after 49s
Post / images (amd64, registration) (push) Failing after 7m6s
Post / images (amd64, placement) (push) Failing after 27m47s
Post / image manifest (addon-manager) (push) Has been cancelled
Post / image manifest (placement) (push) Has been cancelled
Post / image manifest (registration) (push) Has been cancelled
Post / image manifest (registration-operator) (push) Has been cancelled
Post / image manifest (work) (push) Has been cancelled
Post / trigger clusteradm e2e (push) Has been cancelled
Close stale issues and PRs / stale (push) Successful in 3s
Some checks failed
Scorecard supply-chain security / Scorecard analysis (push) Failing after 1m3s
Post / images (amd64, addon-manager) (push) Failing after 7m31s
Post / coverage (push) Failing after 9m30s
Post / images (amd64, registration-operator) (push) Failing after 57s
Post / images (amd64, work) (push) Failing after 52s
Post / images (arm64, addon-manager) (push) Failing after 50s
Post / images (arm64, placement) (push) Failing after 52s
Post / images (arm64, registration) (push) Failing after 50s
Post / images (arm64, registration-operator) (push) Failing after 52s
Post / images (arm64, work) (push) Failing after 49s
Post / images (amd64, registration) (push) Failing after 7m6s
Post / images (amd64, placement) (push) Failing after 27m47s
Post / image manifest (addon-manager) (push) Has been cancelled
Post / image manifest (placement) (push) Has been cancelled
Post / image manifest (registration) (push) Has been cancelled
Post / image manifest (registration-operator) (push) Has been cancelled
Post / image manifest (work) (push) Has been cancelled
Post / trigger clusteradm e2e (push) Has been cancelled
Close stale issues and PRs / stale (push) Successful in 3s
Fixed a bug where AppliedManifestWorks were not evicted immediately after the appliedmanifestwork-eviction-grace-period expired.

Root cause: The controller used an exponential backoff rate limiter to schedule requeue delays, which caused:
1. Exponentially increasing delays during the grace period (1min -> 2min -> 4min...)
2. Unpredictable delays after the grace period expired

Solution: Replace the rate limiter with a direct time calculation. Now the controller calculates the exact remaining time until eviction and schedules the next sync for that precise moment:

    remainingTime := evictionTime.Sub(now)

Changes:
- Removed rateLimiter field and workqueue import
- Calculate exact remaining time instead of using exponential backoff
- Added V(4) logging to show scheduled eviction time and remaining time
- Updated unit test expectations (queue length 0 for delayed items)

Impact: AppliedManifestWorks are now evicted immediately when the grace period expires, instead of being delayed by minutes due to exponential backoff.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Signed-off-by: zhujian <jiazhu@redhat.com>
Co-authored-by: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -10,7 +10,6 @@ import (
|
||||
"k8s.io/apimachinery/pkg/api/meta"
|
||||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
||||
"k8s.io/apimachinery/pkg/runtime"
|
||||
"k8s.io/client-go/util/workqueue"
|
||||
"k8s.io/klog/v2"
|
||||
|
||||
workv1client "open-cluster-management.io/api/client/work/clientset/versioned/typed/work/v1"
|
||||
@@ -41,7 +40,6 @@ type unmanagedAppliedWorkController struct {
|
||||
hubHash string
|
||||
agentID string
|
||||
evictionGracePeriod time.Duration
|
||||
rateLimiter workqueue.RateLimiter
|
||||
}
|
||||
|
||||
// NewUnManagedAppliedWorkController returns a controller to evict the unmanaged appliedmanifestworks.
|
||||
@@ -71,7 +69,6 @@ func NewUnManagedAppliedWorkController(
|
||||
hubHash: hubHash,
|
||||
agentID: agentID,
|
||||
evictionGracePeriod: evictionGracePeriod,
|
||||
rateLimiter: workqueue.NewItemExponentialFailureRateLimiter(1*time.Minute, evictionGracePeriod),
|
||||
}
|
||||
|
||||
return factory.New().
|
||||
@@ -141,8 +138,13 @@ func (m *unmanagedAppliedWorkController) evictAppliedManifestWork(ctx context.Co
|
||||
return m.patchEvictionStartTime(ctx, appliedManifestWork, &metav1.Time{Time: now})
|
||||
}
|
||||
|
||||
if now.Before(evictionStartTime.Add(m.evictionGracePeriod)) {
|
||||
controllerContext.Queue().AddAfter(appliedManifestWork.Name, m.rateLimiter.When(appliedManifestWork.Name))
|
||||
evictionTime := evictionStartTime.Add(m.evictionGracePeriod)
|
||||
if now.Before(evictionTime) {
|
||||
// Calculate the exact remaining time until eviction
|
||||
remainingTime := evictionTime.Sub(now)
|
||||
controllerContext.Queue().AddAfter(appliedManifestWork.Name, remainingTime)
|
||||
logger.V(4).Info("AppliedManifestWork scheduled for eviction",
|
||||
"evictionTime", evictionTime, "remainingTime", remainingTime)
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -160,7 +162,6 @@ func (m *unmanagedAppliedWorkController) stopToEvictAppliedManifestWork(
|
||||
return nil
|
||||
}
|
||||
|
||||
m.rateLimiter.Forget(appliedManifestWork.Name)
|
||||
return m.patchEvictionStartTime(ctx, appliedManifestWork, nil)
|
||||
}
|
||||
|
||||
|
||||
@@ -8,7 +8,6 @@ import (
|
||||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
||||
"k8s.io/apimachinery/pkg/runtime"
|
||||
clienttesting "k8s.io/client-go/testing"
|
||||
"k8s.io/client-go/util/workqueue"
|
||||
|
||||
fakeworkclient "open-cluster-management.io/api/client/work/clientset/versioned/fake"
|
||||
workinformers "open-cluster-management.io/api/client/work/informers/externalversions"
|
||||
@@ -204,7 +203,7 @@ func TestSyncUnamanagedAppliedWork(t *testing.T) {
|
||||
},
|
||||
},
|
||||
},
|
||||
expectedQueueLen: 1,
|
||||
expectedQueueLen: 0, // Item is added to delayed queue via AddAfter, not the main queue
|
||||
validateAppliedManifestWorkActions: testingcommon.AssertNoActions,
|
||||
},
|
||||
{
|
||||
@@ -283,7 +282,6 @@ func TestSyncUnamanagedAppliedWork(t *testing.T) {
|
||||
hubHash: c.hubHash,
|
||||
agentID: c.agentID,
|
||||
evictionGracePeriod: c.evictionGracePeriod,
|
||||
rateLimiter: workqueue.NewItemExponentialFailureRateLimiter(0, c.evictionGracePeriod),
|
||||
}
|
||||
|
||||
controllerContext := testingcommon.NewFakeSyncContext(t, c.appliedManifestWorkName)
|
||||
|
||||
Reference in New Issue
Block a user