mirror of
https://github.com/open-cluster-management-io/ocm.git
synced 2026-02-14 10:00:11 +00:00
Add startTime initialization and wait 10s in hubTimeoutController (#1191)
Some checks failed
Scorecard supply-chain security / Scorecard analysis (push) Failing after 23s
Post / coverage (push) Failing after 29s
Post / images (amd64, addon-manager) (push) Failing after 27s
Post / images (amd64, placement) (push) Failing after 29s
Post / images (amd64, registration) (push) Failing after 21s
Post / images (amd64, registration-operator) (push) Failing after 20s
Post / images (amd64, work) (push) Failing after 23s
Post / images (arm64, addon-manager) (push) Failing after 26s
Post / images (arm64, placement) (push) Failing after 24s
Post / images (arm64, registration) (push) Failing after 19s
Post / images (arm64, registration-operator) (push) Failing after 26s
Post / images (arm64, work) (push) Failing after 33s
Post / image manifest (addon-manager) (push) Has been skipped
Post / image manifest (placement) (push) Has been skipped
Post / image manifest (registration) (push) Has been skipped
Post / image manifest (registration-operator) (push) Has been skipped
Post / image manifest (work) (push) Has been skipped
Post / trigger clusteradm e2e (push) Has been skipped
Close stale issues and PRs / stale (push) Failing after 46s
Some checks failed
Scorecard supply-chain security / Scorecard analysis (push) Failing after 23s
Post / coverage (push) Failing after 29s
Post / images (amd64, addon-manager) (push) Failing after 27s
Post / images (amd64, placement) (push) Failing after 29s
Post / images (amd64, registration) (push) Failing after 21s
Post / images (amd64, registration-operator) (push) Failing after 20s
Post / images (amd64, work) (push) Failing after 23s
Post / images (arm64, addon-manager) (push) Failing after 26s
Post / images (arm64, placement) (push) Failing after 24s
Post / images (arm64, registration) (push) Failing after 19s
Post / images (arm64, registration-operator) (push) Failing after 26s
Post / images (arm64, work) (push) Failing after 33s
Post / image manifest (addon-manager) (push) Has been skipped
Post / image manifest (placement) (push) Has been skipped
Post / image manifest (registration) (push) Has been skipped
Post / image manifest (registration-operator) (push) Has been skipped
Post / image manifest (work) (push) Has been skipped
Post / trigger clusteradm e2e (push) Has been skipped
Close stale issues and PRs / stale (push) Failing after 46s
Signed-off-by: xuezhaojun <zxue@redhat.com>
This commit is contained in:
@@ -21,6 +21,8 @@ type hubTimeoutController struct {
|
||||
timeoutSeconds int32
|
||||
lastLeaseRenewTime time.Time
|
||||
handleTimeout func(ctx context.Context) error
|
||||
|
||||
startTime time.Time
|
||||
}
|
||||
|
||||
func NewHubTimeoutController(
|
||||
@@ -35,6 +37,7 @@ func NewHubTimeoutController(
|
||||
timeoutSeconds: timeoutSeconds,
|
||||
handleTimeout: handleTimeout,
|
||||
leaseClient: leaseClient,
|
||||
startTime: time.Now(),
|
||||
}
|
||||
return factory.New().WithSync(c.sync).ResyncEvery(time.Minute).
|
||||
ToController("HubTimeoutController", recorder)
|
||||
@@ -57,6 +60,25 @@ func (c *hubTimeoutController) sync(ctx context.Context, syncCtx factory.SyncCon
|
||||
c.lastLeaseRenewTime = lease.Spec.RenewTime.Time
|
||||
}
|
||||
|
||||
// If less than 10s have elapsed since `startTime`, skip the timeout check.
|
||||
// This handles cases where old leases remain due to incomplete cleanup.
|
||||
//
|
||||
// Example scenario:
|
||||
// 1. ManagedCluster-A is connected to Hub1 with an active lease
|
||||
// 2. Hub1 unexpectedly fails (power outage) - no cleanup opportunity
|
||||
// 3. ManagedCluster-A detects timeout and switches to Hub2
|
||||
// 4. Hub1 comes back online with the old stale lease still present
|
||||
// 5. ManagedCluster-A migrates back to Hub1 (which has the expired lease)
|
||||
// 6. With this grace period: lease controller gets time to update the lease
|
||||
// before timeout checks begin, preventing false timeouts. Otherwise,
|
||||
// timeout controller runs immediately and detects the stale lease as
|
||||
// expired, triggering an unwanted timeout.
|
||||
//
|
||||
// This also applies to migration scenarios where cleanup is incomplete.
|
||||
if time.Since(c.startTime) < time.Second*10 {
|
||||
return nil
|
||||
}
|
||||
|
||||
if isTimeout(time.Now(), c.lastLeaseRenewTime, c.timeoutSeconds) {
|
||||
logger.Info("Lease timeout", "cluster", c.clusterName, "lease", leaseName)
|
||||
err := c.handleTimeout(ctx)
|
||||
|
||||
Reference in New Issue
Block a user