Add startTime initialization and wait 10s in hubTimeoutController (#1191)
Some checks failed
Scorecard supply-chain security / Scorecard analysis (push) Failing after 23s
Post / coverage (push) Failing after 29s
Post / images (amd64, addon-manager) (push) Failing after 27s
Post / images (amd64, placement) (push) Failing after 29s
Post / images (amd64, registration) (push) Failing after 21s
Post / images (amd64, registration-operator) (push) Failing after 20s
Post / images (amd64, work) (push) Failing after 23s
Post / images (arm64, addon-manager) (push) Failing after 26s
Post / images (arm64, placement) (push) Failing after 24s
Post / images (arm64, registration) (push) Failing after 19s
Post / images (arm64, registration-operator) (push) Failing after 26s
Post / images (arm64, work) (push) Failing after 33s
Post / image manifest (addon-manager) (push) Has been skipped
Post / image manifest (placement) (push) Has been skipped
Post / image manifest (registration) (push) Has been skipped
Post / image manifest (registration-operator) (push) Has been skipped
Post / image manifest (work) (push) Has been skipped
Post / trigger clusteradm e2e (push) Has been skipped
Close stale issues and PRs / stale (push) Failing after 46s

Signed-off-by: xuezhaojun <zxue@redhat.com>
This commit is contained in:
xuezhao
2025-09-23 15:26:48 +08:00
committed by GitHub
parent 2f04992d6c
commit 010f5efe6d

View File

@@ -21,6 +21,8 @@ type hubTimeoutController struct {
timeoutSeconds int32
lastLeaseRenewTime time.Time
handleTimeout func(ctx context.Context) error
startTime time.Time
}
func NewHubTimeoutController(
@@ -35,6 +37,7 @@ func NewHubTimeoutController(
timeoutSeconds: timeoutSeconds,
handleTimeout: handleTimeout,
leaseClient: leaseClient,
startTime: time.Now(),
}
return factory.New().WithSync(c.sync).ResyncEvery(time.Minute).
ToController("HubTimeoutController", recorder)
@@ -57,6 +60,25 @@ func (c *hubTimeoutController) sync(ctx context.Context, syncCtx factory.SyncCon
c.lastLeaseRenewTime = lease.Spec.RenewTime.Time
}
// If less than 10s have elapsed since `startTime`, skip the timeout check.
// This handles cases where old leases remain due to incomplete cleanup.
//
// Example scenario:
// 1. ManagedCluster-A is connected to Hub1 with an active lease
// 2. Hub1 unexpectedly fails (power outage) - no cleanup opportunity
// 3. ManagedCluster-A detects timeout and switches to Hub2
// 4. Hub1 comes back online with the old stale lease still present
// 5. ManagedCluster-A migrates back to Hub1 (which has the expired lease)
// 6. With this grace period, the lease controller gets time to update the
// lease before timeout checks begin, preventing false timeouts. Without it,
// the timeout controller runs immediately and detects the stale lease as
// expired, triggering an unwanted timeout.
//
// This also applies to migration scenarios where cleanup is incomplete.
if time.Since(c.startTime) < time.Second*10 {
return nil
}
if isTimeout(time.Now(), c.lastLeaseRenewTime, c.timeoutSeconds) {
logger.Info("Lease timeout", "cluster", c.clusterName, "lease", leaseName)
err := c.handleTimeout(ctx)