mirror of
https://github.com/open-cluster-management-io/ocm.git
synced 2026-02-14 18:09:57 +00:00
Add startTime initialization and wait 10s in hubTimeoutController (#1191)
Some checks failed
Scorecard supply-chain security / Scorecard analysis (push) Failing after 23s
Post / coverage (push) Failing after 29s
Post / images (amd64, addon-manager) (push) Failing after 27s
Post / images (amd64, placement) (push) Failing after 29s
Post / images (amd64, registration) (push) Failing after 21s
Post / images (amd64, registration-operator) (push) Failing after 20s
Post / images (amd64, work) (push) Failing after 23s
Post / images (arm64, addon-manager) (push) Failing after 26s
Post / images (arm64, placement) (push) Failing after 24s
Post / images (arm64, registration) (push) Failing after 19s
Post / images (arm64, registration-operator) (push) Failing after 26s
Post / images (arm64, work) (push) Failing after 33s
Post / image manifest (addon-manager) (push) Has been skipped
Post / image manifest (placement) (push) Has been skipped
Post / image manifest (registration) (push) Has been skipped
Post / image manifest (registration-operator) (push) Has been skipped
Post / image manifest (work) (push) Has been skipped
Post / trigger clusteradm e2e (push) Has been skipped
Close stale issues and PRs / stale (push) Failing after 46s
Some checks failed
Scorecard supply-chain security / Scorecard analysis (push) Failing after 23s
Post / coverage (push) Failing after 29s
Post / images (amd64, addon-manager) (push) Failing after 27s
Post / images (amd64, placement) (push) Failing after 29s
Post / images (amd64, registration) (push) Failing after 21s
Post / images (amd64, registration-operator) (push) Failing after 20s
Post / images (amd64, work) (push) Failing after 23s
Post / images (arm64, addon-manager) (push) Failing after 26s
Post / images (arm64, placement) (push) Failing after 24s
Post / images (arm64, registration) (push) Failing after 19s
Post / images (arm64, registration-operator) (push) Failing after 26s
Post / images (arm64, work) (push) Failing after 33s
Post / image manifest (addon-manager) (push) Has been skipped
Post / image manifest (placement) (push) Has been skipped
Post / image manifest (registration) (push) Has been skipped
Post / image manifest (registration-operator) (push) Has been skipped
Post / image manifest (work) (push) Has been skipped
Post / trigger clusteradm e2e (push) Has been skipped
Close stale issues and PRs / stale (push) Failing after 46s
Signed-off-by: xuezhaojun <zxue@redhat.com>
This commit is contained in:
@@ -21,6 +21,8 @@ type hubTimeoutController struct {
|
|||||||
timeoutSeconds int32
|
timeoutSeconds int32
|
||||||
lastLeaseRenewTime time.Time
|
lastLeaseRenewTime time.Time
|
||||||
handleTimeout func(ctx context.Context) error
|
handleTimeout func(ctx context.Context) error
|
||||||
|
|
||||||
|
startTime time.Time
|
||||||
}
|
}
|
||||||
|
|
||||||
func NewHubTimeoutController(
|
func NewHubTimeoutController(
|
||||||
@@ -35,6 +37,7 @@ func NewHubTimeoutController(
|
|||||||
timeoutSeconds: timeoutSeconds,
|
timeoutSeconds: timeoutSeconds,
|
||||||
handleTimeout: handleTimeout,
|
handleTimeout: handleTimeout,
|
||||||
leaseClient: leaseClient,
|
leaseClient: leaseClient,
|
||||||
|
startTime: time.Now(),
|
||||||
}
|
}
|
||||||
return factory.New().WithSync(c.sync).ResyncEvery(time.Minute).
|
return factory.New().WithSync(c.sync).ResyncEvery(time.Minute).
|
||||||
ToController("HubTimeoutController", recorder)
|
ToController("HubTimeoutController", recorder)
|
||||||
@@ -57,6 +60,25 @@ func (c *hubTimeoutController) sync(ctx context.Context, syncCtx factory.SyncCon
|
|||||||
c.lastLeaseRenewTime = lease.Spec.RenewTime.Time
|
c.lastLeaseRenewTime = lease.Spec.RenewTime.Time
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// If fewer than 10s have elapsed since `startTime` (set when the controller is created), skip the timeout check.
|
||||||
|
// This handles cases where old leases remain due to incomplete cleanup.
|
||||||
|
//
|
||||||
|
// Example scenario:
|
||||||
|
// 1. ManagedCluster-A is connected to Hub1 with an active lease
|
||||||
|
// 2. Hub1 unexpectedly fails (power outage) - no cleanup opportunity
|
||||||
|
// 3. ManagedCluster-A detects timeout and switches to Hub2
|
||||||
|
// 4. Hub1 comes back online with the old stale lease still present
|
||||||
|
// 5. ManagedCluster-A migrates back to Hub1 (which has the expired lease)
|
||||||
|
// 6. With this grace period, the lease controller gets time to update the lease
|
||||||
|
// before timeout checks begin, preventing false timeouts. Otherwise,
|
||||||
|
// timeout controller runs immediately and detects the stale lease as
|
||||||
|
// expired, triggering an unwanted timeout
|
||||||
|
//
|
||||||
|
// This also applies to migration scenarios where cleanup is incomplete.
|
||||||
|
if time.Since(c.startTime) < time.Second*10 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
if isTimeout(time.Now(), c.lastLeaseRenewTime, c.timeoutSeconds) {
|
if isTimeout(time.Now(), c.lastLeaseRenewTime, c.timeoutSeconds) {
|
||||||
logger.Info("Lease timeout", "cluster", c.clusterName, "lease", leaseName)
|
logger.Info("Lease timeout", "cluster", c.clusterName, "lease", leaseName)
|
||||||
err := c.handleTimeout(ctx)
|
err := c.handleTimeout(ctx)
|
||||||
|
|||||||
Reference in New Issue
Block a user