Fix race condition: wait for CA bundle ConfigMap before applying CRDs (#1309)
Some checks failed
Scorecard supply-chain security / Scorecard analysis (push) Failing after 14s
Post / images (amd64, addon-manager) (push) Failing after 7m59s
Post / coverage (push) Failing after 8m58s
Post / images (amd64, registration) (push) Failing after 52s
Post / images (amd64, registration-operator) (push) Failing after 50s
Post / images (amd64, work) (push) Failing after 48s
Post / images (arm64, placement) (push) Failing after 48s
Post / images (arm64, registration) (push) Failing after 47s
Post / images (arm64, registration-operator) (push) Failing after 46s
Post / images (arm64, work) (push) Failing after 45s
Post / images (amd64, placement) (push) Failing after 7m34s
Post / images (arm64, addon-manager) (push) Failing after 9m56s
Post / image manifest (addon-manager) (push) Has been skipped
Post / image manifest (placement) (push) Has been skipped
Post / image manifest (registration) (push) Has been skipped
Post / image manifest (registration-operator) (push) Has been skipped
Post / image manifest (work) (push) Has been skipped
Post / trigger clusteradm e2e (push) Has been skipped
Close stale issues and PRs / stale (push) Successful in 1m3s

The cluster manager controller was silently using a literal "placeholder"
string as the CA bundle when the ca-bundle-configmap ConfigMap didn't exist
yet. This caused CRDs to be created with an invalid caBundle field
(cGxhY2Vob2xkZXI= which is base64 of "placeholder"), resulting in:

1. CRD conversion webhooks failing with "InvalidCABundle" error
2. CRDs not becoming Established
3. API endpoints not being registered
4. Dependent components (like MultiClusterHub) failing with:
   "no matches for kind ClusterManagementAddOn"

The bug was a race condition between the cert rotation controller (which
creates the ca-bundle-configmap) and the cluster manager controller (which
reads it). When the ConfigMap was not found, the code did "// do nothing"
and silently continued with the placeholder value.

This fix:
1. Creates the hub namespace FIRST (before waiting for the CA bundle)
   to allow the cert rotation controller to create the ca-bundle-configmap
2. Then waits for the CA bundle ConfigMap to exist before proceeding
3. Requeues via AddAfter if the ConfigMap is not found, allowing the
   controller to gracefully retry until the cert rotation controller
   has created it

This ensures CRDs are always created with valid CA bundles while avoiding
the deadlock where clusterManagerController waited for CA bundle but
certRotationController needed the namespace first.

Changes based on review feedback:
- Use requeue (AddAfter) instead of returning error (@elgnay)
- Use contextual logging instead of klog.V(4).Infof (@qiujian16)

The issue was discovered in OpenShift CI Prow jobs for ZTP hub deployment:
- https://prow.ci.openshift.org/view/gs/test-platform-results/logs/periodic-ci-openshift-kni-eco-ci-cd-ztp-left-shifting-kpi-ci-4.21-telcov10n-virtualised-single-node-hub-ztp/2005051399989104640
- https://prow.ci.openshift.org/view/gs/test-platform-results/logs/periodic-ci-openshift-kni-eco-ci-cd-ztp-left-shifting-kpi-ci-4.21-telcov10n-virtualised-single-node-hub-ztp/2005219283428184064

Affected versions: ACM 2.16.0-113/114, MCE 2.11.0-142/143 on OCP 4.21.0-rc.0

Signed-off-by: Carlos Cardenosa <ccardeno@redhat.com>
Co-authored-by: Claude <noreply@anthropic.com>
This commit is contained in:
Carlos Cardeñosa
2026-01-07 15:35:55 +01:00
committed by GitHub
parent 46de05b285
commit 1b40e72e0b
2 changed files with 132 additions and 11 deletions

View File

@@ -4,12 +4,14 @@ import (
"context"
"encoding/base64"
errorhelpers "errors"
"fmt"
"os"
"strings"
"time"
"github.com/openshift/library-go/pkg/assets"
"github.com/openshift/library-go/pkg/operator/resource/resourceapply"
corev1 "k8s.io/api/core/v1"
apiextensionsclient "k8s.io/apiextensions-apiserver/pkg/client/clientset/clientset"
"k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/api/meta"
@@ -281,18 +283,32 @@ func (n *clusterManagerController) sync(ctx context.Context, controllerContext f
return n.patcher.RemoveFinalizer(ctx, clusterManager, clusterManagerFinalizer)
}
// get caBundle
caBundle := "placeholder"
configmap, err := n.configMapLister.ConfigMaps(clusterManagerNamespace).Get(helpers.CaBundleConfigmap)
switch {
case errors.IsNotFound(err):
// do nothing
case err != nil:
// Create the hub namespace FIRST, before waiting for the CA bundle.
// This is required because the cert rotation controller needs the namespace to exist
// before it can create the ca-bundle-configmap. If we wait for the CA bundle before
// creating the namespace, we would have a deadlock.
if err := ensureNamespace(ctx, hubClient, clusterManagerNamespace); err != nil {
return err
default:
if cb := configmap.Data["ca-bundle.crt"]; len(cb) > 0 {
caBundle = cb
}
// get caBundle from the ConfigMap created by the cert rotation controller.
// The cert rotation controller must create this ConfigMap before we can proceed,
// otherwise the CRDs will be created with an invalid "placeholder" CA bundle,
// causing webhook conversion to fail and the CRDs to not become Established.
configmap, err := n.configMapLister.ConfigMaps(clusterManagerNamespace).Get(helpers.CaBundleConfigmap)
if err != nil {
if errors.IsNotFound(err) {
logger.V(4).Info("CA bundle configmap not yet available, will retry",
"namespace", clusterManagerNamespace, "configmap", helpers.CaBundleConfigmap)
controllerContext.Queue().AddAfter(clusterManagerName, 3*time.Second)
return nil
}
return err
}
caBundle := configmap.Data["ca-bundle.crt"]
if len(caBundle) == 0 {
return fmt.Errorf("CA bundle configmap %s/%s exists but ca-bundle.crt is empty",
clusterManagerNamespace, helpers.CaBundleConfigmap)
}
encodedCaBundle := base64.StdEncoding.EncodeToString([]byte(caBundle))
config.RegistrationAPIServiceCABundle = encodedCaBundle
@@ -385,6 +401,33 @@ func ensureSAKubeconfigs(ctx context.Context, clusterManagerName, clusterManager
return nil
}
// ensureNamespace creates the namespace if it doesn't exist.
// This is used to create the hub namespace before waiting for the CA bundle,
// so that the cert rotation controller can create the ca-bundle-configmap.
func ensureNamespace(ctx context.Context, kubeClient kubernetes.Interface, namespace string) error {
_, err := kubeClient.CoreV1().Namespaces().Get(ctx, namespace, metav1.GetOptions{})
if err == nil {
// Namespace already exists
return nil
}
if !errors.IsNotFound(err) {
return err
}
// Create the namespace
ns := &corev1.Namespace{
ObjectMeta: metav1.ObjectMeta{
Name: namespace,
},
}
_, err = kubeClient.CoreV1().Namespaces().Create(ctx, ns, metav1.CreateOptions{})
if err != nil && !errors.IsAlreadyExists(err) {
return err
}
logger := klog.FromContext(ctx)
logger.V(4).Info("Created namespace for cert rotation controller", "namespace", namespace)
return nil
}
// TODO: support IPV6 address
func isIPFormat(address string) bool {
runes := []rune(address)

View File

@@ -81,7 +81,23 @@ func newClusterManager(name string) *operatorapiv1.ClusterManager {
}
}
func newTestController(t *testing.T, clustermanager *operatorapiv1.ClusterManager) *testController {
// newCaBundleConfigMap creates the CA bundle ConfigMap that the cert rotation controller creates.
// This is required for the cluster manager controller to proceed with deploying CRDs.
func newCaBundleConfigMap(namespace string) *corev1.ConfigMap {
return &corev1.ConfigMap{
ObjectMeta: metav1.ObjectMeta{
Name: helpers.CaBundleConfigmap,
Namespace: namespace,
},
Data: map[string]string{
"ca-bundle.crt": "-----BEGIN CERTIFICATE-----\ntest-ca-bundle\n-----END CERTIFICATE-----",
},
}
}
// newTestControllerWithoutCaBundle creates a test controller without the CA bundle ConfigMap
// to test the behavior when the cert rotation controller hasn't created it yet.
func newTestControllerWithoutCaBundle(t *testing.T, clustermanager *operatorapiv1.ClusterManager) *testController {
kubeClient := fakekube.NewSimpleClientset()
kubeInfomers := kubeinformers.NewSharedInformerFactory(kubeClient, 5*time.Minute)
fakeOperatorClient := fakeoperatorlient.NewSimpleClientset(clustermanager)
@@ -107,6 +123,44 @@ func newTestController(t *testing.T, clustermanager *operatorapiv1.ClusterManage
}
}
// newTestController creates a test controller with all required dependencies including
// the CA bundle ConfigMap. This simulates the normal state where the cert rotation
// controller has already created the CA bundle ConfigMap.
func newTestController(t *testing.T, clustermanager *operatorapiv1.ClusterManager) *testController {
clusterManagerNamespace := helpers.ClusterManagerNamespace(clustermanager.Name, clustermanager.Spec.DeployOption.Mode)
caBundleConfigMap := newCaBundleConfigMap(clusterManagerNamespace)
kubeClient := fakekube.NewSimpleClientset(caBundleConfigMap)
kubeInfomers := kubeinformers.NewSharedInformerFactory(kubeClient, 5*time.Minute)
fakeOperatorClient := fakeoperatorlient.NewSimpleClientset(clustermanager)
operatorInformers := operatorinformers.NewSharedInformerFactory(fakeOperatorClient, 5*time.Minute)
clusterManagerController := &clusterManagerController{
patcher: patcher.NewPatcher[
*operatorapiv1.ClusterManager, operatorapiv1.ClusterManagerSpec, operatorapiv1.ClusterManagerStatus](
fakeOperatorClient.OperatorV1().ClusterManagers()),
clusterManagerLister: operatorInformers.Operator().V1().ClusterManagers().Lister(),
configMapLister: kubeInfomers.Core().V1().ConfigMaps().Lister(),
cache: resourceapply.NewResourceCache(),
}
store := operatorInformers.Operator().V1().ClusterManagers().Informer().GetStore()
if err := store.Add(clustermanager); err != nil {
t.Fatal(err)
}
// Add the CA bundle ConfigMap to the informer store
configMapStore := kubeInfomers.Core().V1().ConfigMaps().Informer().GetStore()
if err := configMapStore.Add(caBundleConfigMap); err != nil {
t.Fatal(err)
}
return &testController{
clusterManagerController: clusterManagerController,
operatorClient: fakeOperatorClient,
}
}
func setDeployment(clusterManagerName, clusterManagerNamespace string) []runtime.Object {
var replicas = int32(1)
@@ -629,6 +683,30 @@ func TestSyncDeployNoWebhook(t *testing.T) {
testingcommon.AssertEqualNumber(t, len(createCRDObjects), 12)
}
// TestSyncDeployWithoutCaBundle tests that sync requeues when the CA bundle ConfigMap
// doesn't exist. This ensures the controller waits for the cert rotation controller to create
// the ConfigMap before proceeding, preventing CRDs from being created with invalid "placeholder"
// CA bundles that would cause webhook conversion to fail.
func TestSyncDeployWithoutCaBundle(t *testing.T) {
clusterManager := newClusterManager("testhub")
tc := newTestControllerWithoutCaBundle(t, clusterManager)
clusterManagerNamespace := helpers.ClusterManagerNamespace(clusterManager.Name, clusterManager.Spec.DeployOption.Mode)
cd := setDeployment(clusterManager.Name, clusterManagerNamespace)
setup(t, tc, cd)
syncContext := testingcommon.NewFakeSyncContext(t, "testhub")
err := tc.clusterManagerController.sync(ctx, syncContext, "testhub")
// sync should return nil (requeue via AddAfter) instead of an error
if err != nil {
t.Fatalf("Expected no error when CA bundle ConfigMap is missing (should requeue via AddAfter), but got: %v", err)
}
// Note: We can't easily verify the requeue here because AddAfter adds items with a delay.
// The important thing is that sync returns nil (indicating graceful requeue) instead of
// proceeding with an invalid CA bundle or returning an error.
}
// TestSyncDelete test cleanup hub deploy
func TestSyncDelete(t *testing.T) {
clusterManager := newClusterManager("testhub")