From b43b6908d32941ddfec837afe70759550e1006cd Mon Sep 17 00:00:00 2001 From: Brett Tofel Date: Tue, 24 Feb 2026 16:12:05 -0500 Subject: [PATCH 1/6] Release lock before WaitForCacheSync in AddSecretEventHandler When many routes use spec.tls.externalCertificate, the router calls AddSecretEventHandler once per route at startup. Previously this held the global write lock for the entire duration of each call, including the WaitForCacheSync round-trip to the API server and etcd (~100-600ms per secret). This serialized all N secret registrations, causing startup time of O(N * api_latency): 112 secrets x 587ms median = 65s (healthy cluster) 1738 secrets x 1414ms avg = 40min (high etcd latency) Fix: write the new monitoredItem into s.monitors before releasing the lock (so concurrent callers for the same secret reuse the same informer), then release the lock before WaitForCacheSync. This allows concurrent AddSecretEventHandler calls for different secrets to perform their etcd round-trips in parallel, reducing startup time from O(N * api_latency) to O(api_latency). On sync failure, the map entry is cleaned up so a future caller can retry with a fresh informer. Fixes: OCPBUGS-77056 Signed-off-by: Brett Tofel --- pkg/secret/secret_monitor.go | 35 +++++++++++++++++++++++++++-------- 1 file changed, 27 insertions(+), 8 deletions(-) diff --git a/pkg/secret/secret_monitor.go b/pkg/secret/secret_monitor.go index ddccb0484f..244ea6269b 100644 --- a/pkg/secret/secret_monitor.go +++ b/pkg/secret/secret_monitor.go @@ -93,10 +93,12 @@ func (s *secretMonitor) createSecretInformer(namespace, name string) cache.Share } // addSecretEventHandler adds a secret event handler and starts the informer if not already running. +// +// The global write lock is released before waiting for the informer cache to sync, allowing +// concurrent calls for different secrets to proceed in parallel. This is critical for performance +// when many routes reference external certificate secrets, as registering N secrets serially +// (each requiring an API server round-trip to etcd) would take O(N * api_latency) time. func (s *secretMonitor) addSecretEventHandler(ctx context.Context, namespace, secretName string, handler cache.ResourceEventHandler, secretInformer cache.SharedInformer) (SecretEventHandlerRegistration, error) { - s.lock.Lock() - defer s.lock.Unlock() - if handler == nil { return nil, fmt.Errorf("nil handler is provided") } @@ -104,24 +106,41 @@ func (s *secretMonitor) addSecretEventHandler(ctx context.Context, namespace, se // secret identifier (namespace/secret) key := NewObjectKey(namespace, secretName) + s.lock.Lock() // Start secret informer if monitor does not exist. m, exists := s.monitors[key] if !exists { m = &monitoredItem{} m.itemMonitor = newSingleItemMonitor(key, secretInformer) m.itemMonitor.StartInformer(ctx) + // Register the monitor in the map immediately (before releasing the lock) so that + // concurrent registrations for the same secret reuse this informer rather than + // starting a duplicate. + s.monitors[key] = m + } + s.lock.Unlock() - // wait for first sync + // Wait for cache sync outside the lock so that concurrent registrations for different + // secrets can perform their API round-trips to etcd in parallel. + if !exists { if !cache.WaitForCacheSync(ctx.Done(), m.itemMonitor.HasSynced) { + // Sync failed (e.g. context cancelled). Clean up the map entry so a future + // caller can retry with a fresh informer. + s.lock.Lock() + // Only remove if it's still our entry (no other caller replaced it). + if s.monitors[key] == m { + m.itemMonitor.StopInformer() + delete(s.monitors, key) + } + s.lock.Unlock() return nil, fmt.Errorf("failed waiting for cache sync") } - - // add item key to monitors map - s.monitors[key] = m - klog.Info("secret informer started", " item key ", key) } + s.lock.Lock() + defer s.lock.Unlock() + // add the event handler registration, err := m.itemMonitor.AddEventHandler(handler) if err != nil { From 31113801f871ace73a81a49547a719706cecf36a Mon Sep 17 00:00:00 2001 From: Brett Tofel Date: Tue, 3 Mar 2026 18:33:04 -0500 Subject: [PATCH 2/6] Fix secret manager to allow concurrent registrations and non-blocking cache sync Signed-off-by: Brett Tofel --- pkg/route/secretmanager/manager.go | 46 +++++++++++++++++++++++------- pkg/secret/secret_monitor.go | 21 +++----------- 2 files changed, 39 insertions(+), 28 deletions(-) diff --git a/pkg/route/secretmanager/manager.go b/pkg/route/secretmanager/manager.go index a4a80ee591..78a1526098 100644 --- a/pkg/route/secretmanager/manager.go +++ b/pkg/route/secretmanager/manager.go @@ -61,7 +61,6 @@ func (m *manager) Queue() workqueue.RateLimitingInterface { // Returns an error if the route is already registered with a secret or if adding the secret event handler fails. func (m *manager) RegisterRoute(ctx context.Context, namespace, routeName, secretName string, handler cache.ResourceEventHandlerFuncs) error { m.handlersLock.Lock() - defer m.handlersLock.Unlock() // Generate a unique key for the provided namespace and routeName. key := generateKey(namespace, routeName) @@ -70,22 +69,43 @@ func (m *manager) RegisterRoute(ctx context.Context, namespace, routeName, secre // Each route (namespace/routeName) should be registered only once with any secret. // Note: inside a namespace multiple different routes can be registered(watch) with a common secret. if _, exists := m.registeredHandlers[key]; exists { + m.handlersLock.Unlock() return fmt.Errorf("route already registered with key %s", key) } - // Add a secret event handler for the specified namespace and secret, with the handler functions. - klog.V(5).Infof("trying to add handler for key %s with secret %s", key, secretName) - handlerReg, err := m.monitor.AddSecretEventHandler(ctx, namespace, secretName, handler) - if err != nil { - return err - } - - // Store the registration and secretName in the manager's map. Used during UnregisterRoute() and GetSecret(). + // Because adding the secret event handler can take O(latency) time when it starts an informer + // and waits for cache sync, we temporarily release the handlersLock. This permits concurrent + // registrations of other routes. + // We mark this key as tentatively registered using a nil registration + // so that concurrent attempts to register the same route fail immediately. m.registeredHandlers[key] = referencedSecret{ secretName: secretName, - handlerRegistration: handlerReg, + handlerRegistration: nil, // placeholder while syncing } - klog.Infof("secret manager registered route for key %s with secret %s", key, secretName) + m.handlersLock.Unlock() + + // Add a secret event handler for the specified namespace and secret, with the handler functions. + klog.V(5).Infof("trying to add handler for key %s with secret %s", key, secretName) + + go func() { + handlerReg, err := m.monitor.AddSecretEventHandler(ctx, namespace, secretName, handler) + + m.handlersLock.Lock() + defer m.handlersLock.Unlock() + + if err != nil { + delete(m.registeredHandlers, key) + klog.Errorf("failed to add secret event handler: %v", err) + return + } + + // Update only if it wasn't unregistered while we were syncing + if ref, exists := m.registeredHandlers[key]; exists && ref.secretName == secretName && ref.handlerRegistration == nil { + ref.handlerRegistration = handlerReg + m.registeredHandlers[key] = ref + klog.Infof("secret manager registered route for key %s with secret %s", key, secretName) + } + }() return nil } @@ -104,6 +124,10 @@ func (m *manager) UnregisterRoute(namespace, routeName string) error { return fmt.Errorf("no handler registered with key %s", key) } + if ref.handlerRegistration == nil { + return fmt.Errorf("route registration currently in progress for key %s", key) + } + // Remove the corresponding secret event handler from the secret monitor. klog.V(5).Info("trying to remove handler with key", key) err := m.monitor.RemoveSecretEventHandler(ref.handlerRegistration) diff --git a/pkg/secret/secret_monitor.go b/pkg/secret/secret_monitor.go index 244ea6269b..0d7339041b 100644 --- a/pkg/secret/secret_monitor.go +++ b/pkg/secret/secret_monitor.go @@ -120,21 +120,7 @@ func (s *secretMonitor) addSecretEventHandler(ctx context.Context, namespace, se } s.lock.Unlock() - // Wait for cache sync outside the lock so that concurrent registrations for different - // secrets can perform their API round-trips to etcd in parallel. if !exists { - if !cache.WaitForCacheSync(ctx.Done(), m.itemMonitor.HasSynced) { - // Sync failed (e.g. context cancelled). Clean up the map entry so a future - // caller can retry with a fresh informer. - s.lock.Lock() - // Only remove if it's still our entry (no other caller replaced it). - if s.monitors[key] == m { - m.itemMonitor.StopInformer() - delete(s.monitors, key) - } - s.lock.Unlock() - return nil, fmt.Errorf("failed waiting for cache sync") - } klog.Info("secret informer started", " item key ", key) } @@ -213,9 +199,10 @@ func (s *secretMonitor) GetSecret(ctx context.Context, handlerRegistration Secre return nil, fmt.Errorf("secret monitor doesn't exist for key %v", key) } - // wait for informer store sync, to load secrets - if !cache.WaitForCacheSync(ctx.Done(), handlerRegistration.HasSynced) { - return nil, fmt.Errorf("failed waiting for cache sync") + // do not wait for informer store sync. If it's not ready, return an error immediately + // so the caller can proceed asynchronously. + if !handlerRegistration.HasSynced() { + return nil, fmt.Errorf("secret cache not synced yet") } uncast, exists, err := m.itemMonitor.GetItem() From 5f20226e022f27d6ac1b5f40fe73eb152e5cc527 Mon Sep 17 00:00:00 2001 From: Brett Tofel Date: Mon, 16 Mar 2026 09:16:21 -0400 Subject: [PATCH 3/6] Fix tests for concurrent registrations and non-blocking cache sync --- pkg/route/secretmanager/manager_test.go | 34 ++++++++++++++++++++++++- pkg/secret/fake/fake_secret_monitor.go | 11 +++++++- pkg/secret/secret_monitor_test.go | 3 +++ 3 files changed, 46 insertions(+), 2 deletions(-) diff --git a/pkg/route/secretmanager/manager_test.go b/pkg/route/secretmanager/manager_test.go index 0f5f58c629..45b70a1b25 100644 --- a/pkg/route/secretmanager/manager_test.go +++ b/pkg/route/secretmanager/manager_test.go @@ -5,10 +5,12 @@ import ( "fmt" "reflect" "testing" + "time" "github.com/openshift/library-go/pkg/secret/fake" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/wait" "k8s.io/client-go/tools/cache" ) @@ -17,6 +19,20 @@ type routeSecret struct { secretName string } +func waitForRegistration(mgr *manager, key string) { + err := wait.PollImmediate(5*time.Millisecond, 2*time.Second, func() (bool, error) { + mgr.handlersLock.Lock() + defer mgr.handlersLock.Unlock() + if ref, exists := mgr.registeredHandlers[key]; !exists || ref.handlerRegistration != nil { + return true, nil + } + return false, nil + }) + if err != nil { + panic(fmt.Sprintf("timeout waiting for registration of %s", key)) + } +} + func TestRegisterRoute(t *testing.T) { namespace := "ns" @@ -80,7 +96,7 @@ func TestRegisterRoute(t *testing.T) { sm: fake.SecretMonitor{ Err: fmt.Errorf("some error"), }, - expectErr: 1, + expectErr: 0, }, } for _, s := range scenarios { @@ -96,9 +112,14 @@ func TestRegisterRoute(t *testing.T) { gotErr += 1 } } + for _, rs := range s.rs { + waitForRegistration(mgr, generateKey(namespace, rs.routeName)) + } if gotErr != s.expectErr { t.Fatalf("expected %d errors, got %d errors", s.expectErr, gotErr) } + mgr.handlersLock.Lock() + defer mgr.handlersLock.Unlock() if len(s.expectHandlersKeys) != len(mgr.registeredHandlers) { t.Fatalf("expected %d keys: %v, got %d keys: %v", len(s.expectHandlersKeys), s.expectHandlersKeys, len(mgr.registeredHandlers), mgr.registeredHandlers) } @@ -169,6 +190,9 @@ func TestUnregisterRoute(t *testing.T) { t.Fatalf("failed to register %v: %v", rs, err) } } + for _, rs := range s.register { + waitForRegistration(mgr, generateKey(namespace, rs.routeName)) + } // unregister mgr.monitor = &s.sm @@ -182,6 +206,8 @@ func TestUnregisterRoute(t *testing.T) { if gotErr != s.expectErr { t.Fatalf("expected %d errors, got %d errors", s.expectErr, gotErr) } + mgr.handlersLock.Lock() + defer mgr.handlersLock.Unlock() if len(s.expectHandlersKeys) != len(mgr.registeredHandlers) { t.Fatalf("expected %d keys: %v, got %d keys: %v", len(s.expectHandlersKeys), s.expectHandlersKeys, len(mgr.registeredHandlers), mgr.registeredHandlers) } @@ -245,6 +271,9 @@ func TestGetSecret(t *testing.T) { t.Fatalf("failed to register %v: %v", rs, err) } } + for _, rs := range s.register { + waitForRegistration(&mgr, generateKey(namespace, rs.routeName)) + } mgr.monitor = &s.sm gotSec, err := mgr.GetSecret(context.TODO(), namespace, routeName) @@ -295,6 +324,9 @@ func TestLookupRouteSecret(t *testing.T) { t.Fatalf("failed to register %v: %v", rs, err) } } + for _, rs := range s.register { + waitForRegistration(mgr, generateKey(namespace, rs.routeName)) + } secret, exist := mgr.LookupRouteSecret(namespace, routeName) diff --git a/pkg/secret/fake/fake_secret_monitor.go b/pkg/secret/fake/fake_secret_monitor.go index 563d73253d..b88becbbb9 100644 --- a/pkg/secret/fake/fake_secret_monitor.go +++ b/pkg/secret/fake/fake_secret_monitor.go @@ -13,8 +13,17 @@ type SecretMonitor struct { Secret *corev1.Secret } +type fakeRegistration struct{} + +func (fakeRegistration) HasSynced() bool { return true } +func (fakeRegistration) GetKey() secret.ObjectKey { return secret.ObjectKey{} } +func (fakeRegistration) GetHandler() cache.ResourceEventHandlerRegistration { return nil } + func (sm *SecretMonitor) AddSecretEventHandler(_ context.Context, _ string, _ string, _ cache.ResourceEventHandler) (secret.SecretEventHandlerRegistration, error) { - return nil, sm.Err + if sm.Err != nil { + return nil, sm.Err + } + return fakeRegistration{}, nil } func (sm *SecretMonitor) RemoveSecretEventHandler(_ secret.SecretEventHandlerRegistration) error { return sm.Err diff --git a/pkg/secret/secret_monitor_test.go b/pkg/secret/secret_monitor_test.go index 7120923508..8936680314 100644 --- a/pkg/secret/secret_monitor_test.go +++ b/pkg/secret/secret_monitor_test.go @@ -262,6 +262,9 @@ func TestGetSecret(t *testing.T) { if s.isNilHandlerReg { h = nil } + if h != nil { + cache.WaitForCacheSync(context.TODO().Done(), h.HasSynced) + } if s.isKeyRemoved { delete(sm.monitors, key) } From 200e1ee469633e77b23ec4bea16f2ce98506e74b Mon Sep 17 00:00:00 2001 From: Brett Tofel Date: Mon, 16 Mar 2026 11:47:01 -0400 Subject: [PATCH 4/6] Fix gofmt errors in fake_secret_monitor.go Signed-off-by: Brett Tofel --- pkg/secret/fake/fake_secret_monitor.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/secret/fake/fake_secret_monitor.go b/pkg/secret/fake/fake_secret_monitor.go index b88becbbb9..63a8654b43 100644 --- a/pkg/secret/fake/fake_secret_monitor.go +++ b/pkg/secret/fake/fake_secret_monitor.go @@ -15,8 +15,8 @@ type SecretMonitor struct { type fakeRegistration struct{} -func (fakeRegistration) HasSynced() bool { return true } -func (fakeRegistration) GetKey() secret.ObjectKey { return secret.ObjectKey{} } +func (fakeRegistration) HasSynced() bool { return true } +func (fakeRegistration) GetKey() secret.ObjectKey { return secret.ObjectKey{} } func (fakeRegistration) GetHandler() cache.ResourceEventHandlerRegistration { return nil } func (sm *SecretMonitor) AddSecretEventHandler(_ context.Context, _ string, _ string, _ cache.ResourceEventHandler) (secret.SecretEventHandlerRegistration, error) { From bff69eeafeb11c2c008659bec041aa8ef65a9d6e Mon Sep 17 00:00:00 2001 From: Brett Tofel Date: Thu, 26 Mar 2026 13:54:50 -0400 Subject: [PATCH 5/6] fix(secret_monitor): resolve async SAR race condition that prevented external cert serving --- pkg/route/secretmanager/manager.go | 39 +++++++++++++++--------------- pkg/secret/secret_monitor.go | 14 ++++++++--- 2 files changed, 30 insertions(+), 23 deletions(-) diff --git a/pkg/route/secretmanager/manager.go b/pkg/route/secretmanager/manager.go index 78a1526098..3cc96a4fde 100644 --- a/pkg/route/secretmanager/manager.go +++ b/pkg/route/secretmanager/manager.go @@ -85,27 +85,28 @@ func (m *manager) RegisterRoute(ctx context.Context, namespace, routeName, secre m.handlersLock.Unlock() // Add a secret event handler for the specified namespace and secret, with the handler functions. + // This call releases the monitor lock internally during WaitForCacheSync, allowing concurrent + // registrations for different secrets. However, it blocks until the cache is synced so that + // GetSecret works immediately after RegisterRoute returns. klog.V(5).Infof("trying to add handler for key %s with secret %s", key, secretName) - go func() { - handlerReg, err := m.monitor.AddSecretEventHandler(ctx, namespace, secretName, handler) - - m.handlersLock.Lock() - defer m.handlersLock.Unlock() - - if err != nil { - delete(m.registeredHandlers, key) - klog.Errorf("failed to add secret event handler: %v", err) - return - } - - // Update only if it wasn't unregistered while we were syncing - if ref, exists := m.registeredHandlers[key]; exists && ref.secretName == secretName && ref.handlerRegistration == nil { - ref.handlerRegistration = handlerReg - m.registeredHandlers[key] = ref - klog.Infof("secret manager registered route for key %s with secret %s", key, secretName) - } - }() + handlerReg, err := m.monitor.AddSecretEventHandler(ctx, namespace, secretName, handler) + + m.handlersLock.Lock() + + if err != nil { + delete(m.registeredHandlers, key) + m.handlersLock.Unlock() + return fmt.Errorf("failed to add secret event handler for key %s: %w", key, err) + } + + // Update only if it wasn't unregistered while we were syncing + if ref, exists := m.registeredHandlers[key]; exists && ref.secretName == secretName && ref.handlerRegistration == nil { + ref.handlerRegistration = handlerReg + m.registeredHandlers[key] = ref + klog.Infof("secret manager registered route for key %s with secret %s", key, secretName) + } + m.handlersLock.Unlock() return nil } diff --git a/pkg/secret/secret_monitor.go b/pkg/secret/secret_monitor.go index 0d7339041b..7c66899d57 100644 --- a/pkg/secret/secret_monitor.go +++ b/pkg/secret/secret_monitor.go @@ -124,6 +124,13 @@ func (s *secretMonitor) addSecretEventHandler(ctx context.Context, namespace, se klog.Info("secret informer started", " item key ", key) } + // Wait for the informer cache to sync before adding event handlers. + // This ensures GetSecret can retrieve secrets immediately after registration. + // The global lock is released above so other secrets can register concurrently. + if !cache.WaitForCacheSync(ctx.Done(), m.itemMonitor.HasSynced) { + return nil, fmt.Errorf("failed waiting for cache sync") + } + s.lock.Lock() defer s.lock.Unlock() @@ -199,10 +206,9 @@ func (s *secretMonitor) GetSecret(ctx context.Context, handlerRegistration Secre return nil, fmt.Errorf("secret monitor doesn't exist for key %v", key) } - // do not wait for informer store sync. If it's not ready, return an error immediately - // so the caller can proceed asynchronously. - if !handlerRegistration.HasSynced() { - return nil, fmt.Errorf("secret cache not synced yet") + // Wait for informer store sync to load secrets. + if !cache.WaitForCacheSync(ctx.Done(), handlerRegistration.HasSynced) { + return nil, fmt.Errorf("failed waiting for cache sync") } uncast, exists, err := m.itemMonitor.GetItem() From ef7716e77898cc7f1ccb0846b3a4fa6ee75365b3 Mon Sep 17 00:00:00 2001 From: Brett Tofel Date: Thu, 26 Mar 2026 17:21:56 -0400 Subject: [PATCH 6/6] test: update expectations for synchronous secret monitor event handler --- pkg/route/secretmanager/manager_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/route/secretmanager/manager_test.go b/pkg/route/secretmanager/manager_test.go index 45b70a1b25..6749abc94f 100644 --- a/pkg/route/secretmanager/manager_test.go +++ b/pkg/route/secretmanager/manager_test.go @@ -96,7 +96,7 @@ func TestRegisterRoute(t *testing.T) { sm: fake.SecretMonitor{ Err: fmt.Errorf("some error"), }, - expectErr: 0, + expectErr: 1, }, } for _, s := range scenarios {