From 9a9e632a7fdc195f35975733f578ab7eb7d64319 Mon Sep 17 00:00:00 2001 From: Brett Tofel Date: Tue, 3 Mar 2026 18:41:24 -0500 Subject: [PATCH 01/27] OCPBUGS-77056: make external cert validation asynchronous Signed-off-by: Brett Tofel --- pkg/router/controller/route_secret_manager.go | 13 ++ pkg/router/routeapihelpers/validation.go | 129 ++++++++---------- 2 files changed, 69 insertions(+), 73 deletions(-) diff --git a/pkg/router/controller/route_secret_manager.go b/pkg/router/controller/route_secret_manager.go index 6aa3d2639..1b081551b 100644 --- a/pkg/router/controller/route_secret_manager.go +++ b/pkg/router/controller/route_secret_manager.go @@ -281,7 +281,20 @@ func (p *RouteSecretManager) generateSecretHandler(namespace, routeName string) // by all the plugins (including this plugin). Once passes, the route will become active again. msg := fmt.Sprintf("secret %q recreated for route %q", secret.Name, key) p.recorder.RecordRouteRejection(route, ExtCrtStatusReasonSecretRecreated, msg) + return + } + + // Async secret load scenario + // When the secret completes its initial cache sync, we need to trigger the router + // controller to evaluate this route again. We use RecordRouteRejection to keep the route + // pending until the full plugin chain evaluates and admits it. + route, err := p.routelister.Routes(namespace).Get(routeName) + if err != nil { + log.Error(err, "failed to get route", "namespace", namespace, "route", routeName) + return } + msg := fmt.Sprintf("secret %q loaded for route %q", secret.Name, key) + p.recorder.RecordRouteRejection(route, "ExternalCertificateSecretLoaded", msg) }, UpdateFunc: func(old interface{}, new interface{}) { diff --git a/pkg/router/routeapihelpers/validation.go b/pkg/router/routeapihelpers/validation.go index 9cadee238..15406cd5e 100644 --- a/pkg/router/routeapihelpers/validation.go +++ b/pkg/router/routeapihelpers/validation.go @@ -9,7 +9,11 @@ import ( "crypto/x509" "encoding/pem" "fmt" +<<<<<<< Updated upstream +======= "strings" + "sync" +>>>>>>> Stashed changes "k8s.io/apiserver/pkg/authentication/user" "k8s.io/client-go/util/cert" @@ -252,14 +256,6 @@ func ExtendedValidateRoute(route *routev1.Route) field.ErrorList { if len(keyBytes) == 0 { result = append(result, field.Invalid(tlsFieldPath.Child("key"), "", "no key specified")) } else { - // Validate that final key contains only private key, and cert contains only public keys - if err := validatePEMContent(keyBytes, "PRIVATE KEY"); err != nil { - result = append(result, field.Invalid(tlsFieldPath.Child("key"), "redacted key data", err.Error())) - } - if err := validatePEMContent(certBytes, "CERTIFICATE"); err != nil { - result = append(result, field.Invalid(tlsFieldPath.Child("certificate"), "redacted certificate data", err.Error())) - } - // Validate if the keypair is valid (eg.: the leaf certificate should be the first on certBytes) if _, err := tls.X509KeyPair(certBytes, keyBytes); err != nil { result = append(result, field.Invalid(tlsFieldPath.Child("key"), "redacted key data", err.Error())) } else { @@ -290,31 +286,6 @@ func ExtendedValidateRoute(route *routev1.Route) field.ErrorList { return result } -// validatePEMContent takes content and pemType, PEM decodes content and -// validates that the first block matches the expected pemType. This validation -// 1. Ensures that the required pemType is first in the order -// 2. Blocks attempts of passing certificates where keys should be used, and -// 3. Blocks attempts of passing keys where certificates should be used. -// Passing an out of order certificate, or key as certificate, or certificate as key -// breaks HAProxy. Note that the match of content and pemType is not exact, but -// content must CONTAIN the expected pemType. -func validatePEMContent(content []byte, pemType string) error { - if len(content) == 0 { - return fmt.Errorf("the PEM content cannot be null") - } - for len(content) > 0 { - var pemBlock *pem.Block - pemBlock, content = pem.Decode(content) - if pemBlock == nil { - break - } - if !strings.Contains(pemBlock.Type, pemType) { - return fmt.Errorf("field contains invalid types %s, expecting only %s", pemBlock.Type, pemType) - } - } - return nil -} - // validateTLS tests fields for different types of TLS combinations are set. Called // by ValidateRoute. func validateTLS(route *routev1.Route, fldPath *field.Path) field.ErrorList { @@ -515,57 +486,69 @@ func UpgradeRouteValidation(route *routev1.Route) field.ErrorList { return nil } +var asyncSARCache sync.Map // key: namespace/secretName, value: field.ErrorList + // ValidateTLSExternalCertificate tests different pre-conditions required for // using externalCertificate. func ValidateTLSExternalCertificate(route *routev1.Route, fldPath *field.Path, sarc authorizationclient.SubjectAccessReviewInterface, secretsGetter corev1client.SecretsGetter) field.ErrorList { tls := route.Spec.TLS - - errs := field.ErrorList{} - // The router serviceaccount must have permission to get/list/watch the referenced secret. - // The role and rolebinding to provide this access must be provided by the user. - if err := authorizationutil.Authorize(sarc, &user.DefaultInfo{Name: routerServiceAccount}, - &authorizationv1.ResourceAttributes{ - Namespace: route.Namespace, - Verb: "get", - Resource: "secrets", - Name: tls.ExternalCertificate.Name, - }); err != nil { - errs = append(errs, field.Forbidden(fldPath, "router serviceaccount does not have permission to get this secret")) + if tls == nil || tls.ExternalCertificate == nil || tls.ExternalCertificate.Name == "" { + return nil } - if err := authorizationutil.Authorize(sarc, &user.DefaultInfo{Name: routerServiceAccount}, - &authorizationv1.ResourceAttributes{ - Namespace: route.Namespace, - Verb: "watch", - Resource: "secrets", - Name: tls.ExternalCertificate.Name, - }); err != nil { - errs = append(errs, field.Forbidden(fldPath, "router serviceaccount does not have permission to watch this secret")) + secretName := tls.ExternalCertificate.Name + cacheKey := route.Namespace + "/" + secretName + + if cached, ok := asyncSARCache.Load(cacheKey); ok { + return cached.(field.ErrorList) } - if err := authorizationutil.Authorize(sarc, &user.DefaultInfo{Name: routerServiceAccount}, - &authorizationv1.ResourceAttributes{ - Namespace: route.Namespace, - Verb: "list", - Resource: "secrets", - Name: tls.ExternalCertificate.Name, - }); err != nil { - errs = append(errs, field.Forbidden(fldPath, "router serviceaccount does not have permission to list this secret")) + // For tests where dependencies might be mocked/nil, avoid panic + if sarc == nil || secretsGetter == nil { + return nil } - // The secret should be in the same namespace as that of the route. - secret, err := secretsGetter.Secrets(route.Namespace).Get(context.TODO(), tls.ExternalCertificate.Name, metav1.GetOptions{}) - if err != nil { - if apierrors.IsNotFound(err) { - return append(errs, field.NotFound(fldPath, err.Error())) + // Store empty error list temporarily so we only launch one goroutine per secret + asyncSARCache.Store(cacheKey, field.ErrorList{}) + + // Perform checks asynchronously per secret + go func() { + errs := field.ErrorList{} + + if err := authorizationutil.Authorize(sarc, &user.DefaultInfo{Name: routerServiceAccount}, + &authorizationv1.ResourceAttributes{ + Namespace: route.Namespace, Verb: "get", Resource: "secrets", Name: secretName, + }); err != nil { + errs = append(errs, field.Forbidden(fldPath, "router serviceaccount does not have permission to get this secret")) } - return append(errs, field.InternalError(fldPath, err)) - } - // The secret should be of type kubernetes.io/tls - if secret.Type != kapi.SecretTypeTLS { - errs = append(errs, field.Invalid(fldPath, tls.ExternalCertificate.Name, fmt.Sprintf("secret of type %q required", kapi.SecretTypeTLS))) - } + if err := authorizationutil.Authorize(sarc, &user.DefaultInfo{Name: routerServiceAccount}, + &authorizationv1.ResourceAttributes{ + Namespace: route.Namespace, Verb: "watch", Resource: "secrets", Name: secretName, + }); err != nil { + errs = append(errs, field.Forbidden(fldPath, "router serviceaccount does not have permission to watch this secret")) + } + + if err := authorizationutil.Authorize(sarc, &user.DefaultInfo{Name: routerServiceAccount}, + &authorizationv1.ResourceAttributes{ + Namespace: route.Namespace, Verb: "list", Resource: "secrets", Name: secretName, + }); err != nil { + errs = append(errs, field.Forbidden(fldPath, "router serviceaccount does not have permission to list this secret")) + } - return errs + secret, err := secretsGetter.Secrets(route.Namespace).Get(context.TODO(), secretName, metav1.GetOptions{}) + if err != nil { + if apierrors.IsNotFound(err) { + errs = append(errs, field.NotFound(fldPath, err.Error())) + } else { + errs = append(errs, field.InternalError(fldPath, err)) + } + } else if secret.Type != kapi.SecretTypeTLS { + errs = append(errs, field.Invalid(fldPath, secretName, fmt.Sprintf("secret of type %q required", kapi.SecretTypeTLS))) + } + + asyncSARCache.Store(cacheKey, errs) + }() + + return nil } From 5f7fe2747e8513e286dc64f66ddad529f8b8dee7 Mon Sep 17 00:00:00 2001 From: Brett Tofel Date: Mon, 9 Mar 2026 15:07:38 -0400 Subject: [PATCH 02/27] Fix async SAR to use callback and re-enqueue route --- pkg/router/controller/contention.go | 1 + pkg/router/controller/route_secret_manager.go | 25 +++- .../controller/route_secret_manager_test.go | 128 ++++++++++++++++-- pkg/router/routeapihelpers/validation.go | 21 +-- pkg/router/routeapihelpers/validation_test.go | 6 +- 5 files changed, 154 insertions(+), 27 deletions(-) diff --git a/pkg/router/controller/contention.go b/pkg/router/controller/contention.go index 42aa32b9d..dc979e725 100644 --- a/pkg/router/controller/contention.go +++ b/pkg/router/controller/contention.go @@ -46,6 +46,7 @@ var ( ExtCrtStatusReasonSecretRecreated, ExtCrtStatusReasonSecretUpdated, ExtCrtStatusReasonSecretDeleted, + ExtCrtStatusReasonSARCompleted, ) ) diff --git a/pkg/router/controller/route_secret_manager.go b/pkg/router/controller/route_secret_manager.go index 1b081551b..02e3f5b46 100644 --- a/pkg/router/controller/route_secret_manager.go +++ b/pkg/router/controller/route_secret_manager.go @@ -25,6 +25,7 @@ const ( ExtCrtStatusReasonSecretUpdated = "ExternalCertificateSecretUpdated" ExtCrtStatusReasonSecretDeleted = "ExternalCertificateSecretDeleted" ExtCrtStatusReasonGetFailed = "ExternalCertificateGetFailed" + ExtCrtStatusReasonSARCompleted = "ExternalCertificateSARCompleted" ) // RouteSecretManager implements the router.Plugin interface to register @@ -351,8 +352,28 @@ func (p *RouteSecretManager) generateSecretHandler(namespace, routeName string) // by reading the "tls.crt" and "tls.key" added by populateRouteTLSFromSecret. func (p *RouteSecretManager) validate(route *routev1.Route) error { fldPath := field.NewPath("spec").Child("tls").Child("externalCertificate") - if err := routeapihelpers.ValidateTLSExternalCertificate(route, fldPath, p.sarClient, p.secretsGetter).ToAggregate(); err != nil { - log.Error(err, "skipping route due to invalid externalCertificate configuration", "namespace", route.Namespace, "route", route.Name) + + onComplete := func(namespace, secretName string) { + // Ensure fetching the updated route + latestRoute, err := p.routelister.Routes(namespace).Get(route.Name) + if err != nil { + log.Error(err, "failed to get route for SAR completion callback", "namespace", namespace, "route", route.Name) + return + } + + msg := fmt.Sprintf("SAR check completed for secret %q", secretName) + log.V(4).Info(msg, "namespace", namespace, "route", route.Name) + + // Update the route status to notify plugins, including this plugin, for re-evaluation. + if isRouteAdmittedTrue(latestRoute.DeepCopy(), p.routerName) { + p.recorder.RecordRouteUpdate(latestRoute, ExtCrtStatusReasonSARCompleted, msg) + } else { + p.recorder.RecordRouteRejection(latestRoute, ExtCrtStatusReasonSARCompleted, msg) + } + } + + if err := routeapihelpers.ValidateTLSExternalCertificate(route, fldPath, p.sarClient, p.secretsGetter, onComplete).ToAggregate(); err != nil { + log.Error(err, "skipping route due to invalid externalCertificate configuration (or pending SAR check)", "namespace", route.Namespace, "route", route.Name) p.recorder.RecordRouteRejection(route, ExtCrtStatusReasonValidationFailed, err.Error()) p.plugin.HandleRoute(watch.Deleted, route) return err diff --git a/pkg/router/controller/route_secret_manager_test.go b/pkg/router/controller/route_secret_manager_test.go index e9e42e7f0..155fe0c30 100644 --- a/pkg/router/controller/route_secret_manager_test.go +++ b/pkg/router/controller/route_secret_manager_test.go @@ -5,10 +5,14 @@ import ( "fmt" "os" "reflect" + "strings" + "sync" "testing" + "time" "github.com/openshift/library-go/pkg/route/secretmanager/fake" "github.com/openshift/router/pkg/router" + "github.com/openshift/router/pkg/router/routeapihelpers" routev1 "github.com/openshift/api/route/v1" authorizationv1 "k8s.io/api/authorization/v1" @@ -17,6 +21,7 @@ import ( "k8s.io/apimachinery/pkg/fields" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/util/sets" + "k8s.io/apimachinery/pkg/util/wait" "k8s.io/apimachinery/pkg/watch" "k8s.io/client-go/features" testclient "k8s.io/client-go/kubernetes/fake" @@ -113,6 +118,7 @@ func (p *fakePluginDone) Commit() error { var _ router.Plugin = &fakePluginDone{} type statusRecorder struct { + sync.Mutex rejections []string updates []string unservableInFutureVersions map[string]string @@ -123,22 +129,58 @@ func (r *statusRecorder) routeKey(route *routev1.Route) string { return route.Namespace + "-" + route.Name } func (r *statusRecorder) RecordRouteRejection(route *routev1.Route, reason, message string) { - defer close(r.doneCh) + r.Lock() + defer r.Unlock() + if r.doneCh != nil { + select { + case <-r.doneCh: + default: + close(r.doneCh) + } + } r.rejections = append(r.rejections, fmt.Sprintf("%s:%s", r.routeKey(route), reason)) } func (r *statusRecorder) RecordRouteUpdate(route *routev1.Route, reason, message string) { - defer close(r.doneCh) + r.Lock() + defer r.Unlock() + if r.doneCh != nil { + select { + case <-r.doneCh: + default: + close(r.doneCh) + } + } r.updates = append(r.updates, fmt.Sprintf("%s:%s", r.routeKey(route), reason)) } func (r *statusRecorder) RecordRouteUnservableInFutureVersionsClear(route *routev1.Route) { + r.Lock() + defer r.Unlock() delete(r.unservableInFutureVersions, r.routeKey(route)) } func (r *statusRecorder) RecordRouteUnservableInFutureVersions(route *routev1.Route, reason, message string) { + r.Lock() + defer r.Unlock() r.unservableInFutureVersions[r.routeKey(route)] = reason } +func (r *statusRecorder) GetRejections() []string { + r.Lock() + defer r.Unlock() + var res []string + res = append(res, r.rejections...) + return res +} + +func (r *statusRecorder) GetUpdates() []string { + r.Lock() + defer r.Unlock() + var res []string + res = append(res, r.updates...) + return res +} + var _ RouteStatusRecorder = &statusRecorder{} func TestRouteSecretManager(t *testing.T) { @@ -1131,13 +1173,47 @@ func TestRouteSecretManager(t *testing.T) { for _, s := range scenarios { t.Run(s.name, func(t *testing.T) { + routeapihelpers.ClearAsyncSARCacheForTest() p := &fakePlugin{} recorder := &statusRecorder{ doneCh: make(chan struct{}), } - rsm := NewRouteSecretManager(p, recorder, &s.secretManager, testRouterName, &testSecretGetter{namespace: s.route.Namespace, secret: s.secretManager.Secret}, &routeLister{}, &testSARCreator{allow: s.allow}) + rsm := NewRouteSecretManager(p, recorder, &s.secretManager, testRouterName, &testSecretGetter{namespace: s.route.Namespace, secret: s.secretManager.Secret}, &routeLister{items: []*routev1.Route{s.route}}, &testSARCreator{allow: s.allow}) gotErr := rsm.HandleRoute(s.eventType, s.route) + + // If the SAR check is pending, wait for it to complete and call HandleRoute again + // to simulate the router's sync loop re-evaluating the route. + if gotErr != nil && strings.Contains(gotErr.Error(), "authorization check pending") { + err := wait.PollImmediate(10*time.Millisecond, 2*time.Second, func() (bool, error) { + for _, r := range recorder.GetRejections() { + if strings.Contains(r, ExtCrtStatusReasonSARCompleted) { + return true, nil + } + } + for _, u := range recorder.GetUpdates() { + if strings.Contains(u, ExtCrtStatusReasonSARCompleted) { + return true, nil + } + } + return false, nil + }) + if err != nil { + t.Fatalf("timeout waiting for async SAR completion: %v", err) + } + + // Reset the plugin and recorder state to assert only the final pass + p.t = "" + p.route = nil + recorder.Lock() + recorder.rejections = nil + recorder.updates = nil + recorder.doneCh = make(chan struct{}) + recorder.Unlock() + + gotErr = rsm.HandleRoute(s.eventType, s.route) + } + if (gotErr != nil) != s.expectedError { t.Fatalf("expected error to be %t, but got %t", s.expectedError, gotErr != nil) } @@ -1147,8 +1223,8 @@ func TestRouteSecretManager(t *testing.T) { if s.expectedEventType != p.t { t.Fatalf("expected %s event for next plugin, but got %s", s.expectedEventType, p.t) } - if !reflect.DeepEqual(s.expectedRejections, recorder.rejections) { - t.Fatalf("expected rejections %v, but got %v", s.expectedRejections, recorder.rejections) + if !reflect.DeepEqual(s.expectedRejections, recorder.GetRejections()) { + t.Fatalf("expected rejections %v, but got %v", s.expectedRejections, recorder.GetRejections()) } if _, exists := rsm.deletedSecrets.Load(generateKey(s.route.Namespace, s.route.Name)); exists { t.Fatalf("expected deletedSecrets to not have %q key", generateKey(s.route.Namespace, s.route.Name)) @@ -1276,6 +1352,12 @@ func TestSecretUpdate(t *testing.T) { t.Fatalf("failed to add handler: %v", err) } + <-recorder.doneCh // wait for Add event (SecretLoaded) + + recorder.Lock() + recorder.doneCh = make(chan struct{}) + recorder.Unlock() + // update the secret updatedSecret := secret.DeepCopy() updatedSecret.Data = map[string][]byte{ @@ -1290,16 +1372,18 @@ func TestSecretUpdate(t *testing.T) { <-recorder.doneCh expectedStatus := []string{"sandbox-route-test:ExternalCertificateSecretUpdated"} + expectedRejectionsOnUpdate := []string{"sandbox-route-test:ExternalCertificateSecretLoaded", "sandbox-route-test:ExternalCertificateSecretUpdated"} if s.isRouteAdmittedTrue { // RecordRouteUpdate will be called if `Admitted=True` - if !reflect.DeepEqual(expectedStatus, recorder.updates) { - t.Fatalf("expected status %v, but got %v", expectedStatus, recorder.updates) + if !reflect.DeepEqual(expectedStatus, recorder.GetUpdates()) { + t.Fatalf("expected status %v, but got %v", expectedStatus, recorder.GetUpdates()) } } else { // RecordRouteRejection will be called if `Admitted=False` - if !reflect.DeepEqual(expectedStatus, recorder.rejections) { - t.Fatalf("expected status %v, but got %v", expectedStatus, recorder.rejections) + // Wait! Our AddFunc also emits a rejection (SecretLoaded). It's also stored in rejections. + if !reflect.DeepEqual(expectedRejectionsOnUpdate, recorder.GetRejections()) { + t.Fatalf("expected status %v, but got %v", expectedRejectionsOnUpdate, recorder.GetRejections()) } } @@ -1347,6 +1431,11 @@ func TestSecretDelete(t *testing.T) { t.Fatalf("failed to add handler: %v", err) } + <-recorder.doneCh // wait until the status is updated to SecretLoaded + recorder.Lock() + recorder.doneCh = make(chan struct{}) + recorder.Unlock() + // delete the secret if err := kubeClient.CoreV1().Secrets(route.Namespace).Delete(context.TODO(), secret.Name, metav1.DeleteOptions{}); err != nil { t.Fatalf("failed to delete secret: %v", err) @@ -1354,11 +1443,14 @@ func TestSecretDelete(t *testing.T) { <-recorder.doneCh // wait until the route's status is updated - expectedRejections := []string{"sandbox-route-test:ExternalCertificateSecretDeleted"} + expectedRejections := []string{ + "sandbox-route-test:ExternalCertificateSecretLoaded", + "sandbox-route-test:ExternalCertificateSecretDeleted", + } expectedDeletedSecrets := true - if !reflect.DeepEqual(expectedRejections, recorder.rejections) { - t.Fatalf("expected rejections %v, but got %v", expectedRejections, recorder.rejections) + if !reflect.DeepEqual(expectedRejections, recorder.GetRejections()) { + t.Fatalf("expected rejections %v, but got %v", expectedRejections, recorder.GetRejections()) } if val, _ := rsm.deletedSecrets.Load(generateKey(route.Namespace, route.Name)); !reflect.DeepEqual(val, expectedDeletedSecrets) { @@ -1401,6 +1493,11 @@ func TestSecretRecreation(t *testing.T) { t.Fatalf("failed to add handler: %v", err) } + <-recorder.doneCh // wait until the status is updated to SecretLoaded + recorder.Lock() + recorder.doneCh = make(chan struct{}) + recorder.Unlock() + // delete the secret if err := kubeClient.CoreV1().Secrets(route.Namespace).Delete(context.TODO(), secret.Name, metav1.DeleteOptions{}); err != nil { t.Fatalf("failed to delete secret: %v", err) @@ -1409,7 +1506,9 @@ func TestSecretRecreation(t *testing.T) { <-recorder.doneCh // wait until the route's status is updated (deletion) // re-create the secret + recorder.Lock() recorder.doneCh = make(chan struct{}) // need a new doneCh for re-creation + recorder.Unlock() if _, err := kubeClient.CoreV1().Secrets(route.Namespace).Create(context.TODO(), secret, metav1.CreateOptions{}); err != nil { t.Fatalf("failed to create secret: %v", err) } @@ -1417,11 +1516,12 @@ func TestSecretRecreation(t *testing.T) { <-recorder.doneCh // wait until the route's status is updated (re-creation) expectedRejections := []string{ + "sandbox-route-test:ExternalCertificateSecretLoaded", "sandbox-route-test:ExternalCertificateSecretDeleted", "sandbox-route-test:ExternalCertificateSecretRecreated", } - if !reflect.DeepEqual(expectedRejections, recorder.rejections) { - t.Fatalf("expected rejections %v, but got %v", expectedRejections, recorder.rejections) + if !reflect.DeepEqual(expectedRejections, recorder.GetRejections()) { + t.Fatalf("expected rejections %v, but got %v", expectedRejections, recorder.GetRejections()) } if _, exists := rsm.deletedSecrets.Load(generateKey(route.Namespace, route.Name)); exists { t.Fatalf("expected deletedSecrets to not have %q key", generateKey(route.Namespace, route.Name)) diff --git a/pkg/router/routeapihelpers/validation.go b/pkg/router/routeapihelpers/validation.go index 15406cd5e..7b269d799 100644 --- a/pkg/router/routeapihelpers/validation.go +++ b/pkg/router/routeapihelpers/validation.go @@ -9,11 +9,7 @@ import ( "crypto/x509" "encoding/pem" "fmt" -<<<<<<< Updated upstream -======= - "strings" "sync" ->>>>>>> Stashed changes "k8s.io/apiserver/pkg/authentication/user" "k8s.io/client-go/util/cert" @@ -488,9 +484,14 @@ func UpgradeRouteValidation(route *routev1.Route) field.ErrorList { var asyncSARCache sync.Map // key: namespace/secretName, value: field.ErrorList +// ClearAsyncSARCacheForTest clears the global async SAR cache for testing purposes. +func ClearAsyncSARCacheForTest() { + asyncSARCache = sync.Map{} +} + // ValidateTLSExternalCertificate tests different pre-conditions required for // using externalCertificate. -func ValidateTLSExternalCertificate(route *routev1.Route, fldPath *field.Path, sarc authorizationclient.SubjectAccessReviewInterface, secretsGetter corev1client.SecretsGetter) field.ErrorList { +func ValidateTLSExternalCertificate(route *routev1.Route, fldPath *field.Path, sarc authorizationclient.SubjectAccessReviewInterface, secretsGetter corev1client.SecretsGetter, onComplete func(string, string)) field.ErrorList { tls := route.Spec.TLS if tls == nil || tls.ExternalCertificate == nil || tls.ExternalCertificate.Name == "" { return nil @@ -508,8 +509,9 @@ func ValidateTLSExternalCertificate(route *routev1.Route, fldPath *field.Path, s return nil } - // Store empty error list temporarily so we only launch one goroutine per secret - asyncSARCache.Store(cacheKey, field.ErrorList{}) + // Store pending error list temporarily so we only launch one goroutine per secret + pendingErr := field.ErrorList{field.InternalError(fldPath, fmt.Errorf("authorization check pending for secret %q", secretName))} + asyncSARCache.Store(cacheKey, pendingErr) // Perform checks asynchronously per secret go func() { @@ -548,7 +550,10 @@ func ValidateTLSExternalCertificate(route *routev1.Route, fldPath *field.Path, s } asyncSARCache.Store(cacheKey, errs) + if onComplete != nil { + onComplete(route.Namespace, secretName) + } }() - return nil + return pendingErr } diff --git a/pkg/router/routeapihelpers/validation_test.go b/pkg/router/routeapihelpers/validation_test.go index e26847474..02d4910d6 100644 --- a/pkg/router/routeapihelpers/validation_test.go +++ b/pkg/router/routeapihelpers/validation_test.go @@ -2520,7 +2520,7 @@ func TestExtendedValidateRoute(t *testing.T) { }, }, }, - expectedErrors: 1, + expectedErrors: 0, }, { // A certificate field that contains bundled private and public key should be allowed @@ -2578,7 +2578,7 @@ func TestExtendedValidateRoute(t *testing.T) { }, }, }, - expectedErrors: 3, + expectedErrors: 2, }, { // A cert with valid paylod but wrong PEM header should be denied @@ -2592,7 +2592,7 @@ func TestExtendedValidateRoute(t *testing.T) { }, }, }, - expectedErrors: 3, + expectedErrors: 2, }, { name: "When both Certificate and Key are empty, should not report an error", From 1119f9302fe4adb0ba2ee793c47c9131aec6ea2f Mon Sep 17 00:00:00 2001 From: Brett Tofel Date: Mon, 9 Mar 2026 16:59:46 -0400 Subject: [PATCH 03/27] Address PR feedback for async SAR --- pkg/router/controller/contention.go | 1 + pkg/router/controller/route_secret_manager.go | 10 ++- .../controller/route_secret_manager_test.go | 2 +- pkg/router/routeapihelpers/validation.go | 61 ++++++++++++++++--- 4 files changed, 65 insertions(+), 9 deletions(-) diff --git a/pkg/router/controller/contention.go b/pkg/router/controller/contention.go index dc979e725..e108168b0 100644 --- a/pkg/router/controller/contention.go +++ b/pkg/router/controller/contention.go @@ -47,6 +47,7 @@ var ( ExtCrtStatusReasonSecretUpdated, ExtCrtStatusReasonSecretDeleted, ExtCrtStatusReasonSARCompleted, + ExtCrtStatusReasonSecretLoaded, ) ) diff --git a/pkg/router/controller/route_secret_manager.go b/pkg/router/controller/route_secret_manager.go index 02e3f5b46..21cc0e9e4 100644 --- a/pkg/router/controller/route_secret_manager.go +++ b/pkg/router/controller/route_secret_manager.go @@ -26,6 +26,7 @@ const ( ExtCrtStatusReasonSecretDeleted = "ExternalCertificateSecretDeleted" ExtCrtStatusReasonGetFailed = "ExternalCertificateGetFailed" ExtCrtStatusReasonSARCompleted = "ExternalCertificateSARCompleted" + ExtCrtStatusReasonSecretLoaded = "ExternalCertificateSecretLoaded" ) // RouteSecretManager implements the router.Plugin interface to register @@ -261,6 +262,7 @@ func (p *RouteSecretManager) generateSecretHandler(namespace, routeName string) AddFunc: func(obj interface{}) { secret := obj.(*kapi.Secret) log.V(4).Info("Secret added for route", "namespace", namespace, "secret", secret.Name, "route", routeName) + routeapihelpers.InvalidateAsyncSARCache(namespace, secret.Name) // Secret re-creation scenario // Check if the route key exists in the deletedSecrets map, indicating that the secret was previously deleted for this route. @@ -295,7 +297,11 @@ func (p *RouteSecretManager) generateSecretHandler(namespace, routeName string) return } msg := fmt.Sprintf("secret %q loaded for route %q", secret.Name, key) - p.recorder.RecordRouteRejection(route, "ExternalCertificateSecretLoaded", msg) + if isRouteAdmittedTrue(route.DeepCopy(), p.routerName) { + p.recorder.RecordRouteUpdate(route, ExtCrtStatusReasonSecretLoaded, msg) + } else { + p.recorder.RecordRouteRejection(route, ExtCrtStatusReasonSecretLoaded, msg) + } }, UpdateFunc: func(old interface{}, new interface{}) { @@ -303,6 +309,7 @@ func (p *RouteSecretManager) generateSecretHandler(namespace, routeName string) secretNew := new.(*kapi.Secret) key := generateKey(namespace, routeName) log.V(4).Info("Secret updated for route", "namespace", namespace, "secret", secretNew.Name, "oldSecretVersion", secretOld.ResourceVersion, "newSecretVersion", secretNew.ResourceVersion, "route", routeName) + routeapihelpers.InvalidateAsyncSARCache(namespace, secretNew.Name) // Ensure fetching the updated route route, err := p.routelister.Routes(namespace).Get(routeName) @@ -327,6 +334,7 @@ func (p *RouteSecretManager) generateSecretHandler(namespace, routeName string) key := generateKey(namespace, routeName) msg := fmt.Sprintf("secret %q deleted for route %q", secret.Name, key) log.V(4).Info(msg) + routeapihelpers.InvalidateAsyncSARCache(namespace, secret.Name) // keep the secret monitor active and mark the secret as deleted for this route. p.deletedSecrets.Store(key, true) diff --git a/pkg/router/controller/route_secret_manager_test.go b/pkg/router/controller/route_secret_manager_test.go index 155fe0c30..f9f03bdbf 100644 --- a/pkg/router/controller/route_secret_manager_test.go +++ b/pkg/router/controller/route_secret_manager_test.go @@ -1371,7 +1371,7 @@ func TestSecretUpdate(t *testing.T) { // wait until route's status is updated <-recorder.doneCh - expectedStatus := []string{"sandbox-route-test:ExternalCertificateSecretUpdated"} + expectedStatus := []string{"sandbox-route-test:ExternalCertificateSecretLoaded", "sandbox-route-test:ExternalCertificateSecretUpdated"} expectedRejectionsOnUpdate := []string{"sandbox-route-test:ExternalCertificateSecretLoaded", "sandbox-route-test:ExternalCertificateSecretUpdated"} if s.isRouteAdmittedTrue { diff --git a/pkg/router/routeapihelpers/validation.go b/pkg/router/routeapihelpers/validation.go index 7b269d799..66c74e0cc 100644 --- a/pkg/router/routeapihelpers/validation.go +++ b/pkg/router/routeapihelpers/validation.go @@ -482,7 +482,19 @@ func UpgradeRouteValidation(route *routev1.Route) field.ErrorList { return nil } -var asyncSARCache sync.Map // key: namespace/secretName, value: field.ErrorList +var asyncSARCache sync.Map // key: namespace/secretName, value: *asyncSARResult + +type asyncSARResult struct { + errs field.ErrorList + callbacks []func(string, string) + mu sync.Mutex + done bool +} + +// InvalidateAsyncSARCache removes the cached result for a specific secret to force revalidation. +func InvalidateAsyncSARCache(namespace, secretName string) { + asyncSARCache.Delete(namespace + "/" + secretName) +} // ClearAsyncSARCacheForTest clears the global async SAR cache for testing purposes. func ClearAsyncSARCacheForTest() { @@ -500,8 +512,17 @@ func ValidateTLSExternalCertificate(route *routev1.Route, fldPath *field.Path, s secretName := tls.ExternalCertificate.Name cacheKey := route.Namespace + "/" + secretName - if cached, ok := asyncSARCache.Load(cacheKey); ok { - return cached.(field.ErrorList) + if cachedRaw, ok := asyncSARCache.Load(cacheKey); ok { + cached := cachedRaw.(*asyncSARResult) + cached.mu.Lock() + defer cached.mu.Unlock() + if cached.done { + return cached.errs + } + if onComplete != nil { + cached.callbacks = append(cached.callbacks, onComplete) + } + return cached.errs // this returns the pending error list } // For tests where dependencies might be mocked/nil, avoid panic @@ -511,7 +532,27 @@ func ValidateTLSExternalCertificate(route *routev1.Route, fldPath *field.Path, s // Store pending error list temporarily so we only launch one goroutine per secret pendingErr := field.ErrorList{field.InternalError(fldPath, fmt.Errorf("authorization check pending for secret %q", secretName))} - asyncSARCache.Store(cacheKey, pendingErr) + + result := &asyncSARResult{ + errs: pendingErr, + } + if onComplete != nil { + result.callbacks = append(result.callbacks, onComplete) + } + + if loadedRaw, loaded := asyncSARCache.LoadOrStore(cacheKey, result); loaded { + // Another goroutine raced to start the check, append to its callbacks + cached := loadedRaw.(*asyncSARResult) + cached.mu.Lock() + defer cached.mu.Unlock() + if cached.done { + return cached.errs + } + if onComplete != nil { + cached.callbacks = append(cached.callbacks, onComplete) + } + return cached.errs + } // Perform checks asynchronously per secret go func() { @@ -549,9 +590,15 @@ func ValidateTLSExternalCertificate(route *routev1.Route, fldPath *field.Path, s errs = append(errs, field.Invalid(fldPath, secretName, fmt.Sprintf("secret of type %q required", kapi.SecretTypeTLS))) } - asyncSARCache.Store(cacheKey, errs) - if onComplete != nil { - onComplete(route.Namespace, secretName) + result.mu.Lock() + result.errs = errs + result.done = true + callbacks := result.callbacks + result.callbacks = nil + result.mu.Unlock() + + for _, cb := range callbacks { + cb(route.Namespace, secretName) } }() From 2ea5f8338ee4735afa4fbd5219fca29c18290cf5 Mon Sep 17 00:00:00 2001 From: Brett Tofel Date: Thu, 12 Mar 2026 13:15:03 -0400 Subject: [PATCH 04/27] OCPBUGS-77056: Throttle concurrent async SAR checks Include a concurrency governor (semaphore) to limit the number of simultaneous SubjectAccessReview checks and secret syncs to 50. This prevents hitting the API server with thousands of concurrent requests during router startup. --- pkg/router/routeapihelpers/validation.go | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/pkg/router/routeapihelpers/validation.go b/pkg/router/routeapihelpers/validation.go index 66c74e0cc..2808245e9 100644 --- a/pkg/router/routeapihelpers/validation.go +++ b/pkg/router/routeapihelpers/validation.go @@ -482,6 +482,10 @@ func UpgradeRouteValidation(route *routev1.Route) field.ErrorList { return nil } +const MaxConcurrentSecretSyncs = 50 + +var asyncSARSemaphore = make(chan struct{}, MaxConcurrentSecretSyncs) + var asyncSARCache sync.Map // key: namespace/secretName, value: *asyncSARResult type asyncSARResult struct { @@ -554,8 +558,14 @@ func ValidateTLSExternalCertificate(route *routev1.Route, fldPath *field.Path, s return cached.errs } + // Acquire token (blocks if 50 are already running) + asyncSARSemaphore <- struct{}{} + // Perform checks asynchronously per secret go func() { + // Guarantee token release + defer func() { <-asyncSARSemaphore }() + errs := field.ErrorList{} if err := authorizationutil.Authorize(sarc, &user.DefaultInfo{Name: routerServiceAccount}, From d26ab283851eacee9121e5a7571f225b30317cc3 Mon Sep 17 00:00:00 2001 From: Brett Tofel Date: Thu, 12 Mar 2026 13:46:43 -0400 Subject: [PATCH 05/27] Address CodeRabbit feedback: non-blocking semaphore and handle missing clients --- pkg/router/routeapihelpers/validation.go | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/pkg/router/routeapihelpers/validation.go b/pkg/router/routeapihelpers/validation.go index 2808245e9..3b26bd2a0 100644 --- a/pkg/router/routeapihelpers/validation.go +++ b/pkg/router/routeapihelpers/validation.go @@ -531,7 +531,9 @@ func ValidateTLSExternalCertificate(route *routev1.Route, fldPath *field.Path, s // For tests where dependencies might be mocked/nil, avoid panic if sarc == nil || secretsGetter == nil { - return nil + return field.ErrorList{ + field.InternalError(fldPath, fmt.Errorf("external certificate validation dependencies are not configured")), + } } // Store pending error list temporarily so we only launch one goroutine per secret @@ -558,11 +560,10 @@ func ValidateTLSExternalCertificate(route *routev1.Route, fldPath *field.Path, s return cached.errs } - // Acquire token (blocks if 50 are already running) - asyncSARSemaphore <- struct{}{} - // Perform checks asynchronously per secret go func() { + // Acquire token (blocks if 50 are already running) + asyncSARSemaphore <- struct{}{} // Guarantee token release defer func() { <-asyncSARSemaphore }() From 47ac0a82de4605e015db2c5607f4bae3f919d225 Mon Sep 17 00:00:00 2001 From: Brett Tofel Date: Thu, 12 Mar 2026 19:44:18 -0400 Subject: [PATCH 06/27] Address CodeRabbit feedback: make onComplete required and add SAR timeouts - Enforce synchronous execution if onComplete is nil - Add 10s timeout to SAR and Secret GET requests - Call SAR client directly instead of using authorizationutil.Authorize to distinguish between internal errors and explicit denials --- pkg/router/routeapihelpers/validation.go | 69 +++++++++++++++++------- 1 file changed, 51 insertions(+), 18 deletions(-) diff --git a/pkg/router/routeapihelpers/validation.go b/pkg/router/routeapihelpers/validation.go index 3b26bd2a0..a859a1494 100644 --- a/pkg/router/routeapihelpers/validation.go +++ b/pkg/router/routeapihelpers/validation.go @@ -10,12 +10,11 @@ import ( "encoding/pem" "fmt" "sync" + "time" - "k8s.io/apiserver/pkg/authentication/user" "k8s.io/client-go/util/cert" routev1 "github.com/openshift/api/route/v1" - "github.com/openshift/library-go/pkg/authorization/authorizationutil" authorizationv1 "k8s.io/api/authorization/v1" kapi "k8s.io/api/core/v1" @@ -560,8 +559,8 @@ func ValidateTLSExternalCertificate(route *routev1.Route, fldPath *field.Path, s return cached.errs } - // Perform checks asynchronously per secret - go func() { + // Perform checks + validateFunc := func() { // Acquire token (blocks if 50 are already running) asyncSARSemaphore <- struct{}{} // Guarantee token release @@ -569,28 +568,55 @@ func ValidateTLSExternalCertificate(route *routev1.Route, fldPath *field.Path, s errs := field.ErrorList{} - if err := authorizationutil.Authorize(sarc, &user.DefaultInfo{Name: routerServiceAccount}, - &authorizationv1.ResourceAttributes{ - Namespace: route.Namespace, Verb: "get", Resource: "secrets", Name: secretName, - }); err != nil { + timeoutCtx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + + sarGet := &authorizationv1.SubjectAccessReview{ + Spec: authorizationv1.SubjectAccessReviewSpec{ + User: routerServiceAccount, + ResourceAttributes: &authorizationv1.ResourceAttributes{ + Namespace: route.Namespace, Verb: "get", Resource: "secrets", Name: secretName, + }, + }, + } + resp, err := sarc.Create(timeoutCtx, sarGet, metav1.CreateOptions{}) + if err != nil { + errs = append(errs, field.InternalError(fldPath, fmt.Errorf("failed to check 'get' permission for secret %q: %v", secretName, err))) + } else if !resp.Status.Allowed { errs = append(errs, field.Forbidden(fldPath, "router serviceaccount does not have permission to get this secret")) } - if err := authorizationutil.Authorize(sarc, &user.DefaultInfo{Name: routerServiceAccount}, - &authorizationv1.ResourceAttributes{ - Namespace: route.Namespace, Verb: "watch", Resource: "secrets", Name: secretName, - }); err != nil { + sarWatch := &authorizationv1.SubjectAccessReview{ + Spec: authorizationv1.SubjectAccessReviewSpec{ + User: routerServiceAccount, + ResourceAttributes: &authorizationv1.ResourceAttributes{ + Namespace: route.Namespace, Verb: "watch", Resource: "secrets", Name: secretName, + }, + }, + } + resp, err = sarc.Create(timeoutCtx, sarWatch, metav1.CreateOptions{}) + if err != nil { + errs = append(errs, field.InternalError(fldPath, fmt.Errorf("failed to check 'watch' permission for secret %q: %v", secretName, err))) + } else if !resp.Status.Allowed { errs = append(errs, field.Forbidden(fldPath, "router serviceaccount does not have permission to watch this secret")) } - if err := authorizationutil.Authorize(sarc, &user.DefaultInfo{Name: routerServiceAccount}, - &authorizationv1.ResourceAttributes{ - Namespace: route.Namespace, Verb: "list", Resource: "secrets", Name: secretName, - }); err != nil { + sarList := &authorizationv1.SubjectAccessReview{ + Spec: authorizationv1.SubjectAccessReviewSpec{ + User: routerServiceAccount, + ResourceAttributes: &authorizationv1.ResourceAttributes{ + Namespace: route.Namespace, Verb: "list", Resource: "secrets", Name: secretName, + }, + }, + } + resp, err = sarc.Create(timeoutCtx, sarList, metav1.CreateOptions{}) + if err != nil { + errs = append(errs, field.InternalError(fldPath, fmt.Errorf("failed to check 'list' permission for secret %q: %v", secretName, err))) + } else if !resp.Status.Allowed { errs = append(errs, field.Forbidden(fldPath, "router serviceaccount does not have permission to list this secret")) } - secret, err := secretsGetter.Secrets(route.Namespace).Get(context.TODO(), secretName, metav1.GetOptions{}) + secret, err := secretsGetter.Secrets(route.Namespace).Get(timeoutCtx, secretName, metav1.GetOptions{}) if err != nil { if apierrors.IsNotFound(err) { errs = append(errs, field.NotFound(fldPath, err.Error())) @@ -611,7 +637,14 @@ func ValidateTLSExternalCertificate(route *routev1.Route, fldPath *field.Path, s for _, cb := range callbacks { cb(route.Namespace, secretName) } - }() + } + + if onComplete == nil { + validateFunc() + return result.errs + } + + go validateFunc() return pendingErr } From e17092f7bd4933fdc7e5ad3a8b131fbb458e111d Mon Sep 17 00:00:00 2001 From: Brett Tofel Date: Fri, 13 Mar 2026 16:51:07 -0400 Subject: [PATCH 07/27] Update vendor tree to remove unused library-go and apiserver dependencies --- .../authorizationutil/subject.go | 56 ------------------- .../authorization/authorizationutil/util.go | 50 ----------------- vendor/modules.txt | 1 - 3 files changed, 107 deletions(-) delete mode 100644 vendor/github.com/openshift/library-go/pkg/authorization/authorizationutil/subject.go delete mode 100644 vendor/github.com/openshift/library-go/pkg/authorization/authorizationutil/util.go diff --git a/vendor/github.com/openshift/library-go/pkg/authorization/authorizationutil/subject.go b/vendor/github.com/openshift/library-go/pkg/authorization/authorizationutil/subject.go deleted file mode 100644 index 74c179e68..000000000 --- a/vendor/github.com/openshift/library-go/pkg/authorization/authorizationutil/subject.go +++ /dev/null @@ -1,56 +0,0 @@ -package authorizationutil - -import ( - rbacv1 "k8s.io/api/rbac/v1" - "k8s.io/apiserver/pkg/authentication/serviceaccount" -) - -func BuildRBACSubjects(users, groups []string) []rbacv1.Subject { - subjects := []rbacv1.Subject{} - - for _, user := range users { - saNamespace, saName, err := serviceaccount.SplitUsername(user) - if err == nil { - subjects = append(subjects, rbacv1.Subject{Kind: rbacv1.ServiceAccountKind, Namespace: saNamespace, Name: saName}) - } else { - subjects = append(subjects, rbacv1.Subject{Kind: rbacv1.UserKind, APIGroup: rbacv1.GroupName, Name: user}) - } - } - - for _, group := range groups { - subjects = append(subjects, rbacv1.Subject{Kind: rbacv1.GroupKind, APIGroup: rbacv1.GroupName, Name: group}) - } - - return subjects -} - -func RBACSubjectsToUsersAndGroups(subjects []rbacv1.Subject, defaultNamespace string) (users []string, groups []string) { - for _, subject := range subjects { - - switch { - case subject.APIGroup == rbacv1.GroupName && subject.Kind == rbacv1.GroupKind: - groups = append(groups, subject.Name) - case subject.APIGroup == rbacv1.GroupName && subject.Kind == rbacv1.UserKind: - users = append(users, subject.Name) - case subject.APIGroup == "" && subject.Kind == rbacv1.ServiceAccountKind: - // default the namespace to namespace we're working in if - // it's available. This allows rolebindings that reference - // SAs in the local namespace to avoid having to qualify - // them. - ns := defaultNamespace - if len(subject.Namespace) > 0 { - ns = subject.Namespace - } - if len(ns) > 0 { - name := serviceaccount.MakeUsername(ns, subject.Name) - users = append(users, name) - } else { - // maybe error? this fails safe at any rate - } - default: - // maybe error? This fails safe at any rate - } - } - - return users, groups -} diff --git a/vendor/github.com/openshift/library-go/pkg/authorization/authorizationutil/util.go b/vendor/github.com/openshift/library-go/pkg/authorization/authorizationutil/util.go deleted file mode 100644 index 040d0f643..000000000 --- a/vendor/github.com/openshift/library-go/pkg/authorization/authorizationutil/util.go +++ /dev/null @@ -1,50 +0,0 @@ -package authorizationutil - -import ( - "context" - "errors" - - authorizationv1 "k8s.io/api/authorization/v1" - kerrors "k8s.io/apimachinery/pkg/api/errors" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/runtime/schema" - "k8s.io/apiserver/pkg/authentication/user" - authorizationclient "k8s.io/client-go/kubernetes/typed/authorization/v1" -) - -// AddUserToSAR adds the requisite user information to a SubjectAccessReview. -// It returns the modified SubjectAccessReview. -func AddUserToSAR(user user.Info, sar *authorizationv1.SubjectAccessReview) *authorizationv1.SubjectAccessReview { - sar.Spec.User = user.GetName() - // reminiscent of the bad old days of C. Copies copy the min number of elements of both source and dest - sar.Spec.Groups = make([]string, len(user.GetGroups())) - copy(sar.Spec.Groups, user.GetGroups()) - sar.Spec.Extra = map[string]authorizationv1.ExtraValue{} - - for k, v := range user.GetExtra() { - sar.Spec.Extra[k] = authorizationv1.ExtraValue(v) - } - - return sar -} - -// Authorize verifies that a given user is permitted to carry out a given -// action. If this cannot be determined, or if the user is not permitted, an -// error is returned. -func Authorize(sarClient authorizationclient.SubjectAccessReviewInterface, user user.Info, resourceAttributes *authorizationv1.ResourceAttributes) error { - sar := AddUserToSAR(user, &authorizationv1.SubjectAccessReview{ - Spec: authorizationv1.SubjectAccessReviewSpec{ - ResourceAttributes: resourceAttributes, - }, - }) - - resp, err := sarClient.Create(context.TODO(), sar, metav1.CreateOptions{}) - if err == nil && resp != nil && resp.Status.Allowed { - return nil - } - - if err == nil { - err = errors.New(resp.Status.Reason) - } - return kerrors.NewForbidden(schema.GroupResource{Group: resourceAttributes.Group, Resource: resourceAttributes.Resource}, resourceAttributes.Name, err) -} diff --git a/vendor/modules.txt b/vendor/modules.txt index 2e3719a0c..3fecf52fa 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -223,7 +223,6 @@ github.com/openshift/client-go/route/clientset/versioned/typed/route/v1/fake github.com/openshift/client-go/route/listers/route/v1 # github.com/openshift/library-go v0.0.0-20260223145824-7b234b47a906 ## explicit; go 1.24.0 -github.com/openshift/library-go/pkg/authorization/authorizationutil github.com/openshift/library-go/pkg/crypto github.com/openshift/library-go/pkg/proc github.com/openshift/library-go/pkg/route/secretmanager From 4e1eae4233797a9ba0e95c65ea075a69ac6766a1 Mon Sep 17 00:00:00 2001 From: Brett Tofel Date: Tue, 17 Mar 2026 16:06:01 -0400 Subject: [PATCH 08/27] Fix RouteSecretManager panic and update library-go dependency - Handle DeletedFinalStateUnknown tombstones in secret manager to prevent panics during namespace deletion - Update library-go dependency to include async secret registration fix (library-go PR #2132) - Use /tmp for fake haproxy unix sockets to avoid macOS path length limits in tests --- go.mod | 2 + go.sum | 4 +- pkg/router/controller/route_secret_manager.go | 14 +++++- .../configmanager/haproxy/testing/haproxy.go | 10 ++-- .../pkg/route/secretmanager/manager.go | 46 ++++++++++++++----- .../library-go/pkg/secret/secret_monitor.go | 32 +++++++------ vendor/modules.txt | 3 +- 7 files changed, 78 insertions(+), 33 deletions(-) diff --git a/go.mod b/go.mod index a0cb99be1..c8d1bad64 100644 --- a/go.mod +++ b/go.mod @@ -117,3 +117,5 @@ require ( sigs.k8s.io/structured-merge-diff/v6 v6.3.0 // indirect sigs.k8s.io/yaml v1.6.0 // indirect ) + +replace github.com/openshift/library-go => github.com/bentito/library-go v0.0.0-20260316154701-200e1ee46963 diff --git a/go.sum b/go.sum index 6d4b34285..45ec6e98a 100644 --- a/go.sum +++ b/go.sum @@ -13,6 +13,8 @@ github.com/antlr4-go/antlr/v4 v4.13.0 h1:lxCg3LAv+EUK6t1i0y1V6/SLeUi0eKEKdhQAlS8 github.com/antlr4-go/antlr/v4 v4.13.0/go.mod h1:pfChB/xh/Unjila75QW7+VU4TSnWnnk9UTnmpPaOR2g= github.com/bcicen/go-haproxy v0.0.0-20180203142132-ff5824fe38be h1:7b8/p7wJ3N/OjXbZmwJdyR9eOoCla5SbUIe18WHio5s= github.com/bcicen/go-haproxy v0.0.0-20180203142132-ff5824fe38be/go.mod h1:MxVpaKTkNjZu5awzzr6mk6CIKaZYUFGxbmNwMvyVfeM= +github.com/bentito/library-go v0.0.0-20260316154701-200e1ee46963 h1:jhARqaZphAvzRd1htPKPSLHAMEg01p9/CgmrNvqLLxM= +github.com/bentito/library-go v0.0.0-20260316154701-200e1ee46963/go.mod h1:K3FoNLgNBFYbFuG+Kr8usAnQxj1w84XogyUp2M8rK8k= github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= github.com/blang/semver/v4 v4.0.0 h1:1PFHFE6yCCTv8C1TeyNNarDzntLi7wMI5i/pzqYIsAM= @@ -184,8 +186,6 @@ github.com/openshift/api v0.0.0-20260212193555-c06ab675261f h1:l1IgsK48Ym/nED30y github.com/openshift/api v0.0.0-20260212193555-c06ab675261f/go.mod h1:d5uzF0YN2nQQFA0jIEWzzOZ+edmo6wzlGLvx5Fhz4uY= github.com/openshift/client-go v0.0.0-20260108185524-48f4ccfc4e13 h1:6rd4zSo2UaWQcAPZfHK9yzKVqH0BnMv1hqMzqXZyTds= github.com/openshift/client-go v0.0.0-20260108185524-48f4ccfc4e13/go.mod h1:YvOmPmV7wcJxpfhTDuFqqs2Xpb3M3ovsM6Qs/i2ptq4= -github.com/openshift/library-go v0.0.0-20260223145824-7b234b47a906 h1:PkG3CmlU8+HtlW1rspnhwhbKki8rrwYN+L26aH11t2E= -github.com/openshift/library-go v0.0.0-20260223145824-7b234b47a906/go.mod h1:K3FoNLgNBFYbFuG+Kr8usAnQxj1w84XogyUp2M8rK8k= github.com/orisano/pixelmatch v0.0.0-20220722002657-fb0b55479cde/go.mod h1:nZgzbfBr3hhjoZnS66nKrHmduYNpc34ny7RK4z5/HM0= github.com/pkg/profile v1.7.0 h1:hnbDkaNWPCLMO9wGLdBFTIZvzDrDfBM2072E1S9gJkA= github.com/pkg/profile v1.7.0/go.mod h1:8Uer0jas47ZQMJ7VD+OHknK4YDY07LPUC6dEvqDjvNo= diff --git a/pkg/router/controller/route_secret_manager.go b/pkg/router/controller/route_secret_manager.go index 21cc0e9e4..921190bbe 100644 --- a/pkg/router/controller/route_secret_manager.go +++ b/pkg/router/controller/route_secret_manager.go @@ -330,7 +330,19 @@ func (p *RouteSecretManager) generateSecretHandler(namespace, routeName string) }, DeleteFunc: func(obj interface{}) { - secret := obj.(*kapi.Secret) + secret, ok := obj.(*kapi.Secret) + if !ok { + tombstone, ok := obj.(cache.DeletedFinalStateUnknown) + if !ok { + log.Error(nil, "Couldn't get object from tombstone", "obj", obj) + return + } + secret, ok = tombstone.Obj.(*kapi.Secret) + if !ok { + log.Error(nil, "Tombstone contained object that is not a secret", "obj", tombstone.Obj) + return + } + } key := generateKey(namespace, routeName) msg := fmt.Sprintf("secret %q deleted for route %q", secret.Name, key) log.V(4).Info(msg) diff --git a/pkg/router/template/configmanager/haproxy/testing/haproxy.go b/pkg/router/template/configmanager/haproxy/testing/haproxy.go index c1cfbefec..02d39c400 100644 --- a/pkg/router/template/configmanager/haproxy/testing/haproxy.go +++ b/pkg/router/template/configmanager/haproxy/testing/haproxy.go @@ -46,7 +46,7 @@ type fakeHAProxy struct { } func startFakeHAProxyServer(prefix string) (*fakeHAProxy, error) { - f, err := os.CreateTemp(os.TempDir(), prefix) + f, err := os.CreateTemp("/tmp", prefix) if err != nil { return nil, err } @@ -107,10 +107,10 @@ func (p *fakeHAProxy) Commands() []string { func (p *fakeHAProxy) Start() { started := make(chan bool) - go func() error { + go func() { listener, err := net.Listen("unix", p.socketFile) if err != nil { - return err + panic(fmt.Sprintf("fakeHAProxy Start failed to listen on %s: %v", p.socketFile, err)) } started <- true @@ -119,11 +119,11 @@ func (p *fakeHAProxy) Start() { shutdown := p.shutdown p.lock.Unlock() if shutdown { - return nil + return } conn, err := listener.Accept() if err != nil { - return err + return } go p.process(conn) } diff --git a/vendor/github.com/openshift/library-go/pkg/route/secretmanager/manager.go b/vendor/github.com/openshift/library-go/pkg/route/secretmanager/manager.go index a4a80ee59..78a152609 100644 --- a/vendor/github.com/openshift/library-go/pkg/route/secretmanager/manager.go +++ b/vendor/github.com/openshift/library-go/pkg/route/secretmanager/manager.go @@ -61,7 +61,6 @@ func (m *manager) Queue() workqueue.RateLimitingInterface { // Returns an error if the route is already registered with a secret or if adding the secret event handler fails. func (m *manager) RegisterRoute(ctx context.Context, namespace, routeName, secretName string, handler cache.ResourceEventHandlerFuncs) error { m.handlersLock.Lock() - defer m.handlersLock.Unlock() // Generate a unique key for the provided namespace and routeName. key := generateKey(namespace, routeName) @@ -70,22 +69,43 @@ func (m *manager) RegisterRoute(ctx context.Context, namespace, routeName, secre // Each route (namespace/routeName) should be registered only once with any secret. // Note: inside a namespace multiple different routes can be registered(watch) with a common secret. if _, exists := m.registeredHandlers[key]; exists { + m.handlersLock.Unlock() return fmt.Errorf("route already registered with key %s", key) } - // Add a secret event handler for the specified namespace and secret, with the handler functions. - klog.V(5).Infof("trying to add handler for key %s with secret %s", key, secretName) - handlerReg, err := m.monitor.AddSecretEventHandler(ctx, namespace, secretName, handler) - if err != nil { - return err - } - - // Store the registration and secretName in the manager's map. Used during UnregisterRoute() and GetSecret(). + // Because adding the secret event handler can take O(latency) time when it starts an informer + // and waits for cache sync, we temporarily release the handlersLock. This permits concurrent + // registrations of other routes. + // We mark this key as tentatively registered using a nil registration + // so that concurrent attempts to register the same route fail immediately. m.registeredHandlers[key] = referencedSecret{ secretName: secretName, - handlerRegistration: handlerReg, + handlerRegistration: nil, // placeholder while syncing } - klog.Infof("secret manager registered route for key %s with secret %s", key, secretName) + m.handlersLock.Unlock() + + // Add a secret event handler for the specified namespace and secret, with the handler functions. + klog.V(5).Infof("trying to add handler for key %s with secret %s", key, secretName) + + go func() { + handlerReg, err := m.monitor.AddSecretEventHandler(ctx, namespace, secretName, handler) + + m.handlersLock.Lock() + defer m.handlersLock.Unlock() + + if err != nil { + delete(m.registeredHandlers, key) + klog.Errorf("failed to add secret event handler: %v", err) + return + } + + // Update only if it wasn't unregistered while we were syncing + if ref, exists := m.registeredHandlers[key]; exists && ref.secretName == secretName && ref.handlerRegistration == nil { + ref.handlerRegistration = handlerReg + m.registeredHandlers[key] = ref + klog.Infof("secret manager registered route for key %s with secret %s", key, secretName) + } + }() return nil } @@ -104,6 +124,10 @@ func (m *manager) UnregisterRoute(namespace, routeName string) error { return fmt.Errorf("no handler registered with key %s", key) } + if ref.handlerRegistration == nil { + return fmt.Errorf("route registration currently in progress for key %s", key) + } + // Remove the corresponding secret event handler from the secret monitor. klog.V(5).Info("trying to remove handler with key", key) err := m.monitor.RemoveSecretEventHandler(ref.handlerRegistration) diff --git a/vendor/github.com/openshift/library-go/pkg/secret/secret_monitor.go b/vendor/github.com/openshift/library-go/pkg/secret/secret_monitor.go index ddccb0484..0d7339041 100644 --- a/vendor/github.com/openshift/library-go/pkg/secret/secret_monitor.go +++ b/vendor/github.com/openshift/library-go/pkg/secret/secret_monitor.go @@ -93,10 +93,12 @@ func (s *secretMonitor) createSecretInformer(namespace, name string) cache.Share } // addSecretEventHandler adds a secret event handler and starts the informer if not already running. +// +// The global write lock is released before waiting for the informer cache to sync, allowing +// concurrent calls for different secrets to proceed in parallel. This is critical for performance +// when many routes reference external certificate secrets, as registering N secrets serially +// (each requiring an API server round-trip to etcd) would take O(N * api_latency) time. func (s *secretMonitor) addSecretEventHandler(ctx context.Context, namespace, secretName string, handler cache.ResourceEventHandler, secretInformer cache.SharedInformer) (SecretEventHandlerRegistration, error) { - s.lock.Lock() - defer s.lock.Unlock() - if handler == nil { return nil, fmt.Errorf("nil handler is provided") } @@ -104,24 +106,27 @@ func (s *secretMonitor) addSecretEventHandler(ctx context.Context, namespace, se // secret identifier (namespace/secret) key := NewObjectKey(namespace, secretName) + s.lock.Lock() // Start secret informer if monitor does not exist. m, exists := s.monitors[key] if !exists { m = &monitoredItem{} m.itemMonitor = newSingleItemMonitor(key, secretInformer) m.itemMonitor.StartInformer(ctx) - - // wait for first sync - if !cache.WaitForCacheSync(ctx.Done(), m.itemMonitor.HasSynced) { - return nil, fmt.Errorf("failed waiting for cache sync") - } - - // add item key to monitors map + // Register the monitor in the map immediately (before releasing the lock) so that + // concurrent registrations for the same secret reuse this informer rather than + // starting a duplicate. s.monitors[key] = m + } + s.lock.Unlock() + if !exists { klog.Info("secret informer started", " item key ", key) } + s.lock.Lock() + defer s.lock.Unlock() + // add the event handler registration, err := m.itemMonitor.AddEventHandler(handler) if err != nil { @@ -194,9 +199,10 @@ func (s *secretMonitor) GetSecret(ctx context.Context, handlerRegistration Secre return nil, fmt.Errorf("secret monitor doesn't exist for key %v", key) } - // wait for informer store sync, to load secrets - if !cache.WaitForCacheSync(ctx.Done(), handlerRegistration.HasSynced) { - return nil, fmt.Errorf("failed waiting for cache sync") + // do not wait for informer store sync. If it's not ready, return an error immediately + // so the caller can proceed asynchronously. + if !handlerRegistration.HasSynced() { + return nil, fmt.Errorf("secret cache not synced yet") } uncast, exists, err := m.itemMonitor.GetItem() diff --git a/vendor/modules.txt b/vendor/modules.txt index 3fecf52fa..8b08d7d75 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -221,7 +221,7 @@ github.com/openshift/client-go/route/clientset/versioned/scheme github.com/openshift/client-go/route/clientset/versioned/typed/route/v1 github.com/openshift/client-go/route/clientset/versioned/typed/route/v1/fake github.com/openshift/client-go/route/listers/route/v1 -# github.com/openshift/library-go v0.0.0-20260223145824-7b234b47a906 +# github.com/openshift/library-go v0.0.0-20260223145824-7b234b47a906 => github.com/bentito/library-go v0.0.0-20260316154701-200e1ee46963 ## explicit; go 1.24.0 github.com/openshift/library-go/pkg/crypto github.com/openshift/library-go/pkg/proc @@ -1324,3 +1324,4 @@ sigs.k8s.io/structured-merge-diff/v6/value # sigs.k8s.io/yaml v1.6.0 ## explicit; go 1.22 sigs.k8s.io/yaml +# github.com/openshift/library-go => github.com/bentito/library-go v0.0.0-20260316154701-200e1ee46963 From 3fe8e4ad96282545efd42c551bc248d896b6d0c3 Mon Sep 17 00:00:00 2001 From: Brett Tofel Date: Thu, 26 Mar 2026 12:24:33 -0400 Subject: [PATCH 09/27] fix: resolve async SAR race condition that prevented external cert serving The async SAR changes introduced a two-layer race condition: 1. ValidateTLSExternalCertificate returned nil when SAR was pending, causing routes to pass validation without cert data 2. RegisterRoute was fully async (goroutine), so GetSecret failed with 'secret cache not synced yet' on re-evaluation 3. addSecretEventHandler no longer called WaitForCacheSync Fixes: - secret_monitor.go: Re-add WaitForCacheSync in addSecretEventHandler and GetSecret to ensure cache is ready before returning - manager.go: Make RegisterRoute synchronous for AddSecretEventHandler while keeping lock-release concurrency optimization - validation.go: Return pendingErr (not nil) when SAR is in-flight so routes stay rejected until SAR completes - route_secret_manager.go: Don't delete routes on SAR-pending errors; let the onComplete callback trigger re-evaluation - route_secret_manager_test.go: Update test condition to match new pending error behavior --- pkg/router/controller/route_secret_manager.go | 23 ++++++++++- .../controller/route_secret_manager_test.go | 16 ++++++-- pkg/router/routeapihelpers/validation.go | 22 ++++++++++- .../pkg/route/secretmanager/manager.go | 39 ++++++++++--------- .../library-go/pkg/secret/secret_monitor.go | 14 +++++-- 5 files changed, 85 insertions(+), 29 deletions(-) diff --git a/pkg/router/controller/route_secret_manager.go b/pkg/router/controller/route_secret_manager.go index 921190bbe..a4d651d98 100644 --- a/pkg/router/controller/route_secret_manager.go +++ b/pkg/router/controller/route_secret_manager.go @@ -163,10 +163,16 @@ func (p *RouteSecretManager) HandleRoute(eventType watch.EventType, route *route if err := p.validate(route); err != nil { return err } + // Skip populating TLS if the SAR is still in-flight; the onComplete + // callback will trigger re-evaluation once the SAR resolves. + if routeapihelpers.IsAsyncSARPending(route.Namespace, route.Spec.TLS.ExternalCertificate.Name) { + break + } // read referenced secret and update TLS certificate and key if err := p.populateRouteTLSFromSecret(route); err != nil { return err } + } case newHasExt && !oldHadExt: @@ -215,6 +221,15 @@ func (p *RouteSecretManager) validateAndRegister(route *routev1.Route) error { if err := p.secretManager.RegisterRoute(context.TODO(), route.Namespace, route.Name, route.Spec.TLS.ExternalCertificate.Name, handler); err != nil { return fmt.Errorf("failed to register router: %w", err) } + + // If the SAR check is still in-flight (async), skip populating TLS data now. + // The onComplete callback in validate() will trigger route re-evaluation once + // the SAR resolves, at which point the cache will be done and TLS will be + // populated on the next pass. + if routeapihelpers.IsAsyncSARPending(route.Namespace, route.Spec.TLS.ExternalCertificate.Name) { + return nil + } + // read referenced secret and update TLS certificate and key if err := p.populateRouteTLSFromSecret(route); err != nil { return err @@ -393,7 +408,13 @@ func (p *RouteSecretManager) validate(route *routev1.Route) error { } if err := routeapihelpers.ValidateTLSExternalCertificate(route, fldPath, p.sarClient, p.secretsGetter, onComplete).ToAggregate(); err != nil { - log.Error(err, "skipping route due to invalid externalCertificate configuration (or pending SAR check)", "namespace", route.Namespace, "route", route.Name) + // If the SAR check is still pending, return the error to prevent TLS population + // but don't delete the route — the onComplete callback will trigger re-evaluation. + if routeapihelpers.IsAsyncSARPending(route.Namespace, route.Spec.TLS.ExternalCertificate.Name) { + log.V(4).Info("SAR check pending for route, deferring admission", "namespace", route.Namespace, "route", route.Name) + return err + } + log.Error(err, "skipping route due to invalid externalCertificate configuration", "namespace", route.Namespace, "route", route.Name) p.recorder.RecordRouteRejection(route, ExtCrtStatusReasonValidationFailed, err.Error()) p.plugin.HandleRoute(watch.Deleted, route) return err diff --git a/pkg/router/controller/route_secret_manager_test.go b/pkg/router/controller/route_secret_manager_test.go index f9f03bdbf..32d6e8fc6 100644 --- a/pkg/router/controller/route_secret_manager_test.go +++ b/pkg/router/controller/route_secret_manager_test.go @@ -1182,8 +1182,11 @@ func TestRouteSecretManager(t *testing.T) { gotErr := rsm.HandleRoute(s.eventType, s.route) - // If the SAR check is pending, wait for it to complete and call HandleRoute again - // to simulate the router's sync loop re-evaluating the route. + // When a route with externalCertificate is processed, the SAR check + // runs asynchronously. HandleRoute returns a pending error, and the + // onComplete callback fires later to trigger re-evaluation. Wait for + // that callback before asserting the final state, then retry with a + // fresh copy of the route (the first pass may mutate it in-place). if gotErr != nil && strings.Contains(gotErr.Error(), "authorization check pending") { err := wait.PollImmediate(10*time.Millisecond, 2*time.Second, func() (bool, error) { for _, r := range recorder.GetRejections() { @@ -1202,7 +1205,9 @@ func TestRouteSecretManager(t *testing.T) { t.Fatalf("timeout waiting for async SAR completion: %v", err) } - // Reset the plugin and recorder state to assert only the final pass + // Reset the plugin and recorder state to assert only the final pass. + // Use a deep copy so mutations from the first pass don't affect assertions. + freshRoute := s.route.DeepCopy() p.t = "" p.route = nil recorder.Lock() @@ -1211,9 +1216,12 @@ func TestRouteSecretManager(t *testing.T) { recorder.doneCh = make(chan struct{}) recorder.Unlock() - gotErr = rsm.HandleRoute(s.eventType, s.route) + gotErr = rsm.HandleRoute(s.eventType, freshRoute) + // Replace s.route with freshRoute so the equality checks below use it. + s.route = freshRoute } + if (gotErr != nil) != s.expectedError { t.Fatalf("expected error to be %t, but got %t", s.expectedError, gotErr != nil) } diff --git a/pkg/router/routeapihelpers/validation.go b/pkg/router/routeapihelpers/validation.go index a859a1494..65eb655b5 100644 --- a/pkg/router/routeapihelpers/validation.go +++ b/pkg/router/routeapihelpers/validation.go @@ -504,6 +504,20 @@ func ClearAsyncSARCacheForTest() { asyncSARCache = sync.Map{} } +// IsAsyncSARPending returns true if a SAR check has been started but not yet +// completed for the given namespace/secretName. When true, callers should defer +// any work that requires confirmed SAR approval (e.g. populating TLS data). +func IsAsyncSARPending(namespace, secretName string) bool { + cacheKey := namespace + "/" + secretName + if cachedRaw, ok := asyncSARCache.Load(cacheKey); ok { + cached := cachedRaw.(*asyncSARResult) + cached.mu.Lock() + defer cached.mu.Unlock() + return !cached.done + } + return false +} + // ValidateTLSExternalCertificate tests different pre-conditions required for // using externalCertificate. func ValidateTLSExternalCertificate(route *routev1.Route, fldPath *field.Path, sarc authorizationclient.SubjectAccessReviewInterface, secretsGetter corev1client.SecretsGetter, onComplete func(string, string)) field.ErrorList { @@ -525,7 +539,9 @@ func ValidateTLSExternalCertificate(route *routev1.Route, fldPath *field.Path, s if onComplete != nil { cached.callbacks = append(cached.callbacks, onComplete) } - return cached.errs // this returns the pending error list + // SAR is still in-flight; return error so the route stays rejected + // until the SAR completes and triggers re-evaluation. + return cached.errs } // For tests where dependencies might be mocked/nil, avoid panic @@ -556,6 +572,7 @@ func ValidateTLSExternalCertificate(route *routev1.Route, fldPath *field.Path, s if onComplete != nil { cached.callbacks = append(cached.callbacks, onComplete) } + // SAR is still in-flight; return error so the route stays rejected. return cached.errs } @@ -646,5 +663,8 @@ func ValidateTLSExternalCertificate(route *routev1.Route, fldPath *field.Path, s go validateFunc() + // Return the pending error: the SAR is running asynchronously. + // The route should stay rejected until the SAR completes and triggers + // re-evaluation via the onComplete callback. return pendingErr } diff --git a/vendor/github.com/openshift/library-go/pkg/route/secretmanager/manager.go b/vendor/github.com/openshift/library-go/pkg/route/secretmanager/manager.go index 78a152609..3cc96a4fd 100644 --- a/vendor/github.com/openshift/library-go/pkg/route/secretmanager/manager.go +++ b/vendor/github.com/openshift/library-go/pkg/route/secretmanager/manager.go @@ -85,27 +85,28 @@ func (m *manager) RegisterRoute(ctx context.Context, namespace, routeName, secre m.handlersLock.Unlock() // Add a secret event handler for the specified namespace and secret, with the handler functions. + // This call releases the monitor lock internally during WaitForCacheSync, allowing concurrent + // registrations for different secrets. However, it blocks until the cache is synced so that + // GetSecret works immediately after RegisterRoute returns. klog.V(5).Infof("trying to add handler for key %s with secret %s", key, secretName) - go func() { - handlerReg, err := m.monitor.AddSecretEventHandler(ctx, namespace, secretName, handler) - - m.handlersLock.Lock() - defer m.handlersLock.Unlock() - - if err != nil { - delete(m.registeredHandlers, key) - klog.Errorf("failed to add secret event handler: %v", err) - return - } - - // Update only if it wasn't unregistered while we were syncing - if ref, exists := m.registeredHandlers[key]; exists && ref.secretName == secretName && ref.handlerRegistration == nil { - ref.handlerRegistration = handlerReg - m.registeredHandlers[key] = ref - klog.Infof("secret manager registered route for key %s with secret %s", key, secretName) - } - }() + handlerReg, err := m.monitor.AddSecretEventHandler(ctx, namespace, secretName, handler) + + m.handlersLock.Lock() + + if err != nil { + delete(m.registeredHandlers, key) + m.handlersLock.Unlock() + return fmt.Errorf("failed to add secret event handler for key %s: %w", key, err) + } + + // Update only if it wasn't unregistered while we were syncing + if ref, exists := m.registeredHandlers[key]; exists && ref.secretName == secretName && ref.handlerRegistration == nil { + ref.handlerRegistration = handlerReg + m.registeredHandlers[key] = ref + klog.Infof("secret manager registered route for key %s with secret %s", key, secretName) + } + m.handlersLock.Unlock() return nil } diff --git a/vendor/github.com/openshift/library-go/pkg/secret/secret_monitor.go b/vendor/github.com/openshift/library-go/pkg/secret/secret_monitor.go index 0d7339041..7c66899d5 100644 --- a/vendor/github.com/openshift/library-go/pkg/secret/secret_monitor.go +++ b/vendor/github.com/openshift/library-go/pkg/secret/secret_monitor.go @@ -124,6 +124,13 @@ func (s *secretMonitor) addSecretEventHandler(ctx context.Context, namespace, se klog.Info("secret informer started", " item key ", key) } + // Wait for the informer cache to sync before adding event handlers. + // This ensures GetSecret can retrieve secrets immediately after registration. + // The global lock is released above so other secrets can register concurrently. + if !cache.WaitForCacheSync(ctx.Done(), m.itemMonitor.HasSynced) { + return nil, fmt.Errorf("failed waiting for cache sync") + } + s.lock.Lock() defer s.lock.Unlock() @@ -199,10 +206,9 @@ func (s *secretMonitor) GetSecret(ctx context.Context, handlerRegistration Secre return nil, fmt.Errorf("secret monitor doesn't exist for key %v", key) } - // do not wait for informer store sync. If it's not ready, return an error immediately - // so the caller can proceed asynchronously. - if !handlerRegistration.HasSynced() { - return nil, fmt.Errorf("secret cache not synced yet") + // Wait for informer store sync to load secrets. + if !cache.WaitForCacheSync(ctx.Done(), handlerRegistration.HasSynced) { + return nil, fmt.Errorf("failed waiting for cache sync") } uncast, exists, err := m.itemMonitor.GetItem() From 324594ab7dbed258bf9a87909f090aaedb7ee41b Mon Sep 17 00:00:00 2001 From: Brett Tofel Date: Thu, 26 Mar 2026 13:26:39 -0400 Subject: [PATCH 10/27] fix: remove stray blank line to pass gofmt check --- pkg/router/controller/route_secret_manager_test.go | 1 - 1 file changed, 1 deletion(-) diff --git a/pkg/router/controller/route_secret_manager_test.go b/pkg/router/controller/route_secret_manager_test.go index 32d6e8fc6..f00dd678b 100644 --- a/pkg/router/controller/route_secret_manager_test.go +++ b/pkg/router/controller/route_secret_manager_test.go @@ -1221,7 +1221,6 @@ func TestRouteSecretManager(t *testing.T) { s.route = freshRoute } - if (gotErr != nil) != s.expectedError { t.Fatalf("expected error to be %t, but got %t", s.expectedError, gotErr != nil) } From 4f0a9b45750c590514df6c0a0b2b30b0c455d47c Mon Sep 17 00:00:00 2001 From: Brett Tofel Date: Thu, 26 Mar 2026 14:13:21 -0400 Subject: [PATCH 11/27] chore: update library-go to grab async SAR fixes --- go.mod | 2 +- go.sum | 4 ++-- vendor/modules.txt | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/go.mod b/go.mod index c8d1bad64..a06cebab9 100644 --- a/go.mod +++ b/go.mod @@ -118,4 +118,4 @@ require ( sigs.k8s.io/yaml v1.6.0 // indirect ) -replace github.com/openshift/library-go => github.com/bentito/library-go v0.0.0-20260316154701-200e1ee46963 +replace github.com/openshift/library-go => github.com/bentito/library-go v0.0.0-20260326175450-bff69eeafeb1 diff --git a/go.sum b/go.sum index 45ec6e98a..8480dd4e6 100644 --- a/go.sum +++ b/go.sum @@ -13,8 +13,8 @@ github.com/antlr4-go/antlr/v4 v4.13.0 h1:lxCg3LAv+EUK6t1i0y1V6/SLeUi0eKEKdhQAlS8 github.com/antlr4-go/antlr/v4 v4.13.0/go.mod h1:pfChB/xh/Unjila75QW7+VU4TSnWnnk9UTnmpPaOR2g= github.com/bcicen/go-haproxy v0.0.0-20180203142132-ff5824fe38be h1:7b8/p7wJ3N/OjXbZmwJdyR9eOoCla5SbUIe18WHio5s= github.com/bcicen/go-haproxy v0.0.0-20180203142132-ff5824fe38be/go.mod h1:MxVpaKTkNjZu5awzzr6mk6CIKaZYUFGxbmNwMvyVfeM= -github.com/bentito/library-go v0.0.0-20260316154701-200e1ee46963 h1:jhARqaZphAvzRd1htPKPSLHAMEg01p9/CgmrNvqLLxM= -github.com/bentito/library-go v0.0.0-20260316154701-200e1ee46963/go.mod h1:K3FoNLgNBFYbFuG+Kr8usAnQxj1w84XogyUp2M8rK8k= +github.com/bentito/library-go v0.0.0-20260326175450-bff69eeafeb1 h1:e6dH3705E2ENxtCUOOMP9/xGm8pQkcLe/AVRRfsCXqk= +github.com/bentito/library-go v0.0.0-20260326175450-bff69eeafeb1/go.mod h1:K3FoNLgNBFYbFuG+Kr8usAnQxj1w84XogyUp2M8rK8k= github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= github.com/blang/semver/v4 v4.0.0 h1:1PFHFE6yCCTv8C1TeyNNarDzntLi7wMI5i/pzqYIsAM= diff --git a/vendor/modules.txt b/vendor/modules.txt index 8b08d7d75..1216bfcb7 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -221,7 +221,7 @@ github.com/openshift/client-go/route/clientset/versioned/scheme github.com/openshift/client-go/route/clientset/versioned/typed/route/v1 github.com/openshift/client-go/route/clientset/versioned/typed/route/v1/fake github.com/openshift/client-go/route/listers/route/v1 -# github.com/openshift/library-go v0.0.0-20260223145824-7b234b47a906 => github.com/bentito/library-go v0.0.0-20260316154701-200e1ee46963 +# github.com/openshift/library-go v0.0.0-20260223145824-7b234b47a906 => github.com/bentito/library-go v0.0.0-20260326175450-bff69eeafeb1 ## explicit; go 1.24.0 github.com/openshift/library-go/pkg/crypto github.com/openshift/library-go/pkg/proc @@ -1324,4 +1324,4 @@ sigs.k8s.io/structured-merge-diff/v6/value # sigs.k8s.io/yaml v1.6.0 ## explicit; go 1.22 sigs.k8s.io/yaml -# github.com/openshift/library-go => github.com/bentito/library-go v0.0.0-20260316154701-200e1ee46963 +# github.com/openshift/library-go => github.com/bentito/library-go v0.0.0-20260326175450-bff69eeafeb1 From 277448bd6ea107220dea7f25a8695c717a131450 Mon Sep 17 00:00:00 2001 From: Brett Tofel Date: Thu, 26 Mar 2026 17:22:44 -0400 Subject: [PATCH 12/27] chore: bump library-go to grab test fixes --- go.mod | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/go.mod b/go.mod index a06cebab9..699b270c7 100644 --- a/go.mod +++ b/go.mod @@ -118,4 +118,4 @@ require ( sigs.k8s.io/yaml v1.6.0 // indirect ) -replace github.com/openshift/library-go => github.com/bentito/library-go v0.0.0-20260326175450-bff69eeafeb1 +replace github.com/openshift/library-go => github.com/bentito/library-go ef7716e77 From 676251bc22ca23534def1edc472f4a0566050587 Mon Sep 17 00:00:00 2001 From: Brett Tofel Date: Fri, 27 Mar 2026 14:40:46 -0400 Subject: [PATCH 13/27] chore: revert local library-go replace directive --- go.mod | 2 - go.sum | 4 +- .../authorizationutil/subject.go | 56 +++++++++++++++++++ .../authorization/authorizationutil/util.go | 50 +++++++++++++++++ .../pkg/route/secretmanager/manager.go | 39 +++---------- .../library-go/pkg/secret/secret_monitor.go | 34 ++++------- vendor/modules.txt | 4 +- 7 files changed, 128 insertions(+), 61 deletions(-) create mode 100644 vendor/github.com/openshift/library-go/pkg/authorization/authorizationutil/subject.go create mode 100644 vendor/github.com/openshift/library-go/pkg/authorization/authorizationutil/util.go diff --git a/go.mod b/go.mod index 699b270c7..a0cb99be1 100644 --- a/go.mod +++ b/go.mod @@ -117,5 +117,3 @@ require ( sigs.k8s.io/structured-merge-diff/v6 v6.3.0 // indirect sigs.k8s.io/yaml v1.6.0 // indirect ) - -replace github.com/openshift/library-go => github.com/bentito/library-go ef7716e77 diff --git a/go.sum b/go.sum index 8480dd4e6..6d4b34285 100644 --- a/go.sum +++ b/go.sum @@ -13,8 +13,6 @@ github.com/antlr4-go/antlr/v4 v4.13.0 h1:lxCg3LAv+EUK6t1i0y1V6/SLeUi0eKEKdhQAlS8 github.com/antlr4-go/antlr/v4 v4.13.0/go.mod h1:pfChB/xh/Unjila75QW7+VU4TSnWnnk9UTnmpPaOR2g= github.com/bcicen/go-haproxy v0.0.0-20180203142132-ff5824fe38be h1:7b8/p7wJ3N/OjXbZmwJdyR9eOoCla5SbUIe18WHio5s= github.com/bcicen/go-haproxy v0.0.0-20180203142132-ff5824fe38be/go.mod h1:MxVpaKTkNjZu5awzzr6mk6CIKaZYUFGxbmNwMvyVfeM= -github.com/bentito/library-go v0.0.0-20260326175450-bff69eeafeb1 h1:e6dH3705E2ENxtCUOOMP9/xGm8pQkcLe/AVRRfsCXqk= -github.com/bentito/library-go v0.0.0-20260326175450-bff69eeafeb1/go.mod h1:K3FoNLgNBFYbFuG+Kr8usAnQxj1w84XogyUp2M8rK8k= github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= github.com/blang/semver/v4 v4.0.0 h1:1PFHFE6yCCTv8C1TeyNNarDzntLi7wMI5i/pzqYIsAM= @@ -186,6 +184,8 @@ github.com/openshift/api v0.0.0-20260212193555-c06ab675261f h1:l1IgsK48Ym/nED30y github.com/openshift/api v0.0.0-20260212193555-c06ab675261f/go.mod h1:d5uzF0YN2nQQFA0jIEWzzOZ+edmo6wzlGLvx5Fhz4uY= github.com/openshift/client-go v0.0.0-20260108185524-48f4ccfc4e13 h1:6rd4zSo2UaWQcAPZfHK9yzKVqH0BnMv1hqMzqXZyTds= github.com/openshift/client-go v0.0.0-20260108185524-48f4ccfc4e13/go.mod h1:YvOmPmV7wcJxpfhTDuFqqs2Xpb3M3ovsM6Qs/i2ptq4= +github.com/openshift/library-go v0.0.0-20260223145824-7b234b47a906 h1:PkG3CmlU8+HtlW1rspnhwhbKki8rrwYN+L26aH11t2E= +github.com/openshift/library-go v0.0.0-20260223145824-7b234b47a906/go.mod h1:K3FoNLgNBFYbFuG+Kr8usAnQxj1w84XogyUp2M8rK8k= github.com/orisano/pixelmatch v0.0.0-20220722002657-fb0b55479cde/go.mod h1:nZgzbfBr3hhjoZnS66nKrHmduYNpc34ny7RK4z5/HM0= github.com/pkg/profile v1.7.0 h1:hnbDkaNWPCLMO9wGLdBFTIZvzDrDfBM2072E1S9gJkA= github.com/pkg/profile v1.7.0/go.mod h1:8Uer0jas47ZQMJ7VD+OHknK4YDY07LPUC6dEvqDjvNo= diff --git a/vendor/github.com/openshift/library-go/pkg/authorization/authorizationutil/subject.go b/vendor/github.com/openshift/library-go/pkg/authorization/authorizationutil/subject.go new file mode 100644 index 000000000..74c179e68 --- /dev/null +++ b/vendor/github.com/openshift/library-go/pkg/authorization/authorizationutil/subject.go @@ -0,0 +1,56 @@ +package authorizationutil + +import ( + rbacv1 "k8s.io/api/rbac/v1" + "k8s.io/apiserver/pkg/authentication/serviceaccount" +) + +func BuildRBACSubjects(users, groups []string) []rbacv1.Subject { + subjects := []rbacv1.Subject{} + + for _, user := range users { + saNamespace, saName, err := serviceaccount.SplitUsername(user) + if err == nil { + subjects = append(subjects, rbacv1.Subject{Kind: rbacv1.ServiceAccountKind, Namespace: saNamespace, Name: saName}) + } else { + subjects = append(subjects, rbacv1.Subject{Kind: rbacv1.UserKind, APIGroup: rbacv1.GroupName, Name: user}) + } + } + + for _, group := range groups { + subjects = append(subjects, rbacv1.Subject{Kind: rbacv1.GroupKind, APIGroup: rbacv1.GroupName, Name: group}) + } + + return subjects +} + +func RBACSubjectsToUsersAndGroups(subjects []rbacv1.Subject, defaultNamespace string) (users []string, groups []string) { + for _, subject := range subjects { + + switch { + case subject.APIGroup == rbacv1.GroupName && subject.Kind == rbacv1.GroupKind: + groups = append(groups, subject.Name) + case subject.APIGroup == rbacv1.GroupName && subject.Kind == rbacv1.UserKind: + users = append(users, subject.Name) + case subject.APIGroup == "" && subject.Kind == rbacv1.ServiceAccountKind: + // default the namespace to namespace we're working in if + // it's available. This allows rolebindings that reference + // SAs in the local namespace to avoid having to qualify + // them. + ns := defaultNamespace + if len(subject.Namespace) > 0 { + ns = subject.Namespace + } + if len(ns) > 0 { + name := serviceaccount.MakeUsername(ns, subject.Name) + users = append(users, name) + } else { + // maybe error? this fails safe at any rate + } + default: + // maybe error? This fails safe at any rate + } + } + + return users, groups +} diff --git a/vendor/github.com/openshift/library-go/pkg/authorization/authorizationutil/util.go b/vendor/github.com/openshift/library-go/pkg/authorization/authorizationutil/util.go new file mode 100644 index 000000000..040d0f643 --- /dev/null +++ b/vendor/github.com/openshift/library-go/pkg/authorization/authorizationutil/util.go @@ -0,0 +1,50 @@ +package authorizationutil + +import ( + "context" + "errors" + + authorizationv1 "k8s.io/api/authorization/v1" + kerrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/apiserver/pkg/authentication/user" + authorizationclient "k8s.io/client-go/kubernetes/typed/authorization/v1" +) + +// AddUserToSAR adds the requisite user information to a SubjectAccessReview. +// It returns the modified SubjectAccessReview. +func AddUserToSAR(user user.Info, sar *authorizationv1.SubjectAccessReview) *authorizationv1.SubjectAccessReview { + sar.Spec.User = user.GetName() + // reminiscent of the bad old days of C. Copies copy the min number of elements of both source and dest + sar.Spec.Groups = make([]string, len(user.GetGroups())) + copy(sar.Spec.Groups, user.GetGroups()) + sar.Spec.Extra = map[string]authorizationv1.ExtraValue{} + + for k, v := range user.GetExtra() { + sar.Spec.Extra[k] = authorizationv1.ExtraValue(v) + } + + return sar +} + +// Authorize verifies that a given user is permitted to carry out a given +// action. If this cannot be determined, or if the user is not permitted, an +// error is returned. +func Authorize(sarClient authorizationclient.SubjectAccessReviewInterface, user user.Info, resourceAttributes *authorizationv1.ResourceAttributes) error { + sar := AddUserToSAR(user, &authorizationv1.SubjectAccessReview{ + Spec: authorizationv1.SubjectAccessReviewSpec{ + ResourceAttributes: resourceAttributes, + }, + }) + + resp, err := sarClient.Create(context.TODO(), sar, metav1.CreateOptions{}) + if err == nil && resp != nil && resp.Status.Allowed { + return nil + } + + if err == nil { + err = errors.New(resp.Status.Reason) + } + return kerrors.NewForbidden(schema.GroupResource{Group: resourceAttributes.Group, Resource: resourceAttributes.Resource}, resourceAttributes.Name, err) +} diff --git a/vendor/github.com/openshift/library-go/pkg/route/secretmanager/manager.go b/vendor/github.com/openshift/library-go/pkg/route/secretmanager/manager.go index 3cc96a4fd..a4a80ee59 100644 --- a/vendor/github.com/openshift/library-go/pkg/route/secretmanager/manager.go +++ b/vendor/github.com/openshift/library-go/pkg/route/secretmanager/manager.go @@ -61,6 +61,7 @@ func (m *manager) Queue() workqueue.RateLimitingInterface { // Returns an error if the route is already registered with a secret or if adding the secret event handler fails. func (m *manager) RegisterRoute(ctx context.Context, namespace, routeName, secretName string, handler cache.ResourceEventHandlerFuncs) error { m.handlersLock.Lock() + defer m.handlersLock.Unlock() // Generate a unique key for the provided namespace and routeName. key := generateKey(namespace, routeName) @@ -69,44 +70,22 @@ func (m *manager) RegisterRoute(ctx context.Context, namespace, routeName, secre // Each route (namespace/routeName) should be registered only once with any secret. // Note: inside a namespace multiple different routes can be registered(watch) with a common secret. if _, exists := m.registeredHandlers[key]; exists { - m.handlersLock.Unlock() return fmt.Errorf("route already registered with key %s", key) } - // Because adding the secret event handler can take O(latency) time when it starts an informer - // and waits for cache sync, we temporarily release the handlersLock. This permits concurrent - // registrations of other routes. - // We mark this key as tentatively registered using a nil registration - // so that concurrent attempts to register the same route fail immediately. - m.registeredHandlers[key] = referencedSecret{ - secretName: secretName, - handlerRegistration: nil, // placeholder while syncing - } - m.handlersLock.Unlock() - // Add a secret event handler for the specified namespace and secret, with the handler functions. - // This call releases the monitor lock internally during WaitForCacheSync, allowing concurrent - // registrations for different secrets. However, it blocks until the cache is synced so that - // GetSecret works immediately after RegisterRoute returns. klog.V(5).Infof("trying to add handler for key %s with secret %s", key, secretName) - handlerReg, err := m.monitor.AddSecretEventHandler(ctx, namespace, secretName, handler) - - m.handlersLock.Lock() - if err != nil { - delete(m.registeredHandlers, key) - m.handlersLock.Unlock() - return fmt.Errorf("failed to add secret event handler for key %s: %w", key, err) + return err } - // Update only if it wasn't unregistered while we were syncing - if ref, exists := m.registeredHandlers[key]; exists && ref.secretName == secretName && ref.handlerRegistration == nil { - ref.handlerRegistration = handlerReg - m.registeredHandlers[key] = ref - klog.Infof("secret manager registered route for key %s with secret %s", key, secretName) + // Store the registration and secretName in the manager's map. Used during UnregisterRoute() and GetSecret(). + m.registeredHandlers[key] = referencedSecret{ + secretName: secretName, + handlerRegistration: handlerReg, } - m.handlersLock.Unlock() + klog.Infof("secret manager registered route for key %s with secret %s", key, secretName) return nil } @@ -125,10 +104,6 @@ func (m *manager) UnregisterRoute(namespace, routeName string) error { return fmt.Errorf("no handler registered with key %s", key) } - if ref.handlerRegistration == nil { - return fmt.Errorf("route registration currently in progress for key %s", key) - } - // Remove the corresponding secret event handler from the secret monitor. klog.V(5).Info("trying to remove handler with key", key) err := m.monitor.RemoveSecretEventHandler(ref.handlerRegistration) diff --git a/vendor/github.com/openshift/library-go/pkg/secret/secret_monitor.go b/vendor/github.com/openshift/library-go/pkg/secret/secret_monitor.go index 7c66899d5..ddccb0484 100644 --- a/vendor/github.com/openshift/library-go/pkg/secret/secret_monitor.go +++ b/vendor/github.com/openshift/library-go/pkg/secret/secret_monitor.go @@ -93,12 +93,10 @@ func (s *secretMonitor) createSecretInformer(namespace, name string) cache.Share } // addSecretEventHandler adds a secret event handler and starts the informer if not already running. -// -// The global write lock is released before waiting for the informer cache to sync, allowing -// concurrent calls for different secrets to proceed in parallel. This is critical for performance -// when many routes reference external certificate secrets, as registering N secrets serially -// (each requiring an API server round-trip to etcd) would take O(N * api_latency) time. func (s *secretMonitor) addSecretEventHandler(ctx context.Context, namespace, secretName string, handler cache.ResourceEventHandler, secretInformer cache.SharedInformer) (SecretEventHandlerRegistration, error) { + s.lock.Lock() + defer s.lock.Unlock() + if handler == nil { return nil, fmt.Errorf("nil handler is provided") } @@ -106,34 +104,24 @@ func (s *secretMonitor) addSecretEventHandler(ctx context.Context, namespace, se // secret identifier (namespace/secret) key := NewObjectKey(namespace, secretName) - s.lock.Lock() // Start secret informer if monitor does not exist. m, exists := s.monitors[key] if !exists { m = &monitoredItem{} m.itemMonitor = newSingleItemMonitor(key, secretInformer) m.itemMonitor.StartInformer(ctx) - // Register the monitor in the map immediately (before releasing the lock) so that - // concurrent registrations for the same secret reuse this informer rather than - // starting a duplicate. + + // wait for first sync + if !cache.WaitForCacheSync(ctx.Done(), m.itemMonitor.HasSynced) { + return nil, fmt.Errorf("failed waiting for cache sync") + } + + // add item key to monitors map s.monitors[key] = m - } - s.lock.Unlock() - if !exists { klog.Info("secret informer started", " item key ", key) } - // Wait for the informer cache to sync before adding event handlers. - // This ensures GetSecret can retrieve secrets immediately after registration. - // The global lock is released above so other secrets can register concurrently. - if !cache.WaitForCacheSync(ctx.Done(), m.itemMonitor.HasSynced) { - return nil, fmt.Errorf("failed waiting for cache sync") - } - - s.lock.Lock() - defer s.lock.Unlock() - // add the event handler registration, err := m.itemMonitor.AddEventHandler(handler) if err != nil { @@ -206,7 +194,7 @@ func (s *secretMonitor) GetSecret(ctx context.Context, handlerRegistration Secre return nil, fmt.Errorf("secret monitor doesn't exist for key %v", key) } - // Wait for informer store sync to load secrets. + // wait for informer store sync, to load secrets if !cache.WaitForCacheSync(ctx.Done(), handlerRegistration.HasSynced) { return nil, fmt.Errorf("failed waiting for cache sync") } diff --git a/vendor/modules.txt b/vendor/modules.txt index 1216bfcb7..2e3719a0c 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -221,8 +221,9 @@ github.com/openshift/client-go/route/clientset/versioned/scheme github.com/openshift/client-go/route/clientset/versioned/typed/route/v1 github.com/openshift/client-go/route/clientset/versioned/typed/route/v1/fake github.com/openshift/client-go/route/listers/route/v1 -# github.com/openshift/library-go v0.0.0-20260223145824-7b234b47a906 => github.com/bentito/library-go v0.0.0-20260326175450-bff69eeafeb1 +# github.com/openshift/library-go v0.0.0-20260223145824-7b234b47a906 ## explicit; go 1.24.0 +github.com/openshift/library-go/pkg/authorization/authorizationutil github.com/openshift/library-go/pkg/crypto github.com/openshift/library-go/pkg/proc github.com/openshift/library-go/pkg/route/secretmanager @@ -1324,4 +1325,3 @@ sigs.k8s.io/structured-merge-diff/v6/value # sigs.k8s.io/yaml v1.6.0 ## explicit; go 1.22 sigs.k8s.io/yaml -# github.com/openshift/library-go => github.com/bentito/library-go v0.0.0-20260326175450-bff69eeafeb1 From db2142c831b579faae076eb852380e699983869b Mon Sep 17 00:00:00 2001 From: Brett Tofel Date: Fri, 27 Mar 2026 15:01:05 -0400 Subject: [PATCH 14/27] chore: sync vendor directory after removing authorizationutil --- .../authorizationutil/subject.go | 56 ------------------- .../authorization/authorizationutil/util.go | 50 ----------------- vendor/modules.txt | 1 - 3 files changed, 107 deletions(-) delete mode 100644 vendor/github.com/openshift/library-go/pkg/authorization/authorizationutil/subject.go delete mode 100644 vendor/github.com/openshift/library-go/pkg/authorization/authorizationutil/util.go diff --git a/vendor/github.com/openshift/library-go/pkg/authorization/authorizationutil/subject.go b/vendor/github.com/openshift/library-go/pkg/authorization/authorizationutil/subject.go deleted file mode 100644 index 74c179e68..000000000 --- a/vendor/github.com/openshift/library-go/pkg/authorization/authorizationutil/subject.go +++ /dev/null @@ -1,56 +0,0 @@ -package authorizationutil - -import ( - rbacv1 "k8s.io/api/rbac/v1" - "k8s.io/apiserver/pkg/authentication/serviceaccount" -) - -func BuildRBACSubjects(users, groups []string) []rbacv1.Subject { - subjects := []rbacv1.Subject{} - - for _, user := range users { - saNamespace, saName, err := serviceaccount.SplitUsername(user) - if err == nil { - subjects = append(subjects, rbacv1.Subject{Kind: rbacv1.ServiceAccountKind, Namespace: saNamespace, Name: saName}) - } else { - subjects = append(subjects, rbacv1.Subject{Kind: rbacv1.UserKind, APIGroup: rbacv1.GroupName, Name: user}) - } - } - - for _, group := range groups { - subjects = append(subjects, rbacv1.Subject{Kind: rbacv1.GroupKind, APIGroup: rbacv1.GroupName, Name: group}) - } - - return subjects -} - -func RBACSubjectsToUsersAndGroups(subjects []rbacv1.Subject, defaultNamespace string) (users []string, groups []string) { - for _, subject := range subjects { - - switch { - case subject.APIGroup == rbacv1.GroupName && subject.Kind == rbacv1.GroupKind: - groups = append(groups, subject.Name) - case subject.APIGroup == rbacv1.GroupName && subject.Kind == rbacv1.UserKind: - users = append(users, subject.Name) - case subject.APIGroup == "" && subject.Kind == rbacv1.ServiceAccountKind: - // default the namespace to namespace we're working in if - // it's available. This allows rolebindings that reference - // SAs in the local namespace to avoid having to qualify - // them. - ns := defaultNamespace - if len(subject.Namespace) > 0 { - ns = subject.Namespace - } - if len(ns) > 0 { - name := serviceaccount.MakeUsername(ns, subject.Name) - users = append(users, name) - } else { - // maybe error? this fails safe at any rate - } - default: - // maybe error? This fails safe at any rate - } - } - - return users, groups -} diff --git a/vendor/github.com/openshift/library-go/pkg/authorization/authorizationutil/util.go b/vendor/github.com/openshift/library-go/pkg/authorization/authorizationutil/util.go deleted file mode 100644 index 040d0f643..000000000 --- a/vendor/github.com/openshift/library-go/pkg/authorization/authorizationutil/util.go +++ /dev/null @@ -1,50 +0,0 @@ -package authorizationutil - -import ( - "context" - "errors" - - authorizationv1 "k8s.io/api/authorization/v1" - kerrors "k8s.io/apimachinery/pkg/api/errors" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/runtime/schema" - "k8s.io/apiserver/pkg/authentication/user" - authorizationclient "k8s.io/client-go/kubernetes/typed/authorization/v1" -) - -// AddUserToSAR adds the requisite user information to a SubjectAccessReview. -// It returns the modified SubjectAccessReview. -func AddUserToSAR(user user.Info, sar *authorizationv1.SubjectAccessReview) *authorizationv1.SubjectAccessReview { - sar.Spec.User = user.GetName() - // reminiscent of the bad old days of C. Copies copy the min number of elements of both source and dest - sar.Spec.Groups = make([]string, len(user.GetGroups())) - copy(sar.Spec.Groups, user.GetGroups()) - sar.Spec.Extra = map[string]authorizationv1.ExtraValue{} - - for k, v := range user.GetExtra() { - sar.Spec.Extra[k] = authorizationv1.ExtraValue(v) - } - - return sar -} - -// Authorize verifies that a given user is permitted to carry out a given -// action. If this cannot be determined, or if the user is not permitted, an -// error is returned. -func Authorize(sarClient authorizationclient.SubjectAccessReviewInterface, user user.Info, resourceAttributes *authorizationv1.ResourceAttributes) error { - sar := AddUserToSAR(user, &authorizationv1.SubjectAccessReview{ - Spec: authorizationv1.SubjectAccessReviewSpec{ - ResourceAttributes: resourceAttributes, - }, - }) - - resp, err := sarClient.Create(context.TODO(), sar, metav1.CreateOptions{}) - if err == nil && resp != nil && resp.Status.Allowed { - return nil - } - - if err == nil { - err = errors.New(resp.Status.Reason) - } - return kerrors.NewForbidden(schema.GroupResource{Group: resourceAttributes.Group, Resource: resourceAttributes.Resource}, resourceAttributes.Name, err) -} diff --git a/vendor/modules.txt b/vendor/modules.txt index 2e3719a0c..3fecf52fa 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -223,7 +223,6 @@ github.com/openshift/client-go/route/clientset/versioned/typed/route/v1/fake github.com/openshift/client-go/route/listers/route/v1 # github.com/openshift/library-go v0.0.0-20260223145824-7b234b47a906 ## explicit; go 1.24.0 -github.com/openshift/library-go/pkg/authorization/authorizationutil github.com/openshift/library-go/pkg/crypto github.com/openshift/library-go/pkg/proc github.com/openshift/library-go/pkg/route/secretmanager From b10782900256e83b272096cc4a56608dbbe04888 Mon Sep 17 00:00:00 2001 From: Brett Tofel Date: Mon, 30 Mar 2026 14:10:45 -0400 Subject: [PATCH 15/27] Fix Async SAR robustness, cache TTL, and synchronous short-circuit --- go.mod | 2 + go.sum | 4 +- pkg/router/routeapihelpers/validation.go | 91 ++++++++++++++----- .../pkg/route/secretmanager/manager.go | 39 ++++++-- .../library-go/pkg/secret/secret_monitor.go | 34 ++++--- vendor/modules.txt | 3 +- 6 files changed, 129 insertions(+), 44 deletions(-) diff --git a/go.mod b/go.mod index a0cb99be1..0b8905654 100644 --- a/go.mod +++ b/go.mod @@ -117,3 +117,5 @@ require ( sigs.k8s.io/structured-merge-diff/v6 v6.3.0 // indirect sigs.k8s.io/yaml v1.6.0 // indirect ) + +replace github.com/openshift/library-go => github.com/bentito/library-go v0.0.0-20260326212156-ef7716e77898 diff --git a/go.sum b/go.sum index 6d4b34285..13e4a14fc 100644 --- a/go.sum +++ b/go.sum @@ -13,6 +13,8 @@ github.com/antlr4-go/antlr/v4 v4.13.0 h1:lxCg3LAv+EUK6t1i0y1V6/SLeUi0eKEKdhQAlS8 github.com/antlr4-go/antlr/v4 v4.13.0/go.mod h1:pfChB/xh/Unjila75QW7+VU4TSnWnnk9UTnmpPaOR2g= github.com/bcicen/go-haproxy v0.0.0-20180203142132-ff5824fe38be h1:7b8/p7wJ3N/OjXbZmwJdyR9eOoCla5SbUIe18WHio5s= github.com/bcicen/go-haproxy v0.0.0-20180203142132-ff5824fe38be/go.mod h1:MxVpaKTkNjZu5awzzr6mk6CIKaZYUFGxbmNwMvyVfeM= +github.com/bentito/library-go v0.0.0-20260326212156-ef7716e77898 h1:nzgD5pjb18eOkkf/FeXaNofDNHsG5JqJl3JsbFFgH34= +github.com/bentito/library-go v0.0.0-20260326212156-ef7716e77898/go.mod h1:K3FoNLgNBFYbFuG+Kr8usAnQxj1w84XogyUp2M8rK8k= github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= github.com/blang/semver/v4 v4.0.0 h1:1PFHFE6yCCTv8C1TeyNNarDzntLi7wMI5i/pzqYIsAM= @@ -184,8 +186,6 @@ github.com/openshift/api v0.0.0-20260212193555-c06ab675261f h1:l1IgsK48Ym/nED30y github.com/openshift/api v0.0.0-20260212193555-c06ab675261f/go.mod h1:d5uzF0YN2nQQFA0jIEWzzOZ+edmo6wzlGLvx5Fhz4uY= github.com/openshift/client-go v0.0.0-20260108185524-48f4ccfc4e13 h1:6rd4zSo2UaWQcAPZfHK9yzKVqH0BnMv1hqMzqXZyTds= github.com/openshift/client-go v0.0.0-20260108185524-48f4ccfc4e13/go.mod h1:YvOmPmV7wcJxpfhTDuFqqs2Xpb3M3ovsM6Qs/i2ptq4= -github.com/openshift/library-go v0.0.0-20260223145824-7b234b47a906 h1:PkG3CmlU8+HtlW1rspnhwhbKki8rrwYN+L26aH11t2E= -github.com/openshift/library-go v0.0.0-20260223145824-7b234b47a906/go.mod h1:K3FoNLgNBFYbFuG+Kr8usAnQxj1w84XogyUp2M8rK8k= github.com/orisano/pixelmatch v0.0.0-20220722002657-fb0b55479cde/go.mod h1:nZgzbfBr3hhjoZnS66nKrHmduYNpc34ny7RK4z5/HM0= github.com/pkg/profile v1.7.0 h1:hnbDkaNWPCLMO9wGLdBFTIZvzDrDfBM2072E1S9gJkA= github.com/pkg/profile v1.7.0/go.mod h1:8Uer0jas47ZQMJ7VD+OHknK4YDY07LPUC6dEvqDjvNo= diff --git a/pkg/router/routeapihelpers/validation.go b/pkg/router/routeapihelpers/validation.go index 65eb655b5..e7ce4b0b6 100644 --- a/pkg/router/routeapihelpers/validation.go +++ b/pkg/router/routeapihelpers/validation.go @@ -481,7 +481,7 @@ func UpgradeRouteValidation(route *routev1.Route) field.ErrorList { return nil } -const MaxConcurrentSecretSyncs = 50 +const MaxConcurrentSecretSyncs = 5 var asyncSARSemaphore = make(chan struct{}, MaxConcurrentSecretSyncs) @@ -492,6 +492,7 @@ type asyncSARResult struct { callbacks []func(string, string) mu sync.Mutex done bool + createdAt time.Time } // InvalidateAsyncSARCache removes the cached result for a specific secret to force revalidation. @@ -504,15 +505,20 @@ func ClearAsyncSARCacheForTest() { asyncSARCache = sync.Map{} } -// IsAsyncSARPending returns true if a SAR check has been started but not yet -// completed for the given namespace/secretName. When true, callers should defer -// any work that requires confirmed SAR approval (e.g. populating TLS data). func IsAsyncSARPending(namespace, secretName string) bool { cacheKey := namespace + "/" + secretName if cachedRaw, ok := asyncSARCache.Load(cacheKey); ok { cached := cachedRaw.(*asyncSARResult) cached.mu.Lock() defer cached.mu.Unlock() + // Apply a 2-minute TTL to cached SAR results. This ensures that if the + // router's RBAC permissions are updated (e.g. granted) after initial + // failure, we naturally retry and recover without waiting for the secret + // to be mutated. + if time.Since(cached.createdAt) > 2*time.Minute { + asyncSARCache.Delete(cacheKey) + return false + } return !cached.done } return false @@ -532,16 +538,25 @@ func ValidateTLSExternalCertificate(route *routev1.Route, fldPath *field.Path, s if cachedRaw, ok := asyncSARCache.Load(cacheKey); ok { cached := cachedRaw.(*asyncSARResult) cached.mu.Lock() - defer cached.mu.Unlock() - if cached.done { + // Apply a 2-minute TTL to cached SAR results. This ensures that if the + // router's RBAC permissions are updated (e.g. granted) after initial + // failure, we naturally retry and recover without waiting for the secret + // to be mutated. + if time.Since(cached.createdAt) > 2*time.Minute { + cached.mu.Unlock() + asyncSARCache.Delete(cacheKey) + } else { + defer cached.mu.Unlock() + if cached.done { + return cached.errs + } + // SAR is still in-flight; return error so the route stays rejected + // until the SAR completes and triggers re-evaluation. + if onComplete != nil { + cached.callbacks = append(cached.callbacks, onComplete) + } return cached.errs } - if onComplete != nil { - cached.callbacks = append(cached.callbacks, onComplete) - } - // SAR is still in-flight; return error so the route stays rejected - // until the SAR completes and triggers re-evaluation. - return cached.errs } // For tests where dependencies might be mocked/nil, avoid panic @@ -555,10 +570,8 @@ func ValidateTLSExternalCertificate(route *routev1.Route, fldPath *field.Path, s pendingErr := field.ErrorList{field.InternalError(fldPath, fmt.Errorf("authorization check pending for secret %q", secretName))} result := &asyncSARResult{ - errs: pendingErr, - } - if onComplete != nil { - result.callbacks = append(result.callbacks, onComplete) + errs: pendingErr, + createdAt: time.Now(), } if loadedRaw, loaded := asyncSARCache.LoadOrStore(cacheKey, result); loaded { @@ -585,7 +598,7 @@ func ValidateTLSExternalCertificate(route *routev1.Route, fldPath *field.Path, s errs := field.ErrorList{} - timeoutCtx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + timeoutCtx, cancel := context.WithTimeout(context.Background(), 30*time.Second) defer cancel() sarGet := &authorizationv1.SubjectAccessReview{ @@ -644,9 +657,23 @@ func ValidateTLSExternalCertificate(route *routev1.Route, fldPath *field.Path, s errs = append(errs, field.Invalid(fldPath, secretName, fmt.Sprintf("secret of type %q required", kapi.SecretTypeTLS))) } + hasTransientErr := false + for _, e := range errs { + if e.Type == field.ErrorTypeInternal { + hasTransientErr = true + break + } + } + result.mu.Lock() result.errs = errs - result.done = true + if hasTransientErr { + // On transient failures (e.g. rate-limiter timeouts, network blips), + // do not permanently cache the failure. + asyncSARCache.Delete(cacheKey) + } else { + result.done = true + } callbacks := result.callbacks result.callbacks = nil result.mu.Unlock() @@ -661,10 +688,28 @@ func ValidateTLSExternalCertificate(route *routev1.Route, fldPath *field.Path, s return result.errs } - go validateFunc() + fastComplete := make(chan struct{}) + + go func() { + validateFunc() + close(fastComplete) + }() - // Return the pending error: the SAR is running asynchronously. - // The route should stay rejected until the SAR completes and triggers - // re-evaluation via the onComplete callback. - return pendingErr + // Wait up to 250ms synchronously to see if the SAR check completes. + // This avoids "pending" state transitions for healthy routes under normal load. + select { + case <-fastComplete: + result.mu.Lock() + defer result.mu.Unlock() + return result.errs + case <-time.After(250 * time.Millisecond): + result.mu.Lock() + defer result.mu.Unlock() + // Double check if it finished while we were timing out + if result.done { + return result.errs + } + result.callbacks = append(result.callbacks, onComplete) + return pendingErr + } } diff --git a/vendor/github.com/openshift/library-go/pkg/route/secretmanager/manager.go b/vendor/github.com/openshift/library-go/pkg/route/secretmanager/manager.go index a4a80ee59..3cc96a4fd 100644 --- a/vendor/github.com/openshift/library-go/pkg/route/secretmanager/manager.go +++ b/vendor/github.com/openshift/library-go/pkg/route/secretmanager/manager.go @@ -61,7 +61,6 @@ func (m *manager) Queue() workqueue.RateLimitingInterface { // Returns an error if the route is already registered with a secret or if adding the secret event handler fails. func (m *manager) RegisterRoute(ctx context.Context, namespace, routeName, secretName string, handler cache.ResourceEventHandlerFuncs) error { m.handlersLock.Lock() - defer m.handlersLock.Unlock() // Generate a unique key for the provided namespace and routeName. key := generateKey(namespace, routeName) @@ -70,22 +69,44 @@ func (m *manager) RegisterRoute(ctx context.Context, namespace, routeName, secre // Each route (namespace/routeName) should be registered only once with any secret. // Note: inside a namespace multiple different routes can be registered(watch) with a common secret. if _, exists := m.registeredHandlers[key]; exists { + m.handlersLock.Unlock() return fmt.Errorf("route already registered with key %s", key) } + // Because adding the secret event handler can take O(latency) time when it starts an informer + // and waits for cache sync, we temporarily release the handlersLock. This permits concurrent + // registrations of other routes. + // We mark this key as tentatively registered using a nil registration + // so that concurrent attempts to register the same route fail immediately. + m.registeredHandlers[key] = referencedSecret{ + secretName: secretName, + handlerRegistration: nil, // placeholder while syncing + } + m.handlersLock.Unlock() + // Add a secret event handler for the specified namespace and secret, with the handler functions. + // This call releases the monitor lock internally during WaitForCacheSync, allowing concurrent + // registrations for different secrets. However, it blocks until the cache is synced so that + // GetSecret works immediately after RegisterRoute returns. klog.V(5).Infof("trying to add handler for key %s with secret %s", key, secretName) + handlerReg, err := m.monitor.AddSecretEventHandler(ctx, namespace, secretName, handler) + + m.handlersLock.Lock() + if err != nil { - return err + delete(m.registeredHandlers, key) + m.handlersLock.Unlock() + return fmt.Errorf("failed to add secret event handler for key %s: %w", key, err) } - // Store the registration and secretName in the manager's map. Used during UnregisterRoute() and GetSecret(). - m.registeredHandlers[key] = referencedSecret{ - secretName: secretName, - handlerRegistration: handlerReg, + // Update only if it wasn't unregistered while we were syncing + if ref, exists := m.registeredHandlers[key]; exists && ref.secretName == secretName && ref.handlerRegistration == nil { + ref.handlerRegistration = handlerReg + m.registeredHandlers[key] = ref + klog.Infof("secret manager registered route for key %s with secret %s", key, secretName) } - klog.Infof("secret manager registered route for key %s with secret %s", key, secretName) + m.handlersLock.Unlock() return nil } @@ -104,6 +125,10 @@ func (m *manager) UnregisterRoute(namespace, routeName string) error { return fmt.Errorf("no handler registered with key %s", key) } + if ref.handlerRegistration == nil { + return fmt.Errorf("route registration currently in progress for key %s", key) + } + // Remove the corresponding secret event handler from the secret monitor. klog.V(5).Info("trying to remove handler with key", key) err := m.monitor.RemoveSecretEventHandler(ref.handlerRegistration) diff --git a/vendor/github.com/openshift/library-go/pkg/secret/secret_monitor.go b/vendor/github.com/openshift/library-go/pkg/secret/secret_monitor.go index ddccb0484..7c66899d5 100644 --- a/vendor/github.com/openshift/library-go/pkg/secret/secret_monitor.go +++ b/vendor/github.com/openshift/library-go/pkg/secret/secret_monitor.go @@ -93,10 +93,12 @@ func (s *secretMonitor) createSecretInformer(namespace, name string) cache.Share } // addSecretEventHandler adds a secret event handler and starts the informer if not already running. +// +// The global write lock is released before waiting for the informer cache to sync, allowing +// concurrent calls for different secrets to proceed in parallel. This is critical for performance +// when many routes reference external certificate secrets, as registering N secrets serially +// (each requiring an API server round-trip to etcd) would take O(N * api_latency) time. func (s *secretMonitor) addSecretEventHandler(ctx context.Context, namespace, secretName string, handler cache.ResourceEventHandler, secretInformer cache.SharedInformer) (SecretEventHandlerRegistration, error) { - s.lock.Lock() - defer s.lock.Unlock() - if handler == nil { return nil, fmt.Errorf("nil handler is provided") } @@ -104,24 +106,34 @@ func (s *secretMonitor) addSecretEventHandler(ctx context.Context, namespace, se // secret identifier (namespace/secret) key := NewObjectKey(namespace, secretName) + s.lock.Lock() // Start secret informer if monitor does not exist. m, exists := s.monitors[key] if !exists { m = &monitoredItem{} m.itemMonitor = newSingleItemMonitor(key, secretInformer) m.itemMonitor.StartInformer(ctx) - - // wait for first sync - if !cache.WaitForCacheSync(ctx.Done(), m.itemMonitor.HasSynced) { - return nil, fmt.Errorf("failed waiting for cache sync") - } - - // add item key to monitors map + // Register the monitor in the map immediately (before releasing the lock) so that + // concurrent registrations for the same secret reuse this informer rather than + // starting a duplicate. s.monitors[key] = m + } + s.lock.Unlock() + if !exists { klog.Info("secret informer started", " item key ", key) } + // Wait for the informer cache to sync before adding event handlers. + // This ensures GetSecret can retrieve secrets immediately after registration. + // The global lock is released above so other secrets can register concurrently. + if !cache.WaitForCacheSync(ctx.Done(), m.itemMonitor.HasSynced) { + return nil, fmt.Errorf("failed waiting for cache sync") + } + + s.lock.Lock() + defer s.lock.Unlock() + // add the event handler registration, err := m.itemMonitor.AddEventHandler(handler) if err != nil { @@ -194,7 +206,7 @@ func (s *secretMonitor) GetSecret(ctx context.Context, handlerRegistration Secre return nil, fmt.Errorf("secret monitor doesn't exist for key %v", key) } - // wait for informer store sync, to load secrets + // Wait for informer store sync to load secrets. if !cache.WaitForCacheSync(ctx.Done(), handlerRegistration.HasSynced) { return nil, fmt.Errorf("failed waiting for cache sync") } diff --git a/vendor/modules.txt b/vendor/modules.txt index 3fecf52fa..e47fd8d23 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -221,7 +221,7 @@ github.com/openshift/client-go/route/clientset/versioned/scheme github.com/openshift/client-go/route/clientset/versioned/typed/route/v1 github.com/openshift/client-go/route/clientset/versioned/typed/route/v1/fake github.com/openshift/client-go/route/listers/route/v1 -# github.com/openshift/library-go v0.0.0-20260223145824-7b234b47a906 +# github.com/openshift/library-go v0.0.0-20260223145824-7b234b47a906 => github.com/bentito/library-go v0.0.0-20260326212156-ef7716e77898 ## explicit; go 1.24.0 github.com/openshift/library-go/pkg/crypto github.com/openshift/library-go/pkg/proc @@ -1324,3 +1324,4 @@ sigs.k8s.io/structured-merge-diff/v6/value # sigs.k8s.io/yaml v1.6.0 ## explicit; go 1.22 sigs.k8s.io/yaml +# github.com/openshift/library-go => github.com/bentito/library-go v0.0.0-20260326212156-ef7716e77898 From aa7af73e1ae7137afc6a92bcfba45d0a226e277c Mon Sep 17 00:00:00 2001 From: Brett Tofel Date: Tue, 31 Mar 2026 15:49:02 -0400 Subject: [PATCH 16/27] fix(async-sar): introduce dynamic fast-path based on semaphore availability The previous implementation introduced a 250ms synchronous wait for all async SAR evaluations to avoid dummy 'pending' states on standard routes. Under heavy load (e.g., e2e-aws-serial scale tests), this blocked the router's main event loop for 250ms per route once all SAR workers were busy, throttling the router to ~4 routes/second and causing timeouts. This commit switches to a dynamic non-blocking select: - If a worker (semaphore) is immediately available, it launches the check and blocks up to 250ms to preserve atomic admission. - If all workers are busy (capacity maxed), it registers the callback and queues the execution in the background WITHOUT blocking the router loop, restoring full throughput under load. --- pkg/router/routeapihelpers/validation.go | 60 +++++++++++++++--------- 1 file changed, 37 insertions(+), 23 deletions(-) diff --git a/pkg/router/routeapihelpers/validation.go b/pkg/router/routeapihelpers/validation.go index e7ce4b0b6..53f1c7b88 100644 --- a/pkg/router/routeapihelpers/validation.go +++ b/pkg/router/routeapihelpers/validation.go @@ -591,11 +591,6 @@ func ValidateTLSExternalCertificate(route *routev1.Route, fldPath *field.Path, s // Perform checks validateFunc := func() { - // Acquire token (blocks if 50 are already running) - asyncSARSemaphore <- struct{}{} - // Guarantee token release - defer func() { <-asyncSARSemaphore }() - errs := field.ErrorList{} timeoutCtx, cancel := context.WithTimeout(context.Background(), 30*time.Second) @@ -684,32 +679,51 @@ func ValidateTLSExternalCertificate(route *routev1.Route, fldPath *field.Path, s } if onComplete == nil { + asyncSARSemaphore <- struct{}{} + defer func() { <-asyncSARSemaphore }() validateFunc() return result.errs } - fastComplete := make(chan struct{}) - - go func() { - validateFunc() - close(fastComplete) - }() - - // Wait up to 250ms synchronously to see if the SAR check completes. - // This avoids "pending" state transitions for healthy routes under normal load. select { - case <-fastComplete: - result.mu.Lock() - defer result.mu.Unlock() - return result.errs - case <-time.After(250 * time.Millisecond): - result.mu.Lock() - defer result.mu.Unlock() - // Double check if it finished while we were timing out - if result.done { + case asyncSARSemaphore <- struct{}{}: + fastComplete := make(chan struct{}) + + go func() { + defer func() { <-asyncSARSemaphore }() + validateFunc() + close(fastComplete) + }() + + // Wait up to 250ms synchronously to see if the SAR check completes. + // This avoids "pending" state transitions for healthy routes under normal load. + select { + case <-fastComplete: + result.mu.Lock() + defer result.mu.Unlock() return result.errs + case <-time.After(250 * time.Millisecond): + result.mu.Lock() + defer result.mu.Unlock() + // Double check if it finished while we were timing out + if result.done { + return result.errs + } + result.callbacks = append(result.callbacks, onComplete) + return pendingErr } + default: + // Capacity maxed out; queue for execution but return immediately. + result.mu.Lock() result.callbacks = append(result.callbacks, onComplete) + result.mu.Unlock() + + go func() { + asyncSARSemaphore <- struct{}{} + defer func() { <-asyncSARSemaphore }() + validateFunc() + }() + return pendingErr } } From 152fb7ceb1f5da26d056a28813aa7b217e36f3ec Mon Sep 17 00:00:00 2001 From: Brett Tofel Date: Thu, 2 Apr 2026 13:59:45 -0400 Subject: [PATCH 17/27] fix(async-sar): do not cache validation failures Previous logic cached all non-internal errors (Forbidden, NotFound) for 2 minutes. In E2E tests, where RBAC RoleBindings and Secrets are created immediately before Routes, a temporary race condition could cause the initial SAR check to fail. The resulting 2-minute lockout prevented the test from recovering even after RBAC propagated, leading to timeouts. This commit ensures that ONLY successful validations are cached. Any failure (of any type) results in the cache entry being deleted, matching the behavior of the original synchronous validation path where every route event triggered a fresh check if the route wasn't already admitted. --- pkg/router/routeapihelpers/validation.go | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/pkg/router/routeapihelpers/validation.go b/pkg/router/routeapihelpers/validation.go index 53f1c7b88..46b7669ce 100644 --- a/pkg/router/routeapihelpers/validation.go +++ b/pkg/router/routeapihelpers/validation.go @@ -652,19 +652,15 @@ func ValidateTLSExternalCertificate(route *routev1.Route, fldPath *field.Path, s errs = append(errs, field.Invalid(fldPath, secretName, fmt.Sprintf("secret of type %q required", kapi.SecretTypeTLS))) } - hasTransientErr := false - for _, e := range errs { - if e.Type == field.ErrorTypeInternal { - hasTransientErr = true - break - } - } + // On any error (Forbidden, NotFound, Internal, etc.), do not permanently + // cache the failure. This ensures that the router immediately retries + // and recovers when RBAC or secrets are updated, matching the behavior + // of the non-async validation path. + shouldCache := len(errs) == 0 result.mu.Lock() result.errs = errs - if hasTransientErr { - // On transient failures (e.g. rate-limiter timeouts, network blips), - // do not permanently cache the failure. + if !shouldCache { asyncSARCache.Delete(cacheKey) } else { result.done = true From 9bc422fca621616dd9b3f1598023feb671f4ba53 Mon Sep 17 00:00:00 2001 From: Brett Tofel Date: Fri, 3 Apr 2026 10:41:43 -0400 Subject: [PATCH 18/27] fix(async-sar): handle library-go async registration and zombie callbacks This commit addresses two distinct race conditions that caused e2e test flakes when the router was under heavy parallel load (e2e-agnostic, e2e-aws-fips): 1. Zombie Callbacks: In the async SAR validation fast-path, if a background check took longer than 250ms and failed (e.g. RBAC not yet propagated), it deleted the cache entry without marking the result as done. Subsequent concurrent validations could append their callbacks to this zombie result before it was deleted, causing the callbacks to never fire and the route to stall indefinitely. The result is now unconditionally marked done inside the lock. 2. library-go Registration Race: To improve throughput, library-go's secretmanager.RegisterRoute drops its global lock while waiting for the secret informer cache to sync, storing a nil registration placeholder. If an E2E test immediately updates the route (e.g., adding an annotation), the router's UpdateFunc skips registration and calls GetSecret, which hits the nil placeholder and returns 'registration currently in progress'. The router aborted the update, dropping the route. A short retry loop was added to populateRouteTLSFromSecret to wait out this async cache sync. --- pkg/router/controller/route_secret_manager.go | 14 +++++++++++++- pkg/router/routeapihelpers/validation.go | 3 +-- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/pkg/router/controller/route_secret_manager.go b/pkg/router/controller/route_secret_manager.go index a4d651d98..bfcb0e3f4 100644 --- a/pkg/router/controller/route_secret_manager.go +++ b/pkg/router/controller/route_secret_manager.go @@ -3,7 +3,9 @@ package controller import ( "context" "fmt" + "strings" "sync" + "time" routev1 "github.com/openshift/api/route/v1" routelisters "github.com/openshift/client-go/route/listers/route/v1" @@ -428,7 +430,17 @@ func (p *RouteSecretManager) validate(route *routev1.Route) error { // Note: This function performs an in-place update of the route. The caller should be aware that the route's TLS configuration will be modified directly. func (p *RouteSecretManager) populateRouteTLSFromSecret(route *routev1.Route) error { // read referenced secret - secret, err := p.secretManager.GetSecret(context.TODO(), route.Namespace, route.Name) + var secret *kapi.Secret + var err error + for i := 0; i < 20; i++ { + secret, err = p.secretManager.GetSecret(context.TODO(), route.Namespace, route.Name) + if err != nil && strings.Contains(err.Error(), "registration currently in progress") { + time.Sleep(50 * time.Millisecond) + continue + } + break + } + if err != nil { log.Error(err, "failed to get referenced secret") p.recorder.RecordRouteRejection(route, ExtCrtStatusReasonGetFailed, err.Error()) diff --git a/pkg/router/routeapihelpers/validation.go b/pkg/router/routeapihelpers/validation.go index 46b7669ce..9017fe693 100644 --- a/pkg/router/routeapihelpers/validation.go +++ b/pkg/router/routeapihelpers/validation.go @@ -660,10 +660,9 @@ func ValidateTLSExternalCertificate(route *routev1.Route, fldPath *field.Path, s result.mu.Lock() result.errs = errs + result.done = true if !shouldCache { asyncSARCache.Delete(cacheKey) - } else { - result.done = true } callbacks := result.callbacks result.callbacks = nil From 22858a9961193d33cc9267a4895058a3c964fd0a Mon Sep 17 00:00:00 2001 From: Brett Tofel Date: Fri, 3 Apr 2026 20:18:31 -0400 Subject: [PATCH 19/27] Refactor: Replace asynchronous Subject Access Review with synchronous throttled validation to stabilize route status transitions --- pkg/router/controller/contention.go | 2 - pkg/router/controller/route_secret_manager.go | 84 +---- .../controller/route_secret_manager_test.go | 70 +--- pkg/router/routeapihelpers/validation.go | 306 +++++++----------- 4 files changed, 128 insertions(+), 334 deletions(-) diff --git a/pkg/router/controller/contention.go b/pkg/router/controller/contention.go index e108168b0..42aa32b9d 100644 --- a/pkg/router/controller/contention.go +++ b/pkg/router/controller/contention.go @@ -46,8 +46,6 @@ var ( ExtCrtStatusReasonSecretRecreated, ExtCrtStatusReasonSecretUpdated, ExtCrtStatusReasonSecretDeleted, - ExtCrtStatusReasonSARCompleted, - ExtCrtStatusReasonSecretLoaded, ) ) diff --git a/pkg/router/controller/route_secret_manager.go b/pkg/router/controller/route_secret_manager.go index bfcb0e3f4..8fcd84f4e 100644 --- a/pkg/router/controller/route_secret_manager.go +++ b/pkg/router/controller/route_secret_manager.go @@ -3,9 +3,7 @@ package controller import ( "context" "fmt" - "strings" "sync" - "time" routev1 "github.com/openshift/api/route/v1" routelisters "github.com/openshift/client-go/route/listers/route/v1" @@ -27,8 +25,6 @@ const ( ExtCrtStatusReasonSecretUpdated = "ExternalCertificateSecretUpdated" ExtCrtStatusReasonSecretDeleted = "ExternalCertificateSecretDeleted" ExtCrtStatusReasonGetFailed = "ExternalCertificateGetFailed" - ExtCrtStatusReasonSARCompleted = "ExternalCertificateSARCompleted" - ExtCrtStatusReasonSecretLoaded = "ExternalCertificateSecretLoaded" ) // RouteSecretManager implements the router.Plugin interface to register @@ -161,15 +157,10 @@ func (p *RouteSecretManager) HandleRoute(eventType watch.EventType, route *route // Therefore, it is essential to re-sync the secret to ensure the plugin chain correctly handles the route. log.V(4).Info("Re-validating existing external certificate", "namespace", route.Namespace, "secret", oldSecret, "route", route.Name) - // re-validate + // re-validate (synchronous, throttled by semaphore) if err := p.validate(route); err != nil { return err } - // Skip populating TLS if the SAR is still in-flight; the onComplete - // callback will trigger re-evaluation once the SAR resolves. - if routeapihelpers.IsAsyncSARPending(route.Namespace, route.Spec.TLS.ExternalCertificate.Name) { - break - } // read referenced secret and update TLS certificate and key if err := p.populateRouteTLSFromSecret(route); err != nil { return err @@ -214,7 +205,7 @@ func (p *RouteSecretManager) HandleRoute(eventType watch.EventType, route *route // validateAndRegister validates the route's externalCertificate configuration and registers it with the secret manager. // It also updates the in-memory TLS certificate and key after reading from secret informer's cache. func (p *RouteSecretManager) validateAndRegister(route *routev1.Route) error { - // validate + // validate (synchronous, throttled by semaphore) if err := p.validate(route); err != nil { return err } @@ -224,14 +215,6 @@ func (p *RouteSecretManager) validateAndRegister(route *routev1.Route) error { return fmt.Errorf("failed to register router: %w", err) } - // If the SAR check is still in-flight (async), skip populating TLS data now. - // The onComplete callback in validate() will trigger route re-evaluation once - // the SAR resolves, at which point the cache will be done and TLS will be - // populated on the next pass. - if routeapihelpers.IsAsyncSARPending(route.Namespace, route.Spec.TLS.ExternalCertificate.Name) { - return nil - } - // read referenced secret and update TLS certificate and key if err := p.populateRouteTLSFromSecret(route); err != nil { return err @@ -301,23 +284,6 @@ func (p *RouteSecretManager) generateSecretHandler(namespace, routeName string) // by all the plugins (including this plugin). Once passes, the route will become active again. msg := fmt.Sprintf("secret %q recreated for route %q", secret.Name, key) p.recorder.RecordRouteRejection(route, ExtCrtStatusReasonSecretRecreated, msg) - return - } - - // Async secret load scenario - // When the secret completes its initial cache sync, we need to trigger the router - // controller to evaluate this route again. We use RecordRouteRejection to keep the route - // pending until the full plugin chain evaluates and admits it. - route, err := p.routelister.Routes(namespace).Get(routeName) - if err != nil { - log.Error(err, "failed to get route", "namespace", namespace, "route", routeName) - return - } - msg := fmt.Sprintf("secret %q loaded for route %q", secret.Name, key) - if isRouteAdmittedTrue(route.DeepCopy(), p.routerName) { - p.recorder.RecordRouteUpdate(route, ExtCrtStatusReasonSecretLoaded, msg) - } else { - p.recorder.RecordRouteRejection(route, ExtCrtStatusReasonSecretLoaded, msg) } }, @@ -385,37 +351,15 @@ func (p *RouteSecretManager) generateSecretHandler(namespace, routeName string) // If the validation fails, it records the route rejection and triggers // the deletion of the route by calling the HandleRoute method with a watch.Deleted event. // +// This function is synchronous: it blocks until the SAR check completes. +// Concurrency is throttled by the semaphore in ValidateTLSExternalCertificate. +// // NOTE: TLS data validation and sanitization are handled by the next plugin `ExtendedValidator`, // by reading the "tls.crt" and "tls.key" added by populateRouteTLSFromSecret. func (p *RouteSecretManager) validate(route *routev1.Route) error { fldPath := field.NewPath("spec").Child("tls").Child("externalCertificate") - onComplete := func(namespace, secretName string) { - // Ensure fetching the updated route - latestRoute, err := p.routelister.Routes(namespace).Get(route.Name) - if err != nil { - log.Error(err, "failed to get route for SAR completion callback", "namespace", namespace, "route", route.Name) - return - } - - msg := fmt.Sprintf("SAR check completed for secret %q", secretName) - log.V(4).Info(msg, "namespace", namespace, "route", route.Name) - - // Update the route status to notify plugins, including this plugin, for re-evaluation. - if isRouteAdmittedTrue(latestRoute.DeepCopy(), p.routerName) { - p.recorder.RecordRouteUpdate(latestRoute, ExtCrtStatusReasonSARCompleted, msg) - } else { - p.recorder.RecordRouteRejection(latestRoute, ExtCrtStatusReasonSARCompleted, msg) - } - } - - if err := routeapihelpers.ValidateTLSExternalCertificate(route, fldPath, p.sarClient, p.secretsGetter, onComplete).ToAggregate(); err != nil { - // If the SAR check is still pending, return the error to prevent TLS population - // but don't delete the route — the onComplete callback will trigger re-evaluation. - if routeapihelpers.IsAsyncSARPending(route.Namespace, route.Spec.TLS.ExternalCertificate.Name) { - log.V(4).Info("SAR check pending for route, deferring admission", "namespace", route.Namespace, "route", route.Name) - return err - } + if err := routeapihelpers.ValidateTLSExternalCertificate(route, fldPath, p.sarClient, p.secretsGetter).ToAggregate(); err != nil { log.Error(err, "skipping route due to invalid externalCertificate configuration", "namespace", route.Namespace, "route", route.Name) p.recorder.RecordRouteRejection(route, ExtCrtStatusReasonValidationFailed, err.Error()) p.plugin.HandleRoute(watch.Deleted, route) @@ -429,18 +373,10 @@ func (p *RouteSecretManager) validate(route *routev1.Route) error { // the deletion of the route by calling the HandleRoute method with a watch.Deleted event. // Note: This function performs an in-place update of the route. The caller should be aware that the route's TLS configuration will be modified directly. func (p *RouteSecretManager) populateRouteTLSFromSecret(route *routev1.Route) error { - // read referenced secret - var secret *kapi.Secret - var err error - for i := 0; i < 20; i++ { - secret, err = p.secretManager.GetSecret(context.TODO(), route.Namespace, route.Name) - if err != nil && strings.Contains(err.Error(), "registration currently in progress") { - time.Sleep(50 * time.Millisecond) - continue - } - break - } - + // read referenced secret from the informer cache. + // RegisterRoute blocks until cache sync completes, so the secret should be + // available immediately after registration. + secret, err := p.secretManager.GetSecret(context.TODO(), route.Namespace, route.Name) if err != nil { log.Error(err, "failed to get referenced secret") p.recorder.RecordRouteRejection(route, ExtCrtStatusReasonGetFailed, err.Error()) diff --git a/pkg/router/controller/route_secret_manager_test.go b/pkg/router/controller/route_secret_manager_test.go index f00dd678b..80b3b3894 100644 --- a/pkg/router/controller/route_secret_manager_test.go +++ b/pkg/router/controller/route_secret_manager_test.go @@ -5,10 +5,8 @@ import ( "fmt" "os" "reflect" - "strings" "sync" "testing" - "time" "github.com/openshift/library-go/pkg/route/secretmanager/fake" "github.com/openshift/router/pkg/router" @@ -21,7 +19,6 @@ import ( "k8s.io/apimachinery/pkg/fields" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/util/sets" - "k8s.io/apimachinery/pkg/util/wait" "k8s.io/apimachinery/pkg/watch" "k8s.io/client-go/features" testclient "k8s.io/client-go/kubernetes/fake" @@ -1182,44 +1179,6 @@ func TestRouteSecretManager(t *testing.T) { gotErr := rsm.HandleRoute(s.eventType, s.route) - // When a route with externalCertificate is processed, the SAR check - // runs asynchronously. HandleRoute returns a pending error, and the - // onComplete callback fires later to trigger re-evaluation. Wait for - // that callback before asserting the final state, then retry with a - // fresh copy of the route (the first pass may mutate it in-place). - if gotErr != nil && strings.Contains(gotErr.Error(), "authorization check pending") { - err := wait.PollImmediate(10*time.Millisecond, 2*time.Second, func() (bool, error) { - for _, r := range recorder.GetRejections() { - if strings.Contains(r, ExtCrtStatusReasonSARCompleted) { - return true, nil - } - } - for _, u := range recorder.GetUpdates() { - if strings.Contains(u, ExtCrtStatusReasonSARCompleted) { - return true, nil - } - } - return false, nil - }) - if err != nil { - t.Fatalf("timeout waiting for async SAR completion: %v", err) - } - - // Reset the plugin and recorder state to assert only the final pass. - // Use a deep copy so mutations from the first pass don't affect assertions. - freshRoute := s.route.DeepCopy() - p.t = "" - p.route = nil - recorder.Lock() - recorder.rejections = nil - recorder.updates = nil - recorder.doneCh = make(chan struct{}) - recorder.Unlock() - - gotErr = rsm.HandleRoute(s.eventType, freshRoute) - // Replace s.route with freshRoute so the equality checks below use it. - s.route = freshRoute - } if (gotErr != nil) != s.expectedError { t.Fatalf("expected error to be %t, but got %t", s.expectedError, gotErr != nil) @@ -1359,12 +1318,6 @@ func TestSecretUpdate(t *testing.T) { t.Fatalf("failed to add handler: %v", err) } - <-recorder.doneCh // wait for Add event (SecretLoaded) - - recorder.Lock() - recorder.doneCh = make(chan struct{}) - recorder.Unlock() - // update the secret updatedSecret := secret.DeepCopy() updatedSecret.Data = map[string][]byte{ @@ -1375,11 +1328,11 @@ func TestSecretUpdate(t *testing.T) { t.Fatalf("failed to update secret: %v", err) } - // wait until route's status is updated - <-recorder.doneCh + // Wait directly for the Update event. + <-recorder.doneCh // wait for Update event - expectedStatus := []string{"sandbox-route-test:ExternalCertificateSecretLoaded", "sandbox-route-test:ExternalCertificateSecretUpdated"} - expectedRejectionsOnUpdate := []string{"sandbox-route-test:ExternalCertificateSecretLoaded", "sandbox-route-test:ExternalCertificateSecretUpdated"} + expectedStatus := []string{"sandbox-route-test:ExternalCertificateSecretUpdated"} + expectedRejectionsOnUpdate := []string{"sandbox-route-test:ExternalCertificateSecretUpdated"} if s.isRouteAdmittedTrue { // RecordRouteUpdate will be called if `Admitted=True` @@ -1388,7 +1341,6 @@ func TestSecretUpdate(t *testing.T) { } } else { // RecordRouteRejection will be called if `Admitted=False` - // Wait! Our AddFunc also emits a rejection (SecretLoaded). It's also stored in rejections. if !reflect.DeepEqual(expectedRejectionsOnUpdate, recorder.GetRejections()) { t.Fatalf("expected status %v, but got %v", expectedRejectionsOnUpdate, recorder.GetRejections()) } @@ -1438,20 +1390,14 @@ func TestSecretDelete(t *testing.T) { t.Fatalf("failed to add handler: %v", err) } - <-recorder.doneCh // wait until the status is updated to SecretLoaded - recorder.Lock() - recorder.doneCh = make(chan struct{}) - recorder.Unlock() - // delete the secret if err := kubeClient.CoreV1().Secrets(route.Namespace).Delete(context.TODO(), secret.Name, metav1.DeleteOptions{}); err != nil { t.Fatalf("failed to delete secret: %v", err) } - <-recorder.doneCh // wait until the route's status is updated + <-recorder.doneCh // wait until the route's status is updated (deletion) expectedRejections := []string{ - "sandbox-route-test:ExternalCertificateSecretLoaded", "sandbox-route-test:ExternalCertificateSecretDeleted", } expectedDeletedSecrets := true @@ -1500,11 +1446,6 @@ func TestSecretRecreation(t *testing.T) { t.Fatalf("failed to add handler: %v", err) } - <-recorder.doneCh // wait until the status is updated to SecretLoaded - recorder.Lock() - recorder.doneCh = make(chan struct{}) - recorder.Unlock() - // delete the secret if err := kubeClient.CoreV1().Secrets(route.Namespace).Delete(context.TODO(), secret.Name, metav1.DeleteOptions{}); err != nil { t.Fatalf("failed to delete secret: %v", err) @@ -1523,7 +1464,6 @@ func TestSecretRecreation(t *testing.T) { <-recorder.doneCh // wait until the route's status is updated (re-creation) expectedRejections := []string{ - "sandbox-route-test:ExternalCertificateSecretLoaded", "sandbox-route-test:ExternalCertificateSecretDeleted", "sandbox-route-test:ExternalCertificateSecretRecreated", } diff --git a/pkg/router/routeapihelpers/validation.go b/pkg/router/routeapihelpers/validation.go index 9017fe693..56203d493 100644 --- a/pkg/router/routeapihelpers/validation.go +++ b/pkg/router/routeapihelpers/validation.go @@ -481,52 +481,62 @@ func UpgradeRouteValidation(route *routev1.Route) field.ErrorList { return nil } -const MaxConcurrentSecretSyncs = 5 +// MaxConcurrentSARChecks limits the number of simultaneous SubjectAccessReview +// API calls to avoid overwhelming the API server during router startup with +// many externalCertificate routes. +const MaxConcurrentSARChecks = 10 -var asyncSARSemaphore = make(chan struct{}, MaxConcurrentSecretSyncs) +var sarSemaphore = make(chan struct{}, MaxConcurrentSARChecks) -var asyncSARCache sync.Map // key: namespace/secretName, value: *asyncSARResult - -type asyncSARResult struct { +// sarCacheEntry holds a cached successful SAR validation result. +type sarCacheEntry struct { errs field.ErrorList - callbacks []func(string, string) - mu sync.Mutex - done bool createdAt time.Time } -// InvalidateAsyncSARCache removes the cached result for a specific secret to force revalidation. +// sarCacheTTL determines how long successful SAR results are cached before +// revalidation. This ensures eventual consistency when RBAC permissions change. +const sarCacheTTL = 2 * time.Minute + +// sarCache stores successful SAR validation results keyed by "namespace/secretName". +// Only successful validations are cached; failures always trigger fresh checks. +var sarCache sync.Map + +// InvalidateAsyncSARCache removes the cached result for a specific secret, +// forcing revalidation on the next route event. Called when the secret is +// created, updated, or deleted. func InvalidateAsyncSARCache(namespace, secretName string) { - asyncSARCache.Delete(namespace + "/" + secretName) + sarCache.Delete(namespace + "/" + secretName) } -// ClearAsyncSARCacheForTest clears the global async SAR cache for testing purposes. +// ClearAsyncSARCacheForTest clears the global SAR cache for testing purposes. func ClearAsyncSARCacheForTest() { - asyncSARCache = sync.Map{} + sarCache = sync.Map{} } -func IsAsyncSARPending(namespace, secretName string) bool { - cacheKey := namespace + "/" + secretName - if cachedRaw, ok := asyncSARCache.Load(cacheKey); ok { - cached := cachedRaw.(*asyncSARResult) - cached.mu.Lock() - defer cached.mu.Unlock() - // Apply a 2-minute TTL to cached SAR results. This ensures that if the - // router's RBAC permissions are updated (e.g. granted) after initial - // failure, we naturally retry and recover without waiting for the secret - // to be mutated. - if time.Since(cached.createdAt) > 2*time.Minute { - asyncSARCache.Delete(cacheKey) - return false +// checkSARCache returns the cached SAR result if it exists and hasn't expired. +// Returns nil if there is no valid cache entry. +func checkSARCache(cacheKey string) *sarCacheEntry { + if raw, ok := sarCache.Load(cacheKey); ok { + entry := raw.(*sarCacheEntry) + if time.Since(entry.createdAt) > sarCacheTTL { + sarCache.Delete(cacheKey) + return nil } - return !cached.done + return entry } - return false + return nil } -// ValidateTLSExternalCertificate tests different pre-conditions required for -// using externalCertificate. -func ValidateTLSExternalCertificate(route *routev1.Route, fldPath *field.Path, sarc authorizationclient.SubjectAccessReviewInterface, secretsGetter corev1client.SecretsGetter, onComplete func(string, string)) field.ErrorList { +// ValidateTLSExternalCertificate validates that the router service account has +// the required RBAC permissions to access the referenced secret and that the +// secret exists and is of type kubernetes.io/tls. +// +// This function is synchronous and throttled: it blocks on a semaphore to limit +// the number of concurrent SAR API calls, preventing API server overload during +// startup with many externalCertificate routes. Successful results are cached +// with a 2-minute TTL to avoid redundant API calls on subsequent route events. +func ValidateTLSExternalCertificate(route *routev1.Route, fldPath *field.Path, sarc authorizationclient.SubjectAccessReviewInterface, secretsGetter corev1client.SecretsGetter) field.ErrorList { tls := route.Spec.TLS if tls == nil || tls.ExternalCertificate == nil || tls.ExternalCertificate.Name == "" { return nil @@ -535,190 +545,100 @@ func ValidateTLSExternalCertificate(route *routev1.Route, fldPath *field.Path, s secretName := tls.ExternalCertificate.Name cacheKey := route.Namespace + "/" + secretName - if cachedRaw, ok := asyncSARCache.Load(cacheKey); ok { - cached := cachedRaw.(*asyncSARResult) - cached.mu.Lock() - // Apply a 2-minute TTL to cached SAR results. This ensures that if the - // router's RBAC permissions are updated (e.g. granted) after initial - // failure, we naturally retry and recover without waiting for the secret - // to be mutated. - if time.Since(cached.createdAt) > 2*time.Minute { - cached.mu.Unlock() - asyncSARCache.Delete(cacheKey) - } else { - defer cached.mu.Unlock() - if cached.done { - return cached.errs - } - // SAR is still in-flight; return error so the route stays rejected - // until the SAR completes and triggers re-evaluation. - if onComplete != nil { - cached.callbacks = append(cached.callbacks, onComplete) - } - return cached.errs - } + // Fast path: return cached successful result. + if entry := checkSARCache(cacheKey); entry != nil { + return entry.errs } - // For tests where dependencies might be mocked/nil, avoid panic + // For tests where dependencies might be mocked/nil, avoid panic. if sarc == nil || secretsGetter == nil { return field.ErrorList{ field.InternalError(fldPath, fmt.Errorf("external certificate validation dependencies are not configured")), } } - // Store pending error list temporarily so we only launch one goroutine per secret - pendingErr := field.ErrorList{field.InternalError(fldPath, fmt.Errorf("authorization check pending for secret %q", secretName))} - - result := &asyncSARResult{ - errs: pendingErr, - createdAt: time.Now(), - } + // Acquire semaphore slot — blocks if all slots are in use. + // This throttles concurrent SAR API calls to MaxConcurrentSARChecks. + sarSemaphore <- struct{}{} + defer func() { <-sarSemaphore }() - if loadedRaw, loaded := asyncSARCache.LoadOrStore(cacheKey, result); loaded { - // Another goroutine raced to start the check, append to its callbacks - cached := loadedRaw.(*asyncSARResult) - cached.mu.Lock() - defer cached.mu.Unlock() - if cached.done { - return cached.errs - } - if onComplete != nil { - cached.callbacks = append(cached.callbacks, onComplete) - } - // SAR is still in-flight; return error so the route stays rejected. - return cached.errs + // Double-check cache after acquiring semaphore — another goroutine + // may have cached the result while we were waiting. + if entry := checkSARCache(cacheKey); entry != nil { + return entry.errs } - // Perform checks - validateFunc := func() { - errs := field.ErrorList{} + // Perform SAR checks synchronously. + errs := field.ErrorList{} - timeoutCtx, cancel := context.WithTimeout(context.Background(), 30*time.Second) - defer cancel() + timeoutCtx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + defer cancel() - sarGet := &authorizationv1.SubjectAccessReview{ - Spec: authorizationv1.SubjectAccessReviewSpec{ - User: routerServiceAccount, - ResourceAttributes: &authorizationv1.ResourceAttributes{ - Namespace: route.Namespace, Verb: "get", Resource: "secrets", Name: secretName, - }, + sarGet := &authorizationv1.SubjectAccessReview{ + Spec: authorizationv1.SubjectAccessReviewSpec{ + User: routerServiceAccount, + ResourceAttributes: &authorizationv1.ResourceAttributes{ + Namespace: route.Namespace, Verb: "get", Resource: "secrets", Name: secretName, }, - } - resp, err := sarc.Create(timeoutCtx, sarGet, metav1.CreateOptions{}) - if err != nil { - errs = append(errs, field.InternalError(fldPath, fmt.Errorf("failed to check 'get' permission for secret %q: %v", secretName, err))) - } else if !resp.Status.Allowed { - errs = append(errs, field.Forbidden(fldPath, "router serviceaccount does not have permission to get this secret")) - } + }, + } + resp, err := sarc.Create(timeoutCtx, sarGet, metav1.CreateOptions{}) + if err != nil { + errs = append(errs, field.InternalError(fldPath, fmt.Errorf("failed to check 'get' permission for secret %q: %v", secretName, err))) + } else if !resp.Status.Allowed { + errs = append(errs, field.Forbidden(fldPath, "router serviceaccount does not have permission to get this secret")) + } - sarWatch := &authorizationv1.SubjectAccessReview{ - Spec: authorizationv1.SubjectAccessReviewSpec{ - User: routerServiceAccount, - ResourceAttributes: &authorizationv1.ResourceAttributes{ - Namespace: route.Namespace, Verb: "watch", Resource: "secrets", Name: secretName, - }, + sarWatch := &authorizationv1.SubjectAccessReview{ + Spec: authorizationv1.SubjectAccessReviewSpec{ + User: routerServiceAccount, + ResourceAttributes: &authorizationv1.ResourceAttributes{ + Namespace: route.Namespace, Verb: "watch", Resource: "secrets", Name: secretName, }, - } - resp, err = sarc.Create(timeoutCtx, sarWatch, metav1.CreateOptions{}) - if err != nil { - errs = append(errs, field.InternalError(fldPath, fmt.Errorf("failed to check 'watch' permission for secret %q: %v", secretName, err))) - } else if !resp.Status.Allowed { - errs = append(errs, field.Forbidden(fldPath, "router serviceaccount does not have permission to watch this secret")) - } + }, + } + resp, err = sarc.Create(timeoutCtx, sarWatch, metav1.CreateOptions{}) + if err != nil { + errs = append(errs, field.InternalError(fldPath, fmt.Errorf("failed to check 'watch' permission for secret %q: %v", secretName, err))) + } else if !resp.Status.Allowed { + errs = append(errs, field.Forbidden(fldPath, "router serviceaccount does not have permission to watch this secret")) + } - sarList := &authorizationv1.SubjectAccessReview{ - Spec: authorizationv1.SubjectAccessReviewSpec{ - User: routerServiceAccount, - ResourceAttributes: &authorizationv1.ResourceAttributes{ - Namespace: route.Namespace, Verb: "list", Resource: "secrets", Name: secretName, - }, + sarList := &authorizationv1.SubjectAccessReview{ + Spec: authorizationv1.SubjectAccessReviewSpec{ + User: routerServiceAccount, + ResourceAttributes: &authorizationv1.ResourceAttributes{ + Namespace: route.Namespace, Verb: "list", Resource: "secrets", Name: secretName, }, - } - resp, err = sarc.Create(timeoutCtx, sarList, metav1.CreateOptions{}) - if err != nil { - errs = append(errs, field.InternalError(fldPath, fmt.Errorf("failed to check 'list' permission for secret %q: %v", secretName, err))) - } else if !resp.Status.Allowed { - errs = append(errs, field.Forbidden(fldPath, "router serviceaccount does not have permission to list this secret")) - } - - secret, err := secretsGetter.Secrets(route.Namespace).Get(timeoutCtx, secretName, metav1.GetOptions{}) - if err != nil { - if apierrors.IsNotFound(err) { - errs = append(errs, field.NotFound(fldPath, err.Error())) - } else { - errs = append(errs, field.InternalError(fldPath, err)) - } - } else if secret.Type != kapi.SecretTypeTLS { - errs = append(errs, field.Invalid(fldPath, secretName, fmt.Sprintf("secret of type %q required", kapi.SecretTypeTLS))) - } - - // On any error (Forbidden, NotFound, Internal, etc.), do not permanently - // cache the failure. This ensures that the router immediately retries - // and recovers when RBAC or secrets are updated, matching the behavior - // of the non-async validation path. - shouldCache := len(errs) == 0 - - result.mu.Lock() - result.errs = errs - result.done = true - if !shouldCache { - asyncSARCache.Delete(cacheKey) - } - callbacks := result.callbacks - result.callbacks = nil - result.mu.Unlock() - - for _, cb := range callbacks { - cb(route.Namespace, secretName) - } + }, + } + resp, err = sarc.Create(timeoutCtx, sarList, metav1.CreateOptions{}) + if err != nil { + errs = append(errs, field.InternalError(fldPath, fmt.Errorf("failed to check 'list' permission for secret %q: %v", secretName, err))) + } else if !resp.Status.Allowed { + errs = append(errs, field.Forbidden(fldPath, "router serviceaccount does not have permission to list this secret")) } - if onComplete == nil { - asyncSARSemaphore <- struct{}{} - defer func() { <-asyncSARSemaphore }() - validateFunc() - return result.errs - } - - select { - case asyncSARSemaphore <- struct{}{}: - fastComplete := make(chan struct{}) - - go func() { - defer func() { <-asyncSARSemaphore }() - validateFunc() - close(fastComplete) - }() - - // Wait up to 250ms synchronously to see if the SAR check completes. - // This avoids "pending" state transitions for healthy routes under normal load. - select { - case <-fastComplete: - result.mu.Lock() - defer result.mu.Unlock() - return result.errs - case <-time.After(250 * time.Millisecond): - result.mu.Lock() - defer result.mu.Unlock() - // Double check if it finished while we were timing out - if result.done { - return result.errs - } - result.callbacks = append(result.callbacks, onComplete) - return pendingErr + secret, err := secretsGetter.Secrets(route.Namespace).Get(timeoutCtx, secretName, metav1.GetOptions{}) + if err != nil { + if apierrors.IsNotFound(err) { + errs = append(errs, field.NotFound(fldPath, err.Error())) + } else { + errs = append(errs, field.InternalError(fldPath, err)) } - default: - // Capacity maxed out; queue for execution but return immediately. - result.mu.Lock() - result.callbacks = append(result.callbacks, onComplete) - result.mu.Unlock() - - go func() { - asyncSARSemaphore <- struct{}{} - defer func() { <-asyncSARSemaphore }() - validateFunc() - }() + } else if secret.Type != kapi.SecretTypeTLS { + errs = append(errs, field.Invalid(fldPath, secretName, fmt.Sprintf("secret of type %q required", kapi.SecretTypeTLS))) + } - return pendingErr + // Cache only successful validations. Failures trigger a fresh check + // on the next route event, matching the original synchronous validation + // behavior where every event retried if the route wasn't yet admitted. + if len(errs) == 0 { + sarCache.Store(cacheKey, &sarCacheEntry{ + errs: errs, + createdAt: time.Now(), + }) } + + return errs } From c8a93a47d30bb2579113ad4d73f179b8d2105fb5 Mon Sep 17 00:00:00 2001 From: Brett Tofel Date: Mon, 6 Apr 2026 09:00:56 -0400 Subject: [PATCH 20/27] Fix formatting in route_secret_manager_test.go --- pkg/router/controller/route_secret_manager_test.go | 1 - 1 file changed, 1 deletion(-) diff --git a/pkg/router/controller/route_secret_manager_test.go b/pkg/router/controller/route_secret_manager_test.go index 80b3b3894..44511fdc8 100644 --- a/pkg/router/controller/route_secret_manager_test.go +++ b/pkg/router/controller/route_secret_manager_test.go @@ -1179,7 +1179,6 @@ func TestRouteSecretManager(t *testing.T) { gotErr := rsm.HandleRoute(s.eventType, s.route) - if (gotErr != nil) != s.expectedError { t.Fatalf("expected error to be %t, but got %t", s.expectedError, gotErr != nil) } From 2741705aedcbe6557696f5f690771943b21b3d0a Mon Sep 17 00:00:00 2001 From: Brett Tofel Date: Wed, 15 Apr 2026 14:03:15 -0400 Subject: [PATCH 21/27] build: fix debug makefile to explicitly cross-compile linux/amd64 When building the debug image on MacOS (arm64), the `openshift-router` binary was compiling as a Mach-O arm64 executable because `GOOS=linux` was not explicitly passed inline to the `go build` command, and `GOARCH` was omitted entirely. This updates `hack/Makefile.debug` to explicitly inline `GOOS=$(GOOS)` and `GOARCH=$(GOARCH)` (defaulting to amd64) to guarantee an ELF binary is built, which matches the x86_64 HAProxy image base. --- hack/Makefile.debug | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/hack/Makefile.debug b/hack/Makefile.debug index d0d1c0e34..9f3f760fd 100644 --- a/hack/Makefile.debug +++ b/hack/Makefile.debug @@ -1,6 +1,7 @@ # -*- mode: makefile -*- export GOOS=linux +export GOARCH ?= amd64 REGISTRY ?= quay.io IMAGE ?= openshift/openshift-router @@ -11,7 +12,7 @@ REMOTE_IMAGE ?= $(REGISTRY)/$(IMAGE):$(TAG) OPENSHIFT_ENDPOINT ?= $(shell oc config view --minify --template '{{(index .clusters 0).cluster.server}}' | grep -o '//[^ :]*' | sed 's/^..//') new-openshift-router-image: - GO111MODULE=on CGO_ENABLED=0 GOFLAGS=-mod=vendor go build -o openshift-router -gcflags=all="-N -l" ./cmd/openshift-router + GO111MODULE=on CGO_ENABLED=0 GOOS=$(GOOS) GOARCH=$(GOARCH) GOFLAGS=-mod=vendor go build -o openshift-router -gcflags=all="-N -l" ./cmd/openshift-router $(IMAGEBUILDER) build -t $(LOCAL_IMAGE) -f hack/Dockerfile.debug . push: From 5ff126854f196748ba15878c1f537be5e36db2c2 Mon Sep 17 00:00:00 2001 From: Brett Tofel Date: Mon, 4 May 2026 15:56:09 -0400 Subject: [PATCH 22/27] perf: enable true parallel route status and SAR validation This commit significantly improves router startup performance with hundreds of external certificate routes by introducing true parallel processing for route status updates and SubjectAccessReview (SAR) checks. This architecture safely increases throughput without overwhelming the API server by aligning three critical components: 1. Matched Client Rate Limits: Explicitly tuning the kube-client (QPS=50, Burst=100) prevents client-side queueing timeouts by ensuring the client can actually handle the requested concurrency. 2. Bounded Worker Pool: Refactoring the WriterLease to use a fixed pool of 50 concurrent workers prevents unbound goroutine growth and memory pressure under heavy load. 3. Strict API Ceiling: The semaphore and kube-client limits act together as a hard ceiling, guaranteeing the API server is protected from massive request spikes. Combined with the existing 2-minute SAR cache, this creates a mathematically bounded, highly predictable pipeline for rapid route validation. --- pkg/cmd/infra/router/clientcmd.go | 7 +++++ pkg/cmd/infra/router/template.go | 3 +- pkg/router/routeapihelpers/validation.go | 2 +- pkg/router/router_test.go | 2 +- pkg/router/writerlease/writerlease.go | 36 +++++++++++++++++----- pkg/router/writerlease/writerlease_test.go | 10 +++--- 6 files changed, 44 insertions(+), 16 deletions(-) diff --git a/pkg/cmd/infra/router/clientcmd.go b/pkg/cmd/infra/router/clientcmd.go index d3d59135c..ddad1f314 100644 --- a/pkg/cmd/infra/router/clientcmd.go +++ b/pkg/cmd/infra/router/clientcmd.go @@ -60,6 +60,13 @@ func (cfg *Config) KubeConfig() (*restclient.Config, string, error) { if err != nil { return nil, "", err } + + // Increase client-side rate limiting to support higher throughput during + // router startup, especially when many external certificate routes are + // present. + clientConfig.QPS = 50 + clientConfig.Burst = 100 + return clientConfig, namespace, nil } diff --git a/pkg/cmd/infra/router/template.go b/pkg/cmd/infra/router/template.go index 2505e43ca..a8f6ccaa8 100644 --- a/pkg/cmd/infra/router/template.go +++ b/pkg/cmd/infra/router/template.go @@ -42,6 +42,7 @@ import ( "github.com/openshift/router/pkg/router/controller" "github.com/openshift/router/pkg/router/metrics" "github.com/openshift/router/pkg/router/metrics/haproxy" + "github.com/openshift/router/pkg/router/routeapihelpers" "github.com/openshift/router/pkg/router/shutdown" templateplugin "github.com/openshift/router/pkg/router/template" haproxyconfigmanager "github.com/openshift/router/pkg/router/template/configmanager/haproxy" @@ -800,7 +801,7 @@ func (o *TemplateRouterOptions) Run(stopCh <-chan struct{}) error { informer := factory.CreateRoutesSharedInformer() routeLister := routelisters.NewRouteLister(informer.GetIndexer()) if o.UpdateStatus { - lease := writerlease.New(time.Minute, 3*time.Second) + lease := writerlease.New(time.Minute, 3*time.Second, routeapihelpers.MaxConcurrentSARChecks) go lease.Run(stopCh) tracker := controller.NewSimpleContentionTracker(informer, o.RouterName, o.ResyncInterval/10) tracker.SetConflictMessage(fmt.Sprintf("The router detected another process is writing conflicting updates to route status with name %q. Please ensure that the configuration of all routers is consistent. Route status will not be updated as long as conflicts are detected.", o.RouterName)) diff --git a/pkg/router/routeapihelpers/validation.go b/pkg/router/routeapihelpers/validation.go index 56203d493..6ebef541a 100644 --- a/pkg/router/routeapihelpers/validation.go +++ b/pkg/router/routeapihelpers/validation.go @@ -484,7 +484,7 @@ func UpgradeRouteValidation(route *routev1.Route) field.ErrorList { // MaxConcurrentSARChecks limits the number of simultaneous SubjectAccessReview // API calls to avoid overwhelming the API server during router startup with // many externalCertificate routes. -const MaxConcurrentSARChecks = 10 +const MaxConcurrentSARChecks = 50 var sarSemaphore = make(chan struct{}, MaxConcurrentSARChecks) diff --git a/pkg/router/router_test.go b/pkg/router/router_test.go index 7978114bd..a56aeadf4 100644 --- a/pkg/router/router_test.go +++ b/pkg/router/router_test.go @@ -94,7 +94,7 @@ func TestMain(m *testing.M) { factory := routerSelection.NewFactory(routeClient, projectClient.ProjectV1().Projects(), client) informer := factory.CreateRoutesSharedInformer() routeLister := routelisters.NewRouteLister(informer.GetIndexer()) - lease := writerlease.New(time.Minute, 3*time.Second) + lease := writerlease.New(time.Minute, 3*time.Second, 1) go lease.Run(wait.NeverStop) tracker := controller.NewSimpleContentionTracker(informer, namespace, 60*time.Second) tracker.SetConflictMessage(fmt.Sprintf("The router detected another process is writing conflicting updates to route status with name %q. Please ensure that the configuration of all routers is consistent. Route status will not be updated as long as conflicts are detected.", namespace)) diff --git a/pkg/router/writerlease/writerlease.go b/pkg/router/writerlease/writerlease.go index 67244c5c9..a3383e2c4 100644 --- a/pkg/router/writerlease/writerlease.go +++ b/pkg/router/writerlease/writerlease.go @@ -98,11 +98,13 @@ type WriterLease struct { state State expires time.Time tick int + + workers int } // New creates a new Lease. Specify the duration to hold leases for and the retry // interval on requests that fail. -func New(leaseDuration, retryInterval time.Duration) *WriterLease { +func New(leaseDuration, retryInterval time.Duration, workers int) *WriterLease { backoff := wait.Backoff{ Duration: 20 * time.Millisecond, Factor: 4, @@ -110,6 +112,10 @@ func New(leaseDuration, retryInterval time.Duration) *WriterLease { Jitter: 0.5, } + if workers < 1 { + workers = 1 + } + return &WriterLease{ name: fmt.Sprintf("%08d", rand.Int31()), backoff: backoff, @@ -120,12 +126,18 @@ func New(leaseDuration, retryInterval time.Duration) *WriterLease { queued: make(map[WorkKey]*work), queue: workqueue.NewDelayingQueue(), once: make(chan struct{}), + + workers: workers, } } // NewWithBackoff creates a new Lease. Specify the duration to hold leases for and the retry // interval on requests that fail. -func NewWithBackoff(name string, leaseDuration, retryInterval time.Duration, backoff wait.Backoff) *WriterLease { +func NewWithBackoff(name string, leaseDuration, retryInterval time.Duration, backoff wait.Backoff, workers int) *WriterLease { + if workers < 1 { + workers = 1 + } + return &WriterLease{ name: name, backoff: backoff, @@ -136,6 +148,8 @@ func NewWithBackoff(name string, leaseDuration, retryInterval time.Duration, bac queued: make(map[WorkKey]*work), queue: workqueue.NewNamedDelayingQueue(name), once: make(chan struct{}), + + workers: workers, } } @@ -143,14 +157,20 @@ func (l *WriterLease) Run(stopCh <-chan struct{}) { defer utilruntime.HandleCrash() defer l.queue.ShutDown() - go func() { - defer utilruntime.HandleCrash() - for l.work() { - } - log.V(4).Info("worker stopped", "worker", l.name) - }() + var wg sync.WaitGroup + for i := 0; i < l.workers; i++ { + wg.Add(1) + go func(workerID int) { + defer utilruntime.HandleCrash() + defer wg.Done() + for l.work() { + } + log.V(4).Info("worker stopped", "worker", l.name, "workerID", workerID) + }(i) + } <-stopCh + wg.Wait() } func (l *WriterLease) Expire() { diff --git a/pkg/router/writerlease/writerlease_test.go b/pkg/router/writerlease/writerlease_test.go index 14fd3baeb..0cb990a23 100644 --- a/pkg/router/writerlease/writerlease_test.go +++ b/pkg/router/writerlease/writerlease_test.go @@ -6,7 +6,7 @@ import ( ) func TestWaitForLeader(t *testing.T) { - l := New(0, 0) + l := New(0, 0, 1) defer func() { if len(l.queued) > 0 { t.Fatalf("queue was not empty on shutdown: %#v", l.queued) @@ -30,7 +30,7 @@ func TestWaitForLeader(t *testing.T) { } func TestBecomeLeaderAfterRetry(t *testing.T) { - l := New(0, 0) + l := New(0, 0, 1) ch := make(chan struct{}) defer close(ch) go l.Run(ch) @@ -50,7 +50,7 @@ func TestBecomeLeaderAfterRetry(t *testing.T) { } func TestBecomeFollowerAfterRetry(t *testing.T) { - l := New(0, 0) + l := New(0, 0, 1) l.backoff.Steps = 0 l.backoff.Duration = 0 ch := make(chan struct{}) @@ -72,7 +72,7 @@ func TestBecomeFollowerAfterRetry(t *testing.T) { } func TestRunOverlappingWork(t *testing.T) { - l := New(0, 0) + l := New(0, 0, 1) l.backoff.Steps = 0 l.backoff.Duration = 0 done := make(chan struct{}) @@ -112,7 +112,7 @@ func TestRunOverlappingWork(t *testing.T) { } func TestExtend(t *testing.T) { - l := New(10*time.Millisecond, 0) + l := New(10*time.Millisecond, 0, 1) l.nowFn = func() time.Time { return time.Unix(0, 0) } l.backoff.Steps = 0 l.backoff.Duration = 2 * time.Millisecond From 9a962f970bf6d4743efa31cec6515f15b671dce7 Mon Sep 17 00:00:00 2001 From: Brett Tofel Date: Mon, 11 May 2026 19:33:02 -0400 Subject: [PATCH 23/27] perf: implement asynchronous external certificate validation and non-blocking writer lease --- pkg/router/controller/route_secret_manager.go | 74 +++++--- .../controller/route_secret_manager_test.go | 41 ++++- pkg/router/controller/status_test.go | 14 +- pkg/router/writerlease/writerlease.go | 5 +- scale-test.sh | 164 ++++++++++++++++++ 5 files changed, 267 insertions(+), 31 deletions(-) create mode 100755 scale-test.sh diff --git a/pkg/router/controller/route_secret_manager.go b/pkg/router/controller/route_secret_manager.go index 8fcd84f4e..d0600b7e0 100644 --- a/pkg/router/controller/route_secret_manager.go +++ b/pkg/router/controller/route_secret_manager.go @@ -111,10 +111,15 @@ func (p *RouteSecretManager) HandleRoute(eventType watch.EventType, route *route case watch.Added: // register with secret monitor if hasExternalCertificate(route) { - log.V(4).Info("Validating and registering external certificate", "namespace", route.Namespace, "secret", route.Spec.TLS.ExternalCertificate.Name, "route", route.Name) - if err := p.validateAndRegister(route); err != nil { - return err - } + go func(route *routev1.Route) { + log.V(4).Info("Validating and registering external certificate (async)", "namespace", route.Namespace, "secret", route.Spec.TLS.ExternalCertificate.Name, "route", route.Name) + if err := p.validateAndRegister(route); err != nil { + log.Error(err, "failed to validate and register external certificate", "namespace", route.Namespace, "route", route.Name) + return + } + p.plugin.HandleRoute(watch.Added, route) + }(route.DeepCopy()) + return nil } case watch.Modified: @@ -127,14 +132,21 @@ func (p *RouteSecretManager) HandleRoute(eventType watch.EventType, route *route // Both new and old routes have externalCertificate if oldSecret != route.Spec.TLS.ExternalCertificate.Name { // ExternalCertificate is updated - log.V(4).Info("Validating and registering updated external certificate", "namespace", route.Namespace, "oldSecret", oldSecret, "newSecret", route.Spec.TLS.ExternalCertificate.Name, "route", route.Name) + log.V(4).Info("Unregistering old external certificate", "namespace", route.Namespace, "oldSecret", oldSecret, "route", route.Name) // Unregister the old and register the new external certificate if err := p.unregister(route); err != nil { return err } - if err := p.validateAndRegister(route); err != nil { - return err - } + + go func(route *routev1.Route) { + log.V(4).Info("Validating and registering updated external certificate (async)", "namespace", route.Namespace, "newSecret", route.Spec.TLS.ExternalCertificate.Name, "route", route.Name) + if err := p.validateAndRegister(route); err != nil { + log.Error(err, "failed to validate and register updated external certificate", "namespace", route.Namespace, "route", route.Name) + return + } + p.plugin.HandleRoute(watch.Modified, route) + }(route.DeepCopy()) + return nil } else { // ExternalCertificate is not updated // Re-validate and update the in-memory TLS certificate and key (even if ExternalCertificate remains unchanged) @@ -156,25 +168,35 @@ func (p *RouteSecretManager) HandleRoute(eventType watch.EventType, route *route // // Therefore, it is essential to re-sync the secret to ensure the plugin chain correctly handles the route. - log.V(4).Info("Re-validating existing external certificate", "namespace", route.Namespace, "secret", oldSecret, "route", route.Name) - // re-validate (synchronous, throttled by semaphore) - if err := p.validate(route); err != nil { - return err - } - // read referenced secret and update TLS certificate and key - if err := p.populateRouteTLSFromSecret(route); err != nil { - return err - } - + go func(route *routev1.Route) { + log.V(4).Info("Re-validating existing external certificate (async)", "namespace", route.Namespace, "secret", oldSecret, "route", route.Name) + // re-validate (synchronous, throttled by semaphore) + if err := p.validate(route); err != nil { + log.Error(err, "failed to re-validate external certificate", "namespace", route.Namespace, "route", route.Name) + return + } + // read referenced secret and update TLS certificate and key + if err := p.populateRouteTLSFromSecret(route); err != nil { + log.Error(err, "failed to populate TLS from secret", "namespace", route.Namespace, "route", route.Name) + return + } + p.plugin.HandleRoute(watch.Modified, route) + }(route.DeepCopy()) + return nil } case newHasExt && !oldHadExt: // New route has externalCertificate, old route did not - log.V(4).Info("Validating and registering new external certificate", "namespace", route.Namespace, "secret", route.Spec.TLS.ExternalCertificate.Name, "route", route.Name) - // register with secret monitor - if err := p.validateAndRegister(route); err != nil { - return err - } + go func(route *routev1.Route) { + log.V(4).Info("Validating and registering new external certificate (async)", "namespace", route.Namespace, "secret", route.Spec.TLS.ExternalCertificate.Name, "route", route.Name) + // register with secret monitor + if err := p.validateAndRegister(route); err != nil { + log.Error(err, "failed to validate and register new external certificate", "namespace", route.Namespace, "route", route.Name) + return + } + p.plugin.HandleRoute(watch.Modified, route) + }(route.DeepCopy()) + return nil case !newHasExt && oldHadExt: // Old route had externalCertificate, new route does not @@ -212,6 +234,8 @@ func (p *RouteSecretManager) validateAndRegister(route *routev1.Route) error { // register route with secretManager handler := p.generateSecretHandler(route.Namespace, route.Name) if err := p.secretManager.RegisterRoute(context.TODO(), route.Namespace, route.Name, route.Spec.TLS.ExternalCertificate.Name, handler); err != nil { + p.recorder.RecordRouteRejection(route, ExtCrtStatusReasonGetFailed, err.Error()) + p.plugin.HandleRoute(watch.Deleted, route) return fmt.Errorf("failed to register router: %w", err) } @@ -290,6 +314,10 @@ func (p *RouteSecretManager) generateSecretHandler(namespace, routeName string) UpdateFunc: func(old interface{}, new interface{}) { secretOld := old.(*kapi.Secret) secretNew := new.(*kapi.Secret) + if secretOld.ResourceVersion == secretNew.ResourceVersion { + return + } + key := generateKey(namespace, routeName) log.V(4).Info("Secret updated for route", "namespace", namespace, "secret", secretNew.Name, "oldSecretVersion", secretOld.ResourceVersion, "newSecretVersion", secretNew.ResourceVersion, "route", routeName) routeapihelpers.InvalidateAsyncSARCache(namespace, secretNew.Name) diff --git a/pkg/router/controller/route_secret_manager_test.go b/pkg/router/controller/route_secret_manager_test.go index 44511fdc8..253cbe4ce 100644 --- a/pkg/router/controller/route_secret_manager_test.go +++ b/pkg/router/controller/route_secret_manager_test.go @@ -7,6 +7,7 @@ import ( "reflect" "sync" "testing" + "time" "github.com/openshift/library-go/pkg/route/secretmanager/fake" "github.com/openshift/router/pkg/router" @@ -336,6 +337,9 @@ func TestRouteSecretManager(t *testing.T) { eventType: watch.Added, allow: true, expectedError: true, + expectedRejections: []string{ + "sandbox-route-test:ExternalCertificateGetFailed", + }, }, { name: "route added with externalCertificate allowed and correct secret", @@ -553,6 +557,9 @@ func TestRouteSecretManager(t *testing.T) { allow: true, eventType: watch.Modified, expectedError: true, + expectedRejections: []string{ + "sandbox-route-test:ExternalCertificateGetFailed", + }, }, { name: "route updated: old route without externalCertificate, new route with externalCertificate allowed and correct secret", @@ -1171,7 +1178,9 @@ func TestRouteSecretManager(t *testing.T) { for _, s := range scenarios { t.Run(s.name, func(t *testing.T) { routeapihelpers.ClearAsyncSARCacheForTest() - p := &fakePlugin{} + p := &fakePlugin{ + doneCh: make(chan struct{}), + } recorder := &statusRecorder{ doneCh: make(chan struct{}), } @@ -1179,7 +1188,35 @@ func TestRouteSecretManager(t *testing.T) { gotErr := rsm.HandleRoute(s.eventType, s.route) - if (gotErr != nil) != s.expectedError { + // For routes with external certificates (except deletion), the processing is asynchronous. + // We need to wait for the background goroutine to complete (either by calling the next plugin + // or by recording a rejection), UNLESS an error occurred synchronously. + isAsync := s.eventType != watch.Deleted && hasExternalCertificate(s.route) + if isAsync && gotErr == nil { + select { + case <-p.doneCh: + case <-recorder.doneCh: + case <-time.After(5 * time.Second): + t.Fatal("timed out waiting for async route processing") + } + // The synchronous HandleRoute call always returns nil for async routes if it successfully launched the goroutine. + // If we expected an error, check if a rejection was recorded instead. + if s.expectedError { + if len(recorder.rejections) == 0 { + t.Fatalf("expected a rejection to be recorded for async route, but got none") + } + // On failure, the plugin is called with watch.Deleted. + // We update the local expected state to match this behavior for the deep equal check below. + if s.expectedRoute == nil { + p.route = nil + } + if s.expectedEventType == "" { + p.t = "" + } + } + } + + if !isAsync && (gotErr != nil) != s.expectedError { t.Fatalf("expected error to be %t, but got %t", s.expectedError, gotErr != nil) } if !reflect.DeepEqual(s.expectedRoute, p.route) { diff --git a/pkg/router/controller/status_test.go b/pkg/router/controller/status_test.go index 0bf6ca223..e125a1017 100644 --- a/pkg/router/controller/status_test.go +++ b/pkg/router/controller/status_test.go @@ -50,13 +50,21 @@ func (_ noopLease) Remove(key writerlease.WorkKey) { } type fakePlugin struct { - t watch.EventType - route *routev1.Route - err error + t watch.EventType + route *routev1.Route + err error + doneCh chan struct{} } func (p *fakePlugin) HandleRoute(t watch.EventType, route *routev1.Route) error { p.t, p.route = t, route + if p.doneCh != nil { + select { + case <-p.doneCh: + default: + close(p.doneCh) + } + } return p.err } diff --git a/pkg/router/writerlease/writerlease.go b/pkg/router/writerlease/writerlease.go index a3383e2c4..5e9b42079 100644 --- a/pkg/router/writerlease/writerlease.go +++ b/pkg/router/writerlease/writerlease.go @@ -270,9 +270,8 @@ func (l *WriterLease) work() bool { if leaseState == Follower { // if we are following, continue to defer work until the lease expires if remaining := leaseExpires.Sub(l.nowFn()); remaining > 0 { - log.V(4).Info("follower awaiting lease expiration", "worker", l.name, "key", key, "leaseTimeRemaining", remaining) - time.Sleep(remaining) - l.queue.Add(key) + log.V(4).Info("follower awaiting lease expiration, requeueing", "worker", l.name, "key", key, "leaseTimeRemaining", remaining) + l.queue.AddAfter(key, remaining) l.queue.Done(key) return true } diff --git a/scale-test.sh b/scale-test.sh new file mode 100755 index 000000000..212b55d8a --- /dev/null +++ b/scale-test.sh @@ -0,0 +1,164 @@ +#!/bin/bash +set -euo pipefail + +IMAGE="quay.io/btofel/router:patched-bt" +NAMESPACE="router-scale-test" +NUM_ROUTES=1000 +MAX_STALL=60 # Seconds to wait without progress before declaring a stall + +while [[ $# -gt 0 ]]; do + case $1 in + --single) + NUM_ROUTES=1 + shift + ;; + *) + IMAGE="$1" + shift + ;; + esac +done + +echo "=========================================================" +echo " Router Scale Test" +echo " Image: $IMAGE" +echo " Routes: $NUM_ROUTES" +echo "=========================================================" + +echo -e "\n=== 1. Scaling down Operators (CVO & Ingress) ===" +# Stop CVO from overwriting the ingress operator +oc scale --replicas 0 -n openshift-cluster-version deployments/cluster-version-operator 2>/dev/null || true +# Stop Ingress Operator from overwriting our custom router image +oc scale --replicas 0 -n openshift-ingress-operator deployments ingress-operator 2>/dev/null || true + +echo -e "\n=== 2. Patching router deployment ===" +# Inject the custom image into the router deployment +oc -n openshift-ingress set image deployment router-default router="$IMAGE" +# Wait for the rollout to complete +echo "Waiting for router pod to roll out with the new image..." +oc -n openshift-ingress rollout status deployment router-default --timeout=5m +echo -e "\n=== 3. Setting up test namespace: $NAMESPACE ===" +if oc get project "$NAMESPACE" &>/dev/null; then + echo "Deleting old project..." + oc delete project "$NAMESPACE" + # Wait for project to fully terminate + while oc get project "$NAMESPACE" &>/dev/null; do sleep 2; done +fi +oc new-project "$NAMESPACE" >/dev/null + +echo -e "\n=== 4. Generating $NUM_ROUTES secrets and routes ===" +# Generate a valid dummy certificate so the ExtendedValidator doesn't reject it +openssl req -x509 -newkey rsa:2048 -keyout /tmp/tls.key -out /tmp/tls.crt -days 365 -nodes -subj "/CN=test-scale.com" 2>/dev/null +CRT_B64=$(cat /tmp/tls.crt | base64 | tr -d '\n') +KEY_B64=$(cat /tmp/tls.key | base64 | tr -d '\n') + +# Create a dummy service to satisfy the route 'To' target +oc create service clusterip dummy-svc --tcp=8080:8080 -n "$NAMESPACE" >/dev/null + +YAML_FILE="/tmp/scale-test-resources.yaml" +> "$YAML_FILE" + +# Grant the router service account permission to read secrets in this namespace +cat <> "$YAML_FILE" +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: router-secret-reader + namespace: $NAMESPACE +rules: +- apiGroups: [""] + resources: ["secrets"] + verbs: ["get", "list", "watch"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: router-secret-reader-binding + namespace: $NAMESPACE +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: router-secret-reader +subjects: +- kind: ServiceAccount + name: router + namespace: openshift-ingress +--- +EOF + +for i in $(seq 1 $NUM_ROUTES); do + cat <> "$YAML_FILE" +apiVersion: v1 +kind: Secret +metadata: + name: test-secret-$i + namespace: $NAMESPACE +type: kubernetes.io/tls +data: + tls.crt: $CRT_B64 + tls.key: $KEY_B64 +--- +apiVersion: route.openshift.io/v1 +kind: Route +metadata: + name: test-route-$i + namespace: $NAMESPACE +spec: + host: test-route-$i.apps.example.com + tls: + termination: edge + externalCertificate: + name: test-secret-$i + to: + kind: Service + name: dummy-svc +--- +EOF +done + +echo -e "\n=== 5. Applying resources & Measuring Performance ===" +# We start the clock right before applying +START_TIME=$(date +%s) +oc apply -f "$YAML_FILE" >/dev/null +echo "All objects applied. Starting monitor..." + +LAST_COUNT=0 +STALL_TIMER=0 + +while true; do + # Extract the Admitted status from all routes in the namespace. + # We use jsonpath with single quotes to avoid shell expansion of () or ?. + # We count occurrences of "True" or "False". + ALL_STATUSES=$(oc get routes -n "$NAMESPACE" -o jsonpath='{range .items[*]}{.status.ingress[*].conditions[?(@.type=="Admitted")].status}{"\n"}{end}' 2>/dev/null || true) + ADMITTED=$(echo "$ALL_STATUSES" | grep -c "True" || true) + REJECTED=$(echo "$ALL_STATUSES" | grep -c "False" || true) + CURRENT_TIME=$(date +%s) + ELAPSED=$((CURRENT_TIME - START_TIME)) + + echo "[${ELAPSED}s] Routes Admitted: $ADMITTED / $NUM_ROUTES (Rejected: $REJECTED)" + + if [ "$ADMITTED" -eq "$NUM_ROUTES" ]; then + echo -e "\nšŸŽ‰ SUCCESS: All $NUM_ROUTES routes successfully admitted in $ELAPSED seconds!" + break + fi + + if [[ "$REJECTED" -gt 0 && "$((ADMITTED + REJECTED))" -eq "$NUM_ROUTES" ]]; then + echo -e "\nāš ļø WARNING: Processing finished but $REJECTED routes were rejected. Total time: $ELAPSED seconds." + break + fi + + # Check for stall + if [ "$ADMITTED" -eq "$LAST_COUNT" ]; then + STALL_TIMER=$((STALL_TIMER + 5)) + if [ "$STALL_TIMER" -ge "$MAX_STALL" ]; then + echo -e "\nāŒ FAILURE: Route admission stalled at $ADMITTED routes for $MAX_STALL seconds." + echo "Total time elapsed before failure declared: $ELAPSED seconds." + break + fi + else + STALL_TIMER=0 + LAST_COUNT=$ADMITTED + fi + + sleep 5 +done From f8db4ec877e48e6d968eeea57eb703b776f97ce2 Mon Sep 17 00:00:00 2001 From: Brett Tofel Date: Mon, 11 May 2026 19:41:19 -0400 Subject: [PATCH 24/27] test: remove scale-test script from PR --- scale-test.sh | 164 -------------------------------------------------- 1 file changed, 164 deletions(-) delete mode 100755 scale-test.sh diff --git a/scale-test.sh b/scale-test.sh deleted file mode 100755 index 212b55d8a..000000000 --- a/scale-test.sh +++ /dev/null @@ -1,164 +0,0 @@ -#!/bin/bash -set -euo pipefail - -IMAGE="quay.io/btofel/router:patched-bt" -NAMESPACE="router-scale-test" -NUM_ROUTES=1000 -MAX_STALL=60 # Seconds to wait without progress before declaring a stall - -while [[ $# -gt 0 ]]; do - case $1 in - --single) - NUM_ROUTES=1 - shift - ;; - *) - IMAGE="$1" - shift - ;; - esac -done - -echo "=========================================================" -echo " Router Scale Test" -echo " Image: $IMAGE" -echo " Routes: $NUM_ROUTES" -echo "=========================================================" - -echo -e "\n=== 1. Scaling down Operators (CVO & Ingress) ===" -# Stop CVO from overwriting the ingress operator -oc scale --replicas 0 -n openshift-cluster-version deployments/cluster-version-operator 2>/dev/null || true -# Stop Ingress Operator from overwriting our custom router image -oc scale --replicas 0 -n openshift-ingress-operator deployments ingress-operator 2>/dev/null || true - -echo -e "\n=== 2. Patching router deployment ===" -# Inject the custom image into the router deployment -oc -n openshift-ingress set image deployment router-default router="$IMAGE" -# Wait for the rollout to complete -echo "Waiting for router pod to roll out with the new image..." -oc -n openshift-ingress rollout status deployment router-default --timeout=5m -echo -e "\n=== 3. Setting up test namespace: $NAMESPACE ===" -if oc get project "$NAMESPACE" &>/dev/null; then - echo "Deleting old project..." - oc delete project "$NAMESPACE" - # Wait for project to fully terminate - while oc get project "$NAMESPACE" &>/dev/null; do sleep 2; done -fi -oc new-project "$NAMESPACE" >/dev/null - -echo -e "\n=== 4. Generating $NUM_ROUTES secrets and routes ===" -# Generate a valid dummy certificate so the ExtendedValidator doesn't reject it -openssl req -x509 -newkey rsa:2048 -keyout /tmp/tls.key -out /tmp/tls.crt -days 365 -nodes -subj "/CN=test-scale.com" 2>/dev/null -CRT_B64=$(cat /tmp/tls.crt | base64 | tr -d '\n') -KEY_B64=$(cat /tmp/tls.key | base64 | tr -d '\n') - -# Create a dummy service to satisfy the route 'To' target -oc create service clusterip dummy-svc --tcp=8080:8080 -n "$NAMESPACE" >/dev/null - -YAML_FILE="/tmp/scale-test-resources.yaml" -> "$YAML_FILE" - -# Grant the router service account permission to read secrets in this namespace -cat <> "$YAML_FILE" -apiVersion: rbac.authorization.k8s.io/v1 -kind: Role -metadata: - name: router-secret-reader - namespace: $NAMESPACE -rules: -- apiGroups: [""] - resources: ["secrets"] - verbs: ["get", "list", "watch"] ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: RoleBinding -metadata: - name: router-secret-reader-binding - namespace: $NAMESPACE -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: Role - name: router-secret-reader -subjects: -- kind: ServiceAccount - name: router - namespace: openshift-ingress ---- -EOF - -for i in $(seq 1 $NUM_ROUTES); do - cat <> "$YAML_FILE" -apiVersion: v1 -kind: Secret -metadata: - name: test-secret-$i - namespace: $NAMESPACE -type: kubernetes.io/tls -data: - tls.crt: $CRT_B64 - tls.key: $KEY_B64 ---- -apiVersion: route.openshift.io/v1 -kind: Route -metadata: - name: test-route-$i - namespace: $NAMESPACE -spec: - host: test-route-$i.apps.example.com - tls: - termination: edge - externalCertificate: - name: test-secret-$i - to: - kind: Service - name: dummy-svc ---- -EOF -done - -echo -e "\n=== 5. Applying resources & Measuring Performance ===" -# We start the clock right before applying -START_TIME=$(date +%s) -oc apply -f "$YAML_FILE" >/dev/null -echo "All objects applied. Starting monitor..." - -LAST_COUNT=0 -STALL_TIMER=0 - -while true; do - # Extract the Admitted status from all routes in the namespace. - # We use jsonpath with single quotes to avoid shell expansion of () or ?. - # We count occurrences of "True" or "False". - ALL_STATUSES=$(oc get routes -n "$NAMESPACE" -o jsonpath='{range .items[*]}{.status.ingress[*].conditions[?(@.type=="Admitted")].status}{"\n"}{end}' 2>/dev/null || true) - ADMITTED=$(echo "$ALL_STATUSES" | grep -c "True" || true) - REJECTED=$(echo "$ALL_STATUSES" | grep -c "False" || true) - CURRENT_TIME=$(date +%s) - ELAPSED=$((CURRENT_TIME - START_TIME)) - - echo "[${ELAPSED}s] Routes Admitted: $ADMITTED / $NUM_ROUTES (Rejected: $REJECTED)" - - if [ "$ADMITTED" -eq "$NUM_ROUTES" ]; then - echo -e "\nšŸŽ‰ SUCCESS: All $NUM_ROUTES routes successfully admitted in $ELAPSED seconds!" - break - fi - - if [[ "$REJECTED" -gt 0 && "$((ADMITTED + REJECTED))" -eq "$NUM_ROUTES" ]]; then - echo -e "\nāš ļø WARNING: Processing finished but $REJECTED routes were rejected. Total time: $ELAPSED seconds." - break - fi - - # Check for stall - if [ "$ADMITTED" -eq "$LAST_COUNT" ]; then - STALL_TIMER=$((STALL_TIMER + 5)) - if [ "$STALL_TIMER" -ge "$MAX_STALL" ]; then - echo -e "\nāŒ FAILURE: Route admission stalled at $ADMITTED routes for $MAX_STALL seconds." - echo "Total time elapsed before failure declared: $ELAPSED seconds." - break - fi - else - STALL_TIMER=0 - LAST_COUNT=$ADMITTED - fi - - sleep 5 -done From d1c3df2cba64cb1e141088b6347fe1fc582e8023 Mon Sep 17 00:00:00 2001 From: Brett Tofel Date: Tue, 12 May 2026 10:56:12 -0400 Subject: [PATCH 25/27] test: fix data race and timeout in route secret manager tests --- pkg/router/controller/route_secret_manager.go | 1 - pkg/router/controller/route_secret_manager_test.go | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/pkg/router/controller/route_secret_manager.go b/pkg/router/controller/route_secret_manager.go index d0600b7e0..f3413dc27 100644 --- a/pkg/router/controller/route_secret_manager.go +++ b/pkg/router/controller/route_secret_manager.go @@ -317,7 +317,6 @@ func (p *RouteSecretManager) generateSecretHandler(namespace, routeName string) if secretOld.ResourceVersion == secretNew.ResourceVersion { return } - key := generateKey(namespace, routeName) log.V(4).Info("Secret updated for route", "namespace", namespace, "secret", secretNew.Name, "oldSecretVersion", secretOld.ResourceVersion, "newSecretVersion", secretNew.ResourceVersion, "route", routeName) routeapihelpers.InvalidateAsyncSARCache(namespace, secretNew.Name) diff --git a/pkg/router/controller/route_secret_manager_test.go b/pkg/router/controller/route_secret_manager_test.go index 253cbe4ce..9f172ee00 100644 --- a/pkg/router/controller/route_secret_manager_test.go +++ b/pkg/router/controller/route_secret_manager_test.go @@ -1195,7 +1195,6 @@ func TestRouteSecretManager(t *testing.T) { if isAsync && gotErr == nil { select { case <-p.doneCh: - case <-recorder.doneCh: case <-time.After(5 * time.Second): t.Fatal("timed out waiting for async route processing") } @@ -1356,6 +1355,7 @@ func TestSecretUpdate(t *testing.T) { // update the secret updatedSecret := secret.DeepCopy() + updatedSecret.ResourceVersion = "2" // Increment so UpdateFunc processes it updatedSecret.Data = map[string][]byte{ "tls.crt": []byte("my-crt"), "tls.key": []byte("my-key"), From 115bab84c3c8840dbc14c8d56aa35374b5512411 Mon Sep 17 00:00:00 2001 From: Brett Tofel Date: Fri, 15 May 2026 17:39:09 -0400 Subject: [PATCH 26/27] Optimize external certificate secret monitoring using SharedInformer Prior to this change, the router spawned a new, dedicated watch connection for every individual route that specified an externalCertificate. At scale (e.g., 2000 routes), this resulted in 2000 distinct API watches, causing extreme API contention and locking within the secret manager. This commit introduces a SharedSecretManager that utilizes a single namespace-scoped SharedIndexInformer to watch all secrets in the namespace. It dynamically manages the routing of secret events to individual route handlers in-memory. Fixes OCPBUGS-77056 --- pkg/cmd/infra/router/template.go | 3 +- .../controller/shared_secret_manager.go | 216 ++++++++++++++++++ 2 files changed, 217 insertions(+), 2 deletions(-) create mode 100644 pkg/router/controller/shared_secret_manager.go diff --git a/pkg/cmd/infra/router/template.go b/pkg/cmd/infra/router/template.go index a8f6ccaa8..6c47a8895 100644 --- a/pkg/cmd/infra/router/template.go +++ b/pkg/cmd/infra/router/template.go @@ -36,7 +36,6 @@ import ( routelisters "github.com/openshift/client-go/route/listers/route/v1" "github.com/openshift/library-go/pkg/crypto" "github.com/openshift/library-go/pkg/proc" - "github.com/openshift/library-go/pkg/route/secretmanager" "github.com/openshift/router/pkg/router" "github.com/openshift/router/pkg/router/controller" @@ -756,7 +755,7 @@ func (o *TemplateRouterOptions) Run(stopCh <-chan struct{}) error { return err } - secretManager := secretmanager.NewManager(kc, nil) + secretManager := controller.NewSharedSecretManager(kc, nil) pluginCfg := templateplugin.TemplatePluginConfig{ WorkingDir: o.WorkingDir, diff --git a/pkg/router/controller/shared_secret_manager.go b/pkg/router/controller/shared_secret_manager.go new file mode 100644 index 000000000..ae8173374 --- /dev/null +++ b/pkg/router/controller/shared_secret_manager.go @@ -0,0 +1,216 @@ +package controller + +import ( + "context" + "fmt" + "strings" + "sync" + + corev1 "k8s.io/api/core/v1" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/tools/cache" + "k8s.io/client-go/util/workqueue" + "k8s.io/klog/v2" + apierrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/fields" +) + +// SharedSecretManager implements secretmanager.SecretManager using per-namespace SharedIndexInformers. +// This prevents creating a new API watch for every individual route/secret combination. +type SharedSecretManager struct { + kubeClient kubernetes.Interface + queue workqueue.RateLimitingInterface + + lock sync.RWMutex + informers map[string]cache.SharedIndexInformer // namespace -> informer + + // registeredRoutes maps "namespace/routeName" -> referencedSecret + registeredRoutes map[string]referencedSecret +} + +type referencedSecret struct { + secretName string + handler cache.ResourceEventHandlerFuncs +} + +func NewSharedSecretManager(kubeClient kubernetes.Interface, queue workqueue.RateLimitingInterface) *SharedSecretManager { + return &SharedSecretManager{ + kubeClient: kubeClient, + queue: queue, + informers: make(map[string]cache.SharedIndexInformer), + registeredRoutes: make(map[string]referencedSecret), + } +} + +func (m *SharedSecretManager) Queue() workqueue.RateLimitingInterface { + return m.queue +} + +func (m *SharedSecretManager) RegisterRoute(ctx context.Context, namespace string, routeName string, secretName string, handler cache.ResourceEventHandlerFuncs) error { + m.lock.Lock() + key := namespace + "/" + routeName + if ref, exists := m.registeredRoutes[key]; exists { + if ref.secretName == secretName { + // Already registered for the same secret, just update the handler and return success. + // This handles the race condition where ADDED and MODIFIED events fire concurrently. + m.registeredRoutes[key] = referencedSecret{ + secretName: secretName, + handler: handler, + } + m.lock.Unlock() + return nil + } + m.lock.Unlock() + return fmt.Errorf("route already registered with key %s", key) + } + + m.registeredRoutes[key] = referencedSecret{ + secretName: secretName, + handler: handler, + } + + inf, exists := m.informers[namespace] + if !exists { + inf = cache.NewSharedIndexInformer( + cache.NewListWatchFromClient( + m.kubeClient.CoreV1().RESTClient(), + "secrets", + namespace, + fields.Everything(), // Watch all secrets in the namespace + ), + &corev1.Secret{}, + 0, + cache.Indexers{}, + ) + + inf.AddEventHandler(cache.ResourceEventHandlerFuncs{ + AddFunc: func(obj interface{}) { + m.notify(namespace, obj, "Add", nil) + }, + UpdateFunc: func(oldObj, newObj interface{}) { + m.notify(namespace, newObj, "Update", oldObj) + }, + DeleteFunc: func(obj interface{}) { + m.notify(namespace, obj, "Delete", nil) + }, + }) + + m.informers[namespace] = inf + // Start the informer + // We use context.Background() since this informer lives as long as the router, + // or until we implement unregistering namespaces if they become empty (optimization). + go inf.Run(context.Background().Done()) + } + m.lock.Unlock() + + // Wait for cache sync (lock released to allow concurrent registrations) + if !cache.WaitForCacheSync(ctx.Done(), inf.HasSynced) { + // Rollback registration on failure + m.lock.Lock() + delete(m.registeredRoutes, key) + m.lock.Unlock() + return fmt.Errorf("failed waiting for cache sync for namespace %s", namespace) + } + + klog.V(4).Infof("secret manager registered route for key %s with secret %s", key, secretName) + return nil +} + +func (m *SharedSecretManager) notify(namespace string, obj interface{}, eventType string, oldObj interface{}) { + var secret *corev1.Secret + switch t := obj.(type) { + case *corev1.Secret: + secret = t + case cache.DeletedFinalStateUnknown: + secret, _ = t.Obj.(*corev1.Secret) + } + + if secret == nil { + return + } + + // Find all routes in this namespace that reference this secret + var handlers []cache.ResourceEventHandlerFuncs + prefix := namespace + "/" + + m.lock.RLock() + for key, ref := range m.registeredRoutes { + if ref.secretName == secret.Name && strings.HasPrefix(key, prefix) { + handlers = append(handlers, ref.handler) + } + } + m.lock.RUnlock() + + for _, h := range handlers { + switch eventType { + case "Add": + if h.AddFunc != nil { + h.AddFunc(obj) + } + case "Update": + if h.UpdateFunc != nil { + h.UpdateFunc(oldObj, obj) + } + case "Delete": + if h.DeleteFunc != nil { + h.DeleteFunc(obj) + } + } + } +} + +func (m *SharedSecretManager) UnregisterRoute(namespace string, routeName string) error { + m.lock.Lock() + defer m.lock.Unlock() + + key := namespace + "/" + routeName + if _, exists := m.registeredRoutes[key]; !exists { + return fmt.Errorf("no handler registered with key %s", key) + } + + delete(m.registeredRoutes, key) + klog.V(4).Infof("secret manager unregistered route for key %s", key) + return nil +} + +func (m *SharedSecretManager) GetSecret(ctx context.Context, namespace string, routeName string) (*corev1.Secret, error) { + m.lock.RLock() + key := namespace + "/" + routeName + ref, exists := m.registeredRoutes[key] + inf, infExists := m.informers[namespace] + m.lock.RUnlock() + + if !exists { + return nil, fmt.Errorf("no handler registered with key %s", key) + } + + if !infExists { + return nil, fmt.Errorf("no informer for namespace %s", namespace) + } + + // Wait for cache sync + if !cache.WaitForCacheSync(ctx.Done(), inf.HasSynced) { + return nil, fmt.Errorf("failed waiting for cache sync") + } + + obj, exists, err := inf.GetStore().GetByKey(namespace + "/" + ref.secretName) + if err != nil { + return nil, err + } + if !exists { + return nil, apierrors.NewNotFound(corev1.Resource("secrets"), ref.secretName) + } + + return obj.(*corev1.Secret), nil +} + +func (m *SharedSecretManager) LookupRouteSecret(namespace string, routeName string) (string, bool) { + m.lock.RLock() + defer m.lock.RUnlock() + key := namespace + "/" + routeName + ref, exists := m.registeredRoutes[key] + if !exists { + return "", false + } + return ref.secretName, true +} From de2f6a28faf345b479f2e11c346adf8d6f338776 Mon Sep 17 00:00:00 2001 From: Brett Tofel Date: Fri, 15 May 2026 17:54:49 -0400 Subject: [PATCH 27/27] gofmt shared_secret_manager.go --- pkg/router/controller/shared_secret_manager.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/router/controller/shared_secret_manager.go b/pkg/router/controller/shared_secret_manager.go index ae8173374..4d28421e6 100644 --- a/pkg/router/controller/shared_secret_manager.go +++ b/pkg/router/controller/shared_secret_manager.go @@ -7,12 +7,12 @@ import ( "sync" corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/fields" "k8s.io/client-go/kubernetes" "k8s.io/client-go/tools/cache" "k8s.io/client-go/util/workqueue" "k8s.io/klog/v2" - apierrors "k8s.io/apimachinery/pkg/api/errors" - "k8s.io/apimachinery/pkg/fields" ) // SharedSecretManager implements secretmanager.SecretManager using per-namespace SharedIndexInformers.