diff --git a/internal/config/config.go b/internal/config/config.go index 42a2885..a5ca613 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -614,6 +614,35 @@ type GatewayConfig struct { // // +default=5 MaxConcurrentReconciles int `json:"maxConcurrentReconciles,omitempty"` + + // CertificateReissuance controls how the gateway controller handles + // failed certificate issuance for custom hostnames. When a Certificate + // is stuck in a failed state, the controller deletes and recreates it + // to bypass cert-manager's exponential backoff. Kubernetes GC cascades + // the deletion through the entire chain (CertificateRequest, Order, + // Challenge, solver resources). + CertificateReissuance CertificateReissuanceConfig `json:"certificateReissuance,omitempty"` +} + +// +k8s:deepcopy-gen=true + +// CertificateReissuanceConfig controls automatic recovery of failed certificate +// issuance by deleting and recreating stuck Certificates. +type CertificateReissuanceConfig struct { + // RetryInterval is the minimum time to wait after a Certificate failure + // before deleting it and recreating a fresh one. This prevents excessive + // requests to the ACME provider when the underlying issue persists (e.g. + // DNS not pointed to the Gateway). + // + // Defaults to 5m via GetRetryInterval(). + RetryInterval metav1.Duration `json:"retryInterval,omitempty"` + + // MaxRetries is the maximum number of times the gateway controller will + // fast-track re-issuance of a failed Certificate before falling back to + // cert-manager's built-in exponential backoff. + // + // +default=3 + MaxRetries int `json:"maxRetries,omitempty"` } // HasDefaultListenerTLSSecret returns true when a shared TLS certificate @@ -631,6 +660,24 @@ func (c *GatewayConfig) ShouldDeleteErroredChallenges() bool { return *c.DeleteErroredChallenges } +// GetRetryInterval returns the configured retry interval for certificate +// re-issuance, defaulting to 5 minutes. +func (c *CertificateReissuanceConfig) GetRetryInterval() time.Duration { + if c.RetryInterval.Duration > 0 { + return c.RetryInterval.Duration + } + return 5 * time.Minute +} + +// GetMaxRetries returns the configured maximum number of fast-track +// re-issuance attempts, defaulting to 3. +func (c *CertificateReissuanceConfig) GetMaxRetries() int { + if c.MaxRetries > 0 { + return c.MaxRetries + } + return 3 +} + func (c *GatewayConfig) GatewayDNSAddress(gateway *gatewayv1.Gateway) string { return fmt.Sprintf("%s.%s", strings.ReplaceAll(string(gateway.UID), "-", ""), c.TargetDomain) } diff --git a/internal/config/zz_generated.deepcopy.go b/internal/config/zz_generated.deepcopy.go index 1cdd6ec..5085f31 100644 --- a/internal/config/zz_generated.deepcopy.go +++ b/internal/config/zz_generated.deepcopy.go @@ -30,6 +30,22 @@ func (in *BackendTrafficPolicyValidationOptions) DeepCopy() *BackendTrafficPolic return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *CertificateReissuanceConfig) DeepCopyInto(out *CertificateReissuanceConfig) { + *out = *in + out.RetryInterval = in.RetryInterval +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new CertificateReissuanceConfig. +func (in *CertificateReissuanceConfig) DeepCopy() *CertificateReissuanceConfig { + if in == nil { + return nil + } + out := new(CertificateReissuanceConfig) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *ClientConnectionConfig) DeepCopyInto(out *ClientConnectionConfig) { *out = *in @@ -339,6 +355,7 @@ func (in *GatewayConfig) DeepCopyInto(out *GatewayConfig) { *out = new(bool) **out = **in } + out.CertificateReissuance = in.CertificateReissuance } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GatewayConfig. diff --git a/internal/config/zz_generated.defaults.go b/internal/config/zz_generated.defaults.go index 65817ce..000f178 100644 --- a/internal/config/zz_generated.defaults.go +++ b/internal/config/zz_generated.defaults.go @@ -224,6 +224,9 @@ func SetObjectDefaults_NetworkServicesOperator(in *NetworkServicesOperator) { if in.Gateway.MaxConcurrentReconciles == 0 { in.Gateway.MaxConcurrentReconciles = 5 } + if in.Gateway.CertificateReissuance.MaxRetries == 0 { + in.Gateway.CertificateReissuance.MaxRetries = 3 + } if in.HTTPProxy.GatewayClassName == "" { in.HTTPProxy.GatewayClassName = "datum-external-global-proxy" } diff --git a/internal/controller/gateway_controller.go b/internal/controller/gateway_controller.go index abff37a..f39694a 100644 --- a/internal/controller/gateway_controller.go +++ b/internal/controller/gateway_controller.go @@ -6,6 +6,7 @@ import ( "context" "fmt" "slices" + "strconv" "strings" "time" @@ -52,6 +53,7 @@ import ( const gatewayControllerFinalizer = "gateway.networking.datumapis.com/gateway-controller" const gatewayControllerGCFinalizer = "gateway.networking.datumapis.com/gateway-controller-gc" const certificateIssuerTLSOption = "gateway.networking.datumapis.com/certificate-issuer" +const annotationReissuanceCount = "networking.datumapis.com/reissuance-count" const KindGateway = "Gateway" const KindHTTPRoute = "HTTPRoute" const KindService = "Service" @@ -474,6 +476,7 @@ func (r *GatewayReconciler) ensureListenerCertificates( wildcardSuffix := "." + r.Config.Gateway.TargetDomain desiredCerts := make(map[string]bool) + gatewayNeedsUpdate := false // Only create Certificates for listeners with custom hostnames outside the // wildcard scope. Wildcard-covered listeners use the shared TLS secret and @@ -550,6 +553,13 @@ func (r *GatewayReconciler) ensureListenerCertificates( var opResult string var err error if isNew { + reissuanceCount := getReissuanceCount(downstreamGateway, certName) + if reissuanceCount > 0 { + if cert.Annotations == nil { + cert.Annotations = make(map[string]string) + } + cert.Annotations[annotationReissuanceCount] = fmt.Sprintf("%d", reissuanceCount) + } cert.Spec = desiredSpec err = downstreamClient.Create(ctx, cert) opResult = "created" @@ -565,6 +575,27 @@ func (r *GatewayReconciler) ensureListenerCertificates( if opResult != "" { logger.Info("Certificate reconciled", "certificate", certName, "operation", opResult) } + + // Check if an existing Certificate is stuck in a failed state and + // should be deleted so we can recreate it fresh on the next reconcile. + if !isNew { + requeueAfter, gwChanged := r.reissueFailedCertificate(ctx, cert, certName, downstreamGateway, downstreamClient) + if gwChanged { + gatewayNeedsUpdate = true + } + if requeueAfter > 0 && (result.RequeueAfter == 0 || requeueAfter < result.RequeueAfter) { + result.RequeueAfter = requeueAfter + } + } + } + + // Persist reissuance tracking annotations on the downstream Gateway if + // any Certificates were deleted for re-issuance. + if gatewayNeedsUpdate { + if err := downstreamClient.Update(ctx, downstreamGateway); err != nil { + result.Err = fmt.Errorf("failed to update downstream gateway reissuance annotations: %w", err) + return result + } } // Clean up Certificate resources for listeners that no longer need them. @@ -605,6 +636,113 @@ func (r *GatewayReconciler) ensureListenerCertificates( return result } +// reissueFailedCertificate checks whether a Certificate is stuck in a failed +// state and should be deleted so the gateway controller recreates it fresh on +// the next reconcile (bypassing cert-manager's exponential backoff). +// +// The reissuance count is tracked as an annotation on the downstream Gateway +// (keyed by certificate name) so it survives Certificate deletion. +// +// Returns: +// - requeueAfter: non-zero if the caller should requeue after this duration +// - gatewayChanged: true if the downstream gateway annotations were modified +// and the caller must persist the change +func (r *GatewayReconciler) reissueFailedCertificate( + ctx context.Context, + cert *cmv1.Certificate, + certName string, + downstreamGateway *gatewayv1.Gateway, + downstreamClient client.Client, +) (requeueAfter time.Duration, gatewayChanged bool) { + logger := log.FromContext(ctx) + reissuanceCfg := &r.Config.Gateway.CertificateReissuance + + if cert.Status.LastFailureTime == nil { + if clearReissuanceCount(downstreamGateway, certName) { + logger.V(1).Info("cleared reissuance count for healthy Certificate", "certificate", certName) + gatewayChanged = true + } + return 0, gatewayChanged + } + + retryCount := getReissuanceCount(downstreamGateway, certName) + maxRetries := reissuanceCfg.GetMaxRetries() + if retryCount >= maxRetries { + logger.V(1).Info("Certificate has exhausted fast-track re-issuance attempts, deferring to cert-manager backoff", + "certificate", certName, + "reissuanceCount", retryCount, + "maxRetries", maxRetries, + ) + return 0, false + } + + retryInterval := reissuanceCfg.GetRetryInterval() + sinceFailure := time.Since(cert.Status.LastFailureTime.Time) + if sinceFailure < retryInterval { + remaining := retryInterval - sinceFailure + logger.V(1).Info("Certificate failed but retry interval has not elapsed", + "certificate", certName, + "sinceFailure", sinceFailure, + "retryInterval", retryInterval, + "requeueAfter", remaining, + ) + return remaining, false + } + + setReissuanceCount(downstreamGateway, certName, retryCount+1) + + logger.Info("deleting failed Certificate for re-issuance", + "certificate", certName, + "lastFailureTime", cert.Status.LastFailureTime.Time, + "reissuanceCount", retryCount+1, + "maxRetries", maxRetries, + ) + + if err := downstreamClient.Delete(ctx, cert); err != nil { + if !apierrors.IsNotFound(err) { + logger.Error(err, "failed to delete Certificate for re-issuance", "certificate", certName) + } + return 0, true + } + + return 0, true +} + +// reissuanceAnnotationKey returns the gateway annotation key used to track +// reissuance count for a given certificate name. +func reissuanceAnnotationKey(certName string) string { + return annotationReissuanceCount + "/" + certName +} + +func getReissuanceCount(gw *gatewayv1.Gateway, certName string) int { + if gw.Annotations == nil { + return 0 + } + count, err := strconv.Atoi(gw.Annotations[reissuanceAnnotationKey(certName)]) + if err != nil { + return 0 + } + return count +} + +func setReissuanceCount(gw *gatewayv1.Gateway, certName string, count int) { + if gw.Annotations == nil { + gw.Annotations = make(map[string]string) + } + gw.Annotations[reissuanceAnnotationKey(certName)] = fmt.Sprintf("%d", count) +} + +// clearReissuanceCount removes the reissuance tracking annotation for a +// certificate that has recovered. Returns true if the annotation was present. +func clearReissuanceCount(gw *gatewayv1.Gateway, certName string) bool { + key := reissuanceAnnotationKey(certName) + if _, ok := gw.Annotations[key]; ok { + delete(gw.Annotations, key) + return true + } + return false +} + func (r *GatewayReconciler) reconcileGatewayStatus( upstreamClient client.Client, upstreamGateway *gatewayv1.Gateway, diff --git a/internal/controller/gateway_controller_test.go b/internal/controller/gateway_controller_test.go index 66b8a5a..c30e6b5 100644 --- a/internal/controller/gateway_controller_test.go +++ b/internal/controller/gateway_controller_test.go @@ -5,12 +5,14 @@ import ( "fmt" "slices" "testing" + "time" cmv1 "github.com/cert-manager/cert-manager/pkg/apis/certmanager/v1" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" corev1 "k8s.io/api/core/v1" discoveryv1 "k8s.io/api/discovery/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" apimeta "k8s.io/apimachinery/pkg/api/meta" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" @@ -1407,3 +1409,229 @@ func TestServiceSpecClobberingCausesReconciliationStorm(t *testing.T) { "idempotent: second call also returns 'unchanged'") }) } + +func TestReissueFailedCertificate(t *testing.T) { + testScheme := runtime.NewScheme() + require.NoError(t, scheme.AddToScheme(testScheme)) + require.NoError(t, gatewayv1.Install(testScheme)) + require.NoError(t, cmv1.AddToScheme(testScheme)) + + logger := zap.New(zap.UseFlagOptions(&zap.Options{Development: true})) + + newCert := func(name string, opts ...func(*cmv1.Certificate)) *cmv1.Certificate { + cert := &cmv1.Certificate{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: "ns-test", + UID: uuid.NewUUID(), + CreationTimestamp: metav1.Now(), + }, + } + for _, o := range opts { + o(cert) + } + return cert + } + + newGW := func(annotations map[string]string) *gatewayv1.Gateway { + return &gatewayv1.Gateway{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-gw", + Namespace: "ns-test", + UID: uuid.NewUUID(), + Annotations: annotations, + }, + } + } + + failedAt := func(ago time.Duration) *metav1.Time { + t := metav1.NewTime(time.Now().Add(-ago)) + return &t + } + + tests := []struct { + name string + cert *cmv1.Certificate + gateway *gatewayv1.Gateway + retryInterval time.Duration + maxRetries int + expectDeleted bool + expectRequeue bool + expectGWChanged bool + expectGWCount int + }{ + { + name: "healthy certificate — no action", + cert: newCert("my-cert"), + gateway: newGW(nil), + expectDeleted: false, + expectRequeue: false, + }, + { + name: "failed cert within retry interval — requeue", + cert: newCert("my-cert", func(c *cmv1.Certificate) { + c.Status.LastFailureTime = failedAt(2 * time.Minute) + }), + gateway: newGW(nil), + retryInterval: 5 * time.Minute, + expectDeleted: false, + expectRequeue: true, + }, + { + name: "failed cert past retry interval — delete and reissue", + cert: newCert("my-cert", func(c *cmv1.Certificate) { + c.Status.LastFailureTime = failedAt(10 * time.Minute) + }), + gateway: newGW(nil), + retryInterval: 5 * time.Minute, + maxRetries: 3, + expectDeleted: true, + expectGWChanged: true, + expectGWCount: 1, + }, + { + name: "second re-issuance increments count", + cert: newCert("my-cert", func(c *cmv1.Certificate) { + c.Status.LastFailureTime = failedAt(10 * time.Minute) + }), + gateway: newGW(map[string]string{ + reissuanceAnnotationKey("my-cert"): "1", + }), + retryInterval: 5 * time.Minute, + maxRetries: 3, + expectDeleted: true, + expectGWChanged: true, + expectGWCount: 2, + }, + { + name: "max retries exhausted — no action", + cert: newCert("my-cert", func(c *cmv1.Certificate) { + c.Status.LastFailureTime = failedAt(10 * time.Minute) + }), + gateway: newGW(map[string]string{ + reissuanceAnnotationKey("my-cert"): "3", + }), + retryInterval: 5 * time.Minute, + maxRetries: 3, + expectDeleted: false, + expectRequeue: false, + expectGWCount: 3, + }, + { + name: "previously failed cert now healthy — clears count", + cert: newCert("my-cert"), + gateway: newGW(map[string]string{ + reissuanceAnnotationKey("my-cert"): "2", + }), + expectDeleted: false, + expectGWChanged: true, + expectGWCount: 0, + }, + { + name: "healthy cert with no history — no gateway mutation", + cert: newCert("my-cert"), + gateway: newGW(map[string]string{ + "some-other-annotation": "keep-me", + }), + expectDeleted: false, + expectGWChanged: false, + expectGWCount: 0, + }, + { + name: "default config values — uses 5m interval and 3 max retries", + cert: newCert("my-cert", func(c *cmv1.Certificate) { + c.Status.LastFailureTime = failedAt(6 * time.Minute) + }), + gateway: newGW(nil), + expectDeleted: true, + expectGWChanged: true, + expectGWCount: 1, + }, + { + name: "default config — failure at 4m does not trigger delete", + cert: newCert("my-cert", func(c *cmv1.Certificate) { + c.Status.LastFailureTime = failedAt(4 * time.Minute) + }), + gateway: newGW(nil), + expectDeleted: false, + expectRequeue: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + ctx := log.IntoContext(context.Background(), logger) + + fakeClient := fake.NewClientBuilder(). + WithScheme(testScheme). + WithObjects(tt.cert). + Build() + + cfg := config.CertificateReissuanceConfig{} + if tt.retryInterval > 0 { + cfg.RetryInterval = metav1.Duration{Duration: tt.retryInterval} + } + if tt.maxRetries > 0 { + cfg.MaxRetries = tt.maxRetries + } + + reconciler := &GatewayReconciler{ + Config: config.NetworkServicesOperator{ + Gateway: config.GatewayConfig{ + CertificateReissuance: cfg, + }, + }, + } + + requeueAfter, gwChanged := reconciler.reissueFailedCertificate( + ctx, tt.cert, tt.cert.Name, tt.gateway, fakeClient, + ) + + assert.Equal(t, tt.expectGWChanged, gwChanged, "gatewayChanged") + + if tt.expectRequeue { + assert.Greater(t, requeueAfter, time.Duration(0), "should requeue") + } else { + assert.Equal(t, time.Duration(0), requeueAfter, "should not requeue") + } + + if tt.expectDeleted { + err := fakeClient.Get(ctx, client.ObjectKeyFromObject(tt.cert), &cmv1.Certificate{}) + assert.True(t, apierrors.IsNotFound(err), "Certificate should be deleted") + } else if tt.cert.Status.LastFailureTime != nil || tt.expectGWCount == 0 { + err := fakeClient.Get(ctx, client.ObjectKeyFromObject(tt.cert), &cmv1.Certificate{}) + assert.NoError(t, err, "Certificate should still exist") + } + + gotCount := getReissuanceCount(tt.gateway, tt.cert.Name) + assert.Equal(t, tt.expectGWCount, gotCount, "reissuance count on gateway") + }) + } + + // Verify that calling twice on the same healthy cert never mutates the + // gateway — prevents over-reconciliation from spurious gateway updates. + t.Run("idempotent on healthy cert", func(t *testing.T) { + ctx := log.IntoContext(context.Background(), logger) + cert := newCert("stable-cert") + gw := newGW(nil) + + fakeClient := fake.NewClientBuilder(). + WithScheme(testScheme). + WithObjects(cert). + Build() + + reconciler := &GatewayReconciler{ + Config: config.NetworkServicesOperator{ + Gateway: config.GatewayConfig{}, + }, + } + + for i := 0; i < 3; i++ { + requeueAfter, gwChanged := reconciler.reissueFailedCertificate( + ctx, cert, cert.Name, gw, fakeClient, + ) + assert.False(t, gwChanged, "call %d: should not mutate gateway", i) + assert.Equal(t, time.Duration(0), requeueAfter, "call %d: should not requeue", i) + } + }) +}