Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 47 additions & 0 deletions internal/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -614,6 +614,35 @@ type GatewayConfig struct {
//
// +default=5
MaxConcurrentReconciles int `json:"maxConcurrentReconciles,omitempty"`

// CertificateReissuance controls how the gateway controller handles
// failed certificate issuance for custom hostnames. When a Certificate
// is stuck in a failed state, the controller deletes and recreates it
// to bypass cert-manager's exponential backoff. Kubernetes GC cascades
// the deletion through the entire chain (CertificateRequest, Order,
// Challenge, solver resources).
CertificateReissuance CertificateReissuanceConfig `json:"certificateReissuance,omitempty"`
}

// +k8s:deepcopy-gen=true

// CertificateReissuanceConfig controls automatic recovery of failed certificate
// issuance by deleting and recreating stuck Certificates.
//
// Both fields are optional. Callers should read them through GetRetryInterval
// and GetMaxRetries, which substitute the documented defaults when a field is
// unset or not positive.
type CertificateReissuanceConfig struct {
	// RetryInterval is the minimum time to wait after a Certificate failure
	// before deleting it and recreating a fresh one. This prevents excessive
	// requests to the ACME provider when the underlying issue persists (e.g.
	// DNS not pointed to the Gateway).
	//
	// Defaults to 5m via GetRetryInterval(). A zero or negative duration is
	// treated as unset and yields the default.
	RetryInterval metav1.Duration `json:"retryInterval,omitempty"`

	// MaxRetries is the maximum number of times the gateway controller will
	// fast-track re-issuance of a failed Certificate before falling back to
	// cert-manager's built-in exponential backoff.
	//
	// GetMaxRetries() treats a zero or negative value as unset and returns 3,
	// so fast-track re-issuance cannot be disabled by setting this to 0.
	//
	// +default=3
	MaxRetries int `json:"maxRetries,omitempty"`
}

// HasDefaultListenerTLSSecret returns true when a shared TLS certificate
Expand All @@ -631,6 +660,24 @@ func (c *GatewayConfig) ShouldDeleteErroredChallenges() bool {
return *c.DeleteErroredChallenges
}

// GetRetryInterval reports how long to wait after a Certificate failure before
// it may be deleted and recreated. A configured RetryInterval is honoured only
// when it is strictly positive; otherwise the 5 minute default applies.
func (c *CertificateReissuanceConfig) GetRetryInterval() time.Duration {
	const fallback = 5 * time.Minute

	configured := c.RetryInterval.Duration
	if configured <= 0 {
		return fallback
	}
	return configured
}

// GetMaxRetries reports how many fast-track re-issuance attempts are allowed
// for a failed Certificate. A configured MaxRetries is honoured only when it
// is strictly positive; otherwise the default of 3 applies.
func (c *CertificateReissuanceConfig) GetMaxRetries() int {
	const fallback = 3

	if configured := c.MaxRetries; configured >= 1 {
		return configured
	}
	return fallback
}

// GatewayDNSAddress builds the DNS name for a gateway: the gateway's UID with
// every "-" removed, followed by "." and the configured TargetDomain.
func (c *GatewayConfig) GatewayDNSAddress(gateway *gatewayv1.Gateway) string {
	compactUID := strings.ReplaceAll(string(gateway.UID), "-", "")
	return fmt.Sprintf("%s.%s", compactUID, c.TargetDomain)
}
Expand Down
17 changes: 17 additions & 0 deletions internal/config/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 3 additions & 0 deletions internal/config/zz_generated.defaults.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

138 changes: 138 additions & 0 deletions internal/controller/gateway_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import (
"context"
"fmt"
"slices"
"strconv"
"strings"
"time"

Expand Down Expand Up @@ -52,6 +53,7 @@ import (
const (
	// Finalizer names this controller uses on Gateway resources.
	gatewayControllerFinalizer   = "gateway.networking.datumapis.com/gateway-controller"
	gatewayControllerGCFinalizer = "gateway.networking.datumapis.com/gateway-controller-gc"

	// certificateIssuerTLSOption is the listener TLS option key naming the
	// certificate issuer.
	certificateIssuerTLSOption = "gateway.networking.datumapis.com/certificate-issuer"

	// annotationReissuanceCount is the annotation key (and key prefix) under
	// which the number of fast-track Certificate re-issuance attempts is
	// recorded.
	annotationReissuanceCount = "networking.datumapis.com/reissuance-count"

	// Kind names of the resources this controller refers to.
	KindGateway   = "Gateway"
	KindHTTPRoute = "HTTPRoute"
	KindService   = "Service"
)
Expand Down Expand Up @@ -474,6 +476,7 @@ func (r *GatewayReconciler) ensureListenerCertificates(
wildcardSuffix := "." + r.Config.Gateway.TargetDomain

desiredCerts := make(map[string]bool)
gatewayNeedsUpdate := false

// Only create Certificates for listeners with custom hostnames outside the
// wildcard scope. Wildcard-covered listeners use the shared TLS secret and
Expand Down Expand Up @@ -550,6 +553,13 @@ func (r *GatewayReconciler) ensureListenerCertificates(
var opResult string
var err error
if isNew {
reissuanceCount := getReissuanceCount(downstreamGateway, certName)
if reissuanceCount > 0 {
if cert.Annotations == nil {
cert.Annotations = make(map[string]string)
}
cert.Annotations[annotationReissuanceCount] = fmt.Sprintf("%d", reissuanceCount)
}
cert.Spec = desiredSpec
err = downstreamClient.Create(ctx, cert)
opResult = "created"
Expand All @@ -565,6 +575,27 @@ func (r *GatewayReconciler) ensureListenerCertificates(
if opResult != "" {
logger.Info("Certificate reconciled", "certificate", certName, "operation", opResult)
}

// Check if an existing Certificate is stuck in a failed state and
// should be deleted so we can recreate it fresh on the next reconcile.
if !isNew {
requeueAfter, gwChanged := r.reissueFailedCertificate(ctx, cert, certName, downstreamGateway, downstreamClient)
if gwChanged {
gatewayNeedsUpdate = true
}
if requeueAfter > 0 && (result.RequeueAfter == 0 || requeueAfter < result.RequeueAfter) {
result.RequeueAfter = requeueAfter
}
}
}

// Persist reissuance tracking annotations on the downstream Gateway if
// any Certificates were deleted for re-issuance.
if gatewayNeedsUpdate {
if err := downstreamClient.Update(ctx, downstreamGateway); err != nil {
result.Err = fmt.Errorf("failed to update downstream gateway reissuance annotations: %w", err)
return result
}
}

// Clean up Certificate resources for listeners that no longer need them.
Expand Down Expand Up @@ -605,6 +636,113 @@ func (r *GatewayReconciler) ensureListenerCertificates(
return result
}

// reissueFailedCertificate checks whether a Certificate is stuck in a failed
// state and should be deleted so the gateway controller recreates it fresh on
// the next reconcile (bypassing cert-manager's exponential backoff).
//
// The reissuance count is tracked as an annotation on the downstream Gateway
// (keyed by certificate name) so it survives Certificate deletion. This
// function only mutates the in-memory Gateway; the caller persists it.
//
// Decision order:
//  1. No recorded failure: the count is cleared, but only once the Certificate
//     reports Ready=True. A Certificate that was just recreated and is still
//     issuing also has no LastFailureTime; clearing the count for it would
//     reset the budget between attempts and MaxRetries would never be reached.
//  2. Count has reached MaxRetries: do nothing and let cert-manager back off.
//  3. Failure is more recent than RetryInterval: requeue for the remainder.
//  4. Otherwise delete the Certificate and, only if the delete took effect
//     (or the Certificate was already gone), increment the count.
//
// Returns:
//   - requeueAfter: non-zero if the caller should requeue after this duration
//   - gatewayChanged: true if the downstream gateway annotations were modified
//     and the caller must persist the change
func (r *GatewayReconciler) reissueFailedCertificate(
	ctx context.Context,
	cert *cmv1.Certificate,
	certName string,
	downstreamGateway *gatewayv1.Gateway,
	downstreamClient client.Client,
) (requeueAfter time.Duration, gatewayChanged bool) {
	logger := log.FromContext(ctx)
	reissuanceCfg := &r.Config.Gateway.CertificateReissuance

	if cert.Status.LastFailureTime == nil {
		// Issuance still in progress (or never attempted): keep the count so
		// the retry budget spans consecutive failed attempts.
		if !reissuanceCertificateIsReady(cert) {
			return 0, false
		}
		if clearReissuanceCount(downstreamGateway, certName) {
			logger.V(1).Info("cleared reissuance count for healthy Certificate", "certificate", certName)
			gatewayChanged = true
		}
		return 0, gatewayChanged
	}

	retryCount := getReissuanceCount(downstreamGateway, certName)
	maxRetries := reissuanceCfg.GetMaxRetries()
	if retryCount >= maxRetries {
		logger.V(1).Info("Certificate has exhausted fast-track re-issuance attempts, deferring to cert-manager backoff",
			"certificate", certName,
			"reissuanceCount", retryCount,
			"maxRetries", maxRetries,
		)
		return 0, false
	}

	retryInterval := reissuanceCfg.GetRetryInterval()
	sinceFailure := time.Since(cert.Status.LastFailureTime.Time)
	if sinceFailure < retryInterval {
		remaining := retryInterval - sinceFailure
		logger.V(1).Info("Certificate failed but retry interval has not elapsed",
			"certificate", certName,
			"sinceFailure", sinceFailure,
			"retryInterval", retryInterval,
			"requeueAfter", remaining,
		)
		return remaining, false
	}

	nextCount := retryCount + 1
	logger.Info("deleting failed Certificate for re-issuance",
		"certificate", certName,
		"lastFailureTime", cert.Status.LastFailureTime.Time,
		"reissuanceCount", nextCount,
		"maxRetries", maxRetries,
	)

	if err := downstreamClient.Delete(ctx, cert); err != nil && !apierrors.IsNotFound(err) {
		// The Certificate is still there, so no re-issuance happened: do not
		// consume a retry. Requeue so the deletion is attempted again.
		logger.Error(err, "failed to delete Certificate for re-issuance", "certificate", certName)
		return retryInterval, false
	}

	// Deleted (or already gone): the next reconcile recreates the Certificate,
	// so this counts as one fast-track attempt.
	setReissuanceCount(downstreamGateway, certName, nextCount)
	return 0, true
}

// reissuanceCertificateIsReady reports whether the Certificate carries a Ready
// condition whose status is "True".
func reissuanceCertificateIsReady(cert *cmv1.Certificate) bool {
	for i := range cert.Status.Conditions {
		cond := &cert.Status.Conditions[i]
		if cond.Type == cmv1.CertificateConditionReady {
			// "True" is the string value of cert-manager's ConditionTrue.
			return cond.Status == "True"
		}
	}
	return false
}

// reissuanceAnnotationKey returns the gateway annotation key used to track
// reissuance count for a given certificate name.
//
// Kubernetes annotation keys have the form "[prefix/]name": at most one "/",
// and a name segment of at most 63 characters. annotationReissuanceCount
// already contains the prefix separator, so the certificate name is appended
// with "." rather than a second "/", which API validation would reject.
//
// When the resulting name segment would exceed 63 characters, the certificate
// name is truncated and a hex FNV-1a hash of the full name is appended, so
// distinct long names still map to distinct, valid keys.
func reissuanceAnnotationKey(certName string) string {
	const maxNameSegmentLen = 63

	base := annotationReissuanceCount + "."
	// Portion of the name segment (everything after the "/") already used by
	// the fixed part of the key. LastIndex returns -1 when there is no "/",
	// in which case the whole base counts.
	used := len(base) - (strings.LastIndex(base, "/") + 1)
	if used+len(certName) <= maxNameSegmentLen {
		return base + certName
	}

	// 64-bit FNV-1a, computed inline to avoid adding an import.
	hash := uint64(14695981039346656037)
	for i := 0; i < len(certName); i++ {
		hash ^= uint64(certName[i])
		hash *= 1099511628211
	}
	suffix := "-" + strconv.FormatUint(hash, 16)

	keep := maxNameSegmentLen - used - len(suffix)
	if keep < 0 {
		keep = 0
	}
	return base + certName[:keep] + suffix
}

// getReissuanceCount reads the reissuance counter recorded on the gateway for
// certName. It returns 0 when the gateway has no annotations, the annotation
// is absent, or its value does not parse as an integer.
func getReissuanceCount(gw *gatewayv1.Gateway, certName string) int {
	// Indexing a nil map yields "", which Atoi rejects, so a gateway without
	// annotations falls through to the zero result.
	raw := gw.Annotations[reissuanceAnnotationKey(certName)]
	if parsed, err := strconv.Atoi(raw); err == nil {
		return parsed
	}
	return 0
}

// setReissuanceCount records count, formatted as a decimal string, in the
// gateway annotation for certName, creating the annotation map if needed.
// Only the in-memory object is modified.
func setReissuanceCount(gw *gatewayv1.Gateway, certName string, count int) {
	key, value := reissuanceAnnotationKey(certName), strconv.Itoa(count)
	if gw.Annotations == nil {
		gw.Annotations = map[string]string{key: value}
		return
	}
	gw.Annotations[key] = value
}

// clearReissuanceCount drops the reissuance tracking annotation for certName
// from the in-memory gateway. It reports whether the annotation existed, i.e.
// whether the gateway object was modified.
func clearReissuanceCount(gw *gatewayv1.Gateway, certName string) bool {
	key := reissuanceAnnotationKey(certName)
	_, present := gw.Annotations[key]
	if present {
		delete(gw.Annotations, key)
	}
	return present
}

func (r *GatewayReconciler) reconcileGatewayStatus(
upstreamClient client.Client,
upstreamGateway *gatewayv1.Gateway,
Expand Down
Loading
Loading