From ebc3002e74bbefdd4c2733ebdc611b320ca81870 Mon Sep 17 00:00:00 2001 From: Todd Short Date: Mon, 4 May 2026 15:47:52 -0400 Subject: [PATCH] OCPBUGS-62517: Scale to replicas=2 and enable PDB on HighlyAvailable topology Rolling updates in HighlyAvailable clusters leave catalogd and operator-controller unavailable when the only running pod is evicted before its replacement is ready. Fetch the cluster Infrastructure resource at startup and check ControlPlaneTopology. When a HighlyAvailable topology is detected (HighlyAvailable, HighlyAvailableArbiter, or DualReplica), override the Helm values to replicas=2 and podDisruptionBudget.enabled=true before rendering manifests. SingleReplica (SNO) and External topologies keep the static defaults of replicas=1 and PDB disabled. When a topology change is observed at runtime via the infrastructure informer (exceedingly rare), the operator exits so its deployment controller restarts it, triggering a fresh Helm render with the correct values for the new topology. Changes: - helmvalues: add SetIntValue and SetBoolValue helpers - clients: add InfrastructureClient backed by the config informer - controller/builder: add Infrastructure field to Builder - controller/helm: apply HA replica/PDB overrides in renderHelmTemplate; add isHighlyAvailableTopology helper - main: fetch infra at startup, pass to Builder, watch for topology changes and exit to trigger re-render Co-Authored-By: Claude Sonnet 4.6 Signed-off-by: Todd Short --- cmd/cluster-olm-operator/main.go | 32 +++++++++++++++++++++++++- pkg/clients/clients.go | 24 ++++++++++++++++++++ pkg/controller/builder.go | 1 + pkg/controller/helm.go | 39 ++++++++++++++++++++++++++++++++ pkg/helmvalues/helmvalues.go | 16 +++++++++++++ 5 files changed, 111 insertions(+), 1 deletion(-) diff --git a/cmd/cluster-olm-operator/main.go b/cmd/cluster-olm-operator/main.go index 0afee6de1..ce3bf1408 100644 --- a/cmd/cluster-olm-operator/main.go +++ b/cmd/cluster-olm-operator/main.go @@ -28,6 +28,7 @@ import ( "k8s.io/apimachinery/pkg/util/runtime" "k8s.io/apimachinery/pkg/util/sets" "k8s.io/apimachinery/pkg/util/wait" + cache "k8s.io/client-go/tools/cache" "k8s.io/component-base/cli" utilflag "k8s.io/component-base/cli/flag" "k8s.io/klog/v2" @@ -234,6 +235,11 @@ func runOperator(ctx context.Context, cc *controllercmd.ControllerContext) error return fmt.Errorf("unable to retrieve featureSet: %w", err) } + infra, err := cl.ConfigClient.ConfigV1().Infrastructures().Get(ctx, "cluster", metav1.GetOptions{}) + if err != nil { + return fmt.Errorf("unable to retrieve infrastructure: %w", err) + } + clusterCatalogGvk := ocv1.GroupVersion.WithKind("ClusterCatalog") cb := controller.Builder{ Assets: assetPath, @@ -246,7 +252,8 @@ func runOperator(ctx context.Context, cc *controllercmd.ControllerContext) error Scope: meta.RESTScopeRoot, }, }, - FeatureGate: *fg, + FeatureGate: *fg, + Infrastructure: infra, } staticResourceControllers, deploymentControllers, clusterCatalogControllers, relatedObjects, err := cb.BuildControllers("catalogd", "operator-controller") @@ -366,6 +373,29 @@ func runOperator(ctx context.Context, cc *controllercmd.ControllerContext) error operatorLoggingController := loglevel.NewClusterOperatorLoggingController(cl.OperatorClient, cc.EventRecorder.ForComponent("ClusterOLMOperatorLoggingController")) + // Watch for infrastructure topology changes. Topology changes are exceedingly rare + // (e.g., SNO to HA conversion) but require re-rendering the Helm manifests with the + // correct replica count and PDB settings. Exiting causes the deployment controller to + // restart cluster-olm-operator, which re-renders the manifests on startup. + initialTopology := infra.Status.ControlPlaneTopology + checkTopologyChange := func(obj interface{}) { + newInfra, ok := obj.(*configv1.Infrastructure) + if !ok { + return + } + if newInfra.Status.ControlPlaneTopology != initialTopology { + log.Info("Infrastructure topology changed, restarting to re-render manifests", + "old", initialTopology, "new", newInfra.Status.ControlPlaneTopology) + os.Exit(0) + } + } + if _, err := cl.InfrastructureClient.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ + AddFunc: checkTopologyChange, + UpdateFunc: func(_, newObj interface{}) { checkTopologyChange(newObj) }, + }); err != nil { + return fmt.Errorf("failed to add infrastructure event handler: %w", err) + } + cl.StartInformers(ctx) select { diff --git a/pkg/clients/clients.go b/pkg/clients/clients.go index 13f4644f2..a931aaaac 100644 --- a/pkg/clients/clients.go +++ b/pkg/clients/clients.go @@ -64,6 +64,7 @@ type Clients struct { ClusterObjectSetClient *ClusterObjectSetClient ClusterCatalogClient *ClusterCatalogClient ProxyClient *ProxyClient + InfrastructureClient *InfrastructureClient ConfigClient configclient.Interface KubeInformerFactory informers.SharedInformerFactory ConfigInformerFactory configinformer.SharedInformerFactory @@ -128,6 +129,7 @@ func New(cc *controllercmd.ControllerContext) (*Clients, error) { ClusterCatalogClient: NewClusterCatalogClient(dynClient), ClusterObjectSetClient: NewClusterObjectSetClient(dynClient), ProxyClient: NewProxyClient(configInformerFactory), + InfrastructureClient: NewInfrastructureClient(configInformerFactory), ConfigClient: configClient, KubeInformerFactory: informers.NewSharedInformerFactory(kubeClient, defaultResyncPeriod), ConfigInformerFactory: configInformerFactory, @@ -290,6 +292,28 @@ func NewProxyClient(infFact configinformer.SharedInformerFactory) *ProxyClient { } } +type InfrastructureClientInterface interface { + Get(key string) (*configv1.Infrastructure, error) +} + +type InfrastructureClient struct { + informer configinformerv1.InfrastructureInformer +} + +func (ic *InfrastructureClient) Informer() cache.SharedIndexInformer { + return ic.informer.Informer() +} + +func (ic *InfrastructureClient) Get(key string) (*configv1.Infrastructure, error) { + return ic.informer.Lister().Get(key) +} + +func NewInfrastructureClient(infFact configinformer.SharedInformerFactory) *InfrastructureClient { + return &InfrastructureClient{ + informer: infFact.Config().V1().Infrastructures(), + } +} + type OperatorClient struct { clientset operatorclient.Interface informers operatorinformers.SharedInformerFactory diff --git a/pkg/controller/builder.go b/pkg/controller/builder.go index beef9e2be..2a2292297 100644 --- a/pkg/controller/builder.go +++ b/pkg/controller/builder.go @@ -45,6 +45,7 @@ type Builder struct { ControllerContext *controllercmd.ControllerContext KnownRESTMappings map[schema.GroupVersionKind]*meta.RESTMapping FeatureGate configv1.FeatureGate + Infrastructure *configv1.Infrastructure } func (b *Builder) BuildControllers(subDirectories ...string) (map[string]factory.Controller, map[string]factory.Controller, map[string]factory.Controller, []configv1.ObjectReference, error) { diff --git a/pkg/controller/helm.go b/pkg/controller/helm.go index d5b97762b..b26e9f6ef 100644 --- a/pkg/controller/helm.go +++ b/pkg/controller/helm.go @@ -9,6 +9,7 @@ import ( "regexp" "strings" + configv1 "github.com/openshift/api/config/v1" "github.com/openshift/cluster-olm-operator/pkg/helmvalues" yaml3 "gopkg.in/yaml.v3" @@ -80,6 +81,34 @@ func (b *Builder) renderHelmTemplate(helmPath, manifestDir string) error { return fmt.Errorf("error setting OPERATOR_CONTROLLER_IMAGE: %w", err) } + // On HighlyAvailable topologies scale to 2 replicas and enable the PDB so that rolling + // updates never leave zero running pods. On SingleReplica (SNO) / External topologies + // the manifest defaults (replicas=1, PDB disabled) are kept as-is. + if b.Infrastructure != nil && isHighlyAvailableTopology(b.Infrastructure) { + log.Info("HighlyAvailable topology detected, setting replicas=2 and enabling PDB") + haOverrides := []struct { + key string + value interface{} + }{ + {"options.catalogd.deployment.replicas", 2}, + {"options.operatorController.deployment.replicas", 2}, + {"options.catalogd.podDisruptionBudget.enabled", true}, + {"options.operatorController.podDisruptionBudget.enabled", true}, + } + for _, o := range haOverrides { + var err error + switch v := o.value.(type) { + case int: + err = values.SetIntValue(o.key, v) + case bool: + err = values.SetBoolValue(o.key, v) + } + if err != nil { + return fmt.Errorf("error setting %s: %w", o.key, err) + } + } + } + log.Info("Calculated values", "values", values.GetValues()) // Load the helm chart @@ -265,6 +294,16 @@ func sanitizeFilename(name string) string { return reg.ReplaceAllString(name, "-") } +func isHighlyAvailableTopology(infra *configv1.Infrastructure) bool { + switch infra.Status.ControlPlaneTopology { + case configv1.HighlyAvailableTopologyMode, + configv1.HighlyAvailableArbiterMode, + configv1.DualReplicaTopologyMode: + return true + } + return false +} + func writeDocument(filePath, content string) error { file, err := os.Create(filePath) if err != nil { diff --git a/pkg/helmvalues/helmvalues.go b/pkg/helmvalues/helmvalues.go index 88e1bd624..e11a216d5 100644 --- a/pkg/helmvalues/helmvalues.go +++ b/pkg/helmvalues/helmvalues.go @@ -82,6 +82,22 @@ func (v *HelmValues) SetStringValue(location string, newValue string) error { return unstructured.SetNestedField(v.values, newValue, ss...) } +func (v *HelmValues) SetIntValue(location string, newValue int) error { + if location == "" { + return errors.New("location string has no locations") + } + ss := strings.Split(location, ".") + return unstructured.SetNestedField(v.values, int64(newValue), ss...) +} + +func (v *HelmValues) SetBoolValue(location string, newValue bool) error { + if location == "" { + return errors.New("location string has no locations") + } + ss := strings.Split(location, ".") + return unstructured.SetNestedField(v.values, newValue, ss...) +} + func (v *HelmValues) AddListValue(location string, newValue string) error { if location == "" { return errors.New("location string has no locations")