From 51caaeacbedac2232d32738973d9ef1fb093a8e6 Mon Sep 17 00:00:00 2001 From: Rafael Azevedo Date: Mon, 27 Apr 2026 15:21:59 -0400 Subject: [PATCH] fix(hostedcluster): reconcile HCP when pull secret is unavailable When the pull secret is missing or invalid, the HostedCluster reconciler exits before reaching reconcileHostedControlPlane. This prevents RequestServingNodeAdditionalSelector and other spec fields from being propagated to the HCP, breaking HA scheduling for request-serving nodes. Log the error and call reconcileHostedControlPlane before returning so pod placement decisions continue during a pull secret outage. The error is still returned to trigger a requeue for full reconciliation once the pull secret is restored. JIRA: OCPBUGS-77268 --- .../hostedcluster/hostedcluster_controller.go | 25 ++- .../hostedcluster_controller_test.go | 189 ++++++++++++++++++ test/e2e/chaos_test.go | 160 +++++++++++++++ 3 files changed, 373 insertions(+), 1 deletion(-) diff --git a/hypershift-operator/controllers/hostedcluster/hostedcluster_controller.go b/hypershift-operator/controllers/hostedcluster/hostedcluster_controller.go index 403cbda28ec..32e0b499d7e 100644 --- a/hypershift-operator/controllers/hostedcluster/hostedcluster_controller.go +++ b/hypershift-operator/controllers/hostedcluster/hostedcluster_controller.go @@ -655,7 +655,30 @@ func (r *HostedClusterReconciler) reconcile(ctx context.Context, req ctrl.Reques pullSecretBytes, err := hyperutil.GetPullSecretBytes(ctx, r.Client, hcluster) if err != nil { - return ctrl.Result{}, err + log.Error(err, "failed to get pull secret") + // Ensure even with a bad pull secret we process pod placement decisions + // before returning the pull secret error for requeue. + hcp = controlplaneoperator.HostedControlPlane(controlPlaneNamespace.Name, hcluster.Name) + isAutoscalingNeeded, autoscaleErr := r.isAutoscalingNeeded(ctx, hcluster) + if autoscaleErr != nil { + log.Error(autoscaleErr, "failed to determine if autoscaler is needed during pull secret recovery, defaulting to true") + isAutoscalingNeeded = true + } + isAWSNodeTerminationHandlerNeeded, nthErr := r.isAWSNodeTerminationHandlerNeeded(ctx, hcluster) + if nthErr != nil { + log.Error(nthErr, "failed to determine if AWS node termination handler is needed during pull secret recovery, defaulting to true") + isAWSNodeTerminationHandlerNeeded = true + } + _, hcpErr := createOrUpdate(ctx, r.Client, hcp, func() error { + // Skip cert annotation resolution during pull secret recovery — it requires + // the pull secret to resolve the CPO image, which is unavailable here. + return reconcileHostedControlPlane(hcp, hcluster, isAutoscalingNeeded, isAWSNodeTerminationHandlerNeeded, + func() (map[string]string, error) { return nil, nil }) + }) + if hcpErr != nil { + log.Error(hcpErr, "failed to reconcile hostedcontrolplane during pull secret recovery") + } + return ctrl.Result{}, fmt.Errorf("failed to get pull secret: %w", err) } controlPlaneOperatorImage, err := hyperutil.GetControlPlaneOperatorImage(ctx, hcluster, releaseProvider, r.HypershiftOperatorImage, pullSecretBytes) diff --git a/hypershift-operator/controllers/hostedcluster/hostedcluster_controller_test.go b/hypershift-operator/controllers/hostedcluster/hostedcluster_controller_test.go index e86cc223640..3e71c8c9cab 100644 --- a/hypershift-operator/controllers/hostedcluster/hostedcluster_controller_test.go +++ b/hypershift-operator/controllers/hostedcluster/hostedcluster_controller_test.go @@ -256,6 +256,195 @@ func TestHasBeenAvailable(t *testing.T) { } } +func TestReconcileWhenPullSecretMissing(t *testing.T) { + t.Parallel() + g := NewWithT(t) + + hcluster := &hyperv1.HostedCluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-cluster", + Namespace: "any", + Annotations: map[string]string{ + hyperv1.RequestServingNodeAdditionalSelectorAnnotation: `{"node-role.kubernetes.io/worker": ""}`, + }, + }, + Spec: hyperv1.HostedClusterSpec{ + ClusterID: "id", + InfraID: "infra-id", + Networking: hyperv1.ClusterNetworking{ + ClusterNetwork: []hyperv1.ClusterNetworkEntry{{CIDR: *ipnet.MustParseCIDR("172.16.1.0/24")}}, + }, + Services: []hyperv1.ServicePublishingStrategyMapping{ + { + Service: hyperv1.Ignition, + ServicePublishingStrategy: hyperv1.ServicePublishingStrategy{ + Type: hyperv1.LoadBalancer, + }, + }, + }, + PullSecret: corev1.LocalObjectReference{ + Name: "pull-secret", + }, + Release: hyperv1.Release{Image: "quay.io/openshift-release-dev/ocp-release:4.15.0"}, + Etcd: hyperv1.EtcdSpec{ManagementType: hyperv1.Managed}, + Platform: hyperv1.PlatformSpec{ + Type: hyperv1.AWSPlatform, + }, + NodeSelector: map[string]string{ + "node-role.kubernetes.io/worker": "", + }, + }, + } + + hcpNs := hcpmanifests.HostedControlPlaneNamespace(hcluster.Namespace, hcluster.Name) + hcp := controlplaneoperator.HostedControlPlane(hcpNs, hcluster.Name) + + // Create the namespace so createOrUpdate can find it + ns := &corev1.Namespace{ + ObjectMeta: metav1.ObjectMeta{Name: hcpNs}, + } + + // No pull secret in the fake client — this is the scenario we're testing + objects := []crclient.Object{ + hcp, + hcluster, + ns, + } + + client := fake.NewClientBuilder().WithScheme(api.Scheme).WithObjects(objects...).WithStatusSubresource(hcluster).Build() + + mockCtrl := gomock.NewController(t) + defer mockCtrl.Finish() + mockedProvider := releaseinfo.NewMockProviderWithOpenShiftImageRegistryOverrides(mockCtrl) + + r := &HostedClusterReconciler{ + Client: client, + Clock: clocktesting.NewFakeClock(time.Now()), + CertRotationScale: 24 * time.Hour, + createOrUpdate: func(reconcile.Request) upsert.CreateOrUpdateFN { return ctrl.CreateOrUpdate }, + ManagementClusterCapabilities: &fakecapabilities.FakeSupportNoCapabilities{}, + RegistryProvider: fakeReleaseProvider{ + releaseProvider: mockedProvider, + metadataProvider: fakeimagemetadataprovider.FakeRegistryClientImageMetadataProvider{ + Result: &dockerv1client.DockerImageConfig{}, + }, + }, + now: func() metav1.Time { return metav1.NewTime(time.Now()) }, + } + + ctx := t.Context() + _, err := r.Reconcile(ctx, ctrl.Request{NamespacedName: crclient.ObjectKeyFromObject(hcluster)}) + + // When the pull secret is missing, reconciliation should return an error for requeue + g.Expect(err).To(HaveOccurred(), "When pull secret is missing it should return an error") + g.Expect(err.Error()).To(ContainSubstring("failed to get pull secret"), "When pull secret is missing the error should mention pull secret") + + // Verify reconcileHostedControlPlane still ran — the HCP should have the HostedCluster's spec fields + updatedHCP := controlplaneoperator.HostedControlPlane(hcpNs, hcluster.Name) + err = client.Get(ctx, crclient.ObjectKeyFromObject(updatedHCP), updatedHCP) + g.Expect(err).ToNot(HaveOccurred(), "When pull secret is missing it should still be able to get the HCP") + + g.Expect(updatedHCP.Spec.NodeSelector).To(Equal(hcluster.Spec.NodeSelector), + "When pull secret is missing it should still propagate NodeSelector to HCP") + g.Expect(updatedHCP.Annotations).To(HaveKeyWithValue( + hyperv1.RequestServingNodeAdditionalSelectorAnnotation, + hcluster.Annotations[hyperv1.RequestServingNodeAdditionalSelectorAnnotation]), + "When pull secret is missing it should still propagate RequestServingNodeAdditionalSelector to HCP") + g.Expect(updatedHCP.Spec.ReleaseImage).To(Equal(hcluster.Spec.Release.Image), + "When pull secret is missing it should still propagate ReleaseImage to HCP") + + // With no NodePools in the fake client, autoscaling is not needed so the + // autoscaler should be disabled. This validates the defaulting logic in the + // pull secret recovery path. + g.Expect(updatedHCP.Annotations).To(HaveKeyWithValue( + hyperv1.DisableClusterAutoscalerAnnotation, "true"), + "When pull secret is missing and no NodePools exist it should disable the cluster autoscaler") +} + +func TestReconcileWhenPullSecretMissingAndNodePoolListFails(t *testing.T) { + t.Parallel() + g := NewWithT(t) + + hcluster := &hyperv1.HostedCluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-cluster", + Namespace: "any", + }, + Spec: hyperv1.HostedClusterSpec{ + ClusterID: "id", + InfraID: "infra-id", + Networking: hyperv1.ClusterNetworking{ + ClusterNetwork: []hyperv1.ClusterNetworkEntry{{CIDR: *ipnet.MustParseCIDR("172.16.1.0/24")}}, + }, + Services: []hyperv1.ServicePublishingStrategyMapping{ + { + Service: hyperv1.Ignition, + ServicePublishingStrategy: hyperv1.ServicePublishingStrategy{ + Type: hyperv1.LoadBalancer, + }, + }, + }, + PullSecret: corev1.LocalObjectReference{ + Name: "pull-secret", + }, + Release: hyperv1.Release{Image: "quay.io/openshift-release-dev/ocp-release:4.15.0"}, + Etcd: hyperv1.EtcdSpec{ManagementType: hyperv1.Managed}, + Platform: hyperv1.PlatformSpec{ + Type: hyperv1.AWSPlatform, + }, + }, + } + + hcpNs := hcpmanifests.HostedControlPlaneNamespace(hcluster.Namespace, hcluster.Name) + hcp := controlplaneoperator.HostedControlPlane(hcpNs, hcluster.Name) + ns := &corev1.Namespace{ + ObjectMeta: metav1.ObjectMeta{Name: hcpNs}, + } + + fakeClient := fake.NewClientBuilder().WithScheme(api.Scheme).WithObjects(hcp, hcluster, ns).WithStatusSubresource(hcluster). + WithInterceptorFuncs(interceptor.Funcs{ + List: func(ctx context.Context, client crclient.WithWatch, list crclient.ObjectList, opts ...crclient.ListOption) error { + if _, ok := list.(*hyperv1.NodePoolList); ok { + return fmt.Errorf("simulated list error") + } + return client.List(ctx, list, opts...) + }, + }).Build() + + mockCtrl := gomock.NewController(t) + defer mockCtrl.Finish() + mockedProvider := releaseinfo.NewMockProviderWithOpenShiftImageRegistryOverrides(mockCtrl) + + r := &HostedClusterReconciler{ + Client: fakeClient, + Clock: clocktesting.NewFakeClock(time.Now()), + CertRotationScale: 24 * time.Hour, + createOrUpdate: func(reconcile.Request) upsert.CreateOrUpdateFN { return ctrl.CreateOrUpdate }, + ManagementClusterCapabilities: &fakecapabilities.FakeSupportNoCapabilities{}, + RegistryProvider: fakeReleaseProvider{ + releaseProvider: mockedProvider, + metadataProvider: fakeimagemetadataprovider.FakeRegistryClientImageMetadataProvider{ + Result: &dockerv1client.DockerImageConfig{}, + }, + }, + now: func() metav1.Time { return metav1.NewTime(time.Now()) }, + } + + ctx := t.Context() + _, err := r.Reconcile(ctx, ctrl.Request{NamespacedName: crclient.ObjectKeyFromObject(hcluster)}) + + g.Expect(err).To(HaveOccurred(), "When pull secret is missing it should return an error") + + updatedHCP := controlplaneoperator.HostedControlPlane(hcpNs, hcluster.Name) + err = fakeClient.Get(ctx, crclient.ObjectKeyFromObject(updatedHCP), updatedHCP) + g.Expect(err).ToNot(HaveOccurred()) + + // When NodePool list fails, isAutoscalingNeeded defaults to true, so the + // autoscaler should NOT be disabled (annotation should be absent). + g.Expect(updatedHCP.Annotations).NotTo(HaveKey(hyperv1.DisableClusterAutoscalerAnnotation), + "When NodePool list fails during pull secret recovery it should default to keeping the autoscaler enabled") +} + func TestReconcileHostedControlPlaneAdditionalTrustBundle(t *testing.T) { t.Parallel() tests := []struct { diff --git a/test/e2e/chaos_test.go b/test/e2e/chaos_test.go index 34ab14b34b8..578b54df162 100644 --- a/test/e2e/chaos_test.go +++ b/test/e2e/chaos_test.go @@ -19,6 +19,7 @@ import ( batchv1 "k8s.io/api/batch/v1" "k8s.io/apimachinery/pkg/labels" "k8s.io/apimachinery/pkg/util/wait" + "k8s.io/client-go/util/retry" "k8s.io/utils/ptr" hyperv1 "github.com/openshift/hypershift/api/hypershift/v1beta1" @@ -459,6 +460,165 @@ func testSingleMemberRecovery(parentCtx context.Context, client crclient.Client, } } +// TestPullSecretUnavailable validates that the HostedCluster reconciler continues +// to propagate critical spec fields (like NodeSelector and +// RequestServingNodeAdditionalSelector) to the HostedControlPlane even when the +// pull secret is corrupted. This is a regression test for OCPBUGS-77268. +func TestPullSecretUnavailable(t *testing.T) { + t.Parallel() + + ctx, cancel := context.WithCancel(testContext) + defer cancel() + + clusterOpts := globalOpts.DefaultClusterOptions(t) + clusterOpts.ControlPlaneAvailabilityPolicy = string(hyperv1.HighlyAvailable) + clusterOpts.NodePoolReplicas = 0 + + e2eutil.NewHypershiftTest(t, ctx, func(t *testing.T, g Gomega, mgtClient crclient.Client, hostedCluster *hyperv1.HostedCluster) { + controlPlaneNamespace := manifests.HostedControlPlaneNamespace(hostedCluster.Namespace, hostedCluster.Name) + + // Get the current pull secret and save original data for restoration + pullSecret := &corev1.Secret{} + err := mgtClient.Get(ctx, crclient.ObjectKey{ + Namespace: hostedCluster.Namespace, + Name: hostedCluster.Spec.PullSecret.Name, + }, pullSecret) + g.Expect(err).NotTo(HaveOccurred(), "failed to get pull secret") + originalData := pullSecret.Data[".dockerconfigjson"] + g.Expect(originalData).NotTo(BeEmpty(), "pull secret should have .dockerconfigjson data") + + // Save original values of test-owned keys for cleanup + err = mgtClient.Get(ctx, crclient.ObjectKeyFromObject(hostedCluster), hostedCluster) + g.Expect(err).NotTo(HaveOccurred(), "failed to get hostedcluster") + originalSelectorVal, hadSelector := hostedCluster.Spec.NodeSelector["ocpbugs-77268-test"] + originalAnnotationVal, hadAnnotation := hostedCluster.Annotations[hyperv1.RequestServingNodeAdditionalSelectorAnnotation] + + // Ensure pull secret and test-owned keys are restored even if the test fails mid-way + t.Cleanup(func() { + t.Log("Cleanup: restoring pull secret and test-owned HostedCluster keys...") + secret := &corev1.Secret{} + if err := mgtClient.Get(ctx, crclient.ObjectKey{ + Namespace: hostedCluster.Namespace, + Name: hostedCluster.Spec.PullSecret.Name, + }, secret); err == nil { + if secret.Data == nil { + secret.Data = map[string][]byte{} + } + secret.Data[".dockerconfigjson"] = originalData + if err := mgtClient.Update(ctx, secret); err != nil { + t.Logf("WARNING: failed to restore pull secret during cleanup: %v", err) + } + } + hc := &hyperv1.HostedCluster{} + if err := mgtClient.Get(ctx, crclient.ObjectKeyFromObject(hostedCluster), hc); err == nil { + if hadSelector { + if hc.Spec.NodeSelector == nil { + hc.Spec.NodeSelector = map[string]string{} + } + hc.Spec.NodeSelector["ocpbugs-77268-test"] = originalSelectorVal + } else { + delete(hc.Spec.NodeSelector, "ocpbugs-77268-test") + } + if hadAnnotation { + if hc.Annotations == nil { + hc.Annotations = map[string]string{} + } + hc.Annotations[hyperv1.RequestServingNodeAdditionalSelectorAnnotation] = originalAnnotationVal + } else { + delete(hc.Annotations, hyperv1.RequestServingNodeAdditionalSelectorAnnotation) + } + if err := mgtClient.Update(ctx, hc); err != nil { + t.Logf("WARNING: failed to restore hostedcluster keys during cleanup: %v", err) + } + } + }) + + // Wait for the first successful reconcile so the HCP exists with all + // fields populated before we corrupt the pull secret. + e2eutil.EventuallyObject(t, ctx, "HostedCluster to reconcile successfully", + func(ctx context.Context) (*hyperv1.HostedCluster, error) { + hc := &hyperv1.HostedCluster{} + err := mgtClient.Get(ctx, crclient.ObjectKeyFromObject(hostedCluster), hc) + return hc, err + }, + []e2eutil.Predicate[*hyperv1.HostedCluster]{ + func(hc *hyperv1.HostedCluster) (done bool, reasons string, err error) { + for _, c := range hc.Status.Conditions { + if c.Type == string(hyperv1.ReconciliationSucceeded) && c.Status == metav1.ConditionTrue { + return true, "ReconciliationSucceeded=True", nil + } + } + return false, "waiting for ReconciliationSucceeded=True", nil + }, + }, + e2eutil.WithInterval(5*time.Second), + e2eutil.WithTimeout(5*time.Minute), + ) + + // Remove the pull secret key so GetPullSecretBytes fails and the + // recovery reconcileHostedControlPlane path is exercised. + t.Log("Removing pull secret key...") + err = retry.RetryOnConflict(retry.DefaultRetry, func() error { + if err := mgtClient.Get(ctx, crclient.ObjectKey{ + Namespace: hostedCluster.Namespace, + Name: hostedCluster.Spec.PullSecret.Name, + }, pullSecret); err != nil { + return err + } + delete(pullSecret.Data, ".dockerconfigjson") + return mgtClient.Update(ctx, pullSecret) + }) + g.Expect(err).NotTo(HaveOccurred(), "failed to corrupt pull secret") + + // Mutate NodeSelector and RequestServingNodeAdditionalSelector while pull secret is broken. + // If reconcileHostedControlPlane still runs, these changes should propagate to the HCP. + t.Log("Updating HostedCluster NodeSelector and annotations while pull secret is corrupted...") + err = retry.RetryOnConflict(retry.DefaultRetry, func() error { + if err := mgtClient.Get(ctx, crclient.ObjectKeyFromObject(hostedCluster), hostedCluster); err != nil { + return err + } + if hostedCluster.Spec.NodeSelector == nil { + hostedCluster.Spec.NodeSelector = map[string]string{} + } + hostedCluster.Spec.NodeSelector["ocpbugs-77268-test"] = "true" + if hostedCluster.Annotations == nil { + hostedCluster.Annotations = map[string]string{} + } + hostedCluster.Annotations[hyperv1.RequestServingNodeAdditionalSelectorAnnotation] = `{"ocpbugs-77268-test":"true"}` + return mgtClient.Update(ctx, hostedCluster) + }) + g.Expect(err).NotTo(HaveOccurred(), "failed to update hostedcluster") + + // Verify both mutations propagated to the HCP even with a bad pull secret + e2eutil.EventuallyObject(t, ctx, "HCP spec updated despite corrupted pull secret", + func(ctx context.Context) (*hyperv1.HostedControlPlane, error) { + h := &hyperv1.HostedControlPlane{} + err := mgtClient.Get(ctx, crclient.ObjectKey{ + Namespace: controlPlaneNamespace, + Name: hostedCluster.Name, + }, h) + return h, err + }, + []e2eutil.Predicate[*hyperv1.HostedControlPlane]{ + func(h *hyperv1.HostedControlPlane) (done bool, reasons string, err error) { + val, ok := h.Spec.NodeSelector["ocpbugs-77268-test"] + if !ok || val != "true" { + return false, "NodeSelector not yet propagated to HCP", nil + } + ann, ok := h.Annotations[hyperv1.RequestServingNodeAdditionalSelectorAnnotation] + if !ok || ann != `{"ocpbugs-77268-test":"true"}` { + return false, "RequestServingNodeAdditionalSelector not yet propagated to HCP", nil + } + return true, "NodeSelector and RequestServingNodeAdditionalSelector propagated to HCP", nil + }, + }, + e2eutil.WithInterval(5*time.Second), + e2eutil.WithTimeout(2*time.Minute), + ) + + }).Execute(&clusterOpts, hyperv1.NonePlatform, globalOpts.ArtifactDir, "pull-secret-unavailable", globalOpts.ServiceAccountSigningKey) +} + // TODO: Generics :-) func randomPods(pods []corev1.Pod, count int) []corev1.Pod { var selected []corev1.Pod