diff --git a/hypershift-operator/controllers/hostedcluster/hostedcluster_controller.go b/hypershift-operator/controllers/hostedcluster/hostedcluster_controller.go index 403cbda28ec..32e0b499d7e 100644 --- a/hypershift-operator/controllers/hostedcluster/hostedcluster_controller.go +++ b/hypershift-operator/controllers/hostedcluster/hostedcluster_controller.go @@ -655,7 +655,30 @@ func (r *HostedClusterReconciler) reconcile(ctx context.Context, req ctrl.Reques pullSecretBytes, err := hyperutil.GetPullSecretBytes(ctx, r.Client, hcluster) if err != nil { - return ctrl.Result{}, err + log.Error(err, "failed to get pull secret") + // Ensure even with a bad pull secret we process pod placement decisions + // before returning the pull secret error for requeue. + hcp = controlplaneoperator.HostedControlPlane(controlPlaneNamespace.Name, hcluster.Name) + isAutoscalingNeeded, autoscaleErr := r.isAutoscalingNeeded(ctx, hcluster) + if autoscaleErr != nil { + log.Error(autoscaleErr, "failed to determine if autoscaler is needed during pull secret recovery, defaulting to true") + isAutoscalingNeeded = true + } + isAWSNodeTerminationHandlerNeeded, nthErr := r.isAWSNodeTerminationHandlerNeeded(ctx, hcluster) + if nthErr != nil { + log.Error(nthErr, "failed to determine if AWS node termination handler is needed during pull secret recovery, defaulting to true") + isAWSNodeTerminationHandlerNeeded = true + } + _, hcpErr := createOrUpdate(ctx, r.Client, hcp, func() error { + // Skip cert annotation resolution during pull secret recovery — it requires + // the pull secret to resolve the CPO image, which is unavailable here. + return reconcileHostedControlPlane(hcp, hcluster, isAutoscalingNeeded, isAWSNodeTerminationHandlerNeeded, + func() (map[string]string, error) { return nil, nil }) + }) + if hcpErr != nil { + log.Error(hcpErr, "failed to reconcile hostedcontrolplane during pull secret recovery") + } + return ctrl.Result{}, fmt.Errorf("failed to get pull secret: %w", err) } controlPlaneOperatorImage, err := hyperutil.GetControlPlaneOperatorImage(ctx, hcluster, releaseProvider, r.HypershiftOperatorImage, pullSecretBytes) diff --git a/hypershift-operator/controllers/hostedcluster/hostedcluster_controller_test.go b/hypershift-operator/controllers/hostedcluster/hostedcluster_controller_test.go index e86cc223640..3e71c8c9cab 100644 --- a/hypershift-operator/controllers/hostedcluster/hostedcluster_controller_test.go +++ b/hypershift-operator/controllers/hostedcluster/hostedcluster_controller_test.go @@ -256,6 +256,195 @@ func TestHasBeenAvailable(t *testing.T) { } } +func TestReconcileWhenPullSecretMissing(t *testing.T) { + t.Parallel() + g := NewWithT(t) + + hcluster := &hyperv1.HostedCluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-cluster", + Namespace: "any", + Annotations: map[string]string{ + hyperv1.RequestServingNodeAdditionalSelectorAnnotation: `{"node-role.kubernetes.io/worker": ""}`, + }, + }, + Spec: hyperv1.HostedClusterSpec{ + ClusterID: "id", + InfraID: "infra-id", + Networking: hyperv1.ClusterNetworking{ + ClusterNetwork: []hyperv1.ClusterNetworkEntry{{CIDR: *ipnet.MustParseCIDR("172.16.1.0/24")}}, + }, + Services: []hyperv1.ServicePublishingStrategyMapping{ + { + Service: hyperv1.Ignition, + ServicePublishingStrategy: hyperv1.ServicePublishingStrategy{ + Type: hyperv1.LoadBalancer, + }, + }, + }, + PullSecret: corev1.LocalObjectReference{ + Name: "pull-secret", + }, + Release: hyperv1.Release{Image: "quay.io/openshift-release-dev/ocp-release:4.15.0"}, + Etcd: hyperv1.EtcdSpec{ManagementType: hyperv1.Managed}, + Platform: hyperv1.PlatformSpec{ + Type: hyperv1.AWSPlatform, + }, + NodeSelector: map[string]string{ + "node-role.kubernetes.io/worker": "", + }, + }, + } + + hcpNs := hcpmanifests.HostedControlPlaneNamespace(hcluster.Namespace, hcluster.Name) + hcp := controlplaneoperator.HostedControlPlane(hcpNs, hcluster.Name) + + // Create the namespace so createOrUpdate can find it + ns := &corev1.Namespace{ + ObjectMeta: metav1.ObjectMeta{Name: hcpNs}, + } + + // No pull secret in the fake client — this is the scenario we're testing + objects := []crclient.Object{ + hcp, + hcluster, + ns, + } + + client := fake.NewClientBuilder().WithScheme(api.Scheme).WithObjects(objects...).WithStatusSubresource(hcluster).Build() + + mockCtrl := gomock.NewController(t) + defer mockCtrl.Finish() + mockedProvider := releaseinfo.NewMockProviderWithOpenShiftImageRegistryOverrides(mockCtrl) + + r := &HostedClusterReconciler{ + Client: client, + Clock: clocktesting.NewFakeClock(time.Now()), + CertRotationScale: 24 * time.Hour, + createOrUpdate: func(reconcile.Request) upsert.CreateOrUpdateFN { return ctrl.CreateOrUpdate }, + ManagementClusterCapabilities: &fakecapabilities.FakeSupportNoCapabilities{}, + RegistryProvider: fakeReleaseProvider{ + releaseProvider: mockedProvider, + metadataProvider: fakeimagemetadataprovider.FakeRegistryClientImageMetadataProvider{ + Result: &dockerv1client.DockerImageConfig{}, + }, + }, + now: func() metav1.Time { return metav1.NewTime(time.Now()) }, + } + + ctx := t.Context() + _, err := r.Reconcile(ctx, ctrl.Request{NamespacedName: crclient.ObjectKeyFromObject(hcluster)}) + + // When the pull secret is missing, reconciliation should return an error for requeue + g.Expect(err).To(HaveOccurred(), "When pull secret is missing it should return an error") + g.Expect(err.Error()).To(ContainSubstring("failed to get pull secret"), "When pull secret is missing the error should mention pull secret") + + // Verify reconcileHostedControlPlane still ran — the HCP should have the HostedCluster's spec fields + updatedHCP := controlplaneoperator.HostedControlPlane(hcpNs, hcluster.Name) + err = client.Get(ctx, crclient.ObjectKeyFromObject(updatedHCP), updatedHCP) + g.Expect(err).ToNot(HaveOccurred(), "When pull secret is missing it should still be able to get the HCP") + + g.Expect(updatedHCP.Spec.NodeSelector).To(Equal(hcluster.Spec.NodeSelector), + "When pull secret is missing it should still propagate NodeSelector to HCP") + g.Expect(updatedHCP.Annotations).To(HaveKeyWithValue( + hyperv1.RequestServingNodeAdditionalSelectorAnnotation, + hcluster.Annotations[hyperv1.RequestServingNodeAdditionalSelectorAnnotation]), + "When pull secret is missing it should still propagate RequestServingNodeAdditionalSelector to HCP") + g.Expect(updatedHCP.Spec.ReleaseImage).To(Equal(hcluster.Spec.Release.Image), + "When pull secret is missing it should still propagate ReleaseImage to HCP") + + // With no NodePools in the fake client, autoscaling is not needed so the + // autoscaler should be disabled. This validates the defaulting logic in the + // pull secret recovery path. + g.Expect(updatedHCP.Annotations).To(HaveKeyWithValue( + hyperv1.DisableClusterAutoscalerAnnotation, "true"), + "When pull secret is missing and no NodePools exist it should disable the cluster autoscaler") +} + +func TestReconcileWhenPullSecretMissingAndNodePoolListFails(t *testing.T) { + t.Parallel() + g := NewWithT(t) + + hcluster := &hyperv1.HostedCluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-cluster", + Namespace: "any", + }, + Spec: hyperv1.HostedClusterSpec{ + ClusterID: "id", + InfraID: "infra-id", + Networking: hyperv1.ClusterNetworking{ + ClusterNetwork: []hyperv1.ClusterNetworkEntry{{CIDR: *ipnet.MustParseCIDR("172.16.1.0/24")}}, + }, + Services: []hyperv1.ServicePublishingStrategyMapping{ + { + Service: hyperv1.Ignition, + ServicePublishingStrategy: hyperv1.ServicePublishingStrategy{ + Type: hyperv1.LoadBalancer, + }, + }, + }, + PullSecret: corev1.LocalObjectReference{ + Name: "pull-secret", + }, + Release: hyperv1.Release{Image: "quay.io/openshift-release-dev/ocp-release:4.15.0"}, + Etcd: hyperv1.EtcdSpec{ManagementType: hyperv1.Managed}, + Platform: hyperv1.PlatformSpec{ + Type: hyperv1.AWSPlatform, + }, + }, + } + + hcpNs := hcpmanifests.HostedControlPlaneNamespace(hcluster.Namespace, hcluster.Name) + hcp := controlplaneoperator.HostedControlPlane(hcpNs, hcluster.Name) + ns := &corev1.Namespace{ + ObjectMeta: metav1.ObjectMeta{Name: hcpNs}, + } + + fakeClient := fake.NewClientBuilder().WithScheme(api.Scheme).WithObjects(hcp, hcluster, ns).WithStatusSubresource(hcluster). + WithInterceptorFuncs(interceptor.Funcs{ + List: func(ctx context.Context, client crclient.WithWatch, list crclient.ObjectList, opts ...crclient.ListOption) error { + if _, ok := list.(*hyperv1.NodePoolList); ok { + return fmt.Errorf("simulated list error") + } + return client.List(ctx, list, opts...) + }, + }).Build() + + mockCtrl := gomock.NewController(t) + defer mockCtrl.Finish() + mockedProvider := releaseinfo.NewMockProviderWithOpenShiftImageRegistryOverrides(mockCtrl) + + r := &HostedClusterReconciler{ + Client: fakeClient, + Clock: clocktesting.NewFakeClock(time.Now()), + CertRotationScale: 24 * time.Hour, + createOrUpdate: func(reconcile.Request) upsert.CreateOrUpdateFN { return ctrl.CreateOrUpdate }, + ManagementClusterCapabilities: &fakecapabilities.FakeSupportNoCapabilities{}, + RegistryProvider: fakeReleaseProvider{ + releaseProvider: mockedProvider, + metadataProvider: fakeimagemetadataprovider.FakeRegistryClientImageMetadataProvider{ + Result: &dockerv1client.DockerImageConfig{}, + }, + }, + now: func() metav1.Time { return metav1.NewTime(time.Now()) }, + } + + ctx := t.Context() + _, err := r.Reconcile(ctx, ctrl.Request{NamespacedName: crclient.ObjectKeyFromObject(hcluster)}) + + g.Expect(err).To(HaveOccurred(), "When pull secret is missing it should return an error") + + updatedHCP := controlplaneoperator.HostedControlPlane(hcpNs, hcluster.Name) + err = fakeClient.Get(ctx, crclient.ObjectKeyFromObject(updatedHCP), updatedHCP) + g.Expect(err).ToNot(HaveOccurred()) + + // When NodePool list fails, isAutoscalingNeeded defaults to true, so the + // autoscaler should NOT be disabled (annotation should be absent). + g.Expect(updatedHCP.Annotations).NotTo(HaveKey(hyperv1.DisableClusterAutoscalerAnnotation), + "When NodePool list fails during pull secret recovery it should default to keeping the autoscaler enabled") +} + func TestReconcileHostedControlPlaneAdditionalTrustBundle(t *testing.T) { t.Parallel() tests := []struct { diff --git a/test/e2e/chaos_test.go b/test/e2e/chaos_test.go index 34ab14b34b8..578b54df162 100644 --- a/test/e2e/chaos_test.go +++ b/test/e2e/chaos_test.go @@ -19,6 +19,7 @@ import ( batchv1 "k8s.io/api/batch/v1" "k8s.io/apimachinery/pkg/labels" "k8s.io/apimachinery/pkg/util/wait" + "k8s.io/client-go/util/retry" "k8s.io/utils/ptr" hyperv1 "github.com/openshift/hypershift/api/hypershift/v1beta1" @@ -459,6 +460,165 @@ func testSingleMemberRecovery(parentCtx context.Context, client crclient.Client, } } +// TestPullSecretUnavailable validates that the HostedCluster reconciler continues +// to propagate critical spec fields (like NodeSelector and +// RequestServingNodeAdditionalSelector) to the HostedControlPlane even when the +// pull secret is corrupted. This is a regression test for OCPBUGS-77268. +func TestPullSecretUnavailable(t *testing.T) { + t.Parallel() + + ctx, cancel := context.WithCancel(testContext) + defer cancel() + + clusterOpts := globalOpts.DefaultClusterOptions(t) + clusterOpts.ControlPlaneAvailabilityPolicy = string(hyperv1.HighlyAvailable) + clusterOpts.NodePoolReplicas = 0 + + e2eutil.NewHypershiftTest(t, ctx, func(t *testing.T, g Gomega, mgtClient crclient.Client, hostedCluster *hyperv1.HostedCluster) { + controlPlaneNamespace := manifests.HostedControlPlaneNamespace(hostedCluster.Namespace, hostedCluster.Name) + + // Get the current pull secret and save original data for restoration + pullSecret := &corev1.Secret{} + err := mgtClient.Get(ctx, crclient.ObjectKey{ + Namespace: hostedCluster.Namespace, + Name: hostedCluster.Spec.PullSecret.Name, + }, pullSecret) + g.Expect(err).NotTo(HaveOccurred(), "failed to get pull secret") + originalData := pullSecret.Data[".dockerconfigjson"] + g.Expect(originalData).NotTo(BeEmpty(), "pull secret should have .dockerconfigjson data") + + // Save original values of test-owned keys for cleanup + err = mgtClient.Get(ctx, crclient.ObjectKeyFromObject(hostedCluster), hostedCluster) + g.Expect(err).NotTo(HaveOccurred(), "failed to get hostedcluster") + originalSelectorVal, hadSelector := hostedCluster.Spec.NodeSelector["ocpbugs-77268-test"] + originalAnnotationVal, hadAnnotation := hostedCluster.Annotations[hyperv1.RequestServingNodeAdditionalSelectorAnnotation] + + // Ensure pull secret and test-owned keys are restored even if the test fails mid-way + t.Cleanup(func() { + t.Log("Cleanup: restoring pull secret and test-owned HostedCluster keys...") + secret := &corev1.Secret{} + if err := mgtClient.Get(ctx, crclient.ObjectKey{ + Namespace: hostedCluster.Namespace, + Name: hostedCluster.Spec.PullSecret.Name, + }, secret); err == nil { + if secret.Data == nil { + secret.Data = map[string][]byte{} + } + secret.Data[".dockerconfigjson"] = originalData + if err := mgtClient.Update(ctx, secret); err != nil { + t.Logf("WARNING: failed to restore pull secret during cleanup: %v", err) + } + } + hc := &hyperv1.HostedCluster{} + if err := mgtClient.Get(ctx, crclient.ObjectKeyFromObject(hostedCluster), hc); err == nil { + if hadSelector { + if hc.Spec.NodeSelector == nil { + hc.Spec.NodeSelector = map[string]string{} + } + hc.Spec.NodeSelector["ocpbugs-77268-test"] = originalSelectorVal + } else { + delete(hc.Spec.NodeSelector, "ocpbugs-77268-test") + } + if hadAnnotation { + if hc.Annotations == nil { + hc.Annotations = map[string]string{} + } + hc.Annotations[hyperv1.RequestServingNodeAdditionalSelectorAnnotation] = originalAnnotationVal + } else { + delete(hc.Annotations, hyperv1.RequestServingNodeAdditionalSelectorAnnotation) + } + if err := mgtClient.Update(ctx, hc); err != nil { + t.Logf("WARNING: failed to restore hostedcluster keys during cleanup: %v", err) + } + } + }) + + // Wait for the first successful reconcile so the HCP exists with all + // fields populated before we corrupt the pull secret. + e2eutil.EventuallyObject(t, ctx, "HostedCluster to reconcile successfully", + func(ctx context.Context) (*hyperv1.HostedCluster, error) { + hc := &hyperv1.HostedCluster{} + err := mgtClient.Get(ctx, crclient.ObjectKeyFromObject(hostedCluster), hc) + return hc, err + }, + []e2eutil.Predicate[*hyperv1.HostedCluster]{ + func(hc *hyperv1.HostedCluster) (done bool, reasons string, err error) { + for _, c := range hc.Status.Conditions { + if c.Type == string(hyperv1.ReconciliationSucceeded) && c.Status == metav1.ConditionTrue { + return true, "ReconciliationSucceeded=True", nil + } + } + return false, "waiting for ReconciliationSucceeded=True", nil + }, + }, + e2eutil.WithInterval(5*time.Second), + e2eutil.WithTimeout(5*time.Minute), + ) + + // Remove the pull secret key so GetPullSecretBytes fails and the + // recovery reconcileHostedControlPlane path is exercised. + t.Log("Removing pull secret key...") + err = retry.RetryOnConflict(retry.DefaultRetry, func() error { + if err := mgtClient.Get(ctx, crclient.ObjectKey{ + Namespace: hostedCluster.Namespace, + Name: hostedCluster.Spec.PullSecret.Name, + }, pullSecret); err != nil { + return err + } + delete(pullSecret.Data, ".dockerconfigjson") + return mgtClient.Update(ctx, pullSecret) + }) + g.Expect(err).NotTo(HaveOccurred(), "failed to corrupt pull secret") + + // Mutate NodeSelector and RequestServingNodeAdditionalSelector while pull secret is broken. + // If reconcileHostedControlPlane still runs, these changes should propagate to the HCP. + t.Log("Updating HostedCluster NodeSelector and annotations while pull secret is corrupted...") + err = retry.RetryOnConflict(retry.DefaultRetry, func() error { + if err := mgtClient.Get(ctx, crclient.ObjectKeyFromObject(hostedCluster), hostedCluster); err != nil { + return err + } + if hostedCluster.Spec.NodeSelector == nil { + hostedCluster.Spec.NodeSelector = map[string]string{} + } + hostedCluster.Spec.NodeSelector["ocpbugs-77268-test"] = "true" + if hostedCluster.Annotations == nil { + hostedCluster.Annotations = map[string]string{} + } + hostedCluster.Annotations[hyperv1.RequestServingNodeAdditionalSelectorAnnotation] = `{"ocpbugs-77268-test":"true"}` + return mgtClient.Update(ctx, hostedCluster) + }) + g.Expect(err).NotTo(HaveOccurred(), "failed to update hostedcluster") + + // Verify both mutations propagated to the HCP even with a bad pull secret + e2eutil.EventuallyObject(t, ctx, "HCP spec updated despite corrupted pull secret", + func(ctx context.Context) (*hyperv1.HostedControlPlane, error) { + h := &hyperv1.HostedControlPlane{} + err := mgtClient.Get(ctx, crclient.ObjectKey{ + Namespace: controlPlaneNamespace, + Name: hostedCluster.Name, + }, h) + return h, err + }, + []e2eutil.Predicate[*hyperv1.HostedControlPlane]{ + func(h *hyperv1.HostedControlPlane) (done bool, reasons string, err error) { + val, ok := h.Spec.NodeSelector["ocpbugs-77268-test"] + if !ok || val != "true" { + return false, "NodeSelector not yet propagated to HCP", nil + } + ann, ok := h.Annotations[hyperv1.RequestServingNodeAdditionalSelectorAnnotation] + if !ok || ann != `{"ocpbugs-77268-test":"true"}` { + return false, "RequestServingNodeAdditionalSelector not yet propagated to HCP", nil + } + return true, "NodeSelector and RequestServingNodeAdditionalSelector propagated to HCP", nil + }, + }, + e2eutil.WithInterval(5*time.Second), + e2eutil.WithTimeout(2*time.Minute), + ) + + }).Execute(&clusterOpts, hyperv1.NonePlatform, globalOpts.ArtifactDir, "pull-secret-unavailable", globalOpts.ServiceAccountSigningKey) +} + // TODO: Generics :-) func randomPods(pods []corev1.Pod, count int) []corev1.Pod { var selected []corev1.Pod