From 7b92a2f4b7d63749e672117d126f10b6c1251876 Mon Sep 17 00:00:00 2001 From: Bryan Cox Date: Tue, 12 May 2026 05:56:59 -0400 Subject: [PATCH] docs: restore non-obvious comments after gocyclo refactor Restore and improve comments that document hidden constraints, edge cases, and design decisions across files affected by the gocyclo complexity refactor in PR #8309. Changes: - Add missing requeue comment in AWS private link deletion flow - Restore OAuth IDP conversion comments: GitLab OIDC requirement, Keystone ID, IBM Cloud special case, ID claim safety, challenge flow validation constraint - Fix comment formatting (//Handle -> // Handle) in v2 OAuth IDP - Add ARO-HCP OpenAPI spec link for Azure resource type constant - Restore CAPI label propagation doc comment with TODO and JIRA link (HOSTEDCP-971) explaining why labels/taints bypass rolling upgrades - Restore globalPS DaemonSet scheduling comment Signed-off-by: Bryan Cox Co-Authored-By: Claude Opus 4.6 (1M context) --- cmd/cluster/core/create.go | 10 +++++++ cmd/install/assets/hypershift_operator.go | 3 +++ cmd/install/install.go | 8 ++++++ .../awsprivatelink_controller.go | 6 +++++ .../azureprivatelinkservice/controller.go | 13 +++++++++ .../hostedcontrolplane_controller.go | 18 +++++++++++++ .../hostedcontrolplane/oauth/idp_convert.go | 9 +++++-- .../v2/oauth/idp_convert.go | 2 +- .../controllers/resources/resources.go | 11 ++++++++ control-plane-operator/main.go | 11 +++++++- .../snapshot_controller.go | 3 +++ .../hostedcluster/etcd_recovery.go | 3 +++ .../hostedcluster/metrics/metrics.go | 7 ++++- .../hostedcluster/network_policies.go | 23 ++++++++++++++-- .../hostedclustersizing_controller.go | 14 +++++++++- .../controllers/nodepool/aws.go | 10 ++++++- .../controllers/nodepool/capi.go | 18 +++++++++++++ .../controllers/nodepool/metrics/metrics.go | 5 ++++ .../controllers/platform/aws/controller.go | 3 +++ .../controllers/scheduler/aws/autoscaler.go | 27 +++++++++++++++++++ .../aws/dedicated_request_serving_nodes.go | 9 +++++++ hypershift-operator/main.go | 9 +++++++ .../controllers/local_ignitionprovider.go | 20 ++++++++++++++ 23 files changed, 233 insertions(+), 9 deletions(-) diff --git a/cmd/cluster/core/create.go b/cmd/cluster/core/create.go index 4c3ccc771e0..8296268bd02 100644 --- a/cmd/cluster/core/create.go +++ b/cmd/cluster/core/create.go @@ -355,6 +355,7 @@ func prototypeResources(ctx context.Context, opts *CreateOptions) (*resources, e } func resolveReleaseImage(ctx context.Context, opts *CreateOptions) error { + // allow client side defaulting when release image is empty but release stream is set. if len(opts.ReleaseImage) != 0 || len(opts.ReleaseStream) == 0 { return nil } @@ -386,6 +387,7 @@ func parseKeyValuePairs(items []string, kind string) (map[string]string, error) } func resolvePullSecret(opts *CreateOptions) ([]byte, error) { + // overrides if pullSecretFile is set if len(opts.PullSecretFile) > 0 { data, err := os.ReadFile(opts.PullSecretFile) if err != nil { @@ -430,6 +432,7 @@ func applyEtcdConfig(cluster *hyperv1.HostedCluster, opts *CreateOptions) error func applySSHKey(prototype *resources, opts *CreateOptions) error { sshKey, sshPrivateKey := opts.PublicKey, opts.PrivateKey var err error + // overrides secret if SSHKeyFile is set if len(opts.SSHKeyFile) > 0 { if opts.GenerateSSH { return fmt.Errorf("--generate-ssh and --ssh-key cannot be specified together") @@ -468,6 +471,8 @@ func applySSHKey(prototype *resources, opts *CreateOptions) error { } func applyPausedUntil(cluster *hyperv1.HostedCluster, opts *CreateOptions) error { + // validate pausedUntil value + // valid values are either "true" or RFC3339 format date if len(opts.PausedUntil) == 0 || opts.PausedUntil == "true" { return nil } @@ -725,14 +730,18 @@ func (opts *RawCreateOptions) Validate(ctx context.Context) (*ValidatedCreateOpt return nil, err } + // Validate HostedCluster name follows RFC1123 standard + // https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#dns-label-names errs := validation.IsDNS1123Label(opts.Name) if len(errs) > 0 { return nil, fmt.Errorf("HostedCluster name failed RFC1123 validation: %s", strings.Join(errs[:], " ")) } + // Validate HostedCluster with this name doesn't exist in the namespace if err := opts.validateClusterExistence(ctx); err != nil { return nil, err } + // Validate multi-arch aspects if err := opts.validateArchAndFeatureSet(); err != nil { return nil, err } @@ -804,6 +813,7 @@ func (opts *RawCreateOptions) validateArchAndFeatureSet() error { return fmt.Errorf("specified arch %q is not supported", opts.Arch) } + // Validate feature set is "", TechPreviewNoUpgrade, or DevPreviewNoUpgrade switch opts.FeatureSet { case string(configv1.Default), string(configv1.TechPreviewNoUpgrade), string(configv1.DevPreviewNoUpgrade): case string(configv1.CustomNoUpgrade): diff --git a/cmd/install/assets/hypershift_operator.go b/cmd/install/assets/hypershift_operator.go index 592531ff749..d2af9802030 100644 --- a/cmd/install/assets/hypershift_operator.go +++ b/cmd/install/assets/hypershift_operator.go @@ -993,6 +993,9 @@ func (o HyperShiftOperatorDeployment) addAzurePlatformResources(envVars *[]corev if o.AzurePLSResourceGroup != "" { *envVars = append(*envVars, corev1.EnvVar{Name: "AZURE_RESOURCE_GROUP", Value: o.AzurePLSResourceGroup}) } + // Workload identity mode: the SA annotation triggers Azure AD Workload Identity + // webhook to inject federated tokens. Set the client ID as an env var so the + // HO platform controller can construct credentials. if o.AzurePLSManagedIdentityClientID != "" { *envVars = append(*envVars, corev1.EnvVar{Name: "AZURE_PLS_CLIENT_ID", Value: o.AzurePLSManagedIdentityClientID}) if o.AzurePLSSubscriptionID != "" { diff --git a/cmd/install/install.go b/cmd/install/install.go index ed798ea8f4f..0f91fcedbd0 100644 --- a/cmd/install/install.go +++ b/cmd/install/install.go @@ -185,6 +185,8 @@ func (o *Options) validatePlatformConfig() []error { errs = append(errs, fmt.Errorf("--aws-private-region and --aws-private-creds or --aws-private-secret are required with --private-platform=%s", hyperv1.AWSPlatform)) } case hyperv1.GCPPlatform: + // GCP uses Workload Identity Federation, no credentials required. + // However, --gcp-project and --gcp-region must be set together. if (o.GCPProject == "") != (o.GCPRegion == "") { errs = append(errs, fmt.Errorf("--gcp-project and --gcp-region must be set together when --private-platform=%s", hyperv1.GCPPlatform)) } @@ -242,6 +244,7 @@ func (o *Options) validateExternalDNSConfig() []error { return nil } var errs []error + // Credentials are optional for GCP when using Workload Identity credentialsRequired := o.ExternalDNSProvider != "google" if credentialsRequired && len(o.ExternalDNSCredentials) == 0 && len(o.ExternalDNSCredentialsSecret) == 0 { errs = append(errs, fmt.Errorf("--external-dns-credentials or --external-dns-credentials-secret are required with --external-dns-provider")) @@ -282,20 +285,24 @@ func (o *Options) validateImageConfig() []error { return errs } +// Validate scale-from-zero credentials func (o *Options) validateScaleFromZeroConfig() []error { if len(o.ScaleFromZeroCreds) == 0 && len(o.ScaleFromZeroCredentialsSecret) == 0 { return nil } var errs []error supportedProviders := set.New("aws") + // Check mutual exclusivity - only one of file or secret should be provided if len(o.ScaleFromZeroCreds) != 0 && len(o.ScaleFromZeroCredentialsSecret) != 0 { errs = append(errs, fmt.Errorf("only one of --scale-from-zero-creds or --scale-from-zero-secret is supported")) } + // Provider is required when using scale-from-zero credentials if len(o.ScaleFromZeroProvider) == 0 { errs = append(errs, fmt.Errorf("--scale-from-zero-provider is required when using scale-from-zero credentials")) } else if !supportedProviders.Has(o.ScaleFromZeroProvider) { errs = append(errs, fmt.Errorf("invalid --scale-from-zero-provider: %s (must be one of: %v)", o.ScaleFromZeroProvider, supportedProviders.UnsortedList())) } + // Validate credentials file exists and is accessible if provided if len(o.ScaleFromZeroCreds) > 0 { if _, err := os.Stat(o.ScaleFromZeroCreds); err != nil { if os.IsNotExist(err) { @@ -324,6 +331,7 @@ func (o *Options) validateMiscConfig() []error { if len(o.ManagedService) > 0 && o.ManagedService != hyperv1.AroHCP { errs = append(errs, fmt.Errorf("not a valid managed service type: %s", o.ManagedService)) } + // Validate all the platforms in the list are valid for _, platform := range o.PlatformsToInstall { platformToCheck := strings.ToLower(platform) if !ValidPlatforms.Has(platformToCheck) { diff --git a/control-plane-operator/controllers/awsprivatelink/awsprivatelink_controller.go b/control-plane-operator/controllers/awsprivatelink/awsprivatelink_controller.go index 696a42bc817..8ba562ccb3b 100644 --- a/control-plane-operator/controllers/awsprivatelink/awsprivatelink_controller.go +++ b/control-plane-operator/controllers/awsprivatelink/awsprivatelink_controller.go @@ -677,8 +677,10 @@ func deleteEndpointIfWrongService(ctx context.Context, ec2Client awsapi.EC2API, } func modifyEndpointIfNeeded(ctx context.Context, ec2Client awsapi.EC2API, awsEndpointService *hyperv1.AWSEndpointService, endpoint ec2types.VpcEndpoint, endpointID string, log logr.Logger) error { + // Ensure endpoint has the right subnets. addedSubnet, removedSubnet := diffIDs(awsEndpointService.Spec.SubnetIDs, endpoint.SubnetIds) + // Ensure endpoint has the right SG. existingSG := make([]string, 0) for _, group := range endpoint.Groups { existingSG = append(existingSG, aws.ToString(group.GroupId)) @@ -816,6 +818,8 @@ func (r *AWSEndpointServiceReconciler) reconcileExternalNameServices(ctx context isPublic := netutil.IsPublicHCP(hcp) externalNames := hcpExternalNames(hcp) + // only if not public and external names are configured, create services of type ExternalName so external-dns + // can create records for them if !isPublic && len(externalNames) > 0 { var errs []error for svcType, externalName := range externalNames { @@ -839,6 +843,7 @@ func (r *AWSEndpointServiceReconciler) reconcileExternalNameServices(ctx context return nil } + // if the cluster is public, ensure that any ExternalName services are removed privateExternalServices := &corev1.ServiceList{} if err := r.List(ctx, privateExternalServices, client.InNamespace(hcp.Namespace), client.HasLabels{externalPrivateServiceLabel}); err != nil { return fmt.Errorf("cannot list private external services: %w", err) @@ -1106,6 +1111,7 @@ func (r *AWSEndpointServiceReconciler) delete(ctx context.Context, awsEndpointSe } if output != nil && len(output.VpcEndpoints) != 0 { + // Once the VPC Endpoint is deleted, we need to return an error to reexecute the reconciliation return false, fmt.Errorf("resource requested for deletion but still present") } diff --git a/control-plane-operator/controllers/azureprivatelinkservice/controller.go b/control-plane-operator/controllers/azureprivatelinkservice/controller.go index 13cd2255fdc..1c75c9a2529 100644 --- a/control-plane-operator/controllers/azureprivatelinkservice/controller.go +++ b/control-plane-operator/controllers/azureprivatelinkservice/controller.go @@ -860,6 +860,9 @@ func (r *AzurePrivateLinkServiceReconciler) handleAzureError(ctx context.Context // CPO finalizer has completed PE cleanup and removed its finalizer from the CR. func (r *AzurePrivateLinkServiceReconciler) reconcileDelete(ctx context.Context, azPLS *hyperv1.AzurePrivateLinkService, log logr.Logger) error { resourceGroup := azPLS.Spec.ResourceGroupName + // Delete DNS resources using the zone name persisted in status. + // This avoids a dependency on the HostedControlPlane during deletion, which may + // already be torn down or unavailable when the finalizer runs. dnsZoneName := azPLS.Status.DNSZoneName if dnsZoneName != "" { @@ -961,6 +964,9 @@ func (r *AzurePrivateLinkServiceReconciler) deleteBaseDomainResources(ctx contex return err } + // Only delete the base domain DNS zone if no other CRs share it. + // When multiple CRs (e.g., private-router and oauth-openshift) use the same + // base domain zone, the zone must not be deleted until the last CR is removed. hasSiblings, err := r.hasSiblingCR(ctx, azPLS) if err != nil { return fmt.Errorf("failed to check for sibling CRs during base domain zone cleanup: %w", err) @@ -988,6 +994,9 @@ func (r *AzurePrivateLinkServiceReconciler) deleteBaseDomainARecords(ctx context var baseDomainRecords []string if azPLS.Name == privateRouterCRName { baseDomainRecords = append(baseDomainRecords, kasBaseDomainRecordPrefix+clusterName) + // Only delete the oauth record if there is no sibling OAuth CR that + // owns it. This prevents the private-router deletion from removing + // an oauth record that now belongs to the dedicated OAuth CR. hasSiblings, err := r.hasSiblingCR(ctx, azPLS) if err != nil { return fmt.Errorf("failed to check for sibling CRs during base domain cleanup: %w", err) @@ -1013,6 +1022,10 @@ func (r *AzurePrivateLinkServiceReconciler) deleteBaseDomainARecords(ctx context return nil } +// deletePrivateEndpoint always attempts deletion by deterministic name, even +// when PrivateEndpointID is empty. If the status was never populated (e.g., +// status update failed after PE creation), relying solely on PrivateEndpointID +// would orphan the PE in the customer's subscription. func (r *AzurePrivateLinkServiceReconciler) deletePrivateEndpoint(ctx context.Context, resourceGroup, crName, statusPEID string, log logr.Logger) error { peName := privateEndpointName(crName) log.Info("Deleting Private Endpoint", "name", peName, "hasStatusID", statusPEID != "") diff --git a/control-plane-operator/controllers/hostedcontrolplane/hostedcontrolplane_controller.go b/control-plane-operator/controllers/hostedcontrolplane/hostedcontrolplane_controller.go index 017ac10a8d3..d21ee116ffd 100644 --- a/control-plane-operator/controllers/hostedcontrolplane/hostedcontrolplane_controller.go +++ b/control-plane-operator/controllers/hostedcontrolplane/hostedcontrolplane_controller.go @@ -502,6 +502,7 @@ func (r *HostedControlPlaneReconciler) reconcileKASStatus(ctx context.Context, h return fmt.Errorf("failed to fetch Kube APIServer deployment %s/%s: %w", deployment.Namespace, deployment.Name, err) } } else { + // Assume the deployment is unavailable until proven otherwise. newCondition = metav1.Condition{ Type: string(hyperv1.KubeAPIServerAvailable), Status: metav1.ConditionFalse, @@ -670,6 +671,8 @@ func (r *HostedControlPlaneReconciler) Reconcile(ctx context.Context, req ctrl.R return reconcile.Result{}, fmt.Errorf("failed to look up release image metadata: %w", err) } + // This runs after LookupReleaseImage so we can use the version and resolved + // digest from the release image metadata. if err := r.reconcileControlPlaneVersionStatus(ctx, hostedControlPlane, originalHostedControlPlane, releaseImage); err != nil { return reconcile.Result{}, err } @@ -847,6 +850,8 @@ func (r *HostedControlPlaneReconciler) reconcileControlPlaneVersionStatus(ctx co if err := r.Client.Get(ctx, client.ObjectKeyFromObject(pullSecret), pullSecret); err != nil { return fmt.Errorf("failed to get pull secret for version reconciliation: %w", err) } + // Resolve the release image to its digest so controlPlaneVersion records + // the immutable image reference, consistent with how CVO records images. _, resolvedRef, err := r.ImageMetadataProvider.GetDigest(ctx, util.HCPControlPlaneReleaseImage(hostedControlPlane), pullSecret.Data[corev1.DockerConfigJsonKey]) if err != nil { return fmt.Errorf("failed to resolve control plane release image digest: %w", err) @@ -855,6 +860,9 @@ func (r *HostedControlPlaneReconciler) reconcileControlPlaneVersionStatus(ctx co componentsList := &hyperv1.ControlPlaneComponentList{} if listErr := r.Client.List(ctx, componentsList, client.InNamespace(hostedControlPlane.Namespace)); listErr != nil { + // On list failure, ensure a Partial entry exists so consumers + // know an upgrade was attempted. Preserve observedGeneration. + // Persist the Partial entry before returning the error. hostedControlPlane.Status.ControlPlaneVersion = ensureControlPlaneVersionPartial(hostedControlPlane, clk, releaseImage.Version(), resolvedImage) if patchErr := r.Client.Status().Patch(ctx, hostedControlPlane, client.MergeFromWithOptions(originalHostedControlPlane, client.MergeFromWithOptimisticLock{})); patchErr != nil { return fmt.Errorf("failed to patch status after component list failure: %w (list error: %v)", patchErr, listErr) @@ -1590,6 +1598,10 @@ func (r *HostedControlPlaneReconciler) reconcileOLMAndMiscCerts(ctx context.Cont } func (r *HostedControlPlaneReconciler) reconcileNetworkServingCerts(ctx context.Context, hcp *hyperv1.HostedControlPlane, p *pki.PKIParams, createOrUpdate upsert.CreateOrUpdateFN, rootCASecret *corev1.Secret) error { + // For the Multus Admission Controller, Network Node Identity, and OVN Control Plane Metrics Serving Certs: + // We want to remove the secret if there was an existing one created by service-ca; otherwise, it will cause + // issues in cases where you are upgrading an older CPO prior to us adding the feature to reconcile the serving + // cert secret ourselves. if !netutil.IsDisableMultiNetwork(hcp) { multusAdmissionControllerService := manifests.MultusAdmissionControllerService(hcp.Namespace) if err := r.Get(ctx, client.ObjectKeyFromObject(multusAdmissionControllerService), multusAdmissionControllerService); err != nil { @@ -1598,6 +1610,8 @@ func (r *HostedControlPlaneReconciler) reconcileNetworkServingCerts(ctx context. } } + // If the service doesn't have the service ca annotation, delete any previous secret with the annotation and + // reconcile the secret with our own rootCA; otherwise, skip reconciling the secret with our own rootCA. if hasServiceCAAnnotation := doesServiceHaveServiceCAAnnotation(multusAdmissionControllerService); !hasServiceCAAnnotation { multusAdmissionControllerServingCertSecret := manifests.MultusAdmissionControllerServingCert(hcp.Namespace) @@ -1620,6 +1634,8 @@ func (r *HostedControlPlaneReconciler) reconcileNetworkServingCerts(ctx context. } } + // If the service doesn't have the service ca annotation, delete any previous secret with the annotation and + // reconcile the secret with our own rootCA; otherwise, skip reconciling the secret with our own rootCA. if hasServiceCAAnnotation := doesServiceHaveServiceCAAnnotation(networkNodeIdentityService); !hasServiceCAAnnotation { networkNodeIdentityServingCertSecret := manifests.NetworkNodeIdentityControllerServingCert(hcp.Namespace) @@ -1641,6 +1657,8 @@ func (r *HostedControlPlaneReconciler) reconcileNetworkServingCerts(ctx context. } } + // If the service doesn't have the service ca annotation, delete any previous secret with the annotation and + // reconcile the secret with our own rootCA; otherwise, skip reconciling the secret with our own rootCA. if hasServiceCAAnnotation := doesServiceHaveServiceCAAnnotation(ovnControlPlaneService); !hasServiceCAAnnotation { ovnControlPlaneMetricsServingCertSecret := manifests.OVNControlPlaneMetricsServingCert(hcp.Namespace) diff --git a/control-plane-operator/controllers/hostedcontrolplane/oauth/idp_convert.go b/control-plane-operator/controllers/hostedcontrolplane/oauth/idp_convert.go index 4aeb258be96..5704c079812 100644 --- a/control-plane-operator/controllers/hostedcontrolplane/oauth/idp_convert.go +++ b/control-plane-operator/controllers/hostedcontrolplane/oauth/idp_convert.go @@ -191,7 +191,7 @@ func convertGitLabIDP(providerConfig *configv1.IdentityProviderConfig, i int, id ClientSecret: configv1.StringSource{StringSourceSpec: configv1.StringSourceSpec{ File: idpVolumeMounts.SecretPath(i, gitlabConfig.ClientSecret.Name, "client-secret", configv1.ClientSecretKey), }}, - Legacy: new(bool), + Legacy: new(bool), // we require OIDC for GitLab now } if gitlabConfig.CA.Name != "" { provider.CA = idpVolumeMounts.ConfigMapPath(i, gitlabConfig.CA.Name, "ca", corev1.ServiceAccountRootCAKey) @@ -235,7 +235,7 @@ func convertKeystoneIDP(providerConfig *configv1.IdentityProviderConfig, i int, TypeMeta: metav1.TypeMeta{Kind: "KeystonePasswordIdentityProvider", APIVersion: osinv1.GroupVersion.String()}, RemoteConnectionInfo: configv1.RemoteConnectionInfo{URL: keystoneConfig.URL}, DomainName: keystoneConfig.DomainName, - UseKeystoneIdentity: true, + UseKeystoneIdentity: true, // force use of keystone ID } if keystoneConfig.CA.Name != "" { provider.RemoteConnectionInfo.CA = idpVolumeMounts.ConfigMapPath(i, keystoneConfig.CA.Name, "ca", corev1.ServiceAccountRootCAKey) @@ -291,6 +291,7 @@ func convertOpenIDIDP(ctx context.Context, providerConfig *configv1.IdentityProv ExtraScopes: openIDConfig.ExtraScopes, ExtraAuthorizeParameters: openIDConfig.ExtraAuthorizeParameters, } + // Handle special case for IBM Cloud's OIDC provider (need to override some fields not available in public api) if configOverride != nil { openIDProvider.URLs = configOverride.URLs openIDProvider.Claims = configOverride.Claims @@ -308,6 +309,7 @@ func convertOpenIDIDP(ctx context.Context, providerConfig *configv1.IdentityProv } } openIDProvider.Claims = osinv1.OpenIDClaims{ + // There is no longer a user-facing setting for ID as it is considered unsafe ID: []string{configv1.UserIDClaim}, PreferredUsername: openIDConfig.Claims.PreferredUsername, Name: openIDConfig.Claims.Name, @@ -322,6 +324,9 @@ func convertOpenIDIDP(ctx context.Context, providerConfig *configv1.IdentityProv if configOverride != nil && configOverride.Challenge != nil { data.challenge = *configOverride.Challenge } else { + // openshift CR validating in kube-apiserver does not allow + // challenge-redirecting IdPs to be configured with OIDC so it is safe + // to allow challenge-issuing flow if it's available on the OIDC side challengeFlowsAllowed, err := checkOIDCPasswordGrantFlow(ctx, kclient, openIDProvider.URLs.Token, openIDConfig.ClientID, namespace, openIDConfig.CA, openIDConfig.ClientSecret, skipKonnectivityDialer) if err != nil { return nil, fmt.Errorf("error attempting password grant flow: %v", err) diff --git a/control-plane-operator/controllers/hostedcontrolplane/v2/oauth/idp_convert.go b/control-plane-operator/controllers/hostedcontrolplane/v2/oauth/idp_convert.go index d4e12f63669..4f5f85944c9 100644 --- a/control-plane-operator/controllers/hostedcontrolplane/v2/oauth/idp_convert.go +++ b/control-plane-operator/controllers/hostedcontrolplane/v2/oauth/idp_convert.go @@ -381,7 +381,7 @@ func convertOpenIDIDP( ExtraScopes: openIDConfig.ExtraScopes, ExtraAuthorizeParameters: openIDConfig.ExtraAuthorizeParameters, } - //Handle special case for IBM Cloud's OIDC provider (need to override some fields not available in public api) + // Handle special case for IBM Cloud's OIDC provider (need to override some fields not available in public api) if configOverride != nil { openIDProvider.URLs = configOverride.URLs openIDProvider.Claims = configOverride.Claims diff --git a/control-plane-operator/hostedclusterconfigoperator/controllers/resources/resources.go b/control-plane-operator/hostedclusterconfigoperator/controllers/resources/resources.go index c929a3a9e1c..1ee6a3a7180 100644 --- a/control-plane-operator/hostedclusterconfigoperator/controllers/resources/resources.go +++ b/control-plane-operator/hostedclusterconfigoperator/controllers/resources/resources.go @@ -504,6 +504,7 @@ func (r *reconciler) cleanupLegacyResources(ctx context.Context, log logr.Logger } func (r *reconciler) reconcileDeletion(ctx context.Context, log logr.Logger, hcp *hyperv1.HostedControlPlane) (ctrl.Result, error) { + // Delete admission policies during cluster deletion to allow HCCO cleanup operations for ARO HCP if hcp.Spec.Platform.Type == hyperv1.AzurePlatform { registryConfigManagementStateAdmissionPolicy := registry.AdmissionPolicy{Name: registry.AdmissionPolicyNameManagementState} log.Info("Cluster is being deleted, deleting registry management state admission policy and binding to allow cleanup") @@ -607,6 +608,10 @@ func (r *reconciler) reconcileRegistryAndIngress(ctx context.Context, hcp *hyper } if capabilities.IsImageRegistryCapabilityEnabled(hcp.Spec.Capabilities) { + // For platforms where cluster-image-registry-operator (CIRO) needs a PVC to be created, bootstrap needs to happen + // in CIRO before the registry config is created. For now, this is the case for the OpenStack platform. + // If the object exist, we reconcile the registry config for other fields as it should be fine since the PVC would + // exist at this point. if imageRegistryPlatformWithPVC(hcp.Spec.Platform.Type) && (!registryConfigExists || registryConfig == nil) { log.Info("skipping registry config to let CIRO bootstrap") } else { @@ -623,6 +628,7 @@ func (r *reconciler) reconcileRegistryAndIngress(ctx context.Context, hcp *hyper errs = append(errs, fmt.Errorf("failed to reconcile imageregistry config: %w", err)) } + // TODO: remove this when ROSA HCP stops setting the managementState to Removed to disable the Image Registry if registryConfig.Spec.ManagementState == operatorv1.Removed && r.platformType != hyperv1.IBMCloudPlatform && r.platformType != hyperv1.AzurePlatform { log.Info("imageregistry operator managementstate is removed, disabling openshift-controller-manager controllers and cleaning up resources") ocmConfigMap := cpomanifests.OpenShiftControllerManagerConfig(r.hcpNamespace) @@ -829,10 +835,13 @@ func (r *reconciler) reconcileNetworkingAndSecrets(ctx context.Context, hcp *hyp }); err != nil { errs = append(errs, fmt.Errorf("failed to reconcile network operator: %w", err)) } + // Detect suboptimal MTU size on kubevirt hosted cluster with ovn-k and raise a condition in such a case if err := networkoperator.DetectSuboptimalMTU(ctx, r.cpClient, networkOperator, hcp); err != nil { errs = append(errs, err) } + // this allows users to disable data collection in sensitive environments + // solves https://issues.redhat.com/browse/OCPBUGS-12208 ensureExistsReconciliationStrategy := false if _, exists := hcp.Annotations[hyperv1.EnsureExistsPullSecretReconciliation]; exists { ensureExistsReconciliationStrategy = true @@ -1819,11 +1828,13 @@ func (r *reconciler) reconcileControlPlaneConnectionAvailable(ctx context.Contex cm := manifests.KASConnectionCheckerConfigMap() if err := r.client.Get(ctx, client.ObjectKeyFromObject(cm), cm); err != nil { if apierrors.IsNotFound(err) { + // CPO has not created the configmap yet, wait for create condition.Reason = hyperv1.ControlPlaneConnectionConfigMapNotFoundReason condition.Message = fmt.Sprintf("Connectivity check ConfigMap %s/%s not found; the hosted cluster config operator may not have reconciled it yet", manifests.KASConnectionCheckerNamespace, manifests.KASConnectionCheckerConfigMapName) return r.patchHCPStatusCondition(ctx, hcp, condition) } + // This should not happen as we are started by the CPO after the configmap should be created condition.Reason = hyperv1.ReconcileErrorReason condition.Message = fmt.Sprintf("Failed to get connectivity check ConfigMap %s/%s: %v", manifests.KASConnectionCheckerNamespace, manifests.KASConnectionCheckerConfigMapName, err) diff --git a/control-plane-operator/main.go b/control-plane-operator/main.go index 2e2a2912aba..6fde842dde1 100644 --- a/control-plane-operator/main.go +++ b/control-plane-operator/main.go @@ -187,6 +187,10 @@ func defaultCommand() *cobra.Command { return cmd } +// For now, since the hosted cluster config operator is treated like any other +// release payload component but isn't actually part of a release payload, +// enable the user to specify an image directly as a flag, and otherwise +// try and detect the control plane operator's image to use instead. func newOperatorImageLookup(mgr ctrl.Manager, namespace, deploymentName string) func(string) (string, error) { return func(userSpecifiedImage string) (string, error) { if len(userSpecifiedImage) > 0 { @@ -201,6 +205,7 @@ func newOperatorImageLookup(mgr ctrl.Manager, namespace, deploymentName string) } for _, container := range me.Spec.Containers { + // If CPO container image is a sha256 reference, use it if container.Name == "control-plane-operator" { if strings.Contains(container.Image, "@sha256:") { return container.Image, nil @@ -208,6 +213,7 @@ func newOperatorImageLookup(mgr ctrl.Manager, namespace, deploymentName string) } } + // Use the container status to make sure we get the sha256 reference for _, container := range me.Status.ContainerStatuses { if container.Name == "control-plane-operator" { image := strings.TrimPrefix(container.ImageID, "docker-pullable://") @@ -289,14 +295,16 @@ func buildReleaseProviders(componentImages map[string]string, registryOverrides ComponentImages: componentImages, } + // It should be used to lookup spec.releaseImage. userReleaseProvider := &releaseinfo.ProviderWithOpenShiftImageRegistryOverridesDecorator{ Delegate: &releaseinfo.RegistryMirrorProviderDecorator{ Delegate: coreReleaseProvider, - RegistryOverrides: nil, + RegistryOverrides: nil, // UserReleaseProvider shouldn't include registry overrides as they should not get propagated to the data plane. }, OpenShiftImageRegistryOverrides: imageRegistryOverrides, } + // It should be used to lookup spec.controlPlaneReleaseImage. cpReleaseProvider := &releaseinfo.ProviderWithOpenShiftImageRegistryOverridesDecorator{ Delegate: &releaseinfo.RegistryMirrorProviderDecorator{ Delegate: coreReleaseProvider, @@ -445,6 +453,7 @@ func NewStartCommand() *cobra.Command { if err != nil { return false, err } + // Apparently this is occasionally set to an empty string if hostedClusterConfigOperatorImage == "" { setupLog.Info("hosted cluster config operator image is empty, retrying") return false, nil diff --git a/hypershift-operator/controllers/auditlogpersistence/snapshot_controller.go b/hypershift-operator/controllers/auditlogpersistence/snapshot_controller.go index 9bb17347941..7d0d6a38832 100644 --- a/hypershift-operator/controllers/auditlogpersistence/snapshot_controller.go +++ b/hypershift-operator/controllers/auditlogpersistence/snapshot_controller.go @@ -76,6 +76,7 @@ func (r *SnapshotReconciler) getSnapshotConfig(ctx context.Context) (*auditlogpe return nil, fmt.Errorf("failed to get AuditLogPersistenceConfig: %w", err) } + // Apply defaults to a copy of the spec to avoid modifying the original spec := config.Spec.DeepCopy() ApplyDefaults(spec) @@ -97,6 +98,7 @@ func (r *SnapshotReconciler) getLastObservedRestartCount(ctx context.Context, po if podCopy.Annotations == nil { podCopy.Annotations = make(map[string]string) } + // Reset corrupted annotation to 0 podCopy.Annotations[lastObservedRestartCountAnnotation] = "0" if patchErr := r.client.Patch(ctx, podCopy, client.MergeFrom(pod)); patchErr != nil { log.Error(patchErr, "Failed to reset corrupted annotation") @@ -118,6 +120,7 @@ func (r *SnapshotReconciler) checkSnapshotInterval(ctx context.Context, pod *cor if podCopy.Annotations == nil { podCopy.Annotations = make(map[string]string) } + // Remove corrupted annotation - it will be set correctly after snapshot creation delete(podCopy.Annotations, lastSnapshotTimeAnnotation) if patchErr := r.client.Patch(ctx, podCopy, client.MergeFrom(pod)); patchErr != nil { log.Error(patchErr, "Failed to remove corrupted last snapshot time annotation") diff --git a/hypershift-operator/controllers/hostedcluster/etcd_recovery.go b/hypershift-operator/controllers/hostedcluster/etcd_recovery.go index d4196de80b6..3d67b56e52f 100644 --- a/hypershift-operator/controllers/hostedcluster/etcd_recovery.go +++ b/hypershift-operator/controllers/hostedcluster/etcd_recovery.go @@ -88,6 +88,8 @@ func (r *HostedClusterReconciler) handleExistingEtcdRecoveryJob(ctx context.Cont return false, nil } +// Creating the condition for the first time or in the case of the ETCD fails intermittently. +// If the ETCD keeps failing and recovering, we can see the hcluster.Generation increasing indefinitely. func (r *HostedClusterReconciler) setEtcdRecoveryCondition(ctx context.Context, hcluster *hyperv1.HostedCluster, status metav1.ConditionStatus, reason, message string) error { condition := metav1.Condition{ Type: string(hyperv1.EtcdRecoveryActive), @@ -128,6 +130,7 @@ func (r *HostedClusterReconciler) detectAndTriggerEtcdRecovery(ctx context.Conte } if failingEtcdPod == nil { + // However, if the statefulset is not reporting fully available, check later if !fullyAvailable { return &requeueAfter, nil } diff --git a/hypershift-operator/controllers/hostedcluster/metrics/metrics.go b/hypershift-operator/controllers/hostedcluster/metrics/metrics.go index 8c26a7bb0f4..898d55fa423 100644 --- a/hypershift-operator/controllers/hostedcluster/metrics/metrics.go +++ b/hypershift-operator/controllers/hostedcluster/metrics/metrics.go @@ -91,7 +91,8 @@ const ( HostedClusterManagedAzureInfoMetricName = "hosted_cluster_managed_azure_info" HostedClusterManagedAzureInfoMetricHelp = "Reports Azure managed (ARO) specific information about the given HostedCluster" - HostedClusterManagedAzureResourceType = "hcpOpenShiftClusters" + // see https://github.com/Azure/ARO-HCP/blob/4134b5bb53782858047a0493f31b250c811eb84c/api/redhatopenshift/resource-manager/Microsoft.RedHatOpenShift/hcpclusters/preview/2024-06-10-preview/openapi.json#L131 + HostedClusterManagedAzureResourceType = "hcpOpenShiftClusters" HostedClusterAzureInfoMetricName = "hosted_cluster_azure_info" HostedClusterAzureInfoMetricHelp = "Reports Azure information about the given HostedCluster" @@ -395,6 +396,7 @@ func collectInitialRollingOutMetric(ch chan<- prometheus.Metric, clk clock.Clock } func collectUpgradingDurationMetric(ch chan<- prometheus.Metric, clk clock.Clock, hcluster *hyperv1.HostedCluster, hclusterLabelValues []string) { + // The upgrade is adding a new entry in the history on top of the initial rollout. if hcluster.Status.Version == nil || len(hcluster.Status.Version.History) <= 1 { return } @@ -463,6 +465,7 @@ func (c *hostedClustersMetricsCollector) collectProxyMetrics(ch chan<- prometheu } func (c *hostedClustersMetricsCollector) collectProxyCAMetrics(ch chan<- prometheus.Metric, hcluster *hyperv1.HostedCluster, hclusterLabelValues []string) { + // Only report CA validity if a CA is actually configured proxyCAValid := 0.0 validProxyCondition := meta.FindStatusCondition(hcluster.Status.Conditions, string(hyperv1.ValidProxyConfiguration)) if validProxyCondition != nil && validProxyCondition.Status == metav1.ConditionTrue { @@ -475,6 +478,7 @@ func (c *hostedClustersMetricsCollector) collectProxyCAMetrics(ch chan<- prometh hclusterLabelValues..., ) + // Silently skip expiry time if we can't fetch it proxyExpiryTime := 0.0 expiryTime, err := c.expiryTimeProxyCA(hcluster) if err == nil { @@ -559,6 +563,7 @@ func collectAzureInfoMetrics(ch chan<- prometheus.Metric, hcluster *hyperv1.Host } } +// Use detailed credential status: 0=valid, 1=invalid, 2=unknown func collectAwsCredsMetric(ch chan<- prometheus.Metric, hcluster *hyperv1.HostedCluster, hclusterLabelValues []string) { credStatus := platformaws.GetCredentialStatus(hcluster) ch <- prometheus.MustNewConstMetric( diff --git a/hypershift-operator/controllers/hostedcluster/network_policies.go b/hypershift-operator/controllers/hostedcluster/network_policies.go index 9467de3b8da..f45a5ab6d04 100644 --- a/hypershift-operator/controllers/hostedcluster/network_policies.go +++ b/hypershift-operator/controllers/hostedcluster/network_policies.go @@ -101,6 +101,8 @@ func (r *HostedClusterReconciler) reconcileNetworkPolicies(ctx context.Context, } func (r *HostedClusterReconciler) reconcileIngressNetworkPolicy(ctx context.Context, createOrUpdate upsert.CreateOrUpdateFN, hcp *hyperv1.HostedControlPlane, controlPlaneNamespaceName string) error { + // Only needed when routes are served by the management cluster's default ingress controller, + // i.e., when routes are NOT labeled for the HCP router. policy := networkpolicy.OpenshiftIngressNetworkPolicy(controlPlaneNamespaceName) if !netutil.LabelHCPRoutes(hcp) { if _, err := createOrUpdate(ctx, r.Client, policy, func() error { @@ -328,6 +330,8 @@ func reconcilePrivateRouterNetworkPolicy(policy *networkingv1.NetworkPolicy, _ * "app": "private-router", }, } + // TODO: Network policy code should move to the control plane operator. For now, + // only setup ingress rules (and not egress rules) when version is < 4.14 if ingressOnly { policy.Spec.PolicyTypes = []networkingv1.PolicyType{networkingv1.PolicyTypeIngress} return nil @@ -652,6 +656,15 @@ func reconcileVirtLauncherNetworkPolicy(log logr.Logger, policy *networkingv1.Ne return buildVirtLauncherNetworkPolicyBase(log, policy, hcluster, blockedIPv4Networks, blockedIPv6Networks, controlPlanePeers) } +// reconcileVirtLauncherNetworkPolicyExternalInfra builds the virt-launcher +// NetworkPolicy for deployments where the KubeVirt VMs run on a separate +// infrastructure cluster. Unlike the centralized variant, this omits egress +// rules for control-plane pods (kube-apiserver, oauth, ignition-server-proxy) +// because those pods reside on the management cluster and are reached via +// external IPs already permitted by the broad 0.0.0.0/0 allow rule. +// infraClusterNetwork may be nil when the infra kubeconfig lacks cluster- +// scoped read access to networks.config.openshift.io. In that case the +// policy is still created but without CIDR-based egress blocking. func reconcileVirtLauncherNetworkPolicyExternalInfra(log logr.Logger, policy *networkingv1.NetworkPolicy, hcluster *hyperv1.HostedCluster, infraClusterNetwork *configv1.Network) error { blockedIPv4Networks := []string{} blockedIPv6Networks := []string{} @@ -667,8 +680,9 @@ func reconcileVirtLauncherNetworkPolicyExternalInfra(log logr.Logger, policy *ne return buildVirtLauncherNetworkPolicyBase(log, policy, hcluster, blockedIPv4Networks, blockedIPv6Networks, nil) } -// buildVirtLauncherNetworkPolicyBase constructs the virt-launcher -// NetworkPolicy structure. extraEgressPeers are appended to the primary egress rule +// buildVirtLauncherNetworkPolicyBase constructs the common virt-launcher +// NetworkPolicy structure shared by both centralized and external infra +// deployments. extraEgressPeers are appended to the primary egress rule // (e.g. control-plane pod selectors for centralized infra). func buildVirtLauncherNetworkPolicyBase(log logr.Logger, policy *networkingv1.NetworkPolicy, hcluster *hyperv1.HostedCluster, blockedIPv4Networks, blockedIPv6Networks []string, extraEgressPeers []networkingv1.NetworkPolicyPeer) error { protocolTCP := corev1.ProtocolTCP @@ -1039,6 +1053,11 @@ func (r *HostedClusterReconciler) reconcileExternalInfraVirtLauncherPolicy(ctx c infraClient := kvInfraClient.GetInfraClient() infraNamespace := kvInfraClient.GetInfraNamespace() + // networks.config.openshift.io is cluster-scoped, so the infra + // kubeconfig needs a ClusterRole with get permission on that + // resource. When the permission is missing we still create the + // NetworkPolicy but without CIDR-based egress blocking, and + // surface the RBAC gap as a condition on the HostedCluster. infraClusterNetwork, err := fetchInfraClusterNetwork(ctx, infraClient, infraNamespace, hcluster, log) if err != nil { return err diff --git a/hypershift-operator/controllers/hostedclustersizing/hostedclustersizing_controller.go b/hypershift-operator/controllers/hostedclustersizing/hostedclustersizing_controller.go index 64983a9a73c..f13741230c6 100644 --- a/hypershift-operator/controllers/hostedclustersizing/hostedclustersizing_controller.go +++ b/hypershift-operator/controllers/hostedclustersizing/hostedclustersizing_controller.go @@ -180,6 +180,8 @@ func (r *reconciler) reconcile( config *schedulingv1alpha1.ClusterSizingConfiguration, hostedCluster *hypershiftv1beta1.HostedCluster, ) (*action, error) { if !isConfigValid(config) { + // we can't put clusters into t-shirt sizes unless we have a valid configuration; + // we'll re-trigger when the configuration object changes and can process clusters then return nil, nil } @@ -200,6 +202,8 @@ func (r *reconciler) reconcile( lastTransitionTime, lastSizeClass := previousTransitionFor(hostedCluster) currentSizeClass, sizeClassLabelPresent := hostedCluster.ObjectMeta.Labels[hypershiftv1beta1.HostedClusterSizeLabel] + // we can't update both the status and the labels in one call, so when we have + // updated status but have not yet updated the labels, we just need to do that first if lastTransitionTime != nil && !sizeClassLabelPresent || currentSizeClass != lastSizeClass { return &action{ applyCfg: hypershiftv1beta1applyconfigurations.HostedCluster(hostedCluster.Name, hostedCluster.Namespace). @@ -243,7 +247,7 @@ func (r *reconciler) determineSizeClass( } } logger.Error(fmt.Errorf("could not find a size class for hosted cluster"), "no size can be set on hosted cluster") - return nil, nil + return nil, nil // user needs to reformat the field, returning error is useless } if autoScaling := hostedCluster.Annotations[hypershiftv1beta1.ResourceBasedControlPlaneAutoscalingAnnotation]; autoScaling == "true" { @@ -272,6 +276,7 @@ func (r *reconciler) determineSizeClassFromAutoscaling( } } + // If no recommended size is set, or the recommended size wasn't found, use the first size class as fallback sizeClass := &config.Spec.Sizes[0] if recommendedSize == "" { logger.Info("Resource-based autoscaling enabled but no recommended size set, using first size class", "defaultSize", sizeClass.Name) @@ -349,6 +354,12 @@ func (r *reconciler) transitionSizeClass( } increasingSize := previousMinimumSize < sizeClass.Criteria.From + // For new clusters being added to the fleet, we have an SLA on creation time and can't + // afford to delay the first transition, as it is required for the control plane to schedule. + // For other clusters, we want to limit the amount of churn to promote management plane stability. + // third, we need to know if we're ready to transition the cluster: + // - the hosted cluster has limits to how quickly it can transition up and down, and + // - the management plane has limits to how many clusters can be transitioning at any time if result := r.checkTransitionDelay(config, hostedCluster, sizeClass, increasingSize, lastTransitionTime); result != nil { if result.applyCfg == nil { return &action{requeueAfter: result.requeueAfter}, nil @@ -394,6 +405,7 @@ func (r *reconciler) checkTransitionDelay( sizeClass *schedulingv1alpha1.SizeConfiguration, increasingSize bool, lastTransitionTime *time.Time, ) *action { + // if we transitioned in the past, we need to enforce the delay from there delayStart := time.Time{} if lastTransitionTime != nil { delayStart = *lastTransitionTime diff --git a/hypershift-operator/controllers/nodepool/aws.go b/hypershift-operator/controllers/nodepool/aws.go index f1a847404e4..7f884700c4e 100644 --- a/hypershift-operator/controllers/nodepool/aws.go +++ b/hypershift-operator/controllers/nodepool/aws.go @@ -81,7 +81,7 @@ func awsMachineTemplateSpec(infraName string, hostedCluster *hyperv1.HostedClust instanceMetadataOptions := &capiaws.InstanceMetadataOptions{ HTTPTokens: capiaws.HTTPTokensStateOptional, - HTTPPutResponseHopLimit: 2, + HTTPPutResponseHopLimit: 2, // set to 2 as per AWS recommendation for container envs https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html#imds-considerations HTTPEndpoint: capiaws.InstanceMetadataEndpointStateEnabled, InstanceMetadataTags: capiaws.InstanceMetadataEndpointStateDisabled, } @@ -119,11 +119,13 @@ func awsMachineTemplateSpec(infraName string, hostedCluster *hyperv1.HostedClust } func resolveAWSAMI(hostedCluster *hyperv1.HostedCluster, nodePool *hyperv1.NodePool, releaseImage *releaseinfo.ReleaseImage) (string, error) { + // TODO: Should the region be included in the NodePool platform information? region := hostedCluster.Spec.Platform.AWS.Region arch := nodePool.Spec.Arch if nodePool.Spec.Platform.AWS.AMI != "" { return nodePool.Spec.Platform.AWS.AMI, nil } + // Use Windows AMI mapping when ImageType is set to Windows if nodePool.Spec.Platform.AWS.ImageType == hyperv1.ImageTypeWindows { ami, err := getWindowsAMI(region, arch, releaseImage) if err != nil { @@ -131,6 +133,7 @@ func resolveAWSAMI(hostedCluster *hyperv1.HostedCluster, nodePool *hyperv1.NodeP } return ami, nil } + // Default behavior for Linux/RHCOS AMIs ami, err := defaultNodePoolAMI(region, arch, releaseImage) if err != nil { return "", fmt.Errorf("couldn't discover an AMI for release image: %w", err) @@ -208,8 +211,10 @@ func applyAWSPlacementOptions(nodePool *hyperv1.NodePool, spec *capiaws.AWSMachi } spec.Template.Spec.Tenancy = placement.Tenancy + // Handle market type - placement.MarketType takes precedence over capacityReservation.MarketType (deprecated) switch placement.MarketType { case hyperv1.MarketTypeSpot: + // Spot instances spec.Template.Spec.SpotMarketOptions = &capiaws.SpotMarketOptions{} if placement.Spot.MaxPrice != "" { spec.Template.Spec.SpotMarketOptions.MaxPrice = ptr.To(placement.Spot.MaxPrice) @@ -219,6 +224,7 @@ func applyAWSPlacementOptions(nodePool *hyperv1.NodePool, spec *capiaws.AWSMachi case hyperv1.MarketTypeOnDemand: spec.Template.Spec.MarketType = capiaws.MarketTypeOnDemand default: + // If placement.MarketType is not set, fall back to capacityReservation.MarketType (deprecated) if capacityReservation := placement.CapacityReservation; capacityReservation != nil { //nolint:staticcheck // SA1019: capacityReservation.MarketType is deprecated but supported for backward compatibility switch capacityReservation.MarketType { @@ -227,6 +233,7 @@ func applyAWSPlacementOptions(nodePool *hyperv1.NodePool, spec *capiaws.AWSMachi case hyperv1.MarketTypeOnDemand: spec.Template.Spec.MarketType = capiaws.MarketTypeOnDemand default: + // if the tenancy is not host and the ID is set, default the market type to CapacityBlock if placement.Tenancy != "host" && capacityReservation.ID != nil { spec.Template.Spec.MarketType = capiaws.MarketTypeCapacityBlock } @@ -234,6 +241,7 @@ func applyAWSPlacementOptions(nodePool *hyperv1.NodePool, spec *capiaws.AWSMachi } } + // Handle capacity reservation options if capacityReservation := placement.CapacityReservation; capacityReservation != nil { spec.Template.Spec.CapacityReservationID = capacityReservation.ID spec.Template.Spec.CapacityReservationPreference = capiaws.CapacityReservationPreference(capacityReservation.Preference) diff --git a/hypershift-operator/controllers/nodepool/capi.go b/hypershift-operator/controllers/nodepool/capi.go index b66ebcd1d68..f2965b8a555 100644 --- a/hypershift-operator/controllers/nodepool/capi.go +++ b/hypershift-operator/controllers/nodepool/capi.go @@ -410,6 +410,9 @@ func (c *CAPI) reconcileMachineDeployment(ctx context.Context, log logr.Logger, c.setMachineDeploymentMetadata(machineDeployment, capiClusterName) + // Set defaults. These are normally set by the CAPI machinedeployment webhook. + // However, since we don't run the webhook, CAPI updates the machinedeployment + // after it has been created with defaults. machineDeployment.Spec.MinReadySeconds = ptr.To[int32](0) machineDeployment.Spec.ClusterName = capiClusterName if machineDeployment.Spec.Selector.MatchLabels == nil { @@ -429,6 +432,8 @@ func (c *CAPI) reconcileMachineDeployment(ctx context.Context, log logr.Logger, resourcesName: resourcesName, capiv1.ClusterNameLabel: capiClusterName, }, + // Annotations here propagate down to Machines + // https://cluster-api.sigs.k8s.io/developer/architecture/controllers/metadata-propagation.html#machinedeployment. Annotations: map[string]string{ nodePoolAnnotation: client.ObjectKeyFromObject(nodePool).String(), hyperv1.NodePoolReleaseVersionAnnotation: c.Version(), @@ -451,6 +456,7 @@ func (c *CAPI) reconcileMachineDeployment(ctx context.Context, log logr.Logger, }, } + // This label must be on the MachineDeployment template so the spot MHC can select machines if isSpotEnabled(nodePool) { machineDeployment.Spec.Template.Labels[interruptibleInstanceLabel] = "" } @@ -494,11 +500,13 @@ func (c *CAPI) setMachineDeploymentMetadata(machineDeployment *capiv1.MachineDep } func setMachineDeploymentFailureDomain(nodePool *hyperv1.NodePool, machineDeployment *capiv1.MachineDeployment) { + // The CAPI provider for OpenStack uses the FailureDomain field to set the availability zone. if nodePool.Spec.Platform.Type == hyperv1.OpenStackPlatform && nodePool.Spec.Platform.OpenStack != nil { if nodePool.Spec.Platform.OpenStack.AvailabilityZone != "" { machineDeployment.Spec.Template.Spec.FailureDomain = ptr.To(nodePool.Spec.Platform.OpenStack.AvailabilityZone) } } + // The CAPI provider for GCP uses the FailureDomain field to set the zone. if nodePool.Spec.Platform.Type == hyperv1.GCPPlatform && nodePool.Spec.Platform.GCP != nil { if nodePool.Spec.Platform.GCP.Zone != "" { machineDeployment.Spec.Template.Spec.FailureDomain = ptr.To(nodePool.Spec.Platform.GCP.Zone) @@ -506,6 +514,11 @@ func setMachineDeploymentFailureDomain(nodePool *hyperv1.NodePool, machineDeploy } } +// propagateLabelsAndTaintsToMachines propagates label/taints directly into Machines +// to avoid a NodePool label/taints change triggering a rolling upgrade. +// TODO(Alberto): drop this and rely on core in-place propagation once CAPI 1.4.0 +// https://github.com/kubernetes-sigs/cluster-api/releases comes through the payload. +// https://issues.redhat.com/browse/HOSTEDCP-971 func (c *CAPI) propagateLabelsAndTaintsToMachines(ctx context.Context, log logr.Logger, machineDeployment *capiv1.MachineDeployment) error { nodePool := c.nodePool machineList := &capiv1.MachineList{} @@ -530,6 +543,9 @@ func (c *CAPI) propagateLabelsAndTaintsToMachines(ctx context.Context, log logr. machine.Labels[labelKey] = v } + // Propagate globalPS managed label to Machines so the HCCO Node controller + // applies it to Nodes. This enables the GlobalPullSecret DaemonSet to + // schedule on Replace nodes. Only AWS and Azure platforms support this. if nodePool.Spec.Platform.Type == hyperv1.AWSPlatform || nodePool.Spec.Platform.Type == hyperv1.AzurePlatform { globalPSLabelKey := fmt.Sprintf("%s.%s", labelManagedPrefix, globalPSNodeLabel) machine.Labels[globalPSLabelKey] = "true" @@ -594,6 +610,8 @@ func (c *CAPI) reconcileMachineDeploymentStatus(log logr.Logger, machineDeployme targetConfigHash := c.HashWithoutVersion() targetConfigVersionHash := c.Hash() + // If the MachineDeployment is now processing we know + // is at the expected version (spec.version) and config (userData Secret) so we reconcile status and annotation. if MachineDeploymentComplete(machineDeployment) { if nodePool.Status.Version != targetVersion { log.Info("Version update complete", diff --git a/hypershift-operator/controllers/nodepool/metrics/metrics.go b/hypershift-operator/controllers/nodepool/metrics/metrics.go index d52b458db75..5283c729976 100644 --- a/hypershift-operator/controllers/nodepool/metrics/metrics.go +++ b/hypershift-operator/controllers/nodepool/metrics/metrics.go @@ -390,6 +390,7 @@ func (c *nodePoolsMetricsCollector) Collect(ch chan<- prometheus.Metric) { c.lastCollectTime = currentCollectTime } +// Hosted clusters loop func (c *nodePoolsMetricsCollector) collectHostedClusterData(ctx context.Context) map[string]*hclusterData { hclusterPathToData := make(map[string]*hclusterData) hclusters := &hyperv1.HostedClusterList{} @@ -414,6 +415,7 @@ func (c *nodePoolsMetricsCollector) collectHostedClusterData(ctx context.Context return hclusterPathToData } +// Machine sets loop func (c *nodePoolsMetricsCollector) collectMachineSetReplicas(ctx context.Context) map[string]int32 { result := make(map[string]int32) machineSets := &capiv1.MachineSetList{} @@ -422,11 +424,13 @@ func (c *nodePoolsMetricsCollector) collectMachineSetReplicas(ctx context.Contex } for k := range machineSets.Items { machineSet := &machineSets.Items[k] + // we use machineSet.Spec.Replicas because nodePool.Spec.Replicas will not be set if autoscaling is enabled result[machineSet.Namespace+"/"+machineSet.Name] = *machineSet.Spec.Replicas } return result } +// Machine deployments loop func (c *nodePoolsMetricsCollector) collectMachineDeploymentReplicas(ctx context.Context) map[string]int32 { result := make(map[string]int32) machineDeployments := &capiv1.MachineDeploymentList{} @@ -435,6 +439,7 @@ func (c *nodePoolsMetricsCollector) collectMachineDeploymentReplicas(ctx context } for k := range machineDeployments.Items { md := &machineDeployments.Items[k] + // we use machineDeployment.Spec.Replicas because nodePool.Spec.Replicas will not be set if autoscaling is enabled result[md.Namespace+"/"+md.Name] = *md.Spec.Replicas } return result diff --git a/hypershift-operator/controllers/platform/aws/controller.go b/hypershift-operator/controllers/platform/aws/controller.go index 7d69152e39a..e7779ba5a57 100644 --- a/hypershift-operator/controllers/platform/aws/controller.go +++ b/hypershift-operator/controllers/platform/aws/controller.go @@ -460,6 +460,7 @@ func listKarpenterSubnetIDs(ctx context.Context, c client.Client, namespace stri return subnetIDs, nil } +// If a previous awsendpointservice that points to an ingress controller exists, remove it func (r *AWSEndpointServiceReconciler) deleteObsoleteEndpointService(ctx context.Context, awsEndpointService *hyperv1.AWSEndpointService) (done bool, err error) { endpointServices := &hyperv1.AWSEndpointServiceList{} if err := r.List(ctx, endpointServices, client.InNamespace(awsEndpointService.Namespace)); err != nil { @@ -476,6 +477,7 @@ func (r *AWSEndpointServiceReconciler) deleteObsoleteEndpointService(ctx context hasPrivateIngressControllerEPService = true } } + // Only if both router and private ingress controller AWSEndpointServices exist, delete the obsolete one if !hasPrivateRouterEPService || !hasPrivateIngressControllerEPService { return false, nil } @@ -515,6 +517,7 @@ func (r *AWSEndpointServiceReconciler) ensureVpcEndpointService(ctx context.Cont return "", "", err } if len(output.ServiceConfigurations) == 0 { + // clear the EndpointServiceName so a new Endpoint Service is created on the requeue awsEndpointService.Status.EndpointServiceName = "" return "", "", fmt.Errorf("endpoint service %s not found, resetting status", serviceName) } diff --git a/hypershift-operator/controllers/scheduler/aws/autoscaler.go b/hypershift-operator/controllers/scheduler/aws/autoscaler.go index 90c04d95e0a..89ae151e63d 100644 --- a/hypershift-operator/controllers/scheduler/aws/autoscaler.go +++ b/hypershift-operator/controllers/scheduler/aws/autoscaler.go @@ -387,6 +387,8 @@ func machineSetsToScaleUp(pods []corev1.Pod, machineSets []machinev1beta1.Machin } requiredNodeCounts := determineRequiredNodes(pendingPods, pods, nodes) + // If a specific pair label is required, find the corresponding machinesets + // that are not already scaled up. var placeHoldersNeeded []nodeRequirement for _, r := range requiredNodeCounts { if r.pairLabel != "" { @@ -398,6 +400,7 @@ func machineSetsToScaleUp(pods []corev1.Pod, machineSets []machinev1beta1.Machin result = append(result, machineSetsToScale...) continue } + // Otherwise, we need to find placeholders without a specific pair label placeHoldersNeeded = append(placeHoldersNeeded, r) } @@ -415,6 +418,9 @@ func machineSetsToScaleUp(pods []corev1.Pod, machineSets []machinev1beta1.Machin return result, pendingPods, requiredNodeCounts } +// collectTakenPairLabels returns pair labels that cannot be used for new +// placeholders because they are either (1) assigned to a hosted cluster +// or (2) already have a placeholder scheduled. func collectTakenPairLabels(pods []corev1.Pod, nodes []corev1.Node) sets.Set[string] { takenPairLabels := sets.New[string]() for _, n := range nodes { @@ -434,6 +440,12 @@ func scaleMachineSetsForRequirement(r nodeRequirement, machineSets []machinev1be var result []machinev1beta1.MachineSet needCount := r.count + // Find any available nodes of the specified size. These may not be ready + // yet but will allow scheduling soon. + // Available nodes must: + // 1 - have the request serving label + // 2 - have matching size label + // 3 - have a pair label that is not already taken availableNodes := filterNodes(nodes, func(n *corev1.Node) bool { return n.Labels[hyperv1.RequestServingComponentLabel] != "" && n.Labels[hyperv1.NodeSizeLabel] == r.sizeLabel && @@ -449,6 +461,14 @@ func scaleMachineSetsForRequirement(r nodeRequirement, machineSets []machinev1be } } + // Find machinesets that have already been scaled up but do not have + // any nodes yet. + // Pending machinesets must: + // 1 - have the request serving label + // 2 - have matching size label + // 3 - be scaled up without available replicas + // 4 - not correspond to any available nodes + // 5 - not have a pair label that is assigned to a cluster pendingMachineSets := filterMachineSets(machineSets, func(ms *machinev1beta1.MachineSet) bool { return isRequestServingMachineSet(ms) && machineSetSize(ms) == r.sizeLabel && @@ -463,6 +483,7 @@ func scaleMachineSetsForRequirement(r nodeRequirement, machineSets []machinev1be return nil } + // Scale up the paired machineset for any pending machinesets that need it for _, ms := range pendingMachineSets { if pairedMachineSet := matchingMachineSet(&ms, machineSets); pairedMachineSet != nil { if ptr.Deref(pairedMachineSet.Spec.Replicas, 0) == 0 { @@ -481,6 +502,12 @@ func scaleMachineSetsForRequirement(r nodeRequirement, machineSets []machinev1be } func pickAvailableMachineSetPairs(sizeLabel string, needCount int, machineSets []machinev1beta1.MachineSet, takenPairLabels sets.Set[string]) []machinev1beta1.MachineSet { + // Pick random pairs from available machinesets. + // Available machinesets must: + // 1 - have the request serving label + // 2 - have the corresponding size label + // 3 - not be scaled up + // 4 - have a pair label that is not already taken availableMachineSets := filterMachineSets(machineSets, func(ms *machinev1beta1.MachineSet) bool { return isRequestServingMachineSet(ms) && machineSetSize(ms) == sizeLabel && diff --git a/hypershift-operator/controllers/scheduler/aws/dedicated_request_serving_nodes.go b/hypershift-operator/controllers/scheduler/aws/dedicated_request_serving_nodes.go index 69dbb720cae..750930ff5cc 100644 --- a/hypershift-operator/controllers/scheduler/aws/dedicated_request_serving_nodes.go +++ b/hypershift-operator/controllers/scheduler/aws/dedicated_request_serving_nodes.go @@ -174,6 +174,7 @@ func (r *DedicatedServingComponentScheduler) Reconcile(ctx context.Context, req return ctrl.Result{}, fmt.Errorf("found too many dedicated nodes for HC: %v", len(dedicatedNodesForHC.Items)) } + // We check existing dedicated Nodes are 2. If not e.g. some was deleted, continue. if scheduled := hcluster.Annotations[hyperv1.HostedClusterScheduledAnnotation]; scheduled == "true" && len(dedicatedNodesForHC.Items) == 2 { log.Info("hosted cluster is already scheduled, nothing to do") return ctrl.Result{}, nil @@ -307,6 +308,7 @@ func (r *DedicatedServingComponentScheduler) updateHostedClusterAnnotations(ctx } if node.Labels[schedulerutil.LBSubnetsLabel] != "" && lbSubnets == "" { lbSubnets = node.Labels[schedulerutil.LBSubnetsLabel] + // If subnets are separated by periods, replace them with commas lbSubnets = strings.ReplaceAll(lbSubnets, ".", ",") } if node.Labels[OSDFleetManagerPairedNodesLabel] != "" && pairLabel == "" { @@ -509,9 +511,11 @@ func (r *DedicatedServingComponentSchedulerAndSizer) Reconcile(ctx context.Conte if len(nodesByZone) > 1 { log.Info("sufficient nodes exist for placement") + // If we have enough nodes, update the hosted cluster. if err := schedulerutil.UpdateHostedCluster(ctx, r.Client, hc, desiredSize, &config, goalNodes); err != nil { return ctrl.Result{}, err } + // Ensure we don't have a placeholder deployment, since we have nodes log.Info("removing placeholder") if err := r.deletePlaceholderDeployment(ctx, hc); err != nil { return ctrl.Result{}, err @@ -578,6 +582,10 @@ func (r *DedicatedServingComponentSchedulerAndSizer) backfillOrClaimNodes(ctx co needClusterLabel = append(needClusterLabel, node) } } + // Find any nodes that are in the same fleet manager group and have the right size + // but are not labeled with the hosted cluster label. Ensure that these nodes are labeled + // and tainted with the hosted cluster label. This can happen if not all nodes were labeled/tainted + // when they were initially selected. if len(needClusterLabel) > 0 { log.Info("backfilling node labels") for _, node := range needClusterLabel { @@ -680,6 +688,7 @@ func (r *DedicatedServingComponentSchedulerAndSizer) deployAndLabelPlaceholderNo return ctrl.Result{}, err } } + // Ensure that any placeholder deployment is deleted log.Info("removing placeholder") if err := r.deletePlaceholderDeployment(ctx, hc); err != nil { return ctrl.Result{}, err diff --git a/hypershift-operator/main.go b/hypershift-operator/main.go index 7f54779aeba..29bbad491fb 100644 --- a/hypershift-operator/main.go +++ b/hypershift-operator/main.go @@ -432,6 +432,8 @@ func resolveOperatorImage(ctx context.Context, mgr ctrl.Manager, opts *StartOpti if err := mgr.GetAPIReader().Get(ctx, crclient.ObjectKeyFromObject(me), me); err != nil { return "", fmt.Errorf("failed to get operator pod %s: %w", crclient.ObjectKeyFromObject(me), err) } + // Use the container status to make sure we get the sha256 reference rather than a potentially + // floating tag. for _, container := range me.Status.ContainerStatuses { if container.Name == "operator" { return strings.TrimPrefix(container.ImageID, "docker-pullable://"), nil @@ -446,6 +448,7 @@ func resolveOperatorImage(ctx context.Context, mgr ctrl.Manager, opts *StartOpti if err != nil { return false, err } + // Apparently this is occasionally set to an empty string if operatorImage == "" { log.Info("operator image is empty, retrying") return false, nil @@ -538,6 +541,9 @@ func setupHostedClusterController(ctx context.Context, mgr ctrl.Manager, opts *S } func cleanupLegacyWebhook(ctx context.Context, mgr ctrl.Manager, opts *StartOptions) error { + // Since we dropped the validation webhook server we need to ensure this resource doesn't exist + // otherwise it will intercept kas requests and fail. + // TODO (alberto): dropped in 4.14. if opts.EnableValidatingWebhook { return nil } @@ -629,6 +635,7 @@ func setupPlatformControllers(mgr ctrl.Manager, opts *StartOptions, mgmtClusterC } func setupAzurePlatformController(mgr ctrl.Manager, log logr.Logger) error { + // ARO HCP uses Swift networking, not Private Link Services if azureutil.IsAroHCP() { return nil } @@ -732,6 +739,8 @@ func setupSupportControllers(mgr ctrl.Manager, opts *StartOptions, mgmtClusterCa return fmt.Errorf("unable to create supported version controller: %w", err) } + // If enabled, start controller to ensure UWM stack is enabled and configured + // to remotely write telemetry metrics. if opts.EnableUWMTelemetryRemoteWrite { if err := (&uwmtelemetry.Reconciler{ Namespace: opts.Namespace, diff --git a/ignition-server/controllers/local_ignitionprovider.go b/ignition-server/controllers/local_ignitionprovider.go index d104689af3a..7a17217734d 100644 --- a/ignition-server/controllers/local_ignitionprovider.go +++ b/ignition-server/controllers/local_ignitionprovider.go @@ -234,6 +234,7 @@ func (p *LocalIgnitionProvider) resolveMCOImage(ctx context.Context, imageProvid return "", err } + // Making sure image uses the registry override for disconnected environments mcoComposedImage := checkedMcoImage.String() if mcoComposedImage != mcoImage { mcoImage = mcoComposedImage @@ -286,6 +287,7 @@ func (p *LocalIgnitionProvider) extractImageReferences(ctx context.Context, rele if err != nil { return "", fmt.Errorf("failed to look up release image metadata: %w", err) } + // Replace the release image with the mirrored release image in disconnected environment cases. if p.ReleaseProvider.GetMirroredReleaseImage() != "" { releaseImage = p.ReleaseProvider.GetMirroredReleaseImage() log.Info("using mirrored release image", "releaseImage", releaseImage) @@ -298,6 +300,7 @@ func (p *LocalIgnitionProvider) extractImageReferences(ctx context.Context, rele return releaseImage, nil } +// For Azure and OpenStack, extract the cloud provider config file as MCO input func (p *LocalIgnitionProvider) writeCloudProviderConfig(ctx context.Context, mcoDir string) error { if p.CloudProvider != hyperv1.AzurePlatform && p.CloudProvider != hyperv1.OpenStackPlatform { return nil @@ -344,6 +347,7 @@ func (p *LocalIgnitionProvider) extractClusterConfigBinary(ctx context.Context, return err } + // Making sure image uses the registry override for disconnected environments ccaComposedImage := checkedClusterConfigImage.String() if ccaComposedImage != clusterConfigImage { clusterConfigImage = ccaComposedImage @@ -402,10 +406,12 @@ func (p *LocalIgnitionProvider) runMCO(ctx context.Context, dirs *payloadDirs, r if err != nil { return fmt.Errorf("failed to serialize pull-secret.yaml: %w", err) } + // NOTE: This overwrites the one in the machine-config-server configmap to ensure it's the one that matches the hash used in the token secret. if err = os.WriteFile(fmt.Sprintf("%s/pull-secret.yaml", dirs.configDir), []byte(serializedPullSecret), 0644); err != nil { return fmt.Errorf("failed to write pull secret to config dir: %w", err) } + // args contains the base args that have not changed over time. args := []string{ "bootstrap", fmt.Sprintf("--root-ca=%s/root-ca.crt", dirs.configDir), @@ -440,6 +446,7 @@ func (p *LocalIgnitionProvider) runMCO(ctx context.Context, dirs *payloadDirs, r return fmt.Errorf("machine-config-operator process failed: %w", err) } + // set missing images condition on the HCP if err := p.reconcileValidReleaseInfoCondition(ctx, imageProvider); err != nil { log.Error(err, "failed to reconcile IgnitionValidReleaseInfo condition") } @@ -467,6 +474,8 @@ func (p *LocalIgnitionProvider) copyMCOOutputToMCC(destDir, mccDir, configDir st } } + // Copy machineconfigpool config data to the MCC input directory. This is + // important to override the pools with the ones generated by the CPO. matches, err := filepath.Glob(filepath.Join(configDir, "*.machineconfigpool.yaml")) if err != nil { return fmt.Errorf("failed to list dir %s: %w", configDir, err) @@ -484,10 +493,12 @@ func (p *LocalIgnitionProvider) runMCC(ctx context.Context, dirs *payloadDirs, i log := ctrl.Log.WithName("get-payload") start := time.Now() + // copy the image config out of the configDir and into the mccBaseDir if err := copyFile(filepath.Join(dirs.configDir, "image-config.yaml"), filepath.Join(dirs.mccDir, "image-config.yaml")); err != nil { return fmt.Errorf("failed to copy image-config.yaml: %w", err) } + // args contains the base args that have not changed over time. args := []string{ "bootstrap", fmt.Sprintf("--manifest-dir=%s", dirs.mccDir), @@ -496,6 +507,7 @@ func (p *LocalIgnitionProvider) runMCC(ctx context.Context, dirs *payloadDirs, i fmt.Sprintf("--dest-dir=%s", dirs.mcsDir), } + // For 4.14 onwards there's a requirement to include the payload version flag. if payloadVersion.GTE(semver.Version{Major: 4, Minor: 14}) { args = append(args, fmt.Sprintf("--payload-version=%s", imageProvider.Version()), @@ -515,6 +527,7 @@ func (p *LocalIgnitionProvider) runMCSAndFetchPayload(ctx context.Context, dirs log := ctrl.Log.WithName("get-payload") start := time.Now() + // The certificate is cached across calls to avoid redundant RSA key generation on every payload request. certPEM, keyPEM, err := p.getOrGenerateMCSCert() if err != nil { return nil, fmt.Errorf("failed to generate certificates: %w", err) @@ -536,12 +549,14 @@ func (p *LocalIgnitionProvider) runMCSAndFetchPayload(ctx context.Context, dirs "--insecure-port=22626", } + // For 4.14 onwards there's a requirement to include the payload version flag. if payloadVersion.GTE(semver.Version{Major: 4, Minor: 14}) { args = append(args, fmt.Sprintf("--payload-version=%s", imageProvider.Version()), ) } + // Spin up the MCS process and ensure it's signaled to terminate when the function returns mcsCtx, cancel := context.WithCancel(ctx) defer cancel() cmd := exec.CommandContext(mcsCtx, filepath.Join(dirs.binDir, "machine-config-server"), args...) @@ -554,6 +569,7 @@ func (p *LocalIgnitionProvider) runMCSAndFetchPayload(ctx context.Context, dirs Timeout: 5 * time.Second, } var payload []byte + // Try connecting to the server until we get a response or the context is closed err = wait.PollUntilContextCancel(ctx, 1*time.Second, true, func(ctx context.Context) (bool, error) { req, err := http.NewRequestWithContext(ctx, "GET", "http://localhost:22626/config/master", nil) if err != nil { @@ -692,6 +708,7 @@ func (p *LocalIgnitionProvider) GetPayload(ctx context.Context, releaseImage, cu return nil, fmt.Errorf("failed to parse payload version: %w", err) } + // set the component to the correct binary name and file path based on the payload version clusterConfigComponent := "cluster-config-api" clusterConfigComponentShort := "cca" clusterConfigFile := "usr/bin/render" @@ -719,14 +736,17 @@ func (p *LocalIgnitionProvider) GetPayload(ctx context.Context, releaseImage, cu return nil, fmt.Errorf("failed to execute %s: %w", clusterConfigComponent, err) } + // First, run the MCO using templates and image refs as input. This generates output for the MCC. if err := p.runMCO(ctx, dirs, releaseImage, pullSecret, mcsConfig, imageProvider, payloadVersion); err != nil { return nil, fmt.Errorf("failed to execute machine-config-operator: %w", err) } + // Next, run the MCC using templates and MCO output as input, producing output for the MCS. if err := p.runMCC(ctx, dirs, imageProvider, payloadVersion); err != nil { return nil, fmt.Errorf("failed to execute machine-config-controller: %w", err) } + // Finally, run the MCS to generate a payload. payload, err := p.runMCSAndFetchPayload(ctx, dirs, imageProvider, payloadVersion) if err != nil { return nil, fmt.Errorf("failed to get payload from mcs: %w", err)