Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion test/e2e/v2/lifecycle/azure.go
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,7 @@ func (a *AzurePlatformConfig) ClusterSpecs(releaseImage, n1Image string) []Clust
ExtraArgs: []string{
"--endpoint-access=Private",
"--endpoint-access-private-nat-subnet-id=" + a.privateNATSubnetID,
"--oauth-publishing-strategy=LoadBalancer",
},
},
{
Expand Down Expand Up @@ -194,7 +195,7 @@ func (a *AzurePlatformConfig) TestMatrix(releaseImage string) TestMatrix {
{
Name: "private",
ClusterFile: "cluster-name-private",
LabelFilter: "self-managed-azure-private",
LabelFilter: "self-managed-azure-private || self-managed-azure-oauth-lb-private",
JUnitFile: "junit_self_managed_azure_private.xml",
},
{
Expand Down
98 changes: 98 additions & 0 deletions test/e2e/v2/tests/hosted_cluster_azure_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -286,11 +286,109 @@ func AzureOAuthLoadBalancerTest(getTestCtx internal.TestContextGetter) {
})
}

// AzureOAuthLoadBalancerPrivateTest registers tests for Azure OAuth LoadBalancer
// publishing validation in a private topology cluster.
// These tests verify that:
// - The oauth-openshift Service is created as a LoadBalancer with an allocated endpoint
// - The Service carries the Azure internal LoadBalancer annotation
// - The OAuth token flow (kubeadmin + htpasswd IDP) works through that endpoint
//
// The OAuth token flow test requires the test runner to have network connectivity
// to the Azure VNet (e.g., running in the management cluster or via VPN/peering).
func AzureOAuthLoadBalancerPrivateTest(getTestCtx internal.TestContextGetter) {
Context("Azure OAuth LoadBalancer in Private Topology", Label("Azure", "self-managed-azure-oauth-lb-private"), Ordered, func() {
var testCtx *internal.TestContext
var controlPlaneNamespace string
var hc *hyperv1.HostedCluster

BeforeAll(func() {
testCtx = getTestCtx()
hc = testCtx.GetHostedCluster()
if hc == nil || hc.Spec.Platform.Type != hyperv1.AzurePlatform {
Skip("Azure OAuth LB Private tests are only for Azure platform")
}
if hc.Spec.Platform.Azure == nil || hc.Spec.Platform.Azure.Topology != hyperv1.AzureTopologyPrivate {
Skip("Azure OAuth LB Private tests require Private topology")
}
controlPlaneNamespace = testCtx.ControlPlaneNamespace
Expect(controlPlaneNamespace).NotTo(BeEmpty(), "control plane namespace must be set")

strategy := netutil.ServicePublishingStrategyByTypeByHC(hc, hyperv1.OAuthServer)
if strategy == nil || strategy.Type != hyperv1.LoadBalancer {
Skip("Azure OAuth LB Private tests require OAuthServer with LoadBalancer publishing strategy")
}
})

It("should create oauth-openshift Service as LoadBalancer with an allocated endpoint", func() {
ctx := testCtx.Context

e2eutil.EventuallyObject(GinkgoTB(), ctx, "oauth-openshift Service is LoadBalancer with an allocated endpoint",
func(ctx context.Context) (*corev1.Service, error) {
svc := hcpmanifests.OauthServerService(controlPlaneNamespace)
err := testCtx.MgmtClient.Get(ctx, crclient.ObjectKeyFromObject(svc), svc)
return svc, err
},
[]e2eutil.Predicate[*corev1.Service]{
func(svc *corev1.Service) (done bool, reasons string, err error) {
if svc.Spec.Type != corev1.ServiceTypeLoadBalancer {
return false, fmt.Sprintf("expected Service type LoadBalancer, got %s", svc.Spec.Type), nil
}
return true, "oauth-openshift Service is type LoadBalancer", nil
},
func(svc *corev1.Service) (done bool, reasons string, err error) {
if len(svc.Status.LoadBalancer.Ingress) == 0 {
return false, "LoadBalancer has no ingress entries yet", nil
}
ingress := svc.Status.LoadBalancer.Ingress[0]
if ingress.IP == "" && ingress.Hostname == "" {
return false, "LoadBalancer ingress has no IP or hostname", nil
}
host := ingress.IP
if host == "" {
host = ingress.Hostname
}
return true, fmt.Sprintf("oauth-openshift LoadBalancer has an allocated endpoint: %s", host), nil
},
},
e2eutil.WithTimeout(10*time.Minute),
)
})

It("should have Azure internal LB annotation on oauth-openshift Service", func() {
ctx := testCtx.Context
e2eutil.EventuallyObject(GinkgoTB(), ctx, "oauth-openshift Service has Azure internal LB annotation",
func(ctx context.Context) (*corev1.Service, error) {
svc := hcpmanifests.OauthServerService(controlPlaneNamespace)
err := testCtx.MgmtClient.Get(ctx, crclient.ObjectKeyFromObject(svc), svc)
return svc, err
},
[]e2eutil.Predicate[*corev1.Service]{
func(svc *corev1.Service) (done bool, reasons string, err error) {
val, ok := svc.Annotations[azureutil.InternalLoadBalancerAnnotation]
if !ok || val != azureutil.InternalLoadBalancerValue {
return false, fmt.Sprintf("expected annotation %q to be %q, got %q (present: %v)",
azureutil.InternalLoadBalancerAnnotation, azureutil.InternalLoadBalancerValue, val, ok), nil
}
return true, "oauth-openshift Service has internal LB annotation", nil
},
},
e2eutil.WithTimeout(10*time.Minute),
)
})

It("should complete OAuth token flow through LoadBalancer endpoint", func() {
ctx := testCtx.Context
e2eutil.ValidateOAuthWithIdentityProviderViaLoadBalancer(GinkgoTB(), ctx, testCtx.MgmtClient, hc)
})
Comment on lines +379 to +382

@coderabbitai coderabbitai Bot May 26, 2026

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major | ⚡ Quick win

Restore cluster auth state after OAuth flow mutation.

Line 381 calls ValidateOAuthWithIdentityProviderViaLoadBalancer, which mutates HostedCluster OAuth config and creates an htpasswd Secret without restoring/deleting them. This makes outcomes order-dependent on shared clusters. Please capture original state and defer restoration, with apierrors.IsNotFound() handling during cleanup.

As per coding guidelines: "When a test mutates cluster state, capture original state before mutation and defer restoration; check apierrors.IsNotFound() in cleanup to handle already-deleted resources".

🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@test/e2e/v2/tests/hosted_cluster_azure_test.go` around lines 379 - 382, The
test calls ValidateOAuthWithIdentityProviderViaLoadBalancer which mutates the
HostedCluster OAuth config and creates an htpasswd Secret without restoring
them; capture the original HostedCluster.Spec.OAuth (and any other mutated
fields) and the existence/data of the htpasswd Secret before calling
ValidateOAuthWithIdentityProviderViaLoadBalancer, then defer a restoration
function that re-applies the original OAuth config and deletes or restores the
htpasswd Secret, and in that deferred cleanup handle apierrors.IsNotFound(error)
when attempting to restore/delete resources so cleanup tolerates resources
already removed; use the HostedCluster object and the Secret name returned/used
by ValidateOAuthWithIdentityProviderViaLoadBalancer to identify what to restore.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The mutation happens inside the shared helper ValidateOAuthWithIdentityProviderViaLoadBalancer (in test/e2e/util/oauth.go), which is also used by the existing AzureOAuthLoadBalancerTest (line 284, same file) without cleanup. Adding cleanup in the caller would mean duplicating restore logic that belongs in the helper itself.

This is a pre-existing concern, not introduced by this PR. Noted in the PR description as a known tradeoff. If we address it, it should be done in the shared helper as a follow-up.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@Nirshal, that's a fair point — the concern is pre-existing and your reasoning is correct. Duplicating restore logic in each caller would be the wrong approach; the cleanup belongs inside ValidateOAuthWithIdentityProviderViaLoadBalancer itself so all callers benefit automatically.

Would you like me to open a follow-up GitHub issue to track adding htpasswd Secret cleanup (and OAuth config restoration) to the shared helper?


🧠 Learnings used
Learnt from: cblecker
Repo: openshift/hypershift PR: 8511
File: test/e2e/v2/tests/hosted_cluster_metrics_test.go:203-245
Timestamp: 2026-05-15T03:34:00.446Z
Learning: When reviewing HyperShift v2 e2e tests under test/e2e/v2/, do not require MicroShift “guard” logic or markers. HyperShift v2 e2e tests in this directory run only against hosted clusters (no MicroShift targets), so missing exutil.IsMicroShiftCluster() checks and/or [Skipped:MicroShift] labels should not be flagged.

Learnt from: bryan-cox
Repo: openshift/hypershift PR: 8527
File: test/e2e/v2/tests/nodepool_autoscaling_test.go:236-236
Timestamp: 2026-05-15T14:34:25.062Z
Learning: In openshift/hypershift v2 e2e lifecycle CI tests under test/e2e/v2/tests/, it is OK to reference external image registries because the CI environment is not air-gapped. Image references like registry.access.redhat.com/ubi9/ubi-minimal:latest in lifecycle DaemonSet definitions should not be flagged for requiring internal registries. For the autoscaling workload created by newAutoscalingWorkload (nodepool_autoscaling_test.go), ensure it uses registry.k8s.io/pause:3.9 (not ubi-minimal).

Learnt from: bryan-cox
Repo: openshift/hypershift PR: 8469
File: test/e2e/util/util_test.go:17-131
Timestamp: 2026-05-17T22:37:02.057Z
Learning: In Go e2e tests under test/e2e, do not call t.Parallel() (neither at the top-level test nor in any subtest) when the test (or any helper it calls) uses t.Setenv—for example, when calling azureutil.SetAsAroHCPTest(t). The Go test runner panics with "testing: test using t.Setenv can not use t.Parallel" because environment variables are process-global. Keep such tests serial to ensure the environment setup is safe and deterministic.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The CI run showed this test can't pass as-is — WaitForOAuthLoadBalancerReady (in test/e2e/util/oauth.go:350-375) does a direct TLS dial to https://<oauth-hostname>/healthz. For private topology that hostname resolves to the Azure internal LB IP (e.g., 10.0.0.22), which is unreachable from the CI build cluster.

Interestingly the OAuth token flow itself succeeds because it routes through the guest kubeconfig transport — it's specifically the raw healthcheck that fails. So the underlying validation logic works; it's just the connectivity check that's incompatible with private topology.

The fix needs to happen in the shared helper. A few approaches:

  1. Skip WaitForOAuthLoadBalancerReady entirely for private topology and rely on the token flow validation alone — it already exercises the full OAuth stack end-to-end.
  2. Use the guest kubeconfig transport for the healthcheck too, the same way the token flow step does.
  3. Route it through a pod on the management cluster that has VNet connectivity.

This needs to be addressed in the helper before this can land.

})
}

// RegisterHostedClusterAzureTests registers all Azure-specific hosted cluster tests.
func RegisterHostedClusterAzureTests(getTestCtx internal.TestContextGetter) {
AzurePublicClusterTest(getTestCtx)
AzurePrivateTopologyTest(getTestCtx)
AzureOAuthLoadBalancerTest(getTestCtx)
AzureOAuthLoadBalancerPrivateTest(getTestCtx)
}

var _ = Describe("Hosted Cluster Azure", Label("hosted-cluster-azure"), func() {

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The public variant (AzureOAuthLoadBalancerTest) works cleanly because it runs on a dedicated cluster (cluster-name-oauth-lb) created specifically for it — mutation is safe there since nothing else runs on that cluster. This private variant is trying to replicate that pattern but without the dedicated cluster, sharing cluster-name-private with the read-only AzurePrivateTopologyTest instead.

Registering this last is a reasonable mitigation, but declaration order between Context blocks isn't enforced by Ginkgo's Ordered decorator — that only applies within a single Context. If the registration order changes, or a new test gets inserted between the two, the read-only private topology tests could run after the HC has been mutated (htpasswd IDP created, kubeadmin secret deleted).

The cleanest fix that matches how the public variant is structured would be a dedicated cluster-name-oauth-lb-private cluster. I understand the reviewer feedback to reduce cluster count, so if a dedicated cluster is off the table, the other option is to make this a sequential step in the test matrix (after the read-only private tests), the way etcd-chaos runs sequentially after upgrade on the upgrade cluster. That would give the same isolation guarantee without an extra cluster.

Expand Down