From 4d34fd3ce144141a6399f6ee7afc8d33a20bd5c3 Mon Sep 17 00:00:00 2001 From: enxebre Date: Thu, 19 Feb 2026 12:34:40 +0100 Subject: [PATCH 1/3] feat(cpo): expose KAS availability and latency metrics MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Instrument the existing healthCheckKASEndpoint function in the control-plane-operator to expose two new Prometheus metrics: - hypershift_kube_apiserver_available: Gauge reporting 1 when the KAS /healthz endpoint returns HTTP 200, 0 otherwise. - hypershift_kube_apiserver_request_duration_seconds: Histogram tracking the latency of the /healthz health check probe. These metrics are registered with the controller-runtime metrics registry and are automatically scraped by the existing PodMonitor for the control-plane-operator. Each CPO pod runs in its own HCP namespace, so metrics are naturally scoped per hosted cluster. The existing HostedControlPlaneAvailable condition logic is unchanged — metrics are a side-effect of the existing health check, not a replacement. This eliminates the need for external monitoring tooling (e.g. route-monitor-operator) to track KAS availability and latency for SLA purposes in HCP offerings. Ref: CNTRLPLANE-2775 Co-Authored-By: Claude Opus 4.6 (1M context) --- .../hostedcontrolplane_controller.go | 27 +++++++++--- .../hostedcontrolplane/kas/metrics.go | 42 +++++++++++++++++++ control-plane-operator/main.go | 3 ++ 3 files changed, 67 insertions(+), 5 deletions(-) create mode 100644 control-plane-operator/controllers/hostedcontrolplane/kas/metrics.go diff --git a/control-plane-operator/controllers/hostedcontrolplane/hostedcontrolplane_controller.go b/control-plane-operator/controllers/hostedcontrolplane/hostedcontrolplane_controller.go index 629ab0e2b60..c70e9fd0185 100644 --- a/control-plane-operator/controllers/hostedcontrolplane/hostedcontrolplane_controller.go +++ b/control-plane-operator/controllers/hostedcontrolplane/hostedcontrolplane_controller.go @@ -180,6 +180,7 @@ type HostedControlPlaneReconciler struct { OperateOnReleaseImage string DefaultIngressDomain string MetricsSet metrics.MetricsSet + KASHealthMetrics *kas.KASHealthMetrics SREConfigHash string ec2Client awsapi.EC2API awsSession *aws.Config @@ -944,9 +945,9 @@ func (r *HostedControlPlaneReconciler) healthCheckKASLoadBalancers(ctx context.C // When the cluster is private, checking the load balancers will depend on whether the load balancer is // using the right subnets. To avoid uncertainty, we'll limit the check to the service endpoint. if hcp.Spec.Platform.Type == hyperv1.IBMCloudPlatform { - return healthCheckKASEndpoint(manifests.KubeAPIServerService("").Name, config.KASSVCIBMCloudPort) + return healthCheckKASEndpoint(manifests.KubeAPIServerService("").Name, config.KASSVCIBMCloudPort, r.KASHealthMetrics) } - return healthCheckKASEndpoint(manifests.KubeAPIServerService("").Name, config.KASSVCPort) + return healthCheckKASEndpoint(manifests.KubeAPIServerService("").Name, config.KASSVCPort, r.KASHealthMetrics) case serviceStrategy.Type == hyperv1.Route: if hcp.Spec.Platform.Type != hyperv1.IBMCloudPlatform { externalRoute := manifests.KubeAPIServerExternalPublicRoute(hcp.Namespace) @@ -958,7 +959,7 @@ func (r *HostedControlPlaneReconciler) healthCheckKASLoadBalancers(ctx context.C if err != nil { return err } - return healthCheckKASEndpoint(endpoint, port) + return healthCheckKASEndpoint(endpoint, port, r.KASHealthMetrics) } case serviceStrategy.Type == hyperv1.LoadBalancer: svc := manifests.KubeAPIServerService(hcp.Namespace) @@ -987,17 +988,33 @@ func (r *HostedControlPlaneReconciler) healthCheckKASLoadBalancers(ctx context.C } else if LBIngress.IP != "" { ingressPoint = LBIngress.IP } - return healthCheckKASEndpoint(ingressPoint, port) + return healthCheckKASEndpoint(ingressPoint, port, r.KASHealthMetrics) } return nil } -func healthCheckKASEndpoint(ingressPoint string, port int) error { +func healthCheckKASEndpoint(ingressPoint string, port int, m *kas.KASHealthMetrics) error { healthEndpoint := fmt.Sprintf("https://%s:%d/healthz", ingressPoint, port) httpClient := util.InsecureHTTPClient() httpClient.Timeout = 10 * time.Second + + start := time.Now() resp, err := httpClient.Get(healthEndpoint) + duration := time.Since(start).Seconds() + if resp != nil { + defer resp.Body.Close() + } + + if m != nil { + m.RequestDuration.Observe(duration) + if err == nil && resp != nil && resp.StatusCode == http.StatusOK { + m.Available.Set(1) + } else { + m.Available.Set(0) + } + } + if err != nil { return err } diff --git a/control-plane-operator/controllers/hostedcontrolplane/kas/metrics.go b/control-plane-operator/controllers/hostedcontrolplane/kas/metrics.go new file mode 100644 index 00000000000..1f6b7dacb76 --- /dev/null +++ b/control-plane-operator/controllers/hostedcontrolplane/kas/metrics.go @@ -0,0 +1,42 @@ +package kas + +import ( + "github.com/prometheus/client_golang/prometheus" +) + +const ( + KASAvailableMetricName = "hypershift_kube_apiserver_available" + KASRequestDurationMetricName = "hypershift_kube_apiserver_request_duration_seconds" +) + +// kasRequestDurationBuckets defines the histogram bucket boundaries for KAS +// health check latency measurements, ranging from 10ms to 10s. +var kasRequestDurationBuckets = []float64{0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10} + +// KASHealthMetrics holds Prometheus metrics for KAS health check probes. +// Each CPO pod runs in its own HCP namespace, so metrics are naturally +// scoped per hosted cluster without needing additional labels. +type KASHealthMetrics struct { + Available prometheus.Gauge + RequestDuration prometheus.Histogram +} + +// NewKASHealthMetrics creates and registers KAS health metrics with the +// controller-runtime metrics registry. The existing PodMonitor for the +// control-plane-operator will automatically scrape these metrics. +func NewKASHealthMetrics(reg prometheus.Registerer) *KASHealthMetrics { + m := &KASHealthMetrics{ + Available: prometheus.NewGauge(prometheus.GaugeOpts{ + Name: KASAvailableMetricName, + Help: "1 if the KAS /healthz endpoint returns HTTP 200, 0 otherwise.", + }), + RequestDuration: prometheus.NewHistogram(prometheus.HistogramOpts{ + Name: KASRequestDurationMetricName, + Help: "Latency of the KAS /healthz health check probe in seconds.", + Buckets: kasRequestDurationBuckets, + }), + } + + reg.MustRegister(m.Available, m.RequestDuration) + return m +} diff --git a/control-plane-operator/main.go b/control-plane-operator/main.go index 6fde842dde1..41d2f36ece6 100644 --- a/control-plane-operator/main.go +++ b/control-plane-operator/main.go @@ -16,6 +16,7 @@ import ( "github.com/openshift/hypershift/control-plane-operator/controllers/gcpprivateserviceconnect" "github.com/openshift/hypershift/control-plane-operator/controllers/healthcheck" "github.com/openshift/hypershift/control-plane-operator/controllers/hostedcontrolplane" + "github.com/openshift/hypershift/control-plane-operator/controllers/hostedcontrolplane/kas" "github.com/openshift/hypershift/control-plane-operator/controllers/hostedcontrolplane/manifests" endpointresolver "github.com/openshift/hypershift/control-plane-operator/endpoint-resolver" "github.com/openshift/hypershift/control-plane-operator/featuregates" @@ -67,6 +68,7 @@ import ( crclient "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/healthz" "sigs.k8s.io/controller-runtime/pkg/log/zap" + crmetrics "sigs.k8s.io/controller-runtime/pkg/metrics" metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server" "sigs.k8s.io/controller-runtime/pkg/webhook" @@ -530,6 +532,7 @@ func NewStartCommand() *cobra.Command { OperateOnReleaseImage: os.Getenv("OPERATE_ON_RELEASE_IMAGE"), DefaultIngressDomain: defaultIngressDomain, MetricsSet: metricsSet, + KASHealthMetrics: kas.NewKASHealthMetrics(crmetrics.Registry), CertRotationScale: certRotationScale, EnableCVOManagementClusterMetricsAccess: enableCVOManagementClusterMetricsAccess, ImageMetadataProvider: imageMetaDataProvider, From b4f80e4245e68d34ddb54f736d584f0c2a41e391 Mon Sep 17 00:00:00 2001 From: enxebre Date: Thu, 19 Feb 2026 12:34:49 +0100 Subject: [PATCH 2/3] test(cpo): add unit tests for KAS health metrics Add tests for the healthCheckKASEndpoint function that verify metrics are correctly recorded during health check probes: - Gauge set to 1 and histogram observed on successful 200 response - Gauge set to 0 on non-200 response (503) - Gauge set to 0 on connection error (unreachable endpoint) - No panic when metrics is nil (backward compatibility) Also add a basic test for KASHealthMetrics construction and registration. Ref: CNTRLPLANE-2775 Co-Authored-By: Claude Opus 4.6 (1M context) --- .../hostedcontrolplane_controller_test.go | 86 +++++++++++++++++++ .../hostedcontrolplane/kas/metrics_test.go | 40 +++++++++ 2 files changed, 126 insertions(+) create mode 100644 control-plane-operator/controllers/hostedcontrolplane/kas/metrics_test.go diff --git a/control-plane-operator/controllers/hostedcontrolplane/hostedcontrolplane_controller_test.go b/control-plane-operator/controllers/hostedcontrolplane/hostedcontrolplane_controller_test.go index f6c060f3e05..bed04d83dee 100644 --- a/control-plane-operator/controllers/hostedcontrolplane/hostedcontrolplane_controller_test.go +++ b/control-plane-operator/controllers/hostedcontrolplane/hostedcontrolplane_controller_test.go @@ -4,7 +4,11 @@ import ( "context" _ "embed" "fmt" + "net/http" + "net/http/httptest" + "net/url" "sort" + "strconv" "strings" "testing" "time" @@ -15,6 +19,7 @@ import ( "github.com/openshift/hypershift/api/util/ipnet" "github.com/openshift/hypershift/control-plane-operator/controllers/hostedcontrolplane/common" "github.com/openshift/hypershift/control-plane-operator/controllers/hostedcontrolplane/infra" + "github.com/openshift/hypershift/control-plane-operator/controllers/hostedcontrolplane/kas" "github.com/openshift/hypershift/control-plane-operator/controllers/hostedcontrolplane/manifests" endpointresolverv2 "github.com/openshift/hypershift/control-plane-operator/controllers/hostedcontrolplane/v2/endpoint_resolver" etcdv2 "github.com/openshift/hypershift/control-plane-operator/controllers/hostedcontrolplane/v2/etcd" @@ -77,6 +82,8 @@ import ( "github.com/docker/distribution" "github.com/go-logr/zapr" "github.com/opencontainers/go-digest" + "github.com/prometheus/client_golang/prometheus" + prometheustestutil "github.com/prometheus/client_golang/prometheus/testutil" "go.uber.org/mock/gomock" "go.uber.org/zap/zaptest" ) @@ -4431,3 +4438,82 @@ var _ util.ImageMetadataProvider = &fakeVersionImageMetadataProvider{} // Compile-time assertion for clock interface used by tests. var _ clock.Clock = &testingclock.FakeClock{} + +func TestHealthCheckKASEndpointMetrics(t *testing.T) { + t.Run("When KAS endpoint returns 200, it should set available to 1 and record duration", func(t *testing.T) { + g := NewGomegaWithT(t) + + server := httptest.NewTLSServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusOK) + })) + defer server.Close() + + m := newTestKASHealthMetrics(t) + host, port := parseHostPort(t, server) + + err := healthCheckKASEndpoint(host, port, m) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(prometheustestutil.ToFloat64(m.Available)).To(Equal(float64(1))) + g.Expect(prometheustestutil.CollectAndCount(m.RequestDuration)).To(Equal(1)) + }) + + t.Run("When KAS endpoint returns 503, it should set available to 0", func(t *testing.T) { + g := NewGomegaWithT(t) + + server := httptest.NewTLSServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusServiceUnavailable) + })) + defer server.Close() + + m := newTestKASHealthMetrics(t) + host, port := parseHostPort(t, server) + + err := healthCheckKASEndpoint(host, port, m) + g.Expect(err).To(HaveOccurred()) + g.Expect(prometheustestutil.ToFloat64(m.Available)).To(Equal(float64(0))) + g.Expect(prometheustestutil.CollectAndCount(m.RequestDuration)).To(Equal(1)) + }) + + t.Run("When KAS endpoint is unreachable, it should set available to 0", func(t *testing.T) { + g := NewGomegaWithT(t) + + m := newTestKASHealthMetrics(t) + + err := healthCheckKASEndpoint("192.0.2.1", 1, m) + g.Expect(err).To(HaveOccurred()) + g.Expect(prometheustestutil.ToFloat64(m.Available)).To(Equal(float64(0))) + g.Expect(prometheustestutil.CollectAndCount(m.RequestDuration)).To(Equal(1)) + }) + + t.Run("When metrics is nil, it should not panic", func(t *testing.T) { + g := NewGomegaWithT(t) + + server := httptest.NewTLSServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusOK) + })) + defer server.Close() + + host, port := parseHostPort(t, server) + err := healthCheckKASEndpoint(host, port, nil) + g.Expect(err).NotTo(HaveOccurred()) + }) +} + +func newTestKASHealthMetrics(t *testing.T) *kas.KASHealthMetrics { + t.Helper() + return kas.NewKASHealthMetrics(prometheus.NewRegistry()) +} + +func parseHostPort(t *testing.T, server *httptest.Server) (string, int) { + t.Helper() + u, err := url.Parse(server.URL) + if err != nil { + t.Fatalf("failed to parse server URL: %v", err) + } + host := u.Hostname() + port, err := strconv.Atoi(u.Port()) + if err != nil { + t.Fatalf("failed to parse port: %v", err) + } + return host, port +} diff --git a/control-plane-operator/controllers/hostedcontrolplane/kas/metrics_test.go b/control-plane-operator/controllers/hostedcontrolplane/kas/metrics_test.go new file mode 100644 index 00000000000..7be84be5f2e --- /dev/null +++ b/control-plane-operator/controllers/hostedcontrolplane/kas/metrics_test.go @@ -0,0 +1,40 @@ +package kas + +import ( + "testing" + + . "github.com/onsi/gomega" + + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/testutil" +) + +func TestKASHealthMetricsOperations(t *testing.T) { + t.Parallel() + + t.Run("When creating KAS health metrics, it should register both metrics", func(t *testing.T) { + g := NewGomegaWithT(t) + registry := prometheus.NewRegistry() + m := NewKASHealthMetrics(registry) + + g.Expect(testutil.ToFloat64(m.Available)).To(Equal(0.0)) + }) + + t.Run("When setting availability to 1, it should reflect the new value", func(t *testing.T) { + g := NewGomegaWithT(t) + registry := prometheus.NewRegistry() + m := NewKASHealthMetrics(registry) + + m.Available.Set(1) + g.Expect(testutil.ToFloat64(m.Available)).To(Equal(1.0)) + }) + + t.Run("When observing a request duration, it should be counted", func(t *testing.T) { + g := NewGomegaWithT(t) + registry := prometheus.NewRegistry() + m := NewKASHealthMetrics(registry) + + m.RequestDuration.Observe(0.5) + g.Expect(testutil.CollectAndCount(m.RequestDuration)).To(Equal(1)) + }) +} From db0c022155ae7c5438a38e2dc91cce202f727894 Mon Sep 17 00:00:00 2001 From: enxebre Date: Thu, 19 Feb 2026 12:34:59 +0100 Subject: [PATCH 3/3] test(e2e): validate KAS health metrics from CPO pod Add ValidateCPOMetrics function that verifies both hypershift_kube_apiserver_available and hypershift_kube_apiserver_request_duration_seconds metrics are present on the control-plane-operator pod's metrics endpoint (port 8080). The validation runs as an inline check in TestCreateCluster alongside other Ensure* validations, rather than in the after() hook, to avoid blocking all e2e tests if metrics emission is unstable. It follows the established pattern using GetMetricsFromPod with polling (10s interval, 5min timeout). Ref: CNTRLPLANE-2775 Co-Authored-By: Claude Opus 4.6 (1M context) --- test/e2e/create_cluster_test.go | 1 + test/e2e/util/util.go | 41 +++++++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+) diff --git a/test/e2e/create_cluster_test.go b/test/e2e/create_cluster_test.go index 75b6daffa82..9dfa4f7eaa8 100644 --- a/test/e2e/create_cluster_test.go +++ b/test/e2e/create_cluster_test.go @@ -125,6 +125,7 @@ func TestCreateCluster(t *testing.T) { e2eutil.EnsureGlobalPullSecret(t, ctx, mgtClient, hostedCluster, globalOpts.AdditionalPullSecretFile) e2eutil.EnsureMetricsForwarderWorking(t, ctx, mgtClient, hostedCluster) + e2eutil.ValidateCPOMetrics(t, ctx, mgtClient, hostedCluster) // Verify CPO override image if TEST_CPO_OVERRIDE=1 is set if os.Getenv("TEST_CPO_OVERRIDE") == "1" { diff --git a/test/e2e/util/util.go b/test/e2e/util/util.go index 47e9b812fb1..d2ff2a27586 100644 --- a/test/e2e/util/util.go +++ b/test/e2e/util/util.go @@ -24,6 +24,7 @@ import ( awsinfra "github.com/openshift/hypershift/cmd/infra/aws" awsutil "github.com/openshift/hypershift/cmd/infra/aws/util" awsprivatelink "github.com/openshift/hypershift/control-plane-operator/controllers/awsprivatelink" + "github.com/openshift/hypershift/control-plane-operator/controllers/hostedcontrolplane/kas" cpomanifests "github.com/openshift/hypershift/control-plane-operator/controllers/hostedcontrolplane/manifests" hccokasvap "github.com/openshift/hypershift/control-plane-operator/hostedclusterconfigoperator/controllers/resources/kas" hccomanifests "github.com/openshift/hypershift/control-plane-operator/hostedclusterconfigoperator/controllers/resources/manifests" @@ -2758,6 +2759,8 @@ const ( // TODO (jparrill): We need to separate the metrics.go from the main pkg in the hypershift-operator. // Delete these references when it's done and import it from there HypershiftOperatorInfoName = "hypershift_operator_info" + + cpoMetricsPort = "8080" ) func extractDataFromFamilies(metricFamilies map[string]*dto.MetricFamily, metric, labelKey, labelValue string) []*dto.LabelPair { @@ -2854,6 +2857,44 @@ func ValidateMetrics(t *testing.T, ctx context.Context, client crclient.Client, }) } +// ValidateCPOMetrics verifies that KAS health metrics are exposed from the +// control-plane-operator pod in the HCP namespace. +func ValidateCPOMetrics(t *testing.T, ctx context.Context, c crclient.Client, hc *hyperv1.HostedCluster) { + t.Run("When KAS health metrics are exposed, it should contain availability and latency data", func(t *testing.T) { + AtLeast(t, Version423) + if hc.Spec.Platform.Type == hyperv1.NonePlatform { + t.Skip("skipping on None platform") + } + + kasMetrics := []string{ + kas.KASAvailableMetricName, + kas.KASRequestDurationMetricName, + } + hcpNamespace := manifests.HostedControlPlaneNamespace(hc.Namespace, hc.Name) + + err := wait.PollUntilContextTimeout(ctx, 10*time.Second, 5*time.Minute, true, func(ctx context.Context) (bool, error) { + mf, err := GetMetricsFromPod(ctx, c, "control-plane-operator", "control-plane-operator", hcpNamespace, cpoMetricsPort) + if err != nil { + t.Logf("unable to get CPO metrics: %v", err) + return false, nil + } + for _, metricName := range kasMetrics { + // These metrics are emitted without labels, so check family presence directly + // rather than using ValidateMetricPresence which relies on label iteration. + family, ok := mf[metricName] + if !ok || len(family.Metric) == 0 { + t.Logf("Expected results for metric %q, found none", metricName) + return false, nil + } + } + return true, nil + }) + if err != nil { + t.Errorf("failed to validate KAS health metrics: %v", err) + } + }) +} + func getIngressRouterDefaultIP(t *testing.T, ctx context.Context, client crclient.Client, _ *hyperv1.HostedCluster) (string, error) { t.Helper()