diff --git a/control-plane-operator/controllers/hostedcontrolplane/hostedcontrolplane_controller.go b/control-plane-operator/controllers/hostedcontrolplane/hostedcontrolplane_controller.go index 629ab0e2b60..c70e9fd0185 100644 --- a/control-plane-operator/controllers/hostedcontrolplane/hostedcontrolplane_controller.go +++ b/control-plane-operator/controllers/hostedcontrolplane/hostedcontrolplane_controller.go @@ -180,6 +180,7 @@ type HostedControlPlaneReconciler struct { OperateOnReleaseImage string DefaultIngressDomain string MetricsSet metrics.MetricsSet + KASHealthMetrics *kas.KASHealthMetrics SREConfigHash string ec2Client awsapi.EC2API awsSession *aws.Config @@ -944,9 +945,9 @@ func (r *HostedControlPlaneReconciler) healthCheckKASLoadBalancers(ctx context.C // When the cluster is private, checking the load balancers will depend on whether the load balancer is // using the right subnets. To avoid uncertainty, we'll limit the check to the service endpoint. if hcp.Spec.Platform.Type == hyperv1.IBMCloudPlatform { - return healthCheckKASEndpoint(manifests.KubeAPIServerService("").Name, config.KASSVCIBMCloudPort) + return healthCheckKASEndpoint(manifests.KubeAPIServerService("").Name, config.KASSVCIBMCloudPort, r.KASHealthMetrics) } - return healthCheckKASEndpoint(manifests.KubeAPIServerService("").Name, config.KASSVCPort) + return healthCheckKASEndpoint(manifests.KubeAPIServerService("").Name, config.KASSVCPort, r.KASHealthMetrics) case serviceStrategy.Type == hyperv1.Route: if hcp.Spec.Platform.Type != hyperv1.IBMCloudPlatform { externalRoute := manifests.KubeAPIServerExternalPublicRoute(hcp.Namespace) @@ -958,7 +959,7 @@ func (r *HostedControlPlaneReconciler) healthCheckKASLoadBalancers(ctx context.C if err != nil { return err } - return healthCheckKASEndpoint(endpoint, port) + return healthCheckKASEndpoint(endpoint, port, r.KASHealthMetrics) } case serviceStrategy.Type == hyperv1.LoadBalancer: svc := manifests.KubeAPIServerService(hcp.Namespace) @@ -987,17 +988,33 @@ func (r *HostedControlPlaneReconciler) healthCheckKASLoadBalancers(ctx context.C } else if LBIngress.IP != "" { ingressPoint = LBIngress.IP } - return healthCheckKASEndpoint(ingressPoint, port) + return healthCheckKASEndpoint(ingressPoint, port, r.KASHealthMetrics) } return nil } -func healthCheckKASEndpoint(ingressPoint string, port int) error { +func healthCheckKASEndpoint(ingressPoint string, port int, m *kas.KASHealthMetrics) error { healthEndpoint := fmt.Sprintf("https://%s:%d/healthz", ingressPoint, port) httpClient := util.InsecureHTTPClient() httpClient.Timeout = 10 * time.Second + + start := time.Now() resp, err := httpClient.Get(healthEndpoint) + duration := time.Since(start).Seconds() + if resp != nil { + defer resp.Body.Close() + } + + if m != nil { + m.RequestDuration.Observe(duration) + if err == nil && resp != nil && resp.StatusCode == http.StatusOK { + m.Available.Set(1) + } else { + m.Available.Set(0) + } + } + if err != nil { return err } diff --git a/control-plane-operator/controllers/hostedcontrolplane/hostedcontrolplane_controller_test.go b/control-plane-operator/controllers/hostedcontrolplane/hostedcontrolplane_controller_test.go index f6c060f3e05..bed04d83dee 100644 --- a/control-plane-operator/controllers/hostedcontrolplane/hostedcontrolplane_controller_test.go +++ b/control-plane-operator/controllers/hostedcontrolplane/hostedcontrolplane_controller_test.go @@ -4,7 +4,11 @@ import ( "context" _ "embed" "fmt" + "net/http" + "net/http/httptest" + "net/url" "sort" + "strconv" "strings" "testing" "time" @@ -15,6 +19,7 @@ import ( "github.com/openshift/hypershift/api/util/ipnet" "github.com/openshift/hypershift/control-plane-operator/controllers/hostedcontrolplane/common" "github.com/openshift/hypershift/control-plane-operator/controllers/hostedcontrolplane/infra" + "github.com/openshift/hypershift/control-plane-operator/controllers/hostedcontrolplane/kas" "github.com/openshift/hypershift/control-plane-operator/controllers/hostedcontrolplane/manifests" endpointresolverv2 "github.com/openshift/hypershift/control-plane-operator/controllers/hostedcontrolplane/v2/endpoint_resolver" etcdv2 "github.com/openshift/hypershift/control-plane-operator/controllers/hostedcontrolplane/v2/etcd" @@ -77,6 +82,8 @@ import ( "github.com/docker/distribution" "github.com/go-logr/zapr" "github.com/opencontainers/go-digest" + "github.com/prometheus/client_golang/prometheus" + prometheustestutil "github.com/prometheus/client_golang/prometheus/testutil" "go.uber.org/mock/gomock" "go.uber.org/zap/zaptest" ) @@ -4431,3 +4438,82 @@ var _ util.ImageMetadataProvider = &fakeVersionImageMetadataProvider{} // Compile-time assertion for clock interface used by tests. var _ clock.Clock = &testingclock.FakeClock{} + +func TestHealthCheckKASEndpointMetrics(t *testing.T) { + t.Run("When KAS endpoint returns 200, it should set available to 1 and record duration", func(t *testing.T) { + g := NewGomegaWithT(t) + + server := httptest.NewTLSServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusOK) + })) + defer server.Close() + + m := newTestKASHealthMetrics(t) + host, port := parseHostPort(t, server) + + err := healthCheckKASEndpoint(host, port, m) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(prometheustestutil.ToFloat64(m.Available)).To(Equal(float64(1))) + g.Expect(prometheustestutil.CollectAndCount(m.RequestDuration)).To(Equal(1)) + }) + + t.Run("When KAS endpoint returns 503, it should set available to 0", func(t *testing.T) { + g := NewGomegaWithT(t) + + server := httptest.NewTLSServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusServiceUnavailable) + })) + defer server.Close() + + m := newTestKASHealthMetrics(t) + host, port := parseHostPort(t, server) + + err := healthCheckKASEndpoint(host, port, m) + g.Expect(err).To(HaveOccurred()) + g.Expect(prometheustestutil.ToFloat64(m.Available)).To(Equal(float64(0))) + g.Expect(prometheustestutil.CollectAndCount(m.RequestDuration)).To(Equal(1)) + }) + + t.Run("When KAS endpoint is unreachable, it should set available to 0", func(t *testing.T) { + g := NewGomegaWithT(t) + + m := newTestKASHealthMetrics(t) + + err := healthCheckKASEndpoint("192.0.2.1", 1, m) + g.Expect(err).To(HaveOccurred()) + g.Expect(prometheustestutil.ToFloat64(m.Available)).To(Equal(float64(0))) + g.Expect(prometheustestutil.CollectAndCount(m.RequestDuration)).To(Equal(1)) + }) + + t.Run("When metrics is nil, it should not panic", func(t *testing.T) { + g := NewGomegaWithT(t) + + server := httptest.NewTLSServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusOK) + })) + defer server.Close() + + host, port := parseHostPort(t, server) + err := healthCheckKASEndpoint(host, port, nil) + g.Expect(err).NotTo(HaveOccurred()) + }) +} + +func newTestKASHealthMetrics(t *testing.T) *kas.KASHealthMetrics { + t.Helper() + return kas.NewKASHealthMetrics(prometheus.NewRegistry()) +} + +func parseHostPort(t *testing.T, server *httptest.Server) (string, int) { + t.Helper() + u, err := url.Parse(server.URL) + if err != nil { + t.Fatalf("failed to parse server URL: %v", err) + } + host := u.Hostname() + port, err := strconv.Atoi(u.Port()) + if err != nil { + t.Fatalf("failed to parse port: %v", err) + } + return host, port +} diff --git a/control-plane-operator/controllers/hostedcontrolplane/kas/metrics.go b/control-plane-operator/controllers/hostedcontrolplane/kas/metrics.go new file mode 100644 index 00000000000..1f6b7dacb76 --- /dev/null +++ b/control-plane-operator/controllers/hostedcontrolplane/kas/metrics.go @@ -0,0 +1,42 @@ +package kas + +import ( + "github.com/prometheus/client_golang/prometheus" +) + +const ( + KASAvailableMetricName = "hypershift_kube_apiserver_available" + KASRequestDurationMetricName = "hypershift_kube_apiserver_request_duration_seconds" +) + +// kasRequestDurationBuckets defines the histogram bucket boundaries for KAS +// health check latency measurements, ranging from 10ms to 10s. +var kasRequestDurationBuckets = []float64{0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10} + +// KASHealthMetrics holds Prometheus metrics for KAS health check probes. +// Each CPO pod runs in its own HCP namespace, so metrics are naturally +// scoped per hosted cluster without needing additional labels. +type KASHealthMetrics struct { + Available prometheus.Gauge + RequestDuration prometheus.Histogram +} + +// NewKASHealthMetrics creates and registers KAS health metrics with the +// controller-runtime metrics registry. The existing PodMonitor for the +// control-plane-operator will automatically scrape these metrics. +func NewKASHealthMetrics(reg prometheus.Registerer) *KASHealthMetrics { + m := &KASHealthMetrics{ + Available: prometheus.NewGauge(prometheus.GaugeOpts{ + Name: KASAvailableMetricName, + Help: "1 if the KAS /healthz endpoint returns HTTP 200, 0 otherwise.", + }), + RequestDuration: prometheus.NewHistogram(prometheus.HistogramOpts{ + Name: KASRequestDurationMetricName, + Help: "Latency of the KAS /healthz health check probe in seconds.", + Buckets: kasRequestDurationBuckets, + }), + } + + reg.MustRegister(m.Available, m.RequestDuration) + return m +} diff --git a/control-plane-operator/controllers/hostedcontrolplane/kas/metrics_test.go b/control-plane-operator/controllers/hostedcontrolplane/kas/metrics_test.go new file mode 100644 index 00000000000..7be84be5f2e --- /dev/null +++ b/control-plane-operator/controllers/hostedcontrolplane/kas/metrics_test.go @@ -0,0 +1,40 @@ +package kas + +import ( + "testing" + + . "github.com/onsi/gomega" + + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/testutil" +) + +func TestKASHealthMetricsOperations(t *testing.T) { + t.Parallel() + + t.Run("When creating KAS health metrics, it should register both metrics", func(t *testing.T) { + g := NewGomegaWithT(t) + registry := prometheus.NewRegistry() + m := NewKASHealthMetrics(registry) + + g.Expect(testutil.ToFloat64(m.Available)).To(Equal(0.0)) + }) + + t.Run("When setting availability to 1, it should reflect the new value", func(t *testing.T) { + g := NewGomegaWithT(t) + registry := prometheus.NewRegistry() + m := NewKASHealthMetrics(registry) + + m.Available.Set(1) + g.Expect(testutil.ToFloat64(m.Available)).To(Equal(1.0)) + }) + + t.Run("When observing a request duration, it should be counted", func(t *testing.T) { + g := NewGomegaWithT(t) + registry := prometheus.NewRegistry() + m := NewKASHealthMetrics(registry) + + m.RequestDuration.Observe(0.5) + g.Expect(testutil.CollectAndCount(m.RequestDuration)).To(Equal(1)) + }) +} diff --git a/control-plane-operator/main.go b/control-plane-operator/main.go index 6fde842dde1..41d2f36ece6 100644 --- a/control-plane-operator/main.go +++ b/control-plane-operator/main.go @@ -16,6 +16,7 @@ import ( "github.com/openshift/hypershift/control-plane-operator/controllers/gcpprivateserviceconnect" "github.com/openshift/hypershift/control-plane-operator/controllers/healthcheck" "github.com/openshift/hypershift/control-plane-operator/controllers/hostedcontrolplane" + "github.com/openshift/hypershift/control-plane-operator/controllers/hostedcontrolplane/kas" "github.com/openshift/hypershift/control-plane-operator/controllers/hostedcontrolplane/manifests" endpointresolver "github.com/openshift/hypershift/control-plane-operator/endpoint-resolver" "github.com/openshift/hypershift/control-plane-operator/featuregates" @@ -67,6 +68,7 @@ import ( crclient "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/healthz" "sigs.k8s.io/controller-runtime/pkg/log/zap" + crmetrics "sigs.k8s.io/controller-runtime/pkg/metrics" metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server" "sigs.k8s.io/controller-runtime/pkg/webhook" @@ -530,6 +532,7 @@ func NewStartCommand() *cobra.Command { OperateOnReleaseImage: os.Getenv("OPERATE_ON_RELEASE_IMAGE"), DefaultIngressDomain: defaultIngressDomain, MetricsSet: metricsSet, + KASHealthMetrics: kas.NewKASHealthMetrics(crmetrics.Registry), CertRotationScale: certRotationScale, EnableCVOManagementClusterMetricsAccess: enableCVOManagementClusterMetricsAccess, ImageMetadataProvider: imageMetaDataProvider, diff --git a/test/e2e/create_cluster_test.go b/test/e2e/create_cluster_test.go index 75b6daffa82..9dfa4f7eaa8 100644 --- a/test/e2e/create_cluster_test.go +++ b/test/e2e/create_cluster_test.go @@ -125,6 +125,7 @@ func TestCreateCluster(t *testing.T) { e2eutil.EnsureGlobalPullSecret(t, ctx, mgtClient, hostedCluster, globalOpts.AdditionalPullSecretFile) e2eutil.EnsureMetricsForwarderWorking(t, ctx, mgtClient, hostedCluster) + e2eutil.ValidateCPOMetrics(t, ctx, mgtClient, hostedCluster) // Verify CPO override image if TEST_CPO_OVERRIDE=1 is set if os.Getenv("TEST_CPO_OVERRIDE") == "1" { diff --git a/test/e2e/util/util.go b/test/e2e/util/util.go index 47e9b812fb1..d2ff2a27586 100644 --- a/test/e2e/util/util.go +++ b/test/e2e/util/util.go @@ -24,6 +24,7 @@ import ( awsinfra "github.com/openshift/hypershift/cmd/infra/aws" awsutil "github.com/openshift/hypershift/cmd/infra/aws/util" awsprivatelink "github.com/openshift/hypershift/control-plane-operator/controllers/awsprivatelink" + "github.com/openshift/hypershift/control-plane-operator/controllers/hostedcontrolplane/kas" cpomanifests "github.com/openshift/hypershift/control-plane-operator/controllers/hostedcontrolplane/manifests" hccokasvap "github.com/openshift/hypershift/control-plane-operator/hostedclusterconfigoperator/controllers/resources/kas" hccomanifests "github.com/openshift/hypershift/control-plane-operator/hostedclusterconfigoperator/controllers/resources/manifests" @@ -2758,6 +2759,8 @@ const ( // TODO (jparrill): We need to separate the metrics.go from the main pkg in the hypershift-operator. // Delete these references when it's done and import it from there HypershiftOperatorInfoName = "hypershift_operator_info" + + cpoMetricsPort = "8080" ) func extractDataFromFamilies(metricFamilies map[string]*dto.MetricFamily, metric, labelKey, labelValue string) []*dto.LabelPair { @@ -2854,6 +2857,44 @@ func ValidateMetrics(t *testing.T, ctx context.Context, client crclient.Client, }) } +// ValidateCPOMetrics verifies that KAS health metrics are exposed from the +// control-plane-operator pod in the HCP namespace. +func ValidateCPOMetrics(t *testing.T, ctx context.Context, c crclient.Client, hc *hyperv1.HostedCluster) { + t.Run("When KAS health metrics are exposed, it should contain availability and latency data", func(t *testing.T) { + AtLeast(t, Version423) + if hc.Spec.Platform.Type == hyperv1.NonePlatform { + t.Skip("skipping on None platform") + } + + kasMetrics := []string{ + kas.KASAvailableMetricName, + kas.KASRequestDurationMetricName, + } + hcpNamespace := manifests.HostedControlPlaneNamespace(hc.Namespace, hc.Name) + + err := wait.PollUntilContextTimeout(ctx, 10*time.Second, 5*time.Minute, true, func(ctx context.Context) (bool, error) { + mf, err := GetMetricsFromPod(ctx, c, "control-plane-operator", "control-plane-operator", hcpNamespace, cpoMetricsPort) + if err != nil { + t.Logf("unable to get CPO metrics: %v", err) + return false, nil + } + for _, metricName := range kasMetrics { + // These metrics are emitted without labels, so check family presence directly + // rather than using ValidateMetricPresence which relies on label iteration. + family, ok := mf[metricName] + if !ok || len(family.Metric) == 0 { + t.Logf("Expected results for metric %q, found none", metricName) + return false, nil + } + } + return true, nil + }) + if err != nil { + t.Errorf("failed to validate KAS health metrics: %v", err) + } + }) +} + func getIngressRouterDefaultIP(t *testing.T, ctx context.Context, client crclient.Client, _ *hyperv1.HostedCluster) (string, error) { t.Helper()