Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,7 @@ type HostedControlPlaneReconciler struct {
OperateOnReleaseImage string
DefaultIngressDomain string
MetricsSet metrics.MetricsSet
KASHealthMetrics *kas.KASHealthMetrics
SREConfigHash string
ec2Client awsapi.EC2API
awsSession *aws.Config
Expand Down Expand Up @@ -944,9 +945,9 @@ func (r *HostedControlPlaneReconciler) healthCheckKASLoadBalancers(ctx context.C
// When the cluster is private, checking the load balancers will depend on whether the load balancer is
// using the right subnets. To avoid uncertainty, we'll limit the check to the service endpoint.
if hcp.Spec.Platform.Type == hyperv1.IBMCloudPlatform {
return healthCheckKASEndpoint(manifests.KubeAPIServerService("").Name, config.KASSVCIBMCloudPort)
return healthCheckKASEndpoint(manifests.KubeAPIServerService("").Name, config.KASSVCIBMCloudPort, r.KASHealthMetrics)
}
return healthCheckKASEndpoint(manifests.KubeAPIServerService("").Name, config.KASSVCPort)
return healthCheckKASEndpoint(manifests.KubeAPIServerService("").Name, config.KASSVCPort, r.KASHealthMetrics)
case serviceStrategy.Type == hyperv1.Route:
if hcp.Spec.Platform.Type != hyperv1.IBMCloudPlatform {
externalRoute := manifests.KubeAPIServerExternalPublicRoute(hcp.Namespace)
Expand All @@ -958,7 +959,7 @@ func (r *HostedControlPlaneReconciler) healthCheckKASLoadBalancers(ctx context.C
if err != nil {
return err
}
return healthCheckKASEndpoint(endpoint, port)
return healthCheckKASEndpoint(endpoint, port, r.KASHealthMetrics)
}
case serviceStrategy.Type == hyperv1.LoadBalancer:
svc := manifests.KubeAPIServerService(hcp.Namespace)
Expand Down Expand Up @@ -987,17 +988,33 @@ func (r *HostedControlPlaneReconciler) healthCheckKASLoadBalancers(ctx context.C
} else if LBIngress.IP != "" {
ingressPoint = LBIngress.IP
}
return healthCheckKASEndpoint(ingressPoint, port)
return healthCheckKASEndpoint(ingressPoint, port, r.KASHealthMetrics)
}
return nil
}

func healthCheckKASEndpoint(ingressPoint string, port int) error {
func healthCheckKASEndpoint(ingressPoint string, port int, m *kas.KASHealthMetrics) error {
healthEndpoint := fmt.Sprintf("https://%s:%d/healthz", ingressPoint, port)

httpClient := util.InsecureHTTPClient()
httpClient.Timeout = 10 * time.Second

start := time.Now()
resp, err := httpClient.Get(healthEndpoint)
duration := time.Since(start).Seconds()
if resp != nil {
defer resp.Body.Close()
}

if m != nil {
m.RequestDuration.Observe(duration)
if err == nil && resp != nil && resp.StatusCode == http.StatusOK {
m.Available.Set(1)
} else {
m.Available.Set(0)
}
}

if err != nil {
return err
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,11 @@ import (
"context"
_ "embed"
"fmt"
"net/http"
"net/http/httptest"
"net/url"
"sort"
"strconv"
"strings"
"testing"
"time"
Expand All @@ -15,6 +19,7 @@ import (
"github.com/openshift/hypershift/api/util/ipnet"
"github.com/openshift/hypershift/control-plane-operator/controllers/hostedcontrolplane/common"
"github.com/openshift/hypershift/control-plane-operator/controllers/hostedcontrolplane/infra"
"github.com/openshift/hypershift/control-plane-operator/controllers/hostedcontrolplane/kas"
"github.com/openshift/hypershift/control-plane-operator/controllers/hostedcontrolplane/manifests"
endpointresolverv2 "github.com/openshift/hypershift/control-plane-operator/controllers/hostedcontrolplane/v2/endpoint_resolver"
etcdv2 "github.com/openshift/hypershift/control-plane-operator/controllers/hostedcontrolplane/v2/etcd"
Expand Down Expand Up @@ -77,6 +82,8 @@ import (
"github.com/docker/distribution"
"github.com/go-logr/zapr"
"github.com/opencontainers/go-digest"
"github.com/prometheus/client_golang/prometheus"
prometheustestutil "github.com/prometheus/client_golang/prometheus/testutil"
"go.uber.org/mock/gomock"
"go.uber.org/zap/zaptest"
)
Expand Down Expand Up @@ -4431,3 +4438,82 @@ var _ util.ImageMetadataProvider = &fakeVersionImageMetadataProvider{}

// Compile-time assertion for clock interface used by tests.
var _ clock.Clock = &testingclock.FakeClock{}

func TestHealthCheckKASEndpointMetrics(t *testing.T) {
t.Run("When KAS endpoint returns 200, it should set available to 1 and record duration", func(t *testing.T) {
g := NewGomegaWithT(t)

server := httptest.NewTLSServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
w.WriteHeader(http.StatusOK)
}))
defer server.Close()

m := newTestKASHealthMetrics(t)
host, port := parseHostPort(t, server)

err := healthCheckKASEndpoint(host, port, m)
g.Expect(err).NotTo(HaveOccurred())
g.Expect(prometheustestutil.ToFloat64(m.Available)).To(Equal(float64(1)))
g.Expect(prometheustestutil.CollectAndCount(m.RequestDuration)).To(Equal(1))
})

t.Run("When KAS endpoint returns 503, it should set available to 0", func(t *testing.T) {
g := NewGomegaWithT(t)

server := httptest.NewTLSServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
w.WriteHeader(http.StatusServiceUnavailable)
}))
defer server.Close()

m := newTestKASHealthMetrics(t)
host, port := parseHostPort(t, server)

err := healthCheckKASEndpoint(host, port, m)
g.Expect(err).To(HaveOccurred())
g.Expect(prometheustestutil.ToFloat64(m.Available)).To(Equal(float64(0)))
g.Expect(prometheustestutil.CollectAndCount(m.RequestDuration)).To(Equal(1))
})

t.Run("When KAS endpoint is unreachable, it should set available to 0", func(t *testing.T) {
g := NewGomegaWithT(t)

m := newTestKASHealthMetrics(t)

err := healthCheckKASEndpoint("192.0.2.1", 1, m)
g.Expect(err).To(HaveOccurred())
g.Expect(prometheustestutil.ToFloat64(m.Available)).To(Equal(float64(0)))
g.Expect(prometheustestutil.CollectAndCount(m.RequestDuration)).To(Equal(1))
})

t.Run("When metrics is nil, it should not panic", func(t *testing.T) {
g := NewGomegaWithT(t)

server := httptest.NewTLSServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
w.WriteHeader(http.StatusOK)
}))
defer server.Close()

host, port := parseHostPort(t, server)
err := healthCheckKASEndpoint(host, port, nil)
g.Expect(err).NotTo(HaveOccurred())
})
}

func newTestKASHealthMetrics(t *testing.T) *kas.KASHealthMetrics {
t.Helper()
return kas.NewKASHealthMetrics(prometheus.NewRegistry())
}

func parseHostPort(t *testing.T, server *httptest.Server) (string, int) {
t.Helper()
u, err := url.Parse(server.URL)
if err != nil {
t.Fatalf("failed to parse server URL: %v", err)
}
host := u.Hostname()
port, err := strconv.Atoi(u.Port())
if err != nil {
t.Fatalf("failed to parse port: %v", err)
}
return host, port
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
package kas

import (
"github.com/prometheus/client_golang/prometheus"
)

const (
KASAvailableMetricName = "hypershift_kube_apiserver_available"
KASRequestDurationMetricName = "hypershift_kube_apiserver_request_duration_seconds"
)

// kasRequestDurationBuckets defines the histogram bucket boundaries for KAS
// health check latency measurements, ranging from 10ms to 10s.
var kasRequestDurationBuckets = []float64{0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10}

// KASHealthMetrics holds Prometheus metrics for KAS health check probes.
// Each CPO pod runs in its own HCP namespace, so metrics are naturally
// scoped per hosted cluster without needing additional labels.
type KASHealthMetrics struct {
Available prometheus.Gauge
RequestDuration prometheus.Histogram
}

// NewKASHealthMetrics creates and registers KAS health metrics with the
// controller-runtime metrics registry. The existing PodMonitor for the
// control-plane-operator will automatically scrape these metrics.
func NewKASHealthMetrics(reg prometheus.Registerer) *KASHealthMetrics {
m := &KASHealthMetrics{
Available: prometheus.NewGauge(prometheus.GaugeOpts{
Name: KASAvailableMetricName,
Help: "1 if the KAS /healthz endpoint returns HTTP 200, 0 otherwise.",
}),
RequestDuration: prometheus.NewHistogram(prometheus.HistogramOpts{
Name: KASRequestDurationMetricName,
Help: "Latency of the KAS /healthz health check probe in seconds.",
Buckets: kasRequestDurationBuckets,
}),
}

reg.MustRegister(m.Available, m.RequestDuration)
return m
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
package kas

import (
"testing"

. "github.com/onsi/gomega"

"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/testutil"
)

func TestKASHealthMetricsOperations(t *testing.T) {
t.Parallel()

t.Run("When creating KAS health metrics, it should register both metrics", func(t *testing.T) {
g := NewGomegaWithT(t)
registry := prometheus.NewRegistry()
m := NewKASHealthMetrics(registry)

g.Expect(testutil.ToFloat64(m.Available)).To(Equal(0.0))
})

t.Run("When setting availability to 1, it should reflect the new value", func(t *testing.T) {
g := NewGomegaWithT(t)
registry := prometheus.NewRegistry()
m := NewKASHealthMetrics(registry)

m.Available.Set(1)
g.Expect(testutil.ToFloat64(m.Available)).To(Equal(1.0))
})

t.Run("When observing a request duration, it should be counted", func(t *testing.T) {
g := NewGomegaWithT(t)
registry := prometheus.NewRegistry()
m := NewKASHealthMetrics(registry)

m.RequestDuration.Observe(0.5)
g.Expect(testutil.CollectAndCount(m.RequestDuration)).To(Equal(1))
})
}
3 changes: 3 additions & 0 deletions control-plane-operator/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ import (
"github.com/openshift/hypershift/control-plane-operator/controllers/gcpprivateserviceconnect"
"github.com/openshift/hypershift/control-plane-operator/controllers/healthcheck"
"github.com/openshift/hypershift/control-plane-operator/controllers/hostedcontrolplane"
"github.com/openshift/hypershift/control-plane-operator/controllers/hostedcontrolplane/kas"
"github.com/openshift/hypershift/control-plane-operator/controllers/hostedcontrolplane/manifests"
endpointresolver "github.com/openshift/hypershift/control-plane-operator/endpoint-resolver"
"github.com/openshift/hypershift/control-plane-operator/featuregates"
Expand Down Expand Up @@ -67,6 +68,7 @@ import (
crclient "sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/healthz"
"sigs.k8s.io/controller-runtime/pkg/log/zap"
crmetrics "sigs.k8s.io/controller-runtime/pkg/metrics"
metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server"
"sigs.k8s.io/controller-runtime/pkg/webhook"

Expand Down Expand Up @@ -530,6 +532,7 @@ func NewStartCommand() *cobra.Command {
OperateOnReleaseImage: os.Getenv("OPERATE_ON_RELEASE_IMAGE"),
DefaultIngressDomain: defaultIngressDomain,
MetricsSet: metricsSet,
KASHealthMetrics: kas.NewKASHealthMetrics(crmetrics.Registry),
CertRotationScale: certRotationScale,
EnableCVOManagementClusterMetricsAccess: enableCVOManagementClusterMetricsAccess,
ImageMetadataProvider: imageMetaDataProvider,
Expand Down
1 change: 1 addition & 0 deletions test/e2e/create_cluster_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,7 @@ func TestCreateCluster(t *testing.T) {

e2eutil.EnsureGlobalPullSecret(t, ctx, mgtClient, hostedCluster, globalOpts.AdditionalPullSecretFile)
e2eutil.EnsureMetricsForwarderWorking(t, ctx, mgtClient, hostedCluster)
e2eutil.ValidateCPOMetrics(t, ctx, mgtClient, hostedCluster)

// Verify CPO override image if TEST_CPO_OVERRIDE=1 is set
if os.Getenv("TEST_CPO_OVERRIDE") == "1" {
Expand Down
41 changes: 41 additions & 0 deletions test/e2e/util/util.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ import (
awsinfra "github.com/openshift/hypershift/cmd/infra/aws"
awsutil "github.com/openshift/hypershift/cmd/infra/aws/util"
awsprivatelink "github.com/openshift/hypershift/control-plane-operator/controllers/awsprivatelink"
"github.com/openshift/hypershift/control-plane-operator/controllers/hostedcontrolplane/kas"
cpomanifests "github.com/openshift/hypershift/control-plane-operator/controllers/hostedcontrolplane/manifests"
hccokasvap "github.com/openshift/hypershift/control-plane-operator/hostedclusterconfigoperator/controllers/resources/kas"
hccomanifests "github.com/openshift/hypershift/control-plane-operator/hostedclusterconfigoperator/controllers/resources/manifests"
Expand Down Expand Up @@ -2758,6 +2759,8 @@ const (
// TODO (jparrill): We need to separate the metrics.go from the main pkg in the hypershift-operator.
// Delete these references when it's done and import it from there
HypershiftOperatorInfoName = "hypershift_operator_info"

cpoMetricsPort = "8080"
)

func extractDataFromFamilies(metricFamilies map[string]*dto.MetricFamily, metric, labelKey, labelValue string) []*dto.LabelPair {
Expand Down Expand Up @@ -2854,6 +2857,44 @@ func ValidateMetrics(t *testing.T, ctx context.Context, client crclient.Client,
})
}

// ValidateCPOMetrics verifies that KAS health metrics are exposed from the
// control-plane-operator pod in the HCP namespace.
func ValidateCPOMetrics(t *testing.T, ctx context.Context, c crclient.Client, hc *hyperv1.HostedCluster) {
t.Run("When KAS health metrics are exposed, it should contain availability and latency data", func(t *testing.T) {
AtLeast(t, Version423)
if hc.Spec.Platform.Type == hyperv1.NonePlatform {
t.Skip("skipping on None platform")
}

kasMetrics := []string{
kas.KASAvailableMetricName,
kas.KASRequestDurationMetricName,
}
hcpNamespace := manifests.HostedControlPlaneNamespace(hc.Namespace, hc.Name)

err := wait.PollUntilContextTimeout(ctx, 10*time.Second, 5*time.Minute, true, func(ctx context.Context) (bool, error) {
mf, err := GetMetricsFromPod(ctx, c, "control-plane-operator", "control-plane-operator", hcpNamespace, cpoMetricsPort)
if err != nil {
t.Logf("unable to get CPO metrics: %v", err)
return false, nil
}
for _, metricName := range kasMetrics {
// These metrics are emitted without labels, so check family presence directly
// rather than using ValidateMetricPresence which relies on label iteration.
family, ok := mf[metricName]
if !ok || len(family.Metric) == 0 {
t.Logf("Expected results for metric %q, found none", metricName)
return false, nil
}
}
return true, nil
})
if err != nil {
t.Errorf("failed to validate KAS health metrics: %v", err)
}
})
}

func getIngressRouterDefaultIP(t *testing.T, ctx context.Context, client crclient.Client, _ *hyperv1.HostedCluster) (string, error) {
t.Helper()

Expand Down