From 465f2f70e59a9e891d69d3fb9febe9026a204d44 Mon Sep 17 00:00:00 2001 From: Mateusz Kowalski Date: Fri, 26 Jun 2026 15:40:44 +0200 Subject: [PATCH] test/router: wait for all per-route metrics before asserting The HAProxy router metrics test exits its retry loop as soon as haproxy_backend_connections_total reaches the expected count, but then immediately asserts on other per-route metrics like haproxy_server_http_responses_total. The HAProxy exporter has a scrape interval (typically 5s), so these metrics may not be populated in the same scrape that satisfied the connections check. This causes a 100% failure rate on 5.0 Azure micro-upgrade jobs (regression #42639 / OCPBUGS-92837) because the post-loop assertions find nil where they expect populated gauges. Fix by adding haproxy_server_http_responses_total 2xx to the loop exit condition so we only proceed when the per-route backend and server stats are all confirmed present in the same metrics scrape. Signed-off-by: Mateusz Kowalski Generated-by: AI Signed-off-by: Mateusz Kowalski --- test/extended/router/metrics.go | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/test/extended/router/metrics.go b/test/extended/router/metrics.go index 61a47a705319..25623633538e 100644 --- a/test/extended/router/metrics.go +++ b/test/extended/router/metrics.go @@ -161,6 +161,10 @@ var _ = g.Describe("[sig-network][Feature:Router]", func() { times := 10 p := expfmt.NewTextParser(model.LegacyValidation) + // The HAProxy exporter has a scrape interval (typically 5s), so + // per-server response metrics may lag behind the backend connection + // counter. Wait until all route-specific metrics are populated in the + // same scrape before exiting the retry loop. 
err = wait.PollImmediate(2*time.Second, 240*time.Second, func() (bool, error) { results, err = prometheus.GetBearerTokenURLViaPod(oc, execPodName, fmt.Sprintf("http://%s/metrics", net.JoinHostPort(host, strconv.Itoa(int(metricsPort)))), bearerToken) o.Expect(err).NotTo(o.HaveOccurred()) @@ -168,11 +172,18 @@ var _ = g.Describe("[sig-network][Feature:Router]", func() { o.Expect(err).NotTo(o.HaveOccurred()) if len(findNonZeroGaugesWithLabels(metrics["haproxy_server_up"], serverLabels)) == 2 { - if g := findGaugesWithLabels(metrics["haproxy_backend_connections_total"], routeLabels); len(g) > 0 { - // stop retrying if the route got expected number of connections. - if g[0] >= float64(times) { + backendConns := findGaugesWithLabels(metrics["haproxy_backend_connections_total"], routeLabels) + if len(backendConns) > 0 && backendConns[0] >= float64(times) { + // Also verify that the HTTP response metrics have been + // populated for this route before exiting the loop. + // The exporter may not refresh all stats atomically, so + // backend_connections_total can appear before + // server_http_responses_total is populated. + if len(findNonZeroGaugesWithLabels(metrics["haproxy_server_http_responses_total"], serverLabels.With("code", "2xx"))) == 2 { return true, nil } + g.By("retrying metrics until all per-route stats are populated") + return false, nil } // send a burst of traffic to the router g.By("sending traffic to a weighted route")