From 84f35dd84a2ec193827c01ea935692f55f68f64d Mon Sep 17 00:00:00 2001
From: Oleksii Dolhov <oleksii.dolhov@gmail.com>
Date: Tue, 16 Jun 2026 16:17:36 +0300
Subject: [PATCH] fix(ci): relax backend healthcheck so batch load doesn't flap
 it unhealthy (#1230)

The prod backend Docker healthcheck intermittently tripped to `unhealthy`
while the service was fully up serving /health 200s. With only 2 uvicorn
workers, scheduled-batch windows make them GIL-contended; a /health probe
then waits out the 10s timeout, and 3 consecutive aborts (Retries=3) flip the
container to `unhealthy` until the batch eases (observed live on eu2). It is a
false negative, but any consumer of Docker health (autoheal, depends_on:
service_healthy, LB drain) can act on it and restart the backend mid-batch,
orphaning in-flight executions.

Relax the probe (config only, docker-compose.prod.yml backend):
- timeout      10s -> 30s
- start_period 10s -> 60s
- retries       3  -> 5

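A genuine outage (all probes fail) still trips after 5 consecutive failures:
~2.5 min when probes fail fast (e.g. connection refused), up to ~5 min when
each probe hangs for the full 30s timeout, because Docker waits `interval`
after a check completes before starting the next one. Transient load spikes
no longer trip it. Other services' probes are left unchanged (they are not
the GIL-contended 2-worker flapper).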

Related to #1230
---
 docker-compose.prod.yml | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/docker-compose.prod.yml b/docker-compose.prod.yml
index d4a2e3c4..12f6dfd5 100644
--- a/docker-compose.prod.yml
+++ b/docker-compose.prod.yml
@@ -137,9 +137,18 @@ services:
     healthcheck:
       test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
       interval: 30s
-      timeout: 10s
-      retries: 3
-      start_period: 10s
+      # #1230: relaxed so the probe tolerates transient load without hiding a
+      # real outage. With only 2 uvicorn workers, scheduled-batch windows make
+      # them GIL-contended; /health probes then hit the old 10s timeout, and 3
+      # consecutive aborts flipped the container to `unhealthy` even though
+      # /health kept serving 200s (false negative). Any consumer of Docker
+      # health (autoheal, depends_on: service_healthy, LB drain) could act on
+      # that and restart the backend mid-batch, orphaning in-flight executions.
+      # Higher timeout + retries absorb the spike; start_period only adds boot
+      # grace. A real outage still trips after 5 failed probes (~2.5-5 min).
+      timeout: 30s
+      retries: 5
+      start_period: 60s
 
   frontend:
     build: