From 84f35dd84a2ec193827c01ea935692f55f68f64d Mon Sep 17 00:00:00 2001 From: Oleksii Dolhov Date: Tue, 16 Jun 2026 16:17:36 +0300 Subject: [PATCH] fix(ci): relax backend healthcheck so batch load doesn't flap it unhealthy (#1230) The prod backend Docker healthcheck intermittently tripped to `unhealthy` while the service was fully up serving /health 200s. With only 2 uvicorn workers, scheduled-batch windows make them GIL-contended; a /health probe then waits out the 10s timeout, and 3 consecutive aborts (Retries=3) flip the container to `unhealthy` until the batch eases (observed live on eu2). It is a false negative, but any consumer of Docker health (autoheal, depends_on: service_healthy, LB drain) can act on it and restart the backend mid-batch, orphaning in-flight executions. Relax the probe (config only, docker-compose.prod.yml backend): - timeout 10s -> 30s - start_period 10s -> 60s - retries 3 -> 5 A genuine outage (all probes fail) still trips after 5 consecutive failures: ~2.5 min when probes fail fast, up to ~5 min when each one runs out the 30s timeout (Docker schedules the next check `interval` after the previous one completes); transient load spikes no longer do. Other services' probes are left unchanged (not the GIL-contended 2-worker flapper). Related to #1230 --- docker-compose.prod.yml | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/docker-compose.prod.yml b/docker-compose.prod.yml index d4a2e3c4..12f6dfd5 100644 --- a/docker-compose.prod.yml +++ b/docker-compose.prod.yml @@ -137,9 +137,18 @@ services: healthcheck: test: ["CMD", "curl", "-f", "http://localhost:8000/health"] interval: 30s - timeout: 10s - retries: 3 - start_period: 10s + # #1230: relaxed so the probe tolerates transient load without hiding a + # real outage. With only 2 uvicorn workers, scheduled-batch windows make + # them GIL-contended; a /health probe then waits out the old 10s timeout, + # and 3 consecutive aborts flipped the container to `unhealthy` even + # though /health served 200s throughout (false negative). 
Any consumer of + # Docker health (autoheal, depends_on: service_healthy, LB drain) could + # act on that and restart the backend mid-batch, orphaning in-flight + # executions. Higher timeout + start_period + retries absorb the spike; + # a genuine outage (all probes fail) still trips after 5 failures (~2.5-5 min). + timeout: 30s + retries: 5 + start_period: 60s frontend: build: