From fd1d05b5a7ef1b5052bebe39858ba6b74608d774 Mon Sep 17 00:00:00 2001 From: "MagicMock/mock.effective_git_name/140286118293392" Date: Sat, 20 Jun 2026 17:12:59 +0000 Subject: [PATCH] fix(traefik): set finite readTimeout on entrypoints to stop fd leak Traefik's static config set no connection timeouts, inheriting the older v2.x (pre-2.11.2) defaults of readTimeout=0 / writeTimeout=0 ("no timeout"). On a shard's public IP, internet scanners constantly open connections that never complete a request (silent connects, abandoned TLS handshakes, slowloris). With readTimeout=0 these are held open forever, each consuming a file descriptor, until traefik hits its open-file ceiling and accept() fails with EMFILE -- at which point the shard is unreachable and the error is logged in a hot loop (the 38 GB log that filled root in the tdyz60/#306 incident). Set readTimeout=300s on the http and https entrypoints so abandoned connections are reaped. 300s stays generous for slow/large uploads; writeTimeout is intentionally left at default 0 so large downloads, SSE, and long-poll responses are not cut off. The mqtt (8883) entrypoint is TCP, where readTimeout only bounds the wait for the first bytes of a connection rather than the whole exchange -- that vector is covered by a nofile ulimit on the traefik service (separate controller PR). Co-Authored-By: Claude Opus 4.8 (1M context) --- data/traefik.yml | 12 ++++++++++++ data/traefik_no_ssl.yml | 5 +++++ 2 files changed, 17 insertions(+) diff --git a/data/traefik.yml b/data/traefik.yml index 238939e..25079c8 100644 --- a/data/traefik.yml +++ b/data/traefik.yml @@ -1,6 +1,15 @@ entryPoints: http: address: ":80" + # Finite read timeout so abandoned/slowloris connections (constant on a + # public IP from internet scanners) are reaped instead of holding a file + # descriptor forever. Without it traefik leaks fds until accept() fails with + # EMFILE and the shard goes unreachable. 
300s stays generous for slow/large + # uploads; writeTimeout is left at its default (0) so large downloads, SSE, + # and long-poll responses are not cut off. + transport: + respondingTimeouts: + readTimeout: "300s" http: redirections: entryPoint: @@ -8,6 +17,9 @@ entryPoints: scheme: https https: address: ":443" + transport: + respondingTimeouts: + readTimeout: "300s" mqtt: address: ":8883" diff --git a/data/traefik_no_ssl.yml b/data/traefik_no_ssl.yml index 61348fd..d429b2a 100644 --- a/data/traefik_no_ssl.yml +++ b/data/traefik_no_ssl.yml @@ -1,6 +1,11 @@ entryPoints: http: address: ":80" + # See traefik.yml: finite read timeout reaps abandoned/slowloris connections + # so traefik does not leak file descriptors until accept() hits EMFILE. + transport: + respondingTimeouts: + readTimeout: "300s" mqtt: address: ":8883"