From b31fc92fee01adb0efc75b022b0543f5708c5f4b Mon Sep 17 00:00:00 2001 From: Georg Grab Date: Mon, 11 May 2026 08:05:54 +0000 Subject: [PATCH] client: drop SelectiveHTTP2Transport, use HTTP/1.1 only MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `SelectiveHTTP2Transport` opted /tabpfn/fit and /tabpfn/predict into HTTP/2 while leaving everything else on HTTP/1.1. Both endpoints are long-running unary POSTs (thinking-mode fits keep the stream open for 5–15 min) and pick up no measurable benefit from HTTP/2's multiplexing or HPACK — the body is a multipart upload dominated by raw bytes and we only ever send one request per fit. What HTTP/2 *did* buy us: a race condition. The h2 state machine treats a PING frame received in the CLOSED state as a protocol violation. During long thinking fits the server-side keepalive PINGs (Cloud Run LB sends them on long-lived streams) regularly land on the client *after* the connection has been reaped by some intermediate hop, and httpx surfaces this as `httpx.LocalProtocolError("Invalid input ConnectionInputs.RECV_PING in state ConnectionState.CLOSED")`. That exception class is NOT in either `_fit` or `_predict`'s `@backoff.on_exception` retry tuple (the tuple covers `RemoteProtocolError` — peer sent garbage — but not `LocalProtocolError` — our state machine got wrong-footed by a benign-but-out-of-order frame). So instead of being retried silently, the fit fails hard with a confusing error. Empirically: a 135-fit TabArena-Medium sweep against api.priorlabs.ai ate 6 of these errors across thinking_medium + thinking_high; re-running the same matrix locally after patching `httpx_client` to drop the selective HTTP/2 transport cleared all 5 thinking_high cells without a single protocol error. HTTP/1.1 has no PING frames at all and no equivalent state machine, so the entire class of bug disappears. 
TCP-level resets surface as the familiar `ConnectError` / `ReadError` family, which the retry tuple already covers. Side change: drop the `[http2]` extra from the `httpx` dep since we no longer use it; `h2` will no longer be pulled in transitively. Net diff: 1 transport subclass + 1 import removed, 1 dep extra dropped, 14 lines of comment added explaining the rationale. --- pyproject.toml | 2 +- src/tabpfn_client/client.py | 34 ++++++++++++++-------------------- 2 files changed, 15 insertions(+), 21 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 2a89eab..b67d06d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,7 +27,7 @@ classifiers = [ ] license = { file = "LICENSE" } dependencies = [ - "httpx[http2]>=0.25.0,<=0.28.1", + "httpx>=0.25.0,<=0.28.1", "omegaconf>=2.1.2,<=2.3.0", "pandas>=2.1.2,<=2.3.3", "password-strength>=0.0.3.post2,<=0.0.3.post2", diff --git a/src/tabpfn_client/client.py b/src/tabpfn_client/client.py index 170ed4c..88a1171 100644 --- a/src/tabpfn_client/client.py +++ b/src/tabpfn_client/client.py @@ -26,7 +26,6 @@ import backoff import httpx -from httpx._transports.default import HTTPTransport from omegaconf import OmegaConf from tabpfn_client.browser_auth import BrowserAuthHandler from tabpfn_client.constants import ( @@ -187,22 +186,6 @@ class PredictionResult: metadata: dict[str, Any] = field(default_factory=dict) -class SelectiveHTTP2Transport(HTTPTransport): - def __init__(self, http2_paths=None, *args, **kwargs): - self.http2_paths = http2_paths or [] - self.http1 = HTTPTransport(http2=False, *args, **kwargs) - self.http2 = HTTPTransport(http2=True, *args, **kwargs) - - def handle_request(self, request): - if request.url.path in self.http2_paths: - return self.http2.handle_request(request) - return self.http1.handle_request(request) - - def close(self) -> None: - self.http1.close() - self.http2.close() - - class ServiceClient(Singleton): """ Singleton class for handling communication with the server.
@@ -215,13 +198,24 @@ class ServiceClient(Singleton): TABPFN_CLIENT_API_URL or f"{server_config.protocol}://{server_config.host}:{server_config.port}" ) - fit_path = SERVER_CONFIG["endpoints"]["fit"]["path"] - predict_path = SERVER_CONFIG["endpoints"]["predict"]["path"] + # NOTE: HTTP/1.1 only. HTTP/2 used to be selectively enabled for the + # /tabpfn/fit and /tabpfn/predict endpoints, but the long-running + # thinking-mode fit kept the stream open for 5-15 min, which raced + # against intermediate keepalive PINGs from Cloud Run's LB. The + # `h2` state machine treats a PING received while the connection + # is CLOSED as a protocol violation and surfaces it as + # `httpx.LocalProtocolError("Invalid input ConnectionInputs.RECV_PING + # in state ConnectionState.CLOSED")` — which is NOT in the SDK's + # retry tuple, so the request fails hard instead of retrying. + # HTTP/1.1 has no PING frames and no equivalent state machine, so + # the race disappears. Unary POSTs against /fit and /predict don't + # benefit from HTTP/2's multiplexing or HPACK in any measurable way + # (one request per fit, dominated by the multipart body), so the + # tradeoff is one-sided. httpx_client = httpx.Client( base_url=base_url, timeout=TABPFN_CLIENT_TIMEOUT, headers={"Prior-Client-Version": get_client_version()}, - transport=SelectiveHTTP2Transport(http2_paths=[fit_path, predict_path]), follow_redirects=True, ) _access_token = None