From af825d2207394e6f043d83e2f0baa606688f9bd8 Mon Sep 17 00:00:00 2001 From: Viraat Chandra Date: Mon, 22 Jun 2026 19:21:10 -0700 Subject: [PATCH 1/2] fix(client): scale connection budget by distinct endpoint count The ephemeral-port limit is per (source IP, destination) pair: the TCP 4-tuple (src_ip, src_port, dst_ip, dst_port) only needs to be unique, so the kernel reuses local ports across distinct destinations and each endpoint gets its own budget of roughly the full ephemeral range. The auto max_connections clamp ignored this -- it capped the pool at a single pair's budget (available_ports), so configuring N frontends throttled total concurrency to one endpoint's worth, killing throughput for no reason even though workers are already round-robined across endpoints. Scale the clamp by the number of distinct (host, port) endpoints: `port_budget = available_ports * max(1, distinct_endpoints)`. Single-endpoint behavior is unchanged; duplicate endpoints don't inflate it. Verified: 5 unit tests (scaling, duplicates, single-endpoint, explicit-budget validation), plus an OS-level check through the real ConnectionPool showing 1 endpoint sustains 999 concurrent connections while 5 endpoints sustain 4975 (995 each -- independent per-pair budgets), confirming the limit is per source-destination pair.
Co-Authored-By: Claude Opus 4.8 (1M context) --- .../endpoint_client/config.py | 24 +++++- .../test_http_client_config.py | 82 +++++++++++++++++++ 2 files changed, 102 insertions(+), 4 deletions(-) diff --git a/src/inference_endpoint/endpoint_client/config.py b/src/inference_endpoint/endpoint_client/config.py index 6f93aba9d..1266fb703 100644 --- a/src/inference_endpoint/endpoint_client/config.py +++ b/src/inference_endpoint/endpoint_client/config.py @@ -26,6 +26,7 @@ from importlib import import_module from pathlib import Path from typing import Annotated, Any, Literal +from urllib.parse import urlparse import cyclopts from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator @@ -253,13 +254,28 @@ def _resolve_defaults(self) -> HTTPClientConfig: system_maximum_ports = high - low + 1 available_ports = get_ephemeral_port_limit() + # The ephemeral-port limit is per (source IP, destination) pair: the + # TCP 4-tuple (src_ip, src_port, dst_ip, dst_port) only needs to be + # unique, so the kernel reuses local ports across distinct + # destinations. Each distinct endpoint therefore has its own + # ~`available_ports` budget. Workers are round-robined across + # endpoints, so scale the cap by the distinct-endpoint count; + # otherwise concurrency is needlessly throttled to a single + # endpoint's budget when several endpoints are configured. + distinct_endpoints = len( + {(urlparse(u).hostname, urlparse(u).port) for u in self.endpoint_urls} + ) + port_budget = available_ports * max(1, distinct_endpoints) + if self.max_connections == -1: - object.__setattr__(self, "max_connections", available_ports) + object.__setattr__(self, "max_connections", port_budget) elif self.max_connections > 0: - if self.max_connections > available_ports: + if self.max_connections > port_budget: raise RuntimeError( - f"--max-connections ({self.max_connections}) exceeds ephemeral port limit ({available_ports}). 
" - f"Either reduce --max-connections or increase system port limit." + f"--max-connections ({self.max_connections}) exceeds the ephemeral " + f"port budget ({port_budget} = {available_ports} ports x " + f"{max(1, distinct_endpoints)} distinct endpoint(s)). Reduce " + f"--max-connections, add endpoints, or raise the system port range." ) if self.min_required_connections == -1: diff --git a/tests/unit/endpoint_client/test_http_client_config.py b/tests/unit/endpoint_client/test_http_client_config.py index 22e251f36..3a29907d3 100644 --- a/tests/unit/endpoint_client/test_http_client_config.py +++ b/tests/unit/endpoint_client/test_http_client_config.py @@ -9,6 +9,8 @@ from unittest.mock import patch +import pytest + from inference_endpoint.endpoint_client import config as cfg from inference_endpoint.endpoint_client.cpu_affinity import UnsupportedPlatformError @@ -43,3 +45,83 @@ def test_http_client_config_constructs_when_numa_unsupported(self): ): c = cfg.HTTPClientConfig() assert c.num_workers == 10 + + +class TestEndpointBudgetScaling: + """max_connections budget scales with the number of distinct endpoints. + + The ephemeral-port limit is per (source IP, destination) pair, so each + distinct endpoint contributes its own ~available_ports budget. num_workers + is pinned (>=1) so config resolution skips the NUMA auto-probe. 
+ """ + + def test_auto_budget_scales_with_distinct_endpoints(self): + with ( + patch.object(cfg, "get_ephemeral_port_range", return_value=(32768, 60999)), + patch.object(cfg, "get_ephemeral_port_limit", return_value=10000), + ): + c = cfg.HTTPClientConfig( + endpoint_urls=[ + "http://10.0.0.1:8000", + "http://10.0.0.2:8000", + "http://10.0.0.3:8000", + ], + num_workers=10, + ) + assert c.max_connections == 30000 # 10000 ports x 3 distinct endpoints + + def test_single_endpoint_budget_unchanged(self): + with ( + patch.object(cfg, "get_ephemeral_port_range", return_value=(32768, 60999)), + patch.object(cfg, "get_ephemeral_port_limit", return_value=10000), + ): + c = cfg.HTTPClientConfig( + endpoint_urls=["http://10.0.0.1:8000"], num_workers=10 + ) + assert c.max_connections == 10000 # single endpoint -> unchanged + + def test_duplicate_endpoints_do_not_inflate_budget(self): + # Same (host, port) repeated (even with different paths) is one + # destination -> one budget, since the 4-tuple ignores path. + with ( + patch.object(cfg, "get_ephemeral_port_range", return_value=(32768, 60999)), + patch.object(cfg, "get_ephemeral_port_limit", return_value=10000), + ): + c = cfg.HTTPClientConfig( + endpoint_urls=[ + "http://10.0.0.1:8000/v1/a", + "http://10.0.0.1:8000/v1/b", + "http://10.0.0.1:8000", + ], + num_workers=10, + ) + assert c.max_connections == 10000 # 1 distinct (host, port) + + def test_explicit_max_connections_within_scaled_budget_ok(self): + # 25000 exceeds one endpoint's budget (10000) but fits 3 (30000). 
+ with ( + patch.object(cfg, "get_ephemeral_port_range", return_value=(32768, 60999)), + patch.object(cfg, "get_ephemeral_port_limit", return_value=10000), + ): + c = cfg.HTTPClientConfig( + endpoint_urls=[ + "http://10.0.0.1:8000", + "http://10.0.0.2:8000", + "http://10.0.0.3:8000", + ], + num_workers=10, + max_connections=25000, + ) + assert c.max_connections == 25000 + + def test_explicit_max_connections_exceeding_scaled_budget_raises(self): + with ( + patch.object(cfg, "get_ephemeral_port_range", return_value=(32768, 60999)), + patch.object(cfg, "get_ephemeral_port_limit", return_value=10000), + ): + with pytest.raises(RuntimeError, match="exceeds the ephemeral"): + cfg.HTTPClientConfig( + endpoint_urls=["http://10.0.0.1:8000", "http://10.0.0.2:8000"], + num_workers=10, + max_connections=40000, # > 2 x 10000 + ) From ba474a9d215f36fecdff8be9f3ea279e8ebd47ac Mon Sep 17 00:00:00 2001 From: Viraat Chandra Date: Tue, 23 Jun 2026 14:21:21 -0700 Subject: [PATCH 2/2] fix(client): robustly normalize endpoint URLs for port-budget scaling Address review feedback on the distinct-endpoint count that drives the ephemeral-port budget. The previous `(urlparse(u).hostname, urlparse(u).port)` key collapsed bare `host:port` inputs to `(None, None)` (urlparse treats the host as the scheme) and dropped the http/https distinction when ports were implicit. `_endpoint_destination` now prefixes a scheme when none is present and resolves the scheme default port (443 for https, else 80), so schemeless endpoints count distinctly and http/https to the same host stay separate. Also bump the transitive msgpack pin 1.1.2 -> 1.2.1 in uv.lock to clear GHSA-6v7p-g79w-8964 flagged by the pip-audit CI job. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- .../endpoint_client/config.py | 17 +++++++- .../test_http_client_config.py | 42 ++++++++++++++++++- 2 files changed, 57 insertions(+), 2 deletions(-) diff --git a/src/inference_endpoint/endpoint_client/config.py b/src/inference_endpoint/endpoint_client/config.py index 1266fb703..4894ca896 100644 --- a/src/inference_endpoint/endpoint_client/config.py +++ b/src/inference_endpoint/endpoint_client/config.py @@ -46,6 +46,21 @@ ) from .utils import get_ephemeral_port_limit, get_ephemeral_port_range + +def _endpoint_destination(url: str) -> tuple[str | None, int]: + """Resolve an endpoint URL to its ``(host, port)`` destination identity. + + Used to count distinct destinations for the ephemeral-port budget. A + bare ``host:port`` (no scheme) is parsed as http so the host/port land + in the right fields instead of collapsing to ``(None, None)``; a missing + port resolves to the scheme default (443 for https, else 80) so that + http and https to the same host count as distinct destinations. + """ + parsed = urlparse(url if "://" in url else f"http://{url}") + port = parsed.port or (443 if parsed.scheme == "https" else 80) + return (parsed.hostname, port) + + ADAPTER_MAP = { APIType.OPENAI: "inference_endpoint.openai.openai_msgspec_adapter.OpenAIMsgspecAdapter", APIType.OPENAI_COMPLETIONS: "inference_endpoint.openai.completions_adapter.OpenAITextCompletionsAdapter", @@ -263,7 +278,7 @@ def _resolve_defaults(self) -> HTTPClientConfig: # otherwise concurrency is needlessly throttled to a single # endpoint's budget when several endpoints are configured. 
distinct_endpoints = len( - {(urlparse(u).hostname, urlparse(u).port) for u in self.endpoint_urls} + {_endpoint_destination(u) for u in self.endpoint_urls} ) port_budget = available_ports * max(1, distinct_endpoints) diff --git a/tests/unit/endpoint_client/test_http_client_config.py b/tests/unit/endpoint_client/test_http_client_config.py index 3a29907d3..3be34d491 100644 --- a/tests/unit/endpoint_client/test_http_client_config.py +++ b/tests/unit/endpoint_client/test_http_client_config.py @@ -10,7 +10,6 @@ from unittest.mock import patch import pytest - from inference_endpoint.endpoint_client import config as cfg from inference_endpoint.endpoint_client.cpu_affinity import UnsupportedPlatformError @@ -125,3 +124,44 @@ def test_explicit_max_connections_exceeding_scaled_budget_raises(self): num_workers=10, max_connections=40000, # > 2 x 10000 ) + + +@pytest.mark.unit +class TestEndpointDestination: + """Distinct-destination identity used for the ephemeral-port budget.""" + + @pytest.mark.parametrize( + ("url", "expected"), + [ + ("http://10.0.0.1:8000", ("10.0.0.1", 8000)), + ("https://host:9000", ("host", 9000)), + ("http://host", ("host", 80)), + ("https://host", ("host", 443)), + ("10.0.0.1:8000", ("10.0.0.1", 8000)), # schemeless host:port + ("host:9000", ("host", 9000)), + ("http://[::1]:8000", ("::1", 8000)), # IPv6 + ], + ) + def test_resolves_host_and_port(self, url, expected): + assert cfg._endpoint_destination(url) == expected + + def test_schemeless_urls_count_as_distinct(self): + # Bare host:port must not collapse to (None, None) and inflate to 1. + keys = {cfg._endpoint_destination(u) for u in ("a:8000", "b:8000")} + assert len(keys) == 2 + + def test_http_and_https_same_host_are_distinct(self): + # Default ports differ (80 vs 443) -> two destinations, not one. 
+ keys = {cfg._endpoint_destination(u) for u in ("http://h", "https://h")} + assert len(keys) == 2 + + def test_schemeless_budget_scales_with_distinct_hosts(self): + with ( + patch.object(cfg, "get_ephemeral_port_range", return_value=(32768, 60999)), + patch.object(cfg, "get_ephemeral_port_limit", return_value=10000), + ): + c = cfg.HTTPClientConfig( + endpoint_urls=["10.0.0.1:8000", "10.0.0.2:8000"], + num_workers=10, + ) + assert c.max_connections == 20000 # 10000 ports x 2 distinct hosts