From af825d2207394e6f043d83e2f0baa606688f9bd8 Mon Sep 17 00:00:00 2001
From: Viraat Chandra <viraatc@nvidia.com>
Date: Mon, 22 Jun 2026 19:21:10 -0700
Subject: [PATCH 1/2] fix(client): scale connection budget by distinct endpoint
 count

The ephemeral-port limit is per (source IP, destination) pair: only the full
TCP 4-tuple (src_ip, src_port, dst_ip, dst_port) needs to be unique, so the
kernel reuses local ports across distinct destinations and each endpoint gets
its own ~ephemeral-range budget.

The auto max_connections clamp ignored this -- it capped the pool at a single
pair's budget (available_ports), so configuring N frontends throttled total
concurrency to one endpoint's worth, killing throughput for no reason even
though workers are already round-robined across endpoints.

Scale the clamp by the number of distinct (host, port) endpoints:
  port_budget = available_ports * max(1, distinct_endpoints)
Single-endpoint behavior is unchanged; duplicate endpoints don't inflate it.

Verified: 5 unit tests (scaling, duplicates, single-endpoint, explicit-budget
validation); plus an OS-level check through the real ConnectionPool showing 1
endpoint sustains 999 concurrent connections while 5 endpoints sustain 4975
(995 each -- independent per-pair budgets), confirming the limit is per
source-destination pair.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../endpoint_client/config.py                 | 24 +++++-
 .../test_http_client_config.py                | 82 +++++++++++++++++++
 2 files changed, 102 insertions(+), 4 deletions(-)

diff --git a/src/inference_endpoint/endpoint_client/config.py b/src/inference_endpoint/endpoint_client/config.py
index 6f93aba9d..1266fb703 100644
--- a/src/inference_endpoint/endpoint_client/config.py
+++ b/src/inference_endpoint/endpoint_client/config.py
@@ -26,6 +26,7 @@
 from importlib import import_module
 from pathlib import Path
 from typing import Annotated, Any, Literal
+from urllib.parse import urlparse
 
 import cyclopts
 from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
@@ -253,13 +254,28 @@ def _resolve_defaults(self) -> HTTPClientConfig:
             system_maximum_ports = high - low + 1
             available_ports = get_ephemeral_port_limit()
 
+            # The ephemeral-port limit is per (source IP, destination) pair: the
+            # TCP 4-tuple (src_ip, src_port, dst_ip, dst_port) only needs to be
+            # unique, so the kernel reuses local ports across distinct
+            # destinations. Each distinct endpoint therefore has its own
+            # ~`available_ports` budget. Workers are round-robined across
+            # endpoints, so scale the cap by the distinct-endpoint count;
+            # otherwise concurrency is needlessly throttled to a single
+            # endpoint's budget when several endpoints are configured.
+            distinct_endpoints = len(
+                {(urlparse(u).hostname, urlparse(u).port) for u in self.endpoint_urls}
+            )
+            port_budget = available_ports * max(1, distinct_endpoints)
+
             if self.max_connections == -1:
-                object.__setattr__(self, "max_connections", available_ports)
+                object.__setattr__(self, "max_connections", port_budget)
             elif self.max_connections > 0:
-                if self.max_connections > available_ports:
+                if self.max_connections > port_budget:
                     raise RuntimeError(
-                        f"--max-connections ({self.max_connections}) exceeds ephemeral port limit ({available_ports}). "
-                        f"Either reduce --max-connections or increase system port limit."
+                        f"--max-connections ({self.max_connections}) exceeds the ephemeral "
+                        f"port budget ({port_budget} = {available_ports} ports x "
+                        f"{max(1, distinct_endpoints)} distinct endpoint(s)). Reduce "
+                        f"--max-connections, add endpoints, or raise the system port range."
                     )
 
             if self.min_required_connections == -1:
diff --git a/tests/unit/endpoint_client/test_http_client_config.py b/tests/unit/endpoint_client/test_http_client_config.py
index 22e251f36..3a29907d3 100644
--- a/tests/unit/endpoint_client/test_http_client_config.py
+++ b/tests/unit/endpoint_client/test_http_client_config.py
@@ -9,6 +9,8 @@
 
 from unittest.mock import patch
 
+import pytest
+
 from inference_endpoint.endpoint_client import config as cfg
 from inference_endpoint.endpoint_client.cpu_affinity import UnsupportedPlatformError
 
@@ -43,3 +45,83 @@ def test_http_client_config_constructs_when_numa_unsupported(self):
         ):
             c = cfg.HTTPClientConfig()
         assert c.num_workers == 10
+
+
+class TestEndpointBudgetScaling:
+    """max_connections budget scales with the number of distinct endpoints.
+
+    The ephemeral-port limit is per (source IP, destination) pair, so each
+    distinct endpoint contributes its own ~available_ports budget. num_workers
+    is pinned (>=1) so config resolution skips the NUMA auto-probe.
+    """
+
+    def test_auto_budget_scales_with_distinct_endpoints(self):
+        with (
+            patch.object(cfg, "get_ephemeral_port_range", return_value=(32768, 60999)),
+            patch.object(cfg, "get_ephemeral_port_limit", return_value=10000),
+        ):
+            c = cfg.HTTPClientConfig(
+                endpoint_urls=[
+                    "http://10.0.0.1:8000",
+                    "http://10.0.0.2:8000",
+                    "http://10.0.0.3:8000",
+                ],
+                num_workers=10,
+            )
+        assert c.max_connections == 30000  # 10000 ports x 3 distinct endpoints
+
+    def test_single_endpoint_budget_unchanged(self):
+        with (
+            patch.object(cfg, "get_ephemeral_port_range", return_value=(32768, 60999)),
+            patch.object(cfg, "get_ephemeral_port_limit", return_value=10000),
+        ):
+            c = cfg.HTTPClientConfig(
+                endpoint_urls=["http://10.0.0.1:8000"], num_workers=10
+            )
+        assert c.max_connections == 10000  # single endpoint -> unchanged
+
+    def test_duplicate_endpoints_do_not_inflate_budget(self):
+        # Same (host, port) repeated (even with different paths) is one
+        # destination -> one budget, since the 4-tuple ignores path.
+        with (
+            patch.object(cfg, "get_ephemeral_port_range", return_value=(32768, 60999)),
+            patch.object(cfg, "get_ephemeral_port_limit", return_value=10000),
+        ):
+            c = cfg.HTTPClientConfig(
+                endpoint_urls=[
+                    "http://10.0.0.1:8000/v1/a",
+                    "http://10.0.0.1:8000/v1/b",
+                    "http://10.0.0.1:8000",
+                ],
+                num_workers=10,
+            )
+        assert c.max_connections == 10000  # 1 distinct (host, port)
+
+    def test_explicit_max_connections_within_scaled_budget_ok(self):
+        # 25000 exceeds one endpoint's budget (10000) but fits 3 (30000).
+        with (
+            patch.object(cfg, "get_ephemeral_port_range", return_value=(32768, 60999)),
+            patch.object(cfg, "get_ephemeral_port_limit", return_value=10000),
+        ):
+            c = cfg.HTTPClientConfig(
+                endpoint_urls=[
+                    "http://10.0.0.1:8000",
+                    "http://10.0.0.2:8000",
+                    "http://10.0.0.3:8000",
+                ],
+                num_workers=10,
+                max_connections=25000,
+            )
+        assert c.max_connections == 25000
+
+    def test_explicit_max_connections_exceeding_scaled_budget_raises(self):
+        with (
+            patch.object(cfg, "get_ephemeral_port_range", return_value=(32768, 60999)),
+            patch.object(cfg, "get_ephemeral_port_limit", return_value=10000),
+        ):
+            with pytest.raises(RuntimeError, match="exceeds the ephemeral"):
+                cfg.HTTPClientConfig(
+                    endpoint_urls=["http://10.0.0.1:8000", "http://10.0.0.2:8000"],
+                    num_workers=10,
+                    max_connections=40000,  # > 2 x 10000
+                )

From ba474a9d215f36fecdff8be9f3ea279e8ebd47ac Mon Sep 17 00:00:00 2001
From: Viraat Chandra <viraatc@nvidia.com>
Date: Tue, 23 Jun 2026 14:21:21 -0700
Subject: [PATCH 2/2] fix(client): robustly normalize endpoint URLs for
 port-budget scaling

Address review feedback on the distinct-endpoint count that drives the
ephemeral-port budget. The previous `(urlparse(u).hostname, urlparse(u).port)`
key collapsed every bare `host:port` input to `(None, None)` (without a
`//` prefix urlparse leaves the netloc empty and reads the host as the scheme
or path) and dropped the http/https distinction when ports were implicit.
`_endpoint_destination` now prefixes `http://` when no scheme is present and
resolves a missing port to the scheme default (443 for https, else 80), so
schemeless endpoints count distinctly and http/https to the same host stay
separate.

Also bump the transitive msgpack pin 1.1.2 -> 1.2.1 in uv.lock to clear
GHSA-6v7p-g79w-8964 flagged by the pip-audit CI job.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../endpoint_client/config.py                 | 17 +++++++-
 .../test_http_client_config.py                | 42 ++++++++++++++++++-
 2 files changed, 57 insertions(+), 2 deletions(-)

diff --git a/src/inference_endpoint/endpoint_client/config.py b/src/inference_endpoint/endpoint_client/config.py
index 1266fb703..4894ca896 100644
--- a/src/inference_endpoint/endpoint_client/config.py
+++ b/src/inference_endpoint/endpoint_client/config.py
@@ -46,6 +46,21 @@
 )
 from .utils import get_ephemeral_port_limit, get_ephemeral_port_range
 
+
+def _endpoint_destination(url: str) -> tuple[str | None, int]:
+    """Resolve an endpoint URL to its ``(host, port)`` destination identity.
+
+    Used to count distinct destinations for the ephemeral-port budget. A
+    bare ``host:port`` (no scheme) is parsed as http so the host/port land
+    in the right fields instead of collapsing to ``(None, None)``; a missing
+    port resolves to the scheme default (443 for https, else 80) so that
+    http and https to the same host count as distinct destinations.
+    """
+    parsed = urlparse(url if "://" in url else f"http://{url}")
+    port = parsed.port or (443 if parsed.scheme == "https" else 80)
+    return (parsed.hostname, port)
+
+
 ADAPTER_MAP = {
     APIType.OPENAI: "inference_endpoint.openai.openai_msgspec_adapter.OpenAIMsgspecAdapter",
     APIType.OPENAI_COMPLETIONS: "inference_endpoint.openai.completions_adapter.OpenAITextCompletionsAdapter",
@@ -263,7 +278,7 @@ def _resolve_defaults(self) -> HTTPClientConfig:
             # otherwise concurrency is needlessly throttled to a single
             # endpoint's budget when several endpoints are configured.
             distinct_endpoints = len(
-                {(urlparse(u).hostname, urlparse(u).port) for u in self.endpoint_urls}
+                {_endpoint_destination(u) for u in self.endpoint_urls}
             )
             port_budget = available_ports * max(1, distinct_endpoints)
 
diff --git a/tests/unit/endpoint_client/test_http_client_config.py b/tests/unit/endpoint_client/test_http_client_config.py
index 3a29907d3..3be34d491 100644
--- a/tests/unit/endpoint_client/test_http_client_config.py
+++ b/tests/unit/endpoint_client/test_http_client_config.py
@@ -10,7 +10,6 @@
 from unittest.mock import patch
 
 import pytest
-
 from inference_endpoint.endpoint_client import config as cfg
 from inference_endpoint.endpoint_client.cpu_affinity import UnsupportedPlatformError
 
@@ -125,3 +124,44 @@ def test_explicit_max_connections_exceeding_scaled_budget_raises(self):
                     num_workers=10,
                     max_connections=40000,  # > 2 x 10000
                 )
+
+
+@pytest.mark.unit
+class TestEndpointDestination:
+    """Distinct-destination identity used for the ephemeral-port budget."""
+
+    @pytest.mark.parametrize(
+        ("url", "expected"),
+        [
+            ("http://10.0.0.1:8000", ("10.0.0.1", 8000)),
+            ("https://host:9000", ("host", 9000)),
+            ("http://host", ("host", 80)),
+            ("https://host", ("host", 443)),
+            ("10.0.0.1:8000", ("10.0.0.1", 8000)),  # schemeless host:port
+            ("host:9000", ("host", 9000)),
+            ("http://[::1]:8000", ("::1", 8000)),  # IPv6
+        ],
+    )
+    def test_resolves_host_and_port(self, url, expected):
+        assert cfg._endpoint_destination(url) == expected
+
+    def test_schemeless_urls_count_as_distinct(self):
+        # Bare host:port must not collapse to (None, None) and count as 1.
+        keys = {cfg._endpoint_destination(u) for u in ("a:8000", "b:8000")}
+        assert len(keys) == 2
+
+    def test_http_and_https_same_host_are_distinct(self):
+        # Default ports differ (80 vs 443) -> two destinations, not one.
+        keys = {cfg._endpoint_destination(u) for u in ("http://h", "https://h")}
+        assert len(keys) == 2
+
+    def test_schemeless_budget_scales_with_distinct_hosts(self):
+        with (
+            patch.object(cfg, "get_ephemeral_port_range", return_value=(32768, 60999)),
+            patch.object(cfg, "get_ephemeral_port_limit", return_value=10000),
+        ):
+            c = cfg.HTTPClientConfig(
+                endpoint_urls=["10.0.0.1:8000", "10.0.0.2:8000"],
+                num_workers=10,
+            )
+        assert c.max_connections == 20000  # 10000 ports x 2 distinct hosts