From 9ab6e838fd845482bf23bcdd578fb056e8009ef8 Mon Sep 17 00:00:00 2001
From: Viraat Chandra <viraatc@nvidia.com>
Date: Tue, 9 Jun 2026 13:32:42 -0700
Subject: [PATCH 01/20] perf(metrics): batch tokenization with defer-to-flush
 drain

Replace the per-event async tokenize model (one asyncio task per sample for
each of ISL/OSL/TPOT) with a deferred batch design that keeps tokenization
ahead of completions. On high-completion-rate runs the per-event tasks used
to pile up faster than a single tokenizer thread could clear them, which
stretched the end-of-run drain.

- BatchTokenizer: counts whole batches via the raw tokenizers backend
  (encode_batch_fast), sharded across worker processes each pinned to a
  disjoint CORES_PER_WORKER-core block so their rayon pools stay NUMA-local.
  Falls back to a single in-process thread when there is no fast backend or
  fewer than two core blocks fit.
- TokenBatchQueue: token triggers now enqueue their work (text/message + a
  recorder callback) instead of spawning tasks; the buffer is tokenized in
  one sharded call at each publish tick (live ISL/OSL/TPOT) and once at
  end-of-run (flush_remaining, bounded by the drain budget). n_pending_tasks
  now counts un-tokenized items, preserving the Report "incomplete drain"
  contract.
- MetricsTable is now fully synchronous (drops the in-flight task set,
  drain_tasks, and in_flight_tasks_count).
- CORES_PER_WORKER is a module constant; removes the metrics_tokenizer_workers
  config knob (schema/execute/CLI) and regenerates the YAML templates.

Validated: 234 unit + 3 integration tests pass. Offline-burst e2e (echo
server, streaming, real tokenizer) shows a 3000-tokenization backlog at ENDED
drained to n_pending_tasks=0 with the final report state=complete.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../services/metrics_aggregator/__main__.py   |  33 +-
 .../services/metrics_aggregator/aggregator.py |  56 +-
 .../metrics_aggregator/metrics_table.py       | 172 ++----
 .../services/metrics_aggregator/publisher.py  |  19 +-
 .../metrics_aggregator/token_metrics.py       | 521 +++++++++++++-----
 .../commands/benchmark/execute.py             |   6 -
 src/inference_endpoint/config/schema.py       |  15 -
 .../templates/concurrency_template_full.yaml  |   1 -
 .../templates/offline_template_full.yaml      |   1 -
 .../templates/online_template_full.yaml       |   1 -
 .../services/metrics_aggregator/conftest.py   |  27 +-
 .../metrics_aggregator/test_aggregator.py     | 112 ++--
 .../test_main_signal_handler.py               |   8 +-
 .../metrics_aggregator/test_metrics_table.py  |  47 +-
 .../metrics_aggregator/test_token_metrics.py  | 174 ++++--
 tests/unit/commands/test_benchmark.py         |  62 ---
 16 files changed, 716 insertions(+), 539 deletions(-)

diff --git a/src/inference_endpoint/async_utils/services/metrics_aggregator/__main__.py b/src/inference_endpoint/async_utils/services/metrics_aggregator/__main__.py
index 2231d6dc8..9cd1c7e5e 100644
--- a/src/inference_endpoint/async_utils/services/metrics_aggregator/__main__.py
+++ b/src/inference_endpoint/async_utils/services/metrics_aggregator/__main__.py
@@ -33,7 +33,7 @@
 from .publisher import MetricsPublisher
 from .registry import MetricsRegistry
 from .snapshot import MetricsSnapshotCodec
-from .token_metrics import TokenizePool
+from .token_metrics import BatchTokenizer, TokenBatchQueue
 
 logger = logging.getLogger(__name__)
 
@@ -44,6 +44,7 @@ def _make_sigterm_handler(
     registry: MetricsRegistry,
     publisher: MetricsPublisher,
     table: MetricsTable,
+    token_queue: TokenBatchQueue | None,
     shutdown_event: asyncio.Event,
 ) -> tuple[Callable[[], None], set[asyncio.Task]]:
     """Build the SIGTERM handler that writes the INTERRUPTED final snapshot.
@@ -75,7 +76,7 @@ async def _signal_finalize() -> None:
             )
             await publisher.publish_final(
                 registry,
-                n_pending_tasks=table.in_flight_tasks_count,
+                n_pending_tasks=token_queue.pending if token_queue is not None else 0,
                 interrupted=True,
             )
         except Exception:  # noqa: BLE001 — best-effort.
@@ -134,11 +135,10 @@ async def main() -> None:
         type=float,
         default=60.0,
         help=(
-            "Wall-clock budget (seconds) to wait for in-flight async tokenize "
-            "tasks to finish after ENDED before the aggregator cancels them "
-            "and emits the final snapshot with n_pending_tasks > 0 "
-            "(default: 60.0; 0 = wait indefinitely). Increase for long-context "
-            "/ low-worker-count tokenize workloads."
+            "Wall-clock budget (seconds) to finish tokenizing buffered samples "
+            "after ENDED before the aggregator emits the final snapshot with "
+            "n_pending_tasks > 0 (default: 60.0; 0 = wait indefinitely). Increase "
+            "for very large datasets where the end-of-run tokenize batch is big."
         ),
     )
     parser.add_argument(
@@ -159,12 +159,6 @@ async def main() -> None:
         default=None,
         help="HuggingFace tokenizer name for ISL/OSL/TPOT (e.g. 'gpt2'). If not set, token metrics are disabled.",
     )
-    parser.add_argument(
-        "--tokenizer-workers",
-        type=int,
-        default=2,
-        help="Number of tokenizer worker threads (default: 2)",
-    )
     parser.add_argument(
         "--streaming",
         action="store_true",
@@ -204,15 +198,15 @@ async def main() -> None:
     loop = LoopManager().default_loop
 
     # Using ternary operator causes errors in MyPy object type coalescing
-    # (coalesces to 'object' not 'AbstractContextManager[TokenizePool | None]')
-    pool_cm: AbstractContextManager[TokenizePool | None]
+    # (coalesces to 'object' not 'AbstractContextManager[BatchTokenizer | None]')
+    tokenizer_cm: AbstractContextManager[BatchTokenizer | None]
     if args.tokenizer:
-        pool_cm = TokenizePool(args.tokenizer, n_workers=args.tokenizer_workers)
+        tokenizer_cm = BatchTokenizer(args.tokenizer)
     else:
-        pool_cm = nullcontext()
+        tokenizer_cm = nullcontext()
 
     with (
-        pool_cm as pool,
+        tokenizer_cm as tokenizer,
         ManagedZMQContext.scoped(socket_dir=args.socket_dir) as zmq_ctx,
     ):
         registry = MetricsRegistry()
@@ -234,7 +228,7 @@ async def main() -> None:
                 publish_interval_s=args.publish_interval,
                 sig_figs=args.hdr_sig_figs,
                 n_histogram_buckets=args.n_histogram_buckets,
-                tokenize_pool=pool,
+                tokenizer=tokenizer,
                 streaming=args.streaming,
                 shutdown_event=shutdown_event,
                 drain_timeout_s=None if args.drain_timeout == 0 else args.drain_timeout,
@@ -269,6 +263,7 @@ async def main() -> None:
                 registry=registry,
                 publisher=publisher,
                 table=aggregator._table,
+                token_queue=aggregator._token_queue,
                 shutdown_event=shutdown_event,
             )
             loop.add_signal_handler(signal.SIGTERM, on_sigterm)
diff --git a/src/inference_endpoint/async_utils/services/metrics_aggregator/aggregator.py b/src/inference_endpoint/async_utils/services/metrics_aggregator/aggregator.py
index f01c9753c..ed5ace0a0 100644
--- a/src/inference_endpoint/async_utils/services/metrics_aggregator/aggregator.py
+++ b/src/inference_endpoint/async_utils/services/metrics_aggregator/aggregator.py
@@ -47,7 +47,7 @@
 from .publisher import MetricsPublisher
 from .registry import MetricsRegistry
 from .snapshot import SessionState
-from .token_metrics import TokenizePool
+from .token_metrics import BatchTokenizer, TokenBatchQueue
 
 logger = logging.getLogger(__name__)
 
@@ -117,7 +117,7 @@ def __init__(
         publish_interval_s: float,
         sig_figs: int,
         n_histogram_buckets: int,
-        tokenize_pool: TokenizePool | None = None,
+        tokenizer: BatchTokenizer | None = None,
         streaming: bool = False,
         shutdown_event: asyncio.Event | None = None,
         drain_timeout_s: float | None = _DEFAULT_DRAIN_TIMEOUT_S,
@@ -133,7 +133,12 @@ def __init__(
         self._registry = registry
         self._publisher = publisher
         self._publish_interval_s = publish_interval_s
-        self._tokenize_pool = tokenize_pool
+        # Token triggers enqueue onto this queue; it is flushed in batches at
+        # each publish tick and at end-of-run. None when no tokenizer is set
+        # (token metrics disabled), in which case those triggers are no-ops.
+        self._token_queue: TokenBatchQueue | None = (
+            TokenBatchQueue(tokenizer, self.loop) if tokenizer is not None else None
+        )
         self._streaming = streaming
         self._shutdown_event = shutdown_event
         self._shutdown_received = False
@@ -223,21 +228,23 @@ def _register_triggers(self, streaming: bool) -> None:
         """
         table = self._table
         registry = self._registry
-        pool = self._tokenize_pool
-        loop = self.loop
+        queue = self._token_queue
 
         # Always registered
-        table.add_trigger(SampleField.ISSUED_NS, IslTrigger(registry, pool, loop))
+        table.add_trigger(SampleField.ISSUED_NS, IslTrigger(registry, queue))
         table.add_trigger(SampleField.COMPLETE_NS, SampleLatencyTrigger(registry))
-        table.add_trigger(SampleField.COMPLETE_NS, OslTrigger(registry, pool, loop))
+        table.add_trigger(SampleField.COMPLETE_NS, OslTrigger(registry, queue))
 
         # Streaming-only
         if streaming:
             table.add_trigger(SampleField.RECV_FIRST_NS, TtftTrigger(registry))
             table.add_trigger(SampleField.LAST_RECV_NS, ChunkDeltaTrigger(registry))
-            table.add_trigger(
-                SampleField.COMPLETE_NS, TpotTrigger(registry, pool, loop)
-            )
+            table.add_trigger(SampleField.COMPLETE_NS, TpotTrigger(registry, queue))
+
+    async def _flush_tokens(self) -> None:
+        """Flush buffered tokenizations so the next snapshot reflects them."""
+        if self._token_queue is not None:
+            await self._token_queue.flush()
 
     # ------------------------------------------------------------------
     # Event processing
@@ -311,8 +318,11 @@ async def process(self, records: list[EventRecord]) -> None:
                                 self._publish_interval_s,
                                 get_runtime_state=lambda: (
                                     self._session_state,
-                                    table.in_flight_tasks_count,
+                                    self._token_queue.pending
+                                    if self._token_queue is not None
+                                    else 0,
                                 ),
+                                pre_publish=self._flush_tokens,
                             )
                     table.handle_session_event(record)
                     if ev == SessionEventType.STOP_PERFORMANCE_TRACKING:
@@ -367,12 +377,18 @@ async def process(self, records: list[EventRecord]) -> None:
             # ENDED has been observed; transition to DRAINING so any tick
             # that fires before publish_final reflects the new state.
             self._session_state = SessionState.DRAINING
-            logger.info("Draining %d async tasks...", table.in_flight_tasks_count)
-            # drain_tasks owns the timeout + cancel-and-await sequence so
-            # the pending count is captured BEFORE done-callbacks empty
-            # the in-flight set. Reading in_flight_tasks_count out here
-            # would always be 0 (see drain_tasks docstring).
-            n_pending = await table.drain_tasks(timeout=self._drain_timeout_s)
+            queue = self._token_queue
+            pending = queue.pending if queue is not None else 0
+            logger.info("Draining %d pending tokenizations...", pending)
+            # flush_remaining tokenizes the whole buffer in one batched pass,
+            # bounded by the drain budget; it returns the count it could not
+            # finish (non-zero only on a timeout), which becomes the snapshot's
+            # n_pending_tasks so Report can flag an incomplete drain.
+            n_pending = (
+                await queue.flush_remaining(self._drain_timeout_s)
+                if queue is not None
+                else 0
+            )
             if n_pending > 0:
                 timeout_str = (
                     f"{self._drain_timeout_s:.1f}s"
@@ -380,13 +396,13 @@ async def process(self, records: list[EventRecord]) -> None:
                     else "unlimited"
                 )
                 logger.warning(
-                    "drain_tasks timed out after %s; %d async tasks "
-                    "did not complete and were cancelled",
+                    "tokenizer drain timed out after %s; %d tokenizations "
+                    "did not complete",
                     timeout_str,
                     n_pending,
                 )
             logger.info(
-                "Async tasks drained (n_pending_tasks=%d at finalize)", n_pending
+                "Tokenizations drained (n_pending_tasks=%d at finalize)", n_pending
             )
             registry.set_counter(
                 MetricCounterKey.TRACKED_DURATION_NS.value,
diff --git a/src/inference_endpoint/async_utils/services/metrics_aggregator/metrics_table.py b/src/inference_endpoint/async_utils/services/metrics_aggregator/metrics_table.py
index 46a17e92f..f67c859e6 100644
--- a/src/inference_endpoint/async_utils/services/metrics_aggregator/metrics_table.py
+++ b/src/inference_endpoint/async_utils/services/metrics_aggregator/metrics_table.py
@@ -17,9 +17,9 @@
 
 from __future__ import annotations
 
-import asyncio
 import logging
 from abc import ABC, abstractmethod
+from collections.abc import Callable
 from dataclasses import dataclass
 from enum import Enum
 from typing import TYPE_CHECKING, Any
@@ -33,7 +33,8 @@
         MetricsRegistry,
     )
     from inference_endpoint.async_utils.services.metrics_aggregator.token_metrics import (
-        TokenizePool,
+        MessageParts,
+        TokenBatchQueue,
     )
     from inference_endpoint.core.record import EventRecord
 
@@ -146,8 +147,13 @@ def fire(
         ev_rec: EventRecord,
         row: SampleRow,
         pre_change: dict[str, Any],
-    ) -> asyncio.Task | None:
-        """Must be non-blocking. Return a Task if async work was scheduled."""
+    ) -> None:
+        """Must be non-blocking.
+
+        Sync triggers record into the registry directly. Token triggers
+        enqueue onto the shared ``TokenBatchQueue`` for batched tokenization
+        at the next flush; neither path schedules per-event tasks.
+        """
         raise NotImplementedError()
 
 
@@ -177,28 +183,27 @@ def fire(self, ev_rec, row, pre_change):
 
 
 class AsyncTokenTrigger(EmitTrigger):
-    """Base for triggers that need async tokenization.
-
-    Subclasses implement ``_extract_text()`` to pull the text to tokenize
-    from the event record. If text is returned, an async task is created
-    to tokenize and emit. Subclasses can also override ``_extract_message()``
-    to return (content, reasoning, tool_calls) for chat-template–aware tokenization
-    when tool calls are present. Subclasses can override ``_compute_value()`` to
-    transform the token count before storing.
+    """Base for triggers whose metric needs tokenization.
+
+    Subclasses implement ``_extract_text()`` to pull the text to tokenize from
+    the event record, and may override ``_extract_message()`` to return
+    (content, reasoning, tool_calls) for chat-template–aware tokenization when
+    tool calls are present. ``fire()`` does not tokenize inline — it enqueues
+    the work plus a recorder callback onto the shared ``TokenBatchQueue``, which
+    the aggregator flushes in batches. ``_compute_value()`` can transform the
+    token count before it is recorded.
     """
 
     def __init__(
         self,
         metric_name: str,
         registry: MetricsRegistry,
-        tokenize_pool: TokenizePool | None,
-        loop: asyncio.AbstractEventLoop | None,
+        queue: TokenBatchQueue | None,
         requires: tuple[str, ...] = (),
         dtype: type = int,
     ):
         super().__init__(metric_name, registry, requires=requires, dtype=dtype)
-        self._pool = tokenize_pool
-        self._loop = loop
+        self._queue = queue
 
     @abstractmethod
     def _extract_text(
@@ -209,11 +214,11 @@ def _extract_text(
 
     def _extract_message(
         self, ev_rec: EventRecord, row: SampleRow, pre_change: dict[str, Any]
-    ) -> tuple[str, str | None, tuple[dict[str, Any], ...] | None] | None:
-        """Return (content, reasoning, tool_calls) for message-aware tokenization, or None.
+    ) -> MessageParts | None:
+        """Return (content, reasoning, tool_calls) for message-aware tokenization.
 
-        When non-None is returned, ``token_count_message_async`` is used instead of
-        ``token_count_async``. Default returns None (use text path).
+        When non-None, the message (chat-template) path is used instead of the
+        plain-text path. Default returns None (use text path).
         """
         return None
 
@@ -223,48 +228,32 @@ def _compute_value(
         """Transform token count into the metric value. Default: count as-is."""
         return token_count
 
-    def fire(self, ev_rec, row, pre_change):
-        if self._pool is None or self._loop is None:
-            return None
+    def _make_recorder(
+        self, ev_rec: EventRecord, pre_change: dict[str, Any]
+    ) -> Callable[[int], None]:
+        """Build the callback the queue runs once the token count is known."""
+        registry, name = self.registry, self.metric_name
 
-        message_parts = self._extract_message(ev_rec, row, pre_change)
-        if message_parts is not None:
-            content, reasoning, tool_calls = message_parts
-            pool, loop = self._pool, self._loop
-            registry, name = self.registry, self.metric_name
-            uuid = row.sample_uuid
-
-            async def _tokenize_message_and_emit() -> None:
-                try:
-                    count = await pool.token_count_message_async(
-                        content, reasoning, tool_calls, loop
-                    )
-                    value = self._compute_value(count, ev_rec, pre_change)
-                    if value is not None:
-                        registry.record(name, value)
-                except Exception:
-                    logger.exception("%s tokenization failed for %s", name, uuid)
+        def record(count: int) -> None:
+            value = self._compute_value(count, ev_rec, pre_change)
+            if value is not None:
+                registry.record(name, value)
 
-            return loop.create_task(_tokenize_message_and_emit())
+        return record
 
+    def fire(self, ev_rec, row, pre_change):
+        if self._queue is None:
+            return
+        message_parts = self._extract_message(ev_rec, row, pre_change)
+        if message_parts is not None:
+            self._queue.enqueue_message(
+                message_parts, self._make_recorder(ev_rec, pre_change)
+            )
+            return
         text = self._extract_text(ev_rec, row, pre_change)
         if not text:
-            return None
-
-        pool, loop = self._pool, self._loop
-        registry, name = self.registry, self.metric_name
-        uuid = row.sample_uuid
-
-        async def _tokenize_and_emit() -> None:
-            try:
-                count = await pool.token_count_async(text, loop)
-                value = self._compute_value(count, ev_rec, pre_change)
-                if value is not None:
-                    registry.record(name, value)
-            except Exception:
-                logger.exception("%s tokenization failed for %s", name, uuid)
-
-        return loop.create_task(_tokenize_and_emit())
+            return
+        self._queue.enqueue_text(text, self._make_recorder(ev_rec, pre_change))
 
 
 # ---------------------------------------------------------------------------
@@ -319,19 +308,18 @@ class IslTrigger(AsyncTokenTrigger):
     def __init__(
         self,
         registry: MetricsRegistry,
-        tokenize_pool: TokenizePool | None,
-        loop: asyncio.AbstractEventLoop | None,
+        queue: TokenBatchQueue | None,
     ):
-        super().__init__(MetricSeriesKey.ISL, registry, tokenize_pool, loop)
+        super().__init__(MetricSeriesKey.ISL, registry, queue)
 
     def fire(self, ev_rec, row, pre_change):
         # Sync fast path: any backend that pre-populates token_ids (e.g. SGLang).
         if isinstance(ev_rec.data, PromptData) and ev_rec.data.token_ids is not None:
             self.registry.record(self.metric_name, len(ev_rec.data.token_ids))
-            return None
-        # Async path: tokenize raw text — used when token_ids are unavailable
-        # (e.g. OpenAI-compatible endpoints). Handled by the base class.
-        return super().fire(ev_rec, row, pre_change)
+            return
+        # Text path: tokenize raw prompt text — used when token_ids are
+        # unavailable (e.g. OpenAI-compatible endpoints). Enqueued by the base.
+        super().fire(ev_rec, row, pre_change)
 
     def _extract_text(self, ev_rec, row, pre_change):
         if isinstance(ev_rec.data, PromptData) and ev_rec.data.text is not None:
@@ -345,10 +333,9 @@ class OslTrigger(AsyncTokenTrigger):
     def __init__(
         self,
         registry: MetricsRegistry,
-        tokenize_pool: TokenizePool | None,
-        loop: asyncio.AbstractEventLoop | None,
+        queue: TokenBatchQueue | None,
     ):
-        super().__init__(MetricSeriesKey.OSL, registry, tokenize_pool, loop)
+        super().__init__(MetricSeriesKey.OSL, registry, queue)
 
     def _extract_text(self, ev_rec, row, pre_change):
         if isinstance(ev_rec.data, TextModelOutput):
@@ -383,14 +370,12 @@ class TpotTrigger(AsyncTokenTrigger):
     def __init__(
         self,
         registry: MetricsRegistry,
-        tokenize_pool: TokenizePool | None,
-        loop: asyncio.AbstractEventLoop | None,
+        queue: TokenBatchQueue | None,
     ):
         super().__init__(
             MetricSeriesKey.TPOT_NS,
             registry,
-            tokenize_pool,
-            loop,
+            queue,
             requires=(SampleField.RECV_FIRST_NS,),
             dtype=float,
         )
@@ -444,7 +429,6 @@ def __init__(self, registry: MetricsRegistry) -> None:
         self._registry = registry
         self._in_flight: dict[str, SampleRow] = {}
         self._triggers: dict[str, list[EmitTrigger]] = {}
-        self._in_flight_tasks: set[asyncio.Task] = set()
 
         # Session-level state
         self.is_tracking: bool = False
@@ -538,45 +522,6 @@ def set_field(
             self._update_tracked_block(row, ev_rec.timestamp_ns)
             self._in_flight.pop(sample_uuid, None)
 
-    # --- Task draining ---
-
-    @property
-    def in_flight_tasks_count(self) -> int:
-        """Number of async trigger tasks currently in flight."""
-        return len(self._in_flight_tasks)
-
-    async def drain_tasks(self, *, timeout: float | None = None) -> int:
-        """Await in-flight async trigger tasks.
-
-        With ``timeout``, the pending set at the timeout boundary is
-        cancelled and awaited; the count of those pending tasks is
-        returned (>0 indicates the drain timed out). Without
-        ``timeout``, blocks indefinitely and returns 0 on clean drain.
-
-        The pending count must be captured BEFORE the cancel-and-await
-        step: each task's ``add_done_callback(_in_flight_tasks.discard)``
-        empties ``_in_flight_tasks`` as cancellation propagates, so
-        reading ``in_flight_tasks_count`` after this method returns
-        would always be 0 — making a drain timeout indistinguishable
-        from a clean run.
-        """
-        if not self._in_flight_tasks:
-            return 0
-        if timeout is None:
-            await asyncio.gather(*self._in_flight_tasks, return_exceptions=True)
-            self._in_flight_tasks.clear()
-            return 0
-        _, still_pending = await asyncio.wait(
-            list(self._in_flight_tasks), timeout=timeout
-        )
-        n_pending = len(still_pending)
-        if still_pending:
-            for t in still_pending:
-                t.cancel()
-            await asyncio.gather(*still_pending, return_exceptions=True)
-        self._in_flight_tasks.clear()
-        return n_pending
-
     # --- Internal ---
 
     def _create_row(self, sample_uuid: str) -> SampleRow:
@@ -595,10 +540,7 @@ def _fire_triggers(
     ) -> None:
         for trigger in self._triggers.get(field_name, ()):
             pre_change = {attr: getattr(row, attr) for attr in trigger.requires}
-            task = trigger.fire(ev_rec, row, pre_change)
-            if task is not None:
-                self._in_flight_tasks.add(task)
-                task.add_done_callback(self._in_flight_tasks.discard)
+            trigger.fire(ev_rec, row, pre_change)
 
     def _update_tracked_block(self, row: SampleRow, complete_ns: int) -> None:
         """Extend the sample's tracked block duration and increment count."""
diff --git a/src/inference_endpoint/async_utils/services/metrics_aggregator/publisher.py b/src/inference_endpoint/async_utils/services/metrics_aggregator/publisher.py
index d21973a3f..b58aa05ff 100644
--- a/src/inference_endpoint/async_utils/services/metrics_aggregator/publisher.py
+++ b/src/inference_endpoint/async_utils/services/metrics_aggregator/publisher.py
@@ -21,7 +21,7 @@
 import json
 import logging
 import os
-from collections.abc import Callable
+from collections.abc import Awaitable, Callable
 from pathlib import Path
 
 from inference_endpoint.async_utils.services.metrics_aggregator.registry import (
@@ -102,15 +102,22 @@ def start(
         registry: MetricsRegistry,
         publish_interval_s: float,
         get_runtime_state: Callable[[], tuple[SessionState, int]],
+        pre_publish: Callable[[], Awaitable[None]] | None = None,
     ) -> None:
         """Begin publishing live ticks every ``publish_interval_s`` seconds.
 
         ``get_runtime_state`` returns ``(state, n_pending_tasks)`` for the
         current moment: the aggregator's session state (``LIVE`` or
-        ``DRAINING``) and the count of in-flight async tokenize tasks. The
-        callable is invoked once per tick and the values are plumbed into
-        the published snapshot. ``COMPLETE`` is emitted only by
-        ``publish_final``, never by the tick task.
+        ``DRAINING``) and the count of pending tokenizations. The callable is
+        invoked once per tick and the values are plumbed into the published
+        snapshot. ``COMPLETE`` is emitted only by ``publish_final``, never by
+        the tick task.
+
+        ``pre_publish``, if given, is awaited at the top of each tick before
+        the snapshot is built — the aggregator uses it to flush buffered
+        tokenizations so live ISL/OSL/TPOT reflect recently completed samples.
+        Its failures are swallowed by the tick's own try/except (the tick keeps
+        going), so a transient tokenizer hiccup never stops live publishing.
 
         Idempotent on the tick-task slot: a second call (e.g. from a
         spurious duplicate ``STARTED`` event or a buggy replay producer)
@@ -133,6 +140,8 @@ async def _tick() -> None:
             while True:
                 try:
                     await asyncio.sleep(publish_interval_s)
+                    if pre_publish is not None:
+                        await pre_publish()
                     state, n_pending = get_runtime_state()
                     snap = registry.build_snapshot(
                         state=state, n_pending_tasks=n_pending
diff --git a/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py b/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py
index 3411d5061..57c9704d4 100644
--- a/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py
+++ b/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py
@@ -13,25 +13,46 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-"""Tokenization utilities for metrics aggregation."""
+"""Tokenization for ISL/OSL/TPOT metrics.
+
+``BatchTokenizer`` tokenizes whole batches of text at once. A single BPE rayon
+pool saturates ~8 CPU cores (memory-bound), so to use the whole machine it
+shards each batch across worker *processes*, one pinned to each block of
+``CORES_PER_WORKER`` cores (their rayon pools stay NUMA-local). The aggregator
+buffers per-sample text as COMPLETE events arrive and calls ``count_texts`` once
+per flush (publish tick + drain) — so batching, not a per-request coalescer,
+keeps tokenization ahead of completions. Falls back to a single in-process
+thread when there is no fast Rust backend or fewer than two core blocks fit.
+"""
 
 from __future__ import annotations
 
 import asyncio
 import json
 import logging
-import threading
-from concurrent.futures import ThreadPoolExecutor
-from typing import TYPE_CHECKING, Any
+import multiprocessing
+import os
+from collections.abc import Callable
+from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
+from typing import TYPE_CHECKING, Any, Protocol, cast
 
 import msgspec
 from transformers import AutoTokenizer
+from transformers.utils import logging as transformers_logging
+
+# A single rayon pool peaks at ~8 cores for BPE (memory-bound; more threads
+# oversubscribe and, on multi-socket Grace, cross the NUMA boundary). Sharding
+# across processes pinned to disjoint 8-core blocks is how the whole machine is
+# used. Measured on GB200: ~16k texts/s at 18 blocks vs ~1.5k single-process.
+CORES_PER_WORKER = 8
 
 # Minimal user message used to satisfy chat templates that reject assistant-only
 # message lists. Its token count is subtracted so only the assistant payload is
 # measured.
 _PREFIX_USER_MSG: dict[str, str] = {"role": "user", "content": ""}
 
+logger = logging.getLogger(__name__)
+
 
 def _normalize_tool_calls_for_template(
     tool_calls: tuple[dict[str, Any], ...] | list[dict[str, Any]],
@@ -60,140 +81,252 @@ def _normalize_tool_calls_for_template(
     return normalized
 
 
+# ---------------------------------------------------------------------------
+# Process-worker entry points (module-level so ProcessPoolExecutor can pickle
+# them by name). Each worker holds one raw tokenizers backend, pinned to a
+# fixed core block.
+# ---------------------------------------------------------------------------
+
+_WORKER_BACKEND: Any = None
+
+
+def _init_worker(tokenizer_name: str, core_set: list[int]) -> None:
+    """Pin this worker to ``core_set``, then load the raw tokenizers backend.
+
+    Affinity is set before the first encode so the Rust rayon pool sizes itself
+    to the pinned core count (num_cpus respects sched_getaffinity on Linux).
+    """
+    if core_set:
+        try:
+            os.sched_setaffinity(0, set(core_set))
+        except (OSError, AttributeError):
+            logger.debug("could not pin tokenizer worker to %s", core_set)
+    transformers_logging.set_verbosity_error()
+    tok = AutoTokenizer.from_pretrained(tokenizer_name, trust_remote_code=True)
+    global _WORKER_BACKEND
+    _WORKER_BACKEND = getattr(tok, "backend_tokenizer", None)
+    if _WORKER_BACKEND is not None:
+        _WORKER_BACKEND.encode("warmup", add_special_tokens=False)
+
+
+def _worker_encode_lengths(texts: list[str]) -> list[int]:
+    """Per-text token counts for a shard, in one rayon-parallel call."""
+    backend = _WORKER_BACKEND
+    if backend is None:
+        raise RuntimeError("tokenizer worker backend unavailable")
+    encode_batch = getattr(backend, "encode_batch_fast", None) or backend.encode_batch
+    return [len(e.ids) for e in encode_batch(texts, add_special_tokens=False)]
+
+
+def _worker_ready(_: int) -> bool:
+    """Warmup probe: returns once the worker's backend is loaded."""
+    return _WORKER_BACKEND is not None
+
+
 if TYPE_CHECKING:
     from transformers import PreTrainedTokenizerBase
 
-logger = logging.getLogger(__name__)
 
+def _even_chunks(items: list[str], n: int) -> list[list[str]]:
+    """Split ``items`` into at most ``n`` near-equal contiguous chunks."""
+    if n <= 1 or len(items) <= 1:
+        return [items]
+    size = (len(items) + n - 1) // n
+    return [items[i : i + size] for i in range(0, len(items), size)]
 
-class TokenizePool:
-    """A pool of worker threads, each with its own HuggingFace AutoTokenizer.
 
-    Uses multi-threading (not multiprocessing) because HuggingFace tokenizers
-    use a Rust backend that releases the GIL during tokenization, so threads
-    can run tokenization in parallel without GIL contention. Multiprocessing
-    would add process spawn overhead and per-process tokenizer memory and
-    IPC latency.
+class BatchTokenizer:
+    """Counts tokens for batches of text, sharded across pinned CPU cores.
 
-    Thread-safety notes:
-    - The ThreadPoolExecutor itself is thread-safe (submit/shutdown are synchronized).
-    - Each worker thread has its own tokenizer via thread-local storage, so there
-      is no shared mutable state during tokenization.
-    - The blocking `token_count()` method is safe to call from multiple threads
-      concurrently.
-    - In an async context, use `token_count_async` to avoid blocking the event loop.
+    ``count_texts`` / ``count_texts_async`` tokenize a whole list in one shot.
+    The sync ``token_count`` and chat-template ``token_count_message`` paths run
+    on a single in-process worker thread — they are rare (single ISL probes, tool
+    calls) relative to the batched OSL/ISL/TPOT flush.
     """
 
-    def __init__(self, tokenizer_name: str, n_workers: int) -> None:
-        if n_workers < 1:
-            raise ValueError("n_workers must be at least 1")
+    def __init__(
+        self,
+        tokenizer_name: str,
+        *,
+        cores_per_worker: int = CORES_PER_WORKER,
+    ) -> None:
         self._tokenizer_name = tokenizer_name
-        self._n_workers = n_workers
-        self._thread_local = threading.local()
         self._fallback_warned: set[str] = set()
-        self._executor: ThreadPoolExecutor | None = ThreadPoolExecutor(
-            max_workers=n_workers,
-            thread_name_prefix="TokenizePool",
+        self._tokenizer: PreTrainedTokenizerBase | None = None
+        self._prefix_len = 0
+        self._baseline = 0
+        # In-process thread for the sync + chat-template paths.
+        self._thread: ThreadPoolExecutor | None = ThreadPoolExecutor(
+            max_workers=1, thread_name_prefix="tok-thread"
         )
-        # Pre-load a tokenizer on every worker thread so the first real
-        # token_count call doesn't pay the AutoTokenizer.from_pretrained cost.
-        # Submitting n_workers tasks is guaranteed to hit every thread because
-        # AutoTokenizer.from_pretrained blocks long enough that no thread
-        # completes before all tasks are submitted.
-        # **IMPORTANT**: This is not a guarantee - for instance when using a mock
-        # object in tests for the tokenizer, the mock object *must* block in the 100ms
-        # range to simulate proper .from_pretrained behavior.
-        # It is not super impactful if a thread is not pre-initialized - it will just
-        # have to pay the cost of .from_pretrained on the first pool.token_count call
-        # for that thread.
-        futures = [
-            self._executor.submit(self._get_thread_tokenizer) for _ in range(n_workers)
-        ]
+        self._load_tokenizer()  # also computes the chat-template baseline
+        # Process shards for the batched text path (or empty -> in-process).
+        self._procs: list[ProcessPoolExecutor] = []
+        self._setup_shards(cores_per_worker)
+
+    # -- setup --------------------------------------------------------------
+
+    def _load_tokenizer(self) -> None:
+        tok = AutoTokenizer.from_pretrained(
+            self._tokenizer_name, trust_remote_code=True
+        )
+        self._tokenizer = tok
+        # Baseline = tokens from a [user, empty-assistant] pair minus the [user]
+        # prefix alone, so the assistant frame is subtracted from message counts.
         try:
-            for f in futures:
-                f.result()
-        except Exception:
-            self._executor.shutdown(wait=False)
-            self._executor = None
-            raise
-
-    def _get_thread_tokenizer(self) -> PreTrainedTokenizerBase:
-        """Return the tokenizer for the current thread, loading it if needed."""
-        if getattr(self._thread_local, "tokenizer", None) is None:
-            self._thread_local.tokenizer = AutoTokenizer.from_pretrained(
-                self._tokenizer_name, trust_remote_code=True
+            prefix = cast(
+                str,
+                tok.apply_chat_template(
+                    [_PREFIX_USER_MSG], tokenize=False, add_generation_prompt=False
+                ),
             )
-            # Baseline = tokens contributed by a [user, empty-assistant] pair minus
-            # the [user] prefix alone. Some templates (Qwen3-Coder, etc.) reject
-            # assistant-only message lists, so a user prefix is required; we
-            # subtract it out so the baseline reflects only the assistant frame.
-            try:
-                tok = self._thread_local.tokenizer
-                prefix_rendered = tok.apply_chat_template(
-                    [_PREFIX_USER_MSG],
-                    tokenize=False,
-                    add_generation_prompt=False,
-                )
-                prefix_len = len(tok.tokenize(prefix_rendered))
-                with_empty_assistant_rendered = tok.apply_chat_template(
+            self._prefix_len = len(tok.tokenize(prefix))
+            with_assistant = cast(
+                str,
+                tok.apply_chat_template(
                     [_PREFIX_USER_MSG, {"role": "assistant", "content": ""}],
                     tokenize=False,
                     add_generation_prompt=False,
+                ),
+            )
+            self._baseline = len(tok.tokenize(with_assistant)) - self._prefix_len
+        except Exception:
+            self._prefix_len = 0
+            self._baseline = 0
+            logger.exception(
+                "Failed to compute chat-template baseline for %s; tool-call "
+                "token counts may be over-estimated",
+                self._tokenizer_name,
+            )
+
+    def _setup_shards(self, cores_per_worker: int) -> None:
+        """Spawn one pinned single-worker process per core block.
+
+        No-op (leaving the batch path in-process) when the tokenizer has no fast
+        Rust backend, affinity is unavailable, or fewer than two blocks fit — a
+        single shard is no faster than the in-process backend.
+        """
+        if cores_per_worker <= 0:
+            return
+        if getattr(self._tokenizer, "backend_tokenizer", None) is None:
+            return
+        try:
+            available = sorted(os.sched_getaffinity(0))
+        except (OSError, AttributeError):
+            return
+        n = len(available) // cores_per_worker
+        if n < 2:
+            return
+        ctx = multiprocessing.get_context("spawn")
+        procs: list[ProcessPoolExecutor] = []
+        try:
+            for i in range(n):
+                block = available[i * cores_per_worker : (i + 1) * cores_per_worker]
+                ex = ProcessPoolExecutor(
+                    max_workers=1,
+                    mp_context=ctx,
+                    initializer=_init_worker,
+                    initargs=(self._tokenizer_name, block),
                 )
-                with_empty_assistant_len = len(
-                    tok.tokenize(with_empty_assistant_rendered)
-                )
-                self._thread_local.prefix_len = prefix_len
-                self._thread_local.baseline = with_empty_assistant_len - prefix_len
-            except Exception:
-                self._thread_local.prefix_len = 0
-                self._thread_local.baseline = 0
-                logger.exception(
-                    "Failed to compute chat-template baseline for %s; tool-call token counts may be over-estimated",
-                    self._tokenizer_name,
-                )
-        return self._thread_local.tokenizer
+                procs.append(ex)
+            # Force spawn + pin + tokenizer-load now (not on the first batch).
+            # Submit to every shard first so the loads run in parallel, then
+            # await — waiting on each before submitting the next would
+            # serialize the per-shard tokenizer loads and can exceed the launch budget.
+            ready = [ex.submit(_worker_ready, 0) for ex in procs]
+            for f in ready:
+                f.result()
+        except Exception:
+            for ex in procs:
+                ex.shutdown(wait=False)
+            logger.exception(
+                "tokenizer shard setup failed; using in-process tokenization"
+            )
+            return
+        self._procs = procs
+        logger.info(
+            "BatchTokenizer: %d shards x %d cores", len(procs), cores_per_worker
+        )
+
+    # -- batched text path --------------------------------------------------
 
-    def _token_count_worker(self, text: str) -> int:
-        """Worker entry: return the number of tokens in text."""
-        tokenizer = self._get_thread_tokenizer()
-        return len(tokenizer.tokenize(text))
+    def _encode_lengths_inproc(self, texts: list[str]) -> list[int]:
+        tok = self._tokenizer
+        backend = getattr(tok, "backend_tokenizer", None)
+        if backend is not None:
+            encode_batch = getattr(backend, "encode_batch_fast", None)
+            if encode_batch is None:
+                encode_batch = backend.encode_batch
+            return [len(e.ids) for e in encode_batch(texts, add_special_tokens=False)]
+        return [len(tok.tokenize(t)) for t in texts]  # type: ignore[union-attr]
+
+    def count_texts(self, texts: list[str]) -> list[int]:
+        """Per-text token counts for a whole batch (blocking)."""
+        if not texts:
+            return []
+        if not self._procs:
+            return self._encode_lengths_inproc(texts)
+        chunks = _even_chunks(texts, len(self._procs))
+        futures = [
+            self._procs[i].submit(_worker_encode_lengths, chunk)
+            for i, chunk in enumerate(chunks)
+        ]
+        out: list[int] = []
+        for f in futures:
+            out.extend(f.result())
+        return out
 
-    def _token_count_message_worker(
+    async def count_texts_async(
+        self, texts: list[str], loop: asyncio.AbstractEventLoop
+    ) -> list[int]:
+        """Per-text token counts for a whole batch without blocking the loop."""
+        if not texts:
+            return []
+        if not self._procs:
+            return await loop.run_in_executor(
+                self._thread, self._encode_lengths_inproc, texts
+            )
+        chunks = _even_chunks(texts, len(self._procs))
+        futures = [
+            asyncio.wrap_future(self._procs[i].submit(_worker_encode_lengths, chunk))
+            for i, chunk in enumerate(chunks)
+        ]
+        results = await asyncio.gather(*futures)
+        out: list[int] = []
+        for r in results:
+            out.extend(r)
+        return out
+
+    # -- sync + chat-template paths (in-process thread) ---------------------
+
+    def _token_count_text(self, text: str) -> int:
+        return len(self._tokenizer.tokenize(text))  # type: ignore[union-attr]
+
+    def _token_count_message(
         self,
         content: str,
         reasoning: str | None,
         tool_calls: tuple[dict[str, Any], ...] | None,
     ) -> int:
-        """Worker entry: tokenize a full assistant message using apply_chat_template.
-
-        Falls back to whitespace-split tokenization if apply_chat_template raises
-        (e.g. the template does not support tool_calls or reasoning fields).
-        """
-        tokenizer = self._get_thread_tokenizer()
+        tok = self._tokenizer
         msg: dict[str, Any] = {"role": "assistant", "content": content or ""}
         if reasoning:
             msg["reasoning_content"] = reasoning
         if tool_calls:
             msg["tool_calls"] = _normalize_tool_calls_for_template(tool_calls)
         try:
-            rendered = tokenizer.apply_chat_template(
-                [_PREFIX_USER_MSG, msg],
-                tokenize=False,
-                add_generation_prompt=False,
+            rendered = tok.apply_chat_template(  # type: ignore[union-attr]
+                [_PREFIX_USER_MSG, msg], tokenize=False, add_generation_prompt=False
             )
-            full = len(tokenizer.tokenize(rendered))
-            prefix_len = getattr(self._thread_local, "prefix_len", 0)
-            baseline = getattr(self._thread_local, "baseline", 0)
-            return max(0, full - prefix_len - baseline)
+            full = len(tok.tokenize(rendered))  # type: ignore[union-attr]
+            return max(0, full - self._prefix_len - self._baseline)
         except Exception as exc:
             key = f"{self._tokenizer_name}:{type(exc).__name__}"
             if key not in self._fallback_warned:
                 self._fallback_warned.add(key)
                 logger.exception(
                     "apply_chat_template failed for %s (%s); falling back to "
-                    "whitespace tokenization. Tool-call OSL/TPOT may diverge "
-                    "from server-side counts for this run.",
+                    "whitespace tokenization. Tool-call OSL/TPOT may diverge.",
                     self._tokenizer_name,
                     type(exc).__name__,
                 )
@@ -203,15 +336,13 @@ def _token_count_message_worker(
             parts = [
                 p for p in (content or None, reasoning or None, tool_calls_json) if p
             ]
-            fallback_text = "\n".join(parts)
-            return self._token_count_worker(fallback_text)
+            return self._token_count_text("\n".join(parts))
 
     def token_count(self, text: str) -> int:
-        """Return the number of tokens in the input string (blocking)."""
-        if self._executor is None:
-            raise RuntimeError("TokenizePool is closed")
-        future = self._executor.submit(self._token_count_worker, text)
-        return future.result()
+        """Token count for a single string (blocking)."""
+        if self._thread is None:
+            raise RuntimeError("BatchTokenizer is closed")
+        return self._thread.submit(self._token_count_text, text).result()
 
     def token_count_message(
         self,
@@ -219,27 +350,12 @@ def token_count_message(
         reasoning: str | None,
         tool_calls: tuple[dict[str, Any], ...] | None,
     ) -> int:
-        """Return the token count for an assistant message (blocking)."""
-        if self._executor is None:
-            raise RuntimeError("TokenizePool is closed")
-        future = self._executor.submit(
-            self._token_count_message_worker, content, reasoning, tool_calls
-        )
-        return future.result()
-
-    async def token_count_async(
-        self, text: str, loop: asyncio.AbstractEventLoop
-    ) -> int:
-        """Return the number of tokens without blocking the event loop.
-
-        Submits directly to the TokenizePool's executor so tokenization runs
-        on a thread with a pre-loaded thread-local tokenizer instance.
-        """
-        if self._executor is None:
-            raise RuntimeError("TokenizePool is closed")
-        return await loop.run_in_executor(
-            self._executor, self._token_count_worker, text
-        )
+        """Token count for an assistant message via the chat template (blocking)."""
+        if self._thread is None:
+            raise RuntimeError("BatchTokenizer is closed")
+        return self._thread.submit(
+            self._token_count_message, content, reasoning, tool_calls
+        ).result()
 
     async def token_count_message_async(
         self,
@@ -248,25 +364,148 @@ async def token_count_message_async(
         tool_calls: tuple[dict[str, Any], ...] | None,
         loop: asyncio.AbstractEventLoop,
     ) -> int:
-        """Return the token count for an assistant message without blocking the event loop."""
-        if self._executor is None:
-            raise RuntimeError("TokenizePool is closed")
+        """Chat-template message token count without blocking the loop."""
+        if self._thread is None:
+            raise RuntimeError("BatchTokenizer is closed")
         return await loop.run_in_executor(
-            self._executor,
-            self._token_count_message_worker,
-            content,
-            reasoning,
-            tool_calls,
+            self._thread, self._token_count_message, content, reasoning, tool_calls
         )
 
     def close(self) -> None:
-        """Shut down the worker pool. Idempotent."""
-        if self._executor is not None:
-            self._executor.shutdown(wait=True)
-            self._executor = None
+        """Shut down all workers. Idempotent."""
+        for ex in self._procs:
+            ex.shutdown(wait=False)
+        self._procs = []
+        if self._thread is not None:
+            self._thread.shutdown(wait=True)
+            self._thread = None
 
-    def __enter__(self) -> TokenizePool:
+    def __enter__(self) -> BatchTokenizer:
         return self
 
     def __exit__(self, exc_type: object, exc_val: object, exc_tb: object) -> None:
         self.close()
+
+
+# Type alias for the (content, reasoning, tool_calls) tuple a message trigger
+# enqueues for chat-template tokenization.
+MessageParts = tuple[str, str | None, "tuple[dict[str, Any], ...] | None"]
+
+
+class TokenCounter(Protocol):
+    """The async tokenization surface ``TokenBatchQueue`` depends on.
+
+    ``BatchTokenizer`` satisfies this structurally; tests pass lightweight
+    stubs. Declared as a Protocol so the queue is decoupled from the concrete
+    tokenizer and test doubles type-check without inheritance.
+    """
+
+    async def count_texts_async(
+        self, texts: list[str], loop: asyncio.AbstractEventLoop, /
+    ) -> list[int]: ...
+
+    async def token_count_message_async(
+        self,
+        content: str,
+        reasoning: str | None,
+        tool_calls: tuple[dict[str, Any], ...] | None,
+        loop: asyncio.AbstractEventLoop,
+        /,
+    ) -> int: ...
+
+
+class TokenBatchQueue:
+    """Buffers per-sample tokenization work and clears it in batches.
+
+    Triggers call ``enqueue_text`` / ``enqueue_message`` at event time with an
+    ``on_count`` callback that records the resulting metric. The aggregator
+    drains the buffer with ``flush`` (once per publish tick, so live ISL/OSL/
+    TPOT stay current) and with ``flush_remaining`` at end-of-run. Holding the
+    work until a flush lets the whole buffer go through ``BatchTokenizer`` in
+    one sharded call, instead of one event-loop task per completion — the latter
+    is what fell behind and stretched the drain on high-completion-rate runs.
+
+    ``pending`` counts enqueued-but-not-yet-recorded items; it is the
+    ``n_pending_tasks`` surfaced on the snapshot, and a non-zero value in the
+    final snapshot means the end-of-run flush did not finish within the drain
+    budget.
+    """
+
+    def __init__(
+        self, tokenizer: TokenCounter, loop: asyncio.AbstractEventLoop
+    ) -> None:
+        self._tokenizer = tokenizer
+        self._loop = loop
+        self._text: list[tuple[str, Callable[[int], None]]] = []
+        self._msg: list[tuple[MessageParts, Callable[[int], None]]] = []
+        self._inflight = 0
+        # Serializes flushes so a periodic tick flush and the end-of-run flush
+        # never record the same item twice or race on the pending count.
+        self._lock = asyncio.Lock()
+
+    @property
+    def pending(self) -> int:
+        """Enqueued items not yet tokenized-and-recorded."""
+        return self._inflight
+
+    def enqueue_text(self, text: str, on_count: Callable[[int], None]) -> None:
+        self._inflight += 1
+        self._text.append((text, on_count))
+
+    def enqueue_message(
+        self, parts: MessageParts, on_count: Callable[[int], None]
+    ) -> None:
+        self._inflight += 1
+        self._msg.append((parts, on_count))
+
+    async def flush(self) -> None:
+        """Tokenize everything buffered so far and run each ``on_count``.
+
+        Items are detached from the buffer up front so concurrent enqueues land
+        in the next flush. ``_inflight`` is decremented only after a callback
+        runs, so a cancellation (drain timeout) leaves it reflecting exactly the
+        items that were not recorded.
+        """
+        async with self._lock:
+            if not (self._text or self._msg):
+                return
+            text_items, self._text = self._text, []
+            msg_items, self._msg = self._msg, []
+            if text_items:
+                counts = await self._tokenizer.count_texts_async(
+                    [t for t, _ in text_items], self._loop
+                )
+                for (_, on_count), count in zip(text_items, counts, strict=True):
+                    try:
+                        on_count(count)
+                    finally:
+                        self._inflight -= 1
+            for (content, reasoning, tool_calls), on_count in msg_items:
+                count = await self._tokenizer.token_count_message_async(
+                    content, reasoning, tool_calls, self._loop
+                )
+                try:
+                    on_count(count)
+                finally:
+                    self._inflight -= 1
+
+    async def flush_remaining(self, timeout: float | None) -> int:
+        """End-of-run flush, bounded by ``timeout`` seconds.
+
+        Returns the number of items still un-tokenized — non-zero only if the
+        budget was exhausted (``timeout`` reached). ``None`` waits indefinitely.
+        """
+        if self._inflight == 0:
+            return 0
+        try:
+            if timeout is None:
+                await self.flush()
+            else:
+                await asyncio.wait_for(self.flush(), timeout)
+        except TimeoutError:
+            logger.warning(
+                "tokenizer drain timed out after %.1fs; %d items not counted",
+                timeout,
+                self._inflight,
+            )
+        return self._inflight
diff --git a/src/inference_endpoint/commands/benchmark/execute.py b/src/inference_endpoint/commands/benchmark/execute.py
index a2050bbe3..380a0b14d 100644
--- a/src/inference_endpoint/commands/benchmark/execute.py
+++ b/src/inference_endpoint/commands/benchmark/execute.py
@@ -612,12 +612,6 @@ async def _run_benchmark_async(
         aggregator_args.extend(
             ["--drain-timeout", str(config.settings.drain.metrics_drain_timeout_s)]
         )
-        aggregator_args.extend(
-            [
-                "--tokenizer-workers",
-                str(config.settings.drain.metrics_tokenizer_workers),
-            ]
-        )
 
         # EventLoggerService writes events.jsonl to tmpfs (high-frequency writes)
         event_logger_args: list[str] = [
diff --git a/src/inference_endpoint/config/schema.py b/src/inference_endpoint/config/schema.py
index 9226d7f85..722652e0d 100644
--- a/src/inference_endpoint/config/schema.py
+++ b/src/inference_endpoint/config/schema.py
@@ -592,21 +592,6 @@ class DrainConfig(BaseModel):
             "in-flight tokenize tasks after ENDED (default: 60.0; 0 = unlimited)."
         ),
     )
-    metrics_tokenizer_workers: Annotated[
-        int,
-        cyclopts.Parameter(
-            alias="--metrics-tokenizer-workers",
-            help=(
-                "Number of tokenizer worker threads in the metrics aggregator. "
-                "Increase if ISL/OSL/TPOT tokenization can't keep up with request "
-                "throughput (symptoms: large drain timeout warning at run end)."
-            ),
-        ),
-    ] = Field(
-        2,
-        ge=1,
-        description="Number of tokenizer worker threads in the metrics aggregator (default: 2).",
-    )
 
 
 @cyclopts.Parameter(name="*")
diff --git a/src/inference_endpoint/config/templates/concurrency_template_full.yaml b/src/inference_endpoint/config/templates/concurrency_template_full.yaml
index 38829f0f5..693765e57 100644
--- a/src/inference_endpoint/config/templates/concurrency_template_full.yaml
+++ b/src/inference_endpoint/config/templates/concurrency_template_full.yaml
@@ -80,7 +80,6 @@ settings:
     performance_timeout_s: 240.0  # Performance drain timeout in seconds (None = wait indefinitely)
     accuracy_timeout_s: null  # Accuracy drain timeout in seconds (None = wait indefinitely)
     metrics_drain_timeout_s: 60.0  # Wall-clock budget (seconds) for the metrics aggregator to drain in-flight tokenize tasks after ENDED (default: 60.0; 0 = unlimited).
-    metrics_tokenizer_workers: 2  # Number of tokenizer worker threads in the metrics aggregator (default: 2).
   warmup:
     enabled: false  # Enable warmup phase before performance run
     n_requests: null  # Warmup request count (None = full dataset once)
diff --git a/src/inference_endpoint/config/templates/offline_template_full.yaml b/src/inference_endpoint/config/templates/offline_template_full.yaml
index c3454d5da..64439452f 100644
--- a/src/inference_endpoint/config/templates/offline_template_full.yaml
+++ b/src/inference_endpoint/config/templates/offline_template_full.yaml
@@ -80,7 +80,6 @@ settings:
     performance_timeout_s: 240.0  # Performance drain timeout in seconds (None = wait indefinitely)
     accuracy_timeout_s: null  # Accuracy drain timeout in seconds (None = wait indefinitely)
     metrics_drain_timeout_s: 60.0  # Wall-clock budget (seconds) for the metrics aggregator to drain in-flight tokenize tasks after ENDED (default: 60.0; 0 = unlimited).
-    metrics_tokenizer_workers: 2  # Number of tokenizer worker threads in the metrics aggregator (default: 2).
   warmup:
     enabled: false  # Enable warmup phase before performance run
     n_requests: null  # Warmup request count (None = full dataset once)
diff --git a/src/inference_endpoint/config/templates/online_template_full.yaml b/src/inference_endpoint/config/templates/online_template_full.yaml
index 5bea95329..0c810f30b 100644
--- a/src/inference_endpoint/config/templates/online_template_full.yaml
+++ b/src/inference_endpoint/config/templates/online_template_full.yaml
@@ -80,7 +80,6 @@ settings:
     performance_timeout_s: 240.0  # Performance drain timeout in seconds (None = wait indefinitely)
     accuracy_timeout_s: null  # Accuracy drain timeout in seconds (None = wait indefinitely)
     metrics_drain_timeout_s: 60.0  # Wall-clock budget (seconds) for the metrics aggregator to drain in-flight tokenize tasks after ENDED (default: 60.0; 0 = unlimited).
-    metrics_tokenizer_workers: 2  # Number of tokenizer worker threads in the metrics aggregator (default: 2).
   warmup:
     enabled: false  # Enable warmup phase before performance run
     n_requests: null  # Warmup request count (None = full dataset once)
diff --git a/tests/unit/async_utils/services/metrics_aggregator/conftest.py b/tests/unit/async_utils/services/metrics_aggregator/conftest.py
index 7adbe0361..b32811bcf 100644
--- a/tests/unit/async_utils/services/metrics_aggregator/conftest.py
+++ b/tests/unit/async_utils/services/metrics_aggregator/conftest.py
@@ -49,24 +49,26 @@
 from inference_endpoint.core.types import TextModelOutput
 
 # ---------------------------------------------------------------------------
-# Mock TokenizePool — used by tests that exercise async triggers directly.
+# Mock BatchTokenizer — whitespace token counts; matches the BatchTokenizer
+# surface the TokenBatchQueue calls (count_texts_async + message path).
 # ---------------------------------------------------------------------------
 
 
-class MockTokenizePool:
-    """Mock TokenizePool that splits on whitespace with artificial async delay."""
+class MockBatchTokenizer:
+    """Mock BatchTokenizer that splits on whitespace with optional async delay."""
 
-    def __init__(self, delay: float = 0.01) -> None:
+    def __init__(self, delay: float = 0.0) -> None:
         self._delay = delay
 
     def token_count(self, text: str) -> int:
         return len(text.split())
 
-    async def token_count_async(
-        self, text: str, _loop: asyncio.AbstractEventLoop
-    ) -> int:
-        await asyncio.sleep(self._delay)
-        return len(text.split())
+    async def count_texts_async(
+        self, texts: list[str], _loop: asyncio.AbstractEventLoop
+    ) -> list[int]:
+        if self._delay:
+            await asyncio.sleep(self._delay)
+        return [len(t.split()) for t in texts]
 
     async def token_count_message_async(
         self,
@@ -77,7 +79,8 @@ async def token_count_message_async(
     ) -> int:
         import msgspec
 
-        await asyncio.sleep(self._delay)
+        if self._delay:
+            await asyncio.sleep(self._delay)
         tool_calls_str = (
             msgspec.json.encode(list(tool_calls)).decode() if tool_calls else ""
         )
@@ -164,7 +167,7 @@ def make_aggregator(
     loop: asyncio.AbstractEventLoop,
     socket_name: str,
     *,
-    tokenize_pool=None,
+    tokenizer=None,
     streaming: bool = True,
     shutdown_event: asyncio.Event | None = None,
 ) -> tuple[MetricsAggregatorService, MetricsRegistry, MagicMock]:
@@ -195,7 +198,7 @@ def make_aggregator(
         publish_interval_s=0.25,
         sig_figs=3,
         n_histogram_buckets=10,
-        tokenize_pool=tokenize_pool,
+        tokenizer=tokenizer,
         streaming=streaming,
         shutdown_event=shutdown_event,
     )
diff --git a/tests/unit/async_utils/services/metrics_aggregator/test_aggregator.py b/tests/unit/async_utils/services/metrics_aggregator/test_aggregator.py
index 9877aee5d..87c5ff96b 100644
--- a/tests/unit/async_utils/services/metrics_aggregator/test_aggregator.py
+++ b/tests/unit/async_utils/services/metrics_aggregator/test_aggregator.py
@@ -44,7 +44,7 @@
 from inference_endpoint.core.types import ErrorData, PromptData, TextModelOutput
 
 from .conftest import (
-    MockTokenizePool,
+    MockBatchTokenizer,
     make_aggregator,
     sample_event,
     session_event,
@@ -312,10 +312,10 @@ async def test_chunk_deltas(self, tmp_path):
     async def test_non_streaming_latency_only(self, tmp_path):
         """Non-streaming: emits sample_latency_ns + OSL, no TTFT/chunk_delta/TPOT."""
         loop = asyncio.get_event_loop()
-        pool = MockTokenizePool(delay=0.0)
+        tokenizer = MockBatchTokenizer(delay=0.0)
         with ManagedZMQContext.scoped(socket_dir=str(tmp_path)) as ctx:
             agg, registry, _ = make_aggregator(
-                ctx, loop, "agg_non_streaming", tokenize_pool=pool
+                ctx, loop, "agg_non_streaming", tokenizer=tokenizer
             )
             try:
                 await agg.process(
@@ -332,7 +332,7 @@ async def test_non_streaming_latency_only(self, tmp_path):
                         ),
                     ]
                 )
-                await agg._table.drain_tasks()
+                await agg._flush_tokens()
                 # sample_latency = 3000-1000 = 2000
                 assert (
                     snapshot_series_total(
@@ -380,7 +380,7 @@ async def test_chunk_delta_not_emitted_without_last_recv(self, tmp_path):
 
 
 # ---------------------------------------------------------------------------
-# ISL (token_ids path -- sync, no tokenize_pool needed)
+# ISL (token_ids path -- sync, no tokenizer needed)
 # ---------------------------------------------------------------------------
 
 
@@ -766,7 +766,7 @@ async def test_total_vs_tracked_counters(self, tmp_path):
 
 
 # ---------------------------------------------------------------------------
-# Async trigger tests (with mock TokenizePool and real event loop)
+# Token trigger tests (with mock BatchTokenizer and real event loop)
 # ---------------------------------------------------------------------------
 
 
@@ -776,10 +776,10 @@ class TestAsyncTriggers:
     async def test_isl_text_path_async(self, tmp_path):
         """ISL with text prompt triggers async tokenization."""
         loop = asyncio.get_event_loop()
-        pool = MockTokenizePool(delay=0.01)
+        tokenizer = MockBatchTokenizer(delay=0.01)
         with ManagedZMQContext.scoped(socket_dir=str(tmp_path)) as ctx:
             agg, registry, _ = make_aggregator(
-                ctx, loop, "agg_isl_text_async", tokenize_pool=pool
+                ctx, loop, "agg_isl_text_async", tokenizer=tokenizer
             )
             try:
                 await agg.process(
@@ -796,7 +796,7 @@ async def test_isl_text_path_async(self, tmp_path):
                     ]
                 )
-                # ISL task is in-flight; drain it
+                # ISL tokenization is buffered in the token queue; flush it
-                await agg._table.drain_tasks()
+                await agg._flush_tokens()
                 assert snapshot_series_total(registry, MetricSeriesKey.ISL.value) == 4
             finally:
                 agg.close()
@@ -805,10 +805,10 @@ async def test_isl_text_path_async(self, tmp_path):
     async def test_osl_emitted_on_complete(self, tmp_path):
         """OSL is emitted via async tokenization when COMPLETE carries text."""
         loop = asyncio.get_event_loop()
-        pool = MockTokenizePool(delay=0.01)
+        tokenizer = MockBatchTokenizer(delay=0.01)
         with ManagedZMQContext.scoped(socket_dir=str(tmp_path)) as ctx:
             agg, registry, _ = make_aggregator(
-                ctx, loop, "agg_osl_complete", tokenize_pool=pool
+                ctx, loop, "agg_osl_complete", tokenizer=tokenizer
             )
             try:
                 await agg.process(
@@ -825,7 +825,7 @@ async def test_osl_emitted_on_complete(self, tmp_path):
                         ),
                     ]
                 )
-                await agg._table.drain_tasks()
+                await agg._flush_tokens()
                 # sample_latency_ns = 5000-1000 = 4000
                 assert (
                     snapshot_series_total(
@@ -842,10 +842,10 @@ async def test_osl_emitted_on_complete(self, tmp_path):
     async def test_tpot_emitted_for_streaming(self, tmp_path):
         """TPOT is emitted for streaming responses using text_after_first_chunk."""
         loop = asyncio.get_event_loop()
-        pool = MockTokenizePool(delay=0.0)
+        tokenizer = MockBatchTokenizer(delay=0.0)
         with ManagedZMQContext.scoped(socket_dir=str(tmp_path)) as ctx:
             agg, registry, _ = make_aggregator(
-                ctx, loop, "agg_tpot_streaming", tokenize_pool=pool
+                ctx, loop, "agg_tpot_streaming", tokenizer=tokenizer
             )
             try:
                 await agg.process(
@@ -864,7 +864,7 @@ async def test_tpot_emitted_for_streaming(self, tmp_path):
                         ),
                     ]
                 )
-                await agg._table.drain_tasks()
+                await agg._flush_tokens()
                 # OSL = "hello world foo" = 3 tokens
                 assert snapshot_series_total(registry, MetricSeriesKey.OSL.value) == 3
                 # tpot = (5000 - 2000) / token_count("world foo") = 3000 / 2 = 1500
@@ -878,10 +878,10 @@ async def test_tpot_emitted_for_streaming(self, tmp_path):
     async def test_tpot_skipped_when_single_chunk(self, tmp_path):
         """TPOT is not emitted when there are no tokens after the first chunk."""
         loop = asyncio.get_event_loop()
-        pool = MockTokenizePool(delay=0.0)
+        tokenizer = MockBatchTokenizer(delay=0.0)
         with ManagedZMQContext.scoped(socket_dir=str(tmp_path)) as ctx:
             agg, registry, _ = make_aggregator(
-                ctx, loop, "agg_tpot_single_chunk", tokenize_pool=pool
+                ctx, loop, "agg_tpot_single_chunk", tokenizer=tokenizer
             )
             try:
                 await agg.process(
@@ -900,7 +900,7 @@ async def test_tpot_skipped_when_single_chunk(self, tmp_path):
                         ),
                     ]
                 )
-                await agg._table.drain_tasks()
+                await agg._flush_tokens()
                 assert snapshot_series_total(registry, MetricSeriesKey.OSL.value) == 1
                 assert (
                     snapshot_series_count(registry, MetricSeriesKey.TPOT_NS.value) == 0
@@ -914,13 +914,13 @@ async def test_tpot_not_emitted_without_streaming_flag(self, tmp_path):
         registered at all — the aggregator's snapshot has no entry for them.
         """
         loop = asyncio.get_event_loop()
-        pool = MockTokenizePool(delay=0.0)
+        tokenizer = MockBatchTokenizer(delay=0.0)
         with ManagedZMQContext.scoped(socket_dir=str(tmp_path)) as ctx:
             agg, registry, _ = make_aggregator(
                 ctx,
                 loop,
                 "agg_tpot_no_streaming",
-                tokenize_pool=pool,
+                tokenizer=tokenizer,
                 streaming=False,
             )
             try:
@@ -939,7 +939,7 @@ async def test_tpot_not_emitted_without_streaming_flag(self, tmp_path):
                         ),
                     ]
                 )
-                await agg._table.drain_tasks()
+                await agg._flush_tokens()
                 # sample_latency / OSL still emitted in non-streaming mode.
                 assert (
                     snapshot_series_total(
@@ -959,10 +959,10 @@ async def test_tpot_not_emitted_without_streaming_flag(self, tmp_path):
     async def test_tpot_non_streaming_output_skipped(self, tmp_path):
         """TPOT is not emitted for non-streaming (str) TextModelOutput."""
         loop = asyncio.get_event_loop()
-        pool = MockTokenizePool(delay=0.0)
+        tokenizer = MockBatchTokenizer(delay=0.0)
         with ManagedZMQContext.scoped(socket_dir=str(tmp_path)) as ctx:
             agg, registry, _ = make_aggregator(
-                ctx, loop, "agg_tpot_str_output", tokenize_pool=pool
+                ctx, loop, "agg_tpot_str_output", tokenizer=tokenizer
             )
             try:
                 await agg.process(
@@ -981,7 +981,7 @@ async def test_tpot_non_streaming_output_skipped(self, tmp_path):
                         ),
                     ]
                 )
-                await agg._table.drain_tasks()
+                await agg._flush_tokens()
                 assert snapshot_series_total(registry, MetricSeriesKey.OSL.value) == 3
                 assert (
                     snapshot_series_count(registry, MetricSeriesKey.TPOT_NS.value) == 0
@@ -990,13 +990,13 @@ async def test_tpot_non_streaming_output_skipped(self, tmp_path):
                 agg.close()
 
     @pytest.mark.asyncio
-    async def test_drain_tasks_awaits_in_flight(self, tmp_path):
-        """drain_tasks() properly awaits all in-flight async trigger tasks."""
+    async def test_flush_records_buffered_tokenizations(self, tmp_path):
+        """fire() buffers tokenization; flush() tokenizes the batch and records."""
         loop = asyncio.get_event_loop()
-        pool = MockTokenizePool(delay=0.05)
+        tokenizer = MockBatchTokenizer()
         with ManagedZMQContext.scoped(socket_dir=str(tmp_path)) as ctx:
             agg, registry, _ = make_aggregator(
-                ctx, loop, "agg_drain_in_flight", tokenize_pool=pool
+                ctx, loop, "agg_flush_records", tokenizer=tokenizer
             )
             try:
                 await agg.process(
@@ -1012,23 +1012,24 @@ async def test_drain_tasks_awaits_in_flight(self, tmp_path):
                         ),
                     ]
                 )
-                # Tasks are in-flight but not yet complete
-                assert agg._table.in_flight_tasks_count > 0
+                assert agg._token_queue is not None
+                # Enqueued by fire(), not yet tokenized (no tick/drain flush).
+                assert agg._token_queue.pending > 0
 
-                await agg._table.drain_tasks()
-                assert agg._table.in_flight_tasks_count == 0
+                await agg._flush_tokens()
+                assert agg._token_queue.pending == 0
                 assert snapshot_series_total(registry, MetricSeriesKey.ISL.value) == 5
             finally:
                 agg.close()
 
     @pytest.mark.asyncio
-    async def test_shutdown_drains_async_tasks(self, tmp_path):
-        """ENDED drains in-flight async tasks before finalizing."""
+    async def test_shutdown_flushes_buffered_tokenizations(self, tmp_path):
+        """ENDED flushes buffered tokenizations before finalizing."""
         loop = asyncio.get_event_loop()
-        pool = MockTokenizePool(delay=0.02)
+        tokenizer = MockBatchTokenizer(delay=0.02)
         with ManagedZMQContext.scoped(socket_dir=str(tmp_path)) as ctx:
             agg, registry, publisher = make_aggregator(
-                ctx, loop, "agg_shutdown_drain", tokenize_pool=pool
+                ctx, loop, "agg_shutdown_drain", tokenizer=tokenizer
             )
             try:
                 await agg.process(
@@ -1045,16 +1046,16 @@ async def test_shutdown_drains_async_tasks(self, tmp_path):
                         session_event(SessionEventType.ENDED, ts=2000),
                     ]
                 )
-                # After ENDED, drain_tasks ran inside process() — ISL emitted.
+                # After ENDED, flush_remaining ran inside process() — ISL emitted.
                 assert snapshot_series_total(registry, MetricSeriesKey.ISL.value) == 3
                 publisher.publish_final.assert_awaited_once()
             finally:
                 agg.close()
 
     # NOTE(agents): Trigger exception handling (logger.exception paths) is not
-    # exercised here. Adding a MockTokenizePool that raises on
-    # token_count_async would let us assert no metric is emitted, the
-    # aggregator does not crash, and the task set is cleaned up.
+    # exercised here. A MockBatchTokenizer whose count_texts_async raises would
+    # let us assert the flush surfaces the error without crashing the
+    # aggregator and that the buffer is cleared.
 
     @pytest.mark.asyncio
     async def test_drain_timeout_reports_pending_count(self, tmp_path):
@@ -1068,29 +1069,21 @@ async def test_drain_timeout_reports_pending_count(self, tmp_path):
         """
         loop = asyncio.get_event_loop()
 
-        class BlockingTokenizePool:
-            async def token_count_async(self, text, _loop):
+        class BlockingBatchTokenizer:
+            async def count_texts_async(self, texts, _loop):
                 await asyncio.sleep(10.0)  # exceeds drain timeout
-                return 0
+                return [0] * len(texts)
 
-            def token_count(self, text):
+            async def token_count_message_async(self, *args):
+                await asyncio.sleep(10.0)
                 return 0
 
-            def close(self):
-                pass
-
-            def __enter__(self):
-                return self
-
-            def __exit__(self, *args):
-                self.close()
-
         with ManagedZMQContext.scoped(socket_dir=str(tmp_path)) as ctx:
             agg, _, publisher = make_aggregator(
                 ctx,
                 loop,
                 "agg_drain_timeout",
-                tokenize_pool=BlockingTokenizePool(),
+                tokenizer=BlockingBatchTokenizer(),
             )
             agg._drain_timeout_s = 0.05
             try:
@@ -1107,9 +1100,10 @@ def __exit__(self, *args):
                         ),
                     ]
                 )
+                assert agg._token_queue is not None
                 assert (
-                    agg._table.in_flight_tasks_count > 0
-                ), "precondition: ISL task must be in-flight before ENDED"
+                    agg._token_queue.pending > 0
+                ), "precondition: ISL must be buffered before ENDED"
                 await agg.process([session_event(SessionEventType.ENDED, ts=2000)])
 
                 publisher.publish_final.assert_awaited_once()
@@ -1125,10 +1119,10 @@ def __exit__(self, *args):
     async def test_tpot_osl_for_tool_call_complete(self, tmp_path):
         """OSL and TPOT use message-path tokenization when COMPLETE carries tool_calls."""
         loop = asyncio.get_event_loop()
-        pool = MockTokenizePool(delay=0.0)
+        tokenizer = MockBatchTokenizer(delay=0.0)
         with ManagedZMQContext.scoped(socket_dir=str(tmp_path)) as ctx:
             agg, registry, _ = make_aggregator(
-                ctx, loop, "agg_tpot_osl_tool_call", tokenize_pool=pool
+                ctx, loop, "agg_tpot_osl_tool_call", tokenizer=tokenizer
             )
             try:
                 tool_call = {
@@ -1151,7 +1145,7 @@ async def test_tpot_osl_for_tool_call_complete(self, tmp_path):
                         ),
                     ]
                 )
-                await agg._table.drain_tasks()
+                await agg._flush_tokens()
                 # OSL = token_count("ok" + tool_calls_json) = 2
                 assert snapshot_series_total(registry, MetricSeriesKey.OSL.value) == 2
                 # tpot = (5000 - 2000) / token_count(tool_calls_json) = 3000 / 1 = 3000
diff --git a/tests/unit/async_utils/services/metrics_aggregator/test_main_signal_handler.py b/tests/unit/async_utils/services/metrics_aggregator/test_main_signal_handler.py
index 550a4863c..32f159403 100644
--- a/tests/unit/async_utils/services/metrics_aggregator/test_main_signal_handler.py
+++ b/tests/unit/async_utils/services/metrics_aggregator/test_main_signal_handler.py
@@ -50,7 +50,8 @@ async def test_sigterm_handler_holds_strong_reference_to_finalize_task():
     registry = MagicMock()
     table = MagicMock()
     table.total_tracked_duration_ns = 0
-    table.in_flight_tasks_count = 0
+    token_queue = MagicMock()
+    token_queue.pending = 0
 
     # publish_final blocks on an event so we can observe the task
     # mid-execution and exercise the strong-ref contract.
@@ -69,6 +70,7 @@ async def _slow_publish(*args, **kwargs):
         registry=registry,
         publisher=publisher,
         table=table,
+        token_queue=token_queue,
         shutdown_event=shutdown_event,
     )
 
@@ -122,7 +124,8 @@ async def test_sigterm_handler_refreshes_tracked_duration():
     registry = MagicMock()
     table = MagicMock()
     table.total_tracked_duration_ns = 12345
-    table.in_flight_tasks_count = 3
+    token_queue = MagicMock()
+    token_queue.pending = 3
 
     publisher = MagicMock()
     publisher.publish_final = AsyncMock()
@@ -134,6 +137,7 @@ async def test_sigterm_handler_refreshes_tracked_duration():
         registry=registry,
         publisher=publisher,
         table=table,
+        token_queue=token_queue,
         shutdown_event=shutdown_event,
     )
     on_sigterm()
diff --git a/tests/unit/async_utils/services/metrics_aggregator/test_metrics_table.py b/tests/unit/async_utils/services/metrics_aggregator/test_metrics_table.py
index 077923ff8..4ed957a98 100644
--- a/tests/unit/async_utils/services/metrics_aggregator/test_metrics_table.py
+++ b/tests/unit/async_utils/services/metrics_aggregator/test_metrics_table.py
@@ -34,6 +34,9 @@
 from inference_endpoint.async_utils.services.metrics_aggregator.registry import (
     MetricsRegistry,
 )
+from inference_endpoint.async_utils.services.metrics_aggregator.token_metrics import (
+    TokenBatchQueue,
+)
 from inference_endpoint.core.record import (
     EventRecord,
     SampleEventType,
@@ -294,13 +297,13 @@ async def test_osl_with_tool_calls_uses_message_path(self):
         )
         from inference_endpoint.core.types import TextModelOutput
 
-        from .conftest import MockTokenizePool, snapshot_series_count
+        from .conftest import MockBatchTokenizer, snapshot_series_count
 
         registry = MetricsRegistry()
         registry.register_series("osl", hdr_low=1, hdr_high=100_000)
         loop = asyncio.get_running_loop()
-        pool = MockTokenizePool(delay=0)
-        trigger = OslTrigger(registry, pool, loop)
+        queue = TokenBatchQueue(MockBatchTokenizer(), loop)
+        trigger = OslTrigger(registry, queue)
 
         tool_calls = (
             {
@@ -317,9 +320,8 @@ async def test_osl_with_tool_calls_uses_message_path(self):
             data=tmo,
         )
         row = SampleRow(sample_uuid="s1")
-        task = trigger.fire(ev, row, {})
-        assert task is not None
-        await task
+        trigger.fire(ev, row, {})
+        await queue.flush()
 
         assert snapshot_series_count(registry, "osl") == 1
 
@@ -331,13 +333,13 @@ async def test_osl_without_tool_calls_uses_text_path(self):
         )
         from inference_endpoint.core.types import TextModelOutput
 
-        from .conftest import MockTokenizePool, snapshot_series_count
+        from .conftest import MockBatchTokenizer, snapshot_series_count
 
         registry = MetricsRegistry()
         registry.register_series("osl", hdr_low=1, hdr_high=100_000)
         loop = asyncio.get_running_loop()
-        pool = MockTokenizePool(delay=0)
-        trigger = OslTrigger(registry, pool, loop)
+        queue = TokenBatchQueue(MockBatchTokenizer(), loop)
+        trigger = OslTrigger(registry, queue)
 
         tmo = TextModelOutput(output="hello world")
         ev = EventRecord(
@@ -347,9 +349,8 @@ async def test_osl_without_tool_calls_uses_text_path(self):
             data=tmo,
         )
         row = SampleRow(sample_uuid="s1")
-        task = trigger.fire(ev, row, {})
-        assert task is not None
-        await task
+        trigger.fire(ev, row, {})
+        await queue.flush()
 
         assert snapshot_series_count(registry, "osl") == 1
 
@@ -368,15 +369,15 @@ async def test_tpot_tool_calls_only_response(self):
         )
         from inference_endpoint.core.types import TextModelOutput
 
-        from .conftest import MockTokenizePool, snapshot_series_count
+        from .conftest import MockBatchTokenizer, snapshot_series_count
 
         registry = MetricsRegistry()
         registry.register_series(
             "tpot_ns", hdr_low=1, hdr_high=100_000_000_000, dtype=float
         )
         loop = asyncio.get_running_loop()
-        pool = MockTokenizePool(delay=0)
-        trigger = TpotTrigger(registry, pool, loop)
+        queue = TokenBatchQueue(MockBatchTokenizer(), loop)
+        trigger = TpotTrigger(registry, queue)
 
         tool_calls = (
             {
@@ -395,9 +396,8 @@ async def test_tpot_tool_calls_only_response(self):
         row = SampleRow(sample_uuid="s1")
         # RECV_FIRST_NS was set at t=1000
         pre_change = {SampleField.RECV_FIRST_NS: 1000}
-        task = trigger.fire(ev, row, pre_change)
-        assert task is not None
-        await task
+        trigger.fire(ev, row, pre_change)
+        await queue.flush()
 
         assert snapshot_series_count(registry, "tpot_ns") == 1
 
@@ -409,15 +409,15 @@ async def test_tpot_uses_tool_call_deltas_after_first_chunk(self):
         )
         from inference_endpoint.core.types import TextModelOutput
 
-        from .conftest import MockTokenizePool, snapshot_series_total
+        from .conftest import MockBatchTokenizer, snapshot_series_total
 
         registry = MetricsRegistry()
         registry.register_series(
             "tpot_ns", hdr_low=1, hdr_high=100_000_000_000, dtype=float
         )
         loop = asyncio.get_running_loop()
-        pool = MockTokenizePool(delay=0)
-        trigger = TpotTrigger(registry, pool, loop)
+        queue = TokenBatchQueue(MockBatchTokenizer(), loop)
+        trigger = TpotTrigger(registry, queue)
 
         tool_call_chunks = (
             (
@@ -442,8 +442,7 @@ async def test_tpot_uses_tool_call_deltas_after_first_chunk(self):
         )
         row = SampleRow(sample_uuid="s1")
         pre_change = {SampleField.RECV_FIRST_NS: 1000}
-        task = trigger.fire(ev, row, pre_change)
-        assert task is not None
-        await task
+        trigger.fire(ev, row, pre_change)
+        await queue.flush()
 
         assert snapshot_series_total(registry, "tpot_ns") == pytest.approx(2000.0)
diff --git a/tests/unit/async_utils/services/metrics_aggregator/test_token_metrics.py b/tests/unit/async_utils/services/metrics_aggregator/test_token_metrics.py
index e25bf0022..b59e56501 100644
--- a/tests/unit/async_utils/services/metrics_aggregator/test_token_metrics.py
+++ b/tests/unit/async_utils/services/metrics_aggregator/test_token_metrics.py
@@ -13,7 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-"""Tests for TokenizePool thread-safety and correctness."""
+"""Tests for BatchTokenizer and TokenBatchQueue."""
 
 import asyncio
 import time
@@ -22,18 +22,22 @@
 
 import pytest
 from inference_endpoint.async_utils.services.metrics_aggregator.token_metrics import (
-    TokenizePool,
+    BatchTokenizer,
+    TokenBatchQueue,
 )
 
 _MOCK_TARGET = "inference_endpoint.async_utils.services.metrics_aggregator.token_metrics.AutoTokenizer"
 
 
 class _FakeTokenizer:
-    """Deterministic tokenizer that splits on whitespace."""
+    """Deterministic tokenizer that splits on whitespace.
 
-    def __init__(self, load_delay: float = 0.1):
-        # Simulate the blocking cost of from_pretrained so that
-        # pre-initialization in __init__ saturates all worker threads.
+    Has no ``backend_tokenizer``, so BatchTokenizer keeps the batch path
+    in-process (no subprocess shards) and ``count_texts`` falls back to
+    ``tokenize`` per text — which is what these tests assert against.
+    """
+
+    def __init__(self, load_delay: float = 0.0):
         time.sleep(load_delay)
 
     def tokenize(self, text: str) -> list[str]:
@@ -46,64 +50,58 @@ def from_pretrained(cls, name: str, **kwargs: object) -> "_FakeTokenizer":
 
 
 @pytest.mark.unit
-class TestTokenizePool:
+class TestBatchTokenizer:
     def test_token_count_returns_int(self):
         with patch(_MOCK_TARGET, _FakeTokenizer):
-            with TokenizePool("fake", n_workers=1) as pool:
-                count = pool.token_count("Hello world")
-                assert count == 2
+            with BatchTokenizer("fake") as tok:
+                assert tok.token_count("Hello world") == 2
 
-    def test_multiple_workers(self):
+    def test_count_texts_batch(self):
         with patch(_MOCK_TARGET, _FakeTokenizer):
-            with TokenizePool("fake", n_workers=4) as pool:
-                results = []
-                for i in range(10):
-                    results.append(pool.token_count(f"Sentence number {i}"))
-                assert all(isinstance(r, int) and r > 0 for r in results)
+            with BatchTokenizer("fake") as tok:
+                assert tok.count_texts(["a b", "c d e", "x"]) == [2, 3, 1]
 
-    def test_concurrent_calls_thread_safe(self):
+    def test_count_texts_empty(self):
         with patch(_MOCK_TARGET, _FakeTokenizer):
-            with TokenizePool("fake", n_workers=2) as pool:
-                texts = [f"word{i} word{i+1}" for i in range(20)]
+            with BatchTokenizer("fake") as tok:
+                assert tok.count_texts([]) == []
 
+    def test_concurrent_token_count_thread_safe(self):
+        with patch(_MOCK_TARGET, _FakeTokenizer):
+            with BatchTokenizer("fake") as tok:
+                texts = [f"word{i} word{i + 1}" for i in range(20)]
                 with ThreadPoolExecutor(max_workers=8) as executor:
-                    futures = [executor.submit(pool.token_count, t) for t in texts]
+                    futures = [executor.submit(tok.token_count, t) for t in texts]
                     results = [f.result() for f in futures]
-
-                assert len(results) == 20
-                assert all(r == 2 for r in results)
+                assert results == [2] * 20
 
     def test_close_is_idempotent(self):
         with patch(_MOCK_TARGET, _FakeTokenizer):
-            pool = TokenizePool("fake", n_workers=1)
-            pool.close()
-            pool.close()  # Should not raise
+            tok = BatchTokenizer("fake")
+            tok.close()
+            tok.close()  # must not raise
 
     def test_use_after_close_raises(self):
         with patch(_MOCK_TARGET, _FakeTokenizer):
-            pool = TokenizePool("fake", n_workers=1)
-            pool.close()
+            tok = BatchTokenizer("fake")
+            tok.close()
             with pytest.raises(RuntimeError, match="closed"):
-                pool.token_count("hello")
-
-    def test_n_workers_zero_raises(self):
-        with pytest.raises(ValueError, match="n_workers"):
-            TokenizePool("fake", n_workers=0)
+                tok.token_count("hello")
 
     @pytest.mark.asyncio
-    async def test_token_count_async(self):
+    async def test_count_texts_async(self):
         with patch(_MOCK_TARGET, _FakeTokenizer):
             loop = asyncio.get_running_loop()
-            with TokenizePool("fake", n_workers=1) as pool:
-                count = await pool.token_count_async("Hello world foo", loop)
-                assert count == 3
+            with BatchTokenizer("fake") as tok:
+                counts = await tok.count_texts_async(["Hello world foo", "a"], loop)
+                assert counts == [3, 1]
 
     def test_context_manager(self):
         with patch(_MOCK_TARGET, _FakeTokenizer):
-            with TokenizePool("fake", n_workers=1) as pool:
-                assert pool.token_count("a b c") == 3
+            with BatchTokenizer("fake") as tok:
+                assert tok.token_count("a b c") == 3
             with pytest.raises(RuntimeError, match="closed"):
-                pool.token_count("test")
+                tok.token_count("test")
 
 
 class _FakeTokenizerWithTemplate(_FakeTokenizer):
@@ -131,19 +129,18 @@ def apply_chat_template(
 
 
 @pytest.mark.unit
-class TestTokenizePoolMessageTokenization:
+class TestBatchTokenizerMessageTokenization:
     def test_token_count_message_subtracts_baseline(self):
         """token_count_message returns full_tokens - baseline."""
         with patch(_MOCK_TARGET, _FakeTokenizerWithTemplate):
-            with TokenizePool("fake", n_workers=1) as pool:
-                # "hello world" -> 2 content words + 2 wrapper = 4; baseline = 0 + 2 = 2; net = 2
-                count = pool.token_count_message("hello world", None, None)
-                assert count == 2
+            with BatchTokenizer("fake") as tok:
+                # "hello world": 4 tokens (2 content + 2 wrapper) - 2 baseline = 2
+                assert tok.token_count_message("hello world", None, None) == 2
 
     def test_token_count_message_includes_tool_calls(self):
         """token_count_message includes tool-call JSON tokens."""
         with patch(_MOCK_TARGET, _FakeTokenizerWithTemplate):
-            with TokenizePool("fake", n_workers=1) as pool:
+            with BatchTokenizer("fake") as tok:
                 tool_calls = (
                     {
                         "id": "c1",
@@ -151,9 +148,9 @@ def test_token_count_message_includes_tool_calls(self):
                         "function": {"name": "f", "arguments": "{}"},
                     },
                 )
-                count_without = pool.token_count_message("hello", None, None)
-                count_with = pool.token_count_message("hello", None, tool_calls)
-                assert count_with > count_without
+                without = tok.token_count_message("hello", None, None)
+                with_calls = tok.token_count_message("hello", None, tool_calls)
+                assert with_calls > without
 
     def test_token_count_message_fallback_on_exception(self):
         """Falls back to whitespace split when apply_chat_template raises."""
@@ -163,7 +160,7 @@ def apply_chat_template(self, *args, **kwargs):
                 raise ValueError("template does not support tool_calls")
 
         with patch(_MOCK_TARGET, _BadTemplateTokenizer):
-            with TokenizePool("fake", n_workers=1) as pool:
+            with BatchTokenizer("fake") as tok:
                 tool_calls = (
                     {
                         "id": "c1",
@@ -171,17 +168,82 @@ def apply_chat_template(self, *args, **kwargs):
                         "function": {"name": "f", "arguments": "{}"},
                     },
                 )
-                # Should not raise; falls back to whitespace tokenizer
-                count = pool.token_count_message("hello world", None, tool_calls)
-                assert count > 0
+                # Must not raise; falls back to whitespace tokenizer.
+                assert tok.token_count_message("hello world", None, tool_calls) > 0
 
     @pytest.mark.asyncio
     async def test_token_count_message_async(self):
-        """token_count_message_async returns count without blocking event loop."""
         with patch(_MOCK_TARGET, _FakeTokenizerWithTemplate):
             loop = asyncio.get_running_loop()
-            with TokenizePool("fake", n_workers=1) as pool:
-                count = await pool.token_count_message_async(
+            with BatchTokenizer("fake") as tok:
+                count = await tok.token_count_message_async(
                     "hello world", None, None, loop
                 )
                 assert count == 2
+
+
+class _CapturingTokenizer:
+    """Minimal tokenizer stub for queue tests: whitespace counts, no procs."""
+
+    async def count_texts_async(self, texts, _loop):
+        return [len(t.split()) for t in texts]
+
+    async def token_count_message_async(self, content, reasoning, tool_calls, _loop):
+        parts = [p for p in (content, reasoning) if p]
+        return len(" ".join(parts).split()) + (len(tool_calls) if tool_calls else 0)
+
+
+@pytest.mark.unit
+@pytest.mark.asyncio
+class TestTokenBatchQueue:
+    async def test_flush_records_text_via_callback(self):
+        loop = asyncio.get_running_loop()
+        queue = TokenBatchQueue(_CapturingTokenizer(), loop)
+        recorded: list[int] = []
+        queue.enqueue_text("a b c", recorded.append)
+        queue.enqueue_text("d e", recorded.append)
+        assert queue.pending == 2
+        await queue.flush()
+        assert sorted(recorded) == [2, 3]
+        assert queue.pending == 0
+
+    async def test_flush_records_message_via_callback(self):
+        loop = asyncio.get_running_loop()
+        queue = TokenBatchQueue(_CapturingTokenizer(), loop)
+        recorded: list[int] = []
+        queue.enqueue_message(("hello world", None, None), recorded.append)
+        await queue.flush()
+        assert recorded == [2]
+
+    async def test_flush_empty_is_noop(self):
+        loop = asyncio.get_running_loop()
+        queue = TokenBatchQueue(_CapturingTokenizer(), loop)
+        await queue.flush()
+        assert queue.pending == 0
+
+    async def test_flush_remaining_clean_returns_zero(self):
+        loop = asyncio.get_running_loop()
+        queue = TokenBatchQueue(_CapturingTokenizer(), loop)
+        recorded: list[int] = []
+        queue.enqueue_text("a b", recorded.append)
+        assert await queue.flush_remaining(timeout=5.0) == 0
+        assert recorded == [2]
+
+    async def test_flush_remaining_timeout_reports_pending(self):
+        """A tokenizer slower than the budget leaves items pending."""
+
+        class _BlockingTokenizer:
+            async def count_texts_async(self, texts, _loop):
+                await asyncio.sleep(10.0)
+                return [0] * len(texts)
+
+            async def token_count_message_async(self, *args):
+                return 0
+
+        loop = asyncio.get_running_loop()
+        queue = TokenBatchQueue(_BlockingTokenizer(), loop)
+        recorded: list[int] = []
+        queue.enqueue_text("never counted", recorded.append)
+        n_pending = await queue.flush_remaining(timeout=0.05)
+        assert n_pending == 1
+        assert recorded == []
diff --git a/tests/unit/commands/test_benchmark.py b/tests/unit/commands/test_benchmark.py
index 1c90554fb..969f22ce2 100644
--- a/tests/unit/commands/test_benchmark.py
+++ b/tests/unit/commands/test_benchmark.py
@@ -490,7 +490,6 @@ def test_defaults(self):
         assert cfg.performance_timeout_s == 240.0
         assert cfg.accuracy_timeout_s is None
         assert cfg.metrics_drain_timeout_s == 60.0
-        assert cfg.metrics_tokenizer_workers == 2
 
     @pytest.mark.unit
     @pytest.mark.parametrize(
@@ -512,11 +511,6 @@ def test_metrics_drain_timeout_negative_rejected(self):
         with pytest.raises(ValidationError):
             DrainConfig(metrics_drain_timeout_s=-1.0)
 
-    @pytest.mark.unit
-    def test_metrics_tokenizer_workers_must_be_at_least_one(self):
-        with pytest.raises(ValidationError):
-            DrainConfig(metrics_tokenizer_workers=0)
-
     @pytest.mark.unit
     def test_extra_fields_rejected(self):
         with pytest.raises(ValidationError):
@@ -538,7 +532,6 @@ def test_yaml_roundtrip(self, tmp_path):
     performance_timeout_s: 30.0
     accuracy_timeout_s: null
     metrics_drain_timeout_s: 300.0
-    metrics_tokenizer_workers: 8
 """
         config_file = tmp_path / "drain.yaml"
         config_file.write_text(yaml_content)
@@ -548,7 +541,6 @@ def test_yaml_roundtrip(self, tmp_path):
         assert drain.performance_timeout_s == 30.0
         assert drain.accuracy_timeout_s is None
         assert drain.metrics_drain_timeout_s == 300.0
-        assert drain.metrics_tokenizer_workers == 8
 
 
 class TestAggregatorArgs:
@@ -639,60 +631,6 @@ async def _capture_launch(service_configs, *, timeout):
         idx = args.index("--drain-timeout")
         assert args[idx + 1] == expected_flag
 
-    @pytest.mark.unit
-    @pytest.mark.asyncio
-    @pytest.mark.parametrize("workers, expected_flag", [(4, "4"), (8, "8"), (2, "2")])
-    async def test_tokenizer_workers_forwarded_to_aggregator_args(
-        self, tmp_path, workers, expected_flag
-    ):
-        config = OfflineConfig(
-            **_OFFLINE_KWARGS,
-            settings=OfflineSettings(
-                drain=DrainConfig(metrics_tokenizer_workers=workers)
-            ),
-        )
-        ctx = self._make_ctx(config, tmp_path)
-
-        captured: list = []
-
-        async def _capture_launch(service_configs, *, timeout):
-            captured.extend(service_configs)
-            raise KeyboardInterrupt("stop after launch")
-
-        mock_zmq = MagicMock()
-        mock_zmq.socket_dir = str(tmp_path / "sockets")
-
-        with (
-            patch(
-                "inference_endpoint.commands.benchmark.execute.ManagedZMQContext"
-            ) as MockZMQ,
-            patch(
-                "inference_endpoint.commands.benchmark.execute.EventPublisherService"
-            ) as MockPub,
-            patch(
-                "inference_endpoint.commands.benchmark.execute.MetricsSnapshotSubscriber"
-            ) as MockSub,
-            patch(
-                "inference_endpoint.commands.benchmark.execute.ServiceLauncher"
-            ) as MockLauncher,
-            patch("inference_endpoint.commands.benchmark.execute.tqdm"),
-        ):
-            MockZMQ.scoped.return_value.__enter__ = MagicMock(return_value=mock_zmq)
-            MockZMQ.scoped.return_value.__exit__ = MagicMock(return_value=False)
-            MockPub.return_value.socket_name = "test_pub"
-            MockSub.return_value.start = MagicMock()
-            MockLauncher.return_value.launch = _capture_launch
-
-            loop = asyncio.get_event_loop()
-            with pytest.raises(KeyboardInterrupt):
-                await _run_benchmark_async(ctx, loop)
-
-        aggregator_cfg = next(c for c in captured if "metrics_aggregator" in c.module)
-        args = aggregator_cfg.args
-        assert "--tokenizer-workers" in args
-        idx = args.index("--tokenizer-workers")
-        assert args[idx + 1] == expected_flag
-
 
 class TestBuildPhases:
     """Tests for _build_phases() in execute.py."""

From b40f72f0fba65f01f1e5719202c18afd47bb5eb9 Mon Sep 17 00:00:00 2001
From: Viraat Chandra <viraatc@nvidia.com>
Date: Tue, 9 Jun 2026 17:12:33 -0700
Subject: [PATCH 02/20] fix(metrics): never skip finalize on tokenizer drain
 failure; cleanup
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The ENDED drain sat outside the finalization try/finally and
flush_remaining caught only TimeoutError: any other tokenizer failure
(e.g. BrokenProcessPool from a dead shard) escaped the fire-and-forget
process() task, skipped publish_final, and hung the subprocess with no
final_snapshot.json. The drain now runs inside the finalization
boundary and flush_remaining swallows non-timeout failures, logs them,
and returns the un-tokenized count — surfacing as an incomplete drain
(n_pending_tasks > 0) instead of a hang.

Cleanup (review feedback):
- delete the test-only sync API (count_texts / token_count /
  token_count_message); production uses only the async paths, and
  count_texts_async now raises RuntimeError after close()
- rename AsyncTokenTrigger -> TokenTrigger (fire() is sync; it enqueues)
- extract _encode_batch_lengths shared by the worker and in-process paths
- pending_tokens property collapses the triple None-guard; the SIGTERM
  handler takes a pending_tokens callback instead of reaching into
  aggregator._token_queue
- drop vestigial return None and quoted forward-ref; trim stale
  "async tasks" wording in docs and the drain-timeout help text
  (templates regenerated); document the wait=False shard shutdown

Tests: sharded-path reassembly + BrokenProcessPool propagation,
_even_chunks, and queue/aggregator drain-failure regression tests.
145 aggregator unit + 160 config/commands/integration tests pass;
pre-commit clean.

Validated on GB200 (ptyche, 144-core Grace, 18 shards, real DeepSeek-R1
tokenizer at mean OSL 3877): ~38x faster than the per-event pool; the
1M-output drain took 84s vs ~53min.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../services/metrics_aggregator/__main__.py   |   8 +-
 .../services/metrics_aggregator/aggregator.py |  83 ++++-----
 .../metrics_aggregator/metrics_table.py       |  29 ++--
 .../metrics_aggregator/token_metrics.py       | 140 ++++++---------
 src/inference_endpoint/config/schema.py       |  11 +-
 .../templates/concurrency_template_full.yaml  |   2 +-
 .../templates/offline_template_full.yaml      |   2 +-
 .../templates/online_template_full.yaml       |   2 +-
 .../services/metrics_aggregator/conftest.py   |   3 -
 .../metrics_aggregator/test_aggregator.py     |  48 +++++-
 .../test_main_signal_handler.py               |  10 +-
 .../metrics_aggregator/test_token_metrics.py  | 163 ++++++++++++------
 12 files changed, 285 insertions(+), 216 deletions(-)

diff --git a/src/inference_endpoint/async_utils/services/metrics_aggregator/__main__.py b/src/inference_endpoint/async_utils/services/metrics_aggregator/__main__.py
index 9cd1c7e5e..7d2101c11 100644
--- a/src/inference_endpoint/async_utils/services/metrics_aggregator/__main__.py
+++ b/src/inference_endpoint/async_utils/services/metrics_aggregator/__main__.py
@@ -33,7 +33,7 @@
 from .publisher import MetricsPublisher
 from .registry import MetricsRegistry
 from .snapshot import MetricsSnapshotCodec
-from .token_metrics import BatchTokenizer, TokenBatchQueue
+from .token_metrics import BatchTokenizer
 
 logger = logging.getLogger(__name__)
 
@@ -44,7 +44,7 @@ def _make_sigterm_handler(
     registry: MetricsRegistry,
     publisher: MetricsPublisher,
     table: MetricsTable,
-    token_queue: TokenBatchQueue | None,
+    pending_tokens: Callable[[], int],
     shutdown_event: asyncio.Event,
 ) -> tuple[Callable[[], None], set[asyncio.Task]]:
     """Build the SIGTERM handler that writes the INTERRUPTED final snapshot.
@@ -76,7 +76,7 @@ async def _signal_finalize() -> None:
             )
             await publisher.publish_final(
                 registry,
-                n_pending_tasks=token_queue.pending if token_queue is not None else 0,
+                n_pending_tasks=pending_tokens(),
                 interrupted=True,
             )
         except Exception:  # noqa: BLE001 — best-effort.
@@ -263,7 +263,7 @@ async def main() -> None:
                 registry=registry,
                 publisher=publisher,
                 table=aggregator._table,
-                token_queue=aggregator._token_queue,
+                pending_tokens=lambda: aggregator.pending_tokens,
                 shutdown_event=shutdown_event,
             )
             loop.add_signal_handler(signal.SIGTERM, on_sigterm)
diff --git a/src/inference_endpoint/async_utils/services/metrics_aggregator/aggregator.py b/src/inference_endpoint/async_utils/services/metrics_aggregator/aggregator.py
index ed5ace0a0..e87448036 100644
--- a/src/inference_endpoint/async_utils/services/metrics_aggregator/aggregator.py
+++ b/src/inference_endpoint/async_utils/services/metrics_aggregator/aggregator.py
@@ -241,6 +241,11 @@ def _register_triggers(self, streaming: bool) -> None:
             table.add_trigger(SampleField.LAST_RECV_NS, ChunkDeltaTrigger(registry))
             table.add_trigger(SampleField.COMPLETE_NS, TpotTrigger(registry, queue))
 
+    @property
+    def pending_tokens(self) -> int:
+        """Enqueued tokenizations not yet recorded (the snapshot n_pending_tasks)."""
+        return self._token_queue.pending if self._token_queue is not None else 0
+
     async def _flush_tokens(self) -> None:
         """Flush buffered tokenizations so the next snapshot reflects them."""
         if self._token_queue is not None:
@@ -318,9 +323,7 @@ async def process(self, records: list[EventRecord]) -> None:
                                 self._publish_interval_s,
                                 get_runtime_state=lambda: (
                                     self._session_state,
-                                    self._token_queue.pending
-                                    if self._token_queue is not None
-                                    else 0,
+                                    self.pending_tokens,
                                 ),
                                 pre_publish=self._flush_tokens,
                             )
@@ -377,47 +380,47 @@ async def process(self, records: list[EventRecord]) -> None:
             # ENDED has been observed; transition to DRAINING so any tick
             # that fires before publish_final reflects the new state.
             self._session_state = SessionState.DRAINING
-            queue = self._token_queue
-            pending = queue.pending if queue is not None else 0
-            logger.info("Draining %d pending tokenizations...", pending)
-            # flush_remaining tokenizes the whole buffer in one batched pass,
-            # bounded by the drain budget; it returns the count it could not
-            # finish (non-zero only on a timeout), which becomes the snapshot's
-            # n_pending_tasks so Report can flag an incomplete drain.
-            n_pending = (
-                await queue.flush_remaining(self._drain_timeout_s)
-                if queue is not None
-                else 0
-            )
-            if n_pending > 0:
-                timeout_str = (
-                    f"{self._drain_timeout_s:.1f}s"
-                    if self._drain_timeout_s is not None
-                    else "unlimited"
+            logger.info("Draining %d pending tokenizations...", self.pending_tokens)
+            # The drain and final publish are wrapped together so the aggregator
+            # ALWAYS reaches _finalize (which sets the shutdown event); a
+            # tokenizer failure during the drain must not skip publish_final and
+            # leave main()'s `await shutdown_event.wait()` hanging.
+            n_pending = self.pending_tokens
+            try:
+                # flush_remaining tokenizes the whole buffer in one batched pass,
+                # bounded by the drain budget, and never raises: it returns the
+                # count it could not finish (timeout or failure), which becomes
+                # the snapshot's n_pending_tasks so Report flags an incomplete drain.
+                if self._token_queue is not None:
+                    n_pending = await self._token_queue.flush_remaining(
+                        self._drain_timeout_s
+                    )
+                if n_pending > 0:
+                    budget = (
+                        f"{self._drain_timeout_s:.1f}s"
+                        if self._drain_timeout_s is not None
+                        else "unlimited"
+                    )
+                    logger.warning(
+                        "tokenizer drain incomplete (budget %s); %d tokenizations "
+                        "did not complete",
+                        budget,
+                        n_pending,
+                    )
+                logger.info(
+                    "Tokenizations drained (n_pending_tasks=%d at finalize)", n_pending
                 )
-                logger.warning(
-                    "tokenizer drain timed out after %s; %d tokenizations "
-                    "did not complete",
-                    timeout_str,
-                    n_pending,
+                registry.set_counter(
+                    MetricCounterKey.TRACKED_DURATION_NS.value,
+                    table.total_tracked_duration_ns,
                 )
-            logger.info(
-                "Tokenizations drained (n_pending_tasks=%d at finalize)", n_pending
-            )
-            registry.set_counter(
-                MetricCounterKey.TRACKED_DURATION_NS.value,
-                table.total_tracked_duration_ns,
-            )
-            try:
                 await self._publisher.publish_final(registry, n_pending_tasks=n_pending)
             finally:
-                # Whatever happens above, the aggregator MUST close the
-                # publisher and signal shutdown — otherwise the main()
-                # entry point's `await shutdown_event.wait()` hangs
-                # forever and the subprocess never exits cleanly. Each
-                # cleanup step is independently wrapped: a failure in
-                # aclose must not prevent _finalize, since _finalize is
-                # what sets the shutdown event.
+                # The aggregator MUST close the publisher and signal shutdown even
+                # if the drain/publish above failed — otherwise main()'s
+                # `await shutdown_event.wait()` hangs forever. aclose is
+                # independently wrapped: its failure must not prevent _finalize,
+                # which is what sets the shutdown event.
                 try:
                     await self._publisher.aclose()
                 except Exception:  # noqa: BLE001 — best-effort cleanup.
diff --git a/src/inference_endpoint/async_utils/services/metrics_aggregator/metrics_table.py b/src/inference_endpoint/async_utils/services/metrics_aggregator/metrics_table.py
index f67c859e6..88d2693ee 100644
--- a/src/inference_endpoint/async_utils/services/metrics_aggregator/metrics_table.py
+++ b/src/inference_endpoint/async_utils/services/metrics_aggregator/metrics_table.py
@@ -179,10 +179,9 @@ def fire(self, ev_rec, row, pre_change):
         baseline = pre_change.get(self._delta_start_fieldname)
         if baseline is not None:
             self.registry.record(self.metric_name, ev_rec.timestamp_ns - baseline)
-        return None
 
 
-class AsyncTokenTrigger(EmitTrigger):
+class TokenTrigger(EmitTrigger):
     """Base for triggers whose metric needs tokenization.
 
     Subclasses implement ``_extract_text()`` to pull the text to tokenize from
@@ -298,12 +297,12 @@ def __init__(self, registry: MetricsRegistry):
 
 
 # ---------------------------------------------------------------------------
-# Token triggers (async)
+# Token triggers (batched)
 # ---------------------------------------------------------------------------
 
 
-class IslTrigger(AsyncTokenTrigger):
-    """ISL from PromptData: len(token_ids) sync, or token_count(text) async."""
+class IslTrigger(TokenTrigger):
+    """ISL from PromptData: ``len(token_ids)`` or the tokenized prompt text."""
 
     def __init__(
         self,
@@ -327,7 +326,7 @@ def _extract_text(self, ev_rec, row, pre_change):
         return None
 
 
-class OslTrigger(AsyncTokenTrigger):
+class OslTrigger(TokenTrigger):
     """OSL = token_count(full output text) from COMPLETE event data."""
 
     def __init__(
@@ -352,19 +351,13 @@ def _extract_message(self, ev_rec, row, pre_change):
         return None
 
 
-class TpotTrigger(AsyncTokenTrigger):
-    """TPOT = (complete_ns - recv_first_ns) / token_count(text_after_first_chunk).
-
-    Only registered when streaming mode is enabled.
+class TpotTrigger(TokenTrigger):
+    """TPOT = (complete_ns - recv_first_ns) / output token count.
 
-    # NOTE(agents): This trigger tokenizes text_after_first_chunk independently
-    # from OslTrigger, which tokenizes the full output. This means the output is
-    # tokenized twice at COMPLETE time for streaming samples. This is intentional:
-    # OSL is always required (non-streaming and streaming), while TPOT is
-    # streaming-only. Keeping them as separate triggers allows conditional
-    # registration via the streaming flag. If tokenization throughput becomes a
-    # bottleneck, consider merging OSL and TPOT into a single trigger that
-    # tokenizes once and derives both metrics.
+    Streaming-only. Tokenizes the post-first-chunk output independently of
+    ``OslTrigger`` (full output), so streaming samples are tokenized twice —
+    intentional: OSL is always required, TPOT is conditional on the streaming
+    flag.
     """
 
     def __init__(
diff --git a/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py b/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py
index 57c9704d4..d3a236d2b 100644
--- a/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py
+++ b/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py
@@ -15,14 +15,12 @@
 
 """Tokenization for ISL/OSL/TPOT metrics.
 
-``BatchTokenizer`` tokenizes whole batches of text at once. A single BPE rayon
-pool saturates ~8 CPU cores (memory-bound), so to use the whole machine it
-shards each batch across worker *processes*, one pinned to each block of
-``CORES_PER_WORKER`` cores (their rayon pools stay NUMA-local). The aggregator
-buffers per-sample text as COMPLETE events arrive and calls ``count_texts`` once
-per flush (publish tick + drain) — so batching, not a per-request coalescer,
-keeps tokenization ahead of completions. Falls back to a single in-process
-thread when there is no fast Rust backend or fewer than two core blocks fit.
+``BatchTokenizer`` tokenizes whole batches at once, sharded across worker
+processes each pinned to a block of ``CORES_PER_WORKER`` cores (a single BPE
+rayon pool is memory-bound and saturates ~8 cores). The aggregator buffers
+per-sample text and flushes the batch once per publish tick and at drain. Falls
+back to a single in-process thread when there is no fast Rust backend or fewer
+than two core blocks fit.
 """
 
 from __future__ import annotations
@@ -109,13 +107,18 @@ def _init_worker(tokenizer_name: str, core_set: list[int]) -> None:
         _WORKER_BACKEND.encode("warmup", add_special_tokens=False)
 
 
+def _encode_batch_lengths(backend: Any, texts: list[str]) -> list[int]:
+    """Per-text token counts via the raw tokenizers backend, one rayon call."""
+    encode_batch = getattr(backend, "encode_batch_fast", None) or backend.encode_batch
+    return [len(e.ids) for e in encode_batch(texts, add_special_tokens=False)]
+
+
 def _worker_encode_lengths(texts: list[str]) -> list[int]:
     """Per-text token counts for a shard, in one rayon-parallel call."""
     backend = _WORKER_BACKEND
     if backend is None:
         raise RuntimeError("tokenizer worker backend unavailable")
-    encode_batch = getattr(backend, "encode_batch_fast", None) or backend.encode_batch
-    return [len(e.ids) for e in encode_batch(texts, add_special_tokens=False)]
+    return _encode_batch_lengths(backend, texts)
 
 
 def _worker_ready(_: int) -> bool:
@@ -138,10 +141,9 @@ def _even_chunks(items: list[str], n: int) -> list[list[str]]:
 class BatchTokenizer:
     """Counts tokens for batches of text, sharded across pinned CPU cores.
 
-    ``count_texts`` / ``count_texts_async`` tokenize a whole list in one shot.
-    The sync ``token_count`` and chat-template ``token_count_message`` paths run
-    on a small in-process thread pool — they are rare (single ISL probes, tool
-    calls) relative to the batched OSL/ISL/TPOT flush.
+    ``count_texts_async`` tokenizes a whole list in one sharded call. The
+    chat-template ``token_count_message_async`` path runs on a small in-process
+    thread — rare (tool calls) relative to the batched OSL/ISL/TPOT flush.
     """
 
     def __init__(
@@ -254,48 +256,31 @@ def _encode_lengths_inproc(self, texts: list[str]) -> list[int]:
         tok = self._tokenizer
         backend = getattr(tok, "backend_tokenizer", None)
         if backend is not None:
-            encode_batch = getattr(backend, "encode_batch_fast", None)
-            if encode_batch is None:
-                encode_batch = backend.encode_batch
-            return [len(e.ids) for e in encode_batch(texts, add_special_tokens=False)]
+            return _encode_batch_lengths(backend, texts)
         return [len(tok.tokenize(t)) for t in texts]  # type: ignore[union-attr]
 
-    def count_texts(self, texts: list[str]) -> list[int]:
-        """Per-text token counts for a whole batch (blocking)."""
-        if not texts:
-            return []
-        if not self._procs:
-            return self._encode_lengths_inproc(texts)
-        chunks = _even_chunks(texts, len(self._procs))
-        futures = [
-            self._procs[i].submit(_worker_encode_lengths, chunk)
-            for i, chunk in enumerate(chunks)
-        ]
-        out: list[int] = []
-        for f in futures:
-            out.extend(f.result())
-        return out
-
     async def count_texts_async(
         self, texts: list[str], loop: asyncio.AbstractEventLoop
     ) -> list[int]:
-        """Per-text token counts for a whole batch without blocking the loop."""
+        """Per-text token counts for a whole batch without blocking the loop.
+
+        A worker-shard failure propagates and is treated as an incomplete drain.
+        """
         if not texts:
             return []
-        if not self._procs:
-            return await loop.run_in_executor(
-                self._thread, self._encode_lengths_inproc, texts
-            )
-        chunks = _even_chunks(texts, len(self._procs))
-        futures = [
-            asyncio.wrap_future(self._procs[i].submit(_worker_encode_lengths, chunk))
-            for i, chunk in enumerate(chunks)
-        ]
-        results = await asyncio.gather(*futures)
-        out: list[int] = []
-        for r in results:
-            out.extend(r)
-        return out
+        if self._procs:
+            chunks = _even_chunks(texts, len(self._procs))
+            futures = [
+                asyncio.wrap_future(ex.submit(_worker_encode_lengths, chunk))
+                for ex, chunk in zip(self._procs, chunks, strict=False)
+            ]
+            results = await asyncio.gather(*futures)
+            return [n for r in results for n in r]
+        if self._thread is None:
+            raise RuntimeError("BatchTokenizer is closed")
+        return await loop.run_in_executor(
+            self._thread, self._encode_lengths_inproc, texts
+        )
 
     # -- sync + chat-template paths (in-process thread) ---------------------
 
@@ -338,25 +323,6 @@ def _token_count_message(
             ]
             return self._token_count_text("\n".join(parts))
 
-    def token_count(self, text: str) -> int:
-        """Token count for a single string (blocking)."""
-        if self._thread is None:
-            raise RuntimeError("BatchTokenizer is closed")
-        return self._thread.submit(self._token_count_text, text).result()
-
-    def token_count_message(
-        self,
-        content: str,
-        reasoning: str | None,
-        tool_calls: tuple[dict[str, Any], ...] | None,
-    ) -> int:
-        """Token count for an assistant message via the chat template (blocking)."""
-        if self._thread is None:
-            raise RuntimeError("BatchTokenizer is closed")
-        return self._thread.submit(
-            self._token_count_message, content, reasoning, tool_calls
-        ).result()
-
     async def token_count_message_async(
         self,
         content: str,
@@ -372,7 +338,11 @@ async def token_count_message_async(
         )
 
     def close(self) -> None:
-        """Shut down all workers. Idempotent."""
+        """Shut down all workers. Idempotent.
+
+        Shard shutdown uses ``wait=False``: a hung worker must not block
+        aggregator shutdown; idle workers exit on their own once signalled.
+        """
         for ex in self._procs:
             ex.shutdown(wait=False)
         self._procs = []
@@ -389,7 +359,7 @@ def __exit__(self, exc_type: object, exc_val: object, exc_tb: object) -> None:
 
 # Type alias for the (content, reasoning, tool_calls) tuple a message trigger
 # enqueues for chat-template tokenization.
-MessageParts = tuple[str, str | None, "tuple[dict[str, Any], ...] | None"]
+MessageParts = tuple[str, str | None, tuple[dict[str, Any], ...] | None]
 
 
 class TokenCounter(Protocol):
@@ -417,18 +387,15 @@ async def token_count_message_async(
 class TokenBatchQueue:
     """Buffers per-sample tokenization work and clears it in batches.
 
-    Triggers call ``enqueue_text`` / ``enqueue_message`` at event time with a
+    Triggers call ``enqueue_text`` / ``enqueue_message`` at event time with an
     ``on_count`` callback that records the resulting metric. The aggregator
-    drains the buffer with ``flush`` (once per publish tick, so live ISL/OSL/
-    TPOT stay current) and with ``flush_remaining`` at end-of-run. Holding the
-    work until a flush lets the whole buffer go through ``BatchTokenizer`` in
-    one sharded call, instead of one event-loop task per completion — the latter
-    is what fell behind and stretched the drain on high-completion-rate runs.
+    flushes the buffer with ``flush`` once per publish tick (so live ISL/OSL/
+    TPOT stay current) and with ``flush_remaining`` at end-of-run, sending the
+    whole batch through ``BatchTokenizer`` in one sharded call.
 
     ``pending`` counts enqueued-but-not-yet-recorded items; it is the
-    ``n_pending_tasks`` surfaced on the snapshot, and a non-zero value in the
-    final snapshot means the end-of-run flush did not finish within the drain
-    budget.
+    ``n_pending_tasks`` on the snapshot. A non-zero value in the final snapshot
+    means the end-of-run flush did not finish within the drain budget or failed.
     """
 
     def __init__(
@@ -463,8 +430,9 @@ async def flush(self) -> None:
 
         Items are detached from the buffer up front so concurrent enqueues land
         in the next flush. ``_inflight`` is decremented only after a callback
-        runs, so a cancellation (drain timeout) leaves it reflecting exactly the
-        items that were not recorded.
+        runs, so a cancellation (drain timeout) or a tokenizer error leaves it
+        reflecting exactly the items that were not recorded — those surface as
+        ``pending`` (an incomplete drain), not as silently dropped samples.
         """
         async with self._lock:
             if not (self._text or self._msg):
@@ -492,8 +460,10 @@ async def flush(self) -> None:
     async def flush_remaining(self, timeout: float | None) -> int:
         """End-of-run flush, bounded by ``timeout`` seconds.
 
-        Returns the number of items still un-tokenized — non-zero only if the
-        budget was exhausted (``timeout`` reached). ``None`` waits indefinitely.
+        Returns the number of items still un-tokenized — non-zero if the budget
+        was exhausted (``timeout`` reached) or tokenization failed. ``None``
+        waits indefinitely. Never raises: a failure here must not stop the
+        aggregator from publishing the (incomplete) final snapshot.
         """
         if self._inflight == 0:
             return 0
@@ -508,4 +478,8 @@ async def flush_remaining(self, timeout: float | None) -> int:
                 timeout,
                 self._inflight,
             )
+        except Exception:  # noqa: BLE001 — drain must not block finalize.
+            logger.exception(
+                "tokenizer drain failed; %d items not counted", self._inflight
+            )
         return self._inflight
diff --git a/src/inference_endpoint/config/schema.py b/src/inference_endpoint/config/schema.py
index 722652e0d..0a59074f5 100644
--- a/src/inference_endpoint/config/schema.py
+++ b/src/inference_endpoint/config/schema.py
@@ -578,18 +578,17 @@ class DrainConfig(BaseModel):
             alias="--metrics-drain-timeout",
             help=(
                 "Wall-clock budget (seconds) for the metrics aggregator to finish "
-                "in-flight async tokenize tasks after the run ends before cancelling "
-                "them. Set to 0 to wait indefinitely. Increase for large datasets or "
-                "long-context workloads where ISL/OSL/TPOT tokenization lags behind "
-                "request throughput."
+                "tokenizing buffered samples after the run ends. Set to 0 to wait "
+                "indefinitely. Increase for very large datasets where the end-of-run "
+                "tokenize batch is big."
             ),
         ),
     ] = Field(
         60.0,
         ge=0,
         description=(
-            "Wall-clock budget (seconds) for the metrics aggregator to drain "
-            "in-flight tokenize tasks after ENDED (default: 60.0; 0 = unlimited)."
+            "Wall-clock budget (seconds) to finish tokenizing buffered samples "
+            "after ENDED (default: 60.0; 0 = unlimited)."
         ),
     )
 
diff --git a/src/inference_endpoint/config/templates/concurrency_template_full.yaml b/src/inference_endpoint/config/templates/concurrency_template_full.yaml
index 693765e57..75feab6fb 100644
--- a/src/inference_endpoint/config/templates/concurrency_template_full.yaml
+++ b/src/inference_endpoint/config/templates/concurrency_template_full.yaml
@@ -79,7 +79,7 @@ settings:
     warmup_timeout_s: 240.0  # Warmup drain timeout in seconds (None = wait indefinitely)
     performance_timeout_s: 240.0  # Performance drain timeout in seconds (None = wait indefinitely)
     accuracy_timeout_s: null  # Accuracy drain timeout in seconds (None = wait indefinitely)
-    metrics_drain_timeout_s: 60.0  # Wall-clock budget (seconds) for the metrics aggregator to drain in-flight tokenize tasks after ENDED (default: 60.0; 0 = unlimited).
+    metrics_drain_timeout_s: 60.0  # Wall-clock budget (seconds) to finish tokenizing buffered samples after ENDED (default: 60.0; 0 = unlimited).
   warmup:
     enabled: false  # Enable warmup phase before performance run
     n_requests: null  # Warmup request count (None = full dataset once)
diff --git a/src/inference_endpoint/config/templates/offline_template_full.yaml b/src/inference_endpoint/config/templates/offline_template_full.yaml
index 64439452f..3ff1ccd17 100644
--- a/src/inference_endpoint/config/templates/offline_template_full.yaml
+++ b/src/inference_endpoint/config/templates/offline_template_full.yaml
@@ -79,7 +79,7 @@ settings:
     warmup_timeout_s: 240.0  # Warmup drain timeout in seconds (None = wait indefinitely)
     performance_timeout_s: 240.0  # Performance drain timeout in seconds (None = wait indefinitely)
     accuracy_timeout_s: null  # Accuracy drain timeout in seconds (None = wait indefinitely)
-    metrics_drain_timeout_s: 60.0  # Wall-clock budget (seconds) for the metrics aggregator to drain in-flight tokenize tasks after ENDED (default: 60.0; 0 = unlimited).
+    metrics_drain_timeout_s: 60.0  # Wall-clock budget (seconds) to finish tokenizing buffered samples after ENDED (default: 60.0; 0 = unlimited).
   warmup:
     enabled: false  # Enable warmup phase before performance run
     n_requests: null  # Warmup request count (None = full dataset once)
diff --git a/src/inference_endpoint/config/templates/online_template_full.yaml b/src/inference_endpoint/config/templates/online_template_full.yaml
index 0c810f30b..1287b99af 100644
--- a/src/inference_endpoint/config/templates/online_template_full.yaml
+++ b/src/inference_endpoint/config/templates/online_template_full.yaml
@@ -79,7 +79,7 @@ settings:
     warmup_timeout_s: 240.0  # Warmup drain timeout in seconds (None = wait indefinitely)
     performance_timeout_s: 240.0  # Performance drain timeout in seconds (None = wait indefinitely)
     accuracy_timeout_s: null  # Accuracy drain timeout in seconds (None = wait indefinitely)
-    metrics_drain_timeout_s: 60.0  # Wall-clock budget (seconds) for the metrics aggregator to drain in-flight tokenize tasks after ENDED (default: 60.0; 0 = unlimited).
+    metrics_drain_timeout_s: 60.0  # Wall-clock budget (seconds) to finish tokenizing buffered samples after ENDED (default: 60.0; 0 = unlimited).
   warmup:
     enabled: false  # Enable warmup phase before performance run
     n_requests: null  # Warmup request count (None = full dataset once)
diff --git a/tests/unit/async_utils/services/metrics_aggregator/conftest.py b/tests/unit/async_utils/services/metrics_aggregator/conftest.py
index b32811bcf..51d25565a 100644
--- a/tests/unit/async_utils/services/metrics_aggregator/conftest.py
+++ b/tests/unit/async_utils/services/metrics_aggregator/conftest.py
@@ -60,9 +60,6 @@ class MockBatchTokenizer:
     def __init__(self, delay: float = 0.0) -> None:
         self._delay = delay
 
-    def token_count(self, text: str) -> int:
-        return len(text.split())
-
     async def count_texts_async(
         self, texts: list[str], _loop: asyncio.AbstractEventLoop
     ) -> list[int]:
diff --git a/tests/unit/async_utils/services/metrics_aggregator/test_aggregator.py b/tests/unit/async_utils/services/metrics_aggregator/test_aggregator.py
index 87c5ff96b..3337b168b 100644
--- a/tests/unit/async_utils/services/metrics_aggregator/test_aggregator.py
+++ b/tests/unit/async_utils/services/metrics_aggregator/test_aggregator.py
@@ -1052,10 +1052,50 @@ async def test_shutdown_flushes_buffered_tokenizations(self, tmp_path):
             finally:
                 agg.close()
 
-    # NOTE(agents): Trigger exception handling (logger.exception paths) is not
-    # exercised here. A MockBatchTokenizer whose count_texts_async raises would
-    # let us assert the flush surfaces the error without crashing the
-    # aggregator and that the buffer is cleared.
+    @pytest.mark.asyncio
+    async def test_drain_failure_reports_pending_and_finalizes(self, tmp_path):
+        """A tokenizer error during the ENDED drain must not skip finalize.
+
+        flush_remaining swallows non-timeout failures and returns the stuck
+        count, so publish_final still runs with n_pending_tasks > 0 (incomplete
+        drain) instead of the error escaping process() and hanging main().
+        """
+        loop = asyncio.get_event_loop()
+
+        class FailingBatchTokenizer:
+            async def count_texts_async(self, texts, _loop):
+                raise RuntimeError("tokenizer backend died")
+
+            async def token_count_message_async(self, *args):
+                raise RuntimeError("tokenizer backend died")
+
+        with ManagedZMQContext.scoped(socket_dir=str(tmp_path)) as ctx:
+            agg, _, publisher = make_aggregator(
+                ctx, loop, "agg_drain_failure", tokenizer=FailingBatchTokenizer()
+            )
+            try:
+                await agg.process(
+                    [
+                        session_event(
+                            SessionEventType.START_PERFORMANCE_TRACKING, ts=0
+                        ),
+                        sample_event(
+                            SampleEventType.ISSUED,
+                            "s1",
+                            ts=1000,
+                            data=PromptData(text="some text to tokenize"),
+                        ),
+                    ]
+                )
+                assert agg._token_queue is not None
+                assert agg._token_queue.pending > 0
+                await agg.process([session_event(SessionEventType.ENDED, ts=2000)])
+
+                publisher.publish_final.assert_awaited_once()
+                assert publisher.publish_final.await_args.kwargs["n_pending_tasks"] > 0
+                publisher.aclose.assert_awaited_once()
+            finally:
+                agg.close()
 
     @pytest.mark.asyncio
     async def test_drain_timeout_reports_pending_count(self, tmp_path):
diff --git a/tests/unit/async_utils/services/metrics_aggregator/test_main_signal_handler.py b/tests/unit/async_utils/services/metrics_aggregator/test_main_signal_handler.py
index 32f159403..3428f6f22 100644
--- a/tests/unit/async_utils/services/metrics_aggregator/test_main_signal_handler.py
+++ b/tests/unit/async_utils/services/metrics_aggregator/test_main_signal_handler.py
@@ -50,8 +50,7 @@ async def test_sigterm_handler_holds_strong_reference_to_finalize_task():
     registry = MagicMock()
     table = MagicMock()
     table.total_tracked_duration_ns = 0
-    token_queue = MagicMock()
-    token_queue.pending = 0
+    n_pending = 0
 
     # publish_final blocks on an event so we can observe the task
     # mid-execution and exercise the strong-ref contract.
@@ -70,7 +69,7 @@ async def _slow_publish(*args, **kwargs):
         registry=registry,
         publisher=publisher,
         table=table,
-        token_queue=token_queue,
+        pending_tokens=lambda: n_pending,
         shutdown_event=shutdown_event,
     )
 
@@ -124,8 +123,7 @@ async def test_sigterm_handler_refreshes_tracked_duration():
     registry = MagicMock()
     table = MagicMock()
     table.total_tracked_duration_ns = 12345
-    token_queue = MagicMock()
-    token_queue.pending = 3
+    n_pending = 3
 
     publisher = MagicMock()
     publisher.publish_final = AsyncMock()
@@ -137,7 +135,7 @@ async def test_sigterm_handler_refreshes_tracked_duration():
         registry=registry,
         publisher=publisher,
         table=table,
-        token_queue=token_queue,
+        pending_tokens=lambda: n_pending,
         shutdown_event=shutdown_event,
     )
     on_sigterm()
diff --git a/tests/unit/async_utils/services/metrics_aggregator/test_token_metrics.py b/tests/unit/async_utils/services/metrics_aggregator/test_token_metrics.py
index b59e56501..03e47158c 100644
--- a/tests/unit/async_utils/services/metrics_aggregator/test_token_metrics.py
+++ b/tests/unit/async_utils/services/metrics_aggregator/test_token_metrics.py
@@ -17,13 +17,15 @@
 
 import asyncio
 import time
-from concurrent.futures import ThreadPoolExecutor
+from concurrent.futures import Future
+from concurrent.futures.process import BrokenProcessPool
 from unittest.mock import patch
 
 import pytest
 from inference_endpoint.async_utils.services.metrics_aggregator.token_metrics import (
     BatchTokenizer,
     TokenBatchQueue,
+    _even_chunks,
 )
 
 _MOCK_TARGET = "inference_endpoint.async_utils.services.metrics_aggregator.token_metrics.AutoTokenizer"
@@ -33,8 +35,7 @@ class _FakeTokenizer:
     """Deterministic tokenizer that splits on whitespace.
 
     Has no ``backend_tokenizer``, so BatchTokenizer keeps the batch path
-    in-process (no subprocess shards) and ``count_texts`` falls back to
-    ``tokenize`` per text — which is what these tests assert against.
+    in-process (no subprocess shards) and counts via ``tokenize`` per text.
     """
 
     def __init__(self, load_delay: float = 0.0):
@@ -49,31 +50,66 @@ def from_pretrained(cls, name: str, **kwargs: object) -> "_FakeTokenizer":
         return cls()
 
 
+class _FakeProc:
+    """Stands in for a ProcessPoolExecutor shard; whitespace-counts its chunk."""
+
+    def submit(self, _fn, chunk):
+        fut: Future = Future()
+        fut.set_result([len(t.split()) for t in chunk])
+        return fut
+
+    def shutdown(self, wait=False):
+        pass
+
+
+class _BrokenProc:
+    """A shard whose work resolves to BrokenProcessPool (worker died)."""
+
+    def submit(self, _fn, _chunk):
+        fut: Future = Future()
+        fut.set_exception(BrokenProcessPool("worker died"))
+        return fut
+
+    def shutdown(self, wait=False):
+        pass
+
+
 @pytest.mark.unit
 class TestBatchTokenizer:
-    def test_token_count_returns_int(self):
+    @pytest.mark.asyncio
+    async def test_count_texts_async(self):
         with patch(_MOCK_TARGET, _FakeTokenizer):
+            loop = asyncio.get_running_loop()
             with BatchTokenizer("fake") as tok:
-                assert tok.token_count("Hello world") == 2
+                counts = await tok.count_texts_async(["Hello world foo", "a"], loop)
+                assert counts == [3, 1]
 
-    def test_count_texts_batch(self):
+    @pytest.mark.asyncio
+    async def test_count_texts_async_empty(self):
         with patch(_MOCK_TARGET, _FakeTokenizer):
+            loop = asyncio.get_running_loop()
             with BatchTokenizer("fake") as tok:
-                assert tok.count_texts(["a b", "c d e", "x"]) == [2, 3, 1]
+                assert await tok.count_texts_async([], loop) == []
 
-    def test_count_texts_empty(self):
+    @pytest.mark.asyncio
+    async def test_count_texts_async_sharded(self):
+        """With shards present, chunks are reassembled in original order."""
         with patch(_MOCK_TARGET, _FakeTokenizer):
+            loop = asyncio.get_running_loop()
             with BatchTokenizer("fake") as tok:
-                assert tok.count_texts([]) == []
+                tok._procs = [_FakeProc(), _FakeProc()]
+                counts = await tok.count_texts_async(["a", "b b", "c c c", "d"], loop)
+                assert counts == [1, 2, 3, 1]
 
-    def test_concurrent_token_count_thread_safe(self):
+    @pytest.mark.asyncio
+    async def test_count_texts_async_shard_failure_propagates(self):
+        """A dead shard surfaces as an error, not a silent in-process fallback."""
         with patch(_MOCK_TARGET, _FakeTokenizer):
+            loop = asyncio.get_running_loop()
             with BatchTokenizer("fake") as tok:
-                texts = [f"word{i} word{i + 1}" for i in range(20)]
-                with ThreadPoolExecutor(max_workers=8) as executor:
-                    futures = [executor.submit(tok.token_count, t) for t in texts]
-                    results = [f.result() for f in futures]
-                assert results == [2] * 20
+                tok._procs = [_BrokenProc()]
+                with pytest.raises(BrokenProcessPool):
+                    await tok.count_texts_async(["a b"], loop)
 
     def test_close_is_idempotent(self):
         with patch(_MOCK_TARGET, _FakeTokenizer):
@@ -81,27 +117,14 @@ def test_close_is_idempotent(self):
             tok.close()
             tok.close()  # must not raise
 
-    def test_use_after_close_raises(self):
-        with patch(_MOCK_TARGET, _FakeTokenizer):
-            tok = BatchTokenizer("fake")
-            tok.close()
-            with pytest.raises(RuntimeError, match="closed"):
-                tok.token_count("hello")
-
     @pytest.mark.asyncio
-    async def test_count_texts_async(self):
+    async def test_use_after_close_raises(self):
         with patch(_MOCK_TARGET, _FakeTokenizer):
             loop = asyncio.get_running_loop()
-            with BatchTokenizer("fake") as tok:
-                counts = await tok.count_texts_async(["Hello world foo", "a"], loop)
-                assert counts == [3, 1]
-
-    def test_context_manager(self):
-        with patch(_MOCK_TARGET, _FakeTokenizer):
-            with BatchTokenizer("fake") as tok:
-                assert tok.token_count("a b c") == 3
+            tok = BatchTokenizer("fake")
+            tok.close()
             with pytest.raises(RuntimeError, match="closed"):
-                tok.token_count("test")
+                await tok.count_texts_async(["hello"], loop)
 
 
 class _FakeTokenizerWithTemplate(_FakeTokenizer):
@@ -130,16 +153,23 @@ def apply_chat_template(
 
 @pytest.mark.unit
 class TestBatchTokenizerMessageTokenization:
-    def test_token_count_message_subtracts_baseline(self):
-        """token_count_message returns full_tokens - baseline."""
+    @pytest.mark.asyncio
+    async def test_token_count_message_subtracts_baseline(self):
+        """token_count_message_async returns full_tokens - baseline."""
         with patch(_MOCK_TARGET, _FakeTokenizerWithTemplate):
+            loop = asyncio.get_running_loop()
             with BatchTokenizer("fake") as tok:
                 # "hello world" -> 2 content + 2 wrapper = 4; baseline = 0, prefix = 2
-                assert tok.token_count_message("hello world", None, None) == 2
+                count = await tok.token_count_message_async(
+                    "hello world", None, None, loop
+                )
+                assert count == 2
 
-    def test_token_count_message_includes_tool_calls(self):
-        """token_count_message includes tool-call JSON tokens."""
+    @pytest.mark.asyncio
+    async def test_token_count_message_includes_tool_calls(self):
+        """Tool-call JSON tokens are included in the count."""
         with patch(_MOCK_TARGET, _FakeTokenizerWithTemplate):
+            loop = asyncio.get_running_loop()
             with BatchTokenizer("fake") as tok:
                 tool_calls = (
                     {
@@ -148,11 +178,14 @@ def test_token_count_message_includes_tool_calls(self):
                         "function": {"name": "f", "arguments": "{}"},
                     },
                 )
-                without = tok.token_count_message("hello", None, None)
-                with_calls = tok.token_count_message("hello", None, tool_calls)
+                without = await tok.token_count_message_async("hello", None, None, loop)
+                with_calls = await tok.token_count_message_async(
+                    "hello", None, tool_calls, loop
+                )
                 assert with_calls > without
 
-    def test_token_count_message_fallback_on_exception(self):
+    @pytest.mark.asyncio
+    async def test_token_count_message_fallback_on_exception(self):
         """Falls back to whitespace split when apply_chat_template raises."""
 
         class _BadTemplateTokenizer(_FakeTokenizer):
@@ -160,6 +193,7 @@ def apply_chat_template(self, *args, **kwargs):
                 raise ValueError("template does not support tool_calls")
 
         with patch(_MOCK_TARGET, _BadTemplateTokenizer):
+            loop = asyncio.get_running_loop()
             with BatchTokenizer("fake") as tok:
                 tool_calls = (
                     {
@@ -169,17 +203,31 @@ def apply_chat_template(self, *args, **kwargs):
                     },
                 )
                 # Must not raise; falls back to whitespace tokenizer.
-                assert tok.token_count_message("hello world", None, tool_calls) > 0
-
-    @pytest.mark.asyncio
-    async def test_token_count_message_async(self):
-        with patch(_MOCK_TARGET, _FakeTokenizerWithTemplate):
-            loop = asyncio.get_running_loop()
-            with BatchTokenizer("fake") as tok:
                 count = await tok.token_count_message_async(
-                    "hello world", None, None, loop
+                    "hello world", None, tool_calls, loop
                 )
-                assert count == 2
+                assert count > 0
+
+
+@pytest.mark.unit
+class TestEvenChunks:
+    def test_splits_into_near_equal_chunks(self):
+        assert _even_chunks(["a", "b", "c", "d", "e"], 2) == [
+            ["a", "b", "c"],
+            ["d", "e"],
+        ]
+
+    def test_single_chunk_when_n_le_one(self):
+        assert _even_chunks(["a", "b"], 1) == [["a", "b"]]
+
+    def test_single_item_input(self):
+        assert _even_chunks(["only"], 4) == [["only"]]
+
+    def test_preserves_order_and_bounds_chunk_count(self):
+        items = [str(i) for i in range(10)]
+        chunks = _even_chunks(items, 3)
+        assert [x for c in chunks for x in c] == items
+        assert len(chunks) <= 3
 
 
 class _CapturingTokenizer:
@@ -247,3 +295,20 @@ async def token_count_message_async(self, *args):
         n_pending = await queue.flush_remaining(timeout=0.05)
         assert n_pending == 1
         assert recorded == []
+
+    async def test_flush_remaining_failure_reports_pending(self):
+        """A tokenizer error leaves items pending and never raises."""
+
+        class _FailingTokenizer:
+            async def count_texts_async(self, texts, _loop):
+                raise RuntimeError("tokenizer boom")
+
+            async def token_count_message_async(self, *args):
+                raise RuntimeError("tokenizer boom")
+
+        loop = asyncio.get_running_loop()
+        queue = TokenBatchQueue(_FailingTokenizer(), loop)
+        recorded: list[int] = []
+        queue.enqueue_text("x y", recorded.append)
+        assert await queue.flush_remaining(timeout=5.0) == 1
+        assert recorded == []

From 82a12bca9a80840ca4c1b8b293d03afe9a53de0e Mon Sep 17 00:00:00 2001
From: Viraat Chandra <viraatc@nvidia.com>
Date: Tue, 9 Jun 2026 18:12:32 -0700
Subject: [PATCH 03/20] fix(metrics): tokenizer stage uses the whole machine;
 restore --tokenizer-workers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Review-council + e2e findings on the batch-tokenization branch. The
tokenizer drain runs after the benchmark, so the loadgen/worker affinity
partition does not apply to it — but the aggregator subprocess inherited
the loadgen's narrow pin (subprocess.Popen propagates the parent mask)
and sharding silently never engaged under the default
enable_cpu_affinity=true.

- cpu_affinity: add expand_to_all_online_cpus() — reset the current
  process's CPU affinity mask to every online CPU (the kernel still
  clamps it to the cgroup/Slurm cpuset). The aggregator calls it before
  constructing the tokenizer, so shards size to the full machine by
  default.
- Restore the --tokenizer-workers service flag with shard semantics:
  -1 auto (one process per 8-core block), explicit count clamped to
  capacity, 0 disables sharding. Every fallback path logs its reason and
  the success log includes setup time.
- flush() phase isolation: a text-batch failure no longer drops the
  message items (separate failure scopes per executor; first error
  re-raised after both phases), and a raising recorder callback is
  logged instead of poisoning the rest of the batch.
- Shard workers ignore SIGINT: Ctrl-C goes to the whole process group;
  the parent drain must control worker lifetime.
- Stale "in-flight async tokenize tasks" wording updated in
  snapshot.py, publisher.py, and AGENTS.md (TokenizePool reference);
  documented the wait=False shard shutdown.

Validated e2e through the real launch path (echo server, default flags,
48-CPU host): aggregator expands 10 -> 48 CPUs, "BatchTokenizer: 6
shards x 8 cores", drain to n_pending_tasks=0, state=complete. 166 unit
tests pass; pre-commit clean.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 AGENTS.md                                     |   2 +-
 .../services/metrics_aggregator/__main__.py   |  24 ++++-
 .../services/metrics_aggregator/publisher.py  |   6 +-
 .../services/metrics_aggregator/snapshot.py   |  19 ++--
 .../metrics_aggregator/token_metrics.py       | 100 +++++++++++++-----
 .../endpoint_client/cpu_affinity.py           |  26 +++++
 .../metrics_aggregator/test_publisher.py      |  77 ++++++++++++++
 .../metrics_aggregator/test_token_metrics.py  |  79 ++++++++++++++
 .../unit/endpoint_client/test_cpu_affinity.py |  28 +++++
 9 files changed, 323 insertions(+), 38 deletions(-)

diff --git a/AGENTS.md b/AGENTS.md
index 79bc5ded4..050d9e5b3 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -204,7 +204,7 @@ src/inference_endpoint/
 │   │       ├── publisher.py    # MetricsPublisher (tick task + atomic disk fallback)
 │   │       ├── subscriber.py   # MetricsSnapshotSubscriber (latest + COMPLETE snapshot capture)
 │   │       ├── metrics_table.py # In-flight sample rows + trigger dispatch (TTFT/TPOT/ISL/OSL)
-│   │       └── token_metrics.py # TokenizePool (HF tokenizer thread pool for ISL/OSL/TPOT)
+│   │       └── token_metrics.py # BatchTokenizer (sharded batch tokenization) + TokenBatchQueue (defer-to-flush buffer) for ISL/OSL/TPOT
 │   └── transport/             # ZMQ-based IPC transport layer
 │       ├── protocol.py        # Transport protocols + TransportConfig + MessageCodec[T]
 │       └── zmq/               # ZMQ implementation (context, pubsub, transport, ZMQTransportConfig)
diff --git a/src/inference_endpoint/async_utils/services/metrics_aggregator/__main__.py b/src/inference_endpoint/async_utils/services/metrics_aggregator/__main__.py
index 7d2101c11..fc975246a 100644
--- a/src/inference_endpoint/async_utils/services/metrics_aggregator/__main__.py
+++ b/src/inference_endpoint/async_utils/services/metrics_aggregator/__main__.py
@@ -26,6 +26,10 @@
 from inference_endpoint.async_utils.loop_manager import LoopManager
 from inference_endpoint.async_utils.transport.zmq.context import ManagedZMQContext
 from inference_endpoint.async_utils.transport.zmq.ready_check import send_ready_signal
+from inference_endpoint.endpoint_client.cpu_affinity import (
+    UnsupportedPlatformError,
+    expand_to_all_online_cpus,
+)
 from inference_endpoint.utils.logging import setup_logging
 
 from .aggregator import MetricCounterKey, MetricsAggregatorService
@@ -159,6 +163,15 @@ async def main() -> None:
         default=None,
         help="HuggingFace tokenizer name for ISL/OSL/TPOT (e.g. 'gpt2'). If not set, token metrics are disabled.",
     )
+    parser.add_argument(
+        "--tokenizer-workers",
+        type=int,
+        default=-1,
+        help=(
+            "Number of tokenizer shard processes (-1 = auto: one per "
+            "8-core block of this machine; 0 = in-process tokenization)."
+        ),
+    )
     parser.add_argument(
         "--streaming",
         action="store_true",
@@ -201,7 +214,16 @@ async def main() -> None:
     # (coalesces to 'object' not 'AbstractContextManager[BatchTokenizer | None]')
     tokenizer_cm: AbstractContextManager[BatchTokenizer | None]
     if args.tokenizer:
-        tokenizer_cm = BatchTokenizer(args.tokenizer)
+        # Tokenization drains after the benchmark run, so the loadgen/worker
+        # affinity partition does not apply to this stage: drop the narrow
+        # mask inherited from the pinned parent so shards size to the whole
+        # machine (cgroup/Slurm CPU limits still apply).
+        try:
+            cpus = expand_to_all_online_cpus()
+            logger.info("metrics aggregator affinity: %d CPUs", len(cpus))
+        except UnsupportedPlatformError:
+            pass  # non-Linux: no inherited pin to undo.
+        tokenizer_cm = BatchTokenizer(args.tokenizer, n_workers=args.tokenizer_workers)
     else:
         tokenizer_cm = nullcontext()
 
diff --git a/src/inference_endpoint/async_utils/services/metrics_aggregator/publisher.py b/src/inference_endpoint/async_utils/services/metrics_aggregator/publisher.py
index b58aa05ff..ae60942f0 100644
--- a/src/inference_endpoint/async_utils/services/metrics_aggregator/publisher.py
+++ b/src/inference_endpoint/async_utils/services/metrics_aggregator/publisher.py
@@ -168,9 +168,9 @@ async def publish_final(
     ) -> None:
         """Write the final snapshot to disk and signal pub/sub consumers.
 
-        ``n_pending_tasks`` is the count of in-flight async tokenize tasks
-        at finalization time. Drain timeout is detected by Report consumers
-        as ``state == COMPLETE and n_pending_tasks > 0``.
+        ``n_pending_tasks`` is the count of buffered tokenizations not yet
+        recorded at finalization time. An incomplete drain is detected by
+        Report consumers as ``state == COMPLETE and n_pending_tasks > 0``.
 
         ``interrupted=True`` is set by the signal handler in __main__.py
         when SIGTERM/SIGINT triggers shutdown before ``ENDED`` arrived;
diff --git a/src/inference_endpoint/async_utils/services/metrics_aggregator/snapshot.py b/src/inference_endpoint/async_utils/services/metrics_aggregator/snapshot.py
index 95c68ab16..eacac94f5 100644
--- a/src/inference_endpoint/async_utils/services/metrics_aggregator/snapshot.py
+++ b/src/inference_endpoint/async_utils/services/metrics_aggregator/snapshot.py
@@ -44,8 +44,8 @@ class SessionState(str, Enum):
                   state to carry).
     LIVE        → run in progress; tick task publishing live HDR-derived stats.
     DRAINING    → ``SessionEventType.ENDED`` has been received; the aggregator
-                  is awaiting the in-flight async tokenize tasks (bounded by
-                  the ``--drain-timeout`` budget, default 60 s). Tick task
+                  is tokenizing the buffered samples (bounded by the
+                  ``--drain-timeout`` budget, default 60 s). Tick task
                   continues at this stage, still HDR-derived; no new events
                   will arrive.
     COMPLETE    → terminal clean state. The ``publish_final()`` snapshot
@@ -149,13 +149,14 @@ class MetricsSnapshot(
                           ``INTERRUPTED``) mark the last snapshot of the run;
                           for ``COMPLETE`` snapshots percentiles and
                           histograms are exact, otherwise HDR-derived.
-        n_pending_tasks:  Count of in-flight async tokenize tasks at snapshot
-                          composition time. ``> 0`` during normal load (ISL/
-                          OSL/TPOT post-processing in flight) and during the
-                          drain phase. **Drain timeout is detected as**
-                          ``state == COMPLETE and n_pending_tasks > 0``: the
-                          aggregator gave up draining; some async-only series
-                          are missing samples that were still being tokenized.
+        n_pending_tasks:  Count of buffered tokenizations not yet recorded at
+                          snapshot composition time. ``> 0`` during normal
+                          load (ISL/OSL/TPOT buffered between publish-tick
+                          flushes) and during the drain phase. **An
+                          incomplete drain is detected as** ``state ==
+                          COMPLETE and n_pending_tasks > 0``: the end-of-run
+                          flush timed out or failed; the token-derived series
+                          are missing those samples.
         metrics:          Tagged union of ``CounterStat`` and ``SeriesStat``,
                           ordered counters-first then series, registration
                           order within each.
diff --git a/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py b/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py
index d3a236d2b..8ad0ae10b 100644
--- a/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py
+++ b/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py
@@ -30,6 +30,8 @@
 import logging
 import multiprocessing
 import os
+import signal
+import time
 from collections.abc import Callable
 from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
 from typing import TYPE_CHECKING, Any, Protocol, cast
@@ -94,6 +96,10 @@ def _init_worker(tokenizer_name: str, core_set: list[int]) -> None:
     Affinity is set before the first encode so the Rust rayon pool sizes itself
     to the pinned core count (num_cpus respects sched_getaffinity on Linux).
     """
+    # Ctrl-C sends SIGINT to the whole foreground process group; the parent
+    # drives worker shutdown, so a worker dying mid-drain would break the pool
+    # and lose the buffered tokenizations it was counting.
+    signal.signal(signal.SIGINT, signal.SIG_IGN)
     if core_set:
         try:
             os.sched_setaffinity(0, set(core_set))
@@ -151,20 +157,21 @@ def __init__(
         tokenizer_name: str,
         *,
         cores_per_worker: int = CORES_PER_WORKER,
+        n_workers: int = -1,
     ) -> None:
         self._tokenizer_name = tokenizer_name
         self._fallback_warned: set[str] = set()
         self._tokenizer: PreTrainedTokenizerBase | None = None
         self._prefix_len = 0
         self._baseline = 0
-        # In-process thread for the sync + chat-template paths.
+        # In-process thread for the chat-template path.
         self._thread: ThreadPoolExecutor | None = ThreadPoolExecutor(
             max_workers=1, thread_name_prefix="tok-thread"
         )
         self._load_tokenizer()  # also computes the chat-template baseline
         # Process shards for the batched text path (or empty -> in-process).
         self._procs: list[ProcessPoolExecutor] = []
-        self._setup_shards(cores_per_worker)
+        self._setup_shards(cores_per_worker, n_workers)
 
     # -- setup --------------------------------------------------------------
 
@@ -201,24 +208,47 @@ def _load_tokenizer(self) -> None:
                 self._tokenizer_name,
             )
 
-    def _setup_shards(self, cores_per_worker: int) -> None:
+    def _setup_shards(self, cores_per_worker: int, n_workers: int) -> None:
         """Spawn one pinned single-worker process per core block.
 
-        No-op (leaving the batch path in-process) when the tokenizer has no fast
-        Rust backend, affinity is unavailable, or fewer than two blocks fit — a
-        single shard is no faster than the in-process backend.
+        ``n_workers <= 0`` (auto) fits as many shards as this process's
+        affinity mask allows, one per ``cores_per_worker`` block; an explicit
+        count is clamped to that capacity. No-op (leaving the batch path
+        in-process) when the tokenizer has no fast Rust backend, affinity is
+        unavailable, or — in auto mode — fewer than two blocks fit (a single
+        shard is no faster than the in-process backend). Each fallback is
+        logged: a missing "shards" INFO line is the only other signal that
+        the batched path is running single-threaded.
         """
-        if cores_per_worker <= 0:
+        if cores_per_worker <= 0 or n_workers == 0:
+            logger.info("BatchTokenizer: sharding disabled")
             return
         if getattr(self._tokenizer, "backend_tokenizer", None) is None:
+            logger.info(
+                "BatchTokenizer: no fast tokenizer backend; using in-process "
+                "tokenization"
+            )
             return
         try:
             available = sorted(os.sched_getaffinity(0))
         except (OSError, AttributeError):
+            logger.info(
+                "BatchTokenizer: CPU affinity unavailable; using in-process "
+                "tokenization"
+            )
             return
-        n = len(available) // cores_per_worker
-        if n < 2:
+        capacity = len(available) // cores_per_worker
+        n = capacity if n_workers < 0 else min(n_workers, capacity)
+        if n < (2 if n_workers < 0 else 1):
+            logger.info(
+                "BatchTokenizer: %d CPUs available (capacity %d blocks of %d); "
+                "using in-process tokenization",
+                len(available),
+                capacity,
+                cores_per_worker,
+            )
             return
+        t0 = time.perf_counter()
         ctx = multiprocessing.get_context("spawn")
         procs: list[ProcessPoolExecutor] = []
         try:
@@ -247,7 +277,10 @@ def _setup_shards(self, cores_per_worker: int) -> None:
             return
         self._procs = procs
         logger.info(
-            "BatchTokenizer: %d shards x %d cores", len(procs), cores_per_worker
+            "BatchTokenizer: %d shards x %d cores (setup %.1fs)",
+            len(procs),
+            cores_per_worker,
+            time.perf_counter() - t0,
         )
 
     # -- batched text path --------------------------------------------------
@@ -439,23 +472,42 @@ async def flush(self) -> None:
                 return
             text_items, self._text = self._text, []
             msg_items, self._msg = self._msg, []
+            # The text and message phases fail independently — they run on
+            # separate executors, so a dead text shard must not drop message
+            # items that would still succeed (and vice versa). The first
+            # failure is re-raised after both phases so callers still see it.
+            failure: Exception | None = None
             if text_items:
-                counts = await self._tokenizer.count_texts_async(
-                    [t for t, _ in text_items], self._loop
-                )
-                for (_, on_count), count in zip(text_items, counts, strict=True):
-                    try:
-                        on_count(count)
-                    finally:
-                        self._inflight -= 1
+                try:
+                    counts = await self._tokenizer.count_texts_async(
+                        [t for t, _ in text_items], self._loop
+                    )
+                except Exception as exc:  # noqa: BLE001 — isolate phases.
+                    failure = exc
+                else:
+                    for (_, on_count), count in zip(text_items, counts, strict=True):
+                        self._record(on_count, count)
             for (content, reasoning, tool_calls), on_count in msg_items:
-                count = await self._tokenizer.token_count_message_async(
-                    content, reasoning, tool_calls, self._loop
-                )
                 try:
-                    on_count(count)
-                finally:
-                    self._inflight -= 1
+                    count = await self._tokenizer.token_count_message_async(
+                        content, reasoning, tool_calls, self._loop
+                    )
+                except Exception as exc:  # noqa: BLE001 — isolate items.
+                    failure = failure or exc
+                    continue
+                self._record(on_count, count)
+            if failure is not None:
+                raise failure
+
+    def _record(self, on_count: Callable[[int], None], count: int) -> None:
+        """Run one recorder callback; a raising recorder must not poison the
+        rest of the batch, and the item still counts as recorded."""
+        try:
+            on_count(count)
+        except Exception:  # noqa: BLE001 — per-item isolation.
+            logger.exception("token metric recorder failed")
+        finally:
+            self._inflight -= 1
 
     async def flush_remaining(self, timeout: float | None) -> int:
         """End-of-run flush, bounded by ``timeout`` seconds.
diff --git a/src/inference_endpoint/endpoint_client/cpu_affinity.py b/src/inference_endpoint/endpoint_client/cpu_affinity.py
index 8972a59d9..0de6e39a4 100644
--- a/src/inference_endpoint/endpoint_client/cpu_affinity.py
+++ b/src/inference_endpoint/endpoint_client/cpu_affinity.py
@@ -317,6 +317,32 @@ def pin_loadgen(
         return None
 
 
+@require_linux
+def expand_to_all_online_cpus() -> set[int]:
+    """Reset the current process's affinity to every online CPU.
+
+    Undoes a narrow mask inherited from a pinned parent (subprocesses spawned
+    after ``pin_loadgen`` inherit the loadgen mask). The kernel intersects the
+    request with the cgroup cpuset, so container/Slurm CPU limits still apply.
+
+    Returns:
+        The effective CPU set after the reset.
+
+    Raises:
+        UnsupportedPlatformError: If not running on Linux.
+    """
+    online = _read_sysfs_cpulist(_SYSFS_CPU / "online") or set()
+    if online:
+        try:
+            os.sched_setaffinity(0, online)
+        except OSError as e:
+            logger.warning(f"Could not expand CPU affinity: {e}")
+    try:
+        return os.sched_getaffinity(0)
+    except OSError:
+        return online
+
+
 @require_linux
 def set_cpu_affinity(pid: int, cpus: set[int]) -> bool:
     """Set CPU affinity for a process.
diff --git a/tests/unit/async_utils/services/metrics_aggregator/test_publisher.py b/tests/unit/async_utils/services/metrics_aggregator/test_publisher.py
index 9e26f734a..15ad4d95c 100644
--- a/tests/unit/async_utils/services/metrics_aggregator/test_publisher.py
+++ b/tests/unit/async_utils/services/metrics_aggregator/test_publisher.py
@@ -81,6 +81,83 @@ def get_runtime_state() -> tuple[SessionState, int]:
         finally:
             publisher.close()
 
+    @pytest.mark.asyncio
+    async def test_pre_publish_runs_before_each_tick_snapshot(
+        self, tmp_path: Path, zmq_ctx_scope: ManagedZMQContext
+    ):
+        """pre_publish is awaited before the runtime state is captured."""
+        loop = asyncio.get_event_loop()
+        publisher = MetricsPublisher(
+            MetricsSnapshotCodec(),
+            zmq_ctx_scope,
+            "test_pub_pre",
+            loop,
+            final_snapshot_path=tmp_path / "final_snapshot.json",
+        )
+        try:
+            registry = MetricsRegistry()
+            registry.register_counter("c")
+            order: list[str] = []
+
+            async def pre_publish() -> None:
+                order.append("flush")
+
+            def get_runtime_state() -> tuple[SessionState, int]:
+                order.append("state")
+                return SessionState.LIVE, 0
+
+            publisher.start(
+                registry,
+                publish_interval_s=0.01,
+                get_runtime_state=get_runtime_state,
+                pre_publish=pre_publish,
+            )
+            await asyncio.sleep(0.05)
+            assert order, "no tick ran"
+            # Every state capture is preceded by a flush in the same tick.
+            assert order[0] == "flush"
+            for i, entry in enumerate(order):
+                if entry == "state":
+                    assert order[i - 1] == "flush"
+        finally:
+            publisher.close()
+
+    @pytest.mark.asyncio
+    async def test_pre_publish_failure_keeps_ticking(
+        self, tmp_path: Path, zmq_ctx_scope: ManagedZMQContext
+    ):
+        """A raising pre_publish is swallowed by the tick; ticks continue."""
+        loop = asyncio.get_event_loop()
+        publisher = MetricsPublisher(
+            MetricsSnapshotCodec(),
+            zmq_ctx_scope,
+            "test_pub_pre_fail",
+            loop,
+            final_snapshot_path=tmp_path / "final_snapshot.json",
+        )
+        try:
+            registry = MetricsRegistry()
+            registry.register_counter("c")
+            attempts = 0
+
+            async def pre_publish() -> None:
+                nonlocal attempts
+                attempts += 1
+                raise RuntimeError("tokenizer hiccup")
+
+            publisher.start(
+                registry,
+                publish_interval_s=0.01,
+                get_runtime_state=lambda: (SessionState.LIVE, 0),
+                pre_publish=pre_publish,
+            )
+            await asyncio.sleep(0.08)
+            assert attempts >= 2, "tick task died after a pre_publish failure"
+            assert publisher._tick_task is not None
+            assert not publisher._tick_task.done()
+        finally:
+            publisher.close()
+
     @pytest.mark.asyncio
     async def test_publish_final_writes_json_atomically(
         self, tmp_path: Path, zmq_ctx_scope: ManagedZMQContext
diff --git a/tests/unit/async_utils/services/metrics_aggregator/test_token_metrics.py b/tests/unit/async_utils/services/metrics_aggregator/test_token_metrics.py
index 03e47158c..82609f275 100644
--- a/tests/unit/async_utils/services/metrics_aggregator/test_token_metrics.py
+++ b/tests/unit/async_utils/services/metrics_aggregator/test_token_metrics.py
@@ -22,10 +22,15 @@
 from unittest.mock import patch
 
 import pytest
+from inference_endpoint.async_utils.services.metrics_aggregator import (
+    token_metrics as token_metrics_module,
+)
 from inference_endpoint.async_utils.services.metrics_aggregator.token_metrics import (
     BatchTokenizer,
     TokenBatchQueue,
+    _encode_batch_lengths,
     _even_chunks,
+    _worker_encode_lengths,
 )
 
 _MOCK_TARGET = "inference_endpoint.async_utils.services.metrics_aggregator.token_metrics.AutoTokenizer"
@@ -209,6 +214,43 @@ def apply_chat_template(self, *args, **kwargs):
                 assert count > 0
 
 
+class _Encoding:
+    def __init__(self, n: int):
+        self.ids = list(range(n))
+
+
+class _FastBackend:
+    """Raw-tokenizers backend stub with the fast batch entry point."""
+
+    def encode_batch_fast(self, texts, add_special_tokens=False):
+        return [_Encoding(len(t.split())) for t in texts]
+
+
+class _SlowBackend:
+    """Raw-tokenizers backend stub without encode_batch_fast."""
+
+    def encode_batch(self, texts, add_special_tokens=False):
+        return [_Encoding(len(t.split())) for t in texts]
+
+
+@pytest.mark.unit
+class TestEncodeHelpers:
+    def test_encode_batch_lengths_prefers_fast(self):
+        assert _encode_batch_lengths(_FastBackend(), ["a b", "c"]) == [2, 1]
+
+    def test_encode_batch_lengths_falls_back_to_encode_batch(self):
+        assert _encode_batch_lengths(_SlowBackend(), ["a b c", "d"]) == [3, 1]
+
+    def test_worker_encode_lengths_raises_without_backend(self, monkeypatch):
+        monkeypatch.setattr(token_metrics_module, "_WORKER_BACKEND", None)
+        with pytest.raises(RuntimeError, match="backend unavailable"):
+            _worker_encode_lengths(["a"])
+
+    def test_worker_encode_lengths_uses_backend(self, monkeypatch):
+        monkeypatch.setattr(token_metrics_module, "_WORKER_BACKEND", _FastBackend())
+        assert _worker_encode_lengths(["a b", "c d e"]) == [2, 3]
+
+
 @pytest.mark.unit
 class TestEvenChunks:
     def test_splits_into_near_equal_chunks(self):
@@ -312,3 +354,40 @@ async def token_count_message_async(self, *args):
         queue.enqueue_text("x y", recorded.append)
         assert await queue.flush_remaining(timeout=5.0) == 1
         assert recorded == []
+
+    async def test_flush_text_failure_does_not_drop_message_items(self):
+        """The message phase runs (and records) even when the text batch fails."""
+
+        class _TextFailingTokenizer:
+            async def count_texts_async(self, texts, _loop):
+                raise RuntimeError("text shard died")
+
+            async def token_count_message_async(
+                self, content, reasoning, tool_calls, _loop
+            ):
+                return len(content.split())
+
+        loop = asyncio.get_running_loop()
+        queue = TokenBatchQueue(_TextFailingTokenizer(), loop)
+        recorded: list[int] = []
+        queue.enqueue_text("never counted", recorded.append)
+        queue.enqueue_message(("hello world", None, None), recorded.append)
+        with pytest.raises(RuntimeError, match="text shard died"):
+            await queue.flush()
+        assert recorded == [2], "message item must survive the text failure"
+        assert queue.pending == 1, "only the text item remains pending"
+
+    async def test_flush_recorder_failure_does_not_poison_batch(self):
+        """One raising on_count is logged; the rest of the batch still records."""
+        loop = asyncio.get_running_loop()
+        queue = TokenBatchQueue(_CapturingTokenizer(), loop)
+        recorded: list[int] = []
+
+        def bad_recorder(count: int) -> None:
+            raise ValueError("recorder bug")
+
+        queue.enqueue_text("a b", bad_recorder)
+        queue.enqueue_text("c d e", recorded.append)
+        await queue.flush()
+        assert recorded == [3]
+        assert queue.pending == 0, "a raising recorder still counts as recorded"
diff --git a/tests/unit/endpoint_client/test_cpu_affinity.py b/tests/unit/endpoint_client/test_cpu_affinity.py
index 52ef724e2..7d100be9d 100644
--- a/tests/unit/endpoint_client/test_cpu_affinity.py
+++ b/tests/unit/endpoint_client/test_cpu_affinity.py
@@ -6,6 +6,7 @@
 from inference_endpoint.endpoint_client.cpu_affinity import (
     AffinityPlan,
     compute_affinity_plan,
+    expand_to_all_online_cpus,
     get_all_online_cpus,
     pin_loadgen,
     set_cpu_affinity,
@@ -146,3 +147,30 @@ def test_all_methods_fail_returns_empty(
         """Test that empty set is returned when all methods fail."""
         cpus = get_all_online_cpus()
         assert cpus == set()
+
+
+class TestExpandToAllOnlineCpus:
+    @patch("os.sched_getaffinity")
+    @patch("os.sched_setaffinity")
+    @patch("pathlib.Path.read_text")
+    def test_expands_inherited_mask_to_online(self, mock_read, mock_set, mock_get):
+        """The full sysfs online set is requested; the effective mask is returned."""
+        mock_read.return_value = "0-7\n"
+        mock_get.return_value = {0, 1, 2, 3, 4, 5, 6, 7}
+
+        cpus = expand_to_all_online_cpus()
+
+        mock_set.assert_called_once_with(0, {0, 1, 2, 3, 4, 5, 6, 7})
+        assert cpus == {0, 1, 2, 3, 4, 5, 6, 7}
+
+    @patch("os.sched_getaffinity")
+    @patch("os.sched_setaffinity", side_effect=OSError("cpuset denies"))
+    @patch("pathlib.Path.read_text")
+    def test_setaffinity_failure_returns_current_mask(
+        self, mock_read, mock_set, mock_get
+    ):
+        """A denied expansion is non-fatal: the current mask is reported."""
+        mock_read.return_value = "0-7\n"
+        mock_get.return_value = {0, 1}
+
+        assert expand_to_all_online_cpus() == {0, 1}

From 8033c4726fd14d1e86099b8e94160f7e19e06a67 Mon Sep 17 00:00:00 2001
From: Viraat Chandra <viraatc@nvidia.com>
Date: Tue, 9 Jun 2026 23:41:43 -0700
Subject: [PATCH 04/20] chore(metrics): use pass bodies in TokenCounter
 protocol stubs

The ellipsis bodies trip the code-quality bot's "statement has no
effect" check on every push; pass is semantically identical for
Protocol method declarations and keeps the report clean.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../services/metrics_aggregator/token_metrics.py            | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py b/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py
index 8ad0ae10b..4e49aedc8 100644
--- a/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py
+++ b/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py
@@ -405,7 +405,8 @@ class TokenCounter(Protocol):
 
     async def count_texts_async(
         self, texts: list[str], loop: asyncio.AbstractEventLoop, /
-    ) -> list[int]: ...
+    ) -> list[int]:
+        pass
 
     async def token_count_message_async(
         self,
@@ -414,7 +415,8 @@ async def token_count_message_async(
         tool_calls: tuple[dict[str, Any], ...] | None,
         loop: asyncio.AbstractEventLoop,
         /,
-    ) -> int: ...
+    ) -> int:
+        pass
 
 
 class TokenBatchQueue:

From 47c4f35ac669ae58f1cce7d782863e64c1fd7541 Mon Sep 17 00:00:00 2001
From: Viraat Chandra <viraatc@nvidia.com>
Date: Wed, 10 Jun 2026 01:50:03 -0700
Subject: [PATCH 05/20] docs(metrics): add metrics-aggregator design doc;
 refresh services overview

New docs/async_utils/services/metrics_aggregator/DESIGN.md (mirroring the
event_logger convention) covering the service lifecycle and the token
metrics pipeline: defer-to-flush batching, process-sharded batch encoding,
the post-run affinity expansion, failure isolation, and the
n_pending_tasks contract. The services overview 6.2 entry now reflects the
batched tokenizer, the snapshot outputs, and the current CLI flags, and
links the new doc.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 docs/async_utils/services/DESIGN.md           |   6 +-
 .../services/metrics_aggregator/DESIGN.md     | 171 ++++++++++++++++++
 2 files changed, 174 insertions(+), 3 deletions(-)
 create mode 100644 docs/async_utils/services/metrics_aggregator/DESIGN.md

diff --git a/docs/async_utils/services/DESIGN.md b/docs/async_utils/services/DESIGN.md
index a26f13783..e12eb8a4d 100644
--- a/docs/async_utils/services/DESIGN.md
+++ b/docs/async_utils/services/DESIGN.md
@@ -306,9 +306,9 @@ stateDiagram-v2
 
 ### 6.2 Metrics aggregator
 
-- **Role**: Subscribes to EventRecords and derives real-time metrics (e.g. TTFT, sample latency, token counts). May use a tokenizer pool for token-based metrics. Shuts down on **session.ended**.
-- **Outputs**: Planned is to push real time metrics to Prometheus via PushGateway. Currently, logging / writing final report to JSON is sufficient legacy behavior.
-- **Process**: Run as a **subprocess**; given `--metrics-dir`, `--socket-dir`, `--socket-name`, and optional tokenizer options. Uses a dedicated event loop and `ManagedZMQContext.scoped(socket_dir=...)` so it can connect to the publisher's IPC address.
+- **Role**: Subscribes to EventRecords and derives real-time metrics (e.g. TTFT, sample latency, token counts). Token metrics (ISL/OSL/TPOT) are computed by a batched, process-sharded tokenizer — see [metrics_aggregator/DESIGN.md](metrics_aggregator/DESIGN.md). Shuts down on **session.ended**.
+- **Outputs**: Live `MetricsSnapshot` frames over an IPC PUB socket, and an atomically written `final_snapshot.json` (the primary Report source). Pushing real-time metrics to Prometheus via PushGateway is planned.
+- **Process**: Run as a **subprocess**; given `--metrics-output-dir`, `--socket-dir`, `--socket-name`, `--metrics-socket`, and optional tokenizer options. Uses a dedicated event loop and `ManagedZMQContext.scoped(socket_dir=...)` so it can connect to the publisher's IPC address.
 
 ---
 
diff --git a/docs/async_utils/services/metrics_aggregator/DESIGN.md b/docs/async_utils/services/metrics_aggregator/DESIGN.md
new file mode 100644
index 000000000..4f094097a
--- /dev/null
+++ b/docs/async_utils/services/metrics_aggregator/DESIGN.md
@@ -0,0 +1,171 @@
+# Metrics Aggregator Service — Design Document
+
+## Overview
+
+The metrics aggregator is a **subprocess** (`python -m
+inference_endpoint.async_utils.services.metrics_aggregator`) that subscribes to
+the EventRecord pub/sub stream, folds per-sample events into a
+`MetricsRegistry` (counters + HDR-histogram series + raw values), and publishes
+`MetricsSnapshot` frames over an IPC PUB socket at a fixed cadence. At
+end-of-run it atomically writes `final_snapshot.json`, which is the **primary**
+source for `Report`; the terminal pub/sub frame is only a TUI "run finished"
+signal.
+
+This document covers the service's lifecycle and, in depth, the **token
+metrics pipeline** — how ISL/OSL/TPOT tokenization keeps pace with
+high-completion-rate runs.
+
+## Module Layout
+
+| File               | Purpose                                                                   |
+| ------------------ | ------------------------------------------------------------------------- |
+| `__main__.py`      | Subprocess entry: argparse, affinity expansion, lifecycle wiring, SIGTERM |
+| `aggregator.py`    | `MetricsAggregatorService` — event router, session state, drain           |
+| `registry.py`      | `MetricsRegistry`, `CounterSampler`, `SeriesSampler`                      |
+| `snapshot.py`      | `MetricsSnapshot` wire schema, `SessionState`, msgpack codec              |
+| `publisher.py`     | `MetricsPublisher` — tick task + atomic final-snapshot write              |
+| `subscriber.py`    | `MetricsSnapshotSubscriber` — main-process consumer                       |
+| `metrics_table.py` | In-flight sample rows + trigger dispatch (TTFT/TPOT/ISL/OSL)              |
+| `token_metrics.py` | `BatchTokenizer` (sharded batch tokenization) + `TokenBatchQueue`         |
+
+## Lifecycle
+
+```
+INITIALIZE ──STARTED──► LIVE ──ENDED──► DRAINING ──► COMPLETE
+                                                └──► INTERRUPTED  (SIGTERM/SIGINT)
+```
+
+- **LIVE**: the publisher tick task emits a snapshot every
+  `--publish-interval` seconds (default 0.25 s).
+- **DRAINING**: entered on `ENDED`; the buffered tokenizations are flushed,
+  bounded by the `--drain-timeout` budget (default 60 s; `0` = unlimited).
+- The ENDED path runs inside a finalization boundary: whatever the drain does
+  — finish, time out, or fail — `publish_final` and the shutdown signal always
+  run. A tokenizer failure can degrade the snapshot (see the
+  `n_pending_tasks` contract below) but can never hang the subprocess.
+- **INTERRUPTED**: a signal handler writes a best-effort partial final
+  snapshot so `Report` can distinguish a killed run from a clean one.
+
+## Token Metrics Pipeline
+
+ISL, OSL, and TPOT all require running the HF tokenizer over prompt or
+completion text. With streaming on, each completed sample needs up to three
+tokenizer passes, so at high completion rates tokenization is the service's
+dominant CPU cost — and a per-event dispatch model cannot keep up: work
+arriving faster than it drains accumulates an unbounded backlog that must be
+paid at end-of-run. The pipeline is therefore built around two ideas:
+**defer-to-flush batching** and **process-sharded batch encoding**.
+
+### Defer-to-flush (`TokenBatchQueue`)
+
+Token triggers do no work at event time. `fire()` appends
+`(text, on_count)` — or `(message_parts, on_count)` for chat-template items —
+to a buffer, an O(1) operation with no event-loop tasks. The buffer is cleared
+in batches at exactly two points:
+
+1. **Every publish tick** — the publisher awaits a `pre_publish` hook before
+   composing each snapshot, so live ISL/OSL/TPOT reflect recently completed
+   samples. A failure here is swallowed by the tick (live publishing never
+   stops).
+2. **End-of-run** — `flush_remaining(timeout)` drains everything still
+   buffered, bounded by the drain budget.
+
+`flush()` serializes under an asyncio lock and detaches the buffer up front,
+so enqueues that race a flush land in the next one. Failure isolation is
+layered: the plain-text phase and the chat-template phase fail independently
+(they run on separate executors, so a dead text shard must not drop message
+items), a raising recorder callback is logged without aborting the rest of
+the batch, and the first error is re-raised only after both phases ran.
+`flush_remaining` never raises — a timeout or tokenizer failure becomes a
+logged, non-zero pending count.
+
+### Sharded batch encoding (`BatchTokenizer`)
+
+A flush hands the whole buffer to `count_texts_async`, which splits it into
+contiguous chunks and fans them out across worker **processes**, one pinned to
+each block of `CORES_PER_WORKER` (8) cores. Why this shape:
+
+- Each worker runs the raw `tokenizers` backend's `encode_batch_fast` — Rust,
+  rayon-parallel, no Python-per-text cost. Batching amortizes the
+  submit/result overhead over thousands of texts.
+- A single BPE rayon pool is memory-bound and saturates at ~8 cores; more
+  threads oversubscribe and, on multi-socket parts, cross the NUMA boundary.
+  Sharding across processes pinned to disjoint 8-core blocks (affinity set
+  **before** the backend loads, so each rayon pool sizes itself to its block
+  and stays NUMA-local) is how the whole machine is used.
+- Workers are spawn-context processes with module-level entry points (pickled
+  by name), warmed in parallel at construction so N tokenizer loads do not
+  serialize, and they ignore SIGINT — Ctrl-C goes to the whole process group,
+  and worker lifetime must stay under the parent drain's control.
+
+`--tokenizer-workers` controls the shard count: `-1` (default) auto-fits one
+shard per 8-core block of the process affinity mask, an explicit count is
+clamped to that capacity, and `0` disables sharding. Every fallback to the
+in-process path (no fast Rust backend, affinity unavailable, or fewer than two
+blocks in auto mode) is logged with its reason — a missing "shards" INFO line
+should never be the only signal that the batch path is running single-threaded.
+
+Chat-template items (tool-call outputs) take a separate in-process thread:
+they are rare relative to the batched flush, and `apply_chat_template` is
+Python/Jinja — sharding buys nothing. A template baseline (the empty
+assistant-message frame) is computed once and subtracted so only the payload
+is counted.
+
+### CPU affinity: the tokenizer stage is post-run
+
+The benchmark parent pins itself to the loadgen cores before launching
+services, and subprocesses inherit that narrow mask. The tokenizer's heavy
+work happens **after** the run (the end-of-run flush), so the run-time core
+partition does not apply to it: at startup the service calls
+`expand_to_all_online_cpus()` (see `endpoint_client/cpu_affinity.py`) to reset
+its mask to every online CPU — the kernel still clamps to the cgroup/Slurm
+cpuset — and shards size to the full machine. Mid-run tick flushes are small
+batches; the drain is where the core count pays off.
+
+### The `n_pending_tasks` contract
+
+`TokenBatchQueue.pending` counts enqueued-but-not-yet-recorded items and is
+surfaced on every snapshot as `n_pending_tasks`. In the **final** snapshot:
+
+- `state == complete && n_pending_tasks == 0` — clean run, token series exact.
+- `state == complete && n_pending_tasks > 0` — **incomplete drain**: the
+  end-of-run flush ran out of budget or the tokenizer failed; token-derived
+  series are missing exactly that many samples. `Report` renders a warning.
+
+Items dropped by a failed flush are intentionally _not_ removed from the
+pending count — under-reporting an incomplete drain would silently rebadge it
+as a clean run.
+
+### Data flow
+
+```
+COMPLETE event ─► TokenTrigger.fire ─► queue.enqueue(text, on_count)   [O(1)]
+                                            │
+        publish tick (0.25 s) ──────────────┤  flush()
+        ENDED drain (budgeted) ─────────────┘    │
+                                                 ├─► chunks ─► N pinned worker procs
+                                                 │             (encode_batch_fast)
+                                                 └─► on_count(n) ─► registry.record()
+```
+
+## CLI Interface
+
+| Flag                             | Default  | Purpose                                             |
+| -------------------------------- | -------- | --------------------------------------------------- |
+| `--socket-dir` / `--socket-name` | required | EventRecord SUB socket                              |
+| `--metrics-socket`               | required | Snapshot PUB socket name                            |
+| `--metrics-output-dir`           | required | Directory for `final_snapshot.json`                 |
+| `--publish-interval`             | 0.25     | Live snapshot cadence (seconds)                     |
+| `--drain-timeout`                | 60.0     | End-of-run tokenize budget (`0` = unlimited)        |
+| `--tokenizer`                    | none     | HF name or local path; unset disables token metrics |
+| `--tokenizer-workers`            | -1       | Shard processes (`-1` auto, `0` in-process)         |
+| `--streaming`                    | off      | Register TTFT/chunk-delta/TPOT triggers             |
+
+## References
+
+- [docs/async_utils/services/DESIGN.md](../DESIGN.md) — the EventRecord
+  pub/sub system this service subscribes to.
+- [docs/PERF_ARCHITECTURE.md](../../../PERF_ARCHITECTURE.md) — CPU pinning
+  strategy for the loadgen/worker hot path.
+- AGENTS.md "Metrics Aggregator subprocess" — the condensed contract summary
+  for AI agents.

From 1315a737916ea69ed45ad201ed1970002c6b596b Mon Sep 17 00:00:00 2001
From: Viraat Chandra <viraatc@nvidia.com>
Date: Wed, 10 Jun 2026 14:41:19 -0700
Subject: [PATCH 06/20] fix(metrics): publish live snapshots through tokenizer
 failures; bound shard warmup
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Review-council findings (handled locally):

- A persistently failing pre_publish flush aborted every tick before the
  snapshot was built, silently stopping ALL live metrics publishing — not
  just token series. The flush now fails in its own handler (logged once)
  and the tick always proceeds to build and publish; unflushed items stay
  visible as n_pending_tasks. Regression-tested: a failing flush must not
  suppress state capture/publish.
- Shard warmup waits are bounded (_SHARD_WARMUP_TIMEOUT_S): a hung
  tokenizer load (e.g. stuck network filesystem) now degrades to the
  in-process path instead of wedging service startup forever.
- close() and warmup cleanup terminate shard workers (cancel_futures +
  SIGTERM) so an in-flight encode cannot stall interpreter exit after a
  drain timeout.
- TokenCounter protocol stubs use docstring + raise NotImplementedError
  (the one body shape CodeQL, mypy, and Pyright all accept).
- New TestSetupShardsDecisions pins the --tokenizer-workers contract
  (auto/clamp/disable thresholds, block pinning, affinity and warmup
  failure fallbacks) — previously zero coverage of the decision logic.

162 aggregator unit tests pass; pre-commit clean.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../services/metrics_aggregator/DESIGN.md     |  6 +-
 .../services/metrics_aggregator/publisher.py  | 25 ++++-
 .../metrics_aggregator/token_metrics.py       | 43 +++++++--
 .../metrics_aggregator/test_publisher.py      | 11 ++-
 .../metrics_aggregator/test_token_metrics.py  | 93 ++++++++++++++++++-
 5 files changed, 161 insertions(+), 17 deletions(-)

diff --git a/docs/async_utils/services/metrics_aggregator/DESIGN.md b/docs/async_utils/services/metrics_aggregator/DESIGN.md
index 4f094097a..30fe533d6 100644
--- a/docs/async_utils/services/metrics_aggregator/DESIGN.md
+++ b/docs/async_utils/services/metrics_aggregator/DESIGN.md
@@ -95,8 +95,10 @@ each block of `CORES_PER_WORKER` (8) cores. Why this shape:
   and stays NUMA-local) is how the whole machine is used.
 - Workers are spawn-context processes with module-level entry points (pickled
   by name), warmed in parallel at construction so N tokenizer loads do not
-  serialize, and they ignore SIGINT — Ctrl-C goes to the whole process group,
-  and worker lifetime must stay under the parent drain's control.
+  serialize (the warmup wait is bounded — a hung load degrades to the
+  in-process path instead of wedging startup), and they ignore SIGINT —
+  Ctrl-C goes to the whole process group, and worker lifetime must stay under
+  the parent drain's control.
 
 `--tokenizer-workers` controls the shard count: `-1` (default) auto-fits one
 shard per 8-core block of the process affinity mask, an explicit count is
diff --git a/src/inference_endpoint/async_utils/services/metrics_aggregator/publisher.py b/src/inference_endpoint/async_utils/services/metrics_aggregator/publisher.py
index ae60942f0..fedc0fbe1 100644
--- a/src/inference_endpoint/async_utils/services/metrics_aggregator/publisher.py
+++ b/src/inference_endpoint/async_utils/services/metrics_aggregator/publisher.py
@@ -116,8 +116,10 @@ def start(
         ``pre_publish``, if given, is awaited at the top of each tick before
         the snapshot is built — the aggregator uses it to flush buffered
         tokenizations so live ISL/OSL/TPOT reflect recently completed samples.
-        Its failures are swallowed by the tick's own try/except (the tick keeps
-        going), so a transient tokenizer hiccup never stops live publishing.
+        Its failures are swallowed in their own handler so the snapshot is
+        still built and published — even a tokenizer that fails on every tick
+        cannot stop live publishing; the unflushed items remain visible as
+        ``n_pending_tasks``.
 
         Idempotent on the tick-task slot: a second call (e.g. from a
         spurious duplicate ``STARTED`` event or a buggy replay producer)
@@ -137,11 +139,28 @@ def start(
             )
 
         async def _tick() -> None:
+            flush_failure_logged = False
             while True:
                 try:
                     await asyncio.sleep(publish_interval_s)
                     if pre_publish is not None:
-                        await pre_publish()
+                        # Isolated from the publish path: a persistently
+                        # broken tokenizer would otherwise abort every tick
+                        # here and stop ALL live snapshots, not just token
+                        # series. Unflushed items stay visible to consumers
+                        # via n_pending_tasks.
+                        try:
+                            await pre_publish()
+                        except Exception:  # noqa: BLE001 — publish anyway.
+                            if not flush_failure_logged:
+                                flush_failure_logged = True
+                                logger.exception(
+                                    "pre_publish flush failed; live snapshots "
+                                    "continue without fresh token metrics "
+                                    "(further failures logged at debug)"
+                                )
+                            else:
+                                logger.debug("pre_publish flush failed again")
                     state, n_pending = get_runtime_state()
                     snap = registry.build_snapshot(
                         state=state, n_pending_tasks=n_pending
diff --git a/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py b/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py
index 4e49aedc8..5e57a197d 100644
--- a/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py
+++ b/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py
@@ -46,6 +46,11 @@
 # used. Measured on GB200: ~16k texts/s at 18 blocks vs ~1.5k single-process.
 CORES_PER_WORKER = 8
 
+# Budget for the parallel shard warmup (spawn + transformers import +
+# tokenizer load per worker). A hung load (e.g. a stuck network filesystem)
+# must degrade to the in-process path, not wedge service startup.
+_SHARD_WARMUP_TIMEOUT_S = 120.0
+
 # Minimal user message used to satisfy chat templates that reject assistant-only
 # message lists. Its token count is subtracted so only the assistant payload is
 # measured.
@@ -132,6 +137,23 @@ def _worker_ready(_: int) -> bool:
     return _WORKER_BACKEND is not None
 
 
+def _terminate_procs(procs: list[ProcessPoolExecutor]) -> None:
+    """Best-effort immediate stop: cancel queued work and SIGTERM workers.
+
+    ``shutdown(wait=False)`` alone leaves an in-flight encode running, and the
+    non-daemon worker would still be joined at interpreter exit — so a drain
+    timeout could stall process shutdown until the chunk finished.
+    """
+    for ex in procs:
+        ex.shutdown(wait=False, cancel_futures=True)
+        workers = getattr(ex, "_processes", None) or {}  # CPython impl detail.
+        for p in workers.values():
+            try:
+                p.terminate()
+            except Exception:  # noqa: BLE001 — already-dead workers are fine.
+                pass
+
+
 if TYPE_CHECKING:
     from transformers import PreTrainedTokenizerBase
 
@@ -265,12 +287,13 @@ def _setup_shards(self, cores_per_worker: int, n_workers: int) -> None:
             # Submit to every shard first so the loads run in parallel, then
             # await — waiting on each before submitting the next would
             # serialize P tokenizer loads and can exceed the launch budget.
+            # The wait is bounded: one hung load must not wedge startup.
             ready = [ex.submit(_worker_ready, 0) for ex in procs]
+            deadline = time.monotonic() + _SHARD_WARMUP_TIMEOUT_S
             for f in ready:
-                f.result()
+                f.result(timeout=max(0.0, deadline - time.monotonic()))
         except Exception:
-            for ex in procs:
-                ex.shutdown(wait=False)
+            _terminate_procs(procs)
             logger.exception(
                 "tokenizer shard setup failed; using in-process tokenization"
             )
@@ -373,11 +396,11 @@ async def token_count_message_async(
     def close(self) -> None:
         """Shut down all workers. Idempotent.
 
-        Shard shutdown uses ``wait=False``: a hung worker must not block
-        aggregator shutdown; idle workers exit on their own once signalled.
+        Shards are stopped without waiting (a hung worker must not block
+        aggregator shutdown) and terminated so an in-flight encode cannot
+        stall interpreter exit after a drain timeout.
         """
-        for ex in self._procs:
-            ex.shutdown(wait=False)
+        _terminate_procs(self._procs)
         self._procs = []
         if self._thread is not None:
             self._thread.shutdown(wait=True)
@@ -406,7 +429,8 @@ class TokenCounter(Protocol):
     async def count_texts_async(
         self, texts: list[str], loop: asyncio.AbstractEventLoop, /
     ) -> list[int]:
-        pass
+        """Per-text token counts for a whole batch."""
+        raise NotImplementedError
 
     async def token_count_message_async(
         self,
@@ -416,7 +440,8 @@ async def token_count_message_async(
         loop: asyncio.AbstractEventLoop,
         /,
     ) -> int:
-        pass
+        """Chat-template token count for one assistant message."""
+        raise NotImplementedError
 
 
 class TokenBatchQueue:
diff --git a/tests/unit/async_utils/services/metrics_aggregator/test_publisher.py b/tests/unit/async_utils/services/metrics_aggregator/test_publisher.py
index 15ad4d95c..1d540ddec 100644
--- a/tests/unit/async_utils/services/metrics_aggregator/test_publisher.py
+++ b/tests/unit/async_utils/services/metrics_aggregator/test_publisher.py
@@ -139,20 +139,29 @@ async def test_pre_publish_failure_keeps_ticking(
             registry = MetricsRegistry()
             registry.register_counter("c")
             attempts = 0
+            published_states = 0
 
             async def pre_publish() -> None:
                 nonlocal attempts
                 attempts += 1
                 raise RuntimeError("tokenizer hiccup")
 
+            def get_runtime_state() -> tuple[SessionState, int]:
+                nonlocal published_states
+                published_states += 1
+                return SessionState.LIVE, 0
+
             publisher.start(
                 registry,
                 publish_interval_s=0.01,
-                get_runtime_state=lambda: (SessionState.LIVE, 0),
+                get_runtime_state=get_runtime_state,
                 pre_publish=pre_publish,
             )
             await asyncio.sleep(0.08)
             assert attempts >= 2, "tick task died after a pre_publish failure"
+            # The failure must not suppress the snapshot: every failing tick
+            # still proceeds to capture state and publish.
+            assert published_states >= 2, "failing pre_publish suppressed publishing"
             assert publisher._tick_task is not None
             assert not publisher._tick_task.done()
         finally:
diff --git a/tests/unit/async_utils/services/metrics_aggregator/test_token_metrics.py b/tests/unit/async_utils/services/metrics_aggregator/test_token_metrics.py
index 82609f275..14805b011 100644
--- a/tests/unit/async_utils/services/metrics_aggregator/test_token_metrics.py
+++ b/tests/unit/async_utils/services/metrics_aggregator/test_token_metrics.py
@@ -63,7 +63,7 @@ def submit(self, _fn, chunk):
         fut.set_result([len(t.split()) for t in chunk])
         return fut
 
-    def shutdown(self, wait=False):
+    def shutdown(self, wait=False, cancel_futures=False):
         pass
 
 
@@ -75,7 +75,7 @@ def submit(self, _fn, _chunk):
         fut.set_exception(BrokenProcessPool("worker died"))
         return fut
 
-    def shutdown(self, wait=False):
+    def shutdown(self, wait=False, cancel_futures=False):
         pass
 
 
@@ -251,6 +251,95 @@ def test_worker_encode_lengths_uses_backend(self, monkeypatch):
         assert _worker_encode_lengths(["a b", "c d e"]) == [2, 3]
 
 
+class _FakeTokenizerWithBackend(_FakeTokenizer):
+    """Fast-backend fake: lets ``_setup_shards`` proceed past the backend guard."""
+
+    backend_tokenizer = _FastBackend()
+
+
+class _SpawnlessExecutor:
+    """Stands in for ProcessPoolExecutor: records ctor args, instant warmup."""
+
+    def __init__(self, max_workers, mp_context=None, initializer=None, initargs=()):
+        self.initargs = initargs
+
+    def submit(self, fn, *args):
+        fut: Future = Future()
+        fut.set_result(True)
+        return fut
+
+    def shutdown(self, wait=False, cancel_futures=False):
+        pass
+
+
+@pytest.mark.unit
+class TestSetupShardsDecisions:
+    """Pins the --tokenizer-workers contract: -1 auto / N clamped / 0 disabled."""
+
+    def _make(self, monkeypatch, cpus, n_workers, executor=_SpawnlessExecutor):
+        monkeypatch.setattr(token_metrics_module, "ProcessPoolExecutor", executor)
+        monkeypatch.setattr(
+            token_metrics_module.os, "sched_getaffinity", lambda pid: set(range(cpus))
+        )
+        with patch(_MOCK_TARGET, _FakeTokenizerWithBackend):
+            return BatchTokenizer("fake", n_workers=n_workers)
+
+    @pytest.mark.parametrize(
+        "cpus, n_workers, expected_shards",
+        [
+            (16, -1, 2),  # auto: one shard per 8-core block
+            (10, -1, 0),  # auto needs >= 2 blocks (1 shard ~= in-process)
+            (48, 3, 3),  # explicit count under capacity
+            (16, 10, 2),  # explicit count clamped to capacity
+            (16, 1, 1),  # explicit single shard honored
+            (16, 0, 0),  # 0 disables sharding
+        ],
+    )
+    def test_shard_count(self, monkeypatch, cpus, n_workers, expected_shards):
+        tok = self._make(monkeypatch, cpus, n_workers)
+        try:
+            assert len(tok._procs) == expected_shards
+        finally:
+            tok.close()
+
+    def test_blocks_are_disjoint_consecutive_core_sets(self, monkeypatch):
+        tok = self._make(monkeypatch, 16, -1)
+        try:
+            blocks = [set(ex.initargs[1]) for ex in tok._procs]
+            assert blocks == [set(range(0, 8)), set(range(8, 16))]
+        finally:
+            tok.close()
+
+    def test_affinity_failure_falls_back_in_process(self, monkeypatch):
+        monkeypatch.setattr(
+            token_metrics_module, "ProcessPoolExecutor", _SpawnlessExecutor
+        )
+
+        def _raise(pid):
+            raise OSError("affinity unavailable")
+
+        monkeypatch.setattr(token_metrics_module.os, "sched_getaffinity", _raise)
+        with patch(_MOCK_TARGET, _FakeTokenizerWithBackend):
+            tok = BatchTokenizer("fake")
+        try:
+            assert tok._procs == []
+        finally:
+            tok.close()
+
+    def test_warmup_failure_falls_back_in_process(self, monkeypatch):
+        class _BrokenWarmup(_SpawnlessExecutor):
+            def submit(self, fn, *args):
+                fut: Future = Future()
+                fut.set_exception(RuntimeError("spawn died"))
+                return fut
+
+        tok = self._make(monkeypatch, 16, -1, executor=_BrokenWarmup)
+        try:
+            assert tok._procs == []
+        finally:
+            tok.close()
+
+
 @pytest.mark.unit
 class TestEvenChunks:
     def test_splits_into_near_equal_chunks(self):

From 6d227bfc0a8a1c79d8bd8e9c6f46b1234c5be0f2 Mon Sep 17 00:00:00 2001
From: Viraat Chandra <viraatc@nvidia.com>
Date: Wed, 10 Jun 2026 14:55:54 -0700
Subject: [PATCH 07/20] =?UTF-8?q?feat(metrics):=20no=20silent=20tokenizer?=
 =?UTF-8?q?=20fallbacks=20=E2=80=94=20shard=20or=20exit=20cleanly?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A fully set-up environment (fast Rust tokenizer backend + Linux affinity)
always shards; anything else was previously a silent in-process fallback
that cannot keep up with completions and only surfaces much later as an
incomplete drain. Setup is now strict:

- no fast backend / no CPU affinity / failed or over-budget warmup ->
  RuntimeError, surfaced by the service entry as a FATAL launch failure
- --tokenizer-workers 0 is the only (explicit) in-process mode
- auto mode always shards: max(1, cpus // 8) — the "fewer than two
  blocks" in-process heuristic is gone; a machine with less than one
  full 8-core block still gets one pinned shard

Also converts the new shard-decision tests to context-managed
BatchTokenizer construction (CodeQL: use-with-statement).

164 aggregator unit tests pass; pre-commit clean.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../services/metrics_aggregator/DESIGN.md     | 18 ++---
 .../services/metrics_aggregator/__main__.py   | 15 +++-
 .../metrics_aggregator/token_metrics.py       | 70 +++++++++----------
 .../metrics_aggregator/test_token_metrics.py  | 65 ++++++++---------
 4 files changed, 87 insertions(+), 81 deletions(-)

diff --git a/docs/async_utils/services/metrics_aggregator/DESIGN.md b/docs/async_utils/services/metrics_aggregator/DESIGN.md
index 30fe533d6..074cb569b 100644
--- a/docs/async_utils/services/metrics_aggregator/DESIGN.md
+++ b/docs/async_utils/services/metrics_aggregator/DESIGN.md
@@ -95,17 +95,17 @@ each block of `CORES_PER_WORKER` (8) cores. Why this shape:
   and stays NUMA-local) is how the whole machine is used.
 - Workers are spawn-context processes with module-level entry points (pickled
   by name), warmed in parallel at construction so N tokenizer loads do not
-  serialize (the warmup wait is bounded — a hung load degrades to the
-  in-process path instead of wedging startup), and they ignore SIGINT —
-  Ctrl-C goes to the whole process group, and worker lifetime must stay under
-  the parent drain's control.
+  serialize (the warmup wait is bounded — a hung load is a startup error, not
+  a wedge), and they ignore SIGINT — Ctrl-C goes to the whole process group,
+  and worker lifetime must stay under the parent drain's control.
 
 `--tokenizer-workers` controls the shard count: `-1` (default) auto-fits one
-shard per 8-core block of the process affinity mask, an explicit count is
-clamped to that capacity, and `0` disables sharding. Every fallback to the
-in-process path (no fast Rust backend, affinity unavailable, fewer than two
-blocks) is logged with its reason — a missing "shards" INFO line should never
-be the only signal that the batch path is running single-threaded.
+shard per 8-core block of the process affinity mask (always at least one), an
+explicit count is clamped to that capacity, and `0` explicitly selects
+in-process tokenization. There is no implicit fallback: an environment that
+cannot shard — no fast Rust backend, no CPU affinity, a failed or over-budget
+warmup — is a startup error, because a silent in-process slow path cannot
+keep up with completions and would surface much later as an incomplete drain.
 
 Chat-template items (tool-call outputs) take a separate in-process thread:
 they are rare relative to the batched flush, and `apply_chat_template` is
diff --git a/src/inference_endpoint/async_utils/services/metrics_aggregator/__main__.py b/src/inference_endpoint/async_utils/services/metrics_aggregator/__main__.py
index fc975246a..20a5b1dfb 100644
--- a/src/inference_endpoint/async_utils/services/metrics_aggregator/__main__.py
+++ b/src/inference_endpoint/async_utils/services/metrics_aggregator/__main__.py
@@ -169,7 +169,10 @@ async def main() -> None:
         default=-1,
         help=(
             "Number of tokenizer shard processes (-1 = auto: one per "
-            "8-core block of this machine; 0 = in-process tokenization)."
+            "8-core block of this machine, minimum one; 0 = explicit "
+            "in-process tokenization). An environment that cannot shard "
+            "(no fast tokenizer backend, no CPU affinity) is a startup "
+            "error unless 0 is passed."
         ),
     )
     parser.add_argument(
@@ -223,7 +226,15 @@ async def main() -> None:
             logger.info("metrics aggregator affinity: %d CPUs", len(cpus))
         except UnsupportedPlatformError:
             pass  # non-Linux: no inherited pin to undo.
-        tokenizer_cm = BatchTokenizer(args.tokenizer, n_workers=args.tokenizer_workers)
+        try:
+            tokenizer_cm = BatchTokenizer(
+                args.tokenizer, n_workers=args.tokenizer_workers
+            )
+        except RuntimeError as exc:
+            # Fail-fast contract: a tokenizer environment that cannot shard
+            # must surface as a clear service-launch failure, not a silent
+            # slow path that cannot keep up with completions.
+            raise SystemExit(f"FATAL: {exc}") from exc
     else:
         tokenizer_cm = nullcontext()
 
diff --git a/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py b/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py
index 5e57a197d..8aa678f84 100644
--- a/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py
+++ b/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py
@@ -18,9 +18,10 @@
 ``BatchTokenizer`` tokenizes whole batches at once, sharded across worker
 processes each pinned to a block of ``CORES_PER_WORKER`` cores (a single BPE
 rayon pool is memory-bound and saturates ~8 cores). The aggregator buffers
-per-sample text and flushes the batch once per publish tick and at drain. Falls
-back to a single in-process thread when there is no fast Rust backend or fewer
-than two core blocks fit.
+per-sample text and flushes the batch once per publish tick and at drain.
+Sharding requires the fast (Rust) tokenizers backend and Linux CPU affinity;
+an environment that cannot shard is a startup error, never a silent slow
+path — ``--tokenizer-workers 0`` is the only (explicit) in-process mode.
 """
 
 from __future__ import annotations
@@ -191,7 +192,8 @@ def __init__(
             max_workers=1, thread_name_prefix="tok-thread"
         )
         self._load_tokenizer()  # also computes the chat-template baseline
-        # Process shards for the batched text path (or empty -> in-process).
+        # Process shards for the batched text path. Empty only when
+        # in-process mode was explicitly requested (n_workers=0).
         self._procs: list[ProcessPoolExecutor] = []
         self._setup_shards(cores_per_worker, n_workers)
 
@@ -233,43 +235,34 @@ def _load_tokenizer(self) -> None:
     def _setup_shards(self, cores_per_worker: int, n_workers: int) -> None:
         """Spawn one pinned single-worker process per core block.
 
-        ``n_workers <= 0`` (auto) fits as many shards as this process's
-        affinity mask allows, one per ``cores_per_worker`` block; an explicit
-        count is clamped to that capacity. No-op (leaving the batch path
-        in-process) when the tokenizer has no fast Rust backend, affinity is
-        unavailable, or — in auto mode — fewer than two blocks fit (a single
-        shard is no faster than the in-process backend). Each fallback is
-        logged: a missing "shards" INFO line is the only other signal that
-        the batched path is running single-threaded.
+        ``n_workers == 0`` explicitly selects in-process tokenization. Auto
+        (``< 0``) fits one shard per ``cores_per_worker`` block of this
+        process's affinity mask, always at least one; an explicit count is
+        clamped to that capacity. An environment that cannot shard — no fast
+        Rust backend, no CPU affinity, a warmup that fails or exceeds its
+        budget — raises instead of silently degrading to a slow path that
+        cannot keep up with completions.
         """
         if cores_per_worker <= 0 or n_workers == 0:
-            logger.info("BatchTokenizer: sharding disabled")
+            logger.info("BatchTokenizer: in-process tokenization (explicit)")
             return
         if getattr(self._tokenizer, "backend_tokenizer", None) is None:
-            logger.info(
-                "BatchTokenizer: no fast tokenizer backend; using in-process "
-                "tokenization"
+            raise RuntimeError(
+                f"tokenizer {self._tokenizer_name!r} has no fast (Rust) "
+                "backend; token metrics require one to keep up with "
+                "completions. Pass --tokenizer-workers 0 to explicitly run "
+                "single-threaded in-process tokenization."
             )
-            return
         try:
             available = sorted(os.sched_getaffinity(0))
-        except (OSError, AttributeError):
-            logger.info(
-                "BatchTokenizer: CPU affinity unavailable; using in-process "
-                "tokenization"
-            )
-            return
-        capacity = len(available) // cores_per_worker
+        except (OSError, AttributeError) as exc:
+            raise RuntimeError(
+                "CPU affinity is unavailable; tokenizer sharding requires "
+                "Linux. Pass --tokenizer-workers 0 to explicitly run "
+                "in-process tokenization."
+            ) from exc
+        capacity = max(1, len(available) // cores_per_worker)
         n = capacity if n_workers < 0 else min(n_workers, capacity)
-        if n < (2 if n_workers < 0 else 1):
-            logger.info(
-                "BatchTokenizer: %d CPUs available (capacity %d blocks of %d); "
-                "using in-process tokenization",
-                len(available),
-                capacity,
-                cores_per_worker,
-            )
-            return
         t0 = time.perf_counter()
         ctx = multiprocessing.get_context("spawn")
         procs: list[ProcessPoolExecutor] = []
@@ -292,12 +285,13 @@ def _setup_shards(self, cores_per_worker: int, n_workers: int) -> None:
             deadline = time.monotonic() + _SHARD_WARMUP_TIMEOUT_S
             for f in ready:
                 f.result(timeout=max(0.0, deadline - time.monotonic()))
-        except Exception:
+        except Exception as exc:
             _terminate_procs(procs)
-            logger.exception(
-                "tokenizer shard setup failed; using in-process tokenization"
-            )
-            return
+            raise RuntimeError(
+                "tokenizer shard warmup failed; refusing to fall back to a "
+                "slow path. Fix the environment (or pass --tokenizer-workers "
+                "0 to explicitly run in-process)."
+            ) from exc
         self._procs = procs
         logger.info(
             "BatchTokenizer: %d shards x %d cores (setup %.1fs)",
diff --git a/tests/unit/async_utils/services/metrics_aggregator/test_token_metrics.py b/tests/unit/async_utils/services/metrics_aggregator/test_token_metrics.py
index 14805b011..2558c6e69 100644
--- a/tests/unit/async_utils/services/metrics_aggregator/test_token_metrics.py
+++ b/tests/unit/async_utils/services/metrics_aggregator/test_token_metrics.py
@@ -85,7 +85,7 @@ class TestBatchTokenizer:
     async def test_count_texts_async(self):
         with patch(_MOCK_TARGET, _FakeTokenizer):
             loop = asyncio.get_running_loop()
-            with BatchTokenizer("fake") as tok:
+            with BatchTokenizer("fake", n_workers=0) as tok:
                 counts = await tok.count_texts_async(["Hello world foo", "a"], loop)
                 assert counts == [3, 1]
 
@@ -93,7 +93,7 @@ async def test_count_texts_async(self):
     async def test_count_texts_async_empty(self):
         with patch(_MOCK_TARGET, _FakeTokenizer):
             loop = asyncio.get_running_loop()
-            with BatchTokenizer("fake") as tok:
+            with BatchTokenizer("fake", n_workers=0) as tok:
                 assert await tok.count_texts_async([], loop) == []
 
     @pytest.mark.asyncio
@@ -101,7 +101,7 @@ async def test_count_texts_async_sharded(self):
         """With shards present, chunks are reassembled in original order."""
         with patch(_MOCK_TARGET, _FakeTokenizer):
             loop = asyncio.get_running_loop()
-            with BatchTokenizer("fake") as tok:
+            with BatchTokenizer("fake", n_workers=0) as tok:
                 tok._procs = [_FakeProc(), _FakeProc()]
                 counts = await tok.count_texts_async(["a", "b b", "c c c", "d"], loop)
                 assert counts == [1, 2, 3, 1]
@@ -111,14 +111,14 @@ async def test_count_texts_async_shard_failure_propagates(self):
         """A dead shard surfaces as an error, not a silent in-process fallback."""
         with patch(_MOCK_TARGET, _FakeTokenizer):
             loop = asyncio.get_running_loop()
-            with BatchTokenizer("fake") as tok:
+            with BatchTokenizer("fake", n_workers=0) as tok:
                 tok._procs = [_BrokenProc()]
                 with pytest.raises(BrokenProcessPool):
                     await tok.count_texts_async(["a b"], loop)
 
     def test_close_is_idempotent(self):
         with patch(_MOCK_TARGET, _FakeTokenizer):
-            tok = BatchTokenizer("fake")
+            tok = BatchTokenizer("fake", n_workers=0)
             tok.close()
             tok.close()  # must not raise
 
@@ -126,7 +126,7 @@ def test_close_is_idempotent(self):
     async def test_use_after_close_raises(self):
         with patch(_MOCK_TARGET, _FakeTokenizer):
             loop = asyncio.get_running_loop()
-            tok = BatchTokenizer("fake")
+            tok = BatchTokenizer("fake", n_workers=0)
             tok.close()
             with pytest.raises(RuntimeError, match="closed"):
                 await tok.count_texts_async(["hello"], loop)
@@ -163,7 +163,7 @@ async def test_token_count_message_subtracts_baseline(self):
         """token_count_message_async returns full_tokens - baseline."""
         with patch(_MOCK_TARGET, _FakeTokenizerWithTemplate):
             loop = asyncio.get_running_loop()
-            with BatchTokenizer("fake") as tok:
+            with BatchTokenizer("fake", n_workers=0) as tok:
                 # "hello world" -> 2 content + 2 wrapper = 4; baseline = 0, prefix = 2
                 count = await tok.token_count_message_async(
                     "hello world", None, None, loop
@@ -175,7 +175,7 @@ async def test_token_count_message_includes_tool_calls(self):
         """Tool-call JSON tokens are included in the count."""
         with patch(_MOCK_TARGET, _FakeTokenizerWithTemplate):
             loop = asyncio.get_running_loop()
-            with BatchTokenizer("fake") as tok:
+            with BatchTokenizer("fake", n_workers=0) as tok:
                 tool_calls = (
                     {
                         "id": "c1",
@@ -199,7 +199,7 @@ def apply_chat_template(self, *args, **kwargs):
 
         with patch(_MOCK_TARGET, _BadTemplateTokenizer):
             loop = asyncio.get_running_loop()
-            with BatchTokenizer("fake") as tok:
+            with BatchTokenizer("fake", n_workers=0) as tok:
                 tool_calls = (
                     {
                         "id": "c1",
@@ -274,7 +274,11 @@ def shutdown(self, wait=False, cancel_futures=False):
 
 @pytest.mark.unit
 class TestSetupShardsDecisions:
-    """Pins the --tokenizer-workers contract: -1 auto / N clamped / 0 disabled."""
+    """Pins the --tokenizer-workers contract: -1 auto / N clamped / 0 explicit.
+
+    An environment that cannot shard is a startup error — never a silent
+    in-process fallback.
+    """
 
     def _make(self, monkeypatch, cpus, n_workers, executor=_SpawnlessExecutor):
         monkeypatch.setattr(token_metrics_module, "ProcessPoolExecutor", executor)
@@ -288,29 +292,32 @@ def _make(self, monkeypatch, cpus, n_workers, executor=_SpawnlessExecutor):
         "cpus, n_workers, expected_shards",
         [
             (16, -1, 2),  # auto: one shard per 8-core block
-            (10, -1, 0),  # auto needs >= 2 blocks (1 shard ~= in-process)
+            (10, -1, 1),  # auto: always at least one shard
+            (6, -1, 1),  # auto: even below one full block
             (48, 3, 3),  # explicit count under capacity
             (16, 10, 2),  # explicit count clamped to capacity
             (16, 1, 1),  # explicit single shard honored
-            (16, 0, 0),  # 0 disables sharding
+            (16, 0, 0),  # 0 = explicit in-process mode
         ],
     )
     def test_shard_count(self, monkeypatch, cpus, n_workers, expected_shards):
-        tok = self._make(monkeypatch, cpus, n_workers)
-        try:
+        with self._make(monkeypatch, cpus, n_workers) as tok:
             assert len(tok._procs) == expected_shards
-        finally:
-            tok.close()
 
     def test_blocks_are_disjoint_consecutive_core_sets(self, monkeypatch):
-        tok = self._make(monkeypatch, 16, -1)
-        try:
+        with self._make(monkeypatch, 16, -1) as tok:
             blocks = [set(ex.initargs[1]) for ex in tok._procs]
             assert blocks == [set(range(0, 8)), set(range(8, 16))]
-        finally:
-            tok.close()
 
-    def test_affinity_failure_falls_back_in_process(self, monkeypatch):
+    def test_no_fast_backend_is_a_startup_error(self, monkeypatch):
+        monkeypatch.setattr(
+            token_metrics_module, "ProcessPoolExecutor", _SpawnlessExecutor
+        )
+        with patch(_MOCK_TARGET, _FakeTokenizer):  # no backend_tokenizer
+            with pytest.raises(RuntimeError, match="fast"):
+                BatchTokenizer("fake")
+
+    def test_affinity_failure_is_a_startup_error(self, monkeypatch):
         monkeypatch.setattr(
             token_metrics_module, "ProcessPoolExecutor", _SpawnlessExecutor
         )
@@ -320,24 +327,18 @@ def _raise(pid):
 
         monkeypatch.setattr(token_metrics_module.os, "sched_getaffinity", _raise)
         with patch(_MOCK_TARGET, _FakeTokenizerWithBackend):
-            tok = BatchTokenizer("fake")
-        try:
-            assert tok._procs == []
-        finally:
-            tok.close()
+            with pytest.raises(RuntimeError, match="affinity"):
+                BatchTokenizer("fake")
 
-    def test_warmup_failure_falls_back_in_process(self, monkeypatch):
+    def test_warmup_failure_is_a_startup_error(self, monkeypatch):
         class _BrokenWarmup(_SpawnlessExecutor):
             def submit(self, fn, *args):
                 fut: Future = Future()
                 fut.set_exception(RuntimeError("spawn died"))
                 return fut
 
-        tok = self._make(monkeypatch, 16, -1, executor=_BrokenWarmup)
-        try:
-            assert tok._procs == []
-        finally:
-            tok.close()
+        with pytest.raises(RuntimeError, match="warmup"):
+            self._make(monkeypatch, 16, -1, executor=_BrokenWarmup)
 
 
 @pytest.mark.unit

From 0cca84a0de6daad267a4821f847db74e8ead85a5 Mon Sep 17 00:00:00 2001
From: Viraat Chandra <viraatc@nvidia.com>
Date: Wed, 10 Jun 2026 15:24:37 -0700
Subject: [PATCH 08/20] fix(metrics): shard unpinned on platforms without CPU
 affinity (macOS)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The affinity API's absence is a platform property, not a broken
environment: sharding works identically without pinning — the OS
scheduler spreads the workers and only cache/NUMA locality is lost.
_setup_shards now sizes blocks from the online CPU count when
sched_getaffinity is unavailable, and each worker that cannot pin caps
its rayon pool to its block size via RAYON_NUM_THREADS so unpinned
shards do not oversubscribe each other.

The strict startup errors remain for genuine environment problems: a
tokenizer without a fast (Rust) backend, and a failed or over-budget
shard warmup.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../services/metrics_aggregator/DESIGN.md     |  9 +++--
 .../services/metrics_aggregator/__main__.py   |  6 ++--
 .../metrics_aggregator/token_metrics.py       | 33 +++++++++++--------
 .../metrics_aggregator/test_token_metrics.py  |  8 +++--
 4 files changed, 33 insertions(+), 23 deletions(-)

diff --git a/docs/async_utils/services/metrics_aggregator/DESIGN.md b/docs/async_utils/services/metrics_aggregator/DESIGN.md
index 074cb569b..5b8cd2c50 100644
--- a/docs/async_utils/services/metrics_aggregator/DESIGN.md
+++ b/docs/async_utils/services/metrics_aggregator/DESIGN.md
@@ -103,9 +103,12 @@ each block of `CORES_PER_WORKER` (8) cores. Why this shape:
 shard per 8-core block of the process affinity mask (always at least one), an
 explicit count is clamped to that capacity, and `0` explicitly selects
 in-process tokenization. There is no implicit fallback: an environment that
-cannot shard — no fast Rust backend, no CPU affinity, a failed or over-budget
-warmup — is a startup error, because a silent in-process slow path cannot
-keep up with completions and would surface much later as an incomplete drain.
+cannot shard — no fast Rust backend, a failed or over-budget warmup — is a
+startup error, because a silent in-process slow path cannot keep up with
+completions and would surface much later as an incomplete drain. Platforms
+without a CPU-affinity API (e.g. macOS) still shard at full speed, just
+unpinned: blocks are sized from the online CPU count and each worker caps its
+rayon pool to the block size instead of pinning.
 
 Chat-template items (tool-call outputs) take a separate in-process thread:
 they are rare relative to the batched flush, and `apply_chat_template` is
diff --git a/src/inference_endpoint/async_utils/services/metrics_aggregator/__main__.py b/src/inference_endpoint/async_utils/services/metrics_aggregator/__main__.py
index 20a5b1dfb..7e2acea30 100644
--- a/src/inference_endpoint/async_utils/services/metrics_aggregator/__main__.py
+++ b/src/inference_endpoint/async_utils/services/metrics_aggregator/__main__.py
@@ -170,9 +170,9 @@ async def main() -> None:
         help=(
             "Number of tokenizer shard processes (-1 = auto: one per "
             "8-core block of this machine, minimum one; 0 = explicit "
-            "in-process tokenization). An environment that cannot shard "
-            "(no fast tokenizer backend, no CPU affinity) is a startup "
-            "error unless 0 is passed."
+            "in-process tokenization). A tokenizer without a fast (Rust) "
+            "backend is a startup error unless 0 is passed; platforms "
+            "without CPU affinity (e.g. macOS) shard unpinned."
         ),
     )
     parser.add_argument(
diff --git a/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py b/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py
index 8aa678f84..02b927d24 100644
--- a/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py
+++ b/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py
@@ -19,9 +19,10 @@
 processes each pinned to a block of ``CORES_PER_WORKER`` cores (a single BPE
 rayon pool is memory-bound and saturates ~8 cores). The aggregator buffers
 per-sample text and flushes the batch once per publish tick and at drain.
-Sharding requires the fast (Rust) tokenizers backend and Linux CPU affinity;
-an environment that cannot shard is a startup error, never a silent slow
-path — ``--tokenizer-workers 0`` is the only (explicit) in-process mode.
+Sharding requires the fast (Rust) tokenizers backend; an environment without
+one is a startup error, never a silent slow path — ``--tokenizer-workers 0``
+is the only (explicit) in-process mode. Platforms without CPU affinity (e.g.
+macOS) shard unpinned at full speed; only cache/NUMA locality is lost.
 """
 
 from __future__ import annotations
@@ -110,6 +111,9 @@ def _init_worker(tokenizer_name: str, core_set: list[int]) -> None:
         try:
             os.sched_setaffinity(0, set(core_set))
         except (OSError, AttributeError):
+            # No pinning (e.g. macOS): cap the rayon pool to the block size
+            # instead, so unpinned shards don't oversubscribe each other.
+            os.environ.setdefault("RAYON_NUM_THREADS", str(len(core_set)))
             logger.debug("could not pin tokenizer worker to %s", core_set)
     transformers_logging.set_verbosity_error()
     tok = AutoTokenizer.from_pretrained(tokenizer_name, trust_remote_code=True)
@@ -237,11 +241,12 @@ def _setup_shards(self, cores_per_worker: int, n_workers: int) -> None:
 
         ``n_workers == 0`` explicitly selects in-process tokenization. Auto
         (``< 0``) fits one shard per ``cores_per_worker`` block of this
-        process's affinity mask, always at least one; an explicit count is
-        clamped to that capacity. An environment that cannot shard — no fast
-        Rust backend, no CPU affinity, a warmup that fails or exceeds its
-        budget — raises instead of silently degrading to a slow path that
-        cannot keep up with completions.
+        process's affinity mask (or the online CPU count when the platform
+        has no affinity API — shards then run unpinned), always at least one;
+        an explicit count is clamped to that capacity. An environment that
+        cannot shard — no fast Rust backend, a warmup that fails or exceeds
+        its budget — raises instead of silently degrading to a slow path
+        that cannot keep up with completions.
         """
         if cores_per_worker <= 0 or n_workers == 0:
             logger.info("BatchTokenizer: in-process tokenization (explicit)")
@@ -255,12 +260,12 @@ def _setup_shards(self, cores_per_worker: int, n_workers: int) -> None:
             )
         try:
             available = sorted(os.sched_getaffinity(0))
-        except (OSError, AttributeError) as exc:
-            raise RuntimeError(
-                "CPU affinity is unavailable; tokenizer sharding requires "
-                "Linux. Pass --tokenizer-workers 0 to explicitly run "
-                "in-process tokenization."
-            ) from exc
+        except (OSError, AttributeError):
+            # No affinity API (e.g. macOS): shard unpinned — the OS scheduler
+            # spreads the workers; only cache/NUMA locality is lost. Workers
+            # cap their rayon pools to the block size instead (_init_worker).
+            available = list(range(os.cpu_count() or 1))
+            logger.info("BatchTokenizer: CPU affinity unavailable; sharding unpinned")
         capacity = max(1, len(available) // cores_per_worker)
         n = capacity if n_workers < 0 else min(n_workers, capacity)
         t0 = time.perf_counter()
diff --git a/tests/unit/async_utils/services/metrics_aggregator/test_token_metrics.py b/tests/unit/async_utils/services/metrics_aggregator/test_token_metrics.py
index 2558c6e69..1bee9ba7f 100644
--- a/tests/unit/async_utils/services/metrics_aggregator/test_token_metrics.py
+++ b/tests/unit/async_utils/services/metrics_aggregator/test_token_metrics.py
@@ -317,7 +317,8 @@ def test_no_fast_backend_is_a_startup_error(self, monkeypatch):
             with pytest.raises(RuntimeError, match="fast"):
                 BatchTokenizer("fake")
 
-    def test_affinity_failure_is_a_startup_error(self, monkeypatch):
+    def test_affinity_unavailable_shards_unpinned(self, monkeypatch):
+        """No affinity API (e.g. macOS): shard from the CPU count, unpinned."""
         monkeypatch.setattr(
             token_metrics_module, "ProcessPoolExecutor", _SpawnlessExecutor
         )
@@ -326,9 +327,10 @@ def _raise(pid):
             raise OSError("affinity unavailable")
 
         monkeypatch.setattr(token_metrics_module.os, "sched_getaffinity", _raise)
+        monkeypatch.setattr(token_metrics_module.os, "cpu_count", lambda: 16)
         with patch(_MOCK_TARGET, _FakeTokenizerWithBackend):
-            with pytest.raises(RuntimeError, match="affinity"):
-                BatchTokenizer("fake")
+            with BatchTokenizer("fake") as tok:
+                assert len(tok._procs) == 2
 
     def test_warmup_failure_is_a_startup_error(self, monkeypatch):
         class _BrokenWarmup(_SpawnlessExecutor):

From 443a923f223758f9dc5a86023ff7f1ad655237ef Mon Sep 17 00:00:00 2001
From: Viraat Chandra <viraatc@nvidia.com>
Date: Wed, 10 Jun 2026 15:50:41 -0700
Subject: [PATCH 09/20] refactor(metrics): queue-owned live flush lane; drop
 the pre_publish hook
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The publisher no longer knows about tokenization: TokenBatchQueue owns
its flush cadence via start_live(interval), removing the pre_publish
callback (and its failure-isolation machinery) added earlier in this
branch. Mid-run flushes go through a bounded live lane: the last
--live-tokenizers shards (default 1), i.e. the highest core blocks,
farthest from the loadgen's low cores. This keeps live ISL/OSL/TPOT
current without contending with the benchmark hot path.
--live-tokenizers 0 defers all tokenization to the end-of-run drain,
which always uses every shard.

Live-flush failures and cancellations re-queue the detached items so a
mid-run hiccup never loses samples (the drain retries them); drain
failures remain terminal and pending-counted. Default
metrics-drain-timeout rises 60s -> 300s since the live lane is sized
for currency, not for keeping up with peak completion rates.

For comparison, main tokenizes continuously during the run on 2 threads
inside the aggregator process — which inherits the loadgen's pinned
mask, i.e. directly on the loadgen's cores.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 AGENTS.md                                     |   2 +-
 .../services/metrics_aggregator/DESIGN.md     |  24 +--
 .../services/metrics_aggregator/__main__.py   |  21 ++-
 .../services/metrics_aggregator/aggregator.py |  17 ++-
 .../services/metrics_aggregator/publisher.py  |  30 +---
 .../services/metrics_aggregator/snapshot.py   |   2 +-
 .../metrics_aggregator/token_metrics.py       | 137 ++++++++++++++----
 src/inference_endpoint/config/schema.py       |   4 +-
 .../templates/concurrency_template_full.yaml  |   2 +-
 .../templates/offline_template_full.yaml      |   2 +-
 .../templates/online_template_full.yaml       |   2 +-
 .../services/metrics_aggregator/conftest.py   |   5 +
 .../metrics_aggregator/test_aggregator.py     |  18 +--
 .../metrics_aggregator/test_publisher.py      |  86 -----------
 .../metrics_aggregator/test_token_metrics.py  |  83 +++++++++++
 tests/unit/commands/test_benchmark.py         |   2 +-
 16 files changed, 262 insertions(+), 175 deletions(-)

diff --git a/AGENTS.md b/AGENTS.md
index 050d9e5b3..dbc8ce953 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -115,7 +115,7 @@ The aggregator is a separate process (`python -m inference_endpoint.async_utils.
 
 - **Series storage**: each `SeriesSampler` keeps three parallel views: O(1) cheap rollups (count/total/min/max/sum_sq, exact), an HDR Histogram (cheap live percentiles), and an in-memory `array.array` of raw values (for exact percentiles in the `COMPLETE` snapshot). Hot path is `registry.record(name, value)` — no allocation, no I/O.
 - **Counter API**: `registry.increment(name, delta=1)` for sample-event counters. `registry.set_counter(name, value)` only for the two duration counters (`total_duration_ns` max-of-elapsed, `tracked_duration_ns` sum-of-blocks).
-- **Lifecycle**: `INITIALIZE` (constructed, awaiting first `STARTED`) → `LIVE` (run in progress, ticking every `--publish-interval` seconds) → `DRAINING` (set on `ENDED`; tick continues; bounded by `--drain-timeout` budget, default 60 s) → terminal: `COMPLETE` (clean end via `publish_final`, exact stats) **or** `INTERRUPTED` (signal-handler-triggered final via SIGTERM/SIGINT; best-effort partial stats). Drain timeout detected by consumers as `state == COMPLETE and n_pending_tasks > 0`; interrupted runs are detected as `state == INTERRUPTED` directly.
+- **Lifecycle**: `INITIALIZE` (constructed, awaiting first `STARTED`) → `LIVE` (run in progress, ticking every `--publish-interval` seconds) → `DRAINING` (set on `ENDED`; tick continues; bounded by `--drain-timeout` budget, default 300 s) → terminal: `COMPLETE` (clean end via `publish_final`, exact stats) **or** `INTERRUPTED` (signal-handler-triggered final via SIGTERM/SIGINT; best-effort partial stats). Drain timeout detected by consumers as `state == COMPLETE and n_pending_tasks > 0`; interrupted runs are detected as `state == INTERRUPTED` directly.
 - **Final delivery is dual-path with separated concerns**: `publish_final` atomically writes `final_snapshot.json` (`tmp + fsync(file) + rename + fsync(parent_dir)`) — this is the **primary** Report source — AND emits the terminal-state snapshot over pub/sub as a TUI shutdown signal. Each path is wrapped in its own try/except so one failure cannot suppress the other. Main process consumer reads `final_snapshot.json` (via `json.loads` to dict, no Struct decode); falls back to the subscriber's `latest` live snapshot only if the file is missing (e.g. SIGKILL / OOM before the signal handler ran). The dict form is the canonical consumer contract (see `snapshot_to_dict`).
 - **Histogram bucket edges are dynamic per snapshot**: log-spaced over the observed `[min, max]`. Bucket count is fixed at construction; consumers MUST re-render from the snapshot's `(lo, hi, count)` triples each frame and MUST NOT track bucket-by-index across snapshots.
 
diff --git a/docs/async_utils/services/metrics_aggregator/DESIGN.md b/docs/async_utils/services/metrics_aggregator/DESIGN.md
index 5b8cd2c50..707ded06d 100644
--- a/docs/async_utils/services/metrics_aggregator/DESIGN.md
+++ b/docs/async_utils/services/metrics_aggregator/DESIGN.md
@@ -38,7 +38,7 @@ INITIALIZE ──STARTED──► LIVE ──ENDED──► DRAINING ──► C
 - **LIVE**: the publisher tick task emits a snapshot every
   `--publish-interval` seconds (default 0.25 s).
 - **DRAINING**: entered on `ENDED`; the buffered tokenizations are flushed,
-  bounded by the `--drain-timeout` budget (default 60 s; `0` = unlimited).
+  bounded by the `--drain-timeout` budget (default 300 s; `0` = unlimited).
 - The ENDED path runs inside a finalization boundary: whatever the drain does
   — finish, time out, or fail — `publish_final` and the shutdown signal always
   run. A tokenizer failure can degrade the snapshot (see the
@@ -63,12 +63,17 @@ Token triggers do no work at event time. `fire()` appends
 to a buffer, an O(1) operation with no event-loop tasks. The buffer is cleared
 in batches at exactly two points:
 
-1. **Every publish tick** — the publisher awaits a `pre_publish` hook before
-   composing each snapshot, so live ISL/OSL/TPOT reflect recently completed
-   samples. A failure here is swallowed by the tick (live publishing never
-   stops).
-2. **End-of-run** — `flush_remaining(timeout)` drains everything still
-   buffered, bounded by the drain budget.
+1. **The queue's own live loop** — `start_live(interval)` flushes
+   periodically (at the publish cadence) through the tokenizer's **bounded
+   live lane**: the last `--live-tokenizers` shards (default 1 — the highest
+   core block, farthest from the loadgen's low cores), so live ISL/OSL/TPOT
+   stay current without touching the benchmark hot path.
+   `--live-tokenizers 0` disables mid-run tokenization entirely. Failures are
+   logged once and never stop the loop.
+2. **End-of-run** — `flush_remaining(timeout)` stops the live loop and drains
+   everything still buffered through **every** shard, bounded by the drain
+   budget. The publisher knows nothing about tokenization — it only reads
+   `(state, n_pending_tasks)`.
 
 `flush()` serializes under an asyncio lock and detaches the buffer up front,
 so enqueues that race a flush land in the next one. Failure isolation is
@@ -146,7 +151,7 @@ as a clean run.
 ```
 COMPLETE event ─► TokenTrigger.fire ─► queue.enqueue(text, on_count)   [O(1)]
                                             │
-        publish tick (0.25 s) ──────────────┤  flush()
+        live loop (0.25 s, live lane) ─────┤  flush()
         ENDED drain (budgeted) ─────────────┘    │
                                                  ├─► chunks ─► N pinned worker procs
                                                  │             (encode_batch_fast)
@@ -161,9 +166,10 @@ COMPLETE event ─► TokenTrigger.fire ─► queue.enqueue(text, on_count)   [
 | `--metrics-socket`               | required | Snapshot PUB socket name                            |
 | `--metrics-output-dir`           | required | Directory for `final_snapshot.json`                 |
 | `--publish-interval`             | 0.25     | Live snapshot cadence (seconds)                     |
-| `--drain-timeout`                | 60.0     | End-of-run tokenize budget (`0` = unlimited)        |
+| `--drain-timeout`                | 300.0    | End-of-run tokenize budget (`0` = unlimited)        |
 | `--tokenizer`                    | none     | HF name or local path; unset disables token metrics |
 | `--tokenizer-workers`            | -1       | Shard processes (`-1` auto, `0` in-process)         |
+| `--live-tokenizers`              | 1        | Shards for mid-run live flushes (`0` = defer all)   |
 | `--streaming`                    | off      | Register TTFT/chunk-delta/TPOT triggers             |
 
 ## References
diff --git a/src/inference_endpoint/async_utils/services/metrics_aggregator/__main__.py b/src/inference_endpoint/async_utils/services/metrics_aggregator/__main__.py
index 7e2acea30..628811a36 100644
--- a/src/inference_endpoint/async_utils/services/metrics_aggregator/__main__.py
+++ b/src/inference_endpoint/async_utils/services/metrics_aggregator/__main__.py
@@ -137,11 +137,11 @@ async def main() -> None:
     parser.add_argument(
         "--drain-timeout",
         type=float,
-        default=60.0,
+        default=300.0,
         help=(
             "Wall-clock budget (seconds) to finish tokenizing buffered samples "
             "after ENDED before the aggregator emits the final snapshot with "
-            "n_pending_tasks > 0 (default: 60.0; 0 = wait indefinitely). Increase "
+            "n_pending_tasks > 0 (default: 300.0; 0 = wait indefinitely). Increase "
             "for very large datasets where the end-of-run tokenize batch is big."
         ),
     )
@@ -175,6 +175,16 @@ async def main() -> None:
             "without CPU affinity (e.g. macOS) shard unpinned."
         ),
     )
+    parser.add_argument(
+        "--live-tokenizers",
+        type=int,
+        default=1,
+        help=(
+            "Shards used for mid-run (live) token-metric flushes (default: 1 "
+            "— the highest core block, away from the loadgen's cores; 0 = no "
+            "mid-run tokenization, everything defers to the end-of-run drain)."
+        ),
+    )
     parser.add_argument(
         "--streaming",
         action="store_true",
@@ -228,7 +238,9 @@ async def main() -> None:
             pass  # non-Linux: no inherited pin to undo.
         try:
             tokenizer_cm = BatchTokenizer(
-                args.tokenizer, n_workers=args.tokenizer_workers
+                args.tokenizer,
+                n_workers=args.tokenizer_workers,
+                live_workers=args.live_tokenizers,
             )
         except RuntimeError as exc:
             # Fail-fast contract: a tokenizer environment that cannot shard
@@ -262,6 +274,9 @@ async def main() -> None:
                 sig_figs=args.hdr_sig_figs,
                 n_histogram_buckets=args.n_histogram_buckets,
                 tokenizer=tokenizer,
+                live_flush_interval_s=(
+                    args.publish_interval if args.live_tokenizers > 0 else None
+                ),
                 streaming=args.streaming,
                 shutdown_event=shutdown_event,
                 drain_timeout_s=None if args.drain_timeout == 0 else args.drain_timeout,
diff --git a/src/inference_endpoint/async_utils/services/metrics_aggregator/aggregator.py b/src/inference_endpoint/async_utils/services/metrics_aggregator/aggregator.py
index e87448036..cb2db5878 100644
--- a/src/inference_endpoint/async_utils/services/metrics_aggregator/aggregator.py
+++ b/src/inference_endpoint/async_utils/services/metrics_aggregator/aggregator.py
@@ -118,6 +118,7 @@ def __init__(
         sig_figs: int,
         n_histogram_buckets: int,
         tokenizer: BatchTokenizer | None = None,
+        live_flush_interval_s: float | None = None,
         streaming: bool = False,
         shutdown_event: asyncio.Event | None = None,
         drain_timeout_s: float | None = _DEFAULT_DRAIN_TIMEOUT_S,
@@ -139,6 +140,9 @@ def __init__(
         self._token_queue: TokenBatchQueue | None = (
             TokenBatchQueue(tokenizer, self.loop) if tokenizer is not None else None
         )
+        # Cadence of the queue's live flush loop (None = no mid-run
+        # tokenization; everything defers to the end-of-run drain).
+        self._live_flush_interval_s = live_flush_interval_s
         self._streaming = streaming
         self._shutdown_event = shutdown_event
         self._shutdown_received = False
@@ -246,11 +250,6 @@ def pending_tokens(self) -> int:
         """Enqueued tokenizations not yet recorded (the snapshot n_pending_tasks)."""
         return self._token_queue.pending if self._token_queue is not None else 0
 
-    async def _flush_tokens(self) -> None:
-        """Flush buffered tokenizations so the next snapshot reflects them."""
-        if self._token_queue is not None:
-            await self._token_queue.flush()
-
     # ------------------------------------------------------------------
     # Event processing
     # ------------------------------------------------------------------
@@ -325,8 +324,14 @@ async def process(self, records: list[EventRecord]) -> None:
                                     self._session_state,
                                     self.pending_tokens,
                                 ),
-                                pre_publish=self._flush_tokens,
                             )
+                            if (
+                                self._token_queue is not None
+                                and self._live_flush_interval_s is not None
+                            ):
+                                self._token_queue.start_live(
+                                    self._live_flush_interval_s
+                                )
                     table.handle_session_event(record)
                     if ev == SessionEventType.STOP_PERFORMANCE_TRACKING:
                         registry.set_counter(
diff --git a/src/inference_endpoint/async_utils/services/metrics_aggregator/publisher.py b/src/inference_endpoint/async_utils/services/metrics_aggregator/publisher.py
index fedc0fbe1..c90ca11fc 100644
--- a/src/inference_endpoint/async_utils/services/metrics_aggregator/publisher.py
+++ b/src/inference_endpoint/async_utils/services/metrics_aggregator/publisher.py
@@ -21,7 +21,7 @@
 import json
 import logging
 import os
-from collections.abc import Awaitable, Callable
+from collections.abc import Callable
 from pathlib import Path
 
 from inference_endpoint.async_utils.services.metrics_aggregator.registry import (
@@ -102,7 +102,6 @@ def start(
         registry: MetricsRegistry,
         publish_interval_s: float,
         get_runtime_state: Callable[[], tuple[SessionState, int]],
-        pre_publish: Callable[[], Awaitable[None]] | None = None,
     ) -> None:
         """Begin publishing live ticks every ``publish_interval_s`` seconds.
 
@@ -113,14 +112,6 @@ def start(
         snapshot. ``COMPLETE`` is emitted only by ``publish_final``, never by
         the tick task.
 
-        ``pre_publish``, if given, is awaited at the top of each tick before
-        the snapshot is built — the aggregator uses it to flush buffered
-        tokenizations so live ISL/OSL/TPOT reflect recently completed samples.
-        Its failures are swallowed in their own handler so the snapshot is
-        still built and published — even a tokenizer that fails on every tick
-        cannot stop live publishing; the unflushed items remain visible as
-        ``n_pending_tasks``.
-
         Idempotent on the tick-task slot: a second call (e.g. from a
         spurious duplicate ``STARTED`` event or a buggy replay producer)
         is a no-op rather than orphaning the original task. The original
@@ -139,28 +130,9 @@ def start(
             )
 
         async def _tick() -> None:
-            flush_failure_logged = False
             while True:
                 try:
                     await asyncio.sleep(publish_interval_s)
-                    if pre_publish is not None:
-                        # Isolated from the publish path: a persistently
-                        # broken tokenizer would otherwise abort every tick
-                        # here and stop ALL live snapshots, not just token
-                        # series. Unflushed items stay visible to consumers
-                        # via n_pending_tasks.
-                        try:
-                            await pre_publish()
-                        except Exception:  # noqa: BLE001 — publish anyway.
-                            if not flush_failure_logged:
-                                flush_failure_logged = True
-                                logger.exception(
-                                    "pre_publish flush failed; live snapshots "
-                                    "continue without fresh token metrics "
-                                    "(further failures logged at debug)"
-                                )
-                            else:
-                                logger.debug("pre_publish flush failed again")
                     state, n_pending = get_runtime_state()
                     snap = registry.build_snapshot(
                         state=state, n_pending_tasks=n_pending
diff --git a/src/inference_endpoint/async_utils/services/metrics_aggregator/snapshot.py b/src/inference_endpoint/async_utils/services/metrics_aggregator/snapshot.py
index eacac94f5..e233f36a3 100644
--- a/src/inference_endpoint/async_utils/services/metrics_aggregator/snapshot.py
+++ b/src/inference_endpoint/async_utils/services/metrics_aggregator/snapshot.py
@@ -45,7 +45,7 @@ class SessionState(str, Enum):
     LIVE        → run in progress; tick task publishing live HDR-derived stats.
     DRAINING    → ``SessionEventType.ENDED`` has been received; the aggregator
                   is tokenizing the buffered samples (bounded by the
-                  ``--drain-timeout`` budget, default 60 s). Tick task
+                  ``--drain-timeout`` budget, default 300 s). Tick task
                   continues at this stage, still HDR-derived; no new events
                   will arrive.
     COMPLETE    → terminal clean state. The ``publish_final()`` snapshot
diff --git a/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py b/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py
index 02b927d24..7723d84d4 100644
--- a/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py
+++ b/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py
@@ -18,7 +18,8 @@
 ``BatchTokenizer`` tokenizes whole batches at once, sharded across worker
 processes each pinned to a block of ``CORES_PER_WORKER`` cores (a single BPE
 rayon pool is memory-bound and saturates ~8 cores). The aggregator buffers
-per-sample text and flushes the batch once per publish tick and at drain.
+per-sample text; a queue-owned live loop flushes through a bounded live lane
+(default one shard) mid-run, and the end-of-run drain uses every shard.
 Sharding requires the fast (Rust) tokenizers backend; an environment without
 one is a startup error, never a silent slow path — ``--tokenizer-workers 0``
 is the only (explicit) in-process mode. Platforms without CPU affinity (e.g.
@@ -28,6 +29,7 @@
 from __future__ import annotations
 
 import asyncio
+import contextlib
 import json
 import logging
 import multiprocessing
@@ -185,8 +187,10 @@ def __init__(
         *,
         cores_per_worker: int = CORES_PER_WORKER,
         n_workers: int = -1,
+        live_workers: int = 1,
     ) -> None:
         self._tokenizer_name = tokenizer_name
+        self._live_workers = live_workers
         self._fallback_warned: set[str] = set()
         self._tokenizer: PreTrainedTokenizerBase | None = None
         self._prefix_len = 0
@@ -324,19 +328,40 @@ async def count_texts_async(
         if not texts:
             return []
         if self._procs:
-            chunks = _even_chunks(texts, len(self._procs))
-            futures = [
-                asyncio.wrap_future(ex.submit(_worker_encode_lengths, chunk))
-                for ex, chunk in zip(self._procs, chunks, strict=False)
-            ]
-            results = await asyncio.gather(*futures)
-            return [n for r in results for n in r]
+            return await self._fan_out(self._procs, texts)
         if self._thread is None:
             raise RuntimeError("BatchTokenizer is closed")
         return await loop.run_in_executor(
             self._thread, self._encode_lengths_inproc, texts
         )
 
+    async def count_texts_live_async(
+        self, texts: list[str], loop: asyncio.AbstractEventLoop
+    ) -> list[int]:
+        """Like ``count_texts_async``, bounded to the live lane.
+
+        Mid-run flushes use only the last ``live_workers`` shards — the
+        highest core blocks, farthest from the loadgen's low cores — so live
+        token metrics never contend with the benchmark hot path. The
+        end-of-run drain uses every shard.
+        """
+        if not texts:
+            return []
+        live_n = max(1, self._live_workers)
+        if self._procs and live_n < len(self._procs):
+            return await self._fan_out(self._procs[-live_n:], texts)
+        return await self.count_texts_async(texts, loop)
+
+    @staticmethod
+    async def _fan_out(procs: list[ProcessPoolExecutor], texts: list[str]) -> list[int]:
+        chunks = _even_chunks(texts, len(procs))
+        futures = [
+            asyncio.wrap_future(ex.submit(_worker_encode_lengths, chunk))
+            for ex, chunk in zip(procs, chunks, strict=False)
+        ]
+        results = await asyncio.gather(*futures)
+        return [n for r in results for n in r]
+
     # -- sync + chat-template paths (in-process thread) ---------------------
 
     def _token_count_text(self, text: str) -> int:
@@ -428,7 +453,13 @@ class TokenCounter(Protocol):
     async def count_texts_async(
         self, texts: list[str], loop: asyncio.AbstractEventLoop, /
     ) -> list[int]:
-        """Per-text token counts for a whole batch."""
+        """Per-text token counts for a whole batch (full pool)."""
+        raise NotImplementedError
+
+    async def count_texts_live_async(
+        self, texts: list[str], loop: asyncio.AbstractEventLoop, /
+    ) -> list[int]:
+        """Per-text token counts via the bounded live lane."""
         raise NotImplementedError
 
     async def token_count_message_async(
@@ -447,10 +478,11 @@ class TokenBatchQueue:
     """Buffers per-sample tokenization work and clears it in batches.
 
     Triggers call ``enqueue_text`` / ``enqueue_message`` at event time with an
-    ``on_count`` callback that records the resulting metric. The aggregator
-    flushes the buffer with ``flush`` once per publish tick (so live ISL/OSL/
-    TPOT stay current) and with ``flush_remaining`` at end-of-run, sending the
-    whole batch through ``BatchTokenizer`` in one sharded call.
+    ``on_count`` callback that records the resulting metric. The queue owns
+    its own flush cadence: ``start_live`` begins a periodic flush through the
+    tokenizer's bounded live lane (so live ISL/OSL/TPOT stay current without
+    touching the benchmark's cores), and ``flush_remaining`` drains everything
+    left at end-of-run through every shard.
 
     ``pending`` counts enqueued-but-not-yet-recorded items; it is the
     ``n_pending_tasks`` on the snapshot. A non-zero value in the final snapshot
@@ -465,10 +497,38 @@ def __init__(
         self._text: list[tuple[str, Callable[[int], None]]] = []
         self._msg: list[tuple[MessageParts, Callable[[int], None]]] = []
         self._inflight = 0
-        # Serializes flushes so a periodic tick flush and the end-of-run flush
-        # never record the same item twice or race on the pending count.
+        self._live_task: asyncio.Task | None = None
+        # Serializes flushes so the periodic live flush and the end-of-run
+        # flush never record the same item twice or race on the pending count.
         self._lock = asyncio.Lock()
 
+    def start_live(self, interval_s: float) -> None:
+        """Begin the periodic live flush (idempotent).
+
+        Failures are logged once and never interrupt the loop — unflushed
+        items stay visible as ``pending`` and the end-of-run drain picks
+        them up.
+        """
+        if self._live_task is not None:
+            return
+        self._live_task = self._loop.create_task(self._live_flush_loop(interval_s))
+
+    async def _live_flush_loop(self, interval_s: float) -> None:
+        failure_logged = False
+        while True:
+            await asyncio.sleep(interval_s)
+            try:
+                await self.flush(live=True)
+            except Exception:  # noqa: BLE001 — keep live metrics flowing.
+                if not failure_logged:
+                    failure_logged = True
+                    logger.exception(
+                        "live token flush failed; retrying each interval "
+                        "(further failures logged at debug)"
+                    )
+                else:
+                    logger.debug("live token flush failed again")
+
     @property
     def pending(self) -> int:
         """Enqueued items not yet tokenized-and-recorded."""
@@ -484,15 +544,23 @@ def enqueue_message(
         self._inflight += 1
         self._msg.append((parts, on_count))
 
-    async def flush(self) -> None:
+    async def flush(self, live: bool = False) -> None:
         """Tokenize everything buffered so far and run each ``on_count``.
 
-        Items are detached from the buffer up front so concurrent enqueues land
-        in the next flush. ``_inflight`` is decremented only after a callback
-        runs, so a cancellation (drain timeout) or a tokenizer error leaves it
-        reflecting exactly the items that were not recorded — those surface as
-        ``pending`` (an incomplete drain), not as silently dropped samples.
+        ``live=True`` routes text batches through the tokenizer's bounded
+        live lane instead of the full shard pool, and re-queues items on
+        failure or cancellation so a mid-run hiccup never loses samples — the
+        end-of-run drain retries them. Drain-mode failures are terminal: the
+        un-recorded items stay counted in ``pending`` (``_inflight`` is
+        decremented only after a callback runs) and surface as an incomplete
+        drain, not as silently dropped samples. Items are detached from the
+        buffer up front so concurrent enqueues land in the next flush.
         """
+        count_texts = (
+            self._tokenizer.count_texts_live_async
+            if live
+            else self._tokenizer.count_texts_async
+        )
         async with self._lock:
             if not (self._text or self._msg):
                 return
@@ -505,21 +573,34 @@ async def flush(self) -> None:
             failure: Exception | None = None
             if text_items:
                 try:
-                    counts = await self._tokenizer.count_texts_async(
-                        [t for t, _ in text_items], self._loop
-                    )
+                    counts = await count_texts([t for t, _ in text_items], self._loop)
+                except asyncio.CancelledError:
+                    if live:
+                        self._text[:0] = text_items
+                    raise
                 except Exception as exc:  # noqa: BLE001 — isolate phases.
                     failure = exc
+                    if live:
+                        # A live hiccup must not lose samples: give the items
+                        # back so the end-of-run drain (full pool) retries.
+                        # Drain failures are terminal and stay pending-only.
+                        self._text[:0] = text_items
                 else:
                     for (_, on_count), count in zip(text_items, counts, strict=True):
                         self._record(on_count, count)
-            for (content, reasoning, tool_calls), on_count in msg_items:
+            for i, ((content, reasoning, tool_calls), on_count) in enumerate(msg_items):
                 try:
                     count = await self._tokenizer.token_count_message_async(
                         content, reasoning, tool_calls, self._loop
                     )
+                except asyncio.CancelledError:
+                    if live:
+                        self._msg[:0] = msg_items[i:]
+                    raise
                 except Exception as exc:  # noqa: BLE001 — isolate items.
                     failure = failure or exc
+                    if live:
+                        self._msg.append(((content, reasoning, tool_calls), on_count))
                     continue
                 self._record(on_count, count)
             if failure is not None:
@@ -538,11 +619,17 @@ def _record(self, on_count: Callable[[int], None], count: int) -> None:
     async def flush_remaining(self, timeout: float | None) -> int:
         """End-of-run flush, bounded by ``timeout`` seconds.
 
+        Stops the live flush loop, then drains through the full shard pool.
         Returns the number of items still un-tokenized — non-zero if the budget
         was exhausted (``timeout`` reached) or tokenization failed. ``None``
         waits indefinitely. Never raises: a failure here must not stop the
         aggregator from publishing the (incomplete) final snapshot.
         """
+        if self._live_task is not None:
+            self._live_task.cancel()
+            with contextlib.suppress(asyncio.CancelledError):
+                await self._live_task
+            self._live_task = None
         if self._inflight == 0:
             return 0
         try:
diff --git a/src/inference_endpoint/config/schema.py b/src/inference_endpoint/config/schema.py
index 0a59074f5..6a8b9b872 100644
--- a/src/inference_endpoint/config/schema.py
+++ b/src/inference_endpoint/config/schema.py
@@ -584,11 +584,11 @@ class DrainConfig(BaseModel):
             ),
         ),
     ] = Field(
-        60.0,
+        300.0,
         ge=0,
         description=(
             "Wall-clock budget (seconds) to finish tokenizing buffered samples "
-            "after ENDED (default: 60.0; 0 = unlimited)."
+            "after ENDED (default: 300.0; 0 = unlimited)."
         ),
     )
 
diff --git a/src/inference_endpoint/config/templates/concurrency_template_full.yaml b/src/inference_endpoint/config/templates/concurrency_template_full.yaml
index 75feab6fb..5132f5b0e 100644
--- a/src/inference_endpoint/config/templates/concurrency_template_full.yaml
+++ b/src/inference_endpoint/config/templates/concurrency_template_full.yaml
@@ -79,7 +79,7 @@ settings:
     warmup_timeout_s: 240.0  # Warmup drain timeout in seconds (None = wait indefinitely)
     performance_timeout_s: 240.0  # Performance drain timeout in seconds (None = wait indefinitely)
     accuracy_timeout_s: null  # Accuracy drain timeout in seconds (None = wait indefinitely)
-    metrics_drain_timeout_s: 60.0  # Wall-clock budget (seconds) to finish tokenizing buffered samples after ENDED (default: 60.0; 0 = unlimited).
+    metrics_drain_timeout_s: 300.0  # Wall-clock budget (seconds) to finish tokenizing buffered samples after ENDED (default: 300.0; 0 = unlimited).
   warmup:
     enabled: false  # Enable warmup phase before performance run
     n_requests: null  # Warmup request count (None = full dataset once)
diff --git a/src/inference_endpoint/config/templates/offline_template_full.yaml b/src/inference_endpoint/config/templates/offline_template_full.yaml
index 3ff1ccd17..e3ec95284 100644
--- a/src/inference_endpoint/config/templates/offline_template_full.yaml
+++ b/src/inference_endpoint/config/templates/offline_template_full.yaml
@@ -79,7 +79,7 @@ settings:
     warmup_timeout_s: 240.0  # Warmup drain timeout in seconds (None = wait indefinitely)
     performance_timeout_s: 240.0  # Performance drain timeout in seconds (None = wait indefinitely)
     accuracy_timeout_s: null  # Accuracy drain timeout in seconds (None = wait indefinitely)
-    metrics_drain_timeout_s: 60.0  # Wall-clock budget (seconds) to finish tokenizing buffered samples after ENDED (default: 60.0; 0 = unlimited).
+    metrics_drain_timeout_s: 300.0  # Wall-clock budget (seconds) to finish tokenizing buffered samples after ENDED (default: 300.0; 0 = unlimited).
   warmup:
     enabled: false  # Enable warmup phase before performance run
     n_requests: null  # Warmup request count (None = full dataset once)
diff --git a/src/inference_endpoint/config/templates/online_template_full.yaml b/src/inference_endpoint/config/templates/online_template_full.yaml
index 1287b99af..73c0b69d4 100644
--- a/src/inference_endpoint/config/templates/online_template_full.yaml
+++ b/src/inference_endpoint/config/templates/online_template_full.yaml
@@ -79,7 +79,7 @@ settings:
     warmup_timeout_s: 240.0  # Warmup drain timeout in seconds (None = wait indefinitely)
     performance_timeout_s: 240.0  # Performance drain timeout in seconds (None = wait indefinitely)
     accuracy_timeout_s: null  # Accuracy drain timeout in seconds (None = wait indefinitely)
-    metrics_drain_timeout_s: 60.0  # Wall-clock budget (seconds) to finish tokenizing buffered samples after ENDED (default: 60.0; 0 = unlimited).
+    metrics_drain_timeout_s: 300.0  # Wall-clock budget (seconds) to finish tokenizing buffered samples after ENDED (default: 300.0; 0 = unlimited).
   warmup:
     enabled: false  # Enable warmup phase before performance run
     n_requests: null  # Warmup request count (None = full dataset once)
diff --git a/tests/unit/async_utils/services/metrics_aggregator/conftest.py b/tests/unit/async_utils/services/metrics_aggregator/conftest.py
index 51d25565a..05b68d3ee 100644
--- a/tests/unit/async_utils/services/metrics_aggregator/conftest.py
+++ b/tests/unit/async_utils/services/metrics_aggregator/conftest.py
@@ -67,6 +67,11 @@ async def count_texts_async(
             await asyncio.sleep(self._delay)
         return [len(t.split()) for t in texts]
 
+    async def count_texts_live_async(
+        self, texts: list[str], loop: asyncio.AbstractEventLoop
+    ) -> list[int]:
+        return await self.count_texts_async(texts, loop)
+
     async def token_count_message_async(
         self,
         content: str,
diff --git a/tests/unit/async_utils/services/metrics_aggregator/test_aggregator.py b/tests/unit/async_utils/services/metrics_aggregator/test_aggregator.py
index 3337b168b..bae2a0aa5 100644
--- a/tests/unit/async_utils/services/metrics_aggregator/test_aggregator.py
+++ b/tests/unit/async_utils/services/metrics_aggregator/test_aggregator.py
@@ -332,7 +332,7 @@ async def test_non_streaming_latency_only(self, tmp_path):
                         ),
                     ]
                 )
-                await agg._flush_tokens()
+                await agg._token_queue.flush()
                 # sample_latency = 3000-1000 = 2000
                 assert (
                     snapshot_series_total(
@@ -796,7 +796,7 @@ async def test_isl_text_path_async(self, tmp_path):
                     ]
                 )
                 # ISL task is in-flight; drain it
-                await agg._flush_tokens()
+                await agg._token_queue.flush()
                 assert snapshot_series_total(registry, MetricSeriesKey.ISL.value) == 4
             finally:
                 agg.close()
@@ -825,7 +825,7 @@ async def test_osl_emitted_on_complete(self, tmp_path):
                         ),
                     ]
                 )
-                await agg._flush_tokens()
+                await agg._token_queue.flush()
                 # sample_latency_ns = 5000-1000 = 4000
                 assert (
                     snapshot_series_total(
@@ -864,7 +864,7 @@ async def test_tpot_emitted_for_streaming(self, tmp_path):
                         ),
                     ]
                 )
-                await agg._flush_tokens()
+                await agg._token_queue.flush()
                 # OSL = "hello world foo" = 3 tokens
                 assert snapshot_series_total(registry, MetricSeriesKey.OSL.value) == 3
                 # tpot = (5000 - 2000) / token_count("world foo") = 3000 / 2 = 1500
@@ -900,7 +900,7 @@ async def test_tpot_skipped_when_single_chunk(self, tmp_path):
                         ),
                     ]
                 )
-                await agg._flush_tokens()
+                await agg._token_queue.flush()
                 assert snapshot_series_total(registry, MetricSeriesKey.OSL.value) == 1
                 assert (
                     snapshot_series_count(registry, MetricSeriesKey.TPOT_NS.value) == 0
@@ -939,7 +939,7 @@ async def test_tpot_not_emitted_without_streaming_flag(self, tmp_path):
                         ),
                     ]
                 )
-                await agg._flush_tokens()
+                await agg._token_queue.flush()
                 # sample_latency / OSL still emitted in non-streaming mode.
                 assert (
                     snapshot_series_total(
@@ -981,7 +981,7 @@ async def test_tpot_non_streaming_output_skipped(self, tmp_path):
                         ),
                     ]
                 )
-                await agg._flush_tokens()
+                await agg._token_queue.flush()
                 assert snapshot_series_total(registry, MetricSeriesKey.OSL.value) == 3
                 assert (
                     snapshot_series_count(registry, MetricSeriesKey.TPOT_NS.value) == 0
@@ -1016,7 +1016,7 @@ async def test_flush_records_buffered_tokenizations(self, tmp_path):
                 # Enqueued by fire(), not yet tokenized (no tick/drain flush).
                 assert agg._token_queue.pending > 0
 
-                await agg._flush_tokens()
+                await agg._token_queue.flush()
                 assert agg._token_queue.pending == 0
                 assert snapshot_series_total(registry, MetricSeriesKey.ISL.value) == 5
             finally:
@@ -1185,7 +1185,7 @@ async def test_tpot_osl_for_tool_call_complete(self, tmp_path):
                         ),
                     ]
                 )
-                await agg._flush_tokens()
+                await agg._token_queue.flush()
                 # OSL = token_count("ok" + tool_calls_json) = 2
                 assert snapshot_series_total(registry, MetricSeriesKey.OSL.value) == 2
                 # tpot = (5000 - 2000) / token_count(tool_calls_json) = 3000 / 1 = 3000
diff --git a/tests/unit/async_utils/services/metrics_aggregator/test_publisher.py b/tests/unit/async_utils/services/metrics_aggregator/test_publisher.py
index 1d540ddec..9e26f734a 100644
--- a/tests/unit/async_utils/services/metrics_aggregator/test_publisher.py
+++ b/tests/unit/async_utils/services/metrics_aggregator/test_publisher.py
@@ -81,92 +81,6 @@ def get_runtime_state() -> tuple[SessionState, int]:
         finally:
             publisher.close()
 
-    @pytest.mark.asyncio
-    async def test_pre_publish_runs_before_each_tick_snapshot(
-        self, tmp_path: Path, zmq_ctx_scope: ManagedZMQContext
-    ):
-        """pre_publish is awaited before the runtime state is captured."""
-        loop = asyncio.get_event_loop()
-        publisher = MetricsPublisher(
-            MetricsSnapshotCodec(),
-            zmq_ctx_scope,
-            "test_pub_pre",
-            loop,
-            final_snapshot_path=tmp_path / "final_snapshot.json",
-        )
-        try:
-            registry = MetricsRegistry()
-            registry.register_counter("c")
-            order: list[str] = []
-
-            async def pre_publish() -> None:
-                order.append("flush")
-
-            def get_runtime_state() -> tuple[SessionState, int]:
-                order.append("state")
-                return SessionState.LIVE, 0
-
-            publisher.start(
-                registry,
-                publish_interval_s=0.01,
-                get_runtime_state=get_runtime_state,
-                pre_publish=pre_publish,
-            )
-            await asyncio.sleep(0.05)
-            assert order, "no tick ran"
-            # Every state capture is preceded by a flush in the same tick.
-            assert order[0] == "flush"
-            for i, entry in enumerate(order):
-                if entry == "state":
-                    assert order[i - 1] == "flush"
-        finally:
-            publisher.close()
-
-    @pytest.mark.asyncio
-    async def test_pre_publish_failure_keeps_ticking(
-        self, tmp_path: Path, zmq_ctx_scope: ManagedZMQContext
-    ):
-        """A raising pre_publish is swallowed by the tick; ticks continue."""
-        loop = asyncio.get_event_loop()
-        publisher = MetricsPublisher(
-            MetricsSnapshotCodec(),
-            zmq_ctx_scope,
-            "test_pub_pre_fail",
-            loop,
-            final_snapshot_path=tmp_path / "final_snapshot.json",
-        )
-        try:
-            registry = MetricsRegistry()
-            registry.register_counter("c")
-            attempts = 0
-            published_states = 0
-
-            async def pre_publish() -> None:
-                nonlocal attempts
-                attempts += 1
-                raise RuntimeError("tokenizer hiccup")
-
-            def get_runtime_state() -> tuple[SessionState, int]:
-                nonlocal published_states
-                published_states += 1
-                return SessionState.LIVE, 0
-
-            publisher.start(
-                registry,
-                publish_interval_s=0.01,
-                get_runtime_state=get_runtime_state,
-                pre_publish=pre_publish,
-            )
-            await asyncio.sleep(0.08)
-            assert attempts >= 2, "tick task died after a pre_publish failure"
-            # The failure must not suppress the snapshot: every failing tick
-            # still proceeds to capture state and publish.
-            assert published_states >= 2, "failing pre_publish suppressed publishing"
-            assert publisher._tick_task is not None
-            assert not publisher._tick_task.done()
-        finally:
-            publisher.close()
-
     @pytest.mark.asyncio
     async def test_publish_final_writes_json_atomically(
         self, tmp_path: Path, zmq_ctx_scope: ManagedZMQContext
diff --git a/tests/unit/async_utils/services/metrics_aggregator/test_token_metrics.py b/tests/unit/async_utils/services/metrics_aggregator/test_token_metrics.py
index 1bee9ba7f..2588e16cd 100644
--- a/tests/unit/async_utils/services/metrics_aggregator/test_token_metrics.py
+++ b/tests/unit/async_utils/services/metrics_aggregator/test_token_metrics.py
@@ -343,6 +343,86 @@ def submit(self, fn, *args):
             self._make(monkeypatch, 16, -1, executor=_BrokenWarmup)
 
 
+class _RecordingProc(_FakeProc):
+    """_FakeProc that records the chunks submitted to it."""
+
+    def __init__(self):
+        self.chunks = []
+
+    def submit(self, _fn, chunk):
+        self.chunks.append(list(chunk))
+        return super().submit(_fn, chunk)
+
+
+@pytest.mark.unit
+class TestLiveLane:
+    @pytest.mark.asyncio
+    async def test_live_uses_only_the_last_shards(self):
+        """Mid-run flushes stay off the low core blocks (loadgen side)."""
+        with patch(_MOCK_TARGET, _FakeTokenizer):
+            loop = asyncio.get_running_loop()
+            with BatchTokenizer("fake", n_workers=0, live_workers=1) as tok:
+                procs = [_RecordingProc(), _RecordingProc(), _RecordingProc()]
+                tok._procs = procs
+                counts = await tok.count_texts_live_async(["a b", "c"], loop)
+                assert counts == [2, 1]
+                assert procs[0].chunks == [] and procs[1].chunks == []
+                assert procs[2].chunks == [["a b", "c"]]
+
+    @pytest.mark.asyncio
+    async def test_drain_uses_every_shard(self):
+        with patch(_MOCK_TARGET, _FakeTokenizer):
+            loop = asyncio.get_running_loop()
+            with BatchTokenizer("fake", n_workers=0, live_workers=1) as tok:
+                procs = [_RecordingProc(), _RecordingProc()]
+                tok._procs = procs
+                await tok.count_texts_async(["a", "b", "c", "d"], loop)
+                assert all(p.chunks for p in procs)
+
+
+@pytest.mark.unit
+@pytest.mark.asyncio
+class TestQueueLiveLoop:
+    async def test_start_live_flushes_periodically(self):
+        loop = asyncio.get_running_loop()
+        queue = TokenBatchQueue(_CapturingTokenizer(), loop)
+        recorded: list[int] = []
+        queue.enqueue_text("a b c", recorded.append)
+        queue.start_live(0.01)
+        queue.start_live(0.01)  # idempotent
+        await asyncio.sleep(0.05)
+        assert recorded == [3]
+        assert queue.pending == 0
+        await queue.flush_remaining(timeout=1.0)
+
+    async def test_live_loop_survives_tokenizer_failure(self):
+        class _FailingLive(_CapturingTokenizer):
+            async def count_texts_live_async(self, texts, _loop):
+                raise RuntimeError("live lane boom")
+
+        loop = asyncio.get_running_loop()
+        queue = TokenBatchQueue(_FailingLive(), loop)
+        recorded: list[int] = []
+        queue.enqueue_text("a b", recorded.append)
+        queue.start_live(0.01)
+        await asyncio.sleep(0.05)
+        assert recorded == []
+        assert queue.pending == 1, "failed live flush must keep items pending"
+        assert queue._live_task is not None and not queue._live_task.done()
+        # The end-of-run drain (full pool) still recovers the items.
+        assert await queue.flush_remaining(timeout=1.0) == 0
+        assert recorded == [2]
+
+    async def test_flush_remaining_stops_live_loop(self):
+        loop = asyncio.get_running_loop()
+        queue = TokenBatchQueue(_CapturingTokenizer(), loop)
+        queue.start_live(0.01)
+        task = queue._live_task
+        await queue.flush_remaining(timeout=1.0)
+        assert queue._live_task is None
+        assert task is not None and task.cancelled()
+
+
 @pytest.mark.unit
 class TestEvenChunks:
     def test_splits_into_near_equal_chunks(self):
@@ -370,6 +450,9 @@ class _CapturingTokenizer:
     async def count_texts_async(self, texts, _loop):
         return [len(t.split()) for t in texts]
 
+    async def count_texts_live_async(self, texts, _loop):
+        return await self.count_texts_async(texts, _loop)
+
     async def token_count_message_async(self, content, reasoning, tool_calls, _loop):
         parts = [p for p in (content, reasoning) if p]
         return len(" ".join(parts).split()) + (len(tool_calls) if tool_calls else 0)
diff --git a/tests/unit/commands/test_benchmark.py b/tests/unit/commands/test_benchmark.py
index 969f22ce2..9da2dcf56 100644
--- a/tests/unit/commands/test_benchmark.py
+++ b/tests/unit/commands/test_benchmark.py
@@ -489,7 +489,7 @@ def test_defaults(self):
         assert cfg.warmup_timeout_s == 240.0
         assert cfg.performance_timeout_s == 240.0
         assert cfg.accuracy_timeout_s is None
-        assert cfg.metrics_drain_timeout_s == 60.0
+        assert cfg.metrics_drain_timeout_s == 300.0
 
     @pytest.mark.unit
     @pytest.mark.parametrize(

From aed6b78aa2ba19f63bf1eb789e25fc4a528ec4ed Mon Sep 17 00:00:00 2001
From: Viraat Chandra <viraatc@nvidia.com>
Date: Wed, 10 Jun 2026 16:30:03 -0700
Subject: [PATCH 10/20] fix(metrics): bound live flushes; align defaults;
 audit-driven hardening

Workstreams from the full design audit:

- Live flushes take at most _LIVE_FLUSH_MAX_ITEMS per kind. This bounds
  the queue-lock hold time, the unstoppable in-flight thread encode left
  behind by a drain-start cancellation (close(wait=True) is now bounded
  by ~one slice), and the drain's re-encode of requeued items.
- BatchTokenizer live_workers ctor default aligned to 2 (the CLI
  default); the aggregator class drain-timeout default aligned to 300s
  (the CLI default); --tokenizer-workers < 0 rejected at startup.
- A failed restore of the inherited CPU mask is logged instead of
  silently leaving the aggregator expanded.
- Comment/docstring hygiene: removed prior-implementation narration and
  stale shard-lane/warmup-degrade/publish-tick wording; SIGTERM-only
  phrasing in publisher docs.
- Tests: shard-decision suite no longer issues real sched_setaffinity
  syscalls (probes and restore are patched and asserted); live lane
  pinned as in-process-only; new coverage for RAYON caps (ctor,
  operator override, per-shard block override), live flush slice cap,
  live cancellation/message-failure requeue, and STARTED arming the
  live loop with ENDED stopping it; live-method aliases on all stubs.
- DESIGN.md rewritten for the final shape (in-process live lane,
  drain-only auto-sized shards, probe-and-restore affinity, requeue
  semantics, diagram + CLI table); services overview and AGENTS.md row
  aligned.

345 unit tests pass; pre-commit clean.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 AGENTS.md                                     |   2 +-
 docs/async_utils/services/DESIGN.md           |   2 +-
 .../services/metrics_aggregator/DESIGN.md     | 101 +++++++-------
 .../services/metrics_aggregator/__main__.py   |  43 ++----
 .../services/metrics_aggregator/aggregator.py |   9 +-
 .../services/metrics_aggregator/publisher.py  |   6 +-
 .../metrics_aggregator/token_metrics.py       | 118 +++++++++++-----
 .../services/metrics_aggregator/conftest.py   |   2 +
 .../metrics_aggregator/test_aggregator.py     |  29 ++++
 .../metrics_aggregator/test_token_metrics.py  | 130 +++++++++++++++++-
 10 files changed, 310 insertions(+), 132 deletions(-)

diff --git a/AGENTS.md b/AGENTS.md
index dbc8ce953..e6182d198 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -204,7 +204,7 @@ src/inference_endpoint/
 │   │       ├── publisher.py    # MetricsPublisher (tick task + atomic disk fallback)
 │   │       ├── subscriber.py   # MetricsSnapshotSubscriber (latest + COMPLETE snapshot capture)
 │   │       ├── metrics_table.py # In-flight sample rows + trigger dispatch (TTFT/TPOT/ISL/OSL)
-│   │       └── token_metrics.py # BatchTokenizer (sharded batch tokenization) + TokenBatchQueue (defer-to-flush buffer) for ISL/OSL/TPOT
+│   │       └── token_metrics.py # BatchTokenizer (live thread lane + drain-only sharded pool) + TokenBatchQueue (defer-to-flush buffer, owns the live flush loop) for ISL/OSL/TPOT
 │   └── transport/             # ZMQ-based IPC transport layer
 │       ├── protocol.py        # Transport protocols + TransportConfig + MessageCodec[T]
 │       └── zmq/               # ZMQ implementation (context, pubsub, transport, ZMQTransportConfig)
diff --git a/docs/async_utils/services/DESIGN.md b/docs/async_utils/services/DESIGN.md
index e12eb8a4d..e013b4ea1 100644
--- a/docs/async_utils/services/DESIGN.md
+++ b/docs/async_utils/services/DESIGN.md
@@ -306,7 +306,7 @@ stateDiagram-v2
 
 ### 6.2 Metrics aggregator
 
-- **Role**: Subscribes to EventRecords and derives real-time metrics (e.g. TTFT, sample latency, token counts). Token metrics (ISL/OSL/TPOT) are computed by a batched, process-sharded tokenizer — see [metrics_aggregator/DESIGN.md](metrics_aggregator/DESIGN.md). Shuts down on **session.ended**.
+- **Role**: Subscribes to EventRecords and derives real-time metrics (e.g. TTFT, sample latency, token counts). Token metrics (ISL/OSL/TPOT) are computed by a batched tokenizer (in-process threads live; process-sharded end-of-run drain) — see [metrics_aggregator/DESIGN.md](metrics_aggregator/DESIGN.md). Shuts down on **session.ended**.
 - **Outputs**: Live `MetricsSnapshot` frames over an IPC PUB socket, and an atomically written `final_snapshot.json` (the primary Report source). Planned is to push real time metrics to Prometheus via PushGateway.
 - **Process**: Run as a **subprocess**; given `--metrics-output-dir`, `--socket-dir`, `--socket-name`, `--metrics-socket`, and optional tokenizer options. Uses a dedicated event loop and `ManagedZMQContext.scoped(socket_dir=...)` so it can connect to the publisher's IPC address.
 
diff --git a/docs/async_utils/services/metrics_aggregator/DESIGN.md b/docs/async_utils/services/metrics_aggregator/DESIGN.md
index 707ded06d..882683968 100644
--- a/docs/async_utils/services/metrics_aggregator/DESIGN.md
+++ b/docs/async_utils/services/metrics_aggregator/DESIGN.md
@@ -17,16 +17,16 @@ high-completion-rate runs.
 
 ## Module Layout
 
-| File               | Purpose                                                                   |
-| ------------------ | ------------------------------------------------------------------------- |
-| `__main__.py`      | Subprocess entry: argparse, affinity expansion, lifecycle wiring, SIGTERM |
-| `aggregator.py`    | `MetricsAggregatorService` — event router, session state, drain           |
-| `registry.py`      | `MetricsRegistry`, `CounterSampler`, `SeriesSampler`                      |
-| `snapshot.py`      | `MetricsSnapshot` wire schema, `SessionState`, msgpack codec              |
-| `publisher.py`     | `MetricsPublisher` — tick task + atomic final-snapshot write              |
-| `subscriber.py`    | `MetricsSnapshotSubscriber` — main-process consumer                       |
-| `metrics_table.py` | In-flight sample rows + trigger dispatch (TTFT/TPOT/ISL/OSL)              |
-| `token_metrics.py` | `BatchTokenizer` (sharded batch tokenization) + `TokenBatchQueue`         |
+| File               | Purpose                                                                         |
+| ------------------ | ------------------------------------------------------------------------------- |
+| `__main__.py`      | Subprocess entry: argparse, strict tokenizer startup, lifecycle wiring, SIGTERM |
+| `aggregator.py`    | `MetricsAggregatorService` — event router, session state, drain                 |
+| `registry.py`      | `MetricsRegistry`, `CounterSampler`, `SeriesSampler`                            |
+| `snapshot.py`      | `MetricsSnapshot` wire schema, `SessionState`, msgpack codec                    |
+| `publisher.py`     | `MetricsPublisher` — tick task + atomic final-snapshot write                    |
+| `subscriber.py`    | `MetricsSnapshotSubscriber` — main-process consumer                             |
+| `metrics_table.py` | In-flight sample rows + trigger dispatch (TTFT/TPOT/ISL/OSL)                    |
+| `token_metrics.py` | `BatchTokenizer` (sharded batch tokenization) + `TokenBatchQueue`               |
 
 ## Lifecycle
 
@@ -64,12 +64,15 @@ to a buffer, an O(1) operation with no event-loop tasks. The buffer is cleared
 in batches at exactly two points:
 
 1. **The queue's own live loop** — `start_live(interval)` flushes
-   periodically (at the publish cadence) through the tokenizer's **bounded
-   live lane**: the last `--live-tokenizers` shards (default 1 — the highest
-   core block, farthest from the loadgen's low cores), so live ISL/OSL/TPOT
-   stay current without touching the benchmark hot path. `--live-tokenizers
-0` disables mid-run tokenization entirely. Failures are logged once and
-   never stop the loop.
+   periodically (at the publish cadence) through the tokenizer's **in-process
+   live lane**: a small thread pool of `--tokenizer-workers` threads
+   (default 2) whose rayon pool is capped to the same width, taking at most
+   `_LIVE_FLUSH_MAX_ITEMS` items per kind per flush so the queue lock is never
+   held for a long encode. Live flushes never touch the shard processes; they
+   run inside the aggregator process, wherever the parent placed it.
+   `--tokenizer-workers 0` disables mid-run tokenization entirely. Failures
+   are logged once and never stop the loop — failed or cancelled live items
+   are **re-queued** so the drain retries them.
 2. **End-of-run** — `flush_remaining(timeout)` stops the live loop and drains
    everything still buffered through **every** shard, bounded by the drain
    budget. The publisher knows nothing about tokenization — it only reads
@@ -78,15 +81,17 @@ in batches at exactly two points:
 `flush()` serializes under an asyncio lock and detaches the buffer up front,
 so enqueues that race a flush land in the next one. Failure isolation is
 layered: the plain-text phase and the chat-template phase fail independently
-(they run on separate executors, so a dead text shard must not drop message
-items), a raising recorder callback is logged without aborting the rest of
-the batch, and the first error is re-raised only after both phases ran.
-`flush_remaining` never raises — a timeout or tokenizer failure becomes a
-logged, non-zero pending count.
+(in drain mode they run on separate executors, so a dead text shard must not
+drop message items), a raising recorder callback is logged without aborting
+the rest of the batch, and the first error is re-raised only after both
+phases ran. Live-mode failures and cancellations re-queue the detached items
+(a mid-run hiccup never loses samples); drain-mode failures are terminal —
+the items stay counted in `pending`. `flush_remaining` never raises — a
+timeout or tokenizer failure becomes a logged, non-zero pending count.
 
 ### Sharded batch encoding (`BatchTokenizer`)
 
-A flush hands the whole buffer to `count_texts_async`, which splits it into
+The end-of-run drain hands the whole buffer to `count_texts_async`, which splits it into
 contiguous chunks and fans them out across worker **processes**, one pinned to
 each block of `CORES_PER_WORKER` (8) cores. Why this shape:
 
@@ -104,16 +109,16 @@ each block of `CORES_PER_WORKER` (8) cores. Why this shape:
   a wedge), and they ignore SIGINT — Ctrl-C goes to the whole process group,
   and worker lifetime must stay under the parent drain's control.
 
-`--tokenizer-workers` controls the shard count: `-1` (default) auto-fits one
-shard per 8-core block of the process affinity mask (always at least one), an
-explicit count is clamped to that capacity, and `0` explicitly selects
-in-process tokenization. There is no implicit fallback: an environment that
-cannot shard — no fast Rust backend, a failed or over-budget warmup — is a
-startup error, because a silent in-process slow path cannot keep up with
-completions and would surface much later as an incomplete drain. Platforms
-without a CPU-affinity API (e.g. macOS) still shard at full speed, just
-unpinned: blocks are sized from the online CPU count and each worker caps its
-rayon pool to the block size instead of pinning.
+The shard pool has no CLI knob: it always auto-sizes to one shard per
+8-core block of the allowed CPU universe (always at least one).
+`--tokenizer-workers` sizes the **live** in-process thread lane instead
+(default 2; `0` = no mid-run tokenization). There is no implicit fallback: an
+environment that cannot shard — no fast Rust backend, a failed or over-budget
+warmup — is a startup error, because a silent in-process slow path cannot
+keep up with completions and would surface much later as an incomplete drain.
+Platforms without a CPU-affinity API (e.g. macOS) still shard at full speed,
+just unpinned: blocks are sized from the online CPU count and each worker
+caps its rayon pool to the block size instead of pinning.
 
 Chat-template items (tool-call outputs) take a separate in-process thread:
 they are rare relative to the batched flush, and `apply_chat_template` is
@@ -123,14 +128,17 @@ is counted.
 
 ### CPU affinity: the tokenizer stage is post-run
 
-The benchmark parent pins itself to the loadgen cores before launching
-services, and subprocesses inherit that narrow mask. The tokenizer's heavy
-work happens **after** the run (the end-of-run flush), so the run-time core
-partition does not apply to it: at startup the service calls
-`expand_to_all_online_cpus()` (see `endpoint_client/cpu_affinity.py`) to reset
-its mask to every online CPU — the kernel still clamps to the cgroup/Slurm
-cpuset — and shards size to the full machine. Mid-run tick flushes are small
-batches; the drain is where the core count pays.
+The benchmark parent pins itself to the loadgen cores (the fastest
+perf-ranked physical cores) before launching services, and subprocesses
+inherit that narrow mask. The tokenizer's heavy work happens **after** the
+run, so the run-time core partition does not apply to it — but the aggregator
+itself must not move: `_setup_shards` probes the full allowed universe via
+`expand_to_all_online_cpus()` (see `endpoint_client/cpu_affinity.py`; the
+kernel still clamps to the cgroup/Slurm cpuset) **and then restores the
+inherited mask**, so the event loop, the publisher, and the live tokenizer
+threads stay exactly where the parent placed them. Only the drain-phase shard
+children, which pin themselves to their own 8-core blocks, span the whole
+machine — and they are idle until `ENDED`.
 
 ### The `n_pending_tasks` contract
 
@@ -151,11 +159,11 @@ as a clean run.
 ```
 COMPLETE event ─► TokenTrigger.fire ─► queue.enqueue(text, on_count)   [O(1)]
                                             │
-        live loop (0.25 s, live lane) ─────┤  flush()
-        ENDED drain (budgeted) ─────────────┘    │
-                                                 ├─► chunks ─► N pinned worker procs
-                                                 │             (encode_batch_fast)
-                                                 └─► on_count(n) ─► registry.record()
+   live loop (0.25 s) ── flush(live) ───────┤─► in-process thread pool
+                                            │   (rayon capped to --tokenizer-workers)
+   ENDED drain (budgeted) ── flush() ───────┘─► chunks ─► N pinned worker procs
+                                                │          (encode_batch_fast)
+                                                └─► on_count(n) ─► registry.record()
 ```
 
 ## CLI Interface
@@ -168,8 +176,7 @@ COMPLETE event ─► TokenTrigger.fire ─► queue.enqueue(text, on_count)   [
 | `--publish-interval`             | 0.25     | Live snapshot cadence (seconds)                     |
 | `--drain-timeout`                | 300.0    | End-of-run tokenize budget (`0` = unlimited)        |
 | `--tokenizer`                    | none     | HF name or local path; unset disables token metrics |
-| `--tokenizer-workers`            | -1       | Shard processes (`-1` auto, `0` in-process)         |
-| `--live-tokenizers`              | 1        | Shards for mid-run live flushes (`0` = defer all)   |
+| `--tokenizer-workers`            | 2        | Live in-process threads (`0` = defer all to drain)  |
 | `--streaming`                    | off      | Register TTFT/chunk-delta/TPOT triggers             |
 
 ## References
diff --git a/src/inference_endpoint/async_utils/services/metrics_aggregator/__main__.py b/src/inference_endpoint/async_utils/services/metrics_aggregator/__main__.py
index 628811a36..0d5be495e 100644
--- a/src/inference_endpoint/async_utils/services/metrics_aggregator/__main__.py
+++ b/src/inference_endpoint/async_utils/services/metrics_aggregator/__main__.py
@@ -26,10 +26,6 @@
 from inference_endpoint.async_utils.loop_manager import LoopManager
 from inference_endpoint.async_utils.transport.zmq.context import ManagedZMQContext
 from inference_endpoint.async_utils.transport.zmq.ready_check import send_ready_signal
-from inference_endpoint.endpoint_client.cpu_affinity import (
-    UnsupportedPlatformError,
-    expand_to_all_online_cpus,
-)
 from inference_endpoint.utils.logging import setup_logging
 
 from .aggregator import MetricCounterKey, MetricsAggregatorService
@@ -166,23 +162,12 @@ async def main() -> None:
     parser.add_argument(
         "--tokenizer-workers",
         type=int,
-        default=-1,
+        default=2,
         help=(
-            "Number of tokenizer shard processes (-1 = auto: one per "
-            "8-core block of this machine, minimum one; 0 = explicit "
-            "in-process tokenization). A tokenizer without a fast (Rust) "
-            "backend is a startup error unless 0 is passed; platforms "
-            "without CPU affinity (e.g. macOS) shard unpinned."
-        ),
-    )
-    parser.add_argument(
-        "--live-tokenizers",
-        type=int,
-        default=1,
-        help=(
-            "Shards used for mid-run (live) token-metric flushes (default: 1 "
-            "— the highest core block, away from the loadgen's cores; 0 = no "
-            "mid-run tokenization, everything defers to the end-of-run drain)."
+            "In-process tokenizer threads for live (mid-run) ISL/OSL/TPOT "
+            "(default: 2; 0 = no mid-run tokenization, everything defers "
+            "to the end-of-run drain). The drain always uses the auto-sized "
+            "sharded pool — one worker process per 8-core block."
         ),
     )
     parser.add_argument(
@@ -206,6 +191,9 @@ async def main() -> None:
     args = parser.parse_args()
     setup_logging(level="INFO")
 
+    if args.tokenizer_workers < 0:
+        raise SystemExit("FATAL: --tokenizer-workers must be >= 0")
+
     # The parent owns directory setup — `commands/benchmark/execute.py`
     # creates `<report_dir>/metrics/` and validates it before launching
     # this subprocess. Validate here as a fail-fast contract check so a
@@ -227,20 +215,9 @@ async def main() -> None:
     # (coalesces to 'object' not 'AbstractContextManager[BatchTokenizer | None]')
     tokenizer_cm: AbstractContextManager[BatchTokenizer | None]
     if args.tokenizer:
-        # Tokenization drains after the benchmark run, so the loadgen/worker
-        # affinity partition does not apply to this stage: drop the narrow
-        # mask inherited from the pinned parent so shards size to the whole
-        # machine (cgroup/Slurm CPU limits still apply).
-        try:
-            cpus = expand_to_all_online_cpus()
-            logger.info("metrics aggregator affinity: %d CPUs", len(cpus))
-        except UnsupportedPlatformError:
-            pass  # non-Linux: no inherited pin to undo.
         try:
             tokenizer_cm = BatchTokenizer(
-                args.tokenizer,
-                n_workers=args.tokenizer_workers,
-                live_workers=args.live_tokenizers,
+                args.tokenizer, live_workers=args.tokenizer_workers
             )
         except RuntimeError as exc:
             # Fail-fast contract: a tokenizer environment that cannot shard
@@ -275,7 +252,7 @@ async def main() -> None:
                 n_histogram_buckets=args.n_histogram_buckets,
                 tokenizer=tokenizer,
                 live_flush_interval_s=(
-                    args.publish_interval if args.live_tokenizers > 0 else None
+                    args.publish_interval if args.tokenizer_workers > 0 else None
                 ),
                 streaming=args.streaming,
                 shutdown_event=shutdown_event,
diff --git a/src/inference_endpoint/async_utils/services/metrics_aggregator/aggregator.py b/src/inference_endpoint/async_utils/services/metrics_aggregator/aggregator.py
index cb2db5878..14bb28189 100644
--- a/src/inference_endpoint/async_utils/services/metrics_aggregator/aggregator.py
+++ b/src/inference_endpoint/async_utils/services/metrics_aggregator/aggregator.py
@@ -96,7 +96,7 @@ class MetricCounterKey(str, Enum):
 _TOKEN_HDR_LOW: Final[int] = 1
 _TOKEN_HDR_HIGH: Final[int] = 10_000_000  # 10M tokens
 
-_DEFAULT_DRAIN_TIMEOUT_S: Final[float] = 60.0
+_DEFAULT_DRAIN_TIMEOUT_S: Final[float] = 300.0
 
 
 class MetricsAggregatorService(ZmqMessageSubscriber[EventRecord]):
@@ -134,9 +134,10 @@ def __init__(
         self._registry = registry
         self._publisher = publisher
         self._publish_interval_s = publish_interval_s
-        # Token triggers enqueue onto this queue; it is flushed in batches at
-        # each publish tick and at end-of-run. None when no tokenizer is set
-        # (token metrics disabled), in which case those triggers are no-ops.
+        # Token triggers enqueue onto this queue; it is flushed by the
+        # queue's own live loop (start_live) and by the end-of-run drain.
+        # None when no tokenizer is set (token metrics disabled), in which
+        # case those triggers are no-ops.
         self._token_queue: TokenBatchQueue | None = (
             TokenBatchQueue(tokenizer, self.loop) if tokenizer is not None else None
         )
diff --git a/src/inference_endpoint/async_utils/services/metrics_aggregator/publisher.py b/src/inference_endpoint/async_utils/services/metrics_aggregator/publisher.py
index c90ca11fc..578e47198 100644
--- a/src/inference_endpoint/async_utils/services/metrics_aggregator/publisher.py
+++ b/src/inference_endpoint/async_utils/services/metrics_aggregator/publisher.py
@@ -88,7 +88,7 @@ def __init__(
         self._final_snapshot_path = final_snapshot_path
         self._tick_task: asyncio.Task | None = None
         self._closed = False
-        # publish_final is idempotent: the SIGTERM/SIGINT handler in
+        # publish_final is idempotent: the SIGTERM handler in
         # __main__.py and the aggregator's ENDED-driven path can both
         # call it; the second call must not re-publish or re-write.
         self._finalized = False
@@ -164,7 +164,7 @@ async def publish_final(
         Report consumers as ``state == COMPLETE and n_pending_tasks > 0``.
 
         ``interrupted=True`` is set by the signal handler in __main__.py
-        when SIGTERM/SIGINT triggers shutdown before ``ENDED`` arrived;
+        when SIGTERM triggers shutdown before ``ENDED`` arrived;
         the resulting snapshot is tagged ``state=INTERRUPTED`` so Report
         can distinguish "user killed the run mid-execution" from a clean
         end. Stats in an INTERRUPTED snapshot are best-effort partial
@@ -190,7 +190,7 @@ async def publish_final(
         of the terminal state as the last message).
 
         Idempotent: only the first call writes/publishes; subsequent
-        calls early-return. The SIGTERM/SIGINT handler relies on this to
+        calls early-return. The SIGTERM handler relies on this to
         race safely with the ENDED-driven path.
         """
         if self._finalized:
diff --git a/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py b/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py
index 7723d84d4..dc2736e5d 100644
--- a/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py
+++ b/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py
@@ -18,12 +18,12 @@
 ``BatchTokenizer`` tokenizes whole batches at once, sharded across worker
 processes each pinned to a block of ``CORES_PER_WORKER`` cores (a single BPE
 rayon pool is memory-bound and saturates ~8 cores). The aggregator buffers
-per-sample text; a queue-owned live loop flushes through a bounded live lane
-(default one shard) mid-run, and the end-of-run drain uses every shard.
-Sharding requires the fast (Rust) tokenizers backend; an environment without
-one is a startup error, never a silent slow path — ``--tokenizer-workers 0``
-is the only (explicit) in-process mode. Platforms without CPU affinity (e.g.
-macOS) shard unpinned at full speed; only cache/NUMA locality is lost.
+per-sample text. The sharded pool is the drain-phase accelerator and is
+auto-sized (one shard per core block); live mid-run flushes run on a small
+in-process thread pool (``--tokenizer-workers``, default 2) owned by the
+queue's live loop. A tokenizer without a fast (Rust) backend is a startup
+error, never a silent slow path. Platforms without CPU affinity (e.g. macOS)
+shard unpinned at full speed; only cache/NUMA locality is lost.
 """
 
 from __future__ import annotations
@@ -41,6 +41,9 @@
 from typing import TYPE_CHECKING, Any, Protocol, cast
 
 import msgspec
+from inference_endpoint.endpoint_client.cpu_affinity import (
+    expand_to_all_online_cpus,
+)
 from transformers import AutoTokenizer
 from transformers.utils import logging as transformers_logging
 
@@ -52,9 +55,16 @@
 
 # Budget for the parallel shard warmup (spawn + transformers import +
 # tokenizer load per worker). A hung load (e.g. a stuck network filesystem)
-# must degrade to the in-process path, not wedge service startup.
+# must become a bounded startup error, not wedge service startup.
 _SHARD_WARMUP_TIMEOUT_S = 120.0
 
+# Per-flush ceiling for the LIVE lane, applied separately to text and message items.
+# Bounds three things at once: how long the queue lock is held mid-run, how much
+# work an unstoppable in-flight thread encode can hold after a drain-start
+# cancellation, and how much the drain re-encodes for items the cancelled flush
+# gave back. The drain has no ceiling — it always takes the whole buffer.
+_LIVE_FLUSH_MAX_ITEMS = 1024
+
 # Minimal user message used to satisfy chat templates that reject assistant-only
 # message lists. Its token count is subtracted so only the assistant payload is
 # measured.
@@ -110,12 +120,15 @@ def _init_worker(tokenizer_name: str, core_set: list[int]) -> None:
     # and lose the buffered tokenizations it was counting.
     signal.signal(signal.SIGINT, signal.SIG_IGN)
     if core_set:
+        # Size the rayon pool to the block explicitly: the parent process caps
+        # its own pool for the live lane, and spawn children inherit that env —
+        # without the override every shard would run at the live-lane width.
+        os.environ["RAYON_NUM_THREADS"] = str(len(core_set))
         try:
             os.sched_setaffinity(0, set(core_set))
         except (OSError, AttributeError):
-            # No pinning (e.g. macOS): cap the rayon pool to the block size
-            # instead, so unpinned shards don't oversubscribe each other.
-            os.environ.setdefault("RAYON_NUM_THREADS", str(len(core_set)))
+            # No pinning (e.g. macOS): the rayon cap above still keeps
+            # unpinned shards from oversubscribing each other.
             logger.debug("could not pin tokenizer worker to %s", core_set)
     transformers_logging.set_verbosity_error()
     tok = AutoTokenizer.from_pretrained(tokenizer_name, trust_remote_code=True)
@@ -187,21 +200,29 @@ def __init__(
         *,
         cores_per_worker: int = CORES_PER_WORKER,
         n_workers: int = -1,
-        live_workers: int = 1,
+        live_workers: int = 2,
     ) -> None:
         self._tokenizer_name = tokenizer_name
+        # The live lane runs in-process: cap this process's rayon pool so a
+        # mid-run batched encode uses ~live_workers cores, not the whole
+        # machine. Must be set before the first encode initializes the pool;
+        # setdefault lets an operator-exported RAYON_NUM_THREADS win.
+        os.environ.setdefault("RAYON_NUM_THREADS", str(max(1, live_workers)))
         self._live_workers = live_workers
         self._fallback_warned: set[str] = set()
         self._tokenizer: PreTrainedTokenizerBase | None = None
         self._prefix_len = 0
         self._baseline = 0
-        # In-process thread for the chat-template path.
+        # In-process threads: the live token-metric lane plus the
+        # chat-template path.
         self._thread: ThreadPoolExecutor | None = ThreadPoolExecutor(
-            max_workers=1, thread_name_prefix="tok-thread"
+            max_workers=max(1, live_workers), thread_name_prefix="tok-thread"
         )
         self._load_tokenizer()  # also computes the chat-template baseline
         # Process shards for the batched text path. Empty only when
-        # in-process mode was explicitly requested (n_workers=0).
+        # in-process mode was explicitly requested (n_workers=0 or
+        # cores_per_worker<=0; ctor overrides used primarily by tests —
+        # production wiring passes live_workers only and shards auto-size).
         self._procs: list[ProcessPoolExecutor] = []
         self._setup_shards(cores_per_worker, n_workers)
 
@@ -259,17 +280,35 @@ def _setup_shards(self, cores_per_worker: int, n_workers: int) -> None:
             raise RuntimeError(
                 f"tokenizer {self._tokenizer_name!r} has no fast (Rust) "
                 "backend; token metrics require one to keep up with "
-                "completions. Pass --tokenizer-workers 0 to explicitly run "
-                "single-threaded in-process tokenization."
+                "completions. Use a fast tokenizer, or disable token metrics."
             )
+        # Probe the full allowed CPU universe (cgroup-clamped) for the shard
+        # block math, then restore this process's inherited mask: the
+        # aggregator's event loop, publisher, and live tokenizer threads stay
+        # exactly where the parent placed them (the loadgen mask on a pinned
+        # Linux run). Only the drain-phase shard processes, pinned to their
+        # own blocks, span the whole machine.
         try:
-            available = sorted(os.sched_getaffinity(0))
+            original = os.sched_getaffinity(0)
         except (OSError, AttributeError):
-            # No affinity API (e.g. macOS): shard unpinned — the OS scheduler
-            # spreads the workers; only cache/NUMA locality is lost. Workers
-            # cap their rayon pools to the block size instead (_init_worker).
+            original = None
+        try:
+            available = sorted(expand_to_all_online_cpus())
+        except Exception:  # noqa: BLE001 — no affinity API (e.g. macOS).
+            # Shard unpinned: the OS scheduler spreads the workers; only
+            # cache/NUMA locality is lost. Workers cap their rayon pools to
+            # the block size instead (_init_worker).
             available = list(range(os.cpu_count() or 1))
             logger.info("BatchTokenizer: CPU affinity unavailable; sharding unpinned")
+        else:
+            if original is not None:
+                try:
+                    os.sched_setaffinity(0, original)
+                except OSError:
+                    logger.warning(
+                        "could not restore the aggregator's inherited CPU "
+                        "mask; this process stays expanded to all CPUs"
+                    )
         capacity = max(1, len(available) // cores_per_worker)
         n = capacity if n_workers < 0 else min(n_workers, capacity)
         t0 = time.perf_counter()
@@ -298,8 +337,8 @@ def _setup_shards(self, cores_per_worker: int, n_workers: int) -> None:
             _terminate_procs(procs)
             raise RuntimeError(
                 "tokenizer shard warmup failed; refusing to fall back to a "
-                "slow path. Fix the environment (or pass --tokenizer-workers "
-                "0 to explicitly run in-process)."
+                "slow path that cannot keep up with completions. Fix the "
+                "environment (see the chained error)."
             ) from exc
         self._procs = procs
         logger.info(
@@ -338,19 +377,19 @@ async def count_texts_async(
     async def count_texts_live_async(
         self, texts: list[str], loop: asyncio.AbstractEventLoop
     ) -> list[int]:
-        """Like ``count_texts_async``, bounded to the live lane.
+        """Like ``count_texts_async``, bounded to the in-process live lane.
 
-        Mid-run flushes use only the last ``live_workers`` shards — the
-        highest core blocks, farthest from the loadgen's low cores — so live
-        token metrics never contend with the benchmark hot path. The
-        end-of-run drain uses every shard.
+        Mid-run flushes never touch the shard processes: they run on this
+        process's small thread pool with a rayon pool capped to
+        ``live_workers`` cores. The end-of-run drain uses every shard.
         """
         if not texts:
             return []
-        live_n = max(1, self._live_workers)
-        if self._procs and live_n < len(self._procs):
-            return await self._fan_out(self._procs[-live_n:], texts)
-        return await self.count_texts_async(texts, loop)
+        if self._thread is None:
+            raise RuntimeError("BatchTokenizer is closed")
+        return await loop.run_in_executor(
+            self._thread, self._encode_lengths_inproc, texts
+        )
 
     @staticmethod
     async def _fan_out(procs: list[ProcessPoolExecutor], texts: list[str]) -> list[int]:
@@ -548,9 +587,11 @@ async def flush(self, live: bool = False) -> None:
         """Tokenize everything buffered so far and run each ``on_count``.
 
         ``live=True`` routes text batches through the tokenizer's bounded
-        live lane instead of the full shard pool, and re-queues items on
-        failure or cancellation so a mid-run hiccup never loses samples — the
-        end-of-run drain retries them. Drain-mode failures are terminal: the
+        live lane instead of the full shard pool, takes at most
+        ``_LIVE_FLUSH_MAX_ITEMS`` per kind (bounding lock-hold time and the unstoppable
+        in-flight encode a drain-start cancellation leaves behind), and re-queues items
+        on failure or cancellation so a mid-run hiccup never loses samples — the
+        end-of-run drain retries them. Drain-mode failures are terminal: the
         un-recorded items stay counted in ``pending`` (``_inflight`` is
         decremented only after a callback runs) and surface as an incomplete
         drain, not as silently dropped samples. Items are detached from the
@@ -564,8 +605,13 @@ async def flush(self, live: bool = False) -> None:
         async with self._lock:
             if not (self._text or self._msg):
                 return
-            text_items, self._text = self._text, []
-            msg_items, self._msg = self._msg, []
+            if live:
+                cap = _LIVE_FLUSH_MAX_ITEMS
+                text_items, self._text = self._text[:cap], self._text[cap:]
+                msg_items, self._msg = self._msg[:cap], self._msg[cap:]
+            else:
+                text_items, self._text = self._text, []
+                msg_items, self._msg = self._msg, []
             # The text and message phases fail independently — they run on
             # separate executors, so a dead text shard must not drop message
             # items that would still succeed (and vice versa). The first
diff --git a/tests/unit/async_utils/services/metrics_aggregator/conftest.py b/tests/unit/async_utils/services/metrics_aggregator/conftest.py
index 05b68d3ee..f28b48f7a 100644
--- a/tests/unit/async_utils/services/metrics_aggregator/conftest.py
+++ b/tests/unit/async_utils/services/metrics_aggregator/conftest.py
@@ -170,6 +170,7 @@ def make_aggregator(
     socket_name: str,
     *,
     tokenizer=None,
+    live_flush_interval_s: float | None = None,
     streaming: bool = True,
     shutdown_event: asyncio.Event | None = None,
 ) -> tuple[MetricsAggregatorService, MetricsRegistry, MagicMock]:
@@ -201,6 +202,7 @@ def make_aggregator(
         sig_figs=3,
         n_histogram_buckets=10,
         tokenizer=tokenizer,
+        live_flush_interval_s=live_flush_interval_s,
         streaming=streaming,
         shutdown_event=shutdown_event,
     )
diff --git a/tests/unit/async_utils/services/metrics_aggregator/test_aggregator.py b/tests/unit/async_utils/services/metrics_aggregator/test_aggregator.py
index bae2a0aa5..bc7d2763b 100644
--- a/tests/unit/async_utils/services/metrics_aggregator/test_aggregator.py
+++ b/tests/unit/async_utils/services/metrics_aggregator/test_aggregator.py
@@ -989,6 +989,29 @@ async def test_tpot_non_streaming_output_skipped(self, tmp_path):
             finally:
                 agg.close()
 
+    @pytest.mark.asyncio
+    async def test_started_arms_the_live_flush_loop(self, tmp_path):
+        """STARTED starts the queue's live loop when an interval is set."""
+        loop = asyncio.get_event_loop()
+        with ManagedZMQContext.scoped(socket_dir=str(tmp_path)) as ctx:
+            agg, _, _ = make_aggregator(
+                ctx,
+                loop,
+                "agg_live_arm",
+                tokenizer=MockBatchTokenizer(),
+                live_flush_interval_s=0.01,
+            )
+            try:
+                await agg.process([session_event(SessionEventType.STARTED, ts=0)])
+                assert agg._token_queue is not None
+                assert agg._token_queue._live_task is not None
+                await agg.process([session_event(SessionEventType.ENDED, ts=100)])
+                assert (
+                    agg._token_queue._live_task is None
+                ), "drain must stop the live loop"
+            finally:
+                agg.close()
+
     @pytest.mark.asyncio
     async def test_flush_records_buffered_tokenizations(self, tmp_path):
         """fire() buffers tokenization; flush() tokenizes the batch and records."""
@@ -1066,6 +1089,9 @@ class FailingBatchTokenizer:
             async def count_texts_async(self, texts, _loop):
                 raise RuntimeError("tokenizer backend died")
 
+            async def count_texts_live_async(self, texts, _loop):
+                return await self.count_texts_async(texts, _loop)
+
             async def token_count_message_async(self, *args):
                 raise RuntimeError("tokenizer backend died")
 
@@ -1114,6 +1140,9 @@ async def count_texts_async(self, texts, _loop):
                 await asyncio.sleep(10.0)  # exceeds drain timeout
                 return [0] * len(texts)
 
+            async def count_texts_live_async(self, texts, _loop):
+                return await self.count_texts_async(texts, _loop)
+
             async def token_count_message_async(self, *args):
                 await asyncio.sleep(10.0)
                 return 0
diff --git a/tests/unit/async_utils/services/metrics_aggregator/test_token_metrics.py b/tests/unit/async_utils/services/metrics_aggregator/test_token_metrics.py
index 2588e16cd..270fd683a 100644
--- a/tests/unit/async_utils/services/metrics_aggregator/test_token_metrics.py
+++ b/tests/unit/async_utils/services/metrics_aggregator/test_token_metrics.py
@@ -274,7 +274,9 @@ def shutdown(self, wait=False, cancel_futures=False):
 
 @pytest.mark.unit
 class TestSetupShardsDecisions:
-    """Pins the --tokenizer-workers contract: -1 auto / N clamped / 0 explicit.
+    """Pins the BatchTokenizer(n_workers=...) shard contract: -1 auto / N
+    clamped / 0 explicit in-process (auto-sized in production — the CLI's
+    --tokenizer-workers maps to the live thread lane, not to shards).
 
     An environment that cannot shard is a startup error — never a silent
     in-process fallback.
@@ -282,8 +284,20 @@ class TestSetupShardsDecisions:
 
     def _make(self, monkeypatch, cpus, n_workers, executor=_SpawnlessExecutor):
         monkeypatch.setattr(token_metrics_module, "ProcessPoolExecutor", executor)
+        # Patch the probe + the restore so no real affinity syscalls run.
         monkeypatch.setattr(
-            token_metrics_module.os, "sched_getaffinity", lambda pid: set(range(cpus))
+            token_metrics_module,
+            "expand_to_all_online_cpus",
+            lambda: set(range(cpus)),
+        )
+        monkeypatch.setattr(
+            token_metrics_module.os, "sched_getaffinity", lambda pid: {0, 1}
+        )
+        self.restored: list[set] = []
+        monkeypatch.setattr(
+            token_metrics_module.os,
+            "sched_setaffinity",
+            lambda pid, mask: self.restored.append(set(mask)),
         )
         with patch(_MOCK_TARGET, _FakeTokenizerWithBackend):
             return BatchTokenizer("fake", n_workers=n_workers)
@@ -309,6 +323,13 @@ def test_blocks_are_disjoint_consecutive_core_sets(self, monkeypatch):
             blocks = [set(ex.initargs[1]) for ex in tok._procs]
             assert blocks == [set(range(0, 8)), set(range(8, 16))]
 
+    def test_probe_restores_the_inherited_mask(self, monkeypatch):
+        """The aggregator keeps the mask its parent gave it; only the probe
+        widens, and only the shard children pin elsewhere."""
+        with self._make(monkeypatch, 16, -1):
+            pass
+        assert self.restored == [{0, 1}]
+
     def test_no_fast_backend_is_a_startup_error(self, monkeypatch):
         monkeypatch.setattr(
             token_metrics_module, "ProcessPoolExecutor", _SpawnlessExecutor
@@ -323,8 +344,15 @@ def test_affinity_unavailable_shards_unpinned(self, monkeypatch):
             token_metrics_module, "ProcessPoolExecutor", _SpawnlessExecutor
         )
 
+        def _unsupported():
+            raise RuntimeError("affinity requires Linux")
+
+        monkeypatch.setattr(
+            token_metrics_module, "expand_to_all_online_cpus", _unsupported
+        )
+
         def _raise(pid):
-            raise OSError("affinity unavailable")
+            raise AttributeError("no sched_getaffinity")
 
         monkeypatch.setattr(token_metrics_module.os, "sched_getaffinity", _raise)
         monkeypatch.setattr(token_metrics_module.os, "cpu_count", lambda: 16)
@@ -357,8 +385,8 @@ def submit(self, _fn, chunk):
 @pytest.mark.unit
 class TestLiveLane:
     @pytest.mark.asyncio
-    async def test_live_uses_only_the_last_shards(self):
-        """Mid-run flushes stay off the low core blocks (loadgen side)."""
+    async def test_live_never_touches_the_shard_pool(self):
+        """Mid-run flushes run in-process; the shards are drain-only."""
         with patch(_MOCK_TARGET, _FakeTokenizer):
             loop = asyncio.get_running_loop()
             with BatchTokenizer("fake", n_workers=0, live_workers=1) as tok:
@@ -366,8 +394,7 @@ async def test_live_uses_only_the_last_shards(self):
                 tok._procs = procs
                 counts = await tok.count_texts_live_async(["a b", "c"], loop)
                 assert counts == [2, 1]
-                assert procs[0].chunks == [] and procs[1].chunks == []
-                assert procs[2].chunks == [["a b", "c"]]
+                assert all(p.chunks == [] for p in procs)
 
     @pytest.mark.asyncio
     async def test_drain_uses_every_shard(self):
@@ -423,6 +450,86 @@ async def test_flush_remaining_stops_live_loop(self):
         assert task is not None and task.cancelled()
 
 
+@pytest.mark.unit
+class TestRayonCaps:
+    def test_ctor_caps_rayon_to_live_workers(self, monkeypatch):
+        monkeypatch.delenv("RAYON_NUM_THREADS", raising=False)
+        with patch(_MOCK_TARGET, _FakeTokenizer):
+            with BatchTokenizer("fake", n_workers=0, live_workers=3):
+                assert token_metrics_module.os.environ["RAYON_NUM_THREADS"] == "3"
+
+    def test_ctor_respects_operator_exported_cap(self, monkeypatch):
+        monkeypatch.setenv("RAYON_NUM_THREADS", "7")
+        with patch(_MOCK_TARGET, _FakeTokenizer):
+            with BatchTokenizer("fake", n_workers=0, live_workers=3):
+                assert token_metrics_module.os.environ["RAYON_NUM_THREADS"] == "7"
+
+    def test_init_worker_overrides_inherited_cap_with_block_size(self, monkeypatch):
+        """Spawn children inherit the parent's live cap; each shard must
+        re-size its rayon pool to its own core block."""
+        monkeypatch.setenv("RAYON_NUM_THREADS", "2")
+
+        def _no_affinity(pid, mask):
+            raise AttributeError("no sched_setaffinity")
+
+        monkeypatch.setattr(token_metrics_module.os, "sched_setaffinity", _no_affinity)
+        with patch(_MOCK_TARGET, _FakeTokenizer):
+            token_metrics_module._init_worker("fake", [0, 1, 2, 3, 4, 5, 6, 7])
+        assert token_metrics_module.os.environ["RAYON_NUM_THREADS"] == "8"
+
+
+@pytest.mark.unit
+@pytest.mark.asyncio
+class TestLiveFlushBounds:
+    async def test_live_flush_takes_at_most_the_cap(self, monkeypatch):
+        monkeypatch.setattr(token_metrics_module, "_LIVE_FLUSH_MAX_ITEMS", 3)
+        loop = asyncio.get_running_loop()
+        queue = TokenBatchQueue(_CapturingTokenizer(), loop)
+        recorded: list[int] = []
+        for i in range(5):
+            queue.enqueue_text(f"t{i}", recorded.append)
+        await queue.flush(live=True)
+        assert len(recorded) == 3
+        assert queue.pending == 2
+        # The drain takes everything that remains.
+        assert await queue.flush_remaining(timeout=1.0) == 0
+        assert len(recorded) == 5
+
+    async def test_live_cancellation_requeues_texts(self):
+        class _Hanging(_CapturingTokenizer):
+            async def count_texts_live_async(self, texts, _loop):
+                await asyncio.sleep(30)
+                return [0] * len(texts)
+
+        loop = asyncio.get_running_loop()
+        queue = TokenBatchQueue(_Hanging(), loop)
+        recorded: list[int] = []
+        queue.enqueue_text("a b", recorded.append)
+        task = loop.create_task(queue.flush(live=True))
+        await asyncio.sleep(0.01)
+        task.cancel()
+        with pytest.raises(asyncio.CancelledError):
+            await task
+        assert queue.pending == 1
+        assert len(queue._text) == 1, "cancelled live flush must give items back"
+        assert await queue.flush_remaining(timeout=1.0) == 0
+        assert recorded == [2]
+
+    async def test_live_message_failure_requeues_message(self):
+        class _MsgFailing(_CapturingTokenizer):
+            async def token_count_message_async(self, *args):
+                raise RuntimeError("template boom")
+
+        loop = asyncio.get_running_loop()
+        queue = TokenBatchQueue(_MsgFailing(), loop)
+        recorded: list[int] = []
+        queue.enqueue_message(("hello world", None, None), recorded.append)
+        with pytest.raises(RuntimeError, match="template boom"):
+            await queue.flush(live=True)
+        assert queue.pending == 1
+        assert len(queue._msg) == 1, "failed live message must be re-queued"
+
+
 @pytest.mark.unit
 class TestEvenChunks:
     def test_splits_into_near_equal_chunks(self):
@@ -502,6 +609,9 @@ async def count_texts_async(self, texts, _loop):
                 await asyncio.sleep(10.0)
                 return [0] * len(texts)
 
+            async def count_texts_live_async(self, texts, _loop):
+                return await self.count_texts_async(texts, _loop)
+
             async def token_count_message_async(self, *args):
                 return 0
 
@@ -520,6 +630,9 @@ class _FailingTokenizer:
             async def count_texts_async(self, texts, _loop):
                 raise RuntimeError("tokenizer boom")
 
+            async def count_texts_live_async(self, texts, _loop):
+                return await self.count_texts_async(texts, _loop)
+
             async def token_count_message_async(self, *args):
                 raise RuntimeError("tokenizer boom")
 
@@ -537,6 +650,9 @@ class _TextFailingTokenizer:
             async def count_texts_async(self, texts, _loop):
                 raise RuntimeError("text shard died")
 
+            async def count_texts_live_async(self, texts, _loop):
+                return await self.count_texts_async(texts, _loop)
+
             async def token_count_message_async(
                 self, content, reasoning, tool_calls, _loop
             ):

From 700423e92113250ee76133cb94dd49b2595301f0 Mon Sep 17 00:00:00 2001
From: Viraat Chandra <viraatc@nvidia.com>
Date: Wed, 10 Jun 2026 16:40:53 -0700
Subject: [PATCH 11/20] fix(metrics): call-shaped awaits for cancelled tasks;
 pin aggregator-args seam
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

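- flush_remaining gathers the cancelled live task
  (asyncio.gather(..., return_exceptions=True)) instead of a bare
  suppressed await; the cancellation test awaits via asyncio.wait_for.
  Both silence the code-quality ineffectual-statement check and leave the
  cancelled-task path unchanged. Note: unlike suppress(CancelledError),
  gather(return_exceptions=True) also absorbs any other exception the
  live task ended with, and it no longer swallows a cancellation aimed
  at flush_remaining itself.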
- New TestAggregatorArgs case pins the SUT-intrusion seam: --tokenizer
  is forwarded, and no live/worker knobs are — the service defaults
  deliberately govern mid-run tokenization (review feedback).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../metrics_aggregator/token_metrics.py       |  4 +-
 .../metrics_aggregator/test_token_metrics.py  |  2 +-
 tests/unit/commands/test_benchmark.py         | 53 +++++++++++++++++++
 3 files changed, 55 insertions(+), 4 deletions(-)

diff --git a/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py b/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py
index dc2736e5d..c5d742087 100644
--- a/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py
+++ b/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py
@@ -29,7 +29,6 @@
 from __future__ import annotations
 
 import asyncio
-import contextlib
 import json
 import logging
 import multiprocessing
@@ -673,8 +672,7 @@ async def flush_remaining(self, timeout: float | None) -> int:
         """
         if self._live_task is not None:
             self._live_task.cancel()
-            with contextlib.suppress(asyncio.CancelledError):
-                await self._live_task
+            await asyncio.gather(self._live_task, return_exceptions=True)
             self._live_task = None
         if self._inflight == 0:
             return 0
diff --git a/tests/unit/async_utils/services/metrics_aggregator/test_token_metrics.py b/tests/unit/async_utils/services/metrics_aggregator/test_token_metrics.py
index 270fd683a..1a51c1a18 100644
--- a/tests/unit/async_utils/services/metrics_aggregator/test_token_metrics.py
+++ b/tests/unit/async_utils/services/metrics_aggregator/test_token_metrics.py
@@ -509,7 +509,7 @@ async def count_texts_live_async(self, texts, _loop):
         await asyncio.sleep(0.01)
         task.cancel()
         with pytest.raises(asyncio.CancelledError):
-            await task
+            await asyncio.wait_for(task, timeout=1.0)
         assert queue.pending == 1
         assert len(queue._text) == 1, "cancelled live flush must give items back"
         assert await queue.flush_remaining(timeout=1.0) == 0
diff --git a/tests/unit/commands/test_benchmark.py b/tests/unit/commands/test_benchmark.py
index 9da2dcf56..7d43017dc 100644
--- a/tests/unit/commands/test_benchmark.py
+++ b/tests/unit/commands/test_benchmark.py
@@ -631,6 +631,59 @@ async def _capture_launch(service_configs, *, timeout):
         idx = args.index("--drain-timeout")
         assert args[idx + 1] == expected_flag
 
+    @pytest.mark.unit
+    @pytest.mark.asyncio
+    async def test_tokenizer_forwarded_and_live_args_left_to_service_defaults(
+        self, tmp_path
+    ):
+        """Pins the SUT-intrusion seam: the benchmark forwards --tokenizer but
+        deliberately no live/worker knobs — the service's own defaults govern
+        mid-run tokenization."""
+        config = OfflineConfig(**_OFFLINE_KWARGS, settings=OfflineSettings())
+        ctx = self._make_ctx(config, tmp_path)
+        ctx.tokenizer_name = "gpt2"
+
+        captured: list = []
+
+        async def _capture_launch(service_configs, *, timeout):
+            captured.extend(service_configs)
+            raise KeyboardInterrupt("stop after launch")
+
+        mock_zmq = MagicMock()
+        mock_zmq.socket_dir = str(tmp_path / "sockets")
+
+        with (
+            patch(
+                "inference_endpoint.commands.benchmark.execute.ManagedZMQContext"
+            ) as MockZMQ,
+            patch(
+                "inference_endpoint.commands.benchmark.execute.EventPublisherService"
+            ) as MockPub,
+            patch(
+                "inference_endpoint.commands.benchmark.execute.MetricsSnapshotSubscriber"
+            ) as MockSub,
+            patch(
+                "inference_endpoint.commands.benchmark.execute.ServiceLauncher"
+            ) as MockLauncher,
+            patch("inference_endpoint.commands.benchmark.execute.tqdm"),
+        ):
+            MockZMQ.scoped.return_value.__enter__ = MagicMock(return_value=mock_zmq)
+            MockZMQ.scoped.return_value.__exit__ = MagicMock(return_value=False)
+            MockPub.return_value.socket_name = "test_pub"
+            MockSub.return_value.start = MagicMock()
+            MockLauncher.return_value.launch = _capture_launch
+
+            loop = asyncio.get_event_loop()
+            with pytest.raises(KeyboardInterrupt):
+                await _run_benchmark_async(ctx, loop)
+
+        aggregator_cfg = next(c for c in captured if "metrics_aggregator" in c.module)
+        args = aggregator_cfg.args
+        idx = args.index("--tokenizer")
+        assert args[idx + 1] == "gpt2"
+        assert "--tokenizer-workers" not in args
+        assert "--live-tokenizers" not in args
+
 
 class TestBuildPhases:
     """Tests for _build_phases() in execute.py."""

From 9640bd79dcca2a63a930b4a616d6ad358a0724df Mon Sep 17 00:00:00 2001
From: Viraat Chandra <viraatc@nvidia.com>
Date: Wed, 10 Jun 2026 16:59:35 -0700
Subject: [PATCH 12/20] fix(metrics): requeue messages on live-cancel; shrink
 the tokenizer API
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Review feedback (human + council), with the API surface pulled back
toward main:

- A live-flush cancellation landing in the text encode dropped the
  already-detached message items — lost tool-call samples and a final
  snapshot stuck at n_pending_tasks > 0 for work the drain could never
  reach. The text-phase CancelledError handler now re-queues both
  kinds; regression test covers text+message together.
- count_texts_live_async is gone: the live lane is a live= keyword on
  count_texts_async, so the TokenCounter protocol is back to two
  methods and every test stub lost its alias.
- The SIGTERM handler takes the token queue object again (reads
  .pending), not a callable.
- Live flushes take their slice in place (del list[:cap]) instead of
  copying the whole backlog tail under the queue lock each tick.
- Shard warmup budget reduced to 25s so its diagnostic FATAL fires
  before the parent's 30s service-launch kill.
- TestAggregatorArgs (added in the previous commit) pins the SUT-intrusion
  seam: --tokenizer is forwarded; live/worker knobs deliberately are not.

276 unit tests pass; pre-commit clean.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../services/metrics_aggregator/__main__.py   |  8 +--
 .../metrics_aggregator/token_metrics.py       | 69 +++++++++----------
 .../services/metrics_aggregator/conftest.py   | 10 ++-
 .../metrics_aggregator/test_aggregator.py     | 10 +--
 .../test_main_signal_handler.py               |  9 +--
 .../metrics_aggregator/test_token_metrics.py  | 60 ++++++++++------
 6 files changed, 84 insertions(+), 82 deletions(-)

diff --git a/src/inference_endpoint/async_utils/services/metrics_aggregator/__main__.py b/src/inference_endpoint/async_utils/services/metrics_aggregator/__main__.py
index 0d5be495e..b4cd8bba9 100644
--- a/src/inference_endpoint/async_utils/services/metrics_aggregator/__main__.py
+++ b/src/inference_endpoint/async_utils/services/metrics_aggregator/__main__.py
@@ -33,7 +33,7 @@
 from .publisher import MetricsPublisher
 from .registry import MetricsRegistry
 from .snapshot import MetricsSnapshotCodec
-from .token_metrics import BatchTokenizer
+from .token_metrics import BatchTokenizer, TokenBatchQueue
 
 logger = logging.getLogger(__name__)
 
@@ -44,7 +44,7 @@ def _make_sigterm_handler(
     registry: MetricsRegistry,
     publisher: MetricsPublisher,
     table: MetricsTable,
-    pending_tokens: Callable[[], int],
+    token_queue: TokenBatchQueue | None,
     shutdown_event: asyncio.Event,
 ) -> tuple[Callable[[], None], set[asyncio.Task]]:
     """Build the SIGTERM handler that writes the INTERRUPTED final snapshot.
@@ -76,7 +76,7 @@ async def _signal_finalize() -> None:
             )
             await publisher.publish_final(
                 registry,
-                n_pending_tasks=pending_tokens(),
+                n_pending_tasks=token_queue.pending if token_queue is not None else 0,
                 interrupted=True,
             )
         except Exception:  # noqa: BLE001 — best-effort.
@@ -288,7 +288,7 @@ async def main() -> None:
                 registry=registry,
                 publisher=publisher,
                 table=aggregator._table,
-                pending_tokens=lambda: aggregator.pending_tokens,
+                token_queue=aggregator._token_queue,
                 shutdown_event=shutdown_event,
             )
             loop.add_signal_handler(signal.SIGTERM, on_sigterm)
diff --git a/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py b/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py
index c5d742087..1ae3c5ce7 100644
--- a/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py
+++ b/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py
@@ -54,8 +54,10 @@
 
 # Budget for the parallel shard warmup (spawn + transformers import +
 # tokenizer load per worker). A hung load (e.g. a stuck network filesystem)
-# must become a bounded startup error, not wedge service startup.
-_SHARD_WARMUP_TIMEOUT_S = 120.0
+# must become a bounded startup error, not wedge service startup — and the
+# error must fire before the parent's 30 s service-launch budget kills the
+# subprocess, so the diagnostic wins the race.
+_SHARD_WARMUP_TIMEOUT_S = 25.0
 
 # Per-flush ceiling for the LIVE lane. Bounds three things at once: how long
 # the queue lock is held mid-run, how much work an unstoppable in-flight
@@ -357,15 +359,23 @@ def _encode_lengths_inproc(self, texts: list[str]) -> list[int]:
         return [len(tok.tokenize(t)) for t in texts]  # type: ignore[union-attr]
 
     async def count_texts_async(
-        self, texts: list[str], loop: asyncio.AbstractEventLoop
+        self,
+        texts: list[str],
+        loop: asyncio.AbstractEventLoop,
+        *,
+        live: bool = False,
     ) -> list[int]:
         """Per-text token counts for a whole batch without blocking the loop.
 
-        A worker-shard failure propagates and is treated as an incomplete drain.
+        ``live=True`` is the mid-run lane: it never touches the shard
+        processes — it runs on this process's small thread pool with a rayon
+        pool capped to ``live_workers`` cores. The default (drain) path fans
+        out across every shard; a worker-shard failure propagates and is
+        treated as an incomplete drain.
         """
         if not texts:
             return []
-        if self._procs:
+        if self._procs and not live:
             return await self._fan_out(self._procs, texts)
         if self._thread is None:
             raise RuntimeError("BatchTokenizer is closed")
@@ -373,23 +383,6 @@ async def count_texts_async(
             self._thread, self._encode_lengths_inproc, texts
         )
 
-    async def count_texts_live_async(
-        self, texts: list[str], loop: asyncio.AbstractEventLoop
-    ) -> list[int]:
-        """Like ``count_texts_async``, bounded to the in-process live lane.
-
-        Mid-run flushes never touch the shard processes: they run on this
-        process's small thread pool with a rayon pool capped to
-        ``live_workers`` cores. The end-of-run drain uses every shard.
-        """
-        if not texts:
-            return []
-        if self._thread is None:
-            raise RuntimeError("BatchTokenizer is closed")
-        return await loop.run_in_executor(
-            self._thread, self._encode_lengths_inproc, texts
-        )
-
     @staticmethod
     async def _fan_out(procs: list[ProcessPoolExecutor], texts: list[str]) -> list[int]:
         chunks = _even_chunks(texts, len(procs))
@@ -489,15 +482,14 @@ class TokenCounter(Protocol):
     """
 
     async def count_texts_async(
-        self, texts: list[str], loop: asyncio.AbstractEventLoop, /
-    ) -> list[int]:
-        """Per-text token counts for a whole batch (full pool)."""
-        raise NotImplementedError
-
-    async def count_texts_live_async(
-        self, texts: list[str], loop: asyncio.AbstractEventLoop, /
+        self,
+        texts: list[str],
+        loop: asyncio.AbstractEventLoop,
+        /,
+        *,
+        live: bool = False,
     ) -> list[int]:
-        """Per-text token counts via the bounded live lane."""
+        """Per-text token counts (``live=True`` = the bounded mid-run lane)."""
         raise NotImplementedError
 
     async def token_count_message_async(
@@ -596,18 +588,15 @@ async def flush(self, live: bool = False) -> None:
         drain, not as silently dropped samples. Items are detached from the
         buffer up front so concurrent enqueues land in the next flush.
         """
-        count_texts = (
-            self._tokenizer.count_texts_live_async
-            if live
-            else self._tokenizer.count_texts_async
-        )
         async with self._lock:
             if not (self._text or self._msg):
                 return
             if live:
                 cap = _LIVE_FLUSH_MAX_ITEMS
-                text_items, self._text = self._text[:cap], self._text[cap:]
-                msg_items, self._msg = self._msg[:cap], self._msg[cap:]
+                text_items = self._text[:cap]
+                del self._text[:cap]  # in-place: O(cap), not O(backlog).
+                msg_items = self._msg[:cap]
+                del self._msg[:cap]
             else:
                 text_items, self._text = self._text, []
                 msg_items, self._msg = self._msg, []
@@ -618,10 +607,14 @@ async def flush(self, live: bool = False) -> None:
             failure: Exception | None = None
             if text_items:
                 try:
-                    counts = await count_texts([t for t, _ in text_items], self._loop)
+                    counts = await self._tokenizer.count_texts_async(
+                        [t for t, _ in text_items], self._loop, live=live
+                    )
                 except asyncio.CancelledError:
                     if live:
                         self._text[:0] = text_items
+                        self._msg[:0] = msg_items
+                        msg_items = []
                     raise
                 except Exception as exc:  # noqa: BLE001 — isolate phases.
                     failure = exc
diff --git a/tests/unit/async_utils/services/metrics_aggregator/conftest.py b/tests/unit/async_utils/services/metrics_aggregator/conftest.py
index f28b48f7a..38e25945c 100644
--- a/tests/unit/async_utils/services/metrics_aggregator/conftest.py
+++ b/tests/unit/async_utils/services/metrics_aggregator/conftest.py
@@ -61,17 +61,15 @@ def __init__(self, delay: float = 0.0) -> None:
         self._delay = delay
 
     async def count_texts_async(
-        self, texts: list[str], _loop: asyncio.AbstractEventLoop
+        self,
+        texts: list[str],
+        _loop: asyncio.AbstractEventLoop,
+        live: bool = False,
     ) -> list[int]:
         if self._delay:
             await asyncio.sleep(self._delay)
         return [len(t.split()) for t in texts]
 
-    async def count_texts_live_async(
-        self, texts: list[str], loop: asyncio.AbstractEventLoop
-    ) -> list[int]:
-        return await self.count_texts_async(texts, loop)
-
     async def token_count_message_async(
         self,
         content: str,
diff --git a/tests/unit/async_utils/services/metrics_aggregator/test_aggregator.py b/tests/unit/async_utils/services/metrics_aggregator/test_aggregator.py
index bc7d2763b..075e4a0d5 100644
--- a/tests/unit/async_utils/services/metrics_aggregator/test_aggregator.py
+++ b/tests/unit/async_utils/services/metrics_aggregator/test_aggregator.py
@@ -1086,12 +1086,9 @@ async def test_drain_failure_reports_pending_and_finalizes(self, tmp_path):
         loop = asyncio.get_event_loop()
 
         class FailingBatchTokenizer:
-            async def count_texts_async(self, texts, _loop):
+            async def count_texts_async(self, texts, _loop, live=False):
                 raise RuntimeError("tokenizer backend died")
 
-            async def count_texts_live_async(self, texts, _loop):
-                return await self.count_texts_async(texts, _loop)
-
             async def token_count_message_async(self, *args):
                 raise RuntimeError("tokenizer backend died")
 
@@ -1136,13 +1133,10 @@ async def test_drain_timeout_reports_pending_count(self, tmp_path):
         loop = asyncio.get_event_loop()
 
         class BlockingBatchTokenizer:
-            async def count_texts_async(self, texts, _loop):
+            async def count_texts_async(self, texts, _loop, live=False):
                 await asyncio.sleep(10.0)  # exceeds drain timeout
                 return [0] * len(texts)
 
-            async def count_texts_live_async(self, texts, _loop):
-                return await self.count_texts_async(texts, _loop)
-
             async def token_count_message_async(self, *args):
                 await asyncio.sleep(10.0)
                 return 0
diff --git a/tests/unit/async_utils/services/metrics_aggregator/test_main_signal_handler.py b/tests/unit/async_utils/services/metrics_aggregator/test_main_signal_handler.py
index 3428f6f22..13fb1f40b 100644
--- a/tests/unit/async_utils/services/metrics_aggregator/test_main_signal_handler.py
+++ b/tests/unit/async_utils/services/metrics_aggregator/test_main_signal_handler.py
@@ -27,6 +27,7 @@
 import asyncio
 import gc
 import weakref
+from types import SimpleNamespace
 from unittest.mock import AsyncMock, MagicMock
 
 import pytest
@@ -50,7 +51,7 @@ async def test_sigterm_handler_holds_strong_reference_to_finalize_task():
     registry = MagicMock()
     table = MagicMock()
     table.total_tracked_duration_ns = 0
-    n_pending = 0
+    token_queue = SimpleNamespace(pending=0)
 
     # publish_final blocks on an event so we can observe the task
     # mid-execution and exercise the strong-ref contract.
@@ -69,7 +70,7 @@ async def _slow_publish(*args, **kwargs):
         registry=registry,
         publisher=publisher,
         table=table,
-        pending_tokens=lambda: n_pending,
+        token_queue=token_queue,
         shutdown_event=shutdown_event,
     )
 
@@ -123,7 +124,7 @@ async def test_sigterm_handler_refreshes_tracked_duration():
     registry = MagicMock()
     table = MagicMock()
     table.total_tracked_duration_ns = 12345
-    n_pending = 3
+    token_queue = SimpleNamespace(pending=3)
 
     publisher = MagicMock()
     publisher.publish_final = AsyncMock()
@@ -135,7 +136,7 @@ async def test_sigterm_handler_refreshes_tracked_duration():
         registry=registry,
         publisher=publisher,
         table=table,
-        pending_tokens=lambda: n_pending,
+        token_queue=token_queue,
         shutdown_event=shutdown_event,
     )
     on_sigterm()
diff --git a/tests/unit/async_utils/services/metrics_aggregator/test_token_metrics.py b/tests/unit/async_utils/services/metrics_aggregator/test_token_metrics.py
index 1a51c1a18..8bf838c17 100644
--- a/tests/unit/async_utils/services/metrics_aggregator/test_token_metrics.py
+++ b/tests/unit/async_utils/services/metrics_aggregator/test_token_metrics.py
@@ -392,7 +392,7 @@ async def test_live_never_touches_the_shard_pool(self):
             with BatchTokenizer("fake", n_workers=0, live_workers=1) as tok:
                 procs = [_RecordingProc(), _RecordingProc(), _RecordingProc()]
                 tok._procs = procs
-                counts = await tok.count_texts_live_async(["a b", "c"], loop)
+                counts = await tok.count_texts_async(["a b", "c"], loop, live=True)
                 assert counts == [2, 1]
                 assert all(p.chunks == [] for p in procs)
 
@@ -424,8 +424,10 @@ async def test_start_live_flushes_periodically(self):
 
     async def test_live_loop_survives_tokenizer_failure(self):
         class _FailingLive(_CapturingTokenizer):
-            async def count_texts_live_async(self, texts, _loop):
-                raise RuntimeError("live lane boom")
+            async def count_texts_async(self, texts, _loop, live=False):
+                if live:
+                    raise RuntimeError("live lane boom")
+                return await super().count_texts_async(texts, _loop)
 
         loop = asyncio.get_running_loop()
         queue = TokenBatchQueue(_FailingLive(), loop)
@@ -497,9 +499,10 @@ async def test_live_flush_takes_at_most_the_cap(self, monkeypatch):
 
     async def test_live_cancellation_requeues_texts(self):
         class _Hanging(_CapturingTokenizer):
-            async def count_texts_live_async(self, texts, _loop):
-                await asyncio.sleep(30)
-                return [0] * len(texts)
+            async def count_texts_async(self, texts, _loop, live=False):
+                if live:
+                    await asyncio.sleep(30)
+                return await super().count_texts_async(texts, _loop)
 
         loop = asyncio.get_running_loop()
         queue = TokenBatchQueue(_Hanging(), loop)
@@ -515,6 +518,31 @@ async def count_texts_live_async(self, texts, _loop):
         assert await queue.flush_remaining(timeout=1.0) == 0
         assert recorded == [2]
 
+    async def test_live_cancellation_requeues_messages_too(self):
+        """A cancel landing in the text encode must give back BOTH kinds."""
+
+        class _Hanging(_CapturingTokenizer):
+            async def count_texts_async(self, texts, _loop, live=False):
+                if live:
+                    await asyncio.sleep(30)
+                return await super().count_texts_async(texts, _loop)
+
+        loop = asyncio.get_running_loop()
+        queue = TokenBatchQueue(_Hanging(), loop)
+        recorded: list[int] = []
+        queue.enqueue_text("a b", recorded.append)
+        queue.enqueue_message(("hello world", None, None), recorded.append)
+        task = loop.create_task(queue.flush(live=True))
+        await asyncio.sleep(0.01)
+        task.cancel()
+        with pytest.raises(asyncio.CancelledError):
+            await asyncio.wait_for(task, timeout=1.0)
+        assert queue.pending == 2
+        assert len(queue._text) == 1
+        assert len(queue._msg) == 1, "detached messages must be re-queued"
+        assert await queue.flush_remaining(timeout=1.0) == 0
+        assert sorted(recorded) == [2, 2]
+
     async def test_live_message_failure_requeues_message(self):
         class _MsgFailing(_CapturingTokenizer):
             async def token_count_message_async(self, *args):
@@ -554,12 +582,9 @@ def test_preserves_order_and_bounds_chunk_count(self):
 class _CapturingTokenizer:
     """Minimal tokenizer stub for queue tests: whitespace counts, no procs."""
 
-    async def count_texts_async(self, texts, _loop):
+    async def count_texts_async(self, texts, _loop, live=False):
         return [len(t.split()) for t in texts]
 
-    async def count_texts_live_async(self, texts, _loop):
-        return await self.count_texts_async(texts, _loop)
-
     async def token_count_message_async(self, content, reasoning, tool_calls, _loop):
         parts = [p for p in (content, reasoning) if p]
         return len(" ".join(parts).split()) + (len(tool_calls) if tool_calls else 0)
@@ -605,13 +630,10 @@ async def test_flush_remaining_timeout_reports_pending(self):
         """A tokenizer slower than the budget leaves items pending."""
 
         class _BlockingTokenizer:
-            async def count_texts_async(self, texts, _loop):
+            async def count_texts_async(self, texts, _loop, live=False):
                 await asyncio.sleep(10.0)
                 return [0] * len(texts)
 
-            async def count_texts_live_async(self, texts, _loop):
-                return await self.count_texts_async(texts, _loop)
-
             async def token_count_message_async(self, *args):
                 return 0
 
@@ -627,12 +649,9 @@ async def test_flush_remaining_failure_reports_pending(self):
         """A tokenizer error leaves items pending and never raises."""
 
         class _FailingTokenizer:
-            async def count_texts_async(self, texts, _loop):
+            async def count_texts_async(self, texts, _loop, live=False):
                 raise RuntimeError("tokenizer boom")
 
-            async def count_texts_live_async(self, texts, _loop):
-                return await self.count_texts_async(texts, _loop)
-
             async def token_count_message_async(self, *args):
                 raise RuntimeError("tokenizer boom")
 
@@ -647,12 +666,9 @@ async def test_flush_text_failure_does_not_drop_message_items(self):
         """The message phase runs (and records) even when the text batch fails."""
 
         class _TextFailingTokenizer:
-            async def count_texts_async(self, texts, _loop):
+            async def count_texts_async(self, texts, _loop, live=False):
                 raise RuntimeError("text shard died")
 
-            async def count_texts_live_async(self, texts, _loop):
-                return await self.count_texts_async(texts, _loop)
-
             async def token_count_message_async(
                 self, content, reasoning, tool_calls, _loop
             ):

From a1b93868f054b16a42a4a4f952a2c850f22fd1a6 Mon Sep 17 00:00:00 2001
From: Viraat Chandra <viraatc@nvidia.com>
Date: Wed, 10 Jun 2026 17:00:45 -0700
Subject: [PATCH 13/20] chore(metrics): public read-only wiring surface on the
 aggregator

The service entry point builds its SIGTERM handler from the aggregator's
table and token queue; expose both as read-only properties so it no
longer reaches into the private `_table` / `_token_queue` attributes.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../services/metrics_aggregator/__main__.py            |  4 ++--
 .../services/metrics_aggregator/aggregator.py          | 10 ++++++++++
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/src/inference_endpoint/async_utils/services/metrics_aggregator/__main__.py b/src/inference_endpoint/async_utils/services/metrics_aggregator/__main__.py
index b4cd8bba9..e3d136ab0 100644
--- a/src/inference_endpoint/async_utils/services/metrics_aggregator/__main__.py
+++ b/src/inference_endpoint/async_utils/services/metrics_aggregator/__main__.py
@@ -287,8 +287,8 @@ async def main() -> None:
                 loop=loop,
                 registry=registry,
                 publisher=publisher,
-                table=aggregator._table,
-                token_queue=aggregator._token_queue,
+                table=aggregator.table,
+                token_queue=aggregator.token_queue,
                 shutdown_event=shutdown_event,
             )
             loop.add_signal_handler(signal.SIGTERM, on_sigterm)
diff --git a/src/inference_endpoint/async_utils/services/metrics_aggregator/aggregator.py b/src/inference_endpoint/async_utils/services/metrics_aggregator/aggregator.py
index 14bb28189..233342b46 100644
--- a/src/inference_endpoint/async_utils/services/metrics_aggregator/aggregator.py
+++ b/src/inference_endpoint/async_utils/services/metrics_aggregator/aggregator.py
@@ -246,6 +246,16 @@ def _register_triggers(self, streaming: bool) -> None:
             table.add_trigger(SampleField.LAST_RECV_NS, ChunkDeltaTrigger(registry))
             table.add_trigger(SampleField.COMPLETE_NS, TpotTrigger(registry, queue))
 
+    @property
+    def table(self) -> MetricsTable:
+        """The per-sample metrics table (read-only; for service wiring)."""
+        return self._table
+
+    @property
+    def token_queue(self) -> TokenBatchQueue | None:
+        """The token batch queue, if token metrics are enabled."""
+        return self._token_queue
+
     @property
     def pending_tokens(self) -> int:
         """Enqueued tokenizations not yet recorded (the snapshot n_pending_tasks)."""

From 34e0c489fcdcd874a5077e40c5ebc80d9d6a384a Mon Sep 17 00:00:00 2001
From: Viraat Chandra <viraatc@nvidia.com>
Date: Wed, 10 Jun 2026 17:02:41 -0700
Subject: [PATCH 14/20] chore(metrics): drain-timeout default back to 60s
 (review feedback)

The end-of-run drain runs on the full shard pool, so 60s covers
roughly a million buffered tokenizations on a large node; bigger runs
set --metrics-drain-timeout explicitly (0 = unlimited).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 AGENTS.md                                                     | 2 +-
 docs/async_utils/services/metrics_aggregator/DESIGN.md        | 4 ++--
 .../async_utils/services/metrics_aggregator/__main__.py       | 4 ++--
 .../async_utils/services/metrics_aggregator/aggregator.py     | 2 +-
 .../async_utils/services/metrics_aggregator/snapshot.py       | 2 +-
 src/inference_endpoint/config/schema.py                       | 4 ++--
 .../config/templates/concurrency_template_full.yaml           | 2 +-
 .../config/templates/offline_template_full.yaml               | 2 +-
 .../config/templates/online_template_full.yaml                | 2 +-
 tests/unit/commands/test_benchmark.py                         | 2 +-
 10 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/AGENTS.md b/AGENTS.md
index e6182d198..907a25109 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -115,7 +115,7 @@ The aggregator is a separate process (`python -m inference_endpoint.async_utils.
 
 - **Series storage**: each `SeriesSampler` keeps three parallel views: O(1) cheap rollups (count/total/min/max/sum_sq, exact), an HDR Histogram (cheap live percentiles), and an in-memory `array.array` of raw values (for exact percentiles in the `COMPLETE` snapshot). Hot path is `registry.record(name, value)` — no allocation, no I/O.
 - **Counter API**: `registry.increment(name, delta=1)` for sample-event counters. `registry.set_counter(name, value)` only for the two duration counters (`total_duration_ns` max-of-elapsed, `tracked_duration_ns` sum-of-blocks).
-- **Lifecycle**: `INITIALIZE` (constructed, awaiting first `STARTED`) → `LIVE` (run in progress, ticking every `--publish-interval` seconds) → `DRAINING` (set on `ENDED`; tick continues; bounded by `--drain-timeout` budget, default 300 s) → terminal: `COMPLETE` (clean end via `publish_final`, exact stats) **or** `INTERRUPTED` (signal-handler-triggered final via SIGTERM/SIGINT; best-effort partial stats). Drain timeout detected by consumers as `state == COMPLETE and n_pending_tasks > 0`; interrupted runs are detected as `state == INTERRUPTED` directly.
+- **Lifecycle**: `INITIALIZE` (constructed, awaiting first `STARTED`) → `LIVE` (run in progress, ticking every `--publish-interval` seconds) → `DRAINING` (set on `ENDED`; tick continues; bounded by `--drain-timeout` budget, default 60 s) → terminal: `COMPLETE` (clean end via `publish_final`, exact stats) **or** `INTERRUPTED` (signal-handler-triggered final via SIGTERM/SIGINT; best-effort partial stats). Drain timeout detected by consumers as `state == COMPLETE and n_pending_tasks > 0`; interrupted runs are detected as `state == INTERRUPTED` directly.
 - **Final delivery is dual-path with separated concerns**: `publish_final` atomically writes `final_snapshot.json` (`tmp + fsync(file) + rename + fsync(parent_dir)`) — this is the **primary** Report source — AND emits the terminal-state snapshot over pub/sub as a TUI shutdown signal. Each path is wrapped in its own try/except so one failure cannot suppress the other. Main process consumer reads `final_snapshot.json` (via `json.loads` to dict, no Struct decode); falls back to the subscriber's `latest` live snapshot only if the file is missing (e.g. SIGKILL / OOM before the signal handler ran). The dict form is the canonical consumer contract (see `snapshot_to_dict`).
 - **Histogram bucket edges are dynamic per snapshot**: log-spaced over the observed `[min, max]`. Bucket count is fixed at construction; consumers MUST re-render from the snapshot's `(lo, hi, count)` triples each frame and MUST NOT track bucket-by-index across snapshots.
 
diff --git a/docs/async_utils/services/metrics_aggregator/DESIGN.md b/docs/async_utils/services/metrics_aggregator/DESIGN.md
index 882683968..207c83889 100644
--- a/docs/async_utils/services/metrics_aggregator/DESIGN.md
+++ b/docs/async_utils/services/metrics_aggregator/DESIGN.md
@@ -38,7 +38,7 @@ INITIALIZE ──STARTED──► LIVE ──ENDED──► DRAINING ──► C
 - **LIVE**: the publisher tick task emits a snapshot every
   `--publish-interval` seconds (default 0.25 s).
 - **DRAINING**: entered on `ENDED`; the buffered tokenizations are flushed,
-  bounded by the `--drain-timeout` budget (default 300 s; `0` = unlimited).
+  bounded by the `--drain-timeout` budget (default 60 s; `0` = unlimited).
 - The ENDED path runs inside a finalization boundary: whatever the drain does
   — finish, time out, or fail — `publish_final` and the shutdown signal always
   run. A tokenizer failure can degrade the snapshot (see the
@@ -174,7 +174,7 @@ COMPLETE event ─► TokenTrigger.fire ─► queue.enqueue(text, on_count)   [
 | `--metrics-socket`               | required | Snapshot PUB socket name                            |
 | `--metrics-output-dir`           | required | Directory for `final_snapshot.json`                 |
 | `--publish-interval`             | 0.25     | Live snapshot cadence (seconds)                     |
-| `--drain-timeout`                | 300.0    | End-of-run tokenize budget (`0` = unlimited)        |
+| `--drain-timeout`                | 60.0     | End-of-run tokenize budget (`0` = unlimited)        |
 | `--tokenizer`                    | none     | HF name or local path; unset disables token metrics |
 | `--tokenizer-workers`            | 2        | Live in-process threads (`0` = defer all to drain)  |
 | `--streaming`                    | off      | Register TTFT/chunk-delta/TPOT triggers             |
diff --git a/src/inference_endpoint/async_utils/services/metrics_aggregator/__main__.py b/src/inference_endpoint/async_utils/services/metrics_aggregator/__main__.py
index e3d136ab0..2e042943c 100644
--- a/src/inference_endpoint/async_utils/services/metrics_aggregator/__main__.py
+++ b/src/inference_endpoint/async_utils/services/metrics_aggregator/__main__.py
@@ -133,11 +133,11 @@ async def main() -> None:
     parser.add_argument(
         "--drain-timeout",
         type=float,
-        default=300.0,
+        default=60.0,
         help=(
             "Wall-clock budget (seconds) to finish tokenizing buffered samples "
             "after ENDED before the aggregator emits the final snapshot with "
-            "n_pending_tasks > 0 (default: 300.0; 0 = wait indefinitely). Increase "
+            "n_pending_tasks > 0 (default: 60.0; 0 = wait indefinitely). Increase "
             "for very large datasets where the end-of-run tokenize batch is big."
         ),
     )
diff --git a/src/inference_endpoint/async_utils/services/metrics_aggregator/aggregator.py b/src/inference_endpoint/async_utils/services/metrics_aggregator/aggregator.py
index 233342b46..686e5eb2f 100644
--- a/src/inference_endpoint/async_utils/services/metrics_aggregator/aggregator.py
+++ b/src/inference_endpoint/async_utils/services/metrics_aggregator/aggregator.py
@@ -96,7 +96,7 @@ class MetricCounterKey(str, Enum):
 _TOKEN_HDR_LOW: Final[int] = 1
 _TOKEN_HDR_HIGH: Final[int] = 10_000_000  # 10M tokens
 
-_DEFAULT_DRAIN_TIMEOUT_S: Final[float] = 300.0
+_DEFAULT_DRAIN_TIMEOUT_S: Final[float] = 60.0
 
 
 class MetricsAggregatorService(ZmqMessageSubscriber[EventRecord]):
diff --git a/src/inference_endpoint/async_utils/services/metrics_aggregator/snapshot.py b/src/inference_endpoint/async_utils/services/metrics_aggregator/snapshot.py
index e233f36a3..eacac94f5 100644
--- a/src/inference_endpoint/async_utils/services/metrics_aggregator/snapshot.py
+++ b/src/inference_endpoint/async_utils/services/metrics_aggregator/snapshot.py
@@ -45,7 +45,7 @@ class SessionState(str, Enum):
     LIVE        → run in progress; tick task publishing live HDR-derived stats.
     DRAINING    → ``SessionEventType.ENDED`` has been received; the aggregator
                   is tokenizing the buffered samples (bounded by the
-                  ``--drain-timeout`` budget, default 300 s). Tick task
+                  ``--drain-timeout`` budget, default 60 s). Tick task
                   continues at this stage, still HDR-derived; no new events
                   will arrive.
     COMPLETE    → terminal clean state. The ``publish_final()`` snapshot
diff --git a/src/inference_endpoint/config/schema.py b/src/inference_endpoint/config/schema.py
index 6a8b9b872..0a59074f5 100644
--- a/src/inference_endpoint/config/schema.py
+++ b/src/inference_endpoint/config/schema.py
@@ -584,11 +584,11 @@ class DrainConfig(BaseModel):
             ),
         ),
     ] = Field(
-        300.0,
+        60.0,
         ge=0,
         description=(
             "Wall-clock budget (seconds) to finish tokenizing buffered samples "
-            "after ENDED (default: 300.0; 0 = unlimited)."
+            "after ENDED (default: 60.0; 0 = unlimited)."
         ),
     )
 
diff --git a/src/inference_endpoint/config/templates/concurrency_template_full.yaml b/src/inference_endpoint/config/templates/concurrency_template_full.yaml
index 5132f5b0e..75feab6fb 100644
--- a/src/inference_endpoint/config/templates/concurrency_template_full.yaml
+++ b/src/inference_endpoint/config/templates/concurrency_template_full.yaml
@@ -79,7 +79,7 @@ settings:
     warmup_timeout_s: 240.0  # Warmup drain timeout in seconds (None = wait indefinitely)
     performance_timeout_s: 240.0  # Performance drain timeout in seconds (None = wait indefinitely)
     accuracy_timeout_s: null  # Accuracy drain timeout in seconds (None = wait indefinitely)
-    metrics_drain_timeout_s: 300.0  # Wall-clock budget (seconds) to finish tokenizing buffered samples after ENDED (default: 300.0; 0 = unlimited).
+    metrics_drain_timeout_s: 60.0  # Wall-clock budget (seconds) to finish tokenizing buffered samples after ENDED (default: 60.0; 0 = unlimited).
   warmup:
     enabled: false  # Enable warmup phase before performance run
     n_requests: null  # Warmup request count (None = full dataset once)
diff --git a/src/inference_endpoint/config/templates/offline_template_full.yaml b/src/inference_endpoint/config/templates/offline_template_full.yaml
index e3ec95284..3ff1ccd17 100644
--- a/src/inference_endpoint/config/templates/offline_template_full.yaml
+++ b/src/inference_endpoint/config/templates/offline_template_full.yaml
@@ -79,7 +79,7 @@ settings:
     warmup_timeout_s: 240.0  # Warmup drain timeout in seconds (None = wait indefinitely)
     performance_timeout_s: 240.0  # Performance drain timeout in seconds (None = wait indefinitely)
     accuracy_timeout_s: null  # Accuracy drain timeout in seconds (None = wait indefinitely)
-    metrics_drain_timeout_s: 300.0  # Wall-clock budget (seconds) to finish tokenizing buffered samples after ENDED (default: 300.0; 0 = unlimited).
+    metrics_drain_timeout_s: 60.0  # Wall-clock budget (seconds) to finish tokenizing buffered samples after ENDED (default: 60.0; 0 = unlimited).
   warmup:
     enabled: false  # Enable warmup phase before performance run
     n_requests: null  # Warmup request count (None = full dataset once)
diff --git a/src/inference_endpoint/config/templates/online_template_full.yaml b/src/inference_endpoint/config/templates/online_template_full.yaml
index 73c0b69d4..1287b99af 100644
--- a/src/inference_endpoint/config/templates/online_template_full.yaml
+++ b/src/inference_endpoint/config/templates/online_template_full.yaml
@@ -79,7 +79,7 @@ settings:
     warmup_timeout_s: 240.0  # Warmup drain timeout in seconds (None = wait indefinitely)
     performance_timeout_s: 240.0  # Performance drain timeout in seconds (None = wait indefinitely)
     accuracy_timeout_s: null  # Accuracy drain timeout in seconds (None = wait indefinitely)
-    metrics_drain_timeout_s: 300.0  # Wall-clock budget (seconds) to finish tokenizing buffered samples after ENDED (default: 300.0; 0 = unlimited).
+    metrics_drain_timeout_s: 60.0  # Wall-clock budget (seconds) to finish tokenizing buffered samples after ENDED (default: 60.0; 0 = unlimited).
   warmup:
     enabled: false  # Enable warmup phase before performance run
     n_requests: null  # Warmup request count (None = full dataset once)
diff --git a/tests/unit/commands/test_benchmark.py b/tests/unit/commands/test_benchmark.py
index 7d43017dc..4109ebfc2 100644
--- a/tests/unit/commands/test_benchmark.py
+++ b/tests/unit/commands/test_benchmark.py
@@ -489,7 +489,7 @@ def test_defaults(self):
         assert cfg.warmup_timeout_s == 240.0
         assert cfg.performance_timeout_s == 240.0
         assert cfg.accuracy_timeout_s is None
-        assert cfg.metrics_drain_timeout_s == 300.0
+        assert cfg.metrics_drain_timeout_s == 60.0
 
     @pytest.mark.unit
     @pytest.mark.parametrize(

From 033d724b797bfb351b563f8f7661df8376326de9 Mon Sep 17 00:00:00 2001
From: Viraat Chandra <viraatc@nvidia.com>
Date: Wed, 10 Jun 2026 17:03:37 -0700
Subject: [PATCH 15/20] chore(metrics): drop dead local in the live-cancel
 handler

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../async_utils/services/metrics_aggregator/token_metrics.py     | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py b/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py
index 1ae3c5ce7..daa3f1424 100644
--- a/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py
+++ b/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py
@@ -614,7 +614,6 @@ async def flush(self, live: bool = False) -> None:
                     if live:
                         self._text[:0] = text_items
                         self._msg[:0] = msg_items
-                        msg_items = []
                     raise
                 except Exception as exc:  # noqa: BLE001 — isolate phases.
                     failure = exc

From 6b704332d0cdefb26929186281d221a69492652d Mon Sep 17 00:00:00 2001
From: Viraat Chandra <viraatc@nvidia.com>
Date: Wed, 10 Jun 2026 17:42:20 -0700
Subject: [PATCH 16/20] refactor(metrics): single-source service defaults in
 schema; tighten docs

metrics_tokenizer_workers returns to DrainConfig (default 2, ge=0; 0 =
defer all to drain) and execute.py forwards it again. --drain-timeout and
--tokenizer-workers become required service args; the aggregator ctor and
BatchTokenizer lose their duplicated defaults. Docs and comments trimmed.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 AGENTS.md                                     |   2 +-
 .../services/metrics_aggregator/DESIGN.md     | 246 +++++++-----------
 .../services/metrics_aggregator/__main__.py   |  97 ++-----
 .../services/metrics_aggregator/aggregator.py |  44 +---
 .../services/metrics_aggregator/snapshot.py   |   2 +-
 .../metrics_aggregator/token_metrics.py       | 111 ++++----
 .../commands/benchmark/execute.py             |   6 +
 src/inference_endpoint/config/schema.py       |  18 ++
 .../templates/concurrency_template_full.yaml  |   1 +
 .../templates/offline_template_full.yaml      |   1 +
 .../templates/online_template_full.yaml       |   1 +
 .../services/metrics_aggregator/conftest.py   |   2 +
 .../test_aggregator_error_handler.py          |   1 +
 .../metrics_aggregator/test_token_metrics.py  |  24 +-
 tests/unit/commands/test_benchmark.py         |  15 +-
 15 files changed, 222 insertions(+), 349 deletions(-)

diff --git a/AGENTS.md b/AGENTS.md
index 907a25109..c9e0c7f41 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -115,7 +115,7 @@ The aggregator is a separate process (`python -m inference_endpoint.async_utils.
 
 - **Series storage**: each `SeriesSampler` keeps three parallel views: O(1) cheap rollups (count/total/min/max/sum_sq, exact), an HDR Histogram (cheap live percentiles), and an in-memory `array.array` of raw values (for exact percentiles in the `COMPLETE` snapshot). Hot path is `registry.record(name, value)` — no allocation, no I/O.
 - **Counter API**: `registry.increment(name, delta=1)` for sample-event counters. `registry.set_counter(name, value)` only for the two duration counters (`total_duration_ns` max-of-elapsed, `tracked_duration_ns` sum-of-blocks).
-- **Lifecycle**: `INITIALIZE` (constructed, awaiting first `STARTED`) → `LIVE` (run in progress, ticking every `--publish-interval` seconds) → `DRAINING` (set on `ENDED`; tick continues; bounded by `--drain-timeout` budget, default 60 s) → terminal: `COMPLETE` (clean end via `publish_final`, exact stats) **or** `INTERRUPTED` (signal-handler-triggered final via SIGTERM/SIGINT; best-effort partial stats). Drain timeout detected by consumers as `state == COMPLETE and n_pending_tasks > 0`; interrupted runs are detected as `state == INTERRUPTED` directly.
+- **Lifecycle**: `INITIALIZE` (constructed, awaiting first `STARTED`) → `LIVE` (run in progress, ticking every `--publish-interval` seconds) → `DRAINING` (set on `ENDED`; tick continues; bounded by the `--drain-timeout` budget — schema default 60 s) → terminal: `COMPLETE` (clean end via `publish_final`, exact stats) **or** `INTERRUPTED` (signal-handler-triggered final via SIGTERM/SIGINT; best-effort partial stats). Drain timeout detected by consumers as `state == COMPLETE and n_pending_tasks > 0`; interrupted runs are detected as `state == INTERRUPTED` directly.
 - **Final delivery is dual-path with separated concerns**: `publish_final` atomically writes `final_snapshot.json` (`tmp + fsync(file) + rename + fsync(parent_dir)`) — this is the **primary** Report source — AND emits the terminal-state snapshot over pub/sub as a TUI shutdown signal. Each path is wrapped in its own try/except so one failure cannot suppress the other. Main process consumer reads `final_snapshot.json` (via `json.loads` to dict, no Struct decode); falls back to the subscriber's `latest` live snapshot only if the file is missing (e.g. SIGKILL / OOM before the signal handler ran). The dict form is the canonical consumer contract (see `snapshot_to_dict`).
 - **Histogram bucket edges are dynamic per snapshot**: log-spaced over the observed `[min, max]`. Bucket count is fixed at construction; consumers MUST re-render from the snapshot's `(lo, hi, count)` triples each frame and MUST NOT track bucket-by-index across snapshots.
 
diff --git a/docs/async_utils/services/metrics_aggregator/DESIGN.md b/docs/async_utils/services/metrics_aggregator/DESIGN.md
index 207c83889..42929f5a4 100644
--- a/docs/async_utils/services/metrics_aggregator/DESIGN.md
+++ b/docs/async_utils/services/metrics_aggregator/DESIGN.md
@@ -1,189 +1,119 @@
-# Metrics Aggregator Service — Design Document
-
-## Overview
-
-The metrics aggregator is a **subprocess** (`python -m
-inference_endpoint.async_utils.services.metrics_aggregator`) that subscribes to
-the EventRecord pub/sub stream, folds per-sample events into a
-`MetricsRegistry` (counters + HDR-histogram series + raw values), and publishes
-`MetricsSnapshot` frames over an IPC PUB socket at a fixed cadence. At
-end-of-run it atomically writes `final_snapshot.json`, which is the **primary**
-source for `Report`; the terminal pub/sub frame is only a TUI "run finished"
-signal.
-
-This document covers the service's lifecycle and, in depth, the **token
-metrics pipeline** — how ISL/OSL/TPOT tokenization keeps pace with
-high-completion-rate runs.
-
-## Module Layout
-
-| File               | Purpose                                                                         |
-| ------------------ | ------------------------------------------------------------------------------- |
-| `__main__.py`      | Subprocess entry: argparse, strict tokenizer startup, lifecycle wiring, SIGTERM |
-| `aggregator.py`    | `MetricsAggregatorService` — event router, session state, drain                 |
-| `registry.py`      | `MetricsRegistry`, `CounterSampler`, `SeriesSampler`                            |
-| `snapshot.py`      | `MetricsSnapshot` wire schema, `SessionState`, msgpack codec                    |
-| `publisher.py`     | `MetricsPublisher` — tick task + atomic final-snapshot write                    |
-| `subscriber.py`    | `MetricsSnapshotSubscriber` — main-process consumer                             |
-| `metrics_table.py` | In-flight sample rows + trigger dispatch (TTFT/TPOT/ISL/OSL)                    |
-| `token_metrics.py` | `BatchTokenizer` (sharded batch tokenization) + `TokenBatchQueue`               |
+# Metrics Aggregator Service — Design
+
+The metrics aggregator is a subprocess (`python -m
+inference_endpoint.async_utils.services.metrics_aggregator`) that subscribes
+to the EventRecord stream, folds per-sample events into a `MetricsRegistry`,
+and publishes `MetricsSnapshot` frames over IPC PUB at a fixed cadence. At
+end-of-run it atomically writes `final_snapshot.json` — the **primary** source
+for `Report`; the terminal pub/sub frame is only a TUI "run finished" signal.
 
 ## Lifecycle
 
 ```
 INITIALIZE ──STARTED──► LIVE ──ENDED──► DRAINING ──► COMPLETE
-                                                └──► INTERRUPTED  (SIGTERM/SIGINT)
+                                                └──► INTERRUPTED  (SIGTERM)
 ```
 
-- **LIVE**: the publisher tick task emits a snapshot every
-  `--publish-interval` seconds (default 0.25 s).
-- **DRAINING**: entered on `ENDED`; the buffered tokenizations are flushed,
-  bounded by the `--drain-timeout` budget (default 60 s; `0` = unlimited).
-- The ENDED path runs inside a finalization boundary: whatever the drain does
-  — finish, time out, or fail — `publish_final` and the shutdown signal always
-  run. A tokenizer failure can degrade the snapshot (see the
-  `n_pending_tasks` contract below) but can never hang the subprocess.
-- **INTERRUPTED**: a signal handler writes a best-effort partial final
-  snapshot so `Report` can distinguish a killed run from a clean one.
-
-## Token Metrics Pipeline
-
-ISL, OSL, and TPOT all require running the HF tokenizer over prompt or
-completion text. With streaming on, each completed sample needs up to three
-tokenizer passes, so at high completion rates tokenization is the service's
-dominant CPU cost — and a per-event dispatch model cannot keep up: work
-arriving faster than it drains accumulates an unbounded backlog that must be
-paid at end-of-run. The pipeline is therefore built around two ideas:
-**defer-to-flush batching** and **process-sharded batch encoding**.
+The ENDED path runs inside a finalization boundary: whatever the drain does —
+finish, time out, or fail — `publish_final` and the shutdown signal always
+run. A tokenizer failure can degrade the snapshot (see the `n_pending_tasks`
+contract) but can never hang the subprocess. SIGTERM writes a best-effort
+partial snapshot tagged `INTERRUPTED`.
+
+## Token metrics pipeline
+
+ISL/OSL/TPOT require tokenizer passes per completed sample; at high completion
+rates a per-event dispatch model accumulates an unbounded backlog. The
+pipeline batches instead: **defer-to-flush** + **process-sharded encoding**.
 
 ### Defer-to-flush (`TokenBatchQueue`)
 
-Token triggers do no work at event time. `fire()` appends
-`(text, on_count)` — or `(message_parts, on_count)` for chat-template items —
-to a buffer, an O(1) operation with no event-loop tasks. The buffer is cleared
-in batches at exactly two points:
-
-1. **The queue's own live loop** — `start_live(interval)` flushes
-   periodically (at the publish cadence) through the tokenizer's **in-process
-   live lane**: a small thread pool of `--tokenizer-workers` threads
-   (default 2) whose rayon pool is capped to the same width, taking at most
-   `_LIVE_FLUSH_MAX_ITEMS` per flush so the queue lock is never held for a
-   long encode. Live flushes never touch the shard processes; they run inside
-   the aggregator process, wherever the parent placed it.
-   `--tokenizer-workers 0` disables mid-run tokenization entirely. Failures
-   are logged once and never stop the loop — failed or cancelled live items
-   are **re-queued** so the drain retries them.
+Triggers do no work at event time — `fire()` appends `(text, on_count)` to a
+buffer, O(1), no tasks. The buffer is cleared at two points:
+
+1. **Live loop** — `start_live(interval)` flushes periodically through the
+   tokenizer's in-process lane: `--tokenizer-workers` threads, rayon capped
+   to the same width, at most `_LIVE_FLUSH_MAX_ITEMS` per flush. Never
+   touches the shard processes. `0` disables mid-run tokenization. Failed or
+   cancelled live items are **re-queued** — the drain retries them.
 2. **End-of-run** — `flush_remaining(timeout)` stops the live loop and drains
-   everything still buffered through **every** shard, bounded by the drain
-   budget. The publisher knows nothing about tokenization — it only reads
-   `(state, n_pending_tasks)`.
-
-`flush()` serializes under an asyncio lock and detaches the buffer up front,
-so enqueues that race a flush land in the next one. Failure isolation is
-layered: the plain-text phase and the chat-template phase fail independently
-(in drain mode they run on separate executors, so a dead text shard must not
-drop message items), a raising recorder callback is logged without aborting
-the rest of the batch, and the first error is re-raised only after both
-phases ran. Live-mode failures and cancellations re-queue the detached items
-(a mid-run hiccup never loses samples); drain-mode failures are terminal —
-the items stay counted in `pending`. `flush_remaining` never raises — a
-timeout or tokenizer failure becomes a logged, non-zero pending count.
+   everything left through every shard, bounded by the drain budget.
+
+`flush()` serializes under an asyncio lock and detaches the buffer up front.
+The text and chat-template phases fail independently; a raising recorder is
+logged without aborting the batch. Drain failures are terminal — items stay
+counted in `pending`. `flush_remaining` never raises.
 
 ### Sharded batch encoding (`BatchTokenizer`)
 
-The end-of-run drain hands the whole buffer to `count_texts_async`, which splits it into
-contiguous chunks and fans them out across worker **processes**, one pinned to
-each block of `CORES_PER_WORKER` (8) cores. Why this shape:
-
-- Each worker runs the raw `tokenizers` backend's `encode_batch_fast` — Rust,
-  rayon-parallel, no Python-per-text cost. Batching amortizes the
-  submit/result overhead over thousands of texts.
-- A single BPE rayon pool is memory-bound and saturates at ~8 cores; more
-  threads oversubscribe and, on multi-socket parts, cross the NUMA boundary.
-  Sharding across processes pinned to disjoint 8-core blocks (affinity set
-  **before** the backend loads, so each rayon pool sizes itself to its block
-  and stays NUMA-local) is how the whole machine is used.
-- Workers are spawn-context processes with module-level entry points (pickled
-  by name), warmed in parallel at construction so N tokenizer loads do not
-  serialize (the warmup wait is bounded — a hung load is a startup error, not
-  a wedge), and they ignore SIGINT — Ctrl-C goes to the whole process group,
-  and worker lifetime must stay under the parent drain's control.
-
-The shard pool has no CLI knob: it always auto-sizes to one shard per
-8-core block of the allowed CPU universe (always at least one).
-`--tokenizer-workers` sizes the **live** in-process thread lane instead
-(default 2; `0` = no mid-run tokenization). There is no implicit fallback: an
-environment that cannot shard — no fast Rust backend, a failed or over-budget
-warmup — is a startup error, because a silent in-process slow path cannot
-keep up with completions and would surface much later as an incomplete drain.
-Platforms without a CPU-affinity API (e.g. macOS) still shard at full speed,
-just unpinned: blocks are sized from the online CPU count and each worker
-caps its rayon pool to the block size instead of pinning.
-
-Chat-template items (tool-call outputs) take a separate in-process thread:
-they are rare relative to the batched flush, and `apply_chat_template` is
-Python/Jinja — sharding buys nothing. A template baseline (the empty
-assistant-message frame) is computed once and subtracted so only the payload
-is counted.
-
-### CPU affinity: the tokenizer stage is post-run
-
-The benchmark parent pins itself to the loadgen cores (the fastest
-perf-ranked physical cores) before launching services, and subprocesses
-inherit that narrow mask. The tokenizer's heavy work happens **after** the
-run, so the run-time core partition does not apply to it — but the aggregator
-itself must not move: `_setup_shards` probes the full allowed universe via
-`expand_to_all_online_cpus()` (see `endpoint_client/cpu_affinity.py`; the
-kernel still clamps to the cgroup/Slurm cpuset) **and then restores the
-inherited mask**, so the event loop, the publisher, and the live tokenizer
-threads stay exactly where the parent placed them. Only the drain-phase shard
-children, which pin themselves to their own 8-core blocks, span the whole
-machine — and they are idle until `ENDED`.
+The drain fans the whole buffer out across worker **processes**, one pinned
+per `CORES_PER_WORKER` (8) core block. Each worker runs the raw `tokenizers`
+backend's `encode_batch_fast` (Rust, rayon); a single BPE rayon pool
+saturates at ~8 cores, so disjoint pinned blocks are how the whole machine is
+used. Workers are spawn-context, warmed in parallel at construction (bounded
+— a hung load is a startup error), and ignore SIGINT.
 
-### The `n_pending_tasks` contract
+The shard pool has no knob: it auto-sizes to one shard per 8-core block of
+the allowed CPU universe. There is no fallback — no fast Rust backend, or a
+failed/over-budget warmup, is a startup error, because an in-process slow
+path cannot keep up and would surface much later as an incomplete drain.
+Platforms without an affinity API (macOS) shard unpinned; each worker caps
+its rayon pool to the block size instead.
 
-`TokenBatchQueue.pending` counts enqueued-but-not-yet-recorded items and is
-surfaced on every snapshot as `n_pending_tasks`. In the **final** snapshot:
+Chat-template items (tool calls) run on the in-process thread lane —
+`apply_chat_template` is Python/Jinja; sharding buys nothing.
 
-- `state == complete && n_pending_tasks == 0` — clean run, token series exact.
-- `state == complete && n_pending_tasks > 0` — **incomplete drain**: the
-  end-of-run flush ran out of budget or the tokenizer failed; token-derived
-  series are missing exactly that many samples. `Report` renders a warning.
+### CPU affinity: tokenization is post-run
 
-Items dropped by a failed flush are intentionally _not_ removed from the
-pending count — under-reporting an incomplete drain would silently rebadge it
-as a clean run.
+The parent pins itself to the loadgen cores and children inherit that narrow
+mask. `_setup_shards` probes the full allowed universe via
+`expand_to_all_online_cpus()` (cgroup/Slurm-clamped) for the block math,
+**then restores the inherited mask** — the aggregator stays where the parent
+placed it; only the drain-phase shard children span the machine, and they
+are idle until `ENDED`.
+
+### The `n_pending_tasks` contract
+
+`TokenBatchQueue.pending` (enqueued-but-not-recorded) is surfaced on every
+snapshot as `n_pending_tasks`. In the final snapshot:
+
+- `state == complete && n_pending_tasks == 0` — clean run, exact series.
+- `state == complete && n_pending_tasks > 0` — **incomplete drain** (budget
+  exhausted or tokenizer failed); `Report` renders a warning. Failed items
+  are deliberately not removed from the count — under-reporting would
+  rebadge an incomplete drain as clean.
 
 ### Data flow
 
 ```
-COMPLETE event ─► TokenTrigger.fire ─► queue.enqueue(text, on_count)   [O(1)]
-                                            │
-   live loop (0.25 s) ── flush(live) ───────┤─► in-process thread pool
-                                            │   (rayon capped to --tokenizer-workers)
-   ENDED drain (budgeted) ── flush() ───────┘─► chunks ─► N pinned worker procs
-                                                │          (encode_batch_fast)
-                                                └─► on_count(n) ─► registry.record()
+COMPLETE event ─► trigger.fire ─► queue.enqueue(text, on_count)        [O(1)]
+                                       │
+  live loop (publish cadence) ─ flush(live) ─► in-process threads (rayon-capped)
+  ENDED drain (budgeted) ────── flush() ─────► chunks ─► N pinned worker procs
+                                                  └─► on_count(n) ─► registry.record()
 ```
 
-## CLI Interface
+## CLI
+
+| Flag                             | Default           | Purpose                                             |
+| -------------------------------- | ----------------- | --------------------------------------------------- |
+| `--socket-dir` / `--socket-name` | required          | EventRecord SUB socket                              |
+| `--metrics-socket`               | required          | Snapshot PUB socket name                            |
+| `--metrics-output-dir`           | required          | Directory for `final_snapshot.json`                 |
+| `--publish-interval`             | 0.25              | Live snapshot cadence (seconds)                     |
+| `--drain-timeout`                | required (schema) | End-of-run tokenize budget (`0` = unlimited)        |
+| `--tokenizer`                    | none              | HF name or local path; unset disables token metrics |
+| `--tokenizer-workers`            | required (schema) | Live in-process threads (`0` = defer all to drain)  |
+| `--streaming`                    | off               | Register TTFT/chunk-delta/TPOT triggers             |
 
-| Flag                             | Default  | Purpose                                             |
-| -------------------------------- | -------- | --------------------------------------------------- |
-| `--socket-dir` / `--socket-name` | required | EventRecord SUB socket                              |
-| `--metrics-socket`               | required | Snapshot PUB socket name                            |
-| `--metrics-output-dir`           | required | Directory for `final_snapshot.json`                 |
-| `--publish-interval`             | 0.25     | Live snapshot cadence (seconds)                     |
-| `--drain-timeout`                | 60.0     | End-of-run tokenize budget (`0` = unlimited)        |
-| `--tokenizer`                    | none     | HF name or local path; unset disables token metrics |
-| `--tokenizer-workers`            | 2        | Live in-process threads (`0` = defer all to drain)  |
-| `--streaming`                    | off      | Register TTFT/chunk-delta/TPOT triggers             |
+`--drain-timeout` and `--tokenizer-workers` carry no service-side defaults:
+the benchmark always forwards them from `config/schema.py`
+(`--metrics-drain-timeout`, `--metrics-tokenizer-workers`), the single source
+of truth for their values.
 
 ## References
 
 - [docs/async_utils/services/DESIGN.md](../DESIGN.md) — the EventRecord
   pub/sub system this service subscribes to.
 - [docs/PERF_ARCHITECTURE.md](../../../PERF_ARCHITECTURE.md) — CPU pinning
-  strategy for the loadgen/worker hot path.
-- AGENTS.md "Metrics Aggregator subprocess" — the condensed contract summary
-  for AI agents.
+  for the loadgen/worker hot path.
diff --git a/src/inference_endpoint/async_utils/services/metrics_aggregator/__main__.py b/src/inference_endpoint/async_utils/services/metrics_aggregator/__main__.py
index 2e042943c..58733c087 100644
--- a/src/inference_endpoint/async_utils/services/metrics_aggregator/__main__.py
+++ b/src/inference_endpoint/async_utils/services/metrics_aggregator/__main__.py
@@ -49,27 +49,18 @@ def _make_sigterm_handler(
 ) -> tuple[Callable[[], None], set[asyncio.Task]]:
     """Build the SIGTERM handler that writes the INTERRUPTED final snapshot.
 
-    Returns ``(handler, pending_tasks)``. ``pending_tasks`` is the
-    strong-reference container that keeps spawned finalize tasks alive
-    while they run: asyncio tracks tasks only by weakref, so a task
-    whose only reference is the local variable inside the handler can
-    be garbage-collected mid-execution (per Python's asyncio docs).
-    Each spawned task self-removes from the set via
-    ``add_done_callback`` once it completes.
-
-    Exposed at module level (rather than nested in ``main()``) so the
-    GC-safety contract is unit-testable without driving the whole
-    subprocess lifecycle.
+    Returns ``(handler, pending_tasks)``: asyncio holds tasks only by
+    weakref, so the handler's finalize task must live in this
+    strong-reference set until done. Module-level so the GC-safety
+    contract is unit-testable.
     """
     pending_tasks: set[asyncio.Task] = set()
 
     async def _signal_finalize() -> None:
         try:
-            # Mirror the ENDED-driven path: refresh tracked_duration_ns
-            # from the table BEFORE publish_final, otherwise an
-            # interrupted run whose STOP_PERFORMANCE_TRACKING never
-            # fired would report duration_ns=0 and QPS=N/A in the final
-            # report even after processing many tracked samples.
+            # Refresh tracked_duration_ns before publish_final (mirrors the
+            # ENDED path) — otherwise an interrupted run whose
+            # STOP_PERFORMANCE_TRACKING never fired reports QPS=N/A.
             registry.set_counter(
                 MetricCounterKey.TRACKED_DURATION_NS.value,
                 table.total_tracked_duration_ns,
@@ -133,12 +124,11 @@ async def main() -> None:
     parser.add_argument(
         "--drain-timeout",
         type=float,
-        default=60.0,
+        required=True,
         help=(
             "Wall-clock budget (seconds) to finish tokenizing buffered samples "
-            "after ENDED before the aggregator emits the final snapshot with "
-            "n_pending_tasks > 0 (default: 60.0; 0 = wait indefinitely). Increase "
-            "for very large datasets where the end-of-run tokenize batch is big."
+            "after ENDED (0 = wait indefinitely). The benchmark forwards "
+            "--metrics-drain-timeout; the default lives in config/schema.py."
         ),
     )
     parser.add_argument(
@@ -162,12 +152,12 @@ async def main() -> None:
     parser.add_argument(
         "--tokenizer-workers",
         type=int,
-        default=2,
+        required=True,
         help=(
             "In-process tokenizer threads for live (mid-run) ISL/OSL/TPOT "
-            "(default: 2; 0 = no mid-run tokenization, everything defers "
-            "to the end-of-run drain). The drain always uses the auto-sized "
-            "sharded pool — one worker process per 8-core block."
+            "(0 = defer everything to the end-of-run drain, which always uses "
+            "the auto-sized sharded pool). The benchmark forwards "
+            "--metrics-tokenizer-workers; the default lives in config/schema.py."
         ),
     )
     parser.add_argument(
@@ -194,12 +184,8 @@ async def main() -> None:
     if args.tokenizer_workers < 0:
         raise SystemExit("FATAL: --tokenizer-workers must be >= 0")
 
-    # The parent owns directory setup — `commands/benchmark/execute.py`
-    # creates `<report_dir>/metrics/` and validates it before launching
-    # this subprocess. Validate here as a fail-fast contract check so a
-    # misbehaving launcher (or a manual invocation) surfaces a clear
-    # error in this subprocess's stderr instead of crashing later on
-    # the atomic-write path.
+    # The parent (commands/benchmark/execute.py) owns directory creation;
+    # fail fast here so a bad launcher errors now, not on the atomic write.
     metrics_output_dir: Path = args.metrics_output_dir
     if not metrics_output_dir.is_dir():
         raise SystemExit(
@@ -220,9 +206,8 @@ async def main() -> None:
                 args.tokenizer, live_workers=args.tokenizer_workers
             )
         except RuntimeError as exc:
-            # Fail-fast contract: a tokenizer environment that cannot shard
-            # must surface as a clear service-launch failure, not a silent
-            # slow path that cannot keep up with completions.
+            # An environment that cannot shard is a launch failure, not a
+            # silent slow path that cannot keep up with completions.
             raise SystemExit(f"FATAL: {exc}") from exc
     else:
         tokenizer_cm = nullcontext()
@@ -260,29 +245,12 @@ async def main() -> None:
             )
             aggregator.start()
 
-            # SIGTERM only — the parent's ServiceLauncher.kill_all uses
-            # SIGTERM to kill the aggregator child before an ENDED event
-            # arrives; without this handler that path leaves the Report
-            # consumer with no final_snapshot file. The signal-triggered
-            # snapshot is tagged INTERRUPTED so Report can distinguish
-            # "parent killed the run" from a clean shutdown.
-            # publish_final is idempotent (see
-            # MetricsPublisher._finalized), so racing with the
-            # ENDED-driven call is safe.
-            #
-            # SIGINT is deliberately NOT handled in the same way. On an
-            # interactive ^C, the OS sends SIGINT to the whole
-            # foreground process group — parent + child both receive
-            # it. If we finalized eagerly here, the aggregator would
-            # write final_snapshot.json from whatever state it had at
-            # signal time, then exit; samples that completed during the
-            # parent's own graceful shutdown window would never reach
-            # the file (the parent eventually emits ENDED on its events
-            # channel, but `_finalized=True` makes that a no-op). The
-            # parent's clean-shutdown path is what we want to drive the
-            # aggregator's finalize — so we install a no-op handler for
-            # SIGINT here, which prevents Python's default
-            # KeyboardInterrupt and lets the parent control the lifecycle.
+            # SIGTERM (ServiceLauncher.kill_all) must still produce a final
+            # snapshot, tagged INTERRUPTED; publish_final is idempotent, so
+            # racing the ENDED-driven call is safe. SIGINT (^C hits the whole
+            # process group) is a no-op: finalizing eagerly at signal time
+            # would freeze the snapshot before the parent's graceful-shutdown
+            # samples land — the parent's ENDED drives finalize instead.
             on_sigterm, _sigterm_tasks = _make_sigterm_handler(
                 loop=loop,
                 registry=registry,
@@ -292,8 +260,6 @@ async def main() -> None:
                 shutdown_event=shutdown_event,
             )
             loop.add_signal_handler(signal.SIGTERM, on_sigterm)
-            # No-op SIGINT handler: silence the default KeyboardInterrupt
-            # and let the parent's ENDED-driven path drive shutdown.
             loop.add_signal_handler(
                 signal.SIGINT,
                 lambda: logger.info(
@@ -313,24 +279,13 @@ async def main() -> None:
 
 
 if __name__ == "__main__":
-    # Surface startup / bind / tokenizer-load failures with structured
-    # context. Without this wrap, the parent's ServiceLauncher only sees
-    # the non-zero exit code and a raw traceback — no diagnostic context
-    # to correlate against the parent's logs. The except/raise pattern
-    # preserves the original exit code (1) and traceback while emitting
-    # the structured logger.exception line before the interpreter prints
-    # the trace.
     try:
         LoopManager().default_loop.run_until_complete(main())
     except SystemExit:
         # argparse / explicit sys.exit — already user-facing, don't dress up.
         raise
     except Exception as e:
-        # Catch Exception (not BaseException) so KeyboardInterrupt /
-        # SystemExit propagate untouched — those are control-flow
-        # signals, not crashes, and labeling them as "crashed" would
-        # mislead operators. The exception type goes first in the log
-        # message so it's grep-able without scrolling through the
-        # traceback.
+        # Structured log line so the crash is grep-able against the parent's
+        # logs; KeyboardInterrupt/SystemExit propagate untouched.
         logger.exception("metrics aggregator subprocess crashed (%s)", type(e).__name__)
         raise
diff --git a/src/inference_endpoint/async_utils/services/metrics_aggregator/aggregator.py b/src/inference_endpoint/async_utils/services/metrics_aggregator/aggregator.py
index 686e5eb2f..0a6f2dfbd 100644
--- a/src/inference_endpoint/async_utils/services/metrics_aggregator/aggregator.py
+++ b/src/inference_endpoint/async_utils/services/metrics_aggregator/aggregator.py
@@ -96,8 +96,6 @@ class MetricCounterKey(str, Enum):
 _TOKEN_HDR_LOW: Final[int] = 1
 _TOKEN_HDR_HIGH: Final[int] = 10_000_000  # 10M tokens
 
-_DEFAULT_DRAIN_TIMEOUT_S: Final[float] = 60.0
-
 
 class MetricsAggregatorService(ZmqMessageSubscriber[EventRecord]):
     """Subscribes to EventRecords and computes per-sample metrics in real time.
@@ -121,15 +119,11 @@ def __init__(
         live_flush_interval_s: float | None = None,
         streaming: bool = False,
         shutdown_event: asyncio.Event | None = None,
-        drain_timeout_s: float | None = _DEFAULT_DRAIN_TIMEOUT_S,
+        drain_timeout_s: float | None,
         **kwargs,
     ):
-        # drain_timeout_s is injected (not derived) because the right
-        # value is workload-dependent: long-context tokenize-heavy runs
-        # need more headroom than the default 60 s, and the aggregator
-        # itself can't measure that ahead of time. Keeping it as an arg
-        # lets the __main__ CLI flag plumb the user's choice through
-        # without coupling this class to argparse.
+        # drain_timeout_s has no default here: the one default lives in
+        # config/schema.py (metrics_drain_timeout_s). None = wait forever.
         super().__init__(EventRecordCodec(), *args, **kwargs)
         self._registry = registry
         self._publisher = publisher
@@ -304,15 +298,9 @@ async def process(self, records: list[EventRecord]) -> None:
                 else:
                     if ev == SessionEventType.STARTED:
                         if self._session_start_ns is not None:
-                            # A duplicate STARTED is a producer bug:
-                            # re-assigning _session_start_ns would freeze
-                            # total_duration_ns (the max-of-elapsed guard
-                            # never updates once the start moves forward)
-                            # and corrupt every downstream rate calc for
-                            # the rest of the run. Surface loudly and
-                            # ignore — the publisher.start guard already
-                            # rejects the second tick-task spawn, but
-                            # session-state must also be defended here.
+                            # Producer bug: re-assigning _session_start_ns
+                            # would freeze total_duration_ns (max-of-elapsed
+                            # guard) and corrupt every downstream rate calc.
                             logger.error(
                                 "Duplicate STARTED event received "
                                 "(original at ts=%d, duplicate at ts=%d); "
@@ -397,16 +385,13 @@ async def process(self, records: list[EventRecord]) -> None:
             # that fires before publish_final reflects the new state.
             self._session_state = SessionState.DRAINING
             logger.info("Draining %d pending tokenizations...", self.pending_tokens)
-            # The drain and final publish are wrapped together so the aggregator
-            # ALWAYS reaches _finalize (which sets the shutdown event); a
-            # tokenizer failure during the drain must not skip publish_final and
-            # leave main()'s `await shutdown_event.wait()` hanging.
+            # Drain + final publish run inside one finalization boundary: a
+            # tokenizer failure must not skip publish_final and leave
+            # main()'s `await shutdown_event.wait()` hanging.
             n_pending = self.pending_tokens
             try:
-                # flush_remaining tokenizes the whole buffer in one batched pass,
-                # bounded by the drain budget, and never raises: it returns the
-                # count it could not finish (timeout or failure), which becomes
-                # the snapshot's n_pending_tasks so Report flags an incomplete drain.
+                # flush_remaining never raises; it returns the count it could
+                # not finish, which becomes the snapshot's n_pending_tasks.
                 if self._token_queue is not None:
                     n_pending = await self._token_queue.flush_remaining(
                         self._drain_timeout_s
@@ -432,11 +417,8 @@ async def process(self, records: list[EventRecord]) -> None:
                 )
                 await self._publisher.publish_final(registry, n_pending_tasks=n_pending)
             finally:
-                # The aggregator MUST close the publisher and signal shutdown even
-                # if the drain/publish above failed — otherwise main()'s
-                # `await shutdown_event.wait()` hangs forever. aclose is
-                # independently wrapped: its failure must not prevent _finalize,
-                # which is what sets the shutdown event.
+                # aclose is independently wrapped: its failure must not
+                # prevent _finalize, which sets the shutdown event.
                 try:
                     await self._publisher.aclose()
                 except Exception:  # noqa: BLE001 — best-effort cleanup.
diff --git a/src/inference_endpoint/async_utils/services/metrics_aggregator/snapshot.py b/src/inference_endpoint/async_utils/services/metrics_aggregator/snapshot.py
index eacac94f5..a1e461c43 100644
--- a/src/inference_endpoint/async_utils/services/metrics_aggregator/snapshot.py
+++ b/src/inference_endpoint/async_utils/services/metrics_aggregator/snapshot.py
@@ -45,7 +45,7 @@ class SessionState(str, Enum):
     LIVE        → run in progress; tick task publishing live HDR-derived stats.
     DRAINING    → ``SessionEventType.ENDED`` has been received; the aggregator
                   is tokenizing the buffered samples (bounded by the
-                  ``--drain-timeout`` budget, default 60 s). Tick task
+                  ``--drain-timeout`` budget — schema default 60 s). Tick task
                   continues at this stage, still HDR-derived; no new events
                   will arrive.
     COMPLETE    → terminal clean state. The ``publish_final()`` snapshot
diff --git a/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py b/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py
index daa3f1424..d67d82f97 100644
--- a/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py
+++ b/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py
@@ -15,15 +15,12 @@
 
 """Tokenization for ISL/OSL/TPOT metrics.
 
-``BatchTokenizer`` tokenizes whole batches at once, sharded across worker
-processes each pinned to a block of ``CORES_PER_WORKER`` cores (a single BPE
-rayon pool is memory-bound and saturates ~8 cores). The aggregator buffers
-per-sample text. The sharded pool is the drain-phase accelerator and is
-auto-sized (one shard per core block); live mid-run flushes run on a small
-in-process thread pool (``--tokenizer-workers``, default 2) owned by the
-queue's live loop. A tokenizer without a fast (Rust) backend is a startup
-error, never a silent slow path. Platforms without CPU affinity (e.g. macOS)
-shard unpinned at full speed; only cache/NUMA locality is lost.
+``BatchTokenizer`` runs two lanes: live mid-run flushes on a small in-process
+thread pool (``--tokenizer-workers``), and the end-of-run drain sharded
+across worker processes each pinned to a ``CORES_PER_WORKER`` block.
+``TokenBatchQueue`` buffers per-sample work and clears it in batches. A
+tokenizer without a fast (Rust) backend is a startup error, never a silent
+slow path; platforms without CPU affinity (e.g. macOS) shard unpinned.
 """
 
 from __future__ import annotations
@@ -52,18 +49,14 @@
 # used. Measured on GB200: ~16k texts/s at 18 blocks vs ~1.5k single-process.
 CORES_PER_WORKER = 8
 
-# Budget for the parallel shard warmup (spawn + transformers import +
-# tokenizer load per worker). A hung load (e.g. a stuck network filesystem)
-# must become a bounded startup error, not wedge service startup — and the
-# error must fire before the parent's 30 s service-launch budget kills the
-# subprocess, so the diagnostic wins the race.
+# Warmup budget (spawn + transformers import + tokenizer load per worker).
+# A hung load must become a startup error that fires before the parent's
+# 30 s service-launch budget kills the subprocess.
 _SHARD_WARMUP_TIMEOUT_S = 25.0
 
-# Per-flush ceiling for the LIVE lane. Bounds three things at once: how long
-# the queue lock is held mid-run, how much work an unstoppable in-flight
-# thread encode can hold after a drain-start cancellation, and how much the
-# drain re-encodes for items the cancelled flush gave back. The drain has no
-# ceiling — it always takes the whole buffer.
+# Per-flush ceiling for the LIVE lane: bounds the lock-hold time and the
+# unstoppable in-flight encode a drain-start cancellation leaves behind.
+# The drain has no ceiling — it always takes the whole buffer.
 _LIVE_FLUSH_MAX_ITEMS = 1024
 
 # Minimal user message used to satisfy chat templates that reject assistant-only
@@ -199,15 +192,16 @@ def __init__(
         self,
         tokenizer_name: str,
         *,
+        live_workers: int,
         cores_per_worker: int = CORES_PER_WORKER,
         n_workers: int = -1,
-        live_workers: int = 2,
     ) -> None:
         self._tokenizer_name = tokenizer_name
-        # The live lane runs in-process: cap this process's rayon pool so a
-        # mid-run batched encode uses ~live_workers cores, not the whole
-        # machine. Must be set before the first encode initializes the pool;
-        # setdefault lets an operator-exported RAYON_NUM_THREADS win.
+        # Cap this process's rayon pool so a live (in-process) batched encode
+        # uses ~live_workers cores, not the whole machine. Must be set before
+        # the first encode initializes the pool; setdefault lets an
+        # operator-exported RAYON_NUM_THREADS win. live_workers has no
+        # default — the one default lives in config/schema.py.
         os.environ.setdefault("RAYON_NUM_THREADS", str(max(1, live_workers)))
         self._live_workers = live_workers
         self._fallback_warned: set[str] = set()
@@ -220,10 +214,9 @@ def __init__(
             max_workers=max(1, live_workers), thread_name_prefix="tok-thread"
         )
         self._load_tokenizer()  # also computes the chat-template baseline
-        # Process shards for the batched text path. Empty only when
-        # in-process mode was explicitly requested (n_workers=0 or
-        # cores_per_worker<=0; ctor overrides used primarily by tests —
-        # production wiring passes live_workers only and shards auto-size).
+        # Process shards for the drain. Empty only when in-process mode was
+        # explicitly requested (n_workers=0 / cores_per_worker<=0, test-only
+        # seams — production wiring always auto-sizes).
         self._procs: list[ProcessPoolExecutor] = []
         self._setup_shards(cores_per_worker, n_workers)
 
@@ -265,14 +258,11 @@ def _load_tokenizer(self) -> None:
     def _setup_shards(self, cores_per_worker: int, n_workers: int) -> None:
         """Spawn one pinned single-worker process per core block.
 
-        ``n_workers == 0`` explicitly selects in-process tokenization. Auto
-        (``< 0``) fits one shard per ``cores_per_worker`` block of this
-        process's affinity mask (or the online CPU count when the platform
-        has no affinity API — shards then run unpinned), always at least one;
-        an explicit count is clamped to that capacity. An environment that
-        cannot shard — no fast Rust backend, a warmup that fails or exceeds
-        its budget — raises instead of silently degrading to a slow path
-        that cannot keep up with completions.
+        ``n_workers == 0`` selects in-process tokenization; auto (``< 0``)
+        fits one shard per ``cores_per_worker`` block (at least one); an
+        explicit count is clamped to capacity. An environment that cannot
+        shard — no fast Rust backend, a failed or over-budget warmup —
+        raises instead of degrading to a slow path.
         """
         if cores_per_worker <= 0 or n_workers == 0:
             logger.info("BatchTokenizer: in-process tokenization (explicit)")
@@ -283,12 +273,9 @@ def _setup_shards(self, cores_per_worker: int, n_workers: int) -> None:
                 "backend; token metrics require one to keep up with "
                 "completions. Use a fast tokenizer, or disable token metrics."
             )
-        # Probe the full allowed CPU universe (cgroup-clamped) for the shard
-        # block math, then restore this process's inherited mask: the
-        # aggregator's event loop, publisher, and live tokenizer threads stay
-        # exactly where the parent placed them (the loadgen mask on a pinned
-        # Linux run). Only the drain-phase shard processes, pinned to their
-        # own blocks, span the whole machine.
+        # Probe the full allowed CPU universe (cgroup-clamped) for the block
+        # math, then restore the inherited mask: the aggregator stays where
+        # the parent placed it; only the drain shards span the machine.
         try:
             original = os.sched_getaffinity(0)
         except (OSError, AttributeError):
@@ -325,11 +312,9 @@ def _setup_shards(self, cores_per_worker: int, n_workers: int) -> None:
                     initargs=(self._tokenizer_name, block),
                 )
                 procs.append(ex)
-            # Force spawn + pin + tokenizer-load now (not on the first batch).
-            # Submit to every shard first so the loads run in parallel, then
-            # await — waiting on each before submitting the next would
-            # serialize P tokenizer loads and can exceed the launch budget.
-            # The wait is bounded: one hung load must not wedge startup.
+            # Warm all shards in parallel (submit-then-await; awaiting each
+            # before the next would serialize N tokenizer loads). Bounded:
+            # one hung load must not wedge startup.
             ready = [ex.submit(_worker_ready, 0) for ex in procs]
             deadline = time.monotonic() + _SHARD_WARMUP_TIMEOUT_S
             for f in ready:
@@ -507,16 +492,12 @@ async def token_count_message_async(
 class TokenBatchQueue:
     """Buffers per-sample tokenization work and clears it in batches.
 
-    Triggers call ``enqueue_text`` / ``enqueue_message`` at event time with an
-    ``on_count`` callback that records the resulting metric. The queue owns
-    its own flush cadence: ``start_live`` begins a periodic flush through the
-    tokenizer's bounded live lane (so live ISL/OSL/TPOT stay current without
-    touching the benchmark's cores), and ``flush_remaining`` drains everything
-    left at end-of-run through every shard.
-
-    ``pending`` counts enqueued-but-not-yet-recorded items; it is the
-    ``n_pending_tasks`` on the snapshot. A non-zero value in the final snapshot
-    means the end-of-run flush did not finish within the drain budget or failed.
+    Triggers call ``enqueue_text`` / ``enqueue_message`` at event time with
+    an ``on_count`` recorder callback. The queue owns its flush cadence:
+    ``start_live`` flushes periodically through the tokenizer's bounded live
+    lane; ``flush_remaining`` drains everything left at end-of-run through
+    every shard. ``pending`` is the snapshot's ``n_pending_tasks`` —
+    non-zero in the final snapshot means an incomplete drain.
     """
 
     def __init__(
@@ -577,16 +558,12 @@ def enqueue_message(
     async def flush(self, live: bool = False) -> None:
         """Tokenize everything buffered so far and run each ``on_count``.
 
-        ``live=True`` routes text batches through the tokenizer's bounded
-        live lane instead of the full shard pool, takes at most
-        ``_LIVE_FLUSH_MAX_ITEMS`` per kind (bounding lock-hold time and the
-        unstoppable in-flight encode a drain-start cancellation leaves
-        behind), and re-queues items on failure or cancellation so a mid-run
-        hiccup never loses samples — the end-of-run drain retries them. Drain-mode failures are terminal: the
-        un-recorded items stay counted in ``pending`` (``_inflight`` is
-        decremented only after a callback runs) and surface as an incomplete
-        drain, not as silently dropped samples. Items are detached from the
-        buffer up front so concurrent enqueues land in the next flush.
+        ``live=True`` routes text through the tokenizer's bounded live lane,
+        takes at most ``_LIVE_FLUSH_MAX_ITEMS`` per kind, and re-queues items
+        on failure or cancellation — a mid-run hiccup never loses samples.
+        Drain-mode failures are terminal: un-recorded items stay counted in
+        ``pending`` and surface as an incomplete drain. Items are detached up
+        front so concurrent enqueues land in the next flush.
         """
         async with self._lock:
             if not (self._text or self._msg):
diff --git a/src/inference_endpoint/commands/benchmark/execute.py b/src/inference_endpoint/commands/benchmark/execute.py
index 380a0b14d..a2050bbe3 100644
--- a/src/inference_endpoint/commands/benchmark/execute.py
+++ b/src/inference_endpoint/commands/benchmark/execute.py
@@ -612,6 +612,12 @@ async def _run_benchmark_async(
         aggregator_args.extend(
             ["--drain-timeout", str(config.settings.drain.metrics_drain_timeout_s)]
         )
+        aggregator_args.extend(
+            [
+                "--tokenizer-workers",
+                str(config.settings.drain.metrics_tokenizer_workers),
+            ]
+        )
 
         # EventLoggerService writes events.jsonl to tmpfs (high-frequency writes)
         event_logger_args: list[str] = [
diff --git a/src/inference_endpoint/config/schema.py b/src/inference_endpoint/config/schema.py
index 0a59074f5..f093a547a 100644
--- a/src/inference_endpoint/config/schema.py
+++ b/src/inference_endpoint/config/schema.py
@@ -591,6 +591,24 @@ class DrainConfig(BaseModel):
             "after ENDED (default: 60.0; 0 = unlimited)."
         ),
     )
+    metrics_tokenizer_workers: Annotated[
+        int,
+        cyclopts.Parameter(
+            alias="--metrics-tokenizer-workers",
+            help=(
+                "In-process tokenizer threads for live (mid-run) ISL/OSL/TPOT in "
+                "the metrics aggregator. 0 defers all tokenization to the "
+                "end-of-run drain, which always uses the auto-sized sharded pool."
+            ),
+        ),
+    ] = Field(
+        2,
+        ge=0,
+        description=(
+            "In-process tokenizer threads for live (mid-run) ISL/OSL/TPOT "
+            "(default: 2; 0 = defer everything to the end-of-run drain)."
+        ),
+    )
 
 
 @cyclopts.Parameter(name="*")
diff --git a/src/inference_endpoint/config/templates/concurrency_template_full.yaml b/src/inference_endpoint/config/templates/concurrency_template_full.yaml
index 75feab6fb..30b224402 100644
--- a/src/inference_endpoint/config/templates/concurrency_template_full.yaml
+++ b/src/inference_endpoint/config/templates/concurrency_template_full.yaml
@@ -80,6 +80,7 @@ settings:
     performance_timeout_s: 240.0  # Performance drain timeout in seconds (None = wait indefinitely)
     accuracy_timeout_s: null  # Accuracy drain timeout in seconds (None = wait indefinitely)
     metrics_drain_timeout_s: 60.0  # Wall-clock budget (seconds) to finish tokenizing buffered samples after ENDED (default: 60.0; 0 = unlimited).
+    metrics_tokenizer_workers: 2  # In-process tokenizer threads for live (mid-run) ISL/OSL/TPOT (default: 2; 0 = defer everything to the end-of-run drain).
   warmup:
     enabled: false  # Enable warmup phase before performance run
     n_requests: null  # Warmup request count (None = full dataset once)
diff --git a/src/inference_endpoint/config/templates/offline_template_full.yaml b/src/inference_endpoint/config/templates/offline_template_full.yaml
index 3ff1ccd17..ae0ae939c 100644
--- a/src/inference_endpoint/config/templates/offline_template_full.yaml
+++ b/src/inference_endpoint/config/templates/offline_template_full.yaml
@@ -80,6 +80,7 @@ settings:
     performance_timeout_s: 240.0  # Performance drain timeout in seconds (None = wait indefinitely)
     accuracy_timeout_s: null  # Accuracy drain timeout in seconds (None = wait indefinitely)
     metrics_drain_timeout_s: 60.0  # Wall-clock budget (seconds) to finish tokenizing buffered samples after ENDED (default: 60.0; 0 = unlimited).
+    metrics_tokenizer_workers: 2  # In-process tokenizer threads for live (mid-run) ISL/OSL/TPOT (default: 2; 0 = defer everything to the end-of-run drain).
   warmup:
     enabled: false  # Enable warmup phase before performance run
     n_requests: null  # Warmup request count (None = full dataset once)
diff --git a/src/inference_endpoint/config/templates/online_template_full.yaml b/src/inference_endpoint/config/templates/online_template_full.yaml
index 1287b99af..9c5c62842 100644
--- a/src/inference_endpoint/config/templates/online_template_full.yaml
+++ b/src/inference_endpoint/config/templates/online_template_full.yaml
@@ -80,6 +80,7 @@ settings:
     performance_timeout_s: 240.0  # Performance drain timeout in seconds (None = wait indefinitely)
     accuracy_timeout_s: null  # Accuracy drain timeout in seconds (None = wait indefinitely)
     metrics_drain_timeout_s: 60.0  # Wall-clock budget (seconds) to finish tokenizing buffered samples after ENDED (default: 60.0; 0 = unlimited).
+    metrics_tokenizer_workers: 2  # In-process tokenizer threads for live (mid-run) ISL/OSL/TPOT (default: 2; 0 = defer everything to the end-of-run drain).
   warmup:
     enabled: false  # Enable warmup phase before performance run
     n_requests: null  # Warmup request count (None = full dataset once)
diff --git a/tests/unit/async_utils/services/metrics_aggregator/conftest.py b/tests/unit/async_utils/services/metrics_aggregator/conftest.py
index 38e25945c..aae7a07ac 100644
--- a/tests/unit/async_utils/services/metrics_aggregator/conftest.py
+++ b/tests/unit/async_utils/services/metrics_aggregator/conftest.py
@@ -171,6 +171,7 @@ def make_aggregator(
     live_flush_interval_s: float | None = None,
     streaming: bool = True,
     shutdown_event: asyncio.Event | None = None,
+    drain_timeout_s: float | None = None,
 ) -> tuple[MetricsAggregatorService, MetricsRegistry, MagicMock]:
     """Construct an aggregator wired to a real SUB socket and a mocked publisher.
 
@@ -203,5 +204,6 @@ def make_aggregator(
         live_flush_interval_s=live_flush_interval_s,
         streaming=streaming,
         shutdown_event=shutdown_event,
+        drain_timeout_s=drain_timeout_s,
     )
     return agg, registry, publisher
diff --git a/tests/unit/async_utils/services/metrics_aggregator/test_aggregator_error_handler.py b/tests/unit/async_utils/services/metrics_aggregator/test_aggregator_error_handler.py
index 4e8222c48..40e6eb91b 100644
--- a/tests/unit/async_utils/services/metrics_aggregator/test_aggregator_error_handler.py
+++ b/tests/unit/async_utils/services/metrics_aggregator/test_aggregator_error_handler.py
@@ -82,6 +82,7 @@ def _make_aggregator(
         sig_figs=3,
         n_histogram_buckets=10,
         streaming=streaming,
+        drain_timeout_s=None,
     )
     return agg, registry, publisher
 
diff --git a/tests/unit/async_utils/services/metrics_aggregator/test_token_metrics.py b/tests/unit/async_utils/services/metrics_aggregator/test_token_metrics.py
index 8bf838c17..ba0d2e3e9 100644
--- a/tests/unit/async_utils/services/metrics_aggregator/test_token_metrics.py
+++ b/tests/unit/async_utils/services/metrics_aggregator/test_token_metrics.py
@@ -85,7 +85,7 @@ class TestBatchTokenizer:
     async def test_count_texts_async(self):
         with patch(_MOCK_TARGET, _FakeTokenizer):
             loop = asyncio.get_running_loop()
-            with BatchTokenizer("fake", n_workers=0) as tok:
+            with BatchTokenizer("fake", n_workers=0, live_workers=2) as tok:
                 counts = await tok.count_texts_async(["Hello world foo", "a"], loop)
                 assert counts == [3, 1]
 
@@ -93,7 +93,7 @@ async def test_count_texts_async(self):
     async def test_count_texts_async_empty(self):
         with patch(_MOCK_TARGET, _FakeTokenizer):
             loop = asyncio.get_running_loop()
-            with BatchTokenizer("fake", n_workers=0) as tok:
+            with BatchTokenizer("fake", n_workers=0, live_workers=2) as tok:
                 assert await tok.count_texts_async([], loop) == []
 
     @pytest.mark.asyncio
@@ -101,7 +101,7 @@ async def test_count_texts_async_sharded(self):
         """With shards present, chunks are reassembled in original order."""
         with patch(_MOCK_TARGET, _FakeTokenizer):
             loop = asyncio.get_running_loop()
-            with BatchTokenizer("fake", n_workers=0) as tok:
+            with BatchTokenizer("fake", n_workers=0, live_workers=2) as tok:
                 tok._procs = [_FakeProc(), _FakeProc()]
                 counts = await tok.count_texts_async(["a", "b b", "c c c", "d"], loop)
                 assert counts == [1, 2, 3, 1]
@@ -111,14 +111,14 @@ async def test_count_texts_async_shard_failure_propagates(self):
         """A dead shard surfaces as an error, not a silent in-process fallback."""
         with patch(_MOCK_TARGET, _FakeTokenizer):
             loop = asyncio.get_running_loop()
-            with BatchTokenizer("fake", n_workers=0) as tok:
+            with BatchTokenizer("fake", n_workers=0, live_workers=2) as tok:
                 tok._procs = [_BrokenProc()]
                 with pytest.raises(BrokenProcessPool):
                     await tok.count_texts_async(["a b"], loop)
 
     def test_close_is_idempotent(self):
         with patch(_MOCK_TARGET, _FakeTokenizer):
-            tok = BatchTokenizer("fake", n_workers=0)
+            tok = BatchTokenizer("fake", n_workers=0, live_workers=2)
             tok.close()
             tok.close()  # must not raise
 
@@ -126,7 +126,7 @@ def test_close_is_idempotent(self):
     async def test_use_after_close_raises(self):
         with patch(_MOCK_TARGET, _FakeTokenizer):
             loop = asyncio.get_running_loop()
-            tok = BatchTokenizer("fake", n_workers=0)
+            tok = BatchTokenizer("fake", n_workers=0, live_workers=2)
             tok.close()
             with pytest.raises(RuntimeError, match="closed"):
                 await tok.count_texts_async(["hello"], loop)
@@ -163,7 +163,7 @@ async def test_token_count_message_subtracts_baseline(self):
         """token_count_message_async returns full_tokens - baseline."""
         with patch(_MOCK_TARGET, _FakeTokenizerWithTemplate):
             loop = asyncio.get_running_loop()
-            with BatchTokenizer("fake", n_workers=0) as tok:
+            with BatchTokenizer("fake", n_workers=0, live_workers=2) as tok:
                 # "hello world" -> 2 content + 2 wrapper = 4; baseline = 0, prefix = 2
                 count = await tok.token_count_message_async(
                     "hello world", None, None, loop
@@ -175,7 +175,7 @@ async def test_token_count_message_includes_tool_calls(self):
         """Tool-call JSON tokens are included in the count."""
         with patch(_MOCK_TARGET, _FakeTokenizerWithTemplate):
             loop = asyncio.get_running_loop()
-            with BatchTokenizer("fake", n_workers=0) as tok:
+            with BatchTokenizer("fake", n_workers=0, live_workers=2) as tok:
                 tool_calls = (
                     {
                         "id": "c1",
@@ -199,7 +199,7 @@ def apply_chat_template(self, *args, **kwargs):
 
         with patch(_MOCK_TARGET, _BadTemplateTokenizer):
             loop = asyncio.get_running_loop()
-            with BatchTokenizer("fake", n_workers=0) as tok:
+            with BatchTokenizer("fake", n_workers=0, live_workers=2) as tok:
                 tool_calls = (
                     {
                         "id": "c1",
@@ -300,7 +300,7 @@ def _make(self, monkeypatch, cpus, n_workers, executor=_SpawnlessExecutor):
             lambda pid, mask: self.restored.append(set(mask)),
         )
         with patch(_MOCK_TARGET, _FakeTokenizerWithBackend):
-            return BatchTokenizer("fake", n_workers=n_workers)
+            return BatchTokenizer("fake", n_workers=n_workers, live_workers=2)
 
     @pytest.mark.parametrize(
         "cpus, n_workers, expected_shards",
@@ -336,7 +336,7 @@ def test_no_fast_backend_is_a_startup_error(self, monkeypatch):
         )
         with patch(_MOCK_TARGET, _FakeTokenizer):  # no backend_tokenizer
             with pytest.raises(RuntimeError, match="fast"):
-                BatchTokenizer("fake")
+                BatchTokenizer("fake", live_workers=2)
 
     def test_affinity_unavailable_shards_unpinned(self, monkeypatch):
         """No affinity API (e.g. macOS): shard from the CPU count, unpinned."""
@@ -357,7 +357,7 @@ def _raise(pid):
         monkeypatch.setattr(token_metrics_module.os, "sched_getaffinity", _raise)
         monkeypatch.setattr(token_metrics_module.os, "cpu_count", lambda: 16)
         with patch(_MOCK_TARGET, _FakeTokenizerWithBackend):
-            with BatchTokenizer("fake") as tok:
+            with BatchTokenizer("fake", live_workers=2) as tok:
                 assert len(tok._procs) == 2
 
     def test_warmup_failure_is_a_startup_error(self, monkeypatch):
diff --git a/tests/unit/commands/test_benchmark.py b/tests/unit/commands/test_benchmark.py
index 4109ebfc2..68498c9b4 100644
--- a/tests/unit/commands/test_benchmark.py
+++ b/tests/unit/commands/test_benchmark.py
@@ -633,12 +633,10 @@ async def _capture_launch(service_configs, *, timeout):
 
     @pytest.mark.unit
     @pytest.mark.asyncio
-    async def test_tokenizer_forwarded_and_live_args_left_to_service_defaults(
-        self, tmp_path
-    ):
-        """Pins the SUT-intrusion seam: the benchmark forwards --tokenizer but
-        deliberately no live/worker knobs — the service's own defaults govern
-        mid-run tokenization."""
+    async def test_tokenizer_and_workers_forwarded_from_schema(self, tmp_path):
+        """The benchmark forwards --tokenizer and --tokenizer-workers; the
+        workers value comes from the schema default
+        (drain.metrics_tokenizer_workers), the single source of truth."""
         config = OfflineConfig(**_OFFLINE_KWARGS, settings=OfflineSettings())
         ctx = self._make_ctx(config, tmp_path)
         ctx.tokenizer_name = "gpt2"
@@ -681,8 +679,9 @@ async def _capture_launch(service_configs, *, timeout):
         args = aggregator_cfg.args
         idx = args.index("--tokenizer")
         assert args[idx + 1] == "gpt2"
-        assert "--tokenizer-workers" not in args
-        assert "--live-tokenizers" not in args
+        idx = args.index("--tokenizer-workers")
+        expected = str(config.settings.drain.metrics_tokenizer_workers)
+        assert args[idx + 1] == expected
 
 
 class TestBuildPhases:

From 6361768fb33c13121f0b22aa8f482c98bc9a1726 Mon Sep 17 00:00:00 2001
From: Viraat Chandra <viraatc@nvidia.com>
Date: Wed, 10 Jun 2026 17:57:33 -0700
Subject: [PATCH 17/20] chore(metrics): restore original comments; keep only
 default-related edits

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../services/metrics_aggregator/__main__.py   |  95 +++++++++++----
 .../services/metrics_aggregator/aggregator.py |  40 +++++--
 .../metrics_aggregator/token_metrics.py       | 109 +++++++++++-------
 3 files changed, 165 insertions(+), 79 deletions(-)

diff --git a/src/inference_endpoint/async_utils/services/metrics_aggregator/__main__.py b/src/inference_endpoint/async_utils/services/metrics_aggregator/__main__.py
index 58733c087..70633fb6d 100644
--- a/src/inference_endpoint/async_utils/services/metrics_aggregator/__main__.py
+++ b/src/inference_endpoint/async_utils/services/metrics_aggregator/__main__.py
@@ -49,18 +49,27 @@ def _make_sigterm_handler(
 ) -> tuple[Callable[[], None], set[asyncio.Task]]:
     """Build the SIGTERM handler that writes the INTERRUPTED final snapshot.
 
-    Returns ``(handler, pending_tasks)``: asyncio holds tasks only by
-    weakref, so the handler's finalize task must live in this
-    strong-reference set until done. Module-level so the GC-safety
-    contract is unit-testable.
+    Returns ``(handler, pending_tasks)``. ``pending_tasks`` is the
+    strong-reference container that keeps spawned finalize tasks alive
+    while they run: asyncio tracks tasks only by weakref, so a task
+    whose only reference is the local variable inside the handler can
+    be garbage-collected mid-execution (per Python's asyncio docs).
+    Each spawned task self-removes from the set via
+    ``add_done_callback`` once it completes.
+
+    Exposed at module level (rather than nested in ``main()``) so the
+    GC-safety contract is unit-testable without driving the whole
+    subprocess lifecycle.
     """
     pending_tasks: set[asyncio.Task] = set()
 
     async def _signal_finalize() -> None:
         try:
-            # Refresh tracked_duration_ns before publish_final (mirrors the
-            # ENDED path) — otherwise an interrupted run whose
-            # STOP_PERFORMANCE_TRACKING never fired reports QPS=N/A.
+            # Mirror the ENDED-driven path: refresh tracked_duration_ns
+            # from the table BEFORE publish_final, otherwise an
+            # interrupted run whose STOP_PERFORMANCE_TRACKING never
+            # fired would report duration_ns=0 and QPS=N/A in the final
+            # report even after processing many tracked samples.
             registry.set_counter(
                 MetricCounterKey.TRACKED_DURATION_NS.value,
                 table.total_tracked_duration_ns,
@@ -127,8 +136,10 @@ async def main() -> None:
         required=True,
         help=(
             "Wall-clock budget (seconds) to finish tokenizing buffered samples "
-            "after ENDED (0 = wait indefinitely). The benchmark forwards "
-            "--metrics-drain-timeout; the default lives in config/schema.py."
+            "after ENDED before the aggregator emits the final snapshot with "
+            "n_pending_tasks > 0 (0 = wait indefinitely; the benchmark forwards "
+            "the schema default, see config/schema.py). Increase for very large "
+            "datasets where the end-of-run tokenize batch is big."
         ),
     )
     parser.add_argument(
@@ -155,9 +166,10 @@ async def main() -> None:
         required=True,
         help=(
             "In-process tokenizer threads for live (mid-run) ISL/OSL/TPOT "
-            "(0 = defer everything to the end-of-run drain, which always uses "
-            "the auto-sized sharded pool). The benchmark forwards "
-            "--metrics-tokenizer-workers; the default lives in config/schema.py."
+            "(0 = no mid-run tokenization, everything defers to the "
+            "end-of-run drain; the benchmark forwards the schema default, "
+            "see config/schema.py). The drain always uses the auto-sized "
+            "sharded pool — one worker process per 8-core block."
         ),
     )
     parser.add_argument(
@@ -184,8 +196,12 @@ async def main() -> None:
     if args.tokenizer_workers < 0:
         raise SystemExit("FATAL: --tokenizer-workers must be >= 0")
 
-    # The parent (commands/benchmark/execute.py) owns directory creation;
-    # fail fast here so a bad launcher errors now, not on the atomic write.
+    # The parent owns directory setup — `commands/benchmark/execute.py`
+    # creates `<report_dir>/metrics/` and validates it before launching
+    # this subprocess. Validate here as a fail-fast contract check so a
+    # misbehaving launcher (or a manual invocation) surfaces a clear
+    # error in this subprocess's stderr instead of crashing later on
+    # the atomic-write path.
     metrics_output_dir: Path = args.metrics_output_dir
     if not metrics_output_dir.is_dir():
         raise SystemExit(
@@ -206,8 +222,9 @@ async def main() -> None:
                 args.tokenizer, live_workers=args.tokenizer_workers
             )
         except RuntimeError as exc:
-            # An environment that cannot shard is a launch failure, not a
-            # silent slow path that cannot keep up with completions.
+            # Fail-fast contract: a tokenizer environment that cannot shard
+            # must surface as a clear service-launch failure, not a silent
+            # slow path that cannot keep up with completions.
             raise SystemExit(f"FATAL: {exc}") from exc
     else:
         tokenizer_cm = nullcontext()
@@ -245,12 +262,29 @@ async def main() -> None:
             )
             aggregator.start()
 
-            # SIGTERM (ServiceLauncher.kill_all) must still produce a final
-            # snapshot, tagged INTERRUPTED; publish_final is idempotent, so
-            # racing the ENDED-driven call is safe. SIGINT (^C hits the whole
-            # process group) is a no-op: finalizing eagerly at signal time
-            # would freeze the snapshot before the parent's graceful-shutdown
-            # samples land — the parent's ENDED drives finalize instead.
+            # SIGTERM only — the parent's ServiceLauncher.kill_all uses
+            # SIGTERM to kill the aggregator child before an ENDED event
+            # arrives; without this handler that path leaves the Report
+            # consumer with no final_snapshot file. The signal-triggered
+            # snapshot is tagged INTERRUPTED so Report can distinguish
+            # "parent killed the run" from a clean shutdown.
+            # publish_final is idempotent (see
+            # MetricsPublisher._finalized), so racing with the
+            # ENDED-driven call is safe.
+            #
+            # SIGINT is deliberately NOT handled in the same way. On an
+            # interactive ^C, the OS sends SIGINT to the whole
+            # foreground process group — parent + child both receive
+            # it. If we finalized eagerly here, the aggregator would
+            # write final_snapshot.json from whatever state it had at
+            # signal time, then exit; samples that completed during the
+            # parent's own graceful shutdown window would never reach
+            # the file (the parent eventually emits ENDED on its events
+            # channel, but `_finalized=True` makes that a no-op). The
+            # parent's clean-shutdown path is what we want to drive the
+            # aggregator's finalize — so we install a no-op handler for
+            # SIGINT here, which prevents Python's default
+            # KeyboardInterrupt and lets the parent control the lifecycle.
             on_sigterm, _sigterm_tasks = _make_sigterm_handler(
                 loop=loop,
                 registry=registry,
@@ -260,6 +294,8 @@ async def main() -> None:
                 shutdown_event=shutdown_event,
             )
             loop.add_signal_handler(signal.SIGTERM, on_sigterm)
+            # No-op SIGINT handler: silence the default KeyboardInterrupt
+            # and let the parent's ENDED-driven path drive shutdown.
             loop.add_signal_handler(
                 signal.SIGINT,
                 lambda: logger.info(
@@ -279,13 +315,24 @@ async def main() -> None:
 
 
 if __name__ == "__main__":
+    # Surface startup / bind / tokenizer-load failures with structured
+    # context. Without this wrap, the parent's ServiceLauncher only sees
+    # the non-zero exit code and a raw traceback — no diagnostic context
+    # to correlate against the parent's logs. The except/raise pattern
+    # preserves the original exit code (1) and traceback while emitting
+    # the structured logger.exception line before the interpreter prints
+    # the trace.
     try:
         LoopManager().default_loop.run_until_complete(main())
     except SystemExit:
         # argparse / explicit sys.exit — already user-facing, don't dress up.
         raise
     except Exception as e:
-        # Structured log line so the crash is grep-able against the parent's
-        # logs; KeyboardInterrupt/SystemExit propagate untouched.
+        # Catch Exception (not BaseException) so KeyboardInterrupt /
+        # SystemExit propagate untouched — those are control-flow
+        # signals, not crashes, and labeling them as "crashed" would
+        # mislead operators. The exception type goes first in the log
+        # message so it's grep-able without scrolling through the
+        # traceback.
         logger.exception("metrics aggregator subprocess crashed (%s)", type(e).__name__)
         raise
diff --git a/src/inference_endpoint/async_utils/services/metrics_aggregator/aggregator.py b/src/inference_endpoint/async_utils/services/metrics_aggregator/aggregator.py
index 0a6f2dfbd..64a31ee42 100644
--- a/src/inference_endpoint/async_utils/services/metrics_aggregator/aggregator.py
+++ b/src/inference_endpoint/async_utils/services/metrics_aggregator/aggregator.py
@@ -122,8 +122,12 @@ def __init__(
         drain_timeout_s: float | None,
         **kwargs,
     ):
-        # drain_timeout_s has no default here: the one default lives in
-        # config/schema.py (metrics_drain_timeout_s). None = wait forever.
+        # drain_timeout_s is injected (not derived) because the right
+        # value is workload-dependent: long-context tokenize-heavy runs
+        # need more headroom than the schema default 60 s, and the
+        # aggregator itself can't measure that ahead of time. Keeping it
+        # as an arg lets the __main__ CLI flag plumb the user's choice
+        # through without coupling this class to argparse.
         super().__init__(EventRecordCodec(), *args, **kwargs)
         self._registry = registry
         self._publisher = publisher
@@ -298,9 +302,15 @@ async def process(self, records: list[EventRecord]) -> None:
                 else:
                     if ev == SessionEventType.STARTED:
                         if self._session_start_ns is not None:
-                            # Producer bug: re-assigning _session_start_ns
-                            # would freeze total_duration_ns (max-of-elapsed
-                            # guard) and corrupt every downstream rate calc.
+                            # A duplicate STARTED is a producer bug:
+                            # re-assigning _session_start_ns would freeze
+                            # total_duration_ns (the max-of-elapsed guard
+                            # never updates once the start moves forward)
+                            # and corrupt every downstream rate calc for
+                            # the rest of the run. Surface loudly and
+                            # ignore — the publisher.start guard already
+                            # rejects the second tick-task spawn, but
+                            # session-state must also be defended here.
                             logger.error(
                                 "Duplicate STARTED event received "
                                 "(original at ts=%d, duplicate at ts=%d); "
@@ -385,13 +395,16 @@ async def process(self, records: list[EventRecord]) -> None:
             # that fires before publish_final reflects the new state.
             self._session_state = SessionState.DRAINING
             logger.info("Draining %d pending tokenizations...", self.pending_tokens)
-            # Drain + final publish run inside one finalization boundary: a
-            # tokenizer failure must not skip publish_final and leave
-            # main()'s `await shutdown_event.wait()` hanging.
+            # The drain and final publish are wrapped together so the aggregator
+            # ALWAYS reaches _finalize (which sets the shutdown event); a
+            # tokenizer failure during the drain must not skip publish_final and
+            # leave main()'s `await shutdown_event.wait()` hanging.
             n_pending = self.pending_tokens
             try:
-                # flush_remaining never raises; it returns the count it could
-                # not finish, which becomes the snapshot's n_pending_tasks.
+                # flush_remaining tokenizes the whole buffer in one batched pass,
+                # bounded by the drain budget, and never raises: it returns the
+                # count it could not finish (timeout or failure), which becomes
+                # the snapshot's n_pending_tasks so Report flags an incomplete drain.
                 if self._token_queue is not None:
                     n_pending = await self._token_queue.flush_remaining(
                         self._drain_timeout_s
@@ -417,8 +430,11 @@ async def process(self, records: list[EventRecord]) -> None:
                 )
                 await self._publisher.publish_final(registry, n_pending_tasks=n_pending)
             finally:
-                # aclose is independently wrapped: its failure must not
-                # prevent _finalize, which sets the shutdown event.
+                # The aggregator MUST close the publisher and signal shutdown even
+                # if the drain/publish above failed — otherwise main()'s
+                # `await shutdown_event.wait()` hangs forever. aclose is
+                # independently wrapped: its failure must not prevent _finalize,
+                # which is what sets the shutdown event.
                 try:
                     await self._publisher.aclose()
                 except Exception:  # noqa: BLE001 — best-effort cleanup.
diff --git a/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py b/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py
index d67d82f97..60a75bdb6 100644
--- a/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py
+++ b/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py
@@ -15,12 +15,15 @@
 
 """Tokenization for ISL/OSL/TPOT metrics.
 
-``BatchTokenizer`` runs two lanes: live mid-run flushes on a small in-process
-thread pool (``--tokenizer-workers``), and the end-of-run drain sharded
-across worker processes each pinned to a ``CORES_PER_WORKER`` block.
-``TokenBatchQueue`` buffers per-sample work and clears it in batches. A
-tokenizer without a fast (Rust) backend is a startup error, never a silent
-slow path; platforms without CPU affinity (e.g. macOS) shard unpinned.
+``BatchTokenizer`` tokenizes whole batches at once, sharded across worker
+processes each pinned to a block of ``CORES_PER_WORKER`` cores (a single BPE
+rayon pool is memory-bound and saturates ~8 cores). The aggregator buffers
+per-sample text. The sharded pool is the drain-phase accelerator and is
+auto-sized (one shard per core block); live mid-run flushes run on a small
+in-process thread pool (``--tokenizer-workers``, default 2) owned by the
+queue's live loop. A tokenizer without a fast (Rust) backend is a startup
+error, never a silent slow path. Platforms without CPU affinity (e.g. macOS)
+shard unpinned at full speed; only cache/NUMA locality is lost.
 """
 
 from __future__ import annotations
@@ -49,14 +52,18 @@
 # used. Measured on GB200: ~16k texts/s at 18 blocks vs ~1.5k single-process.
 CORES_PER_WORKER = 8
 
-# Warmup budget (spawn + transformers import + tokenizer load per worker).
-# A hung load must become a startup error that fires before the parent's
-# 30 s service-launch budget kills the subprocess.
+# Budget for the parallel shard warmup (spawn + transformers import +
+# tokenizer load per worker). A hung load (e.g. a stuck network filesystem)
+# must become a bounded startup error, not wedge service startup — and the
+# error must fire before the parent's 30 s service-launch budget kills the
+# subprocess, so the diagnostic wins the race.
 _SHARD_WARMUP_TIMEOUT_S = 25.0
 
-# Per-flush ceiling for the LIVE lane: bounds the lock-hold time and the
-# unstoppable in-flight encode a drain-start cancellation leaves behind.
-# The drain has no ceiling — it always takes the whole buffer.
+# Per-flush ceiling for the LIVE lane. Bounds three things at once: how long
+# the queue lock is held mid-run, how much work an unstoppable in-flight
+# thread encode can hold after a drain-start cancellation, and how much the
+# drain re-encodes for items the cancelled flush gave back. The drain has no
+# ceiling — it always takes the whole buffer.
 _LIVE_FLUSH_MAX_ITEMS = 1024
 
 # Minimal user message used to satisfy chat templates that reject assistant-only
@@ -197,11 +204,10 @@ def __init__(
         n_workers: int = -1,
     ) -> None:
         self._tokenizer_name = tokenizer_name
-        # Cap this process's rayon pool so a live (in-process) batched encode
-        # uses ~live_workers cores, not the whole machine. Must be set before
-        # the first encode initializes the pool; setdefault lets an
-        # operator-exported RAYON_NUM_THREADS win. live_workers has no
-        # default — the one default lives in config/schema.py.
+        # The live lane runs in-process: cap this process's rayon pool so a
+        # mid-run batched encode uses ~live_workers cores, not the whole
+        # machine. Must be set before the first encode initializes the pool;
+        # setdefault lets an operator-exported RAYON_NUM_THREADS win.
         os.environ.setdefault("RAYON_NUM_THREADS", str(max(1, live_workers)))
         self._live_workers = live_workers
         self._fallback_warned: set[str] = set()
@@ -214,9 +220,10 @@ def __init__(
             max_workers=max(1, live_workers), thread_name_prefix="tok-thread"
         )
         self._load_tokenizer()  # also computes the chat-template baseline
-        # Process shards for the drain. Empty only when in-process mode was
-        # explicitly requested (n_workers=0 / cores_per_worker<=0, test-only
-        # seams — production wiring always auto-sizes).
+        # Process shards for the batched text path. Empty only when
+        # in-process mode was explicitly requested (n_workers=0 or
+        # cores_per_worker<=0; ctor overrides used primarily by tests —
+        # production wiring passes live_workers only and shards auto-size).
         self._procs: list[ProcessPoolExecutor] = []
         self._setup_shards(cores_per_worker, n_workers)
 
@@ -258,11 +265,14 @@ def _load_tokenizer(self) -> None:
     def _setup_shards(self, cores_per_worker: int, n_workers: int) -> None:
         """Spawn one pinned single-worker process per core block.
 
-        ``n_workers == 0`` selects in-process tokenization; auto (``< 0``)
-        fits one shard per ``cores_per_worker`` block (at least one); an
-        explicit count is clamped to capacity. An environment that cannot
-        shard — no fast Rust backend, a failed or over-budget warmup —
-        raises instead of degrading to a slow path.
+        ``n_workers == 0`` explicitly selects in-process tokenization. Auto
+        (``< 0``) fits one shard per ``cores_per_worker`` block of this
+        process's affinity mask (or the online CPU count when the platform
+        has no affinity API — shards then run unpinned), always at least one;
+        an explicit count is clamped to that capacity. An environment that
+        cannot shard — no fast Rust backend, a warmup that fails or exceeds
+        its budget — raises instead of silently degrading to a slow path
+        that cannot keep up with completions.
         """
         if cores_per_worker <= 0 or n_workers == 0:
             logger.info("BatchTokenizer: in-process tokenization (explicit)")
@@ -273,9 +283,12 @@ def _setup_shards(self, cores_per_worker: int, n_workers: int) -> None:
                 "backend; token metrics require one to keep up with "
                 "completions. Use a fast tokenizer, or disable token metrics."
             )
-        # Probe the full allowed CPU universe (cgroup-clamped) for the block
-        # math, then restore the inherited mask: the aggregator stays where
-        # the parent placed it; only the drain shards span the machine.
+        # Probe the full allowed CPU universe (cgroup-clamped) for the shard
+        # block math, then restore this process's inherited mask: the
+        # aggregator's event loop, publisher, and live tokenizer threads stay
+        # exactly where the parent placed them (the loadgen mask on a pinned
+        # Linux run). Only the drain-phase shard processes, pinned to their
+        # own blocks, span the whole machine.
         try:
             original = os.sched_getaffinity(0)
         except (OSError, AttributeError):
@@ -312,9 +325,11 @@ def _setup_shards(self, cores_per_worker: int, n_workers: int) -> None:
                     initargs=(self._tokenizer_name, block),
                 )
                 procs.append(ex)
-            # Warm all shards in parallel (submit-then-await; awaiting each
-            # before the next would serialize N tokenizer loads). Bounded:
-            # one hung load must not wedge startup.
+            # Force spawn + pin + tokenizer-load now (not on the first batch).
+            # Submit to every shard first so the loads run in parallel, then
+            # await — waiting on each before submitting the next would
+            # serialize every tokenizer load and can exceed the launch budget.
+            # The wait is bounded: one hung load must not wedge startup.
             ready = [ex.submit(_worker_ready, 0) for ex in procs]
             deadline = time.monotonic() + _SHARD_WARMUP_TIMEOUT_S
             for f in ready:
@@ -492,12 +507,16 @@ async def token_count_message_async(
 class TokenBatchQueue:
     """Buffers per-sample tokenization work and clears it in batches.
 
-    Triggers call ``enqueue_text`` / ``enqueue_message`` at event time with
-    an ``on_count`` recorder callback. The queue owns its flush cadence:
-    ``start_live`` flushes periodically through the tokenizer's bounded live
-    lane; ``flush_remaining`` drains everything left at end-of-run through
-    every shard. ``pending`` is the snapshot's ``n_pending_tasks`` —
-    non-zero in the final snapshot means an incomplete drain.
+    Triggers call ``enqueue_text`` / ``enqueue_message`` at event time with an
+    ``on_count`` callback that records the resulting metric. The queue owns
+    its own flush cadence: ``start_live`` begins a periodic flush through the
+    tokenizer's bounded live lane (so live ISL/OSL/TPOT stay current without
+    touching the benchmark's cores), and ``flush_remaining`` drains everything
+    left at end-of-run through every shard.
+
+    ``pending`` counts enqueued-but-not-yet-recorded items; it is the
+    ``n_pending_tasks`` on the snapshot. A non-zero value in the final snapshot
+    means the end-of-run flush did not finish within the drain budget or failed.
     """
 
     def __init__(
@@ -558,12 +577,16 @@ def enqueue_message(
     async def flush(self, live: bool = False) -> None:
         """Tokenize everything buffered so far and run each ``on_count``.
 
-        ``live=True`` routes text through the tokenizer's bounded live lane,
-        takes at most ``_LIVE_FLUSH_MAX_ITEMS`` per kind, and re-queues items
-        on failure or cancellation — a mid-run hiccup never loses samples.
-        Drain-mode failures are terminal: un-recorded items stay counted in
-        ``pending`` and surface as an incomplete drain. Items are detached up
-        front so concurrent enqueues land in the next flush.
+        ``live=True`` routes text batches through the tokenizer's bounded
+        live lane instead of the full shard pool, takes at most
+        ``_LIVE_FLUSH_MAX_ITEMS`` per kind (bounding lock-hold time and the
+        unstoppable in-flight encode a drain-start cancellation leaves
+        behind), and re-queues items on failure or cancellation so a mid-run
+        hiccup never loses samples — the end-of-run drain retries them. Drain-mode
+        failures are terminal: the un-recorded items stay counted in ``pending``
+        (``_inflight`` is decremented only after a callback runs) and surface as an
+        incomplete drain, not as silently dropped samples. Items are detached from
+        the buffer up front so concurrent enqueues land in the next flush.
         """
         async with self._lock:
             if not (self._text or self._msg):

From 8321d835183ff80ecbcece8a963aa768ff3bb09a Mon Sep 17 00:00:00 2001
From: Viraat Chandra <viraatc@nvidia.com>
Date: Wed, 10 Jun 2026 20:42:28 -0700
Subject: [PATCH 18/20] fix(metrics): raise metrics drain-timeout default to
 300s

A 1M-sample run holds ~2M deferred tokenizations at ENDED; the drain
fans the whole buffer into one encode_batch per shard, so a 60s budget
expires before any chunk returns and the entire backlog is dropped.
300s covers 1M-sample runs with headroom.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 AGENTS.md                                                     | 2 +-
 .../async_utils/services/metrics_aggregator/aggregator.py     | 2 +-
 .../async_utils/services/metrics_aggregator/snapshot.py       | 2 +-
 src/inference_endpoint/config/schema.py                       | 4 ++--
 .../config/templates/concurrency_template_full.yaml           | 2 +-
 .../config/templates/offline_template_full.yaml               | 2 +-
 .../config/templates/online_template_full.yaml                | 2 +-
 tests/unit/commands/test_benchmark.py                         | 2 +-
 8 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/AGENTS.md b/AGENTS.md
index c9e0c7f41..56bd7b1d1 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -115,7 +115,7 @@ The aggregator is a separate process (`python -m inference_endpoint.async_utils.
 
 - **Series storage**: each `SeriesSampler` keeps three parallel views: O(1) cheap rollups (count/total/min/max/sum_sq, exact), an HDR Histogram (cheap live percentiles), and an in-memory `array.array` of raw values (for exact percentiles in the `COMPLETE` snapshot). Hot path is `registry.record(name, value)` — no allocation, no I/O.
 - **Counter API**: `registry.increment(name, delta=1)` for sample-event counters. `registry.set_counter(name, value)` only for the two duration counters (`total_duration_ns` max-of-elapsed, `tracked_duration_ns` sum-of-blocks).
-- **Lifecycle**: `INITIALIZE` (constructed, awaiting first `STARTED`) → `LIVE` (run in progress, ticking every `--publish-interval` seconds) → `DRAINING` (set on `ENDED`; tick continues; bounded by the `--drain-timeout` budget — schema default 60 s) → terminal: `COMPLETE` (clean end via `publish_final`, exact stats) **or** `INTERRUPTED` (signal-handler-triggered final via SIGTERM/SIGINT; best-effort partial stats). Drain timeout detected by consumers as `state == COMPLETE and n_pending_tasks > 0`; interrupted runs are detected as `state == INTERRUPTED` directly.
+- **Lifecycle**: `INITIALIZE` (constructed, awaiting first `STARTED`) → `LIVE` (run in progress, ticking every `--publish-interval` seconds) → `DRAINING` (set on `ENDED`; tick continues; bounded by the `--drain-timeout` budget — schema default 300 s) → terminal: `COMPLETE` (clean end via `publish_final`, exact stats) **or** `INTERRUPTED` (signal-handler-triggered final via SIGTERM/SIGINT; best-effort partial stats). Drain timeout detected by consumers as `state == COMPLETE and n_pending_tasks > 0`; interrupted runs are detected as `state == INTERRUPTED` directly.
 - **Final delivery is dual-path with separated concerns**: `publish_final` atomically writes `final_snapshot.json` (`tmp + fsync(file) + rename + fsync(parent_dir)`) — this is the **primary** Report source — AND emits the terminal-state snapshot over pub/sub as a TUI shutdown signal. Each path is wrapped in its own try/except so one failure cannot suppress the other. Main process consumer reads `final_snapshot.json` (via `json.loads` to dict, no Struct decode); falls back to the subscriber's `latest` live snapshot only if the file is missing (e.g. SIGKILL / OOM before the signal handler ran). The dict form is the canonical consumer contract (see `snapshot_to_dict`).
 - **Histogram bucket edges are dynamic per snapshot**: log-spaced over the observed `[min, max]`. Bucket count is fixed at construction; consumers MUST re-render from the snapshot's `(lo, hi, count)` triples each frame and MUST NOT track bucket-by-index across snapshots.
 
diff --git a/src/inference_endpoint/async_utils/services/metrics_aggregator/aggregator.py b/src/inference_endpoint/async_utils/services/metrics_aggregator/aggregator.py
index 64a31ee42..e7e8daf20 100644
--- a/src/inference_endpoint/async_utils/services/metrics_aggregator/aggregator.py
+++ b/src/inference_endpoint/async_utils/services/metrics_aggregator/aggregator.py
@@ -124,7 +124,7 @@ def __init__(
     ):
         # drain_timeout_s is injected (not derived) because the right
         # value is workload-dependent: long-context tokenize-heavy runs
-        # need more headroom than the schema default 60 s, and the
+        # need more headroom than the schema default 300 s, and the
         # aggregator itself can't measure that ahead of time. Keeping it
         # as an arg lets the __main__ CLI flag plumb the user's choice
         # through without coupling this class to argparse.
diff --git a/src/inference_endpoint/async_utils/services/metrics_aggregator/snapshot.py b/src/inference_endpoint/async_utils/services/metrics_aggregator/snapshot.py
index a1e461c43..8046e1704 100644
--- a/src/inference_endpoint/async_utils/services/metrics_aggregator/snapshot.py
+++ b/src/inference_endpoint/async_utils/services/metrics_aggregator/snapshot.py
@@ -45,7 +45,7 @@ class SessionState(str, Enum):
     LIVE        → run in progress; tick task publishing live HDR-derived stats.
     DRAINING    → ``SessionEventType.ENDED`` has been received; the aggregator
                   is tokenizing the buffered samples (bounded by the
-                  ``--drain-timeout`` budget — schema default 60 s). Tick task
+                  ``--drain-timeout`` budget — schema default 300 s). Tick task
                   continues at this stage, still HDR-derived; no new events
                   will arrive.
     COMPLETE    → terminal clean state. The ``publish_final()`` snapshot
diff --git a/src/inference_endpoint/config/schema.py b/src/inference_endpoint/config/schema.py
index f093a547a..19447dd2b 100644
--- a/src/inference_endpoint/config/schema.py
+++ b/src/inference_endpoint/config/schema.py
@@ -584,11 +584,11 @@ class DrainConfig(BaseModel):
             ),
         ),
     ] = Field(
-        60.0,
+        300.0,
         ge=0,
         description=(
             "Wall-clock budget (seconds) to finish tokenizing buffered samples "
-            "after ENDED (default: 60.0; 0 = unlimited)."
+            "after ENDED (default: 300.0; 0 = unlimited)."
         ),
     )
     metrics_tokenizer_workers: Annotated[
diff --git a/src/inference_endpoint/config/templates/concurrency_template_full.yaml b/src/inference_endpoint/config/templates/concurrency_template_full.yaml
index 30b224402..42c449d1d 100644
--- a/src/inference_endpoint/config/templates/concurrency_template_full.yaml
+++ b/src/inference_endpoint/config/templates/concurrency_template_full.yaml
@@ -79,7 +79,7 @@ settings:
     warmup_timeout_s: 240.0  # Warmup drain timeout in seconds (None = wait indefinitely)
     performance_timeout_s: 240.0  # Performance drain timeout in seconds (None = wait indefinitely)
     accuracy_timeout_s: null  # Accuracy drain timeout in seconds (None = wait indefinitely)
-    metrics_drain_timeout_s: 60.0  # Wall-clock budget (seconds) to finish tokenizing buffered samples after ENDED (default: 60.0; 0 = unlimited).
+    metrics_drain_timeout_s: 300.0  # Wall-clock budget (seconds) to finish tokenizing buffered samples after ENDED (default: 300.0; 0 = unlimited).
     metrics_tokenizer_workers: 2  # In-process tokenizer threads for live (mid-run) ISL/OSL/TPOT (default: 2; 0 = defer everything to the end-of-run drain).
   warmup:
     enabled: false  # Enable warmup phase before performance run
diff --git a/src/inference_endpoint/config/templates/offline_template_full.yaml b/src/inference_endpoint/config/templates/offline_template_full.yaml
index ae0ae939c..b5f4f5a23 100644
--- a/src/inference_endpoint/config/templates/offline_template_full.yaml
+++ b/src/inference_endpoint/config/templates/offline_template_full.yaml
@@ -79,7 +79,7 @@ settings:
     warmup_timeout_s: 240.0  # Warmup drain timeout in seconds (None = wait indefinitely)
     performance_timeout_s: 240.0  # Performance drain timeout in seconds (None = wait indefinitely)
     accuracy_timeout_s: null  # Accuracy drain timeout in seconds (None = wait indefinitely)
-    metrics_drain_timeout_s: 60.0  # Wall-clock budget (seconds) to finish tokenizing buffered samples after ENDED (default: 60.0; 0 = unlimited).
+    metrics_drain_timeout_s: 300.0  # Wall-clock budget (seconds) to finish tokenizing buffered samples after ENDED (default: 300.0; 0 = unlimited).
     metrics_tokenizer_workers: 2  # In-process tokenizer threads for live (mid-run) ISL/OSL/TPOT (default: 2; 0 = defer everything to the end-of-run drain).
   warmup:
     enabled: false  # Enable warmup phase before performance run
diff --git a/src/inference_endpoint/config/templates/online_template_full.yaml b/src/inference_endpoint/config/templates/online_template_full.yaml
index 9c5c62842..4271ff792 100644
--- a/src/inference_endpoint/config/templates/online_template_full.yaml
+++ b/src/inference_endpoint/config/templates/online_template_full.yaml
@@ -79,7 +79,7 @@ settings:
     warmup_timeout_s: 240.0  # Warmup drain timeout in seconds (None = wait indefinitely)
     performance_timeout_s: 240.0  # Performance drain timeout in seconds (None = wait indefinitely)
     accuracy_timeout_s: null  # Accuracy drain timeout in seconds (None = wait indefinitely)
-    metrics_drain_timeout_s: 60.0  # Wall-clock budget (seconds) to finish tokenizing buffered samples after ENDED (default: 60.0; 0 = unlimited).
+    metrics_drain_timeout_s: 300.0  # Wall-clock budget (seconds) to finish tokenizing buffered samples after ENDED (default: 300.0; 0 = unlimited).
     metrics_tokenizer_workers: 2  # In-process tokenizer threads for live (mid-run) ISL/OSL/TPOT (default: 2; 0 = defer everything to the end-of-run drain).
   warmup:
     enabled: false  # Enable warmup phase before performance run
diff --git a/tests/unit/commands/test_benchmark.py b/tests/unit/commands/test_benchmark.py
index 68498c9b4..e47def8f0 100644
--- a/tests/unit/commands/test_benchmark.py
+++ b/tests/unit/commands/test_benchmark.py
@@ -489,7 +489,7 @@ def test_defaults(self):
         assert cfg.warmup_timeout_s == 240.0
         assert cfg.performance_timeout_s == 240.0
         assert cfg.accuracy_timeout_s is None
-        assert cfg.metrics_drain_timeout_s == 60.0
+        assert cfg.metrics_drain_timeout_s == 300.0
 
     @pytest.mark.unit
     @pytest.mark.parametrize(

From f1ac948956b84d3d074f0d02bbe9ba255f5260bc Mon Sep 17 00:00:00 2001
From: Viraat Chandra <viraatc@nvidia.com>
Date: Mon, 15 Jun 2026 21:41:59 -0700
Subject: [PATCH 19/20] test(metrics): pass now-required aggregator args in
 signal-handling test
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The aggregator entrypoint now requires --drain-timeout and
--tokenizer-workers (single-sourced from the schema). The signal-handling
integration test spawns the subprocess directly and still omitted them, so
argparse exited the process (code 2) before any signal handler was
installed. Pass both: --tokenizer-workers 0 (no tokenizer configured, so no
live tokenization) and a small --drain-timeout (never reached — the run is
signalled, not ENDED).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../services/metrics_aggregator/test_signal_handling.py    | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/tests/integration/async_utils/services/metrics_aggregator/test_signal_handling.py b/tests/integration/async_utils/services/metrics_aggregator/test_signal_handling.py
index 010536c09..62db80b04 100644
--- a/tests/integration/async_utils/services/metrics_aggregator/test_signal_handling.py
+++ b/tests/integration/async_utils/services/metrics_aggregator/test_signal_handling.py
@@ -64,6 +64,13 @@ def _spawn_aggregator(
             metrics_socket,
             "--metrics-output-dir",
             str(output_dir),
+            # Required by the entrypoint, but inert here: no tokenizer is
+            # configured (so no live tokenization) and the run is signalled
+            # rather than ENDED, so the drain budget is never reached.
+            "--drain-timeout",
+            "5",
+            "--tokenizer-workers",
+            "0",
         ],
         # New process group so we can signal it without disturbing the
         # test runner.

From 70b39d9a547a6339b776d433c0297b34c2a75ff0 Mon Sep 17 00:00:00 2001
From: Viraat Chandra <viraatc@nvidia.com>
Date: Mon, 15 Jun 2026 22:23:01 -0700
Subject: [PATCH 20/20] chore(deps): bump aiohttp 3.14.0 -> 3.14.1 (fixes 8
 CVEs)

pip-audit flagged aiohttp 3.14.0 for CVE-2026-54273..54280 (8 advisories),
all fixed in 3.14.1. aiohttp is a test-only dependency (mock-server
fixture); production uses the custom httptools client. uv.lock regenerated
to match; uv run pip-audit now reports no known vulnerabilities.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 pyproject.toml |  2 +-
 uv.lock        | 76 +++++++++++++++++++++++++-------------------------
 2 files changed, 39 insertions(+), 39 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 0b0f67a86..4a7655021 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -112,7 +112,7 @@ test = [
     "Pympler==1.1",
     "scipy==1.17.1",
     # HTTP server and client for mock server fixture
-    "aiohttp==3.14.0",
+    "aiohttp==3.14.1",
     # Plotting for benchmark sweep mode
     "matplotlib==3.10.8",
     # Property-based testing (CLI fuzz)
diff --git a/uv.lock b/uv.lock
index bfdb3b236..984581b6b 100644
--- a/uv.lock
+++ b/uv.lock
@@ -29,7 +29,7 @@ wheels = [
 
 [[package]]
 name = "aiohttp"
-version = "3.14.0"
+version = "3.14.1"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "aiohappyeyeballs", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'x86_64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" },
@@ -41,42 +41,42 @@ dependencies = [
     { name = "typing-extensions", marker = "(python_full_version < '3.13' and platform_machine == 'arm64' and sys_platform == 'darwin') or (python_full_version < '3.13' and platform_machine == 'x86_64' and sys_platform == 'darwin') or (python_full_version < '3.13' and platform_machine == 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.13' and platform_machine == 'x86_64' and sys_platform == 'linux')" },
     { name = "yarl", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'x86_64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/ee/ab/93ce242f899b68c51b0578c027aafa791ab3614cb9345fa5d37b5f5c8e3e/aiohttp-3.14.0.tar.gz", hash = "sha256:2882de819734c715fd1b9c11c97e09fa020d14438203d1d354d8ed1702791c9b", size = 7940674, upload-time = "2026-06-01T19:41:02.763Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/89/97/2b6889bfb6b6847520d50d95eb8c4307a45e28aaca39faf4a9454b3d1b2f/aiohttp-3.14.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:b29518c9c2ec7e373e68259206a137c7f4f5439c58baaec4b5ab3ab799850a4e", size = 750194, upload-time = "2026-06-01T19:37:48.164Z" },
-    { url = "https://files.pythonhosted.org/packages/21/e2/62634b7fff918ed98c3c6b2f0e70d520f7f28846cb412d451b04354c6459/aiohttp-3.14.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:dbec68ce61b64cb73cab4d33df9433427b1713c8bcccb181dce695c1b6f8e87c", size = 506966, upload-time = "2026-06-01T19:37:50.014Z" },
-    { url = "https://files.pythonhosted.org/packages/dd/fb/5ce075150828c797a5106f1c2fb26034e709d4289b9d2bf8b07f1e59fac6/aiohttp-3.14.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:3cdf534aa455593e589302990c5097aa5c92c06c4262a20da22934f9186a5fff", size = 507527, upload-time = "2026-06-01T19:37:51.96Z" },
-    { url = "https://files.pythonhosted.org/packages/01/d5/405a0ae4e6b081754a3609c1c97c63a950e000a2def16046f1e736933a0e/aiohttp-3.14.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:cb6c657104393b5fbff01a5f59b2023db74058a8077d94475d6c25d03882a108", size = 1762420, upload-time = "2026-06-01T19:37:53.839Z" },
-    { url = "https://files.pythonhosted.org/packages/19/d8/51de5c6b971c27bb1ef620293b8d1ca611ec78736b34b3f6ccf68e4c8785/aiohttp-3.14.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:78d6f9286a629ce52728430afe18f8ed2b6c39a1fddb3802d7244b9983910ad2", size = 1783112, upload-time = "2026-06-01T19:38:02.641Z" },
-    { url = "https://files.pythonhosted.org/packages/bc/05/750a3265ca4dc54a460bd0cb1121a8f2ce9171fce4a135fb47ea7fd594d2/aiohttp-3.14.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:4d6a998191f5ebe3b8c28463ff72bc030250008b3193c402464efadd08b5ca02", size = 1723119, upload-time = "2026-06-01T19:38:06.713Z" },
-    { url = "https://files.pythonhosted.org/packages/a8/fb/05d9214c975f23225a8cd5c439325e338c7c377b315480ef3871db51f54e/aiohttp-3.14.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:5ba10966d4f03dd96a14365be4b8e37c327c76f11c3ca867116966cdd9f98066", size = 1760193, upload-time = "2026-06-01T19:38:17.624Z" },
-    { url = "https://files.pythonhosted.org/packages/11/41/cc2d2cfbfbdc3126ba258f3cd27d1ac8a33492ae3c35a4583ee21f0ba7f1/aiohttp-3.14.0-cp313-cp313-ios_13_0_arm64_iphoneos.whl", hash = "sha256:3366751d68d237c621264233a32f3078bbc21b7904ab90a77e03d21390c742c6", size = 481670, upload-time = "2026-06-01T19:38:29.836Z" },
-    { url = "https://files.pythonhosted.org/packages/3c/07/381f4023c3b08cb616e520f566d8c58957abad54e56441d41fe67cfb0195/aiohttp-3.14.0-cp313-cp313-ios_13_0_arm64_iphonesimulator.whl", hash = "sha256:57ea07d28695a7a40304d42251892a8df765e5588c10ee32afeddcd5df33c0a2", size = 487591, upload-time = "2026-06-01T19:38:31.704Z" },
-    { url = "https://files.pythonhosted.org/packages/fb/4d/4506fdb7a022bdf70011a3bbb4ca00c5c570026ef6a3c5bd7bc70c39089c/aiohttp-3.14.0-cp313-cp313-ios_13_0_x86_64_iphonesimulator.whl", hash = "sha256:076cb014191ae2e65d949e1ad01f1dcfe33e32789b5172510f3e79c79fc04d50", size = 496503, upload-time = "2026-06-01T19:38:33.6Z" },
-    { url = "https://files.pythonhosted.org/packages/ef/7d/c814111e04894a45d9e2defc94443879a6f118d9633d5fedfe6e2e8af5f0/aiohttp-3.14.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:2f3fc37054564dee64a855b5b092d87ec35dcddfaabf7dacb1c8a2b1f83dc0a9", size = 745870, upload-time = "2026-06-01T19:38:36.013Z" },
-    { url = "https://files.pythonhosted.org/packages/c6/ee/80eee0efddfe187e7cd05027086b7ce1c0e492e82a4eda58f5c5543a44a0/aiohttp-3.14.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:8fcaef74d2ab0f607d7ff85a0d15e21bb5a258c4a58df1908396eb50d7f4ed3c", size = 505588, upload-time = "2026-06-01T19:38:38.282Z" },
-    { url = "https://files.pythonhosted.org/packages/d6/f8/0f28f04eef75d52fc9c715dde7ce9c0abb810fd20cfeb0fea7afd2ab1e98/aiohttp-3.14.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:e4c01b0bfc6209590960e68eac083cd22d5d87c21f974dd6208cafa5d3542bc8", size = 504492, upload-time = "2026-06-01T19:38:40.611Z" },
-    { url = "https://files.pythonhosted.org/packages/ff/db/44c755232085545065c94378dfce38641b1aee647f4939fcd32f5b32e719/aiohttp-3.14.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f12eb7896e81caf403a2b18c9406426f1207361e7239c057ab29c076d4257e83", size = 1752111, upload-time = "2026-06-01T19:38:42.682Z" },
-    { url = "https://files.pythonhosted.org/packages/c5/a3/3800dbd095cb2bb165a7ea5d94d790914677e27f45638c7d80e3f34c8945/aiohttp-3.14.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:26d9224c6dd7f5c749aba4f61315a894601448b28d94d12f4dea0903e26d2096", size = 1777241, upload-time = "2026-06-01T19:38:52.04Z" },
-    { url = "https://files.pythonhosted.org/packages/b4/3d/dc94df99ed1511fdf28314f722643ed334112643cab00223577085e788c4/aiohttp-3.14.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:23e8314e7aed8576fbe33314d218bd81447a3adbc91dc36f1163bf583cd3084c", size = 1714864, upload-time = "2026-06-01T19:38:56.788Z" },
-    { url = "https://files.pythonhosted.org/packages/fa/10/ab28818262f4d26bdb47ed5f1fc7999b69e2fc6e0370b02d0f49011f45ea/aiohttp-3.14.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:666c7c5036df57b693026398b69b41874a1931ac5b3485fd910e57bfac253869", size = 1754516, upload-time = "2026-06-01T19:39:08.788Z" },
-    { url = "https://files.pythonhosted.org/packages/1a/fe/6edbf5d39bf29322b6816365b17ed8ede4dace164a3aea1abcd30110eb78/aiohttp-3.14.0-cp314-cp314-ios_13_0_arm64_iphoneos.whl", hash = "sha256:70ea956f6cc4a37620966b56c2e205d88ca3e6d85ec063277e414b1035cddad3", size = 483329, upload-time = "2026-06-01T19:39:22.607Z" },
-    { url = "https://files.pythonhosted.org/packages/1b/5a/fae531bdbc6456fb6241f46b7b81e4d8a0dd3fc09118a0055dc7141ac1ec/aiohttp-3.14.0-cp314-cp314-ios_13_0_arm64_iphonesimulator.whl", hash = "sha256:ea3b9806c89f61da22fddf1f12dd524fb368e5e28f1261fbdafe5c3cd8ce893b", size = 489502, upload-time = "2026-06-01T19:39:24.881Z" },
-    { url = "https://files.pythonhosted.org/packages/36/f4/48a7b0414db7fed77a03d5dde34508c026afd83510ab6bca08c313855776/aiohttp-3.14.0-cp314-cp314-ios_13_0_x86_64_iphonesimulator.whl", hash = "sha256:a071be341c2bd9b0188e62d173509f024e0a35b1c342c53c50f8daaeda8c3bd8", size = 497357, upload-time = "2026-06-01T19:39:27.197Z" },
-    { url = "https://files.pythonhosted.org/packages/75/75/e85a13a370acc007fca5feb1fd1b88ac2d8426e6dadd625479b7cadd55a3/aiohttp-3.14.0-cp314-cp314-macosx_10_15_universal2.whl", hash = "sha256:198cfe61bf253b19da1fb3e0fa122249dc4f14c12709493fed8054aa0411cc76", size = 750898, upload-time = "2026-06-01T19:39:29.563Z" },
-    { url = "https://files.pythonhosted.org/packages/9e/e4/3d637f800c724eff0e2bed64df72557444482366fd0a35b0cec0e6968f6c/aiohttp-3.14.0-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:9dc203d6ce6b9106d54e2a93f41dfdfebfbca2d99962ba503bfd3e5921a6549e", size = 506986, upload-time = "2026-06-01T19:39:31.872Z" },
-    { url = "https://files.pythonhosted.org/packages/1d/df/35161f3598bf7501d2b2a805b41ab4f45a2e34150c421bcb4ef8c0d281a7/aiohttp-3.14.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:9e19d17ab02bf16832a2c8c0d55a486792c5b1645665652ee9531aebcc30cb72", size = 508033, upload-time = "2026-06-01T19:39:34.137Z" },
-    { url = "https://files.pythonhosted.org/packages/e5/39/b36e5d3d31e850fb4691dd3e941684ac490a2559249f6fa634b6b0fdf020/aiohttp-3.14.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d925fba0c14d5b498a8028b0107beebdfd16c5d48d702ff54f879cb017aaaca3", size = 1746213, upload-time = "2026-06-01T19:39:36.654Z" },
-    { url = "https://files.pythonhosted.org/packages/3a/05/27df32c844b2156e1675a8d8ec22d963e3c8ba469ed7ceb1863320c7b521/aiohttp-3.14.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ff82be7f1ef73634cb77890a770743239bc3d487b848669be1c599889336dc0a", size = 1751659, upload-time = "2026-06-01T19:39:46.398Z" },
-    { url = "https://files.pythonhosted.org/packages/66/e3/53c67097e8a5ce98625e91e3fa7f43c9c6940de680345d03b3509a72a078/aiohttp-3.14.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:edc01ea4e1ec5a1649a28866262bf24195889ff7b27bdd947029a6086741de9b", size = 1710090, upload-time = "2026-06-01T19:39:51.392Z" },
-    { url = "https://files.pythonhosted.org/packages/b8/69/155c4ef3aec96417d47024800472b33b16c5d8a665371dcd044c2afdf25d/aiohttp-3.14.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:26b6d79aa54cb4ed50cc7d41ed14e99e0f1fc8e7c2d42f2e05b37aea897b2b52", size = 1733716, upload-time = "2026-06-01T19:40:03.631Z" },
-    { url = "https://files.pythonhosted.org/packages/12/34/6180103ce9aabc8ebff3f7bb55a1228ffe60f61042823031d9692cb7b101/aiohttp-3.14.0-cp314-cp314t-macosx_10_15_universal2.whl", hash = "sha256:6aa1a40f9cbb3da9f80714c5966b8946c21e6a2530d809b9498b33161e3c8733", size = 787878, upload-time = "2026-06-01T19:40:13.401Z" },
-    { url = "https://files.pythonhosted.org/packages/92/e9/08954a40e8b7baa3d8beadd2b074b186e9b1e9c8ddabc288678a6265de50/aiohttp-3.14.0-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:b62af5a8cc96a194eaa01a9ed7b34a3ffa58d3d8daaa1a0d7a749353ad12d228", size = 524400, upload-time = "2026-06-01T19:40:15.972Z" },
-    { url = "https://files.pythonhosted.org/packages/08/6a/b5965a634ac4d5ba99a463314cf4ab214ca073fcdc38a15e0294273701fc/aiohttp-3.14.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:6eb63b1417efaf7d1002a6ad034a40d44376afcc16508a57f8e74b49ad26a095", size = 527904, upload-time = "2026-06-01T19:40:18.28Z" },
-    { url = "https://files.pythonhosted.org/packages/06/b4/932bcdd850c354d9bcca30f360e475d7852e30413fbbd44b182782ed5432/aiohttp-3.14.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c20b9ad156a79eb97be5cf9e069eec01d2f0dc8472ffbd75299a8b2d4c2cbbde", size = 1912162, upload-time = "2026-06-01T19:40:20.825Z" },
-    { url = "https://files.pythonhosted.org/packages/d0/1c/a57de71a4508c93a830b77c28af3d08cd97f606dedfc6b94275347744508/aiohttp-3.14.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:145262119b07d7f95abc1839add35ba2bfc84551d4b4660ca11542c0b215455b", size = 1868606, upload-time = "2026-06-01T19:40:31.843Z" },
-    { url = "https://files.pythonhosted.org/packages/35/1e/c237923232c7da7f0392ea25d89fc5e60c0e93f685f4ebca8e7bcdd5271c/aiohttp-3.14.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:2cc736a9c9fc2bc4dd71fd404815741b6573df27c3f985948ec4076989ac57de", size = 1834090, upload-time = "2026-06-01T19:40:37.733Z" },
-    { url = "https://files.pythonhosted.org/packages/cc/bc/2aaab2f85cadb26ea59c091fa2b8e370d625154b5c14b478f1b489d07551/aiohttp-3.14.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:6199707cc40e0e9cd39c36fbc97bec416c704e1d0ddce03412bb3b3e6a90ccd0", size = 1832281, upload-time = "2026-06-01T19:40:52.303Z" },
+sdist = { url = "https://files.pythonhosted.org/packages/82/78/8ea7308cac6934de8c74a14f3d5f65d1c89287426688be79538d0e5c013d/aiohttp-3.14.1.tar.gz", hash = "sha256:307f2cff90a764d329e77040603fa032db89c5c24fdad50c4c15334cba744035", size = 7955794, upload-time = "2026-06-07T21:09:35.529Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/1d/21/151624b51cd92553d95424daf4bf19f19ce9be9002d19253e7e7ce67197b/aiohttp-3.14.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:d35143e27778b4bb0fb189562d7f275bff79c62ab8e98459717c0ea617ff2480", size = 757402, upload-time = "2026-06-07T21:06:40.311Z" },
+    { url = "https://files.pythonhosted.org/packages/c2/82/280619e0bd7bf2454987e19282616e84762255dd9c8468f62382e8c191f1/aiohttp-3.14.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:bcfb80a2cc36fba2534e5e5b5264dc7ae6fcd9bf15256da3e53d2f499e6fa29d", size = 512310, upload-time = "2026-06-07T21:06:42.207Z" },
+    { url = "https://files.pythonhosted.org/packages/55/b2/2aac325583aaa1353045f96dffa586d8a34e8322e14a7ba49cffeb103ab4/aiohttp-3.14.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:27fd7c91e51729b4f7e1577865fa6d34c9adccbc39aabe9000285b48af9f0ec2", size = 512448, upload-time = "2026-06-07T21:06:43.813Z" },
+    { url = "https://files.pythonhosted.org/packages/8a/72/a60607cb849faa8af8a356c9329ea2eb6f395d49e82cc82ccba1fd8deb8f/aiohttp-3.14.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:64c567bf9eaf664280116a8688f63016e6b32db2505908e2bdaca1b6438142f2", size = 1766854, upload-time = "2026-06-07T21:06:45.391Z" },
+    { url = "https://files.pythonhosted.org/packages/20/9c/d445818389df371f56d141d881153ba23183c4735a03f7356ffb43f7757d/aiohttp-3.14.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3e6fc1a85fa7194a1a7d19f44e8609180f4a8eb5fa4c7ed8b4355f080fad235c", size = 1790278, upload-time = "2026-06-07T21:06:54.049Z" },
+    { url = "https://files.pythonhosted.org/packages/dc/b4/4dac0038960427ba832f6609dfb4ea5437d7fd80c72001b9e48f834f428b/aiohttp-3.14.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:c6fa4dc7ad6f8109c70bb1499e589f76b0b792baf39f9b017eb92c8a81d0a199", size = 1728397, upload-time = "2026-06-07T21:06:57.777Z" },
+    { url = "https://files.pythonhosted.org/packages/70/0a/e0075ce9ca0279ee1d4f0c0b85f54fea02ebc83c3007651a72bece658fec/aiohttp-3.14.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:6f71173be42d3241d428f760122febb748de0623f44308a6f120d0dd9ec572e3", size = 1767580, upload-time = "2026-06-07T21:07:07.873Z" },
+    { url = "https://files.pythonhosted.org/packages/fe/22/a73ccbf9dbd6e26dda0b24d5fd5db7da92ee3383a79f47677ffb834c5c5b/aiohttp-3.14.1-cp313-cp313-ios_13_0_arm64_iphoneos.whl", hash = "sha256:915fbb7b41b115192259f8c9ae58f3ddc444d2b5579917270211858e606a4afd", size = 485841, upload-time = "2026-06-07T21:07:19.555Z" },
+    { url = "https://files.pythonhosted.org/packages/3b/b9/57ed8eaf596321c2ad747bd480fb1700dbd7177c60dfc9e4c187f629662e/aiohttp-3.14.1-cp313-cp313-ios_13_0_arm64_iphonesimulator.whl", hash = "sha256:7fb4bdf95b0561a79f259f9d28fbc109728c5ee7f27aff6391f0ca703a329abe", size = 492088, upload-time = "2026-06-07T21:07:21.581Z" },
+    { url = "https://files.pythonhosted.org/packages/78/c0/5ebe5270a7c140d7c6f79dcb018640225f14d406c149e4eec04a7d82fe71/aiohttp-3.14.1-cp313-cp313-ios_13_0_x86_64_iphonesimulator.whl", hash = "sha256:1b9748363260121d2927704f5d4fc498150669ca3ae93625986ee89c8f80dcd4", size = 501564, upload-time = "2026-06-07T21:07:23.388Z" },
+    { url = "https://files.pythonhosted.org/packages/75/7f/8cdaa24fc7983865e0915153b96a9ac5bcdd3548d64c5a27d17cecccad2d/aiohttp-3.14.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:86a6dab78b0e43e2897a3bbe15745aa60dc5423ca437b7b0b164c069bf91b876", size = 751998, upload-time = "2026-06-07T21:07:25.046Z" },
+    { url = "https://files.pythonhosted.org/packages/b2/f4/c4227aacfacc5cb0cc2d119b65301d177912a6842cd64e120c47af76064f/aiohttp-3.14.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:4dfd6e47d3c44c2279907607f73a4240b88c69eb8b90da7e2441a8045dfd21da", size = 510918, upload-time = "2026-06-07T21:07:27.28Z" },
+    { url = "https://files.pythonhosted.org/packages/ab/01/a2d5f96cd4e74424864d30bc0a7e44d0a12dacdcfa91b5b2d1bd3dca6bf3/aiohttp-3.14.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:317acd9f8602858dc7d59679812c376c7f0b97bcbbf16e0d6237f54141d8a8a6", size = 508657, upload-time = "2026-06-07T21:07:29.252Z" },
+    { url = "https://files.pythonhosted.org/packages/e8/ed/3c0fb5c500fdd8e7ebc10d1889c04384fffa1a9163eac1356088ca9da1b1/aiohttp-3.14.1-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:bd869c427324e5cb15195793de951295710db28be7d818247f3097b4ab5d4b96", size = 1757907, upload-time = "2026-06-07T21:07:31.03Z" },
+    { url = "https://files.pythonhosted.org/packages/9d/6e/dbf1d0625dc711fb2851f4f3c3055c39ed58bae92082d8c627dbe6013736/aiohttp-3.14.1-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:faccab372e66bc76d5731525e7f1143c922271725b9d38c9f97edcc66266b451", size = 1783881, upload-time = "2026-06-07T21:07:39.063Z" },
+    { url = "https://files.pythonhosted.org/packages/2a/bd/cf9cee17e140f942a3de73e658a543aa8fbf35a5fc67a9d2538d52d77f0b/aiohttp-3.14.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:97e704dcd26271f5bda3fa07c3ce0fb76d6d3f8659f4baa1a24442cc9ba177ca", size = 1722137, upload-time = "2026-06-07T21:07:43.014Z" },
+    { url = "https://files.pythonhosted.org/packages/ba/45/4de841f005cfe1fd63e2a2fe011262c515e2a62aa6994b15947e7d717ac9/aiohttp-3.14.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:cb21957bb8aca671c1765e32f58164cf0c50e6bf41c0bbbd16da20732ecaf588", size = 1761094, upload-time = "2026-06-07T21:07:54.113Z" },
+    { url = "https://files.pythonhosted.org/packages/85/a5/9594ad6289eebbc97d167c44213d557807f90e59115caad24de21ad2c3b1/aiohttp-3.14.1-cp314-cp314-ios_13_0_arm64_iphoneos.whl", hash = "sha256:62a759436b29e677181a9e76bab8b8f689a29cb9c535f45f7c48c9c830d3f8c3", size = 487918, upload-time = "2026-06-07T21:08:06.377Z" },
+    { url = "https://files.pythonhosted.org/packages/b4/61/16a32c36c3c49edec122a3dc811f2057df2f94d3b14aa107c8017d981618/aiohttp-3.14.1-cp314-cp314-ios_13_0_arm64_iphonesimulator.whl", hash = "sha256:2964cbf553df4d7a57348da44d961d871895fc1ee4e8c322b2a95612c7b17fba", size = 494014, upload-time = "2026-06-07T21:08:08.263Z" },
+    { url = "https://files.pythonhosted.org/packages/9b/89/3ebcf96ed99c05bec9c434aaac6963fd3cbab4a786ae739908a144d9ce44/aiohttp-3.14.1-cp314-cp314-ios_13_0_x86_64_iphonesimulator.whl", hash = "sha256:237651caadc3a59badd39319c54642b5299e9cc98a3a194310e55d5bb9f5e397", size = 502398, upload-time = "2026-06-07T21:08:10.244Z" },
+    { url = "https://files.pythonhosted.org/packages/fd/3d/b74870a0c2d40c355928cd5b96c7a11fa821b8a40fc41365e64479b151fb/aiohttp-3.14.1-cp314-cp314-macosx_10_15_universal2.whl", hash = "sha256:896e12dfdbbab9d8f7e16d2b28c6769a60126fa92095d1ebf9473d02593a2448", size = 758018, upload-time = "2026-06-07T21:08:12.447Z" },
+    { url = "https://files.pythonhosted.org/packages/d3/66/f42f5c984d99e49c6cff5f26f590750f2e2f7ef1fcfb99966ab5be1b632e/aiohttp-3.14.1-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:d03f281ed22579314ba00821ce20115a7c0ac430660b4cc05704a3f818b3e004", size = 512462, upload-time = "2026-06-07T21:08:14.624Z" },
+    { url = "https://files.pythonhosted.org/packages/e9/a7/248e1aebe0c7810b0271e021a0f2a5eb6e78a051885b3c9df49f42a5802d/aiohttp-3.14.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:07eabb979d236335fed927e137a928c9adfb7df3b9ec7aa31726f133a62be983", size = 512824, upload-time = "2026-06-07T21:08:16.572Z" },
+    { url = "https://files.pythonhosted.org/packages/26/97/2aa0e5ba0727dc3bd5aaebb7ccbc510f7dfb7fb961ec87497cd496635ab1/aiohttp-3.14.1-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4fe1f1087cbadb280b5e1bb054a4f00d1423c74d6626c5e48400d871d34ecefe", size = 1749898, upload-time = "2026-06-07T21:08:18.635Z" },
+    { url = "https://files.pythonhosted.org/packages/a0/18/938441025db6769a3464596b2410af3afde0b21eb2f204c6f766f68af4bd/aiohttp-3.14.1-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:634e385930fb6d2d479cf3aa66515955863b77a5e3c2b5894ca259a25b308602", size = 1760329, upload-time = "2026-06-07T21:08:27.363Z" },
+    { url = "https://files.pythonhosted.org/packages/49/a2/2136674d52123b1354bd05dd5753c318db47dc0c927cc70b27bab3755456/aiohttp-3.14.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:335c0cc3e3545ce98dcb9cfcb836f40c3411f43fa03dab757597d80c89af8a35", size = 1714756, upload-time = "2026-06-07T21:08:32.094Z" },
+    { url = "https://files.pythonhosted.org/packages/c1/af/14bb5843eccbe234f4dfb78ab73e549d99727247e62ae5d62cbd22eaf5b0/aiohttp-3.14.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:6ffbb2f4ec1ceaff7e07d43922954da26b223d188bf30658e561b98e23089444", size = 1742574, upload-time = "2026-06-07T21:08:43.795Z" },
+    { url = "https://files.pythonhosted.org/packages/34/e3/19dbe1a1f4cc6230eb9e314de7fe68053b0992f9302b27d12141a0b5db53/aiohttp-3.14.1-cp314-cp314t-macosx_10_15_universal2.whl", hash = "sha256:819c054312f1af92947e6a55883d1b66feefab11531a7fc45e0fb9b63880b5c2", size = 793320, upload-time = "2026-06-07T21:08:52.775Z" },
+    { url = "https://files.pythonhosted.org/packages/7f/20/1b7182219ba1b108430d6e4dc53d25ae02dcfcf5a045b33af4e8c5167527/aiohttp-3.14.1-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:10ee9c1753a8f706345b22496c79fbddb5be0599e0823f3738b1534058e25340", size = 529077, upload-time = "2026-06-07T21:08:55Z" },
+    { url = "https://files.pythonhosted.org/packages/b9/c8/14ce60ec31a2e5f5274bb17d383a6f7a3aabca31ac04eee05585bbadab16/aiohttp-3.14.1-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:1601cc37baf5750ccacae618ec2daf020769581695550e3b654a911f859c563d", size = 532476, upload-time = "2026-06-07T21:08:57.176Z" },
+    { url = "https://files.pythonhosted.org/packages/7e/02/9ac85e081e53da2e061b02fa7758fe0a12d17b8ce2d1f5e6c7cb76730328/aiohttp-3.14.1-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4d6e0ac9da31c9c04c84e1c0182ad8d6df35965a85cae29cd71d089621b3ae94", size = 1922347, upload-time = "2026-06-07T21:08:59.563Z" },
+    { url = "https://files.pythonhosted.org/packages/66/4e/560c7472d3d198a23aa5c8b19a5115bf6a9b77b7d3e4bb363da320430ad2/aiohttp-3.14.1-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:fc0cacab7ba4e56f0f81c82a98c09bed2f39c940107b03a34b168bdf7597edd3", size = 1877095, upload-time = "2026-06-07T21:09:09.011Z" },
+    { url = "https://files.pythonhosted.org/packages/6a/c9/48255813cca749a229ef0ab476004ec623728ad79a9c0840616f6c076325/aiohttp-3.14.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:38e1e7daaea81df51c952e18483f323d878499a1e2bfe564790e0f9701d6f203", size = 1842922, upload-time = "2026-06-07T21:09:14.118Z" },
+    { url = "https://files.pythonhosted.org/packages/44/be/0474c5a8b5640e1e4aa1923430a91f4151be82e511373fe764189b89aef5/aiohttp-3.14.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:99abd37084b82f5830c635fddd0b4993b9742a66eb746dacf433c8590e8f9e3c", size = 1841409, upload-time = "2026-06-07T21:09:26.207Z" },
 ]
 
 [[package]]
@@ -858,7 +858,7 @@ test = [
 
 [package.metadata]
 requires-dist = [
-    { name = "aiohttp", marker = "extra == 'test'", specifier = "==3.14.0" },
+    { name = "aiohttp", marker = "extra == 'test'", specifier = "==3.14.1" },
     { name = "colorama", specifier = "==0.4.6" },
     { name = "coverage", marker = "extra == 'test'", specifier = "==7.13.4" },
     { name = "cyclopts", specifier = "==4.10.0" },