From 9ab6e838fd845482bf23bcdd578fb056e8009ef8 Mon Sep 17 00:00:00 2001 From: Viraat Chandra Date: Tue, 9 Jun 2026 13:32:42 -0700 Subject: [PATCH 01/20] perf(metrics): batch tokenization with defer-to-flush drain Replace the per-event async tokenize model (one asyncio task per sample's ISL/OSL/TPOT) with a deferred batch design that keeps tokenization ahead of completions on high-completion-rate runs, where the per-event tasks otherwise piled up faster than a single tokenizer thread could clear them and stretched the end-of-run drain. - BatchTokenizer: counts whole batches via the raw tokenizers backend (encode_batch_fast), sharded across worker processes each pinned to a disjoint CORES_PER_WORKER-core block so their rayon pools stay NUMA-local. Falls back to a single in-process thread when there is no fast backend or fewer than two core blocks fit. - TokenBatchQueue: triggers enqueue (text/message + a recorder callback) instead of spawning tasks; the buffer is tokenized in one sharded call at each publish tick (live ISL/OSL/TPOT) and once at end-of-run (flush_remaining, bounded by the drain budget). n_pending_tasks now counts un-tokenized items, preserving the Report "incomplete drain" contract. - MetricsTable is now fully synchronous (drops the in-flight task set, drain_tasks, and in_flight_tasks_count). - CORES_PER_WORKER is a module constant; removes the metrics_tokenizer_workers config knob (schema/execute/CLI) and regenerates the YAML templates. Validated: 234 unit + 3 integration tests pass. Offline-burst e2e (echo server, streaming, real tokenizer) shows a 3000-tokenization backlog at ENDED drained to n_pending_tasks=0 with the final report state=complete. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- .../services/metrics_aggregator/__main__.py | 33 +- .../services/metrics_aggregator/aggregator.py | 56 +- .../metrics_aggregator/metrics_table.py | 172 ++---- .../services/metrics_aggregator/publisher.py | 19 +- .../metrics_aggregator/token_metrics.py | 521 +++++++++++++----- .../commands/benchmark/execute.py | 6 - src/inference_endpoint/config/schema.py | 15 - .../templates/concurrency_template_full.yaml | 1 - .../templates/offline_template_full.yaml | 1 - .../templates/online_template_full.yaml | 1 - .../services/metrics_aggregator/conftest.py | 27 +- .../metrics_aggregator/test_aggregator.py | 112 ++-- .../test_main_signal_handler.py | 8 +- .../metrics_aggregator/test_metrics_table.py | 47 +- .../metrics_aggregator/test_token_metrics.py | 174 ++++-- tests/unit/commands/test_benchmark.py | 62 --- 16 files changed, 716 insertions(+), 539 deletions(-) diff --git a/src/inference_endpoint/async_utils/services/metrics_aggregator/__main__.py b/src/inference_endpoint/async_utils/services/metrics_aggregator/__main__.py index 2231d6dc8..9cd1c7e5e 100644 --- a/src/inference_endpoint/async_utils/services/metrics_aggregator/__main__.py +++ b/src/inference_endpoint/async_utils/services/metrics_aggregator/__main__.py @@ -33,7 +33,7 @@ from .publisher import MetricsPublisher from .registry import MetricsRegistry from .snapshot import MetricsSnapshotCodec -from .token_metrics import TokenizePool +from .token_metrics import BatchTokenizer, TokenBatchQueue logger = logging.getLogger(__name__) @@ -44,6 +44,7 @@ def _make_sigterm_handler( registry: MetricsRegistry, publisher: MetricsPublisher, table: MetricsTable, + token_queue: TokenBatchQueue | None, shutdown_event: asyncio.Event, ) -> tuple[Callable[[], None], set[asyncio.Task]]: """Build the SIGTERM handler that writes the INTERRUPTED final snapshot. 
@@ -75,7 +76,7 @@ async def _signal_finalize() -> None: ) await publisher.publish_final( registry, - n_pending_tasks=table.in_flight_tasks_count, + n_pending_tasks=token_queue.pending if token_queue is not None else 0, interrupted=True, ) except Exception: # noqa: BLE001 — best-effort. @@ -134,11 +135,10 @@ async def main() -> None: type=float, default=60.0, help=( - "Wall-clock budget (seconds) to wait for in-flight async tokenize " - "tasks to finish after ENDED before the aggregator cancels them " - "and emits the final snapshot with n_pending_tasks > 0 " - "(default: 60.0; 0 = wait indefinitely). Increase for long-context " - "/ low-worker-count tokenize workloads." + "Wall-clock budget (seconds) to finish tokenizing buffered samples " + "after ENDED before the aggregator emits the final snapshot with " + "n_pending_tasks > 0 (default: 60.0; 0 = wait indefinitely). Increase " + "for very large datasets where the end-of-run tokenize batch is big." ), ) parser.add_argument( @@ -159,12 +159,6 @@ async def main() -> None: default=None, help="HuggingFace tokenizer name for ISL/OSL/TPOT (e.g. 'gpt2'). 
If not set, token metrics are disabled.", ) - parser.add_argument( - "--tokenizer-workers", - type=int, - default=2, - help="Number of tokenizer worker threads (default: 2)", - ) parser.add_argument( "--streaming", action="store_true", @@ -204,15 +198,15 @@ async def main() -> None: loop = LoopManager().default_loop # Using ternary operator causes errors in MyPy object type coalescing - # (coalesces to 'object' not 'AbstractContextManager[TokenizePool | None]') - pool_cm: AbstractContextManager[TokenizePool | None] + # (coalesces to 'object' not 'AbstractContextManager[BatchTokenizer | None]') + tokenizer_cm: AbstractContextManager[BatchTokenizer | None] if args.tokenizer: - pool_cm = TokenizePool(args.tokenizer, n_workers=args.tokenizer_workers) + tokenizer_cm = BatchTokenizer(args.tokenizer) else: - pool_cm = nullcontext() + tokenizer_cm = nullcontext() with ( - pool_cm as pool, + tokenizer_cm as tokenizer, ManagedZMQContext.scoped(socket_dir=args.socket_dir) as zmq_ctx, ): registry = MetricsRegistry() @@ -234,7 +228,7 @@ async def main() -> None: publish_interval_s=args.publish_interval, sig_figs=args.hdr_sig_figs, n_histogram_buckets=args.n_histogram_buckets, - tokenize_pool=pool, + tokenizer=tokenizer, streaming=args.streaming, shutdown_event=shutdown_event, drain_timeout_s=None if args.drain_timeout == 0 else args.drain_timeout, @@ -269,6 +263,7 @@ async def main() -> None: registry=registry, publisher=publisher, table=aggregator._table, + token_queue=aggregator._token_queue, shutdown_event=shutdown_event, ) loop.add_signal_handler(signal.SIGTERM, on_sigterm) diff --git a/src/inference_endpoint/async_utils/services/metrics_aggregator/aggregator.py b/src/inference_endpoint/async_utils/services/metrics_aggregator/aggregator.py index f01c9753c..ed5ace0a0 100644 --- a/src/inference_endpoint/async_utils/services/metrics_aggregator/aggregator.py +++ b/src/inference_endpoint/async_utils/services/metrics_aggregator/aggregator.py @@ -47,7 +47,7 @@ from .publisher 
import MetricsPublisher from .registry import MetricsRegistry from .snapshot import SessionState -from .token_metrics import TokenizePool +from .token_metrics import BatchTokenizer, TokenBatchQueue logger = logging.getLogger(__name__) @@ -117,7 +117,7 @@ def __init__( publish_interval_s: float, sig_figs: int, n_histogram_buckets: int, - tokenize_pool: TokenizePool | None = None, + tokenizer: BatchTokenizer | None = None, streaming: bool = False, shutdown_event: asyncio.Event | None = None, drain_timeout_s: float | None = _DEFAULT_DRAIN_TIMEOUT_S, @@ -133,7 +133,12 @@ def __init__( self._registry = registry self._publisher = publisher self._publish_interval_s = publish_interval_s - self._tokenize_pool = tokenize_pool + # Token triggers enqueue onto this queue; it is flushed in batches at + # each publish tick and at end-of-run. None when no tokenizer is set + # (token metrics disabled), in which case those triggers are no-ops. + self._token_queue: TokenBatchQueue | None = ( + TokenBatchQueue(tokenizer, self.loop) if tokenizer is not None else None + ) self._streaming = streaming self._shutdown_event = shutdown_event self._shutdown_received = False @@ -223,21 +228,23 @@ def _register_triggers(self, streaming: bool) -> None: """ table = self._table registry = self._registry - pool = self._tokenize_pool - loop = self.loop + queue = self._token_queue # Always registered - table.add_trigger(SampleField.ISSUED_NS, IslTrigger(registry, pool, loop)) + table.add_trigger(SampleField.ISSUED_NS, IslTrigger(registry, queue)) table.add_trigger(SampleField.COMPLETE_NS, SampleLatencyTrigger(registry)) - table.add_trigger(SampleField.COMPLETE_NS, OslTrigger(registry, pool, loop)) + table.add_trigger(SampleField.COMPLETE_NS, OslTrigger(registry, queue)) # Streaming-only if streaming: table.add_trigger(SampleField.RECV_FIRST_NS, TtftTrigger(registry)) table.add_trigger(SampleField.LAST_RECV_NS, ChunkDeltaTrigger(registry)) - table.add_trigger( - SampleField.COMPLETE_NS, 
TpotTrigger(registry, pool, loop) - ) + table.add_trigger(SampleField.COMPLETE_NS, TpotTrigger(registry, queue)) + + async def _flush_tokens(self) -> None: + """Flush buffered tokenizations so the next snapshot reflects them.""" + if self._token_queue is not None: + await self._token_queue.flush() # ------------------------------------------------------------------ # Event processing @@ -311,8 +318,11 @@ async def process(self, records: list[EventRecord]) -> None: self._publish_interval_s, get_runtime_state=lambda: ( self._session_state, - table.in_flight_tasks_count, + self._token_queue.pending + if self._token_queue is not None + else 0, ), + pre_publish=self._flush_tokens, ) table.handle_session_event(record) if ev == SessionEventType.STOP_PERFORMANCE_TRACKING: @@ -367,12 +377,18 @@ async def process(self, records: list[EventRecord]) -> None: # ENDED has been observed; transition to DRAINING so any tick # that fires before publish_final reflects the new state. self._session_state = SessionState.DRAINING - logger.info("Draining %d async tasks...", table.in_flight_tasks_count) - # drain_tasks owns the timeout + cancel-and-await sequence so - # the pending count is captured BEFORE done-callbacks empty - # the in-flight set. Reading in_flight_tasks_count out here - # would always be 0 (see drain_tasks docstring). - n_pending = await table.drain_tasks(timeout=self._drain_timeout_s) + queue = self._token_queue + pending = queue.pending if queue is not None else 0 + logger.info("Draining %d pending tokenizations...", pending) + # flush_remaining tokenizes the whole buffer in one batched pass, + # bounded by the drain budget; it returns the count it could not + # finish (non-zero only on a timeout), which becomes the snapshot's + # n_pending_tasks so Report can flag an incomplete drain. 
+ n_pending = ( + await queue.flush_remaining(self._drain_timeout_s) + if queue is not None + else 0 + ) if n_pending > 0: timeout_str = ( f"{self._drain_timeout_s:.1f}s" @@ -380,13 +396,13 @@ async def process(self, records: list[EventRecord]) -> None: else "unlimited" ) logger.warning( - "drain_tasks timed out after %s; %d async tasks " - "did not complete and were cancelled", + "tokenizer drain timed out after %s; %d tokenizations " + "did not complete", timeout_str, n_pending, ) logger.info( - "Async tasks drained (n_pending_tasks=%d at finalize)", n_pending + "Tokenizations drained (n_pending_tasks=%d at finalize)", n_pending ) registry.set_counter( MetricCounterKey.TRACKED_DURATION_NS.value, diff --git a/src/inference_endpoint/async_utils/services/metrics_aggregator/metrics_table.py b/src/inference_endpoint/async_utils/services/metrics_aggregator/metrics_table.py index 46a17e92f..f67c859e6 100644 --- a/src/inference_endpoint/async_utils/services/metrics_aggregator/metrics_table.py +++ b/src/inference_endpoint/async_utils/services/metrics_aggregator/metrics_table.py @@ -17,9 +17,9 @@ from __future__ import annotations -import asyncio import logging from abc import ABC, abstractmethod +from collections.abc import Callable from dataclasses import dataclass from enum import Enum from typing import TYPE_CHECKING, Any @@ -33,7 +33,8 @@ MetricsRegistry, ) from inference_endpoint.async_utils.services.metrics_aggregator.token_metrics import ( - TokenizePool, + MessageParts, + TokenBatchQueue, ) from inference_endpoint.core.record import EventRecord @@ -146,8 +147,13 @@ def fire( ev_rec: EventRecord, row: SampleRow, pre_change: dict[str, Any], - ) -> asyncio.Task | None: - """Must be non-blocking. Return a Task if async work was scheduled.""" + ) -> None: + """Must be non-blocking. + + Sync triggers record into the registry directly. 
Token triggers + enqueue onto the shared ``TokenBatchQueue`` for batched tokenization + at the next flush; neither path schedules per-event tasks. + """ raise NotImplementedError() @@ -177,28 +183,27 @@ def fire(self, ev_rec, row, pre_change): class AsyncTokenTrigger(EmitTrigger): - """Base for triggers that need async tokenization. - - Subclasses implement ``_extract_text()`` to pull the text to tokenize - from the event record. If text is returned, an async task is created - to tokenize and emit. Subclasses can also override ``_extract_message()`` - to return (content, reasoning, tool_calls) for chat-template–aware tokenization - when tool calls are present. Subclasses can override ``_compute_value()`` to - transform the token count before storing. + """Base for triggers whose metric needs tokenization. + + Subclasses implement ``_extract_text()`` to pull the text to tokenize from + the event record, and may override ``_extract_message()`` to return + (content, reasoning, tool_calls) for chat-template–aware tokenization when + tool calls are present. ``fire()`` does not tokenize inline — it enqueues + the work plus a recorder callback onto the shared ``TokenBatchQueue``, which + the aggregator flushes in batches. ``_compute_value()`` can transform the + token count before it is recorded. """ def __init__( self, metric_name: str, registry: MetricsRegistry, - tokenize_pool: TokenizePool | None, - loop: asyncio.AbstractEventLoop | None, + queue: TokenBatchQueue | None, requires: tuple[str, ...] = (), dtype: type = int, ): super().__init__(metric_name, registry, requires=requires, dtype=dtype) - self._pool = tokenize_pool - self._loop = loop + self._queue = queue @abstractmethod def _extract_text( @@ -209,11 +214,11 @@ def _extract_text( def _extract_message( self, ev_rec: EventRecord, row: SampleRow, pre_change: dict[str, Any] - ) -> tuple[str, str | None, tuple[dict[str, Any], ...] 
| None] | None: - """Return (content, reasoning, tool_calls) for message-aware tokenization, or None. + ) -> MessageParts | None: + """Return (content, reasoning, tool_calls) for message-aware tokenization. - When non-None is returned, ``token_count_message_async`` is used instead of - ``token_count_async``. Default returns None (use text path). + When non-None, the message (chat-template) path is used instead of the + plain-text path. Default returns None (use text path). """ return None @@ -223,48 +228,32 @@ def _compute_value( """Transform token count into the metric value. Default: count as-is.""" return token_count - def fire(self, ev_rec, row, pre_change): - if self._pool is None or self._loop is None: - return None + def _make_recorder( + self, ev_rec: EventRecord, pre_change: dict[str, Any] + ) -> Callable[[int], None]: + """Build the callback the queue runs once the token count is known.""" + registry, name = self.registry, self.metric_name - message_parts = self._extract_message(ev_rec, row, pre_change) - if message_parts is not None: - content, reasoning, tool_calls = message_parts - pool, loop = self._pool, self._loop - registry, name = self.registry, self.metric_name - uuid = row.sample_uuid - - async def _tokenize_message_and_emit() -> None: - try: - count = await pool.token_count_message_async( - content, reasoning, tool_calls, loop - ) - value = self._compute_value(count, ev_rec, pre_change) - if value is not None: - registry.record(name, value) - except Exception: - logger.exception("%s tokenization failed for %s", name, uuid) + def record(count: int) -> None: + value = self._compute_value(count, ev_rec, pre_change) + if value is not None: + registry.record(name, value) - return loop.create_task(_tokenize_message_and_emit()) + return record + def fire(self, ev_rec, row, pre_change): + if self._queue is None: + return + message_parts = self._extract_message(ev_rec, row, pre_change) + if message_parts is not None: + self._queue.enqueue_message( + 
message_parts, self._make_recorder(ev_rec, pre_change) + ) + return text = self._extract_text(ev_rec, row, pre_change) if not text: - return None - - pool, loop = self._pool, self._loop - registry, name = self.registry, self.metric_name - uuid = row.sample_uuid - - async def _tokenize_and_emit() -> None: - try: - count = await pool.token_count_async(text, loop) - value = self._compute_value(count, ev_rec, pre_change) - if value is not None: - registry.record(name, value) - except Exception: - logger.exception("%s tokenization failed for %s", name, uuid) - - return loop.create_task(_tokenize_and_emit()) + return + self._queue.enqueue_text(text, self._make_recorder(ev_rec, pre_change)) # --------------------------------------------------------------------------- @@ -319,19 +308,18 @@ class IslTrigger(AsyncTokenTrigger): def __init__( self, registry: MetricsRegistry, - tokenize_pool: TokenizePool | None, - loop: asyncio.AbstractEventLoop | None, + queue: TokenBatchQueue | None, ): - super().__init__(MetricSeriesKey.ISL, registry, tokenize_pool, loop) + super().__init__(MetricSeriesKey.ISL, registry, queue) def fire(self, ev_rec, row, pre_change): # Sync fast path: any backend that pre-populates token_ids (e.g. SGLang). if isinstance(ev_rec.data, PromptData) and ev_rec.data.token_ids is not None: self.registry.record(self.metric_name, len(ev_rec.data.token_ids)) - return None - # Async path: tokenize raw text — used when token_ids are unavailable - # (e.g. OpenAI-compatible endpoints). Handled by the base class. - return super().fire(ev_rec, row, pre_change) + return + # Text path: tokenize raw prompt text — used when token_ids are + # unavailable (e.g. OpenAI-compatible endpoints). Enqueued by the base. 
+ super().fire(ev_rec, row, pre_change) def _extract_text(self, ev_rec, row, pre_change): if isinstance(ev_rec.data, PromptData) and ev_rec.data.text is not None: @@ -345,10 +333,9 @@ class OslTrigger(AsyncTokenTrigger): def __init__( self, registry: MetricsRegistry, - tokenize_pool: TokenizePool | None, - loop: asyncio.AbstractEventLoop | None, + queue: TokenBatchQueue | None, ): - super().__init__(MetricSeriesKey.OSL, registry, tokenize_pool, loop) + super().__init__(MetricSeriesKey.OSL, registry, queue) def _extract_text(self, ev_rec, row, pre_change): if isinstance(ev_rec.data, TextModelOutput): @@ -383,14 +370,12 @@ class TpotTrigger(AsyncTokenTrigger): def __init__( self, registry: MetricsRegistry, - tokenize_pool: TokenizePool | None, - loop: asyncio.AbstractEventLoop | None, + queue: TokenBatchQueue | None, ): super().__init__( MetricSeriesKey.TPOT_NS, registry, - tokenize_pool, - loop, + queue, requires=(SampleField.RECV_FIRST_NS,), dtype=float, ) @@ -444,7 +429,6 @@ def __init__(self, registry: MetricsRegistry) -> None: self._registry = registry self._in_flight: dict[str, SampleRow] = {} self._triggers: dict[str, list[EmitTrigger]] = {} - self._in_flight_tasks: set[asyncio.Task] = set() # Session-level state self.is_tracking: bool = False @@ -538,45 +522,6 @@ def set_field( self._update_tracked_block(row, ev_rec.timestamp_ns) self._in_flight.pop(sample_uuid, None) - # --- Task draining --- - - @property - def in_flight_tasks_count(self) -> int: - """Number of async trigger tasks currently in flight.""" - return len(self._in_flight_tasks) - - async def drain_tasks(self, *, timeout: float | None = None) -> int: - """Await in-flight async trigger tasks. - - With ``timeout``, the pending set at the timeout boundary is - cancelled and awaited; the count of those pending tasks is - returned (>0 indicates the drain timed out). Without - ``timeout``, blocks indefinitely and returns 0 on clean drain. 
- - The pending count must be captured BEFORE the cancel-and-await - step: each task's ``add_done_callback(_in_flight_tasks.discard)`` - empties ``_in_flight_tasks`` as cancellation propagates, so - reading ``in_flight_tasks_count`` after this method returns - would always be 0 — making a drain timeout indistinguishable - from a clean run. - """ - if not self._in_flight_tasks: - return 0 - if timeout is None: - await asyncio.gather(*self._in_flight_tasks, return_exceptions=True) - self._in_flight_tasks.clear() - return 0 - _, still_pending = await asyncio.wait( - list(self._in_flight_tasks), timeout=timeout - ) - n_pending = len(still_pending) - if still_pending: - for t in still_pending: - t.cancel() - await asyncio.gather(*still_pending, return_exceptions=True) - self._in_flight_tasks.clear() - return n_pending - # --- Internal --- def _create_row(self, sample_uuid: str) -> SampleRow: @@ -595,10 +540,7 @@ def _fire_triggers( ) -> None: for trigger in self._triggers.get(field_name, ()): pre_change = {attr: getattr(row, attr) for attr in trigger.requires} - task = trigger.fire(ev_rec, row, pre_change) - if task is not None: - self._in_flight_tasks.add(task) - task.add_done_callback(self._in_flight_tasks.discard) + trigger.fire(ev_rec, row, pre_change) def _update_tracked_block(self, row: SampleRow, complete_ns: int) -> None: """Extend the sample's tracked block duration and increment count.""" diff --git a/src/inference_endpoint/async_utils/services/metrics_aggregator/publisher.py b/src/inference_endpoint/async_utils/services/metrics_aggregator/publisher.py index d21973a3f..b58aa05ff 100644 --- a/src/inference_endpoint/async_utils/services/metrics_aggregator/publisher.py +++ b/src/inference_endpoint/async_utils/services/metrics_aggregator/publisher.py @@ -21,7 +21,7 @@ import json import logging import os -from collections.abc import Callable +from collections.abc import Awaitable, Callable from pathlib import Path from 
inference_endpoint.async_utils.services.metrics_aggregator.registry import ( @@ -102,15 +102,22 @@ def start( registry: MetricsRegistry, publish_interval_s: float, get_runtime_state: Callable[[], tuple[SessionState, int]], + pre_publish: Callable[[], Awaitable[None]] | None = None, ) -> None: """Begin publishing live ticks every ``publish_interval_s`` seconds. ``get_runtime_state`` returns ``(state, n_pending_tasks)`` for the current moment: the aggregator's session state (``LIVE`` or - ``DRAINING``) and the count of in-flight async tokenize tasks. The - callable is invoked once per tick and the values are plumbed into - the published snapshot. ``COMPLETE`` is emitted only by - ``publish_final``, never by the tick task. + ``DRAINING``) and the count of pending tokenizations. The callable is + invoked once per tick and the values are plumbed into the published + snapshot. ``COMPLETE`` is emitted only by ``publish_final``, never by + the tick task. + + ``pre_publish``, if given, is awaited at the top of each tick before + the snapshot is built — the aggregator uses it to flush buffered + tokenizations so live ISL/OSL/TPOT reflect recently completed samples. + Its failures are swallowed by the tick's own try/except (the tick keeps + going), so a transient tokenizer hiccup never stops live publishing. Idempotent on the tick-task slot: a second call (e.g. 
from a spurious duplicate ``STARTED`` event or a buggy replay producer) @@ -133,6 +140,8 @@ async def _tick() -> None: while True: try: await asyncio.sleep(publish_interval_s) + if pre_publish is not None: + await pre_publish() state, n_pending = get_runtime_state() snap = registry.build_snapshot( state=state, n_pending_tasks=n_pending diff --git a/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py b/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py index 3411d5061..57c9704d4 100644 --- a/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py +++ b/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py @@ -13,25 +13,46 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""Tokenization utilities for metrics aggregation.""" +"""Tokenization for ISL/OSL/TPOT metrics. + +``BatchTokenizer`` tokenizes whole batches of text at once. A single BPE rayon +pool saturates ~8 CPU cores (memory-bound), so to use the whole machine it +shards each batch across worker *processes*, one pinned to each block of +``CORES_PER_WORKER`` cores (their rayon pools stay NUMA-local). The aggregator +buffers per-sample text as COMPLETE events arrive and calls ``count_texts`` once +per flush (publish tick + drain) — so batching, not a per-request coalescer, +keeps tokenization ahead of completions. Falls back to a single in-process +thread when there is no fast Rust backend or fewer than two core blocks fit. 
+""" from __future__ import annotations import asyncio import json import logging -import threading -from concurrent.futures import ThreadPoolExecutor -from typing import TYPE_CHECKING, Any +import multiprocessing +import os +from collections.abc import Callable +from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor +from typing import TYPE_CHECKING, Any, Protocol, cast import msgspec from transformers import AutoTokenizer +from transformers.utils import logging as transformers_logging + +# A single rayon pool peaks at ~8 cores for BPE (memory-bound; more threads +# oversubscribe and, on multi-socket Grace, cross the NUMA boundary). Sharding +# across processes pinned to disjoint 8-core blocks is how the whole machine is +# used. Measured on GB200: ~16k texts/s at 18 blocks vs ~1.5k single-process. +CORES_PER_WORKER = 8 # Minimal user message used to satisfy chat templates that reject assistant-only # message lists. Its token count is subtracted so only the assistant payload is # measured. _PREFIX_USER_MSG: dict[str, str] = {"role": "user", "content": ""} +logger = logging.getLogger(__name__) + def _normalize_tool_calls_for_template( tool_calls: tuple[dict[str, Any], ...] | list[dict[str, Any]], @@ -60,140 +81,252 @@ def _normalize_tool_calls_for_template( return normalized +# --------------------------------------------------------------------------- +# Process-worker entry points (module-level so ProcessPoolExecutor can pickle +# them by name). Each worker holds one raw tokenizers backend, pinned to a +# fixed core block. +# --------------------------------------------------------------------------- + +_WORKER_BACKEND: Any = None + + +def _init_worker(tokenizer_name: str, core_set: list[int]) -> None: + """Pin this worker to ``core_set``, then load the raw tokenizers backend. + + Affinity is set before the first encode so the Rust rayon pool sizes itself + to the pinned core count (num_cpus respects sched_getaffinity on Linux). 
+ """ + if core_set: + try: + os.sched_setaffinity(0, set(core_set)) + except (OSError, AttributeError): + logger.debug("could not pin tokenizer worker to %s", core_set) + transformers_logging.set_verbosity_error() + tok = AutoTokenizer.from_pretrained(tokenizer_name, trust_remote_code=True) + global _WORKER_BACKEND + _WORKER_BACKEND = getattr(tok, "backend_tokenizer", None) + if _WORKER_BACKEND is not None: + _WORKER_BACKEND.encode("warmup", add_special_tokens=False) + + +def _worker_encode_lengths(texts: list[str]) -> list[int]: + """Per-text token counts for a shard, in one rayon-parallel call.""" + backend = _WORKER_BACKEND + if backend is None: + raise RuntimeError("tokenizer worker backend unavailable") + encode_batch = getattr(backend, "encode_batch_fast", None) or backend.encode_batch + return [len(e.ids) for e in encode_batch(texts, add_special_tokens=False)] + + +def _worker_ready(_: int) -> bool: + """Warmup probe: returns once the worker's backend is loaded.""" + return _WORKER_BACKEND is not None + + if TYPE_CHECKING: from transformers import PreTrainedTokenizerBase -logger = logging.getLogger(__name__) +def _even_chunks(items: list[str], n: int) -> list[list[str]]: + """Split ``items`` into at most ``n`` near-equal contiguous chunks.""" + if n <= 1 or len(items) <= 1: + return [items] + size = (len(items) + n - 1) // n + return [items[i : i + size] for i in range(0, len(items), size)] -class TokenizePool: - """A pool of worker threads, each with its own HuggingFace AutoTokenizer. - Uses multi-threading (not multiprocessing) because HuggingFace tokenizers - use a Rust backend that releases the GIL during tokenization, so threads - can run tokenization in parallel without GIL contention. Multiprocessing - would add process spawn overhead and per-process tokenizer memory and - IPC latency. +class BatchTokenizer: + """Counts tokens for batches of text, sharded across pinned CPU cores. 
- Thread-safety notes: - - The ThreadPoolExecutor itself is thread-safe (submit/shutdown are synchronized). - - Each worker thread has its own tokenizer via thread-local storage, so there - is no shared mutable state during tokenization. - - The blocking `token_count()` method is safe to call from multiple threads - concurrently. - - In an async context, use `token_count_async` to avoid blocking the event loop. + ``count_texts`` / ``count_texts_async`` tokenize a whole list in one shot. + The sync ``token_count`` and chat-template ``token_count_message`` paths run + on a small in-process thread pool — they are rare (single ISL probes, tool + calls) relative to the batched OSL/ISL/TPOT flush. """ - def __init__(self, tokenizer_name: str, n_workers: int) -> None: - if n_workers < 1: - raise ValueError("n_workers must be at least 1") + def __init__( + self, + tokenizer_name: str, + *, + cores_per_worker: int = CORES_PER_WORKER, + ) -> None: self._tokenizer_name = tokenizer_name - self._n_workers = n_workers - self._thread_local = threading.local() self._fallback_warned: set[str] = set() - self._executor: ThreadPoolExecutor | None = ThreadPoolExecutor( - max_workers=n_workers, - thread_name_prefix="TokenizePool", + self._tokenizer: PreTrainedTokenizerBase | None = None + self._prefix_len = 0 + self._baseline = 0 + # In-process thread for the sync + chat-template paths. + self._thread: ThreadPoolExecutor | None = ThreadPoolExecutor( + max_workers=1, thread_name_prefix="tok-thread" ) - # Pre-load a tokenizer on every worker thread so the first real - # token_count call doesn't pay the AutoTokenizer.from_pretrained cost. - # Submitting n_workers tasks is guaranteed to hit every thread because - # AutoTokenizer.from_pretrained blocks long enough that no thread - # completes before all tasks are submitted. 
- # **IMPORTANT**: This is not a guarantee - for instance when using a mock - # object in tests for the tokenizer, the mock object *must* block in the 100ms - # range to simulate proper .from_pretrained behavior. - # It is not super impactful if a thread is not pre-initialized - it will just - # have to pay the cost of .from_pretrained on the first pool.token_count call - # for that thread. - futures = [ - self._executor.submit(self._get_thread_tokenizer) for _ in range(n_workers) - ] + self._load_tokenizer() # also computes the chat-template baseline + # Process shards for the batched text path (or empty -> in-process). + self._procs: list[ProcessPoolExecutor] = [] + self._setup_shards(cores_per_worker) + + # -- setup -------------------------------------------------------------- + + def _load_tokenizer(self) -> None: + tok = AutoTokenizer.from_pretrained( + self._tokenizer_name, trust_remote_code=True + ) + self._tokenizer = tok + # Baseline = tokens from a [user, empty-assistant] pair minus the [user] + # prefix alone, so the assistant frame is subtracted from message counts. try: - for f in futures: - f.result() - except Exception: - self._executor.shutdown(wait=False) - self._executor = None - raise - - def _get_thread_tokenizer(self) -> PreTrainedTokenizerBase: - """Return the tokenizer for the current thread, loading it if needed.""" - if getattr(self._thread_local, "tokenizer", None) is None: - self._thread_local.tokenizer = AutoTokenizer.from_pretrained( - self._tokenizer_name, trust_remote_code=True + prefix = cast( + str, + tok.apply_chat_template( + [_PREFIX_USER_MSG], tokenize=False, add_generation_prompt=False + ), ) - # Baseline = tokens contributed by a [user, empty-assistant] pair minus - # the [user] prefix alone. Some templates (Qwen3-Coder, etc.) reject - # assistant-only message lists, so a user prefix is required; we - # subtract it out so the baseline reflects only the assistant frame. 
- try: - tok = self._thread_local.tokenizer - prefix_rendered = tok.apply_chat_template( - [_PREFIX_USER_MSG], - tokenize=False, - add_generation_prompt=False, - ) - prefix_len = len(tok.tokenize(prefix_rendered)) - with_empty_assistant_rendered = tok.apply_chat_template( + self._prefix_len = len(tok.tokenize(prefix)) + with_assistant = cast( + str, + tok.apply_chat_template( [_PREFIX_USER_MSG, {"role": "assistant", "content": ""}], tokenize=False, add_generation_prompt=False, + ), + ) + self._baseline = len(tok.tokenize(with_assistant)) - self._prefix_len + except Exception: + self._prefix_len = 0 + self._baseline = 0 + logger.exception( + "Failed to compute chat-template baseline for %s; tool-call " + "token counts may be over-estimated", + self._tokenizer_name, + ) + + def _setup_shards(self, cores_per_worker: int) -> None: + """Spawn one pinned single-worker process per core block. + + No-op (leaving the batch path in-process) when the tokenizer has no fast + Rust backend, affinity is unavailable, or fewer than two blocks fit — a + single shard is no faster than the in-process backend. 
+ """ + if cores_per_worker <= 0: + return + if getattr(self._tokenizer, "backend_tokenizer", None) is None: + return + try: + available = sorted(os.sched_getaffinity(0)) + except (OSError, AttributeError): + return + n = len(available) // cores_per_worker + if n < 2: + return + ctx = multiprocessing.get_context("spawn") + procs: list[ProcessPoolExecutor] = [] + try: + for i in range(n): + block = available[i * cores_per_worker : (i + 1) * cores_per_worker] + ex = ProcessPoolExecutor( + max_workers=1, + mp_context=ctx, + initializer=_init_worker, + initargs=(self._tokenizer_name, block), ) - with_empty_assistant_len = len( - tok.tokenize(with_empty_assistant_rendered) - ) - self._thread_local.prefix_len = prefix_len - self._thread_local.baseline = with_empty_assistant_len - prefix_len - except Exception: - self._thread_local.prefix_len = 0 - self._thread_local.baseline = 0 - logger.exception( - "Failed to compute chat-template baseline for %s; tool-call token counts may be over-estimated", - self._tokenizer_name, - ) - return self._thread_local.tokenizer + procs.append(ex) + # Force spawn + pin + tokenizer-load now (not on the first batch). + # Submit to every shard first so the loads run in parallel, then + # await — waiting on each before submitting the next would + # serialize P tokenizer loads and can exceed the launch budget. 
+ ready = [ex.submit(_worker_ready, 0) for ex in procs] + for f in ready: + f.result() + except Exception: + for ex in procs: + ex.shutdown(wait=False) + logger.exception( + "tokenizer shard setup failed; using in-process tokenization" + ) + return + self._procs = procs + logger.info( + "BatchTokenizer: %d shards x %d cores", len(procs), cores_per_worker + ) + + # -- batched text path -------------------------------------------------- - def _token_count_worker(self, text: str) -> int: - """Worker entry: return the number of tokens in text.""" - tokenizer = self._get_thread_tokenizer() - return len(tokenizer.tokenize(text)) + def _encode_lengths_inproc(self, texts: list[str]) -> list[int]: + tok = self._tokenizer + backend = getattr(tok, "backend_tokenizer", None) + if backend is not None: + encode_batch = getattr(backend, "encode_batch_fast", None) + if encode_batch is None: + encode_batch = backend.encode_batch + return [len(e.ids) for e in encode_batch(texts, add_special_tokens=False)] + return [len(tok.tokenize(t)) for t in texts] # type: ignore[union-attr] + + def count_texts(self, texts: list[str]) -> list[int]: + """Per-text token counts for a whole batch (blocking).""" + if not texts: + return [] + if not self._procs: + return self._encode_lengths_inproc(texts) + chunks = _even_chunks(texts, len(self._procs)) + futures = [ + self._procs[i].submit(_worker_encode_lengths, chunk) + for i, chunk in enumerate(chunks) + ] + out: list[int] = [] + for f in futures: + out.extend(f.result()) + return out - def _token_count_message_worker( + async def count_texts_async( + self, texts: list[str], loop: asyncio.AbstractEventLoop + ) -> list[int]: + """Per-text token counts for a whole batch without blocking the loop.""" + if not texts: + return [] + if not self._procs: + return await loop.run_in_executor( + self._thread, self._encode_lengths_inproc, texts + ) + chunks = _even_chunks(texts, len(self._procs)) + futures = [ + 
asyncio.wrap_future(self._procs[i].submit(_worker_encode_lengths, chunk)) + for i, chunk in enumerate(chunks) + ] + results = await asyncio.gather(*futures) + out: list[int] = [] + for r in results: + out.extend(r) + return out + + # -- sync + chat-template paths (in-process thread) --------------------- + + def _token_count_text(self, text: str) -> int: + return len(self._tokenizer.tokenize(text)) # type: ignore[union-attr] + + def _token_count_message( self, content: str, reasoning: str | None, tool_calls: tuple[dict[str, Any], ...] | None, ) -> int: - """Worker entry: tokenize a full assistant message using apply_chat_template. - - Falls back to whitespace-split tokenization if apply_chat_template raises - (e.g. the template does not support tool_calls or reasoning fields). - """ - tokenizer = self._get_thread_tokenizer() + tok = self._tokenizer msg: dict[str, Any] = {"role": "assistant", "content": content or ""} if reasoning: msg["reasoning_content"] = reasoning if tool_calls: msg["tool_calls"] = _normalize_tool_calls_for_template(tool_calls) try: - rendered = tokenizer.apply_chat_template( - [_PREFIX_USER_MSG, msg], - tokenize=False, - add_generation_prompt=False, + rendered = tok.apply_chat_template( # type: ignore[union-attr] + [_PREFIX_USER_MSG, msg], tokenize=False, add_generation_prompt=False ) - full = len(tokenizer.tokenize(rendered)) - prefix_len = getattr(self._thread_local, "prefix_len", 0) - baseline = getattr(self._thread_local, "baseline", 0) - return max(0, full - prefix_len - baseline) + full = len(tok.tokenize(rendered)) # type: ignore[union-attr] + return max(0, full - self._prefix_len - self._baseline) except Exception as exc: key = f"{self._tokenizer_name}:{type(exc).__name__}" if key not in self._fallback_warned: self._fallback_warned.add(key) logger.exception( "apply_chat_template failed for %s (%s); falling back to " - "whitespace tokenization. 
Tool-call OSL/TPOT may diverge " - "from server-side counts for this run.", + "whitespace tokenization. Tool-call OSL/TPOT may diverge.", self._tokenizer_name, type(exc).__name__, ) @@ -203,15 +336,13 @@ def _token_count_message_worker( parts = [ p for p in (content or None, reasoning or None, tool_calls_json) if p ] - fallback_text = "\n".join(parts) - return self._token_count_worker(fallback_text) + return self._token_count_text("\n".join(parts)) def token_count(self, text: str) -> int: - """Return the number of tokens in the input string (blocking).""" - if self._executor is None: - raise RuntimeError("TokenizePool is closed") - future = self._executor.submit(self._token_count_worker, text) - return future.result() + """Token count for a single string (blocking).""" + if self._thread is None: + raise RuntimeError("BatchTokenizer is closed") + return self._thread.submit(self._token_count_text, text).result() def token_count_message( self, @@ -219,27 +350,12 @@ def token_count_message( reasoning: str | None, tool_calls: tuple[dict[str, Any], ...] | None, ) -> int: - """Return the token count for an assistant message (blocking).""" - if self._executor is None: - raise RuntimeError("TokenizePool is closed") - future = self._executor.submit( - self._token_count_message_worker, content, reasoning, tool_calls - ) - return future.result() - - async def token_count_async( - self, text: str, loop: asyncio.AbstractEventLoop - ) -> int: - """Return the number of tokens without blocking the event loop. - - Submits directly to the TokenizePool's executor so tokenization runs - on a thread with a pre-loaded thread-local tokenizer instance. 
- """ - if self._executor is None: - raise RuntimeError("TokenizePool is closed") - return await loop.run_in_executor( - self._executor, self._token_count_worker, text - ) + """Token count for an assistant message via the chat template (blocking).""" + if self._thread is None: + raise RuntimeError("BatchTokenizer is closed") + return self._thread.submit( + self._token_count_message, content, reasoning, tool_calls + ).result() async def token_count_message_async( self, @@ -248,25 +364,148 @@ async def token_count_message_async( tool_calls: tuple[dict[str, Any], ...] | None, loop: asyncio.AbstractEventLoop, ) -> int: - """Return the token count for an assistant message without blocking the event loop.""" - if self._executor is None: - raise RuntimeError("TokenizePool is closed") + """Chat-template message token count without blocking the loop.""" + if self._thread is None: + raise RuntimeError("BatchTokenizer is closed") return await loop.run_in_executor( - self._executor, - self._token_count_message_worker, - content, - reasoning, - tool_calls, + self._thread, self._token_count_message, content, reasoning, tool_calls ) def close(self) -> None: - """Shut down the worker pool. Idempotent.""" - if self._executor is not None: - self._executor.shutdown(wait=True) - self._executor = None + """Shut down all workers. Idempotent.""" + for ex in self._procs: + ex.shutdown(wait=False) + self._procs = [] + if self._thread is not None: + self._thread.shutdown(wait=True) + self._thread = None - def __enter__(self) -> TokenizePool: + def __enter__(self) -> BatchTokenizer: return self def __exit__(self, exc_type: object, exc_val: object, exc_tb: object) -> None: self.close() + + +# Type alias for the (content, reasoning, tool_calls) tuple a message trigger +# enqueues for chat-template tokenization. +MessageParts = tuple[str, str | None, "tuple[dict[str, Any], ...] | None"] + + +class TokenCounter(Protocol): + """The async tokenization surface ``TokenBatchQueue`` depends on. 
+ + ``BatchTokenizer`` satisfies this structurally; tests pass lightweight + stubs. Declared as a Protocol so the queue is decoupled from the concrete + tokenizer and test doubles type-check without inheritance. + """ + + async def count_texts_async( + self, texts: list[str], loop: asyncio.AbstractEventLoop, / + ) -> list[int]: ... + + async def token_count_message_async( + self, + content: str, + reasoning: str | None, + tool_calls: tuple[dict[str, Any], ...] | None, + loop: asyncio.AbstractEventLoop, + /, + ) -> int: ... + + +class TokenBatchQueue: + """Buffers per-sample tokenization work and clears it in batches. + + Triggers call ``enqueue_text`` / ``enqueue_message`` at event time with a + ``on_count`` callback that records the resulting metric. The aggregator + drains the buffer with ``flush`` (once per publish tick, so live ISL/OSL/ + TPOT stay current) and with ``flush_remaining`` at end-of-run. Holding the + work until a flush lets the whole buffer go through ``BatchTokenizer`` in + one sharded call, instead of one event-loop task per completion — the latter + is what fell behind and stretched the drain on high-completion-rate runs. + + ``pending`` counts enqueued-but-not-yet-recorded items; it is the + ``n_pending_tasks`` surfaced on the snapshot, and a non-zero value in the + final snapshot means the end-of-run flush did not finish within the drain + budget. + """ + + def __init__( + self, tokenizer: TokenCounter, loop: asyncio.AbstractEventLoop + ) -> None: + self._tokenizer = tokenizer + self._loop = loop + self._text: list[tuple[str, Callable[[int], None]]] = [] + self._msg: list[tuple[MessageParts, Callable[[int], None]]] = [] + self._inflight = 0 + # Serializes flushes so a periodic tick flush and the end-of-run flush + # never record the same item twice or race on the pending count. 
+ self._lock = asyncio.Lock() + + @property + def pending(self) -> int: + """Enqueued items not yet tokenized-and-recorded.""" + return self._inflight + + def enqueue_text(self, text: str, on_count: Callable[[int], None]) -> None: + self._inflight += 1 + self._text.append((text, on_count)) + + def enqueue_message( + self, parts: MessageParts, on_count: Callable[[int], None] + ) -> None: + self._inflight += 1 + self._msg.append((parts, on_count)) + + async def flush(self) -> None: + """Tokenize everything buffered so far and run each ``on_count``. + + Items are detached from the buffer up front so concurrent enqueues land + in the next flush. ``_inflight`` is decremented only after a callback + runs, so a cancellation (drain timeout) leaves it reflecting exactly the + items that were not recorded. + """ + async with self._lock: + if not (self._text or self._msg): + return + text_items, self._text = self._text, [] + msg_items, self._msg = self._msg, [] + if text_items: + counts = await self._tokenizer.count_texts_async( + [t for t, _ in text_items], self._loop + ) + for (_, on_count), count in zip(text_items, counts, strict=True): + try: + on_count(count) + finally: + self._inflight -= 1 + for (content, reasoning, tool_calls), on_count in msg_items: + count = await self._tokenizer.token_count_message_async( + content, reasoning, tool_calls, self._loop + ) + try: + on_count(count) + finally: + self._inflight -= 1 + + async def flush_remaining(self, timeout: float | None) -> int: + """End-of-run flush, bounded by ``timeout`` seconds. + + Returns the number of items still un-tokenized — non-zero only if the + budget was exhausted (``timeout`` reached). ``None`` waits indefinitely. 
+ """ + if self._inflight == 0: + return 0 + try: + if timeout is None: + await self.flush() + else: + await asyncio.wait_for(self.flush(), timeout) + except TimeoutError: + logger.warning( + "tokenizer drain timed out after %.1fs; %d items not counted", + timeout, + self._inflight, + ) + return self._inflight diff --git a/src/inference_endpoint/commands/benchmark/execute.py b/src/inference_endpoint/commands/benchmark/execute.py index a2050bbe3..380a0b14d 100644 --- a/src/inference_endpoint/commands/benchmark/execute.py +++ b/src/inference_endpoint/commands/benchmark/execute.py @@ -612,12 +612,6 @@ async def _run_benchmark_async( aggregator_args.extend( ["--drain-timeout", str(config.settings.drain.metrics_drain_timeout_s)] ) - aggregator_args.extend( - [ - "--tokenizer-workers", - str(config.settings.drain.metrics_tokenizer_workers), - ] - ) # EventLoggerService writes events.jsonl to tmpfs (high-frequency writes) event_logger_args: list[str] = [ diff --git a/src/inference_endpoint/config/schema.py b/src/inference_endpoint/config/schema.py index 9226d7f85..722652e0d 100644 --- a/src/inference_endpoint/config/schema.py +++ b/src/inference_endpoint/config/schema.py @@ -592,21 +592,6 @@ class DrainConfig(BaseModel): "in-flight tokenize tasks after ENDED (default: 60.0; 0 = unlimited)." ), ) - metrics_tokenizer_workers: Annotated[ - int, - cyclopts.Parameter( - alias="--metrics-tokenizer-workers", - help=( - "Number of tokenizer worker threads in the metrics aggregator. " - "Increase if ISL/OSL/TPOT tokenization can't keep up with request " - "throughput (symptoms: large drain timeout warning at run end)." 
- ), - ), - ] = Field( - 2, - ge=1, - description="Number of tokenizer worker threads in the metrics aggregator (default: 2).", - ) @cyclopts.Parameter(name="*") diff --git a/src/inference_endpoint/config/templates/concurrency_template_full.yaml b/src/inference_endpoint/config/templates/concurrency_template_full.yaml index 38829f0f5..693765e57 100644 --- a/src/inference_endpoint/config/templates/concurrency_template_full.yaml +++ b/src/inference_endpoint/config/templates/concurrency_template_full.yaml @@ -80,7 +80,6 @@ settings: performance_timeout_s: 240.0 # Performance drain timeout in seconds (None = wait indefinitely) accuracy_timeout_s: null # Accuracy drain timeout in seconds (None = wait indefinitely) metrics_drain_timeout_s: 60.0 # Wall-clock budget (seconds) for the metrics aggregator to drain in-flight tokenize tasks after ENDED (default: 60.0; 0 = unlimited). - metrics_tokenizer_workers: 2 # Number of tokenizer worker threads in the metrics aggregator (default: 2). warmup: enabled: false # Enable warmup phase before performance run n_requests: null # Warmup request count (None = full dataset once) diff --git a/src/inference_endpoint/config/templates/offline_template_full.yaml b/src/inference_endpoint/config/templates/offline_template_full.yaml index c3454d5da..64439452f 100644 --- a/src/inference_endpoint/config/templates/offline_template_full.yaml +++ b/src/inference_endpoint/config/templates/offline_template_full.yaml @@ -80,7 +80,6 @@ settings: performance_timeout_s: 240.0 # Performance drain timeout in seconds (None = wait indefinitely) accuracy_timeout_s: null # Accuracy drain timeout in seconds (None = wait indefinitely) metrics_drain_timeout_s: 60.0 # Wall-clock budget (seconds) for the metrics aggregator to drain in-flight tokenize tasks after ENDED (default: 60.0; 0 = unlimited). - metrics_tokenizer_workers: 2 # Number of tokenizer worker threads in the metrics aggregator (default: 2). 
warmup: enabled: false # Enable warmup phase before performance run n_requests: null # Warmup request count (None = full dataset once) diff --git a/src/inference_endpoint/config/templates/online_template_full.yaml b/src/inference_endpoint/config/templates/online_template_full.yaml index 5bea95329..0c810f30b 100644 --- a/src/inference_endpoint/config/templates/online_template_full.yaml +++ b/src/inference_endpoint/config/templates/online_template_full.yaml @@ -80,7 +80,6 @@ settings: performance_timeout_s: 240.0 # Performance drain timeout in seconds (None = wait indefinitely) accuracy_timeout_s: null # Accuracy drain timeout in seconds (None = wait indefinitely) metrics_drain_timeout_s: 60.0 # Wall-clock budget (seconds) for the metrics aggregator to drain in-flight tokenize tasks after ENDED (default: 60.0; 0 = unlimited). - metrics_tokenizer_workers: 2 # Number of tokenizer worker threads in the metrics aggregator (default: 2). warmup: enabled: false # Enable warmup phase before performance run n_requests: null # Warmup request count (None = full dataset once) diff --git a/tests/unit/async_utils/services/metrics_aggregator/conftest.py b/tests/unit/async_utils/services/metrics_aggregator/conftest.py index 7adbe0361..b32811bcf 100644 --- a/tests/unit/async_utils/services/metrics_aggregator/conftest.py +++ b/tests/unit/async_utils/services/metrics_aggregator/conftest.py @@ -49,24 +49,26 @@ from inference_endpoint.core.types import TextModelOutput # --------------------------------------------------------------------------- -# Mock TokenizePool — used by tests that exercise async triggers directly. +# Mock BatchTokenizer — whitespace token counts; matches the BatchTokenizer +# surface the TokenBatchQueue calls (count_texts_async + message path). 
# --------------------------------------------------------------------------- -class MockTokenizePool: - """Mock TokenizePool that splits on whitespace with artificial async delay.""" +class MockBatchTokenizer: + """Mock BatchTokenizer that splits on whitespace with optional async delay.""" - def __init__(self, delay: float = 0.01) -> None: + def __init__(self, delay: float = 0.0) -> None: self._delay = delay def token_count(self, text: str) -> int: return len(text.split()) - async def token_count_async( - self, text: str, _loop: asyncio.AbstractEventLoop - ) -> int: - await asyncio.sleep(self._delay) - return len(text.split()) + async def count_texts_async( + self, texts: list[str], _loop: asyncio.AbstractEventLoop + ) -> list[int]: + if self._delay: + await asyncio.sleep(self._delay) + return [len(t.split()) for t in texts] async def token_count_message_async( self, @@ -77,7 +79,8 @@ async def token_count_message_async( ) -> int: import msgspec - await asyncio.sleep(self._delay) + if self._delay: + await asyncio.sleep(self._delay) tool_calls_str = ( msgspec.json.encode(list(tool_calls)).decode() if tool_calls else "" ) @@ -164,7 +167,7 @@ def make_aggregator( loop: asyncio.AbstractEventLoop, socket_name: str, *, - tokenize_pool=None, + tokenizer=None, streaming: bool = True, shutdown_event: asyncio.Event | None = None, ) -> tuple[MetricsAggregatorService, MetricsRegistry, MagicMock]: @@ -195,7 +198,7 @@ def make_aggregator( publish_interval_s=0.25, sig_figs=3, n_histogram_buckets=10, - tokenize_pool=tokenize_pool, + tokenizer=tokenizer, streaming=streaming, shutdown_event=shutdown_event, ) diff --git a/tests/unit/async_utils/services/metrics_aggregator/test_aggregator.py b/tests/unit/async_utils/services/metrics_aggregator/test_aggregator.py index 9877aee5d..87c5ff96b 100644 --- a/tests/unit/async_utils/services/metrics_aggregator/test_aggregator.py +++ b/tests/unit/async_utils/services/metrics_aggregator/test_aggregator.py @@ -44,7 +44,7 @@ from 
inference_endpoint.core.types import ErrorData, PromptData, TextModelOutput from .conftest import ( - MockTokenizePool, + MockBatchTokenizer, make_aggregator, sample_event, session_event, @@ -312,10 +312,10 @@ async def test_chunk_deltas(self, tmp_path): async def test_non_streaming_latency_only(self, tmp_path): """Non-streaming: emits sample_latency_ns + OSL, no TTFT/chunk_delta/TPOT.""" loop = asyncio.get_event_loop() - pool = MockTokenizePool(delay=0.0) + tokenizer = MockBatchTokenizer(delay=0.0) with ManagedZMQContext.scoped(socket_dir=str(tmp_path)) as ctx: agg, registry, _ = make_aggregator( - ctx, loop, "agg_non_streaming", tokenize_pool=pool + ctx, loop, "agg_non_streaming", tokenizer=tokenizer ) try: await agg.process( @@ -332,7 +332,7 @@ async def test_non_streaming_latency_only(self, tmp_path): ), ] ) - await agg._table.drain_tasks() + await agg._flush_tokens() # sample_latency = 3000-1000 = 2000 assert ( snapshot_series_total( @@ -380,7 +380,7 @@ async def test_chunk_delta_not_emitted_without_last_recv(self, tmp_path): # --------------------------------------------------------------------------- -# ISL (token_ids path -- sync, no tokenize_pool needed) +# ISL (token_ids path -- sync, no tokenizer needed) # --------------------------------------------------------------------------- @@ -766,7 +766,7 @@ async def test_total_vs_tracked_counters(self, tmp_path): # --------------------------------------------------------------------------- -# Async trigger tests (with mock TokenizePool and real event loop) +# Token trigger tests (with mock BatchTokenizer and real event loop) # --------------------------------------------------------------------------- @@ -776,10 +776,10 @@ class TestAsyncTriggers: async def test_isl_text_path_async(self, tmp_path): """ISL with text prompt triggers async tokenization.""" loop = asyncio.get_event_loop() - pool = MockTokenizePool(delay=0.01) + tokenizer = MockBatchTokenizer(delay=0.01) with 
ManagedZMQContext.scoped(socket_dir=str(tmp_path)) as ctx: agg, registry, _ = make_aggregator( - ctx, loop, "agg_isl_text_async", tokenize_pool=pool + ctx, loop, "agg_isl_text_async", tokenizer=tokenizer ) try: await agg.process( @@ -796,7 +796,7 @@ async def test_isl_text_path_async(self, tmp_path): ] ) # ISL task is in-flight; drain it - await agg._table.drain_tasks() + await agg._flush_tokens() assert snapshot_series_total(registry, MetricSeriesKey.ISL.value) == 4 finally: agg.close() @@ -805,10 +805,10 @@ async def test_isl_text_path_async(self, tmp_path): async def test_osl_emitted_on_complete(self, tmp_path): """OSL is emitted via async tokenization when COMPLETE carries text.""" loop = asyncio.get_event_loop() - pool = MockTokenizePool(delay=0.01) + tokenizer = MockBatchTokenizer(delay=0.01) with ManagedZMQContext.scoped(socket_dir=str(tmp_path)) as ctx: agg, registry, _ = make_aggregator( - ctx, loop, "agg_osl_complete", tokenize_pool=pool + ctx, loop, "agg_osl_complete", tokenizer=tokenizer ) try: await agg.process( @@ -825,7 +825,7 @@ async def test_osl_emitted_on_complete(self, tmp_path): ), ] ) - await agg._table.drain_tasks() + await agg._flush_tokens() # sample_latency_ns = 5000-1000 = 4000 assert ( snapshot_series_total( @@ -842,10 +842,10 @@ async def test_osl_emitted_on_complete(self, tmp_path): async def test_tpot_emitted_for_streaming(self, tmp_path): """TPOT is emitted for streaming responses using text_after_first_chunk.""" loop = asyncio.get_event_loop() - pool = MockTokenizePool(delay=0.0) + tokenizer = MockBatchTokenizer(delay=0.0) with ManagedZMQContext.scoped(socket_dir=str(tmp_path)) as ctx: agg, registry, _ = make_aggregator( - ctx, loop, "agg_tpot_streaming", tokenize_pool=pool + ctx, loop, "agg_tpot_streaming", tokenizer=tokenizer ) try: await agg.process( @@ -864,7 +864,7 @@ async def test_tpot_emitted_for_streaming(self, tmp_path): ), ] ) - await agg._table.drain_tasks() + await agg._flush_tokens() # OSL = "hello world foo" = 3 
tokens assert snapshot_series_total(registry, MetricSeriesKey.OSL.value) == 3 # tpot = (5000 - 2000) / token_count("world foo") = 3000 / 2 = 1500 @@ -878,10 +878,10 @@ async def test_tpot_emitted_for_streaming(self, tmp_path): async def test_tpot_skipped_when_single_chunk(self, tmp_path): """TPOT is not emitted when there are no tokens after the first chunk.""" loop = asyncio.get_event_loop() - pool = MockTokenizePool(delay=0.0) + tokenizer = MockBatchTokenizer(delay=0.0) with ManagedZMQContext.scoped(socket_dir=str(tmp_path)) as ctx: agg, registry, _ = make_aggregator( - ctx, loop, "agg_tpot_single_chunk", tokenize_pool=pool + ctx, loop, "agg_tpot_single_chunk", tokenizer=tokenizer ) try: await agg.process( @@ -900,7 +900,7 @@ async def test_tpot_skipped_when_single_chunk(self, tmp_path): ), ] ) - await agg._table.drain_tasks() + await agg._flush_tokens() assert snapshot_series_total(registry, MetricSeriesKey.OSL.value) == 1 assert ( snapshot_series_count(registry, MetricSeriesKey.TPOT_NS.value) == 0 @@ -914,13 +914,13 @@ async def test_tpot_not_emitted_without_streaming_flag(self, tmp_path): registered at all — the aggregator's snapshot has no entry for them. """ loop = asyncio.get_event_loop() - pool = MockTokenizePool(delay=0.0) + tokenizer = MockBatchTokenizer(delay=0.0) with ManagedZMQContext.scoped(socket_dir=str(tmp_path)) as ctx: agg, registry, _ = make_aggregator( ctx, loop, "agg_tpot_no_streaming", - tokenize_pool=pool, + tokenizer=tokenizer, streaming=False, ) try: @@ -939,7 +939,7 @@ async def test_tpot_not_emitted_without_streaming_flag(self, tmp_path): ), ] ) - await agg._table.drain_tasks() + await agg._flush_tokens() # sample_latency / OSL still emitted in non-streaming mode. 
assert ( snapshot_series_total( @@ -959,10 +959,10 @@ async def test_tpot_not_emitted_without_streaming_flag(self, tmp_path): async def test_tpot_non_streaming_output_skipped(self, tmp_path): """TPOT is not emitted for non-streaming (str) TextModelOutput.""" loop = asyncio.get_event_loop() - pool = MockTokenizePool(delay=0.0) + tokenizer = MockBatchTokenizer(delay=0.0) with ManagedZMQContext.scoped(socket_dir=str(tmp_path)) as ctx: agg, registry, _ = make_aggregator( - ctx, loop, "agg_tpot_str_output", tokenize_pool=pool + ctx, loop, "agg_tpot_str_output", tokenizer=tokenizer ) try: await agg.process( @@ -981,7 +981,7 @@ async def test_tpot_non_streaming_output_skipped(self, tmp_path): ), ] ) - await agg._table.drain_tasks() + await agg._flush_tokens() assert snapshot_series_total(registry, MetricSeriesKey.OSL.value) == 3 assert ( snapshot_series_count(registry, MetricSeriesKey.TPOT_NS.value) == 0 @@ -990,13 +990,13 @@ async def test_tpot_non_streaming_output_skipped(self, tmp_path): agg.close() @pytest.mark.asyncio - async def test_drain_tasks_awaits_in_flight(self, tmp_path): - """drain_tasks() properly awaits all in-flight async trigger tasks.""" + async def test_flush_records_buffered_tokenizations(self, tmp_path): + """fire() buffers tokenization; flush() tokenizes the batch and records.""" loop = asyncio.get_event_loop() - pool = MockTokenizePool(delay=0.05) + tokenizer = MockBatchTokenizer() with ManagedZMQContext.scoped(socket_dir=str(tmp_path)) as ctx: agg, registry, _ = make_aggregator( - ctx, loop, "agg_drain_in_flight", tokenize_pool=pool + ctx, loop, "agg_flush_records", tokenizer=tokenizer ) try: await agg.process( @@ -1012,23 +1012,24 @@ async def test_drain_tasks_awaits_in_flight(self, tmp_path): ), ] ) - # Tasks are in-flight but not yet complete - assert agg._table.in_flight_tasks_count > 0 + assert agg._token_queue is not None + # Enqueued by fire(), not yet tokenized (no tick/drain flush). 
+ assert agg._token_queue.pending > 0 - await agg._table.drain_tasks() - assert agg._table.in_flight_tasks_count == 0 + await agg._flush_tokens() + assert agg._token_queue.pending == 0 assert snapshot_series_total(registry, MetricSeriesKey.ISL.value) == 5 finally: agg.close() @pytest.mark.asyncio - async def test_shutdown_drains_async_tasks(self, tmp_path): - """ENDED drains in-flight async tasks before finalizing.""" + async def test_shutdown_flushes_buffered_tokenizations(self, tmp_path): + """ENDED flushes buffered tokenizations before finalizing.""" loop = asyncio.get_event_loop() - pool = MockTokenizePool(delay=0.02) + tokenizer = MockBatchTokenizer(delay=0.02) with ManagedZMQContext.scoped(socket_dir=str(tmp_path)) as ctx: agg, registry, publisher = make_aggregator( - ctx, loop, "agg_shutdown_drain", tokenize_pool=pool + ctx, loop, "agg_shutdown_drain", tokenizer=tokenizer ) try: await agg.process( @@ -1045,16 +1046,16 @@ async def test_shutdown_drains_async_tasks(self, tmp_path): session_event(SessionEventType.ENDED, ts=2000), ] ) - # After ENDED, drain_tasks ran inside process() — ISL emitted. + # After ENDED, flush_remaining ran inside process() — ISL emitted. assert snapshot_series_total(registry, MetricSeriesKey.ISL.value) == 3 publisher.publish_final.assert_awaited_once() finally: agg.close() # NOTE(agents): Trigger exception handling (logger.exception paths) is not - # exercised here. Adding a MockTokenizePool that raises on - # token_count_async would let us assert no metric is emitted, the - # aggregator does not crash, and the task set is cleaned up. + # exercised here. A MockBatchTokenizer whose count_texts_async raises would + # let us assert the flush surfaces the error without crashing the + # aggregator and that the buffer is cleared. 
@pytest.mark.asyncio async def test_drain_timeout_reports_pending_count(self, tmp_path): @@ -1068,29 +1069,21 @@ async def test_drain_timeout_reports_pending_count(self, tmp_path): """ loop = asyncio.get_event_loop() - class BlockingTokenizePool: - async def token_count_async(self, text, _loop): + class BlockingBatchTokenizer: + async def count_texts_async(self, texts, _loop): await asyncio.sleep(10.0) # exceeds drain timeout - return 0 + return [0] * len(texts) - def token_count(self, text): + async def token_count_message_async(self, *args): + await asyncio.sleep(10.0) return 0 - def close(self): - pass - - def __enter__(self): - return self - - def __exit__(self, *args): - self.close() - with ManagedZMQContext.scoped(socket_dir=str(tmp_path)) as ctx: agg, _, publisher = make_aggregator( ctx, loop, "agg_drain_timeout", - tokenize_pool=BlockingTokenizePool(), + tokenizer=BlockingBatchTokenizer(), ) agg._drain_timeout_s = 0.05 try: @@ -1107,9 +1100,10 @@ def __exit__(self, *args): ), ] ) + assert agg._token_queue is not None assert ( - agg._table.in_flight_tasks_count > 0 - ), "precondition: ISL task must be in-flight before ENDED" + agg._token_queue.pending > 0 + ), "precondition: ISL must be buffered before ENDED" await agg.process([session_event(SessionEventType.ENDED, ts=2000)]) publisher.publish_final.assert_awaited_once() @@ -1125,10 +1119,10 @@ def __exit__(self, *args): async def test_tpot_osl_for_tool_call_complete(self, tmp_path): """OSL and TPOT use message-path tokenization when COMPLETE carries tool_calls.""" loop = asyncio.get_event_loop() - pool = MockTokenizePool(delay=0.0) + tokenizer = MockBatchTokenizer(delay=0.0) with ManagedZMQContext.scoped(socket_dir=str(tmp_path)) as ctx: agg, registry, _ = make_aggregator( - ctx, loop, "agg_tpot_osl_tool_call", tokenize_pool=pool + ctx, loop, "agg_tpot_osl_tool_call", tokenizer=tokenizer ) try: tool_call = { @@ -1151,7 +1145,7 @@ async def test_tpot_osl_for_tool_call_complete(self, tmp_path): ), ] ) - await 
agg._table.drain_tasks() + await agg._flush_tokens() # OSL = token_count("ok" + tool_calls_json) = 2 assert snapshot_series_total(registry, MetricSeriesKey.OSL.value) == 2 # tpot = (5000 - 2000) / token_count(tool_calls_json) = 3000 / 1 = 3000 diff --git a/tests/unit/async_utils/services/metrics_aggregator/test_main_signal_handler.py b/tests/unit/async_utils/services/metrics_aggregator/test_main_signal_handler.py index 550a4863c..32f159403 100644 --- a/tests/unit/async_utils/services/metrics_aggregator/test_main_signal_handler.py +++ b/tests/unit/async_utils/services/metrics_aggregator/test_main_signal_handler.py @@ -50,7 +50,8 @@ async def test_sigterm_handler_holds_strong_reference_to_finalize_task(): registry = MagicMock() table = MagicMock() table.total_tracked_duration_ns = 0 - table.in_flight_tasks_count = 0 + token_queue = MagicMock() + token_queue.pending = 0 # publish_final blocks on an event so we can observe the task # mid-execution and exercise the strong-ref contract. @@ -69,6 +70,7 @@ async def _slow_publish(*args, **kwargs): registry=registry, publisher=publisher, table=table, + token_queue=token_queue, shutdown_event=shutdown_event, ) @@ -122,7 +124,8 @@ async def test_sigterm_handler_refreshes_tracked_duration(): registry = MagicMock() table = MagicMock() table.total_tracked_duration_ns = 12345 - table.in_flight_tasks_count = 3 + token_queue = MagicMock() + token_queue.pending = 3 publisher = MagicMock() publisher.publish_final = AsyncMock() @@ -134,6 +137,7 @@ async def test_sigterm_handler_refreshes_tracked_duration(): registry=registry, publisher=publisher, table=table, + token_queue=token_queue, shutdown_event=shutdown_event, ) on_sigterm() diff --git a/tests/unit/async_utils/services/metrics_aggregator/test_metrics_table.py b/tests/unit/async_utils/services/metrics_aggregator/test_metrics_table.py index 077923ff8..4ed957a98 100644 --- a/tests/unit/async_utils/services/metrics_aggregator/test_metrics_table.py +++ 
b/tests/unit/async_utils/services/metrics_aggregator/test_metrics_table.py @@ -34,6 +34,9 @@ from inference_endpoint.async_utils.services.metrics_aggregator.registry import ( MetricsRegistry, ) +from inference_endpoint.async_utils.services.metrics_aggregator.token_metrics import ( + TokenBatchQueue, +) from inference_endpoint.core.record import ( EventRecord, SampleEventType, @@ -294,13 +297,13 @@ async def test_osl_with_tool_calls_uses_message_path(self): ) from inference_endpoint.core.types import TextModelOutput - from .conftest import MockTokenizePool, snapshot_series_count + from .conftest import MockBatchTokenizer, snapshot_series_count registry = MetricsRegistry() registry.register_series("osl", hdr_low=1, hdr_high=100_000) loop = asyncio.get_running_loop() - pool = MockTokenizePool(delay=0) - trigger = OslTrigger(registry, pool, loop) + queue = TokenBatchQueue(MockBatchTokenizer(), loop) + trigger = OslTrigger(registry, queue) tool_calls = ( { @@ -317,9 +320,8 @@ async def test_osl_with_tool_calls_uses_message_path(self): data=tmo, ) row = SampleRow(sample_uuid="s1") - task = trigger.fire(ev, row, {}) - assert task is not None - await task + trigger.fire(ev, row, {}) + await queue.flush() assert snapshot_series_count(registry, "osl") == 1 @@ -331,13 +333,13 @@ async def test_osl_without_tool_calls_uses_text_path(self): ) from inference_endpoint.core.types import TextModelOutput - from .conftest import MockTokenizePool, snapshot_series_count + from .conftest import MockBatchTokenizer, snapshot_series_count registry = MetricsRegistry() registry.register_series("osl", hdr_low=1, hdr_high=100_000) loop = asyncio.get_running_loop() - pool = MockTokenizePool(delay=0) - trigger = OslTrigger(registry, pool, loop) + queue = TokenBatchQueue(MockBatchTokenizer(), loop) + trigger = OslTrigger(registry, queue) tmo = TextModelOutput(output="hello world") ev = EventRecord( @@ -347,9 +349,8 @@ async def test_osl_without_tool_calls_uses_text_path(self): data=tmo, ) row = 
SampleRow(sample_uuid="s1") - task = trigger.fire(ev, row, {}) - assert task is not None - await task + trigger.fire(ev, row, {}) + await queue.flush() assert snapshot_series_count(registry, "osl") == 1 @@ -368,15 +369,15 @@ async def test_tpot_tool_calls_only_response(self): ) from inference_endpoint.core.types import TextModelOutput - from .conftest import MockTokenizePool, snapshot_series_count + from .conftest import MockBatchTokenizer, snapshot_series_count registry = MetricsRegistry() registry.register_series( "tpot_ns", hdr_low=1, hdr_high=100_000_000_000, dtype=float ) loop = asyncio.get_running_loop() - pool = MockTokenizePool(delay=0) - trigger = TpotTrigger(registry, pool, loop) + queue = TokenBatchQueue(MockBatchTokenizer(), loop) + trigger = TpotTrigger(registry, queue) tool_calls = ( { @@ -395,9 +396,8 @@ async def test_tpot_tool_calls_only_response(self): row = SampleRow(sample_uuid="s1") # RECV_FIRST_NS was set at t=1000 pre_change = {SampleField.RECV_FIRST_NS: 1000} - task = trigger.fire(ev, row, pre_change) - assert task is not None - await task + trigger.fire(ev, row, pre_change) + await queue.flush() assert snapshot_series_count(registry, "tpot_ns") == 1 @@ -409,15 +409,15 @@ async def test_tpot_uses_tool_call_deltas_after_first_chunk(self): ) from inference_endpoint.core.types import TextModelOutput - from .conftest import MockTokenizePool, snapshot_series_total + from .conftest import MockBatchTokenizer, snapshot_series_total registry = MetricsRegistry() registry.register_series( "tpot_ns", hdr_low=1, hdr_high=100_000_000_000, dtype=float ) loop = asyncio.get_running_loop() - pool = MockTokenizePool(delay=0) - trigger = TpotTrigger(registry, pool, loop) + queue = TokenBatchQueue(MockBatchTokenizer(), loop) + trigger = TpotTrigger(registry, queue) tool_call_chunks = ( ( @@ -442,8 +442,7 @@ async def test_tpot_uses_tool_call_deltas_after_first_chunk(self): ) row = SampleRow(sample_uuid="s1") pre_change = {SampleField.RECV_FIRST_NS: 1000} - task 
= trigger.fire(ev, row, pre_change) - assert task is not None - await task + trigger.fire(ev, row, pre_change) + await queue.flush() assert snapshot_series_total(registry, "tpot_ns") == pytest.approx(2000.0) diff --git a/tests/unit/async_utils/services/metrics_aggregator/test_token_metrics.py b/tests/unit/async_utils/services/metrics_aggregator/test_token_metrics.py index e25bf0022..b59e56501 100644 --- a/tests/unit/async_utils/services/metrics_aggregator/test_token_metrics.py +++ b/tests/unit/async_utils/services/metrics_aggregator/test_token_metrics.py @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""Tests for TokenizePool thread-safety and correctness.""" +"""Tests for BatchTokenizer and TokenBatchQueue.""" import asyncio import time @@ -22,18 +22,22 @@ import pytest from inference_endpoint.async_utils.services.metrics_aggregator.token_metrics import ( - TokenizePool, + BatchTokenizer, + TokenBatchQueue, ) _MOCK_TARGET = "inference_endpoint.async_utils.services.metrics_aggregator.token_metrics.AutoTokenizer" class _FakeTokenizer: - """Deterministic tokenizer that splits on whitespace.""" + """Deterministic tokenizer that splits on whitespace. - def __init__(self, load_delay: float = 0.1): - # Simulate the blocking cost of from_pretrained so that - # pre-initialization in __init__ saturates all worker threads. + Has no ``backend_tokenizer``, so BatchTokenizer keeps the batch path + in-process (no subprocess shards) and ``count_texts`` falls back to + ``tokenize`` per text — which is what these tests assert against. 
+ """ + + def __init__(self, load_delay: float = 0.0): time.sleep(load_delay) def tokenize(self, text: str) -> list[str]: @@ -46,64 +50,58 @@ def from_pretrained(cls, name: str, **kwargs: object) -> "_FakeTokenizer": @pytest.mark.unit -class TestTokenizePool: +class TestBatchTokenizer: def test_token_count_returns_int(self): with patch(_MOCK_TARGET, _FakeTokenizer): - with TokenizePool("fake", n_workers=1) as pool: - count = pool.token_count("Hello world") - assert count == 2 + with BatchTokenizer("fake") as tok: + assert tok.token_count("Hello world") == 2 - def test_multiple_workers(self): + def test_count_texts_batch(self): with patch(_MOCK_TARGET, _FakeTokenizer): - with TokenizePool("fake", n_workers=4) as pool: - results = [] - for i in range(10): - results.append(pool.token_count(f"Sentence number {i}")) - assert all(isinstance(r, int) and r > 0 for r in results) + with BatchTokenizer("fake") as tok: + assert tok.count_texts(["a b", "c d e", "x"]) == [2, 3, 1] - def test_concurrent_calls_thread_safe(self): + def test_count_texts_empty(self): with patch(_MOCK_TARGET, _FakeTokenizer): - with TokenizePool("fake", n_workers=2) as pool: - texts = [f"word{i} word{i+1}" for i in range(20)] + with BatchTokenizer("fake") as tok: + assert tok.count_texts([]) == [] + def test_concurrent_token_count_thread_safe(self): + with patch(_MOCK_TARGET, _FakeTokenizer): + with BatchTokenizer("fake") as tok: + texts = [f"word{i} word{i + 1}" for i in range(20)] with ThreadPoolExecutor(max_workers=8) as executor: - futures = [executor.submit(pool.token_count, t) for t in texts] + futures = [executor.submit(tok.token_count, t) for t in texts] results = [f.result() for f in futures] - - assert len(results) == 20 - assert all(r == 2 for r in results) + assert results == [2] * 20 def test_close_is_idempotent(self): with patch(_MOCK_TARGET, _FakeTokenizer): - pool = TokenizePool("fake", n_workers=1) - pool.close() - pool.close() # Should not raise + tok = BatchTokenizer("fake") + 
tok.close() + tok.close() # must not raise def test_use_after_close_raises(self): with patch(_MOCK_TARGET, _FakeTokenizer): - pool = TokenizePool("fake", n_workers=1) - pool.close() + tok = BatchTokenizer("fake") + tok.close() with pytest.raises(RuntimeError, match="closed"): - pool.token_count("hello") - - def test_n_workers_zero_raises(self): - with pytest.raises(ValueError, match="n_workers"): - TokenizePool("fake", n_workers=0) + tok.token_count("hello") @pytest.mark.asyncio - async def test_token_count_async(self): + async def test_count_texts_async(self): with patch(_MOCK_TARGET, _FakeTokenizer): loop = asyncio.get_running_loop() - with TokenizePool("fake", n_workers=1) as pool: - count = await pool.token_count_async("Hello world foo", loop) - assert count == 3 + with BatchTokenizer("fake") as tok: + counts = await tok.count_texts_async(["Hello world foo", "a"], loop) + assert counts == [3, 1] def test_context_manager(self): with patch(_MOCK_TARGET, _FakeTokenizer): - with TokenizePool("fake", n_workers=1) as pool: - assert pool.token_count("a b c") == 3 + with BatchTokenizer("fake") as tok: + assert tok.token_count("a b c") == 3 with pytest.raises(RuntimeError, match="closed"): - pool.token_count("test") + tok.token_count("test") class _FakeTokenizerWithTemplate(_FakeTokenizer): @@ -131,19 +129,18 @@ def apply_chat_template( @pytest.mark.unit -class TestTokenizePoolMessageTokenization: +class TestBatchTokenizerMessageTokenization: def test_token_count_message_subtracts_baseline(self): """token_count_message returns full_tokens - baseline.""" with patch(_MOCK_TARGET, _FakeTokenizerWithTemplate): - with TokenizePool("fake", n_workers=1) as pool: - # "hello world" -> 2 content words + 2 wrapper = 4; baseline = 0 + 2 = 2; net = 2 - count = pool.token_count_message("hello world", None, None) - assert count == 2 + with BatchTokenizer("fake") as tok: + # "hello world" -> 2 content + 2 wrapper = 4; baseline = 0, prefix = 2 + assert tok.token_count_message("hello 
world", None, None) == 2 def test_token_count_message_includes_tool_calls(self): """token_count_message includes tool-call JSON tokens.""" with patch(_MOCK_TARGET, _FakeTokenizerWithTemplate): - with TokenizePool("fake", n_workers=1) as pool: + with BatchTokenizer("fake") as tok: tool_calls = ( { "id": "c1", @@ -151,9 +148,9 @@ def test_token_count_message_includes_tool_calls(self): "function": {"name": "f", "arguments": "{}"}, }, ) - count_without = pool.token_count_message("hello", None, None) - count_with = pool.token_count_message("hello", None, tool_calls) - assert count_with > count_without + without = tok.token_count_message("hello", None, None) + with_calls = tok.token_count_message("hello", None, tool_calls) + assert with_calls > without def test_token_count_message_fallback_on_exception(self): """Falls back to whitespace split when apply_chat_template raises.""" @@ -163,7 +160,7 @@ def apply_chat_template(self, *args, **kwargs): raise ValueError("template does not support tool_calls") with patch(_MOCK_TARGET, _BadTemplateTokenizer): - with TokenizePool("fake", n_workers=1) as pool: + with BatchTokenizer("fake") as tok: tool_calls = ( { "id": "c1", @@ -171,17 +168,82 @@ def apply_chat_template(self, *args, **kwargs): "function": {"name": "f", "arguments": "{}"}, }, ) - # Should not raise; falls back to whitespace tokenizer - count = pool.token_count_message("hello world", None, tool_calls) - assert count > 0 + # Must not raise; falls back to whitespace tokenizer. 
+ assert tok.token_count_message("hello world", None, tool_calls) > 0 @pytest.mark.asyncio async def test_token_count_message_async(self): - """token_count_message_async returns count without blocking event loop.""" with patch(_MOCK_TARGET, _FakeTokenizerWithTemplate): loop = asyncio.get_running_loop() - with TokenizePool("fake", n_workers=1) as pool: - count = await pool.token_count_message_async( + with BatchTokenizer("fake") as tok: + count = await tok.token_count_message_async( "hello world", None, None, loop ) assert count == 2 + + +class _CapturingTokenizer: + """Minimal tokenizer stub for queue tests: whitespace counts, no procs.""" + + async def count_texts_async(self, texts, _loop): + return [len(t.split()) for t in texts] + + async def token_count_message_async(self, content, reasoning, tool_calls, _loop): + parts = [p for p in (content, reasoning) if p] + return len(" ".join(parts).split()) + (len(tool_calls) if tool_calls else 0) + + +@pytest.mark.unit +@pytest.mark.asyncio +class TestTokenBatchQueue: + async def test_flush_records_text_via_callback(self): + loop = asyncio.get_running_loop() + queue = TokenBatchQueue(_CapturingTokenizer(), loop) + recorded: list[int] = [] + queue.enqueue_text("a b c", recorded.append) + queue.enqueue_text("d e", recorded.append) + assert queue.pending == 2 + await queue.flush() + assert sorted(recorded) == [2, 3] + assert queue.pending == 0 + + async def test_flush_records_message_via_callback(self): + loop = asyncio.get_running_loop() + queue = TokenBatchQueue(_CapturingTokenizer(), loop) + recorded: list[int] = [] + queue.enqueue_message(("hello world", None, None), recorded.append) + await queue.flush() + assert recorded == [2] + + async def test_flush_empty_is_noop(self): + loop = asyncio.get_running_loop() + queue = TokenBatchQueue(_CapturingTokenizer(), loop) + await queue.flush() + assert queue.pending == 0 + + async def test_flush_remaining_clean_returns_zero(self): + loop = asyncio.get_running_loop() + queue = 
TokenBatchQueue(_CapturingTokenizer(), loop) + recorded: list[int] = [] + queue.enqueue_text("a b", recorded.append) + assert await queue.flush_remaining(timeout=5.0) == 0 + assert recorded == [2] + + async def test_flush_remaining_timeout_reports_pending(self): + """A tokenizer slower than the budget leaves items pending.""" + + class _BlockingTokenizer: + async def count_texts_async(self, texts, _loop): + await asyncio.sleep(10.0) + return [0] * len(texts) + + async def token_count_message_async(self, *args): + return 0 + + loop = asyncio.get_running_loop() + queue = TokenBatchQueue(_BlockingTokenizer(), loop) + recorded: list[int] = [] + queue.enqueue_text("never counted", recorded.append) + n_pending = await queue.flush_remaining(timeout=0.05) + assert n_pending == 1 + assert recorded == [] diff --git a/tests/unit/commands/test_benchmark.py b/tests/unit/commands/test_benchmark.py index 1c90554fb..969f22ce2 100644 --- a/tests/unit/commands/test_benchmark.py +++ b/tests/unit/commands/test_benchmark.py @@ -490,7 +490,6 @@ def test_defaults(self): assert cfg.performance_timeout_s == 240.0 assert cfg.accuracy_timeout_s is None assert cfg.metrics_drain_timeout_s == 60.0 - assert cfg.metrics_tokenizer_workers == 2 @pytest.mark.unit @pytest.mark.parametrize( @@ -512,11 +511,6 @@ def test_metrics_drain_timeout_negative_rejected(self): with pytest.raises(ValidationError): DrainConfig(metrics_drain_timeout_s=-1.0) - @pytest.mark.unit - def test_metrics_tokenizer_workers_must_be_at_least_one(self): - with pytest.raises(ValidationError): - DrainConfig(metrics_tokenizer_workers=0) - @pytest.mark.unit def test_extra_fields_rejected(self): with pytest.raises(ValidationError): @@ -538,7 +532,6 @@ def test_yaml_roundtrip(self, tmp_path): performance_timeout_s: 30.0 accuracy_timeout_s: null metrics_drain_timeout_s: 300.0 - metrics_tokenizer_workers: 8 """ config_file = tmp_path / "drain.yaml" config_file.write_text(yaml_content) @@ -548,7 +541,6 @@ def test_yaml_roundtrip(self, 
tmp_path): assert drain.performance_timeout_s == 30.0 assert drain.accuracy_timeout_s is None assert drain.metrics_drain_timeout_s == 300.0 - assert drain.metrics_tokenizer_workers == 8 class TestAggregatorArgs: @@ -639,60 +631,6 @@ async def _capture_launch(service_configs, *, timeout): idx = args.index("--drain-timeout") assert args[idx + 1] == expected_flag - @pytest.mark.unit - @pytest.mark.asyncio - @pytest.mark.parametrize("workers, expected_flag", [(4, "4"), (8, "8"), (2, "2")]) - async def test_tokenizer_workers_forwarded_to_aggregator_args( - self, tmp_path, workers, expected_flag - ): - config = OfflineConfig( - **_OFFLINE_KWARGS, - settings=OfflineSettings( - drain=DrainConfig(metrics_tokenizer_workers=workers) - ), - ) - ctx = self._make_ctx(config, tmp_path) - - captured: list = [] - - async def _capture_launch(service_configs, *, timeout): - captured.extend(service_configs) - raise KeyboardInterrupt("stop after launch") - - mock_zmq = MagicMock() - mock_zmq.socket_dir = str(tmp_path / "sockets") - - with ( - patch( - "inference_endpoint.commands.benchmark.execute.ManagedZMQContext" - ) as MockZMQ, - patch( - "inference_endpoint.commands.benchmark.execute.EventPublisherService" - ) as MockPub, - patch( - "inference_endpoint.commands.benchmark.execute.MetricsSnapshotSubscriber" - ) as MockSub, - patch( - "inference_endpoint.commands.benchmark.execute.ServiceLauncher" - ) as MockLauncher, - patch("inference_endpoint.commands.benchmark.execute.tqdm"), - ): - MockZMQ.scoped.return_value.__enter__ = MagicMock(return_value=mock_zmq) - MockZMQ.scoped.return_value.__exit__ = MagicMock(return_value=False) - MockPub.return_value.socket_name = "test_pub" - MockSub.return_value.start = MagicMock() - MockLauncher.return_value.launch = _capture_launch - - loop = asyncio.get_event_loop() - with pytest.raises(KeyboardInterrupt): - await _run_benchmark_async(ctx, loop) - - aggregator_cfg = next(c for c in captured if "metrics_aggregator" in c.module) - args = 
aggregator_cfg.args - assert "--tokenizer-workers" in args - idx = args.index("--tokenizer-workers") - assert args[idx + 1] == expected_flag - class TestBuildPhases: """Tests for _build_phases() in execute.py.""" From b40f72f0fba65f01f1e5719202c18afd47bb5eb9 Mon Sep 17 00:00:00 2001 From: Viraat Chandra Date: Tue, 9 Jun 2026 17:12:33 -0700 Subject: [PATCH 02/20] fix(metrics): never skip finalize on tokenizer drain failure; cleanup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The ENDED drain sat outside the finalization try/finally and flush_remaining caught only TimeoutError: any other tokenizer failure (e.g. BrokenProcessPool from a dead shard) escaped the fire-and-forget process() task, skipped publish_final, and hung the subprocess with no final_snapshot.json. The drain now runs inside the finalization boundary and flush_remaining swallows non-timeout failures, logs them, and returns the un-tokenized count — surfacing as an incomplete drain (n_pending_tasks > 0) instead of a hang. Cleanup (review feedback): - delete the test-only sync API (count_texts / token_count / token_count_message); production uses only the async paths, and count_texts_async now raises RuntimeError after close() - rename AsyncTokenTrigger -> TokenTrigger (fire() is sync; it enqueues) - extract _encode_batch_lengths shared by the worker and in-process paths - pending_tokens property collapses the triple None-guard; the SIGTERM handler takes a pending_tokens callback instead of reaching into aggregator._token_queue - drop vestigial return None and quoted forward-ref; trim stale "async tasks" wording in docs and the drain-timeout help text (templates regenerated); document the wait=False shard shutdown Tests: sharded-path reassembly + BrokenProcessPool propagation, _even_chunks, and queue/aggregator drain-failure regression tests. 145 aggregator unit + 160 config/commands/integration tests pass; pre-commit clean. 
Validated on GB200 (ptyche, 144-core Grace, 18 shards, real DeepSeek-R1 tokenizer at mean OSL 3877): 38x vs the per-event pool; 1M-output drain 84s vs ~53min. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../services/metrics_aggregator/__main__.py | 8 +- .../services/metrics_aggregator/aggregator.py | 83 ++++----- .../metrics_aggregator/metrics_table.py | 29 ++-- .../metrics_aggregator/token_metrics.py | 140 ++++++--------- src/inference_endpoint/config/schema.py | 11 +- .../templates/concurrency_template_full.yaml | 2 +- .../templates/offline_template_full.yaml | 2 +- .../templates/online_template_full.yaml | 2 +- .../services/metrics_aggregator/conftest.py | 3 - .../metrics_aggregator/test_aggregator.py | 48 +++++- .../test_main_signal_handler.py | 10 +- .../metrics_aggregator/test_token_metrics.py | 163 ++++++++++++------ 12 files changed, 285 insertions(+), 216 deletions(-) diff --git a/src/inference_endpoint/async_utils/services/metrics_aggregator/__main__.py b/src/inference_endpoint/async_utils/services/metrics_aggregator/__main__.py index 9cd1c7e5e..7d2101c11 100644 --- a/src/inference_endpoint/async_utils/services/metrics_aggregator/__main__.py +++ b/src/inference_endpoint/async_utils/services/metrics_aggregator/__main__.py @@ -33,7 +33,7 @@ from .publisher import MetricsPublisher from .registry import MetricsRegistry from .snapshot import MetricsSnapshotCodec -from .token_metrics import BatchTokenizer, TokenBatchQueue +from .token_metrics import BatchTokenizer logger = logging.getLogger(__name__) @@ -44,7 +44,7 @@ def _make_sigterm_handler( registry: MetricsRegistry, publisher: MetricsPublisher, table: MetricsTable, - token_queue: TokenBatchQueue | None, + pending_tokens: Callable[[], int], shutdown_event: asyncio.Event, ) -> tuple[Callable[[], None], set[asyncio.Task]]: """Build the SIGTERM handler that writes the INTERRUPTED final snapshot. 
@@ -76,7 +76,7 @@ async def _signal_finalize() -> None: ) await publisher.publish_final( registry, - n_pending_tasks=token_queue.pending if token_queue is not None else 0, + n_pending_tasks=pending_tokens(), interrupted=True, ) except Exception: # noqa: BLE001 — best-effort. @@ -263,7 +263,7 @@ async def main() -> None: registry=registry, publisher=publisher, table=aggregator._table, - token_queue=aggregator._token_queue, + pending_tokens=lambda: aggregator.pending_tokens, shutdown_event=shutdown_event, ) loop.add_signal_handler(signal.SIGTERM, on_sigterm) diff --git a/src/inference_endpoint/async_utils/services/metrics_aggregator/aggregator.py b/src/inference_endpoint/async_utils/services/metrics_aggregator/aggregator.py index ed5ace0a0..e87448036 100644 --- a/src/inference_endpoint/async_utils/services/metrics_aggregator/aggregator.py +++ b/src/inference_endpoint/async_utils/services/metrics_aggregator/aggregator.py @@ -241,6 +241,11 @@ def _register_triggers(self, streaming: bool) -> None: table.add_trigger(SampleField.LAST_RECV_NS, ChunkDeltaTrigger(registry)) table.add_trigger(SampleField.COMPLETE_NS, TpotTrigger(registry, queue)) + @property + def pending_tokens(self) -> int: + """Enqueued tokenizations not yet recorded (the snapshot n_pending_tasks).""" + return self._token_queue.pending if self._token_queue is not None else 0 + async def _flush_tokens(self) -> None: """Flush buffered tokenizations so the next snapshot reflects them.""" if self._token_queue is not None: @@ -318,9 +323,7 @@ async def process(self, records: list[EventRecord]) -> None: self._publish_interval_s, get_runtime_state=lambda: ( self._session_state, - self._token_queue.pending - if self._token_queue is not None - else 0, + self.pending_tokens, ), pre_publish=self._flush_tokens, ) @@ -377,47 +380,47 @@ async def process(self, records: list[EventRecord]) -> None: # ENDED has been observed; transition to DRAINING so any tick # that fires before publish_final reflects the new state. 
self._session_state = SessionState.DRAINING - queue = self._token_queue - pending = queue.pending if queue is not None else 0 - logger.info("Draining %d pending tokenizations...", pending) - # flush_remaining tokenizes the whole buffer in one batched pass, - # bounded by the drain budget; it returns the count it could not - # finish (non-zero only on a timeout), which becomes the snapshot's - # n_pending_tasks so Report can flag an incomplete drain. - n_pending = ( - await queue.flush_remaining(self._drain_timeout_s) - if queue is not None - else 0 - ) - if n_pending > 0: - timeout_str = ( - f"{self._drain_timeout_s:.1f}s" - if self._drain_timeout_s is not None - else "unlimited" + logger.info("Draining %d pending tokenizations...", self.pending_tokens) + # The drain and final publish are wrapped together so the aggregator + # ALWAYS reaches _finalize (which sets the shutdown event); a + # tokenizer failure during the drain must not skip publish_final and + # leave main()'s `await shutdown_event.wait()` hanging. + n_pending = self.pending_tokens + try: + # flush_remaining tokenizes the whole buffer in one batched pass, + # bounded by the drain budget, and never raises: it returns the + # count it could not finish (timeout or failure), which becomes + # the snapshot's n_pending_tasks so Report flags an incomplete drain. 
+ if self._token_queue is not None: + n_pending = await self._token_queue.flush_remaining( + self._drain_timeout_s + ) + if n_pending > 0: + budget = ( + f"{self._drain_timeout_s:.1f}s" + if self._drain_timeout_s is not None + else "unlimited" + ) + logger.warning( + "tokenizer drain incomplete (budget %s); %d tokenizations " + "did not complete", + budget, + n_pending, + ) + logger.info( + "Tokenizations drained (n_pending_tasks=%d at finalize)", n_pending ) - logger.warning( - "tokenizer drain timed out after %s; %d tokenizations " - "did not complete", - timeout_str, - n_pending, + registry.set_counter( + MetricCounterKey.TRACKED_DURATION_NS.value, + table.total_tracked_duration_ns, ) - logger.info( - "Tokenizations drained (n_pending_tasks=%d at finalize)", n_pending - ) - registry.set_counter( - MetricCounterKey.TRACKED_DURATION_NS.value, - table.total_tracked_duration_ns, - ) - try: await self._publisher.publish_final(registry, n_pending_tasks=n_pending) finally: - # Whatever happens above, the aggregator MUST close the - # publisher and signal shutdown — otherwise the main() - # entry point's `await shutdown_event.wait()` hangs - # forever and the subprocess never exits cleanly. Each - # cleanup step is independently wrapped: a failure in - # aclose must not prevent _finalize, since _finalize is - # what sets the shutdown event. + # The aggregator MUST close the publisher and signal shutdown even + # if the drain/publish above failed — otherwise main()'s + # `await shutdown_event.wait()` hangs forever. aclose is + # independently wrapped: its failure must not prevent _finalize, + # which is what sets the shutdown event. try: await self._publisher.aclose() except Exception: # noqa: BLE001 — best-effort cleanup. 
diff --git a/src/inference_endpoint/async_utils/services/metrics_aggregator/metrics_table.py b/src/inference_endpoint/async_utils/services/metrics_aggregator/metrics_table.py index f67c859e6..88d2693ee 100644 --- a/src/inference_endpoint/async_utils/services/metrics_aggregator/metrics_table.py +++ b/src/inference_endpoint/async_utils/services/metrics_aggregator/metrics_table.py @@ -179,10 +179,9 @@ def fire(self, ev_rec, row, pre_change): baseline = pre_change.get(self._delta_start_fieldname) if baseline is not None: self.registry.record(self.metric_name, ev_rec.timestamp_ns - baseline) - return None -class AsyncTokenTrigger(EmitTrigger): +class TokenTrigger(EmitTrigger): """Base for triggers whose metric needs tokenization. Subclasses implement ``_extract_text()`` to pull the text to tokenize from @@ -298,12 +297,12 @@ def __init__(self, registry: MetricsRegistry): # --------------------------------------------------------------------------- -# Token triggers (async) +# Token triggers (batched) # --------------------------------------------------------------------------- -class IslTrigger(AsyncTokenTrigger): - """ISL from PromptData: len(token_ids) sync, or token_count(text) async.""" +class IslTrigger(TokenTrigger): + """ISL from PromptData: ``len(token_ids)`` or the tokenized prompt text.""" def __init__( self, @@ -327,7 +326,7 @@ def _extract_text(self, ev_rec, row, pre_change): return None -class OslTrigger(AsyncTokenTrigger): +class OslTrigger(TokenTrigger): """OSL = token_count(full output text) from COMPLETE event data.""" def __init__( @@ -352,19 +351,13 @@ def _extract_message(self, ev_rec, row, pre_change): return None -class TpotTrigger(AsyncTokenTrigger): - """TPOT = (complete_ns - recv_first_ns) / token_count(text_after_first_chunk). - - Only registered when streaming mode is enabled. +class TpotTrigger(TokenTrigger): + """TPOT = (complete_ns - recv_first_ns) / output token count. 
- # NOTE(agents): This trigger tokenizes text_after_first_chunk independently - # from OslTrigger, which tokenizes the full output. This means the output is - # tokenized twice at COMPLETE time for streaming samples. This is intentional: - # OSL is always required (non-streaming and streaming), while TPOT is - # streaming-only. Keeping them as separate triggers allows conditional - # registration via the streaming flag. If tokenization throughput becomes a - # bottleneck, consider merging OSL and TPOT into a single trigger that - # tokenizes once and derives both metrics. + Streaming-only. Tokenizes the post-first-chunk output independently of + ``OslTrigger`` (full output), so streaming samples are tokenized twice — + intentional: OSL is always required, TPOT is conditional on the streaming + flag. """ def __init__( diff --git a/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py b/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py index 57c9704d4..d3a236d2b 100644 --- a/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py +++ b/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py @@ -15,14 +15,12 @@ """Tokenization for ISL/OSL/TPOT metrics. -``BatchTokenizer`` tokenizes whole batches of text at once. A single BPE rayon -pool saturates ~8 CPU cores (memory-bound), so to use the whole machine it -shards each batch across worker *processes*, one pinned to each block of -``CORES_PER_WORKER`` cores (their rayon pools stay NUMA-local). The aggregator -buffers per-sample text as COMPLETE events arrive and calls ``count_texts`` once -per flush (publish tick + drain) — so batching, not a per-request coalescer, -keeps tokenization ahead of completions. Falls back to a single in-process -thread when there is no fast Rust backend or fewer than two core blocks fit. 
+``BatchTokenizer`` tokenizes whole batches at once, sharded across worker +processes each pinned to a block of ``CORES_PER_WORKER`` cores (a single BPE +rayon pool is memory-bound and saturates ~8 cores). The aggregator buffers +per-sample text and flushes the batch once per publish tick and at drain. Falls +back to a single in-process thread when there is no fast Rust backend or fewer +than two core blocks fit. """ from __future__ import annotations @@ -109,13 +107,18 @@ def _init_worker(tokenizer_name: str, core_set: list[int]) -> None: _WORKER_BACKEND.encode("warmup", add_special_tokens=False) +def _encode_batch_lengths(backend: Any, texts: list[str]) -> list[int]: + """Per-text token counts via the raw tokenizers backend, one rayon call.""" + encode_batch = getattr(backend, "encode_batch_fast", None) or backend.encode_batch + return [len(e.ids) for e in encode_batch(texts, add_special_tokens=False)] + + def _worker_encode_lengths(texts: list[str]) -> list[int]: """Per-text token counts for a shard, in one rayon-parallel call.""" backend = _WORKER_BACKEND if backend is None: raise RuntimeError("tokenizer worker backend unavailable") - encode_batch = getattr(backend, "encode_batch_fast", None) or backend.encode_batch - return [len(e.ids) for e in encode_batch(texts, add_special_tokens=False)] + return _encode_batch_lengths(backend, texts) def _worker_ready(_: int) -> bool: @@ -138,10 +141,9 @@ def _even_chunks(items: list[str], n: int) -> list[list[str]]: class BatchTokenizer: """Counts tokens for batches of text, sharded across pinned CPU cores. - ``count_texts`` / ``count_texts_async`` tokenize a whole list in one shot. - The sync ``token_count`` and chat-template ``token_count_message`` paths run - on a small in-process thread pool — they are rare (single ISL probes, tool - calls) relative to the batched OSL/ISL/TPOT flush. + ``count_texts_async`` tokenizes a whole list in one sharded call. 
The + chat-template ``token_count_message_async`` path runs on a small in-process + thread — rare (tool calls) relative to the batched OSL/ISL/TPOT flush. """ def __init__( @@ -254,48 +256,31 @@ def _encode_lengths_inproc(self, texts: list[str]) -> list[int]: tok = self._tokenizer backend = getattr(tok, "backend_tokenizer", None) if backend is not None: - encode_batch = getattr(backend, "encode_batch_fast", None) - if encode_batch is None: - encode_batch = backend.encode_batch - return [len(e.ids) for e in encode_batch(texts, add_special_tokens=False)] + return _encode_batch_lengths(backend, texts) return [len(tok.tokenize(t)) for t in texts] # type: ignore[union-attr] - def count_texts(self, texts: list[str]) -> list[int]: - """Per-text token counts for a whole batch (blocking).""" - if not texts: - return [] - if not self._procs: - return self._encode_lengths_inproc(texts) - chunks = _even_chunks(texts, len(self._procs)) - futures = [ - self._procs[i].submit(_worker_encode_lengths, chunk) - for i, chunk in enumerate(chunks) - ] - out: list[int] = [] - for f in futures: - out.extend(f.result()) - return out - async def count_texts_async( self, texts: list[str], loop: asyncio.AbstractEventLoop ) -> list[int]: - """Per-text token counts for a whole batch without blocking the loop.""" + """Per-text token counts for a whole batch without blocking the loop. + + A worker-shard failure propagates and is treated as an incomplete drain. 
+ """ if not texts: return [] - if not self._procs: - return await loop.run_in_executor( - self._thread, self._encode_lengths_inproc, texts - ) - chunks = _even_chunks(texts, len(self._procs)) - futures = [ - asyncio.wrap_future(self._procs[i].submit(_worker_encode_lengths, chunk)) - for i, chunk in enumerate(chunks) - ] - results = await asyncio.gather(*futures) - out: list[int] = [] - for r in results: - out.extend(r) - return out + if self._procs: + chunks = _even_chunks(texts, len(self._procs)) + futures = [ + asyncio.wrap_future(ex.submit(_worker_encode_lengths, chunk)) + for ex, chunk in zip(self._procs, chunks, strict=False) + ] + results = await asyncio.gather(*futures) + return [n for r in results for n in r] + if self._thread is None: + raise RuntimeError("BatchTokenizer is closed") + return await loop.run_in_executor( + self._thread, self._encode_lengths_inproc, texts + ) # -- sync + chat-template paths (in-process thread) --------------------- @@ -338,25 +323,6 @@ def _token_count_message( ] return self._token_count_text("\n".join(parts)) - def token_count(self, text: str) -> int: - """Token count for a single string (blocking).""" - if self._thread is None: - raise RuntimeError("BatchTokenizer is closed") - return self._thread.submit(self._token_count_text, text).result() - - def token_count_message( - self, - content: str, - reasoning: str | None, - tool_calls: tuple[dict[str, Any], ...] | None, - ) -> int: - """Token count for an assistant message via the chat template (blocking).""" - if self._thread is None: - raise RuntimeError("BatchTokenizer is closed") - return self._thread.submit( - self._token_count_message, content, reasoning, tool_calls - ).result() - async def token_count_message_async( self, content: str, @@ -372,7 +338,11 @@ async def token_count_message_async( ) def close(self) -> None: - """Shut down all workers. Idempotent.""" + """Shut down all workers. Idempotent. 
+ + Shard shutdown uses ``wait=False``: a hung worker must not block + aggregator shutdown; idle workers exit on their own once signalled. + """ for ex in self._procs: ex.shutdown(wait=False) self._procs = [] @@ -389,7 +359,7 @@ def __exit__(self, exc_type: object, exc_val: object, exc_tb: object) -> None: # Type alias for the (content, reasoning, tool_calls) tuple a message trigger # enqueues for chat-template tokenization. -MessageParts = tuple[str, str | None, "tuple[dict[str, Any], ...] | None"] +MessageParts = tuple[str, str | None, tuple[dict[str, Any], ...] | None] class TokenCounter(Protocol): @@ -417,18 +387,15 @@ async def token_count_message_async( class TokenBatchQueue: """Buffers per-sample tokenization work and clears it in batches. - Triggers call ``enqueue_text`` / ``enqueue_message`` at event time with a + Triggers call ``enqueue_text`` / ``enqueue_message`` at event time with an ``on_count`` callback that records the resulting metric. The aggregator - drains the buffer with ``flush`` (once per publish tick, so live ISL/OSL/ - TPOT stay current) and with ``flush_remaining`` at end-of-run. Holding the - work until a flush lets the whole buffer go through ``BatchTokenizer`` in - one sharded call, instead of one event-loop task per completion — the latter - is what fell behind and stretched the drain on high-completion-rate runs. + flushes the buffer with ``flush`` once per publish tick (so live ISL/OSL/ + TPOT stay current) and with ``flush_remaining`` at end-of-run, sending the + whole batch through ``BatchTokenizer`` in one sharded call. ``pending`` counts enqueued-but-not-yet-recorded items; it is the - ``n_pending_tasks`` surfaced on the snapshot, and a non-zero value in the - final snapshot means the end-of-run flush did not finish within the drain - budget. + ``n_pending_tasks`` on the snapshot. A non-zero value in the final snapshot + means the end-of-run flush did not finish within the drain budget or failed. 
""" def __init__( @@ -463,8 +430,9 @@ async def flush(self) -> None: Items are detached from the buffer up front so concurrent enqueues land in the next flush. ``_inflight`` is decremented only after a callback - runs, so a cancellation (drain timeout) leaves it reflecting exactly the - items that were not recorded. + runs, so a cancellation (drain timeout) or a tokenizer error leaves it + reflecting exactly the items that were not recorded — those surface as + ``pending`` (an incomplete drain), not as silently dropped samples. """ async with self._lock: if not (self._text or self._msg): @@ -492,8 +460,10 @@ async def flush(self) -> None: async def flush_remaining(self, timeout: float | None) -> int: """End-of-run flush, bounded by ``timeout`` seconds. - Returns the number of items still un-tokenized — non-zero only if the - budget was exhausted (``timeout`` reached). ``None`` waits indefinitely. + Returns the number of items still un-tokenized — non-zero if the budget + was exhausted (``timeout`` reached) or tokenization failed. ``None`` + waits indefinitely. Never raises: a failure here must not stop the + aggregator from publishing the (incomplete) final snapshot. """ if self._inflight == 0: return 0 @@ -508,4 +478,8 @@ async def flush_remaining(self, timeout: float | None) -> int: timeout, self._inflight, ) + except Exception: # noqa: BLE001 — drain must not block finalize. + logger.exception( + "tokenizer drain failed; %d items not counted", self._inflight + ) return self._inflight diff --git a/src/inference_endpoint/config/schema.py b/src/inference_endpoint/config/schema.py index 722652e0d..0a59074f5 100644 --- a/src/inference_endpoint/config/schema.py +++ b/src/inference_endpoint/config/schema.py @@ -578,18 +578,17 @@ class DrainConfig(BaseModel): alias="--metrics-drain-timeout", help=( "Wall-clock budget (seconds) for the metrics aggregator to finish " - "in-flight async tokenize tasks after the run ends before cancelling " - "them. 
Set to 0 to wait indefinitely. Increase for large datasets or " - "long-context workloads where ISL/OSL/TPOT tokenization lags behind " - "request throughput." + "tokenizing buffered samples after the run ends. Set to 0 to wait " + "indefinitely. Increase for very large datasets where the end-of-run " + "tokenize batch is big." ), ), ] = Field( 60.0, ge=0, description=( - "Wall-clock budget (seconds) for the metrics aggregator to drain " - "in-flight tokenize tasks after ENDED (default: 60.0; 0 = unlimited)." + "Wall-clock budget (seconds) to finish tokenizing buffered samples " + "after ENDED (default: 60.0; 0 = unlimited)." ), ) diff --git a/src/inference_endpoint/config/templates/concurrency_template_full.yaml b/src/inference_endpoint/config/templates/concurrency_template_full.yaml index 693765e57..75feab6fb 100644 --- a/src/inference_endpoint/config/templates/concurrency_template_full.yaml +++ b/src/inference_endpoint/config/templates/concurrency_template_full.yaml @@ -79,7 +79,7 @@ settings: warmup_timeout_s: 240.0 # Warmup drain timeout in seconds (None = wait indefinitely) performance_timeout_s: 240.0 # Performance drain timeout in seconds (None = wait indefinitely) accuracy_timeout_s: null # Accuracy drain timeout in seconds (None = wait indefinitely) - metrics_drain_timeout_s: 60.0 # Wall-clock budget (seconds) for the metrics aggregator to drain in-flight tokenize tasks after ENDED (default: 60.0; 0 = unlimited). + metrics_drain_timeout_s: 60.0 # Wall-clock budget (seconds) to finish tokenizing buffered samples after ENDED (default: 60.0; 0 = unlimited). 
warmup: enabled: false # Enable warmup phase before performance run n_requests: null # Warmup request count (None = full dataset once) diff --git a/src/inference_endpoint/config/templates/offline_template_full.yaml b/src/inference_endpoint/config/templates/offline_template_full.yaml index 64439452f..3ff1ccd17 100644 --- a/src/inference_endpoint/config/templates/offline_template_full.yaml +++ b/src/inference_endpoint/config/templates/offline_template_full.yaml @@ -79,7 +79,7 @@ settings: warmup_timeout_s: 240.0 # Warmup drain timeout in seconds (None = wait indefinitely) performance_timeout_s: 240.0 # Performance drain timeout in seconds (None = wait indefinitely) accuracy_timeout_s: null # Accuracy drain timeout in seconds (None = wait indefinitely) - metrics_drain_timeout_s: 60.0 # Wall-clock budget (seconds) for the metrics aggregator to drain in-flight tokenize tasks after ENDED (default: 60.0; 0 = unlimited). + metrics_drain_timeout_s: 60.0 # Wall-clock budget (seconds) to finish tokenizing buffered samples after ENDED (default: 60.0; 0 = unlimited). warmup: enabled: false # Enable warmup phase before performance run n_requests: null # Warmup request count (None = full dataset once) diff --git a/src/inference_endpoint/config/templates/online_template_full.yaml b/src/inference_endpoint/config/templates/online_template_full.yaml index 0c810f30b..1287b99af 100644 --- a/src/inference_endpoint/config/templates/online_template_full.yaml +++ b/src/inference_endpoint/config/templates/online_template_full.yaml @@ -79,7 +79,7 @@ settings: warmup_timeout_s: 240.0 # Warmup drain timeout in seconds (None = wait indefinitely) performance_timeout_s: 240.0 # Performance drain timeout in seconds (None = wait indefinitely) accuracy_timeout_s: null # Accuracy drain timeout in seconds (None = wait indefinitely) - metrics_drain_timeout_s: 60.0 # Wall-clock budget (seconds) for the metrics aggregator to drain in-flight tokenize tasks after ENDED (default: 60.0; 0 = unlimited). 
+ metrics_drain_timeout_s: 60.0 # Wall-clock budget (seconds) to finish tokenizing buffered samples after ENDED (default: 60.0; 0 = unlimited). warmup: enabled: false # Enable warmup phase before performance run n_requests: null # Warmup request count (None = full dataset once) diff --git a/tests/unit/async_utils/services/metrics_aggregator/conftest.py b/tests/unit/async_utils/services/metrics_aggregator/conftest.py index b32811bcf..51d25565a 100644 --- a/tests/unit/async_utils/services/metrics_aggregator/conftest.py +++ b/tests/unit/async_utils/services/metrics_aggregator/conftest.py @@ -60,9 +60,6 @@ class MockBatchTokenizer: def __init__(self, delay: float = 0.0) -> None: self._delay = delay - def token_count(self, text: str) -> int: - return len(text.split()) - async def count_texts_async( self, texts: list[str], _loop: asyncio.AbstractEventLoop ) -> list[int]: diff --git a/tests/unit/async_utils/services/metrics_aggregator/test_aggregator.py b/tests/unit/async_utils/services/metrics_aggregator/test_aggregator.py index 87c5ff96b..3337b168b 100644 --- a/tests/unit/async_utils/services/metrics_aggregator/test_aggregator.py +++ b/tests/unit/async_utils/services/metrics_aggregator/test_aggregator.py @@ -1052,10 +1052,50 @@ async def test_shutdown_flushes_buffered_tokenizations(self, tmp_path): finally: agg.close() - # NOTE(agents): Trigger exception handling (logger.exception paths) is not - # exercised here. A MockBatchTokenizer whose count_texts_async raises would - # let us assert the flush surfaces the error without crashing the - # aggregator and that the buffer is cleared. + @pytest.mark.asyncio + async def test_drain_failure_reports_pending_and_finalizes(self, tmp_path): + """A tokenizer error during the ENDED drain must not skip finalize. + + flush_remaining swallows non-timeout failures and returns the stuck + count, so publish_final still runs with n_pending_tasks > 0 (incomplete + drain) instead of the error escaping process() and hanging main(). 
+ """ + loop = asyncio.get_event_loop() + + class FailingBatchTokenizer: + async def count_texts_async(self, texts, _loop): + raise RuntimeError("tokenizer backend died") + + async def token_count_message_async(self, *args): + raise RuntimeError("tokenizer backend died") + + with ManagedZMQContext.scoped(socket_dir=str(tmp_path)) as ctx: + agg, _, publisher = make_aggregator( + ctx, loop, "agg_drain_failure", tokenizer=FailingBatchTokenizer() + ) + try: + await agg.process( + [ + session_event( + SessionEventType.START_PERFORMANCE_TRACKING, ts=0 + ), + sample_event( + SampleEventType.ISSUED, + "s1", + ts=1000, + data=PromptData(text="some text to tokenize"), + ), + ] + ) + assert agg._token_queue is not None + assert agg._token_queue.pending > 0 + await agg.process([session_event(SessionEventType.ENDED, ts=2000)]) + + publisher.publish_final.assert_awaited_once() + assert publisher.publish_final.await_args.kwargs["n_pending_tasks"] > 0 + publisher.aclose.assert_awaited_once() + finally: + agg.close() @pytest.mark.asyncio async def test_drain_timeout_reports_pending_count(self, tmp_path): diff --git a/tests/unit/async_utils/services/metrics_aggregator/test_main_signal_handler.py b/tests/unit/async_utils/services/metrics_aggregator/test_main_signal_handler.py index 32f159403..3428f6f22 100644 --- a/tests/unit/async_utils/services/metrics_aggregator/test_main_signal_handler.py +++ b/tests/unit/async_utils/services/metrics_aggregator/test_main_signal_handler.py @@ -50,8 +50,7 @@ async def test_sigterm_handler_holds_strong_reference_to_finalize_task(): registry = MagicMock() table = MagicMock() table.total_tracked_duration_ns = 0 - token_queue = MagicMock() - token_queue.pending = 0 + n_pending = 0 # publish_final blocks on an event so we can observe the task # mid-execution and exercise the strong-ref contract. 
@@ -70,7 +69,7 @@ async def _slow_publish(*args, **kwargs): registry=registry, publisher=publisher, table=table, - token_queue=token_queue, + pending_tokens=lambda: n_pending, shutdown_event=shutdown_event, ) @@ -124,8 +123,7 @@ async def test_sigterm_handler_refreshes_tracked_duration(): registry = MagicMock() table = MagicMock() table.total_tracked_duration_ns = 12345 - token_queue = MagicMock() - token_queue.pending = 3 + n_pending = 3 publisher = MagicMock() publisher.publish_final = AsyncMock() @@ -137,7 +135,7 @@ async def test_sigterm_handler_refreshes_tracked_duration(): registry=registry, publisher=publisher, table=table, - token_queue=token_queue, + pending_tokens=lambda: n_pending, shutdown_event=shutdown_event, ) on_sigterm() diff --git a/tests/unit/async_utils/services/metrics_aggregator/test_token_metrics.py b/tests/unit/async_utils/services/metrics_aggregator/test_token_metrics.py index b59e56501..03e47158c 100644 --- a/tests/unit/async_utils/services/metrics_aggregator/test_token_metrics.py +++ b/tests/unit/async_utils/services/metrics_aggregator/test_token_metrics.py @@ -17,13 +17,15 @@ import asyncio import time -from concurrent.futures import ThreadPoolExecutor +from concurrent.futures import Future +from concurrent.futures.process import BrokenProcessPool from unittest.mock import patch import pytest from inference_endpoint.async_utils.services.metrics_aggregator.token_metrics import ( BatchTokenizer, TokenBatchQueue, + _even_chunks, ) _MOCK_TARGET = "inference_endpoint.async_utils.services.metrics_aggregator.token_metrics.AutoTokenizer" @@ -33,8 +35,7 @@ class _FakeTokenizer: """Deterministic tokenizer that splits on whitespace. Has no ``backend_tokenizer``, so BatchTokenizer keeps the batch path - in-process (no subprocess shards) and ``count_texts`` falls back to - ``tokenize`` per text — which is what these tests assert against. + in-process (no subprocess shards) and counts via ``tokenize`` per text. 
""" def __init__(self, load_delay: float = 0.0): @@ -49,31 +50,66 @@ def from_pretrained(cls, name: str, **kwargs: object) -> "_FakeTokenizer": return cls() +class _FakeProc: + """Stands in for a ProcessPoolExecutor shard; whitespace-counts its chunk.""" + + def submit(self, _fn, chunk): + fut: Future = Future() + fut.set_result([len(t.split()) for t in chunk]) + return fut + + def shutdown(self, wait=False): + pass + + +class _BrokenProc: + """A shard whose work resolves to BrokenProcessPool (worker died).""" + + def submit(self, _fn, _chunk): + fut: Future = Future() + fut.set_exception(BrokenProcessPool("worker died")) + return fut + + def shutdown(self, wait=False): + pass + + @pytest.mark.unit class TestBatchTokenizer: - def test_token_count_returns_int(self): + @pytest.mark.asyncio + async def test_count_texts_async(self): with patch(_MOCK_TARGET, _FakeTokenizer): + loop = asyncio.get_running_loop() with BatchTokenizer("fake") as tok: - assert tok.token_count("Hello world") == 2 + counts = await tok.count_texts_async(["Hello world foo", "a"], loop) + assert counts == [3, 1] - def test_count_texts_batch(self): + @pytest.mark.asyncio + async def test_count_texts_async_empty(self): with patch(_MOCK_TARGET, _FakeTokenizer): + loop = asyncio.get_running_loop() with BatchTokenizer("fake") as tok: - assert tok.count_texts(["a b", "c d e", "x"]) == [2, 3, 1] + assert await tok.count_texts_async([], loop) == [] - def test_count_texts_empty(self): + @pytest.mark.asyncio + async def test_count_texts_async_sharded(self): + """With shards present, chunks are reassembled in original order.""" with patch(_MOCK_TARGET, _FakeTokenizer): + loop = asyncio.get_running_loop() with BatchTokenizer("fake") as tok: - assert tok.count_texts([]) == [] + tok._procs = [_FakeProc(), _FakeProc()] + counts = await tok.count_texts_async(["a", "b b", "c c c", "d"], loop) + assert counts == [1, 2, 3, 1] - def test_concurrent_token_count_thread_safe(self): + @pytest.mark.asyncio + async def 
test_count_texts_async_shard_failure_propagates(self): + """A dead shard surfaces as an error, not a silent in-process fallback.""" with patch(_MOCK_TARGET, _FakeTokenizer): + loop = asyncio.get_running_loop() with BatchTokenizer("fake") as tok: - texts = [f"word{i} word{i + 1}" for i in range(20)] - with ThreadPoolExecutor(max_workers=8) as executor: - futures = [executor.submit(tok.token_count, t) for t in texts] - results = [f.result() for f in futures] - assert results == [2] * 20 + tok._procs = [_BrokenProc()] + with pytest.raises(BrokenProcessPool): + await tok.count_texts_async(["a b"], loop) def test_close_is_idempotent(self): with patch(_MOCK_TARGET, _FakeTokenizer): @@ -81,27 +117,14 @@ def test_close_is_idempotent(self): tok.close() tok.close() # must not raise - def test_use_after_close_raises(self): - with patch(_MOCK_TARGET, _FakeTokenizer): - tok = BatchTokenizer("fake") - tok.close() - with pytest.raises(RuntimeError, match="closed"): - tok.token_count("hello") - @pytest.mark.asyncio - async def test_count_texts_async(self): + async def test_use_after_close_raises(self): with patch(_MOCK_TARGET, _FakeTokenizer): loop = asyncio.get_running_loop() - with BatchTokenizer("fake") as tok: - counts = await tok.count_texts_async(["Hello world foo", "a"], loop) - assert counts == [3, 1] - - def test_context_manager(self): - with patch(_MOCK_TARGET, _FakeTokenizer): - with BatchTokenizer("fake") as tok: - assert tok.token_count("a b c") == 3 + tok = BatchTokenizer("fake") + tok.close() with pytest.raises(RuntimeError, match="closed"): - tok.token_count("test") + await tok.count_texts_async(["hello"], loop) class _FakeTokenizerWithTemplate(_FakeTokenizer): @@ -130,16 +153,23 @@ def apply_chat_template( @pytest.mark.unit class TestBatchTokenizerMessageTokenization: - def test_token_count_message_subtracts_baseline(self): - """token_count_message returns full_tokens - baseline.""" + @pytest.mark.asyncio + async def 
test_token_count_message_subtracts_baseline(self): + """token_count_message_async returns full_tokens - baseline.""" with patch(_MOCK_TARGET, _FakeTokenizerWithTemplate): + loop = asyncio.get_running_loop() with BatchTokenizer("fake") as tok: # "hello world" -> 2 content + 2 wrapper = 4; baseline = 0, prefix = 2 - assert tok.token_count_message("hello world", None, None) == 2 + count = await tok.token_count_message_async( + "hello world", None, None, loop + ) + assert count == 2 - def test_token_count_message_includes_tool_calls(self): - """token_count_message includes tool-call JSON tokens.""" + @pytest.mark.asyncio + async def test_token_count_message_includes_tool_calls(self): + """Tool-call JSON tokens are included in the count.""" with patch(_MOCK_TARGET, _FakeTokenizerWithTemplate): + loop = asyncio.get_running_loop() with BatchTokenizer("fake") as tok: tool_calls = ( { @@ -148,11 +178,14 @@ def test_token_count_message_includes_tool_calls(self): "function": {"name": "f", "arguments": "{}"}, }, ) - without = tok.token_count_message("hello", None, None) - with_calls = tok.token_count_message("hello", None, tool_calls) + without = await tok.token_count_message_async("hello", None, None, loop) + with_calls = await tok.token_count_message_async( + "hello", None, tool_calls, loop + ) assert with_calls > without - def test_token_count_message_fallback_on_exception(self): + @pytest.mark.asyncio + async def test_token_count_message_fallback_on_exception(self): """Falls back to whitespace split when apply_chat_template raises.""" class _BadTemplateTokenizer(_FakeTokenizer): @@ -160,6 +193,7 @@ def apply_chat_template(self, *args, **kwargs): raise ValueError("template does not support tool_calls") with patch(_MOCK_TARGET, _BadTemplateTokenizer): + loop = asyncio.get_running_loop() with BatchTokenizer("fake") as tok: tool_calls = ( { @@ -169,17 +203,31 @@ def apply_chat_template(self, *args, **kwargs): }, ) # Must not raise; falls back to whitespace tokenizer. 
- assert tok.token_count_message("hello world", None, tool_calls) > 0 - - @pytest.mark.asyncio - async def test_token_count_message_async(self): - with patch(_MOCK_TARGET, _FakeTokenizerWithTemplate): - loop = asyncio.get_running_loop() - with BatchTokenizer("fake") as tok: count = await tok.token_count_message_async( - "hello world", None, None, loop + "hello world", None, tool_calls, loop ) - assert count == 2 + assert count > 0 + + +@pytest.mark.unit +class TestEvenChunks: + def test_splits_into_near_equal_chunks(self): + assert _even_chunks(["a", "b", "c", "d", "e"], 2) == [ + ["a", "b", "c"], + ["d", "e"], + ] + + def test_single_chunk_when_n_le_one(self): + assert _even_chunks(["a", "b"], 1) == [["a", "b"]] + + def test_single_item_input(self): + assert _even_chunks(["only"], 4) == [["only"]] + + def test_preserves_order_and_bounds_chunk_count(self): + items = [str(i) for i in range(10)] + chunks = _even_chunks(items, 3) + assert [x for c in chunks for x in c] == items + assert len(chunks) <= 3 class _CapturingTokenizer: @@ -247,3 +295,20 @@ async def token_count_message_async(self, *args): n_pending = await queue.flush_remaining(timeout=0.05) assert n_pending == 1 assert recorded == [] + + async def test_flush_remaining_failure_reports_pending(self): + """A tokenizer error leaves items pending and never raises.""" + + class _FailingTokenizer: + async def count_texts_async(self, texts, _loop): + raise RuntimeError("tokenizer boom") + + async def token_count_message_async(self, *args): + raise RuntimeError("tokenizer boom") + + loop = asyncio.get_running_loop() + queue = TokenBatchQueue(_FailingTokenizer(), loop) + recorded: list[int] = [] + queue.enqueue_text("x y", recorded.append) + assert await queue.flush_remaining(timeout=5.0) == 1 + assert recorded == [] From 82a12bca9a80840ca4c1b8b293d03afe9a53de0e Mon Sep 17 00:00:00 2001 From: Viraat Chandra Date: Tue, 9 Jun 2026 18:12:32 -0700 Subject: [PATCH 03/20] fix(metrics): tokenizer stage uses the whole 
machine; restore --tokenizer-workers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Review-council + e2e findings on the batch-tokenization branch. The tokenizer drain runs after the benchmark, so the loadgen/worker affinity partition does not apply to it — but the aggregator subprocess inherited the loadgen's narrow pin (subprocess.Popen propagates the parent mask) and sharding silently never engaged under the default enable_cpu_affinity=true. - cpu_affinity: add expand_to_all_online_cpus() — reset the current process to every online CPU (kernel still clamps to the cgroup/Slurm cpuset). The aggregator calls it before constructing the tokenizer, so shards size to the full machine by default. - Restore the --tokenizer-workers service flag with shard semantics: -1 auto (one process per 8-core block), explicit count clamped to capacity, 0 disables sharding. Every fallback path logs its reason and the success log includes setup time. - flush() phase isolation: a text-batch failure no longer drops the message items (separate failure scopes per executor; first error re-raised after both phases), and a raising recorder callback is logged instead of poisoning the rest of the batch. - Shard workers ignore SIGINT: Ctrl-C goes to the whole process group; the parent drain must control worker lifetime. - Stale "in-flight async tokenize tasks" wording updated in snapshot.py, publisher.py, and AGENTS.md (TokenizePool reference); documented the wait=False shard shutdown. Validated e2e through the real launch path (echo server, default flags, 48-CPU host): aggregator expands 10 -> 48 CPUs, "BatchTokenizer: 6 shards x 8 cores", drain to n_pending_tasks=0, state=complete. 166 unit tests pass; pre-commit clean. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- AGENTS.md | 2 +- .../services/metrics_aggregator/__main__.py | 24 ++++- .../services/metrics_aggregator/publisher.py | 6 +- .../services/metrics_aggregator/snapshot.py | 19 ++-- .../metrics_aggregator/token_metrics.py | 100 +++++++++++++----- .../endpoint_client/cpu_affinity.py | 26 +++++ .../metrics_aggregator/test_publisher.py | 77 ++++++++++++++ .../metrics_aggregator/test_token_metrics.py | 79 ++++++++++++++ .../unit/endpoint_client/test_cpu_affinity.py | 28 +++++ 9 files changed, 323 insertions(+), 38 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index 79bc5ded4..050d9e5b3 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -204,7 +204,7 @@ src/inference_endpoint/ │ │ ├── publisher.py # MetricsPublisher (tick task + atomic disk fallback) │ │ ├── subscriber.py # MetricsSnapshotSubscriber (latest + COMPLETE snapshot capture) │ │ ├── metrics_table.py # In-flight sample rows + trigger dispatch (TTFT/TPOT/ISL/OSL) -│ │ └── token_metrics.py # TokenizePool (HF tokenizer thread pool for ISL/OSL/TPOT) +│ │ └── token_metrics.py # BatchTokenizer (sharded batch tokenization) + TokenBatchQueue (defer-to-flush buffer) for ISL/OSL/TPOT │ └── transport/ # ZMQ-based IPC transport layer │ ├── protocol.py # Transport protocols + TransportConfig + MessageCodec[T] │ └── zmq/ # ZMQ implementation (context, pubsub, transport, ZMQTransportConfig) diff --git a/src/inference_endpoint/async_utils/services/metrics_aggregator/__main__.py b/src/inference_endpoint/async_utils/services/metrics_aggregator/__main__.py index 7d2101c11..fc975246a 100644 --- a/src/inference_endpoint/async_utils/services/metrics_aggregator/__main__.py +++ b/src/inference_endpoint/async_utils/services/metrics_aggregator/__main__.py @@ -26,6 +26,10 @@ from inference_endpoint.async_utils.loop_manager import LoopManager from inference_endpoint.async_utils.transport.zmq.context import ManagedZMQContext from inference_endpoint.async_utils.transport.zmq.ready_check import 
send_ready_signal +from inference_endpoint.endpoint_client.cpu_affinity import ( + UnsupportedPlatformError, + expand_to_all_online_cpus, +) from inference_endpoint.utils.logging import setup_logging from .aggregator import MetricCounterKey, MetricsAggregatorService @@ -159,6 +163,15 @@ async def main() -> None: default=None, help="HuggingFace tokenizer name for ISL/OSL/TPOT (e.g. 'gpt2'). If not set, token metrics are disabled.", ) + parser.add_argument( + "--tokenizer-workers", + type=int, + default=-1, + help=( + "Number of tokenizer shard processes (-1 = auto: one per " + "8-core block of this machine; 0 = in-process tokenization)." + ), + ) parser.add_argument( "--streaming", action="store_true", @@ -201,7 +214,16 @@ async def main() -> None: # (coalesces to 'object' not 'AbstractContextManager[BatchTokenizer | None]') tokenizer_cm: AbstractContextManager[BatchTokenizer | None] if args.tokenizer: - tokenizer_cm = BatchTokenizer(args.tokenizer) + # Tokenization drains after the benchmark run, so the loadgen/worker + # affinity partition does not apply to this stage: drop the narrow + # mask inherited from the pinned parent so shards size to the whole + # machine (cgroup/Slurm CPU limits still apply). + try: + cpus = expand_to_all_online_cpus() + logger.info("metrics aggregator affinity: %d CPUs", len(cpus)) + except UnsupportedPlatformError: + pass # non-Linux: no inherited pin to undo. 
+ tokenizer_cm = BatchTokenizer(args.tokenizer, n_workers=args.tokenizer_workers) else: tokenizer_cm = nullcontext() diff --git a/src/inference_endpoint/async_utils/services/metrics_aggregator/publisher.py b/src/inference_endpoint/async_utils/services/metrics_aggregator/publisher.py index b58aa05ff..ae60942f0 100644 --- a/src/inference_endpoint/async_utils/services/metrics_aggregator/publisher.py +++ b/src/inference_endpoint/async_utils/services/metrics_aggregator/publisher.py @@ -168,9 +168,9 @@ async def publish_final( ) -> None: """Write the final snapshot to disk and signal pub/sub consumers. - ``n_pending_tasks`` is the count of in-flight async tokenize tasks - at finalization time. Drain timeout is detected by Report consumers - as ``state == COMPLETE and n_pending_tasks > 0``. + ``n_pending_tasks`` is the count of buffered tokenizations not yet + recorded at finalization time. An incomplete drain is detected by + Report consumers as ``state == COMPLETE and n_pending_tasks > 0``. ``interrupted=True`` is set by the signal handler in __main__.py when SIGTERM/SIGINT triggers shutdown before ``ENDED`` arrived; diff --git a/src/inference_endpoint/async_utils/services/metrics_aggregator/snapshot.py b/src/inference_endpoint/async_utils/services/metrics_aggregator/snapshot.py index 95c68ab16..eacac94f5 100644 --- a/src/inference_endpoint/async_utils/services/metrics_aggregator/snapshot.py +++ b/src/inference_endpoint/async_utils/services/metrics_aggregator/snapshot.py @@ -44,8 +44,8 @@ class SessionState(str, Enum): state to carry). LIVE → run in progress; tick task publishing live HDR-derived stats. DRAINING → ``SessionEventType.ENDED`` has been received; the aggregator - is awaiting the in-flight async tokenize tasks (bounded by - the ``--drain-timeout`` budget, default 60 s). Tick task + is tokenizing the buffered samples (bounded by the + ``--drain-timeout`` budget, default 60 s). Tick task continues at this stage, still HDR-derived; no new events will arrive. 
COMPLETE → terminal clean state. The ``publish_final()`` snapshot @@ -149,13 +149,14 @@ class MetricsSnapshot( ``INTERRUPTED``) mark the last snapshot of the run; for ``COMPLETE`` snapshots percentiles and histograms are exact, otherwise HDR-derived. - n_pending_tasks: Count of in-flight async tokenize tasks at snapshot - composition time. ``> 0`` during normal load (ISL/ - OSL/TPOT post-processing in flight) and during the - drain phase. **Drain timeout is detected as** - ``state == COMPLETE and n_pending_tasks > 0``: the - aggregator gave up draining; some async-only series - are missing samples that were still being tokenized. + n_pending_tasks: Count of buffered tokenizations not yet recorded at + snapshot composition time. ``> 0`` during normal + load (ISL/OSL/TPOT buffered between publish-tick + flushes) and during the drain phase. **An + incomplete drain is detected as** ``state == + COMPLETE and n_pending_tasks > 0``: the end-of-run + flush timed out or failed; the token-derived series + are missing those samples. metrics: Tagged union of ``CounterStat`` and ``SeriesStat``, ordered counters-first then series, registration order within each. 
diff --git a/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py b/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py index d3a236d2b..8ad0ae10b 100644 --- a/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py +++ b/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py @@ -30,6 +30,8 @@ import logging import multiprocessing import os +import signal +import time from collections.abc import Callable from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor from typing import TYPE_CHECKING, Any, Protocol, cast @@ -94,6 +96,10 @@ def _init_worker(tokenizer_name: str, core_set: list[int]) -> None: Affinity is set before the first encode so the Rust rayon pool sizes itself to the pinned core count (num_cpus respects sched_getaffinity on Linux). """ + # Ctrl-C sends SIGINT to the whole foreground process group; the parent + # drives worker shutdown, so a worker dying mid-drain would break the pool + # and lose the buffered tokenizations it was counting. + signal.signal(signal.SIGINT, signal.SIG_IGN) if core_set: try: os.sched_setaffinity(0, set(core_set)) @@ -151,20 +157,21 @@ def __init__( tokenizer_name: str, *, cores_per_worker: int = CORES_PER_WORKER, + n_workers: int = -1, ) -> None: self._tokenizer_name = tokenizer_name self._fallback_warned: set[str] = set() self._tokenizer: PreTrainedTokenizerBase | None = None self._prefix_len = 0 self._baseline = 0 - # In-process thread for the sync + chat-template paths. + # In-process thread for the chat-template path. self._thread: ThreadPoolExecutor | None = ThreadPoolExecutor( max_workers=1, thread_name_prefix="tok-thread" ) self._load_tokenizer() # also computes the chat-template baseline # Process shards for the batched text path (or empty -> in-process). 
self._procs: list[ProcessPoolExecutor] = [] - self._setup_shards(cores_per_worker) + self._setup_shards(cores_per_worker, n_workers) # -- setup -------------------------------------------------------------- @@ -201,24 +208,47 @@ def _load_tokenizer(self) -> None: self._tokenizer_name, ) - def _setup_shards(self, cores_per_worker: int) -> None: + def _setup_shards(self, cores_per_worker: int, n_workers: int) -> None: """Spawn one pinned single-worker process per core block. - No-op (leaving the batch path in-process) when the tokenizer has no fast - Rust backend, affinity is unavailable, or fewer than two blocks fit — a - single shard is no faster than the in-process backend. + ``n_workers < 0`` (auto) fits as many shards as this process's affinity + mask allows, one per ``cores_per_worker`` block; a positive count is + clamped to that capacity and ``0`` disables sharding. No-op (leaving the + batch path in-process) when sharding is disabled, the tokenizer has no + fast Rust backend, affinity is unavailable, or — in auto mode — fewer + than two blocks fit (a single shard is no faster than the in-process + backend). Each fallback is logged: a missing "shards" INFO line is the + only other signal that the batched path is running single-threaded. 
""" - if cores_per_worker <= 0: + if cores_per_worker <= 0 or n_workers == 0: + logger.info("BatchTokenizer: sharding disabled") return if getattr(self._tokenizer, "backend_tokenizer", None) is None: + logger.info( + "BatchTokenizer: no fast tokenizer backend; using in-process " + "tokenization" + ) return try: available = sorted(os.sched_getaffinity(0)) except (OSError, AttributeError): + logger.info( + "BatchTokenizer: CPU affinity unavailable; using in-process " + "tokenization" + ) return - n = len(available) // cores_per_worker - if n < 2: + capacity = len(available) // cores_per_worker + n = capacity if n_workers < 0 else min(n_workers, capacity) + if n < (2 if n_workers < 0 else 1): + logger.info( + "BatchTokenizer: %d CPUs available (capacity %d blocks of %d); " + "using in-process tokenization", + len(available), + capacity, + cores_per_worker, + ) return + t0 = time.perf_counter() ctx = multiprocessing.get_context("spawn") procs: list[ProcessPoolExecutor] = [] try: @@ -247,7 +277,10 @@ def _setup_shards(self, cores_per_worker: int) -> None: return self._procs = procs logger.info( - "BatchTokenizer: %d shards x %d cores", len(procs), cores_per_worker + "BatchTokenizer: %d shards x %d cores (setup %.1fs)", + len(procs), + cores_per_worker, + time.perf_counter() - t0, ) # -- batched text path -------------------------------------------------- @@ -439,23 +472,42 @@ async def flush(self) -> None: return text_items, self._text = self._text, [] msg_items, self._msg = self._msg, [] + # The text and message phases fail independently — they run on + # separate executors, so a dead text shard must not drop message + # items that would still succeed (and vice versa). The first + # failure is re-raised after both phases so callers still see it. 
+ failure: Exception | None = None if text_items: - counts = await self._tokenizer.count_texts_async( - [t for t, _ in text_items], self._loop - ) - for (_, on_count), count in zip(text_items, counts, strict=True): - try: - on_count(count) - finally: - self._inflight -= 1 + try: + counts = await self._tokenizer.count_texts_async( + [t for t, _ in text_items], self._loop + ) + except Exception as exc: # noqa: BLE001 — isolate phases. + failure = exc + else: + for (_, on_count), count in zip(text_items, counts, strict=True): + self._record(on_count, count) for (content, reasoning, tool_calls), on_count in msg_items: - count = await self._tokenizer.token_count_message_async( - content, reasoning, tool_calls, self._loop - ) try: - on_count(count) - finally: - self._inflight -= 1 + count = await self._tokenizer.token_count_message_async( + content, reasoning, tool_calls, self._loop + ) + except Exception as exc: # noqa: BLE001 — isolate items. + failure = failure or exc + continue + self._record(on_count, count) + if failure is not None: + raise failure + + def _record(self, on_count: Callable[[int], None], count: int) -> None: + """Run one recorder callback; a raising recorder must not poison the + rest of the batch, and the item still counts as recorded.""" + try: + on_count(count) + except Exception: # noqa: BLE001 — per-item isolation. + logger.exception("token metric recorder failed") + finally: + self._inflight -= 1 async def flush_remaining(self, timeout: float | None) -> int: """End-of-run flush, bounded by ``timeout`` seconds. 
diff --git a/src/inference_endpoint/endpoint_client/cpu_affinity.py b/src/inference_endpoint/endpoint_client/cpu_affinity.py index 8972a59d9..0de6e39a4 100644 --- a/src/inference_endpoint/endpoint_client/cpu_affinity.py +++ b/src/inference_endpoint/endpoint_client/cpu_affinity.py @@ -317,6 +317,32 @@ def pin_loadgen( return None +@require_linux +def expand_to_all_online_cpus() -> set[int]: + """Reset the current process's affinity to every online CPU. + + Undoes a narrow mask inherited from a pinned parent (subprocesses spawned + after ``pin_loadgen`` inherit the loadgen mask). The kernel intersects the + request with the cgroup cpuset, so container/Slurm CPU limits still apply. + + Returns: + The effective CPU set after the reset. + + Raises: + UnsupportedPlatformError: If not running on Linux. + """ + online = _read_sysfs_cpulist(_SYSFS_CPU / "online") or set() + if online: + try: + os.sched_setaffinity(0, online) + except OSError as e: + logger.warning(f"Could not expand CPU affinity: {e}") + try: + return os.sched_getaffinity(0) + except OSError: + return online + + @require_linux def set_cpu_affinity(pid: int, cpus: set[int]) -> bool: """Set CPU affinity for a process. 
diff --git a/tests/unit/async_utils/services/metrics_aggregator/test_publisher.py b/tests/unit/async_utils/services/metrics_aggregator/test_publisher.py index 9e26f734a..15ad4d95c 100644 --- a/tests/unit/async_utils/services/metrics_aggregator/test_publisher.py +++ b/tests/unit/async_utils/services/metrics_aggregator/test_publisher.py @@ -81,6 +81,83 @@ def get_runtime_state() -> tuple[SessionState, int]: finally: publisher.close() + @pytest.mark.asyncio + async def test_pre_publish_runs_before_each_tick_snapshot( + self, tmp_path: Path, zmq_ctx_scope: ManagedZMQContext + ): + """pre_publish is awaited before the runtime state is captured.""" + loop = asyncio.get_event_loop() + publisher = MetricsPublisher( + MetricsSnapshotCodec(), + zmq_ctx_scope, + "test_pub_pre", + loop, + final_snapshot_path=tmp_path / "final_snapshot.json", + ) + try: + registry = MetricsRegistry() + registry.register_counter("c") + order: list[str] = [] + + async def pre_publish() -> None: + order.append("flush") + + def get_runtime_state() -> tuple[SessionState, int]: + order.append("state") + return SessionState.LIVE, 0 + + publisher.start( + registry, + publish_interval_s=0.01, + get_runtime_state=get_runtime_state, + pre_publish=pre_publish, + ) + await asyncio.sleep(0.05) + assert order, "no tick ran" + # Every state capture is preceded by a flush in the same tick. 
+ assert order[0] == "flush" + for i, entry in enumerate(order): + if entry == "state": + assert order[i - 1] == "flush" + finally: + publisher.close() + + @pytest.mark.asyncio + async def test_pre_publish_failure_keeps_ticking( + self, tmp_path: Path, zmq_ctx_scope: ManagedZMQContext + ): + """A raising pre_publish is swallowed by the tick; ticks continue.""" + loop = asyncio.get_event_loop() + publisher = MetricsPublisher( + MetricsSnapshotCodec(), + zmq_ctx_scope, + "test_pub_pre_fail", + loop, + final_snapshot_path=tmp_path / "final_snapshot.json", + ) + try: + registry = MetricsRegistry() + registry.register_counter("c") + attempts = 0 + + async def pre_publish() -> None: + nonlocal attempts + attempts += 1 + raise RuntimeError("tokenizer hiccup") + + publisher.start( + registry, + publish_interval_s=0.01, + get_runtime_state=lambda: (SessionState.LIVE, 0), + pre_publish=pre_publish, + ) + await asyncio.sleep(0.08) + assert attempts >= 2, "tick task died after a pre_publish failure" + assert publisher._tick_task is not None + assert not publisher._tick_task.done() + finally: + publisher.close() + @pytest.mark.asyncio async def test_publish_final_writes_json_atomically( self, tmp_path: Path, zmq_ctx_scope: ManagedZMQContext diff --git a/tests/unit/async_utils/services/metrics_aggregator/test_token_metrics.py b/tests/unit/async_utils/services/metrics_aggregator/test_token_metrics.py index 03e47158c..82609f275 100644 --- a/tests/unit/async_utils/services/metrics_aggregator/test_token_metrics.py +++ b/tests/unit/async_utils/services/metrics_aggregator/test_token_metrics.py @@ -22,10 +22,15 @@ from unittest.mock import patch import pytest +from inference_endpoint.async_utils.services.metrics_aggregator import ( + token_metrics as token_metrics_module, +) from inference_endpoint.async_utils.services.metrics_aggregator.token_metrics import ( BatchTokenizer, TokenBatchQueue, + _encode_batch_lengths, _even_chunks, + _worker_encode_lengths, ) _MOCK_TARGET = 
"inference_endpoint.async_utils.services.metrics_aggregator.token_metrics.AutoTokenizer" @@ -209,6 +214,43 @@ def apply_chat_template(self, *args, **kwargs): assert count > 0 +class _Encoding: + def __init__(self, n: int): + self.ids = list(range(n)) + + +class _FastBackend: + """Raw-tokenizers backend stub with the fast batch entry point.""" + + def encode_batch_fast(self, texts, add_special_tokens=False): + return [_Encoding(len(t.split())) for t in texts] + + +class _SlowBackend: + """Raw-tokenizers backend stub without encode_batch_fast.""" + + def encode_batch(self, texts, add_special_tokens=False): + return [_Encoding(len(t.split())) for t in texts] + + +@pytest.mark.unit +class TestEncodeHelpers: + def test_encode_batch_lengths_prefers_fast(self): + assert _encode_batch_lengths(_FastBackend(), ["a b", "c"]) == [2, 1] + + def test_encode_batch_lengths_falls_back_to_encode_batch(self): + assert _encode_batch_lengths(_SlowBackend(), ["a b c", "d"]) == [3, 1] + + def test_worker_encode_lengths_raises_without_backend(self, monkeypatch): + monkeypatch.setattr(token_metrics_module, "_WORKER_BACKEND", None) + with pytest.raises(RuntimeError, match="backend unavailable"): + _worker_encode_lengths(["a"]) + + def test_worker_encode_lengths_uses_backend(self, monkeypatch): + monkeypatch.setattr(token_metrics_module, "_WORKER_BACKEND", _FastBackend()) + assert _worker_encode_lengths(["a b", "c d e"]) == [2, 3] + + @pytest.mark.unit class TestEvenChunks: def test_splits_into_near_equal_chunks(self): @@ -312,3 +354,40 @@ async def token_count_message_async(self, *args): queue.enqueue_text("x y", recorded.append) assert await queue.flush_remaining(timeout=5.0) == 1 assert recorded == [] + + async def test_flush_text_failure_does_not_drop_message_items(self): + """The message phase runs (and records) even when the text batch fails.""" + + class _TextFailingTokenizer: + async def count_texts_async(self, texts, _loop): + raise RuntimeError("text shard died") + + async def 
token_count_message_async( + self, content, reasoning, tool_calls, _loop + ): + return len(content.split()) + + loop = asyncio.get_running_loop() + queue = TokenBatchQueue(_TextFailingTokenizer(), loop) + recorded: list[int] = [] + queue.enqueue_text("never counted", recorded.append) + queue.enqueue_message(("hello world", None, None), recorded.append) + with pytest.raises(RuntimeError, match="text shard died"): + await queue.flush() + assert recorded == [2], "message item must survive the text failure" + assert queue.pending == 1, "only the text item remains pending" + + async def test_flush_recorder_failure_does_not_poison_batch(self): + """One raising on_count is logged; the rest of the batch still records.""" + loop = asyncio.get_running_loop() + queue = TokenBatchQueue(_CapturingTokenizer(), loop) + recorded: list[int] = [] + + def bad_recorder(count: int) -> None: + raise ValueError("recorder bug") + + queue.enqueue_text("a b", bad_recorder) + queue.enqueue_text("c d e", recorded.append) + await queue.flush() + assert recorded == [3] + assert queue.pending == 0, "a raising recorder still counts as recorded" diff --git a/tests/unit/endpoint_client/test_cpu_affinity.py b/tests/unit/endpoint_client/test_cpu_affinity.py index 52ef724e2..7d100be9d 100644 --- a/tests/unit/endpoint_client/test_cpu_affinity.py +++ b/tests/unit/endpoint_client/test_cpu_affinity.py @@ -6,6 +6,7 @@ from inference_endpoint.endpoint_client.cpu_affinity import ( AffinityPlan, compute_affinity_plan, + expand_to_all_online_cpus, get_all_online_cpus, pin_loadgen, set_cpu_affinity, @@ -146,3 +147,30 @@ def test_all_methods_fail_returns_empty( """Test that empty set is returned when all methods fail.""" cpus = get_all_online_cpus() assert cpus == set() + + +class TestExpandToAllOnlineCpus: + @patch("os.sched_getaffinity") + @patch("os.sched_setaffinity") + @patch("pathlib.Path.read_text") + def test_expands_inherited_mask_to_online(self, mock_read, mock_set, mock_get): + """The full sysfs 
online set is requested; the effective mask is returned.""" + mock_read.return_value = "0-7\n" + mock_get.return_value = {0, 1, 2, 3, 4, 5, 6, 7} + + cpus = expand_to_all_online_cpus() + + mock_set.assert_called_once_with(0, {0, 1, 2, 3, 4, 5, 6, 7}) + assert cpus == {0, 1, 2, 3, 4, 5, 6, 7} + + @patch("os.sched_getaffinity") + @patch("os.sched_setaffinity", side_effect=OSError("cpuset denies")) + @patch("pathlib.Path.read_text") + def test_setaffinity_failure_returns_current_mask( + self, mock_read, mock_set, mock_get + ): + """A denied expansion is non-fatal: the current mask is reported.""" + mock_read.return_value = "0-7\n" + mock_get.return_value = {0, 1} + + assert expand_to_all_online_cpus() == {0, 1} From 8033c4726fd14d1e86099b8e94160f7e19e06a67 Mon Sep 17 00:00:00 2001 From: Viraat Chandra Date: Tue, 9 Jun 2026 23:41:43 -0700 Subject: [PATCH 04/20] chore(metrics): use pass bodies in TokenCounter protocol stubs The ellipsis bodies trip the code-quality bot's "statement has no effect" check on every push; pass is semantically identical for Protocol method declarations and keeps the report clean. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../services/metrics_aggregator/token_metrics.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py b/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py index 8ad0ae10b..4e49aedc8 100644 --- a/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py +++ b/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py @@ -405,7 +405,8 @@ class TokenCounter(Protocol): async def count_texts_async( self, texts: list[str], loop: asyncio.AbstractEventLoop, / - ) -> list[int]: ... + ) -> list[int]: + pass async def token_count_message_async( self, @@ -414,7 +415,8 @@ async def token_count_message_async( tool_calls: tuple[dict[str, Any], ...] 
| None, loop: asyncio.AbstractEventLoop, /, - ) -> int: ... + ) -> int: + pass class TokenBatchQueue: From 47c4f35ac669ae58f1cce7d782863e64c1fd7541 Mon Sep 17 00:00:00 2001 From: Viraat Chandra Date: Wed, 10 Jun 2026 01:50:03 -0700 Subject: [PATCH 05/20] docs(metrics): add metrics-aggregator design doc; refresh services overview New docs/async_utils/services/metrics_aggregator/DESIGN.md (mirroring the event_logger convention) covering the service lifecycle and the token metrics pipeline: defer-to-flush batching, process-sharded batch encoding, the post-run affinity expansion, failure isolation, and the n_pending_tasks contract. The services overview 6.2 entry now reflects the batched tokenizer, the snapshot outputs, and the current CLI flags, and links the new doc. Co-Authored-By: Claude Opus 4.8 (1M context) --- docs/async_utils/services/DESIGN.md | 6 +- .../services/metrics_aggregator/DESIGN.md | 171 ++++++++++++++++++ 2 files changed, 174 insertions(+), 3 deletions(-) create mode 100644 docs/async_utils/services/metrics_aggregator/DESIGN.md diff --git a/docs/async_utils/services/DESIGN.md b/docs/async_utils/services/DESIGN.md index a26f13783..e12eb8a4d 100644 --- a/docs/async_utils/services/DESIGN.md +++ b/docs/async_utils/services/DESIGN.md @@ -306,9 +306,9 @@ stateDiagram-v2 ### 6.2 Metrics aggregator -- **Role**: Subscribes to EventRecords and derives real-time metrics (e.g. TTFT, sample latency, token counts). May use a tokenizer pool for token-based metrics. Shuts down on **session.ended**. -- **Outputs**: Planned is to push real time metrics to Prometheus via PushGateway. Currently, logging / writing final report to JSON is sufficient legacy behavior. -- **Process**: Run as a **subprocess**; given `--metrics-dir`, `--socket-dir`, `--socket-name`, and optional tokenizer options. Uses a dedicated event loop and `ManagedZMQContext.scoped(socket_dir=...)` so it can connect to the publisher's IPC address. 
+- **Role**: Subscribes to EventRecords and derives real-time metrics (e.g. TTFT, sample latency, token counts). Token metrics (ISL/OSL/TPOT) are computed by a batched, process-sharded tokenizer — see [metrics_aggregator/DESIGN.md](metrics_aggregator/DESIGN.md). Shuts down on **session.ended**. +- **Outputs**: Live `MetricsSnapshot` frames over an IPC PUB socket, and an atomically written `final_snapshot.json` (the primary Report source). Pushing real-time metrics to Prometheus via PushGateway is planned. +- **Process**: Run as a **subprocess**; given `--metrics-output-dir`, `--socket-dir`, `--socket-name`, `--metrics-socket`, and optional tokenizer options. Uses a dedicated event loop and `ManagedZMQContext.scoped(socket_dir=...)` so it can connect to the publisher's IPC address. --- diff --git a/docs/async_utils/services/metrics_aggregator/DESIGN.md b/docs/async_utils/services/metrics_aggregator/DESIGN.md new file mode 100644 index 000000000..4f094097a --- /dev/null +++ b/docs/async_utils/services/metrics_aggregator/DESIGN.md @@ -0,0 +1,171 @@ +# Metrics Aggregator Service — Design Document + +## Overview + +The metrics aggregator is a **subprocess** (`python -m +inference_endpoint.async_utils.services.metrics_aggregator`) that subscribes to +the EventRecord pub/sub stream, folds per-sample events into a +`MetricsRegistry` (counters + HDR-histogram series + raw values), and publishes +`MetricsSnapshot` frames over an IPC PUB socket at a fixed cadence. At +end-of-run it atomically writes `final_snapshot.json`, which is the **primary** +source for `Report`; the terminal pub/sub frame is only a TUI "run finished" +signal. + +This document covers the service's lifecycle and, in depth, the **token +metrics pipeline** — how ISL/OSL/TPOT tokenization keeps pace with +high-completion-rate runs. 
+ +## Module Layout + +| File | Purpose | +| ------------------ | ------------------------------------------------------------------------- | +| `__main__.py` | Subprocess entry: argparse, affinity expansion, lifecycle wiring, SIGTERM | +| `aggregator.py` | `MetricsAggregatorService` — event router, session state, drain | +| `registry.py` | `MetricsRegistry`, `CounterSampler`, `SeriesSampler` | +| `snapshot.py` | `MetricsSnapshot` wire schema, `SessionState`, msgpack codec | +| `publisher.py` | `MetricsPublisher` — tick task + atomic final-snapshot write | +| `subscriber.py` | `MetricsSnapshotSubscriber` — main-process consumer | +| `metrics_table.py` | In-flight sample rows + trigger dispatch (TTFT/TPOT/ISL/OSL) | +| `token_metrics.py` | `BatchTokenizer` (sharded batch tokenization) + `TokenBatchQueue` | + +## Lifecycle + +``` +INITIALIZE ──STARTED──► LIVE ──ENDED──► DRAINING ──► COMPLETE + └──► INTERRUPTED (SIGTERM/SIGINT) +``` + +- **LIVE**: the publisher tick task emits a snapshot every + `--publish-interval` seconds (default 0.25 s). +- **DRAINING**: entered on `ENDED`; the buffered tokenizations are flushed, + bounded by the `--drain-timeout` budget (default 60 s; `0` = unlimited). +- The ENDED path runs inside a finalization boundary: whatever the drain does + — finish, time out, or fail — `publish_final` and the shutdown signal always + run. A tokenizer failure can degrade the snapshot (see the + `n_pending_tasks` contract below) but can never hang the subprocess. +- **INTERRUPTED**: a signal handler writes a best-effort partial final + snapshot so `Report` can distinguish a killed run from a clean one. + +## Token Metrics Pipeline + +ISL, OSL, and TPOT all require running the HF tokenizer over prompt or +completion text. 
With streaming on, each completed sample needs up to three +tokenizer passes, so at high completion rates tokenization is the service's +dominant CPU cost — and a per-event dispatch model cannot keep up: work +arriving faster than it drains accumulates an unbounded backlog that must be +paid at end-of-run. The pipeline is therefore built around two ideas: +**defer-to-flush batching** and **process-sharded batch encoding**. + +### Defer-to-flush (`TokenBatchQueue`) + +Token triggers do no work at event time. `fire()` appends +`(text, on_count)` — or `(message_parts, on_count)` for chat-template items — +to a buffer, an O(1) operation with no event-loop tasks. The buffer is cleared +in batches at exactly two points: + +1. **Every publish tick** — the publisher awaits a `pre_publish` hook before + composing each snapshot, so live ISL/OSL/TPOT reflect recently completed + samples. A failure here is swallowed by the tick (live publishing never + stops). +2. **End-of-run** — `flush_remaining(timeout)` drains everything still + buffered, bounded by the drain budget. + +`flush()` serializes under an asyncio lock and detaches the buffer up front, +so enqueues that race a flush land in the next one. Failure isolation is +layered: the plain-text phase and the chat-template phase fail independently +(they run on separate executors, so a dead text shard must not drop message +items), a raising recorder callback is logged without aborting the rest of +the batch, and the first error is re-raised only after both phases ran. +`flush_remaining` never raises — a timeout or tokenizer failure becomes a +logged, non-zero pending count. + +### Sharded batch encoding (`BatchTokenizer`) + +A flush hands the whole buffer to `count_texts_async`, which splits it into +contiguous chunks and fans them out across worker **processes**, one pinned to +each block of `CORES_PER_WORKER` (8) cores. 
Why this shape: + +- Each worker runs the raw `tokenizers` backend's `encode_batch_fast` — Rust, + rayon-parallel, no Python-per-text cost. Batching amortizes the + submit/result overhead over thousands of texts. +- A single BPE rayon pool is memory-bound and saturates at ~8 cores; more + threads oversubscribe and, on multi-socket parts, cross the NUMA boundary. + Sharding across processes pinned to disjoint 8-core blocks (affinity set + **before** the backend loads, so each rayon pool sizes itself to its block + and stays NUMA-local) is how the whole machine is used. +- Workers are spawn-context processes with module-level entry points (pickled + by name), warmed in parallel at construction so N tokenizer loads do not + serialize, and they ignore SIGINT — Ctrl-C goes to the whole process group, + and worker lifetime must stay under the parent drain's control. + +`--tokenizer-workers` controls the shard count: `-1` (default) auto-fits one +shard per 8-core block of the process affinity mask, an explicit count is +clamped to that capacity, and `0` disables sharding. Every fallback to the +in-process path (no fast Rust backend, affinity unavailable, fewer than two +blocks) is logged with its reason — a missing "shards" INFO line should never +be the only signal that the batch path is running single-threaded. + +Chat-template items (tool-call outputs) take a separate in-process thread: +they are rare relative to the batched flush, and `apply_chat_template` is +Python/Jinja — sharding buys nothing. A template baseline (the empty +assistant-message frame) is computed once and subtracted so only the payload +is counted. + +### CPU affinity: the tokenizer stage is post-run + +The benchmark parent pins itself to the loadgen cores before launching +services, and subprocesses inherit that narrow mask. 
The tokenizer's heavy +work happens **after** the run (the end-of-run flush), so the run-time core +partition does not apply to it: at startup the service calls +`expand_to_all_online_cpus()` (see `endpoint_client/cpu_affinity.py`) to reset +its mask to every online CPU — the kernel still clamps to the cgroup/Slurm +cpuset — and shards size to the full machine. Mid-run tick flushes are small +batches; the drain is where the core count pays. + +### The `n_pending_tasks` contract + +`TokenBatchQueue.pending` counts enqueued-but-not-yet-recorded items and is +surfaced on every snapshot as `n_pending_tasks`. In the **final** snapshot: + +- `state == complete && n_pending_tasks == 0` — clean run, token series exact. +- `state == complete && n_pending_tasks > 0` — **incomplete drain**: the + end-of-run flush ran out of budget or the tokenizer failed; token-derived + series are missing exactly that many samples. `Report` renders a warning. + +Items dropped by a failed flush are intentionally _not_ removed from the +pending count — under-reporting an incomplete drain would silently rebadge it +as a clean run. 
+ +### Data flow + +``` +COMPLETE event ─► TokenTrigger.fire ─► queue.enqueue(text, on_count) [O(1)] + │ + publish tick (0.25 s) ──────────────┤ flush() + ENDED drain (budgeted) ─────────────┘ │ + ├─► chunks ─► N pinned worker procs + │ (encode_batch_fast) + └─► on_count(n) ─► registry.record() +``` + +## CLI Interface + +| Flag | Default | Purpose | +| -------------------------------- | -------- | --------------------------------------------------- | +| `--socket-dir` / `--socket-name` | required | EventRecord SUB socket | +| `--metrics-socket` | required | Snapshot PUB socket name | +| `--metrics-output-dir` | required | Directory for `final_snapshot.json` | +| `--publish-interval` | 0.25 | Live snapshot cadence (seconds) | +| `--drain-timeout` | 60.0 | End-of-run tokenize budget (`0` = unlimited) | +| `--tokenizer` | none | HF name or local path; unset disables token metrics | +| `--tokenizer-workers` | -1 | Shard processes (`-1` auto, `0` in-process) | +| `--streaming` | off | Register TTFT/chunk-delta/TPOT triggers | + +## References + +- [docs/async_utils/services/DESIGN.md](../DESIGN.md) — the EventRecord + pub/sub system this service subscribes to. +- [docs/PERF_ARCHITECTURE.md](../../../PERF_ARCHITECTURE.md) — CPU pinning + strategy for the loadgen/worker hot path. +- AGENTS.md "Metrics Aggregator subprocess" — the condensed contract summary + for AI agents. From 1315a737916ea69ed45ad201ed1970002c6b596b Mon Sep 17 00:00:00 2001 From: Viraat Chandra Date: Wed, 10 Jun 2026 14:41:19 -0700 Subject: [PATCH 06/20] fix(metrics): publish live snapshots through tokenizer failures; bound shard warmup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Review-council findings (handled locally): - A persistently failing pre_publish flush aborted every tick before the snapshot was built, silently stopping ALL live metrics publishing — not just token series. 
The flush now fails in its own handler (logged once) and the tick always proceeds to build and publish; unflushed items stay visible as n_pending_tasks. Regression-tested: a failing flush must not suppress state capture/publish. - Shard warmup waits are bounded (_SHARD_WARMUP_TIMEOUT_S): a hung tokenizer load (e.g. stuck network filesystem) now degrades to the in-process path instead of wedging service startup forever. - close() and warmup cleanup terminate shard workers (cancel_futures + SIGTERM) so an in-flight encode cannot stall interpreter exit after a drain timeout. - TokenCounter protocol stubs use docstring + raise NotImplementedError (the one body shape CodeQL, mypy, and Pyright all accept). - New TestSetupShardsDecisions pins the --tokenizer-workers contract (auto/clamp/disable thresholds, block pinning, affinity and warmup failure fallbacks) — previously zero coverage of the decision logic. 162 aggregator unit tests pass; pre-commit clean. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../services/metrics_aggregator/DESIGN.md | 6 +- .../services/metrics_aggregator/publisher.py | 25 ++++- .../metrics_aggregator/token_metrics.py | 43 +++++++-- .../metrics_aggregator/test_publisher.py | 11 ++- .../metrics_aggregator/test_token_metrics.py | 93 ++++++++++++++++++- 5 files changed, 161 insertions(+), 17 deletions(-) diff --git a/docs/async_utils/services/metrics_aggregator/DESIGN.md b/docs/async_utils/services/metrics_aggregator/DESIGN.md index 4f094097a..30fe533d6 100644 --- a/docs/async_utils/services/metrics_aggregator/DESIGN.md +++ b/docs/async_utils/services/metrics_aggregator/DESIGN.md @@ -95,8 +95,10 @@ each block of `CORES_PER_WORKER` (8) cores. Why this shape: and stays NUMA-local) is how the whole machine is used. 
- Workers are spawn-context processes with module-level entry points (pickled by name), warmed in parallel at construction so N tokenizer loads do not - serialize, and they ignore SIGINT — Ctrl-C goes to the whole process group, - and worker lifetime must stay under the parent drain's control. + serialize (the warmup wait is bounded — a hung load degrades to the + in-process path instead of wedging startup), and they ignore SIGINT — + Ctrl-C goes to the whole process group, and worker lifetime must stay under + the parent drain's control. `--tokenizer-workers` controls the shard count: `-1` (default) auto-fits one shard per 8-core block of the process affinity mask, an explicit count is diff --git a/src/inference_endpoint/async_utils/services/metrics_aggregator/publisher.py b/src/inference_endpoint/async_utils/services/metrics_aggregator/publisher.py index ae60942f0..fedc0fbe1 100644 --- a/src/inference_endpoint/async_utils/services/metrics_aggregator/publisher.py +++ b/src/inference_endpoint/async_utils/services/metrics_aggregator/publisher.py @@ -116,8 +116,10 @@ def start( ``pre_publish``, if given, is awaited at the top of each tick before the snapshot is built — the aggregator uses it to flush buffered tokenizations so live ISL/OSL/TPOT reflect recently completed samples. - Its failures are swallowed by the tick's own try/except (the tick keeps - going), so a transient tokenizer hiccup never stops live publishing. + Its failures are swallowed in their own handler so the snapshot is + still built and published — even a tokenizer that fails on every tick + cannot stop live publishing; the unflushed items remain visible as + ``n_pending_tasks``. Idempotent on the tick-task slot: a second call (e.g. 
from a spurious duplicate ``STARTED`` event or a buggy replay producer) @@ -137,11 +139,28 @@ def start( ) async def _tick() -> None: + flush_failure_logged = False while True: try: await asyncio.sleep(publish_interval_s) if pre_publish is not None: - await pre_publish() + # Isolated from the publish path: a persistently + # broken tokenizer would otherwise abort every tick + # here and stop ALL live snapshots, not just token + # series. Unflushed items stay visible to consumers + # via n_pending_tasks. + try: + await pre_publish() + except Exception: # noqa: BLE001 — publish anyway. + if not flush_failure_logged: + flush_failure_logged = True + logger.exception( + "pre_publish flush failed; live snapshots " + "continue without fresh token metrics " + "(further failures logged at debug)" + ) + else: + logger.debug("pre_publish flush failed again") state, n_pending = get_runtime_state() snap = registry.build_snapshot( state=state, n_pending_tasks=n_pending diff --git a/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py b/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py index 4e49aedc8..5e57a197d 100644 --- a/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py +++ b/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py @@ -46,6 +46,11 @@ # used. Measured on GB200: ~16k texts/s at 18 blocks vs ~1.5k single-process. CORES_PER_WORKER = 8 +# Budget for the parallel shard warmup (spawn + transformers import + +# tokenizer load per worker). A hung load (e.g. a stuck network filesystem) +# must degrade to the in-process path, not wedge service startup. +_SHARD_WARMUP_TIMEOUT_S = 120.0 + # Minimal user message used to satisfy chat templates that reject assistant-only # message lists. Its token count is subtracted so only the assistant payload is # measured. 
@@ -132,6 +137,23 @@ def _worker_ready(_: int) -> bool: return _WORKER_BACKEND is not None +def _terminate_procs(procs: list[ProcessPoolExecutor]) -> None: + """Best-effort immediate stop: cancel queued work and SIGTERM workers. + + ``shutdown(wait=False)`` alone leaves an in-flight encode running, and the + non-daemon worker would still be joined at interpreter exit — so a drain + timeout could stall process shutdown until the chunk finished. + """ + for ex in procs: + ex.shutdown(wait=False, cancel_futures=True) + workers = getattr(ex, "_processes", None) or {} # CPython impl detail. + for p in workers.values(): + try: + p.terminate() + except Exception: # noqa: BLE001 — already-dead workers are fine. + pass + + if TYPE_CHECKING: from transformers import PreTrainedTokenizerBase @@ -265,12 +287,13 @@ def _setup_shards(self, cores_per_worker: int, n_workers: int) -> None: # Submit to every shard first so the loads run in parallel, then # await — waiting on each before submitting the next would # serialize P tokenizer loads and can exceed the launch budget. + # The wait is bounded: one hung load must not wedge startup. ready = [ex.submit(_worker_ready, 0) for ex in procs] + deadline = time.monotonic() + _SHARD_WARMUP_TIMEOUT_S for f in ready: - f.result() + f.result(timeout=max(0.0, deadline - time.monotonic())) except Exception: - for ex in procs: - ex.shutdown(wait=False) + _terminate_procs(procs) logger.exception( "tokenizer shard setup failed; using in-process tokenization" ) @@ -373,11 +396,11 @@ async def token_count_message_async( def close(self) -> None: """Shut down all workers. Idempotent. - Shard shutdown uses ``wait=False``: a hung worker must not block - aggregator shutdown; idle workers exit on their own once signalled. + Shards are stopped without waiting (a hung worker must not block + aggregator shutdown) and terminated so an in-flight encode cannot + stall interpreter exit after a drain timeout. 
""" - for ex in self._procs: - ex.shutdown(wait=False) + _terminate_procs(self._procs) self._procs = [] if self._thread is not None: self._thread.shutdown(wait=True) @@ -406,7 +429,8 @@ class TokenCounter(Protocol): async def count_texts_async( self, texts: list[str], loop: asyncio.AbstractEventLoop, / ) -> list[int]: - pass + """Per-text token counts for a whole batch.""" + raise NotImplementedError async def token_count_message_async( self, @@ -416,7 +440,8 @@ async def token_count_message_async( loop: asyncio.AbstractEventLoop, /, ) -> int: - pass + """Chat-template token count for one assistant message.""" + raise NotImplementedError class TokenBatchQueue: diff --git a/tests/unit/async_utils/services/metrics_aggregator/test_publisher.py b/tests/unit/async_utils/services/metrics_aggregator/test_publisher.py index 15ad4d95c..1d540ddec 100644 --- a/tests/unit/async_utils/services/metrics_aggregator/test_publisher.py +++ b/tests/unit/async_utils/services/metrics_aggregator/test_publisher.py @@ -139,20 +139,29 @@ async def test_pre_publish_failure_keeps_ticking( registry = MetricsRegistry() registry.register_counter("c") attempts = 0 + published_states = 0 async def pre_publish() -> None: nonlocal attempts attempts += 1 raise RuntimeError("tokenizer hiccup") + def get_runtime_state() -> tuple[SessionState, int]: + nonlocal published_states + published_states += 1 + return SessionState.LIVE, 0 + publisher.start( registry, publish_interval_s=0.01, - get_runtime_state=lambda: (SessionState.LIVE, 0), + get_runtime_state=get_runtime_state, pre_publish=pre_publish, ) await asyncio.sleep(0.08) assert attempts >= 2, "tick task died after a pre_publish failure" + # The failure must not suppress the snapshot: every failing tick + # still proceeds to capture state and publish. 
+ assert published_states >= 2, "failing pre_publish suppressed publishing" assert publisher._tick_task is not None assert not publisher._tick_task.done() finally: diff --git a/tests/unit/async_utils/services/metrics_aggregator/test_token_metrics.py b/tests/unit/async_utils/services/metrics_aggregator/test_token_metrics.py index 82609f275..14805b011 100644 --- a/tests/unit/async_utils/services/metrics_aggregator/test_token_metrics.py +++ b/tests/unit/async_utils/services/metrics_aggregator/test_token_metrics.py @@ -63,7 +63,7 @@ def submit(self, _fn, chunk): fut.set_result([len(t.split()) for t in chunk]) return fut - def shutdown(self, wait=False): + def shutdown(self, wait=False, cancel_futures=False): pass @@ -75,7 +75,7 @@ def submit(self, _fn, _chunk): fut.set_exception(BrokenProcessPool("worker died")) return fut - def shutdown(self, wait=False): + def shutdown(self, wait=False, cancel_futures=False): pass @@ -251,6 +251,95 @@ def test_worker_encode_lengths_uses_backend(self, monkeypatch): assert _worker_encode_lengths(["a b", "c d e"]) == [2, 3] +class _FakeTokenizerWithBackend(_FakeTokenizer): + """Fast-backend fake: lets ``_setup_shards`` proceed past the backend guard.""" + + backend_tokenizer = _FastBackend() + + +class _SpawnlessExecutor: + """Stands in for ProcessPoolExecutor: records ctor args, instant warmup.""" + + def __init__(self, max_workers, mp_context=None, initializer=None, initargs=()): + self.initargs = initargs + + def submit(self, fn, *args): + fut: Future = Future() + fut.set_result(True) + return fut + + def shutdown(self, wait=False, cancel_futures=False): + pass + + +@pytest.mark.unit +class TestSetupShardsDecisions: + """Pins the --tokenizer-workers contract: -1 auto / N clamped / 0 disabled.""" + + def _make(self, monkeypatch, cpus, n_workers, executor=_SpawnlessExecutor): + monkeypatch.setattr(token_metrics_module, "ProcessPoolExecutor", executor) + monkeypatch.setattr( + token_metrics_module.os, "sched_getaffinity", lambda pid: 
set(range(cpus)) + ) + with patch(_MOCK_TARGET, _FakeTokenizerWithBackend): + return BatchTokenizer("fake", n_workers=n_workers) + + @pytest.mark.parametrize( + "cpus, n_workers, expected_shards", + [ + (16, -1, 2), # auto: one shard per 8-core block + (10, -1, 0), # auto needs >= 2 blocks (1 shard ~= in-process) + (48, 3, 3), # explicit count under capacity + (16, 10, 2), # explicit count clamped to capacity + (16, 1, 1), # explicit single shard honored + (16, 0, 0), # 0 disables sharding + ], + ) + def test_shard_count(self, monkeypatch, cpus, n_workers, expected_shards): + tok = self._make(monkeypatch, cpus, n_workers) + try: + assert len(tok._procs) == expected_shards + finally: + tok.close() + + def test_blocks_are_disjoint_consecutive_core_sets(self, monkeypatch): + tok = self._make(monkeypatch, 16, -1) + try: + blocks = [set(ex.initargs[1]) for ex in tok._procs] + assert blocks == [set(range(0, 8)), set(range(8, 16))] + finally: + tok.close() + + def test_affinity_failure_falls_back_in_process(self, monkeypatch): + monkeypatch.setattr( + token_metrics_module, "ProcessPoolExecutor", _SpawnlessExecutor + ) + + def _raise(pid): + raise OSError("affinity unavailable") + + monkeypatch.setattr(token_metrics_module.os, "sched_getaffinity", _raise) + with patch(_MOCK_TARGET, _FakeTokenizerWithBackend): + tok = BatchTokenizer("fake") + try: + assert tok._procs == [] + finally: + tok.close() + + def test_warmup_failure_falls_back_in_process(self, monkeypatch): + class _BrokenWarmup(_SpawnlessExecutor): + def submit(self, fn, *args): + fut: Future = Future() + fut.set_exception(RuntimeError("spawn died")) + return fut + + tok = self._make(monkeypatch, 16, -1, executor=_BrokenWarmup) + try: + assert tok._procs == [] + finally: + tok.close() + + @pytest.mark.unit class TestEvenChunks: def test_splits_into_near_equal_chunks(self): From 6d227bfc0a8a1c79d8bd8e9c6f46b1234c5be0f2 Mon Sep 17 00:00:00 2001 From: Viraat Chandra Date: Wed, 10 Jun 2026 14:55:54 -0700 Subject: 
[PATCH 07/20] =?UTF-8?q?feat(metrics):=20no=20silent=20tokenizer?= =?UTF-8?q?=20fallbacks=20=E2=80=94=20shard=20or=20exit=20cleanly?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A fully set-up environment (fast Rust tokenizer backend + Linux affinity) always shards; anything else was previously a silent in-process fallback that cannot keep up with completions and only surfaces much later as an incomplete drain. Setup is now strict: - no fast backend / no CPU affinity / failed or over-budget warmup -> RuntimeError, surfaced by the service entry as a FATAL launch failure - --tokenizer-workers 0 is the only (explicit) in-process mode - auto mode always shards: max(1, cpus // 8) — the "fewer than two blocks" in-process heuristic is gone; one pinned shard below a full block Also converts the new shard-decision tests to context-managed BatchTokenizer construction (CodeQL: use-with-statement). 164 aggregator unit tests pass; pre-commit clean. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../services/metrics_aggregator/DESIGN.md | 18 ++--- .../services/metrics_aggregator/__main__.py | 15 +++- .../metrics_aggregator/token_metrics.py | 70 +++++++++---------- .../metrics_aggregator/test_token_metrics.py | 65 ++++++++--------- 4 files changed, 87 insertions(+), 81 deletions(-) diff --git a/docs/async_utils/services/metrics_aggregator/DESIGN.md b/docs/async_utils/services/metrics_aggregator/DESIGN.md index 30fe533d6..074cb569b 100644 --- a/docs/async_utils/services/metrics_aggregator/DESIGN.md +++ b/docs/async_utils/services/metrics_aggregator/DESIGN.md @@ -95,17 +95,17 @@ each block of `CORES_PER_WORKER` (8) cores. Why this shape: and stays NUMA-local) is how the whole machine is used. 
- Workers are spawn-context processes with module-level entry points (pickled by name), warmed in parallel at construction so N tokenizer loads do not - serialize (the warmup wait is bounded — a hung load degrades to the - in-process path instead of wedging startup), and they ignore SIGINT — - Ctrl-C goes to the whole process group, and worker lifetime must stay under - the parent drain's control. + serialize (the warmup wait is bounded — a hung load is a startup error, not + a wedge), and they ignore SIGINT — Ctrl-C goes to the whole process group, + and worker lifetime must stay under the parent drain's control. `--tokenizer-workers` controls the shard count: `-1` (default) auto-fits one -shard per 8-core block of the process affinity mask, an explicit count is -clamped to that capacity, and `0` disables sharding. Every fallback to the -in-process path (no fast Rust backend, affinity unavailable, fewer than two -blocks) is logged with its reason — a missing "shards" INFO line should never -be the only signal that the batch path is running single-threaded. +shard per 8-core block of the process affinity mask (always at least one), an +explicit count is clamped to that capacity, and `0` explicitly selects +in-process tokenization. There is no implicit fallback: an environment that +cannot shard — no fast Rust backend, no CPU affinity, a failed or over-budget +warmup — is a startup error, because a silent in-process slow path cannot +keep up with completions and would surface much later as an incomplete drain. 
Chat-template items (tool-call outputs) take a separate in-process thread: they are rare relative to the batched flush, and `apply_chat_template` is diff --git a/src/inference_endpoint/async_utils/services/metrics_aggregator/__main__.py b/src/inference_endpoint/async_utils/services/metrics_aggregator/__main__.py index fc975246a..20a5b1dfb 100644 --- a/src/inference_endpoint/async_utils/services/metrics_aggregator/__main__.py +++ b/src/inference_endpoint/async_utils/services/metrics_aggregator/__main__.py @@ -169,7 +169,10 @@ async def main() -> None: default=-1, help=( "Number of tokenizer shard processes (-1 = auto: one per " - "8-core block of this machine; 0 = in-process tokenization)." + "8-core block of this machine, minimum one; 0 = explicit " + "in-process tokenization). An environment that cannot shard " + "(no fast tokenizer backend, no CPU affinity) is a startup " + "error unless 0 is passed." ), ) parser.add_argument( @@ -223,7 +226,15 @@ async def main() -> None: logger.info("metrics aggregator affinity: %d CPUs", len(cpus)) except UnsupportedPlatformError: pass # non-Linux: no inherited pin to undo. - tokenizer_cm = BatchTokenizer(args.tokenizer, n_workers=args.tokenizer_workers) + try: + tokenizer_cm = BatchTokenizer( + args.tokenizer, n_workers=args.tokenizer_workers + ) + except RuntimeError as exc: + # Fail-fast contract: a tokenizer environment that cannot shard + # must surface as a clear service-launch failure, not a silent + # slow path that cannot keep up with completions. 
+ raise SystemExit(f"FATAL: {exc}") from exc else: tokenizer_cm = nullcontext() diff --git a/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py b/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py index 5e57a197d..8aa678f84 100644 --- a/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py +++ b/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py @@ -18,9 +18,10 @@ ``BatchTokenizer`` tokenizes whole batches at once, sharded across worker processes each pinned to a block of ``CORES_PER_WORKER`` cores (a single BPE rayon pool is memory-bound and saturates ~8 cores). The aggregator buffers -per-sample text and flushes the batch once per publish tick and at drain. Falls -back to a single in-process thread when there is no fast Rust backend or fewer -than two core blocks fit. +per-sample text and flushes the batch once per publish tick and at drain. +Sharding requires the fast (Rust) tokenizers backend and Linux CPU affinity; +an environment that cannot shard is a startup error, never a silent slow +path — ``--tokenizer-workers 0`` is the only (explicit) in-process mode. """ from __future__ import annotations @@ -191,7 +192,8 @@ def __init__( max_workers=1, thread_name_prefix="tok-thread" ) self._load_tokenizer() # also computes the chat-template baseline - # Process shards for the batched text path (or empty -> in-process). + # Process shards for the batched text path. Empty only when + # in-process mode was explicitly requested (n_workers=0). self._procs: list[ProcessPoolExecutor] = [] self._setup_shards(cores_per_worker, n_workers) @@ -233,43 +235,34 @@ def _load_tokenizer(self) -> None: def _setup_shards(self, cores_per_worker: int, n_workers: int) -> None: """Spawn one pinned single-worker process per core block. 
- ``n_workers <= 0`` (auto) fits as many shards as this process's - affinity mask allows, one per ``cores_per_worker`` block; an explicit - count is clamped to that capacity. No-op (leaving the batch path - in-process) when the tokenizer has no fast Rust backend, affinity is - unavailable, or — in auto mode — fewer than two blocks fit (a single - shard is no faster than the in-process backend). Each fallback is - logged: a missing "shards" INFO line is the only other signal that - the batched path is running single-threaded. + ``n_workers == 0`` explicitly selects in-process tokenization. Auto + (``< 0``) fits one shard per ``cores_per_worker`` block of this + process's affinity mask, always at least one; an explicit count is + clamped to that capacity. An environment that cannot shard — no fast + Rust backend, no CPU affinity, a warmup that fails or exceeds its + budget — raises instead of silently degrading to a slow path that + cannot keep up with completions. """ if cores_per_worker <= 0 or n_workers == 0: - logger.info("BatchTokenizer: sharding disabled") + logger.info("BatchTokenizer: in-process tokenization (explicit)") return if getattr(self._tokenizer, "backend_tokenizer", None) is None: - logger.info( - "BatchTokenizer: no fast tokenizer backend; using in-process " - "tokenization" + raise RuntimeError( + f"tokenizer {self._tokenizer_name!r} has no fast (Rust) " + "backend; token metrics require one to keep up with " + "completions. Pass --tokenizer-workers 0 to explicitly run " + "single-threaded in-process tokenization." ) - return try: available = sorted(os.sched_getaffinity(0)) - except (OSError, AttributeError): - logger.info( - "BatchTokenizer: CPU affinity unavailable; using in-process " - "tokenization" - ) - return - capacity = len(available) // cores_per_worker + except (OSError, AttributeError) as exc: + raise RuntimeError( + "CPU affinity is unavailable; tokenizer sharding requires " + "Linux. 
Pass --tokenizer-workers 0 to explicitly run " + "in-process tokenization." + ) from exc + capacity = max(1, len(available) // cores_per_worker) n = capacity if n_workers < 0 else min(n_workers, capacity) - if n < (2 if n_workers < 0 else 1): - logger.info( - "BatchTokenizer: %d CPUs available (capacity %d blocks of %d); " - "using in-process tokenization", - len(available), - capacity, - cores_per_worker, - ) - return t0 = time.perf_counter() ctx = multiprocessing.get_context("spawn") procs: list[ProcessPoolExecutor] = [] @@ -292,12 +285,13 @@ def _setup_shards(self, cores_per_worker: int, n_workers: int) -> None: deadline = time.monotonic() + _SHARD_WARMUP_TIMEOUT_S for f in ready: f.result(timeout=max(0.0, deadline - time.monotonic())) - except Exception: + except Exception as exc: _terminate_procs(procs) - logger.exception( - "tokenizer shard setup failed; using in-process tokenization" - ) - return + raise RuntimeError( + "tokenizer shard warmup failed; refusing to fall back to a " + "slow path. Fix the environment (or pass --tokenizer-workers " + "0 to explicitly run in-process)." 
+ ) from exc self._procs = procs logger.info( "BatchTokenizer: %d shards x %d cores (setup %.1fs)", diff --git a/tests/unit/async_utils/services/metrics_aggregator/test_token_metrics.py b/tests/unit/async_utils/services/metrics_aggregator/test_token_metrics.py index 14805b011..2558c6e69 100644 --- a/tests/unit/async_utils/services/metrics_aggregator/test_token_metrics.py +++ b/tests/unit/async_utils/services/metrics_aggregator/test_token_metrics.py @@ -85,7 +85,7 @@ class TestBatchTokenizer: async def test_count_texts_async(self): with patch(_MOCK_TARGET, _FakeTokenizer): loop = asyncio.get_running_loop() - with BatchTokenizer("fake") as tok: + with BatchTokenizer("fake", n_workers=0) as tok: counts = await tok.count_texts_async(["Hello world foo", "a"], loop) assert counts == [3, 1] @@ -93,7 +93,7 @@ async def test_count_texts_async(self): async def test_count_texts_async_empty(self): with patch(_MOCK_TARGET, _FakeTokenizer): loop = asyncio.get_running_loop() - with BatchTokenizer("fake") as tok: + with BatchTokenizer("fake", n_workers=0) as tok: assert await tok.count_texts_async([], loop) == [] @pytest.mark.asyncio @@ -101,7 +101,7 @@ async def test_count_texts_async_sharded(self): """With shards present, chunks are reassembled in original order.""" with patch(_MOCK_TARGET, _FakeTokenizer): loop = asyncio.get_running_loop() - with BatchTokenizer("fake") as tok: + with BatchTokenizer("fake", n_workers=0) as tok: tok._procs = [_FakeProc(), _FakeProc()] counts = await tok.count_texts_async(["a", "b b", "c c c", "d"], loop) assert counts == [1, 2, 3, 1] @@ -111,14 +111,14 @@ async def test_count_texts_async_shard_failure_propagates(self): """A dead shard surfaces as an error, not a silent in-process fallback.""" with patch(_MOCK_TARGET, _FakeTokenizer): loop = asyncio.get_running_loop() - with BatchTokenizer("fake") as tok: + with BatchTokenizer("fake", n_workers=0) as tok: tok._procs = [_BrokenProc()] with pytest.raises(BrokenProcessPool): await 
tok.count_texts_async(["a b"], loop) def test_close_is_idempotent(self): with patch(_MOCK_TARGET, _FakeTokenizer): - tok = BatchTokenizer("fake") + tok = BatchTokenizer("fake", n_workers=0) tok.close() tok.close() # must not raise @@ -126,7 +126,7 @@ def test_close_is_idempotent(self): async def test_use_after_close_raises(self): with patch(_MOCK_TARGET, _FakeTokenizer): loop = asyncio.get_running_loop() - tok = BatchTokenizer("fake") + tok = BatchTokenizer("fake", n_workers=0) tok.close() with pytest.raises(RuntimeError, match="closed"): await tok.count_texts_async(["hello"], loop) @@ -163,7 +163,7 @@ async def test_token_count_message_subtracts_baseline(self): """token_count_message_async returns full_tokens - baseline.""" with patch(_MOCK_TARGET, _FakeTokenizerWithTemplate): loop = asyncio.get_running_loop() - with BatchTokenizer("fake") as tok: + with BatchTokenizer("fake", n_workers=0) as tok: # "hello world" -> 2 content + 2 wrapper = 4; baseline = 0, prefix = 2 count = await tok.token_count_message_async( "hello world", None, None, loop @@ -175,7 +175,7 @@ async def test_token_count_message_includes_tool_calls(self): """Tool-call JSON tokens are included in the count.""" with patch(_MOCK_TARGET, _FakeTokenizerWithTemplate): loop = asyncio.get_running_loop() - with BatchTokenizer("fake") as tok: + with BatchTokenizer("fake", n_workers=0) as tok: tool_calls = ( { "id": "c1", @@ -199,7 +199,7 @@ def apply_chat_template(self, *args, **kwargs): with patch(_MOCK_TARGET, _BadTemplateTokenizer): loop = asyncio.get_running_loop() - with BatchTokenizer("fake") as tok: + with BatchTokenizer("fake", n_workers=0) as tok: tool_calls = ( { "id": "c1", @@ -274,7 +274,11 @@ def shutdown(self, wait=False, cancel_futures=False): @pytest.mark.unit class TestSetupShardsDecisions: - """Pins the --tokenizer-workers contract: -1 auto / N clamped / 0 disabled.""" + """Pins the --tokenizer-workers contract: -1 auto / N clamped / 0 explicit. 
+ + An environment that cannot shard is a startup error — never a silent + in-process fallback. + """ def _make(self, monkeypatch, cpus, n_workers, executor=_SpawnlessExecutor): monkeypatch.setattr(token_metrics_module, "ProcessPoolExecutor", executor) @@ -288,29 +292,32 @@ def _make(self, monkeypatch, cpus, n_workers, executor=_SpawnlessExecutor): "cpus, n_workers, expected_shards", [ (16, -1, 2), # auto: one shard per 8-core block - (10, -1, 0), # auto needs >= 2 blocks (1 shard ~= in-process) + (10, -1, 1), # auto: always at least one shard + (6, -1, 1), # auto: even below one full block (48, 3, 3), # explicit count under capacity (16, 10, 2), # explicit count clamped to capacity (16, 1, 1), # explicit single shard honored - (16, 0, 0), # 0 disables sharding + (16, 0, 0), # 0 = explicit in-process mode ], ) def test_shard_count(self, monkeypatch, cpus, n_workers, expected_shards): - tok = self._make(monkeypatch, cpus, n_workers) - try: + with self._make(monkeypatch, cpus, n_workers) as tok: assert len(tok._procs) == expected_shards - finally: - tok.close() def test_blocks_are_disjoint_consecutive_core_sets(self, monkeypatch): - tok = self._make(monkeypatch, 16, -1) - try: + with self._make(monkeypatch, 16, -1) as tok: blocks = [set(ex.initargs[1]) for ex in tok._procs] assert blocks == [set(range(0, 8)), set(range(8, 16))] - finally: - tok.close() - def test_affinity_failure_falls_back_in_process(self, monkeypatch): + def test_no_fast_backend_is_a_startup_error(self, monkeypatch): + monkeypatch.setattr( + token_metrics_module, "ProcessPoolExecutor", _SpawnlessExecutor + ) + with patch(_MOCK_TARGET, _FakeTokenizer): # no backend_tokenizer + with pytest.raises(RuntimeError, match="fast"): + BatchTokenizer("fake") + + def test_affinity_failure_is_a_startup_error(self, monkeypatch): monkeypatch.setattr( token_metrics_module, "ProcessPoolExecutor", _SpawnlessExecutor ) @@ -320,24 +327,18 @@ def _raise(pid): monkeypatch.setattr(token_metrics_module.os, 
"sched_getaffinity", _raise) with patch(_MOCK_TARGET, _FakeTokenizerWithBackend): - tok = BatchTokenizer("fake") - try: - assert tok._procs == [] - finally: - tok.close() + with pytest.raises(RuntimeError, match="affinity"): + BatchTokenizer("fake") - def test_warmup_failure_falls_back_in_process(self, monkeypatch): + def test_warmup_failure_is_a_startup_error(self, monkeypatch): class _BrokenWarmup(_SpawnlessExecutor): def submit(self, fn, *args): fut: Future = Future() fut.set_exception(RuntimeError("spawn died")) return fut - tok = self._make(monkeypatch, 16, -1, executor=_BrokenWarmup) - try: - assert tok._procs == [] - finally: - tok.close() + with pytest.raises(RuntimeError, match="warmup"): + self._make(monkeypatch, 16, -1, executor=_BrokenWarmup) @pytest.mark.unit From 0cca84a0de6daad267a4821f847db74e8ead85a5 Mon Sep 17 00:00:00 2001 From: Viraat Chandra Date: Wed, 10 Jun 2026 15:24:37 -0700 Subject: [PATCH 08/20] fix(metrics): shard unpinned on platforms without CPU affinity (macOS) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The affinity API's absence is a platform property, not a broken environment: sharding works identically without pinning — the OS scheduler spreads the workers and only cache/NUMA locality is lost. _setup_shards now sizes blocks from the online CPU count when sched_getaffinity is unavailable, and each worker that cannot pin caps its rayon pool to its block size via RAYON_NUM_THREADS so unpinned shards do not oversubscribe each other. The strict startup errors remain for genuine environment problems: a tokenizer without a fast (Rust) backend, and a failed or over-budget shard warmup. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- .../services/metrics_aggregator/DESIGN.md | 9 +++-- .../services/metrics_aggregator/__main__.py | 6 ++-- .../metrics_aggregator/token_metrics.py | 33 +++++++++++-------- .../metrics_aggregator/test_token_metrics.py | 8 +++-- 4 files changed, 33 insertions(+), 23 deletions(-) diff --git a/docs/async_utils/services/metrics_aggregator/DESIGN.md b/docs/async_utils/services/metrics_aggregator/DESIGN.md index 074cb569b..5b8cd2c50 100644 --- a/docs/async_utils/services/metrics_aggregator/DESIGN.md +++ b/docs/async_utils/services/metrics_aggregator/DESIGN.md @@ -103,9 +103,12 @@ each block of `CORES_PER_WORKER` (8) cores. Why this shape: shard per 8-core block of the process affinity mask (always at least one), an explicit count is clamped to that capacity, and `0` explicitly selects in-process tokenization. There is no implicit fallback: an environment that -cannot shard — no fast Rust backend, no CPU affinity, a failed or over-budget -warmup — is a startup error, because a silent in-process slow path cannot -keep up with completions and would surface much later as an incomplete drain. +cannot shard — no fast Rust backend, a failed or over-budget warmup — is a +startup error, because a silent in-process slow path cannot keep up with +completions and would surface much later as an incomplete drain. Platforms +without a CPU-affinity API (e.g. macOS) still shard at full speed, just +unpinned: blocks are sized from the online CPU count and each worker caps its +rayon pool to the block size instead of pinning. 
Chat-template items (tool-call outputs) take a separate in-process thread: they are rare relative to the batched flush, and `apply_chat_template` is diff --git a/src/inference_endpoint/async_utils/services/metrics_aggregator/__main__.py b/src/inference_endpoint/async_utils/services/metrics_aggregator/__main__.py index 20a5b1dfb..7e2acea30 100644 --- a/src/inference_endpoint/async_utils/services/metrics_aggregator/__main__.py +++ b/src/inference_endpoint/async_utils/services/metrics_aggregator/__main__.py @@ -170,9 +170,9 @@ async def main() -> None: help=( "Number of tokenizer shard processes (-1 = auto: one per " "8-core block of this machine, minimum one; 0 = explicit " - "in-process tokenization). An environment that cannot shard " - "(no fast tokenizer backend, no CPU affinity) is a startup " - "error unless 0 is passed." + "in-process tokenization). A tokenizer without a fast (Rust) " + "backend is a startup error unless 0 is passed; platforms " + "without CPU affinity (e.g. macOS) shard unpinned." ), ) parser.add_argument( diff --git a/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py b/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py index 8aa678f84..02b927d24 100644 --- a/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py +++ b/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py @@ -19,9 +19,10 @@ processes each pinned to a block of ``CORES_PER_WORKER`` cores (a single BPE rayon pool is memory-bound and saturates ~8 cores). The aggregator buffers per-sample text and flushes the batch once per publish tick and at drain. -Sharding requires the fast (Rust) tokenizers backend and Linux CPU affinity; -an environment that cannot shard is a startup error, never a silent slow -path — ``--tokenizer-workers 0`` is the only (explicit) in-process mode. 
+Sharding requires the fast (Rust) tokenizers backend; an environment without +one is a startup error, never a silent slow path — ``--tokenizer-workers 0`` +is the only (explicit) in-process mode. Platforms without CPU affinity (e.g. +macOS) shard unpinned at full speed; only cache/NUMA locality is lost. """ from __future__ import annotations @@ -110,6 +111,9 @@ def _init_worker(tokenizer_name: str, core_set: list[int]) -> None: try: os.sched_setaffinity(0, set(core_set)) except (OSError, AttributeError): + # No pinning (e.g. macOS): cap the rayon pool to the block size + # instead, so unpinned shards don't oversubscribe each other. + os.environ.setdefault("RAYON_NUM_THREADS", str(len(core_set))) logger.debug("could not pin tokenizer worker to %s", core_set) transformers_logging.set_verbosity_error() tok = AutoTokenizer.from_pretrained(tokenizer_name, trust_remote_code=True) @@ -237,11 +241,12 @@ def _setup_shards(self, cores_per_worker: int, n_workers: int) -> None: ``n_workers == 0`` explicitly selects in-process tokenization. Auto (``< 0``) fits one shard per ``cores_per_worker`` block of this - process's affinity mask, always at least one; an explicit count is - clamped to that capacity. An environment that cannot shard — no fast - Rust backend, no CPU affinity, a warmup that fails or exceeds its - budget — raises instead of silently degrading to a slow path that - cannot keep up with completions. + process's affinity mask (or the online CPU count when the platform + has no affinity API — shards then run unpinned), always at least one; + an explicit count is clamped to that capacity. An environment that + cannot shard — no fast Rust backend, a warmup that fails or exceeds + its budget — raises instead of silently degrading to a slow path + that cannot keep up with completions. 
""" if cores_per_worker <= 0 or n_workers == 0: logger.info("BatchTokenizer: in-process tokenization (explicit)") @@ -255,12 +260,12 @@ def _setup_shards(self, cores_per_worker: int, n_workers: int) -> None: ) try: available = sorted(os.sched_getaffinity(0)) - except (OSError, AttributeError) as exc: - raise RuntimeError( - "CPU affinity is unavailable; tokenizer sharding requires " - "Linux. Pass --tokenizer-workers 0 to explicitly run " - "in-process tokenization." - ) from exc + except (OSError, AttributeError): + # No affinity API (e.g. macOS): shard unpinned — the OS scheduler + # spreads the workers; only cache/NUMA locality is lost. Workers + # cap their rayon pools to the block size instead (_init_worker). + available = list(range(os.cpu_count() or 1)) + logger.info("BatchTokenizer: CPU affinity unavailable; sharding unpinned") capacity = max(1, len(available) // cores_per_worker) n = capacity if n_workers < 0 else min(n_workers, capacity) t0 = time.perf_counter() diff --git a/tests/unit/async_utils/services/metrics_aggregator/test_token_metrics.py b/tests/unit/async_utils/services/metrics_aggregator/test_token_metrics.py index 2558c6e69..1bee9ba7f 100644 --- a/tests/unit/async_utils/services/metrics_aggregator/test_token_metrics.py +++ b/tests/unit/async_utils/services/metrics_aggregator/test_token_metrics.py @@ -317,7 +317,8 @@ def test_no_fast_backend_is_a_startup_error(self, monkeypatch): with pytest.raises(RuntimeError, match="fast"): BatchTokenizer("fake") - def test_affinity_failure_is_a_startup_error(self, monkeypatch): + def test_affinity_unavailable_shards_unpinned(self, monkeypatch): + """No affinity API (e.g. 
macOS): shard from the CPU count, unpinned.""" monkeypatch.setattr( token_metrics_module, "ProcessPoolExecutor", _SpawnlessExecutor ) @@ -326,9 +327,10 @@ def _raise(pid): raise OSError("affinity unavailable") monkeypatch.setattr(token_metrics_module.os, "sched_getaffinity", _raise) + monkeypatch.setattr(token_metrics_module.os, "cpu_count", lambda: 16) with patch(_MOCK_TARGET, _FakeTokenizerWithBackend): - with pytest.raises(RuntimeError, match="affinity"): - BatchTokenizer("fake") + with BatchTokenizer("fake") as tok: + assert len(tok._procs) == 2 def test_warmup_failure_is_a_startup_error(self, monkeypatch): class _BrokenWarmup(_SpawnlessExecutor): From 443a923f223758f9dc5a86023ff7f1ad655237ef Mon Sep 17 00:00:00 2001 From: Viraat Chandra Date: Wed, 10 Jun 2026 15:50:41 -0700 Subject: [PATCH 09/20] refactor(metrics): queue-owned live flush lane; drop the pre_publish hook MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The publisher no longer knows about tokenization: TokenBatchQueue owns its flush cadence via start_live(interval), removing the pre_publish callback (and its failure-isolation machinery) added earlier in this branch. Mid-run flushes go through a bounded live lane — --live-tokenizers shards (default 1), taken from the highest core blocks, farthest from the loadgen's low cores — so live ISL/OSL/TPOT stay current without contending with the benchmark hot path; --live-tokenizers 0 defers all tokenization to the end-of-run drain, which always uses every shard. Live-flush failures and cancellations re-queue the detached items so a mid-run hiccup never loses samples (the drain retries them); drain failures remain terminal and pending-counted. Default metrics-drain-timeout rises 60s -> 300s since the live lane is sized for currency, not for keeping up with peak completion rates. 
For comparison, main tokenizes continuously during the run on 2 threads inside the aggregator process — which inherits the loadgen's pinned mask, i.e. directly on the loadgen's cores. Co-Authored-By: Claude Opus 4.8 (1M context) --- AGENTS.md | 2 +- .../services/metrics_aggregator/DESIGN.md | 24 +-- .../services/metrics_aggregator/__main__.py | 21 ++- .../services/metrics_aggregator/aggregator.py | 17 ++- .../services/metrics_aggregator/publisher.py | 30 +--- .../services/metrics_aggregator/snapshot.py | 2 +- .../metrics_aggregator/token_metrics.py | 137 ++++++++++++++---- src/inference_endpoint/config/schema.py | 4 +- .../templates/concurrency_template_full.yaml | 2 +- .../templates/offline_template_full.yaml | 2 +- .../templates/online_template_full.yaml | 2 +- .../services/metrics_aggregator/conftest.py | 5 + .../metrics_aggregator/test_aggregator.py | 18 +-- .../metrics_aggregator/test_publisher.py | 86 ----------- .../metrics_aggregator/test_token_metrics.py | 83 +++++++++++ tests/unit/commands/test_benchmark.py | 2 +- 16 files changed, 262 insertions(+), 175 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index 050d9e5b3..dbc8ce953 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -115,7 +115,7 @@ The aggregator is a separate process (`python -m inference_endpoint.async_utils. - **Series storage**: each `SeriesSampler` keeps three parallel views: O(1) cheap rollups (count/total/min/max/sum_sq, exact), an HDR Histogram (cheap live percentiles), and an in-memory `array.array` of raw values (for exact percentiles in the `COMPLETE` snapshot). Hot path is `registry.record(name, value)` — no allocation, no I/O. - **Counter API**: `registry.increment(name, delta=1)` for sample-event counters. `registry.set_counter(name, value)` only for the two duration counters (`total_duration_ns` max-of-elapsed, `tracked_duration_ns` sum-of-blocks). 
-- **Lifecycle**: `INITIALIZE` (constructed, awaiting first `STARTED`) → `LIVE` (run in progress, ticking every `--publish-interval` seconds) → `DRAINING` (set on `ENDED`; tick continues; bounded by `--drain-timeout` budget, default 60 s) → terminal: `COMPLETE` (clean end via `publish_final`, exact stats) **or** `INTERRUPTED` (signal-handler-triggered final via SIGTERM/SIGINT; best-effort partial stats). Drain timeout detected by consumers as `state == COMPLETE and n_pending_tasks > 0`; interrupted runs are detected as `state == INTERRUPTED` directly. +- **Lifecycle**: `INITIALIZE` (constructed, awaiting first `STARTED`) → `LIVE` (run in progress, ticking every `--publish-interval` seconds) → `DRAINING` (set on `ENDED`; tick continues; bounded by `--drain-timeout` budget, default 300 s) → terminal: `COMPLETE` (clean end via `publish_final`, exact stats) **or** `INTERRUPTED` (signal-handler-triggered final via SIGTERM/SIGINT; best-effort partial stats). Drain timeout detected by consumers as `state == COMPLETE and n_pending_tasks > 0`; interrupted runs are detected as `state == INTERRUPTED` directly. - **Final delivery is dual-path with separated concerns**: `publish_final` atomically writes `final_snapshot.json` (`tmp + fsync(file) + rename + fsync(parent_dir)`) — this is the **primary** Report source — AND emits the terminal-state snapshot over pub/sub as a TUI shutdown signal. Each path is wrapped in its own try/except so one failure cannot suppress the other. Main process consumer reads `final_snapshot.json` (via `json.loads` to dict, no Struct decode); falls back to the subscriber's `latest` live snapshot only if the file is missing (e.g. SIGKILL / OOM before the signal handler ran). The dict form is the canonical consumer contract (see `snapshot_to_dict`). - **Histogram bucket edges are dynamic per snapshot**: log-spaced over the observed `[min, max]`. 
Bucket count is fixed at construction; consumers MUST re-render from the snapshot's `(lo, hi, count)` triples each frame and MUST NOT track bucket-by-index across snapshots. diff --git a/docs/async_utils/services/metrics_aggregator/DESIGN.md b/docs/async_utils/services/metrics_aggregator/DESIGN.md index 5b8cd2c50..707ded06d 100644 --- a/docs/async_utils/services/metrics_aggregator/DESIGN.md +++ b/docs/async_utils/services/metrics_aggregator/DESIGN.md @@ -38,7 +38,7 @@ INITIALIZE ──STARTED──► LIVE ──ENDED──► DRAINING ──► C - **LIVE**: the publisher tick task emits a snapshot every `--publish-interval` seconds (default 0.25 s). - **DRAINING**: entered on `ENDED`; the buffered tokenizations are flushed, - bounded by the `--drain-timeout` budget (default 60 s; `0` = unlimited). + bounded by the `--drain-timeout` budget (default 300 s; `0` = unlimited). - The ENDED path runs inside a finalization boundary: whatever the drain does — finish, time out, or fail — `publish_final` and the shutdown signal always run. A tokenizer failure can degrade the snapshot (see the @@ -63,12 +63,17 @@ Token triggers do no work at event time. `fire()` appends to a buffer, an O(1) operation with no event-loop tasks. The buffer is cleared in batches at exactly two points: -1. **Every publish tick** — the publisher awaits a `pre_publish` hook before - composing each snapshot, so live ISL/OSL/TPOT reflect recently completed - samples. A failure here is swallowed by the tick (live publishing never - stops). -2. **End-of-run** — `flush_remaining(timeout)` drains everything still - buffered, bounded by the drain budget. +1. **The queue's own live loop** — `start_live(interval)` flushes + periodically (at the publish cadence) through the tokenizer's **bounded + live lane**: the last `--live-tokenizers` shards (default 1 — the highest + core block, farthest from the loadgen's low cores), so live ISL/OSL/TPOT + stay current without touching the benchmark hot path. 
`--live-tokenizers +0` disables mid-run tokenization entirely. Failures are logged once and + never stop the loop. +2. **End-of-run** — `flush_remaining(timeout)` stops the live loop and drains + everything still buffered through **every** shard, bounded by the drain + budget. The publisher knows nothing about tokenization — it only reads + `(state, n_pending_tasks)`. `flush()` serializes under an asyncio lock and detaches the buffer up front, so enqueues that race a flush land in the next one. Failure isolation is @@ -146,7 +151,7 @@ as a clean run. ``` COMPLETE event ─► TokenTrigger.fire ─► queue.enqueue(text, on_count) [O(1)] │ - publish tick (0.25 s) ──────────────┤ flush() + live loop (0.25 s, live lane) ─────┤ flush() ENDED drain (budgeted) ─────────────┘ │ ├─► chunks ─► N pinned worker procs │ (encode_batch_fast) @@ -161,9 +166,10 @@ COMPLETE event ─► TokenTrigger.fire ─► queue.enqueue(text, on_count) [ | `--metrics-socket` | required | Snapshot PUB socket name | | `--metrics-output-dir` | required | Directory for `final_snapshot.json` | | `--publish-interval` | 0.25 | Live snapshot cadence (seconds) | -| `--drain-timeout` | 60.0 | End-of-run tokenize budget (`0` = unlimited) | +| `--drain-timeout` | 300.0 | End-of-run tokenize budget (`0` = unlimited) | | `--tokenizer` | none | HF name or local path; unset disables token metrics | | `--tokenizer-workers` | -1 | Shard processes (`-1` auto, `0` in-process) | +| `--live-tokenizers` | 1 | Shards for mid-run live flushes (`0` = defer all) | | `--streaming` | off | Register TTFT/chunk-delta/TPOT triggers | ## References diff --git a/src/inference_endpoint/async_utils/services/metrics_aggregator/__main__.py b/src/inference_endpoint/async_utils/services/metrics_aggregator/__main__.py index 7e2acea30..628811a36 100644 --- a/src/inference_endpoint/async_utils/services/metrics_aggregator/__main__.py +++ b/src/inference_endpoint/async_utils/services/metrics_aggregator/__main__.py @@ -137,11 +137,11 @@ async def main() 
-> None: parser.add_argument( "--drain-timeout", type=float, - default=60.0, + default=300.0, help=( "Wall-clock budget (seconds) to finish tokenizing buffered samples " "after ENDED before the aggregator emits the final snapshot with " - "n_pending_tasks > 0 (default: 60.0; 0 = wait indefinitely). Increase " + "n_pending_tasks > 0 (default: 300.0; 0 = wait indefinitely). Increase " "for very large datasets where the end-of-run tokenize batch is big." ), ) @@ -175,6 +175,16 @@ async def main() -> None: "without CPU affinity (e.g. macOS) shard unpinned." ), ) + parser.add_argument( + "--live-tokenizers", + type=int, + default=1, + help=( + "Shards used for mid-run (live) token-metric flushes (default: 1 " + "— the highest core block, away from the loadgen's cores; 0 = no " + "mid-run tokenization, everything defers to the end-of-run drain)." + ), + ) parser.add_argument( "--streaming", action="store_true", @@ -228,7 +238,9 @@ async def main() -> None: pass # non-Linux: no inherited pin to undo. 
try: tokenizer_cm = BatchTokenizer( - args.tokenizer, n_workers=args.tokenizer_workers + args.tokenizer, + n_workers=args.tokenizer_workers, + live_workers=args.live_tokenizers, ) except RuntimeError as exc: # Fail-fast contract: a tokenizer environment that cannot shard @@ -262,6 +274,9 @@ async def main() -> None: sig_figs=args.hdr_sig_figs, n_histogram_buckets=args.n_histogram_buckets, tokenizer=tokenizer, + live_flush_interval_s=( + args.publish_interval if args.live_tokenizers > 0 else None + ), streaming=args.streaming, shutdown_event=shutdown_event, drain_timeout_s=None if args.drain_timeout == 0 else args.drain_timeout, diff --git a/src/inference_endpoint/async_utils/services/metrics_aggregator/aggregator.py b/src/inference_endpoint/async_utils/services/metrics_aggregator/aggregator.py index e87448036..cb2db5878 100644 --- a/src/inference_endpoint/async_utils/services/metrics_aggregator/aggregator.py +++ b/src/inference_endpoint/async_utils/services/metrics_aggregator/aggregator.py @@ -118,6 +118,7 @@ def __init__( sig_figs: int, n_histogram_buckets: int, tokenizer: BatchTokenizer | None = None, + live_flush_interval_s: float | None = None, streaming: bool = False, shutdown_event: asyncio.Event | None = None, drain_timeout_s: float | None = _DEFAULT_DRAIN_TIMEOUT_S, @@ -139,6 +140,9 @@ def __init__( self._token_queue: TokenBatchQueue | None = ( TokenBatchQueue(tokenizer, self.loop) if tokenizer is not None else None ) + # Cadence of the queue's live flush loop (None = no mid-run + # tokenization; everything defers to the end-of-run drain). 
+ self._live_flush_interval_s = live_flush_interval_s self._streaming = streaming self._shutdown_event = shutdown_event self._shutdown_received = False @@ -246,11 +250,6 @@ def pending_tokens(self) -> int: """Enqueued tokenizations not yet recorded (the snapshot n_pending_tasks).""" return self._token_queue.pending if self._token_queue is not None else 0 - async def _flush_tokens(self) -> None: - """Flush buffered tokenizations so the next snapshot reflects them.""" - if self._token_queue is not None: - await self._token_queue.flush() - # ------------------------------------------------------------------ # Event processing # ------------------------------------------------------------------ @@ -325,8 +324,14 @@ async def process(self, records: list[EventRecord]) -> None: self._session_state, self.pending_tokens, ), - pre_publish=self._flush_tokens, ) + if ( + self._token_queue is not None + and self._live_flush_interval_s is not None + ): + self._token_queue.start_live( + self._live_flush_interval_s + ) table.handle_session_event(record) if ev == SessionEventType.STOP_PERFORMANCE_TRACKING: registry.set_counter( diff --git a/src/inference_endpoint/async_utils/services/metrics_aggregator/publisher.py b/src/inference_endpoint/async_utils/services/metrics_aggregator/publisher.py index fedc0fbe1..c90ca11fc 100644 --- a/src/inference_endpoint/async_utils/services/metrics_aggregator/publisher.py +++ b/src/inference_endpoint/async_utils/services/metrics_aggregator/publisher.py @@ -21,7 +21,7 @@ import json import logging import os -from collections.abc import Awaitable, Callable +from collections.abc import Callable from pathlib import Path from inference_endpoint.async_utils.services.metrics_aggregator.registry import ( @@ -102,7 +102,6 @@ def start( registry: MetricsRegistry, publish_interval_s: float, get_runtime_state: Callable[[], tuple[SessionState, int]], - pre_publish: Callable[[], Awaitable[None]] | None = None, ) -> None: """Begin publishing live ticks every 
``publish_interval_s`` seconds. @@ -113,14 +112,6 @@ def start( snapshot. ``COMPLETE`` is emitted only by ``publish_final``, never by the tick task. - ``pre_publish``, if given, is awaited at the top of each tick before - the snapshot is built — the aggregator uses it to flush buffered - tokenizations so live ISL/OSL/TPOT reflect recently completed samples. - Its failures are swallowed in their own handler so the snapshot is - still built and published — even a tokenizer that fails on every tick - cannot stop live publishing; the unflushed items remain visible as - ``n_pending_tasks``. - Idempotent on the tick-task slot: a second call (e.g. from a spurious duplicate ``STARTED`` event or a buggy replay producer) is a no-op rather than orphaning the original task. The original @@ -139,28 +130,9 @@ def start( ) async def _tick() -> None: - flush_failure_logged = False while True: try: await asyncio.sleep(publish_interval_s) - if pre_publish is not None: - # Isolated from the publish path: a persistently - # broken tokenizer would otherwise abort every tick - # here and stop ALL live snapshots, not just token - # series. Unflushed items stay visible to consumers - # via n_pending_tasks. - try: - await pre_publish() - except Exception: # noqa: BLE001 — publish anyway. 
- if not flush_failure_logged: - flush_failure_logged = True - logger.exception( - "pre_publish flush failed; live snapshots " - "continue without fresh token metrics " - "(further failures logged at debug)" - ) - else: - logger.debug("pre_publish flush failed again") state, n_pending = get_runtime_state() snap = registry.build_snapshot( state=state, n_pending_tasks=n_pending diff --git a/src/inference_endpoint/async_utils/services/metrics_aggregator/snapshot.py b/src/inference_endpoint/async_utils/services/metrics_aggregator/snapshot.py index eacac94f5..e233f36a3 100644 --- a/src/inference_endpoint/async_utils/services/metrics_aggregator/snapshot.py +++ b/src/inference_endpoint/async_utils/services/metrics_aggregator/snapshot.py @@ -45,7 +45,7 @@ class SessionState(str, Enum): LIVE → run in progress; tick task publishing live HDR-derived stats. DRAINING → ``SessionEventType.ENDED`` has been received; the aggregator is tokenizing the buffered samples (bounded by the - ``--drain-timeout`` budget, default 60 s). Tick task + ``--drain-timeout`` budget, default 300 s). Tick task continues at this stage, still HDR-derived; no new events will arrive. COMPLETE → terminal clean state. The ``publish_final()`` snapshot diff --git a/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py b/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py index 02b927d24..7723d84d4 100644 --- a/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py +++ b/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py @@ -18,7 +18,8 @@ ``BatchTokenizer`` tokenizes whole batches at once, sharded across worker processes each pinned to a block of ``CORES_PER_WORKER`` cores (a single BPE rayon pool is memory-bound and saturates ~8 cores). The aggregator buffers -per-sample text and flushes the batch once per publish tick and at drain. 
+per-sample text; a queue-owned live loop flushes through a bounded live lane +(default one shard) mid-run, and the end-of-run drain uses every shard. Sharding requires the fast (Rust) tokenizers backend; an environment without one is a startup error, never a silent slow path — ``--tokenizer-workers 0`` is the only (explicit) in-process mode. Platforms without CPU affinity (e.g. @@ -28,6 +29,7 @@ from __future__ import annotations import asyncio +import contextlib import json import logging import multiprocessing @@ -185,8 +187,10 @@ def __init__( *, cores_per_worker: int = CORES_PER_WORKER, n_workers: int = -1, + live_workers: int = 1, ) -> None: self._tokenizer_name = tokenizer_name + self._live_workers = live_workers self._fallback_warned: set[str] = set() self._tokenizer: PreTrainedTokenizerBase | None = None self._prefix_len = 0 @@ -324,19 +328,40 @@ async def count_texts_async( if not texts: return [] if self._procs: - chunks = _even_chunks(texts, len(self._procs)) - futures = [ - asyncio.wrap_future(ex.submit(_worker_encode_lengths, chunk)) - for ex, chunk in zip(self._procs, chunks, strict=False) - ] - results = await asyncio.gather(*futures) - return [n for r in results for n in r] + return await self._fan_out(self._procs, texts) if self._thread is None: raise RuntimeError("BatchTokenizer is closed") return await loop.run_in_executor( self._thread, self._encode_lengths_inproc, texts ) + async def count_texts_live_async( + self, texts: list[str], loop: asyncio.AbstractEventLoop + ) -> list[int]: + """Like ``count_texts_async``, bounded to the live lane. + + Mid-run flushes use only the last ``live_workers`` shards — the + highest core blocks, farthest from the loadgen's low cores — so live + token metrics never contend with the benchmark hot path. The + end-of-run drain uses every shard. 
+ """ + if not texts: + return [] + live_n = max(1, self._live_workers) + if self._procs and live_n < len(self._procs): + return await self._fan_out(self._procs[-live_n:], texts) + return await self.count_texts_async(texts, loop) + + @staticmethod + async def _fan_out(procs: list[ProcessPoolExecutor], texts: list[str]) -> list[int]: + chunks = _even_chunks(texts, len(procs)) + futures = [ + asyncio.wrap_future(ex.submit(_worker_encode_lengths, chunk)) + for ex, chunk in zip(procs, chunks, strict=False) + ] + results = await asyncio.gather(*futures) + return [n for r in results for n in r] + # -- sync + chat-template paths (in-process thread) --------------------- def _token_count_text(self, text: str) -> int: @@ -428,7 +453,13 @@ class TokenCounter(Protocol): async def count_texts_async( self, texts: list[str], loop: asyncio.AbstractEventLoop, / ) -> list[int]: - """Per-text token counts for a whole batch.""" + """Per-text token counts for a whole batch (full pool).""" + raise NotImplementedError + + async def count_texts_live_async( + self, texts: list[str], loop: asyncio.AbstractEventLoop, / + ) -> list[int]: + """Per-text token counts via the bounded live lane.""" raise NotImplementedError async def token_count_message_async( @@ -447,10 +478,11 @@ class TokenBatchQueue: """Buffers per-sample tokenization work and clears it in batches. Triggers call ``enqueue_text`` / ``enqueue_message`` at event time with an - ``on_count`` callback that records the resulting metric. The aggregator - flushes the buffer with ``flush`` once per publish tick (so live ISL/OSL/ - TPOT stay current) and with ``flush_remaining`` at end-of-run, sending the - whole batch through ``BatchTokenizer`` in one sharded call. + ``on_count`` callback that records the resulting metric. 
The queue owns + its own flush cadence: ``start_live`` begins a periodic flush through the + tokenizer's bounded live lane (so live ISL/OSL/TPOT stay current without + touching the benchmark's cores), and ``flush_remaining`` drains everything + left at end-of-run through every shard. ``pending`` counts enqueued-but-not-yet-recorded items; it is the ``n_pending_tasks`` on the snapshot. A non-zero value in the final snapshot @@ -465,10 +497,38 @@ def __init__( self._text: list[tuple[str, Callable[[int], None]]] = [] self._msg: list[tuple[MessageParts, Callable[[int], None]]] = [] self._inflight = 0 - # Serializes flushes so a periodic tick flush and the end-of-run flush - # never record the same item twice or race on the pending count. + self._live_task: asyncio.Task | None = None + # Serializes flushes so the periodic live flush and the end-of-run + # flush never record the same item twice or race on the pending count. self._lock = asyncio.Lock() + def start_live(self, interval_s: float) -> None: + """Begin the periodic live flush (idempotent). + + Failures are logged once and never interrupt the loop — unflushed + items stay visible as ``pending`` and the end-of-run drain picks + them up. + """ + if self._live_task is not None: + return + self._live_task = self._loop.create_task(self._live_flush_loop(interval_s)) + + async def _live_flush_loop(self, interval_s: float) -> None: + failure_logged = False + while True: + await asyncio.sleep(interval_s) + try: + await self.flush(live=True) + except Exception: # noqa: BLE001 — keep live metrics flowing. 
+ if not failure_logged: + failure_logged = True + logger.exception( + "live token flush failed; retrying each interval " + "(further failures logged at debug)" + ) + else: + logger.debug("live token flush failed again") + @property def pending(self) -> int: """Enqueued items not yet tokenized-and-recorded.""" @@ -484,15 +544,23 @@ def enqueue_message( self._inflight += 1 self._msg.append((parts, on_count)) - async def flush(self) -> None: + async def flush(self, live: bool = False) -> None: """Tokenize everything buffered so far and run each ``on_count``. - Items are detached from the buffer up front so concurrent enqueues land - in the next flush. ``_inflight`` is decremented only after a callback - runs, so a cancellation (drain timeout) or a tokenizer error leaves it - reflecting exactly the items that were not recorded — those surface as - ``pending`` (an incomplete drain), not as silently dropped samples. + ``live=True`` routes text batches through the tokenizer's bounded + live lane instead of the full shard pool, and re-queues items on + failure or cancellation so a mid-run hiccup never loses samples — the + end-of-run drain retries them. Drain-mode failures are terminal: the + un-recorded items stay counted in ``pending`` (``_inflight`` is + decremented only after a callback runs) and surface as an incomplete + drain, not as silently dropped samples. Items are detached from the + buffer up front so concurrent enqueues land in the next flush. 
""" + count_texts = ( + self._tokenizer.count_texts_live_async + if live + else self._tokenizer.count_texts_async + ) async with self._lock: if not (self._text or self._msg): return @@ -505,21 +573,34 @@ async def flush(self) -> None: failure: Exception | None = None if text_items: try: - counts = await self._tokenizer.count_texts_async( - [t for t, _ in text_items], self._loop - ) + counts = await count_texts([t for t, _ in text_items], self._loop) + except asyncio.CancelledError: + if live: + self._text[:0] = text_items + raise except Exception as exc: # noqa: BLE001 — isolate phases. failure = exc + if live: + # A live hiccup must not lose samples: give the items + # back so the end-of-run drain (full pool) retries. + # Drain failures are terminal and stay pending-only. + self._text[:0] = text_items else: for (_, on_count), count in zip(text_items, counts, strict=True): self._record(on_count, count) - for (content, reasoning, tool_calls), on_count in msg_items: + for i, ((content, reasoning, tool_calls), on_count) in enumerate(msg_items): try: count = await self._tokenizer.token_count_message_async( content, reasoning, tool_calls, self._loop ) + except asyncio.CancelledError: + if live: + self._msg[:0] = msg_items[i:] + raise except Exception as exc: # noqa: BLE001 — isolate items. failure = failure or exc + if live: + self._msg.append(((content, reasoning, tool_calls), on_count)) continue self._record(on_count, count) if failure is not None: @@ -538,11 +619,17 @@ def _record(self, on_count: Callable[[int], None], count: int) -> None: async def flush_remaining(self, timeout: float | None) -> int: """End-of-run flush, bounded by ``timeout`` seconds. + Stops the live flush loop, then drains through the full shard pool. Returns the number of items still un-tokenized — non-zero if the budget was exhausted (``timeout`` reached) or tokenization failed. ``None`` waits indefinitely. 
Never raises: a failure here must not stop the aggregator from publishing the (incomplete) final snapshot. """ + if self._live_task is not None: + self._live_task.cancel() + with contextlib.suppress(asyncio.CancelledError): + await self._live_task + self._live_task = None if self._inflight == 0: return 0 try: diff --git a/src/inference_endpoint/config/schema.py b/src/inference_endpoint/config/schema.py index 0a59074f5..6a8b9b872 100644 --- a/src/inference_endpoint/config/schema.py +++ b/src/inference_endpoint/config/schema.py @@ -584,11 +584,11 @@ class DrainConfig(BaseModel): ), ), ] = Field( - 60.0, + 300.0, ge=0, description=( "Wall-clock budget (seconds) to finish tokenizing buffered samples " - "after ENDED (default: 60.0; 0 = unlimited)." + "after ENDED (default: 300.0; 0 = unlimited)." ), ) diff --git a/src/inference_endpoint/config/templates/concurrency_template_full.yaml b/src/inference_endpoint/config/templates/concurrency_template_full.yaml index 75feab6fb..5132f5b0e 100644 --- a/src/inference_endpoint/config/templates/concurrency_template_full.yaml +++ b/src/inference_endpoint/config/templates/concurrency_template_full.yaml @@ -79,7 +79,7 @@ settings: warmup_timeout_s: 240.0 # Warmup drain timeout in seconds (None = wait indefinitely) performance_timeout_s: 240.0 # Performance drain timeout in seconds (None = wait indefinitely) accuracy_timeout_s: null # Accuracy drain timeout in seconds (None = wait indefinitely) - metrics_drain_timeout_s: 60.0 # Wall-clock budget (seconds) to finish tokenizing buffered samples after ENDED (default: 60.0; 0 = unlimited). + metrics_drain_timeout_s: 300.0 # Wall-clock budget (seconds) to finish tokenizing buffered samples after ENDED (default: 300.0; 0 = unlimited). 
warmup: enabled: false # Enable warmup phase before performance run n_requests: null # Warmup request count (None = full dataset once) diff --git a/src/inference_endpoint/config/templates/offline_template_full.yaml b/src/inference_endpoint/config/templates/offline_template_full.yaml index 3ff1ccd17..e3ec95284 100644 --- a/src/inference_endpoint/config/templates/offline_template_full.yaml +++ b/src/inference_endpoint/config/templates/offline_template_full.yaml @@ -79,7 +79,7 @@ settings: warmup_timeout_s: 240.0 # Warmup drain timeout in seconds (None = wait indefinitely) performance_timeout_s: 240.0 # Performance drain timeout in seconds (None = wait indefinitely) accuracy_timeout_s: null # Accuracy drain timeout in seconds (None = wait indefinitely) - metrics_drain_timeout_s: 60.0 # Wall-clock budget (seconds) to finish tokenizing buffered samples after ENDED (default: 60.0; 0 = unlimited). + metrics_drain_timeout_s: 300.0 # Wall-clock budget (seconds) to finish tokenizing buffered samples after ENDED (default: 300.0; 0 = unlimited). warmup: enabled: false # Enable warmup phase before performance run n_requests: null # Warmup request count (None = full dataset once) diff --git a/src/inference_endpoint/config/templates/online_template_full.yaml b/src/inference_endpoint/config/templates/online_template_full.yaml index 1287b99af..73c0b69d4 100644 --- a/src/inference_endpoint/config/templates/online_template_full.yaml +++ b/src/inference_endpoint/config/templates/online_template_full.yaml @@ -79,7 +79,7 @@ settings: warmup_timeout_s: 240.0 # Warmup drain timeout in seconds (None = wait indefinitely) performance_timeout_s: 240.0 # Performance drain timeout in seconds (None = wait indefinitely) accuracy_timeout_s: null # Accuracy drain timeout in seconds (None = wait indefinitely) - metrics_drain_timeout_s: 60.0 # Wall-clock budget (seconds) to finish tokenizing buffered samples after ENDED (default: 60.0; 0 = unlimited). 
+ metrics_drain_timeout_s: 300.0 # Wall-clock budget (seconds) to finish tokenizing buffered samples after ENDED (default: 300.0; 0 = unlimited). warmup: enabled: false # Enable warmup phase before performance run n_requests: null # Warmup request count (None = full dataset once) diff --git a/tests/unit/async_utils/services/metrics_aggregator/conftest.py b/tests/unit/async_utils/services/metrics_aggregator/conftest.py index 51d25565a..05b68d3ee 100644 --- a/tests/unit/async_utils/services/metrics_aggregator/conftest.py +++ b/tests/unit/async_utils/services/metrics_aggregator/conftest.py @@ -67,6 +67,11 @@ async def count_texts_async( await asyncio.sleep(self._delay) return [len(t.split()) for t in texts] + async def count_texts_live_async( + self, texts: list[str], loop: asyncio.AbstractEventLoop + ) -> list[int]: + return await self.count_texts_async(texts, loop) + async def token_count_message_async( self, content: str, diff --git a/tests/unit/async_utils/services/metrics_aggregator/test_aggregator.py b/tests/unit/async_utils/services/metrics_aggregator/test_aggregator.py index 3337b168b..bae2a0aa5 100644 --- a/tests/unit/async_utils/services/metrics_aggregator/test_aggregator.py +++ b/tests/unit/async_utils/services/metrics_aggregator/test_aggregator.py @@ -332,7 +332,7 @@ async def test_non_streaming_latency_only(self, tmp_path): ), ] ) - await agg._flush_tokens() + await agg._token_queue.flush() # sample_latency = 3000-1000 = 2000 assert ( snapshot_series_total( @@ -796,7 +796,7 @@ async def test_isl_text_path_async(self, tmp_path): ] ) # ISL task is in-flight; drain it - await agg._flush_tokens() + await agg._token_queue.flush() assert snapshot_series_total(registry, MetricSeriesKey.ISL.value) == 4 finally: agg.close() @@ -825,7 +825,7 @@ async def test_osl_emitted_on_complete(self, tmp_path): ), ] ) - await agg._flush_tokens() + await agg._token_queue.flush() # sample_latency_ns = 5000-1000 = 4000 assert ( snapshot_series_total( @@ -864,7 +864,7 @@ async def 
test_tpot_emitted_for_streaming(self, tmp_path): ), ] ) - await agg._flush_tokens() + await agg._token_queue.flush() # OSL = "hello world foo" = 3 tokens assert snapshot_series_total(registry, MetricSeriesKey.OSL.value) == 3 # tpot = (5000 - 2000) / token_count("world foo") = 3000 / 2 = 1500 @@ -900,7 +900,7 @@ async def test_tpot_skipped_when_single_chunk(self, tmp_path): ), ] ) - await agg._flush_tokens() + await agg._token_queue.flush() assert snapshot_series_total(registry, MetricSeriesKey.OSL.value) == 1 assert ( snapshot_series_count(registry, MetricSeriesKey.TPOT_NS.value) == 0 @@ -939,7 +939,7 @@ async def test_tpot_not_emitted_without_streaming_flag(self, tmp_path): ), ] ) - await agg._flush_tokens() + await agg._token_queue.flush() # sample_latency / OSL still emitted in non-streaming mode. assert ( snapshot_series_total( @@ -981,7 +981,7 @@ async def test_tpot_non_streaming_output_skipped(self, tmp_path): ), ] ) - await agg._flush_tokens() + await agg._token_queue.flush() assert snapshot_series_total(registry, MetricSeriesKey.OSL.value) == 3 assert ( snapshot_series_count(registry, MetricSeriesKey.TPOT_NS.value) == 0 @@ -1016,7 +1016,7 @@ async def test_flush_records_buffered_tokenizations(self, tmp_path): # Enqueued by fire(), not yet tokenized (no tick/drain flush). 
assert agg._token_queue.pending > 0 - await agg._flush_tokens() + await agg._token_queue.flush() assert agg._token_queue.pending == 0 assert snapshot_series_total(registry, MetricSeriesKey.ISL.value) == 5 finally: @@ -1185,7 +1185,7 @@ async def test_tpot_osl_for_tool_call_complete(self, tmp_path): ), ] ) - await agg._flush_tokens() + await agg._token_queue.flush() # OSL = token_count("ok" + tool_calls_json) = 2 assert snapshot_series_total(registry, MetricSeriesKey.OSL.value) == 2 # tpot = (5000 - 2000) / token_count(tool_calls_json) = 3000 / 1 = 3000 diff --git a/tests/unit/async_utils/services/metrics_aggregator/test_publisher.py b/tests/unit/async_utils/services/metrics_aggregator/test_publisher.py index 1d540ddec..9e26f734a 100644 --- a/tests/unit/async_utils/services/metrics_aggregator/test_publisher.py +++ b/tests/unit/async_utils/services/metrics_aggregator/test_publisher.py @@ -81,92 +81,6 @@ def get_runtime_state() -> tuple[SessionState, int]: finally: publisher.close() - @pytest.mark.asyncio - async def test_pre_publish_runs_before_each_tick_snapshot( - self, tmp_path: Path, zmq_ctx_scope: ManagedZMQContext - ): - """pre_publish is awaited before the runtime state is captured.""" - loop = asyncio.get_event_loop() - publisher = MetricsPublisher( - MetricsSnapshotCodec(), - zmq_ctx_scope, - "test_pub_pre", - loop, - final_snapshot_path=tmp_path / "final_snapshot.json", - ) - try: - registry = MetricsRegistry() - registry.register_counter("c") - order: list[str] = [] - - async def pre_publish() -> None: - order.append("flush") - - def get_runtime_state() -> tuple[SessionState, int]: - order.append("state") - return SessionState.LIVE, 0 - - publisher.start( - registry, - publish_interval_s=0.01, - get_runtime_state=get_runtime_state, - pre_publish=pre_publish, - ) - await asyncio.sleep(0.05) - assert order, "no tick ran" - # Every state capture is preceded by a flush in the same tick. 
- assert order[0] == "flush" - for i, entry in enumerate(order): - if entry == "state": - assert order[i - 1] == "flush" - finally: - publisher.close() - - @pytest.mark.asyncio - async def test_pre_publish_failure_keeps_ticking( - self, tmp_path: Path, zmq_ctx_scope: ManagedZMQContext - ): - """A raising pre_publish is swallowed by the tick; ticks continue.""" - loop = asyncio.get_event_loop() - publisher = MetricsPublisher( - MetricsSnapshotCodec(), - zmq_ctx_scope, - "test_pub_pre_fail", - loop, - final_snapshot_path=tmp_path / "final_snapshot.json", - ) - try: - registry = MetricsRegistry() - registry.register_counter("c") - attempts = 0 - published_states = 0 - - async def pre_publish() -> None: - nonlocal attempts - attempts += 1 - raise RuntimeError("tokenizer hiccup") - - def get_runtime_state() -> tuple[SessionState, int]: - nonlocal published_states - published_states += 1 - return SessionState.LIVE, 0 - - publisher.start( - registry, - publish_interval_s=0.01, - get_runtime_state=get_runtime_state, - pre_publish=pre_publish, - ) - await asyncio.sleep(0.08) - assert attempts >= 2, "tick task died after a pre_publish failure" - # The failure must not suppress the snapshot: every failing tick - # still proceeds to capture state and publish. 
- assert published_states >= 2, "failing pre_publish suppressed publishing" - assert publisher._tick_task is not None - assert not publisher._tick_task.done() - finally: - publisher.close() - @pytest.mark.asyncio async def test_publish_final_writes_json_atomically( self, tmp_path: Path, zmq_ctx_scope: ManagedZMQContext diff --git a/tests/unit/async_utils/services/metrics_aggregator/test_token_metrics.py b/tests/unit/async_utils/services/metrics_aggregator/test_token_metrics.py index 1bee9ba7f..2588e16cd 100644 --- a/tests/unit/async_utils/services/metrics_aggregator/test_token_metrics.py +++ b/tests/unit/async_utils/services/metrics_aggregator/test_token_metrics.py @@ -343,6 +343,86 @@ def submit(self, fn, *args): self._make(monkeypatch, 16, -1, executor=_BrokenWarmup) +class _RecordingProc(_FakeProc): + """_FakeProc that records the chunks submitted to it.""" + + def __init__(self): + self.chunks = [] + + def submit(self, _fn, chunk): + self.chunks.append(list(chunk)) + return super().submit(_fn, chunk) + + +@pytest.mark.unit +class TestLiveLane: + @pytest.mark.asyncio + async def test_live_uses_only_the_last_shards(self): + """Mid-run flushes stay off the low core blocks (loadgen side).""" + with patch(_MOCK_TARGET, _FakeTokenizer): + loop = asyncio.get_running_loop() + with BatchTokenizer("fake", n_workers=0, live_workers=1) as tok: + procs = [_RecordingProc(), _RecordingProc(), _RecordingProc()] + tok._procs = procs + counts = await tok.count_texts_live_async(["a b", "c"], loop) + assert counts == [2, 1] + assert procs[0].chunks == [] and procs[1].chunks == [] + assert procs[2].chunks == [["a b", "c"]] + + @pytest.mark.asyncio + async def test_drain_uses_every_shard(self): + with patch(_MOCK_TARGET, _FakeTokenizer): + loop = asyncio.get_running_loop() + with BatchTokenizer("fake", n_workers=0, live_workers=1) as tok: + procs = [_RecordingProc(), _RecordingProc()] + tok._procs = procs + await tok.count_texts_async(["a", "b", "c", "d"], loop) + assert 
all(p.chunks for p in procs) + + +@pytest.mark.unit +@pytest.mark.asyncio +class TestQueueLiveLoop: + async def test_start_live_flushes_periodically(self): + loop = asyncio.get_running_loop() + queue = TokenBatchQueue(_CapturingTokenizer(), loop) + recorded: list[int] = [] + queue.enqueue_text("a b c", recorded.append) + queue.start_live(0.01) + queue.start_live(0.01) # idempotent + await asyncio.sleep(0.05) + assert recorded == [3] + assert queue.pending == 0 + await queue.flush_remaining(timeout=1.0) + + async def test_live_loop_survives_tokenizer_failure(self): + class _FailingLive(_CapturingTokenizer): + async def count_texts_live_async(self, texts, _loop): + raise RuntimeError("live lane boom") + + loop = asyncio.get_running_loop() + queue = TokenBatchQueue(_FailingLive(), loop) + recorded: list[int] = [] + queue.enqueue_text("a b", recorded.append) + queue.start_live(0.01) + await asyncio.sleep(0.05) + assert recorded == [] + assert queue.pending == 1, "failed live flush must keep items pending" + assert queue._live_task is not None and not queue._live_task.done() + # The end-of-run drain (full pool) still recovers the items. 
+ assert await queue.flush_remaining(timeout=1.0) == 0 + assert recorded == [2] + + async def test_flush_remaining_stops_live_loop(self): + loop = asyncio.get_running_loop() + queue = TokenBatchQueue(_CapturingTokenizer(), loop) + queue.start_live(0.01) + task = queue._live_task + await queue.flush_remaining(timeout=1.0) + assert queue._live_task is None + assert task is not None and task.cancelled() + + @pytest.mark.unit class TestEvenChunks: def test_splits_into_near_equal_chunks(self): @@ -370,6 +450,9 @@ class _CapturingTokenizer: async def count_texts_async(self, texts, _loop): return [len(t.split()) for t in texts] + async def count_texts_live_async(self, texts, _loop): + return await self.count_texts_async(texts, _loop) + async def token_count_message_async(self, content, reasoning, tool_calls, _loop): parts = [p for p in (content, reasoning) if p] return len(" ".join(parts).split()) + (len(tool_calls) if tool_calls else 0) diff --git a/tests/unit/commands/test_benchmark.py b/tests/unit/commands/test_benchmark.py index 969f22ce2..9da2dcf56 100644 --- a/tests/unit/commands/test_benchmark.py +++ b/tests/unit/commands/test_benchmark.py @@ -489,7 +489,7 @@ def test_defaults(self): assert cfg.warmup_timeout_s == 240.0 assert cfg.performance_timeout_s == 240.0 assert cfg.accuracy_timeout_s is None - assert cfg.metrics_drain_timeout_s == 60.0 + assert cfg.metrics_drain_timeout_s == 300.0 @pytest.mark.unit @pytest.mark.parametrize( From aed6b78aa2ba19f63bf1eb789e25fc4a528ec4ed Mon Sep 17 00:00:00 2001 From: Viraat Chandra Date: Wed, 10 Jun 2026 16:30:03 -0700 Subject: [PATCH 10/20] fix(metrics): bound live flushes; align defaults; audit-driven hardening Workstreams from the full design audit: - Live flushes take at most _LIVE_FLUSH_MAX_ITEMS per kind: bounds the queue-lock hold time, the unstoppable in-flight thread encode left behind by a drain-start cancellation (close(wait=True) is now bounded by ~one slice), and the drain's re-encode of requeued items. 
- BatchTokenizer live_workers ctor default aligned to 2 (the CLI default); the aggregator class drain-timeout default aligned to 300s (the CLI default); --tokenizer-workers < 0 rejected at startup. - A failed restore of the inherited CPU mask is logged instead of silently leaving the aggregator expanded. - Comment/docstring hygiene: removed prior-implementation narration and stale shard-lane/warmup-degrade/publish-tick wording; SIGTERM-only phrasing in publisher docs. - Tests: shard-decision suite no longer issues real sched_setaffinity syscalls (probes and restore are patched and asserted); live lane pinned as in-process-only; new coverage for RAYON caps (ctor, operator override, per-shard block override), live flush slice cap, live cancellation/message-failure requeue, and STARTED arming the live loop with ENDED stopping it; live-method aliases on all stubs. - DESIGN.md rewritten for the final shape (in-process live lane, drain-only auto-sized shards, probe-and-restore affinity, requeue semantics, diagram + CLI table); services overview and AGENTS.md row aligned. 345 unit tests pass; pre-commit clean. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- AGENTS.md | 2 +- docs/async_utils/services/DESIGN.md | 2 +- .../services/metrics_aggregator/DESIGN.md | 101 +++++++------- .../services/metrics_aggregator/__main__.py | 43 ++---- .../services/metrics_aggregator/aggregator.py | 9 +- .../services/metrics_aggregator/publisher.py | 6 +- .../metrics_aggregator/token_metrics.py | 118 +++++++++++----- .../services/metrics_aggregator/conftest.py | 2 + .../metrics_aggregator/test_aggregator.py | 29 ++++ .../metrics_aggregator/test_token_metrics.py | 130 +++++++++++++++++- 10 files changed, 310 insertions(+), 132 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index dbc8ce953..e6182d198 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -204,7 +204,7 @@ src/inference_endpoint/ │ │ ├── publisher.py # MetricsPublisher (tick task + atomic disk fallback) │ │ ├── subscriber.py # MetricsSnapshotSubscriber (latest + COMPLETE snapshot capture) │ │ ├── metrics_table.py # In-flight sample rows + trigger dispatch (TTFT/TPOT/ISL/OSL) -│ │ └── token_metrics.py # BatchTokenizer (sharded batch tokenization) + TokenBatchQueue (defer-to-flush buffer) for ISL/OSL/TPOT +│ │ └── token_metrics.py # BatchTokenizer (live thread lane + drain-only sharded pool) + TokenBatchQueue (defer-to-flush buffer, owns the live flush loop) for ISL/OSL/TPOT │ └── transport/ # ZMQ-based IPC transport layer │ ├── protocol.py # Transport protocols + TransportConfig + MessageCodec[T] │ └── zmq/ # ZMQ implementation (context, pubsub, transport, ZMQTransportConfig) diff --git a/docs/async_utils/services/DESIGN.md b/docs/async_utils/services/DESIGN.md index e12eb8a4d..e013b4ea1 100644 --- a/docs/async_utils/services/DESIGN.md +++ b/docs/async_utils/services/DESIGN.md @@ -306,7 +306,7 @@ stateDiagram-v2 ### 6.2 Metrics aggregator -- **Role**: Subscribes to EventRecords and derives real-time metrics (e.g. TTFT, sample latency, token counts). 
Token metrics (ISL/OSL/TPOT) are computed by a batched, process-sharded tokenizer — see [metrics_aggregator/DESIGN.md](metrics_aggregator/DESIGN.md). Shuts down on **session.ended**. +- **Role**: Subscribes to EventRecords and derives real-time metrics (e.g. TTFT, sample latency, token counts). Token metrics (ISL/OSL/TPOT) are computed by a batched tokenizer (in-process threads live; process-sharded end-of-run drain) — see [metrics_aggregator/DESIGN.md](metrics_aggregator/DESIGN.md). Shuts down on **session.ended**. - **Outputs**: Live `MetricsSnapshot` frames over an IPC PUB socket, and an atomically written `final_snapshot.json` (the primary Report source). Planned is to push real time metrics to Prometheus via PushGateway. - **Process**: Run as a **subprocess**; given `--metrics-output-dir`, `--socket-dir`, `--socket-name`, `--metrics-socket`, and optional tokenizer options. Uses a dedicated event loop and `ManagedZMQContext.scoped(socket_dir=...)` so it can connect to the publisher's IPC address. diff --git a/docs/async_utils/services/metrics_aggregator/DESIGN.md b/docs/async_utils/services/metrics_aggregator/DESIGN.md index 707ded06d..882683968 100644 --- a/docs/async_utils/services/metrics_aggregator/DESIGN.md +++ b/docs/async_utils/services/metrics_aggregator/DESIGN.md @@ -17,16 +17,16 @@ high-completion-rate runs. 
## Module Layout -| File | Purpose | -| ------------------ | ------------------------------------------------------------------------- | -| `__main__.py` | Subprocess entry: argparse, affinity expansion, lifecycle wiring, SIGTERM | -| `aggregator.py` | `MetricsAggregatorService` — event router, session state, drain | -| `registry.py` | `MetricsRegistry`, `CounterSampler`, `SeriesSampler` | -| `snapshot.py` | `MetricsSnapshot` wire schema, `SessionState`, msgpack codec | -| `publisher.py` | `MetricsPublisher` — tick task + atomic final-snapshot write | -| `subscriber.py` | `MetricsSnapshotSubscriber` — main-process consumer | -| `metrics_table.py` | In-flight sample rows + trigger dispatch (TTFT/TPOT/ISL/OSL) | -| `token_metrics.py` | `BatchTokenizer` (sharded batch tokenization) + `TokenBatchQueue` | +| File | Purpose | +| ------------------ | ------------------------------------------------------------------------------- | +| `__main__.py` | Subprocess entry: argparse, strict tokenizer startup, lifecycle wiring, SIGTERM | +| `aggregator.py` | `MetricsAggregatorService` — event router, session state, drain | +| `registry.py` | `MetricsRegistry`, `CounterSampler`, `SeriesSampler` | +| `snapshot.py` | `MetricsSnapshot` wire schema, `SessionState`, msgpack codec | +| `publisher.py` | `MetricsPublisher` — tick task + atomic final-snapshot write | +| `subscriber.py` | `MetricsSnapshotSubscriber` — main-process consumer | +| `metrics_table.py` | In-flight sample rows + trigger dispatch (TTFT/TPOT/ISL/OSL) | +| `token_metrics.py` | `BatchTokenizer` (live thread lane + drain-only shards) + `TokenBatchQueue` | ## Lifecycle @@ -64,12 +64,15 @@ to a buffer, an O(1) operation with no event-loop tasks. The buffer is cleared in batches at exactly two points: 1.
**The queue's own live loop** — `start_live(interval)` flushes - periodically (at the publish cadence) through the tokenizer's **bounded - live lane**: the last `--live-tokenizers` shards (default 1 — the highest - core block, farthest from the loadgen's low cores), so live ISL/OSL/TPOT - stay current without touching the benchmark hot path. `--live-tokenizers -0` disables mid-run tokenization entirely. Failures are logged once and - never stop the loop. + periodically (at the publish cadence) through the tokenizer's **in-process + live lane**: a small thread pool of `--tokenizer-workers` threads + (default 2) whose rayon pool is capped to the same width, taking at most + `_LIVE_FLUSH_MAX_ITEMS` per flush so the queue lock is never held for a + long encode. Live flushes never touch the shard processes; they run inside + the aggregator process, wherever the parent placed it. + `--tokenizer-workers 0` disables mid-run tokenization entirely. Failures + are logged once and never stop the loop — failed or cancelled live items + are **re-queued** so the drain retries them. 2. **End-of-run** — `flush_remaining(timeout)` stops the live loop and drains everything still buffered through **every** shard, bounded by the drain budget. The publisher knows nothing about tokenization — it only reads @@ -78,15 +81,17 @@ in batches at exactly two points: `flush()` serializes under an asyncio lock and detaches the buffer up front, so enqueues that race a flush land in the next one. Failure isolation is layered: the plain-text phase and the chat-template phase fail independently -(they run on separate executors, so a dead text shard must not drop message -items), a raising recorder callback is logged without aborting the rest of -the batch, and the first error is re-raised only after both phases ran. -`flush_remaining` never raises — a timeout or tokenizer failure becomes a -logged, non-zero pending count. 
+(in drain mode they run on separate executors, so a dead text shard must not +drop message items), a raising recorder callback is logged without aborting +the rest of the batch, and the first error is re-raised only after both +phases ran. Live-mode failures and cancellations re-queue the detached items +(a mid-run hiccup never loses samples); drain-mode failures are terminal — +the items stay counted in `pending`. `flush_remaining` never raises — a +timeout or tokenizer failure becomes a logged, non-zero pending count. ### Sharded batch encoding (`BatchTokenizer`) -A flush hands the whole buffer to `count_texts_async`, which splits it into +The end-of-run drain hands the whole buffer to `count_texts_async`, which splits it into contiguous chunks and fans them out across worker **processes**, one pinned to each block of `CORES_PER_WORKER` (8) cores. Why this shape: @@ -104,16 +109,16 @@ each block of `CORES_PER_WORKER` (8) cores. Why this shape: a wedge), and they ignore SIGINT — Ctrl-C goes to the whole process group, and worker lifetime must stay under the parent drain's control. -`--tokenizer-workers` controls the shard count: `-1` (default) auto-fits one -shard per 8-core block of the process affinity mask (always at least one), an -explicit count is clamped to that capacity, and `0` explicitly selects -in-process tokenization. There is no implicit fallback: an environment that -cannot shard — no fast Rust backend, a failed or over-budget warmup — is a -startup error, because a silent in-process slow path cannot keep up with -completions and would surface much later as an incomplete drain. Platforms -without a CPU-affinity API (e.g. macOS) still shard at full speed, just -unpinned: blocks are sized from the online CPU count and each worker caps its -rayon pool to the block size instead of pinning. +The shard pool has no CLI knob: it always auto-sizes to one shard per +8-core block of the allowed CPU universe (always at least one). 
+`--tokenizer-workers` sizes the **live** in-process thread lane instead +(default 2; `0` = no mid-run tokenization). There is no implicit fallback: an +environment that cannot shard — no fast Rust backend, a failed or over-budget +warmup — is a startup error, because a silent in-process slow path cannot +keep up with completions and would surface much later as an incomplete drain. +Platforms without a CPU-affinity API (e.g. macOS) still shard at full speed, +just unpinned: blocks are sized from the online CPU count and each worker +caps its rayon pool to the block size instead of pinning. Chat-template items (tool-call outputs) take a separate in-process thread: they are rare relative to the batched flush, and `apply_chat_template` is @@ -123,14 +128,17 @@ is counted. ### CPU affinity: the tokenizer stage is post-run -The benchmark parent pins itself to the loadgen cores before launching -services, and subprocesses inherit that narrow mask. The tokenizer's heavy -work happens **after** the run (the end-of-run flush), so the run-time core -partition does not apply to it: at startup the service calls -`expand_to_all_online_cpus()` (see `endpoint_client/cpu_affinity.py`) to reset -its mask to every online CPU — the kernel still clamps to the cgroup/Slurm -cpuset — and shards size to the full machine. Mid-run tick flushes are small -batches; the drain is where the core count pays. +The benchmark parent pins itself to the loadgen cores (the fastest +perf-ranked physical cores) before launching services, and subprocesses +inherit that narrow mask. 
The tokenizer's heavy work happens **after** the +run, so the run-time core partition does not apply to it — but the aggregator +itself must not move: `_setup_shards` probes the full allowed universe via +`expand_to_all_online_cpus()` (see `endpoint_client/cpu_affinity.py`; the +kernel still clamps to the cgroup/Slurm cpuset) **and then restores the +inherited mask**, so the event loop, the publisher, and the live tokenizer +threads stay exactly where the parent placed them. Only the drain-phase shard +children, which pin themselves to their own 8-core blocks, span the whole +machine — and they are idle until `ENDED`. ### The `n_pending_tasks` contract @@ -151,11 +159,11 @@ as a clean run. ``` COMPLETE event ─► TokenTrigger.fire ─► queue.enqueue(text, on_count) [O(1)] │ - live loop (0.25 s, live lane) ─────┤ flush() - ENDED drain (budgeted) ─────────────┘ │ - ├─► chunks ─► N pinned worker procs - │ (encode_batch_fast) - └─► on_count(n) ─► registry.record() + live loop (0.25 s) ── flush(live) ───────┤─► in-process thread pool + │ (rayon capped to --tokenizer-workers) + ENDED drain (budgeted) ── flush() ───────┘─► chunks ─► N pinned worker procs + │ (encode_batch_fast) + └─► on_count(n) ─► registry.record() ``` ## CLI Interface @@ -168,8 +176,7 @@ COMPLETE event ─► TokenTrigger.fire ─► queue.enqueue(text, on_count) [ | `--publish-interval` | 0.25 | Live snapshot cadence (seconds) | | `--drain-timeout` | 300.0 | End-of-run tokenize budget (`0` = unlimited) | | `--tokenizer` | none | HF name or local path; unset disables token metrics | -| `--tokenizer-workers` | -1 | Shard processes (`-1` auto, `0` in-process) | -| `--live-tokenizers` | 1 | Shards for mid-run live flushes (`0` = defer all) | +| `--tokenizer-workers` | 2 | Live in-process threads (`0` = defer all to drain) | | `--streaming` | off | Register TTFT/chunk-delta/TPOT triggers | ## References diff --git a/src/inference_endpoint/async_utils/services/metrics_aggregator/__main__.py 
b/src/inference_endpoint/async_utils/services/metrics_aggregator/__main__.py index 628811a36..0d5be495e 100644 --- a/src/inference_endpoint/async_utils/services/metrics_aggregator/__main__.py +++ b/src/inference_endpoint/async_utils/services/metrics_aggregator/__main__.py @@ -26,10 +26,6 @@ from inference_endpoint.async_utils.loop_manager import LoopManager from inference_endpoint.async_utils.transport.zmq.context import ManagedZMQContext from inference_endpoint.async_utils.transport.zmq.ready_check import send_ready_signal -from inference_endpoint.endpoint_client.cpu_affinity import ( - UnsupportedPlatformError, - expand_to_all_online_cpus, -) from inference_endpoint.utils.logging import setup_logging from .aggregator import MetricCounterKey, MetricsAggregatorService @@ -166,23 +162,12 @@ async def main() -> None: parser.add_argument( "--tokenizer-workers", type=int, - default=-1, + default=2, help=( - "Number of tokenizer shard processes (-1 = auto: one per " - "8-core block of this machine, minimum one; 0 = explicit " - "in-process tokenization). A tokenizer without a fast (Rust) " - "backend is a startup error unless 0 is passed; platforms " - "without CPU affinity (e.g. macOS) shard unpinned." - ), - ) - parser.add_argument( - "--live-tokenizers", - type=int, - default=1, - help=( - "Shards used for mid-run (live) token-metric flushes (default: 1 " - "— the highest core block, away from the loadgen's cores; 0 = no " - "mid-run tokenization, everything defers to the end-of-run drain)." + "In-process tokenizer threads for live (mid-run) ISL/OSL/TPOT " + "(default: 2; 0 = no mid-run tokenization, everything defers " + "to the end-of-run drain). The drain always uses the auto-sized " + "sharded pool — one worker process per 8-core block." 
), ) parser.add_argument( @@ -206,6 +191,9 @@ async def main() -> None: args = parser.parse_args() setup_logging(level="INFO") + if args.tokenizer_workers < 0: + raise SystemExit("FATAL: --tokenizer-workers must be >= 0") + # The parent owns directory setup — `commands/benchmark/execute.py` # creates `/metrics/` and validates it before launching # this subprocess. Validate here as a fail-fast contract check so a @@ -227,20 +215,9 @@ async def main() -> None: # (coalesces to 'object' not 'AbstractContextManager[BatchTokenizer | None]') tokenizer_cm: AbstractContextManager[BatchTokenizer | None] if args.tokenizer: - # Tokenization drains after the benchmark run, so the loadgen/worker - # affinity partition does not apply to this stage: drop the narrow - # mask inherited from the pinned parent so shards size to the whole - # machine (cgroup/Slurm CPU limits still apply). - try: - cpus = expand_to_all_online_cpus() - logger.info("metrics aggregator affinity: %d CPUs", len(cpus)) - except UnsupportedPlatformError: - pass # non-Linux: no inherited pin to undo. 
try: tokenizer_cm = BatchTokenizer( - args.tokenizer, - n_workers=args.tokenizer_workers, - live_workers=args.live_tokenizers, + args.tokenizer, live_workers=args.tokenizer_workers ) except RuntimeError as exc: # Fail-fast contract: a tokenizer environment that cannot shard @@ -275,7 +252,7 @@ async def main() -> None: n_histogram_buckets=args.n_histogram_buckets, tokenizer=tokenizer, live_flush_interval_s=( - args.publish_interval if args.live_tokenizers > 0 else None + args.publish_interval if args.tokenizer_workers > 0 else None ), streaming=args.streaming, shutdown_event=shutdown_event, diff --git a/src/inference_endpoint/async_utils/services/metrics_aggregator/aggregator.py b/src/inference_endpoint/async_utils/services/metrics_aggregator/aggregator.py index cb2db5878..14bb28189 100644 --- a/src/inference_endpoint/async_utils/services/metrics_aggregator/aggregator.py +++ b/src/inference_endpoint/async_utils/services/metrics_aggregator/aggregator.py @@ -96,7 +96,7 @@ class MetricCounterKey(str, Enum): _TOKEN_HDR_LOW: Final[int] = 1 _TOKEN_HDR_HIGH: Final[int] = 10_000_000 # 10M tokens -_DEFAULT_DRAIN_TIMEOUT_S: Final[float] = 60.0 +_DEFAULT_DRAIN_TIMEOUT_S: Final[float] = 300.0 class MetricsAggregatorService(ZmqMessageSubscriber[EventRecord]): @@ -134,9 +134,10 @@ def __init__( self._registry = registry self._publisher = publisher self._publish_interval_s = publish_interval_s - # Token triggers enqueue onto this queue; it is flushed in batches at - # each publish tick and at end-of-run. None when no tokenizer is set - # (token metrics disabled), in which case those triggers are no-ops. + # Token triggers enqueue onto this queue; it is flushed by the + # queue's own live loop (start_live) and by the end-of-run drain. + # None when no tokenizer is set (token metrics disabled), in which + # case those triggers are no-ops. 
self._token_queue: TokenBatchQueue | None = ( TokenBatchQueue(tokenizer, self.loop) if tokenizer is not None else None ) diff --git a/src/inference_endpoint/async_utils/services/metrics_aggregator/publisher.py b/src/inference_endpoint/async_utils/services/metrics_aggregator/publisher.py index c90ca11fc..578e47198 100644 --- a/src/inference_endpoint/async_utils/services/metrics_aggregator/publisher.py +++ b/src/inference_endpoint/async_utils/services/metrics_aggregator/publisher.py @@ -88,7 +88,7 @@ def __init__( self._final_snapshot_path = final_snapshot_path self._tick_task: asyncio.Task | None = None self._closed = False - # publish_final is idempotent: the SIGTERM/SIGINT handler in + # publish_final is idempotent: the SIGTERM handler in # __main__.py and the aggregator's ENDED-driven path can both # call it; the second call must not re-publish or re-write. self._finalized = False @@ -164,7 +164,7 @@ async def publish_final( Report consumers as ``state == COMPLETE and n_pending_tasks > 0``. ``interrupted=True`` is set by the signal handler in __main__.py - when SIGTERM/SIGINT triggers shutdown before ``ENDED`` arrived; + when SIGTERM triggers shutdown before ``ENDED`` arrived; the resulting snapshot is tagged ``state=INTERRUPTED`` so Report can distinguish "user killed the run mid-execution" from a clean end. Stats in an INTERRUPTED snapshot are best-effort partial @@ -190,7 +190,7 @@ async def publish_final( of the terminal state as the last message). Idempotent: only the first call writes/publishes; subsequent - calls early-return. The SIGTERM/SIGINT handler relies on this to + calls early-return. The SIGTERM handler relies on this to race safely with the ENDED-driven path. 
""" if self._finalized: diff --git a/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py b/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py index 7723d84d4..dc2736e5d 100644 --- a/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py +++ b/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py @@ -18,12 +18,12 @@ ``BatchTokenizer`` tokenizes whole batches at once, sharded across worker processes each pinned to a block of ``CORES_PER_WORKER`` cores (a single BPE rayon pool is memory-bound and saturates ~8 cores). The aggregator buffers -per-sample text; a queue-owned live loop flushes through a bounded live lane -(default one shard) mid-run, and the end-of-run drain uses every shard. -Sharding requires the fast (Rust) tokenizers backend; an environment without -one is a startup error, never a silent slow path — ``--tokenizer-workers 0`` -is the only (explicit) in-process mode. Platforms without CPU affinity (e.g. -macOS) shard unpinned at full speed; only cache/NUMA locality is lost. +per-sample text. The sharded pool is the drain-phase accelerator and is +auto-sized (one shard per core block); live mid-run flushes run on a small +in-process thread pool (``--tokenizer-workers``, default 2) owned by the +queue's live loop. A tokenizer without a fast (Rust) backend is a startup +error, never a silent slow path. Platforms without CPU affinity (e.g. macOS) +shard unpinned at full speed; only cache/NUMA locality is lost. """ from __future__ import annotations @@ -41,6 +41,9 @@ from typing import TYPE_CHECKING, Any, Protocol, cast import msgspec +from inference_endpoint.endpoint_client.cpu_affinity import ( + expand_to_all_online_cpus, +) from transformers import AutoTokenizer from transformers.utils import logging as transformers_logging @@ -52,9 +55,16 @@ # Budget for the parallel shard warmup (spawn + transformers import + # tokenizer load per worker). 
A hung load (e.g. a stuck network filesystem) -# must degrade to the in-process path, not wedge service startup. +# must become a bounded startup error, not wedge service startup. _SHARD_WARMUP_TIMEOUT_S = 120.0 +# Per-flush ceiling for the LIVE lane. Bounds three things at once: how long +# the queue lock is held mid-run, how much work an unstoppable in-flight +# thread encode can hold after a drain-start cancellation, and how much the +# drain re-encodes for items the cancelled flush gave back. The drain has no +# ceiling — it always takes the whole buffer. +_LIVE_FLUSH_MAX_ITEMS = 1024 + # Minimal user message used to satisfy chat templates that reject assistant-only # message lists. Its token count is subtracted so only the assistant payload is # measured. @@ -110,12 +120,15 @@ def _init_worker(tokenizer_name: str, core_set: list[int]) -> None: # and lose the buffered tokenizations it was counting. signal.signal(signal.SIGINT, signal.SIG_IGN) if core_set: + # Size the rayon pool to the block explicitly: the parent process caps + # its own pool for the live lane, and spawn children inherit that env — + # without the override every shard would run at the live-lane width. + os.environ["RAYON_NUM_THREADS"] = str(len(core_set)) try: os.sched_setaffinity(0, set(core_set)) except (OSError, AttributeError): - # No pinning (e.g. macOS): cap the rayon pool to the block size - # instead, so unpinned shards don't oversubscribe each other. - os.environ.setdefault("RAYON_NUM_THREADS", str(len(core_set))) + # No pinning (e.g. macOS): the rayon cap above still keeps + # unpinned shards from oversubscribing each other. 
logger.debug("could not pin tokenizer worker to %s", core_set) transformers_logging.set_verbosity_error() tok = AutoTokenizer.from_pretrained(tokenizer_name, trust_remote_code=True) @@ -187,21 +200,29 @@ def __init__( *, cores_per_worker: int = CORES_PER_WORKER, n_workers: int = -1, - live_workers: int = 1, + live_workers: int = 2, ) -> None: self._tokenizer_name = tokenizer_name + # The live lane runs in-process: cap this process's rayon pool so a + # mid-run batched encode uses ~live_workers cores, not the whole + # machine. Must be set before the first encode initializes the pool; + # setdefault lets an operator-exported RAYON_NUM_THREADS win. + os.environ.setdefault("RAYON_NUM_THREADS", str(max(1, live_workers))) self._live_workers = live_workers self._fallback_warned: set[str] = set() self._tokenizer: PreTrainedTokenizerBase | None = None self._prefix_len = 0 self._baseline = 0 - # In-process thread for the chat-template path. + # In-process threads: the live token-metric lane plus the + # chat-template path. self._thread: ThreadPoolExecutor | None = ThreadPoolExecutor( - max_workers=1, thread_name_prefix="tok-thread" + max_workers=max(1, live_workers), thread_name_prefix="tok-thread" ) self._load_tokenizer() # also computes the chat-template baseline # Process shards for the batched text path. Empty only when - # in-process mode was explicitly requested (n_workers=0). + # in-process mode was explicitly requested (n_workers=0 or + # cores_per_worker<=0; ctor overrides used primarily by tests — + # production wiring passes live_workers only and shards auto-size). self._procs: list[ProcessPoolExecutor] = [] self._setup_shards(cores_per_worker, n_workers) @@ -259,17 +280,35 @@ def _setup_shards(self, cores_per_worker: int, n_workers: int) -> None: raise RuntimeError( f"tokenizer {self._tokenizer_name!r} has no fast (Rust) " "backend; token metrics require one to keep up with " - "completions. 
Pass --tokenizer-workers 0 to explicitly run " - "single-threaded in-process tokenization." + "completions. Use a fast tokenizer, or disable token metrics." ) + # Probe the full allowed CPU universe (cgroup-clamped) for the shard + # block math, then restore this process's inherited mask: the + # aggregator's event loop, publisher, and live tokenizer threads stay + # exactly where the parent placed them (the loadgen mask on a pinned + # Linux run). Only the drain-phase shard processes, pinned to their + # own blocks, span the whole machine. try: - available = sorted(os.sched_getaffinity(0)) + original = os.sched_getaffinity(0) except (OSError, AttributeError): - # No affinity API (e.g. macOS): shard unpinned — the OS scheduler - # spreads the workers; only cache/NUMA locality is lost. Workers - # cap their rayon pools to the block size instead (_init_worker). + original = None + try: + available = sorted(expand_to_all_online_cpus()) + except Exception: # noqa: BLE001 — no affinity API (e.g. macOS). + # Shard unpinned: the OS scheduler spreads the workers; only + # cache/NUMA locality is lost. Workers cap their rayon pools to + # the block size instead (_init_worker). available = list(range(os.cpu_count() or 1)) logger.info("BatchTokenizer: CPU affinity unavailable; sharding unpinned") + else: + if original is not None: + try: + os.sched_setaffinity(0, original) + except OSError: + logger.warning( + "could not restore the aggregator's inherited CPU " + "mask; this process stays expanded to all CPUs" + ) capacity = max(1, len(available) // cores_per_worker) n = capacity if n_workers < 0 else min(n_workers, capacity) t0 = time.perf_counter() @@ -298,8 +337,8 @@ def _setup_shards(self, cores_per_worker: int, n_workers: int) -> None: _terminate_procs(procs) raise RuntimeError( "tokenizer shard warmup failed; refusing to fall back to a " - "slow path. Fix the environment (or pass --tokenizer-workers " - "0 to explicitly run in-process)." 
+ "slow path that cannot keep up with completions. Fix the " + "environment (see the chained error)." ) from exc self._procs = procs logger.info( @@ -338,19 +377,19 @@ async def count_texts_async( async def count_texts_live_async( self, texts: list[str], loop: asyncio.AbstractEventLoop ) -> list[int]: - """Like ``count_texts_async``, bounded to the live lane. + """Like ``count_texts_async``, bounded to the in-process live lane. - Mid-run flushes use only the last ``live_workers`` shards — the - highest core blocks, farthest from the loadgen's low cores — so live - token metrics never contend with the benchmark hot path. The - end-of-run drain uses every shard. + Mid-run flushes never touch the shard processes: they run on this + process's small thread pool with a rayon pool capped to + ``live_workers`` cores. The end-of-run drain uses every shard. """ if not texts: return [] - live_n = max(1, self._live_workers) - if self._procs and live_n < len(self._procs): - return await self._fan_out(self._procs[-live_n:], texts) - return await self.count_texts_async(texts, loop) + if self._thread is None: + raise RuntimeError("BatchTokenizer is closed") + return await loop.run_in_executor( + self._thread, self._encode_lengths_inproc, texts + ) @staticmethod async def _fan_out(procs: list[ProcessPoolExecutor], texts: list[str]) -> list[int]: @@ -548,9 +587,11 @@ async def flush(self, live: bool = False) -> None: """Tokenize everything buffered so far and run each ``on_count``. ``live=True`` routes text batches through the tokenizer's bounded - live lane instead of the full shard pool, and re-queues items on - failure or cancellation so a mid-run hiccup never loses samples — the - end-of-run drain retries them. 
Drain-mode failures are terminal: the + live lane instead of the full shard pool, takes at most + ``_LIVE_FLUSH_MAX_ITEMS`` per kind (bounding lock-hold time and the + unstoppable in-flight encode a drain-start cancellation leaves + behind), and re-queues items on failure or cancellation so a mid-run + hiccup never loses samples — the end-of-run drain retries them. Drain-mode failures are terminal: the un-recorded items stay counted in ``pending`` (``_inflight`` is decremented only after a callback runs) and surface as an incomplete drain, not as silently dropped samples. Items are detached from the @@ -564,8 +605,13 @@ async def flush(self, live: bool = False) -> None: async with self._lock: if not (self._text or self._msg): return - text_items, self._text = self._text, [] - msg_items, self._msg = self._msg, [] + if live: + cap = _LIVE_FLUSH_MAX_ITEMS + text_items, self._text = self._text[:cap], self._text[cap:] + msg_items, self._msg = self._msg[:cap], self._msg[cap:] + else: + text_items, self._text = self._text, [] + msg_items, self._msg = self._msg, [] # The text and message phases fail independently — they run on # separate executors, so a dead text shard must not drop message # items that would still succeed (and vice versa). 
The first diff --git a/tests/unit/async_utils/services/metrics_aggregator/conftest.py b/tests/unit/async_utils/services/metrics_aggregator/conftest.py index 05b68d3ee..f28b48f7a 100644 --- a/tests/unit/async_utils/services/metrics_aggregator/conftest.py +++ b/tests/unit/async_utils/services/metrics_aggregator/conftest.py @@ -170,6 +170,7 @@ def make_aggregator( socket_name: str, *, tokenizer=None, + live_flush_interval_s: float | None = None, streaming: bool = True, shutdown_event: asyncio.Event | None = None, ) -> tuple[MetricsAggregatorService, MetricsRegistry, MagicMock]: @@ -201,6 +202,7 @@ def make_aggregator( sig_figs=3, n_histogram_buckets=10, tokenizer=tokenizer, + live_flush_interval_s=live_flush_interval_s, streaming=streaming, shutdown_event=shutdown_event, ) diff --git a/tests/unit/async_utils/services/metrics_aggregator/test_aggregator.py b/tests/unit/async_utils/services/metrics_aggregator/test_aggregator.py index bae2a0aa5..bc7d2763b 100644 --- a/tests/unit/async_utils/services/metrics_aggregator/test_aggregator.py +++ b/tests/unit/async_utils/services/metrics_aggregator/test_aggregator.py @@ -989,6 +989,29 @@ async def test_tpot_non_streaming_output_skipped(self, tmp_path): finally: agg.close() + @pytest.mark.asyncio + async def test_started_arms_the_live_flush_loop(self, tmp_path): + """STARTED starts the queue's live loop when an interval is set.""" + loop = asyncio.get_event_loop() + with ManagedZMQContext.scoped(socket_dir=str(tmp_path)) as ctx: + agg, _, _ = make_aggregator( + ctx, + loop, + "agg_live_arm", + tokenizer=MockBatchTokenizer(), + live_flush_interval_s=0.01, + ) + try: + await agg.process([session_event(SessionEventType.STARTED, ts=0)]) + assert agg._token_queue is not None + assert agg._token_queue._live_task is not None + await agg.process([session_event(SessionEventType.ENDED, ts=100)]) + assert ( + agg._token_queue._live_task is None + ), "drain must stop the live loop" + finally: + agg.close() + @pytest.mark.asyncio async def 
test_flush_records_buffered_tokenizations(self, tmp_path): """fire() buffers tokenization; flush() tokenizes the batch and records.""" @@ -1066,6 +1089,9 @@ class FailingBatchTokenizer: async def count_texts_async(self, texts, _loop): raise RuntimeError("tokenizer backend died") + async def count_texts_live_async(self, texts, _loop): + return await self.count_texts_async(texts, _loop) + async def token_count_message_async(self, *args): raise RuntimeError("tokenizer backend died") @@ -1114,6 +1140,9 @@ async def count_texts_async(self, texts, _loop): await asyncio.sleep(10.0) # exceeds drain timeout return [0] * len(texts) + async def count_texts_live_async(self, texts, _loop): + return await self.count_texts_async(texts, _loop) + async def token_count_message_async(self, *args): await asyncio.sleep(10.0) return 0 diff --git a/tests/unit/async_utils/services/metrics_aggregator/test_token_metrics.py b/tests/unit/async_utils/services/metrics_aggregator/test_token_metrics.py index 2588e16cd..270fd683a 100644 --- a/tests/unit/async_utils/services/metrics_aggregator/test_token_metrics.py +++ b/tests/unit/async_utils/services/metrics_aggregator/test_token_metrics.py @@ -274,7 +274,9 @@ def shutdown(self, wait=False, cancel_futures=False): @pytest.mark.unit class TestSetupShardsDecisions: - """Pins the --tokenizer-workers contract: -1 auto / N clamped / 0 explicit. + """Pins the BatchTokenizer(n_workers=...) shard contract: -1 auto / N + clamped / 0 explicit in-process (auto-sized in production — the CLI's + --tokenizer-workers maps to the live thread lane, not to shards). An environment that cannot shard is a startup error — never a silent in-process fallback. @@ -282,8 +284,20 @@ class TestSetupShardsDecisions: def _make(self, monkeypatch, cpus, n_workers, executor=_SpawnlessExecutor): monkeypatch.setattr(token_metrics_module, "ProcessPoolExecutor", executor) + # Patch the probe + the restore so no real affinity syscalls run. 
monkeypatch.setattr( - token_metrics_module.os, "sched_getaffinity", lambda pid: set(range(cpus)) + token_metrics_module, + "expand_to_all_online_cpus", + lambda: set(range(cpus)), + ) + monkeypatch.setattr( + token_metrics_module.os, "sched_getaffinity", lambda pid: {0, 1} + ) + self.restored: list[set] = [] + monkeypatch.setattr( + token_metrics_module.os, + "sched_setaffinity", + lambda pid, mask: self.restored.append(set(mask)), ) with patch(_MOCK_TARGET, _FakeTokenizerWithBackend): return BatchTokenizer("fake", n_workers=n_workers) @@ -309,6 +323,13 @@ def test_blocks_are_disjoint_consecutive_core_sets(self, monkeypatch): blocks = [set(ex.initargs[1]) for ex in tok._procs] assert blocks == [set(range(0, 8)), set(range(8, 16))] + def test_probe_restores_the_inherited_mask(self, monkeypatch): + """The aggregator keeps the mask its parent gave it; only the probe + widens, and only the shard children pin elsewhere.""" + with self._make(monkeypatch, 16, -1): + pass + assert self.restored == [{0, 1}] + def test_no_fast_backend_is_a_startup_error(self, monkeypatch): monkeypatch.setattr( token_metrics_module, "ProcessPoolExecutor", _SpawnlessExecutor @@ -323,8 +344,15 @@ def test_affinity_unavailable_shards_unpinned(self, monkeypatch): token_metrics_module, "ProcessPoolExecutor", _SpawnlessExecutor ) + def _unsupported(): + raise RuntimeError("affinity requires Linux") + + monkeypatch.setattr( + token_metrics_module, "expand_to_all_online_cpus", _unsupported + ) + def _raise(pid): - raise OSError("affinity unavailable") + raise AttributeError("no sched_getaffinity") monkeypatch.setattr(token_metrics_module.os, "sched_getaffinity", _raise) monkeypatch.setattr(token_metrics_module.os, "cpu_count", lambda: 16) @@ -357,8 +385,8 @@ def submit(self, _fn, chunk): @pytest.mark.unit class TestLiveLane: @pytest.mark.asyncio - async def test_live_uses_only_the_last_shards(self): - """Mid-run flushes stay off the low core blocks (loadgen side).""" + async def 
test_live_never_touches_the_shard_pool(self): + """Mid-run flushes run in-process; the shards are drain-only.""" with patch(_MOCK_TARGET, _FakeTokenizer): loop = asyncio.get_running_loop() with BatchTokenizer("fake", n_workers=0, live_workers=1) as tok: @@ -366,8 +394,7 @@ async def test_live_uses_only_the_last_shards(self): tok._procs = procs counts = await tok.count_texts_live_async(["a b", "c"], loop) assert counts == [2, 1] - assert procs[0].chunks == [] and procs[1].chunks == [] - assert procs[2].chunks == [["a b", "c"]] + assert all(p.chunks == [] for p in procs) @pytest.mark.asyncio async def test_drain_uses_every_shard(self): @@ -423,6 +450,86 @@ async def test_flush_remaining_stops_live_loop(self): assert task is not None and task.cancelled() +@pytest.mark.unit +class TestRayonCaps: + def test_ctor_caps_rayon_to_live_workers(self, monkeypatch): + monkeypatch.delenv("RAYON_NUM_THREADS", raising=False) + with patch(_MOCK_TARGET, _FakeTokenizer): + with BatchTokenizer("fake", n_workers=0, live_workers=3): + assert token_metrics_module.os.environ["RAYON_NUM_THREADS"] == "3" + + def test_ctor_respects_operator_exported_cap(self, monkeypatch): + monkeypatch.setenv("RAYON_NUM_THREADS", "7") + with patch(_MOCK_TARGET, _FakeTokenizer): + with BatchTokenizer("fake", n_workers=0, live_workers=3): + assert token_metrics_module.os.environ["RAYON_NUM_THREADS"] == "7" + + def test_init_worker_overrides_inherited_cap_with_block_size(self, monkeypatch): + """Spawn children inherit the parent's live cap; each shard must + re-size its rayon pool to its own core block.""" + monkeypatch.setenv("RAYON_NUM_THREADS", "2") + + def _no_affinity(pid, mask): + raise AttributeError("no sched_setaffinity") + + monkeypatch.setattr(token_metrics_module.os, "sched_setaffinity", _no_affinity) + with patch(_MOCK_TARGET, _FakeTokenizer): + token_metrics_module._init_worker("fake", [0, 1, 2, 3, 4, 5, 6, 7]) + assert token_metrics_module.os.environ["RAYON_NUM_THREADS"] == "8" + + 
+@pytest.mark.unit +@pytest.mark.asyncio +class TestLiveFlushBounds: + async def test_live_flush_takes_at_most_the_cap(self, monkeypatch): + monkeypatch.setattr(token_metrics_module, "_LIVE_FLUSH_MAX_ITEMS", 3) + loop = asyncio.get_running_loop() + queue = TokenBatchQueue(_CapturingTokenizer(), loop) + recorded: list[int] = [] + for i in range(5): + queue.enqueue_text(f"t{i}", recorded.append) + await queue.flush(live=True) + assert len(recorded) == 3 + assert queue.pending == 2 + # The drain takes everything that remains. + assert await queue.flush_remaining(timeout=1.0) == 0 + assert len(recorded) == 5 + + async def test_live_cancellation_requeues_texts(self): + class _Hanging(_CapturingTokenizer): + async def count_texts_live_async(self, texts, _loop): + await asyncio.sleep(30) + return [0] * len(texts) + + loop = asyncio.get_running_loop() + queue = TokenBatchQueue(_Hanging(), loop) + recorded: list[int] = [] + queue.enqueue_text("a b", recorded.append) + task = loop.create_task(queue.flush(live=True)) + await asyncio.sleep(0.01) + task.cancel() + with pytest.raises(asyncio.CancelledError): + await task + assert queue.pending == 1 + assert len(queue._text) == 1, "cancelled live flush must give items back" + assert await queue.flush_remaining(timeout=1.0) == 0 + assert recorded == [2] + + async def test_live_message_failure_requeues_message(self): + class _MsgFailing(_CapturingTokenizer): + async def token_count_message_async(self, *args): + raise RuntimeError("template boom") + + loop = asyncio.get_running_loop() + queue = TokenBatchQueue(_MsgFailing(), loop) + recorded: list[int] = [] + queue.enqueue_message(("hello world", None, None), recorded.append) + with pytest.raises(RuntimeError, match="template boom"): + await queue.flush(live=True) + assert queue.pending == 1 + assert len(queue._msg) == 1, "failed live message must be re-queued" + + @pytest.mark.unit class TestEvenChunks: def test_splits_into_near_equal_chunks(self): @@ -502,6 +609,9 @@ async def 
count_texts_async(self, texts, _loop): await asyncio.sleep(10.0) return [0] * len(texts) + async def count_texts_live_async(self, texts, _loop): + return await self.count_texts_async(texts, _loop) + async def token_count_message_async(self, *args): return 0 @@ -520,6 +630,9 @@ class _FailingTokenizer: async def count_texts_async(self, texts, _loop): raise RuntimeError("tokenizer boom") + async def count_texts_live_async(self, texts, _loop): + return await self.count_texts_async(texts, _loop) + async def token_count_message_async(self, *args): raise RuntimeError("tokenizer boom") @@ -537,6 +650,9 @@ class _TextFailingTokenizer: async def count_texts_async(self, texts, _loop): raise RuntimeError("text shard died") + async def count_texts_live_async(self, texts, _loop): + return await self.count_texts_async(texts, _loop) + async def token_count_message_async( self, content, reasoning, tool_calls, _loop ): From 700423e92113250ee76133cb94dd49b2595301f0 Mon Sep 17 00:00:00 2001 From: Viraat Chandra Date: Wed, 10 Jun 2026 16:40:53 -0700 Subject: [PATCH 11/20] fix(metrics): call-shaped awaits for cancelled tasks; pin aggregator-args seam MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - flush_remaining gathers the cancelled live task (return_exceptions) instead of a bare suppressed await; the cancellation test awaits via wait_for. Both silence the code-quality ineffectual-statement check without changing semantics. - New TestAggregatorArgs case pins the SUT-intrusion seam: --tokenizer is forwarded, and no live/worker knobs are — the service defaults deliberately govern mid-run tokenization (review feedback). 
Co-Authored-By: Claude Opus 4.8 (1M context) --- .../metrics_aggregator/token_metrics.py | 4 +- .../metrics_aggregator/test_token_metrics.py | 2 +- tests/unit/commands/test_benchmark.py | 53 +++++++++++++++++++ 3 files changed, 55 insertions(+), 4 deletions(-) diff --git a/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py b/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py index dc2736e5d..c5d742087 100644 --- a/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py +++ b/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py @@ -29,7 +29,6 @@ from __future__ import annotations import asyncio -import contextlib import json import logging import multiprocessing @@ -673,8 +672,7 @@ async def flush_remaining(self, timeout: float | None) -> int: """ if self._live_task is not None: self._live_task.cancel() - with contextlib.suppress(asyncio.CancelledError): - await self._live_task + await asyncio.gather(self._live_task, return_exceptions=True) self._live_task = None if self._inflight == 0: return 0 diff --git a/tests/unit/async_utils/services/metrics_aggregator/test_token_metrics.py b/tests/unit/async_utils/services/metrics_aggregator/test_token_metrics.py index 270fd683a..1a51c1a18 100644 --- a/tests/unit/async_utils/services/metrics_aggregator/test_token_metrics.py +++ b/tests/unit/async_utils/services/metrics_aggregator/test_token_metrics.py @@ -509,7 +509,7 @@ async def count_texts_live_async(self, texts, _loop): await asyncio.sleep(0.01) task.cancel() with pytest.raises(asyncio.CancelledError): - await task + await asyncio.wait_for(task, timeout=1.0) assert queue.pending == 1 assert len(queue._text) == 1, "cancelled live flush must give items back" assert await queue.flush_remaining(timeout=1.0) == 0 diff --git a/tests/unit/commands/test_benchmark.py b/tests/unit/commands/test_benchmark.py index 9da2dcf56..7d43017dc 100644 --- 
a/tests/unit/commands/test_benchmark.py +++ b/tests/unit/commands/test_benchmark.py @@ -631,6 +631,59 @@ async def _capture_launch(service_configs, *, timeout): idx = args.index("--drain-timeout") assert args[idx + 1] == expected_flag + @pytest.mark.unit + @pytest.mark.asyncio + async def test_tokenizer_forwarded_and_live_args_left_to_service_defaults( + self, tmp_path + ): + """Pins the SUT-intrusion seam: the benchmark forwards --tokenizer but + deliberately no live/worker knobs — the service's own defaults govern + mid-run tokenization.""" + config = OfflineConfig(**_OFFLINE_KWARGS, settings=OfflineSettings()) + ctx = self._make_ctx(config, tmp_path) + ctx.tokenizer_name = "gpt2" + + captured: list = [] + + async def _capture_launch(service_configs, *, timeout): + captured.extend(service_configs) + raise KeyboardInterrupt("stop after launch") + + mock_zmq = MagicMock() + mock_zmq.socket_dir = str(tmp_path / "sockets") + + with ( + patch( + "inference_endpoint.commands.benchmark.execute.ManagedZMQContext" + ) as MockZMQ, + patch( + "inference_endpoint.commands.benchmark.execute.EventPublisherService" + ) as MockPub, + patch( + "inference_endpoint.commands.benchmark.execute.MetricsSnapshotSubscriber" + ) as MockSub, + patch( + "inference_endpoint.commands.benchmark.execute.ServiceLauncher" + ) as MockLauncher, + patch("inference_endpoint.commands.benchmark.execute.tqdm"), + ): + MockZMQ.scoped.return_value.__enter__ = MagicMock(return_value=mock_zmq) + MockZMQ.scoped.return_value.__exit__ = MagicMock(return_value=False) + MockPub.return_value.socket_name = "test_pub" + MockSub.return_value.start = MagicMock() + MockLauncher.return_value.launch = _capture_launch + + loop = asyncio.get_event_loop() + with pytest.raises(KeyboardInterrupt): + await _run_benchmark_async(ctx, loop) + + aggregator_cfg = next(c for c in captured if "metrics_aggregator" in c.module) + args = aggregator_cfg.args + idx = args.index("--tokenizer") + assert args[idx + 1] == "gpt2" + assert 
"--tokenizer-workers" not in args + assert "--live-tokenizers" not in args + class TestBuildPhases: """Tests for _build_phases() in execute.py.""" From 9640bd79dcca2a63a930b4a616d6ad358a0724df Mon Sep 17 00:00:00 2001 From: Viraat Chandra Date: Wed, 10 Jun 2026 16:59:35 -0700 Subject: [PATCH 12/20] fix(metrics): requeue messages on live-cancel; shrink the tokenizer API MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Review feedback (human + council), with the API surface pulled back toward main: - A live-flush cancellation landing in the text encode dropped the already-detached message items — lost tool-call samples and a final snapshot stuck at n_pending_tasks > 0 for work the drain could never reach. The text-phase CancelledError handler now re-queues both kinds; regression test covers text+message together. - count_texts_live_async is gone: the live lane is a live= keyword on count_texts_async, so the TokenCounter protocol is back to two methods and every test stub lost its alias. - The SIGTERM handler takes the token queue object again (reads .pending), not a callable. - Live flushes take their slice in place (del list[:cap]) instead of copying the whole backlog tail under the queue lock each tick. - Shard warmup budget reduced to 25s so its diagnostic FATAL fires before the parent's 30s service-launch kill. - TestAggregatorArgs pins the SUT-intrusion seam: --tokenizer is forwarded; live/worker knobs deliberately are not. 276 unit tests pass; pre-commit clean. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- .../services/metrics_aggregator/__main__.py | 8 +-- .../metrics_aggregator/token_metrics.py | 69 +++++++++---------- .../services/metrics_aggregator/conftest.py | 10 ++- .../metrics_aggregator/test_aggregator.py | 10 +-- .../test_main_signal_handler.py | 9 +-- .../metrics_aggregator/test_token_metrics.py | 60 ++++++++++------ 6 files changed, 84 insertions(+), 82 deletions(-) diff --git a/src/inference_endpoint/async_utils/services/metrics_aggregator/__main__.py b/src/inference_endpoint/async_utils/services/metrics_aggregator/__main__.py index 0d5be495e..b4cd8bba9 100644 --- a/src/inference_endpoint/async_utils/services/metrics_aggregator/__main__.py +++ b/src/inference_endpoint/async_utils/services/metrics_aggregator/__main__.py @@ -33,7 +33,7 @@ from .publisher import MetricsPublisher from .registry import MetricsRegistry from .snapshot import MetricsSnapshotCodec -from .token_metrics import BatchTokenizer +from .token_metrics import BatchTokenizer, TokenBatchQueue logger = logging.getLogger(__name__) @@ -44,7 +44,7 @@ def _make_sigterm_handler( registry: MetricsRegistry, publisher: MetricsPublisher, table: MetricsTable, - pending_tokens: Callable[[], int], + token_queue: TokenBatchQueue | None, shutdown_event: asyncio.Event, ) -> tuple[Callable[[], None], set[asyncio.Task]]: """Build the SIGTERM handler that writes the INTERRUPTED final snapshot. @@ -76,7 +76,7 @@ async def _signal_finalize() -> None: ) await publisher.publish_final( registry, - n_pending_tasks=pending_tokens(), + n_pending_tasks=token_queue.pending if token_queue is not None else 0, interrupted=True, ) except Exception: # noqa: BLE001 — best-effort. 
@@ -288,7 +288,7 @@ async def main() -> None: registry=registry, publisher=publisher, table=aggregator._table, - pending_tokens=lambda: aggregator.pending_tokens, + token_queue=aggregator._token_queue, shutdown_event=shutdown_event, ) loop.add_signal_handler(signal.SIGTERM, on_sigterm) diff --git a/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py b/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py index c5d742087..1ae3c5ce7 100644 --- a/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py +++ b/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py @@ -54,8 +54,10 @@ # Budget for the parallel shard warmup (spawn + transformers import + # tokenizer load per worker). A hung load (e.g. a stuck network filesystem) -# must become a bounded startup error, not wedge service startup. -_SHARD_WARMUP_TIMEOUT_S = 120.0 +# must become a bounded startup error, not wedge service startup — and the +# error must fire before the parent's 30 s service-launch budget kills the +# subprocess, so the diagnostic wins the race. +_SHARD_WARMUP_TIMEOUT_S = 25.0 # Per-flush ceiling for the LIVE lane. Bounds three things at once: how long # the queue lock is held mid-run, how much work an unstoppable in-flight @@ -357,15 +359,23 @@ def _encode_lengths_inproc(self, texts: list[str]) -> list[int]: return [len(tok.tokenize(t)) for t in texts] # type: ignore[union-attr] async def count_texts_async( - self, texts: list[str], loop: asyncio.AbstractEventLoop + self, + texts: list[str], + loop: asyncio.AbstractEventLoop, + *, + live: bool = False, ) -> list[int]: """Per-text token counts for a whole batch without blocking the loop. - A worker-shard failure propagates and is treated as an incomplete drain. + ``live=True`` is the mid-run lane: it never touches the shard + processes — it runs on this process's small thread pool with a rayon + pool capped to ``live_workers`` cores. 
The default (drain) path fans + out across every shard; a worker-shard failure propagates and is + treated as an incomplete drain. """ if not texts: return [] - if self._procs: + if self._procs and not live: return await self._fan_out(self._procs, texts) if self._thread is None: raise RuntimeError("BatchTokenizer is closed") @@ -373,23 +383,6 @@ async def count_texts_async( self._thread, self._encode_lengths_inproc, texts ) - async def count_texts_live_async( - self, texts: list[str], loop: asyncio.AbstractEventLoop - ) -> list[int]: - """Like ``count_texts_async``, bounded to the in-process live lane. - - Mid-run flushes never touch the shard processes: they run on this - process's small thread pool with a rayon pool capped to - ``live_workers`` cores. The end-of-run drain uses every shard. - """ - if not texts: - return [] - if self._thread is None: - raise RuntimeError("BatchTokenizer is closed") - return await loop.run_in_executor( - self._thread, self._encode_lengths_inproc, texts - ) - @staticmethod async def _fan_out(procs: list[ProcessPoolExecutor], texts: list[str]) -> list[int]: chunks = _even_chunks(texts, len(procs)) @@ -489,15 +482,14 @@ class TokenCounter(Protocol): """ async def count_texts_async( - self, texts: list[str], loop: asyncio.AbstractEventLoop, / - ) -> list[int]: - """Per-text token counts for a whole batch (full pool).""" - raise NotImplementedError - - async def count_texts_live_async( - self, texts: list[str], loop: asyncio.AbstractEventLoop, / + self, + texts: list[str], + loop: asyncio.AbstractEventLoop, + /, + *, + live: bool = False, ) -> list[int]: - """Per-text token counts via the bounded live lane.""" + """Per-text token counts (``live=True`` = the bounded mid-run lane).""" raise NotImplementedError async def token_count_message_async( @@ -596,18 +588,15 @@ async def flush(self, live: bool = False) -> None: drain, not as silently dropped samples. 
Items are detached from the buffer up front so concurrent enqueues land in the next flush. """ - count_texts = ( - self._tokenizer.count_texts_live_async - if live - else self._tokenizer.count_texts_async - ) async with self._lock: if not (self._text or self._msg): return if live: cap = _LIVE_FLUSH_MAX_ITEMS - text_items, self._text = self._text[:cap], self._text[cap:] - msg_items, self._msg = self._msg[:cap], self._msg[cap:] + text_items = self._text[:cap] + del self._text[:cap] # in-place: O(cap), not O(backlog). + msg_items = self._msg[:cap] + del self._msg[:cap] else: text_items, self._text = self._text, [] msg_items, self._msg = self._msg, [] @@ -618,10 +607,14 @@ async def flush(self, live: bool = False) -> None: failure: Exception | None = None if text_items: try: - counts = await count_texts([t for t, _ in text_items], self._loop) + counts = await self._tokenizer.count_texts_async( + [t for t, _ in text_items], self._loop, live=live + ) except asyncio.CancelledError: if live: self._text[:0] = text_items + self._msg[:0] = msg_items + msg_items = [] raise except Exception as exc: # noqa: BLE001 — isolate phases. 
failure = exc diff --git a/tests/unit/async_utils/services/metrics_aggregator/conftest.py b/tests/unit/async_utils/services/metrics_aggregator/conftest.py index f28b48f7a..38e25945c 100644 --- a/tests/unit/async_utils/services/metrics_aggregator/conftest.py +++ b/tests/unit/async_utils/services/metrics_aggregator/conftest.py @@ -61,17 +61,15 @@ def __init__(self, delay: float = 0.0) -> None: self._delay = delay async def count_texts_async( - self, texts: list[str], _loop: asyncio.AbstractEventLoop + self, + texts: list[str], + _loop: asyncio.AbstractEventLoop, + live: bool = False, ) -> list[int]: if self._delay: await asyncio.sleep(self._delay) return [len(t.split()) for t in texts] - async def count_texts_live_async( - self, texts: list[str], loop: asyncio.AbstractEventLoop - ) -> list[int]: - return await self.count_texts_async(texts, loop) - async def token_count_message_async( self, content: str, diff --git a/tests/unit/async_utils/services/metrics_aggregator/test_aggregator.py b/tests/unit/async_utils/services/metrics_aggregator/test_aggregator.py index bc7d2763b..075e4a0d5 100644 --- a/tests/unit/async_utils/services/metrics_aggregator/test_aggregator.py +++ b/tests/unit/async_utils/services/metrics_aggregator/test_aggregator.py @@ -1086,12 +1086,9 @@ async def test_drain_failure_reports_pending_and_finalizes(self, tmp_path): loop = asyncio.get_event_loop() class FailingBatchTokenizer: - async def count_texts_async(self, texts, _loop): + async def count_texts_async(self, texts, _loop, live=False): raise RuntimeError("tokenizer backend died") - async def count_texts_live_async(self, texts, _loop): - return await self.count_texts_async(texts, _loop) - async def token_count_message_async(self, *args): raise RuntimeError("tokenizer backend died") @@ -1136,13 +1133,10 @@ async def test_drain_timeout_reports_pending_count(self, tmp_path): loop = asyncio.get_event_loop() class BlockingBatchTokenizer: - async def count_texts_async(self, texts, _loop): + async def 
count_texts_async(self, texts, _loop, live=False): await asyncio.sleep(10.0) # exceeds drain timeout return [0] * len(texts) - async def count_texts_live_async(self, texts, _loop): - return await self.count_texts_async(texts, _loop) - async def token_count_message_async(self, *args): await asyncio.sleep(10.0) return 0 diff --git a/tests/unit/async_utils/services/metrics_aggregator/test_main_signal_handler.py b/tests/unit/async_utils/services/metrics_aggregator/test_main_signal_handler.py index 3428f6f22..13fb1f40b 100644 --- a/tests/unit/async_utils/services/metrics_aggregator/test_main_signal_handler.py +++ b/tests/unit/async_utils/services/metrics_aggregator/test_main_signal_handler.py @@ -27,6 +27,7 @@ import asyncio import gc import weakref +from types import SimpleNamespace from unittest.mock import AsyncMock, MagicMock import pytest @@ -50,7 +51,7 @@ async def test_sigterm_handler_holds_strong_reference_to_finalize_task(): registry = MagicMock() table = MagicMock() table.total_tracked_duration_ns = 0 - n_pending = 0 + token_queue = SimpleNamespace(pending=0) # publish_final blocks on an event so we can observe the task # mid-execution and exercise the strong-ref contract. 
@@ -69,7 +70,7 @@ async def _slow_publish(*args, **kwargs): registry=registry, publisher=publisher, table=table, - pending_tokens=lambda: n_pending, + token_queue=token_queue, shutdown_event=shutdown_event, ) @@ -123,7 +124,7 @@ async def test_sigterm_handler_refreshes_tracked_duration(): registry = MagicMock() table = MagicMock() table.total_tracked_duration_ns = 12345 - n_pending = 3 + token_queue = SimpleNamespace(pending=3) publisher = MagicMock() publisher.publish_final = AsyncMock() @@ -135,7 +136,7 @@ async def test_sigterm_handler_refreshes_tracked_duration(): registry=registry, publisher=publisher, table=table, - pending_tokens=lambda: n_pending, + token_queue=token_queue, shutdown_event=shutdown_event, ) on_sigterm() diff --git a/tests/unit/async_utils/services/metrics_aggregator/test_token_metrics.py b/tests/unit/async_utils/services/metrics_aggregator/test_token_metrics.py index 1a51c1a18..8bf838c17 100644 --- a/tests/unit/async_utils/services/metrics_aggregator/test_token_metrics.py +++ b/tests/unit/async_utils/services/metrics_aggregator/test_token_metrics.py @@ -392,7 +392,7 @@ async def test_live_never_touches_the_shard_pool(self): with BatchTokenizer("fake", n_workers=0, live_workers=1) as tok: procs = [_RecordingProc(), _RecordingProc(), _RecordingProc()] tok._procs = procs - counts = await tok.count_texts_live_async(["a b", "c"], loop) + counts = await tok.count_texts_async(["a b", "c"], loop, live=True) assert counts == [2, 1] assert all(p.chunks == [] for p in procs) @@ -424,8 +424,10 @@ async def test_start_live_flushes_periodically(self): async def test_live_loop_survives_tokenizer_failure(self): class _FailingLive(_CapturingTokenizer): - async def count_texts_live_async(self, texts, _loop): - raise RuntimeError("live lane boom") + async def count_texts_async(self, texts, _loop, live=False): + if live: + raise RuntimeError("live lane boom") + return await super().count_texts_async(texts, _loop) loop = asyncio.get_running_loop() queue = 
TokenBatchQueue(_FailingLive(), loop) @@ -497,9 +499,10 @@ async def test_live_flush_takes_at_most_the_cap(self, monkeypatch): async def test_live_cancellation_requeues_texts(self): class _Hanging(_CapturingTokenizer): - async def count_texts_live_async(self, texts, _loop): - await asyncio.sleep(30) - return [0] * len(texts) + async def count_texts_async(self, texts, _loop, live=False): + if live: + await asyncio.sleep(30) + return await super().count_texts_async(texts, _loop) loop = asyncio.get_running_loop() queue = TokenBatchQueue(_Hanging(), loop) @@ -515,6 +518,31 @@ async def count_texts_live_async(self, texts, _loop): assert await queue.flush_remaining(timeout=1.0) == 0 assert recorded == [2] + async def test_live_cancellation_requeues_messages_too(self): + """A cancel landing in the text encode must give back BOTH kinds.""" + + class _Hanging(_CapturingTokenizer): + async def count_texts_async(self, texts, _loop, live=False): + if live: + await asyncio.sleep(30) + return await super().count_texts_async(texts, _loop) + + loop = asyncio.get_running_loop() + queue = TokenBatchQueue(_Hanging(), loop) + recorded: list[int] = [] + queue.enqueue_text("a b", recorded.append) + queue.enqueue_message(("hello world", None, None), recorded.append) + task = loop.create_task(queue.flush(live=True)) + await asyncio.sleep(0.01) + task.cancel() + with pytest.raises(asyncio.CancelledError): + await asyncio.wait_for(task, timeout=1.0) + assert queue.pending == 2 + assert len(queue._text) == 1 + assert len(queue._msg) == 1, "detached messages must be re-queued" + assert await queue.flush_remaining(timeout=1.0) == 0 + assert sorted(recorded) == [2, 2] + async def test_live_message_failure_requeues_message(self): class _MsgFailing(_CapturingTokenizer): async def token_count_message_async(self, *args): @@ -554,12 +582,9 @@ def test_preserves_order_and_bounds_chunk_count(self): class _CapturingTokenizer: """Minimal tokenizer stub for queue tests: whitespace counts, no procs.""" - 
async def count_texts_async(self, texts, _loop): + async def count_texts_async(self, texts, _loop, live=False): return [len(t.split()) for t in texts] - async def count_texts_live_async(self, texts, _loop): - return await self.count_texts_async(texts, _loop) - async def token_count_message_async(self, content, reasoning, tool_calls, _loop): parts = [p for p in (content, reasoning) if p] return len(" ".join(parts).split()) + (len(tool_calls) if tool_calls else 0) @@ -605,13 +630,10 @@ async def test_flush_remaining_timeout_reports_pending(self): """A tokenizer slower than the budget leaves items pending.""" class _BlockingTokenizer: - async def count_texts_async(self, texts, _loop): + async def count_texts_async(self, texts, _loop, live=False): await asyncio.sleep(10.0) return [0] * len(texts) - async def count_texts_live_async(self, texts, _loop): - return await self.count_texts_async(texts, _loop) - async def token_count_message_async(self, *args): return 0 @@ -627,12 +649,9 @@ async def test_flush_remaining_failure_reports_pending(self): """A tokenizer error leaves items pending and never raises.""" class _FailingTokenizer: - async def count_texts_async(self, texts, _loop): + async def count_texts_async(self, texts, _loop, live=False): raise RuntimeError("tokenizer boom") - async def count_texts_live_async(self, texts, _loop): - return await self.count_texts_async(texts, _loop) - async def token_count_message_async(self, *args): raise RuntimeError("tokenizer boom") @@ -647,12 +666,9 @@ async def test_flush_text_failure_does_not_drop_message_items(self): """The message phase runs (and records) even when the text batch fails.""" class _TextFailingTokenizer: - async def count_texts_async(self, texts, _loop): + async def count_texts_async(self, texts, _loop, live=False): raise RuntimeError("text shard died") - async def count_texts_live_async(self, texts, _loop): - return await self.count_texts_async(texts, _loop) - async def token_count_message_async( self, content, 
reasoning, tool_calls, _loop ): From a1b93868f054b16a42a4a4f952a2c850f22fd1a6 Mon Sep 17 00:00:00 2001 From: Viraat Chandra Date: Wed, 10 Jun 2026 17:00:45 -0700 Subject: [PATCH 13/20] chore(metrics): public read-only wiring surface on the aggregator The service entry wires the SIGTERM handler from the aggregator's table and token queue; expose them as read-only properties instead of reaching into private attributes. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../services/metrics_aggregator/__main__.py | 4 ++-- .../services/metrics_aggregator/aggregator.py | 10 ++++++++++ 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/src/inference_endpoint/async_utils/services/metrics_aggregator/__main__.py b/src/inference_endpoint/async_utils/services/metrics_aggregator/__main__.py index b4cd8bba9..e3d136ab0 100644 --- a/src/inference_endpoint/async_utils/services/metrics_aggregator/__main__.py +++ b/src/inference_endpoint/async_utils/services/metrics_aggregator/__main__.py @@ -287,8 +287,8 @@ async def main() -> None: loop=loop, registry=registry, publisher=publisher, - table=aggregator._table, - token_queue=aggregator._token_queue, + table=aggregator.table, + token_queue=aggregator.token_queue, shutdown_event=shutdown_event, ) loop.add_signal_handler(signal.SIGTERM, on_sigterm) diff --git a/src/inference_endpoint/async_utils/services/metrics_aggregator/aggregator.py b/src/inference_endpoint/async_utils/services/metrics_aggregator/aggregator.py index 14bb28189..233342b46 100644 --- a/src/inference_endpoint/async_utils/services/metrics_aggregator/aggregator.py +++ b/src/inference_endpoint/async_utils/services/metrics_aggregator/aggregator.py @@ -246,6 +246,16 @@ def _register_triggers(self, streaming: bool) -> None: table.add_trigger(SampleField.LAST_RECV_NS, ChunkDeltaTrigger(registry)) table.add_trigger(SampleField.COMPLETE_NS, TpotTrigger(registry, queue)) + @property + def table(self) -> MetricsTable: + """The per-sample metrics table (read-only; for 
service wiring).""" + return self._table + + @property + def token_queue(self) -> TokenBatchQueue | None: + """The token batch queue, if token metrics are enabled.""" + return self._token_queue + @property def pending_tokens(self) -> int: """Enqueued tokenizations not yet recorded (the snapshot n_pending_tasks).""" From 34e0c489fcdcd874a5077e40c5ebc80d9d6a384a Mon Sep 17 00:00:00 2001 From: Viraat Chandra Date: Wed, 10 Jun 2026 17:02:41 -0700 Subject: [PATCH 14/20] chore(metrics): drain-timeout default back to 60s (review feedback) The end-of-run drain runs on the full shard pool, so 60s covers roughly a million buffered tokenizations on a large node; bigger runs set --metrics-drain-timeout explicitly (0 = unlimited). Co-Authored-By: Claude Opus 4.8 (1M context) --- AGENTS.md | 2 +- docs/async_utils/services/metrics_aggregator/DESIGN.md | 4 ++-- .../async_utils/services/metrics_aggregator/__main__.py | 4 ++-- .../async_utils/services/metrics_aggregator/aggregator.py | 2 +- .../async_utils/services/metrics_aggregator/snapshot.py | 2 +- src/inference_endpoint/config/schema.py | 4 ++-- .../config/templates/concurrency_template_full.yaml | 2 +- .../config/templates/offline_template_full.yaml | 2 +- .../config/templates/online_template_full.yaml | 2 +- tests/unit/commands/test_benchmark.py | 2 +- 10 files changed, 13 insertions(+), 13 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index e6182d198..907a25109 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -115,7 +115,7 @@ The aggregator is a separate process (`python -m inference_endpoint.async_utils. - **Series storage**: each `SeriesSampler` keeps three parallel views: O(1) cheap rollups (count/total/min/max/sum_sq, exact), an HDR Histogram (cheap live percentiles), and an in-memory `array.array` of raw values (for exact percentiles in the `COMPLETE` snapshot). Hot path is `registry.record(name, value)` — no allocation, no I/O. - **Counter API**: `registry.increment(name, delta=1)` for sample-event counters. 
`registry.set_counter(name, value)` only for the two duration counters (`total_duration_ns` max-of-elapsed, `tracked_duration_ns` sum-of-blocks). -- **Lifecycle**: `INITIALIZE` (constructed, awaiting first `STARTED`) → `LIVE` (run in progress, ticking every `--publish-interval` seconds) → `DRAINING` (set on `ENDED`; tick continues; bounded by `--drain-timeout` budget, default 300 s) → terminal: `COMPLETE` (clean end via `publish_final`, exact stats) **or** `INTERRUPTED` (signal-handler-triggered final via SIGTERM/SIGINT; best-effort partial stats). Drain timeout detected by consumers as `state == COMPLETE and n_pending_tasks > 0`; interrupted runs are detected as `state == INTERRUPTED` directly. +- **Lifecycle**: `INITIALIZE` (constructed, awaiting first `STARTED`) → `LIVE` (run in progress, ticking every `--publish-interval` seconds) → `DRAINING` (set on `ENDED`; tick continues; bounded by `--drain-timeout` budget, default 60 s) → terminal: `COMPLETE` (clean end via `publish_final`, exact stats) **or** `INTERRUPTED` (signal-handler-triggered final via SIGTERM/SIGINT; best-effort partial stats). Drain timeout detected by consumers as `state == COMPLETE and n_pending_tasks > 0`; interrupted runs are detected as `state == INTERRUPTED` directly. - **Final delivery is dual-path with separated concerns**: `publish_final` atomically writes `final_snapshot.json` (`tmp + fsync(file) + rename + fsync(parent_dir)`) — this is the **primary** Report source — AND emits the terminal-state snapshot over pub/sub as a TUI shutdown signal. Each path is wrapped in its own try/except so one failure cannot suppress the other. Main process consumer reads `final_snapshot.json` (via `json.loads` to dict, no Struct decode); falls back to the subscriber's `latest` live snapshot only if the file is missing (e.g. SIGKILL / OOM before the signal handler ran). The dict form is the canonical consumer contract (see `snapshot_to_dict`). 
- **Histogram bucket edges are dynamic per snapshot**: log-spaced over the observed `[min, max]`. Bucket count is fixed at construction; consumers MUST re-render from the snapshot's `(lo, hi, count)` triples each frame and MUST NOT track bucket-by-index across snapshots. diff --git a/docs/async_utils/services/metrics_aggregator/DESIGN.md b/docs/async_utils/services/metrics_aggregator/DESIGN.md index 882683968..207c83889 100644 --- a/docs/async_utils/services/metrics_aggregator/DESIGN.md +++ b/docs/async_utils/services/metrics_aggregator/DESIGN.md @@ -38,7 +38,7 @@ INITIALIZE ──STARTED──► LIVE ──ENDED──► DRAINING ──► C - **LIVE**: the publisher tick task emits a snapshot every `--publish-interval` seconds (default 0.25 s). - **DRAINING**: entered on `ENDED`; the buffered tokenizations are flushed, - bounded by the `--drain-timeout` budget (default 300 s; `0` = unlimited). + bounded by the `--drain-timeout` budget (default 60 s; `0` = unlimited). - The ENDED path runs inside a finalization boundary: whatever the drain does — finish, time out, or fail — `publish_final` and the shutdown signal always run. 
A tokenizer failure can degrade the snapshot (see the @@ -174,7 +174,7 @@ COMPLETE event ─► TokenTrigger.fire ─► queue.enqueue(text, on_count) [ | `--metrics-socket` | required | Snapshot PUB socket name | | `--metrics-output-dir` | required | Directory for `final_snapshot.json` | | `--publish-interval` | 0.25 | Live snapshot cadence (seconds) | -| `--drain-timeout` | 300.0 | End-of-run tokenize budget (`0` = unlimited) | +| `--drain-timeout` | 60.0 | End-of-run tokenize budget (`0` = unlimited) | | `--tokenizer` | none | HF name or local path; unset disables token metrics | | `--tokenizer-workers` | 2 | Live in-process threads (`0` = defer all to drain) | | `--streaming` | off | Register TTFT/chunk-delta/TPOT triggers | diff --git a/src/inference_endpoint/async_utils/services/metrics_aggregator/__main__.py b/src/inference_endpoint/async_utils/services/metrics_aggregator/__main__.py index e3d136ab0..2e042943c 100644 --- a/src/inference_endpoint/async_utils/services/metrics_aggregator/__main__.py +++ b/src/inference_endpoint/async_utils/services/metrics_aggregator/__main__.py @@ -133,11 +133,11 @@ async def main() -> None: parser.add_argument( "--drain-timeout", type=float, - default=300.0, + default=60.0, help=( "Wall-clock budget (seconds) to finish tokenizing buffered samples " "after ENDED before the aggregator emits the final snapshot with " - "n_pending_tasks > 0 (default: 300.0; 0 = wait indefinitely). Increase " + "n_pending_tasks > 0 (default: 60.0; 0 = wait indefinitely). Increase " "for very large datasets where the end-of-run tokenize batch is big." 
), ) diff --git a/src/inference_endpoint/async_utils/services/metrics_aggregator/aggregator.py b/src/inference_endpoint/async_utils/services/metrics_aggregator/aggregator.py index 233342b46..686e5eb2f 100644 --- a/src/inference_endpoint/async_utils/services/metrics_aggregator/aggregator.py +++ b/src/inference_endpoint/async_utils/services/metrics_aggregator/aggregator.py @@ -96,7 +96,7 @@ class MetricCounterKey(str, Enum): _TOKEN_HDR_LOW: Final[int] = 1 _TOKEN_HDR_HIGH: Final[int] = 10_000_000 # 10M tokens -_DEFAULT_DRAIN_TIMEOUT_S: Final[float] = 300.0 +_DEFAULT_DRAIN_TIMEOUT_S: Final[float] = 60.0 class MetricsAggregatorService(ZmqMessageSubscriber[EventRecord]): diff --git a/src/inference_endpoint/async_utils/services/metrics_aggregator/snapshot.py b/src/inference_endpoint/async_utils/services/metrics_aggregator/snapshot.py index e233f36a3..eacac94f5 100644 --- a/src/inference_endpoint/async_utils/services/metrics_aggregator/snapshot.py +++ b/src/inference_endpoint/async_utils/services/metrics_aggregator/snapshot.py @@ -45,7 +45,7 @@ class SessionState(str, Enum): LIVE → run in progress; tick task publishing live HDR-derived stats. DRAINING → ``SessionEventType.ENDED`` has been received; the aggregator is tokenizing the buffered samples (bounded by the - ``--drain-timeout`` budget, default 300 s). Tick task + ``--drain-timeout`` budget, default 60 s). Tick task continues at this stage, still HDR-derived; no new events will arrive. COMPLETE → terminal clean state. The ``publish_final()`` snapshot diff --git a/src/inference_endpoint/config/schema.py b/src/inference_endpoint/config/schema.py index 6a8b9b872..0a59074f5 100644 --- a/src/inference_endpoint/config/schema.py +++ b/src/inference_endpoint/config/schema.py @@ -584,11 +584,11 @@ class DrainConfig(BaseModel): ), ), ] = Field( - 300.0, + 60.0, ge=0, description=( "Wall-clock budget (seconds) to finish tokenizing buffered samples " - "after ENDED (default: 300.0; 0 = unlimited)." 
+ "after ENDED (default: 60.0; 0 = unlimited)." ), ) diff --git a/src/inference_endpoint/config/templates/concurrency_template_full.yaml b/src/inference_endpoint/config/templates/concurrency_template_full.yaml index 5132f5b0e..75feab6fb 100644 --- a/src/inference_endpoint/config/templates/concurrency_template_full.yaml +++ b/src/inference_endpoint/config/templates/concurrency_template_full.yaml @@ -79,7 +79,7 @@ settings: warmup_timeout_s: 240.0 # Warmup drain timeout in seconds (None = wait indefinitely) performance_timeout_s: 240.0 # Performance drain timeout in seconds (None = wait indefinitely) accuracy_timeout_s: null # Accuracy drain timeout in seconds (None = wait indefinitely) - metrics_drain_timeout_s: 300.0 # Wall-clock budget (seconds) to finish tokenizing buffered samples after ENDED (default: 300.0; 0 = unlimited). + metrics_drain_timeout_s: 60.0 # Wall-clock budget (seconds) to finish tokenizing buffered samples after ENDED (default: 60.0; 0 = unlimited). warmup: enabled: false # Enable warmup phase before performance run n_requests: null # Warmup request count (None = full dataset once) diff --git a/src/inference_endpoint/config/templates/offline_template_full.yaml b/src/inference_endpoint/config/templates/offline_template_full.yaml index e3ec95284..3ff1ccd17 100644 --- a/src/inference_endpoint/config/templates/offline_template_full.yaml +++ b/src/inference_endpoint/config/templates/offline_template_full.yaml @@ -79,7 +79,7 @@ settings: warmup_timeout_s: 240.0 # Warmup drain timeout in seconds (None = wait indefinitely) performance_timeout_s: 240.0 # Performance drain timeout in seconds (None = wait indefinitely) accuracy_timeout_s: null # Accuracy drain timeout in seconds (None = wait indefinitely) - metrics_drain_timeout_s: 300.0 # Wall-clock budget (seconds) to finish tokenizing buffered samples after ENDED (default: 300.0; 0 = unlimited). 
+ metrics_drain_timeout_s: 60.0 # Wall-clock budget (seconds) to finish tokenizing buffered samples after ENDED (default: 60.0; 0 = unlimited). warmup: enabled: false # Enable warmup phase before performance run n_requests: null # Warmup request count (None = full dataset once) diff --git a/src/inference_endpoint/config/templates/online_template_full.yaml b/src/inference_endpoint/config/templates/online_template_full.yaml index 73c0b69d4..1287b99af 100644 --- a/src/inference_endpoint/config/templates/online_template_full.yaml +++ b/src/inference_endpoint/config/templates/online_template_full.yaml @@ -79,7 +79,7 @@ settings: warmup_timeout_s: 240.0 # Warmup drain timeout in seconds (None = wait indefinitely) performance_timeout_s: 240.0 # Performance drain timeout in seconds (None = wait indefinitely) accuracy_timeout_s: null # Accuracy drain timeout in seconds (None = wait indefinitely) - metrics_drain_timeout_s: 300.0 # Wall-clock budget (seconds) to finish tokenizing buffered samples after ENDED (default: 300.0; 0 = unlimited). + metrics_drain_timeout_s: 60.0 # Wall-clock budget (seconds) to finish tokenizing buffered samples after ENDED (default: 60.0; 0 = unlimited). 
warmup: enabled: false # Enable warmup phase before performance run n_requests: null # Warmup request count (None = full dataset once) diff --git a/tests/unit/commands/test_benchmark.py b/tests/unit/commands/test_benchmark.py index 7d43017dc..4109ebfc2 100644 --- a/tests/unit/commands/test_benchmark.py +++ b/tests/unit/commands/test_benchmark.py @@ -489,7 +489,7 @@ def test_defaults(self): assert cfg.warmup_timeout_s == 240.0 assert cfg.performance_timeout_s == 240.0 assert cfg.accuracy_timeout_s is None - assert cfg.metrics_drain_timeout_s == 300.0 + assert cfg.metrics_drain_timeout_s == 60.0 @pytest.mark.unit @pytest.mark.parametrize( From 033d724b797bfb351b563f8f7661df8376326de9 Mon Sep 17 00:00:00 2001 From: Viraat Chandra Date: Wed, 10 Jun 2026 17:03:37 -0700 Subject: [PATCH 15/20] chore(metrics): drop dead local in the live-cancel handler Co-Authored-By: Claude Opus 4.8 (1M context) --- .../async_utils/services/metrics_aggregator/token_metrics.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py b/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py index 1ae3c5ce7..daa3f1424 100644 --- a/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py +++ b/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py @@ -614,7 +614,6 @@ async def flush(self, live: bool = False) -> None: if live: self._text[:0] = text_items self._msg[:0] = msg_items - msg_items = [] raise except Exception as exc: # noqa: BLE001 — isolate phases. failure = exc From 6b704332d0cdefb26929186281d221a69492652d Mon Sep 17 00:00:00 2001 From: Viraat Chandra Date: Wed, 10 Jun 2026 17:42:20 -0700 Subject: [PATCH 16/20] refactor(metrics): single-source service defaults in schema; tighten docs metrics_tokenizer_workers returns to DrainConfig (default 2, ge=0; 0 = defer all to drain) and execute.py forwards it again. 
--drain-timeout and --tokenizer-workers become required service args; the aggregator ctor and BatchTokenizer lose their duplicated defaults. Docs and comments trimmed. Co-Authored-By: Claude Opus 4.8 (1M context) --- AGENTS.md | 2 +- .../services/metrics_aggregator/DESIGN.md | 246 +++++++----------- .../services/metrics_aggregator/__main__.py | 97 ++----- .../services/metrics_aggregator/aggregator.py | 44 +--- .../services/metrics_aggregator/snapshot.py | 2 +- .../metrics_aggregator/token_metrics.py | 111 ++++---- .../commands/benchmark/execute.py | 6 + src/inference_endpoint/config/schema.py | 18 ++ .../templates/concurrency_template_full.yaml | 1 + .../templates/offline_template_full.yaml | 1 + .../templates/online_template_full.yaml | 1 + .../services/metrics_aggregator/conftest.py | 2 + .../test_aggregator_error_handler.py | 1 + .../metrics_aggregator/test_token_metrics.py | 24 +- tests/unit/commands/test_benchmark.py | 15 +- 15 files changed, 222 insertions(+), 349 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index 907a25109..c9e0c7f41 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -115,7 +115,7 @@ The aggregator is a separate process (`python -m inference_endpoint.async_utils. - **Series storage**: each `SeriesSampler` keeps three parallel views: O(1) cheap rollups (count/total/min/max/sum_sq, exact), an HDR Histogram (cheap live percentiles), and an in-memory `array.array` of raw values (for exact percentiles in the `COMPLETE` snapshot). Hot path is `registry.record(name, value)` — no allocation, no I/O. - **Counter API**: `registry.increment(name, delta=1)` for sample-event counters. `registry.set_counter(name, value)` only for the two duration counters (`total_duration_ns` max-of-elapsed, `tracked_duration_ns` sum-of-blocks). 
-- **Lifecycle**: `INITIALIZE` (constructed, awaiting first `STARTED`) → `LIVE` (run in progress, ticking every `--publish-interval` seconds) → `DRAINING` (set on `ENDED`; tick continues; bounded by `--drain-timeout` budget, default 60 s) → terminal: `COMPLETE` (clean end via `publish_final`, exact stats) **or** `INTERRUPTED` (signal-handler-triggered final via SIGTERM/SIGINT; best-effort partial stats). Drain timeout detected by consumers as `state == COMPLETE and n_pending_tasks > 0`; interrupted runs are detected as `state == INTERRUPTED` directly. +- **Lifecycle**: `INITIALIZE` (constructed, awaiting first `STARTED`) → `LIVE` (run in progress, ticking every `--publish-interval` seconds) → `DRAINING` (set on `ENDED`; tick continues; bounded by the `--drain-timeout` budget — schema default 60 s) → terminal: `COMPLETE` (clean end via `publish_final`, exact stats) **or** `INTERRUPTED` (signal-handler-triggered final via SIGTERM/SIGINT; best-effort partial stats). Drain timeout detected by consumers as `state == COMPLETE and n_pending_tasks > 0`; interrupted runs are detected as `state == INTERRUPTED` directly. - **Final delivery is dual-path with separated concerns**: `publish_final` atomically writes `final_snapshot.json` (`tmp + fsync(file) + rename + fsync(parent_dir)`) — this is the **primary** Report source — AND emits the terminal-state snapshot over pub/sub as a TUI shutdown signal. Each path is wrapped in its own try/except so one failure cannot suppress the other. Main process consumer reads `final_snapshot.json` (via `json.loads` to dict, no Struct decode); falls back to the subscriber's `latest` live snapshot only if the file is missing (e.g. SIGKILL / OOM before the signal handler ran). The dict form is the canonical consumer contract (see `snapshot_to_dict`). - **Histogram bucket edges are dynamic per snapshot**: log-spaced over the observed `[min, max]`. 
Bucket count is fixed at construction; consumers MUST re-render from the snapshot's `(lo, hi, count)` triples each frame and MUST NOT track bucket-by-index across snapshots. diff --git a/docs/async_utils/services/metrics_aggregator/DESIGN.md b/docs/async_utils/services/metrics_aggregator/DESIGN.md index 207c83889..42929f5a4 100644 --- a/docs/async_utils/services/metrics_aggregator/DESIGN.md +++ b/docs/async_utils/services/metrics_aggregator/DESIGN.md @@ -1,189 +1,119 @@ -# Metrics Aggregator Service — Design Document - -## Overview - -The metrics aggregator is a **subprocess** (`python -m -inference_endpoint.async_utils.services.metrics_aggregator`) that subscribes to -the EventRecord pub/sub stream, folds per-sample events into a -`MetricsRegistry` (counters + HDR-histogram series + raw values), and publishes -`MetricsSnapshot` frames over an IPC PUB socket at a fixed cadence. At -end-of-run it atomically writes `final_snapshot.json`, which is the **primary** -source for `Report`; the terminal pub/sub frame is only a TUI "run finished" -signal. - -This document covers the service's lifecycle and, in depth, the **token -metrics pipeline** — how ISL/OSL/TPOT tokenization keeps pace with -high-completion-rate runs. 
- -## Module Layout - -| File | Purpose | -| ------------------ | ------------------------------------------------------------------------------- | -| `__main__.py` | Subprocess entry: argparse, strict tokenizer startup, lifecycle wiring, SIGTERM | -| `aggregator.py` | `MetricsAggregatorService` — event router, session state, drain | -| `registry.py` | `MetricsRegistry`, `CounterSampler`, `SeriesSampler` | -| `snapshot.py` | `MetricsSnapshot` wire schema, `SessionState`, msgpack codec | -| `publisher.py` | `MetricsPublisher` — tick task + atomic final-snapshot write | -| `subscriber.py` | `MetricsSnapshotSubscriber` — main-process consumer | -| `metrics_table.py` | In-flight sample rows + trigger dispatch (TTFT/TPOT/ISL/OSL) | -| `token_metrics.py` | `BatchTokenizer` (sharded batch tokenization) + `TokenBatchQueue` | +# Metrics Aggregator Service — Design + +The metrics aggregator is a subprocess (`python -m +inference_endpoint.async_utils.services.metrics_aggregator`) that subscribes +to the EventRecord stream, folds per-sample events into a `MetricsRegistry`, +and publishes `MetricsSnapshot` frames over IPC PUB at a fixed cadence. At +end-of-run it atomically writes `final_snapshot.json` — the **primary** source +for `Report`; the terminal pub/sub frame is only a TUI "run finished" signal. ## Lifecycle ``` INITIALIZE ──STARTED──► LIVE ──ENDED──► DRAINING ──► COMPLETE - └──► INTERRUPTED (SIGTERM/SIGINT) + └──► INTERRUPTED (SIGTERM) ``` -- **LIVE**: the publisher tick task emits a snapshot every - `--publish-interval` seconds (default 0.25 s). -- **DRAINING**: entered on `ENDED`; the buffered tokenizations are flushed, - bounded by the `--drain-timeout` budget (default 60 s; `0` = unlimited). -- The ENDED path runs inside a finalization boundary: whatever the drain does - — finish, time out, or fail — `publish_final` and the shutdown signal always - run. 
A tokenizer failure can degrade the snapshot (see the - `n_pending_tasks` contract below) but can never hang the subprocess. -- **INTERRUPTED**: a signal handler writes a best-effort partial final - snapshot so `Report` can distinguish a killed run from a clean one. - -## Token Metrics Pipeline - -ISL, OSL, and TPOT all require running the HF tokenizer over prompt or -completion text. With streaming on, each completed sample needs up to three -tokenizer passes, so at high completion rates tokenization is the service's -dominant CPU cost — and a per-event dispatch model cannot keep up: work -arriving faster than it drains accumulates an unbounded backlog that must be -paid at end-of-run. The pipeline is therefore built around two ideas: -**defer-to-flush batching** and **process-sharded batch encoding**. +The ENDED path runs inside a finalization boundary: whatever the drain does — +finish, time out, or fail — `publish_final` and the shutdown signal always +run. A tokenizer failure can degrade the snapshot (see the `n_pending_tasks` +contract) but can never hang the subprocess. SIGTERM writes a best-effort +partial snapshot tagged `INTERRUPTED`. + +## Token metrics pipeline + +ISL/OSL/TPOT require tokenizer passes per completed sample; at high completion +rates a per-event dispatch model accumulates an unbounded backlog. The +pipeline batches instead: **defer-to-flush** + **process-sharded encoding**. ### Defer-to-flush (`TokenBatchQueue`) -Token triggers do no work at event time. `fire()` appends -`(text, on_count)` — or `(message_parts, on_count)` for chat-template items — -to a buffer, an O(1) operation with no event-loop tasks. The buffer is cleared -in batches at exactly two points: - -1. 
**The queue's own live loop** — `start_live(interval)` flushes - periodically (at the publish cadence) through the tokenizer's **in-process - live lane**: a small thread pool of `--tokenizer-workers` threads - (default 2) whose rayon pool is capped to the same width, taking at most - `_LIVE_FLUSH_MAX_ITEMS` per flush so the queue lock is never held for a - long encode. Live flushes never touch the shard processes; they run inside - the aggregator process, wherever the parent placed it. - `--tokenizer-workers 0` disables mid-run tokenization entirely. Failures - are logged once and never stop the loop — failed or cancelled live items - are **re-queued** so the drain retries them. +Triggers do no work at event time — `fire()` appends `(text, on_count)` to a +buffer, O(1), no tasks. The buffer is cleared at two points: + +1. **Live loop** — `start_live(interval)` flushes periodically through the + tokenizer's in-process lane: `--tokenizer-workers` threads, rayon capped + to the same width, at most `_LIVE_FLUSH_MAX_ITEMS` per flush. Never + touches the shard processes. `0` disables mid-run tokenization. Failed or + cancelled live items are **re-queued** — the drain retries them. 2. **End-of-run** — `flush_remaining(timeout)` stops the live loop and drains - everything still buffered through **every** shard, bounded by the drain - budget. The publisher knows nothing about tokenization — it only reads - `(state, n_pending_tasks)`. - -`flush()` serializes under an asyncio lock and detaches the buffer up front, -so enqueues that race a flush land in the next one. Failure isolation is -layered: the plain-text phase and the chat-template phase fail independently -(in drain mode they run on separate executors, so a dead text shard must not -drop message items), a raising recorder callback is logged without aborting -the rest of the batch, and the first error is re-raised only after both -phases ran. 
Live-mode failures and cancellations re-queue the detached items -(a mid-run hiccup never loses samples); drain-mode failures are terminal — -the items stay counted in `pending`. `flush_remaining` never raises — a -timeout or tokenizer failure becomes a logged, non-zero pending count. + everything left through every shard, bounded by the drain budget. + +`flush()` serializes under an asyncio lock and detaches the buffer up front. +The text and chat-template phases fail independently; a raising recorder is +logged without aborting the batch. Drain failures are terminal — items stay +counted in `pending`. `flush_remaining` never raises. ### Sharded batch encoding (`BatchTokenizer`) -The end-of-run drain hands the whole buffer to `count_texts_async`, which splits it into -contiguous chunks and fans them out across worker **processes**, one pinned to -each block of `CORES_PER_WORKER` (8) cores. Why this shape: - -- Each worker runs the raw `tokenizers` backend's `encode_batch_fast` — Rust, - rayon-parallel, no Python-per-text cost. Batching amortizes the - submit/result overhead over thousands of texts. -- A single BPE rayon pool is memory-bound and saturates at ~8 cores; more - threads oversubscribe and, on multi-socket parts, cross the NUMA boundary. - Sharding across processes pinned to disjoint 8-core blocks (affinity set - **before** the backend loads, so each rayon pool sizes itself to its block - and stays NUMA-local) is how the whole machine is used. -- Workers are spawn-context processes with module-level entry points (pickled - by name), warmed in parallel at construction so N tokenizer loads do not - serialize (the warmup wait is bounded — a hung load is a startup error, not - a wedge), and they ignore SIGINT — Ctrl-C goes to the whole process group, - and worker lifetime must stay under the parent drain's control. - -The shard pool has no CLI knob: it always auto-sizes to one shard per -8-core block of the allowed CPU universe (always at least one). 
-`--tokenizer-workers` sizes the **live** in-process thread lane instead -(default 2; `0` = no mid-run tokenization). There is no implicit fallback: an -environment that cannot shard — no fast Rust backend, a failed or over-budget -warmup — is a startup error, because a silent in-process slow path cannot -keep up with completions and would surface much later as an incomplete drain. -Platforms without a CPU-affinity API (e.g. macOS) still shard at full speed, -just unpinned: blocks are sized from the online CPU count and each worker -caps its rayon pool to the block size instead of pinning. - -Chat-template items (tool-call outputs) take a separate in-process thread: -they are rare relative to the batched flush, and `apply_chat_template` is -Python/Jinja — sharding buys nothing. A template baseline (the empty -assistant-message frame) is computed once and subtracted so only the payload -is counted. - -### CPU affinity: the tokenizer stage is post-run - -The benchmark parent pins itself to the loadgen cores (the fastest -perf-ranked physical cores) before launching services, and subprocesses -inherit that narrow mask. The tokenizer's heavy work happens **after** the -run, so the run-time core partition does not apply to it — but the aggregator -itself must not move: `_setup_shards` probes the full allowed universe via -`expand_to_all_online_cpus()` (see `endpoint_client/cpu_affinity.py`; the -kernel still clamps to the cgroup/Slurm cpuset) **and then restores the -inherited mask**, so the event loop, the publisher, and the live tokenizer -threads stay exactly where the parent placed them. Only the drain-phase shard -children, which pin themselves to their own 8-core blocks, span the whole -machine — and they are idle until `ENDED`. +The drain fans the whole buffer out across worker **processes**, one pinned +per `CORES_PER_WORKER` (8) core block. 
Each worker runs the raw `tokenizers` +backend's `encode_batch_fast` (Rust, rayon); a single BPE rayon pool +saturates ~8 cores, so disjoint pinned blocks are how the whole machine is +used. Workers are spawn-context, warmed in parallel at construction (bounded +— a hung load is a startup error), and ignore SIGINT. -### The `n_pending_tasks` contract +The shard pool has no knob: it auto-sizes to one shard per 8-core block of +the allowed CPU universe. There is no fallback — no fast Rust backend, or a +failed/over-budget warmup, is a startup error, because an in-process slow +path cannot keep up and would surface much later as an incomplete drain. +Platforms without an affinity API (macOS) shard unpinned; each worker caps +its rayon pool to the block size instead. -`TokenBatchQueue.pending` counts enqueued-but-not-yet-recorded items and is -surfaced on every snapshot as `n_pending_tasks`. In the **final** snapshot: +Chat-template items (tool calls) run on the in-process thread lane — +`apply_chat_template` is Python/Jinja; sharding buys nothing. -- `state == complete && n_pending_tasks == 0` — clean run, token series exact. -- `state == complete && n_pending_tasks > 0` — **incomplete drain**: the - end-of-run flush ran out of budget or the tokenizer failed; token-derived - series are missing exactly that many samples. `Report` renders a warning. +### CPU affinity: tokenize is post-run -Items dropped by a failed flush are intentionally _not_ removed from the -pending count — under-reporting an incomplete drain would silently rebadge it -as a clean run. +The parent pins itself to the loadgen cores and children inherit that narrow +mask. `_setup_shards` probes the full allowed universe via +`expand_to_all_online_cpus()` (cgroup/Slurm-clamped) for the block math, +**then restores the inherited mask** — the aggregator stays where the parent +placed it; only the drain-phase shard children span the machine, and they +are idle until `ENDED`. 
+ +### The `n_pending_tasks` contract + +`TokenBatchQueue.pending` (enqueued-but-not-recorded) is surfaced on every +snapshot as `n_pending_tasks`. In the final snapshot: + +- `state == complete && n_pending_tasks == 0` — clean run, exact series. +- `state == complete && n_pending_tasks > 0` — **incomplete drain** (budget + exhausted or tokenizer failed); `Report` renders a warning. Failed items + are deliberately not removed from the count — under-reporting would + rebadge an incomplete drain as clean. ### Data flow ``` -COMPLETE event ─► TokenTrigger.fire ─► queue.enqueue(text, on_count) [O(1)] - │ - live loop (0.25 s) ── flush(live) ───────┤─► in-process thread pool - │ (rayon capped to --tokenizer-workers) - ENDED drain (budgeted) ── flush() ───────┘─► chunks ─► N pinned worker procs - │ (encode_batch_fast) - └─► on_count(n) ─► registry.record() +COMPLETE event ─► trigger.fire ─► queue.enqueue(text, on_count) [O(1)] + │ + live loop (publish cadence) ─ flush(live) ─► in-process threads (rayon-capped) + ENDED drain (budgeted) ────── flush() ─────► chunks ─► N pinned worker procs + └─► on_count(n) ─► registry.record() ``` -## CLI Interface +## CLI + +| Flag | Default | Purpose | +| -------------------------------- | ----------------- | --------------------------------------------------- | +| `--socket-dir` / `--socket-name` | required | EventRecord SUB socket | +| `--metrics-socket` | required | Snapshot PUB socket name | +| `--metrics-output-dir` | required | Directory for `final_snapshot.json` | +| `--publish-interval` | 0.25 | Live snapshot cadence (seconds) | +| `--drain-timeout` | required (schema) | End-of-run tokenize budget (`0` = unlimited) | +| `--tokenizer` | none | HF name or local path; unset disables token metrics | +| `--tokenizer-workers` | required (schema) | Live in-process threads (`0` = defer all to drain) | +| `--streaming` | off | Register TTFT/chunk-delta/TPOT triggers | -| Flag | Default | Purpose | -| -------------------------------- | 
-------- | --------------------------------------------------- | -| `--socket-dir` / `--socket-name` | required | EventRecord SUB socket | -| `--metrics-socket` | required | Snapshot PUB socket name | -| `--metrics-output-dir` | required | Directory for `final_snapshot.json` | -| `--publish-interval` | 0.25 | Live snapshot cadence (seconds) | -| `--drain-timeout` | 60.0 | End-of-run tokenize budget (`0` = unlimited) | -| `--tokenizer` | none | HF name or local path; unset disables token metrics | -| `--tokenizer-workers` | 2 | Live in-process threads (`0` = defer all to drain) | -| `--streaming` | off | Register TTFT/chunk-delta/TPOT triggers | +`--drain-timeout` and `--tokenizer-workers` carry no service-side defaults: +the benchmark always forwards them from `config/schema.py` +(`--metrics-drain-timeout`, `--metrics-tokenizer-workers`), the single source +of truth for their values. ## References - [docs/async_utils/services/DESIGN.md](../DESIGN.md) — the EventRecord pub/sub system this service subscribes to. - [docs/PERF_ARCHITECTURE.md](../../../PERF_ARCHITECTURE.md) — CPU pinning - strategy for the loadgen/worker hot path. -- AGENTS.md "Metrics Aggregator subprocess" — the condensed contract summary - for AI agents. + for the loadgen/worker hot path. diff --git a/src/inference_endpoint/async_utils/services/metrics_aggregator/__main__.py b/src/inference_endpoint/async_utils/services/metrics_aggregator/__main__.py index 2e042943c..58733c087 100644 --- a/src/inference_endpoint/async_utils/services/metrics_aggregator/__main__.py +++ b/src/inference_endpoint/async_utils/services/metrics_aggregator/__main__.py @@ -49,27 +49,18 @@ def _make_sigterm_handler( ) -> tuple[Callable[[], None], set[asyncio.Task]]: """Build the SIGTERM handler that writes the INTERRUPTED final snapshot. - Returns ``(handler, pending_tasks)``. 
``pending_tasks`` is the - strong-reference container that keeps spawned finalize tasks alive - while they run: asyncio tracks tasks only by weakref, so a task - whose only reference is the local variable inside the handler can - be garbage-collected mid-execution (per Python's asyncio docs). - Each spawned task self-removes from the set via - ``add_done_callback`` once it completes. - - Exposed at module level (rather than nested in ``main()``) so the - GC-safety contract is unit-testable without driving the whole - subprocess lifecycle. + Returns ``(handler, pending_tasks)``: asyncio holds tasks only by + weakref, so the handler's finalize task must live in this + strong-reference set until done. Module-level so the GC-safety + contract is unit-testable. """ pending_tasks: set[asyncio.Task] = set() async def _signal_finalize() -> None: try: - # Mirror the ENDED-driven path: refresh tracked_duration_ns - # from the table BEFORE publish_final, otherwise an - # interrupted run whose STOP_PERFORMANCE_TRACKING never - # fired would report duration_ns=0 and QPS=N/A in the final - # report even after processing many tracked samples. + # Refresh tracked_duration_ns before publish_final (mirrors the + # ENDED path) — otherwise an interrupted run whose + # STOP_PERFORMANCE_TRACKING never fired reports QPS=N/A. registry.set_counter( MetricCounterKey.TRACKED_DURATION_NS.value, table.total_tracked_duration_ns, @@ -133,12 +124,11 @@ async def main() -> None: parser.add_argument( "--drain-timeout", type=float, - default=60.0, + required=True, help=( "Wall-clock budget (seconds) to finish tokenizing buffered samples " - "after ENDED before the aggregator emits the final snapshot with " - "n_pending_tasks > 0 (default: 60.0; 0 = wait indefinitely). Increase " - "for very large datasets where the end-of-run tokenize batch is big." + "after ENDED (0 = wait indefinitely). The benchmark forwards " + "--metrics-drain-timeout; the default lives in config/schema.py." 
), ) parser.add_argument( @@ -162,12 +152,12 @@ async def main() -> None: parser.add_argument( "--tokenizer-workers", type=int, - default=2, + required=True, help=( "In-process tokenizer threads for live (mid-run) ISL/OSL/TPOT " - "(default: 2; 0 = no mid-run tokenization, everything defers " - "to the end-of-run drain). The drain always uses the auto-sized " - "sharded pool — one worker process per 8-core block." + "(0 = defer everything to the end-of-run drain, which always uses " + "the auto-sized sharded pool). The benchmark forwards " + "--metrics-tokenizer-workers; the default lives in config/schema.py." ), ) parser.add_argument( @@ -194,12 +184,8 @@ async def main() -> None: if args.tokenizer_workers < 0: raise SystemExit("FATAL: --tokenizer-workers must be >= 0") - # The parent owns directory setup — `commands/benchmark/execute.py` - # creates `/metrics/` and validates it before launching - # this subprocess. Validate here as a fail-fast contract check so a - # misbehaving launcher (or a manual invocation) surfaces a clear - # error in this subprocess's stderr instead of crashing later on - # the atomic-write path. + # The parent (commands/benchmark/execute.py) owns directory creation; + # fail fast here so a bad launcher errors now, not on the atomic write. metrics_output_dir: Path = args.metrics_output_dir if not metrics_output_dir.is_dir(): raise SystemExit( @@ -220,9 +206,8 @@ async def main() -> None: args.tokenizer, live_workers=args.tokenizer_workers ) except RuntimeError as exc: - # Fail-fast contract: a tokenizer environment that cannot shard - # must surface as a clear service-launch failure, not a silent - # slow path that cannot keep up with completions. + # An environment that cannot shard is a launch failure, not a + # silent slow path that cannot keep up with completions. 
raise SystemExit(f"FATAL: {exc}") from exc else: tokenizer_cm = nullcontext() @@ -260,29 +245,12 @@ async def main() -> None: ) aggregator.start() - # SIGTERM only — the parent's ServiceLauncher.kill_all uses - # SIGTERM to kill the aggregator child before an ENDED event - # arrives; without this handler that path leaves the Report - # consumer with no final_snapshot file. The signal-triggered - # snapshot is tagged INTERRUPTED so Report can distinguish - # "parent killed the run" from a clean shutdown. - # publish_final is idempotent (see - # MetricsPublisher._finalized), so racing with the - # ENDED-driven call is safe. - # - # SIGINT is deliberately NOT handled in the same way. On an - # interactive ^C, the OS sends SIGINT to the whole - # foreground process group — parent + child both receive - # it. If we finalized eagerly here, the aggregator would - # write final_snapshot.json from whatever state it had at - # signal time, then exit; samples that completed during the - # parent's own graceful shutdown window would never reach - # the file (the parent eventually emits ENDED on its events - # channel, but `_finalized=True` makes that a no-op). The - # parent's clean-shutdown path is what we want to drive the - # aggregator's finalize — so we install a no-op handler for - # SIGINT here, which prevents Python's default - # KeyboardInterrupt and lets the parent control the lifecycle. + # SIGTERM (ServiceLauncher.kill_all) must still produce a final + # snapshot, tagged INTERRUPTED; publish_final is idempotent, so + # racing the ENDED-driven call is safe. SIGINT (^C hits the whole + # process group) is a no-op: finalizing eagerly at signal time + # would freeze the snapshot before the parent's graceful-shutdown + # samples land — the parent's ENDED drives finalize instead. 
on_sigterm, _sigterm_tasks = _make_sigterm_handler( loop=loop, registry=registry, @@ -292,8 +260,6 @@ async def main() -> None: shutdown_event=shutdown_event, ) loop.add_signal_handler(signal.SIGTERM, on_sigterm) - # No-op SIGINT handler: silence the default KeyboardInterrupt - # and let the parent's ENDED-driven path drive shutdown. loop.add_signal_handler( signal.SIGINT, lambda: logger.info( @@ -313,24 +279,13 @@ async def main() -> None: if __name__ == "__main__": - # Surface startup / bind / tokenizer-load failures with structured - # context. Without this wrap, the parent's ServiceLauncher only sees - # the non-zero exit code and a raw traceback — no diagnostic context - # to correlate against the parent's logs. The except/raise pattern - # preserves the original exit code (1) and traceback while emitting - # the structured logger.exception line before the interpreter prints - # the trace. try: LoopManager().default_loop.run_until_complete(main()) except SystemExit: # argparse / explicit sys.exit — already user-facing, don't dress up. raise except Exception as e: - # Catch Exception (not BaseException) so KeyboardInterrupt / - # SystemExit propagate untouched — those are control-flow - # signals, not crashes, and labeling them as "crashed" would - # mislead operators. The exception type goes first in the log - # message so it's grep-able without scrolling through the - # traceback. + # Structured log line so the crash is grep-able against the parent's + # logs; KeyboardInterrupt/SystemExit propagate untouched. 
logger.exception("metrics aggregator subprocess crashed (%s)", type(e).__name__) raise diff --git a/src/inference_endpoint/async_utils/services/metrics_aggregator/aggregator.py b/src/inference_endpoint/async_utils/services/metrics_aggregator/aggregator.py index 686e5eb2f..0a6f2dfbd 100644 --- a/src/inference_endpoint/async_utils/services/metrics_aggregator/aggregator.py +++ b/src/inference_endpoint/async_utils/services/metrics_aggregator/aggregator.py @@ -96,8 +96,6 @@ class MetricCounterKey(str, Enum): _TOKEN_HDR_LOW: Final[int] = 1 _TOKEN_HDR_HIGH: Final[int] = 10_000_000 # 10M tokens -_DEFAULT_DRAIN_TIMEOUT_S: Final[float] = 60.0 - class MetricsAggregatorService(ZmqMessageSubscriber[EventRecord]): """Subscribes to EventRecords and computes per-sample metrics in real time. @@ -121,15 +119,11 @@ def __init__( live_flush_interval_s: float | None = None, streaming: bool = False, shutdown_event: asyncio.Event | None = None, - drain_timeout_s: float | None = _DEFAULT_DRAIN_TIMEOUT_S, + drain_timeout_s: float | None, **kwargs, ): - # drain_timeout_s is injected (not derived) because the right - # value is workload-dependent: long-context tokenize-heavy runs - # need more headroom than the default 60 s, and the aggregator - # itself can't measure that ahead of time. Keeping it as an arg - # lets the __main__ CLI flag plumb the user's choice through - # without coupling this class to argparse. + # drain_timeout_s has no default here: the one default lives in + # config/schema.py (metrics_drain_timeout_s). None = wait forever. 
super().__init__(EventRecordCodec(), *args, **kwargs) self._registry = registry self._publisher = publisher @@ -304,15 +298,9 @@ async def process(self, records: list[EventRecord]) -> None: else: if ev == SessionEventType.STARTED: if self._session_start_ns is not None: - # A duplicate STARTED is a producer bug: - # re-assigning _session_start_ns would freeze - # total_duration_ns (the max-of-elapsed guard - # never updates once the start moves forward) - # and corrupt every downstream rate calc for - # the rest of the run. Surface loudly and - # ignore — the publisher.start guard already - # rejects the second tick-task spawn, but - # session-state must also be defended here. + # Producer bug: re-assigning _session_start_ns + # would freeze total_duration_ns (max-of-elapsed + # guard) and corrupt every downstream rate calc. logger.error( "Duplicate STARTED event received " "(original at ts=%d, duplicate at ts=%d); " @@ -397,16 +385,13 @@ async def process(self, records: list[EventRecord]) -> None: # that fires before publish_final reflects the new state. self._session_state = SessionState.DRAINING logger.info("Draining %d pending tokenizations...", self.pending_tokens) - # The drain and final publish are wrapped together so the aggregator - # ALWAYS reaches _finalize (which sets the shutdown event); a - # tokenizer failure during the drain must not skip publish_final and - # leave main()'s `await shutdown_event.wait()` hanging. + # Drain + final publish run inside one finalization boundary: a + # tokenizer failure must not skip publish_final and leave + # main()'s `await shutdown_event.wait()` hanging. n_pending = self.pending_tokens try: - # flush_remaining tokenizes the whole buffer in one batched pass, - # bounded by the drain budget, and never raises: it returns the - # count it could not finish (timeout or failure), which becomes - # the snapshot's n_pending_tasks so Report flags an incomplete drain. 
+ # flush_remaining never raises; it returns the count it could + # not finish, which becomes the snapshot's n_pending_tasks. if self._token_queue is not None: n_pending = await self._token_queue.flush_remaining( self._drain_timeout_s @@ -432,11 +417,8 @@ async def process(self, records: list[EventRecord]) -> None: ) await self._publisher.publish_final(registry, n_pending_tasks=n_pending) finally: - # The aggregator MUST close the publisher and signal shutdown even - # if the drain/publish above failed — otherwise main()'s - # `await shutdown_event.wait()` hangs forever. aclose is - # independently wrapped: its failure must not prevent _finalize, - # which is what sets the shutdown event. + # aclose is independently wrapped: its failure must not + # prevent _finalize, which sets the shutdown event. try: await self._publisher.aclose() except Exception: # noqa: BLE001 — best-effort cleanup. diff --git a/src/inference_endpoint/async_utils/services/metrics_aggregator/snapshot.py b/src/inference_endpoint/async_utils/services/metrics_aggregator/snapshot.py index eacac94f5..a1e461c43 100644 --- a/src/inference_endpoint/async_utils/services/metrics_aggregator/snapshot.py +++ b/src/inference_endpoint/async_utils/services/metrics_aggregator/snapshot.py @@ -45,7 +45,7 @@ class SessionState(str, Enum): LIVE → run in progress; tick task publishing live HDR-derived stats. DRAINING → ``SessionEventType.ENDED`` has been received; the aggregator is tokenizing the buffered samples (bounded by the - ``--drain-timeout`` budget, default 60 s). Tick task + ``--drain-timeout`` budget — schema default 60 s). Tick task continues at this stage, still HDR-derived; no new events will arrive. COMPLETE → terminal clean state. 
The ``publish_final()`` snapshot diff --git a/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py b/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py index daa3f1424..d67d82f97 100644 --- a/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py +++ b/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py @@ -15,15 +15,12 @@ """Tokenization for ISL/OSL/TPOT metrics. -``BatchTokenizer`` tokenizes whole batches at once, sharded across worker -processes each pinned to a block of ``CORES_PER_WORKER`` cores (a single BPE -rayon pool is memory-bound and saturates ~8 cores). The aggregator buffers -per-sample text. The sharded pool is the drain-phase accelerator and is -auto-sized (one shard per core block); live mid-run flushes run on a small -in-process thread pool (``--tokenizer-workers``, default 2) owned by the -queue's live loop. A tokenizer without a fast (Rust) backend is a startup -error, never a silent slow path. Platforms without CPU affinity (e.g. macOS) -shard unpinned at full speed; only cache/NUMA locality is lost. +``BatchTokenizer`` runs two lanes: live mid-run flushes on a small in-process +thread pool (``--tokenizer-workers``), and the end-of-run drain sharded +across worker processes each pinned to a ``CORES_PER_WORKER`` block. +``TokenBatchQueue`` buffers per-sample work and clears it in batches. A +tokenizer without a fast (Rust) backend is a startup error, never a silent +slow path; platforms without CPU affinity (e.g. macOS) shard unpinned. """ from __future__ import annotations @@ -52,18 +49,14 @@ # used. Measured on GB200: ~16k texts/s at 18 blocks vs ~1.5k single-process. CORES_PER_WORKER = 8 -# Budget for the parallel shard warmup (spawn + transformers import + -# tokenizer load per worker). A hung load (e.g. 
a stuck network filesystem) -# must become a bounded startup error, not wedge service startup — and the -# error must fire before the parent's 30 s service-launch budget kills the -# subprocess, so the diagnostic wins the race. +# Warmup budget (spawn + transformers import + tokenizer load per worker). +# A hung load must become a startup error that fires before the parent's +# 30 s service-launch budget kills the subprocess. _SHARD_WARMUP_TIMEOUT_S = 25.0 -# Per-flush ceiling for the LIVE lane. Bounds three things at once: how long -# the queue lock is held mid-run, how much work an unstoppable in-flight -# thread encode can hold after a drain-start cancellation, and how much the -# drain re-encodes for items the cancelled flush gave back. The drain has no -# ceiling — it always takes the whole buffer. +# Per-flush ceiling for the LIVE lane: bounds the lock-hold time and the +# unstoppable in-flight encode a drain-start cancellation leaves behind. +# The drain has no ceiling — it always takes the whole buffer. _LIVE_FLUSH_MAX_ITEMS = 1024 # Minimal user message used to satisfy chat templates that reject assistant-only @@ -199,15 +192,16 @@ def __init__( self, tokenizer_name: str, *, + live_workers: int, cores_per_worker: int = CORES_PER_WORKER, n_workers: int = -1, - live_workers: int = 2, ) -> None: self._tokenizer_name = tokenizer_name - # The live lane runs in-process: cap this process's rayon pool so a - # mid-run batched encode uses ~live_workers cores, not the whole - # machine. Must be set before the first encode initializes the pool; - # setdefault lets an operator-exported RAYON_NUM_THREADS win. + # Cap this process's rayon pool so a live (in-process) batched encode + # uses ~live_workers cores, not the whole machine. Must be set before + # the first encode initializes the pool; setdefault lets an + # operator-exported RAYON_NUM_THREADS win. live_workers has no + # default — the one default lives in config/schema.py. 
os.environ.setdefault("RAYON_NUM_THREADS", str(max(1, live_workers))) self._live_workers = live_workers self._fallback_warned: set[str] = set() @@ -220,10 +214,9 @@ def __init__( max_workers=max(1, live_workers), thread_name_prefix="tok-thread" ) self._load_tokenizer() # also computes the chat-template baseline - # Process shards for the batched text path. Empty only when - # in-process mode was explicitly requested (n_workers=0 or - # cores_per_worker<=0; ctor overrides used primarily by tests — - # production wiring passes live_workers only and shards auto-size). + # Process shards for the drain. Empty only when in-process mode was + # explicitly requested (n_workers=0 / cores_per_worker<=0, test-only + # seams — production wiring always auto-sizes). self._procs: list[ProcessPoolExecutor] = [] self._setup_shards(cores_per_worker, n_workers) @@ -265,14 +258,11 @@ def _load_tokenizer(self) -> None: def _setup_shards(self, cores_per_worker: int, n_workers: int) -> None: """Spawn one pinned single-worker process per core block. - ``n_workers == 0`` explicitly selects in-process tokenization. Auto - (``< 0``) fits one shard per ``cores_per_worker`` block of this - process's affinity mask (or the online CPU count when the platform - has no affinity API — shards then run unpinned), always at least one; - an explicit count is clamped to that capacity. An environment that - cannot shard — no fast Rust backend, a warmup that fails or exceeds - its budget — raises instead of silently degrading to a slow path - that cannot keep up with completions. + ``n_workers == 0`` selects in-process tokenization; auto (``< 0``) + fits one shard per ``cores_per_worker`` block (at least one); an + explicit count is clamped to capacity. An environment that cannot + shard — no fast Rust backend, a failed or over-budget warmup — + raises instead of degrading to a slow path. 
""" if cores_per_worker <= 0 or n_workers == 0: logger.info("BatchTokenizer: in-process tokenization (explicit)") @@ -283,12 +273,9 @@ def _setup_shards(self, cores_per_worker: int, n_workers: int) -> None: "backend; token metrics require one to keep up with " "completions. Use a fast tokenizer, or disable token metrics." ) - # Probe the full allowed CPU universe (cgroup-clamped) for the shard - # block math, then restore this process's inherited mask: the - # aggregator's event loop, publisher, and live tokenizer threads stay - # exactly where the parent placed them (the loadgen mask on a pinned - # Linux run). Only the drain-phase shard processes, pinned to their - # own blocks, span the whole machine. + # Probe the full allowed CPU universe (cgroup-clamped) for the block + # math, then restore the inherited mask: the aggregator stays where + # the parent placed it; only the drain shards span the machine. try: original = os.sched_getaffinity(0) except (OSError, AttributeError): @@ -325,11 +312,9 @@ def _setup_shards(self, cores_per_worker: int, n_workers: int) -> None: initargs=(self._tokenizer_name, block), ) procs.append(ex) - # Force spawn + pin + tokenizer-load now (not on the first batch). - # Submit to every shard first so the loads run in parallel, then - # await — waiting on each before submitting the next would - # serialize P tokenizer loads and can exceed the launch budget. - # The wait is bounded: one hung load must not wedge startup. + # Warm all shards in parallel (submit-then-await; awaiting each + # before the next would serialize N tokenizer loads). Bounded: + # one hung load must not wedge startup. ready = [ex.submit(_worker_ready, 0) for ex in procs] deadline = time.monotonic() + _SHARD_WARMUP_TIMEOUT_S for f in ready: @@ -507,16 +492,12 @@ async def token_count_message_async( class TokenBatchQueue: """Buffers per-sample tokenization work and clears it in batches. 
- Triggers call ``enqueue_text`` / ``enqueue_message`` at event time with an - ``on_count`` callback that records the resulting metric. The queue owns - its own flush cadence: ``start_live`` begins a periodic flush through the - tokenizer's bounded live lane (so live ISL/OSL/TPOT stay current without - touching the benchmark's cores), and ``flush_remaining`` drains everything - left at end-of-run through every shard. - - ``pending`` counts enqueued-but-not-yet-recorded items; it is the - ``n_pending_tasks`` on the snapshot. A non-zero value in the final snapshot - means the end-of-run flush did not finish within the drain budget or failed. + Triggers call ``enqueue_text`` / ``enqueue_message`` at event time with + an ``on_count`` recorder callback. The queue owns its flush cadence: + ``start_live`` flushes periodically through the tokenizer's bounded live + lane; ``flush_remaining`` drains everything left at end-of-run through + every shard. ``pending`` is the snapshot's ``n_pending_tasks`` — + non-zero in the final snapshot means an incomplete drain. """ def __init__( @@ -577,16 +558,12 @@ def enqueue_message( async def flush(self, live: bool = False) -> None: """Tokenize everything buffered so far and run each ``on_count``. - ``live=True`` routes text batches through the tokenizer's bounded - live lane instead of the full shard pool, takes at most - ``_LIVE_FLUSH_MAX_ITEMS`` per kind (bounding lock-hold time and the - unstoppable in-flight encode a drain-start cancellation leaves - behind), and re-queues items on failure or cancellation so a mid-run - hiccup never loses samples — the end-of-run drain retries them. Drain-mode failures are terminal: the - un-recorded items stay counted in ``pending`` (``_inflight`` is - decremented only after a callback runs) and surface as an incomplete - drain, not as silently dropped samples. Items are detached from the - buffer up front so concurrent enqueues land in the next flush. 
+ ``live=True`` routes text through the tokenizer's bounded live lane, + takes at most ``_LIVE_FLUSH_MAX_ITEMS`` per kind, and re-queues items + on failure or cancellation — a mid-run hiccup never loses samples. + Drain-mode failures are terminal: un-recorded items stay counted in + ``pending`` and surface as an incomplete drain. Items are detached up + front so concurrent enqueues land in the next flush. """ async with self._lock: if not (self._text or self._msg): diff --git a/src/inference_endpoint/commands/benchmark/execute.py b/src/inference_endpoint/commands/benchmark/execute.py index 380a0b14d..a2050bbe3 100644 --- a/src/inference_endpoint/commands/benchmark/execute.py +++ b/src/inference_endpoint/commands/benchmark/execute.py @@ -612,6 +612,12 @@ async def _run_benchmark_async( aggregator_args.extend( ["--drain-timeout", str(config.settings.drain.metrics_drain_timeout_s)] ) + aggregator_args.extend( + [ + "--tokenizer-workers", + str(config.settings.drain.metrics_tokenizer_workers), + ] + ) # EventLoggerService writes events.jsonl to tmpfs (high-frequency writes) event_logger_args: list[str] = [ diff --git a/src/inference_endpoint/config/schema.py b/src/inference_endpoint/config/schema.py index 0a59074f5..f093a547a 100644 --- a/src/inference_endpoint/config/schema.py +++ b/src/inference_endpoint/config/schema.py @@ -591,6 +591,24 @@ class DrainConfig(BaseModel): "after ENDED (default: 60.0; 0 = unlimited)." ), ) + metrics_tokenizer_workers: Annotated[ + int, + cyclopts.Parameter( + alias="--metrics-tokenizer-workers", + help=( + "In-process tokenizer threads for live (mid-run) ISL/OSL/TPOT in " + "the metrics aggregator. 0 defers all tokenization to the " + "end-of-run drain, which always uses the auto-sized sharded pool." + ), + ), + ] = Field( + 2, + ge=0, + description=( + "In-process tokenizer threads for live (mid-run) ISL/OSL/TPOT " + "(default: 2; 0 = defer everything to the end-of-run drain)." 
+ ), + ) @cyclopts.Parameter(name="*") diff --git a/src/inference_endpoint/config/templates/concurrency_template_full.yaml b/src/inference_endpoint/config/templates/concurrency_template_full.yaml index 75feab6fb..30b224402 100644 --- a/src/inference_endpoint/config/templates/concurrency_template_full.yaml +++ b/src/inference_endpoint/config/templates/concurrency_template_full.yaml @@ -80,6 +80,7 @@ settings: performance_timeout_s: 240.0 # Performance drain timeout in seconds (None = wait indefinitely) accuracy_timeout_s: null # Accuracy drain timeout in seconds (None = wait indefinitely) metrics_drain_timeout_s: 60.0 # Wall-clock budget (seconds) to finish tokenizing buffered samples after ENDED (default: 60.0; 0 = unlimited). + metrics_tokenizer_workers: 2 # In-process tokenizer threads for live (mid-run) ISL/OSL/TPOT (default: 2; 0 = defer everything to the end-of-run drain). warmup: enabled: false # Enable warmup phase before performance run n_requests: null # Warmup request count (None = full dataset once) diff --git a/src/inference_endpoint/config/templates/offline_template_full.yaml b/src/inference_endpoint/config/templates/offline_template_full.yaml index 3ff1ccd17..ae0ae939c 100644 --- a/src/inference_endpoint/config/templates/offline_template_full.yaml +++ b/src/inference_endpoint/config/templates/offline_template_full.yaml @@ -80,6 +80,7 @@ settings: performance_timeout_s: 240.0 # Performance drain timeout in seconds (None = wait indefinitely) accuracy_timeout_s: null # Accuracy drain timeout in seconds (None = wait indefinitely) metrics_drain_timeout_s: 60.0 # Wall-clock budget (seconds) to finish tokenizing buffered samples after ENDED (default: 60.0; 0 = unlimited). + metrics_tokenizer_workers: 2 # In-process tokenizer threads for live (mid-run) ISL/OSL/TPOT (default: 2; 0 = defer everything to the end-of-run drain). 
warmup: enabled: false # Enable warmup phase before performance run n_requests: null # Warmup request count (None = full dataset once) diff --git a/src/inference_endpoint/config/templates/online_template_full.yaml b/src/inference_endpoint/config/templates/online_template_full.yaml index 1287b99af..9c5c62842 100644 --- a/src/inference_endpoint/config/templates/online_template_full.yaml +++ b/src/inference_endpoint/config/templates/online_template_full.yaml @@ -80,6 +80,7 @@ settings: performance_timeout_s: 240.0 # Performance drain timeout in seconds (None = wait indefinitely) accuracy_timeout_s: null # Accuracy drain timeout in seconds (None = wait indefinitely) metrics_drain_timeout_s: 60.0 # Wall-clock budget (seconds) to finish tokenizing buffered samples after ENDED (default: 60.0; 0 = unlimited). + metrics_tokenizer_workers: 2 # In-process tokenizer threads for live (mid-run) ISL/OSL/TPOT (default: 2; 0 = defer everything to the end-of-run drain). warmup: enabled: false # Enable warmup phase before performance run n_requests: null # Warmup request count (None = full dataset once) diff --git a/tests/unit/async_utils/services/metrics_aggregator/conftest.py b/tests/unit/async_utils/services/metrics_aggregator/conftest.py index 38e25945c..aae7a07ac 100644 --- a/tests/unit/async_utils/services/metrics_aggregator/conftest.py +++ b/tests/unit/async_utils/services/metrics_aggregator/conftest.py @@ -171,6 +171,7 @@ def make_aggregator( live_flush_interval_s: float | None = None, streaming: bool = True, shutdown_event: asyncio.Event | None = None, + drain_timeout_s: float | None = None, ) -> tuple[MetricsAggregatorService, MetricsRegistry, MagicMock]: """Construct an aggregator wired to a real SUB socket and a mocked publisher. 
@@ -203,5 +204,6 @@ def make_aggregator( live_flush_interval_s=live_flush_interval_s, streaming=streaming, shutdown_event=shutdown_event, + drain_timeout_s=drain_timeout_s, ) return agg, registry, publisher diff --git a/tests/unit/async_utils/services/metrics_aggregator/test_aggregator_error_handler.py b/tests/unit/async_utils/services/metrics_aggregator/test_aggregator_error_handler.py index 4e8222c48..40e6eb91b 100644 --- a/tests/unit/async_utils/services/metrics_aggregator/test_aggregator_error_handler.py +++ b/tests/unit/async_utils/services/metrics_aggregator/test_aggregator_error_handler.py @@ -82,6 +82,7 @@ def _make_aggregator( sig_figs=3, n_histogram_buckets=10, streaming=streaming, + drain_timeout_s=None, ) return agg, registry, publisher diff --git a/tests/unit/async_utils/services/metrics_aggregator/test_token_metrics.py b/tests/unit/async_utils/services/metrics_aggregator/test_token_metrics.py index 8bf838c17..ba0d2e3e9 100644 --- a/tests/unit/async_utils/services/metrics_aggregator/test_token_metrics.py +++ b/tests/unit/async_utils/services/metrics_aggregator/test_token_metrics.py @@ -85,7 +85,7 @@ class TestBatchTokenizer: async def test_count_texts_async(self): with patch(_MOCK_TARGET, _FakeTokenizer): loop = asyncio.get_running_loop() - with BatchTokenizer("fake", n_workers=0) as tok: + with BatchTokenizer("fake", n_workers=0, live_workers=2) as tok: counts = await tok.count_texts_async(["Hello world foo", "a"], loop) assert counts == [3, 1] @@ -93,7 +93,7 @@ async def test_count_texts_async(self): async def test_count_texts_async_empty(self): with patch(_MOCK_TARGET, _FakeTokenizer): loop = asyncio.get_running_loop() - with BatchTokenizer("fake", n_workers=0) as tok: + with BatchTokenizer("fake", n_workers=0, live_workers=2) as tok: assert await tok.count_texts_async([], loop) == [] @pytest.mark.asyncio @@ -101,7 +101,7 @@ async def test_count_texts_async_sharded(self): """With shards present, chunks are reassembled in original order.""" with 
patch(_MOCK_TARGET, _FakeTokenizer): loop = asyncio.get_running_loop() - with BatchTokenizer("fake", n_workers=0) as tok: + with BatchTokenizer("fake", n_workers=0, live_workers=2) as tok: tok._procs = [_FakeProc(), _FakeProc()] counts = await tok.count_texts_async(["a", "b b", "c c c", "d"], loop) assert counts == [1, 2, 3, 1] @@ -111,14 +111,14 @@ async def test_count_texts_async_shard_failure_propagates(self): """A dead shard surfaces as an error, not a silent in-process fallback.""" with patch(_MOCK_TARGET, _FakeTokenizer): loop = asyncio.get_running_loop() - with BatchTokenizer("fake", n_workers=0) as tok: + with BatchTokenizer("fake", n_workers=0, live_workers=2) as tok: tok._procs = [_BrokenProc()] with pytest.raises(BrokenProcessPool): await tok.count_texts_async(["a b"], loop) def test_close_is_idempotent(self): with patch(_MOCK_TARGET, _FakeTokenizer): - tok = BatchTokenizer("fake", n_workers=0) + tok = BatchTokenizer("fake", n_workers=0, live_workers=2) tok.close() tok.close() # must not raise @@ -126,7 +126,7 @@ def test_close_is_idempotent(self): async def test_use_after_close_raises(self): with patch(_MOCK_TARGET, _FakeTokenizer): loop = asyncio.get_running_loop() - tok = BatchTokenizer("fake", n_workers=0) + tok = BatchTokenizer("fake", n_workers=0, live_workers=2) tok.close() with pytest.raises(RuntimeError, match="closed"): await tok.count_texts_async(["hello"], loop) @@ -163,7 +163,7 @@ async def test_token_count_message_subtracts_baseline(self): """token_count_message_async returns full_tokens - baseline.""" with patch(_MOCK_TARGET, _FakeTokenizerWithTemplate): loop = asyncio.get_running_loop() - with BatchTokenizer("fake", n_workers=0) as tok: + with BatchTokenizer("fake", n_workers=0, live_workers=2) as tok: # "hello world" -> 2 content + 2 wrapper = 4; baseline = 0, prefix = 2 count = await tok.token_count_message_async( "hello world", None, None, loop @@ -175,7 +175,7 @@ async def test_token_count_message_includes_tool_calls(self): 
"""Tool-call JSON tokens are included in the count.""" with patch(_MOCK_TARGET, _FakeTokenizerWithTemplate): loop = asyncio.get_running_loop() - with BatchTokenizer("fake", n_workers=0) as tok: + with BatchTokenizer("fake", n_workers=0, live_workers=2) as tok: tool_calls = ( { "id": "c1", @@ -199,7 +199,7 @@ def apply_chat_template(self, *args, **kwargs): with patch(_MOCK_TARGET, _BadTemplateTokenizer): loop = asyncio.get_running_loop() - with BatchTokenizer("fake", n_workers=0) as tok: + with BatchTokenizer("fake", n_workers=0, live_workers=2) as tok: tool_calls = ( { "id": "c1", @@ -300,7 +300,7 @@ def _make(self, monkeypatch, cpus, n_workers, executor=_SpawnlessExecutor): lambda pid, mask: self.restored.append(set(mask)), ) with patch(_MOCK_TARGET, _FakeTokenizerWithBackend): - return BatchTokenizer("fake", n_workers=n_workers) + return BatchTokenizer("fake", n_workers=n_workers, live_workers=2) @pytest.mark.parametrize( "cpus, n_workers, expected_shards", @@ -336,7 +336,7 @@ def test_no_fast_backend_is_a_startup_error(self, monkeypatch): ) with patch(_MOCK_TARGET, _FakeTokenizer): # no backend_tokenizer with pytest.raises(RuntimeError, match="fast"): - BatchTokenizer("fake") + BatchTokenizer("fake", live_workers=2) def test_affinity_unavailable_shards_unpinned(self, monkeypatch): """No affinity API (e.g. 
macOS): shard from the CPU count, unpinned.""" @@ -357,7 +357,7 @@ def _raise(pid): monkeypatch.setattr(token_metrics_module.os, "sched_getaffinity", _raise) monkeypatch.setattr(token_metrics_module.os, "cpu_count", lambda: 16) with patch(_MOCK_TARGET, _FakeTokenizerWithBackend): - with BatchTokenizer("fake") as tok: + with BatchTokenizer("fake", live_workers=2) as tok: assert len(tok._procs) == 2 def test_warmup_failure_is_a_startup_error(self, monkeypatch): diff --git a/tests/unit/commands/test_benchmark.py b/tests/unit/commands/test_benchmark.py index 4109ebfc2..68498c9b4 100644 --- a/tests/unit/commands/test_benchmark.py +++ b/tests/unit/commands/test_benchmark.py @@ -633,12 +633,10 @@ async def _capture_launch(service_configs, *, timeout): @pytest.mark.unit @pytest.mark.asyncio - async def test_tokenizer_forwarded_and_live_args_left_to_service_defaults( - self, tmp_path - ): - """Pins the SUT-intrusion seam: the benchmark forwards --tokenizer but - deliberately no live/worker knobs — the service's own defaults govern - mid-run tokenization.""" + async def test_tokenizer_and_workers_forwarded_from_schema(self, tmp_path): + """The benchmark forwards --tokenizer and --tokenizer-workers; the + workers value comes from the schema default + (drain.metrics_tokenizer_workers), the single source of truth.""" config = OfflineConfig(**_OFFLINE_KWARGS, settings=OfflineSettings()) ctx = self._make_ctx(config, tmp_path) ctx.tokenizer_name = "gpt2" @@ -681,8 +679,9 @@ async def _capture_launch(service_configs, *, timeout): args = aggregator_cfg.args idx = args.index("--tokenizer") assert args[idx + 1] == "gpt2" - assert "--tokenizer-workers" not in args - assert "--live-tokenizers" not in args + idx = args.index("--tokenizer-workers") + expected = str(config.settings.drain.metrics_tokenizer_workers) + assert args[idx + 1] == expected class TestBuildPhases: From 6361768fb33c13121f0b22aa8f482c98bc9a1726 Mon Sep 17 00:00:00 2001 From: Viraat Chandra Date: Wed, 10 Jun 2026 
17:57:33 -0700 Subject: [PATCH 17/20] chore(metrics): restore original comments; keep only default-related edits Co-Authored-By: Claude Opus 4.8 (1M context) --- .../services/metrics_aggregator/__main__.py | 95 +++++++++++---- .../services/metrics_aggregator/aggregator.py | 40 +++++-- .../metrics_aggregator/token_metrics.py | 109 +++++++++++------- 3 files changed, 165 insertions(+), 79 deletions(-) diff --git a/src/inference_endpoint/async_utils/services/metrics_aggregator/__main__.py b/src/inference_endpoint/async_utils/services/metrics_aggregator/__main__.py index 58733c087..70633fb6d 100644 --- a/src/inference_endpoint/async_utils/services/metrics_aggregator/__main__.py +++ b/src/inference_endpoint/async_utils/services/metrics_aggregator/__main__.py @@ -49,18 +49,27 @@ def _make_sigterm_handler( ) -> tuple[Callable[[], None], set[asyncio.Task]]: """Build the SIGTERM handler that writes the INTERRUPTED final snapshot. - Returns ``(handler, pending_tasks)``: asyncio holds tasks only by - weakref, so the handler's finalize task must live in this - strong-reference set until done. Module-level so the GC-safety - contract is unit-testable. + Returns ``(handler, pending_tasks)``. ``pending_tasks`` is the + strong-reference container that keeps spawned finalize tasks alive + while they run: asyncio tracks tasks only by weakref, so a task + whose only reference is the local variable inside the handler can + be garbage-collected mid-execution (per Python's asyncio docs). + Each spawned task self-removes from the set via + ``add_done_callback`` once it completes. + + Exposed at module level (rather than nested in ``main()``) so the + GC-safety contract is unit-testable without driving the whole + subprocess lifecycle. 
""" pending_tasks: set[asyncio.Task] = set() async def _signal_finalize() -> None: try: - # Refresh tracked_duration_ns before publish_final (mirrors the - # ENDED path) — otherwise an interrupted run whose - # STOP_PERFORMANCE_TRACKING never fired reports QPS=N/A. + # Mirror the ENDED-driven path: refresh tracked_duration_ns + # from the table BEFORE publish_final, otherwise an + # interrupted run whose STOP_PERFORMANCE_TRACKING never + # fired would report duration_ns=0 and QPS=N/A in the final + # report even after processing many tracked samples. registry.set_counter( MetricCounterKey.TRACKED_DURATION_NS.value, table.total_tracked_duration_ns, @@ -127,8 +136,10 @@ async def main() -> None: required=True, help=( "Wall-clock budget (seconds) to finish tokenizing buffered samples " - "after ENDED (0 = wait indefinitely). The benchmark forwards " - "--metrics-drain-timeout; the default lives in config/schema.py." + "after ENDED before the aggregator emits the final snapshot with " + "n_pending_tasks > 0 (0 = wait indefinitely; the benchmark forwards " + "the schema default, see config/schema.py). Increase for very large " + "datasets where the end-of-run tokenize batch is big." ), ) parser.add_argument( @@ -155,9 +166,10 @@ async def main() -> None: required=True, help=( "In-process tokenizer threads for live (mid-run) ISL/OSL/TPOT " - "(0 = defer everything to the end-of-run drain, which always uses " - "the auto-sized sharded pool). The benchmark forwards " - "--metrics-tokenizer-workers; the default lives in config/schema.py." + "(0 = no mid-run tokenization, everything defers to the " + "end-of-run drain; the benchmark forwards the schema default, " + "see config/schema.py). The drain always uses the auto-sized " + "sharded pool — one worker process per 8-core block." 
), ) parser.add_argument( @@ -184,8 +196,12 @@ async def main() -> None: if args.tokenizer_workers < 0: raise SystemExit("FATAL: --tokenizer-workers must be >= 0") - # The parent (commands/benchmark/execute.py) owns directory creation; - # fail fast here so a bad launcher errors now, not on the atomic write. + # The parent owns directory setup — `commands/benchmark/execute.py` + # creates `/metrics/` and validates it before launching + # this subprocess. Validate here as a fail-fast contract check so a + # misbehaving launcher (or a manual invocation) surfaces a clear + # error in this subprocess's stderr instead of crashing later on + # the atomic-write path. metrics_output_dir: Path = args.metrics_output_dir if not metrics_output_dir.is_dir(): raise SystemExit( @@ -206,8 +222,9 @@ async def main() -> None: args.tokenizer, live_workers=args.tokenizer_workers ) except RuntimeError as exc: - # An environment that cannot shard is a launch failure, not a - # silent slow path that cannot keep up with completions. + # Fail-fast contract: a tokenizer environment that cannot shard + # must surface as a clear service-launch failure, not a silent + # slow path that cannot keep up with completions. raise SystemExit(f"FATAL: {exc}") from exc else: tokenizer_cm = nullcontext() @@ -245,12 +262,29 @@ async def main() -> None: ) aggregator.start() - # SIGTERM (ServiceLauncher.kill_all) must still produce a final - # snapshot, tagged INTERRUPTED; publish_final is idempotent, so - # racing the ENDED-driven call is safe. SIGINT (^C hits the whole - # process group) is a no-op: finalizing eagerly at signal time - # would freeze the snapshot before the parent's graceful-shutdown - # samples land — the parent's ENDED drives finalize instead. + # SIGTERM only — the parent's ServiceLauncher.kill_all uses + # SIGTERM to kill the aggregator child before an ENDED event + # arrives; without this handler that path leaves the Report + # consumer with no final_snapshot file. 
The signal-triggered + # snapshot is tagged INTERRUPTED so Report can distinguish + # "parent killed the run" from a clean shutdown. + # publish_final is idempotent (see + # MetricsPublisher._finalized), so racing with the + # ENDED-driven call is safe. + # + # SIGINT is deliberately NOT handled in the same way. On an + # interactive ^C, the OS sends SIGINT to the whole + # foreground process group — parent + child both receive + # it. If we finalized eagerly here, the aggregator would + # write final_snapshot.json from whatever state it had at + # signal time, then exit; samples that completed during the + # parent's own graceful shutdown window would never reach + # the file (the parent eventually emits ENDED on its events + # channel, but `_finalized=True` makes that a no-op). The + # parent's clean-shutdown path is what we want to drive the + # aggregator's finalize — so we install a no-op handler for + # SIGINT here, which prevents Python's default + # KeyboardInterrupt and lets the parent control the lifecycle. on_sigterm, _sigterm_tasks = _make_sigterm_handler( loop=loop, registry=registry, @@ -260,6 +294,8 @@ async def main() -> None: shutdown_event=shutdown_event, ) loop.add_signal_handler(signal.SIGTERM, on_sigterm) + # No-op SIGINT handler: silence the default KeyboardInterrupt + # and let the parent's ENDED-driven path drive shutdown. loop.add_signal_handler( signal.SIGINT, lambda: logger.info( @@ -279,13 +315,24 @@ async def main() -> None: if __name__ == "__main__": + # Surface startup / bind / tokenizer-load failures with structured + # context. Without this wrap, the parent's ServiceLauncher only sees + # the non-zero exit code and a raw traceback — no diagnostic context + # to correlate against the parent's logs. The except/raise pattern + # preserves the original exit code (1) and traceback while emitting + # the structured logger.exception line before the interpreter prints + # the trace. 
try: LoopManager().default_loop.run_until_complete(main()) except SystemExit: # argparse / explicit sys.exit — already user-facing, don't dress up. raise except Exception as e: - # Structured log line so the crash is grep-able against the parent's - # logs; KeyboardInterrupt/SystemExit propagate untouched. + # Catch Exception (not BaseException) so KeyboardInterrupt / + # SystemExit propagate untouched — those are control-flow + # signals, not crashes, and labeling them as "crashed" would + # mislead operators. The exception type goes first in the log + # message so it's grep-able without scrolling through the + # traceback. logger.exception("metrics aggregator subprocess crashed (%s)", type(e).__name__) raise diff --git a/src/inference_endpoint/async_utils/services/metrics_aggregator/aggregator.py b/src/inference_endpoint/async_utils/services/metrics_aggregator/aggregator.py index 0a6f2dfbd..64a31ee42 100644 --- a/src/inference_endpoint/async_utils/services/metrics_aggregator/aggregator.py +++ b/src/inference_endpoint/async_utils/services/metrics_aggregator/aggregator.py @@ -122,8 +122,12 @@ def __init__( drain_timeout_s: float | None, **kwargs, ): - # drain_timeout_s has no default here: the one default lives in - # config/schema.py (metrics_drain_timeout_s). None = wait forever. + # drain_timeout_s is injected (not derived) because the right + # value is workload-dependent: long-context tokenize-heavy runs + # need more headroom than the schema default 60 s, and the + # aggregator itself can't measure that ahead of time. Keeping it + # as an arg lets the __main__ CLI flag plumb the user's choice + # through without coupling this class to argparse. 
super().__init__(EventRecordCodec(), *args, **kwargs) self._registry = registry self._publisher = publisher @@ -298,9 +302,15 @@ async def process(self, records: list[EventRecord]) -> None: else: if ev == SessionEventType.STARTED: if self._session_start_ns is not None: - # Producer bug: re-assigning _session_start_ns - # would freeze total_duration_ns (max-of-elapsed - # guard) and corrupt every downstream rate calc. + # A duplicate STARTED is a producer bug: + # re-assigning _session_start_ns would freeze + # total_duration_ns (the max-of-elapsed guard + # never updates once the start moves forward) + # and corrupt every downstream rate calc for + # the rest of the run. Surface loudly and + # ignore — the publisher.start guard already + # rejects the second tick-task spawn, but + # session-state must also be defended here. logger.error( "Duplicate STARTED event received " "(original at ts=%d, duplicate at ts=%d); " @@ -385,13 +395,16 @@ async def process(self, records: list[EventRecord]) -> None: # that fires before publish_final reflects the new state. self._session_state = SessionState.DRAINING logger.info("Draining %d pending tokenizations...", self.pending_tokens) - # Drain + final publish run inside one finalization boundary: a - # tokenizer failure must not skip publish_final and leave - # main()'s `await shutdown_event.wait()` hanging. + # The drain and final publish are wrapped together so the aggregator + # ALWAYS reaches _finalize (which sets the shutdown event); a + # tokenizer failure during the drain must not skip publish_final and + # leave main()'s `await shutdown_event.wait()` hanging. n_pending = self.pending_tokens try: - # flush_remaining never raises; it returns the count it could - # not finish, which becomes the snapshot's n_pending_tasks. 
+ # flush_remaining tokenizes the whole buffer in one batched pass, + # bounded by the drain budget, and never raises: it returns the + # count it could not finish (timeout or failure), which becomes + # the snapshot's n_pending_tasks so Report flags an incomplete drain. if self._token_queue is not None: n_pending = await self._token_queue.flush_remaining( self._drain_timeout_s @@ -417,8 +430,11 @@ async def process(self, records: list[EventRecord]) -> None: ) await self._publisher.publish_final(registry, n_pending_tasks=n_pending) finally: - # aclose is independently wrapped: its failure must not - # prevent _finalize, which sets the shutdown event. + # The aggregator MUST close the publisher and signal shutdown even + # if the drain/publish above failed — otherwise main()'s + # `await shutdown_event.wait()` hangs forever. aclose is + # independently wrapped: its failure must not prevent _finalize, + # which is what sets the shutdown event. try: await self._publisher.aclose() except Exception: # noqa: BLE001 — best-effort cleanup. diff --git a/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py b/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py index d67d82f97..60a75bdb6 100644 --- a/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py +++ b/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py @@ -15,12 +15,15 @@ """Tokenization for ISL/OSL/TPOT metrics. -``BatchTokenizer`` runs two lanes: live mid-run flushes on a small in-process -thread pool (``--tokenizer-workers``), and the end-of-run drain sharded -across worker processes each pinned to a ``CORES_PER_WORKER`` block. -``TokenBatchQueue`` buffers per-sample work and clears it in batches. A -tokenizer without a fast (Rust) backend is a startup error, never a silent -slow path; platforms without CPU affinity (e.g. macOS) shard unpinned. 
+``BatchTokenizer`` tokenizes whole batches at once, sharded across worker +processes each pinned to a block of ``CORES_PER_WORKER`` cores (a single BPE +rayon pool is memory-bound and saturates ~8 cores). The aggregator buffers +per-sample text. The sharded pool is the drain-phase accelerator and is +auto-sized (one shard per core block); live mid-run flushes run on a small +in-process thread pool (``--tokenizer-workers``, default 2) owned by the +queue's live loop. A tokenizer without a fast (Rust) backend is a startup +error, never a silent slow path. Platforms without CPU affinity (e.g. macOS) +shard unpinned at full speed; only cache/NUMA locality is lost. """ from __future__ import annotations @@ -49,14 +52,18 @@ # used. Measured on GB200: ~16k texts/s at 18 blocks vs ~1.5k single-process. CORES_PER_WORKER = 8 -# Warmup budget (spawn + transformers import + tokenizer load per worker). -# A hung load must become a startup error that fires before the parent's -# 30 s service-launch budget kills the subprocess. +# Budget for the parallel shard warmup (spawn + transformers import + +# tokenizer load per worker). A hung load (e.g. a stuck network filesystem) +# must become a bounded startup error, not wedge service startup — and the +# error must fire before the parent's 30 s service-launch budget kills the +# subprocess, so the diagnostic wins the race. _SHARD_WARMUP_TIMEOUT_S = 25.0 -# Per-flush ceiling for the LIVE lane: bounds the lock-hold time and the -# unstoppable in-flight encode a drain-start cancellation leaves behind. -# The drain has no ceiling — it always takes the whole buffer. +# Per-flush ceiling for the LIVE lane. Bounds three things at once: how long +# the queue lock is held mid-run, how much work an unstoppable in-flight +# thread encode can hold after a drain-start cancellation, and how much the +# drain re-encodes for items the cancelled flush gave back. The drain has no +# ceiling — it always takes the whole buffer. 
_LIVE_FLUSH_MAX_ITEMS = 1024 # Minimal user message used to satisfy chat templates that reject assistant-only @@ -197,11 +204,10 @@ def __init__( n_workers: int = -1, ) -> None: self._tokenizer_name = tokenizer_name - # Cap this process's rayon pool so a live (in-process) batched encode - # uses ~live_workers cores, not the whole machine. Must be set before - # the first encode initializes the pool; setdefault lets an - # operator-exported RAYON_NUM_THREADS win. live_workers has no - # default — the one default lives in config/schema.py. + # The live lane runs in-process: cap this process's rayon pool so a + # mid-run batched encode uses ~live_workers cores, not the whole + # machine. Must be set before the first encode initializes the pool; + # setdefault lets an operator-exported RAYON_NUM_THREADS win. os.environ.setdefault("RAYON_NUM_THREADS", str(max(1, live_workers))) self._live_workers = live_workers self._fallback_warned: set[str] = set() @@ -214,9 +220,10 @@ def __init__( max_workers=max(1, live_workers), thread_name_prefix="tok-thread" ) self._load_tokenizer() # also computes the chat-template baseline - # Process shards for the drain. Empty only when in-process mode was - # explicitly requested (n_workers=0 / cores_per_worker<=0, test-only - # seams — production wiring always auto-sizes). + # Process shards for the batched text path. Empty only when + # in-process mode was explicitly requested (n_workers=0 or + # cores_per_worker<=0; ctor overrides used primarily by tests — + # production wiring passes live_workers only and shards auto-size). self._procs: list[ProcessPoolExecutor] = [] self._setup_shards(cores_per_worker, n_workers) @@ -258,11 +265,14 @@ def _load_tokenizer(self) -> None: def _setup_shards(self, cores_per_worker: int, n_workers: int) -> None: """Spawn one pinned single-worker process per core block. 
- ``n_workers == 0`` selects in-process tokenization; auto (``< 0``) - fits one shard per ``cores_per_worker`` block (at least one); an - explicit count is clamped to capacity. An environment that cannot - shard — no fast Rust backend, a failed or over-budget warmup — - raises instead of degrading to a slow path. + ``n_workers == 0`` explicitly selects in-process tokenization. Auto + (``< 0``) fits one shard per ``cores_per_worker`` block of this + process's affinity mask (or the online CPU count when the platform + has no affinity API — shards then run unpinned), always at least one; + an explicit count is clamped to that capacity. An environment that + cannot shard — no fast Rust backend, a warmup that fails or exceeds + its budget — raises instead of silently degrading to a slow path + that cannot keep up with completions. """ if cores_per_worker <= 0 or n_workers == 0: logger.info("BatchTokenizer: in-process tokenization (explicit)") @@ -273,9 +283,12 @@ def _setup_shards(self, cores_per_worker: int, n_workers: int) -> None: "backend; token metrics require one to keep up with " "completions. Use a fast tokenizer, or disable token metrics." ) - # Probe the full allowed CPU universe (cgroup-clamped) for the block - # math, then restore the inherited mask: the aggregator stays where - # the parent placed it; only the drain shards span the machine. + # Probe the full allowed CPU universe (cgroup-clamped) for the shard + # block math, then restore this process's inherited mask: the + # aggregator's event loop, publisher, and live tokenizer threads stay + # exactly where the parent placed them (the loadgen mask on a pinned + # Linux run). Only the drain-phase shard processes, pinned to their + # own blocks, span the whole machine. 
try: original = os.sched_getaffinity(0) except (OSError, AttributeError): @@ -312,9 +325,11 @@ def _setup_shards(self, cores_per_worker: int, n_workers: int) -> None: initargs=(self._tokenizer_name, block), ) procs.append(ex) - # Warm all shards in parallel (submit-then-await; awaiting each - # before the next would serialize N tokenizer loads). Bounded: - # one hung load must not wedge startup. + # Force spawn + pin + tokenizer-load now (not on the first batch). + # Submit to every shard first so the loads run in parallel, then + # await — waiting on each before submitting the next would + # serialize P tokenizer loads and can exceed the launch budget. + # The wait is bounded: one hung load must not wedge startup. ready = [ex.submit(_worker_ready, 0) for ex in procs] deadline = time.monotonic() + _SHARD_WARMUP_TIMEOUT_S for f in ready: @@ -492,12 +507,16 @@ async def token_count_message_async( class TokenBatchQueue: """Buffers per-sample tokenization work and clears it in batches. - Triggers call ``enqueue_text`` / ``enqueue_message`` at event time with - an ``on_count`` recorder callback. The queue owns its flush cadence: - ``start_live`` flushes periodically through the tokenizer's bounded live - lane; ``flush_remaining`` drains everything left at end-of-run through - every shard. ``pending`` is the snapshot's ``n_pending_tasks`` — - non-zero in the final snapshot means an incomplete drain. + Triggers call ``enqueue_text`` / ``enqueue_message`` at event time with an + ``on_count`` callback that records the resulting metric. The queue owns + its own flush cadence: ``start_live`` begins a periodic flush through the + tokenizer's bounded live lane (so live ISL/OSL/TPOT stay current without + touching the benchmark's cores), and ``flush_remaining`` drains everything + left at end-of-run through every shard. + + ``pending`` counts enqueued-but-not-yet-recorded items; it is the + ``n_pending_tasks`` on the snapshot. 
A non-zero value in the final snapshot + means the end-of-run flush did not finish within the drain budget or failed. """ def __init__( @@ -558,12 +577,16 @@ def enqueue_message( async def flush(self, live: bool = False) -> None: """Tokenize everything buffered so far and run each ``on_count``. - ``live=True`` routes text through the tokenizer's bounded live lane, - takes at most ``_LIVE_FLUSH_MAX_ITEMS`` per kind, and re-queues items - on failure or cancellation — a mid-run hiccup never loses samples. - Drain-mode failures are terminal: un-recorded items stay counted in - ``pending`` and surface as an incomplete drain. Items are detached up - front so concurrent enqueues land in the next flush. + ``live=True`` routes text batches through the tokenizer's bounded + live lane instead of the full shard pool, takes at most + ``_LIVE_FLUSH_MAX_ITEMS`` per kind (bounding lock-hold time and the + unstoppable in-flight encode a drain-start cancellation leaves + behind), and re-queues items on failure or cancellation so a mid-run + hiccup never loses samples — the end-of-run drain retries them. Drain-mode failures are terminal: the + un-recorded items stay counted in ``pending`` (``_inflight`` is + decremented only after a callback runs) and surface as an incomplete + drain, not as silently dropped samples. Items are detached from the + buffer up front so concurrent enqueues land in the next flush. """ async with self._lock: if not (self._text or self._msg): From 8321d835183ff80ecbcece8a963aa768ff3bb09a Mon Sep 17 00:00:00 2001 From: Viraat Chandra Date: Wed, 10 Jun 2026 20:42:28 -0700 Subject: [PATCH 18/20] fix(metrics): raise metrics drain-timeout default to 300s A 1M-sample run holds ~2M deferred tokenizations at ENDED; the drain fans the whole buffer into one encode_batch per shard, so a 60s budget expires before any chunk returns and the entire backlog is dropped. 300s covers 1M-sample runs with headroom. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- AGENTS.md | 2 +- .../async_utils/services/metrics_aggregator/aggregator.py | 2 +- .../async_utils/services/metrics_aggregator/snapshot.py | 2 +- src/inference_endpoint/config/schema.py | 4 ++-- .../config/templates/concurrency_template_full.yaml | 2 +- .../config/templates/offline_template_full.yaml | 2 +- .../config/templates/online_template_full.yaml | 2 +- tests/unit/commands/test_benchmark.py | 2 +- 8 files changed, 9 insertions(+), 9 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index c9e0c7f41..56bd7b1d1 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -115,7 +115,7 @@ The aggregator is a separate process (`python -m inference_endpoint.async_utils. - **Series storage**: each `SeriesSampler` keeps three parallel views: O(1) cheap rollups (count/total/min/max/sum_sq, exact), an HDR Histogram (cheap live percentiles), and an in-memory `array.array` of raw values (for exact percentiles in the `COMPLETE` snapshot). Hot path is `registry.record(name, value)` — no allocation, no I/O. - **Counter API**: `registry.increment(name, delta=1)` for sample-event counters. `registry.set_counter(name, value)` only for the two duration counters (`total_duration_ns` max-of-elapsed, `tracked_duration_ns` sum-of-blocks). -- **Lifecycle**: `INITIALIZE` (constructed, awaiting first `STARTED`) → `LIVE` (run in progress, ticking every `--publish-interval` seconds) → `DRAINING` (set on `ENDED`; tick continues; bounded by the `--drain-timeout` budget — schema default 60 s) → terminal: `COMPLETE` (clean end via `publish_final`, exact stats) **or** `INTERRUPTED` (signal-handler-triggered final via SIGTERM/SIGINT; best-effort partial stats). Drain timeout detected by consumers as `state == COMPLETE and n_pending_tasks > 0`; interrupted runs are detected as `state == INTERRUPTED` directly. 
+- **Lifecycle**: `INITIALIZE` (constructed, awaiting first `STARTED`) → `LIVE` (run in progress, ticking every `--publish-interval` seconds) → `DRAINING` (set on `ENDED`; tick continues; bounded by the `--drain-timeout` budget — schema default 300 s) → terminal: `COMPLETE` (clean end via `publish_final`, exact stats) **or** `INTERRUPTED` (signal-handler-triggered final via SIGTERM/SIGINT; best-effort partial stats). Drain timeout detected by consumers as `state == COMPLETE and n_pending_tasks > 0`; interrupted runs are detected as `state == INTERRUPTED` directly. - **Final delivery is dual-path with separated concerns**: `publish_final` atomically writes `final_snapshot.json` (`tmp + fsync(file) + rename + fsync(parent_dir)`) — this is the **primary** Report source — AND emits the terminal-state snapshot over pub/sub as a TUI shutdown signal. Each path is wrapped in its own try/except so one failure cannot suppress the other. Main process consumer reads `final_snapshot.json` (via `json.loads` to dict, no Struct decode); falls back to the subscriber's `latest` live snapshot only if the file is missing (e.g. SIGKILL / OOM before the signal handler ran). The dict form is the canonical consumer contract (see `snapshot_to_dict`). - **Histogram bucket edges are dynamic per snapshot**: log-spaced over the observed `[min, max]`. Bucket count is fixed at construction; consumers MUST re-render from the snapshot's `(lo, hi, count)` triples each frame and MUST NOT track bucket-by-index across snapshots. 
diff --git a/src/inference_endpoint/async_utils/services/metrics_aggregator/aggregator.py b/src/inference_endpoint/async_utils/services/metrics_aggregator/aggregator.py index 64a31ee42..e7e8daf20 100644 --- a/src/inference_endpoint/async_utils/services/metrics_aggregator/aggregator.py +++ b/src/inference_endpoint/async_utils/services/metrics_aggregator/aggregator.py @@ -124,7 +124,7 @@ def __init__( ): # drain_timeout_s is injected (not derived) because the right # value is workload-dependent: long-context tokenize-heavy runs - # need more headroom than the schema default 60 s, and the + # need more headroom than the schema default 300 s, and the # aggregator itself can't measure that ahead of time. Keeping it # as an arg lets the __main__ CLI flag plumb the user's choice # through without coupling this class to argparse. diff --git a/src/inference_endpoint/async_utils/services/metrics_aggregator/snapshot.py b/src/inference_endpoint/async_utils/services/metrics_aggregator/snapshot.py index a1e461c43..8046e1704 100644 --- a/src/inference_endpoint/async_utils/services/metrics_aggregator/snapshot.py +++ b/src/inference_endpoint/async_utils/services/metrics_aggregator/snapshot.py @@ -45,7 +45,7 @@ class SessionState(str, Enum): LIVE → run in progress; tick task publishing live HDR-derived stats. DRAINING → ``SessionEventType.ENDED`` has been received; the aggregator is tokenizing the buffered samples (bounded by the - ``--drain-timeout`` budget — schema default 60 s). Tick task + ``--drain-timeout`` budget — schema default 300 s). Tick task continues at this stage, still HDR-derived; no new events will arrive. COMPLETE → terminal clean state. 
The ``publish_final()`` snapshot diff --git a/src/inference_endpoint/config/schema.py b/src/inference_endpoint/config/schema.py index f093a547a..19447dd2b 100644 --- a/src/inference_endpoint/config/schema.py +++ b/src/inference_endpoint/config/schema.py @@ -584,11 +584,11 @@ class DrainConfig(BaseModel): ), ), ] = Field( - 60.0, + 300.0, ge=0, description=( "Wall-clock budget (seconds) to finish tokenizing buffered samples " - "after ENDED (default: 60.0; 0 = unlimited)." + "after ENDED (default: 300.0; 0 = unlimited)." ), ) metrics_tokenizer_workers: Annotated[ diff --git a/src/inference_endpoint/config/templates/concurrency_template_full.yaml b/src/inference_endpoint/config/templates/concurrency_template_full.yaml index 30b224402..42c449d1d 100644 --- a/src/inference_endpoint/config/templates/concurrency_template_full.yaml +++ b/src/inference_endpoint/config/templates/concurrency_template_full.yaml @@ -79,7 +79,7 @@ settings: warmup_timeout_s: 240.0 # Warmup drain timeout in seconds (None = wait indefinitely) performance_timeout_s: 240.0 # Performance drain timeout in seconds (None = wait indefinitely) accuracy_timeout_s: null # Accuracy drain timeout in seconds (None = wait indefinitely) - metrics_drain_timeout_s: 60.0 # Wall-clock budget (seconds) to finish tokenizing buffered samples after ENDED (default: 60.0; 0 = unlimited). + metrics_drain_timeout_s: 300.0 # Wall-clock budget (seconds) to finish tokenizing buffered samples after ENDED (default: 300.0; 0 = unlimited). metrics_tokenizer_workers: 2 # In-process tokenizer threads for live (mid-run) ISL/OSL/TPOT (default: 2; 0 = defer everything to the end-of-run drain). 
warmup: enabled: false # Enable warmup phase before performance run diff --git a/src/inference_endpoint/config/templates/offline_template_full.yaml b/src/inference_endpoint/config/templates/offline_template_full.yaml index ae0ae939c..b5f4f5a23 100644 --- a/src/inference_endpoint/config/templates/offline_template_full.yaml +++ b/src/inference_endpoint/config/templates/offline_template_full.yaml @@ -79,7 +79,7 @@ settings: warmup_timeout_s: 240.0 # Warmup drain timeout in seconds (None = wait indefinitely) performance_timeout_s: 240.0 # Performance drain timeout in seconds (None = wait indefinitely) accuracy_timeout_s: null # Accuracy drain timeout in seconds (None = wait indefinitely) - metrics_drain_timeout_s: 60.0 # Wall-clock budget (seconds) to finish tokenizing buffered samples after ENDED (default: 60.0; 0 = unlimited). + metrics_drain_timeout_s: 300.0 # Wall-clock budget (seconds) to finish tokenizing buffered samples after ENDED (default: 300.0; 0 = unlimited). metrics_tokenizer_workers: 2 # In-process tokenizer threads for live (mid-run) ISL/OSL/TPOT (default: 2; 0 = defer everything to the end-of-run drain). warmup: enabled: false # Enable warmup phase before performance run diff --git a/src/inference_endpoint/config/templates/online_template_full.yaml b/src/inference_endpoint/config/templates/online_template_full.yaml index 9c5c62842..4271ff792 100644 --- a/src/inference_endpoint/config/templates/online_template_full.yaml +++ b/src/inference_endpoint/config/templates/online_template_full.yaml @@ -79,7 +79,7 @@ settings: warmup_timeout_s: 240.0 # Warmup drain timeout in seconds (None = wait indefinitely) performance_timeout_s: 240.0 # Performance drain timeout in seconds (None = wait indefinitely) accuracy_timeout_s: null # Accuracy drain timeout in seconds (None = wait indefinitely) - metrics_drain_timeout_s: 60.0 # Wall-clock budget (seconds) to finish tokenizing buffered samples after ENDED (default: 60.0; 0 = unlimited). 
+ metrics_drain_timeout_s: 300.0 # Wall-clock budget (seconds) to finish tokenizing buffered samples after ENDED (default: 300.0; 0 = unlimited). metrics_tokenizer_workers: 2 # In-process tokenizer threads for live (mid-run) ISL/OSL/TPOT (default: 2; 0 = defer everything to the end-of-run drain). warmup: enabled: false # Enable warmup phase before performance run diff --git a/tests/unit/commands/test_benchmark.py b/tests/unit/commands/test_benchmark.py index 68498c9b4..e47def8f0 100644 --- a/tests/unit/commands/test_benchmark.py +++ b/tests/unit/commands/test_benchmark.py @@ -489,7 +489,7 @@ def test_defaults(self): assert cfg.warmup_timeout_s == 240.0 assert cfg.performance_timeout_s == 240.0 assert cfg.accuracy_timeout_s is None - assert cfg.metrics_drain_timeout_s == 60.0 + assert cfg.metrics_drain_timeout_s == 300.0 @pytest.mark.unit @pytest.mark.parametrize( From f1ac948956b84d3d074f0d02bbe9ba255f5260bc Mon Sep 17 00:00:00 2001 From: Viraat Chandra Date: Mon, 15 Jun 2026 21:41:59 -0700 Subject: [PATCH 19/20] test(metrics): pass now-required aggregator args in signal-handling test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The aggregator entrypoint now requires --drain-timeout and --tokenizer-workers (single-sourced from the schema). The signal-handling integration test spawns the subprocess directly and still omitted them, so argparse exited the process (code 2) before any signal handler was installed. Pass both: --tokenizer-workers 0 (no tokenizer configured, so no live tokenization) and a small --drain-timeout (never reached — the run is signalled, not ENDED). 
Co-Authored-By: Claude Opus 4.8 (1M context) --- .../services/metrics_aggregator/test_signal_handling.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tests/integration/async_utils/services/metrics_aggregator/test_signal_handling.py b/tests/integration/async_utils/services/metrics_aggregator/test_signal_handling.py index 010536c09..62db80b04 100644 --- a/tests/integration/async_utils/services/metrics_aggregator/test_signal_handling.py +++ b/tests/integration/async_utils/services/metrics_aggregator/test_signal_handling.py @@ -64,6 +64,13 @@ def _spawn_aggregator( metrics_socket, "--metrics-output-dir", str(output_dir), + # Required by the entrypoint, but inert here: no tokenizer is + # configured (so no live tokenization) and the run is signalled + # rather than ENDED, so the drain budget is never reached. + "--drain-timeout", + "5", + "--tokenizer-workers", + "0", ], # New process group so we can signal it without disturbing the # test runner. From 70b39d9a547a6339b776d433c0297b34c2a75ff0 Mon Sep 17 00:00:00 2001 From: Viraat Chandra Date: Mon, 15 Jun 2026 22:23:01 -0700 Subject: [PATCH 20/20] chore(deps): bump aiohttp 3.14.0 -> 3.14.1 (fixes 8 CVEs) pip-audit flagged aiohttp 3.14.0 for CVE-2026-54273..54280 (8 advisories), all fixed in 3.14.1. aiohttp is a test-only dependency (mock-server fixture); production uses the custom httptools client. uv.lock regenerated to match; uv run pip-audit now reports no known vulnerabilities. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- pyproject.toml | 2 +- uv.lock | 76 +++++++++++++++++++++++++------------------------- 2 files changed, 39 insertions(+), 39 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 0b0f67a86..4a7655021 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -112,7 +112,7 @@ test = [ "Pympler==1.1", "scipy==1.17.1", # HTTP server and client for mock server fixture - "aiohttp==3.14.0", + "aiohttp==3.14.1", # Plotting for benchmark sweep mode "matplotlib==3.10.8", # Property-based testing (CLI fuzz) diff --git a/uv.lock b/uv.lock index bfdb3b236..984581b6b 100644 --- a/uv.lock +++ b/uv.lock @@ -29,7 +29,7 @@ wheels = [ [[package]] name = "aiohttp" -version = "3.14.0" +version = "3.14.1" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "aiohappyeyeballs", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'x86_64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, @@ -41,42 +41,42 @@ dependencies = [ { name = "typing-extensions", marker = "(python_full_version < '3.13' and platform_machine == 'arm64' and sys_platform == 'darwin') or (python_full_version < '3.13' and platform_machine == 'x86_64' and sys_platform == 'darwin') or (python_full_version < '3.13' and platform_machine == 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.13' and platform_machine == 'x86_64' and sys_platform == 'linux')" }, { name = "yarl", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'x86_64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/ee/ab/93ce242f899b68c51b0578c027aafa791ab3614cb9345fa5d37b5f5c8e3e/aiohttp-3.14.0.tar.gz", hash = 
"sha256:2882de819734c715fd1b9c11c97e09fa020d14438203d1d354d8ed1702791c9b", size = 7940674, upload-time = "2026-06-01T19:41:02.763Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/89/97/2b6889bfb6b6847520d50d95eb8c4307a45e28aaca39faf4a9454b3d1b2f/aiohttp-3.14.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:b29518c9c2ec7e373e68259206a137c7f4f5439c58baaec4b5ab3ab799850a4e", size = 750194, upload-time = "2026-06-01T19:37:48.164Z" }, - { url = "https://files.pythonhosted.org/packages/21/e2/62634b7fff918ed98c3c6b2f0e70d520f7f28846cb412d451b04354c6459/aiohttp-3.14.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:dbec68ce61b64cb73cab4d33df9433427b1713c8bcccb181dce695c1b6f8e87c", size = 506966, upload-time = "2026-06-01T19:37:50.014Z" }, - { url = "https://files.pythonhosted.org/packages/dd/fb/5ce075150828c797a5106f1c2fb26034e709d4289b9d2bf8b07f1e59fac6/aiohttp-3.14.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:3cdf534aa455593e589302990c5097aa5c92c06c4262a20da22934f9186a5fff", size = 507527, upload-time = "2026-06-01T19:37:51.96Z" }, - { url = "https://files.pythonhosted.org/packages/01/d5/405a0ae4e6b081754a3609c1c97c63a950e000a2def16046f1e736933a0e/aiohttp-3.14.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:cb6c657104393b5fbff01a5f59b2023db74058a8077d94475d6c25d03882a108", size = 1762420, upload-time = "2026-06-01T19:37:53.839Z" }, - { url = "https://files.pythonhosted.org/packages/19/d8/51de5c6b971c27bb1ef620293b8d1ca611ec78736b34b3f6ccf68e4c8785/aiohttp-3.14.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:78d6f9286a629ce52728430afe18f8ed2b6c39a1fddb3802d7244b9983910ad2", size = 1783112, upload-time = "2026-06-01T19:38:02.641Z" }, - { url = "https://files.pythonhosted.org/packages/bc/05/750a3265ca4dc54a460bd0cb1121a8f2ce9171fce4a135fb47ea7fd594d2/aiohttp-3.14.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = 
"sha256:4d6a998191f5ebe3b8c28463ff72bc030250008b3193c402464efadd08b5ca02", size = 1723119, upload-time = "2026-06-01T19:38:06.713Z" }, - { url = "https://files.pythonhosted.org/packages/a8/fb/05d9214c975f23225a8cd5c439325e338c7c377b315480ef3871db51f54e/aiohttp-3.14.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:5ba10966d4f03dd96a14365be4b8e37c327c76f11c3ca867116966cdd9f98066", size = 1760193, upload-time = "2026-06-01T19:38:17.624Z" }, - { url = "https://files.pythonhosted.org/packages/11/41/cc2d2cfbfbdc3126ba258f3cd27d1ac8a33492ae3c35a4583ee21f0ba7f1/aiohttp-3.14.0-cp313-cp313-ios_13_0_arm64_iphoneos.whl", hash = "sha256:3366751d68d237c621264233a32f3078bbc21b7904ab90a77e03d21390c742c6", size = 481670, upload-time = "2026-06-01T19:38:29.836Z" }, - { url = "https://files.pythonhosted.org/packages/3c/07/381f4023c3b08cb616e520f566d8c58957abad54e56441d41fe67cfb0195/aiohttp-3.14.0-cp313-cp313-ios_13_0_arm64_iphonesimulator.whl", hash = "sha256:57ea07d28695a7a40304d42251892a8df765e5588c10ee32afeddcd5df33c0a2", size = 487591, upload-time = "2026-06-01T19:38:31.704Z" }, - { url = "https://files.pythonhosted.org/packages/fb/4d/4506fdb7a022bdf70011a3bbb4ca00c5c570026ef6a3c5bd7bc70c39089c/aiohttp-3.14.0-cp313-cp313-ios_13_0_x86_64_iphonesimulator.whl", hash = "sha256:076cb014191ae2e65d949e1ad01f1dcfe33e32789b5172510f3e79c79fc04d50", size = 496503, upload-time = "2026-06-01T19:38:33.6Z" }, - { url = "https://files.pythonhosted.org/packages/ef/7d/c814111e04894a45d9e2defc94443879a6f118d9633d5fedfe6e2e8af5f0/aiohttp-3.14.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:2f3fc37054564dee64a855b5b092d87ec35dcddfaabf7dacb1c8a2b1f83dc0a9", size = 745870, upload-time = "2026-06-01T19:38:36.013Z" }, - { url = "https://files.pythonhosted.org/packages/c6/ee/80eee0efddfe187e7cd05027086b7ce1c0e492e82a4eda58f5c5543a44a0/aiohttp-3.14.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:8fcaef74d2ab0f607d7ff85a0d15e21bb5a258c4a58df1908396eb50d7f4ed3c", size = 505588, 
upload-time = "2026-06-01T19:38:38.282Z" }, - { url = "https://files.pythonhosted.org/packages/d6/f8/0f28f04eef75d52fc9c715dde7ce9c0abb810fd20cfeb0fea7afd2ab1e98/aiohttp-3.14.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:e4c01b0bfc6209590960e68eac083cd22d5d87c21f974dd6208cafa5d3542bc8", size = 504492, upload-time = "2026-06-01T19:38:40.611Z" }, - { url = "https://files.pythonhosted.org/packages/ff/db/44c755232085545065c94378dfce38641b1aee647f4939fcd32f5b32e719/aiohttp-3.14.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f12eb7896e81caf403a2b18c9406426f1207361e7239c057ab29c076d4257e83", size = 1752111, upload-time = "2026-06-01T19:38:42.682Z" }, - { url = "https://files.pythonhosted.org/packages/c5/a3/3800dbd095cb2bb165a7ea5d94d790914677e27f45638c7d80e3f34c8945/aiohttp-3.14.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:26d9224c6dd7f5c749aba4f61315a894601448b28d94d12f4dea0903e26d2096", size = 1777241, upload-time = "2026-06-01T19:38:52.04Z" }, - { url = "https://files.pythonhosted.org/packages/b4/3d/dc94df99ed1511fdf28314f722643ed334112643cab00223577085e788c4/aiohttp-3.14.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:23e8314e7aed8576fbe33314d218bd81447a3adbc91dc36f1163bf583cd3084c", size = 1714864, upload-time = "2026-06-01T19:38:56.788Z" }, - { url = "https://files.pythonhosted.org/packages/fa/10/ab28818262f4d26bdb47ed5f1fc7999b69e2fc6e0370b02d0f49011f45ea/aiohttp-3.14.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:666c7c5036df57b693026398b69b41874a1931ac5b3485fd910e57bfac253869", size = 1754516, upload-time = "2026-06-01T19:39:08.788Z" }, - { url = "https://files.pythonhosted.org/packages/1a/fe/6edbf5d39bf29322b6816365b17ed8ede4dace164a3aea1abcd30110eb78/aiohttp-3.14.0-cp314-cp314-ios_13_0_arm64_iphoneos.whl", hash = "sha256:70ea956f6cc4a37620966b56c2e205d88ca3e6d85ec063277e414b1035cddad3", size = 483329, upload-time = 
"2026-06-01T19:39:22.607Z" }, - { url = "https://files.pythonhosted.org/packages/1b/5a/fae531bdbc6456fb6241f46b7b81e4d8a0dd3fc09118a0055dc7141ac1ec/aiohttp-3.14.0-cp314-cp314-ios_13_0_arm64_iphonesimulator.whl", hash = "sha256:ea3b9806c89f61da22fddf1f12dd524fb368e5e28f1261fbdafe5c3cd8ce893b", size = 489502, upload-time = "2026-06-01T19:39:24.881Z" }, - { url = "https://files.pythonhosted.org/packages/36/f4/48a7b0414db7fed77a03d5dde34508c026afd83510ab6bca08c313855776/aiohttp-3.14.0-cp314-cp314-ios_13_0_x86_64_iphonesimulator.whl", hash = "sha256:a071be341c2bd9b0188e62d173509f024e0a35b1c342c53c50f8daaeda8c3bd8", size = 497357, upload-time = "2026-06-01T19:39:27.197Z" }, - { url = "https://files.pythonhosted.org/packages/75/75/e85a13a370acc007fca5feb1fd1b88ac2d8426e6dadd625479b7cadd55a3/aiohttp-3.14.0-cp314-cp314-macosx_10_15_universal2.whl", hash = "sha256:198cfe61bf253b19da1fb3e0fa122249dc4f14c12709493fed8054aa0411cc76", size = 750898, upload-time = "2026-06-01T19:39:29.563Z" }, - { url = "https://files.pythonhosted.org/packages/9e/e4/3d637f800c724eff0e2bed64df72557444482366fd0a35b0cec0e6968f6c/aiohttp-3.14.0-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:9dc203d6ce6b9106d54e2a93f41dfdfebfbca2d99962ba503bfd3e5921a6549e", size = 506986, upload-time = "2026-06-01T19:39:31.872Z" }, - { url = "https://files.pythonhosted.org/packages/1d/df/35161f3598bf7501d2b2a805b41ab4f45a2e34150c421bcb4ef8c0d281a7/aiohttp-3.14.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:9e19d17ab02bf16832a2c8c0d55a486792c5b1645665652ee9531aebcc30cb72", size = 508033, upload-time = "2026-06-01T19:39:34.137Z" }, - { url = "https://files.pythonhosted.org/packages/e5/39/b36e5d3d31e850fb4691dd3e941684ac490a2559249f6fa634b6b0fdf020/aiohttp-3.14.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d925fba0c14d5b498a8028b0107beebdfd16c5d48d702ff54f879cb017aaaca3", size = 1746213, upload-time = "2026-06-01T19:39:36.654Z" }, - { url = 
"https://files.pythonhosted.org/packages/3a/05/27df32c844b2156e1675a8d8ec22d963e3c8ba469ed7ceb1863320c7b521/aiohttp-3.14.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ff82be7f1ef73634cb77890a770743239bc3d487b848669be1c599889336dc0a", size = 1751659, upload-time = "2026-06-01T19:39:46.398Z" }, - { url = "https://files.pythonhosted.org/packages/66/e3/53c67097e8a5ce98625e91e3fa7f43c9c6940de680345d03b3509a72a078/aiohttp-3.14.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:edc01ea4e1ec5a1649a28866262bf24195889ff7b27bdd947029a6086741de9b", size = 1710090, upload-time = "2026-06-01T19:39:51.392Z" }, - { url = "https://files.pythonhosted.org/packages/b8/69/155c4ef3aec96417d47024800472b33b16c5d8a665371dcd044c2afdf25d/aiohttp-3.14.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:26b6d79aa54cb4ed50cc7d41ed14e99e0f1fc8e7c2d42f2e05b37aea897b2b52", size = 1733716, upload-time = "2026-06-01T19:40:03.631Z" }, - { url = "https://files.pythonhosted.org/packages/12/34/6180103ce9aabc8ebff3f7bb55a1228ffe60f61042823031d9692cb7b101/aiohttp-3.14.0-cp314-cp314t-macosx_10_15_universal2.whl", hash = "sha256:6aa1a40f9cbb3da9f80714c5966b8946c21e6a2530d809b9498b33161e3c8733", size = 787878, upload-time = "2026-06-01T19:40:13.401Z" }, - { url = "https://files.pythonhosted.org/packages/92/e9/08954a40e8b7baa3d8beadd2b074b186e9b1e9c8ddabc288678a6265de50/aiohttp-3.14.0-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:b62af5a8cc96a194eaa01a9ed7b34a3ffa58d3d8daaa1a0d7a749353ad12d228", size = 524400, upload-time = "2026-06-01T19:40:15.972Z" }, - { url = "https://files.pythonhosted.org/packages/08/6a/b5965a634ac4d5ba99a463314cf4ab214ca073fcdc38a15e0294273701fc/aiohttp-3.14.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:6eb63b1417efaf7d1002a6ad034a40d44376afcc16508a57f8e74b49ad26a095", size = 527904, upload-time = "2026-06-01T19:40:18.28Z" }, - { url = 
"https://files.pythonhosted.org/packages/06/b4/932bcdd850c354d9bcca30f360e475d7852e30413fbbd44b182782ed5432/aiohttp-3.14.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c20b9ad156a79eb97be5cf9e069eec01d2f0dc8472ffbd75299a8b2d4c2cbbde", size = 1912162, upload-time = "2026-06-01T19:40:20.825Z" }, - { url = "https://files.pythonhosted.org/packages/d0/1c/a57de71a4508c93a830b77c28af3d08cd97f606dedfc6b94275347744508/aiohttp-3.14.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:145262119b07d7f95abc1839add35ba2bfc84551d4b4660ca11542c0b215455b", size = 1868606, upload-time = "2026-06-01T19:40:31.843Z" }, - { url = "https://files.pythonhosted.org/packages/35/1e/c237923232c7da7f0392ea25d89fc5e60c0e93f685f4ebca8e7bcdd5271c/aiohttp-3.14.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:2cc736a9c9fc2bc4dd71fd404815741b6573df27c3f985948ec4076989ac57de", size = 1834090, upload-time = "2026-06-01T19:40:37.733Z" }, - { url = "https://files.pythonhosted.org/packages/cc/bc/2aaab2f85cadb26ea59c091fa2b8e370d625154b5c14b478f1b489d07551/aiohttp-3.14.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:6199707cc40e0e9cd39c36fbc97bec416c704e1d0ddce03412bb3b3e6a90ccd0", size = 1832281, upload-time = "2026-06-01T19:40:52.303Z" }, +sdist = { url = "https://files.pythonhosted.org/packages/82/78/8ea7308cac6934de8c74a14f3d5f65d1c89287426688be79538d0e5c013d/aiohttp-3.14.1.tar.gz", hash = "sha256:307f2cff90a764d329e77040603fa032db89c5c24fdad50c4c15334cba744035", size = 7955794, upload-time = "2026-06-07T21:09:35.529Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/1d/21/151624b51cd92553d95424daf4bf19f19ce9be9002d19253e7e7ce67197b/aiohttp-3.14.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:d35143e27778b4bb0fb189562d7f275bff79c62ab8e98459717c0ea617ff2480", size = 757402, upload-time = "2026-06-07T21:06:40.311Z" }, + { url = 
"https://files.pythonhosted.org/packages/c2/82/280619e0bd7bf2454987e19282616e84762255dd9c8468f62382e8c191f1/aiohttp-3.14.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:bcfb80a2cc36fba2534e5e5b5264dc7ae6fcd9bf15256da3e53d2f499e6fa29d", size = 512310, upload-time = "2026-06-07T21:06:42.207Z" }, + { url = "https://files.pythonhosted.org/packages/55/b2/2aac325583aaa1353045f96dffa586d8a34e8322e14a7ba49cffeb103ab4/aiohttp-3.14.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:27fd7c91e51729b4f7e1577865fa6d34c9adccbc39aabe9000285b48af9f0ec2", size = 512448, upload-time = "2026-06-07T21:06:43.813Z" }, + { url = "https://files.pythonhosted.org/packages/8a/72/a60607cb849faa8af8a356c9329ea2eb6f395d49e82cc82ccba1fd8deb8f/aiohttp-3.14.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:64c567bf9eaf664280116a8688f63016e6b32db2505908e2bdaca1b6438142f2", size = 1766854, upload-time = "2026-06-07T21:06:45.391Z" }, + { url = "https://files.pythonhosted.org/packages/20/9c/d445818389df371f56d141d881153ba23183c4735a03f7356ffb43f7757d/aiohttp-3.14.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3e6fc1a85fa7194a1a7d19f44e8609180f4a8eb5fa4c7ed8b4355f080fad235c", size = 1790278, upload-time = "2026-06-07T21:06:54.049Z" }, + { url = "https://files.pythonhosted.org/packages/dc/b4/4dac0038960427ba832f6609dfb4ea5437d7fd80c72001b9e48f834f428b/aiohttp-3.14.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:c6fa4dc7ad6f8109c70bb1499e589f76b0b792baf39f9b017eb92c8a81d0a199", size = 1728397, upload-time = "2026-06-07T21:06:57.777Z" }, + { url = "https://files.pythonhosted.org/packages/70/0a/e0075ce9ca0279ee1d4f0c0b85f54fea02ebc83c3007651a72bece658fec/aiohttp-3.14.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:6f71173be42d3241d428f760122febb748de0623f44308a6f120d0dd9ec572e3", size = 1767580, upload-time = "2026-06-07T21:07:07.873Z" }, + { url = 
"https://files.pythonhosted.org/packages/fe/22/a73ccbf9dbd6e26dda0b24d5fd5db7da92ee3383a79f47677ffb834c5c5b/aiohttp-3.14.1-cp313-cp313-ios_13_0_arm64_iphoneos.whl", hash = "sha256:915fbb7b41b115192259f8c9ae58f3ddc444d2b5579917270211858e606a4afd", size = 485841, upload-time = "2026-06-07T21:07:19.555Z" }, + { url = "https://files.pythonhosted.org/packages/3b/b9/57ed8eaf596321c2ad747bd480fb1700dbd7177c60dfc9e4c187f629662e/aiohttp-3.14.1-cp313-cp313-ios_13_0_arm64_iphonesimulator.whl", hash = "sha256:7fb4bdf95b0561a79f259f9d28fbc109728c5ee7f27aff6391f0ca703a329abe", size = 492088, upload-time = "2026-06-07T21:07:21.581Z" }, + { url = "https://files.pythonhosted.org/packages/78/c0/5ebe5270a7c140d7c6f79dcb018640225f14d406c149e4eec04a7d82fe71/aiohttp-3.14.1-cp313-cp313-ios_13_0_x86_64_iphonesimulator.whl", hash = "sha256:1b9748363260121d2927704f5d4fc498150669ca3ae93625986ee89c8f80dcd4", size = 501564, upload-time = "2026-06-07T21:07:23.388Z" }, + { url = "https://files.pythonhosted.org/packages/75/7f/8cdaa24fc7983865e0915153b96a9ac5bcdd3548d64c5a27d17cecccad2d/aiohttp-3.14.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:86a6dab78b0e43e2897a3bbe15745aa60dc5423ca437b7b0b164c069bf91b876", size = 751998, upload-time = "2026-06-07T21:07:25.046Z" }, + { url = "https://files.pythonhosted.org/packages/b2/f4/c4227aacfacc5cb0cc2d119b65301d177912a6842cd64e120c47af76064f/aiohttp-3.14.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:4dfd6e47d3c44c2279907607f73a4240b88c69eb8b90da7e2441a8045dfd21da", size = 510918, upload-time = "2026-06-07T21:07:27.28Z" }, + { url = "https://files.pythonhosted.org/packages/ab/01/a2d5f96cd4e74424864d30bc0a7e44d0a12dacdcfa91b5b2d1bd3dca6bf3/aiohttp-3.14.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:317acd9f8602858dc7d59679812c376c7f0b97bcbbf16e0d6237f54141d8a8a6", size = 508657, upload-time = "2026-06-07T21:07:29.252Z" }, + { url = 
"https://files.pythonhosted.org/packages/e8/ed/3c0fb5c500fdd8e7ebc10d1889c04384fffa1a9163eac1356088ca9da1b1/aiohttp-3.14.1-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:bd869c427324e5cb15195793de951295710db28be7d818247f3097b4ab5d4b96", size = 1757907, upload-time = "2026-06-07T21:07:31.03Z" }, + { url = "https://files.pythonhosted.org/packages/9d/6e/dbf1d0625dc711fb2851f4f3c3055c39ed58bae92082d8c627dbe6013736/aiohttp-3.14.1-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:faccab372e66bc76d5731525e7f1143c922271725b9d38c9f97edcc66266b451", size = 1783881, upload-time = "2026-06-07T21:07:39.063Z" }, + { url = "https://files.pythonhosted.org/packages/2a/bd/cf9cee17e140f942a3de73e658a543aa8fbf35a5fc67a9d2538d52d77f0b/aiohttp-3.14.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:97e704dcd26271f5bda3fa07c3ce0fb76d6d3f8659f4baa1a24442cc9ba177ca", size = 1722137, upload-time = "2026-06-07T21:07:43.014Z" }, + { url = "https://files.pythonhosted.org/packages/ba/45/4de841f005cfe1fd63e2a2fe011262c515e2a62aa6994b15947e7d717ac9/aiohttp-3.14.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:cb21957bb8aca671c1765e32f58164cf0c50e6bf41c0bbbd16da20732ecaf588", size = 1761094, upload-time = "2026-06-07T21:07:54.113Z" }, + { url = "https://files.pythonhosted.org/packages/85/a5/9594ad6289eebbc97d167c44213d557807f90e59115caad24de21ad2c3b1/aiohttp-3.14.1-cp314-cp314-ios_13_0_arm64_iphoneos.whl", hash = "sha256:62a759436b29e677181a9e76bab8b8f689a29cb9c535f45f7c48c9c830d3f8c3", size = 487918, upload-time = "2026-06-07T21:08:06.377Z" }, + { url = "https://files.pythonhosted.org/packages/b4/61/16a32c36c3c49edec122a3dc811f2057df2f94d3b14aa107c8017d981618/aiohttp-3.14.1-cp314-cp314-ios_13_0_arm64_iphonesimulator.whl", hash = "sha256:2964cbf553df4d7a57348da44d961d871895fc1ee4e8c322b2a95612c7b17fba", size = 494014, upload-time = "2026-06-07T21:08:08.263Z" }, + { url = 
"https://files.pythonhosted.org/packages/9b/89/3ebcf96ed99c05bec9c434aaac6963fd3cbab4a786ae739908a144d9ce44/aiohttp-3.14.1-cp314-cp314-ios_13_0_x86_64_iphonesimulator.whl", hash = "sha256:237651caadc3a59badd39319c54642b5299e9cc98a3a194310e55d5bb9f5e397", size = 502398, upload-time = "2026-06-07T21:08:10.244Z" }, + { url = "https://files.pythonhosted.org/packages/fd/3d/b74870a0c2d40c355928cd5b96c7a11fa821b8a40fc41365e64479b151fb/aiohttp-3.14.1-cp314-cp314-macosx_10_15_universal2.whl", hash = "sha256:896e12dfdbbab9d8f7e16d2b28c6769a60126fa92095d1ebf9473d02593a2448", size = 758018, upload-time = "2026-06-07T21:08:12.447Z" }, + { url = "https://files.pythonhosted.org/packages/d3/66/f42f5c984d99e49c6cff5f26f590750f2e2f7ef1fcfb99966ab5be1b632e/aiohttp-3.14.1-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:d03f281ed22579314ba00821ce20115a7c0ac430660b4cc05704a3f818b3e004", size = 512462, upload-time = "2026-06-07T21:08:14.624Z" }, + { url = "https://files.pythonhosted.org/packages/e9/a7/248e1aebe0c7810b0271e021a0f2a5eb6e78a051885b3c9df49f42a5802d/aiohttp-3.14.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:07eabb979d236335fed927e137a928c9adfb7df3b9ec7aa31726f133a62be983", size = 512824, upload-time = "2026-06-07T21:08:16.572Z" }, + { url = "https://files.pythonhosted.org/packages/26/97/2aa0e5ba0727dc3bd5aaebb7ccbc510f7dfb7fb961ec87497cd496635ab1/aiohttp-3.14.1-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4fe1f1087cbadb280b5e1bb054a4f00d1423c74d6626c5e48400d871d34ecefe", size = 1749898, upload-time = "2026-06-07T21:08:18.635Z" }, + { url = "https://files.pythonhosted.org/packages/a0/18/938441025db6769a3464596b2410af3afde0b21eb2f204c6f766f68af4bd/aiohttp-3.14.1-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:634e385930fb6d2d479cf3aa66515955863b77a5e3c2b5894ca259a25b308602", size = 1760329, upload-time = "2026-06-07T21:08:27.363Z" }, + { url = 
"https://files.pythonhosted.org/packages/49/a2/2136674d52123b1354bd05dd5753c318db47dc0c927cc70b27bab3755456/aiohttp-3.14.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:335c0cc3e3545ce98dcb9cfcb836f40c3411f43fa03dab757597d80c89af8a35", size = 1714756, upload-time = "2026-06-07T21:08:32.094Z" }, + { url = "https://files.pythonhosted.org/packages/c1/af/14bb5843eccbe234f4dfb78ab73e549d99727247e62ae5d62cbd22eaf5b0/aiohttp-3.14.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:6ffbb2f4ec1ceaff7e07d43922954da26b223d188bf30658e561b98e23089444", size = 1742574, upload-time = "2026-06-07T21:08:43.795Z" }, + { url = "https://files.pythonhosted.org/packages/34/e3/19dbe1a1f4cc6230eb9e314de7fe68053b0992f9302b27d12141a0b5db53/aiohttp-3.14.1-cp314-cp314t-macosx_10_15_universal2.whl", hash = "sha256:819c054312f1af92947e6a55883d1b66feefab11531a7fc45e0fb9b63880b5c2", size = 793320, upload-time = "2026-06-07T21:08:52.775Z" }, + { url = "https://files.pythonhosted.org/packages/7f/20/1b7182219ba1b108430d6e4dc53d25ae02dcfcf5a045b33af4e8c5167527/aiohttp-3.14.1-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:10ee9c1753a8f706345b22496c79fbddb5be0599e0823f3738b1534058e25340", size = 529077, upload-time = "2026-06-07T21:08:55Z" }, + { url = "https://files.pythonhosted.org/packages/b9/c8/14ce60ec31a2e5f5274bb17d383a6f7a3aabca31ac04eee05585bbadab16/aiohttp-3.14.1-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:1601cc37baf5750ccacae618ec2daf020769581695550e3b654a911f859c563d", size = 532476, upload-time = "2026-06-07T21:08:57.176Z" }, + { url = "https://files.pythonhosted.org/packages/7e/02/9ac85e081e53da2e061b02fa7758fe0a12d17b8ce2d1f5e6c7cb76730328/aiohttp-3.14.1-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4d6e0ac9da31c9c04c84e1c0182ad8d6df35965a85cae29cd71d089621b3ae94", size = 1922347, upload-time = "2026-06-07T21:08:59.563Z" }, + { url = 
"https://files.pythonhosted.org/packages/66/4e/560c7472d3d198a23aa5c8b19a5115bf6a9b77b7d3e4bb363da320430ad2/aiohttp-3.14.1-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:fc0cacab7ba4e56f0f81c82a98c09bed2f39c940107b03a34b168bdf7597edd3", size = 1877095, upload-time = "2026-06-07T21:09:09.011Z" }, + { url = "https://files.pythonhosted.org/packages/6a/c9/48255813cca749a229ef0ab476004ec623728ad79a9c0840616f6c076325/aiohttp-3.14.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:38e1e7daaea81df51c952e18483f323d878499a1e2bfe564790e0f9701d6f203", size = 1842922, upload-time = "2026-06-07T21:09:14.118Z" }, + { url = "https://files.pythonhosted.org/packages/44/be/0474c5a8b5640e1e4aa1923430a91f4151be82e511373fe764189b89aef5/aiohttp-3.14.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:99abd37084b82f5830c635fddd0b4993b9742a66eb746dacf433c8590e8f9e3c", size = 1841409, upload-time = "2026-06-07T21:09:26.207Z" }, ] [[package]] @@ -858,7 +858,7 @@ test = [ [package.metadata] requires-dist = [ - { name = "aiohttp", marker = "extra == 'test'", specifier = "==3.14.0" }, + { name = "aiohttp", marker = "extra == 'test'", specifier = "==3.14.1" }, { name = "colorama", specifier = "==0.4.6" }, { name = "coverage", marker = "extra == 'test'", specifier = "==7.13.4" }, { name = "cyclopts", specifier = "==4.10.0" },