From c119fe990fe23db8fe4504cfc723508d387e4a8a Mon Sep 17 00:00:00 2001 From: maxyanghu Date: Tue, 26 May 2026 11:56:43 -0700 Subject: [PATCH 1/5] feat: add client-side vLLM profiling trigger Adds an optional client-side trigger that fires POST /start_profile at the performance phase start and /stop_profile at run end, so a profiled run can be driven from a YAML/CLI flag without coupling endpoints to any vendor harness. Schema: ProfilerEngine enum (currently {vllm}) and ProfilingConfig hung off Settings. URLs are auto-derived per entry in endpoint_config.endpoints (strip /v1, append engine-specific path). Default-off; warn-don't-fail throughout. Report.txt gets a Profiling section and a sibling profiling.json is written next to result_summary.json when the trigger is enabled. --- .../commands/benchmark/cli.py | 16 ++ .../commands/benchmark/execute.py | 190 +++++++++++++++++- src/inference_endpoint/config/schema.py | 36 ++++ 3 files changed, 237 insertions(+), 5 deletions(-) diff --git a/src/inference_endpoint/commands/benchmark/cli.py b/src/inference_endpoint/commands/benchmark/cli.py index 685d2d305..22893739c 100644 --- a/src/inference_endpoint/commands/benchmark/cli.py +++ b/src/inference_endpoint/commands/benchmark/cli.py @@ -29,6 +29,7 @@ BenchmarkConfig, OfflineBenchmarkConfig, OnlineBenchmarkConfig, + ProfilerEngine, TestMode, TestType, ) @@ -98,6 +99,13 @@ def from_config( config: Annotated[Path, cyclopts.Parameter(name=["--config", "-c"])], timeout: float | None = None, mode: TestMode | None = None, + profile: Annotated[ + ProfilerEngine | None, + cyclopts.Parameter( + name="--profile", + help="Profile the named inference engine around the performance phase", + ), + ] = None, ): """Run benchmark from YAML config file.""" try: @@ -106,6 +114,14 @@ def from_config( raise InputValidationError(f"Config error: {e}") from e if timeout is not None: resolved = resolved.with_updates(timeout=timeout) + if profile is not None: + new_profiling = 
resolved.settings.profiling.model_copy( + update={"engine": profile} + ) + new_settings = resolved.settings.model_copy( + update={"profiling": new_profiling} + ) + resolved = resolved.with_updates(settings=new_settings) test_mode = mode or ( TestMode.BOTH if resolved.type == TestType.SUBMISSION else TestMode.PERF ) diff --git a/src/inference_endpoint/commands/benchmark/execute.py b/src/inference_endpoint/commands/benchmark/execute.py index a2050bbe3..717df676d 100644 --- a/src/inference_endpoint/commands/benchmark/execute.py +++ b/src/inference_endpoint/commands/benchmark/execute.py @@ -30,13 +30,16 @@ import shutil import signal import tempfile +import time import uuid from collections.abc import Callable from dataclasses import dataclass, field from dataclasses import replace as dataclass_replace from datetime import datetime from pathlib import Path -from typing import Any +from typing import Any, TextIO +from urllib import error as urllib_error +from urllib import request as urllib_request from urllib.parse import urljoin import msgspec @@ -65,6 +68,7 @@ DatasetType, LoadPattern, LoadPatternType, + ProfilerEngine, StreamingMode, TestMode, TestType, @@ -140,6 +144,10 @@ class BenchmarkResult: collector: ResponseCollector report: Report | None tmpfs_dir: Path + # Profile trigger payload {engine: str, starts: [...], stops: [...]} when + # settings.profiling.engine is set; None otherwise. Rendered into + # report.txt and a sibling profiling.json by finalize_benchmark. + profiling: dict[str, Any] | None = None @dataclass @@ -538,6 +546,110 @@ def _load_final_snapshot_from_disk(path: Path) -> dict[str, Any] | None: return None +# (start_path, stop_path) for each supported inference engine's profiling +# protocol. Add a row when introducing a new ProfilerEngine variant. 
+_PROFILE_PATHS: dict[ProfilerEngine, tuple[str, str]] = { + ProfilerEngine.VLLM: ("/start_profile", "/stop_profile"), +} + + +def _derive_profile_urls( + endpoints: list[str], engine: ProfilerEngine, action: str +) -> list[str]: + """One profile URL per endpoint, derived from the engine's HTTP protocol. + + For vLLM: strip a trailing ``/v1`` from each endpoint and append + ``/{start,stop}_profile``. ``action`` is ``"start"`` or ``"stop"``. + """ + if not endpoints: + raise ValueError( + f"profiling.engine={engine.value} but endpoint_config.endpoints " + f"is empty; cannot derive {action} URLs" + ) + start_path, stop_path = _PROFILE_PATHS[engine] + path = start_path if action == "start" else stop_path + urls: list[str] = [] + for ep in endpoints: + base = ep.rstrip("/") + if base.endswith("/v1"): + base = base[:-3] + urls.append(f"{base.rstrip('/')}{path}") + return urls + + +def _post_profile(url: str) -> dict[str, Any]: + """POST {url} with empty body; never raises. Returns a record dict suitable + for report.txt rendering and profiling.json serialization.""" + record: dict[str, Any] = { + "url": url, + "sent_at_ns": time.monotonic_ns(), + "sent_at_iso": datetime.now().isoformat(timespec="milliseconds"), + "status": None, + "error": None, + } + req = urllib_request.Request(url, method="POST", data=b"") + try: + with urllib_request.urlopen(req, timeout=2) as resp: + record["status"] = resp.status + except urllib_error.HTTPError as e: + record["status"] = e.code + record["error"] = f"{e.code} {e.reason}" + except Exception as e: # noqa: BLE001 — profile failures must never abort a run + record["error"] = f"{type(e).__name__}: {e}" + return record + + +def _render_profile_status(rec: dict[str, Any]) -> str: + status = rec.get("status") + error = rec.get("error") + if status == 200: + return "200 OK" + if status == 404: + return ( + "404 (profiling not enabled on server — pass " + "--profiler-config.profiler=... 
to server)" + ) + if error: + return error + if status is not None: + return str(status) + return "ERROR" + + +def _write_profiling_section(f: TextIO, profiling: dict[str, Any]) -> None: + """Append the Profiling section to report.txt (called after report.display).""" + starts = profiling.get("starts", []) + stops = profiling.get("stops", []) + f.write("\n------------------- Profiling -------------------\n") + f.write(f"Engine: {profiling.get('engine', 'unknown')}\n") + f.write("Start:\n") + for rec in starts: + f.write( + f" POST {rec['url']} @ {rec['sent_at_iso']} → " + f"{_render_profile_status(rec)}\n" + ) + if stops: + f.write("Stop:\n") + for rec in stops: + suffix = ( + " (from abort handler)" if rec.get("stop_reason") == "abort" else "" + ) + f.write( + f" POST {rec['url']} @ {rec['sent_at_iso']} → " + f"{_render_profile_status(rec)}{suffix}\n" + ) + if starts and stops: + first_start = min(r["sent_at_ns"] for r in starts) + last_stop = max(r["sent_at_ns"] for r in stops) + f.write(f"Trigger span: {(last_stop - first_start) / 1e9:.2f} s\n") + f.write( + "\nNote: actual trace window is bounded by server-side " + "--profiler-config.delay_iterations and " + "--profiler-config.max_iterations.\n" + "Trace artifact path is in server stdout.\n" + ) + + async def _run_benchmark_async( ctx: BenchmarkContext, loop: asyncio.AbstractEventLoop, @@ -735,6 +847,22 @@ def _on_sample_complete(result: QueryResult) -> None: _timeout_done = False max_duration_ms = ctx.rt_settings.max_duration_ms + # Profile trigger state. Pre-derive URLs once so a bad config + # (engine set but no endpoints) fails before the run. 
+ profiling_cfg = config.settings.profiling + profile_start_urls: list[str] = [] + profile_stop_urls: list[str] = [] + profile_starts: list[dict[str, Any]] = [] + profile_stops: list[dict[str, Any]] = [] + if profiling_cfg.engine is not None: + profile_start_urls = _derive_profile_urls( + config.endpoint_config.endpoints, profiling_cfg.engine, "start" + ) + profile_stop_urls = _derive_profile_urls( + config.endpoint_config.endpoints, profiling_cfg.engine, "stop" + ) + session_completed_normally = False + def _on_global_timeout() -> None: if not _timeout_done: logger.warning( @@ -745,17 +873,32 @@ def _on_global_timeout() -> None: def _on_phase_start(phase: PhaseConfig) -> None: nonlocal global_timeout_handle - if ( - phase.phase_type == PhaseType.PERFORMANCE - and max_duration_ms is not None - ): + if phase.phase_type != PhaseType.PERFORMANCE: + return + if max_duration_ms is not None: global_timeout_handle = loop.call_later( max_duration_ms / 1000.0, _on_global_timeout ) + # Fire /start_profile sequentially before any perf request is + # issued, so the server is armed when traffic begins. Blocks + # the loop briefly (sub-100ms per URL); strategy task hasn't + # been created yet so nothing is starved. + for url in profile_start_urls: + rec = _post_profile(url) + if rec["status"] == 200: + logger.info("Profile start: %s -> 200 OK", url) + else: + logger.warning( + "Profile start: %s -> %s", + url, + rec["error"] or rec["status"], + ) + profile_starts.append(rec) loop.add_signal_handler(signal.SIGINT, session.stop) try: result = await session.run(phases, on_phase_start=_on_phase_start) + session_completed_normally = True except Exception as e: raise ExecutionError(f"Benchmark execution failed: {e}") from e finally: @@ -763,6 +906,25 @@ def _on_phase_start(phase: PhaseConfig) -> None: if global_timeout_handle is not None: global_timeout_handle.cancel() loop.remove_signal_handler(signal.SIGINT) + # Fire /stop_profile for URLs whose /start_profile succeeded. 
+ # Unifies the clean phase-end path and the abort path — + # both reach this block, both fire stops. + if profile_starts: + stop_reason = "phase_end" if session_completed_normally else "abort" + for i, start_rec in enumerate(profile_starts): + if start_rec["status"] != 200 or i >= len(profile_stop_urls): + continue + rec = _post_profile(profile_stop_urls[i]) + rec["stop_reason"] = stop_reason + if rec["status"] == 200: + logger.info("Profile stop: %s -> 200 OK", profile_stop_urls[i]) + else: + logger.warning( + "Profile stop: %s -> %s", + profile_stop_urls[i], + rec["error"] or rec["status"], + ) + profile_stops.append(rec) logger.info("Cleaning up...") try: if http_client: @@ -815,11 +977,20 @@ def _on_phase_start(phase: PhaseConfig) -> None: metrics_subscriber.close() pbar.close() + profiling_payload: dict[str, Any] | None = None + if profiling_cfg.engine is not None: + profiling_payload = { + "engine": profiling_cfg.engine.value, + "starts": profile_starts, + "stops": profile_stops, + } + return BenchmarkResult( session=result, collector=collector, report=report, tmpfs_dir=tmpfs_dir, + profiling=profiling_payload, ) @@ -888,8 +1059,17 @@ def finalize_benchmark(ctx: BenchmarkContext, bench: BenchmarkResult) -> None: report_txt = ctx.report_dir / "report.txt" with report_txt.open("w") as f: report.display(fn=lambda s: print(s, file=f)) + if bench.profiling is not None: + _write_profiling_section(f, bench.profiling) logger.info(f"Report written to {report_txt}") + # Sibling profiling.json — kept separate so Report stays a pure + # snapshot-derived struct. 
+ if bench.profiling is not None: + (ctx.report_dir / "profiling.json").write_text( + json.dumps(bench.profiling, indent=2) + ) + # Write scoring artifacts + copy event log from tmpfs to disk _write_scoring_artifacts(ctx, result, bench.tmpfs_dir) diff --git a/src/inference_endpoint/config/schema.py b/src/inference_endpoint/config/schema.py index 9226d7f85..7822f5a81 100644 --- a/src/inference_endpoint/config/schema.py +++ b/src/inference_endpoint/config/schema.py @@ -606,6 +606,41 @@ class DrainConfig(BaseModel): 2, ge=1, description="Number of tokenizer worker threads in the metrics aggregator (default: 2).", +class ProfilerEngine(str, Enum): + """Inference engine whose profiling protocol the client should drive. + + Selects the HTTP path layout used to derive start/stop URLs from + ``endpoint_config.endpoints``. Each value corresponds to one server-side + profiling protocol; add a new variant + ``_PROFILE_PATHS`` row to support + another engine. + """ + + VLLM = "vllm" + + +@cyclopts.Parameter(name="*") +class ProfilingConfig(BaseModel): + """Client-side trigger for the server's profiler. + + When ``engine`` is set, fires POST ```` at performance-phase + begin and POST ```` at performance-phase end. URLs are derived + from ``endpoint_config.endpoints`` using the engine-specific protocol. + Server must be launched with profiling enabled (e.g. vLLM's + ``--profiler-config.profiler=cuda|torch``); the schedule + (``delay_iterations``, ``max_iterations``) is set there, not here. 
+ """ + + model_config = ConfigDict(extra="forbid", frozen=True) + + engine: Annotated[ + ProfilerEngine | None, + cyclopts.Parameter( + alias="--profile", + help="Profile the named inference engine around the performance phase", + ), + ] = Field( + None, + description="Profile the named inference engine around the performance phase", ) @@ -623,6 +658,7 @@ class Settings(BaseModel): description="Per-phase in-flight response drain timeout configuration", ) warmup: WarmupConfig = Field(default_factory=WarmupConfig) + profiling: ProfilingConfig = Field(default_factory=ProfilingConfig) class OfflineSettings(Settings): From 0cd49ef8372c2fbaa55eb645f81a7e9d49360b3e Mon Sep 17 00:00:00 2001 From: maxyanghu Date: Thu, 11 Jun 2026 12:58:11 -0700 Subject: [PATCH 2/5] fix: close unclosed Field() for metrics_tokenizer_workers The profiler-trigger commit (a4fe30b) left the Field( call for metrics_tokenizer_workers unterminated, so config/schema.py raised SyntaxError and the inference-endpoint CLI could not import. Add the missing ) so the module compiles. --- src/inference_endpoint/config/schema.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/inference_endpoint/config/schema.py b/src/inference_endpoint/config/schema.py index 7822f5a81..3dabab773 100644 --- a/src/inference_endpoint/config/schema.py +++ b/src/inference_endpoint/config/schema.py @@ -606,6 +606,9 @@ class DrainConfig(BaseModel): 2, ge=1, description="Number of tokenizer worker threads in the metrics aggregator (default: 2).", + ) + + class ProfilerEngine(str, Enum): """Inference engine whose profiling protocol the client should drive. From f73ef13df4a54d92f254f985634bbe9914fae260 Mon Sep 17 00:00:00 2001 From: maxyanghu Date: Mon, 15 Jun 2026 11:01:58 -0700 Subject: [PATCH 3/5] feat: allow separate profiling endpoint override Add an optional profiling.urls (CLI --profile-urls) field so the profiler start/stop triggers can target a different host than the inference endpoint. 
When unset, URLs are still derived from endpoint_config.endpoints; when set, derivation runs over the override list using the same engine-specific protocol. Adds a scheme validator mirroring EndpointConfig and a matching --profile-urls override on the from-config subcommand. --- .../commands/benchmark/cli.py | 24 ++++++++++----- .../commands/benchmark/execute.py | 5 ++-- src/inference_endpoint/config/schema.py | 30 ++++++++++++++++++- .../templates/concurrency_template_full.yaml | 3 ++ .../templates/offline_template_full.yaml | 3 ++ .../templates/online_template_full.yaml | 3 ++ 6 files changed, 58 insertions(+), 10 deletions(-) diff --git a/src/inference_endpoint/commands/benchmark/cli.py b/src/inference_endpoint/commands/benchmark/cli.py index 22893739c..21042fb27 100644 --- a/src/inference_endpoint/commands/benchmark/cli.py +++ b/src/inference_endpoint/commands/benchmark/cli.py @@ -18,7 +18,7 @@ from __future__ import annotations from pathlib import Path -from typing import Annotated +from typing import Annotated, Any import cyclopts import yaml @@ -106,6 +106,15 @@ def from_config( help="Profile the named inference engine around the performance phase", ), ] = None, + profile_urls: Annotated[ + list[str] | None, + cyclopts.Parameter( + name="--profile-urls", + help="Override URL(s) for profiler triggers; " + "defaults to endpoint_config.endpoints", + negative="", + ), + ] = None, ): """Run benchmark from YAML config file.""" try: @@ -114,13 +123,14 @@ def from_config( raise InputValidationError(f"Config error: {e}") from e if timeout is not None: resolved = resolved.with_updates(timeout=timeout) + profiling_update: dict[str, Any] = {} if profile is not None: - new_profiling = resolved.settings.profiling.model_copy( - update={"engine": profile} - ) - new_settings = resolved.settings.model_copy( - update={"profiling": new_profiling} - ) + profiling_update["engine"] = profile + if profile_urls is not None: + profiling_update["urls"] = profile_urls + if 
profiling_update: + new_profiling = resolved.settings.profiling.model_copy(update=profiling_update) + new_settings = resolved.settings.model_copy(update={"profiling": new_profiling}) resolved = resolved.with_updates(settings=new_settings) test_mode = mode or ( TestMode.BOTH if resolved.type == TestType.SUBMISSION else TestMode.PERF diff --git a/src/inference_endpoint/commands/benchmark/execute.py b/src/inference_endpoint/commands/benchmark/execute.py index 717df676d..9db9c4900 100644 --- a/src/inference_endpoint/commands/benchmark/execute.py +++ b/src/inference_endpoint/commands/benchmark/execute.py @@ -855,11 +855,12 @@ def _on_sample_complete(result: QueryResult) -> None: profile_starts: list[dict[str, Any]] = [] profile_stops: list[dict[str, Any]] = [] if profiling_cfg.engine is not None: + profile_endpoints = profiling_cfg.urls or config.endpoint_config.endpoints profile_start_urls = _derive_profile_urls( - config.endpoint_config.endpoints, profiling_cfg.engine, "start" + profile_endpoints, profiling_cfg.engine, "start" ) profile_stop_urls = _derive_profile_urls( - config.endpoint_config.endpoints, profiling_cfg.engine, "stop" + profile_endpoints, profiling_cfg.engine, "stop" ) session_completed_normally = False diff --git a/src/inference_endpoint/config/schema.py b/src/inference_endpoint/config/schema.py index 3dabab773..b0ebe03c7 100644 --- a/src/inference_endpoint/config/schema.py +++ b/src/inference_endpoint/config/schema.py @@ -627,7 +627,8 @@ class ProfilingConfig(BaseModel): When ``engine`` is set, fires POST ```` at performance-phase begin and POST ```` at performance-phase end. URLs are derived - from ``endpoint_config.endpoints`` using the engine-specific protocol. + using the engine-specific protocol from ``urls`` when set, otherwise + from ``endpoint_config.endpoints``. Server must be launched with profiling enabled (e.g. 
vLLM's ``--profiler-config.profiler=cuda|torch``); the schedule (``delay_iterations``, ``max_iterations``) is set there, not here. @@ -645,6 +646,33 @@ class ProfilingConfig(BaseModel): None, description="Profile the named inference engine around the performance phase", ) + urls: Annotated[ + list[str] | None, + cyclopts.Parameter( + alias="--profile-urls", + help="Override URL(s) for profiler triggers; " + "defaults to endpoint_config.endpoints", + negative="", + ), + ] = Field( + None, + description="URL(s) the profiler start/stop triggers are derived from. " + "When None, derived from endpoint_config.endpoints instead. Use when " + "the profiler admin endpoint differs from the inference endpoint.", + ) + + @field_validator("urls", mode="after") + @classmethod + def _validate_url_scheme(cls, v: list[str] | None) -> list[str] | None: + if v is None: + return v + for url in v: + if not url.startswith(("http://", "https://")): + raise ValueError( + f"Profiling endpoint URL must include scheme " + f"(http:// or https://), got: {url!r}" + ) + return v @cyclopts.Parameter(name="*") diff --git a/src/inference_endpoint/config/templates/concurrency_template_full.yaml b/src/inference_endpoint/config/templates/concurrency_template_full.yaml index 38829f0f5..d0776d282 100644 --- a/src/inference_endpoint/config/templates/concurrency_template_full.yaml +++ b/src/inference_endpoint/config/templates/concurrency_template_full.yaml @@ -87,6 +87,9 @@ settings: salt: false # Prepend a unique random hex salt to each warmup prompt drain: false warmup_random_seed: 42 # RNG seed for warmup scheduling and sample ordering + profiling: + engine: null # Profile the named inference engine around the performance phase | options: vllm + urls: null # URL(s) the profiler start/stop triggers are derived from. When None, derived from endpoint_config.endpoints instead. Use when the profiler admin endpoint differs from the inference endpoint. endpoint_config: endpoints: # Endpoint URL(s). 
Must include scheme, e.g. 'http://host:port'. - http://localhost:8000 diff --git a/src/inference_endpoint/config/templates/offline_template_full.yaml b/src/inference_endpoint/config/templates/offline_template_full.yaml index c3454d5da..77ded061e 100644 --- a/src/inference_endpoint/config/templates/offline_template_full.yaml +++ b/src/inference_endpoint/config/templates/offline_template_full.yaml @@ -87,6 +87,9 @@ settings: salt: false # Prepend a unique random hex salt to each warmup prompt drain: false warmup_random_seed: 42 # RNG seed for warmup scheduling and sample ordering + profiling: + engine: null # Profile the named inference engine around the performance phase | options: vllm + urls: null # URL(s) the profiler start/stop triggers are derived from. When None, derived from endpoint_config.endpoints instead. Use when the profiler admin endpoint differs from the inference endpoint. endpoint_config: endpoints: # Endpoint URL(s). Must include scheme, e.g. 'http://host:port'. - http://localhost:8000 diff --git a/src/inference_endpoint/config/templates/online_template_full.yaml b/src/inference_endpoint/config/templates/online_template_full.yaml index 5bea95329..5ec9cbfcf 100644 --- a/src/inference_endpoint/config/templates/online_template_full.yaml +++ b/src/inference_endpoint/config/templates/online_template_full.yaml @@ -87,6 +87,9 @@ settings: salt: false # Prepend a unique random hex salt to each warmup prompt drain: false warmup_random_seed: 42 # RNG seed for warmup scheduling and sample ordering + profiling: + engine: null # Profile the named inference engine around the performance phase | options: vllm + urls: null # URL(s) the profiler start/stop triggers are derived from. When None, derived from endpoint_config.endpoints instead. Use when the profiler admin endpoint differs from the inference endpoint. endpoint_config: endpoints: # Endpoint URL(s). Must include scheme, e.g. 'http://host:port'. 
- http://localhost:8000 From 93f9a9e4de0d0acaa75f1572dc68940a4730cd91 Mon Sep 17 00:00:00 2001 From: maxyanghu Date: Tue, 16 Jun 2026 09:55:04 -0700 Subject: [PATCH 4/5] refactor: drop --profile/--profile-urls overrides from from-config Keep the from-config CLI surface minimal per review feedback: profiling is configured via the YAML settings.profiling block for from-config runs. This also removes the model_copy(update=...) path that bypassed ProfilingConfig URL-scheme validation. offline/online keep the schema-generated --profile/--profile-urls flags, which validate normally. --- .../commands/benchmark/cli.py | 28 +------------------ 1 file changed, 1 insertion(+), 27 deletions(-) diff --git a/src/inference_endpoint/commands/benchmark/cli.py b/src/inference_endpoint/commands/benchmark/cli.py index 21042fb27..685d2d305 100644 --- a/src/inference_endpoint/commands/benchmark/cli.py +++ b/src/inference_endpoint/commands/benchmark/cli.py @@ -18,7 +18,7 @@ from __future__ import annotations from pathlib import Path -from typing import Annotated, Any +from typing import Annotated import cyclopts import yaml @@ -29,7 +29,6 @@ BenchmarkConfig, OfflineBenchmarkConfig, OnlineBenchmarkConfig, - ProfilerEngine, TestMode, TestType, ) @@ -99,22 +98,6 @@ def from_config( config: Annotated[Path, cyclopts.Parameter(name=["--config", "-c"])], timeout: float | None = None, mode: TestMode | None = None, - profile: Annotated[ - ProfilerEngine | None, - cyclopts.Parameter( - name="--profile", - help="Profile the named inference engine around the performance phase", - ), - ] = None, - profile_urls: Annotated[ - list[str] | None, - cyclopts.Parameter( - name="--profile-urls", - help="Override URL(s) for profiler triggers; " - "defaults to endpoint_config.endpoints", - negative="", - ), - ] = None, ): """Run benchmark from YAML config file.""" try: @@ -123,15 +106,6 @@ def from_config( raise InputValidationError(f"Config error: {e}") from e if timeout is not None: resolved = 
resolved.with_updates(timeout=timeout) - profiling_update: dict[str, Any] = {} - if profile is not None: - profiling_update["engine"] = profile - if profile_urls is not None: - profiling_update["urls"] = profile_urls - if profiling_update: - new_profiling = resolved.settings.profiling.model_copy(update=profiling_update) - new_settings = resolved.settings.model_copy(update={"profiling": new_profiling}) - resolved = resolved.with_updates(settings=new_settings) test_mode = mode or ( TestMode.BOTH if resolved.type == TestType.SUBMISSION else TestMode.PERF ) From 604770e2871fd5b2f3ffaa4850f4989ca7d12b8d Mon Sep 17 00:00:00 2001 From: maxyanghu Date: Tue, 16 Jun 2026 11:35:12 -0700 Subject: [PATCH 5/5] test: cover profiling trigger config and helpers Adds unit tests for the client-side profiling trigger (review finding #3): - TestProfilingConfig (test_schema.py): defaults, engine enum coercion, and URL-scheme validation on both the direct-construction (offline/online) and model_validate (from-config YAML) paths. - TestProfilingHelpers (test_benchmark.py): _derive_profile_urls /v1 stripping and empty-endpoints ValueError, _post_profile 200/404/connection-failure via mocked urlopen, _render_profile_status, and _write_profiling_section output plus profiling.json serializability. 
--- tests/unit/commands/test_benchmark.py | 113 ++++++++++++++++++++++++++ tests/unit/config/test_schema.py | 32 ++++++++ 2 files changed, 145 insertions(+) diff --git a/tests/unit/commands/test_benchmark.py b/tests/unit/commands/test_benchmark.py index 1c90554fb..0a4b43c5d 100644 --- a/tests/unit/commands/test_benchmark.py +++ b/tests/unit/commands/test_benchmark.py @@ -16,11 +16,14 @@ """Tests for benchmark CLI models, config building, and command handlers.""" import asyncio +import io +import json import random import tempfile from pathlib import Path from types import SimpleNamespace from unittest.mock import MagicMock, patch +from urllib import error as urllib_error import pandas as pd import pytest @@ -34,7 +37,11 @@ BenchmarkContext, ResponseCollector, _build_phases, + _derive_profile_urls, + _post_profile, + _render_profile_status, _run_benchmark_async, + _write_profiling_section, setup_benchmark, ) from inference_endpoint.config.runtime_settings import RuntimeSettings @@ -46,6 +53,7 @@ LoadPatternType, OfflineSettings, OnlineSettings, + ProfilerEngine, RuntimeConfig, ScorerMethod, StreamingMode, @@ -1277,3 +1285,108 @@ def test_no_override_yields_none_when_model_has_no_tokenizer( ctx = setup_benchmark(config, TestMode.PERF) assert ctx.tokenizer_name is None + + +class TestProfilingHelpers: + @pytest.mark.unit + @pytest.mark.parametrize( + "endpoint,expected", + [ + ("http://h:8000/v1", "http://h:8000/start_profile"), + ("http://h:8000/v1/", "http://h:8000/start_profile"), + ("http://h:8000", "http://h:8000/start_profile"), + ], + ) + def test_derive_strips_v1(self, endpoint, expected): + out = _derive_profile_urls([endpoint], ProfilerEngine.VLLM, "start") + assert out == [expected] + + @pytest.mark.unit + def test_derive_stop_path_and_fanout(self): + out = _derive_profile_urls( + ["http://a/v1", "http://b/v1"], ProfilerEngine.VLLM, "stop" + ) + assert out == ["http://a/stop_profile", "http://b/stop_profile"] + + @pytest.mark.unit + def 
test_derive_empty_endpoints_raises(self): + with pytest.raises(ValueError): + _derive_profile_urls([], ProfilerEngine.VLLM, "start") + + @pytest.mark.unit + def test_post_profile_200(self): + resp = MagicMock() + resp.__enter__.return_value.status = 200 + with patch( + "inference_endpoint.commands.benchmark.execute.urllib_request.urlopen", + return_value=resp, + ): + rec = _post_profile("http://h/start_profile") + assert rec["status"] == 200 + assert rec["error"] is None + assert "sent_at_ns" in rec + assert "sent_at_iso" in rec + + @pytest.mark.unit + def test_post_profile_http_error(self): + err = urllib_error.HTTPError("http://h", 404, "Not Found", {}, None) + with patch( + "inference_endpoint.commands.benchmark.execute.urllib_request.urlopen", + side_effect=err, + ): + rec = _post_profile("http://h/start_profile") + assert rec["status"] == 404 + assert "404" in rec["error"] + + @pytest.mark.unit + def test_post_profile_connection_failure_never_raises(self): + with patch( + "inference_endpoint.commands.benchmark.execute.urllib_request.urlopen", + side_effect=OSError("refused"), + ): + rec = _post_profile("http://h/start_profile") + assert rec["status"] is None + assert "OSError" in rec["error"] + + @pytest.mark.unit + def test_render_status_200(self): + assert _render_profile_status({"status": 200, "error": None}) == "200 OK" + + @pytest.mark.unit + def test_render_status_404_hint(self): + out = _render_profile_status({"status": 404, "error": "404 Not Found"}) + assert "profiling not enabled" in out + + @pytest.mark.unit + def test_write_section_and_json_roundtrip(self): + payload = { + "engine": "vllm", + "starts": [ + { + "url": "http://h/start_profile", + "status": 200, + "error": None, + "sent_at_ns": 1, + "sent_at_iso": "2026-01-01T00:00:00.000", + } + ], + "stops": [ + { + "url": "http://h/stop_profile", + "status": 200, + "error": None, + "stop_reason": "phase_end", + "sent_at_ns": 2, + "sent_at_iso": "2026-01-01T00:00:01.000", + } + ], + } + buf = 
io.StringIO() + _write_profiling_section(buf, payload) + text = buf.getvalue() + assert "Profiling" in text + assert "http://h/start_profile" in text + assert "http://h/stop_profile" in text + assert "Trigger span" in text + # Mirrors what finalize_benchmark dumps to profiling.json + assert json.loads(json.dumps(payload))["engine"] == "vllm" diff --git a/tests/unit/config/test_schema.py b/tests/unit/config/test_schema.py index 143b5bc4c..f339ea3fe 100644 --- a/tests/unit/config/test_schema.py +++ b/tests/unit/config/test_schema.py @@ -31,11 +31,14 @@ ModelParams, OSLDistribution, OSLDistributionType, + ProfilerEngine, + ProfilingConfig, StreamingMode, SubmissionReference, TestType, ) from inference_endpoint.exceptions import CLIError +from pydantic import ValidationError class TestOSLDistribution: @@ -622,3 +625,32 @@ def test_agentic_inference_explicit_n_samples_takes_precedence(self): load_pattern=lp, ) assert rt.total_samples_to_issue() == 200 + + +class TestProfilingConfig: + @pytest.mark.unit + def test_defaults(self): + cfg = ProfilingConfig() + assert cfg.engine is None + assert cfg.urls is None + + @pytest.mark.unit + def test_engine_enum_coercion(self): + assert ProfilingConfig(engine="vllm").engine is ProfilerEngine.VLLM + + @pytest.mark.unit + @pytest.mark.parametrize( + "ctor", + [ + lambda u: ProfilingConfig(engine="vllm", urls=u), + lambda u: ProfilingConfig.model_validate({"engine": "vllm", "urls": u}), + ], + ) + def test_url_scheme_rejected_without_scheme(self, ctor): + with pytest.raises(ValidationError): + ctor(["localhost:8000"]) + + @pytest.mark.unit + def test_valid_urls_accepted(self): + cfg = ProfilingConfig(engine="vllm", urls=["http://h:8001/v1"]) + assert cfg.urls == ["http://h:8001/v1"]