From c119fe990fe23db8fe4504cfc723508d387e4a8a Mon Sep 17 00:00:00 2001
From: maxyanghu <hyoung2991@gmail.com>
Date: Tue, 26 May 2026 11:56:43 -0700
Subject: [PATCH 1/5] feat: add client-side vLLM profiling trigger

Adds an optional client-side trigger that fires POST /start_profile
at the performance phase start and /stop_profile at run end, so a
profiled run can be driven from a YAML/CLI flag without coupling
endpoints to any vendor harness.

Schema: ProfilerEngine enum (currently {vllm}) and ProfilingConfig
hung off Settings. URLs are auto-derived per entry in
endpoint_config.endpoints (strip /v1, append engine-specific path).
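Default-off. Failed trigger requests only log a warning and never abort
the run; the one hard failure is an empty endpoint list, which raises
before the run starts.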

Report.txt gets a Profiling section and a sibling profiling.json is
written next to result_summary.json when the trigger is enabled.
---
 .../commands/benchmark/cli.py                 |  16 ++
 .../commands/benchmark/execute.py             | 190 +++++++++++++++++-
 src/inference_endpoint/config/schema.py       |  36 ++++
 3 files changed, 237 insertions(+), 5 deletions(-)

diff --git a/src/inference_endpoint/commands/benchmark/cli.py b/src/inference_endpoint/commands/benchmark/cli.py
index 685d2d305..22893739c 100644
--- a/src/inference_endpoint/commands/benchmark/cli.py
+++ b/src/inference_endpoint/commands/benchmark/cli.py
@@ -29,6 +29,7 @@
     BenchmarkConfig,
     OfflineBenchmarkConfig,
     OnlineBenchmarkConfig,
+    ProfilerEngine,
     TestMode,
     TestType,
 )
@@ -98,6 +99,13 @@ def from_config(
     config: Annotated[Path, cyclopts.Parameter(name=["--config", "-c"])],
     timeout: float | None = None,
     mode: TestMode | None = None,
+    profile: Annotated[
+        ProfilerEngine | None,
+        cyclopts.Parameter(
+            name="--profile",
+            help="Profile the named inference engine around the performance phase",
+        ),
+    ] = None,
 ):
     """Run benchmark from YAML config file."""
     try:
@@ -106,6 +114,14 @@ def from_config(
         raise InputValidationError(f"Config error: {e}") from e
     if timeout is not None:
         resolved = resolved.with_updates(timeout=timeout)
+    if profile is not None:
+        new_profiling = resolved.settings.profiling.model_copy(
+            update={"engine": profile}
+        )
+        new_settings = resolved.settings.model_copy(
+            update={"profiling": new_profiling}
+        )
+        resolved = resolved.with_updates(settings=new_settings)
     test_mode = mode or (
         TestMode.BOTH if resolved.type == TestType.SUBMISSION else TestMode.PERF
     )
diff --git a/src/inference_endpoint/commands/benchmark/execute.py b/src/inference_endpoint/commands/benchmark/execute.py
index a2050bbe3..717df676d 100644
--- a/src/inference_endpoint/commands/benchmark/execute.py
+++ b/src/inference_endpoint/commands/benchmark/execute.py
@@ -30,13 +30,16 @@
 import shutil
 import signal
 import tempfile
+import time
 import uuid
 from collections.abc import Callable
 from dataclasses import dataclass, field
 from dataclasses import replace as dataclass_replace
 from datetime import datetime
 from pathlib import Path
-from typing import Any
+from typing import Any, TextIO
+from urllib import error as urllib_error
+from urllib import request as urllib_request
 from urllib.parse import urljoin
 
 import msgspec
@@ -65,6 +68,7 @@
     DatasetType,
     LoadPattern,
     LoadPatternType,
+    ProfilerEngine,
     StreamingMode,
     TestMode,
     TestType,
@@ -140,6 +144,10 @@ class BenchmarkResult:
     collector: ResponseCollector
     report: Report | None
     tmpfs_dir: Path
+    # Profile trigger payload {engine: str, starts: [...], stops: [...]} when
+    # settings.profiling.engine is set; None otherwise. Rendered into
+    # report.txt and a sibling profiling.json by finalize_benchmark.
+    profiling: dict[str, Any] | None = None
 
 
 @dataclass
@@ -538,6 +546,110 @@ def _load_final_snapshot_from_disk(path: Path) -> dict[str, Any] | None:
         return None
 
 
+# (start_path, stop_path) for each supported inference engine's profiling
+# protocol. Add a row when introducing a new ProfilerEngine variant.
+_PROFILE_PATHS: dict[ProfilerEngine, tuple[str, str]] = {
+    ProfilerEngine.VLLM: ("/start_profile", "/stop_profile"),
+}
+
+
+def _derive_profile_urls(
+    endpoints: list[str], engine: ProfilerEngine, action: str
+) -> list[str]:
+    """One profile URL per endpoint, derived from the engine's HTTP protocol.
+
+    For vLLM: strip a trailing ``/v1`` from each endpoint and append
+    ``/{start,stop}_profile``. ``action`` is ``"start"`` or ``"stop"``.
+    """
+    if not endpoints:
+        raise ValueError(
+            f"profiling.engine={engine.value} but endpoint_config.endpoints "
+            f"is empty; cannot derive {action} URLs"
+        )
+    start_path, stop_path = _PROFILE_PATHS[engine]
+    path = start_path if action == "start" else stop_path
+    urls: list[str] = []
+    for ep in endpoints:
+        base = ep.rstrip("/")
+        if base.endswith("/v1"):
+            base = base[:-3]
+        urls.append(f"{base.rstrip('/')}{path}")
+    return urls
+
+
+def _post_profile(url: str) -> dict[str, Any]:
+    """POST {url} with empty body; never raises. Returns a record dict suitable
+    for report.txt rendering and profiling.json serialization."""
+    record: dict[str, Any] = {
+        "url": url,
+        "sent_at_ns": time.monotonic_ns(),
+        "sent_at_iso": datetime.now().isoformat(timespec="milliseconds"),
+        "status": None,
+        "error": None,
+    }
+    req = urllib_request.Request(url, method="POST", data=b"")
+    try:
+        with urllib_request.urlopen(req, timeout=2) as resp:
+            record["status"] = resp.status
+    except urllib_error.HTTPError as e:
+        record["status"] = e.code
+        record["error"] = f"{e.code} {e.reason}"
+    except Exception as e:  # noqa: BLE001 — profile failures must never abort a run
+        record["error"] = f"{type(e).__name__}: {e}"
+    return record
+
+
+def _render_profile_status(rec: dict[str, Any]) -> str:
+    status = rec.get("status")
+    error = rec.get("error")
+    if status == 200:
+        return "200 OK"
+    if status == 404:
+        return (
+            "404 (profiling not enabled on server — pass "
+            "--profiler-config.profiler=... to server)"
+        )
+    if error:
+        return error
+    if status is not None:
+        return str(status)
+    return "ERROR"
+
+
+def _write_profiling_section(f: TextIO, profiling: dict[str, Any]) -> None:
+    """Append the Profiling section to report.txt (called after report.display)."""
+    starts = profiling.get("starts", [])
+    stops = profiling.get("stops", [])
+    f.write("\n------------------- Profiling -------------------\n")
+    f.write(f"Engine: {profiling.get('engine', 'unknown')}\n")
+    f.write("Start:\n")
+    for rec in starts:
+        f.write(
+            f"  POST {rec['url']} @ {rec['sent_at_iso']} → "
+            f"{_render_profile_status(rec)}\n"
+        )
+    if stops:
+        f.write("Stop:\n")
+        for rec in stops:
+            suffix = (
+                " (from abort handler)" if rec.get("stop_reason") == "abort" else ""
+            )
+            f.write(
+                f"  POST {rec['url']} @ {rec['sent_at_iso']} → "
+                f"{_render_profile_status(rec)}{suffix}\n"
+            )
+    if starts and stops:
+        first_start = min(r["sent_at_ns"] for r in starts)
+        last_stop = max(r["sent_at_ns"] for r in stops)
+        f.write(f"Trigger span: {(last_stop - first_start) / 1e9:.2f} s\n")
+    f.write(
+        "\nNote: actual trace window is bounded by server-side "
+        "--profiler-config.delay_iterations and "
+        "--profiler-config.max_iterations.\n"
+        "Trace artifact path is in server stdout.\n"
+    )
+
+
 async def _run_benchmark_async(
     ctx: BenchmarkContext,
     loop: asyncio.AbstractEventLoop,
@@ -735,6 +847,22 @@ def _on_sample_complete(result: QueryResult) -> None:
         _timeout_done = False
         max_duration_ms = ctx.rt_settings.max_duration_ms
 
+        # Profile trigger state. Pre-derive URLs once so a bad config
+        # (engine set but no endpoints) fails before the run.
+        profiling_cfg = config.settings.profiling
+        profile_start_urls: list[str] = []
+        profile_stop_urls: list[str] = []
+        profile_starts: list[dict[str, Any]] = []
+        profile_stops: list[dict[str, Any]] = []
+        if profiling_cfg.engine is not None:
+            profile_start_urls = _derive_profile_urls(
+                config.endpoint_config.endpoints, profiling_cfg.engine, "start"
+            )
+            profile_stop_urls = _derive_profile_urls(
+                config.endpoint_config.endpoints, profiling_cfg.engine, "stop"
+            )
+        session_completed_normally = False
+
         def _on_global_timeout() -> None:
             if not _timeout_done:
                 logger.warning(
@@ -745,17 +873,32 @@ def _on_global_timeout() -> None:
 
         def _on_phase_start(phase: PhaseConfig) -> None:
             nonlocal global_timeout_handle
-            if (
-                phase.phase_type == PhaseType.PERFORMANCE
-                and max_duration_ms is not None
-            ):
+            if phase.phase_type != PhaseType.PERFORMANCE:
+                return
+            if max_duration_ms is not None:
                 global_timeout_handle = loop.call_later(
                     max_duration_ms / 1000.0, _on_global_timeout
                 )
+            # Fire /start_profile sequentially before any perf request is
+            # issued, so the server is armed when traffic begins. Blocks
+            # the loop for up to the 2 s _post_profile timeout per URL;
+            # strategy task hasn't been created yet so nothing is starved.
+            for url in profile_start_urls:
+                rec = _post_profile(url)
+                if rec["status"] == 200:
+                    logger.info("Profile start: %s -> 200 OK", url)
+                else:
+                    logger.warning(
+                        "Profile start: %s -> %s",
+                        url,
+                        rec["error"] or rec["status"],
+                    )
+                profile_starts.append(rec)
 
         loop.add_signal_handler(signal.SIGINT, session.stop)
         try:
             result = await session.run(phases, on_phase_start=_on_phase_start)
+            session_completed_normally = True
         except Exception as e:
             raise ExecutionError(f"Benchmark execution failed: {e}") from e
         finally:
@@ -763,6 +906,25 @@ def _on_phase_start(phase: PhaseConfig) -> None:
             if global_timeout_handle is not None:
                 global_timeout_handle.cancel()
             loop.remove_signal_handler(signal.SIGINT)
+            # Fire /stop_profile for URLs whose /start_profile succeeded.
+            # Unifies the clean phase-end path and the abort path —
+            # both reach this block, both fire stops.
+            if profile_starts:
+                stop_reason = "phase_end" if session_completed_normally else "abort"
+                for i, start_rec in enumerate(profile_starts):
+                    if start_rec["status"] != 200 or i >= len(profile_stop_urls):
+                        continue
+                    rec = _post_profile(profile_stop_urls[i])
+                    rec["stop_reason"] = stop_reason
+                    if rec["status"] == 200:
+                        logger.info("Profile stop: %s -> 200 OK", profile_stop_urls[i])
+                    else:
+                        logger.warning(
+                            "Profile stop: %s -> %s",
+                            profile_stop_urls[i],
+                            rec["error"] or rec["status"],
+                        )
+                    profile_stops.append(rec)
             logger.info("Cleaning up...")
             try:
                 if http_client:
@@ -815,11 +977,20 @@ def _on_phase_start(phase: PhaseConfig) -> None:
             metrics_subscriber.close()
             pbar.close()
 
+    profiling_payload: dict[str, Any] | None = None
+    if profiling_cfg.engine is not None:
+        profiling_payload = {
+            "engine": profiling_cfg.engine.value,
+            "starts": profile_starts,
+            "stops": profile_stops,
+        }
+
     return BenchmarkResult(
         session=result,
         collector=collector,
         report=report,
         tmpfs_dir=tmpfs_dir,
+        profiling=profiling_payload,
     )
 
 
@@ -888,8 +1059,17 @@ def finalize_benchmark(ctx: BenchmarkContext, bench: BenchmarkResult) -> None:
         report_txt = ctx.report_dir / "report.txt"
         with report_txt.open("w") as f:
             report.display(fn=lambda s: print(s, file=f))
+            if bench.profiling is not None:
+                _write_profiling_section(f, bench.profiling)
         logger.info(f"Report written to {report_txt}")
 
+    # Sibling profiling.json — kept separate so Report stays a pure
+    # snapshot-derived struct.
+    if bench.profiling is not None:
+        (ctx.report_dir / "profiling.json").write_text(
+            json.dumps(bench.profiling, indent=2)
+        )
+
     # Write scoring artifacts + copy event log from tmpfs to disk
     _write_scoring_artifacts(ctx, result, bench.tmpfs_dir)
 
diff --git a/src/inference_endpoint/config/schema.py b/src/inference_endpoint/config/schema.py
index 9226d7f85..7822f5a81 100644
--- a/src/inference_endpoint/config/schema.py
+++ b/src/inference_endpoint/config/schema.py
@@ -606,6 +606,41 @@ class DrainConfig(BaseModel):
         2,
         ge=1,
         description="Number of tokenizer worker threads in the metrics aggregator (default: 2).",
+class ProfilerEngine(str, Enum):
+    """Inference engine whose profiling protocol the client should drive.
+
+    Selects the HTTP path layout used to derive start/stop URLs from
+    ``endpoint_config.endpoints``. Each value corresponds to one server-side
+    profiling protocol; add a new variant + ``_PROFILE_PATHS`` row to support
+    another engine.
+    """
+
+    VLLM = "vllm"
+
+
+@cyclopts.Parameter(name="*")
+class ProfilingConfig(BaseModel):
+    """Client-side trigger for the server's profiler.
+
+    When ``engine`` is set, fires POST ``<start_path>`` at performance-phase
+    begin and POST ``<stop_path>`` at performance-phase end. URLs are derived
+    from ``endpoint_config.endpoints`` using the engine-specific protocol.
+    Server must be launched with profiling enabled (e.g. vLLM's
+    ``--profiler-config.profiler=cuda|torch``); the schedule
+    (``delay_iterations``, ``max_iterations``) is set there, not here.
+    """
+
+    model_config = ConfigDict(extra="forbid", frozen=True)
+
+    engine: Annotated[
+        ProfilerEngine | None,
+        cyclopts.Parameter(
+            alias="--profile",
+            help="Profile the named inference engine around the performance phase",
+        ),
+    ] = Field(
+        None,
+        description="Profile the named inference engine around the performance phase",
     )
 
 
@@ -623,6 +658,7 @@ class Settings(BaseModel):
         description="Per-phase in-flight response drain timeout configuration",
     )
     warmup: WarmupConfig = Field(default_factory=WarmupConfig)
+    profiling: ProfilingConfig = Field(default_factory=ProfilingConfig)
 
 
 class OfflineSettings(Settings):

From 0cd49ef8372c2fbaa55eb645f81a7e9d49360b3e Mon Sep 17 00:00:00 2001
From: maxyanghu <hyoung2991@gmail.com>
Date: Thu, 11 Jun 2026 12:58:11 -0700
Subject: [PATCH 2/5] fix: close unclosed Field() for metrics_tokenizer_workers

The profiler-trigger commit (a4fe30b) left the Field( call for
metrics_tokenizer_workers unterminated, so config/schema.py raised
SyntaxError and the inference-endpoint CLI could not import. Add the
missing ) so the module compiles.
---
 src/inference_endpoint/config/schema.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/inference_endpoint/config/schema.py b/src/inference_endpoint/config/schema.py
index 7822f5a81..3dabab773 100644
--- a/src/inference_endpoint/config/schema.py
+++ b/src/inference_endpoint/config/schema.py
@@ -606,6 +606,9 @@ class DrainConfig(BaseModel):
         2,
         ge=1,
         description="Number of tokenizer worker threads in the metrics aggregator (default: 2).",
+    )
+
+
 class ProfilerEngine(str, Enum):
     """Inference engine whose profiling protocol the client should drive.
 

From f73ef13df4a54d92f254f985634bbe9914fae260 Mon Sep 17 00:00:00 2001
From: maxyanghu <hyoung2991@gmail.com>
Date: Mon, 15 Jun 2026 11:01:58 -0700
Subject: [PATCH 3/5] feat: allow separate profiling endpoint override

Add an optional profiling.urls (CLI --profile-urls) field so the
profiler start/stop triggers can target a different host than the inference
endpoint. When unset, URLs are still derived from endpoint_config.endpoints;
when set, derivation runs over the override list using the same engine-specific
protocol. Adds a scheme validator mirroring EndpointConfig and a matching
--profile-urls override on the from-config subcommand.
---
 .../commands/benchmark/cli.py                 | 24 ++++++++++-----
 .../commands/benchmark/execute.py             |  5 ++--
 src/inference_endpoint/config/schema.py       | 30 ++++++++++++++++++-
 .../templates/concurrency_template_full.yaml  |  3 ++
 .../templates/offline_template_full.yaml      |  3 ++
 .../templates/online_template_full.yaml       |  3 ++
 6 files changed, 58 insertions(+), 10 deletions(-)

diff --git a/src/inference_endpoint/commands/benchmark/cli.py b/src/inference_endpoint/commands/benchmark/cli.py
index 22893739c..21042fb27 100644
--- a/src/inference_endpoint/commands/benchmark/cli.py
+++ b/src/inference_endpoint/commands/benchmark/cli.py
@@ -18,7 +18,7 @@
 from __future__ import annotations
 
 from pathlib import Path
-from typing import Annotated
+from typing import Annotated, Any
 
 import cyclopts
 import yaml
@@ -106,6 +106,15 @@ def from_config(
             help="Profile the named inference engine around the performance phase",
         ),
     ] = None,
+    profile_urls: Annotated[
+        list[str] | None,
+        cyclopts.Parameter(
+            name="--profile-urls",
+            help="Override URL(s) for profiler triggers; "
+            "defaults to endpoint_config.endpoints",
+            negative="",
+        ),
+    ] = None,
 ):
     """Run benchmark from YAML config file."""
     try:
@@ -114,13 +123,14 @@ def from_config(
         raise InputValidationError(f"Config error: {e}") from e
     if timeout is not None:
         resolved = resolved.with_updates(timeout=timeout)
+    profiling_update: dict[str, Any] = {}
     if profile is not None:
-        new_profiling = resolved.settings.profiling.model_copy(
-            update={"engine": profile}
-        )
-        new_settings = resolved.settings.model_copy(
-            update={"profiling": new_profiling}
-        )
+        profiling_update["engine"] = profile
+    if profile_urls is not None:
+        profiling_update["urls"] = profile_urls
+    if profiling_update:
+        new_profiling = resolved.settings.profiling.model_copy(update=profiling_update)
+        new_settings = resolved.settings.model_copy(update={"profiling": new_profiling})
         resolved = resolved.with_updates(settings=new_settings)
     test_mode = mode or (
         TestMode.BOTH if resolved.type == TestType.SUBMISSION else TestMode.PERF
diff --git a/src/inference_endpoint/commands/benchmark/execute.py b/src/inference_endpoint/commands/benchmark/execute.py
index 717df676d..9db9c4900 100644
--- a/src/inference_endpoint/commands/benchmark/execute.py
+++ b/src/inference_endpoint/commands/benchmark/execute.py
@@ -855,11 +855,12 @@ def _on_sample_complete(result: QueryResult) -> None:
         profile_starts: list[dict[str, Any]] = []
         profile_stops: list[dict[str, Any]] = []
         if profiling_cfg.engine is not None:
+            profile_endpoints = profiling_cfg.urls or config.endpoint_config.endpoints
             profile_start_urls = _derive_profile_urls(
-                config.endpoint_config.endpoints, profiling_cfg.engine, "start"
+                profile_endpoints, profiling_cfg.engine, "start"
             )
             profile_stop_urls = _derive_profile_urls(
-                config.endpoint_config.endpoints, profiling_cfg.engine, "stop"
+                profile_endpoints, profiling_cfg.engine, "stop"
             )
         session_completed_normally = False
 
diff --git a/src/inference_endpoint/config/schema.py b/src/inference_endpoint/config/schema.py
index 3dabab773..b0ebe03c7 100644
--- a/src/inference_endpoint/config/schema.py
+++ b/src/inference_endpoint/config/schema.py
@@ -627,7 +627,8 @@ class ProfilingConfig(BaseModel):
 
     When ``engine`` is set, fires POST ``<start_path>`` at performance-phase
     begin and POST ``<stop_path>`` at performance-phase end. URLs are derived
-    from ``endpoint_config.endpoints`` using the engine-specific protocol.
+    using the engine-specific protocol from ``urls`` when set, otherwise
+    from ``endpoint_config.endpoints``.
     Server must be launched with profiling enabled (e.g. vLLM's
     ``--profiler-config.profiler=cuda|torch``); the schedule
     (``delay_iterations``, ``max_iterations``) is set there, not here.
@@ -645,6 +646,33 @@ class ProfilingConfig(BaseModel):
         None,
         description="Profile the named inference engine around the performance phase",
     )
+    urls: Annotated[
+        list[str] | None,
+        cyclopts.Parameter(
+            alias="--profile-urls",
+            help="Override URL(s) for profiler triggers; "
+            "defaults to endpoint_config.endpoints",
+            negative="",
+        ),
+    ] = Field(
+        None,
+        description="URL(s) the profiler start/stop triggers are derived from. "
+        "When None, derived from endpoint_config.endpoints instead. Use when "
+        "the profiler admin endpoint differs from the inference endpoint.",
+    )
+
+    @field_validator("urls", mode="after")
+    @classmethod
+    def _validate_url_scheme(cls, v: list[str] | None) -> list[str] | None:
+        if v is None:
+            return v
+        for url in v:
+            if not url.startswith(("http://", "https://")):
+                raise ValueError(
+                    f"Profiling endpoint URL must include scheme "
+                    f"(http:// or https://), got: {url!r}"
+                )
+        return v
 
 
 @cyclopts.Parameter(name="*")
diff --git a/src/inference_endpoint/config/templates/concurrency_template_full.yaml b/src/inference_endpoint/config/templates/concurrency_template_full.yaml
index 38829f0f5..d0776d282 100644
--- a/src/inference_endpoint/config/templates/concurrency_template_full.yaml
+++ b/src/inference_endpoint/config/templates/concurrency_template_full.yaml
@@ -87,6 +87,9 @@ settings:
     salt: false  # Prepend a unique random hex salt to each warmup prompt
     drain: false
     warmup_random_seed: 42  # RNG seed for warmup scheduling and sample ordering
+  profiling:
+    engine: null  # Profile the named inference engine around the performance phase | options: vllm
+    urls: null  # URL(s) the profiler start/stop triggers are derived from. When None, derived from endpoint_config.endpoints instead. Use when the profiler admin endpoint differs from the inference endpoint.
 endpoint_config:
   endpoints:  # Endpoint URL(s). Must include scheme, e.g. 'http://host:port'.
   - http://localhost:8000
diff --git a/src/inference_endpoint/config/templates/offline_template_full.yaml b/src/inference_endpoint/config/templates/offline_template_full.yaml
index c3454d5da..77ded061e 100644
--- a/src/inference_endpoint/config/templates/offline_template_full.yaml
+++ b/src/inference_endpoint/config/templates/offline_template_full.yaml
@@ -87,6 +87,9 @@ settings:
     salt: false  # Prepend a unique random hex salt to each warmup prompt
     drain: false
     warmup_random_seed: 42  # RNG seed for warmup scheduling and sample ordering
+  profiling:
+    engine: null  # Profile the named inference engine around the performance phase | options: vllm
+    urls: null  # URL(s) the profiler start/stop triggers are derived from. When None, derived from endpoint_config.endpoints instead. Use when the profiler admin endpoint differs from the inference endpoint.
 endpoint_config:
   endpoints:  # Endpoint URL(s). Must include scheme, e.g. 'http://host:port'.
   - http://localhost:8000
diff --git a/src/inference_endpoint/config/templates/online_template_full.yaml b/src/inference_endpoint/config/templates/online_template_full.yaml
index 5bea95329..5ec9cbfcf 100644
--- a/src/inference_endpoint/config/templates/online_template_full.yaml
+++ b/src/inference_endpoint/config/templates/online_template_full.yaml
@@ -87,6 +87,9 @@ settings:
     salt: false  # Prepend a unique random hex salt to each warmup prompt
     drain: false
     warmup_random_seed: 42  # RNG seed for warmup scheduling and sample ordering
+  profiling:
+    engine: null  # Profile the named inference engine around the performance phase | options: vllm
+    urls: null  # URL(s) the profiler start/stop triggers are derived from. When None, derived from endpoint_config.endpoints instead. Use when the profiler admin endpoint differs from the inference endpoint.
 endpoint_config:
   endpoints:  # Endpoint URL(s). Must include scheme, e.g. 'http://host:port'.
   - http://localhost:8000

From 93f9a9e4de0d0acaa75f1572dc68940a4730cd91 Mon Sep 17 00:00:00 2001
From: maxyanghu <hyoung2991@gmail.com>
Date: Tue, 16 Jun 2026 09:55:04 -0700
Subject: [PATCH 4/5] refactor: drop --profile/--profile-urls overrides from
 from-config

Keep the from-config CLI surface minimal per review feedback: profiling
is configured via the YAML settings.profiling block for from-config runs.
This also removes the model_copy(update=...) path that bypassed
ProfilingConfig URL-scheme validation. offline/online keep the
schema-generated --profile/--profile-urls flags, which validate normally.
---
 .../commands/benchmark/cli.py                 | 28 +------------------
 1 file changed, 1 insertion(+), 27 deletions(-)

diff --git a/src/inference_endpoint/commands/benchmark/cli.py b/src/inference_endpoint/commands/benchmark/cli.py
index 21042fb27..685d2d305 100644
--- a/src/inference_endpoint/commands/benchmark/cli.py
+++ b/src/inference_endpoint/commands/benchmark/cli.py
@@ -18,7 +18,7 @@
 from __future__ import annotations
 
 from pathlib import Path
-from typing import Annotated, Any
+from typing import Annotated
 
 import cyclopts
 import yaml
@@ -29,7 +29,6 @@
     BenchmarkConfig,
     OfflineBenchmarkConfig,
     OnlineBenchmarkConfig,
-    ProfilerEngine,
     TestMode,
     TestType,
 )
@@ -99,22 +98,6 @@ def from_config(
     config: Annotated[Path, cyclopts.Parameter(name=["--config", "-c"])],
     timeout: float | None = None,
     mode: TestMode | None = None,
-    profile: Annotated[
-        ProfilerEngine | None,
-        cyclopts.Parameter(
-            name="--profile",
-            help="Profile the named inference engine around the performance phase",
-        ),
-    ] = None,
-    profile_urls: Annotated[
-        list[str] | None,
-        cyclopts.Parameter(
-            name="--profile-urls",
-            help="Override URL(s) for profiler triggers; "
-            "defaults to endpoint_config.endpoints",
-            negative="",
-        ),
-    ] = None,
 ):
     """Run benchmark from YAML config file."""
     try:
@@ -123,15 +106,6 @@ def from_config(
         raise InputValidationError(f"Config error: {e}") from e
     if timeout is not None:
         resolved = resolved.with_updates(timeout=timeout)
-    profiling_update: dict[str, Any] = {}
-    if profile is not None:
-        profiling_update["engine"] = profile
-    if profile_urls is not None:
-        profiling_update["urls"] = profile_urls
-    if profiling_update:
-        new_profiling = resolved.settings.profiling.model_copy(update=profiling_update)
-        new_settings = resolved.settings.model_copy(update={"profiling": new_profiling})
-        resolved = resolved.with_updates(settings=new_settings)
     test_mode = mode or (
         TestMode.BOTH if resolved.type == TestType.SUBMISSION else TestMode.PERF
     )

From 604770e2871fd5b2f3ffaa4850f4989ca7d12b8d Mon Sep 17 00:00:00 2001
From: maxyanghu <hyoung2991@gmail.com>
Date: Tue, 16 Jun 2026 11:35:12 -0700
Subject: [PATCH 5/5] test: cover profiling trigger config and helpers

Adds unit tests for the client-side profiling trigger (review finding #3):
- TestProfilingConfig (test_schema.py): defaults, engine enum coercion, and
  URL-scheme validation on both the direct-construction (offline/online) and
  model_validate (from-config YAML) paths.
- TestProfilingHelpers (test_benchmark.py): _derive_profile_urls /v1 stripping
  and empty-endpoints ValueError, _post_profile 200/404/connection-failure via
  mocked urlopen, _render_profile_status, and _write_profiling_section output
  plus profiling.json serializability.
---
 tests/unit/commands/test_benchmark.py | 113 ++++++++++++++++++++++++++
 tests/unit/config/test_schema.py      |  32 ++++++++
 2 files changed, 145 insertions(+)

diff --git a/tests/unit/commands/test_benchmark.py b/tests/unit/commands/test_benchmark.py
index 1c90554fb..0a4b43c5d 100644
--- a/tests/unit/commands/test_benchmark.py
+++ b/tests/unit/commands/test_benchmark.py
@@ -16,11 +16,14 @@
 """Tests for benchmark CLI models, config building, and command handlers."""
 
 import asyncio
+import io
+import json
 import random
 import tempfile
 from pathlib import Path
 from types import SimpleNamespace
 from unittest.mock import MagicMock, patch
+from urllib import error as urllib_error
 
 import pandas as pd
 import pytest
@@ -34,7 +37,11 @@
     BenchmarkContext,
     ResponseCollector,
     _build_phases,
+    _derive_profile_urls,
+    _post_profile,
+    _render_profile_status,
     _run_benchmark_async,
+    _write_profiling_section,
     setup_benchmark,
 )
 from inference_endpoint.config.runtime_settings import RuntimeSettings
@@ -46,6 +53,7 @@
     LoadPatternType,
     OfflineSettings,
     OnlineSettings,
+    ProfilerEngine,
     RuntimeConfig,
     ScorerMethod,
     StreamingMode,
@@ -1277,3 +1285,108 @@ def test_no_override_yields_none_when_model_has_no_tokenizer(
             ctx = setup_benchmark(config, TestMode.PERF)
 
         assert ctx.tokenizer_name is None
+
+
+class TestProfilingHelpers:  # unit tests for the client-side profiling trigger helpers
+    @pytest.mark.unit
+    @pytest.mark.parametrize(
+        "endpoint,expected",
+        [
+            ("http://h:8000/v1", "http://h:8000/start_profile"),
+            ("http://h:8000/v1/", "http://h:8000/start_profile"),  # trailing slash
+            ("http://h:8000", "http://h:8000/start_profile"),  # no /v1 suffix
+        ],
+    )
+    def test_derive_strips_v1(self, endpoint, expected):
+        out = _derive_profile_urls([endpoint], ProfilerEngine.VLLM, "start")
+        assert out == [expected]
+
+    @pytest.mark.unit
+    def test_derive_stop_path_and_fanout(self):  # one stop URL per endpoint, in order
+        out = _derive_profile_urls(
+            ["http://a/v1", "http://b/v1"], ProfilerEngine.VLLM, "stop"
+        )
+        assert out == ["http://a/stop_profile", "http://b/stop_profile"]
+
+    @pytest.mark.unit
+    def test_derive_empty_endpoints_raises(self):
+        with pytest.raises(ValueError):
+            _derive_profile_urls([], ProfilerEngine.VLLM, "start")
+
+    @pytest.mark.unit
+    def test_post_profile_200(self):
+        resp = MagicMock()
+        resp.__enter__.return_value.status = 200  # status on the object __enter__ yields
+        with patch(
+            "inference_endpoint.commands.benchmark.execute.urllib_request.urlopen",
+            return_value=resp,
+        ):
+            rec = _post_profile("http://h/start_profile")
+        assert rec["status"] == 200
+        assert rec["error"] is None
+        assert "sent_at_ns" in rec
+        assert "sent_at_iso" in rec
+
+    @pytest.mark.unit
+    def test_post_profile_http_error(self):  # HTTP error is recorded, not raised
+        err = urllib_error.HTTPError("http://h", 404, "Not Found", {}, None)
+        with patch(
+            "inference_endpoint.commands.benchmark.execute.urllib_request.urlopen",
+            side_effect=err,
+        ):
+            rec = _post_profile("http://h/start_profile")
+        assert rec["status"] == 404
+        assert "404" in rec["error"]
+
+    @pytest.mark.unit
+    def test_post_profile_connection_failure_never_raises(self):
+        with patch(
+            "inference_endpoint.commands.benchmark.execute.urllib_request.urlopen",
+            side_effect=OSError("refused"),
+        ):
+            rec = _post_profile("http://h/start_profile")
+        assert rec["status"] is None  # no HTTP status to report
+        assert "OSError" in rec["error"]  # error text names the exception type
+
+    @pytest.mark.unit
+    def test_render_status_200(self):
+        assert _render_profile_status({"status": 200, "error": None}) == "200 OK"
+
+    @pytest.mark.unit
+    def test_render_status_404_hint(self):
+        out = _render_profile_status({"status": 404, "error": "404 Not Found"})
+        assert "profiling not enabled" in out  # hint text for a 404 reply
+
+    @pytest.mark.unit
+    def test_write_section_and_json_roundtrip(self):
+        payload = {
+            "engine": "vllm",
+            "starts": [
+                {
+                    "url": "http://h/start_profile",
+                    "status": 200,
+                    "error": None,
+                    "sent_at_ns": 1,
+                    "sent_at_iso": "2026-01-01T00:00:00.000",
+                }
+            ],
+            "stops": [
+                {
+                    "url": "http://h/stop_profile",
+                    "status": 200,
+                    "error": None,
+                    "stop_reason": "phase_end",
+                    "sent_at_ns": 2,
+                    "sent_at_iso": "2026-01-01T00:00:01.000",
+                }
+            ],
+        }
+        buf = io.StringIO()
+        _write_profiling_section(buf, payload)  # renders into the in-memory buffer
+        text = buf.getvalue()
+        assert "Profiling" in text
+        assert "http://h/start_profile" in text
+        assert "http://h/stop_profile" in text
+        assert "Trigger span" in text
+        # Mirrors what finalize_benchmark dumps to profiling.json
+        assert json.loads(json.dumps(payload))["engine"] == "vllm"
diff --git a/tests/unit/config/test_schema.py b/tests/unit/config/test_schema.py
index 143b5bc4c..f339ea3fe 100644
--- a/tests/unit/config/test_schema.py
+++ b/tests/unit/config/test_schema.py
@@ -31,11 +31,14 @@
     ModelParams,
     OSLDistribution,
     OSLDistributionType,
+    ProfilerEngine,
+    ProfilingConfig,
     StreamingMode,
     SubmissionReference,
     TestType,
 )
 from inference_endpoint.exceptions import CLIError
+from pydantic import ValidationError
 
 
 class TestOSLDistribution:
@@ -622,3 +625,32 @@ def test_agentic_inference_explicit_n_samples_takes_precedence(self):
             load_pattern=lp,
         )
         assert rt.total_samples_to_issue() == 200
+
+
+class TestProfilingConfig:  # schema checks for ProfilingConfig defaults and validation
+    @pytest.mark.unit
+    def test_defaults(self):  # both fields default to None
+        cfg = ProfilingConfig()
+        assert cfg.engine is None
+        assert cfg.urls is None
+
+    @pytest.mark.unit
+    def test_engine_enum_coercion(self):  # the string "vllm" becomes the enum member
+        assert ProfilingConfig(engine="vllm").engine is ProfilerEngine.VLLM
+
+    @pytest.mark.unit
+    @pytest.mark.parametrize(
+        "ctor",  # keyword construction and model_validate must both reject the URL
+        [
+            lambda u: ProfilingConfig(engine="vllm", urls=u),
+            lambda u: ProfilingConfig.model_validate({"engine": "vllm", "urls": u}),
+        ],
+    )
+    def test_url_scheme_rejected_without_scheme(self, ctor):
+        with pytest.raises(ValidationError):
+            ctor(["localhost:8000"])  # host:port with no URL scheme
+
+    @pytest.mark.unit
+    def test_valid_urls_accepted(self):  # accepted URLs are stored unchanged
+        cfg = ProfilingConfig(engine="vllm", urls=["http://h:8001/v1"])
+        assert cfg.urls == ["http://h:8001/v1"]