Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
109 changes: 108 additions & 1 deletion src/inference_endpoint/commands/benchmark/execute.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
from collections.abc import Callable
from dataclasses import dataclass, field
from dataclasses import replace as dataclass_replace
from datetime import datetime
from datetime import UTC, datetime

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

Importing UTC directly from datetime was introduced in Python 3.11. If this project is run on Python 3.10 or older, this import will raise an ImportError. To ensure backward compatibility with older Python versions, it is safer to import timezone and use timezone.utc instead.

Suggested change
from datetime import UTC, datetime
from datetime import datetime, timezone

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this is fine as the project uses python>3.12 as detailed here

from pathlib import Path
from typing import Any
from urllib.parse import urljoin
Expand Down Expand Up @@ -96,6 +96,7 @@
SessionResult,
)
from inference_endpoint.metrics.report import Report
from inference_endpoint.utils import monotime_to_datetime

transformers_logging.set_verbosity_error()

Expand Down Expand Up @@ -990,6 +991,112 @@ def finalize_benchmark(ctx: BenchmarkContext, bench: BenchmarkResult) -> None:
except Exception as e:
logger.error(f"Save failed: {e}")

try:
metadata_path = ctx.report_dir / "run_metadata.json"
run_metadata = _build_run_metadata(
ctx, report, qps=qps, start_time_ns=bench.session.start_time_ns
)
with open(metadata_path, "w", encoding="utf-8") as f:
json.dump(run_metadata, f, indent=2)
logger.info("Run metadata written to %s", metadata_path)
except Exception as e:
logger.error("Failed to write run_metadata.json: %s", e)


def _ns_to_ms(val: float | int | None) -> float | None:
return float(val) / 1e6 if val is not None else None


def _metric_stat(metric: dict[str, Any], key: str) -> float | None:
return _ns_to_ms(metric.get(key)) if metric else None


def _metric_pct(metric: dict[str, Any], p: str) -> float | None:
return _ns_to_ms((metric.get("percentiles") or {}).get(p)) if metric else None


def _build_run_metadata(
ctx: BenchmarkContext,
report: Report | None,
*,
qps: float | None,
start_time_ns: int,
) -> dict[str, Any]:
"""Build the run_metadata.json payload from a completed benchmark run.

Infrastructure fields (tensor_parallel, config_summary, etc.) are left as
None
"""
system_tps = report.tps() if report is not None else None
concurrency = ctx.config.settings.load_pattern.target_concurrency
tps_per_user = (
(system_tps / concurrency) if (system_tps is not None and concurrency) else None
)
Comment thread
anandhu-eng marked this conversation as resolved.

ttft = report.ttft if report is not None else {}
tpot = report.tpot if report is not None else {}
latency = report.latency if report is not None else {}

return {
"run_date": monotime_to_datetime(start_time_ns)
.astimezone(UTC)
.date()
.isoformat(),
Comment on lines +1041 to +1044

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

Use timezone.utc instead of UTC to maintain compatibility with Python versions older than 3.11.

Suggested change
"run_date": monotime_to_datetime(start_time_ns)
.astimezone(UTC)
.date()
.isoformat(),
"run_date": monotime_to_datetime(start_time_ns)
.astimezone(timezone.utc)
.date()
.isoformat(),

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this is fine as the project uses python>3.12 as detailed here

"config_summary": None,
"disaggregated": None,
"expert_parallel": None,
"tensor_parallel": None,
"pipeline_parallel": None,
"data_parallel": None,
"batch": None,
"config_summary_notes": None,
"concurrency": concurrency,
"system_tps": system_tps,
"tps_per_user": tps_per_user,
"ttft": _metric_pct(ttft, "99.0"),
"qps": qps,
"tps_utilization": None,
"measured_total_output_tokens": (
(report.output_sequence_lengths or {}).get("total")
if report is not None
else None
),
"measured_run_duration": (
report.duration_ns / 1e9
if (report is not None and report.duration_ns is not None)
else None
),
"measured_total_requests": (
report.n_samples_completed if report is not None else None
),
"link_config": None,
"link_logs": None,
"measured_latency_ttft_min": _metric_stat(ttft, "min"),

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi Anandu, I would suggest that we have submission checker or other means to auto-generate the run_metadata.json by reading the results_summary.json in post-processing, i.e. make results_summary.json self complete and don't create more output file that duplicate contents.

By creating more separate files with a similar set of metrics, we are increasing the entropy of the system and decreasing the maintainability of the module. Not all scenarios will have the TTFT/TPOT in endpoints (e.g. T2V, VLM).

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can discuss in slack or iterate on the exact requirements. Right now we have points.yaml and results_summary.json which should have all the information

"measured_latency_ttft_average": _metric_stat(ttft, "avg"),
"measured_latency_ttft_p50": _metric_pct(ttft, "50.0"),
"measured_latency_ttft_p90": _metric_pct(ttft, "90.0"),
"measured_latency_ttft_p95": _metric_pct(ttft, "95.0"),
"measured_latency_ttft_p99": _metric_pct(ttft, "99.0"),
"measured_latency_ttft_p999": _metric_pct(ttft, "99.9"),
"measured_latency_ttft_max": _metric_stat(ttft, "max"),
"measured_latency_tpot_min": _metric_stat(tpot, "min"),
"measured_latency_tpot_average": _metric_stat(tpot, "avg"),
"measured_latency_tpot_p50": _metric_pct(tpot, "50.0"),
"measured_latency_tpot_p90": _metric_pct(tpot, "90.0"),
"measured_latency_tpot_p95": _metric_pct(tpot, "95.0"),
"measured_latency_tpot_p99": _metric_pct(tpot, "99.0"),
"measured_latency_tpot_p999": _metric_pct(tpot, "99.9"),
"measured_latency_tpot_max": _metric_stat(tpot, "max"),
"measured_latency_request_min": _metric_stat(latency, "min"),
"measured_latency_request_average": _metric_stat(latency, "avg"),
"measured_latency_request_p50": _metric_pct(latency, "50.0"),
"measured_latency_request_p90": _metric_pct(latency, "90.0"),
"measured_latency_request_p95": _metric_pct(latency, "95.0"),
"measured_latency_request_p99": _metric_pct(latency, "99.0"),
"measured_latency_request_p999": _metric_pct(latency, "99.9"),
"measured_latency_request_max": _metric_stat(latency, "max"),
}


def run_benchmark(config: BenchmarkConfig, test_mode: TestMode) -> None:
"""Orchestrate setup → execute → finalize."""
Expand Down
Loading
Loading