-
Notifications
You must be signed in to change notification settings - Fork 22
Feat: Support creation of run_metadata.json at the end of benchmark run #374
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
37e839c
055bc53
5494a82
66838b4
ebf218a
c97e07b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -34,7 +34,7 @@ | |
| from collections.abc import Callable | ||
| from dataclasses import dataclass, field | ||
| from dataclasses import replace as dataclass_replace | ||
| from datetime import datetime | ||
| from datetime import UTC, datetime | ||
| from pathlib import Path | ||
| from typing import Any | ||
| from urllib.parse import urljoin | ||
|
|
@@ -96,6 +96,7 @@ | |
| SessionResult, | ||
| ) | ||
| from inference_endpoint.metrics.report import Report | ||
| from inference_endpoint.utils import monotime_to_datetime | ||
|
|
||
| transformers_logging.set_verbosity_error() | ||
|
|
||
|
|
@@ -990,6 +991,112 @@ def finalize_benchmark(ctx: BenchmarkContext, bench: BenchmarkResult) -> None: | |
| except Exception as e: | ||
| logger.error(f"Save failed: {e}") | ||
|
|
||
| try: | ||
| metadata_path = ctx.report_dir / "run_metadata.json" | ||
| run_metadata = _build_run_metadata( | ||
| ctx, report, qps=qps, start_time_ns=bench.session.start_time_ns | ||
| ) | ||
| with open(metadata_path, "w", encoding="utf-8") as f: | ||
| json.dump(run_metadata, f, indent=2) | ||
| logger.info("Run metadata written to %s", metadata_path) | ||
| except Exception as e: | ||
| logger.error("Failed to write run_metadata.json: %s", e) | ||
|
|
||
|
|
||
| def _ns_to_ms(val: float | int | None) -> float | None: | ||
| return float(val) / 1e6 if val is not None else None | ||
|
|
||
|
|
||
| def _metric_stat(metric: dict[str, Any], key: str) -> float | None: | ||
| return _ns_to_ms(metric.get(key)) if metric else None | ||
|
|
||
|
|
||
| def _metric_pct(metric: dict[str, Any], p: str) -> float | None: | ||
| return _ns_to_ms((metric.get("percentiles") or {}).get(p)) if metric else None | ||
|
|
||
|
|
||
| def _build_run_metadata( | ||
| ctx: BenchmarkContext, | ||
| report: Report | None, | ||
| *, | ||
| qps: float | None, | ||
| start_time_ns: int, | ||
| ) -> dict[str, Any]: | ||
| """Build the run_metadata.json payload from a completed benchmark run. | ||
|
|
||
| Infrastructure fields (tensor_parallel, config_summary, etc.) are left as | ||
| None | ||
| """ | ||
| system_tps = report.tps() if report is not None else None | ||
| concurrency = ctx.config.settings.load_pattern.target_concurrency | ||
| tps_per_user = ( | ||
| (system_tps / concurrency) if (system_tps is not None and concurrency) else None | ||
| ) | ||
|
anandhu-eng marked this conversation as resolved.
|
||
|
|
||
| ttft = report.ttft if report is not None else {} | ||
| tpot = report.tpot if report is not None else {} | ||
| latency = report.latency if report is not None else {} | ||
|
|
||
| return { | ||
| "run_date": monotime_to_datetime(start_time_ns) | ||
| .astimezone(UTC) | ||
| .date() | ||
| .isoformat(), | ||
|
Comment on lines
+1041
to
+1044
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think this is fine as the project uses python>3.12 as detailed here |
||
| "config_summary": None, | ||
| "disaggregated": None, | ||
| "expert_parallel": None, | ||
| "tensor_parallel": None, | ||
| "pipeline_parallel": None, | ||
| "data_parallel": None, | ||
| "batch": None, | ||
| "config_summary_notes": None, | ||
| "concurrency": concurrency, | ||
| "system_tps": system_tps, | ||
| "tps_per_user": tps_per_user, | ||
| "ttft": _metric_pct(ttft, "99.0"), | ||
| "qps": qps, | ||
| "tps_utilization": None, | ||
| "measured_total_output_tokens": ( | ||
| (report.output_sequence_lengths or {}).get("total") | ||
| if report is not None | ||
| else None | ||
| ), | ||
| "measured_run_duration": ( | ||
| report.duration_ns / 1e9 | ||
| if (report is not None and report.duration_ns is not None) | ||
| else None | ||
| ), | ||
| "measured_total_requests": ( | ||
| report.n_samples_completed if report is not None else None | ||
| ), | ||
| "link_config": None, | ||
| "link_logs": None, | ||
| "measured_latency_ttft_min": _metric_stat(ttft, "min"), | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hi Anandu, I would suggest that we have submission checker or other means to auto-generate the run_metadata.json by reading the results_summary.json in post-processing, i.e. make results_summary.json self complete and don't create more output file that duplicate contents. By creating more separate files with a similar set of metrics, we are increasing the entropy of the system and decreasing the maintainability of the module. Not all scenarios will have the TTFT/TPOT in endpoints (e.g. T2V, VLM).
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We can discuss in slack or iterate on the exact requirements. Right now we have points.yaml and results_summary.json which should have all the information |
||
| "measured_latency_ttft_average": _metric_stat(ttft, "avg"), | ||
| "measured_latency_ttft_p50": _metric_pct(ttft, "50.0"), | ||
| "measured_latency_ttft_p90": _metric_pct(ttft, "90.0"), | ||
| "measured_latency_ttft_p95": _metric_pct(ttft, "95.0"), | ||
| "measured_latency_ttft_p99": _metric_pct(ttft, "99.0"), | ||
| "measured_latency_ttft_p999": _metric_pct(ttft, "99.9"), | ||
| "measured_latency_ttft_max": _metric_stat(ttft, "max"), | ||
| "measured_latency_tpot_min": _metric_stat(tpot, "min"), | ||
| "measured_latency_tpot_average": _metric_stat(tpot, "avg"), | ||
| "measured_latency_tpot_p50": _metric_pct(tpot, "50.0"), | ||
| "measured_latency_tpot_p90": _metric_pct(tpot, "90.0"), | ||
| "measured_latency_tpot_p95": _metric_pct(tpot, "95.0"), | ||
| "measured_latency_tpot_p99": _metric_pct(tpot, "99.0"), | ||
| "measured_latency_tpot_p999": _metric_pct(tpot, "99.9"), | ||
| "measured_latency_tpot_max": _metric_stat(tpot, "max"), | ||
| "measured_latency_request_min": _metric_stat(latency, "min"), | ||
| "measured_latency_request_average": _metric_stat(latency, "avg"), | ||
| "measured_latency_request_p50": _metric_pct(latency, "50.0"), | ||
| "measured_latency_request_p90": _metric_pct(latency, "90.0"), | ||
| "measured_latency_request_p95": _metric_pct(latency, "95.0"), | ||
| "measured_latency_request_p99": _metric_pct(latency, "99.0"), | ||
| "measured_latency_request_p999": _metric_pct(latency, "99.9"), | ||
| "measured_latency_request_max": _metric_stat(latency, "max"), | ||
| } | ||
|
|
||
|
|
||
| def run_benchmark(config: BenchmarkConfig, test_mode: TestMode) -> None: | ||
| """Orchestrate setup → execute → finalize.""" | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Importing
UTCdirectly fromdatetimewas introduced in Python 3.11. If this project is run on Python 3.10 or older, this import will raise anImportError. To ensure backward compatibility with older Python versions, it is safer to importtimezoneand usetimezone.utcinstead.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think this is fine as the project uses
python>3.12as detailed here