diff --git a/tools/submission/README.md b/tools/submission/README.md
index 6d620233b2..119520a412 100644
--- a/tools/submission/README.md
+++ b/tools/submission/README.md
@@ -32,7 +32,7 @@ The input submission directory is modified with empty directories removed and lo
## `submission_checker/main.py` (Mandatory)
### Inputs
**input**: Path to the directory containing one or several submissions.
-**version**: Checker version. E.g v1.1, v2.0, v2.1, v3.0, v3.1.
+**version**: Checker version. E.g v5.0, v5.1, v6.0, v6.1.
**submitter**: Filter submitters and only run the checks for some specific submitter.
**csv**: Output path where the csv with the results will be stored. E.g `results/summary.csv`.
**skip_compliance**: Flag to skip compliance checks.
@@ -71,25 +71,34 @@ python3 -m inference.tools.submission.submission_checker.main
[--skip-calibration-check]
```
-### implemented checks
-**performance:**
+### Implemented checks
+**performance (loadgen):**
- Check performance detailed log exists
- Check for loadgen errors
- Check for equal issue mode when it is required
- Check the performance sample count used for running the benchmark
- Check loadgen seeds are correct
-- Check latency constrain is met
-- Check minimun query count is met
-- Check minimun duration is met
+- Check latency constraint is met
+- Check minimum query count is met
+- Check minimum duration is met
- Check network requirements
-- Check LLM latencies are met (if applies)
+- Check LLM TTFT/TPOT latencies are met via `use_token_latencies` flag (if applies)
- Check loadgen scenario matches with submission scenario or that result can be inferred
+**performance (endpoints):**
+- Check result_summary.json and config.yaml exist
+- Check latency p99 constraint is met (from `latency.percentiles.99.0` in result_summary.json)
+- Check minimum duration is met (from `settings.runtime.min_duration_ms` in config.yaml)
+- Check LLM TTFT/TPOT p99 limits directly from result_summary.json for Server/Interactive scenarios
+- Extract primary metric as QPS (inferred from `n_samples_issued / duration_s` if not in results)
+- Skips: sample count check, seed check, min query count check (not applicable to endpoints)
+
**accuracy**
- Check the accuracy metric is correct and over the expected threshold (or within a range if applies)
-- Check accuracy json exists and is truncated
+- Check accuracy json exists and is truncated (loadgen only)
- Check for loadgen error
- Check full dataset is used for the accuracy run
+- Check `accuracy_scores` field is present and non-null (endpoints only)
**compliance**
- Check compliance directory exists
@@ -112,11 +121,10 @@ python3 -m inference.tools.submission.submission_checker.main
- Check availability is valid
- Check system type is valid
- Check network fields
-- Check required fields are include in system json file
+- Check required fields are included in system json file
- Check submitter is correct
- Check division is correct
-
### Outputs
- CSV file containing all the valid results in the directory.
- It raises several errors and logs invalid results.
diff --git a/tools/submission/submission_checker/checks/accuracy_check.py b/tools/submission/submission_checker/checks/accuracy_check.py
index db1b1a7559..5f7225e11e 100644
--- a/tools/submission/submission_checker/checks/accuracy_check.py
+++ b/tools/submission/submission_checker/checks/accuracy_check.py
@@ -70,6 +70,8 @@ def __init__(
"scenario", "")
self.scenario = self.mlperf_log["effective_scenario"]
self.division = self.submission_logs.loader_data.get("division", "")
+ self.is_endpoints = self.submission_logs.loader_data.get(
+ "is_endpoints_submission", False)
self.setup_checks()
def setup_checks(self):
@@ -83,6 +85,9 @@ def setup_checks(self):
self.checks.append(self.loadgen_errors_check)
self.checks.append(self.dataset_check)
self.checks.append(self.extra_files_check)
+ self.apply_checks = set(self.checks)
+ if self.is_endpoints:
+ self.apply_checks.remove(self.accuracy_json_check)
def accuracy_result_check(self):
"""Validate reported accuracy metrics in `accuracy.txt`.
@@ -97,6 +102,13 @@ def accuracy_result_check(self):
False otherwise.
"""
+ if self.is_endpoints:
+ if self.mlperf_log["accuracy_score"] is not None:
+ self.submission_logs.loader_data["accuracy_metrics"] = self.mlperf_log["accuracy_score"]
+ return True
+ self.log.error("%s accuracy score not found", self.path)
+ return False
+
patterns, acc_targets, acc_types, acc_limits, up_patterns, acc_upper_limit = self.config.get_accuracy_values(
self.model
)
diff --git a/tools/submission/submission_checker/checks/base.py b/tools/submission/submission_checker/checks/base.py
index 8e2a678fb9..69f51da638 100644
--- a/tools/submission/submission_checker/checks/base.py
+++ b/tools/submission/submission_checker/checks/base.py
@@ -9,6 +9,7 @@ class BaseCheck(ABC):
def __init__(self, log, path):
self.checks = []
+ self.apply_checks = set()
self.log = log
self.path = path
self.name = "base checks"
@@ -21,22 +22,32 @@ def run_checks(self):
valid = True
errors = []
for check in self.checks:
- try:
- v = self.execute(check)
- valid &= v
- except BaseException:
- valid &= False
- self.log.error(
- "Execution occurred in running check %s. Running %s in %s",
- self.path,
+ if self.check_applies(check):
+ try:
+ v = self.execute(check)
+ valid &= v
+ except BaseException:
+ valid &= False
+ self.log.error(
+ "Execution occurred in running check %s. Running %s in %s",
+ self.path,
+ check.__name__,
+ self.__class__.__name__)
+ else:
+ self.log.warning(
+ "Execution of check %s skipped for %s.",
check.__name__,
- self.__class__.__name__)
+ self.path
+ )
return valid
def execute(self, check):
"""Custom execution of a single check method."""
return check()
+ def check_applies(self, fn):
+ return fn in self.apply_checks
+
def __call__(self):
"""Allows the check instance to be called like a function."""
self.log.info("Starting %s for: %s", self.name, self.path)
diff --git a/tools/submission/submission_checker/checks/compliance_check.py b/tools/submission/submission_checker/checks/compliance_check.py
index 13cc6b16b8..4853a832f3 100644
--- a/tools/submission/submission_checker/checks/compliance_check.py
+++ b/tools/submission/submission_checker/checks/compliance_check.py
@@ -50,6 +50,8 @@ def __init__(self, log, path, config: Config,
self.model = self.config.get_mlperf_model(
self.model, self.model_mapping)
self.test_list = self.get_test_list(self.model)
+ self.is_endpoints = self.submission_logs.loader_data.get(
+ "is_endpoints_submission", False)
self.setup_checks()
def setup_checks(self):
@@ -62,6 +64,10 @@ def setup_checks(self):
self.checks.append(self.performance_check)
self.checks.append(self.accuracy_check)
self.checks.append(self.compliance_performance_check)
+ self.apply_checks = set(self.checks)
+ # No compliance tests for endpoints for now
+ if self.is_endpoints:
+ self.apply_checks = set()
def get_test_list(self, model):
"""Return the list of compliance tests applicable to `model`.
@@ -186,6 +192,7 @@ def performance_check(self):
"model_mapping": self.submission_logs.loader_data.get("model_mapping", {}),
"check_scenarios": True,
"compliance_skip": True,
+ "is_endpoints_submission": self.submission_logs.loader_data.get("is_endpoints_submission", False),
}
test_logs = SubmissionLogs(
self.submission_logs.loader_data[f"{test}_perf_log"], None, None, None, self.submission_logs.system_json, None, test_data)
@@ -322,7 +329,9 @@ def accuracy_check(self):
first_token_pass and eos_pass and length_check_pass)
if not is_valid:
self.log.error(
- f"TEST06 accuracy check failed. first_token_check: {first_token_pass} eos_check: {eos_pass} length_check: {length_check_pass}."
+ f"TEST06 accuracy check failed. first_token_check:" +
+ f"{first_token_pass} eos_check: " +
+ f"{eos_pass} length_check: {length_check_pass}."
)
elif test == "TEST07":
# TEST07: Verify accuracy in performance mode
diff --git a/tools/submission/submission_checker/checks/measurements_checks.py b/tools/submission/submission_checker/checks/measurements_checks.py
index 06b89f56fc..8a2731f4e6 100644
--- a/tools/submission/submission_checker/checks/measurements_checks.py
+++ b/tools/submission/submission_checker/checks/measurements_checks.py
@@ -61,6 +61,9 @@ def setup_checks(self):
self.checks.append(self.directory_exist_check)
self.checks.append(self.required_files_check)
self.checks.append(self.required_fields_check)
+ self.is_endpoints = self.submission_logs.loader_data.get(
+ "is_endpoints_submission", False)
+ self.apply_checks = set(self.checks)
def missing_check(self):
"""Ensure a measurements JSON was provided.
diff --git a/tools/submission/submission_checker/checks/performance_check.py b/tools/submission/submission_checker/checks/performance_check.py
index e54e7b5564..ddb42f8b4c 100644
--- a/tools/submission/submission_checker/checks/performance_check.py
+++ b/tools/submission/submission_checker/checks/performance_check.py
@@ -53,6 +53,11 @@ def __init__(self, log, path, config: Config,
"scenario", "")
self.scenario = self.mlperf_log["effective_scenario"]
self.division = self.submission_logs.loader_data.get("division", "")
+ self.is_endpoints = self.submission_logs.loader_data.get(
+ "is_endpoints_submission", False)
+ if self.is_endpoints:
+ if self.scenario.lower() == "online":
+ self.scenario = "Server"
self.setup_checks()
def setup_checks(self):
@@ -74,6 +79,10 @@ def setup_checks(self):
self.checks.append(self.llm_check)
self.checks.append(self.inferred_check)
self.checks.append(self.get_performance_metric_check)
+ self.apply_checks = set(self.checks)
+ if self.is_endpoints:
+ self.apply_checks.remove(self.performance_sample_count_check)
+ self.apply_checks.remove(self.min_query_count_check)
def missing_check(self):
"""Ensure the performance log was provided.
@@ -200,14 +209,16 @@ def seeds_check(self):
sample_index_rng_seed = self.mlperf_log["effective_sample_index_rng_seed"]
schedule_rng_seed = self.mlperf_log["effective_schedule_rng_seed"]
is_valid = True
- if qsl_rng_seed != config_seeds["qsl_rng_seed"]:
- self.log.error(
- "%s qsl_rng_seed is wrong, expected=%s, found=%s",
- self.path,
- config_seeds["qsl_rng_seed"],
- qsl_rng_seed,
- )
- is_valid = False
+ if not self.is_endpoints:
+ # This seed does not exists for endpoints runs
+ if qsl_rng_seed != config_seeds["qsl_rng_seed"]:
+ self.log.error(
+ "%s qsl_rng_seed is wrong, expected=%s, found=%s",
+ self.path,
+ config_seeds["qsl_rng_seed"],
+ qsl_rng_seed,
+ )
+ is_valid = False
if sample_index_rng_seed != config_seeds["sample_index_rng_seed"]:
self.log.error(
"%s sample_index_rng_seed is wrong, expected=%s, found=%s",
@@ -237,7 +248,7 @@ def latency_check(self):
bool: True if latency constraints are satisfied, False otherwise.
"""
uses_early_stopping = self.config.uses_early_stopping(self.scenario)
- if uses_early_stopping:
+ if uses_early_stopping and not self.is_endpoints:
# check if early_stopping condition was met
if not self.mlperf_log["early_stopping_met"]:
early_stopping_result = self.mlperf_log["early_stopping_result"]
@@ -386,7 +397,9 @@ def network_check(self):
# (must include "Network SUT" in name)
if NETWORK_MODE_REQUIRED_SUBSTRING_IN_SUT_NAME not in sut_name:
self.log.error(
- f"{self.path} invalid sut name for network mode. expecting the substring '{NETWORK_MODE_REQUIRED_SUBSTRING_IN_SUT_NAME}' got '{sut_name}'"
+ f"{self.path} invalid sut name for network mode." +
+ f"expecting the substring '{NETWORK_MODE_REQUIRED_SUBSTRING_IN_SUT_NAME}'" +
+ f" got '{sut_name}'"
)
return False
@@ -403,6 +416,25 @@ def llm_check(self):
False otherwise.
"""
if self.model in self.config.get_llm_models():
+ if self.is_endpoints:
+ # Endpoints don't use the loadgen use_token_latencies flag;
+ # check TTFT/TPOT directly from the endpoints result JSON.
+ if self.scenario not in ["Server", "Interactive"]:
+ return True
+ limits = LLM_LATENCY_LIMITS[self.model][self.scenario]
+ ttft = self.mlperf_log["result_first_token_99.00_percentile_latency_ns"]
+ tpot = self.mlperf_log["result_time_per_output_token_99.00_percentile_ns"]
+ if ttft is None or tpot is None:
+ self.log.warning(
+ "%s TTFT or TPOT percentile data missing for endpoints LLM check",
+ self.path)
+ return True
+ if ttft < limits["ttft"] and tpot < limits["tpot"]:
+ return True
+ self.log.error(
+ 'Failed extra check for TTFT and TPOT. Obtained: TTFT 99-tile: %.4f, TPOT 99-tile: %.4f. Required: TTFT 99-tile: %.4f, TPOT 99-tile: %.4f',
+ ttft, tpot, limits["ttft"], limits["tpot"])
+ return False
if self.mlperf_log["requested_use_token_latencies"]:
if self.scenario not in ["Server", "Interactive"]:
# For offline, singlestream and multistream no further checks are
@@ -457,7 +489,7 @@ def inferred_check(self):
("singlestream", "offline")
]
if (self.scenario.lower(), self.scenario_fixed.lower()
- ) not in list_inferred:
+ ) not in list_inferred:
self.log.error(
"Result for scenario %s can not be inferred from %s for: %s",
self.scenario_fixed,
@@ -485,10 +517,15 @@ def get_performance_metric_check(self):
):
is_valid = True
scenario = self.mlperf_log["effective_scenario"]
+ if self.is_endpoints:
+ if scenario.lower() == "online":
+ scenario = "Server"
+ scenario = scenario.capitalize()
res = float(self.mlperf_log[RESULT_FIELD_NEW[version][scenario]])
if (
- version in RESULT_FIELD_BENCHMARK_OVERWRITE
+ not self.is_endpoints
+ and version in RESULT_FIELD_BENCHMARK_OVERWRITE
and self.model in RESULT_FIELD_BENCHMARK_OVERWRITE[version]
and scenario in RESULT_FIELD_BENCHMARK_OVERWRITE[version][self.model]
):
@@ -548,12 +585,12 @@ def get_inferred_result(self, res):
res = qps_wo_loadgen_overhead
if (scenario_fixed in ["Offline"]
- ) and scenario in ["MultiStream"]:
+ ) and scenario in ["MultiStream"]:
inferred = True
res = samples_per_query * S_TO_MS / (latency_mean / MS_TO_NS)
if (scenario_fixed in ["MultiStream"]
- ) and scenario in ["SingleStream"]:
+ ) and scenario in ["SingleStream"]:
inferred = True
# samples_per_query does not match with the one reported in the logs
# when inferring MultiStream from SingleStream
@@ -570,6 +607,6 @@ def get_inferred_result(self, res):
else:
res = (latency_99_percentile * samples_per_query) / MS_TO_NS
if (scenario_fixed in ["Interactive"]
- ) and scenario not in ["Server"]:
+ ) and scenario not in ["Server"]:
is_valid = False
return res, is_valid
diff --git a/tools/submission/submission_checker/checks/power_check.py b/tools/submission/submission_checker/checks/power_check.py
index d3519a3503..c8accfd6f2 100644
--- a/tools/submission/submission_checker/checks/power_check.py
+++ b/tools/submission/submission_checker/checks/power_check.py
@@ -68,6 +68,9 @@ def setup_checks(self):
self.checks.append(self.required_files_check)
self.checks.append(self.external_power_check)
self.checks.append(self.get_power_metric_check)
+ self.is_endpoints = self.submission_logs.loader_data.get(
+ "is_endpoints_submission", False)
+ self.apply_checks = set(self.checks)
def required_files_check(self):
"""Verify required files exist in power-related directories.
@@ -226,7 +229,7 @@ def get_power_metric_check(self):
samples_per_query = 8
if (self.scenario_fixed.lower() in ["multistream"]
- ) and scenario.lower() in ["singlestream"]:
+ ) and scenario.lower() in ["singlestream"]:
power_metric = (
avg_power * power_duration * samples_per_query * 1000 / num_queries
)
diff --git a/tools/submission/submission_checker/checks/system_check.py b/tools/submission/submission_checker/checks/system_check.py
index 54746c0408..49b030c399 100644
--- a/tools/submission/submission_checker/checks/system_check.py
+++ b/tools/submission/submission_checker/checks/system_check.py
@@ -42,6 +42,8 @@ def __init__(self, log, path, config: Config,
self.system_json = self.submission_logs.system_json
self.submitter = self.submission_logs.loader_data.get("submitter", "")
self.division = self.submission_logs.loader_data.get("division", "")
+ self.is_endpoints = self.submission_logs.loader_data.get(
+ "is_endpoints_submission", False)
self.config = config
self.setup_checks()
@@ -58,6 +60,7 @@ def setup_checks(self):
self.checks.append(self.required_fields_check)
self.checks.append(self.submitter_check)
self.checks.append(self.division_check)
+ self.apply_checks = set(self.checks)
def missing_check(self):
"""Ensure the system JSON file was provided.
diff --git a/tools/submission/submission_checker/constants.py b/tools/submission/submission_checker/constants.py
index 2f4abd87f8..a4137eabf3 100644
--- a/tools/submission/submission_checker/constants.py
+++ b/tools/submission/submission_checker/constants.py
@@ -1604,6 +1604,8 @@
"server": "Queries/s",
"interactive": "Queries/s",
}
+
+
POWER_UNIT_DICT = {
"SingleStream": "millijoules",
"MultiStream": "millijoules",
@@ -1633,6 +1635,13 @@
"default": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/performance/run_1/mlperf_log_summary.txt",
}
+PERFORMANCE_ENDPOINTS_DIR = {
+ "v5.0": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/performance/run_1/",
+ "v5.1": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/performance/run_1/",
+ "v6.0": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/performance/run_1/",
+ "default": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/performance/run_1/",
+}
+
ACCURACY_LOG_PATH = {
"v5.0": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/accuracy/mlperf_log_detail.txt",
"v5.1": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/accuracy/mlperf_log_detail.txt",
@@ -1654,6 +1663,14 @@
"default": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/accuracy/mlperf_log_accuracy.json",
}
+ACCURACY_ENDPOINTS_DIR = {
+ "v5.0": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/accuracy/",
+ "v5.1": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/accuracy/",
+ "v6.0": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/accuracy/",
+ "default": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/accuracy/",
+}
+
+
POWER_DIR_PATH = {
"v5.0": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/performance/power",
"v5.1": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/performance/power",
@@ -1748,3 +1765,93 @@
"v6.0": "{division}/{submitter}/src",
"default": "{division}/{submitter}/src",
}
+
+ENDPOINTS_MAPPINGS = {
+ "endpoints_version": "loadgen_version",
+ "endpoints_git_commit_date": "loadgen_git_commit_date",
+ "endpoints_git_commit_hash": "loadgen_git_commit_hash",
+ "test_datetime": "test_datetime",
+ "n_samples_issued": "qsl_reported_total_count",
+ "n_samples_from_dataset": "qsl_reported_performance_count",
+ "effective_scenario": "effective_scenario",
+ "mode": "effective_test_mode",
+ "streaming": "streaming",
+ "output_sequence_lengths.min": "min_output_tokens",
+ "output_sequence_lengths.max": "max_output_tokens",
+ "load_pattern": "load_pattern",
+ "min_duration_ms": "effective_min_duration_ms",
+ "max_duration_ms": "effective_max_duration_ms",
+ "effective_target_duration_ms": "effective_target_duration_ms",
+ "min_sample_count": "effective_min_query_count",
+ "effective_sample_index_rng_seed": "effective_sample_index_rng_seed",
+ "effective_schedule_rng_seed": "effective_schedule_rng_seed",
+ "effective_sample_concatenate_permutation": "effective_sample_concatenate_permutation",
+ "effective_samples_per_query": "effective_samples_per_query",
+ "generated_query_count": "generated_query_count",
+ "generated_query_duration": "generated_query_duration",
+ "target_qps": "effective_target_qps",
+ "result_scheduled_samples_per_sec": "result_scheduled_samples_per_sec",
+ "qps": "result_completed_samples_per_sec",
+ "effective_target_latency_ns": "effective_target_latency_ns",
+ "effective_target_latency_percentile": "effective_target_latency_percentile",
+ "latency.min": "result_min_latency_ns",
+ "latency.max": "result_max_latency_ns",
+ "latency.avg": "result_mean_latency_ns",
+ "latency.percentiles.50.0": "result_50.00_percentile_latency_ns",
+ "latency.percentiles.90.0": "result_90.00_percentile_latency_ns",
+ "latency.percentiles.95.0": "result_95.00_percentile_latency_ns",
+ "latency.percentiles.99.0": "result_99.00_percentile_latency_ns",
+ "latency.percentiles.99.9": "result_99.90_percentile_latency_ns",
+ "ttft.min": "result_first_token_min_latency_ns",
+ "ttft.max": "result_first_token_max_latency_ns",
+ "ttft.avg": "result_first_token_mean_latency_ns",
+ "ttft.percentiles.50.0": "result_first_token_50.00_percentile_latency_ns",
+ "ttft.percentiles.90.0": "result_first_token_90.00_percentile_latency_ns",
+ "ttft.percentiles.95.0": "result_first_token_95.00_percentile_latency_ns",
+ "ttft.percentiles.99.0": "result_first_token_99.00_percentile_latency_ns",
+ "ttft.percentiles.99.9": "result_first_token_99.90_percentile_latency_ns",
+ "tpot.percentiles.50.0": "result_time_per_output_token_50.00_percentile_ns",
+ "tpot.percentiles.90.0": "result_time_per_output_token_90.00_percentile_ns",
+ "tpot.percentiles.95.0": "result_time_per_output_token_95.00_percentile_ns",
+ "tpot.percentiles.99.0": "result_time_per_output_token_99.00_percentile_ns",
+ "tpot.percentiles.99.9": "result_time_per_output_token_99.90_percentile_ns",
+ "tpot.min": "result_time_per_output_token_min",
+ "tpot.max": "result_time_per_output_token_max",
+ "tpot.avg": "result_time_per_output_token_mean",
+ "tps": "result_completed_tokens_per_second",
+ "result.total": "result_query_count",
+ "result.failed": "num_errors",
+}
+
+
+# Maps endpoints field name (forwards.json key) to the dot-notation path
+# inside config.yaml
+ENDPOINTS_YAML_FIELD_MAP = {
+ "effective_scenario": "type",
+ "endpoints_version": "version",
+ "streaming": "model_params.streaming",
+ "load_pattern": "settings.load_pattern.type",
+ "min_duration_ms": "settings.runtime.min_duration_ms",
+ "max_duration_ms": "settings.runtime.max_duration_ms",
+ "effective_sample_index_rng_seed": "settings.runtime.dataloader_random_seed",
+ "effective_schedule_rng_seed": "settings.runtime.scheduler_random_seed",
+ "target_qps": "settings.load_pattern.target_qps",
+ "min_sample_count": "settings.runtime.n_samples_to_issue",
+}
+
+# Alternative JSON paths for endpoints keys that don't directly match the
+# JSON structure
+ENDPOINTS_JSON_ALT_PATHS = {
+ "result.total": "results.total",
+ "result.failed": "results.failed",
+ "qps": "results.qps",
+ "generated_query_count": "n_samples_issued",
+ "generated_query_duration": "duration_ns",
+ "test_datetime": "test_started_at",
+ "endpoints_git_commit_hash": "git_sha",
+ "n_samples_from_dataset": "n_samples_issued",
+}
+
+ENDPOINTS_INFERRED_FIELDS = {
+ "effective_accuracy_sample_count": "result_query_count"
+}
diff --git a/tools/submission/submission_checker/loader.py b/tools/submission/submission_checker/loader.py
index 79cfdce73a..2c8fa592d8 100644
--- a/tools/submission/submission_checker/loader.py
+++ b/tools/submission/submission_checker/loader.py
@@ -2,6 +2,7 @@
from .constants import *
from .utils import list_dir
from .parsers.loadgen_parser import LoadgenParser
+from .parsers.endpoints_parser import EndpointsParser
from typing import Generator, Literal
from .utils import *
from .configuration.configuration import Config
@@ -82,6 +83,12 @@ def __init__(self, root, version, config: Config) -> None:
self.acc_json_path = os.path.join(
self.root, ACCURACY_JSON_PATH.get(
version, ACCURACY_JSON_PATH["default"]))
+ self.perf_endpoints_dir = os.path.join(
+ self.root, PERFORMANCE_ENDPOINTS_DIR.get(
+ version, PERFORMANCE_ENDPOINTS_DIR["default"]))
+ self.acc_endpoints_dir = os.path.join(
+ self.root, ACCURACY_ENDPOINTS_DIR.get(
+ version, ACCURACY_ENDPOINTS_DIR["default"]))
self.system_log_path = os.path.join(
self.root, SYSTEM_PATH.get(
version, SYSTEM_PATH["default"]))
@@ -182,7 +189,7 @@ def load_single_log(self, path, log_type: Literal["Performance", "Accuracy",
accuracy results as line lists, etc.
Args:
- path (str): Filesystem path to the log file.
+ path (str or List[str]): Filesystem path to the log file.
log_type (str): Type of log to load, determining parsing method.
Returns:
@@ -214,6 +221,22 @@ def load_single_log(self, path, log_type: Literal["Performance", "Accuracy",
path)
return log
+ def load_endpoints_logs(self, perf_dir, acc_dir):
+ perf_log = None
+ acc_log = None
+ if os.path.exists(acc_dir) and os.path.exists(perf_dir):
+ acc_log = EndpointsParser(acc_dir)
+ perf_log = EndpointsParser(perf_dir)
+ elif os.path.exists(perf_dir):
+ acc_log = EndpointsParser(perf_dir)
+ perf_log = EndpointsParser(perf_dir)
+ else:
+ self.logger.info(
+ "Could not load endpoints log from %s, path does not exist",
+ perf_dir
+ )
+ return perf_log, acc_log
+
def check_scenarios(self, benchmark, model_mapping,
system_type, scenarios):
self.config.set_type(system_type)
@@ -294,6 +317,18 @@ def load(self) -> Generator[SubmissionLogs, None, None]:
system=system,
benchmark=benchmark,
scenario=scenario)
+ perf_endpoints_dir = self.perf_endpoints_dir.format(
+ division=division,
+ submitter=submitter,
+ system=system,
+ benchmark=benchmark,
+ scenario=scenario)
+ acc_endpoints_dir = self.acc_endpoints_dir.format(
+ division=division,
+ submitter=submitter,
+ system=system,
+ benchmark=benchmark,
+ scenario=scenario)
acc_result_path = self.acc_result_path.format(
division=division,
submitter=submitter,
@@ -388,7 +423,8 @@ def load(self) -> Generator[SubmissionLogs, None, None]:
src_path = self.src_path.format(
division=division, submitter=submitter)
- # Load logs
+ # Load logs loadgen
+ is_endpoints_submission = False
perf_log = self.load_single_log(
perf_path, "Performance")
acc_log = self.load_single_log(
@@ -399,6 +435,11 @@ def load(self) -> Generator[SubmissionLogs, None, None]:
acc_json_path, "AccuracyJSON")
measurements_json = self.load_single_log(
measurements_path, "Measurements")
+ if perf_log is None and acc_log is None:
+ is_endpoints_submission = True
+ perf_log, acc_log = self.load_endpoints_logs(
+ perf_endpoints_dir, acc_endpoints_dir
+ )
# Load test logs
test01_perf_log = self.load_single_log(
@@ -429,6 +470,7 @@ def load(self) -> Generator[SubmissionLogs, None, None]:
"system": system,
"benchmark": benchmark,
"scenario": scenario,
+ "is_endpoints_submission": is_endpoints_submission,
# Submission paths
"perf_path": perf_path,
"acc_path": acc_path,
diff --git a/tools/submission/submission_checker/parsers/endpoints_parser.py b/tools/submission/submission_checker/parsers/endpoints_parser.py
new file mode 100644
index 0000000000..9fc5ddfb7c
--- /dev/null
+++ b/tools/submission/submission_checker/parsers/endpoints_parser.py
@@ -0,0 +1,260 @@
+import json
+import logging
+import os
+import sys
+import yaml
+
+from .base import BaseParser
+from ..constants import (
+ ENDPOINTS_YAML_FIELD_MAP,
+ ENDPOINTS_JSON_ALT_PATHS,
+ ENDPOINTS_MAPPINGS,
+ ENDPOINTS_INFERRED_FIELDS,
+)
+
+_FIELDS_MAP_DIR = os.path.join(
+ os.path.dirname(__file__),
+ "..",
+ "helper",
+ "fields_map")
+_SAMPLE_LOGS_DIR = os.path.join(
+ os.path.dirname(__file__),
+ "..",
+ "helper",
+ "sample_logs")
+
+_RESULT_SUMMARY_FILE = "result_summary.json"
+_RESULTS_FILE = "results.json"
+_CONFIG_FILES = ("config.yaml", "config.yml")
+
+
+def _load_field_map(filename):
+ with open(os.path.join(_FIELDS_MAP_DIR, filename), "r", encoding="utf-8") as f:
+ return json.load(f)
+
+
+def _get_nested(data, dotted_key):
+ """Navigate a nested dict using a dot-notation key.
+
+ Uses a greedy left-to-right match so dotted numeric keys like '99.9' are
+ handled correctly: the longest matching key at each level wins.
+
+ Also handles float-formatted integer keys: '50.0' resolves to key '50'
+ (common in the ENDPOINTS_MAPPINGS percentile entries).
+ """
+ if not isinstance(data, dict):
+ return None
+ parts = dotted_key.split(".")
+ current = data
+ i = 0
+ while i < len(parts):
+ if not isinstance(current, dict):
+ # Trailing '.0' on a float-formatted integer key: treat as
+ # consumed.
+ if parts[i:] == ["0"] and not isinstance(current, (dict, list)):
+ return current
+ return None
+ found = False
+ for j in range(len(parts), i, -1):
+ candidate = ".".join(parts[i:j])
+ if candidate in current:
+ current = current[candidate]
+ i = j
+ found = True
+ break
+ if not found:
+ return None
+ if isinstance(current, (dict, list)) and not current:
+ return None
+ return current
+
+
+def _resolve_value(stripped, summary_data, results_data, yaml_data):
+ """Look up a field in three data sources in priority order.
+
+ Priority: result_summary.json > results.json > config.yaml
+ Within each JSON source, a direct dot-notation path is tried first,
+ then the alternative paths from ENDPOINTS_JSON_ALT_PATHS.
+ For the YAML source, the explicit path overrides in ENDPOINTS_YAML_FIELD_MAP
+ are tried first, then a direct dot-notation path.
+ """
+ for data in (summary_data, results_data):
+ value = _get_nested(data, stripped)
+ if value is None and stripped in ENDPOINTS_JSON_ALT_PATHS:
+ value = _get_nested(data, ENDPOINTS_JSON_ALT_PATHS[stripped])
+ if value is not None:
+ return value
+
+ # YAML: explicit path map first, then direct
+ if stripped in ENDPOINTS_YAML_FIELD_MAP:
+ value = _get_nested(yaml_data, ENDPOINTS_YAML_FIELD_MAP[stripped])
+ if value is not None:
+ return value
+ return _get_nested(yaml_data, stripped)
+
+
+class EndpointsParser(BaseParser):
+ def __init__(self, run_dir):
+ """
+ run_dir: path to the run directory containing:
+ - result_summary.json (highest priority)
+ - results.json
+ - config.yaml / config.yml (lowest priority)
+ """
+ super().__init__(run_dir)
+
+ self.logger = logging.getLogger("MLPerfLog")
+ self.messages = {}
+
+ summary_data = self._load_json(
+ os.path.join(run_dir, _RESULT_SUMMARY_FILE))
+ results_data = self._load_json(os.path.join(run_dir, _RESULTS_FILE))
+ yaml_data = self._load_yaml(run_dir)
+
+ for endpoints_key, loadgen_key in ENDPOINTS_MAPPINGS.items():
+ stripped = endpoints_key.strip()
+ value = _resolve_value(
+ stripped, summary_data, results_data, yaml_data)
+ if value is not None:
+ self.messages.setdefault(loadgen_key, []).append(
+ {"key": loadgen_key, "value": value}
+ )
+
+ self.keys = set(self.messages.keys())
+
+ # Inferred fields: copy the value of one loadgen key to another
+ for inferred_key, source_key in ENDPOINTS_INFERRED_FIELDS.items():
+ value = self[source_key]
+ if value is not None:
+ self.messages.setdefault(inferred_key, []).append(
+ {"key": inferred_key, "value": value}
+ )
+
+ # Infer QPS from count / duration when not directly available
+ duration_ns = self["generated_query_duration"]
+ count = self["generated_query_count"]
+ scenario = self["effective_scenario"]
+ if duration_ns and count and scenario:
+ qps_key = (
+ "result_samples_per_second"
+ if scenario.lower() == "offline"
+ else "result_completed_samples_per_sec"
+ )
+ if qps_key not in self.messages:
+ qps = count / (duration_ns / 1e9)
+ self.messages[qps_key] = [{"key": qps_key, "value": qps}]
+
+ # Expose accuracy scores stored in results.json
+ for result in results_data.get("accuracy_scores", {}).values():
+ score = result.get("score")
+ if score is not None:
+ self.messages.setdefault("accuracy_score", []).append(
+ {"key": "accuracy_score", "value": score}
+ )
+
+ self.keys = set(self.messages.keys())
+ self.logger.info("Successfully loaded endpoints log from %s.", run_dir)
+
+ def _load_json(self, path):
+ try:
+ with open(path, "r", encoding="utf-8") as f:
+ return json.load(f)
+ except BaseException:
+ self.logger.error("Could not load json file from %s", path)
+ return {}
+ return {}
+
+ def _load_yaml(self, run_dir):
+ for name in _CONFIG_FILES:
+ path = os.path.join(run_dir, name)
+ if os.path.exists(path):
+ try:
+ with open(path, "r", encoding="utf-8") as f:
+ return yaml.safe_load(f) or {}
+ except BaseException:
+ pass
+ self.logger.error("Yaml file not found in directory %s", run_dir)
+ return {}
+
+ def __getitem__(self, key):
+ if key not in self.keys:
+ return None
+ results = self.messages[key]
+ if len(results) > 1:
+ self.logger.warning(
+ "Multiple messages with key %s in the log. Empirically choosing the first one.",
+ key,
+ )
+ return results[0]["value"]
+
+ def get(self, key):
+ return self[key]
+
+ def get_messages(self):
+ return self.messages
+
+ def get_keys(self):
+ return self.keys
+
+ def num_errors(self):
+ return self["num_errors"]
+
+ def has_error(self):
+ return self.num_errors() != 0
+
+
+def main():
+ logging.basicConfig(
+ level=logging.INFO,
+ format="[%(asctime)s %(filename)s:%(lineno)d %(levelname)s] %(message)s",
+ )
+ logger = logging.getLogger("main")
+
+ backwards_map = _load_field_map("backwards.json")
+
+ # Collect all run directories (those containing at least one JSON and one
+ # YAML)
+ run_dirs = []
+ for root, _dirs, files in os.walk(_SAMPLE_LOGS_DIR):
+ has_json = any(f.endswith(".json") for f in files)
+ has_yaml = any(f.endswith(".yaml") or f.endswith(".yml")
+ for f in files)
+ if has_json and has_yaml:
+ run_dirs.append(root)
+
+ if not run_dirs:
+ logger.error("No run directories found under %s.", _SAMPLE_LOGS_DIR)
+ return 1
+
+ for run_dir in sorted(run_dirs):
+ folder = os.path.relpath(run_dir, _SAMPLE_LOGS_DIR)
+ print(f"\n{'=' * 70}")
+ print(f"Directory: {folder}")
+ print(f"{'=' * 70}")
+
+ parser = EndpointsParser(run_dir)
+
+ found = []
+ not_found = []
+ for loadgen_key, endpoints_key in backwards_map.items():
+ value = parser[loadgen_key]
+ if value is not None:
+ found.append((loadgen_key, endpoints_key, value))
+ else:
+ not_found.append((loadgen_key, endpoints_key))
+
+ total = len(backwards_map)
+ print(f"\nFound ({len(found)}/{total}):")
+ for loadgen_key, endpoints_key, value in found:
+ print(f" {loadgen_key:<55} = {value}")
+
+ print(f"\nNot found ({len(not_found)}/{total}):")
+ for loadgen_key, endpoints_key in not_found:
+ label = endpoints_key if endpoints_key != "None" else "(no endpoints mapping)"
+ print(f" {loadgen_key:<55} [{label}]")
+
+ return 0
+
+
+if __name__ == "__main__":
+ sys.exit(main())