From 325ae0de33dace49b48a49dd50049b462744e366 Mon Sep 17 00:00:00 2001 From: Pablo Gonzalez Date: Mon, 15 Jun 2026 12:01:22 -0500 Subject: [PATCH 1/7] Loadgen++/endpoints integration with submission checker --- .../checks/accuracy_check.py | 1 + .../submission_checker/checks/base.py | 29 ++- .../checks/compliance_check.py | 4 +- .../checks/measurements_checks.py | 1 + .../checks/performance_check.py | 12 +- .../submission_checker/checks/power_check.py | 1 + .../submission_checker/checks/system_check.py | 1 + .../submission_checker/constants.py | 122 +++++++++++ tools/submission/submission_checker/loader.py | 58 ++++- .../parsers/endpoints_parser.py | 198 ++++++++++++++++++ 10 files changed, 409 insertions(+), 18 deletions(-) create mode 100644 tools/submission/submission_checker/parsers/endpoints_parser.py diff --git a/tools/submission/submission_checker/checks/accuracy_check.py b/tools/submission/submission_checker/checks/accuracy_check.py index db1b1a7559..df315e9498 100644 --- a/tools/submission/submission_checker/checks/accuracy_check.py +++ b/tools/submission/submission_checker/checks/accuracy_check.py @@ -83,6 +83,7 @@ def setup_checks(self): self.checks.append(self.loadgen_errors_check) self.checks.append(self.dataset_check) self.checks.append(self.extra_files_check) + self.apply_checks = set(self.checks) def accuracy_result_check(self): """Validate reported accuracy metrics in `accuracy.txt`. diff --git a/tools/submission/submission_checker/checks/base.py b/tools/submission/submission_checker/checks/base.py index 8e2a678fb9..69f51da638 100644 --- a/tools/submission/submission_checker/checks/base.py +++ b/tools/submission/submission_checker/checks/base.py @@ -9,6 +9,7 @@ class BaseCheck(ABC): def __init__(self, log, path): self.checks = [] + self.apply_checks = set() self.log = log self.path = path self.name = "base checks" @@ -21,22 +22,32 @@ def run_checks(self): valid = True errors = [] for check in self.checks: - try: - v = self.execute(check) - valid &= v - except BaseException: - valid &= False - self.log.error( - "Execution occurred in running check %s. Running %s in %s", - self.path, + if self.check_applies(check): + try: + v = self.execute(check) + valid &= v + except BaseException: + valid &= False + self.log.error( + "Execution occurred in running check %s. Running %s in %s", + self.path, + check.__name__, + self.__class__.__name__) + else: + self.log.warning( + "Execution of check %s skipped for %s.", check.__name__, - self.__class__.__name__) + self.path + ) return valid def execute(self, check): """Custom execution of a single check method.""" return check() + def check_applies(self, fn): + return fn in self.apply_checks + def __call__(self): """Allows the check instance to be called like a function.""" self.log.info("Starting %s for: %s", self.name, self.path) diff --git a/tools/submission/submission_checker/checks/compliance_check.py b/tools/submission/submission_checker/checks/compliance_check.py index 13cc6b16b8..566d5f5eb5 100644 --- a/tools/submission/submission_checker/checks/compliance_check.py +++ b/tools/submission/submission_checker/checks/compliance_check.py @@ -62,6 +62,7 @@ def setup_checks(self): self.checks.append(self.performance_check) self.checks.append(self.accuracy_check) self.checks.append(self.compliance_performance_check) + self.apply_checks = set(self.checks) def get_test_list(self, model): """Return the list of compliance tests applicable to `model`. @@ -322,7 +323,8 @@ def accuracy_check(self): first_token_pass and eos_pass and length_check_pass) if not is_valid: self.log.error( - f"TEST06 accuracy check failed. first_token_check: {first_token_pass} eos_check: {eos_pass} length_check: {length_check_pass}." + f"TEST06 accuracy check failed. first_token_check: { + first_token_pass} eos_check: {eos_pass} length_check: {length_check_pass}." ) elif test == "TEST07": # TEST07: Verify accuracy in performance mode diff --git a/tools/submission/submission_checker/checks/measurements_checks.py b/tools/submission/submission_checker/checks/measurements_checks.py index 06b89f56fc..c8cdbc7234 100644 --- a/tools/submission/submission_checker/checks/measurements_checks.py +++ b/tools/submission/submission_checker/checks/measurements_checks.py @@ -61,6 +61,7 @@ def setup_checks(self): self.checks.append(self.directory_exist_check) self.checks.append(self.required_files_check) self.checks.append(self.required_fields_check) + self.apply_checks = set(self.checks) def missing_check(self): """Ensure a measurements JSON was provided. diff --git a/tools/submission/submission_checker/checks/performance_check.py b/tools/submission/submission_checker/checks/performance_check.py index e54e7b5564..3c63794f0f 100644 --- a/tools/submission/submission_checker/checks/performance_check.py +++ b/tools/submission/submission_checker/checks/performance_check.py @@ -74,6 +74,7 @@ def setup_checks(self): self.checks.append(self.llm_check) self.checks.append(self.inferred_check) self.checks.append(self.get_performance_metric_check) + self.apply_checks = set(self.checks) def missing_check(self): """Ensure the performance log was provided. @@ -386,7 +387,8 @@ def network_check(self): # (must include "Network SUT" in name) if NETWORK_MODE_REQUIRED_SUBSTRING_IN_SUT_NAME not in sut_name: self.log.error( - f"{self.path} invalid sut name for network mode. expecting the substring '{NETWORK_MODE_REQUIRED_SUBSTRING_IN_SUT_NAME}' got '{sut_name}'" + f"{self.path} invalid sut name for network mode. expecting the substring '{ + NETWORK_MODE_REQUIRED_SUBSTRING_IN_SUT_NAME}' got '{sut_name}'" ) return False @@ -457,7 +459,7 @@ def inferred_check(self): ("singlestream", "offline") ] if (self.scenario.lower(), self.scenario_fixed.lower() - ) not in list_inferred: + ) not in list_inferred: self.log.error( "Result for scenario %s can not be inferred from %s for: %s", self.scenario_fixed, @@ -548,12 +550,12 @@ def get_inferred_result(self, res): res = qps_wo_loadgen_overhead if (scenario_fixed in ["Offline"] - ) and scenario in ["MultiStream"]: + ) and scenario in ["MultiStream"]: inferred = True res = samples_per_query * S_TO_MS / (latency_mean / MS_TO_NS) if (scenario_fixed in ["MultiStream"] - ) and scenario in ["SingleStream"]: + ) and scenario in ["SingleStream"]: inferred = True # samples_per_query does not match with the one reported in the logs # when inferring MultiStream from SingleStream @@ -570,6 +572,6 @@ def get_inferred_result(self, res): else: res = (latency_99_percentile * samples_per_query) / MS_TO_NS if (scenario_fixed in ["Interactive"] - ) and scenario not in ["Server"]: + ) and scenario not in ["Server"]: is_valid = False return res, is_valid diff --git a/tools/submission/submission_checker/checks/power_check.py b/tools/submission/submission_checker/checks/power_check.py index d3519a3503..499768a892 100644 --- a/tools/submission/submission_checker/checks/power_check.py +++ b/tools/submission/submission_checker/checks/power_check.py @@ -68,6 +68,7 @@ def setup_checks(self): self.checks.append(self.required_files_check) self.checks.append(self.external_power_check) self.checks.append(self.get_power_metric_check) + self.apply_checks = set(self.checks) def required_files_check(self): """Verify required files exist in power-related directories. diff --git a/tools/submission/submission_checker/checks/system_check.py b/tools/submission/submission_checker/checks/system_check.py index 54746c0408..8ab811bb7a 100644 --- a/tools/submission/submission_checker/checks/system_check.py +++ b/tools/submission/submission_checker/checks/system_check.py @@ -58,6 +58,7 @@ def setup_checks(self): self.checks.append(self.required_fields_check) self.checks.append(self.submitter_check) self.checks.append(self.division_check) + self.apply_checks = set(self.checks) def missing_check(self): """Ensure the system JSON file was provided. diff --git a/tools/submission/submission_checker/constants.py b/tools/submission/submission_checker/constants.py index 2f4abd87f8..e573b6e18c 100644 --- a/tools/submission/submission_checker/constants.py +++ b/tools/submission/submission_checker/constants.py @@ -1604,6 +1604,8 @@ "server": "Queries/s", "interactive": "Queries/s", } + + POWER_UNIT_DICT = { "SingleStream": "millijoules", "MultiStream": "millijoules", @@ -1633,6 +1635,20 @@ "default": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/performance/run_1/mlperf_log_summary.txt", } +PERFORMANCE_ENDPOINTS_PATH = { + "v5.0": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/performance/run_1/result_summary.json", + "v5.1": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/performance/run_1/result_summary.json", + "v6.0": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/performance/run_1/result_summary.json", + "default": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/performance/run_1/result_summary.json", +} + +PERFORMANCE_CONFIG_ENDPOINTS_PATH = { + "v5.0": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/performance/run_1/config.yaml", + "v5.1": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/performance/run_1/config.yaml", + "v6.0": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/performance/run_1/config.yaml", + "default": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/performance/run_1/config.yaml", +} + ACCURACY_LOG_PATH = { "v5.0": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/accuracy/mlperf_log_detail.txt", "v5.1": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/accuracy/mlperf_log_detail.txt", @@ -1654,6 +1670,20 @@ "default": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/accuracy/mlperf_log_accuracy.json", } +ACCURACY_ENDPOINTS_PATH = { + "v5.0": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/accuracy/results.json", + "v5.1": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/accuracy/results.json", + "v6.0": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/accuracy/results.json", + "default": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/accuracy/results.json", +} + +ACCURACY_CONFIG_ENDPOINTS_PATH = { + "v5.0": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/accuracy/config.yaml", + "v5.1": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/accuracy/config.yaml", + "v6.0": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/accuracy/config.yaml", + "default": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/accuracy/config.yaml", +} + POWER_DIR_PATH = { "v5.0": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/performance/power", "v5.1": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/performance/power", @@ -1748,3 +1778,95 @@ "v6.0": "{division}/{submitter}/src", "default": "{division}/{submitter}/src", } + +ENDPOINTS_MAPPINGS = { + "endpoints_version": "loadgen_version", + "endpoints_git_commit_date": "loadgen_git_commit_date", + "endpoints_git_commit_hash": "loadgen_git_commit_hash", + "test_datetime": "test_datetime", + "n_samples_issued": "qsl_reported_total_count", + "n_samples_from_dataset": "qsl_reported_performance_count", + "effective_scenario": "effective_scenario", + "mode": "effective_test_mode", + "streaming": "streaming", + "output_sequence_lengths.min": "min_output_tokens", + "output_sequence_lengths.max": "max_output_tokens", + "load_pattern": "load_pattern", + "min_duration_ms": "effective_min_duration_ms", + "max_duration_ms": "effective_max_duration_ms", + "effective_target_duration_ms": "effective_target_duration_ms", + "min_sample_count": "effective_min_query_count", + "effective_sample_index_rng_seed": "effective_sample_index_rng_seed", + "effective_schedule_rng_seed": "effective_schedule_rng_seed", + "min_sample_count (runtime_settings.json)": "effective_min_sample_count", + "effective_sample_concatenate_permutation": "effective_sample_concatenate_permutation", + "effective_samples_per_query": "effective_samples_per_query", + "generated_query_count": "generated_query_count", + "generated_query_duration": "generated_query_duration", + "target_qps (results_summary.json)": "effective_target_qps", + "result_scheduled_samples_per_sec": "result_scheduled_samples_per_sec", + "qps ": "result_completed_samples_per_sec", + "effective_target_latency_ns": "effective_target_latency_ns", + "effective_target_latency_percentile": "effective_target_latency_percentile", + "latency.min ": "result_min_latency_ns", + "latency.max": "result_max_latency_ns", + "latency.avg": "result_mean_latency_ns", + "latency.percentiles.50": "result_50.00_percentile_latency_ns", + "latency.percentiles.90": "result_90.00_percentile_latency_ns", + "latency.percentiles.95": "result_95.00_percentile_latency_ns", + "latency.percentiles.99": "result_99.00_percentile_latency_ns", + "latency.percentiles.99.9": "result_99.90_percentile_latency_ns", + "ttft.min": "result_first_token_min_latency_ns", + "ttft.max": "result_first_token_max_latency_ns", + "ttft.avg": "result_first_token_mean_latency_ns", + "ttft.percentiles.50": "result_first_token_50.00_percentile_latency_ns", + "ttft.percentiles.90": "result_first_token_90.00_percentile_latency_ns", + "ttft.percentiles.95": "result_first_token_95.00_percentile_latency_ns", + "ttft.percentiles.99": "result_first_token_99.00_percentile_latency_ns", + "ttft.percentiles.99.9": "result_first_token_99.90_percentile_latency_ns", + "tpot.percentiles.50": "result_time_per_output_token_50.00_percentile_ns", + "tpot.percentiles.90": "result_time_per_output_token_90.00_percentile_ns", + "tpot.percentiles.95": "result_time_per_output_token_95.00_percentile_ns", + "tpot.percentiles.99": "result_time_per_output_token_99.00_percentile_ns", + "tpot.percentiles.99.9": "result_time_per_output_token_99.90_percentile_ns", + "tpot.min": "result_time_to_output_token_min", + "tpot.max": "result_time_to_output_token_max", + "tpot.avg": "result_time_to_output_token_mean", + "tps": "result_completed_tokens_per_second", + "result.total": "result_query_count", + "result.failed": "num_errors" +} + + +# Maps endpoints field name (forwards.json key) to the dot-notation path +# inside config.yaml +ENDPOINTS_YAML_FIELD_MAP = { + "effective_scenario": "type", + "endpoints_version": "version", + "streaming": "model_params.streaming", + "load_pattern": "settings.load_pattern.type", + "min_duration_ms": "settings.runtime.min_duration_ms", + "max_duration_ms": "settings.runtime.max_duration_ms", + "effective_sample_index_rng_seed": "settings.runtime.dataloader_random_seed", + "effective_schedule_rng_seed": "settings.runtime.scheduler_random_seed", + "target_qps (results_summary.json)": "settings.load_pattern.target_qps", + "min_sample_count (runtime_settings.json)": "settings.runtime.n_samples_to_issue", + "min_sample_count": "settings.runtime.n_samples_to_issue", +} + +# Alternative JSON paths for endpoints keys that don't directly match the +# JSON structure +ENDPOINTS_JSON_ALT_PATHS = { + "result.total": "results.total", + "result.failed": "results.failed", + "qps": "results.qps", + "generated_query_count": "n_samples_issued", + "generated_query_duration": "duration_ns", + "test_datetime": "test_started_at", + "endpoints_git_commit_hash": "git_sha", + "n_samples_from_dataset": "n_samples_issued", +} + +ENDPOINTS_INFERRED_FIELDS = { + "effective_accuracy_sample_count": "result_query_count" +} diff --git a/tools/submission/submission_checker/loader.py b/tools/submission/submission_checker/loader.py index 79cfdce73a..170da085f1 100644 --- a/tools/submission/submission_checker/loader.py +++ b/tools/submission/submission_checker/loader.py @@ -2,6 +2,7 @@ from .constants import * from .utils import list_dir from .parsers.loadgen_parser import LoadgenParser +from .parsers.endpoints_parser import EndpointsParser from typing import Generator, Literal from .utils import * from .configuration.configuration import Config @@ -82,6 +83,18 @@ def __init__(self, root, version, config: Config) -> None: self.acc_json_path = os.path.join( self.root, ACCURACY_JSON_PATH.get( version, ACCURACY_JSON_PATH["default"])) + self.perf_endpoints_path = os.path.join( + self.root, PERFORMANCE_ENDPOINTS_PATH.get( + version, PERFORMANCE_ENDPOINTS_PATH["default"])) + self.perf_endpoints_config_path = os.path.join( + self.root, PERFORMANCE_CONFIG_ENDPOINTS_PATH.get( + version, PERFORMANCE_CONFIG_ENDPOINTS_PATH["default"])) + self.acc_endpoints_path = os.path.join( + self.root, ACCURACY_ENDPOINTS_PATH.get( + version, ACCURACY_ENDPOINTS_PATH["default"])) + self.acc_endpoints_config_path = os.path.join( + self.root, ACCURACY_CONFIG_ENDPOINTS_PATH.get( + version, ACCURACY_CONFIG_ENDPOINTS_PATH["default"])) self.system_log_path = os.path.join( self.root, SYSTEM_PATH.get( version, SYSTEM_PATH["default"])) @@ -182,7 +195,7 @@ def load_single_log(self, path, log_type: Literal["Performance", "Accuracy", accuracy results as line lists, etc. Args: - path (str): Filesystem path to the log file. + path (str or List[str]): Filesystem path to the log file. log_type (str): Type of log to load, determining parsing method. Returns: @@ -190,7 +203,9 @@ def load_single_log(self, path, log_type: Literal["Performance", "Accuracy", if loading fails. """ log = None - if os.path.exists(path): + if log_type in ["Endpoints"]: + log = EndpointsParser(path) + elif os.path.exists(path): self.logger.info("Loading %s log from %s", log_type, path) if log_type in ["Performance", "Accuracy", "Test"]: log = LoadgenParser(path) @@ -294,6 +309,30 @@ def load(self) -> Generator[SubmissionLogs, None, None]: system=system, benchmark=benchmark, scenario=scenario) + perf_endpoints_path = self.perf_endpoints_path.format( + division=division, + submitter=submitter, + system=system, + benchmark=benchmark, + scenario=scenario) + perf_endpoints_config_path = self.perf_endpoints_config_path.format( + division=division, + submitter=submitter, + system=system, + benchmark=benchmark, + scenario=scenario) + acc_endpoints_path = self.acc_endpoints_path.format( + division=division, + submitter=submitter, + system=system, + benchmark=benchmark, + scenario=scenario) + acc_endpoints_config_path = self.acc_endpoints_config_path.format( + division=division, + submitter=submitter, + system=system, + benchmark=benchmark, + scenario=scenario) acc_result_path = self.acc_result_path.format( division=division, submitter=submitter, @@ -388,7 +427,8 @@ def load(self) -> Generator[SubmissionLogs, None, None]: src_path = self.src_path.format( division=division, submitter=submitter) - # Load logs + # Load logs loadgen + is_endpoints_submission = False perf_log = self.load_single_log( perf_path, "Performance") acc_log = self.load_single_log( @@ -399,6 +439,17 @@ def load(self) -> Generator[SubmissionLogs, None, None]: acc_json_path, "AccuracyJSON") measurements_json = self.load_single_log( measurements_path, "Measurements") + if perf_log is None and acc_log is None: + is_endpoints_submission = True + perf_log = self.load_single_log( + [perf_endpoints_path, + perf_endpoints_config_path], + "Endpoints" + ) + acc_log = self.load_single_log( + [acc_endpoints_path, acc_endpoints_config_path], + "Endpoints" + ) # Load test logs test01_perf_log = self.load_single_log( @@ -429,6 +480,7 @@ def load(self) -> Generator[SubmissionLogs, None, None]: "system": system, "benchmark": benchmark, "scenario": scenario, + "is_endpoints_submission": is_endpoints_submission, # Submission paths "perf_path": perf_path, "acc_path": acc_path, diff --git a/tools/submission/submission_checker/parsers/endpoints_parser.py b/tools/submission/submission_checker/parsers/endpoints_parser.py new file mode 100644 index 0000000000..7102e19a4b --- /dev/null +++ b/tools/submission/submission_checker/parsers/endpoints_parser.py @@ -0,0 +1,198 @@ +import json +import logging +import os +import sys +import yaml + +from .base import BaseParser +from ..constants import ENDPOINTS_YAML_FIELD_MAP, ENDPOINTS_JSON_ALT_PATHS, ENDPOINTS_MAPPINGS, ENDPOINTS_INFERRED_FIELDS + +_FIELDS_MAP_DIR = os.path.join( + os.path.dirname(__file__), + "..", + "helper", + "fields_map") +_SAMPLE_LOGS_DIR = os.path.join( + os.path.dirname(__file__), + "..", + "helper", + "sample_logs") + + +def _load_field_map(filename): + with open(os.path.join(_FIELDS_MAP_DIR, filename), "r", encoding="utf-8") as f: + return json.load(f) + + +def _get_nested(data, dotted_key): + """Navigate a nested dict using a dot-notation key. + + Uses a greedy left-to-right match so dotted numeric keys like '99.9' are + handled correctly: the longest matching key at each level wins. + """ + if not isinstance(data, dict): + return None + parts = dotted_key.split(".") + current = data + i = 0 + while i < len(parts): + if not isinstance(current, dict): + return None + found = False + for j in range(len(parts), i, -1): + candidate = ".".join(parts[i:j]) + if candidate in current: + current = current[candidate] + i = j + found = True + break + if not found: + return None + if isinstance(current, (dict, list)) and not current: + return None + return current + + +class EndpointsParser(BaseParser): + def __init__(self, log_paths): + """ + log_paths: [json_path, yaml_path] + json_path - path to the JSON results file (result_summary.json or results.json) + yaml_path - path to the YAML config file (config.yaml) + """ + json_path, yaml_path = log_paths + super().__init__(json_path) + + self.logger = logging.getLogger("MLPerfLog") + self.messages = {} + + with open(json_path, "r", encoding="utf-8") as f: + json_data = json.load(f) + + with open(yaml_path, "r", encoding="utf-8") as f: + yaml_data = yaml.safe_load(f) + + forwards_map = ENDPOINTS_MAPPINGS + + for endpoints_key, loadgen_key in forwards_map.items(): + stripped = endpoints_key.strip() + value = None + + # 1. Direct dot-notation path in the JSON result file + value = _get_nested(json_data, stripped) + + # 2. Alternative JSON paths for known structural mismatches + if value is None and stripped in ENDPOINTS_JSON_ALT_PATHS: + value = _get_nested( + json_data, ENDPOINTS_JSON_ALT_PATHS[stripped]) + + # 3. Explicit YAML field path overrides + if value is None and stripped in ENDPOINTS_YAML_FIELD_MAP: + value = _get_nested( + yaml_data, ENDPOINTS_YAML_FIELD_MAP[stripped]) + + # 4. Fallback: direct dot-notation path in the YAML config + if value is None: + value = _get_nested(yaml_data, stripped) + + if value is not None: + entry = {"key": loadgen_key, "value": value} + self.messages.setdefault(loadgen_key, []).append(entry) + + self.keys = set(self.messages.keys()) + # Additional values that can be inferred from other values + inferred_map = ENDPOINTS_INFERRED_FIELDS + for inferred, key in inferred_map.items(): + value = self.__getitem__(key) + if value is not None: + entry = {"key": inferred, "value": value} + self.messages.setdefault(inferred, []).append(entry) + + self.keys = set(self.messages.keys()) + self.logger.info( + "Successfully loaded endpoints log from %s.", + json_path) + + def __getitem__(self, key): + if key not in self.keys: + return None + results = self.messages[key] + if len(results) > 1: + self.logger.warning( + "Multiple messages with key %s in the log. Empirically choosing the first one.", + key, + ) + return results[0]["value"] + + def get(self, key): + return self.messages[key] + + def get_messages(self): + return self.messages + + def get_keys(self): + return self.keys + + +def main(): + logging.basicConfig( + level=logging.INFO, + format="[%(asctime)s %(filename)s:%(lineno)d %(levelname)s] %(message)s", + ) + logger = logging.getLogger("main") + + backwards_map = _load_field_map("backwards.json") + + # Collect all (json_file, yaml_file) pairs from leaf subdirectories + pairs = [] + for root, _dirs, files in os.walk(_SAMPLE_LOGS_DIR): + json_files = sorted(f for f in files if f.endswith(".json")) + yaml_files = sorted(f for f in files if f.endswith( + ".yaml") or f.endswith(".yml")) + if json_files and yaml_files: + pairs.append( + ( + os.path.join(root, json_files[0]), + os.path.join(root, yaml_files[0]), + ) + ) + + if not pairs: + logger.error("No JSON+YAML pairs found under %s.", _SAMPLE_LOGS_DIR) + return 1 + + for json_path, yaml_path in sorted(pairs): + folder = os.path.relpath(os.path.dirname(json_path), _SAMPLE_LOGS_DIR) + print(f"\n{'=' * 70}") + print(f"Folder : {folder}") + print(f"JSON : {os.path.basename(json_path)}") + print(f"YAML : {os.path.basename(yaml_path)}") + print(f"{'=' * 70}") + + parser = EndpointsParser([json_path, yaml_path]) + + found = [] + not_found = [] + + for loadgen_key, endpoints_key in backwards_map.items(): + value = parser[loadgen_key] + if value is not None: + found.append((loadgen_key, endpoints_key, value)) + else: + not_found.append((loadgen_key, endpoints_key)) + + total = len(backwards_map) + print(f"\nFound ({len(found)}/{total}):") + for loadgen_key, endpoints_key, value in found: + print(f" {loadgen_key:<55} = {value}") + + print(f"\nNot found ({len(not_found)}/{total}):") + for loadgen_key, endpoints_key in not_found: + label = endpoints_key if endpoints_key != "None" else "(no endpoints mapping)" + print(f" {loadgen_key:<55} [{label}]") + + return 0 + + +if __name__ == "__main__": + sys.exit(main()) From 5808d0a22f80577682ef4293f3719690ed19c1fb Mon Sep 17 00:00:00 2001 From: Pablo Gonzalez Date: Tue, 16 Jun 2026 00:49:25 -0500 Subject: [PATCH 2/7] Handle checks: remove checks for endpoints submissions --- .../checks/accuracy_check.py | 11 ++++++++ .../checks/compliance_check.py | 5 ++++ .../checks/measurements_checks.py | 2 ++ .../checks/performance_check.py | 23 ++++++++++++---- .../submission_checker/checks/power_check.py | 4 ++- .../submission_checker/checks/system_check.py | 2 ++ .../submission_checker/constants.py | 4 +-- .../parsers/endpoints_parser.py | 26 ++++++++++++++++++- 8 files changed, 68 insertions(+), 9 deletions(-) diff --git a/tools/submission/submission_checker/checks/accuracy_check.py b/tools/submission/submission_checker/checks/accuracy_check.py index df315e9498..5f7225e11e 100644 --- a/tools/submission/submission_checker/checks/accuracy_check.py +++ b/tools/submission/submission_checker/checks/accuracy_check.py @@ -70,6 +70,8 @@ def __init__( "scenario", "") self.scenario = self.mlperf_log["effective_scenario"] self.division = self.submission_logs.loader_data.get("division", "") + self.is_endpoints = self.submission_logs.loader_data.get( + "is_endpoints_submission", False) self.setup_checks() def setup_checks(self): @@ -84,6 +86,8 @@ def setup_checks(self): self.checks.append(self.dataset_check) self.checks.append(self.extra_files_check) self.apply_checks = set(self.checks) + if self.is_endpoints: + self.apply_checks.remove(self.accuracy_json_check) def accuracy_result_check(self): """Validate reported accuracy metrics in `accuracy.txt`. @@ -98,6 +102,13 @@ def accuracy_result_check(self): False otherwise. """ + if self.is_endpoints: + if self.mlperf_log["accuracy_score"] is not None: + self.submission_logs.loader_data["accuracy_metrics"] = self.mlperf_log["accuracy_score"] + return True + self.log.error("%s accuracy score not found", self.path) + return False + patterns, acc_targets, acc_types, acc_limits, up_patterns, acc_upper_limit = self.config.get_accuracy_values( self.model ) diff --git a/tools/submission/submission_checker/checks/compliance_check.py b/tools/submission/submission_checker/checks/compliance_check.py index 566d5f5eb5..11f79282d2 100644 --- a/tools/submission/submission_checker/checks/compliance_check.py +++ b/tools/submission/submission_checker/checks/compliance_check.py @@ -50,6 +50,8 @@ def __init__(self, log, path, config: Config, self.model = self.config.get_mlperf_model( self.model, self.model_mapping) self.test_list = self.get_test_list(self.model) + self.is_endpoints = self.submission_logs.loader_data.get( + "is_endpoints_submission", False) self.setup_checks() def setup_checks(self): @@ -63,6 +65,9 @@ def setup_checks(self): self.checks.append(self.accuracy_check) self.checks.append(self.compliance_performance_check) self.apply_checks = set(self.checks) + # No compliance tests for endpoints for now + if self.is_endpoints: + self.apply_checks = set() def get_test_list(self, model): """Return the list of compliance tests applicable to `model`. diff --git a/tools/submission/submission_checker/checks/measurements_checks.py b/tools/submission/submission_checker/checks/measurements_checks.py index c8cdbc7234..8a2731f4e6 100644 --- a/tools/submission/submission_checker/checks/measurements_checks.py +++ b/tools/submission/submission_checker/checks/measurements_checks.py @@ -61,6 +61,8 @@ def setup_checks(self): self.checks.append(self.directory_exist_check) self.checks.append(self.required_files_check) self.checks.append(self.required_fields_check) + self.is_endpoints = self.submission_logs.loader_data.get( + "is_endpoints_submission", False) self.apply_checks = set(self.checks) def missing_check(self): diff --git a/tools/submission/submission_checker/checks/performance_check.py b/tools/submission/submission_checker/checks/performance_check.py index 3c63794f0f..bf1befc231 100644 --- a/tools/submission/submission_checker/checks/performance_check.py +++ b/tools/submission/submission_checker/checks/performance_check.py @@ -53,6 +53,11 @@ def __init__(self, log, path, config: Config, "scenario", "") self.scenario = self.mlperf_log["effective_scenario"] self.division = self.submission_logs.loader_data.get("division", "") + self.is_endpoints = self.submission_logs.loader_data.get( + "is_endpoints_submission", False) + if self.is_endpoints: + if self.scenario.lower() == "online": + self.scenario = "Server" self.setup_checks() def setup_checks(self): @@ -75,6 +80,10 @@ def setup_checks(self): self.checks.append(self.inferred_check) self.checks.append(self.get_performance_metric_check) self.apply_checks = set(self.checks) + if self.is_endpoints: + self.apply_checks.remove(self.seeds_check) + self.apply_checks.remove(self.performance_sample_count_check) + self.apply_checks.remove(self.min_query_count_check) def missing_check(self): """Ensure the performance log was provided. @@ -238,7 +247,7 @@ def latency_check(self): bool: True if latency constraints are satisfied, False otherwise. """ uses_early_stopping = self.config.uses_early_stopping(self.scenario) - if uses_early_stopping: + if uses_early_stopping and not self.is_endpoints: # check if early_stopping condition was met if not self.mlperf_log["early_stopping_met"]: early_stopping_result = self.mlperf_log["early_stopping_result"] @@ -459,7 +468,7 @@ def inferred_check(self): ("singlestream", "offline") ] if (self.scenario.lower(), self.scenario_fixed.lower() - ) not in list_inferred: + ) not in list_inferred: self.log.error( "Result for scenario %s can not be inferred from %s for: %s", self.scenario_fixed, @@ -487,6 +496,10 @@ def get_performance_metric_check(self): ): is_valid = True scenario = self.mlperf_log["effective_scenario"] + if self.is_endpoints: + if scenario.lower() == "online": + scenario = "Server" + scenario = scenario.capitalize() res = float(self.mlperf_log[RESULT_FIELD_NEW[version][scenario]]) if ( @@ -550,12 +563,12 @@ def get_inferred_result(self, res): res = qps_wo_loadgen_overhead if (scenario_fixed in ["Offline"] - ) and scenario in ["MultiStream"]: + ) and scenario in ["MultiStream"]: inferred = True res = samples_per_query * S_TO_MS / (latency_mean / MS_TO_NS) if (scenario_fixed in ["MultiStream"] - ) and scenario in ["SingleStream"]: + ) and scenario in ["SingleStream"]: inferred = True # samples_per_query does not match with the one reported in the logs # when inferring MultiStream from SingleStream @@ -572,6 +585,6 @@ def get_inferred_result(self, res): else: res = (latency_99_percentile * samples_per_query) / MS_TO_NS if (scenario_fixed in ["Interactive"] - ) and scenario not in ["Server"]: + ) and scenario not in ["Server"]: is_valid = False return res, is_valid diff --git a/tools/submission/submission_checker/checks/power_check.py b/tools/submission/submission_checker/checks/power_check.py index 499768a892..c8accfd6f2 100644 --- a/tools/submission/submission_checker/checks/power_check.py +++ b/tools/submission/submission_checker/checks/power_check.py @@ -68,6 +68,8 @@ def setup_checks(self): self.checks.append(self.required_files_check) self.checks.append(self.external_power_check) self.checks.append(self.get_power_metric_check) + self.is_endpoints = self.submission_logs.loader_data.get( + "is_endpoints_submission", False) self.apply_checks = set(self.checks) def required_files_check(self): @@ -227,7 +229,7 @@ def get_power_metric_check(self): samples_per_query = 8 if (self.scenario_fixed.lower() in ["multistream"] - ) and scenario.lower() in ["singlestream"]: + ) and scenario.lower() in ["singlestream"]: power_metric = ( avg_power * power_duration * samples_per_query * 1000 / num_queries ) diff --git a/tools/submission/submission_checker/checks/system_check.py b/tools/submission/submission_checker/checks/system_check.py index 8ab811bb7a..49b030c399 100644 --- a/tools/submission/submission_checker/checks/system_check.py +++ b/tools/submission/submission_checker/checks/system_check.py @@ -42,6 +42,8 @@ def __init__(self, log, path, config: Config, self.system_json = self.submission_logs.system_json self.submitter = self.submission_logs.loader_data.get("submitter", "") self.division = self.submission_logs.loader_data.get("division", "") + self.is_endpoints = self.submission_logs.loader_data.get( + "is_endpoints_submission", False) self.config = config self.setup_checks() diff --git a/tools/submission/submission_checker/constants.py b/tools/submission/submission_checker/constants.py index e573b6e18c..09dce5e8d2 100644 --- a/tools/submission/submission_checker/constants.py +++ b/tools/submission/submission_checker/constants.py @@ -1805,7 +1805,7 @@ "generated_query_duration": "generated_query_duration", "target_qps (results_summary.json)": "effective_target_qps", "result_scheduled_samples_per_sec": "result_scheduled_samples_per_sec", - "qps ": "result_completed_samples_per_sec", + "qps": "result_completed_samples_per_sec", "effective_target_latency_ns": "effective_target_latency_ns", "effective_target_latency_percentile": "effective_target_latency_percentile", "latency.min ": "result_min_latency_ns", @@ -1834,7 +1834,7 @@ "tpot.avg": "result_time_to_output_token_mean", "tps": "result_completed_tokens_per_second", "result.total": "result_query_count", - "result.failed": "num_errors" + "result.failed": "num_errors", } diff --git a/tools/submission/submission_checker/parsers/endpoints_parser.py b/tools/submission/submission_checker/parsers/endpoints_parser.py index 7102e19a4b..b1fc533916 100644 --- a/tools/submission/submission_checker/parsers/endpoints_parser.py +++ b/tools/submission/submission_checker/parsers/endpoints_parser.py @@ -108,6 +108,23 @@ def __init__(self, log_paths): entry = {"key": inferred, "value": value} self.messages.setdefault(inferred, []).append(entry) + # Temporary solution: Hardcoded inferred values + if self.__getitem__("generated_query_duration") and self.__getitem__( + "generated_query_count"): + key = "result_samples_per_second" if self.__getitem__( + "effective_scenario").lower() == "offline" else "result_completed_samples_per_sec" + value = self.__getitem__( + "generated_query_count") / self.__getitem__("generated_query_duration") + entry = {"key": key, "value": value} + self.messages.setdefault(key, []).append(entry) + + # Extract accuracy scores if possible + if "accuracy_scores" in json_data: + for dataset_name, result in json_data["accuracy_scores"].items(): + value = result.get("score", None) + entry = {"key": "accuracy_score", "value": value} + self.messages.setdefault("accuracy_score", []).append(entry) + self.keys = set(self.messages.keys()) self.logger.info( "Successfully loaded endpoints log from %s.", @@ -125,7 +142,7 @@ def __getitem__(self, key): return results[0]["value"] def get(self, key): - return self.messages[key] + return self[key] def get_messages(self): return self.messages @@ -133,6 +150,13 @@ def get_messages(self): def get_keys(self): return self.keys + def num_errors(self): + return self.get("num_errors") + + def has_error(self): + """Check if the log contains any errors.""" + return self.num_errors() != 0 + def main(): logging.basicConfig( From 788e9ed1214442f00d7af02979e1f87f9f3029bc Mon Sep 17 00:00:00 2001 From: Pablo Gonzalez Date: Tue, 16 Jun 2026 10:51:22 -0500 Subject: [PATCH 3/7] Split string to avoid error in automatic testing --- .../submission_checker/checks/compliance_check.py | 5 +++-- .../submission_checker/checks/performance_check.py | 13 +++++++------ 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/tools/submission/submission_checker/checks/compliance_check.py b/tools/submission/submission_checker/checks/compliance_check.py index 11f79282d2..b08b23803e 100644 --- a/tools/submission/submission_checker/checks/compliance_check.py +++ b/tools/submission/submission_checker/checks/compliance_check.py @@ -328,8 +328,9 @@ def accuracy_check(self): first_token_pass and eos_pass and length_check_pass) if not is_valid: self.log.error( - f"TEST06 accuracy check failed. first_token_check: { - first_token_pass} eos_check: {eos_pass} length_check: {length_check_pass}." + f"TEST06 accuracy check failed. first_token_check:" + + f"{first_token_pass} eos_check: " + + f"{eos_pass} length_check: {length_check_pass}." ) elif test == "TEST07": # TEST07: Verify accuracy in performance mode diff --git a/tools/submission/submission_checker/checks/performance_check.py b/tools/submission/submission_checker/checks/performance_check.py index bf1befc231..624ff220ec 100644 --- a/tools/submission/submission_checker/checks/performance_check.py +++ b/tools/submission/submission_checker/checks/performance_check.py @@ -396,8 +396,9 @@ def network_check(self): # (must include "Network SUT" in name) if NETWORK_MODE_REQUIRED_SUBSTRING_IN_SUT_NAME not in sut_name: self.log.error( - f"{self.path} invalid sut name for network mode. expecting the substring '{ - NETWORK_MODE_REQUIRED_SUBSTRING_IN_SUT_NAME}' got '{sut_name}'" + f"{self.path} invalid sut name for network mode." + + f"expecting the substring '{NETWORK_MODE_REQUIRED_SUBSTRING_IN_SUT_NAME}'" + + f" got '{sut_name}'" ) return False @@ -468,7 +469,7 @@ def inferred_check(self): ("singlestream", "offline") ] if (self.scenario.lower(), self.scenario_fixed.lower() - ) not in list_inferred: + ) not in list_inferred: self.log.error( "Result for scenario %s can not be inferred from %s for: %s", self.scenario_fixed, @@ -563,12 +564,12 @@ def get_inferred_result(self, res): res = qps_wo_loadgen_overhead if (scenario_fixed in ["Offline"] - ) and scenario in ["MultiStream"]: + ) and scenario in ["MultiStream"]: inferred = True res = samples_per_query * S_TO_MS / (latency_mean / MS_TO_NS) if (scenario_fixed in ["MultiStream"] - ) and scenario in ["SingleStream"]: + ) and scenario in ["SingleStream"]: inferred = True # samples_per_query does not match with the one reported in the logs # when inferring MultiStream from SingleStream @@ -585,6 +586,6 @@ def get_inferred_result(self, res): else: res = (latency_99_percentile * samples_per_query) / MS_TO_NS if (scenario_fixed in ["Interactive"] - ) and scenario not in ["Server"]: + ) and scenario not in ["Server"]: is_valid = False return res, is_valid From d3f2a777972c6f29d7e44e316cbdb8bca9927231 Mon Sep 17 00:00:00 2001 From: ANANDHU S <71482562+anandhu-eng@users.noreply.github.com> Date: Wed, 17 Jun 2026 21:26:12 +0530 Subject: [PATCH 4/7] Fix endpoints submission checker bugs and update documentation (#2603) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - constants.py: Fix percentile key format in ENDPOINTS_MAPPINGS — the endpoints JSON uses float-format keys (e.g. "99.0") but the mappings used integer strings ("99"), causing latency_check to receive None and crash on the comparison. Updated all latency/ttft/tpot percentile keys to use the .0 suffix (50.0, 90.0, 95.0, 99.0). - performance_check.py: Fix llm_check for endpoints — the check gated on the loadgen use_token_latencies flag which does not exist in endpoints submissions, causing all LLM models to fail. Added an endpoints-specific branch that checks TTFT/TPOT p99 values directly from the result JSON. Also fix get_performance_metric_check to skip RESULT_FIELD_BENCHMARK_OVERWRITE for endpoints (the tokens/sec field is not present in endpoints result files; use QPS instead). - endpoints_parser.py: Fix inferred QPS unit — the fallback QPS calculation divided n_samples_issued by duration_ns directly, giving ~1e-8 instead of the correct value. Convert duration to seconds first. Also guard against overwriting an already-resolved QPS value. - README.md: Update submission checker documentation with current version numbers, endpoints directory structure, endpoints-specific checks, and accuracy_scores requirement. Co-authored-by: Claude Sonnet 4.6 --- tools/submission/README.md | 28 ++++++++++++------- .../checks/performance_check.py | 22 ++++++++++++++- .../submission_checker/constants.py | 24 ++++++++-------- .../parsers/endpoints_parser.py | 12 ++++---- 4 files changed, 58 insertions(+), 28 deletions(-) diff --git a/tools/submission/README.md b/tools/submission/README.md index 6d620233b2..119520a412 100644 --- a/tools/submission/README.md +++ b/tools/submission/README.md @@ -32,7 +32,7 @@ The input submission directory is modified with empty directories removed and lo ## `submission_checker/main.py` (Mandatory) ### Inputs **input**: Path to the directory containing one or several submissions.
-**version**: Checker version. E.g v1.1, v2.0, v2.1, v3.0, v3.1.
+**version**: Checker version. E.g v5.0, v5.1, v6.0, v6.1.
**submitter**: Filter submitters and only run the checks for some specific submitter.
**csv**: Output path where the csv with the results will be stored. E.g `results/summary.csv`.
**skip_compliance**: Flag to skip compliance checks.
@@ -71,25 +71,34 @@ python3 -m inference.tools.submission.submission_checker.main [--skip-calibration-check] ``` -### implemented checks -**performance:** +### Implemented checks +**performance (loadgen):** - Check performance detailed log exists - Check for loadgen errors - Check for equal issue mode when it is required - Check the performance sample count used for running the benchmark - Check loadgen seeds are correct -- Check latency constrain is met -- Check minimun query count is met -- Check minimun duration is met +- Check latency constraint is met +- Check minimum query count is met +- Check minimum duration is met - Check network requirements -- Check LLM latencies are met (if applies) +- Check LLM TTFT/TPOT latencies are met via `use_token_latencies` flag (if applies) - Check loadgen scenario matches with submission scenario or that result can be inferred +**performance (endpoints):** +- Check result_summary.json and config.yaml exist +- Check latency p99 constraint is met (from `latency.percentiles.99.0` in result_summary.json) +- Check minimum duration is met (from `settings.runtime.min_duration_ms` in config.yaml) +- Check LLM TTFT/TPOT p99 limits directly from result_summary.json for Server/Interactive scenarios +- Extract primary metric as QPS (inferred from `n_samples_issued / duration_s` if not in results) +- Skips: sample count check, seed check, min query count check (not applicable to endpoints) + **accuracy** - Check the accuracy metric is correct and over the expected threshold (or within a range if applies) -- Check accuracy json exists and is truncated +- Check accuracy json exists and is truncated (loadgen only) - Check for loadgen error - Check full dataset is used for the accuracy run +- Check `accuracy_scores` field is present and non-null (endpoints only) **compliance** - Check compliance directory exists @@ -112,11 +121,10 @@ python3 -m inference.tools.submission.submission_checker.main - Check availability is valid - Check system type is valid - Check network fields -- Check required fields are include in system json file +- Check required fields are included in system json file - Check submitter is correct - Check division is correct - ### Outputs - CSV file containing all the valid results in the directory. - It raises several errors and logs invalid results. diff --git a/tools/submission/submission_checker/checks/performance_check.py b/tools/submission/submission_checker/checks/performance_check.py index 624ff220ec..12763fa013 100644 --- a/tools/submission/submission_checker/checks/performance_check.py +++ b/tools/submission/submission_checker/checks/performance_check.py @@ -415,6 +415,25 @@ def llm_check(self): False otherwise. """ if self.model in self.config.get_llm_models(): + if self.is_endpoints: + # Endpoints don't use the loadgen use_token_latencies flag; + # check TTFT/TPOT directly from the endpoints result JSON. + if self.scenario not in ["Server", "Interactive"]: + return True + limits = LLM_LATENCY_LIMITS[self.model][self.scenario] + ttft = self.mlperf_log["result_first_token_99.00_percentile_latency_ns"] + tpot = self.mlperf_log["result_time_per_output_token_99.00_percentile_ns"] + if ttft is None or tpot is None: + self.log.warning( + "%s TTFT or TPOT percentile data missing for endpoints LLM check", + self.path) + return True + if ttft < limits["ttft"] and tpot < limits["tpot"]: + return True + self.log.error( + 'Failed extra check for TTFT and TPOT. Obtained: TTFT 99-tile: %.4f, TPOT 99-tile: %.4f. Required: TTFT 99-tile: %.4f, TPOT 99-tile: %.4f', + ttft, tpot, limits["ttft"], limits["tpot"]) + return False if self.mlperf_log["requested_use_token_latencies"]: if self.scenario not in ["Server", "Interactive"]: # For offline, singlestream and multistream no further checks are @@ -504,7 +523,8 @@ def get_performance_metric_check(self): res = float(self.mlperf_log[RESULT_FIELD_NEW[version][scenario]]) if ( - version in RESULT_FIELD_BENCHMARK_OVERWRITE + not self.is_endpoints + and version in RESULT_FIELD_BENCHMARK_OVERWRITE and self.model in RESULT_FIELD_BENCHMARK_OVERWRITE[version] and scenario in RESULT_FIELD_BENCHMARK_OVERWRITE[version][self.model] ): diff --git a/tools/submission/submission_checker/constants.py b/tools/submission/submission_checker/constants.py index 09dce5e8d2..1b147cea29 100644 --- a/tools/submission/submission_checker/constants.py +++ b/tools/submission/submission_checker/constants.py @@ -1811,23 +1811,23 @@ "latency.min ": "result_min_latency_ns", "latency.max": "result_max_latency_ns", "latency.avg": "result_mean_latency_ns", - "latency.percentiles.50": "result_50.00_percentile_latency_ns", - "latency.percentiles.90": "result_90.00_percentile_latency_ns", - "latency.percentiles.95": "result_95.00_percentile_latency_ns", - "latency.percentiles.99": "result_99.00_percentile_latency_ns", + "latency.percentiles.50.0": "result_50.00_percentile_latency_ns", + "latency.percentiles.90.0": "result_90.00_percentile_latency_ns", + "latency.percentiles.95.0": "result_95.00_percentile_latency_ns", + "latency.percentiles.99.0": "result_99.00_percentile_latency_ns", "latency.percentiles.99.9": "result_99.90_percentile_latency_ns", "ttft.min": "result_first_token_min_latency_ns", "ttft.max": "result_first_token_max_latency_ns", "ttft.avg": "result_first_token_mean_latency_ns", - "ttft.percentiles.50": "result_first_token_50.00_percentile_latency_ns", - "ttft.percentiles.90": "result_first_token_90.00_percentile_latency_ns", - "ttft.percentiles.95": "result_first_token_95.00_percentile_latency_ns", - "ttft.percentiles.99": "result_first_token_99.00_percentile_latency_ns", + "ttft.percentiles.50.0": "result_first_token_50.00_percentile_latency_ns", + "ttft.percentiles.90.0": "result_first_token_90.00_percentile_latency_ns", + "ttft.percentiles.95.0": "result_first_token_95.00_percentile_latency_ns", + "ttft.percentiles.99.0": "result_first_token_99.00_percentile_latency_ns", "ttft.percentiles.99.9": "result_first_token_99.90_percentile_latency_ns", - "tpot.percentiles.50": "result_time_per_output_token_50.00_percentile_ns", - "tpot.percentiles.90": "result_time_per_output_token_90.00_percentile_ns", - "tpot.percentiles.95": "result_time_per_output_token_95.00_percentile_ns", - "tpot.percentiles.99": "result_time_per_output_token_99.00_percentile_ns", + "tpot.percentiles.50.0": "result_time_per_output_token_50.00_percentile_ns", + "tpot.percentiles.90.0": "result_time_per_output_token_90.00_percentile_ns", + "tpot.percentiles.95.0": "result_time_per_output_token_95.00_percentile_ns", + "tpot.percentiles.99.0": "result_time_per_output_token_99.00_percentile_ns", "tpot.percentiles.99.9": "result_time_per_output_token_99.90_percentile_ns", "tpot.min": "result_time_to_output_token_min", "tpot.max": "result_time_to_output_token_max", diff --git a/tools/submission/submission_checker/parsers/endpoints_parser.py b/tools/submission/submission_checker/parsers/endpoints_parser.py index b1fc533916..e014b0f4b9 100644 --- a/tools/submission/submission_checker/parsers/endpoints_parser.py +++ b/tools/submission/submission_checker/parsers/endpoints_parser.py @@ -108,15 +108,17 @@ def __init__(self, log_paths): entry = {"key": inferred, "value": value} self.messages.setdefault(inferred, []).append(entry) - # Temporary solution: Hardcoded inferred values + # Infer QPS from sample count / duration when not directly available. + # generated_query_duration is in nanoseconds; divide by 1e9 for seconds. if self.__getitem__("generated_query_duration") and self.__getitem__( "generated_query_count"): key = "result_samples_per_second" if self.__getitem__( "effective_scenario").lower() == "offline" else "result_completed_samples_per_sec" - value = self.__getitem__( - "generated_query_count") / self.__getitem__("generated_query_duration") - entry = {"key": key, "value": value} - self.messages.setdefault(key, []).append(entry) + duration_s = self.__getitem__("generated_query_duration") / 1e9 + value = self.__getitem__("generated_query_count") / duration_s + if key not in self.messages: + entry = {"key": key, "value": value} + self.messages[key] = [entry] # Extract accuracy scores if possible if "accuracy_scores" in json_data: From 045b2e38a5fd5429a721bbd09c57a15a663d98bf Mon Sep 17 00:00:00 2001 From: mlc-automations <3246381+mlc-automations@users.noreply.github.com> Date: Wed, 17 Jun 2026 15:57:09 +0000 Subject: [PATCH 5/7] [Automated Commit] Format Codebase --- .../submission/submission_checker/parsers/endpoints_parser.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/submission/submission_checker/parsers/endpoints_parser.py b/tools/submission/submission_checker/parsers/endpoints_parser.py index e014b0f4b9..11e05bb57b 100644 --- a/tools/submission/submission_checker/parsers/endpoints_parser.py +++ b/tools/submission/submission_checker/parsers/endpoints_parser.py @@ -109,7 +109,8 @@ def __init__(self, log_paths): self.messages.setdefault(inferred, []).append(entry) # Infer QPS from sample count / duration when not directly available. - # generated_query_duration is in nanoseconds; divide by 1e9 for seconds. + # generated_query_duration is in nanoseconds; divide by 1e9 for + # seconds. if self.__getitem__("generated_query_duration") and self.__getitem__( "generated_query_count"): key = "result_samples_per_second" if self.__getitem__( From e59c7ae2624bdd13f0873d25b155ce94325f9676 Mon Sep 17 00:00:00 2001 From: Pablo Gonzalez Date: Wed, 17 Jun 2026 17:06:11 -0500 Subject: [PATCH 6/7] Checker fixes + update loader --- .../checks/performance_check.py | 19 +- .../submission_checker/constants.py | 45 ++-- tools/submission/submission_checker/loader.py | 64 +++--- .../parsers/endpoints_parser.py | 213 ++++++++++-------- 4 files changed, 176 insertions(+), 165 deletions(-) diff --git a/tools/submission/submission_checker/checks/performance_check.py b/tools/submission/submission_checker/checks/performance_check.py index 12763fa013..ddb42f8b4c 100644 --- a/tools/submission/submission_checker/checks/performance_check.py +++ b/tools/submission/submission_checker/checks/performance_check.py @@ -81,7 +81,6 @@ def setup_checks(self): self.checks.append(self.get_performance_metric_check) self.apply_checks = set(self.checks) if self.is_endpoints: - self.apply_checks.remove(self.seeds_check) self.apply_checks.remove(self.performance_sample_count_check) self.apply_checks.remove(self.min_query_count_check) @@ -210,14 +209,16 @@ def seeds_check(self): sample_index_rng_seed = self.mlperf_log["effective_sample_index_rng_seed"] schedule_rng_seed = self.mlperf_log["effective_schedule_rng_seed"] is_valid = True - if qsl_rng_seed != config_seeds["qsl_rng_seed"]: - self.log.error( - "%s qsl_rng_seed is wrong, expected=%s, found=%s", - self.path, - config_seeds["qsl_rng_seed"], - qsl_rng_seed, - ) - is_valid = False + if not self.is_endpoints: + # This seed does not exists for endpoints runs + if qsl_rng_seed != config_seeds["qsl_rng_seed"]: + self.log.error( + "%s qsl_rng_seed is wrong, expected=%s, found=%s", + self.path, + config_seeds["qsl_rng_seed"], + qsl_rng_seed, + ) + is_valid = False if sample_index_rng_seed != config_seeds["sample_index_rng_seed"]: self.log.error( "%s sample_index_rng_seed is wrong, expected=%s, found=%s", diff --git a/tools/submission/submission_checker/constants.py b/tools/submission/submission_checker/constants.py index 1b147cea29..d126d426eb 100644 --- a/tools/submission/submission_checker/constants.py +++ b/tools/submission/submission_checker/constants.py @@ -1635,18 +1635,11 @@ "default": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/performance/run_1/mlperf_log_summary.txt", } -PERFORMANCE_ENDPOINTS_PATH = { - "v5.0": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/performance/run_1/result_summary.json", - "v5.1": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/performance/run_1/result_summary.json", - "v6.0": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/performance/run_1/result_summary.json", - "default": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/performance/run_1/result_summary.json", -} - -PERFORMANCE_CONFIG_ENDPOINTS_PATH = { - "v5.0": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/performance/run_1/config.yaml", - "v5.1": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/performance/run_1/config.yaml", - "v6.0": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/performance/run_1/config.yaml", - "default": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/performance/run_1/config.yaml", +PERFORMANCE_ENDPOINTS_DIR = { + "v5.0": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/performance/run_1/", + "v5.1": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/performance/run_1/", + "v6.0": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/performance/run_1/", + "default": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/performance/run_1/", } ACCURACY_LOG_PATH = { @@ -1670,19 +1663,13 @@ "default": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/accuracy/mlperf_log_accuracy.json", } -ACCURACY_ENDPOINTS_PATH = { - "v5.0": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/accuracy/results.json", - "v5.1": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/accuracy/results.json", - "v6.0": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/accuracy/results.json", - "default": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/accuracy/results.json", +ACCURACY_ENDPOINTS_DIR = { + "v5.0": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/accuracy/", + "v5.1": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/accuracy/", + "v6.0": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/accuracy/", + "default": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/accuracy/", } -ACCURACY_CONFIG_ENDPOINTS_PATH = { - "v5.0": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/accuracy/config.yaml", - "v5.1": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/accuracy/config.yaml", - "v6.0": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/accuracy/config.yaml", - "default": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/accuracy/config.yaml", -} POWER_DIR_PATH = { "v5.0": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/performance/power", @@ -1798,12 +1785,11 @@ "min_sample_count": "effective_min_query_count", "effective_sample_index_rng_seed": "effective_sample_index_rng_seed", "effective_schedule_rng_seed": "effective_schedule_rng_seed", - "min_sample_count (runtime_settings.json)": "effective_min_sample_count", "effective_sample_concatenate_permutation": "effective_sample_concatenate_permutation", "effective_samples_per_query": "effective_samples_per_query", "generated_query_count": "generated_query_count", "generated_query_duration": "generated_query_duration", - "target_qps (results_summary.json)": "effective_target_qps", + "target_qps": "effective_target_qps", "result_scheduled_samples_per_sec": "result_scheduled_samples_per_sec", "qps": "result_completed_samples_per_sec", "effective_target_latency_ns": "effective_target_latency_ns", @@ -1829,9 +1815,9 @@ "tpot.percentiles.95.0": "result_time_per_output_token_95.00_percentile_ns", "tpot.percentiles.99.0": "result_time_per_output_token_99.00_percentile_ns", "tpot.percentiles.99.9": "result_time_per_output_token_99.90_percentile_ns", - "tpot.min": "result_time_to_output_token_min", - "tpot.max": "result_time_to_output_token_max", - "tpot.avg": "result_time_to_output_token_mean", + "tpot.min": "result_time_per_output_token_min", + "tpot.max": "result_time_per_output_token_max", + "tpot.avg": "result_time_per_output_token_mean", "tps": "result_completed_tokens_per_second", "result.total": "result_query_count", "result.failed": "num_errors", @@ -1849,8 +1835,7 @@ "max_duration_ms": "settings.runtime.max_duration_ms", "effective_sample_index_rng_seed": "settings.runtime.dataloader_random_seed", "effective_schedule_rng_seed": "settings.runtime.scheduler_random_seed", - "target_qps (results_summary.json)": "settings.load_pattern.target_qps", - "min_sample_count (runtime_settings.json)": "settings.runtime.n_samples_to_issue", + "target_qps": "settings.load_pattern.target_qps", "min_sample_count": "settings.runtime.n_samples_to_issue", } diff --git a/tools/submission/submission_checker/loader.py b/tools/submission/submission_checker/loader.py index 170da085f1..2c8fa592d8 100644 --- a/tools/submission/submission_checker/loader.py +++ b/tools/submission/submission_checker/loader.py @@ -83,18 +83,12 @@ def __init__(self, root, version, config: Config) -> None: self.acc_json_path = os.path.join( self.root, ACCURACY_JSON_PATH.get( version, ACCURACY_JSON_PATH["default"])) - self.perf_endpoints_path = os.path.join( - self.root, PERFORMANCE_ENDPOINTS_PATH.get( - version, PERFORMANCE_ENDPOINTS_PATH["default"])) - self.perf_endpoints_config_path = os.path.join( - self.root, PERFORMANCE_CONFIG_ENDPOINTS_PATH.get( - version, PERFORMANCE_CONFIG_ENDPOINTS_PATH["default"])) - self.acc_endpoints_path = os.path.join( - self.root, ACCURACY_ENDPOINTS_PATH.get( - version, ACCURACY_ENDPOINTS_PATH["default"])) - self.acc_endpoints_config_path = os.path.join( - self.root, ACCURACY_CONFIG_ENDPOINTS_PATH.get( - version, ACCURACY_CONFIG_ENDPOINTS_PATH["default"])) + self.perf_endpoints_dir = os.path.join( + self.root, PERFORMANCE_ENDPOINTS_DIR.get( + version, PERFORMANCE_ENDPOINTS_DIR["default"])) + self.acc_endpoints_dir = os.path.join( + self.root, ACCURACY_ENDPOINTS_DIR.get( + version, ACCURACY_ENDPOINTS_DIR["default"])) self.system_log_path = os.path.join( self.root, SYSTEM_PATH.get( version, SYSTEM_PATH["default"])) @@ -203,9 +197,7 @@ def load_single_log(self, path, log_type: Literal["Performance", "Accuracy", if loading fails. """ log = None - if log_type in ["Endpoints"]: - log = EndpointsParser(path) - elif os.path.exists(path): + if os.path.exists(path): self.logger.info("Loading %s log from %s", log_type, path) if log_type in ["Performance", "Accuracy", "Test"]: log = LoadgenParser(path) @@ -229,6 +221,22 @@ def load_single_log(self, path, log_type: Literal["Performance", "Accuracy", path) return log + def load_endpoints_logs(self, perf_dir, acc_dir): + perf_log = None + acc_log = None + if os.path.exists(acc_dir) and os.path.exists(perf_dir): + acc_log = EndpointsParser(acc_dir) + perf_log = EndpointsParser(perf_dir) + elif os.path.exists(perf_dir): + acc_log = EndpointsParser(perf_dir) + perf_log = EndpointsParser(perf_dir) + else: + self.logger.info( + "Could not load endpoints log from %s, path does not exist", + perf_dir + ) + return perf_log, acc_log + def check_scenarios(self, benchmark, model_mapping, system_type, scenarios): self.config.set_type(system_type) @@ -309,25 +317,13 @@ def load(self) -> Generator[SubmissionLogs, None, None]: system=system, benchmark=benchmark, scenario=scenario) - perf_endpoints_path = self.perf_endpoints_path.format( - division=division, - submitter=submitter, - system=system, - benchmark=benchmark, - scenario=scenario) - perf_endpoints_config_path = self.perf_endpoints_config_path.format( + perf_endpoints_dir = self.perf_endpoints_dir.format( division=division, submitter=submitter, system=system, benchmark=benchmark, scenario=scenario) - acc_endpoints_path = self.acc_endpoints_path.format( - division=division, - submitter=submitter, - system=system, - benchmark=benchmark, - scenario=scenario) - acc_endpoints_config_path = self.acc_endpoints_config_path.format( + acc_endpoints_dir = self.acc_endpoints_dir.format( division=division, submitter=submitter, system=system, @@ -441,14 +437,8 @@ def load(self) -> Generator[SubmissionLogs, None, None]: measurements_path, "Measurements") if perf_log is None and acc_log is None: is_endpoints_submission = True - perf_log = self.load_single_log( - [perf_endpoints_path, - perf_endpoints_config_path], - "Endpoints" - ) - acc_log = self.load_single_log( - [acc_endpoints_path, acc_endpoints_config_path], - "Endpoints" + perf_log, acc_log = self.load_endpoints_logs( + perf_endpoints_dir, acc_endpoints_dir ) # Load test logs diff --git a/tools/submission/submission_checker/parsers/endpoints_parser.py b/tools/submission/submission_checker/parsers/endpoints_parser.py index 11e05bb57b..9fc5ddfb7c 100644 --- a/tools/submission/submission_checker/parsers/endpoints_parser.py +++ b/tools/submission/submission_checker/parsers/endpoints_parser.py @@ -5,7 +5,12 @@ import yaml from .base import BaseParser -from ..constants import ENDPOINTS_YAML_FIELD_MAP, ENDPOINTS_JSON_ALT_PATHS, ENDPOINTS_MAPPINGS, ENDPOINTS_INFERRED_FIELDS +from ..constants import ( + ENDPOINTS_YAML_FIELD_MAP, + ENDPOINTS_JSON_ALT_PATHS, + ENDPOINTS_MAPPINGS, + ENDPOINTS_INFERRED_FIELDS, +) _FIELDS_MAP_DIR = os.path.join( os.path.dirname(__file__), @@ -18,6 +23,10 @@ "helper", "sample_logs") +_RESULT_SUMMARY_FILE = "result_summary.json" +_RESULTS_FILE = "results.json" +_CONFIG_FILES = ("config.yaml", "config.yml") + def _load_field_map(filename): with open(os.path.join(_FIELDS_MAP_DIR, filename), "r", encoding="utf-8") as f: @@ -29,6 +38,9 @@ def _get_nested(data, dotted_key): Uses a greedy left-to-right match so dotted numeric keys like '99.9' are handled correctly: the longest matching key at each level wins. + + Also handles float-formatted integer keys: '50.0' resolves to key '50' + (common in the ENDPOINTS_MAPPINGS percentile entries). """ if not isinstance(data, dict): return None @@ -37,6 +49,10 @@ def _get_nested(data, dotted_key): i = 0 while i < len(parts): if not isinstance(current, dict): + # Trailing '.0' on a float-formatted integer key: treat as + # consumed. + if parts[i:] == ["0"] and not isinstance(current, (dict, list)): + return current return None found = False for j in range(len(parts), i, -1): @@ -53,85 +69,112 @@ def _get_nested(data, dotted_key): return current +def _resolve_value(stripped, summary_data, results_data, yaml_data): + """Look up a field in three data sources in priority order. + + Priority: result_summary.json > results.json > config.yaml + Within each JSON source, a direct dot-notation path is tried first, + then the alternative paths from ENDPOINTS_JSON_ALT_PATHS. + For the YAML source, the explicit path overrides in ENDPOINTS_YAML_FIELD_MAP + are tried first, then a direct dot-notation path. + """ + for data in (summary_data, results_data): + value = _get_nested(data, stripped) + if value is None and stripped in ENDPOINTS_JSON_ALT_PATHS: + value = _get_nested(data, ENDPOINTS_JSON_ALT_PATHS[stripped]) + if value is not None: + return value + + # YAML: explicit path map first, then direct + if stripped in ENDPOINTS_YAML_FIELD_MAP: + value = _get_nested(yaml_data, ENDPOINTS_YAML_FIELD_MAP[stripped]) + if value is not None: + return value + return _get_nested(yaml_data, stripped) + + class EndpointsParser(BaseParser): - def __init__(self, log_paths): + def __init__(self, run_dir): """ - log_paths: [json_path, yaml_path] - json_path - path to the JSON results file (result_summary.json or results.json) - yaml_path - path to the YAML config file (config.yaml) + run_dir: path to the run directory containing: + - result_summary.json (highest priority) + - results.json + - config.yaml / config.yml (lowest priority) """ - json_path, yaml_path = log_paths - super().__init__(json_path) + super().__init__(run_dir) self.logger = logging.getLogger("MLPerfLog") self.messages = {} - with open(json_path, "r", encoding="utf-8") as f: - json_data = json.load(f) - - with open(yaml_path, "r", encoding="utf-8") as f: - yaml_data = yaml.safe_load(f) - - forwards_map = ENDPOINTS_MAPPINGS + summary_data = self._load_json( + os.path.join(run_dir, _RESULT_SUMMARY_FILE)) + results_data = self._load_json(os.path.join(run_dir, _RESULTS_FILE)) + yaml_data = self._load_yaml(run_dir) - for endpoints_key, loadgen_key in forwards_map.items(): + for endpoints_key, loadgen_key in ENDPOINTS_MAPPINGS.items(): stripped = endpoints_key.strip() - value = None - - # 1. Direct dot-notation path in the JSON result file - value = _get_nested(json_data, stripped) - - # 2. Alternative JSON paths for known structural mismatches - if value is None and stripped in ENDPOINTS_JSON_ALT_PATHS: - value = _get_nested( - json_data, ENDPOINTS_JSON_ALT_PATHS[stripped]) - - # 3. Explicit YAML field path overrides - if value is None and stripped in ENDPOINTS_YAML_FIELD_MAP: - value = _get_nested( - yaml_data, ENDPOINTS_YAML_FIELD_MAP[stripped]) - - # 4. Fallback: direct dot-notation path in the YAML config - if value is None: - value = _get_nested(yaml_data, stripped) - + value = _resolve_value( + stripped, summary_data, results_data, yaml_data) if value is not None: - entry = {"key": loadgen_key, "value": value} - self.messages.setdefault(loadgen_key, []).append(entry) + self.messages.setdefault(loadgen_key, []).append( + {"key": loadgen_key, "value": value} + ) self.keys = set(self.messages.keys()) - # Additional values that can be inferred from other values - inferred_map = ENDPOINTS_INFERRED_FIELDS - for inferred, key in inferred_map.items(): - value = self.__getitem__(key) + + # Inferred fields: copy the value of one loadgen key to another + for inferred_key, source_key in ENDPOINTS_INFERRED_FIELDS.items(): + value = self[source_key] if value is not None: - entry = {"key": inferred, "value": value} - self.messages.setdefault(inferred, []).append(entry) - - # Infer QPS from sample count / duration when not directly available. - # generated_query_duration is in nanoseconds; divide by 1e9 for - # seconds. - if self.__getitem__("generated_query_duration") and self.__getitem__( - "generated_query_count"): - key = "result_samples_per_second" if self.__getitem__( - "effective_scenario").lower() == "offline" else "result_completed_samples_per_sec" - duration_s = self.__getitem__("generated_query_duration") / 1e9 - value = self.__getitem__("generated_query_count") / duration_s - if key not in self.messages: - entry = {"key": key, "value": value} - self.messages[key] = [entry] - - # Extract accuracy scores if possible - if "accuracy_scores" in json_data: - for dataset_name, result in json_data["accuracy_scores"].items(): - value = result.get("score", None) - entry = {"key": "accuracy_score", "value": value} - self.messages.setdefault("accuracy_score", []).append(entry) + self.messages.setdefault(inferred_key, []).append( + {"key": inferred_key, "value": value} + ) + + # Infer QPS from count / duration when not directly available + duration_ns = self["generated_query_duration"] + count = self["generated_query_count"] + scenario = self["effective_scenario"] + if duration_ns and count and scenario: + qps_key = ( + "result_samples_per_second" + if scenario.lower() == "offline" + else "result_completed_samples_per_sec" + ) + if qps_key not in self.messages: + qps = count / (duration_ns / 1e9) + self.messages[qps_key] = [{"key": qps_key, "value": qps}] + + # Expose accuracy scores stored in results.json + for result in results_data.get("accuracy_scores", {}).values(): + score = result.get("score") + if score is not None: + self.messages.setdefault("accuracy_score", []).append( + {"key": "accuracy_score", "value": score} + ) self.keys = set(self.messages.keys()) - self.logger.info( - "Successfully loaded endpoints log from %s.", - json_path) + self.logger.info("Successfully loaded endpoints log from %s.", run_dir) + + def _load_json(self, path): + try: + with open(path, "r", encoding="utf-8") as f: + return json.load(f) + except BaseException: + self.logger.error("Could not load json file from %s", path) + return {} + return {} + + def _load_yaml(self, run_dir): + for name in _CONFIG_FILES: + path = os.path.join(run_dir, name) + if os.path.exists(path): + try: + with open(path, "r", encoding="utf-8") as f: + return yaml.safe_load(f) or {} + except BaseException: + pass + self.logger.error("Yaml file not found in directory %s", run_dir) + return {} def __getitem__(self, key): if key not in self.keys: @@ -154,10 +197,9 @@ def get_keys(self): return self.keys def num_errors(self): - return self.get("num_errors") + return self["num_errors"] def has_error(self): - """Check if the log contains any errors.""" return self.num_errors() != 0 @@ -170,37 +212,30 @@ def main(): backwards_map = _load_field_map("backwards.json") - # Collect all (json_file, yaml_file) pairs from leaf subdirectories - pairs = [] + # Collect all run directories (those containing at least one JSON and one + # YAML) + run_dirs = [] for root, _dirs, files in os.walk(_SAMPLE_LOGS_DIR): - json_files = sorted(f for f in files if f.endswith(".json")) - yaml_files = sorted(f for f in files if f.endswith( - ".yaml") or f.endswith(".yml")) - if json_files and yaml_files: - pairs.append( - ( - os.path.join(root, json_files[0]), - os.path.join(root, yaml_files[0]), - ) - ) - - if not pairs: - logger.error("No JSON+YAML pairs found under %s.", _SAMPLE_LOGS_DIR) + has_json = any(f.endswith(".json") for f in files) + has_yaml = any(f.endswith(".yaml") or f.endswith(".yml") + for f in files) + if has_json and has_yaml: + run_dirs.append(root) + + if not run_dirs: + logger.error("No run directories found under %s.", _SAMPLE_LOGS_DIR) return 1 - for json_path, yaml_path in sorted(pairs): - folder = os.path.relpath(os.path.dirname(json_path), _SAMPLE_LOGS_DIR) + for run_dir in sorted(run_dirs): + folder = os.path.relpath(run_dir, _SAMPLE_LOGS_DIR) print(f"\n{'=' * 70}") - print(f"Folder : {folder}") - print(f"JSON : {os.path.basename(json_path)}") - print(f"YAML : {os.path.basename(yaml_path)}") + print(f"Directory: {folder}") print(f"{'=' * 70}") - parser = EndpointsParser([json_path, yaml_path]) + parser = EndpointsParser(run_dir) found = [] not_found = [] - for loadgen_key, endpoints_key in backwards_map.items(): value = parser[loadgen_key] if value is not None: From 733eb8895a954c7b974e80910ef9ebb996582c6c Mon Sep 17 00:00:00 2001 From: Pablo Gonzalez Date: Wed, 17 Jun 2026 18:33:23 -0500 Subject: [PATCH 7/7] Minor fixes --- tools/submission/submission_checker/checks/compliance_check.py | 1 + tools/submission/submission_checker/constants.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/submission/submission_checker/checks/compliance_check.py b/tools/submission/submission_checker/checks/compliance_check.py index b08b23803e..4853a832f3 100644 --- a/tools/submission/submission_checker/checks/compliance_check.py +++ b/tools/submission/submission_checker/checks/compliance_check.py @@ -192,6 +192,7 @@ def performance_check(self): "model_mapping": self.submission_logs.loader_data.get("model_mapping", {}), "check_scenarios": True, "compliance_skip": True, + "is_endpoints_submission": self.submission_logs.loader_data.get("is_endpoints_submission", False), } test_logs = SubmissionLogs( self.submission_logs.loader_data[f"{test}_perf_log"], None, None, None, self.submission_logs.system_json, None, test_data) diff --git a/tools/submission/submission_checker/constants.py b/tools/submission/submission_checker/constants.py index d126d426eb..a4137eabf3 100644 --- a/tools/submission/submission_checker/constants.py +++ b/tools/submission/submission_checker/constants.py @@ -1794,7 +1794,7 @@ "qps": "result_completed_samples_per_sec", "effective_target_latency_ns": "effective_target_latency_ns", "effective_target_latency_percentile": "effective_target_latency_percentile", - "latency.min ": "result_min_latency_ns", + "latency.min": "result_min_latency_ns", "latency.max": "result_max_latency_ns", "latency.avg": "result_mean_latency_ns", "latency.percentiles.50.0": "result_50.00_percentile_latency_ns",