diff --git a/tools/submission/README.md b/tools/submission/README.md index 6d620233b2..119520a412 100644 --- a/tools/submission/README.md +++ b/tools/submission/README.md @@ -32,7 +32,7 @@ The input submission directory is modified with empty directories removed and lo ## `submission_checker/main.py` (Mandatory) ### Inputs **input**: Path to the directory containing one or several submissions.
-**version**: Checker version. E.g v1.1, v2.0, v2.1, v3.0, v3.1.
+**version**: Checker version. E.g v5.0, v5.1, v6.0, v6.1.
**submitter**: Filter submitters and only run the checks for some specific submitter.
**csv**: Output path where the csv with the results will be stored. E.g `results/summary.csv`.
**skip_compliance**: Flag to skip compliance checks.
@@ -71,25 +71,34 @@ python3 -m inference.tools.submission.submission_checker.main [--skip-calibration-check] ``` -### implemented checks -**performance:** +### Implemented checks +**performance (loadgen):** - Check performance detailed log exists - Check for loadgen errors - Check for equal issue mode when it is required - Check the performance sample count used for running the benchmark - Check loadgen seeds are correct -- Check latency constrain is met -- Check minimun query count is met -- Check minimun duration is met +- Check latency constraint is met +- Check minimum query count is met +- Check minimum duration is met - Check network requirements -- Check LLM latencies are met (if applies) +- Check LLM TTFT/TPOT latencies are met via `use_token_latencies` flag (if applies) - Check loadgen scenario matches with submission scenario or that result can be inferred +**performance (endpoints):** +- Check result_summary.json and config.yaml exist +- Check latency p99 constraint is met (from `latency.percentiles.99.0` in result_summary.json) +- Check minimum duration is met (from `settings.runtime.min_duration_ms` in config.yaml) +- Check LLM TTFT/TPOT p99 limits directly from result_summary.json for Server/Interactive scenarios +- Extract primary metric as QPS (inferred from `n_samples_issued / duration_s` if not in results) +- Skips: sample count check, seed check, min query count check (not applicable to endpoints) + **accuracy** - Check the accuracy metric is correct and over the expected threshold (or within a range if applies) -- Check accuracy json exists and is truncated +- Check accuracy json exists and is truncated (loadgen only) - Check for loadgen error - Check full dataset is used for the accuracy run +- Check `accuracy_scores` field is present and non-null (endpoints only) **compliance** - Check compliance directory exists @@ -112,11 +121,10 @@ python3 -m inference.tools.submission.submission_checker.main - Check availability is valid - Check system type is valid - Check network fields -- Check required fields are include in system json file +- Check required fields are included in system json file - Check submitter is correct - Check division is correct - ### Outputs - CSV file containing all the valid results in the directory. - It raises several errors and logs invalid results. diff --git a/tools/submission/submission_checker/checks/accuracy_check.py b/tools/submission/submission_checker/checks/accuracy_check.py index db1b1a7559..5f7225e11e 100644 --- a/tools/submission/submission_checker/checks/accuracy_check.py +++ b/tools/submission/submission_checker/checks/accuracy_check.py @@ -70,6 +70,8 @@ def __init__( "scenario", "") self.scenario = self.mlperf_log["effective_scenario"] self.division = self.submission_logs.loader_data.get("division", "") + self.is_endpoints = self.submission_logs.loader_data.get( + "is_endpoints_submission", False) self.setup_checks() def setup_checks(self): @@ -83,6 +85,9 @@ def setup_checks(self): self.checks.append(self.loadgen_errors_check) self.checks.append(self.dataset_check) self.checks.append(self.extra_files_check) + self.apply_checks = set(self.checks) + if self.is_endpoints: + self.apply_checks.remove(self.accuracy_json_check) def accuracy_result_check(self): """Validate reported accuracy metrics in `accuracy.txt`. @@ -97,6 +102,13 @@ def accuracy_result_check(self): False otherwise. """ + if self.is_endpoints: + if self.mlperf_log["accuracy_score"] is not None: + self.submission_logs.loader_data["accuracy_metrics"] = self.mlperf_log["accuracy_score"] + return True + self.log.error("%s accuracy score not found", self.path) + return False + patterns, acc_targets, acc_types, acc_limits, up_patterns, acc_upper_limit = self.config.get_accuracy_values( self.model ) diff --git a/tools/submission/submission_checker/checks/base.py b/tools/submission/submission_checker/checks/base.py index 8e2a678fb9..69f51da638 100644 --- a/tools/submission/submission_checker/checks/base.py +++ b/tools/submission/submission_checker/checks/base.py @@ -9,6 +9,7 @@ class BaseCheck(ABC): def __init__(self, log, path): self.checks = [] + self.apply_checks = set() self.log = log self.path = path self.name = "base checks" @@ -21,22 +22,32 @@ def run_checks(self): valid = True errors = [] for check in self.checks: - try: - v = self.execute(check) - valid &= v - except BaseException: - valid &= False - self.log.error( - "Execution occurred in running check %s. Running %s in %s", - self.path, + if self.check_applies(check): + try: + v = self.execute(check) + valid &= v + except BaseException: + valid &= False + self.log.error( + "Execution occurred in running check %s. Running %s in %s", + self.path, + check.__name__, + self.__class__.__name__) + else: + self.log.warning( + "Execution of check %s skipped for %s.", check.__name__, - self.__class__.__name__) + self.path + ) return valid def execute(self, check): """Custom execution of a single check method.""" return check() + def check_applies(self, fn): + return fn in self.apply_checks + def __call__(self): """Allows the check instance to be called like a function.""" self.log.info("Starting %s for: %s", self.name, self.path) diff --git a/tools/submission/submission_checker/checks/compliance_check.py b/tools/submission/submission_checker/checks/compliance_check.py index 13cc6b16b8..4853a832f3 100644 --- a/tools/submission/submission_checker/checks/compliance_check.py +++ b/tools/submission/submission_checker/checks/compliance_check.py @@ -50,6 +50,8 @@ def __init__(self, log, path, config: Config, self.model = self.config.get_mlperf_model( self.model, self.model_mapping) self.test_list = self.get_test_list(self.model) + self.is_endpoints = self.submission_logs.loader_data.get( + "is_endpoints_submission", False) self.setup_checks() def setup_checks(self): @@ -62,6 +64,10 @@ def setup_checks(self): self.checks.append(self.performance_check) self.checks.append(self.accuracy_check) self.checks.append(self.compliance_performance_check) + self.apply_checks = set(self.checks) + # No compliance tests for endpoints for now + if self.is_endpoints: + self.apply_checks = set() def get_test_list(self, model): """Return the list of compliance tests applicable to `model`. @@ -186,6 +192,7 @@ def performance_check(self): "model_mapping": self.submission_logs.loader_data.get("model_mapping", {}), "check_scenarios": True, "compliance_skip": True, + "is_endpoints_submission": self.submission_logs.loader_data.get("is_endpoints_submission", False), } test_logs = SubmissionLogs( self.submission_logs.loader_data[f"{test}_perf_log"], None, None, None, self.submission_logs.system_json, None, test_data) @@ -322,7 +329,9 @@ def accuracy_check(self): first_token_pass and eos_pass and length_check_pass) if not is_valid: self.log.error( - f"TEST06 accuracy check failed. first_token_check: {first_token_pass} eos_check: {eos_pass} length_check: {length_check_pass}." + f"TEST06 accuracy check failed. first_token_check:" + + f"{first_token_pass} eos_check: " + + f"{eos_pass} length_check: {length_check_pass}." ) elif test == "TEST07": # TEST07: Verify accuracy in performance mode diff --git a/tools/submission/submission_checker/checks/measurements_checks.py b/tools/submission/submission_checker/checks/measurements_checks.py index 06b89f56fc..8a2731f4e6 100644 --- a/tools/submission/submission_checker/checks/measurements_checks.py +++ b/tools/submission/submission_checker/checks/measurements_checks.py @@ -61,6 +61,9 @@ def setup_checks(self): self.checks.append(self.directory_exist_check) self.checks.append(self.required_files_check) self.checks.append(self.required_fields_check) + self.is_endpoints = self.submission_logs.loader_data.get( + "is_endpoints_submission", False) + self.apply_checks = set(self.checks) def missing_check(self): """Ensure a measurements JSON was provided. diff --git a/tools/submission/submission_checker/checks/performance_check.py b/tools/submission/submission_checker/checks/performance_check.py index e54e7b5564..ddb42f8b4c 100644 --- a/tools/submission/submission_checker/checks/performance_check.py +++ b/tools/submission/submission_checker/checks/performance_check.py @@ -53,6 +53,11 @@ def __init__(self, log, path, config: Config, "scenario", "") self.scenario = self.mlperf_log["effective_scenario"] self.division = self.submission_logs.loader_data.get("division", "") + self.is_endpoints = self.submission_logs.loader_data.get( + "is_endpoints_submission", False) + if self.is_endpoints: + if self.scenario.lower() == "online": + self.scenario = "Server" self.setup_checks() def setup_checks(self): @@ -74,6 +79,10 @@ def setup_checks(self): self.checks.append(self.llm_check) self.checks.append(self.inferred_check) self.checks.append(self.get_performance_metric_check) + self.apply_checks = set(self.checks) + if self.is_endpoints: + self.apply_checks.remove(self.performance_sample_count_check) + self.apply_checks.remove(self.min_query_count_check) def missing_check(self): """Ensure the performance log was provided. @@ -200,14 +209,16 @@ def seeds_check(self): sample_index_rng_seed = self.mlperf_log["effective_sample_index_rng_seed"] schedule_rng_seed = self.mlperf_log["effective_schedule_rng_seed"] is_valid = True - if qsl_rng_seed != config_seeds["qsl_rng_seed"]: - self.log.error( - "%s qsl_rng_seed is wrong, expected=%s, found=%s", - self.path, - config_seeds["qsl_rng_seed"], - qsl_rng_seed, - ) - is_valid = False + if not self.is_endpoints: + # This seed does not exists for endpoints runs + if qsl_rng_seed != config_seeds["qsl_rng_seed"]: + self.log.error( + "%s qsl_rng_seed is wrong, expected=%s, found=%s", + self.path, + config_seeds["qsl_rng_seed"], + qsl_rng_seed, + ) + is_valid = False if sample_index_rng_seed != config_seeds["sample_index_rng_seed"]: self.log.error( "%s sample_index_rng_seed is wrong, expected=%s, found=%s", @@ -237,7 +248,7 @@ def latency_check(self): bool: True if latency constraints are satisfied, False otherwise. """ uses_early_stopping = self.config.uses_early_stopping(self.scenario) - if uses_early_stopping: + if uses_early_stopping and not self.is_endpoints: # check if early_stopping condition was met if not self.mlperf_log["early_stopping_met"]: early_stopping_result = self.mlperf_log["early_stopping_result"] @@ -386,7 +397,9 @@ def network_check(self): # (must include "Network SUT" in name) if NETWORK_MODE_REQUIRED_SUBSTRING_IN_SUT_NAME not in sut_name: self.log.error( - f"{self.path} invalid sut name for network mode. expecting the substring '{NETWORK_MODE_REQUIRED_SUBSTRING_IN_SUT_NAME}' got '{sut_name}'" + f"{self.path} invalid sut name for network mode." + + f"expecting the substring '{NETWORK_MODE_REQUIRED_SUBSTRING_IN_SUT_NAME}'" + + f" got '{sut_name}'" ) return False @@ -403,6 +416,25 @@ def llm_check(self): False otherwise. """ if self.model in self.config.get_llm_models(): + if self.is_endpoints: + # Endpoints don't use the loadgen use_token_latencies flag; + # check TTFT/TPOT directly from the endpoints result JSON. + if self.scenario not in ["Server", "Interactive"]: + return True + limits = LLM_LATENCY_LIMITS[self.model][self.scenario] + ttft = self.mlperf_log["result_first_token_99.00_percentile_latency_ns"] + tpot = self.mlperf_log["result_time_per_output_token_99.00_percentile_ns"] + if ttft is None or tpot is None: + self.log.warning( + "%s TTFT or TPOT percentile data missing for endpoints LLM check", + self.path) + return True + if ttft < limits["ttft"] and tpot < limits["tpot"]: + return True + self.log.error( + 'Failed extra check for TTFT and TPOT. Obtained: TTFT 99-tile: %.4f, TPOT 99-tile: %.4f. Required: TTFT 99-tile: %.4f, TPOT 99-tile: %.4f', + ttft, tpot, limits["ttft"], limits["tpot"]) + return False if self.mlperf_log["requested_use_token_latencies"]: if self.scenario not in ["Server", "Interactive"]: # For offline, singlestream and multistream no further checks are @@ -457,7 +489,7 @@ def inferred_check(self): ("singlestream", "offline") ] if (self.scenario.lower(), self.scenario_fixed.lower() - ) not in list_inferred: + ) not in list_inferred: self.log.error( "Result for scenario %s can not be inferred from %s for: %s", self.scenario_fixed, @@ -485,10 +517,15 @@ def get_performance_metric_check(self): ): is_valid = True scenario = self.mlperf_log["effective_scenario"] + if self.is_endpoints: + if scenario.lower() == "online": + scenario = "Server" + scenario = scenario.capitalize() res = float(self.mlperf_log[RESULT_FIELD_NEW[version][scenario]]) if ( - version in RESULT_FIELD_BENCHMARK_OVERWRITE + not self.is_endpoints + and version in RESULT_FIELD_BENCHMARK_OVERWRITE and self.model in RESULT_FIELD_BENCHMARK_OVERWRITE[version] and scenario in RESULT_FIELD_BENCHMARK_OVERWRITE[version][self.model] ): @@ -548,12 +585,12 @@ def get_inferred_result(self, res): res = qps_wo_loadgen_overhead if (scenario_fixed in ["Offline"] - ) and scenario in ["MultiStream"]: + ) and scenario in ["MultiStream"]: inferred = True res = samples_per_query * S_TO_MS / (latency_mean / MS_TO_NS) if (scenario_fixed in ["MultiStream"] - ) and scenario in ["SingleStream"]: + ) and scenario in ["SingleStream"]: inferred = True # samples_per_query does not match with the one reported in the logs # when inferring MultiStream from SingleStream @@ -570,6 +607,6 @@ def get_inferred_result(self, res): else: res = (latency_99_percentile * samples_per_query) / MS_TO_NS if (scenario_fixed in ["Interactive"] - ) and scenario not in ["Server"]: + ) and scenario not in ["Server"]: is_valid = False return res, is_valid diff --git a/tools/submission/submission_checker/checks/power_check.py b/tools/submission/submission_checker/checks/power_check.py index d3519a3503..c8accfd6f2 100644 --- a/tools/submission/submission_checker/checks/power_check.py +++ b/tools/submission/submission_checker/checks/power_check.py @@ -68,6 +68,9 @@ def setup_checks(self): self.checks.append(self.required_files_check) self.checks.append(self.external_power_check) self.checks.append(self.get_power_metric_check) + self.is_endpoints = self.submission_logs.loader_data.get( + "is_endpoints_submission", False) + self.apply_checks = set(self.checks) def required_files_check(self): """Verify required files exist in power-related directories. @@ -226,7 +229,7 @@ def get_power_metric_check(self): samples_per_query = 8 if (self.scenario_fixed.lower() in ["multistream"] - ) and scenario.lower() in ["singlestream"]: + ) and scenario.lower() in ["singlestream"]: power_metric = ( avg_power * power_duration * samples_per_query * 1000 / num_queries ) diff --git a/tools/submission/submission_checker/checks/system_check.py b/tools/submission/submission_checker/checks/system_check.py index 54746c0408..49b030c399 100644 --- a/tools/submission/submission_checker/checks/system_check.py +++ b/tools/submission/submission_checker/checks/system_check.py @@ -42,6 +42,8 @@ def __init__(self, log, path, config: Config, self.system_json = self.submission_logs.system_json self.submitter = self.submission_logs.loader_data.get("submitter", "") self.division = self.submission_logs.loader_data.get("division", "") + self.is_endpoints = self.submission_logs.loader_data.get( + "is_endpoints_submission", False) self.config = config self.setup_checks() @@ -58,6 +60,7 @@ def setup_checks(self): self.checks.append(self.required_fields_check) self.checks.append(self.submitter_check) self.checks.append(self.division_check) + self.apply_checks = set(self.checks) def missing_check(self): """Ensure the system JSON file was provided. diff --git a/tools/submission/submission_checker/constants.py b/tools/submission/submission_checker/constants.py index 2f4abd87f8..a4137eabf3 100644 --- a/tools/submission/submission_checker/constants.py +++ b/tools/submission/submission_checker/constants.py @@ -1604,6 +1604,8 @@ "server": "Queries/s", "interactive": "Queries/s", } + + POWER_UNIT_DICT = { "SingleStream": "millijoules", "MultiStream": "millijoules", @@ -1633,6 +1635,13 @@ "default": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/performance/run_1/mlperf_log_summary.txt", } +PERFORMANCE_ENDPOINTS_DIR = { + "v5.0": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/performance/run_1/", + "v5.1": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/performance/run_1/", + "v6.0": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/performance/run_1/", + "default": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/performance/run_1/", +} + ACCURACY_LOG_PATH = { "v5.0": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/accuracy/mlperf_log_detail.txt", "v5.1": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/accuracy/mlperf_log_detail.txt", @@ -1654,6 +1663,14 @@ "default": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/accuracy/mlperf_log_accuracy.json", } +ACCURACY_ENDPOINTS_DIR = { + "v5.0": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/accuracy/", + "v5.1": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/accuracy/", + "v6.0": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/accuracy/", + "default": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/accuracy/", +} + + POWER_DIR_PATH = { "v5.0": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/performance/power", "v5.1": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/performance/power", @@ -1748,3 +1765,93 @@ "v6.0": "{division}/{submitter}/src", "default": "{division}/{submitter}/src", } + +ENDPOINTS_MAPPINGS = { + "endpoints_version": "loadgen_version", + "endpoints_git_commit_date": "loadgen_git_commit_date", + "endpoints_git_commit_hash": "loadgen_git_commit_hash", + "test_datetime": "test_datetime", + "n_samples_issued": "qsl_reported_total_count", + "n_samples_from_dataset": "qsl_reported_performance_count", + "effective_scenario": "effective_scenario", + "mode": "effective_test_mode", + "streaming": "streaming", + "output_sequence_lengths.min": "min_output_tokens", + "output_sequence_lengths.max": "max_output_tokens", + "load_pattern": "load_pattern", + "min_duration_ms": "effective_min_duration_ms", + "max_duration_ms": "effective_max_duration_ms", + "effective_target_duration_ms": "effective_target_duration_ms", + "min_sample_count": "effective_min_query_count", + "effective_sample_index_rng_seed": "effective_sample_index_rng_seed", + "effective_schedule_rng_seed": "effective_schedule_rng_seed", + "effective_sample_concatenate_permutation": "effective_sample_concatenate_permutation", + "effective_samples_per_query": "effective_samples_per_query", + "generated_query_count": "generated_query_count", + "generated_query_duration": "generated_query_duration", + "target_qps": "effective_target_qps", + "result_scheduled_samples_per_sec": "result_scheduled_samples_per_sec", + "qps": "result_completed_samples_per_sec", + "effective_target_latency_ns": "effective_target_latency_ns", + "effective_target_latency_percentile": "effective_target_latency_percentile", + "latency.min": "result_min_latency_ns", + "latency.max": "result_max_latency_ns", + "latency.avg": "result_mean_latency_ns", + "latency.percentiles.50.0": "result_50.00_percentile_latency_ns", + "latency.percentiles.90.0": "result_90.00_percentile_latency_ns", + "latency.percentiles.95.0": "result_95.00_percentile_latency_ns", + "latency.percentiles.99.0": "result_99.00_percentile_latency_ns", + "latency.percentiles.99.9": "result_99.90_percentile_latency_ns", + "ttft.min": "result_first_token_min_latency_ns", + "ttft.max": "result_first_token_max_latency_ns", + "ttft.avg": "result_first_token_mean_latency_ns", + "ttft.percentiles.50.0": "result_first_token_50.00_percentile_latency_ns", + "ttft.percentiles.90.0": "result_first_token_90.00_percentile_latency_ns", + "ttft.percentiles.95.0": "result_first_token_95.00_percentile_latency_ns", + "ttft.percentiles.99.0": "result_first_token_99.00_percentile_latency_ns", + "ttft.percentiles.99.9": "result_first_token_99.90_percentile_latency_ns", + "tpot.percentiles.50.0": "result_time_per_output_token_50.00_percentile_ns", + "tpot.percentiles.90.0": "result_time_per_output_token_90.00_percentile_ns", + "tpot.percentiles.95.0": "result_time_per_output_token_95.00_percentile_ns", + "tpot.percentiles.99.0": "result_time_per_output_token_99.00_percentile_ns", + "tpot.percentiles.99.9": "result_time_per_output_token_99.90_percentile_ns", + "tpot.min": "result_time_per_output_token_min", + "tpot.max": "result_time_per_output_token_max", + "tpot.avg": "result_time_per_output_token_mean", + "tps": "result_completed_tokens_per_second", + "result.total": "result_query_count", + "result.failed": "num_errors", +} + + +# Maps endpoints field name (forwards.json key) to the dot-notation path +# inside config.yaml +ENDPOINTS_YAML_FIELD_MAP = { + "effective_scenario": "type", + "endpoints_version": "version", + "streaming": "model_params.streaming", + "load_pattern": "settings.load_pattern.type", + "min_duration_ms": "settings.runtime.min_duration_ms", + "max_duration_ms": "settings.runtime.max_duration_ms", + "effective_sample_index_rng_seed": "settings.runtime.dataloader_random_seed", + "effective_schedule_rng_seed": "settings.runtime.scheduler_random_seed", + "target_qps": "settings.load_pattern.target_qps", + "min_sample_count": "settings.runtime.n_samples_to_issue", +} + +# Alternative JSON paths for endpoints keys that don't directly match the +# JSON structure +ENDPOINTS_JSON_ALT_PATHS = { + "result.total": "results.total", + "result.failed": "results.failed", + "qps": "results.qps", + "generated_query_count": "n_samples_issued", + "generated_query_duration": "duration_ns", + "test_datetime": "test_started_at", + "endpoints_git_commit_hash": "git_sha", + "n_samples_from_dataset": "n_samples_issued", +} + +ENDPOINTS_INFERRED_FIELDS = { + "effective_accuracy_sample_count": "result_query_count" +} diff --git a/tools/submission/submission_checker/loader.py b/tools/submission/submission_checker/loader.py index 79cfdce73a..2c8fa592d8 100644 --- a/tools/submission/submission_checker/loader.py +++ b/tools/submission/submission_checker/loader.py @@ -2,6 +2,7 @@ from .constants import * from .utils import list_dir from .parsers.loadgen_parser import LoadgenParser +from .parsers.endpoints_parser import EndpointsParser from typing import Generator, Literal from .utils import * from .configuration.configuration import Config @@ -82,6 +83,12 @@ def __init__(self, root, version, config: Config) -> None: self.acc_json_path = os.path.join( self.root, ACCURACY_JSON_PATH.get( version, ACCURACY_JSON_PATH["default"])) + self.perf_endpoints_dir = os.path.join( + self.root, PERFORMANCE_ENDPOINTS_DIR.get( + version, PERFORMANCE_ENDPOINTS_DIR["default"])) + self.acc_endpoints_dir = os.path.join( + self.root, ACCURACY_ENDPOINTS_DIR.get( + version, ACCURACY_ENDPOINTS_DIR["default"])) self.system_log_path = os.path.join( self.root, SYSTEM_PATH.get( version, SYSTEM_PATH["default"])) @@ -182,7 +189,7 @@ def load_single_log(self, path, log_type: Literal["Performance", "Accuracy", accuracy results as line lists, etc. Args: - path (str): Filesystem path to the log file. + path (str or List[str]): Filesystem path to the log file. log_type (str): Type of log to load, determining parsing method. Returns: @@ -214,6 +221,22 @@ def load_single_log(self, path, log_type: Literal["Performance", "Accuracy", path) return log + def load_endpoints_logs(self, perf_dir, acc_dir): + perf_log = None + acc_log = None + if os.path.exists(acc_dir) and os.path.exists(perf_dir): + acc_log = EndpointsParser(acc_dir) + perf_log = EndpointsParser(perf_dir) + elif os.path.exists(perf_dir): + acc_log = EndpointsParser(perf_dir) + perf_log = EndpointsParser(perf_dir) + else: + self.logger.info( + "Could not load endpoints log from %s, path does not exist", + perf_dir + ) + return perf_log, acc_log + def check_scenarios(self, benchmark, model_mapping, system_type, scenarios): self.config.set_type(system_type) @@ -294,6 +317,18 @@ def load(self) -> Generator[SubmissionLogs, None, None]: system=system, benchmark=benchmark, scenario=scenario) + perf_endpoints_dir = self.perf_endpoints_dir.format( + division=division, + submitter=submitter, + system=system, + benchmark=benchmark, + scenario=scenario) + acc_endpoints_dir = self.acc_endpoints_dir.format( + division=division, + submitter=submitter, + system=system, + benchmark=benchmark, + scenario=scenario) acc_result_path = self.acc_result_path.format( division=division, submitter=submitter, @@ -388,7 +423,8 @@ def load(self) -> Generator[SubmissionLogs, None, None]: src_path = self.src_path.format( division=division, submitter=submitter) - # Load logs + # Load logs loadgen + is_endpoints_submission = False perf_log = self.load_single_log( perf_path, "Performance") acc_log = self.load_single_log( @@ -399,6 +435,11 @@ def load(self) -> Generator[SubmissionLogs, None, None]: acc_json_path, "AccuracyJSON") measurements_json = self.load_single_log( measurements_path, "Measurements") + if perf_log is None and acc_log is None: + is_endpoints_submission = True + perf_log, acc_log = self.load_endpoints_logs( + perf_endpoints_dir, acc_endpoints_dir + ) # Load test logs test01_perf_log = self.load_single_log( @@ -429,6 +470,7 @@ def load(self) -> Generator[SubmissionLogs, None, None]: "system": system, "benchmark": benchmark, "scenario": scenario, + "is_endpoints_submission": is_endpoints_submission, # Submission paths "perf_path": perf_path, "acc_path": acc_path, diff --git a/tools/submission/submission_checker/parsers/endpoints_parser.py b/tools/submission/submission_checker/parsers/endpoints_parser.py new file mode 100644 index 0000000000..9fc5ddfb7c --- /dev/null +++ b/tools/submission/submission_checker/parsers/endpoints_parser.py @@ -0,0 +1,260 @@ +import json +import logging +import os +import sys +import yaml + +from .base import BaseParser +from ..constants import ( + ENDPOINTS_YAML_FIELD_MAP, + ENDPOINTS_JSON_ALT_PATHS, + ENDPOINTS_MAPPINGS, + ENDPOINTS_INFERRED_FIELDS, +) + +_FIELDS_MAP_DIR = os.path.join( + os.path.dirname(__file__), + "..", + "helper", + "fields_map") +_SAMPLE_LOGS_DIR = os.path.join( + os.path.dirname(__file__), + "..", + "helper", + "sample_logs") + +_RESULT_SUMMARY_FILE = "result_summary.json" +_RESULTS_FILE = "results.json" +_CONFIG_FILES = ("config.yaml", "config.yml") + + +def _load_field_map(filename): + with open(os.path.join(_FIELDS_MAP_DIR, filename), "r", encoding="utf-8") as f: + return json.load(f) + + +def _get_nested(data, dotted_key): + """Navigate a nested dict using a dot-notation key. + + Uses a greedy left-to-right match so dotted numeric keys like '99.9' are + handled correctly: the longest matching key at each level wins. + + Also handles float-formatted integer keys: '50.0' resolves to key '50' + (common in the ENDPOINTS_MAPPINGS percentile entries). + """ + if not isinstance(data, dict): + return None + parts = dotted_key.split(".") + current = data + i = 0 + while i < len(parts): + if not isinstance(current, dict): + # Trailing '.0' on a float-formatted integer key: treat as + # consumed. + if parts[i:] == ["0"] and not isinstance(current, (dict, list)): + return current + return None + found = False + for j in range(len(parts), i, -1): + candidate = ".".join(parts[i:j]) + if candidate in current: + current = current[candidate] + i = j + found = True + break + if not found: + return None + if isinstance(current, (dict, list)) and not current: + return None + return current + + +def _resolve_value(stripped, summary_data, results_data, yaml_data): + """Look up a field in three data sources in priority order. + + Priority: result_summary.json > results.json > config.yaml + Within each JSON source, a direct dot-notation path is tried first, + then the alternative paths from ENDPOINTS_JSON_ALT_PATHS. + For the YAML source, the explicit path overrides in ENDPOINTS_YAML_FIELD_MAP + are tried first, then a direct dot-notation path. + """ + for data in (summary_data, results_data): + value = _get_nested(data, stripped) + if value is None and stripped in ENDPOINTS_JSON_ALT_PATHS: + value = _get_nested(data, ENDPOINTS_JSON_ALT_PATHS[stripped]) + if value is not None: + return value + + # YAML: explicit path map first, then direct + if stripped in ENDPOINTS_YAML_FIELD_MAP: + value = _get_nested(yaml_data, ENDPOINTS_YAML_FIELD_MAP[stripped]) + if value is not None: + return value + return _get_nested(yaml_data, stripped) + + +class EndpointsParser(BaseParser): + def __init__(self, run_dir): + """ + run_dir: path to the run directory containing: + - result_summary.json (highest priority) + - results.json + - config.yaml / config.yml (lowest priority) + """ + super().__init__(run_dir) + + self.logger = logging.getLogger("MLPerfLog") + self.messages = {} + + summary_data = self._load_json( + os.path.join(run_dir, _RESULT_SUMMARY_FILE)) + results_data = self._load_json(os.path.join(run_dir, _RESULTS_FILE)) + yaml_data = self._load_yaml(run_dir) + + for endpoints_key, loadgen_key in ENDPOINTS_MAPPINGS.items(): + stripped = endpoints_key.strip() + value = _resolve_value( + stripped, summary_data, results_data, yaml_data) + if value is not None: + self.messages.setdefault(loadgen_key, []).append( + {"key": loadgen_key, "value": value} + ) + + self.keys = set(self.messages.keys()) + + # Inferred fields: copy the value of one loadgen key to another + for inferred_key, source_key in ENDPOINTS_INFERRED_FIELDS.items(): + value = self[source_key] + if value is not None: + self.messages.setdefault(inferred_key, []).append( + {"key": inferred_key, "value": value} + ) + + # Infer QPS from count / duration when not directly available + duration_ns = self["generated_query_duration"] + count = self["generated_query_count"] + scenario = self["effective_scenario"] + if duration_ns and count and scenario: + qps_key = ( + "result_samples_per_second" + if scenario.lower() == "offline" + else "result_completed_samples_per_sec" + ) + if qps_key not in self.messages: + qps = count / (duration_ns / 1e9) + self.messages[qps_key] = [{"key": qps_key, "value": qps}] + + # Expose accuracy scores stored in results.json + for result in results_data.get("accuracy_scores", {}).values(): + score = result.get("score") + if score is not None: + self.messages.setdefault("accuracy_score", []).append( + {"key": "accuracy_score", "value": score} + ) + + self.keys = set(self.messages.keys()) + self.logger.info("Successfully loaded endpoints log from %s.", run_dir) + + def _load_json(self, path): + try: + with open(path, "r", encoding="utf-8") as f: + return json.load(f) + except BaseException: + self.logger.error("Could not load json file from %s", path) + return {} + return {} + + def _load_yaml(self, run_dir): + for name in _CONFIG_FILES: + path = os.path.join(run_dir, name) + if os.path.exists(path): + try: + with open(path, "r", encoding="utf-8") as f: + return yaml.safe_load(f) or {} + except BaseException: + pass + self.logger.error("Yaml file not found in directory %s", run_dir) + return {} + + def __getitem__(self, key): + if key not in self.keys: + return None + results = self.messages[key] + if len(results) > 1: + self.logger.warning( + "Multiple messages with key %s in the log. Empirically choosing the first one.", + key, + ) + return results[0]["value"] + + def get(self, key): + return self[key] + + def get_messages(self): + return self.messages + + def get_keys(self): + return self.keys + + def num_errors(self): + return self["num_errors"] + + def has_error(self): + return self.num_errors() != 0 + + +def main(): + logging.basicConfig( + level=logging.INFO, + format="[%(asctime)s %(filename)s:%(lineno)d %(levelname)s] %(message)s", + ) + logger = logging.getLogger("main") + + backwards_map = _load_field_map("backwards.json") + + # Collect all run directories (those containing at least one JSON and one + # YAML) + run_dirs = [] + for root, _dirs, files in os.walk(_SAMPLE_LOGS_DIR): + has_json = any(f.endswith(".json") for f in files) + has_yaml = any(f.endswith(".yaml") or f.endswith(".yml") + for f in files) + if has_json and has_yaml: + run_dirs.append(root) + + if not run_dirs: + logger.error("No run directories found under %s.", _SAMPLE_LOGS_DIR) + return 1 + + for run_dir in sorted(run_dirs): + folder = os.path.relpath(run_dir, _SAMPLE_LOGS_DIR) + print(f"\n{'=' * 70}") + print(f"Directory: {folder}") + print(f"{'=' * 70}") + + parser = EndpointsParser(run_dir) + + found = [] + not_found = [] + for loadgen_key, endpoints_key in backwards_map.items(): + value = parser[loadgen_key] + if value is not None: + found.append((loadgen_key, endpoints_key, value)) + else: + not_found.append((loadgen_key, endpoints_key)) + + total = len(backwards_map) + print(f"\nFound ({len(found)}/{total}):") + for loadgen_key, endpoints_key, value in found: + print(f" {loadgen_key:<55} = {value}") + + print(f"\nNot found ({len(not_found)}/{total}):") + for loadgen_key, endpoints_key in not_found: + label = endpoints_key if endpoints_key != "None" else "(no endpoints mapping)" + print(f" {loadgen_key:<55} [{label}]") + + return 0 + + +if __name__ == "__main__": + sys.exit(main())