mlcommons · pgmpablo157321 · Jun 15, 2026 · Jun 16, 2026 · Jun 16, 2026 · Jun 17, 2026
@@ -32,7 +32,7 @@ The input submission directory is modified with empty directories removed and lo
 ## `submission_checker/main.py` (Mandatory)
 ### Inputs
 **input**: Path to the directory containing one or several submissions.<br>
-**version**: Checker version. E.g v1.1, v2.0, v2.1, v3.0, v3.1. <br>
+**version**: Checker version. E.g. v5.0, v5.1, v6.0, v6.1. <br>
 **submitter**: Filter submitters and only run the checks for some specific submitter. <br>
 **csv**: Output path where the csv with the results will be stored. E.g. `results/summary.csv`. <br>
 **skip_compliance**: Flag to skip compliance checks. <br>
@@ -71,25 +71,34 @@ python3 -m inference.tools.submission.submission_checker.main
     [--skip-calibration-check]
 ```
 
-### implemented checks
-**performance:**
+### Implemented checks
+**performance (loadgen):**
 - Check performance detailed log exists
 - Check for loadgen errors
 - Check for equal issue mode when it is required
 - Check the performance sample count used for running the benchmark
 - Check loadgen seeds are correct
-- Check latency constrain is met
-- Check minimun query count is met
-- Check minimun duration is met
+- Check latency constraint is met
+- Check minimum query count is met
+- Check minimum duration is met
 - Check network requirements
-- Check LLM latencies are met (if applies)
+- Check LLM TTFT/TPOT latencies are met via the `use_token_latencies` flag (if applicable)
 - Check loadgen scenario matches with submission scenario or that result can be inferred
 
+**performance (endpoints):**
+- Check result_summary.json and config.yaml exist
+- Check latency p99 constraint is met (from `latency.percentiles.99.0` in result_summary.json)
+- Check minimum duration is met (from `settings.runtime.min_duration_ms` in config.yaml)
+- Check LLM TTFT/TPOT p99 limits directly from result_summary.json for Server/Interactive scenarios
+- Extract primary metric as QPS (inferred from `n_samples_issued / duration_s` if not in results)
+- Skips: sample count check, min query count check, the `qsl_rng_seed` comparison within the seed check, and the early-stopping check (not applicable to endpoints)
+
 **accuracy**
 - Check the accuracy metric is correct and over the expected threshold (or within a range, if applicable)
-- Check accuracy json exists and is truncated
+- Check accuracy json exists and is truncated (loadgen only)
 - Check for loadgen error
 - Check full dataset is used for the accuracy run
+- Check `accuracy_scores` field is present and non-null (endpoints only)
 
 **compliance**
 - Check compliance directory exists
@@ -112,11 +121,10 @@ python3 -m inference.tools.submission.submission_checker.main
 - Check availability is valid
 - Check system type is valid
 - Check network fields
-- Check required fields are include in system json file
+- Check required fields are included in system json file
 - Check submitter is correct
 - Check division is correct
 
-
 ### Outputs
 - CSV file containing all the valid results in the directory.
 - It raises several errors and logs invalid results.

@@ -70,6 +70,8 @@ def __init__(
             "scenario", "")
         self.scenario = self.mlperf_log["effective_scenario"]
         self.division = self.submission_logs.loader_data.get("division", "")
+        self.is_endpoints = self.submission_logs.loader_data.get(
+            "is_endpoints_submission", False)
         self.setup_checks()
 
     def setup_checks(self):
@@ -83,6 +85,9 @@ def setup_checks(self):
         self.checks.append(self.loadgen_errors_check)
         self.checks.append(self.dataset_check)
         self.checks.append(self.extra_files_check)
+        self.apply_checks = set(self.checks)
+        if self.is_endpoints:
+            self.apply_checks.remove(self.accuracy_json_check)
 
     def accuracy_result_check(self):
         """Validate reported accuracy metrics in `accuracy.txt`.
@@ -97,6 +102,13 @@ def accuracy_result_check(self):
                 False otherwise.
         """
 
+        if self.is_endpoints:
+            if self.mlperf_log["accuracy_score"] is not None:
+                self.submission_logs.loader_data["accuracy_metrics"] = self.mlperf_log["accuracy_score"]
+                return True
+            self.log.error("%s accuracy score not found", self.path)
+            return False
+
         patterns, acc_targets, acc_types, acc_limits, up_patterns, acc_upper_limit = self.config.get_accuracy_values(
             self.model
         )

@@ -9,6 +9,7 @@ class BaseCheck(ABC):
 
     def __init__(self, log, path):
         self.checks = []
+        self.apply_checks = set()
         self.log = log
         self.path = path
         self.name = "base checks"
@@ -21,22 +22,32 @@ def run_checks(self):
         valid = True
         errors = []
         for check in self.checks:
-            try:
-                v = self.execute(check)
-                valid &= v
-            except BaseException:
-                valid &= False
-                self.log.error(
-                    "Execution occurred in running check %s. Running %s in %s",
-                    self.path,
+            if self.check_applies(check):
+                try:
+                    v = self.execute(check)
+                    valid &= v
+                except BaseException:
+                    valid &= False
+                    self.log.error(
+                        "Execution occurred in running check %s. Running %s in %s",
+                        self.path,
+                        check.__name__,
+                        self.__class__.__name__)
+            else:
+                self.log.warning(
+                    "Execution of check %s skipped for %s.",
                     check.__name__,
-                    self.__class__.__name__)
+                    self.path
+                )
         return valid
 
     def execute(self, check):
         """Custom execution of a single check method."""
         return check()
 
+    def check_applies(self, fn):
+        return fn in self.apply_checks
+
     def __call__(self):
         """Allows the check instance to be called like a function."""
         self.log.info("Starting %s for: %s", self.name, self.path)

@@ -50,6 +50,8 @@ def __init__(self, log, path, config: Config,
         self.model = self.config.get_mlperf_model(
             self.model, self.model_mapping)
         self.test_list = self.get_test_list(self.model)
+        self.is_endpoints = self.submission_logs.loader_data.get(
+            "is_endpoints_submission", False)
         self.setup_checks()
 
     def setup_checks(self):
@@ -62,6 +64,10 @@ def setup_checks(self):
         self.checks.append(self.performance_check)
         self.checks.append(self.accuracy_check)
         self.checks.append(self.compliance_performance_check)
+        self.apply_checks = set(self.checks)
+        # No compliance tests for endpoints for now
+        if self.is_endpoints:
+            self.apply_checks = set()
 
     def get_test_list(self, model):
         """Return the list of compliance tests applicable to `model`.
@@ -186,6 +192,7 @@ def performance_check(self):
                     "model_mapping": self.submission_logs.loader_data.get("model_mapping", {}),
                     "check_scenarios": True,
                     "compliance_skip": True,
+                    "is_endpoints_submission": self.submission_logs.loader_data.get("is_endpoints_submission", False),
                 }
                 test_logs = SubmissionLogs(
                     self.submission_logs.loader_data[f"{test}_perf_log"], None, None, None, self.submission_logs.system_json, None, test_data)
@@ -322,7 +329,9 @@ def accuracy_check(self):
                     first_token_pass and eos_pass and length_check_pass)
                 if not is_valid:
                     self.log.error(
-                        f"TEST06 accuracy check failed. first_token_check: {first_token_pass} eos_check: {eos_pass} length_check: {length_check_pass}."
+                        f"TEST06 accuracy check failed. first_token_check:" +
+                        f"{first_token_pass} eos_check: " +
+                        f"{eos_pass} length_check: {length_check_pass}."
                     )
             elif test == "TEST07":
                 # TEST07: Verify accuracy in performance mode

@@ -61,6 +61,9 @@ def setup_checks(self):
         self.checks.append(self.directory_exist_check)
         self.checks.append(self.required_files_check)
         self.checks.append(self.required_fields_check)
+        self.is_endpoints = self.submission_logs.loader_data.get(
+            "is_endpoints_submission", False)
+        self.apply_checks = set(self.checks)
 
     def missing_check(self):
         """Ensure a measurements JSON was provided.

@@ -53,6 +53,11 @@ def __init__(self, log, path, config: Config,
             "scenario", "")
         self.scenario = self.mlperf_log["effective_scenario"]
         self.division = self.submission_logs.loader_data.get("division", "")
+        self.is_endpoints = self.submission_logs.loader_data.get(
+            "is_endpoints_submission", False)
+        if self.is_endpoints:
+            if self.scenario.lower() == "online":
+                self.scenario = "Server"
         self.setup_checks()
 
     def setup_checks(self):
@@ -74,6 +79,10 @@ def setup_checks(self):
         self.checks.append(self.llm_check)
         self.checks.append(self.inferred_check)
         self.checks.append(self.get_performance_metric_check)
+        self.apply_checks = set(self.checks)
+        if self.is_endpoints:
+            self.apply_checks.remove(self.performance_sample_count_check)
+            self.apply_checks.remove(self.min_query_count_check)
 
     def missing_check(self):
         """Ensure the performance log was provided.
@@ -200,14 +209,16 @@ def seeds_check(self):
         sample_index_rng_seed = self.mlperf_log["effective_sample_index_rng_seed"]
         schedule_rng_seed = self.mlperf_log["effective_schedule_rng_seed"]
         is_valid = True
-        if qsl_rng_seed != config_seeds["qsl_rng_seed"]:
-            self.log.error(
-                "%s qsl_rng_seed is wrong, expected=%s, found=%s",
-                self.path,
-                config_seeds["qsl_rng_seed"],
-                qsl_rng_seed,
-            )
-            is_valid = False
+        if not self.is_endpoints:
+            # This seed does not exist for endpoints runs
+            if qsl_rng_seed != config_seeds["qsl_rng_seed"]:
+                self.log.error(
+                    "%s qsl_rng_seed is wrong, expected=%s, found=%s",
+                    self.path,
+                    config_seeds["qsl_rng_seed"],
+                    qsl_rng_seed,
+                )
+                is_valid = False
         if sample_index_rng_seed != config_seeds["sample_index_rng_seed"]:
             self.log.error(
                 "%s sample_index_rng_seed is wrong, expected=%s, found=%s",
@@ -237,7 +248,7 @@ def latency_check(self):
             bool: True if latency constraints are satisfied, False otherwise.
         """
         uses_early_stopping = self.config.uses_early_stopping(self.scenario)
-        if uses_early_stopping:
+        if uses_early_stopping and not self.is_endpoints:
             # check if early_stopping condition was met
             if not self.mlperf_log["early_stopping_met"]:
                 early_stopping_result = self.mlperf_log["early_stopping_result"]
@@ -386,7 +397,9 @@ def network_check(self):
             # (must include "Network SUT" in name)
             if NETWORK_MODE_REQUIRED_SUBSTRING_IN_SUT_NAME not in sut_name:
                 self.log.error(
-                    f"{self.path} invalid sut name for network mode. expecting the substring '{NETWORK_MODE_REQUIRED_SUBSTRING_IN_SUT_NAME}' got '{sut_name}'"
+                    f"{self.path} invalid sut name for network mode." +
+                    f"expecting the substring '{NETWORK_MODE_REQUIRED_SUBSTRING_IN_SUT_NAME}'" +
+                    f" got '{sut_name}'"
                 )
                 return False
 
@@ -403,6 +416,25 @@ def llm_check(self):
                 False otherwise.
         """
         if self.model in self.config.get_llm_models():
+            if self.is_endpoints:
+                # Endpoints don't use the loadgen use_token_latencies flag;
+                # check TTFT/TPOT directly from the endpoints result JSON.
+                if self.scenario not in ["Server", "Interactive"]:
+                    return True
+                limits = LLM_LATENCY_LIMITS[self.model][self.scenario]
+                ttft = self.mlperf_log["result_first_token_99.00_percentile_latency_ns"]
+                tpot = self.mlperf_log["result_time_per_output_token_99.00_percentile_ns"]
+                if ttft is None or tpot is None:
+                    self.log.warning(
+                        "%s TTFT or TPOT percentile data missing for endpoints LLM check",
+                        self.path)
+                    return True
+                if ttft < limits["ttft"] and tpot < limits["tpot"]:
+                    return True
+                self.log.error(
+                    'Failed extra check for TTFT and TPOT. Obtained: TTFT 99-tile: %.4f, TPOT 99-tile: %.4f. Required: TTFT 99-tile: %.4f, TPOT 99-tile: %.4f',
+                    ttft, tpot, limits["ttft"], limits["tpot"])
+                return False
             if self.mlperf_log["requested_use_token_latencies"]:
                 if self.scenario not in ["Server", "Interactive"]:
                     # For offline, singlestream and multistream no further checks are
@@ -457,7 +489,7 @@ def inferred_check(self):
                 ("singlestream", "offline")
             ]
             if (self.scenario.lower(), self.scenario_fixed.lower()
-                    ) not in list_inferred:
+                ) not in list_inferred:
                 self.log.error(
                     "Result for scenario %s can not be inferred from %s for: %s",
                     self.scenario_fixed,
@@ -485,10 +517,15 @@ def get_performance_metric_check(self):
         ):
             is_valid = True
         scenario = self.mlperf_log["effective_scenario"]
+        if self.is_endpoints:
+            if scenario.lower() == "online":
+                scenario = "Server"
+            scenario = scenario.capitalize()
 
         res = float(self.mlperf_log[RESULT_FIELD_NEW[version][scenario]])
         if (
-            version in RESULT_FIELD_BENCHMARK_OVERWRITE
+            not self.is_endpoints
+            and version in RESULT_FIELD_BENCHMARK_OVERWRITE
             and self.model in RESULT_FIELD_BENCHMARK_OVERWRITE[version]
             and scenario in RESULT_FIELD_BENCHMARK_OVERWRITE[version][self.model]
         ):
@@ -548,12 +585,12 @@ def get_inferred_result(self, res):
             res = qps_wo_loadgen_overhead
 
         if (scenario_fixed in ["Offline"]
-            ) and scenario in ["MultiStream"]:
+                ) and scenario in ["MultiStream"]:
             inferred = True
             res = samples_per_query * S_TO_MS / (latency_mean / MS_TO_NS)
 
         if (scenario_fixed in ["MultiStream"]
-            ) and scenario in ["SingleStream"]:
+                ) and scenario in ["SingleStream"]:
             inferred = True
             # samples_per_query does not match with the one reported in the logs
             # when inferring MultiStream from SingleStream
@@ -570,6 +607,6 @@ def get_inferred_result(self, res):
             else:
                 res = (latency_99_percentile * samples_per_query) / MS_TO_NS
         if (scenario_fixed in ["Interactive"]
-            ) and scenario not in ["Server"]:
+                ) and scenario not in ["Server"]:
             is_valid = False
         return res, is_valid
@@ -68,6 +68,9 @@ def setup_checks(self):
         self.checks.append(self.required_files_check)
         self.checks.append(self.external_power_check)
         self.checks.append(self.get_power_metric_check)
+        self.is_endpoints = self.submission_logs.loader_data.get(
+            "is_endpoints_submission", False)
+        self.apply_checks = set(self.checks)
 
     def required_files_check(self):
         """Verify required files exist in power-related directories.
@@ -226,7 +229,7 @@ def get_power_metric_check(self):
                     samples_per_query = 8
 
                 if (self.scenario_fixed.lower() in ["multistream"]
-                    ) and scenario.lower() in ["singlestream"]:
+                        ) and scenario.lower() in ["singlestream"]:
                     power_metric = (
                         avg_power * power_duration * samples_per_query * 1000 / num_queries
                     )

@@ -42,6 +42,8 @@ def __init__(self, log, path, config: Config,
         self.system_json = self.submission_logs.system_json
         self.submitter = self.submission_logs.loader_data.get("submitter", "")
         self.division = self.submission_logs.loader_data.get("division", "")
+        self.is_endpoints = self.submission_logs.loader_data.get(
+            "is_endpoints_submission", False)
         self.config = config
         self.setup_checks()
 
@@ -58,6 +60,7 @@ def setup_checks(self):
         self.checks.append(self.required_fields_check)
         self.checks.append(self.submitter_check)
         self.checks.append(self.division_check)
+        self.apply_checks = set(self.checks)
 
     def missing_check(self):
         """Ensure the system JSON file was provided.