From 325ae0de33dace49b48a49dd50049b462744e366 Mon Sep 17 00:00:00 2001
From: Pablo Gonzalez <pablo.gonzalez@factored.ai>
Date: Mon, 15 Jun 2026 12:01:22 -0500
Subject: [PATCH 1/7] Loadgen++/endpoints integration with submission checker

---
 .../checks/accuracy_check.py                  |   1 +
 .../submission_checker/checks/base.py         |  29 ++-
 .../checks/compliance_check.py                |   4 +-
 .../checks/measurements_checks.py             |   1 +
 .../checks/performance_check.py               |  12 +-
 .../submission_checker/checks/power_check.py  |   1 +
 .../submission_checker/checks/system_check.py |   1 +
 .../submission_checker/constants.py           | 122 +++++++++++
 tools/submission/submission_checker/loader.py |  58 ++++-
 .../parsers/endpoints_parser.py               | 198 ++++++++++++++++++
 10 files changed, 409 insertions(+), 18 deletions(-)
 create mode 100644 tools/submission/submission_checker/parsers/endpoints_parser.py

diff --git a/tools/submission/submission_checker/checks/accuracy_check.py b/tools/submission/submission_checker/checks/accuracy_check.py
index db1b1a7559..df315e9498 100644
--- a/tools/submission/submission_checker/checks/accuracy_check.py
+++ b/tools/submission/submission_checker/checks/accuracy_check.py
@@ -83,6 +83,7 @@ def setup_checks(self):
         self.checks.append(self.loadgen_errors_check)
         self.checks.append(self.dataset_check)
         self.checks.append(self.extra_files_check)
+        self.apply_checks = set(self.checks)
 
     def accuracy_result_check(self):
         """Validate reported accuracy metrics in `accuracy.txt`.
diff --git a/tools/submission/submission_checker/checks/base.py b/tools/submission/submission_checker/checks/base.py
index 8e2a678fb9..69f51da638 100644
--- a/tools/submission/submission_checker/checks/base.py
+++ b/tools/submission/submission_checker/checks/base.py
@@ -9,6 +9,7 @@ class BaseCheck(ABC):
 
     def __init__(self, log, path):
         self.checks = []
+        self.apply_checks = set()
         self.log = log
         self.path = path
         self.name = "base checks"
@@ -21,22 +22,32 @@ def run_checks(self):
         valid = True
         errors = []
         for check in self.checks:
-            try:
-                v = self.execute(check)
-                valid &= v
-            except BaseException:
-                valid &= False
-                self.log.error(
-                    "Execution occurred in running check %s. Running %s in %s",
-                    self.path,
+            if self.check_applies(check):
+                try:
+                    v = self.execute(check)
+                    valid &= v
+                except Exception:
+                    valid &= False
+                    self.log.error(
+                        "Exception occurred in running check %s. Running %s in %s",
+                        self.path,
+                        check.__name__,
+                        self.__class__.__name__)
+            else:
+                self.log.warning(
+                    "Execution of check %s skipped for %s.",
                     check.__name__,
-                    self.__class__.__name__)
+                    self.path
+                )
         return valid
 
     def execute(self, check):
         """Custom execution of a single check method."""
         return check()
 
+    def check_applies(self, fn):
+        return fn in self.apply_checks
+
     def __call__(self):
         """Allows the check instance to be called like a function."""
         self.log.info("Starting %s for: %s", self.name, self.path)
diff --git a/tools/submission/submission_checker/checks/compliance_check.py b/tools/submission/submission_checker/checks/compliance_check.py
index 13cc6b16b8..566d5f5eb5 100644
--- a/tools/submission/submission_checker/checks/compliance_check.py
+++ b/tools/submission/submission_checker/checks/compliance_check.py
@@ -62,6 +62,7 @@ def setup_checks(self):
         self.checks.append(self.performance_check)
         self.checks.append(self.accuracy_check)
         self.checks.append(self.compliance_performance_check)
+        self.apply_checks = set(self.checks)
 
     def get_test_list(self, model):
         """Return the list of compliance tests applicable to `model`.
@@ -322,7 +323,8 @@ def accuracy_check(self):
                     first_token_pass and eos_pass and length_check_pass)
                 if not is_valid:
                     self.log.error(
-                        f"TEST06 accuracy check failed. first_token_check: {first_token_pass} eos_check: {eos_pass} length_check: {length_check_pass}."
+                        f"TEST06 accuracy check failed. first_token_check: {first_token_pass} "
+                        f"eos_check: {eos_pass} length_check: {length_check_pass}."
                     )
             elif test == "TEST07":
                 # TEST07: Verify accuracy in performance mode
diff --git a/tools/submission/submission_checker/checks/measurements_checks.py b/tools/submission/submission_checker/checks/measurements_checks.py
index 06b89f56fc..c8cdbc7234 100644
--- a/tools/submission/submission_checker/checks/measurements_checks.py
+++ b/tools/submission/submission_checker/checks/measurements_checks.py
@@ -61,6 +61,7 @@ def setup_checks(self):
         self.checks.append(self.directory_exist_check)
         self.checks.append(self.required_files_check)
         self.checks.append(self.required_fields_check)
+        self.apply_checks = set(self.checks)
 
     def missing_check(self):
         """Ensure a measurements JSON was provided.
diff --git a/tools/submission/submission_checker/checks/performance_check.py b/tools/submission/submission_checker/checks/performance_check.py
index e54e7b5564..3c63794f0f 100644
--- a/tools/submission/submission_checker/checks/performance_check.py
+++ b/tools/submission/submission_checker/checks/performance_check.py
@@ -74,6 +74,7 @@ def setup_checks(self):
         self.checks.append(self.llm_check)
         self.checks.append(self.inferred_check)
         self.checks.append(self.get_performance_metric_check)
+        self.apply_checks = set(self.checks)
 
     def missing_check(self):
         """Ensure the performance log was provided.
@@ -386,7 +387,8 @@ def network_check(self):
             # (must include "Network SUT" in name)
             if NETWORK_MODE_REQUIRED_SUBSTRING_IN_SUT_NAME not in sut_name:
                 self.log.error(
-                    f"{self.path} invalid sut name for network mode. expecting the substring '{NETWORK_MODE_REQUIRED_SUBSTRING_IN_SUT_NAME}' got '{sut_name}'"
+                    f"{self.path} invalid sut name for network mode. expecting the substring "
+                    f"'{NETWORK_MODE_REQUIRED_SUBSTRING_IN_SUT_NAME}' got '{sut_name}'"
                 )
                 return False
 
@@ -457,7 +459,7 @@ def inferred_check(self):
                 ("singlestream", "offline")
             ]
             if (self.scenario.lower(), self.scenario_fixed.lower()
-                    ) not in list_inferred:
+                ) not in list_inferred:
                 self.log.error(
                     "Result for scenario %s can not be inferred from %s for: %s",
                     self.scenario_fixed,
@@ -548,12 +550,12 @@ def get_inferred_result(self, res):
             res = qps_wo_loadgen_overhead
 
         if (scenario_fixed in ["Offline"]
-            ) and scenario in ["MultiStream"]:
+                ) and scenario in ["MultiStream"]:
             inferred = True
             res = samples_per_query * S_TO_MS / (latency_mean / MS_TO_NS)
 
         if (scenario_fixed in ["MultiStream"]
-            ) and scenario in ["SingleStream"]:
+                ) and scenario in ["SingleStream"]:
             inferred = True
             # samples_per_query does not match with the one reported in the logs
             # when inferring MultiStream from SingleStream
@@ -570,6 +572,6 @@ def get_inferred_result(self, res):
             else:
                 res = (latency_99_percentile * samples_per_query) / MS_TO_NS
         if (scenario_fixed in ["Interactive"]
-            ) and scenario not in ["Server"]:
+                ) and scenario not in ["Server"]:
             is_valid = False
         return res, is_valid
diff --git a/tools/submission/submission_checker/checks/power_check.py b/tools/submission/submission_checker/checks/power_check.py
index d3519a3503..499768a892 100644
--- a/tools/submission/submission_checker/checks/power_check.py
+++ b/tools/submission/submission_checker/checks/power_check.py
@@ -68,6 +68,7 @@ def setup_checks(self):
         self.checks.append(self.required_files_check)
         self.checks.append(self.external_power_check)
         self.checks.append(self.get_power_metric_check)
+        self.apply_checks = set(self.checks)
 
     def required_files_check(self):
         """Verify required files exist in power-related directories.
diff --git a/tools/submission/submission_checker/checks/system_check.py b/tools/submission/submission_checker/checks/system_check.py
index 54746c0408..8ab811bb7a 100644
--- a/tools/submission/submission_checker/checks/system_check.py
+++ b/tools/submission/submission_checker/checks/system_check.py
@@ -58,6 +58,7 @@ def setup_checks(self):
         self.checks.append(self.required_fields_check)
         self.checks.append(self.submitter_check)
         self.checks.append(self.division_check)
+        self.apply_checks = set(self.checks)
 
     def missing_check(self):
         """Ensure the system JSON file was provided.
diff --git a/tools/submission/submission_checker/constants.py b/tools/submission/submission_checker/constants.py
index 2f4abd87f8..e573b6e18c 100644
--- a/tools/submission/submission_checker/constants.py
+++ b/tools/submission/submission_checker/constants.py
@@ -1604,6 +1604,8 @@
     "server": "Queries/s",
     "interactive": "Queries/s",
 }
+
+
 POWER_UNIT_DICT = {
     "SingleStream": "millijoules",
     "MultiStream": "millijoules",
@@ -1633,6 +1635,20 @@
     "default": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/performance/run_1/mlperf_log_summary.txt",
 }
 
+PERFORMANCE_ENDPOINTS_PATH = {
+    "v5.0": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/performance/run_1/result_summary.json",
+    "v5.1": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/performance/run_1/result_summary.json",
+    "v6.0": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/performance/run_1/result_summary.json",
+    "default": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/performance/run_1/result_summary.json",
+}
+
+PERFORMANCE_CONFIG_ENDPOINTS_PATH = {
+    "v5.0": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/performance/run_1/config.yaml",
+    "v5.1": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/performance/run_1/config.yaml",
+    "v6.0": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/performance/run_1/config.yaml",
+    "default": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/performance/run_1/config.yaml",
+}
+
 ACCURACY_LOG_PATH = {
     "v5.0": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/accuracy/mlperf_log_detail.txt",
     "v5.1": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/accuracy/mlperf_log_detail.txt",
@@ -1654,6 +1670,20 @@
     "default": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/accuracy/mlperf_log_accuracy.json",
 }
 
+ACCURACY_ENDPOINTS_PATH = {
+    "v5.0": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/accuracy/results.json",
+    "v5.1": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/accuracy/results.json",
+    "v6.0": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/accuracy/results.json",
+    "default": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/accuracy/results.json",
+}
+
+ACCURACY_CONFIG_ENDPOINTS_PATH = {
+    "v5.0": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/accuracy/config.yaml",
+    "v5.1": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/accuracy/config.yaml",
+    "v6.0": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/accuracy/config.yaml",
+    "default": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/accuracy/config.yaml",
+}
+
 POWER_DIR_PATH = {
     "v5.0": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/performance/power",
     "v5.1": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/performance/power",
@@ -1748,3 +1778,95 @@
     "v6.0": "{division}/{submitter}/src",
     "default": "{division}/{submitter}/src",
 }
+
+ENDPOINTS_MAPPINGS = {
+    "endpoints_version": "loadgen_version",
+    "endpoints_git_commit_date": "loadgen_git_commit_date",
+    "endpoints_git_commit_hash": "loadgen_git_commit_hash",
+    "test_datetime": "test_datetime",
+    "n_samples_issued": "qsl_reported_total_count",
+    "n_samples_from_dataset": "qsl_reported_performance_count",
+    "effective_scenario": "effective_scenario",
+    "mode": "effective_test_mode",
+    "streaming": "streaming",
+    "output_sequence_lengths.min": "min_output_tokens",
+    "output_sequence_lengths.max": "max_output_tokens",
+    "load_pattern": "load_pattern",
+    "min_duration_ms": "effective_min_duration_ms",
+    "max_duration_ms": "effective_max_duration_ms",
+    "effective_target_duration_ms": "effective_target_duration_ms",
+    "min_sample_count": "effective_min_query_count",
+    "effective_sample_index_rng_seed": "effective_sample_index_rng_seed",
+    "effective_schedule_rng_seed": "effective_schedule_rng_seed",
+    "min_sample_count (runtime_settings.json)": "effective_min_sample_count",
+    "effective_sample_concatenate_permutation": "effective_sample_concatenate_permutation",
+    "effective_samples_per_query": "effective_samples_per_query",
+    "generated_query_count": "generated_query_count",
+    "generated_query_duration": "generated_query_duration",
+    "target_qps (results_summary.json)": "effective_target_qps",
+    "result_scheduled_samples_per_sec": "result_scheduled_samples_per_sec",
+    "qps ": "result_completed_samples_per_sec",
+    "effective_target_latency_ns": "effective_target_latency_ns",
+    "effective_target_latency_percentile": "effective_target_latency_percentile",
+    "latency.min ": "result_min_latency_ns",
+    "latency.max": "result_max_latency_ns",
+    "latency.avg": "result_mean_latency_ns",
+    "latency.percentiles.50": "result_50.00_percentile_latency_ns",
+    "latency.percentiles.90": "result_90.00_percentile_latency_ns",
+    "latency.percentiles.95": "result_95.00_percentile_latency_ns",
+    "latency.percentiles.99": "result_99.00_percentile_latency_ns",
+    "latency.percentiles.99.9": "result_99.90_percentile_latency_ns",
+    "ttft.min": "result_first_token_min_latency_ns",
+    "ttft.max": "result_first_token_max_latency_ns",
+    "ttft.avg": "result_first_token_mean_latency_ns",
+    "ttft.percentiles.50": "result_first_token_50.00_percentile_latency_ns",
+    "ttft.percentiles.90": "result_first_token_90.00_percentile_latency_ns",
+    "ttft.percentiles.95": "result_first_token_95.00_percentile_latency_ns",
+    "ttft.percentiles.99": "result_first_token_99.00_percentile_latency_ns",
+    "ttft.percentiles.99.9": "result_first_token_99.90_percentile_latency_ns",
+    "tpot.percentiles.50": "result_time_per_output_token_50.00_percentile_ns",
+    "tpot.percentiles.90": "result_time_per_output_token_90.00_percentile_ns",
+    "tpot.percentiles.95": "result_time_per_output_token_95.00_percentile_ns",
+    "tpot.percentiles.99": "result_time_per_output_token_99.00_percentile_ns",
+    "tpot.percentiles.99.9": "result_time_per_output_token_99.90_percentile_ns",
+    "tpot.min": "result_time_to_output_token_min",
+    "tpot.max": "result_time_to_output_token_max",
+    "tpot.avg": "result_time_to_output_token_mean",
+    "tps": "result_completed_tokens_per_second",
+    "result.total": "result_query_count",
+    "result.failed": "num_errors"
+}
+
+
+# Maps endpoints field name (forwards.json key) to the dot-notation path
+# inside config.yaml
+ENDPOINTS_YAML_FIELD_MAP = {
+    "effective_scenario": "type",
+    "endpoints_version": "version",
+    "streaming": "model_params.streaming",
+    "load_pattern": "settings.load_pattern.type",
+    "min_duration_ms": "settings.runtime.min_duration_ms",
+    "max_duration_ms": "settings.runtime.max_duration_ms",
+    "effective_sample_index_rng_seed": "settings.runtime.dataloader_random_seed",
+    "effective_schedule_rng_seed": "settings.runtime.scheduler_random_seed",
+    "target_qps (results_summary.json)": "settings.load_pattern.target_qps",
+    "min_sample_count (runtime_settings.json)": "settings.runtime.n_samples_to_issue",
+    "min_sample_count": "settings.runtime.n_samples_to_issue",
+}
+
+# Alternative JSON paths for endpoints keys that don't directly match the
+# JSON structure
+ENDPOINTS_JSON_ALT_PATHS = {
+    "result.total": "results.total",
+    "result.failed": "results.failed",
+    "qps": "results.qps",
+    "generated_query_count": "n_samples_issued",
+    "generated_query_duration": "duration_ns",
+    "test_datetime": "test_started_at",
+    "endpoints_git_commit_hash": "git_sha",
+    "n_samples_from_dataset": "n_samples_issued",
+}
+
+ENDPOINTS_INFERRED_FIELDS = {
+    "effective_accuracy_sample_count": "result_query_count"
+}
diff --git a/tools/submission/submission_checker/loader.py b/tools/submission/submission_checker/loader.py
index 79cfdce73a..170da085f1 100644
--- a/tools/submission/submission_checker/loader.py
+++ b/tools/submission/submission_checker/loader.py
@@ -2,6 +2,7 @@
 from .constants import *
 from .utils import list_dir
 from .parsers.loadgen_parser import LoadgenParser
+from .parsers.endpoints_parser import EndpointsParser
 from typing import Generator, Literal
 from .utils import *
 from .configuration.configuration import Config
@@ -82,6 +83,18 @@ def __init__(self, root, version, config: Config) -> None:
         self.acc_json_path = os.path.join(
             self.root, ACCURACY_JSON_PATH.get(
                 version, ACCURACY_JSON_PATH["default"]))
+        self.perf_endpoints_path = os.path.join(
+            self.root, PERFORMANCE_ENDPOINTS_PATH.get(
+                version, PERFORMANCE_ENDPOINTS_PATH["default"]))
+        self.perf_endpoints_config_path = os.path.join(
+            self.root, PERFORMANCE_CONFIG_ENDPOINTS_PATH.get(
+                version, PERFORMANCE_CONFIG_ENDPOINTS_PATH["default"]))
+        self.acc_endpoints_path = os.path.join(
+            self.root, ACCURACY_ENDPOINTS_PATH.get(
+                version, ACCURACY_ENDPOINTS_PATH["default"]))
+        self.acc_endpoints_config_path = os.path.join(
+            self.root, ACCURACY_CONFIG_ENDPOINTS_PATH.get(
+                version, ACCURACY_CONFIG_ENDPOINTS_PATH["default"]))
         self.system_log_path = os.path.join(
             self.root, SYSTEM_PATH.get(
                 version, SYSTEM_PATH["default"]))
@@ -182,7 +195,7 @@ def load_single_log(self, path, log_type: Literal["Performance", "Accuracy",
         accuracy results as line lists, etc.
 
         Args:
-            path (str): Filesystem path to the log file.
+            path (str or List[str]): Filesystem path to the log file.
             log_type (str): Type of log to load, determining parsing method.
 
         Returns:
@@ -190,7 +203,9 @@ def load_single_log(self, path, log_type: Literal["Performance", "Accuracy",
                 if loading fails.
         """
         log = None
-        if os.path.exists(path):
+        if log_type in ["Endpoints"]:
+            log = EndpointsParser(path) if all(map(os.path.exists, path)) else None
+        elif os.path.exists(path):
             self.logger.info("Loading %s log from %s", log_type, path)
             if log_type in ["Performance", "Accuracy", "Test"]:
                 log = LoadgenParser(path)
@@ -294,6 +309,30 @@ def load(self) -> Generator[SubmissionLogs, None, None]:
                                 system=system,
                                 benchmark=benchmark,
                                 scenario=scenario)
+                            perf_endpoints_path = self.perf_endpoints_path.format(
+                                division=division,
+                                submitter=submitter,
+                                system=system,
+                                benchmark=benchmark,
+                                scenario=scenario)
+                            perf_endpoints_config_path = self.perf_endpoints_config_path.format(
+                                division=division,
+                                submitter=submitter,
+                                system=system,
+                                benchmark=benchmark,
+                                scenario=scenario)
+                            acc_endpoints_path = self.acc_endpoints_path.format(
+                                division=division,
+                                submitter=submitter,
+                                system=system,
+                                benchmark=benchmark,
+                                scenario=scenario)
+                            acc_endpoints_config_path = self.acc_endpoints_config_path.format(
+                                division=division,
+                                submitter=submitter,
+                                system=system,
+                                benchmark=benchmark,
+                                scenario=scenario)
                             acc_result_path = self.acc_result_path.format(
                                 division=division,
                                 submitter=submitter,
@@ -388,7 +427,8 @@ def load(self) -> Generator[SubmissionLogs, None, None]:
                             src_path = self.src_path.format(
                                 division=division, submitter=submitter)
 
-                            # Load logs
+                            # Load logs loadgen
+                            is_endpoints_submission = False
                             perf_log = self.load_single_log(
                                 perf_path, "Performance")
                             acc_log = self.load_single_log(
@@ -399,6 +439,17 @@ def load(self) -> Generator[SubmissionLogs, None, None]:
                                 acc_json_path, "AccuracyJSON")
                             measurements_json = self.load_single_log(
                                 measurements_path, "Measurements")
+                            if perf_log is None and acc_log is None:
+                                is_endpoints_submission = True
+                                perf_log = self.load_single_log(
+                                    [perf_endpoints_path,
+                                        perf_endpoints_config_path],
+                                    "Endpoints"
+                                )
+                                acc_log = self.load_single_log(
+                                    [acc_endpoints_path, acc_endpoints_config_path],
+                                    "Endpoints"
+                                )
 
                             # Load test logs
                             test01_perf_log = self.load_single_log(
@@ -429,6 +480,7 @@ def load(self) -> Generator[SubmissionLogs, None, None]:
                                 "system": system,
                                 "benchmark": benchmark,
                                 "scenario": scenario,
+                                "is_endpoints_submission": is_endpoints_submission,
                                 # Submission paths
                                 "perf_path": perf_path,
                                 "acc_path": acc_path,
diff --git a/tools/submission/submission_checker/parsers/endpoints_parser.py b/tools/submission/submission_checker/parsers/endpoints_parser.py
new file mode 100644
index 0000000000..7102e19a4b
--- /dev/null
+++ b/tools/submission/submission_checker/parsers/endpoints_parser.py
@@ -0,0 +1,198 @@
+import json
+import logging
+import os
+import sys
+import yaml
+
+from .base import BaseParser
+from ..constants import ENDPOINTS_YAML_FIELD_MAP, ENDPOINTS_JSON_ALT_PATHS, ENDPOINTS_MAPPINGS, ENDPOINTS_INFERRED_FIELDS
+
+_FIELDS_MAP_DIR = os.path.join(
+    os.path.dirname(__file__),
+    "..",
+    "helper",
+    "fields_map")
+_SAMPLE_LOGS_DIR = os.path.join(
+    os.path.dirname(__file__),
+    "..",
+    "helper",
+    "sample_logs")
+
+
+def _load_field_map(filename):
+    with open(os.path.join(_FIELDS_MAP_DIR, filename), "r", encoding="utf-8") as f:
+        return json.load(f)
+
+
+def _get_nested(data, dotted_key):
+    """Navigate a nested dict using a dot-notation key.
+
+    Uses a greedy left-to-right match so dotted numeric keys like '99.9' are
+    handled correctly: the longest matching key at each level wins.
+    """
+    if not isinstance(data, dict):
+        return None
+    parts = dotted_key.split(".")
+    current = data
+    i = 0
+    while i < len(parts):
+        if not isinstance(current, dict):
+            return None
+        found = False
+        for j in range(len(parts), i, -1):
+            candidate = ".".join(parts[i:j])
+            if candidate in current:
+                current = current[candidate]
+                i = j
+                found = True
+                break
+        if not found:
+            return None
+    if isinstance(current, (dict, list)) and not current:
+        return None
+    return current
+
+
+class EndpointsParser(BaseParser):
+    def __init__(self, log_paths):
+        """
+        log_paths: [json_path, yaml_path]
+          json_path - path to the JSON results file (result_summary.json or results.json)
+          yaml_path - path to the YAML config file (config.yaml)
+        """
+        json_path, yaml_path = log_paths
+        super().__init__(json_path)
+
+        self.logger = logging.getLogger("MLPerfLog")
+        self.messages = {}
+
+        with open(json_path, "r", encoding="utf-8") as f:
+            json_data = json.load(f)
+
+        with open(yaml_path, "r", encoding="utf-8") as f:
+            yaml_data = yaml.safe_load(f)
+
+        forwards_map = ENDPOINTS_MAPPINGS
+
+        for endpoints_key, loadgen_key in forwards_map.items():
+            stripped = endpoints_key.strip()
+            value = None
+
+            # 1. Direct dot-notation path in the JSON result file
+            value = _get_nested(json_data, stripped)
+
+            # 2. Alternative JSON paths for known structural mismatches
+            if value is None and stripped in ENDPOINTS_JSON_ALT_PATHS:
+                value = _get_nested(
+                    json_data, ENDPOINTS_JSON_ALT_PATHS[stripped])
+
+            # 3. Explicit YAML field path overrides
+            if value is None and stripped in ENDPOINTS_YAML_FIELD_MAP:
+                value = _get_nested(
+                    yaml_data, ENDPOINTS_YAML_FIELD_MAP[stripped])
+
+            # 4. Fallback: direct dot-notation path in the YAML config
+            if value is None:
+                value = _get_nested(yaml_data, stripped)
+
+            if value is not None:
+                entry = {"key": loadgen_key, "value": value}
+                self.messages.setdefault(loadgen_key, []).append(entry)
+
+        self.keys = set(self.messages.keys())
+        # Additional values that can be inferred from other values
+        inferred_map = ENDPOINTS_INFERRED_FIELDS
+        for inferred, key in inferred_map.items():
+            value = self.__getitem__(key)
+            if value is not None:
+                entry = {"key": inferred, "value": value}
+                self.messages.setdefault(inferred, []).append(entry)
+
+        self.keys = set(self.messages.keys())
+        self.logger.info(
+            "Successfully loaded endpoints log from %s.",
+            json_path)
+
+    def __getitem__(self, key):
+        if key not in self.keys:
+            return None
+        results = self.messages[key]
+        if len(results) > 1:
+            self.logger.warning(
+                "Multiple messages with key %s in the log. Empirically choosing the first one.",
+                key,
+            )
+        return results[0]["value"]
+
+    def get(self, key):
+        return self.messages[key]
+
+    def get_messages(self):
+        return self.messages
+
+    def get_keys(self):
+        return self.keys
+
+
+def main():
+    logging.basicConfig(
+        level=logging.INFO,
+        format="[%(asctime)s %(filename)s:%(lineno)d %(levelname)s] %(message)s",
+    )
+    logger = logging.getLogger("main")
+
+    backwards_map = _load_field_map("backwards.json")
+
+    # Collect all (json_file, yaml_file) pairs from leaf subdirectories
+    pairs = []
+    for root, _dirs, files in os.walk(_SAMPLE_LOGS_DIR):
+        json_files = sorted(f for f in files if f.endswith(".json"))
+        yaml_files = sorted(f for f in files if f.endswith(
+            ".yaml") or f.endswith(".yml"))
+        if json_files and yaml_files:
+            pairs.append(
+                (
+                    os.path.join(root, json_files[0]),
+                    os.path.join(root, yaml_files[0]),
+                )
+            )
+
+    if not pairs:
+        logger.error("No JSON+YAML pairs found under %s.", _SAMPLE_LOGS_DIR)
+        return 1
+
+    for json_path, yaml_path in sorted(pairs):
+        folder = os.path.relpath(os.path.dirname(json_path), _SAMPLE_LOGS_DIR)
+        print(f"\n{'=' * 70}")
+        print(f"Folder : {folder}")
+        print(f"JSON   : {os.path.basename(json_path)}")
+        print(f"YAML   : {os.path.basename(yaml_path)}")
+        print(f"{'=' * 70}")
+
+        parser = EndpointsParser([json_path, yaml_path])
+
+        found = []
+        not_found = []
+
+        for loadgen_key, endpoints_key in backwards_map.items():
+            value = parser[loadgen_key]
+            if value is not None:
+                found.append((loadgen_key, endpoints_key, value))
+            else:
+                not_found.append((loadgen_key, endpoints_key))
+
+        total = len(backwards_map)
+        print(f"\nFound ({len(found)}/{total}):")
+        for loadgen_key, endpoints_key, value in found:
+            print(f"  {loadgen_key:<55} = {value}")
+
+        print(f"\nNot found ({len(not_found)}/{total}):")
+        for loadgen_key, endpoints_key in not_found:
+            label = endpoints_key if endpoints_key != "None" else "(no endpoints mapping)"
+            print(f"  {loadgen_key:<55}  [{label}]")
+
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())

From 5808d0a22f80577682ef4293f3719690ed19c1fb Mon Sep 17 00:00:00 2001
From: Pablo Gonzalez <pablo.gonzalez@factored.ai>
Date: Tue, 16 Jun 2026 00:49:25 -0500
Subject: [PATCH 2/7] Handle checks: remove checks for endpoints submissions

---
 .../checks/accuracy_check.py                  | 11 ++++++++
 .../checks/compliance_check.py                |  5 ++++
 .../checks/measurements_checks.py             |  2 ++
 .../checks/performance_check.py               | 23 ++++++++++++----
 .../submission_checker/checks/power_check.py  |  4 ++-
 .../submission_checker/checks/system_check.py |  2 ++
 .../submission_checker/constants.py           |  4 +--
 .../parsers/endpoints_parser.py               | 26 ++++++++++++++++++-
 8 files changed, 68 insertions(+), 9 deletions(-)

diff --git a/tools/submission/submission_checker/checks/accuracy_check.py b/tools/submission/submission_checker/checks/accuracy_check.py
index df315e9498..5f7225e11e 100644
--- a/tools/submission/submission_checker/checks/accuracy_check.py
+++ b/tools/submission/submission_checker/checks/accuracy_check.py
@@ -70,6 +70,8 @@ def __init__(
             "scenario", "")
         self.scenario = self.mlperf_log["effective_scenario"]
         self.division = self.submission_logs.loader_data.get("division", "")
+        self.is_endpoints = self.submission_logs.loader_data.get(
+            "is_endpoints_submission", False)
         self.setup_checks()
 
     def setup_checks(self):
@@ -84,6 +86,8 @@ def setup_checks(self):
         self.checks.append(self.dataset_check)
         self.checks.append(self.extra_files_check)
         self.apply_checks = set(self.checks)
+        if self.is_endpoints:
+            self.apply_checks.remove(self.accuracy_json_check)
 
     def accuracy_result_check(self):
         """Validate reported accuracy metrics in `accuracy.txt`.
@@ -98,6 +102,13 @@ def accuracy_result_check(self):
                 False otherwise.
         """
 
+        if self.is_endpoints:
+            if self.mlperf_log["accuracy_score"] is not None:
+                self.submission_logs.loader_data["accuracy_metrics"] = self.mlperf_log["accuracy_score"]
+                return True
+            self.log.error("%s accuracy score not found", self.path)
+            return False
+
         patterns, acc_targets, acc_types, acc_limits, up_patterns, acc_upper_limit = self.config.get_accuracy_values(
             self.model
         )
diff --git a/tools/submission/submission_checker/checks/compliance_check.py b/tools/submission/submission_checker/checks/compliance_check.py
index 566d5f5eb5..11f79282d2 100644
--- a/tools/submission/submission_checker/checks/compliance_check.py
+++ b/tools/submission/submission_checker/checks/compliance_check.py
@@ -50,6 +50,8 @@ def __init__(self, log, path, config: Config,
         self.model = self.config.get_mlperf_model(
             self.model, self.model_mapping)
         self.test_list = self.get_test_list(self.model)
+        self.is_endpoints = self.submission_logs.loader_data.get(
+            "is_endpoints_submission", False)
         self.setup_checks()
 
     def setup_checks(self):
@@ -63,6 +65,9 @@ def setup_checks(self):
         self.checks.append(self.accuracy_check)
         self.checks.append(self.compliance_performance_check)
         self.apply_checks = set(self.checks)
+        # No compliance tests for endpoints for now
+        if self.is_endpoints:
+            self.apply_checks = set()
 
     def get_test_list(self, model):
         """Return the list of compliance tests applicable to `model`.
diff --git a/tools/submission/submission_checker/checks/measurements_checks.py b/tools/submission/submission_checker/checks/measurements_checks.py
index c8cdbc7234..8a2731f4e6 100644
--- a/tools/submission/submission_checker/checks/measurements_checks.py
+++ b/tools/submission/submission_checker/checks/measurements_checks.py
@@ -61,6 +61,8 @@ def setup_checks(self):
         self.checks.append(self.directory_exist_check)
         self.checks.append(self.required_files_check)
         self.checks.append(self.required_fields_check)
+        self.is_endpoints = self.submission_logs.loader_data.get(
+            "is_endpoints_submission", False)
         self.apply_checks = set(self.checks)
 
     def missing_check(self):
diff --git a/tools/submission/submission_checker/checks/performance_check.py b/tools/submission/submission_checker/checks/performance_check.py
index 3c63794f0f..bf1befc231 100644
--- a/tools/submission/submission_checker/checks/performance_check.py
+++ b/tools/submission/submission_checker/checks/performance_check.py
@@ -53,6 +53,11 @@ def __init__(self, log, path, config: Config,
             "scenario", "")
         self.scenario = self.mlperf_log["effective_scenario"]
         self.division = self.submission_logs.loader_data.get("division", "")
+        self.is_endpoints = self.submission_logs.loader_data.get(
+            "is_endpoints_submission", False)
+        if self.is_endpoints:
+            if self.scenario.lower() == "online":
+                self.scenario = "Server"
         self.setup_checks()
 
     def setup_checks(self):
@@ -75,6 +80,10 @@ def setup_checks(self):
         self.checks.append(self.inferred_check)
         self.checks.append(self.get_performance_metric_check)
         self.apply_checks = set(self.checks)
+        if self.is_endpoints:
+            self.apply_checks.remove(self.seeds_check)
+            self.apply_checks.remove(self.performance_sample_count_check)
+            self.apply_checks.remove(self.min_query_count_check)
 
     def missing_check(self):
         """Ensure the performance log was provided.
@@ -238,7 +247,7 @@ def latency_check(self):
             bool: True if latency constraints are satisfied, False otherwise.
         """
         uses_early_stopping = self.config.uses_early_stopping(self.scenario)
-        if uses_early_stopping:
+        if uses_early_stopping and not self.is_endpoints:
             # check if early_stopping condition was met
             if not self.mlperf_log["early_stopping_met"]:
                 early_stopping_result = self.mlperf_log["early_stopping_result"]
@@ -459,7 +468,7 @@ def inferred_check(self):
                 ("singlestream", "offline")
             ]
             if (self.scenario.lower(), self.scenario_fixed.lower()
-                ) not in list_inferred:
+                    ) not in list_inferred:
                 self.log.error(
                     "Result for scenario %s can not be inferred from %s for: %s",
                     self.scenario_fixed,
@@ -487,6 +496,10 @@ def get_performance_metric_check(self):
         ):
             is_valid = True
         scenario = self.mlperf_log["effective_scenario"]
+        if self.is_endpoints:
+            if scenario.lower() == "online":
+                scenario = "Server"
+            scenario = scenario.capitalize()
 
         res = float(self.mlperf_log[RESULT_FIELD_NEW[version][scenario]])
         if (
@@ -550,12 +563,12 @@ def get_inferred_result(self, res):
             res = qps_wo_loadgen_overhead
 
         if (scenario_fixed in ["Offline"]
-                ) and scenario in ["MultiStream"]:
+            ) and scenario in ["MultiStream"]:
             inferred = True
             res = samples_per_query * S_TO_MS / (latency_mean / MS_TO_NS)
 
         if (scenario_fixed in ["MultiStream"]
-                ) and scenario in ["SingleStream"]:
+            ) and scenario in ["SingleStream"]:
             inferred = True
             # samples_per_query does not match with the one reported in the logs
             # when inferring MultiStream from SingleStream
@@ -572,6 +585,6 @@ def get_inferred_result(self, res):
             else:
                 res = (latency_99_percentile * samples_per_query) / MS_TO_NS
         if (scenario_fixed in ["Interactive"]
-                ) and scenario not in ["Server"]:
+            ) and scenario not in ["Server"]:
             is_valid = False
         return res, is_valid
diff --git a/tools/submission/submission_checker/checks/power_check.py b/tools/submission/submission_checker/checks/power_check.py
index 499768a892..c8accfd6f2 100644
--- a/tools/submission/submission_checker/checks/power_check.py
+++ b/tools/submission/submission_checker/checks/power_check.py
@@ -68,6 +68,8 @@ def setup_checks(self):
         self.checks.append(self.required_files_check)
         self.checks.append(self.external_power_check)
         self.checks.append(self.get_power_metric_check)
+        self.is_endpoints = self.submission_logs.loader_data.get(
+            "is_endpoints_submission", False)
         self.apply_checks = set(self.checks)
 
     def required_files_check(self):
@@ -227,7 +229,7 @@ def get_power_metric_check(self):
                     samples_per_query = 8
 
                 if (self.scenario_fixed.lower() in ["multistream"]
-                    ) and scenario.lower() in ["singlestream"]:
+                        ) and scenario.lower() in ["singlestream"]:
                     power_metric = (
                         avg_power * power_duration * samples_per_query * 1000 / num_queries
                     )
diff --git a/tools/submission/submission_checker/checks/system_check.py b/tools/submission/submission_checker/checks/system_check.py
index 8ab811bb7a..49b030c399 100644
--- a/tools/submission/submission_checker/checks/system_check.py
+++ b/tools/submission/submission_checker/checks/system_check.py
@@ -42,6 +42,8 @@ def __init__(self, log, path, config: Config,
         self.system_json = self.submission_logs.system_json
         self.submitter = self.submission_logs.loader_data.get("submitter", "")
         self.division = self.submission_logs.loader_data.get("division", "")
+        self.is_endpoints = self.submission_logs.loader_data.get(
+            "is_endpoints_submission", False)
         self.config = config
         self.setup_checks()
 
diff --git a/tools/submission/submission_checker/constants.py b/tools/submission/submission_checker/constants.py
index e573b6e18c..09dce5e8d2 100644
--- a/tools/submission/submission_checker/constants.py
+++ b/tools/submission/submission_checker/constants.py
@@ -1805,7 +1805,7 @@
     "generated_query_duration": "generated_query_duration",
     "target_qps (results_summary.json)": "effective_target_qps",
     "result_scheduled_samples_per_sec": "result_scheduled_samples_per_sec",
-    "qps ": "result_completed_samples_per_sec",
+    "qps": "result_completed_samples_per_sec",
     "effective_target_latency_ns": "effective_target_latency_ns",
     "effective_target_latency_percentile": "effective_target_latency_percentile",
     "latency.min ": "result_min_latency_ns",
@@ -1834,7 +1834,7 @@
     "tpot.avg": "result_time_to_output_token_mean",
     "tps": "result_completed_tokens_per_second",
     "result.total": "result_query_count",
-    "result.failed": "num_errors"
+    "result.failed": "num_errors",
 }
 
 
diff --git a/tools/submission/submission_checker/parsers/endpoints_parser.py b/tools/submission/submission_checker/parsers/endpoints_parser.py
index 7102e19a4b..b1fc533916 100644
--- a/tools/submission/submission_checker/parsers/endpoints_parser.py
+++ b/tools/submission/submission_checker/parsers/endpoints_parser.py
@@ -108,6 +108,23 @@ def __init__(self, log_paths):
                 entry = {"key": inferred, "value": value}
                 self.messages.setdefault(inferred, []).append(entry)
 
+        # Temporary solution: Hardcoded inferred values
+        if self.__getitem__("generated_query_duration") and self.__getitem__(
+                "generated_query_count"):
+            key = "result_samples_per_second" if self.__getitem__(
+                "effective_scenario").lower() == "offline" else "result_completed_samples_per_sec"
+            value = self.__getitem__(
+                "generated_query_count") / self.__getitem__("generated_query_duration")
+            entry = {"key": key, "value": value}
+            self.messages.setdefault(key, []).append(entry)
+
+        # Extract accuracy scores if possible
+        if "accuracy_scores" in json_data:
+            for dataset_name, result in json_data["accuracy_scores"].items():
+                value = result.get("score", None)
+                entry = {"key": "accuracy_score", "value": value}
+                self.messages.setdefault("accuracy_score", []).append(entry)
+
         self.keys = set(self.messages.keys())
         self.logger.info(
             "Successfully loaded endpoints log from %s.",
@@ -125,7 +142,7 @@ def __getitem__(self, key):
         return results[0]["value"]
 
     def get(self, key):
-        return self.messages[key]
+        return self[key]
 
     def get_messages(self):
         return self.messages
@@ -133,6 +150,13 @@ def get_messages(self):
     def get_keys(self):
         return self.keys
 
+    def num_errors(self):
+        return self.get("num_errors")
+
+    def has_error(self):
+        """Check if the log contains any errors."""
+        return self.num_errors() != 0
+
 
 def main():
     logging.basicConfig(

From 788e9ed1214442f00d7af02979e1f87f9f3029bc Mon Sep 17 00:00:00 2001
From: Pablo Gonzalez <pablo.gonzalez@factored.ai>
Date: Tue, 16 Jun 2026 10:51:22 -0500
Subject: [PATCH 3/7] Split string to avoid error in automatic testing

---
 .../submission_checker/checks/compliance_check.py   |  5 +++--
 .../submission_checker/checks/performance_check.py  | 13 +++++++------
 2 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/tools/submission/submission_checker/checks/compliance_check.py b/tools/submission/submission_checker/checks/compliance_check.py
index 11f79282d2..b08b23803e 100644
--- a/tools/submission/submission_checker/checks/compliance_check.py
+++ b/tools/submission/submission_checker/checks/compliance_check.py
@@ -328,8 +328,9 @@ def accuracy_check(self):
                     first_token_pass and eos_pass and length_check_pass)
                 if not is_valid:
                     self.log.error(
-                        f"TEST06 accuracy check failed. first_token_check: {
-                            first_token_pass} eos_check: {eos_pass} length_check: {length_check_pass}."
+                        f"TEST06 accuracy check failed. first_token_check: " +
+                        f"{first_token_pass} eos_check: " +
+                        f"{eos_pass} length_check: {length_check_pass}."
                     )
             elif test == "TEST07":
                 # TEST07: Verify accuracy in performance mode
diff --git a/tools/submission/submission_checker/checks/performance_check.py b/tools/submission/submission_checker/checks/performance_check.py
index bf1befc231..624ff220ec 100644
--- a/tools/submission/submission_checker/checks/performance_check.py
+++ b/tools/submission/submission_checker/checks/performance_check.py
@@ -396,8 +396,9 @@ def network_check(self):
             # (must include "Network SUT" in name)
             if NETWORK_MODE_REQUIRED_SUBSTRING_IN_SUT_NAME not in sut_name:
                 self.log.error(
-                    f"{self.path} invalid sut name for network mode. expecting the substring '{
-                        NETWORK_MODE_REQUIRED_SUBSTRING_IN_SUT_NAME}' got '{sut_name}'"
+                    f"{self.path} invalid sut name for network mode. " +
+                    f"expecting the substring '{NETWORK_MODE_REQUIRED_SUBSTRING_IN_SUT_NAME}'" +
+                    f" got '{sut_name}'"
                 )
                 return False
 
@@ -468,7 +469,7 @@ def inferred_check(self):
                 ("singlestream", "offline")
             ]
             if (self.scenario.lower(), self.scenario_fixed.lower()
-                    ) not in list_inferred:
+                ) not in list_inferred:
                 self.log.error(
                     "Result for scenario %s can not be inferred from %s for: %s",
                     self.scenario_fixed,
@@ -563,12 +564,12 @@ def get_inferred_result(self, res):
             res = qps_wo_loadgen_overhead
 
         if (scenario_fixed in ["Offline"]
-            ) and scenario in ["MultiStream"]:
+                ) and scenario in ["MultiStream"]:
             inferred = True
             res = samples_per_query * S_TO_MS / (latency_mean / MS_TO_NS)
 
         if (scenario_fixed in ["MultiStream"]
-            ) and scenario in ["SingleStream"]:
+                ) and scenario in ["SingleStream"]:
             inferred = True
             # samples_per_query does not match with the one reported in the logs
             # when inferring MultiStream from SingleStream
@@ -585,6 +586,6 @@ def get_inferred_result(self, res):
             else:
                 res = (latency_99_percentile * samples_per_query) / MS_TO_NS
         if (scenario_fixed in ["Interactive"]
-            ) and scenario not in ["Server"]:
+                ) and scenario not in ["Server"]:
             is_valid = False
         return res, is_valid

From d3f2a777972c6f29d7e44e316cbdb8bca9927231 Mon Sep 17 00:00:00 2001
From: ANANDHU S <71482562+anandhu-eng@users.noreply.github.com>
Date: Wed, 17 Jun 2026 21:26:12 +0530
Subject: [PATCH 4/7] Fix endpoints submission checker bugs and update
 documentation (#2603)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- constants.py: Fix percentile key format in ENDPOINTS_MAPPINGS — the
  endpoints JSON uses float-format keys (e.g. "99.0") but the mappings
  used integer strings ("99"), causing latency_check to receive None and
  crash on the comparison. Updated all latency/ttft/tpot percentile keys
  to use the .0 suffix (50.0, 90.0, 95.0, 99.0).

- performance_check.py: Fix llm_check for endpoints — the check gated
  on the loadgen use_token_latencies flag which does not exist in
  endpoints submissions, causing all LLM models to fail. Added an
  endpoints-specific branch that checks TTFT/TPOT p99 values directly
  from the result JSON. Also fix get_performance_metric_check to skip
  RESULT_FIELD_BENCHMARK_OVERWRITE for endpoints (the tokens/sec field
  is not present in endpoints result files; use QPS instead).

- endpoints_parser.py: Fix inferred QPS unit — the fallback QPS
  calculation divided n_samples_issued by duration_ns directly, giving
  ~1e-8 instead of the correct value. Convert duration to seconds first.
  Also guard against overwriting an already-resolved QPS value.

- README.md: Update submission checker documentation with current
  version numbers, endpoints directory structure, endpoints-specific
  checks, and accuracy_scores requirement.

Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 tools/submission/README.md                    | 28 ++++++++++++-------
 .../checks/performance_check.py               | 22 ++++++++++++++-
 .../submission_checker/constants.py           | 24 ++++++++--------
 .../parsers/endpoints_parser.py               | 12 ++++----
 4 files changed, 58 insertions(+), 28 deletions(-)

diff --git a/tools/submission/README.md b/tools/submission/README.md
index 6d620233b2..119520a412 100644
--- a/tools/submission/README.md
+++ b/tools/submission/README.md
@@ -32,7 +32,7 @@ The input submission directory is modified with empty directories removed and lo
 ## `submission_checker/main.py` (Mandatory)
 ### Inputs
 **input**: Path to the directory containing one or several submissions.<br>
-**version**: Checker version. E.g v1.1, v2.0, v2.1, v3.0, v3.1. <br>
+**version**: Checker version. E.g v5.0, v5.1, v6.0, v6.1. <br>
 **submitter**: Filter submitters and only run the checks for some specific submitter. <br>
 **csv**: Output path where the csv with the results will be stored. E.g `results/summary.csv`. <br>
 **skip_compliance**: Flag to skip compliance checks. <br>
@@ -71,25 +71,34 @@ python3 -m inference.tools.submission.submission_checker.main
     [--skip-calibration-check]
 ```
 
-### implemented checks
-**performance:**
+### Implemented checks
+**performance (loadgen):**
 - Check performance detailed log exists
 - Check for loadgen errors
 - Check for equal issue mode when it is required
 - Check the performance sample count used for running the benchmark
 - Check loadgen seeds are correct
-- Check latency constrain is met
-- Check minimun query count is met
-- Check minimun duration is met
+- Check latency constraint is met
+- Check minimum query count is met
+- Check minimum duration is met
 - Check network requirements
-- Check LLM latencies are met (if applies)
+- Check LLM TTFT/TPOT latencies are met via `use_token_latencies` flag (if applies)
 - Check loadgen scenario matches with submission scenario or that result can be inferred
 
+**performance (endpoints):**
+- Check result_summary.json and config.yaml exist
+- Check latency p99 constraint is met (from `latency.percentiles.99.0` in result_summary.json)
+- Check minimum duration is met (from `settings.runtime.min_duration_ms` in config.yaml)
+- Check LLM TTFT/TPOT p99 limits directly from result_summary.json for Server/Interactive scenarios
+- Extract primary metric as QPS (inferred from `n_samples_issued / duration_s` if not in results)
+- Skips: sample count check, seed check, min query count check (not applicable to endpoints)
+
 **accuracy**
 - Check the accuracy metric is correct and over the expected threshold (or within a range if applies)
-- Check accuracy json exists and is truncated
+- Check accuracy json exists and is truncated (loadgen only)
 - Check for loadgen error
 - Check full dataset is used for the accuracy run
+- Check `accuracy_scores` field is present and non-null (endpoints only)
 
 **compliance**
 - Check compliance directory exists
@@ -112,11 +121,10 @@ python3 -m inference.tools.submission.submission_checker.main
 - Check availability is valid
 - Check system type is valid
 - Check network fields
-- Check required fields are include in system json file
+- Check required fields are included in system json file
 - Check submitter is correct
 - Check division is correct
 
-
 ### Outputs
 - CSV file containing all the valid results in the directory.
 - It raises several errors and logs invalid results.
diff --git a/tools/submission/submission_checker/checks/performance_check.py b/tools/submission/submission_checker/checks/performance_check.py
index 624ff220ec..12763fa013 100644
--- a/tools/submission/submission_checker/checks/performance_check.py
+++ b/tools/submission/submission_checker/checks/performance_check.py
@@ -415,6 +415,25 @@ def llm_check(self):
                 False otherwise.
         """
         if self.model in self.config.get_llm_models():
+            if self.is_endpoints:
+                # Endpoints don't use the loadgen use_token_latencies flag;
+                # check TTFT/TPOT directly from the endpoints result JSON.
+                if self.scenario not in ["Server", "Interactive"]:
+                    return True
+                limits = LLM_LATENCY_LIMITS[self.model][self.scenario]
+                ttft = self.mlperf_log["result_first_token_99.00_percentile_latency_ns"]
+                tpot = self.mlperf_log["result_time_per_output_token_99.00_percentile_ns"]
+                if ttft is None or tpot is None:
+                    self.log.warning(
+                        "%s TTFT or TPOT percentile data missing for endpoints LLM check",
+                        self.path)
+                    return True
+                if ttft < limits["ttft"] and tpot < limits["tpot"]:
+                    return True
+                self.log.error(
+                    'Failed extra check for TTFT and TPOT. Obtained: TTFT 99-tile: %.4f, TPOT 99-tile: %.4f. Required: TTFT 99-tile: %.4f, TPOT 99-tile: %.4f',
+                    ttft, tpot, limits["ttft"], limits["tpot"])
+                return False
             if self.mlperf_log["requested_use_token_latencies"]:
                 if self.scenario not in ["Server", "Interactive"]:
                     # For offline, singlestream and multistream no further checks are
@@ -504,7 +523,8 @@ def get_performance_metric_check(self):
 
         res = float(self.mlperf_log[RESULT_FIELD_NEW[version][scenario]])
         if (
-            version in RESULT_FIELD_BENCHMARK_OVERWRITE
+            not self.is_endpoints
+            and version in RESULT_FIELD_BENCHMARK_OVERWRITE
             and self.model in RESULT_FIELD_BENCHMARK_OVERWRITE[version]
             and scenario in RESULT_FIELD_BENCHMARK_OVERWRITE[version][self.model]
         ):
diff --git a/tools/submission/submission_checker/constants.py b/tools/submission/submission_checker/constants.py
index 09dce5e8d2..1b147cea29 100644
--- a/tools/submission/submission_checker/constants.py
+++ b/tools/submission/submission_checker/constants.py
@@ -1811,23 +1811,23 @@
     "latency.min ": "result_min_latency_ns",
     "latency.max": "result_max_latency_ns",
     "latency.avg": "result_mean_latency_ns",
-    "latency.percentiles.50": "result_50.00_percentile_latency_ns",
-    "latency.percentiles.90": "result_90.00_percentile_latency_ns",
-    "latency.percentiles.95": "result_95.00_percentile_latency_ns",
-    "latency.percentiles.99": "result_99.00_percentile_latency_ns",
+    "latency.percentiles.50.0": "result_50.00_percentile_latency_ns",
+    "latency.percentiles.90.0": "result_90.00_percentile_latency_ns",
+    "latency.percentiles.95.0": "result_95.00_percentile_latency_ns",
+    "latency.percentiles.99.0": "result_99.00_percentile_latency_ns",
     "latency.percentiles.99.9": "result_99.90_percentile_latency_ns",
     "ttft.min": "result_first_token_min_latency_ns",
     "ttft.max": "result_first_token_max_latency_ns",
     "ttft.avg": "result_first_token_mean_latency_ns",
-    "ttft.percentiles.50": "result_first_token_50.00_percentile_latency_ns",
-    "ttft.percentiles.90": "result_first_token_90.00_percentile_latency_ns",
-    "ttft.percentiles.95": "result_first_token_95.00_percentile_latency_ns",
-    "ttft.percentiles.99": "result_first_token_99.00_percentile_latency_ns",
+    "ttft.percentiles.50.0": "result_first_token_50.00_percentile_latency_ns",
+    "ttft.percentiles.90.0": "result_first_token_90.00_percentile_latency_ns",
+    "ttft.percentiles.95.0": "result_first_token_95.00_percentile_latency_ns",
+    "ttft.percentiles.99.0": "result_first_token_99.00_percentile_latency_ns",
     "ttft.percentiles.99.9": "result_first_token_99.90_percentile_latency_ns",
-    "tpot.percentiles.50": "result_time_per_output_token_50.00_percentile_ns",
-    "tpot.percentiles.90": "result_time_per_output_token_90.00_percentile_ns",
-    "tpot.percentiles.95": "result_time_per_output_token_95.00_percentile_ns",
-    "tpot.percentiles.99": "result_time_per_output_token_99.00_percentile_ns",
+    "tpot.percentiles.50.0": "result_time_per_output_token_50.00_percentile_ns",
+    "tpot.percentiles.90.0": "result_time_per_output_token_90.00_percentile_ns",
+    "tpot.percentiles.95.0": "result_time_per_output_token_95.00_percentile_ns",
+    "tpot.percentiles.99.0": "result_time_per_output_token_99.00_percentile_ns",
     "tpot.percentiles.99.9": "result_time_per_output_token_99.90_percentile_ns",
     "tpot.min": "result_time_to_output_token_min",
     "tpot.max": "result_time_to_output_token_max",
diff --git a/tools/submission/submission_checker/parsers/endpoints_parser.py b/tools/submission/submission_checker/parsers/endpoints_parser.py
index b1fc533916..e014b0f4b9 100644
--- a/tools/submission/submission_checker/parsers/endpoints_parser.py
+++ b/tools/submission/submission_checker/parsers/endpoints_parser.py
@@ -108,15 +108,17 @@ def __init__(self, log_paths):
                 entry = {"key": inferred, "value": value}
                 self.messages.setdefault(inferred, []).append(entry)
 
-        # Temporary solution: Hardcoded inferred values
+        # Infer QPS from sample count / duration when not directly available.
+        # generated_query_duration is in nanoseconds; divide by 1e9 for seconds.
         if self.__getitem__("generated_query_duration") and self.__getitem__(
                 "generated_query_count"):
             key = "result_samples_per_second" if self.__getitem__(
                 "effective_scenario").lower() == "offline" else "result_completed_samples_per_sec"
-            value = self.__getitem__(
-                "generated_query_count") / self.__getitem__("generated_query_duration")
-            entry = {"key": key, "value": value}
-            self.messages.setdefault(key, []).append(entry)
+            duration_s = self.__getitem__("generated_query_duration") / 1e9
+            value = self.__getitem__("generated_query_count") / duration_s
+            if key not in self.messages:
+                entry = {"key": key, "value": value}
+                self.messages[key] = [entry]
 
         # Extract accuracy scores if possible
         if "accuracy_scores" in json_data:

From 045b2e38a5fd5429a721bbd09c57a15a663d98bf Mon Sep 17 00:00:00 2001
From: mlc-automations <3246381+mlc-automations@users.noreply.github.com>
Date: Wed, 17 Jun 2026 15:57:09 +0000
Subject: [PATCH 5/7] [Automated Commit] Format Codebase

---
 .../submission/submission_checker/parsers/endpoints_parser.py  | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tools/submission/submission_checker/parsers/endpoints_parser.py b/tools/submission/submission_checker/parsers/endpoints_parser.py
index e014b0f4b9..11e05bb57b 100644
--- a/tools/submission/submission_checker/parsers/endpoints_parser.py
+++ b/tools/submission/submission_checker/parsers/endpoints_parser.py
@@ -109,7 +109,8 @@ def __init__(self, log_paths):
                 self.messages.setdefault(inferred, []).append(entry)
 
         # Infer QPS from sample count / duration when not directly available.
-        # generated_query_duration is in nanoseconds; divide by 1e9 for seconds.
+        # generated_query_duration is in nanoseconds; divide by 1e9 for
+        # seconds.
         if self.__getitem__("generated_query_duration") and self.__getitem__(
                 "generated_query_count"):
             key = "result_samples_per_second" if self.__getitem__(

From e59c7ae2624bdd13f0873d25b155ce94325f9676 Mon Sep 17 00:00:00 2001
From: Pablo Gonzalez <pablo.gonzalez@factored.ai>
Date: Wed, 17 Jun 2026 17:06:11 -0500
Subject: [PATCH 6/7] Checker fixes + update loader

---
 .../checks/performance_check.py               |  19 +-
 .../submission_checker/constants.py           |  45 ++--
 tools/submission/submission_checker/loader.py |  64 +++---
 .../parsers/endpoints_parser.py               | 213 ++++++++++--------
 4 files changed, 176 insertions(+), 165 deletions(-)

diff --git a/tools/submission/submission_checker/checks/performance_check.py b/tools/submission/submission_checker/checks/performance_check.py
index 12763fa013..ddb42f8b4c 100644
--- a/tools/submission/submission_checker/checks/performance_check.py
+++ b/tools/submission/submission_checker/checks/performance_check.py
@@ -81,7 +81,6 @@ def setup_checks(self):
         self.checks.append(self.get_performance_metric_check)
         self.apply_checks = set(self.checks)
         if self.is_endpoints:
-            self.apply_checks.remove(self.seeds_check)
             self.apply_checks.remove(self.performance_sample_count_check)
             self.apply_checks.remove(self.min_query_count_check)
 
@@ -210,14 +209,16 @@ def seeds_check(self):
         sample_index_rng_seed = self.mlperf_log["effective_sample_index_rng_seed"]
         schedule_rng_seed = self.mlperf_log["effective_schedule_rng_seed"]
         is_valid = True
-        if qsl_rng_seed != config_seeds["qsl_rng_seed"]:
-            self.log.error(
-                "%s qsl_rng_seed is wrong, expected=%s, found=%s",
-                self.path,
-                config_seeds["qsl_rng_seed"],
-                qsl_rng_seed,
-            )
-            is_valid = False
+        if not self.is_endpoints:
+            # This seed does not exist for endpoints runs
+            if qsl_rng_seed != config_seeds["qsl_rng_seed"]:
+                self.log.error(
+                    "%s qsl_rng_seed is wrong, expected=%s, found=%s",
+                    self.path,
+                    config_seeds["qsl_rng_seed"],
+                    qsl_rng_seed,
+                )
+                is_valid = False
         if sample_index_rng_seed != config_seeds["sample_index_rng_seed"]:
             self.log.error(
                 "%s sample_index_rng_seed is wrong, expected=%s, found=%s",
diff --git a/tools/submission/submission_checker/constants.py b/tools/submission/submission_checker/constants.py
index 1b147cea29..d126d426eb 100644
--- a/tools/submission/submission_checker/constants.py
+++ b/tools/submission/submission_checker/constants.py
@@ -1635,18 +1635,11 @@
     "default": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/performance/run_1/mlperf_log_summary.txt",
 }
 
-PERFORMANCE_ENDPOINTS_PATH = {
-    "v5.0": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/performance/run_1/result_summary.json",
-    "v5.1": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/performance/run_1/result_summary.json",
-    "v6.0": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/performance/run_1/result_summary.json",
-    "default": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/performance/run_1/result_summary.json",
-}
-
-PERFORMANCE_CONFIG_ENDPOINTS_PATH = {
-    "v5.0": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/performance/run_1/config.yaml",
-    "v5.1": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/performance/run_1/config.yaml",
-    "v6.0": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/performance/run_1/config.yaml",
-    "default": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/performance/run_1/config.yaml",
+PERFORMANCE_ENDPOINTS_DIR = {
+    "v5.0": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/performance/run_1/",
+    "v5.1": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/performance/run_1/",
+    "v6.0": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/performance/run_1/",
+    "default": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/performance/run_1/",
 }
 
 ACCURACY_LOG_PATH = {
@@ -1670,19 +1663,13 @@
     "default": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/accuracy/mlperf_log_accuracy.json",
 }
 
-ACCURACY_ENDPOINTS_PATH = {
-    "v5.0": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/accuracy/results.json",
-    "v5.1": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/accuracy/results.json",
-    "v6.0": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/accuracy/results.json",
-    "default": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/accuracy/results.json",
+ACCURACY_ENDPOINTS_DIR = {
+    "v5.0": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/accuracy/",
+    "v5.1": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/accuracy/",
+    "v6.0": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/accuracy/",
+    "default": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/accuracy/",
 }
 
-ACCURACY_CONFIG_ENDPOINTS_PATH = {
-    "v5.0": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/accuracy/config.yaml",
-    "v5.1": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/accuracy/config.yaml",
-    "v6.0": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/accuracy/config.yaml",
-    "default": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/accuracy/config.yaml",
-}
 
 POWER_DIR_PATH = {
     "v5.0": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/performance/power",
@@ -1798,12 +1785,11 @@
     "min_sample_count": "effective_min_query_count",
     "effective_sample_index_rng_seed": "effective_sample_index_rng_seed",
     "effective_schedule_rng_seed": "effective_schedule_rng_seed",
-    "min_sample_count (runtime_settings.json)": "effective_min_sample_count",
     "effective_sample_concatenate_permutation": "effective_sample_concatenate_permutation",
     "effective_samples_per_query": "effective_samples_per_query",
     "generated_query_count": "generated_query_count",
     "generated_query_duration": "generated_query_duration",
-    "target_qps (results_summary.json)": "effective_target_qps",
+    "target_qps": "effective_target_qps",
     "result_scheduled_samples_per_sec": "result_scheduled_samples_per_sec",
     "qps": "result_completed_samples_per_sec",
     "effective_target_latency_ns": "effective_target_latency_ns",
@@ -1829,9 +1815,9 @@
     "tpot.percentiles.95.0": "result_time_per_output_token_95.00_percentile_ns",
     "tpot.percentiles.99.0": "result_time_per_output_token_99.00_percentile_ns",
     "tpot.percentiles.99.9": "result_time_per_output_token_99.90_percentile_ns",
-    "tpot.min": "result_time_to_output_token_min",
-    "tpot.max": "result_time_to_output_token_max",
-    "tpot.avg": "result_time_to_output_token_mean",
+    "tpot.min": "result_time_per_output_token_min",
+    "tpot.max": "result_time_per_output_token_max",
+    "tpot.avg": "result_time_per_output_token_mean",
     "tps": "result_completed_tokens_per_second",
     "result.total": "result_query_count",
     "result.failed": "num_errors",
@@ -1849,8 +1835,7 @@
     "max_duration_ms": "settings.runtime.max_duration_ms",
     "effective_sample_index_rng_seed": "settings.runtime.dataloader_random_seed",
     "effective_schedule_rng_seed": "settings.runtime.scheduler_random_seed",
-    "target_qps (results_summary.json)": "settings.load_pattern.target_qps",
-    "min_sample_count (runtime_settings.json)": "settings.runtime.n_samples_to_issue",
+    "target_qps": "settings.load_pattern.target_qps",
     "min_sample_count": "settings.runtime.n_samples_to_issue",
 }
 
diff --git a/tools/submission/submission_checker/loader.py b/tools/submission/submission_checker/loader.py
index 170da085f1..2c8fa592d8 100644
--- a/tools/submission/submission_checker/loader.py
+++ b/tools/submission/submission_checker/loader.py
@@ -83,18 +83,12 @@ def __init__(self, root, version, config: Config) -> None:
         self.acc_json_path = os.path.join(
             self.root, ACCURACY_JSON_PATH.get(
                 version, ACCURACY_JSON_PATH["default"]))
-        self.perf_endpoints_path = os.path.join(
-            self.root, PERFORMANCE_ENDPOINTS_PATH.get(
-                version, PERFORMANCE_ENDPOINTS_PATH["default"]))
-        self.perf_endpoints_config_path = os.path.join(
-            self.root, PERFORMANCE_CONFIG_ENDPOINTS_PATH.get(
-                version, PERFORMANCE_CONFIG_ENDPOINTS_PATH["default"]))
-        self.acc_endpoints_path = os.path.join(
-            self.root, ACCURACY_ENDPOINTS_PATH.get(
-                version, ACCURACY_ENDPOINTS_PATH["default"]))
-        self.acc_endpoints_config_path = os.path.join(
-            self.root, ACCURACY_CONFIG_ENDPOINTS_PATH.get(
-                version, ACCURACY_CONFIG_ENDPOINTS_PATH["default"]))
+        self.perf_endpoints_dir = os.path.join(
+            self.root, PERFORMANCE_ENDPOINTS_DIR.get(
+                version, PERFORMANCE_ENDPOINTS_DIR["default"]))
+        self.acc_endpoints_dir = os.path.join(
+            self.root, ACCURACY_ENDPOINTS_DIR.get(
+                version, ACCURACY_ENDPOINTS_DIR["default"]))
         self.system_log_path = os.path.join(
             self.root, SYSTEM_PATH.get(
                 version, SYSTEM_PATH["default"]))
@@ -203,9 +197,7 @@ def load_single_log(self, path, log_type: Literal["Performance", "Accuracy",
                 if loading fails.
         """
         log = None
-        if log_type in ["Endpoints"]:
-            log = EndpointsParser(path)
-        elif os.path.exists(path):
+        if os.path.exists(path):
             self.logger.info("Loading %s log from %s", log_type, path)
             if log_type in ["Performance", "Accuracy", "Test"]:
                 log = LoadgenParser(path)
@@ -229,6 +221,22 @@ def load_single_log(self, path, log_type: Literal["Performance", "Accuracy",
                 path)
         return log
 
+    def load_endpoints_logs(self, perf_dir, acc_dir):
+        perf_log = None
+        acc_log = None
+        if os.path.exists(acc_dir) and os.path.exists(perf_dir):
+            acc_log = EndpointsParser(acc_dir)
+            perf_log = EndpointsParser(perf_dir)
+        elif os.path.exists(perf_dir):
+            acc_log = EndpointsParser(perf_dir)
+            perf_log = EndpointsParser(perf_dir)
+        else:
+            self.logger.info(
+                "Could not load endpoints log from %s, path does not exist",
+                perf_dir
+            )
+        return perf_log, acc_log
+
     def check_scenarios(self, benchmark, model_mapping,
                         system_type, scenarios):
         self.config.set_type(system_type)
@@ -309,25 +317,13 @@ def load(self) -> Generator[SubmissionLogs, None, None]:
                                 system=system,
                                 benchmark=benchmark,
                                 scenario=scenario)
-                            perf_endpoints_path = self.perf_endpoints_path.format(
-                                division=division,
-                                submitter=submitter,
-                                system=system,
-                                benchmark=benchmark,
-                                scenario=scenario)
-                            perf_endpoints_config_path = self.perf_endpoints_config_path.format(
+                            perf_endpoints_dir = self.perf_endpoints_dir.format(
                                 division=division,
                                 submitter=submitter,
                                 system=system,
                                 benchmark=benchmark,
                                 scenario=scenario)
-                            acc_endpoints_path = self.acc_endpoints_path.format(
-                                division=division,
-                                submitter=submitter,
-                                system=system,
-                                benchmark=benchmark,
-                                scenario=scenario)
-                            acc_endpoints_config_path = self.acc_endpoints_config_path.format(
+                            acc_endpoints_dir = self.acc_endpoints_dir.format(
                                 division=division,
                                 submitter=submitter,
                                 system=system,
@@ -441,14 +437,8 @@ def load(self) -> Generator[SubmissionLogs, None, None]:
                                 measurements_path, "Measurements")
                             if perf_log is None and acc_log is None:
                                 is_endpoints_submission = True
-                                perf_log = self.load_single_log(
-                                    [perf_endpoints_path,
-                                        perf_endpoints_config_path],
-                                    "Endpoints"
-                                )
-                                acc_log = self.load_single_log(
-                                    [acc_endpoints_path, acc_endpoints_config_path],
-                                    "Endpoints"
+                                perf_log, acc_log = self.load_endpoints_logs(
+                                    perf_endpoints_dir, acc_endpoints_dir
                                 )
 
                             # Load test logs
diff --git a/tools/submission/submission_checker/parsers/endpoints_parser.py b/tools/submission/submission_checker/parsers/endpoints_parser.py
index 11e05bb57b..9fc5ddfb7c 100644
--- a/tools/submission/submission_checker/parsers/endpoints_parser.py
+++ b/tools/submission/submission_checker/parsers/endpoints_parser.py
@@ -5,7 +5,12 @@
 import yaml
 
 from .base import BaseParser
-from ..constants import ENDPOINTS_YAML_FIELD_MAP, ENDPOINTS_JSON_ALT_PATHS, ENDPOINTS_MAPPINGS, ENDPOINTS_INFERRED_FIELDS
+from ..constants import (
+    ENDPOINTS_YAML_FIELD_MAP,
+    ENDPOINTS_JSON_ALT_PATHS,
+    ENDPOINTS_MAPPINGS,
+    ENDPOINTS_INFERRED_FIELDS,
+)
 
 _FIELDS_MAP_DIR = os.path.join(
     os.path.dirname(__file__),
@@ -18,6 +23,10 @@
     "helper",
     "sample_logs")
 
+_RESULT_SUMMARY_FILE = "result_summary.json"
+_RESULTS_FILE = "results.json"
+_CONFIG_FILES = ("config.yaml", "config.yml")
+
 
 def _load_field_map(filename):
     with open(os.path.join(_FIELDS_MAP_DIR, filename), "r", encoding="utf-8") as f:
@@ -29,6 +38,9 @@ def _get_nested(data, dotted_key):
 
     Uses a greedy left-to-right match so dotted numeric keys like '99.9' are
     handled correctly: the longest matching key at each level wins.
+
+    Also handles float-formatted integer keys: '50.0' resolves to key '50'
+    (common in the ENDPOINTS_MAPPINGS percentile entries).
     """
     if not isinstance(data, dict):
         return None
@@ -37,6 +49,10 @@ def _get_nested(data, dotted_key):
     i = 0
     while i < len(parts):
         if not isinstance(current, dict):
+            # Trailing '.0' on a float-formatted integer key: treat as
+            # consumed.
+            if parts[i:] == ["0"] and not isinstance(current, (dict, list)):
+                return current
             return None
         found = False
         for j in range(len(parts), i, -1):
@@ -53,85 +69,112 @@ def _get_nested(data, dotted_key):
     return current
 
 
+def _resolve_value(stripped, summary_data, results_data, yaml_data):
+    """Look up a field in three data sources in priority order.
+
+    Priority: result_summary.json > results.json > config.yaml
+    Within each JSON source, a direct dot-notation path is tried first,
+    then the alternative paths from ENDPOINTS_JSON_ALT_PATHS.
+    For the YAML source, the explicit path overrides in ENDPOINTS_YAML_FIELD_MAP
+    are tried first, then a direct dot-notation path.
+    """
+    for data in (summary_data, results_data):
+        value = _get_nested(data, stripped)
+        if value is None and stripped in ENDPOINTS_JSON_ALT_PATHS:
+            value = _get_nested(data, ENDPOINTS_JSON_ALT_PATHS[stripped])
+        if value is not None:
+            return value
+
+    # YAML: explicit path map first, then direct
+    if stripped in ENDPOINTS_YAML_FIELD_MAP:
+        value = _get_nested(yaml_data, ENDPOINTS_YAML_FIELD_MAP[stripped])
+        if value is not None:
+            return value
+    return _get_nested(yaml_data, stripped)
+
+
 class EndpointsParser(BaseParser):
-    def __init__(self, log_paths):
+    def __init__(self, run_dir):
         """
-        log_paths: [json_path, yaml_path]
-          json_path - path to the JSON results file (result_summary.json or results.json)
-          yaml_path - path to the YAML config file (config.yaml)
+        run_dir: path to the run directory containing:
+          - result_summary.json  (highest priority)
+          - results.json
+          - config.yaml / config.yml  (lowest priority)
         """
-        json_path, yaml_path = log_paths
-        super().__init__(json_path)
+        super().__init__(run_dir)
 
         self.logger = logging.getLogger("MLPerfLog")
         self.messages = {}
 
-        with open(json_path, "r", encoding="utf-8") as f:
-            json_data = json.load(f)
-
-        with open(yaml_path, "r", encoding="utf-8") as f:
-            yaml_data = yaml.safe_load(f)
-
-        forwards_map = ENDPOINTS_MAPPINGS
+        summary_data = self._load_json(
+            os.path.join(run_dir, _RESULT_SUMMARY_FILE))
+        results_data = self._load_json(os.path.join(run_dir, _RESULTS_FILE))
+        yaml_data = self._load_yaml(run_dir)
 
-        for endpoints_key, loadgen_key in forwards_map.items():
+        for endpoints_key, loadgen_key in ENDPOINTS_MAPPINGS.items():
             stripped = endpoints_key.strip()
-            value = None
-
-            # 1. Direct dot-notation path in the JSON result file
-            value = _get_nested(json_data, stripped)
-
-            # 2. Alternative JSON paths for known structural mismatches
-            if value is None and stripped in ENDPOINTS_JSON_ALT_PATHS:
-                value = _get_nested(
-                    json_data, ENDPOINTS_JSON_ALT_PATHS[stripped])
-
-            # 3. Explicit YAML field path overrides
-            if value is None and stripped in ENDPOINTS_YAML_FIELD_MAP:
-                value = _get_nested(
-                    yaml_data, ENDPOINTS_YAML_FIELD_MAP[stripped])
-
-            # 4. Fallback: direct dot-notation path in the YAML config
-            if value is None:
-                value = _get_nested(yaml_data, stripped)
-
+            value = _resolve_value(
+                stripped, summary_data, results_data, yaml_data)
             if value is not None:
-                entry = {"key": loadgen_key, "value": value}
-                self.messages.setdefault(loadgen_key, []).append(entry)
+                self.messages.setdefault(loadgen_key, []).append(
+                    {"key": loadgen_key, "value": value}
+                )
 
         self.keys = set(self.messages.keys())
-        # Additional values that can be inferred from other values
-        inferred_map = ENDPOINTS_INFERRED_FIELDS
-        for inferred, key in inferred_map.items():
-            value = self.__getitem__(key)
+
+        # Inferred fields: copy the value of one loadgen key to another
+        for inferred_key, source_key in ENDPOINTS_INFERRED_FIELDS.items():
+            value = self[source_key]
             if value is not None:
-                entry = {"key": inferred, "value": value}
-                self.messages.setdefault(inferred, []).append(entry)
-
-        # Infer QPS from sample count / duration when not directly available.
-        # generated_query_duration is in nanoseconds; divide by 1e9 for
-        # seconds.
-        if self.__getitem__("generated_query_duration") and self.__getitem__(
-                "generated_query_count"):
-            key = "result_samples_per_second" if self.__getitem__(
-                "effective_scenario").lower() == "offline" else "result_completed_samples_per_sec"
-            duration_s = self.__getitem__("generated_query_duration") / 1e9
-            value = self.__getitem__("generated_query_count") / duration_s
-            if key not in self.messages:
-                entry = {"key": key, "value": value}
-                self.messages[key] = [entry]
-
-        # Extract accuracy scores if possible
-        if "accuracy_scores" in json_data:
-            for dataset_name, result in json_data["accuracy_scores"].items():
-                value = result.get("score", None)
-                entry = {"key": "accuracy_score", "value": value}
-                self.messages.setdefault("accuracy_score", []).append(entry)
+                self.messages.setdefault(inferred_key, []).append(
+                    {"key": inferred_key, "value": value}
+                )
+
+        # Infer QPS from count / duration when not directly available
+        duration_ns = self["generated_query_duration"]
+        count = self["generated_query_count"]
+        scenario = self["effective_scenario"]
+        if duration_ns and count and scenario:
+            qps_key = (
+                "result_samples_per_second"
+                if scenario.lower() == "offline"
+                else "result_completed_samples_per_sec"
+            )
+            if qps_key not in self.messages:
+                qps = count / (duration_ns / 1e9)
+                self.messages[qps_key] = [{"key": qps_key, "value": qps}]
+
+        # Expose accuracy scores stored in results.json
+        for result in results_data.get("accuracy_scores", {}).values():
+            score = result.get("score")
+            if score is not None:
+                self.messages.setdefault("accuracy_score", []).append(
+                    {"key": "accuracy_score", "value": score}
+                )
 
         self.keys = set(self.messages.keys())
-        self.logger.info(
-            "Successfully loaded endpoints log from %s.",
-            json_path)
+        self.logger.info("Successfully loaded endpoints log from %s.", run_dir)
+
+    def _load_json(self, path):
+        """Return the parsed JSON at `path`, or {} if it cannot be read."""
+        try:
+            with open(path, "r", encoding="utf-8") as f:
+                return json.load(f)
+        except (OSError, ValueError):
+            self.logger.error("Could not load json file from %s", path)
+            return {}
+
+    def _load_yaml(self, run_dir):
+        """Return the first loadable config.yaml/.yml in run_dir, or {}."""
+        for path in (os.path.join(run_dir, name) for name in _CONFIG_FILES):
+            if os.path.exists(path):
+                try:
+                    with open(path, "r", encoding="utf-8") as f:
+                        return yaml.safe_load(f) or {}
+                except (OSError, ValueError, yaml.YAMLError):
+                    self.logger.error("Could not load yaml from %s", path)
+        self.logger.error("No usable yaml file in directory %s", run_dir)
+        return {}
 
     def __getitem__(self, key):
         if key not in self.keys:
@@ -154,10 +197,9 @@ def get_keys(self):
         return self.keys
 
     def num_errors(self):
-        return self.get("num_errors")
+        return self["num_errors"]
 
     def has_error(self):
-        """Check if the log contains any errors."""
         return self.num_errors() != 0
 
 
@@ -170,37 +212,30 @@ def main():
 
     backwards_map = _load_field_map("backwards.json")
 
-    # Collect all (json_file, yaml_file) pairs from leaf subdirectories
-    pairs = []
+    # Collect all run directories (those containing at least one JSON and one
+    # YAML)
+    run_dirs = []
     for root, _dirs, files in os.walk(_SAMPLE_LOGS_DIR):
-        json_files = sorted(f for f in files if f.endswith(".json"))
-        yaml_files = sorted(f for f in files if f.endswith(
-            ".yaml") or f.endswith(".yml"))
-        if json_files and yaml_files:
-            pairs.append(
-                (
-                    os.path.join(root, json_files[0]),
-                    os.path.join(root, yaml_files[0]),
-                )
-            )
-
-    if not pairs:
-        logger.error("No JSON+YAML pairs found under %s.", _SAMPLE_LOGS_DIR)
+        has_json = any(f.endswith(".json") for f in files)
+        has_yaml = any(f.endswith(".yaml") or f.endswith(".yml")
+                       for f in files)
+        if has_json and has_yaml:
+            run_dirs.append(root)
+
+    if not run_dirs:
+        logger.error("No run directories found under %s.", _SAMPLE_LOGS_DIR)
         return 1
 
-    for json_path, yaml_path in sorted(pairs):
-        folder = os.path.relpath(os.path.dirname(json_path), _SAMPLE_LOGS_DIR)
+    for run_dir in sorted(run_dirs):
+        folder = os.path.relpath(run_dir, _SAMPLE_LOGS_DIR)
         print(f"\n{'=' * 70}")
-        print(f"Folder : {folder}")
-        print(f"JSON   : {os.path.basename(json_path)}")
-        print(f"YAML   : {os.path.basename(yaml_path)}")
+        print(f"Directory: {folder}")
         print(f"{'=' * 70}")
 
-        parser = EndpointsParser([json_path, yaml_path])
+        parser = EndpointsParser(run_dir)
 
         found = []
         not_found = []
-
         for loadgen_key, endpoints_key in backwards_map.items():
             value = parser[loadgen_key]
             if value is not None:

From 733eb8895a954c7b974e80910ef9ebb996582c6c Mon Sep 17 00:00:00 2001
From: Pablo Gonzalez <pablo.gonzalez@factored.ai>
Date: Wed, 17 Jun 2026 18:33:23 -0500
Subject: [PATCH 7/7] Minor fixes

---
 tools/submission/submission_checker/checks/compliance_check.py | 1 +
 tools/submission/submission_checker/constants.py               | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/tools/submission/submission_checker/checks/compliance_check.py b/tools/submission/submission_checker/checks/compliance_check.py
index b08b23803e..4853a832f3 100644
--- a/tools/submission/submission_checker/checks/compliance_check.py
+++ b/tools/submission/submission_checker/checks/compliance_check.py
@@ -192,6 +192,7 @@ def performance_check(self):
                     "model_mapping": self.submission_logs.loader_data.get("model_mapping", {}),
                     "check_scenarios": True,
                     "compliance_skip": True,
+                    "is_endpoints_submission": self.submission_logs.loader_data.get("is_endpoints_submission", False),
                 }
                 test_logs = SubmissionLogs(
                     self.submission_logs.loader_data[f"{test}_perf_log"], None, None, None, self.submission_logs.system_json, None, test_data)
diff --git a/tools/submission/submission_checker/constants.py b/tools/submission/submission_checker/constants.py
index d126d426eb..a4137eabf3 100644
--- a/tools/submission/submission_checker/constants.py
+++ b/tools/submission/submission_checker/constants.py
@@ -1794,7 +1794,7 @@
     "qps": "result_completed_samples_per_sec",
     "effective_target_latency_ns": "effective_target_latency_ns",
     "effective_target_latency_percentile": "effective_target_latency_percentile",
-    "latency.min ": "result_min_latency_ns",
+    "latency.min": "result_min_latency_ns",
     "latency.max": "result_max_latency_ns",
     "latency.avg": "result_mean_latency_ns",
     "latency.percentiles.50.0": "result_50.00_percentile_latency_ns",