Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 18 additions & 10 deletions tools/submission/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ The input submission directory is modified with empty directories removed and lo
## `submission_checker/main.py` (Mandatory)
### Inputs
**input**: Path to the directory containing one or several submissions.<br>
**version**: Checker version. E.g v1.1, v2.0, v2.1, v3.0, v3.1. <br>
**version**: Checker version, e.g. v5.0, v5.1, v6.0, v6.1. <br>
**submitter**: Filter submitters and only run the checks for some specific submitter. <br>
**csv**: Output path where the csv with the results will be stored. E.g `results/summary.csv`. <br>
**skip_compliance**: Flag to skip compliance checks. <br>
Expand Down Expand Up @@ -71,25 +71,34 @@ python3 -m inference.tools.submission.submission_checker.main
[--skip-calibration-check]
```

### implemented checks
**performance:**
### Implemented checks
**performance (loadgen):**
- Check performance detailed log exists
- Check for loadgen errors
- Check for equal issue mode when it is required
- Check the performance sample count used for running the benchmark
- Check loadgen seeds are correct
- Check latency constrain is met
- Check minimun query count is met
- Check minimun duration is met
- Check latency constraint is met
- Check minimum query count is met
- Check minimum duration is met
- Check network requirements
- Check LLM latencies are met (if applies)
- Check LLM TTFT/TPOT latencies are met via `use_token_latencies` flag (if applicable)
- Check that the loadgen scenario matches the submission scenario, or that the result can be inferred from it

**performance (endpoints):**
- Check result_summary.json and config.yaml exist
- Check latency p99 constraint is met (from `latency.percentiles.99.0` in result_summary.json)
- Check minimum duration is met (from `settings.runtime.min_duration_ms` in config.yaml)
- Check LLM TTFT/TPOT p99 limits directly from result_summary.json for Server/Interactive scenarios
- Extract primary metric as QPS (inferred from `n_samples_issued / duration_s` if not in results)
- Skips: sample count check, `qsl_rng_seed` comparison, min query count check (not applicable to endpoints); the remaining loadgen seeds are still checked

**accuracy**
- Check the accuracy metric is correct and over the expected threshold (or within a range, if applicable)
- Check accuracy json exists and is truncated
- Check accuracy json exists and is truncated (loadgen only)
- Check for loadgen error
- Check full dataset is used for the accuracy run
- Check `accuracy_scores` field is present and non-null (endpoints only)

**compliance**
- Check compliance directory exists
Expand All @@ -112,11 +121,10 @@ python3 -m inference.tools.submission.submission_checker.main
- Check availability is valid
- Check system type is valid
- Check network fields
- Check required fields are include in system json file
- Check required fields are included in system json file
- Check submitter is correct
- Check division is correct


### Outputs
- CSV file containing all the valid results in the directory.
- It raises several errors and logs invalid results.
Expand Down
12 changes: 12 additions & 0 deletions tools/submission/submission_checker/checks/accuracy_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,8 @@ def __init__(
"scenario", "")
self.scenario = self.mlperf_log["effective_scenario"]
self.division = self.submission_logs.loader_data.get("division", "")
self.is_endpoints = self.submission_logs.loader_data.get(
"is_endpoints_submission", False)
self.setup_checks()

def setup_checks(self):
Expand All @@ -83,6 +85,9 @@ def setup_checks(self):
self.checks.append(self.loadgen_errors_check)
self.checks.append(self.dataset_check)
self.checks.append(self.extra_files_check)
self.apply_checks = set(self.checks)
if self.is_endpoints:
self.apply_checks.remove(self.accuracy_json_check)

def accuracy_result_check(self):
"""Validate reported accuracy metrics in `accuracy.txt`.
Expand All @@ -97,6 +102,13 @@ def accuracy_result_check(self):
False otherwise.
"""

if self.is_endpoints:
if self.mlperf_log["accuracy_score"] is not None:
self.submission_logs.loader_data["accuracy_metrics"] = self.mlperf_log["accuracy_score"]
return True
self.log.error("%s accuracy score not found", self.path)
return False

patterns, acc_targets, acc_types, acc_limits, up_patterns, acc_upper_limit = self.config.get_accuracy_values(
self.model
)
Expand Down
29 changes: 20 additions & 9 deletions tools/submission/submission_checker/checks/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ class BaseCheck(ABC):

def __init__(self, log, path):
self.checks = []
self.apply_checks = set()
self.log = log
self.path = path
self.name = "base checks"
Expand All @@ -21,22 +22,32 @@ def run_checks(self):
valid = True
errors = []
for check in self.checks:
try:
v = self.execute(check)
valid &= v
except BaseException:
valid &= False
self.log.error(
"Execution occurred in running check %s. Running %s in %s",
self.path,
if self.check_applies(check):
try:
v = self.execute(check)
valid &= v
except BaseException:
valid &= False
self.log.error(
"Execution occurred in running check %s. Running %s in %s",
self.path,
check.__name__,
self.__class__.__name__)
else:
self.log.warning(
"Execution of check %s skipped for %s.",
check.__name__,
self.__class__.__name__)
self.path
)
return valid

def execute(self, check):
    """Custom execution of a single check method.

    Calls `check` with no arguments and hands back whatever it returns.

    Args:
        check: Zero-argument callable to run.

    Returns:
        The value returned by `check()`.
    """
    return check()

def check_applies(self, fn):
    """Tell whether a check is enabled for this checker instance.

    Args:
        fn: Check callable to look up.

    Returns:
        bool: True if `fn` is a member of `self.apply_checks`, False otherwise.
    """
    return fn in self.apply_checks

def __call__(self):
"""Allows the check instance to be called like a function."""
self.log.info("Starting %s for: %s", self.name, self.path)
Expand Down
11 changes: 10 additions & 1 deletion tools/submission/submission_checker/checks/compliance_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,8 @@ def __init__(self, log, path, config: Config,
self.model = self.config.get_mlperf_model(
self.model, self.model_mapping)
self.test_list = self.get_test_list(self.model)
self.is_endpoints = self.submission_logs.loader_data.get(
"is_endpoints_submission", False)
self.setup_checks()

def setup_checks(self):
Expand All @@ -62,6 +64,10 @@ def setup_checks(self):
self.checks.append(self.performance_check)
self.checks.append(self.accuracy_check)
self.checks.append(self.compliance_performance_check)
self.apply_checks = set(self.checks)
# No compliance tests for endpoints for now
if self.is_endpoints:
self.apply_checks = set()

def get_test_list(self, model):
"""Return the list of compliance tests applicable to `model`.
Expand Down Expand Up @@ -186,6 +192,7 @@ def performance_check(self):
"model_mapping": self.submission_logs.loader_data.get("model_mapping", {}),
"check_scenarios": True,
"compliance_skip": True,
"is_endpoints_submission": self.submission_logs.loader_data.get("is_endpoints_submission", False),
}
test_logs = SubmissionLogs(
self.submission_logs.loader_data[f"{test}_perf_log"], None, None, None, self.submission_logs.system_json, None, test_data)
Expand Down Expand Up @@ -322,7 +329,9 @@ def accuracy_check(self):
first_token_pass and eos_pass and length_check_pass)
if not is_valid:
self.log.error(
f"TEST06 accuracy check failed. first_token_check: {first_token_pass} eos_check: {eos_pass} length_check: {length_check_pass}."
f"TEST06 accuracy check failed. first_token_check:" +
f"{first_token_pass} eos_check: " +
f"{eos_pass} length_check: {length_check_pass}."
)
elif test == "TEST07":
# TEST07: Verify accuracy in performance mode
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,9 @@ def setup_checks(self):
self.checks.append(self.directory_exist_check)
self.checks.append(self.required_files_check)
self.checks.append(self.required_fields_check)
self.is_endpoints = self.submission_logs.loader_data.get(
"is_endpoints_submission", False)
self.apply_checks = set(self.checks)

def missing_check(self):
"""Ensure a measurements JSON was provided.
Expand Down
67 changes: 52 additions & 15 deletions tools/submission/submission_checker/checks/performance_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,11 @@ def __init__(self, log, path, config: Config,
"scenario", "")
self.scenario = self.mlperf_log["effective_scenario"]
self.division = self.submission_logs.loader_data.get("division", "")
self.is_endpoints = self.submission_logs.loader_data.get(
"is_endpoints_submission", False)
if self.is_endpoints:
if self.scenario.lower() == "online":
self.scenario = "Server"
self.setup_checks()

def setup_checks(self):
Expand All @@ -74,6 +79,10 @@ def setup_checks(self):
self.checks.append(self.llm_check)
self.checks.append(self.inferred_check)
self.checks.append(self.get_performance_metric_check)
self.apply_checks = set(self.checks)
if self.is_endpoints:
self.apply_checks.remove(self.performance_sample_count_check)
self.apply_checks.remove(self.min_query_count_check)

def missing_check(self):
"""Ensure the performance log was provided.
Expand Down Expand Up @@ -200,14 +209,16 @@ def seeds_check(self):
sample_index_rng_seed = self.mlperf_log["effective_sample_index_rng_seed"]
schedule_rng_seed = self.mlperf_log["effective_schedule_rng_seed"]
is_valid = True
if qsl_rng_seed != config_seeds["qsl_rng_seed"]:
self.log.error(
"%s qsl_rng_seed is wrong, expected=%s, found=%s",
self.path,
config_seeds["qsl_rng_seed"],
qsl_rng_seed,
)
is_valid = False
if not self.is_endpoints:
# This seed does not exist for endpoints runs
if qsl_rng_seed != config_seeds["qsl_rng_seed"]:
self.log.error(
"%s qsl_rng_seed is wrong, expected=%s, found=%s",
self.path,
config_seeds["qsl_rng_seed"],
qsl_rng_seed,
)
is_valid = False
if sample_index_rng_seed != config_seeds["sample_index_rng_seed"]:
self.log.error(
"%s sample_index_rng_seed is wrong, expected=%s, found=%s",
Expand Down Expand Up @@ -237,7 +248,7 @@ def latency_check(self):
bool: True if latency constraints are satisfied, False otherwise.
"""
uses_early_stopping = self.config.uses_early_stopping(self.scenario)
if uses_early_stopping:
if uses_early_stopping and not self.is_endpoints:
# check if early_stopping condition was met
if not self.mlperf_log["early_stopping_met"]:
early_stopping_result = self.mlperf_log["early_stopping_result"]
Expand Down Expand Up @@ -386,7 +397,9 @@ def network_check(self):
# (must include "Network SUT" in name)
if NETWORK_MODE_REQUIRED_SUBSTRING_IN_SUT_NAME not in sut_name:
self.log.error(
f"{self.path} invalid sut name for network mode. expecting the substring '{NETWORK_MODE_REQUIRED_SUBSTRING_IN_SUT_NAME}' got '{sut_name}'"
f"{self.path} invalid sut name for network mode." +
f"expecting the substring '{NETWORK_MODE_REQUIRED_SUBSTRING_IN_SUT_NAME}'" +
f" got '{sut_name}'"
)
return False

Expand All @@ -403,6 +416,25 @@ def llm_check(self):
False otherwise.
"""
if self.model in self.config.get_llm_models():
if self.is_endpoints:
# Endpoints don't use the loadgen use_token_latencies flag;
# check TTFT/TPOT directly from the endpoints result JSON.
if self.scenario not in ["Server", "Interactive"]:
return True
limits = LLM_LATENCY_LIMITS[self.model][self.scenario]
ttft = self.mlperf_log["result_first_token_99.00_percentile_latency_ns"]
tpot = self.mlperf_log["result_time_per_output_token_99.00_percentile_ns"]
if ttft is None or tpot is None:
self.log.warning(
"%s TTFT or TPOT percentile data missing for endpoints LLM check",
self.path)
return True
if ttft < limits["ttft"] and tpot < limits["tpot"]:
return True
self.log.error(
'Failed extra check for TTFT and TPOT. Obtained: TTFT 99-tile: %.4f, TPOT 99-tile: %.4f. Required: TTFT 99-tile: %.4f, TPOT 99-tile: %.4f',
ttft, tpot, limits["ttft"], limits["tpot"])
return False
if self.mlperf_log["requested_use_token_latencies"]:
if self.scenario not in ["Server", "Interactive"]:
# For offline, singlestream and multistream no further checks are
Expand Down Expand Up @@ -457,7 +489,7 @@ def inferred_check(self):
("singlestream", "offline")
]
if (self.scenario.lower(), self.scenario_fixed.lower()
) not in list_inferred:
) not in list_inferred:
self.log.error(
"Result for scenario %s can not be inferred from %s for: %s",
self.scenario_fixed,
Expand Down Expand Up @@ -485,10 +517,15 @@ def get_performance_metric_check(self):
):
is_valid = True
scenario = self.mlperf_log["effective_scenario"]
if self.is_endpoints:
if scenario.lower() == "online":
scenario = "Server"
scenario = scenario.capitalize()

res = float(self.mlperf_log[RESULT_FIELD_NEW[version][scenario]])
if (
version in RESULT_FIELD_BENCHMARK_OVERWRITE
not self.is_endpoints
and version in RESULT_FIELD_BENCHMARK_OVERWRITE
and self.model in RESULT_FIELD_BENCHMARK_OVERWRITE[version]
and scenario in RESULT_FIELD_BENCHMARK_OVERWRITE[version][self.model]
):
Expand Down Expand Up @@ -548,12 +585,12 @@ def get_inferred_result(self, res):
res = qps_wo_loadgen_overhead

if (scenario_fixed in ["Offline"]
) and scenario in ["MultiStream"]:
) and scenario in ["MultiStream"]:
inferred = True
res = samples_per_query * S_TO_MS / (latency_mean / MS_TO_NS)

if (scenario_fixed in ["MultiStream"]
) and scenario in ["SingleStream"]:
) and scenario in ["SingleStream"]:
inferred = True
# samples_per_query does not match with the one reported in the logs
# when inferring MultiStream from SingleStream
Expand All @@ -570,6 +607,6 @@ def get_inferred_result(self, res):
else:
res = (latency_99_percentile * samples_per_query) / MS_TO_NS
if (scenario_fixed in ["Interactive"]
) and scenario not in ["Server"]:
) and scenario not in ["Server"]:
is_valid = False
return res, is_valid
5 changes: 4 additions & 1 deletion tools/submission/submission_checker/checks/power_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,9 @@ def setup_checks(self):
self.checks.append(self.required_files_check)
self.checks.append(self.external_power_check)
self.checks.append(self.get_power_metric_check)
self.is_endpoints = self.submission_logs.loader_data.get(
"is_endpoints_submission", False)
self.apply_checks = set(self.checks)

def required_files_check(self):
"""Verify required files exist in power-related directories.
Expand Down Expand Up @@ -226,7 +229,7 @@ def get_power_metric_check(self):
samples_per_query = 8

if (self.scenario_fixed.lower() in ["multistream"]
) and scenario.lower() in ["singlestream"]:
) and scenario.lower() in ["singlestream"]:
power_metric = (
avg_power * power_duration * samples_per_query * 1000 / num_queries
)
Expand Down
3 changes: 3 additions & 0 deletions tools/submission/submission_checker/checks/system_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,8 @@ def __init__(self, log, path, config: Config,
self.system_json = self.submission_logs.system_json
self.submitter = self.submission_logs.loader_data.get("submitter", "")
self.division = self.submission_logs.loader_data.get("division", "")
self.is_endpoints = self.submission_logs.loader_data.get(
"is_endpoints_submission", False)
self.config = config
self.setup_checks()

Expand All @@ -58,6 +60,7 @@ def setup_checks(self):
self.checks.append(self.required_fields_check)
self.checks.append(self.submitter_check)
self.checks.append(self.division_check)
self.apply_checks = set(self.checks)

def missing_check(self):
"""Ensure the system JSON file was provided.
Expand Down
Loading
Loading