From 6348746db7bebb6de5a1b4e8f6eab2771734e497 Mon Sep 17 00:00:00 2001
From: Arav Agarwal <aravagar@umich.edu>
Date: Wed, 22 Apr 2026 09:38:24 -0400
Subject: [PATCH 1/6] Initial test structure

---
 tests/conftest.py                             |   5 +
 tests/pytest.ini                              |   3 +
 tests/submission_checker/conftest.py          |  59 ++++++
 .../test_accuracy_parser.py                   | 144 +++++++++++++
 tests/submission_checker/test_base_check.py   |  98 +++++++++
 tests/submission_checker/test_config.py       | 153 ++++++++++++++
 .../submission_checker/test_loadgen_parser.py | 120 +++++++++++
 tests/submission_checker/test_utils.py        | 197 ++++++++++++++++++
 8 files changed, 779 insertions(+)
 create mode 100644 tests/conftest.py
 create mode 100644 tests/pytest.ini
 create mode 100644 tests/submission_checker/conftest.py
 create mode 100644 tests/submission_checker/test_accuracy_parser.py
 create mode 100644 tests/submission_checker/test_base_check.py
 create mode 100644 tests/submission_checker/test_config.py
 create mode 100644 tests/submission_checker/test_loadgen_parser.py
 create mode 100644 tests/submission_checker/test_utils.py

diff --git a/tests/conftest.py b/tests/conftest.py
new file mode 100644
index 0000000000..2f0f4db9b7
--- /dev/null
+++ b/tests/conftest.py
@@ -0,0 +1,5 @@
+import sys
+import os
+
+# Ensure tools/submission is on the path so `import submission_checker` resolves.
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "tools", "submission"))
diff --git a/tests/pytest.ini b/tests/pytest.ini
new file mode 100644
index 0000000000..32ecb85b0d
--- /dev/null
+++ b/tests/pytest.ini
@@ -0,0 +1,3 @@
+[pytest]
+testpaths = .
+pythonpath = ../tools/submission
diff --git a/tests/submission_checker/conftest.py b/tests/submission_checker/conftest.py
new file mode 100644
index 0000000000..cc14060905
--- /dev/null
+++ b/tests/submission_checker/conftest.py
@@ -0,0 +1,59 @@
+import json
+import pytest
+
+
+MLLOG_MARKER = ":::MLLOG"
+
+
+def make_mllog_line(key, value, is_error=False, is_warning=False):
+    entry = {
+        "key": key,
+        "value": value,
+        "time_ms": 0,
+        "namespace": "",
+        "event_type": "POINT_IN_TIME",
+        "metadata": {
+            "file": "test.py",
+            "line_no": 1,
+            "is_error": is_error,
+            "is_warning": is_warning,
+        },
+    }
+    return f"{MLLOG_MARKER} {json.dumps(entry)}\n"
+
+
+@pytest.fixture()
+def simple_mllog(tmp_path):
+    """A minimal valid MLPerf log with two entries."""
+    p = tmp_path / "mlperf_log_detail.txt"
+    lines = [
+        make_mllog_line("result_validity", "VALID"),
+        make_mllog_line("effective_scenario", "Offline"),
+        make_mllog_line("result_samples_per_second", 123.4),
+    ]
+    p.write_text("".join(lines))
+    return p
+
+
+@pytest.fixture()
+def mllog_with_error(tmp_path):
+    """An MLPerf log containing one error entry."""
+    p = tmp_path / "mlperf_log_detail.txt"
+    lines = [
+        make_mllog_line("result_validity", "INVALID"),
+        make_mllog_line("loadgen_error", "something went wrong", is_error=True),
+    ]
+    p.write_text("".join(lines))
+    return p
+
+
+@pytest.fixture()
+def mllog_duplicate_key(tmp_path):
+    """An MLPerf log with the same key appearing twice."""
+    p = tmp_path / "mlperf_log_detail.txt"
+    lines = [
+        make_mllog_line("seeds", 1234),
+        make_mllog_line("seeds", 5678),
+    ]
+    p.write_text("".join(lines))
+    return p
diff --git a/tests/submission_checker/test_accuracy_parser.py b/tests/submission_checker/test_accuracy_parser.py
new file mode 100644
index 0000000000..f61b0370b8
--- /dev/null
+++ b/tests/submission_checker/test_accuracy_parser.py
@@ -0,0 +1,144 @@
+import pytest
+from submission_checker.parsers.accuracy_parser import parse_line
+
+
+# ---------------------------------------------------------------------------
+# Regex-backed metrics
+# ---------------------------------------------------------------------------
+
+class TestAccMetric:
+    def test_plain_accuracy_line(self):
+        assert parse_line("accuracy = 76.50", "acc") == pytest.approx(76.50)
+
+    def test_json_style_accuracy_line(self):
+        assert parse_line('{"accuracy": 76.50}', "acc") == pytest.approx(76.50)
+
+    def test_no_match_returns_none(self):
+        assert parse_line("something else entirely", "acc") is None
+
+
+class TestAUCMetric:
+    def test_auc_line(self):
+        assert parse_line("AUC=80.31", "AUC") == pytest.approx(80.31)
+
+    def test_auc_with_trailing_text(self):
+        assert parse_line("AUC=80.31 (threshold=0.5)", "AUC") == pytest.approx(80.31)
+
+    def test_no_match_returns_none(self):
+        assert parse_line("accuracy = 80.31", "AUC") is None
+
+
+class TestMAPMetric:
+    def test_map_equals_format(self):
+        assert parse_line("mAP=37.55", "mAP") == pytest.approx(37.55)
+
+    def test_map_total_dict_format(self):
+        assert parse_line("'Total': 37.55", "mAP") == pytest.approx(37.55)
+
+    def test_no_match_returns_none(self):
+        assert parse_line("Average Precision = 37.55", "mAP") is None
+
+
+class TestACCURACYMetric:
+    def test_wer_accuracy_line(self):
+        val = parse_line("Word Error Rate: 4.5%, accuracy=95.5%", "ACCURACY")
+        assert val == pytest.approx(95.5)
+
+    def test_no_match_returns_none(self):
+        assert parse_line("accuracy=95.5%", "ACCURACY") is None
+
+
+class TestDICEMetric:
+    def test_dice_line(self):
+        assert parse_line("Accuracy: mean = 0.86170", "DICE") == pytest.approx(0.86170)
+
+    def test_no_match_returns_none(self):
+        assert parse_line("mean accuracy 0.86", "DICE") is None
+
+
+class TestDLRMMetrics:
+    def test_dlrm_ne(self):
+        val = parse_line("metric/lifetime_ne/rating: 0.8500", "DLRM_NE")
+        assert val == pytest.approx(0.85)
+
+    def test_dlrm_acc(self):
+        val = parse_line("metric/lifetime_accuracy/rating: 0.9200", "DLRM_ACC")
+        assert val == pytest.approx(0.92)
+
+    def test_dlrm_auc(self):
+        val = parse_line("metric/lifetime_gauc/rating: 0.8100", "DLRM_AUC")
+        assert val == pytest.approx(0.81)
+
+
+# ---------------------------------------------------------------------------
+# Dict-backed metrics (ast.literal_eval)
+# ---------------------------------------------------------------------------
+
+class TestROUGEMetrics:
+    ROUGE_LINE = "{'rouge1': 44.43, 'rouge2': 22.04, 'rougeL': 28.62, 'rougeLsum': 35.0, 'gen_len': 8167644}"
+
+    def test_rouge1(self):
+        assert parse_line(self.ROUGE_LINE, "ROUGE1") == pytest.approx(44.43)
+
+    def test_rouge2(self):
+        assert parse_line(self.ROUGE_LINE, "ROUGE2") == pytest.approx(22.04)
+
+    def test_rougel(self):
+        assert parse_line(self.ROUGE_LINE, "ROUGEL") == pytest.approx(28.62)
+
+    def test_rougelsum(self):
+        assert parse_line(self.ROUGE_LINE, "ROUGELSUM") == pytest.approx(35.0)
+
+    def test_gen_len(self):
+        assert parse_line(self.ROUGE_LINE, "GEN_LEN") == pytest.approx(8167644)
+
+    def test_no_dict_returns_none(self):
+        assert parse_line("rouge1 = 44.43", "ROUGE1") is None
+
+
+class TestTokensPerSample:
+    def test_tokens_per_sample(self):
+        line = "{'tokens_per_sample': 294.45}"
+        assert parse_line(line, "TOKENS_PER_SAMPLE") == pytest.approx(294.45)
+
+
+class TestCLIPAndFIDMetrics:
+    CLIP_LINE = "Accuracy Results: {'CLIP_SCORE': 31.69, 'FID_SCORE': 23.01}"
+
+    def test_clip_score(self):
+        assert parse_line(self.CLIP_LINE, "CLIP_SCORE") == pytest.approx(31.69)
+
+    def test_fid_score(self):
+        assert parse_line(self.CLIP_LINE, "FID_SCORE") == pytest.approx(23.01)
+
+    def test_clip_score_missing_prefix_returns_none(self):
+        assert parse_line("{'CLIP_SCORE': 31.69}", "CLIP_SCORE") is None
+
+
+# ---------------------------------------------------------------------------
+# JSON-backed metrics
+# ---------------------------------------------------------------------------
+
+class TestF1Metric:
+    def test_f1_line(self):
+        assert parse_line('{"f1": 90.874}', "F1") == pytest.approx(90.874)
+
+    def test_f1_with_prefix(self):
+        assert parse_line('prefix text {"f1": 90.874}', "F1") == pytest.approx(90.874)
+
+    def test_f1_hierarchical(self):
+        assert parse_line('{"f1": 85.0}', "F1_HIERARCHICAL") == pytest.approx(85.0)
+
+    def test_no_json_returns_none(self):
+        assert parse_line("f1 = 90.874", "F1") is None
+
+    def test_missing_key_returns_none(self):
+        assert parse_line('{"score": 90.874}', "F1") is None
+
+
+# ---------------------------------------------------------------------------
+# Unknown metric
+# ---------------------------------------------------------------------------
+
+def test_unknown_metric_returns_none():
+    assert parse_line("accuracy = 75.0", "UNKNOWN_METRIC") is None
diff --git a/tests/submission_checker/test_base_check.py b/tests/submission_checker/test_base_check.py
new file mode 100644
index 0000000000..2783f5cc41
--- /dev/null
+++ b/tests/submission_checker/test_base_check.py
@@ -0,0 +1,98 @@
+import logging
+import pytest
+from submission_checker.checks.base import BaseCheck
+
+
+log = logging.getLogger("test")
+
+
+class AlwaysPassCheck(BaseCheck):
+    def __init__(self):
+        super().__init__(log, "/fake/path")
+        self.checks = [self.check_a, self.check_b]
+
+    def check_a(self):
+        return True
+
+    def check_b(self):
+        return True
+
+
+class SomeFailCheck(BaseCheck):
+    def __init__(self):
+        super().__init__(log, "/fake/path")
+        self.checks = [self.pass_check, self.fail_check]
+
+    def pass_check(self):
+        return True
+
+    def fail_check(self):
+        return False
+
+
+class ExceptionCheck(BaseCheck):
+    def __init__(self):
+        super().__init__(log, "/fake/path")
+        self.checks = [self.boom]
+
+    def boom(self):
+        raise RuntimeError("intentional failure")
+
+
+class EmptyCheck(BaseCheck):
+    def __init__(self):
+        super().__init__(log, "/fake/path")
+        self.checks = []
+
+
+# ---------------------------------------------------------------------------
+# run_checks
+# ---------------------------------------------------------------------------
+
+class TestRunChecks:
+    def test_all_pass_returns_true(self):
+        assert AlwaysPassCheck().run_checks() is True
+
+    def test_any_fail_returns_false(self):
+        assert SomeFailCheck().run_checks() is False
+
+    def test_exception_treated_as_failure(self):
+        assert ExceptionCheck().run_checks() is False
+
+    def test_no_checks_returns_true(self):
+        assert EmptyCheck().run_checks() is True
+
+
+# ---------------------------------------------------------------------------
+# __call__
+# ---------------------------------------------------------------------------
+
+class TestCall:
+    def test_callable_returns_true_when_all_pass(self):
+        assert AlwaysPassCheck()() is True
+
+    def test_callable_returns_false_when_any_fail(self):
+        assert SomeFailCheck()() is False
+
+
+# ---------------------------------------------------------------------------
+# execute
+# ---------------------------------------------------------------------------
+
+def test_execute_delegates_to_check_method():
+    checker = AlwaysPassCheck()
+    assert checker.execute(checker.check_a) is True
+
+
+# ---------------------------------------------------------------------------
+# Attributes
+# ---------------------------------------------------------------------------
+
+def test_path_stored():
+    checker = AlwaysPassCheck()
+    assert checker.path == "/fake/path"
+
+
+def test_log_stored():
+    checker = AlwaysPassCheck()
+    assert checker.log is log
diff --git a/tests/submission_checker/test_config.py b/tests/submission_checker/test_config.py
new file mode 100644
index 0000000000..e9adbc83cc
--- /dev/null
+++ b/tests/submission_checker/test_config.py
@@ -0,0 +1,153 @@
+import pytest
+from submission_checker.configuration.configuration import Config
+
+
+@pytest.fixture()
+def cfg():
+    return Config(version="v6.0", extra_model_benchmark_map={})
+
+
+# ---------------------------------------------------------------------------
+# Initialization
+# ---------------------------------------------------------------------------
+
+class TestConfigInit:
+    def test_version_stored(self, cfg):
+        assert cfg.version == "v6.0"
+
+    def test_models_populated(self, cfg):
+        assert "resnet" in cfg.models
+        assert "bert-99" in cfg.models
+
+    def test_unknown_version_raises(self):
+        with pytest.raises((KeyError, TypeError)):
+            Config(version="v99.99", extra_model_benchmark_map={})
+
+
+# ---------------------------------------------------------------------------
+# set_type
+# ---------------------------------------------------------------------------
+
+class TestSetType:
+    def test_datacenter_sets_required(self, cfg):
+        cfg.set_type("datacenter")
+        assert cfg.required is not None
+
+    def test_edge_sets_required(self, cfg):
+        cfg.set_type("edge")
+        assert cfg.required is not None
+
+    def test_combined_accepted(self, cfg):
+        cfg.set_type("datacenter,edge")
+        assert cfg.required is not None
+
+    def test_combined_reversed_accepted(self, cfg):
+        cfg.set_type("edge,datacenter")
+        assert cfg.required is not None
+
+    def test_invalid_type_raises(self, cfg):
+        with pytest.raises(ValueError, match="invalid system type"):
+            cfg.set_type("cloud")
+
+
+# ---------------------------------------------------------------------------
+# get_mlperf_model
+# ---------------------------------------------------------------------------
+
+class TestGetMlperfModel:
+    def test_official_name_passthrough(self, cfg):
+        assert cfg.get_mlperf_model("resnet") == "resnet"
+
+    def test_resnet50_maps_to_resnet(self, cfg):
+        assert cfg.get_mlperf_model("resnet50") == "resnet"
+
+    def test_mobilenet_maps_to_resnet(self, cfg):
+        assert cfg.get_mlperf_model("mobilenet-v1") == "resnet"
+
+    def test_bert_99_variant(self, cfg):
+        assert cfg.get_mlperf_model("bert-99-large") == "bert-99"
+
+    def test_extra_mapping_used(self, cfg):
+        assert cfg.get_mlperf_model("my_resnet", {"my_resnet": "resnet"}) == "resnet"
+
+
+# ---------------------------------------------------------------------------
+# get_required / get_optional
+# ---------------------------------------------------------------------------
+
+class TestGetRequired:
+    def test_resnet_edge_requires_three_scenarios(self, cfg):
+        cfg.set_type("edge")
+        req = cfg.get_required("resnet")
+        assert req == {"SingleStream", "MultiStream", "Offline"}
+
+    def test_unknown_model_returns_none(self, cfg):
+        cfg.set_type("edge")
+        assert cfg.get_required("nonexistent-model") is None
+
+    def test_optional_empty_set_for_unknown(self, cfg):
+        cfg.set_type("edge")
+        assert cfg.get_optional("nonexistent-model") == set()
+
+
+# ---------------------------------------------------------------------------
+# get_accuracy_target
+# ---------------------------------------------------------------------------
+
+class TestGetAccuracyTarget:
+    def test_resnet_accuracy_target(self, cfg):
+        target = cfg.get_accuracy_target("resnet")
+        assert target is not None
+        assert target[0] == "acc"
+        assert target[1] == pytest.approx(76.46 * 0.99)
+
+    def test_unknown_model_raises(self, cfg):
+        with pytest.raises(ValueError, match="model not known"):
+            cfg.get_accuracy_target("not-a-model")
+
+
+# ---------------------------------------------------------------------------
+# get_delta_perc
+# ---------------------------------------------------------------------------
+
+class TestGetDeltaPerc:
+    def test_standard_model_defaults_to_1(self, cfg):
+        assert cfg.get_delta_perc("resnet", "acc") == 1
+
+    def test_high_accuracy_model_defaults_to_0_1(self, cfg):
+        assert cfg.get_delta_perc("bert-99.9", "F1") == pytest.approx(0.1)
+
+
+# ---------------------------------------------------------------------------
+# Boolean helpers
+# ---------------------------------------------------------------------------
+
+class TestBooleanHelpers:
+    def test_uses_early_stopping_server(self, cfg):
+        assert cfg.uses_early_stopping("Server") is True
+
+    def test_uses_early_stopping_offline_false(self, cfg):
+        assert cfg.uses_early_stopping("Offline") is False
+
+    def test_has_new_logging_format(self, cfg):
+        assert cfg.has_new_logging_format() is True
+
+
+# ---------------------------------------------------------------------------
+# get_llm_models
+# ---------------------------------------------------------------------------
+
+def test_llm_models_include_llama(cfg):
+    llms = cfg.get_llm_models()
+    assert any("llama" in m for m in llms)
+
+
+# ---------------------------------------------------------------------------
+# ignore_errors
+# ---------------------------------------------------------------------------
+
+def test_ignore_errors_matches_configured_string(cfg):
+    # ignore_errors is driven by base["ignore_errors"]; we just verify it
+    # doesn't crash and returns a bool
+    result = cfg.ignore_errors("some random log line")
+    assert isinstance(result, bool)
diff --git a/tests/submission_checker/test_loadgen_parser.py b/tests/submission_checker/test_loadgen_parser.py
new file mode 100644
index 0000000000..8fc519a783
--- /dev/null
+++ b/tests/submission_checker/test_loadgen_parser.py
@@ -0,0 +1,120 @@
+import json
+import pytest
+from submission_checker.parsers.loadgen_parser import LoadgenParser
+
+
+MLLOG_MARKER = ":::MLLOG"
+
+
+def make_mllog_line(key, value, is_error=False, is_warning=False):
+    entry = {
+        "key": key,
+        "value": value,
+        "time_ms": 0,
+        "namespace": "",
+        "event_type": "POINT_IN_TIME",
+        "metadata": {
+            "file": "test.py",
+            "line_no": 1,
+            "is_error": is_error,
+            "is_warning": is_warning,
+        },
+    }
+    return f"{MLLOG_MARKER} {json.dumps(entry)}\n"
+
+
+class TestLoadgenParserBasic:
+    def test_parses_valid_log(self, simple_mllog):
+        p = LoadgenParser(str(simple_mllog))
+        assert "result_validity" in p.get_keys()
+        assert "effective_scenario" in p.get_keys()
+
+    def test_getitem_returns_first_value(self, simple_mllog):
+        p = LoadgenParser(str(simple_mllog))
+        assert p["result_validity"] == "VALID"
+        assert p["effective_scenario"] == "Offline"
+
+    def test_getitem_missing_key_returns_none(self, simple_mllog):
+        p = LoadgenParser(str(simple_mllog))
+        assert p["nonexistent_key"] is None
+
+    def test_num_messages(self, simple_mllog):
+        p = LoadgenParser(str(simple_mllog))
+        assert p.num_messages() == 3
+
+    def test_get_messages_is_dict(self, simple_mllog):
+        p = LoadgenParser(str(simple_mllog))
+        assert isinstance(p.get_messages(), dict)
+
+
+class TestLoadgenParserErrors:
+    def test_no_error_in_clean_log(self, simple_mllog):
+        p = LoadgenParser(str(simple_mllog))
+        assert p.num_errors() == 0
+        assert not p.has_error()
+
+    def test_detects_error_entry(self, mllog_with_error):
+        p = LoadgenParser(str(mllog_with_error))
+        assert p.num_errors() == 1
+        assert p.has_error()
+
+    def test_get_errors_returns_list(self, mllog_with_error):
+        p = LoadgenParser(str(mllog_with_error))
+        errors = p.get_errors()
+        assert len(errors) == 1
+        assert errors[0]["key"] == "loadgen_error"
+
+
+class TestLoadgenParserDuplicateKeys:
+    def test_duplicate_key_stored_twice(self, mllog_duplicate_key):
+        p = LoadgenParser(str(mllog_duplicate_key))
+        entries = p.get("seeds")
+        assert len(entries) == 2
+
+    def test_getitem_returns_first_on_duplicate(self, mllog_duplicate_key):
+        p = LoadgenParser(str(mllog_duplicate_key))
+        assert p["seeds"] == 1234
+
+
+class TestLoadgenParserStrict:
+    def test_invalid_first_line_raises(self, tmp_path):
+        bad = tmp_path / "bad.txt"
+        bad.write_text("not a valid mllog line\n")
+        with pytest.raises(RuntimeError, match="Marker not found"):
+            LoadgenParser(str(bad))
+
+    def test_invalid_json_strict_raises(self, tmp_path):
+        p = tmp_path / "log.txt"
+        p.write_text(":::MLLOG not-valid-json\n")
+        with pytest.raises(RuntimeError):
+            LoadgenParser(str(p), strict=True)
+
+    def test_invalid_json_non_strict_skips(self, tmp_path):
+        p = tmp_path / "log.txt"
+        valid_line = make_mllog_line("result_validity", "VALID")
+        p.write_text(valid_line + ":::MLLOG not-valid-json\n")
+        parser = LoadgenParser(str(p), strict=False)
+        assert parser["result_validity"] == "VALID"
+
+
+class TestLoadgenParserEndpoints:
+    def test_endpoints_marker_accepted(self, tmp_path):
+        p = tmp_path / "log.txt"
+        entry = json.dumps({
+            "key": "endpoint_key",
+            "value": "endpoint_value",
+            "metadata": {"is_error": False, "is_warning": False},
+        })
+        p.write_text(f":::ENDPTS {entry}\n")
+        parser = LoadgenParser(str(p))
+        assert parser.log_is_endpoints
+        assert parser["endpoint_key"] == "endpoint_value"
+
+
+class TestLoadgenParserDump:
+    def test_dump_writes_json(self, simple_mllog, tmp_path):
+        parser = LoadgenParser(str(simple_mllog))
+        out = tmp_path / "out.json"
+        parser.dump(str(out))
+        data = json.loads(out.read_text())
+        assert "result_validity" in data
diff --git a/tests/submission_checker/test_utils.py b/tests/submission_checker/test_utils.py
new file mode 100644
index 0000000000..f1513c91d0
--- /dev/null
+++ b/tests/submission_checker/test_utils.py
@@ -0,0 +1,197 @@
+import os
+import pytest
+from submission_checker.utils import (
+    files_diff,
+    get_boolean,
+    is_number,
+    lower_list,
+    contains_list,
+    merge_two_dict,
+    sum_dict_values,
+    split_path,
+    list_dir,
+    list_files,
+    list_empty_dirs_recursively,
+    list_files_recursively,
+)
+
+
+# ---------------------------------------------------------------------------
+# files_diff
+# ---------------------------------------------------------------------------
+
+class TestFilesDiff:
+    def test_identical_lists_no_diff(self):
+        assert files_diff(["a.txt", "b.txt"], ["a.txt", "b.txt"]) == set()
+
+    def test_missing_file_reported(self):
+        diff = files_diff(["a.txt"], ["a.txt", "b.txt"])
+        assert "b.txt" in diff
+
+    def test_extra_file_reported(self):
+        diff = files_diff(["a.txt", "extra.txt"], ["a.txt"])
+        assert "extra.txt" in diff
+
+    def test_optional_files_ignored(self):
+        # mlperf_log_trace.json is always optional
+        diff = files_diff(["a.txt", "mlperf_log_trace.json"], ["a.txt"])
+        assert diff == set()
+
+    def test_custom_optional_ignored(self):
+        diff = files_diff(["a.txt", "custom.json"], ["a.txt"], optional=["custom.json"])
+        assert diff == set()
+
+
+# ---------------------------------------------------------------------------
+# get_boolean
+# ---------------------------------------------------------------------------
+
+class TestGetBoolean:
+    @pytest.mark.parametrize("val", [True, "true", "True", "TRUE", 1])
+    def test_truthy_values(self, val):
+        assert get_boolean(val) is True
+
+    @pytest.mark.parametrize("val", [False, "false", "False", "FALSE", 0])
+    def test_falsy_values(self, val):
+        assert get_boolean(val) is False
+
+    def test_none_returns_false(self):
+        assert get_boolean(None) is False
+
+    def test_invalid_type_raises(self):
+        with pytest.raises(TypeError):
+            get_boolean([])
+
+
+# ---------------------------------------------------------------------------
+# is_number
+# ---------------------------------------------------------------------------
+
+class TestIsNumber:
+    @pytest.mark.parametrize("val", ["3.14", "0", "-1", "1e5", "123"])
+    def test_numeric_strings(self, val):
+        assert is_number(val) is True
+
+    @pytest.mark.parametrize("val", ["abc", "", "1.2.3"])
+    def test_non_numeric_strings(self, val):
+        assert is_number(val) is False
+
+    def test_nan_is_numeric(self):
+        # float("NaN") succeeds in Python, so is_number returns True
+        assert is_number("NaN") is True
+
+
+# ---------------------------------------------------------------------------
+# lower_list
+# ---------------------------------------------------------------------------
+
+def test_lower_list_converts_to_lowercase():
+    assert lower_list(["Hello", "WORLD", "123"]) == ["hello", "world", "123"]
+
+
+def test_lower_list_empty():
+    assert lower_list([]) == []
+
+
+# ---------------------------------------------------------------------------
+# contains_list
+# ---------------------------------------------------------------------------
+
+class TestContainsList:
+    def test_all_present(self):
+        missing, ok = contains_list(["a", "b", "c"], ["a", "b"])
+        assert ok is True
+        assert missing == []
+
+    def test_some_missing(self):
+        missing, ok = contains_list(["a"], ["a", "b"])
+        assert ok is False
+        assert "b" in missing
+
+    def test_empty_needle(self):
+        _, ok = contains_list(["a"], [])
+        assert ok is True
+
+
+# ---------------------------------------------------------------------------
+# merge_two_dict
+# ---------------------------------------------------------------------------
+
+class TestMergeTwoDict:
+    def test_disjoint_dicts_merged(self):
+        result = merge_two_dict({"a": 1}, {"b": 2})
+        assert result == {"a": 1, "b": 2}
+
+    def test_overlapping_keys_summed(self):
+        result = merge_two_dict({"a": [1]}, {"a": [2]})
+        assert result == {"a": [1, 2]}
+
+    def test_original_not_mutated(self):
+        x = {"a": 1}
+        merge_two_dict(x, {"b": 2})
+        assert x == {"a": 1}
+
+
+# ---------------------------------------------------------------------------
+# sum_dict_values
+# ---------------------------------------------------------------------------
+
+def test_sum_dict_values():
+    assert sum_dict_values({"a": 1, "b": 2, "c": 3}) == 6
+
+
+def test_sum_dict_values_empty():
+    assert sum_dict_values({}) == 0
+
+
+# ---------------------------------------------------------------------------
+# split_path
+# ---------------------------------------------------------------------------
+
+def test_split_path_unix():
+    assert split_path("foo/bar/baz") == ["foo", "bar", "baz"]
+
+
+def test_split_path_windows_backslash():
+    assert split_path("foo\\bar\\baz") == ["foo", "bar", "baz"]
+
+
+# ---------------------------------------------------------------------------
+# Filesystem helpers (use tmp_path)
+# ---------------------------------------------------------------------------
+
+@pytest.fixture()
+def sample_tree(tmp_path):
+    """Build subA/ (file1.txt, file2.txt) and an empty subB/ in tmp_path."""
+    (tmp_path / "subA").mkdir()
+    (tmp_path / "subB").mkdir()  # deliberately left empty
+    (tmp_path / "subA" / "file1.txt").write_text("x")
+    (tmp_path / "subA" / "file2.txt").write_text("y")
+    return tmp_path
+
+
+def test_list_dir(sample_tree):
+    dirs = sorted(list_dir(str(sample_tree)))  # order-agnostic comparison
+    assert dirs == ["subA", "subB"]
+
+
+def test_list_files(sample_tree):
+    files = sorted(list_files(str(sample_tree / "subA")))  # order-agnostic
+    assert files == ["file1.txt", "file2.txt"]
+
+
+def test_list_empty_dirs(tmp_path):
+    empty = tmp_path / "empty"
+    empty.mkdir()
+    (tmp_path / "nonempty").mkdir()
+    (tmp_path / "nonempty" / "f.txt").write_text("x")
+    empties = list_empty_dirs_recursively(str(tmp_path))
+    assert str(empty) in empties
+    assert str(tmp_path / "nonempty") not in empties
+
+
+def test_list_files_recursively(sample_tree):
+    files = list_files_recursively(str(sample_tree))
+    names = [os.path.basename(f) for f in files]
+    assert "file1.txt" in names
+    assert "file2.txt" in names

From 38ec3cfaa446a17c1762e2042dd2d4d1448b6f14 Mon Sep 17 00:00:00 2001
From: Arav Agarwal <aravagar@umich.edu>
Date: Wed, 22 Apr 2026 09:41:54 -0400
Subject: [PATCH 2/6] TEST ONLY codecov impl

---
 .github/workflows/codecov.yml | 43 +++++++++++++++++++++++++++++++++++
 1 file changed, 43 insertions(+)
 create mode 100644 .github/workflows/codecov.yml

diff --git a/.github/workflows/codecov.yml b/.github/workflows/codecov.yml
new file mode 100644
index 0000000000..27df9e33ff
--- /dev/null
+++ b/.github/workflows/codecov.yml
@@ -0,0 +1,43 @@
+name: Tests and Coverage
+
+on:
+  push:
+    branches: ["arav-codecov-impl"]
+    paths:
+      - "tests/**"
+      - "tools/submission/submission_checker/**"
+      - ".github/workflows/codecov.yml"
+  pull_request:
+    branches: ["arav-codecov-impl"]
+    paths:
+      - "tests/**"
+      - "tools/submission/submission_checker/**"
+      - ".github/workflows/codecov.yml"
+
+jobs:
+  test:
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.10"
+
+      - name: Install test dependencies
+        run: pip install pytest pytest-cov
+
+      - name: Run tests with coverage
+        run: >
+          pytest tests/
+          --cov=.
+          --cov-report=xml
+          --cov-report=term-missing
+
+      - name: Upload coverage to Codecov
+        uses: codecov/codecov-action@v5
+        with:
+          files: coverage.xml
+          token: ${{ secrets.CODECOV_TOKEN }}

From 2c8c736e1bd27d276cb73a02892b1f5de99e5a70 Mon Sep 17 00:00:00 2001
From: "github-actions[bot]" <github-actions[bot]@users.noreply.github.com>
Date: Wed, 22 Apr 2026 13:42:40 +0000
Subject: [PATCH 3/6] [Automated Commit] Format Codebase

---
 tests/conftest.py                                | 11 +++++++++--
 tests/submission_checker/conftest.py             |  5 ++++-
 tests/submission_checker/test_accuracy_parser.py | 16 ++++++++++++----
 tests/submission_checker/test_config.py          |  4 +++-
 tests/submission_checker/test_utils.py           |  3 ++-
 5 files changed, 30 insertions(+), 9 deletions(-)

diff --git a/tests/conftest.py b/tests/conftest.py
index 2f0f4db9b7..f068daa9e8 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1,5 +1,12 @@
 import sys
 import os
 
-# Ensure tools/submission is on the path so `import submission_checker` resolves.
-sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "tools", "submission"))
+# Ensure tools/submission is on the path so `import submission_checker`
+# resolves.
+sys.path.insert(
+    0,
+    os.path.join(
+        os.path.dirname(__file__),
+        "..",
+        "tools",
+        "submission"))
diff --git a/tests/submission_checker/conftest.py b/tests/submission_checker/conftest.py
index cc14060905..fd990d5155 100644
--- a/tests/submission_checker/conftest.py
+++ b/tests/submission_checker/conftest.py
@@ -41,7 +41,10 @@ def mllog_with_error(tmp_path):
     p = tmp_path / "mlperf_log_detail.txt"
     lines = [
         make_mllog_line("result_validity", "INVALID"),
-        make_mllog_line("loadgen_error", "something went wrong", is_error=True),
+        make_mllog_line(
+            "loadgen_error",
+            "something went wrong",
+            is_error=True),
     ]
     p.write_text("".join(lines))
     return p
diff --git a/tests/submission_checker/test_accuracy_parser.py b/tests/submission_checker/test_accuracy_parser.py
index f61b0370b8..46e72a187c 100644
--- a/tests/submission_checker/test_accuracy_parser.py
+++ b/tests/submission_checker/test_accuracy_parser.py
@@ -22,7 +22,9 @@ def test_auc_line(self):
         assert parse_line("AUC=80.31", "AUC") == pytest.approx(80.31)
 
     def test_auc_with_trailing_text(self):
-        assert parse_line("AUC=80.31 (threshold=0.5)", "AUC") == pytest.approx(80.31)
+        assert parse_line(
+            "AUC=80.31 (threshold=0.5)",
+            "AUC") == pytest.approx(80.31)
 
     def test_no_match_returns_none(self):
         assert parse_line("accuracy = 80.31", "AUC") is None
@@ -50,7 +52,9 @@ def test_no_match_returns_none(self):
 
 class TestDICEMetric:
     def test_dice_line(self):
-        assert parse_line("Accuracy: mean = 0.86170", "DICE") == pytest.approx(0.86170)
+        assert parse_line(
+            "Accuracy: mean = 0.86170",
+            "DICE") == pytest.approx(0.86170)
 
     def test_no_match_returns_none(self):
         assert parse_line("mean accuracy 0.86", "DICE") is None
@@ -124,10 +128,14 @@ def test_f1_line(self):
         assert parse_line('{"f1": 90.874}', "F1") == pytest.approx(90.874)
 
     def test_f1_with_prefix(self):
-        assert parse_line('prefix text {"f1": 90.874}', "F1") == pytest.approx(90.874)
+        assert parse_line(
+            'prefix text {"f1": 90.874}',
+            "F1") == pytest.approx(90.874)
 
     def test_f1_hierarchical(self):
-        assert parse_line('{"f1": 85.0}', "F1_HIERARCHICAL") == pytest.approx(85.0)
+        assert parse_line(
+            '{"f1": 85.0}',
+            "F1_HIERARCHICAL") == pytest.approx(85.0)
 
     def test_no_json_returns_none(self):
         assert parse_line("f1 = 90.874", "F1") is None
diff --git a/tests/submission_checker/test_config.py b/tests/submission_checker/test_config.py
index e9adbc83cc..5d94989f0b 100644
--- a/tests/submission_checker/test_config.py
+++ b/tests/submission_checker/test_config.py
@@ -68,7 +68,9 @@ def test_bert_99_variant(self, cfg):
         assert cfg.get_mlperf_model("bert-99-large") == "bert-99"
 
     def test_extra_mapping_used(self, cfg):
-        assert cfg.get_mlperf_model("my_resnet", {"my_resnet": "resnet"}) == "resnet"
+        assert cfg.get_mlperf_model(
+            "my_resnet", {
+                "my_resnet": "resnet"}) == "resnet"
 
 
 # ---------------------------------------------------------------------------
diff --git a/tests/submission_checker/test_utils.py b/tests/submission_checker/test_utils.py
index f1513c91d0..156277e837 100644
--- a/tests/submission_checker/test_utils.py
+++ b/tests/submission_checker/test_utils.py
@@ -38,7 +38,8 @@ def test_optional_files_ignored(self):
         assert diff == set()
 
     def test_custom_optional_ignored(self):
-        diff = files_diff(["a.txt", "custom.json"], ["a.txt"], optional=["custom.json"])
+        diff = files_diff(["a.txt", "custom.json"], [
+                          "a.txt"], optional=["custom.json"])
         assert diff == set()
 
 
From fdc5b0ace480aab699a6df639d8d4f79898eb57a Mon Sep 17 00:00:00 2001
From: Arav Agarwal <aravagar@umich.edu>
Date: Wed, 22 Apr 2026 09:45:04 -0400
Subject: [PATCH 4/6] Remove bad file

---
 .../test_accuracy_parser.py                   | 144 ------------------
 1 file changed, 144 deletions(-)
 delete mode 100644 tests/submission_checker/test_accuracy_parser.py

diff --git a/tests/submission_checker/test_accuracy_parser.py b/tests/submission_checker/test_accuracy_parser.py
deleted file mode 100644
index f61b0370b8..0000000000
--- a/tests/submission_checker/test_accuracy_parser.py
+++ /dev/null
@@ -1,144 +0,0 @@
-import pytest
-from submission_checker.parsers.accuracy_parser import parse_line
-
-
-# ---------------------------------------------------------------------------
-# Regex-backed metrics
-# ---------------------------------------------------------------------------
-
-class TestAccMetric:
-    def test_plain_accuracy_line(self):
-        assert parse_line("accuracy = 76.50", "acc") == pytest.approx(76.50)
-
-    def test_json_style_accuracy_line(self):
-        assert parse_line('{"accuracy": 76.50}', "acc") == pytest.approx(76.50)
-
-    def test_no_match_returns_none(self):
-        assert parse_line("something else entirely", "acc") is None
-
-
-class TestAUCMetric:
-    def test_auc_line(self):
-        assert parse_line("AUC=80.31", "AUC") == pytest.approx(80.31)
-
-    def test_auc_with_trailing_text(self):
-        assert parse_line("AUC=80.31 (threshold=0.5)", "AUC") == pytest.approx(80.31)
-
-    def test_no_match_returns_none(self):
-        assert parse_line("accuracy = 80.31", "AUC") is None
-
-
-class TestMAPMetric:
-    def test_map_equals_format(self):
-        assert parse_line("mAP=37.55", "mAP") == pytest.approx(37.55)
-
-    def test_map_total_dict_format(self):
-        assert parse_line("'Total': 37.55", "mAP") == pytest.approx(37.55)
-
-    def test_no_match_returns_none(self):
-        assert parse_line("Average Precision = 37.55", "mAP") is None
-
-
-class TestACCURACYMetric:
-    def test_wer_accuracy_line(self):
-        val = parse_line("Word Error Rate: 4.5%, accuracy=95.5%", "ACCURACY")
-        assert val == pytest.approx(95.5)
-
-    def test_no_match_returns_none(self):
-        assert parse_line("accuracy=95.5%", "ACCURACY") is None
-
-
-class TestDICEMetric:
-    def test_dice_line(self):
-        assert parse_line("Accuracy: mean = 0.86170", "DICE") == pytest.approx(0.86170)
-
-    def test_no_match_returns_none(self):
-        assert parse_line("mean accuracy 0.86", "DICE") is None
-
-
-class TestDLRMMetrics:
-    def test_dlrm_ne(self):
-        val = parse_line("metric/lifetime_ne/rating: 0.8500", "DLRM_NE")
-        assert val == pytest.approx(0.85)
-
-    def test_dlrm_acc(self):
-        val = parse_line("metric/lifetime_accuracy/rating: 0.9200", "DLRM_ACC")
-        assert val == pytest.approx(0.92)
-
-    def test_dlrm_auc(self):
-        val = parse_line("metric/lifetime_gauc/rating: 0.8100", "DLRM_AUC")
-        assert val == pytest.approx(0.81)
-
-
-# ---------------------------------------------------------------------------
-# Dict-backed metrics (ast.literal_eval)
-# ---------------------------------------------------------------------------
-
-class TestROUGEMetrics:
-    ROUGE_LINE = "{'rouge1': 44.43, 'rouge2': 22.04, 'rougeL': 28.62, 'rougeLsum': 35.0, 'gen_len': 8167644}"
-
-    def test_rouge1(self):
-        assert parse_line(self.ROUGE_LINE, "ROUGE1") == pytest.approx(44.43)
-
-    def test_rouge2(self):
-        assert parse_line(self.ROUGE_LINE, "ROUGE2") == pytest.approx(22.04)
-
-    def test_rougel(self):
-        assert parse_line(self.ROUGE_LINE, "ROUGEL") == pytest.approx(28.62)
-
-    def test_rougelsum(self):
-        assert parse_line(self.ROUGE_LINE, "ROUGELSUM") == pytest.approx(35.0)
-
-    def test_gen_len(self):
-        assert parse_line(self.ROUGE_LINE, "GEN_LEN") == pytest.approx(8167644)
-
-    def test_no_dict_returns_none(self):
-        assert parse_line("rouge1 = 44.43", "ROUGE1") is None
-
-
-class TestTokensPerSample:
-    def test_tokens_per_sample(self):
-        line = "{'tokens_per_sample': 294.45}"
-        assert parse_line(line, "TOKENS_PER_SAMPLE") == pytest.approx(294.45)
-
-
-class TestCLIPAndFIDMetrics:
-    CLIP_LINE = "Accuracy Results: {'CLIP_SCORE': 31.69, 'FID_SCORE': 23.01}"
-
-    def test_clip_score(self):
-        assert parse_line(self.CLIP_LINE, "CLIP_SCORE") == pytest.approx(31.69)
-
-    def test_fid_score(self):
-        assert parse_line(self.CLIP_LINE, "FID_SCORE") == pytest.approx(23.01)
-
-    def test_clip_score_missing_prefix_returns_none(self):
-        assert parse_line("{'CLIP_SCORE': 31.69}", "CLIP_SCORE") is None
-
-
-# ---------------------------------------------------------------------------
-# JSON-backed metrics
-# ---------------------------------------------------------------------------
-
-class TestF1Metric:
-    def test_f1_line(self):
-        assert parse_line('{"f1": 90.874}', "F1") == pytest.approx(90.874)
-
-    def test_f1_with_prefix(self):
-        assert parse_line('prefix text {"f1": 90.874}', "F1") == pytest.approx(90.874)
-
-    def test_f1_hierarchical(self):
-        assert parse_line('{"f1": 85.0}', "F1_HIERARCHICAL") == pytest.approx(85.0)
-
-    def test_no_json_returns_none(self):
-        assert parse_line("f1 = 90.874", "F1") is None
-
-    def test_missing_key_returns_none(self):
-        assert parse_line('{"score": 90.874}', "F1") is None
-
-
-# ---------------------------------------------------------------------------
-# Unknown metric
-# ---------------------------------------------------------------------------
-
-def test_unknown_metric_returns_none():
-    assert parse_line("accuracy = 75.0", "UNKNOWN_METRIC") is None

From 22104d44f0d4bbb20f5d3b333db8e1bbd6096a68 Mon Sep 17 00:00:00 2001
From: Arav Agarwal <aravagar@umich.edu>
Date: Wed, 22 Apr 2026 09:48:58 -0400
Subject: [PATCH 5/6] test


From 7e121aa67bb2aab33813e0e793da9c1b90f2e0b9 Mon Sep 17 00:00:00 2001
From: Arav Agarwal <aravagar@umich.edu>
Date: Wed, 22 Apr 2026 11:43:30 -0400
Subject: [PATCH 6/6] Add __init__.py to files to ensure full coverage
 knowledge

---
 .coveragerc                                               | 8 ++++++++
 .github/workflows/codecov.yml                             | 2 +-
 automotive/3d-object-detection/__init__.py                | 0
 automotive/3d-object-detection/tools/__init__.py          | 0
 automotive/__init__.py                                    | 0
 calibration/BraTS/__init__.py                             | 0
 calibration/__init__.py                                   | 0
 compliance/TEST01/__init__.py                             | 0
 compliance/TEST04/__init__.py                             | 0
 compliance/TEST06/__init__.py                             | 0
 compliance/TEST07/__init__.py                             | 0
 compliance/TEST08/__init__.py                             | 0
 compliance/TEST09/__init__.py                             | 0
 compliance/__init__.py                                    | 0
 graph/R-GAT/__init__.py                                   | 0
 graph/R-GAT/dgl_utilities/__init__.py                     | 0
 graph/R-GAT/tools/__init__.py                             | 0
 graph/__init__.py                                         | 0
 language/__init__.py                                      | 0
 language/bert/__init__.py                                 | 0
 language/deepseek-r1/__init__.py                          | 0
 language/gpt-j/__init__.py                                | 0
 language/gpt-oss-120b/__init__.py                         | 0
 language/gpt-oss-120b/sglang/__init__.py                  | 0
 language/llama2-70b/__init__.py                           | 0
 language/llama3.1-405b/__init__.py                        | 0
 language/llama3.1-8b/__init__.py                          | 0
 language/mixtral-8x7b/__init__.py                         | 0
 language/mixtral-8x7b/standalone_infer/__init__.py        | 0
 loadgen/demos/__init__.py                                 | 0
 loadgen/demos/lon/__init__.py                             | 0
 loadgen/demos/token_metrics/__init__.py                   | 0
 loadgen/docs/__init__.py                                  | 0
 loadgen/docs/src/__init__.py                              | 0
 lon/__init__.py                                           | 0
 multimodal/__init__.py                                    | 0
 multimodal/qwen3-vl/__init__.py                           | 0
 multimodal/qwen3-vl/src/__init__.py                       | 0
 recommendation/__init__.py                                | 0
 recommendation/dlrm_v2/__init__.py                        | 0
 recommendation/dlrm_v2/pytorch/__init__.py                | 0
 recommendation/dlrm_v2/pytorch/tools/__init__.py          | 0
 recommendation/dlrm_v3/__init__.py                        | 0
 recommendation/dlrm_v3/datasets/__init__.py               | 0
 .../dlrm_v3/generative_recommenders/__init__.py           | 0
 .../dlrm_v3/generative_recommenders/modules/__init__.py   | 0
 .../dlrm_v3/generative_recommenders/ops/__init__.py       | 0
 .../generative_recommenders/ops/pytorch/__init__.py       | 0
 .../generative_recommenders/ops/triton/__init__.py        | 0
 retired_benchmarks/__init__.py                            | 0
 retired_benchmarks/never_adopted/__init__.py              | 0
 retired_benchmarks/never_adopted/language/__init__.py     | 0
 .../never_adopted/language/gpt3/__init__.py               | 0
 .../never_adopted/language/gpt3/megatron/__init__.py      | 0
 retired_benchmarks/recommendation/__init__.py             | 0
 retired_benchmarks/recommendation/dlrm/__init__.py        | 0
 .../recommendation/dlrm/pytorch/__init__.py               | 0
 .../recommendation/dlrm/pytorch/tools/__init__.py         | 0
 retired_benchmarks/speech_recognition/__init__.py         | 0
 retired_benchmarks/speech_recognition/rnnt/__init__.py    | 0
 .../speech_recognition/rnnt/pytorch/__init__.py           | 0
 retired_benchmarks/translation/__init__.py                | 0
 retired_benchmarks/translation/gnmt/__init__.py           | 0
 .../translation/gnmt/tensorflow/__init__.py               | 0
 retired_benchmarks/vision/__init__.py                     | 0
 .../vision/classification_and_detection/__init__.py       | 0
 .../python/models/__init__.py                             | 0
 .../vision/classification_and_detection/tools/__init__.py | 0
 .../vision/medical_imaging/3d-unet-brats19/__init__.py    | 0
 retired_benchmarks/vision/medical_imaging/__init__.py     | 0
 speech2text/__init__.py                                   | 0
 speech2text/utils/__init__.py                             | 0
 text_to_image/__init__.py                                 | 0
 text_to_image/tools/__init__.py                           | 0
 text_to_image/tools/clip/__init__.py                      | 0
 text_to_image/tools/fid/__init__.py                       | 0
 text_to_video/__init__.py                                 | 0
 text_to_video/wan-2.2-t2v-a14b/__init__.py                | 0
 tools/__init__.py                                         | 0
 tools/submission/__init__.py                              | 0
 tools/submission/power/__init__.py                        | 0
 .../submission_checker/checks/power/__init__.py           | 0
 tools/upscale_coco/__init__.py                            | 0
 vision/__init__.py                                        | 0
 vision/classification_and_detection/__init__.py           | 0
 .../python/models/__init__.py                             | 0
 vision/classification_and_detection/tools/__init__.py     | 0
 vision/classification_and_detection/yolo/__init__.py      | 0
 vision/medical_imaging/3d-unet-kits19/__init__.py         | 0
 vision/medical_imaging/__init__.py                        | 0
 90 files changed, 9 insertions(+), 1 deletion(-)
 create mode 100644 .coveragerc
 create mode 100644 automotive/3d-object-detection/__init__.py
 create mode 100644 automotive/3d-object-detection/tools/__init__.py
 create mode 100644 automotive/__init__.py
 create mode 100644 calibration/BraTS/__init__.py
 create mode 100644 calibration/__init__.py
 create mode 100644 compliance/TEST01/__init__.py
 create mode 100644 compliance/TEST04/__init__.py
 create mode 100644 compliance/TEST06/__init__.py
 create mode 100644 compliance/TEST07/__init__.py
 create mode 100644 compliance/TEST08/__init__.py
 create mode 100644 compliance/TEST09/__init__.py
 create mode 100644 compliance/__init__.py
 create mode 100644 graph/R-GAT/__init__.py
 create mode 100644 graph/R-GAT/dgl_utilities/__init__.py
 create mode 100644 graph/R-GAT/tools/__init__.py
 create mode 100644 graph/__init__.py
 create mode 100644 language/__init__.py
 create mode 100644 language/bert/__init__.py
 create mode 100644 language/deepseek-r1/__init__.py
 create mode 100644 language/gpt-j/__init__.py
 create mode 100644 language/gpt-oss-120b/__init__.py
 create mode 100644 language/gpt-oss-120b/sglang/__init__.py
 create mode 100644 language/llama2-70b/__init__.py
 create mode 100644 language/llama3.1-405b/__init__.py
 create mode 100644 language/llama3.1-8b/__init__.py
 create mode 100644 language/mixtral-8x7b/__init__.py
 create mode 100644 language/mixtral-8x7b/standalone_infer/__init__.py
 create mode 100644 loadgen/demos/__init__.py
 create mode 100644 loadgen/demos/lon/__init__.py
 create mode 100644 loadgen/demos/token_metrics/__init__.py
 create mode 100644 loadgen/docs/__init__.py
 create mode 100644 loadgen/docs/src/__init__.py
 create mode 100644 lon/__init__.py
 create mode 100644 multimodal/__init__.py
 create mode 100644 multimodal/qwen3-vl/__init__.py
 create mode 100644 multimodal/qwen3-vl/src/__init__.py
 create mode 100644 recommendation/__init__.py
 create mode 100644 recommendation/dlrm_v2/__init__.py
 create mode 100644 recommendation/dlrm_v2/pytorch/__init__.py
 create mode 100644 recommendation/dlrm_v2/pytorch/tools/__init__.py
 create mode 100644 recommendation/dlrm_v3/__init__.py
 create mode 100644 recommendation/dlrm_v3/datasets/__init__.py
 create mode 100644 recommendation/dlrm_v3/generative_recommenders/__init__.py
 create mode 100644 recommendation/dlrm_v3/generative_recommenders/modules/__init__.py
 create mode 100644 recommendation/dlrm_v3/generative_recommenders/ops/__init__.py
 create mode 100644 recommendation/dlrm_v3/generative_recommenders/ops/pytorch/__init__.py
 create mode 100644 recommendation/dlrm_v3/generative_recommenders/ops/triton/__init__.py
 create mode 100644 retired_benchmarks/__init__.py
 create mode 100644 retired_benchmarks/never_adopted/__init__.py
 create mode 100644 retired_benchmarks/never_adopted/language/__init__.py
 create mode 100644 retired_benchmarks/never_adopted/language/gpt3/__init__.py
 create mode 100644 retired_benchmarks/never_adopted/language/gpt3/megatron/__init__.py
 create mode 100644 retired_benchmarks/recommendation/__init__.py
 create mode 100644 retired_benchmarks/recommendation/dlrm/__init__.py
 create mode 100644 retired_benchmarks/recommendation/dlrm/pytorch/__init__.py
 create mode 100644 retired_benchmarks/recommendation/dlrm/pytorch/tools/__init__.py
 create mode 100644 retired_benchmarks/speech_recognition/__init__.py
 create mode 100644 retired_benchmarks/speech_recognition/rnnt/__init__.py
 create mode 100644 retired_benchmarks/speech_recognition/rnnt/pytorch/__init__.py
 create mode 100644 retired_benchmarks/translation/__init__.py
 create mode 100644 retired_benchmarks/translation/gnmt/__init__.py
 create mode 100644 retired_benchmarks/translation/gnmt/tensorflow/__init__.py
 create mode 100644 retired_benchmarks/vision/__init__.py
 create mode 100644 retired_benchmarks/vision/classification_and_detection/__init__.py
 create mode 100644 retired_benchmarks/vision/classification_and_detection/python/models/__init__.py
 create mode 100644 retired_benchmarks/vision/classification_and_detection/tools/__init__.py
 create mode 100644 retired_benchmarks/vision/medical_imaging/3d-unet-brats19/__init__.py
 create mode 100644 retired_benchmarks/vision/medical_imaging/__init__.py
 create mode 100644 speech2text/__init__.py
 create mode 100644 speech2text/utils/__init__.py
 create mode 100644 text_to_image/__init__.py
 create mode 100644 text_to_image/tools/__init__.py
 create mode 100644 text_to_image/tools/clip/__init__.py
 create mode 100644 text_to_image/tools/fid/__init__.py
 create mode 100644 text_to_video/__init__.py
 create mode 100644 text_to_video/wan-2.2-t2v-a14b/__init__.py
 create mode 100644 tools/__init__.py
 create mode 100644 tools/submission/__init__.py
 create mode 100644 tools/submission/power/__init__.py
 create mode 100644 tools/submission/submission_checker/checks/power/__init__.py
 create mode 100644 tools/upscale_coco/__init__.py
 create mode 100644 vision/__init__.py
 create mode 100644 vision/classification_and_detection/__init__.py
 create mode 100644 vision/classification_and_detection/python/models/__init__.py
 create mode 100644 vision/classification_and_detection/tools/__init__.py
 create mode 100644 vision/classification_and_detection/yolo/__init__.py
 create mode 100644 vision/medical_imaging/3d-unet-kits19/__init__.py
 create mode 100644 vision/medical_imaging/__init__.py

diff --git a/.coveragerc b/.coveragerc
new file mode 100644
index 0000000000..b0f808baef
--- /dev/null
+++ b/.coveragerc
@@ -0,0 +1,8 @@
+[run]
+source = .
+omit =
+    nev/*
+    tests/*
+    retired_benchmarks/*
+    */site-packages/*
+    */dist-packages/*
diff --git a/.github/workflows/codecov.yml b/.github/workflows/codecov.yml
index 27df9e33ff..5aca670a3f 100644
--- a/.github/workflows/codecov.yml
+++ b/.github/workflows/codecov.yml
@@ -11,7 +11,7 @@ on:
     branches: ["arav-codecov-impl"]
     paths:
       - "tests/**"
-      - "tools/submission/submission_checker/**"
+      - "**"
       - ".github/workflows/codecov.yml"
 
 jobs:
diff --git a/automotive/3d-object-detection/__init__.py b/automotive/3d-object-detection/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/automotive/3d-object-detection/tools/__init__.py b/automotive/3d-object-detection/tools/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/automotive/__init__.py b/automotive/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/calibration/BraTS/__init__.py b/calibration/BraTS/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/calibration/__init__.py b/calibration/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/compliance/TEST01/__init__.py b/compliance/TEST01/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/compliance/TEST04/__init__.py b/compliance/TEST04/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/compliance/TEST06/__init__.py b/compliance/TEST06/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/compliance/TEST07/__init__.py b/compliance/TEST07/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/compliance/TEST08/__init__.py b/compliance/TEST08/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/compliance/TEST09/__init__.py b/compliance/TEST09/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/compliance/__init__.py b/compliance/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/graph/R-GAT/__init__.py b/graph/R-GAT/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/graph/R-GAT/dgl_utilities/__init__.py b/graph/R-GAT/dgl_utilities/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/graph/R-GAT/tools/__init__.py b/graph/R-GAT/tools/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/graph/__init__.py b/graph/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/language/__init__.py b/language/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/language/bert/__init__.py b/language/bert/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/language/deepseek-r1/__init__.py b/language/deepseek-r1/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/language/gpt-j/__init__.py b/language/gpt-j/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/language/gpt-oss-120b/__init__.py b/language/gpt-oss-120b/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/language/gpt-oss-120b/sglang/__init__.py b/language/gpt-oss-120b/sglang/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/language/llama2-70b/__init__.py b/language/llama2-70b/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/language/llama3.1-405b/__init__.py b/language/llama3.1-405b/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/language/llama3.1-8b/__init__.py b/language/llama3.1-8b/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/language/mixtral-8x7b/__init__.py b/language/mixtral-8x7b/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/language/mixtral-8x7b/standalone_infer/__init__.py b/language/mixtral-8x7b/standalone_infer/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/loadgen/demos/__init__.py b/loadgen/demos/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/loadgen/demos/lon/__init__.py b/loadgen/demos/lon/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/loadgen/demos/token_metrics/__init__.py b/loadgen/demos/token_metrics/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/loadgen/docs/__init__.py b/loadgen/docs/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/loadgen/docs/src/__init__.py b/loadgen/docs/src/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/lon/__init__.py b/lon/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/multimodal/__init__.py b/multimodal/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/multimodal/qwen3-vl/__init__.py b/multimodal/qwen3-vl/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/multimodal/qwen3-vl/src/__init__.py b/multimodal/qwen3-vl/src/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/recommendation/__init__.py b/recommendation/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/recommendation/dlrm_v2/__init__.py b/recommendation/dlrm_v2/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/recommendation/dlrm_v2/pytorch/__init__.py b/recommendation/dlrm_v2/pytorch/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/recommendation/dlrm_v2/pytorch/tools/__init__.py b/recommendation/dlrm_v2/pytorch/tools/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/recommendation/dlrm_v3/__init__.py b/recommendation/dlrm_v3/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/recommendation/dlrm_v3/datasets/__init__.py b/recommendation/dlrm_v3/datasets/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/recommendation/dlrm_v3/generative_recommenders/__init__.py b/recommendation/dlrm_v3/generative_recommenders/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/recommendation/dlrm_v3/generative_recommenders/modules/__init__.py b/recommendation/dlrm_v3/generative_recommenders/modules/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/recommendation/dlrm_v3/generative_recommenders/ops/__init__.py b/recommendation/dlrm_v3/generative_recommenders/ops/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/recommendation/dlrm_v3/generative_recommenders/ops/pytorch/__init__.py b/recommendation/dlrm_v3/generative_recommenders/ops/pytorch/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/recommendation/dlrm_v3/generative_recommenders/ops/triton/__init__.py b/recommendation/dlrm_v3/generative_recommenders/ops/triton/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/retired_benchmarks/__init__.py b/retired_benchmarks/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/retired_benchmarks/never_adopted/__init__.py b/retired_benchmarks/never_adopted/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/retired_benchmarks/never_adopted/language/__init__.py b/retired_benchmarks/never_adopted/language/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/retired_benchmarks/never_adopted/language/gpt3/__init__.py b/retired_benchmarks/never_adopted/language/gpt3/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/retired_benchmarks/never_adopted/language/gpt3/megatron/__init__.py b/retired_benchmarks/never_adopted/language/gpt3/megatron/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/retired_benchmarks/recommendation/__init__.py b/retired_benchmarks/recommendation/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/retired_benchmarks/recommendation/dlrm/__init__.py b/retired_benchmarks/recommendation/dlrm/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/retired_benchmarks/recommendation/dlrm/pytorch/__init__.py b/retired_benchmarks/recommendation/dlrm/pytorch/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/retired_benchmarks/recommendation/dlrm/pytorch/tools/__init__.py b/retired_benchmarks/recommendation/dlrm/pytorch/tools/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/retired_benchmarks/speech_recognition/__init__.py b/retired_benchmarks/speech_recognition/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/retired_benchmarks/speech_recognition/rnnt/__init__.py b/retired_benchmarks/speech_recognition/rnnt/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/retired_benchmarks/speech_recognition/rnnt/pytorch/__init__.py b/retired_benchmarks/speech_recognition/rnnt/pytorch/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/retired_benchmarks/translation/__init__.py b/retired_benchmarks/translation/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/retired_benchmarks/translation/gnmt/__init__.py b/retired_benchmarks/translation/gnmt/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/retired_benchmarks/translation/gnmt/tensorflow/__init__.py b/retired_benchmarks/translation/gnmt/tensorflow/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/retired_benchmarks/vision/__init__.py b/retired_benchmarks/vision/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/retired_benchmarks/vision/classification_and_detection/__init__.py b/retired_benchmarks/vision/classification_and_detection/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/retired_benchmarks/vision/classification_and_detection/python/models/__init__.py b/retired_benchmarks/vision/classification_and_detection/python/models/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/retired_benchmarks/vision/classification_and_detection/tools/__init__.py b/retired_benchmarks/vision/classification_and_detection/tools/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/retired_benchmarks/vision/medical_imaging/3d-unet-brats19/__init__.py b/retired_benchmarks/vision/medical_imaging/3d-unet-brats19/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/retired_benchmarks/vision/medical_imaging/__init__.py b/retired_benchmarks/vision/medical_imaging/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/speech2text/__init__.py b/speech2text/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/speech2text/utils/__init__.py b/speech2text/utils/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/text_to_image/__init__.py b/text_to_image/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/text_to_image/tools/__init__.py b/text_to_image/tools/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/text_to_image/tools/clip/__init__.py b/text_to_image/tools/clip/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/text_to_image/tools/fid/__init__.py b/text_to_image/tools/fid/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/text_to_video/__init__.py b/text_to_video/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/text_to_video/wan-2.2-t2v-a14b/__init__.py b/text_to_video/wan-2.2-t2v-a14b/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/tools/__init__.py b/tools/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/tools/submission/__init__.py b/tools/submission/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/tools/submission/power/__init__.py b/tools/submission/power/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/tools/submission/submission_checker/checks/power/__init__.py b/tools/submission/submission_checker/checks/power/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/tools/upscale_coco/__init__.py b/tools/upscale_coco/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/vision/__init__.py b/vision/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/vision/classification_and_detection/__init__.py b/vision/classification_and_detection/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/vision/classification_and_detection/python/models/__init__.py b/vision/classification_and_detection/python/models/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/vision/classification_and_detection/tools/__init__.py b/vision/classification_and_detection/tools/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/vision/classification_and_detection/yolo/__init__.py b/vision/classification_and_detection/yolo/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/vision/medical_imaging/3d-unet-kits19/__init__.py b/vision/medical_imaging/3d-unet-kits19/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/vision/medical_imaging/__init__.py b/vision/medical_imaging/__init__.py
new file mode 100644
index 0000000000..e69de29bb2