From ba7e571e82cd7d879aec7a94bd7df48d8f18cb5c Mon Sep 17 00:00:00 2001
From: Gunnar Kreitz <gkreitz@kattis.com>
Date: Fri, 24 Apr 2026 17:32:19 +0200
Subject: [PATCH 01/13] Add mildly cleaned up grader to judge/

---
 problemtools/judge/grade.py | 68 +++++++++++++++++++++++++++++++++++++
 1 file changed, 68 insertions(+)
 create mode 100644 problemtools/judge/grade.py

diff --git a/problemtools/judge/grade.py b/problemtools/judge/grade.py
new file mode 100644
index 00000000..6a62c6fb
--- /dev/null
+++ b/problemtools/judge/grade.py
@@ -0,0 +1,68 @@
+from __future__ import annotations
+
+import os
+import re
+import tempfile
+from pathlib import Path
+from typing import cast
+
+from ..diagnostics import Diagnostics
+from ..run import Program
+from .result import SubmissionResult, Verdict
+
+_GRADER_OUTPUT_RE = re.compile(r'^((AC)|(WA)|(TLE)|(RTE)|(JE))\s+-?[0-9.]+\s*$')
+
+
+def grade_group(
+    sub_results: list[SubmissionResult],
+    grader: Program,
+    grader_flags: list[str],
+    base_dir: Path,
+    diag: Diagnostics,
+) -> tuple[Verdict, float | None]:
+    """Run grader on sub_results and return (verdict, score).
+
+    Returns ('AC', 0.0) immediately if sub_results is empty.
+    Returns ('JE', None) on any grader error.
+    """
+    if not sub_results:
+        return ('AC', 0.0)
+
+    if not grader.compile()[0]:
+        diag.error(f'Failed to compile grader {grader}')
+        return ('JE', None)
+
+    grader_input = ''.join(f'{r.verdict} {0 if r.score is None else r.score}\n' for r in sub_results)
+    diag.debug(f'Grading {len(sub_results)} results:\n{grader_input}')
+    diag.debug(f'Grader flags: {grader_flags}')
+
+    with tempfile.TemporaryDirectory(dir=base_dir) as tmpdir:
+        infile = Path(tmpdir) / 'grader_in'
+        outfile = Path(tmpdir) / 'grader_out'
+        errfile = Path(tmpdir) / 'grader_err'
+        infile.write_text(grader_input)
+
+        status, _runtime = grader.run(str(infile), str(outfile), str(errfile), args=grader_flags)
+
+        grader_output = outfile.read_text(errors='replace') if outfile.exists() else ''
+        stderr_content = errfile.read_text(errors='replace') if errfile.exists() else ''
+
+    if not os.WIFEXITED(status) or os.WEXITSTATUS(status) != 0:
+        if not os.WIFEXITED(status):
+            diag.error(f'Judge error: {grader} crashed')
+        else:
+            diag.error(f'Judge error: exit code {os.WEXITSTATUS(status)} for grader {grader}, expected 0')
+        if stderr_content:
+            diag.error(f'Grader stderr:\n{stderr_content}')
+        diag.debug(f'Grader input:\n{grader_input}')
+        return ('JE', None)
+
+    if not _GRADER_OUTPUT_RE.match(grader_output):
+        diag.error('Judge error: invalid format of grader output')
+        diag.debug(f'Output must match: "{_GRADER_OUTPUT_RE.pattern}"')
+        diag.debug(f'Output was: "{grader_output}"')
+        return ('JE', None)
+
+    verdict_str, score_str = grader_output.split()
+    diag.debug(f'Grader result: {verdict_str} ({score_str})')
+    return cast(Verdict, verdict_str), float(score_str)

From 2a2f0382dc9ffe265659177b64e38761452eb853 Mon Sep 17 00:00:00 2001
From: Gunnar Kreitz <gkreitz@kattis.com>
Date: Fri, 24 Apr 2026 19:50:18 +0200
Subject: [PATCH 02/13] Add new implementation of judging

---
 problemtools/judge/__init__.py         |   6 +-
 problemtools/judge/cache.py            | 113 ++++++++++++++
 problemtools/judge/result.py           |   3 +-
 problemtools/judge/submission_judge.py | 201 +++++++++++++++++++++++++
 problemtools/verifyproblem.py          |  15 +-
 5 files changed, 333 insertions(+), 5 deletions(-)
 create mode 100644 problemtools/judge/cache.py
 create mode 100644 problemtools/judge/submission_judge.py

diff --git a/problemtools/judge/__init__.py b/problemtools/judge/__init__.py
index 10972ded..6285a067 100644
--- a/problemtools/judge/__init__.py
+++ b/problemtools/judge/__init__.py
@@ -1,17 +1,19 @@
+from .cache import CacheKey
 from .execute import execute_testcase
 from .result import (
     SubmissionResult,
     TimeLimits,
     Verdict,
-    classify_result,
 )
+from .submission_judge import SubmissionJudge
 from .validate import validate_output
 
 __all__ = [
+    'CacheKey',
+    'SubmissionJudge',
     'SubmissionResult',
     'TimeLimits',
     'Verdict',
-    'classify_result',
     'execute_testcase',
     'validate_output',
 ]
diff --git a/problemtools/judge/cache.py b/problemtools/judge/cache.py
new file mode 100644
index 00000000..ea24bc1a
--- /dev/null
+++ b/problemtools/judge/cache.py
@@ -0,0 +1,113 @@
+from __future__ import annotations
+
+import copy
+from concurrent.futures import Future
+from dataclasses import dataclass
+from threading import Lock
+from typing import TYPE_CHECKING
+
+from .result import SubmissionResult
+
+if TYPE_CHECKING:
+    from ..verifyproblem import TestCase
+
+
+@dataclass(frozen=True)
+class CacheKey:
+    input_hash: bytes
+    ans_hash: bytes
+    validator_flags: tuple[str, ...]
+
+
+@dataclass
+class _CacheEntry:
+    result: SubmissionResult
+    run_timelim: float
+
+
+def _reclassify(result: SubmissionResult, timelim: float) -> SubmissionResult:
+    """Reclassify a cached result against a (possibly lower) time limit."""
+    if result.runtime > timelim:
+        if result.validator_first and result.verdict == 'WA':
+            # Interactive: validator exited first with WA. This can cause the submission to run
+            # longer than it should. Cap runtimes at timelim so this doesn't inflate the time limit.
+            wa = copy.copy(result)
+            wa.runtime = timelim
+            return wa
+        tle = SubmissionResult('TLE')
+        tle.runtime = result.runtime
+        return tle
+    return result
+
+
+def _with_test_item(result: SubmissionResult, testcase: TestCase) -> SubmissionResult:
+    """Return result with test_item set to testcase, copying only if needed."""
+    if result.test_item is testcase:
+        return result
+    result = copy.copy(result)
+    result.test_item = testcase
+    return result
+
+
+class ResultStore:
+    """Thread-safe store mapping testcase reuse keys to execution results.
+
+    Background workers populate the store via claim()/complete(); the consumer
+    reads results via get().  A key progresses through three states: absent
+    (not yet claimed), in-flight (claimed, Future not yet resolved), and
+    completed (_CacheEntry).
+
+    Because results are always run at the high time limit, a completed entry
+    can serve any query whose time limit is <= the run limit: a result whose
+    runtime exceeds the query limit is reclassified as TLE. A query with a
+    higher limit than the run limit cannot be served from cache and returns None.
+    """
+
+    def __init__(self) -> None:
+        self._lock = Lock()
+        self._store: dict[CacheKey, Future[SubmissionResult] | _CacheEntry] = {}
+
+    def claim(self, testcase: TestCase) -> bool:
+        """Atomically claim testcase for execution.
+
+        Returns True if the key was unclaimed; the caller must eventually call
+        complete().  Returns False if the key is already in-flight or completed.
+        """
+        key = testcase.reuse_key
+        with self._lock:
+            if key in self._store:
+                return False
+            self._store[key] = Future()
+            return True
+
+    def complete(self, testcase: TestCase, result: SubmissionResult, run_timelim: float) -> None:
+        """Store the completed result and wake any consumer waiting on the future."""
+        key = testcase.reuse_key
+        with self._lock:
+            future = self._store[key]
+            self._store[key] = _CacheEntry(result=result, run_timelim=run_timelim)
+        assert isinstance(future, Future)
+        future.set_result(result)  # outside lock — callbacks may acquire other locks
+
+    def get(self, testcase: TestCase, timelim: float) -> SubmissionResult | Future[SubmissionResult] | None:
+        """Look up a result for testcase at timelim.
+
+        Returns:
+            SubmissionResult  — completed result, already reclassified for timelim; use directly.
+            Future            — in-flight; resolves to a reclassified SubmissionResult.
+            None              — not present, or was run at a lower limit than timelim and
+                                cannot be reused; caller must run the testcase synchronously.
+        """
+        key = testcase.reuse_key
+        with self._lock:
+            val = self._store.get(key)
+        if val is None:
+            return None
+        if isinstance(val, Future):
+            chained: Future[SubmissionResult] = Future()
+            val.add_done_callback(lambda f: chained.set_result(_with_test_item(_reclassify(f.result(), timelim), testcase)))
+            return chained
+        if timelim > val.run_timelim:
+            # Entry was produced at a lower limit; cannot safely reclassify upward.
+            return None
+        return _with_test_item(_reclassify(val.result, timelim), testcase)
diff --git a/problemtools/judge/result.py b/problemtools/judge/result.py
index 7a2530c2..2eb0fdd8 100644
--- a/problemtools/judge/result.py
+++ b/problemtools/judge/result.py
@@ -4,7 +4,7 @@
 from typing import TYPE_CHECKING, Literal
 
 if TYPE_CHECKING:
-    from ..verifyproblem import TestCase
+    from ..verifyproblem import TestCase, TestCaseGroup
 
 Verdict = Literal['AC', 'TLE', 'OLE', 'MLE', 'RTE', 'WA', 'PAC', 'JE']
 
@@ -22,6 +22,7 @@ def __init__(
         self.reason = reason
         self.additional_info = additional_info
         self.testcase: TestCase | None = None
+        self.test_item: TestCase | TestCaseGroup | None = None
         self.runtime_testcase: TestCase | None = None
         self.runtime = -1.0
         self.ac_runtime = -1.0
diff --git a/problemtools/judge/submission_judge.py b/problemtools/judge/submission_judge.py
new file mode 100644
index 00000000..468791e6
--- /dev/null
+++ b/problemtools/judge/submission_judge.py
@@ -0,0 +1,201 @@
+from __future__ import annotations
+
+import copy
+from concurrent.futures import Future
+from pathlib import Path
+from threading import Lock
+from typing import TYPE_CHECKING
+
+from ..context import Context
+from ..diagnostics import Diagnostics
+from ..metadata import Metadata
+from ..run import Program, get_tool
+from .cache import ResultStore
+from .execute import execute_testcase
+from .grade import grade_group
+from .result import SubmissionResult, TimeLimits
+
+if TYPE_CHECKING:
+    from ..verifyproblem import TestCase, TestCaseGroup
+
+
+class _Cancelled:
+    """Thread-safe set of cancelled testcase identities (by Path to infile)."""
+
+    def __init__(self) -> None:
+        self._lock = Lock()
+        self._ids: set[Path] = set()
+
+    def __contains__(self, testcase: TestCase) -> bool:
+        with self._lock:
+            return testcase.infile_path in self._ids
+
+    def add(self, testcase: TestCase) -> None:
+        with self._lock:
+            self._ids.add(testcase.infile_path)
+
+
+class SubmissionJudge:
+    """Run a submission against a test case group tree and collect results.
+
+    The typical flow uses two phases:
+
+      1. precompute(timelim) — submits all filtered testcases as background jobs that
+         execute the submission and populate a result cache.  Returns immediately.
+      2. judge(timelim) — walks the test tree in DFS order, consuming cached results
+         (blocking on any still in-flight) or running synchronously if a worker missed
+         a testcase.  Returns a flat list of SubmissionResults, one per testcase plus
+         one aggregate per group, with the root group's result last.
+
+    This lets the submission run on all testcases in parallel while the consumer
+    processes results in order for grading and early-exit logic.
+
+    When an on_reject:break group encounters a non-AC result, pending (not-yet-started)
+    background jobs for the remaining testcases in that subtree are skipped.  In-flight
+    jobs complete normally; their results are simply not consumed by judge().
+    """
+
+    _default_grader: Program | None = get_tool('default_grader')
+
+    def __init__(
+        self,
+        sub: Program,
+        output_validator: Program,
+        metadata: Metadata,
+        root: TestCaseGroup,
+        base_dir: Path,
+        context: Context,
+        diag: Diagnostics,
+        custom_grader: Program | None = None,
+    ) -> None:
+        self._sub = sub
+        self._output_validator = output_validator
+        self._metadata = metadata
+        self._base_dir = base_dir
+        self._context = context
+        self._diag = diag
+        self._custom_grader = custom_grader
+        self._store = ResultStore()
+        self._root = root
+        self._cancelled = _Cancelled()
+        self._precompute_started = False
+
+    def precompute(self, timelimits: TimeLimits) -> None:
+        """Submit all filtered testcases as background jobs.
+
+        Returns immediately; workers run concurrently and deposit results into the
+        cache as they finish.  Call judge() afterwards to consume results in DFS order.
+        May be called at most once.
+        """
+        assert not self._precompute_started, 'precompute() called more than once'
+        self._precompute_started = True
+        filtered_testcases = (item for item in self._root.get_all_testcases() if item.matches_filter(self._context.data_filter))
+        for testcase in filtered_testcases:
+            self._context.submit_background_work(self._populate_cache_for_testcase, testcase, timelimits)
+
+    def judge(self, timelim: float) -> list[SubmissionResult]:
+        """Walk the test tree in DFS order and return results as a flat list.
+
+        Each SubmissionResult has test_item set to the TestCase or TestCaseGroup it
+        covers.  Group results immediately follow all their descendants; the root
+        group's result is the last element.  Returns an empty list if all testcases
+        were filtered out.
+
+        Blocks on any cache entry still being computed by a precompute() worker.
+        Testcases not yet claimed by a worker are run synchronously.  Safe to call
+        multiple times with different timelim values; subsequent calls almost always
+        hit the cache without new work.  When querying multiple time limits, call
+        with the largest first so that cached results can be reused for smaller limits.
+        """
+        return self._judge_group(self._root, timelim)
+
+    def _run(self, testcase: TestCase, timelim: float) -> SubmissionResult:
+        flat = TimeLimits(nominal=timelim, low=timelim, high=timelim)
+        _, _, raw = execute_testcase(
+            testcase,
+            self._sub,
+            self._output_validator,
+            self._metadata,
+            flat,
+            self._base_dir,
+            self._diag,
+        )
+        return raw
+
+    def _populate_cache_for_testcase(self, testcase: TestCase, timelimits: TimeLimits) -> None:
+        if testcase in self._cancelled:
+            return
+        if not self._store.claim(testcase):
+            return  # duplicate testcase (same reuse_key) or already in store
+        self._store.complete(testcase, self._run(testcase, timelimits.high), timelimits.high)
+
+    def _judge_testcase(self, testcase: TestCase, timelim: float) -> SubmissionResult:
+        val = self._store.get(testcase, timelim)
+        if isinstance(val, Future):
+            return val.result()  # block until worker finishes
+        if val is not None:
+            return val
+        # Synchronous fallback: worker hasn't claimed this testcase yet, or second
+        # judge() call with a timelim the store can't serve.  Claim so any pending
+        # worker for it bails out rather than duplicating work.
+        claimed = self._store.claim(testcase)
+        result = self._run(testcase, timelim)
+        if claimed:
+            self._store.complete(testcase, result, timelim)
+        return result
+
+    def _cancel_subtree(self, group: TestCaseGroup) -> None:
+        for testcase in group.get_all_testcases():
+            self._cancelled.add(testcase)
+
+    def _grader_for(self, group: TestCaseGroup) -> Program | None:
+        if group.config.get('grading') == 'custom':
+            return self._custom_grader
+        return self._default_grader
+
+    def _judge_group(self, group: TestCaseGroup, timelim: float) -> list[SubmissionResult]:
+        all_results: list[SubmissionResult] = []  # Results of all children, groups and test cases, in DFS order. Our return value
+        child_results: list[SubmissionResult] = []  # Results of our direct children, what we'll pass to the grader
+
+        filtered_items = (item for item in group._items if item.matches_filter(self._context.data_filter))
+        for item in filtered_items:
+            if item.is_group:
+                sub = self._judge_group(item, timelim)
+                if not sub:  # If everything in a group is filtered, it returns an empty list.
+                    continue
+                all_results.extend(sub)
+                result = sub[-1]  # last element is the subgroup's own result
+            else:
+                result = self._judge_testcase(item, timelim)
+                # Apply default score here - after we've entered it into the cache, as it may also be present in other groups with different defaults
+                if result.score is None:
+                    result = copy.copy(result)
+                    if result.verdict == 'AC':
+                        result.score = group.config['accept_score']
+                    else:
+                        result.score = group.config['reject_score']
+                all_results.append(result)
+
+            child_results.append(result)
+            if result.verdict != 'AC' and group.config.get('on_reject') == 'break':
+                self._cancel_subtree(group)  # Stop starting more precomputations for submissions in this group or below
+                break
+
+        if not all_results:  # All our children were filtered
+            return []
+
+        judge_error = next((r for r in child_results if r.verdict == 'JE'), None)
+        if judge_error:
+            group_verdict = copy.copy(judge_error)
+        else:
+            grader = self._grader_for(group)
+            if grader is None:
+                group_verdict = SubmissionResult('JE', reason='grader not found')
+            else:
+                grader_flags = group.config.get('grader_flags', '').split()
+                verdict, score = grade_group(child_results, grader, grader_flags, self._base_dir, self._diag)
+                group_verdict = SubmissionResult(verdict, score=score)
+
+        group_verdict.test_item = group
+        all_results.append(group_verdict)
+        return all_results
diff --git a/problemtools/verifyproblem.py b/problemtools/verifyproblem.py
index 4b3d1fdb..69e7d4fa 100644
--- a/problemtools/verifyproblem.py
+++ b/problemtools/verifyproblem.py
@@ -35,11 +35,12 @@
 from .context import Context, PROBLEM_PARTS
 from .diagnostics import Diagnostics, LoggingDiagnostics, VerifyError
 from .formatversion import FormatVersion, get_format_version
-from .judge import SubmissionResult, Verdict, TimeLimits, validate_output, execute_testcase
+from .judge import CacheKey, SubmissionResult, Verdict, TimeLimits, validate_output, execute_testcase
 from .version import add_version_arg
 
 from abc import ABC
-from typing import Any, Callable, ClassVar, Pattern, Match, ParamSpec, TypeVar, cast
+from functools import cached_property
+from typing import Any, Callable, ClassVar, Literal, Pattern, Match, ParamSpec, TypeVar, cast
 from pydantic import ValidationError
 
 random.seed(42)
@@ -129,6 +130,7 @@ def check(self, context: Context) -> bool:
 
 class TestCase(ProblemAspect):
     Result = tuple[SubmissionResult, SubmissionResult, SubmissionResult]
+    is_group: Literal[False] = False  # Temporary workaround for a circular import in judge/submission_judge.py
 
     def __init__(self, problem: Problem, base: str, testcasegroup: TestCaseGroup) -> None:
         super().__init__(f'test.{testcasegroup.name}.{os.path.basename(base)}', problem)
@@ -182,6 +184,14 @@ def output_validator_flags(self) -> list[str]:
             + self.testcasegroup.config.get('output_validator_flags', '').split()
         )
 
+    @cached_property
+    def reuse_key(self) -> CacheKey:
+        return CacheKey(
+            input_hash=hashlib.sha256(self.infile_path.read_bytes()).digest(),
+            ans_hash=hashlib.sha256(self.ansfile_path.read_bytes()).digest(),
+            validator_flags=tuple(self.output_validator_flags),
+        )
+
     def is_in_sample_group(self) -> bool:
         return self.strip_path_prefix(self.infile).startswith('sample')
 
@@ -304,6 +314,7 @@ class TestCaseGroup(ProblemAspect):
     name: str
     _DEFAULT_CONFIG = config.load_config('testdata.yaml')
     _SCORING_ONLY_KEYS = ['accept_score', 'reject_score', 'range']
+    is_group: Literal[True] = True  # Temporary workaround for a circular import in judge/submission_judge.py
 
     def __init__(self, problem: Problem, datadir: str | None = None, parent: TestCaseGroup | None = None):
         self._parent = parent

From 0a6b8cae01ec8e8611458c2430870f3cfb0221ab Mon Sep 17 00:00:00 2001
From: Gunnar Kreitz <gkreitz@kattis.com>
Date: Fri, 24 Apr 2026 20:14:10 +0200
Subject: [PATCH 03/13] Clean up old unused methods all_datasets()

---
 problemtools/verifyproblem.py | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/problemtools/verifyproblem.py b/problemtools/verifyproblem.py
index 69e7d4fa..94ce7b8d 100644
--- a/problemtools/verifyproblem.py
+++ b/problemtools/verifyproblem.py
@@ -306,9 +306,6 @@ def _init_result_for_testcase(self, res: SubmissionResult) -> SubmissionResult:
     def get_all_testcases(self) -> list[TestCase]:
         return [self]
 
-    def all_datasets(self) -> list[str]:
-        return [self._base]
-
 
 class TestCaseGroup(ProblemAspect):
     name: str
@@ -635,12 +632,6 @@ def aggregate_results(self, sub, sub_results: list[SubmissionResult], shadow_res
                     )
         return res
 
-    def all_datasets(self) -> list:
-        res: list = []
-        for child in self._items:
-            res += child.all_datasets()
-        return res
-
 
 class ProblemStatement(ProblemPart):
     statements: dict[str, list[Path]]  # Maps language code -> statement(s)

From ee88f2155cbf59ac15570c3a1a256d6266ee63cd Mon Sep 17 00:00:00 2001
From: Gunnar Kreitz <gkreitz@kattis.com>
Date: Mon, 27 Apr 2026 11:04:05 +0200
Subject: [PATCH 04/13] Plug in new SubmissionJudge to run submissions
 (temporarily breaking some of the output)

---
 problemtools/judge/submission_judge.py |   1 +
 problemtools/verifyproblem.py          | 101 ++++++++++++++++++-------
 2 files changed, 74 insertions(+), 28 deletions(-)

diff --git a/problemtools/judge/submission_judge.py b/problemtools/judge/submission_judge.py
index 468791e6..6fe69e27 100644
--- a/problemtools/judge/submission_judge.py
+++ b/problemtools/judge/submission_judge.py
@@ -195,6 +195,7 @@ def _judge_group(self, group: TestCaseGroup, timelim: float) -> list[SubmissionR
                 grader_flags = group.config.get('grader_flags', '').split()
                 verdict, score = grade_group(child_results, grader, grader_flags, self._base_dir, self._diag)
                 group_verdict = SubmissionResult(verdict, score=score)
+                group_verdict.runtime = max(v.runtime for v in child_results)
 
         group_verdict.test_item = group
         all_results.append(group_verdict)
diff --git a/problemtools/verifyproblem.py b/problemtools/verifyproblem.py
index 94ce7b8d..507d85a9 100644
--- a/problemtools/verifyproblem.py
+++ b/problemtools/verifyproblem.py
@@ -35,7 +35,7 @@
 from .context import Context, PROBLEM_PARTS
 from .diagnostics import Diagnostics, LoggingDiagnostics, VerifyError
 from .formatversion import FormatVersion, get_format_version
-from .judge import CacheKey, SubmissionResult, Verdict, TimeLimits, validate_output, execute_testcase
+from .judge import CacheKey, SubmissionJudge, SubmissionResult, Verdict, TimeLimits, validate_output, execute_testcase
 from .version import add_version_arg
 
 from abc import ABC
@@ -419,6 +419,17 @@ def get_score_range(self) -> tuple[float, float]:
         except Exception:
             return (float('-inf'), float('inf'))
 
+    def check_score_in_bounds(self, sub: run.Program, score: float) -> None:
+        # Don't warn twice on the same subgroup, since every submission is likely
+        # to have the same error.
+        min_score, max_score = self.get_score_range()
+        if not (min_score <= score <= max_score) and not self._seen_oob_scores:
+            self._seen_oob_scores = True
+            groupname = os.path.relpath(self._datadir, self._problem.probdir)
+            self.error(
+                f'submission {sub} got score {score} on group {groupname}, which is outside of expected score range [{min_score}, {max_score}]'
+            )
+
     def check(self, context: Context) -> bool:
         if self._check_res is not None:
             return self._check_res
@@ -1423,42 +1434,76 @@ def check_submission(
         self, sub, context: Context, expected_verdict: Verdict, timelim: float, timelim_high: float
     ) -> SubmissionResult:
         desc = f'{expected_verdict} submission {sub}'
-        partial = False
-        if expected_verdict == 'PAC':
-            expected_verdict = 'AC'
-            partial = True
-            # For partially accepted, we don't want to use them to lower bound the time limit, but we do want
-            # to warn if they're slow enough that they would have affected the time limit, had they been used
-            # to compute it.
-            timelim_low = timelim / self.problem.metadata.limits.time_multipliers.ac_to_time_limit
-        else:
-            timelim_low = timelim
-
-        with Runner(self.problem, sub, context, timelim, timelim_low, timelim_high) as runner:
-            result, result_low, result_high = self.problem.testdata.run_submission(sub, runner, context)
+        partial = expected_verdict == 'PAC'
 
-        if result.verdict == 'AC' and expected_verdict == 'AC' and not partial and result.sample_failures:
-            res = result.sample_failures[0]
-            self.warning(f'{desc} got {res.verdict} on sample: {res}')
-
-        if result_low.verdict != result_high.verdict or result_low.score != result_high.score:
-            r1, r2 = (
-                (result_low, result_high)
-                if result_low.verdict == result_high.verdict
-                else (result_low.verdict, result_high.verdict)
+        judge = SubmissionJudge(
+            sub=sub,
+            output_validator=self.problem.output_validators.output_validator,
+            metadata=self.problem.metadata,
+            root=self.problem.testdata,
+            base_dir=Path(self.problem.tmpdir),
+            context=context,
+            diag=self._diag,
+            custom_grader=self.problem.graders._grader,
+        )
+        if context.executor is not None:
+            judge.precompute(TimeLimits(nominal=timelim_high, low=timelim_high, high=timelim_high))
+        results_high = judge.judge(timelim_high)
+        if not results_high:  # TODO: We should move this check further out to avoid needing to fake a result here
+            self.info('Found no test cases to run on. Did you filter them all out?')
+            return SubmissionResult(expected_verdict)
+        result_high = results_high[-1]
+
+        results = judge.judge(timelim)
+        result = results[-1]
+
+        # Check if scores were outside of the range for any groups
+        if self.problem.is_scoring():
+            for r in results:
+                if r.score is not None and isinstance(r.test_item, TestCaseGroup):
+                    r.test_item.check_score_in_bounds(sub, r.score)
+
+        # Warn if AC (but not PAC) submissions fail on samples. It's not uncommon for sample cases to be
+        # ignored, so failing on them could be silent otherwise. Skip warning if the result isn't AC -
+        # then something worse has gone wrong, and we'll error later.
+        if expected_verdict == 'AC' and result.verdict == 'AC':
+            sample_failure = next(
+                (
+                    r
+                    for r in results
+                    if r.verdict != 'AC' and isinstance(r.test_item, TestCase) and r.test_item.is_in_sample_group()
+                ),
+                None,
             )
+            if sample_failure:
+                self.warning(f'{desc} got {sample_failure.verdict} on sample: {sample_failure}')
+
+        # Warn if a PAC submission would affect time limit, had it been use to compute the time limit. Only do this
+        # if it gets AC on the computed time limit, otherwise we have other warnings below.
+        if partial and result.verdict == 'AC':
+            runtime_without_affecting_tl = timelim / self.problem.metadata.limits.time_multipliers.ac_to_time_limit
+            result_without_affecting_tl = judge.judge(runtime_without_affecting_tl)[-1]
+            if result_without_affecting_tl.verdict != 'AC':
+                timelimits_to_test = set(r.runtime for r in results if r.runtime > runtime_without_affecting_tl)
+                for t in sorted(timelimits_to_test):
+                    r = judge.judge(t)[-1]
+                    if r.verdict == 'AC':
+                        self.warning(f'{desc} is slower than all AC submissions. It needs {t:.2f}s to get AC')
+
+        if result.verdict != result_high.verdict or result.score != result_high.score:
             self.warning(
-                f'{desc} sensitive to time limit: limit of {timelim_low} secs -> {r1}, limit of {timelim_high} secs -> {r2}'
+                f'{desc} sensitive to time limit: limit of {timelim} secs -> {result}, limit of {timelim_high} secs -> {result_high}'
             )
 
+        required_verdict: Verdict = 'AC' if partial else expected_verdict
         if partial and self.fully_accepted(result):
-            self.warning(f'{desc} got {result}')
-        elif result.verdict == expected_verdict:
+            self.warning(f'{desc} was fully accepted: {result}')
+        elif result.verdict == required_verdict:
             self.msg(f'   {desc} OK: {result}')
-            if expected_verdict == 'AC' and not partial and not self.fully_accepted(result) and self.full_score_finite():
+            if not partial and required_verdict == 'AC' and not self.fully_accepted(result) and self.full_score_finite():
                 # For some heuristic problems, this is expected. Thus, only warn.
                 self.warning(f'{desc} did not attain full score (consider moving it to partially_accepted)')
-        elif result_high.verdict == expected_verdict and not (partial and self.fully_accepted(result_high)):
+        elif result_high.verdict == required_verdict and not (partial and self.fully_accepted(result_high)):
             self.msg(f'   {desc} OK with extra time: {result_high}')
         else:
             self.error(f'{desc} got {result}', result_high.additional_info)

From d1ea37d93d0b1d25abc242f48947d81d58657981 Mon Sep 17 00:00:00 2001
From: Gunnar Kreitz <gkreitz@kattis.com>
Date: Mon, 27 Apr 2026 11:38:05 +0200
Subject: [PATCH 05/13] Rip out old code for running submissions from
 verifyproblem (relaced by problemtools.judge)

---
 problemtools/verifyproblem.py | 353 +---------------------------------
 1 file changed, 2 insertions(+), 351 deletions(-)

diff --git a/problemtools/verifyproblem.py b/problemtools/verifyproblem.py
index 507d85a9..806738a1 100644
--- a/problemtools/verifyproblem.py
+++ b/problemtools/verifyproblem.py
@@ -4,8 +4,6 @@
 
 import argparse
 import math
-import threading
-import queue
 import glob
 import string
 import hashlib
@@ -16,7 +14,6 @@
 import logging
 import tempfile
 import sys
-import copy
 import random
 import traceback
 import uuid
@@ -35,12 +32,12 @@
 from .context import Context, PROBLEM_PARTS
 from .diagnostics import Diagnostics, LoggingDiagnostics, VerifyError
 from .formatversion import FormatVersion, get_format_version
-from .judge import CacheKey, SubmissionJudge, SubmissionResult, Verdict, TimeLimits, validate_output, execute_testcase
+from .judge import CacheKey, SubmissionJudge, SubmissionResult, Verdict, TimeLimits, validate_output
 from .version import add_version_arg
 
 from abc import ABC
 from functools import cached_property
-from typing import Any, Callable, ClassVar, Literal, Pattern, Match, ParamSpec, TypeVar, cast
+from typing import Any, Callable, ClassVar, Literal, Pattern, Match, ParamSpec, TypeVar
 from pydantic import ValidationError
 
 random.seed(42)
@@ -139,7 +136,6 @@ def __init__(self, problem: Problem, base: str, testcasegroup: TestCaseGroup) ->
         self.ansfile = f'{base}.ans'
         self._problem = problem
         self.testcasegroup = testcasegroup
-        self.reuse_result_from: TestCase | None = None
         self.counter = len(problem.testcase_by_infile)
         problem.testcase_by_infile[self.infile] = self
 
@@ -228,7 +224,6 @@ def check(self, context: Context) -> bool:
                     self.error(f'judge answer file got {val_res} on testcase {self.strip_path_prefix(self.ansfile)}')
                 else:
                     self.warning(f'judge answer file got {val_res} on testcase {self.strip_path_prefix(self.ansfile)}')
-        self._check_symlinks()
         return self._check_res
 
     def __str__(self) -> str:
@@ -237,72 +232,6 @@ def __str__(self) -> str:
     def matches_filter(self, filter_re: Pattern[str]) -> bool:
         return filter_re.search(self.strip_path_prefix(self._base)) is not None
 
-    def set_symlinks(self) -> None:
-        if not os.path.islink(self.infile):
-            return
-        target = os.path.realpath(self.infile)
-        if target in self._problem.testcase_by_infile:
-            self.reuse_result_from = self._problem.testcase_by_infile[target]
-
-    def _check_symlinks(self) -> bool:
-        if not os.path.islink(self.infile):
-            return True
-        nicepath = os.path.relpath(self.infile, self._problem.probdir)
-        in_target = os.path.realpath(self.infile)
-        ans_target = os.path.realpath(self.ansfile)
-        if not in_target.endswith('.in'):
-            self.error(f"Symbolic link does not point to a .in file for input '{nicepath}'")
-            return False
-        if ans_target != f'{in_target[:-3]}.ans':
-            self.error(f"Symbolic link '{nicepath}' must have a corresponding link for answer file")
-            return False
-        if self.reuse_result_from is None:
-            self.error(f"Symbolic link points outside data/ directory for file '{nicepath}'")
-            return False
-        if (
-            self.testcasegroup.config['output_validator_flags']
-            != self.reuse_result_from.testcasegroup.config['output_validator_flags']
-        ):
-            self.error(f"Symbolic link '{nicepath}' points to testcase with different output validator flags")
-            return False
-        return True
-
-    def run_submission(self, sub, runner: Runner, context: Context) -> Result:
-        (res, res_low, res_high), reused = runner.run(self)
-        res = self._init_result_for_testcase(res)
-        res_low = self._init_result_for_testcase(res_low)
-        res_high = self._init_result_for_testcase(res_high)
-        msg = 'Reused test file result' if reused else 'Test file result'
-        self.info(f'{msg}: {res}')
-        if res.verdict != 'AC' and self.is_in_sample_group():
-            res.sample_failures.append(res)
-
-        return (res, res_low, res_high)
-
-    def run_submission_real(self, sub, context: Context, timelim: float, timelim_low: float, timelim_high: float) -> Result:
-        # This may be called off-main thread.
-        timelimits = TimeLimits(nominal=timelim, low=timelim_low, high=timelim_high)
-        return execute_testcase(
-            testcase=self,
-            sub=sub,
-            output_validator=self._problem.output_validators.output_validator,
-            metadata=self._problem.metadata,
-            timelimits=timelimits,
-            base_dir=Path(self.problem.tmpdir),
-            diag=self._diag,
-        )
-
-    def _init_result_for_testcase(self, res: SubmissionResult) -> SubmissionResult:
-        res = copy.copy(res)
-        res.testcase = self
-        res.runtime_testcase = self
-        if res.score is None:
-            if res.verdict == 'AC':
-                res.score = self.testcasegroup.config['accept_score']
-            else:
-                res.score = self.testcasegroup.config['reject_score']
-        return res
-
     def get_all_testcases(self) -> list[TestCase]:
         return [self]
 
@@ -374,19 +303,12 @@ def __init__(self, problem: Problem, datadir: str | None = None, parent: TestCas
                     if ext == '.ans' and os.path.isfile(f'{base}.in'):
                         self._items.append(TestCase(problem, base, self))
 
-        if not parent:
-            self.set_symlinks()
-
     def start_background_work(self, context: Context) -> None:
         pass
 
     def __str__(self) -> str:
         return f'testcase group {self.name}'
 
-    def set_symlinks(self) -> None:
-        for sub in self._items:
-            sub.set_symlinks()
-
     def matches_filter(self, filter_re: Pattern[str]) -> bool:
         return True
 
@@ -402,12 +324,6 @@ def get_testcases(self) -> list[TestCase]:
     def get_subgroups(self) -> list[TestCaseGroup]:
         return [child for child in self._items if isinstance(child, TestCaseGroup)]
 
-    def get_subgroup(self, name: str) -> TestCaseGroup | None:
-        return next(
-            (child for child in self._items if isinstance(child, TestCaseGroup) and os.path.basename(child._datadir) == name),
-            None,
-        )
-
     def has_custom_groups(self) -> bool:
         return any(group.get_subgroups() for group in self.get_subgroups())
 
@@ -575,74 +491,6 @@ def parse_num(s: str, i: int) -> tuple[int, int]:
 
         return self._check_res
 
-    def run_submission(self, sub, runner: Runner, context: Context) -> TestCase.Result:
-        self.info(f'Running on {self}')
-        subres: list[SubmissionResult] = []
-        subres_low: list[SubmissionResult] = []
-        subres_high: list[SubmissionResult] = []
-        active_low, active = True, True
-        on_reject = self.config['on_reject']
-        broken = False
-        for child in self._items:
-            if not child.matches_filter(context.data_filter):
-                continue
-            res, res_low, res_high = child.run_submission(sub, runner, context)
-            subres_high.append(res_high)
-            if active:
-                subres.append(res)
-            if active_low:
-                subres_low.append(res_low)
-            if on_reject == 'break':
-                active_low &= res_low.verdict == 'AC'
-                active &= res.verdict == 'AC'
-                if res_high.verdict != 'AC':
-                    broken = True
-                    break
-
-        runner.mark_group_done(self, broken)
-
-        return (
-            self.aggregate_results(sub, subres),
-            self.aggregate_results(sub, subres_low, shadow_result=True),
-            self.aggregate_results(sub, subres_high, shadow_result=True),
-        )
-
-    def aggregate_results(self, sub, sub_results: list[SubmissionResult], shadow_result: bool = False) -> SubmissionResult:
-        res = SubmissionResult('JE')
-
-        for r in sub_results:
-            if r.runtime > res.runtime:
-                res.runtime = r.runtime
-                res.runtime_testcase = r.runtime_testcase
-            if r.ac_runtime > res.ac_runtime:
-                res.ac_runtime = r.ac_runtime
-                res.ac_runtime_testcase = r.ac_runtime_testcase
-            res.sample_failures.extend(r.sample_failures)
-
-        judge_error = next((r for r in sub_results if r.verdict == 'JE'), None)
-        if judge_error:
-            res.verdict = judge_error.verdict
-            res.reason = judge_error.reason
-            res.additional_info = judge_error.additional_info
-            res.testcase = judge_error.testcase
-        else:
-            res.verdict, score = self._problem.graders.grade(sub_results, self, shadow_result)
-            if sub_results:
-                res.testcase = sub_results[-1].testcase
-                res.additional_info = sub_results[-1].additional_info
-            if self._problem.is_scoring():
-                res.score = score
-                min_score, max_score = self.get_score_range()
-                if score is not None and not (min_score <= score <= max_score) and not self._seen_oob_scores:
-                    # Don't warn twice on the same subgroup, since every submission is likely
-                    # to have the same error.
-                    self._seen_oob_scores = True
-                    groupname = os.path.relpath(self._datadir, self._problem.probdir)
-                    self.error(
-                        f'submission {sub} got {res} on group {groupname}, which is outside of expected score range [{min_score}, {max_score}]'
-                    )
-        return res
-
 
 class ProblemStatement(ProblemPart):
     statements: dict[str, list[Path]]  # Maps language code -> statement(s)
@@ -1095,94 +943,6 @@ def check(self, context: Context) -> bool:
                 self.fatal(f'Compile error for {self._grader}', msg)
         return self._check_res
 
-    def grade(
-        self, sub_results: list[SubmissionResult], testcasegroup: TestCaseGroup, shadow_result: bool = False
-    ) -> tuple[Verdict, float | None]:
-        if testcasegroup.config['grading'] == 'default':
-            if not self._default_grader:
-                self.fatal('Failed to locate default grader')
-                return ('JE', None)
-            grader = self._default_grader
-        else:
-            if not self._grader:
-                self.fatal('Problem has grading: custom without any custom grader')
-                return ('JE', None)
-            grader = self._grader
-
-        if not grader.compile()[0]:
-            self.fatal(f'Failed to compile grader {grader}', grader.compile()[1])
-            return ('JE', None)
-
-        grader_input = ''.join([f'{r.verdict} {0 if r.score is None else r.score}\n' for r in sub_results])
-        grader_output_re = r'^((AC)|(WA)|(TLE)|(RTE)|(JE))\s+-?[0-9.]+\s*$'
-        verdict: Verdict = 'AC'
-        score: float = 0
-
-        if not sub_results:
-            self.info(f'No results on {testcasegroup}, so no grader ran')
-            return (verdict, score)
-
-        grader_flags = testcasegroup.config['grader_flags'].split()
-        self.debug(f'Grading {len(sub_results)} results:\n{grader_input}')
-        self.debug(f'Grader flags: {grader_flags}')
-
-        infile_path = outfile_path = errfile_path = None
-        try:
-            # Create input and output files for grader
-            # We do it in this awkward way because the files need to be closed before reading/writing
-            with tempfile.NamedTemporaryFile(mode='w', delete=False) as infile:
-                infile.write(grader_input)
-                infile_path = infile.name
-
-            with tempfile.NamedTemporaryFile(delete=False) as outfile:
-                outfile_path = outfile.name
-
-            with tempfile.NamedTemporaryFile(delete=False) as errfile:
-                errfile_path = errfile.name
-
-            status, runtime = grader.run(infile_path, outfile_path, errfile_path, args=grader_flags)
-
-            with open(outfile_path, 'r') as fh:
-                grader_output = fh.read()
-
-            with open(errfile_path, 'r') as errfile:
-                stderr_content = errfile.read()
-
-            if not os.WIFEXITED(status) or os.WEXITSTATUS(status) != 0:
-                if not os.WIFEXITED(status):
-                    self.error(f'Judge error: {grader} crashed')
-                else:
-                    self.error(f'Judge error: exit code {os.WEXITSTATUS(status)} for grader {grader}, expected 0')
-                self.error(f'Grader stderr:\n{stderr_content}\n')
-                self.debug(f'Grader input:\n{grader_input}')
-                return ('JE', None)
-
-            if not re.match(grader_output_re, grader_output):
-                self.error('Judge error: invalid format of grader output')
-                self.debug(f'Output must match: "{grader_output_re}"')
-                self.debug(f'Output was: "{grader_output}"')
-                return ('JE', None)
-
-            verdict_str, score_str = grader_output.split()
-            # Make mypy happy by explicitly using cast
-            verdict = cast(Verdict, verdict_str)
-            score = float(score_str)
-
-            if not shadow_result:
-                self.debug(f'Grade on {testcasegroup} is {verdict} ({score})')
-
-            return (verdict, score)
-        except Exception as e:
-            self.error(f'Grader failed with exception {e}')
-            return ('JE', None)
-        finally:
-            for path in [infile_path, outfile_path, errfile_path]:
-                if path:
-                    try:
-                        os.remove(path)
-                    except OSError:
-                        pass
-
 
 class OutputValidators(ProblemPart):
     _default_validator = run.get_tool('default_validator')
@@ -1293,115 +1053,6 @@ def run_junk_case(case_desc: str, junk_content: bytes, testcases: list[TestCase]
         return self._check_res
 
 
-class Runner:
-    def __init__(self, problem: Problem, sub, context: Context, timelim: float, timelim_low: float, timelim_high: float) -> None:
-        self._problem = problem
-        self._sub = sub
-        self._context = context
-        self._multithreaded = context.executor is not None
-        self._timelim = timelim
-        self._timelim_low = timelim_low
-        self._timelim_high = timelim_high
-        self._cache: dict[TestCase, TestCase.Result] = {}
-        if self._multithreaded:
-            self._queues: dict[TestCase, queue.Queue[TestCase.Result]] = {}
-            self._lock = threading.Lock()
-            self._started_jobs: set[TestCase] = set()
-            self._done_groups: set[TestCaseGroup] = set()
-            self._remaining_jobs: list[TestCase] = []
-            self._recompute_jobs()
-
-    def __enter__(self) -> Runner:
-        if self._multithreaded:
-            for i in range(len(self._remaining_jobs)):
-                self._context.submit_background_work(self._work)
-        return self
-
-    def __exit__(self, *exc) -> None:
-        if self._multithreaded:
-            with self._lock:
-                self._remaining_jobs = []
-
-    def run(self, testcase: TestCase) -> tuple[TestCase.Result, bool]:
-        while testcase.reuse_result_from:
-            testcase = testcase.reuse_result_from
-
-        if testcase in self._cache:
-            return (self._cache[testcase], True)
-
-        if sys.stdout.isatty():
-            msg = f'Running {self._sub} on {testcase}...'
-            sys.stdout.write(msg)
-            sys.stdout.flush()
-
-        if self._multithreaded:
-            result = self._queues[testcase].get()
-        else:
-            result = self._run_submission_real(testcase)
-
-        if sys.stdout.isatty():
-            sys.stdout.write('\b \b' * len(msg))
-
-        self._cache[testcase] = result
-        return (result, False)
-
-    def mark_group_done(self, group: TestCaseGroup, broken: bool) -> None:
-        if self._multithreaded:
-            self._done_groups.add(group)
-            if broken:
-                # Since a group was broken out of, some test cases may no
-                # longer be relevant to run. Recompute the work list.
-                self._recompute_jobs()
-
-    def _run_submission_real(self, item: TestCase) -> TestCase.Result:
-        return item.run_submission_real(self._sub, self._context, self._timelim, self._timelim_low, self._timelim_high)
-
-    def _work(self) -> None:
-        item = self._next_job()
-        if item:
-            res = self._run_submission_real(item)
-            self._queues[item].put(res)
-
-    def _gather_testcases(self, item: TestCase | TestCaseGroup) -> list[TestCase]:
-        if not item.matches_filter(self._context.data_filter):
-            return []
-        if isinstance(item, TestCase):
-            # If testcase is symlink, recursively follow the symlinks until we get a real testcase, ignoring
-            # whether the name of testcases pointed to matches the filter
-            while item.reuse_result_from:
-                item = item.reuse_result_from
-
-            return [item]
-        elif item not in self._done_groups:
-            ret = []
-            for child in item.get_testcases() + item.get_subgroups():
-                ret.extend(self._gather_testcases(child))
-            return ret
-        else:
-            return []
-
-    def _next_job(self) -> TestCase | None:
-        with self._lock:
-            if self._remaining_jobs:
-                job = self._remaining_jobs.pop()
-                self._started_jobs.add(job)
-                return job
-            else:
-                return None
-
-    def _recompute_jobs(self) -> None:
-        with self._lock:
-            seen = set(self._started_jobs)
-            self._remaining_jobs = []
-            for testcase in self._gather_testcases(self._problem.testdata):
-                if testcase not in seen:
-                    seen.add(testcase)
-                    self._remaining_jobs.append(testcase)
-                    if testcase not in self._queues:
-                        self._queues[testcase] = queue.Queue(maxsize=1)
-            self._remaining_jobs.reverse()
-
-
 class Submissions(ProblemPart):
     # (verdict, directory, required)
     _VERDICTS: list[tuple[Verdict, str, bool]] = [

From 55952479fefa0c0e10446d8b4295c8340d0d4546 Mon Sep 17 00:00:00 2001
From: Gunnar Kreitz <gkreitz@kattis.com>
Date: Mon, 27 Apr 2026 11:53:06 +0200
Subject: [PATCH 06/13] Remove TimeLimits - time limits are now just a simple
 float

---
 problemtools/judge/__init__.py         |  2 --
 problemtools/judge/execute.py          | 13 ++++-----
 problemtools/judge/result.py           | 40 --------------------------
 problemtools/judge/submission_judge.py | 16 +++++------
 problemtools/verifyproblem.py          |  5 ++--
 5 files changed, 15 insertions(+), 61 deletions(-)

diff --git a/problemtools/judge/__init__.py b/problemtools/judge/__init__.py
index 6285a067..e338bc62 100644
--- a/problemtools/judge/__init__.py
+++ b/problemtools/judge/__init__.py
@@ -2,7 +2,6 @@
 from .execute import execute_testcase
 from .result import (
     SubmissionResult,
-    TimeLimits,
     Verdict,
 )
 from .submission_judge import SubmissionJudge
@@ -12,7 +11,6 @@
     'CacheKey',
     'SubmissionJudge',
     'SubmissionResult',
-    'TimeLimits',
     'Verdict',
     'execute_testcase',
     'validate_output',
diff --git a/problemtools/judge/execute.py b/problemtools/judge/execute.py
index d9396f33..fcc98057 100644
--- a/problemtools/judge/execute.py
+++ b/problemtools/judge/execute.py
@@ -32,7 +32,7 @@
 
 if TYPE_CHECKING:
     from ..verifyproblem import TestCase
-from .result import SubmissionResult, TimeLimits, classify_result
+from .result import SubmissionResult
 from .validate import _parse_validator_result, _validate_output
 
 _INTERACTIVE_OUTPUT_RE = re.compile(r'\d+ \d+\.\d+ \d+ \d+\.\d+ (validator|submission)')
@@ -206,16 +206,15 @@ def execute_testcase(
     sub: Program,
     output_validator: Program,
     metadata: Metadata,
-    timelimits: TimeLimits,
+    timelim: float,
     base_dir: Path,
     diag: Diagnostics,
-) -> tuple[SubmissionResult, SubmissionResult, SubmissionResult]:
-    """Run sub on testcase and return (nominal, low, high) SubmissionResults."""
+) -> SubmissionResult:
+    """Run sub on a single testcase."""
     with tempfile.TemporaryDirectory(dir=base_dir) as exec_dir:
         execution_dir = Path(exec_dir)
         (execution_dir / 'feedback').mkdir()
         if metadata.is_multi_pass():
-            raw = _run_multipass(testcase, sub, output_validator, metadata, timelimits.high, execution_dir, diag)
+            return _run_multipass(testcase, sub, output_validator, metadata, timelim, execution_dir, diag)
         else:
-            raw = _run_pass(testcase.infile_path, testcase, sub, output_validator, metadata, timelimits.high, execution_dir, diag)
-    return classify_result(raw, timelimits)
+            return _run_pass(testcase.infile_path, testcase, sub, output_validator, metadata, timelim, execution_dir, diag)
diff --git a/problemtools/judge/result.py b/problemtools/judge/result.py
index 2eb0fdd8..f8dc3a78 100644
--- a/problemtools/judge/result.py
+++ b/problemtools/judge/result.py
@@ -1,6 +1,5 @@
 from __future__ import annotations
 
-from dataclasses import dataclass
 from typing import TYPE_CHECKING, Literal
 
 if TYPE_CHECKING:
@@ -47,42 +46,3 @@ def __str__(self) -> str:
         if self.runtime != -1:
             details.append(f'CPU: {self.runtime:.2f}s @ {self.runtime_testcase}')
         return verdict if not details else f'{verdict} [{", ".join(details)}]'
-
-
-@dataclass
-class TimeLimits:
-    nominal: float  # official limit; verdict based on this
-    low: float  # below this is comfortably AC; above is "sensitive to time limit"
-    high: float  # wall-clock ceiling enforced on the process
-
-
-def classify_result(
-    result: SubmissionResult,
-    tl: TimeLimits,
-) -> tuple[SubmissionResult, SubmissionResult, SubmissionResult]:
-    """Map a raw high-limit result into the (nominal, low, high) triple."""
-    runtime = result.runtime
-    if runtime <= tl.low:
-        nominal = low = high = result
-    elif runtime <= tl.nominal:
-        tle = SubmissionResult('TLE')
-        tle.runtime = runtime
-        nominal, low, high = result, tle, result
-    elif result.validator_first and result.verdict == 'WA':
-        # Interactive: validator exited first with WA. This can cause the submission to run
-        # longer than it should. Cap runtimes at tl.low so this doesn't inflate the time limit.
-        import copy
-
-        high = copy.copy(result)
-        high.runtime = min(runtime, tl.low)
-        wa = SubmissionResult('WA')
-        wa.validator_first = True
-        wa.runtime = high.runtime
-        nominal = low = wa
-    else:
-        tle = SubmissionResult('TLE')
-        tle.runtime = runtime
-        nominal, low, high = tle, tle, result
-    for r in (nominal, low, high):
-        r.set_ac_runtime()
-    return nominal, low, high
diff --git a/problemtools/judge/submission_judge.py b/problemtools/judge/submission_judge.py
index 6fe69e27..49b3c9d9 100644
--- a/problemtools/judge/submission_judge.py
+++ b/problemtools/judge/submission_judge.py
@@ -13,7 +13,7 @@
 from .cache import ResultStore
 from .execute import execute_testcase
 from .grade import grade_group
-from .result import SubmissionResult, TimeLimits
+from .result import SubmissionResult
 
 if TYPE_CHECKING:
     from ..verifyproblem import TestCase, TestCaseGroup
@@ -80,7 +80,7 @@ def __init__(
         self._cancelled = _Cancelled()
         self._precompute_started = False
 
-    def precompute(self, timelimits: TimeLimits) -> None:
+    def precompute(self, timelim: float) -> None:
         """Submit all filtered testcases as background jobs.
 
         Returns immediately; workers run concurrently and deposit results into the
@@ -91,7 +91,7 @@ def precompute(self, timelimits: TimeLimits) -> None:
         self._precompute_started = True
         filtered_testcases = (item for item in self._root.get_all_testcases() if item.matches_filter(self._context.data_filter))
         for testcase in filtered_testcases:
-            self._context.submit_background_work(self._populate_cache_for_testcase, testcase, timelimits)
+            self._context.submit_background_work(self._populate_cache_for_testcase, testcase, timelim)
 
     def judge(self, timelim: float) -> list[SubmissionResult]:
         """Walk the test tree in DFS order and return results as a flat list.
@@ -110,24 +110,22 @@ def judge(self, timelim: float) -> list[SubmissionResult]:
         return self._judge_group(self._root, timelim)
 
     def _run(self, testcase: TestCase, timelim: float) -> SubmissionResult:
-        flat = TimeLimits(nominal=timelim, low=timelim, high=timelim)
-        _, _, raw = execute_testcase(
+        return execute_testcase(
             testcase,
             self._sub,
             self._output_validator,
             self._metadata,
-            flat,
+            timelim,
             self._base_dir,
             self._diag,
         )
-        return raw
 
-    def _populate_cache_for_testcase(self, testcase: TestCase, timelimits: TimeLimits) -> None:
+    def _populate_cache_for_testcase(self, testcase: TestCase, timelim: float) -> None:
         if testcase in self._cancelled:
             return
         if not self._store.claim(testcase):
             return  # duplicate testcase (same reuse_key) or already in store
-        self._store.complete(testcase, self._run(testcase, timelimits.high), timelimits.high)
+        self._store.complete(testcase, self._run(testcase, timelim), timelim)
 
     def _judge_testcase(self, testcase: TestCase, timelim: float) -> SubmissionResult:
         val = self._store.get(testcase, timelim)
diff --git a/problemtools/verifyproblem.py b/problemtools/verifyproblem.py
index 806738a1..ad47140b 100644
--- a/problemtools/verifyproblem.py
+++ b/problemtools/verifyproblem.py
@@ -32,7 +32,7 @@
 from .context import Context, PROBLEM_PARTS
 from .diagnostics import Diagnostics, LoggingDiagnostics, VerifyError
 from .formatversion import FormatVersion, get_format_version
-from .judge import CacheKey, SubmissionJudge, SubmissionResult, Verdict, TimeLimits, validate_output
+from .judge import CacheKey, SubmissionJudge, SubmissionResult, Verdict, validate_output
 from .version import add_version_arg
 
 from abc import ABC
@@ -126,7 +126,6 @@ def check(self, context: Context) -> bool:
 
 
 class TestCase(ProblemAspect):
-    Result = tuple[SubmissionResult, SubmissionResult, SubmissionResult]
     is_group: Literal[False] = False  # Temporary workaround for a circular import in judge/submission_judge.py
 
     def __init__(self, problem: Problem, base: str, testcasegroup: TestCaseGroup) -> None:
@@ -1098,7 +1097,7 @@ def check_submission(
             custom_grader=self.problem.graders._grader,
         )
         if context.executor is not None:
-            judge.precompute(TimeLimits(nominal=timelim_high, low=timelim_high, high=timelim_high))
+            judge.precompute(timelim_high)
         results_high = judge.judge(timelim_high)
         if not results_high:  # TODO: We should move this check further out to avoid needing to fake a result here
             self.info('Found no test cases to run on. Did you filter them all out?')

From 5e30b00149c0f0e4d6f9e1d1df034475f15b7bf4 Mon Sep 17 00:00:00 2001
From: Gunnar Kreitz <gkreitz@kattis.com>
Date: Mon, 27 Apr 2026 12:34:58 +0200
Subject: [PATCH 07/13] Clean up flow a bit when there are no test cases to run
 on

---
 problemtools/verifyproblem.py | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/problemtools/verifyproblem.py b/problemtools/verifyproblem.py
index ad47140b..5eacff5b 100644
--- a/problemtools/verifyproblem.py
+++ b/problemtools/verifyproblem.py
@@ -311,7 +311,7 @@ def __str__(self) -> str:
     def matches_filter(self, filter_re: Pattern[str]) -> bool:
         return True
 
-    def get_all_testcases(self) -> list:
+    def get_all_testcases(self) -> list[TestCase]:
         res: list = []
         for child in self._items:
             res += child.get_all_testcases()
@@ -1099,9 +1099,8 @@ def check_submission(
         if context.executor is not None:
             judge.precompute(timelim_high)
         results_high = judge.judge(timelim_high)
-        if not results_high:  # TODO: We should move this check further out to avoid needing to fake a result here
-            self.info('Found no test cases to run on. Did you filter them all out?')
-            return SubmissionResult(expected_verdict)
+        if not results_high:
+            self.fatal('check_submission called, but found no test cases to run on.')
         result_high = results_high[-1]
 
         results = judge.judge(timelim)
@@ -1212,6 +1211,10 @@ def check(self, context: Context) -> bool:
         if limits.time_limit is not None and context.fixed_timelim is not None:
             self.warning('There is a fixed time limit in problem.yaml, and you provided one on command line. Using command line.')
 
+        has_testcases = any(tc.matches_filter(context.data_filter) for tc in self.problem.testdata.get_all_testcases())
+        if not has_testcases:
+            self.warning('Found no test cases to run on. Did you filter them all out?')
+
         for verdict in Submissions._VERDICTS:
             acr = verdict[0]
             if verdict[2] and not self._submissions[acr]:
@@ -1235,11 +1238,12 @@ def check(self, context: Context) -> bool:
                         self.error(f'Compile error for {acr} submission {sub}', additional_info=msg)
                         continue
 
-                    timelim, timelim_high = self._compute_time_limit(fixed_limit, lower_bound_runtime)
-                    res = self.check_submission(sub, context, acr, timelim, timelim_high)
-                    runtimes.append(res.runtime)
+                    if has_testcases:
+                        timelim, timelim_high = self._compute_time_limit(fixed_limit, lower_bound_runtime)
+                        res = self.check_submission(sub, context, acr, timelim, timelim_high)
+                        runtimes.append(res.runtime)
 
-            if acr == 'AC':
+            if acr == 'AC' and has_testcases:
                 if len(runtimes) > 0:
                     lower_bound_runtime = max(runtimes)
 

From 3714f559897f43277f5437b8746d6c18e964094e Mon Sep 17 00:00:00 2001
From: Gunnar Kreitz <gkreitz@kattis.com>
Date: Mon, 27 Apr 2026 12:39:14 +0200
Subject: [PATCH 08/13] Rename test_item => test_node which is at least a tad
 more descriptive

---
 problemtools/judge/cache.py            | 12 ++++++------
 problemtools/judge/result.py           |  2 +-
 problemtools/judge/submission_judge.py |  4 ++--
 problemtools/verifyproblem.py          |  6 +++---
 4 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/problemtools/judge/cache.py b/problemtools/judge/cache.py
index ea24bc1a..28eafe3e 100644
--- a/problemtools/judge/cache.py
+++ b/problemtools/judge/cache.py
@@ -40,12 +40,12 @@ def _reclassify(result: SubmissionResult, timelim: float) -> SubmissionResult:
     return result
 
 
-def _with_test_item(result: SubmissionResult, testcase: TestCase) -> SubmissionResult:
-    """Return result with test_item set to testcase, copying only if needed."""
-    if result.test_item is testcase:
+def _with_test_node(result: SubmissionResult, testcase: TestCase) -> SubmissionResult:
+    """Return result with test_node set to testcase, copying only if needed."""
+    if result.test_node is testcase:
         return result
     result = copy.copy(result)
-    result.test_item = testcase
+    result.test_node = testcase
     return result
 
 
@@ -105,9 +105,9 @@ def get(self, testcase: TestCase, timelim: float) -> SubmissionResult | Future[S
             return None
         if isinstance(val, Future):
             chained: Future[SubmissionResult] = Future()
-            val.add_done_callback(lambda f: chained.set_result(_with_test_item(_reclassify(f.result(), timelim), testcase)))
+            val.add_done_callback(lambda f: chained.set_result(_with_test_node(_reclassify(f.result(), timelim), testcase)))
             return chained
         if timelim > val.run_timelim:
             # Entry was produced at a lower limit; cannot safely reclassify upward.
             return None
-        return _with_test_item(_reclassify(val.result, timelim), testcase)
+        return _with_test_node(_reclassify(val.result, timelim), testcase)
diff --git a/problemtools/judge/result.py b/problemtools/judge/result.py
index f8dc3a78..c040d8ca 100644
--- a/problemtools/judge/result.py
+++ b/problemtools/judge/result.py
@@ -21,7 +21,7 @@ def __init__(
         self.reason = reason
         self.additional_info = additional_info
         self.testcase: TestCase | None = None
-        self.test_item: TestCase | TestCaseGroup | None = None
+        self.test_node: TestCase | TestCaseGroup | None = None
         self.runtime_testcase: TestCase | None = None
         self.runtime = -1.0
         self.ac_runtime = -1.0
diff --git a/problemtools/judge/submission_judge.py b/problemtools/judge/submission_judge.py
index 49b3c9d9..b35a3553 100644
--- a/problemtools/judge/submission_judge.py
+++ b/problemtools/judge/submission_judge.py
@@ -96,7 +96,7 @@ def precompute(self, timelim: float) -> None:
     def judge(self, timelim: float) -> list[SubmissionResult]:
         """Walk the test tree in DFS order and return results as a flat list.
 
-        Each SubmissionResult has test_item set to the TestCase or TestCaseGroup it
+        Each SubmissionResult has test_node set to the TestCase or TestCaseGroup it
         covers.  Group results immediately follow all their descendants; the root
         group's result is the last element.  Returns an empty list if all testcases
         were filtered out.
@@ -195,6 +195,6 @@ def _judge_group(self, group: TestCaseGroup, timelim: float) -> list[SubmissionR
                 group_verdict = SubmissionResult(verdict, score=score)
                 group_verdict.runtime = max(v.runtime for v in child_results)
 
-        group_verdict.test_item = group
+        group_verdict.test_node = group
         all_results.append(group_verdict)
         return all_results
diff --git a/problemtools/verifyproblem.py b/problemtools/verifyproblem.py
index 5eacff5b..e5a82ac0 100644
--- a/problemtools/verifyproblem.py
+++ b/problemtools/verifyproblem.py
@@ -1109,8 +1109,8 @@ def check_submission(
         # Check if scores were outside of the range for any groups
         if self.problem.is_scoring():
             for r in results:
-                if r.score is not None and isinstance(r.test_item, TestCaseGroup):
-                    r.test_item.check_score_in_bounds(sub, r.score)
+                if r.score is not None and isinstance(r.test_node, TestCaseGroup):
+                    r.test_node.check_score_in_bounds(sub, r.score)
 
         # Warn if AC (but not PAC) submissions fail on samples. It's not uncommon for sample cases to be
         # ignored, so failing on them could be silent otherwise. Skip warning if the result isn't AC -
@@ -1120,7 +1120,7 @@ def check_submission(
                 (
                     r
                     for r in results
-                    if r.verdict != 'AC' and isinstance(r.test_item, TestCase) and r.test_item.is_in_sample_group()
+                    if r.verdict != 'AC' and isinstance(r.test_node, TestCase) and r.test_node.is_in_sample_group()
                 ),
                 None,
             )

From 8f596612310bacdd9ee72ad3e045c2ba9f08e780 Mon Sep 17 00:00:00 2001
From: Gunnar Kreitz <gkreitz@kattis.com>
Date: Mon, 27 Apr 2026 13:08:43 +0200
Subject: [PATCH 09/13] Restore details when printing a SubmissionResult

---
 problemtools/judge/cache.py            |  5 +++--
 problemtools/judge/execute.py          |  7 +++++--
 problemtools/judge/result.py           | 15 +++------------
 problemtools/judge/submission_judge.py | 11 ++++++++++-
 4 files changed, 21 insertions(+), 17 deletions(-)

diff --git a/problemtools/judge/cache.py b/problemtools/judge/cache.py
index 28eafe3e..f3f0c77a 100644
--- a/problemtools/judge/cache.py
+++ b/problemtools/judge/cache.py
@@ -41,11 +41,12 @@ def _reclassify(result: SubmissionResult, timelim: float) -> SubmissionResult:
 
 
 def _with_test_node(result: SubmissionResult, testcase: TestCase) -> SubmissionResult:
-    """Return result with test_node set to testcase, copying only if needed."""
-    if result.test_node is testcase:
+    """Return result with test_node and runtime_testcase set to testcase, copying only if needed."""
+    if result.test_node is testcase and result.runtime_testcase is testcase:
         return result
     result = copy.copy(result)
     result.test_node = testcase
+    result.runtime_testcase = testcase
     return result
 
 
diff --git a/problemtools/judge/execute.py b/problemtools/judge/execute.py
index fcc98057..220d94a4 100644
--- a/problemtools/judge/execute.py
+++ b/problemtools/judge/execute.py
@@ -215,6 +215,9 @@ def execute_testcase(
         execution_dir = Path(exec_dir)
         (execution_dir / 'feedback').mkdir()
         if metadata.is_multi_pass():
-            return _run_multipass(testcase, sub, output_validator, metadata, timelim, execution_dir, diag)
+            result = _run_multipass(testcase, sub, output_validator, metadata, timelim, execution_dir, diag)
         else:
-            return _run_pass(testcase.infile_path, testcase, sub, output_validator, metadata, timelim, execution_dir, diag)
+            result = _run_pass(testcase.infile_path, testcase, sub, output_validator, metadata, timelim, execution_dir, diag)
+    result.test_node = testcase
+    result.runtime_testcase = testcase
+    return result
diff --git a/problemtools/judge/result.py b/problemtools/judge/result.py
index c040d8ca..3aacc0a4 100644
--- a/problemtools/judge/result.py
+++ b/problemtools/judge/result.py
@@ -20,19 +20,10 @@ def __init__(
         self.score = score
         self.reason = reason
         self.additional_info = additional_info
-        self.testcase: TestCase | None = None
         self.test_node: TestCase | TestCaseGroup | None = None
         self.runtime_testcase: TestCase | None = None
         self.runtime = -1.0
-        self.ac_runtime = -1.0
-        self.ac_runtime_testcase: TestCase | None = None
-        self.validator_first = False
-        self.sample_failures: list[SubmissionResult] = []
-
-    def set_ac_runtime(self) -> None:
-        if self.verdict == 'AC':
-            self.ac_runtime = self.runtime
-            self.ac_runtime_testcase = self.runtime_testcase
+        self.validator_first = False  # Needed to work around interactive giving unreliable runtime on WA
 
     def __str__(self) -> str:
         verdict = self.verdict
@@ -41,8 +32,8 @@ def __str__(self) -> str:
             verdict += f' ({self.score:.0f})'
         if self.reason is not None:
             details.append(self.reason)
-        if self.testcase is not None:
-            details.append(f'testcase: {self.testcase}')
+        if self.test_node is not None and not self.test_node.is_group:
+            details.append(f'testcase: {self.test_node}')
         if self.runtime != -1:
             details.append(f'CPU: {self.runtime:.2f}s @ {self.runtime_testcase}')
         return verdict if not details else f'{verdict} [{", ".join(details)}]'
diff --git a/problemtools/judge/submission_judge.py b/problemtools/judge/submission_judge.py
index b35a3553..36d635f0 100644
--- a/problemtools/judge/submission_judge.py
+++ b/problemtools/judge/submission_judge.py
@@ -193,7 +193,16 @@ def _judge_group(self, group: TestCaseGroup, timelim: float) -> list[SubmissionR
                 grader_flags = group.config.get('grader_flags', '').split()
                 verdict, score = grade_group(child_results, grader, grader_flags, self._base_dir, self._diag)
                 group_verdict = SubmissionResult(verdict, score=score)
-                group_verdict.runtime = max(v.runtime for v in child_results)
+                slowest = max(child_results, key=lambda r: r.runtime)
+                group_verdict.runtime = slowest.runtime
+                group_verdict.runtime_testcase = slowest.runtime_testcase
+                # The grader doesn't tell us why it gave a certain result. We still want to propagate reason
+                # and additional_info. As a heuristic, look for the last entry with the same verdict as the
+                # group got, and copy from there.
+                matching = next((r for r in reversed(child_results) if r.verdict == verdict), None)
+                if matching:
+                    group_verdict.reason = matching.reason
+                    group_verdict.additional_info = matching.additional_info
 
         group_verdict.test_node = group
         all_results.append(group_verdict)

From 9589d67ca5a963bab894af067a16338d89992e54 Mon Sep 17 00:00:00 2001
From: Gunnar Kreitz <gkreitz@kattis.com>
Date: Mon, 27 Apr 2026 13:44:22 +0200
Subject: [PATCH 10/13] Repair Runing sub on testcase... message when stdout is
 a tty

---
 problemtools/judge/submission_judge.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/problemtools/judge/submission_judge.py b/problemtools/judge/submission_judge.py
index 36d635f0..5286fb45 100644
--- a/problemtools/judge/submission_judge.py
+++ b/problemtools/judge/submission_judge.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import copy
+import sys
 from concurrent.futures import Future
 from pathlib import Path
 from threading import Lock
@@ -164,7 +165,14 @@ def _judge_group(self, group: TestCaseGroup, timelim: float) -> list[SubmissionR
                 all_results.extend(sub)
                 result = sub[-1]  # last element is the subgroup's own result
             else:
+                if sys.stdout.isatty():
+                    msg = f'Running {self._sub} on {item}...'
+                    sys.stdout.write(msg)
+                    sys.stdout.flush()
                 result = self._judge_testcase(item, timelim)
+                if sys.stdout.isatty():
+                    sys.stdout.write('\b \b' * len(msg))
+
                 # Apply default score here - after we've entered it into the cache, as it may also be present in other groups with different defaults
                 if result.score is None:
                     result = copy.copy(result)

From 4191fae23b873831cfa86502727067d33b385d41 Mon Sep 17 00:00:00 2001
From: Gunnar Kreitz <gkreitz@kattis.com>
Date: Mon, 27 Apr 2026 14:28:42 +0200
Subject: [PATCH 11/13] Refactor - extract _aggregate_group_result from
 _judge_group to simplify function

---
 problemtools/judge/submission_judge.py | 25 ++++++++++++++-----------
 1 file changed, 14 insertions(+), 11 deletions(-)

diff --git a/problemtools/judge/submission_judge.py b/problemtools/judge/submission_judge.py
index 5286fb45..3edf151b 100644
--- a/problemtools/judge/submission_judge.py
+++ b/problemtools/judge/submission_judge.py
@@ -190,28 +190,31 @@ def _judge_group(self, group: TestCaseGroup, timelim: float) -> list[SubmissionR
         if not all_results:  # All our children were filtered
             return []
 
+        group_verdict = self._aggregate_group_result(child_results, group)
+        all_results.append(group_verdict)
+        return all_results
+
+    def _aggregate_group_result(self, child_results: list[SubmissionResult], group: TestCaseGroup) -> SubmissionResult:
         judge_error = next((r for r in child_results if r.verdict == 'JE'), None)
         if judge_error:
-            group_verdict = copy.copy(judge_error)
+            result = copy.copy(judge_error)
         else:
             grader = self._grader_for(group)
             if grader is None:
-                group_verdict = SubmissionResult('JE', reason='grader not found')
+                result = SubmissionResult('JE', reason='grader not found')
             else:
                 grader_flags = group.config.get('grader_flags', '').split()
                 verdict, score = grade_group(child_results, grader, grader_flags, self._base_dir, self._diag)
-                group_verdict = SubmissionResult(verdict, score=score)
+                result = SubmissionResult(verdict, score=score)
                 slowest = max(child_results, key=lambda r: r.runtime)
-                group_verdict.runtime = slowest.runtime
-                group_verdict.runtime_testcase = slowest.runtime_testcase
+                result.runtime = slowest.runtime
+                result.runtime_testcase = slowest.runtime_testcase
                 # The grader doesn't tell us why it gave a certain result. We still want to propagate reason
                 # and additional_info. As a heuristic, look for the last entry with the same verdict as the
                 # group got, and copy from there.
                 matching = next((r for r in reversed(child_results) if r.verdict == verdict), None)
                 if matching:
-                    group_verdict.reason = matching.reason
-                    group_verdict.additional_info = matching.additional_info
-
-        group_verdict.test_node = group
-        all_results.append(group_verdict)
-        return all_results
+                    result.reason = matching.reason
+                    result.additional_info = matching.additional_info
+        result.test_node = group
+        return result

From c6b61e152429f139b157a572ac4fb34a4aebdfe7 Mon Sep 17 00:00:00 2001
From: Gunnar Kreitz <gkreitz@kattis.com>
Date: Mon, 27 Apr 2026 14:34:07 +0200
Subject: [PATCH 12/13] Extract checks to simplify check_submission a bit

---
 problemtools/verifyproblem.py | 34 +++++++++++++++++-----------------
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/problemtools/verifyproblem.py b/problemtools/verifyproblem.py
index e5a82ac0..ac53c53f 100644
--- a/problemtools/verifyproblem.py
+++ b/problemtools/verifyproblem.py
@@ -1116,28 +1116,13 @@ def check_submission(
         # ignored, so failing on them could be silent otherwise. Skip warning if the result isn't AC -
         # then something worse has gone wrong, and we'll error later.
         if expected_verdict == 'AC' and result.verdict == 'AC':
-            sample_failure = next(
-                (
-                    r
-                    for r in results
-                    if r.verdict != 'AC' and isinstance(r.test_node, TestCase) and r.test_node.is_in_sample_group()
-                ),
-                None,
-            )
-            if sample_failure:
+            if sample_failure := self._find_sample_failure(results):
                 self.warning(f'{desc} got {sample_failure.verdict} on sample: {sample_failure}')
 
         # Warn if a PAC submission would affect time limit, had it been use to compute the time limit. Only do this
         # if it gets AC on the computed time limit, otherwise we have other warnings below.
         if partial and result.verdict == 'AC':
-            runtime_without_affecting_tl = timelim / self.problem.metadata.limits.time_multipliers.ac_to_time_limit
-            result_without_affecting_tl = judge.judge(runtime_without_affecting_tl)[-1]
-            if result_without_affecting_tl.verdict != 'AC':
-                timelimits_to_test = set(r.runtime for r in results if r.runtime > runtime_without_affecting_tl)
-                for t in sorted(timelimits_to_test):
-                    r = judge.judge(t)[-1]
-                    if r.verdict == 'AC':
-                        self.warning(f'{desc} is slower than all AC submissions. It needs {t:.2f}s to get AC')
+            self._warn_pac_too_slow(judge, results, timelim, desc)
 
         if result.verdict != result_high.verdict or result.score != result_high.score:
             self.warning(
@@ -1159,6 +1144,21 @@ def check_submission(
 
         return result
 
+    def _find_sample_failure(self, results: list[SubmissionResult]) -> SubmissionResult | None:
+        for r in results:
+            if r.verdict != 'AC' and isinstance(r.test_node, TestCase) and r.test_node.is_in_sample_group():
+                return r
+        return None
+
+    def _warn_pac_too_slow(self, judge: SubmissionJudge, results: list[SubmissionResult], timelim: float, desc: str) -> None:
+        """Warn if a PAC submission is slow enough that it would have affected the time limit."""
+        runtime_without_affecting_tl = timelim / self.problem.metadata.limits.time_multipliers.ac_to_time_limit
+        if judge.judge(runtime_without_affecting_tl)[-1].verdict == 'AC':
+            return
+        for t in sorted(r.runtime for r in results if r.runtime > runtime_without_affecting_tl):
+            if judge.judge(t)[-1].verdict == 'AC':
+                self.warning(f'{desc} is slower than all AC submissions. It needs {t:.2f}s to get AC')
+
     def full_score_finite(self) -> bool:
         min_score, max_score = self.problem.testdata.get_score_range()
         if self.problem.metadata.legacy_grading.objective == 'min':

From ca58ac925a7cf993fbdedeccfa40fc9c1c438ec1 Mon Sep 17 00:00:00 2001
From: Gunnar Kreitz <gkreitz@kattis.com>
Date: Mon, 27 Apr 2026 14:43:45 +0200
Subject: [PATCH 13/13] Avoid deadlock in multithreaded runs if worker crashes

---
 problemtools/judge/submission_judge.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/problemtools/judge/submission_judge.py b/problemtools/judge/submission_judge.py
index 3edf151b..ad0a5d4f 100644
--- a/problemtools/judge/submission_judge.py
+++ b/problemtools/judge/submission_judge.py
@@ -126,7 +126,11 @@ def _populate_cache_for_testcase(self, testcase: TestCase, timelim: float) -> No
             return
         if not self._store.claim(testcase):
             return  # duplicate testcase (same reuse_key) or already in store
-        self._store.complete(testcase, self._run(testcase, timelim), timelim)
+        try:
+            result = self._run(testcase, timelim)
+        except Exception as e:
+            result = SubmissionResult('JE', reason=f'Internal error: {e}')
+        self._store.complete(testcase, result, timelim)
 
     def _judge_testcase(self, testcase: TestCase, timelim: float) -> SubmissionResult:
         val = self._store.get(testcase, timelim)