From 425ec3864b8d44b8ff051115bb3a913167190d02 Mon Sep 17 00:00:00 2001 From: Gunnar Kreitz Date: Wed, 22 Apr 2026 19:06:26 +0200 Subject: [PATCH 1/8] Move Context to a separate file #398 --- problemtools/context.py | 36 +++++++++++++++++++++++++++++++++++ problemtools/verifyproblem.py | 31 +----------------------------- 2 files changed, 37 insertions(+), 30 deletions(-) create mode 100644 problemtools/context.py diff --git a/problemtools/context.py b/problemtools/context.py new file mode 100644 index 00000000..ab705d2b --- /dev/null +++ b/problemtools/context.py @@ -0,0 +1,36 @@ +from __future__ import annotations + +import concurrent.futures +from concurrent.futures import ThreadPoolExecutor +import re +from typing import Callable, Pattern, ParamSpec, TypeVar + +_T = TypeVar('_T') +_P = ParamSpec('_P') + +PROBLEM_PARTS = ['config', 'data', 'graders', 'statement', 'submissions', 'validators'] + + +class Context: + # Default values here must be kept in sync with the defaults in argparser(). + def __init__( + self, + data_filter: Pattern[str] = re.compile('.*'), + submission_filter: Pattern[str] = re.compile('.*'), + fixed_timelim: float | None = None, + parts: list[str] | None = None, + threads: int = 1, + ) -> None: + self.data_filter = data_filter + self.submission_filter = submission_filter + self.fixed_timelim = fixed_timelim + self.parts: list[str] = parts if parts is not None else list(PROBLEM_PARTS) + self.executor: ThreadPoolExecutor | None = ThreadPoolExecutor(threads) if threads > 1 else None + self._background_work: list[concurrent.futures.Future[object]] = [] + + def submit_background_work(self, job: Callable[_P, _T], *args: _P.args, **kwargs: _P.kwargs) -> None: + assert self.executor + self._background_work.append(self.executor.submit(job, *args, **kwargs)) + + def wait_for_background_work(self) -> None: + concurrent.futures.wait(self._background_work) diff --git a/problemtools/verifyproblem.py b/problemtools/verifyproblem.py index 4e842bf9..d5d96932 100644 --- a/problemtools/verifyproblem.py +++ b/problemtools/verifyproblem.py @@ -3,8 +3,6 @@ from __future__ import annotations import argparse -import concurrent.futures -from concurrent.futures import ThreadPoolExecutor import math import threading import queue @@ -35,6 +33,7 @@ from . import problem2pdf from . import run from . import statement_util +from .context import Context, PROBLEM_PARTS from .diagnostics import Diagnostics, LoggingDiagnostics, VerifyError from .formatversion import FormatVersion, get_format_version from .version import add_version_arg @@ -100,31 +99,6 @@ def __str__(self) -> str: _P = ParamSpec('_P') -class Context: - # Default values here must be kept in sync with the defaults in argparser(). 
- def __init__( - self, - data_filter: Pattern[str] = re.compile('.*'), - submission_filter: Pattern[str] = re.compile('.*'), - fixed_timelim: float | None = None, - parts: list[str] | None = None, - threads: int = 1, - ) -> None: - self.data_filter = data_filter - self.submission_filter = submission_filter - self.fixed_timelim = fixed_timelim - self.parts: list[str] = parts if parts is not None else list(PROBLEM_PARTS) - self.executor: ThreadPoolExecutor | None = ThreadPoolExecutor(threads) if threads > 1 else None - self._background_work: list[concurrent.futures.Future[object]] = [] - - def submit_background_work(self, job: Callable[_P, _T], *args: _P.args, **kwargs: _P.kwargs) -> None: - assert self.executor - self._background_work.append(self.executor.submit(job, *args, **kwargs)) - - def wait_for_background_work(self) -> None: - concurrent.futures.wait(self._background_work) - - class ProblemAspect(ABC): _check_res: bool | None = None problem: Problem @@ -1926,9 +1900,6 @@ def _f_n(number: float | None) -> str: return self._check_res -PROBLEM_PARTS = ['config', 'data', 'graders', 'statement', 'submissions', 'validators'] - - class Problem(ProblemAspect): """Represents a checkable problem""" From 48a37a0ca44b2c56ca827b8b82ae1a3c2b12ade9 Mon Sep 17 00:00:00 2001 From: Gunnar Kreitz Date: Thu, 23 Apr 2026 16:43:49 +0200 Subject: [PATCH 2/8] Move SubmissionResult, Verdict, is_RTE, is_TLE to a new module, judge --- problemtools/judge/__init__.py | 13 ++++++++ problemtools/judge/result.py | 59 ++++++++++++++++++++++++++++++++++ problemtools/verifyproblem.py | 54 ++----------------------------- 3 files changed, 74 insertions(+), 52 deletions(-) create mode 100644 problemtools/judge/__init__.py create mode 100644 problemtools/judge/result.py diff --git a/problemtools/judge/__init__.py b/problemtools/judge/__init__.py new file mode 100644 index 00000000..80cf773b --- /dev/null +++ b/problemtools/judge/__init__.py @@ -0,0 +1,13 @@ +from .result import ( + SubmissionResult, + Verdict, + is_RTE, + is_TLE, +) + +__all__ = [ + 'SubmissionResult', + 'Verdict', + 'is_RTE', + 'is_TLE', +] diff --git a/problemtools/judge/result.py b/problemtools/judge/result.py new file mode 100644 index 00000000..bc2dd243 --- /dev/null +++ b/problemtools/judge/result.py @@ -0,0 +1,59 @@ +from __future__ import annotations + +import os +import signal +from typing import TYPE_CHECKING, Literal + +if TYPE_CHECKING: + from ..verifyproblem import TestCase + +Verdict = Literal['AC', 'TLE', 'OLE', 'MLE', 'RTE', 'WA', 'PAC', 'JE'] + + +def is_TLE(status: int, may_signal_with_usr1: bool = False) -> bool: + return os.WIFSIGNALED(status) and ( + os.WTERMSIG(status) == signal.SIGXCPU or (may_signal_with_usr1 and os.WTERMSIG(status) == signal.SIGUSR1) + ) + + +def is_RTE(status: int) -> bool: + return not os.WIFEXITED(status) or bool(os.WEXITSTATUS(status)) + + +class SubmissionResult: + def __init__( + self, + verdict: str, + score: float | None = None, + reason: str | None = None, + additional_info: str | None = None, + ) -> None: + self.verdict = verdict + self.score = score + self.reason = reason + self.additional_info = additional_info + self.testcase: TestCase | None = None + self.runtime_testcase: TestCase | None = None + self.runtime = -1.0 + self.ac_runtime = -1.0 + self.ac_runtime_testcase: TestCase | None = None + self.validator_first = False + self.sample_failures: list[SubmissionResult] = [] + + def set_ac_runtime(self) -> None: + if self.verdict == 'AC': + self.ac_runtime = self.runtime + self.ac_runtime_testcase = 
self.runtime_testcase + + def __str__(self) -> str: + verdict = self.verdict + details = [] + if verdict == 'AC' and self.score is not None: + verdict += f' ({self.score:.0f})' + if self.reason is not None: + details.append(self.reason) + if self.testcase is not None: + details.append(f'testcase: {self.testcase}') + if self.runtime != -1: + details.append(f'CPU: {self.runtime:.2f}s @ {self.runtime_testcase}') + return verdict if not details else f'{verdict} [{", ".join(details)}]' diff --git a/problemtools/verifyproblem.py b/problemtools/verifyproblem.py index d5d96932..a8da49ec 100644 --- a/problemtools/verifyproblem.py +++ b/problemtools/verifyproblem.py @@ -11,7 +11,6 @@ import hashlib import collections import os -import signal import re import shutil import logging @@ -36,64 +35,15 @@ from .context import Context, PROBLEM_PARTS from .diagnostics import Diagnostics, LoggingDiagnostics, VerifyError from .formatversion import FormatVersion, get_format_version +from .judge import SubmissionResult, Verdict, is_TLE, is_RTE from .version import add_version_arg from abc import ABC -from typing import Any, Callable, ClassVar, Literal, Pattern, Match, ParamSpec, TypeVar, cast +from typing import Any, Callable, ClassVar, Pattern, Match, ParamSpec, TypeVar, cast from pydantic import ValidationError random.seed(42) -Verdict = Literal['AC', 'TLE', 'OLE', 'MLE', 'RTE', 'WA', 'PAC', 'JE'] - - -def is_TLE(status: int, may_signal_with_usr1: bool = False) -> bool: - return os.WIFSIGNALED(status) and ( - os.WTERMSIG(status) == signal.SIGXCPU or (may_signal_with_usr1 and os.WTERMSIG(status) == signal.SIGUSR1) - ) - - -def is_RTE(status: int) -> bool: - return not os.WIFEXITED(status) or bool(os.WEXITSTATUS(status)) - - -class SubmissionResult: - def __init__(self, verdict: str, score: float | None = None, reason: str | None = None, additional_info: str | None = None): - self.verdict = verdict - self.score = score - self.reason = reason - self.additional_info = additional_info - self.testcase: TestCase | None = None - self.runtime_testcase: TestCase | None = None - self.runtime = -1.0 - self.ac_runtime = -1.0 - self.ac_runtime_testcase: TestCase | None = None - self.validator_first = False - self.sample_failures: list[SubmissionResult] = [] - - def set_ac_runtime(self) -> None: - if self.verdict == 'AC': - self.ac_runtime = self.runtime - self.ac_runtime_testcase = self.runtime_testcase - - def __str__(self) -> str: - verdict = self.verdict - details = [] - - if verdict == 'AC' and self.score is not None: - verdict += f' ({self.score:.0f})' - - if self.reason is not None: - details.append(self.reason) - if self.testcase is not None: - details.append(f'testcase: {self.testcase}') - if self.runtime != -1: - details.append(f'CPU: {self.runtime:.2f}s @ {self.runtime_testcase}') - - if len(details) == 0: - return verdict - return f'{verdict} [{", ".join(details)}]' - _T = TypeVar('_T') _P = ParamSpec('_P') From 6ca0dbd13e17a505f249702881b7bd2c291325f9 Mon Sep 17 00:00:00 2001 From: Gunnar Kreitz Date: Thu, 23 Apr 2026 16:46:19 +0200 Subject: [PATCH 3/8] Add path to base class - all implementations set it --- problemtools/run/program.py | 1 + 1 file changed, 1 insertion(+) diff --git a/problemtools/run/program.py b/problemtools/run/program.py index ae27c377..38c789d8 100644 --- a/problemtools/run/program.py +++ b/problemtools/run/program.py @@ -18,6 +18,7 @@ class Program(ABC): """Abstract base class for programs.""" def __init__(self) -> None: + self.path: str = '' self.runtime = 0 self._compile_lock = 
threading.Lock() self._compile_result: tuple[bool, str | None] | None = None From d9f18376394d630286765d25187f55db6b5bbf40 Mon Sep 17 00:00:00 2001 From: Gunnar Kreitz Date: Thu, 23 Apr 2026 21:04:27 +0200 Subject: [PATCH 4/8] Begin moving output validation to judge module --- problemtools/judge/__init__.py | 2 + problemtools/judge/validate.py | 124 +++++++++++++++++++++++++++++++++ problemtools/verifyproblem.py | 87 ++++++++--------------- 3 files changed, 154 insertions(+), 59 deletions(-) create mode 100644 problemtools/judge/validate.py diff --git a/problemtools/judge/__init__.py b/problemtools/judge/__init__.py index 80cf773b..9d0eb72d 100644 --- a/problemtools/judge/__init__.py +++ b/problemtools/judge/__init__.py @@ -4,10 +4,12 @@ is_RTE, is_TLE, ) +from .validate import validate_output __all__ = [ 'SubmissionResult', 'Verdict', 'is_RTE', 'is_TLE', + 'validate_output', ] diff --git a/problemtools/judge/validate.py b/problemtools/judge/validate.py new file mode 100644 index 00000000..88b05c63 --- /dev/null +++ b/problemtools/judge/validate.py @@ -0,0 +1,124 @@ +from __future__ import annotations + +import os +import tempfile +from pathlib import Path +from typing import TYPE_CHECKING + +from ..diagnostics import Diagnostics +from ..metadata import Metadata +from ..run import Program +from .result import SubmissionResult + +if TYPE_CHECKING: + from ..verifyproblem import TestCase + + +def _get_feedback(feedback_dir: Path) -> str | None: + all_feedback = [] + for path in feedback_dir.iterdir(): + if path.stat().st_size == 0: + continue + all_feedback.append(f'=== {path.name}: ===') + # Note: The file could contain non-unicode characters, "replace" to be on the safe side + with open(path, errors='replace') as f: + # Cap amount of feedback per file at some high-ish + # size, so that a buggy validator spewing out lots of + # data doesn't kill us. 
+ all_feedback.append(f.read(128 * 1024)) + return '\n'.join(all_feedback) if all_feedback else None + + +def _parse_validator_result( + val: Program, + status: int, + feedback_dir: Path, + metadata: Metadata, +) -> SubmissionResult: + if not os.WIFEXITED(status): + return SubmissionResult( + 'JE', + reason=f'output validator {val} crashed, status {status}', + additional_info=_get_feedback(feedback_dir), + ) + + ret = os.WEXITSTATUS(status) + if ret not in [42, 43]: + return SubmissionResult( + 'JE', + reason=f'output validator {val} exited with status {ret}', + additional_info=_get_feedback(feedback_dir), + ) + + if ret == 43: + return SubmissionResult('WA', additional_info=_get_feedback(feedback_dir)) + + # ret == 42 (AC); check score handling + score_file = feedback_dir / 'score.txt' + + if not metadata.is_custom_score_allowed() and score_file.is_file(): + return SubmissionResult('JE', reason='validator produced "score.txt" but problem does not have custom scoring activated') + + score: float | None = None + if metadata.is_custom_score_mandatory(): + if score_file.is_file(): + try: + score = float(score_file.read_text()) + except Exception as e: + return SubmissionResult('JE', reason=f'failed to parse validator score: {e}') + elif metadata.is_multi_pass() and (feedback_dir / 'nextpass.in').is_file(): + score = 0.0 + else: + return SubmissionResult('JE', reason='problem has custom scoring but validator did not produce "score.txt"') + + return SubmissionResult('AC', score=score) + + +def _validate_output( + testcase: TestCase, + submission_output: Path, + output_validator: Program, + metadata: Metadata, + execution_dir: Path, + diag: Diagnostics, + infile: Path | None = None, +) -> SubmissionResult: + feedback_dir = execution_dir / 'feedback' + effective_infile = infile if infile is not None else testcase.infile_path + flags = testcase.output_validator_flags + val_timelim = metadata.limits.validation_time + val_memlim = metadata.limits.validation_memory + + if not output_validator.compile()[0]: + return SubmissionResult('JE', reason=f'output validator {output_validator} failed to compile') + val_stdout = execution_dir / 'val_stdout' + val_stderr = execution_dir / 'val_stderr' + status, _ = output_validator.run( + infile=str(submission_output), + args=[str(effective_infile), str(testcase.ansfile_path), str(feedback_dir) + os.sep] + flags, + timelim=val_timelim, + memlim=val_memlim, + outfile=str(val_stdout), + errfile=str(val_stderr), + ) + for label, path in [('stdout', val_stdout), ('stderr', val_stderr)]: + try: + if content := path.read_text(errors='replace'): + diag.debug(f'Validator {label}: {content}') + except OSError as e: + diag.info(f'Failed to read validator output: {e}') + return _parse_validator_result(output_validator, status, feedback_dir, metadata) + + +def validate_output( + testcase: TestCase, + submission_output: Path, + output_validator: Program, + metadata: Metadata, + base_dir: Path, + diag: Diagnostics, +) -> SubmissionResult: + with tempfile.TemporaryDirectory(dir=base_dir) as exec_dir: + execution_dir = Path(exec_dir) + (execution_dir / 'feedback').mkdir() + return _validate_output(testcase, submission_output, output_validator, metadata, execution_dir, diag) diff --git a/problemtools/verifyproblem.py b/problemtools/verifyproblem.py index a8da49ec..c52f7d11 100644 --- a/problemtools/verifyproblem.py +++ b/problemtools/verifyproblem.py @@ -35,7 +35,7 @@ from .context import Context, PROBLEM_PARTS from .diagnostics import Diagnostics, LoggingDiagnostics, VerifyError 
from .formatversion import FormatVersion, get_format_version -from .judge import SubmissionResult, Verdict, is_TLE, is_RTE +from .judge import SubmissionResult, Verdict, is_TLE, is_RTE, validate_output from .version import add_version_arg from abc import ABC @@ -166,6 +166,22 @@ def check_size_limits(self, filename: str) -> None: def strip_path_prefix(self, path: str) -> str: return os.path.relpath(path, os.path.join(self._problem.probdir, 'data')) + # Temporary properties for use while refactoring verifyproblem into judge/ + @property + def infile_path(self) -> Path: + return Path(self.infile) + + @property + def ansfile_path(self) -> Path: + return Path(self.ansfile) + + @property + def output_validator_flags(self) -> list[str]: + return ( + self._problem.metadata.legacy_validator_flags.split() + + self.testcasegroup.config.get('output_validator_flags', '').split() + ) + def is_in_sample_group(self) -> bool: return self.strip_path_prefix(self.infile).startswith('sample') @@ -1287,7 +1303,9 @@ def check(self, context: Context) -> bool: ) if len(self._validators) > 1: - self.error_in_2023_07('Found more than one output validator. This was allowed in legacy (but not on Kattis)') + self.error_in_2023_07( + 'Found more than one output validator, will only use one. This was allowed in legacy (but not on Kattis)' + ) if self.uses_default_validator() and self._validators: self.error('There are validator programs but problem.yaml has validation = "default"') @@ -1500,64 +1518,15 @@ def validate_interactive( def validate( self, testcase: TestCase, submission_output: str, infile: str | None = None, feedback_dir_path: str | None = None ) -> SubmissionResult: - """ - Run all output validators on the given test case and submission output. - Parameters: - testcase: The test case we are validating. - submission_output: Path to out file of submission. - infile: The input file. Overrides testcase.infile if we are running multipass. - feedback_dir_path: Path to feedback directory. If None, a temporary directory will be created and cleaned up. 
- """ - res = SubmissionResult('JE') - val_timelim = self.problem.metadata.limits.validation_time - val_memlim = self.problem.metadata.limits.validation_memory - flags = ( - self.problem.metadata.legacy_validator_flags.split() + testcase.testcasegroup.config['output_validator_flags'].split() + val = self._actual_validators()[0] + return validate_output( + testcase=testcase, + submission_output=Path(submission_output), + output_validator=val, + metadata=self.problem.metadata, + base_dir=Path(self.problem.tmpdir), + diag=self._diag, ) - for i, val in enumerate(self._actual_validators()): - if val.compile()[0]: - # If we are running multiple output validators in legacy, make sure to wipe it - # If we are running multipass, i will always be 0 and we do not accidentally wipe feedback - if i > 0 and feedback_dir_path: - shutil.rmtree(feedback_dir_path) - Path(feedback_dir_path).mkdir() - - if feedback_dir_path: - feedbackdir = feedback_dir_path - else: - feedbackdir = tempfile.mkdtemp(prefix='feedback', dir=self.problem.tmpdir) - - validator_output = tempfile.mkdtemp(prefix='checker_out', dir=self.problem.tmpdir) - outfile = validator_output + '/out.txt' - errfile = validator_output + '/err.txt' - status, runtime = val.run( - infile=submission_output, - args=[infile if infile else testcase.infile, testcase.ansfile, feedbackdir] + flags, - timelim=val_timelim, - memlim=val_memlim, - outfile=outfile, - errfile=errfile, - ) - try: - with open(outfile, mode='rt') as f: - output = f.read() - if output: - self.debug(f'Validator output:\n{output}') - with open(errfile, mode='rt') as f: - error = f.read() - if error: - self.debug(f'Validator stderr:\n{error}') - except IOError as e: - self.info(f'Failed to read validator output: {e}') - res = self._parse_validator_results(val, status, feedbackdir, testcase) - shutil.rmtree(validator_output) - if feedback_dir_path is None: - shutil.rmtree(feedbackdir) - if res.verdict != 'AC': - return res - - # TODO: check that all output validators give same result - return res class Runner: From 18e883f5ea13f20f09eee573e01a1f9969aa4ab8 Mon Sep 17 00:00:00 2001 From: Gunnar Kreitz Date: Thu, 23 Apr 2026 22:25:26 +0200 Subject: [PATCH 5/8] Add functions in judge to execute one test case --- problemtools/judge/__init__.py | 6 +++++ problemtools/judge/result.py | 40 ++++++++++++++++++++++++++++++++++ 2 files changed, 46 insertions(+) diff --git a/problemtools/judge/__init__.py b/problemtools/judge/__init__.py index 9d0eb72d..c9b58f6c 100644 --- a/problemtools/judge/__init__.py +++ b/problemtools/judge/__init__.py @@ -1,6 +1,9 @@ +from .execute import execute_testcase from .result import ( SubmissionResult, + TimeLimits, Verdict, + classify_result, is_RTE, is_TLE, ) @@ -8,7 +11,10 @@ __all__ = [ 'SubmissionResult', + 'TimeLimits', 'Verdict', + 'classify_result', + 'execute_testcase', 'is_RTE', 'is_TLE', 'validate_output', diff --git a/problemtools/judge/result.py b/problemtools/judge/result.py index bc2dd243..f335c3bf 100644 --- a/problemtools/judge/result.py +++ b/problemtools/judge/result.py @@ -2,6 +2,7 @@ import os import signal +from dataclasses import dataclass from typing import TYPE_CHECKING, Literal if TYPE_CHECKING: @@ -57,3 +58,42 @@ def __str__(self) -> str: if self.runtime != -1: details.append(f'CPU: {self.runtime:.2f}s @ {self.runtime_testcase}') return verdict if not details else f'{verdict} [{", ".join(details)}]' + + +@dataclass +class TimeLimits: + nominal: float # official limit; verdict based on this + low: float # below this is comfortably AC; above 
is "sensitive to time limit" + high: float # wall-clock ceiling enforced on the process + + +def classify_result( + result: SubmissionResult, + tl: TimeLimits, +) -> tuple[SubmissionResult, SubmissionResult, SubmissionResult]: + """Map a raw high-limit result into the (nominal, low, high) triple.""" + runtime = result.runtime + if runtime <= tl.low: + nominal = low = high = result + elif runtime <= tl.nominal: + tle = SubmissionResult('TLE') + tle.runtime = runtime + nominal, low, high = result, tle, result + elif result.validator_first and result.verdict == 'WA': + # Interactive: validator exited first with WA. This can cause the submission to run + # longer than it should. Cap runtimes at tl.low so this doesn't inflate the time limit. + import copy + + high = copy.copy(result) + high.runtime = min(runtime, tl.low) + wa = SubmissionResult('WA') + wa.validator_first = True + wa.runtime = high.runtime + nominal = low = wa + else: + tle = SubmissionResult('TLE') + tle.runtime = runtime + nominal, low, high = tle, tle, result + for r in (nominal, low, high): + r.set_ac_runtime() + return nominal, low, high From ee4dd4a2cf8807d9c73a6f8c6527dbd7d2e5c8ff Mon Sep 17 00:00:00 2001 From: Gunnar Kreitz Date: Thu, 23 Apr 2026 22:34:35 +0200 Subject: [PATCH 6/8] Replace run_submission_real with judge module --- problemtools/judge/__init__.py | 4 - problemtools/judge/execute.py | 221 +++++++++++++++++++++++++++ problemtools/judge/result.py | 12 -- problemtools/verifyproblem.py | 271 +++------------------------------ tests/test_output_validator.py | 6 +- 5 files changed, 241 insertions(+), 273 deletions(-) create mode 100644 problemtools/judge/execute.py diff --git a/problemtools/judge/__init__.py b/problemtools/judge/__init__.py index c9b58f6c..10972ded 100644 --- a/problemtools/judge/__init__.py +++ b/problemtools/judge/__init__.py @@ -4,8 +4,6 @@ TimeLimits, Verdict, classify_result, - is_RTE, - is_TLE, ) from .validate import validate_output @@ -15,7 +13,5 @@ 'Verdict', 'classify_result', 'execute_testcase', - 'is_RTE', - 'is_TLE', 'validate_output', ] diff --git a/problemtools/judge/execute.py b/problemtools/judge/execute.py new file mode 100644 index 00000000..d9396f33 --- /dev/null +++ b/problemtools/judge/execute.py @@ -0,0 +1,221 @@ +"""Single test case execution. + +For each call to execute_testcase, a temporary directory (execution_dir) is +created under base_dir and cleaned up on return. 
Its layout: + + feedback/ validator's structured output (score.txt, + judgemessage.txt, …); persists across multipass + passes so the validator can accumulate output + submission_stdout submission's stdout (batch) or unused (interactive) + submission_stderr submission's stderr + val_stdout output validator's stdout + val_stderr output validator's stderr + interactive_output interactive proxy's output (interactive only) + input.in next-pass input after nextpass.in is moved here + (multipass only) +""" + +from __future__ import annotations + +import math +import os +import re +import signal +import tempfile +from pathlib import Path + +from typing import TYPE_CHECKING + +from ..diagnostics import Diagnostics +from ..metadata import Metadata +from ..run import Program, get_tool + +if TYPE_CHECKING: + from ..verifyproblem import TestCase +from .result import SubmissionResult, TimeLimits, classify_result +from .validate import _parse_validator_result, _validate_output + +_INTERACTIVE_OUTPUT_RE = re.compile(r'\d+ \d+\.\d+ \d+ \d+\.\d+ (validator|submission)') + + +def _is_TLE(status: int, may_signal_with_usr1: bool = False) -> bool: + return os.WIFSIGNALED(status) and ( + os.WTERMSIG(status) == signal.SIGXCPU or (may_signal_with_usr1 and os.WTERMSIG(status) == signal.SIGUSR1) + ) + + +def _is_RTE(status: int) -> bool: + return not os.WIFEXITED(status) or bool(os.WEXITSTATUS(status)) + + +def _read_safe(path: Path) -> str | None: + try: + return path.read_text(errors='replace') + except OSError: + return None + + +def _run_normal( + infile: Path, + testcase: TestCase, + sub: Program, + output_validator: Program, + metadata: Metadata, + timelim: float, + execution_dir: Path, + diag: Diagnostics, +) -> SubmissionResult: + """Run a submission once (non-interactive)""" + outfile = execution_dir / 'submission_stdout' + errfile = execution_dir / 'submission_stderr' + status, runtime = sub.run( + infile=str(infile), + outfile=str(outfile), + errfile=str(errfile), + timelim=math.ceil(timelim) + 1, + memlim=metadata.limits.memory, + work_dir=sub.path, + ) + if _is_TLE(status) or runtime > timelim: + result = SubmissionResult('TLE') + elif _is_RTE(status): + result = SubmissionResult('RTE', additional_info=_read_safe(errfile)) + else: + result = _validate_output(testcase, outfile, output_validator, metadata, execution_dir, diag, infile=infile) + result.runtime = runtime + return result + + +def _run_interactive( + infile: Path, + testcase: TestCase, + sub: Program, + output_validator: Program, + metadata: Metadata, + timelim: float, + execution_dir: Path, + diag: Diagnostics, +) -> SubmissionResult: + """Run a submission once (interactive)""" + interactive = get_tool('interactive') + if interactive is None: + diag.error('Could not locate interactive runner') + return SubmissionResult('JE', reason='Could not locate interactive runner') + + if not output_validator.compile()[0]: + return SubmissionResult('JE', reason=f'output validator {output_validator} failed to compile') + + feedback_dir = execution_dir / 'feedback' + interactive_out = execution_dir / 'interactive_output' + + i_status, _ = interactive.run( + outfile=str(interactive_out), + args=( + ['1', str(math.ceil(2 * timelim))] + + output_validator.get_runcmd(memlim=metadata.limits.validation_memory) + + [str(infile), str(testcase.ansfile_path), str(feedback_dir) + os.sep] + + [';'] + + sub.get_runcmd(memlim=metadata.limits.memory) + ), + work_dir=sub.path, + ) + + if _is_RTE(i_status): + diag.error(f'Interactive runner crashed, status {i_status}') + 
return SubmissionResult('JE', reason=f'Interactive runner crashed, status {i_status}') + + output = interactive_out.read_text() + diag.debug(f'Interactive output: "{output}"') + + if not _INTERACTIVE_OUTPUT_RE.match(output): + diag.error(f'Interactive runner produced unexpected output: "{output}"') + return SubmissionResult('JE', reason=f'Interactive runner produced unexpected output: "{output}"') + + val_status_str, _, sub_status_str, sub_runtime_str, first = output.split() + val_status = int(val_status_str) + sub_status = int(sub_status_str) + sub_runtime = float(sub_runtime_str) + + val_JE = not os.WIFEXITED(val_status) or os.WEXITSTATUS(val_status) not in [42, 43] + val_WA = os.WIFEXITED(val_status) and os.WEXITSTATUS(val_status) == 43 + + if val_JE or (val_WA and first == 'validator'): + # Validator crashed or exited first with WA — follow validator verdict. + # Cap runtime, as the submission can behave erratically and time out + # after the validator exited. + result = _parse_validator_result(output_validator, val_status, feedback_dir, metadata) + sub_runtime = min(sub_runtime, timelim) + elif _is_TLE(sub_status, may_signal_with_usr1=True) or sub_runtime > timelim: + result = SubmissionResult('TLE') + elif _is_RTE(sub_status): + result = SubmissionResult('RTE') + else: + result = _parse_validator_result(output_validator, val_status, feedback_dir, metadata) + + result.runtime = sub_runtime + result.validator_first = first == 'validator' + return result + + +def _run_pass( + infile: Path, + testcase: TestCase, + sub: Program, + output_validator: Program, + metadata: Metadata, + timelim: float, + execution_dir: Path, + diag: Diagnostics, +) -> SubmissionResult: + """Run a submission once (the common case, or one pass for a multi-pass problem)""" + if metadata.is_interactive(): + return _run_interactive(infile, testcase, sub, output_validator, metadata, timelim, execution_dir, diag) + return _run_normal(infile, testcase, sub, output_validator, metadata, timelim, execution_dir, diag) + + +def _run_multipass( + testcase: TestCase, + sub: Program, + output_validator: Program, + metadata: Metadata, + timelim: float, + execution_dir: Path, + diag: Diagnostics, +) -> SubmissionResult: + infile = testcase.infile_path + slowest = 0.0 + feedback_dir = execution_dir / 'feedback' + for _ in range(metadata.limits.validation_passes): + result = _run_pass(infile, testcase, sub, output_validator, metadata, timelim, execution_dir, diag) + slowest = max(slowest, result.runtime) + result.runtime = slowest + nextpass = feedback_dir / 'nextpass.in' + if result.verdict != 'AC': + if nextpass.is_file(): + return SubmissionResult('JE', reason='Output validator produced nextpass.in despite non-42 exit code') + return result + if not nextpass.is_file(): + return result + infile = execution_dir / 'input.in' + nextpass.rename(infile) + return SubmissionResult('JE', reason=f'Validator did not give verdict within {metadata.limits.validation_passes} passes') + + +def execute_testcase( + testcase: TestCase, + sub: Program, + output_validator: Program, + metadata: Metadata, + timelimits: TimeLimits, + base_dir: Path, + diag: Diagnostics, +) -> tuple[SubmissionResult, SubmissionResult, SubmissionResult]: + """Run sub on testcase and return (nominal, low, high) SubmissionResults.""" + with tempfile.TemporaryDirectory(dir=base_dir) as exec_dir: + execution_dir = Path(exec_dir) + (execution_dir / 'feedback').mkdir() + if metadata.is_multi_pass(): + raw = _run_multipass(testcase, sub, output_validator, metadata, 
timelimits.high, execution_dir, diag) + else: + raw = _run_pass(testcase.infile_path, testcase, sub, output_validator, metadata, timelimits.high, execution_dir, diag) + return classify_result(raw, timelimits) diff --git a/problemtools/judge/result.py b/problemtools/judge/result.py index f335c3bf..7a2530c2 100644 --- a/problemtools/judge/result.py +++ b/problemtools/judge/result.py @@ -1,7 +1,5 @@ from __future__ import annotations -import os -import signal from dataclasses import dataclass from typing import TYPE_CHECKING, Literal @@ -11,16 +9,6 @@ Verdict = Literal['AC', 'TLE', 'OLE', 'MLE', 'RTE', 'WA', 'PAC', 'JE'] -def is_TLE(status: int, may_signal_with_usr1: bool = False) -> bool: - return os.WIFSIGNALED(status) and ( - os.WTERMSIG(status) == signal.SIGXCPU or (may_signal_with_usr1 and os.WTERMSIG(status) == signal.SIGUSR1) - ) - - -def is_RTE(status: int) -> bool: - return not os.WIFEXITED(status) or bool(os.WEXITSTATUS(status)) - - class SubmissionResult: def __init__( self, diff --git a/problemtools/verifyproblem.py b/problemtools/verifyproblem.py index c52f7d11..e630d64e 100644 --- a/problemtools/verifyproblem.py +++ b/problemtools/verifyproblem.py @@ -35,7 +35,7 @@ from .context import Context, PROBLEM_PARTS from .diagnostics import Diagnostics, LoggingDiagnostics, VerifyError from .formatversion import FormatVersion, get_format_version -from .judge import SubmissionResult, Verdict, is_TLE, is_RTE, validate_output +from .judge import SubmissionResult, Verdict, TimeLimits, validate_output, execute_testcase from .version import add_version_arg from abc import ABC @@ -262,114 +262,18 @@ def run_submission(self, sub, runner: Runner, context: Context) -> Result: return (res, res_low, res_high) - def run_normal(self, sub, infile: Path, time_limit: float, feedback_dir: Path) -> SubmissionResult: - """ - Run a submission batch-style (non-interactive) - """ - outfile = Path(self._problem.tmpdir) / f'output-{self.counter}' - errfile = Path(self._problem.tmpdir) / f'error-{self.counter}' - - status, runtime = sub.run( - infile=str(infile), - outfile=str(outfile), - errfile=str(errfile), - timelim=math.ceil(time_limit) + 1, - memlim=self._problem.metadata.limits.memory, - work_dir=sub.path, - ) - if is_TLE(status) or runtime > time_limit: - res_high = SubmissionResult('TLE') - elif is_RTE(status): - try: - with open(errfile, mode='rt') as f: - info = f.read() - except IOError: - self.info(f'Failed to read error file {errfile}') - info = None - res_high = SubmissionResult('RTE', additional_info=info) - else: - res_high = self._problem.output_validators.validate( - self, submission_output=str(outfile), infile=str(infile), feedback_dir_path=str(feedback_dir) - ) - - res_high.runtime = runtime - return res_high - - def run_submission_multipass(self, feedback_dir: Path, run_sub_fn) -> SubmissionResult: - # This may be called off-main thread. 
- - infile = Path(self.infile) - validation_passes = self._problem.metadata.limits.validation_passes - - input_dir = Path(tempfile.mkdtemp(prefix=f'input-{self.counter}-', dir=self.problem.tmpdir)) - - slowest_pass = 0 - for curr_pass in range(validation_passes): - res = run_sub_fn(infile) - - slowest_pass = max(slowest_pass, res.runtime) - res.runtime = slowest_pass - - nextpass_file = feedback_dir / 'nextpass.in' - - if res.verdict != 'AC': - if nextpass_file.is_file(): - return SubmissionResult('JE', reason='Output validator produced nextpass.in despite non-42 exit code') - return res - - # Done with passes - if not nextpass_file.is_file(): - return res - - infile = input_dir / 'input.in' - # Remove nextpass from feedback - nextpass_file.rename(infile) - - return SubmissionResult('JE', reason=f'Multipass validator did not give verdict in {validation_passes=} passes') - def run_submission_real(self, sub, context: Context, timelim: float, timelim_low: float, timelim_high: float) -> Result: # This may be called off-main thread. - - feedback_dir = Path(tempfile.mkdtemp(prefix=f'feedback-{self.counter}-', dir=self.problem.tmpdir)) - - if self._problem.is_interactive(): - - def run_submission(infile: Path) -> SubmissionResult: - return self._problem.output_validators.validate_interactive( - self, sub, timelim_high, self._problem.submissions, str(infile), str(feedback_dir) - ) - else: - - def run_submission(infile: Path) -> SubmissionResult: - return self.run_normal(sub, infile, timelim_high, feedback_dir) - - if self._problem.is_multi_pass(): - res_high = self.run_submission_multipass(feedback_dir, run_submission) - else: - res_high = run_submission(Path(self.infile)) - - if res_high.runtime <= timelim_low: - res_low = res_high - res = res_high - elif res_high.runtime <= timelim: - res_low = SubmissionResult('TLE') - res = res_high - elif res_high.validator_first and res_high.verdict == 'WA': - # WA can override TLE for interactive problems (see comment in validate_interactive). 
- res = SubmissionResult('WA') - res.validator_first = True - res_low = res - res_high.runtime = timelim_low - else: - res_low = SubmissionResult('TLE') - res = res_low - - res.runtime = res_high.runtime - res_low.runtime = res_high.runtime - res.set_ac_runtime() - res_low.set_ac_runtime() - res_high.set_ac_runtime() - return (res, res_low, res_high) + timelimits = TimeLimits(nominal=timelim, low=timelim_low, high=timelim_high) + return execute_testcase( + testcase=self, + sub=sub, + output_validator=self._problem.output_validators.output_validator, + metadata=self._problem.metadata, + timelimits=timelimits, + base_dir=Path(self.problem.tmpdir), + diag=self._diag, + ) def _init_result_for_testcase(self, res: SubmissionResult) -> SubmissionResult: res = copy.copy(res) @@ -1278,6 +1182,12 @@ def uses_default_validator(self) -> bool: return self.problem.metadata.legacy_validation == 'default' return not self._validators + @property + def output_validator(self) -> run.Program: + if self.uses_default_validator() or not self._validators: + return self._default_validator + return self._validators[0] + def __str__(self) -> str: return 'output validators' @@ -1363,158 +1273,11 @@ def run_junk_case(case_desc: str, junk_content: bytes, testcases: list[TestCase] return self._check_res - @staticmethod - def _get_feedback(feedback_dir: str) -> str | None: - all_feedback = [] - for feedback_file in os.listdir(feedback_dir): - feedback_path = os.path.join(feedback_dir, feedback_file) - if os.path.getsize(feedback_path) == 0: - continue - all_feedback.append(f'=== {feedback_file}: ===') - # Note: The file could contain non-unicode characters, "replace" to be on the safe side - with open(feedback_path, 'r', errors='replace') as feedback: - # Cap amount of feedback per file at some high-ish - # size, so that a buggy validator spewing out lots of - # data doesn't kill us. 
- all_feedback.append(feedback.read(128 * 1024)) - if all_feedback: - return '\n'.join(all_feedback) - return None - - def _parse_validator_results(self, val, status: int, feedbackdir, testcase: TestCase) -> SubmissionResult: - score = None - # TODO: would be good to have some way of displaying the feedback for debugging uses - score_file = os.path.join(feedbackdir, 'score.txt') - if not self.problem.metadata.is_custom_score_allowed() and os.path.isfile(score_file): - return SubmissionResult( - 'JE', reason='validator produced "score.txt" but problem does not have custom scoring activated' - ) - - if not os.WIFEXITED(status): - return SubmissionResult( - 'JE', - reason=f'output validator {val} crashed, status {status}', - additional_info=OutputValidators._get_feedback(feedbackdir), - ) - ret = os.WEXITSTATUS(status) - if ret not in [42, 43]: - return SubmissionResult( - 'JE', - reason=f'output validator {val} exited with status {ret}', - additional_info=OutputValidators._get_feedback(feedbackdir), - ) - - if ret == 43: - return SubmissionResult('WA', additional_info=OutputValidators._get_feedback(feedbackdir)) - - if self.problem.metadata.is_custom_score_mandatory(): - if os.path.isfile(score_file): - try: - score_str = open(score_file).read() - score = float(score_str) - except Exception as e: - return SubmissionResult('JE', reason=f'failed to parse validator score: {e}') - else: - # If we're running multipass, we do not need to output a score after every pass - # We accept the small risk of allowing a non-multipass output validator to not output score.txt - # if it produces a file called nextpass.in - if (Path(feedbackdir) / 'nextpass.in').exists(): - score = 0 - else: - return SubmissionResult('JE', reason='problem has custom scoring but validator did not produce "score.txt"') - - return SubmissionResult('AC', score=score) - def _actual_validators(self) -> list: if self.uses_default_validator(): return [self._default_validator] return self._validators - def validate_interactive( - self, - testcase: TestCase, - submission, - timelim: float, - errorhandler: Submissions, - infile: str | None = None, - feedback_dir_path: str | None = None, - ) -> SubmissionResult: - # This may be called off-main thread. 
- interactive_output_re = r'\d+ \d+\.\d+ \d+ \d+\.\d+ (validator|submission)' - res = SubmissionResult('JE') - interactive = run.get_tool('interactive') - if interactive is None: - errorhandler.error('Could not locate interactive runner') - return res - # file descriptor, wall time lim - initargs = ['1', str(math.ceil(2 * timelim))] - validator_args = [infile if infile else testcase.infile, testcase.ansfile, ''] - submission_args = submission.get_runcmd(memlim=self.problem.metadata.limits.memory) - - val_memlim = self.problem.metadata.limits.validation_memory - for i, val in enumerate(self._actual_validators()): - if val.compile()[0]: - # If we are running multiple output validators in legacy, make sure to wipe it - # If we are running multipass, i will always be 0 and we do not accidentally wipe feedback - if i > 0 and feedback_dir_path: - shutil.rmtree(feedback_dir_path) - Path(feedback_dir_path).mkdir() - - if feedback_dir_path: - feedbackdir = feedback_dir_path - else: - feedbackdir = tempfile.mkdtemp(prefix='feedback', dir=self.problem.tmpdir) - - validator_args[2] = feedbackdir + os.sep - f = tempfile.NamedTemporaryFile(delete=False) - interactive_out = f.name - f.close() - i_status, _ = interactive.run( - outfile=interactive_out, - args=initargs + val.get_runcmd(memlim=val_memlim) + validator_args + [';'] + submission_args, - work_dir=submission.path, - ) - if is_RTE(i_status): - errorhandler.error(f'Interactive crashed, status {i_status}') - else: - interactive_output = open(interactive_out).read() - errorhandler.debug(f'Interactive output: "{interactive_output}"') - if not re.match(interactive_output_re, interactive_output): - errorhandler.error( - f'Output from interactive does not follow expected format, got output "{interactive_output}"' - ) - else: - val_status_str, _, sub_status_str, sub_runtime_str, first = interactive_output.split() - sub_status = int(sub_status_str) - sub_runtime = float(sub_runtime_str) - val_status = int(val_status_str) - val_JE = not os.WIFEXITED(val_status) or os.WEXITSTATUS(val_status) not in [42, 43] - val_WA = os.WIFEXITED(val_status) and os.WEXITSTATUS(val_status) == 43 - if val_JE or (val_WA and first == 'validator'): - # If the validator crashed, or exited first with WA, - # always follow validator verdict, even if that early - # exit caused the submission to behave erratically and - # time out. 
- if sub_runtime > timelim: - sub_runtime = timelim - res = self._parse_validator_results(val, val_status, feedbackdir, testcase) - elif is_TLE(sub_status, True) or sub_runtime > timelim: - res = SubmissionResult('TLE') - elif is_RTE(sub_status): - res = SubmissionResult('RTE') - else: - res = self._parse_validator_results(val, val_status, feedbackdir, testcase) - - res.runtime = sub_runtime - res.validator_first = first == 'validator' - - os.unlink(interactive_out) - if feedback_dir_path is None: - shutil.rmtree(feedbackdir) - if res.verdict != 'AC': - return res - return res - def validate( self, testcase: TestCase, submission_output: str, infile: str | None = None, feedback_dir_path: str | None = None ) -> SubmissionResult: diff --git a/tests/test_output_validator.py b/tests/test_output_validator.py index 552901a0..04759d0d 100644 --- a/tests/test_output_validator.py +++ b/tests/test_output_validator.py @@ -3,7 +3,7 @@ import string import tempfile -from problemtools.verifyproblem import OutputValidators +from problemtools.judge.validate import _get_feedback def test_output_validator_feedback(): @@ -12,7 +12,7 @@ def test_output_validator_feedback(): feedback = pathlib.Path(directory) / 'feedback.txt' text = ''.join(r.choices(string.printable)) feedback.write_text(text) - data = OutputValidators._get_feedback(directory) + data = _get_feedback(pathlib.Path(directory)) assert data is not None and text in data @@ -22,4 +22,4 @@ def test_output_validator_feedback_non_unicode(): feedback = pathlib.Path(directory) / 'feedback.txt' feedback.write_bytes(r.randbytes(1024)) # Just test that this does not throw an error - OutputValidators._get_feedback(directory) + _get_feedback(pathlib.Path(directory)) From d53295355ec232c7dcf22adb21e6c9664b705c64 Mon Sep 17 00:00:00 2001 From: Gunnar Kreitz Date: Fri, 24 Apr 2026 08:40:05 +0200 Subject: [PATCH 7/8] Explicitly drop support for multiple output validators Multiple output validators were technically allowed in legacy, but were never supported by Kattis. They were likely not used elsewhere either. Problemtools had some support, but it was never properly finished. As the standard has moved to only allowing a single output validator for new versions, we drop support here. --- problemtools/verifyproblem.py | 59 ++++++++++++++--------------------- 1 file changed, 23 insertions(+), 36 deletions(-) diff --git a/problemtools/verifyproblem.py b/problemtools/verifyproblem.py index e630d64e..f53b2d0c 100644 --- a/problemtools/verifyproblem.py +++ b/problemtools/verifyproblem.py @@ -1193,8 +1193,7 @@ def __str__(self) -> str: def start_background_work(self, context: Context) -> None: if not self._has_precompiled: - for val in self._actual_validators(): - context.submit_background_work(lambda v: v.compile(), val) + context.submit_background_work(lambda v: v.compile(), self.output_validator) self._has_precompiled = True def check(self, context: Context) -> bool: @@ -1204,17 +1203,18 @@ def check(self, context: Context) -> bool: self.warn_directory('output validators', 'output_validator_directory') - safe_output_validator_languages = {'c', 'cpp', 'python3'} - - for v in self._validators: - if isinstance(v, run.SourceCode) and v.language.lang_id not in safe_output_validator_languages: - self.error_in_2023_07( - f'Output validator in {v.language.name}. Only {safe_output_validator_languages} are standardized. Check carefully if your CCS supports more (Kattis does not).' 
-                )
-
         if len(self._validators) > 1:
             self.error_in_2023_07(
-                'Found more than one output validator, will only use one. This was allowed in legacy (but not on Kattis)'
+                f'Support for multiple output validators has been dropped, will only use {self.output_validator}'
+            )
+
+        safe_output_validator_languages = {'c', 'cpp', 'python3'}
+        if (
+            isinstance(self.output_validator, run.SourceCode)
+            and self.output_validator.language.lang_id not in safe_output_validator_languages
+        ):
+            self.error_in_2023_07(
+                f'Output validator in {self.output_validator.language.name}. Only {safe_output_validator_languages} are standardized. Check carefully if your CCS supports more (Kattis does not).'
             )
 
         if self.uses_default_validator() and self._validators:
@@ -1225,18 +1225,15 @@
         if self.uses_default_validator() and self._default_validator is None:
             self.fatal('Unable to locate default validator')
 
-        for val in self._validators[:]:
-            try:
-                success, msg = val.compile()
-                if not success:
-                    self.fatal(f'Compile error for output validator {val}', msg)
-            except run.ProgramError as e:
-                self.error(str(e))
+        try:
+            success, msg = self.output_validator.compile()
+            if not success:
+                self.fatal(f'Compile error for output validator {self.output_validator}', msg)
+        except run.ProgramError as e:
+            self.fatal(f'Compile error for output validator {self.output_validator}', str(e))
 
         # Only sanity check output validators if they all actually compiled
         if self._check_res:
-            flags = self.problem.metadata.legacy_validator_flags
-
             # Sanity check cases that should be rejected by the output validator
             def run_junk_case(case_desc: str, junk_content: bytes, testcases: list[TestCase]) -> list[SubmissionResult]:
                 results = []
@@ -1247,7 +1244,7 @@
                         result = self.validate(testcase, f.name)
                         results.append(result)
                         if result.verdict == 'JE':
-                            self.error(f'{case_desc} as output, and output validator flags "{" ".join(flags)}" gave {result}')
+                            self.error(f'{case_desc} as output on test case {testcase} gave {result}')
                             break
                 return results
@@ -1258,30 +1255,20 @@
                 if not rejected:
                     self.warning(f'{desc} gets AC')
 
-        # For performance reasons, strongly limit the amount of testcases we run on
-        fast_languages = {'c', 'cpp'}
-        all_validators_are_fast = True
-        for val in self._validators:
-            if isinstance(val, run.SourceCode):
-                all_validators_are_fast &= val.language.lang_id in fast_languages
-        num_testcases = 3 if all_validators_are_fast else 1
-        test_cases = self.problem.testdata.get_all_testcases()[:num_testcases]
         # Malformed cases that a poorly-written output validator might crash on
-        # Note that these might be valid output, so we only check if it crashes
+        # Note that these might be valid output, so we only check if it crashes.
+        # These bugs are rarely dependent on the actual test case, so we just
+        # run on a few to keep things speedy. 
+ test_cases = self.problem.testdata.get_all_testcases()[:3] for desc, junk_case_content in _JUNK_CASES_CRASH: run_junk_case(desc, junk_case_content, test_cases) return self._check_res - def _actual_validators(self) -> list: - if self.uses_default_validator(): - return [self._default_validator] - return self._validators - def validate( self, testcase: TestCase, submission_output: str, infile: str | None = None, feedback_dir_path: str | None = None ) -> SubmissionResult: - val = self._actual_validators()[0] + val = self.output_validator return validate_output( testcase=testcase, submission_output=Path(submission_output), From 2d6c51aca56b2bb005b7fc5da8fb05b5b790ae12 Mon Sep 17 00:00:00 2001 From: Gunnar Kreitz Date: Fri, 24 Apr 2026 09:06:48 +0200 Subject: [PATCH 8/8] Drop OutputValidators.validate - call judge.validate_output directly. --- problemtools/verifyproblem.py | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/problemtools/verifyproblem.py b/problemtools/verifyproblem.py index f53b2d0c..4b3d1fdb 100644 --- a/problemtools/verifyproblem.py +++ b/problemtools/verifyproblem.py @@ -205,7 +205,14 @@ def check(self, context: Context) -> bool: f'Answer file ({anssize:.1f} Mb) is within 50% of output limit ({outputlim} Mb), you might want to increase output limit' ) if not self._problem.is_interactive() and not self._problem.is_multi_pass(): - val_res = self._problem.output_validators.validate(self, self.ansfile) + val_res = validate_output( + testcase=self, + submission_output=Path(self.ansfile), + output_validator=self._problem.output_validators.output_validator, + metadata=self._problem.metadata, + base_dir=Path(self._problem.tmpdir), + diag=self._diag, + ) if val_res.verdict != 'AC': if self.is_in_sample_group(): self.error(f'judge answer file got {val_res} on testcase {self.strip_path_prefix(self.ansfile)}') @@ -1241,7 +1248,14 @@ def run_junk_case(case_desc: str, junk_content: bytes, testcases: list[TestCase] f.write(junk_content) f.flush() for testcase in testcases: - result = self.validate(testcase, f.name) + result = validate_output( + testcase=testcase, + submission_output=Path(f.name), + output_validator=self.output_validator, + metadata=self.problem.metadata, + base_dir=Path(self.problem.tmpdir), + diag=self._diag, + ) results.append(result) if result.verdict == 'JE': self.error(f'{case_desc} as output on test case {testcase} gave {result}') @@ -1265,19 +1279,6 @@ def run_junk_case(case_desc: str, junk_content: bytes, testcases: list[TestCase] return self._check_res - def validate( - self, testcase: TestCase, submission_output: str, infile: str | None = None, feedback_dir_path: str | None = None - ) -> SubmissionResult: - val = self.output_validator - return validate_output( - testcase=testcase, - submission_output=Path(submission_output), - output_validator=val, - metadata=self.problem.metadata, - base_dir=Path(self.problem.tmpdir), - diag=self._diag, - ) - class Runner: def __init__(self, problem: Problem, sub, context: Context, timelim: float, timelim_low: float, timelim_high: float) -> None:
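
For reference (not part of the patches themselves): the (nominal, low, high)
verdict logic that run_submission_real used to inline now lives in
judge.classify_result, which can be exercised on its own. A minimal sketch of
its behaviour, using only names introduced by the patches above; the numeric
limits and runtime are invented for illustration:

    from problemtools.judge import SubmissionResult, TimeLimits, classify_result

    limits = TimeLimits(nominal=2.0, low=1.0, high=4.0)

    raw = SubmissionResult('AC')   # result of the single run at the high limit
    raw.runtime = 1.5              # over `low` but within `nominal`

    nominal, low, high = classify_result(raw, limits)
    # nominal -> AC  (within the official limit)
    # low     -> TLE (fails the tightened limit: "sensitive to time limit")
    # high    -> AC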