diff --git a/problemtools/context.py b/problemtools/context.py new file mode 100644 index 00000000..ab705d2b --- /dev/null +++ b/problemtools/context.py @@ -0,0 +1,36 @@ +from __future__ import annotations + +import concurrent.futures +from concurrent.futures import ThreadPoolExecutor +import re +from typing import Callable, Pattern, ParamSpec, TypeVar + +_T = TypeVar('_T') +_P = ParamSpec('_P') + +PROBLEM_PARTS = ['config', 'data', 'graders', 'statement', 'submissions', 'validators'] + + +class Context: + # Default values here must be kept in sync with the defaults in argparser(). + def __init__( + self, + data_filter: Pattern[str] = re.compile('.*'), + submission_filter: Pattern[str] = re.compile('.*'), + fixed_timelim: float | None = None, + parts: list[str] | None = None, + threads: int = 1, + ) -> None: + self.data_filter = data_filter + self.submission_filter = submission_filter + self.fixed_timelim = fixed_timelim + self.parts: list[str] = parts if parts is not None else list(PROBLEM_PARTS) + self.executor: ThreadPoolExecutor | None = ThreadPoolExecutor(threads) if threads > 1 else None + self._background_work: list[concurrent.futures.Future[object]] = [] + + def submit_background_work(self, job: Callable[_P, _T], *args: _P.args, **kwargs: _P.kwargs) -> None: + assert self.executor + self._background_work.append(self.executor.submit(job, *args, **kwargs)) + + def wait_for_background_work(self) -> None: + concurrent.futures.wait(self._background_work) diff --git a/problemtools/judge/__init__.py b/problemtools/judge/__init__.py new file mode 100644 index 00000000..10972ded --- /dev/null +++ b/problemtools/judge/__init__.py @@ -0,0 +1,17 @@ +from .execute import execute_testcase +from .result import ( + SubmissionResult, + TimeLimits, + Verdict, + classify_result, +) +from .validate import validate_output + +__all__ = [ + 'SubmissionResult', + 'TimeLimits', + 'Verdict', + 'classify_result', + 'execute_testcase', + 'validate_output', +] diff --git a/problemtools/judge/execute.py b/problemtools/judge/execute.py new file mode 100644 index 00000000..d9396f33 --- /dev/null +++ b/problemtools/judge/execute.py @@ -0,0 +1,221 @@ +"""Single test case execution. + +For each call to execute_testcase, a temporary directory (execution_dir) is +created under base_dir and cleaned up on return. 
Its layout: + + feedback/ validator's structured output (score.txt, + judgemessage.txt, …); persists across multipass + passes so the validator can accumulate output + submission_stdout submission's stdout (batch) or unused (interactive) + submission_stderr submission's stderr + val_stdout output validator's stdout + val_stderr output validator's stderr + interactive_output interactive proxy's output (interactive only) + input.in next-pass input after nextpass.in is moved here + (multipass only) +""" + +from __future__ import annotations + +import math +import os +import re +import signal +import tempfile +from pathlib import Path + +from typing import TYPE_CHECKING + +from ..diagnostics import Diagnostics +from ..metadata import Metadata +from ..run import Program, get_tool + +if TYPE_CHECKING: + from ..verifyproblem import TestCase +from .result import SubmissionResult, TimeLimits, classify_result +from .validate import _parse_validator_result, _validate_output + +_INTERACTIVE_OUTPUT_RE = re.compile(r'\d+ \d+\.\d+ \d+ \d+\.\d+ (validator|submission)') + + +def _is_TLE(status: int, may_signal_with_usr1: bool = False) -> bool: + return os.WIFSIGNALED(status) and ( + os.WTERMSIG(status) == signal.SIGXCPU or (may_signal_with_usr1 and os.WTERMSIG(status) == signal.SIGUSR1) + ) + + +def _is_RTE(status: int) -> bool: + return not os.WIFEXITED(status) or bool(os.WEXITSTATUS(status)) + + +def _read_safe(path: Path) -> str | None: + try: + return path.read_text(errors='replace') + except OSError: + return None + + +def _run_normal( + infile: Path, + testcase: TestCase, + sub: Program, + output_validator: Program, + metadata: Metadata, + timelim: float, + execution_dir: Path, + diag: Diagnostics, +) -> SubmissionResult: + """Run a submission once (non-interactive)""" + outfile = execution_dir / 'submission_stdout' + errfile = execution_dir / 'submission_stderr' + status, runtime = sub.run( + infile=str(infile), + outfile=str(outfile), + errfile=str(errfile), + timelim=math.ceil(timelim) + 1, + memlim=metadata.limits.memory, + work_dir=sub.path, + ) + if _is_TLE(status) or runtime > timelim: + result = SubmissionResult('TLE') + elif _is_RTE(status): + result = SubmissionResult('RTE', additional_info=_read_safe(errfile)) + else: + result = _validate_output(testcase, outfile, output_validator, metadata, execution_dir, diag, infile=infile) + result.runtime = runtime + return result + + +def _run_interactive( + infile: Path, + testcase: TestCase, + sub: Program, + output_validator: Program, + metadata: Metadata, + timelim: float, + execution_dir: Path, + diag: Diagnostics, +) -> SubmissionResult: + """Run a submission once (interactive)""" + interactive = get_tool('interactive') + if interactive is None: + diag.error('Could not locate interactive runner') + return SubmissionResult('JE', reason='Could not locate interactive runner') + + if not output_validator.compile()[0]: + return SubmissionResult('JE', reason=f'output validator {output_validator} failed to compile') + + feedback_dir = execution_dir / 'feedback' + interactive_out = execution_dir / 'interactive_output' + + i_status, _ = interactive.run( + outfile=str(interactive_out), + args=( + ['1', str(math.ceil(2 * timelim))] + + output_validator.get_runcmd(memlim=metadata.limits.validation_memory) + + [str(infile), str(testcase.ansfile_path), str(feedback_dir) + os.sep] + + [';'] + + sub.get_runcmd(memlim=metadata.limits.memory) + ), + work_dir=sub.path, + ) + + if _is_RTE(i_status): + diag.error(f'Interactive runner crashed, status {i_status}') + 
return SubmissionResult('JE', reason=f'Interactive runner crashed, status {i_status}') + + output = interactive_out.read_text() + diag.debug(f'Interactive output: "{output}"') + + if not _INTERACTIVE_OUTPUT_RE.match(output): + diag.error(f'Interactive runner produced unexpected output: "{output}"') + return SubmissionResult('JE', reason=f'Interactive runner produced unexpected output: "{output}"') + + val_status_str, _, sub_status_str, sub_runtime_str, first = output.split() + val_status = int(val_status_str) + sub_status = int(sub_status_str) + sub_runtime = float(sub_runtime_str) + + val_JE = not os.WIFEXITED(val_status) or os.WEXITSTATUS(val_status) not in [42, 43] + val_WA = os.WIFEXITED(val_status) and os.WEXITSTATUS(val_status) == 43 + + if val_JE or (val_WA and first == 'validator'): + # Validator crashed or exited first with WA — follow validator verdict. + # Cap runtime, as the submission can behave erratically and time out + # after the validator exited. + result = _parse_validator_result(output_validator, val_status, feedback_dir, metadata) + sub_runtime = min(sub_runtime, timelim) + elif _is_TLE(sub_status, may_signal_with_usr1=True) or sub_runtime > timelim: + result = SubmissionResult('TLE') + elif _is_RTE(sub_status): + result = SubmissionResult('RTE') + else: + result = _parse_validator_result(output_validator, val_status, feedback_dir, metadata) + + result.runtime = sub_runtime + result.validator_first = first == 'validator' + return result + + +def _run_pass( + infile: Path, + testcase: TestCase, + sub: Program, + output_validator: Program, + metadata: Metadata, + timelim: float, + execution_dir: Path, + diag: Diagnostics, +) -> SubmissionResult: + """Run a submission once (the common case, or one pass for a multi-pass problem)""" + if metadata.is_interactive(): + return _run_interactive(infile, testcase, sub, output_validator, metadata, timelim, execution_dir, diag) + return _run_normal(infile, testcase, sub, output_validator, metadata, timelim, execution_dir, diag) + + +def _run_multipass( + testcase: TestCase, + sub: Program, + output_validator: Program, + metadata: Metadata, + timelim: float, + execution_dir: Path, + diag: Diagnostics, +) -> SubmissionResult: + infile = testcase.infile_path + slowest = 0.0 + feedback_dir = execution_dir / 'feedback' + for _ in range(metadata.limits.validation_passes): + result = _run_pass(infile, testcase, sub, output_validator, metadata, timelim, execution_dir, diag) + slowest = max(slowest, result.runtime) + result.runtime = slowest + nextpass = feedback_dir / 'nextpass.in' + if result.verdict != 'AC': + if nextpass.is_file(): + return SubmissionResult('JE', reason='Output validator produced nextpass.in despite non-42 exit code') + return result + if not nextpass.is_file(): + return result + infile = execution_dir / 'input.in' + nextpass.rename(infile) + return SubmissionResult('JE', reason=f'Validator did not give verdict within {metadata.limits.validation_passes} passes') + + +def execute_testcase( + testcase: TestCase, + sub: Program, + output_validator: Program, + metadata: Metadata, + timelimits: TimeLimits, + base_dir: Path, + diag: Diagnostics, +) -> tuple[SubmissionResult, SubmissionResult, SubmissionResult]: + """Run sub on testcase and return (nominal, low, high) SubmissionResults.""" + with tempfile.TemporaryDirectory(dir=base_dir) as exec_dir: + execution_dir = Path(exec_dir) + (execution_dir / 'feedback').mkdir() + if metadata.is_multi_pass(): + raw = _run_multipass(testcase, sub, output_validator, metadata, 
timelimits.high, execution_dir, diag) + else: + raw = _run_pass(testcase.infile_path, testcase, sub, output_validator, metadata, timelimits.high, execution_dir, diag) + return classify_result(raw, timelimits) diff --git a/problemtools/judge/result.py b/problemtools/judge/result.py new file mode 100644 index 00000000..7a2530c2 --- /dev/null +++ b/problemtools/judge/result.py @@ -0,0 +1,87 @@ +from __future__ import annotations + +from dataclasses import dataclass +from typing import TYPE_CHECKING, Literal + +if TYPE_CHECKING: + from ..verifyproblem import TestCase + +Verdict = Literal['AC', 'TLE', 'OLE', 'MLE', 'RTE', 'WA', 'PAC', 'JE'] + + +class SubmissionResult: + def __init__( + self, + verdict: str, + score: float | None = None, + reason: str | None = None, + additional_info: str | None = None, + ) -> None: + self.verdict = verdict + self.score = score + self.reason = reason + self.additional_info = additional_info + self.testcase: TestCase | None = None + self.runtime_testcase: TestCase | None = None + self.runtime = -1.0 + self.ac_runtime = -1.0 + self.ac_runtime_testcase: TestCase | None = None + self.validator_first = False + self.sample_failures: list[SubmissionResult] = [] + + def set_ac_runtime(self) -> None: + if self.verdict == 'AC': + self.ac_runtime = self.runtime + self.ac_runtime_testcase = self.runtime_testcase + + def __str__(self) -> str: + verdict = self.verdict + details = [] + if verdict == 'AC' and self.score is not None: + verdict += f' ({self.score:.0f})' + if self.reason is not None: + details.append(self.reason) + if self.testcase is not None: + details.append(f'testcase: {self.testcase}') + if self.runtime != -1: + details.append(f'CPU: {self.runtime:.2f}s @ {self.runtime_testcase}') + return verdict if not details else f'{verdict} [{", ".join(details)}]' + + +@dataclass +class TimeLimits: + nominal: float # official limit; verdict based on this + low: float # below this is comfortably AC; above is "sensitive to time limit" + high: float # wall-clock ceiling enforced on the process + + +def classify_result( + result: SubmissionResult, + tl: TimeLimits, +) -> tuple[SubmissionResult, SubmissionResult, SubmissionResult]: + """Map a raw high-limit result into the (nominal, low, high) triple.""" + runtime = result.runtime + if runtime <= tl.low: + nominal = low = high = result + elif runtime <= tl.nominal: + tle = SubmissionResult('TLE') + tle.runtime = runtime + nominal, low, high = result, tle, result + elif result.validator_first and result.verdict == 'WA': + # Interactive: validator exited first with WA. This can cause the submission to run + # longer than it should. Cap runtimes at tl.low so this doesn't inflate the time limit. 
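+        # We only reach this branch when runtime > tl.nominal, so (assuming tl.low <= tl.nominal,
+        # as the TimeLimits fields describe) the min() below resolves to tl.low.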
+ import copy + + high = copy.copy(result) + high.runtime = min(runtime, tl.low) + wa = SubmissionResult('WA') + wa.validator_first = True + wa.runtime = high.runtime + nominal = low = wa + else: + tle = SubmissionResult('TLE') + tle.runtime = runtime + nominal, low, high = tle, tle, result + for r in (nominal, low, high): + r.set_ac_runtime() + return nominal, low, high diff --git a/problemtools/judge/validate.py b/problemtools/judge/validate.py new file mode 100644 index 00000000..88b05c63 --- /dev/null +++ b/problemtools/judge/validate.py @@ -0,0 +1,124 @@ +from __future__ import annotations + +import os +import tempfile +from pathlib import Path +from typing import TYPE_CHECKING + +from ..diagnostics import Diagnostics +from ..metadata import Metadata +from ..run import Program +from .result import SubmissionResult + +if TYPE_CHECKING: + from ..verifyproblem import TestCase + + +def _get_feedback(feedback_dir: Path) -> str | None: + all_feedback = [] + for path in feedback_dir.iterdir(): + if path.stat().st_size == 0: + continue + all_feedback.append(f'=== {path.name}: ===') + # Note: The file could contain non-unicode characters, "replace" to be on the safe side + with open(path, errors='replace') as f: + # Cap amount of feedback per file at some high-ish + # size, so that a buggy validator spewing out lots of + # data doesn't kill us. + all_feedback.append(f.read(128 * 1024)) + return '\n'.join(all_feedback) if all_feedback else None + + +def _parse_validator_result( + val: Program, + status: int, + feedback_dir: Path, + metadata: Metadata, +) -> SubmissionResult: + if not os.WIFEXITED(status): + return SubmissionResult( + 'JE', + reason=f'output validator {val} crashed, status {status}', + additional_info=_get_feedback(feedback_dir), + ) + + ret = os.WEXITSTATUS(status) + if ret not in [42, 43]: + return SubmissionResult( + 'JE', + reason=f'output validator {val} exited with status {ret}', + additional_info=_get_feedback(feedback_dir), + ) + + if ret == 43: + return SubmissionResult('WA', additional_info=_get_feedback(feedback_dir)) + + # ret == 42 (AC); check score handling + score_file = feedback_dir / 'score.txt' + + if not metadata.is_custom_score_allowed() and score_file.is_file(): + return SubmissionResult('JE', reason='validator produced "score.txt" but problem does not have custom scoring activated') + + score: float | None = None + if metadata.is_custom_score_mandatory(): + if score_file.is_file(): + try: + score = float(score_file.read_text()) + except Exception as e: + return SubmissionResult('JE', reason=f'failed to parse validator score: {e}') + elif metadata.is_multi_pass() and (feedback_dir / 'nextpass.in').is_file(): + score = 0.0 + else: + return SubmissionResult('JE', reason='problem has custom scoring but validator did not produce "score.txt"') + + return SubmissionResult('AC', score=score) + + +def _validate_output( + testcase: TestCase, + submission_output: Path, + output_validator: Program, + metadata: Metadata, + execution_dir: Path, + diag: Diagnostics, + infile: Path | None = None, +) -> SubmissionResult: + feedback_dir = execution_dir / 'feedback' + effective_infile = infile if infile is not None else testcase.infile_path + flags = testcase.output_validator_flags + val_timelim = metadata.limits.validation_time + val_memlim = metadata.limits.validation_memory + + if not output_validator.compile()[0]: + return SubmissionResult('JE', reason=f'output validator {output_validator} failed to compile') + val_stdout = execution_dir / 'val_stdout' + val_stderr = 
execution_dir / 'val_stderr' + status, _ = output_validator.run( + infile=str(submission_output), + args=[str(effective_infile), str(testcase.ansfile_path), str(feedback_dir) + os.sep] + flags, + timelim=val_timelim, + memlim=val_memlim, + outfile=str(val_stdout), + errfile=str(val_stderr), + ) + for label, path in [('stdout', val_stdout), ('stderr', val_stderr)]: + try: + if content := path.read_text(errors='replace'): + diag.debug(f'Validator {label}: {content}') + except OSError as e: + diag.info(f'Failed to read validator output: {e}') + return _parse_validator_result(output_validator, status, feedback_dir, metadata) + + +def validate_output( + testcase: TestCase, + submission_output: Path, + output_validator: Program, + metadata: Metadata, + base_dir: Path, + diag: Diagnostics, +) -> SubmissionResult: + with tempfile.TemporaryDirectory(dir=base_dir) as exec_dir: + execution_dir = Path(exec_dir) + (execution_dir / 'feedback').mkdir() + return _validate_output(testcase, submission_output, output_validator, metadata, execution_dir, diag) diff --git a/problemtools/run/program.py b/problemtools/run/program.py index ae27c377..38c789d8 100644 --- a/problemtools/run/program.py +++ b/problemtools/run/program.py @@ -18,6 +18,7 @@ class Program(ABC): """Abstract base class for programs.""" def __init__(self) -> None: + self.path: str = '' self.runtime = 0 self._compile_lock = threading.Lock() self._compile_result: tuple[bool, str | None] | None = None diff --git a/problemtools/verifyproblem.py b/problemtools/verifyproblem.py index 4e842bf9..4b3d1fdb 100644 --- a/problemtools/verifyproblem.py +++ b/problemtools/verifyproblem.py @@ -3,8 +3,6 @@ from __future__ import annotations import argparse -import concurrent.futures -from concurrent.futures import ThreadPoolExecutor import math import threading import queue @@ -13,7 +11,6 @@ import hashlib import collections import os -import signal import re import shutil import logging @@ -35,96 +32,23 @@ from . import problem2pdf from . import run from . 
import statement_util +from .context import Context, PROBLEM_PARTS from .diagnostics import Diagnostics, LoggingDiagnostics, VerifyError from .formatversion import FormatVersion, get_format_version +from .judge import SubmissionResult, Verdict, TimeLimits, validate_output, execute_testcase from .version import add_version_arg from abc import ABC -from typing import Any, Callable, ClassVar, Literal, Pattern, Match, ParamSpec, TypeVar, cast +from typing import Any, Callable, ClassVar, Pattern, Match, ParamSpec, TypeVar, cast from pydantic import ValidationError random.seed(42) -Verdict = Literal['AC', 'TLE', 'OLE', 'MLE', 'RTE', 'WA', 'PAC', 'JE'] - - -def is_TLE(status: int, may_signal_with_usr1: bool = False) -> bool: - return os.WIFSIGNALED(status) and ( - os.WTERMSIG(status) == signal.SIGXCPU or (may_signal_with_usr1 and os.WTERMSIG(status) == signal.SIGUSR1) - ) - - -def is_RTE(status: int) -> bool: - return not os.WIFEXITED(status) or bool(os.WEXITSTATUS(status)) - - -class SubmissionResult: - def __init__(self, verdict: str, score: float | None = None, reason: str | None = None, additional_info: str | None = None): - self.verdict = verdict - self.score = score - self.reason = reason - self.additional_info = additional_info - self.testcase: TestCase | None = None - self.runtime_testcase: TestCase | None = None - self.runtime = -1.0 - self.ac_runtime = -1.0 - self.ac_runtime_testcase: TestCase | None = None - self.validator_first = False - self.sample_failures: list[SubmissionResult] = [] - - def set_ac_runtime(self) -> None: - if self.verdict == 'AC': - self.ac_runtime = self.runtime - self.ac_runtime_testcase = self.runtime_testcase - - def __str__(self) -> str: - verdict = self.verdict - details = [] - - if verdict == 'AC' and self.score is not None: - verdict += f' ({self.score:.0f})' - - if self.reason is not None: - details.append(self.reason) - if self.testcase is not None: - details.append(f'testcase: {self.testcase}') - if self.runtime != -1: - details.append(f'CPU: {self.runtime:.2f}s @ {self.runtime_testcase}') - - if len(details) == 0: - return verdict - return f'{verdict} [{", ".join(details)}]' - _T = TypeVar('_T') _P = ParamSpec('_P') -class Context: - # Default values here must be kept in sync with the defaults in argparser(). 
- def __init__( - self, - data_filter: Pattern[str] = re.compile('.*'), - submission_filter: Pattern[str] = re.compile('.*'), - fixed_timelim: float | None = None, - parts: list[str] | None = None, - threads: int = 1, - ) -> None: - self.data_filter = data_filter - self.submission_filter = submission_filter - self.fixed_timelim = fixed_timelim - self.parts: list[str] = parts if parts is not None else list(PROBLEM_PARTS) - self.executor: ThreadPoolExecutor | None = ThreadPoolExecutor(threads) if threads > 1 else None - self._background_work: list[concurrent.futures.Future[object]] = [] - - def submit_background_work(self, job: Callable[_P, _T], *args: _P.args, **kwargs: _P.kwargs) -> None: - assert self.executor - self._background_work.append(self.executor.submit(job, *args, **kwargs)) - - def wait_for_background_work(self) -> None: - concurrent.futures.wait(self._background_work) - - class ProblemAspect(ABC): _check_res: bool | None = None problem: Problem @@ -242,6 +166,22 @@ def check_size_limits(self, filename: str) -> None: def strip_path_prefix(self, path: str) -> str: return os.path.relpath(path, os.path.join(self._problem.probdir, 'data')) + # Temporary properties for use while refactoring verifyproblem into judge/ + @property + def infile_path(self) -> Path: + return Path(self.infile) + + @property + def ansfile_path(self) -> Path: + return Path(self.ansfile) + + @property + def output_validator_flags(self) -> list[str]: + return ( + self._problem.metadata.legacy_validator_flags.split() + + self.testcasegroup.config.get('output_validator_flags', '').split() + ) + def is_in_sample_group(self) -> bool: return self.strip_path_prefix(self.infile).startswith('sample') @@ -265,7 +205,14 @@ def check(self, context: Context) -> bool: f'Answer file ({anssize:.1f} Mb) is within 50% of output limit ({outputlim} Mb), you might want to increase output limit' ) if not self._problem.is_interactive() and not self._problem.is_multi_pass(): - val_res = self._problem.output_validators.validate(self, self.ansfile) + val_res = validate_output( + testcase=self, + submission_output=Path(self.ansfile), + output_validator=self._problem.output_validators.output_validator, + metadata=self._problem.metadata, + base_dir=Path(self._problem.tmpdir), + diag=self._diag, + ) if val_res.verdict != 'AC': if self.is_in_sample_group(): self.error(f'judge answer file got {val_res} on testcase {self.strip_path_prefix(self.ansfile)}') @@ -322,114 +269,18 @@ def run_submission(self, sub, runner: Runner, context: Context) -> Result: return (res, res_low, res_high) - def run_normal(self, sub, infile: Path, time_limit: float, feedback_dir: Path) -> SubmissionResult: - """ - Run a submission batch-style (non-interactive) - """ - outfile = Path(self._problem.tmpdir) / f'output-{self.counter}' - errfile = Path(self._problem.tmpdir) / f'error-{self.counter}' - - status, runtime = sub.run( - infile=str(infile), - outfile=str(outfile), - errfile=str(errfile), - timelim=math.ceil(time_limit) + 1, - memlim=self._problem.metadata.limits.memory, - work_dir=sub.path, - ) - if is_TLE(status) or runtime > time_limit: - res_high = SubmissionResult('TLE') - elif is_RTE(status): - try: - with open(errfile, mode='rt') as f: - info = f.read() - except IOError: - self.info(f'Failed to read error file {errfile}') - info = None - res_high = SubmissionResult('RTE', additional_info=info) - else: - res_high = self._problem.output_validators.validate( - self, submission_output=str(outfile), infile=str(infile), feedback_dir_path=str(feedback_dir) - ) 
- - res_high.runtime = runtime - return res_high - - def run_submission_multipass(self, feedback_dir: Path, run_sub_fn) -> SubmissionResult: - # This may be called off-main thread. - - infile = Path(self.infile) - validation_passes = self._problem.metadata.limits.validation_passes - - input_dir = Path(tempfile.mkdtemp(prefix=f'input-{self.counter}-', dir=self.problem.tmpdir)) - - slowest_pass = 0 - for curr_pass in range(validation_passes): - res = run_sub_fn(infile) - - slowest_pass = max(slowest_pass, res.runtime) - res.runtime = slowest_pass - - nextpass_file = feedback_dir / 'nextpass.in' - - if res.verdict != 'AC': - if nextpass_file.is_file(): - return SubmissionResult('JE', reason='Output validator produced nextpass.in despite non-42 exit code') - return res - - # Done with passes - if not nextpass_file.is_file(): - return res - - infile = input_dir / 'input.in' - # Remove nextpass from feedback - nextpass_file.rename(infile) - - return SubmissionResult('JE', reason=f'Multipass validator did not give verdict in {validation_passes=} passes') - def run_submission_real(self, sub, context: Context, timelim: float, timelim_low: float, timelim_high: float) -> Result: # This may be called off-main thread. - - feedback_dir = Path(tempfile.mkdtemp(prefix=f'feedback-{self.counter}-', dir=self.problem.tmpdir)) - - if self._problem.is_interactive(): - - def run_submission(infile: Path) -> SubmissionResult: - return self._problem.output_validators.validate_interactive( - self, sub, timelim_high, self._problem.submissions, str(infile), str(feedback_dir) - ) - else: - - def run_submission(infile: Path) -> SubmissionResult: - return self.run_normal(sub, infile, timelim_high, feedback_dir) - - if self._problem.is_multi_pass(): - res_high = self.run_submission_multipass(feedback_dir, run_submission) - else: - res_high = run_submission(Path(self.infile)) - - if res_high.runtime <= timelim_low: - res_low = res_high - res = res_high - elif res_high.runtime <= timelim: - res_low = SubmissionResult('TLE') - res = res_high - elif res_high.validator_first and res_high.verdict == 'WA': - # WA can override TLE for interactive problems (see comment in validate_interactive). 
- res = SubmissionResult('WA') - res.validator_first = True - res_low = res - res_high.runtime = timelim_low - else: - res_low = SubmissionResult('TLE') - res = res_low - - res.runtime = res_high.runtime - res_low.runtime = res_high.runtime - res.set_ac_runtime() - res_low.set_ac_runtime() - res_high.set_ac_runtime() - return (res, res_low, res_high) + timelimits = TimeLimits(nominal=timelim, low=timelim_low, high=timelim_high) + return execute_testcase( + testcase=self, + sub=sub, + output_validator=self._problem.output_validators.output_validator, + metadata=self._problem.metadata, + timelimits=timelimits, + base_dir=Path(self.problem.tmpdir), + diag=self._diag, + ) def _init_result_for_testcase(self, res: SubmissionResult) -> SubmissionResult: res = copy.copy(res) @@ -1338,13 +1189,18 @@ def uses_default_validator(self) -> bool: return self.problem.metadata.legacy_validation == 'default' return not self._validators + @property + def output_validator(self) -> run.Program: + if self.uses_default_validator() or not self._validators: + return self._default_validator + return self._validators[0] + def __str__(self) -> str: return 'output validators' def start_background_work(self, context: Context) -> None: if not self._has_precompiled: - for val in self._actual_validators(): - context.submit_background_work(lambda v: v.compile(), val) + context.submit_background_work(lambda v: v.compile(), self.output_validator) self._has_precompiled = True def check(self, context: Context) -> bool: @@ -1354,16 +1210,19 @@ def check(self, context: Context) -> bool: self.warn_directory('output validators', 'output_validator_directory') - safe_output_validator_languages = {'c', 'cpp', 'python3'} - - for v in self._validators: - if isinstance(v, run.SourceCode) and v.language.lang_id not in safe_output_validator_languages: - self.error_in_2023_07( - f'Output validator in {v.language.name}. Only {safe_output_validator_languages} are standardized. Check carefully if your CCS supports more (Kattis does not).' - ) - if len(self._validators) > 1: - self.error_in_2023_07('Found more than one output validator. This was allowed in legacy (but not on Kattis)') + self.error_in_2023_07( + f'Support for multiple output validators has been dropped. will only use {self.output_validator}' + ) + + safe_output_validator_languages = {'c', 'cpp', 'python3'} + if ( + isinstance(self.output_validator, run.SourceCode) + and self.output_validator.language.lang_id not in safe_output_validator_languages + ): + self.error_in_2023_07( + f'Output validator in {self.output_validator.language.name}. Only {safe_output_validator_languages} are standardized. Check carefully if your CCS supports more (Kattis does not).' 
+ ) if self.uses_default_validator() and self._validators: self.error('There are validator programs but problem.yaml has validation = "default"') @@ -1373,18 +1232,15 @@ def check(self, context: Context) -> bool: if self.uses_default_validator() and self._default_validator is None: self.fatal('Unable to locate default validator') - for val in self._validators[:]: - try: - success, msg = val.compile() - if not success: - self.fatal(f'Compile error for output validator {val}', msg) - except run.ProgramError as e: - self.error(str(e)) + try: + success, msg = self.output_validator.compile() + if not success: + self.fatal(f'Compile error for output validator {self.output_validator}', msg) + except run.ProgramError as e: + self.fatal(f'Compile error for output validator {self.output_validator}', str(e)) # Only sanity check output validators if they all actually compiled if self._check_res: - flags = self.problem.metadata.legacy_validator_flags - # Sanity check cases that should be rejected by the output validator def run_junk_case(case_desc: str, junk_content: bytes, testcases: list[TestCase]) -> list[SubmissionResult]: results = [] @@ -1392,10 +1248,17 @@ def run_junk_case(case_desc: str, junk_content: bytes, testcases: list[TestCase] f.write(junk_content) f.flush() for testcase in testcases: - result = self.validate(testcase, f.name) + result = validate_output( + testcase=testcase, + submission_output=Path(f.name), + output_validator=self.output_validator, + metadata=self.problem.metadata, + base_dir=Path(self.problem.tmpdir), + diag=self._diag, + ) results.append(result) if result.verdict == 'JE': - self.error(f'{case_desc} as output, and output validator flags "{" ".join(flags)}" gave {result}') + self.error(f'{case_desc} as output on test case {testcase} gave {result}') break return results @@ -1406,235 +1269,16 @@ def run_junk_case(case_desc: str, junk_content: bytes, testcases: list[TestCase] if not rejected: self.warning(f'{desc} gets AC') - # For performance reasons, strongly limit the amount of testcases we run on - fast_languages = {'c', 'cpp'} - all_validators_are_fast = True - for val in self._validators: - if isinstance(val, run.SourceCode): - all_validators_are_fast &= val.language.lang_id in fast_languages - num_testcases = 3 if all_validators_are_fast else 1 - test_cases = self.problem.testdata.get_all_testcases()[:num_testcases] # Malformed cases that a poorly-written output validator might crash on - # Note that these might be valid output, so we only check if it crashes + # Note that these might be valid output, so we only check if it crashes. + # These bugs are rarely dependent on the actual test case, so we just + # run on a few to keep things speedy. + test_cases = self.problem.testdata.get_all_testcases()[:3] for desc, junk_case_content in _JUNK_CASES_CRASH: run_junk_case(desc, junk_case_content, test_cases) return self._check_res - @staticmethod - def _get_feedback(feedback_dir: str) -> str | None: - all_feedback = [] - for feedback_file in os.listdir(feedback_dir): - feedback_path = os.path.join(feedback_dir, feedback_file) - if os.path.getsize(feedback_path) == 0: - continue - all_feedback.append(f'=== {feedback_file}: ===') - # Note: The file could contain non-unicode characters, "replace" to be on the safe side - with open(feedback_path, 'r', errors='replace') as feedback: - # Cap amount of feedback per file at some high-ish - # size, so that a buggy validator spewing out lots of - # data doesn't kill us. 
- all_feedback.append(feedback.read(128 * 1024)) - if all_feedback: - return '\n'.join(all_feedback) - return None - - def _parse_validator_results(self, val, status: int, feedbackdir, testcase: TestCase) -> SubmissionResult: - score = None - # TODO: would be good to have some way of displaying the feedback for debugging uses - score_file = os.path.join(feedbackdir, 'score.txt') - if not self.problem.metadata.is_custom_score_allowed() and os.path.isfile(score_file): - return SubmissionResult( - 'JE', reason='validator produced "score.txt" but problem does not have custom scoring activated' - ) - - if not os.WIFEXITED(status): - return SubmissionResult( - 'JE', - reason=f'output validator {val} crashed, status {status}', - additional_info=OutputValidators._get_feedback(feedbackdir), - ) - ret = os.WEXITSTATUS(status) - if ret not in [42, 43]: - return SubmissionResult( - 'JE', - reason=f'output validator {val} exited with status {ret}', - additional_info=OutputValidators._get_feedback(feedbackdir), - ) - - if ret == 43: - return SubmissionResult('WA', additional_info=OutputValidators._get_feedback(feedbackdir)) - - if self.problem.metadata.is_custom_score_mandatory(): - if os.path.isfile(score_file): - try: - score_str = open(score_file).read() - score = float(score_str) - except Exception as e: - return SubmissionResult('JE', reason=f'failed to parse validator score: {e}') - else: - # If we're running multipass, we do not need to output a score after every pass - # We accept the small risk of allowing a non-multipass output validator to not output score.txt - # if it produces a file called nextpass.in - if (Path(feedbackdir) / 'nextpass.in').exists(): - score = 0 - else: - return SubmissionResult('JE', reason='problem has custom scoring but validator did not produce "score.txt"') - - return SubmissionResult('AC', score=score) - - def _actual_validators(self) -> list: - if self.uses_default_validator(): - return [self._default_validator] - return self._validators - - def validate_interactive( - self, - testcase: TestCase, - submission, - timelim: float, - errorhandler: Submissions, - infile: str | None = None, - feedback_dir_path: str | None = None, - ) -> SubmissionResult: - # This may be called off-main thread. 
- interactive_output_re = r'\d+ \d+\.\d+ \d+ \d+\.\d+ (validator|submission)' - res = SubmissionResult('JE') - interactive = run.get_tool('interactive') - if interactive is None: - errorhandler.error('Could not locate interactive runner') - return res - # file descriptor, wall time lim - initargs = ['1', str(math.ceil(2 * timelim))] - validator_args = [infile if infile else testcase.infile, testcase.ansfile, ''] - submission_args = submission.get_runcmd(memlim=self.problem.metadata.limits.memory) - - val_memlim = self.problem.metadata.limits.validation_memory - for i, val in enumerate(self._actual_validators()): - if val.compile()[0]: - # If we are running multiple output validators in legacy, make sure to wipe it - # If we are running multipass, i will always be 0 and we do not accidentally wipe feedback - if i > 0 and feedback_dir_path: - shutil.rmtree(feedback_dir_path) - Path(feedback_dir_path).mkdir() - - if feedback_dir_path: - feedbackdir = feedback_dir_path - else: - feedbackdir = tempfile.mkdtemp(prefix='feedback', dir=self.problem.tmpdir) - - validator_args[2] = feedbackdir + os.sep - f = tempfile.NamedTemporaryFile(delete=False) - interactive_out = f.name - f.close() - i_status, _ = interactive.run( - outfile=interactive_out, - args=initargs + val.get_runcmd(memlim=val_memlim) + validator_args + [';'] + submission_args, - work_dir=submission.path, - ) - if is_RTE(i_status): - errorhandler.error(f'Interactive crashed, status {i_status}') - else: - interactive_output = open(interactive_out).read() - errorhandler.debug(f'Interactive output: "{interactive_output}"') - if not re.match(interactive_output_re, interactive_output): - errorhandler.error( - f'Output from interactive does not follow expected format, got output "{interactive_output}"' - ) - else: - val_status_str, _, sub_status_str, sub_runtime_str, first = interactive_output.split() - sub_status = int(sub_status_str) - sub_runtime = float(sub_runtime_str) - val_status = int(val_status_str) - val_JE = not os.WIFEXITED(val_status) or os.WEXITSTATUS(val_status) not in [42, 43] - val_WA = os.WIFEXITED(val_status) and os.WEXITSTATUS(val_status) == 43 - if val_JE or (val_WA and first == 'validator'): - # If the validator crashed, or exited first with WA, - # always follow validator verdict, even if that early - # exit caused the submission to behave erratically and - # time out. - if sub_runtime > timelim: - sub_runtime = timelim - res = self._parse_validator_results(val, val_status, feedbackdir, testcase) - elif is_TLE(sub_status, True) or sub_runtime > timelim: - res = SubmissionResult('TLE') - elif is_RTE(sub_status): - res = SubmissionResult('RTE') - else: - res = self._parse_validator_results(val, val_status, feedbackdir, testcase) - - res.runtime = sub_runtime - res.validator_first = first == 'validator' - - os.unlink(interactive_out) - if feedback_dir_path is None: - shutil.rmtree(feedbackdir) - if res.verdict != 'AC': - return res - return res - - def validate( - self, testcase: TestCase, submission_output: str, infile: str | None = None, feedback_dir_path: str | None = None - ) -> SubmissionResult: - """ - Run all output validators on the given test case and submission output. - Parameters: - testcase: The test case we are validating. - submission_output: Path to out file of submission. - infile: The input file. Overrides testcase.infile if we are running multipass. - feedback_dir_path: Path to feedback directory. If None, a temporary directory will be created and cleaned up. 
- """ - res = SubmissionResult('JE') - val_timelim = self.problem.metadata.limits.validation_time - val_memlim = self.problem.metadata.limits.validation_memory - flags = ( - self.problem.metadata.legacy_validator_flags.split() + testcase.testcasegroup.config['output_validator_flags'].split() - ) - for i, val in enumerate(self._actual_validators()): - if val.compile()[0]: - # If we are running multiple output validators in legacy, make sure to wipe it - # If we are running multipass, i will always be 0 and we do not accidentally wipe feedback - if i > 0 and feedback_dir_path: - shutil.rmtree(feedback_dir_path) - Path(feedback_dir_path).mkdir() - - if feedback_dir_path: - feedbackdir = feedback_dir_path - else: - feedbackdir = tempfile.mkdtemp(prefix='feedback', dir=self.problem.tmpdir) - - validator_output = tempfile.mkdtemp(prefix='checker_out', dir=self.problem.tmpdir) - outfile = validator_output + '/out.txt' - errfile = validator_output + '/err.txt' - status, runtime = val.run( - infile=submission_output, - args=[infile if infile else testcase.infile, testcase.ansfile, feedbackdir] + flags, - timelim=val_timelim, - memlim=val_memlim, - outfile=outfile, - errfile=errfile, - ) - try: - with open(outfile, mode='rt') as f: - output = f.read() - if output: - self.debug(f'Validator output:\n{output}') - with open(errfile, mode='rt') as f: - error = f.read() - if error: - self.debug(f'Validator stderr:\n{error}') - except IOError as e: - self.info(f'Failed to read validator output: {e}') - res = self._parse_validator_results(val, status, feedbackdir, testcase) - shutil.rmtree(validator_output) - if feedback_dir_path is None: - shutil.rmtree(feedbackdir) - if res.verdict != 'AC': - return res - - # TODO: check that all output validators give same result - return res - class Runner: def __init__(self, problem: Problem, sub, context: Context, timelim: float, timelim_low: float, timelim_high: float) -> None: @@ -1926,9 +1570,6 @@ def _f_n(number: float | None) -> str: return self._check_res -PROBLEM_PARTS = ['config', 'data', 'graders', 'statement', 'submissions', 'validators'] - - class Problem(ProblemAspect): """Represents a checkable problem""" diff --git a/tests/test_output_validator.py b/tests/test_output_validator.py index 552901a0..04759d0d 100644 --- a/tests/test_output_validator.py +++ b/tests/test_output_validator.py @@ -3,7 +3,7 @@ import string import tempfile -from problemtools.verifyproblem import OutputValidators +from problemtools.judge.validate import _get_feedback def test_output_validator_feedback(): @@ -12,7 +12,7 @@ def test_output_validator_feedback(): feedback = pathlib.Path(directory) / 'feedback.txt' text = ''.join(r.choices(string.printable)) feedback.write_text(text) - data = OutputValidators._get_feedback(directory) + data = _get_feedback(pathlib.Path(directory)) assert data is not None and text in data @@ -22,4 +22,4 @@ def test_output_validator_feedback_non_unicode(): feedback = pathlib.Path(directory) / 'feedback.txt' feedback.write_bytes(r.randbytes(1024)) # Just test that this does not throw an error - OutputValidators._get_feedback(directory) + _get_feedback(pathlib.Path(directory))