arty-kk · arty-kk · Mar 31, 2026 · Mar 31, 2026
diff --git a/docs/prompts.md b/docs/prompts.md
@@ -28,3 +28,14 @@ Prompts should request structured JSON for:
 - evaluations,
 - reflections,
 - code-change proposals.
+
+## Supported runtime tasks
+
+`LLMOrchestrator` currently supports exactly these task keys:
+
+- `plan`
+- `evaluate`
+- `reflect`
+- `code_changes`
+
+Each task key must map to an explicit runtime path and an explicit fallback payload.
diff --git a/src/core/evaluator.py b/src/core/evaluator.py
@@ -91,12 +91,8 @@ async def evaluate_async(
         tests_cmd = [
             sys.executable,
             "-m",
-            "unittest",
-            "discover",
-            "-s",
-            str(workspace_path / "tests"),
-            "-t",
-            str(workspace_path),
+            "pytest",
+            "-q",
         ]
         (
             tests_returncode,

diff --git a/src/core/experiment_manager.py b/src/core/experiment_manager.py
@@ -81,6 +81,7 @@ def _normalize_code_changes_paths(code_changes: List[CodeChange], repo_root: Pat
 
 
 _DEFAULT_EVALUATOR_REQUIRED_PATHS = ("src", "tests")
+_REQUIRED_EVALUATOR_METRICS_FIELDS = ("compile_success", "tests_success", "tests_skipped")
 
 
 def _materialization_paths_for_candidate(
@@ -121,6 +122,17 @@ def _candidate_paths(code_changes: List[CodeChange], repo_root: Path) -> list[st
     return candidate_paths
 
 
+def _validate_evaluator_payload(payload: Dict[str, Any]) -> str | None:
+    """Return a ``missing_required_metrics:...`` marker, or None if no field is missing."""
+    # The marker lists absent keys in the declaration order of the required fields.
+    absent = [
+        key for key in _REQUIRED_EVALUATOR_METRICS_FIELDS if key not in payload
+    ]
+    if not absent:
+        return None
+    return f"missing_required_metrics:{','.join(absent)}"
+
+
 @dataclass
 class ExperimentManager:
     repo_root: Path = REPO_ROOT
@@ -186,6 +198,17 @@ async def _evaluate_candidate(
                         and cached.get("repo_hash") == repo_hash
                         and cached.get("code_hash") == code_hash
                     ):
+                        validation_error = _validate_evaluator_payload(metrics)
+                        if validation_error:
+                            malformed_metrics = dict(metrics)
+                            malformed_metrics.setdefault("reason", validation_error)
+                            return candidate.id, {
+                                "metrics": malformed_metrics,
+                                "accepted": False,
+                                "reason": "malformed_metrics",
+                                "cached": True,
+                                "error": validation_error,
+                            }, None, None
                         accepted, reason = should_accept(metrics, baseline_metrics)
                         return candidate.id, {
                             "metrics": metrics,
@@ -251,6 +274,16 @@ async def _evaluate_candidate(
                                 }, None, None
                             payload.setdefault("duration_sec", elapsed)
                             payload.setdefault("timed_out", False)
+                            validation_error = _validate_evaluator_payload(payload)
+                            if validation_error:
+                                payload.setdefault("reason", validation_error)
+                                return candidate.id, {
+                                    "metrics": payload,
+                                    "accepted": False,
+                                    "reason": "malformed_metrics",
+                                    "cached": False,
+                                    "error": validation_error,
+                                }, None, None
                             accepted, reason = should_accept(payload, baseline_metrics)
                             return candidate.id, {
                                 "metrics": payload,

diff --git a/src/core/llm.py b/src/core/llm.py
@@ -24,7 +24,7 @@ class LLMOrchestrator:
     memory_key_requests: str = 'llm_requests'
     memory_key_responses: str = 'llm_responses'
     memory_key_status: str = 'llm_status'
-    supported_tasks: List[str] = field(default_factory=lambda: ['plan', 'evaluate', 'reflect', 'self_evolve', 'code_changes'])
+    supported_tasks: List[str] = field(default_factory=lambda: ['plan', 'evaluate', 'reflect', 'code_changes'])
     _client: OpenAIClient | None = field(default=None, init=False, repr=False)
     _lifecycle_lock: asyncio.Lock = field(default_factory=asyncio.Lock, init=False, repr=False)
 

diff --git a/tests/test_evaluator.py b/tests/test_evaluator.py
@@ -0,0 +1,46 @@
+# SPDX-FileCopyrightText: 2026 Сацук Артём Венедиктович (Satsuk Artem)
+# SPDX-License-Identifier: Apache-2.0
+
+from __future__ import annotations
+
+import asyncio
+from pathlib import Path
+
+from sif.core import evaluator
+
+
+def test_evaluator_uses_pytest_and_reports_success(monkeypatch, tmp_path: Path) -> None:
+    """Stub both subprocess calls as successes; expect passing metrics and `-m pytest -q`."""
+    seen_cmds: list[list[str]] = []
+
+    async def fake_run_subprocess(cmd: list[str], *, timeout_s: float, cwd=None, env=None):
+        seen_cmds.append(cmd)  # record every invocation in call order
+        stdout = 'compile ok' if cmd[2] == 'compileall' else 'tests ok'
+        return 0, stdout, '', False
+
+    monkeypatch.setattr(evaluator, 'run_benchmarks_async', lambda _workspace: asyncio.sleep(0, result={}))
+    monkeypatch.setattr(evaluator, '_run_subprocess', fake_run_subprocess)
+    metrics = asyncio.run(evaluator.evaluate_async(tmp_path, benchmark_mode='never'))
+
+    assert metrics['compile_success'] is True
+    assert metrics['tests_success'] is True
+    assert metrics['tests_status'] == 'passed'
+    tests_cmd = seen_cmds[1]  # second recorded call
+    assert tests_cmd[:4] == [tests_cmd[0], '-m', 'pytest', '-q']
+
+
+def test_evaluator_marks_test_failure_when_pytest_exits_nonzero(monkeypatch, tmp_path: Path) -> None:
+    """Stub the non-compile subprocess call with exit code 1; expect failed test metrics."""
+    async def fake_run_subprocess(cmd: list[str], *, timeout_s: float, cwd=None, env=None):
+        compiling = cmd[2] == 'compileall'  # only the compile step succeeds
+        return (0, 'compile ok', '', False) if compiling else (1, '', 'tests failed', False)
+
+    monkeypatch.setattr(evaluator, 'run_benchmarks_async', lambda _workspace: asyncio.sleep(0, result={}))
+    monkeypatch.setattr(evaluator, '_run_subprocess', fake_run_subprocess)
+    metrics = asyncio.run(evaluator.evaluate_async(tmp_path, benchmark_mode='never'))
+
+    # Compilation is reported as passing; the non-zero exit shows up in the test fields.
+    assert metrics['compile_success'] is True
+    assert metrics['tests_success'] is False
+    assert metrics['tests_status'] == 'failed'
+    assert metrics['tests_returncode'] == 1