Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions docs/prompts.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,3 +28,14 @@ Prompts should request structured JSON for:
- evaluations,
- reflections,
- code-change proposals.

## Supported runtime tasks

`LLMOrchestrator` currently supports exactly these task keys:

- `plan`
- `evaluate`
- `reflect`
- `code_changes`

Each task must map to an explicit runtime path and fallback payload.
8 changes: 2 additions & 6 deletions src/core/evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,12 +91,8 @@ async def evaluate_async(
tests_cmd = [
sys.executable,
"-m",
"unittest",
"discover",
"-s",
str(workspace_path / "tests"),
"-t",
str(workspace_path),
"pytest",
"-q",
]
(
tests_returncode,
Expand Down
33 changes: 33 additions & 0 deletions src/core/experiment_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@ def _normalize_code_changes_paths(code_changes: List[CodeChange], repo_root: Pat


# Default path names required for the evaluator.
# NOTE(review): the consumer of this tuple is not visible here — confirm usage.
_DEFAULT_EVALUATOR_REQUIRED_PATHS = ("src", "tests")
# Keys that must be present in an evaluator metrics payload; checked by
# _validate_evaluator_payload, which reports any that are absent.
_REQUIRED_EVALUATOR_METRICS_FIELDS = ("compile_success", "tests_success", "tests_skipped")


def _materialization_paths_for_candidate(
Expand Down Expand Up @@ -121,6 +122,17 @@ def _candidate_paths(code_changes: List[CodeChange], repo_root: Path) -> list[st
return candidate_paths


def _validate_evaluator_payload(payload: Dict[str, Any]) -> str | None:
    """Check that an evaluator payload carries every required metrics key.

    Only key presence is tested; the values stored under the keys are not
    inspected.

    Args:
        payload: Metrics mapping produced by (or cached from) an evaluator run.

    Returns:
        ``None`` when every name in ``_REQUIRED_EVALUATOR_METRICS_FIELDS`` is a
        key of ``payload``; otherwise the string
        ``"missing_required_metrics:<a>,<b>,..."`` listing the absent keys in
        the order they appear in ``_REQUIRED_EVALUATOR_METRICS_FIELDS``.
    """
    absent: list[str] = []
    for required_key in _REQUIRED_EVALUATOR_METRICS_FIELDS:
        if required_key not in payload:
            absent.append(required_key)

    # Guard clause: nothing missing means the payload is well-formed.
    if not absent:
        return None
    return f"missing_required_metrics:{','.join(absent)}"


@dataclass
class ExperimentManager:
repo_root: Path = REPO_ROOT
Expand Down Expand Up @@ -186,6 +198,17 @@ async def _evaluate_candidate(
and cached.get("repo_hash") == repo_hash
and cached.get("code_hash") == code_hash
):
validation_error = _validate_evaluator_payload(metrics)
if validation_error:
malformed_metrics = dict(metrics)
malformed_metrics.setdefault("reason", validation_error)
return candidate.id, {
"metrics": malformed_metrics,
"accepted": False,
"reason": "malformed_metrics",
"cached": True,
"error": validation_error,
}, None, None
accepted, reason = should_accept(metrics, baseline_metrics)
return candidate.id, {
"metrics": metrics,
Expand Down Expand Up @@ -251,6 +274,16 @@ async def _evaluate_candidate(
}, None, None
payload.setdefault("duration_sec", elapsed)
payload.setdefault("timed_out", False)
validation_error = _validate_evaluator_payload(payload)
if validation_error:
payload.setdefault("reason", validation_error)
return candidate.id, {
"metrics": payload,
"accepted": False,
"reason": "malformed_metrics",
"cached": False,
"error": validation_error,
}, None, None
accepted, reason = should_accept(payload, baseline_metrics)
return candidate.id, {
"metrics": payload,
Expand Down
2 changes: 1 addition & 1 deletion src/core/llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ class LLMOrchestrator:
memory_key_requests: str = 'llm_requests'
memory_key_responses: str = 'llm_responses'
memory_key_status: str = 'llm_status'
supported_tasks: List[str] = field(default_factory=lambda: ['plan', 'evaluate', 'reflect', 'self_evolve', 'code_changes'])
supported_tasks: List[str] = field(default_factory=lambda: ['plan', 'evaluate', 'reflect', 'code_changes'])
_client: OpenAIClient | None = field(default=None, init=False, repr=False)
_lifecycle_lock: asyncio.Lock = field(default_factory=asyncio.Lock, init=False, repr=False)

Expand Down
46 changes: 46 additions & 0 deletions tests/test_evaluator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# SPDX-FileCopyrightText: 2026 Сацук Артём Венедиктович (Satsuk Artem)
# SPDX-License-Identifier: Apache-2.0

from __future__ import annotations

import asyncio
from pathlib import Path

from sif.core import evaluator


def test_evaluator_uses_pytest_and_reports_success(monkeypatch, tmp_path: Path) -> None:
    """A zero exit code from every subprocess yields a passing evaluation.

    ``_run_subprocess`` and ``run_benchmarks_async`` on the evaluator module
    are replaced with stubs, so no real process is spawned. The second
    recorded command is expected to start with ``<interpreter> -m pytest -q``.
    """
    recorded: list[list[str]] = []

    async def fake_run_subprocess(cmd: list[str], *, timeout_s: float, cwd=None, env=None):
        # Keep every invocation so the command line can be inspected below.
        recorded.append(cmd)
        stdout = 'compile ok' if cmd[2] == 'compileall' else 'tests ok'
        return 0, stdout, '', False

    def fake_run_benchmarks(_workspace):
        # Awaitable that resolves to an empty benchmark mapping.
        return asyncio.sleep(0, result={})

    monkeypatch.setattr(evaluator, '_run_subprocess', fake_run_subprocess)
    monkeypatch.setattr(evaluator, 'run_benchmarks_async', fake_run_benchmarks)

    outcome = asyncio.run(evaluator.evaluate_async(tmp_path, benchmark_mode='never'))

    assert outcome['compile_success'] is True
    assert outcome['tests_success'] is True
    assert outcome['tests_status'] == 'passed'

    second_cmd = recorded[1]
    # The interpreter path is taken from the command itself, so only the
    # '-m pytest -q' portion is effectively pinned.
    assert second_cmd[0:4] == [second_cmd[0], '-m', 'pytest', '-q']


def test_evaluator_marks_test_failure_when_pytest_exits_nonzero(monkeypatch, tmp_path: Path) -> None:
    """A non-zero exit code from the test command is reported as a test failure.

    The stubbed ``_run_subprocess`` succeeds for the ``compileall`` command and
    returns exit code 1 for any other command; compilation must still be
    reported as successful.
    """
    compile_result = (0, 'compile ok', '', False)
    failing_result = (1, '', 'tests failed', False)

    async def fake_run_subprocess(cmd: list[str], *, timeout_s: float, cwd=None, env=None):
        return compile_result if cmd[2] == 'compileall' else failing_result

    def fake_run_benchmarks(_workspace):
        # Awaitable that resolves to an empty benchmark mapping.
        return asyncio.sleep(0, result={})

    monkeypatch.setattr(evaluator, '_run_subprocess', fake_run_subprocess)
    monkeypatch.setattr(evaluator, 'run_benchmarks_async', fake_run_benchmarks)

    outcome = asyncio.run(evaluator.evaluate_async(tmp_path, benchmark_mode='never'))

    assert outcome['compile_success'] is True
    assert outcome['tests_success'] is False
    assert outcome['tests_status'] == 'failed'
    assert outcome['tests_returncode'] == 1
Loading
Loading