From 2b535a397d90eb1479522b40e87d67f7bf1b0e69 Mon Sep 17 00:00:00 2001 From: "Arty S." <248714260+arty-kk@users.noreply.github.com> Date: Tue, 31 Mar 2026 12:08:37 +0300 Subject: [PATCH] Avoid mutating cached metrics during malformed payload handling --- docs/prompts.md | 11 ++ src/core/evaluator.py | 8 +- src/core/experiment_manager.py | 33 ++++ src/core/llm.py | 2 +- tests/test_evaluator.py | 46 ++++++ tests/test_experiment_manager.py | 266 +++++++++++++++++++++++++++++++ tests/test_llm.py | 19 +++ tests/test_selector.py | 28 ++++ tests/test_spiral_engine.py | 224 ++++++++++++++++++++++++++ 9 files changed, 630 insertions(+), 7 deletions(-) create mode 100644 tests/test_evaluator.py create mode 100644 tests/test_experiment_manager.py create mode 100644 tests/test_selector.py create mode 100644 tests/test_spiral_engine.py diff --git a/docs/prompts.md b/docs/prompts.md index bca4b1b..720bd37 100644 --- a/docs/prompts.md +++ b/docs/prompts.md @@ -28,3 +28,14 @@ Prompts should request structured JSON for: - evaluations, - reflections, - code-change proposals. + +## Supported runtime tasks + +`LLMOrchestrator` currently supports exactly these task keys: + +- `plan` +- `evaluate` +- `reflect` +- `code_changes` + +Each task must map to an explicit runtime path and fallback payload. diff --git a/src/core/evaluator.py b/src/core/evaluator.py index 105ab0e..e430fbd 100644 --- a/src/core/evaluator.py +++ b/src/core/evaluator.py @@ -91,12 +91,8 @@ async def evaluate_async( tests_cmd = [ sys.executable, "-m", - "unittest", - "discover", - "-s", - str(workspace_path / "tests"), - "-t", - str(workspace_path), + "pytest", + "-q", ] ( tests_returncode, diff --git a/src/core/experiment_manager.py b/src/core/experiment_manager.py index 2ca231a..3a54a24 100644 --- a/src/core/experiment_manager.py +++ b/src/core/experiment_manager.py @@ -81,6 +81,7 @@ def _normalize_code_changes_paths(code_changes: List[CodeChange], repo_root: Pat _DEFAULT_EVALUATOR_REQUIRED_PATHS = ("src", "tests") +_REQUIRED_EVALUATOR_METRICS_FIELDS = ("compile_success", "tests_success", "tests_skipped") def _materialization_paths_for_candidate( @@ -121,6 +122,17 @@ def _candidate_paths(code_changes: List[CodeChange], repo_root: Path) -> list[st return candidate_paths +def _validate_evaluator_payload(payload: Dict[str, Any]) -> str | None: + missing_fields = [ + field_name + for field_name in _REQUIRED_EVALUATOR_METRICS_FIELDS + if field_name not in payload + ] + if missing_fields: + return f"missing_required_metrics:{','.join(missing_fields)}" + return None + + @dataclass class ExperimentManager: repo_root: Path = REPO_ROOT @@ -186,6 +198,17 @@ async def _evaluate_candidate( and cached.get("repo_hash") == repo_hash and cached.get("code_hash") == code_hash ): + validation_error = _validate_evaluator_payload(metrics) + if validation_error: + malformed_metrics = dict(metrics) + malformed_metrics.setdefault("reason", validation_error) + return candidate.id, { + "metrics": malformed_metrics, + "accepted": False, + "reason": "malformed_metrics", + "cached": True, + "error": validation_error, + }, None, None accepted, reason = should_accept(metrics, baseline_metrics) return candidate.id, { "metrics": metrics, @@ -251,6 +274,16 @@ async def _evaluate_candidate( }, None, None payload.setdefault("duration_sec", elapsed) payload.setdefault("timed_out", False) + validation_error = _validate_evaluator_payload(payload) + if validation_error: + payload.setdefault("reason", validation_error) + return candidate.id, { + "metrics": payload, + "accepted": False, + "reason": "malformed_metrics", + "cached": False, + "error": validation_error, + }, None, None accepted, reason = should_accept(payload, baseline_metrics) return candidate.id, { "metrics": payload, diff --git a/src/core/llm.py b/src/core/llm.py index fa7095b..21da199 100644 --- a/src/core/llm.py +++ b/src/core/llm.py @@ -24,7 +24,7 @@ class LLMOrchestrator: memory_key_requests: str = 'llm_requests' memory_key_responses: str = 'llm_responses' memory_key_status: str = 'llm_status' - supported_tasks: List[str] = field(default_factory=lambda: ['plan', 'evaluate', 'reflect', 'self_evolve', 'code_changes']) + supported_tasks: List[str] = field(default_factory=lambda: ['plan', 'evaluate', 'reflect', 'code_changes']) _client: OpenAIClient | None = field(default=None, init=False, repr=False) _lifecycle_lock: asyncio.Lock = field(default_factory=asyncio.Lock, init=False, repr=False) diff --git a/tests/test_evaluator.py b/tests/test_evaluator.py new file mode 100644 index 0000000..59f7565 --- /dev/null +++ b/tests/test_evaluator.py @@ -0,0 +1,46 @@ +# SPDX-FileCopyrightText: 2026 Сацук Артём Венедиктович (Satsuk Artem) +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +import asyncio +from pathlib import Path + +from sif.core import evaluator + + +def test_evaluator_uses_pytest_and_reports_success(monkeypatch, tmp_path: Path) -> None: + commands: list[list[str]] = [] + + async def fake_run_subprocess(cmd: list[str], *, timeout_s: float, cwd=None, env=None): + commands.append(cmd) + if cmd[2] == 'compileall': + return 0, 'compile ok', '', False + return 0, 'tests ok', '', False + + monkeypatch.setattr(evaluator, '_run_subprocess', fake_run_subprocess) + monkeypatch.setattr(evaluator, 'run_benchmarks_async', lambda _workspace: asyncio.sleep(0, result={})) + + result = asyncio.run(evaluator.evaluate_async(tmp_path, benchmark_mode='never')) + + assert result['compile_success'] is True + assert result['tests_success'] is True + assert result['tests_status'] == 'passed' + assert commands[1][0:4] == [commands[1][0], '-m', 'pytest', '-q'] + + +def test_evaluator_marks_test_failure_when_pytest_exits_nonzero(monkeypatch, tmp_path: Path) -> None: + async def fake_run_subprocess(cmd: list[str], *, timeout_s: float, cwd=None, env=None): + if cmd[2] == 'compileall': + return 0, 'compile ok', '', False + return 1, '', 'tests failed', False + + monkeypatch.setattr(evaluator, '_run_subprocess', fake_run_subprocess) + monkeypatch.setattr(evaluator, 'run_benchmarks_async', lambda _workspace: asyncio.sleep(0, result={})) + + result = asyncio.run(evaluator.evaluate_async(tmp_path, benchmark_mode='never')) + + assert result['compile_success'] is True + assert result['tests_success'] is False + assert result['tests_status'] == 'failed' + assert result['tests_returncode'] == 1 diff --git a/tests/test_experiment_manager.py b/tests/test_experiment_manager.py new file mode 100644 index 0000000..8564d2f --- /dev/null +++ b/tests/test_experiment_manager.py @@ -0,0 +1,266 @@ +# SPDX-FileCopyrightText: 2026 Сацук Артём Венедиктович (Satsuk Artem) +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +import asyncio +from pathlib import Path + +from sif.core.candidates import Candidate +from sif.core.evolution import CodeChange +from sif.core.experiment_manager import ExperimentManager +import sif.core.experiment_manager as experiment_manager_module + + +class _InMemoryCacheStore: + _data: dict[str, object] = {} + + def __init__(self, _cache_path) -> None: + self._namespace = str(_cache_path) + + async def start(self) -> None: + return None + + async def stop(self) -> None: + return None + + async def get(self, key: str): + return self._data.get(f'{self._namespace}:{key}') + + async def put_many(self, payload: dict[str, object]) -> None: + for key, value in payload.items(): + self._data[f'{self._namespace}:{key}'] = value + + +def _patch_cache_store(monkeypatch) -> None: + _InMemoryCacheStore._data.clear() + monkeypatch.setattr(experiment_manager_module, 'AsyncCacheStore', _InMemoryCacheStore) + + +async def _passing_evaluator(_workspace: Path) -> dict[str, object]: + return { + 'compile_success': True, + 'tests_success': True, + 'tests_skipped': False, + 'duration_sec': 0.01, + } + + +def _prepare_repo(repo_root: Path) -> None: + (repo_root / 'src' / 'components').mkdir(parents=True) + (repo_root / 'tests').mkdir(parents=True) + (repo_root / 'src' / 'components' / '__init__.py').write_text('', encoding='utf-8') + (repo_root / 'tests' / 'test_smoke.py').write_text('def test_smoke():\n assert True\n', encoding='utf-8') + + +def test_experiment_manager_accepts_candidate_with_valid_metrics(monkeypatch, tmp_path: Path) -> None: + _patch_cache_store(monkeypatch) + repo_root = tmp_path / 'repo' + repo_root.mkdir() + _prepare_repo(repo_root) + + manager = ExperimentManager( + repo_root=repo_root, + evaluator=_passing_evaluator, + cache_path=repo_root / '.sif' / 'cache' / 'evals.json', + ) + candidate = Candidate( + id='accepted', + source='test', + code_changes=[CodeChange(path='src/components/generated_candidate.py', content='x = 1\n')], + ) + + best_candidate, results = asyncio.run(manager.run_async([candidate])) + + assert best_candidate is not None + assert best_candidate.id == 'accepted' + assert results['accepted']['accepted'] is True + assert results['accepted']['reason'] == 'accepted' + + +def test_experiment_manager_rejects_malformed_metrics_payload(monkeypatch, tmp_path: Path) -> None: + _patch_cache_store(monkeypatch) + repo_root = tmp_path / 'repo' + repo_root.mkdir() + _prepare_repo(repo_root) + + async def malformed_evaluator(_workspace: Path) -> dict[str, object]: + return {'compile_success': True, 'duration_sec': 0.02} + + manager = ExperimentManager( + repo_root=repo_root, + evaluator=malformed_evaluator, + cache_path=repo_root / '.sif' / 'cache' / 'evals.json', + ) + candidate = Candidate( + id='malformed', + source='test', + code_changes=[CodeChange(path='src/components/generated_candidate.py', content='x = 2\n')], + ) + + best_candidate, results = asyncio.run(manager.run_async([candidate])) + + assert best_candidate is None + assert results['malformed']['accepted'] is False + assert results['malformed']['reason'] == 'malformed_metrics' + assert 'missing_required_metrics' in results['malformed']['error'] + + +def test_experiment_manager_reports_blocked_timeout_and_cached_outcomes(monkeypatch, tmp_path: Path) -> None: + _patch_cache_store(monkeypatch) + repo_root = tmp_path / 'repo' + repo_root.mkdir() + _prepare_repo(repo_root) + cache_path = repo_root / '.sif' / 'cache' / 'evals.json' + + async def dynamic_evaluator(workspace: Path) -> dict[str, object]: + marker = workspace / 'src' / 'components' / 'mode.txt' + mode = marker.read_text(encoding='utf-8').strip() if marker.exists() else 'accepted' + if mode == 'timeout': + await asyncio.sleep(0.05) + return { + 'compile_success': True, + 'tests_success': mode != 'reject', + 'tests_skipped': False, + 'duration_sec': 0.01, + } + + manager = ExperimentManager( + repo_root=repo_root, + evaluator=dynamic_evaluator, + cache_path=cache_path, + timeout_per_candidate=0.01, + ) + + accepted = Candidate( + id='accepted', + source='test', + code_changes=[CodeChange(path='src/components/mode.txt', content='accepted\n')], + ) + blocked = Candidate( + id='blocked', + source='test', + code_changes=[CodeChange(path='src/core/blocked.py', content='x = 1\n')], + ) + timeout = Candidate( + id='timeout', + source='test', + code_changes=[CodeChange(path='src/components/mode.txt', content='timeout\n')], + ) + + best_candidate, results = asyncio.run(manager.run_async([accepted, blocked, timeout])) + + assert best_candidate is not None + assert best_candidate.id == 'accepted' + assert results['accepted']['reason'] == 'accepted' + assert results['blocked']['reason'] == 'partial_application_blocked' + assert results['timeout']['reason'] == 'timeout' + + cache_manager = ExperimentManager( + repo_root=repo_root, + evaluator=dynamic_evaluator, + cache_path=cache_path, + ) + cached_best, cached_results = asyncio.run(cache_manager.run_async([accepted])) + + assert cached_best is not None + assert cached_best.id == 'accepted' + assert cached_results['accepted']['accepted'] is True + assert cached_results['accepted']['cached'] is True + + +def test_experiment_manager_reports_no_changes_applied_reason(monkeypatch, tmp_path: Path) -> None: + _patch_cache_store(monkeypatch) + repo_root = tmp_path / 'repo' + repo_root.mkdir() + _prepare_repo(repo_root) + existing_path = repo_root / 'src' / 'components' / 'existing.py' + existing_path.write_text('x = 1\n', encoding='utf-8') + + manager = ExperimentManager( + repo_root=repo_root, + evaluator=_passing_evaluator, + cache_path=repo_root / '.sif' / 'cache' / 'evals.json', + ) + candidate = Candidate( + id='no-op', + source='test', + code_changes=[CodeChange(path='src/components/existing.py', content='x = 1\n')], + ) + + best_candidate, results = asyncio.run(manager.run_async([candidate])) + + assert best_candidate is None + assert results['no-op']['accepted'] is False + assert results['no-op']['reason'] == 'no_changes_applied' + + +def test_experiment_manager_reports_evaluation_failed_for_non_dict_payload(monkeypatch, tmp_path: Path) -> None: + _patch_cache_store(monkeypatch) + repo_root = tmp_path / 'repo' + repo_root.mkdir() + _prepare_repo(repo_root) + + async def invalid_payload_evaluator(_workspace: Path): + return ['not', 'a', 'dict'] + + manager = ExperimentManager( + repo_root=repo_root, + evaluator=invalid_payload_evaluator, + cache_path=repo_root / '.sif' / 'cache' / 'evals.json', + ) + candidate = Candidate( + id='invalid-payload', + source='test', + code_changes=[CodeChange(path='src/components/generated_candidate.py', content='x = 3\n')], + ) + + best_candidate, results = asyncio.run(manager.run_async([candidate])) + + assert best_candidate is None + assert results['invalid-payload']['accepted'] is False + assert results['invalid-payload']['reason'] == 'evaluation_failed' + + +def test_experiment_manager_rejects_cached_malformed_metrics(monkeypatch, tmp_path: Path) -> None: + _patch_cache_store(monkeypatch) + repo_root = tmp_path / 'repo' + repo_root.mkdir() + _prepare_repo(repo_root) + cache_path = repo_root / '.sif' / 'cache' / 'evals.json' + + manager = ExperimentManager( + repo_root=repo_root, + evaluator=_passing_evaluator, + cache_path=cache_path, + ) + candidate = Candidate( + id='cached-malformed', + source='test', + code_changes=[CodeChange(path='src/components/generated_candidate.py', content='x = 4\n')], + ) + + _, first_results = asyncio.run(manager.run_async([candidate])) + assert first_results['cached-malformed']['accepted'] is True + + cache_key = next( + key for key in _InMemoryCacheStore._data if ':cached-malformed:' in key + ) + cached_entry = _InMemoryCacheStore._data[cache_key] + assert isinstance(cached_entry, dict) + metrics = cached_entry.get('metrics') + assert isinstance(metrics, dict) + metrics.pop('tests_skipped', None) + + second_manager = ExperimentManager( + repo_root=repo_root, + evaluator=_passing_evaluator, + cache_path=cache_path, + ) + best_candidate, second_results = asyncio.run(second_manager.run_async([candidate])) + + assert best_candidate is None + assert second_results['cached-malformed']['accepted'] is False + assert second_results['cached-malformed']['reason'] == 'malformed_metrics' + assert second_results['cached-malformed']['cached'] is True + assert 'reason' not in metrics diff --git a/tests/test_llm.py b/tests/test_llm.py index e737638..016a26a 100644 --- a/tests/test_llm.py +++ b/tests/test_llm.py @@ -1,8 +1,12 @@ # SPDX-FileCopyrightText: 2026 Сацук Артём Венедиктович (Satsuk Artem) # SPDX-License-Identifier: Apache-2.0 +import inspect +import re + from sif.core.kernel import Kernel, KernelState from sif.core.llm import LLMOrchestrator +from sif.core.spiral_engine import SpiralEngine def test_llm_directive_is_grounded() -> None: @@ -20,3 +24,18 @@ def test_llm_fallback_plan_and_response_storage() -> None: assert isinstance(response['plan'], list) llm._store_response(kernel, 'plan', response) assert llm.load_response(kernel, 'plan') == response + + +def test_llm_supported_tasks_match_fallback_handlers() -> None: + llm = LLMOrchestrator() + assert 'self_evolve' not in llm.supported_tasks + for task in llm.supported_tasks: + payload = llm.build_fallback(task, {}) + assert isinstance(payload, dict) + + +def test_llm_supported_tasks_have_spiral_engine_call_sites() -> None: + llm = LLMOrchestrator() + source = inspect.getsource(SpiralEngine) + used_tasks = set(re.findall(r'_run_llm_request_limited\("([a-z_]+)"', source)) + assert set(llm.supported_tasks) == used_tasks diff --git a/tests/test_selector.py b/tests/test_selector.py new file mode 100644 index 0000000..6ee4680 --- /dev/null +++ b/tests/test_selector.py @@ -0,0 +1,28 @@ +# SPDX-FileCopyrightText: 2026 Сацук Артём Венедиктович (Satsuk Artem) +# SPDX-License-Identifier: Apache-2.0 + +from sif.core.selector import should_accept + + +def test_should_accept_accepts_when_compile_and_tests_pass() -> None: + accepted, reason = should_accept( + { + 'compile_success': True, + 'tests_success': True, + 'tests_skipped': False, + } + ) + assert accepted is True + assert reason == 'accepted' + + +def test_should_accept_rejects_when_tests_skipped() -> None: + accepted, reason = should_accept( + { + 'compile_success': True, + 'tests_success': True, + 'tests_skipped': True, + } + ) + assert accepted is False + assert reason == 'tests_skipped' diff --git a/tests/test_spiral_engine.py b/tests/test_spiral_engine.py new file mode 100644 index 0000000..c64ea1a --- /dev/null +++ b/tests/test_spiral_engine.py @@ -0,0 +1,224 @@ +# SPDX-FileCopyrightText: 2026 Сацук Артём Венедиктович (Satsuk Artem) +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +import asyncio +import json +from pathlib import Path + +from sif.core.evolution import CodeApplicationResult, CodeChange +from sif.core.kernel import Kernel, KernelState +from sif.core.spiral_engine import SpiralEngine +import sif.core.spiral_engine as spiral_engine + + +class _AcceptedManager: + def __init__(self, *args, **kwargs) -> None: + _ = args, kwargs + + async def run_async(self, candidates, baseline_metrics=None): + _ = baseline_metrics + candidate = candidates[0] + return candidate, { + candidate.id: { + 'metrics': { + 'compile_success': True, + 'tests_success': True, + 'tests_skipped': False, + 'duration_sec': 0.01, + }, + 'accepted': True, + 'reason': 'accepted', + } + } + + +def _build_engine() -> SpiralEngine: + kernel = Kernel(state=KernelState(goals=['g'], constraints=['c'], memory={'cycle_index': '1'})) + return SpiralEngine(kernel=kernel) + + +def test_evolve_rolls_back_to_lkg_on_post_apply_degradation(monkeypatch, tmp_path: Path) -> None: + engine = _build_engine() + engine.kernel.update_memory('lkg_version_id', 'lkg-1') + + async def fake_create_version_async(*args, **kwargs): + _ = args, kwargs + return 'pre-1' + + async def fake_apply_code_changes_to_root_async(*args, **kwargs): + _ = args, kwargs + return CodeApplicationResult( + applied_changes=[CodeChange(path='src/components/generated.py', content='x = 1\n')], + blocked_changes=[], + ) + + async def fake_evaluate(*args, **kwargs): + _ = args, kwargs + return {'compile_success': False, 'tests_success': False, 'tests_skipped': False} + + async def fake_restore_version_async(version_id: str, mode: str = 'soft') -> bool: + _ = mode + return version_id == 'lkg-1' + + async def fake_latest_version_async() -> str | None: + return 'fallback-1' + + async def fake_exists(path: Path) -> bool: + return path.name == 'lkg-1' + + async def fake_load_version_paths_async(_version_id: str) -> list[str]: + return ['src/components/generated.py'] + + async def fake_append_event(*args, **kwargs): + _ = args, kwargs + return None + + monkeypatch.setattr(spiral_engine, 'ExperimentManager', _AcceptedManager) + monkeypatch.setattr(spiral_engine, 'create_version_async', fake_create_version_async) + monkeypatch.setattr(spiral_engine, 'apply_code_changes_to_root_async', fake_apply_code_changes_to_root_async) + monkeypatch.setattr(spiral_engine, 'evaluate', fake_evaluate) + monkeypatch.setattr(spiral_engine, 'restore_version_async', fake_restore_version_async) + monkeypatch.setattr(spiral_engine, 'latest_version_async', fake_latest_version_async) + monkeypatch.setattr(spiral_engine.async_fs, 'exists', fake_exists) + monkeypatch.setattr(spiral_engine, 'append_event', fake_append_event) + monkeypatch.setattr(engine, '_load_version_paths_async', fake_load_version_paths_async) + monkeypatch.setattr(spiral_engine, 'REPO_ROOT', tmp_path) + + _, applied = asyncio.run( + engine.evolve( + evaluation={'alignment': 'ok'}, + updates=[], + code_changes=[CodeChange(path='src/components/generated.py', content='x = 1\n')], + ) + ) + + assert applied == [] + assert engine.kernel.state.memory['rollback_triggered'] == 'true' + assert engine.kernel.state.memory['rollback_reason'] == 'post_apply_compile_failed' + assert engine.kernel.state.memory['rollback_version_id'] == 'lkg-1' + rollback_info = json.loads(engine.kernel.state.memory['rollback_info']) + assert rollback_info['restore_success'] is True + assert rollback_info['restored_version_id'] == 'lkg-1' + + +def test_evolve_uses_latest_version_fallback_when_lkg_missing(monkeypatch, tmp_path: Path) -> None: + engine = _build_engine() + engine.kernel.update_memory('lkg_version_id', 'lkg-missing') + + async def fake_create_version_async(*args, **kwargs): + _ = args, kwargs + return 'pre-2' + + async def fake_apply_code_changes_to_root_async(*args, **kwargs): + _ = args, kwargs + return CodeApplicationResult( + applied_changes=[CodeChange(path='src/components/generated.py', content='x = 2\n')], + blocked_changes=[], + ) + + async def fake_evaluate(*args, **kwargs): + _ = args, kwargs + return {'compile_success': True, 'tests_success': False, 'tests_skipped': False} + + async def fake_restore_version_async(version_id: str, mode: str = 'soft') -> bool: + _ = mode + return version_id == 'fallback-2' + + async def fake_latest_version_async() -> str | None: + return 'fallback-2' + + async def fake_exists(_path: Path) -> bool: + return False + + async def fake_load_version_paths_async(_version_id: str) -> list[str]: + return ['src/components/generated.py'] + + async def fake_append_event(*args, **kwargs): + _ = args, kwargs + return None + + monkeypatch.setattr(spiral_engine, 'ExperimentManager', _AcceptedManager) + monkeypatch.setattr(spiral_engine, 'create_version_async', fake_create_version_async) + monkeypatch.setattr(spiral_engine, 'apply_code_changes_to_root_async', fake_apply_code_changes_to_root_async) + monkeypatch.setattr(spiral_engine, 'evaluate', fake_evaluate) + monkeypatch.setattr(spiral_engine, 'restore_version_async', fake_restore_version_async) + monkeypatch.setattr(spiral_engine, 'latest_version_async', fake_latest_version_async) + monkeypatch.setattr(spiral_engine.async_fs, 'exists', fake_exists) + monkeypatch.setattr(spiral_engine, 'append_event', fake_append_event) + monkeypatch.setattr(engine, '_load_version_paths_async', fake_load_version_paths_async) + monkeypatch.setattr(spiral_engine, 'REPO_ROOT', tmp_path) + + _, applied = asyncio.run( + engine.evolve( + evaluation={'alignment': 'ok'}, + updates=[], + code_changes=[CodeChange(path='src/components/generated.py', content='x = 2\n')], + ) + ) + + assert applied == [] + assert engine.kernel.state.memory['rollback_triggered'] == 'true' + assert engine.kernel.state.memory['rollback_version_id'] == 'fallback-2' + assert engine.kernel.state.memory['lkg_version_fallback'] == 'latest_version' + rollback_info = json.loads(engine.kernel.state.memory['rollback_info']) + assert rollback_info['fallback_restore_attempted'] is True + assert rollback_info['fallback_restore_ok'] is True + + +def test_evolve_sets_rollback_failed_when_restore_paths_unavailable(monkeypatch, tmp_path: Path) -> None: + engine = _build_engine() + engine.kernel.update_memory('lkg_version_id', 'lkg-missing') + + async def fake_create_version_async(*args, **kwargs): + _ = args, kwargs + return 'pre-3' + + async def fake_apply_code_changes_to_root_async(*args, **kwargs): + _ = args, kwargs + return CodeApplicationResult( + applied_changes=[CodeChange(path='src/components/generated.py', content='x = 4\n')], + blocked_changes=[], + ) + + async def fake_evaluate(*args, **kwargs): + _ = args, kwargs + return {'compile_success': False, 'tests_success': False, 'tests_skipped': False} + + async def fake_restore_version_async(version_id: str, mode: str = 'soft') -> bool: + _ = version_id, mode + return False + + async def fake_latest_version_async() -> str | None: + return None + + async def fake_exists(_path: Path) -> bool: + return False + + async def fake_append_event(*args, **kwargs): + _ = args, kwargs + return None + + monkeypatch.setattr(spiral_engine, 'ExperimentManager', _AcceptedManager) + monkeypatch.setattr(spiral_engine, 'create_version_async', fake_create_version_async) + monkeypatch.setattr(spiral_engine, 'apply_code_changes_to_root_async', fake_apply_code_changes_to_root_async) + monkeypatch.setattr(spiral_engine, 'evaluate', fake_evaluate) + monkeypatch.setattr(spiral_engine, 'restore_version_async', fake_restore_version_async) + monkeypatch.setattr(spiral_engine, 'latest_version_async', fake_latest_version_async) + monkeypatch.setattr(spiral_engine.async_fs, 'exists', fake_exists) + monkeypatch.setattr(spiral_engine, 'append_event', fake_append_event) + monkeypatch.setattr(spiral_engine, 'REPO_ROOT', tmp_path) + + _, applied = asyncio.run( + engine.evolve( + evaluation={'alignment': 'ok'}, + updates=[], + code_changes=[CodeChange(path='src/components/generated.py', content='x = 4\n')], + ) + ) + + assert applied != [] + assert engine.kernel.state.memory['rollback_triggered'] == 'true' + assert engine.kernel.state.memory['rollback_failed'] == 'true' + assert engine.kernel.state.memory['rollback_failure_reason'] == 'lkg_version_not_found_and_fallback_unavailable'