diff --git a/src/pytest_codingagents/__init__.py b/src/pytest_codingagents/__init__.py index 2d93271..182169f 100644 --- a/src/pytest_codingagents/__init__.py +++ b/src/pytest_codingagents/__init__.py @@ -4,7 +4,10 @@ from pytest_codingagents.copilot.agent import CopilotAgent from pytest_codingagents.copilot.agents import load_custom_agent, load_custom_agents -from pytest_codingagents.copilot.optimizer import InstructionSuggestion, optimize_instruction +from pytest_codingagents.copilot.optimizer import ( + InstructionSuggestion, + optimize_instruction, +) from pytest_codingagents.copilot.result import CopilotResult __all__ = [ diff --git a/src/pytest_codingagents/copilot/optimizer.py b/src/pytest_codingagents/copilot/optimizer.py index 1d2fb99..2c5bb20 100644 --- a/src/pytest_codingagents/copilot/optimizer.py +++ b/src/pytest_codingagents/copilot/optimizer.py @@ -4,9 +4,18 @@ between a current agent instruction and the observed behavior, and suggests a concrete improvement. -Requires ``pydantic-ai``: +Model strings follow the same ``provider/model`` format used by +``pytest-aitest`` (e.g. ``"azure/gpt-5.2-chat"``, ``"openai/gpt-4o-mini"``). +Azure Entra ID authentication is handled automatically when +``AZURE_API_BASE`` or ``AZURE_OPENAI_ENDPOINT`` is set. 
- uv add pydantic-ai +Example:: + + suggestion = await optimize_instruction( + agent.instructions or "", + result, + "Agent should add docstrings.", + ) """ from __future__ import annotations @@ -15,6 +24,9 @@ from typing import TYPE_CHECKING from pydantic import BaseModel +from pydantic_ai import Agent as PydanticAgent +from pydantic_ai.models import Model +from pytest_aitest.execution.pydantic_adapter import build_model_from_string if TYPE_CHECKING: from pytest_codingagents.copilot.result import CopilotResult @@ -70,7 +82,7 @@ async def optimize_instruction( result: CopilotResult, criterion: str, *, - model: str = "openai:gpt-4o-mini", + model: str | Model = "azure/gpt-5.2-chat", ) -> InstructionSuggestion: """Analyze a result and suggest an improved instruction. @@ -79,7 +91,11 @@ async def optimize_instruction( concrete, actionable improvement. Designed to drop into ``pytest.fail()`` so the failure message - contains a ready-to-use fix: + contains a ready-to-use fix. + + Model strings follow the same ``provider/model`` format used by + ``pytest-aitest``. Azure Entra ID auth is handled automatically + when ``AZURE_API_BASE`` or ``AZURE_OPENAI_ENDPOINT`` is set. Example:: @@ -97,24 +113,16 @@ async def optimize_instruction( result: The ``CopilotResult`` from the (failed) run. criterion: What the agent *should* have done — the test expectation in plain English (e.g. ``"Always write docstrings"``). - model: LiteLLM-style model string (e.g. ``"openai:gpt-4o-mini"`` - or ``"anthropic:claude-3-haiku-20240307"``). + model: Provider/model string (e.g. ``"azure/gpt-5.2-chat"``, + ``"openai/gpt-4o-mini"``) or a pre-configured pydantic-ai + ``Model`` object. Defaults to ``"azure/gpt-5.2-chat"``. Returns: An :class:`InstructionSuggestion` with the improved instruction. - - Raises: - ImportError: If pydantic-ai is not installed. 
""" - try: - from pydantic_ai import Agent as PydanticAgent - except ImportError as exc: - msg = ( - "pydantic-ai is required for optimize_instruction(). " - "Install it with: uv add pydantic-ai" - ) - raise ImportError(msg) from exc - + resolved_model: str | Model = ( + build_model_from_string(model) if isinstance(model, str) else model + ) final_output = result.final_response or "(no response)" tool_calls = ", ".join(sorted(result.tool_names_called)) or "none" @@ -142,7 +150,7 @@ async def optimize_instruction( that would make the agent satisfy the criterion. Keep the instruction under 200 words. Do not add unrelated rules.""" - optimizer_agent = PydanticAgent(model, output_type=_OptimizationOutput) + optimizer_agent = PydanticAgent(resolved_model, output_type=_OptimizationOutput) run_result = await optimizer_agent.run(prompt) output = run_result.output diff --git a/tests/test_optimizer_integration.py b/tests/test_optimizer_integration.py index b1b6ea7..6642053 100644 --- a/tests/test_optimizer_integration.py +++ b/tests/test_optimizer_integration.py @@ -2,9 +2,9 @@ These tests require: - GitHub Copilot credentials (for copilot_run to produce a real result) -- An LLM API key for the optimizer (OPENAI_API_KEY or configure a different model) +- AZURE_API_BASE or AZURE_OPENAI_ENDPOINT env var (for the optimizer LLM via Azure Entra ID) -Skipped automatically when the required API key is absent. +Skipped automatically when neither AZURE_API_BASE nor AZURE_OPENAI_ENDPOINT is set. 
""" from __future__ import annotations @@ -19,13 +19,13 @@ @pytest.mark.copilot class TestOptimizeInstructionIntegration: - """Integration tests for optimize_instruction() with real LLM calls.""" + """Integration tests for optimize_instruction() with real Azure LLM calls.""" @pytest.fixture(autouse=True) - def require_openai_key(self): - """Skip entire class when OPENAI_API_KEY is not set.""" - if not os.environ.get("OPENAI_API_KEY"): - pytest.skip("OPENAI_API_KEY not set — skipping optimizer integration tests") + def require_azure_endpoint(self): + """Skip entire class when neither AZURE_OPENAI_ENDPOINT nor AZURE_API_BASE is set.""" + if not os.environ.get("AZURE_OPENAI_ENDPOINT") and not os.environ.get("AZURE_API_BASE"): + pytest.skip("AZURE_OPENAI_ENDPOINT not set — skipping optimizer integration tests") async def test_returns_valid_suggestion(self, copilot_run, tmp_path): """optimize_instruction returns an InstructionSuggestion with non-empty fields.""" @@ -100,3 +100,64 @@ async def test_suggestion_is_relevant_to_criterion(self, copilot_run, tmp_path): f"Instruction: {suggestion.instruction}\n" f"Reasoning: {suggestion.reasoning}" ) + + async def test_full_optimize_loop(self, copilot_run, tmp_path): + """Full test→optimize→test loop: weak instruction fails, improved instruction passes. + + This is the hero use case: verify that optimize_instruction() produces + an instruction that actually fixes a failing criterion. + + Round 1: Run with a deliberately weak instruction (no docstring mandate). + The agent writes code but skips docstrings. + Optimize: Call optimize_instruction() with the failing criterion. + Receive a suggested instruction that mandates docstrings. + Round 2: Run again with the improved instruction. + The agent now includes docstrings — criterion passes. + """ + CRITERION = "Every function must include a Google-style docstring." + TASK = "Create calculator.py with add(a, b) and subtract(a, b) functions." 
+ + # --- Round 1: weak instruction, expect no docstrings --- + weak_agent = CopilotAgent( + name="weak-coder", + instructions="Write minimal Python code. No comments or documentation needed.", + working_directory=str(tmp_path / "round1"), + ) + (tmp_path / "round1").mkdir() + result1 = await copilot_run(weak_agent, TASK) + assert result1.success, "Round 1 Copilot run failed" + + code1 = result1.file("calculator.py") or "" + has_docstrings_round1 = '"""' in code1 or "'''" in code1 + + # --- Optimize --- + suggestion = await optimize_instruction( + weak_agent.instructions or "", + result1, + CRITERION, + ) + assert suggestion.instruction.strip(), "Optimizer returned empty instruction" + print(f"\n💡 Suggested instruction:\n{suggestion}") # visible in -s output + + # --- Round 2: improved instruction --- + improved_agent = CopilotAgent( + name="improved-coder", + instructions=suggestion.instruction, + working_directory=str(tmp_path / "round2"), + ) + (tmp_path / "round2").mkdir() + result2 = await copilot_run(improved_agent, TASK) + assert result2.success, "Round 2 Copilot run failed" + + code2 = result2.file("calculator.py") or "" + has_docstrings_round2 = '"""' in code2 or "'''" in code2 + + assert has_docstrings_round2, ( + f"Round 2 code still has no docstrings after optimization.\n" + f"Suggested instruction: {suggestion.instruction}\n" + f"Round 2 code:\n{code2}" + ) + print( + f"\n✅ Loop complete. 
" + f"Docstrings round 1: {has_docstrings_round1}, round 2: {has_docstrings_round2}" + ) diff --git a/tests/unit/test_optimizer.py b/tests/unit/test_optimizer.py index 81e797c..fd551fd 100644 --- a/tests/unit/test_optimizer.py +++ b/tests/unit/test_optimizer.py @@ -2,14 +2,16 @@ from __future__ import annotations -import sys -from unittest.mock import AsyncMock, MagicMock - -import pytest +from unittest.mock import AsyncMock, MagicMock, patch from pytest_codingagents.copilot.optimizer import InstructionSuggestion, optimize_instruction from pytest_codingagents.copilot.result import CopilotResult, ToolCall, Turn +# Patch targets +_AGENT_PATCH = "pytest_codingagents.copilot.optimizer.PydanticAgent" +_BUILD_MODEL_PATCH = "pytest_codingagents.copilot.optimizer.build_model_from_string" +_FAKE_MODEL = MagicMock(name="fake-model") + def _make_result( *, @@ -20,27 +22,17 @@ def _make_result( tool_calls = [ToolCall(name=t, arguments={}) for t in (tools or [])] return CopilotResult( success=success, - turns=[ - Turn(role="assistant", content=final_response, tool_calls=tool_calls), - ], + turns=[Turn(role="assistant", content=final_response, tool_calls=tool_calls)], ) def _make_agent_mock(instruction: str, reasoning: str, changes: str) -> MagicMock: - """Build a pydantic-ai Agent mock that returns a structured suggestion.""" - output = MagicMock() - output.instruction = instruction - output.reasoning = reasoning - output.changes = changes - - run_result = MagicMock() - run_result.output = output - + """Return a MagicMock that behaves like pydantic-ai Agent class.""" + output = MagicMock(instruction=instruction, reasoning=reasoning, changes=changes) + run_result = MagicMock(output=output) agent_instance = MagicMock() agent_instance.run = AsyncMock(return_value=run_result) - - agent_class = MagicMock(return_value=agent_instance) - return agent_class + return MagicMock(return_value=agent_instance) class TestInstructionSuggestion: @@ -55,27 +47,17 @@ def 
test_str_contains_instruction(self): assert "Always add docstrings." in str(s) def test_str_contains_reasoning(self): - s = InstructionSuggestion( - instruction="inst", - reasoning="because reasons", - changes="changed x", - ) + s = InstructionSuggestion(instruction="inst", reasoning="because reasons", changes="x") assert "because reasons" in str(s) def test_str_contains_changes(self): s = InstructionSuggestion( - instruction="inst", - reasoning="reason", - changes="Added docstring mandate.", + instruction="inst", reasoning="reason", changes="Added docstring mandate." ) assert "Added docstring mandate." in str(s) def test_fields_accessible(self): - s = InstructionSuggestion( - instruction="inst", - reasoning="reason", - changes="changes", - ) + s = InstructionSuggestion(instruction="inst", reasoning="reason", changes="changes") assert s.instruction == "inst" assert s.reasoning == "reason" assert s.changes == "changes" @@ -85,151 +67,98 @@ class TestOptimizeInstruction: """Tests for optimize_instruction().""" async def test_returns_instruction_suggestion(self): - """optimize_instruction returns an InstructionSuggestion.""" agent_class = _make_agent_mock( instruction="Always add Google-style docstrings.", reasoning="The original instruction omits documentation.", changes="Added docstring mandate.", ) - - # patch pydantic_ai.Agent in the module where it's imported - sys.modules["pydantic_ai"].Agent = agent_class # type: ignore[attr-defined] - - result = await optimize_instruction( - "Write Python code.", - _make_result(), - "Agent should add docstrings.", - ) - + with patch(_BUILD_MODEL_PATCH, return_value=_FAKE_MODEL), patch(_AGENT_PATCH, agent_class): + result = await optimize_instruction( + "Write Python code.", _make_result(), "Agent should add docstrings." + ) assert isinstance(result, InstructionSuggestion) assert result.instruction == "Always add Google-style docstrings." assert result.reasoning == "The original instruction omits documentation." 
assert result.changes == "Added docstring mandate." async def test_uses_default_model(self): - """optimize_instruction defaults to openai:gpt-4o-mini.""" + """optimize_instruction defaults to azure/gpt-5.2-chat.""" agent_class = _make_agent_mock("inst", "reason", "changes") - sys.modules["pydantic_ai"].Agent = agent_class # type: ignore[attr-defined] - - await optimize_instruction("inst", _make_result(), "criterion") - - agent_class.assert_called_once() - assert agent_class.call_args[0][0] == "openai:gpt-4o-mini" - - async def test_accepts_custom_model(self): - """optimize_instruction accepts a custom model string.""" + with ( + patch(_BUILD_MODEL_PATCH, return_value=_FAKE_MODEL) as mock_build, + patch(_AGENT_PATCH, agent_class), + ): + await optimize_instruction("inst", _make_result(), "criterion") + mock_build.assert_called_once_with("azure/gpt-5.2-chat") + assert agent_class.call_args[0][0] is _FAKE_MODEL + + async def test_accepts_custom_model_string(self): + """optimize_instruction passes model string through build_model_from_string.""" agent_class = _make_agent_mock("inst", "reason", "changes") - sys.modules["pydantic_ai"].Agent = agent_class # type: ignore[attr-defined] - - await optimize_instruction( - "inst", - _make_result(), - "criterion", - model="anthropic:claude-3-haiku-20240307", - ) - - assert agent_class.call_args[0][0] == "anthropic:claude-3-haiku-20240307" + with ( + patch(_BUILD_MODEL_PATCH, return_value=_FAKE_MODEL) as mock_build, + patch(_AGENT_PATCH, agent_class), + ): + await optimize_instruction( + "inst", _make_result(), "criterion", model="openai/gpt-4o-mini" + ) + mock_build.assert_called_once_with("openai/gpt-4o-mini") + + async def test_accepts_model_object(self): + """optimize_instruction skips build_model_from_string for a pre-built Model object.""" + agent_class = _make_agent_mock("inst", "reason", "changes") + fake_model = MagicMock() + with patch(_BUILD_MODEL_PATCH) as mock_build, patch(_AGENT_PATCH, agent_class): + await 
optimize_instruction("inst", _make_result(), "criterion", model=fake_model) + mock_build.assert_not_called() + assert agent_class.call_args[0][0] is fake_model async def test_includes_criterion_in_prompt(self): - """The LLM prompt includes the criterion text.""" agent_class = _make_agent_mock("improved", "reason", "change") agent_instance = agent_class.return_value - sys.modules["pydantic_ai"].Agent = agent_class # type: ignore[attr-defined] - - await optimize_instruction( - "Write code.", - _make_result(), - "Agent must use type hints on all functions.", - ) - - prompt = agent_instance.run.call_args[0][0] - assert "type hints" in prompt + with patch(_BUILD_MODEL_PATCH, return_value=_FAKE_MODEL), patch(_AGENT_PATCH, agent_class): + await optimize_instruction( + "Write code.", _make_result(), "Agent must use type hints on all functions." + ) + assert "type hints" in agent_instance.run.call_args[0][0] async def test_includes_current_instruction_in_prompt(self): - """The LLM prompt contains the current instruction.""" agent_class = _make_agent_mock("inst", "reason", "changes") agent_instance = agent_class.return_value - sys.modules["pydantic_ai"].Agent = agent_class # type: ignore[attr-defined] - - await optimize_instruction( - "Always use FastAPI for web APIs.", - _make_result(), - "criterion", - ) - - prompt = agent_instance.run.call_args[0][0] - assert "FastAPI" in prompt + with patch(_BUILD_MODEL_PATCH, return_value=_FAKE_MODEL), patch(_AGENT_PATCH, agent_class): + await optimize_instruction( + "Always use FastAPI for web APIs.", _make_result(), "criterion" + ) + assert "FastAPI" in agent_instance.run.call_args[0][0] async def test_includes_agent_output_in_prompt(self): - """The LLM prompt contains the agent's final response.""" agent_class = _make_agent_mock("inst", "reason", "changes") agent_instance = agent_class.return_value - sys.modules["pydantic_ai"].Agent = agent_class # type: ignore[attr-defined] - - result = _make_result(final_response="def add(a, b): 
return a + b") - await optimize_instruction("inst", result, "criterion") - - prompt = agent_instance.run.call_args[0][0] - assert "def add" in prompt + with patch(_BUILD_MODEL_PATCH, return_value=_FAKE_MODEL), patch(_AGENT_PATCH, agent_class): + await optimize_instruction( + "inst", _make_result(final_response="def add(a, b): return a + b"), "criterion" + ) + assert "def add" in agent_instance.run.call_args[0][0] async def test_handles_no_final_response(self): - """optimize_instruction handles results with no turns gracefully.""" agent_class = _make_agent_mock("inst", "reason", "changes") - sys.modules["pydantic_ai"].Agent = agent_class # type: ignore[attr-defined] - - empty_result = CopilotResult(success=False, turns=[]) - result = await optimize_instruction("inst", empty_result, "criterion") - + with patch(_BUILD_MODEL_PATCH, return_value=_FAKE_MODEL), patch(_AGENT_PATCH, agent_class): + result = await optimize_instruction( + "inst", CopilotResult(success=False, turns=[]), "criterion" + ) assert isinstance(result, InstructionSuggestion) async def test_handles_empty_instruction(self): - """optimize_instruction handles empty current instruction.""" agent_class = _make_agent_mock("new inst", "reason", "changes") - sys.modules["pydantic_ai"].Agent = agent_class # type: ignore[attr-defined] - - result = await optimize_instruction("", _make_result(), "criterion") + with patch(_BUILD_MODEL_PATCH, return_value=_FAKE_MODEL), patch(_AGENT_PATCH, agent_class): + result = await optimize_instruction("", _make_result(), "criterion") assert isinstance(result, InstructionSuggestion) async def test_includes_tool_calls_in_prompt(self): - """The LLM prompt includes tool call information.""" agent_class = _make_agent_mock("inst", "reason", "changes") agent_instance = agent_class.return_value - sys.modules["pydantic_ai"].Agent = agent_class # type: ignore[attr-defined] - - result = _make_result(tools=["create_file", "read_file"]) - await optimize_instruction("inst", result, 
"criterion") - - prompt = agent_instance.run.call_args[0][0] - assert "create_file" in prompt - - -class TestOptimizeInstructionImportError: - """Test ImportError when pydantic-ai is not installed.""" - - async def test_raises_import_error_when_pydantic_ai_missing(self): - """optimize_instruction raises ImportError if pydantic-ai not installed.""" - saved = sys.modules.get("pydantic_ai") - try: - sys.modules["pydantic_ai"] = None # type: ignore - - with pytest.raises(ImportError, match="pydantic-ai"): - await optimize_instruction("inst", _make_result(), "criterion") - finally: - if saved is not None: - sys.modules["pydantic_ai"] = saved - else: - del sys.modules["pydantic_ai"] - - async def test_import_error_includes_install_hint(self): - """ImportError message includes the uv add install hint.""" - saved = sys.modules.get("pydantic_ai") - try: - sys.modules["pydantic_ai"] = None # type: ignore - - with pytest.raises(ImportError, match="uv add pydantic-ai"): - await optimize_instruction("inst", _make_result(), "criterion") - finally: - if saved is not None: - sys.modules["pydantic_ai"] = saved - else: - del sys.modules["pydantic_ai"] + with patch(_BUILD_MODEL_PATCH, return_value=_FAKE_MODEL), patch(_AGENT_PATCH, agent_class): + await optimize_instruction( + "inst", _make_result(tools=["create_file", "read_file"]), "criterion" + ) + assert "create_file" in agent_instance.run.call_args[0][0]