sbroenne · sbroenne · Feb 19, 2026 · Feb 19, 2026 · Feb 19, 2026 · Feb 19, 2026
@@ -4,7 +4,10 @@
 
 from pytest_codingagents.copilot.agent import CopilotAgent
 from pytest_codingagents.copilot.agents import load_custom_agent, load_custom_agents
-from pytest_codingagents.copilot.optimizer import InstructionSuggestion, optimize_instruction
+from pytest_codingagents.copilot.optimizer import (
+    InstructionSuggestion,
+    optimize_instruction,
+)
 from pytest_codingagents.copilot.result import CopilotResult
 
 __all__ = [

@@ -4,9 +4,18 @@
 between a current agent instruction and the observed behavior, and suggests a
 concrete improvement.
 
-Requires ``pydantic-ai``:
+Model strings follow the same ``provider/model`` format used by
+``pytest-aitest`` (e.g. ``"azure/gpt-5.2-chat"``, ``"openai/gpt-4o-mini"``).
+Azure Entra ID authentication is handled automatically when
+``AZURE_API_BASE`` or ``AZURE_OPENAI_ENDPOINT`` is set.
 
-    uv add pydantic-ai
+Example::
+
+    suggestion = await optimize_instruction(
+        agent.instructions or "",
+        result,
+        "Agent should add docstrings.",
+    )
 """
 
 from __future__ import annotations
@@ -15,6 +24,9 @@
 from typing import TYPE_CHECKING
 
 from pydantic import BaseModel
+from pydantic_ai import Agent as PydanticAgent
+from pydantic_ai.models import Model
+from pytest_aitest.execution.pydantic_adapter import build_model_from_string
 
 if TYPE_CHECKING:
     from pytest_codingagents.copilot.result import CopilotResult
@@ -70,7 +82,7 @@ async def optimize_instruction(
     result: CopilotResult,
     criterion: str,
     *,
-    model: str = "openai:gpt-4o-mini",
+    model: str | Model = "azure/gpt-5.2-chat",
 ) -> InstructionSuggestion:
     """Analyze a result and suggest an improved instruction.
 
@@ -79,7 +91,11 @@ async def optimize_instruction(
     concrete, actionable improvement.
 
     Designed to drop into ``pytest.fail()`` so the failure message
-    contains a ready-to-use fix:
+    contains a ready-to-use fix.
+
+    Model strings follow the same ``provider/model`` format used by
+    ``pytest-aitest``. Azure Entra ID auth is handled automatically
+    when ``AZURE_API_BASE`` or ``AZURE_OPENAI_ENDPOINT`` is set.
 
     Example::
 
@@ -97,24 +113,16 @@ async def optimize_instruction(
         result: The ``CopilotResult`` from the (failed) run.
         criterion: What the agent *should* have done — the test expectation
             in plain English (e.g. ``"Always write docstrings"``).
-        model: LiteLLM-style model string (e.g. ``"openai:gpt-4o-mini"``
-            or ``"anthropic:claude-3-haiku-20240307"``).
+        model: Provider/model string (e.g. ``"azure/gpt-5.2-chat"``,
+            ``"openai/gpt-4o-mini"``) or a pre-configured pydantic-ai
+            ``Model`` object. Defaults to ``"azure/gpt-5.2-chat"``.
 
     Returns:
         An :class:`InstructionSuggestion` with the improved instruction.
-
-    Raises:
-        ImportError: If pydantic-ai is not installed.
     """
-    try:
-        from pydantic_ai import Agent as PydanticAgent
-    except ImportError as exc:
-        msg = (
-            "pydantic-ai is required for optimize_instruction(). "
-            "Install it with: uv add pydantic-ai"
-        )
-        raise ImportError(msg) from exc
-
+    resolved_model: str | Model = (
+        build_model_from_string(model) if isinstance(model, str) else model
+    )
     final_output = result.final_response or "(no response)"
     tool_calls = ", ".join(sorted(result.tool_names_called)) or "none"
 
@@ -142,7 +150,7 @@ async def optimize_instruction(
 that would make the agent satisfy the criterion.
 Keep the instruction under 200 words. Do not add unrelated rules."""
 
-    optimizer_agent = PydanticAgent(model, output_type=_OptimizationOutput)
+    optimizer_agent = PydanticAgent(resolved_model, output_type=_OptimizationOutput)
     run_result = await optimizer_agent.run(prompt)
     output = run_result.output
 

@@ -2,9 +2,9 @@
 
 These tests require:
 - GitHub Copilot credentials (for copilot_run to produce a real result)
-- An LLM API key for the optimizer (OPENAI_API_KEY or configure a different model)
+- AZURE_API_BASE or AZURE_OPENAI_ENDPOINT env var (for the optimizer LLM via Azure Entra ID)
 
-Skipped automatically when the required API key is absent.
+Skipped automatically when neither AZURE_API_BASE nor AZURE_OPENAI_ENDPOINT is set.
 """
 
 from __future__ import annotations
@@ -19,13 +19,13 @@
 
 @pytest.mark.copilot
 class TestOptimizeInstructionIntegration:
-    """Integration tests for optimize_instruction() with real LLM calls."""
+    """Integration tests for optimize_instruction() with real Azure LLM calls."""
 
     @pytest.fixture(autouse=True)
-    def require_openai_key(self):
-        """Skip entire class when OPENAI_API_KEY is not set."""
-        if not os.environ.get("OPENAI_API_KEY"):
-            pytest.skip("OPENAI_API_KEY not set — skipping optimizer integration tests")
+    def require_azure_endpoint(self):
+        """Skip entire class when neither AZURE_OPENAI_ENDPOINT nor AZURE_API_BASE is set."""
+        if not os.environ.get("AZURE_OPENAI_ENDPOINT") and not os.environ.get("AZURE_API_BASE"):
+            pytest.skip("AZURE_OPENAI_ENDPOINT not set — skipping optimizer integration tests")
 
     async def test_returns_valid_suggestion(self, copilot_run, tmp_path):
         """optimize_instruction returns an InstructionSuggestion with non-empty fields."""
@@ -100,3 +100,64 @@ async def test_suggestion_is_relevant_to_criterion(self, copilot_run, tmp_path):
             f"Instruction: {suggestion.instruction}\n"
             f"Reasoning: {suggestion.reasoning}"
         )
+
+    async def test_full_optimize_loop(self, copilot_run, tmp_path):
+        """Full test→optimize→test loop: weak instruction fails, improved instruction passes.
+
+        This is the hero use case: verify that optimize_instruction() produces
+        an instruction that actually fixes a failing criterion.
+
+        Round 1: Run with a deliberately weak instruction (no docstring mandate).
+                 The agent is expected to skip docstrings (recorded, not asserted).
+        Optimize: Call optimize_instruction() with the failing criterion.
+                  Receive a suggested instruction that mandates docstrings.
+        Round 2: Run again with the improved instruction.
+                 The agent now includes docstrings — criterion passes.
+        """
+        CRITERION = "Every function must include a Google-style docstring."
+        TASK = "Create calculator.py with add(a, b) and subtract(a, b) functions."
+
+        # --- Round 1: weak instruction, expect no docstrings ---
+        weak_agent = CopilotAgent(
+            name="weak-coder",
+            instructions="Write minimal Python code. No comments or documentation needed.",
+            working_directory=str(tmp_path / "round1"),
+        )
+        (tmp_path / "round1").mkdir()
+        result1 = await copilot_run(weak_agent, TASK)
+        assert result1.success, "Round 1 Copilot run failed"
+
+        code1 = result1.file("calculator.py") or ""
+        has_docstrings_round1 = '"""' in code1 or "'''" in code1
+
+        # --- Optimize ---
+        suggestion = await optimize_instruction(
+            weak_agent.instructions or "",
+            result1,
+            CRITERION,
+        )
+        assert suggestion.instruction.strip(), "Optimizer returned empty instruction"
+        print(f"\n💡 Suggested instruction:\n{suggestion}")  # visible in -s output
+
+        # --- Round 2: improved instruction ---
+        improved_agent = CopilotAgent(
+            name="improved-coder",
+            instructions=suggestion.instruction,
+            working_directory=str(tmp_path / "round2"),
+        )
+        (tmp_path / "round2").mkdir()
+        result2 = await copilot_run(improved_agent, TASK)
+        assert result2.success, "Round 2 Copilot run failed"
+
+        code2 = result2.file("calculator.py") or ""
+        has_docstrings_round2 = '"""' in code2 or "'''" in code2
+
+        assert has_docstrings_round2, (
+            f"Round 2 code still has no docstrings after optimization.\n"
+            f"Suggested instruction: {suggestion.instruction}\n"
+            f"Round 2 code:\n{code2}"
+        )
+        print(
+            f"\n✅ Loop complete. "
+            f"Docstrings round 1: {has_docstrings_round1}, round 2: {has_docstrings_round2}"
+        )