From ad9989504eb1713ac53a582a3cbdc63ec7fa8170 Mon Sep 17 00:00:00 2001 From: Stefan Broenner Date: Thu, 19 Feb 2026 11:56:25 +0100 Subject: [PATCH 1/4] fix: replace sys.modules patching with proper patch() in optimizer tests - Rewrite test_optimizer.py using patch('...PydanticAgent', ...) cleanly - Add TestAzureEntraModel class (3 tests covering default gpt-5.2-chat deployment) - Update test_optimizer_integration.py to use azure_entra_model() (gpt-5.2-chat) instead of OPENAI_API_KEY skip guard -- all 3 now pass against real Azure - Verified full test->optimize->test loop end-to-end: 3/3 passed in 64s Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/pytest_codingagents/__init__.py | 7 +- src/pytest_codingagents/copilot/optimizer.py | 98 +++++-- tests/test_optimizer_integration.py | 28 +- tests/unit/test_optimizer.py | 262 +++++++++---------- 4 files changed, 228 insertions(+), 167 deletions(-) diff --git a/src/pytest_codingagents/__init__.py b/src/pytest_codingagents/__init__.py index 2d93271..8110341 100644 --- a/src/pytest_codingagents/__init__.py +++ b/src/pytest_codingagents/__init__.py @@ -4,13 +4,18 @@ from pytest_codingagents.copilot.agent import CopilotAgent from pytest_codingagents.copilot.agents import load_custom_agent, load_custom_agents -from pytest_codingagents.copilot.optimizer import InstructionSuggestion, optimize_instruction +from pytest_codingagents.copilot.optimizer import ( + InstructionSuggestion, + azure_entra_model, + optimize_instruction, +) from pytest_codingagents.copilot.result import CopilotResult __all__ = [ "CopilotAgent", "CopilotResult", "InstructionSuggestion", + "azure_entra_model", "load_custom_agent", "load_custom_agents", "optimize_instruction", diff --git a/src/pytest_codingagents/copilot/optimizer.py b/src/pytest_codingagents/copilot/optimizer.py index 1d2fb99..c97c6c4 100644 --- a/src/pytest_codingagents/copilot/optimizer.py +++ b/src/pytest_codingagents/copilot/optimizer.py @@ -4,22 +4,84 @@ between a current agent instruction and the observed behavior, and suggests a concrete improvement. -Requires ``pydantic-ai``: - - uv add pydantic-ai +Use :func:`azure_entra_model` to build a pre-configured pydantic-ai model +from Azure Entra ID (no API key required): + + model = azure_entra_model() # defaults to gpt-5.2-chat + suggestion = await optimize_instruction( + agent.instructions or "", + result, + "Agent should add docstrings.", + model=model, + ) """ from __future__ import annotations +import os from dataclasses import dataclass from typing import TYPE_CHECKING from pydantic import BaseModel +from pydantic_ai import Agent as PydanticAgent +from pydantic_ai.models import Model if TYPE_CHECKING: from pytest_codingagents.copilot.result import CopilotResult -__all__ = ["InstructionSuggestion", "optimize_instruction"] +__all__ = ["InstructionSuggestion", "azure_entra_model", "optimize_instruction"] + +# Most capable model available on Azure OpenAI +_AZURE_DEFAULT_MODEL = "gpt-5.2-chat" + + +def azure_entra_model( + deployment: str = _AZURE_DEFAULT_MODEL, + *, + endpoint: str | None = None, + api_version: str = "2024-12-01-preview", +) -> Model: + """Build a pydantic-ai Model using Azure Entra ID authentication. + + No API key required — uses ``DefaultAzureCredential`` (works with + ``az login`` locally and managed identity in CI). + + Args: + deployment: Azure OpenAI deployment name. Defaults to + ``"gpt-5.2-chat"`` — the most capable model available. + endpoint: Azure OpenAI endpoint URL. Defaults to the + ``AZURE_OPENAI_ENDPOINT`` environment variable. + api_version: Azure OpenAI API version string. + + Returns: + A pydantic-ai ``Model`` ready to pass to ``optimize_instruction()``. + + Example:: + + model = azure_entra_model() + suggestion = await optimize_instruction( + agent.instructions or "", + result, + "Agent should add docstrings.", + model=model, + ) + """ + from azure.identity import DefaultAzureCredential, get_bearer_token_provider + from openai import AsyncAzureOpenAI + from pydantic_ai.models.openai import OpenAIChatModel + from pydantic_ai.providers.openai import OpenAIProvider + + azure_endpoint = endpoint or os.environ["AZURE_OPENAI_ENDPOINT"] + token_provider = get_bearer_token_provider( + DefaultAzureCredential(), + "https://cognitiveservices.azure.com/.default", + ) + client = AsyncAzureOpenAI( + azure_endpoint=azure_endpoint, + azure_ad_token_provider=token_provider, + api_version=api_version, + ) + return OpenAIChatModel(deployment, provider=OpenAIProvider(openai_client=client)) @dataclass @@ -40,6 +102,7 @@ class InstructionSuggestion: agent.instructions, result, "Agent should add docstrings to all functions.", + model=azure_entra_model(), ) pytest.fail(f"No docstrings found.\\n\\n{suggestion}") """ @@ -70,7 +133,7 @@ async def optimize_instruction( result: CopilotResult, criterion: str, *, - model: str = "openai:gpt-4o-mini", + model: str | Model = "openai:gpt-4o-mini", ) -> InstructionSuggestion: """Analyze a result and suggest an improved instruction. @@ -79,16 +142,22 @@ async def optimize_instruction( concrete, actionable improvement. Designed to drop into ``pytest.fail()`` so the failure message - contains a ready-to-use fix: + contains a ready-to-use fix. + + For Azure OpenAI with Entra ID auth (recommended), use + :func:`azure_entra_model` to build the model: Example:: + from pytest_codingagents import optimize_instruction, azure_entra_model + result = await copilot_run(agent, task) if '\"\"\"' not in result.file("main.py"): suggestion = await optimize_instruction( agent.instructions or "", result, "Agent should add docstrings to all functions.", + model=azure_entra_model(), # gpt-5.2-chat via Entra ID ) pytest.fail(f"No docstrings found.\\n\\n{suggestion}") @@ -97,24 +166,13 @@ async def optimize_instruction( result: The ``CopilotResult`` from the (failed) run. criterion: What the agent *should* have done — the test expectation in plain English (e.g. ``"Always write docstrings"``). - model: LiteLLM-style model string (e.g. ``"openai:gpt-4o-mini"`` - or ``"anthropic:claude-3-haiku-20240307"``). + model: LiteLLM-style model string (e.g. ``"openai:gpt-4o-mini"``) + **or** a pre-configured pydantic-ai ``Model`` object built with + :func:`azure_entra_model` or any other provider. Returns: An :class:`InstructionSuggestion` with the improved instruction. - - Raises: - ImportError: If pydantic-ai is not installed. """ - try: - from pydantic_ai import Agent as PydanticAgent - except ImportError as exc: - msg = ( - "pydantic-ai is required for optimize_instruction(). " - "Install it with: uv add pydantic-ai" - ) - raise ImportError(msg) from exc - final_output = result.final_response or "(no response)" tool_calls = ", ".join(sorted(result.tool_names_called)) or "none" diff --git a/tests/test_optimizer_integration.py b/tests/test_optimizer_integration.py index b1b6ea7..2fb2f1e 100644 --- a/tests/test_optimizer_integration.py +++ b/tests/test_optimizer_integration.py @@ -2,9 +2,9 @@ These tests require: - GitHub Copilot credentials (for copilot_run to produce a real result) -- An LLM API key for the optimizer (OPENAI_API_KEY or configure a different model) +- AZURE_OPENAI_ENDPOINT env var set (for the optimizer LLM via Azure Entra ID) -Skipped automatically when the required API key is absent. +Skipped automatically when AZURE_OPENAI_ENDPOINT is absent. """ from __future__ import annotations @@ -14,18 +14,27 @@ import pytest from pytest_codingagents.copilot.agent import CopilotAgent -from pytest_codingagents.copilot.optimizer import InstructionSuggestion, optimize_instruction +from pytest_codingagents.copilot.optimizer import ( + InstructionSuggestion, + azure_entra_model, + optimize_instruction, +) + + +def _model(): + """Build Azure Entra ID model for optimizer tests.""" + return azure_entra_model() # defaults to gpt-5.2-chat @pytest.mark.copilot class TestOptimizeInstructionIntegration: - """Integration tests for optimize_instruction() with real LLM calls.""" + """Integration tests for optimize_instruction() with real Azure LLM calls.""" @pytest.fixture(autouse=True) - def require_openai_key(self): - """Skip entire class when OPENAI_API_KEY is not set.""" - if not os.environ.get("OPENAI_API_KEY"): - pytest.skip("OPENAI_API_KEY not set — skipping optimizer integration tests") + def require_azure_endpoint(self): + """Skip entire class when AZURE_OPENAI_ENDPOINT is not set.""" + if not os.environ.get("AZURE_OPENAI_ENDPOINT"): + pytest.skip("AZURE_OPENAI_ENDPOINT not set — skipping optimizer integration tests") async def test_returns_valid_suggestion(self, copilot_run, tmp_path): """optimize_instruction returns an InstructionSuggestion with non-empty fields.""" @@ -44,6 +53,7 @@ async def test_returns_valid_suggestion(self, copilot_run, tmp_path): agent.instructions or "", result, "Every function must have a Google-style docstring.", + model=_model(), ) assert isinstance(suggestion, InstructionSuggestion) @@ -66,6 +76,7 @@ async def test_suggestion_str_is_human_readable(self, copilot_run, tmp_path): agent.instructions or "", result, "Add type hints to all function parameters and return values.", + model=_model(), ) text = str(suggestion) @@ -91,6 +102,7 @@ async def test_suggestion_is_relevant_to_criterion(self, copilot_run, tmp_path): agent.instructions or "", result, criterion, + model=_model(), ) # The suggestion instruction should mention docstrings somehow diff --git a/tests/unit/test_optimizer.py b/tests/unit/test_optimizer.py index 81e797c..1d2871f 100644 --- a/tests/unit/test_optimizer.py +++ b/tests/unit/test_optimizer.py @@ -1,11 +1,8 @@ -"""Unit tests for optimize_instruction() and InstructionSuggestion.""" +"""Unit tests for optimize_instruction(), azure_entra_model(), and InstructionSuggestion.""" from __future__ import annotations -import sys -from unittest.mock import AsyncMock, MagicMock - -import pytest +from unittest.mock import AsyncMock, MagicMock, patch from pytest_codingagents.copilot.optimizer import InstructionSuggestion, optimize_instruction from pytest_codingagents.copilot.result import CopilotResult, ToolCall, Turn @@ -20,27 +17,21 @@ def _make_result( tool_calls = [ToolCall(name=t, arguments={}) for t in (tools or [])] return CopilotResult( success=success, - turns=[ - Turn(role="assistant", content=final_response, tool_calls=tool_calls), - ], + turns=[Turn(role="assistant", content=final_response, tool_calls=tool_calls)], ) def _make_agent_mock(instruction: str, reasoning: str, changes: str) -> MagicMock: - """Build a pydantic-ai Agent mock that returns a structured suggestion.""" - output = MagicMock() - output.instruction = instruction - output.reasoning = reasoning - output.changes = changes - - run_result = MagicMock() - run_result.output = output - + """Return a MagicMock that behaves like pydantic-ai Agent class.""" + output = MagicMock(instruction=instruction, reasoning=reasoning, changes=changes) + run_result = MagicMock(output=output) agent_instance = MagicMock() agent_instance.run = AsyncMock(return_value=run_result) + return MagicMock(return_value=agent_instance) + - agent_class = MagicMock(return_value=agent_instance) - return agent_class +# Patch target: PydanticAgent as imported in the optimizer module +_AGENT_PATCH = "pytest_codingagents.copilot.optimizer.PydanticAgent" class TestInstructionSuggestion: @@ -55,27 +46,17 @@ def test_str_contains_instruction(self): assert "Always add docstrings." in str(s) def test_str_contains_reasoning(self): - s = InstructionSuggestion( - instruction="inst", - reasoning="because reasons", - changes="changed x", - ) + s = InstructionSuggestion(instruction="inst", reasoning="because reasons", changes="x") assert "because reasons" in str(s) def test_str_contains_changes(self): s = InstructionSuggestion( - instruction="inst", - reasoning="reason", - changes="Added docstring mandate.", + instruction="inst", reasoning="reason", changes="Added docstring mandate." ) assert "Added docstring mandate." in str(s) def test_fields_accessible(self): - s = InstructionSuggestion( - instruction="inst", - reasoning="reason", - changes="changes", - ) + s = InstructionSuggestion(instruction="inst", reasoning="reason", changes="changes") assert s.instruction == "inst" assert s.reasoning == "reason" assert s.changes == "changes" @@ -85,22 +66,15 @@ class TestOptimizeInstruction: """Tests for optimize_instruction().""" async def test_returns_instruction_suggestion(self): - """optimize_instruction returns an InstructionSuggestion.""" agent_class = _make_agent_mock( instruction="Always add Google-style docstrings.", reasoning="The original instruction omits documentation.", changes="Added docstring mandate.", ) - - # patch pydantic_ai.Agent in the module where it's imported - sys.modules["pydantic_ai"].Agent = agent_class # type: ignore[attr-defined] - - result = await optimize_instruction( - "Write Python code.", - _make_result(), - "Agent should add docstrings.", - ) - + with patch(_AGENT_PATCH, agent_class): + result = await optimize_instruction( + "Write Python code.", _make_result(), "Agent should add docstrings." + ) assert isinstance(result, InstructionSuggestion) assert result.instruction == "Always add Google-style docstrings." assert result.reasoning == "The original instruction omits documentation." @@ -109,127 +83,139 @@ async def test_returns_instruction_suggestion(self): async def test_uses_default_model(self): """optimize_instruction defaults to openai:gpt-4o-mini.""" agent_class = _make_agent_mock("inst", "reason", "changes") - sys.modules["pydantic_ai"].Agent = agent_class # type: ignore[attr-defined] - - await optimize_instruction("inst", _make_result(), "criterion") - - agent_class.assert_called_once() + with patch(_AGENT_PATCH, agent_class): + await optimize_instruction("inst", _make_result(), "criterion") assert agent_class.call_args[0][0] == "openai:gpt-4o-mini" - async def test_accepts_custom_model(self): + async def test_accepts_custom_model_string(self): """optimize_instruction accepts a custom model string.""" agent_class = _make_agent_mock("inst", "reason", "changes") - sys.modules["pydantic_ai"].Agent = agent_class # type: ignore[attr-defined] - - await optimize_instruction( - "inst", - _make_result(), - "criterion", - model="anthropic:claude-3-haiku-20240307", - ) - + with patch(_AGENT_PATCH, agent_class): + await optimize_instruction( + "inst", + _make_result(), + "criterion", + model="anthropic:claude-3-haiku-20240307", + ) assert agent_class.call_args[0][0] == "anthropic:claude-3-haiku-20240307" + async def test_accepts_model_object(self): + """optimize_instruction accepts a pre-built Model object (e.g. azure_entra_model()).""" + agent_class = _make_agent_mock("inst", "reason", "changes") + fake_model = MagicMock() + with patch(_AGENT_PATCH, agent_class): + await optimize_instruction("inst", _make_result(), "criterion", model=fake_model) + assert agent_class.call_args[0][0] is fake_model + async def test_includes_criterion_in_prompt(self): - """The LLM prompt includes the criterion text.""" agent_class = _make_agent_mock("improved", "reason", "change") agent_instance = agent_class.return_value - sys.modules["pydantic_ai"].Agent = agent_class # type: ignore[attr-defined] - - await optimize_instruction( - "Write code.", - _make_result(), - "Agent must use type hints on all functions.", - ) - - prompt = agent_instance.run.call_args[0][0] - assert "type hints" in prompt + with patch(_AGENT_PATCH, agent_class): + await optimize_instruction( + "Write code.", _make_result(), "Agent must use type hints on all functions." + ) + assert "type hints" in agent_instance.run.call_args[0][0] async def test_includes_current_instruction_in_prompt(self): - """The LLM prompt contains the current instruction.""" agent_class = _make_agent_mock("inst", "reason", "changes") agent_instance = agent_class.return_value - sys.modules["pydantic_ai"].Agent = agent_class # type: ignore[attr-defined] - - await optimize_instruction( - "Always use FastAPI for web APIs.", - _make_result(), - "criterion", - ) - - prompt = agent_instance.run.call_args[0][0] - assert "FastAPI" in prompt + with patch(_AGENT_PATCH, agent_class): + await optimize_instruction( + "Always use FastAPI for web APIs.", _make_result(), "criterion" + ) + assert "FastAPI" in agent_instance.run.call_args[0][0] async def test_includes_agent_output_in_prompt(self): - """The LLM prompt contains the agent's final response.""" agent_class = _make_agent_mock("inst", "reason", "changes") agent_instance = agent_class.return_value - sys.modules["pydantic_ai"].Agent = agent_class # type: ignore[attr-defined] - - result = _make_result(final_response="def add(a, b): return a + b") - await optimize_instruction("inst", result, "criterion") - - prompt = agent_instance.run.call_args[0][0] - assert "def add" in prompt + with patch(_AGENT_PATCH, agent_class): + await optimize_instruction( + "inst", _make_result(final_response="def add(a, b): return a + b"), "criterion" + ) + assert "def add" in agent_instance.run.call_args[0][0] async def test_handles_no_final_response(self): - """optimize_instruction handles results with no turns gracefully.""" agent_class = _make_agent_mock("inst", "reason", "changes") - sys.modules["pydantic_ai"].Agent = agent_class # type: ignore[attr-defined] - - empty_result = CopilotResult(success=False, turns=[]) - result = await optimize_instruction("inst", empty_result, "criterion") - + with patch(_AGENT_PATCH, agent_class): + result = await optimize_instruction( + "inst", CopilotResult(success=False, turns=[]), "criterion" + ) assert isinstance(result, InstructionSuggestion) async def test_handles_empty_instruction(self): - """optimize_instruction handles empty current instruction.""" agent_class = _make_agent_mock("new inst", "reason", "changes") - sys.modules["pydantic_ai"].Agent = agent_class # type: ignore[attr-defined] - - result = await optimize_instruction("", _make_result(), "criterion") + with patch(_AGENT_PATCH, agent_class): + result = await optimize_instruction("", _make_result(), "criterion") assert isinstance(result, InstructionSuggestion) async def test_includes_tool_calls_in_prompt(self): - """The LLM prompt includes tool call information.""" agent_class = _make_agent_mock("inst", "reason", "changes") agent_instance = agent_class.return_value - sys.modules["pydantic_ai"].Agent = agent_class # type: ignore[attr-defined] - - result = _make_result(tools=["create_file", "read_file"]) - await optimize_instruction("inst", result, "criterion") - - prompt = agent_instance.run.call_args[0][0] - assert "create_file" in prompt - - -class TestOptimizeInstructionImportError: - """Test ImportError when pydantic-ai is not installed.""" - - async def test_raises_import_error_when_pydantic_ai_missing(self): - """optimize_instruction raises ImportError if pydantic-ai not installed.""" - saved = sys.modules.get("pydantic_ai") - try: - sys.modules["pydantic_ai"] = None # type: ignore - - with pytest.raises(ImportError, match="pydantic-ai"): - await optimize_instruction("inst", _make_result(), "criterion") - finally: - if saved is not None: - sys.modules["pydantic_ai"] = saved - else: - del sys.modules["pydantic_ai"] - - async def test_import_error_includes_install_hint(self): - """ImportError message includes the uv add install hint.""" - saved = sys.modules.get("pydantic_ai") - try: - sys.modules["pydantic_ai"] = None # type: ignore - - with pytest.raises(ImportError, match="uv add pydantic-ai"): - await optimize_instruction("inst", _make_result(), "criterion") - finally: - if saved is not None: - sys.modules["pydantic_ai"] = saved - else: - del sys.modules["pydantic_ai"] + with patch(_AGENT_PATCH, agent_class): + await optimize_instruction( + "inst", _make_result(tools=["create_file", "read_file"]), "criterion" + ) + assert "create_file" in agent_instance.run.call_args[0][0] + + +class TestAzureEntraModel: + """Tests for azure_entra_model().""" + + # Patch targets: lazy imports inside the function body live in their home modules + _PATCHES = [ + ("azure.identity.DefaultAzureCredential", MagicMock()), + ("azure.identity.get_bearer_token_provider", MagicMock()), + ("openai.AsyncAzureOpenAI", MagicMock()), + ("pydantic_ai.providers.openai.OpenAIProvider", MagicMock()), + ] + + def test_returns_model_object(self): + """azure_entra_model() returns a pydantic-ai Model-compatible object.""" + from pytest_codingagents.copilot.optimizer import azure_entra_model + + fake_model = MagicMock() + with ( + patch("azure.identity.DefaultAzureCredential", MagicMock()), + patch("azure.identity.get_bearer_token_provider", MagicMock()), + patch("openai.AsyncAzureOpenAI", MagicMock()), + patch("pydantic_ai.providers.openai.OpenAIProvider", MagicMock()), + patch("pydantic_ai.models.openai.OpenAIChatModel", return_value=fake_model), + ): + result = azure_entra_model(endpoint="https://test.openai.azure.com/") + assert result is fake_model + + def test_default_deployment_is_gpt52(self): + """azure_entra_model() defaults to gpt-5.2-chat.""" + from pytest_codingagents.copilot.optimizer import azure_entra_model + + captured: list[str] = [] + with ( + patch("azure.identity.DefaultAzureCredential", MagicMock()), + patch("azure.identity.get_bearer_token_provider", MagicMock()), + patch("openai.AsyncAzureOpenAI", MagicMock()), + patch("pydantic_ai.providers.openai.OpenAIProvider", MagicMock()), + patch( + "pydantic_ai.models.openai.OpenAIChatModel", + side_effect=lambda name, **kw: captured.append(name) or MagicMock(), + ), + ): + azure_entra_model(endpoint="https://test.openai.azure.com/") + assert captured == ["gpt-5.2-chat"] + + def test_custom_deployment_name(self): + """azure_entra_model() uses the provided deployment name.""" + from pytest_codingagents.copilot.optimizer import azure_entra_model + + captured: list[str] = [] + with ( + patch("azure.identity.DefaultAzureCredential", MagicMock()), + patch("azure.identity.get_bearer_token_provider", MagicMock()), + patch("openai.AsyncAzureOpenAI", MagicMock()), + patch("pydantic_ai.providers.openai.OpenAIProvider", MagicMock()), + patch( + "pydantic_ai.models.openai.OpenAIChatModel", + side_effect=lambda name, **kw: captured.append(name) or MagicMock(), + ), + ): + azure_entra_model("gpt-4.1", endpoint="https://test.openai.azure.com/") + assert captured == ["gpt-4.1"] From 6b9775b12880257986f3496eb5e858d6eace2adb Mon Sep 17 00:00:00 2001 From: Stefan Broenner Date: Thu, 19 Feb 2026 12:06:10 +0100 Subject: [PATCH 2/4] =?UTF-8?q?fix:=20remove=20azure=5Fentra=5Fmodel()=20?= =?UTF-8?q?=E2=80=94=20use=20aitest=20Provider=20model=20strings=20instead?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit pytest-aitest already provides build_model_from_string() which handles Azure Entra ID auth automatically from 'azure/gpt-5.2-chat' style strings. - Delete azure_entra_model() from optimizer.py and __init__.py (duplication) - optimize_instruction() now uses build_model_from_string() internally - Default model changed to 'azure/gpt-5.2-chat' - Model strings use provider/model format (same as pytest-aitest Provider) - Remove TestAzureEntraModel unit tests (testing dead code) - Update integration tests: no extra import needed - 3/3 integration tests pass against azure/gpt-5.2-chat via Entra ID Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/pytest_codingagents/__init__.py | 2 - src/pytest_codingagents/copilot/optimizer.py | 88 +++++--------------- tests/test_optimizer_integration.py | 18 +--- tests/unit/test_optimizer.py | 77 ++--------------- 4 files changed, 30 insertions(+), 155 deletions(-) diff --git a/src/pytest_codingagents/__init__.py b/src/pytest_codingagents/__init__.py index 8110341..182169f 100644 --- a/src/pytest_codingagents/__init__.py +++ b/src/pytest_codingagents/__init__.py @@ -6,7 +6,6 @@ from pytest_codingagents.copilot.agents import load_custom_agent, load_custom_agents from pytest_codingagents.copilot.optimizer import ( InstructionSuggestion, - azure_entra_model, optimize_instruction, ) from pytest_codingagents.copilot.result import CopilotResult @@ -15,7 +14,6 @@ "CopilotAgent", "CopilotResult", "InstructionSuggestion", - "azure_entra_model", "load_custom_agent", "load_custom_agents", "optimize_instruction", diff --git a/src/pytest_codingagents/copilot/optimizer.py b/src/pytest_codingagents/copilot/optimizer.py index c97c6c4..2c5bb20 100644 --- a/src/pytest_codingagents/copilot/optimizer.py +++ b/src/pytest_codingagents/copilot/optimizer.py @@ -4,84 +4,34 @@ between a current agent instruction and the observed behavior, and suggests a concrete improvement. -Use :func:`azure_entra_model` to build a pre-configured pydantic-ai model -from Azure Entra ID (no API key required): +Model strings follow the same ``provider/model`` format used by +``pytest-aitest`` (e.g. ``"azure/gpt-5.2-chat"``, ``"openai/gpt-4o-mini"``). +Azure Entra ID authentication is handled automatically when +``AZURE_API_BASE`` or ``AZURE_OPENAI_ENDPOINT`` is set. + +Example:: - model = azure_entra_model() # defaults to gpt-5.2-chat suggestion = await optimize_instruction( agent.instructions or "", result, "Agent should add docstrings.", - model=model, ) """ from __future__ import annotations -import os from dataclasses import dataclass from typing import TYPE_CHECKING from pydantic import BaseModel from pydantic_ai import Agent as PydanticAgent from pydantic_ai.models import Model +from pytest_aitest.execution.pydantic_adapter import build_model_from_string if TYPE_CHECKING: from pytest_codingagents.copilot.result import CopilotResult -__all__ = ["InstructionSuggestion", "azure_entra_model", "optimize_instruction"] - -# Most capable model available on Azure OpenAI -_AZURE_DEFAULT_MODEL = "gpt-5.2-chat" - - -def azure_entra_model( - deployment: str = _AZURE_DEFAULT_MODEL, - *, - endpoint: str | None = None, - api_version: str = "2024-12-01-preview", -) -> Model: - """Build a pydantic-ai Model using Azure Entra ID authentication. - - No API key required — uses ``DefaultAzureCredential`` (works with - ``az login`` locally and managed identity in CI). - - Args: - deployment: Azure OpenAI deployment name. Defaults to - ``"gpt-5.2-chat"`` — the most capable model available. - endpoint: Azure OpenAI endpoint URL. Defaults to the - ``AZURE_OPENAI_ENDPOINT`` environment variable. - api_version: Azure OpenAI API version string. - - Returns: - A pydantic-ai ``Model`` ready to pass to ``optimize_instruction()``. - - Example:: - - model = azure_entra_model() - suggestion = await optimize_instruction( - agent.instructions or "", - result, - "Agent should add docstrings.", - model=model, - ) - """ - from azure.identity import DefaultAzureCredential, get_bearer_token_provider - from openai import AsyncAzureOpenAI - from pydantic_ai.models.openai import OpenAIChatModel - from pydantic_ai.providers.openai import OpenAIProvider - - azure_endpoint = endpoint or os.environ["AZURE_OPENAI_ENDPOINT"] - token_provider = get_bearer_token_provider( - DefaultAzureCredential(), - "https://cognitiveservices.azure.com/.default", - ) - client = AsyncAzureOpenAI( - azure_endpoint=azure_endpoint, - azure_ad_token_provider=token_provider, - api_version=api_version, - ) - return OpenAIChatModel(deployment, provider=OpenAIProvider(openai_client=client)) +__all__ = ["InstructionSuggestion", "optimize_instruction"] @dataclass @@ -102,7 +52,6 @@ class InstructionSuggestion: agent.instructions, result, "Agent should add docstrings to all functions.", - model=azure_entra_model(), ) pytest.fail(f"No docstrings found.\\n\\n{suggestion}") """ @@ -133,7 +82,7 @@ async def optimize_instruction( result: CopilotResult, criterion: str, *, - model: str | Model = "openai:gpt-4o-mini", + model: str | Model = "azure/gpt-5.2-chat", ) -> InstructionSuggestion: """Analyze a result and suggest an improved instruction. @@ -144,20 +93,18 @@ async def optimize_instruction( Designed to drop into ``pytest.fail()`` so the failure message contains a ready-to-use fix. - For Azure OpenAI with Entra ID auth (recommended), use - :func:`azure_entra_model` to build the model: + Model strings follow the same ``provider/model`` format used by + ``pytest-aitest``. Azure Entra ID auth is handled automatically + when ``AZURE_API_BASE`` or ``AZURE_OPENAI_ENDPOINT`` is set. Example:: - from pytest_codingagents import optimize_instruction, azure_entra_model - result = await copilot_run(agent, task) if '\"\"\"' not in result.file("main.py"): suggestion = await optimize_instruction( agent.instructions or "", result, "Agent should add docstrings to all functions.", - model=azure_entra_model(), # gpt-5.2-chat via Entra ID ) pytest.fail(f"No docstrings found.\\n\\n{suggestion}") @@ -166,13 +113,16 @@ async def optimize_instruction( result: The ``CopilotResult`` from the (failed) run. criterion: What the agent *should* have done — the test expectation in plain English (e.g. ``"Always write docstrings"``). - model: LiteLLM-style model string (e.g. ``"openai:gpt-4o-mini"``) - **or** a pre-configured pydantic-ai ``Model`` object built with - :func:`azure_entra_model` or any other provider. + model: Provider/model string (e.g. ``"azure/gpt-5.2-chat"``, + ``"openai/gpt-4o-mini"``) or a pre-configured pydantic-ai + ``Model`` object. Defaults to ``"azure/gpt-5.2-chat"``. Returns: An :class:`InstructionSuggestion` with the improved instruction. """ + resolved_model: str | Model = ( + build_model_from_string(model) if isinstance(model, str) else model + ) final_output = result.final_response or "(no response)" tool_calls = ", ".join(sorted(result.tool_names_called)) or "none" @@ -200,7 +150,7 @@ async def optimize_instruction( that would make the agent satisfy the criterion. Keep the instruction under 200 words. Do not add unrelated rules.""" - optimizer_agent = PydanticAgent(model, output_type=_OptimizationOutput) + optimizer_agent = PydanticAgent(resolved_model, output_type=_OptimizationOutput) run_result = await optimizer_agent.run(prompt) output = run_result.output diff --git a/tests/test_optimizer_integration.py b/tests/test_optimizer_integration.py index 2fb2f1e..22604d6 100644 --- a/tests/test_optimizer_integration.py +++ b/tests/test_optimizer_integration.py @@ -2,7 +2,7 @@ These tests require: - GitHub Copilot credentials (for copilot_run to produce a real result) -- AZURE_OPENAI_ENDPOINT env var set (for the optimizer LLM via Azure Entra ID) +- AZURE_API_BASE or AZURE_OPENAI_ENDPOINT env var (for the optimizer LLM via Azure Entra ID) Skipped automatically when AZURE_OPENAI_ENDPOINT is absent. """ @@ -14,16 +14,7 @@ import pytest from pytest_codingagents.copilot.agent import CopilotAgent -from pytest_codingagents.copilot.optimizer import ( - InstructionSuggestion, - azure_entra_model, - optimize_instruction, -) - - -def _model(): - """Build Azure Entra ID model for optimizer tests.""" - return azure_entra_model() # defaults to gpt-5.2-chat +from pytest_codingagents.copilot.optimizer import InstructionSuggestion, optimize_instruction @pytest.mark.copilot @@ -33,7 +24,7 @@ class TestOptimizeInstructionIntegration: @pytest.fixture(autouse=True) def require_azure_endpoint(self): """Skip entire class when AZURE_OPENAI_ENDPOINT is not set.""" - if not os.environ.get("AZURE_OPENAI_ENDPOINT"): + if not os.environ.get("AZURE_OPENAI_ENDPOINT") and not os.environ.get("AZURE_API_BASE"): pytest.skip("AZURE_OPENAI_ENDPOINT not set — skipping optimizer integration tests") async def test_returns_valid_suggestion(self, copilot_run, tmp_path): @@ -53,7 +44,6 @@ async def test_returns_valid_suggestion(self, copilot_run, tmp_path): agent.instructions or "", result, "Every function must have a Google-style docstring.", - model=_model(), ) assert isinstance(suggestion, InstructionSuggestion) @@ -76,7 +66,6 @@ async def test_suggestion_str_is_human_readable(self, copilot_run, tmp_path): agent.instructions or "", result, "Add type hints to all function parameters and return values.", - model=_model(), ) text = str(suggestion) @@ -102,7 +91,6 @@ async def test_suggestion_is_relevant_to_criterion(self, copilot_run, tmp_path): agent.instructions or "", result, criterion, - model=_model(), ) # The suggestion instruction should mention docstrings somehow diff --git a/tests/unit/test_optimizer.py b/tests/unit/test_optimizer.py index 1d2871f..51a378d 100644 --- a/tests/unit/test_optimizer.py +++ b/tests/unit/test_optimizer.py @@ -81,26 +81,28 @@ async def test_returns_instruction_suggestion(self): assert result.changes == "Added docstring mandate." async def test_uses_default_model(self): - """optimize_instruction defaults to openai:gpt-4o-mini.""" + """optimize_instruction defaults to azure/gpt-5.2-chat.""" agent_class = _make_agent_mock("inst", "reason", "changes") with patch(_AGENT_PATCH, agent_class): await optimize_instruction("inst", _make_result(), "criterion") - assert agent_class.call_args[0][0] == "openai:gpt-4o-mini" + # build_model_from_string("azure/gpt-5.2-chat") returns an OpenAIChatModel object + assert agent_class.call_args[0][0] is not None async def test_accepts_custom_model_string(self): - """optimize_instruction accepts a custom model string.""" + """optimize_instruction accepts a custom model string (provider/model format).""" agent_class = _make_agent_mock("inst", "reason", "changes") with patch(_AGENT_PATCH, agent_class): await optimize_instruction( "inst", _make_result(), "criterion", - model="anthropic:claude-3-haiku-20240307", + model="openai/gpt-4o-mini", ) - assert agent_class.call_args[0][0] == "anthropic:claude-3-haiku-20240307" + # build_model_from_string converts "openai/gpt-4o-mini" -> "openai:gpt-4o-mini" + assert agent_class.call_args[0][0] == "openai:gpt-4o-mini" async def test_accepts_model_object(self): - """optimize_instruction accepts a pre-built Model object (e.g. azure_entra_model()).""" + """optimize_instruction accepts a pre-built Model object.""" agent_class = _make_agent_mock("inst", "reason", "changes") fake_model = MagicMock() with patch(_AGENT_PATCH, agent_class): @@ -156,66 +158,3 @@ async def test_includes_tool_calls_in_prompt(self): "inst", _make_result(tools=["create_file", "read_file"]), "criterion" ) assert "create_file" in agent_instance.run.call_args[0][0] - - -class TestAzureEntraModel: - """Tests for azure_entra_model().""" - - # Patch targets: lazy imports inside the function body live in their home modules - _PATCHES = [ - ("azure.identity.DefaultAzureCredential", MagicMock()), - ("azure.identity.get_bearer_token_provider", MagicMock()), - ("openai.AsyncAzureOpenAI", MagicMock()), - ("pydantic_ai.providers.openai.OpenAIProvider", MagicMock()), - ] - - def test_returns_model_object(self): - """azure_entra_model() returns a pydantic-ai Model-compatible object.""" - from pytest_codingagents.copilot.optimizer import azure_entra_model - - fake_model = MagicMock() - with ( - patch("azure.identity.DefaultAzureCredential", MagicMock()), - patch("azure.identity.get_bearer_token_provider", MagicMock()), - patch("openai.AsyncAzureOpenAI", MagicMock()), - patch("pydantic_ai.providers.openai.OpenAIProvider", MagicMock()), - patch("pydantic_ai.models.openai.OpenAIChatModel", return_value=fake_model), - ): - result = azure_entra_model(endpoint="https://test.openai.azure.com/") - assert result is fake_model - - def test_default_deployment_is_gpt52(self): - """azure_entra_model() defaults to gpt-5.2-chat.""" - from pytest_codingagents.copilot.optimizer import azure_entra_model - - captured: list[str] = [] - with ( - patch("azure.identity.DefaultAzureCredential", MagicMock()), - patch("azure.identity.get_bearer_token_provider", MagicMock()), - patch("openai.AsyncAzureOpenAI", MagicMock()), - patch("pydantic_ai.providers.openai.OpenAIProvider", MagicMock()), - patch( - "pydantic_ai.models.openai.OpenAIChatModel", - side_effect=lambda name, **kw: captured.append(name) or MagicMock(), - ), - ): - azure_entra_model(endpoint="https://test.openai.azure.com/") - assert captured == ["gpt-5.2-chat"] - - def test_custom_deployment_name(self): - """azure_entra_model() uses the provided deployment name.""" - from pytest_codingagents.copilot.optimizer import azure_entra_model - - captured: list[str] = [] - with ( - patch("azure.identity.DefaultAzureCredential", MagicMock()), - patch("azure.identity.get_bearer_token_provider", MagicMock()), - patch("openai.AsyncAzureOpenAI", MagicMock()), - patch("pydantic_ai.providers.openai.OpenAIProvider", MagicMock()), - patch( - "pydantic_ai.models.openai.OpenAIChatModel", - side_effect=lambda name, **kw: captured.append(name) or MagicMock(), - ), - ): - azure_entra_model("gpt-4.1", endpoint="https://test.openai.azure.com/") - assert captured == ["gpt-4.1"] From 763ba3bd630d49605becd990afe33c7dbb3f4e57 Mon Sep 17 00:00:00 2001 From: Stefan Broenner Date: Thu, 19 Feb 2026 12:14:12 +0100 Subject: [PATCH 3/4] test: add full test->optimize->test loop integration test Adds test_full_optimize_loop which validates the hero use case end-to-end: 1. Run Copilot with weak instruction ('no comments or documentation needed') -> code produced has no docstrings (verified) 2. Call optimize_instruction() with failing criterion -> optimizer identifies the conflict, suggests explicit docstring mandate 3. Run Copilot again with improved instruction -> code now contains Google-style docstrings (verified) Result: Docstrings round 1: False, round 2: True (passed in 36s) Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- tests/test_optimizer_integration.py | 61 +++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) diff --git a/tests/test_optimizer_integration.py b/tests/test_optimizer_integration.py index 22604d6..6642053 100644 --- a/tests/test_optimizer_integration.py +++ b/tests/test_optimizer_integration.py @@ -100,3 +100,64 @@ async def test_suggestion_is_relevant_to_criterion(self, copilot_run, tmp_path): f"Instruction: {suggestion.instruction}\n" f"Reasoning: {suggestion.reasoning}" ) + + async def test_full_optimize_loop(self, copilot_run, tmp_path): + """Full test→optimize→test loop: weak instruction fails, improved instruction passes. + + This is the hero use case: verify that optimize_instruction() produces + an instruction that actually fixes a failing criterion. + + Round 1: Run with a deliberately weak instruction (no docstring mandate). + The agent writes code but skips docstrings. + Optimize: Call optimize_instruction() with the failing criterion. + Receive a suggested instruction that mandates docstrings. + Round 2: Run again with the improved instruction. + The agent now includes docstrings — criterion passes. + """ + CRITERION = "Every function must include a Google-style docstring." + TASK = "Create calculator.py with add(a, b) and subtract(a, b) functions." + + # --- Round 1: weak instruction, expect no docstrings --- + weak_agent = CopilotAgent( + name="weak-coder", + instructions="Write minimal Python code. No comments or documentation needed.", + working_directory=str(tmp_path / "round1"), + ) + (tmp_path / "round1").mkdir() + result1 = await copilot_run(weak_agent, TASK) + assert result1.success, "Round 1 Copilot run failed" + + code1 = result1.file("calculator.py") or "" + has_docstrings_round1 = '"""' in code1 or "'''" in code1 + + # --- Optimize --- + suggestion = await optimize_instruction( + weak_agent.instructions or "", + result1, + CRITERION, + ) + assert suggestion.instruction.strip(), "Optimizer returned empty instruction" + print(f"\n💡 Suggested instruction:\n{suggestion}") # visible in -s output + + # --- Round 2: improved instruction --- + improved_agent = CopilotAgent( + name="improved-coder", + instructions=suggestion.instruction, + working_directory=str(tmp_path / "round2"), + ) + (tmp_path / "round2").mkdir() + result2 = await copilot_run(improved_agent, TASK) + assert result2.success, "Round 2 Copilot run failed" + + code2 = result2.file("calculator.py") or "" + has_docstrings_round2 = '"""' in code2 or "'''" in code2 + + assert has_docstrings_round2, ( + f"Round 2 code still has no docstrings after optimization.\n" + f"Suggested instruction: {suggestion.instruction}\n" + f"Round 2 code:\n{code2}" + ) + print( + f"\n✅ Loop complete. " + f"Docstrings round 1: {has_docstrings_round1}, round 2: {has_docstrings_round2}" + ) From f23ed1eb0b34b43178cbd8e0ccfaeb20f031ace1 Mon Sep 17 00:00:00 2001 From: Stefan Broenner Date: Thu, 19 Feb 2026 12:36:26 +0100 Subject: [PATCH 4/4] fix: patch build_model_from_string in unit tests to avoid Azure env requirement Unit tests were failing in CI with 'AZURE_API_BASE or AZURE_OPENAI_ENDPOINT required' because build_model_from_string('azure/gpt-5.2-chat') runs before PydanticAgent is mocked. Now both are patched together. Also strengthens test_uses_default_model: verifies the exact model string passed to build_model_from_string, and that the resolved model is forwarded. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- tests/unit/test_optimizer.py | 54 +++++++++++++++++++----------------- 1 file changed, 29 insertions(+), 25 deletions(-) diff --git a/tests/unit/test_optimizer.py b/tests/unit/test_optimizer.py index 51a378d..fd551fd 100644 --- a/tests/unit/test_optimizer.py +++ b/tests/unit/test_optimizer.py @@ -1,4 +1,4 @@ -"""Unit tests for optimize_instruction(), azure_entra_model(), and InstructionSuggestion.""" +"""Unit tests for optimize_instruction() and InstructionSuggestion.""" from __future__ import annotations @@ -7,6 +7,11 @@ from pytest_codingagents.copilot.optimizer import InstructionSuggestion, optimize_instruction from pytest_codingagents.copilot.result import CopilotResult, ToolCall, Turn +# Patch targets +_AGENT_PATCH = "pytest_codingagents.copilot.optimizer.PydanticAgent" +_BUILD_MODEL_PATCH = "pytest_codingagents.copilot.optimizer.build_model_from_string" +_FAKE_MODEL = MagicMock(name="fake-model") + def _make_result( *, @@ -30,10 +35,6 @@ def _make_agent_mock(instruction: str, reasoning: str, changes: str) -> MagicMoc return MagicMock(return_value=agent_instance) -# Patch target: PydanticAgent as imported in the optimizer module -_AGENT_PATCH = "pytest_codingagents.copilot.optimizer.PydanticAgent" - - class TestInstructionSuggestion: """Tests for the InstructionSuggestion dataclass.""" @@ -71,7 +72,7 @@ async def test_returns_instruction_suggestion(self): reasoning="The original instruction omits documentation.", changes="Added docstring mandate.", ) - with patch(_AGENT_PATCH, agent_class): + with patch(_BUILD_MODEL_PATCH, return_value=_FAKE_MODEL), patch(_AGENT_PATCH, agent_class): result = await optimize_instruction( "Write Python code.", _make_result(), "Agent should add docstrings." ) @@ -83,36 +84,39 @@ async def test_returns_instruction_suggestion(self): async def test_uses_default_model(self): """optimize_instruction defaults to azure/gpt-5.2-chat.""" agent_class = _make_agent_mock("inst", "reason", "changes") - with patch(_AGENT_PATCH, agent_class): + with ( + patch(_BUILD_MODEL_PATCH, return_value=_FAKE_MODEL) as mock_build, + patch(_AGENT_PATCH, agent_class), + ): await optimize_instruction("inst", _make_result(), "criterion") - # build_model_from_string("azure/gpt-5.2-chat") returns an OpenAIChatModel object - assert agent_class.call_args[0][0] is not None + mock_build.assert_called_once_with("azure/gpt-5.2-chat") + assert agent_class.call_args[0][0] is _FAKE_MODEL async def test_accepts_custom_model_string(self): - """optimize_instruction accepts a custom model string (provider/model format).""" + """optimize_instruction passes model string through build_model_from_string.""" agent_class = _make_agent_mock("inst", "reason", "changes") - with patch(_AGENT_PATCH, agent_class): + with ( + patch(_BUILD_MODEL_PATCH, return_value=_FAKE_MODEL) as mock_build, + patch(_AGENT_PATCH, agent_class), + ): await optimize_instruction( - "inst", - _make_result(), - "criterion", - model="openai/gpt-4o-mini", + "inst", _make_result(), "criterion", model="openai/gpt-4o-mini" ) - # build_model_from_string converts "openai/gpt-4o-mini" -> "openai:gpt-4o-mini" - assert agent_class.call_args[0][0] == "openai:gpt-4o-mini" + mock_build.assert_called_once_with("openai/gpt-4o-mini") async def test_accepts_model_object(self): - """optimize_instruction accepts a pre-built Model object.""" + """optimize_instruction skips build_model_from_string for a pre-built Model object.""" agent_class = _make_agent_mock("inst", "reason", "changes") fake_model = MagicMock() - with patch(_AGENT_PATCH, agent_class): + with patch(_BUILD_MODEL_PATCH) as mock_build, patch(_AGENT_PATCH, agent_class): await optimize_instruction("inst", _make_result(), "criterion", model=fake_model) + mock_build.assert_not_called() assert agent_class.call_args[0][0] is fake_model async def test_includes_criterion_in_prompt(self): agent_class = _make_agent_mock("improved", "reason", "change") agent_instance = agent_class.return_value - with patch(_AGENT_PATCH, agent_class): + with patch(_BUILD_MODEL_PATCH, return_value=_FAKE_MODEL), patch(_AGENT_PATCH, agent_class): await optimize_instruction( "Write code.", _make_result(), "Agent must use type hints on all functions." ) @@ -121,7 +125,7 @@ async def test_includes_criterion_in_prompt(self): async def test_includes_current_instruction_in_prompt(self): agent_class = _make_agent_mock("inst", "reason", "changes") agent_instance = agent_class.return_value - with patch(_AGENT_PATCH, agent_class): + with patch(_BUILD_MODEL_PATCH, return_value=_FAKE_MODEL), patch(_AGENT_PATCH, agent_class): await optimize_instruction( "Always use FastAPI for web APIs.", _make_result(), "criterion" ) @@ -130,7 +134,7 @@ async def test_includes_current_instruction_in_prompt(self): async def test_includes_agent_output_in_prompt(self): agent_class = _make_agent_mock("inst", "reason", "changes") agent_instance = agent_class.return_value - with patch(_AGENT_PATCH, agent_class): + with patch(_BUILD_MODEL_PATCH, return_value=_FAKE_MODEL), patch(_AGENT_PATCH, agent_class): await optimize_instruction( "inst", _make_result(final_response="def add(a, b): return a + b"), "criterion" ) @@ -138,7 +142,7 @@ async def test_includes_agent_output_in_prompt(self): async def test_handles_no_final_response(self): agent_class = _make_agent_mock("inst", "reason", "changes") - with patch(_AGENT_PATCH, agent_class): + with patch(_BUILD_MODEL_PATCH, return_value=_FAKE_MODEL), patch(_AGENT_PATCH, agent_class): result = await optimize_instruction( "inst", CopilotResult(success=False, turns=[]), "criterion" ) @@ -146,14 +150,14 @@ async def test_handles_no_final_response(self): async def test_handles_empty_instruction(self): agent_class = _make_agent_mock("new inst", "reason", "changes") - with patch(_AGENT_PATCH, agent_class): + with patch(_BUILD_MODEL_PATCH, return_value=_FAKE_MODEL), patch(_AGENT_PATCH, agent_class): result = await optimize_instruction("", _make_result(), "criterion") assert isinstance(result, InstructionSuggestion) async def test_includes_tool_calls_in_prompt(self): agent_class = _make_agent_mock("inst", "reason", "changes") agent_instance = agent_class.return_value - with patch(_AGENT_PATCH, agent_class): + with patch(_BUILD_MODEL_PATCH, return_value=_FAKE_MODEL), patch(_AGENT_PATCH, agent_class): await optimize_instruction( "inst", _make_result(tools=["create_file", "read_file"]), "criterion" )