Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion src/pytest_codingagents/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,10 @@

from pytest_codingagents.copilot.agent import CopilotAgent
from pytest_codingagents.copilot.agents import load_custom_agent, load_custom_agents
from pytest_codingagents.copilot.optimizer import InstructionSuggestion, optimize_instruction
from pytest_codingagents.copilot.optimizer import (
InstructionSuggestion,
optimize_instruction,
)
from pytest_codingagents.copilot.result import CopilotResult

__all__ = [
Expand Down
46 changes: 27 additions & 19 deletions src/pytest_codingagents/copilot/optimizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,18 @@
between a current agent instruction and the observed behavior, and suggests a
concrete improvement.

Requires ``pydantic-ai``:
Model strings follow the same ``provider/model`` format used by
``pytest-aitest`` (e.g. ``"azure/gpt-5.2-chat"``, ``"openai/gpt-4o-mini"``).
Azure Entra ID authentication is handled automatically when
``AZURE_API_BASE`` or ``AZURE_OPENAI_ENDPOINT`` is set.

uv add pydantic-ai
Example::

suggestion = await optimize_instruction(
agent.instructions or "",
result,
"Agent should add docstrings.",
)
"""

from __future__ import annotations
Expand All @@ -15,6 +24,9 @@
from typing import TYPE_CHECKING

from pydantic import BaseModel
from pydantic_ai import Agent as PydanticAgent
from pydantic_ai.models import Model
from pytest_aitest.execution.pydantic_adapter import build_model_from_string

if TYPE_CHECKING:
from pytest_codingagents.copilot.result import CopilotResult
Expand Down Expand Up @@ -70,7 +82,7 @@ async def optimize_instruction(
result: CopilotResult,
criterion: str,
*,
model: str = "openai:gpt-4o-mini",
model: str | Model = "azure/gpt-5.2-chat",
) -> InstructionSuggestion:
"""Analyze a result and suggest an improved instruction.

Expand All @@ -79,7 +91,11 @@ async def optimize_instruction(
concrete, actionable improvement.

Designed to drop into ``pytest.fail()`` so the failure message
contains a ready-to-use fix:
contains a ready-to-use fix.

Model strings follow the same ``provider/model`` format used by
``pytest-aitest``. Azure Entra ID auth is handled automatically
when ``AZURE_API_BASE`` or ``AZURE_OPENAI_ENDPOINT`` is set.

Example::

Expand All @@ -97,24 +113,16 @@ async def optimize_instruction(
result: The ``CopilotResult`` from the (failed) run.
criterion: What the agent *should* have done — the test expectation
in plain English (e.g. ``"Always write docstrings"``).
model: LiteLLM-style model string (e.g. ``"openai:gpt-4o-mini"``
or ``"anthropic:claude-3-haiku-20240307"``).
model: Provider/model string (e.g. ``"azure/gpt-5.2-chat"``,
``"openai/gpt-4o-mini"``) or a pre-configured pydantic-ai
``Model`` object. Defaults to ``"azure/gpt-5.2-chat"``.

Returns:
An :class:`InstructionSuggestion` with the improved instruction.

Raises:
ImportError: If pydantic-ai is not installed.
"""
try:
from pydantic_ai import Agent as PydanticAgent
except ImportError as exc:
msg = (
"pydantic-ai is required for optimize_instruction(). "
"Install it with: uv add pydantic-ai"
)
raise ImportError(msg) from exc

resolved_model: str | Model = (
build_model_from_string(model) if isinstance(model, str) else model
)
final_output = result.final_response or "(no response)"
tool_calls = ", ".join(sorted(result.tool_names_called)) or "none"

Expand Down Expand Up @@ -142,7 +150,7 @@ async def optimize_instruction(
that would make the agent satisfy the criterion.
Keep the instruction under 200 words. Do not add unrelated rules."""

optimizer_agent = PydanticAgent(model, output_type=_OptimizationOutput)
optimizer_agent = PydanticAgent(resolved_model, output_type=_OptimizationOutput)
run_result = await optimizer_agent.run(prompt)
output = run_result.output

Expand Down
75 changes: 68 additions & 7 deletions tests/test_optimizer_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,9 @@

These tests require:
- GitHub Copilot credentials (for copilot_run to produce a real result)
- An LLM API key for the optimizer (OPENAI_API_KEY or configure a different model)
- AZURE_API_BASE or AZURE_OPENAI_ENDPOINT env var (for the optimizer LLM via Azure Entra ID)

Skipped automatically when the required API key is absent.
Skipped automatically when neither AZURE_API_BASE nor AZURE_OPENAI_ENDPOINT is set.
"""

from __future__ import annotations
Expand All @@ -19,13 +19,13 @@

@pytest.mark.copilot
class TestOptimizeInstructionIntegration:
"""Integration tests for optimize_instruction() with real LLM calls."""
"""Integration tests for optimize_instruction() with real Azure LLM calls."""

@pytest.fixture(autouse=True)
def require_openai_key(self):
"""Skip entire class when OPENAI_API_KEY is not set."""
if not os.environ.get("OPENAI_API_KEY"):
pytest.skip("OPENAI_API_KEY not set — skipping optimizer integration tests")
def require_azure_endpoint(self):
"""Skip entire class when AZURE_OPENAI_ENDPOINT is not set."""
if not os.environ.get("AZURE_OPENAI_ENDPOINT") and not os.environ.get("AZURE_API_BASE"):
pytest.skip("AZURE_OPENAI_ENDPOINT not set — skipping optimizer integration tests")

async def test_returns_valid_suggestion(self, copilot_run, tmp_path):
"""optimize_instruction returns an InstructionSuggestion with non-empty fields."""
Expand Down Expand Up @@ -100,3 +100,64 @@ async def test_suggestion_is_relevant_to_criterion(self, copilot_run, tmp_path):
f"Instruction: {suggestion.instruction}\n"
f"Reasoning: {suggestion.reasoning}"
)

async def test_full_optimize_loop(self, copilot_run, tmp_path):
    """Exercise the complete test -> optimize -> re-test cycle end to end.

    Steps performed:

    1. Run Copilot with an instruction that explicitly discourages
       documentation and record whether the generated ``calculator.py``
       contains any triple-quoted string (recorded only, not asserted).
    2. Pass that instruction, the first result, and the docstring criterion
       to ``optimize_instruction()`` and require a non-blank suggestion.
    3. Run Copilot again using the suggested instruction and require that
       the new ``calculator.py`` contains a triple-quoted string.

    Each round uses its own sub-directory of ``tmp_path``.
    """
    criterion = "Every function must include a Google-style docstring."
    task = "Create calculator.py with add(a, b) and subtract(a, b) functions."
    # Presence of either triple-quote style is treated as "has docstrings".
    docstring_markers = ('"""', "'''")

    # --- Round 1: weak instruction, expect no docstrings ---
    first_dir = tmp_path / "round1"
    weak_agent = CopilotAgent(
        name="weak-coder",
        instructions="Write minimal Python code. No comments or documentation needed.",
        working_directory=str(first_dir),
    )
    first_dir.mkdir()
    first_result = await copilot_run(weak_agent, task)
    assert first_result.success, "Round 1 Copilot run failed"

    first_code = first_result.file("calculator.py") or ""
    documented_before = any(marker in first_code for marker in docstring_markers)

    # --- Optimize ---
    suggestion = await optimize_instruction(
        weak_agent.instructions or "",
        first_result,
        criterion,
    )
    assert suggestion.instruction.strip(), "Optimizer returned empty instruction"
    print(f"\n💡 Suggested instruction:\n{suggestion}")  # visible in -s output

    # --- Round 2: improved instruction ---
    second_dir = tmp_path / "round2"
    improved_agent = CopilotAgent(
        name="improved-coder",
        instructions=suggestion.instruction,
        working_directory=str(second_dir),
    )
    second_dir.mkdir()
    second_result = await copilot_run(improved_agent, task)
    assert second_result.success, "Round 2 Copilot run failed"

    second_code = second_result.file("calculator.py") or ""
    documented_after = any(marker in second_code for marker in docstring_markers)

    assert documented_after, (
        f"Round 2 code still has no docstrings after optimization.\n"
        f"Suggested instruction: {suggestion.instruction}\n"
        f"Round 2 code:\n{second_code}"
    )
    print(
        f"\n✅ Loop complete. "
        f"Docstrings round 1: {documented_before}, round 2: {documented_after}"
    )
Loading