Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
138 changes: 138 additions & 0 deletions tests/test_ab_run.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
"""Integration tests for the ab_run fixture.

Proves that ab_run correctly:
- Creates isolated directories for each agent
- Runs both agents against the same task
- Returns a (baseline, treatment) tuple of real CopilotResult objects
- The agents do NOT share workspace (files from one don't appear in the other)

These tests require GitHub Copilot credentials.
"""

from __future__ import annotations

import pytest

from pytest_codingagents.copilot.agent import CopilotAgent


@pytest.mark.copilot
class TestAbRunFixture:
    """Integration tests for the ab_run fixture.

    Each test builds a (baseline, treatment) pair of CopilotAgents and
    drives them through ab_run, then asserts on the returned
    CopilotResult pair. Requires GitHub Copilot credentials.
    """

    async def test_ab_run_returns_two_results(self, ab_run):
        """ab_run returns a tuple of two successful CopilotResults."""
        # Imported lazily inside the test so module collection does not
        # depend on the result module.
        from pytest_codingagents.copilot.result import CopilotResult

        baseline = CopilotAgent(
            name="baseline",
            instructions="Create files as requested.",
        )
        treatment = CopilotAgent(
            name="treatment",
            instructions="Create files as requested.",
        )

        b, t = await ab_run(baseline, treatment, "Create hello.txt with the text 'hello'.")

        assert isinstance(b, CopilotResult)
        assert isinstance(t, CopilotResult)
        assert b.success, f"Baseline failed: {b.error}"
        assert t.success, f"Treatment failed: {t.error}"

    async def test_ab_run_isolates_working_directories(self, ab_run, tmp_path):
        """Each agent runs in its own isolated directory under tmp_path.

        Both agents get the same task, so each must produce its OWN copy of
        sentinel.txt inside its own workspace — proving ab_run ran them in
        separate directories rather than a shared one.
        """
        baseline = CopilotAgent(
            name="baseline",
            instructions="Create files as requested.",
        )
        treatment = CopilotAgent(
            name="treatment",
            instructions="Create files as requested.",
        )

        task = "Create a file called sentinel.txt containing the text 'hello'."
        b, t = await ab_run(baseline, treatment, task)

        assert b.success and t.success

        baseline_dir = tmp_path / "baseline"
        treatment_dir = tmp_path / "treatment"

        # Both dirs must exist (created by ab_run)
        assert baseline_dir.exists(), "ab_run did not create baseline/ dir"
        assert treatment_dir.exists(), "ab_run did not create treatment/ dir"

        # Each agent must have written its sentinel into its OWN workspace.
        # (Comparing baseline_dir != treatment_dir would be trivially true —
        # they are two locally constructed paths — so check actual output.)
        assert (baseline_dir / "sentinel.txt").exists(), (
            "Baseline agent did not write sentinel.txt into its isolated dir"
        )
        assert (treatment_dir / "sentinel.txt").exists(), (
            "Treatment agent did not write sentinel.txt into its isolated dir"
        )

        # The results themselves must report distinct working directories.
        assert b.working_directory != t.working_directory

    async def test_ab_run_produces_differential_output(self, ab_run):
        """Treatment instruction change produces measurably different output.

        Baseline: no special instructions (no docstrings expected).
        Treatment: explicit docstring mandate.

        This is the canonical A/B test — proves the fixture enables real
        differential testing, not just running the same thing twice.
        """
        baseline = CopilotAgent(
            name="baseline",
            instructions=(
                "Write minimal Python code only. "
                "NO docstrings whatsoever. NO type hints. NO comments. "
                "Pure function definitions and logic only."
            ),
        )
        treatment = CopilotAgent(
            name="treatment",
            instructions=(
                "Write fully documented Python. EVERY function MUST have:\n"
                '- A docstring: """What this function does."""\n'
                "- Type hints on all parameters and return value."
            ),
        )

        b, t = await ab_run(
            baseline,
            treatment,
            "Create calculator.py with add(a, b) and subtract(a, b).",
        )

        assert b.success, f"Baseline failed: {b.error}"
        assert t.success, f"Treatment failed: {t.error}"

        # Verify isolation: each result knows its own working directory
        assert b.working_directory != t.working_directory, (
            "Baseline and treatment should have different working directories"
        )

        # Verify differential output
        baseline_code = b.file("calculator.py")
        treatment_code = t.file("calculator.py")

        # Treatment should have docstrings; baseline should not
        assert '"""' in treatment_code or "'''" in treatment_code, (
            "Treatment instruction required docstrings but none found.\n"
            f"Treatment code:\n{treatment_code}"
        )
        assert '"""' not in baseline_code and "'''" not in baseline_code, (
            "Baseline instruction forbade docstrings but they appeared.\n"
            f"Baseline code:\n{baseline_code}"
        )

    async def test_ab_run_working_directories_are_accessible_via_result(self, ab_run, tmp_path):
        """CopilotResult.working_directory points to the correct isolated dir."""
        baseline = CopilotAgent(name="baseline", instructions="Create files as requested.")
        treatment = CopilotAgent(name="treatment", instructions="Create files as requested.")

        b, t = await ab_run(
            baseline,
            treatment,
            "Create a file called check.txt with content 'check'.",
        )

        assert b.success and t.success

        # working_directory on result should point to the isolated dirs
        assert b.working_directory == tmp_path / "baseline"
        assert t.working_directory == tmp_path / "treatment"
102 changes: 102 additions & 0 deletions tests/test_optimizer_integration.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
"""Integration tests for optimize_instruction().

These tests require:
- GitHub Copilot credentials (for copilot_run to produce a real result)
- An LLM API key for the optimizer (OPENAI_API_KEY or configure a different model)

Skipped automatically when the required API key is absent.
"""

from __future__ import annotations

import os

import pytest

from pytest_codingagents.copilot.agent import CopilotAgent
from pytest_codingagents.copilot.optimizer import InstructionSuggestion, optimize_instruction


@pytest.mark.copilot
class TestOptimizeInstructionIntegration:
    """Integration tests for optimize_instruction() with real LLM calls."""

    @pytest.fixture(autouse=True)
    def require_openai_key(self):
        """Skip entire class when OPENAI_API_KEY is not set."""
        key = os.environ.get("OPENAI_API_KEY")
        if not key:
            pytest.skip("OPENAI_API_KEY not set — skipping optimizer integration tests")

    def _agent(self, name: str, tmp_path) -> CopilotAgent:
        """Build a CopilotAgent with the shared minimal coding instructions."""
        return CopilotAgent(
            name=name,
            instructions="Write Python code.",
            working_directory=str(tmp_path),
        )

    async def test_returns_valid_suggestion(self, copilot_run, tmp_path):
        """optimize_instruction returns an InstructionSuggestion with non-empty fields."""
        agent = self._agent("minimal-coder", tmp_path)
        run_result = await copilot_run(
            agent,
            "Create calc.py with add(a, b) and subtract(a, b).",
        )
        assert run_result.success

        suggestion = await optimize_instruction(
            agent.instructions or "",
            run_result,
            "Every function must have a Google-style docstring.",
        )

        assert isinstance(suggestion, InstructionSuggestion)
        assert suggestion.instruction.strip(), "Suggestion instruction must not be empty"
        assert suggestion.reasoning.strip(), "Suggestion reasoning must not be empty"
        assert suggestion.changes.strip(), "Suggestion changes must not be empty"
        assert len(suggestion.instruction) > 20, "Instruction too short to be useful"

    async def test_suggestion_str_is_human_readable(self, copilot_run, tmp_path):
        """str(InstructionSuggestion) is readable and contains all fields."""
        agent = self._agent("coder", tmp_path)
        run_result = await copilot_run(agent, "Create utils.py with a helper function.")
        assert run_result.success

        suggestion = await optimize_instruction(
            agent.instructions or "",
            run_result,
            "Add type hints to all function parameters and return values.",
        )

        # Every field must survive the str() rendering verbatim.
        rendered = str(suggestion)
        for part in (suggestion.instruction, suggestion.reasoning, suggestion.changes):
            assert part in rendered

    async def test_suggestion_is_relevant_to_criterion(self, copilot_run, tmp_path):
        """Optimizer returns a suggestion that addresses the given criterion."""
        agent = self._agent("coder", tmp_path)
        run_result = await copilot_run(
            agent,
            "Create math.py with add(a, b) and multiply(a, b).",
        )
        assert run_result.success

        criterion = "All functions must include Google-style docstrings."
        suggestion = await optimize_instruction(
            agent.instructions or "",
            run_result,
            criterion,
        )

        # The suggestion instruction should mention docstrings somehow
        combined = " ".join((suggestion.instruction, suggestion.reasoning)).lower()
        keywords = ("docstring", "doc", "documentation", "google")
        assert any(needle in combined for needle in keywords), (
            f"Suggestion doesn't address 'docstring' criterion.\n"
            f"Instruction: {suggestion.instruction}\n"
            f"Reasoning: {suggestion.reasoning}"
        )