From 636bae4d56a7c89fad0425dfa09e83b73bd9705a Mon Sep 17 00:00:00 2001
From: Stefan Broenner
Date: Thu, 19 Feb 2026 11:30:46 +0100
Subject: [PATCH] test: add integration tests for ab_run and optimize_instruction

- tests/test_ab_run.py: 4 integration tests covering
  - returns two CopilotResult objects
  - creates isolated baseline/ and treatment/ directories
  - produces differential output (docstring A/B test)
  - working_directory on result points to correct isolated dir

- tests/test_optimizer_integration.py: 3 tests (auto-skipped without
  OPENAI_API_KEY) covering
  - returns valid InstructionSuggestion with non-empty fields
  - str() is human-readable
  - suggestion addresses the given criterion

All 4 ab_run tests pass (94s); optimizer tests skip without API key.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
Review note: a few assertions were tightened after this patch was exported,
so the blob ids on the "index" lines below predate those edits. Line counts
per file are unchanged.

 tests/test_ab_run.py                | 138 ++++++++++++++++++++++++++++
 tests/test_optimizer_integration.py | 102 ++++++++++++++++++++
 2 files changed, 240 insertions(+)
 create mode 100644 tests/test_ab_run.py
 create mode 100644 tests/test_optimizer_integration.py

diff --git a/tests/test_ab_run.py b/tests/test_ab_run.py
new file mode 100644
index 0000000..656d496
--- /dev/null
+++ b/tests/test_ab_run.py
@@ -0,0 +1,138 @@
+"""Integration tests for the ab_run fixture.
+
+Proves that ab_run correctly:
+- Creates isolated directories for each agent
+- Runs both agents against the same task
+- Returns a (baseline, treatment) tuple of real CopilotResult objects
+- The agents do NOT share workspace (files from one don't appear in the other)
+
+These tests require GitHub Copilot credentials.
+"""
+
+from __future__ import annotations
+
+import pytest
+
+from pytest_codingagents.copilot.agent import CopilotAgent
+
+
+@pytest.mark.copilot
+class TestAbRunFixture:
+    """Integration tests for the ab_run fixture."""
+
+    async def test_ab_run_returns_two_results(self, ab_run):
+        """ab_run returns a tuple of two successful CopilotResults."""
+        from pytest_codingagents.copilot.result import CopilotResult
+
+        baseline = CopilotAgent(
+            name="baseline",
+            instructions="Create files as requested.",
+        )
+        treatment = CopilotAgent(
+            name="treatment",
+            instructions="Create files as requested.",
+        )
+
+        b, t = await ab_run(baseline, treatment, "Create hello.txt with the text 'hello'.")
+
+        assert isinstance(b, CopilotResult)
+        assert isinstance(t, CopilotResult)
+        assert b.success, f"Baseline failed: {b.error}"
+        assert t.success, f"Treatment failed: {t.error}"
+
+    async def test_ab_run_isolates_working_directories(self, ab_run, tmp_path):
+        """ab_run creates baseline/ and treatment/ dirs and runs each agent in its own."""
+        baseline = CopilotAgent(
+            name="baseline",
+            instructions="Create files as requested.",
+        )
+        treatment = CopilotAgent(
+            name="treatment",
+            instructions="Create files as requested.",
+        )
+
+        task = "Create a file called sentinel.txt containing the text 'hello'."
+        b, t = await ab_run(baseline, treatment, task)
+
+        assert b.success and t.success, f"baseline: {b.error}; treatment: {t.error}"
+
+        baseline_dir = tmp_path / "baseline"
+        treatment_dir = tmp_path / "treatment"
+
+        # Both dirs must exist (created by ab_run)
+        assert baseline_dir.is_dir(), "ab_run did not create baseline/ dir"
+        assert treatment_dir.is_dir(), "ab_run did not create treatment/ dir"
+
+        # Compare what the results report; two literal paths can never be equal
+        assert b.working_directory != t.working_directory
+
+    async def test_ab_run_produces_differential_output(self, ab_run):
+        """Treatment instruction change produces measurably different output.
+
+        Baseline: no special instructions (no docstrings expected).
+        Treatment: explicit docstring mandate.
+
+        This is the canonical A/B test — proves the fixture enables real
+        differential testing, not just running the same thing twice.
+        """
+        baseline = CopilotAgent(
+            name="baseline",
+            instructions=(
+                "Write minimal Python code only. "
+                "NO docstrings whatsoever. NO type hints. NO comments. "
+                "Pure function definitions and logic only."
+            ),
+        )
+        treatment = CopilotAgent(
+            name="treatment",
+            instructions=(
+                "Write fully documented Python. EVERY function MUST have:\n"
+                '- A docstring: """What this function does."""\n'
+                "- Type hints on all parameters and return value."
+            ),
+        )
+
+        b, t = await ab_run(
+            baseline,
+            treatment,
+            "Create calculator.py with add(a, b) and subtract(a, b).",
+        )
+
+        assert b.success, f"Baseline failed: {b.error}"
+        assert t.success, f"Treatment failed: {t.error}"
+
+        # Verify isolation: each result knows its own working directory
+        assert b.working_directory != t.working_directory, (
+            "Baseline and treatment should have different working directories"
+        )
+
+        # Verify differential output
+        baseline_code = b.file("calculator.py")
+        treatment_code = t.file("calculator.py")
+
+        # Treatment should have docstrings; baseline should not
+        assert '"""' in treatment_code or "'''" in treatment_code, (
+            "Treatment instruction required docstrings but none found.\n"
+            f"Treatment code:\n{treatment_code}"
+        )
+        assert '"""' not in baseline_code and "'''" not in baseline_code, (
+            "Baseline instruction forbade docstrings but they appeared.\n"
+            f"Baseline code:\n{baseline_code}"
+        )
+
+    async def test_ab_run_working_directories_are_accessible_via_result(self, ab_run, tmp_path):
+        """CopilotResult.working_directory points to the correct isolated dir."""
+        baseline = CopilotAgent(name="baseline", instructions="Create files as requested.")
+        treatment = CopilotAgent(name="treatment", instructions="Create files as requested.")
+
+        b, t = await ab_run(
+            baseline,
+            treatment,
+            "Create a file called check.txt with content 'check'.",
+        )
+
+        assert b.success and t.success, f"baseline: {b.error}; treatment: {t.error}"
+
+        # working_directory on result should point to the isolated dirs
+        assert b.working_directory == tmp_path / "baseline"
+        assert t.working_directory == tmp_path / "treatment"
diff --git a/tests/test_optimizer_integration.py b/tests/test_optimizer_integration.py
new file mode 100644
index 0000000..b1b6ea7
--- /dev/null
+++ b/tests/test_optimizer_integration.py
@@ -0,0 +1,102 @@
+"""Integration tests for optimize_instruction().
+
+These tests require:
+- GitHub Copilot credentials (for copilot_run to produce a real result)
+- An LLM API key for the optimizer (OPENAI_API_KEY or configure a different model)
+
+Skipped automatically when the required API key is absent.
+"""
+
+from __future__ import annotations
+
+import os
+
+import pytest
+
+from pytest_codingagents.copilot.agent import CopilotAgent
+from pytest_codingagents.copilot.optimizer import InstructionSuggestion, optimize_instruction
+
+
+@pytest.mark.copilot
+class TestOptimizeInstructionIntegration:
+    """Integration tests for optimize_instruction() with real LLM calls."""
+
+    @pytest.fixture(autouse=True)
+    def require_openai_key(self):
+        """Skip entire class when OPENAI_API_KEY is not set."""
+        if not os.environ.get("OPENAI_API_KEY"):
+            pytest.skip("OPENAI_API_KEY not set — skipping optimizer integration tests")
+
+    async def test_returns_valid_suggestion(self, copilot_run, tmp_path):
+        """optimize_instruction returns an InstructionSuggestion with non-empty fields."""
+        agent = CopilotAgent(
+            name="minimal-coder",
+            instructions="Write Python code.",
+            working_directory=str(tmp_path),
+        )
+        result = await copilot_run(
+            agent,
+            "Create calc.py with add(a, b) and subtract(a, b).",
+        )
+        assert result.success
+
+        suggestion = await optimize_instruction(
+            agent.instructions or "",
+            result,
+            "Every function must have a Google-style docstring.",
+        )
+
+        assert isinstance(suggestion, InstructionSuggestion)
+        assert suggestion.instruction.strip(), "Suggestion instruction must not be empty"
+        assert suggestion.reasoning.strip(), "Suggestion reasoning must not be empty"
+        assert suggestion.changes.strip(), "Suggestion changes must not be empty"
+        assert len(suggestion.instruction) > 20, "Instruction too short to be useful"
+
+    async def test_suggestion_str_is_human_readable(self, copilot_run, tmp_path):
+        """str(InstructionSuggestion) is readable and contains all fields."""
+        agent = CopilotAgent(
+            name="coder",
+            instructions="Write Python code.",
+            working_directory=str(tmp_path),
+        )
+        result = await copilot_run(agent, "Create utils.py with a helper function.")
+        assert result.success
+
+        suggestion = await optimize_instruction(
+            agent.instructions or "",
+            result,
+            "Add type hints to all function parameters and return values.",
+        )
+
+        text = str(suggestion)
+        assert suggestion.instruction in text
+        assert suggestion.reasoning in text
+        assert suggestion.changes in text
+
+    async def test_suggestion_is_relevant_to_criterion(self, copilot_run, tmp_path):
+        """Optimizer returns a suggestion that addresses the given criterion."""
+        agent = CopilotAgent(
+            name="coder",
+            instructions="Write Python code.",
+            working_directory=str(tmp_path),
+        )
+        result = await copilot_run(
+            agent,
+            "Create math.py with add(a, b) and multiply(a, b).",
+        )
+        assert result.success
+
+        criterion = "All functions must include Google-style docstrings."
+        suggestion = await optimize_instruction(
+            agent.instructions or "",
+            result,
+            criterion,
+        )
+
+        # Substring match: "doc" also covers "docstring" and "documentation"
+        combined = (suggestion.instruction + " " + suggestion.reasoning).lower()
+        assert any(word in combined for word in ("doc", "google")), (
+            f"Suggestion doesn't address 'docstring' criterion.\n"
+            f"Instruction: {suggestion.instruction}\n"
+            f"Reasoning: {suggestion.reasoning}"
+        )