diff --git a/README.md b/README.md index b33ff01..fc31a89 100644 --- a/README.md +++ b/README.md @@ -125,28 +125,35 @@ If you’re experimenting with Evolve or exploring on‑the‑job learning for a ### Running Tests -```bash -uv run pytest -``` - -#### Phoenix Sync Tests - -Tests for the Phoenix trajectory sync functionality are **skipped by default** since they require familiarity with the Phoenix integration. To include them: - -```bash -# Run all tests including Phoenix tests -uv run pytest --run-phoenix - -# Run only Phoenix tests -uv run pytest -m phoenix -``` - -#### End-to-End (E2E) Low-Code Verification - -To run the full end-to-end verification pipeline (Agent -> Trace -> Tip): - -```bash -EVOLVE_E2E=true uv run pytest tests/e2e/test_e2e_pipeline.py -s -``` - -See [docs/LOW_CODE_TRACING.md](docs/LOW_CODE_TRACING.md#end-to-end-verification) for more details. +The test suite is organized into 4 cleanly isolated tiers (`unit`, `platform_integrations`, `e2e`, and `llm`) depending on infrastructure requirements; the default run listed first simply combines the two local tiers: + +1. **Default Local Suite** + Runs both fast logic tests (`unit`) and filesystem script verifications (`platform_integrations`). + ```bash + uv run pytest + ``` + +2. **Unit Tests (Only)** + Fast, fully-mocked tests verifying core logic and offline pipeline schemas. + ```bash + uv run pytest -m unit + ``` + +3. **Platform Integration Tests** + Fast filesystem-level integration tests verifying local tool installation and idempotency. + ```bash + uv run pytest -m platform_integrations + ``` + +4. **End-to-End Infrastructure Tests** + Heavy tests that autonomously spin up a background Phoenix server and simulate full agent workflows. + ```bash + uv run pytest -m e2e --run-e2e + ``` + *(See [docs/LOW_CODE_TRACING.md](docs/LOW_CODE_TRACING.md#end-to-end-verification) for more details).* + +5. **LLM Evaluation Tests** + Tests needing active LLM inference to test resolution pipelines (requires LLM API keys). 
+ ```bash + uv run pytest -m llm + ``` diff --git a/README_phoenix_sync.md b/README_phoenix_sync.md index 8049af4..cf4953f 100644 --- a/README_phoenix_sync.md +++ b/README_phoenix_sync.md @@ -32,20 +32,20 @@ No additional dependencies required - uses only stdlib for Phoenix API calls. ```bash # Basic sync with defaults -uv run python -m evolve.cli.cli sync phoenix +uv run evolve sync phoenix # Custom Phoenix URL and namespace -uv run python -m evolve.cli.cli sync phoenix \ +uv run evolve sync phoenix \ --url http://phoenix.example.com:6006 \ --namespace my_namespace # Fetch more spans and include errors -uv run python -m evolve.cli.cli sync phoenix \ +uv run evolve sync phoenix \ --limit 500 \ --include-errors # Full options -uv run python -m evolve.cli.cli sync phoenix \ +uv run evolve sync phoenix \ --url http://localhost:6006 \ --namespace production \ --project my_project \ @@ -145,7 +145,7 @@ Two entity types are stored: ```bash # Sync every hour -0 * * * * cd /path/to/evolve && uv run python -m evolve.cli.cli sync phoenix --limit 100 +0 * * * * cd /path/to/evolve && uv run evolve sync phoenix --limit 100 ``` ### Systemd Timer @@ -158,7 +158,7 @@ Description=Evolve Phoenix Sync [Service] Type=oneshot WorkingDirectory=/path/to/evolve -ExecStart=/path/to/uv run python -m evolve.cli.cli sync phoenix +ExecStart=/path/to/uv run evolve sync phoenix Environment=PHOENIX_URL=http://localhost:6006 Environment=EVOLVE_NAMESPACE_ID=production ``` diff --git a/docs/LOW_CODE_TRACING.md b/docs/LOW_CODE_TRACING.md index a6f6122..8294f10 100644 --- a/docs/LOW_CODE_TRACING.md +++ b/docs/LOW_CODE_TRACING.md @@ -200,7 +200,7 @@ curl "http://localhost:6006/v1/projects/test-agent/spans?limit=5" cd evolve_repo EVOLVE_BACKEND=filesystem \ EVOLVE_TIPS_MODEL="gpt-4" \ -uv run python -m evolve.frontend.cli.cli sync phoenix \ +uv run evolve sync phoenix \ --project test-agent \ --include-errors ``` @@ -209,7 +209,7 @@ uv run python -m evolve.frontend.cli.cli sync phoenix \ ```bash 
EVOLVE_BACKEND=filesystem \ -uv run python -m evolve.frontend.cli.cli entities list evolve --type guideline +uv run evolve entities list evolve --type guideline ``` ### 6. Understanding Tip Provenance (Metadata) @@ -246,7 +246,7 @@ Evolve includes a comprehensive E2E verification suite to ensure that tracing an You can run the full regression suite using `pytest`: ```bash -EVOLVE_E2E=true uv run pytest tests/e2e/test_e2e_pipeline.py -s +uv run pytest -m e2e --run-e2e -s ``` ### Running Specific Tests @@ -255,10 +255,10 @@ To test a specific agent framework: ```bash # Test smolagents -EVOLVE_E2E=true uv run pytest tests/e2e/test_e2e_pipeline.py -k smolagents -s +uv run pytest tests/e2e/test_e2e_pipeline.py -k smolagents -m e2e --run-e2e -s # Test OpenAI Agents -EVOLVE_E2E=true uv run pytest tests/e2e/test_e2e_pipeline.py -k openai_agents -s +uv run pytest tests/e2e/test_e2e_pipeline.py -k openai_agents -m e2e --run-e2e -s ``` ### What It Tests diff --git a/pyproject.toml b/pyproject.toml index 7cb9539..cbaae11 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -48,7 +48,9 @@ dev = [ "anyio", "detect-secrets", "mypy", + "pgvector>=0.3", "pre-commit", + "psycopg[binary]>=3.1", "pytest", "pytest-cov", "pytest-retry", @@ -85,12 +87,12 @@ evolve = ["**/*.jinja2"] package = true [tool.pytest.ini_options] -addopts = "--ignore=explorations -m 'not phoenix and not llm'" +addopts = "--ignore=explorations -m 'not llm and not e2e'" markers = [ "e2e", "unit", - "phoenix", - "llm" + "llm", + "platform_integrations" ] anyio_mode = "auto" diff --git a/tests/conftest.py b/tests/conftest.py index 38a8919..1c48fc2 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -31,22 +31,19 @@ def mock_sentence_transformer(request): def pytest_addoption(parser): """Add custom command line options.""" parser.addoption( - "--run-phoenix", + "--run-e2e", action="store_true", default=False, - help="Run Phoenix sync tests (skipped by default)", + help="Run End-to-End infrastructure tests 
(skipped by default)", ) def pytest_configure(config): - """Override marker filter when --run-phoenix is passed.""" - if config.getoption("--run-phoenix"): - # Remove the default marker filter to include phoenix tests - # Get current markexpr and modify it - markexpr = config.getoption("markexpr", default="") - if markexpr == "not phoenix": - config.option.markexpr = "" - elif "not phoenix" in markexpr: - # Remove "not phoenix" from the expression - new_expr = markexpr.replace("not phoenix and ", "").replace(" and not phoenix", "").replace("not phoenix", "") - config.option.markexpr = new_expr.strip() + """Override marker filter when relevant flags are passed.""" + new_expr = config.getoption("markexpr", default="") + + if config.getoption("--run-e2e"): + # Remove "not e2e" from the expression + new_expr = new_expr.replace("not e2e and ", "").replace(" and not e2e", "").replace("not e2e", "") + + config.option.markexpr = new_expr.strip() diff --git a/tests/e2e/test_e2e_pipeline.py b/tests/e2e/test_e2e_pipeline.py index ab74094..898756e 100644 --- a/tests/e2e/test_e2e_pipeline.py +++ b/tests/e2e/test_e2e_pipeline.py @@ -4,10 +4,10 @@ import os import datetime import pytest -from evolve.config.phoenix import phoenix_settings +import urllib.request +import urllib.error # Configuration -PHOENIX_URL = phoenix_settings.url # Use a session-scope timestamp or generate per test? # Per-test ensures no collisions even if run in parallel (though these should satisfy sequential) TIMESTAMP = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") @@ -20,10 +20,65 @@ ] +@pytest.fixture(scope="session", autouse=True) +def phoenix_server(): + """Ensure a Phoenix server is running before executing E2E tests, and shut it down afterward.""" + # 1. 
Check if it's already running locally + try: + urllib.request.urlopen("http://localhost:6006/status", timeout=2) + print("\nPhoenix is already running on port 6006.") + yield "http://localhost:6006" + return + except (urllib.error.URLError, ConnectionError): + pass + + import sys + + print("\nStarting local Phoenix server for E2E tests...") + + env = os.environ.copy() + env["PHOENIX_PORT"] = "6006" + + # Start it using the current python executable to avoid 'uv run' overhead + # We use run_in_thread=True and a sleepy while loop because run_in_thread=False + # can crash the fastAPI uvicorn startup in some MacOS environments. + script = "import phoenix as px; import time; px.launch_app(run_in_thread=True); import sys; sys.stdout.flush(); time.sleep(86400)" + + proc = subprocess.Popen([sys.executable, "-c", script], env=env, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE, text=True) + + # Poll until the server is responsive + max_retries = 30 + for _ in range(max_retries): + try: + # specifically hit the status endpoint + urllib.request.urlopen("http://localhost:6006/status", timeout=2) + print("Phoenix server is up and running.") + break + except Exception: + # Also check if process crashed early + if proc.poll() is not None: + stderr_output = proc.stderr.read() if proc.stderr else "Unknown error" + pytest.fail(f"Phoenix server process crashed unexpectedly: {stderr_output}") + time.sleep(1) + else: + proc.terminate() + stderr_output = proc.stderr.read() if proc.stderr else "Unknown error" + pytest.fail(f"Failed to start local Phoenix server within 30 seconds. 
Stderr: {stderr_output}") + + yield "http://localhost:6006" + + # Cleanup: shut down Phoenix when tests are done + print("\nShutting down local Phoenix server...") + proc.terminate() + try: + proc.wait(timeout=5) + except subprocess.TimeoutExpired: + proc.kill() + + @pytest.mark.e2e -@pytest.mark.phoenix @pytest.mark.parametrize("agent_config", AGENTS_TO_TEST, ids=[a["name"] for a in AGENTS_TO_TEST]) -def test_e2e_pipeline_agent(agent_config): +def test_e2e_pipeline_agent(agent_config, phoenix_server): """ Runs the full E2E pipeline for a specific agent configuration: 1. Executing the agent script @@ -58,7 +113,16 @@ def test_e2e_pipeline_agent(agent_config): if not os.path.exists(script_path): pytest.fail(f"Script not found: {script_path}") - result = subprocess.run(["uv", "run", "python", script_path], env=env, capture_output=True, text=True) + try: + result = subprocess.run(["uv", "run", "python", script_path], env=env, capture_output=True, text=True, timeout=90) + except subprocess.TimeoutExpired as e: + print("❌ Agent execution timed out after 90s") + # Still try to capture what we can from stdout/stderr if possible + stdout = e.stdout if e.stdout else "" + stderr = e.stderr if e.stderr else "" + print("STDOUT:", stdout) + print("STDERR:", stderr) + pytest.fail(f"Agent execution timed out for {agent_name}") if result.returncode != 0: print(f"❌ Agent failed with exit code {result.returncode}") @@ -78,7 +142,7 @@ def test_e2e_pipeline_agent(agent_config): import phoenix as px import sys try: - c = px.Client(endpoint='{PHOENIX_URL}') + c = px.Client(endpoint='{phoenix_server}') df = c.get_spans_dataframe(project_name='{project_name}') if df is not None and not df.empty: print(f"FOUND_TRACES:{{len(df)}}") @@ -87,7 +151,11 @@ def test_e2e_pipeline_agent(agent_config): except Exception as e: print(f"ERROR:{{e}}") """ - result = subprocess.run(["uv", "run", "python", "-c", check_script], capture_output=True, text=True) + try: + result = subprocess.run(["uv", "run", 
"python", "-c", check_script], capture_output=True, text=True, timeout=30) + except subprocess.TimeoutExpired: + print("❌ Phoenix trace verification script timed out") + pytest.fail(f"Trace verification timed out for {project_name}") output = result.stdout + result.stderr if "FOUND_TRACES" in output: @@ -103,9 +171,7 @@ def test_e2e_pipeline_agent(agent_config): sync_command = [ "uv", "run", - "python", - "-m", - "evolve.frontend.cli.cli", + "evolve", "sync", "phoenix", "--project", @@ -128,36 +194,42 @@ def test_e2e_pipeline_agent(agent_config): tips_found = False sync_start = time.time() timeout = 120 # 2 minute timeout for sync + output_lines = [] try: while True: if time.time() - sync_start > timeout: - print("❌ Timeout waiting for tips generation") + print(f"❌ Timeout waiting for tips generation ({timeout}s)") break line = process.stdout.readline() - if not line and process.poll() is not None: - break - - if line: - line_stripped = line.strip() - # print(f"[Sync] {line_stripped}") # Optional: verbose logging - - # Check target log pattern - match = re.search(r"generated (\d+) tips", line_stripped) - if match: - count = match.group(1) - print(f"\n✅ SUCCESS: Generated {count} tips!") - tips_found = True + if not line: + if process.poll() is not None: break + time.sleep(0.1) # Avoid tight loop if no output but process alive + continue + + output_lines.append(line) + line_stripped = line.strip() + # print(f"[Sync] {line_stripped}") # Optional: verbose logging + + # Check target log pattern + match = re.search(r"generated (\d+) tips", line_stripped) + if match: + count = match.group(1) + print(f"\n✅ SUCCESS: Generated {count} tips!") + tips_found = True + break finally: if process.poll() is None: print("Stopping sync process...") process.terminate() try: - process.wait(timeout=5) + process.wait(timeout=10) except subprocess.TimeoutExpired: process.kill() if not tips_found: + full_output = "".join(output_lines) + print(f"Final Sync Output:\n{full_output}") 
pytest.fail(f"Failed to detect tip generation for {agent_name} within {timeout}s.") diff --git a/tests/unit/test_cli.py b/tests/unit/test_cli.py index 6520591..08ab136 100644 --- a/tests/unit/test_cli.py +++ b/tests/unit/test_cli.py @@ -559,7 +559,6 @@ def test_sync_help(self): @pytest.mark.unit -@pytest.mark.phoenix class TestSyncPhoenix: """Tests for 'evolve sync phoenix' command.""" diff --git a/tests/unit/test_client.py b/tests/unit/test_client.py index 89324fa..c2ec8d4 100644 --- a/tests/unit/test_client.py +++ b/tests/unit/test_client.py @@ -12,9 +12,13 @@ from evolve.frontend.client.evolve_client import EvolveClient +from evolve.config.evolve import EvolveConfig + + @pytest.fixture(scope="module") def evolve_client() -> EvolveClient: - evolve_client = EvolveClient() + config = EvolveConfig(backend="filesystem") + evolve_client = EvolveClient(config=config) return evolve_client diff --git a/tests/unit/test_extract_trajectories.py b/tests/unit/test_extract_trajectories.py index 9edd394..87c62ab 100644 --- a/tests/unit/test_extract_trajectories.py +++ b/tests/unit/test_extract_trajectories.py @@ -21,9 +21,8 @@ get_trajectories, ) -# Mark all tests in this module as phoenix tests (skipped by default) -pytestmark = pytest.mark.phoenix - +# Mark all tests in this module as unit tests +pytestmark = pytest.mark.unit # ============================================================================= # parse_content() Tests diff --git a/tests/unit/test_mcp_server.py b/tests/unit/test_mcp_server.py index 458136d..6e2d0f9 100644 --- a/tests/unit/test_mcp_server.py +++ b/tests/unit/test_mcp_server.py @@ -6,6 +6,8 @@ from evolve.frontend.mcp.mcp_server import save_trajectory, create_entity from evolve.schema.conflict_resolution import EntityUpdate +pytestmark = pytest.mark.unit + @pytest.fixture def mock_get_client(): diff --git a/tests/unit/test_phoenix_sync.py b/tests/unit/test_phoenix_sync.py index 59018ca..4e4621f 100644 --- a/tests/unit/test_phoenix_sync.py +++ 
b/tests/unit/test_phoenix_sync.py @@ -8,8 +8,8 @@ from evolve.sync.phoenix_sync import PhoenixSync, SyncResult from evolve.schema.tips import TipGenerationResult -# Mark all tests in this module as phoenix tests (skipped by default) -pytestmark = pytest.mark.phoenix +# Mark all tests in this module as unit tests +pytestmark = pytest.mark.unit @pytest.fixture diff --git a/tests/unit/test_tracing.py b/tests/unit/test_tracing.py index a1adbba..0d168f7 100644 --- a/tests/unit/test_tracing.py +++ b/tests/unit/test_tracing.py @@ -12,6 +12,9 @@ import pytest from unittest.mock import patch, MagicMock +# Mark all tests in this module as unit tests +pytestmark = pytest.mark.unit + class TestFrameworkDetection: """Tests for detect_installed_frameworks()""" @@ -197,7 +200,6 @@ def test_returns_provider_after_setup(self): auto._tracer_provider = original_provider -@pytest.mark.unit class TestTracingIntegration: """Integration-style tests for the tracing module.""" diff --git a/uv.lock b/uv.lock index 3ec0f26..114c320 100644 --- a/uv.lock +++ b/uv.lock @@ -1050,7 +1050,9 @@ dev = [ { name = "anyio" }, { name = "detect-secrets" }, { name = "mypy" }, + { name = "pgvector" }, { name = "pre-commit" }, + { name = "psycopg", extra = ["binary"] }, { name = "pytest" }, { name = "pytest-cov" }, { name = "pytest-retry" }, @@ -1103,7 +1105,9 @@ dev = [ { name = "anyio" }, { name = "detect-secrets", git = "https://github.com/ibm/detect-secrets?branch=master" }, { name = "mypy" }, + { name = "pgvector", specifier = ">=0.3" }, { name = "pre-commit" }, + { name = "psycopg", extras = ["binary"], specifier = ">=3.1" }, { name = "pytest" }, { name = "pytest-cov" }, { name = "pytest-retry" },