57 changes: 32 additions & 25 deletions README.md
@@ -116,28 +116,35 @@ See the [Low-Code Tracing Guide](docs/LOW_CODE_TRACING.md#6-understanding-tip-pr

### Running Tests

```bash
uv run pytest
```

#### Phoenix Sync Tests

Tests for the Phoenix trajectory sync functionality are **skipped by default** since they depend on the Phoenix integration being available. To include them:

```bash
# Run all tests including Phoenix tests
uv run pytest --run-phoenix

# Run only Phoenix tests
uv run pytest -m phoenix
```

#### End-to-End (E2E) Low-Code Verification

To run the full end-to-end verification pipeline (Agent -> Trace -> Tip):

```bash
EVOLVE_E2E=true uv run pytest tests/e2e/test_e2e_pipeline.py -s
```

See [docs/LOW_CODE_TRACING.md](docs/LOW_CODE_TRACING.md#end-to-end-verification) for more details.
The test suite is organized into five cleanly isolated tiers based on their infrastructure requirements:

1. **Default Local Suite**
Runs both fast logic tests (`unit`) and filesystem script verifications (`platform_integrations`).
```bash
uv run pytest
```

2. **Unit Tests (Only)**
Fast, fully-mocked tests verifying core logic and offline pipeline schemas.
```bash
uv run pytest -m unit
```

3. **Platform Integration Tests**
Fast filesystem-level integration tests verifying local tool installation and idempotency.
```bash
uv run pytest -m platform_integrations
```

4. **End-to-End Infrastructure Tests**
Heavy tests that automatically spin up a background Phoenix server and simulate full agent workflows.
```bash
uv run pytest -m e2e --run-e2e
```
*(See [docs/LOW_CODE_TRACING.md](docs/LOW_CODE_TRACING.md#end-to-end-verification) for more details).*

5. **LLM Evaluation Tests**
Tests that require live LLM inference to exercise resolution pipelines (requires LLM API keys).
```bash
uv run pytest -m llm
```
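The tiers above are selected with pytest marker expressions. As a rough sketch of the selection logic (this is not pytest's actual implementation — the class and function names here are illustrative), an expression like `not llm and not e2e` is just a boolean formula over the set of markers a test carries:

```python
# Sketch: evaluate a pytest-style "-m" expression against a test's marker set.
# Not pytest's real implementation -- only a model of the selection logic.

class MarkerEnv(dict):
    """Name lookup for eval(): an identifier is truthy iff the test has that marker."""

    def __init__(self, markers):
        super().__init__()
        self.markers = set(markers)

    def __missing__(self, name):
        return name in self.markers


def selected(test_markers, expr):
    """Return True if a test carrying `test_markers` matches the -m expression."""
    return bool(eval(expr, {"__builtins__": {}}, MarkerEnv(test_markers)))


print(selected({"unit"}, "not llm and not e2e"))  # unit tests pass the default filter
print(selected({"e2e"}, "not llm and not e2e"))   # e2e tests are filtered out
```

Under this model, the default suite is simply whatever survives the filter configured in `pyproject.toml`, and tiers 4 and 5 are opted into by overriding that filter on the command line.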
12 changes: 6 additions & 6 deletions README_phoenix_sync.md
@@ -32,20 +32,20 @@ No additional dependencies required - uses only stdlib for Phoenix API calls.

```bash
# Basic sync with defaults
uv run python -m evolve.cli.cli sync phoenix
uv run evolve sync phoenix

# Custom Phoenix URL and namespace
uv run python -m evolve.cli.cli sync phoenix \
uv run evolve sync phoenix \
--url http://phoenix.example.com:6006 \
--namespace my_namespace

# Fetch more spans and include errors
uv run python -m evolve.cli.cli sync phoenix \
uv run evolve sync phoenix \
--limit 500 \
--include-errors

# Full options
uv run python -m evolve.cli.cli sync phoenix \
uv run evolve sync phoenix \
--url http://localhost:6006 \
--namespace production \
--project my_project \
@@ -145,7 +145,7 @@ Two entity types are stored:

```bash
# Sync every hour
0 * * * * cd /path/to/evolve && uv run python -m evolve.cli.cli sync phoenix --limit 100
0 * * * * cd /path/to/evolve && uv run evolve sync phoenix --limit 100
```

### Systemd Timer
@@ -158,7 +158,7 @@ Description=Evolve Phoenix Sync
[Service]
Type=oneshot
WorkingDirectory=/path/to/evolve
ExecStart=/path/to/uv run python -m evolve.cli.cli sync phoenix
ExecStart=/path/to/uv run evolve sync phoenix
Environment=PHOENIX_URL=http://localhost:6006
Environment=EVOLVE_NAMESPACE_ID=production
```
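A oneshot service like this is normally paired with a `.timer` unit that triggers it on a schedule. The timer below is an illustrative sketch, not part of this diff — the unit name and interval are assumptions:

```ini
# /etc/systemd/system/evolve-phoenix-sync.timer (hypothetical name)
[Unit]
Description=Run Evolve Phoenix Sync hourly

[Timer]
OnCalendar=hourly
Persistent=true

[Install]
WantedBy=timers.target
```

Assuming the service file is named to match, it would be enabled with `systemctl enable --now evolve-phoenix-sync.timer`.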
10 changes: 5 additions & 5 deletions docs/LOW_CODE_TRACING.md
@@ -200,7 +200,7 @@ curl "http://localhost:6006/v1/projects/test-agent/spans?limit=5"
cd evolve_repo
EVOLVE_BACKEND=filesystem \
EVOLVE_TIPS_MODEL="gpt-4" \
uv run python -m evolve.frontend.cli.cli sync phoenix \
uv run evolve sync phoenix \
--project test-agent \
--include-errors
```
@@ -209,7 +209,7 @@ uv run python -m evolve.frontend.cli.cli sync phoenix \

```bash
EVOLVE_BACKEND=filesystem \
uv run python -m evolve.frontend.cli.cli entities list evolve --type guideline
uv run evolve entities list evolve --type guideline
```

### 6. Understanding Tip Provenance (Metadata)
@@ -246,7 +246,7 @@ Evolve includes a comprehensive E2E verification suite to ensure that tracing an
You can run the full regression suite using `pytest`:

```bash
EVOLVE_E2E=true uv run pytest tests/e2e/test_e2e_pipeline.py -s
uv run pytest -m e2e --run-e2e -s
```

### Running Specific Tests
@@ -255,10 +255,10 @@ To test a specific agent framework:

```bash
# Test smolagents
EVOLVE_E2E=true uv run pytest tests/e2e/test_e2e_pipeline.py -k smolagents -s
uv run pytest tests/e2e/test_e2e_pipeline.py -k smolagents -m e2e --run-e2e -s

# Test OpenAI Agents
EVOLVE_E2E=true uv run pytest tests/e2e/test_e2e_pipeline.py -k openai_agents -s
uv run pytest tests/e2e/test_e2e_pipeline.py -k openai_agents -m e2e --run-e2e -s
```

### What It Tests
8 changes: 5 additions & 3 deletions pyproject.toml
@@ -48,7 +48,9 @@ dev = [
"anyio",
"detect-secrets",
"mypy",
"pgvector>=0.3",
"pre-commit",
"psycopg[binary]>=3.1",
"pytest",
"pytest-cov",
"pytest-retry",
@@ -85,12 +87,12 @@ evolve = ["**/*.jinja2"]
package = true

[tool.pytest.ini_options]
addopts = "--ignore=explorations -m 'not phoenix and not llm'"
addopts = "--ignore=explorations -m 'not llm and not e2e'"
markers = [
"e2e",
"unit",
"phoenix",
"llm"
"llm",
"platform_integrations"
]
anyio_mode = "auto"

23 changes: 10 additions & 13 deletions tests/conftest.py
@@ -31,22 +31,19 @@ def mock_sentence_transformer(request):
def pytest_addoption(parser):
"""Add custom command line options."""
parser.addoption(
"--run-phoenix",
"--run-e2e",
action="store_true",
default=False,
help="Run Phoenix sync tests (skipped by default)",
help="Run End-to-End infrastructure tests (skipped by default)",
)


def pytest_configure(config):
"""Override marker filter when --run-phoenix is passed."""
if config.getoption("--run-phoenix"):
# Remove the default marker filter to include phoenix tests
# Get current markexpr and modify it
markexpr = config.getoption("markexpr", default="")
if markexpr == "not phoenix":
config.option.markexpr = ""
elif "not phoenix" in markexpr:
# Remove "not phoenix" from the expression
new_expr = markexpr.replace("not phoenix and ", "").replace(" and not phoenix", "").replace("not phoenix", "")
config.option.markexpr = new_expr.strip()
"""Override marker filter when relevant flags are passed."""
new_expr = config.getoption("markexpr", default="")

if config.getoption("--run-e2e"):
# Remove "not e2e" from the expression
new_expr = new_expr.replace("not e2e and ", "").replace(" and not e2e", "").replace("not e2e", "")

config.option.markexpr = new_expr.strip()
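The chained `replace` calls in `pytest_configure` can be sketched as a standalone helper (`strip_marker` is a hypothetical name, not part of this codebase) to make the edge cases visible — the `not e2e` clause may appear first, last, or alone in an `and`-joined expression:

```python
def strip_marker(markexpr: str, marker: str = "e2e") -> str:
    """Remove 'not <marker>' clauses from a pytest -m expression string.

    Mirrors the chained str.replace calls in pytest_configure: it handles the
    clause appearing first, last, or alone in an 'and'-joined expression.
    """
    return (
        markexpr.replace(f"not {marker} and ", "")
        .replace(f" and not {marker}", "")
        .replace(f"not {marker}", "")
        .strip()
    )


print(strip_marker("not llm and not e2e"))  # -> "not llm"
print(strip_marker("not e2e"))              # -> ""
```

One caveat of the plain string-replacement approach: it would also match longer marker names that contain `e2e` as a prefix (e.g. a hypothetical `e2e_slow`), so it assumes marker names do not overlap.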
122 changes: 97 additions & 25 deletions tests/e2e/test_e2e_pipeline.py
@@ -4,10 +4,10 @@
import os
import datetime
import pytest
from evolve.config.phoenix import phoenix_settings
import urllib.request
import urllib.error

# Configuration
PHOENIX_URL = phoenix_settings.url
# A single module-level timestamp is generated once per session; it keeps
# project names unique across runs, and these tests run sequentially, so
# per-test timestamps are not needed to avoid collisions.
TIMESTAMP = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
@@ -20,10 +20,65 @@
]


@pytest.fixture(scope="session", autouse=True)
def phoenix_server():
"""Ensure a Phoenix server is running before executing E2E tests, and shut it down afterward."""
# 1. Check if it's already running locally
try:
urllib.request.urlopen("http://localhost:6006/status", timeout=2)
print("\nPhoenix is already running on port 6006.")
yield "http://localhost:6006"
return
except (urllib.error.URLError, ConnectionError):
pass

import sys

print("\nStarting local Phoenix server for E2E tests...")

env = os.environ.copy()
env["PHOENIX_PORT"] = "6006"

# Start it using the current python executable to avoid 'uv run' overhead
# We use run_in_thread=True plus a long sleep because run_in_thread=False
# can crash the FastAPI/uvicorn startup in some macOS environments.
script = "import phoenix as px; import time; px.launch_app(run_in_thread=True); import sys; sys.stdout.flush(); time.sleep(86400)"

proc = subprocess.Popen([sys.executable, "-c", script], env=env, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE, text=True)

# Poll until the server is responsive
max_retries = 30
for _ in range(max_retries):
try:
# specifically hit the status endpoint
urllib.request.urlopen("http://localhost:6006/status", timeout=2)
print("Phoenix server is up and running.")
break
except Exception:
# Also check if process crashed early
if proc.poll() is not None:
stderr_output = proc.stderr.read() if proc.stderr else "Unknown error"
pytest.fail(f"Phoenix server process crashed unexpectedly: {stderr_output}")
time.sleep(1)
else:
proc.terminate()
stderr_output = proc.stderr.read() if proc.stderr else "Unknown error"
pytest.fail(f"Failed to start local Phoenix server within 30 seconds. Stderr: {stderr_output}")

yield "http://localhost:6006"

# Cleanup: shut down Phoenix when tests are done
print("\nShutting down local Phoenix server...")
proc.terminate()
try:
proc.wait(timeout=5)
except subprocess.TimeoutExpired:
proc.kill()


@pytest.mark.e2e
@pytest.mark.phoenix
@pytest.mark.parametrize("agent_config", AGENTS_TO_TEST, ids=[a["name"] for a in AGENTS_TO_TEST])
def test_e2e_pipeline_agent(agent_config):
def test_e2e_pipeline_agent(agent_config, phoenix_server):
"""
Runs the full E2E pipeline for a specific agent configuration:
1. Executing the agent script
@@ -58,7 +113,16 @@ def test_e2e_pipeline_agent(agent_config):
if not os.path.exists(script_path):
pytest.fail(f"Script not found: {script_path}")

result = subprocess.run(["uv", "run", "python", script_path], env=env, capture_output=True, text=True)
try:
result = subprocess.run(["uv", "run", "python", script_path], env=env, capture_output=True, text=True, timeout=90)
except subprocess.TimeoutExpired as e:
print("❌ Agent execution timed out after 90s")
# Still try to capture what we can from stdout/stderr if possible
stdout = e.stdout if e.stdout else ""
stderr = e.stderr if e.stderr else ""
print("STDOUT:", stdout)
print("STDERR:", stderr)
pytest.fail(f"Agent execution timed out for {agent_name}")

if result.returncode != 0:
print(f"❌ Agent failed with exit code {result.returncode}")
Expand All @@ -78,7 +142,7 @@ def test_e2e_pipeline_agent(agent_config):
import phoenix as px
import sys
try:
c = px.Client(endpoint='{PHOENIX_URL}')
c = px.Client(endpoint='{phoenix_server}')
df = c.get_spans_dataframe(project_name='{project_name}')
if df is not None and not df.empty:
print(f"FOUND_TRACES:{{len(df)}}")
Expand All @@ -87,7 +151,11 @@ def test_e2e_pipeline_agent(agent_config):
except Exception as e:
print(f"ERROR:{{e}}")
"""
result = subprocess.run(["uv", "run", "python", "-c", check_script], capture_output=True, text=True)
try:
result = subprocess.run(["uv", "run", "python", "-c", check_script], capture_output=True, text=True, timeout=30)
except subprocess.TimeoutExpired:
print("❌ Phoenix trace verification script timed out")
pytest.fail(f"Trace verification timed out for {project_name}")

output = result.stdout + result.stderr
if "FOUND_TRACES" in output:
Expand All @@ -103,9 +171,7 @@ def test_e2e_pipeline_agent(agent_config):
sync_command = [
"uv",
"run",
"python",
"-m",
"evolve.frontend.cli.cli",
"evolve",
"sync",
"phoenix",
"--project",
@@ -128,36 +194,42 @@ def test_e2e_pipeline_agent(agent_config):
tips_found = False
sync_start = time.time()
timeout = 120 # 2 minute timeout for sync
output_lines = []

try:
while True:
if time.time() - sync_start > timeout:
print("❌ Timeout waiting for tips generation")
print(f"❌ Timeout waiting for tips generation ({timeout}s)")
break

line = process.stdout.readline()
if not line and process.poll() is not None:
break

if line:
line_stripped = line.strip()
# print(f"[Sync] {line_stripped}") # Optional: verbose logging

# Check target log pattern
match = re.search(r"generated (\d+) tips", line_stripped)
if match:
count = match.group(1)
print(f"\n✅ SUCCESS: Generated {count} tips!")
tips_found = True
if not line:
if process.poll() is not None:
break
time.sleep(0.1) # Avoid tight loop if no output but process alive
continue

output_lines.append(line)
line_stripped = line.strip()
# print(f"[Sync] {line_stripped}") # Optional: verbose logging

# Check target log pattern
match = re.search(r"generated (\d+) tips", line_stripped)
if match:
count = match.group(1)
print(f"\n✅ SUCCESS: Generated {count} tips!")
tips_found = True
break
finally:
if process.poll() is None:
print("Stopping sync process...")
process.terminate()
try:
process.wait(timeout=5)
process.wait(timeout=10)
except subprocess.TimeoutExpired:
process.kill()

if not tips_found:
full_output = "".join(output_lines)
print(f"Final Sync Output:\n{full_output}")
pytest.fail(f"Failed to detect tip generation for {agent_name} within {timeout}s.")