From 317217dd930afb79b2386c6d0e0ebef884e51791 Mon Sep 17 00:00:00 2001
From: Vatche Isahagian <vatchei@ibm.com>
Date: Wed, 1 Apr 2026 16:35:12 -0400
Subject: [PATCH 1/5] fix(tests): add postgres dependencies to dev group for
 testing

Add psycopg[binary]>=3.1 and pgvector>=0.3 to the dev dependency group so that all unit tests can run during development and in CI.

This fixes the test collection error in test_postgres_backend.py while keeping PostgreSQL support optional for end users (via the pgvector optional dependency group).
---
 pyproject.toml | 2 ++
 uv.lock        | 4 ++++
 2 files changed, 6 insertions(+)

diff --git a/pyproject.toml b/pyproject.toml
index 7cb9539..9d4d060 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -48,7 +48,9 @@ dev = [
     "anyio",
     "detect-secrets",
     "mypy",
+    "pgvector>=0.3",
     "pre-commit",
+    "psycopg[binary]>=3.1",
     "pytest",
     "pytest-cov",
     "pytest-retry",
diff --git a/uv.lock b/uv.lock
index 3ec0f26..114c320 100644
--- a/uv.lock
+++ b/uv.lock
@@ -1050,7 +1050,9 @@ dev = [
     { name = "anyio" },
     { name = "detect-secrets" },
     { name = "mypy" },
+    { name = "pgvector" },
     { name = "pre-commit" },
+    { name = "psycopg", extra = ["binary"] },
     { name = "pytest" },
     { name = "pytest-cov" },
     { name = "pytest-retry" },
@@ -1103,7 +1105,9 @@ dev = [
     { name = "anyio" },
     { name = "detect-secrets", git = "https://github.com/ibm/detect-secrets?branch=master" },
     { name = "mypy" },
+    { name = "pgvector", specifier = ">=0.3" },
     { name = "pre-commit" },
+    { name = "psycopg", extras = ["binary"], specifier = ">=3.1" },
     { name = "pytest" },
     { name = "pytest-cov" },
     { name = "pytest-retry" },

From 89b7d63fc3e2bda62f4b16f909493f75b46df732 Mon Sep 17 00:00:00 2001
From: Vatche Isahagian <vatchei@ibm.com>
Date: Wed, 1 Apr 2026 21:45:32 -0400
Subject: [PATCH 2/5] test: restructure test suite into four distinct execution
 tiers

---
 README.md                               |  51 +++++-----
 docs/LOW_CODE_TRACING.md                |  10 +-
 pyproject.toml                          |   6 +-
 tests/conftest.py                       |  23 ++---
 tests/e2e/test_e2e_pipeline.py          | 123 ++++++++++++++++++++----
 tests/unit/test_cli.py                  |   1 -
 tests/unit/test_extract_trajectories.py |   5 +-
 tests/unit/test_mcp_server.py           |   2 +
 tests/unit/test_phoenix_sync.py         |   4 +-
 tests/unit/test_tracing.py              |   4 +-
 10 files changed, 155 insertions(+), 74 deletions(-)

diff --git a/README.md b/README.md
index c189314..46a8ce5 100644
--- a/README.md
+++ b/README.md
@@ -116,28 +116,29 @@ See the [Low-Code Tracing Guide](docs/LOW_CODE_TRACING.md#6-understanding-tip-pr
 
 ### Running Tests
 
-```bash
-uv run pytest
-```
-
-#### Phoenix Sync Tests
-
-Tests for the Phoenix trajectory sync functionality are **skipped by default** since they require familiarity with the Phoenix integration. To include them:
-
-```bash
-# Run all tests including Phoenix tests
-uv run pytest --run-phoenix
-
-# Run only Phoenix tests
-uv run pytest -m phoenix
-```
-
-#### End-to-End (E2E) Low-Code Verification
-
-To run the full end-to-end verification pipeline (Agent -> Trace -> Tip):
-
-```bash
-EVOLVE_E2E=true uv run pytest tests/e2e/test_e2e_pipeline.py -s
-```
-
-See [docs/LOW_CODE_TRACING.md](docs/LOW_CODE_TRACING.md#end-to-end-verification) for more details.
+The test suite is organized into 4 cleanly isolated tiers depending on infrastructure requirements:
+
+1. **Unit Tests (Default)**
+   Fast, fully-mocked tests verifying core logic and offline pipeline schemas.
+   ```bash
+   uv run pytest
+   ```
+
+2. **Platform Integration Tests**
+   Fast filesystem-level integration tests verifying local tool installation and idempotency.
+   ```bash
+   uv run pytest -m platform_integrations
+   ```
+
+3. **End-to-End Infrastructure Tests**
+   Heavy tests that autonomously spin up a background Phoenix server and simulate full agent workflows.
+   ```bash
+   uv run pytest -m e2e --run-e2e
+   ```
+   *(See [docs/LOW_CODE_TRACING.md](docs/LOW_CODE_TRACING.md#end-to-end-verification) for more details).*
+
+4. **LLM Evaluation Tests**
+   Tests needing active LLM inference to test resolution pipelines (requires LLM API keys).
+   ```bash
+   uv run pytest -m llm
+   ```
diff --git a/docs/LOW_CODE_TRACING.md b/docs/LOW_CODE_TRACING.md
index a6f6122..18c43a6 100644
--- a/docs/LOW_CODE_TRACING.md
+++ b/docs/LOW_CODE_TRACING.md
@@ -200,7 +200,7 @@ curl "http://localhost:6006/v1/projects/test-agent/spans?limit=5"
 cd evolve_repo
 EVOLVE_BACKEND=filesystem \
 EVOLVE_TIPS_MODEL="gpt-4" \
-uv run python -m evolve.frontend.cli.cli sync phoenix \
+uv run python -m evolve.cli sync phoenix \
     --project test-agent \
     --include-errors
 ```
@@ -209,7 +209,7 @@ uv run python -m evolve.frontend.cli.cli sync phoenix \
 
 ```bash
 EVOLVE_BACKEND=filesystem \
-uv run python -m evolve.frontend.cli.cli entities list evolve --type guideline
+uv run python -m evolve.cli entities list evolve --type guideline
 ```
 
 ### 6. Understanding Tip Provenance (Metadata)
@@ -246,7 +246,7 @@ Evolve includes a comprehensive E2E verification suite to ensure that tracing an
 You can run the full regression suite using `pytest`:
 
 ```bash
-EVOLVE_E2E=true uv run pytest tests/e2e/test_e2e_pipeline.py -s
+uv run pytest -m e2e --run-e2e -s
 ```
 
 ### Running Specific Tests
@@ -255,10 +255,10 @@ To test a specific agent framework:
 
 ```bash
 # Test smolagents
-EVOLVE_E2E=true uv run pytest tests/e2e/test_e2e_pipeline.py -k smolagents -s
+uv run pytest tests/e2e/test_e2e_pipeline.py -k smolagents -m e2e --run-e2e -s
 
 # Test OpenAI Agents
-EVOLVE_E2E=true uv run pytest tests/e2e/test_e2e_pipeline.py -k openai_agents -s
+uv run pytest tests/e2e/test_e2e_pipeline.py -k openai_agents -m e2e --run-e2e -s
 ```
 
 ### What It Tests
diff --git a/pyproject.toml b/pyproject.toml
index 9d4d060..cbaae11 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -87,12 +87,12 @@ evolve = ["**/*.jinja2"]
 package = true
 
 [tool.pytest.ini_options]
-addopts = "--ignore=explorations -m 'not phoenix and not llm'"
+addopts = "--ignore=explorations -m 'not llm and not e2e'"
 markers = [
     "e2e",
     "unit",
-    "phoenix",
-    "llm"
+    "llm",
+    "platform_integrations"
 ]
 anyio_mode = "auto"
 
diff --git a/tests/conftest.py b/tests/conftest.py
index 38a8919..1c48fc2 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -31,22 +31,19 @@ def mock_sentence_transformer(request):
 def pytest_addoption(parser):
     """Add custom command line options."""
     parser.addoption(
-        "--run-phoenix",
+        "--run-e2e",
         action="store_true",
         default=False,
-        help="Run Phoenix sync tests (skipped by default)",
+        help="Run End-to-End infrastructure tests (skipped by default)",
     )
 
 
 def pytest_configure(config):
-    """Override marker filter when --run-phoenix is passed."""
-    if config.getoption("--run-phoenix"):
-        # Remove the default marker filter to include phoenix tests
-        # Get current markexpr and modify it
-        markexpr = config.getoption("markexpr", default="")
-        if markexpr == "not phoenix":
-            config.option.markexpr = ""
-        elif "not phoenix" in markexpr:
-            # Remove "not phoenix" from the expression
-            new_expr = markexpr.replace("not phoenix and ", "").replace(" and not phoenix", "").replace("not phoenix", "")
-            config.option.markexpr = new_expr.strip()
+    """Strip the "not e2e" clause from the marker expression when --run-e2e is passed."""
+    new_expr = config.getoption("markexpr", default="")
+
+    if config.getoption("--run-e2e"):
+        # Plain-text removal: covers "not e2e" joined by "and" on either side, or standing alone
+        new_expr = new_expr.replace("not e2e and ", "").replace(" and not e2e", "").replace("not e2e", "")
+
+    config.option.markexpr = new_expr.strip()
diff --git a/tests/e2e/test_e2e_pipeline.py b/tests/e2e/test_e2e_pipeline.py
index ab74094..b974a44 100644
--- a/tests/e2e/test_e2e_pipeline.py
+++ b/tests/e2e/test_e2e_pipeline.py
@@ -18,10 +18,74 @@
     {"name": "manual_phoenix", "script": "examples/low_code/manual_phoenix_demo.py", "project_prefix": "verify-manual"},
     {"name": "simple_openai", "script": "examples/low_code/simple_openai.py", "project_prefix": "verify-simple-openai"},
 ]
+import urllib.request
+import urllib.error
+
+@pytest.fixture(scope="session", autouse=True)
+def phoenix_server():
+    """Yield the URL of a Phoenix server on localhost:6006, starting one (and stopping it afterward) only if none is already running."""
+    # Reuse an instance that already answers on port 6006; in that case nothing is started or stopped here
+    try:
+        urllib.request.urlopen("http://localhost:6006/status", timeout=2)
+        print("\nPhoenix is already running on port 6006.")
+        yield "http://localhost:6006"
+        return
+    except (urllib.error.URLError, ConnectionError):
+        pass
+
+    import sys
+    print("\nStarting local Phoenix server for E2E tests...")
+    
+    env = os.environ.copy()
+    env["PHOENIX_PORT"] = "6006"
+    
+    # Start it using the current python executable to avoid 'uv run' overhead
+    # We use run_in_thread=True and a sleepy while loop because run_in_thread=False
+    # can crash the fastAPI uvicorn startup in some MacOS environments.
+    script = (
+        "import phoenix as px; import time; px.launch_app(run_in_thread=True); "
+        "import sys; sys.stdout.flush(); time.sleep(86400)"
+    )
+
+    proc = subprocess.Popen(
+        [sys.executable, "-c", script],
+        env=env,
+        stdout=subprocess.DEVNULL,
+        stderr=subprocess.PIPE,
+        text=True
+    )
+
+    # Poll until the server is responsive
+    max_retries = 30
+    for _ in range(max_retries):
+        try:
+            # A successful response from /status means the server is ready
+            urllib.request.urlopen("http://localhost:6006/status", timeout=2)
+            print("Phoenix server is up and running.")
+            break
+        except Exception:
+            # Fail fast if the child process exited before the server came up
+            if proc.poll() is not None:
+                stderr_output = proc.stderr.read() if proc.stderr else "Unknown error"
+                pytest.fail(f"Phoenix server process crashed unexpectedly: {stderr_output}")
+            time.sleep(1)
+    else:
+        proc.terminate()
+        stderr_output = proc.stderr.read() if proc.stderr else "Unknown error"
+        pytest.fail(f"Failed to start local Phoenix server within 30 seconds. Stderr: {stderr_output}")
+
+    yield "http://localhost:6006"
+
+    # Cleanup: shut down Phoenix when tests are done
+    print("\nShutting down local Phoenix server...")
+    proc.terminate()
+    try:
+        proc.wait(timeout=5)
+    except subprocess.TimeoutExpired:
+        proc.kill()
 
 
 @pytest.mark.e2e
-@pytest.mark.phoenix
 @pytest.mark.parametrize("agent_config", AGENTS_TO_TEST, ids=[a["name"] for a in AGENTS_TO_TEST])
 def test_e2e_pipeline_agent(agent_config):
     """
@@ -58,7 +122,16 @@ def test_e2e_pipeline_agent(agent_config):
     if not os.path.exists(script_path):
         pytest.fail(f"Script not found: {script_path}")
 
-    result = subprocess.run(["uv", "run", "python", script_path], env=env, capture_output=True, text=True)
+    try:
+        result = subprocess.run(["uv", "run", "python", script_path], env=env, capture_output=True, text=True, timeout=90)
+    except subprocess.TimeoutExpired as e:
+        print(f"❌ Agent execution timed out after 90s")
+        # Still try to capture what we can from stdout/stderr if possible
+        stdout = e.stdout if e.stdout else ""
+        stderr = e.stderr if e.stderr else ""
+        print("STDOUT:", stdout)
+        print("STDERR:", stderr)
+        pytest.fail(f"Agent execution timed out for {agent_name}")
 
     if result.returncode != 0:
         print(f"❌ Agent failed with exit code {result.returncode}")
@@ -87,7 +160,11 @@ def test_e2e_pipeline_agent(agent_config):
 except Exception as e:
     print(f"ERROR:{{e}}")
 """
-    result = subprocess.run(["uv", "run", "python", "-c", check_script], capture_output=True, text=True)
+    try:
+        result = subprocess.run(["uv", "run", "python", "-c", check_script], capture_output=True, text=True, timeout=30)
+    except subprocess.TimeoutExpired:
+        print("❌ Phoenix trace verification script timed out")
+        pytest.fail(f"Trace verification timed out for {project_name}")
 
     output = result.stdout + result.stderr
     if "FOUND_TRACES" in output:
@@ -103,9 +180,7 @@ def test_e2e_pipeline_agent(agent_config):
     sync_command = [
         "uv",
         "run",
-        "python",
-        "-m",
-        "evolve.frontend.cli.cli",
+        "evolve",
         "sync",
         "phoenix",
         "--project",
@@ -128,36 +203,42 @@ def test_e2e_pipeline_agent(agent_config):
     tips_found = False
     sync_start = time.time()
     timeout = 120  # 2 minute timeout for sync
+    output_lines = []
 
     try:
         while True:
             if time.time() - sync_start > timeout:
-                print("❌ Timeout waiting for tips generation")
+                print(f"❌ Timeout waiting for tips generation ({timeout}s)")
                 break
 
             line = process.stdout.readline()
-            if not line and process.poll() is not None:
-                break
-
-            if line:
-                line_stripped = line.strip()
-                # print(f"[Sync] {line_stripped}") # Optional: verbose logging
-
-                # Check target log pattern
-                match = re.search(r"generated (\d+) tips", line_stripped)
-                if match:
-                    count = match.group(1)
-                    print(f"\n✅ SUCCESS: Generated {count} tips!")
-                    tips_found = True
+            if not line:
+                if process.poll() is not None:
                     break
+                time.sleep(0.1)  # Avoid tight loop if no output but process alive
+                continue
+            
+            output_lines.append(line)
+            line_stripped = line.strip()
+            # print(f"[Sync] {line_stripped}") # Optional: verbose logging
+
+            # Check target log pattern
+            match = re.search(r"generated (\d+) tips", line_stripped)
+            if match:
+                count = match.group(1)
+                print(f"\n✅ SUCCESS: Generated {count} tips!")
+                tips_found = True
+                break
     finally:
         if process.poll() is None:
             print("Stopping sync process...")
             process.terminate()
             try:
-                process.wait(timeout=5)
+                process.wait(timeout=10)
             except subprocess.TimeoutExpired:
                 process.kill()
 
     if not tips_found:
+        full_output = "".join(output_lines)
+        print(f"Final Sync Output:\n{full_output}")
         pytest.fail(f"Failed to detect tip generation for {agent_name} within {timeout}s.")
diff --git a/tests/unit/test_cli.py b/tests/unit/test_cli.py
index 6520591..08ab136 100644
--- a/tests/unit/test_cli.py
+++ b/tests/unit/test_cli.py
@@ -559,7 +559,6 @@ def test_sync_help(self):
 
 
 @pytest.mark.unit
-@pytest.mark.phoenix
 class TestSyncPhoenix:
     """Tests for 'evolve sync phoenix' command."""
 
diff --git a/tests/unit/test_extract_trajectories.py b/tests/unit/test_extract_trajectories.py
index 9edd394..87c62ab 100644
--- a/tests/unit/test_extract_trajectories.py
+++ b/tests/unit/test_extract_trajectories.py
@@ -21,9 +21,8 @@
     get_trajectories,
 )
 
-# Mark all tests in this module as phoenix tests (skipped by default)
-pytestmark = pytest.mark.phoenix
-
+# Mark all tests in this module as unit tests
+pytestmark = pytest.mark.unit
 
 # =============================================================================
 # parse_content() Tests
diff --git a/tests/unit/test_mcp_server.py b/tests/unit/test_mcp_server.py
index 458136d..d5ad714 100644
--- a/tests/unit/test_mcp_server.py
+++ b/tests/unit/test_mcp_server.py
@@ -3,6 +3,8 @@
 import pytest
 from unittest.mock import patch, MagicMock
 
+pytestmark = pytest.mark.unit
+
 from evolve.frontend.mcp.mcp_server import save_trajectory, create_entity
 from evolve.schema.conflict_resolution import EntityUpdate
 
diff --git a/tests/unit/test_phoenix_sync.py b/tests/unit/test_phoenix_sync.py
index 59018ca..4e4621f 100644
--- a/tests/unit/test_phoenix_sync.py
+++ b/tests/unit/test_phoenix_sync.py
@@ -8,8 +8,8 @@
 from evolve.sync.phoenix_sync import PhoenixSync, SyncResult
 from evolve.schema.tips import TipGenerationResult
 
-# Mark all tests in this module as phoenix tests (skipped by default)
-pytestmark = pytest.mark.phoenix
+# Mark all tests in this module as unit tests
+pytestmark = pytest.mark.unit
 
 
 @pytest.fixture
diff --git a/tests/unit/test_tracing.py b/tests/unit/test_tracing.py
index a1adbba..0d168f7 100644
--- a/tests/unit/test_tracing.py
+++ b/tests/unit/test_tracing.py
@@ -12,6 +12,9 @@
 import pytest
 from unittest.mock import patch, MagicMock
 
+# Mark all tests in this module as unit tests
+pytestmark = pytest.mark.unit
+
 
 class TestFrameworkDetection:
     """Tests for detect_installed_frameworks()"""
@@ -197,7 +200,6 @@ def test_returns_provider_after_setup(self):
             auto._tracer_provider = original_provider
 
 
-@pytest.mark.unit
 class TestTracingIntegration:
     """Integration-style tests for the tracing module."""
 

From 52f1ed3cb54f0579314246668d66ed4e1fce57b9 Mon Sep 17 00:00:00 2001
From: Vatche Isahagian <vatchei@ibm.com>
Date: Wed, 1 Apr 2026 22:11:05 -0400
Subject: [PATCH 3/5] fix(tests): resolve linting errors and force isolated
 filesystem config in unit test fixture

---
 tests/e2e/test_e2e_pipeline.py | 27 ++++++++++-----------------
 tests/unit/test_client.py      |  6 +++++-
 tests/unit/test_mcp_server.py  |  4 ++--
 3 files changed, 17 insertions(+), 20 deletions(-)

diff --git a/tests/e2e/test_e2e_pipeline.py b/tests/e2e/test_e2e_pipeline.py
index b974a44..f4dd65c 100644
--- a/tests/e2e/test_e2e_pipeline.py
+++ b/tests/e2e/test_e2e_pipeline.py
@@ -4,6 +4,8 @@
 import os
 import datetime
 import pytest
+import urllib.request
+import urllib.error
 from evolve.config.phoenix import phoenix_settings
 
 # Configuration
@@ -18,8 +20,7 @@
     {"name": "manual_phoenix", "script": "examples/low_code/manual_phoenix_demo.py", "project_prefix": "verify-manual"},
     {"name": "simple_openai", "script": "examples/low_code/simple_openai.py", "project_prefix": "verify-simple-openai"},
 ]
-import urllib.request
-import urllib.error
+
 
 @pytest.fixture(scope="session", autouse=True)
 def phoenix_server():
@@ -34,26 +35,18 @@ def phoenix_server():
         pass
 
     import sys
+
     print("\nStarting local Phoenix server for E2E tests...")
-    
+
     env = os.environ.copy()
     env["PHOENIX_PORT"] = "6006"
-    
+
     # Start it using the current python executable to avoid 'uv run' overhead
     # We use run_in_thread=True and a sleepy while loop because run_in_thread=False
     # can crash the fastAPI uvicorn startup in some MacOS environments.
-    script = (
-        "import phoenix as px; import time; px.launch_app(run_in_thread=True); "
-        "import sys; sys.stdout.flush(); time.sleep(86400)"
-    )
+    script = "import phoenix as px; import time; px.launch_app(run_in_thread=True); import sys; sys.stdout.flush(); time.sleep(86400)"
 
-    proc = subprocess.Popen(
-        [sys.executable, "-c", script],
-        env=env,
-        stdout=subprocess.DEVNULL,
-        stderr=subprocess.PIPE,
-        text=True
-    )
+    proc = subprocess.Popen([sys.executable, "-c", script], env=env, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE, text=True)
 
     # Poll until the server is responsive
     max_retries = 30
@@ -125,7 +118,7 @@ def test_e2e_pipeline_agent(agent_config):
     try:
         result = subprocess.run(["uv", "run", "python", script_path], env=env, capture_output=True, text=True, timeout=90)
     except subprocess.TimeoutExpired as e:
-        print(f"❌ Agent execution timed out after 90s")
+        print("❌ Agent execution timed out after 90s")
         # Still try to capture what we can from stdout/stderr if possible
         stdout = e.stdout if e.stdout else ""
         stderr = e.stderr if e.stderr else ""
@@ -217,7 +210,7 @@ def test_e2e_pipeline_agent(agent_config):
                     break
                 time.sleep(0.1)  # Avoid tight loop if no output but process alive
                 continue
-            
+
             output_lines.append(line)
             line_stripped = line.strip()
             # print(f"[Sync] {line_stripped}") # Optional: verbose logging
diff --git a/tests/unit/test_client.py b/tests/unit/test_client.py
index 89324fa..c2ec8d4 100644
--- a/tests/unit/test_client.py
+++ b/tests/unit/test_client.py
@@ -12,9 +12,13 @@
 from evolve.frontend.client.evolve_client import EvolveClient
 
 
+from evolve.config.evolve import EvolveConfig
+
+
 @pytest.fixture(scope="module")
 def evolve_client() -> EvolveClient:
-    evolve_client = EvolveClient()
+    config = EvolveConfig(backend="filesystem")
+    evolve_client = EvolveClient(config=config)
     return evolve_client
 
 
diff --git a/tests/unit/test_mcp_server.py b/tests/unit/test_mcp_server.py
index d5ad714..6e2d0f9 100644
--- a/tests/unit/test_mcp_server.py
+++ b/tests/unit/test_mcp_server.py
@@ -3,11 +3,11 @@
 import pytest
 from unittest.mock import patch, MagicMock
 
-pytestmark = pytest.mark.unit
-
 from evolve.frontend.mcp.mcp_server import save_trajectory, create_entity
 from evolve.schema.conflict_resolution import EntityUpdate
 
+pytestmark = pytest.mark.unit
+
 
 @pytest.fixture
 def mock_get_client():

From 187ef5dc9e3f455f67aca78853a84deae754ad32 Mon Sep 17 00:00:00 2001
From: Vatche Isahagian <vatchei@ibm.com>
Date: Wed, 1 Apr 2026 22:15:35 -0400
Subject: [PATCH 4/5] docs: standardize cli commands to use evolve entrypoint
 across all readmes

---
 README.md                | 16 +++++++++++-----
 README_phoenix_sync.md   | 12 ++++++------
 docs/LOW_CODE_TRACING.md |  4 ++--
 3 files changed, 19 insertions(+), 13 deletions(-)

diff --git a/README.md b/README.md
index 46a8ce5..47ceeba 100644
--- a/README.md
+++ b/README.md
@@ -118,26 +118,32 @@ See the [Low-Code Tracing Guide](docs/LOW_CODE_TRACING.md#6-understanding-tip-pr
 
 The test suite is organized into 4 cleanly isolated tiers depending on infrastructure requirements:
 
-1. **Unit Tests (Default)**
-   Fast, fully-mocked tests verifying core logic and offline pipeline schemas.
+1. **Default Local Suite**
+   Runs both fast logic tests (`unit`) and filesystem script verifications (`platform_integrations`).
    ```bash
    uv run pytest
    ```
 
-2. **Platform Integration Tests**
+2. **Unit Tests (Only)**
+   Fast, fully-mocked tests verifying core logic and offline pipeline schemas.
+   ```bash
+   uv run pytest -m unit
+   ```
+
+3. **Platform Integration Tests**
    Fast filesystem-level integration tests verifying local tool installation and idempotency.
    ```bash
    uv run pytest -m platform_integrations
    ```
 
-3. **End-to-End Infrastructure Tests**
+4. **End-to-End Infrastructure Tests**
    Heavy tests that autonomously spin up a background Phoenix server and simulate full agent workflows.
    ```bash
    uv run pytest -m e2e --run-e2e
    ```
    *(See [docs/LOW_CODE_TRACING.md](docs/LOW_CODE_TRACING.md#end-to-end-verification) for more details).*
 
-4. **LLM Evaluation Tests**
+5. **LLM Evaluation Tests**
    Tests needing active LLM inference to test resolution pipelines (requires LLM API keys).
    ```bash
    uv run pytest -m llm
diff --git a/README_phoenix_sync.md b/README_phoenix_sync.md
index 8049af4..cf4953f 100644
--- a/README_phoenix_sync.md
+++ b/README_phoenix_sync.md
@@ -32,20 +32,20 @@ No additional dependencies required - uses only stdlib for Phoenix API calls.
 
 ```bash
 # Basic sync with defaults
-uv run python -m evolve.cli.cli sync phoenix
+uv run evolve sync phoenix
 
 # Custom Phoenix URL and namespace
-uv run python -m evolve.cli.cli sync phoenix \
+uv run evolve sync phoenix \
   --url http://phoenix.example.com:6006 \
   --namespace my_namespace
 
 # Fetch more spans and include errors
-uv run python -m evolve.cli.cli sync phoenix \
+uv run evolve sync phoenix \
   --limit 500 \
   --include-errors
 
 # Full options
-uv run python -m evolve.cli.cli sync phoenix \
+uv run evolve sync phoenix \
   --url http://localhost:6006 \
   --namespace production \
   --project my_project \
@@ -145,7 +145,7 @@ Two entity types are stored:
 
 ```bash
 # Sync every hour
-0 * * * * cd /path/to/evolve && uv run python -m evolve.cli.cli sync phoenix --limit 100
+0 * * * * cd /path/to/evolve && uv run evolve sync phoenix --limit 100
 ```
 
 ### Systemd Timer
@@ -158,7 +158,7 @@ Description=Evolve Phoenix Sync
 [Service]
 Type=oneshot
 WorkingDirectory=/path/to/evolve
-ExecStart=/path/to/uv run python -m evolve.cli.cli sync phoenix
+ExecStart=/path/to/uv run evolve sync phoenix
 Environment=PHOENIX_URL=http://localhost:6006
 Environment=EVOLVE_NAMESPACE_ID=production
 ```
diff --git a/docs/LOW_CODE_TRACING.md b/docs/LOW_CODE_TRACING.md
index 18c43a6..8294f10 100644
--- a/docs/LOW_CODE_TRACING.md
+++ b/docs/LOW_CODE_TRACING.md
@@ -200,7 +200,7 @@ curl "http://localhost:6006/v1/projects/test-agent/spans?limit=5"
 cd evolve_repo
 EVOLVE_BACKEND=filesystem \
 EVOLVE_TIPS_MODEL="gpt-4" \
-uv run python -m evolve.cli sync phoenix \
+uv run evolve sync phoenix \
     --project test-agent \
     --include-errors
 ```
@@ -209,7 +209,7 @@ uv run python -m evolve.cli sync phoenix \
 
 ```bash
 EVOLVE_BACKEND=filesystem \
-uv run python -m evolve.cli entities list evolve --type guideline
+uv run evolve entities list evolve --type guideline
 ```
 
 ### 6. Understanding Tip Provenance (Metadata)

From 079fbbd2c9e5379e28b9e1d28b9fae75eea60bd7 Mon Sep 17 00:00:00 2001
From: Vatche Isahagian <vatchei@ibm.com>
Date: Wed, 1 Apr 2026 22:16:33 -0400
Subject: [PATCH 5/5] test: dynamically inject E2E phoenix fixture URL

---
 tests/e2e/test_e2e_pipeline.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/tests/e2e/test_e2e_pipeline.py b/tests/e2e/test_e2e_pipeline.py
index f4dd65c..898756e 100644
--- a/tests/e2e/test_e2e_pipeline.py
+++ b/tests/e2e/test_e2e_pipeline.py
@@ -6,10 +6,8 @@
 import pytest
 import urllib.request
 import urllib.error
-from evolve.config.phoenix import phoenix_settings
 
 # Configuration
-PHOENIX_URL = phoenix_settings.url
 # Use a session-scope timestamp or generate per test?
 # Per-test ensures no collisions even if run in parallel (though these should satisfy sequential)
 TIMESTAMP = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
@@ -80,7 +78,7 @@ def phoenix_server():
 
 @pytest.mark.e2e
 @pytest.mark.parametrize("agent_config", AGENTS_TO_TEST, ids=[a["name"] for a in AGENTS_TO_TEST])
-def test_e2e_pipeline_agent(agent_config):
+def test_e2e_pipeline_agent(agent_config, phoenix_server):
     """
     Runs the full E2E pipeline for a specific agent configuration:
     1. Executing the agent script
@@ -144,7 +142,7 @@ def test_e2e_pipeline_agent(agent_config):
 import phoenix as px
 import sys
 try:
-    c = px.Client(endpoint='{PHOENIX_URL}')
+    c = px.Client(endpoint='{phoenix_server}')
     df = c.get_spans_dataframe(project_name='{project_name}')
     if df is not None and not df.empty:
         print(f"FOUND_TRACES:{{len(df)}}")