From 63f7cb7f69397e7c9ccd7d1af4a1120e86d4ec85 Mon Sep 17 00:00:00 2001 From: sysid Date: Thu, 1 Jan 2026 19:29:21 +0100 Subject: [PATCH 1/4] chore: update gitignore --- .workmux.yaml | 139 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 139 insertions(+) create mode 100644 .workmux.yaml diff --git a/.workmux.yaml b/.workmux.yaml new file mode 100644 index 0000000..ee95fe1 --- /dev/null +++ b/.workmux.yaml @@ -0,0 +1,139 @@ +# workmux project configuration +# For global settings, edit ~/.config/workmux/config.yaml +# All options below are commented out - uncomment to override defaults. + +#------------------------------------------------------------------------------- +# Git +#------------------------------------------------------------------------------- + +# The primary branch to merge into. +# Default: Auto-detected from remote HEAD, falls back to main/master. +# main_branch: main + +# Default merge strategy for `workmux merge`. +# Options: merge (default), rebase, squash +# CLI flags (--rebase, --squash) always override this. +# merge_strategy: rebase + +#------------------------------------------------------------------------------- +# Naming & Paths +#------------------------------------------------------------------------------- + +# Directory where worktrees are created. +# Can be relative to repo root or absolute. +# Default: Sibling directory '__worktrees'. +# worktree_dir: .worktrees + +# Strategy for deriving names from branch names. +# Options: full (default), basename (part after last '/'). +# worktree_naming: basename + +# Prefix added to worktree directories and tmux window names. +# worktree_prefix: "" + +# Prefix for tmux window names. +# Default: "wm-" +# window_prefix: "wm-" + +#------------------------------------------------------------------------------- +# Tmux +#------------------------------------------------------------------------------- + +# Custom tmux pane layout. 
+# Default: Two-pane layout with shell and clear command. +# panes: +# - command: pnpm install +# focus: true +# - split: horizontal +# - command: clear +# split: vertical +# size: 5 + +# Auto-apply agent status icons to tmux window format. +# Default: true +# status_format: true + +# Custom icons for agent status display. +status_icons: + working: "🤖" + waiting: "💬" + done: "✅" + +#------------------------------------------------------------------------------- +# Agent & AI +#------------------------------------------------------------------------------- + +# Agent command for '' placeholder in pane commands. +# Default: "claude" +# agent: claude + +# LLM-based branch name generation (`workmux add -a`). +# auto_name: +# model: "gpt-4o-mini" +# system_prompt: "Generate a kebab-case git branch name." + +#------------------------------------------------------------------------------- +# Hooks +#------------------------------------------------------------------------------- + +# Commands to run in new worktree before tmux window opens. +# These block window creation - use for short tasks only. +# Use "" to inherit from global config. +# Set to empty list to disable: `post_create: []` +# post_create: +# - "" +# - mise use + + # GOTCHA: copies .envrc target, not link +post_create: + - ln -s $SOPS_PATH/dot.envrc .envrc + - direnv allow + +# Commands to run before merging (e.g., linting, tests). +# Aborts the merge if any command fails. +# Use "" to inherit from global config. +# Environment variables available: +# - WM_BRANCH_NAME: The name of the branch being merged +# - WM_TARGET_BRANCH: The name of the target branch (e.g., main) +# - WM_WORKTREE_PATH: Absolute path to the worktree +# - WM_PROJECT_ROOT: Absolute path of the main project directory +# - WM_HANDLE: The worktree handle/window name +# pre_merge: +# - "" +# - cargo test +# - cargo clippy -- -D warnings + +# Commands to run before worktree removal (during merge or remove). 
+# Useful for backing up gitignored files before cleanup. +# Default: Auto-detects Node.js projects and fast-deletes node_modules. +# Set to empty list to disable: `pre_remove: []` +# Environment variables available: +# - WM_HANDLE: The worktree handle (directory name) +# - WM_WORKTREE_PATH: Absolute path of the worktree being deleted +# - WM_PROJECT_ROOT: Absolute path of the main project directory +# pre_remove: +# - mkdir -p "$WM_PROJECT_ROOT/artifacts/$WM_HANDLE" +# - cp -r test-results/ "$WM_PROJECT_ROOT/artifacts/$WM_HANDLE/" + +#------------------------------------------------------------------------------- +# Files +#------------------------------------------------------------------------------- + +# File operations when creating a worktree. +# files: +# # Files to copy (useful for .env files that need to be unique). +# copy: +# - .env.local +# +# # Files/directories to symlink (saves disk space, shares caches). +# # Default: None. +# # Use "" to inherit from global config. +# symlink: +# - "" +# - node_modules +files: + symlink: + - .venv + - .claude + - CLAUDE.md + - thoughts From 105fee90977578dd5a2a23eeecb5c903b4d71952 Mon Sep 17 00:00:00 2001 From: sysid Date: Wed, 31 Dec 2025 11:39:24 +0100 Subject: [PATCH 2/4] feat: add comprehensive load testing infrastructure Add exhaustive load testing to detect memory leaks, watcher deduplication at scale, and prevent performance regressions. 
Test coverage (15 tests): - Memory stability: leak detection, baseline return, event set cleanup - Watcher scale: single watcher verification, rapid connect/disconnect - Throughput: single/multi-client, TTFE, inter-event latency - Shutdown: graceful termination with active connections - Backpressure: slow client isolation, connection churn, send_timeout Infrastructure: - Docker-based test server with /metrics endpoint (psutil) - testcontainers fixtures with health check wait strategies - httpx-sse + asyncio.gather() for concurrent SSE clients - Manual GitHub Actions workflow (workflow_dispatch) New dependencies in dev group: httpx-sse New Makefile target: test-load --- .github/workflows/load-test.yml | 76 ++++++ Makefile | 13 +- ...{issue77.py => issue77_lock_contention.py} | 0 pyproject.toml | 8 +- tests/Dockerfile.loadtest | 32 +++ tests/load/__init__.py | 1 + tests/load/conftest.py | 128 ++++++++++ tests/load/server_app.py | 107 ++++++++ tests/load/test_backpressure.py | 215 ++++++++++++++++ tests/load/test_memory_stability.py | 235 ++++++++++++++++++ tests/load/test_shutdown.py | 182 ++++++++++++++ tests/load/test_throughput.py | 185 ++++++++++++++ tests/load/test_watcher_scale.py | 159 ++++++++++++ uv.lock | 19 +- 14 files changed, 1357 insertions(+), 3 deletions(-) create mode 100644 .github/workflows/load-test.yml rename examples/issues/{issue77.py => issue77_lock_contention.py} (100%) create mode 100644 tests/Dockerfile.loadtest create mode 100644 tests/load/__init__.py create mode 100644 tests/load/conftest.py create mode 100644 tests/load/server_app.py create mode 100644 tests/load/test_backpressure.py create mode 100644 tests/load/test_memory_stability.py create mode 100644 tests/load/test_shutdown.py create mode 100644 tests/load/test_throughput.py create mode 100644 tests/load/test_watcher_scale.py diff --git a/.github/workflows/load-test.yml b/.github/workflows/load-test.yml new file mode 100644 index 0000000..59eeb79 --- /dev/null +++ 
b/.github/workflows/load-test.yml @@ -0,0 +1,76 @@ +name: Load Tests + +on: + workflow_dispatch: + inputs: + scale: + description: 'Number of concurrent connections' + required: true + default: '100' + type: choice + options: + - '100' + - '500' + - '1000' + duration: + description: 'Test duration in minutes' + required: true + default: '1' + type: choice + options: + - '1' + - '5' + - '10' + +jobs: + load-test: + runs-on: ubuntu-latest + timeout-minutes: 60 + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.12' + + - name: Install uv + uses: astral-sh/setup-uv@v4 + + - name: Install dependencies + run: | + uv pip install --system -e ".[loadtest]" + + - name: Build load test Docker image + run: | + docker build -f tests/Dockerfile.loadtest -t sse-starlette-loadtest:latest . + + - name: Run load tests + run: | + python -m pytest tests/load/ -m "loadtest" \ + --scale=${{ inputs.scale }} \ + --duration=${{ inputs.duration }} \ + -v --tb=short \ + --junitxml=load-test-results.xml + + - name: Upload test results + uses: actions/upload-artifact@v4 + if: always() + with: + name: load-test-results + path: | + load-test-results.xml + retention-days: 30 + + - name: Test Summary + if: always() + run: | + echo "## Load Test Results" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "- **Scale**: ${{ inputs.scale }} concurrent connections" >> $GITHUB_STEP_SUMMARY + echo "- **Duration**: ${{ inputs.duration }} minutes" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + if [ -f load-test-results.xml ]; then + echo "Test results uploaded as artifact." 
>> $GITHUB_STEP_SUMMARY + fi diff --git a/Makefile b/Makefile index fe0c664..8d8e4f4 100644 --- a/Makefile +++ b/Makefile @@ -89,7 +89,7 @@ test: test-unit test-docker ## run tests .PHONY: test-unit test-unit: ## run all tests except "integration" marked - RUN_ENV=local python -m pytest -m "not (integration or experimentation)" --cov-config=pyproject.toml --cov-report=html --cov-report=term --cov=$(pkg_src) tests + RUN_ENV=local python -m pytest -m "not (integration or experimentation or loadtest)" --cov-config=pyproject.toml --cov-report=html --cov-report=term --cov=$(pkg_src) tests .PHONY: test-docker test-docker: ## test-docker (docker desktop: advanced settings) @@ -100,6 +100,17 @@ test-docker: ## test-docker (docker desktop: advanced settings) echo "Skipping tests: /var/run/docker.sock does not exist."; \ fi +.PHONY: test-load +test-load: ## run load tests (requires docker, make test-load PYTEST_ARGS="--scale=500 --duration=5") + @if [ -S /var/run/docker.sock > /dev/null 2>&1 ]; then \ + echo "Building load test image..."; \ + docker build -f tests/Dockerfile.loadtest -t sse-starlette-loadtest:latest .; \ + echo "Running load tests..."; \ + RUN_ENV=local python -m pytest -m "loadtest" tests/load/ -v --tb=short $(PYTEST_ARGS); \ + else \ + echo "Skipping load tests: /var/run/docker.sock does not exist."; \ + fi + ################################################################################ # Code Quality \ diff --git a/examples/issues/issue77.py b/examples/issues/issue77_lock_contention.py similarity index 100% rename from examples/issues/issue77.py rename to examples/issues/issue77_lock_contention.py diff --git a/pyproject.toml b/pyproject.toml index 998361d..ca9f2a1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -42,12 +42,17 @@ granian = [ daphne = [ "daphne>=4.2.0", ] +loadtest = [ + "httpx-sse>=0.4.0", + "psutil>=6.1.1", +] [dependency-groups] # new standard, included by default dev = [ "asgi-lifespan>=2.1.0", "async-timeout>=5.0.1", 
"httpx>=0.28.1", + "httpx-sse>=0.4.0", "mypy>=1.14.0", "portend>=3.2.0", "psutil>=6.1.1", @@ -102,7 +107,8 @@ filename = "sse_starlette/__init__.py" [tool.pytest.ini_options] markers = [ "integration: marks tests as integration tests", - "experimentation: marks tests as experimental tests, not to be run in CICD" + "experimentation: marks tests as experimental tests, not to be run in CICD", + "loadtest: marks tests as load tests (require docker and significant resources)" ] asyncio_mode = "auto" asyncio_default_fixture_loop_scope = "function" diff --git a/tests/Dockerfile.loadtest b/tests/Dockerfile.loadtest new file mode 100644 index 0000000..3c3e6dd --- /dev/null +++ b/tests/Dockerfile.loadtest @@ -0,0 +1,32 @@ +# Load test server image for sse-starlette +FROM python:3.12-slim + +WORKDIR /app + +# Install build dependencies and cleanup in one layer +RUN apt-get update && apt-get install -y \ + build-essential \ + && rm -rf /var/lib/apt/lists/* + +# Copy package files +COPY pyproject.toml ./ +COPY README.md ./ +COPY sse_starlette ./sse_starlette + +# Install package with loadtest dependencies +RUN pip install --no-cache-dir -e ".[loadtest]" + +# Install uvicorn for serving +RUN pip install --no-cache-dir uvicorn + +# Copy load test server app +COPY tests/load/server_app.py ./server_app.py + +# Expose port +EXPOSE 8000 + +# Set Python path +ENV PYTHONPATH=/app + +# Default command - run the load test server +CMD ["uvicorn", "server_app:app", "--host", "0.0.0.0", "--port", "8000", "--workers", "1"] diff --git a/tests/load/__init__.py b/tests/load/__init__.py new file mode 100644 index 0000000..758eb8e --- /dev/null +++ b/tests/load/__init__.py @@ -0,0 +1 @@ +# Load testing module for sse-starlette diff --git a/tests/load/conftest.py b/tests/load/conftest.py new file mode 100644 index 0000000..ec73419 --- /dev/null +++ b/tests/load/conftest.py @@ -0,0 +1,128 @@ +""" +Pytest fixtures for load testing. + +Provides container-based SSE server and utility fixtures. 
+""" + +import os +import time +from typing import Generator + +import httpx +import pytest +from testcontainers.core.container import DockerContainer + + +class SSELoadTestContainer(DockerContainer): + """Custom container for SSE load testing.""" + + def __init__(self, image: str = "sse-starlette-loadtest:latest"): + super().__init__(image) + self.with_exposed_ports(8000) + + def get_base_url(self) -> str: + """Get the base URL for the SSE server.""" + host = self.get_container_host_ip() + port = self.get_exposed_port(8000) + return f"http://{host}:{port}" + + +def _wait_for_port(container: DockerContainer, port: int, timeout: float = 30) -> str: + """Wait for port mapping to be available and return base URL.""" + start = time.time() + while time.time() - start < timeout: + try: + host = container.get_container_host_ip() + mapped_port = container.get_exposed_port(port) + return f"http://{host}:{mapped_port}" + except ConnectionError: + time.sleep(0.5) + raise TimeoutError(f"Port {port} not available after {timeout}s") + + +def _wait_for_health(base_url: str, timeout: float = 30) -> None: + """Wait for server health endpoint to respond.""" + start = time.time() + while time.time() - start < timeout: + try: + resp = httpx.get(f"{base_url}/health", timeout=2.0) + if resp.status_code == 200: + return + except httpx.RequestError: + pass + time.sleep(0.5) + raise TimeoutError(f"Server at {base_url} not ready after {timeout}s") + + +@pytest.fixture(scope="module") +def docker_available() -> bool: + """Check if Docker is available.""" + return os.path.exists("/var/run/docker.sock") + + +@pytest.fixture(scope="module") +def sse_container( + docker_available: bool, +) -> Generator[SSELoadTestContainer, None, None]: + """Start SSE server in Docker container for load testing.""" + if not docker_available: + pytest.skip("Docker not available") + + container = SSELoadTestContainer() + container.start() + + # Wait for port mapping, then health check + base_url = 
_wait_for_port(container, 8000, timeout=30) + _wait_for_health(base_url, timeout=30) + + yield container + + container.stop() + + +@pytest.fixture(scope="module") +def sse_server_url(sse_container: SSELoadTestContainer) -> str: + """Get the base URL for the SSE server.""" + return sse_container.get_base_url() + + +@pytest.fixture +def sync_client() -> Generator[httpx.Client, None, None]: + """Synchronous HTTP client for simple requests.""" + with httpx.Client(timeout=30.0) as client: + yield client + + +@pytest.fixture +async def async_client() -> httpx.AsyncClient: + """Async HTTP client for SSE streaming.""" + async with httpx.AsyncClient(timeout=60.0) as client: + yield client + + +def pytest_addoption(parser: pytest.Parser) -> None: + """Add custom command line options for load tests.""" + parser.addoption( + "--scale", + action="store", + default="100", + help="Number of concurrent connections for load tests", + ) + parser.addoption( + "--duration", + action="store", + default="1", + help="Test duration in minutes", + ) + + +@pytest.fixture +def scale(request: pytest.FixtureRequest) -> int: + """Get the scale (number of connections) for load tests.""" + return int(request.config.getoption("--scale")) + + +@pytest.fixture +def duration_minutes(request: pytest.FixtureRequest) -> int: + """Get the duration in minutes for load tests.""" + return int(request.config.getoption("--duration")) diff --git a/tests/load/server_app.py b/tests/load/server_app.py new file mode 100644 index 0000000..516bc76 --- /dev/null +++ b/tests/load/server_app.py @@ -0,0 +1,107 @@ +""" +Load test SSE server application. + +Provides SSE endpoints and a metrics endpoint for monitoring during load tests. 
+""" + +import asyncio +import os +import time +from typing import AsyncGenerator + +import psutil +from starlette.applications import Starlette +from starlette.requests import Request +from starlette.responses import JSONResponse +from starlette.routing import Route + +from sse_starlette import EventSourceResponse +from sse_starlette.sse import _get_shutdown_state + + +async def metrics(request: Request) -> JSONResponse: + """Expose server metrics for monitoring during load tests.""" + process = psutil.Process(os.getpid()) + memory_info = process.memory_info() + + # Get watcher and event count from thread-local state + shutdown_state = _get_shutdown_state() + + return JSONResponse( + { + "memory_rss_mb": memory_info.rss / 1024 / 1024, + "memory_vms_mb": memory_info.vms / 1024 / 1024, + "num_fds": process.num_fds() if hasattr(process, "num_fds") else -1, + "num_threads": process.num_threads(), + "connections": len(process.connections()), + "cpu_percent": process.cpu_percent(), + "watcher_started": shutdown_state.watcher_started, + "registered_events": len(shutdown_state.events), + "uptime_seconds": time.time() - process.create_time(), + } + ) + + +async def endless_stream(request: Request) -> EventSourceResponse: + """High-frequency event stream for load testing.""" + delay = float(request.query_params.get("delay", "0.01")) # 100 events/sec default + + async def generate() -> AsyncGenerator[dict, None]: + counter = 0 + while True: + if await request.is_disconnected(): + break + yield {"data": f"event-{counter}", "id": str(counter)} + counter += 1 + await asyncio.sleep(delay) + + return EventSourceResponse(generate()) + + +async def finite_stream(request: Request) -> EventSourceResponse: + """Finite event stream for testing completion.""" + count = int(request.query_params.get("count", "100")) + delay = float(request.query_params.get("delay", "0.01")) + + async def generate() -> AsyncGenerator[dict, None]: + for i in range(count): + if await 
request.is_disconnected(): + break + yield {"data": f"event-{i}", "id": str(i)} + await asyncio.sleep(delay) + + return EventSourceResponse(generate()) + + +async def slow_stream(request: Request) -> EventSourceResponse: + """Slow event stream for backpressure testing.""" + delay = float(request.query_params.get("delay", "1.0")) + + async def generate() -> AsyncGenerator[dict, None]: + counter = 0 + while True: + if await request.is_disconnected(): + break + # Generate larger payloads + payload = "x" * 4096 + yield {"data": payload, "id": str(counter)} + counter += 1 + await asyncio.sleep(delay) + + return EventSourceResponse(generate()) + + +async def health(request: Request) -> JSONResponse: + """Health check endpoint.""" + return JSONResponse({"status": "healthy"}) + + +routes = [ + Route("/sse", endless_stream), + Route("/sse/finite", finite_stream), + Route("/sse/slow", slow_stream), + Route("/metrics", metrics), + Route("/health", health), +] + +app = Starlette(routes=routes) diff --git a/tests/load/test_backpressure.py b/tests/load/test_backpressure.py new file mode 100644 index 0000000..94f111b --- /dev/null +++ b/tests/load/test_backpressure.py @@ -0,0 +1,215 @@ +""" +Backpressure and slow client tests. + +Verifies server handles slow consumers correctly without affecting fast clients. +""" + +import asyncio +import time +from typing import Tuple + +import httpx +import pytest +from httpx_sse import aconnect_sse + + +@pytest.mark.loadtest +async def test_slow_clients_dont_block_fast_clients( + sse_server_url: str, +) -> None: + """ + Slow clients should not affect throughput of fast clients. + + Tests that the server properly handles mixed client speeds. 
+ """ + test_duration = 10 # seconds + + async def fast_client() -> int: + """Client that consumes events as fast as possible.""" + count = 0 + start = time.perf_counter() + try: + async with httpx.AsyncClient(timeout=30.0) as client: + async with aconnect_sse( + client, "GET", f"{sse_server_url}/sse?delay=0.01" + ) as source: + async for _ in source.aiter_sse(): + count += 1 + if time.perf_counter() - start >= test_duration: + break + except Exception: + pass + return count + + async def slow_client() -> int: + """Client that reads slowly (simulating processing delay).""" + count = 0 + start = time.perf_counter() + try: + async with httpx.AsyncClient(timeout=60.0) as client: + async with aconnect_sse( + client, "GET", f"{sse_server_url}/sse?delay=0.01" + ) as source: + async for _ in source.aiter_sse(): + await asyncio.sleep(0.5) # Slow processing + count += 1 + if time.perf_counter() - start >= test_duration: + break + except Exception: + pass + return count + + # Mix of fast and slow clients + fast_tasks = [asyncio.create_task(fast_client()) for _ in range(10)] + slow_tasks = [asyncio.create_task(slow_client()) for _ in range(10)] + + fast_results = await asyncio.gather(*fast_tasks) + slow_results = await asyncio.gather(*slow_tasks) + + avg_fast = sum(fast_results) / len(fast_results) + avg_slow = sum(slow_results) / len(slow_results) + + # Fast clients should receive significantly more events + assert avg_fast > avg_slow * 5, ( + f"Fast clients ({avg_fast:.0f} events) should be much faster than " + f"slow clients ({avg_slow:.0f} events)" + ) + + # Fast clients should not be severely throttled + # With 0.01s delay, should get ~1000 events in 10s + assert ( + avg_fast > 500 + ), f"Fast clients throttled: {avg_fast:.0f} events, expected > 500" + + +@pytest.mark.loadtest +async def test_connection_churn_stability( + sse_server_url: str, + scale: int, +) -> None: + """ + Rapid connect/disconnect should not cause resource exhaustion. 
+ + Tests cleanup under high churn rate. + """ + churn_rate = min(100, scale) # connections per second + duration = 30 # seconds + total_connections = churn_rate * duration + + async def quick_connection() -> bool: + try: + async with httpx.AsyncClient(timeout=5.0) as client: + async with aconnect_sse( + client, "GET", f"{sse_server_url}/sse?delay=0" + ) as source: + async for _ in source.aiter_sse(): + return True + except Exception: + return False + return False + + # Get baseline metrics + async with httpx.AsyncClient() as client: + baseline = (await client.get(f"{sse_server_url}/metrics")).json() + + baseline_fds = baseline.get("num_fds", 0) + baseline_memory = baseline["memory_rss_mb"] + + # Create connections at target rate + successful = 0 + for batch in range(duration): + tasks = [asyncio.create_task(quick_connection()) for _ in range(churn_rate)] + results = await asyncio.gather(*tasks, return_exceptions=True) + successful += sum(1 for r in results if r is True) + await asyncio.sleep(0.5) # Allow some cleanup + + # Get final metrics + async with httpx.AsyncClient() as client: + final = (await client.get(f"{sse_server_url}/metrics")).json() + + final_fds = final.get("num_fds", 0) + final_memory = final["memory_rss_mb"] + + # File descriptors should return to baseline + if baseline_fds > 0 and final_fds > 0: + fd_growth = final_fds - baseline_fds + assert fd_growth < 50, ( + f"File descriptor leak: {fd_growth} new FDs after {total_connections} " + f"connections" + ) + + # Memory should not grow excessively + memory_growth = final_memory - baseline_memory + assert ( + memory_growth < 100 + ), f"Memory grew by {memory_growth:.1f}MB during churn test" + + # Success rate should be high + success_rate = successful / total_connections if total_connections > 0 else 0 + assert success_rate > 0.9, ( + f"Low success rate during churn: {success_rate:.1%} " + f"({successful}/{total_connections})" + ) + + +@pytest.mark.loadtest +async def 
test_send_timeout_under_load(sse_server_url: str) -> None: + """ + Verify send_timeout works correctly under load. + + Clients that stop reading should eventually be disconnected. + """ + + async def frozen_client() -> Tuple[str, float]: + """Client that stops reading after first event (simulates frozen client).""" + start = time.perf_counter() + try: + async with httpx.AsyncClient(timeout=120.0) as client: + async with aconnect_sse( + client, "GET", f"{sse_server_url}/sse?delay=0.001" + ) as source: + async for _ in source.aiter_sse(): + # Stop reading but keep connection open + await asyncio.sleep(60) # Will be interrupted by timeout + break + except httpx.ReadTimeout: + return "timeout", time.perf_counter() - start + except Exception as e: + return f"error:{type(e).__name__}", time.perf_counter() - start + return "completed", time.perf_counter() - start + + # Start some frozen clients (server has default send_timeout) + tasks = [asyncio.create_task(frozen_client()) for _ in range(5)] + + # Also verify server remains responsive with normal clients + async def normal_client() -> int: + count = 0 + try: + async with httpx.AsyncClient(timeout=30.0) as client: + async with aconnect_sse( + client, "GET", f"{sse_server_url}/sse?delay=0.1" + ) as source: + async for _ in source.aiter_sse(): + count += 1 + if count >= 50: + break + except Exception: + pass + return count + + normal_tasks = [asyncio.create_task(normal_client()) for _ in range(3)] + + # Wait for normal clients to complete + normal_results = await asyncio.gather(*normal_tasks) + + # Cancel frozen clients if still running + for task in tasks: + if not task.done(): + task.cancel() + + await asyncio.gather(*tasks, return_exceptions=True) + + # Normal clients should have completed successfully + assert all( + r >= 45 for r in normal_results + ), f"Normal clients affected by frozen clients: {normal_results}" diff --git a/tests/load/test_memory_stability.py b/tests/load/test_memory_stability.py new file mode 
100644 index 0000000..ba23eea --- /dev/null +++ b/tests/load/test_memory_stability.py @@ -0,0 +1,235 @@ +""" +Memory stability tests for sse-starlette under load. + +Verifies no memory leaks during sustained SSE streaming with many concurrent connections. +""" + +import asyncio +import statistics +from typing import List + +import httpx +import pytest +from httpx_sse import aconnect_sse + + +@pytest.mark.loadtest +async def test_memory_stability_under_load( + sse_server_url: str, + scale: int, + duration_minutes: int, +) -> None: + """ + Connect many clients, stream for duration, verify memory is stable. + + Pass criteria: + - Memory growth < 50MB over test duration + - No unbounded growth trend (linear regression slope < 0.1 MB/sec) + """ + events_per_client = duration_minutes * 60 * 10 # 10 events/sec + + async def client_task(client_id: int) -> int: + """Single client consuming SSE events.""" + events_received = 0 + try: + async with httpx.AsyncClient(timeout=300.0) as client: + async with aconnect_sse( + client, "GET", f"{sse_server_url}/sse?delay=0.1" + ) as source: + async for _ in source.aiter_sse(): + events_received += 1 + if events_received >= events_per_client: + break + except Exception: + pass # Connection errors during shutdown are expected + return events_received + + # Get baseline memory + async with httpx.AsyncClient() as client: + baseline = (await client.get(f"{sse_server_url}/metrics")).json() + baseline_memory = baseline["memory_rss_mb"] + + # Start all clients + tasks = [asyncio.create_task(client_task(i)) for i in range(scale)] + + # Sample memory periodically + memory_samples: List[float] = [] + sample_interval = max(10, duration_minutes * 6) # At least 10 samples + + for _ in range(sample_interval): + await asyncio.sleep(duration_minutes * 60 / sample_interval) + try: + async with httpx.AsyncClient() as client: + metrics = (await client.get(f"{sse_server_url}/metrics")).json() + memory_samples.append(metrics["memory_rss_mb"]) + except 
Exception: + pass # Server might be under heavy load + + # Wait for all clients to complete + results = await asyncio.gather(*tasks, return_exceptions=True) + completed = sum(1 for r in results if isinstance(r, int)) + + # Get final memory + async with httpx.AsyncClient() as client: + final = (await client.get(f"{sse_server_url}/metrics")).json() + final_memory = final["memory_rss_mb"] + + # Calculate memory growth + max_memory = max(memory_samples) if memory_samples else final_memory + memory_growth = max_memory - baseline_memory + + # Calculate growth trend (simple linear regression slope) + if len(memory_samples) >= 2: + x_mean = len(memory_samples) / 2 + y_mean = statistics.mean(memory_samples) + numerator = sum( + (i - x_mean) * (y - y_mean) for i, y in enumerate(memory_samples) + ) + denominator = sum((i - x_mean) ** 2 for i in range(len(memory_samples))) + slope = numerator / denominator if denominator else 0 + # Convert to MB/sec + sample_interval_sec = duration_minutes * 60 / len(memory_samples) + slope_per_sec = slope / sample_interval_sec + else: + slope_per_sec = 0 + + # Assert criteria + assert ( + completed >= scale * 0.9 + ), f"Too many failed connections: {completed}/{scale} completed" + assert memory_growth < 50, ( + f"Memory grew by {memory_growth:.1f}MB (baseline: {baseline_memory:.1f}MB, " + f"max: {max_memory:.1f}MB), expected < 50MB" + ) + assert ( + slope_per_sec < 0.1 + ), f"Memory growth trend {slope_per_sec:.3f} MB/sec, expected < 0.1 MB/sec" + + +@pytest.mark.loadtest +async def test_memory_returns_to_baseline_after_disconnect( + sse_server_url: str, + scale: int, +) -> None: + """ + Connect many clients, disconnect all, verify memory returns near baseline. 
+ + Pass criteria: + - Memory within 20% of baseline after all connections close + """ + + async def client_task(client_id: int) -> None: + """Client that connects, receives few events, then disconnects.""" + try: + async with httpx.AsyncClient(timeout=30.0) as client: + async with aconnect_sse( + client, "GET", f"{sse_server_url}/sse?delay=0.01" + ) as source: + count = 0 + async for _ in source.aiter_sse(): + count += 1 + if count >= 50: + break + except Exception: + pass + + # Get baseline + async with httpx.AsyncClient() as client: + baseline = (await client.get(f"{sse_server_url}/metrics")).json() + baseline_memory = baseline["memory_rss_mb"] + + # Connect and disconnect clients in batches + batch_size = min(100, scale) + for batch_start in range(0, scale, batch_size): + batch_end = min(batch_start + batch_size, scale) + tasks = [ + asyncio.create_task(client_task(i)) for i in range(batch_start, batch_end) + ] + await asyncio.gather(*tasks, return_exceptions=True) + + # Wait for cleanup + await asyncio.sleep(2) + + # Check memory returned to near baseline + async with httpx.AsyncClient() as client: + final = (await client.get(f"{sse_server_url}/metrics")).json() + final_memory = final["memory_rss_mb"] + + # Allow 20% growth from baseline (some overhead is expected) + max_allowed = baseline_memory * 1.2 + assert final_memory <= max_allowed, ( + f"Memory did not return to baseline: {final_memory:.1f}MB " + f"(baseline: {baseline_memory:.1f}MB, max allowed: {max_allowed:.1f}MB)" + ) + + +@pytest.mark.loadtest +async def test_event_set_cleanup(sse_server_url: str, scale: int) -> None: + """ + Verify the internal event set empties after connections close. + + This tests the Issue #152 fix - events should be properly removed + from the thread-local state when connections close. 
+ """ + + connected = asyncio.Event() + connection_count = 0 + + async def client_task() -> None: + nonlocal connection_count + try: + async with httpx.AsyncClient(timeout=30.0) as client: + async with aconnect_sse( + client, "GET", f"{sse_server_url}/sse?delay=0.5" + ) as source: + connection_count += 1 + if connection_count >= scale * 0.5: + connected.set() + count = 0 + async for _ in source.aiter_sse(): + count += 1 + if count >= 5: # Stay connected for ~2.5s + break + except Exception: + pass + + # Get baseline event count + async with httpx.AsyncClient() as client: + baseline = (await client.get(f"{sse_server_url}/metrics")).json() + baseline_events = baseline["registered_events"] + + # Connect many clients + tasks = [asyncio.create_task(client_task()) for _ in range(scale)] + + # Wait for connections to establish (with timeout) + try: + await asyncio.wait_for(connected.wait(), timeout=10) + except asyncio.TimeoutError: + pass + await asyncio.sleep(0.5) # Extra margin + + # Check events registered during peak + async with httpx.AsyncClient() as client: + peak = (await client.get(f"{sse_server_url}/metrics")).json() + peak_events = peak["registered_events"] + + # Wait for all to complete + await asyncio.gather(*tasks, return_exceptions=True) + await asyncio.sleep(2) # Allow cleanup time + + # Check events cleaned up + async with httpx.AsyncClient() as client: + final = (await client.get(f"{sse_server_url}/metrics")).json() + final_events = final["registered_events"] + + # Events should have been registered during peak (relaxed threshold) + assert peak_events >= scale * 0.2, ( + f"Expected at least {scale * 0.2} events registered during peak, " + f"got {peak_events}" + ) + + # Events should be cleaned up after + assert final_events <= baseline_events + 10, ( + f"Event set not cleaned up: {final_events} events remaining " + f"(baseline: {baseline_events})" + ) diff --git a/tests/load/test_shutdown.py b/tests/load/test_shutdown.py new file mode 100644 index 
0000000..1bdf43d --- /dev/null +++ b/tests/load/test_shutdown.py @@ -0,0 +1,182 @@ +""" +Graceful shutdown tests under load. + +Verifies clean shutdown behavior with many active connections. +""" + +import asyncio +import signal +import time + +import httpx +import pytest +from httpx_sse import aconnect_sse + + +@pytest.mark.loadtest +async def test_graceful_shutdown_with_active_connections( + docker_available: bool, + scale: int, +) -> None: + """ + Send SIGTERM to server with active connections, verify clean shutdown. + + Pass criteria: + - Shutdown completes within 5 seconds + - All connections receive disconnect (no hanging clients) + """ + if not docker_available: + pytest.skip("Docker not available") + + from tests.load.conftest import SSELoadTestContainer + + container = SSELoadTestContainer() + container.start() + + # Wait for server ready + await asyncio.sleep(2) + base_url = container.get_base_url() + + # Verify server is up + async with httpx.AsyncClient() as client: + resp = await client.get(f"{base_url}/health") + assert resp.status_code == 200 + + disconnected = asyncio.Event() + connections_made = 0 + connections_closed = 0 + + async def client_task() -> str: + nonlocal connections_made, connections_closed + try: + async with httpx.AsyncClient(timeout=30.0) as client: + async with aconnect_sse( + client, "GET", f"{base_url}/sse?delay=0.1" + ) as source: + connections_made += 1 + async for _ in source.aiter_sse(): + if disconnected.is_set(): + break + connections_closed += 1 + return "clean_close" + except httpx.RemoteProtocolError: + connections_closed += 1 + return "server_closed" + except Exception as e: + connections_closed += 1 + return f"error:{type(e).__name__}" + + # Start clients + tasks = [asyncio.create_task(client_task()) for _ in range(scale)] + + # Wait for connections to establish + await asyncio.sleep(2) + + # Send SIGTERM to container + start_shutdown = time.perf_counter() + 
container.get_wrapped_container().kill(signal=signal.SIGTERM) + + # Wait for shutdown + shutdown_timeout = 10 + try: + results = await asyncio.wait_for( + asyncio.gather(*tasks, return_exceptions=True), + timeout=shutdown_timeout, + ) + except asyncio.TimeoutError: + # Cancel remaining tasks + for task in tasks: + task.cancel() + results = await asyncio.gather(*tasks, return_exceptions=True) + + shutdown_time = time.perf_counter() - start_shutdown + + # Cleanup container + try: + container.stop() + except Exception: + pass + + # Analyze results + clean_closes = sum(1 for r in results if r == "clean_close") + server_closes = sum(1 for r in results if r == "server_closed") + errors = sum(1 for r in results if isinstance(r, str) and r.startswith("error:")) + + # All connections should have closed (one way or another) + total_closed = clean_closes + server_closes + errors + assert ( + total_closed >= scale * 0.9 + ), f"Not all connections closed: {total_closed}/{scale}" + + # Shutdown should be fast + assert shutdown_time < 10, f"Shutdown took {shutdown_time:.1f}s, expected < 10s" + + +@pytest.mark.loadtest +async def test_connections_receive_shutdown_signal( + docker_available: bool, +) -> None: + """ + Verify connections are notified of shutdown via SSE. + + When AppStatus.should_exit is set, active streams should terminate gracefully. 
+ """ + if not docker_available: + pytest.skip("Docker not available") + + from tests.load.conftest import SSELoadTestContainer + + container = SSELoadTestContainer() + container.start() + + await asyncio.sleep(2) + base_url = container.get_base_url() + + # Connect clients that will wait for events + async def client_task() -> int: + count = 0 + try: + async with httpx.AsyncClient(timeout=30.0) as client: + async with aconnect_sse( + client, "GET", f"{base_url}/sse?delay=0.5" + ) as source: + async for _ in source.aiter_sse(): + count += 1 + if count >= 20: # Should not reach this + break + except Exception: + pass + return count + + tasks = [asyncio.create_task(client_task()) for _ in range(10)] + + # Let them receive a few events + await asyncio.sleep(3) + + # Kill the server + container.get_wrapped_container().kill(signal=signal.SIGTERM) + + # Gather results + try: + results = await asyncio.wait_for( + asyncio.gather(*tasks, return_exceptions=True), + timeout=10, + ) + except asyncio.TimeoutError: + for task in tasks: + task.cancel() + results = await asyncio.gather(*tasks, return_exceptions=True) + + try: + container.stop() + except Exception: + pass + + # Clients should have received some events before shutdown + event_counts = [r for r in results if isinstance(r, int)] + total_events = sum(event_counts) + + assert total_events > 0, "Clients should have received events before shutdown" + assert all( + c < 20 for c in event_counts + ), "Clients should have been interrupted by shutdown" diff --git a/tests/load/test_throughput.py b/tests/load/test_throughput.py new file mode 100644 index 0000000..22d8407 --- /dev/null +++ b/tests/load/test_throughput.py @@ -0,0 +1,185 @@ +""" +Throughput and latency tests for sse-starlette. + +Measures events per second, latency percentiles, and first event latency. 
+""" + +import asyncio +import time +from typing import List + +import httpx +import pytest +from httpx_sse import aconnect_sse + + +@pytest.mark.loadtest +async def test_throughput_single_client(sse_server_url: str) -> None: + """ + Measure maximum throughput for a single client. + + Baseline measurement without contention. + """ + events_received = 0 + start_time = time.perf_counter() + duration_seconds = 10 + + async with httpx.AsyncClient(timeout=60.0) as client: + async with aconnect_sse( + client, "GET", f"{sse_server_url}/sse?delay=0" + ) as source: + async for _ in source.aiter_sse(): + events_received += 1 + if time.perf_counter() - start_time >= duration_seconds: + break + + elapsed = time.perf_counter() - start_time + throughput = events_received / elapsed + + # Should achieve at least 1000 events/sec for a single client + assert ( + throughput >= 1000 + ), f"Single client throughput {throughput:.0f} events/sec, expected >= 1000" + + +@pytest.mark.loadtest +async def test_throughput_multiple_clients( + sse_server_url: str, + scale: int, +) -> None: + """ + Measure aggregate throughput with multiple concurrent clients. 
+ + Pass criteria: + - Aggregate throughput > 10,000 events/sec + """ + duration_seconds = 30 + + async def client_task() -> int: + count = 0 + start = time.perf_counter() + try: + async with httpx.AsyncClient(timeout=60.0) as client: + async with aconnect_sse( + client, "GET", f"{sse_server_url}/sse?delay=0.001" + ) as source: + async for _ in source.aiter_sse(): + count += 1 + if time.perf_counter() - start >= duration_seconds: + break + except Exception: + pass + return count + + start_time = time.perf_counter() + tasks = [asyncio.create_task(client_task()) for _ in range(scale)] + results = await asyncio.gather(*tasks, return_exceptions=True) + elapsed = time.perf_counter() - start_time + + total_events = sum(r for r in results if isinstance(r, int)) + aggregate_throughput = total_events / elapsed + + # With scale clients, should achieve high aggregate throughput + min_expected = min(10000, scale * 100) # Scale expectation with client count + assert aggregate_throughput >= min_expected, ( + f"Aggregate throughput {aggregate_throughput:.0f} events/sec with {scale} " + f"clients, expected >= {min_expected}" + ) + + +@pytest.mark.loadtest +async def test_first_event_latency( + sse_server_url: str, + scale: int, +) -> None: + """ + Measure time to first event (TTFE) for multiple connections. 
+ + Pass criteria (relaxed for Docker overhead): + - p50 TTFE < 2000ms + - p99 TTFE < 5000ms + """ + latencies: List[float] = [] + + async def measure_ttfe() -> float: + start = time.perf_counter() + try: + async with httpx.AsyncClient(timeout=30.0) as client: + async with aconnect_sse( + client, "GET", f"{sse_server_url}/sse?delay=0" + ) as source: + async for _ in source.aiter_sse(): + return (time.perf_counter() - start) * 1000 # ms + except Exception: + return -1 + return -1 + + tasks = [asyncio.create_task(measure_ttfe()) for _ in range(scale)] + results = await asyncio.gather(*tasks) + + latencies = [r for r in results if r > 0] + + if len(latencies) < scale * 0.9: + pytest.fail(f"Too many failed connections: {len(latencies)}/{scale}") + + latencies.sort() + p50 = latencies[len(latencies) // 2] + p99 = latencies[int(len(latencies) * 0.99)] + + # Relaxed thresholds: Docker networking + container overhead + assert p50 < 2000, f"p50 TTFE {p50:.1f}ms, expected < 2000ms" + assert p99 < 5000, f"p99 TTFE {p99:.1f}ms, expected < 5000ms" + + +@pytest.mark.loadtest +async def test_event_latency_under_load( + sse_server_url: str, + scale: int, +) -> None: + """ + Measure event-to-event latency under load. + + Captures latency between consecutive events to detect backpressure. 
+ """ + all_latencies: List[float] = [] + + async def measure_latencies() -> List[float]: + latencies: List[float] = [] + last_time = None + try: + async with httpx.AsyncClient(timeout=60.0) as client: + async with aconnect_sse( + client, "GET", f"{sse_server_url}/sse?delay=0.01" + ) as source: + count = 0 + async for _ in source.aiter_sse(): + now = time.perf_counter() + if last_time is not None: + latencies.append((now - last_time) * 1000) + last_time = now + count += 1 + if count >= 100: + break + except Exception: + pass + return latencies + + tasks = [asyncio.create_task(measure_latencies()) for _ in range(scale)] + results = await asyncio.gather(*tasks) + + for client_latencies in results: + all_latencies.extend(client_latencies) + + if len(all_latencies) < 100: + pytest.fail(f"Insufficient latency samples: {len(all_latencies)}") + + all_latencies.sort() + p50 = all_latencies[len(all_latencies) // 2] + p95 = all_latencies[int(len(all_latencies) * 0.95)] + p99 = all_latencies[int(len(all_latencies) * 0.99)] + + # Expected ~10ms between events (0.01s delay) + # Allow 2x for processing overhead under load + assert p50 < 50, f"p50 inter-event latency {p50:.1f}ms, expected < 50ms" + assert p95 < 100, f"p95 inter-event latency {p95:.1f}ms, expected < 100ms" + assert p99 < 200, f"p99 inter-event latency {p99:.1f}ms, expected < 200ms" diff --git a/tests/load/test_watcher_scale.py b/tests/load/test_watcher_scale.py new file mode 100644 index 0000000..3a4d150 --- /dev/null +++ b/tests/load/test_watcher_scale.py @@ -0,0 +1,159 @@ +""" +Watcher deduplication tests at scale. + +Validates the Issue #152 fix: only one watcher task per thread regardless +of the number of concurrent connections. 
+""" + +import asyncio + +import httpx +import pytest +from httpx_sse import aconnect_sse + + +@pytest.mark.loadtest +async def test_single_watcher_with_many_connections( + sse_server_url: str, + scale: int, +) -> None: + """ + With N concurrent connections, verify only 1 watcher is running. + + This is the core regression test for Issue #152. + """ + + async def client_task() -> None: + try: + async with httpx.AsyncClient(timeout=30.0) as client: + async with aconnect_sse( + client, "GET", f"{sse_server_url}/sse?delay=0.1" + ) as source: + async for _ in source.aiter_sse(): + await asyncio.sleep(5) # Stay connected + break + except Exception: + pass + + # Start many connections + tasks = [asyncio.create_task(client_task()) for _ in range(scale)] + + # Wait for connections to establish + await asyncio.sleep(2) + + # Check watcher status + async with httpx.AsyncClient() as client: + metrics = (await client.get(f"{sse_server_url}/metrics")).json() + + watcher_started = metrics["watcher_started"] + registered_events = metrics["registered_events"] + + # Cancel all tasks + for task in tasks: + task.cancel() + await asyncio.gather(*tasks, return_exceptions=True) + + # Watcher should be running + assert watcher_started is True, "Watcher should be started with active connections" + + # Should have many events registered (one per connection) + assert ( + registered_events >= scale * 0.5 + ), f"Expected at least {scale * 0.5} events, got {registered_events}" + + +@pytest.mark.loadtest +async def test_rapid_connect_disconnect_watcher_stability( + sse_server_url: str, + scale: int, +) -> None: + """ + Rapid connect/disconnect cycles should not accumulate watchers. + + Each connect/disconnect should reuse the existing watcher, not spawn new ones. 
+ """ + + async def quick_connect() -> None: + try: + async with httpx.AsyncClient(timeout=10.0) as client: + async with aconnect_sse( + client, "GET", f"{sse_server_url}/sse?delay=0.01" + ) as source: + async for _ in source.aiter_sse(): + break # Disconnect after first event + except Exception: + pass + + # Rapid connect/disconnect cycles + for batch in range(scale // 10): + tasks = [asyncio.create_task(quick_connect()) for _ in range(10)] + await asyncio.gather(*tasks, return_exceptions=True) + + # Brief pause + await asyncio.sleep(0.5) + + # Check metrics - watcher should still be singular + async with httpx.AsyncClient() as client: + metrics = (await client.get(f"{sse_server_url}/metrics")).json() + + # The watcher_started flag confirms single watcher pattern + # If multiple watchers had spawned, we'd see resource issues + num_threads = metrics["num_threads"] + + # Thread count should be reasonable (not proportional to connection count) + # A healthy uvicorn worker has ~5-10 threads typically + assert num_threads < 50, f"Too many threads ({num_threads}), possible watcher leak" + + +@pytest.mark.loadtest +async def test_watcher_cleanup_allows_restart(sse_server_url: str) -> None: + """ + After all connections close, new connections should start fresh watcher. + + Tests the watcher lifecycle: start -> broadcast -> cleanup -> restart. 
+ """ + + async def connect_and_consume(n_events: int) -> int: + count = 0 + try: + async with httpx.AsyncClient(timeout=30.0) as client: + async with aconnect_sse( + client, "GET", f"{sse_server_url}/sse?delay=0.05" + ) as source: + async for _ in source.aiter_sse(): + count += 1 + if count >= n_events: + break + except Exception: + pass + return count + + # Phase 1: Connect, consume, disconnect + tasks = [asyncio.create_task(connect_and_consume(20)) for _ in range(50)] + results = await asyncio.gather(*tasks) + assert sum(results) > 0, "Phase 1 should have received events" + + # Wait for cleanup + await asyncio.sleep(1) + + # Check state is clean + async with httpx.AsyncClient() as client: + metrics1 = (await client.get(f"{sse_server_url}/metrics")).json() + events_after_phase1 = metrics1["registered_events"] + + # Phase 2: New connections should work + tasks = [asyncio.create_task(connect_and_consume(20)) for _ in range(50)] + results = await asyncio.gather(*tasks) + assert sum(results) > 0, "Phase 2 should have received events" + + # Wait for cleanup + await asyncio.sleep(1) + + # Verify clean state + async with httpx.AsyncClient() as client: + metrics2 = (await client.get(f"{sse_server_url}/metrics")).json() + + # Events should be cleaned up after both phases + assert ( + metrics2["registered_events"] <= events_after_phase1 + 5 + ), "Event set should be cleaned up between phases" diff --git a/uv.lock b/uv.lock index a2ba9db..cf95af6 100644 --- a/uv.lock +++ b/uv.lock @@ -1294,6 +1294,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/2a/39/e50c7c3a983047577ee07d2a9e53faf5a69493943ec3f6a384bdc792deb2/httpx-0.28.1-py3-none-any.whl", hash = "sha256:d909fcccc110f8c7faf814ca82a9a4d816bc5a6dbfea25d6591d6985b8ba59ad", size = 73517, upload_time = "2024-12-06T15:37:21.509Z" }, ] +[[package]] +name = "httpx-sse" +version = "0.4.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/0f/4c/751061ffa58615a32c31b2d82e8482be8dd4a89154f003147acee90f2be9/httpx_sse-0.4.3.tar.gz", hash = "sha256:9b1ed0127459a66014aec3c56bebd93da3c1bc8bb6618c8082039a44889a755d", size = 15943, upload_time = "2025-10-10T21:48:22.271Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d2/fd/6668e5aec43ab844de6fc74927e155a3b37bf40d7c3790e49fc0406b6578/httpx_sse-0.4.3-py3-none-any.whl", hash = "sha256:0ac1c9fe3c0afad2e0ebb25a934a59f4c7823b60792691f779fad2c5568830fc", size = 8960, upload_time = "2025-10-10T21:48:21.158Z" }, +] + [[package]] name = "hyperlink" version = "21.0.0" @@ -2347,6 +2356,10 @@ granian = [ { name = "granian", version = "2.5.7", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, { name = "granian", version = "2.6.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, ] +loadtest = [ + { name = "httpx-sse" }, + { name = "psutil" }, +] uvicorn = [ { name = "uvicorn" }, ] @@ -2357,6 +2370,7 @@ dev = [ { name = "async-timeout" }, { name = "build" }, { name = "httpx" }, + { name = "httpx-sse" }, { name = "mypy" }, { name = "portend" }, { name = "pre-commit", version = "4.3.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, @@ -2383,12 +2397,14 @@ requires-dist = [ { name = "daphne", marker = "extra == 'daphne'", specifier = ">=4.2.0" }, { name = "fastapi", marker = "extra == 'examples'", specifier = ">=0.115.12" }, { name = "granian", marker = "extra == 'granian'", specifier = ">=2.3.1" }, + { name = "httpx-sse", marker = "extra == 'loadtest'", specifier = ">=0.4.0" }, + { name = "psutil", marker = "extra == 'loadtest'", specifier = ">=6.1.1" }, { name = "sqlalchemy", extras = ["asyncio"], marker = "extra == 'examples'", specifier = ">=2.0.41" }, { name = "starlette", specifier = ">=0.49.1" }, { name = "uvicorn", marker = "extra == 'examples'", specifier = ">=0.34.0" }, { 
name = "uvicorn", marker = "extra == 'uvicorn'", specifier = ">=0.34.0" }, ] -provides-extras = ["examples", "uvicorn", "granian", "daphne"] +provides-extras = ["examples", "uvicorn", "granian", "daphne", "loadtest"] [package.metadata.requires-dev] dev = [ @@ -2396,6 +2412,7 @@ dev = [ { name = "async-timeout", specifier = ">=5.0.1" }, { name = "build", specifier = ">=1.2.2.post1" }, { name = "httpx", specifier = ">=0.28.1" }, + { name = "httpx-sse", specifier = ">=0.4.0" }, { name = "mypy", specifier = ">=1.14.0" }, { name = "portend", specifier = ">=3.2.0" }, { name = "pre-commit", specifier = ">=4.0.0" }, From 03255553be096bd769713c9d5c119e825f912edd Mon Sep 17 00:00:00 2001 From: sysid Date: Fri, 2 Jan 2026 12:33:17 +0100 Subject: [PATCH 3/4] feat(loadtest): add metrics infrastructure with baseline comparison Add structured metrics collection, baseline management, and HTML/JSON reporting to load tests. Tests now produce observable performance data instead of just pass/fail results. New components: - MetricsCollector: aggregates latency, memory, throughput samples - BaselineManager: stores/loads per-test baselines, detects regressions - ReportGenerator: produces JSON reports and self-contained HTML with inline SVG charts All 15 load tests updated with: - Metrics collection integration - Structured docstrings explaining what/why/how for each test - Baseline comparison and optional regression detection CLI options added: --update-baseline, --fail-on-regression, --output-dir, --baselines-dir, --regression-threshold GitHub workflow updated with baseline update and regression detection inputs, plus artifact upload for reports. 
--- .github/workflows/load-test.yml | 38 +- .gitignore | 4 + tests/load/README.md | 228 +++++++++++ tests/load/baseline.py | 338 +++++++++++++++ tests/load/baselines/.gitkeep | 0 tests/load/conftest.py | 115 +++++- tests/load/metrics.py | 422 +++++++++++++++++++ tests/load/reporter.py | 615 ++++++++++++++++++++++++++++ tests/load/test_backpressure.py | 349 +++++++++++++--- tests/load/test_memory_stability.py | 341 ++++++++++++--- tests/load/test_shutdown.py | 211 ++++++++-- tests/load/test_throughput.py | 342 ++++++++++++++-- tests/load/test_watcher_scale.py | 323 +++++++++++++-- 13 files changed, 3101 insertions(+), 225 deletions(-) create mode 100644 tests/load/README.md create mode 100644 tests/load/baseline.py create mode 100644 tests/load/baselines/.gitkeep create mode 100644 tests/load/metrics.py create mode 100644 tests/load/reporter.py diff --git a/.github/workflows/load-test.yml b/.github/workflows/load-test.yml index 59eeb79..ec3691e 100644 --- a/.github/workflows/load-test.yml +++ b/.github/workflows/load-test.yml @@ -21,6 +21,16 @@ on: - '1' - '5' - '10' + update_baseline: + description: 'Update baselines after run' + required: false + default: false + type: boolean + fail_on_regression: + description: 'Fail if regression detected' + required: false + default: false + type: boolean jobs: load-test: @@ -51,6 +61,9 @@ jobs: python -m pytest tests/load/ -m "loadtest" \ --scale=${{ inputs.scale }} \ --duration=${{ inputs.duration }} \ + --output-dir=tests/load/results \ + ${{ inputs.update_baseline && '--update-baseline' || '' }} \ + ${{ inputs.fail_on_regression && '--fail-on-regression' || '' }} \ -v --tb=short \ --junitxml=load-test-results.xml @@ -58,11 +71,21 @@ jobs: uses: actions/upload-artifact@v4 if: always() with: - name: load-test-results + name: load-test-results-${{ github.sha }} path: | load-test-results.xml + tests/load/results/*.json + tests/load/results/*.html retention-days: 30 + - name: Upload updated baselines + uses: 
actions/upload-artifact@v4 + if: inputs.update_baseline + with: + name: updated-baselines-${{ github.sha }} + path: tests/load/baselines/*.json + retention-days: 90 + - name: Test Summary if: always() run: | @@ -70,7 +93,16 @@ jobs: echo "" >> $GITHUB_STEP_SUMMARY echo "- **Scale**: ${{ inputs.scale }} concurrent connections" >> $GITHUB_STEP_SUMMARY echo "- **Duration**: ${{ inputs.duration }} minutes" >> $GITHUB_STEP_SUMMARY + echo "- **Commit**: ${{ github.sha }}" >> $GITHUB_STEP_SUMMARY echo "" >> $GITHUB_STEP_SUMMARY - if [ -f load-test-results.xml ]; then - echo "Test results uploaded as artifact." >> $GITHUB_STEP_SUMMARY + echo "### Reports" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + if [ -d tests/load/results ]; then + for f in tests/load/results/*.json; do + if [ -f "$f" ]; then + echo "- $(basename "$f")" >> $GITHUB_STEP_SUMMARY + fi + done fi + echo "" >> $GITHUB_STEP_SUMMARY + echo "Download artifacts for detailed HTML reports with charts." >> $GITHUB_STEP_SUMMARY diff --git a/.gitignore b/.gitignore index 02740bb..6f1e208 100644 --- a/.gitignore +++ b/.gitignore @@ -15,3 +15,7 @@ venv Pipfile.lock .envrc .pdm-python + +# Load test results (generated, not tracked) +tests/load/results/ +!tests/load/results/.gitkeep diff --git a/tests/load/README.md b/tests/load/README.md new file mode 100644 index 0000000..13332a5 --- /dev/null +++ b/tests/load/README.md @@ -0,0 +1,228 @@ +# SSE-Starlette Load Tests + +Performance and stability tests for the SSE implementation under realistic load conditions. 
+
+## Overview
+
+These tests measure performance characteristics that unit tests cannot capture:
+- Throughput under concurrent load
+- Memory stability over time
+- Resource cleanup after disconnections
+- Graceful shutdown behavior
+- Backpressure handling
+
+## Quick Start
+
+```bash
+# Run load tests locally (requires Docker)
+make test-load
+
+# Run with custom scale
+make test-load PYTEST_ARGS="--scale=500 --duration=5"
+
+# Update baselines after intentional changes
+make test-load PYTEST_ARGS="--update-baseline"
+```
+
+## Architecture
+
+```
+tests/load/
+├── conftest.py          # Fixtures, CLI options, Docker container setup
+├── metrics.py           # MetricsCollector, statistics computation
+├── baseline.py          # BaselineManager, regression detection
+├── reporter.py          # JSON + HTML report generation
+├── server_app.py        # Test server with /sse and /metrics endpoints
+├── Dockerfile.loadtest  # Container for isolated server testing
+├── baselines/           # Git-tracked baseline files (*.json)
+├── results/             # Generated reports (gitignored)
+└── test_*.py            # Test modules
+```
+
+## KPI Persistence & Baselining
+
+### How It Works
+
+1. **During Test Run**: `MetricsCollector` aggregates samples (latencies, memory, events)
+2. **After Test**: Statistics computed (p50/p95/p99, mean, stdev, slopes)
+3. **Report Generation**: JSON file saved to `tests/load/results/<test_name>.json`
+4. **Baseline Comparison**: Current run compared against `tests/load/baselines/<test_name>.json`
+5. **Regression Detection**: Percent changes flagged if exceeding thresholds
+
+### Baseline Files
+
+Baselines are **git-tracked** so changes are visible in PRs:
+
+```
+tests/load/baselines/
+├── test_throughput_single_client.json
+├── test_memory_stability_under_load.json
+└── ...
+``` + +Each baseline contains: +```json +{ + "test_name": "test_throughput_single_client", + "timestamp": "2024-01-15T14:30:00Z", + "git_commit": "abc1234", + "throughput": { + "aggregate_events_per_sec": 12456.7, + "per_client_events_per_sec": [12456.7] + }, + "latency": { "p50_ms": 14.8, "p95_ms": 21.4, "p99_ms": 27.4 }, + "memory": { "baseline_mb": 45.2, "peak_mb": 67.8, "growth_mb": 22.6 } +} +``` + +### Updating Baselines + +```bash +# After intentional performance changes (optimization, new features) +make test-load PYTEST_ARGS="--update-baseline" + +# Then commit the updated baseline files +git add tests/load/baselines/ +git commit -m "Update load test baselines after optimization" +``` + +### Regression Detection + +| Metric | Warning Threshold | Fail Threshold | +|--------|-------------------|----------------| +| Latency p99 | +20% | +50% | +| Throughput | -20% | - | +| Memory growth | +50% | - | +| Memory slope | - | >0.1 MB/sec | +| Error rate | - | >5% | + +Enable in CI: +```bash +make test-load PYTEST_ARGS="--fail-on-regression" +``` + +## CLI Options + +| Option | Default | Description | +|--------|---------|-------------| +| `--scale` | 100 | Concurrent connections | +| `--duration` | 1 | Test duration (minutes) | +| `--output-dir` | `tests/load/results` | Report output directory | +| `--baselines-dir` | `tests/load/baselines` | Baseline file directory | +| `--update-baseline` | False | Save current run as new baseline | +| `--fail-on-regression` | False | Exit non-zero if regression detected | +| `--regression-threshold` | 20 | Percent change to trigger warning | + +## Test Categories + +### Throughput (`test_throughput.py`) +- Single client maximum throughput (baseline without contention) +- Multi-client aggregate throughput (scaling behavior) +- Time to first event (connection setup latency) +- Inter-event latency under load (backpressure detection) + +### Memory Stability (`test_memory_stability.py`) +- Memory growth during sustained streaming 
+- Memory reclamation after disconnect
+- Event set cleanup (Issue #152 regression)
+
+### Watcher Scale (`test_watcher_scale.py`)
+- Single watcher with many connections (Issue #152 core test)
+- Watcher stability under rapid churn
+- Watcher lifecycle (start → broadcast → cleanup → restart)
+
+### Shutdown (`test_shutdown.py`)
+- Graceful shutdown timing with active connections
+- Shutdown signal propagation to streams
+
+### Backpressure (`test_backpressure.py`)
+- Slow client isolation (fast clients unaffected)
+- Resource stability under connection churn
+- send_timeout behavior with frozen clients
+
+## Limitations: What These Tests Don't Cover
+
+### Not Measured
+
+1. **True Production Scale**
+   - Tests run at 100-1000 connections; production may see 10K+
+   - Resource contention patterns differ at extreme scale
+   - OS-level limits (ulimit, ephemeral ports) not tested
+
+2. **Network Conditions**
+   - Tests run on localhost/Docker bridge
+   - No simulation of latency, packet loss, or bandwidth limits
+   - Real network jitter not captured
+
+3. **Long-Running Stability**
+   - Tests run for minutes; production runs for days/weeks
+   - Slow leaks (bytes/hour) may not appear in short tests
+   - GC pressure patterns differ over extended periods
+
+4. **CPU Profiling**
+   - No measurement of CPU cycles per event
+   - Hot path optimization regressions not detected
+   - Async scheduler overhead not isolated
+
+5. **Multi-Process/Multi-Node**
+   - Tests run single uvicorn process
+   - No testing of gunicorn worker coordination
+   - No distributed load balancer behavior
+
+6. **Client Diversity**
+   - All clients use httpx (same HTTP/1.1 implementation)
+   - No HTTP/2 or HTTP/3 testing
+   - No browser-specific SSE behavior (reconnection, Last-Event-ID)
+
+7. 
**Garbage Collection Impact** + - Python GC pauses not isolated + - Memory pressure from other processes not simulated + - Different GC generations not separately measured + +### Potential Blind Spots + +| Regression Type | Detection Gap | +|-----------------|---------------| +| 5% throughput drop | Below noise floor | +| Sub-millisecond latency spikes | Averaged out in percentiles | +| Memory leak < 1KB/connection | Too slow to appear in test duration | +| CPU regression without throughput impact | Not measured | +| Thread pool exhaustion at >1000 connections | Scale not tested | +| Event loop blocking < 10ms | Within jitter tolerance | + +### Recommendations for Production + +1. **APM Integration**: Use Datadog/NewRelic for continuous production metrics +2. **Synthetic Monitoring**: Run periodic load tests against staging +3. **Canary Deployments**: Compare metrics between old/new versions +4. **Memory Profiling**: Run tracemalloc in staging for leak detection +5. **CPU Profiling**: Use py-spy periodically to catch hot path regressions + +## Report Outputs + +### JSON Report +Full structured data for programmatic analysis: +``` +tests/load/results/test_throughput_single_client.json +``` + +### HTML Report +Self-contained visualization with inline SVG charts: +``` +tests/load/results/test_throughput_single_client.html +``` + +Features: +- Summary metrics table +- Memory usage over time chart +- Latency distribution (when applicable) +- Comparison against baseline with delta percentages +- Regression/warning highlights + +## GitHub Actions Integration + +The workflow (`.github/workflows/load-test.yml`) supports: +- Manual trigger with scale/duration inputs +- Baseline update option +- Regression detection for CI gates +- Artifact upload for reports diff --git a/tests/load/baseline.py b/tests/load/baseline.py new file mode 100644 index 0000000..4423b79 --- /dev/null +++ b/tests/load/baseline.py @@ -0,0 +1,338 @@ +""" +Baseline management for load test metrics. 
+ +Handles loading, saving, and comparing performance baselines. +""" + +from __future__ import annotations + +import json +from dataclasses import dataclass +from pathlib import Path +from typing import Any + +from .metrics import ( + LatencyStats, + MemoryStats, + ReliabilityStats, + SSEInternals, + TestReport, + ThroughputStats, +) + + +@dataclass +class ComparisonResult: + """Result of comparing current run against baseline.""" + + # Percent changes (positive = worse for latency/memory, negative = worse for throughput) + latency_p99_change_pct: float | None = None + latency_p50_change_pct: float | None = None + ttfe_p99_change_pct: float | None = None + throughput_change_pct: float | None = None + memory_growth_change_pct: float | None = None + memory_slope_change_pct: float | None = None + error_rate_change_pct: float | None = None + + # Regression detection + regression_detected: bool = False + regression_reasons: list[str] | None = None + warnings: list[str] | None = None + + # Baseline info + baseline_commit: str | None = None + baseline_timestamp: str | None = None + + def to_dict(self) -> dict[str, Any]: + """Convert to JSON-serializable dict.""" + result: dict[str, Any] = {} + + if self.baseline_commit: + result["baseline_commit"] = self.baseline_commit + if self.baseline_timestamp: + result["baseline_timestamp"] = self.baseline_timestamp + + if self.latency_p99_change_pct is not None: + result["latency_p99_change_pct"] = round(self.latency_p99_change_pct, 2) + if self.latency_p50_change_pct is not None: + result["latency_p50_change_pct"] = round(self.latency_p50_change_pct, 2) + if self.ttfe_p99_change_pct is not None: + result["ttfe_p99_change_pct"] = round(self.ttfe_p99_change_pct, 2) + if self.throughput_change_pct is not None: + result["throughput_change_pct"] = round(self.throughput_change_pct, 2) + if self.memory_growth_change_pct is not None: + result["memory_growth_change_pct"] = round(self.memory_growth_change_pct, 2) + if 
self.memory_slope_change_pct is not None: + result["memory_slope_change_pct"] = round(self.memory_slope_change_pct, 2) + if self.error_rate_change_pct is not None: + result["error_rate_change_pct"] = round(self.error_rate_change_pct, 2) + + result["regression_detected"] = self.regression_detected + if self.regression_reasons: + result["regression_reasons"] = self.regression_reasons + if self.warnings: + result["warnings"] = self.warnings + + return result + + +# Default thresholds for regression detection +DEFAULT_THRESHOLDS = { + "latency_p99_warning_pct": 20.0, + "latency_p99_fail_pct": 50.0, + "throughput_warning_pct": -20.0, # Negative = decrease + "memory_growth_warning_pct": 50.0, + "memory_slope_fail": 0.1, # MB/sec absolute threshold + "error_rate_fail_pct": 5.0, # Absolute percentage +} + + +class BaselineManager: + """Manages per-test baselines for comparison.""" + + def __init__( + self, + baselines_dir: Path | str = "tests/load/baselines", + thresholds: dict[str, float] | None = None, + ): + self.baselines_dir = Path(baselines_dir) + self.thresholds = thresholds or DEFAULT_THRESHOLDS + + def _baseline_path(self, test_name: str) -> Path: + """Get path to baseline file for a test.""" + # Sanitize test name for filename + safe_name = test_name.replace("::", "_").replace("/", "_").replace("\\", "_") + return self.baselines_dir / f"{safe_name}.json" + + def load_baseline(self, test_name: str) -> TestReport | None: + """Load baseline for a specific test.""" + path = self._baseline_path(test_name) + if not path.exists(): + return None + + try: + with open(path) as f: + data = json.load(f) + return _dict_to_report(data) + except (json.JSONDecodeError, KeyError, TypeError): + return None + + def save_baseline(self, report: TestReport) -> Path: + """Save report as new baseline.""" + self.baselines_dir.mkdir(parents=True, exist_ok=True) + path = self._baseline_path(report.test_name) + + with open(path, "w") as f: + json.dump(report.to_dict(), f, indent=2) + + 
return path + + def compare( + self, current: TestReport, baseline: TestReport | None = None + ) -> ComparisonResult: + """Compare current run against baseline.""" + if baseline is None: + baseline = self.load_baseline(current.test_name) + + if baseline is None: + return ComparisonResult() + + result = ComparisonResult( + baseline_commit=baseline.git_commit, + baseline_timestamp=baseline.timestamp, + ) + warnings: list[str] = [] + regressions: list[str] = [] + + # Compare latency p99 + if current.latency and baseline.latency: + if baseline.latency.p99_ms > 0: + change = ( + (current.latency.p99_ms - baseline.latency.p99_ms) + / baseline.latency.p99_ms + * 100 + ) + result.latency_p99_change_pct = change + if change > self.thresholds["latency_p99_fail_pct"]: + regressions.append( + f"Latency p99 increased by {change:.1f}% " + f"(>{self.thresholds['latency_p99_fail_pct']}%)" + ) + elif change > self.thresholds["latency_p99_warning_pct"]: + warnings.append(f"Latency p99 increased by {change:.1f}%") + + if baseline.latency.p50_ms > 0: + change = ( + (current.latency.p50_ms - baseline.latency.p50_ms) + / baseline.latency.p50_ms + * 100 + ) + result.latency_p50_change_pct = change + + # Compare TTFE p99 + if current.ttfe and baseline.ttfe: + if baseline.ttfe.p99_ms > 0: + change = ( + (current.ttfe.p99_ms - baseline.ttfe.p99_ms) + / baseline.ttfe.p99_ms + * 100 + ) + result.ttfe_p99_change_pct = change + + # Compare throughput + if current.throughput and baseline.throughput: + if baseline.throughput.aggregate_events_per_sec > 0: + change = ( + ( + current.throughput.aggregate_events_per_sec + - baseline.throughput.aggregate_events_per_sec + ) + / baseline.throughput.aggregate_events_per_sec + * 100 + ) + result.throughput_change_pct = change + if change < self.thresholds["throughput_warning_pct"]: + warnings.append(f"Throughput decreased by {abs(change):.1f}%") + + # Compare memory growth + if current.memory and baseline.memory: + if baseline.memory.growth_mb > 0: + 
change = ( + (current.memory.growth_mb - baseline.memory.growth_mb) + / baseline.memory.growth_mb + * 100 + ) + result.memory_growth_change_pct = change + if change > self.thresholds["memory_growth_warning_pct"]: + warnings.append(f"Memory growth increased by {change:.1f}%") + + # Memory slope absolute check + if current.memory.slope_mb_per_sec > self.thresholds["memory_slope_fail"]: + regressions.append( + f"Memory slope {current.memory.slope_mb_per_sec:.3f} MB/sec " + f"exceeds threshold {self.thresholds['memory_slope_fail']} MB/sec" + ) + + if baseline.memory.slope_mb_per_sec > 0: + change = ( + (current.memory.slope_mb_per_sec - baseline.memory.slope_mb_per_sec) + / baseline.memory.slope_mb_per_sec + * 100 + ) + result.memory_slope_change_pct = change + + # Compare error rate + if current.reliability and baseline.reliability: + change = ( + current.reliability.error_rate - baseline.reliability.error_rate + ) * 100 + result.error_rate_change_pct = change + + if ( + current.reliability.error_rate * 100 + > self.thresholds["error_rate_fail_pct"] + ): + regressions.append( + f"Error rate {current.reliability.error_rate * 100:.1f}% " + f"exceeds threshold {self.thresholds['error_rate_fail_pct']}%" + ) + + result.regression_detected = len(regressions) > 0 + result.regression_reasons = regressions if regressions else None + result.warnings = warnings if warnings else None + + return result + + +def _dict_to_report(data: dict[str, Any]) -> TestReport: + """Convert JSON dict back to TestReport.""" + metadata = data.get("metadata", data) + + # Reconstruct latency stats + latency = None + if "latency" in data: + lat = data["latency"] + latency = LatencyStats( + p50_ms=lat["p50_ms"], + p90_ms=lat["p90_ms"], + p95_ms=lat["p95_ms"], + p99_ms=lat["p99_ms"], + max_ms=lat["max_ms"], + min_ms=lat.get("min_ms", 0.0), + mean_ms=lat["mean_ms"], + stdev_ms=lat["stdev_ms"], + sample_count=lat["sample_count"], + ) + + ttfe = None + if "ttfe" in data: + t = data["ttfe"] + ttfe = 
LatencyStats( + p50_ms=t["p50_ms"], + p90_ms=t["p90_ms"], + p95_ms=t["p95_ms"], + p99_ms=t["p99_ms"], + max_ms=t["max_ms"], + min_ms=t.get("min_ms", 0.0), + mean_ms=t["mean_ms"], + stdev_ms=t["stdev_ms"], + sample_count=t["sample_count"], + ) + + throughput = None + if "throughput" in data: + th = data["throughput"] + throughput = ThroughputStats( + aggregate_events_per_sec=th["aggregate_events_per_sec"], + per_client_events_per_sec=th["per_client_events_per_sec"], + total_events=th["total_events"], + total_duration_sec=th["total_duration_sec"], + client_count=th["client_count"], + ) + + memory = None + if "memory" in data: + m = data["memory"] + memory = MemoryStats( + baseline_mb=m["baseline_mb"], + peak_mb=m["peak_mb"], + final_mb=m["final_mb"], + growth_mb=m["growth_mb"], + slope_mb_per_sec=m["slope_mb_per_sec"], + samples=[(s[0], s[1]) for s in m.get("samples", [])], + ) + + reliability = None + if "reliability" in data: + r = data["reliability"] + reliability = ReliabilityStats( + successful_connections=r["successful_connections"], + failed_connections=r["failed_connections"], + error_rate=r["error_rate"], + errors=r.get("errors", []), + ) + + sse_internals = None + if "sse_internals" in data: + s = data["sse_internals"] + sse_internals = SSEInternals( + watcher_started=s["watcher_started"], + peak_registered_events=s["peak_registered_events"], + final_registered_events=s["final_registered_events"], + ) + + return TestReport( + test_name=metadata.get("test_name", "unknown"), + timestamp=metadata.get("timestamp", ""), + git_commit=metadata.get("git_commit", "unknown"), + git_branch=metadata.get("git_branch", "unknown"), + scale=metadata.get("scale", 0), + duration_minutes=metadata.get("duration_minutes", 0), + latency=latency, + ttfe=ttfe, + throughput=throughput, + memory=memory, + reliability=reliability, + sse_internals=sse_internals, + comparison=data.get("comparison"), + ) diff --git a/tests/load/baselines/.gitkeep b/tests/load/baselines/.gitkeep new file 
mode 100644 index 0000000..e69de29 diff --git a/tests/load/conftest.py b/tests/load/conftest.py index ec73419..b70f888 100644 --- a/tests/load/conftest.py +++ b/tests/load/conftest.py @@ -4,14 +4,24 @@ Provides container-based SSE server and utility fixtures. """ +from __future__ import annotations + import os import time -from typing import Generator +from pathlib import Path +from typing import TYPE_CHECKING, Generator import httpx import pytest from testcontainers.core.container import DockerContainer +from .baseline import BaselineManager +from .metrics import MetricsCollector +from .reporter import ReportGenerator + +if TYPE_CHECKING: + from .metrics import TestReport + class SSELoadTestContainer(DockerContainer): """Custom container for SSE load testing.""" @@ -114,6 +124,37 @@ def pytest_addoption(parser: pytest.Parser) -> None: default="1", help="Test duration in minutes", ) + parser.addoption( + "--output-dir", + action="store", + default="tests/load/results", + help="Directory for test reports", + ) + parser.addoption( + "--baselines-dir", + action="store", + default="tests/load/baselines", + help="Directory for baseline files", + ) + parser.addoption( + "--update-baseline", + action="store_true", + default=False, + help="Save current run as new baseline", + ) + parser.addoption( + "--fail-on-regression", + action="store_true", + default=False, + help="Exit non-zero if regression detected", + ) + parser.addoption( + "--regression-threshold", + action="store", + type=int, + default=20, + help="Percent change to trigger regression warning", + ) @pytest.fixture @@ -126,3 +167,75 @@ def scale(request: pytest.FixtureRequest) -> int: def duration_minutes(request: pytest.FixtureRequest) -> int: """Get the duration in minutes for load tests.""" return int(request.config.getoption("--duration")) + + +@pytest.fixture +def output_dir(request: pytest.FixtureRequest) -> Path: + """Get the output directory for reports.""" + return 
Path(request.config.getoption("--output-dir")) + + +@pytest.fixture +def baselines_dir(request: pytest.FixtureRequest) -> Path: + """Get the baselines directory.""" + return Path(request.config.getoption("--baselines-dir")) + + +@pytest.fixture +def update_baseline(request: pytest.FixtureRequest) -> bool: + """Whether to update baselines.""" + return bool(request.config.getoption("--update-baseline")) + + +@pytest.fixture +def fail_on_regression(request: pytest.FixtureRequest) -> bool: + """Whether to fail on regression.""" + return bool(request.config.getoption("--fail-on-regression")) + + +@pytest.fixture +def metrics_collector() -> MetricsCollector: + """Fresh metrics collector for each test.""" + return MetricsCollector() + + +@pytest.fixture(scope="session") +def baseline_manager(request: pytest.FixtureRequest) -> BaselineManager: + """Baseline manager for comparison.""" + baselines_dir = Path(request.config.getoption("--baselines-dir")) + threshold = int(request.config.getoption("--regression-threshold")) + thresholds = { + "latency_p99_warning_pct": float(threshold), + "latency_p99_fail_pct": float(threshold * 2.5), + "throughput_warning_pct": float(-threshold), + "memory_growth_warning_pct": float(threshold * 2.5), + "memory_slope_fail": 0.1, + "error_rate_fail_pct": 5.0, + } + return BaselineManager(baselines_dir=baselines_dir, thresholds=thresholds) + + +@pytest.fixture(scope="session") +def report_generator(request: pytest.FixtureRequest) -> ReportGenerator: + """Report generator for output.""" + output_dir = Path(request.config.getoption("--output-dir")) + return ReportGenerator(output_dir=output_dir) + + +# Store test reports for session-level access +_test_reports: dict[str, "TestReport"] = {} + + +def pytest_sessionstart(session: pytest.Session) -> None: + """Clear reports at session start.""" + _test_reports.clear() + + +def register_test_report(report: "TestReport") -> None: + """Register a test report for later processing.""" + 
_test_reports[report.test_name] = report + + +def get_test_reports() -> dict[str, "TestReport"]: + """Get all registered test reports.""" + return _test_reports.copy() diff --git a/tests/load/metrics.py b/tests/load/metrics.py new file mode 100644 index 0000000..c3bfea5 --- /dev/null +++ b/tests/load/metrics.py @@ -0,0 +1,422 @@ +""" +Core metrics collection and reporting infrastructure for load tests. + +Provides dataclasses for structured metrics and a collector for aggregating +samples during test execution. +""" + +from __future__ import annotations + +import statistics +import subprocess +import time +from dataclasses import dataclass, field +from datetime import datetime, timezone +from typing import Any + + +@dataclass +class LatencyStats: + """Statistical summary of latency measurements.""" + + p50_ms: float + p90_ms: float + p95_ms: float + p99_ms: float + max_ms: float + min_ms: float + mean_ms: float + stdev_ms: float + sample_count: int + + @classmethod + def from_samples(cls, samples: list[float]) -> LatencyStats | None: + """Compute statistics from raw latency samples (in ms).""" + if not samples: + return None + + sorted_samples = sorted(samples) + n = len(sorted_samples) + + def percentile(p: float) -> float: + idx = int(n * p / 100) + return sorted_samples[min(idx, n - 1)] + + return cls( + p50_ms=percentile(50), + p90_ms=percentile(90), + p95_ms=percentile(95), + p99_ms=percentile(99), + max_ms=sorted_samples[-1], + min_ms=sorted_samples[0], + mean_ms=statistics.mean(sorted_samples), + stdev_ms=statistics.stdev(sorted_samples) if n > 1 else 0.0, + sample_count=n, + ) + + def to_dict(self) -> dict[str, float | int]: + """Convert to JSON-serializable dict.""" + return { + "p50_ms": round(self.p50_ms, 3), + "p90_ms": round(self.p90_ms, 3), + "p95_ms": round(self.p95_ms, 3), + "p99_ms": round(self.p99_ms, 3), + "max_ms": round(self.max_ms, 3), + "min_ms": round(self.min_ms, 3), + "mean_ms": round(self.mean_ms, 3), + "stdev_ms": round(self.stdev_ms, 
3), + "sample_count": self.sample_count, + } + + +@dataclass +class MemoryStats: + """Memory usage statistics.""" + + baseline_mb: float + peak_mb: float + final_mb: float + growth_mb: float + slope_mb_per_sec: float + samples: list[tuple[float, float]] # (elapsed_sec, rss_mb) + + @classmethod + def from_samples( + cls, + samples: list[tuple[float, float]], + baseline_mb: float, + final_mb: float, + ) -> MemoryStats: + """Compute statistics from time-series memory samples.""" + if not samples: + return cls( + baseline_mb=baseline_mb, + peak_mb=baseline_mb, + final_mb=final_mb, + growth_mb=0.0, + slope_mb_per_sec=0.0, + samples=[], + ) + + peak_mb = max(s[1] for s in samples) + growth_mb = peak_mb - baseline_mb + + # Linear regression for slope + slope = 0.0 + if len(samples) >= 2: + x_vals = [s[0] for s in samples] + y_vals = [s[1] for s in samples] + x_mean = statistics.mean(x_vals) + y_mean = statistics.mean(y_vals) + numerator = sum((x - x_mean) * (y - y_mean) for x, y in zip(x_vals, y_vals)) + denominator = sum((x - x_mean) ** 2 for x in x_vals) + if denominator > 0: + slope = numerator / denominator + + return cls( + baseline_mb=baseline_mb, + peak_mb=peak_mb, + final_mb=final_mb, + growth_mb=growth_mb, + slope_mb_per_sec=slope, + samples=samples, + ) + + def to_dict(self) -> dict[str, Any]: + """Convert to JSON-serializable dict.""" + return { + "baseline_mb": round(self.baseline_mb, 2), + "peak_mb": round(self.peak_mb, 2), + "final_mb": round(self.final_mb, 2), + "growth_mb": round(self.growth_mb, 2), + "slope_mb_per_sec": round(self.slope_mb_per_sec, 4), + "samples": [[round(t, 2), round(m, 2)] for t, m in self.samples], + } + + +@dataclass +class ThroughputStats: + """Throughput statistics.""" + + aggregate_events_per_sec: float + per_client_events_per_sec: float + total_events: int + total_duration_sec: float + client_count: int + + def to_dict(self) -> dict[str, float | int]: + """Convert to JSON-serializable dict.""" + return { + 
"aggregate_events_per_sec": round(self.aggregate_events_per_sec, 2), + "per_client_events_per_sec": round(self.per_client_events_per_sec, 2), + "total_events": self.total_events, + "total_duration_sec": round(self.total_duration_sec, 2), + "client_count": self.client_count, + } + + +@dataclass +class ReliabilityStats: + """Connection reliability statistics.""" + + successful_connections: int + failed_connections: int + error_rate: float + errors: list[str] + + def to_dict(self) -> dict[str, Any]: + """Convert to JSON-serializable dict.""" + return { + "successful_connections": self.successful_connections, + "failed_connections": self.failed_connections, + "error_rate": round(self.error_rate, 4), + "errors": self.errors[:10], # Limit to first 10 errors + } + + +@dataclass +class SSEInternals: + """SSE library internal state (Issue #152 validation).""" + + watcher_started: bool + peak_registered_events: int + final_registered_events: int + + def to_dict(self) -> dict[str, Any]: + """Convert to JSON-serializable dict.""" + return { + "watcher_started": self.watcher_started, + "peak_registered_events": self.peak_registered_events, + "final_registered_events": self.final_registered_events, + } + + +@dataclass +class TestReport: + """Complete performance report for a single test.""" + + test_name: str + timestamp: str + git_commit: str + git_branch: str + scale: int + duration_minutes: int + + # Metrics (optional based on test type) + latency: LatencyStats | None = None + ttfe: LatencyStats | None = None + throughput: ThroughputStats | None = None + memory: MemoryStats | None = None + reliability: ReliabilityStats | None = None + sse_internals: SSEInternals | None = None + + # Comparison results (populated by BaselineManager) + comparison: dict[str, float] | None = None + + def to_dict(self) -> dict[str, Any]: + """Convert to JSON-serializable dict.""" + result: dict[str, Any] = { + "metadata": { + "test_name": self.test_name, + "timestamp": self.timestamp, + 
"git_commit": self.git_commit, + "git_branch": self.git_branch, + "scale": self.scale, + "duration_minutes": self.duration_minutes, + } + } + + if self.latency: + result["latency"] = self.latency.to_dict() + if self.ttfe: + result["ttfe"] = self.ttfe.to_dict() + if self.throughput: + result["throughput"] = self.throughput.to_dict() + if self.memory: + result["memory"] = self.memory.to_dict() + if self.reliability: + result["reliability"] = self.reliability.to_dict() + if self.sse_internals: + result["sse_internals"] = self.sse_internals.to_dict() + if self.comparison: + result["comparison"] = self.comparison + + return result + + +def _get_git_info() -> tuple[str, str]: + """Get current git commit and branch.""" + try: + commit = subprocess.run( + ["git", "rev-parse", "--short", "HEAD"], + capture_output=True, + text=True, + check=True, + ).stdout.strip() + except subprocess.CalledProcessError: + commit = "unknown" + + try: + branch = subprocess.run( + ["git", "rev-parse", "--abbrev-ref", "HEAD"], + capture_output=True, + text=True, + check=True, + ).stdout.strip() + except subprocess.CalledProcessError: + branch = "unknown" + + return commit, branch + + +@dataclass +class MetricsCollector: + """Collects performance metrics during test execution.""" + + # Latency samples (milliseconds) + latency_samples: list[float] = field(default_factory=list) + ttfe_samples: list[float] = field(default_factory=list) + + # Memory samples (elapsed_sec, rss_mb) + memory_samples: list[tuple[float, float]] = field(default_factory=list) + memory_baseline_mb: float = 0.0 + memory_final_mb: float = 0.0 + + # Throughput tracking + events_per_client: list[int] = field(default_factory=list) + total_duration_sec: float = 0.0 + + # Reliability + successful_connections: int = 0 + failed_connections: int = 0 + errors: list[str] = field(default_factory=list) + + # SSE internals + watcher_started: bool = False + peak_registered_events: int = 0 + final_registered_events: int = 0 + + # Internal 
timing + _start_time: float = field(default_factory=time.perf_counter) + + def add_latency_sample(self, ms: float) -> None: + """Record an inter-event latency sample.""" + self.latency_samples.append(ms) + + def add_ttfe_sample(self, ms: float) -> None: + """Record a time-to-first-event sample.""" + self.ttfe_samples.append(ms) + + def add_memory_sample(self, rss_mb: float) -> None: + """Record a memory usage sample with timestamp.""" + elapsed = time.perf_counter() - self._start_time + self.memory_samples.append((elapsed, rss_mb)) + + def set_memory_baseline(self, rss_mb: float) -> None: + """Set the baseline memory before test starts.""" + self.memory_baseline_mb = rss_mb + + def set_memory_final(self, rss_mb: float) -> None: + """Set the final memory after test completes.""" + self.memory_final_mb = rss_mb + + def add_client_events(self, count: int) -> None: + """Record events received by a client.""" + self.events_per_client.append(count) + + def set_duration(self, seconds: float) -> None: + """Set total test duration.""" + self.total_duration_sec = seconds + + def record_success(self) -> None: + """Record a successful connection.""" + self.successful_connections += 1 + + def record_failure(self, error: str) -> None: + """Record a failed connection.""" + self.failed_connections += 1 + self.errors.append(error) + + def set_sse_internals( + self, watcher_started: bool, peak_events: int, final_events: int + ) -> None: + """Record SSE library internal state.""" + self.watcher_started = watcher_started + self.peak_registered_events = peak_events + self.final_registered_events = final_events + + def compute_report( + self, test_name: str, scale: int, duration_minutes: int + ) -> TestReport: + """Compute final report from collected samples.""" + git_commit, git_branch = _get_git_info() + timestamp = datetime.now(timezone.utc).isoformat() + + # Compute latency stats + latency = LatencyStats.from_samples(self.latency_samples) + ttfe = 
LatencyStats.from_samples(self.ttfe_samples) + + # Compute memory stats + memory = None + if self.memory_samples or self.memory_baseline_mb > 0: + memory = MemoryStats.from_samples( + self.memory_samples, + self.memory_baseline_mb, + self.memory_final_mb, + ) + + # Compute throughput stats + throughput = None + if self.events_per_client and self.total_duration_sec > 0: + total_events = sum(self.events_per_client) + client_count = len(self.events_per_client) + throughput = ThroughputStats( + aggregate_events_per_sec=total_events / self.total_duration_sec, + per_client_events_per_sec=( + (total_events / client_count / self.total_duration_sec) + if client_count > 0 + else 0.0 + ), + total_events=total_events, + total_duration_sec=self.total_duration_sec, + client_count=client_count, + ) + + # Compute reliability stats + total_connections = self.successful_connections + self.failed_connections + reliability = None + if total_connections > 0: + reliability = ReliabilityStats( + successful_connections=self.successful_connections, + failed_connections=self.failed_connections, + error_rate=( + self.failed_connections / total_connections + if total_connections > 0 + else 0.0 + ), + errors=self.errors, + ) + + # SSE internals + sse_internals = None + if self.peak_registered_events > 0 or self.watcher_started: + sse_internals = SSEInternals( + watcher_started=self.watcher_started, + peak_registered_events=self.peak_registered_events, + final_registered_events=self.final_registered_events, + ) + + return TestReport( + test_name=test_name, + timestamp=timestamp, + git_commit=git_commit, + git_branch=git_branch, + scale=scale, + duration_minutes=duration_minutes, + latency=latency, + ttfe=ttfe, + throughput=throughput, + memory=memory, + reliability=reliability, + sse_internals=sse_internals, + ) diff --git a/tests/load/reporter.py b/tests/load/reporter.py new file mode 100644 index 0000000..ac873fc --- /dev/null +++ b/tests/load/reporter.py @@ -0,0 +1,615 @@ +""" +Report 
generation for load test results. + +Produces JSON and HTML reports with inline SVG charts. +""" + +from __future__ import annotations + +import html +import json +from pathlib import Path +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from .baseline import ComparisonResult + from .metrics import ( + LatencyStats, + MemoryStats, + ReliabilityStats, + TestReport, + ThroughputStats, + ) + + +class ReportGenerator: + """Generates JSON and HTML reports from test results.""" + + def __init__(self, output_dir: Path | str = "tests/load/results"): + self.output_dir = Path(output_dir) + + def _report_path(self, test_name: str, ext: str) -> Path: + """Get path to report file.""" + safe_name = test_name.replace("::", "_").replace("/", "_").replace("\\", "_") + return self.output_dir / f"{safe_name}.{ext}" + + def save_json(self, report: TestReport) -> Path: + """Save report as JSON.""" + self.output_dir.mkdir(parents=True, exist_ok=True) + path = self._report_path(report.test_name, "json") + + with open(path, "w") as f: + json.dump(report.to_dict(), f, indent=2) + + return path + + def save_html( + self, report: TestReport, comparison: ComparisonResult | None = None + ) -> Path: + """Save report as HTML with inline SVG charts.""" + self.output_dir.mkdir(parents=True, exist_ok=True) + path = self._report_path(report.test_name, "html") + + html_content = self._render_html(report, comparison) + with open(path, "w") as f: + f.write(html_content) + + return path + + def print_summary( + self, report: TestReport, comparison: ComparisonResult | None = None + ) -> None: + """Print summary to console.""" + sep = "=" * 70 + print(f"\n{sep}") + print(f" SSE Load Test Results: {report.test_name}") + print(sep) + print( + f"Run: {report.timestamp} | commit: {report.git_commit} | " + f"branch: {report.git_branch}" + ) + print( + f"Scale: {report.scale} connections | Duration: {report.duration_minutes} min" + ) + print() + + # Latency + if report.latency: + print("LATENCY 
(inter-event)") + self._print_latency_line( + " p50:", report.latency.p50_ms, comparison, "latency_p50" + ) + print(f" p95: {report.latency.p95_ms:.1f} ms") + self._print_latency_line( + " p99:", report.latency.p99_ms, comparison, "latency_p99" + ) + print(f" max: {report.latency.max_ms:.1f} ms") + print() + + # TTFE + if report.ttfe: + print("TIME TO FIRST EVENT") + print(f" p50: {report.ttfe.p50_ms:.1f} ms") + self._print_latency_line( + " p99:", report.ttfe.p99_ms, comparison, "ttfe_p99" + ) + print() + + # Throughput + if report.throughput: + print("THROUGHPUT") + self._print_throughput_line( + report.throughput.aggregate_events_per_sec, comparison + ) + print( + f" Per client: {report.throughput.per_client_events_per_sec:.1f} events/sec" + ) + print() + + # Memory + if report.memory: + print("MEMORY") + print(f" Baseline: {report.memory.baseline_mb:.1f} MB") + print(f" Peak: {report.memory.peak_mb:.1f} MB") + self._print_memory_line( + " Growth:", report.memory.growth_mb, comparison, "memory_growth" + ) + self._print_slope_line(report.memory.slope_mb_per_sec) + print() + + # Reliability + if report.reliability: + total = ( + report.reliability.successful_connections + + report.reliability.failed_connections + ) + pct = ( + report.reliability.successful_connections / total * 100 + if total > 0 + else 0 + ) + print("RELIABILITY") + print( + f" Successful: {report.reliability.successful_connections}/{total} ({pct:.1f}%)" + ) + if report.reliability.errors: + print(f" Errors: {len(report.reliability.errors)}") + print() + + # Comparison summary + if comparison and (comparison.regression_reasons or comparison.warnings): + if comparison.regression_detected: + print("REGRESSIONS DETECTED:") + for reason in comparison.regression_reasons or []: + print(f" - {reason}") + if comparison.warnings: + print("WARNINGS:") + for warning in comparison.warnings: + print(f" - {warning}") + print() + + print(sep) + result = "PASS" + if comparison and comparison.regression_detected: 
+ result = "FAIL (regression detected)" + elif comparison and comparison.warnings: + result = f"PASS ({len(comparison.warnings)} warnings)" + print(f"Result: {result}") + print(sep + "\n") + + def _print_latency_line( + self, + label: str, + value: float, + comparison: ComparisonResult | None, + key: str, + ) -> None: + """Print latency line with optional comparison.""" + line = f"{label} {value:.1f} ms" + if comparison: + change = getattr(comparison, f"{key}_change_pct", None) + if change is not None: + symbol = "+" if change > 0 else "" + indicator = "!" if abs(change) > 20 else "" + line += f" ({symbol}{change:.1f}% vs baseline) {indicator}" + print(line) + + def _print_throughput_line( + self, value: float, comparison: ComparisonResult | None + ) -> None: + """Print throughput line with optional comparison.""" + line = f" Aggregate: {value:,.0f} events/sec" + if comparison and comparison.throughput_change_pct is not None: + change = comparison.throughput_change_pct + symbol = "+" if change > 0 else "" + indicator = "!" if change < -20 else "" + line += f" ({symbol}{change:.1f}% vs baseline) {indicator}" + print(line) + + def _print_memory_line( + self, + label: str, + value: float, + comparison: ComparisonResult | None, + key: str, + ) -> None: + """Print memory line with optional comparison.""" + line = f"{label} {value:.1f} MB" + if comparison: + change = getattr(comparison, f"{key}_change_pct", None) + if change is not None: + symbol = "+" if change > 0 else "" + indicator = "!" if change > 50 else "" + line += f" ({symbol}{change:.1f}% vs baseline) {indicator}" + print(line) + + def _print_slope_line(self, slope: float) -> None: + """Print memory slope line.""" + indicator = "!" 
if slope > 0.1 else "" + print(f" Slope: {slope:.3f} MB/sec {indicator}") + + def _render_html( + self, report: TestReport, comparison: ComparisonResult | None + ) -> str: + """Render full HTML report.""" + charts_html = "" + + # Latency histogram + if report.latency: + charts_html += self._render_section( + "Latency Distribution", + self._render_latency_summary(report.latency, comparison), + ) + + # TTFE stats + if report.ttfe: + charts_html += self._render_section( + "Time to First Event", + self._render_ttfe_summary(report.ttfe, comparison), + ) + + # Memory chart + if report.memory and report.memory.samples: + charts_html += self._render_section( + "Memory Usage Over Time", + self._render_memory_chart(report.memory) + + self._render_memory_summary(report.memory, comparison), + ) + + # Throughput + if report.throughput: + charts_html += self._render_section( + "Throughput", + self._render_throughput_summary(report.throughput, comparison), + ) + + # Reliability + if report.reliability: + charts_html += self._render_section( + "Reliability", + self._render_reliability_summary(report.reliability), + ) + + # Comparison + comparison_html = "" + if comparison and (comparison.regression_reasons or comparison.warnings): + comparison_html = self._render_comparison_section(comparison) + + return f""" + + + + + Load Test Report: {html.escape(report.test_name)} + + + +
+

{html.escape(report.test_name)}

+ +
+ + {comparison_html} + {charts_html} + + +""" + + def _render_section(self, title: str, content: str) -> str: + """Render a section with title.""" + return f""" +
+

{html.escape(title)}

+ {content} +
+""" + + def _render_latency_summary( + self, stats: LatencyStats, comparison: ComparisonResult | None + ) -> str: + """Render latency summary table.""" + p99_change = "" + if comparison and comparison.latency_p99_change_pct is not None: + cls = "positive" if comparison.latency_p99_change_pct > 0 else "negative" + sign = "+" if comparison.latency_p99_change_pct > 0 else "" + p99_change = ( + f'' + f"({sign}{comparison.latency_p99_change_pct:.1f}%)" + ) + + return f""" + + + + + + + + + + +
PercentileValue
p50{stats.p50_ms:.2f} ms
p90{stats.p90_ms:.2f} ms
p95{stats.p95_ms:.2f} ms
p99{stats.p99_ms:.2f} ms {p99_change}
max{stats.max_ms:.2f} ms
mean{stats.mean_ms:.2f} ms
stdev{stats.stdev_ms:.2f} ms
samples{stats.sample_count:,}
+""" + + def _render_ttfe_summary( + self, stats: LatencyStats, comparison: ComparisonResult | None + ) -> str: + """Render TTFE summary table.""" + p99_change = "" + if comparison and comparison.ttfe_p99_change_pct is not None: + cls = "positive" if comparison.ttfe_p99_change_pct > 0 else "negative" + sign = "+" if comparison.ttfe_p99_change_pct > 0 else "" + p99_change = ( + f'' + f"({sign}{comparison.ttfe_p99_change_pct:.1f}%)" + ) + + return f""" + + + + + + +
MetricValue
p50{stats.p50_ms:.1f} ms
p99{stats.p99_ms:.1f} ms {p99_change}
max{stats.max_ms:.1f} ms
samples{stats.sample_count:,}
+""" + + def _render_memory_chart(self, memory: MemoryStats) -> str: + """Render SVG line chart for memory over time.""" + if not memory.samples: + return "" + + # Chart dimensions + width = 600 + height = 200 + padding = 40 + + times = [s[0] for s in memory.samples] + values = [s[1] for s in memory.samples] + + if not times or len(times) < 2: + return "" + + x_min, x_max = min(times), max(times) + y_min = min(values) * 0.9 + y_max = max(values) * 1.1 + + def scale_x(t: float) -> float: + if x_max == x_min: + return padding + return padding + (t - x_min) / (x_max - x_min) * (width - 2 * padding) + + def scale_y(v: float) -> float: + if y_max == y_min: + return height - padding + return ( + height + - padding + - (v - y_min) / (y_max - y_min) * (height - 2 * padding) + ) + + # Generate path + points = [f"{scale_x(t):.1f},{scale_y(v):.1f}" for t, v in memory.samples] + path_d = "M " + " L ".join(points) + + # Generate axis labels + y_labels = "" + for i in range(5): + y_val = y_min + (y_max - y_min) * i / 4 + y_pos = scale_y(y_val) + y_labels += f'{y_val:.0f}' + + x_labels = "" + for i in range(5): + x_val = x_min + (x_max - x_min) * i / 4 + x_pos = scale_x(x_val) + x_labels += f'{x_val:.0f}s' + + return f""" +
+ + + + + + + Memory (MB) + + + {y_labels} + {x_labels} + + + + + + + +
+""" + + def _render_memory_summary( + self, memory: MemoryStats, comparison: ComparisonResult | None + ) -> str: + """Render memory summary table.""" + growth_change = "" + if comparison and comparison.memory_growth_change_pct is not None: + cls = "positive" if comparison.memory_growth_change_pct > 0 else "negative" + sign = "+" if comparison.memory_growth_change_pct > 0 else "" + growth_change = ( + f'' + f"({sign}{comparison.memory_growth_change_pct:.1f}%)" + ) + + return f""" + + + + + + + +
MetricValue
Baseline{memory.baseline_mb:.1f} MB
Peak{memory.peak_mb:.1f} MB
Final{memory.final_mb:.1f} MB
Growth{memory.growth_mb:.1f} MB {growth_change}
Slope{memory.slope_mb_per_sec:.4f} MB/sec
+""" + + def _render_throughput_summary( + self, throughput: "ThroughputStats", comparison: ComparisonResult | None + ) -> str: + """Render throughput summary table.""" + from .metrics import ThroughputStats + + if not isinstance(throughput, ThroughputStats): + return "" + + change = "" + if comparison and comparison.throughput_change_pct is not None: + cls = "negative" if comparison.throughput_change_pct < 0 else "positive" + sign = "+" if comparison.throughput_change_pct > 0 else "" + change = ( + f'' + f"({sign}{comparison.throughput_change_pct:.1f}%)" + ) + + return f""" + + + + + + + +
MetricValue
Aggregate{throughput.aggregate_events_per_sec:,.0f} events/sec {change}
Per Client{throughput.per_client_events_per_sec:.1f} events/sec
Total Events{throughput.total_events:,}
Duration{throughput.total_duration_sec:.1f} sec
Clients{throughput.client_count:,}
+""" + + def _render_reliability_summary(self, reliability: "ReliabilityStats") -> str: + """Render reliability summary.""" + from .metrics import ReliabilityStats + + if not isinstance(reliability, ReliabilityStats): + return "" + + total = reliability.successful_connections + reliability.failed_connections + pct = reliability.successful_connections / total * 100 if total > 0 else 0 + + errors_html = "" + if reliability.errors: + error_items = "".join( + f"
  • {html.escape(e)}
  • " for e in reliability.errors[:10] + ) + errors_html = f"
      {error_items}
    " + + return f""" + + + + + +
    MetricValue
    Successful{reliability.successful_connections:,} / {total:,} ({pct:.1f}%)
    Failed{reliability.failed_connections:,}
    Error Rate{reliability.error_rate * 100:.2f}%
    + {errors_html} +""" + + def _render_comparison_section(self, comparison: ComparisonResult) -> str: + """Render comparison alerts section.""" + content = "" + + if comparison.regression_reasons: + reasons = "".join( + f"
  • {html.escape(r)}
  • " for r in comparison.regression_reasons + ) + content += f""" +
    + Regressions Detected +
      {reasons}
    +
    +""" + + if comparison.warnings: + warnings = "".join( + f"
  • {html.escape(w)}
  • " for w in comparison.warnings + ) + content += f""" +
    + Warnings +
      {warnings}
    +
    +""" + + if comparison.baseline_commit: + content += f""" +
    + Compared against baseline: {html.escape(comparison.baseline_commit)} + ({html.escape(comparison.baseline_timestamp or 'unknown')}) +
    +""" + + return content diff --git a/tests/load/test_backpressure.py b/tests/load/test_backpressure.py index 94f111b..024adf6 100644 --- a/tests/load/test_backpressure.py +++ b/tests/load/test_backpressure.py @@ -1,30 +1,74 @@ """ Backpressure and slow client tests. -Verifies server handles slow consumers correctly without affecting fast clients. +This module verifies the server handles mixed client speeds correctly: +- Slow consumers don't block fast consumers (per-connection isolation) +- Rapid connection churn doesn't exhaust resources +- send_timeout properly disconnects frozen clients + +SSE servers must handle heterogeneous clients. A slow consumer (mobile on 2G) +shouldn't cause head-of-line blocking for fast consumers (desktop on fiber). """ +from __future__ import annotations + import asyncio import time -from typing import Tuple import httpx import pytest from httpx_sse import aconnect_sse +from .baseline import BaselineManager +from .conftest import register_test_report +from .metrics import MetricsCollector +from .reporter import ReportGenerator + @pytest.mark.loadtest async def test_slow_clients_dont_block_fast_clients( sse_server_url: str, + scale: int, + duration_minutes: int, + metrics_collector: MetricsCollector, + baseline_manager: BaselineManager, + report_generator: ReportGenerator, + update_baseline: bool, + fail_on_regression: bool, ) -> None: """ - Slow clients should not affect throughput of fast clients. - - Tests that the server properly handles mixed client speeds. + Verify slow consumers don't throttle fast consumers (connection isolation). 
+ + ## What is Measured + - Event count for "fast" clients (consume immediately) + - Event count for "slow" clients (0.5s processing delay per event) + - Ratio between fast and slow throughput + + ## Why This Matters + Tests per-connection isolation in the send path: + - Each connection has its own anyio.Lock for sends + - Slow client's blocked send() doesn't block other connections + - No shared buffers that could cause head-of-line blocking + + Without isolation, a single slow client could stall all other streams, + making the server unusable under mixed load. + + ## Methodology + 1. Connect 10 "fast" clients (consume events immediately) + 2. Connect 10 "slow" clients (sleep 0.5s after each event) + 3. Run for 10 seconds + 4. Compare event counts + + ## Pass Criteria + - Fast clients avg > slow clients avg * 5 (isolation works) + - Fast clients avg > 500 events (not throttled by slow clients) + - Rationale: With 10ms delay, fast clients should receive ~1000 events. + Slow clients receive ~20 (10s / 0.5s). 5x ratio is conservative. + 500 events threshold catches severe throttling. 
""" test_duration = 10 # seconds - async def fast_client() -> int: + async def fast_client() -> tuple[int, str | None]: """Client that consumes events as fast as possible.""" count = 0 start = time.perf_counter() @@ -37,11 +81,11 @@ async def fast_client() -> int: count += 1 if time.perf_counter() - start >= test_duration: break - except Exception: - pass - return count + return count, None + except Exception as e: + return count, str(e) - async def slow_client() -> int: + async def slow_client() -> tuple[int, str | None]: """Client that reads slowly (simulating processing delay).""" count = 0 start = time.perf_counter() @@ -55,9 +99,11 @@ async def slow_client() -> int: count += 1 if time.perf_counter() - start >= test_duration: break - except Exception: - pass - return count + return count, None + except Exception as e: + return count, str(e) + + start_time = time.perf_counter() # Mix of fast and slow clients fast_tasks = [asyncio.create_task(fast_client()) for _ in range(10)] @@ -66,17 +112,63 @@ async def slow_client() -> int: fast_results = await asyncio.gather(*fast_tasks) slow_results = await asyncio.gather(*slow_tasks) - avg_fast = sum(fast_results) / len(fast_results) - avg_slow = sum(slow_results) / len(slow_results) + elapsed = time.perf_counter() - start_time + metrics_collector.set_duration(elapsed) + + # Process results + fast_counts: list[int] = [] + slow_counts: list[int] = [] + + for result in fast_results: + if isinstance(result, tuple): + count, error = result + fast_counts.append(count) + metrics_collector.add_client_events(count) + if error: + metrics_collector.record_failure(error) + else: + metrics_collector.record_success() + + for result in slow_results: + if isinstance(result, tuple): + count, error = result + slow_counts.append(count) + metrics_collector.add_client_events(count) + if error: + metrics_collector.record_failure(error) + else: + metrics_collector.record_success() + + avg_fast = sum(fast_counts) / len(fast_counts) if 
fast_counts else 0 + avg_slow = sum(slow_counts) / len(slow_counts) if slow_counts else 0 + + # Generate report + report = metrics_collector.compute_report( + test_name="test_slow_clients_dont_block_fast_clients", + scale=20, # 10 fast + 10 slow + duration_minutes=duration_minutes, + ) + register_test_report(report) + + # Compare and output + comparison = baseline_manager.compare(report) + report.comparison = comparison.to_dict() if comparison else None - # Fast clients should receive significantly more events + report_generator.save_json(report) + report_generator.save_html(report, comparison) + report_generator.print_summary(report, comparison) + + if update_baseline: + baseline_manager.save_baseline(report) + + if fail_on_regression and comparison and comparison.regression_detected: + pytest.fail(f"Regression detected: {comparison.regression_reasons}") + + # Original assertions assert avg_fast > avg_slow * 5, ( f"Fast clients ({avg_fast:.0f} events) should be much faster than " f"slow clients ({avg_slow:.0f} events)" ) - - # Fast clients should not be severely throttled - # With 0.01s delay, should get ~1000 events in 10s assert ( avg_fast > 500 ), f"Fast clients throttled: {avg_fast:.0f} events, expected > 500" @@ -86,27 +178,60 @@ async def slow_client() -> int: async def test_connection_churn_stability( sse_server_url: str, scale: int, + duration_minutes: int, + metrics_collector: MetricsCollector, + baseline_manager: BaselineManager, + report_generator: ReportGenerator, + update_baseline: bool, + fail_on_regression: bool, ) -> None: """ - Rapid connect/disconnect should not cause resource exhaustion. - - Tests cleanup under high churn rate. + Verify rapid connect/disconnect doesn't exhaust file descriptors or memory. 
+ + ## What is Measured + - File descriptor count before and after churn + - Memory (RSS) before and after churn + - Connection success rate during churn + + ## Why This Matters + Tests resource cleanup under high connection churn: + - Sockets properly closed on disconnect + - Task references released after completion + - No accumulation of leaked resources + + In production, clients frequently reconnect (mobile network switches, + browser tab refresh). Resource leaks under churn cause eventual exhaustion. + + ## Methodology + 1. Record baseline FDs and memory + 2. Create `churn_rate` connections per second for 30 seconds + 3. Each connection receives one event and disconnects + 4. Sample memory every 5 seconds + 5. Record final FDs and memory + + ## Pass Criteria + - FD growth < 50 (no socket leaks) + - Memory growth < 100MB (no major retention) + - Success rate > 90% (server stays responsive under churn) + - Rationale: 50 FDs allows for some timing variance in cleanup. + 100MB memory is generous but catches runaway allocation. + 90% success rate accounts for expected failures under heavy churn. 
""" churn_rate = min(100, scale) # connections per second duration = 30 # seconds total_connections = churn_rate * duration - async def quick_connection() -> bool: + async def quick_connection() -> tuple[bool, str | None]: try: async with httpx.AsyncClient(timeout=5.0) as client: async with aconnect_sse( client, "GET", f"{sse_server_url}/sse?delay=0" ) as source: async for _ in source.aiter_sse(): - return True - except Exception: - return False - return False + return True, None + except Exception as e: + return False, str(e) + return False, "no events" # Get baseline metrics async with httpx.AsyncClient() as client: @@ -114,23 +239,72 @@ async def quick_connection() -> bool: baseline_fds = baseline.get("num_fds", 0) baseline_memory = baseline["memory_rss_mb"] + metrics_collector.set_memory_baseline(baseline_memory) + + start_time = time.perf_counter() # Create connections at target rate successful = 0 for batch in range(duration): tasks = [asyncio.create_task(quick_connection()) for _ in range(churn_rate)] results = await asyncio.gather(*tasks, return_exceptions=True) - successful += sum(1 for r in results if r is True) + + for result in results: + if isinstance(result, Exception): + metrics_collector.record_failure(str(result)) + elif isinstance(result, tuple): + success, error = result + if success: + metrics_collector.record_success() + successful += 1 + else: + metrics_collector.record_failure(error or "unknown") + + # Sample memory periodically + if batch % 5 == 0: + try: + async with httpx.AsyncClient() as client: + metrics = (await client.get(f"{sse_server_url}/metrics")).json() + metrics_collector.add_memory_sample(metrics["memory_rss_mb"]) + except Exception: + pass + await asyncio.sleep(0.5) # Allow some cleanup + elapsed = time.perf_counter() - start_time + metrics_collector.set_duration(elapsed) + # Get final metrics async with httpx.AsyncClient() as client: final = (await client.get(f"{sse_server_url}/metrics")).json() final_fds = 
final.get("num_fds", 0) final_memory = final["memory_rss_mb"] + metrics_collector.set_memory_final(final_memory) + + # Generate report + report = metrics_collector.compute_report( + test_name="test_connection_churn_stability", + scale=scale, + duration_minutes=duration_minutes, + ) + register_test_report(report) + + # Compare and output + comparison = baseline_manager.compare(report) + report.comparison = comparison.to_dict() if comparison else None + + report_generator.save_json(report) + report_generator.save_html(report, comparison) + report_generator.print_summary(report, comparison) - # File descriptors should return to baseline + if update_baseline: + baseline_manager.save_baseline(report) + + if fail_on_regression and comparison and comparison.regression_detected: + pytest.fail(f"Regression detected: {comparison.regression_reasons}") + + # Original assertions if baseline_fds > 0 and final_fds > 0: fd_growth = final_fds - baseline_fds assert fd_growth < 50, ( @@ -138,13 +312,11 @@ async def quick_connection() -> bool: f"connections" ) - # Memory should not grow excessively memory_growth = final_memory - baseline_memory assert ( memory_growth < 100 ), f"Memory grew by {memory_growth:.1f}MB during churn test" - # Success rate should be high success_rate = successful / total_connections if total_connections > 0 else 0 assert success_rate > 0.9, ( f"Low success rate during churn: {success_rate:.1%} " @@ -153,14 +325,48 @@ async def quick_connection() -> bool: @pytest.mark.loadtest -async def test_send_timeout_under_load(sse_server_url: str) -> None: +async def test_send_timeout_under_load( + sse_server_url: str, + scale: int, + duration_minutes: int, + metrics_collector: MetricsCollector, + baseline_manager: BaselineManager, + report_generator: ReportGenerator, + update_baseline: bool, + fail_on_regression: bool, +) -> None: """ - Verify send_timeout works correctly under load. - - Clients that stop reading should eventually be disconnected. 
+ Verify send_timeout disconnects frozen clients without blocking normal clients. + + ## What is Measured + - Event count for normal clients (should complete successfully) + - Outcome for "frozen" clients (stop reading after first event) + - Implicit: server responsiveness during frozen client handling + + ## Why This Matters + Tests the send_timeout feature: + - Frozen clients (stop reading but don't close connection) block the send() + - Without timeout, server thread/task hangs indefinitely + - With timeout, server detects blocked send and closes connection + - Normal clients should be unaffected by frozen clients + + This is critical for production resilience. Mobile clients frequently + "freeze" (backgrounded, network change) without closing connections. + + ## Methodology + 1. Connect 5 "frozen" clients (receive one event, then stop reading) + 2. Connect 3 "normal" clients (receive 50 events normally) + 3. Wait for normal clients to complete + 4. Verify normal clients weren't affected + + ## Pass Criteria + - Normal clients receive >= 45/50 events + - Rationale: Normal clients should complete unaffected. 45/50 allows + small margin for timing. If frozen clients blocked the server, + normal clients would timeout or receive far fewer events. 
""" - async def frozen_client() -> Tuple[str, float]: + async def frozen_client() -> tuple[str, float, str | None]: """Client that stops reading after first event (simulates frozen client).""" start = time.perf_counter() try: @@ -173,16 +379,16 @@ async def frozen_client() -> Tuple[str, float]: await asyncio.sleep(60) # Will be interrupted by timeout break except httpx.ReadTimeout: - return "timeout", time.perf_counter() - start + return "timeout", time.perf_counter() - start, None except Exception as e: - return f"error:{type(e).__name__}", time.perf_counter() - start - return "completed", time.perf_counter() - start + return f"error:{type(e).__name__}", time.perf_counter() - start, str(e) + return "completed", time.perf_counter() - start, None # Start some frozen clients (server has default send_timeout) - tasks = [asyncio.create_task(frozen_client()) for _ in range(5)] + frozen_tasks = [asyncio.create_task(frozen_client()) for _ in range(5)] # Also verify server remains responsive with normal clients - async def normal_client() -> int: + async def normal_client() -> tuple[int, str | None]: count = 0 try: async with httpx.AsyncClient(timeout=30.0) as client: @@ -193,23 +399,68 @@ async def normal_client() -> int: count += 1 if count >= 50: break - except Exception: - pass - return count + return count, None + except Exception as e: + return count, str(e) normal_tasks = [asyncio.create_task(normal_client()) for _ in range(3)] # Wait for normal clients to complete normal_results = await asyncio.gather(*normal_tasks) + # Process normal client results + normal_counts: list[int] = [] + for result in normal_results: + if isinstance(result, tuple): + count, error = result + normal_counts.append(count) + metrics_collector.add_client_events(count) + if error: + metrics_collector.record_failure(error) + else: + metrics_collector.record_success() + # Cancel frozen clients if still running - for task in tasks: + for task in frozen_tasks: if not task.done(): task.cancel() - 
await asyncio.gather(*tasks, return_exceptions=True) + frozen_results = await asyncio.gather(*frozen_tasks, return_exceptions=True) + + # Process frozen client results + for result in frozen_results: + if isinstance(result, Exception): + metrics_collector.record_failure(str(result)) + elif isinstance(result, tuple): + status, duration, error = result + if error: + metrics_collector.record_failure(error) + else: + metrics_collector.record_success() + + # Generate report + report = metrics_collector.compute_report( + test_name="test_send_timeout_under_load", + scale=8, # 5 frozen + 3 normal + duration_minutes=duration_minutes, + ) + register_test_report(report) + + # Compare and output + comparison = baseline_manager.compare(report) + report.comparison = comparison.to_dict() if comparison else None + + report_generator.save_json(report) + report_generator.save_html(report, comparison) + report_generator.print_summary(report, comparison) + + if update_baseline: + baseline_manager.save_baseline(report) + + if fail_on_regression and comparison and comparison.regression_detected: + pytest.fail(f"Regression detected: {comparison.regression_reasons}") - # Normal clients should have completed successfully + # Original assertion assert all( - r >= 45 for r in normal_results - ), f"Normal clients affected by frozen clients: {normal_results}" + r >= 45 for r in normal_counts + ), f"Normal clients affected by frozen clients: {normal_counts}" diff --git a/tests/load/test_memory_stability.py b/tests/load/test_memory_stability.py index ba23eea..883b8ad 100644 --- a/tests/load/test_memory_stability.py +++ b/tests/load/test_memory_stability.py @@ -1,34 +1,74 @@ """ Memory stability tests for sse-starlette under load. -Verifies no memory leaks during sustained SSE streaming with many concurrent connections. 
+This module detects memory leaks and resource accumulation in the SSE implementation: +- Memory growth during sustained streaming (leak detection) +- Memory reclamation after connections close (cleanup verification) +- Internal event set cleanup (Issue #152 regression test) + +Memory leaks in SSE are particularly insidious because they accumulate slowly +over days/weeks in production, eventually causing OOM kills. """ +from __future__ import annotations + import asyncio -import statistics -from typing import List import httpx import pytest from httpx_sse import aconnect_sse +from .baseline import BaselineManager +from .conftest import register_test_report +from .metrics import MetricsCollector +from .reporter import ReportGenerator + @pytest.mark.loadtest async def test_memory_stability_under_load( sse_server_url: str, scale: int, duration_minutes: int, + metrics_collector: MetricsCollector, + baseline_manager: BaselineManager, + report_generator: ReportGenerator, + update_baseline: bool, + fail_on_regression: bool, ) -> None: """ - Connect many clients, stream for duration, verify memory is stable. - - Pass criteria: - - Memory growth < 50MB over test duration - - No unbounded growth trend (linear regression slope < 0.1 MB/sec) + Verify memory remains stable during sustained SSE streaming. + + ## What is Measured + - RSS memory at start, during streaming, and at end + - Total memory growth (peak - baseline) + - Memory growth rate (linear regression slope over time samples) + + ## Why This Matters + Detects memory leaks in the EventSourceResponse lifecycle: + - Buffers not released after send + - Task references held after completion + - Event objects accumulating in queues + - Closure captures preventing garbage collection + + A small leak (e.g., 1KB/connection) becomes catastrophic with thousands of + connections over hours of operation. + + ## Methodology + 1. Record baseline memory before any connections + 2. 
Connect `scale` clients, each streaming for `duration_minutes` + 3. Sample memory periodically during streaming + 4. Compute total growth and growth rate (slope) + + ## Pass Criteria + - Memory growth < 50MB total + - Growth rate (slope) < 0.1 MB/sec + - Rationale: 50MB allows for legitimate per-connection overhead while + catching runaway leaks. The slope check catches slow leaks that might + stay under the absolute threshold but indicate unbounded growth. """ events_per_client = duration_minutes * 60 * 10 # 10 events/sec - async def client_task(client_id: int) -> int: + async def client_task(client_id: int) -> tuple[int, str | None]: """Single client consuming SSE events.""" events_received = 0 try: @@ -40,20 +80,20 @@ async def client_task(client_id: int) -> int: events_received += 1 if events_received >= events_per_client: break - except Exception: - pass # Connection errors during shutdown are expected - return events_received + return events_received, None + except Exception as e: + return events_received, str(e) # Get baseline memory async with httpx.AsyncClient() as client: baseline = (await client.get(f"{sse_server_url}/metrics")).json() baseline_memory = baseline["memory_rss_mb"] + metrics_collector.set_memory_baseline(baseline_memory) # Start all clients tasks = [asyncio.create_task(client_task(i)) for i in range(scale)] # Sample memory periodically - memory_samples: List[float] = [] sample_interval = max(10, duration_minutes * 6) # At least 10 samples for _ in range(sample_interval): @@ -61,64 +101,117 @@ async def client_task(client_id: int) -> int: try: async with httpx.AsyncClient() as client: metrics = (await client.get(f"{sse_server_url}/metrics")).json() - memory_samples.append(metrics["memory_rss_mb"]) + metrics_collector.add_memory_sample(metrics["memory_rss_mb"]) except Exception: pass # Server might be under heavy load # Wait for all clients to complete results = await asyncio.gather(*tasks, return_exceptions=True) - completed = sum(1 for r in 
results if isinstance(r, int)) + + # Process results + for result in results: + if isinstance(result, Exception): + metrics_collector.record_failure(str(result)) + elif isinstance(result, tuple): + events, error = result + metrics_collector.add_client_events(events) + if error: + metrics_collector.record_failure(error) + else: + metrics_collector.record_success() # Get final memory async with httpx.AsyncClient() as client: final = (await client.get(f"{sse_server_url}/metrics")).json() final_memory = final["memory_rss_mb"] + metrics_collector.set_memory_final(final_memory) - # Calculate memory growth - max_memory = max(memory_samples) if memory_samples else final_memory - memory_growth = max_memory - baseline_memory + # Set duration + metrics_collector.set_duration(duration_minutes * 60) - # Calculate growth trend (simple linear regression slope) - if len(memory_samples) >= 2: - x_mean = len(memory_samples) / 2 - y_mean = statistics.mean(memory_samples) - numerator = sum( - (i - x_mean) * (y - y_mean) for i, y in enumerate(memory_samples) - ) - denominator = sum((i - x_mean) ** 2 for i in range(len(memory_samples))) - slope = numerator / denominator if denominator else 0 - # Convert to MB/sec - sample_interval_sec = duration_minutes * 60 / len(memory_samples) - slope_per_sec = slope / sample_interval_sec - else: - slope_per_sec = 0 - - # Assert criteria + # Generate report + report = metrics_collector.compute_report( + test_name="test_memory_stability_under_load", + scale=scale, + duration_minutes=duration_minutes, + ) + register_test_report(report) + + # Compare and output + comparison = baseline_manager.compare(report) + report.comparison = comparison.to_dict() if comparison else None + + report_generator.save_json(report) + report_generator.save_html(report, comparison) + report_generator.print_summary(report, comparison) + + if update_baseline: + baseline_manager.save_baseline(report) + + if fail_on_regression and comparison and comparison.regression_detected: + 
pytest.fail(f"Regression detected: {comparison.regression_reasons}") + + # Original assertions + completed = metrics_collector.successful_connections assert ( completed >= scale * 0.9 ), f"Too many failed connections: {completed}/{scale} completed" - assert memory_growth < 50, ( - f"Memory grew by {memory_growth:.1f}MB (baseline: {baseline_memory:.1f}MB, " - f"max: {max_memory:.1f}MB), expected < 50MB" - ) - assert ( - slope_per_sec < 0.1 - ), f"Memory growth trend {slope_per_sec:.3f} MB/sec, expected < 0.1 MB/sec" + + if report.memory: + assert report.memory.growth_mb < 50, ( + f"Memory grew by {report.memory.growth_mb:.1f}MB " + f"(baseline: {baseline_memory:.1f}MB, peak: {report.memory.peak_mb:.1f}MB), " + f"expected < 50MB" + ) + assert report.memory.slope_mb_per_sec < 0.1, ( + f"Memory growth trend {report.memory.slope_mb_per_sec:.3f} MB/sec, " + f"expected < 0.1 MB/sec" + ) @pytest.mark.loadtest async def test_memory_returns_to_baseline_after_disconnect( sse_server_url: str, scale: int, + duration_minutes: int, + metrics_collector: MetricsCollector, + baseline_manager: BaselineManager, + report_generator: ReportGenerator, + update_baseline: bool, + fail_on_regression: bool, ) -> None: """ - Connect many clients, disconnect all, verify memory returns near baseline. - - Pass criteria: - - Memory within 20% of baseline after all connections close + Verify memory is reclaimed after all connections close. + + ## What is Measured + - Memory before any connections (baseline) + - Memory after all connections close (final) + - Delta as percentage of baseline + + ## Why This Matters + Complements the stability test by verifying cleanup: + - Task references properly released + - anyio.Event objects garbage collected + - No lingering closures or callbacks + - Thread-local state cleared + + Even if memory doesn't grow during streaming, retained references after + disconnect indicate a leak that will accumulate across connection cycles. + + ## Methodology + 1. 
Record baseline memory + 2. Connect clients in batches, each receiving 50 events then disconnecting + 3. Wait 2 seconds for cleanup (GC, finalizers) + 4. Record final memory and compare to baseline + + ## Pass Criteria + - Final memory <= baseline * 1.2 (20% margin) + - Rationale: Python's memory allocator doesn't always return memory to OS + immediately. 20% margin accounts for fragmentation and GC timing while + still catching significant retention issues. """ - async def client_task(client_id: int) -> None: + async def client_task(client_id: int) -> tuple[int, str | None]: """Client that connects, receives few events, then disconnects.""" try: async with httpx.AsyncClient(timeout=30.0) as client: @@ -130,13 +223,15 @@ async def client_task(client_id: int) -> None: count += 1 if count >= 50: break - except Exception: - pass + return count, None + except Exception as e: + return 0, str(e) # Get baseline async with httpx.AsyncClient() as client: baseline = (await client.get(f"{sse_server_url}/metrics")).json() baseline_memory = baseline["memory_rss_mb"] + metrics_collector.set_memory_baseline(baseline_memory) # Connect and disconnect clients in batches batch_size = min(100, scale) @@ -145,7 +240,26 @@ async def client_task(client_id: int) -> None: tasks = [ asyncio.create_task(client_task(i)) for i in range(batch_start, batch_end) ] - await asyncio.gather(*tasks, return_exceptions=True) + results = await asyncio.gather(*tasks, return_exceptions=True) + + for result in results: + if isinstance(result, Exception): + metrics_collector.record_failure(str(result)) + elif isinstance(result, tuple): + events, error = result + metrics_collector.add_client_events(events) + if error: + metrics_collector.record_failure(error) + else: + metrics_collector.record_success() + + # Sample memory after each batch + try: + async with httpx.AsyncClient() as client: + metrics = (await client.get(f"{sse_server_url}/metrics")).json() + 
metrics_collector.add_memory_sample(metrics["memory_rss_mb"]) + except Exception: + pass # Wait for cleanup await asyncio.sleep(2) @@ -154,8 +268,31 @@ async def client_task(client_id: int) -> None: async with httpx.AsyncClient() as client: final = (await client.get(f"{sse_server_url}/metrics")).json() final_memory = final["memory_rss_mb"] + metrics_collector.set_memory_final(final_memory) - # Allow 20% growth from baseline (some overhead is expected) + # Generate report + report = metrics_collector.compute_report( + test_name="test_memory_returns_to_baseline_after_disconnect", + scale=scale, + duration_minutes=duration_minutes, + ) + register_test_report(report) + + # Compare and output + comparison = baseline_manager.compare(report) + report.comparison = comparison.to_dict() if comparison else None + + report_generator.save_json(report) + report_generator.save_html(report, comparison) + report_generator.print_summary(report, comparison) + + if update_baseline: + baseline_manager.save_baseline(report) + + if fail_on_regression and comparison and comparison.regression_detected: + pytest.fail(f"Regression detected: {comparison.regression_reasons}") + + # Original assertion max_allowed = baseline_memory * 1.2 assert final_memory <= max_allowed, ( f"Memory did not return to baseline: {final_memory:.1f}MB " @@ -164,18 +301,54 @@ async def client_task(client_id: int) -> None: @pytest.mark.loadtest -async def test_event_set_cleanup(sse_server_url: str, scale: int) -> None: +async def test_event_set_cleanup( + sse_server_url: str, + scale: int, + duration_minutes: int, + metrics_collector: MetricsCollector, + baseline_manager: BaselineManager, + report_generator: ReportGenerator, + update_baseline: bool, + fail_on_regression: bool, +) -> None: """ - Verify the internal event set empties after connections close. - - This tests the Issue #152 fix - events should be properly removed - from the thread-local state when connections close. 
+ Verify internal event set is cleaned up after connections close (Issue #152). + + ## What is Measured + - `registered_events` count from /metrics endpoint + - Events at baseline, peak (during connections), and after cleanup + - Watcher started status (should be True if connections exist) + + ## Why This Matters + This is a regression test for Issue #152 (watcher task leak). Before the fix: + - Each SSE connection created a new watcher task + - Events accumulated in `_ShutdownState.events` without cleanup + - CPU usage grew unbounded as N watchers polled AppStatus.should_exit + + After the fix (using threading.local): + - One watcher per thread, not per connection + - Events removed from set on connection close + - Watcher stops when set becomes empty + + ## Methodology + 1. Record baseline `registered_events` count + 2. Connect `scale` clients, wait for connections to establish + 3. Record peak `registered_events` (should be >= scale * 0.2) + 4. Wait for all connections to close + 2s cleanup + 5. Record final `registered_events` (should return near baseline) + + ## Pass Criteria + - Peak events >= scale * 0.2 (events were registered) + - Final events <= baseline + 10 (events were cleaned up) + - Rationale: We expect most (not all) connections to register events. + After cleanup, the set should be nearly empty. The +10 margin allows + for concurrent test interference. 
""" connected = asyncio.Event() connection_count = 0 - async def client_task() -> None: + async def client_task() -> tuple[int, str | None]: nonlocal connection_count try: async with httpx.AsyncClient(timeout=30.0) as client: @@ -190,13 +363,16 @@ async def client_task() -> None: count += 1 if count >= 5: # Stay connected for ~2.5s break - except Exception: - pass + return count, None + except Exception as e: + return 0, str(e) # Get baseline event count async with httpx.AsyncClient() as client: baseline = (await client.get(f"{sse_server_url}/metrics")).json() baseline_events = baseline["registered_events"] + baseline_memory = baseline["memory_rss_mb"] + metrics_collector.set_memory_baseline(baseline_memory) # Connect many clients tasks = [asyncio.create_task(client_task()) for _ in range(scale)] @@ -212,23 +388,64 @@ async def client_task() -> None: async with httpx.AsyncClient() as client: peak = (await client.get(f"{sse_server_url}/metrics")).json() peak_events = peak["registered_events"] + metrics_collector.add_memory_sample(peak["memory_rss_mb"]) # Wait for all to complete - await asyncio.gather(*tasks, return_exceptions=True) + results = await asyncio.gather(*tasks, return_exceptions=True) + + for result in results: + if isinstance(result, Exception): + metrics_collector.record_failure(str(result)) + elif isinstance(result, tuple): + events, error = result + metrics_collector.add_client_events(events) + if error: + metrics_collector.record_failure(error) + else: + metrics_collector.record_success() + await asyncio.sleep(2) # Allow cleanup time # Check events cleaned up async with httpx.AsyncClient() as client: final = (await client.get(f"{sse_server_url}/metrics")).json() final_events = final["registered_events"] + metrics_collector.set_memory_final(final["memory_rss_mb"]) - # Events should have been registered during peak (relaxed threshold) + # Record SSE internals + metrics_collector.set_sse_internals( + watcher_started=peak.get("watcher_started", False), 
+ peak_events=peak_events, + final_events=final_events, + ) + + # Generate report + report = metrics_collector.compute_report( + test_name="test_event_set_cleanup", + scale=scale, + duration_minutes=duration_minutes, + ) + register_test_report(report) + + # Compare and output + comparison = baseline_manager.compare(report) + report.comparison = comparison.to_dict() if comparison else None + + report_generator.save_json(report) + report_generator.save_html(report, comparison) + report_generator.print_summary(report, comparison) + + if update_baseline: + baseline_manager.save_baseline(report) + + if fail_on_regression and comparison and comparison.regression_detected: + pytest.fail(f"Regression detected: {comparison.regression_reasons}") + + # Original assertions assert peak_events >= scale * 0.2, ( f"Expected at least {scale * 0.2} events registered during peak, " f"got {peak_events}" ) - - # Events should be cleaned up after assert final_events <= baseline_events + 10, ( f"Event set not cleaned up: {final_events} events remaining " f"(baseline: {baseline_events})" diff --git a/tests/load/test_shutdown.py b/tests/load/test_shutdown.py index 1bdf43d..406a4a3 100644 --- a/tests/load/test_shutdown.py +++ b/tests/load/test_shutdown.py @@ -1,9 +1,18 @@ """ Graceful shutdown tests under load. -Verifies clean shutdown behavior with many active connections. +This module verifies the server shuts down cleanly with active SSE connections: +- SIGTERM handling with concurrent streams +- Connection notification and cleanup timing +- No hanging connections after shutdown + +Graceful shutdown is critical for zero-downtime deployments. If SSE connections +aren't properly notified, clients hang until TCP timeout (minutes), causing +poor user experience during rolling updates. 
""" +from __future__ import annotations + import asyncio import signal import time @@ -12,18 +21,56 @@ import pytest from httpx_sse import aconnect_sse +from .baseline import BaselineManager +from .conftest import register_test_report +from .metrics import MetricsCollector +from .reporter import ReportGenerator + @pytest.mark.loadtest async def test_graceful_shutdown_with_active_connections( docker_available: bool, scale: int, + duration_minutes: int, + metrics_collector: MetricsCollector, + baseline_manager: BaselineManager, + report_generator: ReportGenerator, + update_baseline: bool, + fail_on_regression: bool, ) -> None: """ - Send SIGTERM to server with active connections, verify clean shutdown. - - Pass criteria: - - Shutdown completes within 5 seconds - - All connections receive disconnect (no hanging clients) + Verify server shuts down cleanly within timeout when SIGTERM is sent. + + ## What is Measured + - Time from SIGTERM to all connections closed + - Connection close status (clean_close, server_closed, or error) + - Percentage of connections that closed successfully + + ## Why This Matters + Tests the core graceful shutdown mechanism: + - Uvicorn receives SIGTERM, sets Server.should_exit + - Watcher detects should_exit, broadcasts to all registered events + - EventSourceResponse streams terminate, connections close + - Server waits for in-flight requests, then exits + + Without this working: + - Rolling deployments cause client disconnects + - Container orchestrators kill processes after timeout + - Users experience broken connections during updates + + ## Methodology + 1. Start server in Docker container + 2. Connect `scale` concurrent SSE clients + 3. Wait for connections to establish (~2s) + 4. Send SIGTERM to container + 5. Measure time until all connections close + 6. 
Categorize close reasons (clean, server-initiated, error) + + ## Pass Criteria + - >= 90% connections closed (clean_closes + server_closes) + - Shutdown time < 10 seconds + - Rationale: 90% accounts for race conditions in test timing. 10s is + generous but catches hangs. Production should complete in <5s. """ if not docker_available: pytest.skip("Docker not available") @@ -46,7 +93,7 @@ async def test_graceful_shutdown_with_active_connections( connections_made = 0 connections_closed = 0 - async def client_task() -> str: + async def client_task() -> tuple[str, str | None]: nonlocal connections_made, connections_closed try: async with httpx.AsyncClient(timeout=30.0) as client: @@ -58,13 +105,13 @@ async def client_task() -> str: if disconnected.is_set(): break connections_closed += 1 - return "clean_close" + return "clean_close", None except httpx.RemoteProtocolError: connections_closed += 1 - return "server_closed" + return "server_closed", None except Exception as e: connections_closed += 1 - return f"error:{type(e).__name__}" + return f"error:{type(e).__name__}", str(e) # Start clients tasks = [asyncio.create_task(client_task()) for _ in range(scale)] @@ -90,6 +137,7 @@ async def client_task() -> str: results = await asyncio.gather(*tasks, return_exceptions=True) shutdown_time = time.perf_counter() - start_shutdown + metrics_collector.set_duration(shutdown_time) # Cleanup container try: @@ -98,28 +146,99 @@ async def client_task() -> str: pass # Analyze results - clean_closes = sum(1 for r in results if r == "clean_close") - server_closes = sum(1 for r in results if r == "server_closed") - errors = sum(1 for r in results if isinstance(r, str) and r.startswith("error:")) - - # All connections should have closed (one way or another) + clean_closes = 0 + server_closes = 0 + errors = 0 + + for result in results: + if isinstance(result, Exception): + metrics_collector.record_failure(str(result)) + errors += 1 + elif isinstance(result, tuple): + status, error = result + 
if status == "clean_close": + metrics_collector.record_success() + clean_closes += 1 + elif status == "server_closed": + metrics_collector.record_success() + server_closes += 1 + elif status.startswith("error:"): + metrics_collector.record_failure(error or status) + errors += 1 + + # Generate report + report = metrics_collector.compute_report( + test_name="test_graceful_shutdown_with_active_connections", + scale=scale, + duration_minutes=duration_minutes, + ) + register_test_report(report) + + # Compare and output + comparison = baseline_manager.compare(report) + report.comparison = comparison.to_dict() if comparison else None + + report_generator.save_json(report) + report_generator.save_html(report, comparison) + report_generator.print_summary(report, comparison) + + if update_baseline: + baseline_manager.save_baseline(report) + + if fail_on_regression and comparison and comparison.regression_detected: + pytest.fail(f"Regression detected: {comparison.regression_reasons}") + + # Original assertions total_closed = clean_closes + server_closes + errors assert ( total_closed >= scale * 0.9 ), f"Not all connections closed: {total_closed}/{scale}" - - # Shutdown should be fast assert shutdown_time < 10, f"Shutdown took {shutdown_time:.1f}s, expected < 10s" @pytest.mark.loadtest async def test_connections_receive_shutdown_signal( docker_available: bool, + scale: int, + duration_minutes: int, + metrics_collector: MetricsCollector, + baseline_manager: BaselineManager, + report_generator: ReportGenerator, + update_baseline: bool, + fail_on_regression: bool, ) -> None: """ - Verify connections are notified of shutdown via SSE. - - When AppStatus.should_exit is set, active streams should terminate gracefully. + Verify active SSE streams are interrupted by shutdown signal. 
+ + ## What is Measured + - Events received per client before shutdown + - Events received (or not) after shutdown signal + - Connection termination triggered by AppStatus.should_exit + + ## Why This Matters + Tests that the watcher correctly broadcasts shutdown to active streams: + - AppStatus.should_exit propagates to watcher task + - Watcher sets all registered anyio.Event objects + - EventSourceResponse._ping_task detects event, stops iteration + - Client receives connection close, not just timeout + + This complements the shutdown timing test by verifying the signal path + works, not just that connections eventually close. + + ## Methodology + 1. Start server in Docker container + 2. Connect 10 clients to /sse?delay=0.5 (slow stream to keep connections active) + 3. Wait 3s for clients to receive events + 4. Send SIGTERM + 5. Wait for clients to notice stream end + 6. Count events before/after signal + + ## Pass Criteria + - Total events > 0 (clients received events before shutdown) + - All clients received < 20 events (interrupted before completing) + - Rationale: With 0.5s delay, clients receive ~6 events in 3s. If they + reached 20, they weren't interrupted. This proves the shutdown signal + propagated through the watcher to active streams. 
""" if not docker_available: pytest.skip("Docker not available") @@ -133,7 +252,7 @@ async def test_connections_receive_shutdown_signal( base_url = container.get_base_url() # Connect clients that will wait for events - async def client_task() -> int: + async def client_task() -> tuple[int, str | None]: count = 0 try: async with httpx.AsyncClient(timeout=30.0) as client: @@ -144,9 +263,9 @@ async def client_task() -> int: count += 1 if count >= 20: # Should not reach this break - except Exception: - pass - return count + return count, None + except Exception as e: + return count, str(e) tasks = [asyncio.create_task(client_task()) for _ in range(10)] @@ -172,10 +291,46 @@ async def client_task() -> int: except Exception: pass - # Clients should have received some events before shutdown - event_counts = [r for r in results if isinstance(r, int)] - total_events = sum(event_counts) - + # Process results + total_events = 0 + event_counts: list[int] = [] + + for result in results: + if isinstance(result, Exception): + metrics_collector.record_failure(str(result)) + elif isinstance(result, tuple): + count, error = result + metrics_collector.add_client_events(count) + total_events += count + event_counts.append(count) + if error: + metrics_collector.record_failure(error) + else: + metrics_collector.record_success() + + # Generate report + report = metrics_collector.compute_report( + test_name="test_connections_receive_shutdown_signal", + scale=10, # Fixed scale for this test + duration_minutes=duration_minutes, + ) + register_test_report(report) + + # Compare and output + comparison = baseline_manager.compare(report) + report.comparison = comparison.to_dict() if comparison else None + + report_generator.save_json(report) + report_generator.save_html(report, comparison) + report_generator.print_summary(report, comparison) + + if update_baseline: + baseline_manager.save_baseline(report) + + if fail_on_regression and comparison and comparison.regression_detected: + 
pytest.fail(f"Regression detected: {comparison.regression_reasons}") + + # Original assertions assert total_events > 0, "Clients should have received events before shutdown" assert all( c < 20 for c in event_counts diff --git a/tests/load/test_throughput.py b/tests/load/test_throughput.py index 22d8407..7f47fbc 100644 --- a/tests/load/test_throughput.py +++ b/tests/load/test_throughput.py @@ -1,24 +1,64 @@ """ Throughput and latency tests for sse-starlette. -Measures events per second, latency percentiles, and first event latency. +This module measures the core performance characteristics of the SSE server: +- Raw throughput (events/sec) under various client loads +- Time to first event (connection setup latency) +- Inter-event latency distribution under load + +These metrics establish performance baselines and detect regressions in +event delivery, connection handling, and async task scheduling. """ +from __future__ import annotations + import asyncio import time -from typing import List import httpx import pytest from httpx_sse import aconnect_sse +from .baseline import BaselineManager +from .conftest import register_test_report +from .metrics import MetricsCollector +from .reporter import ReportGenerator + @pytest.mark.loadtest -async def test_throughput_single_client(sse_server_url: str) -> None: +async def test_throughput_single_client( + sse_server_url: str, + scale: int, + duration_minutes: int, + metrics_collector: MetricsCollector, + baseline_manager: BaselineManager, + report_generator: ReportGenerator, + update_baseline: bool, + fail_on_regression: bool, +) -> None: """ Measure maximum throughput for a single client. - Baseline measurement without contention. + ## What is Measured + - Events per second delivered to a single consumer with zero delay + - Server's maximum event generation rate without client contention + + ## Why This Matters + Establishes the performance ceiling for the SSE implementation. 
A regression + here indicates fundamental slowdown in event serialization, async scheduling, + or the streaming response path. This baseline is used to evaluate how well + throughput scales with multiple clients. + + ## Methodology + 1. Connect single client to /sse?delay=0 (server sends events as fast as possible) + 2. Count events received over 10 seconds + 3. Calculate events/sec throughput + + ## Pass Criteria + - Throughput >= 1000 events/sec + - Rationale: With zero delay, the bottleneck should be network I/O and async + scheduling. 1000 events/sec is achievable on any modern system and leaves + headroom for real-world latency. """ events_received = 0 start_time = time.perf_counter() @@ -36,7 +76,35 @@ async def test_throughput_single_client(sse_server_url: str) -> None: elapsed = time.perf_counter() - start_time throughput = events_received / elapsed - # Should achieve at least 1000 events/sec for a single client + # Record metrics + metrics_collector.add_client_events(events_received) + metrics_collector.set_duration(elapsed) + metrics_collector.record_success() + + # Generate report + report = metrics_collector.compute_report( + test_name="test_throughput_single_client", + scale=1, # Single client test + duration_minutes=duration_minutes, + ) + register_test_report(report) + + # Compare and output + comparison = baseline_manager.compare(report) + report.comparison = comparison.to_dict() if comparison else None + + report_generator.save_json(report) + report_generator.save_html(report, comparison) + report_generator.print_summary(report, comparison) + + if update_baseline: + baseline_manager.save_baseline(report) + + # Check for regression + if fail_on_regression and comparison and comparison.regression_detected: + pytest.fail(f"Regression detected: {comparison.regression_reasons}") + + # Original assertion assert ( throughput >= 1000 ), f"Single client throughput {throughput:.0f} events/sec, expected >= 1000" @@ -46,16 +114,44 @@ async def 
test_throughput_single_client(sse_server_url: str) -> None: async def test_throughput_multiple_clients( sse_server_url: str, scale: int, + duration_minutes: int, + metrics_collector: MetricsCollector, + baseline_manager: BaselineManager, + report_generator: ReportGenerator, + update_baseline: bool, + fail_on_regression: bool, ) -> None: """ Measure aggregate throughput with multiple concurrent clients. - Pass criteria: - - Aggregate throughput > 10,000 events/sec + ## What is Measured + - Total events/sec delivered across N concurrent SSE connections + - Per-client event counts to detect uneven distribution + - Connection success rate under concurrent load + + ## Why This Matters + Detects contention issues that only appear under load: + - Lock contention in the send path (anyio.Lock) + - Per-connection memory/CPU overhead scaling poorly + - Async task scheduler saturation + - Event loop blocking under concurrent I/O + + ## Methodology + 1. Launch `scale` concurrent client tasks (default 100) + 2. Each client connects to /sse?delay=0.001 (1ms between events) + 3. Run for 30 seconds, counting events per client + 4. Sum total events and calculate aggregate throughput + + ## Pass Criteria + - Aggregate throughput >= min(10000, scale * 100) events/sec + - Rationale: With 1ms delay, each client should receive ~1000 events/sec. + With 100 clients, expect ~100K events/sec total. The min() handles + smaller scale values gracefully. 
""" duration_seconds = 30 - async def client_task() -> int: + async def client_task() -> tuple[int, str | None]: + """Run client and return (event_count, error_or_none).""" count = 0 start = time.perf_counter() try: @@ -67,20 +163,56 @@ async def client_task() -> int: count += 1 if time.perf_counter() - start >= duration_seconds: break - except Exception: - pass - return count + return count, None + except Exception as e: + return count, str(e) start_time = time.perf_counter() tasks = [asyncio.create_task(client_task()) for _ in range(scale)] results = await asyncio.gather(*tasks, return_exceptions=True) elapsed = time.perf_counter() - start_time - total_events = sum(r for r in results if isinstance(r, int)) - aggregate_throughput = total_events / elapsed + # Process results + total_events = 0 + for result in results: + if isinstance(result, Exception): + metrics_collector.record_failure(str(result)) + elif isinstance(result, tuple): + count, error = result + metrics_collector.add_client_events(count) + total_events += count + if error: + metrics_collector.record_failure(error) + else: + metrics_collector.record_success() + + metrics_collector.set_duration(elapsed) + + # Generate report + report = metrics_collector.compute_report( + test_name="test_throughput_multiple_clients", + scale=scale, + duration_minutes=duration_minutes, + ) + register_test_report(report) - # With scale clients, should achieve high aggregate throughput - min_expected = min(10000, scale * 100) # Scale expectation with client count + # Compare and output + comparison = baseline_manager.compare(report) + report.comparison = comparison.to_dict() if comparison else None + + report_generator.save_json(report) + report_generator.save_html(report, comparison) + report_generator.print_summary(report, comparison) + + if update_baseline: + baseline_manager.save_baseline(report) + + if fail_on_regression and comparison and comparison.regression_detected: + pytest.fail(f"Regression detected: 
{comparison.regression_reasons}") + + # Original assertion + aggregate_throughput = total_events / elapsed + min_expected = min(10000, scale * 100) assert aggregate_throughput >= min_expected, ( f"Aggregate throughput {aggregate_throughput:.0f} events/sec with {scale} " f"clients, expected >= {min_expected}" @@ -91,17 +223,44 @@ async def client_task() -> int: async def test_first_event_latency( sse_server_url: str, scale: int, + duration_minutes: int, + metrics_collector: MetricsCollector, + baseline_manager: BaselineManager, + report_generator: ReportGenerator, + update_baseline: bool, + fail_on_regression: bool, ) -> None: """ Measure time to first event (TTFE) for multiple connections. - Pass criteria (relaxed for Docker overhead): - - p50 TTFE < 2000ms - - p99 TTFE < 5000ms + The TTFE is high due to the "thundering herd" effect (100 connections hitting simultaneously). + But the inter-event latency is excellent - only ~5ms overhead on top of the 10ms delay. + + ## What is Measured + - Time from connection initiation to first SSE event received + - Latency distribution across concurrent connections (p50, p99) + - Connection success rate under concurrent connection storms + + ## Why This Matters + TTFE is the user-perceived responsiveness metric. High TTFE indicates: + - Slow connection acceptance in the ASGI server + - Blocking operations in EventSourceResponse initialization + - Resource exhaustion during connection setup + - Inefficient task group initialization + + ## Methodology + 1. Launch `scale` concurrent connection attempts simultaneously + 2. Each client measures time from connect() to first SSE event + 3. 
Collect latency samples and compute percentiles + + ## Pass Criteria + - p50 < 1250ms, p99 < 2500ms + - Calibrated from measured p50=932ms, p99=1779ms at scale=100 + - Threshold factor: 1.3x measured values + """ - latencies: List[float] = [] - async def measure_ttfe() -> float: + async def measure_ttfe() -> tuple[float, str | None]: start = time.perf_counter() try: async with httpx.AsyncClient(timeout=30.0) as client: @@ -109,16 +268,51 @@ async def measure_ttfe() -> float: client, "GET", f"{sse_server_url}/sse?delay=0" ) as source: async for _ in source.aiter_sse(): - return (time.perf_counter() - start) * 1000 # ms - except Exception: - return -1 - return -1 + return (time.perf_counter() - start) * 1000, None + except Exception as e: + return -1, str(e) + return -1, "no events received" + start_time = time.perf_counter() tasks = [asyncio.create_task(measure_ttfe()) for _ in range(scale)] results = await asyncio.gather(*tasks) + elapsed = time.perf_counter() - start_time + + # Process results + latencies: list[float] = [] + for latency, error in results: + if latency > 0: + metrics_collector.add_ttfe_sample(latency) + metrics_collector.record_success() + latencies.append(latency) + else: + metrics_collector.record_failure(error or "unknown") + + metrics_collector.set_duration(elapsed) + + # Generate report + report = metrics_collector.compute_report( + test_name="test_first_event_latency", + scale=scale, + duration_minutes=duration_minutes, + ) + register_test_report(report) + + # Compare and output + comparison = baseline_manager.compare(report) + report.comparison = comparison.to_dict() if comparison else None - latencies = [r for r in results if r > 0] + report_generator.save_json(report) + report_generator.save_html(report, comparison) + report_generator.print_summary(report, comparison) + if update_baseline: + baseline_manager.save_baseline(report) + + if fail_on_regression and comparison and comparison.regression_detected: + pytest.fail(f"Regression 
detected: {comparison.regression_reasons}") + + # Original assertions if len(latencies) < scale * 0.9: pytest.fail(f"Too many failed connections: {len(latencies)}/{scale}") @@ -126,25 +320,52 @@ async def measure_ttfe() -> float: p50 = latencies[len(latencies) // 2] p99 = latencies[int(len(latencies) * 0.99)] - # Relaxed thresholds: Docker networking + container overhead - assert p50 < 2000, f"p50 TTFE {p50:.1f}ms, expected < 2000ms" - assert p99 < 5000, f"p99 TTFE {p99:.1f}ms, expected < 5000ms" + assert p50 < 1250, f"p50 TTFE {p50:.1f}ms, expected < 1250ms" + assert p99 < 2500, f"p99 TTFE {p99:.1f}ms, expected < 2500ms" @pytest.mark.loadtest async def test_event_latency_under_load( sse_server_url: str, scale: int, + duration_minutes: int, + metrics_collector: MetricsCollector, + baseline_manager: BaselineManager, + report_generator: ReportGenerator, + update_baseline: bool, + fail_on_regression: bool, ) -> None: """ - Measure event-to-event latency under load. - - Captures latency between consecutive events to detect backpressure. + Measure inter-event latency under concurrent load. + + ## What is Measured + - Time between consecutive SSE events (inter-event latency) + - Latency distribution percentiles (p50, p95, p99) + - Variance across multiple concurrent connections + + ## Why This Matters + Inter-event latency reveals hidden performance issues: + - Backpressure from slow sends affecting fast clients + - Buffer bloat in the response stream + - Async scheduler starvation under load + - GC pauses or memory pressure spikes + + Unlike throughput (which averages over time), latency percentiles expose + tail latency issues that degrade user experience. + + ## Methodology + 1. Launch `scale` concurrent clients to /sse?delay=0.01 (10ms between events) + 2. Each client receives 100 events and records inter-event times + 3. 
Aggregate all latency samples and compute percentiles + + ## Pass Criteria + - p50 < 20ms, p95 < 30ms, p99 < 40ms + - Calibrated from measured p50=14.8ms, p95=21.4ms, p99=27.4ms at scale=100 + - Server delay: 10ms. Threshold factor: 1.3x measured values """ - all_latencies: List[float] = [] - async def measure_latencies() -> List[float]: - latencies: List[float] = [] + async def measure_latencies() -> tuple[list[float], str | None]: + latencies: list[float] = [] last_time = None try: async with httpx.AsyncClient(timeout=60.0) as client: @@ -160,16 +381,51 @@ async def measure_latencies() -> List[float]: count += 1 if count >= 100: break - except Exception: - pass - return latencies + return latencies, None + except Exception as e: + return latencies, str(e) + start_time = time.perf_counter() tasks = [asyncio.create_task(measure_latencies()) for _ in range(scale)] results = await asyncio.gather(*tasks) + elapsed = time.perf_counter() - start_time + + # Process results + all_latencies: list[float] = [] + for client_latencies, error in results: + for lat in client_latencies: + metrics_collector.add_latency_sample(lat) + all_latencies.append(lat) + if error: + metrics_collector.record_failure(error) + else: + metrics_collector.record_success() + + metrics_collector.set_duration(elapsed) + + # Generate report + report = metrics_collector.compute_report( + test_name="test_event_latency_under_load", + scale=scale, + duration_minutes=duration_minutes, + ) + register_test_report(report) + + # Compare and output + comparison = baseline_manager.compare(report) + report.comparison = comparison.to_dict() if comparison else None + + report_generator.save_json(report) + report_generator.save_html(report, comparison) + report_generator.print_summary(report, comparison) + + if update_baseline: + baseline_manager.save_baseline(report) - for client_latencies in results: - all_latencies.extend(client_latencies) + if fail_on_regression and comparison and comparison.regression_detected: 
+ pytest.fail(f"Regression detected: {comparison.regression_reasons}") + # Original assertions if len(all_latencies) < 100: pytest.fail(f"Insufficient latency samples: {len(all_latencies)}") @@ -178,8 +434,6 @@ async def measure_latencies() -> List[float]: p95 = all_latencies[int(len(all_latencies) * 0.95)] p99 = all_latencies[int(len(all_latencies) * 0.99)] - # Expected ~10ms between events (0.01s delay) - # Allow 2x for processing overhead under load - assert p50 < 50, f"p50 inter-event latency {p50:.1f}ms, expected < 50ms" - assert p95 < 100, f"p95 inter-event latency {p95:.1f}ms, expected < 100ms" - assert p99 < 200, f"p99 inter-event latency {p99:.1f}ms, expected < 200ms" + assert p50 < 20, f"p50 inter-event latency {p50:.1f}ms, expected < 20ms" + assert p95 < 30, f"p95 inter-event latency {p95:.1f}ms, expected < 30ms" + assert p99 < 40, f"p99 inter-event latency {p99:.1f}ms, expected < 40ms" diff --git a/tests/load/test_watcher_scale.py b/tests/load/test_watcher_scale.py index 3a4d150..67cc28f 100644 --- a/tests/load/test_watcher_scale.py +++ b/tests/load/test_watcher_scale.py @@ -1,29 +1,76 @@ """ -Watcher deduplication tests at scale. +Watcher deduplication tests at scale (Issue #152 regression suite). -Validates the Issue #152 fix: only one watcher task per thread regardless -of the number of concurrent connections. +This module validates the fix for Issue #152: watcher task accumulation. +Before the fix, each SSE connection spawned a new watcher task that polled +AppStatus.should_exit. With thousands of connections, CPU usage grew unbounded. + +The fix uses threading.local() to maintain one watcher per thread. 
These tests +verify that pattern holds under various load conditions: +- Many simultaneous connections sharing a single watcher +- Rapid connect/disconnect cycles not spawning new watchers +- Clean watcher lifecycle (start -> broadcast -> cleanup -> restart) """ +from __future__ import annotations + import asyncio import httpx import pytest from httpx_sse import aconnect_sse +from .baseline import BaselineManager +from .conftest import register_test_report +from .metrics import MetricsCollector +from .reporter import ReportGenerator + @pytest.mark.loadtest async def test_single_watcher_with_many_connections( sse_server_url: str, scale: int, + duration_minutes: int, + metrics_collector: MetricsCollector, + baseline_manager: BaselineManager, + report_generator: ReportGenerator, + update_baseline: bool, + fail_on_regression: bool, ) -> None: """ - With N concurrent connections, verify only 1 watcher is running. - - This is the core regression test for Issue #152. + Verify only one watcher runs regardless of connection count (Issue #152 core test). + + ## What is Measured + - `watcher_started` flag from /metrics (True = watcher exists) + - `registered_events` count (should be >= scale * 0.5) + - Implicit: CPU usage would spike if multiple watchers existed (not measured) + + ## Why This Matters + This is the primary regression test for Issue #152. Before the fix: + - N connections = N watcher tasks + - Each watcher polls AppStatus.should_exit every 0.5s + - 1000 connections = 1000 polling tasks = CPU exhaustion + + After the fix: + - N connections = 1 watcher task (per thread) + - Watcher broadcasts to all registered events + - Constant CPU overhead regardless of connection count + + ## Methodology + 1. Connect `scale` concurrent clients (default 100) + 2. Wait for connections to establish (~2s) + 3. Query /metrics for watcher_started and registered_events + 4. 
Cancel all connections + + ## Pass Criteria + - watcher_started = True (watcher exists for active connections) + - registered_events >= scale * 0.5 (most connections registered) + - Rationale: watcher_started=True confirms the mechanism works. + Event count verifies registration worked. We don't directly measure + watcher count, but CPU metrics in CI would catch proliferation. """ - async def client_task() -> None: + async def client_task() -> tuple[int, str | None]: try: async with httpx.AsyncClient(timeout=30.0) as client: async with aconnect_sse( @@ -32,8 +79,9 @@ async def client_task() -> None: async for _ in source.aiter_sse(): await asyncio.sleep(5) # Stay connected break - except Exception: - pass + return 1, None + except Exception as e: + return 0, str(e) # Start many connections tasks = [asyncio.create_task(client_task()) for _ in range(scale)] @@ -47,16 +95,55 @@ async def client_task() -> None: watcher_started = metrics["watcher_started"] registered_events = metrics["registered_events"] + metrics_collector.add_memory_sample(metrics["memory_rss_mb"]) # Cancel all tasks for task in tasks: task.cancel() - await asyncio.gather(*tasks, return_exceptions=True) - - # Watcher should be running + results = await asyncio.gather(*tasks, return_exceptions=True) + + # Process results + for result in results: + if isinstance(result, Exception): + metrics_collector.record_failure(str(result)) + elif isinstance(result, tuple): + _, error = result + if error: + metrics_collector.record_failure(error) + else: + metrics_collector.record_success() + + # Record SSE internals + metrics_collector.set_sse_internals( + watcher_started=watcher_started, + peak_events=registered_events, + final_events=0, + ) + + # Generate report + report = metrics_collector.compute_report( + test_name="test_single_watcher_with_many_connections", + scale=scale, + duration_minutes=duration_minutes, + ) + register_test_report(report) + + # Compare and output + comparison = 
baseline_manager.compare(report) + report.comparison = comparison.to_dict() if comparison else None + + report_generator.save_json(report) + report_generator.save_html(report, comparison) + report_generator.print_summary(report, comparison) + + if update_baseline: + baseline_manager.save_baseline(report) + + if fail_on_regression and comparison and comparison.regression_detected: + pytest.fail(f"Regression detected: {comparison.regression_reasons}") + + # Original assertions assert watcher_started is True, "Watcher should be started with active connections" - - # Should have many events registered (one per connection) assert ( registered_events >= scale * 0.5 ), f"Expected at least {scale * 0.5} events, got {registered_events}" @@ -66,14 +153,43 @@ async def client_task() -> None: async def test_rapid_connect_disconnect_watcher_stability( sse_server_url: str, scale: int, + duration_minutes: int, + metrics_collector: MetricsCollector, + baseline_manager: BaselineManager, + report_generator: ReportGenerator, + update_baseline: bool, + fail_on_regression: bool, ) -> None: """ - Rapid connect/disconnect cycles should not accumulate watchers. - - Each connect/disconnect should reuse the existing watcher, not spawn new ones. + Verify rapid connect/disconnect cycles don't accumulate watcher tasks. + + ## What is Measured + - Thread count after many rapid connection cycles + - Memory samples during the churn + - SSE internals (watcher_started, registered_events) + + ## Why This Matters + Tests the watcher lifecycle under high churn: + - Connections come and go faster than the watcher poll interval (0.5s) + - Watcher must survive connection churn without proliferation + - Event registration/deregistration must be thread-safe + + Before Issue #152 fix, each connection left behind a watcher task. Even if + connections closed quickly, watchers accumulated and never stopped. + + ## Methodology + 1. Run `scale / 10` batches of 10 quick connections each + 2. 
Each connection receives 1 event and disconnects immediately + 3. After all batches, check thread count and watcher status + + ## Pass Criteria + - num_threads < 50 + - Rationale: A healthy uvicorn server has ~5-10 threads. If watchers + accumulated, we'd see hundreds of threads (one per watcher task). + 50 provides margin for legitimate worker threads. """ - async def quick_connect() -> None: + async def quick_connect() -> tuple[int, str | None]: try: async with httpx.AsyncClient(timeout=10.0) as client: async with aconnect_sse( @@ -81,13 +197,25 @@ async def quick_connect() -> None: ) as source: async for _ in source.aiter_sse(): break # Disconnect after first event - except Exception: - pass + return 1, None + except Exception as e: + return 0, str(e) # Rapid connect/disconnect cycles for batch in range(scale // 10): tasks = [asyncio.create_task(quick_connect()) for _ in range(10)] - await asyncio.gather(*tasks, return_exceptions=True) + results = await asyncio.gather(*tasks, return_exceptions=True) + + for result in results: + if isinstance(result, Exception): + metrics_collector.record_failure(str(result)) + elif isinstance(result, tuple): + count, error = result + metrics_collector.add_client_events(count) + if error: + metrics_collector.record_failure(error) + else: + metrics_collector.record_success() # Brief pause await asyncio.sleep(0.5) @@ -96,24 +224,88 @@ async def quick_connect() -> None: async with httpx.AsyncClient() as client: metrics = (await client.get(f"{sse_server_url}/metrics")).json() - # The watcher_started flag confirms single watcher pattern - # If multiple watchers had spawned, we'd see resource issues num_threads = metrics["num_threads"] - - # Thread count should be reasonable (not proportional to connection count) - # A healthy uvicorn worker has ~5-10 threads typically + metrics_collector.add_memory_sample(metrics["memory_rss_mb"]) + + # Record SSE internals + metrics_collector.set_sse_internals( + 
watcher_started=metrics.get("watcher_started", False), + peak_events=metrics.get("registered_events", 0), + final_events=metrics.get("registered_events", 0), + ) + + # Generate report + report = metrics_collector.compute_report( + test_name="test_rapid_connect_disconnect_watcher_stability", + scale=scale, + duration_minutes=duration_minutes, + ) + register_test_report(report) + + # Compare and output + comparison = baseline_manager.compare(report) + report.comparison = comparison.to_dict() if comparison else None + + report_generator.save_json(report) + report_generator.save_html(report, comparison) + report_generator.print_summary(report, comparison) + + if update_baseline: + baseline_manager.save_baseline(report) + + if fail_on_regression and comparison and comparison.regression_detected: + pytest.fail(f"Regression detected: {comparison.regression_reasons}") + + # Original assertion assert num_threads < 50, f"Too many threads ({num_threads}), possible watcher leak" @pytest.mark.loadtest -async def test_watcher_cleanup_allows_restart(sse_server_url: str) -> None: +async def test_watcher_cleanup_allows_restart( + sse_server_url: str, + scale: int, + duration_minutes: int, + metrics_collector: MetricsCollector, + baseline_manager: BaselineManager, + report_generator: ReportGenerator, + update_baseline: bool, + fail_on_regression: bool, +) -> None: """ - After all connections close, new connections should start fresh watcher. - - Tests the watcher lifecycle: start -> broadcast -> cleanup -> restart. + Verify watcher stops when all connections close, restarts with new connections. + + ## What is Measured + - registered_events after Phase 1 (should be near 0 after cleanup) + - Events received in Phase 2 (watcher must restart to deliver them) + - Final registered_events (should match Phase 1 cleanup) + + ## Why This Matters + Tests the complete watcher lifecycle: + 1. Start: First connection starts the watcher + 2. 
Broadcast: Watcher delivers shutdown signals to all registered events + 3. Cleanup: Last connection removes its event, watcher stops + 4. Restart: New connections restart the watcher + + If cleanup fails, events accumulate indefinitely. If restart fails, new + connections won't receive shutdown signals, causing graceful shutdown to fail. + + ## Methodology + 1. Phase 1: Connect 50 clients, each receives 20 events, then disconnects + 2. Wait 1s for cleanup + 3. Check registered_events (should be near 0) + 4. Phase 2: Connect 50 new clients, each receives 20 events + 5. Wait 1s for cleanup + 6. Verify final state matches Phase 1 post-cleanup + + ## Pass Criteria + - phase1_events > 0 (Phase 1 received events) + - phase2_events > 0 (Phase 2 received events - proves restart worked) + - final_events <= events_after_phase1 + 5 (cleanup works consistently) + - Rationale: If watcher didn't restart in Phase 2, no events would be + delivered. The +5 margin allows for concurrent test interference. 
""" - async def connect_and_consume(n_events: int) -> int: + async def connect_and_consume(n_events: int) -> tuple[int, str | None]: count = 0 try: async with httpx.AsyncClient(timeout=30.0) as client: @@ -124,14 +316,24 @@ async def connect_and_consume(n_events: int) -> int: count += 1 if count >= n_events: break - except Exception: - pass - return count + return count, None + except Exception as e: + return count, str(e) # Phase 1: Connect, consume, disconnect tasks = [asyncio.create_task(connect_and_consume(20)) for _ in range(50)] results = await asyncio.gather(*tasks) - assert sum(results) > 0, "Phase 1 should have received events" + + phase1_events = 0 + for result in results: + if isinstance(result, tuple): + count, error = result + phase1_events += count + metrics_collector.add_client_events(count) + if error: + metrics_collector.record_failure(error) + else: + metrics_collector.record_success() # Wait for cleanup await asyncio.sleep(1) @@ -140,11 +342,22 @@ async def connect_and_consume(n_events: int) -> int: async with httpx.AsyncClient() as client: metrics1 = (await client.get(f"{sse_server_url}/metrics")).json() events_after_phase1 = metrics1["registered_events"] + metrics_collector.add_memory_sample(metrics1["memory_rss_mb"]) # Phase 2: New connections should work tasks = [asyncio.create_task(connect_and_consume(20)) for _ in range(50)] results = await asyncio.gather(*tasks) - assert sum(results) > 0, "Phase 2 should have received events" + + phase2_events = 0 + for result in results: + if isinstance(result, tuple): + count, error = result + phase2_events += count + metrics_collector.add_client_events(count) + if error: + metrics_collector.record_failure(error) + else: + metrics_collector.record_success() # Wait for cleanup await asyncio.sleep(1) @@ -153,7 +366,41 @@ async def connect_and_consume(n_events: int) -> int: async with httpx.AsyncClient() as client: metrics2 = (await client.get(f"{sse_server_url}/metrics")).json() - # Events should be 
cleaned up after both phases + final_events = metrics2["registered_events"] + metrics_collector.add_memory_sample(metrics2["memory_rss_mb"]) + + # Record SSE internals + metrics_collector.set_sse_internals( + watcher_started=metrics2.get("watcher_started", False), + peak_events=max(events_after_phase1, final_events), + final_events=final_events, + ) + + # Generate report + report = metrics_collector.compute_report( + test_name="test_watcher_cleanup_allows_restart", + scale=scale, + duration_minutes=duration_minutes, + ) + register_test_report(report) + + # Compare and output + comparison = baseline_manager.compare(report) + report.comparison = comparison.to_dict() if comparison else None + + report_generator.save_json(report) + report_generator.save_html(report, comparison) + report_generator.print_summary(report, comparison) + + if update_baseline: + baseline_manager.save_baseline(report) + + if fail_on_regression and comparison and comparison.regression_detected: + pytest.fail(f"Regression detected: {comparison.regression_reasons}") + + # Original assertions + assert phase1_events > 0, "Phase 1 should have received events" + assert phase2_events > 0, "Phase 2 should have received events" assert ( - metrics2["registered_events"] <= events_after_phase1 + 5 + final_events <= events_after_phase1 + 5 ), "Event set should be cleaned up between phases" From 1b25b1625fab4099d79393db9886cf1d41bff2ad Mon Sep 17 00:00:00 2001 From: sysid Date: Fri, 2 Jan 2026 13:13:47 +0100 Subject: [PATCH 4/4] refactor(loadtest): replace CLI scale/duration with per-test constants Remove --scale and --duration CLI options from load tests. Each test now defines its own parameters as constants, allowing appropriate values per test type (e.g., shutdown tests need fewer connections). Changes: - conftest.py: remove --scale, --duration options and fixtures - metrics.py: compute duration_minutes internally from actual duration - test_*.py: add explicit NUM_CLIENTS, DURATION_SEC, etc. 
constants - README.md: update CLI options documentation - load-test.yml: remove scale/duration workflow inputs --- .github/workflows/load-test.yml | 22 -------- tests/load/README.md | 48 ++++++++++++++-- tests/load/conftest.py | 24 -------- tests/load/metrics.py | 7 ++- tests/load/test_backpressure.py | 67 +++++++++++----------- tests/load/test_memory_stability.py | 87 +++++++++++++++++------------ tests/load/test_shutdown.py | 33 +++++------ tests/load/test_throughput.py | 69 +++++++++++------------ tests/load/test_watcher_scale.py | 74 +++++++++++++++--------- 9 files changed, 227 insertions(+), 204 deletions(-) diff --git a/.github/workflows/load-test.yml b/.github/workflows/load-test.yml index ec3691e..726c170 100644 --- a/.github/workflows/load-test.yml +++ b/.github/workflows/load-test.yml @@ -3,24 +3,6 @@ name: Load Tests on: workflow_dispatch: inputs: - scale: - description: 'Number of concurrent connections' - required: true - default: '100' - type: choice - options: - - '100' - - '500' - - '1000' - duration: - description: 'Test duration in minutes' - required: true - default: '1' - type: choice - options: - - '1' - - '5' - - '10' update_baseline: description: 'Update baselines after run' required: false @@ -59,8 +41,6 @@ jobs: - name: Run load tests run: | python -m pytest tests/load/ -m "loadtest" \ - --scale=${{ inputs.scale }} \ - --duration=${{ inputs.duration }} \ --output-dir=tests/load/results \ ${{ inputs.update_baseline && '--update-baseline' || '' }} \ ${{ inputs.fail_on_regression && '--fail-on-regression' || '' }} \ @@ -91,8 +71,6 @@ jobs: run: | echo "## Load Test Results" >> $GITHUB_STEP_SUMMARY echo "" >> $GITHUB_STEP_SUMMARY - echo "- **Scale**: ${{ inputs.scale }} concurrent connections" >> $GITHUB_STEP_SUMMARY - echo "- **Duration**: ${{ inputs.duration }} minutes" >> $GITHUB_STEP_SUMMARY echo "- **Commit**: ${{ github.sha }}" >> $GITHUB_STEP_SUMMARY echo "" >> $GITHUB_STEP_SUMMARY echo "### Reports" >> $GITHUB_STEP_SUMMARY diff --git 
a/tests/load/README.md b/tests/load/README.md index 13332a5..9b0343a 100644 --- a/tests/load/README.md +++ b/tests/load/README.md @@ -17,9 +17,6 @@ These tests measure performance characteristics that unit tests cannot capture: # Run load tests locally (requires Docker) make test-load -# Run with custom scale -make test-load PYTEST_ARGS="--scale=500 --duration=5" - # Update baselines after intentional changes make test-load PYTEST_ARGS="--update-baseline" ``` @@ -105,14 +102,15 @@ make test-load PYTEST_ARGS="--fail-on-regression" | Option | Default | Description | |--------|---------|-------------| -| `--scale` | 100 | Concurrent connections | -| `--duration` | 1 | Test duration (minutes) | | `--output-dir` | `tests/load/results` | Report output directory | | `--baselines-dir` | `tests/load/baselines` | Baseline file directory | | `--update-baseline` | False | Save current run as new baseline | | `--fail-on-regression` | False | Exit non-zero if regression detected | | `--regression-threshold` | 20 | Percent change to trigger warning | +**Note**: Test scale (connections, duration) is controlled via constants within each test file. +This allows appropriate parameters per test type (e.g., shutdown tests use fewer connections). 
+ ## Test Categories ### Throughput (`test_throughput.py`) @@ -219,10 +217,48 @@ Features: - Comparison against baseline with delta percentages - Regression/warning highlights +## Server Metrics Endpoint + +The load test server exposes `/metrics` for monitoring: + +```json +{ + "memory_rss_mb": 45.2, + "num_fds": 25, + "num_threads": 8, + "watcher_started": true, + "registered_events": 100, + "uptime_seconds": 30.5 +} +``` + +Key metrics: +- `memory_rss_mb`: Detect memory leaks +- `registered_events`: Verify Issue #152 (should equal active connections) +- `watcher_started`: Confirm single watcher pattern +- `num_fds`: Detect file descriptor leaks + +## Dependencies + +Added to `pyproject.toml` as optional `[loadtest]` group: + +```bash +pip install -e ".[loadtest]" +``` + ## GitHub Actions Integration The workflow (`.github/workflows/load-test.yml`) supports: -- Manual trigger with scale/duration inputs +- Manual trigger via workflow_dispatch - Baseline update option - Regression detection for CI gates - Artifact upload for reports + +## Design Decisions + +| Choice | Rationale | +|--------|-----------| +| httpx-sse + asyncio | Native async SSE client, simple concurrency with asyncio.gather() | +| Docker containers | Isolated environment, reproducible, clean SIGTERM shutdown | +| Manual CI trigger | Load tests are resource-intensive, not suitable for every PR | +| psutil for metrics | Cross-platform, no infrastructure needed, real-time data | diff --git a/tests/load/conftest.py b/tests/load/conftest.py index b70f888..160d2e3 100644 --- a/tests/load/conftest.py +++ b/tests/load/conftest.py @@ -112,18 +112,6 @@ async def async_client() -> httpx.AsyncClient: def pytest_addoption(parser: pytest.Parser) -> None: """Add custom command line options for load tests.""" - parser.addoption( - "--scale", - action="store", - default="100", - help="Number of concurrent connections for load tests", - ) - parser.addoption( - "--duration", - action="store", - default="1", - 
help="Test duration in minutes", - ) parser.addoption( "--output-dir", action="store", @@ -157,18 +145,6 @@ def pytest_addoption(parser: pytest.Parser) -> None: ) -@pytest.fixture -def scale(request: pytest.FixtureRequest) -> int: - """Get the scale (number of connections) for load tests.""" - return int(request.config.getoption("--scale")) - - -@pytest.fixture -def duration_minutes(request: pytest.FixtureRequest) -> int: - """Get the duration in minutes for load tests.""" - return int(request.config.getoption("--duration")) - - @pytest.fixture def output_dir(request: pytest.FixtureRequest) -> Path: """Get the output directory for reports.""" diff --git a/tests/load/metrics.py b/tests/load/metrics.py index c3bfea5..c5929dc 100644 --- a/tests/load/metrics.py +++ b/tests/load/metrics.py @@ -345,9 +345,7 @@ def set_sse_internals( self.peak_registered_events = peak_events self.final_registered_events = final_events - def compute_report( - self, test_name: str, scale: int, duration_minutes: int - ) -> TestReport: + def compute_report(self, test_name: str, scale: int) -> TestReport: """Compute final report from collected samples.""" git_commit, git_branch = _get_git_info() timestamp = datetime.now(timezone.utc).isoformat() @@ -406,6 +404,9 @@ def compute_report( final_registered_events=self.final_registered_events, ) + # Compute duration_minutes from actual test duration + duration_minutes = max(1, int(self.total_duration_sec / 60)) + return TestReport( test_name=test_name, timestamp=timestamp, diff --git a/tests/load/test_backpressure.py b/tests/load/test_backpressure.py index 024adf6..6c598e9 100644 --- a/tests/load/test_backpressure.py +++ b/tests/load/test_backpressure.py @@ -28,8 +28,6 @@ @pytest.mark.loadtest async def test_slow_clients_dont_block_fast_clients( sse_server_url: str, - scale: int, - duration_minutes: int, metrics_collector: MetricsCollector, baseline_manager: BaselineManager, report_generator: ReportGenerator, @@ -54,9 +52,9 @@ async def 
test_slow_clients_dont_block_fast_clients( making the server unusable under mixed load. ## Methodology - 1. Connect 10 "fast" clients (consume events immediately) - 2. Connect 10 "slow" clients (sleep 0.5s after each event) - 3. Run for 10 seconds + 1. Connect NUM_FAST "fast" clients (consume events immediately) + 2. Connect NUM_SLOW "slow" clients (sleep 0.5s after each event) + 3. Run for DURATION_SEC seconds 4. Compare event counts ## Pass Criteria @@ -66,7 +64,10 @@ async def test_slow_clients_dont_block_fast_clients( Slow clients receive ~20 (10s / 0.5s). 5x ratio is conservative. 500 events threshold catches severe throttling. """ - test_duration = 10 # seconds + # Test parameters + NUM_FAST = 10 + NUM_SLOW = 10 + DURATION_SEC = 10 async def fast_client() -> tuple[int, str | None]: """Client that consumes events as fast as possible.""" @@ -79,7 +80,7 @@ async def fast_client() -> tuple[int, str | None]: ) as source: async for _ in source.aiter_sse(): count += 1 - if time.perf_counter() - start >= test_duration: + if time.perf_counter() - start >= DURATION_SEC: break return count, None except Exception as e: @@ -97,7 +98,7 @@ async def slow_client() -> tuple[int, str | None]: async for _ in source.aiter_sse(): await asyncio.sleep(0.5) # Slow processing count += 1 - if time.perf_counter() - start >= test_duration: + if time.perf_counter() - start >= DURATION_SEC: break return count, None except Exception as e: @@ -106,8 +107,8 @@ async def slow_client() -> tuple[int, str | None]: start_time = time.perf_counter() # Mix of fast and slow clients - fast_tasks = [asyncio.create_task(fast_client()) for _ in range(10)] - slow_tasks = [asyncio.create_task(slow_client()) for _ in range(10)] + fast_tasks = [asyncio.create_task(fast_client()) for _ in range(NUM_FAST)] + slow_tasks = [asyncio.create_task(slow_client()) for _ in range(NUM_SLOW)] fast_results = await asyncio.gather(*fast_tasks) slow_results = await asyncio.gather(*slow_tasks) @@ -145,8 +146,7 @@ async def 
slow_client() -> tuple[int, str | None]: # Generate report report = metrics_collector.compute_report( test_name="test_slow_clients_dont_block_fast_clients", - scale=20, # 10 fast + 10 slow - duration_minutes=duration_minutes, + scale=NUM_FAST + NUM_SLOW, ) register_test_report(report) @@ -177,8 +177,6 @@ async def slow_client() -> tuple[int, str | None]: @pytest.mark.loadtest async def test_connection_churn_stability( sse_server_url: str, - scale: int, - duration_minutes: int, metrics_collector: MetricsCollector, baseline_manager: BaselineManager, report_generator: ReportGenerator, @@ -204,7 +202,7 @@ async def test_connection_churn_stability( ## Methodology 1. Record baseline FDs and memory - 2. Create `churn_rate` connections per second for 30 seconds + 2. Create CHURN_RATE connections per second for DURATION_SEC seconds 3. Each connection receives one event and disconnects 4. Sample memory every 5 seconds 5. Record final FDs and memory @@ -217,9 +215,11 @@ async def test_connection_churn_stability( 100MB memory is generous but catches runaway allocation. 90% success rate accounts for expected failures under heavy churn. 
""" - churn_rate = min(100, scale) # connections per second - duration = 30 # seconds - total_connections = churn_rate * duration + # Test parameters + CHURN_RATE = 100 # connections per second + DURATION_SEC = 30 + + total_connections = CHURN_RATE * DURATION_SEC async def quick_connection() -> tuple[bool, str | None]: try: @@ -245,8 +245,8 @@ async def quick_connection() -> tuple[bool, str | None]: # Create connections at target rate successful = 0 - for batch in range(duration): - tasks = [asyncio.create_task(quick_connection()) for _ in range(churn_rate)] + for batch in range(DURATION_SEC): + tasks = [asyncio.create_task(quick_connection()) for _ in range(CHURN_RATE)] results = await asyncio.gather(*tasks, return_exceptions=True) for result in results: @@ -285,8 +285,7 @@ async def quick_connection() -> tuple[bool, str | None]: # Generate report report = metrics_collector.compute_report( test_name="test_connection_churn_stability", - scale=scale, - duration_minutes=duration_minutes, + scale=total_connections, ) register_test_report(report) @@ -327,8 +326,6 @@ async def quick_connection() -> tuple[bool, str | None]: @pytest.mark.loadtest async def test_send_timeout_under_load( sse_server_url: str, - scale: int, - duration_minutes: int, metrics_collector: MetricsCollector, baseline_manager: BaselineManager, report_generator: ReportGenerator, @@ -354,8 +351,8 @@ async def test_send_timeout_under_load( "freeze" (backgrounded, network change) without closing connections. ## Methodology - 1. Connect 5 "frozen" clients (receive one event, then stop reading) - 2. Connect 3 "normal" clients (receive 50 events normally) + 1. Connect NUM_FROZEN "frozen" clients (receive one event, then stop reading) + 2. Connect NUM_NORMAL "normal" clients (receive EVENTS_PER_NORMAL events normally) 3. Wait for normal clients to complete 4. Verify normal clients weren't affected @@ -365,6 +362,10 @@ async def test_send_timeout_under_load( small margin for timing. 
If frozen clients blocked the server, normal clients would timeout or receive far fewer events. """ + # Test parameters + NUM_FROZEN = 5 + NUM_NORMAL = 3 + EVENTS_PER_NORMAL = 50 async def frozen_client() -> tuple[str, float, str | None]: """Client that stops reading after first event (simulates frozen client).""" @@ -385,7 +386,7 @@ async def frozen_client() -> tuple[str, float, str | None]: return "completed", time.perf_counter() - start, None # Start some frozen clients (server has default send_timeout) - frozen_tasks = [asyncio.create_task(frozen_client()) for _ in range(5)] + frozen_tasks = [asyncio.create_task(frozen_client()) for _ in range(NUM_FROZEN)] # Also verify server remains responsive with normal clients async def normal_client() -> tuple[int, str | None]: @@ -397,13 +398,13 @@ async def normal_client() -> tuple[int, str | None]: ) as source: async for _ in source.aiter_sse(): count += 1 - if count >= 50: + if count >= EVENTS_PER_NORMAL: break return count, None except Exception as e: return count, str(e) - normal_tasks = [asyncio.create_task(normal_client()) for _ in range(3)] + normal_tasks = [asyncio.create_task(normal_client()) for _ in range(NUM_NORMAL)] # Wait for normal clients to complete normal_results = await asyncio.gather(*normal_tasks) @@ -441,8 +442,7 @@ async def normal_client() -> tuple[int, str | None]: # Generate report report = metrics_collector.compute_report( test_name="test_send_timeout_under_load", - scale=8, # 5 frozen + 3 normal - duration_minutes=duration_minutes, + scale=NUM_FROZEN + NUM_NORMAL, ) register_test_report(report) @@ -460,7 +460,8 @@ async def normal_client() -> tuple[int, str | None]: if fail_on_regression and comparison and comparison.regression_detected: pytest.fail(f"Regression detected: {comparison.regression_reasons}") - # Original assertion + # TODO: fix percentage + min_expected = EVENTS_PER_NORMAL - 5 # Allow 10% margin assert all( - r >= 45 for r in normal_counts + r >= min_expected for r in 
normal_counts ), f"Normal clients affected by frozen clients: {normal_counts}" diff --git a/tests/load/test_memory_stability.py b/tests/load/test_memory_stability.py index 883b8ad..118c1c7 100644 --- a/tests/load/test_memory_stability.py +++ b/tests/load/test_memory_stability.py @@ -13,6 +13,7 @@ from __future__ import annotations import asyncio +import time import httpx import pytest @@ -27,8 +28,6 @@ @pytest.mark.loadtest async def test_memory_stability_under_load( sse_server_url: str, - scale: int, - duration_minutes: int, metrics_collector: MetricsCollector, baseline_manager: BaselineManager, report_generator: ReportGenerator, @@ -55,7 +54,7 @@ async def test_memory_stability_under_load( ## Methodology 1. Record baseline memory before any connections - 2. Connect `scale` clients, each streaming for `duration_minutes` + 2. Connect NUM_CLIENTS clients, each streaming for DURATION_SEC 3. Sample memory periodically during streaming 4. Compute total growth and growth rate (slope) @@ -66,7 +65,10 @@ async def test_memory_stability_under_load( catching runaway leaks. The slope check catches slow leaks that might stay under the absolute threshold but indicate unbounded growth. 
""" - events_per_client = duration_minutes * 60 * 10 # 10 events/sec + # Test parameters + NUM_CLIENTS = 100 + DURATION_SEC = 60 + EVENTS_PER_CLIENT = DURATION_SEC * 10 # 10 events/sec with 0.1s delay async def client_task(client_id: int) -> tuple[int, str | None]: """Single client consuming SSE events.""" @@ -78,7 +80,7 @@ async def client_task(client_id: int) -> tuple[int, str | None]: ) as source: async for _ in source.aiter_sse(): events_received += 1 - if events_received >= events_per_client: + if events_received >= EVENTS_PER_CLIENT: break return events_received, None except Exception as e: @@ -91,13 +93,13 @@ async def client_task(client_id: int) -> tuple[int, str | None]: metrics_collector.set_memory_baseline(baseline_memory) # Start all clients - tasks = [asyncio.create_task(client_task(i)) for i in range(scale)] + start_time = time.perf_counter() + tasks = [asyncio.create_task(client_task(i)) for i in range(NUM_CLIENTS)] - # Sample memory periodically - sample_interval = max(10, duration_minutes * 6) # At least 10 samples - - for _ in range(sample_interval): - await asyncio.sleep(duration_minutes * 60 / sample_interval) + # Sample memory periodically (at least 10 samples) + num_samples = 10 + for _ in range(num_samples): + await asyncio.sleep(DURATION_SEC / num_samples) try: async with httpx.AsyncClient() as client: metrics = (await client.get(f"{sse_server_url}/metrics")).json() @@ -107,6 +109,7 @@ async def client_task(client_id: int) -> tuple[int, str | None]: # Wait for all clients to complete results = await asyncio.gather(*tasks, return_exceptions=True) + elapsed = time.perf_counter() - start_time # Process results for result in results: @@ -127,13 +130,12 @@ async def client_task(client_id: int) -> tuple[int, str | None]: metrics_collector.set_memory_final(final_memory) # Set duration - metrics_collector.set_duration(duration_minutes * 60) + metrics_collector.set_duration(elapsed) # Generate report report = metrics_collector.compute_report( 
test_name="test_memory_stability_under_load", - scale=scale, - duration_minutes=duration_minutes, + scale=NUM_CLIENTS, ) register_test_report(report) @@ -154,8 +156,8 @@ async def client_task(client_id: int) -> tuple[int, str | None]: # Original assertions completed = metrics_collector.successful_connections assert ( - completed >= scale * 0.9 - ), f"Too many failed connections: {completed}/{scale} completed" + completed >= NUM_CLIENTS * 0.9 + ), f"Too many failed connections: {completed}/{NUM_CLIENTS} completed" if report.memory: assert report.memory.growth_mb < 50, ( @@ -172,8 +174,6 @@ async def client_task(client_id: int) -> tuple[int, str | None]: @pytest.mark.loadtest async def test_memory_returns_to_baseline_after_disconnect( sse_server_url: str, - scale: int, - duration_minutes: int, metrics_collector: MetricsCollector, baseline_manager: BaselineManager, report_generator: ReportGenerator, @@ -200,7 +200,7 @@ async def test_memory_returns_to_baseline_after_disconnect( ## Methodology 1. Record baseline memory - 2. Connect clients in batches, each receiving 50 events then disconnecting + 2. Connect clients in batches, each receiving EVENTS_PER_CLIENT events then disconnecting 3. Wait 2 seconds for cleanup (GC, finalizers) 4. Record final memory and compare to baseline @@ -210,6 +210,10 @@ async def test_memory_returns_to_baseline_after_disconnect( immediately. 20% margin accounts for fragmentation and GC timing while still catching significant retention issues. 
""" + # Test parameters + NUM_CLIENTS = 100 + EVENTS_PER_CLIENT = 50 + BATCH_SIZE = 100 async def client_task(client_id: int) -> tuple[int, str | None]: """Client that connects, receives few events, then disconnects.""" @@ -221,7 +225,7 @@ async def client_task(client_id: int) -> tuple[int, str | None]: count = 0 async for _ in source.aiter_sse(): count += 1 - if count >= 50: + if count >= EVENTS_PER_CLIENT: break return count, None except Exception as e: @@ -233,10 +237,12 @@ async def client_task(client_id: int) -> tuple[int, str | None]: baseline_memory = baseline["memory_rss_mb"] metrics_collector.set_memory_baseline(baseline_memory) + start_time = time.perf_counter() + # Connect and disconnect clients in batches - batch_size = min(100, scale) - for batch_start in range(0, scale, batch_size): - batch_end = min(batch_start + batch_size, scale) + batch_size = min(BATCH_SIZE, NUM_CLIENTS) + for batch_start in range(0, NUM_CLIENTS, batch_size): + batch_end = min(batch_start + batch_size, NUM_CLIENTS) tasks = [ asyncio.create_task(client_task(i)) for i in range(batch_start, batch_end) ] @@ -264,17 +270,19 @@ async def client_task(client_id: int) -> tuple[int, str | None]: # Wait for cleanup await asyncio.sleep(2) + elapsed = time.perf_counter() - start_time + # Check memory returned to near baseline async with httpx.AsyncClient() as client: final = (await client.get(f"{sse_server_url}/metrics")).json() final_memory = final["memory_rss_mb"] metrics_collector.set_memory_final(final_memory) + metrics_collector.set_duration(elapsed) # Generate report report = metrics_collector.compute_report( test_name="test_memory_returns_to_baseline_after_disconnect", - scale=scale, - duration_minutes=duration_minutes, + scale=NUM_CLIENTS, ) register_test_report(report) @@ -303,8 +311,6 @@ async def client_task(client_id: int) -> tuple[int, str | None]: @pytest.mark.loadtest async def test_event_set_cleanup( sse_server_url: str, - scale: int, - duration_minutes: int, 
metrics_collector: MetricsCollector, baseline_manager: BaselineManager, report_generator: ReportGenerator, @@ -332,18 +338,21 @@ async def test_event_set_cleanup( ## Methodology 1. Record baseline `registered_events` count - 2. Connect `scale` clients, wait for connections to establish - 3. Record peak `registered_events` (should be >= scale * 0.2) + 2. Connect NUM_CLIENTS clients, wait for connections to establish + 3. Record peak `registered_events` (should be >= NUM_CLIENTS * 0.2) 4. Wait for all connections to close + 2s cleanup 5. Record final `registered_events` (should return near baseline) ## Pass Criteria - - Peak events >= scale * 0.2 (events were registered) + - Peak events >= NUM_CLIENTS * 0.2 (events were registered) - Final events <= baseline + 10 (events were cleaned up) - Rationale: We expect most (not all) connections to register events. After cleanup, the set should be nearly empty. The +10 margin allows for concurrent test interference. """ + # Test parameters + NUM_CLIENTS = 100 + EVENTS_PER_CLIENT = 5 connected = asyncio.Event() connection_count = 0 @@ -356,12 +365,12 @@ async def client_task() -> tuple[int, str | None]: client, "GET", f"{sse_server_url}/sse?delay=0.5" ) as source: connection_count += 1 - if connection_count >= scale * 0.5: + if connection_count >= NUM_CLIENTS * 0.5: connected.set() count = 0 async for _ in source.aiter_sse(): count += 1 - if count >= 5: # Stay connected for ~2.5s + if count >= EVENTS_PER_CLIENT: # Stay connected for ~2.5s break return count, None except Exception as e: @@ -374,8 +383,10 @@ async def client_task() -> tuple[int, str | None]: baseline_memory = baseline["memory_rss_mb"] metrics_collector.set_memory_baseline(baseline_memory) + start_time = time.perf_counter() + # Connect many clients - tasks = [asyncio.create_task(client_task()) for _ in range(scale)] + tasks = [asyncio.create_task(client_task()) for _ in range(NUM_CLIENTS)] # Wait for connections to establish (with timeout) try: @@ -406,11 +417,14 
@@ async def client_task() -> tuple[int, str | None]: await asyncio.sleep(2) # Allow cleanup time + elapsed = time.perf_counter() - start_time + # Check events cleaned up async with httpx.AsyncClient() as client: final = (await client.get(f"{sse_server_url}/metrics")).json() final_events = final["registered_events"] metrics_collector.set_memory_final(final["memory_rss_mb"]) + metrics_collector.set_duration(elapsed) # Record SSE internals metrics_collector.set_sse_internals( @@ -422,8 +436,7 @@ async def client_task() -> tuple[int, str | None]: # Generate report report = metrics_collector.compute_report( test_name="test_event_set_cleanup", - scale=scale, - duration_minutes=duration_minutes, + scale=NUM_CLIENTS, ) register_test_report(report) @@ -442,8 +455,8 @@ async def client_task() -> tuple[int, str | None]: pytest.fail(f"Regression detected: {comparison.regression_reasons}") # Original assertions - assert peak_events >= scale * 0.2, ( - f"Expected at least {scale * 0.2} events registered during peak, " + assert peak_events >= NUM_CLIENTS * 0.2, ( + f"Expected at least {NUM_CLIENTS * 0.2} events registered during peak, " f"got {peak_events}" ) assert final_events <= baseline_events + 10, ( diff --git a/tests/load/test_shutdown.py b/tests/load/test_shutdown.py index 406a4a3..172594f 100644 --- a/tests/load/test_shutdown.py +++ b/tests/load/test_shutdown.py @@ -30,8 +30,6 @@ @pytest.mark.loadtest async def test_graceful_shutdown_with_active_connections( docker_available: bool, - scale: int, - duration_minutes: int, metrics_collector: MetricsCollector, baseline_manager: BaselineManager, report_generator: ReportGenerator, @@ -60,7 +58,7 @@ async def test_graceful_shutdown_with_active_connections( ## Methodology 1. Start server in Docker container - 2. Connect `scale` concurrent SSE clients + 2. Connect NUM_CLIENTS concurrent SSE clients 3. Wait for connections to establish (~2s) 4. Send SIGTERM to container 5. 
Measure time until all connections close @@ -72,6 +70,9 @@ async def test_graceful_shutdown_with_active_connections( - Rationale: 90% accounts for race conditions in test timing. 10s is generous but catches hangs. Production should complete in <5s. """ + # Test parameters + NUM_CLIENTS = 100 + if not docker_available: pytest.skip("Docker not available") @@ -114,7 +115,7 @@ async def client_task() -> tuple[str, str | None]: return f"error:{type(e).__name__}", str(e) # Start clients - tasks = [asyncio.create_task(client_task()) for _ in range(scale)] + tasks = [asyncio.create_task(client_task()) for _ in range(NUM_CLIENTS)] # Wait for connections to establish await asyncio.sleep(2) @@ -169,8 +170,7 @@ async def client_task() -> tuple[str, str | None]: # Generate report report = metrics_collector.compute_report( test_name="test_graceful_shutdown_with_active_connections", - scale=scale, - duration_minutes=duration_minutes, + scale=NUM_CLIENTS, ) register_test_report(report) @@ -191,16 +191,14 @@ async def client_task() -> tuple[str, str | None]: # Original assertions total_closed = clean_closes + server_closes + errors assert ( - total_closed >= scale * 0.9 - ), f"Not all connections closed: {total_closed}/{scale}" + total_closed >= NUM_CLIENTS * 0.9 + ), f"Not all connections closed: {total_closed}/{NUM_CLIENTS}" assert shutdown_time < 10, f"Shutdown took {shutdown_time:.1f}s, expected < 10s" @pytest.mark.loadtest async def test_connections_receive_shutdown_signal( docker_available: bool, - scale: int, - duration_minutes: int, metrics_collector: MetricsCollector, baseline_manager: BaselineManager, report_generator: ReportGenerator, @@ -227,7 +225,7 @@ async def test_connections_receive_shutdown_signal( ## Methodology 1. Start server in Docker container - 2. Connect 10 clients to /sse?delay=0.5 (slow stream to keep connections active) + 2. Connect NUM_CLIENTS clients to /sse?delay=0.5 (slow stream to keep connections active) 3. Wait 3s for clients to receive events 4. 
Send SIGTERM 5. Wait for clients to notice stream end @@ -240,6 +238,10 @@ async def test_connections_receive_shutdown_signal( reached 20, they weren't interrupted. This proves the shutdown signal propagated through the watcher to active streams. """ + # Test parameters + NUM_CLIENTS = 10 + MAX_EVENTS_PER_CLIENT = 20 + if not docker_available: pytest.skip("Docker not available") @@ -261,13 +263,13 @@ async def client_task() -> tuple[int, str | None]: ) as source: async for _ in source.aiter_sse(): count += 1 - if count >= 20: # Should not reach this + if count >= MAX_EVENTS_PER_CLIENT: # Should not reach this break return count, None except Exception as e: return count, str(e) - tasks = [asyncio.create_task(client_task()) for _ in range(10)] + tasks = [asyncio.create_task(client_task()) for _ in range(NUM_CLIENTS)] # Let them receive a few events await asyncio.sleep(3) @@ -311,8 +313,7 @@ async def client_task() -> tuple[int, str | None]: # Generate report report = metrics_collector.compute_report( test_name="test_connections_receive_shutdown_signal", - scale=10, # Fixed scale for this test - duration_minutes=duration_minutes, + scale=NUM_CLIENTS, ) register_test_report(report) @@ -333,5 +334,5 @@ async def client_task() -> tuple[int, str | None]: # Original assertions assert total_events > 0, "Clients should have received events before shutdown" assert all( - c < 20 for c in event_counts + c < MAX_EVENTS_PER_CLIENT for c in event_counts ), "Clients should have been interrupted by shutdown" diff --git a/tests/load/test_throughput.py b/tests/load/test_throughput.py index 7f47fbc..d395791 100644 --- a/tests/load/test_throughput.py +++ b/tests/load/test_throughput.py @@ -28,8 +28,6 @@ @pytest.mark.loadtest async def test_throughput_single_client( sse_server_url: str, - scale: int, - duration_minutes: int, metrics_collector: MetricsCollector, baseline_manager: BaselineManager, report_generator: ReportGenerator, @@ -60,9 +58,11 @@ async def 
test_throughput_single_client( scheduling. 1000 events/sec is achievable on any modern system and leaves headroom for real-world latency. """ + # Test parameters + DURATION_SEC = 10 + events_received = 0 start_time = time.perf_counter() - duration_seconds = 10 async with httpx.AsyncClient(timeout=60.0) as client: async with aconnect_sse( @@ -70,7 +70,7 @@ async def test_throughput_single_client( ) as source: async for _ in source.aiter_sse(): events_received += 1 - if time.perf_counter() - start_time >= duration_seconds: + if time.perf_counter() - start_time >= DURATION_SEC: break elapsed = time.perf_counter() - start_time @@ -84,8 +84,7 @@ async def test_throughput_single_client( # Generate report report = metrics_collector.compute_report( test_name="test_throughput_single_client", - scale=1, # Single client test - duration_minutes=duration_minutes, + scale=1, ) register_test_report(report) @@ -113,8 +112,6 @@ async def test_throughput_single_client( @pytest.mark.loadtest async def test_throughput_multiple_clients( sse_server_url: str, - scale: int, - duration_minutes: int, metrics_collector: MetricsCollector, baseline_manager: BaselineManager, report_generator: ReportGenerator, @@ -137,18 +134,19 @@ async def test_throughput_multiple_clients( - Event loop blocking under concurrent I/O ## Methodology - 1. Launch `scale` concurrent client tasks (default 100) + 1. Launch NUM_CLIENTS concurrent client tasks 2. Each client connects to /sse?delay=0.001 (1ms between events) - 3. Run for 30 seconds, counting events per client + 3. Run for DURATION_SEC seconds, counting events per client 4. Sum total events and calculate aggregate throughput ## Pass Criteria - - Aggregate throughput >= min(10000, scale * 100) events/sec + - Aggregate throughput >= min(10000, NUM_CLIENTS * 100) events/sec - Rationale: With 1ms delay, each client should receive ~1000 events/sec. - With 100 clients, expect ~100K events/sec total. The min() handles - smaller scale values gracefully. 
+ With 100 clients, expect ~100K events/sec total. """ - duration_seconds = 30 + # Test parameters + NUM_CLIENTS = 100 + DURATION_SEC = 30 async def client_task() -> tuple[int, str | None]: """Run client and return (event_count, error_or_none).""" @@ -161,14 +159,14 @@ async def client_task() -> tuple[int, str | None]: ) as source: async for _ in source.aiter_sse(): count += 1 - if time.perf_counter() - start >= duration_seconds: + if time.perf_counter() - start >= DURATION_SEC: break return count, None except Exception as e: return count, str(e) start_time = time.perf_counter() - tasks = [asyncio.create_task(client_task()) for _ in range(scale)] + tasks = [asyncio.create_task(client_task()) for _ in range(NUM_CLIENTS)] results = await asyncio.gather(*tasks, return_exceptions=True) elapsed = time.perf_counter() - start_time @@ -191,8 +189,7 @@ async def client_task() -> tuple[int, str | None]: # Generate report report = metrics_collector.compute_report( test_name="test_throughput_multiple_clients", - scale=scale, - duration_minutes=duration_minutes, + scale=NUM_CLIENTS, ) register_test_report(report) @@ -212,9 +209,9 @@ async def client_task() -> tuple[int, str | None]: # Original assertion aggregate_throughput = total_events / elapsed - min_expected = min(10000, scale * 100) + min_expected = min(10000, NUM_CLIENTS * 100) assert aggregate_throughput >= min_expected, ( - f"Aggregate throughput {aggregate_throughput:.0f} events/sec with {scale} " + f"Aggregate throughput {aggregate_throughput:.0f} events/sec with {NUM_CLIENTS} " f"clients, expected >= {min_expected}" ) @@ -222,8 +219,6 @@ async def client_task() -> tuple[int, str | None]: @pytest.mark.loadtest async def test_first_event_latency( sse_server_url: str, - scale: int, - duration_minutes: int, metrics_collector: MetricsCollector, baseline_manager: BaselineManager, report_generator: ReportGenerator, @@ -249,7 +244,7 @@ async def test_first_event_latency( - Inefficient task group initialization ## 
Methodology - 1. Launch `scale` concurrent connection attempts simultaneously + 1. Launch NUM_CLIENTS concurrent connection attempts simultaneously 2. Each client measures time from connect() to first SSE event 3. Collect latency samples and compute percentiles @@ -257,8 +252,9 @@ async def test_first_event_latency( - p50 < 1250ms, p99 < 2500ms - Calibrated from measured p50=932ms, p99=1779ms at scale=100 - Threshold factor: 1.3x measured values - """ + # Test parameters + NUM_CLIENTS = 100 async def measure_ttfe() -> tuple[float, str | None]: start = time.perf_counter() @@ -274,7 +270,7 @@ async def measure_ttfe() -> tuple[float, str | None]: return -1, "no events received" start_time = time.perf_counter() - tasks = [asyncio.create_task(measure_ttfe()) for _ in range(scale)] + tasks = [asyncio.create_task(measure_ttfe()) for _ in range(NUM_CLIENTS)] results = await asyncio.gather(*tasks) elapsed = time.perf_counter() - start_time @@ -293,8 +289,7 @@ async def measure_ttfe() -> tuple[float, str | None]: # Generate report report = metrics_collector.compute_report( test_name="test_first_event_latency", - scale=scale, - duration_minutes=duration_minutes, + scale=NUM_CLIENTS, ) register_test_report(report) @@ -313,8 +308,8 @@ async def measure_ttfe() -> tuple[float, str | None]: pytest.fail(f"Regression detected: {comparison.regression_reasons}") # Original assertions - if len(latencies) < scale * 0.9: - pytest.fail(f"Too many failed connections: {len(latencies)}/{scale}") + if len(latencies) < NUM_CLIENTS * 0.9: + pytest.fail(f"Too many failed connections: {len(latencies)}/{NUM_CLIENTS}") latencies.sort() p50 = latencies[len(latencies) // 2] @@ -327,8 +322,6 @@ async def measure_ttfe() -> tuple[float, str | None]: @pytest.mark.loadtest async def test_event_latency_under_load( sse_server_url: str, - scale: int, - duration_minutes: int, metrics_collector: MetricsCollector, baseline_manager: BaselineManager, report_generator: ReportGenerator, @@ -354,8 +347,8 @@ async 
def test_event_latency_under_load( tail latency issues that degrade user experience. ## Methodology - 1. Launch `scale` concurrent clients to /sse?delay=0.01 (10ms between events) - 2. Each client receives 100 events and records inter-event times + 1. Launch NUM_CLIENTS concurrent clients to /sse?delay=0.01 (10ms between events) + 2. Each client receives EVENTS_PER_CLIENT events and records inter-event times 3. Aggregate all latency samples and compute percentiles ## Pass Criteria @@ -363,6 +356,9 @@ async def test_event_latency_under_load( - Calibrated from measured p50=14.8ms, p95=21.4ms, p99=27.4ms at scale=100 - Server delay: 10ms. Threshold factor: 1.3x measured values """ + # Test parameters + NUM_CLIENTS = 100 + EVENTS_PER_CLIENT = 100 async def measure_latencies() -> tuple[list[float], str | None]: latencies: list[float] = [] @@ -379,14 +375,14 @@ async def measure_latencies() -> tuple[list[float], str | None]: latencies.append((now - last_time) * 1000) last_time = now count += 1 - if count >= 100: + if count >= EVENTS_PER_CLIENT: break return latencies, None except Exception as e: return latencies, str(e) start_time = time.perf_counter() - tasks = [asyncio.create_task(measure_latencies()) for _ in range(scale)] + tasks = [asyncio.create_task(measure_latencies()) for _ in range(NUM_CLIENTS)] results = await asyncio.gather(*tasks) elapsed = time.perf_counter() - start_time @@ -406,8 +402,7 @@ async def measure_latencies() -> tuple[list[float], str | None]: # Generate report report = metrics_collector.compute_report( test_name="test_event_latency_under_load", - scale=scale, - duration_minutes=duration_minutes, + scale=NUM_CLIENTS, ) register_test_report(report) diff --git a/tests/load/test_watcher_scale.py b/tests/load/test_watcher_scale.py index 67cc28f..e71f9cd 100644 --- a/tests/load/test_watcher_scale.py +++ b/tests/load/test_watcher_scale.py @@ -15,6 +15,7 @@ from __future__ import annotations import asyncio +import time import httpx import pytest @@ 
-29,8 +30,6 @@ @pytest.mark.loadtest async def test_single_watcher_with_many_connections( sse_server_url: str, - scale: int, - duration_minutes: int, metrics_collector: MetricsCollector, baseline_manager: BaselineManager, report_generator: ReportGenerator, @@ -42,7 +41,7 @@ async def test_single_watcher_with_many_connections( ## What is Measured - `watcher_started` flag from /metrics (True = watcher exists) - - `registered_events` count (should be >= scale * 0.5) + - `registered_events` count (should be >= NUM_CLIENTS * 0.5) - Implicit: CPU usage would spike if multiple watchers existed (not measured) ## Why This Matters @@ -57,18 +56,21 @@ async def test_single_watcher_with_many_connections( - Constant CPU overhead regardless of connection count ## Methodology - 1. Connect `scale` concurrent clients (default 100) + 1. Connect NUM_CLIENTS concurrent clients 2. Wait for connections to establish (~2s) 3. Query /metrics for watcher_started and registered_events 4. Cancel all connections ## Pass Criteria - watcher_started = True (watcher exists for active connections) - - registered_events >= scale * 0.5 (most connections registered) + - registered_events >= NUM_CLIENTS * 0.5 (most connections registered) - Rationale: watcher_started=True confirms the mechanism works. Event count verifies registration worked. We don't directly measure watcher count, but CPU metrics in CI would catch proliferation. 
""" + # Test parameters + NUM_CLIENTS = 100 + HOLD_DURATION_SEC = 5 async def client_task() -> tuple[int, str | None]: try: @@ -77,14 +79,16 @@ async def client_task() -> tuple[int, str | None]: client, "GET", f"{sse_server_url}/sse?delay=0.1" ) as source: async for _ in source.aiter_sse(): - await asyncio.sleep(5) # Stay connected + await asyncio.sleep(HOLD_DURATION_SEC) # Stay connected break return 1, None except Exception as e: return 0, str(e) + start_time = time.perf_counter() + # Start many connections - tasks = [asyncio.create_task(client_task()) for _ in range(scale)] + tasks = [asyncio.create_task(client_task()) for _ in range(NUM_CLIENTS)] # Wait for connections to establish await asyncio.sleep(2) @@ -102,6 +106,9 @@ async def client_task() -> tuple[int, str | None]: task.cancel() results = await asyncio.gather(*tasks, return_exceptions=True) + elapsed = time.perf_counter() - start_time + metrics_collector.set_duration(elapsed) + # Process results for result in results: if isinstance(result, Exception): @@ -123,8 +130,7 @@ async def client_task() -> tuple[int, str | None]: # Generate report report = metrics_collector.compute_report( test_name="test_single_watcher_with_many_connections", - scale=scale, - duration_minutes=duration_minutes, + scale=NUM_CLIENTS, ) register_test_report(report) @@ -145,15 +151,13 @@ async def client_task() -> tuple[int, str | None]: # Original assertions assert watcher_started is True, "Watcher should be started with active connections" assert ( - registered_events >= scale * 0.5 - ), f"Expected at least {scale * 0.5} events, got {registered_events}" + registered_events >= NUM_CLIENTS * 0.5 + ), f"Expected at least {NUM_CLIENTS * 0.5} events, got {registered_events}" @pytest.mark.loadtest async def test_rapid_connect_disconnect_watcher_stability( sse_server_url: str, - scale: int, - duration_minutes: int, metrics_collector: MetricsCollector, baseline_manager: BaselineManager, report_generator: ReportGenerator, @@ -178,7 +182,7 
@@ async def test_rapid_connect_disconnect_watcher_stability( connections closed quickly, watchers accumulated and never stopped. ## Methodology - 1. Run `scale / 10` batches of 10 quick connections each + 1. Run NUM_BATCHES batches of BATCH_SIZE quick connections each 2. Each connection receives 1 event and disconnects immediately 3. After all batches, check thread count and watcher status @@ -188,6 +192,9 @@ async def test_rapid_connect_disconnect_watcher_stability( accumulated, we'd see hundreds of threads (one per watcher task). 50 provides margin for legitimate worker threads. """ + # Test parameters + NUM_BATCHES = 10 + BATCH_SIZE = 10 async def quick_connect() -> tuple[int, str | None]: try: @@ -201,9 +208,11 @@ async def quick_connect() -> tuple[int, str | None]: except Exception as e: return 0, str(e) + start_time = time.perf_counter() + # Rapid connect/disconnect cycles - for batch in range(scale // 10): - tasks = [asyncio.create_task(quick_connect()) for _ in range(10)] + for _ in range(NUM_BATCHES): + tasks = [asyncio.create_task(quick_connect()) for _ in range(BATCH_SIZE)] results = await asyncio.gather(*tasks, return_exceptions=True) for result in results: @@ -220,6 +229,9 @@ async def quick_connect() -> tuple[int, str | None]: # Brief pause await asyncio.sleep(0.5) + elapsed = time.perf_counter() - start_time + metrics_collector.set_duration(elapsed) + # Check metrics - watcher should still be singular async with httpx.AsyncClient() as client: metrics = (await client.get(f"{sse_server_url}/metrics")).json() @@ -237,8 +249,7 @@ async def quick_connect() -> tuple[int, str | None]: # Generate report report = metrics_collector.compute_report( test_name="test_rapid_connect_disconnect_watcher_stability", - scale=scale, - duration_minutes=duration_minutes, + scale=NUM_BATCHES * BATCH_SIZE, ) register_test_report(report) @@ -263,8 +274,6 @@ async def quick_connect() -> tuple[int, str | None]: @pytest.mark.loadtest async def 
test_watcher_cleanup_allows_restart( sse_server_url: str, - scale: int, - duration_minutes: int, metrics_collector: MetricsCollector, baseline_manager: BaselineManager, report_generator: ReportGenerator, @@ -290,10 +299,10 @@ async def test_watcher_cleanup_allows_restart( connections won't receive shutdown signals, causing graceful shutdown to fail. ## Methodology - 1. Phase 1: Connect 50 clients, each receives 20 events, then disconnects + 1. Phase 1: Connect CLIENTS_PER_PHASE clients, each receives EVENTS_PER_CLIENT events, then disconnects 2. Wait 1s for cleanup 3. Check registered_events (should be near 0) - 4. Phase 2: Connect 50 new clients, each receives 20 events + 4. Phase 2: Connect CLIENTS_PER_PHASE new clients, each receives EVENTS_PER_CLIENT events 5. Wait 1s for cleanup 6. Verify final state matches Phase 1 post-cleanup @@ -304,6 +313,9 @@ async def test_watcher_cleanup_allows_restart( - Rationale: If watcher didn't restart in Phase 2, no events would be delivered. The +5 margin allows for concurrent test interference. 
""" + # Test parameters + CLIENTS_PER_PHASE = 50 + EVENTS_PER_CLIENT = 20 async def connect_and_consume(n_events: int) -> tuple[int, str | None]: count = 0 @@ -320,8 +332,13 @@ async def connect_and_consume(n_events: int) -> tuple[int, str | None]: except Exception as e: return count, str(e) + start_time = time.perf_counter() + # Phase 1: Connect, consume, disconnect - tasks = [asyncio.create_task(connect_and_consume(20)) for _ in range(50)] + tasks = [ + asyncio.create_task(connect_and_consume(EVENTS_PER_CLIENT)) + for _ in range(CLIENTS_PER_PHASE) + ] results = await asyncio.gather(*tasks) phase1_events = 0 @@ -345,7 +362,10 @@ async def connect_and_consume(n_events: int) -> tuple[int, str | None]: metrics_collector.add_memory_sample(metrics1["memory_rss_mb"]) # Phase 2: New connections should work - tasks = [asyncio.create_task(connect_and_consume(20)) for _ in range(50)] + tasks = [ + asyncio.create_task(connect_and_consume(EVENTS_PER_CLIENT)) + for _ in range(CLIENTS_PER_PHASE) + ] results = await asyncio.gather(*tasks) phase2_events = 0 @@ -362,6 +382,9 @@ async def connect_and_consume(n_events: int) -> tuple[int, str | None]: # Wait for cleanup await asyncio.sleep(1) + elapsed = time.perf_counter() - start_time + metrics_collector.set_duration(elapsed) + # Verify clean state async with httpx.AsyncClient() as client: metrics2 = (await client.get(f"{sse_server_url}/metrics")).json() @@ -379,8 +402,7 @@ async def connect_and_consume(n_events: int) -> tuple[int, str | None]: # Generate report report = metrics_collector.compute_report( test_name="test_watcher_cleanup_allows_restart", - scale=scale, - duration_minutes=duration_minutes, + scale=CLIENTS_PER_PHASE * 2, ) register_test_report(report)