From 63f7cb7f69397e7c9ccd7d1af4a1120e86d4ec85 Mon Sep 17 00:00:00 2001 From: sysid Date: Thu, 1 Jan 2026 19:29:21 +0100 Subject: [PATCH 1/4] chore: update gitignore --- .workmux.yaml | 139 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 139 insertions(+) create mode 100644 .workmux.yaml diff --git a/.workmux.yaml b/.workmux.yaml new file mode 100644 index 0000000..ee95fe1 --- /dev/null +++ b/.workmux.yaml @@ -0,0 +1,139 @@ +# workmux project configuration +# For global settings, edit ~/.config/workmux/config.yaml +# All options below are commented out - uncomment to override defaults. + +#------------------------------------------------------------------------------- +# Git +#------------------------------------------------------------------------------- + +# The primary branch to merge into. +# Default: Auto-detected from remote HEAD, falls back to main/master. +# main_branch: main + +# Default merge strategy for `workmux merge`. +# Options: merge (default), rebase, squash +# CLI flags (--rebase, --squash) always override this. +# merge_strategy: rebase + +#------------------------------------------------------------------------------- +# Naming & Paths +#------------------------------------------------------------------------------- + +# Directory where worktrees are created. +# Can be relative to repo root or absolute. +# Default: Sibling directory '__worktrees'. +# worktree_dir: .worktrees + +# Strategy for deriving names from branch names. +# Options: full (default), basename (part after last '/'). +# worktree_naming: basename + +# Prefix added to worktree directories and tmux window names. +# worktree_prefix: "" + +# Prefix for tmux window names. +# Default: "wm-" +# window_prefix: "wm-" + +#------------------------------------------------------------------------------- +# Tmux +#------------------------------------------------------------------------------- + +# Custom tmux pane layout. 
+# Default: Two-pane layout with shell and clear command. +# panes: +# - command: pnpm install +# focus: true +# - split: horizontal +# - command: clear +# split: vertical +# size: 5 + +# Auto-apply agent status icons to tmux window format. +# Default: true +# status_format: true + +# Custom icons for agent status display. +status_icons: + working: "🤖" + waiting: "💬" + done: "✅" + +#------------------------------------------------------------------------------- +# Agent & AI +#------------------------------------------------------------------------------- + +# Agent command for '' placeholder in pane commands. +# Default: "claude" +# agent: claude + +# LLM-based branch name generation (`workmux add -a`). +# auto_name: +# model: "gpt-4o-mini" +# system_prompt: "Generate a kebab-case git branch name." + +#------------------------------------------------------------------------------- +# Hooks +#------------------------------------------------------------------------------- + +# Commands to run in new worktree before tmux window opens. +# These block window creation - use for short tasks only. +# Use "" to inherit from global config. +# Set to empty list to disable: `post_create: []` +# post_create: +# - "" +# - mise use + + # GOTCHA: copies .envrc target, not link +post_create: + - ln -s $SOPS_PATH/dot.envrc .envrc + - direnv allow + +# Commands to run before merging (e.g., linting, tests). +# Aborts the merge if any command fails. +# Use "" to inherit from global config. +# Environment variables available: +# - WM_BRANCH_NAME: The name of the branch being merged +# - WM_TARGET_BRANCH: The name of the target branch (e.g., main) +# - WM_WORKTREE_PATH: Absolute path to the worktree +# - WM_PROJECT_ROOT: Absolute path of the main project directory +# - WM_HANDLE: The worktree handle/window name +# pre_merge: +# - "" +# - cargo test +# - cargo clippy -- -D warnings + +# Commands to run before worktree removal (during merge or remove). 
+# Useful for backing up gitignored files before cleanup. +# Default: Auto-detects Node.js projects and fast-deletes node_modules. +# Set to empty list to disable: `pre_remove: []` +# Environment variables available: +# - WM_HANDLE: The worktree handle (directory name) +# - WM_WORKTREE_PATH: Absolute path of the worktree being deleted +# - WM_PROJECT_ROOT: Absolute path of the main project directory +# pre_remove: +# - mkdir -p "$WM_PROJECT_ROOT/artifacts/$WM_HANDLE" +# - cp -r test-results/ "$WM_PROJECT_ROOT/artifacts/$WM_HANDLE/" + +#------------------------------------------------------------------------------- +# Files +#------------------------------------------------------------------------------- + +# File operations when creating a worktree. +# files: +# # Files to copy (useful for .env files that need to be unique). +# copy: +# - .env.local +# +# # Files/directories to symlink (saves disk space, shares caches). +# # Default: None. +# # Use "" to inherit from global config. +# symlink: +# - "" +# - node_modules +files: + symlink: + - .venv + - .claude + - CLAUDE.md + - thoughts From 105fee90977578dd5a2a23eeecb5c903b4d71952 Mon Sep 17 00:00:00 2001 From: sysid Date: Wed, 31 Dec 2025 11:39:24 +0100 Subject: [PATCH 2/4] feat: add comprehensive load testing infrastructure Add exhaustive load testing to detect memory leaks, watcher deduplication at scale, and prevent performance regressions. 
Test coverage (15 tests): - Memory stability: leak detection, baseline return, event set cleanup - Watcher scale: single watcher verification, rapid connect/disconnect - Throughput: single/multi-client, TTFE, inter-event latency - Shutdown: graceful termination with active connections - Backpressure: slow client isolation, connection churn, send_timeout Infrastructure: - Docker-based test server with /metrics endpoint (psutil) - testcontainers fixtures with health check wait strategies - httpx-sse + asyncio.gather() for concurrent SSE clients - Manual GitHub Actions workflow (workflow_dispatch) New dependencies in dev group: httpx-sse New Makefile target: test-load --- .github/workflows/load-test.yml | 76 ++++++ Makefile | 13 +- ...{issue77.py => issue77_lock_contention.py} | 0 pyproject.toml | 8 +- tests/Dockerfile.loadtest | 32 +++ tests/load/__init__.py | 1 + tests/load/conftest.py | 128 ++++++++++ tests/load/server_app.py | 107 ++++++++ tests/load/test_backpressure.py | 215 ++++++++++++++++ tests/load/test_memory_stability.py | 235 ++++++++++++++++++ tests/load/test_shutdown.py | 182 ++++++++++++++ tests/load/test_throughput.py | 185 ++++++++++++++ tests/load/test_watcher_scale.py | 159 ++++++++++++ uv.lock | 19 +- 14 files changed, 1357 insertions(+), 3 deletions(-) create mode 100644 .github/workflows/load-test.yml rename examples/issues/{issue77.py => issue77_lock_contention.py} (100%) create mode 100644 tests/Dockerfile.loadtest create mode 100644 tests/load/__init__.py create mode 100644 tests/load/conftest.py create mode 100644 tests/load/server_app.py create mode 100644 tests/load/test_backpressure.py create mode 100644 tests/load/test_memory_stability.py create mode 100644 tests/load/test_shutdown.py create mode 100644 tests/load/test_throughput.py create mode 100644 tests/load/test_watcher_scale.py diff --git a/.github/workflows/load-test.yml b/.github/workflows/load-test.yml new file mode 100644 index 0000000..59eeb79 --- /dev/null +++ 
b/.github/workflows/load-test.yml @@ -0,0 +1,76 @@ +name: Load Tests + +on: + workflow_dispatch: + inputs: + scale: + description: 'Number of concurrent connections' + required: true + default: '100' + type: choice + options: + - '100' + - '500' + - '1000' + duration: + description: 'Test duration in minutes' + required: true + default: '1' + type: choice + options: + - '1' + - '5' + - '10' + +jobs: + load-test: + runs-on: ubuntu-latest + timeout-minutes: 60 + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.12' + + - name: Install uv + uses: astral-sh/setup-uv@v4 + + - name: Install dependencies + run: | + uv pip install --system -e ".[loadtest]" + + - name: Build load test Docker image + run: | + docker build -f tests/Dockerfile.loadtest -t sse-starlette-loadtest:latest . + + - name: Run load tests + run: | + python -m pytest tests/load/ -m "loadtest" \ + --scale=${{ inputs.scale }} \ + --duration=${{ inputs.duration }} \ + -v --tb=short \ + --junitxml=load-test-results.xml + + - name: Upload test results + uses: actions/upload-artifact@v4 + if: always() + with: + name: load-test-results + path: | + load-test-results.xml + retention-days: 30 + + - name: Test Summary + if: always() + run: | + echo "## Load Test Results" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "- **Scale**: ${{ inputs.scale }} concurrent connections" >> $GITHUB_STEP_SUMMARY + echo "- **Duration**: ${{ inputs.duration }} minutes" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + if [ -f load-test-results.xml ]; then + echo "Test results uploaded as artifact." 
>> $GITHUB_STEP_SUMMARY + fi diff --git a/Makefile b/Makefile index fe0c664..8d8e4f4 100644 --- a/Makefile +++ b/Makefile @@ -89,7 +89,7 @@ test: test-unit test-docker ## run tests .PHONY: test-unit test-unit: ## run all tests except "integration" marked - RUN_ENV=local python -m pytest -m "not (integration or experimentation)" --cov-config=pyproject.toml --cov-report=html --cov-report=term --cov=$(pkg_src) tests + RUN_ENV=local python -m pytest -m "not (integration or experimentation or loadtest)" --cov-config=pyproject.toml --cov-report=html --cov-report=term --cov=$(pkg_src) tests .PHONY: test-docker test-docker: ## test-docker (docker desktop: advanced settings) @@ -100,6 +100,17 @@ test-docker: ## test-docker (docker desktop: advanced settings) echo "Skipping tests: /var/run/docker.sock does not exist."; \ fi +.PHONY: test-load +test-load: ## run load tests (requires docker, make test-load PYTEST_ARGS="--scale=500 --duration=5") + @if [ -S /var/run/docker.sock > /dev/null 2>&1 ]; then \ + echo "Building load test image..."; \ + docker build -f tests/Dockerfile.loadtest -t sse-starlette-loadtest:latest .; \ + echo "Running load tests..."; \ + RUN_ENV=local python -m pytest -m "loadtest" tests/load/ -v --tb=short $(PYTEST_ARGS); \ + else \ + echo "Skipping load tests: /var/run/docker.sock does not exist."; \ + fi + ################################################################################ # Code Quality \ diff --git a/examples/issues/issue77.py b/examples/issues/issue77_lock_contention.py similarity index 100% rename from examples/issues/issue77.py rename to examples/issues/issue77_lock_contention.py diff --git a/pyproject.toml b/pyproject.toml index 998361d..ca9f2a1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -42,12 +42,17 @@ granian = [ daphne = [ "daphne>=4.2.0", ] +loadtest = [ + "httpx-sse>=0.4.0", + "psutil>=6.1.1", +] [dependency-groups] # new standard, included by default dev = [ "asgi-lifespan>=2.1.0", "async-timeout>=5.0.1", 
"httpx>=0.28.1", + "httpx-sse>=0.4.0", "mypy>=1.14.0", "portend>=3.2.0", "psutil>=6.1.1", @@ -102,7 +107,8 @@ filename = "sse_starlette/__init__.py" [tool.pytest.ini_options] markers = [ "integration: marks tests as integration tests", - "experimentation: marks tests as experimental tests, not to be run in CICD" + "experimentation: marks tests as experimental tests, not to be run in CICD", + "loadtest: marks tests as load tests (require docker and significant resources)" ] asyncio_mode = "auto" asyncio_default_fixture_loop_scope = "function" diff --git a/tests/Dockerfile.loadtest b/tests/Dockerfile.loadtest new file mode 100644 index 0000000..3c3e6dd --- /dev/null +++ b/tests/Dockerfile.loadtest @@ -0,0 +1,32 @@ +# Load test server image for sse-starlette +FROM python:3.12-slim + +WORKDIR /app + +# Install build dependencies and cleanup in one layer +RUN apt-get update && apt-get install -y \ + build-essential \ + && rm -rf /var/lib/apt/lists/* + +# Copy package files +COPY pyproject.toml ./ +COPY README.md ./ +COPY sse_starlette ./sse_starlette + +# Install package with loadtest dependencies +RUN pip install --no-cache-dir -e ".[loadtest]" + +# Install uvicorn for serving +RUN pip install --no-cache-dir uvicorn + +# Copy load test server app +COPY tests/load/server_app.py ./server_app.py + +# Expose port +EXPOSE 8000 + +# Set Python path +ENV PYTHONPATH=/app + +# Default command - run the load test server +CMD ["uvicorn", "server_app:app", "--host", "0.0.0.0", "--port", "8000", "--workers", "1"] diff --git a/tests/load/__init__.py b/tests/load/__init__.py new file mode 100644 index 0000000..758eb8e --- /dev/null +++ b/tests/load/__init__.py @@ -0,0 +1 @@ +# Load testing module for sse-starlette diff --git a/tests/load/conftest.py b/tests/load/conftest.py new file mode 100644 index 0000000..ec73419 --- /dev/null +++ b/tests/load/conftest.py @@ -0,0 +1,128 @@ +""" +Pytest fixtures for load testing. + +Provides container-based SSE server and utility fixtures. 
+""" + +import os +import time +from typing import Generator + +import httpx +import pytest +from testcontainers.core.container import DockerContainer + + +class SSELoadTestContainer(DockerContainer): + """Custom container for SSE load testing.""" + + def __init__(self, image: str = "sse-starlette-loadtest:latest"): + super().__init__(image) + self.with_exposed_ports(8000) + + def get_base_url(self) -> str: + """Get the base URL for the SSE server.""" + host = self.get_container_host_ip() + port = self.get_exposed_port(8000) + return f"http://{host}:{port}" + + +def _wait_for_port(container: DockerContainer, port: int, timeout: float = 30) -> str: + """Wait for port mapping to be available and return base URL.""" + start = time.time() + while time.time() - start < timeout: + try: + host = container.get_container_host_ip() + mapped_port = container.get_exposed_port(port) + return f"http://{host}:{mapped_port}" + except ConnectionError: + time.sleep(0.5) + raise TimeoutError(f"Port {port} not available after {timeout}s") + + +def _wait_for_health(base_url: str, timeout: float = 30) -> None: + """Wait for server health endpoint to respond.""" + start = time.time() + while time.time() - start < timeout: + try: + resp = httpx.get(f"{base_url}/health", timeout=2.0) + if resp.status_code == 200: + return + except httpx.RequestError: + pass + time.sleep(0.5) + raise TimeoutError(f"Server at {base_url} not ready after {timeout}s") + + +@pytest.fixture(scope="module") +def docker_available() -> bool: + """Check if Docker is available.""" + return os.path.exists("/var/run/docker.sock") + + +@pytest.fixture(scope="module") +def sse_container( + docker_available: bool, +) -> Generator[SSELoadTestContainer, None, None]: + """Start SSE server in Docker container for load testing.""" + if not docker_available: + pytest.skip("Docker not available") + + container = SSELoadTestContainer() + container.start() + + # Wait for port mapping, then health check + base_url = 
_wait_for_port(container, 8000, timeout=30) + _wait_for_health(base_url, timeout=30) + + yield container + + container.stop() + + +@pytest.fixture(scope="module") +def sse_server_url(sse_container: SSELoadTestContainer) -> str: + """Get the base URL for the SSE server.""" + return sse_container.get_base_url() + + +@pytest.fixture +def sync_client() -> Generator[httpx.Client, None, None]: + """Synchronous HTTP client for simple requests.""" + with httpx.Client(timeout=30.0) as client: + yield client + + +@pytest.fixture +async def async_client() -> httpx.AsyncClient: + """Async HTTP client for SSE streaming.""" + async with httpx.AsyncClient(timeout=60.0) as client: + yield client + + +def pytest_addoption(parser: pytest.Parser) -> None: + """Add custom command line options for load tests.""" + parser.addoption( + "--scale", + action="store", + default="100", + help="Number of concurrent connections for load tests", + ) + parser.addoption( + "--duration", + action="store", + default="1", + help="Test duration in minutes", + ) + + +@pytest.fixture +def scale(request: pytest.FixtureRequest) -> int: + """Get the scale (number of connections) for load tests.""" + return int(request.config.getoption("--scale")) + + +@pytest.fixture +def duration_minutes(request: pytest.FixtureRequest) -> int: + """Get the duration in minutes for load tests.""" + return int(request.config.getoption("--duration")) diff --git a/tests/load/server_app.py b/tests/load/server_app.py new file mode 100644 index 0000000..516bc76 --- /dev/null +++ b/tests/load/server_app.py @@ -0,0 +1,107 @@ +""" +Load test SSE server application. + +Provides SSE endpoints and a metrics endpoint for monitoring during load tests. 
+""" + +import asyncio +import os +import time +from typing import AsyncGenerator + +import psutil +from starlette.applications import Starlette +from starlette.requests import Request +from starlette.responses import JSONResponse +from starlette.routing import Route + +from sse_starlette import EventSourceResponse +from sse_starlette.sse import _get_shutdown_state + + +async def metrics(request: Request) -> JSONResponse: + """Expose server metrics for monitoring during load tests.""" + process = psutil.Process(os.getpid()) + memory_info = process.memory_info() + + # Get watcher and event count from thread-local state + shutdown_state = _get_shutdown_state() + + return JSONResponse( + { + "memory_rss_mb": memory_info.rss / 1024 / 1024, + "memory_vms_mb": memory_info.vms / 1024 / 1024, + "num_fds": process.num_fds() if hasattr(process, "num_fds") else -1, + "num_threads": process.num_threads(), + "connections": len(process.connections()), + "cpu_percent": process.cpu_percent(), + "watcher_started": shutdown_state.watcher_started, + "registered_events": len(shutdown_state.events), + "uptime_seconds": time.time() - process.create_time(), + } + ) + + +async def endless_stream(request: Request) -> EventSourceResponse: + """High-frequency event stream for load testing.""" + delay = float(request.query_params.get("delay", "0.01")) # 100 events/sec default + + async def generate() -> AsyncGenerator[dict, None]: + counter = 0 + while True: + if await request.is_disconnected(): + break + yield {"data": f"event-{counter}", "id": str(counter)} + counter += 1 + await asyncio.sleep(delay) + + return EventSourceResponse(generate()) + + +async def finite_stream(request: Request) -> EventSourceResponse: + """Finite event stream for testing completion.""" + count = int(request.query_params.get("count", "100")) + delay = float(request.query_params.get("delay", "0.01")) + + async def generate() -> AsyncGenerator[dict, None]: + for i in range(count): + if await 
request.is_disconnected(): + break + yield {"data": f"event-{i}", "id": str(i)} + await asyncio.sleep(delay) + + return EventSourceResponse(generate()) + + +async def slow_stream(request: Request) -> EventSourceResponse: + """Slow event stream for backpressure testing.""" + delay = float(request.query_params.get("delay", "1.0")) + + async def generate() -> AsyncGenerator[dict, None]: + counter = 0 + while True: + if await request.is_disconnected(): + break + # Generate larger payloads + payload = "x" * 4096 + yield {"data": payload, "id": str(counter)} + counter += 1 + await asyncio.sleep(delay) + + return EventSourceResponse(generate()) + + +async def health(request: Request) -> JSONResponse: + """Health check endpoint.""" + return JSONResponse({"status": "healthy"}) + + +routes = [ + Route("/sse", endless_stream), + Route("/sse/finite", finite_stream), + Route("/sse/slow", slow_stream), + Route("/metrics", metrics), + Route("/health", health), +] + +app = Starlette(routes=routes) diff --git a/tests/load/test_backpressure.py b/tests/load/test_backpressure.py new file mode 100644 index 0000000..94f111b --- /dev/null +++ b/tests/load/test_backpressure.py @@ -0,0 +1,215 @@ +""" +Backpressure and slow client tests. + +Verifies server handles slow consumers correctly without affecting fast clients. +""" + +import asyncio +import time +from typing import Tuple + +import httpx +import pytest +from httpx_sse import aconnect_sse + + +@pytest.mark.loadtest +async def test_slow_clients_dont_block_fast_clients( + sse_server_url: str, +) -> None: + """ + Slow clients should not affect throughput of fast clients. + + Tests that the server properly handles mixed client speeds. 
+ """ + test_duration = 10 # seconds + + async def fast_client() -> int: + """Client that consumes events as fast as possible.""" + count = 0 + start = time.perf_counter() + try: + async with httpx.AsyncClient(timeout=30.0) as client: + async with aconnect_sse( + client, "GET", f"{sse_server_url}/sse?delay=0.01" + ) as source: + async for _ in source.aiter_sse(): + count += 1 + if time.perf_counter() - start >= test_duration: + break + except Exception: + pass + return count + + async def slow_client() -> int: + """Client that reads slowly (simulating processing delay).""" + count = 0 + start = time.perf_counter() + try: + async with httpx.AsyncClient(timeout=60.0) as client: + async with aconnect_sse( + client, "GET", f"{sse_server_url}/sse?delay=0.01" + ) as source: + async for _ in source.aiter_sse(): + await asyncio.sleep(0.5) # Slow processing + count += 1 + if time.perf_counter() - start >= test_duration: + break + except Exception: + pass + return count + + # Mix of fast and slow clients + fast_tasks = [asyncio.create_task(fast_client()) for _ in range(10)] + slow_tasks = [asyncio.create_task(slow_client()) for _ in range(10)] + + fast_results = await asyncio.gather(*fast_tasks) + slow_results = await asyncio.gather(*slow_tasks) + + avg_fast = sum(fast_results) / len(fast_results) + avg_slow = sum(slow_results) / len(slow_results) + + # Fast clients should receive significantly more events + assert avg_fast > avg_slow * 5, ( + f"Fast clients ({avg_fast:.0f} events) should be much faster than " + f"slow clients ({avg_slow:.0f} events)" + ) + + # Fast clients should not be severely throttled + # With 0.01s delay, should get ~1000 events in 10s + assert ( + avg_fast > 500 + ), f"Fast clients throttled: {avg_fast:.0f} events, expected > 500" + + +@pytest.mark.loadtest +async def test_connection_churn_stability( + sse_server_url: str, + scale: int, +) -> None: + """ + Rapid connect/disconnect should not cause resource exhaustion. 
+ + Tests cleanup under high churn rate. + """ + churn_rate = min(100, scale) # connections per second + duration = 30 # seconds + total_connections = churn_rate * duration + + async def quick_connection() -> bool: + try: + async with httpx.AsyncClient(timeout=5.0) as client: + async with aconnect_sse( + client, "GET", f"{sse_server_url}/sse?delay=0" + ) as source: + async for _ in source.aiter_sse(): + return True + except Exception: + return False + return False + + # Get baseline metrics + async with httpx.AsyncClient() as client: + baseline = (await client.get(f"{sse_server_url}/metrics")).json() + + baseline_fds = baseline.get("num_fds", 0) + baseline_memory = baseline["memory_rss_mb"] + + # Create connections at target rate + successful = 0 + for batch in range(duration): + tasks = [asyncio.create_task(quick_connection()) for _ in range(churn_rate)] + results = await asyncio.gather(*tasks, return_exceptions=True) + successful += sum(1 for r in results if r is True) + await asyncio.sleep(0.5) # Allow some cleanup + + # Get final metrics + async with httpx.AsyncClient() as client: + final = (await client.get(f"{sse_server_url}/metrics")).json() + + final_fds = final.get("num_fds", 0) + final_memory = final["memory_rss_mb"] + + # File descriptors should return to baseline + if baseline_fds > 0 and final_fds > 0: + fd_growth = final_fds - baseline_fds + assert fd_growth < 50, ( + f"File descriptor leak: {fd_growth} new FDs after {total_connections} " + f"connections" + ) + + # Memory should not grow excessively + memory_growth = final_memory - baseline_memory + assert ( + memory_growth < 100 + ), f"Memory grew by {memory_growth:.1f}MB during churn test" + + # Success rate should be high + success_rate = successful / total_connections if total_connections > 0 else 0 + assert success_rate > 0.9, ( + f"Low success rate during churn: {success_rate:.1%} " + f"({successful}/{total_connections})" + ) + + +@pytest.mark.loadtest +async def 
test_send_timeout_under_load(sse_server_url: str) -> None: + """ + Verify send_timeout works correctly under load. + + Clients that stop reading should eventually be disconnected. + """ + + async def frozen_client() -> Tuple[str, float]: + """Client that stops reading after first event (simulates frozen client).""" + start = time.perf_counter() + try: + async with httpx.AsyncClient(timeout=120.0) as client: + async with aconnect_sse( + client, "GET", f"{sse_server_url}/sse?delay=0.001" + ) as source: + async for _ in source.aiter_sse(): + # Stop reading but keep connection open + await asyncio.sleep(60) # Will be interrupted by timeout + break + except httpx.ReadTimeout: + return "timeout", time.perf_counter() - start + except Exception as e: + return f"error:{type(e).__name__}", time.perf_counter() - start + return "completed", time.perf_counter() - start + + # Start some frozen clients (server has default send_timeout) + tasks = [asyncio.create_task(frozen_client()) for _ in range(5)] + + # Also verify server remains responsive with normal clients + async def normal_client() -> int: + count = 0 + try: + async with httpx.AsyncClient(timeout=30.0) as client: + async with aconnect_sse( + client, "GET", f"{sse_server_url}/sse?delay=0.1" + ) as source: + async for _ in source.aiter_sse(): + count += 1 + if count >= 50: + break + except Exception: + pass + return count + + normal_tasks = [asyncio.create_task(normal_client()) for _ in range(3)] + + # Wait for normal clients to complete + normal_results = await asyncio.gather(*normal_tasks) + + # Cancel frozen clients if still running + for task in tasks: + if not task.done(): + task.cancel() + + await asyncio.gather(*tasks, return_exceptions=True) + + # Normal clients should have completed successfully + assert all( + r >= 45 for r in normal_results + ), f"Normal clients affected by frozen clients: {normal_results}" diff --git a/tests/load/test_memory_stability.py b/tests/load/test_memory_stability.py new file mode 
100644 index 0000000..ba23eea --- /dev/null +++ b/tests/load/test_memory_stability.py @@ -0,0 +1,235 @@ +""" +Memory stability tests for sse-starlette under load. + +Verifies no memory leaks during sustained SSE streaming with many concurrent connections. +""" + +import asyncio +import statistics +from typing import List + +import httpx +import pytest +from httpx_sse import aconnect_sse + + +@pytest.mark.loadtest +async def test_memory_stability_under_load( + sse_server_url: str, + scale: int, + duration_minutes: int, +) -> None: + """ + Connect many clients, stream for duration, verify memory is stable. + + Pass criteria: + - Memory growth < 50MB over test duration + - No unbounded growth trend (linear regression slope < 0.1 MB/sec) + """ + events_per_client = duration_minutes * 60 * 10 # 10 events/sec + + async def client_task(client_id: int) -> int: + """Single client consuming SSE events.""" + events_received = 0 + try: + async with httpx.AsyncClient(timeout=300.0) as client: + async with aconnect_sse( + client, "GET", f"{sse_server_url}/sse?delay=0.1" + ) as source: + async for _ in source.aiter_sse(): + events_received += 1 + if events_received >= events_per_client: + break + except Exception: + pass # Connection errors during shutdown are expected + return events_received + + # Get baseline memory + async with httpx.AsyncClient() as client: + baseline = (await client.get(f"{sse_server_url}/metrics")).json() + baseline_memory = baseline["memory_rss_mb"] + + # Start all clients + tasks = [asyncio.create_task(client_task(i)) for i in range(scale)] + + # Sample memory periodically + memory_samples: List[float] = [] + sample_interval = max(10, duration_minutes * 6) # At least 10 samples + + for _ in range(sample_interval): + await asyncio.sleep(duration_minutes * 60 / sample_interval) + try: + async with httpx.AsyncClient() as client: + metrics = (await client.get(f"{sse_server_url}/metrics")).json() + memory_samples.append(metrics["memory_rss_mb"]) + except 
Exception: + pass # Server might be under heavy load + + # Wait for all clients to complete + results = await asyncio.gather(*tasks, return_exceptions=True) + completed = sum(1 for r in results if isinstance(r, int)) + + # Get final memory + async with httpx.AsyncClient() as client: + final = (await client.get(f"{sse_server_url}/metrics")).json() + final_memory = final["memory_rss_mb"] + + # Calculate memory growth + max_memory = max(memory_samples) if memory_samples else final_memory + memory_growth = max_memory - baseline_memory + + # Calculate growth trend (simple linear regression slope) + if len(memory_samples) >= 2: + x_mean = len(memory_samples) / 2 + y_mean = statistics.mean(memory_samples) + numerator = sum( + (i - x_mean) * (y - y_mean) for i, y in enumerate(memory_samples) + ) + denominator = sum((i - x_mean) ** 2 for i in range(len(memory_samples))) + slope = numerator / denominator if denominator else 0 + # Convert to MB/sec + sample_interval_sec = duration_minutes * 60 / len(memory_samples) + slope_per_sec = slope / sample_interval_sec + else: + slope_per_sec = 0 + + # Assert criteria + assert ( + completed >= scale * 0.9 + ), f"Too many failed connections: {completed}/{scale} completed" + assert memory_growth < 50, ( + f"Memory grew by {memory_growth:.1f}MB (baseline: {baseline_memory:.1f}MB, " + f"max: {max_memory:.1f}MB), expected < 50MB" + ) + assert ( + slope_per_sec < 0.1 + ), f"Memory growth trend {slope_per_sec:.3f} MB/sec, expected < 0.1 MB/sec" + + +@pytest.mark.loadtest +async def test_memory_returns_to_baseline_after_disconnect( + sse_server_url: str, + scale: int, +) -> None: + """ + Connect many clients, disconnect all, verify memory returns near baseline. 
+ + Pass criteria: + - Memory within 20% of baseline after all connections close + """ + + async def client_task(client_id: int) -> None: + """Client that connects, receives few events, then disconnects.""" + try: + async with httpx.AsyncClient(timeout=30.0) as client: + async with aconnect_sse( + client, "GET", f"{sse_server_url}/sse?delay=0.01" + ) as source: + count = 0 + async for _ in source.aiter_sse(): + count += 1 + if count >= 50: + break + except Exception: + pass + + # Get baseline + async with httpx.AsyncClient() as client: + baseline = (await client.get(f"{sse_server_url}/metrics")).json() + baseline_memory = baseline["memory_rss_mb"] + + # Connect and disconnect clients in batches + batch_size = min(100, scale) + for batch_start in range(0, scale, batch_size): + batch_end = min(batch_start + batch_size, scale) + tasks = [ + asyncio.create_task(client_task(i)) for i in range(batch_start, batch_end) + ] + await asyncio.gather(*tasks, return_exceptions=True) + + # Wait for cleanup + await asyncio.sleep(2) + + # Check memory returned to near baseline + async with httpx.AsyncClient() as client: + final = (await client.get(f"{sse_server_url}/metrics")).json() + final_memory = final["memory_rss_mb"] + + # Allow 20% growth from baseline (some overhead is expected) + max_allowed = baseline_memory * 1.2 + assert final_memory <= max_allowed, ( + f"Memory did not return to baseline: {final_memory:.1f}MB " + f"(baseline: {baseline_memory:.1f}MB, max allowed: {max_allowed:.1f}MB)" + ) + + +@pytest.mark.loadtest +async def test_event_set_cleanup(sse_server_url: str, scale: int) -> None: + """ + Verify the internal event set empties after connections close. + + This tests the Issue #152 fix - events should be properly removed + from the thread-local state when connections close. 
+ """ + + connected = asyncio.Event() + connection_count = 0 + + async def client_task() -> None: + nonlocal connection_count + try: + async with httpx.AsyncClient(timeout=30.0) as client: + async with aconnect_sse( + client, "GET", f"{sse_server_url}/sse?delay=0.5" + ) as source: + connection_count += 1 + if connection_count >= scale * 0.5: + connected.set() + count = 0 + async for _ in source.aiter_sse(): + count += 1 + if count >= 5: # Stay connected for ~2.5s + break + except Exception: + pass + + # Get baseline event count + async with httpx.AsyncClient() as client: + baseline = (await client.get(f"{sse_server_url}/metrics")).json() + baseline_events = baseline["registered_events"] + + # Connect many clients + tasks = [asyncio.create_task(client_task()) for _ in range(scale)] + + # Wait for connections to establish (with timeout) + try: + await asyncio.wait_for(connected.wait(), timeout=10) + except asyncio.TimeoutError: + pass + await asyncio.sleep(0.5) # Extra margin + + # Check events registered during peak + async with httpx.AsyncClient() as client: + peak = (await client.get(f"{sse_server_url}/metrics")).json() + peak_events = peak["registered_events"] + + # Wait for all to complete + await asyncio.gather(*tasks, return_exceptions=True) + await asyncio.sleep(2) # Allow cleanup time + + # Check events cleaned up + async with httpx.AsyncClient() as client: + final = (await client.get(f"{sse_server_url}/metrics")).json() + final_events = final["registered_events"] + + # Events should have been registered during peak (relaxed threshold) + assert peak_events >= scale * 0.2, ( + f"Expected at least {scale * 0.2} events registered during peak, " + f"got {peak_events}" + ) + + # Events should be cleaned up after + assert final_events <= baseline_events + 10, ( + f"Event set not cleaned up: {final_events} events remaining " + f"(baseline: {baseline_events})" + ) diff --git a/tests/load/test_shutdown.py b/tests/load/test_shutdown.py new file mode 100644 index 
0000000..1bdf43d --- /dev/null +++ b/tests/load/test_shutdown.py @@ -0,0 +1,182 @@ +""" +Graceful shutdown tests under load. + +Verifies clean shutdown behavior with many active connections. +""" + +import asyncio +import signal +import time + +import httpx +import pytest +from httpx_sse import aconnect_sse + + +@pytest.mark.loadtest +async def test_graceful_shutdown_with_active_connections( + docker_available: bool, + scale: int, +) -> None: + """ + Send SIGTERM to server with active connections, verify clean shutdown. + + Pass criteria: + - Shutdown completes within 5 seconds + - All connections receive disconnect (no hanging clients) + """ + if not docker_available: + pytest.skip("Docker not available") + + from tests.load.conftest import SSELoadTestContainer + + container = SSELoadTestContainer() + container.start() + + # Wait for server ready + await asyncio.sleep(2) + base_url = container.get_base_url() + + # Verify server is up + async with httpx.AsyncClient() as client: + resp = await client.get(f"{base_url}/health") + assert resp.status_code == 200 + + disconnected = asyncio.Event() + connections_made = 0 + connections_closed = 0 + + async def client_task() -> str: + nonlocal connections_made, connections_closed + try: + async with httpx.AsyncClient(timeout=30.0) as client: + async with aconnect_sse( + client, "GET", f"{base_url}/sse?delay=0.1" + ) as source: + connections_made += 1 + async for _ in source.aiter_sse(): + if disconnected.is_set(): + break + connections_closed += 1 + return "clean_close" + except httpx.RemoteProtocolError: + connections_closed += 1 + return "server_closed" + except Exception as e: + connections_closed += 1 + return f"error:{type(e).__name__}" + + # Start clients + tasks = [asyncio.create_task(client_task()) for _ in range(scale)] + + # Wait for connections to establish + await asyncio.sleep(2) + + # Send SIGTERM to container + start_shutdown = time.perf_counter() + 
container.get_wrapped_container().kill(signal=signal.SIGTERM) + + # Wait for shutdown + shutdown_timeout = 10 + try: + results = await asyncio.wait_for( + asyncio.gather(*tasks, return_exceptions=True), + timeout=shutdown_timeout, + ) + except asyncio.TimeoutError: + # Cancel remaining tasks + for task in tasks: + task.cancel() + results = await asyncio.gather(*tasks, return_exceptions=True) + + shutdown_time = time.perf_counter() - start_shutdown + + # Cleanup container + try: + container.stop() + except Exception: + pass + + # Analyze results + clean_closes = sum(1 for r in results if r == "clean_close") + server_closes = sum(1 for r in results if r == "server_closed") + errors = sum(1 for r in results if isinstance(r, str) and r.startswith("error:")) + + # All connections should have closed (one way or another) + total_closed = clean_closes + server_closes + errors + assert ( + total_closed >= scale * 0.9 + ), f"Not all connections closed: {total_closed}/{scale}" + + # Shutdown should be fast + assert shutdown_time < 10, f"Shutdown took {shutdown_time:.1f}s, expected < 10s" + + +@pytest.mark.loadtest +async def test_connections_receive_shutdown_signal( + docker_available: bool, +) -> None: + """ + Verify connections are notified of shutdown via SSE. + + When AppStatus.should_exit is set, active streams should terminate gracefully. 
+ """ + if not docker_available: + pytest.skip("Docker not available") + + from tests.load.conftest import SSELoadTestContainer + + container = SSELoadTestContainer() + container.start() + + await asyncio.sleep(2) + base_url = container.get_base_url() + + # Connect clients that will wait for events + async def client_task() -> int: + count = 0 + try: + async with httpx.AsyncClient(timeout=30.0) as client: + async with aconnect_sse( + client, "GET", f"{base_url}/sse?delay=0.5" + ) as source: + async for _ in source.aiter_sse(): + count += 1 + if count >= 20: # Should not reach this + break + except Exception: + pass + return count + + tasks = [asyncio.create_task(client_task()) for _ in range(10)] + + # Let them receive a few events + await asyncio.sleep(3) + + # Kill the server + container.get_wrapped_container().kill(signal=signal.SIGTERM) + + # Gather results + try: + results = await asyncio.wait_for( + asyncio.gather(*tasks, return_exceptions=True), + timeout=10, + ) + except asyncio.TimeoutError: + for task in tasks: + task.cancel() + results = await asyncio.gather(*tasks, return_exceptions=True) + + try: + container.stop() + except Exception: + pass + + # Clients should have received some events before shutdown + event_counts = [r for r in results if isinstance(r, int)] + total_events = sum(event_counts) + + assert total_events > 0, "Clients should have received events before shutdown" + assert all( + c < 20 for c in event_counts + ), "Clients should have been interrupted by shutdown" diff --git a/tests/load/test_throughput.py b/tests/load/test_throughput.py new file mode 100644 index 0000000..22d8407 --- /dev/null +++ b/tests/load/test_throughput.py @@ -0,0 +1,185 @@ +""" +Throughput and latency tests for sse-starlette. + +Measures events per second, latency percentiles, and first event latency. 
+""" + +import asyncio +import time +from typing import List + +import httpx +import pytest +from httpx_sse import aconnect_sse + + +@pytest.mark.loadtest +async def test_throughput_single_client(sse_server_url: str) -> None: + """ + Measure maximum throughput for a single client. + + Baseline measurement without contention. + """ + events_received = 0 + start_time = time.perf_counter() + duration_seconds = 10 + + async with httpx.AsyncClient(timeout=60.0) as client: + async with aconnect_sse( + client, "GET", f"{sse_server_url}/sse?delay=0" + ) as source: + async for _ in source.aiter_sse(): + events_received += 1 + if time.perf_counter() - start_time >= duration_seconds: + break + + elapsed = time.perf_counter() - start_time + throughput = events_received / elapsed + + # Should achieve at least 1000 events/sec for a single client + assert ( + throughput >= 1000 + ), f"Single client throughput {throughput:.0f} events/sec, expected >= 1000" + + +@pytest.mark.loadtest +async def test_throughput_multiple_clients( + sse_server_url: str, + scale: int, +) -> None: + """ + Measure aggregate throughput with multiple concurrent clients. 
+ + Pass criteria: + - Aggregate throughput > 10,000 events/sec + """ + duration_seconds = 30 + + async def client_task() -> int: + count = 0 + start = time.perf_counter() + try: + async with httpx.AsyncClient(timeout=60.0) as client: + async with aconnect_sse( + client, "GET", f"{sse_server_url}/sse?delay=0.001" + ) as source: + async for _ in source.aiter_sse(): + count += 1 + if time.perf_counter() - start >= duration_seconds: + break + except Exception: + pass + return count + + start_time = time.perf_counter() + tasks = [asyncio.create_task(client_task()) for _ in range(scale)] + results = await asyncio.gather(*tasks, return_exceptions=True) + elapsed = time.perf_counter() - start_time + + total_events = sum(r for r in results if isinstance(r, int)) + aggregate_throughput = total_events / elapsed + + # With scale clients, should achieve high aggregate throughput + min_expected = min(10000, scale * 100) # Scale expectation with client count + assert aggregate_throughput >= min_expected, ( + f"Aggregate throughput {aggregate_throughput:.0f} events/sec with {scale} " + f"clients, expected >= {min_expected}" + ) + + +@pytest.mark.loadtest +async def test_first_event_latency( + sse_server_url: str, + scale: int, +) -> None: + """ + Measure time to first event (TTFE) for multiple connections. 
+ + Pass criteria (relaxed for Docker overhead): + - p50 TTFE < 2000ms + - p99 TTFE < 5000ms + """ + latencies: List[float] = [] + + async def measure_ttfe() -> float: + start = time.perf_counter() + try: + async with httpx.AsyncClient(timeout=30.0) as client: + async with aconnect_sse( + client, "GET", f"{sse_server_url}/sse?delay=0" + ) as source: + async for _ in source.aiter_sse(): + return (time.perf_counter() - start) * 1000 # ms + except Exception: + return -1 + return -1 + + tasks = [asyncio.create_task(measure_ttfe()) for _ in range(scale)] + results = await asyncio.gather(*tasks) + + latencies = [r for r in results if r > 0] + + if len(latencies) < scale * 0.9: + pytest.fail(f"Too many failed connections: {len(latencies)}/{scale}") + + latencies.sort() + p50 = latencies[len(latencies) // 2] + p99 = latencies[int(len(latencies) * 0.99)] + + # Relaxed thresholds: Docker networking + container overhead + assert p50 < 2000, f"p50 TTFE {p50:.1f}ms, expected < 2000ms" + assert p99 < 5000, f"p99 TTFE {p99:.1f}ms, expected < 5000ms" + + +@pytest.mark.loadtest +async def test_event_latency_under_load( + sse_server_url: str, + scale: int, +) -> None: + """ + Measure event-to-event latency under load. + + Captures latency between consecutive events to detect backpressure. 
+ """ + all_latencies: List[float] = [] + + async def measure_latencies() -> List[float]: + latencies: List[float] = [] + last_time = None + try: + async with httpx.AsyncClient(timeout=60.0) as client: + async with aconnect_sse( + client, "GET", f"{sse_server_url}/sse?delay=0.01" + ) as source: + count = 0 + async for _ in source.aiter_sse(): + now = time.perf_counter() + if last_time is not None: + latencies.append((now - last_time) * 1000) + last_time = now + count += 1 + if count >= 100: + break + except Exception: + pass + return latencies + + tasks = [asyncio.create_task(measure_latencies()) for _ in range(scale)] + results = await asyncio.gather(*tasks) + + for client_latencies in results: + all_latencies.extend(client_latencies) + + if len(all_latencies) < 100: + pytest.fail(f"Insufficient latency samples: {len(all_latencies)}") + + all_latencies.sort() + p50 = all_latencies[len(all_latencies) // 2] + p95 = all_latencies[int(len(all_latencies) * 0.95)] + p99 = all_latencies[int(len(all_latencies) * 0.99)] + + # Expected ~10ms between events (0.01s delay) + # Allow 2x for processing overhead under load + assert p50 < 50, f"p50 inter-event latency {p50:.1f}ms, expected < 50ms" + assert p95 < 100, f"p95 inter-event latency {p95:.1f}ms, expected < 100ms" + assert p99 < 200, f"p99 inter-event latency {p99:.1f}ms, expected < 200ms" diff --git a/tests/load/test_watcher_scale.py b/tests/load/test_watcher_scale.py new file mode 100644 index 0000000..3a4d150 --- /dev/null +++ b/tests/load/test_watcher_scale.py @@ -0,0 +1,159 @@ +""" +Watcher deduplication tests at scale. + +Validates the Issue #152 fix: only one watcher task per thread regardless +of the number of concurrent connections. 
+""" + +import asyncio + +import httpx +import pytest +from httpx_sse import aconnect_sse + + +@pytest.mark.loadtest +async def test_single_watcher_with_many_connections( + sse_server_url: str, + scale: int, +) -> None: + """ + With N concurrent connections, verify only 1 watcher is running. + + This is the core regression test for Issue #152. + """ + + async def client_task() -> None: + try: + async with httpx.AsyncClient(timeout=30.0) as client: + async with aconnect_sse( + client, "GET", f"{sse_server_url}/sse?delay=0.1" + ) as source: + async for _ in source.aiter_sse(): + await asyncio.sleep(5) # Stay connected + break + except Exception: + pass + + # Start many connections + tasks = [asyncio.create_task(client_task()) for _ in range(scale)] + + # Wait for connections to establish + await asyncio.sleep(2) + + # Check watcher status + async with httpx.AsyncClient() as client: + metrics = (await client.get(f"{sse_server_url}/metrics")).json() + + watcher_started = metrics["watcher_started"] + registered_events = metrics["registered_events"] + + # Cancel all tasks + for task in tasks: + task.cancel() + await asyncio.gather(*tasks, return_exceptions=True) + + # Watcher should be running + assert watcher_started is True, "Watcher should be started with active connections" + + # Should have many events registered (one per connection) + assert ( + registered_events >= scale * 0.5 + ), f"Expected at least {scale * 0.5} events, got {registered_events}" + + +@pytest.mark.loadtest +async def test_rapid_connect_disconnect_watcher_stability( + sse_server_url: str, + scale: int, +) -> None: + """ + Rapid connect/disconnect cycles should not accumulate watchers. + + Each connect/disconnect should reuse the existing watcher, not spawn new ones. 
+ """ + + async def quick_connect() -> None: + try: + async with httpx.AsyncClient(timeout=10.0) as client: + async with aconnect_sse( + client, "GET", f"{sse_server_url}/sse?delay=0.01" + ) as source: + async for _ in source.aiter_sse(): + break # Disconnect after first event + except Exception: + pass + + # Rapid connect/disconnect cycles + for batch in range(scale // 10): + tasks = [asyncio.create_task(quick_connect()) for _ in range(10)] + await asyncio.gather(*tasks, return_exceptions=True) + + # Brief pause + await asyncio.sleep(0.5) + + # Check metrics - watcher should still be singular + async with httpx.AsyncClient() as client: + metrics = (await client.get(f"{sse_server_url}/metrics")).json() + + # The watcher_started flag confirms single watcher pattern + # If multiple watchers had spawned, we'd see resource issues + num_threads = metrics["num_threads"] + + # Thread count should be reasonable (not proportional to connection count) + # A healthy uvicorn worker has ~5-10 threads typically + assert num_threads < 50, f"Too many threads ({num_threads}), possible watcher leak" + + +@pytest.mark.loadtest +async def test_watcher_cleanup_allows_restart(sse_server_url: str) -> None: + """ + After all connections close, new connections should start fresh watcher. + + Tests the watcher lifecycle: start -> broadcast -> cleanup -> restart. 
+ """ + + async def connect_and_consume(n_events: int) -> int: + count = 0 + try: + async with httpx.AsyncClient(timeout=30.0) as client: + async with aconnect_sse( + client, "GET", f"{sse_server_url}/sse?delay=0.05" + ) as source: + async for _ in source.aiter_sse(): + count += 1 + if count >= n_events: + break + except Exception: + pass + return count + + # Phase 1: Connect, consume, disconnect + tasks = [asyncio.create_task(connect_and_consume(20)) for _ in range(50)] + results = await asyncio.gather(*tasks) + assert sum(results) > 0, "Phase 1 should have received events" + + # Wait for cleanup + await asyncio.sleep(1) + + # Check state is clean + async with httpx.AsyncClient() as client: + metrics1 = (await client.get(f"{sse_server_url}/metrics")).json() + events_after_phase1 = metrics1["registered_events"] + + # Phase 2: New connections should work + tasks = [asyncio.create_task(connect_and_consume(20)) for _ in range(50)] + results = await asyncio.gather(*tasks) + assert sum(results) > 0, "Phase 2 should have received events" + + # Wait for cleanup + await asyncio.sleep(1) + + # Verify clean state + async with httpx.AsyncClient() as client: + metrics2 = (await client.get(f"{sse_server_url}/metrics")).json() + + # Events should be cleaned up after both phases + assert ( + metrics2["registered_events"] <= events_after_phase1 + 5 + ), "Event set should be cleaned up between phases" diff --git a/uv.lock b/uv.lock index a2ba9db..cf95af6 100644 --- a/uv.lock +++ b/uv.lock @@ -1294,6 +1294,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/2a/39/e50c7c3a983047577ee07d2a9e53faf5a69493943ec3f6a384bdc792deb2/httpx-0.28.1-py3-none-any.whl", hash = "sha256:d909fcccc110f8c7faf814ca82a9a4d816bc5a6dbfea25d6591d6985b8ba59ad", size = 73517, upload_time = "2024-12-06T15:37:21.509Z" }, ] +[[package]] +name = "httpx-sse" +version = "0.4.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/0f/4c/751061ffa58615a32c31b2d82e8482be8dd4a89154f003147acee90f2be9/httpx_sse-0.4.3.tar.gz", hash = "sha256:9b1ed0127459a66014aec3c56bebd93da3c1bc8bb6618c8082039a44889a755d", size = 15943, upload_time = "2025-10-10T21:48:22.271Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d2/fd/6668e5aec43ab844de6fc74927e155a3b37bf40d7c3790e49fc0406b6578/httpx_sse-0.4.3-py3-none-any.whl", hash = "sha256:0ac1c9fe3c0afad2e0ebb25a934a59f4c7823b60792691f779fad2c5568830fc", size = 8960, upload_time = "2025-10-10T21:48:21.158Z" }, +] + [[package]] name = "hyperlink" version = "21.0.0" @@ -2347,6 +2356,10 @@ granian = [ { name = "granian", version = "2.5.7", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, { name = "granian", version = "2.6.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, ] +loadtest = [ + { name = "httpx-sse" }, + { name = "psutil" }, +] uvicorn = [ { name = "uvicorn" }, ] @@ -2357,6 +2370,7 @@ dev = [ { name = "async-timeout" }, { name = "build" }, { name = "httpx" }, + { name = "httpx-sse" }, { name = "mypy" }, { name = "portend" }, { name = "pre-commit", version = "4.3.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, @@ -2383,12 +2397,14 @@ requires-dist = [ { name = "daphne", marker = "extra == 'daphne'", specifier = ">=4.2.0" }, { name = "fastapi", marker = "extra == 'examples'", specifier = ">=0.115.12" }, { name = "granian", marker = "extra == 'granian'", specifier = ">=2.3.1" }, + { name = "httpx-sse", marker = "extra == 'loadtest'", specifier = ">=0.4.0" }, + { name = "psutil", marker = "extra == 'loadtest'", specifier = ">=6.1.1" }, { name = "sqlalchemy", extras = ["asyncio"], marker = "extra == 'examples'", specifier = ">=2.0.41" }, { name = "starlette", specifier = ">=0.49.1" }, { name = "uvicorn", marker = "extra == 'examples'", specifier = ">=0.34.0" }, { 
name = "uvicorn", marker = "extra == 'uvicorn'", specifier = ">=0.34.0" }, ] -provides-extras = ["examples", "uvicorn", "granian", "daphne"] +provides-extras = ["examples", "uvicorn", "granian", "daphne", "loadtest"] [package.metadata.requires-dev] dev = [ @@ -2396,6 +2412,7 @@ dev = [ { name = "async-timeout", specifier = ">=5.0.1" }, { name = "build", specifier = ">=1.2.2.post1" }, { name = "httpx", specifier = ">=0.28.1" }, + { name = "httpx-sse", specifier = ">=0.4.0" }, { name = "mypy", specifier = ">=1.14.0" }, { name = "portend", specifier = ">=3.2.0" }, { name = "pre-commit", specifier = ">=4.0.0" }, From 03255553be096bd769713c9d5c119e825f912edd Mon Sep 17 00:00:00 2001 From: sysid Date: Fri, 2 Jan 2026 12:33:17 +0100 Subject: [PATCH 3/4] feat(loadtest): add metrics infrastructure with baseline comparison Add structured metrics collection, baseline management, and HTML/JSON reporting to load tests. Tests now produce observable performance data instead of just pass/fail results. New components: - MetricsCollector: aggregates latency, memory, throughput samples - BaselineManager: stores/loads per-test baselines, detects regressions - ReportGenerator: produces JSON reports and self-contained HTML with inline SVG charts All 15 load tests updated with: - Metrics collection integration - Structured docstrings explaining what/why/how for each test - Baseline comparison and optional regression detection CLI options added: --update-baseline, --fail-on-regression, --output-dir, --baselines-dir, --regression-threshold GitHub workflow updated with baseline update and regression detection inputs, plus artifact upload for reports. 
--- .github/workflows/load-test.yml | 38 +- .gitignore | 4 + tests/load/README.md | 228 +++++++++++ tests/load/baseline.py | 338 +++++++++++++++ tests/load/baselines/.gitkeep | 0 tests/load/conftest.py | 115 +++++- tests/load/metrics.py | 422 +++++++++++++++++++ tests/load/reporter.py | 615 ++++++++++++++++++++++++++++ tests/load/test_backpressure.py | 349 +++++++++++++--- tests/load/test_memory_stability.py | 341 ++++++++++++--- tests/load/test_shutdown.py | 211 ++++++++-- tests/load/test_throughput.py | 342 ++++++++++++++-- tests/load/test_watcher_scale.py | 323 +++++++++++++-- 13 files changed, 3101 insertions(+), 225 deletions(-) create mode 100644 tests/load/README.md create mode 100644 tests/load/baseline.py create mode 100644 tests/load/baselines/.gitkeep create mode 100644 tests/load/metrics.py create mode 100644 tests/load/reporter.py diff --git a/.github/workflows/load-test.yml b/.github/workflows/load-test.yml index 59eeb79..ec3691e 100644 --- a/.github/workflows/load-test.yml +++ b/.github/workflows/load-test.yml @@ -21,6 +21,16 @@ on: - '1' - '5' - '10' + update_baseline: + description: 'Update baselines after run' + required: false + default: false + type: boolean + fail_on_regression: + description: 'Fail if regression detected' + required: false + default: false + type: boolean jobs: load-test: @@ -51,6 +61,9 @@ jobs: python -m pytest tests/load/ -m "loadtest" \ --scale=${{ inputs.scale }} \ --duration=${{ inputs.duration }} \ + --output-dir=tests/load/results \ + ${{ inputs.update_baseline && '--update-baseline' || '' }} \ + ${{ inputs.fail_on_regression && '--fail-on-regression' || '' }} \ -v --tb=short \ --junitxml=load-test-results.xml @@ -58,11 +71,21 @@ jobs: uses: actions/upload-artifact@v4 if: always() with: - name: load-test-results + name: load-test-results-${{ github.sha }} path: | load-test-results.xml + tests/load/results/*.json + tests/load/results/*.html retention-days: 30 + - name: Upload updated baselines + uses: 
actions/upload-artifact@v4 + if: inputs.update_baseline + with: + name: updated-baselines-${{ github.sha }} + path: tests/load/baselines/*.json + retention-days: 90 + - name: Test Summary if: always() run: | @@ -70,7 +93,16 @@ jobs: echo "" >> $GITHUB_STEP_SUMMARY echo "- **Scale**: ${{ inputs.scale }} concurrent connections" >> $GITHUB_STEP_SUMMARY echo "- **Duration**: ${{ inputs.duration }} minutes" >> $GITHUB_STEP_SUMMARY + echo "- **Commit**: ${{ github.sha }}" >> $GITHUB_STEP_SUMMARY echo "" >> $GITHUB_STEP_SUMMARY - if [ -f load-test-results.xml ]; then - echo "Test results uploaded as artifact." >> $GITHUB_STEP_SUMMARY + echo "### Reports" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + if [ -d tests/load/results ]; then + for f in tests/load/results/*.json; do + if [ -f "$f" ]; then + echo "- $(basename "$f")" >> $GITHUB_STEP_SUMMARY + fi + done fi + echo "" >> $GITHUB_STEP_SUMMARY + echo "Download artifacts for detailed HTML reports with charts." >> $GITHUB_STEP_SUMMARY diff --git a/.gitignore b/.gitignore index 02740bb..6f1e208 100644 --- a/.gitignore +++ b/.gitignore @@ -15,3 +15,7 @@ venv Pipfile.lock .envrc .pdm-python + +# Load test results (generated, not tracked) +tests/load/results/ +!tests/load/results/.gitkeep diff --git a/tests/load/README.md b/tests/load/README.md new file mode 100644 index 0000000..13332a5 --- /dev/null +++ b/tests/load/README.md @@ -0,0 +1,228 @@ +# SSE-Starlette Load Tests + +Performance and stability tests for the SSE implementation under realistic load conditions. 
+
+## Overview
+
+These tests measure performance characteristics that unit tests cannot capture:
+- Throughput under concurrent load
+- Memory stability over time
+- Resource cleanup after disconnections
+- Graceful shutdown behavior
+- Backpressure handling
+
+## Quick Start
+
+```bash
+# Run load tests locally (requires Docker)
+make test-load
+
+# Run with custom scale
+make test-load PYTEST_ARGS="--scale=500 --duration=5"
+
+# Update baselines after intentional changes
+make test-load PYTEST_ARGS="--update-baseline"
+```
+
+## Architecture
+
+```
+tests/load/
+├── conftest.py          # Fixtures, CLI options, Docker container setup
+├── metrics.py           # MetricsCollector, statistics computation
+├── baseline.py          # BaselineManager, regression detection
+├── reporter.py          # JSON + HTML report generation
+├── server_app.py        # Test server with /sse and /metrics endpoints
+├── Dockerfile.loadtest  # Container for isolated server testing
+├── baselines/           # Git-tracked baseline files (*.json)
+├── results/             # Generated reports (gitignored)
+└── test_*.py            # Test modules
+```
+
+## KPI Persistence & Baselining
+
+### How It Works
+
+1. **During Test Run**: `MetricsCollector` aggregates samples (latencies, memory, events)
+2. **After Test**: Statistics computed (p50/p95/p99, mean, stdev, slopes)
+3. **Report Generation**: JSON file saved to `tests/load/results/<test_name>.json`
+4. **Baseline Comparison**: Current run compared against `tests/load/baselines/<test_name>.json`
+5. **Regression Detection**: Percent changes flagged if exceeding thresholds
+
+### Baseline Files
+
+Baselines are **git-tracked** so changes are visible in PRs:
+
+```
+tests/load/baselines/
+├── test_throughput_single_client.json
+├── test_memory_stability_under_load.json
+└── ...
+``` + +Each baseline contains: +```json +{ + "test_name": "test_throughput_single_client", + "timestamp": "2024-01-15T14:30:00Z", + "git_commit": "abc1234", + "throughput": { + "aggregate_events_per_sec": 12456.7, + "per_client_events_per_sec": [12456.7] + }, + "latency": { "p50_ms": 14.8, "p95_ms": 21.4, "p99_ms": 27.4 }, + "memory": { "baseline_mb": 45.2, "peak_mb": 67.8, "growth_mb": 22.6 } +} +``` + +### Updating Baselines + +```bash +# After intentional performance changes (optimization, new features) +make test-load PYTEST_ARGS="--update-baseline" + +# Then commit the updated baseline files +git add tests/load/baselines/ +git commit -m "Update load test baselines after optimization" +``` + +### Regression Detection + +| Metric | Warning Threshold | Fail Threshold | +|--------|-------------------|----------------| +| Latency p99 | +20% | +50% | +| Throughput | -20% | - | +| Memory growth | +50% | - | +| Memory slope | - | >0.1 MB/sec | +| Error rate | - | >5% | + +Enable in CI: +```bash +make test-load PYTEST_ARGS="--fail-on-regression" +``` + +## CLI Options + +| Option | Default | Description | +|--------|---------|-------------| +| `--scale` | 100 | Concurrent connections | +| `--duration` | 1 | Test duration (minutes) | +| `--output-dir` | `tests/load/results` | Report output directory | +| `--baselines-dir` | `tests/load/baselines` | Baseline file directory | +| `--update-baseline` | False | Save current run as new baseline | +| `--fail-on-regression` | False | Exit non-zero if regression detected | +| `--regression-threshold` | 20 | Percent change to trigger warning | + +## Test Categories + +### Throughput (`test_throughput.py`) +- Single client maximum throughput (baseline without contention) +- Multi-client aggregate throughput (scaling behavior) +- Time to first event (connection setup latency) +- Inter-event latency under load (backpressure detection) + +### Memory Stability (`test_memory_stability.py`) +- Memory growth during sustained streaming 
+- Memory reclamation after disconnect
+- Event set cleanup (Issue #152 regression)
+
+### Watcher Scale (`test_watcher_scale.py`)
+- Single watcher with many connections (Issue #152 core test)
+- Watcher stability under rapid churn
+- Watcher lifecycle (start → broadcast → cleanup → restart)
+
+### Shutdown (`test_shutdown.py`)
+- Graceful shutdown timing with active connections
+- Shutdown signal propagation to streams
+
+### Backpressure (`test_backpressure.py`)
+- Slow client isolation (fast clients unaffected)
+- Resource stability under connection churn
+- send_timeout behavior with frozen clients
+
+## Limitations: What These Tests Don't Cover
+
+### Not Measured
+
+1. **True Production Scale**
+   - Tests run at 100-1000 connections; production may see 10K+
+   - Resource contention patterns differ at extreme scale
+   - OS-level limits (ulimit, ephemeral ports) not tested
+
+2. **Network Conditions**
+   - Tests run on localhost/Docker bridge
+   - No simulation of latency, packet loss, or bandwidth limits
+   - Real network jitter not captured
+
+3. **Long-Running Stability**
+   - Tests run for minutes; production runs for days/weeks
+   - Slow leaks (bytes/hour) may not appear in short tests
+   - GC pressure patterns differ over extended periods
+
+4. **CPU Profiling**
+   - No measurement of CPU cycles per event
+   - Hot path optimization regressions not detected
+   - Async scheduler overhead not isolated
+
+5. **Multi-Process/Multi-Node**
+   - Tests run single uvicorn process
+   - No testing of gunicorn worker coordination
+   - No distributed load balancer behavior
+
+6. **Client Diversity**
+   - All clients use httpx (same HTTP/1.1 implementation)
+   - No HTTP/2 or HTTP/3 testing
+   - No browser-specific SSE behavior (reconnection, Last-Event-ID)
+
+7. 
**Garbage Collection Impact** + - Python GC pauses not isolated + - Memory pressure from other processes not simulated + - Different GC generations not separately measured + +### Potential Blind Spots + +| Regression Type | Detection Gap | +|-----------------|---------------| +| 5% throughput drop | Below noise floor | +| Sub-millisecond latency spikes | Averaged out in percentiles | +| Memory leak < 1KB/connection | Too slow to appear in test duration | +| CPU regression without throughput impact | Not measured | +| Thread pool exhaustion at >1000 connections | Scale not tested | +| Event loop blocking < 10ms | Within jitter tolerance | + +### Recommendations for Production + +1. **APM Integration**: Use Datadog/NewRelic for continuous production metrics +2. **Synthetic Monitoring**: Run periodic load tests against staging +3. **Canary Deployments**: Compare metrics between old/new versions +4. **Memory Profiling**: Run tracemalloc in staging for leak detection +5. **CPU Profiling**: Use py-spy periodically to catch hot path regressions + +## Report Outputs + +### JSON Report +Full structured data for programmatic analysis: +``` +tests/load/results/test_throughput_single_client.json +``` + +### HTML Report +Self-contained visualization with inline SVG charts: +``` +tests/load/results/test_throughput_single_client.html +``` + +Features: +- Summary metrics table +- Memory usage over time chart +- Latency distribution (when applicable) +- Comparison against baseline with delta percentages +- Regression/warning highlights + +## GitHub Actions Integration + +The workflow (`.github/workflows/load-test.yml`) supports: +- Manual trigger with scale/duration inputs +- Baseline update option +- Regression detection for CI gates +- Artifact upload for reports diff --git a/tests/load/baseline.py b/tests/load/baseline.py new file mode 100644 index 0000000..4423b79 --- /dev/null +++ b/tests/load/baseline.py @@ -0,0 +1,338 @@ +""" +Baseline management for load test metrics. 
+ +Handles loading, saving, and comparing performance baselines. +""" + +from __future__ import annotations + +import json +from dataclasses import dataclass +from pathlib import Path +from typing import Any + +from .metrics import ( + LatencyStats, + MemoryStats, + ReliabilityStats, + SSEInternals, + TestReport, + ThroughputStats, +) + + +@dataclass +class ComparisonResult: + """Result of comparing current run against baseline.""" + + # Percent changes (positive = worse for latency/memory, negative = worse for throughput) + latency_p99_change_pct: float | None = None + latency_p50_change_pct: float | None = None + ttfe_p99_change_pct: float | None = None + throughput_change_pct: float | None = None + memory_growth_change_pct: float | None = None + memory_slope_change_pct: float | None = None + error_rate_change_pct: float | None = None + + # Regression detection + regression_detected: bool = False + regression_reasons: list[str] | None = None + warnings: list[str] | None = None + + # Baseline info + baseline_commit: str | None = None + baseline_timestamp: str | None = None + + def to_dict(self) -> dict[str, Any]: + """Convert to JSON-serializable dict.""" + result: dict[str, Any] = {} + + if self.baseline_commit: + result["baseline_commit"] = self.baseline_commit + if self.baseline_timestamp: + result["baseline_timestamp"] = self.baseline_timestamp + + if self.latency_p99_change_pct is not None: + result["latency_p99_change_pct"] = round(self.latency_p99_change_pct, 2) + if self.latency_p50_change_pct is not None: + result["latency_p50_change_pct"] = round(self.latency_p50_change_pct, 2) + if self.ttfe_p99_change_pct is not None: + result["ttfe_p99_change_pct"] = round(self.ttfe_p99_change_pct, 2) + if self.throughput_change_pct is not None: + result["throughput_change_pct"] = round(self.throughput_change_pct, 2) + if self.memory_growth_change_pct is not None: + result["memory_growth_change_pct"] = round(self.memory_growth_change_pct, 2) + if 
self.memory_slope_change_pct is not None: + result["memory_slope_change_pct"] = round(self.memory_slope_change_pct, 2) + if self.error_rate_change_pct is not None: + result["error_rate_change_pct"] = round(self.error_rate_change_pct, 2) + + result["regression_detected"] = self.regression_detected + if self.regression_reasons: + result["regression_reasons"] = self.regression_reasons + if self.warnings: + result["warnings"] = self.warnings + + return result + + +# Default thresholds for regression detection +DEFAULT_THRESHOLDS = { + "latency_p99_warning_pct": 20.0, + "latency_p99_fail_pct": 50.0, + "throughput_warning_pct": -20.0, # Negative = decrease + "memory_growth_warning_pct": 50.0, + "memory_slope_fail": 0.1, # MB/sec absolute threshold + "error_rate_fail_pct": 5.0, # Absolute percentage +} + + +class BaselineManager: + """Manages per-test baselines for comparison.""" + + def __init__( + self, + baselines_dir: Path | str = "tests/load/baselines", + thresholds: dict[str, float] | None = None, + ): + self.baselines_dir = Path(baselines_dir) + self.thresholds = thresholds or DEFAULT_THRESHOLDS + + def _baseline_path(self, test_name: str) -> Path: + """Get path to baseline file for a test.""" + # Sanitize test name for filename + safe_name = test_name.replace("::", "_").replace("/", "_").replace("\\", "_") + return self.baselines_dir / f"{safe_name}.json" + + def load_baseline(self, test_name: str) -> TestReport | None: + """Load baseline for a specific test.""" + path = self._baseline_path(test_name) + if not path.exists(): + return None + + try: + with open(path) as f: + data = json.load(f) + return _dict_to_report(data) + except (json.JSONDecodeError, KeyError, TypeError): + return None + + def save_baseline(self, report: TestReport) -> Path: + """Save report as new baseline.""" + self.baselines_dir.mkdir(parents=True, exist_ok=True) + path = self._baseline_path(report.test_name) + + with open(path, "w") as f: + json.dump(report.to_dict(), f, indent=2) + + 
return path + + def compare( + self, current: TestReport, baseline: TestReport | None = None + ) -> ComparisonResult: + """Compare current run against baseline.""" + if baseline is None: + baseline = self.load_baseline(current.test_name) + + if baseline is None: + return ComparisonResult() + + result = ComparisonResult( + baseline_commit=baseline.git_commit, + baseline_timestamp=baseline.timestamp, + ) + warnings: list[str] = [] + regressions: list[str] = [] + + # Compare latency p99 + if current.latency and baseline.latency: + if baseline.latency.p99_ms > 0: + change = ( + (current.latency.p99_ms - baseline.latency.p99_ms) + / baseline.latency.p99_ms + * 100 + ) + result.latency_p99_change_pct = change + if change > self.thresholds["latency_p99_fail_pct"]: + regressions.append( + f"Latency p99 increased by {change:.1f}% " + f"(>{self.thresholds['latency_p99_fail_pct']}%)" + ) + elif change > self.thresholds["latency_p99_warning_pct"]: + warnings.append(f"Latency p99 increased by {change:.1f}%") + + if baseline.latency.p50_ms > 0: + change = ( + (current.latency.p50_ms - baseline.latency.p50_ms) + / baseline.latency.p50_ms + * 100 + ) + result.latency_p50_change_pct = change + + # Compare TTFE p99 + if current.ttfe and baseline.ttfe: + if baseline.ttfe.p99_ms > 0: + change = ( + (current.ttfe.p99_ms - baseline.ttfe.p99_ms) + / baseline.ttfe.p99_ms + * 100 + ) + result.ttfe_p99_change_pct = change + + # Compare throughput + if current.throughput and baseline.throughput: + if baseline.throughput.aggregate_events_per_sec > 0: + change = ( + ( + current.throughput.aggregate_events_per_sec + - baseline.throughput.aggregate_events_per_sec + ) + / baseline.throughput.aggregate_events_per_sec + * 100 + ) + result.throughput_change_pct = change + if change < self.thresholds["throughput_warning_pct"]: + warnings.append(f"Throughput decreased by {abs(change):.1f}%") + + # Compare memory growth + if current.memory and baseline.memory: + if baseline.memory.growth_mb > 0: + 
change = ( + (current.memory.growth_mb - baseline.memory.growth_mb) + / baseline.memory.growth_mb + * 100 + ) + result.memory_growth_change_pct = change + if change > self.thresholds["memory_growth_warning_pct"]: + warnings.append(f"Memory growth increased by {change:.1f}%") + + # Memory slope absolute check + if current.memory.slope_mb_per_sec > self.thresholds["memory_slope_fail"]: + regressions.append( + f"Memory slope {current.memory.slope_mb_per_sec:.3f} MB/sec " + f"exceeds threshold {self.thresholds['memory_slope_fail']} MB/sec" + ) + + if baseline.memory.slope_mb_per_sec > 0: + change = ( + (current.memory.slope_mb_per_sec - baseline.memory.slope_mb_per_sec) + / baseline.memory.slope_mb_per_sec + * 100 + ) + result.memory_slope_change_pct = change + + # Compare error rate + if current.reliability and baseline.reliability: + change = ( + current.reliability.error_rate - baseline.reliability.error_rate + ) * 100 + result.error_rate_change_pct = change + + if ( + current.reliability.error_rate * 100 + > self.thresholds["error_rate_fail_pct"] + ): + regressions.append( + f"Error rate {current.reliability.error_rate * 100:.1f}% " + f"exceeds threshold {self.thresholds['error_rate_fail_pct']}%" + ) + + result.regression_detected = len(regressions) > 0 + result.regression_reasons = regressions if regressions else None + result.warnings = warnings if warnings else None + + return result + + +def _dict_to_report(data: dict[str, Any]) -> TestReport: + """Convert JSON dict back to TestReport.""" + metadata = data.get("metadata", data) + + # Reconstruct latency stats + latency = None + if "latency" in data: + lat = data["latency"] + latency = LatencyStats( + p50_ms=lat["p50_ms"], + p90_ms=lat["p90_ms"], + p95_ms=lat["p95_ms"], + p99_ms=lat["p99_ms"], + max_ms=lat["max_ms"], + min_ms=lat.get("min_ms", 0.0), + mean_ms=lat["mean_ms"], + stdev_ms=lat["stdev_ms"], + sample_count=lat["sample_count"], + ) + + ttfe = None + if "ttfe" in data: + t = data["ttfe"] + ttfe = 
LatencyStats( + p50_ms=t["p50_ms"], + p90_ms=t["p90_ms"], + p95_ms=t["p95_ms"], + p99_ms=t["p99_ms"], + max_ms=t["max_ms"], + min_ms=t.get("min_ms", 0.0), + mean_ms=t["mean_ms"], + stdev_ms=t["stdev_ms"], + sample_count=t["sample_count"], + ) + + throughput = None + if "throughput" in data: + th = data["throughput"] + throughput = ThroughputStats( + aggregate_events_per_sec=th["aggregate_events_per_sec"], + per_client_events_per_sec=th["per_client_events_per_sec"], + total_events=th["total_events"], + total_duration_sec=th["total_duration_sec"], + client_count=th["client_count"], + ) + + memory = None + if "memory" in data: + m = data["memory"] + memory = MemoryStats( + baseline_mb=m["baseline_mb"], + peak_mb=m["peak_mb"], + final_mb=m["final_mb"], + growth_mb=m["growth_mb"], + slope_mb_per_sec=m["slope_mb_per_sec"], + samples=[(s[0], s[1]) for s in m.get("samples", [])], + ) + + reliability = None + if "reliability" in data: + r = data["reliability"] + reliability = ReliabilityStats( + successful_connections=r["successful_connections"], + failed_connections=r["failed_connections"], + error_rate=r["error_rate"], + errors=r.get("errors", []), + ) + + sse_internals = None + if "sse_internals" in data: + s = data["sse_internals"] + sse_internals = SSEInternals( + watcher_started=s["watcher_started"], + peak_registered_events=s["peak_registered_events"], + final_registered_events=s["final_registered_events"], + ) + + return TestReport( + test_name=metadata.get("test_name", "unknown"), + timestamp=metadata.get("timestamp", ""), + git_commit=metadata.get("git_commit", "unknown"), + git_branch=metadata.get("git_branch", "unknown"), + scale=metadata.get("scale", 0), + duration_minutes=metadata.get("duration_minutes", 0), + latency=latency, + ttfe=ttfe, + throughput=throughput, + memory=memory, + reliability=reliability, + sse_internals=sse_internals, + comparison=data.get("comparison"), + ) diff --git a/tests/load/baselines/.gitkeep b/tests/load/baselines/.gitkeep new file 
mode 100644 index 0000000..e69de29 diff --git a/tests/load/conftest.py b/tests/load/conftest.py index ec73419..b70f888 100644 --- a/tests/load/conftest.py +++ b/tests/load/conftest.py @@ -4,14 +4,24 @@ Provides container-based SSE server and utility fixtures. """ +from __future__ import annotations + import os import time -from typing import Generator +from pathlib import Path +from typing import TYPE_CHECKING, Generator import httpx import pytest from testcontainers.core.container import DockerContainer +from .baseline import BaselineManager +from .metrics import MetricsCollector +from .reporter import ReportGenerator + +if TYPE_CHECKING: + from .metrics import TestReport + class SSELoadTestContainer(DockerContainer): """Custom container for SSE load testing.""" @@ -114,6 +124,37 @@ def pytest_addoption(parser: pytest.Parser) -> None: default="1", help="Test duration in minutes", ) + parser.addoption( + "--output-dir", + action="store", + default="tests/load/results", + help="Directory for test reports", + ) + parser.addoption( + "--baselines-dir", + action="store", + default="tests/load/baselines", + help="Directory for baseline files", + ) + parser.addoption( + "--update-baseline", + action="store_true", + default=False, + help="Save current run as new baseline", + ) + parser.addoption( + "--fail-on-regression", + action="store_true", + default=False, + help="Exit non-zero if regression detected", + ) + parser.addoption( + "--regression-threshold", + action="store", + type=int, + default=20, + help="Percent change to trigger regression warning", + ) @pytest.fixture @@ -126,3 +167,75 @@ def scale(request: pytest.FixtureRequest) -> int: def duration_minutes(request: pytest.FixtureRequest) -> int: """Get the duration in minutes for load tests.""" return int(request.config.getoption("--duration")) + + +@pytest.fixture +def output_dir(request: pytest.FixtureRequest) -> Path: + """Get the output directory for reports.""" + return 
Path(request.config.getoption("--output-dir")) + + +@pytest.fixture +def baselines_dir(request: pytest.FixtureRequest) -> Path: + """Get the baselines directory.""" + return Path(request.config.getoption("--baselines-dir")) + + +@pytest.fixture +def update_baseline(request: pytest.FixtureRequest) -> bool: + """Whether to update baselines.""" + return bool(request.config.getoption("--update-baseline")) + + +@pytest.fixture +def fail_on_regression(request: pytest.FixtureRequest) -> bool: + """Whether to fail on regression.""" + return bool(request.config.getoption("--fail-on-regression")) + + +@pytest.fixture +def metrics_collector() -> MetricsCollector: + """Fresh metrics collector for each test.""" + return MetricsCollector() + + +@pytest.fixture(scope="session") +def baseline_manager(request: pytest.FixtureRequest) -> BaselineManager: + """Baseline manager for comparison.""" + baselines_dir = Path(request.config.getoption("--baselines-dir")) + threshold = int(request.config.getoption("--regression-threshold")) + thresholds = { + "latency_p99_warning_pct": float(threshold), + "latency_p99_fail_pct": float(threshold * 2.5), + "throughput_warning_pct": float(-threshold), + "memory_growth_warning_pct": float(threshold * 2.5), + "memory_slope_fail": 0.1, + "error_rate_fail_pct": 5.0, + } + return BaselineManager(baselines_dir=baselines_dir, thresholds=thresholds) + + +@pytest.fixture(scope="session") +def report_generator(request: pytest.FixtureRequest) -> ReportGenerator: + """Report generator for output.""" + output_dir = Path(request.config.getoption("--output-dir")) + return ReportGenerator(output_dir=output_dir) + + +# Store test reports for session-level access +_test_reports: dict[str, "TestReport"] = {} + + +def pytest_sessionstart(session: pytest.Session) -> None: + """Clear reports at session start.""" + _test_reports.clear() + + +def register_test_report(report: "TestReport") -> None: + """Register a test report for later processing.""" + 
_test_reports[report.test_name] = report + + +def get_test_reports() -> dict[str, "TestReport"]: + """Get all registered test reports.""" + return _test_reports.copy() diff --git a/tests/load/metrics.py b/tests/load/metrics.py new file mode 100644 index 0000000..c3bfea5 --- /dev/null +++ b/tests/load/metrics.py @@ -0,0 +1,422 @@ +""" +Core metrics collection and reporting infrastructure for load tests. + +Provides dataclasses for structured metrics and a collector for aggregating +samples during test execution. +""" + +from __future__ import annotations + +import statistics +import subprocess +import time +from dataclasses import dataclass, field +from datetime import datetime, timezone +from typing import Any + + +@dataclass +class LatencyStats: + """Statistical summary of latency measurements.""" + + p50_ms: float + p90_ms: float + p95_ms: float + p99_ms: float + max_ms: float + min_ms: float + mean_ms: float + stdev_ms: float + sample_count: int + + @classmethod + def from_samples(cls, samples: list[float]) -> LatencyStats | None: + """Compute statistics from raw latency samples (in ms).""" + if not samples: + return None + + sorted_samples = sorted(samples) + n = len(sorted_samples) + + def percentile(p: float) -> float: + idx = int(n * p / 100) + return sorted_samples[min(idx, n - 1)] + + return cls( + p50_ms=percentile(50), + p90_ms=percentile(90), + p95_ms=percentile(95), + p99_ms=percentile(99), + max_ms=sorted_samples[-1], + min_ms=sorted_samples[0], + mean_ms=statistics.mean(sorted_samples), + stdev_ms=statistics.stdev(sorted_samples) if n > 1 else 0.0, + sample_count=n, + ) + + def to_dict(self) -> dict[str, float | int]: + """Convert to JSON-serializable dict.""" + return { + "p50_ms": round(self.p50_ms, 3), + "p90_ms": round(self.p90_ms, 3), + "p95_ms": round(self.p95_ms, 3), + "p99_ms": round(self.p99_ms, 3), + "max_ms": round(self.max_ms, 3), + "min_ms": round(self.min_ms, 3), + "mean_ms": round(self.mean_ms, 3), + "stdev_ms": round(self.stdev_ms, 
3), + "sample_count": self.sample_count, + } + + +@dataclass +class MemoryStats: + """Memory usage statistics.""" + + baseline_mb: float + peak_mb: float + final_mb: float + growth_mb: float + slope_mb_per_sec: float + samples: list[tuple[float, float]] # (elapsed_sec, rss_mb) + + @classmethod + def from_samples( + cls, + samples: list[tuple[float, float]], + baseline_mb: float, + final_mb: float, + ) -> MemoryStats: + """Compute statistics from time-series memory samples.""" + if not samples: + return cls( + baseline_mb=baseline_mb, + peak_mb=baseline_mb, + final_mb=final_mb, + growth_mb=0.0, + slope_mb_per_sec=0.0, + samples=[], + ) + + peak_mb = max(s[1] for s in samples) + growth_mb = peak_mb - baseline_mb + + # Linear regression for slope + slope = 0.0 + if len(samples) >= 2: + x_vals = [s[0] for s in samples] + y_vals = [s[1] for s in samples] + x_mean = statistics.mean(x_vals) + y_mean = statistics.mean(y_vals) + numerator = sum((x - x_mean) * (y - y_mean) for x, y in zip(x_vals, y_vals)) + denominator = sum((x - x_mean) ** 2 for x in x_vals) + if denominator > 0: + slope = numerator / denominator + + return cls( + baseline_mb=baseline_mb, + peak_mb=peak_mb, + final_mb=final_mb, + growth_mb=growth_mb, + slope_mb_per_sec=slope, + samples=samples, + ) + + def to_dict(self) -> dict[str, Any]: + """Convert to JSON-serializable dict.""" + return { + "baseline_mb": round(self.baseline_mb, 2), + "peak_mb": round(self.peak_mb, 2), + "final_mb": round(self.final_mb, 2), + "growth_mb": round(self.growth_mb, 2), + "slope_mb_per_sec": round(self.slope_mb_per_sec, 4), + "samples": [[round(t, 2), round(m, 2)] for t, m in self.samples], + } + + +@dataclass +class ThroughputStats: + """Throughput statistics.""" + + aggregate_events_per_sec: float + per_client_events_per_sec: float + total_events: int + total_duration_sec: float + client_count: int + + def to_dict(self) -> dict[str, float | int]: + """Convert to JSON-serializable dict.""" + return { + 
"aggregate_events_per_sec": round(self.aggregate_events_per_sec, 2), + "per_client_events_per_sec": round(self.per_client_events_per_sec, 2), + "total_events": self.total_events, + "total_duration_sec": round(self.total_duration_sec, 2), + "client_count": self.client_count, + } + + +@dataclass +class ReliabilityStats: + """Connection reliability statistics.""" + + successful_connections: int + failed_connections: int + error_rate: float + errors: list[str] + + def to_dict(self) -> dict[str, Any]: + """Convert to JSON-serializable dict.""" + return { + "successful_connections": self.successful_connections, + "failed_connections": self.failed_connections, + "error_rate": round(self.error_rate, 4), + "errors": self.errors[:10], # Limit to first 10 errors + } + + +@dataclass +class SSEInternals: + """SSE library internal state (Issue #152 validation).""" + + watcher_started: bool + peak_registered_events: int + final_registered_events: int + + def to_dict(self) -> dict[str, Any]: + """Convert to JSON-serializable dict.""" + return { + "watcher_started": self.watcher_started, + "peak_registered_events": self.peak_registered_events, + "final_registered_events": self.final_registered_events, + } + + +@dataclass +class TestReport: + """Complete performance report for a single test.""" + + test_name: str + timestamp: str + git_commit: str + git_branch: str + scale: int + duration_minutes: int + + # Metrics (optional based on test type) + latency: LatencyStats | None = None + ttfe: LatencyStats | None = None + throughput: ThroughputStats | None = None + memory: MemoryStats | None = None + reliability: ReliabilityStats | None = None + sse_internals: SSEInternals | None = None + + # Comparison results (populated by BaselineManager) + comparison: dict[str, float] | None = None + + def to_dict(self) -> dict[str, Any]: + """Convert to JSON-serializable dict.""" + result: dict[str, Any] = { + "metadata": { + "test_name": self.test_name, + "timestamp": self.timestamp, + 
"git_commit": self.git_commit, + "git_branch": self.git_branch, + "scale": self.scale, + "duration_minutes": self.duration_minutes, + } + } + + if self.latency: + result["latency"] = self.latency.to_dict() + if self.ttfe: + result["ttfe"] = self.ttfe.to_dict() + if self.throughput: + result["throughput"] = self.throughput.to_dict() + if self.memory: + result["memory"] = self.memory.to_dict() + if self.reliability: + result["reliability"] = self.reliability.to_dict() + if self.sse_internals: + result["sse_internals"] = self.sse_internals.to_dict() + if self.comparison: + result["comparison"] = self.comparison + + return result + + +def _get_git_info() -> tuple[str, str]: + """Get current git commit and branch.""" + try: + commit = subprocess.run( + ["git", "rev-parse", "--short", "HEAD"], + capture_output=True, + text=True, + check=True, + ).stdout.strip() + except subprocess.CalledProcessError: + commit = "unknown" + + try: + branch = subprocess.run( + ["git", "rev-parse", "--abbrev-ref", "HEAD"], + capture_output=True, + text=True, + check=True, + ).stdout.strip() + except subprocess.CalledProcessError: + branch = "unknown" + + return commit, branch + + +@dataclass +class MetricsCollector: + """Collects performance metrics during test execution.""" + + # Latency samples (milliseconds) + latency_samples: list[float] = field(default_factory=list) + ttfe_samples: list[float] = field(default_factory=list) + + # Memory samples (elapsed_sec, rss_mb) + memory_samples: list[tuple[float, float]] = field(default_factory=list) + memory_baseline_mb: float = 0.0 + memory_final_mb: float = 0.0 + + # Throughput tracking + events_per_client: list[int] = field(default_factory=list) + total_duration_sec: float = 0.0 + + # Reliability + successful_connections: int = 0 + failed_connections: int = 0 + errors: list[str] = field(default_factory=list) + + # SSE internals + watcher_started: bool = False + peak_registered_events: int = 0 + final_registered_events: int = 0 + + # Internal 
timing + _start_time: float = field(default_factory=time.perf_counter) + + def add_latency_sample(self, ms: float) -> None: + """Record an inter-event latency sample.""" + self.latency_samples.append(ms) + + def add_ttfe_sample(self, ms: float) -> None: + """Record a time-to-first-event sample.""" + self.ttfe_samples.append(ms) + + def add_memory_sample(self, rss_mb: float) -> None: + """Record a memory usage sample with timestamp.""" + elapsed = time.perf_counter() - self._start_time + self.memory_samples.append((elapsed, rss_mb)) + + def set_memory_baseline(self, rss_mb: float) -> None: + """Set the baseline memory before test starts.""" + self.memory_baseline_mb = rss_mb + + def set_memory_final(self, rss_mb: float) -> None: + """Set the final memory after test completes.""" + self.memory_final_mb = rss_mb + + def add_client_events(self, count: int) -> None: + """Record events received by a client.""" + self.events_per_client.append(count) + + def set_duration(self, seconds: float) -> None: + """Set total test duration.""" + self.total_duration_sec = seconds + + def record_success(self) -> None: + """Record a successful connection.""" + self.successful_connections += 1 + + def record_failure(self, error: str) -> None: + """Record a failed connection.""" + self.failed_connections += 1 + self.errors.append(error) + + def set_sse_internals( + self, watcher_started: bool, peak_events: int, final_events: int + ) -> None: + """Record SSE library internal state.""" + self.watcher_started = watcher_started + self.peak_registered_events = peak_events + self.final_registered_events = final_events + + def compute_report( + self, test_name: str, scale: int, duration_minutes: int + ) -> TestReport: + """Compute final report from collected samples.""" + git_commit, git_branch = _get_git_info() + timestamp = datetime.now(timezone.utc).isoformat() + + # Compute latency stats + latency = LatencyStats.from_samples(self.latency_samples) + ttfe = 
LatencyStats.from_samples(self.ttfe_samples) + + # Compute memory stats + memory = None + if self.memory_samples or self.memory_baseline_mb > 0: + memory = MemoryStats.from_samples( + self.memory_samples, + self.memory_baseline_mb, + self.memory_final_mb, + ) + + # Compute throughput stats + throughput = None + if self.events_per_client and self.total_duration_sec > 0: + total_events = sum(self.events_per_client) + client_count = len(self.events_per_client) + throughput = ThroughputStats( + aggregate_events_per_sec=total_events / self.total_duration_sec, + per_client_events_per_sec=( + (total_events / client_count / self.total_duration_sec) + if client_count > 0 + else 0.0 + ), + total_events=total_events, + total_duration_sec=self.total_duration_sec, + client_count=client_count, + ) + + # Compute reliability stats + total_connections = self.successful_connections + self.failed_connections + reliability = None + if total_connections > 0: + reliability = ReliabilityStats( + successful_connections=self.successful_connections, + failed_connections=self.failed_connections, + error_rate=( + self.failed_connections / total_connections + if total_connections > 0 + else 0.0 + ), + errors=self.errors, + ) + + # SSE internals + sse_internals = None + if self.peak_registered_events > 0 or self.watcher_started: + sse_internals = SSEInternals( + watcher_started=self.watcher_started, + peak_registered_events=self.peak_registered_events, + final_registered_events=self.final_registered_events, + ) + + return TestReport( + test_name=test_name, + timestamp=timestamp, + git_commit=git_commit, + git_branch=git_branch, + scale=scale, + duration_minutes=duration_minutes, + latency=latency, + ttfe=ttfe, + throughput=throughput, + memory=memory, + reliability=reliability, + sse_internals=sse_internals, + ) diff --git a/tests/load/reporter.py b/tests/load/reporter.py new file mode 100644 index 0000000..ac873fc --- /dev/null +++ b/tests/load/reporter.py @@ -0,0 +1,615 @@ +""" +Report 
generation for load test results. + +Produces JSON and HTML reports with inline SVG charts. +""" + +from __future__ import annotations + +import html +import json +from pathlib import Path +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from .baseline import ComparisonResult + from .metrics import ( + LatencyStats, + MemoryStats, + ReliabilityStats, + TestReport, + ThroughputStats, + ) + + +class ReportGenerator: + """Generates JSON and HTML reports from test results.""" + + def __init__(self, output_dir: Path | str = "tests/load/results"): + self.output_dir = Path(output_dir) + + def _report_path(self, test_name: str, ext: str) -> Path: + """Get path to report file.""" + safe_name = test_name.replace("::", "_").replace("/", "_").replace("\\", "_") + return self.output_dir / f"{safe_name}.{ext}" + + def save_json(self, report: TestReport) -> Path: + """Save report as JSON.""" + self.output_dir.mkdir(parents=True, exist_ok=True) + path = self._report_path(report.test_name, "json") + + with open(path, "w") as f: + json.dump(report.to_dict(), f, indent=2) + + return path + + def save_html( + self, report: TestReport, comparison: ComparisonResult | None = None + ) -> Path: + """Save report as HTML with inline SVG charts.""" + self.output_dir.mkdir(parents=True, exist_ok=True) + path = self._report_path(report.test_name, "html") + + html_content = self._render_html(report, comparison) + with open(path, "w") as f: + f.write(html_content) + + return path + + def print_summary( + self, report: TestReport, comparison: ComparisonResult | None = None + ) -> None: + """Print summary to console.""" + sep = "=" * 70 + print(f"\n{sep}") + print(f" SSE Load Test Results: {report.test_name}") + print(sep) + print( + f"Run: {report.timestamp} | commit: {report.git_commit} | " + f"branch: {report.git_branch}" + ) + print( + f"Scale: {report.scale} connections | Duration: {report.duration_minutes} min" + ) + print() + + # Latency + if report.latency: + print("LATENCY 
(inter-event)") + self._print_latency_line( + " p50:", report.latency.p50_ms, comparison, "latency_p50" + ) + print(f" p95: {report.latency.p95_ms:.1f} ms") + self._print_latency_line( + " p99:", report.latency.p99_ms, comparison, "latency_p99" + ) + print(f" max: {report.latency.max_ms:.1f} ms") + print() + + # TTFE + if report.ttfe: + print("TIME TO FIRST EVENT") + print(f" p50: {report.ttfe.p50_ms:.1f} ms") + self._print_latency_line( + " p99:", report.ttfe.p99_ms, comparison, "ttfe_p99" + ) + print() + + # Throughput + if report.throughput: + print("THROUGHPUT") + self._print_throughput_line( + report.throughput.aggregate_events_per_sec, comparison + ) + print( + f" Per client: {report.throughput.per_client_events_per_sec:.1f} events/sec" + ) + print() + + # Memory + if report.memory: + print("MEMORY") + print(f" Baseline: {report.memory.baseline_mb:.1f} MB") + print(f" Peak: {report.memory.peak_mb:.1f} MB") + self._print_memory_line( + " Growth:", report.memory.growth_mb, comparison, "memory_growth" + ) + self._print_slope_line(report.memory.slope_mb_per_sec) + print() + + # Reliability + if report.reliability: + total = ( + report.reliability.successful_connections + + report.reliability.failed_connections + ) + pct = ( + report.reliability.successful_connections / total * 100 + if total > 0 + else 0 + ) + print("RELIABILITY") + print( + f" Successful: {report.reliability.successful_connections}/{total} ({pct:.1f}%)" + ) + if report.reliability.errors: + print(f" Errors: {len(report.reliability.errors)}") + print() + + # Comparison summary + if comparison and (comparison.regression_reasons or comparison.warnings): + if comparison.regression_detected: + print("REGRESSIONS DETECTED:") + for reason in comparison.regression_reasons or []: + print(f" - {reason}") + if comparison.warnings: + print("WARNINGS:") + for warning in comparison.warnings: + print(f" - {warning}") + print() + + print(sep) + result = "PASS" + if comparison and comparison.regression_detected: 
+ result = "FAIL (regression detected)" + elif comparison and comparison.warnings: + result = f"PASS ({len(comparison.warnings)} warnings)" + print(f"Result: {result}") + print(sep + "\n") + + def _print_latency_line( + self, + label: str, + value: float, + comparison: ComparisonResult | None, + key: str, + ) -> None: + """Print latency line with optional comparison.""" + line = f"{label} {value:.1f} ms" + if comparison: + change = getattr(comparison, f"{key}_change_pct", None) + if change is not None: + symbol = "+" if change > 0 else "" + indicator = "!" if abs(change) > 20 else "" + line += f" ({symbol}{change:.1f}% vs baseline) {indicator}" + print(line) + + def _print_throughput_line( + self, value: float, comparison: ComparisonResult | None + ) -> None: + """Print throughput line with optional comparison.""" + line = f" Aggregate: {value:,.0f} events/sec" + if comparison and comparison.throughput_change_pct is not None: + change = comparison.throughput_change_pct + symbol = "+" if change > 0 else "" + indicator = "!" if change < -20 else "" + line += f" ({symbol}{change:.1f}% vs baseline) {indicator}" + print(line) + + def _print_memory_line( + self, + label: str, + value: float, + comparison: ComparisonResult | None, + key: str, + ) -> None: + """Print memory line with optional comparison.""" + line = f"{label} {value:.1f} MB" + if comparison: + change = getattr(comparison, f"{key}_change_pct", None) + if change is not None: + symbol = "+" if change > 0 else "" + indicator = "!" if change > 50 else "" + line += f" ({symbol}{change:.1f}% vs baseline) {indicator}" + print(line) + + def _print_slope_line(self, slope: float) -> None: + """Print memory slope line.""" + indicator = "!" 
if slope > 0.1 else "" + print(f" Slope: {slope:.3f} MB/sec {indicator}") + + def _render_html( + self, report: TestReport, comparison: ComparisonResult | None + ) -> str: + """Render full HTML report.""" + charts_html = "" + + # Latency histogram + if report.latency: + charts_html += self._render_section( + "Latency Distribution", + self._render_latency_summary(report.latency, comparison), + ) + + # TTFE stats + if report.ttfe: + charts_html += self._render_section( + "Time to First Event", + self._render_ttfe_summary(report.ttfe, comparison), + ) + + # Memory chart + if report.memory and report.memory.samples: + charts_html += self._render_section( + "Memory Usage Over Time", + self._render_memory_chart(report.memory) + + self._render_memory_summary(report.memory, comparison), + ) + + # Throughput + if report.throughput: + charts_html += self._render_section( + "Throughput", + self._render_throughput_summary(report.throughput, comparison), + ) + + # Reliability + if report.reliability: + charts_html += self._render_section( + "Reliability", + self._render_reliability_summary(report.reliability), + ) + + # Comparison + comparison_html = "" + if comparison and (comparison.regression_reasons or comparison.warnings): + comparison_html = self._render_comparison_section(comparison) + + return f""" + + + + + Load Test Report: {html.escape(report.test_name)} + + + +
+

{html.escape(report.test_name)}

+ +
+ + {comparison_html} + {charts_html} + + +""" + + def _render_section(self, title: str, content: str) -> str: + """Render a section with title.""" + return f""" +
+

{html.escape(title)}

+ {content} +
+""" + + def _render_latency_summary( + self, stats: LatencyStats, comparison: ComparisonResult | None + ) -> str: + """Render latency summary table.""" + p99_change = "" + if comparison and comparison.latency_p99_change_pct is not None: + cls = "positive" if comparison.latency_p99_change_pct > 0 else "negative" + sign = "+" if comparison.latency_p99_change_pct > 0 else "" + p99_change = ( + f'' + f"({sign}{comparison.latency_p99_change_pct:.1f}%)" + ) + + return f""" + + + + + + + + + + +
PercentileValue
p50{stats.p50_ms:.2f} ms
p90{stats.p90_ms:.2f} ms
p95{stats.p95_ms:.2f} ms
p99{stats.p99_ms:.2f} ms {p99_change}
max{stats.max_ms:.2f} ms
mean{stats.mean_ms:.2f} ms
stdev{stats.stdev_ms:.2f} ms
samples{stats.sample_count:,}
+""" + + def _render_ttfe_summary( + self, stats: LatencyStats, comparison: ComparisonResult | None + ) -> str: + """Render TTFE summary table.""" + p99_change = "" + if comparison and comparison.ttfe_p99_change_pct is not None: + cls = "positive" if comparison.ttfe_p99_change_pct > 0 else "negative" + sign = "+" if comparison.ttfe_p99_change_pct > 0 else "" + p99_change = ( + f'' + f"({sign}{comparison.ttfe_p99_change_pct:.1f}%)" + ) + + return f""" + + + + + + +
MetricValue
p50{stats.p50_ms:.1f} ms
p99{stats.p99_ms:.1f} ms {p99_change}
max{stats.max_ms:.1f} ms
samples{stats.sample_count:,}
+""" + + def _render_memory_chart(self, memory: MemoryStats) -> str: + """Render SVG line chart for memory over time.""" + if not memory.samples: + return "" + + # Chart dimensions + width = 600 + height = 200 + padding = 40 + + times = [s[0] for s in memory.samples] + values = [s[1] for s in memory.samples] + + if not times or len(times) < 2: + return "" + + x_min, x_max = min(times), max(times) + y_min = min(values) * 0.9 + y_max = max(values) * 1.1 + + def scale_x(t: float) -> float: + if x_max == x_min: + return padding + return padding + (t - x_min) / (x_max - x_min) * (width - 2 * padding) + + def scale_y(v: float) -> float: + if y_max == y_min: + return height - padding + return ( + height + - padding + - (v - y_min) / (y_max - y_min) * (height - 2 * padding) + ) + + # Generate path + points = [f"{scale_x(t):.1f},{scale_y(v):.1f}" for t, v in memory.samples] + path_d = "M " + " L ".join(points) + + # Generate axis labels + y_labels = "" + for i in range(5): + y_val = y_min + (y_max - y_min) * i / 4 + y_pos = scale_y(y_val) + y_labels += f'{y_val:.0f}' + + x_labels = "" + for i in range(5): + x_val = x_min + (x_max - x_min) * i / 4 + x_pos = scale_x(x_val) + x_labels += f'{x_val:.0f}s' + + return f""" +
+ + + + + + + Memory (MB) + + + {y_labels} + {x_labels} + + + + + + + +
+""" + + def _render_memory_summary( + self, memory: MemoryStats, comparison: ComparisonResult | None + ) -> str: + """Render memory summary table.""" + growth_change = "" + if comparison and comparison.memory_growth_change_pct is not None: + cls = "positive" if comparison.memory_growth_change_pct > 0 else "negative" + sign = "+" if comparison.memory_growth_change_pct > 0 else "" + growth_change = ( + f'' + f"({sign}{comparison.memory_growth_change_pct:.1f}%)" + ) + + return f""" + + + + + + + +
MetricValue
Baseline{memory.baseline_mb:.1f} MB
Peak{memory.peak_mb:.1f} MB
Final{memory.final_mb:.1f} MB
Growth{memory.growth_mb:.1f} MB {growth_change}
Slope{memory.slope_mb_per_sec:.4f} MB/sec
+""" + + def _render_throughput_summary( + self, throughput: "ThroughputStats", comparison: ComparisonResult | None + ) -> str: + """Render throughput summary table.""" + from .metrics import ThroughputStats + + if not isinstance(throughput, ThroughputStats): + return "" + + change = "" + if comparison and comparison.throughput_change_pct is not None: + cls = "negative" if comparison.throughput_change_pct < 0 else "positive" + sign = "+" if comparison.throughput_change_pct > 0 else "" + change = ( + f'' + f"({sign}{comparison.throughput_change_pct:.1f}%)" + ) + + return f""" + + + + + + + +
MetricValue
Aggregate{throughput.aggregate_events_per_sec:,.0f} events/sec {change}
Per Client{throughput.per_client_events_per_sec:.1f} events/sec
Total Events{throughput.total_events:,}
Duration{throughput.total_duration_sec:.1f} sec
Clients{throughput.client_count:,}
+""" + + def _render_reliability_summary(self, reliability: "ReliabilityStats") -> str: + """Render reliability summary.""" + from .metrics import ReliabilityStats + + if not isinstance(reliability, ReliabilityStats): + return "" + + total = reliability.successful_connections + reliability.failed_connections + pct = reliability.successful_connections / total * 100 if total > 0 else 0 + + errors_html = "" + if reliability.errors: + error_items = "".join( + f"
  • {html.escape(e)}
  • " for e in reliability.errors[:10] + ) + errors_html = f"
      {error_items}
    " + + return f""" + + + + + +
    MetricValue
    Successful{reliability.successful_connections:,} / {total:,} ({pct:.1f}%)
    Failed{reliability.failed_connections:,}
    Error Rate{reliability.error_rate * 100:.2f}%
    + {errors_html} +""" + + def _render_comparison_section(self, comparison: ComparisonResult) -> str: + """Render comparison alerts section.""" + content = "" + + if comparison.regression_reasons: + reasons = "".join( + f"
  • {html.escape(r)}
  • " for r in comparison.regression_reasons + ) + content += f""" +
    + Regressions Detected +
      {reasons}
    +
    +""" + + if comparison.warnings: + warnings = "".join( + f"
  • {html.escape(w)}
  • " for w in comparison.warnings + ) + content += f""" +
    + Warnings +
      {warnings}
    +
    +""" + + if comparison.baseline_commit: + content += f""" +
    + Compared against baseline: {html.escape(comparison.baseline_commit)} + ({html.escape(comparison.baseline_timestamp or 'unknown')}) +
    +""" + + return content diff --git a/tests/load/test_backpressure.py b/tests/load/test_backpressure.py index 94f111b..024adf6 100644 --- a/tests/load/test_backpressure.py +++ b/tests/load/test_backpressure.py @@ -1,30 +1,74 @@ """ Backpressure and slow client tests. -Verifies server handles slow consumers correctly without affecting fast clients. +This module verifies the server handles mixed client speeds correctly: +- Slow consumers don't block fast consumers (per-connection isolation) +- Rapid connection churn doesn't exhaust resources +- send_timeout properly disconnects frozen clients + +SSE servers must handle heterogeneous clients. A slow consumer (mobile on 2G) +shouldn't cause head-of-line blocking for fast consumers (desktop on fiber). """ +from __future__ import annotations + import asyncio import time -from typing import Tuple import httpx import pytest from httpx_sse import aconnect_sse +from .baseline import BaselineManager +from .conftest import register_test_report +from .metrics import MetricsCollector +from .reporter import ReportGenerator + @pytest.mark.loadtest async def test_slow_clients_dont_block_fast_clients( sse_server_url: str, + scale: int, + duration_minutes: int, + metrics_collector: MetricsCollector, + baseline_manager: BaselineManager, + report_generator: ReportGenerator, + update_baseline: bool, + fail_on_regression: bool, ) -> None: """ - Slow clients should not affect throughput of fast clients. - - Tests that the server properly handles mixed client speeds. + Verify slow consumers don't throttle fast consumers (connection isolation). 
+ + ## What is Measured + - Event count for "fast" clients (consume immediately) + - Event count for "slow" clients (0.5s processing delay per event) + - Ratio between fast and slow throughput + + ## Why This Matters + Tests per-connection isolation in the send path: + - Each connection has its own anyio.Lock for sends + - Slow client's blocked send() doesn't block other connections + - No shared buffers that could cause head-of-line blocking + + Without isolation, a single slow client could stall all other streams, + making the server unusable under mixed load. + + ## Methodology + 1. Connect 10 "fast" clients (consume events immediately) + 2. Connect 10 "slow" clients (sleep 0.5s after each event) + 3. Run for 10 seconds + 4. Compare event counts + + ## Pass Criteria + - Fast clients avg > slow clients avg * 5 (isolation works) + - Fast clients avg > 500 events (not throttled by slow clients) + - Rationale: With 10ms delay, fast clients should receive ~1000 events. + Slow clients receive ~20 (10s / 0.5s). 5x ratio is conservative. + 500 events threshold catches severe throttling. 
""" test_duration = 10 # seconds - async def fast_client() -> int: + async def fast_client() -> tuple[int, str | None]: """Client that consumes events as fast as possible.""" count = 0 start = time.perf_counter() @@ -37,11 +81,11 @@ async def fast_client() -> int: count += 1 if time.perf_counter() - start >= test_duration: break - except Exception: - pass - return count + return count, None + except Exception as e: + return count, str(e) - async def slow_client() -> int: + async def slow_client() -> tuple[int, str | None]: """Client that reads slowly (simulating processing delay).""" count = 0 start = time.perf_counter() @@ -55,9 +99,11 @@ async def slow_client() -> int: count += 1 if time.perf_counter() - start >= test_duration: break - except Exception: - pass - return count + return count, None + except Exception as e: + return count, str(e) + + start_time = time.perf_counter() # Mix of fast and slow clients fast_tasks = [asyncio.create_task(fast_client()) for _ in range(10)] @@ -66,17 +112,63 @@ async def slow_client() -> int: fast_results = await asyncio.gather(*fast_tasks) slow_results = await asyncio.gather(*slow_tasks) - avg_fast = sum(fast_results) / len(fast_results) - avg_slow = sum(slow_results) / len(slow_results) + elapsed = time.perf_counter() - start_time + metrics_collector.set_duration(elapsed) + + # Process results + fast_counts: list[int] = [] + slow_counts: list[int] = [] + + for result in fast_results: + if isinstance(result, tuple): + count, error = result + fast_counts.append(count) + metrics_collector.add_client_events(count) + if error: + metrics_collector.record_failure(error) + else: + metrics_collector.record_success() + + for result in slow_results: + if isinstance(result, tuple): + count, error = result + slow_counts.append(count) + metrics_collector.add_client_events(count) + if error: + metrics_collector.record_failure(error) + else: + metrics_collector.record_success() + + avg_fast = sum(fast_counts) / len(fast_counts) if 
fast_counts else 0 + avg_slow = sum(slow_counts) / len(slow_counts) if slow_counts else 0 + + # Generate report + report = metrics_collector.compute_report( + test_name="test_slow_clients_dont_block_fast_clients", + scale=20, # 10 fast + 10 slow + duration_minutes=duration_minutes, + ) + register_test_report(report) + + # Compare and output + comparison = baseline_manager.compare(report) + report.comparison = comparison.to_dict() if comparison else None - # Fast clients should receive significantly more events + report_generator.save_json(report) + report_generator.save_html(report, comparison) + report_generator.print_summary(report, comparison) + + if update_baseline: + baseline_manager.save_baseline(report) + + if fail_on_regression and comparison and comparison.regression_detected: + pytest.fail(f"Regression detected: {comparison.regression_reasons}") + + # Original assertions assert avg_fast > avg_slow * 5, ( f"Fast clients ({avg_fast:.0f} events) should be much faster than " f"slow clients ({avg_slow:.0f} events)" ) - - # Fast clients should not be severely throttled - # With 0.01s delay, should get ~1000 events in 10s assert ( avg_fast > 500 ), f"Fast clients throttled: {avg_fast:.0f} events, expected > 500" @@ -86,27 +178,60 @@ async def slow_client() -> int: async def test_connection_churn_stability( sse_server_url: str, scale: int, + duration_minutes: int, + metrics_collector: MetricsCollector, + baseline_manager: BaselineManager, + report_generator: ReportGenerator, + update_baseline: bool, + fail_on_regression: bool, ) -> None: """ - Rapid connect/disconnect should not cause resource exhaustion. - - Tests cleanup under high churn rate. + Verify rapid connect/disconnect doesn't exhaust file descriptors or memory. 
+ + ## What is Measured + - File descriptor count before and after churn + - Memory (RSS) before and after churn + - Connection success rate during churn + + ## Why This Matters + Tests resource cleanup under high connection churn: + - Sockets properly closed on disconnect + - Task references released after completion + - No accumulation of leaked resources + + In production, clients frequently reconnect (mobile network switches, + browser tab refresh). Resource leaks under churn cause eventual exhaustion. + + ## Methodology + 1. Record baseline FDs and memory + 2. Create `churn_rate` connections per second for 30 seconds + 3. Each connection receives one event and disconnects + 4. Sample memory every 5 seconds + 5. Record final FDs and memory + + ## Pass Criteria + - FD growth < 50 (no socket leaks) + - Memory growth < 100MB (no major retention) + - Success rate > 90% (server stays responsive under churn) + - Rationale: 50 FDs allows for some timing variance in cleanup. + 100MB memory is generous but catches runaway allocation. + 90% success rate accounts for expected failures under heavy churn. 
""" churn_rate = min(100, scale) # connections per second duration = 30 # seconds total_connections = churn_rate * duration - async def quick_connection() -> bool: + async def quick_connection() -> tuple[bool, str | None]: try: async with httpx.AsyncClient(timeout=5.0) as client: async with aconnect_sse( client, "GET", f"{sse_server_url}/sse?delay=0" ) as source: async for _ in source.aiter_sse(): - return True - except Exception: - return False - return False + return True, None + except Exception as e: + return False, str(e) + return False, "no events" # Get baseline metrics async with httpx.AsyncClient() as client: @@ -114,23 +239,72 @@ async def quick_connection() -> bool: baseline_fds = baseline.get("num_fds", 0) baseline_memory = baseline["memory_rss_mb"] + metrics_collector.set_memory_baseline(baseline_memory) + + start_time = time.perf_counter() # Create connections at target rate successful = 0 for batch in range(duration): tasks = [asyncio.create_task(quick_connection()) for _ in range(churn_rate)] results = await asyncio.gather(*tasks, return_exceptions=True) - successful += sum(1 for r in results if r is True) + + for result in results: + if isinstance(result, Exception): + metrics_collector.record_failure(str(result)) + elif isinstance(result, tuple): + success, error = result + if success: + metrics_collector.record_success() + successful += 1 + else: + metrics_collector.record_failure(error or "unknown") + + # Sample memory periodically + if batch % 5 == 0: + try: + async with httpx.AsyncClient() as client: + metrics = (await client.get(f"{sse_server_url}/metrics")).json() + metrics_collector.add_memory_sample(metrics["memory_rss_mb"]) + except Exception: + pass + await asyncio.sleep(0.5) # Allow some cleanup + elapsed = time.perf_counter() - start_time + metrics_collector.set_duration(elapsed) + # Get final metrics async with httpx.AsyncClient() as client: final = (await client.get(f"{sse_server_url}/metrics")).json() final_fds = 
final.get("num_fds", 0) final_memory = final["memory_rss_mb"] + metrics_collector.set_memory_final(final_memory) + + # Generate report + report = metrics_collector.compute_report( + test_name="test_connection_churn_stability", + scale=scale, + duration_minutes=duration_minutes, + ) + register_test_report(report) + + # Compare and output + comparison = baseline_manager.compare(report) + report.comparison = comparison.to_dict() if comparison else None + + report_generator.save_json(report) + report_generator.save_html(report, comparison) + report_generator.print_summary(report, comparison) - # File descriptors should return to baseline + if update_baseline: + baseline_manager.save_baseline(report) + + if fail_on_regression and comparison and comparison.regression_detected: + pytest.fail(f"Regression detected: {comparison.regression_reasons}") + + # Original assertions if baseline_fds > 0 and final_fds > 0: fd_growth = final_fds - baseline_fds assert fd_growth < 50, ( @@ -138,13 +312,11 @@ async def quick_connection() -> bool: f"connections" ) - # Memory should not grow excessively memory_growth = final_memory - baseline_memory assert ( memory_growth < 100 ), f"Memory grew by {memory_growth:.1f}MB during churn test" - # Success rate should be high success_rate = successful / total_connections if total_connections > 0 else 0 assert success_rate > 0.9, ( f"Low success rate during churn: {success_rate:.1%} " @@ -153,14 +325,48 @@ async def quick_connection() -> bool: @pytest.mark.loadtest -async def test_send_timeout_under_load(sse_server_url: str) -> None: +async def test_send_timeout_under_load( + sse_server_url: str, + scale: int, + duration_minutes: int, + metrics_collector: MetricsCollector, + baseline_manager: BaselineManager, + report_generator: ReportGenerator, + update_baseline: bool, + fail_on_regression: bool, +) -> None: """ - Verify send_timeout works correctly under load. - - Clients that stop reading should eventually be disconnected. 
+ Verify send_timeout disconnects frozen clients without blocking normal clients. + + ## What is Measured + - Event count for normal clients (should complete successfully) + - Outcome for "frozen" clients (stop reading after first event) + - Implicit: server responsiveness during frozen client handling + + ## Why This Matters + Tests the send_timeout feature: + - Frozen clients (stop reading but don't close connection) block the send() + - Without timeout, server thread/task hangs indefinitely + - With timeout, server detects blocked send and closes connection + - Normal clients should be unaffected by frozen clients + + This is critical for production resilience. Mobile clients frequently + "freeze" (backgrounded, network change) without closing connections. + + ## Methodology + 1. Connect 5 "frozen" clients (receive one event, then stop reading) + 2. Connect 3 "normal" clients (receive 50 events normally) + 3. Wait for normal clients to complete + 4. Verify normal clients weren't affected + + ## Pass Criteria + - Normal clients receive >= 45/50 events + - Rationale: Normal clients should complete unaffected. 45/50 allows + small margin for timing. If frozen clients blocked the server, + normal clients would timeout or receive far fewer events. 
""" - async def frozen_client() -> Tuple[str, float]: + async def frozen_client() -> tuple[str, float, str | None]: """Client that stops reading after first event (simulates frozen client).""" start = time.perf_counter() try: @@ -173,16 +379,16 @@ async def frozen_client() -> Tuple[str, float]: await asyncio.sleep(60) # Will be interrupted by timeout break except httpx.ReadTimeout: - return "timeout", time.perf_counter() - start + return "timeout", time.perf_counter() - start, None except Exception as e: - return f"error:{type(e).__name__}", time.perf_counter() - start - return "completed", time.perf_counter() - start + return f"error:{type(e).__name__}", time.perf_counter() - start, str(e) + return "completed", time.perf_counter() - start, None # Start some frozen clients (server has default send_timeout) - tasks = [asyncio.create_task(frozen_client()) for _ in range(5)] + frozen_tasks = [asyncio.create_task(frozen_client()) for _ in range(5)] # Also verify server remains responsive with normal clients - async def normal_client() -> int: + async def normal_client() -> tuple[int, str | None]: count = 0 try: async with httpx.AsyncClient(timeout=30.0) as client: @@ -193,23 +399,68 @@ async def normal_client() -> int: count += 1 if count >= 50: break - except Exception: - pass - return count + return count, None + except Exception as e: + return count, str(e) normal_tasks = [asyncio.create_task(normal_client()) for _ in range(3)] # Wait for normal clients to complete normal_results = await asyncio.gather(*normal_tasks) + # Process normal client results + normal_counts: list[int] = [] + for result in normal_results: + if isinstance(result, tuple): + count, error = result + normal_counts.append(count) + metrics_collector.add_client_events(count) + if error: + metrics_collector.record_failure(error) + else: + metrics_collector.record_success() + # Cancel frozen clients if still running - for task in tasks: + for task in frozen_tasks: if not task.done(): task.cancel() - 
await asyncio.gather(*tasks, return_exceptions=True) + frozen_results = await asyncio.gather(*frozen_tasks, return_exceptions=True) + + # Process frozen client results + for result in frozen_results: + if isinstance(result, Exception): + metrics_collector.record_failure(str(result)) + elif isinstance(result, tuple): + status, duration, error = result + if error: + metrics_collector.record_failure(error) + else: + metrics_collector.record_success() + + # Generate report + report = metrics_collector.compute_report( + test_name="test_send_timeout_under_load", + scale=8, # 5 frozen + 3 normal + duration_minutes=duration_minutes, + ) + register_test_report(report) + + # Compare and output + comparison = baseline_manager.compare(report) + report.comparison = comparison.to_dict() if comparison else None + + report_generator.save_json(report) + report_generator.save_html(report, comparison) + report_generator.print_summary(report, comparison) + + if update_baseline: + baseline_manager.save_baseline(report) + + if fail_on_regression and comparison and comparison.regression_detected: + pytest.fail(f"Regression detected: {comparison.regression_reasons}") - # Normal clients should have completed successfully + # Original assertion assert all( - r >= 45 for r in normal_results - ), f"Normal clients affected by frozen clients: {normal_results}" + r >= 45 for r in normal_counts + ), f"Normal clients affected by frozen clients: {normal_counts}" diff --git a/tests/load/test_memory_stability.py b/tests/load/test_memory_stability.py index ba23eea..883b8ad 100644 --- a/tests/load/test_memory_stability.py +++ b/tests/load/test_memory_stability.py @@ -1,34 +1,74 @@ """ Memory stability tests for sse-starlette under load. -Verifies no memory leaks during sustained SSE streaming with many concurrent connections. 
+This module detects memory leaks and resource accumulation in the SSE implementation: +- Memory growth during sustained streaming (leak detection) +- Memory reclamation after connections close (cleanup verification) +- Internal event set cleanup (Issue #152 regression test) + +Memory leaks in SSE are particularly insidious because they accumulate slowly +over days/weeks in production, eventually causing OOM kills. """ +from __future__ import annotations + import asyncio -import statistics -from typing import List import httpx import pytest from httpx_sse import aconnect_sse +from .baseline import BaselineManager +from .conftest import register_test_report +from .metrics import MetricsCollector +from .reporter import ReportGenerator + @pytest.mark.loadtest async def test_memory_stability_under_load( sse_server_url: str, scale: int, duration_minutes: int, + metrics_collector: MetricsCollector, + baseline_manager: BaselineManager, + report_generator: ReportGenerator, + update_baseline: bool, + fail_on_regression: bool, ) -> None: """ - Connect many clients, stream for duration, verify memory is stable. - - Pass criteria: - - Memory growth < 50MB over test duration - - No unbounded growth trend (linear regression slope < 0.1 MB/sec) + Verify memory remains stable during sustained SSE streaming. + + ## What is Measured + - RSS memory at start, during streaming, and at end + - Total memory growth (peak - baseline) + - Memory growth rate (linear regression slope over time samples) + + ## Why This Matters + Detects memory leaks in the EventSourceResponse lifecycle: + - Buffers not released after send + - Task references held after completion + - Event objects accumulating in queues + - Closure captures preventing garbage collection + + A small leak (e.g., 1KB/connection) becomes catastrophic with thousands of + connections over hours of operation. + + ## Methodology + 1. Record baseline memory before any connections + 2. 
Connect `scale` clients, each streaming for `duration_minutes` + 3. Sample memory periodically during streaming + 4. Compute total growth and growth rate (slope) + + ## Pass Criteria + - Memory growth < 50MB total + - Growth rate (slope) < 0.1 MB/sec + - Rationale: 50MB allows for legitimate per-connection overhead while + catching runaway leaks. The slope check catches slow leaks that might + stay under the absolute threshold but indicate unbounded growth. """ events_per_client = duration_minutes * 60 * 10 # 10 events/sec - async def client_task(client_id: int) -> int: + async def client_task(client_id: int) -> tuple[int, str | None]: """Single client consuming SSE events.""" events_received = 0 try: @@ -40,20 +80,20 @@ async def client_task(client_id: int) -> int: events_received += 1 if events_received >= events_per_client: break - except Exception: - pass # Connection errors during shutdown are expected - return events_received + return events_received, None + except Exception as e: + return events_received, str(e) # Get baseline memory async with httpx.AsyncClient() as client: baseline = (await client.get(f"{sse_server_url}/metrics")).json() baseline_memory = baseline["memory_rss_mb"] + metrics_collector.set_memory_baseline(baseline_memory) # Start all clients tasks = [asyncio.create_task(client_task(i)) for i in range(scale)] # Sample memory periodically - memory_samples: List[float] = [] sample_interval = max(10, duration_minutes * 6) # At least 10 samples for _ in range(sample_interval): @@ -61,64 +101,117 @@ async def client_task(client_id: int) -> int: try: async with httpx.AsyncClient() as client: metrics = (await client.get(f"{sse_server_url}/metrics")).json() - memory_samples.append(metrics["memory_rss_mb"]) + metrics_collector.add_memory_sample(metrics["memory_rss_mb"]) except Exception: pass # Server might be under heavy load # Wait for all clients to complete results = await asyncio.gather(*tasks, return_exceptions=True) - completed = sum(1 for r in 
results if isinstance(r, int)) + + # Process results + for result in results: + if isinstance(result, Exception): + metrics_collector.record_failure(str(result)) + elif isinstance(result, tuple): + events, error = result + metrics_collector.add_client_events(events) + if error: + metrics_collector.record_failure(error) + else: + metrics_collector.record_success() # Get final memory async with httpx.AsyncClient() as client: final = (await client.get(f"{sse_server_url}/metrics")).json() final_memory = final["memory_rss_mb"] + metrics_collector.set_memory_final(final_memory) - # Calculate memory growth - max_memory = max(memory_samples) if memory_samples else final_memory - memory_growth = max_memory - baseline_memory + # Set duration + metrics_collector.set_duration(duration_minutes * 60) - # Calculate growth trend (simple linear regression slope) - if len(memory_samples) >= 2: - x_mean = len(memory_samples) / 2 - y_mean = statistics.mean(memory_samples) - numerator = sum( - (i - x_mean) * (y - y_mean) for i, y in enumerate(memory_samples) - ) - denominator = sum((i - x_mean) ** 2 for i in range(len(memory_samples))) - slope = numerator / denominator if denominator else 0 - # Convert to MB/sec - sample_interval_sec = duration_minutes * 60 / len(memory_samples) - slope_per_sec = slope / sample_interval_sec - else: - slope_per_sec = 0 - - # Assert criteria + # Generate report + report = metrics_collector.compute_report( + test_name="test_memory_stability_under_load", + scale=scale, + duration_minutes=duration_minutes, + ) + register_test_report(report) + + # Compare and output + comparison = baseline_manager.compare(report) + report.comparison = comparison.to_dict() if comparison else None + + report_generator.save_json(report) + report_generator.save_html(report, comparison) + report_generator.print_summary(report, comparison) + + if update_baseline: + baseline_manager.save_baseline(report) + + if fail_on_regression and comparison and comparison.regression_detected: + 
pytest.fail(f"Regression detected: {comparison.regression_reasons}") + + # Original assertions + completed = metrics_collector.successful_connections assert ( completed >= scale * 0.9 ), f"Too many failed connections: {completed}/{scale} completed" - assert memory_growth < 50, ( - f"Memory grew by {memory_growth:.1f}MB (baseline: {baseline_memory:.1f}MB, " - f"max: {max_memory:.1f}MB), expected < 50MB" - ) - assert ( - slope_per_sec < 0.1 - ), f"Memory growth trend {slope_per_sec:.3f} MB/sec, expected < 0.1 MB/sec" + + if report.memory: + assert report.memory.growth_mb < 50, ( + f"Memory grew by {report.memory.growth_mb:.1f}MB " + f"(baseline: {baseline_memory:.1f}MB, peak: {report.memory.peak_mb:.1f}MB), " + f"expected < 50MB" + ) + assert report.memory.slope_mb_per_sec < 0.1, ( + f"Memory growth trend {report.memory.slope_mb_per_sec:.3f} MB/sec, " + f"expected < 0.1 MB/sec" + ) @pytest.mark.loadtest async def test_memory_returns_to_baseline_after_disconnect( sse_server_url: str, scale: int, + duration_minutes: int, + metrics_collector: MetricsCollector, + baseline_manager: BaselineManager, + report_generator: ReportGenerator, + update_baseline: bool, + fail_on_regression: bool, ) -> None: """ - Connect many clients, disconnect all, verify memory returns near baseline. - - Pass criteria: - - Memory within 20% of baseline after all connections close + Verify memory is reclaimed after all connections close. + + ## What is Measured + - Memory before any connections (baseline) + - Memory after all connections close (final) + - Delta as percentage of baseline + + ## Why This Matters + Complements the stability test by verifying cleanup: + - Task references properly released + - anyio.Event objects garbage collected + - No lingering closures or callbacks + - Thread-local state cleared + + Even if memory doesn't grow during streaming, retained references after + disconnect indicate a leak that will accumulate across connection cycles. + + ## Methodology + 1. 
Record baseline memory + 2. Connect clients in batches, each receiving 50 events then disconnecting + 3. Wait 2 seconds for cleanup (GC, finalizers) + 4. Record final memory and compare to baseline + + ## Pass Criteria + - Final memory <= baseline * 1.2 (20% margin) + - Rationale: Python's memory allocator doesn't always return memory to OS + immediately. 20% margin accounts for fragmentation and GC timing while + still catching significant retention issues. """ - async def client_task(client_id: int) -> None: + async def client_task(client_id: int) -> tuple[int, str | None]: """Client that connects, receives few events, then disconnects.""" try: async with httpx.AsyncClient(timeout=30.0) as client: @@ -130,13 +223,15 @@ async def client_task(client_id: int) -> None: count += 1 if count >= 50: break - except Exception: - pass + return count, None + except Exception as e: + return 0, str(e) # Get baseline async with httpx.AsyncClient() as client: baseline = (await client.get(f"{sse_server_url}/metrics")).json() baseline_memory = baseline["memory_rss_mb"] + metrics_collector.set_memory_baseline(baseline_memory) # Connect and disconnect clients in batches batch_size = min(100, scale) @@ -145,7 +240,26 @@ async def client_task(client_id: int) -> None: tasks = [ asyncio.create_task(client_task(i)) for i in range(batch_start, batch_end) ] - await asyncio.gather(*tasks, return_exceptions=True) + results = await asyncio.gather(*tasks, return_exceptions=True) + + for result in results: + if isinstance(result, Exception): + metrics_collector.record_failure(str(result)) + elif isinstance(result, tuple): + events, error = result + metrics_collector.add_client_events(events) + if error: + metrics_collector.record_failure(error) + else: + metrics_collector.record_success() + + # Sample memory after each batch + try: + async with httpx.AsyncClient() as client: + metrics = (await client.get(f"{sse_server_url}/metrics")).json() + 
metrics_collector.add_memory_sample(metrics["memory_rss_mb"]) + except Exception: + pass # Wait for cleanup await asyncio.sleep(2) @@ -154,8 +268,31 @@ async def client_task(client_id: int) -> None: async with httpx.AsyncClient() as client: final = (await client.get(f"{sse_server_url}/metrics")).json() final_memory = final["memory_rss_mb"] + metrics_collector.set_memory_final(final_memory) - # Allow 20% growth from baseline (some overhead is expected) + # Generate report + report = metrics_collector.compute_report( + test_name="test_memory_returns_to_baseline_after_disconnect", + scale=scale, + duration_minutes=duration_minutes, + ) + register_test_report(report) + + # Compare and output + comparison = baseline_manager.compare(report) + report.comparison = comparison.to_dict() if comparison else None + + report_generator.save_json(report) + report_generator.save_html(report, comparison) + report_generator.print_summary(report, comparison) + + if update_baseline: + baseline_manager.save_baseline(report) + + if fail_on_regression and comparison and comparison.regression_detected: + pytest.fail(f"Regression detected: {comparison.regression_reasons}") + + # Original assertion max_allowed = baseline_memory * 1.2 assert final_memory <= max_allowed, ( f"Memory did not return to baseline: {final_memory:.1f}MB " @@ -164,18 +301,54 @@ async def client_task(client_id: int) -> None: @pytest.mark.loadtest -async def test_event_set_cleanup(sse_server_url: str, scale: int) -> None: +async def test_event_set_cleanup( + sse_server_url: str, + scale: int, + duration_minutes: int, + metrics_collector: MetricsCollector, + baseline_manager: BaselineManager, + report_generator: ReportGenerator, + update_baseline: bool, + fail_on_regression: bool, +) -> None: """ - Verify the internal event set empties after connections close. - - This tests the Issue #152 fix - events should be properly removed - from the thread-local state when connections close. 
+ Verify internal event set is cleaned up after connections close (Issue #152). + + ## What is Measured + - `registered_events` count from /metrics endpoint + - Events at baseline, peak (during connections), and after cleanup + - Watcher started status (should be True if connections exist) + + ## Why This Matters + This is a regression test for Issue #152 (watcher task leak). Before the fix: + - Each SSE connection created a new watcher task + - Events accumulated in `_ShutdownState.events` without cleanup + - CPU usage grew unbounded as N watchers polled AppStatus.should_exit + + After the fix (using threading.local): + - One watcher per thread, not per connection + - Events removed from set on connection close + - Watcher stops when set becomes empty + + ## Methodology + 1. Record baseline `registered_events` count + 2. Connect `scale` clients, wait for connections to establish + 3. Record peak `registered_events` (should be >= scale * 0.2) + 4. Wait for all connections to close + 2s cleanup + 5. Record final `registered_events` (should return near baseline) + + ## Pass Criteria + - Peak events >= scale * 0.2 (events were registered) + - Final events <= baseline + 10 (events were cleaned up) + - Rationale: We expect most (not all) connections to register events. + After cleanup, the set should be nearly empty. The +10 margin allows + for concurrent test interference. 
""" connected = asyncio.Event() connection_count = 0 - async def client_task() -> None: + async def client_task() -> tuple[int, str | None]: nonlocal connection_count try: async with httpx.AsyncClient(timeout=30.0) as client: @@ -190,13 +363,16 @@ async def client_task() -> None: count += 1 if count >= 5: # Stay connected for ~2.5s break - except Exception: - pass + return count, None + except Exception as e: + return 0, str(e) # Get baseline event count async with httpx.AsyncClient() as client: baseline = (await client.get(f"{sse_server_url}/metrics")).json() baseline_events = baseline["registered_events"] + baseline_memory = baseline["memory_rss_mb"] + metrics_collector.set_memory_baseline(baseline_memory) # Connect many clients tasks = [asyncio.create_task(client_task()) for _ in range(scale)] @@ -212,23 +388,64 @@ async def client_task() -> None: async with httpx.AsyncClient() as client: peak = (await client.get(f"{sse_server_url}/metrics")).json() peak_events = peak["registered_events"] + metrics_collector.add_memory_sample(peak["memory_rss_mb"]) # Wait for all to complete - await asyncio.gather(*tasks, return_exceptions=True) + results = await asyncio.gather(*tasks, return_exceptions=True) + + for result in results: + if isinstance(result, Exception): + metrics_collector.record_failure(str(result)) + elif isinstance(result, tuple): + events, error = result + metrics_collector.add_client_events(events) + if error: + metrics_collector.record_failure(error) + else: + metrics_collector.record_success() + await asyncio.sleep(2) # Allow cleanup time # Check events cleaned up async with httpx.AsyncClient() as client: final = (await client.get(f"{sse_server_url}/metrics")).json() final_events = final["registered_events"] + metrics_collector.set_memory_final(final["memory_rss_mb"]) - # Events should have been registered during peak (relaxed threshold) + # Record SSE internals + metrics_collector.set_sse_internals( + watcher_started=peak.get("watcher_started", False), 
+ peak_events=peak_events, + final_events=final_events, + ) + + # Generate report + report = metrics_collector.compute_report( + test_name="test_event_set_cleanup", + scale=scale, + duration_minutes=duration_minutes, + ) + register_test_report(report) + + # Compare and output + comparison = baseline_manager.compare(report) + report.comparison = comparison.to_dict() if comparison else None + + report_generator.save_json(report) + report_generator.save_html(report, comparison) + report_generator.print_summary(report, comparison) + + if update_baseline: + baseline_manager.save_baseline(report) + + if fail_on_regression and comparison and comparison.regression_detected: + pytest.fail(f"Regression detected: {comparison.regression_reasons}") + + # Original assertions assert peak_events >= scale * 0.2, ( f"Expected at least {scale * 0.2} events registered during peak, " f"got {peak_events}" ) - - # Events should be cleaned up after assert final_events <= baseline_events + 10, ( f"Event set not cleaned up: {final_events} events remaining " f"(baseline: {baseline_events})" diff --git a/tests/load/test_shutdown.py b/tests/load/test_shutdown.py index 1bdf43d..406a4a3 100644 --- a/tests/load/test_shutdown.py +++ b/tests/load/test_shutdown.py @@ -1,9 +1,18 @@ """ Graceful shutdown tests under load. -Verifies clean shutdown behavior with many active connections. +This module verifies the server shuts down cleanly with active SSE connections: +- SIGTERM handling with concurrent streams +- Connection notification and cleanup timing +- No hanging connections after shutdown + +Graceful shutdown is critical for zero-downtime deployments. If SSE connections +aren't properly notified, clients hang until TCP timeout (minutes), causing +poor user experience during rolling updates. 
""" +from __future__ import annotations + import asyncio import signal import time @@ -12,18 +21,56 @@ import pytest from httpx_sse import aconnect_sse +from .baseline import BaselineManager +from .conftest import register_test_report +from .metrics import MetricsCollector +from .reporter import ReportGenerator + @pytest.mark.loadtest async def test_graceful_shutdown_with_active_connections( docker_available: bool, scale: int, + duration_minutes: int, + metrics_collector: MetricsCollector, + baseline_manager: BaselineManager, + report_generator: ReportGenerator, + update_baseline: bool, + fail_on_regression: bool, ) -> None: """ - Send SIGTERM to server with active connections, verify clean shutdown. - - Pass criteria: - - Shutdown completes within 5 seconds - - All connections receive disconnect (no hanging clients) + Verify server shuts down cleanly within timeout when SIGTERM is sent. + + ## What is Measured + - Time from SIGTERM to all connections closed + - Connection close status (clean_close, server_closed, or error) + - Percentage of connections that closed successfully + + ## Why This Matters + Tests the core graceful shutdown mechanism: + - Uvicorn receives SIGTERM, sets Server.should_exit + - Watcher detects should_exit, broadcasts to all registered events + - EventSourceResponse streams terminate, connections close + - Server waits for in-flight requests, then exits + + Without this working: + - Rolling deployments cause client disconnects + - Container orchestrators kill processes after timeout + - Users experience broken connections during updates + + ## Methodology + 1. Start server in Docker container + 2. Connect `scale` concurrent SSE clients + 3. Wait for connections to establish (~2s) + 4. Send SIGTERM to container + 5. Measure time until all connections close + 6. 
Categorize close reasons (clean, server-initiated, error) + + ## Pass Criteria + - >= 90% connections closed (clean_closes + server_closes) + - Shutdown time < 10 seconds + - Rationale: 90% accounts for race conditions in test timing. 10s is + generous but catches hangs. Production should complete in <5s. """ if not docker_available: pytest.skip("Docker not available") @@ -46,7 +93,7 @@ async def test_graceful_shutdown_with_active_connections( connections_made = 0 connections_closed = 0 - async def client_task() -> str: + async def client_task() -> tuple[str, str | None]: nonlocal connections_made, connections_closed try: async with httpx.AsyncClient(timeout=30.0) as client: @@ -58,13 +105,13 @@ async def client_task() -> str: if disconnected.is_set(): break connections_closed += 1 - return "clean_close" + return "clean_close", None except httpx.RemoteProtocolError: connections_closed += 1 - return "server_closed" + return "server_closed", None except Exception as e: connections_closed += 1 - return f"error:{type(e).__name__}" + return f"error:{type(e).__name__}", str(e) # Start clients tasks = [asyncio.create_task(client_task()) for _ in range(scale)] @@ -90,6 +137,7 @@ async def client_task() -> str: results = await asyncio.gather(*tasks, return_exceptions=True) shutdown_time = time.perf_counter() - start_shutdown + metrics_collector.set_duration(shutdown_time) # Cleanup container try: @@ -98,28 +146,99 @@ async def client_task() -> str: pass # Analyze results - clean_closes = sum(1 for r in results if r == "clean_close") - server_closes = sum(1 for r in results if r == "server_closed") - errors = sum(1 for r in results if isinstance(r, str) and r.startswith("error:")) - - # All connections should have closed (one way or another) + clean_closes = 0 + server_closes = 0 + errors = 0 + + for result in results: + if isinstance(result, Exception): + metrics_collector.record_failure(str(result)) + errors += 1 + elif isinstance(result, tuple): + status, error = result + 
if status == "clean_close": + metrics_collector.record_success() + clean_closes += 1 + elif status == "server_closed": + metrics_collector.record_success() + server_closes += 1 + elif status.startswith("error:"): + metrics_collector.record_failure(error or status) + errors += 1 + + # Generate report + report = metrics_collector.compute_report( + test_name="test_graceful_shutdown_with_active_connections", + scale=scale, + duration_minutes=duration_minutes, + ) + register_test_report(report) + + # Compare and output + comparison = baseline_manager.compare(report) + report.comparison = comparison.to_dict() if comparison else None + + report_generator.save_json(report) + report_generator.save_html(report, comparison) + report_generator.print_summary(report, comparison) + + if update_baseline: + baseline_manager.save_baseline(report) + + if fail_on_regression and comparison and comparison.regression_detected: + pytest.fail(f"Regression detected: {comparison.regression_reasons}") + + # Original assertions total_closed = clean_closes + server_closes + errors assert ( total_closed >= scale * 0.9 ), f"Not all connections closed: {total_closed}/{scale}" - - # Shutdown should be fast assert shutdown_time < 10, f"Shutdown took {shutdown_time:.1f}s, expected < 10s" @pytest.mark.loadtest async def test_connections_receive_shutdown_signal( docker_available: bool, + scale: int, + duration_minutes: int, + metrics_collector: MetricsCollector, + baseline_manager: BaselineManager, + report_generator: ReportGenerator, + update_baseline: bool, + fail_on_regression: bool, ) -> None: """ - Verify connections are notified of shutdown via SSE. - - When AppStatus.should_exit is set, active streams should terminate gracefully. + Verify active SSE streams are interrupted by shutdown signal. 
+ + ## What is Measured + - Events received per client before shutdown + - Events received (or not) after shutdown signal + - Connection termination triggered by AppStatus.should_exit + + ## Why This Matters + Tests that the watcher correctly broadcasts shutdown to active streams: + - AppStatus.should_exit propagates to watcher task + - Watcher sets all registered anyio.Event objects + - EventSourceResponse._ping_task detects event, stops iteration + - Client receives connection close, not just timeout + + This complements the shutdown timing test by verifying the signal path + works, not just that connections eventually close. + + ## Methodology + 1. Start server in Docker container + 2. Connect 10 clients to /sse?delay=0.5 (slow stream to keep connections active) + 3. Wait 3s for clients to receive events + 4. Send SIGTERM + 5. Wait for clients to notice stream end + 6. Count events before/after signal + + ## Pass Criteria + - Total events > 0 (clients received events before shutdown) + - All clients received < 20 events (interrupted before completing) + - Rationale: With 0.5s delay, clients receive ~6 events in 3s. If they + reached 20, they weren't interrupted. This proves the shutdown signal + propagated through the watcher to active streams. 
""" if not docker_available: pytest.skip("Docker not available") @@ -133,7 +252,7 @@ async def test_connections_receive_shutdown_signal( base_url = container.get_base_url() # Connect clients that will wait for events - async def client_task() -> int: + async def client_task() -> tuple[int, str | None]: count = 0 try: async with httpx.AsyncClient(timeout=30.0) as client: @@ -144,9 +263,9 @@ async def client_task() -> int: count += 1 if count >= 20: # Should not reach this break - except Exception: - pass - return count + return count, None + except Exception as e: + return count, str(e) tasks = [asyncio.create_task(client_task()) for _ in range(10)] @@ -172,10 +291,46 @@ async def client_task() -> int: except Exception: pass - # Clients should have received some events before shutdown - event_counts = [r for r in results if isinstance(r, int)] - total_events = sum(event_counts) - + # Process results + total_events = 0 + event_counts: list[int] = [] + + for result in results: + if isinstance(result, Exception): + metrics_collector.record_failure(str(result)) + elif isinstance(result, tuple): + count, error = result + metrics_collector.add_client_events(count) + total_events += count + event_counts.append(count) + if error: + metrics_collector.record_failure(error) + else: + metrics_collector.record_success() + + # Generate report + report = metrics_collector.compute_report( + test_name="test_connections_receive_shutdown_signal", + scale=10, # Fixed scale for this test + duration_minutes=duration_minutes, + ) + register_test_report(report) + + # Compare and output + comparison = baseline_manager.compare(report) + report.comparison = comparison.to_dict() if comparison else None + + report_generator.save_json(report) + report_generator.save_html(report, comparison) + report_generator.print_summary(report, comparison) + + if update_baseline: + baseline_manager.save_baseline(report) + + if fail_on_regression and comparison and comparison.regression_detected: + 
pytest.fail(f"Regression detected: {comparison.regression_reasons}") + + # Original assertions assert total_events > 0, "Clients should have received events before shutdown" assert all( c < 20 for c in event_counts diff --git a/tests/load/test_throughput.py b/tests/load/test_throughput.py index 22d8407..7f47fbc 100644 --- a/tests/load/test_throughput.py +++ b/tests/load/test_throughput.py @@ -1,24 +1,64 @@ """ Throughput and latency tests for sse-starlette. -Measures events per second, latency percentiles, and first event latency. +This module measures the core performance characteristics of the SSE server: +- Raw throughput (events/sec) under various client loads +- Time to first event (connection setup latency) +- Inter-event latency distribution under load + +These metrics establish performance baselines and detect regressions in +event delivery, connection handling, and async task scheduling. """ +from __future__ import annotations + import asyncio import time -from typing import List import httpx import pytest from httpx_sse import aconnect_sse +from .baseline import BaselineManager +from .conftest import register_test_report +from .metrics import MetricsCollector +from .reporter import ReportGenerator + @pytest.mark.loadtest -async def test_throughput_single_client(sse_server_url: str) -> None: +async def test_throughput_single_client( + sse_server_url: str, + scale: int, + duration_minutes: int, + metrics_collector: MetricsCollector, + baseline_manager: BaselineManager, + report_generator: ReportGenerator, + update_baseline: bool, + fail_on_regression: bool, +) -> None: """ Measure maximum throughput for a single client. - Baseline measurement without contention. + ## What is Measured + - Events per second delivered to a single consumer with zero delay + - Server's maximum event generation rate without client contention + + ## Why This Matters + Establishes the performance ceiling for the SSE implementation. 
A regression + here indicates fundamental slowdown in event serialization, async scheduling, + or the streaming response path. This baseline is used to evaluate how well + throughput scales with multiple clients. + + ## Methodology + 1. Connect single client to /sse?delay=0 (server sends events as fast as possible) + 2. Count events received over 10 seconds + 3. Calculate events/sec throughput + + ## Pass Criteria + - Throughput >= 1000 events/sec + - Rationale: With zero delay, the bottleneck should be network I/O and async + scheduling. 1000 events/sec is achievable on any modern system and leaves + headroom for real-world latency. """ events_received = 0 start_time = time.perf_counter() @@ -36,7 +76,35 @@ async def test_throughput_single_client(sse_server_url: str) -> None: elapsed = time.perf_counter() - start_time throughput = events_received / elapsed - # Should achieve at least 1000 events/sec for a single client + # Record metrics + metrics_collector.add_client_events(events_received) + metrics_collector.set_duration(elapsed) + metrics_collector.record_success() + + # Generate report + report = metrics_collector.compute_report( + test_name="test_throughput_single_client", + scale=1, # Single client test + duration_minutes=duration_minutes, + ) + register_test_report(report) + + # Compare and output + comparison = baseline_manager.compare(report) + report.comparison = comparison.to_dict() if comparison else None + + report_generator.save_json(report) + report_generator.save_html(report, comparison) + report_generator.print_summary(report, comparison) + + if update_baseline: + baseline_manager.save_baseline(report) + + # Check for regression + if fail_on_regression and comparison and comparison.regression_detected: + pytest.fail(f"Regression detected: {comparison.regression_reasons}") + + # Original assertion assert ( throughput >= 1000 ), f"Single client throughput {throughput:.0f} events/sec, expected >= 1000" @@ -46,16 +114,44 @@ async def 
test_throughput_single_client(sse_server_url: str) -> None: async def test_throughput_multiple_clients( sse_server_url: str, scale: int, + duration_minutes: int, + metrics_collector: MetricsCollector, + baseline_manager: BaselineManager, + report_generator: ReportGenerator, + update_baseline: bool, + fail_on_regression: bool, ) -> None: """ Measure aggregate throughput with multiple concurrent clients. - Pass criteria: - - Aggregate throughput > 10,000 events/sec + ## What is Measured + - Total events/sec delivered across N concurrent SSE connections + - Per-client event counts to detect uneven distribution + - Connection success rate under concurrent load + + ## Why This Matters + Detects contention issues that only appear under load: + - Lock contention in the send path (anyio.Lock) + - Per-connection memory/CPU overhead scaling poorly + - Async task scheduler saturation + - Event loop blocking under concurrent I/O + + ## Methodology + 1. Launch `scale` concurrent client tasks (default 100) + 2. Each client connects to /sse?delay=0.001 (1ms between events) + 3. Run for 30 seconds, counting events per client + 4. Sum total events and calculate aggregate throughput + + ## Pass Criteria + - Aggregate throughput >= min(10000, scale * 100) events/sec + - Rationale: With 1ms delay, each client should receive ~1000 events/sec. + With 100 clients, expect ~100K events/sec total. The min() handles + smaller scale values gracefully. 
""" duration_seconds = 30 - async def client_task() -> int: + async def client_task() -> tuple[int, str | None]: + """Run client and return (event_count, error_or_none).""" count = 0 start = time.perf_counter() try: @@ -67,20 +163,56 @@ async def client_task() -> int: count += 1 if time.perf_counter() - start >= duration_seconds: break - except Exception: - pass - return count + return count, None + except Exception as e: + return count, str(e) start_time = time.perf_counter() tasks = [asyncio.create_task(client_task()) for _ in range(scale)] results = await asyncio.gather(*tasks, return_exceptions=True) elapsed = time.perf_counter() - start_time - total_events = sum(r for r in results if isinstance(r, int)) - aggregate_throughput = total_events / elapsed + # Process results + total_events = 0 + for result in results: + if isinstance(result, Exception): + metrics_collector.record_failure(str(result)) + elif isinstance(result, tuple): + count, error = result + metrics_collector.add_client_events(count) + total_events += count + if error: + metrics_collector.record_failure(error) + else: + metrics_collector.record_success() + + metrics_collector.set_duration(elapsed) + + # Generate report + report = metrics_collector.compute_report( + test_name="test_throughput_multiple_clients", + scale=scale, + duration_minutes=duration_minutes, + ) + register_test_report(report) - # With scale clients, should achieve high aggregate throughput - min_expected = min(10000, scale * 100) # Scale expectation with client count + # Compare and output + comparison = baseline_manager.compare(report) + report.comparison = comparison.to_dict() if comparison else None + + report_generator.save_json(report) + report_generator.save_html(report, comparison) + report_generator.print_summary(report, comparison) + + if update_baseline: + baseline_manager.save_baseline(report) + + if fail_on_regression and comparison and comparison.regression_detected: + pytest.fail(f"Regression detected: 
{comparison.regression_reasons}") + + # Original assertion + aggregate_throughput = total_events / elapsed + min_expected = min(10000, scale * 100) assert aggregate_throughput >= min_expected, ( f"Aggregate throughput {aggregate_throughput:.0f} events/sec with {scale} " f"clients, expected >= {min_expected}" @@ -91,17 +223,44 @@ async def client_task() -> int: async def test_first_event_latency( sse_server_url: str, scale: int, + duration_minutes: int, + metrics_collector: MetricsCollector, + baseline_manager: BaselineManager, + report_generator: ReportGenerator, + update_baseline: bool, + fail_on_regression: bool, ) -> None: """ Measure time to first event (TTFE) for multiple connections. - Pass criteria (relaxed for Docker overhead): - - p50 TTFE < 2000ms - - p99 TTFE < 5000ms + The TTFE is high due to the "thundering herd" effect (100 connections hitting simultaneously). + But the inter-event latency is excellent - only ~5ms overhead on top of the 10ms delay. + + ## What is Measured + - Time from connection initiation to first SSE event received + - Latency distribution across concurrent connections (p50, p99) + - Connection success rate under concurrent connection storms + + ## Why This Matters + TTFE is the user-perceived responsiveness metric. High TTFE indicates: + - Slow connection acceptance in the ASGI server + - Blocking operations in EventSourceResponse initialization + - Resource exhaustion during connection setup + - Inefficient task group initialization + + ## Methodology + 1. Launch `scale` concurrent connection attempts simultaneously + 2. Each client measures time from connect() to first SSE event + 3. 
Collect latency samples and compute percentiles + + ## Pass Criteria + - p50 < 1250ms, p99 < 2500ms + - Calibrated from measured p50=932ms, p99=1779ms at scale=100 + - Threshold factor: 1.3x measured values + """ - latencies: List[float] = [] - async def measure_ttfe() -> float: + async def measure_ttfe() -> tuple[float, str | None]: start = time.perf_counter() try: async with httpx.AsyncClient(timeout=30.0) as client: @@ -109,16 +268,51 @@ async def measure_ttfe() -> float: client, "GET", f"{sse_server_url}/sse?delay=0" ) as source: async for _ in source.aiter_sse(): - return (time.perf_counter() - start) * 1000 # ms - except Exception: - return -1 - return -1 + return (time.perf_counter() - start) * 1000, None + except Exception as e: + return -1, str(e) + return -1, "no events received" + start_time = time.perf_counter() tasks = [asyncio.create_task(measure_ttfe()) for _ in range(scale)] results = await asyncio.gather(*tasks) + elapsed = time.perf_counter() - start_time + + # Process results + latencies: list[float] = [] + for latency, error in results: + if latency > 0: + metrics_collector.add_ttfe_sample(latency) + metrics_collector.record_success() + latencies.append(latency) + else: + metrics_collector.record_failure(error or "unknown") + + metrics_collector.set_duration(elapsed) + + # Generate report + report = metrics_collector.compute_report( + test_name="test_first_event_latency", + scale=scale, + duration_minutes=duration_minutes, + ) + register_test_report(report) + + # Compare and output + comparison = baseline_manager.compare(report) + report.comparison = comparison.to_dict() if comparison else None - latencies = [r for r in results if r > 0] + report_generator.save_json(report) + report_generator.save_html(report, comparison) + report_generator.print_summary(report, comparison) + if update_baseline: + baseline_manager.save_baseline(report) + + if fail_on_regression and comparison and comparison.regression_detected: + pytest.fail(f"Regression 
detected: {comparison.regression_reasons}") + + # Original assertions if len(latencies) < scale * 0.9: pytest.fail(f"Too many failed connections: {len(latencies)}/{scale}") @@ -126,25 +320,52 @@ async def measure_ttfe() -> float: p50 = latencies[len(latencies) // 2] p99 = latencies[int(len(latencies) * 0.99)] - # Relaxed thresholds: Docker networking + container overhead - assert p50 < 2000, f"p50 TTFE {p50:.1f}ms, expected < 2000ms" - assert p99 < 5000, f"p99 TTFE {p99:.1f}ms, expected < 5000ms" + assert p50 < 1250, f"p50 TTFE {p50:.1f}ms, expected < 1250ms" + assert p99 < 2500, f"p99 TTFE {p99:.1f}ms, expected < 2500ms" @pytest.mark.loadtest async def test_event_latency_under_load( sse_server_url: str, scale: int, + duration_minutes: int, + metrics_collector: MetricsCollector, + baseline_manager: BaselineManager, + report_generator: ReportGenerator, + update_baseline: bool, + fail_on_regression: bool, ) -> None: """ - Measure event-to-event latency under load. - - Captures latency between consecutive events to detect backpressure. + Measure inter-event latency under concurrent load. + + ## What is Measured + - Time between consecutive SSE events (inter-event latency) + - Latency distribution percentiles (p50, p95, p99) + - Variance across multiple concurrent connections + + ## Why This Matters + Inter-event latency reveals hidden performance issues: + - Backpressure from slow sends affecting fast clients + - Buffer bloat in the response stream + - Async scheduler starvation under load + - GC pauses or memory pressure spikes + + Unlike throughput (which averages over time), latency percentiles expose + tail latency issues that degrade user experience. + + ## Methodology + 1. Launch `scale` concurrent clients to /sse?delay=0.01 (10ms between events) + 2. Each client receives 100 events and records inter-event times + 3. 
Aggregate all latency samples and compute percentiles + + ## Pass Criteria + - p50 < 20ms, p95 < 30ms, p99 < 40ms + - Calibrated from measured p50=14.8ms, p95=21.4ms, p99=27.4ms at scale=100 + - Server delay: 10ms. Threshold factor: 1.3x measured values """ - all_latencies: List[float] = [] - async def measure_latencies() -> List[float]: - latencies: List[float] = [] + async def measure_latencies() -> tuple[list[float], str | None]: + latencies: list[float] = [] last_time = None try: async with httpx.AsyncClient(timeout=60.0) as client: @@ -160,16 +381,51 @@ async def measure_latencies() -> List[float]: count += 1 if count >= 100: break - except Exception: - pass - return latencies + return latencies, None + except Exception as e: + return latencies, str(e) + start_time = time.perf_counter() tasks = [asyncio.create_task(measure_latencies()) for _ in range(scale)] results = await asyncio.gather(*tasks) + elapsed = time.perf_counter() - start_time + + # Process results + all_latencies: list[float] = [] + for client_latencies, error in results: + for lat in client_latencies: + metrics_collector.add_latency_sample(lat) + all_latencies.append(lat) + if error: + metrics_collector.record_failure(error) + else: + metrics_collector.record_success() + + metrics_collector.set_duration(elapsed) + + # Generate report + report = metrics_collector.compute_report( + test_name="test_event_latency_under_load", + scale=scale, + duration_minutes=duration_minutes, + ) + register_test_report(report) + + # Compare and output + comparison = baseline_manager.compare(report) + report.comparison = comparison.to_dict() if comparison else None + + report_generator.save_json(report) + report_generator.save_html(report, comparison) + report_generator.print_summary(report, comparison) + + if update_baseline: + baseline_manager.save_baseline(report) - for client_latencies in results: - all_latencies.extend(client_latencies) + if fail_on_regression and comparison and comparison.regression_detected: 
+ pytest.fail(f"Regression detected: {comparison.regression_reasons}") + # Original assertions if len(all_latencies) < 100: pytest.fail(f"Insufficient latency samples: {len(all_latencies)}") @@ -178,8 +434,6 @@ async def measure_latencies() -> List[float]: p95 = all_latencies[int(len(all_latencies) * 0.95)] p99 = all_latencies[int(len(all_latencies) * 0.99)] - # Expected ~10ms between events (0.01s delay) - # Allow 2x for processing overhead under load - assert p50 < 50, f"p50 inter-event latency {p50:.1f}ms, expected < 50ms" - assert p95 < 100, f"p95 inter-event latency {p95:.1f}ms, expected < 100ms" - assert p99 < 200, f"p99 inter-event latency {p99:.1f}ms, expected < 200ms" + assert p50 < 20, f"p50 inter-event latency {p50:.1f}ms, expected < 20ms" + assert p95 < 30, f"p95 inter-event latency {p95:.1f}ms, expected < 30ms" + assert p99 < 40, f"p99 inter-event latency {p99:.1f}ms, expected < 40ms" diff --git a/tests/load/test_watcher_scale.py b/tests/load/test_watcher_scale.py index 3a4d150..67cc28f 100644 --- a/tests/load/test_watcher_scale.py +++ b/tests/load/test_watcher_scale.py @@ -1,29 +1,76 @@ """ -Watcher deduplication tests at scale. +Watcher deduplication tests at scale (Issue #152 regression suite). -Validates the Issue #152 fix: only one watcher task per thread regardless -of the number of concurrent connections. +This module validates the fix for Issue #152: watcher task accumulation. +Before the fix, each SSE connection spawned a new watcher task that polled +AppStatus.should_exit. With thousands of connections, CPU usage grew unbounded. + +The fix uses threading.local() to maintain one watcher per thread. 
These tests +verify that pattern holds under various load conditions: +- Many simultaneous connections sharing a single watcher +- Rapid connect/disconnect cycles not spawning new watchers +- Clean watcher lifecycle (start -> broadcast -> cleanup -> restart) """ +from __future__ import annotations + import asyncio import httpx import pytest from httpx_sse import aconnect_sse +from .baseline import BaselineManager +from .conftest import register_test_report +from .metrics import MetricsCollector +from .reporter import ReportGenerator + @pytest.mark.loadtest async def test_single_watcher_with_many_connections( sse_server_url: str, scale: int, + duration_minutes: int, + metrics_collector: MetricsCollector, + baseline_manager: BaselineManager, + report_generator: ReportGenerator, + update_baseline: bool, + fail_on_regression: bool, ) -> None: """ - With N concurrent connections, verify only 1 watcher is running. - - This is the core regression test for Issue #152. + Verify only one watcher runs regardless of connection count (Issue #152 core test). + + ## What is Measured + - `watcher_started` flag from /metrics (True = watcher exists) + - `registered_events` count (should be >= scale * 0.5) + - Implicit: CPU usage would spike if multiple watchers existed (not measured) + + ## Why This Matters + This is the primary regression test for Issue #152. Before the fix: + - N connections = N watcher tasks + - Each watcher polls AppStatus.should_exit every 0.5s + - 1000 connections = 1000 polling tasks = CPU exhaustion + + After the fix: + - N connections = 1 watcher task (per thread) + - Watcher broadcasts to all registered events + - Constant CPU overhead regardless of connection count + + ## Methodology + 1. Connect `scale` concurrent clients (default 100) + 2. Wait for connections to establish (~2s) + 3. Query /metrics for watcher_started and registered_events + 4. 
Cancel all connections + + ## Pass Criteria + - watcher_started = True (watcher exists for active connections) + - registered_events >= scale * 0.5 (most connections registered) + - Rationale: watcher_started=True confirms the mechanism works. + Event count verifies registration worked. We don't directly measure + watcher count, but CPU metrics in CI would catch proliferation. """ - async def client_task() -> None: + async def client_task() -> tuple[int, str | None]: try: async with httpx.AsyncClient(timeout=30.0) as client: async with aconnect_sse( @@ -32,8 +79,9 @@ async def client_task() -> None: async for _ in source.aiter_sse(): await asyncio.sleep(5) # Stay connected break - except Exception: - pass + return 1, None + except Exception as e: + return 0, str(e) # Start many connections tasks = [asyncio.create_task(client_task()) for _ in range(scale)] @@ -47,16 +95,55 @@ async def client_task() -> None: watcher_started = metrics["watcher_started"] registered_events = metrics["registered_events"] + metrics_collector.add_memory_sample(metrics["memory_rss_mb"]) # Cancel all tasks for task in tasks: task.cancel() - await asyncio.gather(*tasks, return_exceptions=True) - - # Watcher should be running + results = await asyncio.gather(*tasks, return_exceptions=True) + + # Process results + for result in results: + if isinstance(result, Exception): + metrics_collector.record_failure(str(result)) + elif isinstance(result, tuple): + _, error = result + if error: + metrics_collector.record_failure(error) + else: + metrics_collector.record_success() + + # Record SSE internals + metrics_collector.set_sse_internals( + watcher_started=watcher_started, + peak_events=registered_events, + final_events=0, + ) + + # Generate report + report = metrics_collector.compute_report( + test_name="test_single_watcher_with_many_connections", + scale=scale, + duration_minutes=duration_minutes, + ) + register_test_report(report) + + # Compare and output + comparison = 
baseline_manager.compare(report) + report.comparison = comparison.to_dict() if comparison else None + + report_generator.save_json(report) + report_generator.save_html(report, comparison) + report_generator.print_summary(report, comparison) + + if update_baseline: + baseline_manager.save_baseline(report) + + if fail_on_regression and comparison and comparison.regression_detected: + pytest.fail(f"Regression detected: {comparison.regression_reasons}") + + # Original assertions assert watcher_started is True, "Watcher should be started with active connections" - - # Should have many events registered (one per connection) assert ( registered_events >= scale * 0.5 ), f"Expected at least {scale * 0.5} events, got {registered_events}" @@ -66,14 +153,43 @@ async def client_task() -> None: async def test_rapid_connect_disconnect_watcher_stability( sse_server_url: str, scale: int, + duration_minutes: int, + metrics_collector: MetricsCollector, + baseline_manager: BaselineManager, + report_generator: ReportGenerator, + update_baseline: bool, + fail_on_regression: bool, ) -> None: """ - Rapid connect/disconnect cycles should not accumulate watchers. - - Each connect/disconnect should reuse the existing watcher, not spawn new ones. + Verify rapid connect/disconnect cycles don't accumulate watcher tasks. + + ## What is Measured + - Thread count after many rapid connection cycles + - Memory samples during the churn + - SSE internals (watcher_started, registered_events) + + ## Why This Matters + Tests the watcher lifecycle under high churn: + - Connections come and go faster than the watcher poll interval (0.5s) + - Watcher must survive connection churn without proliferation + - Event registration/deregistration must be thread-safe + + Before Issue #152 fix, each connection left behind a watcher task. Even if + connections closed quickly, watchers accumulated and never stopped. + + ## Methodology + 1. Run `scale / 10` batches of 10 quick connections each + 2. 
Each connection receives 1 event and disconnects immediately + 3. After all batches, check thread count and watcher status + + ## Pass Criteria + - num_threads < 50 + - Rationale: A healthy uvicorn server has ~5-10 threads. If watchers + accumulated, we'd see hundreds of threads (one per watcher task). + 50 provides margin for legitimate worker threads. """ - async def quick_connect() -> None: + async def quick_connect() -> tuple[int, str | None]: try: async with httpx.AsyncClient(timeout=10.0) as client: async with aconnect_sse( @@ -81,13 +197,25 @@ async def quick_connect() -> None: ) as source: async for _ in source.aiter_sse(): break # Disconnect after first event - except Exception: - pass + return 1, None + except Exception as e: + return 0, str(e) # Rapid connect/disconnect cycles for batch in range(scale // 10): tasks = [asyncio.create_task(quick_connect()) for _ in range(10)] - await asyncio.gather(*tasks, return_exceptions=True) + results = await asyncio.gather(*tasks, return_exceptions=True) + + for result in results: + if isinstance(result, Exception): + metrics_collector.record_failure(str(result)) + elif isinstance(result, tuple): + count, error = result + metrics_collector.add_client_events(count) + if error: + metrics_collector.record_failure(error) + else: + metrics_collector.record_success() # Brief pause await asyncio.sleep(0.5) @@ -96,24 +224,88 @@ async def quick_connect() -> None: async with httpx.AsyncClient() as client: metrics = (await client.get(f"{sse_server_url}/metrics")).json() - # The watcher_started flag confirms single watcher pattern - # If multiple watchers had spawned, we'd see resource issues num_threads = metrics["num_threads"] - - # Thread count should be reasonable (not proportional to connection count) - # A healthy uvicorn worker has ~5-10 threads typically + metrics_collector.add_memory_sample(metrics["memory_rss_mb"]) + + # Record SSE internals + metrics_collector.set_sse_internals( + 
watcher_started=metrics.get("watcher_started", False), + peak_events=metrics.get("registered_events", 0), + final_events=metrics.get("registered_events", 0), + ) + + # Generate report + report = metrics_collector.compute_report( + test_name="test_rapid_connect_disconnect_watcher_stability", + scale=scale, + duration_minutes=duration_minutes, + ) + register_test_report(report) + + # Compare and output + comparison = baseline_manager.compare(report) + report.comparison = comparison.to_dict() if comparison else None + + report_generator.save_json(report) + report_generator.save_html(report, comparison) + report_generator.print_summary(report, comparison) + + if update_baseline: + baseline_manager.save_baseline(report) + + if fail_on_regression and comparison and comparison.regression_detected: + pytest.fail(f"Regression detected: {comparison.regression_reasons}") + + # Original assertion assert num_threads < 50, f"Too many threads ({num_threads}), possible watcher leak" @pytest.mark.loadtest -async def test_watcher_cleanup_allows_restart(sse_server_url: str) -> None: +async def test_watcher_cleanup_allows_restart( + sse_server_url: str, + scale: int, + duration_minutes: int, + metrics_collector: MetricsCollector, + baseline_manager: BaselineManager, + report_generator: ReportGenerator, + update_baseline: bool, + fail_on_regression: bool, +) -> None: """ - After all connections close, new connections should start fresh watcher. - - Tests the watcher lifecycle: start -> broadcast -> cleanup -> restart. + Verify watcher stops when all connections close, restarts with new connections. + + ## What is Measured + - registered_events after Phase 1 (should be near 0 after cleanup) + - Events received in Phase 2 (watcher must restart to deliver them) + - Final registered_events (should match Phase 1 cleanup) + + ## Why This Matters + Tests the complete watcher lifecycle: + 1. Start: First connection starts the watcher + 2. 
Broadcast: Watcher delivers shutdown signals to all registered events + 3. Cleanup: Last connection removes its event, watcher stops + 4. Restart: New connections restart the watcher + + If cleanup fails, events accumulate indefinitely. If restart fails, new + connections won't receive shutdown signals, causing graceful shutdown to fail. + + ## Methodology + 1. Phase 1: Connect 50 clients, each receives 20 events, then disconnects + 2. Wait 1s for cleanup + 3. Check registered_events (should be near 0) + 4. Phase 2: Connect 50 new clients, each receives 20 events + 5. Wait 1s for cleanup + 6. Verify final state matches Phase 1 post-cleanup + + ## Pass Criteria + - phase1_events > 0 (Phase 1 received events) + - phase2_events > 0 (Phase 2 received events - proves restart worked) + - final_events <= events_after_phase1 + 5 (cleanup works consistently) + - Rationale: If watcher didn't restart in Phase 2, no events would be + delivered. The +5 margin allows for concurrent test interference. 
""" - async def connect_and_consume(n_events: int) -> int: + async def connect_and_consume(n_events: int) -> tuple[int, str | None]: count = 0 try: async with httpx.AsyncClient(timeout=30.0) as client: @@ -124,14 +316,24 @@ async def connect_and_consume(n_events: int) -> int: count += 1 if count >= n_events: break - except Exception: - pass - return count + return count, None + except Exception as e: + return count, str(e) # Phase 1: Connect, consume, disconnect tasks = [asyncio.create_task(connect_and_consume(20)) for _ in range(50)] results = await asyncio.gather(*tasks) - assert sum(results) > 0, "Phase 1 should have received events" + + phase1_events = 0 + for result in results: + if isinstance(result, tuple): + count, error = result + phase1_events += count + metrics_collector.add_client_events(count) + if error: + metrics_collector.record_failure(error) + else: + metrics_collector.record_success() # Wait for cleanup await asyncio.sleep(1) @@ -140,11 +342,22 @@ async def connect_and_consume(n_events: int) -> int: async with httpx.AsyncClient() as client: metrics1 = (await client.get(f"{sse_server_url}/metrics")).json() events_after_phase1 = metrics1["registered_events"] + metrics_collector.add_memory_sample(metrics1["memory_rss_mb"]) # Phase 2: New connections should work tasks = [asyncio.create_task(connect_and_consume(20)) for _ in range(50)] results = await asyncio.gather(*tasks) - assert sum(results) > 0, "Phase 2 should have received events" + + phase2_events = 0 + for result in results: + if isinstance(result, tuple): + count, error = result + phase2_events += count + metrics_collector.add_client_events(count) + if error: + metrics_collector.record_failure(error) + else: + metrics_collector.record_success() # Wait for cleanup await asyncio.sleep(1) @@ -153,7 +366,41 @@ async def connect_and_consume(n_events: int) -> int: async with httpx.AsyncClient() as client: metrics2 = (await client.get(f"{sse_server_url}/metrics")).json() - # Events should be 
cleaned up after both phases + final_events = metrics2["registered_events"] + metrics_collector.add_memory_sample(metrics2["memory_rss_mb"]) + + # Record SSE internals + metrics_collector.set_sse_internals( + watcher_started=metrics2.get("watcher_started", False), + peak_events=max(events_after_phase1, final_events), + final_events=final_events, + ) + + # Generate report + report = metrics_collector.compute_report( + test_name="test_watcher_cleanup_allows_restart", + scale=scale, + duration_minutes=duration_minutes, + ) + register_test_report(report) + + # Compare and output + comparison = baseline_manager.compare(report) + report.comparison = comparison.to_dict() if comparison else None + + report_generator.save_json(report) + report_generator.save_html(report, comparison) + report_generator.print_summary(report, comparison) + + if update_baseline: + baseline_manager.save_baseline(report) + + if fail_on_regression and comparison and comparison.regression_detected: + pytest.fail(f"Regression detected: {comparison.regression_reasons}") + + # Original assertions + assert phase1_events > 0, "Phase 1 should have received events" + assert phase2_events > 0, "Phase 2 should have received events" assert ( - metrics2["registered_events"] <= events_after_phase1 + 5 + final_events <= events_after_phase1 + 5 ), "Event set should be cleaned up between phases" From 1b25b1625fab4099d79393db9886cf1d41bff2ad Mon Sep 17 00:00:00 2001 From: sysid Date: Fri, 2 Jan 2026 13:13:47 +0100 Subject: [PATCH 4/4] refactor(loadtest): replace CLI scale/duration with per-test constants Remove --scale and --duration CLI options from load tests. Each test now defines its own parameters as constants, allowing appropriate values per test type (e.g., shutdown tests need fewer connections). Changes: - conftest.py: remove --scale, --duration options and fixtures - metrics.py: compute duration_minutes internally from actual duration - test_*.py: add explicit NUM_CLIENTS, DURATION_SEC, etc. 
constants - README.md: update CLI options documentation - load-test.yml: remove scale/duration workflow inputs --- .github/workflows/load-test.yml | 22 -------- tests/load/README.md | 48 ++++++++++++++-- tests/load/conftest.py | 24 -------- tests/load/metrics.py | 7 ++- tests/load/test_backpressure.py | 67 +++++++++++----------- tests/load/test_memory_stability.py | 87 +++++++++++++++++------------ tests/load/test_shutdown.py | 33 +++++------ tests/load/test_throughput.py | 69 +++++++++++------------ tests/load/test_watcher_scale.py | 74 +++++++++++++++--------- 9 files changed, 227 insertions(+), 204 deletions(-) diff --git a/.github/workflows/load-test.yml b/.github/workflows/load-test.yml index ec3691e..726c170 100644 --- a/.github/workflows/load-test.yml +++ b/.github/workflows/load-test.yml @@ -3,24 +3,6 @@ name: Load Tests on: workflow_dispatch: inputs: - scale: - description: 'Number of concurrent connections' - required: true - default: '100' - type: choice - options: - - '100' - - '500' - - '1000' - duration: - description: 'Test duration in minutes' - required: true - default: '1' - type: choice - options: - - '1' - - '5' - - '10' update_baseline: description: 'Update baselines after run' required: false @@ -59,8 +41,6 @@ jobs: - name: Run load tests run: | python -m pytest tests/load/ -m "loadtest" \ - --scale=${{ inputs.scale }} \ - --duration=${{ inputs.duration }} \ --output-dir=tests/load/results \ ${{ inputs.update_baseline && '--update-baseline' || '' }} \ ${{ inputs.fail_on_regression && '--fail-on-regression' || '' }} \ @@ -91,8 +71,6 @@ jobs: run: | echo "## Load Test Results" >> $GITHUB_STEP_SUMMARY echo "" >> $GITHUB_STEP_SUMMARY - echo "- **Scale**: ${{ inputs.scale }} concurrent connections" >> $GITHUB_STEP_SUMMARY - echo "- **Duration**: ${{ inputs.duration }} minutes" >> $GITHUB_STEP_SUMMARY echo "- **Commit**: ${{ github.sha }}" >> $GITHUB_STEP_SUMMARY echo "" >> $GITHUB_STEP_SUMMARY echo "### Reports" >> $GITHUB_STEP_SUMMARY diff --git 
a/tests/load/README.md b/tests/load/README.md index 13332a5..9b0343a 100644 --- a/tests/load/README.md +++ b/tests/load/README.md @@ -17,9 +17,6 @@ These tests measure performance characteristics that unit tests cannot capture: # Run load tests locally (requires Docker) make test-load -# Run with custom scale -make test-load PYTEST_ARGS="--scale=500 --duration=5" - # Update baselines after intentional changes make test-load PYTEST_ARGS="--update-baseline" ``` @@ -105,14 +102,15 @@ make test-load PYTEST_ARGS="--fail-on-regression" | Option | Default | Description | |--------|---------|-------------| -| `--scale` | 100 | Concurrent connections | -| `--duration` | 1 | Test duration (minutes) | | `--output-dir` | `tests/load/results` | Report output directory | | `--baselines-dir` | `tests/load/baselines` | Baseline file directory | | `--update-baseline` | False | Save current run as new baseline | | `--fail-on-regression` | False | Exit non-zero if regression detected | | `--regression-threshold` | 20 | Percent change to trigger warning | +**Note**: Test scale (connections, duration) is controlled via constants within each test file. +This allows appropriate parameters per test type (e.g., shutdown tests use fewer connections). 
+ ## Test Categories ### Throughput (`test_throughput.py`) @@ -219,10 +217,48 @@ Features: - Comparison against baseline with delta percentages - Regression/warning highlights +## Server Metrics Endpoint + +The load test server exposes `/metrics` for monitoring: + +```json +{ + "memory_rss_mb": 45.2, + "num_fds": 25, + "num_threads": 8, + "watcher_started": true, + "registered_events": 100, + "uptime_seconds": 30.5 +} +``` + +Key metrics: +- `memory_rss_mb`: Detect memory leaks +- `registered_events`: Verify Issue #152 (should equal active connections) +- `watcher_started`: Confirm single watcher pattern +- `num_fds`: Detect file descriptor leaks + +## Dependencies + +Added to `pyproject.toml` as optional `[loadtest]` group: + +```bash +pip install -e ".[loadtest]" +``` + ## GitHub Actions Integration The workflow (`.github/workflows/load-test.yml`) supports: -- Manual trigger with scale/duration inputs +- Manual trigger via workflow_dispatch - Baseline update option - Regression detection for CI gates - Artifact upload for reports + +## Design Decisions + +| Choice | Rationale | +|--------|-----------| +| httpx-sse + asyncio | Native async SSE client, simple concurrency with asyncio.gather() | +| Docker containers | Isolated environment, reproducible, clean SIGTERM shutdown | +| Manual CI trigger | Load tests are resource-intensive, not suitable for every PR | +| psutil for metrics | Cross-platform, no infrastructure needed, real-time data | diff --git a/tests/load/conftest.py b/tests/load/conftest.py index b70f888..160d2e3 100644 --- a/tests/load/conftest.py +++ b/tests/load/conftest.py @@ -112,18 +112,6 @@ async def async_client() -> httpx.AsyncClient: def pytest_addoption(parser: pytest.Parser) -> None: """Add custom command line options for load tests.""" - parser.addoption( - "--scale", - action="store", - default="100", - help="Number of concurrent connections for load tests", - ) - parser.addoption( - "--duration", - action="store", - default="1", - 
help="Test duration in minutes", - ) parser.addoption( "--output-dir", action="store", @@ -157,18 +145,6 @@ def pytest_addoption(parser: pytest.Parser) -> None: ) -@pytest.fixture -def scale(request: pytest.FixtureRequest) -> int: - """Get the scale (number of connections) for load tests.""" - return int(request.config.getoption("--scale")) - - -@pytest.fixture -def duration_minutes(request: pytest.FixtureRequest) -> int: - """Get the duration in minutes for load tests.""" - return int(request.config.getoption("--duration")) - - @pytest.fixture def output_dir(request: pytest.FixtureRequest) -> Path: """Get the output directory for reports.""" diff --git a/tests/load/metrics.py b/tests/load/metrics.py index c3bfea5..c5929dc 100644 --- a/tests/load/metrics.py +++ b/tests/load/metrics.py @@ -345,9 +345,7 @@ def set_sse_internals( self.peak_registered_events = peak_events self.final_registered_events = final_events - def compute_report( - self, test_name: str, scale: int, duration_minutes: int - ) -> TestReport: + def compute_report(self, test_name: str, scale: int) -> TestReport: """Compute final report from collected samples.""" git_commit, git_branch = _get_git_info() timestamp = datetime.now(timezone.utc).isoformat() @@ -406,6 +404,9 @@ def compute_report( final_registered_events=self.final_registered_events, ) + # Compute duration_minutes from actual test duration + duration_minutes = max(1, int(self.total_duration_sec / 60)) + return TestReport( test_name=test_name, timestamp=timestamp, diff --git a/tests/load/test_backpressure.py b/tests/load/test_backpressure.py index 024adf6..6c598e9 100644 --- a/tests/load/test_backpressure.py +++ b/tests/load/test_backpressure.py @@ -28,8 +28,6 @@ @pytest.mark.loadtest async def test_slow_clients_dont_block_fast_clients( sse_server_url: str, - scale: int, - duration_minutes: int, metrics_collector: MetricsCollector, baseline_manager: BaselineManager, report_generator: ReportGenerator, @@ -54,9 +52,9 @@ async def 
test_slow_clients_dont_block_fast_clients( making the server unusable under mixed load. ## Methodology - 1. Connect 10 "fast" clients (consume events immediately) - 2. Connect 10 "slow" clients (sleep 0.5s after each event) - 3. Run for 10 seconds + 1. Connect NUM_FAST "fast" clients (consume events immediately) + 2. Connect NUM_SLOW "slow" clients (sleep 0.5s after each event) + 3. Run for DURATION_SEC seconds 4. Compare event counts ## Pass Criteria @@ -66,7 +64,10 @@ async def test_slow_clients_dont_block_fast_clients( Slow clients receive ~20 (10s / 0.5s). 5x ratio is conservative. 500 events threshold catches severe throttling. """ - test_duration = 10 # seconds + # Test parameters + NUM_FAST = 10 + NUM_SLOW = 10 + DURATION_SEC = 10 async def fast_client() -> tuple[int, str | None]: """Client that consumes events as fast as possible.""" @@ -79,7 +80,7 @@ async def fast_client() -> tuple[int, str | None]: ) as source: async for _ in source.aiter_sse(): count += 1 - if time.perf_counter() - start >= test_duration: + if time.perf_counter() - start >= DURATION_SEC: break return count, None except Exception as e: @@ -97,7 +98,7 @@ async def slow_client() -> tuple[int, str | None]: async for _ in source.aiter_sse(): await asyncio.sleep(0.5) # Slow processing count += 1 - if time.perf_counter() - start >= test_duration: + if time.perf_counter() - start >= DURATION_SEC: break return count, None except Exception as e: @@ -106,8 +107,8 @@ async def slow_client() -> tuple[int, str | None]: start_time = time.perf_counter() # Mix of fast and slow clients - fast_tasks = [asyncio.create_task(fast_client()) for _ in range(10)] - slow_tasks = [asyncio.create_task(slow_client()) for _ in range(10)] + fast_tasks = [asyncio.create_task(fast_client()) for _ in range(NUM_FAST)] + slow_tasks = [asyncio.create_task(slow_client()) for _ in range(NUM_SLOW)] fast_results = await asyncio.gather(*fast_tasks) slow_results = await asyncio.gather(*slow_tasks) @@ -145,8 +146,7 @@ async def 
slow_client() -> tuple[int, str | None]: # Generate report report = metrics_collector.compute_report( test_name="test_slow_clients_dont_block_fast_clients", - scale=20, # 10 fast + 10 slow - duration_minutes=duration_minutes, + scale=NUM_FAST + NUM_SLOW, ) register_test_report(report) @@ -177,8 +177,6 @@ async def slow_client() -> tuple[int, str | None]: @pytest.mark.loadtest async def test_connection_churn_stability( sse_server_url: str, - scale: int, - duration_minutes: int, metrics_collector: MetricsCollector, baseline_manager: BaselineManager, report_generator: ReportGenerator, @@ -204,7 +202,7 @@ async def test_connection_churn_stability( ## Methodology 1. Record baseline FDs and memory - 2. Create `churn_rate` connections per second for 30 seconds + 2. Create CHURN_RATE connections per second for DURATION_SEC seconds 3. Each connection receives one event and disconnects 4. Sample memory every 5 seconds 5. Record final FDs and memory @@ -217,9 +215,11 @@ async def test_connection_churn_stability( 100MB memory is generous but catches runaway allocation. 90% success rate accounts for expected failures under heavy churn. 
""" - churn_rate = min(100, scale) # connections per second - duration = 30 # seconds - total_connections = churn_rate * duration + # Test parameters + CHURN_RATE = 100 # connections per second + DURATION_SEC = 30 + + total_connections = CHURN_RATE * DURATION_SEC async def quick_connection() -> tuple[bool, str | None]: try: @@ -245,8 +245,8 @@ async def quick_connection() -> tuple[bool, str | None]: # Create connections at target rate successful = 0 - for batch in range(duration): - tasks = [asyncio.create_task(quick_connection()) for _ in range(churn_rate)] + for batch in range(DURATION_SEC): + tasks = [asyncio.create_task(quick_connection()) for _ in range(CHURN_RATE)] results = await asyncio.gather(*tasks, return_exceptions=True) for result in results: @@ -285,8 +285,7 @@ async def quick_connection() -> tuple[bool, str | None]: # Generate report report = metrics_collector.compute_report( test_name="test_connection_churn_stability", - scale=scale, - duration_minutes=duration_minutes, + scale=total_connections, ) register_test_report(report) @@ -327,8 +326,6 @@ async def quick_connection() -> tuple[bool, str | None]: @pytest.mark.loadtest async def test_send_timeout_under_load( sse_server_url: str, - scale: int, - duration_minutes: int, metrics_collector: MetricsCollector, baseline_manager: BaselineManager, report_generator: ReportGenerator, @@ -354,8 +351,8 @@ async def test_send_timeout_under_load( "freeze" (backgrounded, network change) without closing connections. ## Methodology - 1. Connect 5 "frozen" clients (receive one event, then stop reading) - 2. Connect 3 "normal" clients (receive 50 events normally) + 1. Connect NUM_FROZEN "frozen" clients (receive one event, then stop reading) + 2. Connect NUM_NORMAL "normal" clients (receive EVENTS_PER_NORMAL events normally) 3. Wait for normal clients to complete 4. Verify normal clients weren't affected @@ -365,6 +362,10 @@ async def test_send_timeout_under_load( small margin for timing. 
If frozen clients blocked the server, normal clients would timeout or receive far fewer events. """ + # Test parameters + NUM_FROZEN = 5 + NUM_NORMAL = 3 + EVENTS_PER_NORMAL = 50 async def frozen_client() -> tuple[str, float, str | None]: """Client that stops reading after first event (simulates frozen client).""" @@ -385,7 +386,7 @@ async def frozen_client() -> tuple[str, float, str | None]: return "completed", time.perf_counter() - start, None # Start some frozen clients (server has default send_timeout) - frozen_tasks = [asyncio.create_task(frozen_client()) for _ in range(5)] + frozen_tasks = [asyncio.create_task(frozen_client()) for _ in range(NUM_FROZEN)] # Also verify server remains responsive with normal clients async def normal_client() -> tuple[int, str | None]: @@ -397,13 +398,13 @@ async def normal_client() -> tuple[int, str | None]: ) as source: async for _ in source.aiter_sse(): count += 1 - if count >= 50: + if count >= EVENTS_PER_NORMAL: break return count, None except Exception as e: return count, str(e) - normal_tasks = [asyncio.create_task(normal_client()) for _ in range(3)] + normal_tasks = [asyncio.create_task(normal_client()) for _ in range(NUM_NORMAL)] # Wait for normal clients to complete normal_results = await asyncio.gather(*normal_tasks) @@ -441,8 +442,7 @@ async def normal_client() -> tuple[int, str | None]: # Generate report report = metrics_collector.compute_report( test_name="test_send_timeout_under_load", - scale=8, # 5 frozen + 3 normal - duration_minutes=duration_minutes, + scale=NUM_FROZEN + NUM_NORMAL, ) register_test_report(report) @@ -460,7 +460,8 @@ async def normal_client() -> tuple[int, str | None]: if fail_on_regression and comparison and comparison.regression_detected: pytest.fail(f"Regression detected: {comparison.regression_reasons}") - # Original assertion + # TODO: fix percentage + min_expected = EVENTS_PER_NORMAL - 5 # Allow 10% margin assert all( - r >= 45 for r in normal_counts + r >= min_expected for r in 
normal_counts ), f"Normal clients affected by frozen clients: {normal_counts}" diff --git a/tests/load/test_memory_stability.py b/tests/load/test_memory_stability.py index 883b8ad..118c1c7 100644 --- a/tests/load/test_memory_stability.py +++ b/tests/load/test_memory_stability.py @@ -13,6 +13,7 @@ from __future__ import annotations import asyncio +import time import httpx import pytest @@ -27,8 +28,6 @@ @pytest.mark.loadtest async def test_memory_stability_under_load( sse_server_url: str, - scale: int, - duration_minutes: int, metrics_collector: MetricsCollector, baseline_manager: BaselineManager, report_generator: ReportGenerator, @@ -55,7 +54,7 @@ async def test_memory_stability_under_load( ## Methodology 1. Record baseline memory before any connections - 2. Connect `scale` clients, each streaming for `duration_minutes` + 2. Connect NUM_CLIENTS clients, each streaming for DURATION_SEC 3. Sample memory periodically during streaming 4. Compute total growth and growth rate (slope) @@ -66,7 +65,10 @@ async def test_memory_stability_under_load( catching runaway leaks. The slope check catches slow leaks that might stay under the absolute threshold but indicate unbounded growth. 
""" - events_per_client = duration_minutes * 60 * 10 # 10 events/sec + # Test parameters + NUM_CLIENTS = 100 + DURATION_SEC = 60 + EVENTS_PER_CLIENT = DURATION_SEC * 10 # 10 events/sec with 0.1s delay async def client_task(client_id: int) -> tuple[int, str | None]: """Single client consuming SSE events.""" @@ -78,7 +80,7 @@ async def client_task(client_id: int) -> tuple[int, str | None]: ) as source: async for _ in source.aiter_sse(): events_received += 1 - if events_received >= events_per_client: + if events_received >= EVENTS_PER_CLIENT: break return events_received, None except Exception as e: @@ -91,13 +93,13 @@ async def client_task(client_id: int) -> tuple[int, str | None]: metrics_collector.set_memory_baseline(baseline_memory) # Start all clients - tasks = [asyncio.create_task(client_task(i)) for i in range(scale)] + start_time = time.perf_counter() + tasks = [asyncio.create_task(client_task(i)) for i in range(NUM_CLIENTS)] - # Sample memory periodically - sample_interval = max(10, duration_minutes * 6) # At least 10 samples - - for _ in range(sample_interval): - await asyncio.sleep(duration_minutes * 60 / sample_interval) + # Sample memory periodically (at least 10 samples) + num_samples = 10 + for _ in range(num_samples): + await asyncio.sleep(DURATION_SEC / num_samples) try: async with httpx.AsyncClient() as client: metrics = (await client.get(f"{sse_server_url}/metrics")).json() @@ -107,6 +109,7 @@ async def client_task(client_id: int) -> tuple[int, str | None]: # Wait for all clients to complete results = await asyncio.gather(*tasks, return_exceptions=True) + elapsed = time.perf_counter() - start_time # Process results for result in results: @@ -127,13 +130,12 @@ async def client_task(client_id: int) -> tuple[int, str | None]: metrics_collector.set_memory_final(final_memory) # Set duration - metrics_collector.set_duration(duration_minutes * 60) + metrics_collector.set_duration(elapsed) # Generate report report = metrics_collector.compute_report( 
test_name="test_memory_stability_under_load", - scale=scale, - duration_minutes=duration_minutes, + scale=NUM_CLIENTS, ) register_test_report(report) @@ -154,8 +156,8 @@ async def client_task(client_id: int) -> tuple[int, str | None]: # Original assertions completed = metrics_collector.successful_connections assert ( - completed >= scale * 0.9 - ), f"Too many failed connections: {completed}/{scale} completed" + completed >= NUM_CLIENTS * 0.9 + ), f"Too many failed connections: {completed}/{NUM_CLIENTS} completed" if report.memory: assert report.memory.growth_mb < 50, ( @@ -172,8 +174,6 @@ async def client_task(client_id: int) -> tuple[int, str | None]: @pytest.mark.loadtest async def test_memory_returns_to_baseline_after_disconnect( sse_server_url: str, - scale: int, - duration_minutes: int, metrics_collector: MetricsCollector, baseline_manager: BaselineManager, report_generator: ReportGenerator, @@ -200,7 +200,7 @@ async def test_memory_returns_to_baseline_after_disconnect( ## Methodology 1. Record baseline memory - 2. Connect clients in batches, each receiving 50 events then disconnecting + 2. Connect clients in batches, each receiving EVENTS_PER_CLIENT events then disconnecting 3. Wait 2 seconds for cleanup (GC, finalizers) 4. Record final memory and compare to baseline @@ -210,6 +210,10 @@ async def test_memory_returns_to_baseline_after_disconnect( immediately. 20% margin accounts for fragmentation and GC timing while still catching significant retention issues. 
""" + # Test parameters + NUM_CLIENTS = 100 + EVENTS_PER_CLIENT = 50 + BATCH_SIZE = 100 async def client_task(client_id: int) -> tuple[int, str | None]: """Client that connects, receives few events, then disconnects.""" @@ -221,7 +225,7 @@ async def client_task(client_id: int) -> tuple[int, str | None]: count = 0 async for _ in source.aiter_sse(): count += 1 - if count >= 50: + if count >= EVENTS_PER_CLIENT: break return count, None except Exception as e: @@ -233,10 +237,12 @@ async def client_task(client_id: int) -> tuple[int, str | None]: baseline_memory = baseline["memory_rss_mb"] metrics_collector.set_memory_baseline(baseline_memory) + start_time = time.perf_counter() + # Connect and disconnect clients in batches - batch_size = min(100, scale) - for batch_start in range(0, scale, batch_size): - batch_end = min(batch_start + batch_size, scale) + batch_size = min(BATCH_SIZE, NUM_CLIENTS) + for batch_start in range(0, NUM_CLIENTS, batch_size): + batch_end = min(batch_start + batch_size, NUM_CLIENTS) tasks = [ asyncio.create_task(client_task(i)) for i in range(batch_start, batch_end) ] @@ -264,17 +270,19 @@ async def client_task(client_id: int) -> tuple[int, str | None]: # Wait for cleanup await asyncio.sleep(2) + elapsed = time.perf_counter() - start_time + # Check memory returned to near baseline async with httpx.AsyncClient() as client: final = (await client.get(f"{sse_server_url}/metrics")).json() final_memory = final["memory_rss_mb"] metrics_collector.set_memory_final(final_memory) + metrics_collector.set_duration(elapsed) # Generate report report = metrics_collector.compute_report( test_name="test_memory_returns_to_baseline_after_disconnect", - scale=scale, - duration_minutes=duration_minutes, + scale=NUM_CLIENTS, ) register_test_report(report) @@ -303,8 +311,6 @@ async def client_task(client_id: int) -> tuple[int, str | None]: @pytest.mark.loadtest async def test_event_set_cleanup( sse_server_url: str, - scale: int, - duration_minutes: int, 
metrics_collector: MetricsCollector, baseline_manager: BaselineManager, report_generator: ReportGenerator, @@ -332,18 +338,21 @@ async def test_event_set_cleanup( ## Methodology 1. Record baseline `registered_events` count - 2. Connect `scale` clients, wait for connections to establish - 3. Record peak `registered_events` (should be >= scale * 0.2) + 2. Connect NUM_CLIENTS clients, wait for connections to establish + 3. Record peak `registered_events` (should be >= NUM_CLIENTS * 0.2) 4. Wait for all connections to close + 2s cleanup 5. Record final `registered_events` (should return near baseline) ## Pass Criteria - - Peak events >= scale * 0.2 (events were registered) + - Peak events >= NUM_CLIENTS * 0.2 (events were registered) - Final events <= baseline + 10 (events were cleaned up) - Rationale: We expect most (not all) connections to register events. After cleanup, the set should be nearly empty. The +10 margin allows for concurrent test interference. """ + # Test parameters + NUM_CLIENTS = 100 + EVENTS_PER_CLIENT = 5 connected = asyncio.Event() connection_count = 0 @@ -356,12 +365,12 @@ async def client_task() -> tuple[int, str | None]: client, "GET", f"{sse_server_url}/sse?delay=0.5" ) as source: connection_count += 1 - if connection_count >= scale * 0.5: + if connection_count >= NUM_CLIENTS * 0.5: connected.set() count = 0 async for _ in source.aiter_sse(): count += 1 - if count >= 5: # Stay connected for ~2.5s + if count >= EVENTS_PER_CLIENT: # Stay connected for ~2.5s break return count, None except Exception as e: @@ -374,8 +383,10 @@ async def client_task() -> tuple[int, str | None]: baseline_memory = baseline["memory_rss_mb"] metrics_collector.set_memory_baseline(baseline_memory) + start_time = time.perf_counter() + # Connect many clients - tasks = [asyncio.create_task(client_task()) for _ in range(scale)] + tasks = [asyncio.create_task(client_task()) for _ in range(NUM_CLIENTS)] # Wait for connections to establish (with timeout) try: @@ -406,11 +417,14 
@@ async def client_task() -> tuple[int, str | None]: await asyncio.sleep(2) # Allow cleanup time + elapsed = time.perf_counter() - start_time + # Check events cleaned up async with httpx.AsyncClient() as client: final = (await client.get(f"{sse_server_url}/metrics")).json() final_events = final["registered_events"] metrics_collector.set_memory_final(final["memory_rss_mb"]) + metrics_collector.set_duration(elapsed) # Record SSE internals metrics_collector.set_sse_internals( @@ -422,8 +436,7 @@ async def client_task() -> tuple[int, str | None]: # Generate report report = metrics_collector.compute_report( test_name="test_event_set_cleanup", - scale=scale, - duration_minutes=duration_minutes, + scale=NUM_CLIENTS, ) register_test_report(report) @@ -442,8 +455,8 @@ async def client_task() -> tuple[int, str | None]: pytest.fail(f"Regression detected: {comparison.regression_reasons}") # Original assertions - assert peak_events >= scale * 0.2, ( - f"Expected at least {scale * 0.2} events registered during peak, " + assert peak_events >= NUM_CLIENTS * 0.2, ( + f"Expected at least {NUM_CLIENTS * 0.2} events registered during peak, " f"got {peak_events}" ) assert final_events <= baseline_events + 10, ( diff --git a/tests/load/test_shutdown.py b/tests/load/test_shutdown.py index 406a4a3..172594f 100644 --- a/tests/load/test_shutdown.py +++ b/tests/load/test_shutdown.py @@ -30,8 +30,6 @@ @pytest.mark.loadtest async def test_graceful_shutdown_with_active_connections( docker_available: bool, - scale: int, - duration_minutes: int, metrics_collector: MetricsCollector, baseline_manager: BaselineManager, report_generator: ReportGenerator, @@ -60,7 +58,7 @@ async def test_graceful_shutdown_with_active_connections( ## Methodology 1. Start server in Docker container - 2. Connect `scale` concurrent SSE clients + 2. Connect NUM_CLIENTS concurrent SSE clients 3. Wait for connections to establish (~2s) 4. Send SIGTERM to container 5. 
Measure time until all connections close @@ -72,6 +70,9 @@ async def test_graceful_shutdown_with_active_connections( - Rationale: 90% accounts for race conditions in test timing. 10s is generous but catches hangs. Production should complete in <5s. """ + # Test parameters + NUM_CLIENTS = 100 + if not docker_available: pytest.skip("Docker not available") @@ -114,7 +115,7 @@ async def client_task() -> tuple[str, str | None]: return f"error:{type(e).__name__}", str(e) # Start clients - tasks = [asyncio.create_task(client_task()) for _ in range(scale)] + tasks = [asyncio.create_task(client_task()) for _ in range(NUM_CLIENTS)] # Wait for connections to establish await asyncio.sleep(2) @@ -169,8 +170,7 @@ async def client_task() -> tuple[str, str | None]: # Generate report report = metrics_collector.compute_report( test_name="test_graceful_shutdown_with_active_connections", - scale=scale, - duration_minutes=duration_minutes, + scale=NUM_CLIENTS, ) register_test_report(report) @@ -191,16 +191,14 @@ async def client_task() -> tuple[str, str | None]: # Original assertions total_closed = clean_closes + server_closes + errors assert ( - total_closed >= scale * 0.9 - ), f"Not all connections closed: {total_closed}/{scale}" + total_closed >= NUM_CLIENTS * 0.9 + ), f"Not all connections closed: {total_closed}/{NUM_CLIENTS}" assert shutdown_time < 10, f"Shutdown took {shutdown_time:.1f}s, expected < 10s" @pytest.mark.loadtest async def test_connections_receive_shutdown_signal( docker_available: bool, - scale: int, - duration_minutes: int, metrics_collector: MetricsCollector, baseline_manager: BaselineManager, report_generator: ReportGenerator, @@ -227,7 +225,7 @@ async def test_connections_receive_shutdown_signal( ## Methodology 1. Start server in Docker container - 2. Connect 10 clients to /sse?delay=0.5 (slow stream to keep connections active) + 2. Connect NUM_CLIENTS clients to /sse?delay=0.5 (slow stream to keep connections active) 3. Wait 3s for clients to receive events 4. 
Send SIGTERM 5. Wait for clients to notice stream end @@ -240,6 +238,10 @@ async def test_connections_receive_shutdown_signal( reached 20, they weren't interrupted. This proves the shutdown signal propagated through the watcher to active streams. """ + # Test parameters + NUM_CLIENTS = 10 + MAX_EVENTS_PER_CLIENT = 20 + if not docker_available: pytest.skip("Docker not available") @@ -261,13 +263,13 @@ async def client_task() -> tuple[int, str | None]: ) as source: async for _ in source.aiter_sse(): count += 1 - if count >= 20: # Should not reach this + if count >= MAX_EVENTS_PER_CLIENT: # Should not reach this break return count, None except Exception as e: return count, str(e) - tasks = [asyncio.create_task(client_task()) for _ in range(10)] + tasks = [asyncio.create_task(client_task()) for _ in range(NUM_CLIENTS)] # Let them receive a few events await asyncio.sleep(3) @@ -311,8 +313,7 @@ async def client_task() -> tuple[int, str | None]: # Generate report report = metrics_collector.compute_report( test_name="test_connections_receive_shutdown_signal", - scale=10, # Fixed scale for this test - duration_minutes=duration_minutes, + scale=NUM_CLIENTS, ) register_test_report(report) @@ -333,5 +334,5 @@ async def client_task() -> tuple[int, str | None]: # Original assertions assert total_events > 0, "Clients should have received events before shutdown" assert all( - c < 20 for c in event_counts + c < MAX_EVENTS_PER_CLIENT for c in event_counts ), "Clients should have been interrupted by shutdown" diff --git a/tests/load/test_throughput.py b/tests/load/test_throughput.py index 7f47fbc..d395791 100644 --- a/tests/load/test_throughput.py +++ b/tests/load/test_throughput.py @@ -28,8 +28,6 @@ @pytest.mark.loadtest async def test_throughput_single_client( sse_server_url: str, - scale: int, - duration_minutes: int, metrics_collector: MetricsCollector, baseline_manager: BaselineManager, report_generator: ReportGenerator, @@ -60,9 +58,11 @@ async def 
test_throughput_single_client( scheduling. 1000 events/sec is achievable on any modern system and leaves headroom for real-world latency. """ + # Test parameters + DURATION_SEC = 10 + events_received = 0 start_time = time.perf_counter() - duration_seconds = 10 async with httpx.AsyncClient(timeout=60.0) as client: async with aconnect_sse( @@ -70,7 +70,7 @@ async def test_throughput_single_client( ) as source: async for _ in source.aiter_sse(): events_received += 1 - if time.perf_counter() - start_time >= duration_seconds: + if time.perf_counter() - start_time >= DURATION_SEC: break elapsed = time.perf_counter() - start_time @@ -84,8 +84,7 @@ async def test_throughput_single_client( # Generate report report = metrics_collector.compute_report( test_name="test_throughput_single_client", - scale=1, # Single client test - duration_minutes=duration_minutes, + scale=1, ) register_test_report(report) @@ -113,8 +112,6 @@ async def test_throughput_single_client( @pytest.mark.loadtest async def test_throughput_multiple_clients( sse_server_url: str, - scale: int, - duration_minutes: int, metrics_collector: MetricsCollector, baseline_manager: BaselineManager, report_generator: ReportGenerator, @@ -137,18 +134,19 @@ async def test_throughput_multiple_clients( - Event loop blocking under concurrent I/O ## Methodology - 1. Launch `scale` concurrent client tasks (default 100) + 1. Launch NUM_CLIENTS concurrent client tasks 2. Each client connects to /sse?delay=0.001 (1ms between events) - 3. Run for 30 seconds, counting events per client + 3. Run for DURATION_SEC seconds, counting events per client 4. Sum total events and calculate aggregate throughput ## Pass Criteria - - Aggregate throughput >= min(10000, scale * 100) events/sec + - Aggregate throughput >= min(10000, NUM_CLIENTS * 100) events/sec - Rationale: With 1ms delay, each client should receive ~1000 events/sec. - With 100 clients, expect ~100K events/sec total. The min() handles - smaller scale values gracefully. 
+ With 100 clients, expect ~100K events/sec total. """ - duration_seconds = 30 + # Test parameters + NUM_CLIENTS = 100 + DURATION_SEC = 30 async def client_task() -> tuple[int, str | None]: """Run client and return (event_count, error_or_none).""" @@ -161,14 +159,14 @@ async def client_task() -> tuple[int, str | None]: ) as source: async for _ in source.aiter_sse(): count += 1 - if time.perf_counter() - start >= duration_seconds: + if time.perf_counter() - start >= DURATION_SEC: break return count, None except Exception as e: return count, str(e) start_time = time.perf_counter() - tasks = [asyncio.create_task(client_task()) for _ in range(scale)] + tasks = [asyncio.create_task(client_task()) for _ in range(NUM_CLIENTS)] results = await asyncio.gather(*tasks, return_exceptions=True) elapsed = time.perf_counter() - start_time @@ -191,8 +189,7 @@ async def client_task() -> tuple[int, str | None]: # Generate report report = metrics_collector.compute_report( test_name="test_throughput_multiple_clients", - scale=scale, - duration_minutes=duration_minutes, + scale=NUM_CLIENTS, ) register_test_report(report) @@ -212,9 +209,9 @@ async def client_task() -> tuple[int, str | None]: # Original assertion aggregate_throughput = total_events / elapsed - min_expected = min(10000, scale * 100) + min_expected = min(10000, NUM_CLIENTS * 100) assert aggregate_throughput >= min_expected, ( - f"Aggregate throughput {aggregate_throughput:.0f} events/sec with {scale} " + f"Aggregate throughput {aggregate_throughput:.0f} events/sec with {NUM_CLIENTS} " f"clients, expected >= {min_expected}" ) @@ -222,8 +219,6 @@ async def client_task() -> tuple[int, str | None]: @pytest.mark.loadtest async def test_first_event_latency( sse_server_url: str, - scale: int, - duration_minutes: int, metrics_collector: MetricsCollector, baseline_manager: BaselineManager, report_generator: ReportGenerator, @@ -249,7 +244,7 @@ async def test_first_event_latency( - Inefficient task group initialization ## 
Methodology - 1. Launch `scale` concurrent connection attempts simultaneously + 1. Launch NUM_CLIENTS concurrent connection attempts simultaneously 2. Each client measures time from connect() to first SSE event 3. Collect latency samples and compute percentiles @@ -257,8 +252,9 @@ async def test_first_event_latency( - p50 < 1250ms, p99 < 2500ms - Calibrated from measured p50=932ms, p99=1779ms at scale=100 - Threshold factor: 1.3x measured values - """ + # Test parameters + NUM_CLIENTS = 100 async def measure_ttfe() -> tuple[float, str | None]: start = time.perf_counter() @@ -274,7 +270,7 @@ async def measure_ttfe() -> tuple[float, str | None]: return -1, "no events received" start_time = time.perf_counter() - tasks = [asyncio.create_task(measure_ttfe()) for _ in range(scale)] + tasks = [asyncio.create_task(measure_ttfe()) for _ in range(NUM_CLIENTS)] results = await asyncio.gather(*tasks) elapsed = time.perf_counter() - start_time @@ -293,8 +289,7 @@ async def measure_ttfe() -> tuple[float, str | None]: # Generate report report = metrics_collector.compute_report( test_name="test_first_event_latency", - scale=scale, - duration_minutes=duration_minutes, + scale=NUM_CLIENTS, ) register_test_report(report) @@ -313,8 +308,8 @@ async def measure_ttfe() -> tuple[float, str | None]: pytest.fail(f"Regression detected: {comparison.regression_reasons}") # Original assertions - if len(latencies) < scale * 0.9: - pytest.fail(f"Too many failed connections: {len(latencies)}/{scale}") + if len(latencies) < NUM_CLIENTS * 0.9: + pytest.fail(f"Too many failed connections: {len(latencies)}/{NUM_CLIENTS}") latencies.sort() p50 = latencies[len(latencies) // 2] @@ -327,8 +322,6 @@ async def measure_ttfe() -> tuple[float, str | None]: @pytest.mark.loadtest async def test_event_latency_under_load( sse_server_url: str, - scale: int, - duration_minutes: int, metrics_collector: MetricsCollector, baseline_manager: BaselineManager, report_generator: ReportGenerator, @@ -354,8 +347,8 @@ async 
def test_event_latency_under_load( tail latency issues that degrade user experience. ## Methodology - 1. Launch `scale` concurrent clients to /sse?delay=0.01 (10ms between events) - 2. Each client receives 100 events and records inter-event times + 1. Launch NUM_CLIENTS concurrent clients to /sse?delay=0.01 (10ms between events) + 2. Each client receives EVENTS_PER_CLIENT events and records inter-event times 3. Aggregate all latency samples and compute percentiles ## Pass Criteria @@ -363,6 +356,9 @@ async def test_event_latency_under_load( - Calibrated from measured p50=14.8ms, p95=21.4ms, p99=27.4ms at scale=100 - Server delay: 10ms. Threshold factor: 1.3x measured values """ + # Test parameters + NUM_CLIENTS = 100 + EVENTS_PER_CLIENT = 100 async def measure_latencies() -> tuple[list[float], str | None]: latencies: list[float] = [] @@ -379,14 +375,14 @@ async def measure_latencies() -> tuple[list[float], str | None]: latencies.append((now - last_time) * 1000) last_time = now count += 1 - if count >= 100: + if count >= EVENTS_PER_CLIENT: break return latencies, None except Exception as e: return latencies, str(e) start_time = time.perf_counter() - tasks = [asyncio.create_task(measure_latencies()) for _ in range(scale)] + tasks = [asyncio.create_task(measure_latencies()) for _ in range(NUM_CLIENTS)] results = await asyncio.gather(*tasks) elapsed = time.perf_counter() - start_time @@ -406,8 +402,7 @@ async def measure_latencies() -> tuple[list[float], str | None]: # Generate report report = metrics_collector.compute_report( test_name="test_event_latency_under_load", - scale=scale, - duration_minutes=duration_minutes, + scale=NUM_CLIENTS, ) register_test_report(report) diff --git a/tests/load/test_watcher_scale.py b/tests/load/test_watcher_scale.py index 67cc28f..e71f9cd 100644 --- a/tests/load/test_watcher_scale.py +++ b/tests/load/test_watcher_scale.py @@ -15,6 +15,7 @@ from __future__ import annotations import asyncio +import time import httpx import pytest @@ 
-29,8 +30,6 @@ @pytest.mark.loadtest async def test_single_watcher_with_many_connections( sse_server_url: str, - scale: int, - duration_minutes: int, metrics_collector: MetricsCollector, baseline_manager: BaselineManager, report_generator: ReportGenerator, @@ -42,7 +41,7 @@ async def test_single_watcher_with_many_connections( ## What is Measured - `watcher_started` flag from /metrics (True = watcher exists) - - `registered_events` count (should be >= scale * 0.5) + - `registered_events` count (should be >= NUM_CLIENTS * 0.5) - Implicit: CPU usage would spike if multiple watchers existed (not measured) ## Why This Matters @@ -57,18 +56,21 @@ async def test_single_watcher_with_many_connections( - Constant CPU overhead regardless of connection count ## Methodology - 1. Connect `scale` concurrent clients (default 100) + 1. Connect NUM_CLIENTS concurrent clients 2. Wait for connections to establish (~2s) 3. Query /metrics for watcher_started and registered_events 4. Cancel all connections ## Pass Criteria - watcher_started = True (watcher exists for active connections) - - registered_events >= scale * 0.5 (most connections registered) + - registered_events >= NUM_CLIENTS * 0.5 (most connections registered) - Rationale: watcher_started=True confirms the mechanism works. Event count verifies registration worked. We don't directly measure watcher count, but CPU metrics in CI would catch proliferation. 
""" + # Test parameters + NUM_CLIENTS = 100 + HOLD_DURATION_SEC = 5 async def client_task() -> tuple[int, str | None]: try: @@ -77,14 +79,16 @@ async def client_task() -> tuple[int, str | None]: client, "GET", f"{sse_server_url}/sse?delay=0.1" ) as source: async for _ in source.aiter_sse(): - await asyncio.sleep(5) # Stay connected + await asyncio.sleep(HOLD_DURATION_SEC) # Stay connected break return 1, None except Exception as e: return 0, str(e) + start_time = time.perf_counter() + # Start many connections - tasks = [asyncio.create_task(client_task()) for _ in range(scale)] + tasks = [asyncio.create_task(client_task()) for _ in range(NUM_CLIENTS)] # Wait for connections to establish await asyncio.sleep(2) @@ -102,6 +106,9 @@ async def client_task() -> tuple[int, str | None]: task.cancel() results = await asyncio.gather(*tasks, return_exceptions=True) + elapsed = time.perf_counter() - start_time + metrics_collector.set_duration(elapsed) + # Process results for result in results: if isinstance(result, Exception): @@ -123,8 +130,7 @@ async def client_task() -> tuple[int, str | None]: # Generate report report = metrics_collector.compute_report( test_name="test_single_watcher_with_many_connections", - scale=scale, - duration_minutes=duration_minutes, + scale=NUM_CLIENTS, ) register_test_report(report) @@ -145,15 +151,13 @@ async def client_task() -> tuple[int, str | None]: # Original assertions assert watcher_started is True, "Watcher should be started with active connections" assert ( - registered_events >= scale * 0.5 - ), f"Expected at least {scale * 0.5} events, got {registered_events}" + registered_events >= NUM_CLIENTS * 0.5 + ), f"Expected at least {NUM_CLIENTS * 0.5} events, got {registered_events}" @pytest.mark.loadtest async def test_rapid_connect_disconnect_watcher_stability( sse_server_url: str, - scale: int, - duration_minutes: int, metrics_collector: MetricsCollector, baseline_manager: BaselineManager, report_generator: ReportGenerator, @@ -178,7 +182,7 
@@ async def test_rapid_connect_disconnect_watcher_stability( connections closed quickly, watchers accumulated and never stopped. ## Methodology - 1. Run `scale / 10` batches of 10 quick connections each + 1. Run NUM_BATCHES batches of BATCH_SIZE quick connections each 2. Each connection receives 1 event and disconnects immediately 3. After all batches, check thread count and watcher status @@ -188,6 +192,9 @@ async def test_rapid_connect_disconnect_watcher_stability( accumulated, we'd see hundreds of threads (one per watcher task). 50 provides margin for legitimate worker threads. """ + # Test parameters + NUM_BATCHES = 10 + BATCH_SIZE = 10 async def quick_connect() -> tuple[int, str | None]: try: @@ -201,9 +208,11 @@ async def quick_connect() -> tuple[int, str | None]: except Exception as e: return 0, str(e) + start_time = time.perf_counter() + # Rapid connect/disconnect cycles - for batch in range(scale // 10): - tasks = [asyncio.create_task(quick_connect()) for _ in range(10)] + for _ in range(NUM_BATCHES): + tasks = [asyncio.create_task(quick_connect()) for _ in range(BATCH_SIZE)] results = await asyncio.gather(*tasks, return_exceptions=True) for result in results: @@ -220,6 +229,9 @@ async def quick_connect() -> tuple[int, str | None]: # Brief pause await asyncio.sleep(0.5) + elapsed = time.perf_counter() - start_time + metrics_collector.set_duration(elapsed) + # Check metrics - watcher should still be singular async with httpx.AsyncClient() as client: metrics = (await client.get(f"{sse_server_url}/metrics")).json() @@ -237,8 +249,7 @@ async def quick_connect() -> tuple[int, str | None]: # Generate report report = metrics_collector.compute_report( test_name="test_rapid_connect_disconnect_watcher_stability", - scale=scale, - duration_minutes=duration_minutes, + scale=NUM_BATCHES * BATCH_SIZE, ) register_test_report(report) @@ -263,8 +274,6 @@ async def quick_connect() -> tuple[int, str | None]: @pytest.mark.loadtest async def 
test_watcher_cleanup_allows_restart( sse_server_url: str, - scale: int, - duration_minutes: int, metrics_collector: MetricsCollector, baseline_manager: BaselineManager, report_generator: ReportGenerator, @@ -290,10 +299,10 @@ async def test_watcher_cleanup_allows_restart( connections won't receive shutdown signals, causing graceful shutdown to fail. ## Methodology - 1. Phase 1: Connect 50 clients, each receives 20 events, then disconnects + 1. Phase 1: Connect CLIENTS_PER_PHASE clients, each receives EVENTS_PER_CLIENT events, then disconnects 2. Wait 1s for cleanup 3. Check registered_events (should be near 0) - 4. Phase 2: Connect 50 new clients, each receives 20 events + 4. Phase 2: Connect CLIENTS_PER_PHASE new clients, each receives EVENTS_PER_CLIENT events 5. Wait 1s for cleanup 6. Verify final state matches Phase 1 post-cleanup @@ -304,6 +313,9 @@ async def test_watcher_cleanup_allows_restart( - Rationale: If watcher didn't restart in Phase 2, no events would be delivered. The +5 margin allows for concurrent test interference. 
""" + # Test parameters + CLIENTS_PER_PHASE = 50 + EVENTS_PER_CLIENT = 20 async def connect_and_consume(n_events: int) -> tuple[int, str | None]: count = 0 @@ -320,8 +332,13 @@ async def connect_and_consume(n_events: int) -> tuple[int, str | None]: except Exception as e: return count, str(e) + start_time = time.perf_counter() + # Phase 1: Connect, consume, disconnect - tasks = [asyncio.create_task(connect_and_consume(20)) for _ in range(50)] + tasks = [ + asyncio.create_task(connect_and_consume(EVENTS_PER_CLIENT)) + for _ in range(CLIENTS_PER_PHASE) + ] results = await asyncio.gather(*tasks) phase1_events = 0 @@ -345,7 +362,10 @@ async def connect_and_consume(n_events: int) -> tuple[int, str | None]: metrics_collector.add_memory_sample(metrics1["memory_rss_mb"]) # Phase 2: New connections should work - tasks = [asyncio.create_task(connect_and_consume(20)) for _ in range(50)] + tasks = [ + asyncio.create_task(connect_and_consume(EVENTS_PER_CLIENT)) + for _ in range(CLIENTS_PER_PHASE) + ] results = await asyncio.gather(*tasks) phase2_events = 0 @@ -362,6 +382,9 @@ async def connect_and_consume(n_events: int) -> tuple[int, str | None]: # Wait for cleanup await asyncio.sleep(1) + elapsed = time.perf_counter() - start_time + metrics_collector.set_duration(elapsed) + # Verify clean state async with httpx.AsyncClient() as client: metrics2 = (await client.get(f"{sse_server_url}/metrics")).json() @@ -379,8 +402,7 @@ async def connect_and_consume(n_events: int) -> tuple[int, str | None]: # Generate report report = metrics_collector.compute_report( test_name="test_watcher_cleanup_allows_restart", - scale=scale, - duration_minutes=duration_minutes, + scale=CLIENTS_PER_PHASE * 2, ) register_test_report(report)