diff --git a/.github/copilot/instructions/api-retry-strategy.md b/.github/copilot/instructions/api-retry-strategy.md new file mode 100644 index 0000000..ccddefb --- /dev/null +++ b/.github/copilot/instructions/api-retry-strategy.md @@ -0,0 +1,60 @@ +# API Retry Strategy Guide + +## Performance Context + +Control D API has strict rate limits. The sync tool retries failed requests with exponential backoff to handle transient failures (network issues, temporary server errors) while respecting API constraints. + +## Current Implementation + +**Location:** `main.py::_retry_request()` (line ~865) + +**Key characteristics:** +- Max retries: 10 attempts (configurable via `MAX_RETRIES`) +- Base delay: 1 second (configurable via `RETRY_DELAY`) +- Exponential backoff: `delay * (2^attempt)` → 1s, 2s, 4s, 8s, 16s, ... +- Smart error handling: Don't retry 4xx errors except 429 (rate limit) +- Security-aware: Sanitizes error messages in logs + +## Jitter Pattern (Recommended) + +**Why jitter matters:** +When multiple requests fail simultaneously (e.g., API outage), synchronized retries create "thundering herd" - all clients retry at exact same time, overwhelming the recovering server. Jitter randomizes retry timing to spread load. + +**Implementation formula:** +```python +import random +wait_time = (delay * (2 ** attempt)) * (0.5 + random.random()) +``` + +This adds ±50% randomness: a 4s backoff becomes 2-6s range. + +**Maintainer rationale (from discussion #219):** +> "API rate limits are non-negotiable. Serial processing exists because I got burned by 429s and zombie states in production. Any retry improvement needs rock-solid rate limit awareness."
+ +## Testing Approach + +**Unit tests:** +- Verify jitter stays within bounds (0.5x to 1.5x base delay) +- Confirm 4xx errors (except 429) still don't retry +- Check max retries still respected + +**Integration tests:** +- Simulate transient failures (mock server returning 500s) +- Measure retry timing distribution (should show variance) +- Confirm eventual success after transient errors + +**Performance validation:** +No performance degradation expected - jitter adds microseconds of `random()` overhead per retry, negligible compared to network I/O. + +## Common Pitfalls + +1. **Don't add jitter to initial request** - only to retries. First attempt should be immediate. +2. **Don't exceed max backoff** - cap total wait time to prevent indefinite delays. +3. **Be cautious with 429 responses** - current behavior still uses jittered exponential backoff; once `Retry-After` handling is implemented, jitter should be disabled in favor of the header-specified delay. +4. **Don't break existing behavior** - ensure 4xx non-retryable errors still fail fast. + +## Future Improvements + +- **Rate limit header parsing:** Implement proper `Retry-After` handling for 429 responses (and bypass jitter when a valid `Retry-After` value is present) +- **Circuit breaker:** Stop retrying after consecutive failures to prevent cascading failures +- **Per-endpoint tracking:** Different backoff strategies for read vs. write operations diff --git a/benchmark_retry_jitter.py b/benchmark_retry_jitter.py new file mode 100644 index 0000000..3095467 --- /dev/null +++ b/benchmark_retry_jitter.py @@ -0,0 +1,133 @@ +#!/usr/bin/env python3 +""" +Synthetic benchmark to demonstrate retry jitter behavior. + +Run this script to see how jitter randomizes retry delays compared to +deterministic exponential backoff. 
+ +Usage: python3 benchmark_retry_jitter.py +""" + +import random +from typing import List + +def simulate_retries_without_jitter(max_retries: int, base_delay: float) -> List[float]: + """Simulate retry delays WITHOUT jitter (old behavior).""" + delays = [] + for attempt in range(max_retries - 1): + wait_time = base_delay * (2 ** attempt) + delays.append(wait_time) + return delays + +def simulate_retries_with_jitter(max_retries: int, base_delay: float) -> List[float]: + """Simulate retry delays WITH jitter (new behavior).""" + delays = [] + for attempt in range(max_retries - 1): + base_wait = base_delay * (2 ** attempt) + jitter_factor = 0.5 + random.random() # [0.5, 1.5] + wait_time = base_wait * jitter_factor + delays.append(wait_time) + return delays + +def main(): + print("=" * 60) + print("Retry Jitter Performance Demonstration") + print("=" * 60) + print() + + max_retries = 5 + base_delay = 1.0 + + print(f"Configuration: max_retries={max_retries}, base_delay={base_delay}s") + print() + + # Without jitter (deterministic) + print("WITHOUT JITTER (old behavior):") + print("All clients retry at exactly the same time (thundering herd)") + print() + without_jitter = simulate_retries_without_jitter(max_retries, base_delay) + for i, delay in enumerate(without_jitter): + print(f" Attempt {i+1}: {delay:6.2f}s") + print(f" Total: {sum(without_jitter):6.2f}s") + print() + + # With jitter (randomized) + print("WITH JITTER (new behavior):") + print("Retries spread across time window, reducing server load spikes") + print() + + # Run 3 simulations to show variance + for run in range(3): + print(f" Run {run+1}:") + with_jitter = simulate_retries_with_jitter(max_retries, base_delay) + for i, delay in enumerate(with_jitter): + base = base_delay * (2 ** i) + print(f" Attempt {i+1}: {delay:6.2f}s (base: {base:4.1f}s, range: [{base*0.5:.1f}s, {base*1.5:.1f}s])") + print(f" Total: {sum(with_jitter):6.2f}s") + print() + + # Statistical analysis + print("IMPACT ANALYSIS:") + print() 
+ + # Simulate thundering herd scenario + num_clients = 100 + print(f"Scenario: {num_clients} clients all fail at the same time") + print() + + print("WITHOUT JITTER:") + print(f" At t=1s: ALL {num_clients} clients retry simultaneously → server overload") + print(f" At t=2s: ALL {num_clients} clients retry simultaneously → server overload") + print(f" At t=4s: ALL {num_clients} clients retry simultaneously → server overload") + print() + + print("WITH JITTER:") + # Simulate retry distribution + retry_times = [] + for _ in range(num_clients): + first_retry = (base_delay * (0.5 + random.random())) + retry_times.append(first_retry) + + retry_times.sort() + min_time = min(retry_times) + max_time = max(retry_times) + avg_time = sum(retry_times) / len(retry_times) + + print(f" First retry window: {min_time:.2f}s to {max_time:.2f}s (spread: {max_time - min_time:.2f}s)") + print(f" Average first retry: {avg_time:.2f}s") + print(f" Retries distributed over time → reduced peak load on server") + print() + + # Calculate approximate load reduction based on bucketed concurrency + print("THEORETICAL LOAD REDUCTION:") + window_size = max_time - min_time + if window_size > 0: + # Use small time buckets (e.g., 100ms) to approximate peak concurrent retries + bucket_size = 0.1 # seconds + num_buckets = max(1, int(window_size / bucket_size) + 1) + buckets = [0] * num_buckets + + # Count how many retries fall into each time bucket + for t in retry_times: + # Normalize to start of window and compute bucket index + idx = int((t - min_time) / bucket_size) + if idx >= num_buckets: + # Clamp to last bucket to handle any floating-point edge cases + idx = num_buckets - 1 + buckets[idx] += 1 + + peak_with_jitter = max(buckets) + peak_without_jitter = num_clients # all clients retry together without jitter + reduction = (1 - (peak_with_jitter / peak_without_jitter)) * 100 + + print(f" Approximate peak concurrent retries with jitter: {peak_with_jitter} (per {bucket_size:.1f}s)") + print(f" Peak 
concurrent retries reduced by approximately {reduction:.0f}%") + else: + # In the extremely unlikely case that all retries occur at the same instant + print(" All retries occurred at the same time; no observable spreading in this run.") + print() + + print("✅ Jitter prevents thundering herd and improves system reliability") + +if __name__ == "__main__": + main() diff --git a/main.py b/main.py index 1f4f047..7fd1f95 100644 --- a/main.py +++ b/main.py @@ -21,6 +21,7 @@ import logging import os import platform +import random import re import shutil import socket @@ -864,10 +865,18 @@ def _retry_request(request_func, max_retries=MAX_RETRIES, delay=RETRY_DELAY): if hasattr(e, "response") and e.response is not None: log.debug(f"Response content: {sanitize_for_log(e.response.text)}") raise - wait_time = delay * (2**attempt) + + # Exponential backoff with jitter to prevent thundering herd + # Base delay: delay * (2^attempt) gives exponential growth + # Jitter: multiply by random factor in range [0.5, 1.5] to spread retries + # This prevents multiple failed requests from retrying simultaneously + base_wait = delay * (2**attempt) + jitter_factor = 0.5 + random.random() # Random value between 0.5 and 1.5 + wait_time = base_wait * jitter_factor + log.warning( f"Request failed (attempt {attempt + 1}/{max_retries}): " - f"{sanitize_for_log(e)}. Retrying in {wait_time}s..." + f"{sanitize_for_log(e)}. Retrying in {wait_time:.2f}s..." ) time.sleep(wait_time) diff --git a/tests/test_retry_jitter.py b/tests/test_retry_jitter.py new file mode 100644 index 0000000..e1e0435 --- /dev/null +++ b/tests/test_retry_jitter.py @@ -0,0 +1,181 @@ +""" +Tests for retry logic with exponential backoff and jitter. + +These tests verify that: +1. Jitter randomizes retry delays to prevent thundering herd +2. Exponential backoff still functions correctly +3. Non-retryable errors (4xx except 429) fail fast +4. 
Max retries limit is respected +""" + +from unittest.mock import Mock, patch +import pytest +import httpx + + +# Import functions under test +import main + + +class TestRetryJitter: + """Tests for jitter in exponential backoff retry logic.""" + + def test_jitter_adds_randomness_to_retry_delays(self): + """Verify that retry delays include jitter and aren't identical.""" + request_func = Mock(side_effect=httpx.TimeoutException("Connection timeout")) + + # Collect actual wait times across multiple retry sequences + wait_times_run1 = [] + wait_times_run2 = [] + + with patch('time.sleep') as mock_sleep: + # First run + try: + main._retry_request(request_func, max_retries=3, delay=1) + except httpx.TimeoutException: + pass + wait_times_run1 = [call.args[0] for call in mock_sleep.call_args_list] + + with patch('time.sleep') as mock_sleep: + # Second run with fresh mock + request_func.side_effect = httpx.TimeoutException("Connection timeout") + try: + main._retry_request(request_func, max_retries=3, delay=1) + except httpx.TimeoutException: + pass + wait_times_run2 = [call.args[0] for call in mock_sleep.call_args_list] + + # Both runs should have same number of retries (2 retries for 3 max_retries) + assert len(wait_times_run1) == 2 + assert len(wait_times_run2) == 2 + + # Due to jitter, wait times should differ between runs + # (with high probability - could theoretically be equal but extremely unlikely) + assert wait_times_run1 != wait_times_run2, \ + "Jitter should produce different wait times across runs" + + def test_jitter_stays_within_bounds(self): + """Verify jitter keeps delays within expected range (0.5x to 1.5x base).""" + request_func = Mock(side_effect=httpx.TimeoutException("Connection timeout")) + + with patch('time.sleep') as mock_sleep: + try: + main._retry_request(request_func, max_retries=5, delay=1) + except httpx.TimeoutException: + pass + + wait_times = [call.args[0] for call in mock_sleep.call_args_list] + + # Verify each wait time is within jitter 
bounds + for attempt, wait_time in enumerate(wait_times): + base_delay = 1 * (2 ** attempt) # Exponential backoff formula + min_expected = base_delay * 0.5 + max_expected = base_delay * 1.5 + + assert min_expected <= wait_time <= max_expected, \ + f"Attempt {attempt}: wait time {wait_time:.2f}s outside jitter bounds " \ + f"[{min_expected:.2f}s, {max_expected:.2f}s]" + + def test_exponential_backoff_still_increases(self): + """Verify that despite jitter, the exponential base scaling is correct. + + We fix random.random() to a constant so that jitter becomes deterministic, + and then assert that each delay matches delay * 2**attempt * jitter_factor. + """ + request_func = Mock(side_effect=httpx.TimeoutException("Connection timeout")) + + # Use a fixed random.random() so jitter multiplier is stable across attempts. + # Assuming jitter is implemented as: base_delay * (0.5 + random.random()), + # a fixed return_value of 0.5 yields a jitter_factor of 1.0. + with patch('time.sleep') as mock_sleep, patch('random.random', return_value=0.5): + try: + main._retry_request(request_func, max_retries=5, delay=1) + except httpx.TimeoutException: + pass + + wait_times = [call.args[0] for call in mock_sleep.call_args_list] + + jitter_factor = 0.5 + 0.5 # Matches the patched random.random() above + for attempt, wait_time in enumerate(wait_times): + base_delay = 1 * (2 ** attempt) + expected_delay = base_delay * jitter_factor + # Use approx to avoid brittle float equality while still being strict. 
+ assert wait_time == pytest.approx(expected_delay), ( + f"Attempt {attempt}: expected {expected_delay:.2f}s, " + f"got {wait_time:.2f}s" + ) + def test_four_hundred_errors_still_fail_fast(self): + """Verify 4xx errors (except 429) still don't retry despite jitter.""" + response = Mock(status_code=404) + error = httpx.HTTPStatusError( + "Not found", + request=Mock(), + response=response + ) + request_func = Mock(side_effect=error) + + with patch('time.sleep') as mock_sleep: + with pytest.raises(httpx.HTTPStatusError): + main._retry_request(request_func, max_retries=5, delay=1) + + # Should not sleep at all - fail immediately + assert mock_sleep.call_count == 0 + + def test_429_rate_limit_retries_with_jitter(self): + """Verify 429 rate limit errors retry with jittered backoff.""" + response = Mock(status_code=429) + error = httpx.HTTPStatusError( + "Too many requests", + request=Mock(), + response=response + ) + request_func = Mock(side_effect=error) + + with patch('time.sleep') as mock_sleep: + with pytest.raises(httpx.HTTPStatusError): + main._retry_request(request_func, max_retries=3, delay=1) + + # Should retry (2 retries for max_retries=3) + assert mock_sleep.call_count == 2 + + # Verify jitter applied to retries + wait_times = [call.args[0] for call in mock_sleep.call_args_list] + assert len(wait_times) == 2 + + # First retry: base=1, range=[0.5, 1.5] + assert 0.5 <= wait_times[0] <= 1.5 + + def test_successful_retry_after_transient_failure(self): + """Verify successful request after transient failures works correctly.""" + # Fail twice, then succeed + request_func = Mock(side_effect=[ + httpx.TimeoutException("Timeout 1"), + httpx.TimeoutException("Timeout 2"), + Mock(status_code=200) # Success + ]) + + with patch('time.sleep') as mock_sleep: + response = main._retry_request(request_func, max_retries=5, delay=1) + + # Should have made 3 requests total (2 failures + 1 success) + assert request_func.call_count == 3 + + # Should have slept twice (after first two 
failures) + assert mock_sleep.call_count == 2 + + # Should return the successful response + assert response.status_code == 200 + + def test_max_retries_respected(self): + """Verify max_retries limit is still enforced with jitter.""" + request_func = Mock(side_effect=httpx.TimeoutException("Always fails")) + + with patch('time.sleep') as mock_sleep: + with pytest.raises(httpx.TimeoutException): + main._retry_request(request_func, max_retries=4, delay=1) + + # Should attempt exactly max_retries times + assert request_func.call_count == 4 + + # Should sleep max_retries-1 times (no sleep after final failure) + assert mock_sleep.call_count == 3