**.github/copilot/instructions/api-retry-strategy.md** (new file, 60 additions)
# API Retry Strategy Guide

## Performance Context

Control D API has strict rate limits. The sync tool retries failed requests with exponential backoff to handle transient failures (network issues, temporary server errors) while respecting API constraints.

## Current Implementation

**Location:** `main.py::_retry_request()` (line ~845)

**Key characteristics:**
- Max retries: 10 attempts (configurable via `MAX_RETRIES`)
- Base delay: 1 second (configurable via `RETRY_DELAY`)
- Exponential backoff: `delay * (2^attempt)` → 1s, 2s, 4s, 8s, 16s, ...
- Smart error handling: Don't retry 4xx errors except 429 (rate limit)
- Security-aware: Sanitizes error messages in logs
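
The characteristics above can be sketched as a minimal retry loop. This is a hypothetical stand-in, not the actual `_retry_request` (which wraps HTTP calls, logging, and sanitization around the same skeleton); `HTTPError` and the injectable `sleep` parameter are illustrative choices.

```python
import time

MAX_RETRIES = 10   # matches the defaults described above
RETRY_DELAY = 1.0  # seconds

class HTTPError(Exception):
    """Stand-in for an HTTP error carrying a status code."""
    def __init__(self, status):
        super().__init__(f"HTTP {status}")
        self.status = status

def retry_request(send, max_retries=MAX_RETRIES, delay=RETRY_DELAY, sleep=time.sleep):
    """Call send() with exponential backoff; fail fast on non-429 4xx errors."""
    for attempt in range(max_retries):
        try:
            return send()
        except HTTPError as e:
            # Client errors are permanent -- retrying won't help, except 429
            if 400 <= e.status < 500 and e.status != 429:
                raise
            if attempt == max_retries - 1:
                raise  # retries exhausted
            sleep(delay * (2 ** attempt))  # 1s, 2s, 4s, ...
```

Injecting `sleep` keeps the backoff schedule observable in tests without real waiting.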

## Jitter Pattern (Recommended)

**Why jitter matters:**
When multiple requests fail simultaneously (e.g., during an API outage), synchronized retries create a "thundering herd" - all clients retry at exactly the same time, overwhelming the recovering server. Jitter randomizes retry timing to spread the load.

**Implementation formula:**
```python
import random
wait_time = (delay * (2 ** attempt)) * (0.5 + random.random())
```

This adds ±50% randomness: a 4s backoff becomes a 2-6s range.
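
A quick sanity check of those bounds; `jittered_wait` here is just the formula above given a name for illustration:

```python
import random

def jittered_wait(delay, attempt):
    # Exponential backoff scaled by a random factor in [0.5, 1.5)
    return (delay * (2 ** attempt)) * (0.5 + random.random())

for attempt in range(5):
    base = 1.0 * (2 ** attempt)
    wait = jittered_wait(1.0, attempt)
    # e.g. attempt 2: 4s base -> somewhere in [2s, 6s)
    assert 0.5 * base <= wait < 1.5 * base
```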

**Maintainer rationale (from discussion #219):**
> "API rate limits are non-negotiable. Serial processing exists because I got burned by 429s and zombie states in production. Any retry improvement needs rock-solid rate limit awareness."

## Testing Approach

**Unit tests:**
- Verify jitter stays within bounds (0.5x to 1.5x base delay)
- Confirm 4xx errors (except 429) still don't retry
- Check max retries still respected
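
These checks can be sketched in pytest style. The tiny `retry` helper below is a stand-in for the real `_retry_request`, so the assertions are illustrative rather than drop-in tests:

```python
class HTTPError(Exception):
    def __init__(self, status):
        super().__init__(f"HTTP {status}")
        self.status = status

def retry(send, max_retries=10):
    """Minimal stand-in: retries everything except non-429 4xx errors."""
    for attempt in range(max_retries):
        try:
            return send()
        except HTTPError as e:
            if 400 <= e.status < 500 and e.status != 429:
                raise  # fail fast on permanent client errors
            if attempt == max_retries - 1:
                raise  # max retries respected

def test_4xx_fails_fast():
    calls = []
    def send():
        calls.append(1)
        raise HTTPError(404)
    try:
        retry(send)
        raise AssertionError("expected HTTPError")
    except HTTPError:
        pass
    assert len(calls) == 1  # no retries for a 404

def test_429_is_retried():
    calls = []
    def send():
        calls.append(1)
        if len(calls) < 3:
            raise HTTPError(429)
        return "ok"
    assert retry(send) == "ok"
    assert len(calls) == 3
```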

**Integration tests:**
- Simulate transient failures (mock server returning 500s)
- Measure retry timing distribution (should show variance)
- Confirm eventual success after transient errors
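
The timing-distribution check can be approximated without a mock server: repeated draws of the jittered delay should show spread while staying inside the expected window. `jittered_wait` is the formula from this guide, named for illustration:

```python
import random

def jittered_wait(delay, attempt):
    return (delay * (2 ** attempt)) * (0.5 + random.random())

# 100 simulated clients retrying attempt 2 (4s base) should not all
# pick the same delay, and every delay should stay inside [2s, 6s).
samples = [jittered_wait(1.0, 2) for _ in range(100)]
assert max(samples) - min(samples) > 0  # distribution shows variance
assert all(2.0 <= s < 6.0 for s in samples)
```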

**Performance validation:**
No performance degradation expected - jitter adds microseconds of `random()` overhead per retry, negligible compared to network I/O.

## Common Pitfalls

1. **Don't add jitter to initial request** - only to retries. First attempt should be immediate.
2. **Don't exceed max backoff** - cap total wait time to prevent indefinite delays.
3. **Be cautious with 429 responses** - current behavior still uses jittered exponential backoff; once `Retry-After` handling is implemented, jitter should be disabled in favor of the header-specified delay.
4. **Don't break existing behavior** - ensure 4xx non-retryable errors still fail fast.
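
Pitfall 2 can be handled by capping the wait after jitter is applied; the 60s ceiling here is an illustrative choice, not a project constant:

```python
import random

MAX_BACKOFF = 60.0  # hypothetical ceiling in seconds

def capped_jittered_wait(delay, attempt, cap=MAX_BACKOFF):
    wait = (delay * (2 ** attempt)) * (0.5 + random.random())
    return min(wait, cap)  # never wait longer than the cap

# Late attempts would otherwise wait minutes: 1 * 2^10 = 1024s uncapped,
# so even the minimum jittered value (512s) hits the cap.
assert capped_jittered_wait(1.0, 10) == 60.0
```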

## Future Improvements

- **Rate limit header parsing:** Implement proper `Retry-After` handling for 429 responses (and bypass jitter when a valid `Retry-After` value is present)
- **Circuit breaker:** Stop retrying after consecutive failures to prevent cascading failures
- **Per-endpoint tracking:** Different backoff strategies for read vs. write operations
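
The first item could look like the sketch below, assuming RFC 9110 semantics for `Retry-After` (delta-seconds or HTTP-date); the function name and fallback behavior are assumptions, not existing code:

```python
import random
from datetime import datetime, timezone
from email.utils import parsedate_to_datetime

def wait_for_429(retry_after, delay, attempt):
    """Prefer the server-specified Retry-After; else jittered backoff."""
    if retry_after:
        try:
            return max(0.0, float(retry_after))  # delta-seconds form, e.g. "120"
        except ValueError:
            pass
        try:
            dt = parsedate_to_datetime(retry_after)  # HTTP-date form
            return max(0.0, (dt - datetime.now(timezone.utc)).total_seconds())
        except (TypeError, ValueError):
            pass
    # No usable header: fall back to the jittered backoff used elsewhere
    return (delay * (2 ** attempt)) * (0.5 + random.random())
```

Note that when a valid header is present, jitter is bypassed entirely, matching pitfall 3 above.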
**benchmark_retry_jitter.py** (new file, 133 additions)

    #!/usr/bin/env python3
    """
    Synthetic benchmark to demonstrate retry jitter behavior.

    Run this script to see how jitter randomizes retry delays compared to
    deterministic exponential backoff.

    Usage: python3 benchmark_retry_jitter.py
    """

    import random
    from typing import List


    def simulate_retries_without_jitter(max_retries: int, base_delay: float) -> List[float]:
        """Simulate retry delays WITHOUT jitter (old behavior)."""
        delays = []
        for attempt in range(max_retries - 1):
            wait_time = base_delay * (2 ** attempt)
            delays.append(wait_time)
        return delays


    def simulate_retries_with_jitter(max_retries: int, base_delay: float) -> List[float]:
        """Simulate retry delays WITH jitter (new behavior)."""
        delays = []
        for attempt in range(max_retries - 1):
            base_wait = base_delay * (2 ** attempt)
            jitter_factor = 0.5 + random.random()  # [0.5, 1.5)
            wait_time = base_wait * jitter_factor
            delays.append(wait_time)
        return delays


    def main():
        print("=" * 60)
        print("Retry Jitter Performance Demonstration")
        print("=" * 60)
        print()

        max_retries = 5
        base_delay = 1.0

        print(f"Configuration: max_retries={max_retries}, base_delay={base_delay}s")
        print()

        # Without jitter (deterministic)
        print("WITHOUT JITTER (old behavior):")
        print("All clients retry at exactly the same time (thundering herd)")
        print()
        without_jitter = simulate_retries_without_jitter(max_retries, base_delay)
        for i, delay in enumerate(without_jitter):
            print(f"  Attempt {i+1}: {delay:6.2f}s")
        print(f"  Total: {sum(without_jitter):6.2f}s")
        print()

        # With jitter (randomized)
        print("WITH JITTER (new behavior):")
        print("Retries spread across time window, reducing server load spikes")
        print()

        # Run 3 simulations to show variance
        for run in range(3):
            print(f"  Run {run+1}:")
            with_jitter = simulate_retries_with_jitter(max_retries, base_delay)
            for i, delay in enumerate(with_jitter):
                base = base_delay * (2 ** i)
                print(f"    Attempt {i+1}: {delay:6.2f}s (base: {base:4.1f}s, range: [{base*0.5:.1f}s, {base*1.5:.1f}s])")
            print(f"    Total: {sum(with_jitter):6.2f}s")
            print()

        # Statistical analysis
        print("IMPACT ANALYSIS:")
        print()

        # Simulate thundering herd scenario
        num_clients = 100
        print(f"Scenario: {num_clients} clients all fail at the same time")
        print()

        print("WITHOUT JITTER:")
        print(f"  At t=1s: ALL {num_clients} clients retry simultaneously → server overload")
        print(f"  At t=2s: ALL {num_clients} clients retry simultaneously → server overload")
        print(f"  At t=4s: ALL {num_clients} clients retry simultaneously → server overload")
        print()

        print("WITH JITTER:")
        # Simulate retry distribution
        retry_times = []
        for _ in range(num_clients):
            first_retry = base_delay * (0.5 + random.random())
            retry_times.append(first_retry)

        retry_times.sort()
        min_time = min(retry_times)
        max_time = max(retry_times)
        avg_time = sum(retry_times) / len(retry_times)

        print(f"  First retry window: {min_time:.2f}s to {max_time:.2f}s (spread: {max_time - min_time:.2f}s)")
        print(f"  Average first retry: {avg_time:.2f}s")
        print(f"  Retries distributed over time → reduced peak load on server")
        print()

        # Calculate approximate load reduction based on bucketed concurrency
        print("THEORETICAL LOAD REDUCTION:")
        window_size = max_time - min_time
        if window_size > 0:
            # Use small time buckets (e.g., 100ms) to approximate peak concurrent retries
            bucket_size = 0.1  # seconds
            num_buckets = max(1, int(window_size / bucket_size) + 1)
            buckets = [0] * num_buckets

            # Count how many retries fall into each time bucket
            for t in retry_times:
                # Normalize to start of window and compute bucket index
                idx = int((t - min_time) / bucket_size)
                if idx >= num_buckets:
                    # Clamp to last bucket to handle any floating-point edge cases
                    idx = num_buckets - 1
                buckets[idx] += 1

            peak_with_jitter = max(buckets)
            peak_without_jitter = num_clients  # all clients retry together without jitter
            reduction = (1 - (peak_with_jitter / peak_without_jitter)) * 100

            print(f"  Approximate peak concurrent retries with jitter: {peak_with_jitter} (per {bucket_size:.1f}s)")
            print(f"  Peak concurrent retries reduced by approximately {reduction:.0f}%")
        else:
            # In the extremely unlikely case that all retries occur at the same instant
            print("  All retries occurred at the same time; no observable spreading in this run.")
        print()

        print("✅ Jitter prevents thundering herd and improves system reliability")


    if __name__ == "__main__":
        main()
**main.py** (11 additions, 2 deletions)

    @@ -21,6 +21,7 @@
     import logging
     import os
     import platform
    +import random
     import re
     import shutil
     import socket
    @@ -864,10 +865,18 @@
                     if hasattr(e, "response") and e.response is not None:
                         log.debug(f"Response content: {sanitize_for_log(e.response.text)}")
                     raise
    -            wait_time = delay * (2**attempt)
    +
    +            # Exponential backoff with jitter to prevent thundering herd
    +            # Base delay: delay * (2^attempt) gives exponential growth
    +            # Jitter: multiply by random factor in range [0.5, 1.5) to spread retries
    +            # This prevents multiple failed requests from retrying simultaneously
    +            base_wait = delay * (2**attempt)
    +            jitter_factor = 0.5 + random.random()  # Random value in [0.5, 1.5)
    +
    +            wait_time = base_wait * jitter_factor

                 log.warning(
                     f"Request failed (attempt {attempt + 1}/{max_retries}): "
    -                f"{sanitize_for_log(e)}. Retrying in {wait_time}s..."
    +                f"{sanitize_for_log(e)}. Retrying in {wait_time:.2f}s..."
                 )
                 time.sleep(wait_time)