PatrickFanella · PatrickFanella · Apr 17, 2026 · Apr 17, 2026 · Apr 17, 2026 · Copilot
diff --git a/cmd/tradingagent/runtime.go b/cmd/tradingagent/runtime.go
@@ -187,6 +187,7 @@ func newAPIServer(ctx context.Context, cfg config.Config, logger *slog.Logger) (
 		NewsFeedRepo:     newsFeedRepo,
 		DiscoveryRunRepo: pgrepo.NewDiscoveryRunRepo(db.Pool),
 		ReportArtifacts:  reportArtifactRepo,
+		ReportMetrics:    appMetrics,
 	}
 	notificationManager := newNotificationManager(cfg)
 
@@ -344,6 +345,7 @@ func newAPIServer(ctx context.Context, cfg config.Config, logger *slog.Logger) (
 				Logger:                logger,
 			})
 			orch.WithJobMetrics(appMetrics)
+			orch.WithReportMetrics(appMetrics)
 			orch.RegisterAll()
 			if err := orch.Start(); err != nil {
 				logger.Warn("automation: failed to start job orchestrator", slog.Any("error", err))
@@ -540,6 +542,12 @@ func chainOpts(cfg config.LLMConfig, appMetrics *metrics.Metrics, logger *slog.L
 	// Retry with exponential backoff.
 	if cfg.RetryMaxAttempts > 1 {
 		opts = append(opts, llm.WithRetry(cfg.RetryMaxAttempts))
+		if appMetrics != nil {
+			opts = append(opts, llm.WithChainRetryMetrics(&retryMetricsAdapter{
+				m:        appMetrics,
+				provider: configuredPrimaryRetryProviderLabel(cfg.DefaultProvider),
+			}))
+		}
 	}
 
 	// Fallback provider.
@@ -570,6 +578,9 @@ func chainOpts(cfg config.LLMConfig, appMetrics *metrics.Metrics, logger *slog.L
 	// Budget guard.
 	if budget != nil {
 		opts = append(opts, llm.WithBudget(budget))
+		if appMetrics != nil {
+			opts = append(opts, llm.WithChainBudgetMetrics(appMetrics))
+		}
 	}
 
 	// Per-call timeout.
@@ -580,6 +591,23 @@ func chainOpts(cfg config.LLMConfig, appMetrics *metrics.Metrics, logger *slog.L
 	return opts
 }
 
+// retryMetricsAdapter adapts *metrics.Metrics to the llm.RetryMetrics interface
+// by binding a provider label at construction time.
+type retryMetricsAdapter struct {
+	m        *metrics.Metrics
+	provider string
+}
+
+func (a *retryMetricsAdapter) RecordLLMRetry() { a.m.RecordLLMRetry(a.provider) }
+
+func configuredPrimaryRetryProviderLabel(provider string) string {
+	name := strings.TrimSpace(provider)
+	if name == "" {
+		name = "unknown"
+	}
+	return fmt.Sprintf("configured_primary:%s", name)
+}
+
 func buildLLMBudget(cfg config.LLMConfig) *llm.Budget {
 	// Validate() enforces non-negative values, but unit tests call runtime helpers
 	// directly with hand-built LLMConfig values; clamp defensively for that path.

diff --git a/cmd/tradingagent/runtime_test.go b/cmd/tradingagent/runtime_test.go
@@ -951,6 +951,28 @@ func TestBuildProviderChain_InvalidFallbackSkipped(t *testing.T) {
 	}
 }
 
+func TestConfiguredPrimaryRetryProviderLabel(t *testing.T) {
+	t.Parallel()
+
+	tests := []struct {
+		name     string
+		input    string
+		expected string
+	}{
+		{name: "trimmed provider", input: " openai ", expected: "configured_primary:openai"},
+		{name: "empty provider", input: "   ", expected: "configured_primary:unknown"},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			t.Parallel()
+			if got := configuredPrimaryRetryProviderLabel(tt.input); got != tt.expected {
+				t.Fatalf("configuredPrimaryRetryProviderLabel(%q) = %q, want %q", tt.input, got, tt.expected)
+			}
+		})
+	}
+}
+
 func slogDiscardLogger() *slog.Logger {
 	return slog.New(slog.NewTextHandler(io.Discard, nil))
 }
diff --git a/docs/reference/llm-resilience.md b/docs/reference/llm-resilience.md
@@ -0,0 +1,200 @@
+---
+title: 'LLM Resilience'
+description: 'Provider chain architecture, env var reference, and operational troubleshooting.'
+status: 'canonical'
+created: '2026-04-16'
+tags: [llm, resilience, observability, reference]
+---
+
+## Provider Chain Architecture
+
+Every LLM call passes through a layered provider chain. Each layer is optional — omitting its config disables it.
+
+```text
+┌─────────────────────────────────────────────────────────────┐
+│                    LLM Provider Chain                       │
+│                                                             │
+│  Request flow (outermost → innermost):                      │
+│                                                             │
+│  ┌──────────────┐                                           │
+│  │ Budget Guard  │  ErrBudgetExhausted → not retried        │
+│  └──────┬───────┘                                           │
+│         ▼                                                   │
+│  ┌──────────────┐                                           │
+│  │   Timeout    │  Per-call deadline (LLM_CALL_TIMEOUT)     │
+│  └──────┬───────┘                                           │
+│         ▼                                                   │
+│  ┌──────────────┐                                           │
+│  │  Throttle    │  Concurrency limiter (LLM_THROTTLE_…)     │
+│  └──────┬───────┘                                           │
+│         ▼                                                   │
+│  ┌──────────────┐                                           │
+│  │    Retry     │  Exponential backoff on 429/5xx           │
+│  └──────┬───────┘                                           │
+│         ▼                                                   │
+│  ┌──────────────┐                                           │
+│  │  Fallback    │  Primary fails → try secondary            │
+│  └──────┬───────┘                                           │
+│         ▼                                                   │
+│  ┌──────────────┐                                           │
+│  │    Cache     │  In-memory response cache                 │
+│  └──────┬───────┘                                           │
+│         ▼                                                   │
+│  ┌──────────────┐                                           │
+│  │   Provider   │  Raw OpenAI / OpenRouter / Ollama / etc.  │
+│  └──────────────┘                                           │
+└─────────────────────────────────────────────────────────────┘
+```
+
+### Chain construction
+
+`NewProviderChain(primary, logger, opts...)` builds layers inside-out:
+
+1. **Cache** (innermost) — deduplicates identical prompts
+2. **Fallback** — routes to secondary provider on failure
+3. **Retry** — exponential backoff with jitter on transient errors
+4. **Throttle** — semaphore-based concurrency limiter
+5. **Timeout** — per-call `context.WithTimeout`
+6. **Budget Guard** (outermost) — rejects when daily limits hit
+
+Only layers whose options are provided are added. With zero config the chain degrades to the raw provider.
+
+## Environment Variables
+
+| Variable                   | Type     | Default  | Description                                             |
+| -------------------------- | -------- | -------- | ------------------------------------------------------- |
+| `LLM_DEFAULT_PROVIDER`     | string   | `openai` | Primary LLM provider name                               |
+| `LLM_FALLBACK_PROVIDER`    | string   | `""`     | Secondary LLM provider (empty = no fallback)            |
+| `LLM_FALLBACK_MODEL`       | string   | `""`     | Model override for fallback provider                    |
+| `LLM_RETRY_MAX_ATTEMPTS`   | int      | `2`      | Max retry attempts on transient errors (429, 5xx)       |
+| `LLM_CALL_TIMEOUT`         | duration | `5m`     | Per-call timeout (high during stabilization)            |
+| `LLM_BUDGET_REQUESTS_DAY`  | int      | `0`      | Max requests/day (0 = unlimited)                        |
+| `LLM_BUDGET_TOKENS_DAY`    | int      | `0`      | Max tokens/day (0 = unlimited)                          |
+| `LLM_THROTTLE_CONCURRENCY` | int      | `4`      | Max concurrent LLM calls                                |
+| `LLM_CACHE_ENABLED`        | bool     | `true`   | Enable in-memory response cache                         |
+| `LLM_TIMEOUT`              | duration | `30s`    | Legacy timeout (superseded by `LLM_CALL_TIMEOUT`)       |
+| `LLM_DEBATE_TIMEOUT`       | duration | —        | Debate-phase-specific timeout with quick-model fallback |
+| `SCHEDULER_JOB_TIMEOUT`    | duration | `45m`    | Pipeline-level hard cap                                 |
+
+### Emergency env-var overrides (no redeploy needed for Docker Compose)
+
+| Override                     | Effect                                    |
+| ---------------------------- | ----------------------------------------- |
+| `LLM_FALLBACK_PROVIDER=`     | Chain degrades to primary-only with retry |
+| `LLM_RETRY_MAX_ATTEMPTS=1`   | Disables retry                            |
+| `LLM_BUDGET_REQUESTS_DAY=0`  | Disables budget guard (unlimited)         |
+| `LLM_CALL_TIMEOUT=30m`       | Even more generous timeout                |
+| `LLM_THROTTLE_CONCURRENCY=1` | Serialize all LLM calls                   |
+
+## Report Worker
+
+The `paper_validation_report` automation job generates paper-trading validation reports.
+
+- **Schedule:** `0 17 * * 1-5` ET (after US market close)
+- **Per-strategy jitter:** 0–119s to spread LLM/DB load
+- **Persistence:** `report_artifacts` table (upsert on `strategy_id, report_type, time_bucket`)
+- **Error handling:** Each strategy is independent — a failure in one does not block others
+
+### Auto-Disable
+
+If a job accumulates **5 consecutive failures**, it is automatically disabled:
+
+- Job's `Enabled` flag is set to `false`
+- A log entry at ERROR level is emitted
+- The job will not fire again until manually re-enabled
+
+**Re-enable via API:**
+
+```http
+PUT /api/v1/automation/jobs/{name}/enable
+```
+
+### API Endpoints
+
+| Endpoint                                 | Method | Description                         |
+| ---------------------------------------- | ------ | ----------------------------------- |
+| `/api/v1/strategies/{id}/reports/latest` | GET    | Latest completed report + staleness |
+| `/api/v1/strategies/{id}/reports`        | GET    | Paginated report list               |
+
+The `latest` endpoint includes a `stale_seconds` field = `now - completed_at`.
+
+## Prometheus Metrics
+
+### LLM Provider Chain
+
+| Metric                                    | Type      | Labels                | Description                             |
+| ----------------------------------------- | --------- | --------------------- | --------------------------------------- |
+| `tradingagent_llm_calls_total`            | Counter   | provider, model, role | Total LLM API calls                     |
+| `tradingagent_llm_retry_total`            | Counter   | provider              | Retry attempts                          |
+| `tradingagent_llm_fallback_total`         | Counter   | reason                | Fallback events                         |
+| `tradingagent_llm_budget_exhausted_total` | Counter   | —                     | Budget exhaustion rejections            |
+| `tradingagent_llm_tokens_total`           | Counter   | type                  | Token consumption (prompt / completion) |
+| `tradingagent_llm_latency_seconds`        | Histogram | provider, model       | Call latency distribution               |
+| `tradingagent_llm_cache_hits_total`       | Counter   | —                     | Response cache hits                     |
+| `tradingagent_llm_cache_misses_total`     | Counter   | —                     | Response cache misses                   |
+
+### Report Worker
+
+| Metric                                     | Type      | Labels      | Description                   |
+| ------------------------------------------ | --------- | ----------- | ----------------------------- |
+| `tradingagent_report_worker_success_total` | Counter   | strategy_id | Successful report generations |
+| `tradingagent_report_worker_error_total`   | Counter   | strategy_id | Failed report generations     |
+| `tradingagent_report_staleness_seconds`    | Histogram | strategy_id | Report age at query time      |
+| `tradingagent_automation_job_errors_total` | Counter   | job_name    | All automation job errors     |
+
+## Troubleshooting
+
+### All providers down
+
+**Symptoms:** `tradingagent_llm_fallback_total` spikes, pipeline runs fail.
+
+**Check:**
+
+1. `tradingagent_llm_calls_total` by provider — are calls reaching providers?
+2. Provider status pages (OpenAI, OpenRouter)
+3. API key validity
+
+**Mitigate:** If secondary is also down, set `LLM_FALLBACK_PROVIDER=` to avoid wasted fallback attempts.
+
+### Budget exhausted
+
+**Symptoms:** `tradingagent_llm_budget_exhausted_total` incrementing, jobs returning `ErrBudgetExhausted`.
+
+**Check:**
+
+1. `GET /api/v1/llm/budget` (if exposed) or check logs for budget stats
+2. Compare `LLM_BUDGET_REQUESTS_DAY` against actual daily volume
+
+**Mitigate:** Set `LLM_BUDGET_REQUESTS_DAY=0` to remove limit temporarily. Investigate cost after.
+
+### Report stale
+
+**Symptoms:** `tradingagent_report_staleness_seconds` > 86400 (24h), dashboard shows stale warning.
+
+**Check:**
+
+1. `GET /api/v1/automation/status` — is `paper_validation_report` enabled? Check `consecutive_failures`.
+2. If auto-disabled: re-enable via `PUT /api/v1/automation/jobs/paper_validation_report/enable`
+3. Check logs for the underlying failure reason
+
+**Mitigate:** Trigger manual run via `POST /api/v1/automation/jobs/paper_validation_report/run`.
+
+### Fallback storm
+
+**Symptoms:** `tradingagent_llm_fallback_total` / `tradingagent_llm_calls_total` > 20% over sustained period.
+
+**Check:**
+
+1. Primary provider health
+2. `tradingagent_llm_retry_total` — are retries exhausting before fallback?
+
+**Mitigate:** Budget guard caps daily requests. Auto-disable fires after 5 consecutive job failures. Reduce `LLM_THROTTLE_CONCURRENCY=1` to slow the bleed.
+
+## Success Criteria (7-Day Validation)
+
+| Criterion                  | Target   |
+| -------------------------- | -------- |
+| Report worker success rate | > 98%    |
+| Fallback usage             | < 20%    |
+| Pipeline timeout failures  | ↓ 80%    |
+| P95 report latency         | < 20 min |
diff --git a/internal/api/report_handlers.go b/internal/api/report_handlers.go
@@ -1,13 +1,27 @@
 package api
 
 import (
+	"context"
 	"math"
 	"net/http"
 	"time"
 
+	"github.com/google/uuid"
+
 	pgrepo "github.com/PatrickFanella/get-rich-quick/internal/repository/postgres"
 )
 
+// ReportMetrics captures report staleness observations.
+type ReportMetrics interface {
+	ObserveReportStaleness(strategyID string, seconds float64)
+}
+
+// ReportArtifactStore captures report artifact reads used by report handlers.
+type ReportArtifactStore interface {
+	GetLatest(ctx context.Context, strategyID uuid.UUID, reportType string) (*pgrepo.ReportArtifact, error)
+	List(ctx context.Context, filter pgrepo.ReportArtifactFilter, limit, offset int) ([]pgrepo.ReportArtifact, error)
+}
+
 // reportLatestResponse wraps the latest report artifact with a stale_seconds
 // field showing how old the report is.
 type reportLatestResponse struct {
@@ -50,6 +64,10 @@ func (s *Server) handleGetLatestReport(w http.ResponseWriter, r *http.Request) {
 		stale = math.Max(0, math.Round(time.Since(*artifact.CompletedAt).Seconds()))
 	}
 
+	if s.reportMetrics != nil {
+		s.reportMetrics.ObserveReportStaleness(id.String(), stale)
+	}
+
 	respondJSON(w, http.StatusOK, reportLatestResponse{
 		ReportArtifact: *artifact,
 		StaleSeconds:   stale,