Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 28 additions & 0 deletions cmd/tradingagent/runtime.go
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,7 @@ func newAPIServer(ctx context.Context, cfg config.Config, logger *slog.Logger) (
NewsFeedRepo: newsFeedRepo,
DiscoveryRunRepo: pgrepo.NewDiscoveryRunRepo(db.Pool),
ReportArtifacts: reportArtifactRepo,
ReportMetrics: appMetrics,
}
notificationManager := newNotificationManager(cfg)

Expand Down Expand Up @@ -344,6 +345,7 @@ func newAPIServer(ctx context.Context, cfg config.Config, logger *slog.Logger) (
Logger: logger,
})
orch.WithJobMetrics(appMetrics)
orch.WithReportMetrics(appMetrics)
orch.RegisterAll()
if err := orch.Start(); err != nil {
logger.Warn("automation: failed to start job orchestrator", slog.Any("error", err))
Expand Down Expand Up @@ -540,6 +542,12 @@ func chainOpts(cfg config.LLMConfig, appMetrics *metrics.Metrics, logger *slog.L
// Retry with exponential backoff.
if cfg.RetryMaxAttempts > 1 {
opts = append(opts, llm.WithRetry(cfg.RetryMaxAttempts))
if appMetrics != nil {
opts = append(opts, llm.WithChainRetryMetrics(&retryMetricsAdapter{
m: appMetrics,
provider: configuredPrimaryRetryProviderLabel(cfg.DefaultProvider),
}))
}
}

// Fallback provider.
Expand Down Expand Up @@ -570,6 +578,9 @@ func chainOpts(cfg config.LLMConfig, appMetrics *metrics.Metrics, logger *slog.L
// Budget guard.
if budget != nil {
opts = append(opts, llm.WithBudget(budget))
if appMetrics != nil {
opts = append(opts, llm.WithChainBudgetMetrics(appMetrics))
}
}

// Per-call timeout.
Expand All @@ -580,6 +591,23 @@ func chainOpts(cfg config.LLMConfig, appMetrics *metrics.Metrics, logger *slog.L
return opts
}

// retryMetricsAdapter adapts *metrics.Metrics to the llm.RetryMetrics interface
// by binding a provider label at construction time.
type retryMetricsAdapter struct {
m *metrics.Metrics
provider string
}

func (a *retryMetricsAdapter) RecordLLMRetry() { a.m.RecordLLMRetry(a.provider) }

func configuredPrimaryRetryProviderLabel(provider string) string {
name := strings.TrimSpace(provider)
if name == "" {
name = "unknown"
}
return fmt.Sprintf("configured_primary:%s", name)
}

func buildLLMBudget(cfg config.LLMConfig) *llm.Budget {
// Validate() enforces non-negative values, but unit tests call runtime helpers
// directly with hand-built LLMConfig values; clamp defensively for that path.
Expand Down
22 changes: 22 additions & 0 deletions cmd/tradingagent/runtime_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -951,6 +951,28 @@ func TestBuildProviderChain_InvalidFallbackSkipped(t *testing.T) {
}
}

func TestConfiguredPrimaryRetryProviderLabel(t *testing.T) {
t.Parallel()

tests := []struct {
name string
input string
expected string
}{
{name: "trimmed provider", input: " openai ", expected: "configured_primary:openai"},
{name: "empty provider", input: " ", expected: "configured_primary:unknown"},
}

for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
t.Parallel()
if got := configuredPrimaryRetryProviderLabel(tt.input); got != tt.expected {
t.Fatalf("configuredPrimaryRetryProviderLabel(%q) = %q, want %q", tt.input, got, tt.expected)
}
})
}
}

func slogDiscardLogger() *slog.Logger {
return slog.New(slog.NewTextHandler(io.Discard, nil))
}
200 changes: 200 additions & 0 deletions docs/reference/llm-resilience.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,200 @@
---
title: 'LLM Resilience'
description: 'Provider chain architecture, env var reference, and operational troubleshooting.'
status: 'canonical'
created: '2026-04-16'
tags: [llm, resilience, observability, reference]
---

## Provider Chain Architecture

Every LLM call passes through a layered provider chain. Each layer is optional — omitting its config disables it.

```text
┌─────────────────────────────────────────────────────────────┐
│ LLM Provider Chain │
│ │
│ Request flow (outermost → innermost): │
│ │
│ ┌──────────────┐ │
│ │ Budget Guard │ ErrBudgetExhausted → not retried │
│ └──────┬───────┘ │
│ ▼ │
│ ┌──────────────┐ │
│ │ Timeout │ Per-call deadline (LLM_CALL_TIMEOUT) │
│ └──────┬───────┘ │
│ ▼ │
│ ┌──────────────┐ │
│ │ Throttle │ Concurrency limiter (LLM_THROTTLE_…) │
│ └──────┬───────┘ │
│ ▼ │
│ ┌──────────────┐ │
│ │ Retry │ Exponential backoff on 429/5xx │
│ └──────┬───────┘ │
│ ▼ │
│ ┌──────────────┐ │
│ │ Fallback │ Primary fails → try secondary │
│ └──────┬───────┘ │
│ ▼ │
│ ┌──────────────┐ │
│ │ Cache │ In-memory response cache │
│ └──────┬───────┘ │
│ ▼ │
│ ┌──────────────┐ │
│ │ Provider │ Raw OpenAI / OpenRouter / Ollama / etc. │
│ └──────────────┘ │
└─────────────────────────────────────────────────────────────┘
```

### Chain construction

`NewProviderChain(primary, logger, opts...)` builds layers inside-out:

1. **Cache** (innermost) — deduplicates identical prompts
2. **Fallback** — routes to secondary provider on failure
3. **Retry** — exponential backoff with jitter on transient errors
4. **Throttle** — semaphore-based concurrency limiter
5. **Timeout** — per-call `context.WithTimeout`
6. **Budget Guard** (outermost) — rejects when daily limits hit

Only layers whose options are provided are added. With zero config the chain degrades to the raw provider.

## Environment Variables

| Variable | Type | Default | Description |
| -------------------------- | -------- | -------- | ------------------------------------------------------- |
| `LLM_DEFAULT_PROVIDER` | string | `openai` | Primary LLM provider name |
| `LLM_FALLBACK_PROVIDER` | string | `""` | Secondary LLM provider (empty = no fallback) |
| `LLM_FALLBACK_MODEL` | string | `""` | Model override for fallback provider |
| `LLM_RETRY_MAX_ATTEMPTS` | int | `2` | Max retry attempts on transient errors (429, 5xx) |
| `LLM_CALL_TIMEOUT` | duration | `5m` | Per-call timeout (high during stabilization) |
| `LLM_BUDGET_REQUESTS_DAY` | int | `0` | Max requests/day (0 = unlimited) |
| `LLM_BUDGET_TOKENS_DAY` | int | `0` | Max tokens/day (0 = unlimited) |
| `LLM_THROTTLE_CONCURRENCY` | int | `4` | Max concurrent LLM calls |
| `LLM_CACHE_ENABLED` | bool | `true` | Enable in-memory response cache |
| `LLM_TIMEOUT` | duration | `30s` | Legacy timeout (superseded by `LLM_CALL_TIMEOUT`) |
| `LLM_DEBATE_TIMEOUT` | duration | — | Debate-phase-specific timeout with quick-model fallback |
| `SCHEDULER_JOB_TIMEOUT` | duration | `45m` | Pipeline-level hard cap |

### Emergency env-var overrides (no redeploy needed for Docker Compose)

| Override | Effect |
| ---------------------------- | ----------------------------------------- |
| `LLM_FALLBACK_PROVIDER=` | Chain degrades to primary-only with retry |
| `LLM_RETRY_MAX_ATTEMPTS=1` | Disables retry |
| `LLM_BUDGET_REQUESTS_DAY=0` | Disables budget guard (unlimited) |
| `LLM_CALL_TIMEOUT=30m` | Even more generous timeout |
| `LLM_THROTTLE_CONCURRENCY=1` | Serialize all LLM calls |

## Report Worker

The `paper_validation_report` automation job generates paper-trading validation reports.

- **Schedule:** `0 17 * * 1-5` ET (after US market close)
- **Per-strategy jitter:** 0–119s to spread LLM/DB load
- **Persistence:** `report_artifacts` table (upsert on `strategy_id, report_type, time_bucket`)
- **Error handling:** Each strategy is independent — a failure in one does not block others

### Auto-Disable

If a job accumulates **5 consecutive failures**, it is automatically disabled:

- Job's `Enabled` flag is set to `false`
- A log entry at ERROR level is emitted
- The job will not fire again until manually re-enabled

**Re-enable via API:**

```http
PUT /api/v1/automation/jobs/{name}/enable
```

### API Endpoints

| Endpoint | Method | Description |
| ---------------------------------------- | ------ | ----------------------------------- |
| `/api/v1/strategies/{id}/reports/latest` | GET | Latest completed report + staleness |
| `/api/v1/strategies/{id}/reports` | GET | Paginated report list |

The `latest` endpoint includes a `stale_seconds` field = `now - completed_at`.

## Prometheus Metrics

### LLM Provider Chain

| Metric | Type | Labels | Description |
| ----------------------------------------- | --------- | --------------------- | --------------------------------------- |
| `tradingagent_llm_calls_total` | Counter | provider, model, role | Total LLM API calls |
| `tradingagent_llm_retry_total` | Counter | provider | Retry attempts |
| `tradingagent_llm_fallback_total` | Counter | reason | Fallback events |
| `tradingagent_llm_budget_exhausted_total` | Counter | — | Budget exhaustion rejections |
| `tradingagent_llm_tokens_total` | Counter | type | Token consumption (prompt / completion) |
| `tradingagent_llm_latency_seconds` | Histogram | provider, model | Call latency distribution |
| `tradingagent_llm_cache_hits_total` | Counter | — | Response cache hits |
| `tradingagent_llm_cache_misses_total` | Counter | — | Response cache misses |

### Report Worker

| Metric | Type | Labels | Description |
| ------------------------------------------ | --------- | ----------- | ----------------------------- |
| `tradingagent_report_worker_success_total` | Counter | strategy_id | Successful report generations |
| `tradingagent_report_worker_error_total` | Counter | strategy_id | Failed report generations |
| `tradingagent_report_staleness_seconds` | Histogram | strategy_id | Report age at query time |
| `tradingagent_automation_job_errors_total` | Counter | job_name | All automation job errors |

## Troubleshooting

### All providers down

**Symptoms:** `tradingagent_llm_fallback_total` spikes, pipeline runs fail.

**Check:**

1. `tradingagent_llm_calls_total` by provider — are calls reaching providers?
2. Provider status pages (OpenAI, OpenRouter)
3. API key validity

**Mitigate:** If secondary is also down, set `LLM_FALLBACK_PROVIDER=` to avoid wasted fallback attempts.

### Budget exhausted

**Symptoms:** `tradingagent_llm_budget_exhausted_total` incrementing, jobs returning `ErrBudgetExhausted`.

**Check:**

1. `GET /api/v1/llm/budget` (if exposed) or check logs for budget stats
2. Compare `LLM_BUDGET_REQUESTS_DAY` against actual daily volume

**Mitigate:** Set `LLM_BUDGET_REQUESTS_DAY=0` to remove limit temporarily. Investigate cost after.

### Report stale

**Symptoms:** `tradingagent_report_staleness_seconds` > 86400 (24h), dashboard shows stale warning.

**Check:**

1. `GET /api/v1/automation/status` — is `paper_validation_report` enabled? Check `consecutive_failures`.
2. If auto-disabled: re-enable via `PUT /api/v1/automation/jobs/paper_validation_report/enable`
3. Check logs for the underlying failure reason

**Mitigate:** Trigger manual run via `POST /api/v1/automation/jobs/paper_validation_report/run`.

### Fallback storm

**Symptoms:** `tradingagent_llm_fallback_total` / `tradingagent_llm_calls_total` > 20% over sustained period.

**Check:**

1. Primary provider health
2. `tradingagent_llm_retry_total` — are retries exhausting before fallback?

**Mitigate:** Budget guard caps daily requests. Auto-disable fires after 5 consecutive job failures. Reduce `LLM_THROTTLE_CONCURRENCY=1` to slow the bleed.

## Success Criteria (7-Day Validation)

| Criterion | Target |
| -------------------------- | -------- |
| Report worker success rate | > 98% |
| Fallback usage | < 20% |
| Pipeline timeout failures | ↓ 80% |
| P95 report latency | < 20 min |
18 changes: 18 additions & 0 deletions internal/api/report_handlers.go
Original file line number Diff line number Diff line change
@@ -1,13 +1,27 @@
package api

import (
"context"
"math"
"net/http"
"time"

"github.com/google/uuid"

pgrepo "github.com/PatrickFanella/get-rich-quick/internal/repository/postgres"
)

// ReportMetrics captures report staleness observations.
type ReportMetrics interface {
ObserveReportStaleness(strategyID string, seconds float64)
}

// ReportArtifactStore captures report artifact reads used by report handlers.
type ReportArtifactStore interface {
GetLatest(ctx context.Context, strategyID uuid.UUID, reportType string) (*pgrepo.ReportArtifact, error)
List(ctx context.Context, filter pgrepo.ReportArtifactFilter, limit, offset int) ([]pgrepo.ReportArtifact, error)
}

// reportLatestResponse wraps the latest report artifact with a stale_seconds
// field showing how old the report is.
type reportLatestResponse struct {
Expand Down Expand Up @@ -50,6 +64,10 @@ func (s *Server) handleGetLatestReport(w http.ResponseWriter, r *http.Request) {
stale = math.Max(0, math.Round(time.Since(*artifact.CompletedAt).Seconds()))
}

if s.reportMetrics != nil {
s.reportMetrics.ObserveReportStaleness(id.String(), stale)
}
Comment on lines +67 to +69
Copy link

Copilot AI Apr 17, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

handleGetLatestReport now emits ObserveReportStaleness, but report_handlers_test.go doesn’t assert that the metrics hook is called (or that the staleness value passed matches the response). Adding a test with a stub ReportMetrics implementation would help ensure this observation doesn’t get accidentally removed or miscomputed.

Copilot uses AI. Check for mistakes.

respondJSON(w, http.StatusOK, reportLatestResponse{
ReportArtifact: *artifact,
StaleSeconds: stale,
Expand Down
Loading