From 8030b4a49c2c97d620d8b99a6ad0b3a50cb8fd78 Mon Sep 17 00:00:00 2001 From: max Date: Fri, 19 Jun 2026 20:29:53 +0800 Subject: [PATCH 1/3] =?UTF-8?q?feat=20:=20grafana=20=E7=9B=A3=E6=8E=A7?= =?UTF-8?q?=E6=95=B4=E5=90=88=20(prometheus=20=E5=8B=95=E6=85=8B=20SD=20+?= =?UTF-8?q?=20dashboards=20+=20alerts)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase 1-4 完整監控方案,全程單一 origin、隨模型啟停自動跟隨: - backend: 新增 prometheus_targets 服務,reconciler 在 vLLM 進/出 READY 時動態寫 file_sd targets,Prometheus 無需改設定即自動發現艦隊 (LLMOPS_PROMETHEUS_SD_PATH;含單元測試) - deploy: 新增 prometheus / grafana / dcgm-exporter / node-exporter services;prometheus、grafana 與 backend 共用 netns;nginx 反代 /grafana(單一 origin,含 absolute_redirect off 修 port 重導) - grafana: provision datasource + 官方 vLLM(Performance/Query)、DCGM、 Node Exporter dashboards,加自訂 "vLLM Scheduling & Capacity" (排程/容量/工作負載,變數化 datasource+model_name+instance), 4 條 vLLM alert rules + webhook contact point(env 帶入) - frontend: 新增「監控」分頁嵌入 5 張 dashboard(kiosk、主題同步) - 移除已被 grafana 取代的 /trends(前端頁面 + 後端 timeseries endpoint 與 store 方法) Co-Authored-By: Claude Opus 4.8 --- apps/backend/app/api/observability.py | 20 - apps/backend/app/core/settings.py | 5 + apps/backend/app/llmops/manager.py | 19 + apps/backend/app/llmops/reconciler.py | 8 + apps/backend/app/main.py | 4 + .../app/services/prometheus_targets.py | 84 + .../tests/unit/test_prometheus_targets.py | 99 + apps/backend/tests/unit/test_reconciler.py | 27 +- .../src/components/TimeChart.vue | 81 - .../src/components/layout/AppSidebar.vue | 4 +- apps/frontend_llmops/src/lib/api.ts | 8 - apps/frontend_llmops/src/router/index.ts | 12 +- apps/frontend_llmops/src/types/api.ts | 9 - .../src/views/MonitoringView.vue | 99 + apps/frontend_llmops/src/views/TrendsView.vue | 151 - deploy/.env.example | 18 +- deploy/docker-compose.yaml | 90 + deploy/grafana/dashboards/gpu/dcgm.json | 774 + .../dashboards/host/node_exporter_full.json | 15536 ++++++++++++++++ .../vllm/performance_statistics.json | 1405 ++ .../dashboards/vllm/query_statistics.json | 760 + .../dashboards/vllm/scheduling_capacity.json | 1331 ++ .../provisioning/alerting/contactpoints.yaml | 29 + .../grafana/provisioning/alerting/vllm.yaml | 187 + .../provisioning/dashboards/provider.yml | 15 + .../provisioning/datasources/prometheus.yml | 21 + deploy/nginx.conf | 27 + deploy/prometheus/prometheus.yml | 40 + docs/grafana_dashboarad_template.json | 892 + docs/grafana_dashboarad_template2.json | 2058 ++ docs/vllm_grafana_monitoring_guide.md | 786 + packages/llmops-store/llmops_store.py | 51 - 32 files changed, 24318 insertions(+), 332 deletions(-) create mode 100644 apps/backend/app/services/prometheus_targets.py create mode 100644 apps/backend/tests/unit/test_prometheus_targets.py delete mode 100644 apps/frontend_llmops/src/components/TimeChart.vue create mode 100644 apps/frontend_llmops/src/views/MonitoringView.vue delete mode 100644 apps/frontend_llmops/src/views/TrendsView.vue create mode 100644 deploy/grafana/dashboards/gpu/dcgm.json create mode 100644 deploy/grafana/dashboards/host/node_exporter_full.json create mode 100644 deploy/grafana/dashboards/vllm/performance_statistics.json create mode 100644 deploy/grafana/dashboards/vllm/query_statistics.json create mode 100644 deploy/grafana/dashboards/vllm/scheduling_capacity.json create mode 100644 deploy/grafana/provisioning/alerting/contactpoints.yaml create mode 100644 deploy/grafana/provisioning/alerting/vllm.yaml create mode 100644 deploy/grafana/provisioning/dashboards/provider.yml create mode 100644 deploy/grafana/provisioning/datasources/prometheus.yml create mode 100644 deploy/prometheus/prometheus.yml create mode 100644 docs/grafana_dashboarad_template.json create mode 100644 docs/grafana_dashboarad_template2.json create mode 100644 docs/vllm_grafana_monitoring_guide.md diff --git a/apps/backend/app/api/observability.py b/apps/backend/app/api/observability.py index 20274b9..3d79307 100644 --- a/apps/backend/app/api/observability.py +++ b/apps/backend/app/api/observability.py @@ -54,26 +54,6 @@ async def requests_log(request: Request, model_key: Optional[str] = None, limit: return await _store(request).recent_requests(model_key=model_key, limit=limit) -@router.get("/metrics/timeseries") -async def metrics_timeseries( - request: Request, - window: int = 3600, - bucket: int = 60, - model_key: Optional[str] = None, -): - """Bucketed request metrics over the last `window` seconds (for trend charts). - - `bucket` is the bucket width in seconds; `model_key` optionally scopes to one - model group. Each point: ts, count, error_count, avg/p95 latency, total_tokens. - """ - import time - - since = time.time() - max(60, window) - return await _store(request).timeseries( - since=since, bucket_seconds=bucket, model_key=model_key - ) - - @router.get("/models/{key}/logs") async def model_logs( key: str, tail: int = 200, manager: ModelManager = Depends(get_manager) diff --git a/apps/backend/app/core/settings.py b/apps/backend/app/core/settings.py index 6dee065..6895f28 100644 --- a/apps/backend/app/core/settings.py +++ b/apps/backend/app/core/settings.py @@ -50,6 +50,10 @@ class BackendSettings: admin_token: str = "" # Optional webhook URL; a JSON alert is POSTed when a model enters FAILED. alert_webhook: str = "" + # Optional path for the Prometheus file_sd targets file. The backend rewrites + # it whenever the set of ready vLLM instances changes, so Prometheus can + # scrape a dynamic fleet without config edits. Empty -> feature disabled. + prometheus_sd_path: str = "" # Total concurrency budget shared across running evals (sum of their # eval_batch_size). Evals run in parallel as long as the sum stays within # this; the rest queue. Maps to vLLM's max-num-seqs pressure. Runtime-editable @@ -65,6 +69,7 @@ def from_env(cls) -> "BackendSettings": return cls( admin_token=os.environ.get("LLMOPS_ADMIN_TOKEN", "").strip(), alert_webhook=os.environ.get("LLMOPS_ALERT_WEBHOOK", "").strip(), + prometheus_sd_path=os.environ.get("LLMOPS_PROMETHEUS_SD_PATH", "").strip(), poll_interval=_env_float("LLMOPS_POLL_INTERVAL", 2.0), start_timeout=_env_float("LLMOPS_START_TIMEOUT", 300.0), stop_timeout=_env_float("LLMOPS_STOP_TIMEOUT", 10.0), diff --git a/apps/backend/app/llmops/manager.py b/apps/backend/app/llmops/manager.py index 9d91534..ef1fbbe 100644 --- a/apps/backend/app/llmops/manager.py +++ b/apps/backend/app/llmops/manager.py @@ -144,6 +144,25 @@ async def trigger_router_reload(self) -> bool: logger.warning("Router reload POST failed (%s/reload)", self.router_url) return False + async def write_prometheus_targets(self) -> bool: + """Best-effort: refresh the Prometheus file_sd targets file to reflect the + currently-ready vLLM instances. No-op unless prometheus_sd_path is set. + Write-if-changed and never raises — monitoring discovery must never break + the model state machine. The (blocking) file IO runs in the executor.""" + path = self.settings.prometheus_sd_path + if not path: + return False + from app.services.prometheus_targets import build_targets, write_targets_file + + instances = await self.registry.snapshot() + targets = build_targets(instances) + loop = asyncio.get_event_loop() + try: + return await loop.run_in_executor(None, write_targets_file, path, targets) + except Exception: + logger.warning("Failed to write Prometheus SD file at %s", path) + return False + async def list(self) -> list[ModelInstance]: return await self.registry.snapshot() diff --git a/apps/backend/app/llmops/reconciler.py b/apps/backend/app/llmops/reconciler.py index 2910a0e..bb17bce 100644 --- a/apps/backend/app/llmops/reconciler.py +++ b/apps/backend/app/llmops/reconciler.py @@ -209,6 +209,14 @@ async def reconcile_once( for inst, _frm, to, _detail in transitions ): await manager.trigger_router_reload() + # Keep the Prometheus scrape-target file in sync whenever a vLLM instance + # joins or leaves the ready pool (READY in either direction of a transition), + # so monitoring tracks the live fleet. Idempotent (write-if-changed). + if manager is not None and any( + inst.kind == ModelKind.LLM and ModelState.READY in (frm, to) + for inst, frm, to, _detail in transitions + ): + await manager.write_prometheus_targets() if manager is not None and settings.auto_restart: await _process_restarts(registry, settings, store, manager) diff --git a/apps/backend/app/main.py b/apps/backend/app/main.py index 57c7db1..3d5351e 100644 --- a/apps/backend/app/main.py +++ b/apps/backend/app/main.py @@ -118,6 +118,10 @@ async def lifespan(app: FastAPI): # honest from the first response. await adopt_running(registry, http_client, settings, store) + # Seed the Prometheus file_sd targets file (covering adopted-ready instances) + # so monitoring has a valid file from t=0, before the first state transition. + await manager.write_prometheus_targets() + tasks = [ asyncio.create_task(reconcile_loop(registry, http_client, settings, store, manager)), asyncio.create_task(_gpu_poll_loop(app, settings.gpu_poll_interval)), diff --git a/apps/backend/app/services/prometheus_targets.py b/apps/backend/app/services/prometheus_targets.py new file mode 100644 index 0000000..48d09b3 --- /dev/null +++ b/apps/backend/app/services/prometheus_targets.py @@ -0,0 +1,84 @@ +"""Prometheus file-based service discovery for the backend-owned vLLM fleet. + +vLLM instances are spawned on demand on dynamic localhost ports (and come and go +as models are added/removed/auto-restarted), so a static Prometheus scrape config +would constantly drift. Instead the backend — which already owns the registry, the +single source of truth for which instance is on which port — writes a Prometheus +`file_sd` targets file listing every *ready* vLLM instance. Prometheus watches the +file and picks up changes within its refresh interval, no restart needed. + +Only LLM (vLLM) instances are emitted: vLLM exposes a Prometheus-format `/metrics` +on its OpenAI port, whereas the embedding/reranker server does not. + +The file lives in the shared data volume and is read by the Prometheus container +(which joins the backend's network namespace, so the `localhost:` targets +resolve to the same vLLM processes the backend spawned). +""" +from __future__ import annotations + +import json +import os +from typing import Iterable + +from app.llmops.instance import ModelInstance +from app.llmops.state import ModelKind, ModelState + + +def build_targets(instances: Iterable[ModelInstance]) -> list[dict]: + """Build the Prometheus file_sd target list from registry instances. + + One entry per ready vLLM instance. `targets` is the scrape address + (`host:port`); Prometheus appends the configured metrics_path (`/metrics`). + Labels carry the group/instance identity and model tag so dashboards can + join on something meaningful instead of the volatile `host:port`. + + Sorted by address so the serialized output is stable — the writer can then + skip an identical rewrite and avoid churning the file (which would otherwise + nudge Prometheus to re-read it every reconcile pass). + """ + targets: list[dict] = [] + for inst in instances: + if inst.kind != ModelKind.LLM or inst.state != ModelState.READY: + continue + group, _, instance_id = inst.key.partition("::") + targets.append( + { + "targets": [f"{inst.host}:{inst.port}"], + "labels": { + "group": group, + "instance_id": instance_id, + "model_tag": inst.model_tag or "", + }, + } + ) + targets.sort(key=lambda t: t["targets"][0]) + return targets + + +def render(targets: list[dict]) -> str: + """Serialize the target list to the JSON Prometheus file_sd expects.""" + return json.dumps(targets, indent=2, sort_keys=True) + + +def write_targets_file(path: str, targets: list[dict]) -> bool: + """Atomically write the SD file if its content changed. Returns True if it + was (re)written, False if the on-disk content already matched. + + Write-if-changed keeps Prometheus from re-reading an identical file on every + reconcile tick. The write is atomic (temp + os.replace) so Prometheus never + observes a half-written, unparseable file. + """ + payload = render(targets) + try: + with open(path, encoding="utf-8") as f: + if f.read() == payload: + return False + except (OSError, ValueError): + pass # missing/unreadable -> (re)write below + + os.makedirs(os.path.dirname(path) or ".", exist_ok=True) + tmp = f"{path}.tmp" + with open(tmp, "w", encoding="utf-8") as f: + f.write(payload) + os.replace(tmp, path) # atomic on POSIX + return True diff --git a/apps/backend/tests/unit/test_prometheus_targets.py b/apps/backend/tests/unit/test_prometheus_targets.py new file mode 100644 index 0000000..1e8732b --- /dev/null +++ b/apps/backend/tests/unit/test_prometheus_targets.py @@ -0,0 +1,99 @@ +import json + +import pytest + +from app.core.settings import BackendSettings +from app.llmops.launchers import EMBEDDING_KEY, EmbeddingLauncher, VllmLauncher +from app.llmops.manager import ModelManager, build_registry +from app.llmops.state import ModelState +from app.services.prometheus_targets import (build_targets, render, + write_targets_file) +from tests.conftest import FAKE_CONFIG, FakeHTTPClient + +pytestmark = pytest.mark.unit + +HEALTHY = "Qwen3-0.6B::qwen3" # port 8002 +OTHER = "Qwen3-0.6B::qwen3-2" # port 8004 + + +def _registry(): + return build_registry(FAKE_CONFIG, "config.yaml", [VllmLauncher(), EmbeddingLauncher()]) + + +def test_build_targets_only_includes_ready_llm(): + reg = _registry() + reg.get(HEALTHY).state = ModelState.READY + reg.get(OTHER).state = ModelState.STARTING # not ready -> excluded + + targets = build_targets(reg.values()) + + assert len(targets) == 1 + entry = targets[0] + assert entry["targets"] == ["localhost:8002"] + assert entry["labels"]["group"] == "Qwen3-0.6B" + assert entry["labels"]["instance_id"] == "qwen3" + assert entry["labels"]["model_tag"] == "Qwen/Qwen3-0.6B" + + +def test_build_targets_excludes_embedding_server(): + # The embedding/reranker server is not vLLM and exposes no Prometheus metrics. + reg = _registry() + emb = reg.get(EMBEDDING_KEY) + assert emb is not None + emb.state = ModelState.READY + + assert build_targets(reg.values()) == [] + + +def test_build_targets_is_sorted_and_stable(): + reg = _registry() + reg.get(HEALTHY).state = ModelState.READY # 8002 + reg.get(OTHER).state = ModelState.READY # 8004 + + addrs = [t["targets"][0] for t in build_targets(reg.values())] + assert addrs == ["localhost:8002", "localhost:8004"] # sorted by address + + +def test_write_targets_file_writes_then_skips_unchanged(tmp_path): + path = str(tmp_path / "sub" / "targets.json") # parent created on demand + targets = [{"targets": ["localhost:8002"], "labels": {"group": "g"}}] + + assert write_targets_file(path, targets) is True # first write + assert json.loads(open(path).read()) == targets + assert write_targets_file(path, targets) is False # identical -> skip + + targets2 = targets + [{"targets": ["localhost:8004"], "labels": {"group": "g"}}] + assert write_targets_file(path, targets2) is True # changed -> rewrite + assert json.loads(open(path).read()) == targets2 + + +def test_write_targets_file_leaves_no_tmp_artifact(tmp_path): + path = tmp_path / "targets.json" + write_targets_file(str(path), []) + assert not (tmp_path / "targets.json.tmp").exists() + assert path.read_text() == render([]) + + +async def test_manager_noop_without_path_configured(): + # Default settings leave prometheus_sd_path empty -> feature disabled. + reg = _registry() + mgr = ModelManager( + reg, [VllmLauncher(), EmbeddingLauncher()], FakeHTTPClient(), + FAKE_CONFIG, "config.yaml", BackendSettings(), + ) + assert await mgr.write_prometheus_targets() is False + + +async def test_manager_writes_ready_targets_when_path_set(tmp_path): + path = str(tmp_path / "targets.json") + reg = _registry() + reg.get(HEALTHY).state = ModelState.READY + settings = BackendSettings(prometheus_sd_path=path) + mgr = ModelManager( + reg, [VllmLauncher(), EmbeddingLauncher()], FakeHTTPClient(), + FAKE_CONFIG, "config.yaml", settings, + ) + + assert await mgr.write_prometheus_targets() is True + written = json.loads(open(path).read()) + assert [t["targets"][0] for t in written] == ["localhost:8002"] diff --git a/apps/backend/tests/unit/test_reconciler.py b/apps/backend/tests/unit/test_reconciler.py index 569e082..12e2a24 100644 --- a/apps/backend/tests/unit/test_reconciler.py +++ b/apps/backend/tests/unit/test_reconciler.py @@ -45,15 +45,20 @@ async def test_starting_becomes_ready_when_health_ok(): class _ReloadSpyManager: - """Minimal manager stub capturing router-reload nudges.""" + """Minimal manager stub capturing router-reload + Prometheus SD nudges.""" def __init__(self): self.reloads = 0 + self.sd_writes = 0 async def trigger_router_reload(self): self.reloads += 1 return True + async def write_prometheus_targets(self): + self.sd_writes += 1 + return True + async def test_ready_transition_nudges_router_reload(): reg = _registry() @@ -67,14 +72,32 @@ async def test_ready_transition_nudges_router_reload(): await reconcile_once(reg, FakeHTTPClient(healthy_ports={8002}), _settings(), manager=mgr) assert inst.state == ModelState.READY assert mgr.reloads == 1 + assert mgr.sd_writes == 1 # joining the ready pool refreshes scrape targets async def test_no_ready_transition_does_not_reload(): - # Steady-state pass (nothing turns READY) must not spam the router. + # Steady-state pass (nothing turns READY) must not spam the router or rewrite SD. reg = _registry() mgr = _ReloadSpyManager() await reconcile_once(reg, FakeHTTPClient(healthy_ports=set()), _settings(), manager=mgr) assert mgr.reloads == 0 + assert mgr.sd_writes == 0 + + +async def test_ready_to_failed_refreshes_sd_but_not_router(): + # A ready vLLM dying leaves the pool: SD must be rewritten (drop the target), + # but the router reload only fires on instances *joining* the pool. + reg = _registry() + inst = reg.get(HEALTHY) + inst.state = ModelState.READY + inst.managed = True + inst.proc = FakeProc(returncode=139) # crashed + + mgr = _ReloadSpyManager() + await reconcile_once(reg, FakeHTTPClient(healthy_ports={8002}), _settings(), manager=mgr) + assert inst.state == ModelState.FAILED + assert mgr.sd_writes == 1 + assert mgr.reloads == 0 async def test_starting_times_out_to_failed(): diff --git a/apps/frontend_llmops/src/components/TimeChart.vue b/apps/frontend_llmops/src/components/TimeChart.vue deleted file mode 100644 index 78bec48..0000000 --- a/apps/frontend_llmops/src/components/TimeChart.vue +++ /dev/null @@ -1,81 +0,0 @@ - - - diff --git a/apps/frontend_llmops/src/components/layout/AppSidebar.vue b/apps/frontend_llmops/src/components/layout/AppSidebar.vue index e906672..2822f57 100644 --- a/apps/frontend_llmops/src/components/layout/AppSidebar.vue +++ b/apps/frontend_llmops/src/components/layout/AppSidebar.vue @@ -12,11 +12,11 @@ import { KeyRound, Layers, LayoutDashboard, + LineChart, Package, Receipt, Server, TerminalSquare, - TrendingUp, } from '@lucide/vue' import { useModelsStore } from '@/stores/models' import StatusDot from '@/components/StatusDot.vue' @@ -53,8 +53,8 @@ const nav = [ { to: '/', label: '總覽', icon: LayoutDashboard }, { to: '/models', label: '模型', icon: Server }, { to: '/traffic', label: '流量', icon: ArrowLeftRight }, - { to: '/trends', label: '趨勢', icon: TrendingUp }, { to: '/requests', label: '請求', icon: Receipt }, + { to: '/monitoring', label: '監控', icon: LineChart }, { to: '/playground', label: '測試台', icon: TerminalSquare }, { to: '/benchmark', label: '壓測', icon: Gauge }, { to: '/eval', label: '評測', icon: ClipboardCheck }, diff --git a/apps/frontend_llmops/src/lib/api.ts b/apps/frontend_llmops/src/lib/api.ts index 82b5ffe..a1d61e1 100644 --- a/apps/frontend_llmops/src/lib/api.ts +++ b/apps/frontend_llmops/src/lib/api.ts @@ -31,7 +31,6 @@ import type { RouterMetrics, SettingValue, StateEvent, - TimeseriesPoint, UsageRow, } from '@/types/api' @@ -167,13 +166,6 @@ export const api = { request(API_BASE, `/api/models/${enc(key)}/logs?tail=${tail}`), getModelMetrics: (key: string) => request(API_BASE, `/api/models/${enc(key)}/metrics`), - getTimeseries: (opts: { window?: number; bucket?: number; modelKey?: string } = {}) => { - const params = new URLSearchParams() - params.set('window', String(opts.window ?? 3600)) - params.set('bucket', String(opts.bucket ?? 60)) - if (opts.modelKey) params.set('model_key', opts.modelKey) - return request(API_BASE, `/api/metrics/timeseries?${params.toString()}`) - }, healthz: () => request(API_BASE, '/healthz'), // ---- LLM Router ----------------------------------------------------------- diff --git a/apps/frontend_llmops/src/router/index.ts b/apps/frontend_llmops/src/router/index.ts index bd95bb8..dc0e50d 100644 --- a/apps/frontend_llmops/src/router/index.ts +++ b/apps/frontend_llmops/src/router/index.ts @@ -21,18 +21,18 @@ const router = createRouter({ meta: { title: 'Traffic' }, component: () => import('@/views/TrafficView.vue'), }, - { - path: '/trends', - name: 'trends', - meta: { title: 'Trends' }, - component: () => import('@/views/TrendsView.vue'), - }, { path: '/requests', name: 'requests', meta: { title: 'Requests' }, component: () => import('@/views/RequestsView.vue'), }, + { + path: '/monitoring', + name: 'monitoring', + meta: { title: 'Monitoring' }, + component: () => import('@/views/MonitoringView.vue'), + }, { path: '/benchmark', name: 'benchmark', diff --git a/apps/frontend_llmops/src/types/api.ts b/apps/frontend_llmops/src/types/api.ts index 0aaf1be..a0ebf4f 100644 --- a/apps/frontend_llmops/src/types/api.ts +++ b/apps/frontend_llmops/src/types/api.ts @@ -42,15 +42,6 @@ export interface ModelView { restart_count?: number } -export interface TimeseriesPoint { - ts: number - count: number - error_count: number - avg_latency_ms: number | null - p95_latency_ms: number | null - total_tokens: number -} - export interface MemoryInfo { total: number available: number diff --git a/apps/frontend_llmops/src/views/MonitoringView.vue b/apps/frontend_llmops/src/views/MonitoringView.vue new file mode 100644 index 0000000..08616b3 --- /dev/null +++ b/apps/frontend_llmops/src/views/MonitoringView.vue @@ -0,0 +1,99 @@ + + +