diff --git a/.specify/feature.json b/.specify/feature.json index 52c047d..509b591 100644 --- a/.specify/feature.json +++ b/.specify/feature.json @@ -1,3 +1,3 @@ { - "feature_directory": "specs/020-deploy-smoke-verification" + "feature_directory": "specs/021-ops-health-dashboard" } diff --git a/ACTIVE_SPECS.md b/ACTIVE_SPECS.md index 90554a0..b7c112f 100644 --- a/ACTIVE_SPECS.md +++ b/ACTIVE_SPECS.md @@ -6,7 +6,7 @@ when the spec is fully finished. ## Open Specs -- No open specs. +- `021-ops-health-dashboard`: implementation and validation complete; remaining work is commit, push, PR, and GitHub validation. ## Maintenance Rules diff --git a/AGENTS.md b/AGENTS.md index 43ccdb8..72e5a43 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -10,6 +10,9 @@ Auto-generated from all feature plans. Last updated: 2026-06-11 ## Active Technologies +- TypeScript 5.9, Next.js 16 App Router, React 19 + Existing Next.js server components/API routes, Prisma 7, Better Auth role/session helpers, next-intl, lucide-react, existing monitoring and app-version helpers (021-ops-health-dashboard) +- Existing Prisma database only; no new tables planned. Use existing background job records for worker evidence and existing deployment/runtime metadata when available. 
(021-ops-health-dashboard) + - TypeScript 5.9 on Node.js via the existing `tsx` dev dependency + Node built-ins, existing `tsx`, Azure CLI available in deployment runners (020-deploy-smoke-verification) - No new storage; smoke evidence remains command output and GitHub step summary (020-deploy-smoke-verification) @@ -41,11 +44,11 @@ TypeScript 5.9 on Next.js 16 App Router (React 19): Follow standard conventions ## Recent Changes +- 021-ops-health-dashboard: Added TypeScript 5.9, Next.js 16 App Router, React 19 + Existing Next.js server components/API routes, Prisma 7, Better Auth role/session helpers, next-intl, lucide-react, existing monitoring and app-version helpers + - 020-deploy-smoke-verification: Added TypeScript 5.9 on Node.js via the existing `tsx` dev dependency + Node built-ins, existing `tsx`, Azure CLI available in deployment runners - 019-logging-standardization: Added TypeScript 5.9 on Next.js 16 App Router with React 19; Python 3.12 worker; PowerShell/Node validation scripts + Existing `src/lib/logger.ts`, `src/proxy.ts`, `src/instrumentation.ts`, Prisma-backed services, Python stdlib `logging`/`json`, Vitest, Playwright, existing validation scripts -- 017-deepsec-remediation: Added TypeScript 5.9 on Next.js 16 App Router, React 19, Python 3.12 worker where affected, PowerShell validation scripts + Prisma 7, Better Auth, Zod, Vitest, Playwright, GitHub Actions, GoReleaser, DeepSec 2.0.12 - diff --git a/CONTINUE.md b/CONTINUE.md index b3ce0c1..39ba7fe 100644 --- a/CONTINUE.md +++ b/CONTINUE.md @@ -4,46 +4,30 @@ ## Current Snapshot -- Updated: 2026-06-11 14:46:32 -- Branch: `main` +- Updated: 2026-06-11 22:53:35 +- Branch: `021-ops-health-dashboard` ## Recent Non-Continuity Commits +- 8047615 feat: expose runtime build metadata (#4) - 6c81729 feat: add azure deployment smoke verification (#3) - 3d52264 fix: move state queue logging to dedicated resource +- 34de987 chore: record clean handoff - 25306fd chore: refresh specs overview -- dd226de test: 
update opentofu action pin assertion -- 9b92cb5 ci: update opentofu setup action ## Git Status -- M .env.docker.example -- M .env.example -- M .github/workflows/deploy-azure.yml -- M Dockerfile.app -- M Dockerfile.worker -- M README.md -- M docker-compose.yml -- M infra/azure/main.tf -- M infra/azure/modules/runtime/app.tf -- M infra/azure/modules/runtime/job.tf -- M infra/azure/modules/runtime/variables.tf -- M infra/azure/modules/runtime/worker.tf -- M infra/azure/variables.tf -- M specs/018-opentofu-azure-infra/quickstart.md -- M src/components/ui/AppVersionBadge.tsx -- M src/lib/app-version.ts -- M tests/unit/security/deploy-workflow.test.ts -- ?? src/app/api/version/ -- ?? tests/unit/app-version.test.ts -- ?? tests/unit/version-route.test.ts +- Existing handoff edits retained for inclusion in the next PR +- Active spec implementation completed under `specs/021-ops-health-dashboard/` +- `.specify/feature.json` now points at `specs/021-ops-health-dashboard` +- Full validation passed locally, including Trivy/container scans and Playwright E2E ## Active Specs -- None +- `021-ops-health-dashboard`: Implementation and validation complete; PR cleanup remains ## Next Recommended Actions -1. Review, commit, and push the runtime build metadata changes. -2. Optionally open a PR and confirm GitHub Actions validation. -3. Use `APP_ENVIRONMENT`, `APP_VERSION`, `APP_REVISION`, `APP_BUILD_ID`, and `APP_BUILT_AT` for dev/staging traceability instead of generated version files. +1. Commit and push `021-ops-health-dashboard`. +2. Open a PR and watch GitHub validation. +3. Include `CONTINUE.md` and `CONTINUE_LOG.md` housekeeping changes in the PR. diff --git a/CONTINUE_LOG.md b/CONTINUE_LOG.md index 240fcff..373b9db 100644 --- a/CONTINUE_LOG.md +++ b/CONTINUE_LOG.md @@ -1,5 +1,46 @@ # Continue Log +## 2026-06-11 21:14:00 + +- Implemented ops health dashboard feature slices for spec `021-ops-health-dashboard`. 
+- Added admin `/admin/ops` page, `/api/admin/ops-health` snapshot API, shared ops health snapshot logic, localized UI, responsive e2e coverage, and safe copy diagnostics. +- Focused Vitest and Playwright ops-health checks pass; final validation remains. + +## 2026-06-11 18:05:00 + +- Applied `/speckit.analyze` remediation edits for spec `021-ops-health-dashboard`. +- Tightened config sanity scope, copy toast feedback, safe fatal-error testing, and full pre-merge validation tasks. + +## 2026-06-11 17:55:00 + +- Generated task list for spec `021-ops-health-dashboard`. +- Tasks cover setup, shared snapshot foundation, three independently testable user stories, and polish/validation. +- Next action: run `/speckit.implement`. + +## 2026-06-11 17:40:00 + +- Planned spec `021-ops-health-dashboard`. +- Added `plan.md`, `research.md`, `data-model.md`, `contracts/ops-health-dashboard.md`, and `quickstart.md`. +- Updated `ACTIVE_SPECS.md` and Codex agent context. Next action: run `/speckit.tasks`. + +## 2026-06-11 17:24:00 + +- Clarified spec `021-ops-health-dashboard`. +- Decisions: admin-only access, admin/ops navigation, point-in-time snapshot with manual refresh, recent recorded worker/smoke status only, and copyable non-secret diagnostic summary. +- Next action: run `/speckit.plan`. + +## 2026-06-11 17:09:00 + +- Created spec `021-ops-health-dashboard` on branch `021-ops-health-dashboard`. +- Added requirement checklist and refreshed `specs/OVERVIEW.md` plus `.specify/feature.json`. +- Kept prior continuity housekeeping changes for inclusion in the next PR, per user request. + +## 2026-06-11 16:59:10 + +- Corrected continuity snapshot after PR #4 merge. +- Current state: clean `main`, runtime build metadata feature merged, local and GitHub validation green. +- Next recommendation: select the next user-visible feature area rather than adding more deployment plumbing immediately. 
+ ## 2026-05-28 09:15:49 +02:00 - Implemented spec `016-runtime-credential-separation` end to end. @@ -1517,3 +1558,12 @@ - Validation passed: focused version/workflow tests, `pnpm run typecheck`, and `.\validate.ps1 all`. - Active specs: none. - Next focus: review, commit, push, and optionally open a PR for the metadata changes. + +## 2026-06-11 22:53:35 + +- Implemented spec `021-ops-health-dashboard` on branch `021-ops-health-dashboard`. +- Added an administrator-only `/admin/ops` dashboard, `/api/admin/ops-health` snapshot route, reusable ops health snapshot/redaction logic, admin navigation, i18n copy, and focused unit/integration/e2e coverage. +- Adjusted local validation so Vitest uses the standard PostgreSQL test URL when no database URL is configured, matching the generated PostgreSQL Prisma client. +- Validation passed: focused ops unit/integration tests, focused ops Playwright tests, `.\validate.ps1 quality`, `.\validate.ps1 all`, and `.\validate.ps1 full` including Trivy/container scans and full Playwright E2E. +- Active specs: `021-ops-health-dashboard` implemented and validated; PR cleanup remains. +- Next focus: commit, push, open a PR, and confirm GitHub validation. 
diff --git a/specs/021-ops-health-dashboard/checklists/requirements.md b/specs/021-ops-health-dashboard/checklists/requirements.md new file mode 100644 index 0000000..3f4ed31 --- /dev/null +++ b/specs/021-ops-health-dashboard/checklists/requirements.md @@ -0,0 +1,35 @@ +# Specification Quality Checklist: Ops Health Dashboard + +**Purpose**: Validate specification completeness and quality before proceeding to planning +**Created**: 2026-06-11 +**Feature**: [spec.md](../spec.md) + +## Content Quality + +- [x] No implementation details (languages, frameworks, APIs) +- [x] Focused on user value and business needs +- [x] Written for non-technical stakeholders +- [x] All mandatory sections completed + +## Requirement Completeness + +- [x] No [NEEDS CLARIFICATION] markers remain +- [x] Requirements are testable and unambiguous +- [x] Success criteria are measurable +- [x] Success criteria are technology-agnostic (no implementation details) +- [x] All acceptance scenarios are defined +- [x] Edge cases are identified +- [x] Scope is clearly bounded +- [x] Dependencies and assumptions identified + +## Feature Readiness + +- [x] All functional requirements have clear acceptance criteria +- [x] User scenarios cover primary flows +- [x] Feature meets measurable outcomes defined in Success Criteria +- [x] No implementation details leak into specification + +## Notes + +- Clarification pass completed on 2026-06-11 with five accepted answers covering access, refresh model, navigation, worker/smoke evidence, and copyable diagnostics. +- Specification is ready for `/speckit.plan`. 
diff --git a/specs/021-ops-health-dashboard/clarify.md b/specs/021-ops-health-dashboard/clarify.md new file mode 100644 index 0000000..e72a4aa --- /dev/null +++ b/specs/021-ops-health-dashboard/clarify.md @@ -0,0 +1,22 @@ +# Clarifications: Ops Health Dashboard + +**Feature Branch**: `021-ops-health-dashboard` +**Date**: 2026-06-11 +**Spec**: [spec.md](./spec.md) + +## Session 2026-06-11 + +The clarification pass resolved the following product and implementation boundaries before planning and task generation: + +1. **Access model**: The first version is admin-only. Developers use administrator accounts in dev/staging when they need the operational view. +2. **Refresh model**: Health data is a read-only point-in-time snapshot captured when the dashboard opens or when an administrator manually refreshes it. +3. **Navigation placement**: The dashboard belongs in the existing admin/ops area navigation. +4. **Worker and smoke evidence**: Show recent recorded worker/deploy smoke status when available; otherwise report unknown or unavailable. +5. **Diagnostic sharing**: Include a copyable non-secret summary in the first version. + +## Applied Spec Changes + +- Added clarifications to [spec.md](./spec.md). +- Kept the dashboard read-only and administrator-scoped. +- Kept optional worker/deploy smoke signals evidence-based rather than active probes. +- Required diagnostic output to avoid secrets, cookies, auth headers, private keys, passwords, and full connection strings. diff --git a/specs/021-ops-health-dashboard/contracts/ops-health-dashboard.md b/specs/021-ops-health-dashboard/contracts/ops-health-dashboard.md new file mode 100644 index 0000000..6cc8625 --- /dev/null +++ b/specs/021-ops-health-dashboard/contracts/ops-health-dashboard.md @@ -0,0 +1,105 @@ +# Contract: Ops Health Dashboard + +## Admin Page + +**Route**: `/admin/ops` + +**Audience**: Platform administrators only. + +**Behavior**: + +- Renders inside the existing dashboard shell and navigation. 
+- Shows environment/build metadata. +- Shows overall status and individual health areas. +- Shows a timestamp for the current snapshot. +- Offers a manual refresh action that loads a new snapshot. +- Offers a copy action for the non-secret diagnostic summary. +- Redirects or denies access for non-admin users using the existing admin-page pattern. + +## Snapshot API + +**Route**: `/api/admin/ops-health` + +**Method**: `GET` + +**Access**: Platform administrators only. + +**Response: 200** + +```json +{ + "capturedAt": "2026-06-11T15:24:00.000Z", + "overallStatus": "healthy", + "environment": { + "environment": "staging", + "version": "staging-42", + "revision": "abcdef123456", + "buildId": "123.2", + "builtAt": "2026-06-11T12:00:00.000Z" + }, + "checks": [ + { + "key": "runtime", + "status": "healthy", + "summary": "Runtime is responding", + "checkedAt": "2026-06-11T15:24:00.000Z" + }, + { + "key": "database", + "status": "healthy", + "summary": "Database connectivity check passed", + "checkedAt": "2026-06-11T15:24:00.000Z" + }, + { + "key": "configuration", + "status": "healthy", + "summary": "Required runtime configuration is present", + "checkedAt": "2026-06-11T15:24:00.000Z" + }, + { + "key": "worker", + "status": "unknown", + "summary": "No recent worker evidence is available" + }, + { + "key": "deploySmoke", + "status": "unavailable", + "summary": "No recent deployment smoke result is available" + } + ], + "diagnosticSummary": { + "generatedAt": "2026-06-11T15:24:00.000Z", + "text": "Environment: staging\nVersion: staging-42\nRevision: abcdef123456\nBuild ID: 123.2\nOverall: healthy\nruntime: healthy\ndatabase: healthy\nconfiguration: healthy\nworker: unknown\ndeploySmoke: unavailable" + } +} +``` + +**Response: 401/403** + +Uses the existing unauthorized response/redirect behavior for API routes. + +**Failure behavior**: + +- A degraded check should normally return `200` with `overallStatus: "degraded"` so the page can render available diagnostics. 
+- Reserve `5xx` for failures that prevent assembling any safe snapshot. +- Response bodies must not include raw secrets or full configuration values. + +## Navigation Contract + +Add an admin-only navigation item: + +- Label key: `nav.opsHealth` +- Target: `/admin/ops` +- Icon: use an existing lucide status/heartbeat/activity-style icon + +## Internationalization Contract + +Add translation keys for all visible labels, statuses, summaries, button text, and copy feedback in: + +- `src/i18n/messages/en.json` +- `src/i18n/messages/de.json` +- `src/i18n/messages/es.json` +- `src/i18n/messages/fr.json` +- `src/i18n/messages/pt.json` + +No hardcoded user-facing strings are allowed in page/components. diff --git a/specs/021-ops-health-dashboard/data-model.md b/specs/021-ops-health-dashboard/data-model.md new file mode 100644 index 0000000..87bca57 --- /dev/null +++ b/specs/021-ops-health-dashboard/data-model.md @@ -0,0 +1,98 @@ +# Data Model: Ops Health Dashboard + +The first version uses in-memory view models assembled from existing runtime metadata, health checks, and available recorded operational evidence. No new database tables are planned. + +## EnvironmentIdentity + +Identifies the running environment and build. + +| Field | Type | Required | Notes | +| ------------- | -------------------- | -------- | -------------------------------------------------------------------- | +| `environment` | string | yes | `local`, `development`, `staging`, `production`, or configured value | +| `version` | string | yes | Runtime version label; fallback allowed for local development | +| `revision` | string | no | Commit SHA or revision identifier | +| `buildId` | string | no | CI/run/build identifier | +| `builtAt` | ISO timestamp string | no | Build timestamp if provided | + +Validation rules: + +- Missing optional values render as `unknown` or `unavailable`. +- Values must be treated as identifiers only; they must not include secrets. 
+ +## HealthCheckResult + +Represents one operational health area. + +| Field | Type | Required | Notes | +| ----------- | ---------------------------- | -------- | --------------------------------------------------------------- | +| `key` | enum string | yes | `runtime`, `database`, `configuration`, `worker`, `deploySmoke` | +| `label` | translation key | yes | Localized in UI | +| `status` | enum string | yes | `healthy`, `degraded`, `unknown`, `unavailable` | +| `summary` | translation key or safe text | yes | Short human-readable status | +| `detail` | translation key or safe text | no | Safe next-step detail | +| `checkedAt` | ISO timestamp string | no | Present when evaluated or recorded | + +Validation rules: + +- `degraded` must include safe detail that points to an investigation area. +- `unknown` and `unavailable` must not cause the whole dashboard to fail by themselves. +- Raw secret/config values are never valid in `summary` or `detail`. + +## HealthSnapshot + +Point-in-time administrator view of the environment. + +| Field | Type | Required | Notes | +| ------------------- | -------------------- | -------- | ------------------------------------- | +| `capturedAt` | ISO timestamp string | yes | Time this snapshot was assembled | +| `overallStatus` | enum string | yes | Worst meaningful status across checks | +| `environment` | EnvironmentIdentity | yes | Current runtime identity | +| `checks` | HealthCheckResult[] | yes | Ordered health areas | +| `diagnosticSummary` | DiagnosticSummary | yes | Copy-safe summary | + +Overall status rules: + +1. Any `degraded` required check makes overall status `degraded`. +2. If no required check is degraded and at least one required check is unknown, overall status is `unknown`. +3. Optional worker/deploy smoke unknown/unavailable states do not degrade the overall status. +4. Otherwise the overall status is `healthy`. + +## DiagnosticSummary + +Copyable non-secret representation of the snapshot. 
+ +| Field | Type | Required | Notes | +| ------------- | -------------------- | -------- | ------------------------------------------------ | +| `text` | string | yes | Multi-line plain text suitable for issue reports | +| `generatedAt` | ISO timestamp string | yes | Usually same as `capturedAt` | + +Allowed content: + +- Environment name +- Version, revision, build id, built-at timestamp +- Snapshot timestamp +- Overall status +- Health check keys and statuses +- Safe short summaries + +Forbidden content: + +- Tokens, passwords, API keys, private keys +- Full connection strings +- Raw environment variable values except approved build metadata +- Request cookies, session IDs, auth headers + +## Recorded Operational Evidence + +Existing data that may inform optional worker and deploy smoke checks. + +Potential sources: + +- Existing background job records for worker evidence +- Existing deployment smoke output if the application records it in a safe, queryable place + +Rules: + +- If no recent safe record exists, show `unknown` or `unavailable`. +- The dashboard must not execute deploy smoke tests. +- Any payload/result/error content read from records must be sanitized before display or copying. diff --git a/specs/021-ops-health-dashboard/plan.md b/specs/021-ops-health-dashboard/plan.md new file mode 100644 index 0000000..769433a --- /dev/null +++ b/specs/021-ops-health-dashboard/plan.md @@ -0,0 +1,128 @@ +# Implementation Plan: Ops Health Dashboard + +**Branch**: `021-ops-health-dashboard` | **Date**: 2026-06-11 | **Spec**: [spec.md](./spec.md) +**Input**: Feature specification from `/specs/021-ops-health-dashboard/spec.md` + +**Required First Step**: Read `/CONTINUE.md` before planning or implementation so the current handoff context, open risks, and recommended next actions are carried forward. + +## Summary + +Add an administrator-only Ops Health Dashboard inside the existing dashboard/admin navigation. 
The first version presents a read-only point-in-time health snapshot gathered on page load or manual refresh, combining existing build metadata, runtime/database checks, configuration sanity, recent worker evidence, and recent deployment smoke evidence where available. It also provides a copyable non-secret diagnostic summary. The implementation should reuse existing health/version/background-job patterns, add no new persistent storage (worker and smoke signals are read only where they are already recorded), and keep all diagnostic output secret-safe. + +## Technical Context + +**Language/Version**: TypeScript 5.9, Next.js 16 App Router, React 19 +**Primary Dependencies**: Existing Next.js server components/API routes, Prisma 7, Better Auth role/session helpers, next-intl, lucide-react, existing monitoring and app-version helpers +**Storage**: Existing Prisma database only; no new tables planned. Use existing background job records for worker evidence and existing deployment/runtime metadata when available. +**Testing**: Vitest unit/integration tests, Playwright e2e for administrator dashboard access and copy interaction, existing `validate.ps1`/`pnpm` validation pipeline +**Target Platform**: Web application running locally, in Docker, and on Azure Container Apps +**Project Type**: Next.js web application with server-rendered dashboard pages and API routes +**Performance Goals**: Healthy snapshot visible within 5 seconds; health checks bounded so one slow/degraded area does not make the dashboard unusable +**Constraints**: Admin-only access; no raw secrets, tokens, passwords, private keys, or full connection strings in UI/API/copy output; configuration sanity checks report presence/readiness only for authentication, database URL ownership, runtime environment, and build metadata; all user-facing text via next-intl; responsive layout; configurable base path respected; user-triggered copy actions provide toast-style feedback +**Scale/Scope**: Small-team operational page for roughly 10 
users and single-instance deployments + +## Constitution Check + +_GATE: Must pass before Phase 0 research. Re-check after Phase 1 design._ + +- **Simplicity First**: PASS. Reuse existing app-version, monitoring, role auth, background job, navigation, and i18n patterns. No new dependency or persistence layer. +- **Test Coverage**: PASS. Plan includes unit tests for snapshot/sanitization/status aggregation, route/page auth tests, and e2e coverage for the admin flow. +- **Duplication Control**: PASS. Shared status/snapshot formatting should live in a small service/module rather than duplicating logic between route, page, and copy summary. +- **Incremental Delivery**: PASS. User Story 1 can ship first with metadata and admin navigation; User Story 2 adds health checks; User Story 3 adds copyable diagnostics. +- **Spec Sequencing And Completion**: PASS. `020` is fully implemented, `021` is the active spec, and `ACTIVE_SPECS.md` will be updated. +- **Continuity And Handoff**: PASS. `CONTINUE.md` was reviewed and will be updated with planning state. +- **Azure OpenAI Integration**: PASS. No LLM functionality is introduced. +- **Web Application Standards**: PASS. Dashboard remains under the existing app shell and base-path-aware routing, and copy actions require toast-style feedback. +- **Internationalization**: PASS. New labels, statuses, messages, and copy feedback require keys for en/de/es/fr/pt. +- **Responsive Design**: PASS. Plan requires mobile/tablet/desktop layout verification. 
+ +## Project Structure + +### Documentation (this feature) + +```text +specs/021-ops-health-dashboard/ +├── plan.md +├── research.md +├── data-model.md +├── quickstart.md +├── contracts/ +│ └── ops-health-dashboard.md +├── checklists/ +│ └── requirements.md +└── tasks.md # Created by /speckit.tasks, not by this plan +``` + +### Source Code (repository root) + +```text +src/ +├── app/ +│ ├── (dashboard)/ +│ │ └── admin/ +│ │ └── ops/ +│ │ └── page.tsx +│ └── api/ +│ └── admin/ +│ └── ops-health/ +│ └── route.ts +├── components/ +│ └── ops/ +│ ├── DiagnosticSummaryCopy.tsx +│ ├── HealthStatusBadge.tsx +│ └── OpsHealthDashboard.tsx +├── lib/ +│ └── ops-health.ts +└── i18n/ + └── messages/ + ├── en.json + ├── de.json + ├── es.json + ├── fr.json + └── pt.json + +tests/ +├── e2e/ +│ └── ops-health/ +│ └── admin-ops-health.spec.ts +├── integration/ +│ └── ops-health-api.test.ts +└── unit/ + └── ops-health.test.ts +``` + +**Structure Decision**: Use the existing Next.js App Router dashboard structure. The user-facing page belongs under the administrator area, the JSON snapshot route belongs under `/api/admin`, and shared snapshot/status/sanitization logic belongs in `src/lib/ops-health.ts` so tests and UI do not duplicate operational rules. + +## Phase 0: Research Summary + +See [research.md](./research.md). All planning questions are resolved with conservative defaults: + +- Snapshot checks are point-in-time and manually refreshable. +- Worker/deploy smoke evidence is displayed only when recently recorded evidence exists. +- Diagnostic output is allowlisted and redacted by default. +- No new storage is added for the first version. + +## Phase 1: Design Summary + +See [data-model.md](./data-model.md), [contracts/ops-health-dashboard.md](./contracts/ops-health-dashboard.md), and [quickstart.md](./quickstart.md). 
+ +The design centers on four in-memory view models: + +- `EnvironmentIdentity` +- `HealthCheckResult` +- `HealthSnapshot` +- `DiagnosticSummary` + +## Post-Design Constitution Check + +- **Simplicity First**: PASS. Design remains a small page, route, and service around existing primitives. +- **Test Coverage**: PASS. Artifacts define unit, integration, and e2e checks for each story. +- **Duplication Control**: PASS. Snapshot derivation and redaction have one owner. +- **Incremental Delivery**: PASS. Tasks can be generated by story priority. +- **Continuity And Handoff**: PASS. Active spec and next action are recorded. +- **Internationalization / Responsive Design**: PASS. Quickstart and contract call out locale keys and viewport checks. + +## Complexity Tracking + +No constitution violations or added complexity require justification. diff --git a/specs/021-ops-health-dashboard/quickstart.md b/specs/021-ops-health-dashboard/quickstart.md new file mode 100644 index 0000000..c136585 --- /dev/null +++ b/specs/021-ops-health-dashboard/quickstart.md @@ -0,0 +1,87 @@ +# Quickstart: Ops Health Dashboard + +## Prerequisites + +- Install dependencies with `pnpm install`. +- Generate Prisma client if needed with `pnpm run prisma:generate`. +- Seed or create a platform administrator account. +- Configure build metadata as desired: + - `APP_ENVIRONMENT` + - `APP_VERSION` + - `APP_REVISION` + - `APP_BUILD_ID` + - `APP_BUILT_AT` + +## Manual Validation + +1. Start the app: + + ```powershell + pnpm dev + ``` + +2. Sign in as a platform administrator. + +3. Open `/admin/ops` through the dashboard navigation. + +4. 
Confirm the page shows: + - Environment name + - Version/revision/build id/build time where available + - Snapshot timestamp + - Overall status + - Runtime, database, configuration, worker, and deploy smoke health areas + - Unknown/unavailable states where optional evidence is absent + - Configuration readiness as presence/readiness only, without raw environment values + +5. Use the manual refresh action and confirm the snapshot timestamp changes. + +6. Use the copy action and confirm a toast-style success message appears, then confirm the copied text contains no raw secrets, tokens, passwords, private keys, auth headers, cookies, or full connection strings. + +7. Sign in as a non-admin user and confirm `/admin/ops` and `/api/admin/ops-health` are not accessible. + +8. Confirm the dashboard has no horizontal overflow at mobile, tablet, and desktop widths. + +## Automated Validation + +Run focused tests for the implementation: + +```powershell +pnpm test -- tests/unit/ops-health.test.ts tests/integration/ops-health-api.test.ts +pnpm test:e2e -- tests/e2e/ops-health/admin-ops-health.spec.ts +``` + +The focused e2e spec covers admin access, non-admin denial, manual refresh, copy feedback, and responsive overflow checks. 
+ +Run broader project validation before merge: + +```powershell +.\validate.ps1 quality +.\validate.ps1 all +``` + +## Expected Safe Diagnostic Summary Shape + +The copied summary should be plain text, compact, and suitable for issue reports: + +```text +Environment: staging +Version: staging-42 +Revision: abcdef123456 +Build ID: 123.2 +Built At: 2026-06-11T12:00:00.000Z +Captured At: 2026-06-11T15:24:00.000Z +Overall: healthy +runtime: healthy +database: healthy +configuration: healthy +worker: unknown +deploySmoke: unavailable +``` + +Forbidden examples: + +- `DATABASE_URL=...` +- `Authorization: Bearer ...` +- Session cookie values +- Passwords or API keys +- Private key material diff --git a/specs/021-ops-health-dashboard/research.md b/specs/021-ops-health-dashboard/research.md new file mode 100644 index 0000000..9b0887c --- /dev/null +++ b/specs/021-ops-health-dashboard/research.md @@ -0,0 +1,54 @@ +# Research: Ops Health Dashboard + +## Decision: Use a point-in-time snapshot with manual refresh + +**Rationale**: The clarified spec asks for a read-only snapshot captured on page load or manual refresh. This avoids background polling, partial live updates, websocket state, and race conditions while still giving administrators current-enough triage information. + +**Alternatives considered**: + +- Live auto-refreshing dashboard: rejected for v1 because it adds UI state and repeated health-check load without a stated need. +- Last-known-only dashboard: rejected because core runtime/database/config checks are more useful when evaluated for the current request. + +## Decision: Keep access admin-only + +**Rationale**: The dashboard exposes operational status and configuration readiness. Even without secrets, this is security-adjacent information and should follow existing platform-admin pages such as background jobs and admin tokens. 
+ +**Alternatives considered**: + +- Any authenticated user in dev/staging: rejected because environment-specific authorization is easy to misconfigure. +- Public redacted page: rejected because it weakens the security posture for little benefit. + +## Decision: Reuse existing health/version/background-job primitives + +**Rationale**: The repository already has build metadata, health checks, route authorization, background job data, and admin navigation. Reusing these keeps the implementation small and aligned with current patterns. + +**Alternatives considered**: + +- New diagnostic subsystem: rejected as premature abstraction. +- New persistent health table: rejected for v1 because the spec only requires recent recorded results when available, not durable monitoring history. + +## Decision: Treat worker and deploy smoke evidence as recorded-only + +**Rationale**: The dashboard should not trigger deployment smoke checks or worker probes that could mutate state, require external credentials, or take too long. It should summarize recent recorded evidence when the app already has it; otherwise it should say unknown/unavailable. + +**Alternatives considered**: + +- Active worker/deploy smoke checks from the dashboard: rejected because they blur diagnostics with deployment automation. +- Omit these areas: rejected because unknown/unavailable status is still useful and matches the clarified spec. + +## Decision: Use allowlisted diagnostic summary fields with recursive redaction as defense in depth + +**Rationale**: The copyable summary is intended for issue reports and incident notes. It should include identifiers and status labels only. Allowlisting prevents accidental secret exposure, while recursive redaction protects any safe-looking structured details that later include sensitive names. + +**Alternatives considered**: + +- Copy all visible UI text: rejected because future UI detail could accidentally contain sensitive values. 
+- Downloadable diagnostic file: rejected by clarification; copyable text is enough for v1. + +## Decision: No new external dependencies + +**Rationale**: Existing React, Next.js, Prisma, next-intl, and lucide-react capabilities are sufficient. The constitution asks to minimize dependencies. + +**Alternatives considered**: + +- Clipboard helper library or status dashboard package: rejected because the needed behavior is small and native browser clipboard APIs are sufficient for a client component. diff --git a/specs/021-ops-health-dashboard/spec.md b/specs/021-ops-health-dashboard/spec.md new file mode 100644 index 0000000..63d6876 --- /dev/null +++ b/specs/021-ops-health-dashboard/spec.md @@ -0,0 +1,126 @@ +# Feature Specification: Ops Health Dashboard + +**Feature Branch**: `021-ops-health-dashboard` +**Created**: 2026-06-11 +**Status**: Draft +**Input**: User description: "Create an Ops / Health dashboard for administrators and developers to quickly understand a running environment. It should show app version/build metadata, environment, revision, build id, build time, core runtime health, database connectivity, auth/config sanity, worker/deploy smoke status where available, and clear degraded/unknown states without exposing secrets. Include the existing CONTINUE.md and CONTINUE_LOG.md housekeeping changes in the same PR." + +> Before drafting or implementing this feature, review `/CONTINUE.md` for the latest handoff context and current recommended next steps. + +## Clarifications + +### Session 2026-06-11 + +- Q: Who can access the ops dashboard in the first version? -> A: Admin-only access; developers use admin accounts in dev/staging when needed. +- Q: Should health data update live or use point-in-time checks? -> A: Read-only snapshot taken when the dashboard is opened or manually refreshed. +- Q: Where should administrators access the ops dashboard? -> A: Place it in the existing admin/ops area navigation. 
+- Q: How should worker and deploy smoke status be represented? -> A: Display recent recorded worker/smoke status when available; otherwise unknown/unavailable. +- Q: Should the first version include shareable diagnostic context? -> A: Include a copyable non-secret summary in the first version. + +## User Scenarios & Testing _(mandatory)_ + +### User Story 1 - Identify Running Environment (Priority: P1) + +An administrator opens the ops dashboard from the existing admin or ops navigation in a dev, staging, or production-like environment and immediately sees which environment and build they are inspecting. Developers use administrator accounts in dev and staging when they need this operational view. + +**Why this priority**: Fast fault triage requires knowing the exact deployed version before looking at logs or reproducing a bug. + +**Independent Test**: Can be tested by opening the dashboard with known build metadata and confirming the visible environment, version, revision, build id, and build time match the running deployment. + +**Acceptance Scenarios**: + +1. **Given** a running deployment with complete build metadata, **When** an authorized operator opens the dashboard, **Then** the dashboard shows environment, version, revision, build id, and build time in a copyable or easily transcribed form. +2. **Given** a local or manually started environment with partial metadata, **When** an authorized operator opens the dashboard, **Then** the dashboard shows available metadata and clearly labels missing values as unknown instead of inventing values. +3. **Given** an administrator is using the existing admin or ops area, **When** they navigate through operational tools, **Then** the ops dashboard is available through the same navigation model. 
+ +--- + +### User Story 2 - Assess Operational Health (Priority: P2) + +An administrator checks whether the core runtime, database connectivity, authentication/configuration readiness, worker readiness, and deployment smoke status are healthy, degraded, or unknown. + +**Why this priority**: Operators need a single first-stop view before deciding whether to inspect logs, infrastructure, database state, or deployment history. + +**Independent Test**: Can be tested by viewing the dashboard under healthy and intentionally degraded conditions and confirming each health area reports the correct state with concise supporting detail. + +**Acceptance Scenarios**: + +1. **Given** all required runtime checks pass, **When** the dashboard loads, **Then** each health area is marked healthy and the overall state is healthy. +2. **Given** one required runtime check fails, **When** the dashboard loads, **Then** that health area is marked degraded, the overall state is degraded, and the dashboard indicates the next area to investigate without exposing secret values. +3. **Given** optional smoke or worker status has no recent recorded result in the current environment, **When** the dashboard loads, **Then** that area is marked unknown or unavailable without causing the whole dashboard to appear failed. +4. **Given** an administrator wants current results after the dashboard is already open, **When** they manually refresh the dashboard status, **Then** the dashboard displays a new point-in-time snapshot rather than continuously updating in the background. + +--- + +### User Story 3 - Share Safe Diagnostic Context (Priority: P3) + +An administrator copies or shares the dashboard's non-secret diagnostic summary in an issue, support thread, or incident note. + +**Why this priority**: Reproducible fault reports improve when operators can share environment and health context without manually redacting credentials. 
+ +**Independent Test**: Can be tested by using the dashboard's visible summary in a report and verifying it contains useful metadata and health state but no secret values or sensitive configuration contents. + +**Acceptance Scenarios**: + +1. **Given** a deployment with configured secrets and service URLs, **When** an operator views or copies diagnostic context, **Then** the shared content includes status labels and safe identifiers but excludes raw secrets, tokens, passwords, connection strings, and private keys. +2. **Given** a dashboard health area has degraded status, **When** an operator reads the diagnostic detail, **Then** the detail explains the failing area in plain language without revealing sensitive values. +3. **Given** an administrator needs to report an issue, **When** they copy the diagnostic summary, **Then** the copied content includes environment/build identifiers and health states without any raw secret values. + +### Edge Cases + +- Build metadata is missing, partial, malformed, or still using a legacy revision variable. +- Database connectivity is slow or unavailable when the dashboard is opened. +- Authentication/configuration readiness is degraded because required settings are absent, but their values must remain hidden. +- Worker or deployment smoke status has not been recorded recently for the current environment. +- The viewer is not authorized to access operational diagnostics. +- Multiple checks have mixed states; the overall status must communicate the most severe state without hiding individual details. + +## Requirements _(mandatory)_ + +### Functional Requirements + +- **FR-001**: System MUST provide an ops dashboard intended for authorized administrators. +- **FR-002**: System MUST show current environment identity, version, revision, build id, and build time when available. +- **FR-003**: System MUST label missing or unavailable metadata as unknown or unavailable without blocking access to other dashboard information. 
+- **FR-004**: System MUST show an overall health state derived from individual health areas. +- **FR-005**: System MUST show individual health states for core runtime availability, database connectivity, authentication/configuration readiness, worker readiness where available, and deployment smoke status where available. +- **FR-006**: System MUST distinguish healthy, degraded, and unknown or unavailable states. +- **FR-007**: System MUST include concise operator-facing detail for degraded and unknown states so the next investigation area is clear. +- **FR-008**: System MUST avoid exposing raw secrets, tokens, passwords, private keys, full connection strings, or other sensitive configuration values. +- **FR-009**: System MUST allow operators to copy or transcribe a non-secret diagnostic summary containing environment/build identifiers and health states. +- **FR-010**: System MUST remain usable when one health check is slow, fails, or cannot be determined. +- **FR-011**: System MUST be clear enough to use in local development, staging, and production-like deployments. +- **FR-012**: System MUST make access to operational diagnostics available only to administrators. +- **FR-013**: System MUST present health results as a read-only point-in-time snapshot captured when the dashboard opens or when an administrator manually refreshes it. +- **FR-014**: System MUST make the dashboard reachable from the existing admin or ops navigation. +- **FR-015**: System MUST show worker and deployment smoke status only from recent recorded results when those results are available; otherwise these areas MUST be marked unknown or unavailable. +- **FR-016**: System MUST provide a copy action for a non-secret diagnostic summary in the first version. +- **FR-017**: System MUST provide toast-style feedback for successful or failed diagnostic summary copy actions. 
+ +### Key Entities + +- **Environment Identity**: The environment and build metadata that identify the running deployment, including environment name, version, revision, build id, and build time. +- **Health Check Result**: A named operational check with a status, short explanation, optional timestamp, and safe diagnostic detail. +- **Health Snapshot**: A read-only collection of health check results captured at one point in time for the current administrator view. +- **Diagnostic Summary**: A non-secret collection of environment identity and health check states suitable for sharing in issue reports or incident notes. + +## Success Criteria _(mandatory)_ + +### Measurable Outcomes + +- **SC-001**: An authorized operator can identify the running environment and build in under 15 seconds after opening the dashboard. +- **SC-002**: At least 95% of routine triage sessions can determine whether the app, database, configuration, worker, or deployment smoke area is the first investigation target from the dashboard alone. +- **SC-003**: The dashboard remains usable and shows available results when at least one optional or degraded check cannot report status. +- **SC-004**: Diagnostic summaries generated from the dashboard contain zero raw secret values during security review. +- **SC-005**: A healthy environment reports healthy overall status within 5 seconds for a typical operator view. + +## Assumptions + +- The initial audience is administrators; developers use administrator accounts in dev and staging when they need operational diagnostics. +- The first version should prioritize safe, high-signal status over deep remediation workflows. +- Worker and deployment smoke details may be unavailable in some environments, and the dashboard should not actively invent or execute deployment smoke checks to fill that gap. +- The dashboard should reuse existing build metadata and operational validation concepts already present in the project. 
+- The first version does not require live background updates; administrators can manually refresh when they need a new snapshot. +- The dashboard belongs inside the existing administrative experience rather than as a standalone public diagnostics page. +- The first version includes copyable diagnostic text, not a downloadable diagnostic file. +- The v1 configuration sanity check covers presence/readiness only for authentication, database URL ownership, runtime environment, and build metadata; it must never display raw values. diff --git a/specs/021-ops-health-dashboard/tasks.md b/specs/021-ops-health-dashboard/tasks.md new file mode 100644 index 0000000..038f98e --- /dev/null +++ b/specs/021-ops-health-dashboard/tasks.md @@ -0,0 +1,232 @@ +# Tasks: Ops Health Dashboard + +**Input**: Design documents from `/specs/021-ops-health-dashboard/` +**Prerequisites**: plan.md, spec.md, research.md, data-model.md, contracts/ops-health-dashboard.md, quickstart.md +**Required Context**: Review `/CONTINUE.md` before task execution and update `CONTINUE.md` plus `CONTINUE_LOG.md` when project state materially changes. + +**Tests**: Required by project constitution and this feature plan. Write focused tests before implementation for each story and run the relevant slice after each implemented story. + +**Organization**: Tasks are grouped by user story to enable independent implementation and testing. + +## Format: `[ID] [P?] [Story] Description` + +- **[P]**: Can run in parallel (different files, no dependencies) +- **[Story]**: Which user story this task belongs to (US1, US2, US3) +- Include exact file paths in descriptions + +## Phase 1: Setup (Shared Infrastructure) + +**Purpose**: Prepare shared files and test entry points for the ops dashboard feature. 
+ +- [x] T001 Create shared ops health module scaffold with exported placeholder types in `src/lib/ops-health.ts` +- [x] T002 [P] Create ops component directory scaffold with placeholder exports in `src/components/ops/index.ts` +- [x] T003 [P] Create admin ops route directory with placeholder page in `src/app/(dashboard)/admin/ops/page.tsx` +- [x] T004 [P] Create admin ops health API route scaffold in `src/app/api/admin/ops-health/route.ts` +- [x] T005 [P] Create focused unit test file scaffold in `tests/unit/ops-health.test.ts` +- [x] T006 [P] Create focused integration test file scaffold in `tests/integration/ops-health-api.test.ts` +- [x] T007 [P] Create focused e2e test file scaffold in `tests/e2e/ops-health/admin-ops-health.spec.ts` + +--- + +## Phase 2: Foundational (Blocking Prerequisites) + +**Purpose**: Define shared contracts, sanitization, status aggregation, and i18n keys that all user stories depend on. + +**CRITICAL**: No user story implementation should begin until this phase is complete. 
+ +- [x] T008 Define `HealthStatus`, `HealthCheckKey`, `EnvironmentIdentity`, `HealthCheckResult`, `HealthSnapshot`, and `DiagnosticSummary` types in `src/lib/ops-health.ts` +- [x] T009 Implement status aggregation rules for required vs optional checks in `src/lib/ops-health.ts` +- [x] T010 Implement secret-safe redaction and allowlisted diagnostic summary formatting helpers in `src/lib/ops-health.ts` +- [x] T011 [P] Add ops health translation namespace and nav label to `src/i18n/messages/en.json` +- [x] T012 [P] Add ops health translation namespace and nav label to `src/i18n/messages/de.json` +- [x] T013 [P] Add ops health translation namespace and nav label to `src/i18n/messages/es.json` +- [x] T014 [P] Add ops health translation namespace and nav label to `src/i18n/messages/fr.json` +- [x] T015 [P] Add ops health translation namespace and nav label to `src/i18n/messages/pt.json` +- [x] T016 Add unit tests for status aggregation and diagnostic redaction helpers in `tests/unit/ops-health.test.ts` + +**Checkpoint**: Foundation ready. Shared snapshot types and safety helpers are test-covered. + +--- + +## Phase 3: User Story 1 - Identify Running Environment (Priority: P1) MVP + +**Goal**: An administrator can open the dashboard from admin navigation and identify the running environment/build within 15 seconds. + +**Independent Test**: Sign in as a platform administrator, navigate to `/admin/ops`, and verify the page shows environment, version, revision, build id, build time, and snapshot timestamp with unknown labels for missing metadata. 
+ +### Tests for User Story 1 + +- [x] T017 [P] [US1] Add unit tests for environment identity mapping from app version metadata in `tests/unit/ops-health.test.ts` +- [x] T018 [P] [US1] Add integration tests for admin-only `/api/admin/ops-health` metadata response in `tests/integration/ops-health-api.test.ts` +- [x] T019 [P] [US1] Add e2e test for admin navigation to `/admin/ops` and visible build metadata in `tests/e2e/ops-health/admin-ops-health.spec.ts` + +### Implementation for User Story 1 + +- [x] T020 [US1] Implement environment identity assembly from `getAppVersionInfo()` in `src/lib/ops-health.ts` +- [x] T021 [US1] Implement `GET /api/admin/ops-health` admin authorization and metadata-only snapshot response in `src/app/api/admin/ops-health/route.ts` +- [x] T022 [US1] Add admin-only Ops Health navigation item with `nav.opsHealth` in `src/components/ui/Navigation.tsx` +- [x] T023 [P] [US1] Implement status badge component for healthy/degraded/unknown/unavailable display in `src/components/ops/HealthStatusBadge.tsx` +- [x] T024 [US1] Implement initial ops dashboard page shell and metadata panel in `src/app/(dashboard)/admin/ops/page.tsx` +- [x] T025 [US1] Ensure non-admin users are redirected or denied from `/admin/ops` in `src/app/(dashboard)/admin/ops/page.tsx` +- [x] T026 [US1] Run and record focused US1 validation with `pnpm test -- tests/unit/ops-health.test.ts tests/integration/ops-health-api.test.ts` and relevant e2e command from `specs/021-ops-health-dashboard/quickstart.md` + +**Checkpoint**: User Story 1 is independently functional and demonstrable. + +--- + +## Phase 4: User Story 2 - Assess Operational Health (Priority: P2) + +**Goal**: An administrator can see overall status plus runtime, database, configuration, worker, and deploy smoke health areas as a point-in-time snapshot. 
+ +**Independent Test**: Open the dashboard in healthy, degraded, and missing-evidence conditions and verify each health area reports healthy, degraded, unknown, or unavailable correctly without blocking the rest of the page. + +### Tests for User Story 2 + +- [x] T027 [P] [US2] Add unit tests for runtime, database, configuration, worker, and deploy smoke health result assembly in `tests/unit/ops-health.test.ts` +- [x] T028 [P] [US2] Add integration tests for degraded database/configuration, optional unknown worker/smoke states, and safe fatal snapshot assembly errors in `tests/integration/ops-health-api.test.ts` +- [x] T029 [P] [US2] Add e2e test for manual refresh updating snapshot timestamp in `tests/e2e/ops-health/admin-ops-health.spec.ts` + +### Implementation for User Story 2 + +- [x] T030 [US2] Implement runtime health check mapping from existing process health in `src/lib/ops-health.ts` +- [x] T031 [US2] Implement database health check mapping from existing database health in `src/lib/ops-health.ts` +- [x] T032 [US2] Implement configuration sanity check for authentication readiness, database URL ownership, runtime environment, and build metadata presence without exposing raw values in `src/lib/ops-health.ts` +- [x] T033 [US2] Implement recent worker evidence lookup from existing background jobs in `src/lib/ops-health.ts` +- [x] T034 [US2] Implement deploy smoke evidence as recorded-only unknown/unavailable status in `src/lib/ops-health.ts` +- [x] T035 [US2] Expand `/api/admin/ops-health` to return full health snapshot and degraded status details in `src/app/api/admin/ops-health/route.ts` +- [x] T036 [US2] Implement ops dashboard health check grid and overall status panel in `src/components/ops/OpsHealthDashboard.tsx` +- [x] T037 [US2] Add manual refresh behavior that fetches a new snapshot from `/api/admin/ops-health` in `src/components/ops/OpsHealthDashboard.tsx` +- [x] T038 [US2] Wire server-rendered initial snapshot into dashboard client refresh 
component in `src/app/(dashboard)/admin/ops/page.tsx` +- [x] T039 [US2] Run and record focused US2 validation with `pnpm test -- tests/unit/ops-health.test.ts tests/integration/ops-health-api.test.ts` and the manual-refresh e2e slice from `specs/021-ops-health-dashboard/quickstart.md` + +**Checkpoint**: User Stories 1 and 2 are independently functional and demonstrable. + +--- + +## Phase 5: User Story 3 - Share Safe Diagnostic Context (Priority: P3) + +**Goal**: An administrator can copy a non-secret diagnostic summary for issues or incidents. + +**Independent Test**: Copy the diagnostic summary from the dashboard and verify it contains environment/build identifiers and health states but no raw secrets, cookies, auth headers, private keys, passwords, or full connection strings. + +### Tests for User Story 3 + +- [x] T040 [P] [US3] Add unit tests for diagnostic summary allowlist and forbidden secret patterns in `tests/unit/ops-health.test.ts` +- [x] T041 [P] [US3] Add integration test asserting `/api/admin/ops-health` diagnostic summary contains no raw secret-like values in `tests/integration/ops-health-api.test.ts` +- [x] T042 [P] [US3] Add e2e test for copy diagnostic summary action and toast-style success feedback in `tests/e2e/ops-health/admin-ops-health.spec.ts` + +### Implementation for User Story 3 + +- [x] T043 [US3] Finalize diagnostic summary text generation from `HealthSnapshot` in `src/lib/ops-health.ts` +- [x] T044 [US3] Implement copy-to-clipboard client component with localized toast-style success and failure feedback in `src/components/ops/DiagnosticSummaryCopy.tsx` +- [x] T045 [US3] Add diagnostic summary panel to `src/components/ops/OpsHealthDashboard.tsx` +- [x] T046 [US3] Ensure copied diagnostic summary is rebuilt after manual refresh in `src/components/ops/OpsHealthDashboard.tsx` +- [x] T047 [US3] Run and record focused US3 validation with `pnpm test -- tests/unit/ops-health.test.ts tests/integration/ops-health-api.test.ts` and the copy e2e 
slice from `specs/021-ops-health-dashboard/quickstart.md` + +**Checkpoint**: All user stories are independently functional and demonstrable. + +--- + +## Phase 6: Polish & Cross-Cutting Concerns + +**Purpose**: Final quality, docs, continuity, and validation before PR. + +- [x] T048 [P] Review responsive layout at mobile, tablet, and desktop widths using Playwright or browser screenshots for `/admin/ops` +- [x] T049 [P] Verify all new user-facing text uses next-intl keys across `src/app/(dashboard)/admin/ops/page.tsx` and `src/components/ops/` +- [x] T050 [P] Update `specs/021-ops-health-dashboard/quickstart.md` with any implementation-specific validation notes discovered during build +- [x] T051 Update `ACTIVE_SPECS.md` to show implementation progress for `021-ops-health-dashboard` +- [x] T052 Update `CONTINUE.md` and append `CONTINUE_LOG.md` with implementation status and next action +- [x] T053 Run `pnpm run specs:overview:update` and verify `pnpm run specs:overview:check` +- [x] T054 Run focused validation `pnpm test -- tests/unit/ops-health.test.ts tests/integration/ops-health-api.test.ts` +- [x] T055 Run relevant e2e validation `pnpm test:e2e -- tests/e2e/ops-health/admin-ops-health.spec.ts` +- [x] T056 Run broader validation with `.\validate.ps1 quality` and full pre-merge validation with `.\validate.ps1 all` + +--- + +## Dependencies & Execution Order + +### Phase Dependencies + +- **Setup (Phase 1)**: No dependencies; can start immediately. +- **Foundational (Phase 2)**: Depends on Setup completion; blocks all user stories. +- **User Story 1 (Phase 3)**: Depends on Foundation; MVP scope. +- **User Story 2 (Phase 4)**: Depends on Foundation and can reuse US1 page/API structure. +- **User Story 3 (Phase 5)**: Depends on Foundation and benefits from US2 complete snapshot data. +- **Polish (Phase 6)**: Depends on all desired user stories. + +### User Story Dependencies + +- **US1 Identify Running Environment**: MVP; no dependency on US2 or US3. 
+- **US2 Assess Operational Health**: Can be developed after Foundation, but easiest after US1 creates route/page shell. +- **US3 Share Safe Diagnostic Context**: Can be developed after Foundation, but easiest after US2 finalizes full snapshot shape. + +### Within Each User Story + +- Write tests first and confirm they fail before implementation. +- Implement shared service logic before API/page wiring. +- Implement API/page data before UI rendering. +- Run focused validation at each checkpoint before moving to the next story. + +## Parallel Opportunities + +- Setup scaffolds T002-T007 can run in parallel after T001. +- Translation tasks T011-T015 can run in parallel. +- Story test tasks marked [P] can run in parallel within each story. +- Component tasks and service tasks in different files can run in parallel once shared types are stable. +- Polish checks T048-T050 can run in parallel. + +## Parallel Example: User Story 1 + +```text +Task: "T017 [US1] Add unit tests for environment identity mapping from app version metadata in tests/unit/ops-health.test.ts" +Task: "T018 [US1] Add integration tests for admin-only /api/admin/ops-health metadata response in tests/integration/ops-health-api.test.ts" +Task: "T019 [US1] Add e2e test for admin navigation to /admin/ops and visible build metadata in tests/e2e/ops-health/admin-ops-health.spec.ts" +``` + +## Parallel Example: User Story 2 + +```text +Task: "T030 [US2] Implement runtime health check mapping from existing process health in src/lib/ops-health.ts" +Task: "T036 [US2] Implement ops dashboard health check grid and overall status panel in src/components/ops/OpsHealthDashboard.tsx" +Task: "T029 [US2] Add e2e test for manual refresh updating snapshot timestamp in tests/e2e/ops-health/admin-ops-health.spec.ts" +``` + +## Parallel Example: User Story 3 + +```text +Task: "T040 [US3] Add unit tests for diagnostic summary allowlist and forbidden secret patterns in tests/unit/ops-health.test.ts" +Task: "T044 [US3] 
Implement copy-to-clipboard client component with localized toast-style success and failure feedback in src/components/ops/DiagnosticSummaryCopy.tsx" +Task: "T041 [US3] Add integration test asserting /api/admin/ops-health diagnostic summary contains no raw secret-like values in tests/integration/ops-health-api.test.ts" +``` + +## Implementation Strategy + +### MVP First (User Story 1 Only) + +1. Complete Phase 1: Setup. +2. Complete Phase 2: Foundational. +3. Complete Phase 3: User Story 1. +4. Stop and validate: unit/integration tests plus admin navigation e2e for metadata. +5. Demo `/admin/ops` with build metadata and admin-only access. + +### Incremental Delivery + +1. Foundation ready. +2. Add US1 metadata/admin navigation MVP. +3. Add US2 health snapshot and manual refresh. +4. Add US3 copyable safe diagnostics. +5. Complete polish, responsive review, and validation. + +### Validation Rhythm + +- Run focused tests after each story slice. +- Run `pnpm run specs:overview:check` after spec/task updates. +- Run `.\validate.ps1 quality` during polish and `.\validate.ps1 all` before PR/merge. +- Run additional broader validation if implementation changes reach shared auth, Prisma, or dashboard shell behavior. + +## Notes + +- [P] tasks use different files or can proceed without depending on incomplete task output. +- Tasks intentionally avoid adding new storage or dependencies. +- Optional worker and deploy smoke evidence must remain unknown/unavailable when no recent safe record exists. +- Diagnostic summary output must be allowlisted and redacted by default. 
diff --git a/specs/OVERVIEW.md b/specs/OVERVIEW.md index c54e948..f0bfbba 100644 --- a/specs/OVERVIEW.md +++ b/specs/OVERVIEW.md @@ -31,6 +31,7 @@ Purpose: Track the status of all planned features, their implementation progress | 018 | OpenTofu Azure Infrastructure | Fully Implemented | - | Large | Review, commit, and propagate the finished feature | | 019 | Logging Standardization | Fully Implemented | - | Large | Review, commit, and propagate the finished feature | | 020 | Deploy Smoke Verification | Fully Implemented | - | Large | Review, commit, and propagate the finished feature | +| 021 | Ops Health Dashboard | Fully Implemented | - | Large | Review, commit, and propagate the finished feature | ## Implementation Roadmap @@ -47,6 +48,7 @@ Purpose: Track the status of all planned features, their implementation progress - 018 OpenTofu Azure Infrastructure: fully implemented - 019 Logging Standardization: fully implemented - 020 Deploy Smoke Verification: fully implemented +- 021 Ops Health Dashboard: fully implemented ### Begin Immediately diff --git a/src/app/(dashboard)/admin/ops/page.tsx b/src/app/(dashboard)/admin/ops/page.tsx new file mode 100644 index 0000000..bfb21ba --- /dev/null +++ b/src/app/(dashboard)/admin/ops/page.tsx @@ -0,0 +1,17 @@ +import { redirect } from "next/navigation"; +import { Role } from "../../../../../generated/prisma/enums"; +import { OpsHealthDashboard } from "@/components/ops"; +import { requireSession } from "@/lib/auth"; +import { buildOpsHealthSnapshot } from "@/lib/ops-health"; + +export default async function OpsHealthPage() { + const user = await requireSession(); + + if (user.role !== Role.PLATFORM_ADMIN) { + redirect("/dashboard"); + } + + const snapshot = await buildOpsHealthSnapshot(); + + return ; +} diff --git a/src/app/api/admin/ops-health/route.ts b/src/app/api/admin/ops-health/route.ts new file mode 100644 index 0000000..81bb29a --- /dev/null +++ b/src/app/api/admin/ops-health/route.ts @@ -0,0 +1,22 @@ +import { 
NextResponse } from "next/server"; +import { Role } from "../../../../../generated/prisma/enums"; +import { requireApiUserWithRoles } from "@/lib/route-auth"; +import { buildOpsHealthSnapshot } from "@/lib/ops-health"; + +export async function GET(request: Request) { + const auth = await requireApiUserWithRoles([Role.PLATFORM_ADMIN], request); + if ("error" in auth) { + return auth.error; + } + + try { + return NextResponse.json(await buildOpsHealthSnapshot()); + } catch { + return NextResponse.json( + { + error: "Could not assemble a safe ops health snapshot", + }, + { status: 500 }, + ); + } +} diff --git a/src/components/ops/DiagnosticSummaryCopy.tsx b/src/components/ops/DiagnosticSummaryCopy.tsx new file mode 100644 index 0000000..bb62f21 --- /dev/null +++ b/src/components/ops/DiagnosticSummaryCopy.tsx @@ -0,0 +1,83 @@ +"use client"; + +import { Clipboard } from "lucide-react"; +import { useTranslations } from "next-intl"; +import { useEffect, useState } from "react"; +import { Button } from "@/components/ui/Button"; + +type ToastState = "success" | "error" | null; + +export function DiagnosticSummaryCopy({ text }: { text: string }) { + const t = useTranslations("opsHealth.copy"); + const [toast, setToast] = useState(null); + + useEffect(() => { + if (!toast) { + return; + } + + const timeout = window.setTimeout(() => setToast(null), 3000); + return () => window.clearTimeout(timeout); + }, [toast]); + + async function copySummary() { + try { + await writeClipboardText(text); + setToast("success"); + } catch { + setToast("error"); + } + } + + return ( +
+ +
+ {toast ? ( + + {toast === "success" ? t("success") : t("error")} + + ) : null} +
+
+ ); +} + +async function writeClipboardText(text: string) { + if (navigator.clipboard?.writeText) { + try { + await navigator.clipboard.writeText(text); + return; + } catch { + // Fall through to the legacy copy path for browsers/test contexts + // without clipboard permissions. + } + } + + const textArea = document.createElement("textarea"); + textArea.value = text; + textArea.setAttribute("readonly", "true"); + textArea.style.position = "fixed"; + textArea.style.left = "-9999px"; + document.body.append(textArea); + textArea.select(); + + try { + if (!document.execCommand("copy")) { + throw new Error("Copy command failed"); + } + } finally { + textArea.remove(); + } +} diff --git a/src/components/ops/HealthStatusBadge.tsx b/src/components/ops/HealthStatusBadge.tsx new file mode 100644 index 0000000..d4ca047 --- /dev/null +++ b/src/components/ops/HealthStatusBadge.tsx @@ -0,0 +1,28 @@ +import type { HealthStatus } from "@/lib/ops-health"; + +const toneByStatus: Record = { + healthy: + "border-emerald-500/25 bg-emerald-500/10 text-emerald-700 dark:text-emerald-300", + degraded: + "border-rose-500/25 bg-rose-500/10 text-rose-700 dark:text-rose-300", + unknown: + "border-amber-500/25 bg-amber-500/10 text-amber-700 dark:text-amber-300", + unavailable: + "border-slate-500/25 bg-slate-500/10 text-slate-700 dark:text-slate-300", +}; + +export function HealthStatusBadge({ + status, + label, +}: { + status: HealthStatus; + label: string; +}) { + return ( + + {label} + + ); +} diff --git a/src/components/ops/OpsHealthDashboard.tsx b/src/components/ops/OpsHealthDashboard.tsx new file mode 100644 index 0000000..fbac8cd --- /dev/null +++ b/src/components/ops/OpsHealthDashboard.tsx @@ -0,0 +1,188 @@ +"use client"; + +import { RefreshCw } from "lucide-react"; +import { useTranslations } from "next-intl"; +import { useState, useTransition } from "react"; +import { Button } from "@/components/ui/Button"; +import { DiagnosticSummaryCopy } from 
"@/components/ops/DiagnosticSummaryCopy"; +import { HealthStatusBadge } from "@/components/ops/HealthStatusBadge"; +import type { HealthCheckKey, HealthSnapshot } from "@/lib/ops-health"; +import { withBasePath } from "@/lib/base-path"; + +const checkOrder: HealthCheckKey[] = [ + "runtime", + "database", + "configuration", + "worker", + "deploySmoke", +]; + +export function OpsHealthDashboard({ + initialSnapshot, +}: { + initialSnapshot: HealthSnapshot; +}) { + const t = useTranslations("opsHealth"); + const [snapshot, setSnapshot] = useState(initialSnapshot); + const [error, setError] = useState(null); + const [isPending, startTransition] = useTransition(); + + function refreshSnapshot() { + startTransition(async () => { + setError(null); + const response = await fetch(withBasePath("/api/admin/ops-health"), { + cache: "no-store", + }); + if (!response.ok) { + setError(t("refreshFailed")); + return; + } + + setSnapshot((await response.json()) as HealthSnapshot); + }); + } + + const checks = [...snapshot.checks].sort( + (a, b) => checkOrder.indexOf(a.key) - checkOrder.indexOf(b.key), + ); + + return ( +
+
+
+

+ {t("eyebrow")} +

+

+ {t("title")} +

+

+ {t("description")} +

+
+
+

+ {t("overall")} +

+
+ + + {t("capturedAt", { value: snapshot.capturedAt })} + +
+
+
+ +
+
+
+

+ {t("environment.title")} +

+ +
+ {error ? ( +

+ {error} +

+ ) : null} +
+ + + + + +
+
+ +
+

+ {t("summary.title")} +

+

+ {t("summary.description")} +

+
+            {snapshot.diagnosticSummary.text}
+          
+
+ +
+
+
+ +
+ {checks.map((check) => ( +
+
+

+ {t(`checks.${check.key}`)} +

+ +
+

+ {check.summary} +

+ {check.detail ? ( +

+ {check.detail} +

+ ) : null} + {check.checkedAt ? ( +

+ {t("checkedAt", { value: check.checkedAt })} +

+ ) : null} +
+ ))} +
+
+ ); +} + +function MetaRow({ label, value }: { label: string; value: string }) { + return ( +
+
+ {label} +
+
{value}
+
+ ); +} diff --git a/src/components/ops/index.ts b/src/components/ops/index.ts new file mode 100644 index 0000000..654e420 --- /dev/null +++ b/src/components/ops/index.ts @@ -0,0 +1,3 @@ +export { DiagnosticSummaryCopy } from "@/components/ops/DiagnosticSummaryCopy"; +export { HealthStatusBadge } from "@/components/ops/HealthStatusBadge"; +export { OpsHealthDashboard } from "@/components/ops/OpsHealthDashboard"; diff --git a/src/components/ui/Navigation.tsx b/src/components/ui/Navigation.tsx index 66cd8c4..f4b35f4 100644 --- a/src/components/ui/Navigation.tsx +++ b/src/components/ui/Navigation.tsx @@ -7,6 +7,7 @@ import { ChevronLeft, ChevronRight, Gauge, + HeartPulse, KeyRound, ListChecks, RadioTower, @@ -40,6 +41,11 @@ export function Navigation({ user }: { user: SessionUser }) { label: t("backgroundJobs"), icon: Activity, }); + links.push({ + href: "/admin/ops", + label: t("opsHealth"), + icon: HeartPulse, + }); links.push({ href: "/users", label: t("users"), icon: UsersRound }); links.push({ href: "/audit-trail", diff --git a/src/i18n/messages/de.json b/src/i18n/messages/de.json index 431889e..2f6976b 100644 --- a/src/i18n/messages/de.json +++ b/src/i18n/messages/de.json @@ -19,7 +19,8 @@ "tokens": "Tokens", "apiDocs": "API Docs", "adminTokens": "Admin Tokens", - "teamsIntegrations": "Teams" + "teamsIntegrations": "Teams", + "opsHealth": "Ops Health" }, "auth": { "signIn": "Anmelden", @@ -138,6 +139,47 @@ "error": "Fehler" } }, + "opsHealth": { + "eyebrow": "Betrieb", + "title": "Ops Health", + "description": "Erkenne den laufenden Build, prüfe Health Checks und kopiere eine sichere Diagnosezusammenfassung.", + "overall": "Gesamtstatus", + "capturedAt": "Erfasst: {value}", + "checkedAt": "Geprüft: {value}", + "refresh": "Aktualisieren", + "refreshing": "Aktualisiere...", + "refreshFailed": "Der Health-Snapshot konnte nicht aktualisiert werden.", + "status": { + "healthy": "Gesund", + "degraded": "Beeinträchtigt", + "unknown": "Unbekannt", + "unavailable": 
"Nicht verfügbar" + }, + "environment": { + "title": "Umgebungsidentität", + "environment": "Umgebung", + "version": "Version", + "revision": "Revision", + "buildId": "Build-ID", + "builtAt": "Build-Zeit" + }, + "checks": { + "runtime": "Runtime", + "database": "Datenbank", + "configuration": "Konfiguration", + "worker": "Worker", + "deploySmoke": "Deploy-Smoke" + }, + "summary": { + "title": "Diagnosezusammenfassung", + "description": "Kopiere diesen nicht geheimen Text in Issues oder Incident-Notizen." + }, + "copy": { + "button": "Zusammenfassung kopieren", + "success": "Diagnosezusammenfassung kopiert", + "error": "Diagnosezusammenfassung konnte nicht kopiert werden" + } + }, "tokens": { "eyebrow": "API Access", "title": "Personal access tokens", diff --git a/src/i18n/messages/en.json b/src/i18n/messages/en.json index 32e5db6..cec9126 100644 --- a/src/i18n/messages/en.json +++ b/src/i18n/messages/en.json @@ -19,7 +19,8 @@ "tokens": "Tokens", "apiDocs": "API Docs", "adminTokens": "Admin Tokens", - "teamsIntegrations": "Teams" + "teamsIntegrations": "Teams", + "opsHealth": "Ops Health" }, "auth": { "signIn": "Sign in", @@ -138,6 +139,47 @@ "error": "Error" } }, + "opsHealth": { + "eyebrow": "Operations", + "title": "Ops Health", + "description": "Identify the running build, inspect health checks, and copy a safe diagnostic summary.", + "overall": "Overall status", + "capturedAt": "Captured: {value}", + "checkedAt": "Checked: {value}", + "refresh": "Refresh", + "refreshing": "Refreshing...", + "refreshFailed": "Could not refresh the health snapshot.", + "status": { + "healthy": "Healthy", + "degraded": "Degraded", + "unknown": "Unknown", + "unavailable": "Unavailable" + }, + "environment": { + "title": "Environment identity", + "environment": "Environment", + "version": "Version", + "revision": "Revision", + "buildId": "Build ID", + "builtAt": "Built at" + }, + "checks": { + "runtime": "Runtime", + "database": "Database", + "configuration": "Configuration", + 
"worker": "Worker", + "deploySmoke": "Deploy smoke" + }, + "summary": { + "title": "Diagnostic summary", + "description": "Copy this non-secret text into issues or incident notes." + }, + "copy": { + "button": "Copy summary", + "success": "Diagnostic summary copied", + "error": "Could not copy diagnostic summary" + } + }, "tokens": { "eyebrow": "API Access", "title": "Personal access tokens", diff --git a/src/i18n/messages/es.json b/src/i18n/messages/es.json index fadcf4f..2530b1e 100644 --- a/src/i18n/messages/es.json +++ b/src/i18n/messages/es.json @@ -19,7 +19,8 @@ "tokens": "Tokens", "apiDocs": "API Docs", "adminTokens": "Admin Tokens", - "teamsIntegrations": "Teams" + "teamsIntegrations": "Teams", + "opsHealth": "Salud Ops" }, "auth": { "signIn": "Iniciar sesion", @@ -138,6 +139,47 @@ "error": "Error" } }, + "opsHealth": { + "eyebrow": "Operaciones", + "title": "Salud Ops", + "description": "Identifica la compilación en ejecución, revisa los controles de salud y copia un resumen diagnóstico seguro.", + "overall": "Estado general", + "capturedAt": "Capturado: {value}", + "checkedAt": "Comprobado: {value}", + "refresh": "Actualizar", + "refreshing": "Actualizando...", + "refreshFailed": "No se pudo actualizar la instantánea de salud.", + "status": { + "healthy": "Correcto", + "degraded": "Degradado", + "unknown": "Desconocido", + "unavailable": "No disponible" + }, + "environment": { + "title": "Identidad del entorno", + "environment": "Entorno", + "version": "Versión", + "revision": "Revisión", + "buildId": "ID de build", + "builtAt": "Compilado" + }, + "checks": { + "runtime": "Runtime", + "database": "Base de datos", + "configuration": "Configuración", + "worker": "Worker", + "deploySmoke": "Smoke de despliegue" + }, + "summary": { + "title": "Resumen diagnóstico", + "description": "Copia este texto sin secretos en issues o notas de incidente." 
+ }, + "copy": { + "button": "Copiar resumen", + "success": "Resumen diagnóstico copiado", + "error": "No se pudo copiar el resumen diagnóstico" + } + }, "tokens": { "eyebrow": "API Access", "title": "Personal access tokens", diff --git a/src/i18n/messages/fr.json b/src/i18n/messages/fr.json index d84f064..84ec928 100644 --- a/src/i18n/messages/fr.json +++ b/src/i18n/messages/fr.json @@ -19,7 +19,8 @@ "tokens": "Tokens", "apiDocs": "API Docs", "adminTokens": "Admin Tokens", - "teamsIntegrations": "Teams" + "teamsIntegrations": "Teams", + "opsHealth": "Santé Ops" }, "auth": { "signIn": "Se connecter", @@ -138,6 +139,47 @@ "error": "Erreur" } }, + "opsHealth": { + "eyebrow": "Opérations", + "title": "Santé Ops", + "description": "Identifiez le build en cours, inspectez les contrôles de santé et copiez un résumé de diagnostic sûr.", + "overall": "État global", + "capturedAt": "Capturé : {value}", + "checkedAt": "Vérifié : {value}", + "refresh": "Actualiser", + "refreshing": "Actualisation...", + "refreshFailed": "Impossible d’actualiser l’instantané de santé.", + "status": { + "healthy": "Sain", + "degraded": "Dégradé", + "unknown": "Inconnu", + "unavailable": "Indisponible" + }, + "environment": { + "title": "Identité de l’environnement", + "environment": "Environnement", + "version": "Version", + "revision": "Révision", + "buildId": "ID de build", + "builtAt": "Build créé" + }, + "checks": { + "runtime": "Runtime", + "database": "Base de données", + "configuration": "Configuration", + "worker": "Worker", + "deploySmoke": "Smoke de déploiement" + }, + "summary": { + "title": "Résumé de diagnostic", + "description": "Copiez ce texte sans secret dans des tickets ou notes d’incident." 
+ }, + "copy": { + "button": "Copier le résumé", + "success": "Résumé de diagnostic copié", + "error": "Impossible de copier le résumé de diagnostic" + } + }, "tokens": { "eyebrow": "API Access", "title": "Personal access tokens", diff --git a/src/i18n/messages/pt.json b/src/i18n/messages/pt.json index fc845a6..dec9dba 100644 --- a/src/i18n/messages/pt.json +++ b/src/i18n/messages/pt.json @@ -19,7 +19,8 @@ "tokens": "Tokens", "apiDocs": "API Docs", "adminTokens": "Admin Tokens", - "teamsIntegrations": "Teams" + "teamsIntegrations": "Teams", + "opsHealth": "Saúde Ops" }, "auth": { "signIn": "Entrar", @@ -138,6 +139,47 @@ "error": "Erro" } }, + "opsHealth": { + "eyebrow": "Operações", + "title": "Saúde Ops", + "description": "Identifique o build em execução, inspecione verificações de saúde e copie um resumo diagnóstico seguro.", + "overall": "Estado geral", + "capturedAt": "Capturado: {value}", + "checkedAt": "Verificado: {value}", + "refresh": "Atualizar", + "refreshing": "Atualizando...", + "refreshFailed": "Não foi possível atualizar o instantâneo de saúde.", + "status": { + "healthy": "Saudável", + "degraded": "Degradado", + "unknown": "Desconhecido", + "unavailable": "Indisponível" + }, + "environment": { + "title": "Identidade do ambiente", + "environment": "Ambiente", + "version": "Versão", + "revision": "Revisão", + "buildId": "ID do build", + "builtAt": "Build criado" + }, + "checks": { + "runtime": "Runtime", + "database": "Banco de dados", + "configuration": "Configuração", + "worker": "Worker", + "deploySmoke": "Smoke de deploy" + }, + "summary": { + "title": "Resumo diagnóstico", + "description": "Copie este texto sem segredos para issues ou notas de incidente." 
+ }, + "copy": { + "button": "Copiar resumo", + "success": "Resumo diagnóstico copiado", + "error": "Não foi possível copiar o resumo diagnóstico" + } + }, "tokens": { "eyebrow": "API Access", "title": "Personal access tokens", diff --git a/src/lib/ops-health.ts b/src/lib/ops-health.ts new file mode 100644 index 0000000..ae1e1e3 --- /dev/null +++ b/src/lib/ops-health.ts @@ -0,0 +1,299 @@ +import { getAppVersionInfo, type AppVersionInfo } from "@/lib/app-version"; +import { prisma } from "@/lib/db"; +import { resolveAppDatabaseUrl } from "@/lib/database-url"; +import { checkDatabaseHealth, getProcessHealth } from "@/lib/monitoring"; + +export type HealthStatus = "healthy" | "degraded" | "unknown" | "unavailable"; + +export type HealthCheckKey = + | "runtime" + | "database" + | "configuration" + | "worker" + | "deploySmoke"; + +export type EnvironmentIdentity = { + environment: string; + version: string; + revision: string; + buildId: string; + builtAt: string; +}; + +export type HealthCheckResult = { + key: HealthCheckKey; + status: HealthStatus; + summary: string; + detail?: string; + checkedAt?: string; + optional?: boolean; +}; + +export type DiagnosticSummary = { + generatedAt: string; + text: string; +}; + +export type HealthSnapshot = { + capturedAt: string; + overallStatus: HealthStatus; + environment: EnvironmentIdentity; + checks: HealthCheckResult[]; + diagnosticSummary: DiagnosticSummary; +}; + +type OpsHealthEnv = Record; + +const UNKNOWN = "unknown"; +const REQUIRED_CHECKS = new Set([ + "runtime", + "database", + "configuration", +]); +const SENSITIVE_KEY_PATTERN = + /(secret|token|password|passwd|authorization|cookie|private.?key|connection.?string|database.?url|url)$/i; +const FORBIDDEN_SUMMARY_PATTERNS = [ + /Bearer\s+[A-Za-z0-9._~-]+/i, + /postgres(?:ql)?:\/\/\S+/i, + /mysql:\/\/\S+/i, + /file:\S+/i, + /-----BEGIN [A-Z ]*PRIVATE KEY-----/i, +]; + +export function createEnvironmentIdentity( + versionInfo: AppVersionInfo = getAppVersionInfo(), +): 
EnvironmentIdentity { + return { + environment: versionInfo.environment || UNKNOWN, + version: versionInfo.version || UNKNOWN, + revision: versionInfo.shortRevision || versionInfo.revision || UNKNOWN, + buildId: versionInfo.buildId || UNKNOWN, + builtAt: versionInfo.builtAt || UNKNOWN, + }; +} + +/** + * Folds the required checks (see REQUIRED_CHECKS) into one overall status; + * optional checks are ignored. "degraded" wins over "unknown", and a required + * check that is "unavailable" is reported as "unknown" rather than "healthy". + */ +export function aggregateOverallStatus(checks: HealthCheckResult[]) { + const requiredChecks = checks.filter((check) => + REQUIRED_CHECKS.has(check.key), + ); + + if (requiredChecks.some((check) => check.status === "degraded")) { + return "degraded" as const; + } + + if ( + requiredChecks.some( + (check) => check.status === "unknown" || check.status === "unavailable", + ) + ) { + return "unknown" as const; + } + + return "healthy" as const; +} + +/** + * Returns a redacted copy of value. Arrays and objects are walked + * recursively, properties whose key matches SENSITIVE_KEY_PATTERN are replaced + * wholesale, and every match of FORBIDDEN_SUMMARY_PATTERNS inside a string is + * replaced with "[REDACTED]". Other values are returned unchanged. + */ +export function redactSensitiveValue(value: unknown): unknown { + if (Array.isArray(value)) { + return value.map((entry) => redactSensitiveValue(entry)); + } + + if (value && typeof value === "object") { + return Object.fromEntries( + Object.entries(value as Record<string, unknown>).map(([key, entry]) => [ + key, + SENSITIVE_KEY_PATTERN.test(key) + ? 
"[REDACTED]" + : redactSensitiveValue(entry), + ]), + ); + } + + if (typeof value === "string") { + return FORBIDDEN_SUMMARY_PATTERNS.reduce( + (current, pattern) => + current.replace( + /* The patterns are declared without the global flag; promote each one so every occurrence is redacted, not only the first. */ + new RegExp( + pattern.source, + pattern.flags.includes("g") ? pattern.flags : `${pattern.flags}g`, + ), + "[REDACTED]", + ), + value, + ); + } + + return value; +} + +export function createDiagnosticSummary( + snapshot: Omit<HealthSnapshot, "diagnosticSummary">, +): DiagnosticSummary { + const lines = [ + `Environment: ${snapshot.environment.environment}`, + `Version: ${snapshot.environment.version}`, + `Revision: ${snapshot.environment.revision}`, + `Build ID: ${snapshot.environment.buildId}`, + `Built At: ${snapshot.environment.builtAt}`, + `Captured At: ${snapshot.capturedAt}`, + `Overall: ${snapshot.overallStatus}`, + ...snapshot.checks.map((check) => `${check.key}: ${check.status}`), + ]; + + return { + generatedAt: snapshot.capturedAt, + text: String(redactSensitiveValue(lines.join("\n"))), + }; +} + +function envPresent(env: OpsHealthEnv, names: string[]) { + return names.some((name) => Boolean(env[name]?.trim())); +} + +export function getConfigurationHealth(env: OpsHealthEnv = process.env) { + const isProduction = env.NODE_ENV === "production"; + const hasAuthSecret = envPresent(env, [ + "BETTERAUTH_SECRET", + "BETTER_AUTH_SECRET", + ]); + const hasDatabaseUrl = envPresent(env, ["APP_DATABASE_URL", "DATABASE_URL"]); + const hasRuntimeEnvironment = envPresent(env, ["APP_ENVIRONMENT"]); + const hasBuildMetadata = envPresent(env, [ + "APP_VERSION", + "APP_REVISION", + "APP_GIT_SHA", + "APP_BUILD_ID", + "APP_BUILT_AT", + ]); + const missingRequired = [ + isProduction && !hasAuthSecret ? "authentication secret" : null, + !hasDatabaseUrl ? "app database URL" : null, + ].filter(Boolean); + const optionalMissing = [ + !hasRuntimeEnvironment ? 
"runtime environment label" : null, + !hasBuildMetadata ? 
"build metadata" : null, + ].filter(Boolean); + + if (missingRequired.length > 0) { + return { + key: "configuration" as const, + status: "degraded" as const, + summary: "Required runtime configuration is missing", + detail: `Missing required configuration: ${missingRequired.join(", ")}.`, + checkedAt: new Date().toISOString(), + }; + } + + if (optionalMissing.length > 0) { + return { + key: "configuration" as const, + status: "unknown" as const, + summary: "Runtime configuration is usable with incomplete metadata", + detail: `Missing optional metadata: ${optionalMissing.join(", ")}.`, + checkedAt: new Date().toISOString(), + }; + } + + return { + key: "configuration" as const, + status: "healthy" as const, + summary: "Required runtime configuration is present", + checkedAt: new Date().toISOString(), + }; +} + +function mapRuntimeHealth(capturedAt: string): HealthCheckResult { + const runtime = getProcessHealth(); + return { + key: "runtime", + status: runtime.status === "ok" ? "healthy" : "degraded", + summary: + runtime.status === "ok" + ? "Runtime is responding" + : "Runtime health check is degraded", + detail: `Node environment: ${runtime.nodeEnv}; uptime: ${runtime.uptimeSeconds}s.`, + checkedAt: capturedAt, + }; +} + +async function mapDatabaseHealth( + capturedAt: string, +): Promise<HealthCheckResult> { + const database = await checkDatabaseHealth(); + return { + key: "database", + status: database.status === "ok" ? "healthy" : "degraded", + summary: + database.status === "ok" + ? "Database connectivity check passed" + : "Database connectivity check failed", + detail: + database.status === "ok" + ? `Provider: ${resolveAppDatabaseUrl().startsWith("file:") ? 
"sqlite" : "postgresql"}.` + /* Failure text may embed the connection string, so it is redacted like the worker error below. */ + : database.message + ? String(redactSensitiveValue(database.message)) + : undefined, + checkedAt: capturedAt, + }; +} + +/** + * Reports worker evidence from the most recently updated background job. + * The check is optional: a failed lookup is reported as "unknown" so that it + * cannot reject the whole snapshot. + */ +async function getWorkerHealth(capturedAt: string): Promise<HealthCheckResult> { + let recentJob; + try { + recentJob = await prisma.backgroundJob.findFirst({ + orderBy: { updatedAt: "desc" }, + select: { + status: true, + updatedAt: true, + workerId: true, + error: true, + }, + }); + } catch { + /* The caught error is deliberately not echoed into the result. */ + return { + key: "worker", + status: "unknown", + summary: "Worker evidence could not be read", + checkedAt: capturedAt, + optional: true, + }; + } + + if (!recentJob) { + return { + key: "worker", + status: "unknown", + summary: "No recent worker evidence is available", + checkedAt: capturedAt, + optional: true, + }; + } + + const checkedAt = recentJob.updatedAt.toISOString(); + if (recentJob.status === "FAILED") { + return { + key: "worker", + status: "degraded", + summary: "Recent worker job failed", + detail: String( + redactSensitiveValue(recentJob.error ?? "Review background jobs."), + ), + checkedAt, + optional: true, + }; + } + + return { + key: "worker", + status: "healthy", + summary: `Recent worker evidence: ${recentJob.status.toLowerCase()}`, + detail: recentJob.workerId ? `Worker: ${recentJob.workerId}.` : undefined, + checkedAt, + optional: true, + }; +} + +function getDeploySmokeHealth(): HealthCheckResult { + return { + key: "deploySmoke", + status: "unavailable", + summary: "No recent deployment smoke result is available", + optional: true, + }; +} + +/** + * Runs every check, derives the overall status from the required ones and + * attaches the redacted diagnostic summary built from the same data. + */ +export async function buildOpsHealthSnapshot(): Promise<HealthSnapshot> { + const capturedAt = new Date().toISOString(); + const checks = [ + mapRuntimeHealth(capturedAt), + await mapDatabaseHealth(capturedAt), + getConfigurationHealth(), + await getWorkerHealth(capturedAt), + getDeploySmokeHealth(), + ]; + const snapshotWithoutSummary = { + capturedAt, + overallStatus: aggregateOverallStatus(checks), + environment: createEnvironmentIdentity(), + checks, + }; + + return { + ...snapshotWithoutSummary, + diagnosticSummary: createDiagnosticSummary(snapshotWithoutSummary), + }; +} diff --git a/tests/e2e/ops-health/admin-ops-health.spec.ts b/tests/e2e/ops-health/admin-ops-health.spec.ts new file mode 100644 index 0000000..c549a6e --- /dev/null +++ 
b/tests/e2e/ops-health/admin-ops-health.spec.ts @@ -0,0 +1,100 @@ +import { expect, test } from "@playwright/test"; +import { Role } from "../../../generated/prisma/enums"; +import { + appBasePath, + expectOnDashboard, + loginWithPassword, +} from "../helpers/auth"; +import { + seedBackgroundJob, + seedLocalUser, + updateUserStatus, + UserStatus, +} from "../helpers/db"; + +test("platform admin can inspect and copy ops health diagnostics", async ({ + page, +}) => { + const adminEmail = "ops-admin@example.com"; + + await seedLocalUser({ + email: adminEmail, + name: "Ops Admin", + role: Role.PLATFORM_ADMIN, + password: "OpsAdminPass123", + mustChangePassword: false, + }); + + try { + await seedBackgroundJob({ + jobType: "ops-health-evidence", + status: "COMPLETED", + workerId: "worker-e2e", + payload: { safe: true }, + }); + + await loginWithPassword(page, adminEmail, "OpsAdminPass123"); + await expectOnDashboard(page); + + await page.getByRole("link", { name: "Ops Health" }).click(); + await expect(page).toHaveURL(new RegExp(`${appBasePath}/admin/ops$`)); + await expect( + page.getByRole("heading", { name: "Ops Health" }), + ).toBeVisible(); + await expect(page.getByText("Environment", { exact: true })).toBeVisible(); + await expect(page.getByText("Runtime", { exact: true })).toBeVisible(); + await expect(page.getByText("Database", { exact: true })).toBeVisible(); + + for (const viewport of [ + { width: 390, height: 844 }, + { width: 768, height: 1024 }, + { width: 1280, height: 900 }, + ]) { + await page.setViewportSize(viewport); + await expect( + page.getByRole("heading", { name: "Ops Health" }), + ).toBeVisible(); + const hasHorizontalOverflow = await page.evaluate( + () => document.documentElement.scrollWidth > window.innerWidth + 1, + ); + expect(hasHorizontalOverflow).toBe(false); + } + + const capturedBefore = await page + .getByText(/Captured:/) + .first() + .textContent(); + await page.getByRole("button", { name: "Refresh" }).click(); + await expect + 
.poll(async () => + page + .getByText(/Captured:/) + .first() + .textContent(), + ) + .not.toBe(capturedBefore); + + await page.getByRole("button", { name: "Copy summary" }).click(); + await expect(page.getByRole("status")).toContainText( + "Diagnostic summary copied", + ); + } finally { + updateUserStatus(adminEmail, UserStatus.INACTIVE); + } +}); + +test("non-admin users cannot open ops health", async ({ page }) => { + await seedLocalUser({ + email: "ops-user@example.com", + name: "Ops User", + role: Role.SCOPE_USER, + password: "UserPass123", + mustChangePassword: false, + }); + + await loginWithPassword(page, "ops-user@example.com", "UserPass123"); + await expectOnDashboard(page); + await page.goto(`${appBasePath}/admin/ops`); + + await expect(page).toHaveURL(new RegExp(`${appBasePath}/dashboard$`)); +}); diff --git a/tests/integration/ops-health-api.test.ts b/tests/integration/ops-health-api.test.ts new file mode 100644 index 0000000..bf207f5 --- /dev/null +++ b/tests/integration/ops-health-api.test.ts @@ -0,0 +1,115 @@ +import { afterEach, describe, expect, it, vi } from "vitest"; +import { Role, UserStatus } from "../../generated/prisma/enums"; + +const { requireApiUserWithRoles } = vi.hoisted(() => ({ + requireApiUserWithRoles: vi.fn(), +})); + +const { buildOpsHealthSnapshot } = vi.hoisted(() => ({ + buildOpsHealthSnapshot: vi.fn(), +})); + +vi.mock("@/lib/route-auth", () => ({ + requireApiUserWithRoles, +})); + +vi.mock("@/lib/ops-health", () => ({ + buildOpsHealthSnapshot, +})); + +import { GET } from "@/app/api/admin/ops-health/route"; + +describe("ops health admin API", () => { + afterEach(() => { + vi.clearAllMocks(); + }); + + it("requires platform admin access", async () => { + const forbidden = Response.json( + { error: "Not authorized" }, + { status: 403 }, + ); + requireApiUserWithRoles.mockResolvedValue({ error: forbidden }); + + const response = await GET( + new Request("http://localhost/api/admin/ops-health"), + ); + + 
expect(response.status).toBe(403); + expect(requireApiUserWithRoles).toHaveBeenCalledWith( + [Role.PLATFORM_ADMIN], + expect.any(Request), + ); + }); + + it("returns a safe health snapshot for admins", async () => { + requireApiUserWithRoles.mockResolvedValue({ + user: { + id: "admin-1", + role: Role.PLATFORM_ADMIN, + status: UserStatus.ACTIVE, + }, + }); + buildOpsHealthSnapshot.mockResolvedValue({ + capturedAt: "2026-06-11T09:30:00Z", + overallStatus: "healthy", + environment: { + environment: "staging", + version: "staging-42", + revision: "abcdef123456", + buildId: "123.2", + builtAt: "2026-06-11T09:00:00Z", + }, + checks: [ + { + key: "database", + status: "healthy", + summary: "Database connectivity check passed", + }, + ], + diagnosticSummary: { + generatedAt: "2026-06-11T09:30:00Z", + text: "Environment: staging\nDatabase: healthy", + }, + }); + + const response = await GET( + new Request("http://localhost/api/admin/ops-health"), + ); + + expect(response.status).toBe(200); + await expect(response.json()).resolves.toMatchObject({ + overallStatus: "healthy", + environment: { + version: "staging-42", + }, + diagnosticSummary: { + text: expect.not.stringContaining("secret"), + }, + }); + }); + + it("returns a safe fatal error when snapshot assembly fails", async () => { + requireApiUserWithRoles.mockResolvedValue({ + user: { + id: "admin-1", + role: Role.PLATFORM_ADMIN, + status: UserStatus.ACTIVE, + }, + }); + buildOpsHealthSnapshot.mockRejectedValue( + new Error("postgresql://user:pass@example/db"), + ); + + const response = await GET( + new Request("http://localhost/api/admin/ops-health"), + ); + const body = await response.json(); + + expect(response.status).toBe(500); + expect(body).toEqual({ + error: "Could not assemble a safe ops health snapshot", + }); + expect(JSON.stringify(body)).not.toContain("postgresql://"); + }); +}); diff --git a/tests/unit/ops-health.test.ts b/tests/unit/ops-health.test.ts new file mode 100644 index 0000000..c6812be --- 
/dev/null +++ b/tests/unit/ops-health.test.ts @@ -0,0 +1,192 @@ +import { afterEach, describe, expect, it, vi } from "vitest"; + +const { prismaMock } = vi.hoisted(() => ({ + prismaMock: { + backgroundJob: { + findFirst: vi.fn(), + }, + }, +})); + +const { checkDatabaseHealth, getProcessHealth } = vi.hoisted(() => ({ + checkDatabaseHealth: vi.fn(), + getProcessHealth: vi.fn(), +})); + +vi.mock("@/lib/db", () => ({ + prisma: prismaMock, +})); + +vi.mock("@/lib/monitoring", () => ({ + checkDatabaseHealth, + getProcessHealth, +})); + +import { + aggregateOverallStatus, + buildOpsHealthSnapshot, + createDiagnosticSummary, + createEnvironmentIdentity, + getConfigurationHealth, + redactSensitiveValue, + type HealthCheckResult, +} from "@/lib/ops-health"; +import { resetAppVersionInfoForTests } from "@/lib/app-version"; + +const metadataKeys = [ + "APP_ENVIRONMENT", + "APP_VERSION", + "APP_REVISION", + "APP_GIT_SHA", + "APP_BUILD_ID", + "APP_BUILT_AT", + "APP_DATABASE_URL", + "DATABASE_URL", + "BETTERAUTH_SECRET", + "BETTER_AUTH_SECRET", + "NODE_ENV", +]; + +describe("ops health snapshot helpers", () => { + afterEach(() => { + for (const key of metadataKeys) { + delete process.env[key]; + } + resetAppVersionInfoForTests(); + vi.clearAllMocks(); + }); + + it("maps app version metadata to environment identity", () => { + process.env.APP_ENVIRONMENT = "staging"; + process.env.APP_VERSION = "staging-42"; + process.env.APP_REVISION = "abcdef1234567890"; + process.env.APP_BUILD_ID = "123.2"; + process.env.APP_BUILT_AT = "2026-06-11T09:20:17Z"; + + expect(createEnvironmentIdentity()).toEqual({ + environment: "staging", + version: "staging-42", + revision: "abcdef123456", + buildId: "123.2", + builtAt: "2026-06-11T09:20:17Z", + }); + }); + + it("ignores optional worker and smoke states for overall status", () => { + const checks: HealthCheckResult[] = [ + { key: "runtime", status: "healthy", summary: "ok" }, + { key: "database", status: "healthy", summary: "ok" }, + { key: 
"configuration", status: "healthy", summary: "ok" }, + { key: "worker", status: "unknown", summary: "none", optional: true }, + { + key: "deploySmoke", + status: "unavailable", + summary: "none", + optional: true, + }, + ]; + + expect(aggregateOverallStatus(checks)).toBe("healthy"); + }); + + it("marks overall status degraded when a required check degrades", () => { + expect( + aggregateOverallStatus([ + { key: "runtime", status: "healthy", summary: "ok" }, + { key: "database", status: "degraded", summary: "down" }, + { key: "configuration", status: "healthy", summary: "ok" }, + ]), + ).toBe("degraded"); + }); + + it("redacts nested sensitive values", () => { + expect( + redactSensitiveValue({ + token: "abc", + nested: { + databaseUrl: "postgresql://user:pass@example/db", + safe: "hello", + }, + }), + ).toEqual({ + token: "[REDACTED]", + nested: { + databaseUrl: "[REDACTED]", + safe: "hello", + }, + }); + }); + + it("reports configuration readiness without raw values", () => { + const result = getConfigurationHealth({ + NODE_ENV: "production", + APP_DATABASE_URL: "postgresql://secret@example/db", + APP_ENVIRONMENT: "prod", + APP_VERSION: "v1", + }); + + expect(result).toMatchObject({ + key: "configuration", + status: "degraded", + }); + expect(JSON.stringify(result)).not.toContain("postgresql://"); + }); + + it("assembles a full snapshot with safe diagnostic text", async () => { + process.env.APP_ENVIRONMENT = "staging"; + process.env.APP_VERSION = "staging-42"; + process.env.APP_REVISION = "abcdef1234567890"; + process.env.APP_BUILD_ID = "123.2"; + process.env.APP_BUILT_AT = "2026-06-11T09:20:17Z"; + process.env.APP_DATABASE_URL = "file:./dev.db"; + checkDatabaseHealth.mockResolvedValue({ status: "ok" }); + getProcessHealth.mockReturnValue({ + status: "ok", + uptimeSeconds: 42, + nodeEnv: "test", + }); + prismaMock.backgroundJob.findFirst.mockResolvedValue({ + status: "COMPLETED", + updatedAt: new Date("2026-06-11T09:30:00Z"), + workerId: "worker-1", + error: 
null, + }); + + const snapshot = await buildOpsHealthSnapshot(); + + expect(snapshot.environment.version).toBe("staging-42"); + expect(snapshot.checks.map((check) => check.key)).toEqual([ + "runtime", + "database", + "configuration", + "worker", + "deploySmoke", + ]); + expect(snapshot.diagnosticSummary.text).toContain("Version: staging-42"); + expect(snapshot.diagnosticSummary.text).not.toContain("file:./dev.db"); + }); + + it("creates an allowlisted diagnostic summary", () => { + const summary = createDiagnosticSummary({ + capturedAt: "2026-06-11T09:30:00Z", + overallStatus: "healthy", + environment: { + environment: "staging", + version: "v1", + revision: "abc", + buildId: "run-1", + builtAt: "2026-06-11T09:00:00Z", + }, + checks: [ + { + key: "runtime", + status: "healthy", + summary: "Authorization: Bearer nope", + }, + ], + }); + + expect(summary.text).toContain("runtime: healthy"); + expect(summary.text).not.toContain("Bearer"); + }); +}); diff --git a/validate.ps1 b/validate.ps1 index 0240ea7..11f429a 100644 --- a/validate.ps1 +++ b/validate.ps1 @@ -1240,7 +1240,26 @@ if ($Phase -in "all", "full", "test", "commit") { if ($Phase -in "all", "full", "test", "commit") { Write-Step "Tests (vitest)" try { + $previousDatabaseUrl = $env:DATABASE_URL + if ([string]::IsNullOrWhiteSpace($env:APP_DATABASE_URL) -and [string]::IsNullOrWhiteSpace($env:DATABASE_URL)) { + $env:DATABASE_URL = "postgresql://starter:starter_e2e_password@localhost:55432/business_app_starter_e2e_test" + } + + $effectiveDatabaseUrl = if (-not [string]::IsNullOrWhiteSpace($env:APP_DATABASE_URL)) { $env:APP_DATABASE_URL } else { $env:DATABASE_URL } + $generateCommand = if ($effectiveDatabaseUrl -like "file:*") { "pnpm run prisma:generate" } else { "pnpm run prisma:generate:postgres" } + $generateResult = Invoke-NativeCommandCaptured $generateCommand + if ($generateResult.ExitCode -ne 0) { + $generateResult.Output | Out-Host + throw "prisma generate failed" + } + $result = 
Invoke-NativeCommandCaptured "pnpm test" + if ($null -eq $previousDatabaseUrl) { + Remove-Item Env:\DATABASE_URL -ErrorAction SilentlyContinue + } else { + $env:DATABASE_URL = $previousDatabaseUrl + } + if ($result.ExitCode -ne 0) { $result.Output | Out-Host throw "tests failed" @@ -1274,6 +1293,12 @@ if ($Phase -in "all", "full", "test", "commit") { Write-Pass "tests passed" } } catch { + if ($null -eq $previousDatabaseUrl) { + Remove-Item Env:\DATABASE_URL -ErrorAction SilentlyContinue + } else { + $env:DATABASE_URL = $previousDatabaseUrl + } + Write-Fail "tests failed" $failures += "tests" }