diff --git a/apps/api/src/durable-objects/project-data/row-schemas.ts b/apps/api/src/durable-objects/project-data/row-schemas.ts index 2e96313a7..71d671c8f 100644 --- a/apps/api/src/durable-objects/project-data/row-schemas.ts +++ b/apps/api/src/durable-objects/project-data/row-schemas.ts @@ -256,7 +256,7 @@ export function parseChatMessageRow(row: unknown): { sessionId: r.session_id, role: r.role, content: r.content, - toolMetadata: r.tool_metadata ? JSON.parse(r.tool_metadata) : null, + toolMetadata: safeParseJson(r.tool_metadata), createdAt: r.created_at, sequence: r.sequence, }; diff --git a/apps/api/src/routes/chat.ts b/apps/api/src/routes/chat.ts index 9db6c83af..2d51ff596 100644 --- a/apps/api/src/routes/chat.ts +++ b/apps/api/src/routes/chat.ts @@ -10,17 +10,19 @@ import type { ChatSessionTaskEmbed } from '@simple-agent-manager/shared'; import { DEFAULT_CHAT_SESSION_MESSAGE_LIMIT, isTaskExecutionStep, isTaskMode } from '@simple-agent-manager/shared'; import { and, eq, inArray } from 'drizzle-orm'; import { drizzle } from 'drizzle-orm/d1'; +import type { Context } from 'hono'; import { Hono } from 'hono'; import * as schema from '../db/schema'; import type { Env } from '../env'; import { log } from '../lib/logger'; import { requireRouteParam } from '../lib/route-helpers'; -import { getUserId, requireApproved,requireAuth } from '../middleware/auth'; +import { getAuth, getUserId, requireApproved, requireAuth } from '../middleware/auth'; import { errors } from '../middleware/error'; import { requireOwnedProject } from '../middleware/project-auth'; -import { CreateChatSessionSchema, LinkTaskToChatSchema,parseOptionalBody, SendChatMessageSchema } from '../schemas'; +import { CreateChatSessionSchema, LinkTaskToChatSchema, parseOptionalBody, SendChatMessageSchema } from '../schemas'; import * as chatPersistence from '../services/chat-persistence'; +import { persistError } from '../services/observability'; import * as projectDataService from '../services/project-data'; import { isTaskStatus } from '../services/task-status'; @@ -28,6 +30,91 @@ const chatRoutes = new Hono<{ Bindings: Env }>(); chatRoutes.use('/*', requireAuth(), requireApproved()); +type ChatSessionLoadPhase = 'get_session' | 'get_messages'; + +function isDiagnosticRole(role: string): boolean { + return role === 'admin' || role === 'superadmin'; +} + +function serializeDiagnosticError(err: unknown): { + name: string; + message: string; + stack: string | null; +} { + if (err instanceof Error) { + return { + name: err.name, + message: err.message, + stack: err.stack ?? null, + }; + } + + return { + name: 'NonError', + message: String(err), + stack: null, + }; +} + +async function recordChatSessionLoadFailure( + c: Context<{ Bindings: Env }>, + input: { + err: unknown; + phase: ChatSessionLoadPhase; + projectId: string; + sessionId: string; + userId: string; + } +): Promise { + const requestId = crypto.randomUUID(); + const diagnostic = serializeDiagnosticError(input.err); + const context = { + requestId, + route: 'GET /api/projects/:projectId/sessions/:sessionId', + phase: input.phase, + projectId: input.projectId, + sessionId: input.sessionId, + userId: input.userId, + errorName: diagnostic.name, + errorMessage: diagnostic.message, + }; + + log.error('chat.session_detail_load_failed', { + ...context, + stack: diagnostic.stack, + }); + + if (c.env.OBSERVABILITY_DATABASE) { + await persistError(c.env.OBSERVABILITY_DATABASE, { + source: 'api', + level: 'error', + message: 'chat.session_detail_load_failed', + stack: diagnostic.stack, + context, + userId: input.userId, + ipAddress: c.req.header('CF-Connecting-IP') ?? null, + userAgent: c.req.header('User-Agent') ?? null, + }); + } + + const body: Record = { + error: 'CHAT_SESSION_LOAD_FAILED', + message: 'Failed to load chat session', + requestId, + phase: input.phase, + }; + + if (isDiagnosticRole(getAuth(c).user.role)) { + body.details = { + errorName: diagnostic.name, + errorMessage: diagnostic.message, + stack: diagnostic.stack, + }; + } + + return c.json(body, 500); +} + function getSessionMessageLimit(env: Env, requestedLimit?: string): number { const configuredLimit = parseInt(env.CHAT_SESSION_MESSAGE_LIMIT || '', 10); const maxLimit = Number.isFinite(configuredLimit) && configuredLimit > 0 @@ -113,7 +200,19 @@ chatRoutes.get('/:sessionId', async (c) => { await requireOwnedProject(db, projectId, userId); - const session = await projectDataService.getSession(c.env, projectId, sessionId); + let session: Awaited>; + try { + session = await projectDataService.getSession(c.env, projectId, sessionId); + } catch (err) { + return recordChatSessionLoadFailure(c, { + err, + phase: 'get_session', + projectId, + sessionId, + userId, + }); + } + if (!session) { throw errors.notFound('Chat session'); } @@ -122,13 +221,24 @@ chatRoutes.get('/:sessionId', async (c) => { const beforeParam = c.req.query('before'); const before = beforeParam ? parseInt(beforeParam, 10) : null; - const messagesResult = await projectDataService.getMessages( - c.env, - projectId, - sessionId, - limit, - before - ); + let messagesResult: Awaited>; + try { + messagesResult = await projectDataService.getMessages( + c.env, + projectId, + sessionId, + limit, + before + ); + } catch (err) { + return recordChatSessionLoadFailure(c, { + err, + phase: 'get_messages', + projectId, + sessionId, + userId, + }); + } // Embed task summary if session is linked to a task (D1 lookup, best-effort) let task: ChatSessionTaskEmbed | null = null; diff --git a/apps/api/src/services/observability.ts b/apps/api/src/services/observability.ts index dd9522cc9..5796b4c7a 100644 --- a/apps/api/src/services/observability.ts +++ b/apps/api/src/services/observability.ts @@ -7,6 +7,7 @@ */ import type { PlatformErrorLevel,PlatformErrorSource } from '@simple-agent-manager/shared'; +import type { SQL } from 'drizzle-orm'; import { and, count, desc, eq, gte, like, lte, or } from 'drizzle-orm'; import { drizzle } from 'drizzle-orm/d1'; @@ -169,7 +170,7 @@ export async function queryErrors( const { platformErrors } = observabilitySchema; const limit = Math.min(params.limit ?? DEFAULT_QUERY_LIMIT, MAX_QUERY_LIMIT); - const conditions: ReturnType[] = []; + const conditions: SQL[] = []; if (params.source && VALID_SOURCES.has(params.source)) { conditions.push(eq(platformErrors.source, params.source)); @@ -188,7 +189,14 @@ export async function queryErrors( } if (params.search) { - conditions.push(like(platformErrors.message, `%${params.search}%`)); + const searchPattern = `%${params.search}%`; + const searchCondition = or( + like(platformErrors.message, searchPattern), + like(platformErrors.context, searchPattern) + ); + if (searchCondition) { + conditions.push(searchCondition); + } } // Cursor-based pagination: decode cursor as timestamp diff --git a/apps/api/tests/unit/durable-objects/row-schemas.test.ts b/apps/api/tests/unit/durable-objects/row-schemas.test.ts index aeb252a29..8417387e2 100644 --- a/apps/api/tests/unit/durable-objects/row-schemas.test.ts +++ b/apps/api/tests/unit/durable-objects/row-schemas.test.ts @@ -298,18 +298,18 @@ describe('parseChatMessageRow', () => { expect(result.sequence).toBeNull(); }); - it('throws on invalid JSON in tool_metadata', () => { - expect(() => - parseChatMessageRow({ - id: 'm1', - session_id: 's1', - role: 'user', - content: 'hi', - tool_metadata: 'not-json', - created_at: 1000, - sequence: null, - }) - ).toThrow(); + it('returns null for invalid JSON in tool_metadata', () => { + const result = parseChatMessageRow({ + id: 'm1', + session_id: 's1', + role: 'user', + content: 'hi', + tool_metadata: 'not-json', + created_at: 1000, + sequence: null, + }); + + expect(result.toolMetadata).toBeNull(); }); }); diff --git a/apps/api/tests/unit/routes/chat-session-agent-routing.test.ts b/apps/api/tests/unit/routes/chat-session-agent-routing.test.ts index 1cbd7c868..de91c62fb 100644 --- a/apps/api/tests/unit/routes/chat-session-agent-routing.test.ts +++ b/apps/api/tests/unit/routes/chat-session-agent-routing.test.ts @@ -10,6 +10,8 @@ const mocks = vi.hoisted(() => ({ getSession: vi.fn(), getMessages: vi.fn(), listAcpSessions: vi.fn(), + persistError: vi.fn(async () => undefined), + userRole: 'user', })); vi.mock('drizzle-orm/d1', () => ({ @@ -27,6 +29,20 @@ vi.mock('../../../src/middleware/auth', () => ({ requireAuth: () => vi.fn((c: unknown, next: () => Promise) => next()), requireApproved: () => vi.fn((c: unknown, next: () => Promise) => next()), getUserId: () => 'user-1', + getAuth: () => ({ + user: { + id: 'user-1', + email: 'user@example.com', + name: null, + avatarUrl: null, + role: mocks.userRole, + status: 'active', + }, + session: { + id: 'session-1', + expiresAt: new Date('2030-01-01T00:00:00Z'), + }, + }), })); vi.mock('../../../src/middleware/project-auth', () => ({ @@ -46,6 +62,10 @@ vi.mock('../../../src/services/project-data', () => ({ unlinkSessionIdea: vi.fn(), })); +vi.mock('../../../src/services/observability', () => ({ + persistError: mocks.persistError, +})); + vi.mock('../../../src/schemas', () => ({ CreateChatSessionSchema: {}, LinkTaskToChatSchema: {}, @@ -59,6 +79,7 @@ describe('chatRoutes agent session routing', () => { beforeEach(() => { vi.clearAllMocks(); + mocks.userRole = 'user'; orderBySpy = vi.fn(() => ({ limit: vi.fn().mockResolvedValue([]), @@ -235,6 +256,102 @@ describe('chatRoutes agent session routing', () => { ); }); + it('returns a structured diagnostic response when session lookup fails', async () => { + const loadError = new Error('Durable Object session lookup failed'); + mocks.getSession.mockRejectedValue(loadError); + + const response = await app.request( + '/api/projects/proj-1/sessions/chat-1', + { method: 'GET', headers: { 'User-Agent': 'vitest' } }, + { + DATABASE: {} as D1Database, + OBSERVABILITY_DATABASE: {} as D1Database, + } as Env, + ); + + expect(response.status).toBe(500); + const body = await response.json(); + expect(body).toEqual({ + error: 'CHAT_SESSION_LOAD_FAILED', + message: 'Failed to load chat session', + requestId: expect.any(String), + phase: 'get_session', + }); + expect(body.details).toBeUndefined(); + expect(mocks.persistError).toHaveBeenCalledWith( + expect.anything(), + expect.objectContaining({ + source: 'api', + level: 'error', + message: 'chat.session_detail_load_failed', + stack: expect.stringContaining('Durable Object session lookup failed'), + userId: 'user-1', + userAgent: 'vitest', + context: expect.objectContaining({ + requestId: body.requestId, + route: 'GET /api/projects/:projectId/sessions/:sessionId', + phase: 'get_session', + projectId: 'proj-1', + sessionId: 'chat-1', + userId: 'user-1', + errorName: 'Error', + errorMessage: 'Durable Object session lookup failed', + }), + }), + ); + }); + + it('returns safe diagnostics for regular users when message lookup fails', async () => { + mocks.getMessages.mockRejectedValue(new Error('Malformed tool metadata')); + + const response = await app.request( + '/api/projects/proj-1/sessions/chat-1', + { method: 'GET' }, + { + DATABASE: {} as D1Database, + OBSERVABILITY_DATABASE: {} as D1Database, + } as Env, + ); + + expect(response.status).toBe(500); + const body = await response.json(); + expect(body).toEqual({ + error: 'CHAT_SESSION_LOAD_FAILED', + message: 'Failed to load chat session', + requestId: expect.any(String), + phase: 'get_messages', + }); + expect(body.details).toBeUndefined(); + }); + + it('includes sanitized diagnostic details for admins when message lookup fails', async () => { + mocks.userRole = 'admin'; + mocks.getMessages.mockRejectedValue(new Error('Malformed tool metadata')); + + const response = await app.request( + '/api/projects/proj-1/sessions/chat-1', + { method: 'GET' }, + { + DATABASE: {} as D1Database, + OBSERVABILITY_DATABASE: {} as D1Database, + } as Env, + ); + + expect(response.status).toBe(500); + const body = await response.json(); + expect(body).toMatchObject({ + error: 'CHAT_SESSION_LOAD_FAILED', + message: 'Failed to load chat session', + requestId: expect.any(String), + phase: 'get_messages', + details: { + errorName: 'Error', + errorMessage: 'Malformed tool metadata', + stack: expect.stringContaining('Malformed tool metadata'), + }, + }); + }); + it('returns null agentType when ACP session has no agentType', async () => { mocks.listAcpSessions.mockResolvedValue({ sessions: [ diff --git a/apps/api/tests/unit/services/observability.test.ts b/apps/api/tests/unit/services/observability.test.ts index 1b77c1d52..d30e8ed35 100644 --- a/apps/api/tests/unit/services/observability.test.ts +++ b/apps/api/tests/unit/services/observability.test.ts @@ -1,4 +1,4 @@ -import { beforeEach,describe, expect, it, vi } from 'vitest'; +import { beforeEach, describe, expect, it, vi } from 'vitest'; import type { Env } from '../../../src/env'; @@ -72,6 +72,8 @@ vi.mock('../../../src/db/schema', () => ({ })); // Import after mocks +import { like, or } from 'drizzle-orm'; + import { persistError, persistErrorBatch, @@ -564,6 +566,28 @@ describe('Observability Service', () => { expect(result.total).toBe(5); }); + it('should search serialized context as well as error messages', async () => { + mockSelectFrom.mockReturnValueOnce({ + where: vi.fn().mockResolvedValue([{ count: 0 }]), + }); + mockSelectFrom.mockReturnValueOnce({ + where: vi.fn().mockReturnValue({ + orderBy: vi.fn().mockReturnValue({ + limit: vi.fn().mockResolvedValue([]), + }), + }), + }); + + await queryErrors(mockDb, { search: 'request-id-123' }); + + expect(like).toHaveBeenCalledWith('message', '%request-id-123%'); + expect(like).toHaveBeenCalledWith('context', '%request-id-123%'); + expect(or).toHaveBeenCalledWith( + expect.objectContaining({ type: 'like', val: '%request-id-123%' }), + expect.objectContaining({ type: 'like', val: '%request-id-123%' }) + ); + }); + it('should gracefully ignore invalid cursor', async () => { mockSelectFrom.mockReturnValueOnce({ where: vi.fn().mockResolvedValue([{ count: 0 }]), diff --git a/docs/guides/deployment-troubleshooting.md b/docs/guides/deployment-troubleshooting.md index bbc9c48a7..90b006c4e 100644 --- a/docs/guides/deployment-troubleshooting.md +++ b/docs/guides/deployment-troubleshooting.md @@ -328,6 +328,31 @@ curl -I https://app.example.com --- +#### "Project chat session returns Internal server error" + +**Symptoms:** + +- The project chat list loads, but opening one session fails +- Browser/network response includes `CHAT_SESSION_LOAD_FAILED` +- The response includes `requestId` and `phase` + +**How diagnostics work:** + +`GET /api/projects/:projectId/sessions/:sessionId` in `apps/api/src/routes/chat.ts` wraps the session detail load phases separately. A `phase` of `get_session` points at the session lookup; `get_messages` points at the message page load from ProjectData. The same handler persists `chat.session_detail_load_failed` to `OBSERVABILITY_DATABASE` with the `requestId`, `projectId`, `sessionId`, `phase`, and sanitized error fields so `/admin/errors` can be searched even if Worker tail logs are sampled. + +Admin and superadmin callers also receive a `details` object with sanitized error name/message/stack in the API response. Regular users only receive the generic message, `requestId`, and `phase`. + +Malformed `tool_metadata` on a single chat message is handled in `apps/api/src/durable-objects/project-data/row-schemas.ts` by `parseChatMessageRow()`, which returns `toolMetadata: null` for invalid JSON instead of making the whole session unloadable. + +**Solutions:** + +1. Copy the response `requestId` and `phase`. +2. Search `/admin/errors` for the `requestId`, or filter source `api` and message `chat.session_detail_load_failed`. +3. If `phase` is `get_messages`, inspect the ProjectData DO message rows for malformed or unexpected message data. +4. If `phase` is `get_session`, inspect the ProjectData DO session row and project ownership mapping. + +--- + ### Resume & Recovery #### Resuming a Failed Deployment diff --git a/tasks/backlog/2026-05-06-project-chat-session-error-diagnostics.md b/tasks/active/2026-05-06-project-chat-session-error-diagnostics.md similarity index 86% rename from tasks/backlog/2026-05-06-project-chat-session-error-diagnostics.md rename to tasks/active/2026-05-06-project-chat-session-error-diagnostics.md index 338f4dc7c..9bd827e1c 100644 --- a/tasks/backlog/2026-05-06-project-chat-session-error-diagnostics.md +++ b/tasks/active/2026-05-06-project-chat-session-error-diagnostics.md @@ -30,14 +30,14 @@ The affected user flow is loading a project chat session: ## Implementation Checklist -- [ ] Add a safe helper for chat session load diagnostics with a generated request ID, phase, project ID, session ID, error name/message/stack, and sanitized response details. -- [ ] Persist chat session detail load failures to observability D1 when available, not only sampled Worker logs. -- [ ] Return a structured 500 response for chat session detail failures that includes a safe `requestId` and phase. -- [ ] Expose admin-only diagnostic details for chat session load errors while keeping regular-user responses safe. -- [ ] Make `tool_metadata` parsing resilient so malformed metadata does not prevent a session from loading. -- [ ] Add regression tests for malformed `tool_metadata` and chat session detail diagnostic responses. -- [ ] Update documentation for the troubleshooting/debugging path. -- [ ] Run relevant API tests and quality checks. +- [x] Add a safe helper for chat session load diagnostics with a generated request ID, phase, project ID, session ID, error name/message/stack, and sanitized response details. +- [x] Persist chat session detail load failures to observability D1 when available, not only sampled Worker logs. +- [x] Return a structured 500 response for chat session detail failures that includes a safe `requestId` and phase. +- [x] Expose admin-only diagnostic details for chat session load errors while keeping regular-user responses safe. +- [x] Make `tool_metadata` parsing resilient so malformed metadata does not prevent a session from loading. +- [x] Add regression tests for malformed `tool_metadata` and chat session detail diagnostic responses. +- [x] Update documentation for the troubleshooting/debugging path. +- [x] Run relevant API tests and quality checks. - [ ] Deploy to staging and verify the changed endpoint behavior with a real authenticated request. ## Acceptance Criteria