diff --git a/apps/api/.env.example b/apps/api/.env.example index 776f34392..c544baa69 100644 --- a/apps/api/.env.example +++ b/apps/api/.env.example @@ -229,6 +229,7 @@ BASE_DOMAIN=workspaces.example.com # Workspace idle timeout (ProjectData DO) # WORKSPACE_IDLE_TIMEOUT_MS=7200000 # 2 hours — global default idle timeout before workspace is stopped (overridable per-project) +# WORKSPACE_STOPPED_TTL_MS=300000 # 5 minutes — auto-delete stopped workspaces after this TTL # Task agent configuration # DEFAULT_TASK_AGENT_TYPE=opencode # Agent used for autonomous task execution diff --git a/apps/api/src/durable-objects/node-lifecycle.ts b/apps/api/src/durable-objects/node-lifecycle.ts index 2caeda550..122407ee2 100644 --- a/apps/api/src/durable-objects/node-lifecycle.ts +++ b/apps/api/src/durable-objects/node-lifecycle.ts @@ -14,6 +14,11 @@ * alarm() → on `warm`: sets `destroying`, updates D1, cron handles teardown * → on `active`: no-op (was claimed between schedule and fire) * + * Workspace auto-deletion: + * scheduleWorkspaceDeletion(workspaceId, userId) → stores pending deletion, recalculates alarm + * cancelWorkspaceDeletion(workspaceId) → removes pending deletion, recalculates alarm + * alarm() → also processes expired workspace deletions (calls VM agent, updates D1) + * * Actual infrastructure destruction (Hetzner API, DNS) is handled by the * cron sweep, NOT by this DO — because user credentials are encrypted in D1 * and must be decrypted with CREDENTIAL_ENCRYPTION_KEY (or ENCRYPTION_KEY @@ -21,18 +26,22 @@ * * See: specs/021-task-chat-architecture/tasks.md (Phase 5) */ -import type { NodeLifecycleState,NodeLifecycleStatus } from '@simple-agent-manager/shared'; +import type { NodeLifecycleState, NodeLifecycleStatus } from '@simple-agent-manager/shared'; import { DEFAULT_NODE_LIFECYCLE_ALARM_RETRY_MS, DEFAULT_NODE_WARM_TIMEOUT_MS, + DEFAULT_WORKSPACE_STOPPED_TTL_MS, } from '@simple-agent-manager/shared'; import { DurableObject } from 'cloudflare:workers'; +import type { Env } from '../env'; import { log } from '../lib/logger'; +import { deleteWorkspaceOnNode } from '../services/node-agent'; type NodeLifecycleEnv = { DATABASE: D1Database; NODE_WARM_TIMEOUT_MS?: string; + WORKSPACE_STOPPED_TTL_MS?: string; }; interface StoredState { @@ -45,6 +54,12 @@ interface StoredState { warmTimeoutOverrideMs?: number | null; } +interface PendingWorkspaceDeletion { + workspaceId: string; + userId: string; + deleteAt: number; +} + export class NodeLifecycle extends DurableObject { /** * Mark a node as idle (warm). Called after the last workspace on the node @@ -73,8 +88,8 @@ export class NodeLifecycle extends DurableObject { }; await this.ctx.storage.put('state', newState); - // Schedule (or reschedule) alarm - await this.ctx.storage.setAlarm(now + warmTimeout); + // Recalculate alarm considering both warm timeout and pending workspace deletions + await this.recalculateAlarm(now + warmTimeout); // Update D1 warm_since column await this.updateD1WarmSince(nodeId, new Date(now).toISOString()); @@ -84,7 +99,7 @@ export class NodeLifecycle extends DurableObject { /** * Mark a node as active. Called when a workspace starts on the node. - * Cancels any pending warm timeout alarm. + * Cancels any pending warm timeout alarm but preserves workspace deletion alarms. */ async markActive(): Promise { const state = await this.getStoredState(); @@ -97,8 +112,8 @@ export class NodeLifecycle extends DurableObject { state.warmSince = null; await this.ctx.storage.put('state', state); - // Cancel any pending alarm - await this.ctx.storage.deleteAlarm(); + // Recalculate alarm — pending workspace deletions still need to fire + await this.recalculateAlarm(null); // Clear D1 warm_since await this.updateD1WarmSince(state.nodeId, null); @@ -128,8 +143,8 @@ export class NodeLifecycle extends DurableObject { state.warmSince = null; await this.ctx.storage.put('state', state); - // Cancel alarm - await this.ctx.storage.deleteAlarm(); + // Recalculate alarm — pending workspace deletions still need to fire + await this.recalculateAlarm(null); // Clear D1 warm_since await this.updateD1WarmSince(state.nodeId, null); @@ -148,19 +163,65 @@ export class NodeLifecycle extends DurableObject { return this.toPublicState(state); } + // ========================================================================= + // Workspace auto-deletion scheduling + // ========================================================================= + + /** + * Schedule a stopped workspace for automatic deletion after the configured TTL. + * Called when a workspace transitions to 'stopped' status. + */ + async scheduleWorkspaceDeletion(workspaceId: string, userId: string): Promise { + const ttl = this.getWorkspaceStoppedTtlMs(); + const deleteAt = Date.now() + ttl; + + const entry: PendingWorkspaceDeletion = { workspaceId, userId, deleteAt }; + await this.ctx.storage.put(`ws-delete:${workspaceId}`, entry); + + log.info('node_lifecycle.workspace_deletion_scheduled', { + workspaceId, + userId, + deleteAt: new Date(deleteAt).toISOString(), + ttlMs: ttl, + }); + + await this.recalculateAlarm(await this.getWarmAlarmTime()); + } + /** - * Alarm handler. Fires when the warm timeout expires. + * Cancel a pending workspace deletion. Called when a workspace is restarted + * before the TTL expires. + */ + async cancelWorkspaceDeletion(workspaceId: string): Promise { + await this.ctx.storage.delete(`ws-delete:${workspaceId}`); + + log.info('node_lifecycle.workspace_deletion_cancelled', { workspaceId }); + + await this.recalculateAlarm(await this.getWarmAlarmTime()); + } + + // ========================================================================= + // Alarm handler + // ========================================================================= + + /** + * Alarm handler. Fires when either: + * 1. The warm timeout expires (node should be destroyed) + * 2. A workspace deletion is due * - * If the node is still warm, transitions to `destroying` and marks D1 - * for the cron sweep to handle actual infrastructure teardown. - * If the node was claimed between schedule and fire, this is a no-op. + * Processes expired workspace deletions first, then handles warm timeout. */ async alarm(): Promise { + // Process any expired workspace deletions + await this.processExpiredDeletions(); + const state = await this.getStoredState(); if (!state) return; // No-op if node was claimed (active) or already destroying if (state.status === 'active') { + // Still recalculate alarm for any remaining pending workspace deletions + await this.recalculateAlarm(null); return; } @@ -171,7 +232,18 @@ export class NodeLifecycle extends DurableObject { return; } - // status === 'warm' → transition to destroying + // status === 'warm' → check if warm timeout has actually expired + if (state.warmSince) { + const warmTimeout = state.warmTimeoutOverrideMs ?? this.getWarmTimeoutMs(); + const warmExpiry = state.warmSince + warmTimeout; + if (Date.now() < warmExpiry) { + // Warm timeout hasn't expired yet — alarm fired for workspace deletion only + await this.recalculateAlarm(warmExpiry); + return; + } + } + + // Warm timeout expired → transition to destroying state.status = 'destroying'; await this.ctx.storage.put('state', state); @@ -193,8 +265,8 @@ export class NodeLifecycle extends DurableObject { nodeId: state.nodeId, error: err instanceof Error ? err.message : String(err), }); - // Schedule retry - await this.ctx.storage.setAlarm(Date.now() + DEFAULT_NODE_LIFECYCLE_ALARM_RETRY_MS); + // Schedule retry (use recalculateAlarm to not delay pending workspace deletions) + await this.recalculateAlarm(Date.now() + DEFAULT_NODE_LIFECYCLE_ALARM_RETRY_MS); } } @@ -215,6 +287,15 @@ export class NodeLifecycle extends DurableObject { return DEFAULT_NODE_WARM_TIMEOUT_MS; } + private getWorkspaceStoppedTtlMs(): number { + const envValue = this.env.WORKSPACE_STOPPED_TTL_MS; + if (envValue) { + const parsed = parseInt(envValue, 10); + if (Number.isFinite(parsed) && parsed > 0) return parsed; + } + return DEFAULT_WORKSPACE_STOPPED_TTL_MS; + } + private toPublicState(state: StoredState): NodeLifecycleState { return { nodeId: state.nodeId, @@ -238,4 +319,114 @@ export class NodeLifecycle extends DurableObject { }); } } + + /** + * Get all pending workspace deletions from DO storage. + */ + private async getPendingDeletions(): Promise> { + return await this.ctx.storage.list({ prefix: 'ws-delete:' }); + } + + /** + * Process all workspace deletions whose deleteAt time has passed. + */ + private async processExpiredDeletions(): Promise { + const pending = await this.getPendingDeletions(); + const now = Date.now(); + + // Load state once to get nodeId — avoids N storage reads in the loop + const state = await this.getStoredState(); + if (!state) return; + + for (const [key, entry] of pending) { + if (entry.deleteAt > now) continue; + + try { + await this.deleteWorkspace(state.nodeId, entry.workspaceId, entry.userId); + await this.ctx.storage.delete(key); + + log.info('node_lifecycle.workspace_auto_deleted', { + workspaceId: entry.workspaceId, + userId: entry.userId, + }); + } catch (err) { + log.error('node_lifecycle.workspace_deletion_failed', { + workspaceId: entry.workspaceId, + userId: entry.userId, + error: err instanceof Error ? err.message : String(err), + }); + // Leave the entry for retry on next alarm. Push deleteAt forward slightly + // to avoid tight retry loops. + entry.deleteAt = now + DEFAULT_NODE_LIFECYCLE_ALARM_RETRY_MS; + await this.ctx.storage.put(key, entry); + } + } + } + + /** + * Delete a workspace: call VM agent to remove Docker container + volume, + * then update D1 status to 'deleted'. + */ + private async deleteWorkspace(nodeId: string, workspaceId: string, userId: string): Promise { + // Call VM agent DELETE endpoint via shared helper (handles JWT auth, proper URL routing) + try { + await deleteWorkspaceOnNode(nodeId, workspaceId, this.env as unknown as Env, userId); + } catch (err) { + // If the node is unreachable (already destroyed), log but don't fail + // The D1 status update below still marks the workspace as deleted + log.warn('node_lifecycle.workspace_delete_vm_agent_failed', { + workspaceId, + nodeId, + error: err instanceof Error ? err.message : String(err), + }); + } + + // Update D1 workspace status to 'deleted' + const now = new Date().toISOString(); + await this.env.DATABASE.prepare( + `UPDATE workspaces SET status = 'deleted', updated_at = ? WHERE id = ? AND status = 'stopped'` + ).bind(now, workspaceId).run(); + + // Clean up any agent_sessions referencing this workspace (best-effort) + try { + await this.env.DATABASE.prepare( + `UPDATE agent_sessions SET status = 'completed', updated_at = ? WHERE workspace_id = ? AND status NOT IN ('completed', 'failed')` + ).bind(now, workspaceId).run(); + } catch { + // best-effort + } + } + + /** + * Get the warm alarm time if the node is in warm state. + */ + private async getWarmAlarmTime(): Promise { + const state = await this.getStoredState(); + if (!state || state.status !== 'warm' || !state.warmSince) return null; + const warmTimeout = state.warmTimeoutOverrideMs ?? this.getWarmTimeoutMs(); + return state.warmSince + warmTimeout; + } + + /** + * Recalculate and set the alarm to the earliest time needed: + * either the warm timeout expiry or the earliest pending workspace deletion. + * + * @param warmAlarmTime - The warm timeout expiry time, or null if not applicable + */ + private async recalculateAlarm(warmAlarmTime: number | null): Promise { + let earliest = warmAlarmTime; + + const pending = await this.getPendingDeletions(); + for (const [, entry] of pending) { + if (earliest === null || entry.deleteAt < earliest) { + earliest = entry.deleteAt; + } + } + + if (earliest !== null) { + await this.ctx.storage.setAlarm(earliest); + } else { + await this.ctx.storage.deleteAlarm(); + } + } } diff --git a/apps/api/src/durable-objects/project-data/index.ts b/apps/api/src/durable-objects/project-data/index.ts index 295f974b5..73677bcee 100644 --- a/apps/api/src/durable-objects/project-data/index.ts +++ b/apps/api/src/durable-objects/project-data/index.ts @@ -350,7 +350,24 @@ export class ProjectData extends DurableObject { (type, payload, sid) => this.broadcastEvent(type, payload, sid), () => this.scheduleSummarySync()); await idleCleanup.processExpiredCleanups(this.sql, this.env, (taskId) => idleCleanup.completeTaskInD1(this.env.DATABASE, taskId), - (workspaceId) => idleCleanup.stopWorkspaceInD1(this.env.DATABASE, workspaceId), + async (workspaceId) => { + await idleCleanup.stopWorkspaceInD1(this.env.DATABASE, workspaceId); + // Schedule automatic deletion after TTL (best-effort) + try { + const workerEnv = this.env as unknown as import('../../env').Env; + const wsRow = await workerEnv.DATABASE.prepare( + 'SELECT node_id, user_id FROM workspaces WHERE id = ?' + ).bind(workspaceId).first<{ node_id: string | null; user_id: string }>(); + if (wsRow?.node_id) { + const doId = workerEnv.NODE_LIFECYCLE.idFromName(wsRow.node_id); + const stub = workerEnv.NODE_LIFECYCLE.get(doId); + await (stub as unknown as import('../node-lifecycle').NodeLifecycle) + .scheduleWorkspaceDeletion(workspaceId, wsRow.user_id); + } + } catch { + // Best-effort — cron safety-net will catch it + } + }, (type, payload, sid) => this.broadcastEvent(type, payload, sid), () => this.scheduleSummarySync()); // Mailbox delivery sweep: expire stale messages and re-queue unacked ones diff --git a/apps/api/src/durable-objects/task-runner/state-machine.ts b/apps/api/src/durable-objects/task-runner/state-machine.ts index 40054bbb9..5038525fd 100644 --- a/apps/api/src/durable-objects/task-runner/state-machine.ts +++ b/apps/api/src/durable-objects/task-runner/state-machine.ts @@ -350,6 +350,20 @@ export async function cleanupOnFailure( error: err instanceof Error ? err.message : String(err), }); } + + // Schedule automatic deletion after TTL (best-effort) + try { + const doId = rc.env.NODE_LIFECYCLE.idFromName(state.stepResults.nodeId); + const stub = rc.env.NODE_LIFECYCLE.get(doId); + await (stub as unknown as import('../node-lifecycle').NodeLifecycle) + .scheduleWorkspaceDeletion(state.stepResults.workspaceId, state.userId); + } catch (err) { + log.warn('task_runner_do.cleanup.schedule_deletion_failed', { + taskId: state.taskId, + workspaceId: state.stepResults.workspaceId, + error: err instanceof Error ? err.message : String(err), + }); + } } // Clean up auto-provisioned node. If a workspace exists, cleanupTaskRun diff --git a/apps/api/src/env.ts b/apps/api/src/env.ts index 1b0b44cce..29e30ace2 100644 --- a/apps/api/src/env.ts +++ b/apps/api/src/env.ts @@ -139,6 +139,8 @@ export interface Env { ORPHANED_WORKSPACE_GRACE_PERIOD_MS?: string; // Workspace idle timeout (global default, overridable per-project) WORKSPACE_IDLE_TIMEOUT_MS?: string; + // Auto-delete stopped workspaces after this TTL (default: 300000 = 5 minutes) + WORKSPACE_STOPPED_TTL_MS?: string; // Task agent configuration DEFAULT_TASK_AGENT_TYPE?: string; // Built-in profile model overrides (defaults: claude-sonnet-4-5-20250929, claude-opus-4-6) diff --git a/apps/api/src/routes/workspaces/lifecycle.ts b/apps/api/src/routes/workspaces/lifecycle.ts index 11ff3fafb..82d486ebb 100644 --- a/apps/api/src/routes/workspaces/lifecycle.ts +++ b/apps/api/src/routes/workspaces/lifecycle.ts @@ -68,6 +68,16 @@ lifecycleRoutes.post('/:id/stop', requireAuth(), requireApproved(), async (c) => await stopComputeTracking(innerDb, workspace.id).catch((e) => { log.warn('workspace.compute_tracking_stop_failed', { workspaceId: workspace.id, error: String(e) }); }); + + // Schedule automatic deletion after TTL + try { + const doId = c.env.NODE_LIFECYCLE.idFromName(workspace.nodeId!); + const stub = c.env.NODE_LIFECYCLE.get(doId); + await (stub as unknown as import('../../durable-objects/node-lifecycle').NodeLifecycle) + .scheduleWorkspaceDeletion(workspace.id, userId); + } catch (e) { + log.warn('workspace.schedule_deletion_failed', { workspaceId: workspace.id, error: String(e) }); + } } catch (err) { await innerDb .update(schema.workspaces) @@ -122,6 +132,16 @@ lifecycleRoutes.post('/:id/restart', requireAuth(), requireApproved(), async (c) const node = await getOwnedNode(db, workspace.nodeId, userId); assertNodeOperational(node, 'restart workspace'); + // Cancel any pending auto-deletion before restarting + try { + const doId = c.env.NODE_LIFECYCLE.idFromName(workspace.nodeId!); + const stub = c.env.NODE_LIFECYCLE.get(doId); + await (stub as unknown as import('../../durable-objects/node-lifecycle').NodeLifecycle) + .cancelWorkspaceDeletion(workspace.id); + } catch (e) { + log.warn('workspace.cancel_deletion_failed', { workspaceId: workspace.id, error: String(e) }); + } + // Clear previous error state and boot logs before starting new provisioning await db .update(schema.workspaces) diff --git a/apps/api/src/scheduled/node-cleanup.ts b/apps/api/src/scheduled/node-cleanup.ts index 0d3672c70..c736978aa 100644 --- a/apps/api/src/scheduled/node-cleanup.ts +++ b/apps/api/src/scheduled/node-cleanup.ts @@ -23,14 +23,15 @@ import { DEFAULT_MAX_AUTO_NODE_LIFETIME_MS, DEFAULT_NODE_WARM_GRACE_PERIOD_MS, DEFAULT_ORPHANED_WORKSPACE_GRACE_PERIOD_MS, + DEFAULT_WORKSPACE_STOPPED_TTL_MS, } from '@simple-agent-manager/shared'; -import { eq } from 'drizzle-orm'; +import { and, eq } from 'drizzle-orm'; import { drizzle } from 'drizzle-orm/d1'; import * as schema from '../db/schema'; import type { Env } from '../env'; import { log } from '../lib/logger'; -import { stopWorkspaceOnNode } from '../services/node-agent'; +import { deleteWorkspaceOnNode, stopWorkspaceOnNode } from '../services/node-agent'; import { deleteNodeResources } from '../services/nodes'; import { persistError } from '../services/observability'; import * as projectDataService from '../services/project-data'; @@ -48,6 +49,7 @@ export interface NodeCleanupResult { lifetimeSkipped: number; orphanedWorkspacesFlagged: number; orphanedNodesFlagged: number; + stoppedWorkspacesDeleted: number; errors: number; } @@ -63,6 +65,7 @@ export async function runNodeCleanupSweep(env: Env): Promise lifetimeSkipped: 0, orphanedWorkspacesFlagged: 0, orphanedNodesFlagged: 0, + stoppedWorkspacesDeleted: 0, errors: 0, }; @@ -369,5 +372,44 @@ export async function runNodeCleanupSweep(env: Env): Promise result.orphanedNodesFlagged++; } + // 5. Safety-net: delete stopped workspaces past TTL that the DO alarm missed. + // This catches cases where the DO alarm failed to fire or wasn't scheduled. + const stoppedTtlMs = parseMs(env.WORKSPACE_STOPPED_TTL_MS, DEFAULT_WORKSPACE_STOPPED_TTL_MS); + // Add a 2x grace buffer so the DO alarm has time to fire first + const stoppedGraceThreshold = new Date(now.getTime() - stoppedTtlMs * 2).toISOString(); + const staleStoppedWorkspaces = await env.DATABASE.prepare( + `SELECT w.id, w.node_id, w.user_id + FROM workspaces w + WHERE w.status = 'stopped' + AND w.updated_at < ? + LIMIT 50` + ).bind(stoppedGraceThreshold).all<{ + id: string; + node_id: string | null; + user_id: string; + }>(); + + for (const ws of staleStoppedWorkspaces.results) { + try { + // Delete on VM agent (best-effort — node may be gone) + if (ws.node_id) { + await deleteWorkspaceOnNode(ws.node_id, ws.id, env, ws.user_id).catch((e) => { + log.warn('node_cleanup.stale_stopped_delete_on_node_failed', { workspaceId: ws.id, error: String(e) }); + }); + } + + // Mark as deleted in D1 (status guard prevents TOCTOU race if workspace was restarted) + await db + .update(schema.workspaces) + .set({ status: 'deleted', updatedAt: new Date().toISOString() }) + .where(and(eq(schema.workspaces.id, ws.id), eq(schema.workspaces.status, 'stopped'))); + + result.stoppedWorkspacesDeleted++; + } catch (e) { + log.error('node_cleanup.stale_stopped_workspace_delete_failed', { workspaceId: ws.id, error: String(e) }); + result.errors++; + } + } + return result; } diff --git a/apps/api/src/services/task-runner.ts b/apps/api/src/services/task-runner.ts index cae3367ad..1a8a492e1 100644 --- a/apps/api/src/services/task-runner.ts +++ b/apps/api/src/services/task-runner.ts @@ -104,6 +104,22 @@ export async function cleanupTaskRun( }); } + // Schedule automatic deletion after TTL (best-effort) + if (workspace.nodeId && (workspace.status === 'running' || workspace.status === 'recovery' || workspace.status === 'stopped')) { + try { + const doId = env.NODE_LIFECYCLE.idFromName(workspace.nodeId); + const stub = env.NODE_LIFECYCLE.get(doId); + await (stub as unknown as import('../durable-objects/node-lifecycle').NodeLifecycle) + .scheduleWorkspaceDeletion(workspace.id, task.userId); + } catch (e) { + log.warn('task_run.cleanup.schedule_deletion_failed', { + taskId, + workspaceId: workspace.id, + error: e instanceof Error ? e.message : String(e), + }); + } + } + // If node was auto-provisioned for this task, check if it can be cleaned up if (task.autoProvisionedNodeId) { await cleanupAutoProvisionedNode( diff --git a/apps/api/tests/integration/warm-node-pooling.test.ts b/apps/api/tests/integration/warm-node-pooling.test.ts index bb647f092..9e429ba27 100644 --- a/apps/api/tests/integration/warm-node-pooling.test.ts +++ b/apps/api/tests/integration/warm-node-pooling.test.ts @@ -39,7 +39,7 @@ describe('warm node pooling lifecycle integration', () => { it('DO.markIdle sets warm status and schedules alarm', () => { expect(doFile).toContain("status: 'warm'"); - expect(doFile).toContain('setAlarm(now + warmTimeout)'); + expect(doFile).toContain('recalculateAlarm(now + warmTimeout)'); }); it('DO.markIdle updates D1 warm_since', () => { @@ -64,7 +64,7 @@ describe('warm node pooling lifecycle integration', () => { const tryClaimSection = doFile.slice(doFile.indexOf('async tryClaim')); expect(tryClaimSection).toContain("state.status = 'active'"); expect(tryClaimSection).toContain('state.claimedByTask = taskId'); - expect(tryClaimSection).toContain('deleteAlarm()'); + expect(tryClaimSection).toContain('recalculateAlarm(null)'); }); }); diff --git a/apps/api/tests/unit/node-cleanup.test.ts b/apps/api/tests/unit/node-cleanup.test.ts index 2c9b85544..d0b3a52c7 100644 --- a/apps/api/tests/unit/node-cleanup.test.ts +++ b/apps/api/tests/unit/node-cleanup.test.ts @@ -15,6 +15,18 @@ vi.mock('../../src/services/nodes', () => ({ deleteNodeResources: vi.fn().mockResolvedValue(undefined), })); +// Mock node-agent service +vi.mock('../../src/services/node-agent', () => ({ + deleteWorkspaceOnNode: vi.fn().mockResolvedValue(undefined), + stopWorkspaceOnNode: vi.fn().mockResolvedValue(undefined), +})); + +// Mock project-data service +vi.mock('../../src/services/project-data', () => ({ + stopSession: vi.fn().mockResolvedValue(undefined), + cleanupWorkspaceActivity: vi.fn().mockResolvedValue(undefined), +})); + // Mock persistError vi.mock('../../src/services/observability', () => ({ persistError: vi.fn().mockResolvedValue(undefined), @@ -228,6 +240,7 @@ describe('runNodeCleanupSweep', () => { lifetimeSkipped: 0, orphanedWorkspacesFlagged: 0, orphanedNodesFlagged: 0, + stoppedWorkspacesDeleted: 0, errors: 0, }); }); diff --git a/apps/api/tests/unit/node-lifecycle-workspace-deletion.test.ts b/apps/api/tests/unit/node-lifecycle-workspace-deletion.test.ts new file mode 100644 index 000000000..4ca8e572c --- /dev/null +++ b/apps/api/tests/unit/node-lifecycle-workspace-deletion.test.ts @@ -0,0 +1,252 @@ +/** + * Source contract tests for NodeLifecycle workspace auto-deletion feature. + * + * Verifies the NodeLifecycle DO workspace deletion scheduling, + * cancellation, and alarm-based deletion logic. + */ +import { readFileSync } from 'node:fs'; +import { resolve } from 'node:path'; + +import { describe, expect, it } from 'vitest'; + +describe('NodeLifecycle workspace auto-deletion source contract', () => { + const doFile = readFileSync(resolve(process.cwd(), 'src/durable-objects/node-lifecycle.ts'), 'utf8'); + const envFile = readFileSync(resolve(process.cwd(), 'src/env.ts'), 'utf8'); + const sharedConstantsFile = readFileSync( + resolve(process.cwd(), '../../packages/shared/src/constants/node-pooling.ts'), + 'utf8', + ); + const sharedIndexFile = readFileSync( + resolve(process.cwd(), '../../packages/shared/src/constants/index.ts'), + 'utf8', + ); + + describe('shared constant', () => { + it('defines DEFAULT_WORKSPACE_STOPPED_TTL_MS in node-pooling.ts', () => { + expect(sharedConstantsFile).toContain('export const DEFAULT_WORKSPACE_STOPPED_TTL_MS'); + expect(sharedConstantsFile).toContain('5 * 60 * 1000'); + }); + + it('exports DEFAULT_WORKSPACE_STOPPED_TTL_MS from constants barrel', () => { + expect(sharedIndexFile).toContain('DEFAULT_WORKSPACE_STOPPED_TTL_MS'); + }); + }); + + describe('env configuration', () => { + it('WORKSPACE_STOPPED_TTL_MS is defined in Env type', () => { + expect(envFile).toContain('WORKSPACE_STOPPED_TTL_MS?: string'); + }); + + it('NodeLifecycleEnv includes WORKSPACE_STOPPED_TTL_MS', () => { + expect(doFile).toContain('WORKSPACE_STOPPED_TTL_MS?: string'); + }); + }); + + describe('scheduleWorkspaceDeletion method', () => { + it('exists as a public async method', () => { + expect(doFile).toContain('async scheduleWorkspaceDeletion(workspaceId: string, userId: string)'); + }); + + it('reads configured TTL via getWorkspaceStoppedTtlMs', () => { + expect(doFile).toContain('this.getWorkspaceStoppedTtlMs()'); + }); + + it('stores entries with ws-delete: prefix in DO storage', () => { + expect(doFile).toContain('`ws-delete:${workspaceId}`'); + }); + + it('stores PendingWorkspaceDeletion with workspaceId, userId, deleteAt', () => { + expect(doFile).toContain('const entry: PendingWorkspaceDeletion = { workspaceId, userId, deleteAt }'); + }); + + it('recalculates alarm after scheduling', () => { + const method = doFile.slice(doFile.indexOf('async scheduleWorkspaceDeletion')); + expect(method).toContain('recalculateAlarm'); + }); + }); + + describe('cancelWorkspaceDeletion method', () => { + it('exists as a public async method', () => { + expect(doFile).toContain('async cancelWorkspaceDeletion(workspaceId: string)'); + }); + + it('deletes the ws-delete: entry from DO storage', () => { + expect(doFile).toContain("this.ctx.storage.delete(`ws-delete:${workspaceId}`)"); + }); + + it('recalculates alarm after cancellation', () => { + const method = doFile.slice(doFile.indexOf('async cancelWorkspaceDeletion')); + const methodEnd = method.indexOf('async ', 10); + const methodBody = method.slice(0, methodEnd > 0 ? methodEnd : undefined); + expect(methodBody).toContain('recalculateAlarm'); + }); + }); + + describe('alarm handler processes workspace deletions', () => { + it('calls processExpiredDeletions in alarm()', () => { + const alarmMethod = doFile.slice(doFile.indexOf('async alarm()')); + expect(alarmMethod).toContain('processExpiredDeletions'); + }); + + it('processExpiredDeletions checks deleteAt against now', () => { + expect(doFile).toContain('entry.deleteAt > now'); + }); + + it('calls deleteWorkspace for expired entries', () => { + expect(doFile).toContain('await this.deleteWorkspace(state.nodeId, entry.workspaceId, entry.userId)'); + }); + + it('removes processed entries from storage', () => { + const processMethod = doFile.slice(doFile.indexOf('processExpiredDeletions')); + expect(processMethod).toContain('this.ctx.storage.delete(key)'); + }); + + it('retries failed deletions by pushing deleteAt forward', () => { + expect(doFile).toContain('entry.deleteAt = now + DEFAULT_NODE_LIFECYCLE_ALARM_RETRY_MS'); + }); + }); + + describe('deleteWorkspace implementation', () => { + it('calls deleteWorkspaceOnNode via shared helper (proper JWT auth)', () => { + expect(doFile).toContain('deleteWorkspaceOnNode(nodeId, workspaceId, this.env as unknown as Env, userId)'); + }); + + it('updates D1 workspace status to deleted', () => { + expect(doFile).toContain("UPDATE workspaces SET status = 'deleted'"); + }); + + it('only deletes workspaces that are still stopped', () => { + expect(doFile).toContain("AND status = 'stopped'"); + }); + + it('cleans up agent_sessions (best-effort)', () => { + expect(doFile).toContain('UPDATE agent_sessions SET status'); + }); + + it('handles unreachable VM agent gracefully (node may be gone)', () => { + expect(doFile).toContain('node_lifecycle.workspace_delete_vm_agent_failed'); + }); + }); + + describe('recalculateAlarm picks earliest time', () => { + it('considers warm alarm time', () => { + const method = doFile.slice(doFile.indexOf('private async recalculateAlarm')); + expect(method).toContain('warmAlarmTime'); + }); + + it('considers pending workspace deletions', () => { + const method = doFile.slice(doFile.indexOf('private async recalculateAlarm')); + expect(method).toContain('getPendingDeletions'); + }); + + it('picks the minimum of all times', () => { + const method = doFile.slice(doFile.indexOf('private async recalculateAlarm')); + expect(method).toContain('entry.deleteAt < earliest'); + }); + + it('deletes alarm when no pending events', () => { + const method = doFile.slice(doFile.indexOf('private async recalculateAlarm')); + expect(method).toContain('deleteAlarm()'); + }); + + it('sets alarm to earliest time', () => { + const method = doFile.slice(doFile.indexOf('private async recalculateAlarm')); + expect(method).toContain('setAlarm(earliest)'); + }); + }); + + describe('markActive preserves workspace deletion alarms', () => { + it('calls recalculateAlarm(null) instead of deleteAlarm()', () => { + const markActiveStart = doFile.indexOf('async markActive()'); + const markActiveEnd = doFile.indexOf('async tryClaim', markActiveStart); + const markActiveBody = doFile.slice(markActiveStart, markActiveEnd); + expect(markActiveBody).toContain('recalculateAlarm(null)'); + expect(markActiveBody).not.toContain('deleteAlarm()'); + }); + }); + + describe('tryClaim preserves workspace deletion alarms', () => { + it('calls recalculateAlarm(null) instead of deleteAlarm()', () => { + const tryClaimStart = doFile.indexOf('async tryClaim('); + const tryClaimEnd = doFile.indexOf('async getStatus', tryClaimStart); + const tryClaimBody = doFile.slice(tryClaimStart, tryClaimEnd); + expect(tryClaimBody).toContain('recalculateAlarm(null)'); + expect(tryClaimBody).not.toContain('deleteAlarm()'); + }); + }); + + describe('configurable workspace stopped TTL', () => { + it('uses WORKSPACE_STOPPED_TTL_MS env var', () => { + expect(doFile).toContain('WORKSPACE_STOPPED_TTL_MS'); + }); + + it('falls back to DEFAULT_WORKSPACE_STOPPED_TTL_MS', () => { + expect(doFile).toContain('DEFAULT_WORKSPACE_STOPPED_TTL_MS'); + }); + + it('imports the constant from shared', () => { + expect(doFile).toContain("DEFAULT_WORKSPACE_STOPPED_TTL_MS,"); + }); + }); + + describe('callers schedule deletion on workspace stop', () => { + const lifecycleFile = readFileSync(resolve(process.cwd(), 'src/routes/workspaces/lifecycle.ts'), 'utf8'); + const taskRunnerFile = readFileSync(resolve(process.cwd(), 'src/services/task-runner.ts'), 'utf8'); + const stateMachineFile = readFileSync( + resolve(process.cwd(), 'src/durable-objects/task-runner/state-machine.ts'), + 'utf8', + ); + + it('lifecycle stop route calls scheduleWorkspaceDeletion', () => { + expect(lifecycleFile).toContain('scheduleWorkspaceDeletion(workspace.id, userId)'); + }); + + it('lifecycle restart route calls cancelWorkspaceDeletion', () => { + expect(lifecycleFile).toContain('cancelWorkspaceDeletion(workspace.id)'); + }); + + it('cleanupTaskRun calls scheduleWorkspaceDeletion', () => { + expect(taskRunnerFile).toContain('scheduleWorkspaceDeletion(workspace.id, task.userId)'); + }); + + it('cleanupOnFailure calls scheduleWorkspaceDeletion', () => { + expect(stateMachineFile).toContain( + 'scheduleWorkspaceDeletion(state.stepResults.workspaceId, state.userId)', + ); + }); + + it('idle cleanup processExpiredCleanups schedules deletion after stop', () => { + const projectDataIndexFile = readFileSync( + resolve(process.cwd(), 'src/durable-objects/project-data/index.ts'), + 'utf8', + ); + expect(projectDataIndexFile).toContain('scheduleWorkspaceDeletion(workspaceId, wsRow.user_id)'); + }); + }); + + describe('cron safety-net for stale stopped workspaces', () => { + const cronFile = readFileSync(resolve(process.cwd(), 'src/scheduled/node-cleanup.ts'), 'utf8'); + + it('imports DEFAULT_WORKSPACE_STOPPED_TTL_MS', () => { + expect(cronFile).toContain('DEFAULT_WORKSPACE_STOPPED_TTL_MS'); + }); + + it('includes stoppedWorkspacesDeleted in result', () => { + expect(cronFile).toContain('stoppedWorkspacesDeleted'); + }); + + it('queries for stopped workspaces past TTL', () => { + expect(cronFile).toContain("w.status = 'stopped'"); + expect(cronFile).toContain('stoppedGraceThreshold'); + }); + + it('calls deleteWorkspaceOnNode for stale stopped workspaces', () => { + expect(cronFile).toContain('deleteWorkspaceOnNode(ws.node_id, ws.id, env, ws.user_id)'); + }); + + it('marks stale stopped workspaces as deleted in D1', () => { + const step5Section = cronFile.slice(cronFile.indexOf('Safety-net')); + expect(step5Section).toContain("status: 'deleted'"); + }); + }); +}); diff --git a/apps/api/tests/unit/node-lifecycle.test.ts b/apps/api/tests/unit/node-lifecycle.test.ts index 7871ea1ab..3e7167aa2 100644 --- a/apps/api/tests/unit/node-lifecycle.test.ts +++ b/apps/api/tests/unit/node-lifecycle.test.ts @@ -47,8 +47,8 @@ describe('NodeLifecycle DO source contract', () => { expect(doFile).toContain('warmSince: now'); }); - it('schedules alarm at now + timeout', () => { - expect(doFile).toContain('setAlarm(now + warmTimeout)'); + it('recalculates alarm considering warm timeout and pending deletions', () => { + expect(doFile).toContain('recalculateAlarm(now + warmTimeout)'); }); it('updates D1 warm_since column', () => { @@ -69,8 +69,9 @@ describe('NodeLifecycle DO source contract', () => { expect(doFile).toContain('state.claimedByTask = null'); }); - it('cancels alarm via deleteAlarm', () => { - expect(doFile).toContain('this.ctx.storage.deleteAlarm()'); + it('recalculates alarm (preserves workspace deletion alarms)', () => { + const markActiveSection = doFile.slice(doFile.indexOf('async markActive()')); + expect(markActiveSection).toContain('recalculateAlarm(null)'); }); it('clears D1 warm_since', () => { @@ -96,10 +97,9 @@ describe('NodeLifecycle DO source contract', () => { expect(doFile).toContain('state.claimedByTask = taskId'); }); - it('cancels alarm on successful claim', () => { - // deleteAlarm is called inside tryClaim after claiming + it('recalculates alarm on successful claim (preserves workspace deletion alarms)', () => { const tryClaimSection = doFile.slice(doFile.indexOf('async tryClaim')); - expect(tryClaimSection).toContain('deleteAlarm()'); + expect(tryClaimSection).toContain('recalculateAlarm(null)'); }); }); diff --git a/packages/shared/src/constants/index.ts b/packages/shared/src/constants/index.ts index 4e673127c..9c3829f48 100644 --- a/packages/shared/src/constants/index.ts +++ b/packages/shared/src/constants/index.ts @@ -84,6 +84,7 @@ export { DEFAULT_NODE_WARM_TIMEOUT_MS, DEFAULT_ORPHANED_WORKSPACE_GRACE_PERIOD_MS, DEFAULT_WORKSPACE_IDLE_TIMEOUT_MS, + DEFAULT_WORKSPACE_STOPPED_TTL_MS, MAX_NODE_IDLE_TIMEOUT_MS, MAX_WORKSPACE_IDLE_TIMEOUT_MS, MIN_NODE_IDLE_TIMEOUT_MS, diff --git a/packages/shared/src/constants/node-pooling.ts b/packages/shared/src/constants/node-pooling.ts index a527ac5e3..d4560ade6 100644 --- a/packages/shared/src/constants/node-pooling.ts +++ b/packages/shared/src/constants/node-pooling.ts @@ -17,6 +17,13 @@ export const DEFAULT_ORPHANED_WORKSPACE_GRACE_PERIOD_MS = 10 * 60 * 1000; // 10 /** Default alarm retry delay (ms) when node destruction fails. */ export const DEFAULT_NODE_LIFECYCLE_ALARM_RETRY_MS = 60 * 1000; // 1 minute +// ============================================================================= +// Workspace Stopped Auto-Delete +// ============================================================================= + +/** Default TTL (ms) before a stopped workspace is automatically deleted. Override via WORKSPACE_STOPPED_TTL_MS env var. */ +export const DEFAULT_WORKSPACE_STOPPED_TTL_MS = 5 * 60 * 1000; // 5 minutes + // ============================================================================= // Workspace Idle Timeout (Compute Lifecycle Management) // ============================================================================= diff --git a/tasks/backlog/2026-05-06-stopped-workspace-auto-delete.md b/tasks/archive/2026-05-06-stopped-workspace-auto-delete.md similarity index 100% rename from tasks/backlog/2026-05-06-stopped-workspace-auto-delete.md rename to tasks/archive/2026-05-06-stopped-workspace-auto-delete.md