Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions apps/api/.env.example
Original file line number Diff line number Diff line change
Expand Up @@ -229,6 +229,7 @@ BASE_DOMAIN=workspaces.example.com

# Workspace idle timeout (ProjectData DO)
# WORKSPACE_IDLE_TIMEOUT_MS=7200000 # 2 hours — global default idle timeout before workspace is stopped (overridable per-project)
# WORKSPACE_STOPPED_TTL_MS=300000 # 5 minutes — auto-delete stopped workspaces after this TTL

# Task agent configuration
# DEFAULT_TASK_AGENT_TYPE=opencode # Agent used for autonomous task execution
Expand Down
221 changes: 206 additions & 15 deletions apps/api/src/durable-objects/node-lifecycle.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,25 +14,34 @@
* alarm() → on `warm`: sets `destroying`, updates D1, cron handles teardown
* → on `active`: no-op (was claimed between schedule and fire)
*
* Workspace auto-deletion:
* scheduleWorkspaceDeletion(workspaceId, userId) → stores pending deletion, recalculates alarm
* cancelWorkspaceDeletion(workspaceId) → removes pending deletion, recalculates alarm
* alarm() → also processes expired workspace deletions (calls VM agent, updates D1)
*
* Actual infrastructure destruction (Hetzner API, DNS) is handled by the
* cron sweep, NOT by this DO — because user credentials are encrypted in D1
* and must be decrypted with CREDENTIAL_ENCRYPTION_KEY (or ENCRYPTION_KEY
* fallback) in the worker context via getCredentialEncryptionKey(env).
*
* See: specs/021-task-chat-architecture/tasks.md (Phase 5)
*/
import type { NodeLifecycleState,NodeLifecycleStatus } from '@simple-agent-manager/shared';
import type { NodeLifecycleState, NodeLifecycleStatus } from '@simple-agent-manager/shared';
import {
DEFAULT_NODE_LIFECYCLE_ALARM_RETRY_MS,
DEFAULT_NODE_WARM_TIMEOUT_MS,
DEFAULT_WORKSPACE_STOPPED_TTL_MS,
} from '@simple-agent-manager/shared';
import { DurableObject } from 'cloudflare:workers';

import type { Env } from '../env';
import { log } from '../lib/logger';
import { deleteWorkspaceOnNode } from '../services/node-agent';

type NodeLifecycleEnv = {
DATABASE: D1Database;
NODE_WARM_TIMEOUT_MS?: string;
WORKSPACE_STOPPED_TTL_MS?: string;
};

interface StoredState {
Expand All @@ -45,6 +54,12 @@
warmTimeoutOverrideMs?: number | null;
}

/** A stopped workspace queued for automatic deletion once its TTL elapses. */
interface PendingWorkspaceDeletion {
// ID of the workspace to delete.
workspaceId: string;
// Owner of the workspace; forwarded to the VM agent for the delete call.
userId: string;
// Epoch-millis deadline after which the deletion is executed by alarm().
deleteAt: number;
}

export class NodeLifecycle extends DurableObject<NodeLifecycleEnv> {
/**
* Mark a node as idle (warm). Called after the last workspace on the node
Expand Down Expand Up @@ -73,8 +88,8 @@
};
await this.ctx.storage.put('state', newState);

// Schedule (or reschedule) alarm
await this.ctx.storage.setAlarm(now + warmTimeout);
// Recalculate alarm considering both warm timeout and pending workspace deletions
await this.recalculateAlarm(now + warmTimeout);

// Update D1 warm_since column
await this.updateD1WarmSince(nodeId, new Date(now).toISOString());
Expand All @@ -84,7 +99,7 @@

/**
* Mark a node as active. Called when a workspace starts on the node.
* Cancels any pending warm timeout alarm.
* Cancels any pending warm timeout alarm but preserves workspace deletion alarms.
*/
async markActive(): Promise<NodeLifecycleState> {
const state = await this.getStoredState();
Expand All @@ -97,8 +112,8 @@
state.warmSince = null;
await this.ctx.storage.put('state', state);

// Cancel any pending alarm
await this.ctx.storage.deleteAlarm();
// Recalculate alarm — pending workspace deletions still need to fire
await this.recalculateAlarm(null);

// Clear D1 warm_since
await this.updateD1WarmSince(state.nodeId, null);
Expand Down Expand Up @@ -128,8 +143,8 @@
state.warmSince = null;
await this.ctx.storage.put('state', state);

// Cancel alarm
await this.ctx.storage.deleteAlarm();
// Recalculate alarm — pending workspace deletions still need to fire
await this.recalculateAlarm(null);

// Clear D1 warm_since
await this.updateD1WarmSince(state.nodeId, null);
Expand All @@ -148,19 +163,65 @@
return this.toPublicState(state);
}

// =========================================================================
// Workspace auto-deletion scheduling
// =========================================================================

/**
 * Queue a stopped workspace for automatic deletion after the configured TTL.
 * Invoked when a workspace transitions into the 'stopped' status.
 */
async scheduleWorkspaceDeletion(workspaceId: string, userId: string): Promise<void> {
  const ttlMs = this.getWorkspaceStoppedTtlMs();
  const dueAt = Date.now() + ttlMs;

  const pending: PendingWorkspaceDeletion = {
    workspaceId,
    userId,
    deleteAt: dueAt,
  };
  await this.ctx.storage.put(`ws-delete:${workspaceId}`, pending);

  log.info('node_lifecycle.workspace_deletion_scheduled', {
    workspaceId,
    userId,
    deleteAt: new Date(dueAt).toISOString(),
    ttlMs,
  });

  // Fold the new deadline into the alarm schedule alongside any warm timeout.
  const warmExpiry = await this.getWarmAlarmTime();
  await this.recalculateAlarm(warmExpiry);
}

/**
 * Remove a workspace from the auto-deletion queue. Invoked when a workspace
 * is restarted before its TTL expires.
 */
async cancelWorkspaceDeletion(workspaceId: string): Promise<void> {
  await this.ctx.storage.delete(`ws-delete:${workspaceId}`);

  log.info('node_lifecycle.workspace_deletion_cancelled', { workspaceId });

  // Dropping an entry may change the earliest deadline — reschedule.
  const warmExpiry = await this.getWarmAlarmTime();
  await this.recalculateAlarm(warmExpiry);
}

// =========================================================================
// Alarm handler
// =========================================================================

/**
* Alarm handler. Fires when either:
* 1. The warm timeout expires (node should be destroyed)
* 2. A workspace deletion is due
*
* If the node is still warm, transitions to `destroying` and marks D1
* for the cron sweep to handle actual infrastructure teardown.
* If the node was claimed between schedule and fire, this is a no-op.
* Processes expired workspace deletions first, then handles warm timeout.
*/
async alarm(): Promise<void> {
// Process any expired workspace deletions
await this.processExpiredDeletions();

const state = await this.getStoredState();
if (!state) return;

// No-op if node was claimed (active) or already destroying
if (state.status === 'active') {
// Still recalculate alarm for any remaining pending workspace deletions
await this.recalculateAlarm(null);
return;
}

Expand All @@ -171,7 +232,18 @@
return;
}

// status === 'warm' → transition to destroying
// status === 'warm' → check if warm timeout has actually expired
if (state.warmSince) {
const warmTimeout = state.warmTimeoutOverrideMs ?? this.getWarmTimeoutMs();
const warmExpiry = state.warmSince + warmTimeout;
if (Date.now() < warmExpiry) {
// Warm timeout hasn't expired yet — alarm fired for workspace deletion only
await this.recalculateAlarm(warmExpiry);
return;
}
}

// Warm timeout expired → transition to destroying
state.status = 'destroying';
await this.ctx.storage.put('state', state);

Expand All @@ -193,8 +265,8 @@
nodeId: state.nodeId,
error: err instanceof Error ? err.message : String(err),
});
// Schedule retry
await this.ctx.storage.setAlarm(Date.now() + DEFAULT_NODE_LIFECYCLE_ALARM_RETRY_MS);
// Schedule retry (use recalculateAlarm to not delay pending workspace deletions)
await this.recalculateAlarm(Date.now() + DEFAULT_NODE_LIFECYCLE_ALARM_RETRY_MS);
}
}

Expand All @@ -215,6 +287,15 @@
return DEFAULT_NODE_WARM_TIMEOUT_MS;
}

/**
 * Resolve the stopped-workspace TTL from the WORKSPACE_STOPPED_TTL_MS env
 * var, falling back to the shared default. Non-numeric or non-positive
 * values are ignored.
 */
private getWorkspaceStoppedTtlMs(): number {
  const envValue = this.env.WORKSPACE_STOPPED_TTL_MS;
  if (envValue) {
    // Number.parseInt preferred over global parseInt (SonarCloud S4043 family).
    const parsed = Number.parseInt(envValue, 10);
    if (Number.isFinite(parsed) && parsed > 0) return parsed;
  }
  return DEFAULT_WORKSPACE_STOPPED_TTL_MS;
}

private toPublicState(state: StoredState): NodeLifecycleState {
return {
nodeId: state.nodeId,
Expand All @@ -238,4 +319,114 @@
});
}
}

/**
 * Read every pending workspace deletion entry from DO storage,
 * keyed by the full `ws-delete:<workspaceId>` storage key.
 */
private async getPendingDeletions(): Promise<Map<string, PendingWorkspaceDeletion>> {
  const entries = await this.ctx.storage.list<PendingWorkspaceDeletion>({ prefix: 'ws-delete:' });
  return entries;
}

/**
 * Execute every workspace deletion whose deadline has passed. A failed
 * deletion is logged and its deadline pushed forward so the next alarm
 * retries it, without aborting the rest of the sweep.
 */
private async processExpiredDeletions(): Promise<void> {
  const pending = await this.getPendingDeletions();
  const now = Date.now();

  // One state read supplies nodeId for every deletion below; without a
  // stored state there is no node to delete against.
  const state = await this.getStoredState();
  if (!state) return;

  for (const [key, entry] of pending) {
    if (entry.deleteAt > now) continue;

    const { workspaceId, userId } = entry;
    try {
      await this.deleteWorkspace(state.nodeId, workspaceId, userId);
      await this.ctx.storage.delete(key);

      log.info('node_lifecycle.workspace_auto_deleted', {
        workspaceId,
        userId,
      });
    } catch (err) {
      log.error('node_lifecycle.workspace_deletion_failed', {
        workspaceId,
        userId,
        error: err instanceof Error ? err.message : String(err),
      });
      // Keep the entry for a later retry, nudging its deadline forward to
      // avoid a tight retry loop on the next alarm.
      entry.deleteAt = now + DEFAULT_NODE_LIFECYCLE_ALARM_RETRY_MS;
      await this.ctx.storage.put(key, entry);
    }
  }
}

/**
 * Tear down a single workspace: best-effort VM-agent call to remove the
 * Docker container + volume, then mark the D1 row 'deleted' and close out
 * any lingering agent sessions.
 */
private async deleteWorkspace(nodeId: string, workspaceId: string, userId: string): Promise<void> {
  // Ask the node's VM agent to remove the workspace (helper handles JWT
  // auth and URL routing). An unreachable node — likely already destroyed —
  // is tolerated: the D1 update below still records the deletion.
  try {
    await deleteWorkspaceOnNode(nodeId, workspaceId, this.env as unknown as Env, userId);
  } catch (err) {
    log.warn('node_lifecycle.workspace_delete_vm_agent_failed', {
      workspaceId,
      nodeId,
      error: err instanceof Error ? err.message : String(err),
    });
  }

  const now = new Date().toISOString();

  // Guarding on status = 'stopped' means a workspace restarted in the
  // meantime is left untouched.
  await this.env.DATABASE
    .prepare(
      `UPDATE workspaces SET status = 'deleted', updated_at = ? WHERE id = ? AND status = 'stopped'`
    )
    .bind(now, workspaceId)
    .run();

  // Best-effort: complete any agent sessions still attached to this workspace.
  try {
    await this.env.DATABASE
      .prepare(
        `UPDATE agent_sessions SET status = 'completed', updated_at = ? WHERE workspace_id = ? AND status NOT IN ('completed', 'failed')`
      )
      .bind(now, workspaceId)
      .run();
  } catch {
    // Session cleanup must never block workspace deletion.
  }
}

/**
 * Compute the warm-timeout expiry for this node.
 *
 * @returns Epoch-millis at which the warm timeout elapses, or null when the
 *   node is not currently warm (no warm alarm is needed).
 */
private async getWarmAlarmTime(): Promise<number | null> {
  const state = await this.getStoredState();
  // Optional chain handles the missing-state case and the status check in
  // one expression (SonarCloud: prefer optional chaining).
  if (state?.status !== 'warm' || !state.warmSince) return null;

  const warmTimeout = state.warmTimeoutOverrideMs ?? this.getWarmTimeoutMs();
  return state.warmSince + warmTimeout;
}

/**
 * Recalculate and set the alarm to the earliest time needed: either the
 * warm timeout expiry or the earliest pending workspace deletion. Clears
 * the alarm entirely when neither deadline exists.
 *
 * @param warmAlarmTime - The warm timeout expiry time, or null if not applicable
 */
private async recalculateAlarm(warmAlarmTime: number | null): Promise<void> {
  let earliest = warmAlarmTime;

  const pending = await this.getPendingDeletions();
  for (const [, entry] of pending) {
    if (earliest === null || entry.deleteAt < earliest) {
      earliest = entry.deleteAt;
    }
  }

  // Positive condition first (SonarCloud: avoid negated conditions).
  if (earliest === null) {
    await this.ctx.storage.deleteAlarm();
  } else {
    await this.ctx.storage.setAlarm(earliest);
  }
}
}
19 changes: 18 additions & 1 deletion apps/api/src/durable-objects/project-data/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -350,7 +350,24 @@ export class ProjectData extends DurableObject<Env> {
(type, payload, sid) => this.broadcastEvent(type, payload, sid), () => this.scheduleSummarySync());
await idleCleanup.processExpiredCleanups(this.sql, this.env,
(taskId) => idleCleanup.completeTaskInD1(this.env.DATABASE, taskId),
(workspaceId) => idleCleanup.stopWorkspaceInD1(this.env.DATABASE, workspaceId),
async (workspaceId) => {
await idleCleanup.stopWorkspaceInD1(this.env.DATABASE, workspaceId);
// Schedule automatic deletion after TTL (best-effort)
try {
const workerEnv = this.env as unknown as import('../../env').Env;
const wsRow = await workerEnv.DATABASE.prepare(
'SELECT node_id, user_id FROM workspaces WHERE id = ?'
).bind(workspaceId).first<{ node_id: string | null; user_id: string }>();
if (wsRow?.node_id) {
const doId = workerEnv.NODE_LIFECYCLE.idFromName(wsRow.node_id);
const stub = workerEnv.NODE_LIFECYCLE.get(doId);
await (stub as unknown as import('../node-lifecycle').NodeLifecycle)
.scheduleWorkspaceDeletion(workspaceId, wsRow.user_id);
}
} catch {
// Best-effort — cron safety-net will catch it
}
},
(type, payload, sid) => this.broadcastEvent(type, payload, sid), () => this.scheduleSummarySync());

// Mailbox delivery sweep: expire stale messages and re-queue unacked ones
Expand Down
14 changes: 14 additions & 0 deletions apps/api/src/durable-objects/task-runner/state-machine.ts
Original file line number Diff line number Diff line change
Expand Up @@ -350,6 +350,20 @@ export async function cleanupOnFailure(
error: err instanceof Error ? err.message : String(err),
});
}

// Schedule automatic deletion after TTL (best-effort)
try {
const doId = rc.env.NODE_LIFECYCLE.idFromName(state.stepResults.nodeId);
const stub = rc.env.NODE_LIFECYCLE.get(doId);
await (stub as unknown as import('../node-lifecycle').NodeLifecycle)
.scheduleWorkspaceDeletion(state.stepResults.workspaceId, state.userId);
} catch (err) {
log.warn('task_runner_do.cleanup.schedule_deletion_failed', {
taskId: state.taskId,
workspaceId: state.stepResults.workspaceId,
error: err instanceof Error ? err.message : String(err),
});
}
}

// Clean up auto-provisioned node. If a workspace exists, cleanupTaskRun
Expand Down
2 changes: 2 additions & 0 deletions apps/api/src/env.ts
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,8 @@ export interface Env {
ORPHANED_WORKSPACE_GRACE_PERIOD_MS?: string;
// Workspace idle timeout (global default, overridable per-project)
WORKSPACE_IDLE_TIMEOUT_MS?: string;
// Auto-delete stopped workspaces after this TTL (default: 300000 = 5 minutes)
WORKSPACE_STOPPED_TTL_MS?: string;
// Task agent configuration
DEFAULT_TASK_AGENT_TYPE?: string;
// Built-in profile model overrides (defaults: claude-sonnet-4-5-20250929, claude-opus-4-6)
Expand Down
Loading
Loading