diff --git a/.env.example b/.env.example index 231c3133..2a84b502 100644 --- a/.env.example +++ b/.env.example @@ -307,6 +307,14 @@ TRINITY_DATA_PATH= # Override only when running compose from a directory other than the repo root. HOST_TEMPLATES_PATH= +# Size of each agent container's /tmp RAM-backed tmpfs (#1231). Format: an integer followed by m +# or g (e.g. 512m, 2g). noexec,nosuid are always applied — only the size +# is configurable. tmpfs counts against the agent's memory cgroup, so keep it +# bounded. Empty/invalid falls back to the default (512m). Existing agents pick +# up a change on recreate, not restart. +# Default: 512m +AGENT_TMP_SIZE= + # =========================================== # OPENTELEMETRY CONFIGURATION (Optional) # =========================================== diff --git a/docker-compose.prod.yml b/docker-compose.prod.yml index d4a2e3c4..42a55860 100644 --- a/docker-compose.prod.yml +++ b/docker-compose.prod.yml @@ -82,6 +82,10 @@ services: - GITHUB_PAT=${GITHUB_PAT} # Host paths for volumes (used when creating agent containers) - HOST_TEMPLATES_PATH=${HOST_TEMPLATES_PATH:-${PWD}/config/agent-templates} + # Agent /tmp tmpfs size (#1231) — read by capabilities.py to build the + # agent container mount spec. noexec,nosuid stay fixed; only size tunes. + # Existing agents pick up a change on recreate, not restart. + - AGENT_TMP_SIZE=${AGENT_TMP_SIZE:-512m} # OpenTelemetry Configuration (Optional) - OTEL_ENABLED=${OTEL_ENABLED:-0} - OTEL_COLLECTOR_ENDPOINT=${OTEL_COLLECTOR_ENDPOINT:-http://trinity-otel-collector:4317} diff --git a/docker-compose.yml b/docker-compose.yml index 5258af7b..001c15fa 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -46,6 +46,10 @@ services: - VOIP_INTENT_TTL_SECONDS=${VOIP_INTENT_TTL_SECONDS:-180} # staged Gemini-intent TTL - GITHUB_PAT=${GITHUB_PAT:-} - HOST_TEMPLATES_PATH=${PWD}/config/agent-templates + # Agent /tmp tmpfs size (#1231) — read by capabilities.py to build the + # agent container mount spec. 
noexec,nosuid stay fixed; only size tunes. + # Existing agents pick up a change on recreate, not restart. + - AGENT_TMP_SIZE=${AGENT_TMP_SIZE:-512m} # OpenTelemetry Configuration (Optional) - OTEL_ENABLED=${OTEL_ENABLED:-0} - OTEL_COLLECTOR_ENDPOINT=${OTEL_COLLECTOR_ENDPOINT:-http://trinity-otel-collector:4317} diff --git a/docs/memory/architecture.md b/docs/memory/architecture.md index 3cd7d970..e5e01f20 100644 --- a/docs/memory/architecture.md +++ b/docs/memory/architecture.md @@ -1492,7 +1492,7 @@ Bridges (members of **both** networks): `backend` (primary HTTP API — Redis on ## Container Security - **Non-root execution** (Invariant #17, #874): backend and scheduler as `trinity` (UID 1000), MCP server as `node` (UID 1000), frontend as `nginx` (UID 101), agents as `developer` (UID 1000). Backend needs `group_add: ${DOCKER_GID:-999}` for Docker socket access on Linux. -- `CAP_DROP: ALL` + `CAP_ADD: NET_BIND_SERVICE`; `security_opt: no-new-privileges:true`; tmpfs `/tmp` with `noexec,nosuid` (100 MB RAM-backed — heavy scratch like pip/npm/ML wheels is redirected via a default `TMPDIR=/home/developer/.tmp` on the disk-backed home volume, created at start by `startup.sh`; mount spec + TMPDIR default live in `services/agent_service/capabilities.py` so create/recreate/system-agent can't drift, #1098); no external UI port exposure; network isolation per Network Topology above. +- `CAP_DROP: ALL` + `CAP_ADD: NET_BIND_SERVICE`; `security_opt: no-new-privileges:true`; tmpfs `/tmp` with `noexec,nosuid` (RAM-backed, default 512 MB — operator-tunable via `AGENT_TMP_SIZE` on the backend service, validated `^\d+[mg]$` with invalid→default; `noexec,nosuid` stay fixed; counts against the agent memory cgroup; creation-time, so existing agents pick up a change on recreate not restart, #1231. 
Heavy scratch like pip/npm/ML wheels is redirected via a default `TMPDIR=/home/developer/.tmp` on the disk-backed home volume, created at start by `startup.sh`; mount spec + TMPDIR default live in `services/agent_service/capabilities.py` so create/recreate/system-agent can't drift, #1098); no external UI port exposure; network isolation per Network Topology above. - **Internal API security (C-003)**: `/api/internal/` endpoints (scheduler, agent containers) require the `X-Internal-Secret` header; falls back to `SECRET_KEY` if `INTERNAL_API_SECRET` unset. - **WebSocket security (C-002, #550)**: single-use ticket auth — see [Real-time Delivery](#real-time-delivery-reliability-003-306). - **Frontend XSS (H-005)**: all markdown rendering uses DOMPurify via `utils/markdown.js`; no direct `v-html` with unsanitized content. diff --git a/src/backend/services/agent_service/capabilities.py b/src/backend/services/agent_service/capabilities.py index 5cd09a42..0924c4e0 100644 --- a/src/backend/services/agent_service/capabilities.py +++ b/src/backend/services/agent_service/capabilities.py @@ -15,6 +15,9 @@ from __future__ import annotations +import os +import re + # Restricted mode capabilities - minimum for agent operation (default) RESTRICTED_CAPABILITIES: list[str] = [ @@ -66,17 +69,44 @@ ] -# Agent /tmp mount + scratch-space defaults (#1098) +# Agent /tmp mount + scratch-space defaults (#1098, #1231) # ----------------------------------------------------------------------------- -# /tmp is a small RAM-backed tmpfs, hardened noexec,nosuid. It is deliberately -# tiny and non-exec so a compromised agent can't stage/execute payloads there. -# The catch: heavy scratch (pip/npm install, compiling C extensions, ML wheels +# /tmp is a RAM-backed tmpfs, hardened noexec,nosuid. It is deliberately +# non-exec so a compromised agent can't stage/execute payloads there. 
# Sizing the agent /tmp tmpfs (#1098, #1231): heavy scratch (pip/npm install,
# compiling C extensions, ML wheels like torch/transformers) must NOT land on
# /tmp — it hits "No space left on device" at the cap, and "Permission denied"
# on the noexec flag. #1098 redirects $TMPDIR-honoring tools off /tmp; but
# install scripts that hardcode /tmp (e.g. the `gh` CLI) still exhaust the
# cap, silently breaking later /tmp writes (incl. git's commit scratch) — #1231.
#
# Size is operator-tunable via AGENT_TMP_SIZE (e.g. "512m", "2g"), default
# 512m. ONLY the size is configurable — noexec,nosuid stay hardcoded (security
# posture), and the value stays bounded (it counts against the container memory
# cgroup). An empty/invalid value falls back to the default rather than
# producing a broken or unbounded mount spec. Mount specs are creation-time, so
# existing agents pick up a new size on recreate, not restart.
#
# Defined here (single source of truth) so the create path (crud.py) and the
# recreate path (lifecycle.py) can't drift — both import this constant.
_AGENT_TMP_SIZE_DEFAULT = "512m"

# Validation pattern for an already stripped + lower-cased value:
#   * ASCII digits only — ``\d`` on a str pattern also matches non-ASCII
#     Unicode decimal digits, which would be copied verbatim into the mount
#     options and produce a broken spec;
#   * at least one non-zero digit — tmpfs treats ``size=0`` as "no limit", so
#     "0m" / "0g" would silently create an unbounded mount.
_AGENT_TMP_SIZE_RE = re.compile(r"^[0-9]*[1-9][0-9]*[mg]$")


def _resolve_agent_tmp_size() -> str:
    """Return the validated /tmp tmpfs size from AGENT_TMP_SIZE, else the default.

    The environment value is stripped and lower-cased, then accepted only if
    it is a positive ASCII integer immediately followed by ``m`` or ``g``
    (e.g. ``256m``, ``2g``; input is case-insensitive). Anything else — unset,
    empty, a bare number, a zero size, a fractional value, a Kubernetes-style
    suffix such as ``512Mi``, non-ASCII digits — falls back to the default so
    a typo can never yield a broken or unbounded mount spec.

    Returns:
        The normalized size string (e.g. ``"1g"``), or ``"512m"`` when the
        variable is missing or invalid.
    """
    raw = (os.getenv("AGENT_TMP_SIZE") or "").strip().lower()
    # fullmatch: the whole value must be the size token, nothing trailing.
    if _AGENT_TMP_SIZE_RE.fullmatch(raw):
        return raw
    return _AGENT_TMP_SIZE_DEFAULT


# Resolved once at import time: mount specs are creation-time, so a changed
# AGENT_TMP_SIZE takes effect for agents created/recreated after a restart of
# the process that imports this module.
AGENT_TMPFS_MOUNT: dict[str, str] = {
    '/tmp': f'noexec,nosuid,size={_resolve_agent_tmp_size()}'
}
"""Unit tests for #1231: agent /tmp tmpfs size is operator-configurable via
AGENT_TMP_SIZE, with noexec,nosuid fixed and a safe default.

The agent /tmp was a hardcoded 100 MB noexec,nosuid tmpfs. It fills easily
(e.g. `gh` CLI install artifacts that hardcode /tmp and bypass the #1098
TMPDIR redirect), after which every /tmp write — including git's commit
scratch — fails with "No space left on device", silently breaking autonomous
runs' persist step. The size is now read from AGENT_TMP_SIZE (default 512m);
only the size is tunable — the security flags stay hardcoded.

Loaded by file path (stdlib-only) so the test doesn't drag the
docker / fastapi / database transitive imports of the agent_service package.
"""
from __future__ import annotations

import importlib.util
from pathlib import Path

import pytest

# Three ``.parent`` hops from this file (tests/unit/<this file>) reach the
# repository root; the module under test lives below src/backend from there.
_CAPS_PATH = (
    Path(__file__).resolve().parent.parent.parent
    / "src" / "backend" / "services" / "agent_service" / "capabilities.py"
)


def _load():
    """Execute capabilities.py from its file path and return the fresh module.

    Every call re-runs the module body, so module-level values such as
    ``AGENT_TMPFS_MOUNT`` are recomputed from the environment as it is at
    call time. The module is not registered in ``sys.modules``.
    """
    spec = importlib.util.spec_from_file_location("caps_tmpsize_under_test", _CAPS_PATH)
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module


# --- _resolve_agent_tmp_size (call-time env read) -----------------------

def test_default_when_unset(monkeypatch):
    """With AGENT_TMP_SIZE absent, the resolver returns the 512m default."""
    monkeypatch.delenv("AGENT_TMP_SIZE", raising=False)
    assert _load()._resolve_agent_tmp_size() == "512m"


@pytest.mark.parametrize("value", ["256m", "512m", "1g", "2g", "100m"])
def test_valid_values_pass_through(monkeypatch, value):
    """Well-formed sizes are returned unchanged."""
    monkeypatch.setenv("AGENT_TMP_SIZE", value)
    assert _load()._resolve_agent_tmp_size() == value


def test_case_folds_and_strips(monkeypatch):
    """Surrounding whitespace is removed and the unit suffix is lower-cased."""
    monkeypatch.setenv("AGENT_TMP_SIZE", "  1G  ")
    assert _load()._resolve_agent_tmp_size() == "1g"


@pytest.mark.parametrize("bad", ["512", "512Mi", "512MB", "0.5g", "g", "abc", "-1m", ""])
def test_invalid_falls_back_to_default(monkeypatch, bad):
    """Malformed values (no unit, wrong unit, fractional, signed, empty)
    resolve to the default instead of being passed through."""
    monkeypatch.setenv("AGENT_TMP_SIZE", bad)
    assert _load()._resolve_agent_tmp_size() == "512m"


# --- AGENT_TMPFS_MOUNT (import-time spec) -------------------------------

def test_mount_spec_default_shape(monkeypatch):
    """Without the env var, the mount spec is exactly the default /tmp entry."""
    monkeypatch.delenv("AGENT_TMP_SIZE", raising=False)
    mount = _load().AGENT_TMPFS_MOUNT
    assert mount == {"/tmp": "noexec,nosuid,size=512m"}


def test_mount_spec_honors_env(monkeypatch):
    """A valid env value set before the module is loaded lands in the spec."""
    monkeypatch.setenv("AGENT_TMP_SIZE", "2g")
    mount = _load().AGENT_TMPFS_MOUNT
    assert mount == {"/tmp": "noexec,nosuid,size=2g"}


def test_security_flags_always_present(monkeypatch):
    """noexec,nosuid are hardcoded — a configured size must never drop them
    (the load-bearing security posture: a compromised agent can't stage or
    execute payloads on /tmp)."""
    # Covers a valid small size, an invalid value (default path) and a large size.
    for value in ("256m", "garbage", "8g"):
        monkeypatch.setenv("AGENT_TMP_SIZE", value)
        spec = _load().AGENT_TMPFS_MOUNT["/tmp"]
        assert spec.startswith("noexec,nosuid,size=")
        assert "exec" not in spec.replace("noexec", "")  # no stray exec flag