Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions .env.example
Original file line number Diff line number Diff line change
Expand Up @@ -307,6 +307,14 @@ TRINITY_DATA_PATH=
# Override only when running compose from a directory other than the repo root.
HOST_TEMPLATES_PATH=

# Size of each agent container's /tmp RAM-backed tmpfs (#1231). Format: <int>m
# or <int>g (e.g. 512m, 2g). noexec,nosuid are always applied — only the size
# is configurable. tmpfs counts against the agent's memory cgroup, so keep it
# bounded. Empty/invalid falls back to the default (512m). Existing agents pick
# up a change on recreate, not restart.
# Default: 512m
AGENT_TMP_SIZE=

# ===========================================
# OPENTELEMETRY CONFIGURATION (Optional)
# ===========================================
Expand Down
4 changes: 4 additions & 0 deletions docker-compose.prod.yml
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,10 @@ services:
- GITHUB_PAT=${GITHUB_PAT}
# Host paths for volumes (used when creating agent containers)
- HOST_TEMPLATES_PATH=${HOST_TEMPLATES_PATH:-${PWD}/config/agent-templates}
# Agent /tmp tmpfs size (#1231) — read by capabilities.py to build the
# agent container mount spec. noexec,nosuid stay fixed; only the size is tunable.
# Existing agents pick up a change on recreate, not restart.
- AGENT_TMP_SIZE=${AGENT_TMP_SIZE:-512m}
# OpenTelemetry Configuration (Optional)
- OTEL_ENABLED=${OTEL_ENABLED:-0}
- OTEL_COLLECTOR_ENDPOINT=${OTEL_COLLECTOR_ENDPOINT:-http://trinity-otel-collector:4317}
Expand Down
4 changes: 4 additions & 0 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,10 @@ services:
- VOIP_INTENT_TTL_SECONDS=${VOIP_INTENT_TTL_SECONDS:-180} # staged Gemini-intent TTL
- GITHUB_PAT=${GITHUB_PAT:-}
- HOST_TEMPLATES_PATH=${PWD}/config/agent-templates
# Agent /tmp tmpfs size (#1231) — read by capabilities.py to build the
# agent container mount spec. noexec,nosuid stay fixed; only the size is tunable.
# Existing agents pick up a change on recreate, not restart.
- AGENT_TMP_SIZE=${AGENT_TMP_SIZE:-512m}
# OpenTelemetry Configuration (Optional)
- OTEL_ENABLED=${OTEL_ENABLED:-0}
- OTEL_COLLECTOR_ENDPOINT=${OTEL_COLLECTOR_ENDPOINT:-http://trinity-otel-collector:4317}
Expand Down
2 changes: 1 addition & 1 deletion docs/memory/architecture.md
Original file line number Diff line number Diff line change
Expand Up @@ -1492,7 +1492,7 @@ Bridges (members of **both** networks): `backend` (primary HTTP API — Redis on
## Container Security

- **Non-root execution** (Invariant #17, #874): backend and scheduler as `trinity` (UID 1000), MCP server as `node` (UID 1000), frontend as `nginx` (UID 101), agents as `developer` (UID 1000). Backend needs `group_add: ${DOCKER_GID:-999}` for Docker socket access on Linux.
- `CAP_DROP: ALL` + `CAP_ADD: NET_BIND_SERVICE`; `security_opt: no-new-privileges:true`; tmpfs `/tmp` with `noexec,nosuid` (100 MB RAM-backed — heavy scratch like pip/npm/ML wheels is redirected via a default `TMPDIR=/home/developer/.tmp` on the disk-backed home volume, created at start by `startup.sh`; mount spec + TMPDIR default live in `services/agent_service/capabilities.py` so create/recreate/system-agent can't drift, #1098); no external UI port exposure; network isolation per Network Topology above.
- `CAP_DROP: ALL` + `CAP_ADD: NET_BIND_SERVICE`; `security_opt: no-new-privileges:true`; tmpfs `/tmp` with `noexec,nosuid` (RAM-backed, default 512 MB — operator-tunable via `AGENT_TMP_SIZE` on the backend service, validated against `^\d+[mg]$` after trimming and lower-casing, with empty/invalid values falling back to the default; `noexec,nosuid` stay fixed; the tmpfs counts against the agent memory cgroup; the mount spec is fixed at container creation, so existing agents pick up a change on recreate, not restart, #1231. Heavy scratch like pip/npm/ML wheels is redirected via a default `TMPDIR=/home/developer/.tmp` on the disk-backed home volume, created at start by `startup.sh`; mount spec + TMPDIR default live in `services/agent_service/capabilities.py` so create/recreate/system-agent can't drift, #1098); no external UI port exposure; network isolation per Network Topology above.
- **Internal API security (C-003)**: `/api/internal/` endpoints (scheduler, agent containers) require the `X-Internal-Secret` header; falls back to `SECRET_KEY` if `INTERNAL_API_SECRET` unset.
- **WebSocket security (C-002, #550)**: single-use ticket auth — see [Real-time Delivery](#real-time-delivery-reliability-003-306).
- **Frontend XSS (H-005)**: all markdown rendering uses DOMPurify via `utils/markdown.js`; no direct `v-html` with unsanitized content.
Expand Down
44 changes: 37 additions & 7 deletions src/backend/services/agent_service/capabilities.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@

from __future__ import annotations

import os
import re


# Restricted mode capabilities - minimum for agent operation (default)
RESTRICTED_CAPABILITIES: list[str] = [
Expand Down Expand Up @@ -66,17 +69,44 @@
]


# Agent /tmp mount + scratch-space defaults (#1098)
# Agent /tmp mount + scratch-space defaults (#1098, #1231)
# -----------------------------------------------------------------------------
# /tmp is a small RAM-backed tmpfs, hardened noexec,nosuid. It is deliberately
# tiny and non-exec so a compromised agent can't stage/execute payloads there.
# The catch: heavy scratch (pip/npm install, compiling C extensions, ML wheels
# /tmp is a RAM-backed tmpfs, hardened noexec,nosuid. It is deliberately
# non-exec so a compromised agent can't stage/execute payloads there. The
# catch: heavy scratch (pip/npm install, compiling C extensions, ML wheels
# like torch/transformers) must NOT land on /tmp — it hits "No space left on
# device" at 100 MB, and "Permission denied" on the noexec flag.
# device" at the cap, and "Permission denied" on the noexec flag. #1098
# redirects $TMPDIR-honoring tools off /tmp; but install scripts that hardcode
# /tmp (e.g. the `gh` CLI) still exhaust the cap, silently breaking later /tmp
# writes (incl. git's commit scratch) — #1231.
#
# Size is operator-tunable via AGENT_TMP_SIZE (e.g. "512m", "2g"), default
# 512m. ONLY the size is configurable — noexec,nosuid stay hardcoded (security
# posture), and the value stays bounded (it counts against the container memory
# cgroup). An empty/invalid value falls back to the default rather than
# producing a broken or unbounded mount spec. Mount specs are creation-time, so
# existing agents pick up a new size on recreate, not restart.
#
# Defined here (single source of truth) so the create path (crud.py) and the
# recreate path (lifecycle.py) can't drift — both import these constants.
AGENT_TMPFS_MOUNT: dict[str, str] = {'/tmp': 'noexec,nosuid,size=100m'}
# recreate path (lifecycle.py) can't drift — both import this constant.
_AGENT_TMP_SIZE_DEFAULT = "512m"
# ``<count><unit>`` where the count is ASCII digits containing at least one
# non-zero digit and the unit is ``m`` or ``g``:
#   * ``[0-9]`` rather than ``\d`` — on a ``str`` pattern ``\d`` also matches
#     non-ASCII Unicode decimal digits, which must never be forwarded into a
#     mount option string.
#   * a zero count is rejected — tmpfs treats ``size=0`` as "no limit", which
#     would turn a typo into an unbounded RAM-backed mount.
# Leading zeros on a non-zero count (e.g. ``0512m``) are still accepted.
_AGENT_TMP_SIZE_RE = re.compile(r"^0*[1-9][0-9]*[mg]$")


def _resolve_agent_tmp_size() -> str:
    """Return the validated /tmp tmpfs size from AGENT_TMP_SIZE, else the default.

    The environment value is stripped of surrounding whitespace and
    lower-cased before validation, so ``" 1G "`` resolves to ``"1g"``.

    Accepts ``<int>m`` / ``<int>g`` (case-insensitive) with a positive,
    ASCII-digit count. Anything else — unset, empty, a bare number, a
    Kubernetes-style suffix, a zero size, non-ASCII digits — falls back to
    the default so a typo can never yield a broken or unbounded mount spec.

    Returns:
        The normalized size string (e.g. ``"2g"``), or ``"512m"`` when the
        variable is unset, empty, or invalid.
    """
    raw = (os.getenv("AGENT_TMP_SIZE") or "").strip().lower()
    # fullmatch: the whole value must be the size token, nothing trailing.
    if _AGENT_TMP_SIZE_RE.fullmatch(raw) is None:
        return _AGENT_TMP_SIZE_DEFAULT
    return raw


# tmpfs option string for /tmp: the hardening flags are literal, and the size
# is resolved once, when this module is executed.
AGENT_TMPFS_MOUNT: dict[str, str] = {
    '/tmp': 'noexec,nosuid,size=' + _resolve_agent_tmp_size(),
}

# Default TMPDIR redirects scratch onto the disk-backed, exec-capable agent
# home volume. pip / npm / most build tooling honor TMPDIR, so this dodges both
Expand Down
80 changes: 80 additions & 0 deletions tests/unit/test_1231_agent_tmp_size.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
"""Unit tests for #1231: agent /tmp tmpfs size is operator-configurable via
AGENT_TMP_SIZE, with noexec,nosuid fixed and a safe default.

The agent /tmp was a hardcoded 100 MB noexec,nosuid tmpfs. It fills easily
(e.g. `gh` CLI install artifacts that hardcode /tmp and bypass the #1098
TMPDIR redirect), after which every /tmp write — including git's commit
scratch — fails with "No space left on device", silently breaking autonomous
runs' persist step. The size is now read from AGENT_TMP_SIZE (default 512m);
only the size is tunable — the security flags stay hardcoded.

Loaded by file path (stdlib-only) so the test doesn't drag the
docker / fastapi / database transitive imports of the agent_service package.
"""
from __future__ import annotations

import importlib.util
from pathlib import Path

import pytest

_CAPS_PATH = (
Path(__file__).resolve().parent.parent.parent
/ "src" / "backend" / "services" / "agent_service" / "capabilities.py"
)


def _load():
    """Execute capabilities.py from its file path and return the new module.

    Every call builds and executes a fresh module object, so the module's
    top-level code runs again against the current environment.
    """
    module_spec = importlib.util.spec_from_file_location(
        "caps_tmpsize_under_test", _CAPS_PATH
    )
    fresh_module = importlib.util.module_from_spec(module_spec)
    module_spec.loader.exec_module(fresh_module)
    return fresh_module


# --- _resolve_agent_tmp_size (call-time env read) -----------------------

def test_default_when_unset(monkeypatch):
    """With AGENT_TMP_SIZE absent from the environment, the default is used."""
    monkeypatch.delenv("AGENT_TMP_SIZE", raising=False)
    resolved = _load()._resolve_agent_tmp_size()
    assert resolved == "512m"


@pytest.mark.parametrize("value", ["256m", "512m", "1g", "2g", "100m"])
def test_valid_values_pass_through(monkeypatch, value):
    """A well-formed ``<int>m`` / ``<int>g`` size is returned unchanged."""
    monkeypatch.setenv("AGENT_TMP_SIZE", value)
    caps = _load()
    assert caps._resolve_agent_tmp_size() == value


def test_case_folds_and_strips(monkeypatch):
    """Surrounding whitespace is dropped and the unit suffix is lower-cased."""
    monkeypatch.setenv("AGENT_TMP_SIZE", " 1G ")
    resolved = _load()._resolve_agent_tmp_size()
    assert resolved == "1g"


@pytest.mark.parametrize("bad", ["512", "512Mi", "512MB", "0.5g", "g", "abc", "-1m", ""])
def test_invalid_falls_back_to_default(monkeypatch, bad):
    """Malformed or empty values resolve to the 512m default."""
    monkeypatch.setenv("AGENT_TMP_SIZE", bad)
    caps = _load()
    assert caps._resolve_agent_tmp_size() == "512m"


# --- AGENT_TMPFS_MOUNT (import-time spec) -------------------------------

def test_mount_spec_default_shape(monkeypatch):
    """Without AGENT_TMP_SIZE, the mount spec carries the 512m default size."""
    monkeypatch.delenv("AGENT_TMP_SIZE", raising=False)
    expected = {"/tmp": "noexec,nosuid,size=512m"}
    assert _load().AGENT_TMPFS_MOUNT == expected


def test_mount_spec_honors_env(monkeypatch):
    """A valid AGENT_TMP_SIZE set before the module loads lands in the spec."""
    monkeypatch.setenv("AGENT_TMP_SIZE", "2g")
    expected = {"/tmp": "noexec,nosuid,size=2g"}
    assert _load().AGENT_TMPFS_MOUNT == expected


def test_security_flags_always_present(monkeypatch):
    """The noexec,nosuid prefix survives every configured size.

    These flags are the load-bearing security posture (a compromised agent
    can't stage or execute payloads on /tmp), so neither a valid nor an
    invalid AGENT_TMP_SIZE may drop them.
    """
    for configured in ("256m", "garbage", "8g"):
        monkeypatch.setenv("AGENT_TMP_SIZE", configured)
        options = _load().AGENT_TMPFS_MOUNT["/tmp"]
        assert options.startswith("noexec,nosuid,size=")
        # With the literal "noexec" removed, no other "exec" may remain.
        assert "exec" not in options.replace("noexec", "")
Loading