Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions src/backend/routers/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -1263,8 +1263,11 @@ async def update_agent_quotas(
# Agent Default Resources (RES-001)
# ============================================================================

VALID_CPU_VALUES = ["1", "2", "4", "8", "16"]
VALID_MEMORY_VALUES = ["1g", "2g", "4g", "8g", "16g", "32g"]
# Canonical allowed values live in the container-spec module so the admin
# defaults endpoint and the agent create/recreate paths can't drift (#1197).
from services.agent_service.capabilities import VALID_CPU, VALID_MEMORY
VALID_CPU_VALUES = list(VALID_CPU)
VALID_MEMORY_VALUES = list(VALID_MEMORY)


@router.get("/agent-defaults/resources")
Expand Down
48 changes: 48 additions & 0 deletions src/backend/services/agent_service/capabilities.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,3 +84,51 @@
# The directory is created (writable by UID 1000) at container start by
# docker/base-image/startup.sh, so existing agents pick it up on restart.
AGENT_DEFAULT_TMPDIR: str = '/home/developer/.tmp'


# Container resource limits (#1197)
# -----------------------------------------------------------------------------
# Canonical allowed values for the per-agent CPU / memory limits, shared by the
# admin defaults endpoint (routers/settings.py) and the three container-create
# sites (crud.py create, lifecycle.py recreate, system_agent_service.py). Kept
# here — stdlib-only, import-light — so the value set has ONE home and the
# create paths can't drift from what the API accepts.
#
# CPU is an integer processor count fed to Docker's NanoCpus (#1126); memory is
# a Docker memory string fed to mem_limit. A template.yaml carrying a fractional
# or Kubernetes-style value (cpu: "0.5", memory: "512Mi") is rejected up front
# with an actionable message instead of crashing deep in container creation on a
# raw int() / an invalid Docker mem string (#1197).
VALID_CPU: tuple[str, ...] = ("1", "2", "4", "8", "16")
VALID_MEMORY: tuple[str, ...] = ("1g", "2g", "4g", "8g", "16g", "32g")


def normalize_cpu(value, default) -> str:
"""Validate/normalize a CPU value against VALID_CPU.

``value`` is the template/config value (may be None/empty); ``default`` is
the system fallback (already admin-validated). Returns the canonical string
or raises ``ValueError`` with an actionable message.
"""
cpu = str(value if value not in (None, "") else default).strip()
if cpu not in VALID_CPU:
raise ValueError(
f"Invalid cpu '{cpu}': must be one of {', '.join(VALID_CPU)} "
f"(integer processor count — not a fractional or Kubernetes-style value)"
)
return cpu


def normalize_memory(value, default) -> str:
"""Validate/normalize a memory value against VALID_MEMORY (Docker form).

Case-folds so ``4G`` → ``4g``. Returns the canonical string or raises
``ValueError`` with an actionable message.
"""
mem = str(value if value not in (None, "") else default).strip().lower()
if mem not in VALID_MEMORY:
raise ValueError(
f"Invalid memory '{mem}': must be one of {', '.join(VALID_MEMORY)} "
f"(Docker form like '4g' — not a Kubernetes-style value like '512Mi')"
)
return mem
40 changes: 37 additions & 3 deletions src/backend/services/agent_service/crud.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@
from utils.helpers import sanitize_agent_name, utc_now_iso
from .helpers import validate_base_image
from .lifecycle import RESTRICTED_CAPABILITIES, FULL_CAPABILITIES
from .capabilities import AGENT_TMPFS_MOUNT, AGENT_DEFAULT_TMPDIR
from .capabilities import AGENT_TMPFS_MOUNT, AGENT_DEFAULT_TMPDIR, normalize_cpu, normalize_memory

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -453,6 +453,25 @@ async def create_agent_internal(
if generated_files:
cred_files_volume = {str(cred_files_dir): {'bind': '/generated-creds', 'mode': 'ro'}}

# #1197: validate/normalize template resource fields against the allowed
# set BEFORE any side effects (MCP key, subscription, container). A
# Kubernetes-style `cpu: "0.5"` / `memory: "512Mi"` from a source repo's
# template.yaml used to reach the raw `int(cpu)` at container-create and
# abort with an opaque ValueError — leaving an orphaned mcp_api_keys row.
# Fail fast here with an actionable 400 instead, and write the canonical
# values back so the container labels + limits use them.
if config.resources is None:
config.resources = {}
try:
config.resources['cpu'] = normalize_cpu(
config.resources.get('cpu'), _get_default_resource('cpu')
)
config.resources['memory'] = normalize_memory(
config.resources.get('memory'), _get_default_resource('memory')
)
except ValueError as e:
raise HTTPException(status_code=400, detail=str(e))

# Phase: Agent-to-Agent Collaboration
# Generate agent-scoped MCP API key for Trinity MCP access
agent_mcp_key = None
Expand Down Expand Up @@ -734,11 +753,13 @@ async def create_agent_internal(
# is redirected off this tiny tmpfs via the TMPDIR env var).
tmpfs=AGENT_TMPFS_MOUNT,
network='trinity-agent-network',
mem_limit=config.resources.get('memory') or _get_default_resource('memory'),
# #1197: cpu/memory normalized + validated above (raises 400 on
# a bad template value), so these are guaranteed Docker-valid.
mem_limit=config.resources['memory'],
# #1126: nano_cpus (Linux CFS quota), NOT cpu_count — the latter
# is Windows-only in docker-py and left NanoCpus=0, so newly
# created agents never got a CPU limit on Linux.
nano_cpus=int(config.resources.get('cpu') or _get_default_resource('cpu')) * 1_000_000_000,
nano_cpus=int(config.resources['cpu']) * 1_000_000_000,
)

agent_status = get_agent_status_from_container(container)
Expand Down Expand Up @@ -841,6 +862,19 @@ async def create_agent_internal(
config.name,
cleanup_exc,
)
# #1197: the agent-scoped MCP key is minted before container
# creation, so a failure here would otherwise leave an orphaned
# mcp_api_keys row (one per failed attempt). Roll it back too.
if agent_mcp_key:
try:
db.delete_agent_mcp_api_key(config.name)
except Exception as cleanup_exc:
logger.warning(
"Failed to roll back MCP key for %s after creation "
"failure: %s",
config.name,
cleanup_exc,
)
logger.error(f"Failed to create agent {config.name}: {e}")
raise HTTPException(status_code=500, detail="Failed to create agent. Please try again.")
else:
Expand Down
9 changes: 9 additions & 0 deletions src/backend/services/agent_service/lifecycle.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,8 @@ async def wait_for_agent_ready(
PROHIBITED_CAPABILITIES,
AGENT_TMPFS_MOUNT,
AGENT_DEFAULT_TMPDIR,
normalize_cpu,
normalize_memory,
)


Expand Down Expand Up @@ -410,6 +412,13 @@ async def recreate_container_with_updated_config(agent_name: str, old_container,
cpu = labels.get("trinity.cpu") or system_defaults["cpu"]
memory = labels.get("trinity.memory") or system_defaults["memory"]

# #1197: validate/normalize before they reach Docker (int(cpu) NanoCpus /
# mem_limit). A stale label or DB override carrying a non-integer cpu or a
# Kubernetes-style memory would otherwise crash recreate with an opaque
# ValueError; fail with a clear message instead.
cpu = normalize_cpu(cpu, system_defaults["cpu"])
memory = normalize_memory(memory, system_defaults["memory"])

# Update labels with new resource limits for future reference
labels["trinity.cpu"] = cpu
labels["trinity.memory"] = memory
Expand Down
6 changes: 4 additions & 2 deletions src/backend/services/system_agent_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
from services.docker_utils import container_reload, container_start, containers_run
from services.settings_service import get_anthropic_api_key
from services.agent_service.lifecycle import FULL_CAPABILITIES, AGENT_TMPFS_MOUNT, AGENT_DEFAULT_TMPDIR
from services.agent_service.capabilities import normalize_cpu, normalize_memory
from utils.helpers import utc_now_iso

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -243,9 +244,10 @@ async def _create_system_agent(self) -> dict:
volumes=volumes,
environment=env_vars,
labels=labels,
mem_limit=resources.get("memory", "8g"),
# #1197: normalize/validate before Docker (int(cpu) NanoCpus / mem_limit).
mem_limit=normalize_memory(resources.get("memory"), "8g"),
# #1126: nano_cpus (Linux CFS quota), NOT cpu_count (Windows-only → NanoCpus=0).
nano_cpus=int(resources.get("cpu", "4")) * 1_000_000_000,
nano_cpus=int(normalize_cpu(resources.get("cpu"), "4")) * 1_000_000_000,
restart_policy={"Name": "unless-stopped"}, # Auto-restart on failure
# Always apply AppArmor for additional sandboxing
security_opt=['apparmor:docker-default'],
Expand Down
96 changes: 96 additions & 0 deletions tests/unit/test_resource_normalization.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
"""Resource normalization for agent container limits (#1197).

A GitHub source repo's template.yaml carrying a fractional / Kubernetes-style
resources block (``cpu: "0.5"``, ``memory: "512Mi"``) used to abort agent
creation deep in container-create with an opaque
``ValueError: invalid literal for int() with base 10: '0.5'``. These tests pin
``normalize_cpu`` / ``normalize_memory`` — the guards now applied at all three
container-create sites — so the failure surfaces early with an actionable
message and the allowed value set can't silently drift.

Loaded by file path (stdlib-only) so the test doesn't drag the
docker / fastapi / database transitive imports of the agent_service package.
"""
from __future__ import annotations

import importlib.util
from pathlib import Path

import pytest

_CAPS_PATH = (
Path(__file__).resolve().parent.parent.parent
/ "src" / "backend" / "services" / "agent_service" / "capabilities.py"
)


def _load():
spec = importlib.util.spec_from_file_location("caps_resnorm_under_test", _CAPS_PATH)
module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(module)
return module


caps = _load()


# --- CPU ----------------------------------------------------------------

@pytest.mark.parametrize("value", ["1", "2", "4", "8", "16", 4, 16])
def test_normalize_cpu_accepts_valid(value):
assert caps.normalize_cpu(value, "2") == str(value)


def test_normalize_cpu_falls_back_to_default_on_empty():
assert caps.normalize_cpu(None, "2") == "2"
assert caps.normalize_cpu("", "4") == "4"


@pytest.mark.parametrize("bad", ["0.5", "0", "3", "32", "100m", "half"])
def test_normalize_cpu_rejects_invalid(bad):
with pytest.raises(ValueError) as ei:
caps.normalize_cpu(bad, "2")
msg = str(ei.value)
assert bad.strip() in msg # echoes the offending value
assert "must be one of" in msg # actionable


def test_normalize_cpu_is_int_castable_after_normalize():
# The whole point: every value normalize_cpu returns is int()-castable, so
# the `int(cpu) * 1_000_000_000` NanoCpus line can never raise (#1197).
for v in caps.VALID_CPU:
assert isinstance(int(caps.normalize_cpu(v, "2")), int)


# --- Memory -------------------------------------------------------------

@pytest.mark.parametrize("value", ["1g", "2g", "4g", "8g", "16g", "32g"])
def test_normalize_memory_accepts_valid(value):
assert caps.normalize_memory(value, "4g") == value


def test_normalize_memory_case_folds():
assert caps.normalize_memory("4G", "4g") == "4g"
assert caps.normalize_memory(" 8G ", "4g") == "8g"


def test_normalize_memory_falls_back_to_default_on_empty():
assert caps.normalize_memory(None, "4g") == "4g"
assert caps.normalize_memory("", "8g") == "8g"


@pytest.mark.parametrize("bad", ["512Mi", "512m", "4", "4gb", "0.5g", "huge"])
def test_normalize_memory_rejects_invalid(bad):
with pytest.raises(ValueError) as ei:
caps.normalize_memory(bad, "4g")
msg = str(ei.value)
assert "must be one of" in msg


# --- drift guard --------------------------------------------------------

def test_value_sets_match_settings_router():
"""The canonical sets here must match what the admin defaults endpoint
accepts (routers/settings.py reads VALID_CPU/VALID_MEMORY from here)."""
assert caps.VALID_CPU == ("1", "2", "4", "8", "16")
assert caps.VALID_MEMORY == ("1g", "2g", "4g", "8g", "16g", "32g")
Loading