From 465e2158b5e70fcb67e79fb0641e50794bd34a8b Mon Sep 17 00:00:00 2001 From: Oleksii Dolhov Date: Tue, 16 Jun 2026 13:13:12 +0300 Subject: [PATCH] fix(agents): validate template cpu/memory before container create (#1197) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A GitHub source repo whose template.yaml declares a fractional/Kubernetes-style resources block (cpu: "0.5", memory: "512Mi") aborted agent creation deep in container-create with an opaque `ValueError: invalid literal for int() with base 10: '0.5'`, after the MCP key was already minted — leaving an orphaned mcp_api_keys row per attempt (#1126/#1128 added the unguarded int(cpu) at three sites). - Add normalize_cpu/normalize_memory + canonical VALID_CPU/VALID_MEMORY in services/agent_service/capabilities.py (stdlib-only, the anti-drift home for container spec). routers/settings.py now imports these instead of duplicating the lists, so the API and the create paths can't drift. - crud.py create: normalize/validate config.resources BEFORE any side effect and raise a clear HTTP 400 on invalid input; write canonical values back so labels + limits use them. Roll back the agent-scoped MCP key in the failure path so a failed create leaves no orphan row. - lifecycle.py recreate + system_agent_service.py: same guard at their nano_cpus/mem_limit sites. - tests/unit/test_resource_normalization.py: pins the helpers (valid set, fractional/k8s rejection with actionable message, case-folding, default fallback, int()-castability, drift guard). Related to #1197 --- src/backend/routers/settings.py | 7 +- .../services/agent_service/capabilities.py | 48 ++++++++++ src/backend/services/agent_service/crud.py | 40 +++++++- .../services/agent_service/lifecycle.py | 9 ++ src/backend/services/system_agent_service.py | 6 +- tests/unit/test_resource_normalization.py | 96 +++++++++++++++++++ 6 files changed, 199 insertions(+), 7 deletions(-) create mode 100644 tests/unit/test_resource_normalization.py diff --git a/src/backend/routers/settings.py b/src/backend/routers/settings.py index 73fb9735..44f6d61d 100644 --- a/src/backend/routers/settings.py +++ b/src/backend/routers/settings.py @@ -1263,8 +1263,11 @@ async def update_agent_quotas( # Agent Default Resources (RES-001) # ============================================================================ -VALID_CPU_VALUES = ["1", "2", "4", "8", "16"] -VALID_MEMORY_VALUES = ["1g", "2g", "4g", "8g", "16g", "32g"] +# Canonical allowed values live in the container-spec module so the admin +# defaults endpoint and the agent create/recreate paths can't drift (#1197). +from services.agent_service.capabilities import VALID_CPU, VALID_MEMORY +VALID_CPU_VALUES = list(VALID_CPU) +VALID_MEMORY_VALUES = list(VALID_MEMORY) @router.get("/agent-defaults/resources") diff --git a/src/backend/services/agent_service/capabilities.py b/src/backend/services/agent_service/capabilities.py index 5cd09a42..13824105 100644 --- a/src/backend/services/agent_service/capabilities.py +++ b/src/backend/services/agent_service/capabilities.py @@ -84,3 +84,51 @@ # The directory is created (writable by UID 1000) at container start by # docker/base-image/startup.sh, so existing agents pick it up on restart. AGENT_DEFAULT_TMPDIR: str = '/home/developer/.tmp' + + +# Container resource limits (#1197) +# ----------------------------------------------------------------------------- +# Canonical allowed values for the per-agent CPU / memory limits, shared by the +# admin defaults endpoint (routers/settings.py) and the three container-create +# sites (crud.py create, lifecycle.py recreate, system_agent_service.py). Kept +# here — stdlib-only, import-light — so the value set has ONE home and the +# create paths can't drift from what the API accepts. +# +# CPU is an integer processor count fed to Docker's NanoCpus (#1126); memory is +# a Docker memory string fed to mem_limit. A template.yaml carrying a fractional +# or Kubernetes-style value (cpu: "0.5", memory: "512Mi") is rejected up front +# with an actionable message instead of crashing deep in container creation on a +# raw int() / an invalid Docker mem string (#1197). +VALID_CPU: tuple[str, ...] = ("1", "2", "4", "8", "16") +VALID_MEMORY: tuple[str, ...] = ("1g", "2g", "4g", "8g", "16g", "32g") + + +def normalize_cpu(value, default) -> str: + """Validate/normalize a CPU value against VALID_CPU. + + ``value`` is the template/config value (may be None/empty); ``default`` is + the system fallback (already admin-validated). Returns the canonical string + or raises ``ValueError`` with an actionable message. + """ + cpu = str(value if value not in (None, "") else default).strip() + if cpu not in VALID_CPU: + raise ValueError( + f"Invalid cpu '{cpu}': must be one of {', '.join(VALID_CPU)} " + f"(integer processor count — not a fractional or Kubernetes-style value)" + ) + return cpu + + +def normalize_memory(value, default) -> str: + """Validate/normalize a memory value against VALID_MEMORY (Docker form). + + Case-folds so ``4G`` → ``4g``. Returns the canonical string or raises + ``ValueError`` with an actionable message. + """ + mem = str(value if value not in (None, "") else default).strip().lower() + if mem not in VALID_MEMORY: + raise ValueError( + f"Invalid memory '{mem}': must be one of {', '.join(VALID_MEMORY)} " + f"(Docker form like '4g' — not a Kubernetes-style value like '512Mi')" + ) + return mem diff --git a/src/backend/services/agent_service/crud.py b/src/backend/services/agent_service/crud.py index 8d9ca8b8..1b0f02bc 100644 --- a/src/backend/services/agent_service/crud.py +++ b/src/backend/services/agent_service/crud.py @@ -35,7 +35,7 @@ from utils.helpers import sanitize_agent_name, utc_now_iso from .helpers import validate_base_image from .lifecycle import RESTRICTED_CAPABILITIES, FULL_CAPABILITIES -from .capabilities import AGENT_TMPFS_MOUNT, AGENT_DEFAULT_TMPDIR +from .capabilities import AGENT_TMPFS_MOUNT, AGENT_DEFAULT_TMPDIR, normalize_cpu, normalize_memory logger = logging.getLogger(__name__) @@ -453,6 +453,25 @@ async def create_agent_internal( if generated_files: cred_files_volume = {str(cred_files_dir): {'bind': '/generated-creds', 'mode': 'ro'}} + # #1197: validate/normalize template resource fields against the allowed + # set BEFORE any side effects (MCP key, subscription, container). A + # Kubernetes-style `cpu: "0.5"` / `memory: "512Mi"` from a source repo's + # template.yaml used to reach the raw `int(cpu)` at container-create and + # abort with an opaque ValueError — leaving an orphaned mcp_api_keys row. + # Fail fast here with an actionable 400 instead, and write the canonical + # values back so the container labels + limits use them. + if config.resources is None: + config.resources = {} + try: + config.resources['cpu'] = normalize_cpu( + config.resources.get('cpu'), _get_default_resource('cpu') + ) + config.resources['memory'] = normalize_memory( + config.resources.get('memory'), _get_default_resource('memory') + ) + except ValueError as e: + raise HTTPException(status_code=400, detail=str(e)) + # Phase: Agent-to-Agent Collaboration # Generate agent-scoped MCP API key for Trinity MCP access agent_mcp_key = None @@ -734,11 +753,13 @@ async def create_agent_internal( # is redirected off this tiny tmpfs via the TMPDIR env var). tmpfs=AGENT_TMPFS_MOUNT, network='trinity-agent-network', - mem_limit=config.resources.get('memory') or _get_default_resource('memory'), + # #1197: cpu/memory normalized + validated above (raises 400 on + # a bad template value), so these are guaranteed Docker-valid. + mem_limit=config.resources['memory'], # #1126: nano_cpus (Linux CFS quota), NOT cpu_count — the latter # is Windows-only in docker-py and left NanoCpus=0, so newly # created agents never got a CPU limit on Linux. - nano_cpus=int(config.resources.get('cpu') or _get_default_resource('cpu')) * 1_000_000_000, + nano_cpus=int(config.resources['cpu']) * 1_000_000_000, ) agent_status = get_agent_status_from_container(container) @@ -841,6 +862,19 @@ async def create_agent_internal( config.name, cleanup_exc, ) + # #1197: the agent-scoped MCP key is minted before container + # creation, so a failure here would otherwise leave an orphaned + # mcp_api_keys row (one per failed attempt). Roll it back too. + if agent_mcp_key: + try: + db.delete_agent_mcp_api_key(config.name) + except Exception as cleanup_exc: + logger.warning( + "Failed to roll back MCP key for %s after creation " + "failure: %s", + config.name, + cleanup_exc, + ) logger.error(f"Failed to create agent {config.name}: {e}") raise HTTPException(status_code=500, detail="Failed to create agent. Please try again.") else: diff --git a/src/backend/services/agent_service/lifecycle.py b/src/backend/services/agent_service/lifecycle.py index 859f9d07..24419daa 100644 --- a/src/backend/services/agent_service/lifecycle.py +++ b/src/backend/services/agent_service/lifecycle.py @@ -96,6 +96,8 @@ async def wait_for_agent_ready( PROHIBITED_CAPABILITIES, AGENT_TMPFS_MOUNT, AGENT_DEFAULT_TMPDIR, + normalize_cpu, + normalize_memory, ) @@ -410,6 +412,13 @@ async def recreate_container_with_updated_config(agent_name: str, old_container, cpu = labels.get("trinity.cpu") or system_defaults["cpu"] memory = labels.get("trinity.memory") or system_defaults["memory"] + # #1197: validate/normalize before they reach Docker (int(cpu) NanoCpus / + # mem_limit). A stale label or DB override carrying a non-integer cpu or a + # Kubernetes-style memory would otherwise crash recreate with an opaque + # ValueError; fail with a clear message instead. + cpu = normalize_cpu(cpu, system_defaults["cpu"]) + memory = normalize_memory(memory, system_defaults["memory"]) + # Update labels with new resource limits for future reference labels["trinity.cpu"] = cpu labels["trinity.memory"] = memory diff --git a/src/backend/services/system_agent_service.py b/src/backend/services/system_agent_service.py index 6648881e..e3df6c04 100644 --- a/src/backend/services/system_agent_service.py +++ b/src/backend/services/system_agent_service.py @@ -23,6 +23,7 @@ from services.docker_utils import container_reload, container_start, containers_run from services.settings_service import get_anthropic_api_key from services.agent_service.lifecycle import FULL_CAPABILITIES, AGENT_TMPFS_MOUNT, AGENT_DEFAULT_TMPDIR +from services.agent_service.capabilities import normalize_cpu, normalize_memory from utils.helpers import utc_now_iso logger = logging.getLogger(__name__) @@ -243,9 +244,10 @@ async def _create_system_agent(self) -> dict: volumes=volumes, environment=env_vars, labels=labels, - mem_limit=resources.get("memory", "8g"), + # #1197: normalize/validate before Docker (int(cpu) NanoCpus / mem_limit). + mem_limit=normalize_memory(resources.get("memory"), "8g"), # #1126: nano_cpus (Linux CFS quota), NOT cpu_count (Windows-only → NanoCpus=0). - nano_cpus=int(resources.get("cpu", "4")) * 1_000_000_000, + nano_cpus=int(normalize_cpu(resources.get("cpu"), "4")) * 1_000_000_000, restart_policy={"Name": "unless-stopped"}, # Auto-restart on failure # Always apply AppArmor for additional sandboxing security_opt=['apparmor:docker-default'], diff --git a/tests/unit/test_resource_normalization.py b/tests/unit/test_resource_normalization.py new file mode 100644 index 00000000..932d1046 --- /dev/null +++ b/tests/unit/test_resource_normalization.py @@ -0,0 +1,96 @@ +"""Resource normalization for agent container limits (#1197). + +A GitHub source repo's template.yaml carrying a fractional / Kubernetes-style +resources block (``cpu: "0.5"``, ``memory: "512Mi"``) used to abort agent +creation deep in container-create with an opaque +``ValueError: invalid literal for int() with base 10: '0.5'``. These tests pin +``normalize_cpu`` / ``normalize_memory`` — the guards now applied at all three +container-create sites — so the failure surfaces early with an actionable +message and the allowed value set can't silently drift. + +Loaded by file path (stdlib-only) so the test doesn't drag the +docker / fastapi / database transitive imports of the agent_service package. +""" +from __future__ import annotations + +import importlib.util +from pathlib import Path + +import pytest + +_CAPS_PATH = ( + Path(__file__).resolve().parent.parent.parent + / "src" / "backend" / "services" / "agent_service" / "capabilities.py" +) + + +def _load(): + spec = importlib.util.spec_from_file_location("caps_resnorm_under_test", _CAPS_PATH) + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + return module + + +caps = _load() + + +# --- CPU ---------------------------------------------------------------- + +@pytest.mark.parametrize("value", ["1", "2", "4", "8", "16", 4, 16]) +def test_normalize_cpu_accepts_valid(value): + assert caps.normalize_cpu(value, "2") == str(value) + + +def test_normalize_cpu_falls_back_to_default_on_empty(): + assert caps.normalize_cpu(None, "2") == "2" + assert caps.normalize_cpu("", "4") == "4" + + +@pytest.mark.parametrize("bad", ["0.5", "0", "3", "32", "100m", "half"]) +def test_normalize_cpu_rejects_invalid(bad): + with pytest.raises(ValueError) as ei: + caps.normalize_cpu(bad, "2") + msg = str(ei.value) + assert bad.strip() in msg # echoes the offending value + assert "must be one of" in msg # actionable + + +def test_normalize_cpu_is_int_castable_after_normalize(): + # The whole point: every value normalize_cpu returns is int()-castable, so + # the `int(cpu) * 1_000_000_000` NanoCpus line can never raise (#1197). + for v in caps.VALID_CPU: + assert isinstance(int(caps.normalize_cpu(v, "2")), int) + + +# --- Memory ------------------------------------------------------------- + +@pytest.mark.parametrize("value", ["1g", "2g", "4g", "8g", "16g", "32g"]) +def test_normalize_memory_accepts_valid(value): + assert caps.normalize_memory(value, "4g") == value + + +def test_normalize_memory_case_folds(): + assert caps.normalize_memory("4G", "4g") == "4g" + assert caps.normalize_memory(" 8G ", "4g") == "8g" + + +def test_normalize_memory_falls_back_to_default_on_empty(): + assert caps.normalize_memory(None, "4g") == "4g" + assert caps.normalize_memory("", "8g") == "8g" + + +@pytest.mark.parametrize("bad", ["512Mi", "512m", "4", "4gb", "0.5g", "huge"]) +def test_normalize_memory_rejects_invalid(bad): + with pytest.raises(ValueError) as ei: + caps.normalize_memory(bad, "4g") + msg = str(ei.value) + assert "must be one of" in msg + + +# --- drift guard -------------------------------------------------------- + +def test_value_sets_match_settings_router(): + """The canonical sets here must match what the admin defaults endpoint + accepts (routers/settings.py reads VALID_CPU/VALID_MEMORY from here).""" + assert caps.VALID_CPU == ("1", "2", "4", "8", "16") + assert caps.VALID_MEMORY == ("1g", "2g", "4g", "8g", "16g", "32g")