datalayer · echarles · Jun 8, 2026 · Jun 8, 2026 · Jun 8, 2026 · Jun 8, 2026
diff --git a/API.md b/API.md
diff --git a/CLAUDE.md b/CLAUDE.md
diff --git a/README.md b/README.md
@@ -2,7 +2,7 @@
 
 [![Become a Sponsor](https://img.shields.io/static/v1?label=Become%20a%20Sponsor&message=%E2%9D%A4&logo=GitHub&style=flat&color=1ABC9C)](https://github.com/sponsors/datalayer)
 
-# ☰ Datalayer Core
+# ☰ ☢️ Datalayer Core
 
 <p align="center">
   <strong>Python and Typescript libraries for Datalayer</strong>
@@ -118,7 +118,7 @@ datalayer runtime list
 datalayer runtime create ai-env --given-name my-runtime-123
 
 # Execute a script in a runtime
-datalayer runtime exec my-script.py --runtime <runtime-id>
+datalayer runtime exec my-script.py --agent <agent-id>
 
 # Create a snapshot from a runtime but do not terminate the runtime
 datalayer snapshots create <pod-name> my-snapshot 'AI work!' False
@@ -151,6 +151,29 @@ datalayer usage team-allocate-member --team-uid <team_uid> --member-uid <member_
 datalayer usage team-revoke-member --team-uid <team_uid> --member-uid <member_uid> --amount 5
 ```
 
+### 5. Evals CLI (Multi-Agentspec)
+
+Use comma-separated agentspec ids to create one experiment per agentspec variant:
+
+```bash
+# Creates one experiment per agentspec in the list
+datalayer evals experiments create my-exp \
+  --evalset-id <evalset_id> \
+  --agent-spec-ids example-evals,example-evals-nocodemode,example-custom
+```
+
+Generate a comparison report:
+
+```bash
+datalayer evals report <evalset_id> --run-limit 50 --export
+```
+
+How to interpret grouped comparisons in the report:
+
+- `Within-Agentspec Pairwise Latest-Pass Deltas`: compares experiments using the same agentspec id.
+- `Cross-Agentspec Pairwise Latest-Pass Deltas`: compares experiments using different agentspec ids.
+- Pairwise sections compare every pair of the selected experiments, so they cover any number of agentspecs, not just two.
+
 ## Examples
 
 ### Python Examples

diff --git a/datalayer_core/__version__.py b/datalayer_core/__version__.py
@@ -3,4 +3,4 @@
 
 """Datalayer Core version information."""
 
-__version__ = "1.1.24"
+__version__ = "1.1.38"
diff --git a/datalayer_core/agents/__init__.py b/datalayer_core/agents/__init__.py
@@ -0,0 +1,40 @@
+# Copyright (c) 2023-2025 Datalayer, Inc.
+# Distributed under the terms of the Modified BSD License.
+
+"""Runtime and agent execution helpers."""
+
+from datalayer_core.agents.agent_cloud import RuntimeService
+from datalayer_core.agents.agent_local import (
+	DEFAULT_LOCAL_AGENT_NAME,
+	DEFAULT_LOCAL_HOST,
+	DEFAULT_LOCAL_LOG_LEVEL,
+	DEFAULT_LOCAL_PROTOCOL,
+	LocalAgentRuntime,
+	ensure_local_agent,
+	start_local_agent_runtime,
+	terminate_local_agent_runtime,
+)
+from datalayer_core.agents.utils import (
+	compute_time_reservation_minutes,
+	create_cloud_agent_runtime,
+	resolve_environment_burning_rate,
+	teardown_agent_execution_resources,
+	terminate_cloud_agent_runtime,
+)
+
+# Public API of ``datalayer_core.agents``. Entries are indented with four
+# spaces (PEP 8), matching the rest of the package.
+__all__ = [
+    # Cloud runtime service.
+    "RuntimeService",
+    # Local agent runtime and its defaults.
+    "LocalAgentRuntime",
+    "DEFAULT_LOCAL_AGENT_NAME",
+    "DEFAULT_LOCAL_HOST",
+    "DEFAULT_LOCAL_LOG_LEVEL",
+    "DEFAULT_LOCAL_PROTOCOL",
+    "ensure_local_agent",
+    "start_local_agent_runtime",
+    "terminate_local_agent_runtime",
+    # Shared helpers from ``datalayer_core.agents.utils``.
+    "resolve_environment_burning_rate",
+    "compute_time_reservation_minutes",
+    "create_cloud_agent_runtime",
+    "terminate_cloud_agent_runtime",
+    "teardown_agent_execution_resources",
+]
diff --git a/datalayer_core/runtimes/runtime.py → datalayer_core/agents/agent_cloud.py b/datalayer_core/runtimes/runtime.py → datalayer_core/agents/agent_cloud.py
@@ -19,9 +19,9 @@
 from datalayer_core.mixins.sandbox_snapshots import SandboxSnapshotsMixin
 from datalayer_core.mixins.runtimes import RuntimesMixin
 from datalayer_core.models import ExecutionResponse
+from datalayer_core.models.sandbox_snapshot import SandboxSnapshotModel
 from datalayer_core.models.runtime import RuntimeModel
-from datalayer_core.runtimes.sandbox_snapshot import (
-    SandboxSnapshotModel,
+from datalayer_core.sandboxes.code_sandbox_snapshots import (
     as_code_sandbox_snapshots,
     create_snapshot,
 )
@@ -60,6 +60,7 @@ def __init__(
         run_url: str = DEFAULT_DATALAYER_RUN_URL,
         iam_url: Optional[str] = None,
         token: Optional[str] = None,
+        api_key: Optional[str] = None,
         pod_name: Optional[str] = None,
         ingress: Optional[str] = None,
         reservation_id: Optional[str] = None,
@@ -86,6 +87,8 @@ def __init__(
             Datalayer IAM server URL. If not provided, defaults to run_url.
         token : Optional[str]
             Authentication token (can also be set via DATALAYER_API_KEY env var).
+        api_key : Optional[str]
+            Authentication API key alias for ``token``.
         pod_name : Optional[str]
             Name of the pod running the runtime.
         ingress : Optional[str]
@@ -110,7 +113,7 @@ def __init__(
             time_reservation=time_reservation,
             run_url=run_url,
             iam_url=iam_url or run_url,
-            token=token,
+            token=token or api_key,
             external_token=None,
             pod_name=pod_name,
             ingress=ingress,

diff --git a/datalayer_core/runtimes/local.py → datalayer_core/agents/agent_local.py b/datalayer_core/runtimes/local.py → datalayer_core/agents/agent_local.py
@@ -1,3 +1,6 @@
+# Copyright (c) 2023-2025 Datalayer, Inc.
+# Distributed under the terms of the Modified BSD License.
+
 # Copyright (c) 2023-2026 Datalayer, Inc.
 # Distributed under the terms of the Modified BSD License.
 
@@ -122,13 +125,14 @@ def start_local_agent_runtime(
     protocol: str = DEFAULT_LOCAL_PROTOCOL,
     log_level: str = DEFAULT_LOCAL_LOG_LEVEL,
     wait: bool = True,
+    disable_tool_approvals: bool = False,
 ) -> LocalAgentRuntime:
     """Launch a local ``agent-runtimes`` server as a subprocess.
 
     Parameters
     ----------
     agent_spec_id : str
-        Agent spec id to boot the runtime with.
+        Agentspec id to boot the runtime with.
     agent_name : str
         Registered agent name/id served by the runtime.
     host : str
@@ -172,6 +176,8 @@ def start_local_agent_runtime(
         "--log-level",
         log_level,
     ]
+    if disable_tool_approvals:
+        command.append("--disable-tool-approvals")
 
     runtime_env, mapped_targets = build_agent_runtime_env()
     if mapped_targets:
@@ -238,6 +244,7 @@ def ensure_local_agent(
     enable_skills: bool = True,
     description: Optional[str] = None,
     timeout: int = 120,
+    disable_tool_approvals: bool = False,
 ) -> None:
     """Ensure a local agent with the expected transport is registered.
 
@@ -298,6 +305,7 @@ def ensure_local_agent(
         "agent_spec_id": agent_spec_id,
         "enable_skills": enable_skills,
         "tools": [],
+        "disableToolApprovals": disable_tool_approvals,
     }
     try:
         response = requests.post(
@@ -456,6 +464,129 @@ def extract_vercel_stream_text(raw: str) -> str:
     return "".join(text_parts).strip()
 
 
+def _coerce_usage_payload(candidate: Any) -> dict[str, Any]:
+    """Flatten a usage candidate into a single-level dictionary.
+
+    Parameters
+    ----------
+    candidate : Any
+        Value pulled from a stream event. Anything other than a non-empty
+        ``dict`` is treated as carrying no usage information.
+
+    Returns
+    -------
+    dict[str, Any]
+        An empty dict when ``candidate`` is unusable. When ``candidate`` holds
+        a non-empty nested ``"usage"`` dict, a copy of that nested dict
+        completed with the sibling keys of ``candidate`` (nested values win on
+        conflict). Otherwise a shallow copy of ``candidate``.
+    """
+    if not (isinstance(candidate, dict) and candidate):
+        return {}
+    inner = candidate.get("usage")
+    if not (isinstance(inner, dict) and inner):
+        return dict(candidate)
+    flattened = dict(inner)
+    for name, item in candidate.items():
+        if name != "usage":
+            # ``setdefault`` keeps the nested value when both levels define
+            # the same key.
+            flattened.setdefault(str(name), item)
+    return flattened
+
+
+def _usage_payload_score(payload: dict[str, Any]) -> int:
+    """Score a usage payload by how informative it looks.
+
+    Parameters
+    ----------
+    payload : dict[str, Any]
+        Flattened usage dictionary to rank.
+
+    Returns
+    -------
+    int
+        ``0`` for an empty payload. Otherwise the number of keys, plus 100
+        when any token-count key is present and plus 10 when any
+        credits-related key is present.
+    """
+    if not payload:
+        return 0
+    token_fields = (
+        "prompt_tokens",
+        "promptTokens",
+        "input_tokens",
+        "inputTokens",
+        "completion_tokens",
+        "completionTokens",
+        "output_tokens",
+        "outputTokens",
+        "total_tokens",
+        "totalTokens",
+        "tokens_total",
+        "token_total",
+    )
+    credit_fields = (
+        "credits_consumed",
+        "creditsConsumed",
+        "credits",
+        "total_credits",
+        "cost_credits",
+    )
+    # Token counts weigh far more than credits, which in turn weigh more
+    # than the raw number of keys.
+    token_bonus = 100 if any(name in payload for name in token_fields) else 0
+    credit_bonus = 10 if any(name in payload for name in credit_fields) else 0
+    return len(payload) + token_bonus + credit_bonus
+
+
+def extract_vercel_stream_usage(raw: str) -> dict[str, Any]:
+    """Extract best-effort pydantic usage metadata from a Vercel AI SSE stream.
+
+    Parameters
+    ----------
+    raw : str
+        Raw response text, scanned line by line for ``data: `` events.
+
+    Returns
+    -------
+    dict[str, Any]
+        The highest-scoring usage payload found across all events, or an
+        empty dict when no event carries usage information. On equal scores
+        the earliest candidate is kept.
+    """
+    prefix = "data: "
+    # Candidate locations, checked in this order for every event.
+    metadata_keys = ("pydantic_ai", "pydanticAI", "usage")
+    event_keys = ("pydantic_ai_usage", "pydantic_ai", "usage")
+
+    top_payload: dict[str, Any] = {}
+    top_score = 0
+    for entry in raw.splitlines():
+        if not entry.startswith(prefix):
+            continue
+        body = entry[len(prefix):].strip()
+        if body in ("", "[DONE]"):
+            continue
+        try:
+            event = json.loads(body)
+        except json.JSONDecodeError:
+            # Best-effort extraction: skip events that are not valid JSON.
+            continue
+        if not isinstance(event, dict):
+            continue
+
+        sources: list[Any] = []
+        metadata = event.get("messageMetadata")
+        if isinstance(metadata, dict):
+            sources.extend(metadata.get(key) for key in metadata_keys)
+        sources.extend(event.get(key) for key in event_keys)
+
+        for source in sources:
+            candidate = _coerce_usage_payload(source)
+            score = _usage_payload_score(candidate)
+            # Strict comparison keeps the first candidate on ties.
+            if score > top_score:
+                top_payload, top_score = candidate, score
+    return top_payload
+
+
+def _vercel_ai_error_message(raw: str) -> Optional[str]:
+    """Detect a non-stream error body returned with an HTTP 200 status.
+
+    The ``agent-runtimes`` server answers an unknown agent route with HTTP 200
+    and a JSON error body (for example
+    ``{"error": "Agent '...' not found", "message": "No agent registered ..."}``)
+    instead of an SSE stream. Such a body must NOT be treated as a successful
+    completion, otherwise route-candidate fallback stops at the first wrong
+    route and an empty answer is recorded.
+
+    Parameters
+    ----------
+    raw : str
+        Raw response body. A falsy value (including ``None``) is treated as
+        an empty body.
+
+    Returns
+    -------
+    Optional[str]
+        ``"Empty response body"`` for a blank body; the ``error`` field (or,
+        failing that, the ``message`` field) when the body is a JSON object
+        carrying one; ``None`` otherwise, i.e. for a genuine SSE stream or
+        for a body that is not a recognizable error payload.
+    """
+    text = (raw or "").strip()
+    if not text:
+        return "Empty response body"
+    # A genuine Vercel AI response is an SSE stream whose lines *start* with
+    # ``data:``. Matching on line starts rather than on a bare substring keeps
+    # a JSON error such as ``{"error": "invalid data: ..."}`` from being
+    # mistaken for a stream.
+    if any(line.lstrip().startswith("data:") for line in text.splitlines()):
+        return None
+    try:
+        payload = json.loads(text)
+    except json.JSONDecodeError:
+        # Neither SSE nor JSON: nothing here identifies it as an error.
+        return None
+    if isinstance(payload, dict):
+        error = payload.get("error") or payload.get("message")
+        if error:
+            return str(error)
+    return None
+
+
 def _post_vercel_ai_chat(
     *,
     endpoint: str,
@@ -528,13 +659,36 @@ def _post_vercel_ai_chat(
         }
 
     output_text = extract_vercel_stream_text(raw)
-    return {
+    usage = extract_vercel_stream_usage(raw)
+    if not output_text:
+        error_message = _vercel_ai_error_message(raw)
+        if error_message is not None:
+            message_text = (
+                f"{source_label} chat returned no output: {error_message}"
+            )
+            return {
+                "status": "failed",
+                "output": {"text": "", "raw_stream_excerpt": raw[:2000]},
+                "failure_cause": {
+                    "stage": "runtime_execution",
+                    "type": "runtime_agent_unavailable",
+                    "message": message_text,
+                    "detail_excerpt": raw[:2000] or message_text,
+                    "execution_url": endpoint,
+                },
+            }
+    output: dict[str, Any] = {
+        "text": output_text,
+        "raw_stream_excerpt": raw[:2000],
+    }
+    result: dict[str, Any] = {
         "status": "completed",
-        "output": {
-            "text": output_text,
-            "raw_stream_excerpt": raw[:2000],
-        },
+        "output": output,
     }
+    if usage:
+        output["pydantic_ai_usage"] = usage
+        result["usage"] = usage
+    return result
 
 
 def run_local_agent_chat(
@@ -606,7 +760,7 @@ def runtime_route_candidates(
 
     The ``agent-runtimes`` server inside a cloud runtime may register its agent
     under different names depending on how it was launched. Trying a few known
-    candidates (explicit agent name, agent spec id, pod name, then the default
+    candidates (explicit agent name, agentspec id, pod name, then the default
     route) makes cloud execution resilient.
     """
     candidates: list[str] = []

diff --git a/datalayer_core/runtimes/agent_runtime.py → datalayer_core/agents/utils.py b/datalayer_core/runtimes/agent_runtime.py → datalayer_core/agents/utils.py
@@ -1,3 +1,6 @@
+# Copyright (c) 2023-2025 Datalayer, Inc.
+# Distributed under the terms of the Modified BSD License.
+
 # Copyright (c) 2023-2026 Datalayer, Inc.
 # Distributed under the terms of the Modified BSD License.
 
@@ -144,9 +147,9 @@ def create_cloud_agent_runtime(
     name : Optional[str]
         Optional runtime name.
     agent_spec_id : Optional[str]
-        Registered agent spec id (ignored when ``agent_spec`` is provided).
+        Registered agentspec id (ignored when ``agent_spec`` is provided).
     agent_spec : Optional[dict[str, Any]]
-        Inline agent spec payload (takes precedence over ``agent_spec_id``).
+        Inline agentspec payload (takes precedence over ``agent_spec_id``).
     credits_limit : Optional[float]
         Target credits budget used to derive ``time_reservation`` when the
         latter is not supplied.
@@ -284,15 +287,15 @@ def teardown_agent_execution_resources(
 
     if target == "local":
         if local_base_url and token and local_agent_name:
-            from datalayer_core.runtimes.local import delete_local_agent
+            from datalayer_core.agents.agent_local import delete_local_agent
 
             result["local_agent_deleted"] = delete_local_agent(
                 base_url=local_base_url,
                 token=token,
                 agent_name=local_agent_name,
             )
         if local_runtime is not None:
-            from datalayer_core.runtimes.local import terminate_local_agent_runtime
+            from datalayer_core.agents.agent_local import terminate_local_agent_runtime
 
             terminate_local_agent_runtime(local_runtime)
             result["local_runtime_terminated"] = True
Original file line number	Diff line number	Diff line change
Expand Up		@@ -3,4 +3,4 @@

		"""Datalayer Core version information."""

		__version__ = "1.1.24"
		__version__ = "1.1.38"