diff --git a/API.md b/API.md deleted file mode 100644 index 8e1f5cf5..00000000 --- a/API.md +++ /dev/null @@ -1,724 +0,0 @@ -# Datalayer Core - DatalayerClient Documentation - -This document provides comprehensive examples for using the DatalayerClient. - -## Table of Contents - -- [Getting Started](#getting-started) - - [Installation](#installation) - - [Initialization](#initialization) - - [Handlers Pattern](#handlers-pattern) -- [Authentication](#authentication) -- [Runtime Management](#runtime-management) -- [Notebook & Document Management](#notebook--document-management) -- [Model Classes](#model-classes) - - [Runtime Model](#runtime-model) - - [Snapshot Model](#snapshot-model) - - [Notebook Model](#notebook-model) - - [Lexical Model](#lexical-model) - - [Space Model](#space-model) -- [Error Handling](#error-handling) -- [Best Practices](#best-practices) -- [Testing](#testing) - -## Getting Started - -### Installation - -```bash -npm install @datalayer/core -``` - -### Initialization - -The DatalayerClient provides a high-level, object-oriented interface for interacting with Datalayer services. 
- -#### Key Features - -- **Flat API**: All methods directly on `client.` (e.g., `client.createNotebook()`) -- **Handlers Pattern**: Inject platform-specific behavior without wrapping Client methods -- **Rich Models**: Returns model instances with methods, not just plain objects -- **Type Safety**: Full TypeScript support with proper interfaces - -#### Basic Initialization - -```typescript -import { DatalayerClient } from '@datalayer/core/client'; -import { DEFAULT_SERVICE_URLS } from '@datalayer/core/api/constants'; - -// Basic initialization with token -const client = new DatalayerClient({ - token: 'bearer-token-123', - iamRunUrl: DEFAULT_SERVICE_URLS.IAM, - runtimesRunUrl: DEFAULT_SERVICE_URLS.RUNTIMES, - spacerRunUrl: DEFAULT_SERVICE_URLS.SPACER -}); - -// Quick initialization with defaults -const client = new DatalayerClient({ - token: 'bearer-token-123' -}); -``` - -### Handlers Pattern - -Initialize with lifecycle handlers for cross-cutting concerns: - -```typescript -// Initialize with handlers for logging and error handling -const client = new DatalayerClient({ - token: 'bearer-token-123', - iamRunUrl: 'https://prod1.datalayer.run', - handlers: { - // Called before every Client method - beforeCall: async (methodName, args) => { - console.log(`[Client] Calling ${methodName}`, args); - }, - // Called after successful method execution - afterCall: async (methodName, result) => { - console.log(`[Client] ${methodName} completed`); - // Track analytics, update UI, etc. - }, - // Called when a method throws an error - onError: async (methodName, error) => { - console.error(`[Client] ${methodName} failed:`, error); - - // Platform-specific error handling - if (error.message.includes('Not authenticated')) { - // Show login prompt in your platform's UI - // e.g., vscode.window.showErrorMessage(...) 
- // or showAuthModal() in React - } - } - } -}); - -// Initialize for VS Code extension with platform-specific handlers -const vscodeClient = new DatalayerClient({ - token: 'bearer-token-123', - handlers: { - onError: async (methodName, error) => { - // VS Code specific error handling - const vscode = require('vscode'); - vscode.window.showErrorMessage(`Datalayer: ${error.message}`); - } - } -}); - -// Initialize for React app with UI handlers -const reactClient = new DatalayerClient({ - token: 'bearer-token-123', - handlers: { - onError: async (methodName, error) => { - // React specific error handling - toast.error(`Error: ${error.message}`); - }, - beforeCall: async (methodName, args) => { - setLoading(true); - }, - afterCall: async (methodName, result) => { - setLoading(false); - } - } -}); -``` - -## Authentication - -```typescript -// Get current user profile (whoami) -const user = await client.whoami(); -console.log('User ID:', user.uid); -console.log('Email:', user.email); -console.log('Roles:', user.roles); - -// Login with token -await client.login('new-bearer-token'); - -// Get credits information -const credits = await client.getCredits(); -console.log('Available credits:', credits.balance); - -// Check IAM service health -const health = await client.checkIAMHealth(); -console.log('IAM service status:', health.status); - -// Logout -await client.logout(); -``` - -## Runtime Management - -```typescript -// List available environments -const environments = await client.listEnvironments(); -environments.forEach(env => { - console.log(`${env.name}: ${env.type}`); - console.log('Resources:', env.resources); -}); - -// Ensure runtime (creates or reuses existing) -const runtime = await client.ensureRuntime( - 'ai-agents-env', // environment name - 50, // credits limit - true, // wait for ready - 60000, // max wait time (ms) - true, // reuse existing - 'snapshot-id' // optional snapshot to restore from -); - -console.log('Runtime ready:', runtime.podName); 
-console.log('Jupyter URL:', runtime.jupyterUrl); - -// Create a specific runtime -const newRuntime = await client.createRuntime( - 'python-gpu-env', // environment name - 'notebook', // type - 'ml-training-gpu', // given name - 100 // credits limit -); - -// Wait for runtime to be ready -await newRuntime.waitUntilReady(60000); // 60 seconds timeout -console.log('Runtime is ready!'); - -// Check runtime state -const state = await newRuntime.getState(); -console.log('Current state:', state); - -// Create a snapshot -const snapshot = await client.createSnapshot( - newRuntime.podName, - 'checkpoint-before-training', - 'Saving model state before training', - false // don't stop runtime after snapshot -); -console.log('Snapshot created:', snapshot.uid); - -// List all runtimes -const runtimes = await client.listRuntimes(); -runtimes.forEach(r => { - console.log(`${r.podName}: ${r.givenName} (${r.environmentName})`); -}); - -// Get specific runtime -const specificRuntime = await client.getRuntime('pod-name-123'); -console.log('Runtime details:', specificRuntime.givenName); - -// List snapshots -const snapshots = await client.listSnapshots(); -snapshots.forEach(s => { - console.log(`${s.name}: ${s.description} (${s.status})`); -}); - -// Get specific snapshot -const specificSnapshot = await client.getSnapshot('snapshot-id-123'); -console.log('Snapshot size:', await specificSnapshot.getSize()); - -// Delete resources -await client.deleteRuntime(runtime.podName); -await client.deleteSnapshot(snapshot.uid); -console.log('Resources cleaned up'); - -// Check runtimes service health -const runtimesHealth = await client.checkRuntimesHealth(); -console.log('Runtimes service status:', runtimesHealth.status); -``` - -## Notebook & Document Management - -```typescript -// Get user's spaces -const spaces = await client.getMySpaces(); -console.log('Available spaces:', spaces.length); - -const mySpace = spaces[0]; -console.log('Space:', mySpace.uid); - -// Get items in space -const 
items = await client.getSpaceItems(mySpace.uid); -console.log('Items in space:', items.length); - -// Create a space -const newSpace = await client.createSpace( - 'Analysis Workspace', // name - 'Data analysis workspace', // description - 'workspace', // variant - 'analysis-ws', // space handle - 'org-id-123', // organization ID - '', // seed space ID - false // is public -); - -// Create a notebook -const notebook = await client.createNotebook( - mySpace.uid, // space ID - 'Analysis Notebook', // name - 'Data analysis for Q4' // description - // optional: file (File | Blob) -); - -console.log('Notebook created:', notebook.id); -console.log('Path:', notebook.path); - -// Get notebook details -const notebookDetails = await client.getNotebook(notebook.id); -console.log('Notebook UID:', notebookDetails.uid); - -// Update notebook -const updatedNotebook = await client.updateNotebook( - notebook.id, - 'Q4 Analysis - Final', // new name - 'Final analysis for Q4 2024' // new description -); - -// Get notebook content -const content = await client.getNotebookContent(notebook.id, { - includeOutputs: true, - format: 'json' -}); -console.log('Notebook cells:', content.cells.length); - -// Create a lexical document -const document = await client.createLexical( - mySpace.uid, // space ID - 'Project Notes', // name - 'Implementation notes' // description - // optional: file (File | Blob) -); - -console.log('Document created:', document.id); - -// Get lexical document -const lexicalDetails = await client.getLexical(document.id); - -// Update lexical document -const updatedDocument = await client.updateLexical( - document.id, - 'Project Notes v2', // new name - 'Updated notes' // new description -); - -// Get lexical content -const lexicalContent = await client.getLexicalContent(document.id, { - format: 'json' -}); -console.log('Document content:', lexicalContent); - -// Prefetch content for multiple items (caching) -const itemIds = [notebook.id, document.id]; -await 
client.prefetchContent(itemIds, 'notebook'); -await client.prefetchContent([document.id], 'lexical'); - -// Clear content cache -await client.clearContentCache(notebook.id, 'notebook'); -await client.clearContentCache(); // clear all cache - -// Delete items -try { - await client.deleteSpaceItem(notebook.id); - await client.deleteSpaceItem(document.id); - console.log('Items deleted successfully'); -} catch (error) { - console.error('Failed to delete items:', error.message); -} - -// Check spacer service health -const spacerHealth = await client.checkSpacerHealth(); -console.log('Spacer service status:', spacerHealth.status); -``` - -## Model Classes - -The DatalayerClient provides rich model classes that wrap API responses with convenient methods: - -### Runtime Model - -```typescript -const runtime = await client.createRuntime( - 'python-gpu-env', - 'notebook', - 'ml-training', - 100 -); - -// Static properties (no API calls) -console.log(runtime.podName); // Unique pod identifier -console.log(runtime.environmentName); // Environment being used -console.log(runtime.jupyterUrl); // Jupyter server URL -console.log(runtime.jupyterToken); // Authentication token -console.log(runtime.burningRate); // Credits per hour -console.log(runtime.givenName); // User-friendly name -console.log(runtime.createdAt); // Creation timestamp - -// Dynamic methods (fetch fresh data) -const state = await runtime.getState(); // Current state -const isRunning = await runtime.isRunning(); // Check if running -const isStarting = await runtime.isStarting(); // Check if starting - -// Actions -await runtime.waitUntilReady(30000); // Wait for ready state -const snapshot = await runtime.createSnapshot('checkpoint', 'Before changes'); -await runtime.delete(); // Delete runtime -``` - -### Snapshot Model - -```typescript -const snapshot = await client.createSnapshot( - runtime.podName, - 'training-checkpoint', - 'After epoch 10' -); - -// Static properties -console.log(snapshot.uid); // Unique 
identifier -console.log(snapshot.name); // Snapshot name -console.log(snapshot.description); // Description -console.log(snapshot.environment); // Environment name -console.log(snapshot.format); // Snapshot format -console.log(snapshot.metadata); // Custom metadata -console.log(snapshot.updatedAt); // Last update time - -// Dynamic methods -const status = await snapshot.getStatus(); // Current status -const size = await snapshot.getSize(); // Size in bytes -const metadata = await snapshot.getLatestMetadata(); // Fresh metadata - -// Actions -const newRuntime = await snapshot.restore({ // Create runtime from snapshot - given_name: 'restored-runtime', - credits_limit: 100 -}); -await snapshot.delete(); // Delete snapshot -``` - -### Notebook Model - -```typescript -const notebook = await client.createNotebook( - 'space-uid', - 'ML Experiments', - 'Machine learning experiments notebook' -); - -// Static properties (instant access, no API calls) -console.log(notebook.id); // Notebook ID -console.log(notebook.uid); // Unique identifier -console.log(notebook.path); // File path -console.log(notebook.spaceId); // Parent space -console.log(notebook.ownerId); // Owner user ID -console.log(notebook.createdAt); // Creation date -console.log(notebook.version); // Version number -console.log(notebook.metadata); // Metadata object - -// Dynamic methods (fetch fresh data from API) -const name = await notebook.getName(); // Current name -const content = await notebook.getContent(); // Notebook content -const kernelSpec = await notebook.getKernelSpec(); // Kernel specification -const updatedAt = await notebook.getUpdatedAt(); // Last update time - -// Actions -const updated = await notebook.update({ - name: 'ML Experiments - Final', - description: 'Completed experiments' -}); -await notebook.delete(); // Delete notebook - -// After deletion, accessing properties will throw errors -try { - await notebook.getName(); -} catch (error) { - console.log('Notebook has been deleted'); -} 
-``` - -### Lexical Model - -```typescript -const document = await client.createLexical( - 'space-uid', - 'Architecture Design', - 'System architecture documentation' -); - -// Static properties -console.log(document.id); // Document ID -console.log(document.uid); // Unique identifier -console.log(document.spaceId); // Parent space -console.log(document.ownerId); // Owner ID -console.log(document.createdAt); // Creation date - -// Dynamic methods -const name = await document.getName(); // Current name -const content = await document.getContent(); // Document content -const updatedAt = await document.getUpdatedAt(); // Last update - -// Actions -const updated = await document.update({ - name: 'Architecture Design v2', - content: { /* Lexical content */ } -}); -await document.delete(); // Delete document -``` - -### Space Model - -```typescript -const spaces = await client.getMySpaces(); -const space = spaces[0]; - -// Static properties -console.log(space.uid); // Space UID -console.log(space.handle); // Space handle -console.log(space.variant); // Space variant -console.log(space.visibility); // Visibility setting -console.log(space.ownerId); // Owner ID -console.log(space.createdAt); // Creation date - -// Dynamic methods -const name = await space.getName(); // Current name -const description = await space.getDescription(); // Description -const items = await space.getItems(); // Items in space -const updatedAt = await space.getUpdatedAt(); // Last update - -// Get items with type checking -items.forEach(item => { - if ('notebookType' in item) { - console.log('Notebook:', item.name); - } else if ('documentType' in item) { - console.log('Document:', item.name); - } -}); -``` - -## Error Handling - -The DatalayerClient provides detailed error messages with proper error handling: - -```typescript -// Basic error handling -try { - const notebook = await client.createNotebook( - 'space-id', - 'My Notebook', - 'Description' - ); -} catch (error) { - console.error('Failed 
to create notebook:', error.message); -} - -// Handle authentication errors -try { - const user = await client.whoami(); -} catch (error) { - if (error.message.includes('401') || error.message.includes('Not authenticated')) { - console.error('Authentication failed - please check your token'); - // Trigger re-authentication in your app - } else { - console.error('API error:', error.message); - } -} - -// Handle insufficient credits -try { - const runtime = await client.createRuntime('python-gpu-env', 'notebook', 'test', 1000); -} catch (error) { - if (error.message.includes('insufficient credits')) { - console.error('Not enough credits to create runtime'); - } else if (error.message.includes('quota exceeded')) { - console.error('Runtime quota exceeded'); - } else { - console.error('Runtime creation failed:', error.message); - } -} - -// Model deletion state -const runtime = await client.createRuntime('ai-agents-env', 'notebook', 'test', 10); -await client.deleteRuntime(runtime.podName); - -// This will throw an error -try { - await runtime.getState(); -} catch (error) { - console.log('Runtime has been deleted'); -} - -// Using handlers for global error handling -const client = new DatalayerClient({ - token: 'your-token', - handlers: { - onError: async (methodName, error) => { - // Global error handling - if (error.message.includes('401')) { - console.log('Authentication required'); - // Handle auth globally - } else if (error.message.includes('429')) { - console.log('Rate limited - retrying...'); - // Handle rate limiting - } else { - console.error(`Global error in ${methodName}:`, error.message); - } - } - } -}); -``` - -## Best Practices - -1. **Use handlers for cross-cutting concerns**: Implement logging, error handling, and UI updates through the handlers pattern rather than wrapping Client methods. - -2. **Handle deletion states**: Models track deletion state to prevent operations on deleted resources. - -3. 
**Cache dynamic data**: The Client models cache dynamic data for 5 seconds to reduce API calls. - -4. **Wait for runtime readiness**: Always use `waitUntilReady()` after creating a runtime before performing operations: - ```typescript - const runtime = await client.createRuntime('ai-agents-env', 'notebook', 'analysis', 50); - await runtime.waitUntilReady(60000); // Wait up to 60 seconds - // Now safe to use runtime - ``` - -5. **Reuse runtimes when possible**: Use `ensureRuntime()` instead of `createRuntime()` to reuse existing runtimes and save credits: - ```typescript - const runtime = await client.ensureRuntime( - 'ai-agents-env', - 50, // credits limit - true, // wait for ready - 60000, // max wait time - true // reuse existing - ); - ``` - -6. **Clean up resources**: Always delete runtimes and snapshots when done to avoid charges: - ```typescript - // Create the runtime outside `try` so it is still in scope in `finally` - const runtime = await client.createRuntime(...); - try { - // ... do work with the runtime ... - } finally { - // Always clean up - await client.deleteRuntime(runtime.podName); - } - ``` - -7. **Use environment variables for configuration**: - ```typescript - const client = new DatalayerClient({ - token: process.env.DATALAYER_API_KEY, - iamRunUrl: process.env.DATALAYER_IAM_URL || DEFAULT_SERVICE_URLS.IAM, - runtimesRunUrl: process.env.DATALAYER_RUNTIMES_URL || DEFAULT_SERVICE_URLS.RUNTIMES, - spacerRunUrl: process.env.DATALAYER_SPACER_URL || DEFAULT_SERVICE_URLS.SPACER - }); - ``` - -8. **Prefetch content for better performance**: - ```typescript - // Prefetch multiple items to reduce individual API calls - const notebookIds = ['nb1', 'nb2', 'nb3']; - await client.prefetchContent(notebookIds, 'notebook'); - - // Now accessing content is much faster - for (const id of notebookIds) { - const content = await client.getNotebookContent(id); - } - ``` - -9. 
**Use appropriate service health checks**: - ```typescript - // Check service health before critical operations - const iamHealth = await client.checkIAMHealth(); - const runtimesHealth = await client.checkRuntimesHealth(); - const spacerHealth = await client.checkSpacerHealth(); - - if (iamHealth.status === 'healthy' && runtimesHealth.status === 'healthy') { - // Safe to proceed with runtime operations - } - ``` - -## Testing - -For testing, you can use the provided test utilities: - -```typescript -import { testConfig } from '@datalayer/core/__tests__/shared/test-config'; -import { performCleanup } from '@datalayer/core/__tests__/shared/cleanup-shared'; - -// Check if tests should run -if (testConfig.hasToken()) { - // Cleanup before tests - await performCleanup('setup'); - - // Run your tests - const client = new DatalayerClient({ - token: testConfig.getToken(), - ...DEFAULT_SERVICE_URLS - }); - - // Your test code here - const user = await client.whoami(); - expect(user.uid).toBeDefined(); - - // Cleanup after tests - await performCleanup('teardown'); -} - -// Skip expensive tests if configured -if (!testConfig.shouldSkipExpensive()) { - // Run expensive tests (runtime creation, etc.) 
- const runtime = await client.createRuntime('ai-agents-env', 'notebook', 'test', 10); - await client.deleteRuntime(runtime.podName); -} -``` - -## Configuration Options - -The DatalayerClient accepts these configuration options: - -```typescript -export interface DatalayerClientConfig { - /** Authentication token for API requests */ - token?: string; - /** URL for the IAM service */ - iamRunUrl?: string; - /** URL for the Runtimes service */ - runtimesRunUrl?: string; - /** URL for the Spacer service */ - spacerRunUrl?: string; - /** Platform-specific storage implementation */ - storage?: PlatformStorage; - /** Enable caching for API responses */ - cacheEnabled?: boolean; - /** Enable offline mode */ - offlineMode?: boolean; - /** Handlers for intercepting Client method calls */ - handlers?: ClientHandlers; -} -``` - -Example with all options: - -```typescript -import { BrowserStorage } from '@datalayer/core/client/storage'; - -const client = new DatalayerClient({ - token: 'your-token', - iamRunUrl: 'https://custom-iam.example.com', - runtimesRunUrl: 'https://custom-runtimes.example.com', - spacerRunUrl: 'https://custom-spacer.example.com', - storage: new BrowserStorage(), - cacheEnabled: true, - offlineMode: false, - handlers: { - beforeCall: async (methodName, args) => { - console.log(`Calling ${methodName}`, args); - }, - afterCall: async (methodName, result) => { - console.log(`${methodName} completed`); - }, - onError: async (methodName, error) => { - console.error(`${methodName} failed:`, error); - } - } -}); -``` diff --git a/CLAUDE.md b/CLAUDE.md deleted file mode 100644 index e10c0628..00000000 --- a/CLAUDE.md +++ /dev/null @@ -1,444 +0,0 @@ -# CLAUDE.md - -Datalayer Core - Python Client and CLI for the Datalayer AI Platform. Hybrid Python/TypeScript codebase with server-side Python and client-side React components. 
- -## ⚠️ CRITICAL: Import/Export Pattern Issue (January 2025) - -**NEVER use destructured imports from `src/api/spacer`!** - -### The Problem -The spacer API exports use namespace pattern in `src/api/spacer/index.js`: -```javascript -export * as items from './items'; -export * as users from './users'; -export * as notebooks from './notebooks'; -export * as lexicals from './lexicals'; -export * as cells from './cells'; -``` - -This creates a structure like `spacerAPI.items`, NOT direct named exports. - -### ❌ WRONG - Destructured Import (causes runtime errors): -```javascript -import { items, users, notebooks } from '../../../api/spacer'; -const response = await items.getSpaceItems(...); // ❌ items is undefined -``` - -### ✅ CORRECT - Namespace Import: -```javascript -import * as spacerAPI from '../../../api/spacer'; -const response = await spacerAPI.items.getSpaceItems(...); // ✅ Works correctly -``` - -### Why This Happens -- Webpack bundling works fine (no build errors) -- Runtime fails because destructured import `{ items }` expects named export -- Namespace export `export * as items` creates nested structure instead -- Result: `items` becomes `undefined` at runtime, causing "Cannot read properties of undefined" - -### Files Fixed (January 2025) -- `lib/client/client/models/Space.js` -- `lib/client/client/models/Notebook.js` -- `lib/client/client/models/Lexical.js` -- `lib/client/client/models/Item.js` - -**Always use namespace imports for spacer API!** - -## Project Structure - -- **Source code**: `src/` contains the TypeScript/React library code -- **API Layer**: `src/api/` contains raw API functions for direct service access -- **Client**: `src/client/client/` contains the high-level Client with models and mixins -- **Examples**: `src/examples/` contains interactive React examples -- **Python**: `datalayer_core/` contains the Python Client -- **Tests**: `src/__tests__/` for TypeScript, `datalayer_core/tests/` for Python -- **No default Vite files**: Removed 
App.tsx, main.tsx, public/ - this is a library, not an app - -## Development Commands - -**Python**: `pip install -e .[test]` | `pytest datalayer_core/tests/` | `mypy datalayer_core/` -**TypeScript Library**: `npm install` | `npm run build:lib` | `npm run lint` | `npm run test` -**Integration Tests**: `npm run test:integration` (runs all API and Client integration tests) -**Examples**: `npm run examples` (starts dev server at http://localhost:3000/) -**Code Quality**: `npm run check` | `npm run check:fix` | `npm run lint` | `npm run format` | `npm run type-check` -**Docs**: `cd docs && make build` | `npm run typedoc` (generates TypeScript API docs) | See `API.md` for comprehensive API/Client examples -**Make**: `make build` | `make start` | `make docs` - -**CLI Scripts**: `datalayer`/`dla`/`d`, `datalayer-config`, `datalayer-migrate`, `datalayer-server`, `datalayer-troubleshoot` - -## Architecture - -**Python Core**: - -- `DatalayerApp` - Base application class (traitlets) -- `DatalayerClient` - Main Client class with mixins -- CLI with subcommands: about, console, envs, runtimes, login, secrets, snapshots -- Resource management: runtimes, environments, secrets, snapshots - -**TypeScript/React**: NPM package `@datalayer/core` - -- API layer with `DatalayerApi.ts` -- Component library (UI, Jupyter, business logic) -- Zustand state management -- 70+ TypeScript models -- Custom hooks for auth, platform integration, UI/UX -- Universal navigation system that auto-detects React Router, Next.js, or falls back to native browser navigation - -## Configuration - -- Environment variables: `DATALAYER_API_KEY`, `DATALAYER_RUN_URL` -- Traitlets configuration with custom Datalayer paths -- Dev setup in `dev/`, examples in `examples/` - -## Quality Standards - -- **Type checking**: 100% mypy compliance (Python), strict TypeScript checks -- **Testing**: pytest + Vitest with React Testing Library + comprehensive test mocks -- **Linting**: ESLint with React/TypeScript rules, ruff for 
Python -- **Formatting**: Prettier for consistent code style (80 char width, single quotes) -- **Security**: bandit compliance, replaced `eval()` with `ast.literal_eval()` -- **Documentation**: NumPy-style docstrings, TypeDoc API docs, Docusaurus site -- **Pre-commit**: Updated to latest versions (ruff v0.12.8, bandit 1.8.6, pip-audit v2.9.0) - -## Development Tips - -- Use npm, not yarn -- Run checks after changes: `npm run check:fix` -- Use Playwright MCP servers when you need to inspect things directly -- Ensure things always build after changes -- Also run `npm run format`, `npm run lint`, and `npm run type-check` to ensure everything works properly - -## Running Examples - -**Start the examples server:** - -```bash -npm run examples -``` - -The examples are served at http://localhost:3000/ and include: - -- `DatalayerNotebookExample`: Demonstrates Datalayer services integration with Jupyter notebooks -- `NotebookExample`: Basic notebook example -- `CellExample`: Individual cell execution example - -**Next.js Notebook Example:** - -Located in `examples/nextjj/`, this is a full Next.js application demonstrating platform integration: - -```bash -cd examples/nextjj -npm install -npm run dev -``` - -Features: - -- Token authentication with Datalayer IAM -- Browse and create notebooks from workspace -- Select compute environments for execution -- Interactive notebook viewer with real-time outputs -- Clean UI with centered empty states and proper spacing -- Welcome page with token authentication -- Navigation between notebooks, environments, and viewer pages -- Error handling for runtime creation failures - -**Configuration:** - -- The application uses local storage for token management -- Authentication happens through the welcome page where users enter their Datalayer API token -- The app communicates directly with `https://prod1.datalayer.run` API endpoints -- Built with Next.js 14, TypeScript, and GitHub Primer components - -**Desktop Example:** - -For a native desktop application with Jupyter 
integration, see the separate Datalayer Desktop repository: -https://github.com/datalayer/desktop - -Features: -- Native desktop app with Electron -- Full Jupyter notebook integration -- Real-time collaboration support -- WebSocket proxy for kernel communication - -## TypeScript/React Services - -**DatalayerServiceManager**: Creates and configures ServiceManager for Datalayer infrastructure - -- Located in `src/services/DatalayerServiceManager.ts` -- Uses the runtime API (`/api/runtimes/v1/runtimes`) to create kernels -- Internally uses `createRuntime` from the API module for proper auth handling -- Returns configured ServiceManager for use with Jupyter components - -**DatalayerCollaborationProvider**: Enables real-time collaboration - -- Located in `src/collaboration/DatalayerCollaborationProvider.ts` -- Requires Datalayer credentials (runUrl and token) -- Integrates with Jupyter notebooks for collaborative editing -- **IMPORTANT**: Uses notebook UIDs (not paths) for document IDs in Datalayer SaaS -- Collaboration is enabled by default in Notebook2 components - -## API Notes - -- **Runtime API**: `POST /api/runtimes/v1/runtimes` - Creates compute runtimes -- **Collaboration API**: `/api/spacer/v1/documents/{notebook_uid}` - Works for notebooks (not just documents!) 
-- **Required Headers**: Authorization (Bearer token), X-External-Token (for some operations) -- **Proxy Setup**: Vite dev server proxies `/api` to `https://prod1.datalayer.run` for CORS -- **API Docs**: Available at https://prod1.datalayer.run/api/runtimes/v1/ui/ -- **Pre-commit hooks**: Husky + lint-staged for automatic code quality checks -- **Code Quality Scripts**: - - `npm run check` - Run all checks (format, lint, type-check) - - `npm run check:fix` - Auto-fix all issues - - `npm run lint` / `npm run lint:fix` - ESLint checking - - `npm run format` / `npm run format:check` - Prettier formatting - - `npm run type-check` - TypeScript compilation check - -## API and Client Architecture - -### Two-Layer Architecture - -**1. Raw API Layer** (`src/api/`) -- Direct access to REST endpoints -- Organized by service (IAM, Runtimes, Spacer) -- Returns raw API responses -- Minimal abstraction, maximum control - -**2. Client Layer** (`src/client/client/`) -- High-level, intuitive interface -- Domain models with rich methods -- Automatic state management -- Mixins for organized functionality - -**Client Structure**: -- `storage/`: Platform-agnostic storage implementations (Browser, Node, Electron) -- `state/`: Service-specific state managers with TTL caching -- `models/`: Rich domain models (User, Runtime, Space, Notebook, Lexical, Snapshot) -- `mixins/`: Service mixins (IAMMixin, RuntimesMixin, SpacerMixin, HealthMixin) -- `base.ts`: Client base class composition - -### Key Changes and Fixes - -**Authentication**: -- Fixed logout endpoint to use GET method (was incorrectly using POST) -- Proper error handling for invalid tokens -- OAuth support limited to GitHub and LinkedIn only (removed Google/Microsoft) - -**Model Lifecycle Management**: -- Models track deletion state to prevent operations on deleted resources -- Runtime and Snapshot deletion now marks instances as deleted -- All model methods check deletion state before operations - -**Platform Abstraction Layer** 
(January 2025): -- Implemented PlatformStorage interface with 3 implementations (Browser, Node, Electron) -- State managers with TTL-based caching (IAMState, RuntimesState, SpacerState) -- RuntimesState tracks runtime keys for proper getCachedRuntimes() implementation -- All storage implementations support encryption - -**Test Infrastructure**: -- 100% test pass rate achieved (247 tests passing) -- Consolidated test configuration (removed redundant `shouldRunExpensive()`) -- Integration tests are self-contained (no inter-test dependencies) -- Proper cleanup in test teardown -- Environment variable `DATALAYER_TEST_SKIP_EXPENSIVE=false` enables all tests -- Fixed empty string handling in BrowserStorage -- Fixed OAuth provider recognition in User model tests - -**TypeScript Improvements**: -- Fixed strict null checks in model constructors -- Proper typing for Client mixins and models -- Consistent error handling across all models -- Fixed unused variable warnings in test files - -### Client Models - -**Runtime Model**: -- Dynamic state checking (always fetches fresh from API) -- `waitUntilReady()` method for startup synchronization -- Direct snapshot creation via `createSnapshot()` -- Deletion state tracking - -**Snapshot Model**: -- Status and size checking methods -- Metadata access -- Relationship with Runtime model -- Deletion state tracking - -**Space Model**: -- Item listing with proper relationship handling -- Support for both Notebooks and Lexical documents -- Lazy loading of properties - -**Notebook/Lexical Models**: -- Content management -- Update operations -- Proper serialization to JSON -- Deletion lifecycle - -## AI Notes IMPORTANT - -- Use npm, not yarn -- Run checks after changes: - - npm run format - - npm run lint - - npm run type-check - - npm run build:lib (ensure it builds with fresh output) -- Run integration tests: `npm run test:integration` -- Avoid old-school require imports -- Use playwright MCP to inspect things directly -- Check API.md for 
comprehensive examples of both raw API and Client usage -- **Client Usage**: Always use the handlers pattern for cross-cutting concerns instead of wrapping Client methods -- **VS Code Extension**: Use `(client as any)` casting when TypeScript definitions are incomplete - -## ag-ui (CopilotKit) Architecture (November 2024) - -### Critical Fix: Separated Hook Files - -The ag-ui adapter uses **separated hook files** to prevent Lumino widget initialization crashes: - -**Files:** - -- `src/tools/adapters/agui/notebookHooks.tsx` - Notebook-only (imports `@datalayer/jupyter-react`) -- `src/tools/adapters/agui/lexicalHooks.tsx` - Lexical-only (imports `@datalayer/jupyter-lexical`) -- `src/tools/adapters/agui/AgUIToolAdapter.ts` - Shared components (`ActionRegistrar`, `UseFrontendToolFn`) - -**Problem Solved:** - -Original combined `hooks.tsx` imported from BOTH packages, causing: - -1. When `useNotebookToolActions` was called → entire lexical package loaded -2. Lexical package initialization → creates Lumino widgets for Jupyter output nodes -3. Lumino widget initialization → **CRASH**: `Cannot set properties of undefined (setting 'class-name')` - -**Solution Benefits:** - -- ✅ Notebook example never loads lexical code (no crash) -- ✅ Lazy loading (lexical only loads when needed) -- ✅ No code duplication (shared components in `AgUIToolAdapter.ts`) -- ✅ Smaller bundles (tree-shaking eliminates unused code) - -**Critical Rule:** - -```typescript -// ❌ NEVER create combined hooks that import from both packages -import { ... } from '@datalayer/jupyter-lexical'; -import { ... } from '@datalayer/jupyter-react'; - -// ✅ ALWAYS keep hooks separated by package -// notebookHooks.tsx -import { ... } from '@datalayer/jupyter-react'; - -// lexicalHooks.tsx -import { ... } from '@datalayer/jupyter-lexical'; -``` - -## Critical Lessons Learned (January 2025) - -### Module Import/Export Issues -**Problem**: Webpack couldn't resolve namespace exports when destructured in consuming code. 
-**Symptom**: Runtime error "Cannot read properties of undefined (reading 'getSpaceItems')" -**Root Cause**: Using `export * as items from './items'` in index files, then importing as `import * as spacerAPI` and accessing `spacerAPI.items.getSpaceItems()` -**Solution**: Use direct module imports instead: -```typescript -// BAD - webpack can't resolve this properly -import * as spacerAPI from '../../../api/spacer'; -await spacerAPI.items.getSpaceItems(...); - -// GOOD - direct imports work -import * as items from '../../../api/spacer/items'; -await items.getSpaceItems(...); -``` - -### Code Deduplication with Abstract Base Classes -**Achievement**: Reduced code duplication by 45-47% across models -**Pattern**: Created `Item` abstract base class for Notebook, Lexical, and Cell models -**Benefits**: -- Single source of truth for common functionality -- Consistent deletion state tracking -- Unified error handling - -### Build System Improvements -**Issue**: Stale build artifacts causing confusion -**Solution**: Added clean scripts to all build commands -- `build:lib` now runs `npm run clean:lib` first -- Removes `lib/`, `dist/`, `build/`, and `tsconfig.tsbuildinfo` -- Ensures fresh builds every time - -### TypeScript Module Resolution -**Issue**: Node.js ESM requires explicit file extensions in imports -**Context**: Only matters for direct Node.js execution, not webpack bundles -**Note**: TypeScript source files don't need .js extensions - only needed if running compiled JS directly with Node - -### Debugging Approach -**Key Learning**: When fixing runtime errors in webpack bundles: -1. Check the actual TypeScript source files, not compiled JavaScript -2. Webpack module resolution differs from Node.js ESM -3. Clean rebuild (`rm -rf dist lib node_modules`) can resolve mysterious issues -4. 
Always verify fixes actually work in the runtime environment - -## Client Handlers Pattern (January 2025) - -### Problem Solved -Eliminated massive code duplication where consuming applications (VS Code extension, React apps) were wrapping every Client method 1:1 just to add logging, error handling, or platform-specific behavior. - -### Solution: Handlers Pattern -The Client now supports lifecycle handlers that can be injected at initialization: - -```typescript -const client = new DatalayerClient({ - token: 'your-token', - iamRunUrl: 'https://prod1.datalayer.run', - handlers: { - beforeCall: async (methodName, args) => { - console.log(`[Client] Calling ${methodName}`, args); - }, - afterCall: async (methodName, result) => { - console.log(`[Client] ${methodName} completed`, result); - }, - onError: async (methodName, error) => { - console.error(`[Client] ${methodName} failed`, error); - // Platform-specific error handling - if (error.message.includes('Not authenticated')) { - // Show platform-specific auth prompt - } - } - } -}); -``` - -### Key Implementation Details - -**Automatic Method Wrapping**: The Client automatically wraps all mixin methods with handlers: -- Located in `src/client/client/base.ts` -- Smart detection: Only wraps mixin methods, not base class infrastructure -- No hardcoded method lists - automatically detects based on prototype chain - -**Clean Mixin Composition**: Uses helper function for readable mixin composition: -```typescript -const DatalayerClientWithMixins = composeMixins( - IAMMixin, - RuntimesMixin, - SpacerMixin, -); -``` - -**TypeScript Support**: Proper interface declaration for mixin methods: -```typescript -export interface DatalayerClient { - // All mixin methods declared here for TypeScript - whoami(): Promise; - createRuntime(config: any): Promise; - // ... 
etc -} -``` - -### Benefits -- **Zero code duplication**: No more wrapper services -- **Platform agnostic**: Same Client works everywhere -- **Clean separation**: Business logic in Client, platform behavior in handlers -- **Type safe**: Full TypeScript support -- **Maintainable**: Add new Client methods without updating consumers - -### Removed Components -- Deleted `HealthMixin` (unnecessary complexity) -- VS Code extension: Removed `spacerService.ts` and `runtimeService.ts` -- All wrapper services replaced with direct Client usage + handlers diff --git a/README.md b/README.md index 98fe0730..5721af09 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ [![Become a Sponsor](https://img.shields.io/static/v1?label=Become%20a%20Sponsor&message=%E2%9D%A4&logo=GitHub&style=flat&color=1ABC9C)](https://github.com/sponsors/datalayer) -# ☰ Datalayer Core +# ☰ ☢️ Datalayer Core

Python and Typescript libraries for Datalayer @@ -118,7 +118,7 @@ datalayer runtime list datalayer runtime create ai-env --given-name my-runtime-123 # Execute a script in a runtime -datalayer runtime exec my-script.py --runtime +datalayer runtime exec my-script.py --agent # Create a snapshot from a runtime but do not terminate the runtime datalayer snapshots create my-snapshot 'AI work!' False @@ -151,6 +151,29 @@ datalayer usage team-allocate-member --team-uid --member-uid --member-uid --amount 5 ``` +### 5. Evals CLI (Multi-Agentspec) + +Use comma-separated agentspec ids to create one experiment per agentspec variant: + +```bash +# Creates one experiment per agentspec in the list +datalayer evals experiments create my-exp \ + --evalset-id <evalset-id> \ + --agent-spec-ids example-evals,example-evals-nocodemode,example-custom +``` + +Generate a comparison report: + +```bash +datalayer evals report --run-limit 50 --export +``` + +How to interpret grouped comparisons in the report: + +- `Within-Agentspec Pairwise Latest-Pass Deltas`: compares experiments using the same agentspec id. +- `Cross-Agentspec Pairwise Latest-Pass Deltas`: compares experiments using different agentspec ids. +- Pairwise sections compute all combinations for the selected experiments, not just two agentspecs. + ## Examples ### Python Examples diff --git a/datalayer_core/__version__.py b/datalayer_core/__version__.py index 0bad1d00..6109e043 100644 --- a/datalayer_core/__version__.py +++ b/datalayer_core/__version__.py @@ -3,4 +3,4 @@ """Datalayer Core version information.""" -__version__ = "1.1.24" +__version__ = "1.1.38" diff --git a/datalayer_core/agents/__init__.py b/datalayer_core/agents/__init__.py new file mode 100644 index 00000000..8000f6f4 --- /dev/null +++ b/datalayer_core/agents/__init__.py @@ -0,0 +1,40 @@ +# Copyright (c) 2023-2025 Datalayer, Inc. +# Distributed under the terms of the Modified BSD License. 
+ +"""Runtime and agent execution helpers.""" + +from datalayer_core.agents.agent_cloud import RuntimeService +from datalayer_core.agents.agent_local import ( + DEFAULT_LOCAL_AGENT_NAME, + DEFAULT_LOCAL_HOST, + DEFAULT_LOCAL_LOG_LEVEL, + DEFAULT_LOCAL_PROTOCOL, + LocalAgentRuntime, + ensure_local_agent, + start_local_agent_runtime, + terminate_local_agent_runtime, +) +from datalayer_core.agents.utils import ( + compute_time_reservation_minutes, + create_cloud_agent_runtime, + resolve_environment_burning_rate, + teardown_agent_execution_resources, + terminate_cloud_agent_runtime, +) + +__all__ = [ + "RuntimeService", + "LocalAgentRuntime", + "DEFAULT_LOCAL_AGENT_NAME", + "DEFAULT_LOCAL_HOST", + "DEFAULT_LOCAL_LOG_LEVEL", + "DEFAULT_LOCAL_PROTOCOL", + "ensure_local_agent", + "start_local_agent_runtime", + "terminate_local_agent_runtime", + "resolve_environment_burning_rate", + "compute_time_reservation_minutes", + "create_cloud_agent_runtime", + "terminate_cloud_agent_runtime", + "teardown_agent_execution_resources", +] diff --git a/datalayer_core/runtimes/runtime.py b/datalayer_core/agents/agent_cloud.py similarity index 98% rename from datalayer_core/runtimes/runtime.py rename to datalayer_core/agents/agent_cloud.py index dd292ccc..06d62068 100644 --- a/datalayer_core/runtimes/runtime.py +++ b/datalayer_core/agents/agent_cloud.py @@ -19,9 +19,9 @@ from datalayer_core.mixins.sandbox_snapshots import SandboxSnapshotsMixin from datalayer_core.mixins.runtimes import RuntimesMixin from datalayer_core.models import ExecutionResponse +from datalayer_core.models.sandbox_snapshot import SandboxSnapshotModel from datalayer_core.models.runtime import RuntimeModel -from datalayer_core.runtimes.sandbox_snapshot import ( - SandboxSnapshotModel, +from datalayer_core.sandboxes.code_sandbox_snapshots import ( as_code_sandbox_snapshots, create_snapshot, ) @@ -60,6 +60,7 @@ def __init__( run_url: str = DEFAULT_DATALAYER_RUN_URL, iam_url: Optional[str] = None, token: Optional[str] = 
None, + api_key: Optional[str] = None, pod_name: Optional[str] = None, ingress: Optional[str] = None, reservation_id: Optional[str] = None, @@ -86,6 +87,8 @@ def __init__( Datalayer IAM server URL. If not provided, defaults to run_url. token : Optional[str] Authentication token (can also be set via DATALAYER_API_KEY env var). + api_key : Optional[str] + Authentication API key alias for ``token``. pod_name : Optional[str] Name of the pod running the runtime. ingress : Optional[str] @@ -110,7 +113,7 @@ def __init__( time_reservation=time_reservation, run_url=run_url, iam_url=iam_url or run_url, - token=token, + token=token or api_key, external_token=None, pod_name=pod_name, ingress=ingress, diff --git a/datalayer_core/runtimes/local.py b/datalayer_core/agents/agent_local.py similarity index 80% rename from datalayer_core/runtimes/local.py rename to datalayer_core/agents/agent_local.py index 3ab44ca4..648e170f 100644 --- a/datalayer_core/runtimes/local.py +++ b/datalayer_core/agents/agent_local.py @@ -1,3 +1,6 @@ +# Copyright (c) 2023-2025 Datalayer, Inc. +# Distributed under the terms of the Modified BSD License. + # Copyright (c) 2023-2026 Datalayer, Inc. # Distributed under the terms of the Modified BSD License. @@ -122,13 +125,14 @@ def start_local_agent_runtime( protocol: str = DEFAULT_LOCAL_PROTOCOL, log_level: str = DEFAULT_LOCAL_LOG_LEVEL, wait: bool = True, + disable_tool_approvals: bool = False, ) -> LocalAgentRuntime: """Launch a local ``agent-runtimes`` server as a subprocess. Parameters ---------- agent_spec_id : str - Agent spec id to boot the runtime with. + Agentspec id to boot the runtime with. agent_name : str Registered agent name/id served by the runtime. 
host : str @@ -172,6 +176,8 @@ def start_local_agent_runtime( "--log-level", log_level, ] + if disable_tool_approvals: + command.append("--disable-tool-approvals") runtime_env, mapped_targets = build_agent_runtime_env() if mapped_targets: @@ -238,6 +244,7 @@ def ensure_local_agent( enable_skills: bool = True, description: Optional[str] = None, timeout: int = 120, + disable_tool_approvals: bool = False, ) -> None: """Ensure a local agent with the expected transport is registered. @@ -298,6 +305,7 @@ def ensure_local_agent( "agent_spec_id": agent_spec_id, "enable_skills": enable_skills, "tools": [], + "disableToolApprovals": disable_tool_approvals, } try: response = requests.post( @@ -456,6 +464,129 @@ def extract_vercel_stream_text(raw: str) -> str: return "".join(text_parts).strip() +def _coerce_usage_payload(candidate: Any) -> dict[str, Any]: + if not isinstance(candidate, dict) or not candidate: + return {} + nested = candidate.get("usage") + if isinstance(nested, dict) and nested: + merged = dict(nested) + for key, value in candidate.items(): + if key == "usage": + continue + merged.setdefault(str(key), value) + return merged + return dict(candidate) + + +def _usage_payload_score(payload: dict[str, Any]) -> int: + if not payload: + return 0 + token_keys = { + "prompt_tokens", + "promptTokens", + "input_tokens", + "inputTokens", + "completion_tokens", + "completionTokens", + "output_tokens", + "outputTokens", + "total_tokens", + "totalTokens", + "tokens_total", + "token_total", + } + score = len(payload) + if any(key in payload for key in token_keys): + score += 100 + if any( + key in payload + for key in ( + "credits_consumed", + "creditsConsumed", + "credits", + "total_credits", + "cost_credits", + ) + ): + score += 10 + return score + + +def extract_vercel_stream_usage(raw: str) -> dict[str, Any]: + """Extract best-effort pydantic usage metadata from a Vercel AI SSE stream.""" + best: dict[str, Any] = {} + best_score = 0 + for line in raw.splitlines(): + if 
not line.startswith("data: "): + continue + payload = line[6:].strip() + if not payload or payload == "[DONE]": + continue + try: + event = json.loads(payload) + except json.JSONDecodeError: + continue + if not isinstance(event, dict): + continue + + candidates: list[dict[str, Any]] = [] + message_metadata = event.get("messageMetadata") + if isinstance(message_metadata, dict): + candidates.extend( + [ + _coerce_usage_payload(message_metadata.get("pydantic_ai")), + _coerce_usage_payload(message_metadata.get("pydanticAI")), + _coerce_usage_payload(message_metadata.get("usage")), + ] + ) + candidates.extend( + [ + _coerce_usage_payload(event.get("pydantic_ai_usage")), + _coerce_usage_payload(event.get("pydantic_ai")), + _coerce_usage_payload(event.get("usage")), + ] + ) + for candidate in candidates: + score = _usage_payload_score(candidate) + if score > best_score: + best = candidate + best_score = score + return best + + +def _vercel_ai_error_message(raw: str) -> Optional[str]: + """Detect a non-stream error body returned with an HTTP 200 status. + + The ``agent-runtimes`` server answers an unknown agent route with HTTP 200 + and a JSON error body (for example + ``{"error": "Agent '...' not found", "message": "No agent registered ..."}``) + instead of an SSE stream. Such a body must NOT be treated as a successful + completion, otherwise route-candidate fallback stops at the first wrong + route and an empty answer is recorded. + + Returns + ------- + Optional[str] + The error message when the body is an error payload (or an empty body), + otherwise ``None`` when the body is a genuine SSE stream. + """ + text = (raw or "").strip() + if not text: + return "Empty response body" + # A genuine Vercel AI response is an SSE stream of ``data:`` lines. 
+ if "data:" in text: + return None + try: + payload = json.loads(text) + except json.JSONDecodeError: + return None + if isinstance(payload, dict): + error = payload.get("error") or payload.get("message") + if error: + return str(error) + return None + + def _post_vercel_ai_chat( *, endpoint: str, @@ -528,13 +659,36 @@ def _post_vercel_ai_chat( } output_text = extract_vercel_stream_text(raw) - return { + usage = extract_vercel_stream_usage(raw) + if not output_text: + error_message = _vercel_ai_error_message(raw) + if error_message is not None: + message_text = ( + f"{source_label} chat returned no output: {error_message}" + ) + return { + "status": "failed", + "output": {"text": "", "raw_stream_excerpt": raw[:2000]}, + "failure_cause": { + "stage": "runtime_execution", + "type": "runtime_agent_unavailable", + "message": message_text, + "detail_excerpt": raw[:2000] or message_text, + "execution_url": endpoint, + }, + } + output: dict[str, Any] = { + "text": output_text, + "raw_stream_excerpt": raw[:2000], + } + result: dict[str, Any] = { "status": "completed", - "output": { - "text": output_text, - "raw_stream_excerpt": raw[:2000], - }, + "output": output, } + if usage: + output["pydantic_ai_usage"] = usage + result["usage"] = usage + return result def run_local_agent_chat( @@ -606,7 +760,7 @@ def runtime_route_candidates( The ``agent-runtimes`` server inside a cloud runtime may register its agent under different names depending on how it was launched. Trying a few known - candidates (explicit agent name, agent spec id, pod name, then the default + candidates (explicit agent name, agentspec id, pod name, then the default route) makes cloud execution resilient. 
""" candidates: list[str] = [] diff --git a/datalayer_core/runtimes/agent_runtime.py b/datalayer_core/agents/utils.py similarity index 95% rename from datalayer_core/runtimes/agent_runtime.py rename to datalayer_core/agents/utils.py index 27856a57..ae9a78dd 100644 --- a/datalayer_core/runtimes/agent_runtime.py +++ b/datalayer_core/agents/utils.py @@ -1,3 +1,6 @@ +# Copyright (c) 2023-2025 Datalayer, Inc. +# Distributed under the terms of the Modified BSD License. + # Copyright (c) 2023-2026 Datalayer, Inc. # Distributed under the terms of the Modified BSD License. @@ -144,9 +147,9 @@ def create_cloud_agent_runtime( name : Optional[str] Optional runtime name. agent_spec_id : Optional[str] - Registered agent spec id (ignored when ``agent_spec`` is provided). + Registered agentspec id (ignored when ``agent_spec`` is provided). agent_spec : Optional[dict[str, Any]] - Inline agent spec payload (takes precedence over ``agent_spec_id``). + Inline agentspec payload (takes precedence over ``agent_spec_id``). credits_limit : Optional[float] Target credits budget used to derive ``time_reservation`` when the latter is not supplied. 
@@ -284,7 +287,7 @@ def teardown_agent_execution_resources( if target == "local": if local_base_url and token and local_agent_name: - from datalayer_core.runtimes.local import delete_local_agent + from datalayer_core.agents.agent_local import delete_local_agent result["local_agent_deleted"] = delete_local_agent( base_url=local_base_url, @@ -292,7 +295,7 @@ def teardown_agent_execution_resources( agent_name=local_agent_name, ) if local_runtime is not None: - from datalayer_core.runtimes.local import terminate_local_agent_runtime + from datalayer_core.agents.agent_local import terminate_local_agent_runtime terminate_local_agent_runtime(local_runtime) result["local_runtime_terminated"] = True diff --git a/datalayer_core/base/serverapplication.py b/datalayer_core/base/serverapplication.py index 0a00ee97..4a24daf2 100644 --- a/datalayer_core/base/serverapplication.py +++ b/datalayer_core/base/serverapplication.py @@ -3,7 +3,7 @@ """The Datalayer Core Server application.""" -import os +from pathlib import Path from jupyter_server.extension.application import ExtensionApp, ExtensionAppJinjaMixin from jupyter_server.utils import url_path_join @@ -18,9 +18,9 @@ from datalayer_core.handlers.service_worker.handler import ServiceWorkerHandler from datalayer_core.utils.urls import DEFAULT_DATALAYER_IAM_URL -DEFAULT_STATIC_FILES_PATH = os.path.join(os.path.dirname(__file__), "./static") - -DEFAULT_TEMPLATE_FILES_PATH = os.path.join(os.path.dirname(__file__), "./templates") +_PACKAGE_ROOT = Path(__file__).resolve().parent.parent +DEFAULT_STATIC_FILES_PATH = str(_PACKAGE_ROOT / "static") +DEFAULT_TEMPLATE_FILES_PATH = str(_PACKAGE_ROOT / "templates") class DatalayerExtensionApp(ExtensionAppJinjaMixin, ExtensionApp): diff --git a/datalayer_core/cli/__main__.py b/datalayer_core/cli/__main__.py index 8413fcd8..474f5585 100644 --- a/datalayer_core/cli/__main__.py +++ b/datalayer_core/cli/__main__.py @@ -5,10 +5,12 @@ import os import sys +from typing import Optional import typer from 
datalayer_core.__version__ import __version__ +from datalayer_core.authn import AuthenticationManager from datalayer_core.cli.commands.about import app as about_app from datalayer_core.cli.commands.agents import agents_ls from datalayer_core.cli.commands.agents import app as agents_app @@ -34,20 +36,19 @@ from datalayer_core.cli.commands.otel import app as otel_app from datalayer_core.cli.commands.pools import app as pools_app from datalayer_core.cli.commands.ray import app as ray_app -from datalayer_core.cli.commands.runtime_checkpoints import app as checkpoints_app -from datalayer_core.cli.commands.runtime_checkpoints import ( +from datalayer_core.cli.commands.checkpoints import app as checkpoints_app +from datalayer_core.cli.commands.checkpoints import ( checkpoints_ls, ) from datalayer_core.cli.commands.sandbox_snapshots import app as snapshots_app from datalayer_core.cli.commands.sandbox_snapshots import snapshots_ls -from datalayer_core.cli.commands.runtimes import app as runtimes_app -from datalayer_core.cli.commands.runtimes import runtimes_ls +from datalayer_core.cli.commands.schedules import app as schedules_app from datalayer_core.cli.commands.secrets import app as secrets_app from datalayer_core.cli.commands.secrets import secrets_ls from datalayer_core.cli.commands.subscription import app as subscription_app from datalayer_core.cli.commands.subscription import subscription_root -from datalayer_core.cli.commands.tokens import app as tokens_app -from datalayer_core.cli.commands.tokens import tokens_ls +from datalayer_core.cli.commands.api_keys import app as api_keys_app +from datalayer_core.cli.commands.api_keys import api_keys_ls from datalayer_core.cli.commands.usage import app as usage_app from datalayer_core.cli.commands.usage import usage_root from datalayer_core.cli.commands.plans import app as plans_app @@ -63,6 +64,48 @@ def version_callback(value: bool) -> None: raise typer.Exit() +def _lookup_billable_account_uid_by_handle( + *, iam_url: str, 
access_token: str, account_handle: str +) -> Optional[str]: + """Resolve an account handle to UID using IAM APIs.""" + import requests + + handle = str(account_handle or "").strip().lower() + if not handle: + return None + + headers = {"Authorization": f"Bearer {access_token}"} + + # 1) Directly match the authenticated user's own handle. + whoami_response = requests.get( + f"{iam_url.rstrip('/')}/api/iam/v1/whoami", + headers=headers, + timeout=10, + ) + if whoami_response.status_code == 200: + payload = whoami_response.json() + profile = payload.get("profile") or {} + profile_handle = str(profile.get("handle") or "").strip().lower() + if profile_handle == handle: + return str(profile.get("uid") or "").strip() or None + + # 2) Match organizations and teams from memberships. + memberships_response = requests.get( + f"{iam_url.rstrip('/')}/api/iam/v1/memberships", + headers=headers, + timeout=10, + ) + if memberships_response.status_code != 200: + return None + memberships_payload = memberships_response.json() + memberships = memberships_payload.get("memberships") or [] + for membership in memberships: + membership_handle = str(membership.get("handle") or "").strip().lower() + if membership_handle == handle: + return str(membership.get("uid") or "").strip() or None + return None + + # Create the main Typer app app = typer.Typer( name="dla", @@ -152,6 +195,35 @@ def main_callback( "--mcp-server-url", help="Override DATALAYER_MCP_SERVER_URL for this CLI invocation.", ), + scheduler_url: str | None = typer.Option( + None, + "--scheduler-url", + help="Override DATALAYER_SCHEDULER_URL for this CLI invocation.", + ), + api_key: str | None = typer.Option( + None, + "--api-key", + help=( + "Auth token for backend calls. Falls back to DATALAYER_API_KEY when " + "omitted; otherwise built-in auth resolution is used." + ), + ), + billable_account_uid: str | None = typer.Option( + None, + "--billable-account-uid", + help=( + "Billable account UID context. 
Falls back to DATALAYER_ACCOUNT_UID " + "when omitted." + ), + ), + billable_account_handle: str | None = typer.Option( + None, + "--billable-account-handle", + help=( + "Billable account handle context. Falls back to DATALAYER_ACCOUNT_HANDLE " + "when omitted and is resolved to UID via IAM lookup." + ), + ), ) -> None: """Main callback to handle global options.""" overrides = { @@ -169,11 +241,62 @@ def main_callback( "DATALAYER_STATUS_URL": status_url, "DATALAYER_SUPPORT_URL": support_url, "DATALAYER_MCP_SERVER_URL": mcp_server_url, + "DATALAYER_SCHEDULER_URL": scheduler_url, } for env_name, value in overrides.items(): if value is not None: os.environ[env_name] = value.rstrip("/") + # Global auth option: explicit flag overrides env; when omitted keep normal + # command behavior (env var token or stored auth token). + if api_key is not None: + normalized_api_key = str(api_key).strip() + if normalized_api_key: + os.environ["DATALAYER_API_KEY"] = normalized_api_key + + # Global billable context defaults. + resolved_uid = str(billable_account_uid or "").strip() or str( + os.environ.get("DATALAYER_ACCOUNT_UID") or "" + ).strip() + resolved_handle = str(billable_account_handle or "").strip() or str( + os.environ.get("DATALAYER_ACCOUNT_HANDLE") or "" + ).strip() + + # Convert handle -> uid only when uid is not already known. + if not resolved_uid and resolved_handle: + effective_iam_url = str(os.environ.get("DATALAYER_IAM_URL") or "").strip() + if not effective_iam_url: + effective_iam_url = "http://localhost:9700" + + resolved_token = str(os.environ.get("DATALAYER_API_KEY") or "").strip() + if not resolved_token: + auth = AuthenticationManager(iam_url=effective_iam_url) + resolved_token = str(auth.get_stored_token() or "").strip() + + if not resolved_token: + raise typer.BadParameter( + "Cannot resolve --billable-account-handle without authentication. " + "Pass --api-key, set DATALAYER_API_KEY, or login first." 
+ ) + + resolved_from_handle = _lookup_billable_account_uid_by_handle( + iam_url=effective_iam_url, + access_token=resolved_token, + account_handle=resolved_handle, + ) + if not resolved_from_handle: + raise typer.BadParameter( + f"Could not resolve billable account handle '{resolved_handle}' to a UID." + ) + resolved_uid = resolved_from_handle + + if resolved_uid: + os.environ["DATALAYER_ACCOUNT_UID"] = resolved_uid + # Keep backward compatibility with existing scripts. + os.environ["DATALAYER_BILLABLE_ACCOUNT_UID"] = resolved_uid + if resolved_handle: + os.environ["DATALAYER_ACCOUNT_HANDLE"] = resolved_handle + # Register commands (without name to add them at the top level) app.add_typer(about_app) @@ -191,11 +314,11 @@ def main_callback( app.add_typer(otel_app) app.add_typer(pools_app) app.add_typer(ray_app) -app.add_typer(runtimes_app) +app.add_typer(schedules_app) app.add_typer(secrets_app) app.add_typer(snapshots_app) app.add_typer(subscription_app) -app.add_typer(tokens_app) +app.add_typer(api_keys_app) app.add_typer(users_app) app.add_typer(usage_app) app.add_typer(plans_app) @@ -214,11 +337,10 @@ def main_callback( # Add convenient aliases at root level app.command(name="envs-ls")(envs_ls) -app.command(name="runtimes-ls")(runtimes_ls) app.command(name="secrets-ls")(secrets_ls) app.command(name="snapshots-ls")(snapshots_ls) app.command(name="checkpoints-ls")(checkpoints_ls) -app.command(name="tokens-ls")(tokens_ls) +app.command(name="api-keys-ls")(api_keys_ls) app.command(name="agent-nodes-ls")(agent_nodes_ls) app.command(name="agents-ls")(agents_ls) @@ -239,6 +361,10 @@ def main_callback( "--status-url", "--support-url", "--mcp-server-url", + "--scheduler-url", + "--api-key", + "--billable-account-uid", + "--billable-account-handle", } _GLOBAL_OPTIONS_NO_VALUES = { diff --git a/datalayer_core/cli/commands/README.md b/datalayer_core/cli/commands/README.md index 5c0d401c..f14f3ca6 100644 --- a/datalayer_core/cli/commands/README.md +++ 
b/datalayer_core/cli/commands/README.md @@ -1,37 +1,47 @@ # Exec Module -The `exec` module provides functionality to execute Python files and Jupyter notebooks on Datalayer runtimes. +The `exec` module provides functionality to execute Python files and Jupyter notebooks on Datalayer code sandboxes. ## Commands ### `dla exec` -Execute a Python file or Jupyter notebook on a Datalayer runtime. +Execute a Python file or Jupyter notebook on a Datalayer code sandbox. **Usage:** ```bash -dla exec --runtime [options] +dla exec <filename> [options] +dla exec --example-py [options] +dla exec --example-notebook [options] ``` **Arguments:** -- `filename`: Path to the Python file (.py) or Jupyter notebook (.ipynb) to execute +- `filename`: Path to the Python file (.py) or Jupyter notebook (.ipynb) to execute (optional when using `--example-py` or `--example-notebook`) **Options:** -- `--runtime, -r`: Name of the runtime to execute on (required) +- `--sandbox, -s`: Name of the code sandbox to execute on (optional) - `--verbose, -v`: Show all cell outputs (default: false, outputs are suppressed) - `--timeout, -t`: Execution timeout for each cell in seconds - `--raise`: Stop executing if an exception occurs (default: continue on errors) +- `--example-py`: Create and execute a temporary example Python file +- `--example-notebook`: Create and execute a temporary example notebook **Examples:** ```bash -# Execute a Python script on a runtime -dla exec script.py --runtime my-runtime +# Execute a Python script on a code sandbox +dla exec script.py --sandbox my-sandbox + +# Execute an auto-generated Python example +dla exec --example-py --sandbox my-sandbox + +# Execute an auto-generated notebook example +dla exec --example-notebook # Execute a Jupyter notebook with verbose output -dla exec notebook.ipynb --runtime my-runtime --verbose +dla exec notebook.ipynb --sandbox my-sandbox --verbose # Execute with timeout and stop on errors -dla exec script.py --runtime my-runtime --timeout 30 --raise +dla 
exec script.py --sandbox my-sandbox --timeout 30 --raise ``` ## File Support @@ -41,11 +51,11 @@ The exec module supports: - **Python files (.py)**: The entire file content is executed as a single cell - **Jupyter notebooks (.ipynb)**: Each code cell is executed sequentially, markdown cells are skipped -## Runtime Connection +## Code Sandbox Connection The exec module uses the modern `DatalayerClient` and `RuntimeManager` to: -1. Connect to the specified runtime +1. Connect to the specified code sandbox 2. Start a kernel session 3. Execute cells sequentially 4. Handle interrupts (Ctrl+C) gracefully @@ -54,7 +64,7 @@ The exec module uses the modern `DatalayerClient` and `RuntimeManager` to: ## Error Handling - File validation (existence, readability) -- Runtime connection errors +- Code sandbox connection errors - Cell execution errors (can continue or stop based on `--raise` flag) - Proper cleanup on interruption or failure @@ -63,5 +73,5 @@ The exec module uses the modern `DatalayerClient` and `RuntimeManager` to: The exec functionality is implemented in: - `datalayer_core/cli/exec/exec.py`: Main Typer-based CLI commands - Uses `datalayer_core/utils/notebook.get_cells()` for file parsing -- Uses `datalayer_core/cli/console/manager.RuntimeManager` for runtime connection +- Uses `datalayer_core/cli/console/manager.RuntimeManager` for code sandbox connection - Integrates with the main CLI via `datalayer_core/cli/__main__.py` diff --git a/datalayer_core/cli/commands/agents.py b/datalayer_core/cli/commands/agents.py index 83798aef..e82d6148 100644 --- a/datalayer_core/cli/commands/agents.py +++ b/datalayer_core/cli/commands/agents.py @@ -1,3 +1,6 @@ +# Copyright (c) 2023-2025 Datalayer, Inc. +# Distributed under the terms of the Modified BSD License. + # Copyright (c) 2023-2026 Datalayer, Inc. # Distributed under the terms of the Modified BSD License. 
@@ -13,10 +16,11 @@ import typer import yaml from rich.console import Console +from rich.panel import Panel +from rich.table import Table from datalayer_core.client.client import DatalayerClient -from datalayer_core.displays.runtimes import display_runtimes -from datalayer_core.runtimes.local import ( +from datalayer_core.agents.agent_local import ( DEFAULT_LOCAL_AGENT_NAME, DEFAULT_LOCAL_HOST, DEFAULT_LOCAL_LOG_LEVEL, @@ -25,6 +29,8 @@ start_local_agent_runtime, terminate_local_agent_runtime, ) +from datalayer_core.utils.network import fetch +from datalayer_core.utils.date import timestamp_to_local_date from datalayer_core.utils.urls import DatalayerURLs DEFAULT_AGENT_SPEC_ID = "example-simple" @@ -98,6 +104,127 @@ def _load_agent_spec(spec_source: str) -> dict[str, Any]: return parsed +def _resolve_billable_account_details( + *, + client: DatalayerClient, + billable_account_uid: str, +) -> dict[str, str]: + """Resolve account metadata from IAM whoami/memberships payloads. + + When no explicit billable account UID is provided by the runtime payload, + fall back to the authenticated user profile from whoami. 
+ """ + + resolved_token = str(client._get_token() or "").strip() + if not resolved_token: + return {"uid": billable_account_uid} if billable_account_uid else {} + + iam_base = str(client.urls.iam_url).rstrip("/") + headers = {"Authorization": f"Bearer {resolved_token}"} + + try: + whoami_response = requests.get( + f"{iam_base}/api/iam/v1/whoami", + headers=headers, + timeout=10, + ) + except Exception: + whoami_response = None + + if whoami_response is not None and whoami_response.status_code == 200: + payload = whoami_response.json() + profile = payload.get("profile") or {} + profile_uid = str(profile.get("uid") or "").strip() + if profile_uid and (not billable_account_uid or profile_uid == billable_account_uid): + full_name = str(profile.get("name") or "").strip() + if not full_name: + first_name = str(profile.get("first_name") or "").strip() + last_name = str(profile.get("last_name") or "").strip() + full_name = " ".join(p for p in [first_name, last_name] if p) + return { + "uid": profile_uid, + "handle": str(profile.get("handle") or "").strip(), + "type": str(profile.get("type") or "user").strip() or "user", + "name": full_name, + "description": str(profile.get("description") or "").strip(), + } + + try: + memberships_response = requests.get( + f"{iam_base}/api/iam/v1/memberships", + headers=headers, + timeout=10, + ) + except Exception: + memberships_response = None + + if memberships_response is not None and memberships_response.status_code == 200: + memberships_payload = memberships_response.json() + memberships = memberships_payload.get("memberships") or [] + for membership in memberships: + uid = str((membership or {}).get("uid") or "").strip() + if uid == billable_account_uid: + return { + "uid": billable_account_uid, + "handle": str((membership or {}).get("handle") or "").strip(), + "type": str((membership or {}).get("type") or "").strip(), + "name": str((membership or {}).get("name") or "").strip(), + "description": str( + (membership or 
{}).get("description") or "" + ).strip(), + } + + return {"uid": billable_account_uid} if billable_account_uid else {} + + +def _resolve_agentspec_label(runtime_payload: dict[str, Any]) -> str: + """Best-effort extraction of agentspec identifier from runtime payload.""" + candidates = [ + runtime_payload.get("agent_spec_id"), + runtime_payload.get("agentspec_id"), + runtime_payload.get("agentSpecId"), + ] + for candidate in candidates: + value = str(candidate or "").strip() + if value: + return value + return "n/a" + + +def _billable_uid_label( + *, + billable_uid: str, + authenticated_uid: str, + rich: bool = False, +) -> str: + """Human label for billable UID in text/raw outputs.""" + if billable_uid and authenticated_uid and billable_uid == authenticated_uid: + return "[bold green]me[/bold green]" if rich else "me" + return billable_uid or "n/a" + + +def _print_runtime_summary_panel( + *, + title: str, + identifier: str, + agentspec: str, + url: str, +) -> None: + """Render a compact runtime summary panel.""" + lines = [ + f"Identifier: {identifier}", + f"Agentspec: {agentspec}", + f"URL: {url}", + ] + console.print( + Panel( + "\n".join(lines), + title=title, + border_style="green", + ) + ) + + def _create_local_agent_runtime( *, agent_spec_id: str, @@ -150,7 +277,7 @@ def _create_local_agent_runtime( f"[green]Local agent runtime '{agent_name}' started![/green]" ) console.print(f"Base URL: {runtime.base_url}") - console.print(f"Agent spec id: {agent_spec_id}") + console.print(f"Agentspec id: {agent_spec_id}") console.print(f"Chat endpoint: {runtime.chat_endpoint}") console.print("[dim]Press Ctrl+C to stop the local runtime.[/dim]") @@ -186,23 +313,62 @@ def list_agents( try: client = _make_client(token=token, iam_url=iam_url, runtimes_url=runtimes_url) runtimes = client.list_runtimes() - runtime_dicts: list[dict[str, Any]] = [] + + authenticated_uid = str( + _resolve_billable_account_details( + client=client, + billable_account_uid="", + ).get("uid") + or "" + 
).strip() + + table = Table(title="Agents") + table.add_column("ID", style="cyan", no_wrap=True) + table.add_column("Name", style="cyan", no_wrap=True) + table.add_column("Environment", style="cyan", no_wrap=True) + table.add_column("Billable Account UID", style="cyan", no_wrap=True) + table.add_column("Expired At", style="cyan", no_wrap=True) + for runtime in runtimes: - runtime_dicts.append( - { - "given_name": runtime.name, - "environment_name": runtime.environment, - "pod_name": runtime.pod_name, - "ingress": runtime.ingress, - "reservation_id": runtime.reservation_id, - "uid": runtime.uid, - "burning_rate": runtime.burning_rate, - "token": runtime.jupyter_token, - "started_at": runtime.started_at, - "expired_at": runtime.expired_at, - } + runtime_payload: dict[str, Any] = {} + ownership_payload: dict[str, Any] = {} + pod_name = str(runtime.pod_name or "") + if pod_name: + try: + runtime_response = client._get_runtime(pod_name) + runtime_payload = runtime_response.get("runtime") or {} + ownership_payload = runtime_payload.get("ownership") or {} + except Exception: + runtime_payload = {} + ownership_payload = {} + + billable_uid = str( + runtime_payload.get("billable_account_uid") + or ownership_payload.get("billable_account_uid") + or getattr(runtime, "billable_account_uid", "") + or "" + ).strip() + if not billable_uid and authenticated_uid: + billable_uid = authenticated_uid + + display_billable_uid = _billable_uid_label( + billable_uid=billable_uid, + authenticated_uid=authenticated_uid, + rich=True, + ) + + expired_at = runtime.expired_at + table.add_row( + pod_name, + str(runtime.name or ""), + str(runtime.environment or ""), + display_billable_uid, + "Never" + if expired_at is None + else timestamp_to_local_date(expired_at), ) - display_runtimes(runtime_dicts) + + console.print(table) except Exception as exc: console.print(f"[red]Error listing agent runtimes: {exc}[/red]") raise typer.Exit(1) @@ -220,14 +386,14 @@ def create_agent_runtime( None, 
"--agentspec-id", help=( - "Agent spec id for runtime bootstrap. " + "Agentspec id for runtime bootstrap. " f"Defaults to {DEFAULT_AGENT_SPEC_ID} when --agentspec is omitted." ), ), spec: Optional[str] = typer.Option( None, "--agentspec", - help="Agent spec source as YAML/JSON URL or local file path.", + help="Agentspec source as YAML/JSON URL or local file path.", ), time_reservation: Optional[float] = typer.Option( 10.0, @@ -272,7 +438,12 @@ def create_agent_runtime( local: bool = typer.Option( False, "--local", - help="Launch the agent as a local agent-runtimes server instead of a cloud runtime.", + help="Launch the agent as a local agent-runtimes server.", + ), + cloud: bool = typer.Option( + False, + "--cloud", + help="Launch the agent as a cloud runtime.", ), host: str = typer.Option( DEFAULT_LOCAL_HOST, @@ -295,10 +466,10 @@ def create_agent_runtime( help="Log level for the local runtime process (only with --local).", ), ) -> None: - """Create a new runtime preloaded with an agent spec. + """Create a new runtime preloaded with an agentspec. - By default creates a cloud runtime. With ``--local`` it launches a local - ``agent-runtimes`` server and serves until interrupted (Ctrl+C). + By default creates a cloud runtime. Use ``--local`` for a local + ``agent-runtimes`` server, or ``--cloud`` to be explicit. """ import questionary @@ -308,6 +479,9 @@ def create_agent_runtime( "Use either --agentspec-id or --agentspec, not both." 
) + if local and cloud: + raise typer.BadParameter("Use only one of --local or --cloud.") + if local: if spec: raise typer.BadParameter( @@ -366,6 +540,35 @@ def create_agent_runtime( billable_account_handle=billable_account_handle, ) + authenticated_uid = str( + _resolve_billable_account_details( + client=client, + billable_account_uid="", + ).get("uid") + or "" + ).strip() + + created_runtime_payload: dict[str, Any] = {} + ownership_payload: dict[str, Any] = {} + created_pod_name = str(runtime.pod_name or "") + if created_pod_name: + try: + created_runtime_response = client._get_runtime(created_pod_name) + created_runtime_payload = created_runtime_response.get("runtime") or {} + ownership_payload = created_runtime_payload.get("ownership") or {} + except Exception: + created_runtime_payload = {} + ownership_payload = {} + + billable_uid = str( + created_runtime_payload.get("billable_account_uid") + or ownership_payload.get("billable_account_uid") + or billable_account_uid + or "" + ).strip() + if not billable_uid and authenticated_uid: + billable_uid = authenticated_uid + if raw: payload = { "success": True, @@ -379,6 +582,11 @@ def create_agent_runtime( "burning_rate": runtime.burning_rate, "started_at": runtime.started_at, "expired_at": runtime.expired_at, + "billable_account_uid": billable_uid or None, + "billable_account_uid_label": _billable_uid_label( + billable_uid=billable_uid, + authenticated_uid=authenticated_uid, + ), }, "agent_spec_id": resolved_spec_id, "agent_spec_source": spec or "", @@ -386,15 +594,15 @@ def create_agent_runtime( console.print(json.dumps(payload, ensure_ascii=False)) return - console.print(f"[green]Agent runtime '{runtime.name}' created successfully![/green]") - if runtime.pod_name: - console.print(f"Pod: {runtime.pod_name}") - if runtime.ingress: - console.print(f"Ingress: {runtime.ingress}") - if resolved_spec_id: - console.print(f"Agent spec id: {resolved_spec_id}") - elif spec: - console.print(f"Agent spec source: {spec}") + 
spec_label = resolved_spec_id or spec or "n/a" + identifier = str(runtime.pod_name or runtime.uid or runtime.name or "") + url = str(runtime.ingress or "") + _print_runtime_summary_panel( + title="Agent Runtime Created", + identifier=identifier, + agentspec=spec_label, + url=url, + ) except typer.Exit: raise @@ -459,7 +667,28 @@ def get_agent_runtime( raise typer.Exit(0) pod_name = selected + runtime_response = client._get_runtime(pod_name) + runtime_payload = runtime_response.get("runtime") or {} + ownership_payload = runtime_payload.get("ownership") or {} runtime = client.get_runtime(pod_name) + + authenticated_uid = str( + _resolve_billable_account_details( + client=client, + billable_account_uid="", + ).get("uid") + or "" + ).strip() + + billable_uid = str( + runtime_payload.get("billable_account_uid") + or ownership_payload.get("billable_account_uid") + or getattr(runtime, "billable_account_uid", "") + or "" + ).strip() + if not billable_uid and authenticated_uid: + billable_uid = authenticated_uid + runtime_dict = { "given_name": runtime.name, "environment_name": runtime.environment, @@ -471,6 +700,11 @@ def get_agent_runtime( "token": runtime.jupyter_token, "started_at": runtime.started_at, "expired_at": runtime.expired_at, + "billable_account_uid": billable_uid or None, + "billable_account_uid_label": _billable_uid_label( + billable_uid=billable_uid, + authenticated_uid=authenticated_uid, + ), } if raw: @@ -481,7 +715,12 @@ def get_agent_runtime( ) return - display_runtimes([runtime_dict]) + _print_runtime_summary_panel( + title="Agent Runtime", + identifier=str(runtime.pod_name or runtime.uid or runtime.name or ""), + agentspec=_resolve_agentspec_label(runtime_payload), + url=str(runtime.ingress or ""), + ) except typer.Exit: raise @@ -664,4 +903,307 @@ def agents_ls( ), ) -> None: """List running agent runtimes (root command alias).""" - list_agents(token=token, iam_url=iam_url, runtimes_url=runtimes_url) \ No newline at end of file + 
list_agents(token=token, iam_url=iam_url, runtimes_url=runtimes_url) + + +@app.command(name="inspect") +def inspect_agent_runtime( + agent: Optional[str] = typer.Option( + None, + "--agent", + "-a", + help="Agent identifier (pod name, uid, or given name). Defaults to first running runtime.", + ), + token: Optional[str] = typer.Option( + None, + "--token", + help="Authentication token (Bearer token for API requests).", + ), + iam_url: Optional[str] = typer.Option( + None, + "--iam-url", + help="Datalayer IAM server URL", + ), + runtimes_url: Optional[str] = typer.Option( + None, + "--runtimes-url", + help="Datalayer Runtimes server URL", + ), +) -> None: + """Inspect an agent runtime and list available code sandboxes.""" + try: + client = _make_client(token=token, iam_url=iam_url, runtimes_url=runtimes_url) + runtimes = client.list_runtimes() + if not runtimes: + console.print("[yellow]No running runtimes found.[/yellow]") + raise typer.Exit(1) + + selected = None + if agent: + for candidate in runtimes: + if agent in {candidate.pod_name, candidate.uid, candidate.name}: + selected = candidate + break + if selected is None: + console.print(f"[red]Agent '{agent}' not found.[/red]") + raise typer.Exit(1) + else: + selected = runtimes[0] + + pod_name = selected.pod_name or "" + runtime_response = client._get_runtime(pod_name) + runtime_payload = runtime_response.get("runtime") or {} + ownership_payload = runtime_payload.get("ownership") or {} + + refreshed = client.get_runtime(pod_name) + endpoint = str(refreshed.ingress or "").rstrip("/") + runtime_token = str(refreshed.jupyter_token or client._get_token() or "") + if not endpoint: + console.print("[red]Runtime has no ingress endpoint.[/red]") + raise typer.Exit(1) + + billable_account_uid = str( + runtime_payload.get("billable_account_uid") + or ownership_payload.get("billable_account_uid") + or "" + ).strip() + billable_account_handle = str( + runtime_payload.get("billable_account_handle") + or 
ownership_payload.get("billable_account_handle") + or "" + ).strip() + billable_account_kind = str( + runtime_payload.get("billable_account_kind") + or ownership_payload.get("billable_account_kind") + or runtime_payload.get("billable_account_type") + or ownership_payload.get("billable_account_type") + or "" + ).strip() + + account_details = _resolve_billable_account_details( + client=client, + billable_account_uid=billable_account_uid, + ) + authenticated_uid = str( + _resolve_billable_account_details( + client=client, + billable_account_uid="", + ).get("uid") + or "" + ).strip() + billable_account_uid = str( + account_details.get("uid") or billable_account_uid or "" + ).strip() + display_billable_uid = _billable_uid_label( + billable_uid=billable_account_uid, + authenticated_uid=authenticated_uid, + rich=True, + ) + resolved_handle = str( + account_details.get("handle") or billable_account_handle or "" + ).strip() + resolved_kind = str( + account_details.get("type") or billable_account_kind or "" + ).strip() + resolved_name = str(account_details.get("name") or "").strip() + resolved_description = str(account_details.get("description") or "").strip() + + kernel_endpoints = [f"{endpoint}/api/kernels"] + if "/jupyter/server/" in endpoint: + host_prefix, remainder = endpoint.split("/jupyter/server/", 1) + path_parts = [part for part in remainder.split("/") if part] + if path_parts: + pool = path_parts[0] + kernel_endpoints.append( + f"{host_prefix}/jupyter/server/{pool}/api/kernels" + ) + kernel_endpoints.append(f"{host_prefix}/jupyter/api/kernels") + kernel_endpoints.append(f"{endpoint}/jupyter/api/kernels") + + # Deduplicate while preserving order. 
+ deduped_kernel_endpoints: list[str] = [] + seen_endpoints: set[str] = set() + for kernel_url in kernel_endpoints: + if kernel_url not in seen_endpoints: + seen_endpoints.add(kernel_url) + deduped_kernel_endpoints.append(kernel_url) + kernel_endpoints = deduped_kernel_endpoints + + kernels: list[Any] = [] + kernel_endpoint_used = "" + kernel_lookup_error = "" + for kernel_url in kernel_endpoints: + try: + response = fetch(kernel_url, token=runtime_token, timeout=15) + payload = response.json() if response.content else [] + if isinstance(payload, list): + kernels = payload + else: + kernels = [] + kernel_endpoint_used = kernel_url + kernel_lookup_error = "" + break + except Exception as exc: + kernel_lookup_error = str(exc) + + if not isinstance(kernels, list): + kernels = [] + + _print_runtime_summary_panel( + title="Agent Runtime Inspection", + identifier=str(refreshed.pod_name or refreshed.uid or refreshed.name or ""), + agentspec=_resolve_agentspec_label(runtime_payload), + url=endpoint, + ) + + summary = Table(title="Agent Runtime Inspection") + summary.add_column("Field", style="cyan") + summary.add_column("Value") + summary.add_row("Runtime", str(refreshed.name or pod_name)) + summary.add_row("Pod", str(pod_name)) + summary.add_row("UID", str(refreshed.uid or "")) + summary.add_row("Ingress", endpoint) + summary.add_row("Billable Account UID", display_billable_uid) + if kernel_endpoint_used: + summary.add_row("Kernels", str(len(kernels))) + summary.add_row("Kernel API", kernel_endpoint_used) + else: + summary.add_row("Kernels", "unavailable") + summary.add_row("Kernel API", "not exposed via ingress") + console.print(summary) + + account_table = Table(title="Billable Account") + account_table.add_column("Field", style="cyan") + account_table.add_column("Value") + account_table.add_row("UID", display_billable_uid) + account_table.add_row("Handle", resolved_handle or "n/a") + account_table.add_row("Type", resolved_kind or "n/a") + account_table.add_row("Name", 
resolved_name or "n/a") + account_table.add_row("Description", resolved_description or "n/a") + console.print(account_table) + + code_sandboxes_table = Table(title="Available Code Sandboxes") + code_sandboxes_table.add_column("ID", style="green") + code_sandboxes_table.add_column("Name") + code_sandboxes_table.add_column("State") + code_sandboxes_table.add_column("Connections") + code_sandboxes_table.add_column("Last Activity") + + for kernel in kernels: + code_sandboxes_table.add_row( + str((kernel or {}).get("id") or ""), + str((kernel or {}).get("name") or ""), + str((kernel or {}).get("execution_state") or ""), + str((kernel or {}).get("connections") or "0"), + str((kernel or {}).get("last_activity") or ""), + ) + + if kernels: + console.print(code_sandboxes_table) + else: + if kernel_lookup_error: + console.print( + "[yellow]Kernel list unavailable (all probed endpoints failed).[/yellow]" + ) + console.print( + "[dim]Probed endpoints:[/dim]" + ) + for kernel_url in kernel_endpoints: + console.print(f"[dim]- {kernel_url}[/dim]") + console.print(f"[dim]Last error: {kernel_lookup_error}[/dim]") + else: + console.print("[yellow]No kernels returned by runtime API.[/yellow]") + except typer.Exit: + raise + except Exception as exc: + console.print(f"[red]Error inspecting agent runtime: {exc}[/red]") + raise typer.Exit(1) + + +@app.command(name="health") +def health_agent_runtime( + agent: Optional[str] = typer.Option( + None, + "--agent", + "-a", + help="Agent identifier (pod name, uid, or given name). 
Defaults to first running runtime.", + ), + token: Optional[str] = typer.Option( + None, + "--token", + help="Authentication token (Bearer token for API requests).", + ), + api_key: Optional[str] = typer.Option( + None, + "--api-key", + help="Authentication API key (alias for --token).", + ), + iam_url: Optional[str] = typer.Option( + None, + "--iam-url", + help="Datalayer IAM server URL", + ), + runtimes_url: Optional[str] = typer.Option( + None, + "--runtimes-url", + help="Datalayer Runtimes server URL", + ), +) -> None: + """Check agent runtime health by executing a probe on the sandbox.""" + try: + client = _make_client( + token=token or api_key, + iam_url=iam_url, + runtimes_url=runtimes_url, + ) + runtimes = client.list_runtimes() + if not runtimes: + console.print("[yellow]No running runtimes found.[/yellow]") + raise typer.Exit(1) + + selected = None + if agent: + for candidate in runtimes: + if agent in {candidate.pod_name, candidate.uid, candidate.name}: + selected = candidate + break + if selected is None: + console.print(f"[red]Agent '{agent}' not found.[/red]") + raise typer.Exit(1) + else: + selected = runtimes[0] + + pod_name = selected.pod_name or selected.uid or selected.name or "" + refreshed = client.get_runtime(pod_name) + health = client.check_runtime_health( + pod_name, + api_key=api_key, + ) + + health_status = "alive" if bool(health.get("success")) else "unreachable" + detail = str(health.get("message") or "health probe failed") + probe_mode = str(health.get("probe_mode") or "n/a") + + table = Table(title="Agent Runtime Health") + table.add_column("Field", style="cyan") + table.add_column("Value") + table.add_row("Runtime", str(refreshed.name or pod_name)) + table.add_row("Pod", str(pod_name)) + table.add_row("UID", str(refreshed.uid or "")) + table.add_row("Ingress", str(refreshed.ingress or "n/a")) + table.add_row("Probe", probe_mode) + table.add_row("Status", health_status) + table.add_row("Detail", detail) + console.print(table) + + 
stdout_tail = str(health.get("stdout_tail") or "").strip() + if stdout_tail: + console.print(f"[dim]Probe stdout: {stdout_tail}[/dim]") + + if health_status != "alive": + raise typer.Exit(1) + except typer.Exit: + raise + except Exception as exc: + console.print(f"[red]Error checking agent runtime health: {exc}[/red]") + raise typer.Exit(1) \ No newline at end of file diff --git a/datalayer_core/cli/commands/api_keys.py b/datalayer_core/cli/commands/api_keys.py new file mode 100644 index 00000000..e465a546 --- /dev/null +++ b/datalayer_core/cli/commands/api_keys.py @@ -0,0 +1,192 @@ +# Copyright (c) 2023-2025 Datalayer, Inc. +# Distributed under the terms of the Modified BSD License. + +"""API key commands for Datalayer CLI.""" + +from typing import Optional + +import typer +from rich.console import Console + +from datalayer_core.client.client import DatalayerClient +from datalayer_core.displays.api_keys import display_api_keys +from datalayer_core.models.api_key import ApiKeyType + +# Create a Typer app for API key commands +app = typer.Typer( + name="api-keys", + help="API key management commands", + invoke_without_command=True, +) + +console = Console() + + +@app.callback() +def api_keys_callback(ctx: typer.Context) -> None: + """API key management commands.""" + if ctx.invoked_subcommand is None: + typer.echo(ctx.get_help()) + + +@app.command(name="ls") +def list_api_keys( + token: Optional[str] = typer.Option( + None, + "--token", + help="Authentication token (Bearer token for API requests).", + ), +) -> None: + """List all API keys.""" + try: + client = DatalayerClient(token=token) + api_keys = client.list_api_keys() + + # Convert to dict format for display_api_keys + api_key_dicts = [] + for api_key in api_keys: + api_key_dicts.append( + { + "uid": api_key.uid, + "name_s": api_key.name, + "description_t": api_key.description, + "variant_s": api_key.api_key_type, + } + ) + + display_api_keys(api_key_dicts) + + except Exception as e: + 
console.print(f"[red]Error listing API keys: {e}[/red]") + raise typer.Exit(1) + + +@app.command(name="list") +def list_api_keys_verbose( + token: Optional[str] = typer.Option( + None, + "--token", + help="Authentication token (Bearer token for API requests).", + ), +) -> None: + """List all API keys.""" + list_api_keys(token=token) + + +@app.command(name="create") +def create_api_key( + name: str = typer.Argument(..., help="Name of the API key"), + description: str = typer.Argument(..., help="Description of the API key"), + expiration_date: Optional[int] = typer.Option( + 0, + "--expiration-date", + help="Expiration date in seconds since epoch (0 for no expiration)", + ), + api_key_type: str = typer.Option( + ApiKeyType.SECRET.value, + "--api-key-type", + help="Type of the API key (secret, publishable, restricted, temporary)", + ), + token: Optional[str] = typer.Option( + None, + "--token", + help="Authentication token (Bearer token for API requests).", + ), +) -> None: + """Create a new API key.""" + try: + client = DatalayerClient(token=token) + + result = client.create_api_key( + name=name, + description=description, + expiration_date=expiration_date or 0, + api_key_type=api_key_type, + ) + + if result.get("success", False): + api_key_data = result.get("api_key", result.get("token", {})) + console.print( + f"[green]API key '{name}' created successfully![/green]" + ) + console.print( + f"[yellow]API key value: {result.get('access_token', 'N/A')}[/yellow]" + ) + console.print( + "[dim]Please save this API key value securely - it won't be shown again![/dim]" + ) + + # Display the created API key info. 
+ if api_key_data: + display_api_keys( + [ + { + "uid": api_key_data.get("uid"), + "name_s": api_key_data.get("name_s", name), + "description_t": api_key_data.get( + "description_t", description + ), + "variant_s": api_key_data.get( + "variant_s", api_key_type + ), + } + ] + ) + else: + console.print( + f"[red]Failed to create API key: {result.get('message', 'Unknown error')}[/red]" + ) + raise typer.Exit(1) + + except Exception as e: + console.print(f"[red]Error creating API key: {e}[/red]") + raise typer.Exit(1) + + +@app.command(name="delete") +def delete_api_key( + uid: str = typer.Argument(..., help="UID of the API key to delete"), + token: Optional[str] = typer.Option( + None, + "--token", + help="Authentication token (Bearer token for API requests).", + ), +) -> None: + """Delete an API key.""" + try: + client = DatalayerClient(token=token) + + success = client.delete_api_key(uid) + + if success: + console.print(f"[green]API key '{uid}' deleted successfully![/green]") + else: + console.print(f"[red]Failed to delete API key '{uid}'[/red]") + raise typer.Exit(1) + + except Exception as e: + console.print(f"[red]Error deleting API key: {e}[/red]") + raise typer.Exit(1) + + +# Root level commands for convenience +def api_keys_list( + token: Optional[str] = typer.Option( + None, + "--token", + help="Authentication token (Bearer token for API requests).", + ), +) -> None: + """List all API keys (root command).""" + list_api_keys(token=token) + + +def api_keys_ls( + token: Optional[str] = typer.Option( + None, + "--token", + help="Authentication token (Bearer token for API requests).", + ), +) -> None: + """List all API keys (root command alias).""" + list_api_keys(token=token) diff --git a/datalayer_core/cli/commands/authn.py b/datalayer_core/cli/commands/authn.py index 7bfc2b09..8d0335ad 100644 --- a/datalayer_core/cli/commands/authn.py +++ b/datalayer_core/cli/commands/authn.py @@ -449,10 +449,39 @@ def whoami( "--details", help="Show detailed user information", 
), + urls_only: bool = typer.Option( + False, + "--urls", + help="Show only resolved Datalayer service URLs", + ), ) -> None: """Show current authenticated user.""" try: urls = DatalayerURLs.from_environment(run_url=run_url, iam_url=iam_url) + + if urls_only: + url_items = [ + ("DATALAYER_RUN_URL", urls.run_url), + ("DATALAYER_IAM_URL", urls.iam_url), + ("DATALAYER_RUNTIMES_URL", urls.runtimes_url), + ("DATALAYER_SPACER_URL", urls.spacer_url), + ("DATALAYER_LIBRARY_URL", urls.library_url), + ("DATALAYER_MANAGER_URL", urls.manager_url), + ("DATALAYER_AI_AGENTS_URL", urls.ai_agents_url), + ("DATALAYER_AI_INFERENCE_URL", urls.ai_inference_url), + ("DATALAYER_OTEL_URL", urls.otel_url), + ("DATALAYER_GROWTH_URL", urls.growth_url), + ("DATALAYER_SUCCESS_URL", urls.success_url), + ("DATALAYER_STATUS_URL", urls.status_url), + ("DATALAYER_SUPPORT_URL", urls.support_url), + ("DATALAYER_MCP_SERVER_URL", urls.mcp_server_url), + ("DATALAYER_SCHEDULER_URL", urls.scheduler_url), + ] + console.print("[bold]Defined URLs:[/bold]") + for env_name, value in url_items: + console.print(f" 🌐 {env_name}: [green]{value}[/green]") + return + auth = AuthenticationManager(urls.iam_url) # If token provided, store it temporarily for whoami @@ -469,11 +498,29 @@ def whoami( console.print(f"👤 User: [cyan]{handle}[/cyan]") if email: console.print(f"📧 Email: {email}") - console.print(f"🌐 Server: [green]{urls.run_url}[/green]") + console.print(f"🌐 Datalayer RUN URL: [green]{urls.run_url}[/green]") if details: console.print("\n[bold]Detailed Information:[/bold]") + url_items = [ + ("DATALAYER_RUN_URL", urls.run_url), + ("DATALAYER_IAM_URL", urls.iam_url), + ("DATALAYER_RUNTIMES_URL", urls.runtimes_url), + ("DATALAYER_SPACER_URL", urls.spacer_url), + ("DATALAYER_LIBRARY_URL", urls.library_url), + ("DATALAYER_MANAGER_URL", urls.manager_url), + ("DATALAYER_AI_AGENTS_URL", urls.ai_agents_url), + ("DATALAYER_AI_INFERENCE_URL", urls.ai_inference_url), + ("DATALAYER_OTEL_URL", urls.otel_url), + 
("DATALAYER_GROWTH_URL", urls.growth_url), + ("DATALAYER_SUCCESS_URL", urls.success_url), + ("DATALAYER_STATUS_URL", urls.status_url), + ("DATALAYER_SUPPORT_URL", urls.support_url), + ("DATALAYER_MCP_SERVER_URL", urls.mcp_server_url), + ("DATALAYER_SCHEDULER_URL", urls.scheduler_url), + ] + # Full name first_name = user.get("first_name_t", "") last_name = user.get("last_name_t", "") @@ -558,20 +605,22 @@ def whoami( teams = [m for m in memberships if (m.get("type") or "").lower() == "team"] org_by_uid = {m.get("uid"): m for m in orgs} + console.print("\n[bold]👥 Memberships:[/bold]") + if orgs: - console.print("\n[bold]🏢 Organizations:[/bold]") + console.print(" [bold]🏢 Organizations:[/bold]") for org in orgs: handle = org.get("handle") or org.get("uid") or "unknown" name = org.get("name") or "" roles = ", ".join(org.get("roles_ss") or []) or "-" - label = f" • [cyan]{handle}[/cyan]" + label = f" • [cyan]{handle}[/cyan]" if name and name != handle: label += f" ({name})" label += f" uid={org.get('uid')} roles={roles}" console.print(label) if teams: - console.print("\n[bold]👥 Teams:[/bold]") + console.print(" [bold]👥 Teams:[/bold]") for team in teams: handle = team.get("handle") or team.get("uid") or "unknown" name = team.get("name") or "" @@ -581,7 +630,7 @@ def whoami( parent_label = ( parent.get("handle") if parent else (org_uid or "unknown") ) - label = f" • [cyan]{handle}[/cyan]" + label = f" • [cyan]{handle}[/cyan]" if name and name != handle: label += f" ({name})" label += f" in [magenta]{parent_label}[/magenta]" @@ -589,7 +638,11 @@ def whoami( console.print(label) if not orgs and not teams: - console.print("\n[dim]No organization or team memberships.[/dim]") + console.print(" [dim]No organization or team memberships.[/dim]") + + console.print("\n[bold]Defined URLs:[/bold]") + for env_name, value in url_items: + console.print(f" 🌐 {env_name}: [green]{value}[/green]") else: console.print("[yellow]Not authenticated[/yellow]") console.print("Run 'datalayer login' 
to authenticate") @@ -684,8 +737,19 @@ def whoami_root( "--details", help="Show detailed user information", ), + urls_only: bool = typer.Option( + False, + "--urls", + help="Show only resolved Datalayer service URLs", + ), ) -> None: """ Show current authenticated user. """ - whoami(run_url=run_url, iam_url=iam_url, token=token, details=details) + whoami( + run_url=run_url, + iam_url=iam_url, + token=token, + details=details, + urls_only=urls_only, + ) diff --git a/datalayer_core/cli/commands/runtime_checkpoints.py b/datalayer_core/cli/commands/checkpoints.py similarity index 83% rename from datalayer_core/cli/commands/runtime_checkpoints.py rename to datalayer_core/cli/commands/checkpoints.py index 16face1c..41b67a45 100644 --- a/datalayer_core/cli/commands/runtime_checkpoints.py +++ b/datalayer_core/cli/commands/checkpoints.py @@ -72,11 +72,11 @@ def checkpoints_callback(ctx: typer.Context) -> None: @app.command(name="ls") def checkpoints_list( - runtime_uid: Optional[str] = typer.Option( + agent_uid: Optional[str] = typer.Option( None, - "--runtime", - "-r", - help="Filter checkpoints by runtime UID (pod name). If omitted, lists all checkpoints.", + "--agent", + "-a", + help="Filter checkpoints by agent UID (pod name). If omitted, lists all checkpoints.", ), token: Optional[str] = typer.Option( None, @@ -91,8 +91,8 @@ def checkpoints_list( ) -> None: """List runtime checkpoints.""" try: - if runtime_uid: - path = f"/runtime-checkpoints/{runtime_uid}" + if agent_uid: + path = f"/runtime-checkpoints/{agent_uid}" else: path = "/runtime-checkpoints" data = _fetch_api(path, token=token, runtimes_url=runtimes_url) @@ -109,11 +109,11 @@ def checkpoints_list( def checkpoints_ls( - runtime_uid: Optional[str] = typer.Option( + agent_uid: Optional[str] = typer.Option( None, - "--runtime", - "-r", - help="Filter checkpoints by runtime UID (pod name). If omitted, lists all checkpoints.", + "--agent", + "-a", + help="Filter checkpoints by agent UID (pod name). 
If omitted, lists all checkpoints.", ), token: Optional[str] = typer.Option( None, @@ -127,17 +127,17 @@ def checkpoints_ls( ), ) -> None: """List runtime checkpoints (root command alias).""" - checkpoints_list(runtime_uid=runtime_uid, token=token, runtimes_url=runtimes_url) + checkpoints_list(agent_uid=agent_uid, token=token, runtimes_url=runtimes_url) @app.command(name="delete") def checkpoints_delete( checkpoint_uid: str = typer.Argument(..., help="Checkpoint UID to delete"), - runtime_uid: Optional[str] = typer.Option( + agent_uid: Optional[str] = typer.Option( None, - "--runtime", - "-r", - help="Runtime UID that owns the checkpoint. If omitted, will be looked up automatically.", + "--agent", + "-a", + help="Agent UID that owns the checkpoint. If omitted, will be looked up automatically.", ), token: Optional[str] = typer.Option( None, @@ -158,8 +158,8 @@ def checkpoints_delete( ) -> None: """Delete a runtime checkpoint.""" try: - # If runtime_uid not provided, look up the checkpoint first. - if not runtime_uid: + # If agent_uid not provided, look up the checkpoint first. + if not agent_uid: # List all checkpoints and find the one matching the uid. 
data = _fetch_api( "/runtime-checkpoints", token=token, runtimes_url=runtimes_url @@ -169,16 +169,16 @@ def checkpoints_delete( if not match: console.print(f"[red]Checkpoint {checkpoint_uid} not found.[/red]") raise typer.Exit(1) - runtime_uid = match["runtime_uid"] + agent_uid = match["runtime_uid"] if not yes: typer.confirm( - f"Delete checkpoint {checkpoint_uid} from runtime {runtime_uid}?", + f"Delete checkpoint {checkpoint_uid} from agent {agent_uid}?", abort=True, ) _fetch_api( - f"/runtime-checkpoints/{runtime_uid}/{checkpoint_uid}", + f"/runtime-checkpoints/{agent_uid}/{checkpoint_uid}", method="DELETE", token=token, runtimes_url=runtimes_url, diff --git a/datalayer_core/cli/commands/cluster.py b/datalayer_core/cli/commands/cluster.py index 61e6973b..bf65da12 100644 --- a/datalayer_core/cli/commands/cluster.py +++ b/datalayer_core/cli/commands/cluster.py @@ -112,7 +112,29 @@ def _build_anomalies_panel(nodes_with_pods: list[Any], unassigned: list[Any]) -> if bool((pod or {}).get("unschedulable")): unschedulable_pods += 1 + yellow_total = pending_pods + len(unassigned) + pending_scale_up_nodes + pending_scale_down_nodes + red_total = unschedulable_pods + failed_pods + not_ready_nodes + + if red_total > 0: + summary_label = "FAILURES" + summary_style = "red" + border_style = "red" + elif yellow_total > 0: + summary_label = "WARNING" + summary_style = "yellow" + border_style = "yellow" + else: + summary_label = "OK" + summary_style = "green" + border_style = "green" + lines = Text() + lines.append("summary: ", style="bold") + lines.append(summary_label, style=f"bold {summary_style}") + lines.append("\n", style=summary_style) + lines.append(f"yellow flags: {yellow_total}\n", style="yellow") + lines.append(f"red flags: {red_total}\n", style="red") + lines.append("----------------------------------------\n", style="dim") lines.append("Pods\n", style="bold") lines.append(f"pending pods: {pending_pods}\n", style="yellow") lines.append(f"unschedulable pods: 
{unschedulable_pods}\n", style="red") @@ -124,7 +146,7 @@ def _build_anomalies_panel(nodes_with_pods: list[Any], unassigned: list[Any]) -> lines.append(f"pending scale-up nodes: {pending_scale_up_nodes}\n", style="cyan") lines.append(f"pending scale-down nodes: {pending_scale_down_nodes}", style="cyan") - return Panel(lines, title="Anomalies", border_style="yellow") + return Panel(lines, title="Anomalies", border_style=border_style) @app.callback() diff --git a/datalayer_core/cli/commands/console.py b/datalayer_core/cli/commands/console.py index ec13c02e..caad41b3 100644 --- a/datalayer_core/cli/commands/console.py +++ b/datalayer_core/cli/commands/console.py @@ -14,7 +14,7 @@ # Create a Typer app for console commands app = typer.Typer( - name="console", help="Runtime console commands", invoke_without_command=True + name="console", help="Agent console commands", invoke_without_command=True ) console = Console() @@ -22,7 +22,7 @@ @app.callback() def console_callback(ctx: typer.Context) -> None: - """Runtime console commands.""" + """Agent console commands.""" if ctx.invoked_subcommand is None: typer.echo(ctx.get_help()) @@ -31,8 +31,8 @@ def console_callback(ctx: typer.Context) -> None: def console_connect( runtime_name: Optional[str] = typer.Option( None, - "--runtime", - help="The name of the Runtime to connect to", + "--agent", + help="The name of the Agent to connect to", ), run_url: Optional[str] = typer.Option( None, @@ -73,22 +73,22 @@ def console_connect( None, help="Additional arguments to pass to the console application" ), ) -> None: - """Connect to a Datalayer runtime console.""" + """Connect to a Datalayer agent console.""" try: # Get URLs configuration urls = DatalayerURLs.from_environment(run_url=run_url) - console.print("[green]Starting Datalayer runtime console...[/green]") + console.print("[green]Starting Datalayer agent console...[/green]") console.print(f"Run URL: {urls.run_url}") if runtime_name: - console.print(f"Runtime: {runtime_name}") + 
console.print(f"Agent: {runtime_name}") console.print("[yellow]Press Ctrl+D or Ctrl+C to exit the console[/yellow]") # Prepare sys.argv for the RuntimesConsoleApp args = [] if runtime_name: - args.extend(["--runtime", runtime_name]) + args.extend(["--agent", runtime_name]) if urls.run_url: args.extend(["--run-url", urls.run_url]) if token: @@ -124,7 +124,7 @@ def console_connect( except KeyboardInterrupt: console.print("\n[yellow]Console session ended.[/yellow]") except Exception as e: - console.print(f"[red]Error connecting to runtime console: {e}[/red]") + console.print(f"[red]Error connecting to agent console: {e}[/red]") raise typer.Exit(1) @@ -134,8 +134,8 @@ def console_callback_default( ctx: typer.Context, runtime_name: Optional[str] = typer.Option( None, - "--runtime", - help="The name of the Runtime to connect to", + "--agent", + help="The name of the Agent to connect to", ), run_url: Optional[str] = typer.Option( None, @@ -173,7 +173,7 @@ def console_callback_default( help="Connect to an existing kernel instead of starting a new one", ), ) -> None: - """Connect to a Datalayer runtime console (default behavior).""" + """Connect to a Datalayer agent console (default behavior).""" if ctx.invoked_subcommand is None: # Get any remaining arguments that weren't parsed extra_args: list[str] = [] diff --git a/datalayer_core/cli/commands/envs.py b/datalayer_core/cli/commands/envs.py index fbc8d71e..b949f00f 100644 --- a/datalayer_core/cli/commands/envs.py +++ b/datalayer_core/cli/commands/envs.py @@ -2,7 +2,6 @@ # Distributed under the terms of the Modified BSD License. 
"""Environment commands for Datalayer CLI.""" - from typing import Any, Dict, Optional import typer @@ -67,25 +66,26 @@ def list_environments( # Convert to dict format for display_environments env_dicts: list[Dict[str, Any]] = [] for env in environments: - env_dicts.append( - { - "name": env.name, - "title": env.title, - "burning_rate": env.burning_rate, - "language": env.language, - "owner": env.owner, - "visibility": env.visibility, - **(env.metadata or {}), - } - ) + env_dict: Dict[str, Any] = { + "name": env.name, + "title": env.title, + "burning_rate": env.burning_rate, + "language": env.language, + "owner": env.owner, + "visibility": env.visibility, + } + for key, value in (env.metadata or {}).items(): + if key not in env_dict: + env_dict[key] = value + env_dicts.append(env_dict) display_environments(env_dicts) if len(env_dicts) > 0: - console.print("\n[dim]Create a Runtime with e.g.[/dim]") + console.print("\n[dim]Create an Agent with e.g.[/dim]") for env_dict in env_dicts: console.print( - f"[dim]datalayer runtimes create --given-name my-runtime --credits-limit 3 {env_dict['name']}[/dim]" + f"[dim]datalayer agents create --given-name my-agent {env_dict['name']}[/dim]" ) console.print() diff --git a/datalayer_core/cli/commands/evals.py b/datalayer_core/cli/commands/evals.py index 72f27732..ea4d3734 100644 --- a/datalayer_core/cli/commands/evals.py +++ b/datalayer_core/cli/commands/evals.py @@ -1,3 +1,6 @@ +# Copyright (c) 2023-2025 Datalayer, Inc. +# Distributed under the terms of the Modified BSD License. + # Copyright (c) 2023-2026 Datalayer, Inc. # Distributed under the terms of the Modified BSD License. 
@@ -5,11 +8,7 @@ from __future__ import annotations -from datetime import datetime, timezone -import csv import json -import math -import re import time from pathlib import Path from typing import Any, Optional @@ -19,8 +18,36 @@ from rich.table import Table from rich.tree import Tree -from datalayer_core.client.client import DatalayerClient -from datalayer_core.utils.urls import DatalayerURLs +from datalayer_core.evals.evals import ( + load_evalset_spec, +) +from datalayer_core.evals.evals import ( + make_client as _make_client, +) +from datalayer_core.evals.evals import ( + merge_dicts as _merge_dicts, +) +from datalayer_core.evals.evals import ( + parse_json_file as _parse_json_file, +) +from datalayer_core.evals.evals import ( + parse_json_value as _parse_json_value, +) +from datalayer_core.evals.evals import ( + resolve_billable_account_uid as _resolve_billable_account_uid, +) +from datalayer_core.evals.evaluators import evaluate_evalset +from datalayer_core.evals.report import ( + _now_iso, + _parse_csv_values, + _parse_evaluator_specs, + _print_report_console, + _report_data, + _report_markdown, + _status_style, + _timestamp_slug, + _write_report_csv, +) app = typer.Typer( name="evals", @@ -36,1197 +63,6 @@ console = Console() -def _now_iso() -> str: - return datetime.now(timezone.utc).isoformat().replace("+00:00", "Z") - - -def _timestamp_slug(raw_iso: str) -> str: - cleaned = raw_iso.replace("-", "").replace(":", "").replace(".", "") - cleaned = cleaned.replace("+0000", "Z").replace("+00:00", "Z") - cleaned = cleaned.replace("T", "T") - if cleaned.endswith("Z"): - return cleaned - return f"{cleaned}Z" - - -def _parse_json_value(raw: Optional[str], flag_name: str) -> dict[str, Any]: - if not raw: - return {} - try: - parsed = json.loads(raw) - except Exception as exc: - raise typer.BadParameter(f"Invalid JSON for {flag_name}: {exc}") from exc - if not isinstance(parsed, dict): - raise typer.BadParameter(f"{flag_name} must decode to an object") - return 
parsed - - -def _parse_json_file(path_value: Optional[str], flag_name: str) -> dict[str, Any]: - if not path_value: - return {} - path = Path(path_value) - if not path.exists(): - raise typer.BadParameter(f"File not found for {flag_name}: {path}") - text = path.read_text(encoding="utf-8") - return _parse_json_value(text, flag_name) - - -def _merge_dicts(*parts: dict[str, Any]) -> dict[str, Any]: - merged: dict[str, Any] = {} - for part in parts: - merged.update(part) - return merged - - -def _make_client( - token: Optional[str] = None, - ai_agents_url: Optional[str] = None, -) -> DatalayerClient: - urls = DatalayerURLs.from_environment(ai_agents_url=ai_agents_url) - return DatalayerClient(urls=urls, token=token) - - -def _status_style(status: str) -> str: - normalized = status.lower() - if normalized in {"completed", "success", "passed"}: - return "green" - if normalized in {"running", "queued", "pending"}: - return "yellow" - if normalized in {"failed", "error"}: - return "red" - return "white" - - -def _run_pass_rate(run: dict[str, Any]) -> float | None: - metrics = run.get("metrics") or {} - raw = metrics.get("pass_rate") - if isinstance(raw, (int, float)): - value = float(raw) - if value < 0: - return 0.0 - if value > 1: - return 1.0 - return value - return None - - -def _fmt_pct(raw: float | None) -> str: - if raw is None: - return "n/a" - return f"{raw * 100:.1f}%" - - -def _style_text(value: str, style: str | None, colorize: bool) -> str: - if not colorize or not style: - return value - return f"[{style}]{value}[/{style}]" - - -def _compute_baseline_and_drift(runs: list[dict[str, Any]]) -> tuple[float | None, float | None, float | None]: - pass_rates = [rate for rate in (_run_pass_rate(run) for run in runs) if rate is not None] - if not pass_rates: - return None, None, None - baseline_size = min(3, max(1, len(pass_rates) // 2)) - baseline_slice = pass_rates[:baseline_size] - baseline = sum(baseline_slice) / baseline_size - latest = pass_rates[-1] - drift = 
latest - baseline - return baseline, latest, drift - - -def _classify_legacy_failure(message: str) -> dict[str, Any]: - """Infer a structured stage/type/url from a free-form legacy error message. - - Older runs (and any path that only persisted a plain error string) lack a - structured ``failure_cause``. Rather than rendering ``unknown`` / - ``legacy_error`` with an empty detail excerpt, classify the most common - error shapes so the report stays actionable. - """ - text = message.strip() - lowered = text.lower() - - url_match = re.search(r"https?://[^\s]+", text) - execution_url = url_match.group(0).rstrip(".,)") if url_match else "" - - stage = "unknown" - failure_type = "legacy_error" - if "all connection attempts failed" in lowered or "connection refused" in lowered or "request failed" in lowered: - stage = "runtime_execution" - failure_type = "runtime_unreachable" - elif "returned http" in lowered or re.search(r"\bhttp\s*[45]\d\d\b", lowered): - stage = "runtime_execution" - failure_type = "runtime_http_error" - elif "traceback" in lowered: - stage = "runtime_execution" - failure_type = "runtime_traceback" - elif "no submitted code" in lowered or "missing" in lowered and "code" in lowered: - stage = "run_preparation" - failure_type = "missing_submitted_code" - elif "no interactive runtime url" in lowered or "not configured" in lowered: - stage = "runtime_resolution" - failure_type = "no_runtime_url" - - cause: dict[str, Any] = { - "stage": stage, - "type": failure_type, - "message": text, - "detail_excerpt": text, - } - if execution_url: - cause["execution_url"] = execution_url - return cause - - -def _extract_failure_cause(run: dict[str, Any]) -> dict[str, Any] | None: - """Extract a structured failure cause from a run's report/summary payload.""" - for container_key in ("report", "summary"): - container = run.get(container_key) - if isinstance(container, dict): - cause = container.get("failure_cause") - if isinstance(cause, dict) and cause: - return cause - 
# Fallback: synthesize a structured cause from legacy error fields. - summary = run.get("summary") if isinstance(run.get("summary"), dict) else {} - report = run.get("report") if isinstance(run.get("report"), dict) else {} - message = ( - summary.get("failure_reason") - or summary.get("execution_error") - or report.get("error") - ) - if isinstance(message, str) and message.strip(): - return _classify_legacy_failure(message) - return None - - -def _format_failure_cause(cause: dict[str, Any] | None) -> str: - """Render a failure cause as a concise single-line string.""" - if not isinstance(cause, dict) or not cause: - return "" - failure_type = str(cause.get("type") or "").strip() - message = str(cause.get("message") or "").strip() - parts: list[str] = [] - if failure_type: - parts.append(f"[{failure_type}]") - if message: - parts.append(message) - return " ".join(parts).strip() - - -def _failure_cause_detail_lines(cause: dict[str, Any]) -> list[str]: - """Render the full failure cause (message, context, diagnostics, attempts) as markdown lines.""" - lines: list[str] = [] - message = str(cause.get("message") or "").strip() - if message: - lines.append(f"- Message: {message}") - for key, label in ( - ("stage", "Stage"), - ("type", "Type"), - ("runtime_pod_name", "Runtime pod"), - ("environment_name", "Environment"), - ("execution_url", "Execution URL"), - ): - value = str(cause.get(key) or "").strip() - if value: - lines.append(f"- {label}: `{value}`") - - detail = str(cause.get("detail_excerpt") or "").strip() - if detail: - lines.append("- Detail excerpt:") - lines.append("") - lines.append("```text") - lines.extend(detail.splitlines() or [detail]) - lines.append("```") - - diagnostics = cause.get("diagnostics") - if isinstance(diagnostics, dict) and diagnostics: - for key, label in ( - ("agent_runtimes_url", "Agent runtimes URL"), - ("run_url", "Run URL"), - ): - value = diagnostics.get(key) - if value: - lines.append(f"- {label}: `{value}`") - for key, label in ( 
- ("route_ids", "Route IDs tried"), - ("discovered_agent_ids", "Discovered agent IDs"), - ("candidate_urls", "Candidate URLs"), - ): - value = diagnostics.get(key) - if isinstance(value, list) and value: - rendered = ", ".join(f"`{item}`" for item in value) - lines.append(f"- {label}: {rendered}") - - attempts = diagnostics.get("attempts") - if isinstance(attempts, list) and attempts: - lines.append("- Connection attempts:") - attempt_rows: list[list[str]] = [] - for attempt in attempts: - if not isinstance(attempt, dict): - continue - status_code = attempt.get("status_code") - attempt_rows.append( - [ - str(attempt.get("url") or "-"), - "ok" if attempt.get("ok") else "failed", - "-" if status_code is None else str(status_code), - str(attempt.get("error") or "-"), - ] - ) - if attempt_rows: - lines.append("") - lines.extend( - _markdown_table( - ["URL", "Result", "HTTP", "Error"], - attempt_rows, - ["left", "left", "right", "left"], - ) - ) - return lines - - -def _run_detail_record(run: dict[str, Any]) -> dict[str, Any]: - metrics = run.get("metrics") if isinstance(run.get("metrics"), dict) else {} - summary = run.get("summary") if isinstance(run.get("summary"), dict) else {} - report = run.get("report") if isinstance(run.get("report"), dict) else {} - return { - "id": str(run.get("id", "")), - "status": str(run.get("status", "")), - "created_at": str(run.get("created_at", "")), - "updated_at": str(run.get("updated_at", "")), - "pass_rate": _run_pass_rate(run), - "metrics": metrics, - "summary": summary, - "report": report, - "failure_cause": _extract_failure_cause(run), - } - - -def _report_data( - client: DatalayerClient, - evalset_id: str, - run_limit: int, - account_uid: Optional[str], -) -> dict[str, Any]: - experiments_payload = client.evals_list_experiments( - evalset_id=evalset_id, - limit=200, - offset=0, - account_uid=account_uid, - ) - experiments = experiments_payload.get("experiments") or [] - - report: dict[str, Any] = { - "evalset_id": evalset_id, - 
"generated_at": _now_iso(), - "experiments": [], - } - - for experiment in experiments: - experiment_id = str(experiment.get("id", "")) - experiment_name = str(experiment.get("name", experiment_id)) - - runs_payload = client.evals_list_runs( - experiment_id, - limit=run_limit, - offset=0, - account_uid=account_uid, - ) - runs = runs_payload.get("runs") or [] - total_runs = int(runs_payload.get("total") or len(runs)) - baseline, latest, drift = _compute_baseline_and_drift(runs) - - latest_two_delta: float | None = None - latest_two_run_ids: list[str] = [] - latest_two_compare: dict[str, Any] | None = None - if len(runs) >= 2: - latest_two_run_ids = [str(runs[0].get("id", "")), str(runs[1].get("id", ""))] - compare_payload = client.evals_compare_runs( - latest_two_run_ids, - account_uid=account_uid, - ) - compared_runs = compare_payload.get("runs") or [] - compared_by_id = { - str(run.get("id", "")): run - for run in compared_runs - if isinstance(run, dict) - } - run_a = compared_by_id.get(latest_two_run_ids[0], runs[0]) - run_b = compared_by_id.get(latest_two_run_ids[1], runs[1]) - pass_a = _run_pass_rate(run_a) - pass_b = _run_pass_rate(run_b) - if pass_a is not None and pass_b is not None: - latest_two_delta = pass_a - pass_b - latest_two_compare = { - "run_ids": latest_two_run_ids, - "run_a": _run_detail_record(run_a), - "run_b": _run_detail_record(run_b), - "delta_pass_rate": latest_two_delta, - } - - consecutive_comparisons: list[dict[str, Any]] = [] - for idx in range(max(0, len(runs) - 1)): - run_a = runs[idx] - run_b = runs[idx + 1] - pass_a = _run_pass_rate(run_a) - pass_b = _run_pass_rate(run_b) - delta = None - if pass_a is not None and pass_b is not None: - delta = pass_a - pass_b - consecutive_comparisons.append( - { - "run_a_id": str(run_a.get("id", "")), - "run_b_id": str(run_b.get("id", "")), - "run_a_status": str(run_a.get("status", "")), - "run_b_status": str(run_b.get("status", "")), - "run_a_pass_rate": pass_a, - "run_b_pass_rate": pass_b, - 
"delta_pass_rate": delta, - } - ) - - pass_rates = [ - _run_pass_rate(run) - for run in runs - if isinstance(_run_pass_rate(run), (int, float)) - ] - numeric_pass_rates = [float(value) for value in pass_rates if isinstance(value, (int, float))] - mean_pass = sum(numeric_pass_rates) / len(numeric_pass_rates) if numeric_pass_rates else None - stddev_pass = None - if numeric_pass_rates: - variance = sum((value - mean_pass) ** 2 for value in numeric_pass_rates) / len(numeric_pass_rates) - stddev_pass = math.sqrt(variance) - - report["experiments"].append( - { - "id": experiment_id, - "name": experiment_name, - "runs_total": total_runs, - "runs_fetched": len(runs), - "latest_pass_rate": latest, - "baseline_pass_rate": baseline, - "drift_delta": drift, - "latest_two_run_ids": latest_two_run_ids, - "latest_two_delta": latest_two_delta, - "latest_two_comparison": latest_two_compare, - "mean_pass_rate": mean_pass, - "stddev_pass_rate": stddev_pass, - "runs": [_run_detail_record(run) for run in runs], - "consecutive_comparisons": consecutive_comparisons, - } - ) - return report - - -def _ascii_bar( - value: float | None, - width: int = 28, - *, - full_blocks: bool = True, - colorize: bool = False, -) -> str: - if value is None: - return "-" - bounded = max(0.0, min(1.0, float(value))) - filled = int(round(bounded * width)) - fill_char = "█" if full_blocks else "#" - empty_char = "░" if full_blocks else "." 
- filled_part = fill_char * filled - empty_part = empty_char * (width - filled) - if not colorize: - return filled_part + empty_part - if bounded >= 0.85: - style = "green" - elif bounded >= 0.75: - style = "yellow" - else: - style = "red" - return _style_text(filled_part, style, True) + _style_text(empty_part, "grey39", True) - - -def _fmt_pts(value: float) -> str: - return f"{value * 100:.1f}" - - -def _ascii_histogram( - values: list[float], - *, - bins: int = 8, - width: int = 22, - min_value: float | None = None, - max_value: float | None = None, - full_blocks: bool = True, - colorize: bool = False, - drift_palette: bool = False, -) -> list[str]: - if not values: - return ["n/a"] - - lo = min_value if isinstance(min_value, (int, float)) else min(values) - hi = max_value if isinstance(max_value, (int, float)) else max(values) - if hi <= lo: - hi = lo + 1e-9 - - bins = max(2, bins) - counts = [0 for _ in range(bins)] - span = hi - lo - for value in values: - ratio = (value - lo) / span - idx = int(ratio * bins) - idx = max(0, min(bins - 1, idx)) - counts[idx] += 1 - - peak = max(counts) if counts else 1 - fill_char = "█" if full_blocks else "#" - empty_char = "░" if full_blocks else "." 
- lines: list[str] = [] - for idx, count in enumerate(counts): - left = lo + (span * idx / bins) - right = lo + (span * (idx + 1) / bins) - filled = int(round((count / peak) * width)) if peak > 0 else 0 - filled_part = fill_char * filled - empty_part = empty_char * (width - filled) - if colorize: - if drift_palette: - if right <= 0: - bar_style = "red" - elif left >= 0: - bar_style = "green" - else: - bar_style = "yellow" - elif peak > 0 and count / peak >= 0.67: - bar_style = "cyan" - elif peak > 0 and count / peak >= 0.34: - bar_style = "blue" - else: - bar_style = "magenta" - bar = _style_text(filled_part, bar_style, True) + _style_text(empty_part, "grey39", True) - else: - bar = filled_part + empty_part - lines.append( - f"{_fmt_pts(left):>6} to {_fmt_pts(right):>6} pts |{bar}| {count}" - ) - return lines - - -def _fmt_delta(value: float | None, *, colorize: bool = False) -> str: - if value is None: - return "n/a" - rendered = f"{value * 100:+.1f} pts" - if value > 0: - return _style_text(rendered, "green", colorize) - if value < 0: - return _style_text(rendered, "red", colorize) - return _style_text(rendered, "yellow", colorize) - - -def _sparkline(values: list[float], *, colorize: bool = False) -> str: - if not values: - return "n/a" - ticks = "▁▂▃▄▅▆▇█" - lo = min(values) - hi = max(values) - if hi <= lo: - base = ticks[-2] * len(values) - else: - span = hi - lo - chars = [] - for value in values: - idx = int(round(((value - lo) / span) * (len(ticks) - 1))) - idx = max(0, min(len(ticks) - 1, idx)) - chars.append(ticks[idx]) - base = "".join(chars) - if not colorize: - return base - if values[-1] >= 0.85: - style = "green" - elif values[-1] >= 0.75: - style = "yellow" - else: - style = "red" - return _style_text(base, style, True) - - -def _pairwise_latest_deltas(experiments: list[dict[str, Any]]) -> list[dict[str, Any]]: - pairs: list[dict[str, Any]] = [] - for idx, left in enumerate(experiments): - left_latest = left.get("latest_pass_rate") - if not 
isinstance(left_latest, (int, float)): - continue - for right in experiments[idx + 1 :]: - right_latest = right.get("latest_pass_rate") - if not isinstance(right_latest, (int, float)): - continue - pairs.append( - { - "left": str(left.get("name", "")), - "right": str(right.get("name", "")), - "left_latest": float(left_latest), - "right_latest": float(right_latest), - "delta": float(left_latest) - float(right_latest), - } - ) - pairs.sort(key=lambda item: abs(item["delta"]), reverse=True) - return pairs - - -def _markdown_table(headers: list[str], rows: list[list[str]], aligns: list[str]) -> list[str]: - widths = [len(header) for header in headers] - for row in rows: - for idx, cell in enumerate(row): - widths[idx] = max(widths[idx], len(cell)) - - def _pad(cell: str, width: int, align: str) -> str: - if align == "right": - return cell.rjust(width) - return cell.ljust(width) - - header_line = "| " + " | ".join(headers[idx].ljust(widths[idx]) for idx in range(len(headers))) + " |" - - sep_parts: list[str] = [] - for idx, align in enumerate(aligns): - width = max(3, widths[idx]) - if align == "right": - sep_parts.append("-" * (width - 1) + ":") - else: - sep_parts.append(":" + "-" * (width - 1)) - sep_line = "| " + " | ".join(sep_parts) + " |" - - body_lines = [ - "| " + " | ".join(_pad(row[idx], widths[idx], aligns[idx]) for idx in range(len(headers))) + " |" - for row in rows - ] - return [header_line, sep_line, *body_lines] - - -def _report_markdown(report: dict[str, Any], run_limit: int, *, colorize: bool = False) -> str: - evalset_id = str(report.get("evalset_id", "")) - generated_at = str(report.get("generated_at", "")) - experiments = [item for item in (report.get("experiments") or []) if isinstance(item, dict)] - - lines: list[str] = [] - lines.append(f"# Evals Report: {evalset_id}") - lines.append("") - lines.append(f"- Generated at: {generated_at}") - lines.append(f"- Experiments: {len(experiments)}") - lines.append(f"- Run window per experiment: 
{run_limit}") - lines.append("") - - lines.append("## Experiment Overview") - lines.append("") - overview_rows: list[list[str]] = [] - for experiment in experiments: - runs_fetched = int(experiment.get("runs_fetched") or 0) - runs_total = int(experiment.get("runs_total") or 0) - overview_rows.append( - [ - f"{experiment.get('name', '')}", - f"{runs_fetched}/{runs_total}", - _fmt_pct(experiment.get('latest_pass_rate') if isinstance(experiment.get('latest_pass_rate'), (int, float)) else None), - _fmt_pct(experiment.get('baseline_pass_rate') if isinstance(experiment.get('baseline_pass_rate'), (int, float)) else None), - _fmt_delta(experiment.get('drift_delta') if isinstance(experiment.get('drift_delta'), (int, float)) else None, colorize=colorize), - _fmt_delta(experiment.get('latest_two_delta') if isinstance(experiment.get('latest_two_delta'), (int, float)) else None, colorize=colorize), - ] - ) - lines.extend( - _markdown_table( - ["Experiment", "Runs (fetched/total)", "Latest", "Baseline", "Drift", "Latest-2 Delta"], - overview_rows, - ["left", "right", "right", "right", "right", "right"], - ) - ) - lines.append("") - - lines.append("## Comparison Combinations") - lines.append("") - - ranked_latest = sorted( - [item for item in experiments if isinstance(item.get("latest_pass_rate"), (int, float))], - key=lambda item: float(item.get("latest_pass_rate") or 0.0), - reverse=True, - ) - lines.append("### By Latest Pass Rate") - lines.append("") - latest_rows: list[list[str]] = [] - for idx, item in enumerate(ranked_latest, start=1): - latest_rows.append([str(idx), f"{item.get('name', '')}", _fmt_pct(float(item.get('latest_pass_rate') or 0.0))]) - lines.extend(_markdown_table(["Rank", "Experiment", "Latest"], latest_rows, ["right", "left", "right"])) - latest_values = [ - float(item.get("latest_pass_rate")) - for item in ranked_latest - if isinstance(item.get("latest_pass_rate"), (int, float)) - ] - lines.append("") - lines.append("Latest pass-rate histogram (pts):") - 
for hist_line in _ascii_histogram( - latest_values, - bins=8, - width=20, - min_value=0.0, - max_value=1.0, - full_blocks=True, - colorize=colorize, - ): - lines.append(f"`{hist_line}`") - lines.append("") - - ranked_drift = sorted( - [item for item in experiments if isinstance(item.get("drift_delta"), (int, float))], - key=lambda item: float(item.get("drift_delta") or 0.0), - ) - lines.append("### By Drift (Most Negative To Most Positive)") - lines.append("") - drift_rows: list[list[str]] = [] - for idx, item in enumerate(ranked_drift, start=1): - drift_rows.append([str(idx), f"{item.get('name', '')}", _fmt_delta(float(item.get('drift_delta') or 0.0), colorize=colorize)]) - lines.extend(_markdown_table(["Rank", "Experiment", "Drift"], drift_rows, ["right", "left", "right"])) - drift_values = [ - float(item.get("drift_delta")) - for item in ranked_drift - if isinstance(item.get("drift_delta"), (int, float)) - ] - lines.append("") - lines.append("Drift histogram (delta pts):") - for hist_line in _ascii_histogram( - drift_values, - bins=8, - width=20, - full_blocks=True, - colorize=colorize, - drift_palette=True, - ): - lines.append(f"`{hist_line}`") - lines.append("") - - ranked_stability = sorted( - [item for item in experiments if isinstance(item.get("stddev_pass_rate"), (int, float))], - key=lambda item: float(item.get("stddev_pass_rate") or 0.0), - ) - lines.append("### By Stability (Lowest Pass-Rate StdDev)") - lines.append("") - stability_rows: list[list[str]] = [] - for idx, item in enumerate(ranked_stability, start=1): - stddev = item.get("stddev_pass_rate") - mean = item.get("mean_pass_rate") - stability_rows.append( - [ - str(idx), - f"{item.get('name', '')}", - (f"{float(stddev) * 100:.2f} pts" if isinstance(stddev, (int, float)) else "n/a"), - (_fmt_pct(float(mean)) if isinstance(mean, (int, float)) else "n/a"), - ] - ) - lines.extend(_markdown_table(["Rank", "Experiment", "StdDev", "Mean"], stability_rows, ["right", "left", "right", "right"])) - 
lines.append("") - - pairwise = _pairwise_latest_deltas(experiments) - lines.append("### Pairwise Latest-Pass Deltas") - lines.append("") - pair_rows: list[list[str]] = [] - for pair in pairwise: - pair_rows.append( - [ - f"{pair['left']} vs {pair['right']}", - _fmt_pct(pair['left_latest']), - _fmt_pct(pair['right_latest']), - _fmt_delta(pair['delta'], colorize=colorize), - ] - ) - if not pairwise: - pair_rows.append(["n/a", "n/a", "n/a", "n/a"]) - lines.extend( - _markdown_table( - ["Pair", "Left Latest", "Right Latest", "Delta (Left-Right)"], - pair_rows, - ["left", "right", "right", "right"], - ) - ) - pair_deltas = [float(pair["delta"]) for pair in pairwise if isinstance(pair.get("delta"), (int, float))] - lines.append("") - lines.append("Pairwise latest-delta histogram (pts):") - for hist_line in _ascii_histogram( - pair_deltas, - bins=8, - width=20, - full_blocks=True, - colorize=colorize, - drift_palette=True, - ): - lines.append(f"`{hist_line}`") - lines.append("") - - lines.append("### Insight Highlights") - lines.append("") - best_latest = ranked_latest[0] if ranked_latest else None - worst_latest = ranked_latest[-1] if ranked_latest else None - most_negative = ranked_drift[0] if ranked_drift else None - most_positive = ranked_drift[-1] if ranked_drift else None - most_stable = ranked_stability[0] if ranked_stability else None - if best_latest: - lines.append( - "- Top latest pass-rate: " - + f"{best_latest.get('name', '')} ({_fmt_pct(float(best_latest.get('latest_pass_rate') or 0.0))})." - ) - if worst_latest: - lines.append( - "- Lowest latest pass-rate: " - + f"{worst_latest.get('name', '')} ({_fmt_pct(float(worst_latest.get('latest_pass_rate') or 0.0))})." - ) - if most_positive: - drift_pos = float(most_positive.get("drift_delta") or 0.0) - lines.append( - "- Strongest positive drift: " - + f"{most_positive.get('name', '')} ({_fmt_delta(drift_pos, colorize=colorize)})." 
- ) - if most_negative: - drift_neg = float(most_negative.get("drift_delta") or 0.0) - lines.append( - "- Strongest negative drift: " - + f"{most_negative.get('name', '')} ({_fmt_delta(drift_neg, colorize=colorize)})." - ) - if most_stable: - std = most_stable.get("stddev_pass_rate") - mean = most_stable.get("mean_pass_rate") - lines.append( - "- Stability leader: " - + f"{most_stable.get('name', '')} " - + f"(stddev={(float(std) * 100):.2f} pts, mean={_fmt_pct(float(mean)) if isinstance(mean, (int, float)) else 'n/a'})." - ) - - drift_neg_count = len([value for value in drift_values if value < 0]) - drift_flat_count = len([value for value in drift_values if value == 0]) - drift_pos_count = len([value for value in drift_values if value > 0]) - total = max(1, drift_neg_count + drift_flat_count + drift_pos_count) - neg_meter = "█" * int(round((drift_neg_count / total) * 14)) - flat_meter = "█" * int(round((drift_flat_count / total) * 14)) - pos_meter = "█" * int(round((drift_pos_count / total) * 14)) - neg_meter = neg_meter or "·" - flat_meter = flat_meter or "·" - pos_meter = pos_meter or "·" - lines.append("") - lines.append("Drift balance meter:") - lines.append( - "`NEG " - + _style_text(neg_meter, "red", colorize) - + f" ({drift_neg_count}) | FLAT " - + _style_text(flat_meter, "yellow", colorize) - + f" ({drift_flat_count}) | POS " - + _style_text(pos_meter, "green", colorize) - + f" ({drift_pos_count})`" - ) - lines.append("") - - lines.append("## Per-Experiment Details") - lines.append("") - for experiment in experiments: - lines.append(f"### {experiment.get('name', '')}") - lines.append("") - lines.append("#### Run Timeline") - lines.append("") - run_rows: list[list[str]] = [] - runs = [run for run in (experiment.get("runs") or []) if isinstance(run, dict)] - for idx, run in enumerate(runs, start=1): - pass_rate = run.get("pass_rate") if isinstance(run.get("pass_rate"), (int, float)) else None - cause_text = _format_failure_cause(run.get("failure_cause")) - 
run_rows.append( - [ - str(idx), - str(run.get('id', '')), - str(run.get('status', '')), - _fmt_pct(float(pass_rate)) if isinstance(pass_rate, (int, float)) else 'n/a', - f"`{_ascii_bar(float(pass_rate), full_blocks=True, colorize=colorize) if isinstance(pass_rate, (int, float)) else '-'}`", - cause_text or "-", - ] - ) - if not runs: - run_rows.append(["1", "n/a", "n/a", "n/a", "`-`", "-"]) - lines.extend(_markdown_table(["#", "Run ID", "Status", "Pass Rate", "ASCII Trend", "Failure Cause"], run_rows, ["right", "left", "left", "right", "left", "left"])) - lines.append("") - failure_rows: list[list[str]] = [] - for idx, run in enumerate(runs, start=1): - cause = run.get("failure_cause") - if not isinstance(cause, dict) or not cause: - continue - detail = str(cause.get("detail_excerpt") or "").strip() - detail_single = " ".join(detail.split()) - if len(detail_single) > 240: - detail_single = detail_single[:237] + "..." - failure_rows.append( - [ - str(idx), - str(run.get("id", "")), - str(cause.get("stage") or "-"), - str(cause.get("type") or "-"), - str(cause.get("message") or "-"), - detail_single or "-", - ] - ) - if failure_rows: - lines.append("#### Failure Causes") - lines.append("") - lines.extend( - _markdown_table( - ["#", "Run ID", "Stage", "Type", "Message", "Detail Excerpt"], - failure_rows, - ["right", "left", "left", "left", "left", "left"], - ) - ) - lines.append("") - for idx, run in enumerate(runs, start=1): - cause = run.get("failure_cause") - if not isinstance(cause, dict) or not cause: - continue - detail_lines = _failure_cause_detail_lines(cause) - if not detail_lines: - continue - lines.append(f"

Run {idx} failure detail ({run.get('id', '')})") - lines.append("") - lines.extend(detail_lines) - lines.append("") - lines.append("
") - lines.append("") - timeline_values = [ - float(run.get("pass_rate")) - for run in runs - if isinstance(run.get("pass_rate"), (int, float)) - ] - lines.append( - "Pass-rate sparkline: " - + f"`{_sparkline(timeline_values, colorize=colorize) if timeline_values else 'n/a'}`" - ) - lines.append("") - - comparisons = [ - item for item in (experiment.get("consecutive_comparisons") or []) - if isinstance(item, dict) - ] - lines.append("#### Consecutive Run Deltas (A-B)") - lines.append("") - comparison_rows: list[list[str]] = [] - for item in comparisons: - run_a = item.get("run_a_pass_rate") if isinstance(item.get("run_a_pass_rate"), (int, float)) else None - run_b = item.get("run_b_pass_rate") if isinstance(item.get("run_b_pass_rate"), (int, float)) else None - delta = item.get("delta_pass_rate") if isinstance(item.get("delta_pass_rate"), (int, float)) else None - comparison_rows.append( - [ - str(item.get('run_a_id', '')), - str(item.get('run_b_id', '')), - _fmt_pct(float(run_a)) if isinstance(run_a, (int, float)) else 'n/a', - _fmt_pct(float(run_b)) if isinstance(run_b, (int, float)) else 'n/a', - _fmt_delta(float(delta), colorize=colorize) if isinstance(delta, (int, float)) else 'n/a', - ] - ) - if not comparisons: - comparison_rows.append(["n/a", "n/a", "n/a", "n/a", "n/a"]) - lines.extend(_markdown_table(["Run A", "Run B", "A Pass", "B Pass", "Delta"], comparison_rows, ["left", "left", "right", "right", "right"])) - lines.append("") - - lines.append("## Notes") - lines.append("") - lines.append("- Drift is computed as latest - baseline.") - lines.append("- Baseline uses the first half of fetched runs (minimum 1, maximum 3).") - lines.append("- Latest-2 delta uses the latest two runs returned in the fetched window.") - lines.append("") - - return "\n".join(lines) - - -def _write_report_csv(report: dict[str, Any], output_path: Path) -> None: - experiments = [item for item in (report.get("experiments") or []) if isinstance(item, dict)] - fieldnames = [ - 
"row_type", - "evalset_id", - "experiment_id", - "experiment_name", - "run_index", - "run_id", - "run_status", - "run_pass_rate", - "runs_fetched", - "runs_total", - "baseline_pass_rate", - "latest_pass_rate", - "drift_delta", - "latest_two_delta", - "mean_pass_rate", - "stddev_pass_rate", - "failure_stage", - "failure_type", - "failure_message", - "generated_at", - ] - output_path.parent.mkdir(parents=True, exist_ok=True) - with output_path.open("w", encoding="utf-8", newline="") as stream: - writer = csv.DictWriter(stream, fieldnames=fieldnames) - writer.writeheader() - for experiment in experiments: - writer.writerow( - { - "row_type": "experiment", - "evalset_id": str(report.get("evalset_id", "")), - "experiment_id": str(experiment.get("id", "")), - "experiment_name": str(experiment.get("name", "")), - "run_index": "", - "run_id": "", - "run_status": "", - "run_pass_rate": "", - "runs_fetched": int(experiment.get("runs_fetched") or 0), - "runs_total": int(experiment.get("runs_total") or 0), - "baseline_pass_rate": experiment.get("baseline_pass_rate"), - "latest_pass_rate": experiment.get("latest_pass_rate"), - "drift_delta": experiment.get("drift_delta"), - "latest_two_delta": experiment.get("latest_two_delta"), - "mean_pass_rate": experiment.get("mean_pass_rate"), - "stddev_pass_rate": experiment.get("stddev_pass_rate"), - "failure_stage": "", - "failure_type": "", - "failure_message": "", - "generated_at": str(report.get("generated_at", "")), - } - ) - runs = [run for run in (experiment.get("runs") or []) if isinstance(run, dict)] - for idx, run in enumerate(runs, start=1): - cause = run.get("failure_cause") if isinstance(run.get("failure_cause"), dict) else {} - writer.writerow( - { - "row_type": "run", - "evalset_id": str(report.get("evalset_id", "")), - "experiment_id": str(experiment.get("id", "")), - "experiment_name": str(experiment.get("name", "")), - "run_index": idx, - "run_id": str(run.get("id", "")), - "run_status": str(run.get("status", "")), - 
"run_pass_rate": run.get("pass_rate"), - "runs_fetched": int(experiment.get("runs_fetched") or 0), - "runs_total": int(experiment.get("runs_total") or 0), - "baseline_pass_rate": experiment.get("baseline_pass_rate"), - "latest_pass_rate": experiment.get("latest_pass_rate"), - "drift_delta": experiment.get("drift_delta"), - "latest_two_delta": experiment.get("latest_two_delta"), - "mean_pass_rate": experiment.get("mean_pass_rate"), - "stddev_pass_rate": experiment.get("stddev_pass_rate"), - "failure_stage": str(cause.get("stage", "")), - "failure_type": str(cause.get("type", "")), - "failure_message": str(cause.get("message", "")), - "generated_at": str(report.get("generated_at", "")), - } - ) - - -def _print_report_console(report: dict[str, Any], run_limit: int) -> None: - evalset_id = str(report.get("evalset_id", "")) - generated_at = str(report.get("generated_at", "")) - experiments = [item for item in (report.get("experiments") or []) if isinstance(item, dict)] - - console.rule(f"[bold cyan]Evals Report[/bold cyan] {evalset_id}") - console.print(f"Generated at: {generated_at}") - console.print(f"Experiments: {len(experiments)} | Run window per experiment: {run_limit}") - console.print("") - - overview = Table(title="Experiment Overview") - overview.add_column("Experiment", style="white") - overview.add_column("Runs", justify="right") - overview.add_column("Latest", justify="right") - overview.add_column("Baseline", justify="right") - overview.add_column("Drift", justify="right") - overview.add_column("Latest-2", justify="right") - for experiment in experiments: - overview.add_row( - str(experiment.get("name", "")), - f"{int(experiment.get('runs_fetched') or 0)}/{int(experiment.get('runs_total') or 0)}", - _fmt_pct(experiment.get("latest_pass_rate") if isinstance(experiment.get("latest_pass_rate"), (int, float)) else None), - _fmt_pct(experiment.get("baseline_pass_rate") if isinstance(experiment.get("baseline_pass_rate"), (int, float)) else None), - 
_fmt_delta(experiment.get("drift_delta") if isinstance(experiment.get("drift_delta"), (int, float)) else None, colorize=True), - _fmt_delta(experiment.get("latest_two_delta") if isinstance(experiment.get("latest_two_delta"), (int, float)) else None, colorize=True), - ) - console.print(overview) - - ranked_latest = sorted( - [item for item in experiments if isinstance(item.get("latest_pass_rate"), (int, float))], - key=lambda item: float(item.get("latest_pass_rate") or 0.0), - reverse=True, - ) - latest_table = Table(title="By Latest Pass Rate") - latest_table.add_column("Rank", justify="right", no_wrap=True) - latest_table.add_column("Experiment", style="white") - latest_table.add_column("Latest", justify="right", no_wrap=True) - for idx, item in enumerate(ranked_latest, start=1): - latest_table.add_row(str(idx), str(item.get("name", "")), _fmt_pct(float(item.get("latest_pass_rate") or 0.0))) - console.print(latest_table) - latest_values = [ - float(item.get("latest_pass_rate")) - for item in ranked_latest - if isinstance(item.get("latest_pass_rate"), (int, float)) - ] - console.print("Latest histogram:") - for hist_line in _ascii_histogram( - latest_values, - bins=8, - width=20, - min_value=0.0, - max_value=1.0, - full_blocks=True, - colorize=True, - ): - console.print(hist_line) - - ranked_drift = sorted( - [item for item in experiments if isinstance(item.get("drift_delta"), (int, float))], - key=lambda item: float(item.get("drift_delta") or 0.0), - ) - drift_table = Table(title="By Drift (Negative To Positive)") - drift_table.add_column("Rank", justify="right", no_wrap=True) - drift_table.add_column("Experiment", style="white") - drift_table.add_column("Drift", justify="right", no_wrap=True) - for idx, item in enumerate(ranked_drift, start=1): - drift_table.add_row( - str(idx), - str(item.get("name", "")), - _fmt_delta(float(item.get("drift_delta") or 0.0), colorize=True), - ) - console.print(drift_table) - drift_values = [ - float(item.get("drift_delta")) - for 
item in ranked_drift - if isinstance(item.get("drift_delta"), (int, float)) - ] - console.print("Drift histogram:") - for hist_line in _ascii_histogram( - drift_values, - bins=8, - width=20, - full_blocks=True, - colorize=True, - drift_palette=True, - ): - console.print(hist_line) - - pairwise = _pairwise_latest_deltas(experiments) - pairwise_table = Table(title="Pairwise Latest-Pass Deltas") - pairwise_table.add_column("Pair", style="white") - pairwise_table.add_column("Left", justify="right", no_wrap=True) - pairwise_table.add_column("Right", justify="right", no_wrap=True) - pairwise_table.add_column("Delta", justify="right", no_wrap=True) - for pair in pairwise: - pairwise_table.add_row( - f"{pair['left']} vs {pair['right']}", - _fmt_pct(pair["left_latest"]), - _fmt_pct(pair["right_latest"]), - _fmt_delta(pair["delta"], colorize=True), - ) - if not pairwise: - pairwise_table.add_row("n/a", "n/a", "n/a", "n/a") - console.print(pairwise_table) - - if ranked_latest: - console.print( - "[bold]Insight:[/bold] top latest " - f"[green]{ranked_latest[0].get('name', '')}[/green] " - f"({_fmt_pct(float(ranked_latest[0].get('latest_pass_rate') or 0.0))})" - ) - if ranked_drift: - console.print( - "[bold]Insight:[/bold] strongest drift " - f"{ranked_drift[-1].get('name', '')} " - f"({_fmt_delta(float(ranked_drift[-1].get('drift_delta') or 0.0), colorize=True)})" - ) - console.print("") - - for experiment in experiments: - console.print("") - console.print(f"[bold]Run Timeline:[/bold] {experiment.get('name', '')}") - run_table = Table() - run_table.add_column("#", justify="right", style="cyan", no_wrap=True) - run_table.add_column("Run ID", style="white", no_wrap=True) - run_table.add_column("Status", no_wrap=True) - run_table.add_column("Pass Rate", justify="right", no_wrap=True) - run_table.add_column("Trend", style="white", no_wrap=True) - run_table.add_column("Failure Cause", style="red", overflow="fold") - - runs = [run for run in (experiment.get("runs") or []) if 
isinstance(run, dict)] - for idx, run in enumerate(runs, start=1): - status_value = str(run.get("status", "")) - pass_rate = float(run.get("pass_rate")) if isinstance(run.get("pass_rate"), (int, float)) else None - cause_text = _format_failure_cause(run.get("failure_cause")) - run_table.add_row( - str(idx), - str(run.get("id", "")), - f"[{_status_style(status_value)}]{status_value}[/{_status_style(status_value)}]", - _fmt_pct(pass_rate), - _ascii_bar(pass_rate, width=28, full_blocks=True, colorize=True) if pass_rate is not None else "-", - cause_text or "-", - ) - if not runs: - run_table.add_row("1", "n/a", "n/a", "n/a", "-", "-") - console.print(run_table) - - for idx, run in enumerate(runs, start=1): - cause = run.get("failure_cause") - if not isinstance(cause, dict) or not cause: - continue - console.print( - f"[red bold]Run {idx} failure:[/red bold] " - f"[red]{str(cause.get('message') or 'Unknown failure.')}[/red]" - ) - for key, label in ( - ("stage", "stage"), - ("type", "type"), - ("execution_url", "execution url"), - ): - value = str(cause.get(key) or "").strip() - if value: - console.print(f" {label}: {value}") - diagnostics = cause.get("diagnostics") - if isinstance(diagnostics, dict): - for key, label in ( - ("agent_runtimes_url", "agent runtimes url"), - ("run_url", "run url"), - ): - value = diagnostics.get(key) - if value: - console.print(f" {label}: {value}") - candidate_urls = diagnostics.get("candidate_urls") - if isinstance(candidate_urls, list) and candidate_urls: - console.print(f" candidate urls: {', '.join(str(u) for u in candidate_urls)}") - attempts = diagnostics.get("attempts") - if isinstance(attempts, list) and attempts: - for attempt in attempts: - if not isinstance(attempt, dict): - continue - outcome = "ok" if attempt.get("ok") else "failed" - console.print( - f" attempt: {attempt.get('url', '')} -> {outcome} " - f"{attempt.get('error') or ''}".rstrip() - ) - detail = str(cause.get("detail_excerpt") or "").strip() - if detail: - 
console.print(f" detail: {detail}") - - deltas_table = Table(title="Consecutive Run Deltas") - deltas_table.add_column("Run A", style="white", no_wrap=True) - deltas_table.add_column("Run B", style="white", no_wrap=True) - deltas_table.add_column("A Pass", justify="right", no_wrap=True) - deltas_table.add_column("B Pass", justify="right", no_wrap=True) - deltas_table.add_column("Delta", justify="right", no_wrap=True) - comparisons = [ - item for item in (experiment.get("consecutive_comparisons") or []) - if isinstance(item, dict) - ] - for item in comparisons: - run_a = item.get("run_a_pass_rate") if isinstance(item.get("run_a_pass_rate"), (int, float)) else None - run_b = item.get("run_b_pass_rate") if isinstance(item.get("run_b_pass_rate"), (int, float)) else None - delta = item.get("delta_pass_rate") if isinstance(item.get("delta_pass_rate"), (int, float)) else None - deltas_table.add_row( - str(item.get("run_a_id", "")), - str(item.get("run_b_id", "")), - _fmt_pct(float(run_a)) if isinstance(run_a, (int, float)) else "n/a", - _fmt_pct(float(run_b)) if isinstance(run_b, (int, float)) else "n/a", - _fmt_delta(float(delta), colorize=True) if isinstance(delta, (int, float)) else "n/a", - ) - if not comparisons: - deltas_table.add_row("n/a", "n/a", "n/a", "n/a", "n/a") - console.print(deltas_table) - - @app.callback() def evals_callback(ctx: typer.Context) -> None: """Evals command group.""" @@ -1237,8 +73,9 @@ def evals_callback(ctx: typer.Context) -> None: @app.command(name="ls") def evals_ls( token: Optional[str] = typer.Option(None, "--token", help="API token."), - ai_agents_url: Optional[str] = typer.Option(None, "--ai-agents-url", help="AI Agents base URL."), - account_uid: Optional[str] = typer.Option(None, "--account-uid", help="Organization/account UID context."), + api_key: Optional[str] = typer.Option(None, "--api-key", help="Authentication API key (alias for --token)."), + billable_account_uid: Optional[str] = typer.Option(None, "--billable-account-uid", 
help="Billable account UID context (organization/team/user)."), + account_uid: Optional[str] = typer.Option(None, "--account-uid", help="Deprecated alias for --billable-account-uid."), run_environment: Optional[str] = typer.Option(None, "--run-environment", help="Filter by run environment (ui/sdk)."), kind: Optional[str] = typer.Option(None, "--kind", help="Filter by kind (batch/interactive)."), q: Optional[str] = typer.Option(None, "--q", help="Search query."), @@ -1247,14 +84,15 @@ def evals_ls( raw: bool = typer.Option(False, "--raw", help="Print raw JSON output."), ) -> None: """List all evalsets and their experiments.""" - client = _make_client(token=token, ai_agents_url=ai_agents_url) + resolved_account_uid = _resolve_billable_account_uid(billable_account_uid, account_uid) + client = _make_client(token=token, api_key=api_key) evalsets_payload = client.evals_list_evals( run_environment=run_environment, kind=kind, q=q, limit=limit, offset=offset, - account_uid=account_uid, + account_uid=resolved_account_uid, ) evalsets = [item for item in (evalsets_payload.get("evalsets") or []) if isinstance(item, dict)] @@ -1267,7 +105,7 @@ def evals_ls( evalset_id=evalset_id, limit=200, offset=0, - account_uid=account_uid, + account_uid=resolved_account_uid, ) experiments_by_evalset[evalset_id] = [ item @@ -1316,8 +154,9 @@ def evals_delete_top( evalset_id: str = typer.Argument(..., help="Evalset UID to delete."), yes: bool = typer.Option(False, "--yes", "-y", help="Skip the confirmation prompt."), token: Optional[str] = typer.Option(None, "--token", help="API token."), - ai_agents_url: Optional[str] = typer.Option(None, "--ai-agents-url", help="AI Agents base URL."), - account_uid: Optional[str] = typer.Option(None, "--account-uid", help="Organization/account UID context."), + api_key: Optional[str] = typer.Option(None, "--api-key", help="Authentication API key (alias for --token)."), + billable_account_uid: Optional[str] = typer.Option(None, "--billable-account-uid", 
help="Billable account UID context (organization/team/user)."), + account_uid: Optional[str] = typer.Option(None, "--account-uid", help="Deprecated alias for --billable-account-uid."), ) -> None: """Delete an evalset and its associated experiments, runs, and cases.""" if not yes: @@ -1325,8 +164,9 @@ def evals_delete_top( f"Delete evalset {evalset_id} and all associated experiments, runs, and cases?", abort=True, ) - client = _make_client(token=token, ai_agents_url=ai_agents_url) - payload = client.evals_delete_eval(evalset_id, account_uid=account_uid) + resolved_account_uid = _resolve_billable_account_uid(billable_account_uid, account_uid) + client = _make_client(token=token, api_key=api_key) + payload = client.evals_delete_eval(evalset_id, account_uid=resolved_account_uid) cascade = payload.get("cascade") or {} console.print( f"[green]Eval deleted:[/green] {evalset_id} " @@ -1339,8 +179,9 @@ def evals_delete_top( @evals_app.command(name="ls") def evals_list( token: Optional[str] = typer.Option(None, "--token", help="API token."), - ai_agents_url: Optional[str] = typer.Option(None, "--ai-agents-url", help="AI Agents base URL."), - account_uid: Optional[str] = typer.Option(None, "--account-uid", help="Organization/account UID context."), + api_key: Optional[str] = typer.Option(None, "--api-key", help="Authentication API key (alias for --token)."), + billable_account_uid: Optional[str] = typer.Option(None, "--billable-account-uid", help="Billable account UID context (organization/team/user)."), + account_uid: Optional[str] = typer.Option(None, "--account-uid", help="Deprecated alias for --billable-account-uid."), run_environment: Optional[str] = typer.Option(None, "--run-environment", help="Filter by run environment (ui/sdk)."), kind: Optional[str] = typer.Option(None, "--kind", help="Filter by kind (batch/interactive)."), q: Optional[str] = typer.Option(None, "--q", help="Search query."), @@ -1349,14 +190,15 @@ def evals_list( raw: bool = typer.Option(False, 
"--raw", help="Print raw JSON output."), ) -> None: """List evalsets.""" - client = _make_client(token=token, ai_agents_url=ai_agents_url) + resolved_account_uid = _resolve_billable_account_uid(billable_account_uid, account_uid) + client = _make_client(token=token, api_key=api_key) payload = client.evals_list_evals( run_environment=run_environment, kind=kind, q=q, limit=limit, offset=offset, - account_uid=account_uid, + account_uid=resolved_account_uid, ) if raw: console.print(payload) @@ -1392,10 +234,26 @@ def evals_create( schema_json: Optional[str] = typer.Option(None, "--schema-json", help="Schema JSON object."), metadata_json: Optional[str] = typer.Option(None, "--metadata-json", help="Metadata JSON object."), cases_file: Optional[str] = typer.Option(None, "--cases-file", help="Path to JSON array of cases."), + evalset_evaluator_json: list[str] = typer.Option( + [], + "--evalset-evaluator-json", + help="Repeatable JSON object applied as an evalset-level evaluator for the evalset.", + ), + report_evaluator_json: list[str] = typer.Option( + [], + "--report-evaluator-json", + help="Repeatable JSON object applied as a report-level evaluator for the evalset.", + ), + case_evaluator_json: list[str] = typer.Option( + [], + "--case-evaluator-json", + help="Repeatable JSON object applied as a case evaluator to every case in the payload.", + ), tags: list[str] = typer.Option([], "--tag", help="Repeatable tag."), token: Optional[str] = typer.Option(None, "--token", help="API token."), - ai_agents_url: Optional[str] = typer.Option(None, "--ai-agents-url", help="AI Agents base URL."), - account_uid: Optional[str] = typer.Option(None, "--account-uid", help="Organization/account UID context."), + api_key: Optional[str] = typer.Option(None, "--api-key", help="Authentication API key (alias for --token)."), + billable_account_uid: Optional[str] = typer.Option(None, "--billable-account-uid", help="Billable account UID context (organization/team/user)."), + account_uid: 
Optional[str] = typer.Option(None, "--account-uid", help="Deprecated alias for --billable-account-uid."), raw: bool = typer.Option(False, "--raw", help="Print raw JSON output."), ) -> None: """Create an evalset.""" @@ -1419,6 +277,33 @@ def evals_create( raise typer.BadParameter("--cases-file must contain a JSON array") cases = [case for case in decoded if isinstance(case, dict)] + evalset_evaluators = [ + item for item in (spec.get("evalset_evaluators") or []) if isinstance(item, dict) + ] + report_evaluators = [ + item for item in (spec.get("report_evaluators") or []) if isinstance(item, dict) + ] + evalset_evaluators.extend( + _parse_evaluator_specs(evalset_evaluator_json, "--evalset-evaluator-json") + ) + report_evaluators.extend( + _parse_evaluator_specs(report_evaluator_json, "--report-evaluator-json") + ) + + default_case_evaluators = _parse_evaluator_specs( + case_evaluator_json, + "--case-evaluator-json", + ) + if default_case_evaluators: + for case in cases: + existing = case.get("evaluators") + if isinstance(existing, list): + case["evaluators"] = [ + item for item in existing if isinstance(item, dict) + ] + default_case_evaluators + else: + case["evaluators"] = list(default_case_evaluators) + resolved_name = str(name or spec.get("name") or "").strip() if not resolved_name: raise typer.BadParameter("name argument is required unless provided in --spec-file") @@ -1429,17 +314,20 @@ def evals_create( spec_tags = spec.get("tags") if isinstance(spec.get("tags"), list) else [] resolved_tags = tags if tags else [str(tag) for tag in spec_tags if str(tag).strip()] - client = _make_client(token=token, ai_agents_url=ai_agents_url) + resolved_account_uid = _resolve_billable_account_uid(billable_account_uid, account_uid) + client = _make_client(token=token, api_key=api_key) payload = client.evals_create_eval( name=resolved_name, description=resolved_description, run_environment=resolved_run_environment, kind=resolved_kind, schema=schema, + 
evalset_evaluators=evalset_evaluators, + report_evaluators=report_evaluators, metadata=metadata, tags=resolved_tags, cases=cases, - account_uid=account_uid, + account_uid=resolved_account_uid, ) if raw: typer.echo(json.dumps(payload)) @@ -1452,12 +340,14 @@ def evals_create( def evals_delete( evalset_id: str = typer.Argument(..., help="Evalset ID."), token: Optional[str] = typer.Option(None, "--token", help="API token."), - ai_agents_url: Optional[str] = typer.Option(None, "--ai-agents-url", help="AI Agents base URL."), - account_uid: Optional[str] = typer.Option(None, "--account-uid", help="Organization/account UID context."), + api_key: Optional[str] = typer.Option(None, "--api-key", help="Authentication API key (alias for --token)."), + billable_account_uid: Optional[str] = typer.Option(None, "--billable-account-uid", help="Billable account UID context (organization/team/user)."), + account_uid: Optional[str] = typer.Option(None, "--account-uid", help="Deprecated alias for --billable-account-uid."), ) -> None: """Delete an evalset (cascade delete runs/experiments).""" - client = _make_client(token=token, ai_agents_url=ai_agents_url) - payload = client.evals_delete_eval(evalset_id, account_uid=account_uid) + resolved_account_uid = _resolve_billable_account_uid(billable_account_uid, account_uid) + client = _make_client(token=token, api_key=api_key) + payload = client.evals_delete_eval(evalset_id, account_uid=resolved_account_uid) cascade = payload.get("cascade") or {} console.print( "[green]Eval deleted.[/green] " @@ -1471,20 +361,22 @@ def _render_report( evalset_id: Optional[str], run_limit: int = typer.Option(50, "--run-limit", min=2, max=200, help="Runs fetched per experiment."), token: Optional[str] = typer.Option(None, "--token", help="API token."), - ai_agents_url: Optional[str] = typer.Option(None, "--ai-agents-url", help="AI Agents base URL."), - account_uid: Optional[str] = typer.Option(None, "--account-uid", help="Organization/account UID context."), + 
api_key: Optional[str] = typer.Option(None, "--api-key", help="Authentication API key (alias for --token)."), + billable_account_uid: Optional[str] = typer.Option(None, "--billable-account-uid", help="Billable account UID context (organization/team/user)."), + account_uid: Optional[str] = typer.Option(None, "--account-uid", help="Deprecated alias for --billable-account-uid."), output_file: Optional[str] = typer.Option(None, "--output", help="Write markdown report to file."), export: bool = typer.Option(False, "--export", help="Export timestamped report files report-.md and report-.csv."), raw: bool = typer.Option(False, "--raw", help="Print raw JSON report output."), ) -> None: """Generate a full evalset report with cross-experiment comparisons.""" - client = _make_client(token=token, ai_agents_url=ai_agents_url) + resolved_account_uid = _resolve_billable_account_uid(billable_account_uid, account_uid) + client = _make_client(token=token, api_key=api_key) resolved_evalset_id = (evalset_id or "").strip() if not resolved_evalset_id: payload = client.evals_list_evals( limit=200, offset=0, - account_uid=account_uid, + account_uid=resolved_account_uid, ) evalsets = [item for item in (payload.get("evalsets") or []) if isinstance(item, dict)] if not evalsets: @@ -1506,7 +398,7 @@ def _updated_key(item: dict[str, Any]) -> str: client=client, evalset_id=resolved_evalset_id, run_limit=run_limit, - account_uid=account_uid, + account_uid=resolved_account_uid, ) experiments = report.get("experiments") or [] if not experiments: @@ -1538,8 +430,9 @@ def evals_report( evalset_id: Optional[str] = typer.Argument(None, help="Evalset ID to report. 
Defaults to latest updated evalset."), run_limit: int = typer.Option(50, "--run-limit", min=2, max=200, help="Runs fetched per experiment."), token: Optional[str] = typer.Option(None, "--token", help="API token."), - ai_agents_url: Optional[str] = typer.Option(None, "--ai-agents-url", help="AI Agents base URL."), - account_uid: Optional[str] = typer.Option(None, "--account-uid", help="Organization/account UID context."), + api_key: Optional[str] = typer.Option(None, "--api-key", help="Authentication API key (alias for --token)."), + billable_account_uid: Optional[str] = typer.Option(None, "--billable-account-uid", help="Billable account UID context (organization/team/user)."), + account_uid: Optional[str] = typer.Option(None, "--account-uid", help="Deprecated alias for --billable-account-uid."), output_file: Optional[str] = typer.Option(None, "--output", help="Write markdown report to file."), export: bool = typer.Option(False, "--export", help="Export timestamped report files report-.md and report-.csv."), raw: bool = typer.Option(False, "--raw", help="Print raw JSON report output."), @@ -1549,7 +442,8 @@ def evals_report( evalset_id=evalset_id, run_limit=run_limit, token=token, - ai_agents_url=ai_agents_url, + api_key=api_key, + billable_account_uid=billable_account_uid, account_uid=account_uid, output_file=output_file, export=export, @@ -1557,29 +451,79 @@ def evals_report( ) -@evals_app.command(name="compare-report") -def evals_compare_report_compat( - evalset_id: Optional[str] = typer.Argument(None, help="Evalset ID to report. 
Defaults to latest updated evalset."), - run_limit: int = typer.Option(50, "--run-limit", min=2, max=200, help="Runs fetched per experiment."), - token: Optional[str] = typer.Option(None, "--token", help="API token."), - ai_agents_url: Optional[str] = typer.Option(None, "--ai-agents-url", help="AI Agents base URL."), - account_uid: Optional[str] = typer.Option(None, "--account-uid", help="Organization/account UID context."), - output_file: Optional[str] = typer.Option(None, "--output", help="Write markdown report to file."), - export: bool = typer.Option(False, "--export", help="Export timestamped report files report-.md and report-.csv."), - raw: bool = typer.Option(False, "--raw", help="Print raw JSON report output."), +@app.command(name="evaluate") +def evals_evaluate( + evalset_spec: str = typer.Argument(..., help="Path to an evalset spec JSON file (with cases and evaluators)."), + outputs_file: str = typer.Option(..., "--outputs", help="JSON file of agent outputs aligned with the evalset cases (list of strings or {text} objects, or {\"outputs\": [...]})."), + statuses_file: Optional[str] = typer.Option(None, "--statuses", help="Optional JSON file of per-case run statuses aligned with cases."), + output_file: Optional[str] = typer.Option(None, "--output", help="Write the computed metrics JSON to this file."), + raw: bool = typer.Option(False, "--raw", help="Print the full metrics JSON."), ) -> None: - """Compatibility alias for report. Prefer: datalayer evals report .""" - console.print("[yellow]Deprecated:[/yellow] use [bold]datalayer evals report [/bold].") - _render_report( - evalset_id=evalset_id, - run_limit=run_limit, - token=token, - ai_agents_url=ai_agents_url, - account_uid=account_uid, - output_file=output_file, - export=export, - raw=raw, - ) + """Run per-case and global evaluators over real agent outputs. 
+ + Grades the provided outputs against an evalset spec using the shared evals + API (``datalayer_core.evals.evaluate_evalset``) and emits run metrics + (``case_results`` + ``evaluator_results``). Callers produce outputs and + delegate all evaluator execution here instead of re-implementing it. + """ + spec = load_evalset_spec(evalset_spec, require_cases=True) + outputs_payload = json.loads(Path(outputs_file).read_text(encoding="utf-8")) + if isinstance(outputs_payload, dict) and "outputs" in outputs_payload: + outputs = outputs_payload["outputs"] + else: + outputs = outputs_payload + if not isinstance(outputs, list): + raise typer.BadParameter('--outputs must be a JSON list (or {"outputs": [...]}).') + statuses: Optional[list] = None + if statuses_file: + statuses_payload = json.loads(Path(statuses_file).read_text(encoding="utf-8")) + if isinstance(statuses_payload, dict) and "statuses" in statuses_payload: + statuses_payload = statuses_payload["statuses"] + if statuses_payload is not None and not isinstance(statuses_payload, list): + raise typer.BadParameter("--statuses must be a JSON list.") + statuses = statuses_payload + + metrics = evaluate_evalset(spec, outputs, statuses=statuses) + + if output_file: + Path(output_file).write_text(json.dumps(metrics, indent=2) + "\n", encoding="utf-8") + console.print(f"[green]Metrics written:[/green] {output_file}") + + if raw: + console.print_json(json.dumps(metrics)) + return + + summary = Table(title="Eval Metrics") + summary.add_column("Metric", style="cyan") + summary.add_column("Value", style="white") + summary.add_row("Pass rate", f"{float(metrics.get('pass_rate', 0.0)):.2%}") + summary.add_row("Cases", str(metrics.get("total_cases", 0))) + summary.add_row("Passed", str(metrics.get("passed", 0))) + summary.add_row("Failed", str(metrics.get("failed", 0))) + summary.add_row("Avg score", f"{float(metrics.get('avg_score', 0.0)):.4f}") + console.print(summary) + + evaluator_results = metrics.get("evaluator_results") or [] 
+ if evaluator_results: + evaluators_table = Table(title="Evaluator Results") + evaluators_table.add_column("Evaluator", style="cyan") + evaluators_table.add_column("Scope", style="white") + evaluators_table.add_column("Score", style="white") + evaluators_table.add_column("Passed", style="white") + evaluators_table.add_column("Summary", style="white") + for item in evaluator_results: + if not isinstance(item, dict): + continue + score = item.get("score") + passed = bool(item.get("passed")) + evaluators_table.add_row( + str(item.get("name", "")), + str(item.get("scope", "")), + "n/a" if score is None else f"{float(score):.4f}", + f"[{'green' if passed else 'red'}]{'pass' if passed else 'fail'}[/{'green' if passed else 'red'}]", + str(item.get("summary", "")), + ) + console.print(evaluators_table) @experiments_app.command(name="ls") @@ -1589,18 +533,20 @@ def experiments_list( limit: int = typer.Option(50, "--limit", min=1, max=200), offset: int = typer.Option(0, "--offset", min=0), token: Optional[str] = typer.Option(None, "--token", help="API token."), - ai_agents_url: Optional[str] = typer.Option(None, "--ai-agents-url", help="AI Agents base URL."), - account_uid: Optional[str] = typer.Option(None, "--account-uid", help="Organization/account UID context."), + api_key: Optional[str] = typer.Option(None, "--api-key", help="Authentication API key (alias for --token)."), + billable_account_uid: Optional[str] = typer.Option(None, "--billable-account-uid", help="Billable account UID context (organization/team/user)."), + account_uid: Optional[str] = typer.Option(None, "--account-uid", help="Deprecated alias for --billable-account-uid."), raw: bool = typer.Option(False, "--raw", help="Print raw JSON output."), ) -> None: """List evalset experiments.""" - client = _make_client(token=token, ai_agents_url=ai_agents_url) + resolved_account_uid = _resolve_billable_account_uid(billable_account_uid, account_uid) + client = _make_client(token=token, api_key=api_key) payload = 
client.evals_list_experiments( evalset_id=evalset_id, status=status, limit=limit, offset=offset, - account_uid=account_uid, + account_uid=resolved_account_uid, ) if raw: console.print(payload) @@ -1630,13 +576,16 @@ def experiments_create( evalset_id: Optional[str] = typer.Option(None, "--evalset-id", help="Evalset ID."), description: Optional[str] = typer.Option(None, "--description", help="Description."), status: Optional[str] = typer.Option(None, "--status", help="Initial status."), - spec_file: Optional[str] = typer.Option(None, "--spec-file", help="Path to experiment spec JSON file."), + spec_file: Optional[str] = typer.Option(None, "--spec-file", help="Path to experimentspec JSON file."), + agent_spec_id: Optional[str] = typer.Option(None, "--agent-spec-id", help="Single agentspec id."), + agent_spec_ids: Optional[str] = typer.Option(None, "--agent-spec-ids", help="Comma-separated agentspec ids for multi-experiment creation."), config_json: Optional[str] = typer.Option(None, "--config-json", help="Config JSON object."), summary_json: Optional[str] = typer.Option(None, "--summary-json", help="Summary JSON object."), tags: list[str] = typer.Option([], "--tag", help="Repeatable tag."), token: Optional[str] = typer.Option(None, "--token", help="API token."), - ai_agents_url: Optional[str] = typer.Option(None, "--ai-agents-url", help="AI Agents base URL."), - account_uid: Optional[str] = typer.Option(None, "--account-uid", help="Organization/account UID context."), + api_key: Optional[str] = typer.Option(None, "--api-key", help="Authentication API key (alias for --token)."), + billable_account_uid: Optional[str] = typer.Option(None, "--billable-account-uid", help="Billable account UID context (organization/team/user)."), + account_uid: Optional[str] = typer.Option(None, "--account-uid", help="Deprecated alias for --billable-account-uid."), raw: bool = typer.Option(False, "--raw", help="Print raw JSON output."), ) -> None: """Create an evalset experiment.""" @@ 
-1659,22 +608,64 @@ def experiments_create( spec_tags = spec.get("tags") if isinstance(spec.get("tags"), list) else [] resolved_tags = tags if tags else [str(tag) for tag in spec_tags if str(tag).strip()] - client = _make_client(token=token, ai_agents_url=ai_agents_url) - payload = client.evals_create_experiment( - name=resolved_name, - evalset_id=resolved_evalset_id, - description=resolved_description, - status=resolved_status, - config=resolved_config, - summary=resolved_summary, - tags=resolved_tags, - account_uid=account_uid, - ) + selected_agent_specs = _parse_csv_values(agent_spec_ids) + if agent_spec_id: + selected_agent_specs = [str(agent_spec_id).strip(), *selected_agent_specs] + selected_agent_specs = [value for value in _parse_csv_values(",".join(selected_agent_specs)) if value] + + resolved_account_uid = _resolve_billable_account_uid(billable_account_uid, account_uid) + client = _make_client(token=token, api_key=api_key) + payloads: list[dict[str, Any]] = [] + targets = selected_agent_specs or [""] + for spec_index, target_agent_spec_id in enumerate(targets, start=1): + config_payload = dict(resolved_config) + summary_payload = dict(resolved_summary) + experiment_name = resolved_name + if target_agent_spec_id: + config_payload["agent_spec_id"] = target_agent_spec_id + if not str(config_payload.get("agent_spec_name") or "").strip(): + config_payload["agent_spec_name"] = target_agent_spec_id + summary_payload["agent_spec_id"] = target_agent_spec_id + if not str(summary_payload.get("agent_spec_name") or "").strip(): + summary_payload["agent_spec_name"] = str(config_payload.get("agent_spec_name") or target_agent_spec_id) + if len(targets) > 1: + experiment_name = f"{resolved_name}-{target_agent_spec_id}" + summary_payload["agentspec_variant_index"] = spec_index + + payload = client.evals_create_experiment( + name=experiment_name, + evalset_id=resolved_evalset_id, + description=resolved_description, + status=resolved_status, + config=config_payload, + 
summary=summary_payload, + tags=resolved_tags, + account_uid=resolved_account_uid, + ) + payloads.append(payload) + if raw: - typer.echo(json.dumps(payload)) + typer.echo(json.dumps({"experiments": [item.get("experiment") for item in payloads]})) + return + + if len(payloads) == 1: + experiment = payloads[0].get("experiment") or {} + console.print(f"[green]Experiment created:[/green] {experiment.get('id', '')} ({experiment.get('name', '')})") return - experiment = payload.get("experiment") or {} - console.print(f"[green]Experiment created:[/green] {experiment.get('id', '')} ({experiment.get('name', '')})") + + table = Table(title=f"Experiments Created ({len(payloads)})") + table.add_column("ID", style="cyan") + table.add_column("Name", style="white") + table.add_column("Agentspec", style="white") + for payload in payloads: + experiment = payload.get("experiment") or {} + config = experiment.get("config") if isinstance(experiment.get("config"), dict) else {} + table.add_row( + str(experiment.get("id", "")), + str(experiment.get("name", "")), + str(config.get("agent_spec_id") or "-"), + ) + console.print(table) @runs_app.command(name="ls") @@ -1683,17 +674,19 @@ def runs_list( limit: int = typer.Option(50, "--limit", min=1, max=200), offset: int = typer.Option(0, "--offset", min=0), token: Optional[str] = typer.Option(None, "--token", help="API token."), - ai_agents_url: Optional[str] = typer.Option(None, "--ai-agents-url", help="AI Agents base URL."), - account_uid: Optional[str] = typer.Option(None, "--account-uid", help="Organization/account UID context."), + api_key: Optional[str] = typer.Option(None, "--api-key", help="Authentication API key (alias for --token)."), + billable_account_uid: Optional[str] = typer.Option(None, "--billable-account-uid", help="Billable account UID context (organization/team/user)."), + account_uid: Optional[str] = typer.Option(None, "--account-uid", help="Deprecated alias for --billable-account-uid."), raw: bool = typer.Option(False, 
"--raw", help="Print raw JSON output."), ) -> None: """List runs for an experiment.""" - client = _make_client(token=token, ai_agents_url=ai_agents_url) + resolved_account_uid = _resolve_billable_account_uid(billable_account_uid, account_uid) + client = _make_client(token=token, api_key=api_key) payload = client.evals_list_runs( experiment_id, limit=limit, offset=offset, - account_uid=account_uid, + account_uid=resolved_account_uid, ) if raw: console.print(payload) @@ -1730,8 +723,18 @@ def runs_launch( experiment_id: str = typer.Option(..., "--experiment-id", help="Experiment ID."), status: str = typer.Option("queued", "--status", help="Initial run status."), run_mode: Optional[str] = typer.Option(None, "--run-mode", help="Run mode hint (batch/interactive)."), - runtime_pod_name: Optional[str] = typer.Option(None, "--runtime-pod-name", help="Runtime pod for interactive execution."), + agent_pod_name: Optional[str] = typer.Option(None, "--agent-pod-name", help="Agent pod for interactive execution."), submitted_code_file: Optional[str] = typer.Option(None, "--submitted-code-file", help="Python file to execute in interactive mode."), + evalset_evaluator_json: list[str] = typer.Option( + [], + "--evalset-evaluator-json", + help="Repeatable JSON object for evalset-level evaluators attached to this run context.", + ), + report_evaluator_json: list[str] = typer.Option( + [], + "--report-evaluator-json", + help="Repeatable JSON object for evalset-level report evaluators attached to this run context.", + ), metrics_json: Optional[str] = typer.Option(None, "--metrics-json", help="Inline metrics JSON object."), summary_json: Optional[str] = typer.Option(None, "--summary-json", help="Inline summary JSON object."), report_json: Optional[str] = typer.Option(None, "--report-json", help="Inline report JSON object."), @@ -1741,8 +744,9 @@ def runs_launch( started_at: Optional[str] = typer.Option(None, "--started-at", help="ISO timestamp override."), ended_at: Optional[str] = 
typer.Option(None, "--ended-at", help="ISO timestamp override."), token: Optional[str] = typer.Option(None, "--token", help="API token."), - ai_agents_url: Optional[str] = typer.Option(None, "--ai-agents-url", help="AI Agents base URL."), - account_uid: Optional[str] = typer.Option(None, "--account-uid", help="Organization/account UID context."), + api_key: Optional[str] = typer.Option(None, "--api-key", help="Authentication API key (alias for --token)."), + billable_account_uid: Optional[str] = typer.Option(None, "--billable-account-uid", help="Billable account UID context (organization/team/user)."), + account_uid: Optional[str] = typer.Option(None, "--account-uid", help="Deprecated alias for --billable-account-uid."), ) -> None: """Launch an evalset run on SaaS and tag it as CLI-launched.""" cli_summary: dict[str, Any] = { @@ -1751,14 +755,27 @@ def runs_launch( } if run_mode: cli_summary["run_mode"] = run_mode - if runtime_pod_name: - cli_summary["runtime_pod_name"] = runtime_pod_name + if agent_pod_name: + cli_summary["runtime_pod_name"] = agent_pod_name if submitted_code_file: path = Path(submitted_code_file) if not path.exists(): raise typer.BadParameter(f"submitted code file not found: {submitted_code_file}") cli_summary["submitted_code"] = path.read_text(encoding="utf-8") + evalset_evaluators = _parse_evaluator_specs( + evalset_evaluator_json, + "--evalset-evaluator-json", + ) + report_evaluators = _parse_evaluator_specs( + report_evaluator_json, + "--report-evaluator-json", + ) + if evalset_evaluators: + cli_summary["evalset_evaluators"] = evalset_evaluators + if report_evaluators: + cli_summary["report_evaluators"] = report_evaluators + metrics = _merge_dicts( _parse_json_file(metrics_file, "--metrics-file"), _parse_json_value(metrics_json, "--metrics-json"), @@ -1772,8 +789,19 @@ def runs_launch( _parse_json_file(report_file, "--report-file"), _parse_json_value(report_json, "--report-json"), ) + if evalset_evaluators or report_evaluators: + report = 
_merge_dicts( + report, + { + "evalset_evaluators": { + "evalset_evaluators": evalset_evaluators, + "report_evaluators": report_evaluators, + } + }, + ) - client = _make_client(token=token, ai_agents_url=ai_agents_url) + resolved_account_uid = _resolve_billable_account_uid(billable_account_uid, account_uid) + client = _make_client(token=token, api_key=api_key) payload = client.evals_create_run( experiment_id, status=status, @@ -1782,7 +810,7 @@ def runs_launch( metrics=metrics, summary=summary, report=report, - account_uid=account_uid, + account_uid=resolved_account_uid, ) run = payload.get("run") or {} run_id = str(run.get("id", "")) @@ -1797,16 +825,18 @@ def runs_watch( interval_seconds: float = typer.Option(3.0, "--interval", min=0.5, help="Polling interval."), timeout_seconds: int = typer.Option(600, "--timeout", min=5, help="Timeout in seconds."), token: Optional[str] = typer.Option(None, "--token", help="API token."), - ai_agents_url: Optional[str] = typer.Option(None, "--ai-agents-url", help="AI Agents base URL."), - account_uid: Optional[str] = typer.Option(None, "--account-uid", help="Organization/account UID context."), + api_key: Optional[str] = typer.Option(None, "--api-key", help="Authentication API key (alias for --token)."), + billable_account_uid: Optional[str] = typer.Option(None, "--billable-account-uid", help="Billable account UID context (organization/team/user)."), + account_uid: Optional[str] = typer.Option(None, "--account-uid", help="Deprecated alias for --billable-account-uid."), ) -> None: """Watch a run until completion/failure.""" - client = _make_client(token=token, ai_agents_url=ai_agents_url) + resolved_account_uid = _resolve_billable_account_uid(billable_account_uid, account_uid) + client = _make_client(token=token, api_key=api_key) started = time.time() last_status = "" while True: - payload = client.evals_get_run(run_id, account_uid=account_uid) + payload = client.evals_get_run(run_id, account_uid=resolved_account_uid) run = 
payload.get("run") or {} status = str(run.get("status", "unknown")) if status != last_status: @@ -1837,16 +867,18 @@ def live_targets( window: str = typer.Option("24h", "--window", help="Window: 1h, 6h, 24h, 7d, 30d."), limit: int = typer.Option(50, "--limit", min=1, max=200), token: Optional[str] = typer.Option(None, "--token", help="API token."), - ai_agents_url: Optional[str] = typer.Option(None, "--ai-agents-url", help="AI Agents base URL."), - account_uid: Optional[str] = typer.Option(None, "--account-uid", help="Organization/account UID context."), + api_key: Optional[str] = typer.Option(None, "--api-key", help="Authentication API key (alias for --token)."), + billable_account_uid: Optional[str] = typer.Option(None, "--billable-account-uid", help="Billable account UID context (organization/team/user)."), + account_uid: Optional[str] = typer.Option(None, "--account-uid", help="Deprecated alias for --billable-account-uid."), raw: bool = typer.Option(False, "--raw", help="Print raw JSON output."), ) -> None: """List live monitoring targets.""" - client = _make_client(token=token, ai_agents_url=ai_agents_url) + resolved_account_uid = _resolve_billable_account_uid(billable_account_uid, account_uid) + client = _make_client(token=token, api_key=api_key) payload = client.evals_list_live_targets( window=window, limit=limit, - account_uid=account_uid, + account_uid=resolved_account_uid, ) if raw: console.print(payload) diff --git a/datalayer_core/cli/commands/exec.py b/datalayer_core/cli/commands/exec.py index 999123c3..6527f3a1 100644 --- a/datalayer_core/cli/commands/exec.py +++ b/datalayer_core/cli/commands/exec.py @@ -1,42 +1,53 @@ # Copyright (c) 2023-2025 Datalayer, Inc. # Distributed under the terms of the Modified BSD License. 
-"""Execution application for running code in Datalayer runtimes.""" +"""Execution application for running code in Datalayer code sandboxes.""" from __future__ import annotations import json import signal import sys +import tempfile +import time +from datetime import datetime, timezone from pathlib import Path from typing import Any, Optional +from uuid import uuid4 import typer from rich.console import Console +from rich.table import Table from datalayer_core.client.client import DatalayerClient from datalayer_core.console.manager import RuntimeManager +from datalayer_core.utils.defaults import DEFAULT_ENVIRONMENT +from datalayer_core.utils.network import fetch from datalayer_core.utils.notebook import get_cells # Create the main Typer app for exec functionality app = typer.Typer( name="exec", - help="Execute files or notebooks on runtimes", + help="Execute files or notebooks on code sandboxes", invoke_without_command=True, ) console = Console() +KERNEL_READY_TIMEOUT_SECONDS = 20.0 +KERNEL_PROBE_TIMEOUT_SECONDS = 20.0 +DEFAULT_EXEC_TIMEOUT_SECONDS = 10.0 + @app.callback() def exec_callback(ctx: typer.Context) -> None: - """Execute files or notebooks on runtimes.""" + """Execute files or notebooks on code sandboxes.""" if ctx.invoked_subcommand is None: typer.echo(ctx.get_help()) -class RuntimesExecService: - """Service for executing files on Datalayer runtimes.""" +class CodeSandboxExecService: + """Service for executing files on Datalayer code sandboxes.""" def __init__(self, token: Optional[str] = None) -> None: """Initialize the exec service.""" @@ -60,51 +71,89 @@ def handle_sigint(self, *args: Any) -> None: # so that the interact loop advances, and prompt is redrawn, etc. raise KeyboardInterrupt - def init_kernel_manager(self, runtime_name: str) -> None: - """Initialize the kernel manager and connect to runtime.""" - try: - # Validate runtime only when explicitly provided. - # Empty runtime name delegates selection/creation to RuntimeManager.start_kernel. 
- if runtime_name: - runtimes = self._client.list_runtimes() - target_runtime = None - - for runtime in runtimes: - if runtime.name == runtime_name or runtime.uid == runtime_name: - target_runtime = runtime - break - - if target_runtime is None: - raise RuntimeError(f"Runtime '{runtime_name}' not found") - - # Get token using the same method as DatalayerClient - token = self._client._get_token() - - # Create a RuntimeManager with proper credentials - self.kernel_manager = RuntimeManager( - run_url=self._client.urls.run_url, - token=token or "", - username="", # Username is not required for token-based auth - ) + def init_kernel_manager(self, sandbox_name: str) -> None: + """Initialize the kernel manager and connect to a code sandbox.""" + max_attempts = 2 + last_error: Exception | None = None + # Set up signal handler once. + signal.signal(signal.SIGINT, self.handle_sigint) - # Set up signal handler - signal.signal(signal.SIGINT, self.handle_sigint) + for attempt in range(1, max_attempts + 1): + try: + # Validate sandbox only when explicitly provided. + # Empty sandbox name delegates selection/creation to RuntimeManager.start_kernel. 
+ if sandbox_name: + runtimes = self._client.list_runtimes() + target_sandbox = None - # Start kernel and get client - self.kernel_manager.start_kernel(name=runtime_name or "") - self.kernel_client = self.kernel_manager.client + for runtime in runtimes: + if runtime.name == sandbox_name or runtime.uid == sandbox_name: + target_sandbox = runtime + break - if self.kernel_client: - self.kernel_client.start_channels() - console.print( - f"[green]Connected to runtime: {runtime_name or 'auto-selected'}[/green]" + if target_sandbox is None: + raise RuntimeError(f"Code sandbox '{sandbox_name}' not found") + + # Get token using the same method as DatalayerClient + token = self._client._get_token() + + # Create a RuntimeManager with proper credentials + self.kernel_manager = RuntimeManager( + run_url=self._client.urls.run_url, + token=token or "", + username="", # Username is not required for token-based auth ) - else: - raise RuntimeError("Failed to create kernel client") - except Exception as e: + # Start kernel and get client + self.kernel_manager.start_kernel(name=sandbox_name or "") + + if bool(getattr(self.kernel_manager, "runtime_created_in_start", False)): + self._inspect_created_code_sandbox_kernels() + + self.kernel_client = self.kernel_manager.client + + if not self.kernel_client: + raise RuntimeError("Failed to create kernel client") + + self.kernel_client.start_channels() + # Fresh runtimes can report healthy before the kernel channels are + # fully ready for requests. Wait explicitly to avoid hanging on + # the first execute call. 
+ self.kernel_client.wait_for_ready(timeout=KERNEL_READY_TIMEOUT_SECONDS) + self._probe_kernel_execution() + manager_runtime_name = str(getattr(self.kernel_manager, "runtime_name", "") or sandbox_name or "auto-selected") + manager_runtime_uid = str(getattr(self.kernel_manager, "runtime_uid", "") or "") + manager_kernel_id = str(getattr(self.kernel_manager, "_kernel_id", "") or "") + if manager_runtime_uid or manager_kernel_id: + runtime_ref = f"{manager_runtime_uid}#{manager_kernel_id}".strip("#") + console.print( + f"[green]Connected to code sandbox: {manager_runtime_name} ({runtime_ref})[/green]" + ) + else: + console.print( + f"[green]Connected to code sandbox: {sandbox_name or 'auto-selected'}[/green]" + ) + return + except Exception as e: + last_error = e + self.cleanup() + self.kernel_manager = None + self.kernel_client = None + if attempt < max_attempts: + console.print( + "[yellow]Kernel not ready yet, retrying connection...[/yellow]" + ) + time.sleep(1.5 * attempt) + continue + break + + if last_error is None: + last_error = RuntimeError("Unknown code sandbox initialization failure") + + e = last_error + try: console.print( - f"[red]Failed to connect to runtime '{runtime_name}': {e}[/red]" + f"[red]Failed to connect to code sandbox '{sandbox_name}': {e}[/red]" ) # Provide helpful authentication guidance @@ -117,18 +166,82 @@ def init_kernel_manager(self, runtime_name: str) -> None: "[yellow] 2. Set DATALAYER_API_KEY environment variable[/yellow]" ) console.print("[yellow] 3. 
Use --token option if available[/yellow]") - + finally: raise typer.Exit(1) + def _inspect_created_code_sandbox_kernels(self) -> None: + """Inspect kernels after sandbox auto-creation and fail fast when count != 1.""" + if not self.kernel_manager: + raise RuntimeError("Code sandbox manager is not initialized") + + server_url = str(getattr(self.kernel_manager, "server_url", "") or "").rstrip("/") + sandbox_token = str(getattr(self.kernel_manager, "token", "") or "") + sandbox_name = str(getattr(self.kernel_manager, "runtime_name", "") or "") + sandbox_uid = str(getattr(self.kernel_manager, "runtime_uid", "") or "") + sandbox_pod = str(getattr(self.kernel_manager, "runtime_pod_name", "") or "") + + response = fetch(f"{server_url}/api/kernels", token=sandbox_token, timeout=15) + kernels = response.json() if response.content else [] + if not isinstance(kernels, list): + kernels = [] + + summary = Table(title="Code Sandbox Inspection (auto-created by exec)") + summary.add_column("Field", style="cyan") + summary.add_column("Value") + summary.add_row("Code Sandbox", sandbox_name or sandbox_pod) + summary.add_row("Pod", sandbox_pod) + summary.add_row("UID", sandbox_uid) + summary.add_row("Ingress", server_url) + summary.add_row("Kernels", str(len(kernels))) + console.print(summary) + + code_sandboxes_table = Table(title="Available Code Sandboxes") + code_sandboxes_table.add_column("ID", style="green") + code_sandboxes_table.add_column("Name") + code_sandboxes_table.add_column("State") + code_sandboxes_table.add_column("Connections") + code_sandboxes_table.add_column("Last Activity") + for kernel in kernels: + code_sandboxes_table.add_row( + str((kernel or {}).get("id") or ""), + str((kernel or {}).get("name") or ""), + str((kernel or {}).get("execution_state") or ""), + str((kernel or {}).get("connections") or "0"), + str((kernel or {}).get("last_activity") or ""), + ) + if kernels: + console.print(code_sandboxes_table) + + if len(kernels) != 1: + raise RuntimeError( + 
f"Auto-created code sandbox expected exactly one kernel, found {len(kernels)}" + ) + + def _probe_kernel_execution(self) -> None: + """Validate the kernel can execute a trivial statement before running user code.""" + if not self.kernel_client: + raise RuntimeError("Kernel client not initialized") + + def _noop_output_hook(msg: dict[str, Any]) -> None: + # A stream-based probe validates the same IOPub path used by cells. + _ = msg + + self.kernel_client.execute_interactive( + "print('__datalayer_probe__')", + silent=False, + timeout=KERNEL_PROBE_TIMEOUT_SECONDS, + output_hook=_noop_output_hook, + ) + def execute_file( self, filepath: Path, silent: bool = True, timeout: Optional[float] = None, raise_exceptions: bool = False, - ) -> None: + ) -> dict[str, Any]: """ - Execute a file or notebook on the connected runtime. + Execute a file or notebook on the connected code sandbox. Parameters ---------- @@ -144,19 +257,35 @@ def execute_file( if not self.kernel_client: raise RuntimeError("Kernel client not initialized") + report: dict[str, Any] = { + "input_file": str(filepath), + "cells": [], + } + try: self._executing = True console.print(f"[blue]Executing file: {filepath}[/blue]") + # Guardrail: ensure the selected code sandbox endpoint is reachable + # before submitting any execute requests. 
+ self._assert_code_sandbox_alive() + self._prepare_kernel_before_execution() + # Get cells from the file cells = list(get_cells(filepath)) if not cells: console.print("[yellow]No executable cells found in file[/yellow]") - return + return report total_cells = len(cells) console.print(f"[blue]Found {total_cells} cell(s) to execute[/blue]") + failed_cells = 0 + effective_timeout = ( + float(timeout) + if timeout is not None + else DEFAULT_EXEC_TIMEOUT_SECONDS + ) # Execute each cell for i, (cell_id, cell_source) in enumerate(cells, 1): @@ -164,11 +293,69 @@ def execute_file( continue console.print(f"[blue]Executing cell {i}/{total_cells}...[/blue]") + self._print_cell_source(i, cell_source) + captured_outputs: list[dict[str, Any]] = [] + + def output_hook(msg: dict[str, Any]) -> None: + msg_type = str(msg.get("msg_type") or "") + content = msg.get("content") or {} + + if msg_type == "stream": + captured_outputs.append( + { + "output_type": "stream", + "name": content.get("name", "stdout"), + "text": content.get("text", ""), + } + ) + return + + if msg_type in {"display_data", "execute_result"}: + data = content.get("data") or {} + captured_outputs.append( + { + "output_type": msg_type, + "data": data, + "metadata": content.get("metadata") or {}, + "execution_count": content.get("execution_count"), + } + ) + return + + if msg_type == "error": + captured_outputs.append( + { + "output_type": "error", + "ename": content.get("ename"), + "evalue": content.get("evalue"), + "traceback": content.get("traceback") or [], + } + ) + + cell_report: dict[str, Any] = { + "cell_index": i, + "cell_id": cell_id, + "status": "ok", + "outputs": captured_outputs, + } try: - reply = self.kernel_client.execute_interactive( - cell_source, silent=silent, timeout=timeout - ) + try: + reply = self.kernel_client.execute_interactive( + cell_source, + silent=silent, + timeout=effective_timeout, + output_hook=output_hook, + ) + except TypeError: + # Backward compatibility when output_hook is not 
available. + reply = self.kernel_client.execute_interactive( + cell_source, + silent=silent, + timeout=effective_timeout, + ) + + cell_report["reply"] = reply.get("content") if isinstance(reply, dict) else {} if raise_exceptions and reply["content"]["status"] != "ok": content = reply["content"] @@ -186,6 +373,8 @@ def execute_file( f"Unknown failure: {json.dumps(content)}" ) + self._print_cell_outputs(i, captured_outputs) + # Show success for each cell if not silent if not silent: status = reply["content"]["status"] @@ -198,13 +387,34 @@ def execute_file( f"[yellow]⚠ Cell {i} completed with status: {status}[/yellow]" ) + if reply["content"].get("status") != "ok": + cell_report["status"] = str(reply["content"].get("status") or "error") + failed_cells += 1 + except Exception as e: if raise_exceptions: raise + failed_cells += 1 + cell_report["status"] = "error" + cell_report["error"] = str(e) console.print(f"[yellow]Warning: Cell {i} failed: {e}[/yellow]") + finally: + report["cells"].append(cell_report) - console.print("[green]✓ Execution completed successfully[/green]") + if failed_cells > 0: + console.print( + f"[red]Execution completed with {failed_cells} failed cell(s).[/red]" + ) + report["failed_cells"] = failed_cells + report["success"] = False + else: + console.print("[green]✓ Execution completed successfully[/green]") + report["failed_cells"] = 0 + report["success"] = True + return report + except typer.Exit: + raise except Exception as e: if raise_exceptions: raise @@ -213,6 +423,129 @@ def execute_file( finally: self._executing = False + def _print_cell_outputs(self, cell_index: int, outputs: list[dict[str, Any]]) -> None: + """Print collected outputs for a cell after execution.""" + if not outputs: + console.print(f"[dim]Cell {cell_index} output: (no output)[/dim]") + return + + console.print(f"[cyan]Cell {cell_index} output:[/cyan]") + for output in outputs: + output_type = str(output.get("output_type") or "") + if output_type == "stream": + text = 
str(output.get("text") or "").rstrip("\n") + if text: + console.print(text) + continue + + if output_type in {"display_data", "execute_result"}: + data = output.get("data") or {} + text_plain = "" + if isinstance(data, dict): + text_plain = str(data.get("text/plain") or "").rstrip("\n") + if text_plain: + console.print(text_plain) + else: + console.print(json.dumps(output, ensure_ascii=False)) + continue + + if output_type == "error": + traceback = output.get("traceback") or [] + if traceback: + console.print("[red]" + "\n".join(str(line) for line in traceback) + "[/red]") + else: + ename = str(output.get("ename") or "Error") + evalue = str(output.get("evalue") or "") + console.print(f"[red]{ename}: {evalue}[/red]") + continue + + console.print(json.dumps(output, ensure_ascii=False)) + + def _print_cell_source(self, cell_index: int, source: str) -> None: + """Print the source code that will be sent to the kernel for execution.""" + console.print(f"[cyan]Cell {cell_index} source:[/cyan]") + console.print("[dim][/dim]") + console.print(source.rstrip("\n")) + console.print("[dim][/dim]") + + def _assert_code_sandbox_alive(self) -> None: + """Fail early when the selected code sandbox endpoint is not reachable.""" + if not self.kernel_manager: + raise RuntimeError("Code sandbox manager is not initialized") + + server_url = str(getattr(self.kernel_manager, "server_url", "") or "").rstrip("/") + sandbox_token = str(getattr(self.kernel_manager, "token", "") or "") + if not server_url: + raise RuntimeError("Code sandbox endpoint is not available") + + attempts = 5 + last_error: Exception | None = None + for attempt in range(1, attempts + 1): + try: + fetch(f"{server_url}/api/kernels", token=sandbox_token, timeout=15) + return + except Exception as e: + last_error = e + if attempt < attempts: + time.sleep(0.4 * attempt) + continue + break + + raise RuntimeError( + f"Code sandbox health check failed for '{server_url}': {last_error}" + ) from last_error + + def 
_prepare_kernel_before_execution(self) -> None: + """List kernels visible on the code sandbox before execution starts.""" + kernels = self._fetch_code_sandbox_kernels() + self._print_available_kernels( + title="Kernels available before execution:", + kernels=kernels, + ) + + def _fetch_code_sandbox_kernels(self) -> list[dict[str, Any]]: + """Fetch kernels from the current code sandbox.""" + if not self.kernel_manager: + return [] + + server_url = str(getattr(self.kernel_manager, "server_url", "") or "").rstrip("/") + sandbox_token = str(getattr(self.kernel_manager, "token", "") or "") + if not server_url: + return [] + + response = fetch(f"{server_url}/api/kernels", token=sandbox_token, timeout=15) + kernels = response.json() if response.content else [] + if not isinstance(kernels, list): + return [] + return [kernel for kernel in kernels if isinstance(kernel, dict)] + + def _print_available_kernels( + self, + title: str, + kernels: list[dict[str, Any]], + ) -> None: + """Print kernels currently visible on the code sandbox.""" + selected_kernel_id = str(getattr(self.kernel_manager, "_kernel_id", "") or "") + + if not kernels: + console.print(f"[yellow]{title} none[/yellow]") + return + + console.print(f"[blue]{title}[/blue]") + for kernel in sorted( + kernels, + key=lambda kernel: str((kernel or {}).get("id") or ""), + ): + kernel_id = str((kernel or {}).get("id") or "") + kernel_name = str((kernel or {}).get("name") or "") + execution_state = str((kernel or {}).get("execution_state") or "") + connections = (kernel or {}).get("connections") + last_activity = str((kernel or {}).get("last_activity") or "") + marker = "*" if selected_kernel_id and kernel_id == selected_kernel_id else " " + console.print( + f" [{marker}] id={kernel_id} name={kernel_name} state={execution_state} connections={connections} last_activity={last_activity}" + ) + def cleanup(self) -> None: """Clean up resources.""" if self.kernel_client: @@ -225,18 +558,24 @@ def cleanup(self) -> None: # Main 
execution function decorated as the default command @app.command() def main( - filename: str = typer.Argument(..., help="Path to the file or notebook to execute"), - runtime: Optional[str] = typer.Option( + filename: Optional[str] = typer.Argument( None, - "--runtime", - "-r", - help="Name of the runtime to execute on (uses first available if not specified)", + help="Path to the file or notebook to execute", + ), + sandbox: Optional[str] = typer.Option( + None, + "--sandbox", + "-s", + help="Name of the code sandbox to execute on (uses first available if not specified)", ), verbose: bool = typer.Option( False, "--verbose", "-v", help="Show all cell outputs" ), timeout: Optional[float] = typer.Option( - None, "--timeout", "-t", help="Execution timeout for each cell in seconds" + None, + "--timeout", + "-t", + help="Execution timeout for each cell in seconds", ), raise_exceptions: bool = typer.Option( False, "--raise", help="Stop executing if an exception occurs" @@ -246,66 +585,254 @@ def main( "--token", help="Authentication token (Bearer token for API requests).", ), + api_key: Optional[str] = typer.Option( + None, + "--api-key", + help="Authentication API key (alias for --token).", + ), + example_notebook: bool = typer.Option( + False, + "--example-notebook", + help="Create a temporary example notebook, execute it, then remove it.", + ), + example_py: bool = typer.Option( + False, + "--example-py", + help="Create a temporary example Python file, execute it, then remove it.", + ), + output_name: Optional[str] = typer.Option( + None, + "--output-name", + help="Output report filename/path. 
Defaults to .out.json next to the input file.", + ), ) -> None: - """Execute a Python file or Jupyter notebook on a Datalayer runtime.""" - - # Resolve file path - filepath = Path(filename).expanduser().resolve() + """Execute a Python file or Jupyter notebook on a Datalayer code sandbox.""" - # Check if file exists and is readable - if not filepath.exists(): - console.print(f"[red]Error: File '{filepath}' does not exist[/red]") - raise typer.Exit(1) + auth_token = token or api_key - if not filepath.is_file(): - console.print(f"[red]Error: '{filepath}' is not a file[/red]") + if example_notebook and example_py: + console.print( + "[red]Error: --example-notebook and --example-py are mutually exclusive[/red]" + ) raise typer.Exit(1) - try: - with filepath.open("rb"): - pass - except Exception as e: + if filename and (example_notebook or example_py): console.print( - f"[red]Error: Could not open file '{filepath}' for reading: {e}[/red]" + "[red]Error: provide either a filename or one --example-* flag, not both[/red]" ) raise typer.Exit(1) - # Check file extension - if filepath.suffix not in [".py", ".ipynb"]: + if not filename and not example_notebook and not example_py: console.print( - f"[yellow]Warning: File extension '{filepath.suffix}' is not .py or .ipynb[/yellow]" + "[red]Error: missing FILE_PATH or an --example-* option[/red]" ) + raise typer.Exit(1) - # Determine which runtime to use - selected_runtime = runtime - if selected_runtime is None: - selected_runtime = _select_runtime(token=token) - - # Create exec service and execute - exec_service = RuntimesExecService(token=token) + generated_example = False + filepath: Path + if example_notebook: + filepath = _create_example_notebook_file() + generated_example = True + console.print(f"[blue]Generated example notebook: {filepath}[/blue]") + elif example_py: + filepath = _create_example_python_file() + generated_example = True + console.print(f"[blue]Generated example Python file: {filepath}[/blue]") + else: + # 
Resolve file path + filepath = Path(str(filename)).expanduser().resolve() try: - # Initialize connection to runtime - exec_service.init_kernel_manager(selected_runtime) - - # Execute the file - exec_service.execute_file( - filepath=filepath, - silent=not verbose, - timeout=timeout, - raise_exceptions=raise_exceptions, - ) + # Check if file exists and is readable + if not filepath.exists(): + console.print(f"[red]Error: File '{filepath}' does not exist[/red]") + raise typer.Exit(1) + if not filepath.is_file(): + console.print(f"[red]Error: '{filepath}' is not a file[/red]") + raise typer.Exit(1) + + try: + with filepath.open("rb"): + pass + except Exception as e: + console.print( + f"[red]Error: Could not open file '{filepath}' for reading: {e}[/red]" + ) + raise typer.Exit(1) + + # Check file extension + if filepath.suffix not in [".py", ".ipynb"]: + console.print( + f"[yellow]Warning: File extension '{filepath.suffix}' is not .py or .ipynb[/yellow]" + ) + + # Determine which code sandbox to use + selected_sandbox = sandbox + if selected_sandbox is None: + selected_sandbox = _select_code_sandbox(token=auth_token) + + # Create exec service and execute + exec_service = CodeSandboxExecService(token=auth_token) + + try: + # Initialize connection to code sandbox + exec_service.init_kernel_manager(selected_sandbox) + + # Execute the file + execution_report = exec_service.execute_file( + filepath=filepath, + silent=not verbose, + timeout=timeout, + raise_exceptions=raise_exceptions, + ) + + report_path = _resolve_output_report_path(filepath, output_name) + execution_report["output_file"] = str(report_path) + report_path.write_text( + json.dumps(execution_report, indent=2, ensure_ascii=False), + encoding="utf-8", + ) + console.print(f"[green]Saved execution outputs: {report_path}[/green]") + console.print(f"[green]Full output report path: {report_path.resolve()}[/green]") + if int(execution_report.get("failed_cells") or 0) > 0: + raise typer.Exit(1) + + finally: + # Always 
cleanup + exec_service.cleanup() finally: - # Always cleanup - exec_service.cleanup() + if generated_example: + try: + filepath.unlink(missing_ok=True) + except Exception as e: + console.print( + f"[yellow]Warning: could not remove temporary example file '{filepath}': {e}[/yellow]" + ) + + +def _example_file_path(suffix: str) -> Path: + ts = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%S%fZ") + name = f"datalayer-exec-example-{ts}-{uuid4().hex[:8]}{suffix}" + return Path(tempfile.gettempdir()) / name -def _select_runtime(token: Optional[str] = None) -> str: +def _create_example_python_file() -> Path: + path = _example_file_path(".py") + path.write_text( + "import json\n" + "import pandas as pd\n\n" + "pd.set_option('display.max_rows', None)\n" + "pd.set_option('display.max_columns', None)\n" + "pd.set_option('display.width', None)\n\n" + "print('Python example: building sample sales dataframe')\n" + "df = pd.DataFrame({\n" + " 'day': ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat'],\n" + " 'region': ['north', 'north', 'south', 'south', 'west', 'west'],\n" + " 'orders': [12, 14, 8, 11, 9, 15],\n" + " 'revenue': [240, 310, 175, 220, 190, 360],\n" + "})\n\n" + "print('DataFrame:')\n" + "print(df.to_string(index=False))\n\n" + "print('Grouped summary by region:')\n" + "summary = (\n" + " df.groupby('region', as_index=False)\n" + " .agg(total_orders=('orders', 'sum'), total_revenue=('revenue', 'sum'))\n" + " .sort_values('total_revenue', ascending=False)\n" + ")\n" + "print(summary.to_string(index=False))\n\n" + "payload = {\n" + " 'rows': int(len(df)),\n" + " 'best_region': str(summary.iloc[0]['region']),\n" + " 'total_revenue': int(df['revenue'].sum()),\n" + "}\n" + "print('JSON summary:')\n" + "print(json.dumps(payload, indent=2))\n", + encoding="utf-8", + ) + return path + + +def _create_example_notebook_file() -> Path: + path = _example_file_path(".ipynb") + notebook_payload = { + "cells": [ + { + "id": f"cell-{uuid4().hex[:8]}", + "cell_type": "code", + 
"execution_count": None, + "metadata": {"id": f"cell-{uuid4().hex[:8]}", "language": "python"}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "pd.set_option('display.max_rows', None)\n", + "pd.set_option('display.max_columns', None)\n", + "pd.set_option('display.width', None)\n", + "print('Notebook example: pandas setup complete')\n", + ], + }, + { + "id": f"cell-{uuid4().hex[:8]}", + "cell_type": "code", + "execution_count": None, + "metadata": {"id": f"cell-{uuid4().hex[:8]}", "language": "python"}, + "outputs": [], + "source": [ + "df = pd.DataFrame({\n", + " 'day': ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat'],\n", + " 'region': ['north', 'north', 'south', 'south', 'west', 'west'],\n", + " 'orders': [12, 14, 8, 11, 9, 15],\n", + " 'revenue': [240, 310, 175, 220, 190, 360],\n", + "})\n", + "print('Raw dataframe:')\n", + "print(df.to_string(index=False))\n", + ], + }, + { + "id": f"cell-{uuid4().hex[:8]}", + "cell_type": "code", + "execution_count": None, + "metadata": {"id": f"cell-{uuid4().hex[:8]}", "language": "python"}, + "outputs": [], + "source": [ + "summary = (\n", + " df.groupby('region', as_index=False)\n", + " .agg(total_orders=('orders', 'sum'), total_revenue=('revenue', 'sum'))\n", + " .sort_values('total_revenue', ascending=False)\n", + ")\n", + "print('Revenue summary by region:')\n", + "print(summary.to_string(index=False))\n", + "print('Top region:', summary.iloc[0]['region'])\n", + ], + }, + ], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 5, + } + path.write_text(json.dumps(notebook_payload), encoding="utf-8") + return path + + +def _resolve_output_report_path(filepath: Path, output_name: Optional[str]) -> Path: + """Compute output report path for collected execution outputs.""" + if output_name: + candidate = Path(output_name).expanduser() + if candidate.is_absolute(): + return candidate + return filepath.parent / candidate + + # notebook-name.ipynb -> notebook-name.out.json + # script.py -> script.out.json + return 
filepath.with_suffix(".out.json") + + +def _select_code_sandbox(token: Optional[str] = None) -> str: """ - Select a runtime to use for execution. + Select a code sandbox to use for execution. - Returns the first available runtime, or prompts to create one if none exist. + Returns the first available code sandbox, or interactively provisions one when + no code sandbox is available. Parameters ---------- @@ -315,21 +842,147 @@ def _select_runtime(token: Optional[str] = None) -> str: Returns ------- str - The name/ID of the runtime to use. + The name/ID of the code sandbox to use. """ try: client = DatalayerClient(token=token) runtimes = client.list_runtimes() if not runtimes: - # Return an empty runtime name to trigger RuntimeManager's built-in - # interactive flow that can launch a runtime from an environment. - return "" + console.print("[yellow]No code sandbox is running.[/yellow]") + + should_create = typer.confirm( + "No code sandbox is available. Create one now?", + default=True, + ) + if not should_create: + console.print("[red]Execution aborted: no code sandbox selected.[/red]") + raise typer.Exit(1) + + environment = DEFAULT_ENVIRONMENT + burn_rate = _get_environment_burning_rate(client, environment) + remaining_credits = _get_remaining_credits_after_reservations(client) + default_seconds = _default_code_sandbox_seconds( + remaining_credits=remaining_credits, + burn_rate=burn_rate, + ) + + console.print( + f"[blue]Environment: {environment} (burning_rate={burn_rate:.6f} credits/s)[/blue]" + ) + console.print( + f"[blue]Remaining credits (after reservations): {remaining_credits:.6f}[/blue]" + ) + console.print( + f"[blue]Suggested code sandbox duration: {default_seconds:.2f} seconds (33% of remaining credits)[/blue]" + ) + + requested_seconds = typer.prompt( + "Code sandbox duration in seconds", + type=float, + default=default_seconds, + show_default=True, + ) + if requested_seconds <= 0: + console.print("[red]Code sandbox duration must be greater than 0 
seconds.[/red]") + raise typer.Exit(1) + + requested_credits = burn_rate * requested_seconds + time_reservation_minutes = requested_seconds / 60.0 + console.print( + f"[blue]Requested reservation: {requested_seconds:.2f}s -> {requested_credits:.6f} credits[/blue]" + ) + + created_runtime = client.create_runtime( + environment=environment, + time_reservation=time_reservation_minutes, + ) - # Use the first available runtime + sandbox_name = str(created_runtime.name or "") + sandbox_uid = str(created_runtime.uid or "") + sandbox_pod = str(created_runtime.pod_name or "") + sandbox_ingress = str(created_runtime.ingress or "").rstrip("/") + sandbox_token = str( + created_runtime.jupyter_token or client._get_token() or "" + ) + + if not sandbox_ingress or not sandbox_token: + console.print( + "[red]Code sandbox created but ingress/token is not available for inspection.[/red]" + ) + raise typer.Exit(1) + + pre_confirm_kernel_id = _inspect_code_sandbox_kernels_unique( + sandbox_name=sandbox_name or sandbox_pod or sandbox_uid, + sandbox_uid=sandbox_uid, + sandbox_pod=sandbox_pod, + sandbox_ingress=sandbox_ingress, + sandbox_token=sandbox_token, + inspection_label="post-create", + ) + + proceed = typer.confirm( + "Proceed with execution on this code sandbox?", + default=True, + ) + if not proceed: + console.print("[red]Execution aborted by user.[/red]") + raise typer.Exit(1) + + post_confirm_kernel_id = _inspect_code_sandbox_kernels_unique( + sandbox_name=sandbox_name or sandbox_pod or sandbox_uid, + sandbox_uid=sandbox_uid, + sandbox_pod=sandbox_pod, + sandbox_ingress=sandbox_ingress, + sandbox_token=sandbox_token, + inspection_label="pre-exec confirmation", + ) + + if post_confirm_kernel_id != pre_confirm_kernel_id: + console.print( + "[red]Kernel changed between inspections. 
Failing fast before execution.[/red]" + ) + raise typer.Exit(1) + + selected_name = sandbox_uid or sandbox_name or sandbox_pod + if not selected_name: + console.print( + "[red]Code sandbox created but no code sandbox identifier is available.[/red]" + ) + raise typer.Exit(1) + + console.print( + f"[green]Using newly created code sandbox: {selected_name}#{post_confirm_kernel_id}[/green]" + ) + return selected_name + + # Use the first available code sandbox selected = runtimes[0] + sandbox_uid = str(selected.uid or "") + kernel_id = "" + try: + runtime_token = str(getattr(selected, "jupyter_token", "") or client._get_token() or "") + ingress = str(getattr(selected, "ingress", "") or "").rstrip("/") + if ingress and runtime_token: + response = fetch(f"{ingress}/api/kernels", token=runtime_token, timeout=10) + kernels = response.json() if response.content else [] + if isinstance(kernels, list) and kernels: + ordered = sorted( + ( + str((kernel or {}).get("id") or "") + for kernel in kernels + ) + ) + kernel_id = ordered[0] if ordered else "" + except Exception: + kernel_id = "" + + sandbox_ref = sandbox_uid + if sandbox_uid and kernel_id: + sandbox_ref = f"{sandbox_uid}#{kernel_id}" + console.print( - f"[blue]No runtime specified, using: {selected.name} ({selected.uid})[/blue]" + f"[blue]No code sandbox specified, using: {selected.name} ({sandbox_ref})[/blue]" ) return selected.name or selected.uid or "" @@ -337,12 +990,132 @@ def _select_runtime(token: Optional[str] = None) -> str: # Re-raise typer.Exit without modification raise except Exception as e: - console.print(f"[red]Error checking available runtimes: {e}[/red]") + console.print(f"[red]Error checking available code sandboxes: {e}[/red]") console.print( "[yellow]Hint: Make sure you're authenticated with 'dla login'[/yellow]" ) raise typer.Exit(1) +def _get_environment_burning_rate(client: DatalayerClient, environment: str) -> float: + """Get environment burning rate in credits/second.""" + environments = 
client.list_environments() + for env in environments: + if str(env.name or "") == environment: + burn_rate = float(env.burning_rate or 0.0) + if burn_rate <= 0: + raise RuntimeError( + f"Environment '{environment}' has invalid burning rate: {burn_rate}" + ) + return burn_rate + raise RuntimeError( + f"Environment '{environment}' not found. Available environments: {[str(env.name or '') for env in environments]}" + ) + + +def _to_float(value: Any, default: float = 0.0) -> float: + """Safely parse a float-like value.""" + try: + if value is None: + return default + return float(value) + except Exception: + return default + + +def _get_remaining_credits_after_reservations(client: DatalayerClient) -> float: + """Compute remaining credits after reservations from usage payload.""" + usage = client.get_usage_credits() + if not usage.get("success", True): + raise RuntimeError( + f"Failed to load usage credits: {usage.get('message', 'Unknown error')}" + ) + + credits = usage.get("credits", {}) or {} + reservations = usage.get("reservations", []) or [] + + credits_value = _to_float(credits.get("credits"), 0.0) + quota = credits.get("quota") + + if quota is None: + available_before_reservations = credits_value + else: + available_before_reservations = _to_float(quota, 0.0) - credits_value + + reserved_total = 0.0 + for reservation in reservations: + if not isinstance(reservation, dict): + continue + reserved_total += _to_float(reservation.get("credits"), 0.0) + + remaining = available_before_reservations - reserved_total + return max(0.0, remaining) + + +def _default_code_sandbox_seconds(remaining_credits: float, burn_rate: float) -> float: + """Suggest code sandbox duration in seconds using 33% of remaining credits.""" + proposed_credits = max(0.0, remaining_credits * 0.33) + if burn_rate <= 0: + raise RuntimeError("Burning rate must be positive to compute duration") + seconds = proposed_credits / burn_rate + # Keep a practical positive default even when credits are very low. 
+ return max(10.0, seconds) + + +def _inspect_code_sandbox_kernels_unique( + sandbox_name: str, + sandbox_uid: str, + sandbox_pod: str, + sandbox_ingress: str, + sandbox_token: str, + inspection_label: str, +) -> str: + """Inspect code sandbox kernels and return the unique kernel id. + + Fails fast if the code sandbox does not expose exactly one kernel. + """ + response = fetch(f"{sandbox_ingress}/api/kernels", token=sandbox_token, timeout=15) + kernels = response.json() if response.content else [] + if not isinstance(kernels, list): + kernels = [] + + summary = Table(title=f"Code Sandbox Inspection ({inspection_label})") + summary.add_column("Field", style="cyan") + summary.add_column("Value") + summary.add_row("Code Sandbox", sandbox_name) + summary.add_row("Pod", sandbox_pod) + summary.add_row("UID", sandbox_uid) + summary.add_row("Ingress", sandbox_ingress) + summary.add_row("Code Sandboxes", str(len(kernels))) + console.print(summary) + + code_sandboxes_table = Table(title="Available Code Sandboxes") + code_sandboxes_table.add_column("ID", style="green") + code_sandboxes_table.add_column("Name") + code_sandboxes_table.add_column("State") + code_sandboxes_table.add_column("Connections") + code_sandboxes_table.add_column("Last Activity") + for kernel in kernels: + code_sandboxes_table.add_row( + str((kernel or {}).get("id") or ""), + str((kernel or {}).get("name") or ""), + str((kernel or {}).get("execution_state") or ""), + str((kernel or {}).get("connections") or "0"), + str((kernel or {}).get("last_activity") or ""), + ) + if kernels: + console.print(code_sandboxes_table) + + if len(kernels) != 1: + raise RuntimeError( + f"Code sandbox inspection requires exactly one kernel; found {len(kernels)}" + ) + + kernel_id = str((kernels[0] or {}).get("id") or "").strip() + if not kernel_id: + raise RuntimeError("Code sandbox inspection returned a kernel without an id") + return kernel_id + + if __name__ == "__main__": app() diff --git 
a/datalayer_core/cli/commands/plans.py b/datalayer_core/cli/commands/plans.py index db55ed4b..a9b68052 100644 --- a/datalayer_core/cli/commands/plans.py +++ b/datalayer_core/cli/commands/plans.py @@ -3,6 +3,7 @@ """Plans commands for Datalayer CLI.""" +import os from typing import Any, Optional import typer @@ -51,7 +52,7 @@ def _make_client( def plans_callback(ctx: typer.Context) -> None: """Plans and subscription commands.""" if ctx.invoked_subcommand is None: - ctx.invoke(plans_show) + plans_show(token=None, iam_url=None, raw=False) def _format_number(value: Any, fallback: str = "-") -> str: @@ -323,7 +324,8 @@ def plans_catalog( help="Datalayer IAM server URL", ), billable_account_uid: Optional[str] = typer.Option( - None, + os.environ.get("DATALAYER_ACCOUNT_UID") + or os.environ.get("DATALAYER_BILLABLE_ACCOUNT_UID"), "--billable-account-uid", help="Optional billable account UID scope.", ), diff --git a/datalayer_core/cli/commands/pools.py b/datalayer_core/cli/commands/pools.py index 6d19244e..8c61e4ae 100644 --- a/datalayer_core/cli/commands/pools.py +++ b/datalayer_core/cli/commands/pools.py @@ -1,3 +1,6 @@ +# Copyright (c) 2023-2025 Datalayer, Inc. +# Distributed under the terms of the Modified BSD License. + # Copyright (c) 2023-2026 Datalayer, Inc. # Distributed under the terms of the Modified BSD License. diff --git a/datalayer_core/cli/commands/ray.py b/datalayer_core/cli/commands/ray.py index b9060c9e..3773fe6e 100644 --- a/datalayer_core/cli/commands/ray.py +++ b/datalayer_core/cli/commands/ray.py @@ -1,3 +1,6 @@ +# Copyright (c) 2023-2025 Datalayer, Inc. +# Distributed under the terms of the Modified BSD License. + # Copyright (c) 2023-2026 Datalayer, Inc. # Distributed under the terms of the Modified BSD License. 
@@ -39,13 +42,25 @@ ) console = Console() +_RAY_RUNTIMES_URL_OVERRIDE: Optional[str] = None _ANSI_ESCAPE_RE = re.compile(r"\x1B\[[0-?]*[ -/]*[@-~]") @app.callback() -def ray_callback(ctx: typer.Context) -> None: +def ray_callback( + ctx: typer.Context, + runtimes_url: Optional[str] = typer.Option( + None, + "--runtimes-url", + help="Datalayer Runtimes server URL.", + ), +) -> None: """Ray management commands.""" + global _RAY_RUNTIMES_URL_OVERRIDE + _RAY_RUNTIMES_URL_OVERRIDE = ( + str(runtimes_url).strip().rstrip("/") if runtimes_url else None + ) if ctx.invoked_subcommand is None: typer.echo(ctx.get_help()) @@ -67,9 +82,7 @@ def jobs_callback(ctx: typer.Context) -> None: def _make_client( token: Optional[str] = None, ) -> DatalayerClient: - urls = DatalayerURLs.from_environment() - # Ray CLI is intentionally routed via runtimes, never directly to ray_url. - urls.ray_url = urls.runtimes_url + urls = DatalayerURLs.from_environment(runtimes_url=_RAY_RUNTIMES_URL_OVERRIDE) return DatalayerClient(urls=urls, token=token) @@ -348,31 +361,50 @@ def jobs_list( token: Optional[str] = typer.Option(None, "--token", help="API token."), raw: bool = typer.Option(False, "--raw", help="Print raw JSON."), ) -> None: - client = _make_client(token=token) - payload = client.ray_list_jobs(namespace=namespace, cluster_name=cluster_name) - if raw: - _print_json(payload) - return + try: + client = _make_client(token=token) + payload = client.ray_list_jobs(namespace=namespace, cluster_name=cluster_name) + if raw: + _print_json(payload) + return - items = payload.get("jobs") or [] - table = Table(title=f"Ray Jobs ({len(items)})") - table.add_column("Name", style="cyan") - table.add_column("Namespace") - table.add_column("Cluster") - table.add_column("Status") + items = payload.get("jobs") or [] + table = Table(title=f"Ray Jobs ({len(items)})") + table.add_column("Name", style="cyan") + table.add_column("Namespace") + table.add_column("Cluster") + table.add_column("Status") + + for item in 
items: + metadata = item.get("metadata") or {} + labels = metadata.get("labels") or {} + status = item.get("status") or {} + table.add_row( + str(metadata.get("name", "")), + str(metadata.get("namespace", namespace)), + str(labels.get("ray.io/cluster", "")), + str(status.get("jobStatus", "")), + ) - for item in items: - metadata = item.get("metadata") or {} - labels = metadata.get("labels") or {} - status = item.get("status") or {} - table.add_row( - str(metadata.get("name", "")), - str(metadata.get("namespace", namespace)), - str(labels.get("ray.io/cluster", "")), - str(status.get("jobStatus", "")), - ) + console.print(table) + except Exception as exc: + message = str(exc).strip() or "Unknown Ray jobs error" + lowered = message.lower() - console.print(table) + if "no ray provider registered" in lowered: + console.print("[red]Unable to list Ray jobs:[/red] No Ray provider registered.") + console.print( + "[yellow]Hint:[/yellow] Start or register a Ray provider in the runtimes service, then retry [bold]d ray jobs ls[/bold]." + ) + elif "status=503" in lowered: + console.print("[red]Unable to list Ray jobs:[/red] Ray service unavailable (503).") + console.print( + "[yellow]Hint:[/yellow] Check runtimes/operator health and Ray provider registration." + ) + else: + console.print(f"[red]Unable to list Ray jobs:[/red] {message}") + + raise typer.Exit(code=1) @jobs_app.command(name="status") diff --git a/datalayer_core/cli/commands/runtimes.py b/datalayer_core/cli/commands/runtimes.py deleted file mode 100644 index 7a0de637..00000000 --- a/datalayer_core/cli/commands/runtimes.py +++ /dev/null @@ -1,312 +0,0 @@ -# Copyright (c) 2023-2025 Datalayer, Inc. -# Distributed under the terms of the Modified BSD License. 
- -"""Runtime commands for Datalayer CLI.""" - -from typing import Optional - -import typer -from rich.console import Console - -from datalayer_core.client.client import DatalayerClient -from datalayer_core.displays.runtimes import display_runtimes -from datalayer_core.utils.urls import DatalayerURLs - -# Create a Typer app for runtime commands -app = typer.Typer( - name="runtimes", help="Runtime management commands", invoke_without_command=True -) - -console = Console() - - -@app.callback() -def runtimes_callback(ctx: typer.Context) -> None: - """Runtime management commands.""" - if ctx.invoked_subcommand is None: - typer.echo(ctx.get_help()) - - -def _make_client( - token: Optional[str] = None, - iam_url: Optional[str] = None, - runtimes_url: Optional[str] = None, -) -> DatalayerClient: - """Create a DatalayerClient with optional runtimes URL override.""" - urls = DatalayerURLs.from_environment(iam_url=iam_url, runtimes_url=runtimes_url) - return DatalayerClient(urls=urls, token=token) - - -@app.command(name="ls") -def list_runtimes( - token: Optional[str] = typer.Option( - None, - "--token", - help="Authentication token (Bearer token for API requests).", - ), - iam_url: Optional[str] = typer.Option( - None, - "--iam-url", - help="Datalayer IAM server URL", - ), - runtimes_url: Optional[str] = typer.Option( - None, - "--runtimes-url", - help="Datalayer Runtimes server URL", - ), -) -> None: - """List running runtimes.""" - try: - client = _make_client( - token=token, - iam_url=iam_url, - runtimes_url=runtimes_url, - ) - runtimes = client.list_runtimes() - - # Convert to dict format for display_runtimes - runtime_dicts = [] - for runtime in runtimes: - runtime_dicts.append( - { - "given_name": runtime.name, - "environment_name": runtime.environment, - "pod_name": runtime.pod_name, - "ingress": runtime.ingress, - "reservation_id": runtime.reservation_id, - "uid": runtime.uid, - "burning_rate": runtime.burning_rate, - "token": runtime.jupyter_token, - "started_at": 
runtime.started_at, - "expired_at": runtime.expired_at, - } - ) - - display_runtimes(runtime_dicts) - - except Exception as e: - console.print(f"[red]Error listing runtimes: {e}[/red]") - raise typer.Exit(1) - - -@app.command(name="create") -def create_runtime( - environment: Optional[str] = typer.Argument(None, help="Environment name"), - given_name: Optional[str] = typer.Option( - None, - "--given-name", - help="Custom name for the runtime", - ), - credits_limit: Optional[float] = typer.Option( - None, - "--credits-limit", - help="Maximum amount of credits that can be consumed by the runtime", - ), - time_reservation: Optional[float] = typer.Option( - 10.0, - "--time-reservation", - help="Time reservation in minutes for the runtime", - ), - billable_account_uid: Optional[str] = typer.Option( - None, - "--billable-account-uid", - help="Account UID to bill the runtime to (org/team). Defaults to the authenticated user.", - ), - billable_account_type: Optional[str] = typer.Option( - None, - "--billable-account-type", - help="Billable account type: user, organization, or team.", - ), - billable_account_handle: Optional[str] = typer.Option( - None, - "--billable-account-handle", - help="Billable account handle (informational).", - ), - token: Optional[str] = typer.Option( - None, - "--token", - help="Authentication token (Bearer token for API requests).", - ), - iam_url: Optional[str] = typer.Option( - None, - "--iam-url", - help="Datalayer IAM server URL", - ), - runtimes_url: Optional[str] = typer.Option( - None, - "--runtimes-url", - help="Datalayer Runtimes server URL", - ), -) -> None: - """Create a new runtime.""" - import questionary - - try: - client = _make_client( - token=token, - iam_url=iam_url, - runtimes_url=runtimes_url, - ) - - if environment is None: - # List environments and let the user pick one - environments = client.list_environments() - if not environments: - console.print("[yellow]No environments available.[/yellow]") - raise typer.Exit(0) - - 
choices = [] - for env in environments: - label = env.name - if env.title: - label += f" ({env.title})" - choices.append(questionary.Choice(title=label, value=env.name)) - - selected = questionary.select( - "Select the environment for the new runtime:", - choices=choices, - ).ask() - - if selected is None: - raise typer.Exit(0) - environment = selected - - # Create runtime - final_time_reservation = time_reservation or 10.0 - runtime = client.create_runtime( - name=given_name, - environment=environment, - time_reservation=final_time_reservation, - billable_account_uid=billable_account_uid, - billable_account_type=billable_account_type, - billable_account_handle=billable_account_handle, - ) - - console.print( - f"Runtime will use credits limit: {(runtime.burning_rate or 0.0) * 60.0 * final_time_reservation:.2f}" - ) - console.print(f"Runtime created successfully: {runtime.name}") - console.print(f"[green]Runtime '{runtime.name}' created successfully![/green]") - - except typer.Exit: - raise - except Exception as e: - console.print(f"[red]Error creating runtime: {e}[/red]") - raise typer.Exit(1) - - -@app.command(name="terminate") -def terminate_runtime( - pod_name: Optional[str] = typer.Argument( - None, help="Pod name of the runtime to terminate" - ), - token: Optional[str] = typer.Option( - None, - "--token", - help="Authentication token (Bearer token for API requests).", - ), - iam_url: Optional[str] = typer.Option( - None, - "--iam-url", - help="Datalayer IAM server URL", - ), - runtimes_url: Optional[str] = typer.Option( - None, - "--runtimes-url", - help="Datalayer Runtimes server URL", - ), -) -> None: - """Terminate a running runtime.""" - import questionary - - try: - client = _make_client( - token=token, - iam_url=iam_url, - runtimes_url=runtimes_url, - ) - - if pod_name is None: - # List runtimes and let the user pick one - runtimes = client.list_runtimes() - if not runtimes: - console.print("[yellow]No running runtimes found.[/yellow]") - raise 
typer.Exit(0) - - choices = [] - for rt in runtimes: - label = rt.pod_name or "" - if rt.name: - label = f"{rt.pod_name} ({rt.name})" - if rt.environment: - label += f" [{rt.environment}]" - choices.append(questionary.Choice(title=label, value=rt.pod_name)) - - selected = questionary.select( - "Select the runtime to terminate:", - choices=choices, - ).ask() - - if selected is None: - # User cancelled (Ctrl-C / Esc) - raise typer.Exit(0) - pod_name = selected - - success = client.terminate_runtime(pod_name) - - if success: - console.print( - f"[green]Runtime '{pod_name}' terminated successfully![/green]" - ) - else: - console.print(f"[red]Failed to terminate runtime '{pod_name}'[/red]") - raise typer.Exit(1) - - except typer.Exit: - raise - except Exception as e: - console.print(f"[red]Error terminating runtime: {e}[/red]") - raise typer.Exit(1) - - -# Root level commands for convenience -def runtimes_list( - token: Optional[str] = typer.Option( - None, - "--token", - help="Authentication token (Bearer token for API requests).", - ), - iam_url: Optional[str] = typer.Option( - None, - "--iam-url", - help="Datalayer IAM server URL", - ), - runtimes_url: Optional[str] = typer.Option( - None, - "--runtimes-url", - help="Datalayer Runtimes server URL", - ), -) -> None: - """List running runtimes (root command).""" - list_runtimes(token=token, iam_url=iam_url, runtimes_url=runtimes_url) - - -def runtimes_ls( - token: Optional[str] = typer.Option( - None, - "--token", - help="Authentication token (Bearer token for API requests).", - ), - iam_url: Optional[str] = typer.Option( - None, - "--iam-url", - help="Datalayer IAM server URL", - ), - runtimes_url: Optional[str] = typer.Option( - None, - "--runtimes-url", - help="Datalayer Runtimes server URL", - ), -) -> None: - """List running runtimes (root command alias).""" - list_runtimes(token=token, iam_url=iam_url, runtimes_url=runtimes_url) diff --git a/datalayer_core/cli/commands/schedules.py 
b/datalayer_core/cli/commands/schedules.py new file mode 100644 index 00000000..5e3d9cc1 --- /dev/null +++ b/datalayer_core/cli/commands/schedules.py @@ -0,0 +1,138 @@ +# Copyright (c) 2023-2025 Datalayer, Inc. +# Distributed under the terms of the Modified BSD License. + +# Copyright (c) 2023-2026 Datalayer, Inc. +# Distributed under the terms of the Modified BSD License. + +"""Schedule commands for Datalayer CLI.""" + +from __future__ import annotations + +import os +from typing import Any, Optional + +import requests +import typer +from rich.console import Console +from rich.table import Table + +from datalayer_core.utils.urls import DatalayerURLs + + +app = typer.Typer( + name="schedules", + help="Scheduler management commands.", + invoke_without_command=True, +) + +console = Console() + + +@app.callback() +def schedules_callback(ctx: typer.Context) -> None: + """Scheduler management commands.""" + if ctx.invoked_subcommand is None: + typer.echo(ctx.get_help()) + + +def _resolve_token(token: Optional[str] = None) -> str: + if token: + return token + env_token = os.environ.get("DATALAYER_API_KEY") + if env_token: + return env_token + try: + from datalayer_core.client.client import DatalayerClient + + client = DatalayerClient() + return client._get_token() or "" + except Exception: + return "" + + +def _fetch_scheduler( + *, + path: str, + token: Optional[str] = None, + scheduler_url: Optional[str] = None, +) -> dict[str, Any]: + resolved_token = _resolve_token(token) + if not resolved_token: + raise RuntimeError( + "No authentication token found. Pass --token, set DATALAYER_API_KEY, or run 'datalayer login'." 
+ ) + + urls = DatalayerURLs.from_environment(scheduler_url=scheduler_url) + url = f"{urls.scheduler_url}/api/scheduler/v1{path}" + headers = {"Authorization": f"Bearer {resolved_token}"} + + response = requests.get(url, headers=headers, timeout=30) + response.raise_for_status() + data = response.json() if response.content else {} + if not isinstance(data, dict): + raise RuntimeError("Unexpected scheduler response payload.") + return data + + +def _render_schedules(schedules: list[dict[str, Any]]) -> None: + table = Table(title="Schedules") + table.add_column("UID", style="cyan") + table.add_column("Notebook UID") + table.add_column("Cron") + table.add_column("Preset") + table.add_column("Enabled") + table.add_column("Next Planned") + + for schedule in schedules: + table.add_row( + str(schedule.get("uid", "")), + str(schedule.get("notebook_uid_s", "")), + str(schedule.get("cron_expression_s", "")), + str(schedule.get("preset_s", "")), + "yes" if bool(schedule.get("enabled_b", True)) else "no", + str(schedule.get("next_planned_ts_dt", "")), + ) + console.print(table) + + +def _render_runs(runs: list[dict[str, Any]]) -> None: + table = Table(title="Schedule Runs") + table.add_column("UID", style="cyan") + table.add_column("Schedule UID") + table.add_column("Notebook UID") + table.add_column("State") + table.add_column("Success") + table.add_column("Planned") + table.add_column("Executed") + + for run in runs: + table.add_row( + str(run.get("uid", "")), + str(run.get("schedule_uid_s", "")), + str(run.get("notebook_uid_s", "")), + str(run.get("state_s", "")), + str(run.get("success_b", "")), + str(run.get("planned_ts_dt", "")), + str(run.get("executed_ts_dt", "")), + ) + console.print(table) + + +@app.command(name="ls") +def list_schedules( + runs: bool = typer.Option(False, "--runs", help="List schedule runs instead of schedule definitions."), + token: Optional[str] = typer.Option(None, "--token", help="Authentication token."), + scheduler_url: Optional[str] = 
typer.Option(None, "--scheduler-url", help="Datalayer Scheduler service URL."), +) -> None: + """List scheduler definitions or scheduler runs.""" + try: + if runs: + payload = _fetch_scheduler(path="/schedules/runs", token=token, scheduler_url=scheduler_url) + _render_runs(payload.get("runs") or []) + return + + payload = _fetch_scheduler(path="/schedules", token=token, scheduler_url=scheduler_url) + _render_schedules(payload.get("schedules") or []) + except Exception as exc: + console.print(f"[red]Error listing schedules: {exc}[/red]") + raise typer.Exit(1) diff --git a/datalayer_core/cli/commands/tokens.py b/datalayer_core/cli/commands/tokens.py deleted file mode 100644 index 3d7d50f4..00000000 --- a/datalayer_core/cli/commands/tokens.py +++ /dev/null @@ -1,174 +0,0 @@ -# Copyright (c) 2023-2025 Datalayer, Inc. -# Distributed under the terms of the Modified BSD License. - -"""Token commands for Datalayer CLI.""" - -from typing import Optional - -import typer -from rich.console import Console - -from datalayer_core.client.client import DatalayerClient -from datalayer_core.displays.tokens import display_tokens -from datalayer_core.models.token import TokenType - -# Create a Typer app for token commands -app = typer.Typer( - name="tokens", help="Token management commands", invoke_without_command=True -) - -console = Console() - - -@app.callback() -def tokens_callback(ctx: typer.Context) -> None: - """Token management commands.""" - if ctx.invoked_subcommand is None: - typer.echo(ctx.get_help()) - - -@app.command(name="ls") -def list_tokens( - token: Optional[str] = typer.Option( - None, - "--token", - help="Authentication token (Bearer token for API requests).", - ), -) -> None: - """List all tokens.""" - try: - client = DatalayerClient(token=token) - tokens = client.list_tokens() - - # Convert to dict format for display_tokens - token_dicts = [] - for token in tokens: - token_dicts.append( - { - "uid": token.uid, - "name_s": token.name, - "description_t": 
token.description, - "variant_s": token.token_type, - } - ) - - display_tokens(token_dicts) - - except Exception as e: - console.print(f"[red]Error listing tokens: {e}[/red]") - raise typer.Exit(1) - - -@app.command(name="create") -def create_token( - name: str = typer.Argument(..., help="Name of the token"), - description: str = typer.Argument(..., help="Description of the token"), - expiration_date: Optional[int] = typer.Option( - 0, - "--expiration-date", - help="Expiration date in seconds since epoch (0 for no expiration)", - ), - token_type: str = typer.Option( - TokenType.USER, - "--token-type", - help="Type of the token (user, admin)", - ), - token: Optional[str] = typer.Option( - None, - "--token", - help="Authentication token (Bearer token for API requests).", - ), -) -> None: - """Create a new token.""" - try: - client = DatalayerClient(token=token) - - result = client.create_token( - name=name, - description=description, - expiration_date=expiration_date or 0, - token_type=token_type, - ) - - if result.get("success", False): - token_data = result.get("token", {}) - console.print(f"[green]Token '{name}' created successfully![/green]") - console.print( - f"[yellow]Token value: {result.get('access_token', 'N/A')}[/yellow]" - ) - console.print( - "[dim]Please save this token value securely - it won't be shown again![/dim]" - ) - - # Display the created token info - if token_data: - display_tokens( - [ - { - "uid": token_data.get("uid"), - "name_s": token_data.get("name_s", name), - "description_t": token_data.get( - "description_t", description - ), - "variant_s": token_data.get("variant_s", token_type), - } - ] - ) - else: - console.print( - f"[red]Failed to create token: {result.get('message', 'Unknown error')}[/red]" - ) - raise typer.Exit(1) - - except Exception as e: - console.print(f"[red]Error creating token: {e}[/red]") - raise typer.Exit(1) - - -@app.command(name="delete") -def delete_token( - uid: str = typer.Argument(..., help="UID of the token to 
delete"), - token: Optional[str] = typer.Option( - None, - "--token", - help="Authentication token (Bearer token for API requests).", - ), -) -> None: - """Delete a token.""" - try: - client = DatalayerClient(token=token) - - success = client.delete_token(uid) - - if success: - console.print(f"[green]Token '{uid}' deleted successfully![/green]") - else: - console.print(f"[red]Failed to delete token '{uid}'[/red]") - raise typer.Exit(1) - - except Exception as e: - console.print(f"[red]Error deleting token: {e}[/red]") - raise typer.Exit(1) - - -# Root level commands for convenience -def tokens_list( - token: Optional[str] = typer.Option( - None, - "--token", - help="Authentication token (Bearer token for API requests).", - ), -) -> None: - """List all tokens (root command).""" - list_tokens(token=token) - - -def tokens_ls( - token: Optional[str] = typer.Option( - None, - "--token", - help="Authentication token (Bearer token for API requests).", - ), -) -> None: - """List all tokens (root command alias).""" - list_tokens(token=token) diff --git a/datalayer_core/cli/commands/usage.py b/datalayer_core/cli/commands/usage.py index accd4316..34a85548 100644 --- a/datalayer_core/cli/commands/usage.py +++ b/datalayer_core/cli/commands/usage.py @@ -4,6 +4,7 @@ """Usage/credits commands for Datalayer CLI.""" from datetime import datetime, timezone +import os from typing import Any, Optional import typer @@ -130,7 +131,8 @@ def usage_records( help="Datalayer IAM server URL", ), billable_account_uid: Optional[str] = typer.Option( - None, + os.environ.get("DATALAYER_ACCOUNT_UID") + or os.environ.get("DATALAYER_BILLABLE_ACCOUNT_UID"), "--billable-account-uid", help="Optional account UID scope. 
Defaults to the authenticated account.", ), @@ -268,7 +270,8 @@ def usage_reservations( help="Optional reservation type filter.", ), billable_account_uid: Optional[str] = typer.Option( - None, + os.environ.get("DATALAYER_ACCOUNT_UID") + or os.environ.get("DATALAYER_BILLABLE_ACCOUNT_UID"), "--billable-account-uid", help="Optional account UID scope for fallback credits view.", ), diff --git a/datalayer_core/client/client.py b/datalayer_core/client/client.py index 8bd226fa..949433ea 100644 --- a/datalayer_core/client/client.py +++ b/datalayer_core/client/client.py @@ -14,6 +14,8 @@ from functools import lru_cache from typing import Any, Optional, Union +from jupyter_kernel_client import KernelClient + from datalayer_core.mixins.authn import AuthnMixin from datalayer_core.mixins.environments import EnvironmentsMixin from datalayer_core.mixins.evals import EvalsMixin @@ -22,16 +24,16 @@ from datalayer_core.mixins.sandbox_snapshots import SandboxSnapshotsMixin from datalayer_core.mixins.runtimes import RuntimesMixin from datalayer_core.mixins.secrets import SecretsMixin -from datalayer_core.mixins.tokens import TokensMixin +from datalayer_core.mixins.api_keys import ApiKeysMixin from datalayer_core.mixins.usage import UsageMixin from datalayer_core.mixins.whoami import WhoamiAppMixin from datalayer_core.models import UserModel +from datalayer_core.models.api_key import ApiKeyModel, ApiKeyType from datalayer_core.models.environment import EnvironmentModel from datalayer_core.models.sandbox_snapshot import SandboxSnapshotModel from datalayer_core.models.secret import SecretModel, SecretVariant -from datalayer_core.models.token import TokenModel, TokenType -from datalayer_core.runtimes.runtime import RuntimeService -from datalayer_core.runtimes.sandbox_snapshot import ( +from datalayer_core.agents.agent_cloud import RuntimeService +from datalayer_core.sandboxes.code_sandbox_snapshots import ( as_code_sandbox_snapshots, create_snapshot, ) @@ -54,7 +56,7 @@ class 
DatalayerClient( RayMixin, SecretsMixin, SandboxSnapshotsMixin, - TokensMixin, + ApiKeysMixin, UsageMixin, WhoamiAppMixin, ): @@ -269,6 +271,7 @@ def create_runtime( billable_account_uid: Optional[str] = None, billable_account_type: Optional[str] = None, billable_account_handle: Optional[str] = None, + api_key: Optional[str] = None, ) -> RuntimeService: """ Create a new runtime (kernel) for code execution. @@ -312,6 +315,10 @@ def create_runtime( # print(f"Runtime {name}") + client_for_request = self + if api_key: + client_for_request = DatalayerClient(urls=self._urls, token=api_key) + if snapshot_name is not None: snapshots = self.list_snapshots() snapshot_uid = None @@ -325,7 +332,7 @@ def create_runtime( f"Snapshot '{snapshot_name}' not found. Available snapshots: {[s.name for s in snapshots]}" ) - response = self._create_runtime( + response = client_for_request._create_runtime( given_name=name, environment_name=environment, from_snapshot_uid=snapshot_uid, @@ -338,7 +345,7 @@ def create_runtime( ) else: # Create runtime without snapshot - response = self._create_runtime( + response = client_for_request._create_runtime( given_name=name, environment_name=environment, agent_spec_id=agent_spec_id, @@ -374,7 +381,7 @@ def create_runtime( environment=runtime_data["environment_name"], run_url=self._urls.run_url, iam_url=self._urls.iam_url, - token=self._token, + token=api_key or self._token, ingress=runtime_data["ingress"], jupyter_token=runtime_data["token"], pod_name=runtime_data["pod_name"], @@ -434,7 +441,11 @@ def list_runtimes(self) -> list[RuntimeService]: ) return runtime_services - def terminate_runtime(self, runtime: Union[RuntimeService, str]) -> bool: + def terminate_runtime( + self, + runtime: Union[RuntimeService, str], + api_key: Optional[str] = None, + ) -> bool: """ Terminate a running Runtime. 
@@ -450,6 +461,9 @@ def terminate_runtime(self, runtime: Union[RuntimeService, str]) -> bool: """ pod_name = runtime.pod_name if isinstance(runtime, RuntimeService) else runtime if pod_name is not None: + if api_key: + client_for_request = DatalayerClient(urls=self._urls, token=api_key) + return client_for_request._terminate_runtime(pod_name).get("success", False) return self._terminate_runtime(pod_name)["success"] else: return False @@ -539,6 +553,111 @@ def update_runtime( raise RuntimeError(f"Failed to update runtime '{pod_name}': {message}") return True + def check_runtime_health( + self, + runtime: Union[RuntimeService, str], + probe_code: str = "print('datalayer runtime health probe')", + timeout: float = 20.0, + api_key: Optional[str] = None, + ) -> dict[str, Any]: + """Check runtime reachability and execute a probe on the sandbox. + + Parameters + ---------- + runtime : Union[RuntimeService, str] + Runtime object or runtime identifier (pod name/uid/name). + probe_code : str + Python code to execute as health probe on the sandbox. + timeout : float + Probe execution timeout in seconds. + api_key : Optional[str] + Optional API key override used for runtime lookup. + + Returns + ------- + dict[str, Any] + Health result with success flag and diagnostics. 
+ """ + client_for_request = self + if api_key: + client_for_request = DatalayerClient(urls=self._urls, token=api_key) + + runtime_service = ( + runtime if isinstance(runtime, RuntimeService) else client_for_request.get_runtime(runtime) + ) + + endpoint = str(runtime_service.ingress or "").rstrip("/") + runtime_token = str( + runtime_service.jupyter_token + or client_for_request._get_token() + or "" + ).strip() + + result: dict[str, Any] = { + "success": False, + "runtime_uid": runtime_service.uid, + "runtime_pod_name": runtime_service.pod_name, + "runtime_name": runtime_service.name, + "ingress": endpoint, + "probe_mode": "sandbox_execute_code", + } + + if not endpoint: + result["message"] = "runtime ingress is missing" + return result + if not runtime_token: + result["message"] = "runtime token is missing" + return result + + kernel_client: Optional[KernelClient] = None + try: + kernel_client = KernelClient(server_url=endpoint, token=runtime_token) + kernel_client.start() + reply = kernel_client.execute(probe_code, timeout=timeout) + outputs = reply.get("outputs", []) + if not isinstance(outputs, list): + outputs = [] + + error_outputs = [ + output + for output in outputs + if isinstance(output, dict) + and str(output.get("output_type") or "") == "error" + ] + + if error_outputs: + first_error = error_outputs[0] + result["message"] = "sandbox probe execution failed" + result["error_name"] = first_error.get("ename") + result["error_value"] = first_error.get("evalue") + traceback_lines = first_error.get("traceback") + if isinstance(traceback_lines, list): + result["traceback_tail"] = "\n".join( + [str(line) for line in traceback_lines if line is not None] + )[-4000:] + return result + + stream_text_parts = [] + for output in outputs: + if not isinstance(output, dict): + continue + if str(output.get("output_type") or "") == "stream": + stream_text_parts.append(str(output.get("text") or "")) + + result["success"] = True + result["message"] = "runtime reachable and 
sandbox probe executed" + result["stdout_tail"] = "".join(stream_text_parts)[-1000:] + return result + except Exception as exc: + result["message"] = f"runtime health probe exception: {exc}" + return result + finally: + if kernel_client is not None: + try: + kernel_client.stop() + except Exception: + pass + def list_secrets(self) -> list[SecretModel]: """ List all secrets available in the Datalayer environment. @@ -736,76 +855,77 @@ def delete_snapshot( ) return self._delete_snapshot(snapshot_uid) - def create_token( + def create_api_key( self, name: str, description: str, expiration_date: int = 0, - token_type: Union[str, TokenType] = TokenType.USER, + api_key_type: Union[str, ApiKeyType] = ApiKeyType.SECRET, ) -> dict[str, Any]: """ - Create a new token. + Create a new API key. Parameters ---------- name : str - Name of the token. + Name of the API key. description : str - Description of the token. + Description of the API key. expiration_date : int, default 0 - Expiration date of the token in seconds since epoch. - token_type : Union[str, TokenType], default TokenType.USER - Type of the token (e.g., "user", "admin"). + Expiration date of the API key in seconds since epoch. + api_key_type : Union[str, ApiKeyType], default ApiKeyType.SECRET + Type of the API key (secret, publishable, restricted, temporary). Returns ------- dict[str, Any] - A dictionary containing the created token and its details. + A dictionary containing the created API key and its details. """ - return self._create_token( + return self._create_api_key( name=name, description=description, expiration_date=expiration_date, - token_type=token_type, + api_key_type=api_key_type, ) - def list_tokens(self) -> list[TokenModel]: + def list_api_keys(self) -> list[ApiKeyModel]: """ - List all tokens. + List all API keys. Returns ------- - list[Token] - A list of tokens associated with the user. 
- """ - response = self._list_tokens() - if response.get("success") and "tokens" in response: - token_objects = [] - for token_data in response["tokens"]: - token = TokenModel( - uid=token_data["uid"], - name=token_data.get("name_s", ""), - description=token_data.get("description_t", ""), - token_type=token_data.get("variant_s", "user"), + list[ApiKeyModel] + A list of API keys associated with the user. + """ + response = self._list_api_keys() + if response.get("success"): + payload = response.get("api_keys", response.get("tokens", [])) + api_key_objects = [] + for api_key_data in payload: + api_key = ApiKeyModel( + uid=api_key_data["uid"], + name=api_key_data.get("name_s", ""), + description=api_key_data.get("description_t", ""), + api_key_type=api_key_data.get("variant_s", "secret"), ) - token_objects.append(token) - return token_objects + api_key_objects.append(api_key) + return api_key_objects return [] - def delete_token(self, token: Union[str, TokenModel]) -> bool: + def delete_api_key(self, api_key: Union[str, ApiKeyModel]) -> bool: """ - Delete a specific token. + Delete a specific API key. Parameters ---------- - token : Union[str, Token] - Token object or UID string to delete. + api_key : Union[str, ApiKeyModel] + API key object or UID string to delete. Returns ------- bool The result of the deletion operation. 
""" - token_uid = token.uid if isinstance(token, TokenModel) else token - response = self._delete_token(token_uid) + api_key_uid = api_key.uid if isinstance(api_key, ApiKeyModel) else api_key + response = self._delete_api_key(api_key_uid) return response.get("success", False) diff --git a/datalayer_core/console/consoleapp.py b/datalayer_core/console/consoleapp.py index e665c4a1..a666813b 100644 --- a/datalayer_core/console/consoleapp.py +++ b/datalayer_core/console/consoleapp.py @@ -41,7 +41,7 @@ aliases = dict(datalayer_aliases) aliases.update( { - "runtime": "RuntimesConsoleApp.runtime_name", + "agent": "RuntimesConsoleApp.runtime_name", } ) diff --git a/datalayer_core/console/manager.py b/datalayer_core/console/manager.py index 8b3c6edc..136fc9d0 100644 --- a/datalayer_core/console/manager.py +++ b/datalayer_core/console/manager.py @@ -57,6 +57,10 @@ def __init__( _ = kwargs.pop("kernel_id", None) # kernel_id not supported super().__init__(server_url="", token="", username=username, **kwargs) self._kernel_id = "" + self.runtime_uid = "" + self.runtime_name = "" + self.runtime_pod_name = "" + self.runtime_created_in_start = False self.run_url = run_url self.run_token = token self.username = username @@ -114,145 +118,256 @@ def start_kernel( "A kernel is already started. Shutdown it before starting a new one." ) + # Reset per-start state markers. + self.runtime_created_in_start = False + runtime_name = name runtime = None - # Use DatalayerClient to get runtime information - if runtime_name: - # Get specific runtime by name - runtimes = self._client.list_runtimes() - for r in runtimes: - if r.name == runtime_name: - runtime = { - "pod_name": r.pod_name, - "ingress": r.ingress, - "token": r.jupyter_token, - "expired_at": r.expired_at, - } - break - else: + # Use DatalayerClient to get runtime information. + runtimes = self._client.list_runtimes() + + if not runtime_name: self.log.debug( - "No Runtime name provided. 
Picking the first available Runtime…" + "No Agent name provided. Picking the first available Agent…" ) - # Get list of available runtimes - runtimes = self._client.list_runtimes() - - # If no runtime is running, let the user decide to start one from the first environment if not runtimes: - environments = self._client.list_environments() - if not environments: + # Historical behaviour: when no Agent is running, offer to + # launch one from the first available environment instead of + # failing outright. + launched = self._prompt_and_launch_agent() + if launched is None: raise RuntimeError( - "No environments available to create a runtime from." + "No Agent running. Start one first with: " + "`d agents create --time-reservation 10`" ) + runtimes = [launched] + + selected = self._pick_accessible_runtime(runtimes) + + if selected is None: + # The accessibility probe is best-effort (a freshly launched + # Agent may still be warming up its ingress). Fall back to the + # first listed Agent and let `_ensure_kernel_id` retry until the + # kernel endpoint is reachable. 
+ selected = runtimes[0] + + runtime_name = selected.name or selected.uid or selected.pod_name or "" + self.runtime_uid = str(selected.uid or "") + self.runtime_name = str(selected.name or runtime_name or "") + self.runtime_pod_name = str(selected.pod_name or "") + runtime = { + "pod_name": selected.pod_name, + "ingress": selected.ingress, + "token": selected.jupyter_token or self.run_token, + "expired_at": selected.expired_at, + } + else: + selected = None + for r in runtimes: + if r.name == runtime_name or r.uid == runtime_name: + selected = r + break + if selected is None: + raise RuntimeError(f"Agent '{runtime_name}' not found") + self.runtime_uid = str(selected.uid or "") + self.runtime_name = str(selected.name or runtime_name or "") + self.runtime_pod_name = str(selected.pod_name or "") + runtime = { + "pod_name": selected.pod_name, + "ingress": selected.ingress, + "token": selected.jupyter_token or self.run_token, + "expired_at": selected.expired_at, + } - first_environment = environments[0] - first_environment_name = first_environment.name + if runtime is None: + raise RuntimeError("Unable to find an Agent.") - # Calculate credits limit based on environment - credits_limit = ( - first_environment.burning_rate * 60.0 * 10.0 - ) # 10 minutes default + self.server_url = runtime["ingress"] + self.token = runtime.get("token", "") - user_input = ( - input( - f"No Runtime running.\nDo you want to launch a runtime from the environment {first_environment_name} with {credits_limit:.2f} reserved credits? (yes/no) [default: yes]: " - ) - or "yes" - ) - if user_input.lower() != "yes": - raise RuntimeError( - "No Runtime running. Please start one Runtime using `datalayer runtimes create `." - ) + # Ensure runtime endpoint is ready and a usable kernel exists. 
+ self._kernel_id = self._ensure_kernel_id() - # Create new runtime using the client - new_runtime = self._client.create_runtime( - name=f"console-runtime-{first_environment_name}", - environment=first_environment_name, - time_reservation=10.0, # 10 minutes default - ) + kernel_model = self.refresh_model() + msg = f"RuntimeManager using existing Agent {runtime_name}" + expired_at = runtime.get("expired_at") + if expired_at is not None: + msg += f" expiring at {timestamp_to_local_date(expired_at)}" + self.log.info(msg) - # Start the runtime to get connection details - new_runtime._start() - - runtime = { - "pod_name": new_runtime.pod_name, - "ingress": new_runtime.ingress, - "token": new_runtime.jupyter_token, - "expired_at": new_runtime.expired_at, - } - - # Display the created runtime - runtime_dict = { - "given_name": new_runtime.name, - "environment_name": new_runtime.environment, - "pod_name": new_runtime.pod_name, - "ingress": new_runtime.ingress, - "reservation_id": getattr(new_runtime, "reservation_id", ""), - "uid": new_runtime.uid, - "burning_rate": getattr(new_runtime, "burning_rate", 0.0), - "token": new_runtime.jupyter_token, - "started_at": getattr(new_runtime, "started_at", ""), - "expired_at": new_runtime.expired_at, - } - display_runtimes([runtime_dict]) - - # Refresh runtime list - runtimes = self._client.list_runtimes() - - # Use the first available runtime - if runtimes: - r = runtimes[0] - runtime = { - "pod_name": r.pod_name, - "ingress": r.ingress, - "token": r.jupyter_token, - "expired_at": r.expired_at, - } - runtime_name = r.pod_name or "" + return kernel_model - if runtime is None: - raise RuntimeError("Unable to find a Runtime.") + def _pick_accessible_runtime(self, runtimes: list[Any]) -> Optional[Any]: + """Return first runtime that responds on /api/kernels with its runtime token.""" + for runtime in runtimes: + if self._runtime_is_accessible(runtime): + return runtime + return None - self.server_url = runtime["ingress"] - self.token = 
runtime.get("token", "") + def _prompt_and_launch_agent(self) -> Optional[Any]: + """Offer to launch an Agent from the first environment when none is running. + + Mirrors the historical console behaviour: if no Agent is running, ask + the user whether to create one from the first available environment with + a default 10-minute reservation. Returns the launched runtime (waiting + until it is listed and reachable), or ``None`` when the user declines. + """ + environments = self._client.list_environments() + if not environments: + raise RuntimeError("No environments available to create an Agent from.") + + first_environment = environments[0] + first_environment_name = first_environment.name + + # Default 10-minute reservation; estimate the reserved credits to inform + # the user before they confirm. + burning_rate = float(getattr(first_environment, "burning_rate", 0.0) or 0.0) + credits_limit = burning_rate * 60.0 * 10.0 + + prompt = ( + "No Agent running.\n" + f"Do you want to launch an Agent from the environment " + f"{first_environment_name} with {credits_limit:.2f} reserved credits? " + "(yes/no) [default: yes]: " + ) + try: + answer = (input(prompt) or "yes").strip().lower() + except EOFError: + answer = "yes" + if answer not in ("y", "yes"): + return None + + new_runtime = self._client.create_runtime( + name=f"console-agent-{first_environment_name}", + environment=first_environment_name, + time_reservation=10.0, + ) + + # Surface the freshly created Agent, mirroring the historical flow. 
+ try: + display_runtimes( + [ + { + "given_name": new_runtime.name, + "environment_name": new_runtime.environment, + "pod_name": new_runtime.pod_name, + "ingress": new_runtime.ingress, + "reservation_id": getattr(new_runtime, "reservation_id", ""), + "uid": new_runtime.uid, + "burning_rate": getattr(new_runtime, "burning_rate", 0.0), + "token": new_runtime.jupyter_token, + "started_at": getattr(new_runtime, "started_at", ""), + "expired_at": new_runtime.expired_at, + } + ] + ) + except Exception: + pass + + # Wait until the launched Agent is listed and reachable before using it, + # falling back to the freshly created handle if the probe times out. + launched = self._wait_for_listed_accessible_runtime(str(new_runtime.uid or "")) + return launched or new_runtime + + def _wait_for_listed_accessible_runtime(self, preferred_uid: str) -> Optional[Any]: + """Wait for a launched runtime to be listed and reachable before use.""" + attempts = 30 + for _ in range(attempts): + runtimes = self._client.list_runtimes() + + if preferred_uid: + for runtime in runtimes: + if str(runtime.uid or "") == preferred_uid and self._runtime_is_accessible(runtime): + return runtime + + selected = self._pick_accessible_runtime(runtimes) + if selected is not None: + return selected + + time.sleep(1.0) + + return None + + def _runtime_is_accessible(self, runtime: Any) -> bool: + """Best-effort HTTP accessibility check for runtime ingress and token.""" + ingress = str(getattr(runtime, "ingress", "") or "").rstrip("/") + token = str(getattr(runtime, "jupyter_token", "") or self.run_token or "") + if not ingress or not token: + return False - # Get runtime information. from datalayer_core.utils.network import fetch - response = None - max_attempts = 4 + try: + fetch(f"{ingress}/api/kernels", token=token, timeout=10) + return True + except Exception: + return False + + def _ensure_kernel_id(self) -> str: + """Return the runtime's existing kernel id. 
+ + Datalayer runtimes are provisioned with a kernel already running and + wired to the runtime ingress. We must connect to that existing kernel + instead of creating a new one: a freshly created kernel id is not the + one the ingress routes to, which leads to no execution output and 404 + responses on kernel endpoints (e.g. /interrupt). + """ + from datalayer_core.utils.network import fetch + + kernels_url = f"{self.server_url.rstrip('/')}/api/kernels" + max_attempts = 30 + last_error: Exception | None = None for attempt in range(1, max_attempts + 1): try: - response = fetch(f"{self.server_url}/api/kernels", token=self.token) - break + response = fetch(kernels_url, token=self.token, timeout=20) + kernels = response.json() if response.content else [] + if isinstance(kernels, list) and kernels: + # Freshly launched runtimes can briefly expose stale kernel IDs + # in the list endpoint; verify a kernel can be read directly + # before selecting it. + ordered_kernels = sorted( + kernels, + key=lambda kernel: str((kernel or {}).get("id") or ""), + ) + for kernel in ordered_kernels: + kernel_id = str((kernel or {}).get("id") or "") + if not kernel_id: + continue + try: + fetch( + f"{kernels_url}/{kernel_id}", + token=self.token, + timeout=20, + ) + return kernel_id + except requests.exceptions.HTTPError as e: + status = ( + e.response.status_code + if getattr(e, "response", None) is not None + else None + ) + if status in (404, 410): + # Kernel disappeared while ingress was warming. 
+ continue + last_error = e + except requests.exceptions.ConnectionError as e: + last_error = e except requests.exceptions.HTTPError as e: status = ( e.response.status_code if getattr(e, "response", None) is not None else None ) - if status in (502, 503, 504) and attempt < max_attempts: - time.sleep(2 ** (attempt - 1)) - continue - raise - except requests.exceptions.ConnectionError: - if attempt < max_attempts: - time.sleep(2 ** (attempt - 1)) - continue - raise - - if response is None: - raise RuntimeError("Failed to query kernel endpoint for runtime") - - kernels = response.json() - if kernels: - self._kernel_id = kernels[0]["id"] - - kernel_model = self.refresh_model() - msg = f"RuntimeManager using existing runtime {runtime_name}" - expired_at = runtime.get("expired_at") - if expired_at is not None: - msg += f" expiring at {timestamp_to_local_date(expired_at)}" - self.log.info(msg) - - return kernel_model + if status not in (404, 502, 503, 504): + raise + last_error = e + except requests.exceptions.ConnectionError as e: + last_error = e + + # The kernel may still be registering on a freshly launched runtime. + time.sleep(1.0) + + raise RuntimeError( + f"Runtime has no available kernel at '{kernels_url}': {last_error}" + ) diff --git a/datalayer_core/displays/api_keys.py b/datalayer_core/displays/api_keys.py new file mode 100644 index 00000000..588cf9e0 --- /dev/null +++ b/datalayer_core/displays/api_keys.py @@ -0,0 +1,64 @@ +# Copyright (c) 2023-2025 Datalayer, Inc. +# Distributed under the terms of the Modified BSD License. + +"""Display functions for Datalayer core.""" + +from __future__ import annotations + +from rich.console import Console +from rich.table import Table + + +def _new_api_keys_table(title: str = "API Keys") -> Table: + """ + Create a new API keys table. + + Parameters + ---------- + title : str, default "API Keys" + The title for the table. + + Returns + ------- + Table + A rich Table configured for displaying API keys. 
+ """ + table = Table(title=title) + table.add_column("ID", style="cyan", no_wrap=True) + table.add_column("Name", style="cyan", no_wrap=True) + table.add_column("Variant", style="cyan", no_wrap=True) + return table + + +def _add_api_key_to_table(table: Table, api_key: dict[str, str]) -> None: + """ + Add an API key row to the table. + + Parameters + ---------- + table : Table + The rich Table to add the row to. + api_key : dict[str, str] + Dictionary containing API key information with keys: uid, name_s, description_t, variant_s. + """ + table.add_row( + api_key["uid"], + api_key["name_s"], + api_key["variant_s"], + ) + + +def display_api_keys(api_keys: list[dict[str, str]]) -> None: + """ + Display a list of API keys in the console. + + Parameters + ---------- + api_keys : list[dict[str, str]] + List of API key dictionaries to display. + """ + table = _new_api_keys_table(title="API Keys") + for api_key in api_keys: + _add_api_key_to_table(table, api_key) + console = Console() + console.print(table) diff --git a/datalayer_core/displays/environments.py b/datalayer_core/displays/environments.py index 312a8a37..f324abf4 100644 --- a/datalayer_core/displays/environments.py +++ b/datalayer_core/displays/environments.py @@ -6,64 +6,214 @@ from __future__ import annotations import json +import re from typing import Any from rich.console import Console -from rich.table import Table + + +def _description_to_text(description: str) -> str: + """Convert HTML/Markdown-like descriptions into readable plain text.""" + text = (description or "").strip() + if not text: + return "(no description)" + + normalized = text + normalized = re.sub(r"<\s*/\s*p\s*>", "\n\n", normalized, flags=re.IGNORECASE) + normalized = re.sub(r"<\s*p\s*>", "", normalized, flags=re.IGNORECASE) + normalized = re.sub(r"<\s*b\s*>", "", normalized, flags=re.IGNORECASE) + normalized = re.sub(r"<\s*/\s*b\s*>", "", normalized, flags=re.IGNORECASE) + normalized = re.sub(r"<[^>]+>", "", normalized) + # Strip 
lightweight markdown markers that look noisy in CLI tables. + normalized = re.sub(r"\*\*(.*?)\*\*", r"\1", normalized) + normalized = re.sub(r"__(.*?)__", r"\1", normalized) + normalized = re.sub(r"`([^`]*)`", r"\1", normalized) + normalized = re.sub(r"\[(.*?)\]\((.*?)\)", r"\1", normalized) + normalized = re.sub(r"^\s*#{1,6}\s*", "", normalized, flags=re.MULTILINE) + normalized = re.sub(r"\n{3,}", "\n\n", normalized) + normalized = normalized.strip() or "(no description)" + return normalized + + +def _truncate(value: str, width: int) -> str: + if width <= 0: + return "" + if len(value) <= width: + return value + if width == 1: + return "…" + return value[: width - 1] + "…" + +def _wrap_lines(text: str, width: int) -> list[str]: + """Wrap plain text into lines bounded by width, preserving explicit breaks.""" + if width <= 1: + return [text[:width]] if text else [""] + + wrapped: list[str] = [] + for raw_line in text.splitlines() or [""]: + line = raw_line.strip() + if not line: + wrapped.append("") + continue + + remaining = line + while len(remaining) > width: + cut = remaining.rfind(" ", 0, width + 1) + if cut <= 0: + cut = width + wrapped.append(remaining[:cut].rstrip()) + remaining = remaining[cut:].lstrip() + wrapped.append(remaining) + + lines = wrapped + if not lines: + return [""] + return lines + + +def _pad_cell(value: str, width: int, align_right: bool = False) -> str: + text = _truncate(value, width) + return text.rjust(width) if align_right else text.ljust(width) def display_environments(environments: list[dict[str, Any]]) -> None: - """ - Display a list of environments in the console. - - Parameters - ---------- - environments : list[dict[str, Any]] - List of environment dictionaries to display. 
- """ - table = _new_env_table() - for environment in environments: - _add_env_to_table(table, environment) + """Display environments with a full-width detail line per environment.""" console = Console() - console.print(table) - - -def _new_env_table() -> Table: - """ - Create a new table for displaying environments. - - Returns - ------- - Table - A configured Rich Table object for environments. - """ - table = Table(title="Environments") - table.add_column("ID", style="magenta", no_wrap=True) - table.add_column("Cost per seconds", justify="right", style="red", no_wrap=True) - table.add_column("Name", style="green", no_wrap=True) - table.add_column("Description", style="green", no_wrap=True) - table.add_column("Language", style="green", no_wrap=True) - table.add_column("Resources", justify="right", style="green", no_wrap=True) - return table - - -def _add_env_to_table(table: Table, environment: dict[str, Any]) -> None: - """ - Add an environment row to the display table. - - Parameters - ---------- - table : Table - Rich Table object to add the row to. - environment : dict[str, Any] - Environment data dictionary to add as a row. 
- """ - desc = environment["description"] - table.add_row( - environment["name"], - "{:.3g}".format(environment["burning_rate"]), - environment["title"], - desc if len(desc) <= 50 else desc[:50] + "…", - environment["language"], - json.dumps(environment["resources"]), + + headers = ("ID", "Credits/Second", "Name", "Language", "Resources") + rows: list[tuple[str, str, str, str, str, str]] = [] + for env in environments: + env_id = str(env.get("name") or "") + cost = "{:.4g}".format(float(env.get("burning_rate") or 0.0)) + name = str(env.get("title") or "") + language = str(env.get("language") or "") + resources = json.dumps(env.get("resources") or {}, ensure_ascii=False) + desc_text = _description_to_text(str(env.get("description") or "")) + rows.append((env_id, cost, name, language, resources, desc_text)) + + terminal_width = max(80, console.width) + inner_target = terminal_width - 2 + + # Preferred widths; later adjusted to fit exactly within terminal width. + id_width = max(len(headers[0]), *(len(r[0]) for r in rows)) if rows else len(headers[0]) + cost_width = max(len(headers[1]), *(len(r[1]) for r in rows)) if rows else len(headers[1]) + name_width = max(len(headers[2]), *(len(r[2]) for r in rows)) if rows else len(headers[2]) + lang_width = max(len(headers[3]), *(len(r[3]) for r in rows)) if rows else len(headers[3]) + + id_width = max(12, min(id_width, 28)) + cost_width = max(6, min(cost_width, 16)) + name_width = max(18, min(name_width, 32)) + lang_width = max(8, min(lang_width, 16)) + + # Resources column gets remaining space. + used_without_resources = ( + (id_width + 2) + + (cost_width + 2) + + (name_width + 2) + + (lang_width + 2) + + 4 # column separators between 5 columns + + 2 # left/right padding of border interior + ) + resources_width = max(20, inner_target - used_without_resources) + + # If terminal is very narrow, squeeze fixed columns further. 
+ if resources_width == 20 and used_without_resources + resources_width > inner_target: + overflow = (used_without_resources + resources_width) - inner_target + # Reduce name first, then id, then lang within minimums. + shrink_name = min(max(0, name_width - 12), overflow) + name_width -= shrink_name + overflow -= shrink_name + if overflow > 0: + shrink_id = min(max(0, id_width - 10), overflow) + id_width -= shrink_id + overflow -= shrink_id + if overflow > 0: + shrink_lang = min(max(0, lang_width - 6), overflow) + lang_width -= shrink_lang + + # Recompute resources width with final fixed widths. + used_without_resources = ( + (id_width + 2) + + (cost_width + 2) + + (name_width + 2) + + (lang_width + 2) + + 4 + + 2 + ) + resources_width = max(12, inner_target - used_without_resources) + + c1 = id_width + 2 + c2 = cost_width + 2 + c3 = name_width + 2 + c4 = lang_width + 2 + c5 = resources_width + 2 + inner_total = c1 + c2 + c3 + c4 + c5 + 4 + + console.print("Environments".center(inner_total + 2), style="bold") + + console.print( + "┏" + + "━" * c1 + + "┳" + + "━" * c2 + + "┳" + + "━" * c3 + + "┳" + + "━" * c4 + + "┳" + + "━" * c5 + + "┓" ) + console.print( + "┃ " + + _pad_cell(headers[0], id_width) + + " ┃ " + + _pad_cell(headers[1], cost_width, align_right=True) + + " ┃ " + + _pad_cell(headers[2], name_width) + + " ┃ " + + _pad_cell(headers[3], lang_width) + + " ┃ " + + _pad_cell(headers[4], resources_width) + + " ┃" + ) + console.print( + "┡" + + "━" * c1 + + "╇" + + "━" * c2 + + "╇" + + "━" * c3 + + "╇" + + "━" * c4 + + "╇" + + "━" * c5 + + "┩" + ) + + for index, (env_id, cost, name, language, resources, desc_text) in enumerate(rows): + span_width = inner_total - 2 + for line in _wrap_lines(desc_text, span_width): + console.print("│ " + _pad_cell(line, span_width)) + + # Thin line between full-width detail line and the summary line. 
+ console.print("├" + "─" * inner_total + "┤") + + console.print( + "│ " + + _pad_cell(env_id, id_width) + + " │ " + + _pad_cell(cost, cost_width, align_right=True) + + " │ " + + _pad_cell(name, name_width) + + " │ " + + _pad_cell(language, lang_width) + + " │ " + + _pad_cell(resources, resources_width) + + " │" + ) + + if index < len(rows) - 1: + console.print("├" + "─" * inner_total + "┤") + + console.print("└" + "─" * inner_total + "┘") diff --git a/datalayer_core/displays/runtime_checkpoints.py b/datalayer_core/displays/runtime_checkpoints.py index bb8d8a39..fabd7c95 100644 --- a/datalayer_core/displays/runtime_checkpoints.py +++ b/datalayer_core/displays/runtime_checkpoints.py @@ -30,7 +30,7 @@ def _new_runtime_checkpoints_table(title: str = "Runtime Checkpoints") -> Table: table = Table(title=title) table.add_column("ID", style="cyan", no_wrap=True) table.add_column("Runtime", style="green", no_wrap=True) - table.add_column("Agent Spec", style="magenta", no_wrap=True) + table.add_column("Agentspec", style="magenta", no_wrap=True) table.add_column("Name", style="cyan", no_wrap=True) table.add_column("Status", style="yellow", no_wrap=True) table.add_column("Updated", style="dim", no_wrap=True) diff --git a/datalayer_core/displays/runtimes.py b/datalayer_core/displays/runtimes.py index 95691444..b0309e65 100644 --- a/datalayer_core/displays/runtimes.py +++ b/datalayer_core/displays/runtimes.py @@ -13,7 +13,7 @@ from datalayer_core.utils.date import timestamp_to_local_date -def _new_runtime_table(title: str = "Runtimes") -> Table: +def _new_runtime_table(title: str = "Agents") -> Table: """ Create a new table for displaying runtimes. @@ -64,7 +64,7 @@ def display_runtimes(runtimes: list[dict[str, Any]]) -> None: runtimes : list[dict[str, Any]] List of runtime dictionaries to display. 
""" - table = _new_runtime_table(title="Runtimes") + table = _new_runtime_table(title="Agents") for runtime in runtimes: _add_runtime_to_table(table, runtime) console = Console() diff --git a/datalayer_core/displays/tokens.py b/datalayer_core/displays/tokens.py deleted file mode 100644 index 9a72eb41..00000000 --- a/datalayer_core/displays/tokens.py +++ /dev/null @@ -1,64 +0,0 @@ -# Copyright (c) 2023-2025 Datalayer, Inc. -# Distributed under the terms of the Modified BSD License. - -"""Display functions for Datalayer core.""" - -from __future__ import annotations - -from rich.console import Console -from rich.table import Table - - -def _new_tokens_table(title: str = "Tokens") -> Table: - """ - Create a new tokens table. - - Parameters - ---------- - title : str, default "tokens" - The title for the table. - - Returns - ------- - Table - A rich Table configured for displaying tokens. - """ - table = Table(title=title) - table.add_column("ID", style="cyan", no_wrap=True) - table.add_column("Name", style="cyan", no_wrap=True) - table.add_column("Variant", style="cyan", no_wrap=True) - return table - - -def _add_token_to_table(table: Table, token: dict[str, str]) -> None: - """ - Add a token row to the table. - - Parameters - ---------- - table : Table - The rich Table to add the row to. - token : dict[str, str] - Dictionary containing token information with keys: uid, name_s, description_t, variant_s. - """ - table.add_row( - token["uid"], - token["name_s"], - token["variant_s"], - ) - - -def display_tokens(tokens: list[dict[str, str]]) -> None: - """ - Display a list of tokens in the console. - - Parameters - ---------- - tokens : list[dict[str, str]] - List of token dictionaries to display. 
- """ - table = _new_tokens_table(title="Tokens") - for token in tokens: - _add_token_to_table(table, token) - console = Console() - console.print(table) diff --git a/datalayer_core/evals/__init__.py b/datalayer_core/evals/__init__.py new file mode 100644 index 00000000..799488e6 --- /dev/null +++ b/datalayer_core/evals/__init__.py @@ -0,0 +1,59 @@ +# Copyright (c) 2023-2025 Datalayer, Inc. +# Distributed under the terms of the Modified BSD License. + +# Copyright (c) 2023-2026 Datalayer, Inc. +# Distributed under the terms of the Modified BSD License. + +"""Evals shared package.""" + +from datalayer_core.evals.evals import ( + build_eval_report, + load_evalset_spec, + make_client, + merge_dicts, + now_iso, + parse_json_file, + parse_json_value, + render_eval_report_markdown, + resolve_billable_account_uid, + timestamp_slug, + watch_runs, + write_eval_report_csv, + write_eval_reports, +) +from datalayer_core.evals.evaluators import ( + evaluate_evalset, + evaluate_run, + run_and_evaluate_evalset, + run_case_evaluators, +) +from datalayer_core.evals.report import ( + average_latest_pass_rate, + collect_report_failures, + iter_report_runs, +) +from datalayer_core.evals.runner import execute_evalset_spec + +__all__ = [ + "average_latest_pass_rate", + "build_eval_report", + "collect_report_failures", + "evaluate_evalset", + "evaluate_run", + "execute_evalset_spec", + "iter_report_runs", + "load_evalset_spec", + "make_client", + "merge_dicts", + "now_iso", + "parse_json_file", + "parse_json_value", + "render_eval_report_markdown", + "resolve_billable_account_uid", + "run_and_evaluate_evalset", + "run_case_evaluators", + "timestamp_slug", + "watch_runs", + "write_eval_report_csv", + "write_eval_reports", +] diff --git a/datalayer_core/evals/evals.py b/datalayer_core/evals/evals.py new file mode 100644 index 00000000..ca46ae3c --- /dev/null +++ b/datalayer_core/evals/evals.py @@ -0,0 +1,274 @@ +# Copyright (c) 2023-2025 Datalayer, Inc. 
+# Distributed under the terms of the Modified BSD License. + +# Copyright (c) 2023-2026 Datalayer, Inc. +# Distributed under the terms of the Modified BSD License. + +"""Shared helpers for evals CLI and integrations.""" + +from __future__ import annotations + +import json +import time +from pathlib import Path +from typing import Any, Optional + +import typer + +from datalayer_core.client.client import DatalayerClient +from datalayer_core.utils.urls import DatalayerURLs + +_TERMINAL_RUN_STATES = { + "completed", + "failed", + "error", + "cancelled", + "success", + "succeeded", + "passed", + "done", +} + + +def parse_json_value(raw: Optional[str], flag_name: str) -> dict[str, Any]: + if not raw: + return {} + try: + parsed = json.loads(raw) + except Exception as exc: + raise typer.BadParameter(f"Invalid JSON for {flag_name}: {exc}") from exc + if not isinstance(parsed, dict): + raise typer.BadParameter(f"{flag_name} must decode to an object") + return parsed + + +def parse_json_file(path_value: Optional[str], flag_name: str) -> dict[str, Any]: + if not path_value: + return {} + path = Path(path_value) + if not path.exists(): + raise typer.BadParameter(f"File not found for {flag_name}: {path}") + text = path.read_text(encoding="utf-8") + return parse_json_value(text, flag_name) + + +def merge_dicts(*parts: dict[str, Any]) -> dict[str, Any]: + merged: dict[str, Any] = {} + for part in parts: + merged.update(part) + return merged + + +def make_client( + token: Optional[str] = None, + api_key: Optional[str] = None, + *, + iam_url: Optional[str] = None, + runtimes_url: Optional[str] = None, + ai_agents_url: Optional[str] = None, +) -> DatalayerClient: + """Build a :class:`DatalayerClient` from the environment. + + Optional service-URL overrides are forwarded to + :meth:`DatalayerURLs.from_environment` so examples and integrations can + point at local proxies without re-implementing client construction. 
+ """ + urls = DatalayerURLs.from_environment( + iam_url=iam_url or None, + runtimes_url=runtimes_url or None, + ai_agents_url=ai_agents_url or None, + ) + return DatalayerClient(urls=urls, token=(token or api_key)) + + +def resolve_billable_account_uid( + billable_account_uid: Optional[str], + account_uid: Optional[str], +) -> Optional[str]: + """Resolve billable account UID with backwards-compatible fallback.""" + return billable_account_uid or account_uid + + +def load_evalset_spec( + spec_file: str | Path, + *, + expected_kind: Optional[str] = None, + require_cases: bool = False, +) -> dict[str, Any]: + """Load and validate a JSON evalset spec file. + + The returned dict can be passed straight to + :meth:`DatalayerClient.evals_create_eval_from_spec`. Shared by examples, + the GitHub Action, and any other integration that creates evalsets from a + declarative JSON spec. + """ + path = Path(spec_file) + if not path.exists(): + raise FileNotFoundError(f"Evalset spec file not found: {path}") + payload = json.loads(path.read_text(encoding="utf-8")) + if not isinstance(payload, dict): + raise ValueError(f"Evalset spec must be a JSON object: {path}") + if not str(payload.get("name") or "").strip(): + raise ValueError(f"Evalset spec is missing 'name': {path}") + if expected_kind is not None: + kind = str(payload.get("kind") or "").strip().lower() + if kind and kind != expected_kind: + raise ValueError( + f"Evalset spec kind '{kind}' does not match expected " + f"'{expected_kind}': {path}" + ) + if require_cases: + cases = payload.get("cases") + if not isinstance(cases, list) or not cases: + raise ValueError( + f"Evalset spec must include a non-empty 'cases' array: {path}" + ) + return payload + + +def watch_runs( + client: DatalayerClient, + run_ids: list[str], + *, + account_uid: Optional[str] = None, + timeout_seconds: int = 120, + interval_seconds: int = 3, + verbose: bool = True, +) -> dict[str, str]: + """Poll eval runs until they reach a terminal state or the 
timeout elapses. + + Returns a mapping of ``run_id`` to its last observed status. Generic helper + reused by examples and integrations; it intentionally carries no demo-only + logic. + """ + started = time.time() + statuses: dict[str, str] = {} + while True: + pending: list[str] = [] + counts: dict[str, int] = {} + for run_id in run_ids: + snapshot = client.evals_get_run(run_id, account_uid=account_uid) + status = ( + str((snapshot.get("run") or {}).get("status") or "").lower() + or "unknown" + ) + statuses[run_id] = status + counts[status] = counts.get(status, 0) + 1 + if status not in _TERMINAL_RUN_STATES: + pending.append(run_id) + if verbose: + elapsed = int(time.time() - started) + summary = ( + ", ".join(f"{status}={count}" for status, count in sorted(counts.items())) + or "unknown=0" + ) + print(f"Run status at t+{elapsed}s: {summary}") + if not pending: + return statuses + if time.time() - started > timeout_seconds: + if verbose: + preview = ", ".join(pending[:5]) + suffix = " ..." if len(pending) > 5 else "" + print( + "Run watch timed out before terminal state. " + f"Pending ({len(pending)}): {preview}{suffix}" + ) + return statuses + time.sleep(max(1, interval_seconds)) + + +def now_iso() -> str: + """Return the current UTC timestamp in ISO-8601 form.""" + from datalayer_core.evals.report import _now_iso + + return _now_iso() + + +def timestamp_slug(raw_iso: str) -> str: + """Return a filesystem-safe slug for an ISO-8601 timestamp.""" + from datalayer_core.evals.report import _timestamp_slug + + return _timestamp_slug(raw_iso) + + +def build_eval_report( + client: DatalayerClient, + evalset_id: str, + *, + account_uid: Optional[str] = None, + run_limit: int = 50, +) -> dict[str, Any]: + """Return the structured eval report for an evalset. + + Thin public facade over the report engine so callers do not import private + CLI helpers. 
+ """ + from datalayer_core.evals.report import _report_data + + return _report_data( + client=client, + evalset_id=evalset_id, + run_limit=run_limit, + account_uid=account_uid, + ) + + +def render_eval_report_markdown( + report: dict[str, Any], + *, + run_limit: int = 50, + colorize: bool = False, +) -> str: + """Render a structured eval report as markdown.""" + from datalayer_core.evals.report import _report_markdown + + return _report_markdown(report, run_limit=run_limit, colorize=colorize) + + +def write_eval_report_csv(report: dict[str, Any], output_path: str | Path) -> Path: + """Write a structured eval report to a CSV file and return its path.""" + from datalayer_core.evals.report import _write_report_csv + + path = Path(output_path) + path.parent.mkdir(parents=True, exist_ok=True) + _write_report_csv(report, path) + return path + + +def write_eval_reports( + client: DatalayerClient, + evalset_id: str, + *, + account_uid: Optional[str] = None, + run_limit: int = 50, + output_dir: str | Path = ".", + basename: str = "report", + timestamped: bool = True, + export_csv: bool = True, +) -> dict[str, Any]: + """Build and persist markdown (and optionally CSV) eval reports. + + Returns a dict with the structured ``report`` plus the written file paths. + Shared by examples and integrations to avoid duplicating report I/O. 
+ """ + from datalayer_core.evals.report import _timestamp_slug + + report = build_eval_report( + client, evalset_id, account_uid=account_uid, run_limit=run_limit + ) + markdown = render_eval_report_markdown(report, run_limit=run_limit) + + out_dir = Path(output_dir) + out_dir.mkdir(parents=True, exist_ok=True) + if timestamped: + stem = f"{basename}-{_timestamp_slug(str(report.get('generated_at') or ''))}" + else: + stem = basename + + markdown_path = out_dir / f"{stem}.md" + markdown_path.write_text(markdown + "\n", encoding="utf-8") + + result: dict[str, Any] = {"report": report, "markdown_path": markdown_path} + if export_csv: + result["csv_path"] = write_eval_report_csv(report, out_dir / f"{stem}.csv") + return result diff --git a/datalayer_core/evals/evaluators.py b/datalayer_core/evals/evaluators.py new file mode 100644 index 00000000..a83c331d --- /dev/null +++ b/datalayer_core/evals/evaluators.py @@ -0,0 +1,415 @@ +# Copyright (c) 2023-2025 Datalayer, Inc. +# Distributed under the terms of the Modified BSD License. + +# Copyright (c) 2023-2026 Datalayer, Inc. +# Distributed under the terms of the Modified BSD License. + +"""Reusable evaluator execution for real (non-synthetic) eval runs. + +Implements the common Datalayer evaluators (``equals_expected``, ``equals``, +``contains``, ``pass_rate_threshold``) so examples, the CLI, and integrations +can grade *real* agent outputs instead of fabricating scores. Evaluator names +mirror the evaluator catalog (see ``agent_runtimes/specs/evals``); unknown names +degrade gracefully to a skipped record so callers never crash on an unsupported +evaluator. 
+""" + +from __future__ import annotations + +from typing import Any, Callable + +CaseEvaluator = Callable[[Any, Any, dict[str, Any]], dict[str, Any]] +ReportEvaluator = Callable[[list[dict[str, Any]], dict[str, Any]], dict[str, Any]] + + +def _coerce_text(value: Any) -> str: + """Return the textual payload of an output/expected value.""" + if isinstance(value, dict): + text = value.get("text") + if text is not None: + return str(text) + return "" + if value is None: + return "" + return str(value) + + +def _normalize_name(name: Any) -> str: + return str(name or "").strip().lower().replace("-", "_") + + +def _evaluate_equals_expected( + output: Any, expected: Any, arguments: dict[str, Any] +) -> dict[str, Any]: + expected_text = _coerce_text(expected).strip() + output_text = _coerce_text(output).strip() + if not expected_text: + return {"passed": True, "score": 1.0, "reason": "no expected output"} + passed = output_text == expected_text + return { + "passed": passed, + "score": 1.0 if passed else 0.0, + "reason": "exact match" if passed else "output does not equal expected", + } + + +def _evaluate_contains( + output: Any, expected: Any, arguments: dict[str, Any] +) -> dict[str, Any]: + output_text = _coerce_text(output) + case_sensitive = bool(arguments.get("case_sensitive")) + + # Prefer explicit ``tokens`` (or a single ``substring``/``value``) from the + # evaluator arguments; fall back to the case ``expected_output`` text only + # when no needles are configured. 
+ tokens = arguments.get("tokens") + if isinstance(tokens, (list, tuple)) and tokens: + needles = [str(token) for token in tokens] + else: + single = arguments.get("substring", arguments.get("value")) + if single is not None: + needles = [str(single)] + else: + needles = [_coerce_text(expected)] + + needles = [needle for needle in needles if needle] + if not needles: + return {"passed": True, "score": 1.0, "reason": "no expected substring"} + + haystack = output_text if case_sensitive else output_text.lower() + missing = [ + needle + for needle in needles + if (needle if case_sensitive else needle.lower()) not in haystack + ] + passed = not missing + return { + "passed": passed, + "score": 1.0 if passed else 0.0, + "reason": ( + "all tokens found" + if passed + else f"missing tokens: {', '.join(missing)}" + ), + } + + +def _evaluate_pass_rate_threshold( + case_results: list[dict[str, Any]], arguments: dict[str, Any] +) -> dict[str, Any]: + total = len(case_results) + passed = sum(1 for case in case_results if case.get("passed")) + rate = passed / total if total else 0.0 + min_pass_rate = arguments.get("min_pass_rate", 0.8) + threshold = float(min_pass_rate) if isinstance(min_pass_rate, (int, float)) else 0.8 + ok = total > 0 and rate >= threshold + return { + "passed": ok, + "score": round(rate, 4), + "threshold": round(threshold, 4), + "observed": round(rate, 4), + "summary": ( + f"pass rate {rate:.2f} " + f"{'≥' if ok else '<'} threshold {threshold:.2f}" + ), + } + + +CASE_EVALUATORS: dict[str, CaseEvaluator] = { + "equals_expected": _evaluate_equals_expected, + "equals": _evaluate_equals_expected, + "contains": _evaluate_contains, +} + +REPORT_EVALUATORS: dict[str, ReportEvaluator] = { + "pass_rate_threshold": _evaluate_pass_rate_threshold, +} + + +def run_case_evaluators( + *, + output: Any, + expected: Any, + evaluators: list[dict[str, Any]] | None, +) -> dict[str, Any]: + """Grade a single real output against its per-case evaluators. 
+ + Returns ``{"passed": bool, "score": float, "evaluators": [...]}``. A case + passes when every applicable evaluator passes; the score is the mean of the + evaluator scores (and defaults to ``1.0`` when no evaluator applies). + Unsupported evaluators are recorded as ``skipped`` and ignored. + """ + records: list[dict[str, Any]] = [] + scores: list[float] = [] + passed_all = True + applied = 0 + for evaluator in evaluators or []: + if not isinstance(evaluator, dict): + continue + name = _normalize_name(evaluator.get("name")) + arguments = evaluator.get("arguments") or {} + func = CASE_EVALUATORS.get(name) + if func is None: + records.append({"name": name or "evaluator", "skipped": True}) + continue + outcome = func(output, expected, arguments) + outcome_passed = bool(outcome.get("passed")) + outcome_score = float( + outcome.get("score", 1.0 if outcome_passed else 0.0) + ) + records.append( + { + "name": name, + "passed": outcome_passed, + "score": round(outcome_score, 4), + "reason": str(outcome.get("reason") or ""), + } + ) + scores.append(outcome_score) + passed_all = passed_all and outcome_passed + applied += 1 + score = round(sum(scores) / len(scores), 4) if scores else 1.0 + return { + "passed": passed_all if applied else True, + "score": score, + "evaluators": records, + } + + +def evaluate_run( + cases: list[dict[str, Any]], + outputs: list[Any], + *, + evalset_evaluators: list[dict[str, Any]] | None = None, + report_evaluators: list[dict[str, Any]] | None = None, + statuses: list[str] | None = None, +) -> dict[str, Any]: + """Grade real per-case outputs and return run metrics. + + ``outputs`` is aligned with ``cases`` (each entry a ``str`` or a mapping + with a ``text`` key). Evalset-level evaluators run for every case; + report-level evaluators run once over the resulting case outcomes. The + returned metrics mirror the synthetic shape (``case_results`` and + ``evaluator_results``) so the UI and report render real and synthetic runs + identically. 
+ """ + evalset_evaluators = [ + item for item in (evalset_evaluators or []) if isinstance(item, dict) + ] + report_evaluators = [ + item for item in (report_evaluators or []) if isinstance(item, dict) + ] + + def _expected_for(case: dict[str, Any]) -> Any: + expected = case.get("expected_output") + if expected is None: + expected = case.get("expected") + return expected + + def _status_for(idx: int) -> str: + if statuses and idx < len(statuses): + return str(statuses[idx] or "").strip().lower() + return "completed" + + case_results: list[dict[str, Any]] = [] + for idx, case in enumerate(cases): + metadata = case.get("metadata") or {} + expected = _expected_for(case) + output = outputs[idx] if idx < len(outputs) else None + case_evaluators = [ + item for item in (case.get("evaluators") or []) if isinstance(item, dict) + ] + # Per-case evaluators override the evalset-level defaults; the evalset + # evaluators only apply to cases that do not declare their own. + applicable = case_evaluators or evalset_evaluators + outcome = run_case_evaluators( + output=output, expected=expected, evaluators=applicable + ) + passed = bool(outcome.get("passed")) + score = float(outcome.get("score", 0.0)) + if _status_for(idx) in {"failed", "error"}: + passed = False + score = 0.0 + case_results.append( + { + "name": case.get("name"), + "passed": passed, + "status": "passed" if passed else "failed", + "score": round(score, 4), + "category": metadata.get("category"), + "difficulty": metadata.get("difficulty") or metadata.get("priority"), + } + ) + + evaluator_results: list[dict[str, Any]] = [] + total = len(cases) + for evaluator in evalset_evaluators: + name = str(evaluator.get("name") or "evaluator") + passed_cases = 0 + scores: list[float] = [] + applicable_cases = 0 + for idx, case in enumerate(cases): + # Skip cases that override the evalset default with their own + # per-case evaluators so the summary reflects only where this + # evalset evaluator actually applies. 
+ if [ + item + for item in (case.get("evaluators") or []) + if isinstance(item, dict) + ]: + continue + applicable_cases += 1 + expected = _expected_for(case) + output = outputs[idx] if idx < len(outputs) else None + single = run_case_evaluators( + output=output, expected=expected, evaluators=[evaluator] + ) + ok = bool(single.get("passed")) and _status_for(idx) not in { + "failed", + "error", + } + if ok: + passed_cases += 1 + scores.append(float(single.get("score", 0.0)) if ok else 0.0) + mean_score = round(sum(scores) / len(scores), 4) if scores else None + evaluator_results.append( + { + "name": name, + "scope": "evalset", + "score": mean_score, + "passed": applicable_cases > 0 and passed_cases == applicable_cases, + "passed_cases": passed_cases, + "total_cases": applicable_cases, + "summary": f"{passed_cases}/{applicable_cases} cases passed {name}", + } + ) + + for evaluator in report_evaluators: + name = str(evaluator.get("name") or "evaluator") + func = REPORT_EVALUATORS.get(_normalize_name(evaluator.get("name"))) + if func is None: + evaluator_results.append( + { + "name": name, + "scope": "report", + "score": None, + "passed": False, + "summary": f"{name} not executed (unsupported)", + } + ) + continue + outcome = func(case_results, evaluator.get("arguments") or {}) + entry: dict[str, Any] = { + "name": name, + "scope": "report", + "score": outcome.get("score"), + "passed": bool(outcome.get("passed")), + "summary": str(outcome.get("summary") or ""), + } + for optional in ("threshold", "observed"): + if outcome.get(optional) is not None: + entry[optional] = outcome.get(optional) + evaluator_results.append(entry) + + passed = sum(1 for case in case_results if case.get("passed")) + pass_rate = round(passed / total, 4) if total else 0.0 + avg_score = ( + round(sum(float(case["score"]) for case in case_results) / total, 4) + if total + else 0.0 + ) + return { + "pass_rate": pass_rate, + "total_cases": total, + "passed": passed, + "failed": total - passed, + 
"avg_score": avg_score, + "case_results": case_results, + "evaluator_results": evaluator_results, + } + + +def evaluate_evalset( + evalset_spec: dict[str, Any], + outputs: list[Any], + *, + statuses: list[str] | None = None, +) -> dict[str, Any]: + """Grade real outputs against a declarative evalset spec. + + Convenience wrapper that pulls ``cases``, ``evalset_evaluators`` and + ``report_evaluators`` out of an evalset spec dict (as produced by + :func:`datalayer_core.evals.load_evalset_spec`) and delegates to + :func:`evaluate_run`. This is the single entry point examples and the CLI + use so evaluator execution lives in the evals API rather than the caller. + """ + cases = [item for item in (evalset_spec.get("cases") or []) if isinstance(item, dict)] + evalset_evaluators = [ + item + for item in (evalset_spec.get("evalset_evaluators") or []) + if isinstance(item, dict) + ] + report_evaluators = [ + item + for item in (evalset_spec.get("report_evaluators") or []) + if isinstance(item, dict) + ] + return evaluate_run( + cases, + outputs, + evalset_evaluators=evalset_evaluators, + report_evaluators=report_evaluators, + statuses=statuses, + ) + + +CaseRunner = Callable[[dict[str, Any], int], Any] + + +def run_and_evaluate_evalset( + evalset_spec: dict[str, Any], + run_case: CaseRunner, + *, + statuses: list[str] | None = None, +) -> dict[str, Any]: + """Execute every case through a runner callback, then grade the outputs. + + This bakes the per-case execution loop into the evals API so consumers + (examples, GitHub Actions, and other integrations) never re-implement the + "run each case, then evaluate" orchestration. ``run_case`` is called once + per case as ``run_case(case, index)`` and may return either: + + * a plain output (``str`` or a mapping with a ``text`` key), or + * a mapping ``{"output": , "status": }`` where ``status`` + is an optional per-case run status (e.g. ``"failed"``) that forces the + case to fail regardless of evaluator outcome. 
+ + Per-case and report-level evaluators from the spec then run for real over + the collected outputs via :func:`evaluate_evalset`, returning the same + metrics shape as synthetic runs (``case_results`` and ``evaluator_results``) + so reports and the UI render real and simulated runs identically. + """ + cases = [ + item for item in (evalset_spec.get("cases") or []) if isinstance(item, dict) + ] + outputs: list[Any] = [] + collected_statuses: list[str | None] = [] + for index, case in enumerate(cases): + result = run_case(case, index) + if isinstance(result, dict) and ("output" in result or "status" in result): + outputs.append(result.get("output")) + status = result.get("status") + else: + outputs.append(result) + status = None + if status is None and statuses is not None and index < len(statuses): + status = statuses[index] + collected_statuses.append( + str(status) if status is not None else None + ) + normalized_statuses = ( + [value or "" for value in collected_statuses] + if any(value is not None for value in collected_statuses) + else None + ) + return evaluate_evalset(evalset_spec, outputs, statuses=normalized_statuses) diff --git a/datalayer_core/evals/report.py b/datalayer_core/evals/report.py new file mode 100644 index 00000000..1f32e603 --- /dev/null +++ b/datalayer_core/evals/report.py @@ -0,0 +1,3329 @@ +# Copyright (c) 2023-2025 Datalayer, Inc. +# Distributed under the terms of the Modified BSD License. + +# Copyright (c) 2023-2026 Datalayer, Inc. +# Distributed under the terms of the Modified BSD License. + +"""Real evaluation and reporting logic for Datalayer evals. + +This module hosts the evals report engine and the helper functions used by the +CLI commands, the Python evals API, and the examples. The CLI command layer in +``datalayer_core.cli.commands.evals`` imports from here so it can stay a thin +Typer wrapper around this logic. 
+""" + +from __future__ import annotations + +import csv +import json +import math +import re +from datetime import datetime, timezone +from pathlib import Path +from typing import Any, Optional +from urllib.parse import quote + +import typer +from rich.console import Console +from rich.table import Table + +from datalayer_core.client.client import DatalayerClient + +console = Console() + +WEB_APP_BASE_URL = "https://datalayer.ai" + + +def _now_iso() -> str: + return datetime.now(timezone.utc).isoformat().replace("+00:00", "Z") + + +def _timestamp_slug(raw_iso: str) -> str: + cleaned = raw_iso.replace("-", "").replace(":", "").replace(".", "") + cleaned = cleaned.replace("+0000", "Z").replace("+00:00", "Z") + cleaned = cleaned.replace("T", "T") + if cleaned.endswith("Z"): + return cleaned + return f"{cleaned}Z" + + +def _status_style(status: str) -> str: + normalized = status.lower() + if normalized in {"completed", "success", "passed"}: + return "green" + if normalized in {"running", "queued", "pending"}: + return "yellow" + if normalized in {"failed", "error"}: + return "red" + return "white" + + +def _run_pass_rate(run: dict[str, Any]) -> float | None: + metrics = run.get("metrics") or {} + raw = metrics.get("pass_rate") + if isinstance(raw, (int, float)): + value = float(raw) + if value < 0: + return 0.0 + if value > 1: + return 1.0 + return value + return None + + +def _fmt_pct(raw: float | None) -> str: + if raw is None: + return "n/a" + return f"{raw * 100:.1f}%" + + +def _parse_csv_values(raw: str | None) -> list[str]: + if raw is None: + return [] + values: list[str] = [] + seen: set[str] = set() + for token in str(raw).split(","): + item = token.strip() + if not item or item in seen: + continue + seen.add(item) + values.append(item) + return values + + +def _parse_evaluator_specs(raw_values: list[str], option_name: str) -> list[dict[str, Any]]: + evaluators: list[dict[str, Any]] = [] + for index, raw in enumerate(raw_values, start=1): + try: + parsed = 
json.loads(raw) + except Exception as error: + raise typer.BadParameter( + f"{option_name} entry #{index} is not valid JSON: {error}" + ) from error + if not isinstance(parsed, dict): + raise typer.BadParameter( + f"{option_name} entry #{index} must be a JSON object" + ) + name = parsed.get("name") + if not isinstance(name, str) or not name.strip(): + raise typer.BadParameter( + f"{option_name} entry #{index} must include a non-empty string field 'name'" + ) + if "arguments" in parsed and not isinstance(parsed.get("arguments"), dict): + raise typer.BadParameter( + f"{option_name} entry #{index} field 'arguments' must be an object when provided" + ) + if "arguments" not in parsed: + parsed = {**parsed, "arguments": {}} + evaluators.append(parsed) + return evaluators + + +def _agentspec_details_url(agent_spec_id: str) -> str: + value = str(agent_spec_id or "").strip() + if not value: + return "" + return f"{WEB_APP_BASE_URL}/settings/agentspecs/{quote(value, safe='')}" + + +def _evalset_runs_url(evalset_id: str, run_environment: str) -> str: + evalset_value = str(evalset_id or "").strip() + if not evalset_value: + return "" + encoded_evalset_id = quote(evalset_value, safe='') + env_value = str(run_environment or "").strip() + if env_value: + encoded_env = quote(env_value, safe='') + return f"{WEB_APP_BASE_URL}/evals/experiments/{encoded_env}/{encoded_evalset_id}" + return f"{WEB_APP_BASE_URL}/evals/experiments?evalset_id={encoded_evalset_id}" + + +def _run_overlay_url(evalset_runs_url: str, run_id: str) -> str: + """Build a deep link that opens the run-details overlay directly. + + The experiments page reads the ``run`` query parameter and opens the + run-details dialog for that run, so the same overlay shown by the in-app + "Details" button is reachable straight from the CLI report. + """ + base = str(evalset_runs_url or "").strip() + run_value = str(run_id or "").strip() + if not base or not run_value: + return base + separator = "&" if "?" in base else "?" 
+ return f"{base}{separator}run={quote(run_value, safe='')}" + + + +def _style_text(value: str, style: str | None, colorize: bool) -> str: + if not colorize or not style: + return value + return f"[{style}]{value}[/{style}]" + + +def _compute_baseline_and_drift(runs: list[dict[str, Any]]) -> tuple[float | None, float | None, float | None]: + pass_rates = [rate for rate in (_run_pass_rate(run) for run in runs) if rate is not None] + if not pass_rates: + return None, None, None + baseline_size = min(3, max(1, len(pass_rates) // 2)) + baseline_slice = pass_rates[:baseline_size] + baseline = sum(baseline_slice) / baseline_size + latest = pass_rates[-1] + drift = latest - baseline + return baseline, latest, drift + + +def _analysis_scalar(name: str, metric: str, value: float | int | None) -> dict[str, Any]: + return { + "kind": "scalar", + "name": name, + "metric": metric, + "value": value, + } + + +def _analysis_table(name: str, columns: list[str], rows: list[list[Any]]) -> dict[str, Any]: + return { + "kind": "table", + "name": name, + "columns": columns, + "rows": rows, + } + + +def _analysis_line(name: str, x: list[Any], y: list[float]) -> dict[str, Any]: + return { + "kind": "line", + "name": name, + "x": x, + "y": y, + } + + +def _build_experiment_report_analyses( + runs: list[dict[str, Any]], + consecutive_comparisons: list[dict[str, Any]], + *, + baseline: float | None, + latest: float | None, + drift: float | None, + latest_two_delta: float | None, + mean_pass: float | None, + stddev_pass: float | None, +) -> list[dict[str, Any]]: + analyses: list[dict[str, Any]] = [ + _analysis_scalar("Latest Pass Rate", "latest_pass_rate", latest), + _analysis_scalar("Baseline Pass Rate", "baseline_pass_rate", baseline), + _analysis_scalar("Drift Delta", "drift_delta", drift), + _analysis_scalar("Latest Two Delta", "latest_two_delta", latest_two_delta), + _analysis_scalar("Mean Pass Rate", "mean_pass_rate", mean_pass), + _analysis_scalar("Stddev Pass Rate", 
"stddev_pass_rate", stddev_pass), + ] + + run_rows: list[list[Any]] = [] + line_x: list[str] = [] + line_y: list[float] = [] + for idx, run in enumerate(runs): + run_id = str(run.get("id") or "") + run_status = str(run.get("status") or "") + run_pass_rate = _run_pass_rate(run) + run_rows.append([idx, run_id, run_status, run_pass_rate]) + if isinstance(run_pass_rate, (int, float)): + line_x.append(run_id or str(idx)) + line_y.append(float(run_pass_rate)) + analyses.append( + _analysis_table( + "Run Pass Rates", + ["index", "run_id", "status", "pass_rate"], + run_rows, + ) + ) + analyses.append(_analysis_line("Run Pass Rate Trend", line_x, line_y)) + + delta_rows: list[list[Any]] = [] + for item in consecutive_comparisons: + delta_rows.append( + [ + str(item.get("run_a_id") or ""), + str(item.get("run_b_id") or ""), + item.get("run_a_pass_rate"), + item.get("run_b_pass_rate"), + item.get("delta_pass_rate"), + ] + ) + analyses.append( + _analysis_table( + "Consecutive Run Deltas", + ["run_a_id", "run_b_id", "run_a_pass_rate", "run_b_pass_rate", "delta_pass_rate"], + delta_rows, + ) + ) + return analyses + + +def _build_evalset_report_analyses(experiments: list[dict[str, Any]]) -> list[dict[str, Any]]: + latest_rows: list[list[Any]] = [] + latest_names: list[str] = [] + latest_values: list[float] = [] + all_run_pass_rates: list[float] = [] + for experiment in experiments: + experiment_name = str(experiment.get("name") or experiment.get("id") or "") + latest_pass = experiment.get("latest_pass_rate") + baseline_pass = experiment.get("baseline_pass_rate") + drift_delta = experiment.get("drift_delta") + latest_rows.append([experiment_name, latest_pass, baseline_pass, drift_delta]) + if isinstance(latest_pass, (int, float)): + latest_names.append(experiment_name) + latest_values.append(float(latest_pass)) + + for run in experiment.get("runs") or []: + if isinstance(run, dict): + pass_rate = run.get("pass_rate") + if isinstance(pass_rate, (int, float)): + 
all_run_pass_rates.append(float(pass_rate)) + + overall_mean = ( + (sum(all_run_pass_rates) / len(all_run_pass_rates)) if all_run_pass_rates else None + ) + + return [ + _analysis_scalar("Experiment Count", "experiment_count", len(experiments)), + _analysis_scalar("Overall Mean Pass Rate", "overall_mean_pass_rate", overall_mean), + _analysis_table( + "Experiment Latest/Baseline", + ["experiment", "latest_pass_rate", "baseline_pass_rate", "drift_delta"], + latest_rows, + ), + _analysis_line("Latest Pass Rate By Experiment", latest_names, latest_values), + ] + + +def _classify_legacy_failure(message: str) -> dict[str, Any]: + """Infer a structured stage/type/url from a free-form legacy error message. + + Older runs (and any path that only persisted a plain error string) lack a + structured ``failure_cause``. Rather than rendering ``unknown`` / + ``legacy_error`` with an empty detail excerpt, classify the most common + error shapes so the report stays actionable. + """ + text = message.strip() + lowered = text.lower() + + url_match = re.search(r"https?://[^\s]+", text) + execution_url = url_match.group(0).rstrip(".,)") if url_match else "" + + stage = "unknown" + failure_type = "legacy_error" + if "all connection attempts failed" in lowered or "connection refused" in lowered or "request failed" in lowered: + stage = "runtime_execution" + failure_type = "runtime_unreachable" + elif "returned http" in lowered or re.search(r"\bhttp\s*[45]\d\d\b", lowered): + stage = "runtime_execution" + failure_type = "runtime_http_error" + elif "traceback" in lowered: + stage = "runtime_execution" + failure_type = "runtime_traceback" + elif "no submitted code" in lowered or "missing" in lowered and "code" in lowered: + stage = "run_preparation" + failure_type = "missing_submitted_code" + elif "no interactive runtime url" in lowered or "not configured" in lowered: + stage = "runtime_resolution" + failure_type = "no_runtime_url" + + cause: dict[str, Any] = { + "stage": stage, + "type": 
failure_type, + "message": text, + "detail_excerpt": text, + } + if execution_url: + cause["execution_url"] = execution_url + return cause + + +def _extract_failure_cause(run: dict[str, Any]) -> dict[str, Any] | None: + """Extract a structured failure cause from a run's report/summary payload.""" + for container_key in ("report", "summary"): + container = run.get(container_key) + if isinstance(container, dict): + cause = container.get("failure_cause") + if isinstance(cause, dict) and cause: + return cause + # Fallback: synthesize a structured cause from legacy error fields. + summary = run.get("summary") if isinstance(run.get("summary"), dict) else {} + report = run.get("report") if isinstance(run.get("report"), dict) else {} + message = ( + summary.get("failure_reason") + or summary.get("execution_error") + or report.get("error") + ) + if isinstance(message, str) and message.strip(): + return _classify_legacy_failure(message) + return None + + +def _format_failure_cause(cause: dict[str, Any] | None) -> str: + """Render a failure cause as a concise single-line string.""" + if not isinstance(cause, dict) or not cause: + return "" + failure_type = str(cause.get("type") or "").strip() + message = str(cause.get("message") or "").strip() + parts: list[str] = [] + if failure_type: + parts.append(f"[{failure_type}]") + if message: + parts.append(message) + return " ".join(parts).strip() + + +def _failure_cause_detail_lines(cause: dict[str, Any]) -> list[str]: + """Render the full failure cause (message, context, diagnostics, attempts) as markdown lines.""" + lines: list[str] = [] + message = str(cause.get("message") or "").strip() + if message: + lines.append(f"- Message: {message}") + for key, label in ( + ("stage", "Stage"), + ("type", "Type"), + ("runtime_pod_name", "Runtime pod"), + ("runtime_id", "Runtime ID"), + ("environment_name", "Environment"), + ("execution_url", "Execution URL"), + ): + value = str(cause.get(key) or "").strip() + if value: + lines.append(f"- 
{label}: `{value}`") + + detail = str(cause.get("detail_excerpt") or "").strip() + if detail: + lines.append("- Detail excerpt:") + lines.append("") + lines.append("```text") + lines.extend(detail.splitlines() or [detail]) + lines.append("```") + + diagnostics = cause.get("diagnostics") + if isinstance(diagnostics, dict) and diagnostics: + for key, label in ( + ("agent_runtimes_url", "Agent runtimes URL"), + ("run_url", "Run URL"), + ): + value = diagnostics.get(key) + if value: + lines.append(f"- {label}: `{value}`") + for key, label in ( + ("route_ids", "Route IDs tried"), + ("discovered_agent_ids", "Discovered agent IDs"), + ("candidate_urls", "Candidate URLs"), + ): + value = diagnostics.get(key) + if isinstance(value, list) and value: + rendered = ", ".join(f"`{item}`" for item in value) + lines.append(f"- {label}: {rendered}") + + attempts = diagnostics.get("attempts") + if isinstance(attempts, list) and attempts: + lines.append("- Connection attempts:") + attempt_rows: list[list[str]] = [] + for attempt in attempts: + if not isinstance(attempt, dict): + continue + status_code = attempt.get("status_code") + attempt_rows.append( + [ + str(attempt.get("url") or "-"), + "ok" if attempt.get("ok") else "failed", + "-" if status_code is None else str(status_code), + str(attempt.get("error") or "-"), + ] + ) + if attempt_rows: + lines.append("") + lines.extend( + _markdown_table( + ["URL", "Result", "HTTP", "Error"], + attempt_rows, + ["left", "left", "right", "left"], + ) + ) + return lines + + +def _run_detail_record(run: dict[str, Any]) -> dict[str, Any]: + metrics = run.get("metrics") if isinstance(run.get("metrics"), dict) else {} + summary = run.get("summary") if isinstance(run.get("summary"), dict) else {} + report = run.get("report") if isinstance(run.get("report"), dict) else {} + usage = _extract_run_usage(run) + return { + "id": str(run.get("id", "")), + "status": str(run.get("status", "")), + "created_at": str(run.get("created_at", "")), + "updated_at": 
str(run.get("updated_at", "")), + "pass_rate": _run_pass_rate(run), + "metrics": metrics, + "summary": summary, + "report": report, + "usage": usage, + "failure_cause": _extract_failure_cause(run), + } + + +def _extract_experiment_agentspec(experiment: dict[str, Any], runs: list[dict[str, Any]]) -> tuple[str, str]: + config = experiment.get("config") if isinstance(experiment.get("config"), dict) else {} + summary = experiment.get("summary") if isinstance(experiment.get("summary"), dict) else {} + run_summaries = [ + run.get("summary") + for run in runs + if isinstance(run, dict) and isinstance(run.get("summary"), dict) + ] + + id_candidates: list[Any] = [ + config.get("agent_spec_id"), + config.get("agentSpecId"), + summary.get("agent_spec_id"), + summary.get("agentSpecId"), + ] + name_candidates: list[Any] = [ + config.get("agent_spec_name"), + config.get("agentSpecName"), + summary.get("agent_spec_name"), + summary.get("agentSpecName"), + ] + for run_summary in run_summaries: + assert isinstance(run_summary, dict) + id_candidates.extend( + [ + run_summary.get("agent_spec_id"), + run_summary.get("agentSpecId"), + ] + ) + name_candidates.extend( + [ + run_summary.get("agent_spec_name"), + run_summary.get("agentSpecName"), + ] + ) + + agent_spec_id = "" + for candidate in id_candidates: + if isinstance(candidate, str) and candidate.strip(): + agent_spec_id = candidate.strip() + break + + agent_spec_name = "" + for candidate in name_candidates: + if isinstance(candidate, str) and candidate.strip(): + agent_spec_name = candidate.strip() + break + + if not agent_spec_name and agent_spec_id: + agent_spec_name = agent_spec_id + return agent_spec_id, agent_spec_name + + +def _first_str(*candidates: Any) -> str: + """Return the first non-empty stripped string from the candidates.""" + for candidate in candidates: + if isinstance(candidate, str) and candidate.strip(): + return candidate.strip() + return "" + + +def _normalize_tags(value: Any) -> list[str]: + """Normalize a 
tags value (list or comma-separated string) to a list.""" + if isinstance(value, (list, tuple)): + tags = [str(item).strip() for item in value if str(item).strip()] + elif isinstance(value, str): + tags = [token.strip() for token in value.split(",") if token.strip()] + else: + return [] + seen: set[str] = set() + ordered: list[str] = [] + for tag in tags: + if tag not in seen: + seen.add(tag) + ordered.append(tag) + return ordered + + +def _extract_experiment_agentspec_details( + experiment: dict[str, Any], runs: list[dict[str, Any]] +) -> dict[str, Any]: + """Extract rich agentspec metadata from experiment/run payloads. + + Mirrors the fields surfaced by the in-app Agentspec Details dialog + (name, description, version, model, tags, icon/emoji/color) by + inspecting the experiment config/summary, any inline ``agent_spec`` + object, and the most recent run summaries. + """ + config = experiment.get("config") if isinstance(experiment.get("config"), dict) else {} + summary = experiment.get("summary") if isinstance(experiment.get("summary"), dict) else {} + run_summaries = [ + run.get("summary") + for run in runs + if isinstance(run, dict) and isinstance(run.get("summary"), dict) + ] + + # Inline agent_spec objects can live under several keys/scopes. 
+ inline_specs: list[dict[str, Any]] = [] + for scope in (config, summary, *run_summaries): + if not isinstance(scope, dict): + continue + for key in ("agent_spec", "agentSpec", "agentspec"): + candidate = scope.get(key) + if isinstance(candidate, dict): + inline_specs.append(candidate) + + def _pick(field: str, camel: str) -> str: + candidates: list[Any] = [] + for spec in inline_specs: + candidates.extend([spec.get(field), spec.get(camel)]) + for scope in (config, summary, *run_summaries): + if isinstance(scope, dict): + candidates.extend( + [ + scope.get(f"agent_spec_{field}"), + scope.get(f"agentSpec{camel[0].upper()}{camel[1:]}"), + ] + ) + return _first_str(*candidates) + + tags_candidates: list[Any] = [] + for spec in inline_specs: + tags_candidates.extend([spec.get("tags")]) + for scope in (config, summary, *run_summaries): + if isinstance(scope, dict): + tags_candidates.extend( + [scope.get("agent_spec_tags"), scope.get("agentSpecTags")] + ) + tags: list[str] = [] + for candidate in tags_candidates: + tags = _normalize_tags(candidate) + if tags: + break + + return { + "description": _pick("description", "description"), + "version": _pick("version", "version"), + "model": _pick("model", "model"), + "icon": _pick("icon", "icon"), + "emoji": _pick("emoji", "emoji"), + "color": _pick("color", "color"), + "tags": tags, + } + + +def _merge_agentspec_details(target: dict[str, Any], details: dict[str, Any]) -> None: + """Merge non-empty agentspec detail fields into the aggregate record.""" + for key in ("description", "version", "model", "icon", "emoji", "color"): + value = details.get(key) + if isinstance(value, str) and value.strip() and not str(target.get(key) or "").strip(): + target[key] = value.strip() + incoming_tags = details.get("tags") + if isinstance(incoming_tags, list) and incoming_tags: + existing = target.get("tags") + if not isinstance(existing, list) or not existing: + target["tags"] = list(incoming_tags) + + +_AGENTSPEC_REGISTRY_LOOKUP: Any = 
None +_AGENTSPEC_REGISTRY_LOADED = False +_AGENTSPEC_REGISTRY_MAP: dict[str, Any] | None = None + + +def _load_agentspec_registry() -> tuple[dict[str, Any], Any]: + """Load the agent_runtimes agentspec catalog once and cache it. + + Returns a tuple of ``(catalog_by_id, get_agent_spec)`` where the first is a + mapping built from ``list_agentspecs`` (the Python equivalent of the + in-app ``listAgentspecs``) keyed by both the full id and the id without a + trailing ``:version`` segment, and the second is a per-id lookup callable. + Either component may be empty/``None`` when ``agent_runtimes`` (or a given + API surface) is unavailable, so the report degrades gracefully. + """ + global _AGENTSPEC_REGISTRY_LOADED, _AGENTSPEC_REGISTRY_LOOKUP, _AGENTSPEC_REGISTRY_MAP + if _AGENTSPEC_REGISTRY_LOADED: + return (_AGENTSPEC_REGISTRY_MAP or {}, _AGENTSPEC_REGISTRY_LOOKUP) + _AGENTSPEC_REGISTRY_LOADED = True + + module = None + for module_name in ("agent_runtimes.specs.agents", "agent_runtimes"): + try: + module = __import__(module_name, fromlist=["*"]) + break + except Exception: + module = None + if module is None: + return ({}, None) + + # Per-id lookup (handles version suffixes) is available on both the new + # and legacy package layouts. + _AGENTSPEC_REGISTRY_LOOKUP = getattr(module, "get_agent_spec", None) + + # Build a full catalog map from the list accessor, mirroring the UI which + # calls listAgentspecs() and indexes the result by id. 
+ catalog: dict[str, Any] = {} + list_fn = getattr(module, "list_agentspecs", None) or getattr( + module, "list_agent_specs", None + ) + specs: list[Any] = [] + if callable(list_fn): + try: + specs = list(list_fn() or []) + except Exception: + specs = [] + if not specs: + registry = getattr(module, "AGENTSPECS", None) or getattr( + module, "AGENT_SPECS", None + ) + if isinstance(registry, dict): + specs = list(registry.values()) + for spec in specs: + spec_id = str(getattr(spec, "id", "") or "").strip() + if not spec_id: + continue + catalog[spec_id] = spec + base = spec_id.rpartition(":")[0] + if base and base not in catalog: + catalog[base] = spec + _AGENTSPEC_REGISTRY_MAP = catalog + return (catalog, _AGENTSPEC_REGISTRY_LOOKUP) + + +def _agentspec_registry_details(agent_spec_id: str) -> dict[str, Any]: + """Look up rich agentspec metadata from the agent_runtimes catalog. + + Uses the bundled agentspecification registry (the Python equivalent of + the in-app ``listAgentspecs``) to enrich the report with the canonical + name, description, version, model, tags, and display metadata for an + agentspec id. Returns an empty dict when the catalog or id is + unavailable, so the report still works without ``agent_runtimes``. 
+ """ + value = str(agent_spec_id or "").strip() + if not value: + return {} + catalog, lookup = _load_agentspec_registry() + spec = catalog.get(value) + if spec is None: + base = value.rpartition(":")[0] + if base: + spec = catalog.get(base) + if spec is None and callable(lookup): + try: + spec = lookup(value) + except Exception: + spec = None + if spec is None: + return {} + + def _attr(*names: str) -> str: + for name in names: + candidate = getattr(spec, name, None) + if isinstance(candidate, str) and candidate.strip(): + return candidate.strip() + return "" + + return { + "name": _attr("name"), + "description": _attr("description"), + "version": _attr("version"), + "model": _attr("model"), + "icon": _attr("icon"), + "emoji": _attr("emoji"), + "color": _attr("color"), + "tags": _normalize_tags(getattr(spec, "tags", None)), + } + + +def _aggregate_evaluator_results( + samples: list[dict[str, Any]], +) -> list[dict[str, Any]]: + """Aggregate per-run global-evaluator results across an evalset's runs. + + ``samples`` are the ``evaluator_results`` entries collected from each run's + metrics (newest first). The aggregate exposes the mean score, how many runs + passed, and the latest run's outcome for each ``(name, scope)`` evaluator so + the report can show results, not just evaluator configuration. 
def _report_data(
    client: DatalayerClient,
    evalset_id: str,
    run_limit: int,
    account_uid: Optional[str],
) -> dict[str, Any]:
    """Collect the raw data and derived statistics for one evalset report.

    Looks up the evalset record, lists its experiments, fetches up to
    ``run_limit`` runs per experiment, and derives per-experiment pass-rate
    statistics: baseline/latest/drift, a comparison of the first two fetched
    runs, deltas between consecutive runs, and the mean and population
    standard deviation of the pass rates.

    Args:
        client: Object exposing ``evals_list_evals``, ``evals_list_experiments``,
            ``evals_list_runs`` and ``evals_compare_runs``.
        evalset_id: Identifier of the evalset to report on.
        run_limit: Maximum number of runs requested per experiment.
        account_uid: Forwarded unchanged to every client call.

    Returns:
        A dict with the keys ``evalset_id``, ``evalset_name``,
        ``run_environment``, ``generated_at``, ``agentspecs``,
        ``evalset_evaluators``, ``report_evaluators``, ``cases``,
        ``report_analyses``, ``experiments`` and ``evaluator_results``.
    """
    # NOTE(review): both listings below request a single page of 200 items;
    # nothing here follows pagination, so larger result sets are truncated.
    evalset_record: dict[str, Any] = {}
    evalsets_payload = client.evals_list_evals(
        q=evalset_id,
        limit=200,
        offset=0,
        account_uid=account_uid,
    )
    for item in (evalsets_payload.get("evalsets") or []):
        if isinstance(item, dict) and str(item.get("id") or "") == evalset_id:
            evalset_record = item
            break

    experiments_payload = client.evals_list_experiments(
        evalset_id=evalset_id,
        limit=200,
        offset=0,
        account_uid=account_uid,
    )
    # Guard experiments the same way every other payload list is guarded, so a
    # malformed entry is skipped instead of raising AttributeError on `.get`.
    experiments = [
        item
        for item in (experiments_payload.get("experiments") or [])
        if isinstance(item, dict)
    ]

    report: dict[str, Any] = {
        "evalset_id": evalset_id,
        "evalset_name": str(evalset_record.get("name") or ""),
        "run_environment": str(evalset_record.get("run_environment") or ""),
        "generated_at": _now_iso(),
        "agentspecs": [],
        "evalset_evaluators": [
            item
            for item in (evalset_record.get("evalset_evaluators") or [])
            if isinstance(item, dict)
        ],
        "report_evaluators": [
            item
            for item in (evalset_record.get("report_evaluators") or [])
            if isinstance(item, dict)
        ],
        "cases": [
            case for case in (evalset_record.get("cases") or []) if isinstance(case, dict)
        ],
        "report_analyses": [],
        "experiments": [],
    }
    agentspec_by_id: dict[str, dict[str, Any]] = {}
    evaluator_result_samples: list[dict[str, Any]] = []

    for experiment in experiments:
        # An explicit ``None`` must not turn into the literal string "None".
        raw_id = experiment.get("id")
        experiment_id = "" if raw_id is None else str(raw_id)
        raw_name = experiment.get("name")
        experiment_name = experiment_id if raw_name is None else str(raw_name)

        runs_payload = client.evals_list_runs(
            experiment_id,
            limit=run_limit,
            offset=0,
            account_uid=account_uid,
        )
        # Everything below reads runs with `.get`, so keep only dict entries.
        runs = [
            run for run in (runs_payload.get("runs") or []) if isinstance(run, dict)
        ]
        for run in runs:
            metrics = run.get("metrics")
            if not isinstance(metrics, dict):
                continue
            samples = metrics.get("evaluator_results")
            if isinstance(samples, list):
                evaluator_result_samples.extend(
                    sample for sample in samples if isinstance(sample, dict)
                )
        agent_spec_id, agent_spec_name = _extract_experiment_agentspec(experiment, runs)
        registry_details = (
            _agentspec_registry_details(agent_spec_id) if agent_spec_id else {}
        )
        registry_name = str(registry_details.get("name") or "").strip()
        if registry_name:
            agent_spec_name = registry_name
        if agent_spec_id and agent_spec_id not in agentspec_by_id:
            agentspec_by_id[agent_spec_id] = {
                "id": agent_spec_id,
                "name": agent_spec_name or agent_spec_id,
                "experiments": 0,
                "runs": 0,
                "experiment_names": [],
            }
        if agent_spec_id:
            record = agentspec_by_id[agent_spec_id]
            record["experiments"] += 1
            record["runs"] += len(runs)
            if experiment_name:
                names = record.setdefault("experiment_names", [])
                if experiment_name not in names:
                    names.append(experiment_name)
            # The agent_runtimes catalog is authoritative; fall back to any
            # metadata embedded in the experiment/run payloads for fields the
            # catalog does not provide (or when the catalog is unavailable).
            _merge_agentspec_details(record, registry_details)
            _merge_agentspec_details(
                record,
                _extract_experiment_agentspec_details(experiment, runs),
            )
        total_runs = int(runs_payload.get("total") or len(runs))
        baseline, latest, drift = _compute_baseline_and_drift(runs)

        latest_two_delta: float | None = None
        latest_two_run_ids: list[str] = []
        latest_two_compare: dict[str, Any] | None = None
        if len(runs) >= 2:
            latest_two_run_ids = [str(runs[0].get("id", "")), str(runs[1].get("id", ""))]
            compare_payload = client.evals_compare_runs(
                latest_two_run_ids,
                account_uid=account_uid,
            )
            compared_runs = compare_payload.get("runs") or []
            compared_by_id = {
                str(run.get("id", "")): run
                for run in compared_runs
                if isinstance(run, dict)
            }
            # Prefer the records returned by the compare call; fall back to the
            # listed runs when the compare payload omits one of the two ids.
            run_a = compared_by_id.get(latest_two_run_ids[0], runs[0])
            run_b = compared_by_id.get(latest_two_run_ids[1], runs[1])
            pass_a = _run_pass_rate(run_a)
            pass_b = _run_pass_rate(run_b)
            if pass_a is not None and pass_b is not None:
                latest_two_delta = pass_a - pass_b
            latest_two_compare = {
                "run_ids": latest_two_run_ids,
                "run_a": _run_detail_record(run_a),
                "run_b": _run_detail_record(run_b),
                "delta_pass_rate": latest_two_delta,
            }

        consecutive_comparisons: list[dict[str, Any]] = []
        for idx in range(max(0, len(runs) - 1)):
            run_a = runs[idx]
            run_b = runs[idx + 1]
            pass_a = _run_pass_rate(run_a)
            pass_b = _run_pass_rate(run_b)
            delta = None
            if pass_a is not None and pass_b is not None:
                delta = pass_a - pass_b
            consecutive_comparisons.append(
                {
                    "run_a_id": str(run_a.get("id", "")),
                    "run_b_id": str(run_b.get("id", "")),
                    "run_a_status": str(run_a.get("status", "")),
                    "run_b_status": str(run_b.get("status", "")),
                    "run_a_pass_rate": pass_a,
                    "run_b_pass_rate": pass_b,
                    "delta_pass_rate": delta,
                }
            )

        # Evaluate each run's pass rate once and keep only the numeric ones.
        numeric_pass_rates = [
            float(rate)
            for rate in (_run_pass_rate(run) for run in runs)
            if isinstance(rate, (int, float))
        ]
        mean_pass = (
            sum(numeric_pass_rates) / len(numeric_pass_rates)
            if numeric_pass_rates
            else None
        )
        stddev_pass = None
        if numeric_pass_rates:
            # Population variance: divide by N, not N - 1.
            variance = sum(
                (value - mean_pass) ** 2 for value in numeric_pass_rates
            ) / len(numeric_pass_rates)
            stddev_pass = math.sqrt(variance)

        report["experiments"].append(
            {
                "id": experiment_id,
                "name": experiment_name,
                "runs_total": total_runs,
                "runs_fetched": len(runs),
                "agent_spec_id": agent_spec_id,
                "agent_spec_name": agent_spec_name,
                "latest_pass_rate": latest,
                "baseline_pass_rate": baseline,
                "drift_delta": drift,
                "latest_two_run_ids": latest_two_run_ids,
                "latest_two_delta": latest_two_delta,
                "latest_two_comparison": latest_two_compare,
                "mean_pass_rate": mean_pass,
                "stddev_pass_rate": stddev_pass,
                "runs": [_run_detail_record(run) for run in runs],
                "consecutive_comparisons": consecutive_comparisons,
                "report_analyses": _build_experiment_report_analyses(
                    runs,
                    consecutive_comparisons,
                    baseline=baseline,
                    latest=latest,
                    drift=drift,
                    latest_two_delta=latest_two_delta,
                    mean_pass=mean_pass,
                    stddev_pass=stddev_pass,
                ),
            }
        )
    report["agentspecs"] = list(agentspec_by_id.values())
    report["evaluator_results"] = _aggregate_evaluator_results(evaluator_result_samples)
    report["report_analyses"] = _build_evalset_report_analyses(report["experiments"])
    return report
width: int = 28, + *, + full_blocks: bool = True, + colorize: bool = False, +) -> str: + if value is None: + return "-" + bounded = max(0.0, min(1.0, float(value))) + filled = int(round(bounded * width)) + fill_char = "█" if full_blocks else "#" + empty_char = "░" if full_blocks else "." + filled_part = fill_char * filled + empty_part = empty_char * (width - filled) + if not colorize: + return filled_part + empty_part + if bounded >= 0.85: + style = "green" + elif bounded >= 0.75: + style = "yellow" + else: + style = "red" + return _style_text(filled_part, style, True) + _style_text(empty_part, "grey39", True) + + +def _fmt_pts(value: float) -> str: + return f"{value * 100:.1f}" + + +def _ascii_histogram( + values: list[float], + *, + bins: int = 8, + width: int = 22, + min_value: float | None = None, + max_value: float | None = None, + full_blocks: bool = True, + colorize: bool = False, + drift_palette: bool = False, +) -> list[str]: + if not values: + return ["n/a"] + + lo = min_value if isinstance(min_value, (int, float)) else min(values) + hi = max_value if isinstance(max_value, (int, float)) else max(values) + if hi <= lo: + hi = lo + 1e-9 + + bins = max(2, bins) + counts = [0 for _ in range(bins)] + span = hi - lo + for value in values: + ratio = (value - lo) / span + idx = int(ratio * bins) + idx = max(0, min(bins - 1, idx)) + counts[idx] += 1 + + peak = max(counts) if counts else 1 + fill_char = "█" if full_blocks else "#" + empty_char = "░" if full_blocks else "." 
+ lines: list[str] = [] + for idx, count in enumerate(counts): + left = lo + (span * idx / bins) + right = lo + (span * (idx + 1) / bins) + filled = int(round((count / peak) * width)) if peak > 0 else 0 + filled_part = fill_char * filled + empty_part = empty_char * (width - filled) + if colorize: + if drift_palette: + if right <= 0: + bar_style = "red" + elif left >= 0: + bar_style = "green" + else: + bar_style = "yellow" + elif peak > 0 and count / peak >= 0.67: + bar_style = "cyan" + elif peak > 0 and count / peak >= 0.34: + bar_style = "blue" + else: + bar_style = "magenta" + bar = _style_text(filled_part, bar_style, True) + _style_text(empty_part, "grey39", True) + else: + bar = filled_part + empty_part + lines.append( + f"{_fmt_pts(left):>6} to {_fmt_pts(right):>6} pts |{bar}| {count}" + ) + return lines + + +def _fmt_delta(value: float | None, *, colorize: bool = False) -> str: + if value is None: + return "n/a" + rendered = f"{value * 100:+.1f} pts" + if value > 0: + return f"🟢 {_style_text(rendered, 'green', colorize)}" + if value < 0: + return f"🔴 {_style_text(rendered, 'red', colorize)}" + return f"⚪ {_style_text(rendered, 'yellow', colorize)}" + + +def _sparkline(values: list[float], *, colorize: bool = False) -> str: + if not values: + return "n/a" + ticks = "▁▂▃▄▅▆▇█" + lo = min(values) + hi = max(values) + if hi <= lo: + base = ticks[-2] * len(values) + else: + span = hi - lo + chars = [] + for value in values: + idx = int(round(((value - lo) / span) * (len(ticks) - 1))) + idx = max(0, min(len(ticks) - 1, idx)) + chars.append(ticks[idx]) + base = "".join(chars) + if not colorize: + return base + if values[-1] >= 0.85: + style = "green" + elif values[-1] >= 0.75: + style = "yellow" + else: + style = "red" + return _style_text(base, style, True) + + +def _clamp_unit(value: float) -> float: + return max(0.0, min(1.0, value)) + + +def _heat_char(value: float) -> str: + shades = "░▒▓█" + bounded = _clamp_unit(value) + idx = int(round(bounded * (len(shades) 
- 1))) + return shades[idx] + + +def _fit_label(text: str, width: int = 20) -> str: + raw = str(text or "") + if len(raw) <= width: + return raw.ljust(width) + if width <= 3: + return raw[:width] + return (raw[: width - 3] + "...") + + +def _ascii_passrate_heatmap( + experiments: list[dict[str, Any]], + *, + max_columns: int = 12, + colorize: bool = False, +) -> list[str]: + if not experiments: + return ["n/a"] + + max_columns = max(1, max_columns) + header = f"{'Experiment':<20} | " + " ".join( + f"r{idx:02d}" for idx in range(1, max_columns + 1) + ) + lines = [header, "-" * len(header)] + + for experiment in experiments: + runs = [run for run in (experiment.get("runs") or []) if isinstance(run, dict)] + cells: list[str] = [] + for idx in range(max_columns): + value: float | None = None + if idx < len(runs): + raw = runs[idx].get("pass_rate") + if isinstance(raw, (int, float)): + value = float(raw) + if value is None: + cells.append("·") + continue + + token = _heat_char(value) + if colorize: + if value >= 0.85: + token = _style_text(token, "green", True) + elif value >= 0.75: + token = _style_text(token, "yellow", True) + else: + token = _style_text(token, "red", True) + cells.append(token) + + label = _fit_label(str(experiment.get("name", "")), width=20) + lines.append(f"{label} | " + " ".join(cells)) + + lines.append("Legend: low='░' .. 
high='█' (r01=latest fetched run, '·'=no run)") + return lines + + +def _ascii_drift_heatmap( + experiments: list[dict[str, Any]], + *, + max_columns: int = 12, + colorize: bool = False, +) -> list[str]: + if not experiments: + return ["n/a"] + + max_columns = max(1, max_columns) + header = f"{'Experiment':<20} | " + " ".join( + f"d{idx:02d}" for idx in range(1, max_columns + 1) + ) + lines = [header, "-" * len(header)] + + for experiment in experiments: + comparisons = [ + item for item in (experiment.get("consecutive_comparisons") or []) + if isinstance(item, dict) + ] + cells: list[str] = [] + for idx in range(max_columns): + delta: float | None = None + if idx < len(comparisons): + raw = comparisons[idx].get("delta_pass_rate") + if isinstance(raw, (int, float)): + delta = float(raw) + if delta is None: + cells.append("··") + continue + + sign = "+" if delta >= 0 else "-" + magnitude = _heat_char(abs(delta)) + token = f"{sign}{magnitude}" + if colorize: + if delta > 0: + token = _style_text(token, "green", True) + elif delta < 0: + token = _style_text(token, "red", True) + else: + token = _style_text(token, "yellow", True) + cells.append(token) + + label = _fit_label(str(experiment.get("name", "")), width=20) + lines.append(f"{label} | " + " ".join(cells)) + + lines.append("Legend: dNN are consecutive deltas (A-B), sign shows direction, magnitude uses '░'..'█', '··'=no comparison") + return lines + + +def _pairwise_latest_deltas(experiments: list[dict[str, Any]]) -> list[dict[str, Any]]: + pairs: list[dict[str, Any]] = [] + for idx, left in enumerate(experiments): + left_latest = left.get("latest_pass_rate") + if not isinstance(left_latest, (int, float)): + continue + left_agent_spec_id = str(left.get("agent_spec_id") or "") + left_agent_spec_name = str(left.get("agent_spec_name") or left_agent_spec_id or "") + for right in experiments[idx + 1 :]: + right_latest = right.get("latest_pass_rate") + if not isinstance(right_latest, (int, float)): + continue + 
right_agent_spec_id = str(right.get("agent_spec_id") or "") + right_agent_spec_name = str(right.get("agent_spec_name") or right_agent_spec_id or "") + comparison_group = ( + "within_agentspec" + if left_agent_spec_id and right_agent_spec_id and left_agent_spec_id == right_agent_spec_id + else "cross_agentspec" + ) + pairs.append( + { + "left_id": str(left.get("id", "")), + "left": str(left.get("name", "")), + "left_agent_spec_id": left_agent_spec_id, + "left_agent_spec_name": left_agent_spec_name, + "right_id": str(right.get("id", "")), + "right": str(right.get("name", "")), + "right_agent_spec_id": right_agent_spec_id, + "right_agent_spec_name": right_agent_spec_name, + "left_latest": float(left_latest), + "right_latest": float(right_latest), + "delta": float(left_latest) - float(right_latest), + "group": comparison_group, + } + ) + pairs.sort(key=lambda item: abs(item["delta"]), reverse=True) + return pairs + + +def _markdown_table(headers: list[str], rows: list[list[str]], aligns: list[str]) -> list[str]: + widths = [len(header) for header in headers] + for row in rows: + for idx, cell in enumerate(row): + widths[idx] = max(widths[idx], len(cell)) + + def _pad(cell: str, width: int, align: str) -> str: + if align == "right": + return cell.rjust(width) + return cell.ljust(width) + + header_line = "| " + " | ".join(headers[idx].ljust(widths[idx]) for idx in range(len(headers))) + " |" + + sep_parts: list[str] = [] + for idx, align in enumerate(aligns): + width = max(3, widths[idx]) + if align == "right": + sep_parts.append("-" * (width - 1) + ":") + else: + sep_parts.append(":" + "-" * (width - 1)) + sep_line = "| " + " | ".join(sep_parts) + " |" + + body_lines = [ + "| " + " | ".join(_pad(row[idx], widths[idx], aligns[idx]) for idx in range(len(headers))) + " |" + for row in rows + ] + return [header_line, sep_line, *body_lines] + + +def _compact_json(value: Any, max_len: int = 140) -> str: + if value is None: + return "-" + if isinstance(value, str): + text = 
value + else: + try: + text = json.dumps(value, ensure_ascii=False, separators=(",", ":"), sort_keys=True) + except Exception: + text = str(value) + text = " ".join(text.split()) + if len(text) <= max_len: + return text + return text[: max_len - 3] + "..." + + +def _aggregate_case_outcomes( + experiments: list[dict[str, Any]], +) -> tuple[dict[str, dict[str, Any]], list[str]]: + """Aggregate per-case pass/score stats across every fetched run. + + Returns ``(case_stats, agentspec_names)`` where ``case_stats`` maps a case + name to ``{"runs", "passed", "score_sum", "score_count", "by_spec"}`` and + ``by_spec`` maps an agentspec label to ``{"runs", "passed"}``. + """ + case_stats: dict[str, dict[str, Any]] = {} + case_order: list[str] = [] + agentspec_names: list[str] = [] + for experiment in experiments: + spec_label = str( + experiment.get("agent_spec_name") + or experiment.get("agent_spec_id") + or "-" + ) + if spec_label not in agentspec_names: + agentspec_names.append(spec_label) + for run in experiment.get("runs") or []: + if not isinstance(run, dict): + continue + metrics = run.get("metrics") if isinstance(run.get("metrics"), dict) else {} + case_results = metrics.get("case_results") + if not isinstance(case_results, list): + continue + for case_result in case_results: + if not isinstance(case_result, dict): + continue + name = str(case_result.get("name") or "-") + if name not in case_stats: + case_stats[name] = { + "runs": 0, + "passed": 0, + "score_sum": 0.0, + "score_count": 0, + "by_spec": {}, + } + case_order.append(name) + stat = case_stats[name] + passed = bool(case_result.get("passed")) + stat["runs"] += 1 + stat["passed"] += 1 if passed else 0 + score = case_result.get("score") + if isinstance(score, (int, float)): + stat["score_sum"] += float(score) + stat["score_count"] += 1 + spec_entry = stat["by_spec"].setdefault( + spec_label, {"runs": 0, "passed": 0} + ) + spec_entry["runs"] += 1 + spec_entry["passed"] += 1 if passed else 0 + ordered = {name: 
case_stats[name] for name in case_order} + return ordered, agentspec_names + + +def _per_case_outcome_lines( + experiments: list[dict[str, Any]], + *, + colorize: bool = False, +) -> list[str]: + """Render the per-case outcomes section (pass rate per case across runs).""" + lines: list[str] = [] + lines.append("## Per-Case Outcomes") + lines.append("") + case_stats, agentspec_names = _aggregate_case_outcomes(experiments) + if not case_stats: + lines.append( + "No per-case results were recorded on the fetched runs. Runs that " + "store `case_results` in their metrics populate this section." + ) + lines.append("") + return lines + + lines.append( + "Pass rate for each case across every fetched run (all experiments and " + "agentspecs combined). This reveals which cases are reliable and which " + "ones regress, instead of only the aggregate run pass rate." + ) + lines.append("") + overall_rows: list[list[str]] = [] + for name, stat in case_stats.items(): + runs = int(stat["runs"]) + passed = int(stat["passed"]) + pass_rate = (passed / runs) if runs else None + avg_score = ( + stat["score_sum"] / stat["score_count"] if stat["score_count"] else None + ) + overall_rows.append( + [ + name, + str(runs), + f"{passed}/{runs}", + _fmt_pct(pass_rate), + "n/a" if avg_score is None else f"{avg_score:.3f}", + ] + ) + lines.extend( + _markdown_table( + ["Case", "Runs", "Passed", "Pass Rate", "Avg Score"], + overall_rows, + ["left", "right", "right", "right", "right"], + ) + ) + lines.append("") + + if len(agentspec_names) > 1: + lines.append("### Per-Case Pass Rate By Agentspec") + lines.append("") + lines.append( + "Compare how each case performs across agentspecs (for example " + "codemode vs no-codemode)." 
+ ) + lines.append("") + spec_rows: list[list[str]] = [] + for name, stat in case_stats.items(): + row = [name] + for spec_label in agentspec_names: + spec_entry = stat["by_spec"].get(spec_label) + if not spec_entry or not spec_entry.get("runs"): + row.append("n/a") + continue + spec_pass = spec_entry["passed"] / spec_entry["runs"] + row.append(_fmt_pct(spec_pass)) + spec_rows.append(row) + lines.extend( + _markdown_table( + ["Case", *agentspec_names], + spec_rows, + ["left", *["right"] * len(agentspec_names)], + ) + ) + lines.append("") + + return lines + + +def _report_analyses_lines(report: dict[str, Any]) -> list[str]: + lines: list[str] = [] + analyses = [ + item for item in (report.get("report_analyses") or []) if isinstance(item, dict) + ] + lines.append("## Appendix: Structured Report Analyses") + lines.append("") + lines.append( + "The JSON block below is rendered directly from the top-level " + "`report_analyses` payload." + ) + lines.append("") + lines.append("```json") + lines.append(json.dumps(analyses, ensure_ascii=False, indent=2, sort_keys=True)) + lines.append("```") + lines.append("") + return lines + + +def _report_markdown(report: dict[str, Any], run_limit: int, *, colorize: bool = False) -> str: + evalset_id = str(report.get("evalset_id", "")) + run_environment = str(report.get("run_environment") or "") + generated_at = str(report.get("generated_at", "")) + experiments = [item for item in (report.get("experiments") or []) if isinstance(item, dict)] + agentspecs = [item for item in (report.get("agentspecs") or []) if isinstance(item, dict)] + cases = [item for item in (report.get("cases") or []) if isinstance(item, dict)] + case_by_name: dict[str, dict[str, Any]] = {} + representative_case_name: str | None = None + for case in cases: + name = str(case.get("name") or "") + if not name: + continue + if representative_case_name is None: + representative_case_name = name + if name not in case_by_name: + case_by_name[name] = case + evalset_runs_url = 
_evalset_runs_url(evalset_id, run_environment) + + lines: list[str] = [] + # Verbose, drill-down content is collected here and emitted under a single + # "# Appendices" heading at the end so the body stays scannable. + appendix_lines: list[str] = [] + lines.append(f"# Evals Report: {evalset_id}") + lines.append("") + lines.append(f"- Generated at: {generated_at}") + lines.append(f"- Experiments: {len(experiments)}") + lines.append(f"- Agentspecs: {len(agentspecs)}") + lines.append(f"- Cases: {len(cases)}") + lines.append( + f"- Evalset evaluators: {len([item for item in (report.get('evalset_evaluators') or []) if isinstance(item, dict)])}" + ) + lines.append( + f"- Report evaluators: {len([item for item in (report.get('report_evaluators') or []) if isinstance(item, dict)])}" + ) + lines.append(f"- Run window per experiment: {run_limit}") + if evalset_runs_url: + lines.append(f"- Evalset run details: [Open in Datalayer]({evalset_runs_url})") + lines.append("") + lines.append( + "> The body summarises results (evaluators, per-case outcomes, drift, and " + "comparisons). Full configuration, cases, heatmaps, per-experiment timelines, " + "and per-run details are in the appendices at the end." 
+ ) + lines.append("") + + lines.append("## Agentspec Coverage") + lines.append("") + if agentspecs: + agentspec_rows: list[list[str]] = [] + for item in agentspecs: + agent_spec_id = str(item.get("id") or "") + agent_spec_link = _agentspec_details_url(agent_spec_id) + agentspec_rows.append( + [ + agent_spec_id, + str(item.get("name") or item.get("id") or ""), + str(item.get("model") or "-"), + str(item.get("version") or "-"), + str(int(item.get("experiments") or 0)), + str(int(item.get("runs") or 0)), + f"[Open]({agent_spec_link})" if agent_spec_link else "-", + ] + ) + lines.extend( + _markdown_table( + [ + "Agentspec ID", + "Agentspec", + "Model", + "Version", + "Experiments", + "Runs", + "Details", + ], + agentspec_rows, + ["left", "left", "left", "left", "right", "right", "left"], + ) + ) + lines.append("") + appendix_lines.append("## Appendix: Agentspec Details") + appendix_lines.append("") + for item in agentspecs: + agent_spec_id = str(item.get("id") or "") + agent_spec_link = _agentspec_details_url(agent_spec_id) + display_name = str(item.get("name") or agent_spec_id or "-") + emoji = str(item.get("emoji") or "").strip() + heading = f"{emoji} {display_name}".strip() + appendix_lines.append(f"### {heading}") + appendix_lines.append("") + appendix_lines.append(f"- ID: `{agent_spec_id or '-'}`") + description = str(item.get("description") or "").strip() + if description: + appendix_lines.append(f"- Description: {description}") + model = str(item.get("model") or "").strip() + if model: + appendix_lines.append(f"- Model: {model}") + version = str(item.get("version") or "").strip() + if version: + appendix_lines.append(f"- Version: {version}") + color = str(item.get("color") or "").strip() + if color: + appendix_lines.append(f"- Color: {color}") + tags = item.get("tags") + if isinstance(tags, list) and tags: + appendix_lines.append(f"- Tags: {', '.join(str(tag) for tag in tags)}") + experiment_names = item.get("experiment_names") + if 
isinstance(experiment_names, list) and experiment_names: + appendix_lines.append( + f"- Experiments ({len(experiment_names)}): " + + ", ".join(str(name) for name in experiment_names) + ) + appendix_lines.append( + f"- Runs analysed: {int(item.get('runs') or 0)}" + ) + if agent_spec_link: + appendix_lines.append(f"- Details: [Open in Datalayer]({agent_spec_link})") + appendix_lines.append("") + else: + lines.append("No agentspec metadata found in experiment/run payloads.") + lines.append("") + + evalset_evaluators = [ + item for item in (report.get("evalset_evaluators") or []) if isinstance(item, dict) + ] + report_evaluators = [ + item for item in (report.get("report_evaluators") or []) if isinstance(item, dict) + ] + evaluator_results = [ + item for item in (report.get("evaluator_results") or []) if isinstance(item, dict) + ] + lines.append("## Evaluator Results") + lines.append("") + lines.append( + "Evalset-scoped evaluators run for each case; report-scoped evaluators run once " + "over the aggregated report. Scores are aggregated across all runs in the run " + "window (full evaluator configuration is in the appendix)." + ) + lines.append("") + if evaluator_results: + lines.append( + "Runs Passed counts runs where the evaluator passed; Latest reflects the " + "most recent run." 
+ ) + lines.append("") + result_rows: list[list[str]] = [] + for item in evaluator_results: + runs_count = int(item.get("runs") or 0) + passed_runs = int(item.get("passed_runs") or 0) + mean_score = item.get("mean_score") + latest_score = item.get("latest_score") + result_rows.append( + [ + str(item.get("name") or "-"), + str(item.get("scope") or "-"), + f"{passed_runs}/{runs_count}", + f"{float(mean_score):.3f}" if isinstance(mean_score, (int, float)) else "-", + f"{float(latest_score):.3f}" if isinstance(latest_score, (int, float)) else "-", + "pass" if item.get("latest_passed") else "fail", + str(item.get("summary") or "-"), + ] + ) + lines.extend( + _markdown_table( + ["Evaluator", "Scope", "Runs Passed", "Mean Score", "Latest Score", "Latest", "Summary"], + result_rows, + ["left", "left", "right", "right", "right", "left", "left"], + ) + ) + else: + lines.append( + "No evaluator results recorded. Runs that store `evaluator_results` " + "in their metrics populate this section." + ) + lines.append("") + + appendix_lines.append("## Appendix: Evaluator Configuration") + appendix_lines.append("") + appendix_lines.append( + "Evalset-level evaluators run for each case; report-level evaluators run once " + "after all cases. Evaluator names are resolved at runtime via the Pydantic " + "evaluator registries." 
+ ) + appendix_lines.append("") + appendix_lines.append("### Evalset Evaluators") + appendix_lines.append("") + if evalset_evaluators: + appendix_lines.append("```json") + appendix_lines.append(json.dumps(evalset_evaluators, ensure_ascii=False, indent=2, sort_keys=True)) + appendix_lines.append("```") + else: + appendix_lines.append("No evalset-level evaluators configured.") + appendix_lines.append("") + appendix_lines.append("### Report Evaluators") + appendix_lines.append("") + if report_evaluators: + appendix_lines.append("```json") + appendix_lines.append(json.dumps(report_evaluators, ensure_ascii=False, indent=2, sort_keys=True)) + appendix_lines.append("```") + else: + appendix_lines.append("No report-level evaluators configured.") + appendix_lines.append("") + + appendix_lines.append("## Appendix: Evalset Cases") + appendix_lines.append("") + appendix_lines.append(f"{len(cases)} case(s) in this evalset.") + appendix_lines.append("") + if cases: + case_rows: list[list[str]] = [] + for case in cases: + expected_output = case.get("expected_output") + if expected_output is None: + expected_output = case.get("expected") + case_rows.append( + [ + str(case.get("name") or "-"), + str(case.get("id") or "-"), + _compact_json(case.get("inputs")), + _compact_json(expected_output), + _compact_json(case.get("evaluators")), + _compact_json(case.get("metadata")), + ] + ) + appendix_lines.extend( + _markdown_table( + ["Case", "ID", "Inputs", "Expected Output", "Evaluators", "Metadata"], + case_rows, + ["left", "left", "left", "left", "left", "left"], + ) + ) + else: + appendix_lines.append("No cases returned for this evalset.") + appendix_lines.append("") + + lines.extend(_per_case_outcome_lines(experiments, colorize=colorize)) + + lines.append("## Experiment Overview") + lines.append("") + overview_rows: list[list[str]] = [] + for experiment in experiments: + runs_fetched = int(experiment.get("runs_fetched") or 0) + runs_total = int(experiment.get("runs_total") or 0) + 
overview_rows.append( + [ + f"{experiment.get('name', '')}", + str(experiment.get('agent_spec_name') or experiment.get('agent_spec_id') or '-'), + f"{runs_fetched}/{runs_total}", + _fmt_pct(experiment.get('latest_pass_rate') if isinstance(experiment.get('latest_pass_rate'), (int, float)) else None), + _fmt_pct(experiment.get('baseline_pass_rate') if isinstance(experiment.get('baseline_pass_rate'), (int, float)) else None), + _fmt_delta(experiment.get('drift_delta') if isinstance(experiment.get('drift_delta'), (int, float)) else None, colorize=colorize), + _fmt_delta(experiment.get('latest_two_delta') if isinstance(experiment.get('latest_two_delta'), (int, float)) else None, colorize=colorize), + ] + ) + lines.extend( + _markdown_table( + ["Experiment", "Agentspec", "Runs (fetched/total)", "Latest", "Baseline", "Drift", "Latest-2 Delta"], + overview_rows, + ["left", "left", "right", "right", "right", "right", "right"], + ) + ) + lines.append("") + + appendix_lines.extend(_report_analyses_lines(report)) + + lines.append("## Comparison Combinations") + lines.append("") + + ranked_latest = sorted( + [item for item in experiments if isinstance(item.get("latest_pass_rate"), (int, float))], + key=lambda item: float(item.get("latest_pass_rate") or 0.0), + reverse=True, + ) + lines.append("### By Latest Pass Rate") + lines.append("") + latest_rows: list[list[str]] = [] + for idx, item in enumerate(ranked_latest, start=1): + latest_rows.append([str(idx), f"{item.get('name', '')}", _fmt_pct(float(item.get('latest_pass_rate') or 0.0))]) + lines.extend(_markdown_table(["Rank", "Experiment", "Latest"], latest_rows, ["right", "left", "right"])) + latest_values = [ + float(item.get("latest_pass_rate")) + for item in ranked_latest + if isinstance(item.get("latest_pass_rate"), (int, float)) + ] + lines.append("") + lines.append("Latest pass-rate histogram (pts):") + for hist_line in _ascii_histogram( + latest_values, + bins=8, + width=20, + min_value=0.0, + max_value=1.0, + 
full_blocks=True, + colorize=colorize, + ): + lines.append(f"`{hist_line}`") + lines.append("") + + ranked_drift = sorted( + [item for item in experiments if isinstance(item.get("drift_delta"), (int, float))], + key=lambda item: float(item.get("drift_delta") or 0.0), + ) + lines.append("### By Drift (Most Negative To Most Positive)") + lines.append("") + drift_rows: list[list[str]] = [] + for idx, item in enumerate(ranked_drift, start=1): + drift_rows.append([str(idx), f"{item.get('name', '')}", _fmt_delta(float(item.get('drift_delta') or 0.0), colorize=colorize)]) + lines.extend(_markdown_table(["Rank", "Experiment", "Drift"], drift_rows, ["right", "left", "right"])) + drift_values = [ + float(item.get("drift_delta")) + for item in ranked_drift + if isinstance(item.get("drift_delta"), (int, float)) + ] + lines.append("") + lines.append("Drift histogram (delta pts):") + for hist_line in _ascii_histogram( + drift_values, + bins=8, + width=20, + full_blocks=True, + colorize=colorize, + drift_palette=True, + ): + lines.append(f"`{hist_line}`") + lines.append("") + + ranked_stability = sorted( + [item for item in experiments if isinstance(item.get("stddev_pass_rate"), (int, float))], + key=lambda item: float(item.get("stddev_pass_rate") or 0.0), + ) + lines.append("### By Stability (Lowest Pass-Rate StdDev)") + lines.append("") + stability_rows: list[list[str]] = [] + for idx, item in enumerate(ranked_stability, start=1): + stddev = item.get("stddev_pass_rate") + mean = item.get("mean_pass_rate") + stability_rows.append( + [ + str(idx), + f"{item.get('name', '')}", + (f"{float(stddev) * 100:.2f} pts" if isinstance(stddev, (int, float)) else "n/a"), + (_fmt_pct(float(mean)) if isinstance(mean, (int, float)) else "n/a"), + ] + ) + lines.extend(_markdown_table(["Rank", "Experiment", "StdDev", "Mean"], stability_rows, ["right", "left", "right", "right"])) + lines.append("") + + pairwise = _pairwise_latest_deltas(experiments) + within_agentspec_pairs = [ + pair for pair in 
pairwise if str(pair.get("group") or "") == "within_agentspec" + ] + cross_agentspec_pairs = [ + pair for pair in pairwise if str(pair.get("group") or "") == "cross_agentspec" + ] + lines.append("### Pairwise Latest-Pass Deltas") + lines.append("") + pair_rows: list[list[str]] = [] + for pair in pairwise: + pair_rows.append( + [ + f"{pair['left']} vs {pair['right']}", + _fmt_pct(pair['left_latest']), + _fmt_pct(pair['right_latest']), + _fmt_delta(pair['delta'], colorize=colorize), + ] + ) + if not pairwise: + pair_rows.append(["n/a", "n/a", "n/a", "n/a"]) + lines.extend( + _markdown_table( + ["Pair", "Left Latest", "Right Latest", "Delta (Left-Right)"], + pair_rows, + ["left", "right", "right", "right"], + ) + ) + pair_deltas = [float(pair["delta"]) for pair in pairwise if isinstance(pair.get("delta"), (int, float))] + lines.append("") + lines.append("Pairwise latest-delta histogram (pts):") + for hist_line in _ascii_histogram( + pair_deltas, + bins=8, + width=20, + full_blocks=True, + colorize=colorize, + drift_palette=True, + ): + lines.append(f"`{hist_line}`") + lines.append("") + + lines.append("### Within-Agentspec Pairwise Latest-Pass Deltas") + lines.append("") + within_pair_rows: list[list[str]] = [] + for pair in within_agentspec_pairs: + within_pair_rows.append( + [ + f"{pair['left']} vs {pair['right']}", + str(pair.get('left_agent_spec_name') or pair.get('left_agent_spec_id') or '-'), + _fmt_pct(pair['left_latest']), + _fmt_pct(pair['right_latest']), + _fmt_delta(pair['delta'], colorize=colorize), + ] + ) + if not within_pair_rows: + within_pair_rows.append(["n/a", "n/a", "n/a", "n/a", "n/a"]) + lines.extend( + _markdown_table( + ["Pair", "Agentspec", "Left Latest", "Right Latest", "Delta (Left-Right)"], + within_pair_rows, + ["left", "left", "right", "right", "right"], + ) + ) + lines.append("") + + lines.append("### Cross-Agentspec Pairwise Latest-Pass Deltas") + lines.append("") + cross_pair_rows: list[list[str]] = [] + for pair in 
cross_agentspec_pairs: + cross_pair_rows.append( + [ + f"{pair['left']} ({pair.get('left_agent_spec_name') or pair.get('left_agent_spec_id') or '-'}) vs {pair['right']} ({pair.get('right_agent_spec_name') or pair.get('right_agent_spec_id') or '-'})", + _fmt_pct(pair['left_latest']), + _fmt_pct(pair['right_latest']), + _fmt_delta(pair['delta'], colorize=colorize), + ] + ) + if not cross_pair_rows: + cross_pair_rows.append(["n/a", "n/a", "n/a", "n/a"]) + lines.extend( + _markdown_table( + ["Pair", "Left Latest", "Right Latest", "Delta (Left-Right)"], + cross_pair_rows, + ["left", "right", "right", "right"], + ) + ) + lines.append("") + + appendix_lines.append("## Appendix: Heatmaps") + appendix_lines.append("") + appendix_lines.append("Pass-rate heatmap by experiment and run window:") + appendix_lines.append("") + appendix_lines.append("```text") + appendix_lines.extend(_ascii_passrate_heatmap(experiments, max_columns=12, colorize=False)) + appendix_lines.append("```") + appendix_lines.append("") + appendix_lines.append("Consecutive delta heatmap (A-B) by experiment:") + appendix_lines.append("") + appendix_lines.append("```text") + appendix_lines.extend(_ascii_drift_heatmap(experiments, max_columns=12, colorize=False)) + appendix_lines.append("```") + appendix_lines.append("") + + lines.append("### Insight Highlights") + lines.append("") + best_latest = ranked_latest[0] if ranked_latest else None + worst_latest = ranked_latest[-1] if ranked_latest else None + most_negative = ranked_drift[0] if ranked_drift else None + most_positive = ranked_drift[-1] if ranked_drift else None + most_stable = ranked_stability[0] if ranked_stability else None + if best_latest: + lines.append( + "- Top latest pass-rate: " + + f"{best_latest.get('name', '')} ({_fmt_pct(float(best_latest.get('latest_pass_rate') or 0.0))})." 
+ ) + if worst_latest: + lines.append( + "- Lowest latest pass-rate: " + + f"{worst_latest.get('name', '')} ({_fmt_pct(float(worst_latest.get('latest_pass_rate') or 0.0))})." + ) + if most_positive: + drift_pos = float(most_positive.get("drift_delta") or 0.0) + lines.append( + "- Strongest positive drift: " + + f"{most_positive.get('name', '')} ({_fmt_delta(drift_pos, colorize=colorize)})." + ) + if most_negative: + drift_neg = float(most_negative.get("drift_delta") or 0.0) + lines.append( + "- Strongest negative drift: " + + f"{most_negative.get('name', '')} ({_fmt_delta(drift_neg, colorize=colorize)})." + ) + if most_stable: + std = most_stable.get("stddev_pass_rate") + mean = most_stable.get("mean_pass_rate") + lines.append( + "- Stability leader: " + + f"{most_stable.get('name', '')} " + + f"(stddev={(float(std) * 100):.2f} pts, mean={_fmt_pct(float(mean)) if isinstance(mean, (int, float)) else 'n/a'})." + ) + + drift_neg_count = len([value for value in drift_values if value < 0]) + drift_flat_count = len([value for value in drift_values if value == 0]) + drift_pos_count = len([value for value in drift_values if value > 0]) + total = max(1, drift_neg_count + drift_flat_count + drift_pos_count) + neg_meter = "█" * int(round((drift_neg_count / total) * 14)) + flat_meter = "█" * int(round((drift_flat_count / total) * 14)) + pos_meter = "█" * int(round((drift_pos_count / total) * 14)) + neg_meter = neg_meter or "·" + flat_meter = flat_meter or "·" + pos_meter = pos_meter or "·" + lines.append("") + lines.append("Drift balance meter:") + lines.append( + "`NEG " + + _style_text(neg_meter, "red", colorize) + + f" ({drift_neg_count}) | FLAT " + + _style_text(flat_meter, "yellow", colorize) + + f" ({drift_flat_count}) | POS " + + _style_text(pos_meter, "green", colorize) + + f" ({drift_pos_count})`" + ) + lines.append("") + + appendix_lines.append("## Appendix: Per-Experiment Details") + appendix_lines.append("") + for experiment in experiments: + 
appendix_lines.append(f"### {experiment.get('name', '')}") + appendix_lines.append("") + agent_spec_id = str(experiment.get("agent_spec_id") or "") + agent_spec_label = str(experiment.get('agent_spec_name') or agent_spec_id or '-') + agent_spec_link = _agentspec_details_url(agent_spec_id) + if agent_spec_link: + appendix_lines.append(f"Agentspec: [{agent_spec_label}]({agent_spec_link})") + else: + appendix_lines.append(f"Agentspec: {agent_spec_label}") + if evalset_runs_url: + appendix_lines.append(f"Evalset run details: [Open run page]({evalset_runs_url})") + appendix_lines.append("") + appendix_lines.append("#### Run Timeline") + appendix_lines.append("") + run_rows: list[list[str]] = [] + runs = [run for run in (experiment.get("runs") or []) if isinstance(run, dict)] + for idx, run in enumerate(runs, start=1): + pass_rate = run.get("pass_rate") if isinstance(run.get("pass_rate"), (int, float)) else None + cause_text = _format_failure_cause(run.get("failure_cause")) + run_id = str(run.get('id', '')) + run_link = _run_overlay_url(evalset_runs_url, run_id) + run_rows.append( + [ + str(idx), + (f"[{run_id}]({run_link})" if run_link and run_id else run_id), + str(run.get('status', '')), + _fmt_pct(float(pass_rate)) if isinstance(pass_rate, (int, float)) else 'n/a', + f"`{_ascii_bar(float(pass_rate), full_blocks=True, colorize=colorize) if isinstance(pass_rate, (int, float)) else '-'}`", + cause_text or "-", + ] + ) + if not runs: + run_rows.append(["1", "n/a", "n/a", "n/a", "`-`", "-"]) + appendix_lines.extend(_markdown_table(["#", "Run ID", "Status", "Pass Rate", "ASCII Trend", "Failure Cause"], run_rows, ["right", "left", "left", "right", "left", "left"])) + appendix_lines.append("") + failure_rows: list[list[str]] = [] + for idx, run in enumerate(runs, start=1): + cause = run.get("failure_cause") + if not isinstance(cause, dict) or not cause: + continue + detail = str(cause.get("detail_excerpt") or "").strip() + detail_single = " ".join(detail.split()) + if 
len(detail_single) > 240: + detail_single = detail_single[:237] + "..." + failure_rows.append( + [ + str(idx), + str(run.get("id", "")), + str(cause.get("stage") or "-"), + str(cause.get("type") or "-"), + str(cause.get("message") or "-"), + detail_single or "-", + ] + ) + if failure_rows: + appendix_lines.append("#### Failure Causes") + appendix_lines.append("") + appendix_lines.extend( + _markdown_table( + ["#", "Run ID", "Stage", "Type", "Message", "Detail Excerpt"], + failure_rows, + ["right", "left", "left", "left", "left", "left"], + ) + ) + appendix_lines.append("") + for idx, run in enumerate(runs, start=1): + cause = run.get("failure_cause") + if not isinstance(cause, dict) or not cause: + continue + detail_lines = _failure_cause_detail_lines(cause) + if not detail_lines: + continue + appendix_lines.append(f"
Run {idx} failure detail ({run.get('id', '')})") + appendix_lines.append("") + appendix_lines.extend(detail_lines) + appendix_lines.append("") + appendix_lines.append("
") + appendix_lines.append("") + timeline_values = [ + float(run.get("pass_rate")) + for run in runs + if isinstance(run.get("pass_rate"), (int, float)) + ] + appendix_lines.append( + "Pass-rate sparkline: " + + f"`{_sparkline(timeline_values, colorize=colorize) if timeline_values else 'n/a'}`" + ) + appendix_lines.append("") + + comparisons = [ + item for item in (experiment.get("consecutive_comparisons") or []) + if isinstance(item, dict) + ] + appendix_lines.append("#### Consecutive Run Deltas (A-B)") + appendix_lines.append("") + comparison_rows: list[list[str]] = [] + for item in comparisons: + run_a = item.get("run_a_pass_rate") if isinstance(item.get("run_a_pass_rate"), (int, float)) else None + run_b = item.get("run_b_pass_rate") if isinstance(item.get("run_b_pass_rate"), (int, float)) else None + delta = item.get("delta_pass_rate") if isinstance(item.get("delta_pass_rate"), (int, float)) else None + comparison_rows.append( + [ + str(item.get('run_a_id', '')), + str(item.get('run_b_id', '')), + _fmt_pct(float(run_a)) if isinstance(run_a, (int, float)) else 'n/a', + _fmt_pct(float(run_b)) if isinstance(run_b, (int, float)) else 'n/a', + _fmt_delta(float(delta), colorize=colorize) if isinstance(delta, (int, float)) else 'n/a', + ] + ) + if not comparisons: + comparison_rows.append(["n/a", "n/a", "n/a", "n/a", "n/a"]) + appendix_lines.extend(_markdown_table(["Run A", "Run B", "A Pass", "B Pass", "Delta"], comparison_rows, ["left", "left", "right", "right", "right"])) + appendix_lines.append("") + + lines.append("## Notes") + lines.append("") + lines.append("- Drift is computed as latest - baseline.") + lines.append("- Baseline uses the first half of fetched runs (minimum 1, maximum 3).") + lines.append("- Latest-2 delta uses the latest two runs returned in the fetched window.") + lines.append("") + + appendix_lines.extend( + _report_appendix_lines( + experiments, + evalset_runs_url, + case_by_name=case_by_name, + 
representative_case_name=representative_case_name, + ) + ) + + if appendix_lines: + lines.append("# Appendices") + lines.append("") + lines.append( + "Full configuration, cases, heatmaps, per-experiment timelines, and per-run " + "details are collected below to keep the summary above readable." + ) + lines.append("") + lines.extend(appendix_lines) + + return "\n".join(lines) + + +def _appendix_metric_int(metrics: dict[str, Any], *keys: str) -> str: + for key in keys: + value = metrics.get(key) + if isinstance(value, bool): + continue + if isinstance(value, (int, float)): + return str(int(value)) + return "-" + + +def _appendix_metric_float(metrics: dict[str, Any], *keys: str) -> str: + for key in keys: + value = metrics.get(key) + if isinstance(value, bool): + continue + if isinstance(value, (int, float)): + return f"{float(value):.3f}" + return "-" + + +# Candidate paths mirror the in-app run-details overlay +# (`getRunInteractionDetails` in AIEvals.tsx) so the report renders the same +# prompt/output the UI shows. +_PROMPT_CANDIDATE_PATHS: tuple[tuple[str, str], ...] = ( + ("summary", "agent_prompt"), + ("summary", "sent_prompt"), + ("summary", "prompt"), + ("report", "agent_prompt"), + ("report", "sent_prompt"), + ("report", "prompt"), +) + +_OUTPUT_CANDIDATE_PATHS: tuple[tuple[str, str], ...] = ( + ("summary", "agent_output"), + ("summary", "output"), + ("report", "agent_output"), + ("report", "output"), + ("report", "parsed"), + ("summary", "agent_output_text"), + ("report", "agent_output_text"), + ("report", "raw_excerpt"), +) + + +def _run_interaction_value( + run: dict[str, Any], paths: tuple[tuple[str, str], ...] 
+) -> Any: + """Return the first non-empty value found along the candidate paths.""" + for container_key, field in paths: + container = run.get(container_key) + if isinstance(container, dict): + value = container.get(field) + if value is not None: + return value + return None + + +def _format_display_value(value: Any) -> tuple[str, str]: + """Render a value the way the UI overlay does. + + Returns a ``(language, text)`` tuple so callers can fence the content + with the right code-block language hint. + """ + if value is None: + return "text", "(none)" + if isinstance(value, str): + return "text", value + try: + return "json", json.dumps(value, ensure_ascii=False, indent=2, sort_keys=True) + except Exception: + return "text", str(value) + + +def _fenced_block(language: str, text: str) -> list[str]: + """Emit a fenced code block, guarding against backtick collisions.""" + body = text if text != "" else "(empty)" + return [f"```{language}", *body.splitlines(), "```"] + + +def _extract_case_prompt(case_record: dict[str, Any] | None) -> Any: + if not isinstance(case_record, dict): + return None + inputs = case_record.get("inputs") + if not isinstance(inputs, dict): + return None + for key in ("prompt", "text", "query", "message"): + value = inputs.get(key) + if value is not None: + return value + return inputs + + +def _extract_case_prompt_from_result(case_result: dict[str, Any]) -> Any: + for key in ("prompt", "input", "inputs", "case_input"): + value = case_result.get(key) + if value is not None: + return value + return None + + +def _extract_case_output_from_result(case_result: dict[str, Any]) -> Any: + for key in ("output", "actual_output", "response", "result"): + value = case_result.get(key) + if value is not None: + return value + return None + + +def _is_synthetic_run(run: dict[str, Any]) -> bool: + summary = run.get("summary") if isinstance(run.get("summary"), dict) else {} + report = run.get("report") if isinstance(run.get("report"), dict) else {} + if 
summary.get("synthetic") is True or report.get("synthetic") is True: + return True + output = summary.get("agent_output") + if isinstance(output, dict): + if output.get("synthetic") is True: + return True + if output.get("mode") == "synthetic": + return True + return False + + +def _synthetic_case_output( + run: dict[str, Any], + case_record: dict[str, Any] | None, + case_result: dict[str, Any], + *, + representative_case_name: str | None, + case_name: str, +) -> Any: + """Build per-case output for synthetic runs to mirror UI case switching.""" + run_output = _run_interaction_value(run, _OUTPUT_CANDIDATE_PATHS) + if representative_case_name and case_name == representative_case_name: + return run_output + if case_result.get("passed"): + if isinstance(case_record, dict) and "expected_output" in case_record: + return case_record.get("expected_output") + return None + if isinstance(case_record, dict): + inputs = case_record.get("inputs") + if isinstance(inputs, dict): + return ( + inputs.get("text") + or inputs.get("prompt") + or inputs.get("query") + or inputs.get("message") + or "(no usable answer — regressed run)" + ) + return "(no usable answer — regressed run)" + + +def _run_detail_block_lines( + idx: int, + run: dict[str, Any], + case_by_name: dict[str, dict[str, Any]], + *, + representative_case_name: str | None, +) -> list[str]: + """Render the full per-run detail shown by the in-app overlay. + + Mirrors the run-details dialog in AIEvals.tsx: prompt sent, agent output + received, run summary, and run report. + """ + run_id = str(run.get("id", "") or "") + status = str(run.get("status", "") or "unknown") + created = str(run.get("created_at", "") or "-") + pass_rate = run.get("pass_rate") + pass_text = ( + _fmt_pct(float(pass_rate)) if isinstance(pass_rate, (int, float)) else "n/a" + ) + + lines: list[str] = [] + summary_label = run_id or f"run {idx}" + lines.append( + f"
def _run_detail_block_lines(
    idx: int,
    run: dict[str, Any],
    case_by_name: dict[str, dict[str, Any]],
    *,
    representative_case_name: str | None,
) -> list[str]:
    """Render the full per-run detail shown by the in-app overlay.

    Mirrors the run-details dialog in AIEvals.tsx: prompt sent, agent output
    received, run summary, and run report. The result is one collapsible
    ``<details>`` block holding, in order:

    * a header bullet list (run id, status, pass rate, creation time and,
      when present in the summary, runtime pod name / runtime id);
    * when ``metrics.case_results`` is a non-empty list: a per-case result
      table followed by one nested collapsible block per case with its
      prompt, output, expected output, metadata and evaluators;
    * the run-level prompt and agent output;
    * when usage data is found: a usage table and the raw usage payload;
    * the run summary and run report;
    * the failure cause, when one is attached and yields detail lines.

    Args:
        idx: 1-based position of the run, used in the block title.
        run: Run record to render.
        case_by_name: Case definitions keyed by case name.
        representative_case_name: Forwarded to ``_synthetic_case_output``
            when the run is synthetic.

    Returns:
        Markdown lines ending with the closing ``</details>`` tag and a
        blank line.
    """

    def as_dict(value: Any) -> dict[str, Any]:
        # Non-dict containers are treated as absent.
        return value if isinstance(value, dict) else {}

    def score_cell(score: Any) -> str:
        return f"{float(score):.3f}" if isinstance(score, (int, float)) else "-"

    def usage_cell(value: Any) -> str:
        # Numeric readings are shown as-is so a genuine 0 is not mistaken
        # for "missing"; other empty values keep the "-" placeholder.
        if isinstance(value, (int, float)):
            return str(value)
        return str(value) if value else "-"

    lines: list[str] = []

    def add_fenced_section(title_line: str, value: Any) -> None:
        language, text = _format_display_value(value)
        lines.extend([title_line, "", *_fenced_block(language, text), ""])

    run_id = str(run.get("id", "") or "")
    status = str(run.get("status", "") or "unknown")
    created = str(run.get("created_at", "") or "-")
    pass_rate = run.get("pass_rate")
    pass_text = (
        _fmt_pct(float(pass_rate)) if isinstance(pass_rate, (int, float)) else "n/a"
    )
    summary = as_dict(run.get("summary"))
    report = as_dict(run.get("report"))

    # --- Header -----------------------------------------------------------
    summary_label = run_id or f"run {idx}"
    lines.append(
        f"<details><summary>Run {idx} — {summary_label} "
        f"(status: {status}, pass rate: {pass_text})</summary>"
    )
    lines.append("")
    lines.append(f"- Run ID: `{run_id or '-'}`")
    lines.append(f"- Status: {status}")
    lines.append(f"- Pass rate: {pass_text}")
    lines.append(f"- Created: {created}")
    runtime_pod = str(summary.get("runtime_pod_name") or "").strip()
    if runtime_pod:
        lines.append(f"- Runtime: `{runtime_pod}`")
    runtime_id = str(summary.get("runtime_id") or "").strip()
    if runtime_id:
        lines.append(f"- Runtime ID: `{runtime_id}`")
    lines.append("")

    # --- Per-case results ---------------------------------------------------
    case_results = as_dict(run.get("metrics")).get("case_results")
    if isinstance(case_results, list) and case_results:
        case_dicts = [item for item in case_results if isinstance(item, dict)]

        lines.extend(["**Per-Case Results**", ""])
        case_rows = [
            [
                str(case_result.get("name") or "-"),
                "✅ pass" if case_result.get("passed") else "❌ fail",
                score_cell(case_result.get("score")),
                str(case_result.get("category") or "-"),
                str(case_result.get("difficulty") or "-"),
            ]
            for case_result in case_dicts
        ]
        if case_rows:
            lines.extend(
                _markdown_table(
                    ["Case", "Result", "Score", "Category", "Difficulty"],
                    case_rows,
                    ["left", "left", "right", "left", "left"],
                )
            )
            lines.append("")

        lines.extend(["**Per-Case Prompts and Outputs**", ""])
        synthetic_run = _is_synthetic_run(run)
        for case_result in case_dicts:
            case_name = str(case_result.get("name") or "-")
            case_record = case_by_name.get(case_name)
            record = as_dict(case_record)

            # Prefer the prompt from the case definition; fall back to
            # whatever the result row carried.
            prompt_value = _extract_case_prompt(case_record)
            if prompt_value is None:
                prompt_value = _extract_case_prompt_from_result(case_result)

            if synthetic_run:
                output_value = _synthetic_case_output(
                    run,
                    case_record,
                    case_result,
                    representative_case_name=representative_case_name,
                    case_name=case_name,
                )
            else:
                output_value = _extract_case_output_from_result(case_result)
            if output_value is None:
                output_value = "(per-case output not captured for this run)"

            result_text = "pass" if case_result.get("passed") else "fail"
            score_text = score_cell(case_result.get("score"))
            lines.append(
                f"<details><summary>Case {case_name} "
                f"({result_text}, score: {score_text})</summary>"
            )
            lines.append("")
            lines.append(f"- Category: {str(case_result.get('category') or '-')}")
            lines.append(f"- Difficulty: {str(case_result.get('difficulty') or '-')}")
            lines.append("")
            add_fenced_section("**Prompt**", prompt_value)
            add_fenced_section("**Output**", output_value)
            add_fenced_section("**Expected Output**", record.get("expected_output"))
            add_fenced_section("**Case Metadata**", record.get("metadata"))
            add_fenced_section("**Case Evaluators**", record.get("evaluators"))
            lines.append("</details>")
            lines.append("")

    # --- Run-level interaction ----------------------------------------------
    add_fenced_section(
        "**Prompt Sent**", _run_interaction_value(run, _PROMPT_CANDIDATE_PATHS)
    )
    add_fenced_section(
        "**Agent Output Received**",
        _run_interaction_value(run, _OUTPUT_CANDIDATE_PATHS),
    )

    # --- Usage ----------------------------------------------------------------
    usage = _extract_run_usage(run)
    if usage:
        lines.extend(["**Pydantic AI Usage**", ""])
        preferred_keys = (
            "source",
            "provider",
            "model",
            "requests",
            "prompt_tokens",
            "completion_tokens",
            "total_tokens",
            "input_cached_tokens",
            "tool_calls",
            "duration_ms",
            "credits_consumed",
            "captured_at",
            "reservation_id",
            "runtime_pod_name",
        )
        # Well-known keys first in a fixed order, then the rest alphabetically.
        ordered_keys = [key for key in preferred_keys if key in usage]
        ordered_keys.extend(
            key
            for key in sorted(str(raw_key) for raw_key in usage)
            if key not in preferred_keys
        )
        usage_rows = [[key, usage_cell(usage.get(key))] for key in ordered_keys]
        if usage_rows:
            lines.extend(
                _markdown_table(["Metric", "Value"], usage_rows, ["left", "left"])
            )
            lines.append("")
        add_fenced_section("Raw usage payload:", usage)

    # --- Summary / report / failure -----------------------------------------
    add_fenced_section("**Run Summary**", summary)
    add_fenced_section("**Run Report**", report)

    cause = run.get("failure_cause")
    if isinstance(cause, dict) and cause:
        detail_lines = _failure_cause_detail_lines(cause)
        if detail_lines:
            lines.extend(["**Failure Cause**", ""])
            lines.extend(detail_lines)
            lines.append("")

    lines.append("</details>")
    lines.append("")
    return lines
total_value is None: + prompt = _usage_number(_usage_pick(usage, "prompt_tokens", "promptTokens", "input_tokens", "inputTokens")) + completion = _usage_number( + _usage_pick(usage, "completion_tokens", "completionTokens", "output_tokens", "outputTokens") + ) + if prompt is not None and completion is not None: + total_value = prompt + completion + if total_value is None: + return "-" + return str(int(round(total_value))) + + +def _usage_credits_value(usage: dict[str, Any]) -> str: + credits = _usage_number( + _usage_pick( + usage, + "credits_consumed", + "creditsConsumed", + "credits", + "total_credits", + "cost_credits", + ) + ) + if credits is None: + return "-" + return f"{credits:.6f}".rstrip("0").rstrip(".") + + +def _report_appendix_lines( + experiments: list[dict[str, Any]], + evalset_runs_url: str, + *, + case_by_name: dict[str, dict[str, Any]] | None = None, + representative_case_name: str | None = None, +) -> list[str]: + """Render an appendix that lists every fetched run with its details. + + Each Run ID links back to the experiments page with a ``run`` query + parameter, which opens the run-details overlay directly. + """ + lines: list[str] = [] + lines.append("## Appendix: Run Details") + lines.append("") + lines.append( + "Per-run detail for every run fetched in the window above. " + "Each Run ID opens the run-details overlay directly in Datalayer, and " + "the collapsible blocks below reproduce the same prompt, agent output, " + "summary, and report shown by the in-app run-details dialog." 
+ ) + lines.append("") + + any_runs = False + case_by_name = case_by_name or {} + for experiment in experiments: + runs = [run for run in (experiment.get("runs") or []) if isinstance(run, dict)] + if not runs: + continue + any_runs = True + agent_spec_label = str( + experiment.get("agent_spec_name") + or experiment.get("agent_spec_id") + or "-" + ) + lines.append(f"### {experiment.get('name', '')}") + lines.append("") + lines.append(f"Agentspec: {agent_spec_label}") + lines.append("") + run_rows: list[list[str]] = [] + for idx, run in enumerate(runs, start=1): + metrics = run.get("metrics") if isinstance(run.get("metrics"), dict) else {} + usage = _extract_run_usage(run) + run_id = str(run.get("id", "")) + run_link = _run_overlay_url(evalset_runs_url, run_id) + pass_rate = run.get("pass_rate") + passed = _appendix_metric_int(metrics, "passed", "passed_cases") + total = _appendix_metric_int(metrics, "total_cases", "total", "cases") + cases_cell = ( + f"{passed}/{total}" if passed != "-" or total != "-" else "-" + ) + run_rows.append( + [ + str(idx), + (f"[{run_id}]({run_link})" if run_link and run_id else (run_id or "-")), + str(run.get("status", "") or "-"), + _fmt_pct(float(pass_rate)) if isinstance(pass_rate, (int, float)) else "n/a", + cases_cell, + _appendix_metric_float(metrics, "avg_score", "average_score"), + _usage_total_tokens_value(usage), + _usage_credits_value(usage), + str(run.get("created_at", "") or "-"), + _format_failure_cause(run.get("failure_cause")) or "-", + ] + ) + lines.extend( + _markdown_table( + [ + "#", + "Run ID", + "Status", + "Pass Rate", + "Cases (pass/total)", + "Avg Score", + "Total Tokens", + "Credits", + "Created", + "Failure Cause", + ], + run_rows, + ["right", "left", "left", "right", "right", "right", "right", "right", "left", "left"], + ) + ) + lines.append("") + lines.append("#### Full Run Detail (as shown in the UI)") + lines.append("") + for idx, run in enumerate(runs, start=1): + lines.extend( + _run_detail_block_lines( + 
def _write_report_csv(report: dict[str, Any], output_path: Path) -> None:
    """Flatten the report into one CSV file at *output_path*.

    Four kinds of rows share a single header and are told apart by
    ``row_type``:

    * ``experiment`` - one per experiment, with aggregate pass-rate stats;
    * ``run`` - one per run, adding run status, failure cause and usage;
    * ``case`` - one per case result of a run, repeating the run's usage;
    * ``evaluator`` - one per entry of ``report["evaluator_results"]``.

    Columns that do not apply to a row are written empty. Parent directories
    of *output_path* are created when missing and the file is overwritten.
    """
    usage_columns = (
        "usage_source",
        "usage_provider",
        "usage_model",
        "usage_requests",
        "usage_prompt_tokens",
        "usage_completion_tokens",
        "usage_total_tokens",
        "usage_input_cached_tokens",
        "usage_tool_calls",
        "usage_duration_ms",
        "usage_credits_consumed",
        "usage_captured_at",
    )
    fieldnames = [
        "row_type",
        "evalset_id",
        "evalset_runs_url",
        "agent_spec_id",
        "agent_spec_name",
        "agent_spec_url",
        "experiment_id",
        "experiment_name",
        "run_index",
        "run_id",
        "run_status",
        "run_pass_rate",
        "runs_fetched",
        "runs_total",
        "baseline_pass_rate",
        "latest_pass_rate",
        "drift_delta",
        "latest_two_delta",
        "mean_pass_rate",
        "stddev_pass_rate",
        "failure_stage",
        "failure_type",
        "failure_message",
        *usage_columns,
        "case_name",
        "case_status",
        "case_score",
        "case_category",
        "case_difficulty",
        "evaluator_name",
        "evaluator_scope",
        "evaluator_runs",
        "evaluator_passed_runs",
        "evaluator_mean_score",
        "evaluator_latest_score",
        "evaluator_latest_passed",
        "generated_at",
    ]

    def _run_usage_fields(run: dict[str, Any]) -> dict[str, Any]:
        # Map the run's usage payload onto the usage_* CSV columns,
        # accepting both snake_case and camelCase source keys.
        usage = _extract_run_usage(run)
        return {
            "usage_source": _usage_pick(usage, "source"),
            "usage_provider": _usage_pick(usage, "provider"),
            "usage_model": _usage_pick(usage, "model"),
            "usage_requests": _usage_pick(usage, "requests"),
            "usage_prompt_tokens": _usage_pick(
                usage, "prompt_tokens", "promptTokens", "input_tokens", "inputTokens"
            ),
            "usage_completion_tokens": _usage_pick(
                usage,
                "completion_tokens",
                "completionTokens",
                "output_tokens",
                "outputTokens",
            ),
            "usage_total_tokens": _usage_total_tokens_value(usage),
            "usage_input_cached_tokens": _usage_pick(
                usage, "input_cached_tokens", "inputCachedTokens"
            ),
            "usage_tool_calls": _usage_pick(usage, "tool_calls", "toolCalls"),
            "usage_duration_ms": _usage_pick(usage, "duration_ms", "durationMs"),
            "usage_credits_consumed": _usage_credits_value(usage),
            "usage_captured_at": _usage_pick(usage, "captured_at", "capturedAt"),
        }

    experiment_records = [
        record
        for record in (report.get("experiments") or [])
        if isinstance(record, dict)
    ]
    blank_usage = dict.fromkeys(usage_columns, "")

    output_path.parent.mkdir(parents=True, exist_ok=True)
    with output_path.open("w", encoding="utf-8", newline="") as stream:
        writer = csv.DictWriter(stream, fieldnames=fieldnames)
        writer.writeheader()

        evalset_id = str(report.get("evalset_id", ""))
        run_environment = str(report.get("run_environment") or "")
        # Columns repeated on every row of the file.
        shared_columns = {
            "evalset_id": evalset_id,
            "evalset_runs_url": _evalset_runs_url(evalset_id, run_environment),
            "generated_at": str(report.get("generated_at", "")),
        }

        for experiment in experiment_records:
            agent_spec_id = str(experiment.get("agent_spec_id", ""))
            # NOTE(review): built once per experiment and reused for its run
            # and case rows; assumes _agentspec_details_url is a pure URL
            # formatter - confirm.
            identity_columns = {
                **shared_columns,
                "agent_spec_id": agent_spec_id,
                "agent_spec_name": str(experiment.get("agent_spec_name", "")),
                "agent_spec_url": _agentspec_details_url(agent_spec_id),
                "experiment_id": str(experiment.get("id", "")),
                "experiment_name": str(experiment.get("name", "")),
            }
            stats_columns = {
                "runs_fetched": int(experiment.get("runs_fetched") or 0),
                "runs_total": int(experiment.get("runs_total") or 0),
                "baseline_pass_rate": experiment.get("baseline_pass_rate"),
                "latest_pass_rate": experiment.get("latest_pass_rate"),
                "drift_delta": experiment.get("drift_delta"),
                "latest_two_delta": experiment.get("latest_two_delta"),
                "mean_pass_rate": experiment.get("mean_pass_rate"),
                "stddev_pass_rate": experiment.get("stddev_pass_rate"),
            }

            experiment_row = {"row_type": "experiment"}
            experiment_row.update(identity_columns)
            experiment_row.update(
                run_index="", run_id="", run_status="", run_pass_rate=""
            )
            experiment_row.update(stats_columns)
            experiment_row.update(
                failure_stage="", failure_type="", failure_message=""
            )
            experiment_row.update(blank_usage)
            writer.writerow(experiment_row)

            run_records = [
                record
                for record in (experiment.get("runs") or [])
                if isinstance(record, dict)
            ]
            for position, run in enumerate(run_records, start=1):
                raw_cause = run.get("failure_cause")
                cause = raw_cause if isinstance(raw_cause, dict) else {}
                usage_fields = _run_usage_fields(run)
                run_columns = {
                    "run_index": position,
                    "run_id": str(run.get("id", "")),
                    "run_status": str(run.get("status", "")),
                    "run_pass_rate": run.get("pass_rate"),
                }

                run_row = {"row_type": "run"}
                run_row.update(identity_columns)
                run_row.update(run_columns)
                run_row.update(stats_columns)
                run_row.update(
                    failure_stage=str(cause.get("stage", "")),
                    failure_type=str(cause.get("type", "")),
                    failure_message=str(cause.get("message", "")),
                )
                run_row.update(usage_fields)
                writer.writerow(run_row)

                raw_metrics = run.get("metrics")
                run_metrics = raw_metrics if isinstance(raw_metrics, dict) else {}
                case_results = run_metrics.get("case_results")
                if not isinstance(case_results, list):
                    continue
                for case_result in case_results:
                    if not isinstance(case_result, dict):
                        continue
                    case_row = {"row_type": "case"}
                    case_row.update(identity_columns)
                    case_row.update(run_columns)
                    case_row.update(
                        case_name=str(case_result.get("name", "")),
                        case_status=(
                            "passed" if case_result.get("passed") else "failed"
                        ),
                        case_score=case_result.get("score"),
                        case_category=str(case_result.get("category") or ""),
                        case_difficulty=str(case_result.get("difficulty") or ""),
                    )
                    case_row.update(usage_fields)
                    writer.writerow(case_row)

        for evaluator in report.get("evaluator_results") or []:
            if not isinstance(evaluator, dict):
                continue
            evaluator_row = {"row_type": "evaluator"}
            evaluator_row.update(shared_columns)
            evaluator_row.update(
                evaluator_name=str(evaluator.get("name") or ""),
                evaluator_scope=str(evaluator.get("scope") or ""),
                evaluator_runs=int(evaluator.get("runs") or 0),
                evaluator_passed_runs=int(evaluator.get("passed_runs") or 0),
                evaluator_mean_score=evaluator.get("mean_score"),
                evaluator_latest_score=evaluator.get("latest_score"),
                evaluator_latest_passed=bool(evaluator.get("latest_passed")),
            )
            evaluator_row.update(blank_usage)
            writer.writerow(evaluator_row)
str(report.get("evalset_id", "")) + run_environment = str(report.get("run_environment") or "") + generated_at = str(report.get("generated_at", "")) + experiments = [item for item in (report.get("experiments") or []) if isinstance(item, dict)] + agentspecs = [item for item in (report.get("agentspecs") or []) if isinstance(item, dict)] + evalset_runs_url = _evalset_runs_url(evalset_id, run_environment) + + console.rule(f"[bold cyan]Evals Report[/bold cyan] {evalset_id}") + console.print(f"Generated at: {generated_at}") + console.print(f"Experiments: {len(experiments)} | Run window per experiment: {run_limit}") + if evalset_runs_url: + console.print(f"Evalset run details: {evalset_runs_url}") + console.print("") + + if agentspecs: + agentspec_table = Table(title="Agentspec Coverage") + agentspec_table.add_column("Agentspec ID", style="cyan") + agentspec_table.add_column("Agentspec", style="white") + agentspec_table.add_column("Model", style="white") + agentspec_table.add_column("Version", style="white") + agentspec_table.add_column("Experiments", justify="right") + agentspec_table.add_column("Runs", justify="right") + for item in agentspecs: + agentspec_table.add_row( + str(item.get("id") or ""), + str(item.get("name") or item.get("id") or ""), + str(item.get("model") or "-"), + str(item.get("version") or "-"), + str(int(item.get("experiments") or 0)), + str(int(item.get("runs") or 0)), + ) + console.print(agentspec_table) + + overview = Table(title="Experiment Overview") + overview.add_column("Experiment", style="white") + overview.add_column("Agentspec", style="white") + overview.add_column("Runs", justify="right") + overview.add_column("Latest", justify="right") + overview.add_column("Baseline", justify="right") + overview.add_column("Drift", justify="right") + overview.add_column("Latest-2", justify="right") + for experiment in experiments: + overview.add_row( + str(experiment.get("name", "")), + str(experiment.get("agent_spec_name") or 
experiment.get("agent_spec_id") or "-"), + f"{int(experiment.get('runs_fetched') or 0)}/{int(experiment.get('runs_total') or 0)}", + _fmt_pct(experiment.get("latest_pass_rate") if isinstance(experiment.get("latest_pass_rate"), (int, float)) else None), + _fmt_pct(experiment.get("baseline_pass_rate") if isinstance(experiment.get("baseline_pass_rate"), (int, float)) else None), + _fmt_delta(experiment.get("drift_delta") if isinstance(experiment.get("drift_delta"), (int, float)) else None, colorize=True), + _fmt_delta(experiment.get("latest_two_delta") if isinstance(experiment.get("latest_two_delta"), (int, float)) else None, colorize=True), + ) + console.print(overview) + + ranked_latest = sorted( + [item for item in experiments if isinstance(item.get("latest_pass_rate"), (int, float))], + key=lambda item: float(item.get("latest_pass_rate") or 0.0), + reverse=True, + ) + latest_table = Table(title="By Latest Pass Rate") + latest_table.add_column("Rank", justify="right", no_wrap=True) + latest_table.add_column("Experiment", style="white") + latest_table.add_column("Latest", justify="right", no_wrap=True) + for idx, item in enumerate(ranked_latest, start=1): + latest_table.add_row(str(idx), str(item.get("name", "")), _fmt_pct(float(item.get("latest_pass_rate") or 0.0))) + console.print(latest_table) + latest_values = [ + float(item.get("latest_pass_rate")) + for item in ranked_latest + if isinstance(item.get("latest_pass_rate"), (int, float)) + ] + console.print("Latest histogram:") + for hist_line in _ascii_histogram( + latest_values, + bins=8, + width=20, + min_value=0.0, + max_value=1.0, + full_blocks=True, + colorize=True, + ): + console.print(hist_line) + + ranked_drift = sorted( + [item for item in experiments if isinstance(item.get("drift_delta"), (int, float))], + key=lambda item: float(item.get("drift_delta") or 0.0), + ) + drift_table = Table(title="By Drift (Negative To Positive)") + drift_table.add_column("Rank", justify="right", no_wrap=True) + 
drift_table.add_column("Experiment", style="white") + drift_table.add_column("Drift", justify="right", no_wrap=True) + for idx, item in enumerate(ranked_drift, start=1): + drift_table.add_row( + str(idx), + str(item.get("name", "")), + _fmt_delta(float(item.get("drift_delta") or 0.0), colorize=True), + ) + console.print(drift_table) + drift_values = [ + float(item.get("drift_delta")) + for item in ranked_drift + if isinstance(item.get("drift_delta"), (int, float)) + ] + console.print("Drift histogram:") + for hist_line in _ascii_histogram( + drift_values, + bins=8, + width=20, + full_blocks=True, + colorize=True, + drift_palette=True, + ): + console.print(hist_line) + + pairwise = _pairwise_latest_deltas(experiments) + pairwise_table = Table(title="Pairwise Latest-Pass Deltas") + pairwise_table.add_column("Pair", style="white") + pairwise_table.add_column("Left", justify="right", no_wrap=True) + pairwise_table.add_column("Right", justify="right", no_wrap=True) + pairwise_table.add_column("Delta", justify="right", no_wrap=True) + for pair in pairwise: + pairwise_table.add_row( + f"{pair['left']} vs {pair['right']}", + _fmt_pct(pair["left_latest"]), + _fmt_pct(pair["right_latest"]), + _fmt_delta(pair["delta"], colorize=True), + ) + if not pairwise: + pairwise_table.add_row("n/a", "n/a", "n/a", "n/a") + console.print(pairwise_table) + + within_agentspec_pairs = [ + pair for pair in pairwise if str(pair.get("group") or "") == "within_agentspec" + ] + cross_agentspec_pairs = [ + pair for pair in pairwise if str(pair.get("group") or "") == "cross_agentspec" + ] + + within_table = Table(title="Within-Agentspec Pairwise Latest-Pass Deltas") + within_table.add_column("Pair", style="white") + within_table.add_column("Agentspec", style="white") + within_table.add_column("Left", justify="right", no_wrap=True) + within_table.add_column("Right", justify="right", no_wrap=True) + within_table.add_column("Delta", justify="right", no_wrap=True) + for pair in within_agentspec_pairs: 
+ within_table.add_row( + f"{pair['left']} vs {pair['right']}", + str(pair.get("left_agent_spec_name") or pair.get("left_agent_spec_id") or "-"), + _fmt_pct(pair["left_latest"]), + _fmt_pct(pair["right_latest"]), + _fmt_delta(pair["delta"], colorize=True), + ) + if not within_agentspec_pairs: + within_table.add_row("n/a", "n/a", "n/a", "n/a", "n/a") + console.print(within_table) + + cross_table = Table(title="Cross-Agentspec Pairwise Latest-Pass Deltas") + cross_table.add_column("Pair", style="white") + cross_table.add_column("Left", justify="right", no_wrap=True) + cross_table.add_column("Right", justify="right", no_wrap=True) + cross_table.add_column("Delta", justify="right", no_wrap=True) + for pair in cross_agentspec_pairs: + cross_table.add_row( + ( + f"{pair['left']} ({pair.get('left_agent_spec_name') or pair.get('left_agent_spec_id') or '-'}) " + f"vs {pair['right']} ({pair.get('right_agent_spec_name') or pair.get('right_agent_spec_id') or '-'})" + ), + _fmt_pct(pair["left_latest"]), + _fmt_pct(pair["right_latest"]), + _fmt_delta(pair["delta"], colorize=True), + ) + if not cross_agentspec_pairs: + cross_table.add_row("n/a", "n/a", "n/a", "n/a") + console.print(cross_table) + + console.print("[bold]Pass-rate heatmap (r01=latest fetched run):[/bold]") + for line in _ascii_passrate_heatmap(experiments, max_columns=12, colorize=True): + console.print(line) + console.print("[bold]Consecutive delta heatmap (A-B):[/bold]") + for line in _ascii_drift_heatmap(experiments, max_columns=12, colorize=True): + console.print(line) + + if ranked_latest: + console.print( + "[bold]Insight:[/bold] top latest " + f"[green]{ranked_latest[0].get('name', '')}[/green] " + f"({_fmt_pct(float(ranked_latest[0].get('latest_pass_rate') or 0.0))})" + ) + if ranked_drift: + console.print( + "[bold]Insight:[/bold] strongest drift " + f"{ranked_drift[-1].get('name', '')} " + f"({_fmt_delta(float(ranked_drift[-1].get('drift_delta') or 0.0), colorize=True)})" + ) + console.print("") + + for 
experiment in experiments: + console.print("") + console.print(f"[bold]Run Timeline:[/bold] {experiment.get('name', '')}") + run_table = Table() + run_table.add_column("#", justify="right", style="cyan", no_wrap=True) + run_table.add_column("Run ID", style="white", no_wrap=True) + run_table.add_column("Status", no_wrap=True) + run_table.add_column("Pass Rate", justify="right", no_wrap=True) + run_table.add_column("Trend", style="white", no_wrap=True) + run_table.add_column("Failure Cause", style="red", overflow="fold") + + runs = [run for run in (experiment.get("runs") or []) if isinstance(run, dict)] + for idx, run in enumerate(runs, start=1): + status_value = str(run.get("status", "")) + pass_rate = float(run.get("pass_rate")) if isinstance(run.get("pass_rate"), (int, float)) else None + cause_text = _format_failure_cause(run.get("failure_cause")) + run_table.add_row( + str(idx), + str(run.get("id", "")), + f"[{_status_style(status_value)}]{status_value}[/{_status_style(status_value)}]", + _fmt_pct(pass_rate), + _ascii_bar(pass_rate, width=28, full_blocks=True, colorize=True) if pass_rate is not None else "-", + cause_text or "-", + ) + if not runs: + run_table.add_row("1", "n/a", "n/a", "n/a", "-", "-") + console.print(run_table) + + for idx, run in enumerate(runs, start=1): + cause = run.get("failure_cause") + if not isinstance(cause, dict) or not cause: + continue + console.print( + f"[red bold]Run {idx} failure:[/red bold] " + f"[red]{str(cause.get('message') or 'Unknown failure.')}[/red]" + ) + for key, label in ( + ("stage", "stage"), + ("type", "type"), + ("execution_url", "execution url"), + ): + value = str(cause.get(key) or "").strip() + if value: + console.print(f" {label}: {value}") + diagnostics = cause.get("diagnostics") + if isinstance(diagnostics, dict): + for key, label in ( + ("agent_runtimes_url", "agent runtimes url"), + ("run_url", "run url"), + ): + value = diagnostics.get(key) + if value: + console.print(f" {label}: {value}") + 
candidate_urls = diagnostics.get("candidate_urls") + if isinstance(candidate_urls, list) and candidate_urls: + console.print(f" candidate urls: {', '.join(str(u) for u in candidate_urls)}") + attempts = diagnostics.get("attempts") + if isinstance(attempts, list) and attempts: + for attempt in attempts: + if not isinstance(attempt, dict): + continue + outcome = "ok" if attempt.get("ok") else "failed" + console.print( + f" attempt: {attempt.get('url', '')} -> {outcome} " + f"{attempt.get('error') or ''}".rstrip() + ) + detail = str(cause.get("detail_excerpt") or "").strip() + if detail: + console.print(f" detail: {detail}") + + deltas_table = Table(title="Consecutive Run Deltas") + deltas_table.add_column("Run A", style="white", no_wrap=True) + deltas_table.add_column("Run B", style="white", no_wrap=True) + deltas_table.add_column("A Pass", justify="right", no_wrap=True) + deltas_table.add_column("B Pass", justify="right", no_wrap=True) + deltas_table.add_column("Delta", justify="right", no_wrap=True) + comparisons = [ + item for item in (experiment.get("consecutive_comparisons") or []) + if isinstance(item, dict) + ] + for item in comparisons: + run_a = item.get("run_a_pass_rate") if isinstance(item.get("run_a_pass_rate"), (int, float)) else None + run_b = item.get("run_b_pass_rate") if isinstance(item.get("run_b_pass_rate"), (int, float)) else None + delta = item.get("delta_pass_rate") if isinstance(item.get("delta_pass_rate"), (int, float)) else None + deltas_table.add_row( + str(item.get("run_a_id", "")), + str(item.get("run_b_id", "")), + _fmt_pct(float(run_a)) if isinstance(run_a, (int, float)) else "n/a", + _fmt_pct(float(run_b)) if isinstance(run_b, (int, float)) else "n/a", + _fmt_delta(float(delta), colorize=True) if isinstance(delta, (int, float)) else "n/a", + ) + if not comparisons: + deltas_table.add_row("n/a", "n/a", "n/a", "n/a", "n/a") + console.print(deltas_table) + + +def iter_report_runs(report: dict[str, Any]) -> list[tuple[str, dict[str, Any]]]: 
+ """Return ``(experiment_name, run)`` tuples for every run in a report. + + Operates on the structured report produced by :func:`build_eval_report`, so + consumers (the GitHub Action, dashboards, alerts) don't re-walk the nested + ``experiments -> runs`` shape themselves. + """ + pairs: list[tuple[str, dict[str, Any]]] = [] + for experiment in report.get("experiments") or []: + if not isinstance(experiment, dict): + continue + experiment_name = str(experiment.get("name") or experiment.get("id") or "") + for run in experiment.get("runs") or []: + if isinstance(run, dict): + pairs.append((experiment_name, run)) + return pairs + + +def collect_report_failures(report: dict[str, Any]) -> dict[str, Any]: + """Aggregate failure information across every run in a report. + + Returns a dict with the failed-run count, failed-status run count, a + breakdown by failure type, and structured failure records (experiment, run + id, stage, type, message, trimmed detail excerpt, execution URL). This is + the shared aggregation the GitHub Action renders into its step summary. + """ + failures: list[dict[str, Any]] = [] + type_counts: dict[str, int] = {} + failed_status_runs = 0 + + for experiment_name, run in iter_report_runs(report): + status = str(run.get("status") or "").strip().lower() + cause = run.get("failure_cause") + cause = cause if isinstance(cause, dict) else None + is_failed = status in {"failed", "error"} or bool(cause) + if not is_failed: + continue + if status in {"failed", "error"}: + failed_status_runs += 1 + + failure_type = str((cause or {}).get("type") or "unknown") + type_counts[failure_type] = type_counts.get(failure_type, 0) + 1 + + detail = str((cause or {}).get("detail_excerpt") or "").strip() + detail_single = " ".join(detail.split()) + if len(detail_single) > 300: + detail_single = detail_single[:297] + "..." 
+ + failures.append( + { + "experiment": experiment_name, + "run_id": str(run.get("id") or ""), + "status": status or "unknown", + "stage": str((cause or {}).get("stage") or "-"), + "type": failure_type, + "message": str((cause or {}).get("message") or "-"), + "detail_excerpt": detail_single or "-", + "execution_url": str((cause or {}).get("execution_url") or ""), + } + ) + + return { + "failed_run_count": len(failures), + "failed_status_runs": failed_status_runs, + "type_counts": type_counts, + "failures": failures, + } + + +def average_latest_pass_rate(report: dict[str, Any]) -> float | None: + """Return the mean of each experiment's ``latest_pass_rate`` (or ``None``).""" + values = [ + float(experiment.get("latest_pass_rate")) + for experiment in (report.get("experiments") or []) + if isinstance(experiment, dict) + and isinstance(experiment.get("latest_pass_rate"), (int, float)) + ] + if not values: + return None + return sum(values) / len(values) diff --git a/datalayer_core/evals/runner.py b/datalayer_core/evals/runner.py new file mode 100644 index 00000000..6c0fb578 --- /dev/null +++ b/datalayer_core/evals/runner.py @@ -0,0 +1,713 @@ +# Copyright (c) 2023-2025 Datalayer, Inc. +# Distributed under the terms of the Modified BSD License. + +# Copyright (c) 2023-2026 Datalayer, Inc. +# Distributed under the terms of the Modified BSD License. + +"""Reusable evalset execution runner. + +This module hosts the end-to-end "execute an evalset spec against one or more +agentspecs" workflow so that examples, the GitHub Action, and any other +integration can launch real eval runs without re-implementing the orchestration +(create evalset -> launch cloud runtime(s) -> run each case through the agent -> +grade outputs -> persist runs -> teardown runtimes). 
+""" + +from __future__ import annotations + +import json +import os +import uuid +from typing import Any, Callable, Optional +from urllib.parse import urlparse + +from datalayer_core.agents import ( + LocalAgentRuntime, + compute_time_reservation_minutes, + create_cloud_agent_runtime, + ensure_local_agent, + resolve_environment_burning_rate, + start_local_agent_runtime, + teardown_agent_execution_resources, +) +from datalayer_core.agents.agent_local import ( + run_cloud_agent_chat, + run_local_agent_chat, + runtime_route_candidates, + wait_for_local_runtime, +) +from datalayer_core.client.client import DatalayerClient +from datalayer_core.evals.evals import now_iso, timestamp_slug, write_eval_reports +from datalayer_core.evals.evaluators import evaluate_evalset + +DEFAULT_ENVIRONMENT_NAME = "ai-agents-env" +DEFAULT_AGENT_NAME = "default" +DEFAULT_LOCAL_AGENT_BASE_URL = "http://localhost:8765" +# Default per-request timeout (seconds) for a single agent chat call. Bounding +# each call guarantees a hung agent cannot block the run forever: the call is +# aborted, the case is marked failed, execution continues, and the enclosing +# runner always tears down its cloud runtimes before returning. +DEFAULT_REQUEST_TIMEOUT_SECONDS = 180 + + +def _case_prompt(case: dict[str, Any]) -> str: + """Extract a prompt string from an evalset case's inputs.""" + inputs = case.get("inputs") + if isinstance(inputs, dict): + for key in ("prompt", "text", "query", "message"): + value = inputs.get(key) + if isinstance(value, str) and value.strip(): + return value + return json.dumps(inputs, ensure_ascii=True) + if isinstance(inputs, str): + return inputs + return "" + + +def _compose_case_prompt(case: dict[str, Any], *, preamble: str = "") -> str: + """Build the effective case prompt with an optional preamble. + + ``preamble`` lets a spec enforce task instructions (for example output + format/constraints) without mutating every individual case input. 
+ """ + base_prompt = _case_prompt(case) + normalized_preamble = str(preamble or "").strip() + if not normalized_preamble: + return base_prompt + if not base_prompt: + return normalized_preamble + return f"{normalized_preamble}\n\nInput:\n{base_prompt}" + + +def _extract_text(payload: Any) -> str: + """Coerce an agent output payload into a plain text answer.""" + if isinstance(payload, dict): + text = payload.get("text") + if isinstance(text, str): + return text + message = payload.get("message") + if isinstance(message, str): + return message + if isinstance(payload, str): + return payload + return json.dumps(payload, ensure_ascii=True) + + +def _usage_number(value: Any) -> float | None: + if isinstance(value, bool): + return None + if isinstance(value, (int, float)): + return float(value) + if isinstance(value, str): + text = value.strip() + if not text: + return None + try: + return float(text) + except Exception: + return None + return None + + +def _usage_pick_number(usage: dict[str, Any], *keys: str) -> float | None: + for key in keys: + number = _usage_number(usage.get(key)) + if number is not None: + return number + return None + + +def _extract_case_usage(chat_result: dict[str, Any]) -> dict[str, Any]: + direct = chat_result.get("usage") + if isinstance(direct, dict) and direct: + return dict(direct) + output = chat_result.get("output") if isinstance(chat_result.get("output"), dict) else {} + nested = output.get("pydantic_ai_usage") or output.get("usage") + if isinstance(nested, dict) and nested: + return dict(nested) + return {} + + +def _merge_run_usage(aggregate: dict[str, Any], case_usage: dict[str, Any]) -> dict[str, Any]: + if not case_usage: + return aggregate + + prompt_tokens = _usage_pick_number( + case_usage, + "prompt_tokens", + "promptTokens", + "input_tokens", + "inputTokens", + ) + completion_tokens = _usage_pick_number( + case_usage, + "completion_tokens", + "completionTokens", + "output_tokens", + "outputTokens", + ) + total_tokens = 
_usage_pick_number( + case_usage, + "total_tokens", + "totalTokens", + "tokens_total", + "token_total", + ) + if total_tokens is None and prompt_tokens is not None and completion_tokens is not None: + total_tokens = prompt_tokens + completion_tokens + + numeric_fields: list[tuple[str, float | None]] = [ + ("prompt_tokens", prompt_tokens), + ("completion_tokens", completion_tokens), + ("total_tokens", total_tokens), + ( + "input_cached_tokens", + _usage_pick_number( + case_usage, + "input_cached_tokens", + "inputCachedTokens", + "cached_input_tokens", + "cachedInputTokens", + ), + ), + ( + "tool_calls", + _usage_pick_number( + case_usage, + "tool_calls", + "toolCalls", + "tool_call_count", + "toolCallCount", + ), + ), + ( + "requests", + _usage_pick_number( + case_usage, + "requests", + "request_count", + "requestCount", + ), + ), + ( + "duration_ms", + _usage_pick_number( + case_usage, + "duration_ms", + "durationMs", + "latency_ms", + "latencyMs", + ), + ), + ( + "credits_consumed", + _usage_pick_number( + case_usage, + "credits_consumed", + "creditsConsumed", + "credits", + "total_credits", + "cost_credits", + ), + ), + ] + for key, value in numeric_fields: + if value is None: + continue + current = _usage_number(aggregate.get(key)) or 0.0 + summed = current + value + if key in {"credits_consumed"}: + aggregate[key] = round(summed, 6) + else: + aggregate[key] = int(round(summed)) + + for key in ( + "source", + "provider", + "model", + "billable_account_kind", + "billable_account_uid", + "requester_kind", + "requester_uid", + "captured_at", + "timestamp", + ): + value = case_usage.get(key) + if value is None: + continue + if isinstance(value, str) and not value.strip(): + continue + aggregate.setdefault(key, value) + return aggregate + + +def execute_evalset_spec( + client: DatalayerClient, + *, + spec: dict[str, Any], + agentspec_ids: list[str], + run_evalset: bool = True, + create_report: bool = False, + run_limit: int = 1, + run_environment: str = "sdk", + 
environment_name: str = DEFAULT_ENVIRONMENT_NAME, + account_uid: Optional[str] = None, + credits_limit: float = 100.0, + evalset_name: Optional[str] = None, + backend_run_environment: str = "sdk", + launch_source: str = "datalayer-core", + agent_name: str = DEFAULT_AGENT_NAME, + execution_target: str = "cloud", + local_agent_base_url: str = DEFAULT_LOCAL_AGENT_BASE_URL, + auto_start_local_agent_runtime: bool = False, + local_agent_log_level: str = "info", + request_timeout_seconds: int = DEFAULT_REQUEST_TIMEOUT_SECONDS, + log: Optional[Callable[[str], None]] = print, +) -> dict[str, Any]: + """Execute an evalset spec against one or more agentspecs and persist runs. + + Creates an evalset from ``spec``, runs every case through each agentspec + ``run_limit`` times against either a cloud runtime (one per agentspec) or a + local ``agent-runtimes`` server, grades the outputs with the evals API, and + stores one run record per execution. Execution resources (cloud runtimes or + the local agent registration/server) are always torn down before returning, + including on error. + + Parameters + ---------- + client : DatalayerClient + An authenticated client. + spec : dict[str, Any] + Evalset spec (as loaded by :func:`load_evalset_spec`). + agentspec_ids : list[str] + Agentspec ids to execute. One experiment is created per id (plus one + cloud runtime per id when ``execution_target='cloud'``). + run_evalset : bool + Whether to execute the evalset after creating it. Defaults to ``True``. + When ``False``, this function only creates the evalset and returns. + create_report : bool + Whether to generate markdown/CSV reports via + :func:`write_eval_reports` before returning. Defaults to ``False``. + run_limit : int + Number of runs to create per experiment (minimum 1). + run_environment : str + Run-environment label stored on run summaries (for example ``sdk``). + environment_name : str + Runtime environment to launch cloud agents in (cloud only). 
+ account_uid : Optional[str] + Optional billable account UID context. + credits_limit : float + Target credits budget used to size each cloud runtime reservation. + evalset_name : Optional[str] + Optional explicit evalset name. Defaults to a timestamped name derived + from the spec name. + backend_run_environment : str + ``run_environment`` value persisted on the created evalset. + launch_source : str + ``launch_source`` recorded on experiments and runs. + agent_name : str + Agent route/name used when contacting the runtime. + execution_target : str + ``cloud`` (default) launches one cloud runtime per agentspec; ``local`` + executes against a local ``agent-runtimes`` server. + local_agent_base_url : str + Base URL of the local ``agent-runtimes`` server (local only). Ignored + when ``auto_start_local_agent_runtime`` starts a new server. + auto_start_local_agent_runtime : bool + When ``execution_target='local'``, start a local ``agent-runtimes`` + server on a free port and tear it down afterwards. When ``False``, the + runner first attempts to reuse ``local_agent_base_url`` and will + auto-start a local runtime only if that server is unreachable. + local_agent_log_level : str + Log level for an auto-started local ``agent-runtimes`` server. + request_timeout_seconds : int + Per-request timeout (seconds) for a single agent chat call. When an + agent does not respond within this window the call is aborted, the case + is recorded as failed, and execution continues with the next case. This + bounds hung agents without killing legitimately slow multi-agentspec + runs. Defaults to ``180`` (3 minutes per call). + log : Optional[Callable[[str], None]] + Optional logging callback (defaults to ``print``; pass ``None`` to + silence progress output). + + Returns + ------- + dict[str, Any] + ``{"evalset_id", "evalset_name", "experiment_ids", "run_ids", "view_url"}`` + plus optional report paths when ``create_report=True``. 
+ + Raises + ------ + ValueError + If ``agentspec_ids`` is empty, the spec has no cases, or + ``execution_target`` is not ``cloud``/``local``. + RuntimeError + If the platform returns an unexpected create response or a cloud + runtime is missing its ingress/pod. + """ + + def _emit(message: str) -> None: + if log is not None: + log(message) + + target = str(execution_target or "").strip().lower() + if target not in {"cloud", "local"}: + raise ValueError( + f"execution_target must be 'cloud' or 'local', got {execution_target!r}." + ) + + normalized_specs: list[str] = [] + for value in agentspec_ids: + spec_id = str(value or "").strip() + if spec_id and spec_id not in normalized_specs: + normalized_specs.append(spec_id) + if not normalized_specs: + raise ValueError("agentspec_ids must contain at least one agentspec id.") + + cases = [item for item in (spec.get("cases") or []) if isinstance(item, dict)] + if not cases: + raise ValueError("Evalset spec has no cases; cannot execute real runs.") + + metadata = spec.get("metadata") if isinstance(spec.get("metadata"), dict) else {} + run_mode = str(spec.get("kind") or "batch").strip().lower() or "batch" + if run_mode not in {"batch", "interactive"}: + raise ValueError( + f"Evalset spec kind must be 'batch' or 'interactive', got {run_mode!r}." 
+ ) + prompt_preamble = str(metadata.get("prompt_preamble") or "").strip() + + run_limit = max(1, int(run_limit)) + + case_request_timeout = max(1, int(request_timeout_seconds)) + + resolved_name = str( + evalset_name + or f"{str(spec.get('name') or 'evalset')}-{run_environment}-{timestamp_slug(now_iso())}" + ) + evalset_payload = client.evals_create_eval_from_spec( + spec=spec, + name=resolved_name, + run_environment=backend_run_environment, + kind=run_mode, + account_uid=account_uid, + ) + evalset_id = str((evalset_payload.get("evalset") or {}).get("id") or "") + if not evalset_id: + raise RuntimeError(f"Unable to create evalset from spec: {evalset_payload}") + _emit(f"Created evalset: {evalset_id} ({resolved_name})") + + ui_base = str(os.environ.get("DATALAYER_UI_URL") or "http://localhost:3063").strip().rstrip("/") + view_url = f"{ui_base}/evals/experiments/{run_environment}/{evalset_id}" + + result: dict[str, Any] = { + "evalset_id": evalset_id, + "evalset_name": resolved_name, + "experiment_ids": [], + "run_ids": [], + "view_url": view_url, + } + + if not bool(run_evalset): + _emit(f"Skipped eval execution for evalset: {evalset_id}") + if bool(create_report): + reports = write_eval_reports( + client, + evalset_id, + account_uid=account_uid, + ) + result["report_markdown_path"] = str(reports.get("markdown_path") or "") + if reports.get("csv_path") is not None: + result["report_csv_path"] = str(reports.get("csv_path") or "") + return result + + experiment_ids: list[str] = [] + run_ids: list[str] = [] + runtimes_by_spec: dict[str, Any] = {} + local_runtime: Optional[LocalAgentRuntime] = None + local_base_url = str(local_agent_base_url or DEFAULT_LOCAL_AGENT_BASE_URL) + token = str(client._get_token() or "") + try: + if target == "cloud": + for spec_id in normalized_specs: + burning_rate = resolve_environment_burning_rate( + client, environment_name + ) + reservation_minutes = compute_time_reservation_minutes( + credits_limit=credits_limit, + 
burning_rate=burning_rate, + ) + runtime = create_cloud_agent_runtime( + client, + environment_name=environment_name, + name=f"evals-{spec_id}-{uuid.uuid4().hex[:8]}", + agent_spec_id=spec_id, + time_reservation=reservation_minutes, + billable_account_uid=account_uid, + ) + runtimes_by_spec[spec_id] = runtime + _emit( + f"Launched runtime for agentspec {spec_id}: " + f"pod={getattr(runtime, 'pod_name', '')} " + f"runtime_id={getattr(runtime, 'uid', '')}" + ) + elif target == "local": + local_host = urlparse(local_base_url).hostname or "127.0.0.1" + should_start_local_runtime = bool(auto_start_local_agent_runtime) + if not should_start_local_runtime: + try: + wait_for_local_runtime(local_base_url, timeout_seconds=2) + except Exception: + should_start_local_runtime = True + _emit( + "No local agent-runtimes server reachable at " + f"{local_base_url}; starting one automatically." + ) + if should_start_local_runtime: + local_runtime = start_local_agent_runtime( + agent_spec_id=normalized_specs[0], + agent_name=agent_name, + host=local_host, + log_level=local_agent_log_level, + disable_tool_approvals=True, + ) + local_base_url = local_runtime.base_url + _emit(f"Started local agent-runtimes server at {local_base_url}") + + for spec_id in normalized_specs: + experiment_payload = client.evals_create_experiment( + name=f"evals-{spec_id}-{timestamp_slug(now_iso())}", + evalset_id=evalset_id, + description="Eval execution via datalayer-core runner.", + status="running", + config={ + "run_mode": run_mode, + "execution_target": target, + "agent_spec_id": spec_id, + "environment_name": environment_name, + }, + summary={ + "launch_source": launch_source, + "run_environment": run_environment, + "agent_spec_id": spec_id, + }, + account_uid=account_uid, + ) + experiment_id = str( + (experiment_payload.get("experiment") or {}).get("id") or "" + ) + if not experiment_id: + raise RuntimeError( + f"Unable to create experiment: {experiment_payload}" + ) + 
experiment_ids.append(experiment_id) + + ingress = "" + pod_name = "" + runtime_id = "" + if target == "cloud": + runtime = runtimes_by_spec[spec_id] + ingress = str(getattr(runtime, "ingress", "") or "").strip() + pod_name = str(getattr(runtime, "pod_name", "") or "").strip() + runtime_id = str(getattr(runtime, "uid", "") or "").strip() + if not ingress or not pod_name: + raise RuntimeError( + f"Runtime missing ingress/pod for agentspec {spec_id}" + ) + else: + ensure_local_agent( + base_url=local_base_url, + agent_name=agent_name, + token=token, + agent_spec_id=spec_id, + disable_tool_approvals=True, + ) + _emit( + f"Using local agent execution at {local_base_url.rstrip('/')} " + f"(agent: {agent_name}, agentspec: {spec_id})." + ) + + for run_index in range(run_limit): + outputs: list[dict[str, Any]] = [] + full_outputs: list[dict[str, Any]] = [] + case_statuses: list[str] = [] + case_prompts: list[Any] = [] + aggregated_usage: dict[str, Any] = {} + failed_cases = 0 + failure_causes: list[dict[str, Any]] = [] + + for case in cases: + prompt = _compose_case_prompt(case, preamble=prompt_preamble) + case_prompts.append(prompt) + if target == "cloud": + chat_result = run_cloud_agent_chat( + ingress=ingress, + token=token, + prompt=prompt, + route_candidates=runtime_route_candidates( + agent_name=agent_name, + agent_spec_id=spec_id, + pod_name=pod_name, + ), + timeout=case_request_timeout, + ) + else: + chat_result = run_local_agent_chat( + base_url=local_base_url, + agent_name=agent_name, + token=token, + prompt=prompt, + timeout=case_request_timeout, + ) + status = str(chat_result.get("status") or "completed").strip().lower() + case_statuses.append(status) + output_payload = chat_result.get("output") or {} + outputs.append({"text": _extract_text(output_payload)}) + full_outputs.append( + output_payload + if isinstance(output_payload, dict) + else {"text": _extract_text(output_payload)} + ) + if status in {"failed", "error"}: + failed_cases += 1 + failure = 
chat_result.get("failure_cause") + if isinstance(failure, dict): + failure_causes.append(failure) + + case_usage = _extract_case_usage(chat_result) + aggregated_usage = _merge_run_usage(aggregated_usage, case_usage) + + metrics = evaluate_evalset(spec, outputs, statuses=case_statuses) + # Persist per-case prompts/outputs onto the graded case results so + # the report can render the actual agent interaction instead of + # "(per-case output not captured for this run)". + case_results = metrics.get("case_results") + if isinstance(case_results, list): + for idx, case_result in enumerate(case_results): + if not isinstance(case_result, dict): + continue + if idx < len(case_prompts): + case_result["prompt"] = case_prompts[idx] + if idx < len(full_outputs): + case_result["output"] = full_outputs[idx] + + interaction = [ + { + "case": str(cases[idx].get("name") or f"case-{idx + 1}"), + "status": case_statuses[idx] if idx < len(case_statuses) else None, + "prompt": case_prompts[idx] if idx < len(case_prompts) else None, + "output": full_outputs[idx] if idx < len(full_outputs) else None, + } + for idx in range(len(cases)) + ] + + run_status = "failed" if failed_cases > 0 else "completed" + if target == "cloud": + # Surface the runtime pod name and runtime id on every + # failure cause so the report's failure-cause block (and UI) + # can show which runtime produced the failure for easier + # debugging. 
+ for cause in failure_causes: + cause.setdefault("runtime_pod_name", pod_name) + if runtime_id: + cause.setdefault("runtime_id", runtime_id) + summary: dict[str, Any] = { + "launch_source": launch_source, + "run_mode": run_mode, + "run_environment": run_environment, + "execution_target": target, + "agent_spec_id": spec_id, + "case_failures": failed_cases, + "run_index": run_index + 1, + "agent_prompt": [item["prompt"] for item in interaction], + "agent_output": [item["output"] for item in interaction], + } + if target == "cloud": + summary["runtime_pod_name"] = pod_name + if runtime_id: + summary["runtime_id"] = runtime_id + else: + summary["local_agent_base_url"] = local_base_url + summary["local_agent_id"] = agent_name + if failure_causes: + summary["failure_cause"] = failure_causes[0] + report = { + "note": f"real agent execution via datalayer-core runner ({run_mode})", + "interaction": interaction, + "failure_causes": failure_causes, + } + if aggregated_usage: + metrics = { + **metrics, + "pydantic_ai_usage": aggregated_usage, + } + summary["usage"] = {"pydantic_ai_usage": aggregated_usage} + report["usage"] = {"pydantic_ai_usage": aggregated_usage} + if target == "cloud": + report["runtime_pod_name"] = pod_name + if runtime_id: + report["runtime_id"] = runtime_id + else: + report["local_agent_base_url"] = local_base_url + report["local_agent_id"] = agent_name + + run_payload = client.evals_create_run( + experiment_id, + status=run_status, + metrics=metrics, + summary=summary, + report=report, + account_uid=account_uid, + ) + run_id = str((run_payload.get("run") or {}).get("id") or "") + if not run_id: + raise RuntimeError(f"Unable to create run: {run_payload}") + run_ids.append(run_id) + _emit( + f"Created run {run_index + 1}/{run_limit} for agentspec=" + f"{spec_id} experiment={experiment_id}: {run_id}" + ) + + _emit(f"Executed evalset: {evalset_id}") + result["experiment_ids"] = experiment_ids + result["run_ids"] = run_ids + if bool(create_report): + 
reports = write_eval_reports( + client, + evalset_id, + account_uid=account_uid, + ) + result["report_markdown_path"] = str(reports.get("markdown_path") or "") + if reports.get("csv_path") is not None: + result["report_csv_path"] = str(reports.get("csv_path") or "") + return result + finally: + if target == "cloud": + for spec_id, runtime in runtimes_by_spec.items(): + pod_name = str(getattr(runtime, "pod_name", "") or "").strip() + cleanup = teardown_agent_execution_resources( + client, + execution_target="cloud", + cloud_runtime_or_pod_name=pod_name, + token=token, + ) + if cleanup.get("cloud_runtime_terminated"): + _emit(f"Terminated runtime for agentspec {spec_id}: {pod_name}") + else: + _emit( + "Warning: runtime termination unconfirmed for agentspec " + f"{spec_id}: {pod_name}" + ) + else: + cleanup = teardown_agent_execution_resources( + client, + execution_target="local", + local_base_url=local_base_url, + local_agent_name=agent_name, + token=token, + local_runtime=local_runtime, + ) + if cleanup.get("local_agent_deleted"): + _emit(f"Terminated local agent registration: {agent_name}") + if cleanup.get("local_runtime_terminated"): + _emit("Stopped auto-started local agent-runtimes server.") + diff --git a/datalayer_core/mixins/__init__.py b/datalayer_core/mixins/__init__.py index 8370f351..8980b223 100644 --- a/datalayer_core/mixins/__init__.py +++ b/datalayer_core/mixins/__init__.py @@ -5,7 +5,7 @@ from .sandbox_snapshots import SandboxSnapshotsMixin from .runtimes import RuntimesMixin from .secrets import SecretsMixin -from .tokens import TokensMixin +from .api_keys import ApiKeysMixin from .usage import UsageMixin from .whoami import WhoamiAppMixin @@ -15,7 +15,7 @@ "SandboxSnapshotsMixin", "RuntimesMixin", "SecretsMixin", - "TokensMixin", + "ApiKeysMixin", "UsageMixin", "WhoamiAppMixin", ] diff --git a/datalayer_core/mixins/tokens.py b/datalayer_core/mixins/api_keys.py similarity index 55% rename from datalayer_core/mixins/tokens.py rename to 
datalayer_core/mixins/api_keys.py index e5810e03..01239e05 100644 --- a/datalayer_core/mixins/tokens.py +++ b/datalayer_core/mixins/api_keys.py @@ -3,22 +3,22 @@ from typing import Any, Union -from datalayer_core.models.token import TokenType +from datalayer_core.models.api_key import ApiKeyType from datalayer_core.utils import btoa -class TokensCreateMixin: - """Mixin for creating tokens in Datalayer.""" +class ApiKeysCreateMixin: + """Mixin for creating API keys in Datalayer.""" - def _create_token( + def _create_api_key( self, name: str, description: str, expiration_date: int = 0, - token_type: Union[str, TokenType] = TokenType.USER, + api_key_type: Union[str, ApiKeyType] = ApiKeyType.SECRET, ) -> dict[str, Any]: """ - Create a Token with the given parameters. + Create an API key with the given parameters. Parameters ---------- @@ -27,10 +27,10 @@ def _create_token( description : str Description of the secret. expiration_date : float - Expiration date of the token. - token_type : str, TokenType - Variant or type of the token. Defaults to "user_token". - Type of the token (e.g., "user"). + Expiration date of the API key. + api_key_type : str, ApiKeyType + Variant or type of the API key. Defaults to "secret". + Type of the API key (secret, publishable, restricted, temporary). 
Returns ------- @@ -40,14 +40,14 @@ def _create_token( body = { "name": name, "description": btoa(description), - "variant": token_type.value - if isinstance(token_type, TokenType) - else token_type, + "variant": api_key_type.value + if isinstance(api_key_type, ApiKeyType) + else api_key_type, "expiration_date": expiration_date, } try: response = self._fetch( # type: ignore - "{}/api/iam/v1/tokens".format(self.urls.iam_url), # type: ignore + "{}/api/iam/v1/api-keys".format(self.urls.iam_url), # type: ignore method="POST", json=body, ) @@ -56,17 +56,17 @@ def _create_token( return {"success": False, "message": str(e)} -class TokensDeleteMixin: - """Mixin for deleting tokens in Datalayer.""" +class ApiKeysDeleteMixin: + """Mixin for deleting API keys in Datalayer.""" - def _delete_token(self, token_uid: str) -> dict[str, Any]: + def _delete_api_key(self, api_key_uid: str) -> dict[str, Any]: """ - Delete a token by its unique identifier. + Delete an API key by its unique identifier. Parameters ---------- - token_uid : str - Unique identifier of the token to delete. + api_key_uid : str + Unique identifier of the API key to delete. Returns ------- @@ -75,7 +75,7 @@ def _delete_token(self, token_uid: str) -> dict[str, Any]: """ try: response = self._fetch( # type: ignore - "{}/api/iam/v1/tokens/{}".format(self.urls.iam_url, token_uid), # type: ignore + "{}/api/iam/v1/api-keys/{}".format(self.urls.iam_url, api_key_uid), # type: ignore method="DELETE", ) return response.json() @@ -83,21 +83,21 @@ def _delete_token(self, token_uid: str) -> dict[str, Any]: return {"success": False, "message": str(e)} -class TokensListMixin: - """Mixin class for listing tokens.""" +class ApiKeysListMixin: + """Mixin class for listing API keys.""" - def _list_tokens(self) -> dict[str, Any]: + def _list_api_keys(self) -> dict[str, Any]: """ - List all tokens in the Datalayer environment. + List all API keys in the Datalayer environment. 
Returns ------- dict[str, Any] - Dictionary containing tokens information. + Dictionary containing API key information. """ try: response = self._fetch( # type: ignore - "{}/api/iam/v1/tokens".format(self.urls.iam_url), # type: ignore + "{}/api/iam/v1/api-keys".format(self.urls.iam_url), # type: ignore method="GET", ) return response.json() @@ -105,5 +105,5 @@ def _list_tokens(self) -> dict[str, Any]: return {"sucess": False, "error": str(e)} -class TokensMixin(TokensCreateMixin, TokensDeleteMixin, TokensListMixin): - """A mixin that combines create, delete, and list functionalities for tokens.""" +class ApiKeysMixin(ApiKeysCreateMixin, ApiKeysDeleteMixin, ApiKeysListMixin): + """A mixin that combines create, delete, and list functionalities for API keys.""" diff --git a/datalayer_core/mixins/evals.py b/datalayer_core/mixins/evals.py index 6cc27043..519f7852 100644 --- a/datalayer_core/mixins/evals.py +++ b/datalayer_core/mixins/evals.py @@ -1,3 +1,6 @@ +# Copyright (c) 2023-2025 Datalayer, Inc. +# Distributed under the terms of the Modified BSD License. + # Copyright (c) 2023-2026 Datalayer, Inc. # Distributed under the terms of the Modified BSD License. 
@@ -5,6 +8,7 @@ from __future__ import annotations +import os from typing import Any, Optional @@ -16,13 +20,20 @@ def _evals_request( path: str, *, method: str, + billable_account_uid: Optional[str] = None, account_uid: Optional[str] = None, params: Optional[dict[str, Any]] = None, json_body: Optional[dict[str, Any]] = None, ) -> dict[str, Any]: query: dict[str, Any] = dict(params or {}) - if account_uid: - query["account_uid"] = account_uid + resolved_account_uid = ( + billable_account_uid + or account_uid + or os.environ.get("DATALAYER_ACCOUNT_UID") + or os.environ.get("DATALAYER_BILLABLE_ACCOUNT_UID") + ) + if resolved_account_uid: + query["account_uid"] = resolved_account_uid response = self._fetch( # type: ignore f"{self.urls.ai_agents_url}/api/ai-agents/v1/evals{path}", # type: ignore method=method, @@ -39,6 +50,7 @@ def evals_list_evals( q: Optional[str] = None, limit: int = 50, offset: int = 0, + billable_account_uid: Optional[str] = None, account_uid: Optional[str] = None, ) -> dict[str, Any]: params: dict[str, Any] = {"limit": limit, "offset": offset} @@ -52,6 +64,7 @@ def evals_list_evals( "/evalsets", method="GET", params=params, + billable_account_uid=billable_account_uid, account_uid=account_uid, ) @@ -63,9 +76,12 @@ def evals_create_eval( run_environment: str = "sdk", kind: str = "batch", schema: Optional[dict[str, Any]] = None, + evalset_evaluators: Optional[list[dict[str, Any]]] = None, + report_evaluators: Optional[list[dict[str, Any]]] = None, tags: Optional[list[str]] = None, metadata: Optional[dict[str, Any]] = None, cases: Optional[list[dict[str, Any]]] = None, + billable_account_uid: Optional[str] = None, account_uid: Optional[str] = None, ) -> dict[str, Any]: body = { @@ -74,6 +90,8 @@ def evals_create_eval( "run_environment": run_environment, "kind": kind, "schema": schema or {}, + "evalset_evaluators": evalset_evaluators or [], + "report_evaluators": report_evaluators or [], "tags": tags or [], "metadata": metadata or {}, "cases": cases 
or [], @@ -82,6 +100,59 @@ def evals_create_eval( "/evalsets", method="POST", json_body=body, + billable_account_uid=billable_account_uid, + account_uid=account_uid, + ) + + def evals_create_eval_from_spec( + self, + *, + spec: dict[str, Any], + name: Optional[str] = None, + description: Optional[str] = None, + run_environment: Optional[str] = None, + kind: Optional[str] = None, + billable_account_uid: Optional[str] = None, + account_uid: Optional[str] = None, + ) -> dict[str, Any]: + if not isinstance(spec, dict): + raise ValueError("spec must be a JSON object") + + resolved_name = str(name if name is not None else spec.get("name") or "").strip() + if not resolved_name: + raise ValueError("spec.name is required when name is not provided") + + resolved_description = str( + description if description is not None else spec.get("description") or "" + ) + resolved_run_environment = str( + run_environment if run_environment is not None else spec.get("run_environment") or "sdk" + ) + resolved_kind = str(kind if kind is not None else spec.get("kind") or "batch") + + schema = spec.get("schema") if isinstance(spec.get("schema"), dict) else {} + metadata = spec.get("metadata") if isinstance(spec.get("metadata"), dict) else {} + tags = [str(tag) for tag in (spec.get("tags") or []) if str(tag).strip()] + evalset_evaluators = [ + item for item in (spec.get("evalset_evaluators") or []) if isinstance(item, dict) + ] + report_evaluators = [ + item for item in (spec.get("report_evaluators") or []) if isinstance(item, dict) + ] + cases = [item for item in (spec.get("cases") or []) if isinstance(item, dict)] + + return self.evals_create_eval( + name=resolved_name, + description=resolved_description, + run_environment=resolved_run_environment, + kind=resolved_kind, + schema=schema, + evalset_evaluators=evalset_evaluators, + report_evaluators=report_evaluators, + tags=tags, + metadata=metadata, + cases=cases, + billable_account_uid=billable_account_uid, account_uid=account_uid, ) @@ 
-89,14 +160,42 @@ def evals_delete_eval( self, evalset_id: str, *, + billable_account_uid: Optional[str] = None, account_uid: Optional[str] = None, ) -> dict[str, Any]: return self._evals_request( f"/evalsets/{evalset_id}", method="DELETE", + billable_account_uid=billable_account_uid, account_uid=account_uid, ) + def evals_set_eval_public( + self, + evalset_id: str, + *, + is_public: bool, + billable_account_uid: Optional[str] = None, + account_uid: Optional[str] = None, + ) -> dict[str, Any]: + return self._evals_request( + f"/evalsets/{evalset_id}/public", + method="PATCH", + json_body={"is_public": bool(is_public)}, + billable_account_uid=billable_account_uid, + account_uid=account_uid, + ) + + def evals_get_public_eval( + self, + evalset_id: str, + ) -> dict[str, Any]: + response = self._fetch( # type: ignore + f"{self.urls.ai_agents_url}/api/ai-agents/v1/evals/public/evalsets/{evalset_id}", # type: ignore + method="GET", + ) + return response.json() + def evals_list_experiments( self, *, @@ -104,6 +203,7 @@ def evals_list_experiments( status: Optional[str] = None, limit: int = 50, offset: int = 0, + billable_account_uid: Optional[str] = None, account_uid: Optional[str] = None, ) -> dict[str, Any]: params: dict[str, Any] = {"limit": limit, "offset": offset} @@ -115,6 +215,7 @@ def evals_list_experiments( "/experiments", method="GET", params=params, + billable_account_uid=billable_account_uid, account_uid=account_uid, ) @@ -128,6 +229,7 @@ def evals_create_experiment( config: Optional[dict[str, Any]] = None, summary: Optional[dict[str, Any]] = None, tags: Optional[list[str]] = None, + billable_account_uid: Optional[str] = None, account_uid: Optional[str] = None, ) -> dict[str, Any]: body = { @@ -143,6 +245,7 @@ def evals_create_experiment( "/experiments", method="POST", json_body=body, + billable_account_uid=billable_account_uid, account_uid=account_uid, ) @@ -150,11 +253,13 @@ def evals_delete_experiment( self, experiment_id: str, *, + billable_account_uid: 
Optional[str] = None, account_uid: Optional[str] = None, ) -> dict[str, Any]: return self._evals_request( f"/experiments/{experiment_id}", method="DELETE", + billable_account_uid=billable_account_uid, account_uid=account_uid, ) @@ -164,12 +269,14 @@ def evals_list_runs( *, limit: int = 50, offset: int = 0, + billable_account_uid: Optional[str] = None, account_uid: Optional[str] = None, ) -> dict[str, Any]: return self._evals_request( f"/experiments/{experiment_id}/runs", method="GET", params={"limit": limit, "offset": offset}, + billable_account_uid=billable_account_uid, account_uid=account_uid, ) @@ -183,6 +290,7 @@ def evals_create_run( metrics: Optional[dict[str, Any]] = None, summary: Optional[dict[str, Any]] = None, report: Optional[dict[str, Any]] = None, + billable_account_uid: Optional[str] = None, account_uid: Optional[str] = None, ) -> dict[str, Any]: body: dict[str, Any] = { @@ -199,6 +307,7 @@ def evals_create_run( f"/experiments/{experiment_id}/runs", method="POST", json_body=body, + billable_account_uid=billable_account_uid, account_uid=account_uid, ) @@ -206,11 +315,13 @@ def evals_get_run( self, run_id: str, *, + billable_account_uid: Optional[str] = None, account_uid: Optional[str] = None, ) -> dict[str, Any]: return self._evals_request( f"/runs/{run_id}", method="GET", + billable_account_uid=billable_account_uid, account_uid=account_uid, ) @@ -218,12 +329,14 @@ def evals_compare_runs( self, run_ids: list[str], *, + billable_account_uid: Optional[str] = None, account_uid: Optional[str] = None, ) -> dict[str, Any]: return self._evals_request( "/runs/compare", method="POST", json_body={"run_ids": run_ids}, + billable_account_uid=billable_account_uid, account_uid=account_uid, ) @@ -239,6 +352,7 @@ def evals_create_live_event( passed: Optional[bool] = None, attributes: Optional[dict[str, Any]] = None, created_at: Optional[str] = None, + billable_account_uid: Optional[str] = None, account_uid: Optional[str] = None, ) -> dict[str, Any]: body: dict[str, 
Any] = { @@ -262,6 +376,7 @@ def evals_create_live_event( "/live/events", method="POST", json_body=body, + billable_account_uid=billable_account_uid, account_uid=account_uid, ) @@ -270,12 +385,14 @@ def evals_list_live_targets( *, window: str = "24h", limit: int = 50, + billable_account_uid: Optional[str] = None, account_uid: Optional[str] = None, ) -> dict[str, Any]: return self._evals_request( "/live/targets", method="GET", params={"window": window, "limit": limit}, + billable_account_uid=billable_account_uid, account_uid=account_uid, ) @@ -288,6 +405,7 @@ def evals_list_live_events( evaluator_name: Optional[str] = None, limit: int = 50, offset: int = 0, + billable_account_uid: Optional[str] = None, account_uid: Optional[str] = None, ) -> dict[str, Any]: params: dict[str, Any] = { @@ -303,5 +421,6 @@ def evals_list_live_events( "/live/events", method="GET", params=params, + billable_account_uid=billable_account_uid, account_uid=account_uid, ) \ No newline at end of file diff --git a/datalayer_core/mixins/ray.py b/datalayer_core/mixins/ray.py index 7de8b647..815cfc8e 100644 --- a/datalayer_core/mixins/ray.py +++ b/datalayer_core/mixins/ray.py @@ -1,3 +1,6 @@ +# Copyright (c) 2023-2025 Datalayer, Inc. +# Distributed under the terms of the Modified BSD License. + # Copyright (c) 2023-2026 Datalayer, Inc. # Distributed under the terms of the Modified BSD License. 
@@ -30,7 +33,7 @@ def _ray_request( prefixes = self._get_ray_api_prefixes() prefix = prefixes[0] response = self._fetch( # type: ignore - f"{self.urls.ray_url}{prefix}{path}", # type: ignore + f"{self.urls.runtimes_url}{prefix}{path}", # type: ignore method=method, params=params, json=json_body, diff --git a/datalayer_core/mixins/runtimes.py b/datalayer_core/mixins/runtimes.py index e721f3e0..4d049820 100644 --- a/datalayer_core/mixins/runtimes.py +++ b/datalayer_core/mixins/runtimes.py @@ -4,6 +4,7 @@ """Runtime management module for Datalayer Core.""" import logging +import os import sys import time from typing import Any, Optional @@ -69,6 +70,15 @@ def _create_runtime( "environment_name": environment_name, } + resolved_billable_account_uid = ( + billable_account_uid + or os.environ.get("DATALAYER_ACCOUNT_UID") + or os.environ.get("DATALAYER_BILLABLE_ACCOUNT_UID") + ) + resolved_billable_account_handle = ( + billable_account_handle or os.environ.get("DATALAYER_ACCOUNT_HANDLE") + ) + if given_name: body["given_name"] = given_name @@ -118,12 +128,12 @@ def _create_runtime( if agent_spec: body["agent_spec"] = agent_spec - if billable_account_uid: - body["billable_account_uid"] = billable_account_uid + if resolved_billable_account_uid: + body["billable_account_uid"] = resolved_billable_account_uid if billable_account_type: body["billable_account_type"] = billable_account_type - if billable_account_handle: - body["billable_account_handle"] = billable_account_handle + if resolved_billable_account_handle: + body["billable_account_handle"] = resolved_billable_account_handle runtime_url = "{}/api/runtimes/v1/runtimes".format(self.urls.runtimes_url) # type: ignore logger.debug( diff --git a/datalayer_core/models/__init__.py b/datalayer_core/models/__init__.py index c128d2c2..053da4a4 100644 --- a/datalayer_core/models/__init__.py +++ b/datalayer_core/models/__init__.py @@ -21,6 +21,7 @@ HealthResponseData, ModelsResponseData, ) +from .api_key import ApiKeyModel, 
ApiKeyType from .base import ( BaseResponse, DataResponse, @@ -83,10 +84,11 @@ from .runtime import RuntimeModel from .sandbox_snapshot import SandboxSnapshotModel from .secret import SecretModel, SecretVariant -from .token import TokenModel, TokenType __all__ = [ "BaseResponse", + "ApiKeyModel", + "ApiKeyType", "ChatMessage", "ChatRequest", "ChatResponseData", @@ -146,9 +148,6 @@ "TeamListResponseData", "TeamMemberModel", "TeamRequest", - "TokenModel", - "TokenModel", - "TokenType", "UsageData", "User", "UserModel", diff --git a/datalayer_core/models/api_key.py b/datalayer_core/models/api_key.py new file mode 100644 index 00000000..c6801264 --- /dev/null +++ b/datalayer_core/models/api_key.py @@ -0,0 +1,42 @@ +# Copyright (c) 2023-2025 Datalayer, Inc. +# Distributed under the terms of the Modified BSD License. + +""" +API key models for Datalayer. + +Provides data structures for API key management in Datalayer environments. +""" + +from enum import Enum +from typing import Any, Dict, Union + +from pydantic import BaseModel, Field + + +class ApiKeyType(str, Enum): + """Enum for API key variants.""" + + SECRET = "secret" + PUBLISHABLE = "publishable" + RESTRICTED = "restricted" + TEMPORARY = "temporary" + + +class ApiKeyModel(BaseModel): + """ + Pydantic model representing an API key in Datalayer. 
+ """ + + uid: str = Field(..., description="Unique identifier for the API key") + name: str = Field(..., description="Name of the API key") + description: str = Field(..., description="Description of the API key") + api_key_type: Union[str, ApiKeyType] = Field( + default=ApiKeyType.SECRET, + description='Type of the API key (secret, publishable, restricted, temporary)', + ) + kwargs: Dict[str, Any] = Field( + default_factory=dict, description="Additional keyword arguments" + ) + + def __repr__(self) -> str: + return f"ApiKeyModel(uid='{self.uid}', name='{self.name}')" diff --git a/datalayer_core/models/token.py b/datalayer_core/models/token.py deleted file mode 100644 index 084deda0..00000000 --- a/datalayer_core/models/token.py +++ /dev/null @@ -1,39 +0,0 @@ -# Copyright (c) 2023-2025 Datalayer, Inc. -# Distributed under the terms of the Modified BSD License. - -""" -Token models for Datalayer. - -Provides data structures for token management in Datalayer environments. -""" - -from enum import Enum -from typing import Any, Dict, Union - -from pydantic import BaseModel, Field - - -class TokenType(str, Enum): - """Enum for token variants.""" - - USER = "user_token" - - -class TokenModel(BaseModel): - """ - Pydantic model representing a token in Datalayer. 
- """ - - uid: str = Field(..., description="Unique identifier for the token") - name: str = Field(..., description="Name of the token") - description: str = Field(..., description="Description of the token") - token_type: Union[str, TokenType] = Field( - default=TokenType.USER, - description='Type of the token (e.g., "user", "admin")', - ) - kwargs: Dict[str, Any] = Field( - default_factory=dict, description="Additional keyword arguments" - ) - - def __repr__(self) -> str: - return f"TokenModel(uid='{self.uid}', name='{self.name}')" diff --git a/datalayer_core/runtimes/__init__.py b/datalayer_core/runtimes/__init__.py deleted file mode 100644 index f7d0007b..00000000 --- a/datalayer_core/runtimes/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -# Copyright (c) 2023-2025 Datalayer, Inc. -# Distributed under the terms of the Modified BSD License. diff --git a/datalayer_core/runtimes/sandbox_snapshot.py b/datalayer_core/runtimes/sandbox_snapshot.py deleted file mode 100644 index a02198eb..00000000 --- a/datalayer_core/runtimes/sandbox_snapshot.py +++ /dev/null @@ -1,69 +0,0 @@ -# Copyright (c) 2023-2025 Datalayer, Inc. -# Distributed under the terms of the Modified BSD License. - -""" -Snapshot services for Datalayer. - -Provides code sandbox snapshot management and operations in Datalayer environments. -""" - -import uuid -from typing import Any, List, Optional, Tuple - -from datalayer_core.models.sandbox_snapshot import SandboxSnapshotModel - - -def create_snapshot(name: Optional[str], description: Optional[str]) -> Tuple[str, str]: - """ - Create snapshot name and description with defaults. - - Parameters - ---------- - name : Optional[str] - Name for the snapshot, or None for auto-generated name. - description : Optional[str] - Description for the snapshot, or None for auto-generated description. - - Returns - ------- - Tuple[str, str] - Tuple of (name, description) strings. 
- """ - uid = uuid.uuid4() - if name is None: - name = f"snapshot-{uid}" - - if description is None: - description = f"snapshot-{uid}" - - return name, description - - -def as_code_sandbox_snapshots(response: dict[str, Any]) -> List["SandboxSnapshotModel"]: - """ - Parse API response and create SandboxSnapshot objects. - - Parameters - ---------- - response : dict[str, Any] - API response dictionary containing snapshots data. - - Returns - ------- - List[SandboxSnapshot] - List of SandboxSnapshot objects parsed from the response. - """ - snapshot_objects = [] - if response["success"]: - snapshots = response["snapshots"] - for snapshot in snapshots: - snapshot_objects.append( - SandboxSnapshotModel( - uid=snapshot["uid"], - name=snapshot["name"], - description=snapshot["description"], - environment=snapshot["environment"], - metadata=snapshot["metadata"], - ) - ) - return snapshot_objects diff --git a/datalayer_core/sandboxes/__init__.py b/datalayer_core/sandboxes/__init__.py new file mode 100644 index 00000000..e5072289 --- /dev/null +++ b/datalayer_core/sandboxes/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2023-2025 Datalayer, Inc. +# Distributed under the terms of the Modified BSD License. + +# Copyright (c) 2023-2026 Datalayer, Inc. +# Distributed under the terms of the Modified BSD License. + +"""Code sandbox utilities for Datalayer.""" + +from datalayer_core.sandboxes.code_sandbox_snapshots import ( + as_code_sandbox_snapshots, + create_snapshot, +) + +__all__ = [ + "as_code_sandbox_snapshots", + "create_snapshot", +] diff --git a/datalayer_core/sandboxes/code_sandbox_snapshots.py b/datalayer_core/sandboxes/code_sandbox_snapshots.py new file mode 100644 index 00000000..5acdaae0 --- /dev/null +++ b/datalayer_core/sandboxes/code_sandbox_snapshots.py @@ -0,0 +1,44 @@ +# Copyright (c) 2023-2025 Datalayer, Inc. +# Distributed under the terms of the Modified BSD License. + +# Copyright (c) 2023-2026 Datalayer, Inc. 
+# Distributed under the terms of the Modified BSD License. + +"""Snapshot services for Datalayer code sandboxes.""" + +import uuid +from typing import Any, Optional, Tuple + +from datalayer_core.models.sandbox_snapshot import SandboxSnapshotModel + + +def create_snapshot(name: Optional[str], description: Optional[str]) -> Tuple[str, str]: + """Create snapshot name and description with defaults.""" + uid = uuid.uuid4() + if name is None: + name = f"snapshot-{uid}" + + if description is None: + description = f"snapshot-{uid}" + + return name, description + + +def as_code_sandbox_snapshots( + response: dict[str, Any], +) -> list[SandboxSnapshotModel]: + """Parse API response and create SandboxSnapshotModel objects.""" + snapshot_objects: list[SandboxSnapshotModel] = [] + if response["success"]: + snapshots = response["snapshots"] + for snapshot in snapshots: + snapshot_objects.append( + SandboxSnapshotModel( + uid=snapshot["uid"], + name=snapshot["name"], + description=snapshot["description"], + environment=snapshot["environment"], + metadata=snapshot["metadata"], + ) + ) + return snapshot_objects diff --git a/datalayer_core/templates/index.html b/datalayer_core/templates/index.html index 3b3c720f..0f422224 100644 --- a/datalayer_core/templates/index.html +++ b/datalayer_core/templates/index.html @@ -13,10 +13,10 @@ "version": "{{ datalayer_version }}", "runUrl": "{{ run_url }}", "iamRunUrl": "{{ iam_url }}", - "jupyterRunUrl": {{ run_url }}", + "jupyterRunUrl": "{{ run_url }}", "jupyterServerUrl": "http://localhost:8888{{ base_url }}", - "jupyterServerToken": "{{ token }}" - "jupyterServerless": "true", + "jupyterServerToken": "{{ token }}", + "jupyterServerless": "true" }