diff --git a/README.md b/README.md index af92648..7a721e9 100644 --- a/README.md +++ b/README.md @@ -144,7 +144,9 @@ Commands with JSON output support: - `sample-app` - Basic template with Playwright integration - `captcha-solver` - Template demonstrating Kernel's auto-CAPTCHA solver - `stagehand` - Template with Stagehand SDK (TypeScript only) + - `ehr-system` - EHR system automation demo with Anthropic Computer Use (TypeScript only) - `browser-use` - Template with Browser Use SDK (Python only) + - `lead-scraper` - Google Maps lead scraper using Browser Use (Python only) - `anthropic-computer-use` - Anthropic Computer Use prompt loop - `openai-computer-use` - OpenAI Computer Use Agent sample - `gemini-computer-use` - Implements a Gemini computer use agent (TypeScript only) @@ -449,6 +451,12 @@ kernel create --name my-cu-app --language py --template anthropic-computer-use # Create a Claude Agent SDK app (TypeScript or Python) kernel create --name my-claude-agent --language ts --template claude-agent-sdk + +# Create a Google Maps Lead Scraper (Python) +kernel create --name my-lead-scraper --language python --template lead-scraper + +# Create an EHR System Automation (TypeScript) +kernel create --name my-ehr-bot --language ts --template ehr-system ``` ### Deploy with environment variables diff --git a/pkg/templates/python/lead-scraper/.env.example b/pkg/templates/python/lead-scraper/.env.example new file mode 100644 index 0000000..b74e0a2 --- /dev/null +++ b/pkg/templates/python/lead-scraper/.env.example @@ -0,0 +1,2 @@ +# Copy this file to .env and fill in your API key +OPENAI_API_KEY=your_openai_api_key_here diff --git a/pkg/templates/python/lead-scraper/README.md b/pkg/templates/python/lead-scraper/README.md new file mode 100644 index 0000000..e9e98b0 --- /dev/null +++ b/pkg/templates/python/lead-scraper/README.md @@ -0,0 +1,113 @@ +# Kernel Lead Scraper Template - Google Maps + +A ready-to-use lead scraper that extracts local business data from Google Maps using 
[browser-use](https://github.com/browser-use/browser-use) and the Kernel platform. + +## What It Does + +This template creates an AI-powered web scraper that: +1. Navigates to Google Maps +2. Searches for businesses by type and location +3. Scrolls through results to load more listings +4. Extracts structured lead data (name, phone, address, website, rating, reviews) +5. Returns clean JSON ready for your CRM or outreach tools + +## Quick Start + +### 1. Install Dependencies + +```bash +uv sync +``` + +### 2. Set Up Environment + +```bash +cp .env.example .env +# Edit .env and add your OpenAI API key +``` + +### 3. Deploy to Kernel + +```bash +kernel deploy main.py -e OPENAI_API_KEY=$OPENAI_API_KEY +``` + +### 4. Run the Scraper + +```bash +kernel invoke lead-scraper scrape-leads \ + --data '{"query": "restaurants", "location": "Austin, TX", "max_results": 5}' +``` + +## API Reference + +### Action: `scrape-leads` + +**Input Parameters:** + +| Parameter | Type | Required | Default | Description | +|-----------|------|----------|---------|-------------| +| `query` | string | ❌ | `restaurants` | Business type to search (e.g., "plumbers", "gyms") | +| `location` | string | ❌ | `New York, NY` | Geographic location (e.g., "Miami, FL") | +| `max_results` | integer | ❌ | 1 | Maximum leads to scrape (1-5) | + +**Example Output:** + +```json +{ + "leads": [ + { + "name": "Joe's Pizza", + "phone": "(512) 555-0123", + "address": "123 Main St, Austin, TX 78701", + "website": "https://joespizza.com", + "rating": 4.5, + "review_count": 234, + "category": "Pizza restaurant" + } + ], + "total_found": 1, + "query": "pizza restaurants", + "location": "Austin, TX" +} +``` + +## Use Cases + +- **Sales Teams**: Build targeted prospect lists for cold outreach +- **Marketing Agencies**: Find local businesses needing marketing services +- **Service Providers**: Identify potential B2B clients in your area +- **Market Research**: Analyze competitor density and ratings by location + +## Customization + +### Modify 
the Search Prompt + +Edit the `SCRAPER_PROMPT` in `main.py` to customize what data the AI extracts: + +```python +SCRAPER_PROMPT = """ +Navigate to Google Maps and search for {query} in {location}. +# Add your custom extraction instructions here +""" +``` + +### Add New Fields + +1. Update `BusinessLead` model in `models.py` +2. Modify the prompt to extract the new fields +3. Redeploy with `kernel deploy main.py` + +## Troubleshooting + +| Issue | Solution | +|-------|----------| +| No results found | Try a broader search query or different location | +| Timeout errors | Reduce `max_results` or check your network | +| Rate limiting | Add delays between requests in production | + +## Resources + +- [Kernel Documentation](https://www.kernel.sh/docs) +- [Browser Use Docs](https://docs.browser-use.com) +- [Pydantic Models](https://docs.pydantic.dev) diff --git a/pkg/templates/python/lead-scraper/_gitignore b/pkg/templates/python/lead-scraper/_gitignore new file mode 100644 index 0000000..75475bc --- /dev/null +++ b/pkg/templates/python/lead-scraper/_gitignore @@ -0,0 +1,79 @@ +# Python +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg + +# Virtual Environment +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# IDE +.idea/ +.vscode/ +*.swp +*.swo +.project +.pydevproject +.settings/ + +# Testing +.coverage +htmlcov/ +.pytest_cache/ +.tox/ +.nox/ +coverage.xml +*.cover +.hypothesis/ + +# Logs +*.log +logs/ + +# OS +.DS_Store +Thumbs.db + +# Browser Use specific +.playwright-screenshots/ +.playwright-videos/ +.playwright-report/ +test-results/ +blob-report/ +playwright/.cache/ +playwright/.local-browsers/ + +# Lead Scraper specific +leads_output/ +*.csv +*.json + +# Misc +.cache/ +.pytest_cache/ +.mypy_cache/ +.ruff_cache/ +.temp/ +.tmp/ diff --git a/pkg/templates/python/lead-scraper/formaters.py 
b/pkg/templates/python/lead-scraper/formaters.py new file mode 100644 index 0000000..60256c2 --- /dev/null +++ b/pkg/templates/python/lead-scraper/formaters.py @@ -0,0 +1,208 @@ +import json +import re +from typing import Any, Iterable +from models import BusinessLead + +_JSON_FENCE_RE = re.compile(r"```(?:json)?\s*(.*?)\s*```", re.IGNORECASE | re.DOTALL) +_TRAILING_COMMA_RE = re.compile(r",\s*([\]}])") +_SMART_QUOTES = { + "\u201c": '"', "\u201d": '"', # “ ” + "\u2018": "'", "\u2019": "'", # ‘ ’ +} + + +def parse_leads_from_result(result_text: str) -> list[BusinessLead]: + """ + Robustly extract a JSON array of leads from an LLM/browser agent output and + convert it into BusinessLead objects. + + Strategy: + 1) Prefer JSON inside ```json ... ``` fenced blocks + 2) Else try to decode from the first '[' onwards using JSONDecoder.raw_decode + 3) Normalize a few common LLM issues (smart quotes, trailing commas, "null" strings) + """ + if not result_text or not result_text.strip(): + return [] + + candidates = _extract_json_candidates(result_text) + + for candidate in candidates: + parsed = _try_parse_json_list(candidate) + if parsed is None: + continue + + leads: list[BusinessLead] = [] + for raw in parsed: + lead = _to_business_lead(raw) + if lead is not None: + leads.append(lead) + + if leads: + return leads # first successful parse wins + + # Fallback: try to parse markdown format (when agent returns numbered lists) + leads = _parse_markdown_leads(result_text) + if leads: + return leads + + return [] + + +def _parse_markdown_leads(text: str) -> list[BusinessLead]: + """ + Parse markdown-formatted lead data when JSON parsing fails. + Handles format like: + 1. 
**Business Name** + - Address: 123 Main St + - Rating: 4.5 + - Phone: +1 555-1234 + """ + leads = [] + + # Pattern to match numbered entries with bold names + entry_pattern = re.compile( + r'\d+\.\s*\*\*(.+?)\*\*\s*\n((?:\s*-\s*.+\n?)+)', + re.MULTILINE + ) + + for match in entry_pattern.finditer(text): + name = match.group(1).strip() + details = match.group(2) + + # Extract fields from the dash-prefixed lines + def extract_field(pattern: str, txt: str) -> str | None: + m = re.search(pattern, txt, re.IGNORECASE) + return m.group(1).strip() if m else None + + address = extract_field(r'-\s*Address:\s*(.+?)(?:\n|$)', details) + rating_str = extract_field(r'-\s*Rating:\s*([\d.]+)', details) + review_str = extract_field(r'-\s*Review\s*Count:\s*([\d,]+)', details) + category = extract_field(r'-\s*Category:\s*(.+?)(?:\n|$)', details) + phone = extract_field(r'-\s*Phone:\s*(.+?)(?:\n|$)', details) + website = extract_field(r'-\s*Website:\s*(.+?)(?:\n|$)', details) + + # Clean up "Not available" etc + if phone and phone.lower() in ('not available', 'n/a', 'none'): + phone = None + if website and website.lower() in ('not available', 'n/a', 'none'): + website = None + + try: + lead = BusinessLead( + name=name, + address=address, + rating=float(rating_str) if rating_str else None, + review_count=int(review_str.replace(',', '')) if review_str else None, + category=category, + phone=phone, + website=website, + ) + leads.append(lead) + except Exception: + continue + + return leads + + +def _extract_json_candidates(text: str) -> list[str]: + """ + Return possible JSON snippets, ordered from most to least likely. 
+ """ + # 1) Fenced code blocks first + fenced = [m.group(1) for m in _JSON_FENCE_RE.finditer(text)] + if fenced: + return fenced + + # 2) Otherwise try from first '[' onward (common "Return ONLY a JSON array") + idx = text.find("[") + return [text[idx:]] if idx != -1 else [] + + +def _normalize_llm_json(s: str) -> str: + # Replace smart quotes + for k, v in _SMART_QUOTES.items(): + s = s.replace(k, v) + + # Some models do ``key``: ``value``. Convert double-backticks to quotes carefully. + # (Keep this minimal: it can still be wrong, but it helps common cases.) + s = s.replace("``", '"') + + # Convert string "null" to JSON null + s = s.replace('"null"', "null") + + # Remove trailing commas before ] or } + s = _TRAILING_COMMA_RE.sub(r"\1", s) + + return s.strip() + + +def _try_parse_json_list(candidate: str) -> list[dict[str, Any]] | None: + """ + Attempt to parse a JSON array from a candidate snippet. + Returns a list of dicts or None. + """ + candidate = _normalize_llm_json(candidate) + + # 1) Direct parse + try: + data = json.loads(candidate) + return data if isinstance(data, list) else None + except json.JSONDecodeError: + pass + + # 2) Decoder-based parse from first '[' (more robust than find/rfind slicing) + start = candidate.find("[") + if start == -1: + return None + + decoder = json.JSONDecoder() + try: + obj, _end = decoder.raw_decode(candidate[start:]) + return obj if isinstance(obj, list) else None + except json.JSONDecodeError: + return None + + +def _to_business_lead(raw: Any) -> BusinessLead | None: + """ + Convert one raw object into a BusinessLead, best-effort. 
+ """ + if not isinstance(raw, dict): + return None + + try: + # Optionally coerce some common fields + rating = raw.get("rating") + if isinstance(rating, str): + rating = _safe_float(rating) + + review_count = raw.get("review_count") + if isinstance(review_count, str): + review_count = _safe_int(review_count) + + return BusinessLead( + name=(raw.get("name") or "Unknown").strip() if isinstance(raw.get("name"), str) else (raw.get("name") or "Unknown"), + phone=raw.get("phone"), + address=raw.get("address"), + website=raw.get("website"), + rating=rating, + review_count=review_count, + category=raw.get("category"), + ) + except Exception: + # Keep parsing the rest; caller decides how to log + return None + + +def _safe_float(x: str) -> float | None: + try: + return float(x.replace(",", "").strip()) + except Exception: + return None + + +def _safe_int(x: str) -> int | None: + try: + return int(x.replace(",", "").strip()) + except Exception: + return None diff --git a/pkg/templates/python/lead-scraper/main.py b/pkg/templates/python/lead-scraper/main.py new file mode 100644 index 0000000..aa7a7eb --- /dev/null +++ b/pkg/templates/python/lead-scraper/main.py @@ -0,0 +1,170 @@ +""" +Google Maps Lead Scraper - Kernel Template + +This template demonstrates how to build a lead scraper using browser-use +to extract local business data from Google Maps. 
+ +Usage: + kernel deploy main.py -e OPENAI_API_KEY=$OPENAI_API_KEY + kernel invoke lead-scraper scrape-leads --data '{"query": "restaurants", "location": "Austin, TX"}' +""" + +import json + +import kernel +from browser_use import Agent, Browser +from browser_use.llm import ChatOpenAI +from kernel import Kernel +from formaters import parse_leads_from_result + +from models import BusinessLead, ScrapeInput, ScrapeOutput + +# Initialize Kernel client and app +client = Kernel() +app = kernel.App("lead-scraper") + +# LLM for the browser-use agent +# API key is set via: kernel deploy main.py -e OPENAI_API_KEY=XXX +llm = ChatOpenAI(model="gpt-4.1") + +# ============================================================================ +# SCRAPER PROMPT +# Customize this prompt to change what data the agent extracts +# ============================================================================ +SCRAPER_PROMPT = """ +You are a lead generation assistant. Scrape business information from Google Maps. + +**Instructions:** +1. Navigate to https://www.google.com/maps +2. Search for: "{query} in {location}" +3. Wait for results to load +4. For each of the max {max_results} businesses in the list: + a. Click on the listing to open its detail view + b. SCROLL DOWN in the detail panel to see all info (phone/website are often below) + c. Extract: name, address, rating, review count, category, phone number, website + d. Click back or the X to close the detail view and return to the list +5. 
After collecting data for max {max_results} businesses, return the JSON + +**What to extract:** +- Business name (REQUIRED) +- Address (REQUIRED) +- Star rating (REQUIRED) +- Review count (optional) +- Category (optional) +- Phone number (scroll down in detail view to find it, null if not shown) +- Website URL (scroll down in detail view to find it, null if not shown) + +**Important:** +- SCROLL DOWN inside each business detail panel to find phone/website +- Use null for any field that isn't available +- Task is SUCCESSFUL when you return at least 1 complete business + +**CRITICAL - Output Format:** +You MUST return ONLY a valid JSON array. No markdown, no explanations, no numbered lists. +Return EXACTLY this format: +[ + {{"name": "Business Name", "address": "123 Main St", "rating": 4.5, "review_count": 100, "category": "Restaurant", "phone": "+1 555-1234", "website": "https://example.com"}} +] +""" + +@app.action("scrape-leads") +async def scrape_leads(ctx: kernel.KernelContext, input_data: dict) -> dict: + """ + Scrape local business leads from Google Maps. + + This action uses browser-use to navigate Google Maps, search for businesses, + and extract structured lead data. 
+ + Args: + ctx: Kernel context containing invocation information + input_data: Dictionary with query, location, and max_results + + Returns: + ScrapeOutput containing list of leads and metadata + + Example: + kernel invoke lead-scraper scrape-leads \ + --data '{"query": "plumbers", "location": "Miami, FL", "max_results": 15}' + """ + # Validate input - default to empty dict if no payload provided + scrape_input = ScrapeInput(**(input_data or {})) + + # Use attribute access for Pydantic model (not dictionary subscript) + input_query = scrape_input.query + input_location = scrape_input.location + input_max_results = scrape_input.max_results + + # Format the prompt with user parameters + task_prompt = SCRAPER_PROMPT.format( + query=input_query, + location=input_location, + max_results=input_max_results, + ) + + print(f"Starting lead scrape: {input_query} in {input_location}") + print(f"Target: {input_max_results} leads") + + # Create Kernel browser session + kernel_browser = None + + try: + + kernel_browser = client.browsers.create( + invocation_id=ctx.invocation_id, + stealth=True, # Use stealth mode to avoid detection + ) + print(f"Browser live view: {kernel_browser.browser_live_view_url}") + + # Connect browser-use to the Kernel browser + browser = Browser( + cdp_url=kernel_browser.cdp_ws_url, + headless=False, + window_size={"width": 1920, "height": 1080}, + viewport={"width": 1920, "height": 1080}, + device_scale_factor=1.0, + ) + + # Create and run the browser-use agent + agent = Agent( + task=task_prompt, + llm=llm, + browser_session=browser, + ) + + print("Running browser-use agent...") + # Limit steps to prevent timeouts (this is a template demo) + result = await agent.run(max_steps=25) + + # Parse the result from final_result + leads = [] + final_text = result.final_result() + + if final_text: + print(f"Parsing final_result ({len(final_text)} chars)...") + leads = parse_leads_from_result(final_text) + else: + # If no final_result, check the last action for 
done text + print("No final_result, checking last action...") + action_results = result.action_results() + if action_results: + last_action = action_results[-1] + if hasattr(last_action, 'extracted_content') and last_action.extracted_content: + content = last_action.extracted_content + print(f"Found content in last action ({len(content)} chars)...") + leads = parse_leads_from_result(content) + + print(f"Successfully extracted {len(leads)} leads") + + output = ScrapeOutput( + leads=leads, + total_found=len(leads), + query=input_query, + location=input_location, + ) + return output.model_dump() + + finally: + # Always clean up the browsers session + if kernel_browser is not None: + client.browsers.delete_by_id(kernel_browser.session_id) + print("Browser session cleaned up") diff --git a/pkg/templates/python/lead-scraper/models.py b/pkg/templates/python/lead-scraper/models.py new file mode 100644 index 0000000..2d3c6e4 --- /dev/null +++ b/pkg/templates/python/lead-scraper/models.py @@ -0,0 +1,65 @@ +from pydantic import BaseModel, Field +from typing import Optional + + +class ScrapeInput(BaseModel): + """Input parameters for the lead scraper. + + Attributes: + query: The type of business to search (e.g., "restaurants", "plumbers", "gyms") + location: The geographic location to search (e.g., "Austin, TX", "New York, NY") + max_results: Maximum number of leads to scrape (default: 2, max: 5) + """ + + query: str = Field( + default="restaurants", + description="Type of business to search for (e.g., 'restaurants', 'plumbers')" + ) + location: str = Field( + default="New York, NY", + description="Geographic location (e.g., 'Austin, TX', 'New York, NY')" + ) + max_results: int = Field( + default=1, + ge=1, + le=5, + description="Maximum number of leads to scrape (1-5)", + ) + + +class BusinessLead(BaseModel): + """Structured data for a business lead scraped from Google Maps. 
+ + Attributes: + name: Business name + phone: Phone number (if available) + address: Full address + website: Website URL (if available) + rating: Star rating (1-5) + review_count: Number of reviews + category: Business category/type + """ + + name: str = Field(description="Business name") + phone: Optional[str] = Field(default=None, description="Phone number") + address: Optional[str] = Field(default=None, description="Full address") + website: Optional[str] = Field(default=None, description="Website URL") + rating: Optional[float] = Field(default=None, ge=1, le=5, description="Star rating") + review_count: Optional[int] = Field(default=None, ge=0, description="Number of reviews") + category: Optional[str] = Field(default=None, description="Business category") + + +class ScrapeOutput(BaseModel): + """Output from the lead scraper. + + Attributes: + leads: List of scraped business leads + total_found: Total number of leads found + query: The original search query + location: The original search location + """ + + leads: list[BusinessLead] = Field(default_factory=list, description="List of scraped leads") + total_found: int = Field(default=0, description="Total number of leads found") + query: str = Field(description="Original search query") + location: str = Field(description="Original search location") diff --git a/pkg/templates/python/lead-scraper/pyproject.toml b/pkg/templates/python/lead-scraper/pyproject.toml new file mode 100644 index 0000000..2c33639 --- /dev/null +++ b/pkg/templates/python/lead-scraper/pyproject.toml @@ -0,0 +1,11 @@ +[project] +name = "lead-scraper" +version = "0.1.0" +description = "Google Maps Lead Scraper - A Kernel template for scraping local business leads" +readme = "README.md" +requires-python = ">=3.11" +dependencies = [ + "browser-use>=0.11.1", + "kernel>=0.23.0", + "pydantic>=2.12.5", +] diff --git a/pkg/templates/typescript/ehr-system/.env.example b/pkg/templates/typescript/ehr-system/.env.example new file mode 100644 index 
0000000..80a79e6 --- /dev/null +++ b/pkg/templates/typescript/ehr-system/.env.example @@ -0,0 +1 @@ +ANTHROPIC_API_KEY= \ No newline at end of file diff --git a/pkg/templates/typescript/ehr-system/.gitignore b/pkg/templates/typescript/ehr-system/.gitignore new file mode 100644 index 0000000..d8f3372 --- /dev/null +++ b/pkg/templates/typescript/ehr-system/.gitignore @@ -0,0 +1,3 @@ +node_modules +.DS_Store +.env diff --git a/pkg/templates/typescript/ehr-system/README.md b/pkg/templates/typescript/ehr-system/README.md new file mode 100644 index 0000000..57f3676 --- /dev/null +++ b/pkg/templates/typescript/ehr-system/README.md @@ -0,0 +1,41 @@ +# EHR System Automation Template + +This template demonstrates how to run an agentic browser workflow on Kernel to automate an Electronic Health Records (EHR) portal. It uses an Anthropic Computer Use loop with Kernel's Computer Controls API. + +## Logic + +The automation performs the following steps: +1. Navigate to the EHR login page (`https://ehr-system-six.vercel.app/login`). +2. Authenticate using valid credentials (any email/password works for this demo). +3. Navigate to the **Medical Reports** section in the dashboard. +4. Click the **Download Summary of Care** button to download the report. + +## Quickstart + +Deploy: + +```bash +kernel deploy index.ts -e ANTHROPIC_API_KEY=$ANTHROPIC_API_KEY +``` + +Invoke: + +```bash +kernel invoke ehr-system export-report +``` + +View logs: + +```bash +kernel logs ehr-system --follow +``` + +## Notes + +- The login page must be publicly reachable from the Kernel browser session. +- Update the `LOGIN_URL` constant in `index.ts` if you host the portal elsewhere. + +## Requirements + +- ANTHROPIC_API_KEY environment variable set. +- Kernel CLI installed and authenticated. 
diff --git a/pkg/templates/typescript/ehr-system/index.ts b/pkg/templates/typescript/ehr-system/index.ts new file mode 100644 index 0000000..43d9d82 --- /dev/null +++ b/pkg/templates/typescript/ehr-system/index.ts @@ -0,0 +1,101 @@ +import { Kernel, type KernelContext } from '@onkernel/sdk'; +import { samplingLoop } from './loop'; +import { KernelBrowserSession } from './session'; + +interface Input { + task?: string; + record_replay?: boolean; +} + +interface Output { + elapsed: number; + result: string | null; + replay_url?: string | null; +} + +const kernel = new Kernel(); +const app = kernel.app('ehr-system'); + +// LLM API Keys are set in the environment during `kernel deploy -e ANTHROPIC_API_KEY=XXX` +// See https://www.kernel.sh/docs/launch/deploy#environment-variables +const ANTHROPIC_API_KEY = process.env.ANTHROPIC_API_KEY; + +if (!ANTHROPIC_API_KEY) { + throw new Error('ANTHROPIC_API_KEY is not set'); +} + +const LOGIN_URL = 'https://ehr-system-six.vercel.app/login'; + +const DEFAULT_TASK = ` +Go to ${LOGIN_URL} +Login with username: Phil1 | password: phil | email: heya@invalid.email.com. +Navigate to the "Medical Reports" page. +Find the "Download Summary of Care" button and click it to download the report. +`; + +app.action( + 'export-report', + async (ctx: KernelContext, payload?: Input): Promise => { + const start = Date.now(); + const task = payload?.task || DEFAULT_TASK; + + // Create browser session with optional replay recording + const session = new KernelBrowserSession(kernel, { + stealth: true, + recordReplay: payload?.record_replay ?? false, + }); + + await session.start(); + console.log('> Kernel browser live view url:', session.liveViewUrl); + + try { + // Run the sampling loop with Anthropic Computer Use + const finalMessages = await samplingLoop({ + model: 'claude-sonnet-4-5-20250929', + messages: [{ + role: 'user', + content: `You are an automated agent. Current date and time: ${new Date().toISOString()}. 
You must complete the task fully without asking for permission.\n\nTask: ${task}`, + }], + apiKey: ANTHROPIC_API_KEY, + thinkingBudget: 1024, + kernel, + sessionId: session.sessionId, + }); + + // Extract the final result from the messages + if (finalMessages.length === 0) { + throw new Error('No messages were generated during the sampling loop'); + } + + const lastMessage = finalMessages[finalMessages.length - 1]; + if (!lastMessage) { + throw new Error('Failed to get the last message from the sampling loop'); + } + + const result = typeof lastMessage.content === 'string' + ? lastMessage.content + : lastMessage.content.map(block => + block.type === 'text' ? block.text : '' + ).join(''); + + const elapsed = parseFloat(((Date.now() - start) / 1000).toFixed(2)); + + // Stop session and get replay URL if recording was enabled + const sessionInfo = await session.stop(); + + return { + elapsed, + result, + replay_url: sessionInfo.replayViewUrl, + }; + } catch (error) { + const elapsed = parseFloat(((Date.now() - start) / 1000).toFixed(2)); + console.error('Error in export-report:', error); + await session.stop(); + return { + elapsed, + result: null, + }; + } + }, +); diff --git a/pkg/templates/typescript/ehr-system/loop.ts b/pkg/templates/typescript/ehr-system/loop.ts new file mode 100644 index 0000000..06e22ca --- /dev/null +++ b/pkg/templates/typescript/ehr-system/loop.ts @@ -0,0 +1,196 @@ +import { Anthropic } from '@anthropic-ai/sdk'; +import { DateTime } from 'luxon'; +import type { Kernel } from '@onkernel/sdk'; +import { DEFAULT_TOOL_VERSION, TOOL_GROUPS_BY_VERSION, ToolCollection, type ToolVersion } from './tools/collection'; +import { ComputerTool20241022, ComputerTool20250124 } from './tools/computer'; +import type { ActionParams } from './tools/types/computer'; +import { Action } from './tools/types/computer'; +import type { BetaMessageParam, BetaTextBlock } from './types/beta'; +import { injectPromptCaching, maybeFilterToNMostRecentImages, 
PROMPT_CACHING_BETA_FLAG, responseToParams } from './utils/message-processing'; +import { makeApiToolResult } from './utils/tool-results'; + +// System prompt optimized for the environment +const SYSTEM_PROMPT = ` +* You are utilising an Ubuntu virtual machine using ${process.arch} architecture with internet access. +* When you connect to the display, CHROMIUM IS ALREADY OPEN. The url bar is not visible but it is there. +* If you need to navigate to a new page, use ctrl+l to focus the url bar and then enter the url. +* You won't be able to see the url bar from the screenshot but ctrl-l still works. +* As the initial step click on the search bar. +* When viewing a page it can be helpful to zoom out so that you can see everything on the page. +* Either that, or make sure you scroll down to see everything before deciding something isn't available. +* When using your computer function calls, they take a while to run and send back to you. +* Where possible/feasible, try to chain multiple of these calls all into one function calls request. +* The current date is ${DateTime.now().toFormat('EEEE, MMMM d, yyyy')}. +* After each step, take a screenshot and carefully evaluate if you have achieved the right outcome. +* Explicitly show your thinking: "I have evaluated step X..." If not correct, try again. +* Only when you confirm a step was executed correctly should you move on to the next one. + + + +* When using Chromium, if a startup wizard appears, IGNORE IT. Do not even click "skip this step". +* Instead, click on the search bar on the center of the screen where it says "Search or enter address", and enter the appropriate search term or URL there. 
+`; + +// Add new type definitions +interface ThinkingConfig { + type: 'enabled'; + budget_tokens: number; +} + +interface ExtraBodyConfig { + thinking?: ThinkingConfig; +} + +interface ToolUseInput extends Record { + action: Action; +} + +export async function samplingLoop({ + model, + systemPromptSuffix, + messages, + apiKey, + onlyNMostRecentImages, + maxTokens = 4096, + toolVersion, + thinkingBudget, + tokenEfficientToolsBeta = false, + kernel, + sessionId, +}: { + model: string; + systemPromptSuffix?: string; + messages: BetaMessageParam[]; + apiKey: string; + onlyNMostRecentImages?: number; + maxTokens?: number; + toolVersion?: ToolVersion; + thinkingBudget?: number; + tokenEfficientToolsBeta?: boolean; + kernel: Kernel; + sessionId: string; +}): Promise { + const selectedVersion = toolVersion || DEFAULT_TOOL_VERSION; + const toolGroup = TOOL_GROUPS_BY_VERSION[selectedVersion]; + const toolCollection = new ToolCollection(...toolGroup.tools.map((Tool: typeof ComputerTool20241022 | typeof ComputerTool20250124) => new Tool(kernel, sessionId))); + + const system: BetaTextBlock = { + type: 'text', + text: `${SYSTEM_PROMPT}${systemPromptSuffix ? ' ' + systemPromptSuffix : ''}`, + }; + + while (true) { + const betas: string[] = toolGroup.beta_flag ? 
[toolGroup.beta_flag] : []; + + if (tokenEfficientToolsBeta) { + betas.push('token-efficient-tools-2025-02-19'); + } + + let imageTruncationThreshold = onlyNMostRecentImages || 0; + + const client = new Anthropic({ apiKey, maxRetries: 4 }); + const enablePromptCaching = true; + + if (enablePromptCaching) { + betas.push(PROMPT_CACHING_BETA_FLAG); + injectPromptCaching(messages); + onlyNMostRecentImages = 0; + (system as BetaTextBlock).cache_control = { type: 'ephemeral' }; + } + + if (onlyNMostRecentImages) { + maybeFilterToNMostRecentImages( + messages, + onlyNMostRecentImages, + imageTruncationThreshold + ); + } + + const extraBody: ExtraBodyConfig = {}; + if (thinkingBudget) { + extraBody.thinking = { type: 'enabled', budget_tokens: thinkingBudget }; + } + + const toolParams = toolCollection.toParams(); + + const response = await client.beta.messages.create({ + max_tokens: maxTokens, + messages, + model, + system: [system], + tools: toolParams as any[], + betas, + ...extraBody, + }); + + const responseParams = responseToParams(response); + + const loggableContent = responseParams.map(block => { + if (block.type === 'tool_use') { + return { + type: 'tool_use', + name: block.name, + input: block.input + }; + } + return block; + }); + console.log('=== LLM RESPONSE ==='); + console.log('Stop reason:', response.stop_reason); + console.log(loggableContent); + console.log("===") + + messages.push({ + role: 'assistant', + content: responseParams, + }); + + if (response.stop_reason === 'end_turn') { + console.log('LLM has completed its task, ending loop'); + return messages; + } + + const toolResultContent = []; + let hasToolUse = false; + + for (const contentBlock of responseParams) { + if (contentBlock.type === 'tool_use' && contentBlock.name && contentBlock.input && typeof contentBlock.input === 'object') { + const input = contentBlock.input as ToolUseInput; + if ('action' in input && typeof input.action === 'string') { + hasToolUse = true; + const toolInput: 
ActionParams = { + action: input.action as Action, + ...Object.fromEntries( + Object.entries(input).filter(([key]) => key !== 'action') + ) + }; + + try { + const result = await toolCollection.run( + contentBlock.name, + toolInput + ); + + const toolResult = makeApiToolResult(result, contentBlock.id!); + toolResultContent.push(toolResult); + } catch (error) { + console.error(error); + throw error; + } + } + } + } + + if (toolResultContent.length === 0 && !hasToolUse && response.stop_reason !== 'tool_use') { + console.log('No tool use or results, and not waiting for tool use, ending loop'); + return messages; + } + + if (toolResultContent.length > 0) { + messages.push({ + role: 'user', + content: toolResultContent, + }); + } + } +} diff --git a/pkg/templates/typescript/ehr-system/package-lock.json b/pkg/templates/typescript/ehr-system/package-lock.json new file mode 100644 index 0000000..e91f864 --- /dev/null +++ b/pkg/templates/typescript/ehr-system/package-lock.json @@ -0,0 +1,121 @@ +{ + "name": "ehr-system", + "lockfileVersion": 3, + "requires": true, + "packages": { + "": { + "name": "ehr-system", + "dependencies": { + "@anthropic-ai/sdk": "^0.71.2", + "@onkernel/sdk": "^0.24.0", + "luxon": "^3.7.2" + }, + "devDependencies": { + "@types/luxon": "^3.7.1", + "@types/node": "^22.15.17", + "typescript": "^5.9.3" + } + }, + "node_modules/@anthropic-ai/sdk": { + "version": "0.71.2", + "resolved": "https://registry.npmjs.org/@anthropic-ai/sdk/-/sdk-0.71.2.tgz", + "integrity": "sha512-TGNDEUuEstk/DKu0/TflXAEt+p+p/WhTlFzEnoosvbaDU2LTjm42igSdlL0VijrKpWejtOKxX0b8A7uc+XiSAQ==", + "license": "MIT", + "dependencies": { + "json-schema-to-ts": "^3.1.1" + }, + "bin": { + "anthropic-ai-sdk": "bin/cli" + }, + "peerDependencies": { + "zod": "^3.25.0 || ^4.0.0" + }, + "peerDependenciesMeta": { + "zod": { + "optional": true + } + } + }, + "node_modules/@babel/runtime": { + "version": "7.28.6", + "resolved": "https://registry.npmjs.org/@babel/runtime/-/runtime-7.28.6.tgz", + 
"integrity": "sha512-05WQkdpL9COIMz4LjTxGpPNCdlpyimKppYNoJ5Di5EUObifl8t4tuLuUBBZEpoLYOmfvIWrsp9fCl0HoPRVTdA==", + "license": "MIT", + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@onkernel/sdk": { + "version": "0.24.0", + "resolved": "https://registry.npmjs.org/@onkernel/sdk/-/sdk-0.24.0.tgz", + "integrity": "sha512-f0xZGSaC9Nlg7CwLw6agyw682sc9Q8rPRG6Zyk82JmCKETFBdMqfyXuxK5uESidk0pQp/GYGG8rHy+vGa5jgCQ==", + "license": "Apache-2.0" + }, + "node_modules/@types/luxon": { + "version": "3.7.1", + "resolved": "https://registry.npmjs.org/@types/luxon/-/luxon-3.7.1.tgz", + "integrity": "sha512-H3iskjFIAn5SlJU7OuxUmTEpebK6TKB8rxZShDslBMZJ5u9S//KM1sbdAisiSrqwLQncVjnpi2OK2J51h+4lsg==", + "dev": true, + "license": "MIT" + }, + "node_modules/@types/node": { + "version": "22.19.7", + "resolved": "https://registry.npmjs.org/@types/node/-/node-22.19.7.tgz", + "integrity": "sha512-MciR4AKGHWl7xwxkBa6xUGxQJ4VBOmPTF7sL+iGzuahOFaO0jHCsuEfS80pan1ef4gWId1oWOweIhrDEYLuaOw==", + "dev": true, + "license": "MIT", + "dependencies": { + "undici-types": "~6.21.0" + } + }, + "node_modules/json-schema-to-ts": { + "version": "3.1.1", + "resolved": "https://registry.npmjs.org/json-schema-to-ts/-/json-schema-to-ts-3.1.1.tgz", + "integrity": "sha512-+DWg8jCJG2TEnpy7kOm/7/AxaYoaRbjVB4LFZLySZlWn8exGs3A4OLJR966cVvU26N7X9TWxl+Jsw7dzAqKT6g==", + "license": "MIT", + "dependencies": { + "@babel/runtime": "^7.18.3", + "ts-algebra": "^2.0.0" + }, + "engines": { + "node": ">=16" + } + }, + "node_modules/luxon": { + "version": "3.7.2", + "resolved": "https://registry.npmjs.org/luxon/-/luxon-3.7.2.tgz", + "integrity": "sha512-vtEhXh/gNjI9Yg1u4jX/0YVPMvxzHuGgCm6tC5kZyb08yjGWGnqAjGJvcXbqQR2P3MyMEFnRbpcdFS6PBcLqew==", + "license": "MIT", + "engines": { + "node": ">=12" + } + }, + "node_modules/ts-algebra": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/ts-algebra/-/ts-algebra-2.0.0.tgz", + "integrity": 
"sha512-FPAhNPFMrkwz76P7cdjdmiShwMynZYN6SgOujD1urY4oNm80Ou9oMdmbR45LotcKOXoy7wSmHkRFE6Mxbrhefw==", + "license": "MIT" + }, + "node_modules/typescript": { + "version": "5.9.3", + "resolved": "https://registry.npmjs.org/typescript/-/typescript-5.9.3.tgz", + "integrity": "sha512-jl1vZzPDinLr9eUt3J/t7V6FgNEw9QjvBPdysz9KfQDD41fQrC2Y4vKQdiaUpFT4bXlb1RHhLpp8wtm6M5TgSw==", + "dev": true, + "license": "Apache-2.0", + "bin": { + "tsc": "bin/tsc", + "tsserver": "bin/tsserver" + }, + "engines": { + "node": ">=14.17" + } + }, + "node_modules/undici-types": { + "version": "6.21.0", + "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-6.21.0.tgz", + "integrity": "sha512-iwDZqg0QAGrg9Rav5H4n0M64c3mkR59cJ6wQp+7C4nI0gsmExaedaYLNO44eT4AtBBwjbTiGPMlt2Md0T9H9JQ==", + "dev": true, + "license": "MIT" + } + } +} diff --git a/pkg/templates/typescript/ehr-system/package.json b/pkg/templates/typescript/ehr-system/package.json new file mode 100644 index 0000000..7552171 --- /dev/null +++ b/pkg/templates/typescript/ehr-system/package.json @@ -0,0 +1,19 @@ +{ + "name": "ehr-system", + "module": "index.ts", + "type": "module", + "private": true, + "scripts": { + "build": "tsc" + }, + "dependencies": { + "@anthropic-ai/sdk": "^0.71.2", + "@onkernel/sdk": "^0.24.0", + "luxon": "^3.7.2" + }, + "devDependencies": { + "@types/luxon": "^3.7.1", + "@types/node": "^22.15.17", + "typescript": "^5.9.3" + } +} diff --git a/pkg/templates/typescript/ehr-system/session.ts b/pkg/templates/typescript/ehr-system/session.ts new file mode 100644 index 0000000..3aeb77c --- /dev/null +++ b/pkg/templates/typescript/ehr-system/session.ts @@ -0,0 +1,222 @@ +/** + * Kernel Browser Session Manager. + * + * Provides a class for managing Kernel browser lifecycle + * with optional video replay recording. 
+ */ + +import type { Kernel } from '@onkernel/sdk'; + +export interface SessionOptions { + /** Enable stealth mode to avoid bot detection */ + stealth?: boolean; + /** Browser session timeout in seconds */ + timeoutSeconds?: number; + /** Enable replay recording (requires paid plan) */ + recordReplay?: boolean; + /** Grace period in seconds before stopping replay */ + replayGracePeriod?: number; +} + +export interface SessionInfo { + sessionId: string; + liveViewUrl: string; + replayId?: string; + replayViewUrl?: string; +} + +const DEFAULT_OPTIONS: Required = { + stealth: true, + timeoutSeconds: 300, + recordReplay: false, + replayGracePeriod: 5.0, +}; + +/** + * Manages Kernel browser lifecycle with optional replay recording. + * + * Usage: + * ```typescript + * const session = new KernelBrowserSession(kernel, options); + * await session.start(); + * try { + * // Use session.sessionId for computer controls + * } finally { + * await session.stop(); + * } + * ``` + */ +export class KernelBrowserSession { + private kernel: Kernel; + private options: Required; + + // Session state + private _sessionId: string | null = null; + private _liveViewUrl: string | null = null; + private _replayId: string | null = null; + private _replayViewUrl: string | null = null; + + constructor(kernel: Kernel, options: SessionOptions = {}) { + this.kernel = kernel; + this.options = { ...DEFAULT_OPTIONS, ...options }; + } + + get sessionId(): string { + if (!this._sessionId) { + throw new Error('Session not started. 
Call start() first.'); + } + return this._sessionId; + } + + get liveViewUrl(): string | null { + return this._liveViewUrl; + } + + get replayViewUrl(): string | null { + return this._replayViewUrl; + } + + get info(): SessionInfo { + return { + sessionId: this.sessionId, + liveViewUrl: this._liveViewUrl || '', + replayId: this._replayId || undefined, + replayViewUrl: this._replayViewUrl || undefined, + }; + } + + /** + * Create a Kernel browser session and optionally start recording. + */ + async start(): Promise { + // Create browser with specified settings + const browser = await this.kernel.browsers.create({ + stealth: this.options.stealth, + timeout_seconds: this.options.timeoutSeconds, + viewport: { + width: 1024, + height: 768, + refresh_rate: 60, + }, + }); + + this._sessionId = browser.session_id; + this._liveViewUrl = browser.browser_live_view_url ?? null; + + console.log(`Kernel browser created: ${this._sessionId}`); + console.log(`Live view URL: ${this._liveViewUrl}`); + + // Start replay recording if enabled + if (this.options.recordReplay) { + try { + await this.startReplay(); + } catch (error) { + console.warn(`Warning: Failed to start replay recording: ${error}`); + console.warn('Continuing without replay recording.'); + } + } + + return this.info; + } + + /** + * Start recording a replay of the browser session. + */ + private async startReplay(): Promise { + if (!this._sessionId) { + return; + } + + console.log('Starting replay recording...'); + const replay = await this.kernel.browsers.replays.start(this._sessionId); + this._replayId = replay.replay_id; + console.log(`Replay recording started: ${this._replayId}`); + } + + /** + * Stop recording and get the replay URL. + */ + private async stopReplay(): Promise { + if (!this._sessionId || !this._replayId) { + return; + } + + console.log('Stopping replay recording...'); + await this.kernel.browsers.replays.stop(this._replayId, { + id: this._sessionId, + }); + console.log('Replay recording stopped. 
Processing video...'); + + // Wait a moment for processing + await this.sleep(2000); + + // Poll for replay to be ready (with timeout) + const maxWait = 60000; // 60 seconds + const startTime = Date.now(); + let replayReady = false; + + while (Date.now() - startTime < maxWait) { + try { + const replays = await this.kernel.browsers.replays.list(this._sessionId); + for (const replay of replays) { + if (replay.replay_id === this._replayId) { + this._replayViewUrl = replay.replay_view_url ?? null; + replayReady = true; + break; + } + } + if (replayReady) { + break; + } + } catch { + // Ignore errors while polling + } + await this.sleep(1000); + } + + if (!replayReady) { + console.log('Warning: Replay may still be processing'); + } else if (this._replayViewUrl) { + console.log(`Replay view URL: ${this._replayViewUrl}`); + } + } + + /** + * Stop recording, and delete the browser session. + */ + async stop(): Promise { + const info = this.info; + + if (this._sessionId) { + try { + // Stop replay if recording was enabled + if (this.options.recordReplay && this._replayId) { + // Wait grace period before stopping to capture final state + if (this.options.replayGracePeriod > 0) { + console.log(`Waiting ${this.options.replayGracePeriod}s grace period...`); + await this.sleep(this.options.replayGracePeriod * 1000); + } + await this.stopReplay(); + info.replayViewUrl = this._replayViewUrl || undefined; + } + } finally { + // Always clean up the browser session, even if replay stopping fails + console.log(`Destroying browser session: ${this._sessionId}`); + await this.kernel.browsers.deleteByID(this._sessionId); + console.log('Browser session destroyed.'); + } + } + + // Reset state + this._sessionId = null; + this._liveViewUrl = null; + this._replayId = null; + this._replayViewUrl = null; + + return info; + } + + private sleep(ms: number): Promise { + return new Promise(resolve => setTimeout(resolve, ms)); + } +} diff --git 
a/pkg/templates/typescript/ehr-system/tools/collection.ts b/pkg/templates/typescript/ehr-system/tools/collection.ts new file mode 100644 index 0000000..155352d --- /dev/null +++ b/pkg/templates/typescript/ehr-system/tools/collection.ts @@ -0,0 +1,61 @@ +import { ComputerTool20241022, ComputerTool20250124 } from './computer'; +import { Action } from './types/computer'; +import type { ActionParams, ToolResult } from './types/computer'; + +export type ToolVersion = 'computer_use_20250124' | 'computer_use_20241022' | 'computer_use_20250429'; + +export const DEFAULT_TOOL_VERSION: ToolVersion = 'computer_use_20250429'; + +interface ToolGroup { + readonly version: ToolVersion; + readonly tools: (typeof ComputerTool20241022 | typeof ComputerTool20250124)[]; + readonly beta_flag: string; +} + +export const TOOL_GROUPS: ToolGroup[] = [ + { + version: 'computer_use_20241022', + tools: [ComputerTool20241022], + beta_flag: 'computer-use-2024-10-22', + }, + { + version: 'computer_use_20250124', + tools: [ComputerTool20250124], + beta_flag: 'computer-use-2025-01-24', + }, + // 20250429 version inherits from 20250124 + { + version: 'computer_use_20250429', + tools: [ComputerTool20250124], + beta_flag: 'computer-use-2025-01-24', + }, +]; + +export const TOOL_GROUPS_BY_VERSION: Record = Object.fromEntries( + TOOL_GROUPS.map(group => [group.version, group]) +) as Record; + +export class ToolCollection { + private tools: Map; + + constructor(...tools: (ComputerTool20241022 | ComputerTool20250124)[]) { + this.tools = new Map(tools.map(tool => [tool.name, tool])); + } + + toParams(): unknown[] { + return Array.from(this.tools.values()).map(tool => tool.toParams()); + } + + async run(name: string, toolInput: ActionParams): Promise { + const tool = this.tools.get(name); + if (!tool) { + throw new Error(`Tool ${name} not found`); + } + + if (!Object.values(Action).includes(toolInput.action)) { + throw new Error(`Invalid action ${toolInput.action} for tool ${name}`); + } + + return await 
tool.call(toolInput); + } +} \ No newline at end of file diff --git a/pkg/templates/typescript/ehr-system/tools/computer.ts b/pkg/templates/typescript/ehr-system/tools/computer.ts new file mode 100644 index 0000000..dc0eb41 --- /dev/null +++ b/pkg/templates/typescript/ehr-system/tools/computer.ts @@ -0,0 +1,401 @@ +import { Buffer } from 'buffer'; +import type { Kernel } from '@onkernel/sdk'; +import type { BaseAnthropicTool, ToolResult, ActionParams } from './types/computer'; +import { Action, ToolError } from './types/computer'; +import { ActionValidator } from './utils/validator'; + +const TYPING_DELAY_MS = 12; + +// Type for the tool parameters sent to Anthropic API +export interface ComputerToolParams { + name: 'computer'; + type: 'computer_20241022' | 'computer_20250124'; + display_width_px: number; + display_height_px: number; + display_number: null; +} + +export class ComputerTool implements BaseAnthropicTool { + name: 'computer' = 'computer'; + protected kernel: Kernel; + protected sessionId: string; + protected _screenshotDelay = 2.0; + protected version: '20241022' | '20250124'; + + private lastMousePosition: [number, number] = [0, 0]; + + private readonly mouseActions = new Set([ + Action.LEFT_CLICK, + Action.RIGHT_CLICK, + Action.MIDDLE_CLICK, + Action.DOUBLE_CLICK, + Action.TRIPLE_CLICK, + Action.MOUSE_MOVE, + Action.LEFT_MOUSE_DOWN, + Action.LEFT_MOUSE_UP, + ]); + + private readonly keyboardActions = new Set([ + Action.KEY, + Action.TYPE, + Action.HOLD_KEY, + ]); + + private readonly systemActions = new Set([ + Action.SCREENSHOT, + Action.CURSOR_POSITION, + Action.SCROLL, + Action.WAIT, + ]); + + constructor(kernel: Kernel, sessionId: string, version: '20241022' | '20250124' = '20250124') { + this.kernel = kernel; + this.sessionId = sessionId; + this.version = version; + } + + get apiType(): 'computer_20241022' | 'computer_20250124' { + return this.version === '20241022' ? 
'computer_20241022' : 'computer_20250124'; + } + + toParams(): ComputerToolParams { + const params: ComputerToolParams = { + name: this.name, + type: this.apiType, + display_width_px: 1024, + display_height_px: 768, + display_number: null, + }; + return params; + } + + private getMouseButton(action: Action): 'left' | 'right' | 'middle' { + switch (action) { + case Action.LEFT_CLICK: + case Action.DOUBLE_CLICK: + case Action.TRIPLE_CLICK: + case Action.LEFT_CLICK_DRAG: + case Action.LEFT_MOUSE_DOWN: + case Action.LEFT_MOUSE_UP: + return 'left'; + case Action.RIGHT_CLICK: + return 'right'; + case Action.MIDDLE_CLICK: + return 'middle'; + default: + throw new ToolError(`Invalid mouse action: ${action}`); + } + } + + private async handleMouseAction(action: Action, coordinate: [number, number]): Promise { + const [x, y] = ActionValidator.validateAndGetCoordinates(coordinate); + + if (action === Action.MOUSE_MOVE) { + await this.kernel.browsers.computer.moveMouse(this.sessionId, { + x, + y, + }); + this.lastMousePosition = [x, y]; + } else if (action === Action.LEFT_MOUSE_DOWN) { + await this.kernel.browsers.computer.clickMouse(this.sessionId, { + x, + y, + button: 'left', + click_type: 'down', + }); + this.lastMousePosition = [x, y]; + } else if (action === Action.LEFT_MOUSE_UP) { + await this.kernel.browsers.computer.clickMouse(this.sessionId, { + x, + y, + button: 'left', + click_type: 'up', + }); + this.lastMousePosition = [x, y]; + } else { + const button = this.getMouseButton(action); + let numClicks = 1; + if (action === Action.DOUBLE_CLICK) { + numClicks = 2; + } else if (action === Action.TRIPLE_CLICK) { + numClicks = 3; + } + + await this.kernel.browsers.computer.clickMouse(this.sessionId, { + x, + y, + button, + click_type: 'click', + num_clicks: numClicks, + }); + this.lastMousePosition = [x, y]; + } + + await new Promise(resolve => setTimeout(resolve, 500)); + return await this.screenshot(); + } + + private async handleKeyboardAction(action: Action, text: 
string, duration?: number): Promise { + if (action === Action.HOLD_KEY) { + const key = this.convertToKernelKey(text); + await this.kernel.browsers.computer.pressKey(this.sessionId, { + keys: [key], + duration: duration ? duration * 1000 : undefined, + }); + } else if (action === Action.KEY) { + const key = this.convertKeyCombinationToKernel(text); + await this.kernel.browsers.computer.pressKey(this.sessionId, { + keys: [key], + }); + } else { + await this.kernel.browsers.computer.typeText(this.sessionId, { + text, + delay: TYPING_DELAY_MS, + }); + } + + await new Promise(resolve => setTimeout(resolve, 500)); + return await this.screenshot(); + } + + // Key mappings for Kernel Computer Controls API (xdotool format) + private static readonly KEY_MAP: Record = { + // Enter/Return + 'return': 'Return', + 'enter': 'Return', + 'Enter': 'Return', + // Arrow keys + 'left': 'Left', + 'right': 'Right', + 'up': 'Up', + 'down': 'Down', + 'ArrowLeft': 'Left', + 'ArrowRight': 'Right', + 'ArrowUp': 'Up', + 'ArrowDown': 'Down', + // Navigation + 'home': 'Home', + 'end': 'End', + 'pageup': 'Page_Up', + 'page_up': 'Page_Up', + 'PageUp': 'Page_Up', + 'pagedown': 'Page_Down', + 'page_down': 'Page_Down', + 'PageDown': 'Page_Down', + // Editing + 'delete': 'Delete', + 'backspace': 'BackSpace', + 'Backspace': 'BackSpace', + 'tab': 'Tab', + 'insert': 'Insert', + // Escape + 'esc': 'Escape', + 'escape': 'Escape', + // Function keys + 'f1': 'F1', + 'f2': 'F2', + 'f3': 'F3', + 'f4': 'F4', + 'f5': 'F5', + 'f6': 'F6', + 'f7': 'F7', + 'f8': 'F8', + 'f9': 'F9', + 'f10': 'F10', + 'f11': 'F11', + 'f12': 'F12', + // Misc + 'space': 'space', + 'minus': 'minus', + 'equal': 'equal', + 'plus': 'plus', + }; + + // Modifier key mappings (xdotool format) + private static readonly MODIFIER_MAP: Record = { + 'ctrl': 'ctrl', + 'control': 'ctrl', + 'Control': 'ctrl', + 'alt': 'alt', + 'Alt': 'alt', + 'shift': 'shift', + 'Shift': 'shift', + 'meta': 'super', + 'Meta': 'super', + 'cmd': 'super', + 'command': 
'super', + 'win': 'super', + 'super': 'super', + }; + + private convertToKernelKey(key: string): string { + // Check modifier keys first + if (ComputerTool.MODIFIER_MAP[key]) { + return ComputerTool.MODIFIER_MAP[key]; + } + // Check special keys + if (ComputerTool.KEY_MAP[key]) { + return ComputerTool.KEY_MAP[key]; + } + // Return as-is if no mapping exists + return key; + } + + private convertKeyCombinationToKernel(combo: string): string { + // Handle key combinations (e.g., "ctrl+a", "Control+t") + if (combo.includes('+')) { + const parts = combo.split('+'); + const mappedParts = parts.map(part => this.convertToKernelKey(part.trim())); + return mappedParts.join('+'); + } + // Single key - just convert it + return this.convertToKernelKey(combo); + } + + async screenshot(): Promise { + try { + console.log('Starting screenshot...'); + await new Promise(resolve => setTimeout(resolve, this._screenshotDelay * 1000)); + const response = await this.kernel.browsers.computer.captureScreenshot(this.sessionId); + const blob = await response.blob(); + const arrayBuffer = await blob.arrayBuffer(); + const buffer = Buffer.from(arrayBuffer); + console.log('Screenshot taken, size:', buffer.length, 'bytes'); + + return { + base64Image: buffer.toString('base64'), + }; + } catch (error) { + throw new ToolError(`Failed to take screenshot: ${error}`); + } + } + + async call(params: ActionParams): Promise { + const { + action, + text, + coordinate, + scrollDirection: scrollDirectionParam, + scroll_amount, + scrollAmount, + duration, + ...kwargs + } = params; + + ActionValidator.validateActionParams(params, this.mouseActions, this.keyboardActions); + + if (action === Action.SCREENSHOT) { + return await this.screenshot(); + } + + if (action === Action.CURSOR_POSITION) { + throw new ToolError('Cursor position is not available with Kernel Computer Controls API'); + } + + if (action === Action.SCROLL) { + if (this.version !== '20250124') { + throw new ToolError(`${action} is only available 
in version 20250124`); + } + + const scrollDirection = (scrollDirectionParam || kwargs.scroll_direction) as string | undefined; + const scrollAmountValue = scrollAmount || scroll_amount; + + if (!scrollDirection || !['up', 'down', 'left', 'right'].includes(String(scrollDirection))) { + throw new ToolError(`Scroll direction "${scrollDirection}" must be 'up', 'down', 'left', or 'right'`); + } + if (typeof scrollAmountValue !== 'number' || scrollAmountValue < 0) { + throw new ToolError(`Scroll amount "${scrollAmountValue}" must be a non-negative number`); + } + + const [x, y] = coordinate + ? ActionValidator.validateAndGetCoordinates(coordinate) + : this.lastMousePosition; + + let delta_x = 0; + let delta_y = 0; + // Each scroll_amount unit = 1 scroll wheel click ≈ 120 pixels (matches Anthropic's xdotool behavior) + const scrollDelta = (scrollAmountValue ?? 1) * 120; + + if (scrollDirection === 'down') { + delta_y = scrollDelta; + } else if (scrollDirection === 'up') { + delta_y = -scrollDelta; + } else if (scrollDirection === 'right') { + delta_x = scrollDelta; + } else if (scrollDirection === 'left') { + delta_x = -scrollDelta; + } + + await this.kernel.browsers.computer.scroll(this.sessionId, { + x, + y, + delta_x, + delta_y, + }); + + await new Promise(resolve => setTimeout(resolve, 500)); + return await this.screenshot(); + } + + if (action === Action.WAIT) { + if (this.version !== '20250124') { + throw new ToolError(`${action} is only available in version 20250124`); + } + await new Promise(resolve => setTimeout(resolve, duration! * 1000)); + return await this.screenshot(); + } + + if (action === Action.LEFT_CLICK_DRAG) { + if (!coordinate) { + throw new ToolError(`coordinate is required for ${action}`); + } + + const [endX, endY] = ActionValidator.validateAndGetCoordinates(coordinate); + const startCoordinate = kwargs.start_coordinate as [number, number] | undefined; + const [startX, startY] = startCoordinate + ? 
ActionValidator.validateAndGetCoordinates(startCoordinate) + : this.lastMousePosition; + + console.log(`Dragging from (${startX}, ${startY}) to (${endX}, ${endY})`); + + await this.kernel.browsers.computer.dragMouse(this.sessionId, { + path: [[startX, startY], [endX, endY]], + button: 'left', + }); + + this.lastMousePosition = [endX, endY]; + + await new Promise(resolve => setTimeout(resolve, 500)); + return await this.screenshot(); + } + + if (this.mouseActions.has(action)) { + if (!coordinate) { + throw new ToolError(`coordinate is required for ${action}`); + } + return await this.handleMouseAction(action, coordinate); + } + + if (this.keyboardActions.has(action)) { + if (!text) { + throw new ToolError(`text is required for ${action}`); + } + return await this.handleKeyboardAction(action, text, duration); + } + + throw new ToolError(`Invalid action: ${action}`); + } +} + +// For backward compatibility +export class ComputerTool20241022 extends ComputerTool { + constructor(kernel: Kernel, sessionId: string) { + super(kernel, sessionId, '20241022'); + } +} + +export class ComputerTool20250124 extends ComputerTool { + constructor(kernel: Kernel, sessionId: string) { + super(kernel, sessionId, '20250124'); + } +} diff --git a/pkg/templates/typescript/ehr-system/tools/types/computer.ts b/pkg/templates/typescript/ehr-system/tools/types/computer.ts new file mode 100644 index 0000000..d7ac72e --- /dev/null +++ b/pkg/templates/typescript/ehr-system/tools/types/computer.ts @@ -0,0 +1,64 @@ +export enum Action { + // Mouse actions + MOUSE_MOVE = 'mouse_move', + LEFT_CLICK = 'left_click', + RIGHT_CLICK = 'right_click', + MIDDLE_CLICK = 'middle_click', + DOUBLE_CLICK = 'double_click', + TRIPLE_CLICK = 'triple_click', + LEFT_CLICK_DRAG = 'left_click_drag', + LEFT_MOUSE_DOWN = 'left_mouse_down', + LEFT_MOUSE_UP = 'left_mouse_up', + + // Keyboard actions + KEY = 'key', + TYPE = 'type', + HOLD_KEY = 'hold_key', + + // System actions + SCREENSHOT = 'screenshot', + CURSOR_POSITION 
// ---------------------------------------------------------------------------
// Computer-tool type declarations.
// `Action` (used below) is the action enum declared outside this block.
// ---------------------------------------------------------------------------

// For backward compatibility
export type Action_20241022 = Action;
export type Action_20250124 = Action;

export type MouseButton = 'left' | 'right' | 'middle';
export type ScrollDirection = 'up' | 'down' | 'left' | 'right';
export type Coordinate = [number, number];
export type Duration = number;

/** Parameters accompanying a single tool action request. */
export interface ActionParams {
  action: Action;
  text?: string;
  coordinate?: Coordinate;
  scrollDirection?: ScrollDirection;
  scroll_amount?: number;
  scrollAmount?: number;
  duration?: Duration;
  key?: string;
  [key: string]: Action | string | Coordinate | ScrollDirection | number | Duration | undefined;
}

/** Outcome of running a tool; every field is optional. */
export interface ToolResult {
  output?: string;
  error?: string;
  base64Image?: string;
  system?: string;
}

export interface BaseAnthropicTool {
  name: string;
  apiType: string;
  toParams(): unknown;
}

/** Error type thrown by the tool layer (e.g. by `ActionValidator`). */
export class ToolError extends Error {
  constructor(message: string) {
    super(message);
    this.name = 'ToolError';
  }
}

// ---------------------------------------------------------------------------
// Keyboard helpers (tools/utils/keyboard.ts)
// ---------------------------------------------------------------------------

/**
 * Translates loosely written key names ("ctrl", "return", "page_down") into
 * the case-sensitive key names Playwright expects ("Control", "Enter",
 * "PageDown").
 */
export class KeyboardUtils {
  // Alternative and lower-cased spellings of the Playwright modifier keys.
  // The canonical names are listed too because parseKeyCombination()
  // lower-cases its input before looking keys up.
  private static readonly modifierKeyMap: Record<string, string> = {
    'ctrl': 'Control',
    'control': 'Control',
    'alt': 'Alt',
    'shift': 'Shift',
    'cmd': 'Meta',
    'command': 'Meta',
    'win': 'Meta',
    'meta': 'Meta',
  };

  // Essential key mappings for Playwright compatibility
  private static readonly keyMap: Record<string, string> = {
    'return': 'Enter',
    'enter': 'Enter',
    'space': ' ',
    'left': 'ArrowLeft',
    'right': 'ArrowRight',
    'up': 'ArrowUp',
    'down': 'ArrowDown',
    'arrowleft': 'ArrowLeft',
    'arrowright': 'ArrowRight',
    'arrowup': 'ArrowUp',
    'arrowdown': 'ArrowDown',
    'home': 'Home',
    'end': 'End',
    'pageup': 'PageUp',
    'page_up': 'PageUp',
    'pagedown': 'PageDown',
    'page_down': 'PageDown',
    'delete': 'Delete',
    'backspace': 'Backspace',
    'tab': 'Tab',
    'esc': 'Escape',
    'escape': 'Escape',
    'insert': 'Insert',
    'super_l': 'Meta',
    'f1': 'F1',
    'f2': 'F2',
    'f3': 'F3',
    'f4': 'F4',
    'f5': 'F5',
    'f6': 'F6',
    'f7': 'F7',
    'f8': 'F8',
    'f9': 'F9',
    'f10': 'F10',
    'f11': 'F11',
    'f12': 'F12',
    'minus': '-',
    'equal': '=',
    'plus': '+',
  };

  private static readonly canonicalModifiers: readonly string[] = ['Control', 'Alt', 'Shift', 'Meta'];

  /**
   * Own-property lookup. Using `in` or plain indexing would also match
   * inherited members such as `constructor`.
   */
  private static lookup(table: Record<string, string>, name: string): string | undefined {
    return Object.prototype.hasOwnProperty.call(table, name) ? table[name] : undefined;
  }

  /**
   * Reports whether `key` names a modifier (Control, Alt, Shift or Meta),
   * in any letter case or under one of the known aliases.
   *
   * @param key - key name to test; `undefined` or empty yields `false`.
   */
  static isModifierKey(key: string | undefined): boolean {
    if (!key) return false;
    const canonical = this.lookup(this.modifierKeyMap, key.toLowerCase()) ?? key;
    return this.canonicalModifiers.includes(canonical);
  }

  /**
   * Converts a single key name into its Playwright spelling.
   *
   * @param key - key name in any letter case.
   * @returns the mapped name, or `key` unchanged when no mapping exists.
   * @throws Error when `key` is undefined or empty.
   */
  static getPlaywrightKey(key: string | undefined): string {
    if (!key) {
      throw new Error('Key cannot be undefined');
    }

    const normalizedKey = key.toLowerCase();

    // Special keys take precedence over modifier aliases; anything unknown
    // is returned as given so standard Playwright names pass through.
    return (
      this.lookup(this.keyMap, normalizedKey) ??
      this.lookup(this.modifierKeyMap, normalizedKey) ??
      key
    );
  }

  /**
   * Splits a "+"-separated combination such as "ctrl+shift+a" into
   * Playwright key names. The combination is lower-cased first, so unmapped
   * keys come back in lower case.
   *
   * A trailing "++" denotes the literal plus key ("ctrl++" yields
   * ["Control", "+"]); a lone "+" is the plus key by itself.
   *
   * @throws Error when `combo` is empty or contains an empty key.
   */
  static parseKeyCombination(combo: string): string[] {
    if (!combo) {
      throw new Error('Key combination cannot be empty');
    }

    const parts = combo.toLowerCase().split('+').map(part => part.trim());

    // "ctrl++" splits into ['ctrl', '', '']: the final pair of empty
    // segments stands for the "+" key rather than two missing keys.
    const count = parts.length;
    if (count >= 2 && parts[count - 1] === '' && parts[count - 2] === '') {
      parts.splice(count - 2, 2, '+');
    }

    return parts.map(part => {
      if (!part) {
        throw new Error('Invalid key combination: empty key');
      }
      return this.getPlaywrightKey(part);
    });
  }
}

// ---------------------------------------------------------------------------
// Action parameter validation (tools/utils/validator.ts)
// ---------------------------------------------------------------------------

/** Validates the parameters of a tool action, throwing `ToolError` on failure. */
export class ActionValidator {
  // Upper bound accepted by validateDuration().
  private static readonly maxDuration = 100;

  /**
   * Checks the `text` parameter.
   *
   * @param text - value supplied by the caller.
   * @param required - whether `text` must be present for this action.
   * @param action - action name, used in the error message.
   * @throws ToolError when required text is missing or text is not a string.
   */
  static validateText(text: string | undefined, required: boolean, action: string): void {
    if (required && text === undefined) {
      throw new ToolError(`text is required for ${action}`);
    }
    if (text !== undefined && typeof text !== 'string') {
      throw new ToolError(`text must be a string, got ${typeof text}`);
    }
  }

  /**
   * Checks the `coordinate` parameter; when present it must pass
   * validateAndGetCoordinates().
   *
   * @throws ToolError when a required coordinate is missing or malformed.
   */
  static validateCoordinate(coordinate: Coordinate | undefined, required: boolean, action: string): void {
    if (required && !coordinate) {
      throw new ToolError(`coordinate is required for ${action}`);
    }
    if (coordinate) {
      this.validateAndGetCoordinates(coordinate);
    }
  }

  /**
   * Checks that `duration` is a finite number in the range [0, 100].
   *
   * @throws ToolError when the value is missing, not a finite number
   *   (NaN and Infinity included), negative, or above the maximum.
   */
  static validateDuration(duration: Duration | undefined): void {
    if (typeof duration !== 'number' || !Number.isFinite(duration)) {
      throw new ToolError(`duration must be a finite number, got ${String(duration)}`);
    }
    if (duration < 0) {
      throw new ToolError(`duration must be non-negative, got ${duration}`);
    }
    if (duration > this.maxDuration) {
      throw new ToolError(`duration is too long: ${duration} exceeds ${this.maxDuration}`);
    }
  }

  /**
   * Checks that `coordinate` is a pair of non-negative finite numbers.
   *
   * @returns the same coordinate, for convenient chaining.
   * @throws ToolError when the shape or the values are invalid.
   */
  static validateAndGetCoordinates(coordinate: Coordinate): Coordinate {
    if (!Array.isArray(coordinate) || coordinate.length !== 2) {
      throw new ToolError(`coordinate must be a tuple of length 2, got ${String(coordinate)}`);
    }
    const valid = coordinate.every(
      value => typeof value === 'number' && Number.isFinite(value) && value >= 0
    );
    if (!valid) {
      throw new ToolError(`coordinate must be a tuple of non-negative finite numbers, got ${String(coordinate)}`);
    }
    return coordinate;
  }

  /**
   * Validates a complete parameter set for one action.
   *
   * @param params - the action and its parameters.
   * @param mouseActions - actions for which `coordinate` is required.
   * @param keyboardActions - actions for which `text` is required.
   * @throws ToolError on the first parameter that fails validation.
   */
  static validateActionParams(params: ActionParams, mouseActions: Set<Action>, keyboardActions: Set<Action>): void {
    const { action, text, coordinate, duration } = params;

    // text is mandatory only for keyboard actions, but is type-checked whenever present
    this.validateText(text, keyboardActions.has(action), action);

    // coordinate is mandatory only for mouse actions, but is checked whenever present
    this.validateCoordinate(coordinate, mouseActions.has(action), action);

    // duration only matters for the two timed actions
    if (action === Action.HOLD_KEY || action === Action.WAIT) {
      this.validateDuration(duration);
    }
  }
}
a/pkg/templates/typescript/ehr-system/tsconfig.json b/pkg/templates/typescript/ehr-system/tsconfig.json new file mode 100644 index 0000000..fa10973 --- /dev/null +++ b/pkg/templates/typescript/ehr-system/tsconfig.json @@ -0,0 +1,9 @@ +{ + "extends": "../tsconfig.base.json", + "compilerOptions": { + "outDir": "dist", + "rootDir": ".", + "lib": ["ESNext", "DOM"] + }, + "include": ["."] +} diff --git a/pkg/templates/typescript/ehr-system/types/beta.ts b/pkg/templates/typescript/ehr-system/types/beta.ts new file mode 100644 index 0000000..35328d7 --- /dev/null +++ b/pkg/templates/typescript/ehr-system/types/beta.ts @@ -0,0 +1,58 @@ +import type { BetaMessageParam as AnthropicMessageParam, BetaMessage as AnthropicMessage, BetaContentBlock as AnthropicContentBlock } from '@anthropic-ai/sdk/resources/beta/messages/messages'; +import type { ActionParams } from '../tools/types/computer'; + +// Re-export the SDK types +export type BetaMessageParam = AnthropicMessageParam; +export type BetaMessage = AnthropicMessage; +export type BetaContentBlock = AnthropicContentBlock; + +// Keep our local types for internal use +export interface BetaTextBlock { + type: 'text'; + text: string; + id?: string; + cache_control?: { type: 'ephemeral' }; +} + +export interface BetaImageBlock { + type: 'image'; + source: { + type: 'base64'; + media_type: 'image/png'; + data: string; + }; + id?: string; + cache_control?: { type: 'ephemeral' }; +} + +export interface BetaToolUseBlock { + type: 'tool_use'; + name: string; + input: ActionParams; + id?: string; + cache_control?: { type: 'ephemeral' }; +} + +export interface BetaThinkingBlock { + type: 'thinking'; + thinking: { + type: 'enabled'; + budget_tokens: number; + } | { + type: 'disabled'; + }; + signature?: string; + id?: string; + cache_control?: { type: 'ephemeral' }; +} + +export interface BetaToolResultBlock { + type: 'tool_result'; + content: (BetaTextBlock | BetaImageBlock)[] | string; + tool_use_id: string; + is_error: boolean; + id?: 
/**
 * Converts an API response into content blocks suitable for sending back as
 * part of the conversation.
 *
 * - Text blocks with non-empty text are reduced to `{ type, text }`.
 * - Thinking blocks are copied, keeping `signature` only when it is truthy.
 * - Every other block is passed through unchanged.
 */
export function responseToParams(response: BetaMessage): BetaContentBlock[] {
  return response.content.map(block => {
    if (block.type === 'text' && block.text) {
      return { type: 'text', text: block.text } as BetaContentBlock;
    }
    if (block.type === 'thinking') {
      const { signature, ...rest } = block as { signature?: string } & typeof block;
      return (signature ? { ...rest, signature } : rest) as BetaContentBlock;
    }
    return block as BetaContentBlock;
  });
}

/**
 * Removes the oldest images from tool results, in place, so that roughly
 * `imagesToKeep` remain.
 *
 * Images are removed in multiples of `minRemovalThreshold`, so more than
 * `imagesToKeep` may survive until the excess reaches a full multiple.
 *
 * @param messages - conversation to prune; tool_result contents are mutated.
 * @param imagesToKeep - target number of images; a falsy value disables pruning.
 * @param minRemovalThreshold - chunk size for removals. Values that are not
 *   positive finite numbers are treated as 1 (remove exactly the excess)
 *   rather than producing NaN or an over-sized removal count.
 */
export function maybeFilterToNMostRecentImages(
  messages: BetaMessageParam[],
  imagesToKeep: number,
  minRemovalThreshold: number
): void {
  if (!imagesToKeep) return;

  const isImage = (part: unknown): boolean =>
    typeof part === 'object' && part !== null && (part as { type?: unknown }).type === 'image';

  const toolResultBlocks = messages
    .flatMap(message => (Array.isArray(message?.content) ? message.content : []))
    .filter((item): item is BetaToolResultBlock =>
      typeof item === 'object' && item !== null && item.type === 'tool_result'
    );

  let totalImages = 0;
  for (const toolResult of toolResultBlocks) {
    if (Array.isArray(toolResult.content)) {
      totalImages += toolResult.content.filter(isImage).length;
    }
  }

  const chunk =
    Number.isFinite(minRemovalThreshold) && minRemovalThreshold > 0 ? minRemovalThreshold : 1;
  let imagesToRemove = Math.floor((totalImages - imagesToKeep) / chunk) * chunk;

  // Blocks are visited in conversation order, so the oldest images go first.
  for (const toolResult of toolResultBlocks) {
    if (!Array.isArray(toolResult.content)) continue;
    toolResult.content = toolResult.content.filter(part => {
      if (isImage(part) && imagesToRemove > 0) {
        imagesToRemove--;
        return false;
      }
      return true;
    });
  }
}

const PROMPT_CACHING_BETA_FLAG = 'prompt-caching-2024-07-31';

/**
 * Marks the last content block of the three most recent array-content user
 * messages with an ephemeral `cache_control`, in place. The next older such
 * message has its marker removed, after which the scan stops; anything
 * older is left untouched.
 */
export function injectPromptCaching(messages: BetaMessageParam[]): void {
  let breakpointsRemaining = 3;

  for (let i = messages.length - 1; i >= 0; i--) {
    const message = messages[i];
    if (!message) continue;
    if (message.role !== 'user' || !Array.isArray(message.content)) continue;

    const lastContent = message.content[message.content.length - 1] as
      | BetaLocalContentBlock
      | undefined;

    if (breakpointsRemaining > 0) {
      breakpointsRemaining--;
      if (lastContent) {
        lastContent.cache_control = { type: 'ephemeral' };
      }
      continue;
    }

    // Out of breakpoints: clear the stale marker on this one message and stop.
    if (lastContent) {
      delete lastContent.cache_control;
    }
    break;
  }
}

export { PROMPT_CACHING_BETA_FLAG };
/**
 * Wraps a tool's result in a `tool_result` content block.
 *
 * When `result.error` is set the block is flagged as an error and carries
 * only the error text; `output` and `base64Image` are ignored in that case.
 * Otherwise the block carries the output text (if any) followed by the PNG
 * image (if any). Text is prefixed with `result.system` when present.
 *
 * @param result - outcome reported by the tool.
 * @param toolUseId - id of the tool_use request this result answers.
 */
export function makeApiToolResult(
  result: ToolResult,
  toolUseId: string
): BetaToolResultBlock {
  const content: (BetaTextBlock | BetaImageBlock)[] = [];
  const isError = Boolean(result.error);

  if (result.error) {
    content.push({
      type: 'text',
      text: maybePrependSystemToolResult(result, result.error),
    });
  } else {
    if (result.output) {
      content.push({
        type: 'text',
        text: maybePrependSystemToolResult(result, result.output),
      });
    }
    if (result.base64Image) {
      content.push({
        type: 'image',
        source: {
          type: 'base64',
          media_type: 'image/png',
          data: result.base64Image,
        },
      });
    }
  }

  return {
    type: 'tool_result',
    content,
    tool_use_id: toolUseId,
    is_error: isError,
  };
}

/**
 * Prefixes `resultText` with `result.system` and a newline when a system
 * note is present; otherwise returns `resultText` unchanged.
 */
export function maybePrependSystemToolResult(result: ToolResult, resultText: string): string {
  return result.system ? `${result.system}\n${resultText}` : resultText;
}