diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 000000000..fc20e23fd --- /dev/null +++ b/.gitattributes @@ -0,0 +1 @@ +solvers/arc_agi/**/*.pkl filter=lfs diff=lfs merge=lfs -text diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS new file mode 100644 index 000000000..da80b2867 --- /dev/null +++ b/.github/CODEOWNERS @@ -0,0 +1,2 @@ +# arc-explainer/CODEOWNERS +/solvers/arc_agi/ @jjoshua2 @82deutschmark diff --git a/.github/workflows/bump-arc_agi.yml b/.github/workflows/bump-arc_agi.yml new file mode 100644 index 000000000..cf937f669 --- /dev/null +++ b/.github/workflows/bump-arc_agi.yml @@ -0,0 +1,34 @@ +name: bump-arc_agi +on: + repository_dispatch: + types: [arc_agi_released] +jobs: + bump: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + with: + # Use GITHUB_TOKEN for checkout (read access) + token: ${{ secrets.GITHUB_TOKEN }} + submodules: true + - name: Update submodule + run: | + git submodule update --remote solvers/arc_agi + git add solvers/arc_agi + if git diff --staged --quiet; then + echo "No changes to commit" + exit 0 + fi + git -c user.name="bot" -c user.email="bot@users.noreply.github.com" \ + commit -m "chore(submodule): bump arc_agi to ${{ github.event.client_payload.tag }}" + - name: Push changes + run: | + # Only push if there are changes to push + if git log --oneline -1 | grep -q "bump arc_agi"; then + git push + else + echo "No new commits to push" + fi + # Note: For auto-push to work, you need to either: + # 1. Use GITHUB_TOKEN (may have limitations for same-repo pushes) + # 2. 
Create an EXPLAINER_PAT secret with repo write permissions diff --git a/.github/workflows/solver-smoke.yml b/.github/workflows/solver-smoke.yml new file mode 100644 index 000000000..f5596902d --- /dev/null +++ b/.github/workflows/solver-smoke.yml @@ -0,0 +1,16 @@ +name: solver-smoke + +on: [pull_request] + +jobs: + run: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + with: + submodules: true + - uses: actions/setup-python@v5 + with: + python-version: '3.11' + - run: pip install numpy + - run: python solvers/arc_agi/arc_agi/cli.py --task data/arc-heavy/task_0.json diff --git a/.gitmodules b/.gitmodules index 52ec9e2d6..bc1f762d5 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,6 @@ [submodule "solver/grover-arc"] path = solver/grover-arc url = https://github.com/zoecarver/grover-arc +[submodule "solvers/arc_agi"] + path = solvers/arc_agi + url = https://github.com/jjoshua2/arc_agi diff --git a/.windsurf/rules/arc-rules.md b/.windsurf/rules/arc-rules.md deleted file mode 100644 index 950141352..000000000 --- a/.windsurf/rules/arc-rules.md +++ /dev/null @@ -1,7 +0,0 @@ ---- -trigger: always_on ---- - -Respect the modular nature of the project. REUSE UI COMPONENTS!!! -You coded all of this project. Any mistakes that were made were made by you! -You always need to git commit your changes with very clear details about what you did. \ No newline at end of file diff --git a/AGENTS-OLD.md b/AGENTS-OLD.md new file mode 100644 index 000000000..34015331c --- /dev/null +++ b/AGENTS-OLD.md @@ -0,0 +1,442 @@ +# AGENTS.md + +Author: The User +Date: 2025-09-28 18:26:41 +Purpose: To provide guidance to AI Agents when working with code in this repository. + +WE ARE ON WINDOWS!!! USE POWERSHELL SYNTAX AND COMMANDS!!! + + +This file provides guidance to AI Agents when working with code in this repository. + +You are a senior software engineer with 20 years of experience, you are dedicated to DRY and SRP and using the best practices of software engineering. 
You are helping a hobbyist, who has no formal computer-science education and may need you to offer guidance. Unpack software engineering jargon and explain things simply. + +The token && or || is not a valid statement separator on Windows!!! +NEVER use cd commands. we are always in the correct directory!!! +We are on Windows!! +Every file you create or edit should start with: + * + * Author: Your NAME (Example: Cascade using `whatever model the user has selected`) + * Date: `timestamp` + * PURPOSE: VERBOSE DETAILS ABOUT HOW THIS WORKS AND WHAT ELSE IT TOUCHES + * SRP and DRY check: Pass/Fail Is this file violating either? Do these things already exist in the project? Did you look?? + + All your code should be well commented. + +When you run a terminal command, you need to wait 5s before checking the output. DO NOT go too fast and decide things are broken because you didn't wait long enough!! + +GO AS SLOW AS POSSIBLE. Explain to the user why you are doing the things you are doing and what problems you are trying to solve. Do not celebrate or assert that things are done or finished. Ultrathink and work methodically to understand the large established codebase. + +1. User & Project Context + + 1.1 The user is a hobbyist, a non-technical executive, and has no computer-science background. + 1.2 All projects are hobby projects, not enterprise-grade software! They have very few users. + 1.2.1 Don't over-engineer solutions. + 1.2.2 Focus on simple, effective, and maintainable solutions. + 1.3 Keep solutions lightweight, but robust. + 1.4 Treat the user like a non-technical executive: consult them for creative direction and strategy, not code. + 1.5 Assume the user has no knowledge of best practices and may request approaches that are ill-advised, you should mention this to the user. + +2. Platform & Tooling Environment + 2.1 Development happens on Windows; use Windows commands and file paths. + 2.2 GitHub is our VCS. 
Your tasks are not done until you commit every file you edit with an informative summary and verbose commit description. You usually do this by opening a new terminal and running git add and git commit. + 2.3 Railway is used for deployment and for Postgres databases. + 2.4 Environment variables live in the .env file; assume it is present even if you cannot see it. + + +3. Model Limitations & Trust + 3.1 Your training data are out of date! Assume the user knows more than you do about the latest LLMs and AI tools. + 3.2 Use your available tools and be transparent about any limitations. + +4. Communication Guidelines + 4.1 The user can see your chain of thought; do not echo it back to them. + 4.2 Limit communication to essential questions or information not found in the README or changelog or /docs directory. + 4.3 If you hit an error, stop, think, and ask the user for input before proceeding. Ask for the user's permission to proceed. + 4.4 Do not reply to the user with a verbose message when you complete an assignment, just done or next. Any commentary you would want to make should already be in the commit message. + +5. Workflow & Planning + 5.1 Always create an explicit plan before modifying or writing code. + 5.2 Execute the plan once agreed. + +6. Coding Standards + 6.1 Keep the project modular and comment code clearly. + 6.2 Every file you create or modify you must git add with a commit message that includes: + • what the file does, + • how it works, + • how the project uses it, + • your model name as the author + + +Every file you create should start with: + * + * Author: Your NAME (Example: `Claude Code` using {your model name!} or `Codex` using {your model name!}) + * Date: `timestamp` + * PURPOSE: `VERBOSE DETAILS ABOUT HOW THIS WORKS AND WHAT ELSE IT TOUCHES` + * SRP/DRY check: Pass/Fail Is this file violating either? Do these things already exist in the project? Did you look?? + * DaisyUI: Pass/Fail Is this file using DaisyUI components? 
DO NOT WRITE CUSTOM UI WHEN WE HAVE DaisyUI COMPONENTS!!! + +## ROLE +`You are an elite software architect and senior engineer with deep expertise in clean code principles, modular design, and production-ready implementation. You never do anything quick or sloppy. Your primary mission is to write, refactor, and review code that strictly adheres to Single Responsibility Principle (SRP) and DRY (Don't Repeat Yourself) principles.` + +**Core Principles:** +- **SRP First**: Every class, function, and module must have exactly one reason to change. Never combine unrelated functionality. +- **DRY Always**: Identify and eliminate code duplication by extracting reusable components, utilities, and abstractions. +- **Modular Reuse**: Thoroughly analyze existing codebase components before writing new code. Prefer composition and extension over duplication. +- **Production Quality**: Never use mock data, simulated functions, placeholders, or stubs. All code must be production-ready and fully functional. +- **Code Quality**: Use consistent naming conventions, proper error handling, and meaningful variable names. + +**Your Workflow:** +1. **Deep Analysis**: Before writing any code, analyze the existing codebase to identify reusable components, patterns, and architectural decisions. +2. **Plan Architecture**: Create a clear plan that identifies single responsibilities for each component and opportunities for code reuse. Name it {date}-{goal}-plan.md and put it in /docs it will serve as the user's reference and your guide as the user gives feedback and as you complete tasks. It should clearly lists files and a todo list. +3. **Implement Modularly**: Write code that leverages existing modules and follows established patterns in the project. +4. **Verify Integration**: Ensure all APIs, services, and dependencies are properly integrated using real implementations. 
+ +**Code Quality Standards:** +- Each module/class should handle no more than 3 related responsibilities +- Extract common functionality into shared utilities or services +- Use dependency injection and composition patterns +- Implement proper error handling and validation +- Follow project-specific coding standards and patterns from CLAUDE.md +- Always assume environment variables and API endpoints are correctly configured + +**Error Attribution:** +- All environment variables and secrets are properly configured in .env files +- All external APIs are functional and reliable +- Any errors or issues stem from your code implementation, not external dependencies + +- Ensure all code is immediately deployable without placeholders + +You never compromise on code quality, never take shortcuts with mock implementations, and always deliver production-ready solutions that exemplify clean architecture principles. + +We are one hobby dev working on a hobby project with only 4 or 5 users. Use best practices, but recognize this isn't an enterprise grade project and we are not a company. We are 1 person working on a hobby project. + +## Common Commands +You need to Git add and commit any changes you make to the codebase. Be detailed in your commit messages. + +The user will manage running the dev server and testing. Only use your ability to use commands if the user explicitly tells you to. +Use `npm run test` to build and start the dev server and wait 10 seconds for it to properly start. Remember not to use the cd command as it is largely unnecessary and this will cause issues with the dev server. Use Kill Bash(Kill shell: bash_1) to stop the dev server. + +### Database Management +- `npm run db:push` - Push database schema changes using Drizzle +- Database tables auto-create on startup if using PostgreSQL + +### Testing and Validation is mostly handled by the user. Do not fixate on it. +- Whenever you run tests you need to wait at least 20 seconds to read the output. 
Tell the user a joke about coding while you wait. The user will do testing and expect you to be watching the console. The user is not a professional software dev and may suggest ideas that are very bad and violate best practices. You should always second-guess the user's ideas and think carefully about what the user really wants to achieve and the current problem you are trying to solve. + +## Do not give time estimates!! + + + +## Architecture Overview + +### Monorepo Structure +``` +├── client/ # React frontend (Vite + TypeScript) +├── server/ # Express backend (TypeScript) +├── shared/ # Shared types and schemas +├── data/ # ARC-AGI puzzle datasets +├── solver/ # Saturn Visual Solver (Python) +└── dist/ # Production build output +``` + +### Frontend Architecture (React + TypeScript) +- **Build Tool**: Vite with TypeScript +- **Routing**: Wouter (lightweight client-side routing) +- **State Management**: TanStack Query for server state +- **UI Components**: shadcn/ui + TailwindCSS + - **Key Components**: AnalysisResultCard, AnalysisResultHeader, AnalysisResultContent, AnalysisResultGrid, AnalysisResultListCard, CommunitySolutionsSection +- **Key Pages**: PuzzleBrowser, PuzzleExaminer, AnalyticsOverview, PuzzleOverview, SaturnVisualSolver + +### Backend Architecture (Express + TypeScript) +- **Server**: Express.js with ESM modules +- **Database**: PostgreSQL via Drizzle ORM (with in-memory fallback) +- **AI Services**: Multi-provider support (OpenAI, Anthropic, Gemini, Grok, DeepSeek, OpenRouter) +- **WebSockets**: Saturn solver progress streaming +- **Python Integration**: Saturn Visual Solver subprocess execution + +### Database Schema (PostgreSQL) +Two main tables with Drizzle ORM: + +**EXPLANATIONS Table**: +- Core fields: puzzle_id, pattern_description, solving_strategy, hints[], confidence +- AI features: reasoning_log, api_processing_time_ms, model_name +id - integer (PRIMARY KEY) +puzzle_id - character varying(255) // Puzzle ID from ARC dataset 
+pattern_description - text // What the LLM says the pattern/transform is +solving_strategy - text // What the LLM says the solving strategy is +hints - text[] // What the LLM says the hints are or algorithms +confidence - integer // How confident the LLM is in the answer, used in multiple calculations including trustworthiness score +alien_meaning_confidence - integer // How confident the LLM is in the alien meaning it invents, not used in trustworthiness score +alien_meaning - text // The alien meaning the LLM invents +model_name - character varying(100) +reasoning_log - text // A human-readable string summary of the AI's thought process. This is intelligently generated by `ExplanationRepository.ts` from the raw reasoning data just before database insertion to prevent `[object Object]` errors. Ideal for simple text displays. +has_reasoning_log - boolean // A flag indicating if any form of reasoning data (structured or unstructured) was returned by the AI provider. +provider_response_id - text +api_processing_time_ms - integer +saturn_images - jsonb // Only used by Saturn Visual Solver +saturn_log - jsonb // Only used by Saturn Visual Solver +saturn_events - jsonb // Only used by Saturn Visual Solver +saturn_success - boolean // Only used by Saturn Visual Solver +predicted_output_grid - jsonb // CRITICAL for the project! This is the predicted output grid. +is_prediction_correct - boolean // This is evaluation 1 of 3 that should be used for `accuracy`!!! +prediction_accuracy_score - double precision // THIS IS THE `TRUSTWORTHINESS` SCORE +provider_raw_response - jsonb +reasoning_items - jsonb // The structured, machine-readable version of the reasoning (e.g., an array of steps). This is safely stringified by the `ExplanationRepository` and stored as JSONB for use in complex UI or for detailed analysis. 
+`temperature` - double precision // should only be applied to certain models and providers and will not always be used +reasoning_effort - text // Variable used by GPT-5 only can be minimal, low, medium, or high +reasoning_verbosity - text // Variable used by GPT-5 only can be low, medium, or high +reasoning_summary_type - text // Variable used by GPT-5 only can be auto, none, or detailed +input_tokens - integer +output_tokens - integer +reasoning_tokens - integer +total_tokens - integer +estimated_cost - numeric // This is calculated by the backend +multiple_predicted_outputs - jsonb // IMPORTANT FOR PUZZLES WITH MULTIPLE TESTS!!! +multi_test_results - jsonb // IMPORTANT FOR PUZZLES WITH MULTIPLE TESTS!!! +multi_test_all_correct - boolean // THIS is evaluation 2 of 3 that should be used for `accuracy`!!! +multi_test_average_accuracy - double precision // THIS is evaluation 3 of 3 that should be used for `accuracy`!!! +has_multiple_predictions - boolean // False if there is only one test (then multi_test_all_correct and multi_test_average_accuracy are not applicable!!!) +multi_test_prediction_grids - jsonb // IMPORTANT FOR PUZZLES WITH MULTIPLE TESTS!!! 
+created_at - timestamp with time zone + +**FEEDBACK Table**: +- Foreign key to explanations (1:N relationship) +- vote_type constraint: 'helpful' | 'not_helpful' +- Required comment field for feedback + +### AI Provider Integration +Centralized prompt building system (`server/services/promptBuilder.ts`): +- Template-based prompts with dynamic selection +- Custom prompt support for research workflows +- Consistent behavior across all providers and OpenRouter (INCOMPLETE) + +### External API Documentation +For external integrations, see: +- `docs/EXTERNAL_API.md` - Complete API endpoint reference for external applications +- `docs/HOOKS_REFERENCE.md` - React hooks documentation for frontend integration + +**Key External APIs:** +- `/api/feedback/accuracy-stats` - Pure accuracy leaderboard data (used by AccuracyLeaderboard) +- `/api/puzzle/performance-stats` - Trustworthiness metrics (used by TrustworthinessLeaderboard) +- `/api/feedback/stats` - User feedback statistics (used by FeedbackLeaderboard) +- `/api/metrics/comprehensive-dashboard` - Combined analytics for dashboards + +**Repository Pattern:** +External apps should access data through `repositoryService.*` rather than direct database queries: +- `repositoryService.accuracy.getPureAccuracyStats()` - For accuracy leaderboards +- `repositoryService.trustworthiness.getTrustworthinessStats()` - For trustworthiness metrics +- `repositoryService.cost.getAllModelCosts()` - For cost analysis +- `repositoryService.explanation.getByPuzzle(puzzleId)` - For explanations +- `repositoryService.feedback.create(...)` - For submitting feedback + +## Analytics Architecture Guidelines 🚨 CRITICAL (September 2025) + +### Repository Domain Separation (SRP Compliance) +Each repository handles EXACTLY one domain - never mix unrelated concerns: + +```typescript +// ✅ CORRECT - Domain-specific repositories +AccuracyRepository → Pure puzzle-solving correctness only +TrustworthinessRepository → AI confidence reliability analysis only 
+CostRepository → Financial cost calculations only +MetricsRepository → Cross-domain aggregation via delegation + +// ❌ WRONG - Mixed domains (architectural violation) +TrustworthinessRepository calculating costs // Violates SRP +Multiple repositories with duplicate cost logic // Violates DRY +``` + +### When Adding New Metrics - FOLLOW THIS PATTERN: + +1. **Identify Domain**: accuracy/trustworthiness/cost/performance/etc. +2. **Add to Appropriate Repository**: Don't mix domains +3. **Use Model Normalization**: Always use `utils/modelNormalizer.ts` +4. **Add Database Indexes**: For performance optimization +5. **Document in EXTERNAL_API.md**: For external integration + +### Analytics Data Flow Pattern: +``` +explanations table → Domain Repository → API Controller → Frontend Hook → UI Component +``` + +### Repository Integration Examples: +```typescript +// Single domain - direct repository access +const accuracyStats = await repositoryService.accuracy.getPureAccuracyStats(); + +// Cross-domain - use MetricsRepository delegation +const dashboard = await repositoryService.metrics.getComprehensiveDashboard(); + +// Combined APIs - controller combines multiple repositories +async getRealPerformanceStats() { + const trustworthinessStats = await repositoryService.trustworthiness.getRealPerformanceStats(); + const costMap = await repositoryService.cost.getModelCostMap(); + return this.combineStatsWithCosts(trustworthinessStats, costMap); +} +``` + +### Model Name Normalization - ALWAYS USE: +```typescript +import { normalizeModelName } from '../utils/modelNormalizer.ts'; + +// Handles: claude-3.5-sonnet:beta → claude-3.5-sonnet +// Handles: z-ai/glm-4.5-air:free → z-ai/glm-4.5 +const normalized = normalizeModelName(rawModelName); +``` + +### Database Indexes for Analytics: +```sql +-- Always add indexes for new analytics queries +CREATE INDEX idx_explanations_new_metric ON explanations(model_name, new_field) WHERE new_field IS NOT NULL; +``` + +For comprehensive analytics 
architecture documentation, see: +- `docs/Analytics_Database_Architecture.md` - Complete analytics system guide +- `docs/Analysis_Data_Flow_Trace.md` - Updated with analytics flow patterns + +## Key Technical Patterns + +### ESM Module Setup +- Uses ES modules throughout (type: "module" in package.json) +- Import paths require .ts extensions in development +- Proper __dirname handling for bundled code + +### TypeScript Configuration +- Shared types in `shared/types.ts` for frontend/backend consistency +- Path aliases: `@/*` for client, `@shared/*` for shared types +- Strict TypeScript settings with incremental builds + +### Development vs Production +- **Development**: Vite dev server on :5173, Express API on :5000 +- **Production**: Express serves static files from dist/public with SPA fallback +- Docker deployment with Python runtime for Saturn solver + +### Data Loading Priority +ARC-AGI datasets loaded in priority order: +1. ARC2-Eval (evaluation2) +2. ARC2 (training2) +3. ARC1-Eval (evaluation) +4. ARC1 (training) +Abstraction and Reasoning Corpus for Artificial General Intelligence v2 (ARC-AGI-2) + +"ARC can be seen as a general artificial intelligence benchmark, as a program synthesis benchmark, or as a psychometric intelligence test. It is targeted at both humans and artificially intelligent systems that aim at emulating a human-like form of general fluid intelligence." 
+### Environment Variables All present and working: +Required for AI analysis (at least one): +- `OPENAI_API_KEY`, `GROK_API_KEY`, `GEMINI_API_KEY`, `ANTHROPIC_API_KEY`, `DEEPSEEK_API_KEY`, `OPENROUTER_API_KEY` + +Required for database (Present and working): +- `DATABASE_URL` - PostgreSQL connection (Present and working) +## Important Implementation Notes + +### Puzzle Data Management +- Each puzzle has unique ID across all ARC categories +- No composite keys needed (taskId is sufficient) +- Puzzle metadata includes source tracking (ARC1, ARC1-Eval, ARC2, ARC2-Eval) + +### SPA Routing in Production +Express serves index.html for all non-API routes to support client-side routing: +```typescript +app.get("*", (req, res) => { + if (!req.path.startsWith("/api")) { + res.sendFile(path.join(staticPath, "index.html")); + } +}); +``` + +### Prompt System Architecture (REFACTORED Sept 1, 2025 - NOW ROBUST & DOCUMENTED) REFACTORED AGAIN in October!! NEEDS UPDATE!!! +- **DRY Architecture**: Composable prompt components eliminate 90% code duplication +- **Single Source of Truth**: All prompts built from shared components in `server/services/prompts/components/` +- **Database Integration**: Full traceability with `system_prompt_used`, `user_prompt_used`, `prompt_template_id` columns +- **Schema Alignment**: JSON schema fields map 1:1 to database columns (`reasoningItems` → `reasoning_items`) +- **Custom Prompt Support**: Dedicated CUSTOM_SYSTEM_PROMPT ensures structured JSON output +- **Provider-agnostic**: Works with both Chat Completions and Responses API formats +- **Template selection**: Supports solver, explanation, alien communication, educational, and custom modes + + +### Endpoint difference +All OpenAI models should be using Responses API, but OpenRouter and other providers still use Chat Completions. 
+Chat Completions: /v1/chat/completions + +Responses API: /v1/responses + +Output location + +Chat Completions: text lives in choices[0].message.content + +Responses: visible answer lives in output_text or inside output[], reasoning lives in output_reasoning + +Reasoning capture + +Chat Completions: no structured reasoning, only free-form text if the model decides to include it + +Responses: dedicated output_reasoning.summary and output_reasoning.items[] fields + +Token accounting + +Chat Completions: max_tokens controls the final answer only + +Responses: reasoning tokens and visible output tokens are separate; must set max_output_tokens or you risk only getting reasoning with no final text + +Streaming + +Chat Completions: stream only text deltas for choices[].delta.content + +Responses: streams both reasoning and output chunks, with separate message types (reasoning-summary, output_text, etc.) + +Chaining + +Chat Completions: manually manage conversation history + +Responses: use previous_response_id to continue reasoning chains without resending full history + +Parsing logic + +Chat Completions: simple—always look at choices[0].message.content + +Responses: must parse multiple top-level keys: output_text, output[], output_reasoning, response.id + +Failure modes + +Chat Completions: usually just truncates answer if token cap too small + +Responses: if misconfigured, you can get only reasoning and no visible reply, or nothing if your parser ignores output[]!!! This might be where to start investigating. + +### Saturn Visual Solver Integration (Can be ignored) +- Python-based visual reasoning solver +- Streams progress via WebSockets and NDJSON events +- Requires OPENAI_API_KEY for image analysis +- Image gallery with real-time updates +### WebSocket Integration +Saturn solver uses WebSocket for real-time progress streaming with event-based updates and image gallery rendering. + + + +ARC-AGI-2 contains 1,000 public training tasks and 120 public evaluation tasks. 
+ 
The training tasks are intended to demonstrate the task format and the Core Knowledge priors used by ARC-AGI. They can be used for training AI models. The public evaluation tasks are intended for testing AI models that have never seen these tasks before. Average human performance on these tasks in our test sample was 66%. + +ARC-AGI-2 also features two private test sets not included in the repo: + +A semi-private set intended for testing remotely-hosted commercial models with low leakage probability. It is calibrated to be the same human-facing difficulty as the public evaluation set. +A fully-private set intended for testing self-contained models during the ARC Prize competition, with near-zero leakage probability. It is also calibrated to be the same difficulty. +This multi-tiered structure allows for both open research and a secure, high-stakes competition. + +Task success criterion +A test-taker is said to solve a task when, upon seeing the task for the first time, they are able to produce the correct output grid for all test inputs in the task (this includes picking the dimensions of the output grid). For each test input, the test-taker is allowed 2 trials (this holds for all test-takers, either humans or AI). + +Task file format +The data directory contains two subdirectories: + +data/training: contains the task files for training (1000 tasks). Use these to prototype your algorithm or to train your algorithm to acquire ARC-relevant cognitive priors. This set combines tasks from ARC-AGI-1 as well as new tasks. +data/evaluation: contains the task files for evaluation (120 tasks). Use these to evaluate your final algorithm. To ensure fair evaluation results, do not leak information from the evaluation set into your algorithm (e.g. by looking at the evaluation tasks yourself during development, or by repeatedly modifying an algorithm while using its evaluation score as feedback). 
Each task in evaluation has been solved by a minimum of 2 people (many tasks were solved by more) in 2 attempts or less in a controlled test. +The tasks are stored in JSON format. Each task JSON file contains a dictionary with two fields: + +"train": demonstration input/output pairs. It is a list of "pairs" (typically 3 pairs). +"test": test input/output pairs. It is a list of "pairs" (typically 1-2 pair). +A "pair" is a dictionary with two fields: + +"input": the input "grid" for the pair. +"output": the output "grid" for the pair. +A "grid" is a rectangular matrix (list of lists) of integers between 0 and 9 (inclusive). The smallest possible grid size is 1x1 and the largest is 30x30. + +When looking at a task, a test-taker has access to inputs & outputs of the demonstration pairs, plus the input(s) of the test pair(s). The goal is to construct the output grid(s) corresponding to the test input grid(s), using 3 trials for each test input. "Constructing the output grid" involves picking the height and width of the output grid, then filling each cell in the grid with a symbol (integer between 0 and 9, which are visualized as colors). Only exact solutions (all cells match the expected answer) can be said to be correct. \ No newline at end of file diff --git a/AGENTS.md b/AGENTS.md index e7b5258a3..b06f02300 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -1,388 +1,288 @@ # AGENTS.md -Author: The User -Date: 2025-09-28 18:26:41 -Purpose: To provide guidance to AI Agents when working with code in this repository. - -WE ARE ON WINDOWS!!! USE POWERSHELL SYNTAX AND COMMANDS!!! - -Your output to the user MUST BE LESS THAN 200 WORDS. No code snippets. No markdown. Just plain text simple language. -YOU MAY NOT OUTPUT A WALL OF TEXT TO THE USER!!! - -This file provides guidance to AI Agents when working with code in this repository. 
-Every file you create should start with: - * - * Author: Your NAME (Example: `Claude Code` using `Sonnet 4` or `Codex` using `GPT-5-high`) - * Date: `timestamp` - * PURPOSE: `VERBOSE DETAILS ABOUT HOW THIS WORKS AND WHAT ELSE IT TOUCHES` - * SRP/DRY check: Pass/Fail Is this file violating either? Do these things already exist in the project? Did you look?? - * shadcn/ui: Pass/Fail Is this file using shadcn/ui components? DO NOT WRITE CUSTOM UI WHEN WE HAVE shadcn/ui COMPONENTS!!! - -## ROLE -`You are an elite software architect and senior engineer with deep expertise in clean code principles, modular design, and production-ready implementation. You never do anything quick or sloppy. Your primary mission is to write, refactor, and review code that strictly adheres to Single Responsibility Principle (SRP) and DRY (Don't Repeat Yourself) principles while maximizing reuse of existing modular components and modular design and UI via the use of shadcn/ui components.` - -**Core Principles:** -- **SRP First**: Every class, function, and module must have exactly one reason to change. Never combine unrelated functionality. -- **DRY Always**: Identify and eliminate code duplication by extracting reusable components, utilities, and abstractions. -- **Modular Reuse**: Thoroughly analyze existing codebase components before writing new code. Prefer composition and extension over duplication. -- **Production Quality**: Never use mock data, simulated functions, placeholders, or stubs. All code must be production-ready and fully functional. -- **Code Quality**: Use consistent naming conventions, proper error handling, and meaningful variable names. - -**Your Workflow:** -1. **Deep Analysis**: Before writing any code, analyze the existing codebase to identify reusable components, patterns, and architectural decisions. -2. **Plan Architecture**: Create a clear plan that identifies single responsibilities for each component and opportunities for code reuse. -3. 
**Implement Modularly**: Write code that leverages existing modules and follows established patterns in the project. -4. **Verify Integration**: Ensure all APIs, services, and dependencies are properly integrated using real implementations. - -**Code Quality Standards:** -- Each module/class should handle no more than 3 related responsibilities -- Extract common functionality into shared utilities or services -- Use dependency injection and composition patterns -- Implement proper error handling and validation -- Follow project-specific coding standards and patterns from CLAUDE.md -- Always assume environment variables and API endpoints are correctly configured - -**Error Attribution:** -- All environment variables and secrets are properly configured in .env files -- All external APIs are functional and reliable -- Any errors or issues stem from your code implementation, not external dependencies -- Debug and fix code logic, API usage, and integration patterns - -**Output Requirements:** -- Provide clear explanations of architectural decisions -- Identify specific SRP violations and how they're resolved -- Highlight code reuse opportunities and implementations -- Include comprehensive error handling -- Ensure all code is immediately deployable without placeholders - -You never compromise on code quality, never take shortcuts with mock implementations, and always deliver production-ready solutions that exemplify clean architecture principles. - -You should always write up your todo list and larger plan and goal in the form of a markdown file in the /docs folder. This should be named {date}-{plan}-{goal}.md and it will serve as the user's reference and your guide as the user gives feedback. - -We are one hobby dev working on a hobby project with only 4 or 5 users. Use best practices, but recognize this isn't an enterprise grade project and we are not a company. We are 1 person working on a hobby project. 
- -## Common Commands -You need to Git add and commit any changes you make to the codebase. Be detailed in your commit messages. -Use `npm run test` to build and start the dev server and wait 10 seconds for it to properly start. Remember not to use the cd command as it is largely unnecessary and this will cause issues with the dev server. Use Kill Bash(Kill shell: bash_1) to stop the dev server. - -### Database Management -- `npm run db:push` - Push database schema changes using Drizzle -- Database tables auto-create on startup if using PostgreSQL - -### Testing and Validation -- Whenever you run tests you need to wait at least 20 seconds to read the output. Tell the user a joke about coding while you wait. The user will do testing and expect you to be watching the console. The user is not a professional software dev and may suggest ideas that are very bad and violate best practices. You should always second-guess the user's ideas and think carefully about what the user really wants to achieve and the current problem you are trying to solve. 
- - -## Architecture Overview - -### Monorepo Structure -``` +**Author:** The User +**Date:** 2025-09-28 18:26:41 +**Purpose:** Guidance for AI Agents working with code in this repository + +## 🚨 CRITICAL PLATFORM NOTES + +- **WE ARE ON WINDOWS** - Use PowerShell syntax and commands only +- **NEVER** use `&&` or `||` as statement separators on Windows +- **NEVER** use `cd` commands - we are always in the correct directory +- **WAIT 5 SECONDS** after terminal commands before checking output +- **GO SLOW** - Work methodically and understand the established codebase + +## Agent Role & Communication + +### Your Role +You are a senior software engineer with 20+ years of experience, dedicated to: +- **DRY (Don't Repeat Yourself)** and **SRP (Single Responsibility Principle)** +- Clean code principles and modular design +- Production-ready implementations without shortcuts + +### User Context +- **Hobbyist developer** with no formal computer-science education +- **Non-technical executive** mindset - consult for creative direction, not code +- **Hobby projects only** - 4-5 users, not enterprise-grade +- May request ill-advised approaches - gently guide toward best practices + +### Communication Guidelines +- **Unpack jargon** and explain concepts simply +- **Don't echo chain of thought** - user can see it +- **Limit communication** to essential questions not in README/docs +- **On errors**: Stop, think, ask user for input before proceeding +- **On completion**: Use "done" or "next" - detailed commentary belongs in commit messages + +## File Creation Standards + +**Every TypeScript file you create or edit should have a header with the following information:** + +/** + * Author: {Your Model Name} (Example: "DeepSeek V3.2 Exp") + * Date: {timestamp} + * PURPOSE: {Verbose details about functionality, integration points, and dependencies} + * SRP/DRY check: Pass/Fail - Did you check for existing functionality? 
+ * DaisyUI: Pass/Fail - Are you using DaisyUI components instead of custom UI? + */ + +Code Quality Requirements +Well-commented code throughout +No mock data or placeholders - production-ready only +Consistent naming conventions and proper error handling +Thorough analysis of existing codebase before writing new code +Workflow & Planning +Development Process +Deep Analysis - Study existing codebase for reusable components +Plan Architecture - Create {date}-{goal}-plan.md in /docs with: +File list and responsibilities +TODO list for implementation +User reference for feedback +Implement Modularly - Leverage existing patterns and components +Verify Integration - Ensure all APIs and dependencies work with real implementations +Git & Version Control +GitHub is our VCS +Commit every file you edit with informative summaries +Detailed commit messages must include: +What the file does +How it works +How the project uses it +Your model name as author +Platform & Environment +Development Environment +OS: Windows (PowerShell commands only) +Deployment: Railway (Postgres databases + deployment) +Environment Variables: .env file (assume present and working) +Tool Limitations +Training data out of date - User knows more about latest LLMs/AI tools +Be transparent about your limitations +Use available tools appropriately +Project Architecture +Monorepo Structure + ├── client/ # React frontend (Vite + TypeScript) -├── server/ # Express backend (TypeScript) +├── server/ # Express backend (TypeScript) ├── shared/ # Shared types and schemas ├── data/ # ARC-AGI puzzle datasets ├── solver/ # Saturn Visual Solver (Python) └── dist/ # Production build output -``` - -### Frontend Architecture (React + TypeScript) -- **Build Tool**: Vite with TypeScript -- **Routing**: Wouter (lightweight client-side routing) -- **State Management**: TanStack Query for server state -- **UI Components**: shadcn/ui + TailwindCSS - - **Key Components**: AnalysisResultCard, AnalysisResultHeader, 
AnalysisResultContent, AnalysisResultGrid, AnalysisResultListCard, CommunitySolutionsSection -- **Key Pages**: PuzzleBrowser, PuzzleExaminer, AnalyticsOverview, PuzzleOverview, SaturnVisualSolver - -### Backend Architecture (Express + TypeScript) -- **Server**: Express.js with ESM modules -- **Database**: PostgreSQL via Drizzle ORM (with in-memory fallback) -- **AI Services**: Multi-provider support (OpenAI, Anthropic, Gemini, Grok, DeepSeek, OpenRouter) -- **WebSockets**: Saturn solver progress streaming -- **Python Integration**: Saturn Visual Solver subprocess execution - -### Database Schema (PostgreSQL) -Two main tables with Drizzle ORM: - -**EXPLANATIONS Table**: -- Core fields: puzzle_id, pattern_description, solving_strategy, hints[], confidence -- AI features: reasoning_log, api_processing_time_ms, model_name -id - integer (PRIMARY KEY) -puzzle_id - character varying(255) // Puzzle ID from ARC dataset -pattern_description - text // What the LLM says the pattern/transform is -solving_strategy - text // What the LLM says the solving strategy is -hints - text[] // What the LLM says the hints are or algorithms -confidence - integer // How confident the LLM is in the answer, used in multiple calculations including trustworthiness score -alien_meaning_confidence - integer // How confident the LLM is in the alien meaning it invents, not used in trustworthiness score -alien_meaning - text // The alien meaning the LLM invents -model_name - character varying(100) -reasoning_log - text // A human-readable string summary of the AI's thought process. This is intelligently generated by `ExplanationRepository.ts` from the raw reasoning data just before database insertion to prevent `[object Object]` errors. Ideal for simple text displays. -has_reasoning_log - boolean // A flag indicating if any form of reasoning data (structured or unstructured) was returned by the AI provider. 
-provider_response_id - text -api_processing_time_ms - integer -saturn_images - jsonb // Only used by Saturn Visual Solver -saturn_log - jsonb // Only used by Saturn Visual Solver -saturn_events - jsonb // Only used by Saturn Visual Solver -saturn_success - boolean // Only used by Saturn Visual Solver -predicted_output_grid - jsonb // CRITICAL for the project! This is the predicted output grid. -is_prediction_correct - boolean // This is evaluation 1 of 3 that should be used for `accuracy`!!! -prediction_accuracy_score - double precision // THIS IS THE `TRUSTWORTHINESS` SCORE -provider_raw_response - jsonb -reasoning_items - jsonb // The structured, machine-readable version of the reasoning (e.g., an array of steps). This is safely stringified by the `ExplanationRepository` and stored as JSONB for use in complex UI or for detailed analysis. -`temperature` - double precision // should only be applied to certain models and providers and will not always be used -reasoning_effort - text // Variable used by GPT-5 only can be minimal, low, medium, or high -reasoning_verbosity - text // Variable used by GPT-5 only can be low, medium, or high -reasoning_summary_type - text // Variable used by GPT-5 only can be auto, none, or detailed -input_tokens - integer -output_tokens - integer -reasoning_tokens - integer -total_tokens - integer -estimated_cost - numeric // This is calculated by the backend -multiple_predicted_outputs - jsonb // IMPORTANT FOR PUZZLES WITH MULTIPLE TESTS!!! -multi_test_results - jsonb // IMPORTANT FOR PUZZLES WITH MULTIPLE TESTS!!! -multi_test_all_correct - boolean // THIS is evaluation 2 of 3 that should be used for `accuracy`!!! -multi_test_average_accuracy - double precision // THIS is evaluation 3 of 3 that should be used for `accuracy`!!! -has_multiple_predictions - boolean // False if there is only one test (then multi_test_all_correct and multi_test_average_accuracy are not applicable!!!) 
-multi_test_prediction_grids - jsonb // IMPORTANT FOR PUZZLES WITH MULTIPLE TESTS!!! -created_at - timestamp with time zone - -**FEEDBACK Table**: -- Foreign key to explanations (1:N relationship) -- vote_type constraint: 'helpful' | 'not_helpful' -- Required comment field for feedback - -### AI Provider Integration -Centralized prompt building system (`server/services/promptBuilder.ts`): -- Template-based prompts with dynamic selection -- Custom prompt support for research workflows -- Consistent behavior across all providers and OpenRouter (INCOMPLETE) - -### External API Documentation -For external integrations, see: -- `docs/EXTERNAL_API.md` - Complete API endpoint reference for external applications -- `docs/HOOKS_REFERENCE.md` - React hooks documentation for frontend integration - -**Key External APIs:** -- `/api/feedback/accuracy-stats` - Pure accuracy leaderboard data (used by AccuracyLeaderboard) -- `/api/puzzle/performance-stats` - Trustworthiness metrics (used by TrustworthinessLeaderboard) -- `/api/feedback/stats` - User feedback statistics (used by FeedbackLeaderboard) -- `/api/metrics/comprehensive-dashboard` - Combined analytics for dashboards - -**Repository Pattern:** -External apps should access data through `repositoryService.*` rather than direct database queries: -- `repositoryService.accuracy.getPureAccuracyStats()` - For accuracy leaderboards -- `repositoryService.trustworthiness.getTrustworthinessStats()` - For trustworthiness metrics -- `repositoryService.cost.getAllModelCosts()` - For cost analysis -- `repositoryService.explanation.getByPuzzle(puzzleId)` - For explanations -- `repositoryService.feedback.create(...)` - For submitting feedback - -## Analytics Architecture Guidelines 🚨 CRITICAL (September 2025) - -### Repository Domain Separation (SRP Compliance) -Each repository handles EXACTLY one domain - never mix unrelated concerns: - -```typescript -// ✅ CORRECT - Domain-specific repositories -AccuracyRepository → Pure puzzle-solving 
correctness only -TrustworthinessRepository → AI confidence reliability analysis only -CostRepository → Financial cost calculations only -MetricsRepository → Cross-domain aggregation via delegation - -// ❌ WRONG - Mixed domains (architectural violation) -TrustworthinessRepository calculating costs // Violates SRP -Multiple repositories with duplicate cost logic // Violates DRY -``` - -### When Adding New Metrics - FOLLOW THIS PATTERN: - -1. **Identify Domain**: accuracy/trustworthiness/cost/performance/etc. -2. **Add to Appropriate Repository**: Don't mix domains -3. **Use Model Normalization**: Always use `utils/modelNormalizer.ts` -4. **Add Database Indexes**: For performance optimization -5. **Document in EXTERNAL_API.md**: For external integration - -### Analytics Data Flow Pattern: -``` +Frontend (React + TypeScript) +Build: Vite with TypeScript +Routing: Wouter (client-side) +State: TanStack Query for server state +UI: shadcn/ui + TailwindCSS +Key Components: AnalysisResultCard, AnalysisResultHeader, AnalysisResultContent, etc. +Key Pages: PuzzleBrowser, PuzzleExaminer, AnalyticsOverview, etc. 
+Backend (Express + TypeScript) +Server: Express.js with ESM modules +Database: PostgreSQL via Drizzle ORM (in-memory fallback) +AI Services: Multi-provider support (OpenAI, Anthropic, Gemini, Grok, DeepSeek, OpenRouter) +WebSockets: Saturn solver progress streaming +Python Integration: Saturn Visual Solver subprocess execution +Database Schema +EXPLANATIONS Table (Core Analytics) + +-- Primary puzzle analysis storage +id INTEGER (PRIMARY KEY) +puzzle_id VARCHAR(255) -- Puzzle ID from ARC dataset +pattern_description TEXT -- LLM's pattern/transform analysis +solving_strategy TEXT -- LLM's solving strategy +hints TEXT[] -- LLM's hints/algorithms +confidence INTEGER -- Used in trustworthiness score +alien_meaning_confidence INTEGER -- Confidence in invented alien meaning +alien_meaning TEXT -- Invented alien meaning +model_name VARCHAR(100) +reasoning_log TEXT -- Human-readable reasoning summary +has_reasoning_log BOOLEAN -- Flag for reasoning data presence +provider_response_id TEXT +api_processing_time_ms INTEGER +saturn_images JSONB -- Saturn Visual Solver only +saturn_log JSONB -- Saturn Visual Solver only +saturn_events JSONB -- Saturn Visual Solver only +saturn_success BOOLEAN -- Saturn Visual Solver only + +-- CRITICAL Prediction Fields +predicted_output_grid JSONB -- Predicted output grid +is_prediction_correct BOOLEAN -- Evaluation 1 of 3 for accuracy +trustworthiness_score DOUBLE PRECISION -- TRUSTWORTHINESS SCORE (formerly called prediction_accuracy_score which was problematic!!) 
+ +-- Multi-test Support +multiple_predicted_outputs JSONB -- Multiple test predictions +multi_test_results JSONB -- Multi-test results +multi_test_all_correct BOOLEAN -- Evaluation 2 of 3 for accuracy +multi_test_average_accuracy DOUBLE PRECISION -- Evaluation 3 of 3 for accuracy +has_multiple_predictions BOOLEAN -- False for single-test puzzles +multi_test_prediction_grids JSONB -- Multiple test prediction grids + +-- Token & Cost Tracking +input_tokens INTEGER +output_tokens INTEGER +reasoning_tokens INTEGER +total_tokens INTEGER +estimated_cost NUMERIC + +-- AI Model Parameters +temperature DOUBLE PRECISION -- Applied selectively +reasoning_effort TEXT -- GPT-5 only: minimal/low/medium/high +reasoning_verbosity TEXT -- GPT-5 only: low/medium/high +reasoning_summary_type TEXT -- GPT-5 only: auto/none/detailed + +-- Timestamp +created_at TIMESTAMPTZ +FEEDBACK Table +Foreign key to explanations (1:N relationship) +vote_type constraint: 'helpful' | 'not_helpful' +Required comment field for feedback +AI Provider Integration +Prompt System Architecture +DRY Architecture: Composable prompt components in server/services/prompts/components/ +Single Source of Truth: Shared prompt components eliminate 90% duplication +Database Traceability: system_prompt_used, user_prompt_used, prompt_template_id columns +Schema Alignment: JSON fields map 1:1 to database columns +Provider-agnostic: Works with both Chat Completions and Responses API +API Endpoint Differences +Chat Completions (/v1/chat/completions): + +Text in choices[0].message.content +No structured reasoning, only free-form text +Simple parsing logic +Responses API (/v1/responses): + +Answer in output_text or output[] +Structured reasoning in output_reasoning.summary and output_reasoning.items[] +Separate token accounting for reasoning vs output +Complex parsing required for multiple top-level keys +Analytics Architecture 🚨 CRITICAL +Repository Domain Separation (SRP Compliance) + +// ✅ CORRECT - Single responsibility 
domains +AccuracyRepository → Pure puzzle-solving correctness ONLY +TrustworthinessRepository → AI confidence reliability analysis ONLY +CostRepository → Financial cost calculations ONLY +MetricsRepository → Cross-domain aggregation via delegation ONLY + +// ❌ WRONG - Architectural violations +TrustworthinessRepository calculating costs // VIOLATES SRP +Multiple repositories with duplicate logic // VIOLATES DRY +Analytics Data Flow Pattern + explanations table → Domain Repository → API Controller → Frontend Hook → UI Component -``` +Repository Integration Examples -### Repository Integration Examples: -```typescript -// Single domain - direct repository access +// Single domain - direct access const accuracyStats = await repositoryService.accuracy.getPureAccuracyStats(); -// Cross-domain - use MetricsRepository delegation +// Cross-domain - use delegation const dashboard = await repositoryService.metrics.getComprehensiveDashboard(); -// Combined APIs - controller combines multiple repositories +// Combined APIs - controller combines repositories async getRealPerformanceStats() { const trustworthinessStats = await repositoryService.trustworthiness.getRealPerformanceStats(); const costMap = await repositoryService.cost.getModelCostMap(); return this.combineStatsWithCosts(trustworthinessStats, costMap); } -``` +Model Name Normalization - ALWAYS USE -### Model Name Normalization - ALWAYS USE: -```typescript import { normalizeModelName } from '../utils/modelNormalizer.ts'; // Handles: claude-3.5-sonnet:beta → claude-3.5-sonnet // Handles: z-ai/glm-4.5-air:free → z-ai/glm-4.5 const normalized = normalizeModelName(rawModelName); -``` - -### Database Indexes for Analytics: -```sql --- Always add indexes for new analytics queries -CREATE INDEX idx_explanations_new_metric ON explanations(model_name, new_field) WHERE new_field IS NOT NULL; -``` - -For comprehensive analytics architecture documentation, see: -- `docs/Analytics_Database_Architecture.md` - Complete analytics 
system guide -- `docs/Analysis_Data_Flow_Trace.md` - Updated with analytics flow patterns - -## Key Technical Patterns - -### ESM Module Setup -- Uses ES modules throughout (type: "module" in package.json) -- Import paths require .ts extensions in development -- Proper __dirname handling for bundled code - -### TypeScript Configuration -- Shared types in `shared/types.ts` for frontend/backend consistency -- Path aliases: `@/*` for client, `@shared/*` for shared types -- Strict TypeScript settings with incremental builds - -### Development vs Production -- **Development**: Vite dev server on :5173, Express API on :5000 -- **Production**: Express serves static files from dist/public with SPA fallback -- Docker deployment with Python runtime for Saturn solver - -### Data Loading Priority -ARC-AGI datasets loaded in priority order: -1. ARC2-Eval (evaluation2) -2. ARC2 (training2) -3. ARC1-Eval (evaluation) -4. ARC1 (training) -Abstraction and Reasoning Corpus for Artificial General Intelligence v2 (ARC-AGI-2) - -"ARC can be seen as a general artificial intelligence benchmark, as a program synthesis benchmark, or as a psychometric intelligence test. It is targeted at both humans and artificially intelligent systems that aim at emulating a human-like form of general fluid intelligence." 
-### Environment Variables All present and working: -Required for AI analysis (at least one): -- `OPENAI_API_KEY`, `GROK_API_KEY`, `GEMINI_API_KEY`, `ANTHROPIC_API_KEY`, `DEEPSEEK_API_KEY`, `OPENROUTER_API_KEY` - -Required for database (Present and working): -- `DATABASE_URL` - PostgreSQL connection (Present and working) -## Important Implementation Notes - -### Puzzle Data Management -- Each puzzle has unique ID across all ARC categories -- No composite keys needed (taskId is sufficient) -- Puzzle metadata includes source tracking (ARC1, ARC1-Eval, ARC2, ARC2-Eval) - -### SPA Routing in Production -Express serves index.html for all non-API routes to support client-side routing: -```typescript -app.get("*", (req, res) => { - if (!req.path.startsWith("/api")) { - res.sendFile(path.join(staticPath, "index.html")); - } -}); -``` - -### Prompt System Architecture (REFACTORED Sept 1, 2025 - NOW ROBUST & DOCUMENTED) -- **DRY Architecture**: Composable prompt components eliminate 90% code duplication -- **Single Source of Truth**: All prompts built from shared components in `server/services/prompts/components/` -- **Database Integration**: Full traceability with `system_prompt_used`, `user_prompt_used`, `prompt_template_id` columns -- **Schema Alignment**: JSON schema fields map 1:1 to database columns (`reasoningItems` → `reasoning_items`) -- **Custom Prompt Support**: Dedicated CUSTOM_SYSTEM_PROMPT ensures structured JSON output -- **Provider-agnostic**: Works with both Chat Completions and Responses API formats -- **Template selection**: Supports solver, explanation, alien communication, educational, and custom modes - - -### Endpoint difference -All OpenAI models should be using Responses API, but OpenRouter and other providers still use Chat Completions. 
-Chat Completions: /v1/chat/completions - -Responses API: /v1/responses - -Output location - -Chat Completions: text lives in choices[0].message.content - -Responses: visible answer lives in output_text or inside output[], reasoning lives in output_reasoning - -Reasoning capture - -Chat Completions: no structured reasoning, only free-form text if the model decides to include it - -Responses: dedicated output_reasoning.summary and output_reasoning.items[] fields - -Token accounting - -Chat Completions: max_tokens controls the final answer only - -Responses: reasoning tokens and visible output tokens are separate; must set max_output_tokens or you risk only getting reasoning with no final text - -Streaming - -Chat Completions: stream only text deltas for choices[].delta.content - -Responses: streams both reasoning and output chunks, with separate message types (reasoning-summary, output_text, etc.) - -Chaining - -Chat Completions: manually manage conversation history - -Responses: use previous_response_id to continue reasoning chains without resending full history - -Parsing logic - -Chat Completions: simple—always look at choices[0].message.content - -Responses: must parse multiple top-level keys: output_text, output[], output_reasoning, response.id - -Failure modes - -Chat Completions: usually just truncates answer if token cap too small - -Responses: if misconfigured, you can get only reasoning and no visible reply, or nothing if your parser ignores output[]!!! This might be where to start investigating. - -### Saturn Visual Solver Integration (Can be ignored) -- Python-based visual reasoning solver -- Streams progress via WebSockets and NDJSON events -- Requires OPENAI_API_KEY for image analysis -- Image gallery with real-time updates -### WebSocket Integration -Saturn solver uses WebSocket for real-time progress streaming with event-based updates and image gallery rendering. - - - -ARC-AGI-2 contains 1,000 public training tasks and 120 public evaluation tasks. 
- -The training tasks are intended to demonstrate the task format and the Core Knowledge priors used by ARC-AGI. They can be used for training AI models. The public evaluation tasks are intended for testing AI models that have never seen these tasks before. Average human performance on these tasks in our test sample was 66%. - -ARC-AGI-2 also features two private test sets not included in the repo: - -A semi-private set intended for testing remotely-hosted commercial models with low leakage probability. It is calibrated to be the same human-facing difficulty as the public evaluation set. -A fully-private set intended for testing self-contained models during the ARC Prize competition, with near-zero leakage probability. It is also calibrated to be the same difficulty. -This multi-tiered structure allows for both open research and a secure, high-stakes competition. - -Task success criterion -A test-taker is said to solve a task when, upon seeing the task for the first time, they are able to produce the correct output grid for all test inputs in the task (this includes picking the dimensions of the output grid). For each test input, the test-taker is allowed 2 trials (this holds for all test-takers, either humans or AI). - -Task file format -The data directory contains two subdirectories: - -data/training: contains the task files for training (1000 tasks). Use these to prototype your algorithm or to train your algorithm to acquire ARC-relevant cognitive priors. This set combines tasks from ARC-AGI-1 as well as new tasks. -data/evaluation: contains the task files for evaluation (120 tasks). Use these to evaluate your final algorithm. To ensure fair evaluation results, do not leak information from the evaluation set into your algorithm (e.g. by looking at the evaluation tasks yourself during development, or by repeatedly modifying an algorithm while using its evaluation score as feedback). 
Each task in evaluation has been solved by a minimum of 2 people (many tasks were solved by more) in 2 attempts or less in a controlled test. -The tasks are stored in JSON format. Each task JSON file contains a dictionary with two fields: - -"train": demonstration input/output pairs. It is a list of "pairs" (typically 3 pairs). -"test": test input/output pairs. It is a list of "pairs" (typically 1-2 pairs). -A "pair" is a dictionary with two fields: - -"input": the input "grid" for the pair. -"output": the output "grid" for the pair. -A "grid" is a rectangular matrix (list of lists) of integers between 0 and 9 (inclusive). The smallest possible grid size is 1x1 and the largest is 30x30. - -When looking at a task, a test-taker has access to inputs & outputs of the demonstration pairs, plus the input(s) of the test pair(s). The goal is to construct the output grid(s) corresponding to the test input grid(s), using 2 trials for each test input. "Constructing the output grid" involves picking the height and width of the output grid, then filling each cell in the grid with a symbol (integer between 0 and 9, which are visualized as colors). Only exact solutions (all cells match the expected answer) can be said to be correct. 
\ No newline at end of file +ARC-AGI Dataset Information +Data Loading Priority +ARC datasets loaded in order: + +ARC2-Eval (evaluation2) - Highest priority +ARC2 (training2) +ARC1-Eval (evaluation) +ARC1 (training) - Lowest priority +ARC-AGI-2 Structure (arxiv.org) +Training Set: 1,000 public tasks for prototyping/training +Public Eval Set: 120 calibrated tasks for final evaluation +Average Human Performance: 66% on evaluation tasks +Task Success: Correct output grid for all test inputs within 2 trials +Task File Format + +{ + "train": [ // Demonstration pairs (typically 3) + { + "input": [[grid_matrix]], // 1x1 to 30x30 grid + "output": [[grid_matrix]] // Integers 0-9 + } + ], + "test": [ // Test pairs (typically 1-2) + { + "input": [[grid_matrix]], + "output": [[grid_matrix]] // Target for prediction + } + ] +} +Common Commands +Development +npm run test - Build and start dev server (wait 10 seconds) +User manages dev server - only run commands when explicitly told +Use "Kill Bash" to stop dev server +Database Management +npm run db:push - Push schema changes using Drizzle +Tables auto-create on startup with PostgreSQL +Testing Philosophy +User handles testing and validation +Wait 20 seconds when running tests to read output +Tell a coding joke while waiting for test results +Second-guess user suggestions that violate best practices +Important Implementation Notes +Technical Configuration +ESM Modules throughout (type: "module" in package.json) +TypeScript with shared types in shared/types.ts +Path aliases: @/* (client), @shared/* (shared types) +Production vs Development +Development: Vite dev server (:5173), Express API (:5000) +Production: Express serves static files from dist/public with SPA fallback +Environment Variables (Present and Working) +AI Services (at least one required): + +OPENAI_API_KEY, GROK_API_KEY, GEMINI_API_KEY, ANTHROPIC_API_KEY, DEEPSEEK_API_KEY, OPENROUTER_API_KEY +Database: + +DATABASE_URL - PostgreSQL connection +External Integration 
+API Documentation +docs/EXTERNAL_API.md - Complete API endpoint reference +docs/HOOKS_REFERENCE.md - React hooks documentation +Key External APIs +/api/feedback/accuracy-stats - Pure accuracy leaderboard +/api/puzzle/performance-stats - Trustworthiness metrics +/api/feedback/stats - User feedback statistics +/api/metrics/comprehensive-dashboard - Combined analytics +Repository Pattern for External Apps + +// Access data through repositoryService, not direct queries +repositoryService.accuracy.getPureAccuracyStats() // Accuracy leaderboards +repositoryService.trustworthiness.getTrustworthinessStats() // Trustworthiness metrics +repositoryService.cost.getAllModelCosts() // Cost analysis +repositoryService.explanation.getByPuzzle(puzzleId) // Explanations +repositoryService.feedback.create(...) // Submit feedback +🚫 PROHIBITED ACTIONS +No time estimates - Never give completion time predictions +No celebration - Avoid "done/finished" assertions +No shortcuts - Never compromise on code quality +No over-engineering - Keep solutions simple and maintainable for hobby project scale \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md index 0ea191314..b217eb49e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,588 @@ +## [4.8.9] - 2025-10-13 +### 🎨 REDESIGN: Saturn Visual Solver with ATC Design System + +**Problem:** Saturn Visual Solver UI was cluttered, lacked information density, and didn't follow consistent design patterns. + +**Solution:** Complete rebuild using Agent Traffic Control design system with focus on information density and modular architecture. 
+ +**Key Design Principles Applied:** +- **CSS Grid layouts** - 30%/70% column splits for maximum screen density +- **Monospace terminal logs** - Real-time log streaming with color-coded status +- **Small modular components** - Each component ~100 lines, single responsibility +- **Status-based color coding** - Visual indicators for analyzing/generating/complete states +- **Information-dense** - Show everything at once, no minimalism +- **Light theme** - Clean white background with amber accents instead of dark theme + +**New Components Created:** +1. **SaturnMonitoringTable.tsx** (~90 lines) + - Puzzle ID, status, phase, progress tracking + - Status color coding (blue=running, green=complete, red=error) + - Information-dense 6-cell grid layout + +2. **SaturnWorkTable.tsx** (~110 lines) + - Phase history table with status-based row colors + - Amber=in-progress, emerald=completed, red=errors + - Monospace font, ATC-style table design + +3. **SaturnTerminalLogs.tsx** (~100 lines) + - Monospace terminal log display with auto-scroll + - Color-coded log levels (red=error, yellow=warning, green=success) + - Live reasoning display in blue box + - Shows line count and connection status + +4. 
**SaturnRadarCanvas.tsx** (~130 lines) + - Information-dense image gallery + integrated controls + - 180px control panel (model/temp/effort) + image grid + - Shows all generated images simultaneously + - Master control panel with Execute button + +**Page Architecture:** +- **SaturnVisualSolver.tsx** - Main orchestration page +- **Desktop Layout:** 30%/70% grid split + - Left: Monitoring Table + Work Table (stacked) + - Right: Terminal Logs + Radar Canvas (stacked) +- **Mobile Layout:** Vertical stack with compact views +- **Light theme** throughout with gray-50 backgrounds + +**Files Modified:** +- `client/src/pages/SaturnVisualSolver.tsx` - Complete rewrite +- `client/src/components/saturn/SaturnMonitoringTable.tsx` - NEW +- `client/src/components/saturn/SaturnWorkTable.tsx` - NEW +- `client/src/components/saturn/SaturnTerminalLogs.tsx` - NEW +- `client/src/components/saturn/SaturnRadarCanvas.tsx` - NEW + +**Impact:** Saturn Visual Solver now has professional, information-dense UI matching Agent Traffic Control design patterns. All components follow SRP, are small and focused, and provide maximum screen real estate utilization. + +--- + +## [4.8.8] - 2025-10-13 +### 🚀 NEW: ARC API Client for External Researchers + +**Problem:** Python researchers needed simple way to contribute analyses to ARC Explainer encyclopedia using existing API endpoints. + +**Solution:** Created simple Python client (`tools/api-client/`) that provides one-line integration for researchers to contribute ARC puzzle analyses. + +**Features:** +- **One-line contribution:** `contribute_to_arc_explainer(puzzle_id, analysis, model, url, key)` +- **Current model support:** Uses October 2025 model names (grok-4-2025-10-13, gpt-5-turbo-2025-10-13, etc.) 
+
+- **Existing API integration:** Calls `POST /api/puzzle/save-explained/:puzzleId` endpoint
+- **Model-specific functions:** `contribute_grok4_analysis()`, `contribute_gpt5_analysis()`, `contribute_claude_analysis()`
+- **Batch processing:** `contribute_batch_analyses()` for multiple puzzles
+- **Minimal dependencies:** Only requires the `requests` library
+
+**Files:**
+- `tools/api-client/arc_client.py` - Main API client
+- `tools/api-client/examples.py` - Usage examples
+- `tools/api-client/README.md` - Complete documentation
+
+**Usage:**
+```python
+from arc_client import contribute_to_arc_explainer
+
+# One-line contribution to encyclopedia
+result = contribute_to_arc_explainer(
+    "3a25b0d8", analysis_result, "grok-4-2025-10-13",
+    "https://arc-explainer-staging.up.railway.app", "your-api-key"
+)
+```
+
+**Impact:** Enables Python researchers to easily contribute to ARC puzzle encyclopedia using current SOTA models.
+
+---
+
+## [4.8.7] - 2025-10-13
+### 🐛 FIX: Saturn Solver SSE Streaming Issues
+
+**Problems:**
+1. Redundant `emitStreamChunk()` calls in `sendProgress` helper
+2. 
Missing `analysis` wrapper in `finalizeStream()` causing the frontend to not find saved data
+
+**Solutions:**
+- Removed redundant `emitStreamChunk()` from `sendProgress` helper (lines 115-118)
+  - Status messages already emitted via `emitStreamEvent()` with proper payload
+  - `emitStreamChunk()` is for content deltas only (like OpenAI text streaming)
+- Wrapped `finalResponse` in `analysis` field in `finalizeStream()` call (lines 434-436)
+  - Frontend expects `summary?.responseSummary?.analysis` structure
+  - Ensures Saturn streaming matches OpenAI/Grok streaming format
+
+**Benefits:**
+- ✅ Eliminates duplicate status messages in SSE stream
+- ✅ Frontend correctly displays and saves Saturn streaming results
+- ✅ Consistent streaming architecture across all services
+
+**Files:** `server/services/saturnService.ts`
+
+---
+
+## [4.8.6] - 2025-10-13
+### 🐛 FIX: Streaming Modal Stays Open After Completion
+
+**Problem:** Streaming modal disappeared immediately when analysis completed, before user could see final result or saved explanation.
+
+**Root Cause:** `resetStreamingState()` called immediately after save, setting `streamingModelKey` to null and closing modal.
+
+**Solution:**
+- Removed immediate `resetStreamingState()` call from `handleStreamingComplete()`
+- Modal now stays open when status="completed", showing "Close" button
+- User can review final streaming output before manually closing
+- `resetStreamingState()` only called when user clicks "Close" button
+
+**Benefits:**
+- ✅ User sees completed streaming analysis result
+- ✅ Explanation saves and appears in list while modal still open
+- ✅ User controls when to dismiss the modal
+- ✅ Better UX - no jarring disappearance
+
+**Files:** `client/src/hooks/useAnalysisResults.ts`
+
+---
+
+## [4.8.5] - 2025-10-13
+### ✨ UX: Smart Prompt Preview (Show Once Per Config)
+
+**Problem:** Preview modal appeared on every run, which was tedious when testing multiple models with the same prompt. 
+ +**Solution:** +- Preview shows only on **first run** for a given prompt configuration +- Button changes: "Preview & Run" → "Run" after first confirmation +- Resets automatically when prompt template/settings change + +**Impact:** Preserves safety on first run, removes friction for batch model testing. + +**Files:** `client/src/components/puzzle/ModelTable.tsx` + +--- + +## [4.8.4] - 2025-10-13 +### 🔧 CRITICAL: OpenAI Streaming Event Field Access Fix + +**Problem:** Code accessed non-existent `content` field on streaming events, causing empty reasoning/text deltas. + +**Root Cause:** OpenAI SDK v5.16.0 uses different field names per event type: +- `ResponseReasoningSummaryTextDeltaEvent` → `delta` (not `content`) +- `ResponseReasoningSummaryPartAddedEvent` → `part.text` (not `content`) +- `ResponseContentPartAddedEvent` → `part.text` (not `content`) + +**Solution:** +- Fixed field access in `handleStreamingEvent()` to match SDK types +- Added proper type imports and replaced `as any` casts +- Added type guards for union type handling + +**Impact:** Real-time reasoning/content now streams correctly for GPT-5 models. + +**Files:** `server/services/openai.ts` + +--- + +## [4.8.3] - 2025-10-13 12:35 AM +### 🔧 OPENAI SERVICE COMPILATION FIXES + +**CRITICAL FIXES TO OPENAI SERVICE:** + +#### 1. **Fixed Corrupted OpenAI Service File** +- **Problem**: `server/services/openai.ts` had corrupted syntax, missing method implementations, and TypeScript compilation errors +- **Root Cause**: Previous edits introduced syntax errors, incomplete method definitions, and corrupted code blocks +- **Solution**: + - Fixed bracket/brace mismatches and malformed statements + - Implemented missing abstract methods (`parseProviderResponse`) + - Repaired corrupted code sections (lines 715-754) + - Added proper error handling and method signatures + +#### 2. 
**Corrected OpenAI Responses API Streaming Events** +- **Problem**: Using incorrect event types (`response.reasoning.delta`, `response.output.delta`) that don't exist in OpenAI's API +- **Solution**: Updated `handleStreamingEvent()` method to use correct event types: + - `response.reasoning_summary_part.added` - Accumulates reasoning parts in real-time + - `response.reasoning_summary_text.done` - Finalizes reasoning summary + - `response.content_part.added` - Handles text content deltas +- **Impact**: Real-time reasoning display now works correctly for GPT-5 models + +#### 3. **Fixed Method Ordering and Dependencies** +- **Problem**: `normalizeOpenAIResponse()` method was defined after being called +- **Solution**: Moved method definition before `analyzePuzzleWithStreaming()` method +- **Impact**: Eliminates "method does not exist" TypeScript errors + +#### 4. **Enhanced Response Parsing** +- **Added**: Complete `parseProviderResponse()` implementation with proper return types +- **Added**: `callResponsesAPI()` method for HTTP calls to OpenAI Responses API +- **Fixed**: Token usage extraction and reasoning capture logic +- **Impact**: OpenAI service now properly handles both streaming and non-streaming responses + +#### 5. **Improved Type Safety** +- **Fixed**: Implicit `any` types in method parameters +- **Added**: Proper TypeScript type annotations throughout +- **Impact**: Better IDE support and compile-time error detection + +**Files Modified:** +- `server/services/openai.ts` - Complete overhaul and fixes + +**Impact**: OpenAI service now compiles successfully and handles streaming puzzle analysis correctly. Real-time reasoning feedback works as intended. 
+ +**Author**: Cascade using DeepSeek V3.2 Exp +**Date**: 2025-10-13 + +--- + +## [4.8.2] - 2025-10-12 11:20 PM +### 🔧 HEURISTIC ARC SOLVER INTEGRATION + +**NEW INTERNAL SOLVER ADDED:** + +#### Heuristic Solver Package (`solver/heuristic/`) +**Modular Python package with SRP (Single Responsibility Principle) design:** + +- **`grids.py`** - Grid operations and utilities (trim, rotate, flip, color mapping, connected components) +- **`prims.py`** - Parameterized transform primitives (geometry, object ops, learned color mappings) +- **`program.py`** - Program search and composition logic (single → composition → fallback strategy) +- **`cli.py`** - JSON contract interface for backend integration + +**Key Features:** +- **Learning Strategy**: Learns transformations from training examples using primitive operations +- **Search Algorithm**: Single transforms → Two-step compositions → Trim+transform → Fallback +- **Shape Handling**: Median target shape from training outputs with padding/trimming +- **Performance**: Very fast (< 1s) using only numpy, no external API calls +- **Integration**: `heuristic-solver` model key routes to internal Python execution + +**Backend Integration:** +- **Service**: `HeuristicService` extends `BaseAIService` (same pattern as Grover/Saturn) +- **Factory Routing**: `model.startsWith('heuristic-')` → `heuristicService` +- **Database**: Full compatibility with existing schema and validation +- **Error Handling**: Proper error propagation and fallback strategies + +**Files Added:** +- `solver/heuristic/__init__.py` - Package initialization +- `solver/heuristic/grids.py` - Grid manipulation utilities +- `solver/heuristic/prims.py` - Transform primitive definitions +- `solver/heuristic/program.py` - Learning and composition logic +- `solver/heuristic/cli.py` - JSON contract interface +- `solver/heuristic_solver.py` - Single-file version for easy deployment +- `server/services/heuristic.ts` - Backend service integration +- 
`docs/2025-10-12-plan-heuristic-solver.md` - Complete integration documentation + +**Usage:** +```bash +# Test individual puzzle +python solver/heuristic_solver.py data/arc-heavy/50846271.json + +# Backend integration (saves to database) +POST /api/puzzle/analyze/50846271/heuristic-solver +``` + +**Impact:** Provides fast, reliable baseline solver for obvious ARC patterns. Ready for jjosh library integration via `merge()`/`diff()` adapters. + +--- + +## [4.8.1] - 2025-10-12 11:00 PM +### 💰 COST CONTROL: Prompt Preview Confirmation + Prompt Order Fix + +**TWO CRITICAL IMPROVEMENTS:** + +#### 1. 🔍 Mandatory Prompt Preview Before Expensive API Calls +**Problem:** Users could accidentally trigger expensive LLM API calls without seeing what prompt would be sent. + +**Solution - Two-Step Confirmation Flow:** +- **"Preview & Run" buttons** replace direct "Run" buttons in ModelTable +- **Confirmation modal** shows complete prompt before execution: + - System prompt (AI role/behavior) + - User prompt (puzzle data + instructions) + - Estimated token count and character count + - Template info and mode badges +- **User must confirm** by clicking "Confirm & Run" button +- **Can cancel** without any API call or charges +- **Loading state** shown while analysis starts + +**Files Changed:** +- `client/src/components/PromptPreviewModal.tsx` - Added confirmation mode with confirm/cancel buttons +- `client/src/components/puzzle/ModelTable.tsx` - Integrated preview modal, changed Run → Preview & Run +- `client/src/pages/PuzzleExaminer.tsx` - Pass prompt configuration props to ModelTable + +**Impact:** Prevents accidental expensive API calls. Users verify prompt correctness before spending money. + +#### 2. 📝 Fixed Prompt Order: Data Before Instructions +**Problem:** Task descriptions came BEFORE puzzle data, making prompts confusing ("analyze the examples below" but examples came after). + +**Solution - Reordered User Prompts:** +1. Training examples (data) +2. Test cases (data) +3. 
Emoji legend (if applicable) +4. Task description (instructions) + +**Files Changed:** +- `server/services/prompts/userTemplates.ts` - Moved task description to end in all prompt modes (solver, explanation, discussion, debate) + +**Impact:** Improved prompt clarity - AI sees the data first, then reads instructions on what to do with it. + +--- + +## [4.8.0] - 2025-10-12 8:45 PM +### 🎨 MAJOR UX OVERHAUL: Data-Dense Layout & Explicit Grid Labeling + +**THREE MAJOR IMPROVEMENTS:** + +#### 1. 📊 Grid Pair Redesign - Explicit INPUT/OUTPUT Labels +**Problem:** Users couldn't clearly see which grid was input vs output, especially with multiple test outputs. + +**Solution - New `GridPair` Component:** +- **Explicit labels:** "📥 INPUT" and "📤 OUTPUT" badges above each grid +- **Split container design:** Vertical divider between input and output sections +- **Multi-output support:** Displays "N outputs" badge and labels as "OUTPUT 1", "OUTPUT 2", etc. +- **Color-coded sections:** + - Training pairs: Blue input bg, amber output bg, gray borders + - Test pairs: Blue input bg, green output bg, green borders with title bar +- **Title bar:** Shows "Training Example N" or "Test N" with multi-output indicator + +**Impact:** Eliminates ambiguity about which grid transforms into which, especially critical for multi-output test cases. + +#### 2. 
📋 Model Table Improvements - Sticky Header & Smart Sorting +**Problem:** +- Scrolling long model lists lost header context +- Models unsorted, newest models buried at bottom +- "Runs" and "Streaming" columns had poor visual clarity + +**Solutions:** +- **Sticky header:** Table header stays visible during scroll (max-height: 600px) +- **Smart sorting:** Models sorted by release date (newest first), then alphabetically + - Models without release dates pushed to bottom + - GPT-4.1, o4-mini, latest models now appear at top +- **Better column display:** + - "Runs" column: Shows "0" instead of "-", green badge for completed runs + - "Stream" column: Blue badge "Yes"/"LIVE" or "No" (was text-only) + - Header renamed "Streaming" → "Stream" for compactness + +**Impact:** Users immediately see newest models and header context never lost. + +#### 3. 🗜️ Data-Dense Compact Controls +**From previous commits:** +- Merged 3 CollapsibleCard sections into 2 compact panels +- Prompt controls in single row: dropdown + toggles + preview button +- Advanced parameters collapsible but always accessible +- ~75% less vertical space while preserving all functionality + +**FILES CHANGED:** +- `client/src/components/puzzle/GridPair.tsx` - **NEW** (119 lines) +- `client/src/components/puzzle/PuzzleGridDisplay.tsx` - Refactored to use GridPair +- `client/src/components/puzzle/ModelTable.tsx` - Sticky header, sorting, badge columns +- `client/src/components/puzzle/CompactControls.tsx` - From earlier commit +- `server/routes/models.ts` - Added releaseDate to API responses +- `client/src/components/puzzle/PuzzleGrid.tsx` - Removed confusing highlight prop + +**TECHNICAL DETAILS:** +- GridPair component handles single and multiple outputs +- Responsive classification preserved (standard/wide/tall grid layouts) +- DaisyUI badges and sticky positioning used throughout +- No breaking changes to existing props or types + +**UX WINS:** +✅ Input/output relationship crystal clear with explicit labels +✅ 
Multi-output test cases unambiguous with numbered outputs +✅ Model table header always visible when scrolling +✅ Newest models at top of list (2025-04 releases first) +✅ Cleaner badge-based column styling +✅ 75% reduction in control panel vertical space + +--- + +## [4.7.1] - 2025-10-12 6:00 PM +### 🎯 CRITICAL FIX: Grover Live Streaming - Complete Terminal Experience + +**SEVERITY:** P0 - Complete absence of real-time Python execution feedback + +**ROOT CAUSE:** +The fundamental issue was NOT in the streaming infrastructure (SSE, WebSocket, harness) - those all work perfectly. The problem was that **Python execution was a black hole**. Users couldn't see what was happening during the 30-60 second execution periods. + +**WHAT WAS MISSING:** +1. ❌ Generated Python code from each iteration +2. ❌ Real-time Python execution progress ("Executing program 1 of 3...") +3. ❌ Individual program pass/fail status during execution +4. ❌ Execution results and scores +5. ❌ The winning program highlighted after each iteration +6. ❌ Best program evolution across iterations + +**THE FIX - Terminal-Style Live Output:** + +**1. Python Executor Streaming (`grover_executor.py`)** +- Added progress events DURING execution (not just at the end) +- Emits `{"type": "log", "message": "⚙️ Executing program 1 of 3..."}` before each program +- Emits success/failure status after each execution +- Works for both training mode (multiple programs) and test mode (best program on test cases) +- All events are NDJSON (one JSON object per line) for line-by-line streaming + +**2. Python Bridge Streaming (`pythonBridge.ts`)** +- `runGroverExecution()` now uses `readline.createInterface()` like Saturn +- Processes stdout line-by-line in real-time (not buffered) +- Added optional `onLog` callback parameter to forward Python logs immediately +- `runGroverTestExecution()` gets same streaming treatment +- Python log events are forwarded to the callback as they arrive + +**3. 
Grover Service Display (`grover.ts`)** +- Shows generated Python code from LLM with visual separators +- Displays execution results table after Python runs +- Highlights new best programs with trophy emoji 🏆 +- Python execution logs stream in real-time through `sendProgress` callback +- All logs flow to both WebSocket (legacy) and SSE (streaming) paths + +**WHAT USERS NOW SEE:** +``` +✅ LLM generates 3 Python programs → CODE DISPLAYED IMMEDIATELY +✅ "⚙️ Executing program 1 of 3..." → LIVE PYTHON PROGRESS +✅ "✅ Program 1 executed successfully" → INSTANT FEEDBACK +✅ Execution results table → SCORES & ERRORS +✅ 🏆 NEW BEST PROGRAM! → WINNING CODE HIGHLIGHTED +✅ Iteration summary → PROGRESS TRACKING +``` + +**WHY THIS FIXES THE BLANK SCREEN:** +- Frontend hooks (`useSaturnProgress`, `useGroverProgress`) already append logs to `logLines` +- UI components already render `logLines` in terminal-style panels +- The missing piece was **THE SOURCE** - Python wasn't emitting anything to stream +- Now Python emits progress → Bridge streams it → Grover forwards it → SSE delivers it → UI displays it + +**FILES CHANGED:** +- `server/python/grover_executor.py`: Added NDJSON log events during execution (lines 123-164) +- `server/services/pythonBridge.ts`: Changed from buffering to line-by-line streaming (lines 246-330, 339-427) +- `server/services/grover.ts`: Added code display, execution results, best program highlighting (lines 231-277, 523-527, 612-619) + +**TESTING INSTRUCTIONS:** +1. Navigate to Grover Solver page +2. Select a puzzle and click "Start Grover Analysis" +3. Watch the terminal panel fill with: + - Iteration start messages + - Generated Python code blocks + - Real-time execution progress + - Success/failure status per program + - Execution results table + - Best program highlights +4. Verify logs appear **AS THEY HAPPEN** (not all at the end) +5. 
Verify you can see the evolution of code across iterations + +**AUTHOR:** Sonnet 4.5 +**PRIORITY:** P0 (Critical UX Failure) + +--- + +## [4.7.0] - 2025-10-12 5:45 PM +### ✨ FEATURE: Complete DaisyUI Conversion - Dependency Components (15/15) + +**SCOPE:** Converted all 15 assigned dependency components from shadcn/ui to DaisyUI + +**GROUP A - Gallery & Modal Components (7 files):** +- TrainingPairCard.tsx: Card → DaisyUI card +- TrainingPairGallery.tsx: Badge → DaisyUI badge +- TestCaseGallery.tsx: Badge → DaisyUI badge +- PredictionCard.tsx: Badge → DaisyUI badge +- TrainingPairZoomModal.tsx: Dialog → DaisyUI modal +- TestCaseZoomModal.tsx: Dialog → DaisyUI modal +- PromptPreviewModal.tsx: Dialog + Button → DaisyUI modal + button + +**GROUP B - Analysis Result Components (8 files):** +- AnalysisResultMetrics.tsx: Badge → DaisyUI badge +- AnalysisResultCard.tsx: Badge → DaisyUI badge +- AnalysisResultHeader.tsx: Badge + Button → DaisyUI (30+ conversions) +- AnalysisResultContent.tsx: Badge + Button → DaisyUI +- AnalysisResultGrid.tsx: Badge + Button → DaisyUI +- AnalysisResultActions.tsx: No changes needed +- OriginalExplanationCard.tsx: Card + Badge + Button + Collapsible → DaisyUI +- IterationCard.tsx: Card + Badge + Button + Collapsible → DaisyUI + +**CONVERSION PATTERNS:** +- Card → `
` +- Badge → `
` +- Button → `