From 729198b50b08950ae4113ba869697d21ee9f8286 Mon Sep 17 00:00:00 2001 From: bimu233 Date: Sat, 25 Apr 2026 10:58:34 -0500 Subject: [PATCH 1/3] inter phase memory --- src/core/runner.py | 10 ++ templates/agents/session_instructions.txt | 73 ++++++++- templates/base/deliverables/state_template.md | 55 +++++++ templates/base/researcher.txt | 151 +++++++++++++++--- 4 files changed, 268 insertions(+), 21 deletions(-) create mode 100644 templates/base/deliverables/state_template.md diff --git a/src/core/runner.py b/src/core/runner.py index 9b61fa7..758190e 100644 --- a/src/core/runner.py +++ b/src/core/runner.py @@ -300,6 +300,16 @@ def run_research(self, idea_id: str, if use_scribe: (work_dir / "notebooks").mkdir(parents=True, exist_ok=True) + # Initialize STATE.md from template if not already present. + # The agent reads and updates this file; the pipeline owns creation so + # the template structure (italic description lines) is always correct. + state_md = work_dir / "STATE.md" + if not state_md.exists(): + state_template = self.project_root / "templates" / "base" / "deliverables" / "state_template.md" + if state_template.exists(): + state_md.write_text(state_template.read_text(encoding='utf-8'), encoding='utf-8') + print(f" Initialized STATE.md") + # Copy helper scripts to workspace self._copy_workspace_resources(work_dir) diff --git a/templates/agents/session_instructions.txt b/templates/agents/session_instructions.txt index 6609cc7..82c231e 100644 --- a/templates/agents/session_instructions.txt +++ b/templates/agents/session_instructions.txt @@ -1,5 +1,24 @@ {{ session_start }} {{ priority_section }} +RESUME CHECK — do this before anything else: +──────────────────────────────────────────────────────────────────────────────── +STATE.md is present in your workspace (created by the pipeline). +READ it now. 
Check the Phase Status table in "# Current State": + - All phases pending → new session, start from Phase 0 + - Some phases DONE → resume session, skip DONE phases, start from first non-DONE +The rest of STATE.md contains all decisions, paths, errors, and results from +prior sessions — use it to avoid repeating work or mistakes. +If resuming into Phase 3 (Implementation): READ # Errors and Corrections before touching any code. +Each entry lists an error and the fix that was attempted — do not retry fixes already listed there. + +⚠️ IMMEDIATELY after reading STATE.md — before environment setup, before any other action: +Write to STATE.md now: + - Session Title: set a 5-10 word title for this research task + - Worklog: add "[Session Start] Beginning session — Phase 0 IN PROGRESS" + - Current State: set Active to "Phase 0 — Motivation: IN PROGRESS" +This write must happen BEFORE environment setup so context compaction cannot erase it. +════════════════════════════════════════════════════════════════════════════════ + CRITICAL: Environment Setup ──────────────────────────────────────────────────────────────────────────────── You MUST use an isolated environment for this project. 
DO NOT use the @@ -138,11 +157,16 @@ run meaningful experiments that test the hypothesis using the resources provided WHAT "FULLY-AUTOMATED" MEANS: ✓ Complete ALL phases (1-6) in a SINGLE CONTINUOUS SESSION ✓ Make reasonable decisions autonomously without waiting for user input -✓ Move immediately from one phase to the next +✓ Move immediately from one phase to the next — EXCEPT at ⚠️ CHECKPOINT steps ✓ Use the pre-gathered resources effectively -✓ Document decisions, but keep moving forward +✓ Document decisions — ⚠️ CHECKPOINT writes to STATE.md ARE the documentation step, not optional ✓ Deliver REPORT.md with actual experimental results at the end +⚠️ CHECKPOINTS ARE MANDATORY PHASE TRANSITIONS — NOT OPTIONAL PAUSES: +Every ⚠️ CHECKPOINT in the research task is a required gate between phases. +Writing to STATE.md at each checkpoint IS "moving forward" — it is part of the phase. +Do NOT skip ⚠️ CHECKPOINT steps. They take less than 1 minute and enable session recovery. + YOU WILL NOT GET ADDITIONAL INSTRUCTIONS between phases. This prompt contains everything you need. Execute it completely from start to finish. @@ -198,7 +222,31 @@ Execute the following research task: EXECUTION WORKFLOW (FOLLOW THIS SEQUENCE - DO NOT STOP BETWEEN PHASES): ──────────────────────────────────────────────────────────────────────────────── +INTER-PHASE MEMORY: STATE.md +──────────────────────────────────────────────────────────────────────────────── +STATE.md is your persistent memory file. It is created by the pipeline before +you start — it is always present in your workspace. It survives context +compaction and lets you resume correctly if the session is interrupted. + +RULES — follow exactly: + 1. SESSION START: READ STATE.md immediately. Check the Phase Status table in + "# Current State" — the first non-DONE phase is where you resume from. + All decisions, paths, errors, and results from prior sessions are in there. + 2. 
PHASE END: update STATE.md before proceeding to the next phase. + 3. NEVER modify lines written in italic format: _like this_ + These are structural template instructions — preserve them EXACTLY as written. + Do not reword, remove, or overwrite italic lines under any circumstance. + 4. Only write content BELOW the italic description line in each section. + 5. Write detail-dense content: absolute file paths, exact error messages, + exact numerical results, precise shell commands. + 6. Update "# Current State" whenever the phase status or active action changes. + 7. Keep total file size under 150 lines / ~3,000 tokens. + If a section exceeds ~20 lines, condense older entries first. +──────────────────────────────────────────────────────────────────────────────── + BEFORE YOU START: Review Pre-Gathered Resources (5-10 min) + ✓ READ STATE.md — check Phase Status table to determine if this is a resume. + If any phase shows DONE, skip it and continue from the first non-DONE phase. ✓ READ literature_review.md to understand the research landscape ✓ READ resources.md to see what's available ✓ Browse papers/ directory for key papers @@ -218,6 +266,9 @@ Phase 1: Motivation & Planning (20-40 min) ✓ Choose baselines and metrics based on literature review ✓ Plan timeline and resource allocation ✓ Document plan in planning.md (2-3 pages maximum) + ✓ STATE.md: Write Research Specification (hypothesis, datasets, metrics, constraints). + Update Worklog. + Update Current State: mark Phase 1 as DONE, set Active to "Phase 2 — Setup". → WHEN COMPLETE: Immediately proceed to Phase 2 (Setup) → DO NOT WAIT for user confirmation - this is fully automated! @@ -228,6 +279,9 @@ Phase 2: Environment & Data Setup (10-20 min) ✓ Load and verify pre-downloaded datasets from datasets/ directory ✓ Verify data quality and characteristics ✓ Run exploratory data analysis + ✓ STATE.md: Update Files and Resources (dataset paths, sizes, venv location). 
+ Update Workflow (activation command, install commands). Update Worklog. + Update Current State: mark Phase 2 as DONE, set Active to "Phase 3 — Implementation". → WHEN COMPLETE: Immediately proceed to Phase 3 (Implementation) @@ -239,6 +293,10 @@ Phase 3: Implementation (60-90 min) ✓ Create evaluation harness ✓ Write clean, documented code with comments and docstrings ✓ Test incrementally + ✓ STATE.md: Update Experiment Design (baselines, metrics, hyperparameters, seeds). + Update Files and Resources (key scripts + what they do). Record any errors in + Errors and Corrections. Update Worklog. + Update Current State: mark Phase 3 as DONE, set Active to "Phase 4 — Experimentation". → WHEN COMPLETE: Immediately proceed to Phase 4 (Experiments) @@ -248,6 +306,10 @@ Phase 4: Experimentation (60-90 min) ✓ Collect results systematically (save to results/ directory) ✓ Generate visualizations ✓ Monitor for issues + ✓ STATE.md: Update Experiment Results with raw numbers from every run. + Update Workflow with exact commands used. Record any failed runs in + Errors and Corrections. Update Worklog. + Update Current State: mark Phase 4 as DONE, set Active to "Phase 5 — Analysis". → WHEN COMPLETE: Immediately proceed to Phase 5 (Analysis) @@ -257,6 +319,10 @@ Phase 5: Analysis (30-45 min) ✓ Perform error analysis ✓ Create comprehensive visualizations ✓ Document findings incrementally + ✓ STATE.md: Update Learnings (what worked, what did not, insights). + Finalize Experiment Results with statistical test outcomes and effect sizes. + Update Worklog. + Update Current State: mark Phase 5 as DONE, set Active to "Phase 6 — Documentation". 
→ WHEN COMPLETE: Immediately proceed to Phase 6 (Documentation) @@ -266,6 +332,9 @@ Phase 6: Final Documentation (20-30 min) - MANDATORY BEFORE ENDING SESSION ✓ Ensure resources.md documents your research process ✓ Verify all code has clear comments and docstrings ✓ Check reproducibility + ✓ STATE.md: Final update — set Session Title to reflect completed work. + Update Worklog with session completion entry. + Update Current State: mark Phase 6 as DONE, set Active to "Complete — all deliverables written." → WHEN COMPLETE: Session is finished diff --git a/templates/base/deliverables/state_template.md b/templates/base/deliverables/state_template.md new file mode 100644 index 0000000..9b71e22 --- /dev/null +++ b/templates/base/deliverables/state_template.md @@ -0,0 +1,55 @@ +# Session Title +_5-10 word descriptive title summarizing the research task. Update each phase._ + +# Current State +_Active phase number and name. What is immediately pending. The single most important next action. +Write a Phase Status table (all 7 phases: DONE / IN PROGRESS / pending) then the active action. +Update this section whenever the phase status or active action changes._ + +Phase 0 — Motivation: pending +Phase 1 — Planning: pending +Phase 2 — Setup: pending +Phase 3 — Implementation: pending +Phase 4 — Analysis: pending +Phase 5 — Documentation: pending +Phase 6 — Validation: pending + +Active: Phase 0 — Motivation: [next action here] + +# Worklog +_One terse line per significant action. Phase transitions explicitly noted. +Format: [Phase N — Name] action taken — outcome._ + + +# Research Specification +_Hypothesis being tested. Datasets used (name + absolute path). Evaluation metrics and how computed. Key constraints. +Written in Phase 1. NEVER overwrite this content — only append corrections below it._ + + +# Files and Resources +_Important files with absolute paths. Datasets: name, location, size. Key scripts and what they do. +Model checkpoints. Configuration files. 
Output directories._ + +# Workflow +_Exact commands to reproduce experiments in order. Environment activation command. +How to interpret outputs. Package install commands used._ + +# Errors and Corrections +_Errors encountered: exact message, file path, line number, and how fixed. +Approaches that FAILED and must NOT be retried — include the reason they failed. +This is the anti-repetition record: if it is listed here, do not attempt it again._ + +# Experiment Design +_Baselines chosen and justification. Evaluation metrics and how computed. +Hyperparameters, random seeds, train/val/test splits. Architecture decisions and rationale._ + +# Learnings +_What worked and why. What did not work and why. Surprising findings. Domain insights. +Do not duplicate content already recorded in other sections._ + +# Experiment Results +_Exact numerical results: metric name, value, std, conditions. Complete comparison tables. +Record actual numbers, not prose summaries._ + + + diff --git a/templates/base/researcher.txt b/templates/base/researcher.txt index 16e655c..efe485b 100644 --- a/templates/base/researcher.txt +++ b/templates/base/researcher.txt @@ -71,6 +71,10 @@ For each planned experiment: DO NOT proceed to implementation until this section is complete. +⚠️ CHECKPOINT — STATE.md is present (created by the pipeline). Write to it NOW before continuing: + ✓ In the # Worklog section: append "[Phase 0 — Motivation] Novelty assessment complete." + ✓ Current State: change "Phase 0 — Motivation: pending" to "Phase 0 — Motivation: DONE". Change Active to "Active: Phase 1 — Planning: Starting." + ═══════════════════════════════════════════════════════════════════════════════ AFTER COMPLETING PHASE 0: IMMEDIATELY BEGIN PHASE 1 ───────────────────────────────────────────────────────────────────────────── @@ -180,21 +184,29 @@ Must include: ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ +⚠️ CHECKPOINT — STATE.md is present (created by the pipeline). 
Write to it NOW before continuing: + ✓ In the # Research Specification section: write hypothesis, metrics + how computed, key constraints. + ✓ In the # Files and Resources section: write absolute path to planning.md, dataset names + absolute paths + ✓ In the # Worklog section: append "[Phase 1 — Planning] Plan complete." + ✓ In the # Current State section: change "Phase 1 — Planning: pending" to "Phase 1 — Planning: DONE". Change Active to "Active: Phase 2 — Setup: Starting." + ═══════════════════════════════════════════════════════════════════════════════ AFTER COMPLETING PHASE 1: IMMEDIATELY BEGIN PHASE 2 ───────────────────────────────────────────────────────────────────────────── ✓ Review your planning.md to ensure it's complete -✓ NOW START IMPLEMENTING - Do not wait for user confirmation +✓ In the # Worklog section of STATE.md: append "[Phase 2 — Setup] Starting." +✓ In the # Current State section: change "Phase 2 — Setup: pending" to "Phase 2 — Setup: in progress" +✓ NOW PROCEED to setup - Do not wait for user confirmation ✓ Remember: This is a fully automated research system -✓ The goal is to complete all phases (1-6) in a single continuous session +✓ The goal is to complete all phases (0-6) in a single continuous session ═══════════════════════════════════════════════════════════════════════════════ -PHASE 2: IMPLEMENTATION +PHASE 2: SETUP ───────────────────────────────────────────────────────────────────────────── -Execute your plan systematically and carefully: +Prepare the environment and data before writing any code: 1. Environment Setup ✓ Install required dependencies @@ -214,7 +226,31 @@ Execute your plan systematically and carefully: ✓ Save example samples for documentation ✓ Visualize data distributions -3. Implementation (adapt to your research type) +⚠️ CHECKPOINT — STATE.md is present (created by the pipeline). Write to it NOW before continuing: + ✓ In the # Worklog section: append "[Phase 2 — Setup] Environment and data ready." 
— write this FIRST. + ✓ In the # Files and Resources section: append dataset absolute path + size, venv absolute path, Python version, GPU model/memory. + ✓ In the # Workflow section: write exact activation command (e.g. source /path/.venv/bin/activate), exact package install commands used. + ✓ In the # Current State section: change "Phase 2 — Setup: in progress" to "Phase 2 — Setup: DONE". Change Active to "Active: Phase 3 — Implementation: Starting." +A Worklog entry does NOT substitute for writing to the named sections above. + +═══════════════════════════════════════════════════════════════════════════════ +AFTER COMPLETING PHASE 2: IMMEDIATELY BEGIN PHASE 3 +───────────────────────────────────────────────────────────────────────────── + +✓ Environment is ready with all dependencies installed +✓ Datasets are loaded and validated +✓ In the # Worklog section of STATE.md: append "[Phase 3 — Implementation] Starting." +✓ In the # Current State section: change "Phase 3 — Implementation: pending" to "Phase 3 — Implementation: in progress" +✓ NOW START IMPLEMENTING - Do not wait for user confirmation + +═══════════════════════════════════════════════════════════════════════════════ + +PHASE 3: IMPLEMENTATION +───────────────────────────────────────────────────────────────────────────── + +Execute your plan systematically and carefully: + +1. Implementation (adapt to your research type) ✓ Follow the approach outlined in your research plan ✓ Implement reference methods or baselines first (if applicable) ✓ Implement your proposed method, analysis, or computational workflow @@ -230,19 +266,42 @@ Execute your plan systematically and carefully: ✓ Use descriptive variable names ✓ Add assertions to validate assumptions -5. Error Handling +2. Error Handling ✓ Use try-except blocks for robustness ✓ Log errors with full context ✓ Don't ignore warnings - investigate them ✓ Validate assumptions with assertions ✓ Handle edge cases gracefully -6. 
Iterative Development +⚠️ MANDATORY STATE.md UPDATE — do this NOW, after writing the experiment script and BEFORE running it. +This is a SEPARATE STATE.md edit from the Phase 3 end CHECKPOINT — do not defer or combine them. + + ✓ In the # Worklog section: append "[Phase 3 — Implementation] Experiment script written — starting experiments." — write this FIRST. + ✓ In the # Experiment Design section: Write baselines chosen + justification, metrics + how computed, hyperparameters, random seeds, train/val/test splits. + ✓ In the # Files and Resources: write absolute path to experiment script + one-line description of what it does. + ✓ In the # Workflow section: Write exact command to run the experiment — copy it verbatim (e.g. python /abs/path/experiment.py). + ✓ In the # Current State section: change Active to "Active: Phase 3 — Implementation: running experiments". +A Worklog entry does NOT substitute for writing to the named sections above. + +3. Iterative Development — follow this loop exactly: + + STEP A — Run the experiment. + + STEP B — If the run FAILED: + ⚠️ MANDATORY — do NOT touch any code until this STATE.md write is done: + Append to the # Errors and Corrections section of STATE.md: + Error: [exact last line of traceback or error message] + File: [file path and line number] + Fix: [what you are about to change and why] + This write is the gate. You cannot apply a fix until it is written. + Then apply the fix and return to STEP A. + + STEP C — If the run SUCCEEDED: proceed past this loop to the Phase 3 CHECKPOINT. 
+ + Additional guidance: ✓ Start small - validate on tiny dataset first ✓ Gradually scale up - ✓ Monitor resource usage - ✓ Checkpoint before long-running operations - ✓ Document deviations from plan + ✓ Before each run: READ # Errors and Corrections — do not retry a fix already listed there BEST PRACTICES: @@ -292,19 +351,39 @@ with open('results/config.json', 'w') as f: json.dump(config, f, indent=2) ``` +⚠️ CHECKPOINT — STATE.md is present (created by the pipeline). Write to it NOW before continuing: + ✓ In the # Worklog section: append "[Phase 3 — Implementation] Implementation complete." — write this FIRST. + ✓ In the # Experiment Design section: confirm baselines, metrics, hyperparameters, seeds are complete (add anything missing from the earlier write). + ✓ In the # Files and Resources: confirm all key scripts are listed with absolute paths. Write path to experiment result files. + ✓ In the # Workflow section: confirm exact run commands are recorded verbatim. + ✓ In the # Errors and Corrections section: confirm all errors are recorded (should already be written during the loop). Add any that were missed; mark each fix as applied or pending. If no errors occurred, write "None." + ✓ Current State: change "Phase 3 — Implementation: ..." to "Phase 3 — Implementation: DONE". Change Active to "Active: Phase 4 — Analysis: Starting." +A Worklog entry does NOT substitute for writing to the named sections above. + ═══════════════════════════════════════════════════════════════════════════════ -AFTER COMPLETING PHASE 2: IMMEDIATELY BEGIN PHASE 3 +AFTER COMPLETING PHASE 3: IMMEDIATELY BEGIN PHASE 4 ───────────────────────────────────────────────────────────────────────────── ✓ Verify your implementation is working +✓ In the # Worklog section of STATE.md: append "[Phase 4 — Analysis] Starting." 
✓ NOW PROCEED to analysis - continue the automated workflow ✓ Use your implementation to analyze results and test hypotheses ═══════════════════════════════════════════════════════════════════════════════ -PHASE 3: ANALYSIS +PHASE 4: ANALYSIS ───────────────────────────────────────────────────────────────────────────── +⚠️ FIRST ACTION — READ STATE.md now. Find the line "Phase 3 — Implementation: ..." in # Current State. +If it is NOT "DONE": complete the Phase 3 CHECKPOINT write before doing any analysis: + ✓ In the # Worklog section: append "[Phase 3 — Implementation] Implementation complete." + ✓ In the # Experiment Design section: confirm baselines, metrics, hyperparameters, seeds are recorded. + ✓ In the # Files and Resources section: confirm all key scripts with absolute paths are listed. + ✓ In the # Workflow section: confirm exact run commands are recorded verbatim. + ✓ In the # Errors and Corrections section: confirm all errors recorded; if none, write "None." + ✓ In the # Current State section: change "Phase 3 — Implementation: ..." to "Phase 3 — Implementation: DONE". Change Active to "Active: Phase 4 — Analysis: Starting." + A Worklog entry does NOT substitute for writing to the named sections above. + Interpret results rigorously using chain-of-thought reasoning: 1. Descriptive Statistics @@ -358,19 +437,35 @@ For each result, ask yourself: 4. What additional evidence would strengthen this conclusion? 5. What are the practical implications? +⚠️ CHECKPOINT — STATE.md is present (created by the pipeline). Write to it NOW before continuing: + ✓ In the # Worklog section: append "[Phase 4 — Analysis] Analysis complete." — write this FIRST. + ✓ In the # Experiment Results section: Write exact numbers for every metric, every condition, every baseline — comparison table format. Numbers, not prose. + ✓ In the # Learnings section: write what worked and why (1-2 lines per finding). What failed and why. Surprising findings. Domain insights. 
+ ✓ In the # Current State section: change "Phase 4 — Analysis: ..." to "Phase 4 — Analysis: DONE". Change Active to "Active: Phase 5 — Documentation: Starting." +A Worklog entry does NOT substitute for writing to the named sections above. + ═══════════════════════════════════════════════════════════════════════════════ -AFTER COMPLETING PHASE 3: IMMEDIATELY BEGIN PHASE 4 +AFTER COMPLETING PHASE 4: IMMEDIATELY BEGIN PHASE 5 ───────────────────────────────────────────────────────────────────────────── ✓ Verify your analysis is complete with statistical tests and visualizations -✓ NOW PROCEED to documentation - this is the final phase +✓ In the # Worklog section of STATE.md: append "[Phase 5 — Documentation] Starting." +✓ NOW PROCEED to documentation ✓ Document all your findings, methodology, and results in REPORT.md ═══════════════════════════════════════════════════════════════════════════════ -PHASE 4: DOCUMENTATION +PHASE 5: DOCUMENTATION ───────────────────────────────────────────────────────────────────────────── +⚠️ FIRST ACTION — READ STATE.md now. Find the line "Phase 4 — Analysis: ..." in # Current State. +If it is NOT "DONE": complete the Phase 4 CHECKPOINT write before writing any documentation: + ✓ In the # Worklog section: append "[Phase 4 — Analysis] Analysis complete." + ✓ Experiment Results: exact numbers for every metric, every condition, every baseline — table format. + ✓ Learnings: what worked and why, what failed and why, surprising findings. + ✓ Current State: change "Phase 4 — Analysis: ..." to "Phase 4 — Analysis: DONE". Change Active to "Active: Phase 5 — Documentation: Starting." + A Worklog entry does NOT substitute for writing to the named sections above. + Create comprehensive documentation that allows others (and your future self) to understand and reproduce your work. 
@@ -526,19 +621,31 @@ Common issues and solutions ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ +⚠️ CHECKPOINT — STATE.md is present (created by the pipeline). Write to it NOW before continuing: + ✓ In the # Worklog section: append "[Phase 5 — Documentation] REPORT.md written." — write this FIRST. + ✓ Files and Resources: add REPORT.md absolute path. + ✓ Current State: change "Phase 5 — Documentation: ..." to "Phase 5 — Documentation: DONE". Change Active to "Active: Phase 6 — Validation: Starting." + ═══════════════════════════════════════════════════════════════════════════════ -AFTER COMPLETING PHASE 4: IMMEDIATELY BEGIN PHASE 5 +AFTER COMPLETING PHASE 5: IMMEDIATELY BEGIN PHASE 6 ───────────────────────────────────────────────────────────────────────────── ✓ Verify REPORT.md and README.md are complete with actual results +✓ In the # Worklog section of STATE.md: append "[Phase 6 — Validation] Starting." ✓ NOW PROCEED to final validation - the last step before completion ✓ Validate everything is reproducible and scientifically sound ═══════════════════════════════════════════════════════════════════════════════ -PHASE 5: VALIDATION +PHASE 6: VALIDATION ───────────────────────────────────────────────────────────────────────────── +⚠️ FIRST ACTION — READ STATE.md now. Find the line "Phase 5 — Documentation: ..." in # Current State. +If it is NOT "DONE": complete the Phase 5 CHECKPOINT write before validating: + ✓ In the # Worklog section: append "[Phase 5 — Documentation] REPORT.md written." + ✓ In the # Files and Resources section: add REPORT.md absolute path. + ✓ In the # Current State section: change "Phase 5 — Documentation: ..." to "Phase 5 — Documentation: DONE". Change Active to "Active: Phase 6 — Validation: Starting." + Before considering the work complete, validate everything: ✓ CODE VALIDATION @@ -572,11 +679,17 @@ Before considering the work complete, validate everything: If any checkbox is unchecked, address it before finishing. 
+⚠️ CHECKPOINT — STATE.md is present (created by the pipeline). Write to it NOW before continuing: + ✓ Session Title: set a 5-10 word summary of what was accomplished. + ✓ In the # Worklog section: append "[Phase 6 — Validation] Session complete." + ✓ Current State: change "Phase 6 — Validation: ..." to "Phase 6 — Validation: DONE". Change Active to "Active: Complete — all deliverables written." +NEVER modify italic _description_ lines in STATE.md. + ═══════════════════════════════════════════════════════════════════════════════ -AFTER COMPLETING PHASE 5: RESEARCH SESSION IS COMPLETE +AFTER COMPLETING PHASE 6: RESEARCH SESSION IS COMPLETE ───────────────────────────────────────────────────────────────────────────── -✓ All phases (1-5) have been completed +✓ All phases (0-6) have been completed ✓ REPORT.md contains actual experimental results ✓ All validation checks have passed ✓ The research session is now finished From 601e0f44095a372bc963625385859ea3bc185ded Mon Sep 17 00:00:00 2001 From: bimu233 Date: Sun, 26 Apr 2026 10:51:43 -0500 Subject: [PATCH 2/3] experiment attempts error and fix log enabled --- templates/base/deliverables/state_template.md | 20 +++--- templates/base/researcher.txt | 61 ++++++++++++++----- 2 files changed, 55 insertions(+), 26 deletions(-) diff --git a/templates/base/deliverables/state_template.md b/templates/base/deliverables/state_template.md index 9b71e22..493b8ce 100644 --- a/templates/base/deliverables/state_template.md +++ b/templates/base/deliverables/state_template.md @@ -31,25 +31,25 @@ _Important files with absolute paths. Datasets: name, location, size. Key script Model checkpoints. Configuration files. Output directories._ # Workflow -_Exact commands to reproduce experiments in order. Environment activation command. -How to interpret outputs. Package install commands used._ - -# Errors and Corrections -_Errors encountered: exact message, file path, line number, and how fixed. 
-Approaches that FAILED and must NOT be retried — include the reason they failed. -This is the anti-repetition record: if it is listed here, do not attempt it again._ +_Final successful reproduction commands only: environment activation command, then the exact run command. +Do not list failed attempts here — those belong in # Experiment Attempts._ # Experiment Design _Baselines chosen and justification. Evaluation metrics and how computed. Hyperparameters, random seeds, train/val/test splits. Architecture decisions and rationale._ -# Learnings -_What worked and why. What did not work and why. Surprising findings. Domain insights. -Do not duplicate content already recorded in other sections._ +# Experiment Attempts +_One entry per run. Written BEFORE each run (Status: RUNNING) and updated AFTER (Status: FAILED or SUCCESS). +Never delete an entry or alter one already marked FAILED or SUCCESS — the only permitted edit is finalizing a RUNNING entry._ # Experiment Results _Exact numerical results: metric name, value, std, conditions. Complete comparison tables. Record actual numbers, not prose summaries._ +# Learnings +_What worked and why. What did not work and why. Surprising findings. Domain insights. +Do not duplicate content already recorded in other sections._ + + diff --git a/templates/base/researcher.txt b/templates/base/researcher.txt index efe485b..5add07b 100644 --- a/templates/base/researcher.txt +++ b/templates/base/researcher.txt @@ -229,7 +229,7 @@ Prepare the environment and data before writing any code: ⚠️ CHECKPOINT — STATE.md is present (created by the pipeline). Write to it NOW before continuing: ✓ In the # Worklog section: append "[Phase 2 — Setup] Environment and data ready." — write this FIRST. ✓ In the # Files and Resources section: append dataset absolute path + size, venv absolute path, Python version, GPU model/memory. - ✓ In the # Workflow section: write exact activation command (e.g. source /path/.venv/bin/activate), exact package install commands used.
+ ✓ In the # Workflow section: write exact environment activation command (e.g. source /path/.venv/bin/activate). Package install commands belong in # Files and Resources, not here. ✓ In the # Current State section: change "Phase 2 — Setup: in progress" to "Phase 2 — Setup: DONE". Change Active to "Active: Phase 3 — Implementation: Starting." A Worklog entry does NOT substitute for writing to the named sections above. @@ -283,25 +283,54 @@ This is a SEPARATE STATE.md edit from the Phase 3 end CHECKPOINT — do not defe ✓ In the # Current State section: change Active to "Active: Phase 3 — Implementation: running experiments". A Worklog entry does NOT substitute for writing to the named sections above. -3. Iterative Development — follow this loop exactly: +3. Iterative Development — follow this protocol exactly, one attempt at a time: - STEP A — Run the experiment. + STEP A — BEFORE RUN (write first, run second): + Append to the # Experiment Attempts section of STATE.md: - STEP B — If the run FAILED: - ⚠️ MANDATORY — do NOT touch any code until this STATE.md write is done: - Append to the # Errors and Corrections section of STATE.md: - Error: [exact last line of traceback or error message] - File: [file path and line number] - Fix: [what you are about to change and why] - This write is the gate. You cannot apply a fix until it is written. + Attempt N: + - Status: RUNNING + - Command: [exact command you are about to run] + + Do not run the experiment until this write is complete. + + STEP B — RUN the experiment. + + STEP C — AFTER RUN, update the Attempt N entry in STATE.md immediately. 
+ + A run is FAILED if ANY of the following occur: + - The process exits with a non-zero code + - A Python exception or traceback appears in output + - An output file that should exist is missing + - Metrics contain NaN, Inf, or nonsensical values + - You need to change any code before running again + + IF FAILED — rewrite the existing Attempt N entry in STATE.md # Experiment Attempts as follows BEFORE touching any code: + Attempt N: + - Status: FAILED + - Command: [exact command run] + - Error: [exact last line of traceback or error message] + - Fix Applied: [what you are about to change and why] + - Output: [any partial output saved, or None] Then apply the fix and return to STEP A. - STEP C — If the run SUCCEEDED: proceed past this loop to the Phase 3 CHECKPOINT. + IF SUCCESS — rewrite the existing Attempt N entry in STATE.md # Experiment Attempts as follows: + Attempt N: + - Status: SUCCESS + - Command: [exact command run] + - Metrics: [key metric values] + - Output Files: [paths to result files] + Then proceed past this loop to the Phase 3 CHECKPOINT. + + ⚠️ Hard constraint: You are NOT allowed to modify code or run the next + experiment until the current Attempt entry shows Status: FAILED or SUCCESS + (not RUNNING). Writing RUNNING and FAILED in one combined edit is allowed — + the constraint is that the FAILED entry exists before any code changes. Additional guidance: ✓ Start small - validate on tiny dataset first ✓ Gradually scale up - ✓ Before each run: READ # Errors and Corrections — do not retry a fix already listed there + ✓ Before each run: READ # Experiment Attempts — do not retry a fix already listed there BEST PRACTICES: @@ -355,8 +384,8 @@ with open('results/config.json', 'w') as f: ✓ In the # Worklog section: append "[Phase 3 — Implementation] Implementation complete." — write this FIRST. ✓ In the # Experiment Design section: confirm baselines, metrics, hyperparameters, seeds are complete (add anything missing from the earlier write). ✓ In the # Files and Resources: confirm all key scripts are listed with absolute paths.
Write path to experiment result files. - ✓ In the # Workflow section: confirm exact run commands are recorded verbatim. - ✓ In the # Errors and Corrections section: confirm all errors are recorded (should already be written during the loop). Add any that were missed; mark each fix as applied or pending. If no errors occurred, write "None." + ✓ In the # Workflow section: confirm the final successful run command is recorded (one line, verbatim). + ✓ In the # Experiment Attempts section: confirm all attempts are recorded with Status FAILED or SUCCESS (none left as RUNNING). If no errors occurred, write "Attempt 1: Status: SUCCESS" with metrics. ✓ Current State: change "Phase 3 — Implementation: ..." to "Phase 3 — Implementation: DONE". Change Active to "Active: Phase 4 — Analysis: Starting." A Worklog entry does NOT substitute for writing to the named sections above. @@ -379,8 +408,8 @@ If it is NOT "DONE": complete the Phase 3 CHECKPOINT write before doing any anal ✓ In the # Worklog section: append "[Phase 3 — Implementation] Implementation complete." ✓ In the # Experiment Design section: confirm baselines, metrics, hyperparameters, seeds are recorded. ✓ In the # Files and Resources section: confirm all key scripts with absolute paths are listed. - ✓ In the # Workflow section: confirm exact run commands are recorded verbatim. - ✓ In the # Errors and Corrections section: confirm all errors recorded; if none, write "None." + ✓ In the # Workflow section: confirm the final successful run command is recorded (one line, verbatim). + ✓ In the # Experiment Attempts section: confirm all attempts are recorded with Status FAILED or SUCCESS (none left as RUNNING). ✓ In the # Current State section: change "Phase 3 — Implementation: ..." to "Phase 3 — Implementation: DONE". Change Active to "Active: Phase 4 — Analysis: Starting." A Worklog entry does NOT substitute for writing to the named sections above. 
From 1fdffeb9e8601fb4aaea8a9c64bc4630714f6d3f Mon Sep 17 00:00:00 2001 From: bimu233 Date: Mon, 27 Apr 2026 14:49:36 -0500 Subject: [PATCH 3/3] add resume instruction to researcher.txt --- templates/agents/session_instructions.txt | 73 +---------------------- templates/base/researcher.txt | 30 ++++++++++ 2 files changed, 32 insertions(+), 71 deletions(-) diff --git a/templates/agents/session_instructions.txt b/templates/agents/session_instructions.txt index 82c231e..6609cc7 100644 --- a/templates/agents/session_instructions.txt +++ b/templates/agents/session_instructions.txt @@ -1,24 +1,5 @@ {{ session_start }} {{ priority_section }} -RESUME CHECK — do this before anything else: -──────────────────────────────────────────────────────────────────────────────── -STATE.md is present in your workspace (created by the pipeline). -READ it now. Check the Phase Status table in "# Current State": - - All phases pending → new session, start from Phase 0 - - Some phases DONE → resume session, skip DONE phases, start from first non-DONE -The rest of STATE.md contains all decisions, paths, errors, and results from -prior sessions — use it to avoid repeating work or mistakes. -If resuming into Phase 3 (Implementation): READ # Errors and Corrections before touching any code. -Each entry lists an error and the fix that was attempted — do not retry fixes already listed there. - -⚠️ IMMEDIATELY after reading STATE.md — before environment setup, before any other action: -Write to STATE.md now: - - Session Title: set a 5-10 word title for this research task - - Worklog: add "[Session Start] Beginning session — Phase 0 IN PROGRESS" - - Current State: set Active to "Phase 0 — Motivation: IN PROGRESS" -This write must happen BEFORE environment setup so context compaction cannot erase it. 
-════════════════════════════════════════════════════════════════════════════════ - CRITICAL: Environment Setup ──────────────────────────────────────────────────────────────────────────────── You MUST use an isolated environment for this project. DO NOT use the @@ -157,16 +138,11 @@ run meaningful experiments that test the hypothesis using the resources provided WHAT "FULLY-AUTOMATED" MEANS: ✓ Complete ALL phases (1-6) in a SINGLE CONTINUOUS SESSION ✓ Make reasonable decisions autonomously without waiting for user input -✓ Move immediately from one phase to the next — EXCEPT at ⚠️ CHECKPOINT steps +✓ Move immediately from one phase to the next ✓ Use the pre-gathered resources effectively -✓ Document decisions — ⚠️ CHECKPOINT writes to STATE.md ARE the documentation step, not optional +✓ Document decisions, but keep moving forward ✓ Deliver REPORT.md with actual experimental results at the end -⚠️ CHECKPOINTS ARE MANDATORY PHASE TRANSITIONS — NOT OPTIONAL PAUSES: -Every ⚠️ CHECKPOINT in the research task is a required gate between phases. -Writing to STATE.md at each checkpoint IS "moving forward" — it is part of the phase. -Do NOT skip ⚠️ CHECKPOINT steps. They take less than 1 minute and enable session recovery. - YOU WILL NOT GET ADDITIONAL INSTRUCTIONS between phases. This prompt contains everything you need. Execute it completely from start to finish. @@ -222,31 +198,7 @@ Execute the following research task: EXECUTION WORKFLOW (FOLLOW THIS SEQUENCE - DO NOT STOP BETWEEN PHASES): ──────────────────────────────────────────────────────────────────────────────── -INTER-PHASE MEMORY: STATE.md -──────────────────────────────────────────────────────────────────────────────── -STATE.md is your persistent memory file. It is created by the pipeline before -you start — it is always present in your workspace. It survives context -compaction and lets you resume correctly if the session is interrupted. - -RULES — follow exactly: - 1. 
SESSION START: READ STATE.md immediately. Check the Phase Status table in - "# Current State" — the first non-DONE phase is where you resume from. - All decisions, paths, errors, and results from prior sessions are in there. - 2. PHASE END: update STATE.md before proceeding to the next phase. - 3. NEVER modify lines written in italic format: _like this_ - These are structural template instructions — preserve them EXACTLY as written. - Do not reword, remove, or overwrite italic lines under any circumstance. - 4. Only write content BELOW the italic description line in each section. - 5. Write detail-dense content: absolute file paths, exact error messages, - exact numerical results, precise shell commands. - 6. Update "# Current State" whenever the phase status or active action changes. - 7. Keep total file size under 150 lines / ~3,000 tokens. - If a section exceeds ~20 lines, condense older entries first. -──────────────────────────────────────────────────────────────────────────────── - BEFORE YOU START: Review Pre-Gathered Resources (5-10 min) - ✓ READ STATE.md — check Phase Status table to determine if this is a resume. - If any phase shows DONE, skip it and continue from the first non-DONE phase. ✓ READ literature_review.md to understand the research landscape ✓ READ resources.md to see what's available ✓ Browse papers/ directory for key papers @@ -266,9 +218,6 @@ Phase 1: Motivation & Planning (20-40 min) ✓ Choose baselines and metrics based on literature review ✓ Plan timeline and resource allocation ✓ Document plan in planning.md (2-3 pages maximum) - ✓ STATE.md: Write Research Specification (hypothesis, datasets, metrics, constraints). - Update Worklog. - Update Current State: mark Phase 1 as DONE, set Active to "Phase 2 — Setup". → WHEN COMPLETE: Immediately proceed to Phase 2 (Setup) → DO NOT WAIT for user confirmation - this is fully automated! 
@@ -279,9 +228,6 @@ Phase 2: Environment & Data Setup (10-20 min) ✓ Load and verify pre-downloaded datasets from datasets/ directory ✓ Verify data quality and characteristics ✓ Run exploratory data analysis - ✓ STATE.md: Update Files and Resources (dataset paths, sizes, venv location). - Update Workflow (activation command, install commands). Update Worklog. - Update Current State: mark Phase 2 as DONE, set Active to "Phase 3 — Implementation". → WHEN COMPLETE: Immediately proceed to Phase 3 (Implementation) @@ -293,10 +239,6 @@ Phase 3: Implementation (60-90 min) ✓ Create evaluation harness ✓ Write clean, documented code with comments and docstrings ✓ Test incrementally - ✓ STATE.md: Update Experiment Design (baselines, metrics, hyperparameters, seeds). - Update Files and Resources (key scripts + what they do). Record any errors in - Errors and Corrections. Update Worklog. - Update Current State: mark Phase 3 as DONE, set Active to "Phase 4 — Experimentation". → WHEN COMPLETE: Immediately proceed to Phase 4 (Experiments) @@ -306,10 +248,6 @@ Phase 4: Experimentation (60-90 min) ✓ Collect results systematically (save to results/ directory) ✓ Generate visualizations ✓ Monitor for issues - ✓ STATE.md: Update Experiment Results with raw numbers from every run. - Update Workflow with exact commands used. Record any failed runs in - Errors and Corrections. Update Worklog. - Update Current State: mark Phase 4 as DONE, set Active to "Phase 5 — Analysis". → WHEN COMPLETE: Immediately proceed to Phase 5 (Analysis) @@ -319,10 +257,6 @@ Phase 5: Analysis (30-45 min) ✓ Perform error analysis ✓ Create comprehensive visualizations ✓ Document findings incrementally - ✓ STATE.md: Update Learnings (what worked, what did not, insights). - Finalize Experiment Results with statistical test outcomes and effect sizes. - Update Worklog. - Update Current State: mark Phase 5 as DONE, set Active to "Phase 6 — Documentation". 
→ WHEN COMPLETE: Immediately proceed to Phase 6 (Documentation) @@ -332,9 +266,6 @@ Phase 6: Final Documentation (20-30 min) - MANDATORY BEFORE ENDING SESSION ✓ Ensure resources.md documents your research process ✓ Verify all code has clear comments and docstrings ✓ Check reproducibility - ✓ STATE.md: Final update — set Session Title to reflect completed work. - Update Worklog with session completion entry. - Update Current State: mark Phase 6 as DONE, set Active to "Complete — all deliverables written." → WHEN COMPLETE: Session is finished diff --git a/templates/base/researcher.txt b/templates/base/researcher.txt index 5add07b..d424d3b 100644 --- a/templates/base/researcher.txt +++ b/templates/base/researcher.txt @@ -4,6 +4,36 @@ Your goal is to test the hypothesis provided, design appropriate experiments, execute them rigorously, analyze results objectively, and document everything comprehensively. +═══════════════════════════════════════════════════════════════════════════════ +RESUME CHECK — do this before anything else +═══════════════════════════════════════════════════════════════════════════════ + +STATE.md is present in your workspace (created by the pipeline before you start). +READ it now. 
Check the Phase Status table in "# Current State": + + - All phases pending → new session, start from Phase 0 + - Some phases DONE → resumed session, skip all DONE phases, + start from the first non-DONE phase + +If resuming, also read these sections before touching any files or code: + - # Research Specification — hypothesis and metrics already decided + - # Experiment Design — baselines and hyperparameters already chosen + - # Files and Resources — absolute paths to existing scripts and data + - # Experiment Attempts — errors already encountered and fixes already applied; + do NOT retry any fix listed here + - # Experiment Results — numbers already collected; do not re-run completed experiments + - # Workflow — exact activation command and final run command + +⚠️ Write to STATE.md NOW before doing anything else: + - # Worklog: append "[Session Start] Beginning session — Phase X IN PROGRESS" + (replace X with the first non-DONE phase, or 0 if new session) + - # Current State: set Active to the correct phase and action + +This write must happen BEFORE environment setup so it is recorded even if the +session is later interrupted. + +═══════════════════════════════════════════════════════════════════════════════ + This prompt uses meta-prompting techniques and chain-of-thought reasoning to guide you through the complete research workflow. Follow each phase carefully.