From 729198b50b08950ae4113ba869697d21ee9f8286 Mon Sep 17 00:00:00 2001
From: bimu233 <liwoyu2333@gmail.com>
Date: Sat, 25 Apr 2026 10:58:34 -0500
Subject: [PATCH 1/3] inter phase memory

---
 src/core/runner.py                            |  10 ++
 templates/agents/session_instructions.txt     |  73 ++++++++-
 templates/base/deliverables/state_template.md |  55 +++++++
 templates/base/researcher.txt                 | 151 +++++++++++++++---
 4 files changed, 268 insertions(+), 21 deletions(-)
 create mode 100644 templates/base/deliverables/state_template.md

diff --git a/src/core/runner.py b/src/core/runner.py
index 9b61fa7..758190e 100644
--- a/src/core/runner.py
+++ b/src/core/runner.py
@@ -300,6 +300,16 @@ def run_research(self, idea_id: str,
         if use_scribe:
             (work_dir / "notebooks").mkdir(parents=True, exist_ok=True)
 
+        # Initialize STATE.md from template if not already present.
+        # The agent reads and updates this file; the pipeline owns creation so
+        # the template structure (italic description lines) is always correct.
+        state_md = work_dir / "STATE.md"
+        if not state_md.exists():
+            state_template = self.project_root / "templates" / "base" / "deliverables" / "state_template.md"
+            if state_template.exists():
+                state_md.write_text(state_template.read_text(encoding='utf-8'), encoding='utf-8')
+                print(f"   Initialized STATE.md")
+
         # Copy helper scripts to workspace
         self._copy_workspace_resources(work_dir)
 
diff --git a/templates/agents/session_instructions.txt b/templates/agents/session_instructions.txt
index 6609cc7..82c231e 100644
--- a/templates/agents/session_instructions.txt
+++ b/templates/agents/session_instructions.txt
@@ -1,5 +1,24 @@
 {{ session_start }}
 {{ priority_section }}
+RESUME CHECK — do this before anything else:
+────────────────────────────────────────────────────────────────────────────────
+STATE.md is present in your workspace (created by the pipeline).
+READ it now. Check the Phase Status table in "# Current State":
+  - All phases pending   → new session, start from Phase 0
+  - Some phases DONE     → resume session, skip DONE phases, start from first non-DONE
+The rest of STATE.md contains all decisions, paths, errors, and results from
+prior sessions — use it to avoid repeating work or mistakes.
+If resuming into Phase 3 (Implementation): READ # Errors and Corrections before touching any code.
+Each entry lists an error and the fix that was attempted — do not retry fixes already listed there.
+
+⚠️  IMMEDIATELY after reading STATE.md — before environment setup, before any other action:
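+Write to STATE.md now (NEW session: use Phase 0 — Motivation; RESUME session: use the first non-DONE phase instead):
+  - Session Title: set a 5-10 word title for this research task (when resuming, keep the existing title)
+  - Worklog: add "[Session Start] Beginning session — Phase N IN PROGRESS" (N = 0 for a new session)
+  - Current State: set Active to "Phase N — <phase name>: IN PROGRESS" (new session: "Phase 0 — Motivation: IN PROGRESS")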
+This write must happen BEFORE environment setup so context compaction cannot erase it.
+════════════════════════════════════════════════════════════════════════════════
+
 CRITICAL: Environment Setup
 ────────────────────────────────────────────────────────────────────────────────
 You MUST use an isolated environment for this project. DO NOT use the
@@ -138,11 +157,16 @@ run meaningful experiments that test the hypothesis using the resources provided
 WHAT "FULLY-AUTOMATED" MEANS:
 ✓ Complete ALL phases (1-6) in a SINGLE CONTINUOUS SESSION
 ✓ Make reasonable decisions autonomously without waiting for user input
-✓ Move immediately from one phase to the next
+✓ Move immediately from one phase to the next — EXCEPT at ⚠️ CHECKPOINT steps
 ✓ Use the pre-gathered resources effectively
-✓ Document decisions, but keep moving forward
+✓ Document decisions — ⚠️ CHECKPOINT writes to STATE.md ARE the documentation step, not optional
 ✓ Deliver REPORT.md with actual experimental results at the end
 
+⚠️ CHECKPOINTS ARE MANDATORY PHASE TRANSITIONS — NOT OPTIONAL PAUSES:
+Every ⚠️ CHECKPOINT in the research task is a required gate between phases.
+Writing to STATE.md at each checkpoint IS "moving forward" — it is part of the phase.
+Do NOT skip ⚠️ CHECKPOINT steps. They take less than 1 minute and enable session recovery.
+
 YOU WILL NOT GET ADDITIONAL INSTRUCTIONS between phases.
 This prompt contains everything you need. Execute it completely from start to finish.
 
@@ -198,7 +222,31 @@ Execute the following research task:
 EXECUTION WORKFLOW (FOLLOW THIS SEQUENCE - DO NOT STOP BETWEEN PHASES):
 ────────────────────────────────────────────────────────────────────────────────
 
+INTER-PHASE MEMORY: STATE.md
+────────────────────────────────────────────────────────────────────────────────
+STATE.md is your persistent memory file. It is created by the pipeline before
+you start — it is always present in your workspace. It survives context
+compaction and lets you resume correctly if the session is interrupted.
+
+RULES — follow exactly:
+  1. SESSION START: READ STATE.md immediately. Check the Phase Status table in
+     "# Current State" — the first non-DONE phase is where you resume from.
+     All decisions, paths, errors, and results from prior sessions are in there.
+  2. PHASE END: update STATE.md before proceeding to the next phase.
+  3. NEVER modify lines written in italic format: _like this_
+     These are structural template instructions — preserve them EXACTLY as written.
+     Do not reword, remove, or overwrite italic lines under any circumstance.
+  4. Only write content BELOW the italic description line in each section.
+  5. Write detail-dense content: absolute file paths, exact error messages,
+     exact numerical results, precise shell commands.
+  6. Update "# Current State" whenever the phase status or active action changes.
+  7. Keep total file size under 150 lines / ~3,000 tokens.
+     If a section exceeds ~20 lines, condense older entries first.
+────────────────────────────────────────────────────────────────────────────────
+
 BEFORE YOU START: Review Pre-Gathered Resources (5-10 min)
+  ✓ READ STATE.md — check Phase Status table to determine if this is a resume.
+    If any phase shows DONE, skip it and continue from the first non-DONE phase.
   ✓ READ literature_review.md to understand the research landscape
   ✓ READ resources.md to see what's available
   ✓ Browse papers/ directory for key papers
@@ -218,6 +266,9 @@ Phase 1: Motivation & Planning (20-40 min)
   ✓ Choose baselines and metrics based on literature review
   ✓ Plan timeline and resource allocation
   ✓ Document plan in planning.md (2-3 pages maximum)
+  ✓ STATE.md: Write Research Specification (hypothesis, datasets, metrics, constraints).
+    Update Worklog.
+    Update Current State: mark Phase 1 as DONE, set Active to "Phase 2 — Setup".
 
   → WHEN COMPLETE: Immediately proceed to Phase 2 (Setup)
   → DO NOT WAIT for user confirmation - this is fully automated!
@@ -228,6 +279,9 @@ Phase 2: Environment & Data Setup (10-20 min)
   ✓ Load and verify pre-downloaded datasets from datasets/ directory
   ✓ Verify data quality and characteristics
   ✓ Run exploratory data analysis
+  ✓ STATE.md: Update Files and Resources (dataset paths, sizes, venv location).
+    Update Workflow (activation command, install commands). Update Worklog.
+    Update Current State: mark Phase 2 as DONE, set Active to "Phase 3 — Implementation".
 
   → WHEN COMPLETE: Immediately proceed to Phase 3 (Implementation)
 
@@ -239,6 +293,10 @@ Phase 3: Implementation (60-90 min)
   ✓ Create evaluation harness
   ✓ Write clean, documented code with comments and docstrings
   ✓ Test incrementally
+  ✓ STATE.md: Update Experiment Design (baselines, metrics, hyperparameters, seeds).
+    Update Files and Resources (key scripts + what they do). Record any errors in
+    Errors and Corrections. Update Worklog.
+    Update Current State: mark Phase 3 as DONE, set Active to "Phase 4 — Experimentation".
 
   → WHEN COMPLETE: Immediately proceed to Phase 4 (Experiments)
 
@@ -248,6 +306,10 @@ Phase 4: Experimentation (60-90 min)
   ✓ Collect results systematically (save to results/ directory)
   ✓ Generate visualizations
   ✓ Monitor for issues
+  ✓ STATE.md: Update Experiment Results with raw numbers from every run.
+    Update Workflow with exact commands used. Record any failed runs in
+    Errors and Corrections. Update Worklog.
+    Update Current State: mark Phase 4 as DONE, set Active to "Phase 5 — Analysis".
 
   → WHEN COMPLETE: Immediately proceed to Phase 5 (Analysis)
 
@@ -257,6 +319,10 @@ Phase 5: Analysis (30-45 min)
   ✓ Perform error analysis
   ✓ Create comprehensive visualizations
   ✓ Document findings incrementally
+  ✓ STATE.md: Update Learnings (what worked, what did not, insights).
+    Finalize Experiment Results with statistical test outcomes and effect sizes.
+    Update Worklog.
+    Update Current State: mark Phase 5 as DONE, set Active to "Phase 6 — Documentation".
 
   → WHEN COMPLETE: Immediately proceed to Phase 6 (Documentation)
 
@@ -266,6 +332,9 @@ Phase 6: Final Documentation (20-30 min) - MANDATORY BEFORE ENDING SESSION
   ✓ Ensure resources.md documents your research process
   ✓ Verify all code has clear comments and docstrings
   ✓ Check reproducibility
+  ✓ STATE.md: Final update — set Session Title to reflect completed work.
+    Update Worklog with session completion entry.
+    Update Current State: mark Phase 6 as DONE, set Active to "Complete — all deliverables written."
 
   → WHEN COMPLETE: Session is finished
 
diff --git a/templates/base/deliverables/state_template.md b/templates/base/deliverables/state_template.md
new file mode 100644
index 0000000..9b71e22
--- /dev/null
+++ b/templates/base/deliverables/state_template.md
@@ -0,0 +1,55 @@
+# Session Title
+_5-10 word descriptive title summarizing the research task. Update each phase._
+
+# Current State
+_Active phase number and name. What is immediately pending. The single most important next action.
+Write a Phase Status table (all 7 phases: DONE / IN PROGRESS / pending) then the active action.
+Update this section whenever the phase status or active action changes._
+
+Phase 0 — Motivation:     pending
+Phase 1 — Planning:       pending
+Phase 2 — Setup:          pending
+Phase 3 — Implementation: pending
+Phase 4 — Analysis:       pending
+Phase 5 — Documentation:  pending
+Phase 6 — Validation:     pending
+
+Active: Phase 0 — Motivation: [next action here]
+
+# Worklog
+_One terse line per significant action. Phase transitions explicitly noted.
+Format: [Phase N — Name] action taken — outcome._
+
+
+# Research Specification
+_Hypothesis being tested. Datasets used (name + absolute path). Evaluation metrics and how computed. Key constraints.
+Written in Phase 1. NEVER overwrite this content — only append corrections below it._
+
+
+# Files and Resources
+_Important files with absolute paths. Datasets: name, location, size. Key scripts and what they do.
+Model checkpoints. Configuration files. Output directories._
+
+# Workflow
+_Exact commands to reproduce experiments in order. Environment activation command.
+How to interpret outputs. Package install commands used._
+
+# Errors and Corrections
+_Errors encountered: exact message, file path, line number, and how fixed.
+Approaches that FAILED and must NOT be retried — include the reason they failed.
+This is the anti-repetition record: if it is listed here, do not attempt it again._
+
+# Experiment Design
+_Baselines chosen and justification. Evaluation metrics and how computed.
+Hyperparameters, random seeds, train/val/test splits. Architecture decisions and rationale._
+
+# Learnings
+_What worked and why. What did not work and why. Surprising findings. Domain insights.
+Do not duplicate content already recorded in other sections._
+
+# Experiment Results
+_Exact numerical results: metric name, value, std, conditions. Complete comparison tables.
+Record actual numbers, not prose summaries._
+
+
+
diff --git a/templates/base/researcher.txt b/templates/base/researcher.txt
index 16e655c..efe485b 100644
--- a/templates/base/researcher.txt
+++ b/templates/base/researcher.txt
@@ -71,6 +71,10 @@ For each planned experiment:
 
 DO NOT proceed to implementation until this section is complete.
 
+⚠️  CHECKPOINT — STATE.md is present (created by the pipeline). Write to it NOW before continuing:
+   ✓ In the # Worklog section: append "[Phase 0 — Motivation] Novelty assessment complete."
+   ✓ In the # Current State section: change "Phase 0 — Motivation:     pending" to "Phase 0 — Motivation:     DONE". Change Active to "Active: Phase 1 — Planning: Starting."
+
 ═══════════════════════════════════════════════════════════════════════════════
 AFTER COMPLETING PHASE 0: IMMEDIATELY BEGIN PHASE 1
 ─────────────────────────────────────────────────────────────────────────────
@@ -180,21 +184,29 @@ Must include:
 
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
 
+⚠️  CHECKPOINT — STATE.md is present (created by the pipeline). Write to it NOW before continuing:
+   ✓ In the # Research Specification section: write hypothesis, metrics + how computed, key constraints.
+   ✓ In the # Files and Resources section: write absolute path to planning.md, dataset names + absolute paths.
+   ✓ In the # Worklog section: append "[Phase 1 — Planning] Plan complete."
+   ✓ In the # Current State section: change "Phase 1 — Planning:       pending" to "Phase 1 — Planning:       DONE". Change Active to "Active: Phase 2 — Setup: Starting."
+
 ═══════════════════════════════════════════════════════════════════════════════
 AFTER COMPLETING PHASE 1: IMMEDIATELY BEGIN PHASE 2
 ─────────────────────────────────────────────────────────────────────────────
 
 ✓ Review your planning.md to ensure it's complete
-✓ NOW START IMPLEMENTING - Do not wait for user confirmation
+✓ In the # Worklog section of STATE.md: append "[Phase 2 — Setup] Starting."
+✓ In the # Current State section: change "Phase 2 — Setup:          pending" to "Phase 2 — Setup:          in progress"
+✓ NOW PROCEED to setup - Do not wait for user confirmation
 ✓ Remember: This is a fully automated research system
-✓ The goal is to complete all phases (1-6) in a single continuous session
+✓ The goal is to complete all phases (0-6) in a single continuous session
 
 ═══════════════════════════════════════════════════════════════════════════════
 
-PHASE 2: IMPLEMENTATION
+PHASE 2: SETUP
 ─────────────────────────────────────────────────────────────────────────────
 
-Execute your plan systematically and carefully:
+Prepare the environment and data before writing any code:
 
 1. Environment Setup
    ✓ Install required dependencies
@@ -214,7 +226,31 @@ Execute your plan systematically and carefully:
    ✓ Save example samples for documentation
    ✓ Visualize data distributions
 
-3. Implementation (adapt to your research type)
+⚠️  CHECKPOINT — STATE.md is present (created by the pipeline). Write to it NOW before continuing:
+   ✓ In the # Worklog section: append "[Phase 2 — Setup] Environment and data ready." — write this FIRST.
+   ✓ In the # Files and Resources section: append dataset absolute path + size, venv absolute path, Python version, GPU model/memory.
+   ✓ In the # Workflow section: write exact activation command (e.g. source /path/.venv/bin/activate), exact package install commands used.
+   ✓ In the # Current State section: change "Phase 2 — Setup:          in progress" to "Phase 2 — Setup:          DONE". Change Active to "Active: Phase 3 — Implementation: Starting."
+A Worklog entry does NOT substitute for writing to the named sections above.
+
+═══════════════════════════════════════════════════════════════════════════════
+AFTER COMPLETING PHASE 2: IMMEDIATELY BEGIN PHASE 3
+─────────────────────────────────────────────────────────────────────────────
+
+✓ Environment is ready with all dependencies installed
+✓ Datasets are loaded and validated
+✓ In the # Worklog section of STATE.md: append "[Phase 3 — Implementation] Starting."
+✓ In the # Current State section: change "Phase 3 — Implementation: pending" to "Phase 3 — Implementation: in progress"
+✓ NOW START IMPLEMENTING - Do not wait for user confirmation
+
+═══════════════════════════════════════════════════════════════════════════════
+
+PHASE 3: IMPLEMENTATION
+─────────────────────────────────────────────────────────────────────────────
+
+Execute your plan systematically and carefully:
+
+1. Implementation (adapt to your research type)
    ✓ Follow the approach outlined in your research plan
    ✓ Implement reference methods or baselines first (if applicable)
    ✓ Implement your proposed method, analysis, or computational workflow
@@ -230,19 +266,42 @@ Execute your plan systematically and carefully:
    ✓ Use descriptive variable names
    ✓ Add assertions to validate assumptions
 
-5. Error Handling
+2. Error Handling
    ✓ Use try-except blocks for robustness
    ✓ Log errors with full context
    ✓ Don't ignore warnings - investigate them
    ✓ Validate assumptions with assertions
    ✓ Handle edge cases gracefully
 
-6. Iterative Development
+⚠️  MANDATORY STATE.md UPDATE — do this NOW, after writing the experiment script and BEFORE running it.
+This is a SEPARATE STATE.md edit from the Phase 3 end CHECKPOINT — do not defer or combine them.
+
+   ✓ In the # Worklog section: append "[Phase 3 — Implementation] Experiment script written — starting experiments." — write this FIRST.
+   ✓ In the # Experiment Design section: Write baselines chosen + justification, metrics + how computed, hyperparameters, random seeds, train/val/test splits.
+   ✓ In the # Files and Resources section: write absolute path to experiment script + one-line description of what it does.
+   ✓ In the # Workflow section: Write exact command to run the experiment — copy it verbatim (e.g. python /abs/path/experiment.py).
+   ✓ In the # Current State section: change Active to "Active: Phase 3 — Implementation: running experiments".
+A Worklog entry does NOT substitute for writing to the named sections above.
+
+3. Iterative Development — follow this loop exactly:
+
+   STEP A — Run the experiment.
+
+   STEP B — If the run FAILED:
+   ⚠️  MANDATORY — do NOT touch any code until this STATE.md write is done:
+   Append to the # Errors and Corrections section of STATE.md:
+       Error: [exact last line of traceback or error message]
+       File:  [file path and line number]
+       Fix:   [what you are about to change and why]
+   This write is the gate. You cannot apply a fix until it is written.
+   Then apply the fix and return to STEP A.
+
+   STEP C — If the run SUCCEEDED: proceed past this loop to the Phase 3 CHECKPOINT.
+
+   Additional guidance:
    ✓ Start small - validate on tiny dataset first
    ✓ Gradually scale up
-   ✓ Monitor resource usage
-   ✓ Checkpoint before long-running operations
-   ✓ Document deviations from plan
+   ✓ Before each run: READ # Errors and Corrections — do not retry a fix already listed there
 
 BEST PRACTICES:
 
@@ -292,19 +351,39 @@ with open('results/config.json', 'w') as f:
     json.dump(config, f, indent=2)
 ```
 
+⚠️  CHECKPOINT — STATE.md is present (created by the pipeline). Write to it NOW before continuing:
+   ✓ In the # Worklog section: append "[Phase 3 — Implementation] Implementation complete." — write this FIRST.
+   ✓ In the # Experiment Design section: confirm baselines, metrics, hyperparameters, seeds are complete (add anything missing from the earlier write).
+   ✓ In the # Files and Resources section: confirm all key scripts are listed with absolute paths. Write path to experiment result files.
+   ✓ In the # Workflow section: confirm exact run commands are recorded verbatim.
+   ✓ In the # Errors and Corrections section: confirm all errors are recorded (should already be written during the loop). Add any that were missed; mark each fix as applied or pending. If no errors occurred, write "None."
+   ✓ Current State: change "Phase 3 — Implementation: ..." to "Phase 3 — Implementation: DONE". Change Active to "Active: Phase 4 — Analysis: Starting."
+A Worklog entry does NOT substitute for writing to the named sections above.
+
 ═══════════════════════════════════════════════════════════════════════════════
-AFTER COMPLETING PHASE 2: IMMEDIATELY BEGIN PHASE 3
+AFTER COMPLETING PHASE 3: IMMEDIATELY BEGIN PHASE 4
 ─────────────────────────────────────────────────────────────────────────────
 
 ✓ Verify your implementation is working
+✓ In the # Worklog section of STATE.md: append "[Phase 4 — Analysis] Starting."
 ✓ NOW PROCEED to analysis - continue the automated workflow
 ✓ Use your implementation to analyze results and test hypotheses
 
 ═══════════════════════════════════════════════════════════════════════════════
 
-PHASE 3: ANALYSIS
+PHASE 4: ANALYSIS
 ─────────────────────────────────────────────────────────────────────────────
 
+⚠️  FIRST ACTION — READ STATE.md now. Find the line "Phase 3 — Implementation: ..." in # Current State.
+If it is NOT "DONE": complete the Phase 3 CHECKPOINT write before doing any analysis:
+   ✓ In the # Worklog section: append "[Phase 3 — Implementation] Implementation complete."
+   ✓ In the # Experiment Design section: confirm baselines, metrics, hyperparameters, seeds are recorded.
+   ✓ In the # Files and Resources section: confirm all key scripts with absolute paths are listed.
+   ✓ In the # Workflow section: confirm exact run commands are recorded verbatim.
+   ✓ In the # Errors and Corrections section: confirm all errors recorded; if none, write "None."
+   ✓ In the # Current State section: change "Phase 3 — Implementation: ..." to "Phase 3 — Implementation: DONE". Change Active to "Active: Phase 4 — Analysis: Starting."
+   A Worklog entry does NOT substitute for writing to the named sections above.
+
 Interpret results rigorously using chain-of-thought reasoning:
 
 1. Descriptive Statistics
@@ -358,19 +437,35 @@ For each result, ask yourself:
 4. What additional evidence would strengthen this conclusion?
 5. What are the practical implications?
 
+⚠️  CHECKPOINT — STATE.md is present (created by the pipeline). Write to it NOW before continuing:
+   ✓ In the # Worklog section: append "[Phase 4 — Analysis] Analysis complete." — write this FIRST.
+   ✓ In the # Experiment Results section: Write exact numbers for every metric, every condition, every baseline — comparison table format. Numbers, not prose.
+   ✓ In the # Learnings section: write what worked and why (1-2 lines per finding). What failed and why. Surprising findings. Domain insights.
+   ✓ In the # Current State section: change "Phase 4 — Analysis: ..." to "Phase 4 — Analysis:       DONE". Change Active to "Active: Phase 5 — Documentation: Starting."
+A Worklog entry does NOT substitute for writing to the named sections above.
+
 ═══════════════════════════════════════════════════════════════════════════════
-AFTER COMPLETING PHASE 3: IMMEDIATELY BEGIN PHASE 4
+AFTER COMPLETING PHASE 4: IMMEDIATELY BEGIN PHASE 5
 ─────────────────────────────────────────────────────────────────────────────
 
 ✓ Verify your analysis is complete with statistical tests and visualizations
-✓ NOW PROCEED to documentation - this is the final phase
+✓ In the # Worklog section of STATE.md: append "[Phase 5 — Documentation] Starting."
+✓ NOW PROCEED to documentation
 ✓ Document all your findings, methodology, and results in REPORT.md
 
 ═══════════════════════════════════════════════════════════════════════════════
 
-PHASE 4: DOCUMENTATION
+PHASE 5: DOCUMENTATION
 ─────────────────────────────────────────────────────────────────────────────
 
+⚠️  FIRST ACTION — READ STATE.md now. Find the line "Phase 4 — Analysis: ..." in # Current State.
+If it is NOT "DONE": complete the Phase 4 CHECKPOINT write before writing any documentation:
+   ✓ In the # Worklog section: append "[Phase 4 — Analysis] Analysis complete."
+   ✓ Experiment Results: exact numbers for every metric, every condition, every baseline — table format.
+   ✓ Learnings: what worked and why, what failed and why, surprising findings.
+   ✓ Current State: change "Phase 4 — Analysis: ..." to "Phase 4 — Analysis:       DONE". Change Active to "Active: Phase 5 — Documentation: Starting."
+   A Worklog entry does NOT substitute for writing to the named sections above.
+
 Create comprehensive documentation that allows others (and your future self)
 to understand and reproduce your work.
 
@@ -526,19 +621,31 @@ Common issues and solutions
 
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
 
+⚠️  CHECKPOINT — STATE.md is present (created by the pipeline). Write to it NOW before continuing:
+   ✓ In the # Worklog section: append "[Phase 5 — Documentation] REPORT.md written." — write this FIRST.
+   ✓ Files and Resources: add REPORT.md absolute path.
+   ✓ Current State: change "Phase 5 — Documentation: ..." to "Phase 5 — Documentation:  DONE". Change Active to "Active: Phase 6 — Validation: Starting."
+
 ═══════════════════════════════════════════════════════════════════════════════
-AFTER COMPLETING PHASE 4: IMMEDIATELY BEGIN PHASE 5
+AFTER COMPLETING PHASE 5: IMMEDIATELY BEGIN PHASE 6
 ─────────────────────────────────────────────────────────────────────────────
 
 ✓ Verify REPORT.md and README.md are complete with actual results
+✓ In the # Worklog section of STATE.md: append "[Phase 6 — Validation] Starting."
 ✓ NOW PROCEED to final validation - the last step before completion
 ✓ Validate everything is reproducible and scientifically sound
 
 ═══════════════════════════════════════════════════════════════════════════════
 
-PHASE 5: VALIDATION
+PHASE 6: VALIDATION
 ─────────────────────────────────────────────────────────────────────────────
 
+⚠️  FIRST ACTION — READ STATE.md now. Find the line "Phase 5 — Documentation: ..." in # Current State.
+If it is NOT "DONE": complete the Phase 5 CHECKPOINT write before validating:
+   ✓ In the # Worklog section: append "[Phase 5 — Documentation] REPORT.md written."
+   ✓ In the # Files and Resources section: add REPORT.md absolute path.
+   ✓ In the # Current State section: change "Phase 5 — Documentation: ..." to "Phase 5 — Documentation:  DONE". Change Active to "Active: Phase 6 — Validation: Starting."
+
 Before considering the work complete, validate everything:
 
 ✓ CODE VALIDATION
@@ -572,11 +679,17 @@ Before considering the work complete, validate everything:
 
 If any checkbox is unchecked, address it before finishing.
 
+⚠️  CHECKPOINT — STATE.md is present (created by the pipeline). Write to it NOW before continuing:
+   ✓ Session Title: set a 5-10 word summary of what was accomplished.
+   ✓ In the # Worklog section: append "[Phase 6 — Validation] Session complete."
+   ✓ Current State: change "Phase 6 — Validation: ..." to "Phase 6 — Validation:     DONE". Change Active to "Active: Complete — all deliverables written."
+NEVER modify italic _description_ lines in STATE.md.
+
 ═══════════════════════════════════════════════════════════════════════════════
-AFTER COMPLETING PHASE 5: RESEARCH SESSION IS COMPLETE
+AFTER COMPLETING PHASE 6: RESEARCH SESSION IS COMPLETE
 ─────────────────────────────────────────────────────────────────────────────
 
-✓ All phases (1-5) have been completed
+✓ All phases (0-6) have been completed
 ✓ REPORT.md contains actual experimental results
 ✓ All validation checks have passed
 ✓ The research session is now finished

From 601e0f44095a372bc963625385859ea3bc185ded Mon Sep 17 00:00:00 2001
From: bimu233 <liwoyu2333@gmail.com>
Date: Sun, 26 Apr 2026 10:51:43 -0500
Subject: [PATCH 2/3] experiment attempts error and fix log enabled

---
 templates/base/deliverables/state_template.md | 20 +++---
 templates/base/researcher.txt                 | 61 ++++++++++++++-----
 2 files changed, 55 insertions(+), 26 deletions(-)

diff --git a/templates/base/deliverables/state_template.md b/templates/base/deliverables/state_template.md
index 9b71e22..493b8ce 100644
--- a/templates/base/deliverables/state_template.md
+++ b/templates/base/deliverables/state_template.md
@@ -31,25 +31,25 @@ _Important files with absolute paths. Datasets: name, location, size. Key script
 Model checkpoints. Configuration files. Output directories._
 
 # Workflow
-_Exact commands to reproduce experiments in order. Environment activation command.
-How to interpret outputs. Package install commands used._
-
-# Errors and Corrections
-_Errors encountered: exact message, file path, line number, and how fixed.
-Approaches that FAILED and must NOT be retried — include the reason they failed.
-This is the anti-repetition record: if it is listed here, do not attempt it again._
+_Final successful reproduction commands only: environment activation command, then the exact run command.
+Do not list failed attempts here — those belong in # Experiment Attempts._
 
 # Experiment Design
 _Baselines chosen and justification. Evaluation metrics and how computed.
 Hyperparameters, random seeds, train/val/test splits. Architecture decisions and rationale._
 
-# Learnings
-_What worked and why. What did not work and why. Surprising findings. Domain insights.
-Do not duplicate content already recorded in other sections._
+# Experiment Attempts
+_One entry per run. Written BEFORE each run (Status: RUNNING) and updated AFTER (Status: FAILED or SUCCESS).
+Never delete entries, and never edit an entry once its Status is FAILED or SUCCESS — only the current RUNNING entry may be updated._
 
 # Experiment Results
 _Exact numerical results: metric name, value, std, conditions. Complete comparison tables.
 Record actual numbers, not prose summaries._
 
+# Learnings
+_What worked and why. What did not work and why. Surprising findings. Domain insights.
+Do not duplicate content already recorded in other sections._
+
+
 
 
diff --git a/templates/base/researcher.txt b/templates/base/researcher.txt
index efe485b..5add07b 100644
--- a/templates/base/researcher.txt
+++ b/templates/base/researcher.txt
@@ -229,7 +229,7 @@ Prepare the environment and data before writing any code:
 ⚠️  CHECKPOINT — STATE.md is present (created by the pipeline). Write to it NOW before continuing:
    ✓ In the # Worklog section: append "[Phase 2 — Setup] Environment and data ready." — write this FIRST.
    ✓ In the # Files and Resources section: append dataset absolute path + size, venv absolute path, Python version, GPU model/memory.
-   ✓ In the # Workflow section: write exact activation command (e.g. source /path/.venv/bin/activate), exact package install commands used.
+   ✓ In the # Workflow section: write exact environment activation command (e.g. source /path/.venv/bin/activate). Package install commands belong in # Files and Resources, not here.
    ✓ In the # Current State section: change "Phase 2 — Setup:          in progress" to "Phase 2 — Setup:          DONE". Change Active to "Active: Phase 3 — Implementation: Starting."
 A Worklog entry does NOT substitute for writing to the named sections above.
 
@@ -283,25 +283,54 @@ This is a SEPARATE STATE.md edit from the Phase 3 end CHECKPOINT — do not defe
    ✓ In the # Current State section: change Active to "Active: Phase 3 — Implementation: running experiments".
 A Worklog entry does NOT substitute for writing to the named sections above.
 
-3. Iterative Development — follow this loop exactly:
+3. Iterative Development — follow this protocol exactly, one attempt at a time:
 
-   STEP A — Run the experiment.
+   STEP A — BEFORE RUN (write first, run second):
+   Append to the # Experiment Attempts section of STATE.md:
 
-   STEP B — If the run FAILED:
-   ⚠️  MANDATORY — do NOT touch any code until this STATE.md write is done:
-   Append to the # Errors and Corrections section of STATE.md:
-       Error: [exact last line of traceback or error message]
-       File:  [file path and line number]
-       Fix:   [what you are about to change and why]
-   This write is the gate. You cannot apply a fix until it is written.
+       Attempt N:
+       - Status: RUNNING
+       - Command: [exact command you are about to run]
+
+   Do not run the experiment until this write is complete.
+
+   STEP B — RUN the experiment.
+
+   STEP C — AFTER RUN, update the Attempt N entry in STATE.md immediately.
+
+   A run is FAILED if ANY of the following occur:
+   - The process exits with a non-zero code
+   - A Python exception or traceback appears in output
+   - An output file that should exist is missing
+   - Metrics contain NaN, Inf, or nonsensical values
+   - You need to change any code before running again
+
+   IF FAILED — BEFORE touching any code, update the Attempt N entry in the # Experiment Attempts section of STATE.md so that it reads:
+       Attempt N:
+       - Status: FAILED
+       - Command: [exact command run]
+       - Error: [exact last line of traceback or error message]
+       - Fix Applied: [what you are about to change and why]
+       - Output: [any partial output saved, or None]
    Then apply the fix and return to STEP A.
 
-   STEP C — If the run SUCCEEDED: proceed past this loop to the Phase 3 CHECKPOINT.
+   IF SUCCESS — update the Attempt N entry in the # Experiment Attempts section of STATE.md so that it reads:
+       Attempt N:
+       - Status: SUCCESS
+       - Command: [exact command run]
+       - Metrics: [key metric values]
+       - Output Files: [paths to result files]
+   Then proceed past this loop to the Phase 3 CHECKPOINT.
+
+   ⚠️  Hard constraint: You are NOT allowed to modify code or run the next
+   experiment until the current Attempt entry shows Status: FAILED or SUCCESS
+   (not RUNNING). Writing RUNNING and FAILED in one combined edit is allowed —
+   the constraint is that the FAILED entry exists before any code changes.
 
    Additional guidance:
    ✓ Start small - validate on tiny dataset first
    ✓ Gradually scale up
-   ✓ Before each run: READ # Errors and Corrections — do not retry a fix already listed there
+   ✓ Before each run: READ # Experiment Attempts — do not retry a fix already listed there
 
 BEST PRACTICES:
 
@@ -355,8 +384,8 @@ with open('results/config.json', 'w') as f:
    ✓ In the # Worklog section: append "[Phase 3 — Implementation] Implementation complete." — write this FIRST.
    ✓ In the # Experiment Design section: confirm baselines, metrics, hyperparameters, seeds are complete (add anything missing from the earlier write).
    ✓ In the # Files and Resources: confirm all key scripts are listed with absolute paths. Write path to experiment result files.
-   ✓ In the # Workflow section: confirm exact run commands are recorded verbatim.
-   ✓ In the # Errors and Corrections section: confirm all errors are recorded (should already be written during the loop). Add any that were missed; mark each fix as applied or pending. If no errors occurred, write "None."
+   ✓ In the # Workflow section: confirm the final successful run command is recorded (one line, verbatim).
+   ✓ In the # Experiment Attempts section: confirm all attempts are recorded with Status FAILED or SUCCESS (none left as RUNNING). If no run failed, the section should contain a single "Attempt 1" entry with Status: SUCCESS and its metrics.
    ✓ Current State: change "Phase 3 — Implementation: ..." to "Phase 3 — Implementation: DONE". Change Active to "Active: Phase 4 — Analysis: Starting."
 A Worklog entry does NOT substitute for writing to the named sections above.
 
@@ -379,8 +408,8 @@ If it is NOT "DONE": complete the Phase 3 CHECKPOINT write before doing any anal
    ✓ In the # Worklog section: append "[Phase 3 — Implementation] Implementation complete."
    ✓ In the # Experiment Design section: confirm baselines, metrics, hyperparameters, seeds are recorded.
    ✓ In the # Files and Resources section: confirm all key scripts with absolute paths are listed.
-   ✓ In the # Workflow section: confirm exact run commands are recorded verbatim.
-   ✓ In the # Errors and Corrections section: confirm all errors recorded; if none, write "None."
+   ✓ In the # Workflow section: confirm the final successful run command is recorded (one line, verbatim).
+   ✓ In the # Experiment Attempts section: confirm all attempts are recorded with Status FAILED or SUCCESS (none left as RUNNING).
    ✓ In the # Current State section: change "Phase 3 — Implementation: ..." to "Phase 3 — Implementation: DONE". Change Active to "Active: Phase 4 — Analysis: Starting."
    A Worklog entry does NOT substitute for writing to the named sections above.
 

From 1fdffeb9e8601fb4aaea8a9c64bc4630714f6d3f Mon Sep 17 00:00:00 2001
From: bimu233 <liwoyu2333@gmail.com>
Date: Mon, 27 Apr 2026 14:49:36 -0500
Subject: [PATCH 3/3] add resume instruction to researcher.txt

---
 templates/agents/session_instructions.txt | 73 +----------------------
 templates/base/researcher.txt             | 30 ++++++++++
 2 files changed, 32 insertions(+), 71 deletions(-)

diff --git a/templates/agents/session_instructions.txt b/templates/agents/session_instructions.txt
index 82c231e..6609cc7 100644
--- a/templates/agents/session_instructions.txt
+++ b/templates/agents/session_instructions.txt
@@ -1,24 +1,5 @@
 {{ session_start }}
 {{ priority_section }}
-RESUME CHECK — do this before anything else:
-────────────────────────────────────────────────────────────────────────────────
-STATE.md is present in your workspace (created by the pipeline).
-READ it now. Check the Phase Status table in "# Current State":
-  - All phases pending   → new session, start from Phase 0
-  - Some phases DONE     → resume session, skip DONE phases, start from first non-DONE
-The rest of STATE.md contains all decisions, paths, errors, and results from
-prior sessions — use it to avoid repeating work or mistakes.
-If resuming into Phase 3 (Implementation): READ # Errors and Corrections before touching any code.
-Each entry lists an error and the fix that was attempted — do not retry fixes already listed there.
-
-⚠️  IMMEDIATELY after reading STATE.md — before environment setup, before any other action:
-Write to STATE.md now:
-  - Session Title: set a 5-10 word title for this research task
-  - Worklog: add "[Session Start] Beginning session — Phase 0 IN PROGRESS"
-  - Current State: set Active to "Phase 0 — Motivation: IN PROGRESS"
-This write must happen BEFORE environment setup so context compaction cannot erase it.
-════════════════════════════════════════════════════════════════════════════════
-
 CRITICAL: Environment Setup
 ────────────────────────────────────────────────────────────────────────────────
 You MUST use an isolated environment for this project. DO NOT use the
@@ -157,16 +138,11 @@ run meaningful experiments that test the hypothesis using the resources provided
 WHAT "FULLY-AUTOMATED" MEANS:
 ✓ Complete ALL phases (1-6) in a SINGLE CONTINUOUS SESSION
 ✓ Make reasonable decisions autonomously without waiting for user input
-✓ Move immediately from one phase to the next — EXCEPT at ⚠️ CHECKPOINT steps
+✓ Move immediately from one phase to the next
 ✓ Use the pre-gathered resources effectively
-✓ Document decisions — ⚠️ CHECKPOINT writes to STATE.md ARE the documentation step, not optional
+✓ Document decisions, but keep moving forward
 ✓ Deliver REPORT.md with actual experimental results at the end
 
-⚠️ CHECKPOINTS ARE MANDATORY PHASE TRANSITIONS — NOT OPTIONAL PAUSES:
-Every ⚠️ CHECKPOINT in the research task is a required gate between phases.
-Writing to STATE.md at each checkpoint IS "moving forward" — it is part of the phase.
-Do NOT skip ⚠️ CHECKPOINT steps. They take less than 1 minute and enable session recovery.
-
 YOU WILL NOT GET ADDITIONAL INSTRUCTIONS between phases.
 This prompt contains everything you need. Execute it completely from start to finish.
 
@@ -222,31 +198,7 @@ Execute the following research task:
 EXECUTION WORKFLOW (FOLLOW THIS SEQUENCE - DO NOT STOP BETWEEN PHASES):
 ────────────────────────────────────────────────────────────────────────────────
 
-INTER-PHASE MEMORY: STATE.md
-────────────────────────────────────────────────────────────────────────────────
-STATE.md is your persistent memory file. It is created by the pipeline before
-you start — it is always present in your workspace. It survives context
-compaction and lets you resume correctly if the session is interrupted.
-
-RULES — follow exactly:
-  1. SESSION START: READ STATE.md immediately. Check the Phase Status table in
-     "# Current State" — the first non-DONE phase is where you resume from.
-     All decisions, paths, errors, and results from prior sessions are in there.
-  2. PHASE END: update STATE.md before proceeding to the next phase.
-  3. NEVER modify lines written in italic format: _like this_
-     These are structural template instructions — preserve them EXACTLY as written.
-     Do not reword, remove, or overwrite italic lines under any circumstance.
-  4. Only write content BELOW the italic description line in each section.
-  5. Write detail-dense content: absolute file paths, exact error messages,
-     exact numerical results, precise shell commands.
-  6. Update "# Current State" whenever the phase status or active action changes.
-  7. Keep total file size under 150 lines / ~3,000 tokens.
-     If a section exceeds ~20 lines, condense older entries first.
-────────────────────────────────────────────────────────────────────────────────
-
 BEFORE YOU START: Review Pre-Gathered Resources (5-10 min)
-  ✓ READ STATE.md — check Phase Status table to determine if this is a resume.
-    If any phase shows DONE, skip it and continue from the first non-DONE phase.
   ✓ READ literature_review.md to understand the research landscape
   ✓ READ resources.md to see what's available
   ✓ Browse papers/ directory for key papers
@@ -266,9 +218,6 @@ Phase 1: Motivation & Planning (20-40 min)
   ✓ Choose baselines and metrics based on literature review
   ✓ Plan timeline and resource allocation
   ✓ Document plan in planning.md (2-3 pages maximum)
-  ✓ STATE.md: Write Research Specification (hypothesis, datasets, metrics, constraints).
-    Update Worklog.
-    Update Current State: mark Phase 1 as DONE, set Active to "Phase 2 — Setup".
 
   → WHEN COMPLETE: Immediately proceed to Phase 2 (Setup)
   → DO NOT WAIT for user confirmation - this is fully automated!
@@ -279,9 +228,6 @@ Phase 2: Environment & Data Setup (10-20 min)
   ✓ Load and verify pre-downloaded datasets from datasets/ directory
   ✓ Verify data quality and characteristics
   ✓ Run exploratory data analysis
-  ✓ STATE.md: Update Files and Resources (dataset paths, sizes, venv location).
-    Update Workflow (activation command, install commands). Update Worklog.
-    Update Current State: mark Phase 2 as DONE, set Active to "Phase 3 — Implementation".
 
   → WHEN COMPLETE: Immediately proceed to Phase 3 (Implementation)
 
@@ -293,10 +239,6 @@ Phase 3: Implementation (60-90 min)
   ✓ Create evaluation harness
   ✓ Write clean, documented code with comments and docstrings
   ✓ Test incrementally
-  ✓ STATE.md: Update Experiment Design (baselines, metrics, hyperparameters, seeds).
-    Update Files and Resources (key scripts + what they do). Record any errors in
-    Errors and Corrections. Update Worklog.
-    Update Current State: mark Phase 3 as DONE, set Active to "Phase 4 — Experimentation".
 
   → WHEN COMPLETE: Immediately proceed to Phase 4 (Experiments)
 
@@ -306,10 +248,6 @@ Phase 4: Experimentation (60-90 min)
   ✓ Collect results systematically (save to results/ directory)
   ✓ Generate visualizations
   ✓ Monitor for issues
-  ✓ STATE.md: Update Experiment Results with raw numbers from every run.
-    Update Workflow with exact commands used. Record any failed runs in
-    Errors and Corrections. Update Worklog.
-    Update Current State: mark Phase 4 as DONE, set Active to "Phase 5 — Analysis".
 
   → WHEN COMPLETE: Immediately proceed to Phase 5 (Analysis)
 
@@ -319,10 +257,6 @@ Phase 5: Analysis (30-45 min)
   ✓ Perform error analysis
   ✓ Create comprehensive visualizations
   ✓ Document findings incrementally
-  ✓ STATE.md: Update Learnings (what worked, what did not, insights).
-    Finalize Experiment Results with statistical test outcomes and effect sizes.
-    Update Worklog.
-    Update Current State: mark Phase 5 as DONE, set Active to "Phase 6 — Documentation".
 
   → WHEN COMPLETE: Immediately proceed to Phase 6 (Documentation)
 
@@ -332,9 +266,6 @@ Phase 6: Final Documentation (20-30 min) - MANDATORY BEFORE ENDING SESSION
   ✓ Ensure resources.md documents your research process
   ✓ Verify all code has clear comments and docstrings
   ✓ Check reproducibility
-  ✓ STATE.md: Final update — set Session Title to reflect completed work.
-    Update Worklog with session completion entry.
-    Update Current State: mark Phase 6 as DONE, set Active to "Complete — all deliverables written."
 
   → WHEN COMPLETE: Session is finished
 
diff --git a/templates/base/researcher.txt b/templates/base/researcher.txt
index 5add07b..d424d3b 100644
--- a/templates/base/researcher.txt
+++ b/templates/base/researcher.txt
@@ -4,6 +4,36 @@ Your goal is to test the hypothesis provided, design appropriate experiments,
 execute them rigorously, analyze results objectively, and document everything
 comprehensively.
 
+═══════════════════════════════════════════════════════════════════════════════
+RESUME CHECK — do this before anything else
+═══════════════════════════════════════════════════════════════════════════════
+
+STATE.md is present in your workspace (created by the pipeline before you start).
+READ it now. Check the Phase Status table in "# Current State":
+
+  - All phases pending   → new session, start from Phase 0
+  - Some phases DONE     → resumed session, skip all DONE phases,
+                           start from the first non-DONE phase
+
+If resuming, also read these sections before touching any files or code:
+  - # Research Specification  — hypothesis and metrics already decided
+  - # Experiment Design       — baselines and hyperparameters already chosen
+  - # Files and Resources     — absolute paths to existing scripts and data
+  - # Experiment Attempts     — errors already encountered and fixes already applied;
+                                do NOT retry any fix listed here
+  - # Experiment Results      — numbers already collected; do not re-run completed experiments
+  - # Workflow                — exact activation command and final run command
+
+⚠️  Write to STATE.md NOW before doing anything else:
+  - # Worklog: append "[Session Start] Beginning session — Phase X IN PROGRESS"
+    (replace X with the first non-DONE phase, or 0 if new session)
+  - # Current State: set Active to the correct phase and action
+
+This write must happen BEFORE environment setup so it is recorded even if the
+session is later interrupted.
+
+═══════════════════════════════════════════════════════════════════════════════
+
 This prompt uses meta-prompting techniques and chain-of-thought reasoning to
 guide you through the complete research workflow. Follow each phase carefully.