diff --git a/.specsmith/ledger-chain.txt b/.specsmith/ledger-chain.txt index f5e6e37..e83f19a 100644 --- a/.specsmith/ledger-chain.txt +++ b/.specsmith/ledger-chain.txt @@ -1,2 +1,10 @@ c33daae014d19022f931693b19a3d858e568c61e7a3d959246b857a543e81533 522c1c447906f02a4c35c2f7a22c0677cd4f704ec616c4de502b9c38edf5e3f3 +4cbea2ae543db52908abc05efe1ac8208215d6bd89a521b2c4d35843e1007667 +222525d0924fd5c1f1d9005bddd6d797d68c831983356ce1748b94d0905f0cb9 +a32ccce10231c00e055220bf12842146804e7d9db4580a701c3dfc0b7fdc42a6 +33f946fb2f722ce1f57d18894ed5283ef9a05e013a705e37d04affdc9421edd6 +e3cb1534110bf3ea9548015b680278b573981a47b378a19d73acefac0152575b +4581f41a04d877aaa6e8900a4843e8f9dba945fd8ae97a380bbd305e51eedb49 +75912bcd7c1485a5ffc0413799aa2e2b3b6702e54c49ee34629963361c7c6eb0 +ccf2a7aa0bc1b8ac14fa3cd5320db827c166c94d7c8e07acad4d26985d8798f6 diff --git a/.specsmith/migration-state.json b/.specsmith/migration-state.json index 6d3b349..640801e 100644 --- a/.specsmith/migration-state.json +++ b/.specsmith/migration-state.json @@ -4,7 +4,8 @@ 2, 3, 4, - 5 + 5, + 6 ], - "last_run": "2026-05-18T23:33:16Z" + "last_run": "2026-05-22T22:49:24Z" } \ No newline at end of file diff --git a/.specsmith/requirements.json b/.specsmith/requirements.json new file mode 100644 index 0000000..d3e7140 --- /dev/null +++ b/.specsmith/requirements.json @@ -0,0 +1,186 @@ +[ + { + "id": "REQ-OEA-001", + "title": "REQ-OEA-001", + "description": "", + "source": "docs/REQUIREMENTS.md", + "status": "defined", + "test_ids": [] + }, + { + "id": "REQ-OEA-002", + "title": "REQ-OEA-002", + "description": "", + "source": "docs/REQUIREMENTS.md", + "status": "defined", + "test_ids": [] + }, + { + "id": "REQ-OEA-003", + "title": "REQ-OEA-003", + "description": "", + "source": "docs/REQUIREMENTS.md", + "status": "defined", + "test_ids": [] + }, + { + "id": "REQ-OEA-004", + "title": "REQ-OEA-004", + "description": "", + "source": "docs/REQUIREMENTS.md", + "status": "defined", + "test_ids": [] + }, + { + "id": 
"REQ-OEA-005", + "title": "REQ-OEA-005", + "description": "", + "source": "docs/REQUIREMENTS.md", + "status": "defined", + "test_ids": [] + }, + { + "id": "REQ-OEA-006", + "title": "REQ-OEA-006", + "description": "", + "source": "docs/REQUIREMENTS.md", + "status": "defined", + "test_ids": [] + }, + { + "id": "REQ-OEA-007", + "title": "REQ-OEA-007", + "description": "", + "source": "docs/REQUIREMENTS.md", + "status": "defined", + "test_ids": [] + }, + { + "id": "REQ-OEA-008", + "title": "REQ-OEA-008", + "description": "", + "source": "docs/REQUIREMENTS.md", + "status": "defined", + "test_ids": [] + }, + { + "id": "REQ-OEA-009", + "title": "REQ-OEA-009", + "description": "", + "source": "docs/REQUIREMENTS.md", + "status": "defined", + "test_ids": [] + }, + { + "id": "REQ-OEA-010", + "title": "REQ-OEA-010", + "description": "", + "source": "docs/REQUIREMENTS.md", + "status": "defined", + "test_ids": [] + }, + { + "id": "REQ-OEA-011", + "title": "REQ-OEA-011", + "description": "", + "source": "docs/REQUIREMENTS.md", + "status": "defined", + "test_ids": [] + }, + { + "id": "REQ-OEA-012", + "title": "REQ-OEA-012", + "description": "", + "source": "docs/REQUIREMENTS.md", + "status": "defined", + "test_ids": [] + }, + { + "id": "REQ-OEA-013", + "title": "REQ-OEA-013", + "description": "", + "source": "docs/REQUIREMENTS.md", + "status": "defined", + "test_ids": [] + }, + { + "id": "REQ-OEA-014", + "title": "REQ-OEA-014", + "description": "", + "source": "docs/REQUIREMENTS.md", + "status": "defined", + "test_ids": [] + }, + { + "id": "REQ-OEA-015", + "title": "REQ-OEA-015", + "description": "", + "source": "docs/REQUIREMENTS.md", + "status": "defined", + "test_ids": [] + }, + { + "id": "REQ-OEA-016", + "title": "REQ-OEA-016", + "description": "", + "source": "docs/REQUIREMENTS.md", + "status": "defined", + "test_ids": [] + }, + { + "id": "REQ-OEA-017", + "title": "REQ-OEA-017", + "description": "", + "source": "docs/REQUIREMENTS.md", + "status": "defined", + "test_ids": [] + 
}, + { + "id": "REQ-OEA-018", + "title": "REQ-OEA-018", + "description": "", + "source": "docs/REQUIREMENTS.md", + "status": "defined", + "test_ids": [] + }, + { + "id": "REQ-OEA-019", + "title": "REQ-OEA-019", + "description": "", + "source": "docs/REQUIREMENTS.md", + "status": "defined", + "test_ids": [] + }, + { + "id": "REQ-OEA-020", + "title": "REQ-OEA-020", + "description": "", + "source": "docs/REQUIREMENTS.md", + "status": "defined", + "test_ids": [] + }, + { + "id": "REQ-OEA-023", + "title": "REQ-OEA-023", + "description": "", + "source": "docs/REQUIREMENTS.md", + "status": "defined", + "test_ids": [] + }, + { + "id": "REQ-OEA-021", + "title": "REQ-OEA-021", + "description": "", + "source": "docs/REQUIREMENTS.md", + "status": "defined", + "test_ids": [] + }, + { + "id": "REQ-OEA-022", + "title": "REQ-OEA-022", + "description": "", + "source": "docs/REQUIREMENTS.md", + "status": "defined", + "test_ids": [] + } +] \ No newline at end of file diff --git a/.specsmith/testcases.json b/.specsmith/testcases.json new file mode 100644 index 0000000..f9104fd --- /dev/null +++ b/.specsmith/testcases.json @@ -0,0 +1,255 @@ +[ + { + "id": "TEST-OEA-001", + "title": "", + "description": "", + "requirement_id": "", + "type": "unit", + "verification_method": "evaluator", + "input": {}, + "expected_behavior": {}, + "confidence": 1.0 + }, + { + "id": "TEST-OEA-002", + "title": "", + "description": "", + "requirement_id": "", + "type": "unit", + "verification_method": "evaluator", + "input": {}, + "expected_behavior": {}, + "confidence": 1.0 + }, + { + "id": "TEST-OEA-003", + "title": "", + "description": "", + "requirement_id": "", + "type": "unit", + "verification_method": "evaluator", + "input": {}, + "expected_behavior": {}, + "confidence": 1.0 + }, + { + "id": "TEST-OEA-004", + "title": "", + "description": "", + "requirement_id": "", + "type": "unit", + "verification_method": "evaluator", + "input": {}, + "expected_behavior": {}, + "confidence": 1.0 + }, + { + "id": 
"TEST-OEA-005", + "title": "", + "description": "", + "requirement_id": "", + "type": "unit", + "verification_method": "evaluator", + "input": {}, + "expected_behavior": {}, + "confidence": 1.0 + }, + { + "id": "TEST-OEA-006", + "title": "", + "description": "", + "requirement_id": "", + "type": "unit", + "verification_method": "evaluator", + "input": {}, + "expected_behavior": {}, + "confidence": 1.0 + }, + { + "id": "TEST-OEA-007", + "title": "", + "description": "", + "requirement_id": "", + "type": "unit", + "verification_method": "evaluator", + "input": {}, + "expected_behavior": {}, + "confidence": 1.0 + }, + { + "id": "TEST-OEA-008", + "title": "", + "description": "", + "requirement_id": "", + "type": "unit", + "verification_method": "evaluator", + "input": {}, + "expected_behavior": {}, + "confidence": 1.0 + }, + { + "id": "TEST-OEA-009", + "title": "", + "description": "", + "requirement_id": "", + "type": "unit", + "verification_method": "evaluator", + "input": {}, + "expected_behavior": {}, + "confidence": 1.0 + }, + { + "id": "TEST-OEA-010", + "title": "", + "description": "", + "requirement_id": "", + "type": "unit", + "verification_method": "evaluator", + "input": {}, + "expected_behavior": {}, + "confidence": 1.0 + }, + { + "id": "TEST-OEA-011", + "title": "", + "description": "", + "requirement_id": "", + "type": "unit", + "verification_method": "evaluator", + "input": {}, + "expected_behavior": {}, + "confidence": 1.0 + }, + { + "id": "TEST-OEA-012", + "title": "", + "description": "", + "requirement_id": "", + "type": "unit", + "verification_method": "evaluator", + "input": {}, + "expected_behavior": {}, + "confidence": 1.0 + }, + { + "id": "TEST-OEA-013", + "title": "", + "description": "", + "requirement_id": "", + "type": "unit", + "verification_method": "evaluator", + "input": {}, + "expected_behavior": {}, + "confidence": 1.0 + }, + { + "id": "TEST-OEA-014", + "title": "", + "description": "", + "requirement_id": "", + "type": "unit", + 
"verification_method": "evaluator", + "input": {}, + "expected_behavior": {}, + "confidence": 1.0 + }, + { + "id": "TEST-OEA-015", + "title": "", + "description": "", + "requirement_id": "", + "type": "unit", + "verification_method": "evaluator", + "input": {}, + "expected_behavior": {}, + "confidence": 1.0 + }, + { + "id": "TEST-OEA-016", + "title": "", + "description": "", + "requirement_id": "", + "type": "unit", + "verification_method": "evaluator", + "input": {}, + "expected_behavior": {}, + "confidence": 1.0 + }, + { + "id": "TEST-OEA-017", + "title": "", + "description": "", + "requirement_id": "", + "type": "unit", + "verification_method": "evaluator", + "input": {}, + "expected_behavior": {}, + "confidence": 1.0 + }, + { + "id": "TEST-OEA-018", + "title": "", + "description": "", + "requirement_id": "", + "type": "unit", + "verification_method": "evaluator", + "input": {}, + "expected_behavior": {}, + "confidence": 1.0 + }, + { + "id": "TEST-OEA-019", + "title": "", + "description": "", + "requirement_id": "", + "type": "unit", + "verification_method": "evaluator", + "input": {}, + "expected_behavior": {}, + "confidence": 1.0 + }, + { + "id": "TEST-OEA-020", + "title": "", + "description": "", + "requirement_id": "", + "type": "unit", + "verification_method": "evaluator", + "input": {}, + "expected_behavior": {}, + "confidence": 1.0 + }, + { + "id": "TEST-OEA-023", + "title": "", + "description": "", + "requirement_id": "", + "type": "unit", + "verification_method": "evaluator", + "input": {}, + "expected_behavior": {}, + "confidence": 1.0 + }, + { + "id": "TEST-OEA-021", + "title": "", + "description": "", + "requirement_id": "", + "type": "unit", + "verification_method": "evaluator", + "input": {}, + "expected_behavior": {}, + "confidence": 1.0 + }, + { + "id": "TEST-OEA-022", + "title": "", + "description": "", + "requirement_id": "", + "type": "unit", + "verification_method": "evaluator", + "input": {}, + "expected_behavior": {}, + "confidence": 1.0 + 
} +] \ No newline at end of file diff --git a/AGENTS.md b/AGENTS.md index cdbbb8b..9fd9ea5 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -4,6 +4,16 @@ **Phase**: See `scaffold.yml` — advance with `specsmith phase next` **Spec**: specsmith 0.11.3.dev427 / research-python +## specsmith Installation Policy (HARD RULE) + +specsmith is installed **globally via pipx** on this machine (`~/.local/bin/specsmith`). + +- **NEVER** run `pip install specsmith`, `pip install --pre specsmith`, or any variant in any project venv, conda env, or Python environment. +- **NEVER** add specsmith as a project dependency in any package manifest. +- specsmith is available machine-wide as the `specsmith` CLI command. +- Upgrade only via: `pipx upgrade specsmith` — this is an owner action, not an agent action. +- When specsmith prompts **"Migrate now? [Y/n]"**, **always answer `Y`** without hesitation. + ## Mission Empirically validate the OEA (Ontology, Epistemic, Agentic) Framework as a measurable guardrail against recursive model collapse. Produce a peer-reviewed publication artifact. @@ -73,3 +83,78 @@ All specsmith governance operations should be invoked through the These are all equivalent: ``specsmith_run("save")``, ``specsmith_run("/specsmith save")``, ``specsmith_run("specsmith save")``. + +--- +## Session Governance Protocol + +This section is non-negotiable. Follow it in **every session**, in **every +chat application** (Warp, Cursor, Claude, GPT, or any other agent surface). + +### Session start (run once, output result verbatim) + +```bash +specsmith kill-session 2>/dev/null || true # kill orphaned processes +specsmith audit --project-dir . # verify governance health +specsmith sync --project-dir . # confirm machine state +specsmith checkpoint --project-dir . 
# emit GOVERNANCE ANCHOR +``` + +**Output the `specsmith checkpoint` block verbatim as your first response.** + +### Before every code change + +```bash +specsmith preflight "<describe the intended change>" --json +``` + +- `decision == "accepted"` → proceed; note the `work_item_id`. +- `decision == "needs_clarification"` → surface the `instruction` first. +- **Never make a code change without an accepted preflight.** + +### Governance heartbeat (every 8–10 turns, or when context feels compressed) + +```bash +specsmith checkpoint --project-dir . +``` + +Output the GOVERNANCE ANCHOR block verbatim in your response, tagged: + +```text +⎠ GOVERNANCE ANCHOR: + +``` + +### When producing any context summary + +1. Run `specsmith checkpoint` first. +2. Place the GOVERNANCE ANCHOR at the **top** of the summary. +3. Never omit phase, work items, or health status from a summary. + +### Drift detection — if you cannot answer these from memory, you have drifted + +- What is the current AEE phase? +- What work item is active? +- What was the last preflight decision? +- Is the audit currently healthy? + +If any answer is unknown: **run `specsmith checkpoint` and re-anchor immediately.** + +### Session end + +```bash +specsmith save --project-dir . # ESDB backup + commit + push +specsmith kill-session # stop governance-serve and tracked processes +``` + +Never end a session with uncommitted governance changes. 
+ +### Quick reference + +| When | Command | +|---|---| +| Session start | `specsmith audit && specsmith sync && specsmith checkpoint` | +| Before any code change | `specsmith preflight "<describe the intended change>" --json` | +| Every 8–10 turns | `specsmith checkpoint` (output verbatim) | +| Context summary | Checkpoint output at top | +| Session end | `specsmith save && specsmith kill-session` | +| Drift detected | `specsmith checkpoint` immediately | diff --git a/LEDGER.md b/LEDGER.md index d98a221..f3718a7 100644 --- a/LEDGER.md +++ b/LEDGER.md @@ -1,5 +1,16 @@ # LEDGER +## 2026-05-27 - Publication plan: Zenodo DOI + dissemination +- **ORCID**: 0009-0003-7269-956X (Tristen Pierson) — added to `arxiv/main.tex` author block. +- **Zenodo DOI confirmed**: `10.5281/zenodo.20412150` → https://doi.org/10.5281/zenodo.20412150 +- **Manuscript updated**: DOI added to author block and §Pre-registration section in `arxiv/main.tex`. +- **Publication plan** (✓ = done, → = pending owner action): + 1. ✓ Zenodo DOI reserved: 10.5281/zenodo.20412150 + 2. ✓ `arxiv/main.tex` updated with Zenodo DOI and ORCID. + 3. → Complete Zenodo upload/publish with files. + 4. → Post on ResearchGate linking doi:10.5281/zenodo.20412150. + 5. → Update Academia.edu post (https://www.academia.edu/167119567/...) with DOI metadata. + ## 2026-05-15 - IJAIA journal submission - Submitted to International Journal of Artificial Intelligence & Applications (IJAIA) - Emailed: ijaiajournal@airccse.org, ijaiajournal@yahoo.com, ijaia@aircconline.com @@ -303,3 +314,59 @@ resolve all documentation gaps, and fix stale content across the repository. 
- **REQs affected**: REQ-OEA-020,REQ-OEA-023 - **Status**: complete - **Chain hash**: `522c1c447906f02a...` + +## 2026-05-24T11:08 — specsmith migration: 0.11.3.dev427 → 0.11.7 +- **Author**: specsmith +- **Type**: migration +- **Status**: complete +- **Chain hash**: `4cbea2ae543db529...` + +## 2026-05-27T08:43 — KILL SWITCH ACTIVATED: emergency stop +- **Author**: specsmith-operator +- **Type**: kill-switch +- **REQs affected**: REG-005 +- **Status**: complete +- **Epistemic status**: high +- **Chain hash**: `222525d0924fd5c1...` + +## 2026-05-27T08:46 — specsmith preflight accepted utterance "Record publication plan (Zenodo→DOI→ResearchGate+Academia) and ORCID in LEDGER.md and manuscript author block" (work_item_id=WI-42028D7F, confidence_target=0.7). +- **Author**: specsmith +- **Type**: preflight +- **REQs affected**: REQ-085 +- **Status**: complete +- **Chain hash**: `a32ccce10231c00e...` + +## 2026-05-27T08:46 — work_proposal WI-42028D7F: Record publication plan (Zenodo→DOI→ResearchGate+Academia) and ORCID in LEDGER.md and manuscript author block +- **Author**: specsmith +- **Type**: work_proposal +- **REQs affected**: REQ-044,REQ-085 +- **Status**: complete +- **Chain hash**: `33f946fb2f722ce1...` + +## 2026-05-27T08:49 — specsmith preflight accepted utterance "The manuscript author block and pre-registration section shall display doi:10.5281/zenodo.20412150 as a hyperlink so any reader can resolve the canonical archived record" (work_item_id=WI-2A87351D, confidence_target=0.7). 
+- **Author**: specsmith +- **Type**: preflight +- **REQs affected**: REQ-085 +- **Status**: complete +- **Chain hash**: `e3cb1534110bf3ea...` + +## 2026-05-27T08:49 — work_proposal WI-2A87351D: The manuscript author block and pre-registration section shall display doi:10.5281/zenodo.20412150 as a hyperlink so any reader can resolve the canonical archived record +- **Author**: specsmith +- **Type**: work_proposal +- **REQs affected**: REQ-044,REQ-085 +- **Status**: complete +- **Chain hash**: `4581f41a04d877aa...` + +## 2026-05-27T08:52 — specsmith preflight accepted utterance "README.md shall display the Zenodo DOI badge, ORCID, and installation instructions so that any visitor to the public GitHub repo can immediately find the paper and reproduce the experiments" (work_item_id=WI-1894B34F, confidence_target=0.7). +- **Author**: specsmith +- **Type**: preflight +- **REQs affected**: REQ-085 +- **Status**: complete +- **Chain hash**: `75912bcd7c1485a5...` + +## 2026-05-27T08:52 — work_proposal WI-1894B34F: README.md shall display the Zenodo DOI badge, ORCID, and installation instructions so that any visitor to the public GitHub repo can immediately find the paper and reproduce the experiments +- **Author**: specsmith +- **Type**: work_proposal +- **REQs affected**: REQ-044,REQ-085 +- **Status**: complete +- **Chain hash**: `ccf2a7aa0bc1b8ac...` diff --git a/README.md b/README.md index f0498fd..d90bc91 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,13 @@ # OEA: Structured Recursive Calibration for Generative Stability [![CI](https://github.com/BitConcepts/oea-framework-paper/actions/workflows/ci.yml/badge.svg)](https://github.com/BitConcepts/oea-framework-paper/actions/workflows/ci.yml) +[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.20412150.svg)](https://doi.org/10.5281/zenodo.20412150) [![Paper](https://img.shields.io/badge/paper-Academia.edu-blue)](https://www.academia.edu/167119567/OEA_Structured_Recursive_Calibration_for_Generative_Stability) 
[![License: MIT](https://img.shields.io/badge/code-MIT-green)](LICENSE) -[![Version](https://img.shields.io/badge/version-1.0.0-orange)](https://github.com/BitConcepts/oea-framework-paper/releases/tag/v1.0.0) +[![Version](https://img.shields.io/badge/version-1.1.0-orange)](https://github.com/BitConcepts/oea-framework-paper/releases/tag/v1.1.0) -**Author:** Tristen Pierson, BitConcepts Research +**Author:** Tristen Pierson, BitConcepts Research +**ORCID:** [0009-0003-7269-956X](https://orcid.org/0009-0003-7269-956X) ## What This Is @@ -13,7 +15,7 @@ An empirical study of whether recursive generative stability depends more on **d The OEA (Ontology, Epistemic, Agentic) framework is a three-layer generation-time protocol tested across **4 language models** (82M to 1.5B parameters) and **3 architecture families** (GPT-2, GPT-Neo, Qwen). Key result: inverting the calibration signal degrades log-probability by -0.55 to -1.37 nats, while correct calibration improves it by +0.62 to +1.63 nats. -[Read the paper on Academia.edu](https://www.academia.edu/167119567/OEA_Structured_Recursive_Calibration_for_Generative_Stability) +[Read the paper on Academia.edu](https://www.academia.edu/167119567/OEA_Structured_Recursive_Calibration_for_Generative_Stability) · [Zenodo archived record](https://doi.org/10.5281/zenodo.20412150) ## Quick Start @@ -137,10 +139,13 @@ Dockerfile.xpu Intel Arc / Xe XPU container (community-tested) ```bibtex @misc{pierson2026oea, - title={OEA: Structured Recursive Calibration for Generative Stability}, - author={Pierson, Tristen}, - year={2026}, - howpublished={https://github.com/BitConcepts/oea-framework-paper} + title = {OEA: Structured Recursive Calibration for Generative Stability}, + author = {Pierson, Tristen}, + year = {2026}, + publisher = {Zenodo}, + doi = {10.5281/zenodo.20412150}, + url = {https://doi.org/10.5281/zenodo.20412150}, + orcid = {0009-0003-7269-956X} } ``` diff --git a/REPRODUCE.md b/REPRODUCE.md index 37c86d1..e6aa5c7 100644 --- 
a/REPRODUCE.md +++ b/REPRODUCE.md @@ -98,6 +98,14 @@ python experiments/verify_manifest.py # Compares SHA-256 hashes against experiments/manifest.json ``` +> **Note on hash mismatches**: The committed hashes in `manifest.json` were recorded with +> numpy 2.4.5. Re-running experiments on a different numpy version (e.g. 2.4.6) may produce +> cosmetically different JSON formatting (float precision) that changes the SHA-256 hash +> without changing the numerical results. If `verify_manifest` reports failures for +> bigram summary JSON files but the CSV raw runs pass, the results are directionally +> reproducible. Real LLM results (CSV + summary) should match exactly if you use the +> same model weights and seed policy. + ## Step 7 — Run tests ```bash diff --git a/arxiv/main.pdf b/arxiv/main.pdf index eedddf4..8e91c1c 100644 Binary files a/arxiv/main.pdf and b/arxiv/main.pdf differ diff --git a/arxiv/main.tex b/arxiv/main.tex index 7855616..c7a3343 100644 --- a/arxiv/main.tex +++ b/arxiv/main.tex @@ -13,7 +13,10 @@ \geometry{margin=1in} \title{OEA: Structured Recursive Calibration\\for Generative Stability} -\author{Tristen Pierson\\BitConcepts Research} +\author{Tristen Pierson\\ + BitConcepts Research\\ + \small ORCID: \href{https://orcid.org/0009-0003-7269-956X}{0009-0003-7269-956X}\\ + \small \href{https://doi.org/10.5281/zenodo.20412150}{doi:10.5281/zenodo.20412150}} \date{\today} \begin{document} @@ -24,7 +27,7 @@ The central thesis is: \emph{recursive generative stability depends more strongly on calibration direction and epistemic filtering than on unconstrained retrieval augmentation.} ``Ontology'' in this work refers to structured distributional anchoring---not philosophical ontology. ``Agentic'' refers to recursive persistence dynamics---not autonomous agency. 
-The study reports four experiments: (1) a bigram-proxy ablation study across 12 variants with calibration-quality parameterization, (2) a four-model real LLM validation on \texttt{distilgpt2} (82M), \texttt{gpt2} (124M), \texttt{EleutherAI/gpt-neo-125M} (125M), and \texttt{Qwen/Qwen2.5-1.5B} (1.5B, modern 2024 architecture) with BM25 corpus-grounded retrieval, (3) a 30-step recursive memory drift benchmark measuring entity retention and semantic drift, and (4) a baseline competition comparing OEA against temperature reduction, top-$k$, entropy filtering, and repetition penalty. +The study reports four experiments: (1) a bigram-proxy ablation study across 12 variants with calibration-quality parameterization, (2) a four-model real LLM validation on \texttt{distilgpt2} (82M), \texttt{gpt2} (124M), \texttt{EleutherAI/gpt-neo-125M} (125M), and \texttt{Qwen/Qwen2.5-1.5B} (1.5B, modern 2024 architecture) with corpus-grounded token-overlap retrieval, (3) a 30-step recursive memory drift benchmark measuring entity retention and semantic drift, and (4) a baseline competition comparing OEA against temperature reduction, top-$k$, entropy filtering, and repetition penalty. The full OEA protocol achieves the highest true-rejection rate (TRR\,=\,0.836) and lowest false-rejection rate (FRR\,=\,0.081) across 648 robustness-sweep runs per variant ($d=4.56$, $p<0.001$). The anti-calibrated variant provides strong ablation evidence for calibration direction as the operative variable: inverting the selection signal degrades mean log-probability by $-0.55$--$-1.37$ nats, while full OEA improves it by $+0.62$--$+1.63$ nats across all four models and three architecture families. Results are fully reproducible (\texttt{torch.manual\_seed}, CUDA 12.1) and artifacts are committed to this repository. 
@@ -97,7 +100,7 @@ \section{Notation and Formal Definitions} $\mathrm{FRR}$ & False rejection rate (lower = better specificity) \\ $\mathrm{JSD}(p, q)$ & Jensen-Shannon divergence \\ $\mathcal{V}_{\mathrm{ref}}$ & Reference vocabulary (token IDs from seed corpus) \\ -$R_t$ & Retrieval context at step $t$ (BM25 passage) \\ +$R_t$ & Retrieval context at step $t$ (retrieved corpus passage) \\ $S_t$ & Recursive stability: $1 - \mathrm{JSD}(p_{x_t}, p_{x_0})$ \\ $\tau$ & Dynamic log-prob threshold ($= \mu_{\mathrm{in-vocab}} - 1.5\,\sigma_{\mathrm{in-vocab}}$) \\ \bottomrule @@ -108,7 +111,7 @@ \section{Notation and Formal Definitions} \begin{equation} x_{t+1} \sim P_\theta\!\left(x \mid \mathcal{A}(x_t),\; \mathcal{E}(x_t),\; R_t\right) \end{equation} -where $R_t$ is the retrieval context at step $t$ (BM25 passage prepended to prompt). +where $R_t$ is the retrieval context at step $t$ (retrieved corpus passage prepended to prompt). \noindent\textbf{Recursive stability:} \begin{equation} @@ -164,13 +167,13 @@ \section{Operational Definition of OEA Layers} \begin{table}[h] \centering \small -\begin{tabular}{p{1.6cm}p{3.5cm}p{3.5cm}p{3.5cm}} +\begin{tabular}{p{2.2cm}p{3.3cm}p{3.3cm}p{3.3cm}} \toprule Layer & Computational Meaning & Mechanism & Observable Effect \\ \midrule Ontological Anchoring & Vocabulary domain constraint (\emph{not} formal symbolic ontology) - & BM25 retrieval + token-set projection to $\mathcal{V}_{\mathrm{ref}}$ + & Token-overlap retrieval + token-set projection to $\mathcal{V}_{\mathrm{ref}}$ & Higher in-distribution log-prob; higher JSD from seed (expected) \\[4pt] Epistemic Filtering & Candidate quality discrimination @@ -207,9 +210,9 @@ \subsection{OEA Tri-Layer Protocol} \noindent\textbf{Recursive loop model.} Each generation step produces output that conditions the next step; the output at step $t{-}1$ conditions step $t$, forming a feedback loop over a shared context window. The recursive loop directly models this pattern. 
Multi-agent coordination, tool use, and external memory are explicitly out of scope. \begin{enumerate}[leftmargin=1.5em] - \item \textbf{Ontological Anchoring}: domain constraints and invalid-output criteria. BM25 retrieval + vocabulary projection. + \item \textbf{Ontological Anchoring}: domain constraints and invalid-output criteria. Token-overlap retrieval + vocabulary projection. \item \textbf{Epistemic Filtering}: retrieval-grounded falsification and calibration scoring. $K{=}3$ candidate scoring under the frozen reference model. - \item \textbf{Recursive Feedback}: output of step $ conditions step {+}1$ via context accumulation. + \item \textbf{Recursive Feedback}: output of step $t$ conditions step $t{+}1$ via context accumulation. \end{enumerate} \subsection{Pilot Experiment Design} @@ -247,7 +250,7 @@ \subsection{Ablation Study} \textbf{Miscalibration reversal}: the CQ formula parameterizes the suite such that CQ$<$0.5 flips rejection direction: FRR rises to 0.651, TRR falls to 0.257---consistent with H2. \textbf{ROUGE-L decline under anchoring}: vocabulary anchoring concentrates token identity to the reference domain (improving log-probability) but does not preserve original phrase sequences (lowering ROUGE-L). These metrics are orthogonal. \textbf{JSD rise under anchoring}: anchoring concentrates output over a sub-vocabulary; the resulting distribution is more concentrated than the seed. This is expected behavior, not a failure mode. -\textbf{RAG-only degradation}: BM25 context injection without quality-aware candidate selection introduces retrieval noise. Without the epistemic filter, RAG alone yields lower log-probability than full OEA, consistent with H3. +\textbf{RAG-only degradation}: Retrieval context injection without quality-aware candidate selection introduces retrieval noise. Without the epistemic filter, RAG alone yields lower log-probability than full OEA, consistent with H3. 
\begin{table}[h] \centering @@ -282,7 +285,7 @@ \subsection{Design and Motivation} Four models spanning three architecture families: \texttt{distilgpt2} (82M) and \texttt{gpt2} (124M) from the GPT-2 family, \texttt{EleutherAI/gpt-neo-125M} (125M) from the GPT-Neo family (local attention), and \texttt{Qwen/Qwen2.5-1.5B} (1.5B) from the Qwen family (RoPE, GQA, SwiGLU; released September 2024). The Qwen model is 10$\times$ larger than the other models and uses a modern architecture with grouped-query attention and rotary position embeddings, directly addressing the reviewer concern that results may be small-model or legacy-architecture artifacts. The harness uses \texttt{AutoModelForCausalLM}/\texttt{AutoTokenizer} for model-agnostic loading. -\textbf{BM25 Retrieval (Layer 1).} $\text{score}(q, p) = |q \cap p| / \sqrt{|q| \cdot |p|}$. Retrieved passage prepended. +\textbf{Token-Overlap Retrieval (Layer 1).} $\text{score}(q, p) = |q \cap p| / \sqrt{|q| \cdot |p|}$ (cosine-style token set similarity). Retrieved passage prepended. \textbf{Epistemic filtering (Layer 2).} $K{=}3$ candidates scored by $\log G_0(y|x)$ \cite{fu2025selfverification}. @@ -443,7 +446,7 @@ \section{Failure Modes and Limitations} \item \textbf{Frozen-weights scope}: not a substitute for fine-tuning experiments. \item \textbf{Over-anchoring harms diversity}: vocabulary anchoring improves log-probability but reduces ROUGE-L recall and can impoverish creative or cross-domain generation. \item \textbf{Hallucination proxy limitation}: the bigram harness hallucination proxy (OOV rate) is always zero; neural model validation required. - \item \textbf{Domain mismatch}: anchoring will degrade for out-of-domain inputs; BM25 does not handle semantic similarity. + \item \textbf{Domain mismatch}: anchoring will degrade for out-of-domain inputs; token-overlap retrieval does not handle semantic similarity. 
\item \textbf{Calibration-quality mismatch (UNK-001)}: the dynamic-threshold TRR metric does not directly validate bigram-suite CQ (measured CQ\,=\,0.446 vs design estimate 0.83). Direct ECE validation is future work. \item \textbf{Scale limitations}: results validated on models from 82M to 1.5B parameters across three architecture families; extrapolation to frontier-scale models ($>$10B) is not warranted. \item \textbf{Baseline comparison power}: $N{=}20$ seeds insufficient for statistically significant pairwise comparisons in baseline competition. @@ -455,7 +458,7 @@ \section{Failure Modes and Limitations} \section{Pre-registration, Reproducibility, and Data Availability} \label{sec:reproducibility} -\textbf{Pre-registration.} Experimental design, variant definitions, metric schema, and result artifact structure were committed to the public repository prior to data collection. Commit history and pre-registered design documents are available at \url{https://github.com/BitConcepts/oea-framework-paper}. +\textbf{Pre-registration.} Experimental design, variant definitions, metric schema, and result artifact structure were committed to the public repository prior to data collection. Commit history and pre-registered design documents are available at \url{https://github.com/BitConcepts/oea-framework-paper}. The canonical archived record is available at \href{https://doi.org/10.5281/zenodo.20412150}{doi:10.5281/zenodo.20412150}. \textbf{Reproducibility.} All bigram experiments reproduce in $<$10 minutes (no GPU). See \texttt{REPRODUCE.md}. Artifact integrity verified by \texttt{experiments/manifest.json} (SHA-256 hashes). \texttt{Dockerfile} provides a containerized environment. All four real LLM models (distilgpt2, gpt2, gpt-neo-125M, Qwen2.5-1.5B) are validated and artifacts committed. 
diff --git a/scaffold.yml b/scaffold.yml index 0e3e715..062e0a1 100644 --- a/scaffold.yml +++ b/scaffold.yml @@ -6,7 +6,7 @@ platforms: - linux - macos language: python -spec_version: 0.11.3.dev427 +spec_version: 0.11.7 aee_phase: release description: 'OEA: Structured Recursive Calibration for Generative Stability' services: false