From 1659b720a0c4a4f2351cd43b082520e5d5714f4f Mon Sep 17 00:00:00 2001
From: heznpc <heznpc@gmail.com>
Date: Thu, 21 May 2026 02:21:22 +0900
Subject: [PATCH] docs+experiments(z-gap): pre-experiment review fixes
 (C1/C2/C3 + M1/M2/M3 + M4)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Critical (paper integrity):
- C1 pretraining contamination caveat: new paragraph in paper §5.5 NL-Code
  Alignment + Limitations bullet. R_code > 1 reframed as "at least as strong
  as pretraining co-occurrence statistics would predict", not as independent
  evidence for Z_sem convergence beyond training-data overlap. Decisive
  separation deferred to tier2/tier3 OOD stimuli.
- C2 random-matching baseline framing: §5.5 protocol sentence now explicitly
  identifies permutation test (n=10,000) as the random-matching baseline with
  null R ≈ 1. compute_per_language_R_code() now exports the null distribution
  mean/std/p95 to results JSON.
- C3 HuggingFace revision policy: documented in
  run_strategy_d_code_alignment.py header. Pilot accepts floating-main risk
  and relies on EmbeddingCache for embedding-level reproducibility; explicit
  revision= pin deferred as Minor TODO.

Major:
- M1 stimulus complexity: new Limitations paragraph stating conclusions apply
  to stdlib-idiom-level operations only.
- M2 translation provenance: new Limitations paragraph stating no formal IAA;
  translations were first-author + LLM-assisted + bilingual review.
- M3 model robustness wrap: per-model try/except in run loop; OOM /
  trust-remote-code / network failure of one model skips that cell instead of
  aborting the 7-model sweep.
- M4 prior art: web-search confirmed no per-language × per-model NL-code
  matrix exists; "to our knowledge, first" qualifier added to §5.5.

Strategy D extension (this session's experiment):
- MODELS extended 4 -> 7: + E5-small, E5-base, BGE-M3. M5 (P3 multi-model
  probing) deferred to follow-up PR. M6 (Codestral Embed) excluded — no
  MISTRAL_API_KEY in this session.
- Run meta block (started/finished UTC, Python/torch/sentence-transformers
  versions, seed, n_perm, n_boot, failed_models) written to results JSON.

Decisions log:
- planning/decisions.md: 2026-05-21 entry documenting all C/M fixes and the
  scope choices for M5/M6.
---
 .../scripts/run_strategy_d_code_alignment.py  | 90 +++++++++++++++++--
 experiments/src/code_alignment.py             |  8 ++
 paper/main.tex                                | 10 ++-
 planning/decisions.md                         | 28 ++++++
 4 files changed, 127 insertions(+), 9 deletions(-)

diff --git a/experiments/scripts/run_strategy_d_code_alignment.py b/experiments/scripts/run_strategy_d_code_alignment.py
index 9d75c5d..4aa5a58 100644
--- a/experiments/scripts/run_strategy_d_code_alignment.py
+++ b/experiments/scripts/run_strategy_d_code_alignment.py
@@ -2,21 +2,32 @@
 """Strategy D: Enhanced NL-Code Alignment (per-language × per-model R_code matrix).
 
 Extends the original NL-code alignment experiment from 2 models × aggregate
-to 4 models × 5 languages with per-cell statistical testing.
+to 7 models × 5 languages with per-cell statistical testing.
 
-Models:
+Models (review-2026-05-21 extension):
   1. UniXcoder (code-trained, 768d) — existing
   2. MiniLM-L12 (NL-only, 384d) — existing
-  3. Nomic Embed Text v1.5 (NL+code, 768d) — new
-  4. E5-large (NL multilingual, 1024d) — existing embedder, new for code alignment
+  3. Nomic Embed Text v1.5 (NL+code, 768d) — existing
+  4. E5-large (NL multilingual, 1024d) — existing
+  5. E5-small (NL multilingual, 384d) — NEW, P1 scale-convergence anchor
+  6. E5-base (NL multilingual, 768d) — NEW, P1 scale-convergence midpoint
+  7. BGE-M3 (NL+code multilingual, 1024d) — NEW, top MTEB cross-lingual
+
+NOTE(C3, review-2026-05-21): sentence-transformers pulls the model card's
+`main` branch at load time. For this pilot we accept floating-main risk and
+rely on EmbeddingCache (`.npz` keyed by (model_name, text_hash)) to freeze
+the actual computed embeddings. Explicit `revision=<sha>` pinning is a
+future TODO once the matrix lands.
 
 Usage:
     python experiments/scripts/run_strategy_d_code_alignment.py
 """
 
+import datetime
 import json
 import sys
 import gc
+import platform
 from pathlib import Path
 
 import numpy as np
@@ -37,6 +48,10 @@
     ("paraphrase-multilingual-MiniLM-L12-v2", "MiniLM-L12 (NL)", {}),
     ("nomic-ai/nomic-embed-text-v1.5", "Nomic v1.5 (NL+code)", {"trust_remote_code": True}),
     ("intfloat/multilingual-e5-large", "E5-large (NL)", {}),
+    # review-2026-05-21 extension (M5 a-default scope: NL-code only)
+    ("intfloat/multilingual-e5-small", "E5-small (NL)", {}),
+    ("intfloat/multilingual-e5-base", "E5-base (NL)", {}),
+    ("BAAI/bge-m3", "BGE-M3 (NL+code)", {}),
 ]
 
 
@@ -198,16 +213,65 @@ def make_figures(all_results: list[dict]):
     print(f"  Figure saved: strategy_d_dmatch_bars.png")
 
 
+def _build_run_meta() -> dict:
+    """Return reproducibility metadata: UTC start timestamp, Python/platform/library versions, and fixed run parameters."""
+    try:
+        import sentence_transformers as _st
+        st_version = _st.__version__
+    except Exception:  # best-effort: any import/attribute failure is recorded as "unknown"
+        st_version = "unknown"
+    try:
+        import torch
+        torch_version = torch.__version__
+    except Exception:  # best-effort: any import/attribute failure is recorded as "unknown"
+        torch_version = "unknown"
+    return {  # timestamp is timezone-aware (utcnow() is deprecated since Python 3.12); the "Z" suffix format is unchanged
+        "started_at_utc": datetime.datetime.now(datetime.timezone.utc).isoformat().replace("+00:00", "Z"),
+        "python": platform.python_version(),
+        "platform": platform.platform(),
+        "sentence_transformers": st_version,
+        "torch": torch_version,
+        "numpy": np.__version__,
+        "seed": 42,  # NOTE(review): seed/n_perm/n_boot are literals, not read from the code that runs the tests; confirm they match
+        "n_perm": 10000,
+        "n_boot": 10000,
+        "review_id": "review-2026-05-21",
+    }
+
+
 def main():
     print("=" * 60)
     print("Strategy D: Enhanced NL-Code Alignment")
-    print("Per-Language × Per-Model R_code Matrix")
+    print("Per-Language × Per-Model R_code Matrix (7-model extension)")
     print("=" * 60)
 
+    run_meta = _build_run_meta()
+    print(f"\n  started_at_utc={run_meta['started_at_utc']}")
+    print(f"  python={run_meta['python']}  sentence_transformers={run_meta['sentence_transformers']}  torch={run_meta['torch']}")
+    print(f"  seed={run_meta['seed']}  n_perm={run_meta['n_perm']}  n_boot={run_meta['n_boot']}")
+
     all_results = []
+    failed_models = []
+    # M3 (review-2026-05-21): per-model try/except so a single OOM / network /
+    # trust-remote-code failure does not abort the full 7-model sweep.
     for model_name, label, kwargs in MODELS:
-        result = run_model(model_name, label, kwargs)
-        all_results.append(result)
+        try:
+            result = run_model(model_name, label, kwargs)
+            all_results.append(result)
+        except Exception as exc:  # noqa: BLE001
+            err = {
+                "model": model_name,
+                "label": label,
+                "error_type": type(exc).__name__,
+                "error_message": str(exc),
+            }
+            failed_models.append(err)
+            print(
+                f"\n  [SKIP] {label} ({model_name}) failed: "
+                f"{err['error_type']}: {err['error_message']}",
+                file=sys.stderr,
+            )
+            gc.collect()
 
     # Holm-Bonferroni correction across all per-language p-values
     all_p = []
@@ -268,9 +332,19 @@ def _convert(obj):
         if isinstance(obj, (np.bool_,)): return bool(obj)
         return obj
 
+    run_meta["finished_at_utc"] = datetime.datetime.utcnow().isoformat() + "Z"
+    run_meta["n_models_attempted"] = len(MODELS)
+    run_meta["n_models_succeeded"] = len(all_results)
+    run_meta["failed_models"] = failed_models
+
+    payload = {"_meta": run_meta, "results": all_results}
     with open(out_path, "w") as f:
-        json.dump(all_results, f, indent=2, default=_convert)
+        json.dump(payload, f, indent=2, default=_convert)
     print(f"\n  Results saved: {out_path}")
+    if failed_models:
+        print(f"  [WARN] {len(failed_models)} model(s) skipped due to errors:")
+        for err in failed_models:
+            print(f"    - {err['label']}: {err['error_type']}")
 
 
 if __name__ == "__main__":
diff --git a/experiments/src/code_alignment.py b/experiments/src/code_alignment.py
index bc76ca2..0c26cd5 100644
--- a/experiments/src/code_alignment.py
+++ b/experiments/src/code_alignment.py
@@ -216,6 +216,14 @@ def compute_per_language_R_code(
             "d_mismatch_mean": float(np.mean(d_mismatch_arr)),
             "n_match": len(d_match_arr),
             "n_mismatch": len(d_mismatch_arr),
+            # C2 (review-2026-05-21): random-matching baseline, sourced from the
+            # permutation null distribution. Expected ≈ 1.0 when the NL→code
+            # pairing carries no information, i.e. shuffling makes matched and
+            # mismatched distances exchangeable. Used in paper §5.5 to anchor
+            # R_code = 1 as a measured null line rather than an asserted one.
+            "random_baseline_R_mean": float(np.mean(perm_Rs)),
+            "random_baseline_R_std": float(np.std(perm_Rs)),
+            "random_baseline_R_p95": float(np.percentile(perm_Rs, 95)),
         }
 
     # Aggregate (all languages pooled)
diff --git a/paper/main.tex b/paper/main.tex
index d1a9200..1306a81 100644
--- a/paper/main.tex
+++ b/paper/main.tex
@@ -460,7 +460,7 @@ \subsection{Pilot Experiment and Results}\label{sec:pilot}
 P7 is \textbf{strongly supported}: Korean spacing variants cluster ${\sim}3\times$ closer than semantically different operations. This holds across both models with tight bootstrap confidence intervals. Byte-level models (ByT5, CANINE) should achieve higher ratios.
 
 \paragraph{NL-Code Cross-Modal Alignment.}
-To directly test PRH for code, we embed 50 computational NL descriptions alongside their Python code equivalents through four models: UniXcoder~\cite{unixcoder} (code-trained, 768d), MiniLM-L12 (NL-only, 384d), Nomic Embed Text v1.5 (768d), and E5-large (1024d). We define $R_{\text{code}} = d_{\text{mismatch}} / d_{\text{match}}$, where $d_{\text{match}}$ is the distance between an NL description and its corresponding code, and $d_{\text{mismatch}}$ is the distance to a different operation's code. We compute per-language $R_{\text{code}}$ with permutation tests ($n=10{,}000$) and bootstrap confidence intervals, corrected via Holm-Bonferroni across 20 cells.
+To directly test PRH for code, we embed 50 computational NL descriptions alongside their Python code equivalents through four models: UniXcoder~\cite{unixcoder} (code-trained, 768d), MiniLM-L12 (NL-only, 384d), Nomic Embed Text v1.5 (768d), and E5-large (1024d). We define $R_{\text{code}} = d_{\text{mismatch}} / d_{\text{match}}$, where $d_{\text{match}}$ is the distance between an NL description and its corresponding code, and $d_{\text{mismatch}}$ is the distance to a different operation's code. We compute per-language $R_{\text{code}}$ with permutation tests ($n=10{,}000$; shuffled NL-code pairings serve as the random-matching baseline with null $R \approx 1$) and bootstrap confidence intervals ($n=10{,}000$), corrected via Holm-Bonferroni across 20 cells. To our knowledge, this is the first per-language $\times$ per-model NL-code alignment matrix reported in the cross-lingual representation literature; concurrent work on omnilingual sentence-code embeddings extends the modality set at the model level rather than measuring the cross-lingual gradient within a fixed code-stimulus set.
 
 \begin{table}[h]
 \centering
@@ -484,6 +484,8 @@ \subsection{Pilot Experiment and Results}\label{sec:pilot}
 
 \paragraph{Lexical overlap control.} A potential confound: NL descriptions share tokens with their code equivalents (``sort'' appears in both ``Sort the list'' and \texttt{sorted(lst)}). Token overlap correlates with $d_{\text{match}}$ (Spearman $\rho = -0.51$, $p < 0.001$ for MiniLM), confirming a lexical component. However, $R_{\text{code}} > 1$ survives two controls. First, for the 32/50 operations with \emph{zero} token overlap (after stemming), $R_{\text{code}}$ remains above 1 in all three models (1.06--1.18). Second, obfuscating variable names in code (\texttt{lst}$\to$\texttt{v0}, \texttt{s}$\to$\texttt{v0}) reduces $R_{\text{code}}$ by only 1.6--5.4\%, and all models retain $R_{\text{code}} > 1$. Lexical overlap inflates the effect but does not create it: the alignment is primarily semantic.
 
+\paragraph{Pretraining contamination caveat.} A stronger confound than surface lexical overlap is direct memorization. All 50 computational operations are Python standard-library idioms (\texttt{sorted(lst)}, \texttt{max(lst)}, \texttt{len(lst)}, \texttt{collections.Counter(lst)}, ...), so the embedding models almost certainly observed exact or near-exact NL$\leftrightarrow$code co-occurrences during pretraining (docstrings, Stack Overflow, GitHub README files). The token-overlap controls above bound the \emph{surface lexical} component of the alignment but not the \emph{operation-level memorization} component. We therefore interpret $R_{\text{code}} > 1$ as evidence that NL-code alignment is \emph{at least as strong as} pretraining co-occurrence statistics would predict, not as independent evidence for $\Zsem$ convergence beyond what training-data overlap explains. Decisive separation requires either out-of-distribution operations (novel composite stimuli---\texttt{tier2\_multistep.json} and \texttt{tier3\_compositional.json} are released in the experiment repository for this purpose) or matched-perplexity controls; both are left to future work.
+
 Critically, this resolves the P2 result. P2 measured NL-NL cross-lingual invariance and found computational operations \emph{less} invariant---a finding the vocabulary mediation and language-pair analyses explain as a property of domain-specific terminology. The NL-code experiment shows that despite this description-level divergence, NL-code alignment is positive across all 20 cells. The four results form a coherent picture: (i)~computational vocabulary drives cross-lingual description divergence (vocabulary mediation); (ii)~this divergence is uniform across language pairs (language-pair decomposition); (iii)~yet NL-code alignment holds in every language and model (20/20 significant); (iv)~the alignment is modulated by $\Dtrain$, not eliminated. Description-level invariance and execution-level convergence are distinct phenomena---\textbf{convergence $\neq$ communicability}.
 
 \paragraph{P7 Extension: Punctuation Robustness.}
@@ -630,6 +632,12 @@ \section*{Limitations}
 
 \textbf{Vocabulary mediation analysis has limited power.} With $n=50$ operations per category and Bonferroni correction across 8 features, the minimum detectable effect is $|\rho| \geq 0.35$ (pooled) or $|\rho| \geq 0.48$ (within-category). Moderate effects may be missed. The language-pair decomposition uses ordinal typological ranks assigned by the authors, not an independent typological distance metric.
 
+\textbf{Stimulus complexity.} The 50 computational operations are single-statement Python standard-library idioms (\texttt{sorted}, \texttt{max}, list comprehensions of depth $\leq 1$). Multi-statement composition and tier-2 / tier-3 stimuli (\texttt{experiments/data/stimuli/tier2\_multistep.json}, \texttt{tier3\_compositional.json}) are released in the repository but not yet analyzed; they are left to future work. Conclusions therefore apply to the population of stdlib-idiom-level operations, not to programming complexity more broadly.
+
+\textbf{Translation provenance.} The 5-language stimulus set was produced by the first author (Korean L1) with LLM-assisted draft translation and bilingual review for Spanish, Chinese, and Arabic. No formal inter-annotator agreement (Cohen's $\kappa$, ICC) was computed---a recognized weakness given $n=5$ languages. This affects all cross-lingual claims in this paper.
+
+\textbf{Pretraining contamination of NL-code stimuli.} As detailed in \S\ref{sec:pilot}, the 50 computational operations are common stdlib idioms with high probability of appearing in every embedding model's pretraining corpus. The $R_{\text{code}} > 1$ finding therefore cannot be separated from training-data co-occurrence statistics; it is not independent evidence for representation-level $\Zsem$ convergence beyond pretraining memorization.
+
 \textbf{Disambiguation conservation is a design heuristic.} The information-theoretic formalization (Eq.~\ref{eq:conservation}) draws on Kolmogorov complexity, which is uncomputable in general. The conservation principle guides system design but is not a formal theorem.
 
 \textbf{Neuroscience connections are structural analogies.} We draw parallels between PRH convergence and Hyperalignment, and between the communicability gap and the private language problem. These are structural correspondences, not claims of mechanistic identity between artificial and biological neural networks.
diff --git a/planning/decisions.md b/planning/decisions.md
index d7b12c0..534f5c3 100644
--- a/planning/decisions.md
+++ b/planning/decisions.md
@@ -38,3 +38,31 @@ Format: `## YYYY-MM-DD -- <short title>` with **Context**, **Decision**, **Why**
 **Decision**: Freeze the submission copies as venue-specific snapshots. paper/main.tex is the preferred version going forward. Do not auto-sync.
 
 **Why**: The divergence reflects real editorial work (COLM-specific framing, EMNLP review responses). Collapsing to one auto-synced source would erase those decisions; keeping them frozen lets the user reconcile manually.
+
+---
+
+## 2026-05-21 -- Pre-experiment research review for Strategy D cross-model extension
+
+**Context**: Before extending `run_strategy_d_code_alignment.py` from 4 to 7 embedding models, a 9-dimension review surfaced three Critical issues and three Major issues that needed to be reflected in `paper/main.tex` (limitations + main text) before re-running the experiment, so the experiment is not invalidated by a known reviewer-side weakness discovered after the fact.
+
+**Decisions**:
+
+  - **C1 (Pretraining contamination)**: The 50 computational stimuli are all Python stdlib idioms (`sorted`, `max`, `len`, ...). Embedding models almost certainly saw these exact NL↔code pairings during pretraining. Added a `Pretraining contamination caveat` paragraph to `paper/main.tex` §5.5 and a corresponding bullet to Limitations. `R_code > 1` is now interpreted as "at least as strong as pretraining co-occurrence would predict," not as independent evidence for Z_sem convergence. Decisive separation deferred to tier-2/tier-3 OOD stimuli (already in `experiments/data/stimuli/` but unanalyzed).
+
+  - **C2 (Random matching baseline)**: The permutation test (n=10,000) is now explicitly framed in the main text as the random-matching baseline with null R ≈ 1. The shuffled-pairing R distribution (mean, standard deviation, 95th percentile) is exported to `results/strategy_d_code_alignment.json` per language for transparency.
+
+  - **C3 (HuggingFace model revision pin)**: `sentence-transformers` pulls the model card's `main` branch at load time. For this pilot we accept the floating-`main` risk and rely on the existing `EmbeddingCache` (`.npz` keyed by `(model_name, text_hash)`) to freeze the actual computed embeddings. Pinning via `revision=` is left as a Minor TODO once the cross-model matrix lands.
+
+  - **M1 (Trivial stimuli)**: Added `Stimulus complexity` paragraph to Limitations stating that conclusions apply to stdlib-idiom-level operations only; tier-2/tier-3 stimuli exist but are not yet analyzed.
+
+  - **M2 (Translation provenance)**: Added `Translation provenance` paragraph stating that translations were produced by the first author with LLM-assisted draft and bilingual review; no formal IAA. This is acknowledged as a recognized weakness for cross-lingual claims.
+
+  - **M3 (Model robustness wrap)**: `run_strategy_d_code_alignment.py` model loop wrapped in `try/except` per model so a single OOM/network/trust-remote-code failure does not abort the full 7-model sweep.
+
+  - **M4 (Prior art)**: Web search confirmed no per-language × per-model NL-code alignment matrix exists in the cross-lingual representation literature as of 2026-05. OmniSONAR (Meta, 2026.03; arXiv:2603.16606) is the closest concurrent work but operates at the model-level multi-modal embedding axis, not the cross-lingual gradient within a fixed code-stimulus set. A "to our knowledge, first" qualifier was added to §5.5.
+
+  - **M5 (P3 multi-model probing scope)**: Deferred. This session's experiment is NL-code alignment only (Strategy D scope). P3 cross-lingual probing on the 7-model set is a separate follow-up PR.
+
+  - **M6 (Codestral Embed)**: Excluded for this session. `.env` has no `MISTRAL_API_KEY`, and the user constrained the session to Claude Code-accessible models. Sentence-transformers / open-source HF only.
+
+**Why**: The pre-experiment review caught contamination and baseline-framing issues that, if discovered after results were reported, would have required a paper revision plus a fresh experiment. Catching them before the cross-model extension lets a single PR carry the corrected framing and the new evidence simultaneously.