Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
90 changes: 82 additions & 8 deletions experiments/scripts/run_strategy_d_code_alignment.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,21 +2,32 @@
"""Strategy D: Enhanced NL-Code Alignment (per-language × per-model R_code matrix).

Extends the original NL-code alignment experiment from 2 models × aggregate
to 4 models × 5 languages with per-cell statistical testing.
to 7 models × 5 languages with per-cell statistical testing.

Models:
Models (review-2026-05-21 extension):
1. UniXcoder (code-trained, 768d) — existing
2. MiniLM-L12 (NL-only, 384d) — existing
3. Nomic Embed Text v1.5 (NL+code, 768d) — new
4. E5-large (NL multilingual, 1024d) — existing embedder, new for code alignment
3. Nomic Embed Text v1.5 (NL+code, 768d) — existing
4. E5-large (NL multilingual, 1024d) — existing
5. E5-small (NL multilingual, 384d) — NEW, P1 scale-convergence anchor
6. E5-base (NL multilingual, 768d) — NEW, P1 scale-convergence midpoint
7. BGE-M3 (NL+code multilingual, 1024d) — NEW, top MTEB cross-lingual

NOTE(C3, review-2026-05-21): sentence-transformers pulls the model card's
`main` branch at load time. For this pilot we accept floating-main risk and
rely on EmbeddingCache (`.npz` keyed by (model_name, text_hash)) to freeze
the actual computed embeddings. Explicit `revision=<sha>` pinning is a
future TODO once the matrix lands.

Usage:
python experiments/scripts/run_strategy_d_code_alignment.py
"""

import datetime
import json
import sys
import gc
import platform
from pathlib import Path

import numpy as np
Expand All @@ -37,6 +48,10 @@
("paraphrase-multilingual-MiniLM-L12-v2", "MiniLM-L12 (NL)", {}),
("nomic-ai/nomic-embed-text-v1.5", "Nomic v1.5 (NL+code)", {"trust_remote_code": True}),
("intfloat/multilingual-e5-large", "E5-large (NL)", {}),
# review-2026-05-21 extension (M5 a-default scope: NL-code only)
("intfloat/multilingual-e5-small", "E5-small (NL)", {}),
("intfloat/multilingual-e5-base", "E5-base (NL)", {}),
("BAAI/bge-m3", "BGE-M3 (NL+code)", {}),
]


Expand Down Expand Up @@ -198,16 +213,65 @@ def make_figures(all_results: list[dict]):
print(f" Figure saved: strategy_d_dmatch_bars.png")


def _build_run_meta() -> dict:
    """Capture environment metadata for reproducibility (Minor TODO from review-2026-05-21).

    Returns:
        A JSON-serializable dict holding the UTC start timestamp (ISO 8601
        with a trailing "Z"), the Python / platform / library versions, and
        the seed, n_perm, n_boot and review_id recorded for this run.
    """
    # Version probes are best-effort: an optional dependency that is missing
    # or fails to import is recorded as "unknown" instead of aborting the run.
    try:
        import sentence_transformers as _st
        st_version = _st.__version__
    except Exception:
        st_version = "unknown"
    try:
        import torch
        torch_version = torch.__version__
    except Exception:
        torch_version = "unknown"

    # datetime.utcnow() is deprecated since Python 3.12. Take an aware UTC
    # timestamp and drop tzinfo so isoformat() emits no "+00:00" offset; the
    # resulting "<naive ISO timestamp>Z" string format is unchanged.
    started_at = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)

    return {
        "started_at_utc": started_at.isoformat() + "Z",
        "python": platform.python_version(),
        "platform": platform.platform(),
        "sentence_transformers": st_version,
        "torch": torch_version,
        "numpy": np.__version__,
        # NOTE(review): seed / n_perm / n_boot are literals here, not read
        # from the experiment code -- confirm they match the values actually
        # used by the run.
        "seed": 42,
        "n_perm": 10000,
        "n_boot": 10000,
        "review_id": "review-2026-05-21",
    }


def main():
print("=" * 60)
print("Strategy D: Enhanced NL-Code Alignment")
print("Per-Language × Per-Model R_code Matrix")
print("Per-Language × Per-Model R_code Matrix (7-model extension)")
print("=" * 60)

run_meta = _build_run_meta()
print(f"\n started_at_utc={run_meta['started_at_utc']}")
print(f" python={run_meta['python']} sentence_transformers={run_meta['sentence_transformers']} torch={run_meta['torch']}")
print(f" seed={run_meta['seed']} n_perm={run_meta['n_perm']} n_boot={run_meta['n_boot']}")

all_results = []
failed_models = []
# M3 (review-2026-05-21): per-model try/except so a single OOM / network /
# trust-remote-code failure does not abort the full 7-model sweep.
for model_name, label, kwargs in MODELS:
result = run_model(model_name, label, kwargs)
all_results.append(result)
try:
result = run_model(model_name, label, kwargs)
all_results.append(result)
except Exception as exc: # noqa: BLE001
err = {
"model": model_name,
"label": label,
"error_type": type(exc).__name__,
"error_message": str(exc),
}
failed_models.append(err)
print(
f"\n [SKIP] {label} ({model_name}) failed: "
f"{err['error_type']}: {err['error_message']}",
file=sys.stderr,
)
gc.collect()

# Holm-Bonferroni correction across all per-language p-values
all_p = []
Expand Down Expand Up @@ -268,9 +332,19 @@ def _convert(obj):
if isinstance(obj, (np.bool_,)): return bool(obj)
return obj

run_meta["finished_at_utc"] = datetime.datetime.utcnow().isoformat() + "Z"
run_meta["n_models_attempted"] = len(MODELS)
run_meta["n_models_succeeded"] = len(all_results)
run_meta["failed_models"] = failed_models

payload = {"_meta": run_meta, "results": all_results}
with open(out_path, "w") as f:
json.dump(all_results, f, indent=2, default=_convert)
json.dump(payload, f, indent=2, default=_convert)
print(f"\n Results saved: {out_path}")
if failed_models:
print(f" [WARN] {len(failed_models)} model(s) skipped due to errors:")
for err in failed_models:
print(f" - {err['label']}: {err['error_type']}")


if __name__ == "__main__":
Expand Down
8 changes: 8 additions & 0 deletions experiments/src/code_alignment.py
Original file line number Diff line number Diff line change
Expand Up @@ -216,6 +216,14 @@ def compute_per_language_R_code(
"d_mismatch_mean": float(np.mean(d_mismatch_arr)),
"n_match": len(d_match_arr),
"n_mismatch": len(d_mismatch_arr),
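# C2 (review-2026-05-21): random-matching baseline, sourced from the
# permutation null distribution. Expected ≈ 1.0 under the null: once the
# NL→code pairings are shuffled, "matched" distances should be drawn from
# the same pool as mismatched ones, so mean(d_mismatch)/mean(d_match) has no
# systematic reason to deviate from 1 (assumes perm_Rs holds the shuffled-
# pairing ratios -- confirm against the permutation loop). Used in paper
# §5.5 to anchor R_code = 1 as the null line rather than as an
# asserted-but-unmeasured baseline.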
"random_baseline_R_mean": float(np.mean(perm_Rs)),
"random_baseline_R_std": float(np.std(perm_Rs)),
"random_baseline_R_p95": float(np.percentile(perm_Rs, 95)),
}

# Aggregate (all languages pooled)
Expand Down
10 changes: 9 additions & 1 deletion paper/main.tex
Original file line number Diff line number Diff line change
Expand Up @@ -460,7 +460,7 @@ \subsection{Pilot Experiment and Results}\label{sec:pilot}
P7 is \textbf{strongly supported}: Korean spacing variants cluster ${\sim}3\times$ closer than semantically different operations. This holds across both models with tight bootstrap confidence intervals. Byte-level models (ByT5, CANINE) should achieve higher ratios.

\paragraph{NL-Code Cross-Modal Alignment.}
To directly test PRH for code, we embed 50 computational NL descriptions alongside their Python code equivalents through four models: UniXcoder~\cite{unixcoder} (code-trained, 768d), MiniLM-L12 (NL-only, 384d), Nomic Embed Text v1.5 (768d), and E5-large (1024d). We define $R_{\text{code}} = d_{\text{mismatch}} / d_{\text{match}}$, where $d_{\text{match}}$ is the distance between an NL description and its corresponding code, and $d_{\text{mismatch}}$ is the distance to a different operation's code. We compute per-language $R_{\text{code}}$ with permutation tests ($n=10{,}000$) and bootstrap confidence intervals, corrected via Holm-Bonferroni across 20 cells.
To directly test PRH for code, we embed 50 computational NL descriptions alongside their Python code equivalents through four models: UniXcoder~\cite{unixcoder} (code-trained, 768d), MiniLM-L12 (NL-only, 384d), Nomic Embed Text v1.5 (768d), and E5-large (1024d). We define $R_{\text{code}} = d_{\text{mismatch}} / d_{\text{match}}$, where $d_{\text{match}}$ is the distance between an NL description and its corresponding code, and $d_{\text{mismatch}}$ is the distance to a different operation's code. We compute per-language $R_{\text{code}}$ with permutation tests ($n=10{,}000$; shuffled NL-code pairings serve as the random-matching baseline with null $R \approx 1$) and bootstrap confidence intervals ($n=10{,}000$), corrected via Holm-Bonferroni across 20 cells. To our knowledge, this is the first per-language $\times$ per-model NL-code alignment matrix reported in the cross-lingual representation literature; concurrent work on omnilingual sentence-code embeddings extends the modality set at the model level rather than measuring the cross-lingual gradient within a fixed code-stimulus set.

\begin{table}[h]
\centering
Expand All @@ -484,6 +484,8 @@ \subsection{Pilot Experiment and Results}\label{sec:pilot}

\paragraph{Lexical overlap control.} A potential confound: NL descriptions share tokens with their code equivalents (``sort'' appears in both ``Sort the list'' and \texttt{sorted(lst)}). Token overlap correlates with $d_{\text{match}}$ (Spearman $\rho = -0.51$, $p < 0.001$ for MiniLM), confirming a lexical component. However, $R_{\text{code}} > 1$ survives two controls. First, for the 32/50 operations with \emph{zero} token overlap (after stemming), $R_{\text{code}}$ remains above 1 in all three models (1.06--1.18). Second, obfuscating variable names in code (\texttt{lst}$\to$\texttt{v0}, \texttt{s}$\to$\texttt{v0}) reduces $R_{\text{code}}$ by only 1.6--5.4\%, and all models retain $R_{\text{code}} > 1$. Lexical overlap inflates the effect but does not create it: the alignment is primarily semantic.

\paragraph{Pretraining contamination caveat.} A stronger confound than surface lexical overlap is direct memorization. All 50 computational operations are Python standard-library idioms (\texttt{sorted(lst)}, \texttt{max(lst)}, \texttt{len(lst)}, \texttt{collections.Counter(lst)}, ...), so the embedding models almost certainly observed exact or near-exact NL$\leftrightarrow$code co-occurrences during pretraining (docstrings, Stack Overflow, GitHub README files). The token-overlap controls above bound the \emph{surface lexical} component of the alignment but not the \emph{operation-level memorization} component. We therefore interpret $R_{\text{code}} > 1$ as evidence that NL-code alignment is \emph{at least as strong as} pretraining co-occurrence statistics would predict, not as independent evidence for $\Zsem$ convergence beyond what training-data overlap explains. Decisive separation requires either out-of-distribution operations (novel composite stimuli---\texttt{tier2\_multistep.json} and \texttt{tier3\_compositional.json} are released in the experiment repository for this purpose) or matched-perplexity controls; both are left to future work.

Critically, this resolves the P2 result. P2 measured NL-NL cross-lingual invariance and found computational operations \emph{less} invariant---a finding the vocabulary mediation and language-pair analyses explain as a property of domain-specific terminology. The NL-code experiment shows that despite this description-level divergence, NL-code alignment is positive across all 20 cells. The four results form a coherent picture: (i)~computational vocabulary drives cross-lingual description divergence (vocabulary mediation); (ii)~this divergence is uniform across language pairs (language-pair decomposition); (iii)~yet NL-code alignment holds in every language and model (20/20 significant); (iv)~the alignment is modulated by $\Dtrain$, not eliminated. Description-level invariance and execution-level convergence are distinct phenomena---\textbf{convergence $\neq$ communicability}.

\paragraph{P7 Extension: Punctuation Robustness.}
Expand Down Expand Up @@ -630,6 +632,12 @@ \section*{Limitations}

\textbf{Vocabulary mediation analysis has limited power.} With $n=50$ operations per category and Bonferroni correction across 8 features, the minimum detectable effect is $|\rho| \geq 0.35$ (pooled) or $|\rho| \geq 0.48$ (within-category). Moderate effects may be missed. The language-pair decomposition uses ordinal typological ranks assigned by the authors, not an independent typological distance metric.

\textbf{Stimulus complexity.} The 50 computational operations are single-statement Python standard-library idioms (\texttt{sorted}, \texttt{max}, list comprehensions of depth $\leq 1$). Multi-statement composition and tier-2 / tier-3 stimuli (\texttt{experiments/data/stimuli/tier2\_multistep.json}, \texttt{tier3\_compositional.json}) are released in the repository but not yet analyzed; they are left to future work. Conclusions therefore apply to the population of stdlib-idiom-level operations, not to programming complexity more broadly.

\textbf{Translation provenance.} The 5-language stimulus set was produced by the first author (Korean L1) with LLM-assisted draft translation and bilingual review for Spanish, Chinese, and Arabic. No formal inter-annotator agreement (Cohen's $\kappa$, ICC) was computed---a recognized weakness given $n=5$ languages. This affects all cross-lingual claims in this paper.

\textbf{Pretraining contamination of NL-code stimuli.} As detailed in \S\ref{sec:pilot}, the 50 computational operations are common stdlib idioms with high probability of appearing in every embedding model's pretraining corpus. The $R_{\text{code}} > 1$ finding may therefore be explained, in part or in whole, by training-data co-occurrence statistics; it is not independent evidence for representation-level $\Zsem$ convergence beyond pretraining memorization.

\textbf{Disambiguation conservation is a design heuristic.} The information-theoretic formalization (Eq.~\ref{eq:conservation}) draws on Kolmogorov complexity, which is uncomputable in general. The conservation principle guides system design but is not a formal theorem.

\textbf{Neuroscience connections are structural analogies.} We draw parallels between PRH convergence and Hyperalignment, and between the communicability gap and the private language problem. These are structural correspondences, not claims of mechanistic identity between artificial and biological neural networks.
Expand Down
28 changes: 28 additions & 0 deletions planning/decisions.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,3 +38,31 @@ Format: `## YYYY-MM-DD -- <short title>` with **Context**, **Decision**, **Why**
**Decision**: Freeze the submission copies as venue-specific snapshots. paper/main.tex is the preferred version going forward. Do not auto-sync.

**Why**: The divergence reflects real editorial work (COLM-specific framing, EMNLP review responses). Collapsing to one auto-synced source would erase those decisions; keeping them frozen lets the user reconcile manually.

---

## 2026-05-21 -- Pre-experiment research review for Strategy D cross-model extension

**Context**: Before extending `run_strategy_d_code_alignment.py` from 4 to 7 embedding models, a 9-dimension review surfaced three Critical issues and three Major issues that needed to be reflected in `paper/main.tex` (limitations + main text) before re-running the experiment, so the experiment is not invalidated by a known reviewer-side weakness discovered after the fact.

**Decisions**:

- **C1 (Pretraining contamination)**: The 50 computational stimuli are all Python stdlib idioms (`sorted`, `max`, `len`, ...). Embedding models almost certainly saw these exact NL↔code pairings during pretraining. Added a `Pretraining contamination caveat` paragraph to `paper/main.tex` §5.5 and a corresponding bullet to Limitations. `R_code > 1` is now interpreted as "at least as strong as pretraining co-occurrence would predict," not as independent evidence for Z_sem convergence. Decisive separation deferred to tier-2/tier-3 OOD stimuli (already in `experiments/data/stimuli/` but unanalyzed).

- **C2 (Random matching baseline)**: The permutation test (n=10,000) is now explicitly framed in the main text as the random-matching baseline with null R ≈ 1. The shuffled-pairing R distribution mean will be exported to `results/strategy_d_code_alignment.json` per language for transparency.

- **C3 (HuggingFace model revision pin)**: `sentence-transformers` pulls the model card's `main` branch at load time. For this pilot we accept the floating-`main` risk and rely on the existing `EmbeddingCache` (`.npz` keyed by `(model_name, text_hash)`) to freeze the actual computed embeddings. Pinning via `revision=` is left as a Minor TODO once the cross-model matrix lands.

- **M1 (Trivial stimuli)**: Added `Stimulus complexity` paragraph to Limitations stating that conclusions apply to stdlib-idiom-level operations only; tier-2/tier-3 stimuli exist but are not yet analyzed.

- **M2 (Translation provenance)**: Added `Translation provenance` paragraph stating that translations were produced by the first author with LLM-assisted draft and bilingual review; no formal IAA. This is acknowledged as a recognized weakness for cross-lingual claims.

- **M3 (Model robustness wrap)**: `run_strategy_d_code_alignment.py` model loop wrapped in `try/except` per model so a single OOM/network/trust-remote-code failure does not abort the full 7-model sweep.

- **M4 (Prior art)**: Web search confirmed no per-language × per-model NL-code alignment matrix exists in the cross-lingual representation literature as of 2026-05. OmniSONAR (Meta, 2026.03; arXiv:2603.16606) is the closest concurrent work but operates at the model-level multi-modal embedding axis, not the cross-lingual gradient within a fixed code-stimulus set. A "to our knowledge, first" qualifier was added to §5.5.

- **M5 (P3 multi-model probing scope)**: Deferred. This session's experiment is NL-code alignment only (Strategy D scope). P3 cross-lingual probing on the 7-model set is a separate follow-up PR.

- **M6 (Codestral Embed)**: Excluded for this session. `.env` has no `MISTRAL_API_KEY`, and the user constrained the session to Claude Code-accessible models. Sentence-transformers / open-source HF only.

**Why**: The pre-experiment review caught contamination and baseline-framing issues that, if discovered after results were reported, would have required a paper revision plus a fresh experiment. Catching them before the cross-model extension lets a single PR carry the corrected framing and the new evidence simultaneously.
Loading