diff --git a/experiments/README.md b/experiments/README.md index 49182af..ea41efa 100644 --- a/experiments/README.md +++ b/experiments/README.md @@ -10,6 +10,7 @@ Tests predictions P1–P7 from the paper: cross-lingual semantic invariance (P2) - **Hardware**: CPU sufficient for the 100-op pilot (~3–5h end-to-end across 7 embedding models). MPS/CUDA optional and only used by `scripts/run_v2_extract.py` for 8B decoder hidden-state extraction. - **External APIs**: OpenAI Embeddings (`text-embedding-3-small`/`-large`) and Mistral Codestral Embed (`codestral-embed-2505`). Both calls now retry on 429/5xx with exponential backoff (`max_retries=5`). - **Data sent to providers**: synthetic stimuli only (`data/stimuli/*.json`). No PII. +- **Model weights**: HuggingFace commit SHAs pinned in `src/model_registry.py` (frozen on 2026-05-21); sentence-transformers `>=5.5` honors the `revision=` kwarg. Embedding-level reproducibility is additionally guaranteed by the `.npz` cache in `results/embeddings/` keyed by `(model_name, text_hash)`. To refresh the registry, see the helper snippet at the bottom of `model_registry.py`. ## Setup diff --git a/experiments/scripts/run_strategy_d_code_alignment.py b/experiments/scripts/run_strategy_d_code_alignment.py index 4aa5a58..b48a5cb 100644 --- a/experiments/scripts/run_strategy_d_code_alignment.py +++ b/experiments/scripts/run_strategy_d_code_alignment.py @@ -13,11 +13,11 @@ 6. E5-base (NL multilingual, 768d) — NEW, P1 scale-convergence midpoint 7. BGE-M3 (NL+code multilingual, 1024d) — NEW, top MTEB cross-lingual -NOTE(C3, review-2026-05-21): sentence-transformers pulls the model card's -`main` branch at load time. For this pilot we accept floating-main risk and -rely on EmbeddingCache (`.npz` keyed by (model_name, text_hash)) to freeze -the actual computed embeddings. Explicit `revision=` pinning is a -future TODO once the matrix lands. 
+Models and their HuggingFace `revision=` SHAs are centralized in +`src/model_registry.py` (closes C3 from the 2026-05-21 review). The +EmbeddingCache (`.npz` keyed by (model_name, text_hash)) still provides +embedding-level reproducibility; the SHA pin adds upstream-mutation +protection. Usage: python experiments/scripts/run_strategy_d_code_alignment.py @@ -37,23 +37,15 @@ from src.stimuli import get_all_operations, LANGUAGES from src.embeddings import SentenceTransformerEmbedder, EmbeddingCache from src.code_alignment import CODE_EQUIVALENTS, compute_per_language_R_code +from src.model_registry import MODELS_7_FROZEN, registry_sha_summary + +MODELS = MODELS_7_FROZEN ROOT = Path(__file__).parent.parent RESULTS_DIR = ROOT / "results" FIGURES_DIR = RESULTS_DIR / "figures" CACHE_DIR = RESULTS_DIR / "embeddings" -MODELS = [ - ("microsoft/unixcoder-base", "UniXcoder (code)", {}), - ("paraphrase-multilingual-MiniLM-L12-v2", "MiniLM-L12 (NL)", {}), - ("nomic-ai/nomic-embed-text-v1.5", "Nomic v1.5 (NL+code)", {"trust_remote_code": True}), - ("intfloat/multilingual-e5-large", "E5-large (NL)", {}), - # review-2026-05-21 extension (M5 a-default scope: NL-code only) - ("intfloat/multilingual-e5-small", "E5-small (NL)", {}), - ("intfloat/multilingual-e5-base", "E5-base (NL)", {}), - ("BAAI/bge-m3", "BGE-M3 (NL+code)", {}), -] - def run_model(model_name: str, label: str, kwargs: dict) -> dict: """Run per-language R_code for one model.""" @@ -236,6 +228,7 @@ def _build_run_meta() -> dict: "n_perm": 10000, "n_boot": 10000, "review_id": "review-2026-05-21", + "model_revisions": registry_sha_summary(), } diff --git a/experiments/scripts/run_strategy_e_multimodel_probing.py b/experiments/scripts/run_strategy_e_multimodel_probing.py index b09d177..b012f81 100644 --- a/experiments/scripts/run_strategy_e_multimodel_probing.py +++ b/experiments/scripts/run_strategy_e_multimodel_probing.py @@ -38,23 +38,16 @@ from src.stimuli import get_all_operations, LANGUAGES from src.embeddings 
import SentenceTransformerEmbedder, EmbeddingCache +from src.model_registry import MODELS_7_FROZEN, registry_sha_summary ROOT = Path(__file__).parent.parent RESULTS_DIR = ROOT / "results" FIGURES_DIR = RESULTS_DIR / "figures" CACHE_DIR = RESULTS_DIR / "embeddings" -# Same 7-model set as Strategy D (run_strategy_d_code_alignment.py). -# Kept in sync manually; consider a shared model_registry.py if extended. -MODELS = [ - ("microsoft/unixcoder-base", "UniXcoder (code)", {}), - ("paraphrase-multilingual-MiniLM-L12-v2", "MiniLM-L12 (NL)", {}), - ("nomic-ai/nomic-embed-text-v1.5", "Nomic v1.5 (NL+code)", {"trust_remote_code": True}), - ("intfloat/multilingual-e5-small", "E5-small (NL)", {}), - ("intfloat/multilingual-e5-base", "E5-base (NL)", {}), - ("intfloat/multilingual-e5-large", "E5-large (NL)", {}), - ("BAAI/bge-m3", "BGE-M3 (NL+code)", {}), -] +# Frozen 7-model set with HuggingFace revision SHAs pinned at 2026-05-21. +# See experiments/src/model_registry.py. +MODELS = MODELS_7_FROZEN # Random seed mirrors Strategy D for cross-experiment consistency SEED = 42 @@ -232,6 +225,7 @@ def _build_run_meta() -> dict: "seed": SEED, "review_id": "review-2026-05-21", "closes": "M5 (multi-model P3 probing)", + "model_revisions": registry_sha_summary(), } diff --git a/experiments/scripts/run_strategy_f_ood_alignment.py b/experiments/scripts/run_strategy_f_ood_alignment.py index d9b156e..df94f79 100644 --- a/experiments/scripts/run_strategy_f_ood_alignment.py +++ b/experiments/scripts/run_strategy_f_ood_alignment.py @@ -47,6 +47,7 @@ from src.embeddings import SentenceTransformerEmbedder, EmbeddingCache from src.code_alignment import compute_per_language_R_code +from src.model_registry import MODELS_7_FROZEN, registry_sha_summary ROOT = Path(__file__).parent.parent DATA_DIR = ROOT / "data" / "stimuli" @@ -57,15 +58,9 @@ LANGUAGES = ["en", "ko", "zh", "ar", "es"] SEED = 42 -MODELS = [ - ("microsoft/unixcoder-base", "UniXcoder (code)", {}), - 
("paraphrase-multilingual-MiniLM-L12-v2", "MiniLM-L12 (NL)", {}), - ("nomic-ai/nomic-embed-text-v1.5", "Nomic v1.5 (NL+code)", {"trust_remote_code": True}), - ("intfloat/multilingual-e5-small", "E5-small (NL)", {}), - ("intfloat/multilingual-e5-base", "E5-base (NL)", {}), - ("intfloat/multilingual-e5-large", "E5-large (NL)", {}), - ("BAAI/bge-m3", "BGE-M3 (NL+code)", {}), -] +# Frozen 7-model set with HuggingFace revision SHAs pinned at 2026-05-21. +# See experiments/src/model_registry.py. +MODELS = MODELS_7_FROZEN def load_ood_stimuli() -> tuple[list[dict], dict[str, str]]: @@ -214,6 +209,7 @@ def _build_run_meta() -> dict: "n_boot": 10000, "review_id": "review-2026-05-21", "closes": "C1 deferred portion (contamination via OOD stimuli)", + "model_revisions": registry_sha_summary(), } diff --git a/experiments/src/model_registry.py b/experiments/src/model_registry.py new file mode 100644 index 0000000..92f79fe --- /dev/null +++ b/experiments/src/model_registry.py @@ -0,0 +1,69 @@ +"""Frozen model registry for Strategy D / E / F experiments. + +Each model is pinned to the HuggingFace `main` branch commit SHA observed +on 2026-05-21 via `huggingface_hub.HfApi().model_info(repo).sha`. +sentence-transformers >=5.5 honors the `revision=` kwarg in +SentenceTransformer.__init__, so the experiments load exactly the weights +captured at review time even if the upstream `main` branch moves later. + +This closes C3 from the 2026-05-21 pre-experiment review: explicit +revision pin in addition to the existing embedding-level `.npz` cache. + +To refresh: re-run the snippet at the bottom of this file and commit the +new SHAs as a single chore PR. Do NOT update individual model SHAs +silently — keeping all 7 frozen at the same review point lets cross- +experiment comparisons (D / E / F) stay valid. 
+""" + +from __future__ import annotations + +# Frozen SHA snapshot: 2026-05-21 +MODELS_7_FROZEN: list[tuple[str, str, dict]] = [ + ("microsoft/unixcoder-base", "UniXcoder (code)", { + "revision": "5604afdc964f6c53782a6813140ade5216b99006", + }), + ("paraphrase-multilingual-MiniLM-L12-v2", "MiniLM-L12 (NL)", { + # sentence-transformers/* namespace, but sentence-transformers + # library auto-prefixes when the bare model name is used. + "revision": "e8f8c211226b894fcb81acc59f3b34ba3efd5f42", + }), + ("nomic-ai/nomic-embed-text-v1.5", "Nomic v1.5 (NL+code)", { + "trust_remote_code": True, + "revision": "e9b6763023c676ca8431644204f50c2b100d9aab", + }), + ("intfloat/multilingual-e5-small", "E5-small (NL)", { + "revision": "614241f622f53c4eeff9890bdc4f31cfecc418b3", + }), + ("intfloat/multilingual-e5-base", "E5-base (NL)", { + "revision": "d128750597153bb5987e10b1c3493a34e5a4502a", + }), + ("intfloat/multilingual-e5-large", "E5-large (NL)", { + "revision": "3d7cfbdacd47fdda877c5cd8a79fbcc4f2a574f3", + }), + ("BAAI/bge-m3", "BGE-M3 (NL+code)", { + "revision": "5617a9f61b028005a4858fdac845db406aefb181", + }), +] + + +def registry_sha_summary() -> dict: + """Return a serializable model -> revision mapping for run_meta dumps.""" + return {model: kwargs.get("revision", "unpinned") for model, _, kwargs in MODELS_7_FROZEN} + + +# --------------------------------------------------------------------------- +# Refresh helper (manual; NOT called by experiments) +# --------------------------------------------------------------------------- +# Run interactively when you intentionally want to roll the frozen SHAs: +# +# experiments/.venv/bin/python -c " +# from huggingface_hub import HfApi +# from experiments.src.model_registry import MODELS_7_FROZEN +# api = HfApi() +# for model, label, kwargs in MODELS_7_FROZEN: +# info = api.model_info(model) +# print(f' ({model!r}, {label!r}, {{\"revision\": {info.sha!r}, ...}}),') +# " +# +# Review the diff, commit as a chore PR, and re-run 
Strategy D / E / F so the +# results JSON _meta blocks pick up the new SHAs. Clear `results/embeddings/` before re-running: the `.npz` cache is keyed by (model_name, text_hash), not by revision, so cached vectors computed from the old weights would otherwise be reused under the new SHAs. diff --git a/planning/decisions.md b/planning/decisions.md index a8b7c54..31920c0 100644 --- a/planning/decisions.md +++ b/planning/decisions.md @@ -122,3 +122,18 @@ Format: `## YYYY-MM-DD -- ` with **Context**, **Decision**, **Why** - Limitations "Pretraining contamination of NL-code stimuli" bullet renamed to "(partially addressed)" with summary of OOD result; residual matched-perplexity work remains future. **Why**: This was the single most important deferred item because the contamination caveat (added in PR #3 for paper integrity) explicitly predicted a directional outcome. Running the test and reporting the result---in either direction---is what distinguishes the caveat from rhetorical hedging. The observed direction (OOD effect stronger than tier-1) is the strongest empirical anchor for the paper's PRH-for-code claim that the embedding-only paradigm can produce. + +--- + +## 2026-05-21 -- Model registry with frozen HuggingFace SHAs (closes C3) + +**Context**: The C3 fix in PR #3 accepted floating-`main` risk for the pilot and relied on the existing `EmbeddingCache` for embedding-level reproducibility. After Strategy D / E / F all landed using the same 7-model set, the cost of pinning revision SHAs became trivial (one fetch via `huggingface_hub.HfApi`) and the benefit grew (any reviewer re-running the pipeline 6 months from now would otherwise pull a moved `main`). + +**Decisions**: + + - Added `experiments/src/model_registry.py` with `MODELS_7_FROZEN` — the 7 (model_name, label, kwargs) tuples used by Strategy D / E / F, each pinned to its `main` commit SHA observed on 2026-05-21 via `HfApi().model_info(repo).sha`. `registry_sha_summary()` returns a JSON-serializable mapping for `run_meta` blocks. + - sentence-transformers `>=5.5` accepts `revision=` in `SentenceTransformer.__init__`; confirmed via `inspect.signature`. 
+ - Refactored Strategy D / E / F runners to `from src.model_registry import MODELS_7_FROZEN, registry_sha_summary` and replaced their inline MODELS lists. Each runner's `run_meta` now includes `model_revisions` so the SHAs are recorded in every results JSON for forensic reproducibility. + - `experiments/README.md` Reproducibility envelope bullet added: model-weight pinning policy + pointer to the registry's refresh snippet. + +**Why**: C3 was originally classified as a Minor TODO because the embedding cache covered the practical reproducibility need. Centralizing the registry now (rather than after another experiment lands) prevents future SHA drift between runners and gives reviewers a single auditable location for "which exact weights did this paper use?"