heznpc · heznpc · May 20, 2026 · May 20, 2026
diff --git a/.github/workflows/reproduce-smoke.yml b/.github/workflows/reproduce-smoke.yml
@@ -0,0 +1,59 @@
+name: reproduce-smoke
+
+# Lightweight CI: validates that the experimental pipeline imports cleanly
+# and that the stimuli JSON parses. Does NOT execute the pilot (would require
+# OpenAI/Mistral API keys and ~3-5h of compute). Intended to catch
+# requirements drift and JSON corruption on every push.
+
+on:
+  push:
+    branches: [main]
+    paths:
+      - 'experiments/**'
+      - 'pyproject.toml'
+      - '.python-version'
+      - 'Makefile'
+      - '.github/workflows/reproduce-smoke.yml'
+  pull_request:
+    branches: [main]
+    paths:
+      - 'experiments/**'
+      - 'pyproject.toml'
+      - '.python-version'
+      - 'Makefile'
+      - '.github/workflows/reproduce-smoke.yml'
+  workflow_dispatch:
+
+permissions:
+  contents: read
+
+jobs:
+  smoke:
+    runs-on: ubuntu-latest
+    timeout-minutes: 15
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version-file: '.python-version'
+          cache: 'pip'
+          cache-dependency-path: experiments/requirements.txt
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r experiments/requirements.txt
+
+      - name: Import smoke test
+        run: |
+          python -c "import numpy, scipy, sklearn, matplotlib, seaborn, sentence_transformers, openai, requests, kiwipiepy; print('imports OK')"
+
+      # Parses every stimuli file as UTF-8 JSON. Fails when the glob matches
+      # nothing, so a moved or emptied stimuli directory cannot pass vacuously.
+      - name: Validate stimuli JSON
+        run: |
+          python -c "import json, pathlib; files = sorted(pathlib.Path('experiments/data/stimuli').glob('*.json')); assert files, 'no stimuli JSON files found under experiments/data/stimuli'; [json.loads(p.read_text(encoding='utf-8')) for p in files]; print(f'{len(files)} stimuli JSON files parse OK')"
+
+      # Counts the lines in experiments/**/*.py that match
+      # default_rng(seed | default_rng(42 | default_rng(RNG_SEED and fails
+      # the job when fewer than 10 such lines are found.
+      - name: Verify random seed pinning
+        run: |
+          python -c "import re, pathlib; pat = re.compile(r'default_rng\\((seed|42|RNG_SEED)'); hits = sum(1 for p in pathlib.Path('experiments').rglob('*.py') for line in p.read_text().splitlines() if pat.search(line)); print(f'seed-pinned RNG sites: {hits}'); assert hits >= 10, 'expected >=10 seeded RNG sites'"
diff --git a/.python-version b/.python-version
@@ -0,0 +1 @@
+3.12
diff --git a/Makefile b/Makefile
@@ -0,0 +1,39 @@
+# Single entry point for reproducing the z-gap pilot.
+# See experiments/README.md for the reproducibility envelope.
+
+.PHONY: help setup smoke reproduce figures clean
+
+PYTHON ?= python3
+VENV   := experiments/.venv
+PIP    := $(VENV)/bin/pip
+PY     := $(VENV)/bin/python
+
+# Print the list of available targets, one per line.
+help:
+	@printf '%s\n' \
+		"Targets:" \
+		"  setup      Create experiments/.venv and install requirements" \
+		"  smoke      Validate imports and stimuli JSON without running the pipeline" \
+		"  reproduce  Run the full pilot (scripts/run_all.py)" \
+		"  figures    Re-run cross-experiment synthesis only" \
+		"  clean      Remove venv and Python caches (keeps results/embeddings/)"
+
+# Create the virtualenv at $(VENV) and install the requirements into it.
+# The venv is created from the repository root (no `cd`), so a relative
+# PYTHON=... override still resolves, and pip is driven through the venv's
+# interpreter (`python -m pip`), which is the supported way to upgrade pip.
+setup:
+	$(PYTHON) -m venv $(VENV)
+	$(PY) -m pip install --upgrade pip
+	$(PY) -m pip install -r experiments/requirements.txt
+	@echo "Setup complete."
+	@echo "Next: cp experiments/.env.example experiments/.env  # add OPENAI_API_KEY, MISTRAL_API_KEY"
+
+# Offline sanity check: import the third-party dependencies, then parse every
+# stimuli JSON file as UTF-8. Fails if no stimuli files are found, so an
+# empty or relocated data directory cannot pass vacuously.
+smoke:
+	$(PY) -c "import numpy, scipy, sklearn, matplotlib, seaborn, sentence_transformers, openai, requests, kiwipiepy; print('smoke OK')"
+	$(PY) -c "import json, pathlib; files = sorted(pathlib.Path('experiments/data/stimuli').glob('*.json')); assert files, 'no stimuli JSON files found under experiments/data/stimuli'; [json.loads(p.read_text(encoding='utf-8')) for p in files]; print('stimuli JSON OK')"
+
+# Run the full pilot from inside experiments/ with the venv interpreter.
+reproduce:
+	cd experiments && ../$(PY) scripts/run_all.py
+
+# Re-run only the cross-experiment synthesis script with the venv interpreter.
+figures:
+	cd experiments && ../$(PY) scripts/run_cross_experiment_synthesis.py
+
+# Delete the virtualenv, the pytest cache and every __pycache__ directory.
+clean:
+	rm -rf experiments/.venv
+	rm -rf experiments/.pytest_cache
+	find . -name "__pycache__" -type d -prune -exec rm -rf {} +
+	@echo "Caches cleared. experiments/results/ preserved."
diff --git a/experiments/README.md b/experiments/README.md
@@ -2,21 +2,43 @@
 
 Tests predictions P1–P7 from the paper: cross-lingual semantic invariance (P2), dialect continuum (P2-dialect), NL-code alignment (P3), spacing robustness (P7), and scale-convergence (P1).
 
+## Reproducibility envelope
+
+- **Python**: `3.11` or `3.12` (CI uses `3.12`, as pinned in `.python-version`; `kiwipiepy` and `torch` wheels for `3.13` are not yet consistently available)
+- **Random seed**: `np.random.default_rng(42)` is used throughout `src/predictions.py`, `src/metrics.py`, `src/code_alignment.py`, `src/vocab_mediation.py`, `src/hidden_state_analysis.py`, and the strategy runners (17+ call sites). Override via the `seed` kwarg where exposed.
+- **OS**: tested on macOS (Apple Silicon) and Ubuntu 24.04. Windows untested.
+- **Hardware**: CPU sufficient for the 100-op pilot (~3–5h end-to-end across 7 embedding models). MPS/CUDA optional and only used by `scripts/run_v2_extract.py` for 8B decoder hidden-state extraction.
+- **External APIs**: OpenAI Embeddings (`text-embedding-3-small`/`-large`) and Mistral Codestral Embed (`codestral-embed-2505`). Both calls now retry on 429/5xx with exponential backoff (`max_retries=5`).
+- **Data sent to providers**: synthetic stimuli only (`data/stimuli/*.json`). No PII.
+
 ## Setup
 
+From the repository root:
+
+```bash
+make setup     # creates experiments/.venv with `python3` (override: make setup PYTHON=python3.12) and installs requirements
+cp experiments/.env.example experiments/.env   # add OPENAI_API_KEY, MISTRAL_API_KEY
+make smoke     # validates imports + stimuli JSON without burning API budget
+```
+
+Or manually:
+
 ```bash
 cd experiments
 python -m venv .venv && source .venv/bin/activate
 pip install -r requirements.txt
-cp .env.example .env  # add API keys (OpenAI, Mistral)
+cp .env.example .env
 ```
 
 ## Run
 
 ```bash
-python scripts/run_all.py
+make reproduce     # full pipeline (scripts/run_all.py)
+make figures       # cross-experiment synthesis only
 ```
 
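+The CI workflow (`.github/workflows/reproduce-smoke.yml`) runs the same checks as `make smoke` on pushes to `main` and on pull requests that touch `experiments/`, `pyproject.toml`, `.python-version`, `Makefile`, or the workflow file itself; it can also be triggered manually. It confirms that imports succeed, that the stimuli JSON parses, and that seeded `default_rng(...)` call sites are still present. It makes no API calls, so it does not burn API credits.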
+
 ## What it does
 
 1. Generates 100 stimuli (50 computational + 50 judgment) × 5 languages × dialectal variants (~1,800 total)
@@ -43,3 +65,17 @@ python scripts/run_all.py
 - **P2**: R\_C > R\_J for computational vs judgment operations
 - **P2-dialect**: R degrades continuously: within-dialect > cross-dialect > cross-lingual
 - **P7**: R\_spacing > 1 — spacing variation produces less Z distance than semantic variation
+
+## Logging
+
+Scripts ship with `print()` for transcript continuity. New code should use the helper:
+
+```python
+from src.logging_config import configure_logging, get_logger
+
+configure_logging()                    # call once at entry point
+logger = get_logger(__name__)
+logger.info("Embedded %d stimuli", n)
+```
+
+Set `Z_GAP_LOG_LEVEL=DEBUG` to escalate. Output goes to stderr so stdout stays clean for redirection.
diff --git a/experiments/src/embeddings.py b/experiments/src/embeddings.py
@@ -48,7 +48,9 @@ def __init__(self, model: str = "text-embedding-3-small"):
         import openai
         from dotenv import load_dotenv
         load_dotenv()
-        self._client = openai.OpenAI()
+        # max_retries=5 covers transient 429/5xx during multi-model sweeps.
+        # SDK uses exponential backoff with jitter internally.
+        self._client = openai.OpenAI(max_retries=5, timeout=60.0)
         self._model = model
         self._dim = 1536 if "small" in model else 3072
 
@@ -83,18 +85,38 @@ def __init__(self, model: str = "codestral-embed-2505"):
         self._api_key = os.environ["MISTRAL_API_KEY"]
         self._model = model
         self._dim = 1024
+        self._session = self._make_session()
 
-    def encode(self, texts: list[str]) -> np.ndarray:
+    @staticmethod
+    def _make_session():
+        """Build a ``requests.Session`` that retries HTTPS POSTs on 429 + 5xx.
+
+        Returns:
+            A session whose ``https://`` adapter carries a urllib3 ``Retry``
+            policy: at most 5 retries, POST only, triggered by status 429,
+            500, 502, 503 or 504, honouring the ``Retry-After`` header.
+
+        NOTE(review): the original docstring says this matches OpenAI SDK
+        behavior; that is not verifiable from this file — confirm.
+        """
         import requests
+        from requests.adapters import HTTPAdapter
+        from urllib3.util.retry import Retry
+
+        retry = Retry(
+            total=5,
+            backoff_factor=1.0,  # exponential backoff; NOTE(review): urllib3 documents the first retry as immediate, so the delays are likely 0s, 2s, 4s, 8s, 16s rather than 1s..16s — confirm
+            status_forcelist=(429, 500, 502, 503, 504),
+            allowed_methods=frozenset(["POST"]),
+            respect_retry_after_header=True,
+            # Presumably leaves the final error response to the caller's
+            # raise_for_status() instead of raising from the adapter — confirm.
+            raise_on_status=False,
+        )
+        session = requests.Session()
+        # Only https:// URLs get the retrying adapter.
+        session.mount("https://", HTTPAdapter(max_retries=retry))
+        return session
+
+    def encode(self, texts: list[str]) -> np.ndarray:
         from tqdm import tqdm
         results = []
         batch_size = 50
         for i in tqdm(range(0, len(texts), batch_size), desc=f"Mistral {self._model}"):
             batch = texts[i:i + batch_size]
-            resp = requests.post(
+            resp = self._session.post(
                 "https://api.mistral.ai/v1/embeddings",
                 headers={"Authorization": f"Bearer {self._api_key}"},
                 json={"model": self._model, "input": batch},
+                timeout=60,
             )
             resp.raise_for_status()
             data = resp.json()["data"]

diff --git a/experiments/src/logging_config.py b/experiments/src/logging_config.py
@@ -0,0 +1,63 @@
+"""Centralized logging configuration for z-gap experiments.
+
+Existing scripts (run_all.py, run_strategy_*.py) use bare ``print()`` calls;
+those are preserved to avoid changing user-visible output. New code should
+prefer ``get_logger()`` so long multi-model runs produce greppable,
+level-tagged output that survives redirection.
+
+Usage:
+    from src.logging_config import configure_logging, get_logger
+
+    configure_logging()              # call once at entry point
+    logger = get_logger(__name__)
+    logger.info("Running %s on %d stimuli", model.name, len(texts))
+
+Level is taken from ``Z_GAP_LOG_LEVEL`` env var if set, else the ``level``
+argument (default INFO).
+"""
+
+from __future__ import annotations
+
+import logging
+import os
+import sys
+
+
+_DEFAULT_FORMAT = "%(asctime)s %(levelname)s %(name)s: %(message)s"
+_PLAIN_FORMAT = "%(message)s"
+
+
+def configure_logging(
+    level: str | int = "INFO",
+    *,
+    with_timestamp: bool = True,
+    stream=None,
+) -> None:
+    """Configure the root logger. Idempotent; later calls override format.
+
+    Args:
+        level: ``logging`` level name or int. Overridden by ``Z_GAP_LOG_LEVEL``
+            env var if present.
+        with_timestamp: include asctime/levelname/name prefix. Set ``False`` to
+            match legacy ``print()`` output exactly.
+        stream: file-like; defaults to ``sys.stderr`` so stdout stays clean
+            for any tool that captures it.
+    """
+    env_level = os.environ.get("Z_GAP_LOG_LEVEL")
+    if env_level:
+        level = env_level
+    if isinstance(level, str):
+        level = getattr(logging, level.upper(), logging.INFO)
+
+    fmt = _DEFAULT_FORMAT if with_timestamp else _PLAIN_FORMAT
+    logging.basicConfig(
+        level=level,
+        format=fmt,
+        stream=stream or sys.stderr,
+        force=True,
+    )
+
+
+def get_logger(name: str) -> logging.Logger:
+    """Return ``logging.getLogger(name)``; configure_logging() should have been called once."""
+    return logging.getLogger(name)
diff --git a/pyproject.toml b/pyproject.toml
@@ -0,0 +1,54 @@
+# PEP 621 metadata for the z-gap experimental pipeline.
+# This file is intentionally metadata-only — no build-system section, since
+# experiments/scripts and experiments/src are run directly (not packaged).
+# It exists so pip-audit, dependabot, and IDE tooling can read a canonical
+# dependency list and the required Python version, complementing the
+# pip-installable `experiments/requirements.txt`.
+
+[project]
+name = "z-gap"
+version = "0.2.0"
+description = "Z-Gap — Beyond the Chomsky Wall: PRH refinement, NL-code communicability pilot (Program 3 anchor)."
+requires-python = ">=3.11,<3.13"
+readme = "README.md"
+license = { file = "LICENSE" }
+authors = [{ name = "heznpc" }]
+keywords = [
+    "platonic-representation-hypothesis",
+    "cross-lingual",
+    "nl-code",
+    "sentence-transformers",
+    "discriminability-ratio",
+]
+classifiers = [
+    "Development Status :: 4 - Beta",
+    "Intended Audience :: Science/Research",
+    "License :: OSI Approved :: MIT License",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
+    "Topic :: Scientific/Engineering :: Artificial Intelligence",
+]
+
+# Mirrors experiments/requirements.txt. Lower bounds chosen for ABI safety
+# with sentence-transformers/torch wheels; the numpy upper bound (<2.3, which
+# still admits numpy 2.0–2.2) limits silent drift in cached .npz embeddings.
+dependencies = [
+    "numpy>=1.24,<2.3",
+    "scipy>=1.10",
+    "pandas>=2.0",
+    "pyyaml>=6.0",
+    "scikit-learn>=1.3",
+    "matplotlib>=3.7",
+    "seaborn>=0.12",
+    "sentence-transformers>=2.2",
+    "openai>=1.0",
+    "requests>=2.32.4",
+    "kiwipiepy>=0.16",
+    "tqdm>=4.65",
+    "click>=8.1",
+    "python-dotenv>=1.0",
+]
+
+[project.urls]
+Repository = "https://github.com/heznpc/z-gap"
+Paper = "https://github.com/heznpc/z-gap/blob/main/paper/main.tex"