diff --git a/.github/workflows/reproduce-smoke.yml b/.github/workflows/reproduce-smoke.yml new file mode 100644 index 0000000..6f1bfb2 --- /dev/null +++ b/.github/workflows/reproduce-smoke.yml @@ -0,0 +1,59 @@ +name: reproduce-smoke + +# Lightweight CI: validates that the experimental pipeline imports cleanly +# and that the stimuli JSON parses. Does NOT execute the pilot (would require +# OpenAI/Mistral API keys and ~3-5h of compute). Intended to catch +# requirements drift and JSON corruption on every push. + +on: + push: + branches: [main] + paths: + - 'experiments/**' + - 'pyproject.toml' + - '.python-version' + - 'Makefile' + - '.github/workflows/reproduce-smoke.yml' + pull_request: + branches: [main] + paths: + - 'experiments/**' + - 'pyproject.toml' + - '.python-version' + - 'Makefile' + - '.github/workflows/reproduce-smoke.yml' + workflow_dispatch: + +permissions: + contents: read + +jobs: + smoke: + runs-on: ubuntu-latest + timeout-minutes: 15 + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version-file: '.python-version' + cache: 'pip' + cache-dependency-path: experiments/requirements.txt + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r experiments/requirements.txt + + - name: Import smoke test + run: | + python -c "import numpy, scipy, sklearn, matplotlib, seaborn, sentence_transformers, openai, requests, kiwipiepy; print('imports OK')" + + - name: Validate stimuli JSON + run: | + python -c "import json, glob; files = sorted(glob.glob('experiments/data/stimuli/*.json')); [json.load(open(p)) for p in files]; print(f'{len(files)} stimuli JSON files parse OK')" + + - name: Verify random seed pinning + run: | + python -c "import re, pathlib; pat = re.compile(r'default_rng\\((seed|42|RNG_SEED)'); hits = sum(1 for p in pathlib.Path('experiments').rglob('*.py') for line in p.read_text().splitlines() if pat.search(line)); print(f'seed-pinned RNG 
sites: {hits}'); assert hits >= 10, 'expected >=10 seeded RNG sites'" diff --git a/.python-version b/.python-version new file mode 100644 index 0000000..e4fba21 --- /dev/null +++ b/.python-version @@ -0,0 +1 @@ +3.12 diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..131ce87 --- /dev/null +++ b/Makefile @@ -0,0 +1,39 @@ +# Single entry point for reproducing the z-gap pilot. +# See experiments/README.md for the reproducibility envelope. + +.PHONY: help setup smoke reproduce figures clean + +PYTHON ?= python3 +VENV := experiments/.venv +PIP := $(VENV)/bin/pip +PY := $(VENV)/bin/python + +help: + @echo "Targets:" + @echo " setup Create experiments/.venv and install requirements" + @echo " smoke Validate imports and stimuli JSON without running the pipeline" + @echo " reproduce Run the full pilot (scripts/run_all.py)" + @echo " figures Re-run cross-experiment synthesis only" + @echo " clean Remove venv and Python caches (keeps results/embeddings/)" + +setup: + cd experiments && $(PYTHON) -m venv .venv + $(PIP) install --upgrade pip + $(PIP) install -r experiments/requirements.txt + @echo "Setup complete." + @echo "Next: cp experiments/.env.example experiments/.env # add OPENAI_API_KEY, MISTRAL_API_KEY" + +smoke: + $(PY) -c "import numpy, scipy, sklearn, matplotlib, seaborn, sentence_transformers, openai, requests, kiwipiepy; print('smoke OK')" + $(PY) -c "import json, glob; [json.load(open(p)) for p in glob.glob('experiments/data/stimuli/*.json')]; print('stimuli JSON OK')" + +reproduce: + cd experiments && ../$(VENV)/bin/python scripts/run_all.py + +figures: + cd experiments && ../$(VENV)/bin/python scripts/run_cross_experiment_synthesis.py + +clean: + rm -rf experiments/.venv experiments/.pytest_cache + find . -type d -name "__pycache__" -prune -exec rm -rf {} + + @echo "Caches cleared. experiments/results/ preserved." 
diff --git a/experiments/README.md b/experiments/README.md index 967c412..49182af 100644 --- a/experiments/README.md +++ b/experiments/README.md @@ -2,21 +2,43 @@ Tests predictions P1–P7 from the paper: cross-lingual semantic invariance (P2), dialect continuum (P2-dialect), NL-code alignment (P3), spacing robustness (P7), and scale-convergence (P1). +## Reproducibility envelope + +- **Python**: `3.11` or `3.12` (`.python-version` pins `3.12`; `kiwipiepy` and `torch` wheels for `3.13` are not yet consistently available) +- **Random seed**: `np.random.default_rng(42)` is used throughout `src/predictions.py`, `src/metrics.py`, `src/code_alignment.py`, `src/vocab_mediation.py`, `src/hidden_state_analysis.py`, and the strategy runners (17+ call sites). Override via the `seed` kwarg where exposed. +- **OS**: tested on macOS (Apple Silicon) and Ubuntu 24.04. Windows untested. +- **Hardware**: CPU sufficient for the 100-op pilot (~3–5h end-to-end across 7 embedding models). MPS/CUDA optional and only used by `scripts/run_v2_extract.py` for 8B decoder hidden-state extraction. +- **External APIs**: OpenAI Embeddings (`text-embedding-3-small`/`-large`) and Mistral Codestral Embed (`codestral-embed-2505`). Both calls now retry on 429/5xx with exponential backoff (`max_retries=5`). +- **Data sent to providers**: synthetic stimuli only (`data/stimuli/*.json`). No PII. 
+ ## Setup +From the repository root: + +```bash +make setup # creates experiments/.venv with python3 (override: make setup PYTHON=python3.12) and installs requirements +cp experiments/.env.example experiments/.env # add OPENAI_API_KEY, MISTRAL_API_KEY +make smoke # validates imports + stimuli JSON without burning API budget +``` + +Or manually: + ```bash cd experiments python -m venv .venv && source .venv/bin/activate pip install -r requirements.txt -cp .env.example .env # add API keys (OpenAI, Mistral) +cp .env.example .env ``` ## Run ```bash -python scripts/run_all.py +make reproduce # full pipeline (scripts/run_all.py) +make figures # cross-experiment synthesis only ``` +The CI workflow (`.github/workflows/reproduce-smoke.yml`) runs the same smoke checks inline, plus a seed-pinning check, on pushes and pull requests to `main` that touch `experiments/` or the build files — it confirms imports succeed and stimuli JSON parses, but does not burn API credits. + ## What it does 1. Generates 100 stimuli (50 computational + 50 judgment) × 5 languages × dialectal variants (~1,800 total) @@ -43,3 +65,17 @@ python scripts/run_all.py - **P2**: R\_C > R\_J for computational vs judgment operations - **P2-dialect**: R degrades continuously: within-dialect > cross-dialect > cross-lingual - **P7**: R\_spacing > 1 — spacing variation produces less Z distance than semantic variation + +## Logging + +Scripts ship with `print()` for transcript continuity. New code should use the helper: + +```python +from src.logging_config import configure_logging, get_logger + +configure_logging() # call once at entry point +logger = get_logger(__name__) +logger.info("Embedded %d stimuli", n) +``` + +Set `Z_GAP_LOG_LEVEL=DEBUG` to raise verbosity. Output goes to stderr so stdout stays clean for redirection. 
diff --git a/experiments/src/embeddings.py b/experiments/src/embeddings.py index ca7ffcf..ea3c5b6 100644 --- a/experiments/src/embeddings.py +++ b/experiments/src/embeddings.py @@ -48,7 +48,9 @@ def __init__(self, model: str = "text-embedding-3-small"): import openai from dotenv import load_dotenv load_dotenv() - self._client = openai.OpenAI() + # max_retries=5 covers transient 429/5xx during multi-model sweeps. + # SDK uses exponential backoff with jitter internally. + self._client = openai.OpenAI(max_retries=5, timeout=60.0) self._model = model self._dim = 1536 if "small" in model else 3072 @@ -83,18 +85,38 @@ def __init__(self, model: str = "codestral-embed-2505"): self._api_key = os.environ["MISTRAL_API_KEY"] self._model = model self._dim = 1024 + self._session = self._make_session() - def encode(self, texts: list[str]) -> np.ndarray: + @staticmethod + def _make_session(): + """Return a requests.Session that retries POSTs on 429 + 5xx with backoff (intended to mirror OpenAI SDK retries).""" import requests + from requests.adapters import HTTPAdapter + from urllib3.util.retry import Retry + + retry = Retry( + total=5, + backoff_factor=1.0, # sleeps 0s, 2s, 4s, 8s, 16s: urllib3 skips the sleep before the first retry + status_forcelist=(429, 500, 502, 503, 504), + allowed_methods=frozenset(["POST"]), + respect_retry_after_header=True, + raise_on_status=False, # hand back the last response so the caller's raise_for_status() reports it + ) + session = requests.Session() + session.mount("https://", HTTPAdapter(max_retries=retry)) + return session + + def encode(self, texts: list[str]) -> np.ndarray: from tqdm import tqdm results = [] batch_size = 50 for i in tqdm(range(0, len(texts), batch_size), desc=f"Mistral {self._model}"): batch = texts[i:i + batch_size] - resp = requests.post( + resp = self._session.post( "https://api.mistral.ai/v1/embeddings", headers={"Authorization": f"Bearer {self._api_key}"}, json={"model": self._model, "input": batch}, + timeout=60, ) resp.raise_for_status() data = resp.json()["data"] diff --git a/experiments/src/logging_config.py b/experiments/src/logging_config.py new file mode 100644 index 
0000000..8449361 --- /dev/null +++ b/experiments/src/logging_config.py @@ -0,0 +1,63 @@ +"""Centralized logging configuration for z-gap experiments. + +Existing scripts (run_all.py, run_strategy_*.py) use bare ``print()`` calls; +those are preserved to avoid changing user-visible output. New code should +prefer ``get_logger()`` so long multi-model runs produce greppable, +level-tagged output that survives redirection. + +Usage: + from src.logging_config import configure_logging, get_logger + + configure_logging() # call once at entry point + logger = get_logger(__name__) + logger.info("Running %s on %d stimuli", model.name, len(texts)) + +Level is taken from ``Z_GAP_LOG_LEVEL`` env var if set, else the ``level`` +argument (default INFO). +""" + +from __future__ import annotations + +import logging +import os +import sys + + +_DEFAULT_FORMAT = "%(asctime)s %(levelname)s %(name)s: %(message)s" +_PLAIN_FORMAT = "%(message)s" + + +def configure_logging( + level: str | int = "INFO", + *, + with_timestamp: bool = True, + stream=None, +) -> None: + """Configure the root logger; each call replaces existing root handlers (``force=True``). + + Args: + level: ``logging`` level name or int. Overridden by ``Z_GAP_LOG_LEVEL`` + env var if present. Names that are not ``logging`` attributes fall back to INFO. + with_timestamp: include asctime/levelname/name prefix. Set ``False`` to + emit only the message text, as legacy ``print()`` calls do. + stream: file-like; defaults to ``sys.stderr`` so stdout stays clean + for any tool that captures it. 
+ """ + env_level = os.environ.get("Z_GAP_LOG_LEVEL") + if env_level: + level = env_level + if isinstance(level, str): + level = getattr(logging, level.upper(), logging.INFO) + + fmt = _DEFAULT_FORMAT if with_timestamp else _PLAIN_FORMAT + logging.basicConfig( + level=level, + format=fmt, + stream=stream or sys.stderr, + force=True, + ) + + +def get_logger(name: str) -> logging.Logger: + """Return the logger registered under ``name``; configure_logging() should have been called once.""" + return logging.getLogger(name) diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..3dd2fa4 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,54 @@ +# PEP 621 metadata for the z-gap experimental pipeline. +# This file is intentionally metadata-only — no build-system section, since +# experiments/scripts and experiments/src are run directly (not packaged). +# It exists so pip-audit, dependabot, and IDE tooling can read a canonical +# dependency list and the required Python version, complementing the +# pip-installable `experiments/requirements.txt`. + +[project] +name = "z-gap" +version = "0.2.0" +description = "Z-Gap — Beyond the Chomsky Wall: PRH refinement, NL-code communicability pilot (Program 3 anchor)." +requires-python = ">=3.11,<3.13" +readme = "README.md" +license = { file = "LICENSE" } +authors = [{ name = "heznpc" }] +keywords = [ + "platonic-representation-hypothesis", + "cross-lingual", + "nl-code", + "sentence-transformers", + "discriminability-ratio", +] +classifiers = [ + "Development Status :: 4 - Beta", + "Intended Audience :: Science/Research", + "License :: OSI Approved :: MIT License", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Topic :: Scientific/Engineering :: Artificial Intelligence", +] + +# Mirrors experiments/requirements.txt. Lower bounds chosen for ABI safety +# with sentence-transformers/torch wheels; the numpy cap (<2.3) admits 1.x and +# 2.0-2.2 to limit drift in cached .npz embeddings (tighten to <2.0 for 1.x only). 
+dependencies = [ + "numpy>=1.24,<2.3", + "scipy>=1.10", + "pandas>=2.0", + "pyyaml>=6.0", + "scikit-learn>=1.3", + "matplotlib>=3.7", + "seaborn>=0.12", + "sentence-transformers>=2.2", + "openai>=1.0", + "requests>=2.32.4", + "kiwipiepy>=0.16", + "tqdm>=4.65", + "click>=8.1", + "python-dotenv>=1.0", +] + +[project.urls] +Repository = "https://github.com/heznpc/z-gap" +Paper = "https://github.com/heznpc/z-gap/blob/main/paper/main.tex"