diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 0ad5e11..175dae0 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -29,5 +29,5 @@ jobs: - name: Test with coverage run: uv run pytest --cov --cov-report=term-missing - - name: Validate integrated layer (spec §7.4 — 7 gates) - run: uv run python -m m_standard.tools.validate --root . + - name: Manifest drift gate (Phase 0 / Track C) + run: make check-manifest diff --git a/.gitignore b/.gitignore index 2f1002e..da27616 100644 --- a/.gitignore +++ b/.gitignore @@ -6,7 +6,8 @@ __pycache__/ .env !.env.example *.log -dist/ +dist/* +!dist/repo.meta.json build/ *.egg-info/ src/*.egg-info/ diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 0000000..2191c4c --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,166 @@ +--- +# Machine-readable project descriptor — schema v1 (2026-05-05). +name: m-standard +kind: [data, library, reference] +status: active +languages: [python] + +runtime: + needs: + - python>=3.10 + - uv + optional: [] + excludes: [] # GT.M docs deliberately out of scope (project rule) + +distribution: + pypi: null + github: rafael5/m-standard + +location: ~/projects/m-standard + +exposes: + python_api: "src/m_standard/ — library + tools package" + cli_modules: + - "python -m m_standard.tools.crawl" + - "python -m m_standard.tools.extract" + - "python -m m_standard.tools.reconcile" + - "python -m m_standard.tools.emit" + - "python -m m_standard.tools.validate" + formats_produced: + - "per-source//*.tsv (per-source extracted)" + - "integrated/*.tsv (reconciled, citable)" + - "integrated/*.json (machine-emitted; consumed by tree-sitter-m + m-cli)" + - "schemas/*.json (validation schemas)" + - "docs/m-standards-guide.md (narrative)" + - "docs/adr/* (decision records)" + +consumes: + formats: [] + services: [] + upstream_sources: + - "Annotated M Standard (AnnoStd)" + - "YottaDB documentation corpus" + - "InterSystems IRIS docs (v2.0+)" + +companions: + - project: tree-sitter-m 
+ relation: "downstream consumer — `m-standard/integrated/grammar-surface.json` drives tree-sitter-m's grammar generator" + - project: m-cli + relation: "downstream consumer — m-cli loads commands/ISVs/functions from m-standard's TSVs" + - project: m-tools + relation: "the M toolchain hub references m-standard as the spec layer" + - project: m-stdlib + relation: "m-stdlib obeys m-standard's reconciled language definitions" + +incompatibilities: + - "GT.M permanently out of scope. Do not add GT.M sources." + - "No live network at pipeline run time. `crawl`/`clone` populates `sources/`; downstream stages read disk only." + - "Every integrated row needs `in_anno`/`in_ydb` provenance flags + at least one source ref." + +docs: + primary: README.md + spec: docs/spec.md + user_guide: docs/m-standards-guide.md + adr: docs/adr/ +--- + +# Claude project context — m-standard + +## What this is +Reconciles the Annotated M Standard (AnnoStd) and the YottaDB +documentation corpus into a single citable, machine-readable reference +standard for the M (MUMPS) language. Outputs are TSV + JSON pairs under +`integrated/` plus a narrative under `docs/m-standards-guide.md`. + +The full design and rationale are in `docs/spec.md`. ADRs are in +`docs/adr/`. + +## Where things live +- `src/m_standard/` — library + tools package. Anything importable. +- `src/m_standard/tools/` — pipeline stages (crawl, extract, reconcile, + emit, validate). Each is invokable via `python -m + m_standard.tools.<stage>`. +- `tools/` — non-Python utilities (e.g. `clone-ydb.sh`). +- `sources/` — offline local replicas of the upstream sources. The + pipeline reads only from here, never the network at run time. +- `per-source/`, `integrated/`, `schemas/` — pipeline outputs (committed + artifacts). +- `tests/` — pytest, mirrors `src/m_standard/` structure. 
+ +## Pipeline (per spec §7) +``` +sources/ ──extract──▶ per-source/<source>/*.tsv + └───reconcile──▶ integrated/*.tsv + conflicts.tsv + └────emit──▶ integrated/*.json + └────validate──▶ CI gates pass +``` + +## Hard rules +- **TDD.** Test first, confirm failure, then implement. Always. +- **No live network at pipeline run time.** Crawl/clone populates + `sources/`; everything downstream reads from disk. +- **Reproducibility.** Every source file has a sha256 in + `sources/<source>/manifest.tsv`. Every YDB-derived row carries the + pinned commit SHA. +- **Provenance.** Every integrated row has `in_anno`/`in_ydb` flags + + source section refs. No integrated row exists without at least one + source attesting it. +- **Determinism.** `reconcile.py` is byte-deterministic — same inputs, + same outputs. + +## Toolchain +- Python ≥3.12, `uv`, ruff, mypy, pytest. +- `Makefile` uses `.venv/bin/` prefixes for every tool (parent direnv + hijacks bare names — see `docs/build-log.md` BL-001). + +## Conventions +- No `print()` in library code — use `logging.getLogger(__name__)`. +- BeautifulSoup attr access: cast with `str()` (mypy strict). +- Click group options before subcommand if Click is added later. +- YAML frontmatter: quote any value containing a colon. + +## Setup +```bash +make install # uv sync --extra dev + install pre-commit hooks +``` + +## Test +```bash +make test # .venv/bin/pytest +make cov # pytest with coverage report +make check # lint + mypy + cov (run before push) +``` + +## Build / generate +The committed payloads under `docs/integrated/` are produced by the +pipeline. To regenerate them from `sources/`: + +```bash +make integrated # re-validates the committed payloads (schema + provenance); no network fetch +make all # adds `sources` (network fetch) at the head — full rebuild +``` + +The `dist/repo.meta.json` manifest is hand-authored; CI verifies it +hasn't drifted from the committed integrated payloads. 
+ +## Verify +These are the `verification_commands` declared in `dist/repo.meta.json`: + +```bash +make integrated # re-validates docs/integrated/ (schema + provenance) +make test # full test suite +``` + +A green `make check-manifest` proves the committed integrated payloads +still validate and every manifest `exposes.*` path exists. + +## Guardrails +- **Do not hand-edit `docs/integrated/*` files.** They are pipeline + outputs; edit `sources/` + the extract/reconcile/emit stages instead. +- **Do not hand-edit `dist/repo.meta.json` `verified_on` to a future + date.** The Phase 0 smoke test rejects manifests older than 90 days; + bump the date only when the manifest changes materially. +- **No GT.M sources.** Permanently out of scope. +- **No live network at pipeline run time.** `make sources` is the only + stage that touches the network; all downstream stages read disk. +- **Determinism.** Reconcile output must be byte-stable across runs. diff --git a/CLAUDE.md b/CLAUDE.md deleted file mode 100644 index b625519..0000000 --- a/CLAUDE.md +++ /dev/null @@ -1,120 +0,0 @@ ---- -# Machine-readable project descriptor — schema v1 (2026-05-05). 
-name: m-standard -kind: [data, library, reference] -status: active -languages: [python] - -runtime: - needs: - - python>=3.10 - - uv - optional: [] - excludes: [] # GT.M docs deliberately out of scope (project rule) - -distribution: - pypi: null - github: rafael5/m-standard - -location: ~/projects/m-standard - -exposes: - python_api: "src/m_standard/ — library + tools package" - cli_modules: - - "python -m m_standard.tools.crawl" - - "python -m m_standard.tools.extract" - - "python -m m_standard.tools.reconcile" - - "python -m m_standard.tools.emit" - - "python -m m_standard.tools.validate" - formats_produced: - - "per-source//*.tsv (per-source extracted)" - - "integrated/*.tsv (reconciled, citable)" - - "integrated/*.json (machine-emitted; consumed by tree-sitter-m + m-cli)" - - "schemas/*.json (validation schemas)" - - "docs/m-standards-guide.md (narrative)" - - "docs/adr/* (decision records)" - -consumes: - formats: [] - services: [] - upstream_sources: - - "Annotated M Standard (AnnoStd)" - - "YottaDB documentation corpus" - - "InterSystems IRIS docs (v2.0+)" - -companions: - - project: tree-sitter-m - relation: "downstream consumer — `m-standard/integrated/grammar-surface.json` drives tree-sitter-m's grammar generator" - - project: m-cli - relation: "downstream consumer — m-cli loads commands/ISVs/functions from m-standard's TSVs" - - project: m-tools - relation: "the M toolchain hub references m-standard as the spec layer" - - project: m-stdlib - relation: "m-stdlib obeys m-standard's reconciled language definitions" - -incompatibilities: - - "GT.M permanently out of scope. Do not add GT.M sources." - - "No live network at pipeline run time. `crawl`/`clone` populates `sources/`; downstream stages read disk only." - - "Every integrated row needs `in_anno`/`in_ydb` provenance flags + at least one source ref." 
- -docs: - primary: README.md - spec: docs/spec.md - user_guide: docs/m-standards-guide.md - adr: docs/adr/ ---- - -# Claude project context — m-standard - -## What this is -Reconciles the Annotated M Standard (AnnoStd) and the YottaDB -documentation corpus into a single citable, machine-readable reference -standard for the M (MUMPS) language. Outputs are TSV + JSON pairs under -`integrated/` plus a narrative under `docs/m-standards-guide.md`. - -The full design and rationale are in `docs/spec.md`. ADRs in -`docs/adr/`. - -## Where things live -- `src/m_standard/` — library + tools package. Anything importable. -- `src/m_standard/tools/` — pipeline stages (crawl, extract, reconcile, - emit, validate). Each is invokable via `python -m - m_standard.tools.`. -- `tools/` — non-Python utilities (e.g. `clone-ydb.sh`). -- `sources/` — offline local replicas of the upstream sources. The - pipeline reads only from here, never the network at run time. -- `per-source/`, `integrated/`, `schemas/` — pipeline outputs (committed - artifacts). -- `tests/` — pytest, mirrors `src/m_standard/` structure. - -## Pipeline (per spec §7) -``` -sources/ ──extract──▶ per-source//*.tsv - └───reconcile──▶ integrated/*.tsv + conflicts.tsv - └────emit──▶ integrated/*.json - └────validate──▶ CI gates pass -``` - -## Hard rules -- **TDD.** Test first, confirm failure, then implement. Always. -- **No live network at pipeline run time.** Crawl/clone populates - `sources/`; everything downstream reads from disk. -- **Reproducibility.** Every source file has a sha256 in - `sources//manifest.tsv`. Every YDB-derived row carries the - pinned commit SHA. -- **Provenance.** Every integrated row has `in_anno`/`in_ydb` flags + - source section refs. No integrated row exists without at least one - source attesting it. -- **Determinism.** `reconcile.py` is byte-deterministic — same inputs, - same outputs. - -## Toolchain -- Python ≥3.12, `uv`, ruff, mypy, pytest. 
-- `Makefile` uses `.venv/bin/` prefixes for every tool (parent direnv - hijacks bare names — see `docs/build-log.md` BL-001). - -## Conventions -- No `print()` in library code — use `logging.getLogger(__name__)`. -- BeautifulSoup attr access: cast with `str()` (mypy strict). -- Click group options before subcommand if Click is added later. -- YAML frontmatter: quote any value containing a colon. diff --git a/CLAUDE.md b/CLAUDE.md new file mode 120000 index 0000000..47dc3e3 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1 @@ +AGENTS.md \ No newline at end of file diff --git a/Makefile b/Makefile index 5e3e0b2..9a80308 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,6 @@ .PHONY: install hooks test test-lf watch lint format mypy cov check push pull \ - sources sources-anno sources-ydb serve-anno extract reconcile validate all clean + sources sources-anno sources-ydb serve-anno extract reconcile validate all clean \ + integrated check-manifest PYTHON := .venv/bin/python PYTEST := .venv/bin/pytest @@ -101,3 +102,18 @@ all: sources extract reconcile emit validate clean: rm -rf .pytest_cache .mypy_cache .ruff_cache htmlcov .coverage find . -type d -name __pycache__ -prune -exec rm -rf {} + + +# ----- Phase 0 manifest drift gate (per .github/docs/phase0-plan.md §4 / C4) + +# `integrated` is the named verification entry-point referenced by +# `dist/repo.meta.json.verification_commands`. It re-runs the same +# consistency check CI already enforces: every committed integrated +# payload still matches its schema and has source provenance. +integrated: + $(PYTHON) -m m_standard.tools.validate --root . + +# `check-manifest` is the per-repo Phase 0 gate. It (a) reruns +# `integrated` and (b) verifies dist/repo.meta.json schema-validates and +# every `exposes.*` payload still exists on disk and (for .json) parses. 
+check-manifest: integrated + $(PYTHON) tools/check-repo-meta.py dist/repo.meta.json diff --git a/dist/repo.meta.json b/dist/repo.meta.json new file mode 100644 index 0000000..caf2cdf --- /dev/null +++ b/dist/repo.meta.json @@ -0,0 +1,22 @@ +{ + "$schema": "https://raw.githubusercontent.com/m-dev-tools/.github/main/profile/repo.meta.schema.json", + "id": "tool:m-standard", + "repo": "https://github.com/m-dev-tools/m-standard", + "role": "Machine-readable M language reference", + "language": ["python"], + "license": "AGPL-3.0", + "agent_instructions": "AGENTS.md", + "verified_on": "2026-05-10", + "exposes": { + "grammar_surface": "docs/integrated/grammar-surface.json", + "commands": "docs/integrated/commands.tsv", + "intrinsic_functions": "docs/integrated/intrinsic-functions.tsv", + "intrinsic_special_variables": "docs/integrated/intrinsic-special-variables.tsv", + "operators": "docs/integrated/operators.tsv", + "errors": "docs/integrated/errors.tsv", + "pragmatic_standard": "docs/integrated/pragmatic-m-standard.json", + "operational_standard": "docs/integrated/operational-m-standard.json", + "va_sac_rules": "docs/integrated/va-sac-rules.tsv" + }, + "verification_commands": ["make integrated", "make test"] +} diff --git a/tools/check-repo-meta.py b/tools/check-repo-meta.py new file mode 100755 index 0000000..4449273 --- /dev/null +++ b/tools/check-repo-meta.py @@ -0,0 +1,100 @@ +#!/usr/bin/env python3 +"""Phase 0 manifest drift gate for dist/repo.meta.json. + +Validates that: + 1. The given manifest file parses as JSON and conforms to its + declared `$schema` (the org-level repo.meta.schema.json). + 2. Every path under `exposes.*` still exists on disk; if the path + ends in `.json`, the file also parses as JSON. + +Exits 0 on success; non-zero with structured stderr output on failure. + +This is the per-repo local mirror of `.github/profile/build/validate-repo-meta.py` +(Track A / Phase 0 plan §2). 
def _load_schema(schema_uri: str, repo_root: Path) -> dict:
    """Return the JSON Schema that the manifest declares via ``$schema``.

    The schema is fetched from ``schema_uri`` first. If that fails for any
    reason (offline runner, HTTP error, unsupported URL, invalid JSON), the
    function falls back to a sibling checkout of the org-level ``.github``
    repository located next to ``repo_root``.

    Args:
        schema_uri: URL of the schema document.
        repo_root: Root directory of this repository; the fallback file is
            ``repo_root.parent / ".github" / "profile" / "repo.meta.schema.json"``.

    Returns:
        The parsed schema document.

    Raises:
        SystemExit: With status 2 when neither the URL nor the local
            fallback yields a parseable schema.
    """
    try:
        with urllib.request.urlopen(schema_uri, timeout=5) as resp:
            return json.load(resp)
    except Exception as e:  # noqa: BLE001
        # Deliberately broad: any fetch/parse problem is a cue to try the
        # local checkout rather than a reason to fail the gate outright.
        sys.stderr.write(
            f"NOTE: could not fetch schema from {schema_uri} ({e}); "
            "trying local fallback at ../.github/profile/repo.meta.schema.json\n"
        )
    fallback = repo_root.parent / ".github" / "profile" / "repo.meta.schema.json"
    if not fallback.exists():
        sys.stderr.write(f"ERROR: fallback schema not found at {fallback}\n")
        sys.exit(2)
    try:
        # JSON is UTF-8 by specification; do not depend on the locale.
        return json.loads(fallback.read_text(encoding="utf-8"))
    except (OSError, ValueError) as e:
        sys.stderr.write(f"ERROR: fallback schema at {fallback} is unreadable: {e}\n")
        sys.exit(2)


def _check_payloads(exposes: dict, repo_root: Path) -> list[str]:
    """Return one drift message per ``exposes`` entry whose payload is broken.

    Remote (``http://`` / ``https://``) entries are skipped. A local entry
    fails when its path does not exist under ``repo_root`` or, for paths
    ending in ``.json``, when the file cannot be read and parsed as JSON.
    """
    failures: list[str] = []
    for key, rel_path in exposes.items():
        if rel_path.startswith(("http://", "https://")):
            continue  # remote URLs are smoke-checked by Track E, not here
        target = repo_root / rel_path
        if not target.exists():
            failures.append(f"exposes.{key}: missing payload at {rel_path}")
            continue
        if rel_path.endswith(".json"):
            try:
                json.loads(target.read_text(encoding="utf-8"))
            except (OSError, ValueError) as e:
                # ValueError covers JSONDecodeError and UnicodeDecodeError;
                # OSError covers a directory or unreadable file at the path.
                failures.append(f"exposes.{key}: {rel_path} fails to parse: {e}")
    return failures


def main(argv: list[str]) -> int:
    """Run the manifest drift gate and return a process exit status.

    Args:
        argv: Command line in ``sys.argv`` form: program name followed by
            the path to the manifest (``dist/repo.meta.json``).

    Returns:
        0 when the manifest validates against its schema and every local
        ``exposes`` payload is present (and parseable, for ``.json``);
        1 when the manifest is missing, unreadable, invalid, or drifted;
        2 on a usage error.
    """
    if len(argv) != 2:
        prog = argv[0] if argv else "check-repo-meta.py"
        sys.stderr.write(f"usage: {prog} <path/to/repo.meta.json>\n")
        return 2

    manifest_path = Path(argv[1]).resolve()
    if not manifest_path.exists():
        sys.stderr.write(f"ERROR: manifest not found: {manifest_path}\n")
        return 1

    repo_root = manifest_path.parent.parent  # dist/repo.meta.json → repo root

    # Report a broken manifest as a structured error, not a traceback.
    try:
        data = json.loads(manifest_path.read_text(encoding="utf-8"))
    except (OSError, ValueError) as e:
        sys.stderr.write(f"ERROR: cannot parse manifest {manifest_path}: {e}\n")
        return 1
    if not isinstance(data, dict):
        sys.stderr.write(
            f"ERROR: manifest {manifest_path} must be a JSON object, "
            f"got {type(data).__name__}\n"
        )
        return 1

    schema_uri = data.get(
        "$schema",
        "https://raw.githubusercontent.com/m-dev-tools/.github/main/profile/repo.meta.schema.json",
    )
    schema = _load_schema(schema_uri, repo_root)

    errors = list(Draft202012Validator(schema).iter_errors(data))
    if errors:
        for err in errors:
            # An empty absolute_path means the error is on the document root.
            path = "/".join(str(p) for p in err.absolute_path) or "<root>"
            sys.stderr.write(f"SCHEMA ERROR at {path}: {err.message}\n")
        return 1

    failures = _check_payloads(data.get("exposes", {}), repo_root)
    if failures:
        for failure in failures:
            sys.stderr.write(f"DRIFT: {failure}\n")
        return 1

    print(f"check-repo-meta ✓ {manifest_path.relative_to(repo_root)}")
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))