diff --git a/.github/workflows/asset-drift.yml b/.github/workflows/asset-drift.yml new file mode 100644 index 00000000..11254a06 --- /dev/null +++ b/.github/workflows/asset-drift.yml @@ -0,0 +1,65 @@ +name: Asset Drift Check + +on: + schedule: + - cron: "0 3 * * *" + workflow_dispatch: + +permissions: + contents: read + +env: + PYTHON_VERSION: "3.11" + +jobs: + drift: + name: Verify Model/Corpus Release Assets + runs-on: ubuntu-latest + timeout-minutes: 60 + steps: + - name: Check out repository + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: ${{ env.PYTHON_VERSION }} + + - name: Set up uv + uses: astral-sh/setup-uv@v6 + + - name: Install dependencies + run: | + uv venv .venv --python "${PYTHON_VERSION}" + uv sync --frozen --python .venv/bin/python + + - name: Download + verify pinned release assets + env: + GITHUB_TOKEN: ${{ github.token }} + run: | + .venv/bin/python scripts/asset_drift_check.py --force-download + + - name: Run contract model quality gate + env: + GITHUB_TOKEN: ${{ github.token }} + run: | + .venv/bin/python scripts/model_quality_gate.py \ + --baseline-tag pipeline/is-contract/0.1 \ + --candidate-tag pipeline/is-contract/0.1 \ + --baseline-metrics-json test_data/model_quality/is_contract_baseline_metrics.json \ + --max-accuracy-regression 0.0 \ + --max-f1-regression 0.0 + + - name: Run contract-type quality gate + env: + GITHUB_TOKEN: ${{ github.token }} + run: | + .venv/bin/python scripts/contract_type_quality_gate.py \ + --baseline-tag pipeline/contract-type/0.2-runtime \ + --candidate-tag pipeline/contract-type/0.2-runtime \ + --baseline-metrics-json test_data/model_quality/contract_type_baseline_metrics.json \ + --max-accuracy-top1-regression 0.0 \ + --max-accuracy-topn-regression 0.0 \ + --max-f1-macro-regression 0.0 \ + --max-f1-weighted-regression 0.0 + diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 00000000..f71111c0 --- /dev/null 
+++ b/.github/workflows/ci.yml @@ -0,0 +1,258 @@ +name: CI + +on: + pull_request: + push: + +permissions: + contents: read + +env: + PYTHON_VERSION: "3.11" + CONTRACT_MODEL_BASELINE_TAG: "pipeline/is-contract/0.1" + CONTRACT_MODEL_CANDIDATE_TAG: "pipeline/is-contract/0.1" + CONTRACT_MODEL_BASELINE_METRICS: "test_data/model_quality/is_contract_baseline_metrics.json" + LEXNLP_CONTRACT_TYPE_MODEL_TAG: "pipeline/contract-type/0.2-runtime" + CONTRACT_TYPE_MODEL_BASELINE_METRICS: "test_data/model_quality/contract_type_baseline_metrics.json" + +jobs: + base-tests: + name: Base Tests + runs-on: ubuntu-latest + timeout-minutes: 60 + steps: + - name: Check out repository + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: ${{ env.PYTHON_VERSION }} + + - name: Set up uv + uses: astral-sh/setup-uv@v6 + + - name: Cache NLTK data + uses: actions/cache@v4 + with: + path: | + ~/nltk_data + key: nltk-data-${{ runner.os }}-py${{ env.PYTHON_VERSION }}-${{ hashFiles('uv.lock') }} + restore-keys: | + nltk-data-${{ runner.os }}-py${{ env.PYTHON_VERSION }}- + + - name: Install dependencies + run: | + uv venv .venv --python "${PYTHON_VERSION}" + uv sync --frozen --python .venv/bin/python --extra dev --extra test + + - name: Bootstrap required assets + env: + GITHUB_TOKEN: ${{ github.token }} + run: .venv/bin/python scripts/bootstrap_assets.py --nltk --contract-model + + - name: Enforce skip-audit policy + run: .venv/bin/python ci/skip_audit.py + + - name: Run base suite + run: .venv/bin/pytest lexnlp + + stanford-tests: + name: Stanford Tests + runs-on: ubuntu-latest + timeout-minutes: 60 + steps: + - name: Check out repository + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: ${{ env.PYTHON_VERSION }} + + - name: Set up uv + uses: astral-sh/setup-uv@v6 + + - name: Cache NLTK data + uses: actions/cache@v4 + with: + path: | + ~/nltk_data + key: nltk-data-${{ runner.os 
}}-py${{ env.PYTHON_VERSION }}-${{ hashFiles('uv.lock') }} + restore-keys: | + nltk-data-${{ runner.os }}-py${{ env.PYTHON_VERSION }}- + + - name: Set up Java + uses: actions/setup-java@v4 + with: + distribution: temurin + java-version: "11" + + - name: Install dependencies + run: | + uv venv .venv --python "${PYTHON_VERSION}" + uv sync --frozen --python .venv/bin/python --extra dev --extra test + + - name: Bootstrap required assets (including Stanford) + env: + GITHUB_TOKEN: ${{ github.token }} + run: .venv/bin/python scripts/bootstrap_assets.py --nltk --contract-model --stanford + + - name: Run Stanford suite + env: + LEXNLP_USE_STANFORD: "true" + run: | + .venv/bin/pytest \ + lexnlp/nlp/en/tests/test_stanford.py \ + lexnlp/extract/en/entities/tests/test_stanford_ner.py + + model-quality: + name: Model Quality Gate + runs-on: ubuntu-latest + timeout-minutes: 30 + steps: + - name: Check out repository + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: ${{ env.PYTHON_VERSION }} + + - name: Set up uv + uses: astral-sh/setup-uv@v6 + + - name: Cache NLTK data + uses: actions/cache@v4 + with: + path: | + ~/nltk_data + key: nltk-data-${{ runner.os }}-py${{ env.PYTHON_VERSION }}-${{ hashFiles('uv.lock') }} + restore-keys: | + nltk-data-${{ runner.os }}-py${{ env.PYTHON_VERSION }}- + + - name: Install dependencies + run: | + uv venv .venv --python "${PYTHON_VERSION}" + uv sync --frozen --python .venv/bin/python --extra test + + - name: Bootstrap contract-model asset + env: + GITHUB_TOKEN: ${{ github.token }} + run: .venv/bin/python scripts/bootstrap_assets.py --nltk --contract-model + + - name: Run contract model quality gate + run: | + .venv/bin/python scripts/model_quality_gate.py \ + --baseline-tag "${CONTRACT_MODEL_BASELINE_TAG}" \ + --candidate-tag "${CONTRACT_MODEL_CANDIDATE_TAG}" \ + --baseline-metrics-json "${CONTRACT_MODEL_BASELINE_METRICS}" \ + --output-json artifacts/model_quality_gate.json \ + 
--max-f1-regression 0.0 \ + --max-accuracy-regression 0.0 + + - name: Upload quality-gate result + uses: actions/upload-artifact@v4 + with: + name: model-quality-gate + path: artifacts/model_quality_gate.json + if-no-files-found: error + + contract-type-smoke: + name: Contract Type Smoke + runs-on: ubuntu-latest + timeout-minutes: 45 + steps: + - name: Check out repository + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: ${{ env.PYTHON_VERSION }} + + - name: Set up uv + uses: astral-sh/setup-uv@v6 + + - name: Cache NLTK data + uses: actions/cache@v4 + with: + path: | + ~/nltk_data + key: nltk-data-${{ runner.os }}-py${{ env.PYTHON_VERSION }}-${{ hashFiles('uv.lock') }} + restore-keys: | + nltk-data-${{ runner.os }}-py${{ env.PYTHON_VERSION }}- + + - name: Install dependencies + run: | + uv venv .venv --python "${PYTHON_VERSION}" + uv sync --frozen --python .venv/bin/python --extra test + + - name: Bootstrap runtime contract-type model + env: + GITHUB_TOKEN: ${{ github.token }} + run: .venv/bin/python scripts/bootstrap_assets.py --contract-type-model + + - name: Run contract-type predictor smoke + run: | + .venv/bin/python - <<'PY' + from lexnlp.extract.en.contracts.predictors import ProbabilityPredictorContractType + predictor = ProbabilityPredictorContractType() + predictions = predictor.make_predictions( + "This Employment Agreement is entered into on January 1, 2024.", + top_n=3, + ) + assert len(predictions) > 0 + print(predictions.to_dict()) + PY + + - name: Run contract-type quality gate + run: | + .venv/bin/python scripts/contract_type_quality_gate.py \ + --baseline-tag "${LEXNLP_CONTRACT_TYPE_MODEL_TAG}" \ + --candidate-tag "${LEXNLP_CONTRACT_TYPE_MODEL_TAG}" \ + --baseline-metrics-json "${CONTRACT_TYPE_MODEL_BASELINE_METRICS}" \ + --output-json artifacts/contract_type_quality_gate.json \ + --max-accuracy-top1-regression 0.0 \ + --max-accuracy-topn-regression 0.0 \ + --max-f1-macro-regression 
0.0 \ + --max-f1-weighted-regression 0.0 + + - name: Upload contract-type quality-gate result + uses: actions/upload-artifact@v4 + with: + name: contract-type-quality-gate + path: artifacts/contract_type_quality_gate.json + if-no-files-found: error + + packaging-smoke: + name: Packaging Smoke + runs-on: ubuntu-latest + timeout-minutes: 30 + steps: + - name: Check out repository + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: ${{ env.PYTHON_VERSION }} + + - name: Set up uv + uses: astral-sh/setup-uv@v6 + + - name: Build source and wheel artifacts + run: | + uv build + + - name: Validate artifact contents + run: python3 ci/check_dist_contents.py + + - name: Install wheel in clean env + run: | + uv venv .venv-smoke --python "${PYTHON_VERSION}" + uv pip install --python .venv-smoke/bin/python dist/*.whl + .venv-smoke/bin/python - <<'PY' + import lexnlp + print(getattr(lexnlp, "__version__", "unknown")) + PY diff --git a/.github/workflows/publish-contract-model.yml b/.github/workflows/publish-contract-model.yml new file mode 100644 index 00000000..a4f39627 --- /dev/null +++ b/.github/workflows/publish-contract-model.yml @@ -0,0 +1,111 @@ +name: Publish Contract Model (is-contract) + +on: + workflow_dispatch: + +# Note: this workflow publishes a GitHub Release asset in the repository where it +# runs (i.e., `github.repository`). LexNLP downloads release tags from the +# repository configured via `LEXNLP_MODELS_REPO` / `LEXNLP_MODELS_REPO_SLUG`. + +permissions: + contents: write + +env: + PYTHON_VERSION: "3.11" + # Ensure publish workflows work on forks even if the fork does not host the + # upstream model/corpus release tags needed for training/re-export. 
+ LEXNLP_MODELS_REPO_SLUG: "LexPredict/lexpredict-lexnlp" + SOURCE_TAG: "pipeline/is-contract/0.1" + MODEL_TAG: "pipeline/is-contract/0.2" + BASELINE_METRICS: "test_data/model_quality/is_contract_baseline_metrics.json" + +jobs: + publish: + name: Publish Re-exported Artifact + runs-on: ubuntu-latest + timeout-minutes: 60 + steps: + - name: Check out repository + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: ${{ env.PYTHON_VERSION }} + + - name: Set up uv + uses: astral-sh/setup-uv@v6 + + - name: Install dependencies + run: | + uv venv .venv --python "${PYTHON_VERSION}" + uv sync --frozen --python .venv/bin/python + + - name: Re-export contract model + env: + GITHUB_TOKEN: ${{ github.token }} + run: | + .venv/bin/python scripts/reexport_contract_model.py \ + --source-tag "${SOURCE_TAG}" \ + --target-tag "${MODEL_TAG}" \ + --baseline-metrics-json "${BASELINE_METRICS}" \ + --output-metadata-json "artifacts/model_reexports/${MODEL_TAG//\//__}.metadata.json" \ + --skip-quality-gate \ + --force + + - name: Run contract model quality gate + env: + GITHUB_TOKEN: ${{ github.token }} + LEXNLP_IS_CONTRACT_MODEL_TAG: ${{ env.MODEL_TAG }} + run: | + .venv/bin/python scripts/model_quality_gate.py \ + --baseline-tag "${SOURCE_TAG}" \ + --candidate-tag "${MODEL_TAG}" \ + --baseline-metrics-json "${BASELINE_METRICS}" \ + --output-json artifacts/model_quality_gate.json \ + --max-accuracy-regression 0.0 \ + --max-f1-regression 0.0 + + - name: Locate model file + id: locate + env: + LEXNLP_IS_CONTRACT_MODEL_TAG: ${{ env.MODEL_TAG }} + run: | + MODEL_PATH="$( + .venv/bin/python - <<'PY' + from lexnlp.ml.catalog import get_path_from_catalog + import os + + tag = (os.getenv("LEXNLP_IS_CONTRACT_MODEL_TAG") or "").strip() + if not tag: + raise SystemExit("Missing LEXNLP_IS_CONTRACT_MODEL_TAG") + print(get_path_from_catalog(tag)) + PY + )" + test -f "${MODEL_PATH}" + echo "model_path=${MODEL_PATH}" >> "$GITHUB_OUTPUT" + + - 
name: Upload training + gate artifacts + uses: actions/upload-artifact@v4 + with: + name: contract-model-reexport + path: | + artifacts/model_reexports/*.metadata.json + artifacts/model_quality_gate.json + if-no-files-found: error + + - name: Create release if missing + env: + GH_TOKEN: ${{ github.token }} + run: | + if ! gh release view "${MODEL_TAG}" >/dev/null 2>&1; then + gh release create "${MODEL_TAG}" \ + --title "${MODEL_TAG}" \ + --notes "Runtime re-export of the LexNLP is-contract classifier for modern Python/scikit-learn." + fi + + - name: Upload model asset to release + env: + GH_TOKEN: ${{ github.token }} + run: | + gh release upload "${MODEL_TAG}" "${{ steps.locate.outputs.model_path }}" --clobber diff --git a/.github/workflows/publish-contract-type-runtime-model.yml b/.github/workflows/publish-contract-type-runtime-model.yml new file mode 100644 index 00000000..91222f68 --- /dev/null +++ b/.github/workflows/publish-contract-type-runtime-model.yml @@ -0,0 +1,110 @@ +name: Publish Contract-Type Runtime Model + +on: + workflow_dispatch: + +# Note: this workflow publishes a GitHub Release asset in the repository where it +# runs (i.e., `github.repository`). LexNLP downloads release tags from the +# repository configured in `lexnlp.MODELS_REPO`. + +permissions: + contents: write + +env: + PYTHON_VERSION: "3.11" + # Ensure publish workflows work on forks even if the fork does not host the + # upstream model/corpus release tags needed for training. 
+ LEXNLP_MODELS_REPO_SLUG: "LexPredict/lexpredict-lexnlp" + MODEL_TAG: "pipeline/contract-type/0.2-runtime" + BASELINE_METRICS: "test_data/model_quality/contract_type_baseline_metrics.json" + +jobs: + publish: + name: Publish Runtime Artifact + runs-on: ubuntu-latest + timeout-minutes: 60 + steps: + - name: Check out repository + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: ${{ env.PYTHON_VERSION }} + + - name: Set up uv + uses: astral-sh/setup-uv@v6 + + - name: Install dependencies + run: | + uv venv .venv --python "${PYTHON_VERSION}" + uv sync --frozen --python .venv/bin/python + + - name: Train runtime contract-type model + env: + GITHUB_TOKEN: ${{ github.token }} + LEXNLP_CONTRACT_TYPE_MODEL_TAG: ${{ env.MODEL_TAG }} + run: | + .venv/bin/python scripts/train_contract_type_model.py \ + --target-tag "${MODEL_TAG}" \ + --force \ + --output-json artifacts/model_training/contract_type_model_training_report.json + + - name: Run contract-type quality gate + env: + GITHUB_TOKEN: ${{ github.token }} + LEXNLP_CONTRACT_TYPE_MODEL_TAG: ${{ env.MODEL_TAG }} + run: | + .venv/bin/python scripts/contract_type_quality_gate.py \ + --baseline-tag "${MODEL_TAG}" \ + --candidate-tag "${MODEL_TAG}" \ + --baseline-metrics-json "${BASELINE_METRICS}" \ + --output-json artifacts/contract_type_quality_gate.json \ + --max-accuracy-top1-regression 0.0 \ + --max-accuracy-topn-regression 0.0 \ + --max-f1-macro-regression 0.0 \ + --max-f1-weighted-regression 0.0 + + - name: Locate model file + id: locate + run: | + MODEL_PATH="$( + .venv/bin/python - <<'PY' + from lexnlp.extract.en.contracts.runtime_model import ( + CONTRACT_TYPE_MODEL_FILENAME, + ) + from lexnlp.ml.catalog import CATALOG + import os + + tag = (os.getenv("MODEL_TAG") or "").strip() or "pipeline/contract-type/0.2-runtime" + path = CATALOG / tag / CONTRACT_TYPE_MODEL_FILENAME + print(path) + PY + )" + test -f "${MODEL_PATH}" + echo "model_path=${MODEL_PATH}" >> 
"$GITHUB_OUTPUT" + + - name: Upload training + gate artifacts + uses: actions/upload-artifact@v4 + with: + name: contract-type-runtime-build + path: | + artifacts/model_training/contract_type_model_training_report.json + artifacts/contract_type_quality_gate.json + if-no-files-found: error + + - name: Create release if missing + env: + GH_TOKEN: ${{ github.token }} + run: | + if ! gh release view "${MODEL_TAG}" >/dev/null 2>&1; then + gh release create "${MODEL_TAG}" \ + --title "${MODEL_TAG}" \ + --notes "Runtime-compatible contract-type classifier for LexNLP." + fi + + - name: Upload model asset to release + env: + GH_TOKEN: ${{ github.token }} + run: | + gh release upload "${MODEL_TAG}" "${{ steps.locate.outputs.model_path }}" --clobber diff --git a/.gitignore b/.gitignore index 3a21bea2..1907c67d 100644 --- a/.gitignore +++ b/.gitignore @@ -110,7 +110,8 @@ ENV/ # mypy .mypy_cache/ +artifacts/ + benchmarks lexnlp/extract/en/contracts/data/*.model /.pytest_cache/ - diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 00000000..941da6a1 --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,219 @@ +# AGENTS.md + +This document is a quick-start guide for coding agents working in this repository. + +## Project Summary + +- Project: `lexpredict-lexnlp` (LexNLP) +- Purpose: legal-text NLP and information extraction library +- Primary package: `lexnlp/` +- Packaging: `pyproject.toml` (setuptools backend; version in repo: `2.3.0`) +- Python requirement in `pyproject.toml`: `>=3.10,<3.13` (default to Python `3.11`) + +## Directory Structure + +```text +. 
+|-- lexnlp/ # Main package +| |-- config/ # Locale-specific configuration (en, de, es) +| |-- extract/ # Extraction modules by locale and domain +| | |-- common/ +| | |-- en/ +| | |-- de/ +| | |-- es/ +| | `-- ml/ +| |-- ml/ # ML utilities/catalog helpers +| |-- nlp/ # NLP components and training helpers +| |-- tests/ # Shared test helpers + tests +| `-- utils/ # Utility modules and utility tests +|-- test_data/ # Fixtures, sample inputs, expected outputs +|-- scripts/ # Helper scripts (Tika, release, data helpers) +|-- libs/ # Download/runtime helper scripts and assets +|-- notebooks/ # Exploratory notebooks by topic +|-- documentation/ # Sphinx docs source +|-- pyproject.toml # Canonical packaging/dependency metadata +|-- python-requirements.txt # Deprecated legacy dependency snapshot +|-- python-requirements-dev.txt # Deprecated legacy dev/test snapshot +|-- Pipfile # Deprecated legacy pipenv workflow +|-- .pylintrc # Lint configuration +|-- .travis.yml # Historical CI reference +|-- setup.py # Legacy compatibility wrapper +`-- AGENTS.md +``` + +## Environment Setup (Recommended: uv) + +Use Python 3.11 in a local `.venv`. + +```bash +cd /Users/jackeames/Downloads/LexNLP +uv python install 3.11 +uv venv --python 3.11 .venv +uv sync --frozen --python .venv/bin/python --extra dev --extra test +``` + +Notes: +- `uv sync` installs the project editable by default (good for local development). +- If you want a non-editable install (closer to a wheel install), add `--no-editable`. + +```bash +# Only needed if you used --no-install-project above: +uv pip install --python .venv/bin/python -e ".[dev,test]" +``` + +### Deprecated setup variants + +`Pipfile`, `python-requirements.txt`, and `python-requirements-dev.txt` are deprecated. Use `uv` with `pyproject.toml` for all new local setup and CI updates. 
+ +## Required Runtime/Test Assets + +Use the bootstrap script for deterministic setup: + +```bash +./.venv/bin/python scripts/bootstrap_assets.py --nltk --contract-model --contract-type-model +``` + +Advanced: redirect model downloads to a different GitHub repository: + +```bash +# full GitHub API base URL (must point at `/releases/tags/`) +export LEXNLP_MODELS_REPO="https://api.github.com/repos/<owner>/<repo>/releases/tags/" + +# or: slug form (LexNLP constructs the API URL) +export LEXNLP_MODELS_REPO_SLUG="<owner>/<repo>" +``` + +Optional assets: + +```bash +# Stanford +./.venv/bin/python scripts/bootstrap_assets.py --stanford + +# Tika +./.venv/bin/python scripts/bootstrap_assets.py --tika +``` + +## Stanford-Dependent Tests + +Stanford tests are gated by `LEXNLP_USE_STANFORD=true`. + +1. Install Java: +```bash +brew install openjdk@11 +``` + +2. Ensure Java is on path for test commands: +```bash +export PATH="/opt/homebrew/opt/openjdk@11/bin:$PATH" +``` + +3. Download Stanford assets to `libs/stanford_nlp`: +- `stanford-postagger-full-2017-06-09` +- `stanford-ner-2017-06-09` + +Expected files: +- `libs/stanford_nlp/stanford-postagger-full-2017-06-09/stanford-postagger.jar` +- `libs/stanford_nlp/stanford-postagger-full-2017-06-09/models/english-bidirectional-distsim.tagger` +- `libs/stanford_nlp/stanford-ner-2017-06-09/stanford-ner.jar` +- `libs/stanford_nlp/stanford-ner-2017-06-09/classifiers/english.all.3class.distsim.crf.ser.gz` + +## Tika Notes + +`scripts/download_tika.sh` can fail on macOS because it assumes GNU `mkdir --parents` and `wget`. +If needed, manually download `tika-app-1.16.jar` and `tika-server-1.16.jar` into `bin/` using `curl`. + +Migration and troubleshooting details are in `MIGRATION_RUNBOOK.md`. + +## Test Integrity Policy + +- Do not add, remove, or modify `skip`, `skipif`, or `xfail` markers to bypass failures. +- Fix failing behavior or document a real external blocker; never mask regressions by changing skip behavior. 
+- Any `skip`/`skipif`/`xfail` that is genuinely required (e.g., external dependency outage) must include an inline annotation: + - `skip-audit: issue=<issue-url> expires=YYYY-MM-DD` + - CI enforces this via `ci/skip_audit.py`. + - Prefer inline annotations; `ci/skip_audit_allowlist.txt` is reserved for rare cases where annotation is not feasible: + - Use stable allowlist keys (not line-number based) + - Generate keys with: `python ci/skip_audit.py --print-markers` +- Validation target is **100% pass** for required suites. + +## Full Validation Commands (100% pass target) + +Run in two phases: + +1. Base suite: +```bash +./.venv/bin/pytest lexnlp +``` + +2. Stanford-only suite: +```bash +PATH=/opt/homebrew/opt/openjdk@11/bin:$PATH \ +LEXNLP_USE_STANFORD=true \ +./.venv/bin/pytest \ + lexnlp/nlp/en/tests/test_stanford.py \ + lexnlp/extract/en/entities/tests/test_stanford_ner.py +``` + +When Stanford assets are installed and enabled, both phases must pass (0 failures) for a **100% pass** result. + +Note: a single monolithic `LEXNLP_USE_STANFORD=true` run can occasionally hang in non-Stanford modules on some machines, so prefer the two-phase approach. 
+ +## Common Commands + +```bash +# quick dependency sanity +./.venv/bin/pip check + +# packaging content sanity +python3 ci/check_dist_contents.py + +# contract model quality gate (baseline metrics) +./.venv/bin/python scripts/model_quality_gate.py \ + --baseline-tag pipeline/is-contract/0.1 \ + --candidate-tag pipeline/is-contract/0.1 \ + --baseline-metrics-json test_data/model_quality/is_contract_baseline_metrics.json + +# contract-type model quality gate (baseline metrics) +./.venv/bin/python scripts/contract_type_quality_gate.py \ + --baseline-tag pipeline/contract-type/0.2-runtime \ + --candidate-tag pipeline/contract-type/0.2-runtime \ + --baseline-metrics-json test_data/model_quality/contract_type_baseline_metrics.json + +# build a runtime-compatible contract-type model artifact +./.venv/bin/python scripts/train_contract_type_model.py \ + --target-tag pipeline/contract-type/0.2-runtime + +# create a re-exported candidate model tag and validate it +./.venv/bin/python scripts/reexport_contract_model.py \ + --source-tag pipeline/is-contract/0.1 \ + --target-tag pipeline/is-contract/0.2 \ + --baseline-metrics-json test_data/model_quality/is_contract_baseline_metrics.json + +# refresh bundled sklearn artifacts under the current runtime +./.venv/bin/python scripts/reexport_bundled_sklearn_models.py + +# run one file +./.venv/bin/pytest lexnlp/extract/en/tests/test_dates.py + +# historical CI-style command +./.venv/bin/pytest --cov lexnlp --pylint --pylint-rcfile=.pylintrc lexnlp +``` + +## Implementation Guidelines + +- Keep changes scoped to the relevant locale/module (`extract/en`, `extract/de`, etc.). +- Add or update tests alongside behavior changes. +- Prefer existing utilities under `lexnlp/utils/` over introducing duplicates. +- When adding extraction patterns/models, include representative fixtures in `test_data/`. +- Avoid committing downloaded/generated third-party assets unless explicitly required. 
+ +## Pull Request Checklist + +- Editable install works: `uv pip install --python .venv/bin/python -e ".[dev,test]"` +- Targeted tests for changed modules pass. +- Full base run (`pytest lexnlp`) passes. +- If Stanford assets are enabled, Stanford-only suite with `LEXNLP_USE_STANFORD=true` passes. +- Contract model quality gate passes against `test_data/model_quality/is_contract_baseline_metrics.json`. +- Contract-type smoke flow works (`scripts/bootstrap_assets.py --contract-type-model` + predictor instantiation). +- No `skip`/`skipif`/`xfail` policy bypasses were introduced. +- Document any required asset downloads (NLTK, pipeline models, Stanford, Tika) in PR notes. diff --git a/DEPENDENCY_MODERNIZATION_PLAN.md b/DEPENDENCY_MODERNIZATION_PLAN.md new file mode 100644 index 00000000..022c14eb --- /dev/null +++ b/DEPENDENCY_MODERNIZATION_PLAN.md @@ -0,0 +1,59 @@ +# Dependency Modernization With Zero-Skip Test Policy + +## Summary + +- Treat `AGENTS.md` as policy-enforcing documentation for engineering agents. +- Make dependency management current (`uv` + single source of truth). +- Keep behavior stable while upgrading packages and model artifacts. +- Require all collected tests to pass with fully provisioned dependencies. + +## Scope + +### In + +- Add explicit “do not skip/disable tests” policy to `AGENTS.md`. +- Consolidate dependency definitions into one modern packaging workflow. +- Make full test execution deterministic, including currently optional integrations. +- Introduce a controlled model-improvement pipeline with measurable acceptance gates. + +### Out + +- Feature-level extraction redesign unrelated to dependency/model reliability. +- Permanent acceptance of partial-pass builds. + +## Action Items + +- [ ] Add a **Test Integrity Policy** section to `AGENTS.md` that forbids adding/removing `skip`, `skipif`, or `xfail` to bypass failures, and requires fixing root causes instead. 
+- [ ] Define a **100% pass target** as: all collected tests pass on a fully provisioned runner, including Stanford-gated tests when required assets are present. +- [ ] Add a **skip-audit check** in CI that fails if new skip/xfail markers are introduced without an approved issue link and expiry date. +- [ ] Consolidate packaging to `pyproject.toml` + lockfile and deprecate conflicting manifests (`Pipfile`, split requirements variants) after parity is captured. +- [ ] Standardize runtime on modern Python (default 3.11) and align metadata/docs/CI to that policy. +- [ ] Replace brittle setup scripts with deterministic bootstrap steps for NLTK corpora, contract pipeline artifacts, Java/Stanford assets, and optional Tika. +- [ ] Run compatibility validation for serialized ML pipelines against upgraded `scikit-learn`; retrain/re-export artifacts when incompatible, with explicit version tags. +- [ ] Add a model quality gate: compare old vs new models on fixed evaluation fixtures and accept upgrades only when metrics improve or regressions are within strict tolerance. +- [ ] Publish a migration runbook in repo docs with exact commands for local setup, full test run, optional component enablement, and failure triage. +- [ ] Roll out in staged PRs (policy/doc first, packaging second, CI third, model upgrades fourth), each required to stay green end-to-end. + +## Important Changes to Interfaces + +- Installation interface becomes `uv` + `pyproject.toml` driven. +- Dependency groups become explicit extras (`dev`, `test`, `stanford`, `tika`). +- Model artifact interface becomes versioned and benchmark-gated (new tags for improved models). +- No intended changes to user-facing extraction function signatures during dependency modernization. + +## Test Scenarios + +- Fresh environment bootstrap succeeds with documented commands only. +- Base suite passes with zero unexpected skips/failures. +- Stanford suite passes when provisioned and enabled. 
+- CI skip-audit catches any new bypass markers. +- Model regression suite compares baseline vs upgraded outputs and blocks degradations. +- Built wheel installs in clean venv and passes smoke tests. + +## Assumptions and Defaults + +- Default judgment: this should be done now. +- Default policy: no test disabling for convenience; failures are fixed, not hidden. +- Default Python target: 3.11 (with compatibility checks for adjacent supported versions). +- Default model policy: prefer newer models/methods only when measured outputs are better. +- Default blocker handling: if an external dependency outage prevents completion, build is marked blocked/failing with explicit root-cause notes, not treated as pass. diff --git a/MANIFEST.in b/MANIFEST.in index 983e9b3e..7d3aa75c 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,11 +1,18 @@ include README.md include README.rst include index.rst -include Pipfile -include Pipfile.lock +exclude Pipfile +exclude Pipfile.lock +exclude python-requirements.txt +exclude python-requirements-dev.txt +exclude python-requirements-full.txt recursive-include lexnlp *.pickle recursive-include lexnlp/extract/en/addresses *.json *.txt *.xml recursive-include lexnlp *.csv -recursive-include libs * -recursive-include scripts * +include libs/download_stanford_nlp.sh +include libs/download_wiki.sh +recursive-include scripts *.py *.sh recursive-include documentation * +prune libs/stanford_nlp +global-exclude __pycache__ +global-exclude *.py[cod] diff --git a/MIGRATION_RUNBOOK.md b/MIGRATION_RUNBOOK.md new file mode 100644 index 00000000..6f299a61 --- /dev/null +++ b/MIGRATION_RUNBOOK.md @@ -0,0 +1,247 @@ +# Dependency Migration Runbook + +This runbook is the operational guide for maintaining a modern, reproducible LexNLP environment with a zero-skip testing policy. 
+ +## 1) Toolchain Baseline + +- Python: `3.11` (default), supported range in `pyproject.toml`: `>=3.10,<3.13` +- Packaging/dependencies: `pyproject.toml` + `uv.lock` +- Installer/runner: `uv` + +Legacy files are retained for historical reproduction only: +- `Pipfile` +- `python-requirements.txt` +- `python-requirements-dev.txt` +- `python-requirements-full.txt` + +## 2) Fresh Setup + +```bash +cd /Users/jackeames/Downloads/LexNLP +uv python install 3.11 +uv venv --python 3.11 .venv +uv sync --frozen --python .venv/bin/python --extra dev --extra test +``` + +Notes: +- `uv sync` installs the project editable by default (good for local development). +- If you want a non-editable install (closer to a wheel install), add `--no-editable`. + +```bash +# Only needed if you used --no-install-project above: +uv pip install --python .venv/bin/python -e ".[dev,test]" +``` + +## 3) Bootstrap Required Assets + +```bash +# NLTK + required model artifacts +./.venv/bin/python scripts/bootstrap_assets.py --nltk --contract-model --contract-type-model + +# Optional: Stanford assets for Stanford-gated tests +./.venv/bin/python scripts/bootstrap_assets.py --stanford + +# Optional: Tika jars +./.venv/bin/python scripts/bootstrap_assets.py --tika +``` + +## 4) Policy Checks + +```bash +# Fail if unapproved skip/skipif/xfail markers were added +./.venv/bin/python ci/skip_audit.py +``` + +## 5) Full Validation (100% pass target) + +```bash +# Base suite +./.venv/bin/pytest lexnlp + +# Stanford-only suite (requires Stanford assets + Java) +PATH=/opt/homebrew/opt/openjdk@11/bin:$PATH \ +LEXNLP_USE_STANFORD=true \ +./.venv/bin/pytest \ + lexnlp/nlp/en/tests/test_stanford.py \ + lexnlp/extract/en/entities/tests/test_stanford_ner.py +``` + +Passing both commands is the required 100% result for a fully provisioned environment. 
+ +## 6) Packaging Validation + +```bash +uv build +python3 ci/check_dist_contents.py +uv venv --python 3.11 .venv-smoke +uv pip install --python .venv-smoke/bin/python dist/*.whl +.venv-smoke/bin/python -c "import lexnlp; print(lexnlp.__version__)" +``` + +## 7) Model Upgrade Quality Gate + +Use the quality gate script before adopting a new contract-model artifact. + +The committed baseline metrics file is: +- `test_data/model_quality/is_contract_baseline_metrics.json` + +To create a modern candidate artifact by re-serializing the baseline model +under the current runtime (Python/scikit-learn), use: + +```bash +./.venv/bin/python scripts/reexport_contract_model.py \ + --source-tag pipeline/is-contract/0.1 \ + --target-tag pipeline/is-contract/0.2 \ + --baseline-metrics-json test_data/model_quality/is_contract_baseline_metrics.json +``` + +This writes model-export metadata to +`artifacts/model_reexports/pipeline__is-contract__0.2.metadata.json` by default. + +### Retrain candidate classifier from corpora (phase 2 path) + +For a fuller upgrade than pure re-serialization, train a new classifier while +reusing LexNLP baseline preprocessing/vectorization steps: + +```bash +./.venv/bin/python scripts/train_contract_model.py \ + --baseline-tag pipeline/is-contract/0.1 \ + --candidate-tag pipeline/is-contract/0.2 \ + --baseline-metrics-json test_data/model_quality/is_contract_baseline_metrics.json \ + --max-f1-regression 0.0 \ + --max-accuracy-regression 0.0 \ + --force +``` + +Training report output: +- `artifacts/model_training/contract_model_training_report.json` + +The script automatically: +- downloads configured corpora tags if missing, +- trains multiple estimator candidates, +- selects best by validation metrics (F1 first), +- writes candidate artifact to catalog, +- runs `scripts/model_quality_gate.py` unless skipped. 
+
+Candidate evaluation command:
+
+```bash
+./.venv/bin/python scripts/model_quality_gate.py \
+  --baseline-tag pipeline/is-contract/0.1 \
+  --candidate-tag pipeline/is-contract/0.2 \
+  --baseline-metrics-json test_data/model_quality/is_contract_baseline_metrics.json \
+  --fixture test_data/lexnlp/extract/en/contracts/tests/test_contracts/test_is_contract.csv \
+  --max-f1-regression 0.0 \
+  --max-accuracy-regression 0.0
+```
+
+Default policy is non-regression against baseline metrics from
+`pipeline/is-contract/0.1` on the fixed fixture above.
+
+### Runtime model-tag overrides
+
+Predictors can select newer validated tags without API/signature changes:
+
+```bash
+# is-contract classifier
+export LEXNLP_IS_CONTRACT_MODEL_TAG="pipeline/is-contract/0.2"
+
+# contract-type classifier
+export LEXNLP_CONTRACT_TYPE_MODEL_TAG="pipeline/contract-type/0.2-runtime"
+```
+
+### Models repo override (advanced)
+
+By default, model/corpus tags are downloaded from the upstream LexPredict
+repository via the GitHub API (`lexnlp.DEFAULT_MODELS_REPO`).
+
+For forks or air-gapped mirrors, you can redirect downloads:
+
+```bash
+# Option A: set a full GitHub API base URL (must point at `/releases/tags/`)
+export LEXNLP_MODELS_REPO="https://api.github.com/repos/<owner>/<repo>/releases/tags/"
+
+# Option B: set the slug (LexNLP constructs the GitHub API tags endpoint)
+export LEXNLP_MODELS_REPO_SLUG="<owner>/<repo>"
+```
+
+### Contract-type runtime fallback model
+
+The legacy `pipeline/contract-type/0.1` artifact may fail to unpickle on modern
+Python runtimes. LexNLP now supports a deterministic runtime-compatible fallback
+artifact (`pipeline/contract-type/0.2-runtime`) trained from
+`corpus/contract-types/0.1`.
+ +The committed contract-type fixture and baseline metrics file are: +- Fixture: `test_data/lexnlp/extract/en/contracts/tests/test_contracts/test_contract_type.csv` +- Baseline: `test_data/model_quality/contract_type_baseline_metrics.json` + +Run the contract-type quality gate: + +```bash +./.venv/bin/python scripts/contract_type_quality_gate.py \ + --baseline-tag pipeline/contract-type/0.2-runtime \ + --candidate-tag pipeline/contract-type/0.2-runtime \ + --baseline-metrics-json test_data/model_quality/contract_type_baseline_metrics.json \ + --output-json artifacts/contract_type_quality_gate.json \ + --max-accuracy-top1-regression 0.0 \ + --max-accuracy-topn-regression 0.0 \ + --max-f1-macro-regression 0.0 \ + --max-f1-weighted-regression 0.0 +``` + +Build/rebuild it explicitly: + +```bash +./.venv/bin/python scripts/bootstrap_assets.py --contract-type-model +``` + +Or run full training with report output: + +```bash +./.venv/bin/python scripts/train_contract_type_model.py \ + --target-tag pipeline/contract-type/0.2-runtime \ + --output-json artifacts/model_training/contract_type_model_training_report.json +``` + +On first use of `ProbabilityPredictorContractType`, if legacy default loading +fails and no env override is set, LexNLP automatically builds/loads this runtime +fallback tag. 
+
+If baseline-tag model behavior is intentionally changed, regenerate and review
+the baseline metrics file in the same PR:
+
+```bash
+./.venv/bin/python scripts/model_quality_gate.py \
+  --baseline-tag pipeline/is-contract/0.1 \
+  --candidate-tag pipeline/is-contract/0.1 \
+  --fixture test_data/lexnlp/extract/en/contracts/tests/test_contracts/test_is_contract.csv \
+  --write-baseline-metrics-json test_data/model_quality/is_contract_baseline_metrics.json
+```
+
+### Refresh bundled sklearn artifacts
+
+If bundled sklearn artifacts emit legacy-version warnings, re-serialize them on
+the current runtime and re-run targeted tests:
+
+```bash
+./.venv/bin/python scripts/reexport_bundled_sklearn_models.py
+```
+
+## 8) Failure Triage
+
+- `LookupError` for NLTK resources:
+  - Re-run `scripts/bootstrap_assets.py --nltk`
+- Contract tests failing with missing model tag:
+  - Re-run `scripts/bootstrap_assets.py --contract-model`
+- Stanford tests failing due to missing jars/models:
+  - Re-run `scripts/bootstrap_assets.py --stanford`
+- Skip-audit failure:
+  - Remove the marker, or add annotation:
+    - `# skip-audit: issue=<ISSUE-ID-OR-URL> expires=YYYY-MM-DD`
+  - In rare cases where annotation is not feasible, allowlist the marker:
+    - Use the stable key format (not line-number based)
+    - Generate keys with: `python ci/skip_audit.py --print-markers`
+    - Add the stable key to `ci/skip_audit_allowlist.txt`
+- Packaging smoke failure:
+  - Ensure build artifacts are generated with `uv build` and install into a clean venv
diff --git a/Pipfile b/Pipfile
index 31e08076..bce5e800 100644
--- a/Pipfile
+++ b/Pipfile
@@ -1,3 +1,6 @@
+# DEPRECATED: Legacy pipenv manifest retained only for historical reproduction.
+# Use pyproject.toml + uv.lock for all active development and CI workflows.
+
 [[source]]
 url = "https://pypi.org/simple"
 verify_ssl = true
diff --git a/README.md b/README.md
index 32b20fb6..29c54961 100755
--- a/README.md
+++ b/README.md
@@ -42,8 +42,40 @@
 evaluation license by contacting ContraxSuite Licensing at <>.
 
 ## Requirements
-* Python 3.8
-* pipenv
+* Python 3.11 (default; supported range is defined in `pyproject.toml`)
+* `uv`
+
+## Quick Setup (uv + pyproject)
+```bash
+cd /path/to/lexpredict-lexnlp
+uv python install 3.11
+uv venv --python 3.11 .venv
+uv pip install --python .venv/bin/python -e ".[dev,test]"
+./.venv/bin/python scripts/bootstrap_assets.py --nltk --contract-model
+```
+
+## Deprecated Setup Variants
+`Pipfile`, `python-requirements.txt`, and `python-requirements-dev.txt` are deprecated and kept only for legacy reproduction. New development and CI updates should use `uv` with `pyproject.toml`.
+
+## Migration Runbook
+See `MIGRATION_RUNBOOK.md` for complete migration/triage/quality-gate procedures.
+
+## Test Integrity and Full Validation
+- Do not add/remove/modify `skip`, `skipif`, or `xfail` markers to bypass failing tests.
+- Target is **100% pass**.
+- If Stanford assets are enabled, 100% pass includes both base and Stanford-only suites.
+
+```bash
+# Base suite
+./.venv/bin/pytest lexnlp
+
+# Stanford-only suite (run when Stanford assets are installed)
+PATH=/opt/homebrew/opt/openjdk/bin:$PATH \
+LEXNLP_USE_STANFORD=true \
+./.venv/bin/pytest \
+  lexnlp/nlp/en/tests/test_stanford.py \
+  lexnlp/extract/en/entities/tests/test_stanford_ner.py
+```
 
 ## Releases
 * 2.3.0: November 30, 2022 - Twenty sixth scheduled public release; [code](https://github.com/LexPredict/lexpredict-lexnlp/tree/2.3.0)
diff --git a/README.rst b/README.rst
index 4d12bbd9..b8fa1d0f 100644
--- a/README.rst
+++ b/README.rst
@@ -75,8 +75,27 @@
 terms or a non-GPL evaluation license by contacting ContraxSuite Licensing at
 
 Requirements
 ------------
-- Python 3.8
-- pipenv
+- Python 3.11 (default; supported range is defined in ``pyproject.toml``)
+- ``uv``
+
+Quick Setup (uv + pyproject)
+----------------------------
+
+.. code:: bash
+
+   cd /path/to/lexpredict-lexnlp
+   uv python install 3.11
+   uv venv --python 3.11 .venv
+   uv pip install --python .venv/bin/python -e ".[dev,test]"
+   ./.venv/bin/python scripts/bootstrap_assets.py --nltk --contract-model
+
+Deprecated Setup Variants
+-------------------------
+
+``Pipfile``, ``python-requirements.txt``, ``python-requirements-dev.txt``,
+and ``python-requirements-full.txt`` are deprecated and kept only for
+legacy reproduction. New development and CI updates should use ``uv`` with
+``pyproject.toml``.
Releases -------- diff --git a/ci/check_dist_contents.py b/ci/check_dist_contents.py new file mode 100644 index 00000000..a8096f24 --- /dev/null +++ b/ci/check_dist_contents.py @@ -0,0 +1,94 @@ +#!/usr/bin/env python3 +"""Validate built distribution contents.""" + +from __future__ import annotations + +import sys +import tarfile +import zipfile +from pathlib import Path +from typing import Iterable, List + +BANNED_SUBSTRINGS = ( + "libs/stanford_nlp/", + "scripts/__pycache__/", +) + +BANNED_BASENAMES = ( + "Pipfile", + "Pipfile.lock", + "python-requirements.txt", + "python-requirements-dev.txt", + "python-requirements-full.txt", +) + +BANNED_SUFFIXES = ( + ".pyc", + ".pyo", +) + + +def iter_tar_names(path: Path) -> Iterable[str]: + with tarfile.open(path, "r:*") as archive: + for member in archive.getmembers(): + if member.isfile(): + yield member.name + + +def iter_zip_names(path: Path) -> Iterable[str]: + with zipfile.ZipFile(path) as archive: + for name in archive.namelist(): + if not name.endswith("/"): + yield name + + +def find_violations(names: Iterable[str]) -> List[str]: + violations: List[str] = [] + for name in names: + normalized = name.replace("\\", "/") + if any(token in normalized for token in BANNED_SUBSTRINGS): + violations.append(normalized) + continue + if normalized.rsplit("/", 1)[-1] in BANNED_BASENAMES: + violations.append(normalized) + continue + if normalized.endswith(BANNED_SUFFIXES): + violations.append(normalized) + return violations + + +def main(argv: list[str]) -> int: + dist_dir = Path(argv[0]) if argv else Path("dist") + if not dist_dir.exists(): + print(f"dist-check: missing dist directory: {dist_dir}", file=sys.stderr) + return 1 + + artifacts = sorted( + [*dist_dir.glob("*.whl"), *dist_dir.glob("*.tar.gz")] + ) + if not artifacts: + print(f"dist-check: no build artifacts found under {dist_dir}", file=sys.stderr) + return 1 + + failures: List[str] = [] + for artifact in artifacts: + if artifact.suffix == ".whl": + names = 
iter_zip_names(artifact)
+        else:
+            names = iter_tar_names(artifact)
+        violations = find_violations(names)
+        for violation in violations:
+            failures.append(f"{artifact.name}: {violation}")
+
+    if failures:
+        print("dist-check: forbidden files found in artifacts", file=sys.stderr)
+        for failure in failures:
+            print(f"  - {failure}", file=sys.stderr)
+        return 1
+
+    print("dist-check: OK")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main(sys.argv[1:]))
diff --git a/ci/skip_audit.py b/ci/skip_audit.py
new file mode 100644
index 00000000..1cc1ebdb
--- /dev/null
+++ b/ci/skip_audit.py
@@ -0,0 +1,276 @@
+#!/usr/bin/env python3
+"""Audit pytest skip markers.
+
+Policy:
+- New pytest skip markers must include a nearby annotation:
+  `skip-audit: issue=... expires=YYYY-MM-DD`
+- Existing markers can be grandfathered via an allowlist file.
+"""
+
+from __future__ import annotations
+
+import argparse
+import ast
+import datetime as dt
+import hashlib
+import re
+import subprocess
+import sys
+import warnings
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Dict, List, Optional, Sequence, Set, Tuple
+
+ANNOTATION_RE = re.compile(
+    r"skip-audit:\s*issue=(?P<issue>\S+)\s+expires=(?P<expires>\d{4}-\d{2}-\d{2})"
+)
+MARKER_NAMES = {"skip", "skipif", "xfail"}
+LOOKBACK_LINES = 2
+
+
+@dataclass(frozen=True)
+class Marker:
+    path: Path
+    line: int
+    col: int
+    kind: str
+    expression: str
+
+    @property
+    def key(self) -> str:
+        # Legacy allowlist key (deprecated): line-number based.
+        return f"{self.path.as_posix()}:{self.line}:{self.kind}"
+
+    @property
+    def stable_key(self) -> str:
+        # Stable allowlist key: resilient to line movements and formatting changes
+        # outside the marker expression itself.
+ digest = hashlib.sha256(self.expression.encode("utf-8")).hexdigest()[:12] + return f"{self.path.as_posix()}:{self.kind}:sha256={digest}" + + +def parse_args(argv: Sequence[str]) -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Fail if unapproved pytest skip markers are present." + ) + script_path = Path(__file__).resolve() + default_repo_root = script_path.parent.parent + parser.add_argument( + "--repo-root", + default=str(default_repo_root), + help="Repository root path (default: %(default)s)", + ) + parser.add_argument( + "--allowlist", + default="ci/skip_audit_allowlist.txt", + help="Allowlist path (relative to repo root unless absolute).", + ) + parser.add_argument( + "--print-markers", + action="store_true", + help="Print detected skip markers and exit (useful for allowlisting).", + ) + return parser.parse_args(argv) + + +def marker_kind(node: ast.AST) -> Optional[str]: + if not isinstance(node, ast.Attribute): + return None + if node.attr not in MARKER_NAMES: + return None + mark_parent = node.value + if ( + isinstance(mark_parent, ast.Attribute) + and mark_parent.attr == "mark" + and isinstance(mark_parent.value, ast.Name) + and mark_parent.value.id == "pytest" + ): + return node.attr + return None + + +def list_python_files(repo_root: Path) -> List[Path]: + try: + result = subprocess.run( + ["git", "ls-files", "*.py"], + cwd=repo_root, + check=True, + capture_output=True, + text=True, + ) + except (FileNotFoundError, subprocess.CalledProcessError): + return sorted(path for path in repo_root.rglob("*.py") if ".git" not in path.parts) + + files = [] + for line in result.stdout.splitlines(): + relative_path = line.strip() + if not relative_path: + continue + files.append(repo_root / relative_path) + return sorted(files) + + +def collect_markers(repo_root: Path) -> Tuple[List[Marker], List[str]]: + markers: List[Marker] = [] + parse_errors: List[str] = [] + + for file_path in list_python_files(repo_root): + relative_path = 
file_path.relative_to(repo_root) + try: + source = file_path.read_text(encoding="utf-8") + except UnicodeDecodeError as exc: + parse_errors.append( + f"{relative_path.as_posix()}: failed to decode as UTF-8 ({exc})" + ) + continue + + try: + with warnings.catch_warnings(): + # Older files can emit parser-level invalid escape warnings; + # they are unrelated to skip marker policy and should not fail the audit. + warnings.simplefilter("ignore", SyntaxWarning) + tree = ast.parse(source, filename=str(relative_path)) + except SyntaxError as exc: + parse_errors.append( + f"{relative_path.as_posix()}:{exc.lineno}: syntax error while parsing ({exc.msg})" + ) + continue + + parents: Dict[int, ast.AST] = {} + for parent in ast.walk(tree): + for child in ast.iter_child_nodes(parent): + parents[id(child)] = parent + + seen: Set[Tuple[int, int, str]] = set() + for node in ast.walk(tree): + kind: Optional[str] = None + if isinstance(node, ast.Call): + kind = marker_kind(node.func) + elif isinstance(node, ast.Attribute): + kind = marker_kind(node) + parent = parents.get(id(node)) + if isinstance(parent, ast.Call) and parent.func is node: + kind = None + + if kind is None: + continue + + key = (node.lineno, node.col_offset, kind) + if key in seen: + continue + seen.add(key) + + expression = ast.get_source_segment(source, node) or kind + markers.append( + Marker( + path=relative_path, + line=node.lineno, + col=node.col_offset, + kind=kind, + expression=" ".join(expression.split()), + ) + ) + + markers.sort(key=lambda marker: (marker.path.as_posix(), marker.line, marker.col)) + return markers, parse_errors + + +def load_allowlist(allowlist_path: Path) -> Set[str]: + if not allowlist_path.exists(): + return set() + entries: Set[str] = set() + for raw_line in allowlist_path.read_text(encoding="utf-8").splitlines(): + line = raw_line.strip() + if not line or line.startswith("#"): + continue + entries.add(line) + return entries + + +def find_annotation(lines: Sequence[str], marker_line: 
int) -> Optional[re.Match[str]]: + start = max(1, marker_line - LOOKBACK_LINES) + for line_number in range(marker_line, start - 1, -1): + line = lines[line_number - 1] + match = ANNOTATION_RE.search(line) + if match: + return match + return None + + +def main(argv: Sequence[str]) -> int: + args = parse_args(argv) + repo_root = Path(args.repo_root).resolve() + allowlist_path = Path(args.allowlist) + if not allowlist_path.is_absolute(): + allowlist_path = (repo_root / allowlist_path).resolve() + + allowlist = load_allowlist(allowlist_path) + markers, parse_errors = collect_markers(repo_root) + + if parse_errors: + print("skip-audit: parse errors detected", file=sys.stderr) + for parse_error in parse_errors: + print(f" - {parse_error}", file=sys.stderr) + return 1 + + if args.print_markers: + for marker in markers: + print(f"{marker.key} stable={marker.stable_key} expr={marker.expression}") + return 0 + + files_cache: Dict[Path, List[str]] = {} + today = dt.date.today() + violations: List[str] = [] + allowlisted_count = 0 + + for marker in markers: + if marker.key in allowlist or marker.stable_key in allowlist: + allowlisted_count += 1 + continue + + lines = files_cache.setdefault( + marker.path, (repo_root / marker.path).read_text(encoding="utf-8").splitlines() + ) + annotation_match = find_annotation(lines, marker.line) + display_id = f"{marker.path.as_posix()}:{marker.line}:{marker.kind}" + if annotation_match is None: + violations.append( + f"{display_id} missing annotation `skip-audit: issue=... 
expires=YYYY-MM-DD`"
+            )
+            continue
+
+        expires_raw = annotation_match.group("expires")
+        try:
+            expires_date = dt.date.fromisoformat(expires_raw)
+        except ValueError:
+            violations.append(f"{display_id} has invalid expires date: {expires_raw}")
+            continue
+
+        if expires_date < today:
+            violations.append(
+                f"{display_id} has expired annotation (expires={expires_raw}, today={today.isoformat()})"
+            )
+
+    if violations:
+        print("skip-audit: policy violations found", file=sys.stderr)
+        for violation in violations:
+            print(f"  - {violation}", file=sys.stderr)
+        print(
+            (
+                "skip-audit: either add a valid annotation near each marker or update "
+                f"{allowlist_path.relative_to(repo_root).as_posix()} for approved legacy markers."
+            ),
+            file=sys.stderr,
+        )
+        return 1
+
+    print(
+        "skip-audit: OK "
+        f"(markers={len(markers)}, allowlisted={allowlisted_count}, "
+        f"annotated_new={len(markers) - allowlisted_count})"
+    )
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main(sys.argv[1:]))
diff --git a/ci/skip_audit_allowlist.txt b/ci/skip_audit_allowlist.txt
new file mode 100644
index 00000000..2ba3116a
--- /dev/null
+++ b/ci/skip_audit_allowlist.txt
@@ -0,0 +1,12 @@
+# Grandfathered skip markers for pytest.
+# Formats (one per line):
+#   - Preferred (stable): path:marker:sha256=<digest12>
+#     - resilient to unrelated line movements
+#     - <digest12> is the first 12 chars of sha256(normalized marker expression)
+#     - get keys via: `python ci/skip_audit.py --print-markers`
+#   - Legacy (deprecated): path:line:marker
+#
+# Prefer annotations near the marker:
+#   skip-audit: issue=<ISSUE-ID-OR-URL> expires=YYYY-MM-DD
+#
+# This file is reserved for rare cases where annotation is not feasible.
diff --git a/lexnlp/__init__.py b/lexnlp/__init__.py index 9d664596..f3c6285a 100755 --- a/lexnlp/__init__.py +++ b/lexnlp/__init__.py @@ -13,7 +13,34 @@ USE_STANFORD = os.environ["LEXNLP_USE_STANFORD"].lower() == "true" if "LEXNLP_USE_STANFORD" in os.environ else False -MODELS_REPO: str = "https://api.github.com/repos/LexPredict/lexpredict-lexnlp/releases/tags/" +DEFAULT_MODELS_REPO_SLUG: str = "LexPredict/lexpredict-lexnlp" +DEFAULT_MODELS_REPO: str = ( + f"https://api.github.com/repos/{DEFAULT_MODELS_REPO_SLUG}/releases/tags/" +) + + +def get_models_repo() -> str: + """ + Base GitHub API URL for LexNLP release tags used by model/corpus downloads. + + Override options: + - LEXNLP_MODELS_REPO: full base URL (should end with `/releases/tags/`) + - LEXNLP_MODELS_REPO_SLUG: GitHub slug like `owner/repo` + + Note: This returns a *base* URL; callers append the tag name. + """ + url = (os.getenv("LEXNLP_MODELS_REPO") or "").strip() + if url: + return url if url.endswith("/") else f"{url}/" + + slug = (os.getenv("LEXNLP_MODELS_REPO_SLUG") or "").strip() + if slug: + return f"https://api.github.com/repos/{slug}/releases/tags/" + + return DEFAULT_MODELS_REPO + + +MODELS_REPO: str = get_models_repo() def get_module_path(): diff --git a/lexnlp/extract/de/date_model.pickle b/lexnlp/extract/de/date_model.pickle index f74d50ac..b1b1901e 100644 Binary files a/lexnlp/extract/de/date_model.pickle and b/lexnlp/extract/de/date_model.pickle differ diff --git a/lexnlp/extract/de/model.pickle b/lexnlp/extract/de/model.pickle index cce13729..49f76796 100644 Binary files a/lexnlp/extract/de/model.pickle and b/lexnlp/extract/de/model.pickle differ diff --git a/lexnlp/extract/en/addresses/addresses_clf.pickle b/lexnlp/extract/en/addresses/addresses_clf.pickle index 591b78df..4e1ff820 100644 Binary files a/lexnlp/extract/en/addresses/addresses_clf.pickle and b/lexnlp/extract/en/addresses/addresses_clf.pickle differ diff --git a/lexnlp/extract/en/addresses/tests/test_addresses.py 
b/lexnlp/extract/en/addresses/tests/test_addresses.py index 75f34909..7e37ae50 100644 --- a/lexnlp/extract/en/addresses/tests/test_addresses.py +++ b/lexnlp/extract/en/addresses/tests/test_addresses.py @@ -8,7 +8,6 @@ from lexnlp.extract.en.addresses.addresses import get_address_spans, _safe_index from lexnlp.tests import lexnlp_tests -from nose.tools import assert_true, assert_equal def test_get_address(): @@ -18,7 +17,7 @@ def test_get_address(): def test_safe_index(): actual = _safe_index('hello world', 'world', 1) - assert_equal(actual, 6) + assert actual == 6 def test_safe_index_not_found(): @@ -26,7 +25,7 @@ def test_safe_index_not_found(): _safe_index('hello world', 'world', 7) raise AssertionError('Should raise ValueError before this line') except ValueError as e: - assert_true('start' in str(e)) + assert 'start' in str(e) # def test_bad_cases(): # lexnlp_tests.test_extraction_func_on_test_data(get_addresses) diff --git a/lexnlp/extract/en/contracts/README.md b/lexnlp/extract/en/contracts/README.md index d92c8ba0..78c139fb 100644 --- a/lexnlp/extract/en/contracts/README.md +++ b/lexnlp/extract/en/contracts/README.md @@ -41,5 +41,57 @@ Training processes can be found under `notebooks/classification/contracts/` ## Contract Type Classifier -*Not yet implemented* +### Usage + +Instantiate the classifier: + +```python +from lexnlp.extract.en.contracts.predictors import ProbabilityPredictorContractType + +predictor = ProbabilityPredictorContractType() +predictions = predictor.make_predictions("This Employment Agreement is entered into ...", top_n=3) +print(predictions) +``` + +Or infer a single label with thresholding: + +```python +classification = predictor.detect_contract_type( + "This Employment Agreement is entered into ...", + min_probability=0.15, + max_closest_probability=0.75, + unknown_classification="", +) +print(classification) +``` +### Runtime model tags + +The default catalog tag for this predictor is `pipeline/contract-type/0.1`. 
+On modern Python runtimes, this legacy artifact may fail to unpickle. + +When legacy loading fails and no explicit override is configured, the predictor +auto-falls back to a runtime-compatible tag: `pipeline/contract-type/0.2-runtime`. + +You can explicitly select a tag at runtime: + +```bash +export LEXNLP_CONTRACT_TYPE_MODEL_TAG="pipeline/contract-type/0.2-runtime" +``` + +### Bootstrap / Training + +Build (or reuse) the runtime-compatible contract-type model from the released +corpus tag `corpus/contract-types/0.1`: + +```bash +python scripts/bootstrap_assets.py --contract-type-model +``` + +Train explicitly and write a training summary report: + +```bash +python scripts/train_contract_type_model.py \ + --target-tag pipeline/contract-type/0.2-runtime \ + --output-json artifacts/model_training/contract_type_model_training_report.json +``` diff --git a/lexnlp/extract/en/contracts/predictors.py b/lexnlp/extract/en/contracts/predictors.py index 0e5dff56..2815bfc6 100644 --- a/lexnlp/extract/en/contracts/predictors.py +++ b/lexnlp/extract/en/contracts/predictors.py @@ -10,6 +10,8 @@ # standard library +import logging +import pickle from typing import Iterable, Tuple, Union # third-party imports @@ -20,12 +22,57 @@ from lexnlp.ml.predictor import ProbabilityPredictor +LOGGER = logging.getLogger(__name__) + + class ProbabilityPredictorIsContract(ProbabilityPredictor): """ Uses a Scikit-Learn Pipeline to classify textual input as is/is-not a contract. 
""" - _DEFAULT_PIPELINE: str = 'pipeline/is-contract/0.1' + _DEFAULT_PIPELINE: str = 'pipeline/is-contract/0.2' + _DEFAULT_PIPELINE_ENV_VAR: str = 'LEXNLP_IS_CONTRACT_MODEL_TAG' + _LEGACY_FALLBACK_PIPELINE: str = 'pipeline/is-contract/0.1' + + @classmethod + def get_default_pipeline(cls): + try: + return super().get_default_pipeline() + except ( + FileNotFoundError, + EOFError, + ImportError, + AttributeError, + TypeError, + ValueError, + pickle.UnpicklingError, + ) as exc: + # Respect explicit env override failures and only auto-fallback for + # the default model tag. + if cls.get_default_pipeline_tag() != cls._DEFAULT_PIPELINE: + raise + + LOGGER.warning( + "Failed to load default contract model tag=%s; falling back to legacy tag=%s. error=%s", + cls._DEFAULT_PIPELINE, + cls._LEGACY_FALLBACK_PIPELINE, + exc, + exc_info=True, + ) + + try: + from cloudpickle import load + + from lexnlp.ml.catalog import get_path_from_catalog + + legacy_path = get_path_from_catalog(cls._LEGACY_FALLBACK_PIPELINE) + with legacy_path.open("rb") as legacy_file: + return load(legacy_file) + except Exception as fallback_error: + raise RuntimeError( + "Failed to load default contract model and legacy fallback model. " + "Run `python scripts/bootstrap_assets.py --contract-model` and retry." + ) from fallback_error def _sanity_check(self) -> None: """ @@ -72,12 +119,54 @@ class ProbabilityPredictorContractType(ProbabilityPredictor): """ _DEFAULT_PIPELINE: str = 'pipeline/contract-type/0.1' + _DEFAULT_PIPELINE_ENV_VAR: str = 'LEXNLP_CONTRACT_TYPE_MODEL_TAG' + _RUNTIME_FALLBACK_PIPELINE: str = 'pipeline/contract-type/0.2-runtime' def _sanity_check(self) -> None: """ Does nothing. No sanity check required. 
""" + @classmethod + def get_default_pipeline(cls): + try: + return super().get_default_pipeline() + except ( + FileNotFoundError, + EOFError, + ImportError, + AttributeError, + TypeError, + ValueError, + pickle.UnpicklingError, + ) as exc: + # Respect explicit env override failures and only auto-fallback for + # the legacy default model tag. + if cls.get_default_pipeline_tag() != cls._DEFAULT_PIPELINE: + raise + + LOGGER.warning( + "Failed to load legacy contract-type model tag=%s; falling back to runtime tag=%s. error=%s", + cls._DEFAULT_PIPELINE, + cls._RUNTIME_FALLBACK_PIPELINE, + exc, + exc_info=True, + ) + + from lexnlp.extract.en.contracts.runtime_model import ( + ensure_runtime_contract_type_model, + load_pipeline_for_tag, + ) + + ensure_runtime_contract_type_model(target_tag=cls._RUNTIME_FALLBACK_PIPELINE) + try: + return load_pipeline_for_tag(cls._RUNTIME_FALLBACK_PIPELINE) + except Exception as fallback_error: + raise RuntimeError( + "Failed to load legacy contract-type model and runtime fallback model. " + "Run `python scripts/bootstrap_assets.py --contract-type-model` and retry." + ) from fallback_error + def make_predictions( self, text: Union[str, Iterable[str]], diff --git a/lexnlp/extract/en/contracts/runtime_model.py b/lexnlp/extract/en/contracts/runtime_model.py new file mode 100644 index 00000000..3ee4960d --- /dev/null +++ b/lexnlp/extract/en/contracts/runtime_model.py @@ -0,0 +1,214 @@ +""" +Utilities to build and load a Python 3.11-compatible contract-type classifier. 
+""" + +__author__ = "ContraxSuite, LLC; LexPredict, LLC" +__copyright__ = "Copyright 2015-2021, ContraxSuite, LLC" +__license__ = "https://github.com/LexPredict/lexpredict-lexnlp/blob/2.3.0/LICENSE" +__version__ = "2.3.0" +__maintainer__ = "LexPredict, LLC" +__email__ = "support@contraxsuite.com" + + +# standard library +import logging +import pickle +import tarfile +from collections import defaultdict +from pathlib import Path +from typing import Dict, Iterable, List, Sequence, Tuple + +# third-party imports +from cloudpickle import load +from sklearn.feature_extraction.text import TfidfVectorizer +from sklearn.linear_model import LogisticRegression +from sklearn.pipeline import Pipeline + + +LOGGER = logging.getLogger(__name__) + +LEGACY_CONTRACT_TYPE_TAG = "pipeline/contract-type/0.1" +RUNTIME_CONTRACT_TYPE_TAG = "pipeline/contract-type/0.2-runtime" +CONTRACT_TYPE_CORPUS_TAG = "corpus/contract-types/0.1" +CONTRACT_TYPE_MODEL_FILENAME = "pipeline_contract_type_classifier.cloudpickle" + + +def ensure_tag_downloaded(tag: str) -> Path: + from lexnlp.ml.catalog import get_path_from_catalog + from lexnlp.ml.catalog.download import download_github_release + + try: + return get_path_from_catalog(tag) + except FileNotFoundError: + LOGGER.info("Catalog tag missing; downloading release tag=%s", tag) + download_github_release(tag, prompt_user=False) + return get_path_from_catalog(tag) + + +def load_pipeline_for_tag(tag: str) -> Pipeline: + path = ensure_tag_downloaded(tag) + with path.open("rb") as model_file: + return load(model_file) + + +def _extract_label(member_name: str) -> str: + # Expected shape: CONTRACT_TYPES/