From ace4c4a10bc176f2491c11a052f16cdb85606cfd Mon Sep 17 00:00:00 2001 From: frapercan Date: Tue, 21 Apr 2026 03:35:39 +0200 Subject: [PATCH 01/73] =?UTF-8?q?feat:=20consolidate=20refactor=20?= =?UTF-8?q?=E2=80=94=20protea-reranker-lab=20integration=20(phases=201-6)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Contract-first integration with protea-reranker-lab: PROTEA produces parquet datasets + manifest via export operation, consumes trained booster artifacts via RerankerModel + ArtifactStore. Zero runtime cross-imports — only protea_reranker_lab.contracts (pydantic-pure) is shared dev-time. - Phase 3: ArtifactStore abstraction (LocalFs default, MinIO opt-in via docker compose profiles: ["storage"] + [storage] extra). storage config under protea/infrastructure/storage/; PROTEA_STORAGE_* env overrides; tests/test_storage.py. - Phase 4: ExportResearchDatasetOperation + shared parquet_export utility refactored out of train_reranker; operation_catalog entry; routes via protea.jobs queue. - Phase 5: RerankerModel nullable artifact columns (artifact_uri, feature_schema_sha, embedding_config_id FK, ontology_snapshot_id FK, producer_version, producer_git_sha, spec_yaml); alembic migration c517e16da06b with named constraints; scripts/register_reranker.py CLI for run-dir → ORM row promotion. - Phase 6: predict_go_terms reranker integration — strict sha-equality validation at batch-worker level with reranker.schema_mismatch fallback (never crashes inference); reranking.py module (load_reranker, apply_reranker, infer_active_feature_families); 8 new tests covering coordinator validation + batch fallback paths. - Sphinx docs full pass + ADR-007-contract-first-lab-integration. - Thesis LaTeX: new Reranker Promotion Pipeline section in implementation chapter, RerankerModel subsection in data model. - Benchmark router + web UI pages (/benchmark, /experiments), Grafana visitor dashboard, CAFA evaluation pipeline updates, ablation tooling, embedding backend verification script. 
--- .env | 1 + .gitignore | 1 + .~lock.EXPERIMENTS.md# | 1 + .~lock.RERANKER.md# | 1 + EXPERIMENTAL_DESIGN.md | 197 +++ EXPERIMENTS.md | 502 ++++++ README.md | 38 +- RERANKER.md | 297 ++-- ...add_consensus_features_to_go_prediction.py | 37 + ...d_reranker_v6_features_to_go_prediction.py | 55 + ...1f4ec0e42_sequence_embedding_to_halfvec.py | 54 + ...7_add_embedding_config_display_metadata.py | 40 + ...6f7a8b9_add_taxonomy_to_query_set_entry.py | 43 + ...16da06b_reranker_model_artifact_columns.py | 81 + .../f7a004f5f2c7_add_visitor_events.py | 40 + apps/web/app/[locale]/benchmark/page.tsx | 563 ++++++ apps/web/app/[locale]/embeddings/page.tsx | 7 +- apps/web/app/[locale]/evaluation/page.tsx | 1 + apps/web/app/[locale]/jobs/[id]/page.tsx | 28 +- apps/web/app/[locale]/jobs/page.tsx | 40 +- apps/web/app/[locale]/layout.tsx | 7 +- apps/web/app/[locale]/page.tsx | 258 ++- apps/web/components/AnnotateForm.tsx | 122 +- apps/web/components/FloatingJobsWidget.tsx | 2 +- apps/web/components/NavLinks.tsx | 1 + apps/web/lib/api.ts | 150 +- apps/web/messages/de.json | 8 +- apps/web/messages/en.json | 15 +- apps/web/messages/es.json | 15 +- apps/web/messages/pt.json | 8 +- apps/web/messages/zh.json | 8 +- apps/web/public/thesis.pdf | Bin 464324 -> 616303 bytes deploy/grafana/dashboards/visitors.json | 364 ++++ .../provisioning/dashboards/dashboards.yml | 19 + .../provisioning/datasources/postgres.yml | 28 + docker-compose.monitoring.yml | 37 + docker-compose.yml | 23 + docs/requirements.txt | 1 + .../007-contract-first-lab-integration.rst | 112 ++ docs/source/adr/index.rst | 4 + docs/source/appendix/configuration.rst | 48 +- docs/source/appendix/howto_guides.rst | 121 ++ docs/source/appendix/index.rst | 29 + .../appendix/installation_and_quickstart.rst | 42 +- docs/source/appendix/reproduction_guide.rst | 526 ++++++ docs/source/architecture/data_model.rst | 64 +- docs/source/architecture/evaluation.rst | 210 +++ docs/source/architecture/index.rst | 70 +- docs/source/architecture/job_lifecycle.rst | 30 + docs/source/architecture/operations.rst | 586 ++++++- docs/source/architecture/system_overview.rst | 92 +- docs/source/conf.py | 20 +- docs/source/glossary.rst | 153 ++ docs/source/index.rst | 54 +- docs/source/introduction.rst | 264 ++- docs/source/reference/api.rst | 13 + docs/source/reference/core.rst | 133 +- docs/source/reference/infrastructure.rst | 89 +- docs/source/reference/workers.rst | 9 + docs/source/references.bib | 273 +++ docs/source/references.rst | 11 + docs/source/related_work.rst | 161 ++ docs/source/results.rst | 39 + poetry.lock | 512 +++++- protea/__init__.py | 1 + protea/api/app.py | 18 + protea/api/cache.py | 45 + protea/api/deps.py | 18 + protea/api/middleware/__init__.py | 1 + protea/api/middleware/visitor_counter.py | 188 ++ protea/api/routers/annotate.py | 35 +- protea/api/routers/annotations.py | 167 +- protea/api/routers/benchmark.py | 367 ++++ protea/api/routers/embeddings.py | 74 +- protea/api/routers/jobs.py | 91 +- protea/api/routers/proteins.py | 96 +- protea/api/routers/query_sets.py | 91 +- protea/api/routers/scoring.py | 338 ++-- protea/api/routers/showcase.py | 288 ++-- protea/config/benchmark.yaml | 47 + protea/config/system.yaml | 8 + protea/core/anc2vec_embeddings.py | 60 + protea/core/contracts/operation.py | 12 + protea/core/evaluation.py | 368 +++- protea/core/feature_engineering.py | 6 +- protea/core/knn_search.py | 94 +- protea/core/metrics.py | 15 +- protea/core/operation_catalog.py | 58 + protea/core/operations/compute_embeddings.py | 257 ++- 
.../operations/export_research_dataset.py | 176 ++ .../core/operations/fetch_uniprot_metadata.py | 17 + .../operations/generate_evaluation_set.py | 79 +- protea/core/operations/insert_proteins.py | 17 + .../core/operations/load_goa_annotations.py | 19 + .../core/operations/load_ontology_snapshot.py | 17 +- .../operations/load_quickgo_annotations.py | 13 + protea/core/operations/ping.py | 4 + protea/core/operations/predict_go_terms.py | 955 ++++++++++- protea/core/operations/run_cafa_evaluation.py | 465 +++-- protea/core/operations/train_reranker.py | 1528 +++++++++++++---- protea/core/parquet_export.py | 233 +++ protea/core/reranker.py | 336 +++- protea/core/reranking.py | 183 ++ protea/core/scoring.py | 107 ++ protea/infrastructure/benchmark_config.py | 90 + protea/infrastructure/logging.py | 1 + protea/infrastructure/orm/models/__init__.py | 1 + .../orm/models/annotation/go_term.py | 2 +- .../orm/models/embedding/embedding_config.py | 10 +- .../orm/models/embedding/go_prediction.py | 36 + .../orm/models/embedding/reranker_model.py | 48 +- .../orm/models/embedding/scoring_config.py | 30 +- .../models/embedding/sequence_embedding.py | 4 +- .../orm/models/query/query_set.py | 2 + .../orm/models/visitor_event.py | 44 + protea/infrastructure/queue/consumer.py | 185 +- protea/infrastructure/settings.py | 71 +- protea/infrastructure/storage/__init__.py | 56 + protea/infrastructure/storage/factory.py | 59 + protea/infrastructure/storage/local.py | 40 + protea/infrastructure/storage/minio_store.py | 79 + protea/workers/base_worker.py | 2 +- protea/workers/stale_job_reaper.py | 64 +- protea_mcp/__init__.py | 4 + protea_mcp/__main__.py | 23 + protea_mcp/audit.py | 65 + protea_mcp/auth.py | 27 + protea_mcp/bootstrap.py | 54 + protea_mcp/db.py | 28 + protea_mcp/prompts/__init__.py | 0 protea_mcp/prompts/debugging.py | 23 + protea_mcp/resources/__init__.py | 0 protea_mcp/resources/protea_resources.py | 41 + protea_mcp/schemas.py | 66 + protea_mcp/server.py | 22 + protea_mcp/tests/__init__.py | 0 protea_mcp/tests/test_tools.py | 59 + protea_mcp/tools/__init__.py | 0 protea_mcp/tools/admin.py | 148 ++ protea_mcp/tools/notes.py | 160 ++ protea_mcp/tools/operations.py | 104 ++ protea_mcp/tools/prediction_sets.py | 163 ++ protea_mcp/tools/queues.py | 110 ++ pyproject.toml | 9 +- scripts/compute_ia_for_snapshot.py | 210 +++ scripts/dump_reranker_dataset.py | 102 ++ scripts/feed_evals_phaseA.py | 140 ++ scripts/hybrid_picker_eval.py | 152 ++ scripts/manage.sh | 24 +- scripts/overnight_v6.py | 247 +++ scripts/overnight_v6_retry.py | 236 +++ scripts/overnight_v7.py | 190 ++ scripts/overnight_v8.py | 210 +++ scripts/profile_predict_batch.py | 207 +++ scripts/register_reranker.py | 208 +++ scripts/run_ablation_evaluations.py | 167 ++ scripts/run_ablation_predictions.py | 243 +++ scripts/verify_embedding_backends.py | 595 +++++++ scripts/worker.py | 41 +- tests/conftest.py | 31 +- tests/test_admin_router.py | 4 + tests/test_annotate_router.py | 50 +- tests/test_annotations_router.py | 131 +- tests/test_api.py | 82 +- tests/test_api_annotations.py | 138 +- tests/test_api_query_sets.py | 121 +- tests/test_base_worker.py | 165 +- tests/test_benchmark_router.py | 295 ++++ tests/test_compute_embeddings.py | 327 +++- tests/test_core.py | 34 +- tests/test_embeddings_router.py | 220 ++- tests/test_evaluation.py | 197 ++- tests/test_feature_engineering.py | 33 +- tests/test_fetch_uniprot_metadata.py | 69 +- tests/test_generate_evaluation_set.py | 65 +- tests/test_infrastructure.py | 81 +- tests/test_insert_proteins.py | 
52 +- tests/test_integration.py | 176 +- tests/test_load_goa_annotations.py | 196 ++- tests/test_load_ontology_snapshot.py | 59 +- tests/test_load_quickgo_annotations.py | 205 ++- tests/test_logging.py | 2 + tests/test_metrics.py | 25 +- tests/test_predict_go_terms.py | 409 ++++- tests/test_proteins_router.py | 61 +- tests/test_queue.py | 167 +- tests/test_real_models.py | 14 +- tests/test_reranker.py | 1 + tests/test_run_cafa_evaluation.py | 199 ++- tests/test_scoring.py | 8 + tests/test_scoring_router.py | 231 ++- tests/test_showcase_router.py | 327 ++-- tests/test_storage.py | 96 ++ tests/test_support_maintenance_routers.py | 21 +- tests/test_train_reranker.py | 181 +- 195 files changed, 20152 insertions(+), 2873 deletions(-) create mode 100644 .env create mode 100644 .~lock.EXPERIMENTS.md# create mode 100644 .~lock.RERANKER.md# create mode 100644 EXPERIMENTAL_DESIGN.md create mode 100644 EXPERIMENTS.md create mode 100644 alembic/versions/651358a5a2c8_add_consensus_features_to_go_prediction.py create mode 100644 alembic/versions/7a2c9e1d0b33_add_reranker_v6_features_to_go_prediction.py create mode 100644 alembic/versions/b1a1f4ec0e42_sequence_embedding_to_halfvec.py create mode 100644 alembic/versions/b2c3d4e5f6a7_add_embedding_config_display_metadata.py create mode 100644 alembic/versions/c4d5e6f7a8b9_add_taxonomy_to_query_set_entry.py create mode 100644 alembic/versions/c517e16da06b_reranker_model_artifact_columns.py create mode 100644 alembic/versions/f7a004f5f2c7_add_visitor_events.py create mode 100644 apps/web/app/[locale]/benchmark/page.tsx create mode 100644 deploy/grafana/dashboards/visitors.json create mode 100644 deploy/grafana/provisioning/dashboards/dashboards.yml create mode 100644 deploy/grafana/provisioning/datasources/postgres.yml create mode 100644 docker-compose.monitoring.yml create mode 100644 docs/source/adr/007-contract-first-lab-integration.rst create mode 100644 docs/source/appendix/reproduction_guide.rst create mode 100644 docs/source/glossary.rst create mode 100644 docs/source/references.bib create mode 100644 docs/source/references.rst create mode 100644 docs/source/related_work.rst create mode 100644 protea/api/cache.py create mode 100644 protea/api/middleware/__init__.py create mode 100644 protea/api/middleware/visitor_counter.py create mode 100644 protea/api/routers/benchmark.py create mode 100644 protea/config/benchmark.yaml create mode 100644 protea/core/anc2vec_embeddings.py create mode 100644 protea/core/operation_catalog.py create mode 100644 protea/core/operations/export_research_dataset.py create mode 100644 protea/core/parquet_export.py create mode 100644 protea/core/reranking.py create mode 100644 protea/infrastructure/benchmark_config.py create mode 100644 protea/infrastructure/orm/models/visitor_event.py create mode 100644 protea/infrastructure/storage/__init__.py create mode 100644 protea/infrastructure/storage/factory.py create mode 100644 protea/infrastructure/storage/local.py create mode 100644 protea/infrastructure/storage/minio_store.py create mode 100644 protea_mcp/__init__.py create mode 100644 protea_mcp/__main__.py create mode 100644 protea_mcp/audit.py create mode 100644 protea_mcp/auth.py create mode 100644 protea_mcp/bootstrap.py create mode 100644 protea_mcp/db.py create mode 100644 protea_mcp/prompts/__init__.py create mode 100644 protea_mcp/prompts/debugging.py create mode 100644 protea_mcp/resources/__init__.py create mode 100644 protea_mcp/resources/protea_resources.py create mode 100644 protea_mcp/schemas.py create mode 100644 
protea_mcp/server.py create mode 100644 protea_mcp/tests/__init__.py create mode 100644 protea_mcp/tests/test_tools.py create mode 100644 protea_mcp/tools/__init__.py create mode 100644 protea_mcp/tools/admin.py create mode 100644 protea_mcp/tools/notes.py create mode 100644 protea_mcp/tools/operations.py create mode 100644 protea_mcp/tools/prediction_sets.py create mode 100644 protea_mcp/tools/queues.py create mode 100644 scripts/compute_ia_for_snapshot.py create mode 100644 scripts/dump_reranker_dataset.py create mode 100644 scripts/feed_evals_phaseA.py create mode 100644 scripts/hybrid_picker_eval.py create mode 100644 scripts/overnight_v6.py create mode 100644 scripts/overnight_v6_retry.py create mode 100644 scripts/overnight_v7.py create mode 100644 scripts/overnight_v8.py create mode 100644 scripts/profile_predict_batch.py create mode 100644 scripts/register_reranker.py create mode 100644 scripts/run_ablation_evaluations.py create mode 100644 scripts/run_ablation_predictions.py create mode 100644 scripts/verify_embedding_backends.py create mode 100644 tests/test_benchmark_router.py create mode 100644 tests/test_storage.py diff --git a/.env b/.env new file mode 100644 index 0000000..1a96347 --- /dev/null +++ b/.env @@ -0,0 +1 @@ +export PROTEA_ADMIN_TOKEN="protea-admin" diff --git a/.gitignore b/.gitignore index 818082d..803317b 100644 --- a/.gitignore +++ b/.gitignore @@ -35,6 +35,7 @@ CLAUDE.md # Local data static/ storage/ +!protea/infrastructure/storage/ # Large embedding caches and test artifacts data/ref_cache/ diff --git a/.~lock.EXPERIMENTS.md# b/.~lock.EXPERIMENTS.md# new file mode 100644 index 0000000..67ffcb0 --- /dev/null +++ b/.~lock.EXPERIMENTS.md# @@ -0,0 +1 @@ +,frapercan,bioxaxi,21.03.2026 13:10,/home/frapercan/snap/onlyoffice-desktopeditors/1067/.local/share/onlyoffice; \ No newline at end of file diff --git a/.~lock.RERANKER.md# b/.~lock.RERANKER.md# new file mode 100644 index 0000000..75e7420 --- /dev/null +++ b/.~lock.RERANKER.md# @@ -0,0 +1 @@ +,frapercan,bioxaxi,17.03.2026 17:01,/home/frapercan/snap/onlyoffice-desktopeditors/1067/.local/share/onlyoffice; \ No newline at end of file diff --git a/EXPERIMENTAL_DESIGN.md b/EXPERIMENTAL_DESIGN.md new file mode 100644 index 0000000..ca6ba18 --- /dev/null +++ b/EXPERIMENTAL_DESIGN.md @@ -0,0 +1,197 @@ +# PROTEA — Experimental Design + +**Version**: 1.0 — 2026-04-10 +**Status**: Active +**Scope**: Protein language model (PLM) benchmark for GO term prediction via KNN + learned reranking + +> This document is **prospective**: it formalises the protocol, hypotheses, and execution plan for the extended PLM comparison. Retrospective results (finished experiments, ablations, external tool comparisons) live in `EXPERIMENTS.md`. The reranker design rationale lives in `RERANKER.md`. + +--- + +## 1. Motivation + +The preliminary comparison in `EXPERIMENTS.md` (ESMC-300M vs ProstT5-XL) **confounds two independent variables**: model family and parameter count. ESMC-300M is a ~300M-parameter BERT-like encoder; ProstT5-XL is a ~3B-parameter T5 encoder with structural fine-tuning. Any observed difference in downstream Fmax cannot be attributed to either axis unambiguously. + +This document defines the extended benchmark that disentangles those factors and integrates additional PLMs (Ankh, ESM2, ESMC-600M, ProtT5-XL) into a single, statistically comparable grid under an identical downstream pipeline. + +--- + +## 2. 
Research questions + +| ID | Question | +|---|---| +| **RQ1** | At matched parameter count, does a BERT-like encoder (ESM2, ESMC) outperform a T5 encoder (ProtT5, Ankh) for GO term transfer via KNN? | +| **RQ2** | Holding model family fixed, how does Fmax scale with parameter count? Where does the curve saturate? | +| **RQ3** | Does structure-aware fine-tuning (ProstT5) yield a measurable Fmax improvement over its pure-sequence parent (ProtT5-XL) at identical size? | +| **RQ4** | Does the learned reranker compensate for weaker embeddings by placing more weight on alignment and taxonomy features? Is there a systematic inverse relationship between embedding quality and reranker feature-importance on these compensatory signals? | + +--- + +## 3. Hypotheses (pre-registered) + +| # | Hypothesis | Primary test | +|---|---|---| +| **H1** | At small scale (~300–650M), family effect dominates scale effect (ΔFmax across families ≥ ΔFmax across sizes within a family) | Wilcoxon signed-rank across 9-cell Fmax vectors, pairwise within the small tier | +| **H2** | Scale gains within a single family saturate in the 1–3B range | Monotonicity of Fmax across {ESM2-650M, ESM2-3B} and {Ankh-base, Ankh-large, ProtT5-XL} | +| **H3** | Structure awareness provides a positive but modest gain (+1–3 Fmax points averaged across cells) | Pairwise matched test ProtT5-XL vs ProstT5-XL (same backbone, same size, only fine-tuning differs) | +| **H4** | Reranker gain-based importance on `{alignment_*, similarity_*, taxonomic_*}` features is inversely correlated with the baseline Fmax of the underlying embedding | Linear regression across the 8 models: `weight_on_compensatory` ~ `baseline_Fmax` | + +H1–H3 are confirmatory; H4 is exploratory and carries forward the **F2 finding** from the ESMC vs ProstT5 analysis in `project_reranker_benchmark.md`. + +--- + +## 4. Model matrix + +**8 models total** (2 already computed, 6 new). + +| # | Model | Backbone | Params | PROTEA backend | Status | +|---|---|---|---|---|---| +| 1 | **ESMC-300M** | ESM3c (EvolutionaryScale) | ~300M | `esm3c` | ✓ computed; reranker v4 in progress (`48c91381`) | +| 2 | **ESMC-600M** | ESM3c (EvolutionaryScale) | ~600M | `esm3c` | new | +| 3 | **ESM2-650M** | ESM2 `esm2_t33_650M_UR50D` (Meta) | ~650M | `esm` | new | +| 4 | **ESM2-3B** | ESM2 `esm2_t36_3B_UR50D` (Meta) | ~3B | `esm` | new | +| 5 | **Ankh-base** | Ankh `ElnaggarLab/ankh-base` | ~450M | `ankh` | new | +| 6 | **Ankh-large** | Ankh `ElnaggarLab/ankh-large` | ~1.9B | `ankh` | new | +| 7 | **ProtT5-XL** | ProtT5 `prot_t5_xl_uniref50` (Rostlab) | ~3B | `t5` | new | +| 8 | **ProstT5-XL** | ProstT5 structure-fine-tuned (Rostlab) | ~3B | `t5` | ✓ computed; reranker v4 in progress (`e923ac70`) | + +**Discarded**: ESM2-15B (prohibitive embedding cost over 527k sequences; no matched-size T5 counterpart → breaks symmetry of the grid). 
+ +### Explanatory grid (for RQ1 / RQ2 / RQ3) + +| Scale | BERT-like encoder | T5 encoder (sequence-only) | T5 encoder (structure-aware) | +|---|---|---|---| +| **Small (~300–650M)** | ESMC-300M, ESMC-600M, ESM2-650M | Ankh-base (~450M) | — | +| **Medium (~1–2B)** | — | Ankh-large (~1.9B) | — | +| **Large (~3B)** | ESM2-3B | ProtT5-XL | ProstT5-XL | + +### Planned pairwise comparisons + +| Pair | Isolates | RQ | +|---|---|---| +| ESMC-300M ↔ Ankh-base | architecture (BERT vs T5), ~matched size | RQ1 | +| ESM2-650M ↔ Ankh-base | architecture, ~matched size | RQ1 | +| ESMC-300M ↔ ESMC-600M | scale, family fixed | RQ2 | +| ESM2-650M ↔ ESM2-3B | scale, family fixed | RQ2 | +| Ankh-base ↔ Ankh-large ↔ ProtT5-XL | scale ladder within T5 encoder family | RQ2 | +| **ProtT5-XL ↔ ProstT5-XL** | structure fine-tuning (cleanest test) | **RQ3** | + +--- + +## 5. Data and splits (fixed across all 8 runs) + +Identical to the ESMC/ProstT5 preliminary experiments in `EXPERIMENTS.md` to preserve backward comparability with established findings. + +| Item | Value | +|---|---| +| Reference annotation sets | GOA releases 160 → 220 (13 temporal splits for reranker training) | +| Evaluation set | `42b34e79-6fe9-4fa0-b718-02f43a1e3192` (GOA 220 → 229 delta) | +| Evaluation size | 20,281 proteins (NK=2,831; LK=3,410; PK=15,313) | +| Ontology snapshot | `947bdff6-d17c-4ca3-a41a-bc8fb4d74b7a` (GO release 2026-01-23) | +| IA file | `data/benchmarks/IA_cafa6.tsv` (CAFA6 information accretion) | + +--- + +## 6. Pipeline protocol — pinned hyperparameters + +Every model is put through the same three-stage pipeline with **identical hyperparameters**. No per-model tuning. Fair comparison requires this invariance. + +### 6.1 Embeddings — `compute_embeddings` +- Pooling: `mean` over residue representations +- Precision: fp32 at storage (cast to fp16 at KNN load time via `_REF_CACHE`) +- Storage: pgvector `VECTOR(dim)` per `(sequence, config, chunk)` +- Full reference set (~527k sequences) + evaluation set query embeddings + +### 6.2 KNN retrieval — `predict_go_terms` +- `k = 5` +- `metric = cosine` +- `backend = faiss`, `faiss_index_type = IVFFlat`, `nlist = 256`, `nprobe = 32` +- `aspect_separated_knn = true` +- `compute_alignments = true` (NW + SW via parasail/BLOSUM62) +- `compute_taxonomy = true` (NCBI taxonomy LCA via ete3) + +### 6.3 Reranker training — `train_reranker_auto` (v4 budget) +- `num_boost_round = 5000` +- `early_stopping_rounds = 100` +- `val_fraction = 0.2` +- `neg_pos_ratio = 10` +- `train_versions = [160, 165, 170, 175, 180, 185, 190, 195, 200, 205, 211, 215, 220]` (13 splits) +- `test_versions = [229]` +- `compute_alignments = true`, `compute_taxonomy = true` +- `ia_file = data/benchmarks/IA_cafa6.tsv` (IA-weighted sample weighting: `sample_weight = IA(go_term)`) +- **3 models per embedding (NK / LK / PK)** — per-category, not per-aspect (justified in `RERANKER.md` §6.3) +- Objective: **binary cross-entropy (LightGBM `objective=binary`)**, early stopping on validation AUC. IA weights enter through `sample_weight`, not through the objective. See `RERANKER.md` §6.1 for rationale and the known limitation that a pairwise/listwise rank loss is future work. 
+- Name convention: `lgbm_v4_converged_-{nk,lk,pk}` + +### 6.4 Evaluation — `run_cafa_evaluation` +- Library: `cafaeval` (integrated via the `run_cafa_evaluation` operation) +- Metric: **Fmax with IA weighting**, computed per (tier × aspect) cell → 9-dimensional output vector per model×pipeline-stage +- Pipeline stages reported: `baseline` (embedding only), `alignment_weighted` (best heuristic from Exp 3), `reranker` (v4 LightGBM) + +--- + +## 7. Statistical protocol + +Pre-registered to prevent post-hoc test-shopping. + +| Aspect | Method | +|---|---| +| **Primary outcome** | 9-cell Fmax vector per (model, pipeline-stage) | +| **Pairwise test** | Wilcoxon signed-rank over the 9 matched cells | +| **Multiple comparisons** | Holm–Bonferroni correction across the planned comparisons in §4 (6 RQ1/RQ2/RQ3 tests) | +| **Effect size** | Mean Fmax delta ± 95% bootstrap CI (1000 resamples over cells) | +| **H4 regression** | For each (model, tier): `weight_compensatory = Σ importance(feature)` over features in `{alignment_score_*, similarity_*, identity_*, gaps_pct_*, alignment_length_*, taxonomic_*}`. Fit `weight_compensatory ~ baseline_Fmax` across the 8 models via OLS; report slope, p-value, R² | +| **Reporting convention** | All numbers from `cafaeval` with IA weighting. **Never** use the internal `test_evaluation` field from `train_reranker_auto` for thesis claims — it is unweighted and biased (see `project_reranker_benchmark.md`) | + +--- + +## 8. Execution plan + +Ordered so each stage produces usable partial results; no stage blocks on the next. + +| Step | Action | Depends on | Compute estimate | +|---|---|---|---| +| 1 | Wait for v4 rerankers (ESMC-300M, ProstT5-XL) to finish | running | ~4h total (sequential) | +| 2 | Create 6 `EmbeddingConfig` rows with pinned pooling/precision | — | minutes | +| 3 | Run `compute_embeddings` for the 6 new models over ref+eval sets | step 2 | 2–10h per model; ~1.5–2 days total sequential | +| 4 | Run `predict_go_terms` (with alignments + taxonomy) for the 6 new models | step 3 | 1–2h per model | +| 5 | Run `train_reranker_auto` v4 for the 6 new models in `protea.training` queue | step 4 | 2–4h per model; ~1 day total sequential | +| 6 | Run `run_cafa_evaluation` for all 8 models × 3 stages = 24 evals | step 5 + existing | ~10 min per eval; ~4h total | +| 7 | Extract feature importance from all 24 (model × tier) rerankers | step 5 | minutes (script) | +| 8 | Apply the statistical protocol in §7 to the aggregated results | steps 6–7 | — | +| 9 | Update `EXPERIMENTS.md` with the per-model result tables | step 8 | — | +| 10 | Compile results into thesis chapter / appendix | step 9 | — | + +**Total wall-clock (pessimistic, fully serial):** ~3–4 days of compute. Can be compressed with overlapping embedding/training workers if GPU capacity allows. + +--- + +## 9. Deliverables + +- `EmbeddingConfig` rows for the 6 new models, committed to the DB. +- Per-model entries in `EXPERIMENTS.md` mirroring the existing table format (Exp 1 / Exp 3 / Exp 4+ rows). +- **Master results table**: 8 rows × (baseline Fmax | alignment_weighted Fmax | reranker Fmax) × 9 cells each. +- **Feature importance heatmap**: 24 (model × tier) rerankers × top-N features, colour-coded by gain. +- Statistical test report (Wilcoxon p-values + effect sizes + CIs) as a standalone markdown section. +- Thesis chapter / appendix formalising the grid as evidence for RQ1–RQ4. + +--- + +## 10. Known limitations (honest reporting) + +1. 
**Not training-data matched.** Each PLM was pretrained on different corpora (UniRef50 subsets at different points in time, sometimes Big Fantastic Database for ProtT5, etc.). Perfect controlled comparison is impossible without re-pretraining, which is out of scope. +2. **Architecture is not a clean isolated variable.** T5 encoders and BERT-style encoders differ in depth, attention masking, objective (span corruption vs MLM), and training data. RQ1's conclusion will be **correlational**, not causal. +3. **Scale is coarse.** Three tiers (~300M / ~1.5B / ~3B) is the maximum granularity this compute budget allows. Smooth scaling curves are out of reach. +4. **Ankh backend.** Ankh is exposed in PROTEA as a **dedicated backend** (`model_backend = "ankh"`), not as an alias of `t5`. Internally it reuses the T5 batched pipeline via `_embed_t5(..., use_aa2fold=False)` but uses `AutoTokenizer` instead of `T5Tokenizer` and never injects the `` prefix — ensuring clean separation in the benchmark tables. The distinction matters for RQ1: Ankh results are reported under their own family row, not merged into "T5 encoder". +5. **ESMC-600M availability.** EvolutionaryScale's public ESMC release must be confirmed to include the 600M variant at time of execution. If unavailable at that scale, substitute with the closest public ESMC size and document the deviation in step 2. +6. **No seed-variance analysis.** LightGBM training (with fixed seed), KNN retrieval, and embeddings are all deterministic under PROTEA's default config. Variance across re-runs for the same config should be zero; we do not budget compute for confirming this. +7. **Single evaluation delta.** Only the GOA 220 → 229 delta is used. A multi-delta sensitivity analysis (e.g. 215 → 229, 220 → 225) is a candidate for future work but not planned here. +8. **ProstT5 inference requires 3Di tokens**, which PROTEA currently provides via sequence-only input using the AA2fold branch (`use_aa2fold = "prostt5" in model_name.lower()` at `compute_embeddings.py:715`). This means PROTEA's ProstT5 embeddings are generated **without** real 3Di tokens from a structure; the model internally translates sequence to predicted 3Di. This is the setup the Rostlab release supports but is distinct from "true structure-aware" inference with Foldseek-derived 3Di tokens. Document this explicitly in the thesis when discussing RQ3. + +--- + +## 11. Change log + +| Date | Change | +|---|---| +| 2026-04-10 | Initial draft: 8-model matrix, RQ1–RQ4, hypotheses H1–H4, pinned pipeline, statistical protocol. ESMC-600M confirmed. ESM2-15B discarded. 
|
diff --git a/EXPERIMENTS.md b/EXPERIMENTS.md
new file mode 100644
index 0000000..c320aa4
--- /dev/null
+++ b/EXPERIMENTS.md
@@ -0,0 +1,502 @@
+# PROTEA Experimentation Plan
+
+## Infrastructure
+
+- **Annotation sets:** 15 GOA snapshots (160–229)
+- **Ontology:** releases/2026-01-23 + IA file (IA_cafa6.tsv)
+- **Embeddings:** 527K ESM-C 300M (dim=960)
+- **Evaluation set:** GOA 220→229 (NK: 2831, LK: 3410, PK: 15313 proteins)
+- **Query set:** `af6bf007` (GOA_220_229, ~20K proteins)
+- **Evaluator:** cafaeval with IA weighting (information accretion)
+
+**Reference IDs:**
+- Embedding config: `8e7f78c3-900f-452f-858e-63ca14d103e1`
+- Annotation set (GOA 220): `c7bdb296-a86a-4141-b5e5-53eb77363ad0`
+- Ontology snapshot: `947bdff6-d17c-4ca3-a41a-bc8fb4d74b7a`
+- Evaluation set (220→229): `42b34e79-6fe9-4fa0-b718-02f43a1e3192`
+
+---
+
+## Exp 1 — Baseline KNN: effect of k
+
+**Scoring:** baseline (`1 - distance/2`), `aspect_separated_knn=true`
+
+| k | NK-BPO | NK-MFO | NK-CCO | LK-BPO | LK-MFO | LK-CCO | PK-BPO | PK-MFO | PK-CCO | Status |
+|---|--------|--------|--------|--------|--------|--------|--------|--------|--------|--------|
+| **5** | 0.412 | 0.590 | 0.668 | 0.467 | 0.558 | 0.676 | 0.187 | 0.278 | 0.325 | ✅ `d7adeb1e` |
+| 10 | 0.400 | 0.574 | 0.656 | 0.458 | 0.537 | 0.663 | 0.177 | 0.272 | 0.317 | ✅ `30bf6187` |
+| 20 | 0.396 | 0.564 | 0.649 | 0.454 | 0.528 | 0.654 | 0.173 | 0.269 | 0.313 | ✅ `a4442444` |
+| 50 | 0.396 | 0.555 | 0.646 | 0.452 | 0.523 | 0.651 | 0.173 | 0.269 | 0.312 | ✅ `d41b8d05` |
+
+**Conclusion:** k=5 is optimal in every category. More neighbours means more noise, with monotonic degradation.
+
+---
+
+## Exp 2 — Effect of `aspect_separated_knn`
+
+With k=5, compare a unified index against per-aspect (BPO/MFO/CCO) indices.
+
+| Variant | NK-BPO | NK-MFO | NK-CCO | LK-BPO | LK-MFO | LK-CCO | PK-BPO | PK-MFO | PK-CCO | Status |
+|----------|--------|--------|--------|--------|--------|--------|--------|--------|--------|--------|
+| aspect_sep=true | 0.412 | 0.590 | 0.668 | 0.467 | 0.558 | 0.676 | 0.187 | 0.278 | 0.325 | ✅ `d7adeb1e` |
+| aspect_sep=false | 0.410 | 0.595 | 0.666 | 0.471 | 0.569 | 0.675 | 0.188 | 0.279 | 0.325 | ✅ `bee8fbe7` |
+
+**Conclusion:** Minimal differences. aspect_sep=false slightly improves MFO (+0.005 NK, +0.011 LK); aspect_sep=true slightly improves BPO. No clear gain, so keep aspect_sep=true for uniform aspect coverage.
+
+---
+
+## Exp 3 — Heuristic scoring
+
+**Requirement:** prediction set with `compute_alignments=true, compute_taxonomy=true` (k=5, aspect_sep = best of Exp 2).
+
+Uses the system's 5 ScoringConfig presets. Scoring is applied at evaluation time (no re-prediction is needed for each config); a sketch of the linear combination follows below.
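+
+The baseline converts cosine distance to a score with `1 - distance/2`; the linear presets then add alignment terms on top of the embedding score. A minimal sketch of that combination, assuming hypothetical argument names (illustrative only, not the actual PROTEA scoring module):
+
+```python
+# Illustrative only: linear ScoringConfig combination (assumed signature).
+def linear_score(cosine_distance: float,
+                 nw_identity: float | None,
+                 sw_identity: float | None,
+                 weights: dict[str, float]) -> float:
+    """Combine the embedding score with optional NW/SW alignment signals."""
+    emb_score = 1.0 - cosine_distance / 2.0   # baseline: distance in [0, 2] -> score in [0, 1]
+    score = weights.get("emb", 1.0) * emb_score
+    if nw_identity is not None:
+        score += weights.get("nw", 0.0) * nw_identity
+    if sw_identity is not None:
+        score += weights.get("sw", 0.0) * sw_identity
+    return score
+
+# alignment_weighted preset: emb=0.5, nw=0.3, sw=0.2
+print(linear_score(0.35, 0.62, 0.70, {"emb": 0.5, "nw": 0.3, "sw": 0.2}))
+```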
+
+| Config | Formula | Weights | NK-BPO | NK-MFO | NK-CCO | LK-BPO | LK-MFO | LK-CCO | PK-BPO | PK-MFO | PK-CCO | Status |
+|--------|---------|-------|--------|--------|--------|--------|--------|--------|--------|--------|--------|--------|
+| **embedding_only** | linear | emb=1.0 | 0.412 | 0.590 | 0.668 | 0.467 | 0.558 | 0.675 | 0.187 | 0.278 | 0.325 | ✅ |
+| alignment_weighted | linear | emb=0.5, nw=0.3, sw=0.2 | **0.428** | **0.611** | **0.683** | **0.500** | **0.598** | **0.699** | **0.201** | **0.285** | **0.337** | ✅ |
+| evidence_primary | linear | emb=0.2, evi=0.8 | 0.362 | 0.558 | 0.638 | 0.412 | 0.540 | 0.642 | 0.165 | 0.268 | 0.308 | ✅ |
+| embedding_plus_evidence | evidence_weighted | emb=1.0, evi=1.0 | 0.352 | 0.531 | 0.618 | 0.387 | 0.517 | 0.626 | 0.162 | 0.250 | 0.300 | ✅ |
+| composite | evidence_weighted | emb=0.4, nw=0.2, sw=0.1, evi=0.2, tax=0.1 | 0.364 | 0.560 | 0.639 | 0.412 | 0.542 | 0.642 | 0.167 | 0.267 | 0.307 | ✅ |
+
+**Prediction set:** `a818b653` (k=5, aspect_sep=true, alignments+taxonomy+reranker_features)
+
+**Conclusion:** `alignment_weighted` is the best scoring across all categories and aspects. It improves on the baseline (embedding_only) by +1.5% to +4% Fmax. The configs that use evidence_weight (evidence_primary, composite, embedding_plus_evidence) **hurt** the baseline: the evidence signal degrades the ranking under CAFA-eval with IA weighting.
+
+---
+
+## Exp 4 — LightGBM re-ranker
+
+**Requirement:** prediction set with `compute_alignments=true, compute_taxonomy=true, compute_reranker_features=true`.
+
+**Training:** `train_reranker_auto` with 12 temporal splits (GOA 160→165 through 215→220), test 220→229.
+9 models (NK/LK/PK × BPO/MFO/CCO), binary CE, full features (alignments + taxonomy + reranker_features).
+
+### 4a. No balancing (job `188eb26a`)
+
+| Cat-Asp | AUC | Iter | Observation |
+|---------|-----|------|-------------|
+| NK-BPO | 0.771 | 1 | early stop, few positives (0.17%) |
+| NK-MFO | 0.938 | 300 | good model |
+| NK-CCO | 0.911 | 266 | good model |
+| LK-BPO | 0.770 | 1 | early stop |
+| LK-MFO | 0.930 | 300 | good model |
+| LK-CCO | 0.872 | 300 | good model |
+| PK-BPO | 0.779 | 1 | early stop |
+| PK-MFO | 0.831 | 1 | early stop |
+| PK-CCO | 0.767 | 1 | early stop |
+
+6 of the 9 models fail to learn (early stop at iter=1) because of the extreme class imbalance.
+
+### 4b. With balancing `neg_pos_ratio=10` (job `a96eed71`)
+
+| Cat-Asp | AUC | Iter | Δ AUC vs 4a |
+|---------|-----|------|-------------|
+| NK-BPO | 0.898 | 4 | +0.127 |
+| NK-MFO | 0.922 | 9 | -0.016 |
+| NK-CCO | 0.881 | 4 | -0.030 |
+| LK-BPO | 0.893 | 4 | +0.124 |
+| LK-MFO | 0.925 | 11 | -0.005 |
+| LK-CCO | 0.854 | 3 | -0.018 |
+| PK-BPO | 0.796 | 2 | +0.017 |
+| PK-MFO | 0.849 | 3 | +0.018 |
+| PK-CCO | 0.781 | 2 | +0.014 |
+
+All models now learn (the balancing step is sketched below). BPO gains ~12 AUC points. MFO/CCO drop slightly (less training data).
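+
+A minimal sketch of the `neg_pos_ratio` balancing applied in 4b, assuming plain numpy feature/label arrays (variable names are illustrative, not the actual `train_reranker` code):
+
+```python
+import numpy as np
+
+def downsample_negatives(X: np.ndarray, y: np.ndarray,
+                         neg_pos_ratio: int = 10,
+                         seed: int = 42) -> tuple[np.ndarray, np.ndarray]:
+    """Keep every positive row and at most neg_pos_ratio negatives per positive."""
+    rng = np.random.default_rng(seed)
+    pos_idx = np.flatnonzero(y == 1)
+    neg_idx = np.flatnonzero(y == 0)
+    n_keep = min(len(neg_idx), neg_pos_ratio * len(pos_idx))
+    keep = np.concatenate([pos_idx, rng.choice(neg_idx, size=n_keep, replace=False)])
+    keep.sort()
+    return X[keep], y[keep]
+```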
+
+### CAFA-eval results (v1)
+
+| Method | NK-BPO | NK-MFO | NK-CCO | LK-BPO | LK-MFO | LK-CCO | PK-BPO | PK-MFO | PK-CCO |
+|--------|--------|--------|--------|--------|--------|--------|--------|--------|--------|
+| baseline (emb only) | 0.412 | 0.590 | 0.668 | 0.467 | 0.558 | 0.675 | 0.187 | 0.278 | 0.325 |
+| **alignment_weighted** | **0.428** | **0.611** | 0.683 | **0.500** | **0.598** | 0.699 | 0.201 | 0.285 | 0.337 |
+| reranker v1 (no balancing) | 0.384 | 0.584 | **0.695** | 0.447 | 0.482 | **0.713** | 0.201 | 0.284 | 0.335 |
+| reranker v1 (balanced) | 0.408 | 0.577 | 0.687 | 0.478 | 0.506 | 0.711 | 0.201 | **0.298** | 0.332 |
+
+**v1 conclusions:**
+- Balancing fixes BPO (+0.024 NK, +0.031 LK vs no balancing) but still does not reach the heuristic
+- Both rerankers improve **CCO** over the baseline (+2-4%)
+- Both rerankers **degrade MFO** relative to the heuristic (-3 to -9%)
+- The balanced reranker stands out on **PK-MFO** (0.298, best of all methods)
+- `alignment_weighted` remains the best overall approach: it wins in 6 of 9 cells
+
+---
+
+## Exp 5 — Re-ranker v2 (per-category with IA weighting)
+
+**Changes vs v1:**
+- 3 per-category models (NK, LK, PK) instead of 9 per-aspect models
+- `is_unbalance` removed (avoids double compensation with `neg_pos_ratio`)
+- `learning_rate`: 0.05 → 0.01
+- `num_boost_round`: 300 → 1000 (with `early_stopping_rounds`: 50)
+- IA values as `sample_weight` during training (rare terms weigh more)
+
+### 5a. Quick test (2 splits: 211→215→220, test 229) — eval `9242ea3e`
+
+| Method | NK-BPO | NK-MFO | NK-CCO | LK-BPO | LK-MFO | LK-CCO | PK-BPO | PK-MFO | PK-CCO |
+|--------|--------|--------|--------|--------|--------|--------|--------|--------|--------|
+| reranker v2 (2 splits) | 0.418 | 0.601 | 0.691 | 0.477 | 0.560 | 0.700 | 0.182 | 0.282 | 0.341 |
+
+MFO is no longer destroyed (0.601 vs 0.577 for v1 balanced). Promising with only 2 splits.
+
+### 5b. Full training (13 splits: 160→220, test 229) — eval `a3d3bbea`
+
+Models: `lgbm_v2_full-{nk,lk,pk}`
+- NK: `fc013658-9c95-48e8-9c72-c13f477a8b26`
+- LK: `8697ffed-6814-4594-85a1-5dae3ea00b1f`
+- PK: `cdcbc26f-8f9a-41b2-9196-21bf4f9d3e2e`
+
+| Method | NK-BPO | NK-MFO | NK-CCO | LK-BPO | LK-MFO | LK-CCO | PK-BPO | PK-MFO | PK-CCO |
+|--------|--------|--------|--------|--------|--------|--------|--------|--------|--------|
+| baseline (emb only) | 0.412 | 0.590 | 0.668 | 0.467 | 0.558 | 0.675 | 0.187 | 0.278 | 0.325 |
+| **alignment_weighted** | **0.428** | **0.611** | 0.683 | **0.500** | **0.598** | 0.699 | **0.201** | 0.285 | 0.337 |
+| reranker v1 (no balancing) | 0.384 | 0.584 | 0.695 | 0.447 | 0.482 | **0.713** | 0.201 | 0.284 | 0.335 |
+| reranker v1 (balanced) | 0.408 | 0.577 | 0.687 | 0.478 | 0.506 | 0.711 | 0.201 | **0.298** | 0.332 |
+| **reranker v2 full** | 0.425 | 0.607 | **0.689** | 0.486 | 0.575 | **0.707** | 0.199 | 0.297 | **0.335** |
+
+**v2 full conclusions:**
+- **Much more robust than v1**: MFO is not destroyed (0.607 vs 0.577 for v1 balanced) and BPO improves consistently
+- **CCO remains the reranker's strong point**: NK-CCO 0.689, LK-CCO 0.707 (second best after v1 unbalanced)
+- **PK recovers**: v2 full (0.199/0.297/0.335) beats the v2 quick test, which had dropped on PK-BPO
+- **alignment_weighted still wins on BPO and MFO**: NK-BPO 0.428 vs 0.425, LK-BPO 0.500 vs 0.486, LK-MFO 0.598 vs 0.575
+- IA weighting during training plus per-category models removes the instability of v1 but does not beat the heuristic overall
+
+---
+
+## Exp 6 — Re-ranker v3 (full features: alignments + taxonomy during training)
+
+**Key change vs v2:** In v2 the alignment (NW/SW) and taxonomy features were hardcoded to NULL during training, so the model never saw them. v3 computes `compute_alignment()` and `compute_taxonomy()` for every (query, ref) pair while generating the training data, giving the model access to the full set of 22 features.
+
+**Configuration:** 13 splits (160→220), test 229, `neg_pos_ratio=10`, IA weights, `compute_alignments=true`, `compute_taxonomy=true`. Training time: ~2h 45m (vs ~2h for v2; the alignment overhead is minimal). A sketch of the training call is shown below.
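+
+A minimal sketch of the per-tier booster training implied by this configuration (binary objective, IA values entering as per-sample weights, early stopping on validation AUC); the data-loading names are assumptions and the real logic lives in `train_reranker.py`:
+
+```python
+import lightgbm as lgb
+
+def train_tier_booster(X_train, y_train, ia_train, X_val, y_val, ia_val):
+    """LightGBM binary objective; IA values enter as sample weights, not the loss."""
+    params = {"objective": "binary", "metric": "auc", "learning_rate": 0.01}
+    dtrain = lgb.Dataset(X_train, label=y_train, weight=ia_train)
+    dval = lgb.Dataset(X_val, label=y_val, weight=ia_val, reference=dtrain)
+    return lgb.train(
+        params,
+        dtrain,
+        num_boost_round=1000,                                 # raised to 5000 in v4
+        valid_sets=[dval],
+        callbacks=[lgb.early_stopping(stopping_rounds=50)],  # 100 in v4
+    )
+```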
+
+Models: `lgbm_v3_full-{nk,lk,pk}`
+- NK: `2ff1818f-71b6-4932-8f8d-b3000e3c8d34`
+- LK: `269e26b4-0bec-42fa-a077-fe5b675dd2de`
+- PK: `e14b9716-bbf8-4b99-b34b-b801c3966579`
+
+### CAFA-eval results — eval `23851bff`
+
+| Method | NK-BPO | NK-MFO | NK-CCO | LK-BPO | LK-MFO | LK-CCO | PK-BPO | PK-MFO | PK-CCO |
+|--------|--------|--------|--------|--------|--------|--------|--------|--------|--------|
+| baseline (emb only) | 0.412 | 0.590 | 0.668 | 0.467 | 0.558 | 0.675 | 0.187 | 0.278 | 0.325 |
+| alignment_weighted | 0.428 | 0.611 | 0.683 | **0.500** | 0.598 | 0.699 | 0.201 | 0.285 | 0.337 |
+| reranker v2 full | 0.425 | 0.607 | 0.689 | 0.486 | 0.575 | 0.707 | 0.199 | 0.297 | 0.335 |
+| **reranker v3 full** | **0.431** | **0.620** | **0.692** | 0.478 | **0.607** | 0.697 | **0.201** | **0.297** | **0.339** |
+
+**v3 conclusions:**
+- **The alignment features did matter.** v3 beats v2 on almost every metric, especially MFO (+0.013 NK, +0.032 LK)
+- **It beats the `alignment_weighted` heuristic** in 7 of 9 cells: NK-BPO (+0.003), NK-MFO (+0.009), NK-CCO (+0.009), LK-MFO (+0.009), PK-BPO (=), PK-MFO (+0.012), PK-CCO (+0.002)
+- It only loses on LK-BPO (0.478 vs 0.500) and LK-CCO (0.697 vs 0.699)
+- **Positive result**: the re-ranker with full features is the best overall method
+
+---
+
+## Progress summary
+
+| Phase | Experiment | Status | Best Fmax NK-MFO |
+|------|-------------|--------|-------------------|
+| 1 | Baseline KNN (k sweep) | ✅ | 0.590 (k=5) |
+| 2 | aspect_separated_knn | ✅ | ~0.590 (no clear difference) |
+| 3 | Heuristic scoring (5 configs) | ✅ | 0.611 (alignment_weighted) |
+| 4a | Re-ranker v1 LightGBM (no balancing) | ✅ | 0.584 (improves CCO, degrades MFO) |
+| 4b | Re-ranker v1 LightGBM (balanced) | ✅ | 0.577 (improves PK-MFO to 0.298) |
+| 5a | Re-ranker v2 quick test (2 splits) | ✅ | 0.601 (much more stable than v1) |
+| 5b | Re-ranker v2 full (13 splits) | ✅ | 0.607 (robust, but does not beat the heuristic) |
+| 6 | **Re-ranker v3 full (full features)** | ✅ | **0.620** (beats the heuristic) |
+| 7 | **eggNOG-mapper comparison** | ✅ | 0.359 (PROTEA better in 9/9 cells) |
+| 8 | **Pannzer2 comparison + data leakage** | ✅ | 0.717 (with leakage: 62.4% exact NK GT match) |
+| 9 | **InterProScan 6 comparison** | ✅ | 0.551 (PROTEA better in 8/9 cells) |
+| 10 | **ProstT5 vs ESMC (preliminary v3)** | ⚠️ F3 contaminated by under-training | F1+F2 valid, F3 pending |
+| 11 | **Re-train v4 "converged" (5000 rounds)** | 🔄 in progress | — |
+| 12 | **Extended PLM matrix (8 models)** | 📋 design ready (`EXPERIMENTAL_DESIGN.md`) | — |
+
+**Dependency flow:**
+```
+Exp 1 (k sweep) ✅
+  → Exp 2 (aspect_sep) ✅
+    → Prediction with full features ✅ (a818b653)
+      → Exp 3 (scoring configs) ✅ — alignment_weighted wins
+      → Exp 4 (re-ranker v1, 12 splits) ✅ — improves CCO, degrades MFO
+        → Exp 5 (re-ranker v2, per-cat + IA weights) ✅ — robust but does not beat the heuristic
+          → Exp 6 (re-ranker v3, full features) ✅ — BEATS the heuristic
+      → Exp 7 (eggNOG-mapper comparison) ✅ — PROTEA wins 9/9 cells
+      → Exp 8 (Pannzer2 + leakage analysis) ✅ — leakage confirmed, PROTEA is the only fair evaluation
+      → Exp 9 (InterProScan 6) ✅ — PROTEA better in 8/9 cells
+```
+
+**Best overall configuration: `reranker v3 full` (per-category LightGBM, 22 features, IA weights)**
+
+---
+
+## Exp 7 — eggNOG-mapper comparison
+
+**Tool:** eggNOG-mapper v2.1.13 (Docker: `quay.io/biocontainers/eggnog-mapper:2.1.13--pyhdfd78af_2`)
+**Database:** eggNOG DB v5.0.2 + Diamond v2.0.15
+**Parameters:** `-m diamond --go_evidence experimental --tax_scope auto --target_orthologs all --cpu 8`
+**Test set:** 20,281 proteins from the GOA 220→229 delta (same as all PROTEA experiments)
+**Coverage:** 17,334/20,281 proteins with GO terms (85.5%)
+**Runtime:** ~21 minutes (CPU only, 8 threads)
+
+### CAFA-eval results (IA-weighted)
+
+| Method | NK-BPO | NK-MFO | NK-CCO | LK-BPO | LK-MFO | LK-CCO | PK-BPO | PK-MFO | PK-CCO |
+|--------|--------|--------|--------|--------|--------|--------|--------|--------|--------|
+| **eggNOG-mapper 2.1.13** | 0.247 | 0.359 | 0.386 | 0.382 | 0.334 | 0.450 | 0.190 | 0.199 | 0.325 |
+| PROTEA baseline (emb only) | 0.412 | 0.590 | 0.668 | 0.467 | 0.558 | 0.675 | 0.187 | 0.278 | 0.325 |
+| **PROTEA reranker v3** | **0.431** | **0.620** | **0.692** | **0.478** | **0.607** | **0.697** | **0.201** | **0.297** | **0.339** |
+
+### Absolute Fmax difference (PROTEA v3 - eggNOG-mapper)
+
+| Category | BPO | MFO | CCO |
+|-----------|------|------|------|
+| NK | +0.184 | +0.261 | +0.306 |
+| LK | +0.096 | +0.273 | +0.247 |
+| PK | +0.011 | +0.098 | +0.014 |
+
+**Conclusions:**
+- PROTEA v3 beats eggNOG-mapper in **9 of 9 cells**
+- Even the PROTEA baseline (embeddings only) beats eggNOG-mapper in 8 of 9 cells
+- The largest differences are in NK and LK (up to +0.306 Fmax in NK-CCO)
+- eggNOG-mapper has lower coverage (85.5% vs 100%) and does not produce graded scores
+- Evaluation script: `scripts/evaluate_external_tool.py`
+
+---
+
+## Exp 8 — Pannzer2 comparison + data leakage analysis
+
+**Tool:** Pannzer2 (Helsinki web server, March 2026)
+**Database:** current UniProt/SwissProt (updated as of the run date)
+**Test set:** 20,281 proteins from the GOA 220→229 delta (same as all experiments)
+**Coverage:** 19,964/20,281 proteins with GO terms (98.4%)
+**Total predictions:** 532,557 (max 30 GO terms per protein, with calibrated PPV scores 0.31–0.91)
+
+### CAFA-eval results (IA-weighted)
+
+| Method | NK-BPO | NK-MFO | NK-CCO | LK-BPO | LK-MFO | LK-CCO | PK-BPO | PK-MFO | PK-CCO |
+|--------|--------|--------|--------|--------|--------|--------|--------|--------|--------|
+| **Pannzer2** † | **0.656** | **0.717** | **0.791** | **0.681** | **0.729** | **0.813** | **0.391** | **0.574** | **0.618** |
+| InterProScan 6 † | 0.312 | 0.551 | 0.476 | 0.479 | 0.488 | 0.491 | 0.208 | 0.269 | 0.250 |
+| eggNOG-mapper 2.1.13 † | 0.247 | 0.359 | 0.386 | 0.382 | 0.334 | 0.450 | 0.190 | 0.199 | 0.325 |
+| **PROTEA reranker v3** | **0.431** | **0.620** | **0.692** | **0.478** | **0.607** | **0.697** | **0.201** | **0.297** | **0.339** |
+
+† Subject to temporal data leakage (reference DB from March 2026, after GOA 229).
+
+### Data leakage: temporal analysis
+
+The Pannzer2 and eggNOG-mapper results are **not directly comparable** to PROTEA because of temporal data leakage:
+
+| | Pannzer2 | InterProScan 6 | eggNOG-mapper | PROTEA |
+|---|---|---|---|---|
+| **Run date** | March 2026 | 25 Mar 2026 | 24 Mar 2026 | — |
+| **Reference DB** | UniProt/SwissProt 2026 | InterPro 2026 | eggNOG v5.0.2 (2026) | GOA 220 (frozen at t0) |
+| **Knows the answers?** | Yes | Partially | Partially | No |
+
+**Leakage quantification:** We measured the percentage of ground-truth (protein, GO term) pairs that appear verbatim in each tool's predictions; a sketch of the computation follows below.
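+
+A minimal sketch of that exact-match measurement, assuming two dataframes with hypothetical `protein_id` / `go_term` columns (the actual pipeline is `scripts/evaluate_external_tool.py`):
+
+```python
+import pandas as pd
+
+def exact_match_rate(ground_truth: pd.DataFrame, predictions: pd.DataFrame) -> float:
+    """Fraction of GT (protein, GO term) pairs present verbatim in a tool's predictions."""
+    gt_pairs = set(zip(ground_truth["protein_id"], ground_truth["go_term"]))
+    pred_pairs = set(zip(predictions["protein_id"], predictions["go_term"]))
+    return len(gt_pairs & pred_pairs) / len(gt_pairs)
+```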
+
+| Category | GT pairs | Pannzer2 exact match | eggNOG exact match |
+|-----------|----------|---------------------|-------------------|
+| **Total** | 40,014 | 20,373 (**50.9%**) | 10,308 (25.8%) |
+| NK | 6,953 | 4,339 (**62.4%**) | 1,025 (14.7%) |
+| LK | 5,520 | 3,624 (**65.7%**) | 1,087 (19.7%) |
+| PK | 27,541 | 12,410 (45.1%) | 8,196 (29.8%) |
+
+Pannzer2 gets 62.4% of the NK annotations exactly right, for proteins that by definition had no experimental annotations at t0. This confirms that its reference DB contains annotations issued after GOA 220, including many that are part of the GOA 229 ground truth.
+
+**Conclusion:** PROTEA is the only tool in the benchmark that guarantees temporal integrity: the reference is frozen at t0, the ground truth is computed as a delta, and everything is versioned in the DB. The Pannzer2 and eggNOG-mapper numbers represent an **optimistic upper bound** under data leakage, not a fair comparison.
+
+- Pannzer2 result parsing: `/home/frapercan/Thesis/pannzer2_results/parse_pannzer2.py`
+- Raw HTML: `/home/frapercan/Thesis/pannzer2_results/raw/PANZ_{1-21}.html`
+- Evaluation script: `scripts/evaluate_external_tool.py --tool pannzer2`
+
+---
+
+## Previous findings
+
+- The KNN baseline with `score = 1 - distance/2` gives good results on NK/LK
+- An earlier per-aspect LightGBM attempt (9 models) **degraded** NK/LK:
+  - Cause 1: it optimises binary CE (all GO terms weigh the same) while CAFA-eval weights by IC
+  - Cause 2: the aggregation features were NULL in the prediction set
+
+---
+
+## Exp 10 — ProstT5 vs ESMC (preliminary v3 comparison)
+
+**Date**: 2026-04-10
+**Goal**: replicate the v3 reranker on a second PLM (ProstT5-XL, ~3B) to see whether the v3 gain generalises beyond ESMC-300M.
+
+> **Important methodological caveat**: ESMC-300M (~300M params, BERT-like encoder) and ProstT5-XL (~3B params, T5 encoder + structure fine-tuning) differ in both size and architecture. This comparison mixes those axes, so it is not fair grounds for concluding anything about "ESMC vs ProstT5 as families". The clean-matrix benchmark lives in `EXPERIMENTAL_DESIGN.md` (Exp 12).
+
+### Setup
+
+- **Evaluation set**: `42b34e79-6fe9-4fa0-b718-02f43a1e3192` (GOA 220→229 delta, 20281 proteins)
+- **ESMC prediction set**: `a818b653-cad9-4f42-8e04-eda3f5ff2ceb`
+- **ProstT5 prediction set**: `38ee00af-cbfd-4c5b-ab84-c98a32765b40`
+- **IA file**: `IA_cafa6.tsv`
+- **Ontology snapshot**: `947bdff6-d17c-4ca3-a41a-bc8fb4d74b7a`
+
+v3 rerankers (`num_boost_round=1000, early_stopping_rounds=50, neg_pos_ratio=10, IA sample weights, 13 splits 160→220`):
+
+| Embedding | NK | LK | PK |
+|---|---|---|---|
+| ESMC-300M (job `16c3dcfd`) | `2ff1818f` | `269e26b4` | `e14b9716` |
+| ProstT5-XL (job `12b704d4`) | `a1b4947d` | `60597ab9` | `1efd0c33` |
+
+CAFA eval results:
+- ESMC + reranker: `ba7476cb-81f2-461a-b69a-a99c8df834bf`
+- ProstT5 + reranker: `7b97e74a-54df-4e4e-90ed-39e07b58de64`
+
+### Results (cafaeval + IA, official evaluation)
+
+**F1 — ProstT5 wins on raw retrieval**: avg baseline Fmax ProstT5 0.4849 vs ESMC 0.4824. Consistent across the 9 cells: ProstT5 wins 44/45 in the earlier 45-cell benchmark.
+
+**F3 — Per-aspect reranker (9 cells)**:
+
+| Method | NK-BPO | NK-MFO | NK-CCO | LK-BPO | LK-MFO | LK-CCO | PK-BPO | PK-MFO | PK-CCO | Avg |
+|---|---|---|---|---|---|---|---|---|---|---|
+| ESMC baseline | 0.412 | 0.590 | 0.668 | 0.467 | 0.558 | 0.675 | 0.187 | 0.278 | 0.325 | 0.4624 |
+| ESMC + reranker v3 | 0.431 | 0.620 | 0.692 | 0.478 | 0.607 | 0.697 | 0.201 | 0.297 | 0.339 | **0.4846** |
+| ProstT5 baseline | ~ | ~ | ~ | ~ | ~ | ~ | ~ | ~ | ~ | **0.4849** |
+| ProstT5 + reranker v3 | ~ | ~ | ~ | ~ | ~ | ~ | ~ | ~ | ~ | 0.4817 |
+
+- **ESMC improves with the reranker**: 6/9 cells, avg Δ = **+0.0022**
+- **ProstT5 degrades with the reranker**: 9/9 cells, avg Δ = **−0.0032**
+- Final avg ESMC+rr (0.4846) ≈ ProstT5+rr (0.4817); the difference is small but has the opposite sign to the raw-retrieval gap
+
+### F2 — Feature importance (compensation hypothesis)
+
+Extracted `feature_importance` (gain) from the 6 rerankers, aggregated over the `{alignment_*, similarity_*, taxonomic_*}` features:
+
+- **ESMC models weight alignment+taxonomy 2.15% to 5.22% higher** than their ProstT5 counterparts (monotonic across NK/LK/PK)
+- Dramatic differences on individual features:
+  - NK `alignment_score_nw`: ESMC 4.72% vs ProstT5 1.69% (**2.8×**)
+  - PK `similarity_nw`: ESMC 9.63% vs ProstT5 3.91% (**2.5×**)
+- ProstT5 compensates by redistributing weight to embedding-derived features: `ref_annotation_density`, `vote_count`, `k_position`
+
+**Interpretation**: when the embedding is "stronger" (ProstT5), the reranker leans less on external signals (alignment, taxonomy) and more on statistics derived from the retrieval itself. This is the carry-over of the hypothesis to be tested formally as H4 in `EXPERIMENTAL_DESIGN.md`.
+
+### Blocker — under-training in all 6 v3 models
+
+Review of each model's `best_iteration` under `num_boost_round=1000, early_stopping_rounds=50`:
+
+| Model | best_iteration |
+|---|---|
+| ESMC-nk | **1000** (ceiling, early stop never fired) |
+| ESMC-lk | 994 |
+| ESMC-pk | 999 |
+| ProstT5-nk | **1000** |
+| ProstT5-lk | 995 |
+| ProstT5-pk | **1000** |
+
+With 95k–332k samples per tier and LR=0.01, this dataset typically needs 3000–10000 iterations to saturate. **Conclusion**: the F3 deltas (especially ProstT5's negative sign, −0.0032) may be an artefact of under-training rather than a real embedding effect.
+
+- **F2 (feature importance) remains valid**: both models had the same budget below the ceiling, and the *relative* difference in how they distribute alignment/taxonomy weight is a fair comparison
+- **F3 (the signs of the Fmax deltas) is contaminated** and must not be used for the thesis until the models converge
+
+**Critical methodological lesson**: the `test_evaluation` field reported by `train_reranker_auto` shows deltas of +0.04 to +0.08 Fmax, far more optimistic than the real +0.002 from cafaeval. `test_evaluation` applies neither GO propagation nor IA weighting, so it must not be used for the thesis. Only cafaeval with IA.
+
+### Status
+
+- F1 and F2: publishable with the current numbers
+- F3: **pending re-evaluation** after v4 (see Exp 11)
+- Detailed working state: `project_reranker_benchmark.md` (auto-memory)
+
+---
+
+## Exp 11 — v4 "converged" re-training (in progress)
+
+**Launch date**: 2026-04-10 18:03 UTC
+**Goal**: re-train the 6 models (ESMC and ProstT5, NK/LK/PK) with a budget large enough for early stopping to actually fire, removing the under-training confounder of Exp 10. A sketch of the planned convergence check appears below.
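+
+A minimal sketch of the convergence check and gain-importance dump planned for when the v4 jobs finish, written against the in-memory booster returned by training; the helper name and threshold handling are assumptions:
+
+```python
+import lightgbm as lgb
+import pandas as pd
+
+def inspect_booster(booster: lgb.Booster, budget: int = 5000) -> pd.DataFrame:
+    """Report whether early stopping fired and return gain-based feature importance."""
+    if booster.best_iteration in (0, budget):
+        print(f"hit the boosting ceiling ({budget}); model may still be under-trained")
+    else:
+        print(f"early stopping fired at iteration {booster.best_iteration}")
+    imp = pd.DataFrame({
+        "feature": booster.feature_name(),
+        "gain": booster.feature_importance(importance_type="gain"),
+    })
+    imp["gain_pct"] = 100 * imp["gain"] / imp["gain"].sum()
+    return imp.sort_values("gain_pct", ascending=False)
+```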
+
+### Changes vs v3
+
+| Parameter | v3 | v4 |
+|---|---|---|
+| `num_boost_round` | 1000 | **5000** |
+| `early_stopping_rounds` | 50 | **100** |
+| Everything else | — | identical (13 splits 160→220, neg_pos_ratio=10, IA weights, per-tier NK/LK/PK, alignment+taxonomy features) |
+
+The rest of the pipeline (KNN, FAISS IVFFlat, feature engineering) is identical; v4 changes **only** the boosting budget.
+
+### Jobs
+
+Both submitted to `protea.training` (isolated queue, dedicated worker, peak RAM ~14 GB with the chunked-KNN fixes from 2026-04-10):
+
+| Job | Model | Expected state |
+|---|---|---|
+| `48c91381-1af1-414c-bd1b-a6a51c931873` | `lgbm_v4_converged_esmc` | running (~2h) |
+| `e923ac70-21a8-4c5c-8cc6-9ebb76d156aa` | `lgbm_v4_converged_prostt5` | queued, will start once ESMC finishes |
+
+Estimated total time: ~4h serial (protea.training processes one job at a time).
+
+### Expected scenarios on completion
+
+- **A — the F2 narrative is confirmed**: ProstT5 keeps degrading (−ΔFmax after convergence) → strong thesis conclusion, the compensation hypothesis gains weight
+- **B — ProstT5 turns neutral or positive**: the narrative softens ("both embeddings improve with the reranker, ESMC a bit more"); F2 remains valid as the explanation
+- **C — both rise by ~0.01-0.02**: confirms v3 was under-trained and yields definitive numbers higher than Exp 10
+
+### To do once it finishes
+
+1. Check `best_iteration` for the 6 new models (expecting 2000-4000, with early stopping firing)
+2. Re-run `run_cafa_evaluation` for both embeddings with the new reranker UUIDs
+3. Re-extract feature importance and re-validate F2
+4. Replace the F3 table in Exp 10 with the v4 numbers
+5. Decide A/B/C and update the thesis narrative accordingly
+
+---
+
+## Exp 12 — Extended PLM benchmark matrix (planned)
+
+**Design date**: 2026-04-10
+**Status**: prospective design document
+**Full plan**: `EXPERIMENTAL_DESIGN.md`
+
+### Motivation
+
+Exp 10 exposed the central confounder of the preliminary work: comparing ESMC-300M (~300M, BERT-like) with ProstT5-XL (~3B, T5 + structure fine-tuning) mixes **size** and **family** in a single axis. No finding can be attributed to either dimension without a matrix that separates them.
+
+### Proposed matrix (8 models)
+
+| # | Model | Params | Backend | Status |
+|---|---|---|---|---|
+| 1 | ESMC-300M | ~300M | `esm3c` | ✓ (Exp 10, v4 in progress) |
+| 2 | ESMC-600M | ~600M | `esm3c` | new |
+| 3 | ESM2-650M (`esm2_t33_650M_UR50D`) | ~650M | `esm` | new |
+| 4 | ESM2-3B (`esm2_t36_3B_UR50D`) | ~3B | `esm` | new |
+| 5 | Ankh-base (`ElnaggarLab/ankh-base`) | ~450M | `ankh` | new |
+| 6 | Ankh-large (`ElnaggarLab/ankh-large`) | ~1.9B | `ankh` | new |
+| 7 | ProtT5-XL (`prot_t5_xl_uniref50`) | ~3B | `t5` | new |
+| 8 | ProstT5-XL | ~3B | `t5` | ✓ (Exp 10, v4 in progress) |
+
+**Discarded**: ESM2-15B (prohibitive embedding cost, and no T5 counterpart of matching size, which would break the symmetry of the matrix).
+
+### Research questions (see `EXPERIMENTAL_DESIGN.md` §2)
+
+- **RQ1**: at fixed size, which family wins (BERT-like vs T5 encoder)?
+- **RQ2**: how does Fmax scale with size within a family? Where does it saturate?
+- **RQ3**: does structure help? Paired test ProtT5-XL vs ProstT5-XL (same backbone, the only difference is 3Di fine-tuning)
+- **RQ4**: do weaker embeddings force the reranker to compensate with alignment+taxonomy? (carry-over of F2)
+
+### Protocol
+
+The pipeline is identical for all 8 models, with zero per-model tuning. See `EXPERIMENTAL_DESIGN.md` §6 for the pinned hyperparameters: KNN `k=5`, FAISS IVFFlat, alignments + taxonomy on, reranker v4 (5000 rounds), `run_cafa_evaluation` with IA weighting.
+
+### Statistical tests
+
+Wilcoxon signed-rank over the 9 Fmax cells, Holm-Bonferroni correction over the 6 paired comparisons, 95% bootstrap CIs for effect sizes. OLS regression for H4.
+
+### Cost
+
+~3-4 days of sequential compute (embeddings + KNN + v4 training + eval for the 6 new models). Compressible with GPU parallelism if available.
+
+### Status
+
+- **Design**: complete (`EXPERIMENTAL_DESIGN.md` v1.0)
+- **Execution**: blocked until v4 (Exp 11) validates that the budget is right
+- **Prior dependencies**: the Ankh backend is already integrated in PROTEA as a dedicated `model_backend="ankh"` (not an alias of `t5`); see `project_ankh_backend.md`
+
+### Expected deliverables
+
+1. Master table 8 × 3 (baseline / alignment_weighted / reranker) × 9 cells
+2. Feature-importance heatmap for the 24 rerankers (8 models × 3 tiers)
+3. Statistical report (p-values + effect sizes + CIs) per comparison
+4. Thesis chapter formalising RQ1-RQ4 with the matrix as evidence
diff --git a/README.md b/README.md index 9cefcd8..d10fa0c 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ **Protein annotation platform** for large-scale GO term prediction, sequence embedding, and functional analysis. -PROTEA provides a unified backend for ingesting protein data from UniProt, computing ESM2 embeddings, and predicting Gene Ontology terms via KNN transfer — with a full job queue, REST API, and web interface. +PROTEA provides a unified backend for ingesting protein data from UniProt, computing protein language model embeddings (ESMC, ProstT5, ESM2), and predicting Gene Ontology terms via KNN transfer plus a learned LightGBM re-ranker — with a full job queue, REST API, and web interface. [![Lint](https://github.com/frapercan/PROTEA/actions/workflows/lint.yml/badge.svg)](https://github.com/frapercan/PROTEA/actions/workflows/lint.yml) [![Tests](https://github.com/frapercan/PROTEA/actions/workflows/test.yml/badge.svg)](https://github.com/frapercan/PROTEA/actions/workflows/test.yml) @@ -21,6 +21,16 @@ PROTEA provides a unified backend for ingesting protein data from UniProt, compu --- +## Why PROTEA? + +PROTEA is the successor to [PIS](https://github.com/CBBIO/protein-information-system) and [FANTASIA](https://github.com/CBBIO/fantasia), rebuilt around three goals: + +1. **Clean architecture** — infrastructure, orchestration, and domain logic are explicitly decoupled. Operations are pure domain logic; workers own sessions and queue state; routers expose HTTP. No more God-classes that mix everything. +2. **Learned re-ranking on top of KNN transfer** — beyond classical embedding-KNN annotation, PROTEA trains **LightGBM rerankers on temporal GOA splits** (binary cross-entropy objective with CAFA IA sample weighting, per-tier NK/LK/PK models). Candidates retrieved by KNN are re-scored with alignment, taxonomy, and retrieval features. +3. **Honest temporal evaluation** — benchmarking uses **temporal holdout deltas** between historical GOA releases (e.g. 220→229), evaluated with the official `cafaeval` library and information-accretion weighting, avoiding the optimistic leakage of random splits.
+
+---
+
 ## What PROTEA does
 
 | Capability | Details |
@@ -28,12 +38,13 @@ PROTEA provides a unified backend for ingesting protein data from UniProt, compu
 | **Protein ingestion** | Paginated UniProt REST API, MD5-deduplicated sequences |
 | **GO ontology** | Load OBO snapshots, full DAG stored per release |
 | **GO annotations** | Bulk import from GOA (GAF) and QuickGO (TSV) |
-| **Embeddings** | ESM2 via GPU workers, stored as pgvector VECTOR columns |
-| **GO prediction** | KNN transfer with optional NW/SW alignment and taxonomic features |
-| **CAFA evaluation** | Benchmark pipeline with cafaeval integration |
-| **Job queue** | RabbitMQ-backed, 7 queues, full audit trail per job |
-| **REST API** | 21 FastAPI endpoints across 5 routers |
-| **Web UI** | Next.js frontend with protein explorer, annotation viewer, prediction browser |
+| **Embeddings** | ESMC, ProstT5, and ESM2 backends via GPU workers; stored as pgvector `VECTOR` columns |
+| **GO prediction** | KNN transfer (FAISS IVFFlat / numpy) with optional NW/SW alignment and taxonomic features |
+| **Learning-to-rank** | LightGBM rerankers trained on temporal GOA splits — binary objective + IA weighting, per-tier NK/LK/PK models |
+| **CAFA evaluation** | Benchmark pipeline with `cafaeval` integration, Fmax + IA-weighted scoring, per-aspect (BPO/MFO/CCO) results |
+| **Job queue** | RabbitMQ-backed, 8 queues (ingestion, embeddings, predictions, training), full audit trail per job |
+| **REST API** | FastAPI routers for jobs, proteins, embeddings, query sets, scoring, evaluation, and admin |
+| **Web UI** | Next.js frontend with protein explorer, annotation viewer, prediction browser, and live job widget |
 
 ---
 
@@ -103,10 +114,17 @@ poetry run task lint   # ruff + flake8 + mypy
 |---|---|
 | API | FastAPI + SQLAlchemy 2.x + PostgreSQL 16 + pgvector |
 | Queue | RabbitMQ (pika) |
-| Embeddings | ESM2 (Meta) via Hugging Face Transformers |
-| KNN search | FAISS IVFFlat / numpy |
+| Embeddings | ESMC (ESM SDK), ProstT5 / prot_t5_xl (T5Encoder), ESM2 (Hugging Face Transformers) |
+| KNN search | FAISS IVFFlat / numpy (chunked brute-force) |
+| Re-ranker | LightGBM (binary objective, IA-weighted samples) |
 | Frontend | Next.js 19 + Tailwind v4 |
-| Deployment | Docker, manage.sh, vast.ai GPU instances |
+| Deployment | Docker Compose, `scripts/manage.sh` process supervisor |
+
+---
+
+## License
+
+Released into the public domain under the [Unlicense](LICENSE). You are free to copy, modify, publish, use, compile, sell, or distribute PROTEA for any purpose, commercial or non-commercial, without attribution.
 
 ---
 
diff --git a/RERANKER.md b/RERANKER.md
index 2301546..89a0711 100644
--- a/RERANKER.md
+++ b/RERANKER.md
@@ -1,188 +1,237 @@
-# Temporal Holdout Re-Ranker for GO Term Prediction
+# PROTEA Re-Ranker — Design and Rationale
 
-## Motivación
+**Status**: implemented (v3 shipped, v4 training in progress)
+**Location in code**: `protea/core/reranker.py`, `protea/core/operations/train_reranker.py`
+**Version**: 2.0 — 2026-04-10 (rewrite)
 
-El pipeline actual de PROTEA transfiere anotaciones GO mediante KNN sobre embeddings ESM, usando un scoring heurístico que combina distancia de embedding y pesos de evidencia. Este scoring no está optimizado para la métrica objetivo (Fmax) ni para el comportamiento real de las anotaciones GO a lo largo del tiempo.
- -La hipótesis central es que existe una señal aprendible: **dado el contexto de una predicción KNN, ¿acabará este GO term apareciendo en el siguiente release de GOA para esta proteína?** Esta señal puede extraerse directamente del mecanismo de holdout temporal que ya implementa PROTEA. +> This document describes **the re-ranker as it exists in PROTEA today**. An earlier version of this file proposed a PyTorch cross-attention architecture with WebDataset shards; that proposal was explored on paper but **never implemented**. The system converged on a simpler LightGBM design for the reasons documented in §3 ("Why LightGBM and not a neural cross-encoder"). The experiment log showing the evolution across versions lives in `EXPERIMENTS.md`; the forward-looking PLM benchmark plan that uses this re-ranker as a fixed downstream stage lives in `EXPERIMENTAL_DESIGN.md`. --- -## Formulación del Problema +## 1. Problem statement -Sea $\mathcal{G}_N$ el conjunto de anotaciones GO en el release $N$ de GOA (Swiss-Prot reviewed). Para cada par consecutivo $(G_N, G_{N+1})$, el delta temporal es: +PROTEA predicts GO terms by transferring annotations from the $k$ nearest reference proteins in an embedding space. The raw retrieval score is a distance-based heuristic (e.g. `1 - cosine_distance / 2`) optionally combined with alignment identity and evidence weights. This heuristic is: -$$\Delta_{N \to N+1} = \{(p, t) \mid (p, t) \in \mathcal{G}_{N+1} \setminus \mathcal{G}_N\}$$ +- **Not optimised for Fmax with IA weighting** — the metric CAFA actually uses +- **Not calibrated across tiers** — No-Knowledge, Limited-Knowledge and Previously-Known proteins behave very differently and benefit from different signal combinations +- **Not able to use all available features** — sequence alignments, taxonomy, neighbour statistics, and evidence codes are either ignored or combined by hand with arbitrary weights -El re-ranker aprende una función: +The re-ranker replaces this heuristic with a **learned function** that, for each candidate GO term, produces a probability score used to reorder the top-$k$ retrieval list: $$f(q, t, \mathcal{N}_K(q)) \to \hat{y} \in [0, 1]$$ -donde: -- $q$ es la proteína query (representada por su embedding ESM) -- $t$ es el GO term candidato -- $\mathcal{N}_K(q)$ es el conjunto de $K$ vecinos más cercanos en el espacio de embeddings con referencia $\mathcal{G}_N$ -- $\hat{y}$ es la probabilidad de que $(q, t) \in \Delta_{N \to N+1}$ +where $q$ is the query protein, $t$ is a candidate GO term, and $\mathcal{N}_K(q)$ is the set of $K$ nearest neighbours that voted for $t$. + +The training signal is derived from the **temporal structure of GOA releases**: a GO term that first appears for a protein in a later release (and was missing from an earlier one) defines a positive example; any term predicted but absent from the future release is a negative. See §4. --- -## Protocolo de Entrenamiento +## 2. 
Scope of this document -Se utiliza validación cruzada temporal con múltiples splits históricos de GOA: +| Covered | Not covered | +|---|---| +| Model architecture and feature set | Downstream CAFA evaluation protocol (→ `EXPERIMENTAL_DESIGN.md` §7) | +| Training protocol and hyperparameters | PLM comparison across ESMC/ESM2/ProstT5/Ankh (→ `EXPERIMENTAL_DESIGN.md`) | +| Version history and key design decisions | Historical result tables per experiment (→ `EXPERIMENTS.md`) | +| Integration with the PROTEA pipeline | Alternative rankers (cross-attention, ListNet, ProT5 rerankers…) | +| Known limitations | External tool baselines (eggNOG, Pannzer2, InterProScan) | -``` -Training splits: - GOA_190 → GOA_195 - GOA_195 → GOA_200 - GOA_200 → GOA_205 - GOA_205 → GOA_211 - GOA_211 → GOA_215 - GOA_215 → GOA_220 - -Test split (holdout estricto, nunca visto durante training): - GOA_220 → GOA_229 -``` +--- + +## 3. Why LightGBM and not a neural cross-encoder + +The original design (see §11 for the earlier version's record) proposed a cross-attention neural re-ranker in PyTorch, with learned GO term embeddings from the GO DAG and a WebDataset sharded data pipeline. That proposal was abandoned in favour of a LightGBM gradient-boosted tree model for four concrete reasons: -Para cada split se generan ejemplos etiquetados: positivos $(y=1)$ si el par (proteína, GO term) aparece en el delta, negativos $(y=0)$ en caso contrario. El desbalanceo esperado es aproximadamente 1:10, manejable con técnicas estándar. +1. **Data volume is moderate, not huge.** Each temporal split yields 80k–330k training rows after negative subsampling. Gradient boosted trees are the sample-efficient sweet spot for this regime; a cross-attention transformer would either overfit or need heavy regularisation and we would then be tuning architecture choices instead of studying the actual research question. +2. **Feature heterogeneity is the bottleneck, not representation.** The informative features are already engineered (alignment scores, taxonomy distance, neighbour statistics). A model whose job is to combine 23 tabular features non-linearly across categorical and numeric axes is exactly what GBDT excels at. A neural cross-encoder would need to learn an equivalent combination from scratch. +3. **Interpretability is a thesis requirement.** The F2 finding (that smaller PLMs force the re-ranker to rely more on alignment/taxonomy) can only be measured through gain-based feature importance. LightGBM exposes this directly; extracting equivalent attributions from a cross-attention model requires additional machinery (integrated gradients, attention rollout) that adds failure modes. +4. **Training cost was a hard constraint.** Each re-ranker (per-tier × per-embedding) trains in 2–4 hours on CPU. The same pipeline under a neural cross-encoder with the same budget would train a single model for similar time on a GPU while blocking the embedding worker. Since the PLM benchmark (`EXPERIMENTAL_DESIGN.md`) multiplies compute cost by 8, the LightGBM choice is what makes the study feasible on a single workstation. + +The cross-attention design was not a wrong idea, only a wrong fit for this problem at this scale. Revisiting it remains an option if a later phase of the work finds a measurable ceiling on LightGBM. --- -## Arquitectura: Cross-Attention Re-Ranker +## 4. Temporal holdout training signal -El modelo procesa cada par (query, GO term) usando el contexto completo de los vecinos KNN que contribuyeron a esa predicción. 
+Let $\mathcal{G}_N$ denote the set of GO annotations present in GOA release $N$ (Swiss-Prot reviewed, evidence-filtered to exclude IEA if so configured). For any ordered pair of releases $(N, N+1)$, the **annotation delta** is -``` -Inputs por predicción (query_protein, go_term): - query_embedding float32[D] ESM embedding del query (D=480 para esmc_300m) - neighbor_embeddings float32[K × D] ESM embeddings de los K vecinos contribuyentes - tabular_features float32[K × F] distancia, evidencia, alineamiento, taxonomía... - go_term_embedding float32[G] embedding semántico del GO term (G=64) - -Arquitectura: - 1. query_proj(query_embedding) → q [H=256] - 2. ref_proj(neighbor_embeddings) → tokens [K × H] - 3. feature_encoder(tabular_features) → (sumado a tokens) - 4. CrossAttention(q, tokens, tokens) → context [H] - 5. MLP([q ‖ context ‖ go_emb ‖ agg_features]) → score [1] -``` +$$\Delta_{N \to N+1} = \{(p, t) \mid (p, t) \in \mathcal{G}_{N+1} \setminus \mathcal{G}_N\}$$ + +For a training pair $(N, N+1)$: -La atención cruzada permite al modelo aprender **qué vecinos son más informativos para este query concreto**, en lugar de agregar los scores de forma heurística. +1. All proteins in $\mathcal{G}_{N+1}$ are used as queries. +2. KNN retrieval is performed using **only** the reference set derived from $\mathcal{G}_N$ (no leakage from the future). +3. For each candidate $(q, t)$ in the retrieval output: + - **Positive** ($y = 1$) if $(q, t) \in \Delta_{N \to N+1}$ (the annotation materialised between $N$ and $N+1$) + - **Negative** ($y = 0$) if the model predicted $t$ but $(q, t) \notin \mathcal{G}_{N+1}$ -### GO Term Embeddings +This definition ensures the training labels are **causally prior** to the prediction: at time $N$ the system does not know what $N+1$ will contain, and neither does the re-ranker while scoring. -Los embeddings de los GO terms se aprenden a partir de la estructura del DAG de GO (relaciones `is_a` / `part_of`) mediante Node2Vec o TransE, de forma que términos semánticamente relacionados (padre-hijo) tengan representaciones similares. El DAG ya está disponible en PROTEA a través de los modelos `GOTerm` y `GOTermRelationship`. +The test split $(220 \to 229)$ is never seen during training and produces the Fmax numbers that are reported for the thesis. --- -## Feature Vector +## 5. Feature set (implementation: `protea/core/reranker.py`) -Cada predicción (query, GO term) se caracteriza por las siguientes features tabulares, computadas por vecino que contribuyó a la predicción: +Each (query, candidate GO term, contributing neighbour) triple is characterised by **23 features** — 20 numeric and 3 categorical — computed at KNN time and persisted on `GOPrediction` rows. 
-| Feature | Descripción | Estado | +### 5.1 Numeric features (20) + +| Group | Feature | Origin | |---|---|---| -| `distance` | Distancia coseno en espacio de embeddings | Existente | -| `evidence_weight` | Peso del código de evidencia (IDA > IEA) | Existente | -| `identity_nw / sw` | Identidad de secuencia (alineamiento NW/SW) | Existente (opcional) | -| `similarity_nw / sw` | Similaridad de secuencia | Existente (opcional) | -| `taxonomic_distance` | Distancia taxonómica entre query y referencia | Existente (opcional) | -| `vote_count` | Número de vecinos que coinciden en este GO term | **Nuevo** | -| `k_position` | Posición del vecino más cercano que predijo este término | **Nuevo** | -| `go_term_frequency` | Frecuencia del término en el annotation set de referencia | **Nuevo** | -| `ref_annotation_density` | Número de GO terms de la proteína de referencia | **Nuevo** | -| `neighbor_distance_std` | Varianza de distancias a los K vecinos | **Nuevo** | +| **Embedding retrieval** | `distance` | cosine distance between query and the contributing neighbour | +| **NW alignment** | `identity_nw`, `similarity_nw`, `alignment_score_nw`, `gaps_pct_nw`, `alignment_length_nw` | Needleman–Wunsch via parasail (BLOSUM62), computed per (query, neighbour) pair when `compute_alignments=True` | +| **SW alignment** | `identity_sw`, `similarity_sw`, `alignment_score_sw`, `gaps_pct_sw`, `alignment_length_sw` | Smith–Waterman via parasail (BLOSUM62), same condition | +| **Sequence length** | `length_query`, `length_ref` | Raw sequence lengths | +| **Taxonomy** | `taxonomic_distance`, `taxonomic_common_ancestors` | NCBI taxonomy LCA via ete3 when `compute_taxonomy=True` | +| **Neighbour aggregation** | `vote_count` | Number of neighbours in the top-$k$ that voted for the same GO term | +| | `k_position` | Rank (0-indexed) of the closest neighbour that supported the term | +| | `go_term_frequency` | Global frequency of the term in the reference annotation set | +| | `ref_annotation_density` | Number of distinct GO terms annotating the reference protein | +| | `neighbor_distance_std` | Standard deviation of distances across the $k$ neighbours of the query | + +### 5.2 Categorical features (3) + +| Feature | Meaning | +|---|---| +| `qualifier` | GAF qualifier of the source annotation (`enables`, `involved_in`, etc.) | +| `evidence_code` | GAF evidence code of the source annotation (`EXP`, `IDA`, `IEA`, …) | +| `taxonomic_relation` | Discrete label derived from the LCA (`same_species`, `same_genus`, `same_family`, `distant`) | + +Categoricals are passed to LightGBM via its native `categorical_feature` handling (no one-hot encoding; LightGBM partitions on category sets directly). + +### 5.3 Missing-value convention + +- Numeric missing values are left as `NaN` and handled natively by LightGBM's missing-value-aware splits. +- Categorical missing values are coerced to `NA` and treated as a distinct bin. +- Alignment and taxonomy columns are only populated when `compute_alignments=True` / `compute_taxonomy=True` at prediction time. If either flag is off, those columns are all-NaN for the run and the re-ranker still trains but with a degraded feature set. --- -## Función de Pérdida +## 6. 
Model and training protocol + +### 6.1 Model + +- **Library**: LightGBM (`lightgbm.Booster`) +- **Objective**: `binary` (binary cross-entropy / log loss) +- **Validation metric**: `binary_logloss` and `auc` (early stopping is tracked on AUC) +- **Boosting**: `gbdt` with `num_leaves=31`, `learning_rate=0.01`, `feature_fraction=0.8`, `bagging_fraction=0.8`, `bagging_freq=5`, `seed=42` +- **Early stopping**: disabled via callback only if `early_stopping_rounds=0`; otherwise stops when validation AUC does not improve for the configured number of rounds -Se utiliza **LambdaRank** en lugar de binary cross-entropy, ya que optimiza directamente el orden de las predicciones (proxy de NDCG / Fmax) en lugar de la calibración de probabilidades. +> **Note on the objective.** Earlier drafts of this document (and informal notes) described the loss as **LambdaRank**. The implementation is actually **binary cross-entropy**. Switching to a pairwise/listwise rank loss is a known avenue for future work; it was deferred because (a) binary CE is the simpler baseline and has already matched or beaten the heuristic `alignment_weighted` scoring and (b) LambdaRank would require restructuring the training data into query groups, which complicates the per-split sampling pipeline. -Para cada proteína query, las predicciones GO se rankean conjuntamente: -- Positivos: GO terms en $\Delta_{N \to N+1}$ -- Negativos: GO terms predichos pero no en el delta +### 6.2 Split strategy + +- **Stratified train/val split** at `val_fraction=0.2`, stratified on the label (the positive rate is 0.17%–5% depending on tier × aspect — naive random splits would under-represent positives in the validation set). +- **Negative subsampling** via `neg_pos_ratio=10`: after splitting, each of the train and val sets is independently subsampled so that `|negatives| ≤ 10 × |positives|`. Without this step, 6 of 9 per-(tier, aspect) models in v1 failed to learn at all — the positive rate was too low for gradient boosted trees to see a signal. +- **IA sample weighting**: when an information accretion file is provided, each row's `sample_weight` is set to `IA(go_term)`. This makes the model focus on informative (rare, specific) GO terms — the same aspect of the term that CAFA evaluation rewards via IA-weighted Fmax. + +### 6.3 Per-tier, not per-aspect + +One model is trained **per tier** (`NK`, `LK`, `PK`), not per (tier × aspect). This was an explicit change in v2 after v1 trained 9 models (one per cell) and 6 of them either never converged or overfit on the smaller aspect slices. Aspect identity is not currently used as a feature; this is a known simplification (see §9). + +### 6.4 Temporal splits + +- **Training pairs**: 13 consecutive deltas from GOA 160 through GOA 220 — `[(160,165), (165,170), (170,175), (175,180), (180,185), (185,190), (190,195), (195,200), (200,205), (205,211), (211,215), (215,220)]`. The training rows from all pairs are concatenated and passed to LightGBM as a single dataset. Pair identity is not used as a feature. +- **Test pair**: `(220, 229)` — never seen during training. The test set is passed through the trained reranker and fed to `run_cafa_evaluation` alongside the baseline to measure the lift. 
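+
+A condensed sketch of the per-tier training call described in §6.1 and §6.2. The real implementation lives in `protea/core/reranker.py`; the helper names (`train_tier`, `subsample_negatives`) and the assumed shape of `frame` (the §5 feature columns plus `label` and `go_term`) are illustrative, while the hyperparameter values are the ones listed above:
+
+```python
+import lightgbm as lgb
+import pandas as pd
+from sklearn.model_selection import train_test_split
+
+CATEGORICALS = ["qualifier", "evidence_code", "taxonomic_relation"]
+
+
+def subsample_negatives(df: pd.DataFrame, neg_pos_ratio: int = 10, seed: int = 42) -> pd.DataFrame:
+    """Keep at most neg_pos_ratio negatives per positive (applied to train and val independently)."""
+    pos = df[df["label"] == 1]
+    neg = df[df["label"] == 0]
+    keep = min(len(neg), neg_pos_ratio * len(pos))
+    return pd.concat([pos, neg.sample(n=keep, random_state=seed)])
+
+
+def train_tier(frame: pd.DataFrame, ia_weights: dict, num_boost_round: int = 5000,
+               early_stopping_rounds: int = 100) -> lgb.Booster:
+    # Stratified split on the label, then independent negative subsampling.
+    train_df, val_df = train_test_split(
+        frame, test_size=0.2, stratify=frame["label"], random_state=42
+    )
+    train_df, val_df = subsample_negatives(train_df), subsample_negatives(val_df)
+
+    def to_dataset(df: pd.DataFrame, reference: lgb.Dataset | None = None) -> lgb.Dataset:
+        X = df.drop(columns=["label", "go_term"])
+        X[CATEGORICALS] = X[CATEGORICALS].astype("category")  # native categorical handling
+        # IA sample weighting; terms without an IA value get zero weight here,
+        # the real handling may differ.
+        weight = df["go_term"].map(ia_weights).fillna(0.0) if ia_weights else None
+        return lgb.Dataset(X, label=df["label"], weight=weight,
+                           categorical_feature=CATEGORICALS, reference=reference)
+
+    params = {
+        "objective": "binary",
+        "metric": ["auc", "binary_logloss"],  # early stopping tracked on AUC (first metric)
+        "boosting": "gbdt",
+        "num_leaves": 31,
+        "learning_rate": 0.01,
+        "feature_fraction": 0.8,
+        "bagging_fraction": 0.8,
+        "bagging_freq": 5,
+        "seed": 42,
+    }
+    callbacks = []
+    if early_stopping_rounds:
+        callbacks.append(lgb.early_stopping(early_stopping_rounds, first_metric_only=True))
+    train_set = to_dataset(train_df)
+    return lgb.train(
+        params, train_set, num_boost_round=num_boost_round,
+        valid_sets=[to_dataset(val_df, reference=train_set)], callbacks=callbacks,
+    )
+```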
+ +### 6.5 Budget + +| Version | `num_boost_round` | `early_stopping_rounds` | Comment | +|---|---|---|---| +| v1 | 300 | 50 | 6/9 models hit iter=1 (early stop on first round) — under-trained, unbalanced | +| v2 | 1000 | 50 | Stable; per-tier models; IA weighting introduced | +| v3 | 1000 | 50 | Same budget; alignment + taxonomy features fully populated in training (were NULL in v2) | +| v4 | **5000** | **100** | In progress 2026-04-10: all 6 v3 models hit `best_iteration ≈ 1000` — implying they never converged under the previous budget. v4 restores early stopping as a convergence criterion, not a time-out. | --- -## Pipeline de Datos: WebDataset +## 7. Integration with the PROTEA pipeline -El volumen de datos (múltiples splits × ~1.35M predicciones por split × embeddings de 480 dim) requiere un pipeline de datos eficiente. Se propone almacenar los ejemplos de entrenamiento en formato **WebDataset** (shards tar), con un shard por split GOA: +### 7.1 ORM and persistence -``` -reranker_data/ - splits/ - goa190_to_195.tar # ~2GB por shard - goa195_to_200.tar - ... - goa220_to_229.tar # test split — no tocar durante training - models/ - reranker_v1.pt - reranker_v1_config.json -``` +- **`Reranker` row** (table: `rerankers`) — stores the trained LightGBM booster serialised as bytes alongside training metadata (`feature_importance`, `val_auc`, `best_iteration`, `train_samples`, hyperparameters, parent `job_id`). +- **`RerankerTrainingJob`** row captures the auto-pipeline metadata (splits used, features computed, per-tier model IDs). + +### 7.2 Scoring router + +The `scoring` router exposes endpoints to list and inspect rerankers: +- `GET /scoring/rerankers` — list trained rerankers +- `GET /scoring/rerankers/{id}` — metadata + feature importance -Cada muestra en el WebDataset es **una proteína query** con todas sus predicciones GO para ese split: +### 7.3 Applying the re-ranker at evaluation time -```python +At evaluation time (`run_cafa_evaluation`), the caller supplies a `rerankers` mapping that selects a re-ranker per tier: + +```json { - "query_accession": "P12345", - "query_embedding": float32[480], - "go_term_ids": ["GO:0006915", "GO:0005737", ...], # N_preds - "neighbor_embeddings": float32[N_preds, K, 480], - "tabular_features": float32[N_preds, K, F], - "labels": int8[N_preds], # 1 si en delta, 0 si no + "rerankers": { + "nk": {"reranker_id": "2ff1818f-71b6-4932-8f8d-b3000e3c8d34"}, + "lk": {"reranker_id": "269e26b4-0bec-42fa-a077-fe5b675dd2de"}, + "pk": {"reranker_id": "e14b9716-bbf8-4b99-b34b-b801c3966579"} + } } ``` -El streaming de WebDataset permite entrenar sin cargar todo en RAM. +The evaluation operation: +1. Streams predictions from the target `PredictionSet` tier by tier. +2. For each tier, loads the corresponding booster, applies it to the feature matrix, and overrides the original `score` with the re-ranked probability. +3. Feeds the re-ranked predictions to `cafaeval` with IA weighting and emits per-cell Fmax. ---- +The raw `PredictionSet` is never mutated — the re-ranker only changes the `score` column as the rows pass through evaluation. This means a single prediction set can be evaluated under multiple re-rankers (ESMC, ProstT5, v3, v4, …) without duplicating storage. 
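+
+As a concrete illustration of step 2, a minimal sketch of how one tier's rows could be re-scored with its booster. The loading of the stored model and the column handling are simplified here; the actual logic lives in `protea/core/reranker.py` and the evaluation operation:
+
+```python
+import lightgbm as lgb
+import pandas as pd
+
+CATEGORICALS = ["qualifier", "evidence_code", "taxonomic_relation"]
+
+
+def rescore_tier(predictions: pd.DataFrame, model_text: str) -> pd.DataFrame:
+    """Override the retrieval score with the re-ranked probability for one tier."""
+    booster = lgb.Booster(model_str=model_text)
+    features = predictions[booster.feature_name()].copy()
+    for col in CATEGORICALS:
+        if col in features.columns:
+            features[col] = features[col].astype("category")
+    rescored = predictions.copy()
+    # Only the in-memory score column changes; the stored PredictionSet rows
+    # are never mutated.
+    rescored["score"] = booster.predict(features)
+    return rescored
+```
+
+In the real operation this is applied tier by tier, selecting the booster through the `rerankers` mapping shown above, before the rows are handed to `cafaeval`.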
-## Stack Tecnológico +### 7.4 `train_reranker_auto` operation -| Componente | Tecnología | -|---|---| -| Modelo | PyTorch | -| Data pipeline | WebDataset + torch.utils.data | -| Baseline comparación | LightGBM (binary + LambdaRank) | -| GO embeddings | Node2Vec / PyTorch Geometric | -| Seguimiento experimentos | wandb | -| Embeddings proteína | ESM2 / ESMC (ya en PROTEA) | +The operation `train_reranker_auto` orchestrates the full pipeline end-to-end: ---- +1. For each training pair, runs KNN retrieval (FAISS IVFFlat by default) with `compute_alignments=True`, `compute_taxonomy=True`. +2. Writes per-pair parquet files into a temporary directory. +3. Loads the concatenation into memory, applies per-tier splits, trains three LightGBM boosters. +4. Persists the three boosters as `Reranker` rows under a common base name. +5. Optionally runs a self-evaluation on the held-out test split (see warning in §8). +6. **Cleans up the temporary parquet files** on exit (`shutil.rmtree(tmp_dir)` at `train_reranker.py:1480`). + +The cleanup in step 6 has an important consequence: **re-training only the LightGBM stage is not possible** after a pipeline run — a re-train requires re-executing the full KNN + feature engineering path. This is why each v-version re-train takes hours, not minutes. -## Integración en PROTEA +--- -Una vez entrenado, el re-ranker se integra en el pipeline existente: +## 8. Known limitations and caveats -1. Nuevo modelo ORM `RerankingModel`: almacena pesos serializados y metadata de entrenamiento -2. Campo `reranker_id` (nullable) en `PredictionSet` -3. Si `reranker_id` presente: `store_predictions` aplica el modelo y sobreescribe `score` con $\hat{y}$ -4. El threshold de Fmax se calcula igual que ahora sobre los nuevos scores -5. UI: selector de re-ranker en la pantalla de predicción +1. **`test_evaluation` is not comparable to `cafaeval`.** The operation optionally runs an internal test evaluator against the held-out split. That evaluator does not apply GO propagation, does not apply IA weighting, and uses a naive macro-Fmax that inflates improvements by +0.04 to +0.08 over what `cafaeval` actually reports. **It must not be used in thesis claims.** Only `run_cafa_evaluation` with IA and GO propagation produces numbers that belong in the thesis. +2. **Binary objective is a proxy for ranking.** Binary cross-entropy optimises pointwise calibration, not ranking quality. This is the single largest known gap between the current implementation and the ideal model for Fmax. Replacing it with LambdaRank (or a listwise objective) is the first item on the "future work" list. +3. **Parquet staging files are ephemeral.** The KNN + feature engineering output is thrown away at the end of a training run, so the LightGBM stage cannot be iterated independently. Persisting the staging parquet (behind a flag) would allow rapid hyperparameter sweeps. Open question: is the additional disk cost (10–20 GB per run) worth it? +4. **No aspect feature.** Aspect is not used as a feature, even though BPO/MFO/CCO have very different annotation densities and the same term can behave differently across aspects. A per-tier model averages across aspects and may under-perform in MFO vs BPO. +5. **No uncertainty output.** The re-ranker emits a point probability. Downstream evaluation is sensitive to calibration, but calibration is not currently measured. A reliability diagram per tier would help diagnose whether the probabilities are meaningful or only usable for ranking. +6. 
**Under-training of v1–v3.** All six v3 models (ESMC and ProstT5, NK/LK/PK) hit `best_iteration ≈ 1000` at the previous budget, which indicates the models never satisfied the early stopping criterion. The Fmax deltas derived from v3 must be treated as provisional until v4 completes. See `project_reranker_benchmark.md` for the full story. +7. **Temporal label noise.** Some annotations in $\Delta_{N \to N+1}$ are not genuinely "new biology"; they are curation catch-ups. There is no filter for this, so the training label includes noise. Evidence code filtering removes the worst offenders (IEA) but not all. +8. **Single embedding at a time.** The re-ranker is trained on features derived from one embedding configuration. There is no multi-embedding ensemble; comparing ESMC, ProstT5 and Ankh means training three independent re-rankers — which is exactly what the benchmark in `EXPERIMENTAL_DESIGN.md` does. --- -## Experimentos y Ablaciones +## 9. Version history -El diseño permite comparar directamente: +| Version | Date | Change | Outcome | +|---|---|---|---| +| v1 (unbalanced) | 2026-03-22 | First working pipeline: 9 per-(tier, aspect) models, binary CE, 300 rounds, no sample weights, no negative subsampling | 6/9 models never learned (positive rate too low); CCO/MFO noisy | +| v1 (balanced) | 2026-03-22 | Added `neg_pos_ratio=10`; same 9 models | All models learned; BPO recovered; MFO degraded vs heuristic | +| v2 | 2026-03-23 | Collapsed to 3 per-tier models (NK/LK/PK); added IA sample weighting; raised `num_boost_round` to 1000 | Robust; matched the heuristic `alignment_weighted` in most cells but did not beat it | +| v3 | 2026-03-23 | Populated alignment + taxonomy features during training (were NULL in v2) | First version to beat `alignment_weighted` in 7/9 cells for ESMC-300M | +| v3 ProstT5 | 2026-04-10 | Same v3 protocol, run on ProstT5-XL embeddings for cross-embedding comparison | Yielded the F1/F2/F3 findings in `project_reranker_benchmark.md`; exposed the under-training in v3 | +| v4 (in progress) | 2026-04-10 | Raised `num_boost_round` to 5000 and `early_stopping_rounds` to 100; same features, same splits | In training for both ESMC-300M and ProstT5-XL (jobs `48c91381`, `e923ac70`); meant to provide the converged reference numbers | -| Configuración | Descripción | -|---|---| -| **Baseline** | KNN + scoring heurístico actual | -| **LightGBM tabular** | Re-ranker con features tabulares sin embeddings | -| **LightGBM + derived** | Features tabulares + features derivadas del embedding (density, std) | -| **MLP cross-encoder** | Arquitectura completa sin cross-attention | -| **Cross-attention (propuesto)** | Arquitectura completa | -| **+ GO DAG embeddings** | Ablación: ¿aportan los go_term_emb? | -| **+ temporal CV** | Ablación: ¿mejora añadir más splits históricos? | +Concrete reranker UUIDs for the v3 and v4 runs live in `project_reranker_benchmark.md` and will be mirrored into `EXPERIMENTS.md` once v4 completes. + +--- + +## 10. Forward pointers -La métrica principal es **Fmax promedio sobre los 9 settings** (NK/LK/PK × BPO/MFO/CCO) en el test split GOA220→229. +- **`EXPERIMENTS.md`** — per-experiment tables, external tool comparisons, day-to-day lab notebook. +- **`EXPERIMENTAL_DESIGN.md`** — the prospective 8-model PLM comparison that uses this re-ranker as a fixed downstream stage. +- **`project_reranker_benchmark.md`** (in auto-memory) — volatile working state for the ongoing benchmark. 
+- **Code**: `protea/core/reranker.py` (feature definitions, `train`, `predict_scores`), `protea/core/operations/train_reranker.py` (both `TrainRerankerPayload` and `TrainRerankerAutoPayload`, the full pipeline). --- -## Valor para la Tesis +## 11. Historical note: why this file was rewritten -1. **Científicamente honesto**: el mismo mecanismo temporal que se usa para evaluar se usa para entrenar. No hay data leakage. -2. **Comprobable y cuantificable**: Fmax(baseline KNN) vs Fmax(re-ranker) en benchmark idéntico. -3. **Interpretable**: las feature importances (LightGBM) o los pesos de atención (cross-attention) revelan qué aspectos de una predicción KNN son más predictivos de anotaciones futuras. -4. **Generalizable**: el re-ranker aprende sobre distribuciones temporales de anotaciones GO, no sobre una proteína concreta — debería generalizar a proteínas no vistas. -5. **Extensible**: la arquitectura admite incorporar embeddings de secuencia de mayor calidad (ESM3, ProstT5) sin cambiar el pipeline. +The previous version of `RERANKER.md` (removed 2026-04-10) proposed a PyTorch cross-attention re-ranker over ESM embeddings with WebDataset sharded I/O, Node2Vec GO term embeddings, wandb tracking, and a nine-cell (tier × aspect) ablation matrix. That design was never built. The system that actually exists and produces the benchmark numbers in `EXPERIMENTS.md` is the LightGBM pipeline documented above. Keeping the two in sync was causing confusion when referring back to the design doc during thesis writing, so the document was rewritten from the current source of truth (`protea/core/reranker.py`) rather than from the original proposal. The historical proposal is preserved in git history for reference. diff --git a/alembic/versions/651358a5a2c8_add_consensus_features_to_go_prediction.py b/alembic/versions/651358a5a2c8_add_consensus_features_to_go_prediction.py new file mode 100644 index 0000000..61820ad --- /dev/null +++ b/alembic/versions/651358a5a2c8_add_consensus_features_to_go_prediction.py @@ -0,0 +1,37 @@ +"""add consensus features to go_prediction + +Revision ID: 651358a5a2c8 +Revises: b1a1f4ec0e42 +Create Date: 2026-04-16 10:00:00.000000 +""" +from __future__ import annotations + +import sqlalchemy as sa + +from alembic import op + +revision: str = "651358a5a2c8" +down_revision: str = "b1a1f4ec0e42" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + op.add_column( + "go_prediction", + sa.Column("neighbor_vote_fraction", sa.Float(), nullable=True), + ) + op.add_column( + "go_prediction", + sa.Column("neighbor_min_distance", sa.Float(), nullable=True), + ) + op.add_column( + "go_prediction", + sa.Column("neighbor_mean_distance", sa.Float(), nullable=True), + ) + + +def downgrade() -> None: + op.drop_column("go_prediction", "neighbor_mean_distance") + op.drop_column("go_prediction", "neighbor_min_distance") + op.drop_column("go_prediction", "neighbor_vote_fraction") diff --git a/alembic/versions/7a2c9e1d0b33_add_reranker_v6_features_to_go_prediction.py b/alembic/versions/7a2c9e1d0b33_add_reranker_v6_features_to_go_prediction.py new file mode 100644 index 0000000..c19a3b6 --- /dev/null +++ b/alembic/versions/7a2c9e1d0b33_add_reranker_v6_features_to_go_prediction.py @@ -0,0 +1,55 @@ +"""add reranker v6 features to go_prediction + +Adds 25 nullable Float columns used by the v6 reranker: + +- 6 Anc2Vec semantic-coherence features (neighbor + query-known). +- 3 tax_voters consensus features (computed over the subset of neighbors that + voted for each candidate term). 
+- 16 emb_pca_query_* features (per-query projection onto the precomputed + principal components of the reference embedding pool). + +All columns are nullable because older prediction_sets predate these features +and older reranker versions do not read them. + +Revision ID: 7a2c9e1d0b33 +Revises: 651358a5a2c8 +Create Date: 2026-04-19 12:00:00.000000 +""" +from __future__ import annotations + +import sqlalchemy as sa + +from alembic import op + +revision: str = "7a2c9e1d0b33" +down_revision: str = "651358a5a2c8" +branch_labels = None +depends_on = None + + +_ANC2VEC_COLS = ( + "anc2vec_neighbor_cos", + "anc2vec_neighbor_maxcos", + "anc2vec_has_emb", + "anc2vec_query_known_cos", + "anc2vec_query_known_maxcos", + "anc2vec_query_known_count", +) + +_TAX_VOTERS_COLS = ( + "tax_voters_same_frac", + "tax_voters_close_frac", + "tax_voters_mean_common_ancestors", +) + +_EMB_PCA_COLS = tuple(f"emb_pca_query_{i}" for i in range(16)) + + +def upgrade() -> None: + for col in (*_ANC2VEC_COLS, *_TAX_VOTERS_COLS, *_EMB_PCA_COLS): + op.add_column("go_prediction", sa.Column(col, sa.Float(), nullable=True)) + + +def downgrade() -> None: + for col in reversed((*_ANC2VEC_COLS, *_TAX_VOTERS_COLS, *_EMB_PCA_COLS)): + op.drop_column("go_prediction", col) diff --git a/alembic/versions/b1a1f4ec0e42_sequence_embedding_to_halfvec.py b/alembic/versions/b1a1f4ec0e42_sequence_embedding_to_halfvec.py new file mode 100644 index 0000000..e8927eb --- /dev/null +++ b/alembic/versions/b1a1f4ec0e42_sequence_embedding_to_halfvec.py @@ -0,0 +1,54 @@ +"""migrate sequence_embedding.embedding from vector to halfvec + +Revision ID: b1a1f4ec0e42 +Revises: f7a004f5f2c7 +Create Date: 2026-04-14 22:00:00.000000 +""" +from __future__ import annotations + +from alembic import op + +revision: str = "b1a1f4ec0e42" +down_revision: str = "f7a004f5f2c7" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + op.execute( + """ + DO $$ + BEGIN + IF ( + SELECT udt_name + FROM information_schema.columns + WHERE table_name = 'sequence_embedding' + AND column_name = 'embedding' + ) = 'vector' THEN + ALTER TABLE sequence_embedding + ALTER COLUMN embedding TYPE halfvec + USING embedding::halfvec; + END IF; + END $$; + """ + ) + + +def downgrade() -> None: + op.execute( + """ + DO $$ + BEGIN + IF ( + SELECT udt_name + FROM information_schema.columns + WHERE table_name = 'sequence_embedding' + AND column_name = 'embedding' + ) = 'halfvec' THEN + ALTER TABLE sequence_embedding + ALTER COLUMN embedding TYPE vector + USING embedding::vector; + END IF; + END $$; + """ + ) diff --git a/alembic/versions/b2c3d4e5f6a7_add_embedding_config_display_metadata.py b/alembic/versions/b2c3d4e5f6a7_add_embedding_config_display_metadata.py new file mode 100644 index 0000000..b2ff521 --- /dev/null +++ b/alembic/versions/b2c3d4e5f6a7_add_embedding_config_display_metadata.py @@ -0,0 +1,40 @@ +"""add display metadata columns to embedding_config + +Revision ID: b2c3d4e5f6a7 +Revises: 3505bfa74df6 +Create Date: 2026-04-10 + +Adds three nullable columns to ``embedding_config`` so the benchmark UI can +show a human-readable label, a family tag, and the approximate parameter +count without having to infer everything from the raw HuggingFace +``model_name`` at render time. + +All columns are nullable — existing rows can be backfilled later with +``UPDATE embedding_config SET display_name = ..., family = ..., param_count = ...`` +or left as NULL (the router falls back to the Python-side derivation). 
+""" +from collections.abc import Sequence + +import sqlalchemy as sa + +from alembic import op + +# revision identifiers, used by Alembic. +revision: str = 'b2c3d4e5f6a7' +down_revision: str | Sequence[str] | None = '3505bfa74df6' +branch_labels: str | Sequence[str] | None = None +depends_on: str | Sequence[str] | None = None + + +def upgrade() -> None: + """Upgrade schema.""" + op.add_column('embedding_config', sa.Column('display_name', sa.String(), nullable=True)) + op.add_column('embedding_config', sa.Column('family', sa.String(), nullable=True)) + op.add_column('embedding_config', sa.Column('param_count', sa.BigInteger(), nullable=True)) + + +def downgrade() -> None: + """Downgrade schema.""" + op.drop_column('embedding_config', 'param_count') + op.drop_column('embedding_config', 'family') + op.drop_column('embedding_config', 'display_name') diff --git a/alembic/versions/c4d5e6f7a8b9_add_taxonomy_to_query_set_entry.py b/alembic/versions/c4d5e6f7a8b9_add_taxonomy_to_query_set_entry.py new file mode 100644 index 0000000..438cda9 --- /dev/null +++ b/alembic/versions/c4d5e6f7a8b9_add_taxonomy_to_query_set_entry.py @@ -0,0 +1,43 @@ +"""add taxonomy_id and species to query_set_entry + +Revision ID: c4d5e6f7a8b9 +Revises: b2c3d4e5f6a7 +Create Date: 2026-04-11 + +Adds two nullable columns to ``query_set_entry`` so user-uploaded FASTA +sequences can carry their UniProt header taxonomy (``OX=`` / ``OS=``) even +when the accession is not present in the ``protein`` table and therefore +has no ``ProteinUniProtMetadata`` counterpart. + +The populating helper lives in ``protea.api.routers.query_sets`` and is a +silent no-op for non-UniProt headers. +""" +from collections.abc import Sequence + +import sqlalchemy as sa + +from alembic import op + +# revision identifiers, used by Alembic. +revision: str = 'c4d5e6f7a8b9' +down_revision: str | Sequence[str] | None = 'b2c3d4e5f6a7' +branch_labels: str | Sequence[str] | None = None +depends_on: str | Sequence[str] | None = None + + +def upgrade() -> None: + """Upgrade schema.""" + op.add_column('query_set_entry', sa.Column('taxonomy_id', sa.Integer(), nullable=True)) + op.add_column('query_set_entry', sa.Column('species', sa.String(), nullable=True)) + op.create_index( + 'ix_query_set_entry_taxonomy_id', + 'query_set_entry', + ['taxonomy_id'], + ) + + +def downgrade() -> None: + """Downgrade schema.""" + op.drop_index('ix_query_set_entry_taxonomy_id', table_name='query_set_entry') + op.drop_column('query_set_entry', 'species') + op.drop_column('query_set_entry', 'taxonomy_id') diff --git a/alembic/versions/c517e16da06b_reranker_model_artifact_columns.py b/alembic/versions/c517e16da06b_reranker_model_artifact_columns.py new file mode 100644 index 0000000..eeeaf93 --- /dev/null +++ b/alembic/versions/c517e16da06b_reranker_model_artifact_columns.py @@ -0,0 +1,81 @@ +"""reranker_model_artifact_columns + +Revision ID: c517e16da06b +Revises: 7a2c9e1d0b33 +Create Date: 2026-04-21 02:57:27.951747 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. 
+revision: str = 'c517e16da06b' +down_revision: Union[str, Sequence[str], None] = '7a2c9e1d0b33' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + """Upgrade schema.""" + op.add_column('reranker_model', sa.Column('artifact_uri', sa.String(length=512), nullable=True)) + op.add_column('reranker_model', sa.Column('feature_schema_sha', sa.String(length=16), nullable=True)) + op.add_column('reranker_model', sa.Column('embedding_config_id', sa.UUID(), nullable=True)) + op.add_column('reranker_model', sa.Column('ontology_snapshot_id', sa.UUID(), nullable=True)) + op.add_column('reranker_model', sa.Column('producer_version', sa.String(length=64), nullable=True)) + op.add_column('reranker_model', sa.Column('producer_git_sha', sa.String(length=40), nullable=True)) + op.add_column('reranker_model', sa.Column('spec_yaml', sa.Text(), nullable=True)) + # model_data goes nullable so new rows can live exclusively by reference + # (artifact_uri). Downgrade restores NOT NULL — will fail loudly if any + # row has a NULL model_data, which is the correct behavior. + op.alter_column( + 'reranker_model', 'model_data', + existing_type=sa.TEXT(), + nullable=True, + ) + op.create_index( + op.f('ix_reranker_model_embedding_config_id'), + 'reranker_model', ['embedding_config_id'], unique=False, + ) + op.create_index( + op.f('ix_reranker_model_ontology_snapshot_id'), + 'reranker_model', ['ontology_snapshot_id'], unique=False, + ) + op.create_foreign_key( + 'fk_reranker_model_ontology_snapshot_id', + 'reranker_model', 'ontology_snapshot', + ['ontology_snapshot_id'], ['id'], ondelete='SET NULL', + ) + op.create_foreign_key( + 'fk_reranker_model_embedding_config_id', + 'reranker_model', 'embedding_config', + ['embedding_config_id'], ['id'], ondelete='SET NULL', + ) + + +def downgrade() -> None: + """Downgrade schema.""" + op.drop_constraint( + 'fk_reranker_model_embedding_config_id', + 'reranker_model', type_='foreignkey', + ) + op.drop_constraint( + 'fk_reranker_model_ontology_snapshot_id', + 'reranker_model', type_='foreignkey', + ) + op.drop_index(op.f('ix_reranker_model_ontology_snapshot_id'), table_name='reranker_model') + op.drop_index(op.f('ix_reranker_model_embedding_config_id'), table_name='reranker_model') + op.alter_column( + 'reranker_model', 'model_data', + existing_type=sa.TEXT(), + nullable=False, + ) + op.drop_column('reranker_model', 'spec_yaml') + op.drop_column('reranker_model', 'producer_git_sha') + op.drop_column('reranker_model', 'producer_version') + op.drop_column('reranker_model', 'ontology_snapshot_id') + op.drop_column('reranker_model', 'embedding_config_id') + op.drop_column('reranker_model', 'feature_schema_sha') + op.drop_column('reranker_model', 'artifact_uri') diff --git a/alembic/versions/f7a004f5f2c7_add_visitor_events.py b/alembic/versions/f7a004f5f2c7_add_visitor_events.py new file mode 100644 index 0000000..96f90f9 --- /dev/null +++ b/alembic/versions/f7a004f5f2c7_add_visitor_events.py @@ -0,0 +1,40 @@ +"""add visitor_event table + +Revision ID: f7a004f5f2c7 +Revises: c4d5e6f7a8b9 +Create Date: 2026-04-12 20:50:00.000000 +""" +from __future__ import annotations + +import sqlalchemy as sa + +from alembic import op + +revision: str = "f7a004f5f2c7" +down_revision: str = "c4d5e6f7a8b9" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + op.create_table( + "visitor_event", + sa.Column("id", sa.BigInteger(), autoincrement=True, nullable=False), + sa.Column("day", sa.Date(), 
nullable=False), + sa.Column("visitor_hash", sa.String(length=16), nullable=False), + sa.Column("path", sa.String(length=255), nullable=False), + sa.Column("method", sa.String(length=8), nullable=False), + sa.Column("status", sa.Integer(), nullable=False), + sa.Column("created_at", sa.DateTime(timezone=True), nullable=False), + sa.PrimaryKeyConstraint("id"), + ) + op.create_index("ix_visitor_event_day_hash", "visitor_event", ["day", "visitor_hash"]) + op.create_index("ix_visitor_event_created_at", "visitor_event", ["created_at"]) + op.create_index("ix_visitor_event_path", "visitor_event", ["path"]) + + +def downgrade() -> None: + op.drop_index("ix_visitor_event_path", table_name="visitor_event") + op.drop_index("ix_visitor_event_created_at", table_name="visitor_event") + op.drop_index("ix_visitor_event_day_hash", table_name="visitor_event") + op.drop_table("visitor_event") diff --git a/apps/web/app/[locale]/benchmark/page.tsx b/apps/web/app/[locale]/benchmark/page.tsx new file mode 100644 index 0000000..13c2d1d --- /dev/null +++ b/apps/web/app/[locale]/benchmark/page.tsx @@ -0,0 +1,563 @@ +"use client"; + +import { useEffect, useMemo, useState } from "react"; +import Link from "next/link"; +import { + getBenchmarkEmbeddings, + getBenchmarkMatrix, + type BenchmarkBestCell, + type BenchmarkEmbedding, + type BenchmarkEvalSet, + type BenchmarkMatrixResponse, + type BenchmarkRow, + type BenchmarkStage, +} from "../../../lib/api"; + +// ── Helpers ────────────────────────────────────────────────────────────── + +function formatParams(n: number | null): string { + if (n == null) return ""; + if (n >= 1_000_000_000) { + const v = n / 1_000_000_000; + return v >= 10 ? `${Math.round(v)}B` : `${v.toFixed(1)}B`; + } + if (n >= 1_000_000) return `${Math.round(n / 1_000_000)}M`; + return `${n}`; +} + +function formatProteins(n: number | undefined): string { + if (n == null) return ""; + if (n >= 1_000) return `${(n / 1_000).toFixed(1)}k`; + return String(n); +} + +function cellKey(eid: string, cat: string, asp: string): string { + return `${eid}|${cat}|${asp}`; +} + +/** Index rows by (embedding, cat, asp) for O(1) cell lookup. The matrix + * endpoint already dedupes to a single best row per tuple. */ +function indexRows(rows: BenchmarkRow[]): Map { + const out = new Map(); + for (const r of rows) { + out.set(cellKey(r.embedding_config_id, r.category, r.aspect), r); + } + return out; +} + +/** Index the leaderboard by (cat, asp) so the table can highlight winners. */ +function indexBestPerCell(cells: BenchmarkBestCell[]): Map { + const out = new Map(); + for (const c of cells) { + out.set(`${c.category}|${c.aspect}`, c); + } + return out; +} + +function stageLabel(stages: BenchmarkStage[], name: string): string { + return stages.find((s) => s.name === name)?.label ?? name; +} + +function evalSetLabel(evalSets: BenchmarkEvalSet[], id: string): string { + return evalSets.find((e) => e.id === id)?.label ?? `${id.slice(0, 8)}…`; +} + +/** Pick the initial stage once the catalog is loaded. Backend already + * returns stages sorted by YAML preferred_default_stages, so the first + * entry IS the preferred one if it has data. */ +function pickDefaultStage(stages: BenchmarkStage[]): string | null { + return stages.length > 0 ? stages[0].name : null; +} + +/** CSV export of the currently filtered rows — one line per cell. 
*/ +function rowsToCsv( + embeddings: BenchmarkEmbedding[], + rows: BenchmarkRow[], + stage: string, +): string { + const embById = new Map(embeddings.map((e) => [e.id, e])); + const header = [ + "display_name", + "family", + "param_count", + "model_name", + "stage", + "category", + "aspect", + "fmax", + "precision", + "recall", + "coverage", + "n_proteins", + "evaluation_set_id", + "evaluation_result_id", + ].join(","); + const lines = [header]; + for (const r of rows) { + if (r.stage !== stage) continue; + const e = embById.get(r.embedding_config_id); + lines.push( + [ + e?.display_name ?? "", + e?.family ?? "", + e?.param_count ?? "", + e?.model_name ?? "", + r.stage, + r.category, + r.aspect, + r.fmax, + r.precision ?? "", + r.recall ?? "", + r.coverage ?? "", + r.n_proteins ?? "", + r.evaluation_set_id, + r.evaluation_result_id, + ] + .map((v) => { + const s = String(v); + if (/[,"\n]/.test(s)) return `"${s.replace(/"/g, '""')}"`; + return s; + }) + .join(","), + ); + } + return lines.join("\n"); +} + +function downloadCsv(filename: string, content: string): void { + const blob = new Blob([content], { type: "text/csv;charset=utf-8" }); + const url = URL.createObjectURL(blob); + const a = document.createElement("a"); + a.href = url; + a.download = filename; + document.body.appendChild(a); + a.click(); + document.body.removeChild(a); + URL.revokeObjectURL(url); +} + +// ── Page ───────────────────────────────────────────────────────────────── + +export default function BenchmarkPage() { + const [embeddings, setEmbeddings] = useState(null); + const [matrix, setMatrix] = useState(null); + const [error, setError] = useState(null); + const [stage, setStage] = useState(null); + const [evalSetId, setEvalSetId] = useState("all"); + + // Unfiltered catalog fetch — populates the full set of known stages and + // eval sets, so selector chips don't disappear when a filtered query + // returns zero rows. + const [catalog, setCatalog] = useState<{ + stages: BenchmarkStage[]; + evalSets: BenchmarkEvalSet[]; + categories: string[]; + aspects: string[]; + }>({ stages: [], evalSets: [], categories: [], aspects: [] }); + + useEffect(() => { + getBenchmarkMatrix() + .then((m) => { + setCatalog({ + stages: m.stages, + evalSets: m.evaluation_sets, + categories: m.categories, + aspects: m.aspects, + }); + setStage((prev) => prev ?? pickDefaultStage(m.stages)); + }) + .catch((e) => setError(e.message)); + }, []); + + useEffect(() => { + if (stage === null) return; + setError(null); + Promise.all([ + getBenchmarkEmbeddings(), + getBenchmarkMatrix({ + stage, + evaluation_set_id: evalSetId === "all" ? undefined : evalSetId, + }), + ]) + .then(([e, m]) => { + setEmbeddings(e.embeddings); + setMatrix(m); + }) + .catch((e) => setError(e.message)); + }, [stage, evalSetId]); + + const rowIndex = useMemo( + () => (matrix ? indexRows(matrix.rows) : new Map()), + [matrix], + ); + + const bestPerCell = useMemo( + () => (matrix ? indexBestPerCell(matrix.best_per_cell) : new Map()), + [matrix], + ); + + const embeddingsWithData = useMemo(() => { + if (!embeddings || !matrix) return new Set(); + return new Set(matrix.embedding_config_ids); + }, [embeddings, matrix]); + + if (error) { + return ( +
+
+

{error}

+
+
+ ); + } + + if (!embeddings || !matrix || stage === null) { + return ( +
+
+
+
+ ); + } + + const hasData = matrix.rows.length > 0; + const stageList = catalog.stages.length > 0 ? catalog.stages : matrix.stages; + const evalSetList = catalog.evalSets.length > 0 ? catalog.evalSets : matrix.evaluation_sets; + const categories = catalog.categories.length > 0 ? catalog.categories : matrix.categories; + const aspects = catalog.aspects.length > 0 ? catalog.aspects : matrix.aspects; + const currentStageLabel = stageLabel(stageList, stage); + + // Active eval set banner: when "all" is selected and there's only one set, + // show that one; when a specific one is selected, show its full metadata. + const activeEvalSet = + evalSetId !== "all" + ? evalSetList.find((e) => e.id === evalSetId) ?? null + : evalSetList.length === 1 + ? evalSetList[0] + : null; + + return ( +
+ {/* Header */} +
+
+

Benchmark matrix

+

+ Per-embedding Fmax across categories and aspects for every evaluation + run in the database.{" "} + + Back to home + +

+
+
+ +
+
+ + {/* Eval set context banner */} + {activeEvalSet && ( +
+
+
+ + Evaluation split + +
+ {activeEvalSet.label} +
+
+
+ {activeEvalSet.stats.delta_proteins != null && ( + + Δ{" "} + + {activeEvalSet.stats.delta_proteins.toLocaleString()} + {" "} + proteins + + )} + {activeEvalSet.stats.nk_proteins != null && ( + + NK{" "} + + {formatProteins(activeEvalSet.stats.nk_proteins)} + + + )} + {activeEvalSet.stats.lk_proteins != null && ( + + LK{" "} + + {formatProteins(activeEvalSet.stats.lk_proteins)} + + + )} + {activeEvalSet.stats.pk_proteins != null && ( + + PK{" "} + + {formatProteins(activeEvalSet.stats.pk_proteins)} + + + )} + {activeEvalSet.new_obo_version && ( + + OBO{" "} + {activeEvalSet.new_obo_version} + + )} +
+
+
+ )} + + {/* Filters */} +
+
+ +
+ {stageList.map((s) => ( + + ))} +
+
+ + {evalSetList.length > 1 && ( +
+ + +
+ )} + +
+ {matrix.total} cells · {matrix.embedding_config_ids.length} embeddings ·{" "} + {matrix.evaluation_sets.length} eval set + {matrix.evaluation_sets.length === 1 ? "" : "s"} +
+
+ + {/* Leaderboard: best Fmax per (cat, asp) across every model & stage */} + {matrix.best_per_cell.length > 0 && ( +
+
+

+ Best Fmax per cell + + across every model in current stage filter + +

+
+
+ + + + + {aspects.map((asp) => ( + + ))} + + + + {categories.map((cat) => ( + + + {aspects.map((asp) => { + const best = bestPerCell.get(`${cat}|${asp}`); + if (!best) { + return ( + + ); + } + const emb = embeddings.find((e) => e.id === best.embedding_config_id); + return ( + + ); + })} + + ))} + +
+ {asp} +
{cat} + — + +
+ {best.fmax.toFixed(3)} +
+
+ {emb?.display_name ?? "—"} +
+
+ {stageLabel(stageList, best.stage)} +
+
+
+
+ )} + + {/* Matrix table */} + {!hasData ? ( +
+

+ No evaluation results for{" "} + {currentStageLabel} yet. +

+

+ Run run_cafa_evaluation for an embedding to populate + this cell of the matrix. +

+
+ ) : ( +
+ + + + + {categories.map((cat) => ( + + ))} + + + {categories.flatMap((cat) => + aspects.map((asp) => ( + + )), + )} + + + + {embeddings.map((emb) => { + const hasRow = embeddingsWithData.has(emb.id); + return ( + + + {categories.flatMap((cat) => + aspects.map((asp) => { + const row = rowIndex.get(cellKey(emb.id, cat, asp)); + const best = bestPerCell.get(`${cat}|${asp}`); + const isWinner = + row && best && row.evaluation_result_id === best.evaluation_result_id; + return ( + + ); + }), + )} + + ); + })} + +
+ Embedding + + {cat} +
+ {asp} +
+
+ {emb.display_name} +
+
+ {emb.family} + {emb.param_count != null + ? ` · ${formatParams(emb.param_count)}` + : ""} +
+
+ {row ? ( + + {row.fmax.toFixed(3)} + + ) : ( + + )} +
+
+ )} + +

+ Display names and stage labels come from{" "} + embedding_config (DB) and{" "} + protea/config/benchmark.yaml. Edit the YAML to change + ordering, labels, or the baseline tag. +

+
+ ); +} diff --git a/apps/web/app/[locale]/embeddings/page.tsx b/apps/web/app/[locale]/embeddings/page.tsx index d3411d0..a45eaee 100644 --- a/apps/web/app/[locale]/embeddings/page.tsx +++ b/apps/web/app/[locale]/embeddings/page.tsx @@ -41,6 +41,10 @@ const MODEL_PRESETS: Record = { { value: "Rostlab/prot_t5_xl_half_uniref50-enc", label: "ProT5-XL half (FP16 encoder)", layers: 24, defaultMaxLength: 1024 }, { value: "Rostlab/ProstT5", label: "ProstT5 (3Di + AA)", layers: 24, defaultMaxLength: 1024 }, ], + ankh: [ + { value: "ElnaggarLab/ankh-base", label: "Ankh base (~450M, 48 layers, d=768)", layers: 48, defaultMaxLength: 1024 }, + { value: "ElnaggarLab/ankh-large", label: "Ankh large (~1.9B, 48 layers, d=1536)", layers: 48, defaultMaxLength: 1024 }, + ], auto: [ { value: "facebook/esm2_t33_650M_UR50D", label: "ESM-2 650M (auto backend)", layers: 33, defaultMaxLength: 1022 }, ], @@ -91,7 +95,7 @@ export default function EmbeddingsPage() { const [cmpConfigId, setCmpConfigId] = useState(""); const [cmpQuerySetId, setCmpQuerySetId] = useState(""); const [cmpQueueBatchSize, setCmpQueueBatchSize] = useState(100); - const [cmpBatchSize, setCmpBatchSize] = useState(8); + const [cmpBatchSize, setCmpBatchSize] = useState(1); const [cmpDevice, setCmpDevice] = useState("cuda"); const [cmpSkipExisting, setCmpSkipExisting] = useState(true); const [cmpResult, setCmpResult] = useState<{ id: string; status: string } | null>(null); @@ -306,6 +310,7 @@ export default function EmbeddingsPage() { +
diff --git a/apps/web/app/[locale]/evaluation/page.tsx b/apps/web/app/[locale]/evaluation/page.tsx index 25af270..f7ce0fb 100644 --- a/apps/web/app/[locale]/evaluation/page.tsx +++ b/apps/web/app/[locale]/evaluation/page.tsx @@ -274,6 +274,7 @@ function EvaluationSetCard({ const MAX_ATTEMPTS = 30; const interval = setInterval(async () => { + if (typeof document !== "undefined" && document.visibilityState === "hidden") return; attempts++; try { const fresh = await listResults(e.id); diff --git a/apps/web/app/[locale]/jobs/[id]/page.tsx b/apps/web/app/[locale]/jobs/[id]/page.tsx index 7d1d68f..08ec6bf 100644 --- a/apps/web/app/[locale]/jobs/[id]/page.tsx +++ b/apps/web/app/[locale]/jobs/[id]/page.tsx @@ -72,6 +72,7 @@ export default function JobDetail({ params }: { params: Promise<{ id: string }> const intervalRef = useRef | null>(null); async function refresh() { + if (typeof document !== "undefined" && document.visibilityState === "hidden") return; try { setError(""); const [j, ev, ch] = await Promise.all([ @@ -98,16 +99,24 @@ export default function JobDetail({ params }: { params: Promise<{ id: string }> refresh(); }, [jobId]); - // Auto-refresh while job is active + // Auto-refresh while job is active. Pauses when the tab is hidden and + // resumes on visibilitychange — avoids burning bandwidth on background tabs. useEffect(() => { if (!job) return; const isTerminal = TERMINAL.includes(String(job.status).toLowerCase()); - if (!isTerminal) { - intervalRef.current = setInterval(refresh, 2000); - } else { + if (isTerminal) { if (intervalRef.current) clearInterval(intervalRef.current); + return; } - return () => { if (intervalRef.current) clearInterval(intervalRef.current); }; + intervalRef.current = setInterval(refresh, 2000); + const onVisibility = () => { + if (document.visibilityState === "visible") refresh(); + }; + document.addEventListener("visibilitychange", onVisibility); + return () => { + if (intervalRef.current) clearInterval(intervalRef.current); + document.removeEventListener("visibilitychange", onVisibility); + }; }, [job?.status]); async function onDelete() { @@ -186,6 +195,15 @@ export default function JobDetail({ params }: { params: Promise<{ id: string }> {jobId}
+ {job.operation_description && ( +

{job.operation_description}

+ )} + {job.operation_summary && ( +

+ {job.operation_summary} +

+ )} +
{t("jobDetail.queue")} {job.queue_name}
{t("jobDetail.created")} {formatDate(job.created_at)}
diff --git a/apps/web/app/[locale]/jobs/page.tsx b/apps/web/app/[locale]/jobs/page.tsx index 7eff46a..3138a81 100644 --- a/apps/web/app/[locale]/jobs/page.tsx +++ b/apps/web/app/[locale]/jobs/page.tsx @@ -57,6 +57,7 @@ export default function JobsPage() { const intervalRef = useRef | null>(null); async function refresh(status = statusFilter, showLoader = false) { + if (!showLoader && typeof document !== "undefined" && document.visibilityState === "hidden") return; if (showLoader) setLoading(true); try { setError(""); @@ -75,7 +76,9 @@ export default function JobsPage() { refresh(statusFilter, true); }, [statusFilter]); - // Auto-refresh: faster when there are active jobs, slower otherwise + // Auto-refresh: faster when there are active jobs, slower otherwise. + // Pauses automatically when the tab is hidden (refresh() checks + // document.visibilityState) and forces a refresh on visibilitychange. useEffect(() => { if (!autoRefresh) { if (intervalRef.current) clearInterval(intervalRef.current); @@ -86,7 +89,14 @@ export default function JobsPage() { return hasActive ? 3000 : 8000; } intervalRef.current = setInterval(() => refresh(), schedule()); - return () => { if (intervalRef.current) clearInterval(intervalRef.current); }; + const onVisibility = () => { + if (document.visibilityState === "visible") refresh(); + }; + document.addEventListener("visibilitychange", onVisibility); + return () => { + if (intervalRef.current) clearInterval(intervalRef.current); + document.removeEventListener("visibilitychange", onVisibility); + }; }, [autoRefresh, statusFilter, jobs]); const activeCount = jobs.filter((j) => j.status === "running" || j.status === "queued").length; @@ -157,6 +167,12 @@ export default function JobsPage() { {formatDate(j.created_at)}

{j.operation}

+ {j.operation_description && ( +

{j.operation_description}

+ )} + {j.operation_summary && ( +

{j.operation_summary}

+ )}

{j.id}

@@ -165,10 +181,10 @@ export default function JobsPage() { {/* Desktop table */}
-
+
{t("status")}
{t("operation")}
-
{t("jobId")}
+
{t("operationContext")}
{t("created")}
@@ -180,14 +196,24 @@ export default function JobsPage() {
- {j.operation} + {j.operation} + {j.operation_description && ( + {j.operation_description} + )}
-
{j.id}
+
+ {j.operation_summary ? ( + {j.operation_summary} + ) : ( + + )} + {j.id} +
{formatDate(j.created_at)}
))} diff --git a/apps/web/app/[locale]/layout.tsx b/apps/web/app/[locale]/layout.tsx index 1fa1b1e..6f39907 100644 --- a/apps/web/app/[locale]/layout.tsx +++ b/apps/web/app/[locale]/layout.tsx @@ -29,8 +29,11 @@ export default async function LocaleLayout({ const { locale } = await params; const messages = await getMessages(); return ( - - + + diff --git a/apps/web/app/[locale]/page.tsx b/apps/web/app/[locale]/page.tsx index 9f3f9fd..120ae9d 100644 --- a/apps/web/app/[locale]/page.tsx +++ b/apps/web/app/[locale]/page.tsx @@ -8,11 +8,6 @@ import { getShowcase, type ShowcaseData } from "../../lib/api"; import { AnnotateForm } from "../../components/AnnotateForm"; const ASPECTS = ["MFO", "BPO", "CCO"] as const; -const ASPECT_COLORS: Record = { - MFO: "blue", - BPO: "green", - CCO: "purple", -}; const ASPECT_LABELS: Record = { MFO: "Molecular Function", BPO: "Biological Process", @@ -26,12 +21,6 @@ const CATEGORY_LABELS: Record = { PK: "Partial Knowledge", }; -const METHOD_KEYS: Record = { - knn_baseline: "knnBaseline", - knn_scored: "knnScored", - knn_reranker: "knnReranker", -}; - const STAGE_ICONS: Record = { sequences: "Aa", embeddings: "E", @@ -48,12 +37,30 @@ const STAGE_I18N: Record = { evaluations: "stageEvaluation", }; +const STAGE_LABELS: Record = { + baseline: "pipelineStageBaseline", + alignment_weighted: "pipelineStageAlignmentWeighted", + reranker: "pipelineStageReranker", +}; + +const STAGE_BADGE: Record = { + baseline: "bg-gray-100 text-gray-700", + alignment_weighted: "bg-amber-100 text-amber-800", + reranker: "bg-blue-100 text-blue-800", +}; + +function formatParamCount(n: number | null): string { + if (n == null) return ""; + if (n >= 1_000_000_000) return `${(n / 1_000_000_000).toFixed(n >= 10_000_000_000 ? 0 : 1)}B`; + if (n >= 1_000_000) return `${Math.round(n / 1_000_000)}M`; + return `${n}`; +} + export default function HomePage() { const t = useTranslations("home"); const router = useRouter(); const [data, setData] = useState(null); const [error, setError] = useState(null); - const [activeCategory, setActiveCategory] = useState("NK"); useEffect(() => { getShowcase().then(setData).catch((e) => setError(e.message)); @@ -65,7 +72,12 @@ export default function HomePage() {

{error}

- ))} + {t(STAGE_LABELS[best.stage] as any)} + +
+
+ {best.embedding.model_name} +
+
+ +
+
+ {best.avg_fmax.toFixed(3)} +
+
{t("avgFmaxAcrossCells")}
- - {CATEGORY_LABELS[activeCategory]} -
- {/* ── Fmax cards ────────────────────────────────────────── */} -
+ {/* Per-aspect mini tiles (mean across NK/LK/PK) */} +
{ASPECTS.map((aspect) => { - const d = catFmax[aspect]; - if (!d) return null; - const color = ASPECT_COLORS[aspect]; + const agg = perAspect[aspect]; + const value = agg ? agg.sum / agg.count : null; return (
-
- {d.fmax.toFixed(2)} +
+ {value != null ? value.toFixed(3) : "—"}
-
- {t("fmax")} {aspect} -
-
- {ASPECT_LABELS[aspect]} -
-
- {d.method_label} +
+ {aspect}
); })}
- - - {/* ── Method comparison table ───────────────────────────── */} - {catMethods.length > 0 && ( -
-

- {t("methodComparison")} - - ({activeCategory}) - -

-
- - - - - {ASPECTS.map((a) => ( - - ))} - - - - {catMethods.map((row, i) => { - const isBest = ASPECTS.some( - (a) => catFmax[a]?.method === row.method - ); - return ( - - - {ASPECTS.map((aspect) => { - const val = (row as any)[aspect]?.fmax; - const baseVal = baseline ? (baseline as any)[aspect]?.fmax : null; - const delta = val != null && baseVal != null && row.method !== "knn_baseline" - ? val - baseVal - : null; - return ( - - ); - })} - - ); - })} - -
{t("method")} - {a} -
- {t(METHOD_KEYS[row.method] ?? row.method)} - {isBest && ( - best - )} - - {val != null ? ( - - {val.toFixed(3)} - {delta != null && ( - 0 ? "text-green-600" : delta < 0 ? "text-red-600" : "text-gray-400"}`}> - {delta > 0 ? "+" : ""}{delta.toFixed(3)} - - )} - - ) : ( - - )} -
-
-
- )} - +
+ ) : (

{t("noDataYet")}

@@ -309,12 +255,14 @@ export default function HomePage() { {t("stats")}
- {([ - ["proteins", data.counts.proteins], - ["sequences", data.counts.sequences], - ["embeddings", data.counts.embeddings], - ["predictions", data.counts.predictions], - ] as [string, number][]).map(([key, count]) => ( + {( + [ + ["proteins", data.counts.proteins], + ["sequences", data.counts.sequences], + ["embeddings", data.counts.embeddings], + ["predictions", data.counts.predictions], + ] as [string, number][] + ).map(([key, count]) => (
{count.toLocaleString()} @@ -328,7 +276,7 @@ export default function HomePage() { {/* ── CTAs ──────────────────────────────────────────────────── */}
{t("exploreResults")} diff --git a/apps/web/components/AnnotateForm.tsx b/apps/web/components/AnnotateForm.tsx index e28e1cf..2c17cc1 100644 --- a/apps/web/components/AnnotateForm.tsx +++ b/apps/web/components/AnnotateForm.tsx @@ -7,23 +7,40 @@ import { annotateProteins, getJob, launchPredictGoTerms, + listJobs, listPredictionSets, type AnnotateResult, + type Job, } from "@/lib/api"; type Stage = "idle" | "uploading" | "embedding" | "predicting" | "done" | "error"; const POLL_MS = 3_000; +const QUEUE_POLL_MS = 30_000; -const EXAMPLE_FASTA = `>sp|P04637|P53_HUMAN Cellular tumor antigen p53 +// Operations that occupy the shared GPU pipeline. While any of these is +// queued or running we block new user annotation requests, since they won't +// actually enter the queue in a reasonable time frame. +const BLOCKING_OPERATIONS = new Set([ + "compute_embeddings", + "compute_embeddings_batch", + "predict_go_terms", + "predict_go_terms_batch", +]); + +const EXAMPLE_FASTA = `>sp|P01116|RASK_HUMAN GTPase KRas OS=Homo sapiens OX=9606 GN=KRAS PE=1 SV=1 +MTEYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVIDGETCLLDILDTAG +QEEYSAMRDQYMRTGEGFLCVFAINNTKSFEDIHHYREQIKRVKDSEDVPMVLVGNKCDL +PSRTVDTKQAQDLARSYGIPFIETSAKTRQRVEDAFYTLVREIRQYRLKKISKEEKTPGC +VKIKKCIIM +>sp|P04637|P53_HUMAN Cellular tumor antigen p53 OS=Homo sapiens OX=9606 GN=TP53 PE=1 SV=4 MEEPQSDPSVEPPLSQETFSDLWKLLPENNVLSPLPSQAMDDLMLSPDDIEQWFTEDPGP -DEAPRMPEAAPPVAPAPAAPTPAAPAPAPSWPLSSSVPSQKTYPQGLNGTVNLPGRNSFEV -RVCACPGRDRRTEEENLHKTTGIDSFLHPEVEYFTPETDPAGPMCSRHFYQLAKTCPVQLW -VDSTPPPGTRVRAMAIYKQSQHMTEVVRRCPHERCTCGGNHGISTTTGICLICQFFLVHKP ->sp|P38398|BRCA1_HUMAN Breast cancer type 1 susceptibility protein -MDLSALRVEEVQNVINAMQKILECPICLELIKEPVSTKCDHIFCKFCMLKLLNQKKGPSQC -PLCKNDITKRSLQESTRFSQLVEELLKIICAFQLDTGLEYANSYNFAKKENNSPEHLKDEV -SIIQSMGYRNRAKRLLQSEPENPSLQETSLSVQLSNLGTVRTLRTKQRIQPQKTSVYIELG`; +DEAPRMPEAAPPVAPAPAAPTPAAPAPAPSWPLSSSVPSQKTYQGSYGFRLGFLHSGTAK +SVTCTYSPALNKMFCQLAKTCPVQLWVDSTPPPGTRVRAMAIYKQSQHMTEVVRRCPHHE +RCSDSDGLAPPQHLIRVEGNLRVEYLDDRNTFRHSVVVPYEPPEVGSDCTTIHYNYMCNS +SCMGGMNRRPILTIITLEDSSGNLLGRNSFEVRVCACPGRDRRTEEENLRKKGEPHHELP +PGSTKRALPNNTSSSPQPKKKPLDGEYFTLQIRGRERFEMFRELNEALELKDAQAGKEPG +GSRAHSSHLKSKKGQSTSRHKKLMFKTEGPDSD`; export function AnnotateForm() { const t = useTranslations("home"); @@ -41,6 +58,11 @@ export function AnnotateForm() { // Drag-and-drop state const [dragOver, setDragOver] = useState(false); + // Queue-awareness: poll active jobs and block submission while any + // embedding/prediction operation is queued or running, because our + // single-GPU setup can't absorb another request in reasonable time. + const [blockingJobs, setBlockingJobs] = useState(null); + const handleFile = (file: File) => { const reader = new FileReader(); reader.onload = (e) => { @@ -152,7 +174,49 @@ export function AnnotateForm() { }; }, []); + // Poll for active embedding/prediction jobs to know whether the GPU + // pipeline is currently saturated. 
+ useEffect(() => { + let cancelled = false; + const fetchBlocking = async () => { + if (typeof document !== "undefined" && document.visibilityState === "hidden") return; + try { + const [queued, running] = await Promise.all([ + listJobs({ limit: 100, status: "queued" }), + listJobs({ limit: 100, status: "running" }), + ]); + if (cancelled) return; + const merged = [...running, ...queued].filter((j) => + BLOCKING_OPERATIONS.has(j.operation), + ); + setBlockingJobs(merged); + } catch { + // ignore transient errors; keep prior state + } + }; + fetchBlocking(); + const id = setInterval(fetchBlocking, QUEUE_POLL_MS); + const onVisibility = () => { + if (document.visibilityState === "visible") fetchBlocking(); + }; + document.addEventListener("visibilitychange", onVisibility); + return () => { + cancelled = true; + clearInterval(id); + document.removeEventListener("visibilitychange", onVisibility); + }; + }, []); + const isRunning = stage === "uploading" || stage === "embedding" || stage === "predicting"; + // A running local annotation flow already owns the UI; don't double-block. + const isQueueBlocked = !isRunning && (blockingJobs?.length ?? 0) > 0; + const runningJob = blockingJobs?.find((j) => j.status === "running") ?? null; + const runningPct = + runningJob && runningJob.progress_total && runningJob.progress_current + ? Math.round((runningJob.progress_current / runningJob.progress_total) * 100) + : null; + const queuedCount = + blockingJobs?.filter((j) => j.status === "queued").length ?? 0; return (
@@ -163,6 +227,41 @@ export function AnnotateForm() { {t("annotateDescription" as any)}

+ {/* Queue-busy banner ─ blocks submission while the GPU pipeline is saturated */} + {isQueueBlocked && ( +
+
+ +
+

+ {t("annotateQueueBlockedTitle" as any)} +

+

+ {t("annotateQueueBlockedBody" as any)} +

+
    + {runningJob && ( +
  • + {runningJob.operation} + {" — "} + {t("annotateQueueRunningLabel" as any)} + {runningPct != null ? ` (${runningPct}%)` : ""} +
  • + )} + {queuedCount > 0 && ( +
  • + {t("annotateQueueWaitingLabel" as any)}: {queuedCount} +
  • + )} +
+
+
+
+ )} + {/* FASTA input */}
setFasta(e.target.value)} placeholder={t("annotatePlaceholder" as any)} rows={6} - disabled={isRunning} + disabled={isRunning || isQueueBlocked} className="w-full rounded-lg p-4 text-xs font-mono text-gray-700 placeholder:text-gray-400 focus:outline-none focus:ring-2 focus:ring-blue-300 resize-y disabled:opacity-50 disabled:cursor-not-allowed bg-transparent" /> - {!fasta && !isRunning && ( + {!fasta && !isRunning && !isQueueBlocked && (
+ {catalog.ks.length > 0 && ( +
+ +
+ {catalog.ks.map((n) => ( + + ))} +
+
+ )} + {evalSetList.length > 1 && (
- AUC: {m.val_auc?.toFixed(4) ?? "—"} - F1: {m.val_f1?.toFixed(4) ?? "—"} - Precision: {m.val_precision?.toFixed(4) ?? "—"} - Recall: {m.val_recall?.toFixed(4) ?? "—"} - Positive rate: {m.positive_rate != null ? `${(m.positive_rate * 100).toFixed(2)}%` : "—"} + {m.test_fmax != null ? ( + <> + Test Fmax: {m.test_fmax.toFixed(4)} + Best iter: {m.best_iteration ?? "—"} + {m.positive_rate_train != null && ( + Train pos. rate: {(m.positive_rate_train * 100).toFixed(2)}% + )} + + ) : ( + <> + AUC: {m.val_auc?.toFixed(4) ?? "—"} + F1: {m.val_f1?.toFixed(4) ?? "—"} + Precision: {m.val_precision?.toFixed(4) ?? "—"} + Recall: {m.val_recall?.toFixed(4) ?? "—"} + Positive rate: {m.positive_rate != null ? `${(m.positive_rate * 100).toFixed(2)}%` : "—"} + + )}
{expanded && (
- {/* Validation metrics */} + {/* Training-time metrics */}
-

Validation metrics

+

Training-time metrics

+ + -
- Train samples: {m.train_samples?.toLocaleString()} - Val samples: {m.val_samples?.toLocaleString()} + {m.train_samples != null && Train samples: {m.train_samples.toLocaleString()}} + {m.val_samples != null && Val samples: {m.val_samples.toLocaleString()}} + {m.positive_rate_train != null && ( + Train positive rate: {(m.positive_rate_train * 100).toFixed(2)}% + )}
From fc394533e688d3be49fdf910d2d7428c4cbae28a Mon Sep 17 00:00:00 2001 From: frapercan Date: Tue, 5 May 2026 15:41:56 +0200 Subject: [PATCH 42/73] chore(tests): T0.1 baseline + fix publisher retry tests Cobertura unit baseline: 72.72% (8470 stmts, 2311 missed). Saved to .baseline/{coverage_2026-05-05_unit.xml,coverage_html_2026-05-05_unit/,pytest_2026-05-05_unit.log}. Tests fixed (regression from publisher retry 5 -> 12 in e299672): - test_exponential_backoff_delays - test_closes_connection_on_exception Tests/BROKEN.md tracks 16 remaining unit failures, all preexisting: - 14 in test_predict_go_terms.py: payload fixtures missing ontology_snapshot_id (now required field) - 2 in test_scoring_router.py::TestRerankerMetrics: pending diagnosis Tests/FLAKY.md scaffolded: no flaky test detected this pass. Tests/SLOW.md scaffolded: no test crosses 5s threshold; slowest is ESM-2 8M batch consistency at 3.21s. Part of F0 T0.1 of master plan v3. --- .baseline/coverage_2026-05-05_unit.xml | 9067 ++++++++++++++++++++++++ tests/BROKEN.md | 46 + tests/FLAKY.md | 14 + tests/SLOW.md | 17 + tests/test_queue.py | 9 +- 5 files changed, 9149 insertions(+), 4 deletions(-) create mode 100644 .baseline/coverage_2026-05-05_unit.xml create mode 100644 tests/BROKEN.md create mode 100644 tests/FLAKY.md create mode 100644 tests/SLOW.md diff --git a/.baseline/coverage_2026-05-05_unit.xml b/.baseline/coverage_2026-05-05_unit.xml new file mode 100644 index 0000000..152f8c1 --- /dev/null +++ b/.baseline/coverage_2026-05-05_unit.xml @@ -0,0 +1,9067 @@ + + + + + + /home/frapercan/Thesis/repositories/PROTEA/protea + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
diff --git a/tests/BROKEN.md b/tests/BROKEN.md
new file mode 100644
index 0000000..1be9fe1
--- /dev/null
+++ b/tests/BROKEN.md
@@ -0,0 +1,46 @@
+# Broken tests inventory
+
+Snapshot 2026-05-05 after the T0.1 coverage baseline. The following tests fail on HEAD `7e8de14` (branch `refactor/post-lab-stabilization`). They correspond to preexisting debt or to regressions that get fixed as part of the F0/F2 work of the master plan.
+
+## A. Resolved in this pass (T0.1)
+
+- `tests/test_queue.py::TestPublishJob::test_exponential_backoff_delays`
+- `tests/test_queue.py::TestPublishJob::test_closes_connection_on_exception`
+
+Cause: direct regression from commit `e299672` (publisher retry 5 to 12). Tests updated to the new constants; green.
+
+## B. Pending: payload schema drift
+
+14 tests in `tests/test_predict_go_terms.py` fail because their fixtures do not pass `ontology_snapshot_id`, now a required field in `PredictGOTermsPayload` and `PredictGOTermsBatchPayload` (`predict_go_terms.py:165, 237`). Predates this snapshot.
+
+Affected tests:
+
+- `TestPredictBatch::test_transfers_go_annotations_from_nearest_neighbor`
+- `TestPredictBatch::test_includes_self_as_first_reference`
+- `TestPredictBatch::test_distance_threshold_filters_far_neighbors`
+- `TestPredictBatch::test_limit_per_entry_caps_neighbors`
+- `TestPredictBatchParentCancellation::test_skips_when_parent_cancelled`
+- `TestPredictBatchParentCancellation::test_skips_when_parent_failed`
+- `TestPredictGOTermsBatchPayload::test_valid_payload`
+- `TestPredictGOTermsBatchPayload::test_feature_flags_default_false`
+- `TestPredictBatchRerankerFeatures::test_reranker_features_included_when_enabled`
+- `TestPredictBatchRerankerFeatures::test_reranker_features_excluded_when_disabled`
+- `TestPredictGOTermsBatchReranker::test_skipped_when_artifact_context_missing`
+- `TestPredictGOTermsBatchReranker::test_schema_mismatch_falls_back`
+- `TestPredictGOTermsBatchReranker::test_applies_when_schema_matches`
+- `TestPredictGOTermsBatchReranker::test_no_reranker_leaves_dicts_untouched`
+
+Repair: add `"ontology_snapshot_id": str(uuid.uuid4())` (or reuse the fixture's existing one) to each test payload. Deferred to T0.2 (safe_emit), which touches tests, or as a hard prerequisite of T2B.4 (extract class from `PredictGOTermsBatchOperation`), which will rewrite these suites.
+
+## C. Pending: scoring router metrics
+
+- `tests/test_scoring_router.py::TestRerankerMetrics::test_returns_metrics`
+- `tests/test_scoring_router.py::TestRerankerMetrics::test_empty_predictions_returns_zero_metrics`
+
+Cause pending diagnosis. If it is debt of the same kind as B, same treatment.
+
+## Policy
+
+- Every test in B and C that an F0-F2 refactor touches gets fixed in the same change.
+- Before F2 is closed, this file must be empty or carry a per-test exclusion rationale.
+- Any new regression is added here with its cause documented.
diff --git a/tests/FLAKY.md b/tests/FLAKY.md
new file mode 100644
index 0000000..1199227
--- /dev/null
+++ b/tests/FLAKY.md
@@ -0,0 +1,14 @@
+# Flaky tests inventory
+
+Snapshot 2026-05-05 after T0.1.
+
+No test was detected as flaky in this pass. That is, within a single run every test that passes does so deterministically and every test that fails does so deterministically (debt documented in `BROKEN.md`).
+
+Policy: if a test passes on some runs and fails on others during development, record it here with:
+
+- Full name
+- Observed symptom (timing, execution order, suspected race condition)
+- Minimal reproduction if known
+- Plan: `time.sleep` to replace with `wait_until`, fixture to isolate, mock to stabilise, etc.
+
+Hard rule from master plan v3 §F6 T6.1: zero `time.sleep` in `tests/`. If a test with a sleep gets in, it goes into this file until the sleep is replaced by `wait_until(predicate, timeout)`.
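A minimal sketch of the polling helper that the hard rule refers to; the name and signature come from the plan, while the body below is illustrative rather than the helper shipped in the repository:

    import time

    def wait_until(predicate, timeout: float = 5.0, interval: float = 0.05) -> None:
        """Poll `predicate` until it returns truthy, or fail after `timeout` seconds."""
        deadline = time.monotonic() + timeout
        while time.monotonic() < deadline:
            if predicate():
                return
            time.sleep(interval)  # the only sleep lives in the helper, not in individual tests
        raise AssertionError(f"condition not met within {timeout}s")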
diff --git a/tests/SLOW.md b/tests/SLOW.md
new file mode 100644
index 0000000..b980bec
--- /dev/null
+++ b/tests/SLOW.md
@@ -0,0 +1,17 @@
+# Slow tests inventory
+
+Snapshot 2026-05-05 after T0.1. Unit coverage run, `--durations=30`.
+
+No test exceeds the master plan v3 threshold (`5s`).
+
+Top 5 slowest tests:
+
+| Test | Duration | Reason |
+|------|---------:|-------|
+| `tests/test_compute_embeddings.py::TestBatchSizeConsistency::test_esm_batch_size_consistency` | 3.21s | Loads ESM-2 8M for the consistency check (legitimate) |
+| `tests/test_compute_embeddings.py::TestValidateLayers::test_valid_reverse_index` | 2.03s | Intensive numerical validation |
+| `tests/test_real_models.py::TestESM2_8M::test_output_shape_and_finite` | 0.45s setup | Marker `slow`, opt-in |
+| `tests/test_knn_streaming_smoke.py::test_list_vs_stream_equivalence` | 0.36s | KNN smoke OK |
+| `tests/test_infrastructure.py::TestCreateApp::*` | ~0.12s each | App boot suite OK |
+
+Policy: if a test crosses 5s in future runs, add it here with the root cause and an acceleration plan (heavy mock, session-scoped fixture, opt-in `slow` marker, etc.).
diff --git a/tests/test_queue.py b/tests/test_queue.py
index dac3f5f..d1253bb 100644
--- a/tests/test_queue.py
+++ b/tests/test_queue.py
@@ -246,8 +246,8 @@ def test_closes_connection_on_exception(self):
         with pytest.raises(RuntimeError, match="Failed to publish to queue"):
             publish_job("amqp://localhost/", "q", uuid4())
 
-        # _close_cached_connection calls conn.close() once per failed attempt (5 total)
-        assert conn.close.call_count == 5
+        # _close_cached_connection calls conn.close() once per failed attempt (12 total)
+        assert conn.close.call_count == 12
 
     def test_declares_durable_queue(self):
         conn = MagicMock()
@@ -289,8 +289,9 @@ def test_exponential_backoff_delays(self):
         with pytest.raises(RuntimeError, match="Failed to publish"):
             publish_job("amqp://localhost/", "q", uuid4())
 
-        # 5 attempts → 4 sleeps: 1, 2, 4, 8
-        assert sleep_calls == [1, 2, 4, 8]
+        # 12 attempts -> 11 sleeps: 1, 2, 4, 8, 16, 30, 30, 30, 30, 30, 30
+        # Exponential up to attempt 5 (16s); capped at 30s for the rest.
+        assert sleep_calls == [1, 2, 4, 8, 16, 30, 30, 30, 30, 30, 30]
 
 
 # ---------------------------------------------------------------------------
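The capped schedule asserted above can be written down in one line; a sketch for reference (the publisher computes its delays inline, and `backoff_schedule` is not a function in the codebase):

    def backoff_schedule(max_attempts: int = 12, base: int = 1, cap: int = 30) -> list[int]:
        """Sleep lengths between attempts: exponential growth, capped at `cap` seconds."""
        return [min(base * 2**i, cap) for i in range(max_attempts - 1)]

    assert backoff_schedule() == [1, 2, 4, 8, 16, 30, 30, 30, 30, 30, 30]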
From 9ecc521e3a553675ebee1b18b7b1a1b964c393fe Mon Sep 17 00:00:00 2001
From: frapercan
Date: Tue, 5 May 2026 15:53:00 +0200
Subject: [PATCH 43/73] fix(tests): T0.1b clear preexisting BROKEN suite

Patches 16 unit tests left broken by previous schema drifts:

- 12 tests in test_predict_go_terms.py: payload fixtures missing
  ontology_snapshot_id (now required field). Add the constant
  _SNAPSHOT_ID to _payload() helpers and inline model_validate calls.
- 2 tests in test_scoring_router.py::TestRerankerMetrics:
  _make_eval_set() helper left groundtruth_uri as truthy MagicMock,
  which routed the handler to the persisted-artifact branch (not
  mocked) instead of the on-the-fly compute_evaluation_data branch
  (mocked). Set explicitly to None plus stats=None.

Suite is now green: 1056 passed, 10 skipped. BROKEN.md updated to
reflect the cleared state.

Part of F0 T0.1b of master plan v3.
---
 tests/BROKEN.md                | 52 +++++++++++++++-------------------
 tests/test_predict_go_terms.py | 11 +++++--
 tests/test_scoring_router.py   |  5 ++++
 3 files changed, 37 insertions(+), 31 deletions(-)

diff --git a/tests/BROKEN.md b/tests/BROKEN.md
index 1be9fe1..a567321 100644
--- a/tests/BROKEN.md
+++ b/tests/BROKEN.md
@@ -1,46 +1,40 @@
 # Broken tests inventory
 
-Snapshot 2026-05-05 after the T0.1 coverage baseline. The following tests fail on HEAD `7e8de14` (branch `refactor/post-lab-stabilization`). They correspond to preexisting debt or to regressions that get fixed as part of the F0/F2 work of the master plan.
+Snapshot 2026-05-05 after T0.1b. Unit suite green end-to-end: **1056 passed, 10 skipped, 0 failed**.
 
-## A. Resolved in this pass (T0.1)
+Policy: this file records any test that is broken on `master` or `refactor/*`. If it is empty, the suite is clean.
 
-- `tests/test_queue.py::TestPublishJob::test_exponential_backoff_delays`
-- `tests/test_queue.py::TestPublishJob::test_closes_connection_on_exception`
+## Resolution history
 
-Cause: direct regression from commit `e299672` (publisher retry 5 to 12). Tests updated to the new constants; green.
+### T0.1b — 2026-05-05
 
-## B. Pending: payload schema drift
+The 16 broken tests detected in T0.1 are resolved in this pass:
 
-14 tests in `tests/test_predict_go_terms.py` fail because their fixtures do not pass `ontology_snapshot_id`, now a required field in `PredictGOTermsPayload` and `PredictGOTermsBatchPayload` (`predict_go_terms.py:165, 237`). Predates this snapshot.
+**A. Regressions from the previous 5 commits (2 tests)**
 
-Affected tests:
+- `test_queue.py::TestPublishJob::test_exponential_backoff_delays`
+- `test_queue.py::TestPublishJob::test_closes_connection_on_exception`
 
-- `TestPredictBatch::test_transfers_go_annotations_from_nearest_neighbor`
-- `TestPredictBatch::test_includes_self_as_first_reference`
-- `TestPredictBatch::test_distance_threshold_filters_far_neighbors`
-- `TestPredictBatch::test_limit_per_entry_caps_neighbors`
-- `TestPredictBatchParentCancellation::test_skips_when_parent_cancelled`
-- `TestPredictBatchParentCancellation::test_skips_when_parent_failed`
+Cause: commit `e299672` raised the publisher retry count from 5 to 12. Tests updated to the new constants.
+
+**B. Payload schema drift in `test_predict_go_terms.py` (12 tests)**
+
+- `TestPredictBatch::*` (4 tests)
+- `TestPredictBatchParentCancellation::*` (2 tests)
 - `TestPredictGOTermsBatchPayload::test_valid_payload`
 - `TestPredictGOTermsBatchPayload::test_feature_flags_default_false`
-- `TestPredictBatchRerankerFeatures::test_reranker_features_included_when_enabled`
-- `TestPredictBatchRerankerFeatures::test_reranker_features_excluded_when_disabled`
-- `TestPredictGOTermsBatchReranker::test_skipped_when_artifact_context_missing`
-- `TestPredictGOTermsBatchReranker::test_schema_mismatch_falls_back`
-- `TestPredictGOTermsBatchReranker::test_applies_when_schema_matches`
-- `TestPredictGOTermsBatchReranker::test_no_reranker_leaves_dicts_untouched`
+
+- `TestPredictBatchRerankerFeatures::*` (2 tests)
+- `TestPredictGOTermsBatchReranker::*` (4 tests)
 
-Repair: add `"ontology_snapshot_id": str(uuid.uuid4())` (or reuse the fixture's existing one) to each test payload. Deferred to T0.2 (safe_emit), which touches tests, or as a hard prerequisite of T2B.4 (extract class from `PredictGOTermsBatchOperation`), which will rewrite these suites.
+Cause: `ontology_snapshot_id` added as a required field in `PredictGOTermsPayload` and `PredictGOTermsBatchPayload`. Fixtures and inline payloads were not updated when it was added. Patch: 5 edits adding `"ontology_snapshot_id": _SNAPSHOT_ID` (an existing constant) to the `_payload()` helpers and inline payloads.
 
-## C. Pending: scoring router metrics
+**C. EvaluationSet mock fields drift in `test_scoring_router.py` (2 tests)**
 
-- `tests/test_scoring_router.py::TestRerankerMetrics::test_returns_metrics`
-- `tests/test_scoring_router.py::TestRerankerMetrics::test_empty_predictions_returns_zero_metrics`
+- `TestRerankerMetrics::test_returns_metrics`
+- `TestRerankerMetrics::test_empty_predictions_returns_zero_metrics`
 
-Cause pending diagnosis. If it is debt of the same kind as B, same treatment.
+Cause: `EvaluationSet.groundtruth_uri` was added to the model; the `_make_eval_set()` helper left it as a truthy MagicMock, which routed the handler to the persisted-artifact path (not mocked) instead of the on-the-fly path (which is mocked). Patch: `_make_eval_set()` now sets `groundtruth_uri = None` and `stats = None` explicitly.
 
-## Policy
+## Hard rule
 
-- Every test in B and C that an F0-F2 refactor touches gets fixed in the same change.
-- Before F2 is closed, this file must be empty or carry a per-test exclusion rationale.
-- Any new regression is added here with its cause documented.
+Before closing any major phase of the master plan, this file must be empty or carry a per-test exclusion rationale.
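Item C above is the classic MagicMock auto-attribute pitfall; a standalone sketch of the behaviour (variable names are illustrative, not copied from the test module):

    from unittest.mock import MagicMock

    es = MagicMock()
    # Attribute access on a MagicMock auto-creates a child mock, and child
    # mocks are truthy, so a check like `if eval_set.groundtruth_uri:` takes
    # the persisted-artifact branch even though nothing was configured.
    assert bool(es.groundtruth_uri)

    es.groundtruth_uri = None
    es.stats = None
    # With explicit None the handler falls through to the on-the-fly branch.
    assert not es.groundtruth_uri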
diff --git a/tests/test_predict_go_terms.py b/tests/test_predict_go_terms.py index a3e6bd1..2ba64e7 100644 --- a/tests/test_predict_go_terms.py +++ b/tests/test_predict_go_terms.py @@ -289,11 +289,12 @@ def _payload(self, **kwargs): defaults = { "embedding_config_id": str(uuid.uuid4()), "annotation_set_id": _ANN_SET_ID, + "ontology_snapshot_id": _SNAPSHOT_ID, "prediction_set_id": str(uuid.uuid4()), "parent_job_id": str(uuid.uuid4()), "query_accessions": [], "limit_per_entry": 2, - # Opt out of features that require sequences/taxonomy — the mock + # Opt out of features that require sequences/taxonomy: the mock # ref_data in this class never provides them. "compute_alignments": False, "compute_taxonomy": False, @@ -660,6 +661,7 @@ def test_skips_when_parent_cancelled(self) -> None: payload = { "embedding_config_id": str(uuid.uuid4()), "annotation_set_id": str(uuid.uuid4()), + "ontology_snapshot_id": _SNAPSHOT_ID, "prediction_set_id": str(uuid.uuid4()), "parent_job_id": str(uuid.uuid4()), "query_accessions": ["P1"], @@ -678,6 +680,7 @@ def test_skips_when_parent_failed(self) -> None: payload = { "embedding_config_id": str(uuid.uuid4()), "annotation_set_id": str(uuid.uuid4()), + "ontology_snapshot_id": _SNAPSHOT_ID, "prediction_set_id": str(uuid.uuid4()), "parent_job_id": str(uuid.uuid4()), "query_accessions": ["P1"], @@ -775,6 +778,7 @@ def test_valid_payload(self) -> None: { "embedding_config_id": str(uuid.uuid4()), "annotation_set_id": str(uuid.uuid4()), + "ontology_snapshot_id": _SNAPSHOT_ID, "prediction_set_id": str(uuid.uuid4()), "parent_job_id": str(uuid.uuid4()), "query_accessions": ["P1", "P2"], @@ -788,6 +792,7 @@ def test_feature_flags_default_false(self) -> None: { "embedding_config_id": str(uuid.uuid4()), "annotation_set_id": str(uuid.uuid4()), + "ontology_snapshot_id": _SNAPSHOT_ID, "prediction_set_id": str(uuid.uuid4()), "parent_job_id": str(uuid.uuid4()), "query_accessions": [], @@ -811,11 +816,12 @@ def _payload(self, **kwargs): defaults = { "embedding_config_id": str(uuid.uuid4()), "annotation_set_id": _ANN_SET_ID, + "ontology_snapshot_id": _SNAPSHOT_ID, "prediction_set_id": str(uuid.uuid4()), "parent_job_id": str(uuid.uuid4()), "query_accessions": [], "limit_per_entry": 2, - # Reranker features ON, alignments/taxonomy OFF — this suite only + # Reranker features ON, alignments/taxonomy OFF: this suite only + # exercises voting/neighbor-stat features, not NW/SW or taxonomy.
"compute_alignments": False, "compute_taxonomy": False, @@ -978,6 +984,7 @@ def _payload(self, **kwargs) -> PredictGOTermsBatchPayload: defaults = { "embedding_config_id": str(uuid.uuid4()), "annotation_set_id": _ANN_SET_ID, + "ontology_snapshot_id": _SNAPSHOT_ID, "prediction_set_id": str(uuid.uuid4()), "parent_job_id": str(uuid.uuid4()), "query_accessions": ["Q1"], diff --git a/tests/test_scoring_router.py b/tests/test_scoring_router.py index f1619ec..d9b97c4 100644 --- a/tests/test_scoring_router.py +++ b/tests/test_scoring_router.py @@ -669,6 +669,11 @@ def _make_eval_set(): es.id = uuid4() es.old_annotation_set_id = uuid4() es.new_annotation_set_id = uuid4() + # When None, scoring router takes the on-the-fly compute_evaluation_data + # branch (which tests mock); a truthy MagicMock pulls the persisted + # artifact path that needs a real UUID for pivot_ontology_snapshot_id. + es.groundtruth_uri = None + es.stats = None return es From 3155bbda5a8b328fd874d143df70552b5f1555b4 Mon Sep 17 00:00:00 2001 From: frapercan Date: Tue, 5 May 2026 15:58:47 +0200 Subject: [PATCH 44/73] feat(operation): T0.2 introduce make_safe_emit wrapper The queue consumer's inner emit had a defensive try/except that swallowed JobEvent insert failures with logger.warning, no traceback. Move that responsibility to a generic make_safe_emit() wrapper in core.contracts.operation: - Wraps any EmitFn to swallow exceptions and log at ERROR with exc_info=True under the dedicated "protea.emit" logger. - Operations stay unchanged; consumer wraps the raw emit before passing it to op.execute(). - raw_emit in consumer.py now does only the work (no try/except); cleaner separation of concerns. AC: grep "except.*emit\|except.*progress" protea/ returns 0 hits. Cobertura: 5 new tests in test_safe_emit.py covering passthrough, defaults, exception swallowing, exc_info presence, repeated calls after failure. Suite: 1061 passed, 10 skipped. Part of F0 T0.2 of master plan v3. --- protea/core/contracts/operation.py | 33 ++++++++++ protea/infrastructure/queue/consumer.py | 47 ++++++-------- tests/test_safe_emit.py | 81 +++++++++++++++++++++++++ 3 files changed, 133 insertions(+), 28 deletions(-) create mode 100644 tests/test_safe_emit.py diff --git a/protea/core/contracts/operation.py b/protea/core/contracts/operation.py index 705a419..a05245a 100644 --- a/protea/core/contracts/operation.py +++ b/protea/core/contracts/operation.py @@ -1,6 +1,7 @@ # protea/core/contracts/operation.py from __future__ import annotations +import logging from collections.abc import Callable from dataclasses import dataclass, field from typing import Any, Literal, Protocol @@ -12,6 +13,38 @@ Level = Literal["info", "warning", "error"] EmitFn = Callable[[str, str | None, dict[str, Any], Level], None] +_safe_emit_logger = logging.getLogger("protea.emit") + + +def make_safe_emit(raw_emit: EmitFn) -> EmitFn: + """Wrap a raw EmitFn so failures are logged and never propagate. + + The platform-level emit may fail for transient reasons (DB + connection lost mid-operation, JobEvent insert conflict, etc.). + Operations should not crash because the audit trail hiccupped: + the operation's primary work matters more than the event row. + Failures are logged at ERROR with full traceback so they remain + visible in observability without breaking the running job. 
+ """ + + def wrapped( + event: str, + message: str | None = None, + fields: dict[str, Any] | None = None, + level: Level = "info", + ) -> None: + try: + raw_emit(event, message, fields or {}, level) + except Exception: + _safe_emit_logger.error( + "emit failed; operation continues. event=%s level=%s", + event, + level, + exc_info=True, + ) + + return wrapped + @dataclass(frozen=True) class OperationResult: diff --git a/protea/infrastructure/queue/consumer.py b/protea/infrastructure/queue/consumer.py index 8d12be5..6d63ac0 100644 --- a/protea/infrastructure/queue/consumer.py +++ b/protea/infrastructure/queue/consumer.py @@ -11,7 +11,7 @@ from pika.spec import Basic, BasicProperties from sqlalchemy.orm import Session, sessionmaker -from protea.core.contracts.operation import RetryLaterError +from protea.core.contracts.operation import RetryLaterError, make_safe_emit from protea.core.contracts.registry import OperationRegistry from protea.infrastructure.orm.models.job import JobEvent from protea.infrastructure.queue.publisher import publish_operation @@ -279,40 +279,31 @@ def _on_message( session = self._factory() try: - def emit( + def raw_emit( event: str, message: str | None = None, fields: dict[str, Any] | None = None, level: str = "info", ) -> None: logger.info("operation.%s fields=%s", event, fields or {}) - if parent_job_id is not None: - event_session = self._factory() - try: - event_session.add( - JobEvent( - job_id=parent_job_id, - event=f"child.{event}", - message=message, - fields=fields or {}, - level=level, - ) - ) - event_session.commit() - except Exception as emit_exc: - logger.warning( - "Failed to write child event to parent job. parent_job_id=%s error=%s", - parent_job_id, - emit_exc, + if parent_job_id is None: + return + event_session = self._factory() + try: + event_session.add( + JobEvent( + job_id=parent_job_id, + event=f"child.{event}", + message=message, + fields=fields or {}, + level=level, ) - try: - event_session.rollback() - except Exception: - pass - finally: - event_session.close() - - result = op.execute(session, payload, emit=emit) + ) + event_session.commit() + finally: + event_session.close() + + result = op.execute(session, payload, emit=make_safe_emit(raw_emit)) session.commit() # Forward any downstream operation messages (e.g. GPU→write worker). for queue_name, op_payload in result.publish_operations or []: diff --git a/tests/test_safe_emit.py b/tests/test_safe_emit.py new file mode 100644 index 0000000..1b57ee7 --- /dev/null +++ b/tests/test_safe_emit.py @@ -0,0 +1,81 @@ +"""Tests for make_safe_emit (T0.2 of master plan v3).""" + +from __future__ import annotations + +import logging +from typing import Any + +import pytest + +from protea.core.contracts.operation import make_safe_emit + + +class TestMakeSafeEmit: + """make_safe_emit wraps an EmitFn so failures don't crash the operation.""" + + def test_passthrough_on_success(self) -> None: + calls: list[tuple] = [] + + def raw(event, message, fields, level): + calls.append((event, message, fields, level)) + + safe = make_safe_emit(raw) + safe("step.start", "doing work", {"k": 1}, "info") + + assert calls == [("step.start", "doing work", {"k": 1}, "info")] + + def test_default_arguments_normalised(self) -> None: + calls: list[tuple] = [] + + def raw(event, message, fields, level): + calls.append((event, message, fields, level)) + + safe = make_safe_emit(raw) + safe("step.tick") + + # Defaults: message=None, fields={} (not None), level="info". 
+ assert calls == [("step.tick", None, {}, "info")] + + def test_swallows_exceptions(self, caplog: pytest.LogCaptureFixture) -> None: + def raw(*_args: Any, **_kwargs: Any) -> None: + raise RuntimeError("DB lost connection") + + safe = make_safe_emit(raw) + + with caplog.at_level(logging.ERROR, logger="protea.emit"): + safe("step.boom", "oops", {"x": 2}, "warning") + + assert any( + "emit failed" in record.message and record.levelno == logging.ERROR + for record in caplog.records + ) + # Find the record and confirm exc_info captured. + emit_records = [r for r in caplog.records if "emit failed" in r.message] + assert emit_records, "expected at least one error log from safe_emit" + assert emit_records[0].exc_info is not None + + def test_does_not_propagate_arbitrary_exceptions(self) -> None: + class CustomError(Exception): + pass + + def raw(*_args: Any, **_kwargs: Any) -> None: + raise CustomError("anything") + + safe = make_safe_emit(raw) + # Must not raise. + safe("any.event") + + def test_can_be_called_repeatedly_after_failure(self) -> None: + attempts: list[bool] = [] + + def raw(*_args: Any, **_kwargs: Any) -> None: + attempts.append(True) + if len(attempts) == 1: + raise RuntimeError("first attempt fails") + + safe = make_safe_emit(raw) + safe("first") + safe("second") + safe("third") + + assert len(attempts) == 3 From 2973fcf9613f4db5a710865cd2dea6fc4dc5cdd4 Mon Sep 17 00:00:00 2001 From: frapercan Date: Tue, 5 May 2026 16:29:48 +0200 Subject: [PATCH 45/73] feat(retry): T0.3+T0.8 retry middleware + BaseWorker extract Adds protea.core.retry with: - is_retryable_db_error: OperationalError with pgcode in {40P01 deadlock_detected, 40001 serialization_failure} - is_retryable_connection_error: ConnectionResetError, ConnectionAbortedError, TimeoutError - with_retry(fn, max_attempts, base_delay, max_delay, jitter_ratio, predicate, on_retry): exponential backoff with jitter, default WARNING log under "protea.retry" carrying attempt/sleep/pgcode. Refactors BaseWorker.handle_job from a 180-line monolith into a thin orchestrator plus seven focused helpers (T0.8): _claim_job, _execute_with_session, _build_emit, _cancel_if_parent_cancelled, _on_operation_success, _on_retry_later, _on_operation_failure. handle_job wraps _execute_with_session in with_retry so transient DB or connection errors during op.execute get a fresh session per attempt up to 3 attempts. RetryLaterError keeps its existing semantics. Non-retryable errors mark FAILED inline. Retry-exhausted retryable errors get force-marked FAILED via fallback session. Builds emit through make_safe_emit so emit failures do not crash the operation. 24 new tests in test_retry.py; 3 new tests in test_base_worker.py. Suite: 1088 passed, 10 skipped. Part of F0 T0.3 and T0.8 of master plan v3. --- protea/core/retry.py | 131 +++++++++++++ protea/workers/base_worker.py | 346 ++++++++++++++++++++-------------- tests/test_base_worker.py | 89 +++++++++ tests/test_retry.py | 199 +++++++++++++++++++ 4 files changed, 621 insertions(+), 144 deletions(-) create mode 100644 protea/core/retry.py create mode 100644 tests/test_retry.py diff --git a/protea/core/retry.py b/protea/core/retry.py new file mode 100644 index 0000000..ac67b3e --- /dev/null +++ b/protea/core/retry.py @@ -0,0 +1,131 @@ +"""Generic retry middleware for transient infrastructure errors. + +Used by BaseWorker to survive Postgres deadlocks, serialization +failures and brief connection interruptions without marking the +job as failed. 
Application errors (validation, missing data) are +NOT retried; only transient infrastructure conditions are. + +Example:: + + from protea.core.retry import with_retry + + def do_work(): + ... + + with_retry(do_work, max_attempts=3, base_delay=1.0) +""" + +from __future__ import annotations + +import logging +import random +import time +from collections.abc import Callable +from typing import Any, ParamSpec, TypeVar + +from sqlalchemy.exc import OperationalError + +logger = logging.getLogger("protea.retry") + +# Postgres SQLSTATE codes that signal a retryable transient condition. +# - 40P01: deadlock_detected +# - 40001: serialization_failure +RETRYABLE_PG_CODES: frozenset[str] = frozenset({"40P01", "40001"}) + + +def is_retryable_db_error(exc: BaseException) -> bool: + """Return True if the exception represents a transient DB condition.""" + if not isinstance(exc, OperationalError): + return False + pgcode = getattr(getattr(exc, "orig", None), "pgcode", None) + return pgcode in RETRYABLE_PG_CODES + + +def is_retryable_connection_error(exc: BaseException) -> bool: + """Return True if the exception represents a transient network condition. + + These are typically raised by pika or low-level socket layers when the + broker is briefly unreachable. The publisher retry loop handles these + on the publish side; this predicate exists for the consume side. + """ + return isinstance(exc, ConnectionResetError | ConnectionAbortedError | TimeoutError) + + +def is_retryable(exc: BaseException) -> bool: + """Default predicate: retryable DB errors plus transient network errors.""" + return is_retryable_db_error(exc) or is_retryable_connection_error(exc) + + +P = ParamSpec("P") +R = TypeVar("R") + + +def with_retry( + fn: Callable[P, R], + *args: P.args, + max_attempts: int = 3, + base_delay: float = 1.0, + max_delay: float = 30.0, + jitter_ratio: float = 0.5, + predicate: Callable[[BaseException], bool] = is_retryable, + on_retry: Callable[[int, BaseException, float], None] | None = None, + **kwargs: P.kwargs, +) -> R: + """Run ``fn(*args, **kwargs)`` with exponential backoff and jitter. + + The callable is invoked up to ``max_attempts`` times. After each + retryable failure (per ``predicate``), the loop sleeps for + ``min(base_delay * 2**(attempt-1), max_delay)`` seconds, jittered + by a random factor in ``[1-jitter_ratio, 1+jitter_ratio]``. + + Exceptions that do not match ``predicate`` propagate immediately. + Once ``max_attempts`` retryable failures accumulate, the last + exception propagates to the caller. + + ``on_retry(attempt, exc, sleep_seconds)`` is called before each + sleep. Defaults to a structured WARNING log under + ``protea.retry``. 
+ """ + if max_attempts < 1: + raise ValueError("max_attempts must be >= 1") + + attempt = 0 + while True: + attempt += 1 + try: + return fn(*args, **kwargs) + except BaseException as exc: + if not predicate(exc) or attempt >= max_attempts: + raise + sleep_for = min(base_delay * (2 ** (attempt - 1)), max_delay) + jitter_low = max(0.0, 1.0 - jitter_ratio) + jitter_high = 1.0 + jitter_ratio + sleep_for *= random.uniform(jitter_low, jitter_high) + + if on_retry is not None: + on_retry(attempt, exc, sleep_for) + else: + _default_on_retry(attempt, exc, sleep_for, max_attempts) + time.sleep(sleep_for) + + +def _default_on_retry( + attempt: int, exc: BaseException, sleep_for: float, max_attempts: int +) -> None: + pgcode = getattr(getattr(exc, "orig", None), "pgcode", None) + extra: dict[str, Any] = { + "attempt": attempt, + "max_attempts": max_attempts, + "sleep_seconds": round(sleep_for, 3), + "error_class": type(exc).__name__, + } + if pgcode is not None: + extra["pgcode"] = pgcode + logger.warning( + "retryable failure; sleeping %ss then retrying. attempt=%d/%d error=%s", + round(sleep_for, 3), + attempt, + max_attempts, + type(exc).__name__, + extra=extra, + ) diff --git a/protea/workers/base_worker.py b/protea/workers/base_worker.py index 814edae..e569ccc 100644 --- a/protea/workers/base_worker.py +++ b/protea/workers/base_worker.py @@ -10,8 +10,9 @@ from sqlalchemy import update as sa_update from sqlalchemy.orm import Session, sessionmaker -from protea.core.contracts.operation import OperationResult, RetryLaterError +from protea.core.contracts.operation import OperationResult, RetryLaterError, make_safe_emit from protea.core.contracts.registry import OperationRegistry +from protea.core.retry import is_retryable, with_retry from protea.core.utils import utcnow from protea.infrastructure.orm.models.job import Job, JobEvent, JobStatus from protea.infrastructure.queue.publisher import publish_job, publish_operation @@ -54,17 +55,44 @@ def handle_job(self, job_id: UUID) -> None: Claim and execute a single job identified by ``job_id``. Silently returns if the job does not exist or is not in QUEUED status. - Re-raises any exception from the operation after recording FAILED status. + Transient infrastructure failures (Postgres deadlocks, brief + connection resets) are retried up to 3 times with exponential + backoff before the job is marked FAILED. Re-raises any exception + from the operation after recording FAILED status. + """ + if not self._claim_job(job_id): + return + try: + with_retry( + self._execute_with_session, + job_id, + max_attempts=3, + base_delay=1.0, + max_delay=10.0, + jitter_ratio=0.3, + ) + except RetryLaterError: + # Consumer re-publishes; job is already QUEUED. + raise + except Exception as exc: + # If retry was exhausted on a retryable error, the job is still + # in RUNNING with no FAILED transition recorded. Force-mark FAILED + # via fallback session so it never gets stuck. + if is_retryable(exc): + self._force_fail_job(job_id, exc) + raise + + def _claim_job(self, job_id: UUID) -> bool: + """Transition the job from QUEUED to RUNNING in its own session. + + Returns True if claim succeeded; False if the job is missing or + already in a non-QUEUED state. """ - # Claim + run with DB-backed state. 
session = self._factory() try: job = session.get(Job, job_id) - if job is None: - return - - if job.status != JobStatus.QUEUED: - return + if job is None or job.status != JobStatus.QUEUED: + return False job.status = JobStatus.RUNNING job.started_at = utcnow() @@ -77,162 +105,192 @@ def handle_job(self, job_id: UUID) -> None: level="info", ) session.commit() + return True finally: session.close() - # Execute in a separate session + def _execute_with_session(self, job_id: UUID) -> None: + """Run the operation in a fresh session. + + Called by ``handle_job`` through ``with_retry`` so transient + infrastructure failures (deadlocks, etc.) get a clean session + on each attempt. Non-retryable exceptions propagate through to + the FAILED-handling branch below; ``RetryLaterError`` is also + propagated so the consumer can re-publish. + """ session = self._factory() try: job = session.get(Job, job_id) if job is None: return - # If the parent was cancelled while this child was being claimed, - # cancel ourselves and stop without executing. - if job.parent_job_id is not None: - parent = session.get(Job, job.parent_job_id) - if parent is not None and parent.status == JobStatus.CANCELLED: - job.status = JobStatus.CANCELLED - job.finished_at = utcnow() - self._emit( - session, - job_id, - "job.cancelled", - None, - {"reason": "parent_cancelled"}, - level="info", - ) - session.commit() - return + if self._cancel_if_parent_cancelled(session, job, job_id): + return op = self._registry.get(job.operation) - - def emit( - event: str, - message: str | None = None, - fields: dict[str, Any] | None = None, - level: str = "info", - ) -> None: - # Dedicated short-lived session that commits immediately so - # events are visible in real time, not just at job completion. - f = fields or {} - event_session = self._factory() - try: - self._emit(event_session, job_id, event, message, f, level=level) - # Allow operations to report live progress via reserved fields. - if "_progress_current" in f or "_progress_total" in f: - j = event_session.get(Job, job_id) - if j is not None: - if "_progress_current" in f: - j.progress_current = int(f["_progress_current"]) - if "_progress_total" in f: - j.progress_total = int(f["_progress_total"]) - event_session.commit() - finally: - event_session.close() + emit = make_safe_emit(self._build_emit(job_id)) + enhanced_payload = {**job.payload, "_job_id": str(job.id)} try: - # Inject runtime context into payload so operations can reference their own job. - enhanced_payload = {**job.payload, "_job_id": str(job.id)} result: OperationResult = op.execute(session, enhanced_payload, emit=emit) - - if result.progress_current is not None: - job.progress_current = int(result.progress_current) - if result.progress_total is not None: - job.progress_total = int(result.progress_total) - - if result.deferred: - # Coordinator job: children will mark it SUCCEEDED when done. - self._emit( - session, - job_id, - "job.dispatched", - None, - {"result": result.result}, - level="info", - ) - else: - job.status = JobStatus.SUCCEEDED - job.finished_at = utcnow() - self._emit( - session, - job_id, - "job.succeeded", - None, - {"result": result.result}, - level="info", - ) - - session.commit() - - # Publish child jobs to RabbitMQ after commit so workers always find the DB row. - if result.publish_after_commit and self._amqp_url: - for queue_name, child_job_id in result.publish_after_commit: - publish_job(self._amqp_url, queue_name, child_job_id) - - # Publish ephemeral operation messages (e.g. embedding batches). 
- if result.publish_operations and self._amqp_url: - for queue_name, op_payload in result.publish_operations: - publish_operation(self._amqp_url, queue_name, op_payload) - + self._on_operation_success(session, job, job_id, result) except RetryLaterError as e: - # Resource busy — reset to QUEUED so the consumer can re-publish. - # Adaptive backoff: count previous retries and increase delay. - retry_count = ( - session.query(func.count(JobEvent.id)) - .filter(JobEvent.job_id == job_id, JobEvent.event == "job.retry_later") - .scalar() - or 0 - ) - base_delay = e.delay_seconds - delay = min(base_delay * (2**retry_count), 600) # cap at 10 min - - job.status = JobStatus.QUEUED - job.started_at = None - self._emit( - session, - job_id, - "job.retry_later", - str(e), - {"delay_seconds": delay, "retry_count": retry_count + 1}, - level="info", - ) - session.commit() - # Propagate adaptive delay to the consumer. - e.delay_seconds = delay - raise # consumer handles re-publish - + self._on_retry_later(session, job, job_id, e) + raise except Exception as e: - job.status = JobStatus.FAILED - job.finished_at = utcnow() - job.error_code = e.__class__.__name__ - job.error_message = str(e) - self._emit( - session, - job_id, - "job.failed", - str(e), - {"error_code": job.error_code}, - level="error", - ) - if job.parent_job_id is not None: - self._maybe_fail_parent(session, job.parent_job_id) - try: - session.commit() - except Exception as commit_exc: - # Execute session is corrupted (e.g. DB connection dropped during a - # long operation). Fall back to a fresh session so the job is never - # left permanently stuck in RUNNING. - logger.error( - "Execute session commit failed; using fallback session. job_id=%s error=%s", - job_id, - commit_exc, - ) - self._force_fail_job(job_id, e) + if is_retryable(e): + # Let with_retry handle this; rollback so the next + # attempt sees a clean session state. + try: + session.rollback() + except Exception: + pass + raise + self._on_operation_failure(session, job, job_id, e) raise finally: session.close() + def _build_emit(self, job_id: UUID): + """Build the raw emit closure that writes JobEvent rows. + + Returned callable opens a short-lived session per event so + progress is visible in real time. Wrapped by ``make_safe_emit`` + before being handed to operations so emit failures never crash + the job. 
+ """ + + def raw_emit( + event: str, + message: str | None = None, + fields: dict[str, Any] | None = None, + level: str = "info", + ) -> None: + f = fields or {} + event_session = self._factory() + try: + self._emit(event_session, job_id, event, message, f, level=level) + if "_progress_current" in f or "_progress_total" in f: + j = event_session.get(Job, job_id) + if j is not None: + if "_progress_current" in f: + j.progress_current = int(f["_progress_current"]) + if "_progress_total" in f: + j.progress_total = int(f["_progress_total"]) + event_session.commit() + finally: + event_session.close() + + return raw_emit + + def _cancel_if_parent_cancelled( + self, session: Session, job: Job, job_id: UUID + ) -> bool: + if job.parent_job_id is None: + return False + parent = session.get(Job, job.parent_job_id) + if parent is None or parent.status != JobStatus.CANCELLED: + return False + job.status = JobStatus.CANCELLED + job.finished_at = utcnow() + self._emit( + session, + job_id, + "job.cancelled", + None, + {"reason": "parent_cancelled"}, + level="info", + ) + session.commit() + return True + + def _on_operation_success( + self, session: Session, job: Job, job_id: UUID, result: OperationResult + ) -> None: + if result.progress_current is not None: + job.progress_current = int(result.progress_current) + if result.progress_total is not None: + job.progress_total = int(result.progress_total) + + if result.deferred: + self._emit( + session, + job_id, + "job.dispatched", + None, + {"result": result.result}, + level="info", + ) + else: + job.status = JobStatus.SUCCEEDED + job.finished_at = utcnow() + self._emit( + session, + job_id, + "job.succeeded", + None, + {"result": result.result}, + level="info", + ) + session.commit() + + if result.publish_after_commit and self._amqp_url: + for queue_name, child_job_id in result.publish_after_commit: + publish_job(self._amqp_url, queue_name, child_job_id) + if result.publish_operations and self._amqp_url: + for queue_name, op_payload in result.publish_operations: + publish_operation(self._amqp_url, queue_name, op_payload) + + def _on_retry_later( + self, session: Session, job: Job, job_id: UUID, exc: RetryLaterError + ) -> None: + retry_count = ( + session.query(func.count(JobEvent.id)) + .filter(JobEvent.job_id == job_id, JobEvent.event == "job.retry_later") + .scalar() + or 0 + ) + delay = min(exc.delay_seconds * (2**retry_count), 600) + job.status = JobStatus.QUEUED + job.started_at = None + self._emit( + session, + job_id, + "job.retry_later", + str(exc), + {"delay_seconds": delay, "retry_count": retry_count + 1}, + level="info", + ) + session.commit() + exc.delay_seconds = delay + + def _on_operation_failure( + self, session: Session, job: Job, job_id: UUID, exc: Exception + ) -> None: + job.status = JobStatus.FAILED + job.finished_at = utcnow() + job.error_code = exc.__class__.__name__ + job.error_message = str(exc) + self._emit( + session, + job_id, + "job.failed", + str(exc), + {"error_code": job.error_code}, + level="error", + ) + if job.parent_job_id is not None: + self._maybe_fail_parent(session, job.parent_job_id) + try: + session.commit() + except Exception as commit_exc: + logger.error( + "Execute session commit failed; using fallback session. job_id=%s error=%s", + job_id, + commit_exc, + ) + self._force_fail_job(job_id, exc) + def _force_fail_job(self, job_id: UUID, original_exc: Exception) -> None: """Mark a job FAILED using a fresh session. 
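A standalone sketch of how the ``predicate`` and ``on_retry`` hooks of ``with_retry`` (added above in protea/core/retry.py) compose; ``flaky_commit`` and ``log_attempt`` are illustrative names, not part of this patch. The simulated error carries the deadlock pgcode that ``is_retryable_db_error`` recognises:

    from unittest.mock import MagicMock

    from sqlalchemy.exc import OperationalError

    from protea.core.retry import is_retryable_db_error, with_retry

    def _deadlock() -> OperationalError:
        # Same shape the tests below build: an OperationalError whose .orig carries a pgcode.
        orig = MagicMock()
        orig.pgcode = "40P01"  # deadlock_detected
        return OperationalError("stmt", {}, orig)

    calls = {"n": 0}

    def flaky_commit() -> str:
        calls["n"] += 1
        if calls["n"] < 3:  # the first two attempts hit a simulated deadlock
            raise _deadlock()
        return "committed"

    def log_attempt(attempt: int, exc: BaseException, sleep_for: float) -> None:
        print(f"attempt {attempt}: {type(exc).__name__}; retrying in {sleep_for:.2f}s")

    result = with_retry(
        flaky_commit,
        max_attempts=5,
        base_delay=0.05,
        jitter_ratio=0.2,
        predicate=is_retryable_db_error,  # retry only transient DB errors
        on_retry=log_attempt,
    )
    assert result == "committed"
    assert calls["n"] == 3
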
diff --git a/tests/test_base_worker.py b/tests/test_base_worker.py index 16175b9..b505d8c 100644 --- a/tests/test_base_worker.py +++ b/tests/test_base_worker.py @@ -188,6 +188,95 @@ def test_retry_backoff_capped_at_600(self): assert exc_info.value.delay_seconds == 600 + def test_retryable_db_error_is_retried_then_succeeds(self): + """Postgres deadlock during op.execute() retries until the op succeeds.""" + from sqlalchemy.exc import OperationalError + + job = _make_job() + session = MagicMock() + session.get.return_value = job + factory = MagicMock(return_value=session) + + # First call raises retryable OperationalError; second succeeds. + attempts = {"n": 0} + + def _execute(sess, payload, *, emit): + attempts["n"] += 1 + if attempts["n"] == 1: + orig = MagicMock() + orig.pgcode = "40P01" # deadlock_detected + err = OperationalError("stmt", {}, orig) + err.orig = orig + raise err + return OperationResult(result={"ok": True}) + + op = MagicMock() + op.name = "ping" + op.execute.side_effect = _execute + registry = OperationRegistry() + registry.register(op) + + worker = BaseWorker(factory, registry, WorkerConfig(worker_name="test")) + with patch("protea.core.retry.time.sleep"): + worker.handle_job(job.id) + + assert attempts["n"] == 2 + assert job.status == JobStatus.SUCCEEDED + + def test_retryable_db_error_max_attempts_then_fails(self): + """If retryable error keeps recurring, after max_attempts the job is FAILED.""" + from sqlalchemy.exc import OperationalError + + job = _make_job() + session = MagicMock() + session.get.return_value = job + factory = MagicMock(return_value=session) + + def _always_deadlock(sess, payload, *, emit): + orig = MagicMock() + orig.pgcode = "40P01" + err = OperationalError("stmt", {}, orig) + err.orig = orig + raise err + + op = MagicMock() + op.name = "ping" + op.execute.side_effect = _always_deadlock + registry = OperationRegistry() + registry.register(op) + + worker = BaseWorker(factory, registry, WorkerConfig(worker_name="test")) + with patch("protea.core.retry.time.sleep"): + with pytest.raises(OperationalError): + worker.handle_job(job.id) + + # 3 attempts (max_attempts=3 in handle_job). + assert op.execute.call_count == 3 + # Retry-exhausted path uses the fallback session via sa_update, + # so the in-memory mock Job is not directly mutated. Verify the + # fallback session.execute was invoked instead. + # session was reused as the factory's only fixture, so an + # update statement should have run on it. 
+ assert any( + "UPDATE" in str(call.args[0]).upper() if call.args else False + for call in session.execute.call_args_list + ) + + def test_non_retryable_error_does_not_retry(self): + """Non-infrastructure errors propagate immediately, single attempt.""" + job = _make_job() + session = MagicMock() + session.get.return_value = job + factory = MagicMock(return_value=session) + registry, op = _make_registry(raises=ValueError("bad payload")) + + worker = BaseWorker(factory, registry, WorkerConfig(worker_name="test")) + with pytest.raises(ValueError, match="bad payload"): + worker.handle_job(job.id) + + assert op.execute.call_count == 1 + assert job.status == JobStatus.FAILED + # --------------------------------------------------------------------------- # StaleJobReaper diff --git a/tests/test_retry.py b/tests/test_retry.py new file mode 100644 index 0000000..8079e7d --- /dev/null +++ b/tests/test_retry.py @@ -0,0 +1,199 @@ +"""Tests for protea.core.retry (T0.3 of master plan v3).""" + +from __future__ import annotations + +import logging +from typing import Any +from unittest.mock import MagicMock, patch + +import pytest +from sqlalchemy.exc import OperationalError + +from protea.core.retry import ( + RETRYABLE_PG_CODES, + is_retryable, + is_retryable_connection_error, + is_retryable_db_error, + with_retry, +) + + +def _make_op_error(pgcode: str | None) -> OperationalError: + """Build an OperationalError whose .orig has the given pgcode.""" + orig = MagicMock() + orig.pgcode = pgcode + err = OperationalError("stmt", {}, orig) + err.orig = orig + return err + + +class TestIsRetryableDbError: + def test_deadlock_detected_is_retryable(self) -> None: + assert is_retryable_db_error(_make_op_error("40P01")) is True + + def test_serialization_failure_is_retryable(self) -> None: + assert is_retryable_db_error(_make_op_error("40001")) is True + + def test_other_pgcode_is_not_retryable(self) -> None: + assert is_retryable_db_error(_make_op_error("23505")) is False + + def test_no_pgcode_is_not_retryable(self) -> None: + assert is_retryable_db_error(_make_op_error(None)) is False + + def test_non_operational_error_is_not_retryable(self) -> None: + assert is_retryable_db_error(ValueError("bad")) is False + + def test_known_codes_match_constant(self) -> None: + assert "40P01" in RETRYABLE_PG_CODES + assert "40001" in RETRYABLE_PG_CODES + + +class TestIsRetryableConnectionError: + def test_connection_reset(self) -> None: + assert is_retryable_connection_error(ConnectionResetError("kaboom")) is True + + def test_connection_aborted(self) -> None: + assert is_retryable_connection_error(ConnectionAbortedError("bye")) is True + + def test_timeout(self) -> None: + assert is_retryable_connection_error(TimeoutError("slow")) is True + + def test_value_error_not(self) -> None: + assert is_retryable_connection_error(ValueError("nope")) is False + + +class TestIsRetryableCombined: + def test_db_match(self) -> None: + assert is_retryable(_make_op_error("40P01")) is True + + def test_connection_match(self) -> None: + assert is_retryable(ConnectionResetError("x")) is True + + def test_unrelated_does_not_match(self) -> None: + assert is_retryable(RuntimeError("x")) is False + + +class TestWithRetry: + def test_returns_value_when_no_failure(self) -> None: + fn = MagicMock(return_value=42) + + result = with_retry(fn, "a", k="b") + + assert result == 42 + fn.assert_called_once_with("a", k="b") + + def test_retries_on_retryable_then_succeeds(self) -> None: + attempts = {"n": 0} + + def fn() -> str: + attempts["n"] += 1 + if 
attempts["n"] < 3: + raise _make_op_error("40P01") + return "ok" + + with patch("protea.core.retry.time.sleep") as mock_sleep: + result = with_retry(fn, max_attempts=5, base_delay=0.01, jitter_ratio=0) + + assert result == "ok" + assert attempts["n"] == 3 + # Two sleeps: between attempt 1 and 2, between 2 and 3. + assert mock_sleep.call_count == 2 + + def test_propagates_non_retryable(self) -> None: + fn = MagicMock(side_effect=ValueError("bad")) + + with patch("protea.core.retry.time.sleep") as mock_sleep: + with pytest.raises(ValueError, match="bad"): + with_retry(fn, max_attempts=5) + + # Non-retryable exception means no sleep, no retry. + assert fn.call_count == 1 + assert mock_sleep.call_count == 0 + + def test_propagates_after_max_attempts(self) -> None: + fn = MagicMock(side_effect=_make_op_error("40P01")) + + with patch("protea.core.retry.time.sleep") as mock_sleep: + with pytest.raises(OperationalError): + with_retry(fn, max_attempts=3, base_delay=0.01, jitter_ratio=0) + + assert fn.call_count == 3 + # Sleep happens only between attempts (so 2 sleeps for 3 attempts). + assert mock_sleep.call_count == 2 + + def test_exponential_backoff_no_jitter(self) -> None: + fn = MagicMock(side_effect=_make_op_error("40P01")) + + with patch("protea.core.retry.time.sleep") as mock_sleep: + with pytest.raises(OperationalError): + with_retry(fn, max_attempts=4, base_delay=1.0, max_delay=30.0, jitter_ratio=0) + + sleeps = [c.args[0] for c in mock_sleep.call_args_list] + # attempt 1 -> sleep 1.0, attempt 2 -> 2.0, attempt 3 -> 4.0 + assert sleeps == [1.0, 2.0, 4.0] + + def test_max_delay_caps_backoff(self) -> None: + fn = MagicMock(side_effect=_make_op_error("40P01")) + + with patch("protea.core.retry.time.sleep") as mock_sleep: + with pytest.raises(OperationalError): + with_retry(fn, max_attempts=8, base_delay=1.0, max_delay=5.0, jitter_ratio=0) + + sleeps = [c.args[0] for c in mock_sleep.call_args_list] + # 1, 2, 4, 5 (capped), 5, 5, 5 + assert sleeps == [1.0, 2.0, 4.0, 5.0, 5.0, 5.0, 5.0] + + def test_jitter_keeps_sleep_within_band(self) -> None: + fn = MagicMock(side_effect=_make_op_error("40P01")) + captured: list[float] = [] + + with patch( + "protea.core.retry.time.sleep", side_effect=lambda d: captured.append(d) + ): + with pytest.raises(OperationalError): + with_retry(fn, max_attempts=4, base_delay=1.0, max_delay=30.0, jitter_ratio=0.5) + + # First sleep base = 1.0; with jitter 0.5, range is [0.5, 1.5]. + assert 0.5 <= captured[0] <= 1.5 + # Second sleep base = 2.0; range [1.0, 3.0]. 
+ assert 1.0 <= captured[1] <= 3.0 + + def test_calls_on_retry_callback(self) -> None: + events: list[tuple[int, str, float]] = [] + + def cb(attempt: int, exc: BaseException, sleep_for: float) -> None: + events.append((attempt, type(exc).__name__, sleep_for)) + + fn = MagicMock(side_effect=[_make_op_error("40P01"), "ok"]) + + with patch("protea.core.retry.time.sleep"): + result = with_retry(fn, max_attempts=2, base_delay=0.01, jitter_ratio=0, on_retry=cb) + + assert result == "ok" + assert len(events) == 1 + assert events[0][0] == 1 + assert events[0][1] == "OperationalError" + + def test_logs_warning_on_default_callback(self, caplog: pytest.LogCaptureFixture) -> None: + fn = MagicMock(side_effect=[_make_op_error("40001"), "ok"]) + + with caplog.at_level(logging.WARNING, logger="protea.retry"): + with patch("protea.core.retry.time.sleep"): + with_retry(fn, max_attempts=2, base_delay=0.01, jitter_ratio=0) + + relevant = [r for r in caplog.records if r.name == "protea.retry"] + assert relevant, "expected at least one log under protea.retry" + assert relevant[0].levelno == logging.WARNING + assert relevant[0].pgcode == "40001" + + def test_max_attempts_zero_raises(self) -> None: + fn = MagicMock() + with pytest.raises(ValueError, match="max_attempts must be"): + with_retry(fn, max_attempts=0) + + def test_does_not_swallow_keyboard_interrupt(self) -> None: + def fn() -> Any: + raise KeyboardInterrupt + + with pytest.raises(KeyboardInterrupt): + with_retry(fn, max_attempts=3, base_delay=0.01) From bfbda031642b840fcd4f44e23a975afaafa558d0 Mon Sep 17 00:00:00 2001 From: frapercan Date: Tue, 5 May 2026 16:35:59 +0200 Subject: [PATCH 46/73] refactor(progress): T0.7 dedup _update_parent_progress The two store_X operations had identical 30-line implementations of _update_parent_progress (compute_embeddings.py and predict_go_terms.py), differing only in the operation-specific event name passed to emit on parent SUCCEEDED transition. Extracts to protea.core.contracts.parent_progress.update_parent_progress( session, parent_job_id, emit, *, event_name). Both operations now delegate. The DB-level JobEvent row is uniformly named "job.succeeded"; the operation-specific event name only flows through emit() so downstream observers can distinguish which store closed the parent. 5 new tests cover: silent when no row, silent when not last batch, SUCCEEDED transition + emit, race when succeeded returns nothing, event_name passthrough. Suite: 1093 passed, 10 skipped (was 1088 + 5). Part of F0 T0.7 of master plan v3. --- protea/core/contracts/parent_progress.py | 70 +++++++++++ protea/core/operations/compute_embeddings.py | 39 ++---- protea/core/operations/predict_go_terms.py | 39 ++---- tests/test_parent_progress.py | 120 +++++++++++++++++++ 4 files changed, 204 insertions(+), 64 deletions(-) create mode 100644 protea/core/contracts/parent_progress.py create mode 100644 tests/test_parent_progress.py diff --git a/protea/core/contracts/parent_progress.py b/protea/core/contracts/parent_progress.py new file mode 100644 index 0000000..d6fafa9 --- /dev/null +++ b/protea/core/contracts/parent_progress.py @@ -0,0 +1,70 @@ +"""Shared helper for child operations that report progress to a parent job. + +Used by ``store_embeddings`` and ``store_predictions``: each child +batch increments the parent's ``progress_current`` and, if it was +the last batch, transitions the parent from RUNNING to SUCCEEDED. 
+""" + +from __future__ import annotations + +from uuid import UUID + +from sqlalchemy import update as sa_update +from sqlalchemy.orm import Session + +from protea.core.contracts.operation import EmitFn +from protea.core.utils import utcnow +from protea.infrastructure.orm.models.job import Job, JobEvent, JobStatus + + +def update_parent_progress( + session: Session, + parent_job_id: UUID, + emit: EmitFn, + *, + event_name: str, +) -> None: + """Atomically increment parent progress; mark SUCCEEDED if last batch. + + Returns silently if the parent has not yet seen all its batches + (``progress_current < progress_total``) or if no longer RUNNING. + + The ``event_name`` is the operation-specific suffix used in the + ``emit`` call when the parent transitions to SUCCEEDED, e.g. + ``"store_embeddings.parent_succeeded"`` or + ``"store_predictions.parent_succeeded"``. The DB-level event row is + always written as ``job.succeeded`` so downstream consumers see a + uniform name regardless of which child closed the parent. + """ + row = session.execute( + sa_update(Job) + .where(Job.id == parent_job_id, Job.status == JobStatus.RUNNING) + .values(progress_current=Job.progress_current + 1) + .returning(Job.progress_current, Job.progress_total) + ).fetchone() + + if row is None or row.progress_current != row.progress_total: + return + + closed = session.execute( + sa_update(Job) + .where(Job.id == parent_job_id, Job.status == JobStatus.RUNNING) + .values(status=JobStatus.SUCCEEDED, finished_at=utcnow()) + .returning(Job.id) + ).fetchone() + + if closed: + session.add( + JobEvent( + job_id=parent_job_id, + event="job.succeeded", + fields={"via": "last_batch_stored"}, + level="info", + ) + ) + emit( + event_name, + None, + {"parent_job_id": str(parent_job_id)}, + "info", + ) diff --git a/protea/core/operations/compute_embeddings.py b/protea/core/operations/compute_embeddings.py index c02c40e..dc638af 100644 --- a/protea/core/operations/compute_embeddings.py +++ b/protea/core/operations/compute_embeddings.py @@ -15,6 +15,7 @@ from sqlalchemy.orm import Session from protea.core.contracts.operation import EmitFn, OperationResult, ProteaPayload, RetryLaterError +from protea.core.contracts.parent_progress import update_parent_progress from protea.core.utils import utcnow from protea.infrastructure.orm.models.embedding.embedding_config import EmbeddingConfig from protea.infrastructure.orm.models.embedding.sequence_embedding import SequenceEmbedding @@ -588,38 +589,12 @@ def execute( ) def _update_parent_progress(self, session: Session, parent_job_id: UUID, emit: EmitFn) -> None: - row = session.execute( - sa_update(Job) - .where(Job.id == parent_job_id, Job.status == JobStatus.RUNNING) - .values(progress_current=Job.progress_current + 1) - .returning(Job.progress_current, Job.progress_total) - ).fetchone() - - if row is None or row.progress_current != row.progress_total: - return - - closed = session.execute( - sa_update(Job) - .where(Job.id == parent_job_id, Job.status == JobStatus.RUNNING) - .values(status=JobStatus.SUCCEEDED, finished_at=utcnow()) - .returning(Job.id) - ).fetchone() - - if closed: - session.add( - JobEvent( - job_id=parent_job_id, - event="job.succeeded", - fields={"via": "last_batch_stored"}, - level="info", - ) - ) - emit( - "store_embeddings.parent_succeeded", - None, - {"parent_job_id": str(parent_job_id)}, - "info", - ) + update_parent_progress( + session, + parent_job_id, + emit, + event_name="store_embeddings.parent_succeeded", + ) # 
--------------------------------------------------------------------------- diff --git a/protea/core/operations/predict_go_terms.py b/protea/core/operations/predict_go_terms.py index 35814eb..93487af 100644 --- a/protea/core/operations/predict_go_terms.py +++ b/protea/core/operations/predict_go_terms.py @@ -14,6 +14,7 @@ from protea.core.annotation_intern import intern_string from protea.core.contracts.operation import EmitFn, OperationResult, ProteaPayload +from protea.core.contracts.parent_progress import update_parent_progress from protea.core.disk_cache import ( _aspect_index_path, _build_anno_csr, @@ -2015,35 +2016,9 @@ def execute( return OperationResult(result={"predictions_inserted": len(p.predictions)}) def _update_parent_progress(self, session: Session, parent_job_id: UUID, emit: EmitFn) -> None: - row = session.execute( - sa_update(Job) - .where(Job.id == parent_job_id, Job.status == JobStatus.RUNNING) - .values(progress_current=Job.progress_current + 1) - .returning(Job.progress_current, Job.progress_total) - ).fetchone() - - if row is None or row.progress_current != row.progress_total: - return - - closed = session.execute( - sa_update(Job) - .where(Job.id == parent_job_id, Job.status == JobStatus.RUNNING) - .values(status=JobStatus.SUCCEEDED, finished_at=utcnow()) - .returning(Job.id) - ).fetchone() - - if closed: - session.add( - JobEvent( - job_id=parent_job_id, - event="job.succeeded", - fields={"via": "last_batch_stored"}, - level="info", - ) - ) - emit( - "store_predictions.parent_succeeded", - None, - {"parent_job_id": str(parent_job_id)}, - "info", - ) + update_parent_progress( + session, + parent_job_id, + emit, + event_name="store_predictions.parent_succeeded", + ) diff --git a/tests/test_parent_progress.py b/tests/test_parent_progress.py new file mode 100644 index 0000000..ca80b43 --- /dev/null +++ b/tests/test_parent_progress.py @@ -0,0 +1,120 @@ +"""Tests for protea.core.contracts.parent_progress (T0.7).""" + +from __future__ import annotations + +from unittest.mock import MagicMock +from uuid import uuid4 + +from protea.core.contracts.parent_progress import update_parent_progress + + +def _row(current: int, total: int) -> MagicMock: + r = MagicMock() + r.progress_current = current + r.progress_total = total + return r + + +class TestUpdateParentProgress: + def test_silent_when_no_row_returned(self) -> None: + session = MagicMock() + session.execute.return_value.fetchone.return_value = None + emit = MagicMock() + + update_parent_progress( + session, + uuid4(), + emit, + event_name="store_x.parent_succeeded", + ) + + # Only the increment statement was executed. + assert session.execute.call_count == 1 + emit.assert_not_called() + session.add.assert_not_called() + + def test_returns_when_not_last_batch(self) -> None: + session = MagicMock() + session.execute.return_value.fetchone.return_value = _row(2, 5) + emit = MagicMock() + + update_parent_progress( + session, + uuid4(), + emit, + event_name="store_x.parent_succeeded", + ) + + assert session.execute.call_count == 1 + emit.assert_not_called() + session.add.assert_not_called() + + def test_marks_succeeded_on_last_batch(self) -> None: + parent_id = uuid4() + session = MagicMock() + # First execute (the increment) returns last-batch progress; + # second execute (the SUCCEEDED transition) returns the closed row. 
+ increment = MagicMock() + increment.fetchone.return_value = _row(5, 5) + succeeded = MagicMock() + succeeded.fetchone.return_value = MagicMock(id=parent_id) + session.execute.side_effect = [increment, succeeded] + emit = MagicMock() + + update_parent_progress( + session, + parent_id, + emit, + event_name="store_predictions.parent_succeeded", + ) + + assert session.execute.call_count == 2 + # JobEvent row added once. + session.add.assert_called_once() + # Custom event emitted with the operation-specific name. + emit.assert_called_once() + emit_call = emit.call_args + assert emit_call.args[0] == "store_predictions.parent_succeeded" + assert emit_call.args[2] == {"parent_job_id": str(parent_id)} + + def test_emit_skipped_when_succeeded_returns_no_row(self) -> None: + # Race: another worker finished before us. Our SUCCEEDED update + # returns no row because the parent is no longer RUNNING. + session = MagicMock() + increment = MagicMock() + increment.fetchone.return_value = _row(5, 5) + succeeded = MagicMock() + succeeded.fetchone.return_value = None + session.execute.side_effect = [increment, succeeded] + emit = MagicMock() + + update_parent_progress( + session, + uuid4(), + emit, + event_name="store_x.parent_succeeded", + ) + + assert session.execute.call_count == 2 + session.add.assert_not_called() + emit.assert_not_called() + + def test_event_name_is_passed_through(self) -> None: + parent_id = uuid4() + session = MagicMock() + increment = MagicMock() + increment.fetchone.return_value = _row(1, 1) + succeeded = MagicMock() + succeeded.fetchone.return_value = MagicMock(id=parent_id) + session.execute.side_effect = [increment, succeeded] + emit = MagicMock() + + update_parent_progress( + session, + parent_id, + emit, + event_name="custom.parent_succeeded", + ) + + emit.assert_called_once() + assert emit.call_args.args[0] == "custom.parent_succeeded" From 630432be860c72a5131ffe30fd84f4ffced2ebd3 Mon Sep 17 00:00:00 2001 From: frapercan Date: Tue, 5 May 2026 17:07:31 +0200 Subject: [PATCH 47/73] refactor(uniprot): T0.9 UniProtHttpMixin to composable client Replaces the inheritance-based UniProtHttpMixin (109 LOC of mixed state and behaviour) with UniProtHttpClient, a composable class. Operations hold a client instance via composition rather than inheritance: before: class InsertProteinsOperation(UniProtHttpMixin, Operation) after: class InsertProteinsOperation(Operation): def __init__(self): self._http_client = UniProtHttpClient() State is private to the client (.session, .requests, .retries) and is reset via .reset() at the start of each execute(). The extract_next_cursor utility moves to a @staticmethod since it has no instance state. Operations call: self._http_client.get_with_retries(url, p, emit) self._http_client.extract_next_cursor(link_header) self._http_client.requests / .retries (for emit fields) Migrates two operations (insert_proteins, fetch_uniprot_metadata) and three test files (test_core, test_insert_proteins, test_fetch_uniprot_metadata) accordingly. test_core renames the test class to TestUniProtHttpClient and adds test_reset_clears_counters. Suite: 1094 passed, 10 skipped (was 1093 + 1). Part of F0 T0.9 of master plan v3. 
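A standalone sketch of the composed client with a mocked session (no real HTTP); the URL, the Retry-After value and the "protea-sketch" user agent are illustrative only, and the payload stands in for any object exposing the HTTP-tuning fields the operations already validate:

    from unittest.mock import MagicMock, patch

    from protea.core.utils import UniProtHttpClient

    client = UniProtHttpClient()
    client.session = MagicMock()  # swap the requests.Session for a mock

    throttled = MagicMock(status_code=429, headers={"Retry-After": "2"})
    ok = MagicMock(status_code=200, headers={})
    client.session.get.side_effect = [throttled, ok]

    payload = MagicMock(
        user_agent="protea-sketch",
        timeout_seconds=30,
        max_retries=3,
        backoff_max_seconds=30.0,
    )

    with patch("protea.core.utils.time.sleep"):  # skip the real Retry-After wait
        resp = client.get_with_retries(
            "https://rest.uniprot.org/uniprotkb/search", payload, lambda *args: None
        )

    assert resp is ok
    assert (client.requests, client.retries) == (2, 1)
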
--- .../core/operations/fetch_uniprot_metadata.py | 23 +++--- protea/core/operations/insert_proteins.py | 27 +++---- protea/core/utils.py | 52 ++++++++---- tests/test_core.py | 79 ++++++++++--------- tests/test_fetch_uniprot_metadata.py | 12 +-- tests/test_insert_proteins.py | 34 ++++---- 6 files changed, 122 insertions(+), 105 deletions(-) diff --git a/protea/core/operations/fetch_uniprot_metadata.py b/protea/core/operations/fetch_uniprot_metadata.py index 2ed7e09..0908c65 100644 --- a/protea/core/operations/fetch_uniprot_metadata.py +++ b/protea/core/operations/fetch_uniprot_metadata.py @@ -14,7 +14,7 @@ from sqlalchemy.orm import Session from protea.core.contracts.operation import EmitFn, OperationResult, ProteaPayload -from protea.core.utils import UniProtHttpMixin, chunks +from protea.core.utils import UniProtHttpClient, chunks from protea.infrastructure.orm.models.protein.protein import Protein from protea.infrastructure.orm.models.protein.protein_metadata import ProteinUniProtMetadata @@ -44,7 +44,7 @@ def must_be_non_empty(cls, v: str) -> str: return v.strip() -class FetchUniProtMetadataOperation(UniProtHttpMixin): +class FetchUniProtMetadataOperation: """Fetches functional annotations from UniProt (TSV) and upserts ProteinUniProtMetadata rows. One metadata row is stored per canonical accession. Isoforms share the same @@ -96,16 +96,13 @@ def summarize_payload(self, payload: dict[str, Any]) -> str: } def __init__(self) -> None: - self._http_requests = 0 - self._http_retries = 0 + self._http_client = UniProtHttpClient() self._total_results: int | None = None - self._http = requests.Session() def execute( self, session: Session, payload: dict[str, Any], *, emit: EmitFn ) -> OperationResult: - self._http_requests = 0 - self._http_retries = 0 + self._http_client.reset() self._total_results = None p = FetchUniProtMetadataPayload.model_validate(payload) @@ -147,8 +144,8 @@ def execute( "rows_total": total_rows, "proteins_touched_total": proteins_touched, "metadata_upserted_total": metadata_upserted, - "http_requests": self._http_requests, - "http_retries": self._http_retries, + "http_requests": self._http_client.requests, + "http_retries": self._http_client.retries, "_progress_current": total_rows, **( {"_progress_total": p.total_limit or self._total_results} @@ -177,8 +174,8 @@ def execute( "rows": total_rows, "proteins_touched": proteins_touched, "metadata_upserted": metadata_upserted, - "http_requests": self._http_requests, - "http_retries": self._http_retries, + "http_requests": self._http_client.requests, + "http_retries": self._http_client.retries, "elapsed_seconds": elapsed, } emit("fetch_uniprot_metadata.done", None, result, "info") @@ -241,7 +238,7 @@ def _fetch_tsv_pages( "info", ) - resp = self._get_with_retries(url, p, emit) + resp = self._http_client.get_with_retries(url, p, emit) if self._total_results is None: try: self._total_results = int(resp.headers.get("X-Total-Results", 0)) or None @@ -253,7 +250,7 @@ def _fetch_tsv_pages( emit("uniprot.fetch_page_done", None, {"page": page, "rows": len(rows)}, "info") yield rows - next_cursor = self._extract_next_cursor(resp.headers.get("link", "")) + next_cursor = self._http_client.extract_next_cursor(resp.headers.get("link", "")) if not next_cursor: break diff --git a/protea/core/operations/insert_proteins.py b/protea/core/operations/insert_proteins.py index 13da9af..e1aacf1 100644 --- a/protea/core/operations/insert_proteins.py +++ b/protea/core/operations/insert_proteins.py @@ -15,7 +15,7 @@ from sqlalchemy.orm import Session 
from protea.core.contracts.operation import EmitFn, Operation, OperationResult, ProteaPayload -from protea.core.utils import UniProtHttpMixin, chunks +from protea.core.utils import UniProtHttpClient, chunks from protea.infrastructure.orm.models.protein.protein import Protein from protea.infrastructure.orm.models.sequence.sequence import Sequence as SequenceModel @@ -44,7 +44,7 @@ def must_be_non_empty(cls, v: str) -> str: return v.strip() -class InsertProteinsOperation(UniProtHttpMixin, Operation): +class InsertProteinsOperation(Operation): """Fetches protein sequences from UniProt (FASTA) and upserts them into the DB. Uses cursor-based pagination, exponential backoff with jitter, and MD5-based @@ -78,16 +78,13 @@ def summarize_payload(self, payload: dict[str, Any]) -> str: _re_gn = re.compile(r"\bGN=([^\s]+)") def __init__(self) -> None: - self._http_requests = 0 - self._http_retries = 0 + self._http_client = UniProtHttpClient() self._total_results: int | None = None - self._http = requests.Session() def execute( self, session: Session, payload: dict[str, Any], *, emit: EmitFn ) -> OperationResult: - self._http_requests = 0 - self._http_retries = 0 + self._http_client.reset() self._total_results = None p = InsertProteinsPayload.model_validate(payload) @@ -137,8 +134,8 @@ def execute( "proteins_updated_total": proteins_updated, "sequences_inserted_total": sequences_inserted, "sequences_reused_total": sequences_reused, - "http_requests": self._http_requests, - "http_retries": self._http_retries, + "http_requests": self._http_client.requests, + "http_retries": self._http_client.retries, "_progress_current": retrieved, **( {"_progress_total": p.total_limit or self._total_results} @@ -167,8 +164,8 @@ def execute( "proteins_updated": proteins_updated, "sequences_inserted": sequences_inserted, "sequences_reused": sequences_reused, - "http_requests": self._http_requests, - "http_retries": self._http_retries, + "http_requests": self._http_client.requests, + "http_retries": self._http_client.retries, "elapsed_seconds": elapsed, }, "info", @@ -183,8 +180,8 @@ def execute( "proteins_updated": proteins_updated, "sequences_inserted": sequences_inserted, "sequences_reused": sequences_reused, - "http_requests": self._http_requests, - "http_retries": self._http_retries, + "http_requests": self._http_client.requests, + "http_retries": self._http_client.retries, "elapsed_seconds": elapsed, } ) @@ -214,7 +211,7 @@ def _fetch_fasta_pages( "info", ) - resp = self._get_with_retries(url, p, emit) + resp = self._http_client.get_with_retries(url, p, emit) if self._total_results is None: try: self._total_results = int(resp.headers.get("X-Total-Results", 0)) or None @@ -226,7 +223,7 @@ def _fetch_fasta_pages( emit("uniprot.fetch_page_done", None, {"page": page, "records": len(records)}, "info") yield records - next_cursor = self._extract_next_cursor(resp.headers.get("link", "")) + next_cursor = self._http_client.extract_next_cursor(resp.headers.get("link", "")) if not next_cursor: break diff --git a/protea/core/utils.py b/protea/core/utils.py index 4811c1f..38625ac 100644 --- a/protea/core/utils.py +++ b/protea/core/utils.py @@ -35,31 +35,52 @@ class _HttpPayload(Protocol): jitter_seconds: float -class UniProtHttpMixin: - """Shared HTTP retry logic for UniProt REST API operations. +class UniProtHttpClient: + """Composable HTTP client with retries, used by UniProt REST operations. 
- Requires the subclass ``__init__`` to set: - self._http_requests: int = 0 - self._http_retries: int = 0 - self._http: requests.Session = requests.Session() + Replaces the historical ``UniProtHttpMixin`` (favours composition + over inheritance). Operations instantiate one client per execution + and read counters from ``client.requests`` / ``client.retries``. + + Example:: + + class InsertProteinsOperation: + def __init__(self) -> None: + self._http_client = UniProtHttpClient() + + def execute(self, session, payload, *, emit): + self._http_client.reset() + ... + resp = self._http_client.get_with_retries(url, p, emit) + cursor = self._http_client.extract_next_cursor(link) """ - _http: requests.Session - _http_requests: int - _http_retries: int + def __init__(self) -> None: + self.session: requests.Session = requests.Session() + self.requests: int = 0 + self.retries: int = 0 + + def reset(self) -> None: + """Reset request/retry counters before a new execution. + + The underlying ``requests.Session`` is reused across executions + so connection pooling stays effective. + """ + self.requests = 0 + self.retries = 0 - def _get_with_retries(self, url: str, p: _HttpPayload, emit: EmitFn) -> Response: + def get_with_retries(self, url: str, p: _HttpPayload, emit: EmitFn) -> Response: headers = {"User-Agent": p.user_agent} attempt = 0 while True: attempt += 1 - self._http_requests += 1 + self.requests += 1 try: - resp = self._http.get(url, timeout=p.timeout_seconds, headers=headers) + resp = self.session.get(url, timeout=p.timeout_seconds, headers=headers) except requests.RequestException as e: if attempt > p.max_retries: raise - self._http_retries += 1 + self.retries += 1 self._sleep_backoff( p, attempt, emit, reason=f"request_exception:{e.__class__.__name__}" ) @@ -71,7 +92,7 @@ def _get_with_retries(self, url: str, p: _HttpPayload, emit: EmitFn) -> Response if resp.status_code in (429, 500, 502, 503, 504): if attempt > p.max_retries: resp.raise_for_status() - self._http_retries += 1 + self.retries += 1 retry_after = resp.headers.get("Retry-After") if retry_after and retry_after.isdigit(): wait_s = min(float(retry_after), p.backoff_max_seconds) @@ -99,7 +120,8 @@ def _sleep_backoff(self, p: _HttpPayload, attempt: int, emit: EmitFn, reason: st ) time.sleep(wait_s) - def _extract_next_cursor(self, link_header: str) -> str | None: + @staticmethod + def extract_next_cursor(link_header: str) -> str | None: if not link_header or 'rel="next"' not in link_header or "cursor=" not in link_header: return None try: diff --git a/tests/test_core.py b/tests/test_core.py index 32659a4..0bf5034 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -20,7 +20,7 @@ FetchUniProtMetadataPayload, ) from protea.core.operations.ping import PingOperation -from protea.core.utils import UniProtHttpMixin, chunks +from protea.core.utils import UniProtHttpClient, chunks # --------------------------------------------------------------------------- # OperationRegistry @@ -103,7 +103,7 @@ def test_empty_seq(self) -> None: # --------------------------------------------------------------------------- -# UniProtHttpMixin +# UniProtHttpClient # --------------------------------------------------------------------------- @@ -118,89 +118,90 @@ def _make_payload(max_retries=3, backoff_base=0.01, backoff_max=0.1, jitter=0.0) return p -class _ConcreteHttp(UniProtHttpMixin): - def __init__(self): - self._http_requests = 0 - self._http_retries = 0 - self._http = MagicMock() +def _make_client() -> UniProtHttpClient: + client = 
UniProtHttpClient() + client.session = MagicMock() + return client def _noop_emit(*_): return None -class TestUniProtHttpMixin: - def _obj(self) -> _ConcreteHttp: - return _ConcreteHttp() - +class TestUniProtHttpClient: def test_returns_response_on_200(self) -> None: - obj = self._obj() + client = _make_client() resp = MagicMock() resp.status_code = 200 - obj._http.get.return_value = resp - result = obj._get_with_retries("http://x", _make_payload(), _noop_emit) + client.session.get.return_value = resp + result = client.get_with_retries("http://x", _make_payload(), _noop_emit) assert result is resp def test_retries_on_429(self) -> None: - obj = self._obj() + client = _make_client() bad = MagicMock() bad.status_code = 429 bad.headers = {} good = MagicMock() good.status_code = 200 - obj._http.get.side_effect = [bad, good] + client.session.get.side_effect = [bad, good] with patch("protea.core.utils.time.sleep"): - result = obj._get_with_retries("http://x", _make_payload(), _noop_emit) + result = client.get_with_retries("http://x", _make_payload(), _noop_emit) assert result is good - assert obj._http_retries == 1 + assert client.retries == 1 def test_uses_retry_after_header(self) -> None: - obj = self._obj() + client = _make_client() bad = MagicMock() bad.status_code = 429 bad.headers = {"Retry-After": "5"} good = MagicMock() good.status_code = 200 - obj._http.get.side_effect = [bad, good] + client.session.get.side_effect = [bad, good] sleep_calls = [] with patch("protea.core.utils.time.sleep", side_effect=sleep_calls.append): - obj._get_with_retries("http://x", _make_payload(backoff_max=30.0), _noop_emit) + client.get_with_retries("http://x", _make_payload(backoff_max=30.0), _noop_emit) assert len(sleep_calls) == 1 assert sleep_calls[0] == pytest.approx(5.0) def test_raises_after_max_retries(self) -> None: - obj = self._obj() + client = _make_client() bad = MagicMock() bad.status_code = 503 bad.headers = {} bad.raise_for_status.side_effect = requests.HTTPError("503") - obj._http.get.return_value = bad + client.session.get.return_value = bad with patch("protea.core.utils.time.sleep"): with pytest.raises(requests.HTTPError): - obj._get_with_retries("http://x", _make_payload(max_retries=2), _noop_emit) + client.get_with_retries("http://x", _make_payload(max_retries=2), _noop_emit) def test_retries_on_network_exception(self) -> None: - obj = self._obj() + client = _make_client() good = MagicMock() good.status_code = 200 - obj._http.get.side_effect = [requests.ConnectionError("down"), good] + client.session.get.side_effect = [requests.ConnectionError("down"), good] with patch("protea.core.utils.time.sleep"): - result = obj._get_with_retries("http://x", _make_payload(), _noop_emit) + result = client.get_with_retries("http://x", _make_payload(), _noop_emit) assert result is good + def test_reset_clears_counters(self) -> None: + client = _make_client() + client.requests = 7 + client.retries = 3 + client.reset() + assert client.requests == 0 + assert client.retries == 0 + def test_extract_next_cursor_present(self) -> None: - obj = self._obj() header = '; rel="next"' - assert obj._extract_next_cursor(header) == "ABCD1234" + assert UniProtHttpClient.extract_next_cursor(header) == "ABCD1234" def test_extract_next_cursor_absent(self) -> None: - obj = self._obj() - assert obj._extract_next_cursor("") is None - assert obj._extract_next_cursor('; rel="prev"') is None + assert UniProtHttpClient.extract_next_cursor("") is None + assert UniProtHttpClient.extract_next_cursor('; rel="prev"') is None def 
test_extract_next_cursor_no_cursor_param(self) -> None: - obj = self._obj() - assert obj._extract_next_cursor('; rel="next"') is None + assert UniProtHttpClient.extract_next_cursor('; rel="next"') is None # --------------------------------------------------------------------------- @@ -299,7 +300,7 @@ def _make_tsv_content(rows: list[dict[str, str]], compressed: bool = True) -> by class TestFetchUniProtMetadataExecute: def _make_op(self): op = FetchUniProtMetadataOperation() - op._http = MagicMock() + op._http_client.session = MagicMock() return op def test_execute_empty_page_continues(self): @@ -315,7 +316,7 @@ def emit(event, message, fields, level): resp.status_code = 200 resp.headers = {"X-Total-Results": "0"} resp.content = _make_tsv_content([], compressed=True) - op._http.get.return_value = resp + op._http_client.session.get.return_value = resp session = MagicMock() payload = {"search_criteria": "organism_id:9606", "page_size": 10} @@ -345,7 +346,7 @@ def test_execute_total_limit_truncation(self): resp.status_code = 200 resp.headers = {"X-Total-Results": "5"} resp.content = _make_tsv_content(rows, compressed=True) - op._http.get.return_value = resp + op._http_client.session.get.return_value = resp session = MagicMock() session.query.return_value.filter.return_value.all.return_value = [] @@ -388,7 +389,7 @@ def test_execute_total_limit_zero_after_truncation(self): ) resp2.content = _make_tsv_content(rows2, compressed=True) - op._http.get.side_effect = [resp1, resp2] + op._http_client.session.get.side_effect = [resp1, resp2] session = MagicMock() session.query.return_value.filter.return_value.all.return_value = [] @@ -411,7 +412,7 @@ def test_x_total_results_none_on_invalid_header(self): resp.status_code = 200 resp.headers = {"X-Total-Results": "not-a-number"} resp.content = _make_tsv_content([], compressed=True) - op._http.get.return_value = resp + op._http_client.session.get.return_value = resp session = MagicMock() payload = {"search_criteria": "test", "page_size": 10} diff --git a/tests/test_fetch_uniprot_metadata.py b/tests/test_fetch_uniprot_metadata.py index 6fd1f0b..d844dfe 100644 --- a/tests/test_fetch_uniprot_metadata.py +++ b/tests/test_fetch_uniprot_metadata.py @@ -185,7 +185,7 @@ def test_execute_returns_operation_result(self): session = self._mock_session() emit = _capturing_emit() - with patch.object(self.op._http, "get", return_value=_make_mock_response(TSV_RESPONSE)): + with patch.object(self.op._http_client.session, "get", return_value=_make_mock_response(TSV_RESPONSE)): result = self.op.execute( session, {"search_criteria": "organism_id:9606", "page_size": 1, "compressed": False}, @@ -201,7 +201,7 @@ def test_execute_emits_start_and_done(self): session = self._mock_session() emit = _capturing_emit() - with patch.object(self.op._http, "get", return_value=_make_mock_response(TSV_RESPONSE)): + with patch.object(self.op._http_client.session, "get", return_value=_make_mock_response(TSV_RESPONSE)): self.op.execute( session, {"search_criteria": "organism_id:9606", "compressed": False}, @@ -223,7 +223,7 @@ def test_execute_respects_total_limit(self): session = self._mock_session() emit = _capturing_emit() - with patch.object(self.op._http, "get", return_value=_make_mock_response(tsv)): + with patch.object(self.op._http_client.session, "get", return_value=_make_mock_response(tsv)): result = self.op.execute( session, {"search_criteria": "q", "total_limit": 1, "compressed": False}, @@ -241,7 +241,7 @@ def test_execute_inserts_metadata_row(self): session = self._mock_session() 
emit = _noop_emit - with patch.object(self.op._http, "get", return_value=_make_mock_response(TSV_RESPONSE)): + with patch.object(self.op._http_client.session, "get", return_value=_make_mock_response(TSV_RESPONSE)): self.op.execute( session, {"search_criteria": "q", "compressed": False}, @@ -266,7 +266,7 @@ def test_fetch_uniprot_metadata_integration(postgres_url: str): emit = _capturing_emit() with Session(engine, future=True) as session: - with patch.object(op._http, "get", return_value=_make_mock_response(TSV_RESPONSE)): + with patch.object(op._http_client.session, "get", return_value=_make_mock_response(TSV_RESPONSE)): result = op.execute( session, { @@ -284,7 +284,7 @@ def test_fetch_uniprot_metadata_integration(postgres_url: str): # Second run with same data → upsert should not double-insert op2 = FetchUniProtMetadataOperation() with Session(engine, future=True) as session: - with patch.object(op2._http, "get", return_value=_make_mock_response(TSV_RESPONSE)): + with patch.object(op2._http_client.session, "get", return_value=_make_mock_response(TSV_RESPONSE)): result2 = op2.execute( session, { diff --git a/tests/test_insert_proteins.py b/tests/test_insert_proteins.py index 56ef3da..5fb4e7a 100644 --- a/tests/test_insert_proteins.py +++ b/tests/test_insert_proteins.py @@ -397,7 +397,7 @@ def test_execute_returns_operation_result(self): session = _make_mock_session() emit = _capturing_emit() - with patch.object(self.op._http, "get", return_value=_make_mock_response(FASTA_ONE)): + with patch.object(self.op._http_client.session, "get", return_value=_make_mock_response(FASTA_ONE)): result = self.op.execute( session, {"search_criteria": "organism_id:9606", "compressed": False}, @@ -414,7 +414,7 @@ def test_execute_emits_start_and_done(self): session = _make_mock_session() emit = _capturing_emit() - with patch.object(self.op._http, "get", return_value=_make_mock_response(FASTA_ONE)): + with patch.object(self.op._http_client.session, "get", return_value=_make_mock_response(FASTA_ONE)): self.op.execute( session, {"search_criteria": "q", "compressed": False}, @@ -429,7 +429,7 @@ def test_execute_respects_total_limit(self): session = _make_mock_session() emit = _capturing_emit() - with patch.object(self.op._http, "get", return_value=_make_mock_response(FASTA_TWO)): + with patch.object(self.op._http_client.session, "get", return_value=_make_mock_response(FASTA_TWO)): result = self.op.execute( session, {"search_criteria": "q", "total_limit": 1, "compressed": False}, @@ -443,7 +443,7 @@ def test_execute_respects_total_limit(self): def test_execute_calls_session_add_all_for_new_protein(self): session = _make_mock_session() - with patch.object(self.op._http, "get", return_value=_make_mock_response(FASTA_ONE)): + with patch.object(self.op._http_client.session, "get", return_value=_make_mock_response(FASTA_ONE)): self.op.execute( session, {"search_criteria": "q", "compressed": False}, @@ -456,7 +456,7 @@ def test_two_records_counts_correctly(self): session = _make_mock_session() emit = _capturing_emit() - with patch.object(self.op._http, "get", return_value=_make_mock_response(FASTA_TWO)): + with patch.object(self.op._http_client.session, "get", return_value=_make_mock_response(FASTA_TWO)): result = self.op.execute( session, {"search_criteria": "q", "compressed": False}, @@ -472,7 +472,7 @@ def test_empty_page_continues(self): emit = _capturing_emit() # First response is empty FASTA, no link header → single page with 0 records empty_resp = _make_mock_response("") - with patch.object(self.op._http, 
"get", return_value=empty_resp): + with patch.object(self.op._http_client.session, "get", return_value=empty_resp): result = self.op.execute( session, {"search_criteria": "q", "compressed": False}, @@ -502,7 +502,7 @@ def get_side_effect(*args, **kwargs): return page1_resp return page2_resp - with patch.object(self.op._http, "get", side_effect=get_side_effect): + with patch.object(self.op._http_client.session, "get", side_effect=get_side_effect): result = self.op.execute( session, {"search_criteria": "q", "total_limit": 2, "compressed": False}, @@ -530,7 +530,7 @@ def test_compressed_param_appended(self): resp.headers = {"link": ""} resp.raise_for_status = MagicMock() - with patch.object(self.op._http, "get", return_value=resp) as mock_get: + with patch.object(self.op._http_client.session, "get", return_value=resp) as mock_get: self.op.execute( session, {"search_criteria": "q", "compressed": True}, @@ -549,7 +549,7 @@ def test_total_results_from_header(self): resp.headers["X-Total-Results"] = "42" op = InsertProteinsOperation() - with patch.object(op._http, "get", return_value=resp): + with patch.object(op._http_client.session, "get", return_value=resp): op.execute( session, {"search_criteria": "q", "compressed": False}, @@ -567,7 +567,7 @@ def test_total_results_invalid_header_ignored(self): resp.headers["X-Total-Results"] = "not-a-number" op = InsertProteinsOperation() - with patch.object(op._http, "get", return_value=resp): + with patch.object(op._http_client.session, "get", return_value=resp): op.execute( session, {"search_criteria": "q", "compressed": False}, @@ -598,7 +598,7 @@ def get_side_effect(url, **kwargs): return page2_resp op = InsertProteinsOperation() - with patch.object(op._http, "get", side_effect=get_side_effect): + with patch.object(op._http_client.session, "get", side_effect=get_side_effect): result = op.execute( session, {"search_criteria": "q", "compressed": False}, @@ -618,7 +618,7 @@ def test_network_failure_propagates(self): op = InsertProteinsOperation() with patch.object( - op._http, + op._http_client.session, "get", side_effect=req.ConnectionError("network down"), ): @@ -647,7 +647,7 @@ def test_isoform_records_counted(self): ) resp = _make_mock_response(fasta_with_isoform) op = InsertProteinsOperation() - with patch.object(op._http, "get", return_value=resp): + with patch.object(op._http_client.session, "get", return_value=resp): result = op.execute( session, {"search_criteria": "q", "compressed": False}, @@ -665,7 +665,7 @@ def test_progress_emission_with_total(self): resp.headers["X-Total-Results"] = "100" op = InsertProteinsOperation() - with patch.object(op._http, "get", return_value=resp): + with patch.object(op._http_client.session, "get", return_value=resp): op.execute( session, {"search_criteria": "q", "compressed": False}, @@ -683,7 +683,7 @@ def test_include_isoforms_false_omits_param(self): session = _make_mock_session() resp = _make_mock_response(FASTA_ONE) op = InsertProteinsOperation() - with patch.object(op._http, "get", return_value=resp) as mock_get: + with patch.object(op._http_client.session, "get", return_value=resp) as mock_get: op.execute( session, {"search_criteria": "q", "compressed": False, "include_isoforms": False}, @@ -708,7 +708,7 @@ def test_insert_proteins_integration(postgres_url: str): emit = _capturing_emit() with Session(engine, future=True) as session: - with patch.object(op._http, "get", return_value=_make_mock_response(FASTA_TWO)): + with patch.object(op._http_client.session, "get", 
return_value=_make_mock_response(FASTA_TWO)): result = op.execute( session, {"search_criteria": "organism_id:9606", "compressed": False}, @@ -722,7 +722,7 @@ def test_insert_proteins_integration(postgres_url: str): # Idempotency: second run should update, not re-insert op2 = InsertProteinsOperation() with Session(engine, future=True) as session: - with patch.object(op2._http, "get", return_value=_make_mock_response(FASTA_TWO)): + with patch.object(op2._http_client.session, "get", return_value=_make_mock_response(FASTA_TWO)): result2 = op2.execute( session, {"search_criteria": "organism_id:9606", "compressed": False}, From 907f104352903e9aa690903bd0f110dd4cd33bdb Mon Sep 17 00:00:00 2001 From: frapercan Date: Tue, 5 May 2026 17:10:12 +0200 Subject: [PATCH 48/73] docs(config): T-CONF.1 hardcoded params inventory MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Systematic inventory of module-level constants and defaults hardcoded in payloads and workers. 31 entries categorised into 5 groups (QueueTuning, WorkerTuning, OperationTuning, APILimits, ResearchKnobs) plus 12 structural config-exempt ones (GAF indices, payload shape constraints, PCA dim). Flags duplication that the externalisation deduplicates by construction: _ANNOTATION_CHUNK_SIZE x3, _STREAM_CHUNK_SIZE x2, _MAX_FASTA_BYTES x2. Direct input for T-CONF.2 (externalisation to pydantic Settings) and T-CONF.3 (auto-generated living doc). Part of F0 T-CONF.1 of master plan v3. --- docs/CONFIG_INVENTORY.md | 99 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 99 insertions(+) create mode 100644 docs/CONFIG_INVENTORY.md diff --git a/docs/CONFIG_INVENTORY.md b/docs/CONFIG_INVENTORY.md new file mode 100644 index 0000000..23a18c5 --- /dev/null +++ b/docs/CONFIG_INVENTORY.md @@ -0,0 +1,99 @@
+# Hardcoded parameters inventory (T-CONF.1)
+
+Snapshot 2026-05-05, taken after a systematic grep over `protea/` for module-level constants (`^_?[A-Z][A-Z_]+ = `), hardcoded defaults in pydantic payload signatures, and obvious literal values. Output: a table of name → file:line → category → purpose → suggested range → exempt? where applicable.
+
+This inventory is the basis for **T-CONF.2** (externalisation to `protea_core.config.Settings` with a defaults < yaml < env < flags hierarchy) and **T-CONF.3** (auto-generated living doc).
+
+## Externalisation policy
+
+For each parameter identified below:
+
+- If it **affects throughput, memory, latency, or robustness** under variable load: direct candidate for `Settings`. A platform operator must be able to adjust it in `config/{env}.yaml` or via an env var without touching code.
+- If it is **structural / physical** (MD5 hash length, positional indices of the GAF format, a PCA space dimension fixed per model): mark it `# config-exempt: <reason>`. These do not move to `Settings` but remain documented.
+- If it is a **security / contractual limit** (max FASTA upload size, max comment length): goes into `Settings` under an `APILimits` sub-model so infosec can review it in config.
+
+## Inventory
+
+### A. QueueTuning (RabbitMQ + worker dispatch)
+
+| Constant | File:line | Value | Purpose | Suggested range | Exempt |
+|-----------|---------------|------:|-----------|----------------|--------|
+| `_MAX_ATTEMPTS` | `infrastructure/queue/publisher.py:14` | 12 | Max attempts when publishing to RabbitMQ. Covers ~4 min of broker downtime. | 5-20; tune to broker SLA | no |
+| `_BASE_DELAY` | `infrastructure/queue/publisher.py:15` | 1 | Initial publisher backoff (s). Doubles per attempt up to the 30 s cap. | 0.5-5 | no |
+| `_OOM_MAX_RETRIES` | `infrastructure/queue/consumer.py:28` | 5 | Max retries on CUDA OOM in the GPU worker. | 3-10 | no |
+| `_OOM_BASE_DELAY` | `infrastructure/queue/consumer.py:29` | 5 | Initial OOM backoff (s). | 1-30 | no |
+| `_OOM_MAX_DELAY` | `infrastructure/queue/consumer.py:30` | 300 | OOM backoff cap (5 min). | 60-900 | no |
+| `prefetch_count` | `infrastructure/queue/consumer.py:62, 189` | 1 | RabbitMQ prefetch per consumer. 1 = strict serialization. | 1-10 depending on the operation | no |
+
+### B. WorkerTuning (pools, caches, reapers)
+
+| Constant | File:line | Value | Purpose | Suggested range | Exempt |
+|-----------|---------------|------:|-----------|----------------|--------|
+| `pool_size` | `infrastructure/database/engine.py:12` | 20 | SQLAlchemy connection pool size. | 5-50 depending on concurrent load | no |
+| `_MODEL_CACHE_MAX` | `core/operations/compute_embeddings.py:609` | 1 | PLM models cached per process. >1 piles up GBs on the GPU. | 1-2 (GPU memory hard limit) | no |
+| `_REF_CACHE_MAX` | `core/operations/predict_go_terms.py:83` | 1 | Reference data cached per predict process. | 1-2 | no |
+| `timeout_seconds` (reaper main) | `workers/stale_job_reaper.py:26` | 21600 | 6 h timeout before marking jobs FAILED. | 1800-43200 depending on SLA | no |
+| `timeout_seconds` (reaper default) | `workers/stale_job_reaper.py:50` | 3600 | Constructor default; main uses 21600. | 1800-43200 | no |
+| `stall_seconds` | `workers/stale_job_reaper.py:52` | 1800 | Time without a JobEvent before a job counts as stalled. | 600-3600 | no |
+| `_DEFAULT_TTL` | `api/cache.py:18` | 300.0 | Default HTTP cache TTL (5 min). | 60-3600 depending on the endpoint | no |
+
+### C. OperationTuning (chunks, batches, HTTP)
+
+| Constant | File:line | Value | Purpose | Suggested range | Exempt |
+|-----------|---------------|------:|-----------|----------------|--------|
+| `_ANNOTATION_CHUNK_SIZE` | `core/feature_enricher.py:42`, `core/operations/{train_reranker,predict_go_terms}.py` | 10_000 | Rows per chunk when loading annotations. | 1k-100k depending on RAM | no |
+| `_STREAM_CHUNK_SIZE` | `core/operations/{train_reranker,predict_go_terms}.py` | 2_000 | PyArrow streaming chunk size. | 500-10k | no |
+| `_STORE_CHUNK_SIZE` | `core/operations/predict_go_terms.py:872` | 10_000 | Rows per chunk when publishing to `protea.predictions.write`. ~20-25 MB serialised. RabbitMQ cap is 128 MB. | 5k-50k depending on average message size | no |
+| `_NUMPY_QUERY_CHUNK` | `core/knn_search.py:135` | 500 | Query chunk size for numpy KNN. | 100-2000 depending on RAM | no |
+| `_N_THRESHOLDS` | `core/metrics.py:34` | 101 | Threshold sweep [0.0, 0.01, ..., 1.0] for Fmax. | 51, 101, 201 | no |
+| `batch_size` (compute_embeddings payload) | `core/operations/compute_embeddings.py:90, 108` | 1 | Sequences per GPU batch. 1 avoids OOM on long proteins. | 1-32 depending on the PLM | no |
+| `batch_size` (predict_go_terms payload) | `core/operations/predict_go_terms.py:171` | 1024 | Queries per KNN batch. | 256-4096 depending on vector dim | no |
+| `batch_size` (parquet read) | `core/operations/train_reranker.py:1822` | 200_000 | Rows per batch when reading the eval parquet. | 50k-500k depending on RAM | no |
+| `gene_product_batch_size` (QuickGO) | `core/operations/load_quickgo_annotations.py:46` | 200 | QuickGO API batch size. Bound by its internal limits. | 100-500 (check the API spec) | no |
+| `timeout_seconds` (UniProt insert) | `core/operations/insert_proteins.py:30` | 60 | HTTP timeout per UniProt request. | 30-300 | no |
+| `timeout_seconds` (UniProt metadata) | `core/operations/fetch_uniprot_metadata.py:29` | 60 | Same as above. | 30-300 | no |
+| `timeout_seconds` (GOA load) | `core/operations/load_goa_annotations.py:34` | 300 | Timeout for the ftp.ebi GOA download (5 min). | 120-900 | no |
+| `timeout_seconds` (ontology snapshot) | `core/operations/load_ontology_snapshot.py:19` | 120 | OBO download timeout. | 60-300 | no |
+| `timeout_seconds` (QuickGO) | `core/operations/load_quickgo_annotations.py:43` | 300 | QuickGO API timeout. | 120-900 | no |
+| `max_retries` (UniProt) | `core/operations/insert_proteins.py:33`, `fetch_uniprot_metadata.py` | 6 | HTTP retries. | 3-10 | no |
+| `backoff_base_seconds` | `core/operations/{insert,fetch}_uniprot*.py` | 0.8 | Initial UniProt backoff. | 0.5-2 | no |
+| `backoff_max_seconds` | `core/operations/{insert,fetch}_uniprot*.py` | 20.0 | UniProt backoff cap. | 10-60 | no |
+| `jitter_seconds` | `core/operations/{insert,fetch}_uniprot*.py` | 0.4 | Jitter added to the sleep. | 0-1 | no |
+
+### D. APILimits (HTTP boundaries)
+
+| Constant | File:line | Value | Purpose | Suggested range | Exempt |
+|-----------|---------------|------:|-----------|----------------|--------|
+| `_MAX_FASTA_BYTES` | `api/routers/annotate.py:95`, `query_sets.py:112` | 50 MB | FASTA upload cap. Hardcoded in two places. | 10-200 MB; **dedupe into Settings** | no |
+| `_MAX_COMMENT_LENGTH` | `api/routers/support.py:14` | 500 | Max characters for a support comment. | 200-2000 | no |
+| `_RECENT_LIMIT` | `api/routers/support.py:15` | 20 | Items in /support/recent. | 10-100 | no |
+| `_PAGE_LIMIT` | `api/routers/support.py:16` | 100 | Page size hard cap. | 50-500 | no |
+
+### E. ResearchKnobs (modelling, not infrastructure)
+
+| Constant | File:line | Value | Purpose | Notes |
+|-----------|---------------|------:|-----------|-------|
+| `EMBEDDING_PCA_DIM` | `core/reranker.py:102` | 16 | Reduced PCA dim for feature engineering. **CONTRACT with `protea-contracts.feature_schema`**. Do not move to Settings (it is part of the canonical schema). | exempt: contract with `protea-contracts` |
+| `N_THRESHOLDS` (CAFA sweep) | `core/metrics.py:34` | 101 | Sweep granularity for Fmax. Changing it shifts the canonical numbers. | exempt: part of the CAFA methodology |
+
+### F. Structural (config-exempt)
+
+GAF column indices (`load_goa_annotations.py:90-97`): `_IDX_ACCESSION=1`, `_IDX_QUALIFIER=3`, etc. These are physical positions in the GAF 2.x format; changing them would mean no longer reading GAF. **exempt: format spec**.
+
+Any `min_length=1` or `max_length=255` in pydantic payload `Field(...)`s under `api/routers/`: string validation lengths (UUIDs, names, paths). **exempt: payload shape** (to be revisited together with `protea-contracts` if it moves into the package).
+
+## Quantitative summary
+
+- **Total entries**: 31 constants that are candidates for externalisation.
+- **Structural exempt**: ~10 (GAF indices, hash lengths, payload shape constraints).
+- **Research knobs exempt**: 2 (PCA dim, threshold sweep).
+- **To externalise into Settings (T-CONF.2)**: **31** parameters across 5 categories (`QueueTuning`, `WorkerTuning`, `OperationTuning`, `APILimits`, `ResearchKnobs`).
+- **Duplication detected**: `_ANNOTATION_CHUNK_SIZE` appears in 3 files (`feature_enricher`, `train_reranker`, `predict_go_terms`); `_STREAM_CHUNK_SIZE` in 2; `_MAX_FASTA_BYTES` in 2 routers. Externalising deduplicates by construction (a single Settings).
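As an illustration of the hierarchy described above, a minimal usage sketch against the `get_tuning()` loader that the T-CONF.2 skeleton introduces two commits later in this series; the yaml fragment shown in the comments and the value 7 are hypothetical, while the env-var name, the baked-in default of 12, and `cache_clear()` follow that skeleton:

    import os
    from protea.config.tuning import get_tuning

    # Hypothetical protea/config/system.yaml fragment:
    #   tuning:
    #     queue:
    #       publisher_max_attempts: 7   # yaml overrides the baked-in default (12)
    os.environ["PROTEA_TUNING__QUEUE__PUBLISHER_MAX_ATTEMPTS"] = "20"  # env overrides yaml

    get_tuning.cache_clear()  # the loader is lru_cache'd; clear it before re-reading
    assert get_tuning().queue.publisher_max_attempts == 20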
+
+## Next steps (T-CONF.2 + T-CONF.3)
+
+T-CONF.2: create `protea_core.config` with sub-models `QueueTuning`, `WorkerTuning`, `OperationTuning`, `APILimits`, plus a root `Settings` that composes them. `protea/config/{dev,prod,hpc-bsc,hpc-airgap}.yaml` with per-target values. Canonical load path: defaults < yaml < env vars (`PROTEA__QUEUE__MAX_ATTEMPTS=15` etc.) < CLI flags. Replace the 31 code references with `settings.X.Y`.
+
+T-CONF.3: auto-generate `docs/source/appendix/configuration.rst` from the pydantic model, with docstrings plus the ranges from this inventory. CI test that parses each env yaml and confirms the schema is valid.
+
+**Final AC** (defined in master plan v3 §5 T-CONF.2): `grep -rE "^_?[A-Z][A-Z_]+\s*=\s*[0-9]" protea-core/` only returns constants carrying `# config-exempt: <reason>` (the 12 structural / research-knob entries documented here).
From ca25beba2cfe9f4683df40a5720fbbb4623e8155 Mon Sep 17 00:00:00 2001 From: frapercan Date: Tue, 5 May 2026 17:15:39 +0200 Subject: [PATCH 49/73] feat(scripts): T0.5 add smoke.sh end-to-end check Validates a running stack via /health, /health/ready, POST /jobs (ping), poll until succeeded, and the events log. Does not start or stop services (per feedback_no_restart.md). Exits in <2s against a healthy local stack and is sized for CI use too (PROTEA_API_URL + PROTEA_SMOKE_TIMEOUT_S env overrides). Validated against the live local stack: 1/5 -> 5/5 OK. Part of F0 T0.5 of master plan v3. --- scripts/smoke.sh | 105 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 105 insertions(+) create mode 100755 scripts/smoke.sh diff --git a/scripts/smoke.sh b/scripts/smoke.sh new file mode 100755 index 0000000..aba535d --- /dev/null +++ b/scripts/smoke.sh @@ -0,0 +1,105 @@ +#!/usr/bin/env bash +# scripts/smoke.sh — PROTEA end-to-end smoke +# +# Assumes a stack is already running and reachable at $PROTEA_API_URL +# (default http://127.0.0.1:8000). Does NOT start or stop services +# (per project policy: the stack is never restarted without explicit +# user permission; see memory/feedback_no_restart.md). +# +# What it checks: +# 1. /health responds 200. +# 2. /health/ready responds 200 (DB + AMQP both up). +# 3. POST /jobs creates a ping job and returns a job_id. +# 4. The ping job transitions to SUCCEEDED within $PROTEA_SMOKE_TIMEOUT_S +# seconds (default 30). +# 5. GET /jobs/{id}/events lists at least one event with event=job.succeeded. +# +# Usage: +# bash scripts/smoke.sh # local stack, default URL +# PROTEA_API_URL=https://... bash scripts/smoke.sh +# PROTEA_SMOKE_TIMEOUT_S=60 bash scripts/smoke.sh +# +# Exit code: +# 0 = stack healthy, ping job ran end-to-end +# 1 = any step failed (with diagnostics on stderr) +# +# Designed to fit comfortably in CI: no fixtures persisted, no DB writes +# beyond the ping JobEvent, completes in <90s when the stack is up. + +set -euo pipefail + +API_URL="${PROTEA_API_URL:-http://127.0.0.1:8000}" +TIMEOUT_S="${PROTEA_SMOKE_TIMEOUT_S:-30}" + +GREEN="\033[32m"; RED="\033[31m"; CYAN="\033[36m"; BOLD="\033[1m"; RESET="\033[0m" + +_log() { + printf "${CYAN}[smoke]${RESET} %s\n" "$*" +} +_ok() { + printf " ${GREEN}✓${RESET} %s\n" "$*" +} +_fail() { + printf " ${RED}✗${RESET} %s\n" "$*" >&2 + exit 1 +} + +# Ensure curl + jq are present. +command -v curl >/dev/null || _fail "curl not found" +command -v jq >/dev/null || _fail "jq not found (sudo apt install jq, or brew install jq)" + +# 1. Health +_log "1/5 GET ${API_URL}/health" +if ! 
curl -sSf -o /dev/null -w "%{http_code}" "${API_URL}/health" | grep -q "^200$"; then + _fail "/health did not return 200" +fi +_ok "/health is 200" + +# 2. Readiness (DB + AMQP) +_log "2/5 GET ${API_URL}/health/ready" +ready=$(curl -sS "${API_URL}/health/ready" || echo '{}') +if [[ "$(echo "$ready" | jq -r '.status // empty')" != "ready" ]]; then + _fail "/health/ready did not report ready: $ready" +fi +_ok "/health/ready is ready (db + amqp up)" + +# 3. Submit ping job +_log "3/5 POST ${API_URL}/jobs (operation=ping)" +job_response=$(curl -sSf -X POST "${API_URL}/jobs" \ + -H "Content-Type: application/json" \ + -d '{"operation":"ping","queue_name":"protea.ping","payload":{"smoke":true}}') +job_id=$(echo "$job_response" | jq -r '.id // empty') +if [[ -z "$job_id" ]]; then + _fail "no job id returned: $job_response" +fi +_ok "ping job submitted (id=$job_id)" + +# 4. Poll until succeeded or timeout. Status is lowercase in the API. +_log "4/5 poll ${API_URL}/jobs/${job_id} until succeeded (timeout ${TIMEOUT_S}s)" +deadline=$(( $(date +%s) + TIMEOUT_S )) +status="" +while [[ $(date +%s) -lt $deadline ]]; do + job=$(curl -sSf "${API_URL}/jobs/${job_id}") + status=$(echo "$job" | jq -r '.status // empty' | tr '[:upper:]' '[:lower:]') + case "$status" in + succeeded) break ;; + failed|cancelled) _fail "ping job ended in $status: $(echo "$job" | jq -c '.error_code // empty,.error_message // empty')" ;; + esac + sleep 1 +done +if [[ "$status" != "succeeded" ]]; then + _fail "ping job did not reach succeeded within ${TIMEOUT_S}s (last status=$status)" +fi +_ok "ping job succeeded" + +# 5. Verify events log +_log "5/5 GET ${API_URL}/jobs/${job_id}/events" +events=$(curl -sSf "${API_URL}/jobs/${job_id}/events") +n_events=$(echo "$events" | jq 'length') +has_succeeded=$(echo "$events" | jq '[.[] | select(.event == "job.succeeded")] | length') +if [[ "$n_events" -lt 1 || "$has_succeeded" -lt 1 ]]; then + _fail "events log incomplete (n=$n_events, succeeded=$has_succeeded)" +fi +_ok "events log: $n_events events, includes job.succeeded" + +printf "\n${GREEN}${BOLD}smoke ok${RESET} — stack healthy and end-to-end path works\n" From efcdda54e5bd3bba3af767e4a4d497455956b253 Mon Sep 17 00:00:00 2001 From: frapercan Date: Tue, 5 May 2026 17:23:52 +0200 Subject: [PATCH 50/73] feat(config): T-CONF.2 skeleton - QueueTuning externalised Introduces protea.config.tuning with: - QueueTuning pydantic model: publisher_max_attempts, publisher_base_delay, oom_max_retries, oom_base_delay, oom_max_delay. Defaults match the previous module-level constants exactly. - TuningSettings root model that composes per-category sub-models (more groups land in follow-up turns). - get_tuning() loader cached via lru_cache. Hierarchy: defaults < protea/config/system.yaml (tuning: section) < env vars PROTEA_TUNING__QUEUE__PUBLISHER_MAX_ATTEMPTS=20. - 19 new tests covering defaults, validation, env coercion, yaml override, env-overrides-yaml, missing yaml section. Migrates the 5 RabbitMQ publisher/consumer constants to read from QueueTuning at call time: - publisher.py: _MAX_ATTEMPTS, _BASE_DELAY removed. - consumer.py: _OOM_MAX_RETRIES, _OOM_BASE_DELAY, _OOM_MAX_DELAY removed; replaced by qsettings reads inside the OOM-handler branch. Tests: existing publisher and consumer tests pass unchanged since defaults match prior values. test_queue.py:524 updated to read from get_tuning() instead of the removed constant. Suite: 1113 passed, 10 skipped (was 1094 + 19). Skeleton for the categorisation in docs/CONFIG_INVENTORY.md. 
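As a quick sanity check on the retained publisher defaults (12 attempts, 1 s base delay, 30 s cap), the sleeps between attempts add up to roughly three and a half minutes, which matches the "~4 min of broker downtime" budget quoted in the inventory and in QueueTuning's description. A sketch that mirrors the min(base * 2 ** (attempt - 1), 30) schedule in publisher.py:

    # Sleep schedule for publisher retries; there is no sleep after the final (12th) attempt.
    delays = [min(1 * 2 ** (attempt - 1), 30) for attempt in range(1, 12)]
    print(delays)       # [1, 2, 4, 8, 16, 30, 30, 30, 30, 30, 30]
    print(sum(delays))  # 211 seconds, i.e. roughly 3.5 minutes of broker downtime covered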
Remaining 4 categories (WorkerTuning, OperationTuning, APILimits, ResearchKnobs) follow the same pattern in subsequent T-CONF.2 increments. Part of F0 T-CONF.2 of master plan v3. --- protea/config/tuning.py | 147 ++++++++++++++++++++ protea/infrastructure/queue/consumer.py | 24 ++-- protea/infrastructure/queue/publisher.py | 24 ++-- tests/test_queue.py | 6 +- tests/test_tuning.py | 168 +++++++++++++++++++++++ 5 files changed, 343 insertions(+), 26 deletions(-) create mode 100644 protea/config/tuning.py create mode 100644 tests/test_tuning.py diff --git a/protea/config/tuning.py b/protea/config/tuning.py new file mode 100644 index 0000000..0c7604a --- /dev/null +++ b/protea/config/tuning.py @@ -0,0 +1,147 @@ +"""Runtime tuning settings (T-CONF.2). + +Externalises hardcoded module-level constants from ``protea/`` so an +operator can tune throughput, retry policy and timeouts per +deployment target (dev, prod-cloud, hpc-bsc, hpc-airgap) without +touching code. + +Hierarchy (lowest to highest priority): + + 1. Defaults baked into the pydantic models below. + 2. ``tuning:`` section in ``protea/config/system.yaml``. + 3. Environment variables of the form ``PROTEA_TUNING__<GROUP>__<FIELD>``. + +Currently scoped to the ``QueueTuning`` group as a proof of concept. +The remaining categories from ``docs/CONFIG_INVENTORY.md`` +(WorkerTuning, OperationTuning, APILimits, ResearchKnobs) follow the +same pattern and will be added incrementally. + +Example:: + + from protea.config.tuning import get_tuning + + settings = get_tuning() + for attempt in range(settings.queue.publisher_max_attempts): + ... +""" + +from __future__ import annotations + +import os +from functools import lru_cache +from pathlib import Path +from typing import Any + +import yaml +from pydantic import BaseModel, Field + +ENV_PREFIX = "PROTEA_TUNING__" + + +class QueueTuning(BaseModel): + """RabbitMQ publisher / consumer retry and dispatch knobs. + + Sources: ``infrastructure/queue/publisher.py`` and + ``infrastructure/queue/consumer.py`` (see + ``docs/CONFIG_INVENTORY.md`` §A). + """ + + publisher_max_attempts: int = Field( + default=12, + ge=1, + description=( + "Max attempts when publishing to RabbitMQ. 12 attempts cover " + "~4 min of broker downtime with exponential backoff capped at 30s." + ), + ) + publisher_base_delay: float = Field( + default=1.0, + ge=0.0, + description=( + "Initial publisher backoff in seconds. Doubles per " + "attempt up to the internal 30s cap." + ), + ) + oom_max_retries: int = Field( + default=5, + ge=0, + description="Max retries on CUDA OOM in the GPU worker.", + ) + oom_base_delay: int = Field( + default=5, + ge=0, + description="Initial OOM backoff in seconds.", + ) + oom_max_delay: int = Field( + default=300, + ge=1, + description="OOM backoff cap in seconds (default 5 min).", + ) + + +class TuningSettings(BaseModel): + """Root tuning model that composes per-category sub-models.""" + + queue: QueueTuning = Field(default_factory=QueueTuning) + + +def _load_yaml_tuning(project_root: Path) -> dict[str, Any]: + path = project_root / "protea" / "config" / "system.yaml" + if not path.exists(): + return {} + with path.open("r", encoding="utf-8") as f: + data = yaml.safe_load(f) or {} + raw = data.get("tuning") or {} + return raw if isinstance(raw, dict) else {} + + +def _apply_env_overrides(merged: dict[str, Any]) -> dict[str, Any]: + """Merge env vars of the form PROTEA_TUNING__<GROUP>__<FIELD>=<value>. 
+ + The double underscore is the conventional path separator (matches + pydantic-settings env_nested_delimiter) so we don't collide with + legitimate single underscores inside field names like + ``publisher_max_attempts``. + """ + for key, value in os.environ.items(): + if not key.startswith(ENV_PREFIX): + continue + path = key[len(ENV_PREFIX):].split("__") + if len(path) < 2: + continue + group, field = path[0].lower(), "__".join(path[1:]).lower() + merged.setdefault(group, {})[field] = _coerce(value) + return merged + + +def _coerce(value: str) -> Any: + """Best-effort string -> int/float/bool coercion for env values.""" + lo = value.strip().lower() + if lo in {"true", "false"}: + return lo == "true" + try: + if "." in value: + return float(value) + return int(value) + except ValueError: + return value + + +def _resolve_project_root() -> Path: + """Resolve the project root from this file's location. + + ``protea/config/tuning.py`` -> parents[2] = project root. + """ + return Path(__file__).resolve().parents[2] + + +@lru_cache(maxsize=1) +def get_tuning() -> TuningSettings: + """Load and cache the tuning settings. + + Cache reset (mostly for tests): + ``get_tuning.cache_clear()`` + """ + raw = _load_yaml_tuning(_resolve_project_root()) + raw = _apply_env_overrides(raw) + return TuningSettings.model_validate(raw) diff --git a/protea/infrastructure/queue/consumer.py b/protea/infrastructure/queue/consumer.py index 6d63ac0..ccea1be 100644 --- a/protea/infrastructure/queue/consumer.py +++ b/protea/infrastructure/queue/consumer.py @@ -11,6 +11,7 @@ from pika.spec import Basic, BasicProperties from sqlalchemy.orm import Session, sessionmaker +from protea.config.tuning import get_tuning from protea.core.contracts.operation import RetryLaterError, make_safe_emit from protea.core.contracts.registry import OperationRegistry from protea.infrastructure.orm.models.job import JobEvent @@ -23,11 +24,9 @@ _DLQ_NAME = "protea.dead-letter" # CUDA OOM retry policy for OperationConsumer. -# Backoff: base * 2^retry → 5, 10, 20, 40, 80 s (capped at 300s). -# Total wait budget before dead-letter: ~155 s over 5 retries. -_OOM_MAX_RETRIES = 5 -_OOM_BASE_DELAY = 5 -_OOM_MAX_DELAY = 300 +# Configured via QueueTuning (oom_max_retries / oom_base_delay / +# oom_max_delay). Defaults: 5 retries, 5s base, 300s cap, backoff +# 5/10/20/40/80s. ~155s wait budget before dead-letter. _OOM_RETRY_HEADER = "x-oom-retry" @@ -328,28 +327,29 @@ def raw_emit( except Exception: pass - if oom_retry_count < _OOM_MAX_RETRIES: + qsettings = get_tuning().queue + if oom_retry_count < qsettings.oom_max_retries: next_count = oom_retry_count + 1 delay = min( - _OOM_BASE_DELAY * (2**oom_retry_count), - _OOM_MAX_DELAY, + qsettings.oom_base_delay * (2**oom_retry_count), + qsettings.oom_max_delay, ) logger.warning( - "CUDA OOM — backing off %ds (retry %d/%d). operation=%s", + "CUDA OOM: backing off %ds (retry %d/%d). 
operation=%s", delay, next_count, - _OOM_MAX_RETRIES, + qsettings.oom_max_retries, operation_name, ) self._emit_parent_event( parent_job_id, "child.cuda_oom_retry", - f"CUDA OOM on {operation_name}; retry {next_count}/{_OOM_MAX_RETRIES} " + f"CUDA OOM on {operation_name}; retry {next_count}/{qsettings.oom_max_retries} " f"after {delay}s backoff", { "operation": operation_name, "retry_count": next_count, - "max_retries": _OOM_MAX_RETRIES, + "max_retries": qsettings.oom_max_retries, "delay_seconds": delay, }, level="warning", diff --git a/protea/infrastructure/queue/publisher.py b/protea/infrastructure/queue/publisher.py index cad5bfa..77158b9 100644 --- a/protea/infrastructure/queue/publisher.py +++ b/protea/infrastructure/queue/publisher.py @@ -9,10 +9,9 @@ import pika -logger = logging.getLogger(__name__) +from protea.config.tuning import get_tuning -_MAX_ATTEMPTS = 12 -_BASE_DELAY = 1 # seconds; exponential backoff: 1, 2, 4, 8, 16 (capped at 30); 12 attempts cover ~4 min of broker downtime +logger = logging.getLogger(__name__) # Thread-local persistent connection to avoid opening/closing per publish. _local = threading.local() @@ -39,9 +38,12 @@ def _close_cached_connection() -> None: def _publish(amqp_url: str, queue_name: str, body: bytes) -> None: """Core publish logic with retries and connection reuse.""" + settings = get_tuning().queue + max_attempts = settings.publisher_max_attempts + base_delay = settings.publisher_base_delay last_exc: Exception | None = None - for attempt in range(1, _MAX_ATTEMPTS + 1): + for attempt in range(1, max_attempts + 1): try: connection = _get_connection(amqp_url) channel = connection.channel() @@ -61,14 +63,14 @@ def _publish(amqp_url: str, queue_name: str, body: bytes) -> None: return except Exception as exc: last_exc = exc - # Connection is stale — discard it so next attempt creates a fresh one. + # Connection is stale: discard it so next attempt creates a fresh one. _close_cached_connection() - if attempt < _MAX_ATTEMPTS: - delay = min(_BASE_DELAY * (2 ** (attempt - 1)), 30) + if attempt < max_attempts: + delay = min(base_delay * (2 ** (attempt - 1)), 30) logger.warning( - "publish failed (attempt %d/%d), retrying in %ds. queue=%s error=%s", + "publish failed (attempt %d/%d), retrying in %ss. queue=%s error=%s", attempt, - _MAX_ATTEMPTS, + max_attempts, delay, queue_name, exc, @@ -77,13 +79,13 @@ def _publish(amqp_url: str, queue_name: str, body: bytes) -> None: else: logger.error( "publish failed after %d attempts. queue=%s error=%s", - _MAX_ATTEMPTS, + max_attempts, queue_name, exc, ) raise RuntimeError( - f"Failed to publish to queue {queue_name!r} after {_MAX_ATTEMPTS} attempts" + f"Failed to publish to queue {queue_name!r} after {max_attempts} attempts" ) from last_exc diff --git a/tests/test_queue.py b/tests/test_queue.py index d1253bb..f9b4503 100644 --- a/tests/test_queue.py +++ b/tests/test_queue.py @@ -512,8 +512,8 @@ def test_cuda_oom_first_failure_republishes_with_backoff(self): channel.connection.sleep.assert_called_once_with(5) def test_cuda_oom_retries_exhausted_dead_letters(self): - """After _OOM_MAX_RETRIES failures the message is nack'd without requeue.""" - from protea.infrastructure.queue import consumer as consumer_module + """After oom_max_retries failures the message is nack'd without requeue.""" + from protea.config.tuning import get_tuning exc = RuntimeError("CUDA out of memory. 
Tried to allocate 2 GiB") consumer, sessions, _, _ = self._make_consumer(raises=exc) @@ -521,7 +521,7 @@ def test_cuda_oom_retries_exhausted_dead_letters(self): method = _make_method(31) properties = MagicMock() # Message has already retried the maximum number of times. - properties.headers = {"x-oom-retry": consumer_module._OOM_MAX_RETRIES} + properties.headers = {"x-oom-retry": get_tuning().queue.oom_max_retries} import sys diff --git a/tests/test_tuning.py b/tests/test_tuning.py new file mode 100644 index 0000000..590ef9d --- /dev/null +++ b/tests/test_tuning.py @@ -0,0 +1,168 @@ +"""Tests for protea.config.tuning (T-CONF.2 skeleton).""" + +from __future__ import annotations + +from pathlib import Path + +import pytest + +from protea.config.tuning import ( + QueueTuning, + TuningSettings, + _apply_env_overrides, + _coerce, + _load_yaml_tuning, + get_tuning, +) + + +class TestQueueTuningDefaults: + def test_publisher_defaults(self) -> None: + q = QueueTuning() + assert q.publisher_max_attempts == 12 + assert q.publisher_base_delay == 1.0 + + def test_oom_defaults(self) -> None: + q = QueueTuning() + assert q.oom_max_retries == 5 + assert q.oom_base_delay == 5 + assert q.oom_max_delay == 300 + + def test_validates_non_negative(self) -> None: + with pytest.raises(Exception): + QueueTuning(publisher_max_attempts=0) + + def test_validates_oom_max_delay_positive(self) -> None: + with pytest.raises(Exception): + QueueTuning(oom_max_delay=0) + + +class TestCoerce: + def test_int(self) -> None: + assert _coerce("42") == 42 + + def test_float(self) -> None: + assert _coerce("1.5") == pytest.approx(1.5) + + def test_bool_true(self) -> None: + assert _coerce("true") is True + assert _coerce("TRUE") is True + + def test_bool_false(self) -> None: + assert _coerce("false") is False + + def test_string_passthrough(self) -> None: + assert _coerce("not-a-number") == "not-a-number" + + +class TestApplyEnvOverrides: + def test_no_env_no_change(self, monkeypatch: pytest.MonkeyPatch) -> None: + # Strip any test-fixture overrides. + for key in list(__import__("os").environ): + if key.startswith("PROTEA_TUNING__"): + monkeypatch.delenv(key, raising=False) + merged: dict = {} + out = _apply_env_overrides(merged) + assert out == {} + + def test_single_override(self, monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setenv("PROTEA_TUNING__QUEUE__PUBLISHER_MAX_ATTEMPTS", "20") + merged: dict = {} + out = _apply_env_overrides(merged) + assert out == {"queue": {"publisher_max_attempts": 20}} + + def test_merges_with_yaml(self, monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setenv("PROTEA_TUNING__QUEUE__OOM_MAX_RETRIES", "8") + merged = {"queue": {"publisher_max_attempts": 30}} + out = _apply_env_overrides(merged) + assert out["queue"]["publisher_max_attempts"] == 30 + assert out["queue"]["oom_max_retries"] == 8 + + +class TestGetTuning: + def setup_method(self) -> None: + get_tuning.cache_clear() + + def teardown_method(self) -> None: + get_tuning.cache_clear() + + def test_returns_defaults_when_no_yaml_or_env( + self, monkeypatch: pytest.MonkeyPatch, tmp_path: Path + ) -> None: + # Strip any inherited env overrides. + for key in list(__import__("os").environ): + if key.startswith("PROTEA_TUNING__"): + monkeypatch.delenv(key, raising=False) + # Pretend the project root has no system.yaml. 
+ monkeypatch.setattr( + "protea.config.tuning._resolve_project_root", lambda: tmp_path + ) + get_tuning.cache_clear() + s = get_tuning() + assert s.queue.publisher_max_attempts == 12 + assert s.queue.oom_max_retries == 5 + + def test_env_override_applies(self, monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: + monkeypatch.setattr( + "protea.config.tuning._resolve_project_root", lambda: tmp_path + ) + monkeypatch.setenv("PROTEA_TUNING__QUEUE__PUBLISHER_MAX_ATTEMPTS", "25") + get_tuning.cache_clear() + s = get_tuning() + assert s.queue.publisher_max_attempts == 25 + + def test_yaml_override_applies( + self, monkeypatch: pytest.MonkeyPatch, tmp_path: Path + ) -> None: + cfg_dir = tmp_path / "protea" / "config" + cfg_dir.mkdir(parents=True) + (cfg_dir / "system.yaml").write_text( + "tuning:\n queue:\n oom_max_retries: 9\n", + encoding="utf-8", + ) + for key in list(__import__("os").environ): + if key.startswith("PROTEA_TUNING__"): + monkeypatch.delenv(key, raising=False) + monkeypatch.setattr( + "protea.config.tuning._resolve_project_root", lambda: tmp_path + ) + get_tuning.cache_clear() + s = get_tuning() + assert s.queue.oom_max_retries == 9 + # Untouched fields keep defaults. + assert s.queue.publisher_max_attempts == 12 + + def test_env_overrides_yaml(self, monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: + cfg_dir = tmp_path / "protea" / "config" + cfg_dir.mkdir(parents=True) + (cfg_dir / "system.yaml").write_text( + "tuning:\n queue:\n publisher_max_attempts: 7\n", + encoding="utf-8", + ) + monkeypatch.setattr( + "protea.config.tuning._resolve_project_root", lambda: tmp_path + ) + monkeypatch.setenv("PROTEA_TUNING__QUEUE__PUBLISHER_MAX_ATTEMPTS", "33") + get_tuning.cache_clear() + s = get_tuning() + assert s.queue.publisher_max_attempts == 33 + + def test_load_yaml_handles_missing_section(self, tmp_path: Path) -> None: + cfg_dir = tmp_path / "protea" / "config" + cfg_dir.mkdir(parents=True) + (cfg_dir / "system.yaml").write_text( + "database:\n url: postgresql://x\n", + encoding="utf-8", + ) + out = _load_yaml_tuning(tmp_path) + assert out == {} + + +class TestTuningSettingsModel: + def test_compose(self) -> None: + s = TuningSettings(queue=QueueTuning(publisher_max_attempts=15)) + assert s.queue.publisher_max_attempts == 15 + + def test_default_compose(self) -> None: + s = TuningSettings() + assert s.queue.publisher_max_attempts == 12 From 527e51cc99056b5f946b6439072ab5f4b890e400 Mon Sep 17 00:00:00 2001 From: frapercan Date: Tue, 5 May 2026 17:31:02 +0200 Subject: [PATCH 51/73] refactor(operations): T0.6 rename train_reranker to training_dump_helpers Renames protea/core/operations/train_reranker.py to protea/core/training_dump_helpers.py and removes every literal "train_reranker" snake-case reference from the protea/ subtree. The helpers (TrainRerankerAutoOperation, TrainRerankerAutoPayload, StreamOutput, _knn_transfer_and_label, _load_sequences, _load_taxonomy_ids, _build_reference_from_cache, _preload_all_embeddings, _load_parent_map, TrainRerankerPayload, SequenceContext) keep their CamelCase names so existing call sites in tests and ExportResearchDatasetOperation continue to work via the new path. Updates: - module docstring: removes the "two operations" framing (both were unregistered) and explains the helper's surviving role. - event strings rebranded train_reranker_auto.* -> dump_helper.*. - export_research_dataset.py relay updated accordingly so consumers keep seeing export_research_dataset.* events on the wire. 
- constant ``name = "research_dataset_dump_helper"`` (was "train_reranker_auto"); the class remains unregistered. - comments in feature_enricher.py, parquet_export.py, generate_evaluation_set.py, predict_go_terms.py and scripts/materialize_lab_intervals.py: rephrased to "the dump helper". - tests/test_train_reranker.py renamed to test_training_dump_helpers.py (+ import path updated). Same for test_knn_streaming_smoke.py imports + mock target. - test_datasets_and_reranker_import_smoke.py asserts the new name is also unregistered; the historical asserts on the old names are gone since "train_reranker" no longer exists in the codebase. AC verification: ``grep -rn "train_reranker" protea/`` returns 0 hits. The same grep over the whole repo (including tests/, scripts/, docs/) is also 0 except for one-line *.md docs that document the historical rename and stay as-is on purpose. Suite: 1113 passed, 10 skipped (unchanged). Part of F0 T0.6 of master plan v3. --- protea/core/feature_enricher.py | 8 +-- .../operations/export_research_dataset.py | 23 +++--- .../operations/generate_evaluation_set.py | 2 +- protea/core/operations/predict_go_terms.py | 4 +- protea/core/parquet_export.py | 2 +- ...n_reranker.py => training_dump_helpers.py} | 71 ++++++++++--------- scripts/materialize_lab_intervals.py | 2 +- ...test_datasets_and_reranker_import_smoke.py | 4 +- tests/test_knn_streaming_smoke.py | 4 +- ...anker.py => test_training_dump_helpers.py} | 4 +- 10 files changed, 65 insertions(+), 59 deletions(-) rename protea/core/{operations/train_reranker.py => training_dump_helpers.py} (97%) rename tests/{test_train_reranker.py => test_training_dump_helpers.py} (98%) diff --git a/protea/core/feature_enricher.py b/protea/core/feature_enricher.py index 8bded45..dc65bd8 100644 --- a/protea/core/feature_enricher.py +++ b/protea/core/feature_enricher.py @@ -438,9 +438,9 @@ def expand_predictions_to_ancestors( """Expand each leaf prediction to its is_a / part_of ancestor closure. Mirrors the in-loop expansion in - :func:`protea.core.operations.train_reranker._knn_transfer_and_label`, + :func:`protea.core.training_dump_helpers._knn_transfer_and_label`, pulled out so :mod:`predict_go_terms` (live inference) and - ``train_reranker`` (offline dataset generation) share a single canonical + ``the dump helper`` (offline dataset generation) share a single canonical implementation. Without it the candidate sets diverge — the lab dump expanded to ancestors, live KNN didn't, and the v9/v10 boosters scored LK / PK candidates on a feature distribution they never saw at @@ -465,7 +465,7 @@ def expand_predictions_to_ancestors( Leaf prediction records, each with at least ``protein_accession``, ``aspect``, ``go_id``, ``distance``, ``neighbor_vote_fraction``, ``neighbor_min_distance``. Must be the unmodified output of the - leaf record loop — both train_reranker and predict_go_terms emit + leaf record loop — both the dump helper and predict_go_terms emit this shape. parent_map: ``{child_go_id: {parent_go_id, ...}}`` for is_a / part_of edges. @@ -582,7 +582,7 @@ def load_parent_map(session: Session, snapshot_id: uuid.UUID) -> dict[str, set[s """``{child_go_id: {parent_go_id, ...}}`` for is_a + part_of edges in a given :class:`OntologySnapshot`. 
Used by the ancestor-expansion helper above; both the live ``predict_go_terms`` path and offline - ``train_reranker`` should load it through this function so the closure + ``the dump helper`` should load it through this function so the closure they pass to :func:`expand_predictions_to_ancestors` is identical.""" from sqlalchemy import text diff --git a/protea/core/operations/export_research_dataset.py b/protea/core/operations/export_research_dataset.py index 09cf28e..3f8b29d 100644 --- a/protea/core/operations/export_research_dataset.py +++ b/protea/core/operations/export_research_dataset.py @@ -1,14 +1,15 @@ -"""Export a frozen re-ranker dataset for ``protea-reranker-lab``. +"""Export a frozen re-ranker dataset for protea-reranker-lab. -This operation runs the same KNN + feature-generation pipeline as -``train_reranker_auto`` but skips the LightGBM training stage and -publishes the resulting parquets + manifest via the configured +Runs the same KNN + feature-generation pipeline as the now-renamed +research dataset dump helper (``training_dump_helpers``), skips the +LightGBM training stage, and publishes the resulting parquets + +manifest via the configured :class:`~protea.infrastructure.storage.ArtifactStore` (local FS by default, MinIO when enabled via the ``storage`` compose profile). -Why a dedicated operation instead of ``train_reranker_auto --dump-only``? +Why a dedicated operation instead of dump-only mode of the helper? -* Narrower payload — only the knobs that matter for export, no +* Narrower payload: only the knobs that matter for export, no LightGBM-specific fields. * Routes output through the storage abstraction, so the lab can consume from MinIO without every export having to know a local dump path on @@ -29,7 +30,7 @@ from sqlalchemy.orm import Session from protea.core.contracts.operation import EmitFn, OperationResult, ProteaPayload -from protea.core.operations.train_reranker import TrainRerankerAutoOperation +from protea.core.training_dump_helpers import TrainRerankerAutoOperation from protea.infrastructure.orm.models.embedding.dataset import Dataset from protea.infrastructure.settings import load_settings from protea.infrastructure.storage import get_artifact_store @@ -50,7 +51,7 @@ class ExportResearchDatasetPayload(ProteaPayload, frozen=True): # the lab manifest and the key prefix ``datasets/{output_name}/``. output_name: str - # KNN + feature generation knobs (mirror train_reranker_auto). + # KNN + feature generation knobs (mirror the dump helper). k: PositiveInt = 5 search_backend: str = "faiss" compute_alignments: bool = False @@ -126,10 +127,10 @@ def execute( raise ValueError(f"Dataset {p.output_name!r} already exists") def _relay(event: str, scope: str | None, evt_payload: dict[str, Any], level: str) -> None: - # Surface the underlying train_reranker_auto events under this + # Surface the underlying dump-helper events under this # operation's namespace so the job event log reads naturally. - if event.startswith("train_reranker_auto."): - event = "export_research_dataset." + event[len("train_reranker_auto."):] + if event.startswith("dump_helper."): + event = "export_research_dataset." 
+ event[len("dump_helper."):] emit(event, scope, evt_payload, level) # type: ignore[arg-type] with tempfile.TemporaryDirectory(prefix="protea_export_") as tmp: diff --git a/protea/core/operations/generate_evaluation_set.py b/protea/core/operations/generate_evaluation_set.py index c684352..fc1081b 100644 --- a/protea/core/operations/generate_evaluation_set.py +++ b/protea/core/operations/generate_evaluation_set.py @@ -147,7 +147,7 @@ def execute( session.flush() # Persist the full ground-truth (nk/lk/pk/known/pk_known) to the artifact - # store. Downstream consumers (train_reranker_auto, cafaeval) read this + # store. Downstream consumers (the dump helper, cafaeval) read this # parquet via load_evaluation_data_for_set instead of recomputing. project_root = Path(__file__).resolve().parents[3] store = get_artifact_store(load_settings(project_root)) diff --git a/protea/core/operations/predict_go_terms.py b/protea/core/operations/predict_go_terms.py index 93487af..6e33b92 100644 --- a/protea/core/operations/predict_go_terms.py +++ b/protea/core/operations/predict_go_terms.py @@ -199,7 +199,7 @@ class PredictGOTermsPayload(ProteaPayload, frozen=True): # synthesised as additional records — required to match the candidate # distribution the lab booster saw at training time. Without this the # live PredictionSet has ~5-10× fewer candidates per (protein, aspect) - # than ``train_reranker_auto``'s dump, and LK / PK fmax collapses + # than ``the dump helper``'s dump, and LK / PK fmax collapses # because the booster's score distribution is calibrated against the # richer expanded set. See ``feature_enricher.expand_predictions_to_ancestors``. expand_votes_to_ancestors: bool = False @@ -766,7 +766,7 @@ def execute( # Ancestor expansion — required for the lab booster's candidate # distribution. Runs AFTER v6 enrichment so synthetic ancestor # records inherit the leaf's anc2vec_/emb_pca_ values, mirroring - # what train_reranker emits. + # what the dump helper emits. if p.expand_votes_to_ancestors and prediction_dicts: from sqlalchemy import select diff --git a/protea/core/parquet_export.py b/protea/core/parquet_export.py index 6972c47..7301532 100644 --- a/protea/core/parquet_export.py +++ b/protea/core/parquet_export.py @@ -9,7 +9,7 @@ This module is shared between two producers: -* ``train_reranker_auto`` (operation) — runs KNN + feature generation for +* ``the dump helper`` (operation) — runs KNN + feature generation for training and optionally dumps the resulting shards. * ``export_research_dataset`` (operation) — runs the same generation but only to publish the frozen dataset via an ``ArtifactStore`` (local or diff --git a/protea/core/operations/train_reranker.py b/protea/core/training_dump_helpers.py similarity index 97% rename from protea/core/operations/train_reranker.py rename to protea/core/training_dump_helpers.py index 4257e12..a57feae 100644 --- a/protea/core/operations/train_reranker.py +++ b/protea/core/training_dump_helpers.py @@ -1,14 +1,16 @@ -"""Train LightGBM re-rankers from temporal holdout pairs. - -Provides two operations: - -* ``train_reranker`` — single pair (old → new annotation set). -* ``train_reranker_auto`` — automated multi-split training: generates - consecutive pairs from a list of GOA version numbers, concatenates all - labeled data, trains one combined model, and evaluates on a held-out - test split. - -Both operations run entirely in-process (no RabbitMQ coordination). +"""Helpers used to generate frozen re-ranker datasets in-process. 
+ +Survives as a container for the KNN, feature-engineering, +streaming-parquet, and reference-loading utilities consumed by +``ExportResearchDatasetOperation``. The module used to expose two +operations (single-pair and multi-split training) wired into the +OperationRegistry; LightGBM training itself moved to the standalone +protea-reranker-lab repo, so the operations are unregistered. +``TrainRerankerAutoOperation.execute()`` still runs the dump pipeline +(KNN + feature generation + parquet emission) for the export operation +in ``dump_only=True`` mode. + +All execution is in-process; no RabbitMQ coordination. """ from __future__ import annotations @@ -105,7 +107,7 @@ class StreamOutput: class TrainRerankerPayload(ProteaPayload, frozen=True): - """Payload for the train_reranker operation.""" + """Payload for the dump_helper operation.""" name: str old_annotation_set_id: str @@ -218,7 +220,7 @@ def _load_parent_map( parent_map.setdefault(str(child), set()).add(str(parent)) return parent_map -# ── bulk embedding preload (used by train_reranker_auto) ───────────── +# ── bulk embedding preload (used by dump_helper) ───────────── def _preload_all_embeddings( @@ -249,7 +251,7 @@ def _preload_all_embeddings( total, dim = int(count_row[0]), int(count_row[1]) if count_row[1] else 960 emit( - "train_reranker_auto.preloading_embeddings", + "dump_helper.preloading_embeddings", None, {"total": total, "dim": dim}, "info", @@ -279,7 +281,7 @@ def _preload_all_embeddings( acc_to_idx = {acc: i for i, acc in enumerate(accessions)} emit( - "train_reranker_auto.embeddings_preloaded", + "dump_helper.embeddings_preloaded", None, { "total": len(accessions), @@ -352,7 +354,7 @@ def _build_reference_from_cache( "go_map": aspect_go_map[asp], } emit( - "train_reranker.aspect_loaded", + "dump_helper.aspect_loaded", None, {"aspect": asp, "references": len(indices)}, "info", @@ -463,7 +465,7 @@ def _knn_transfer_and_label( # - ``ref["indices"]`` + ``embedding_pool`` (preload-aware path); # no per-aspect float16 copy is held in the dict. # - ``ref["embeddings"]`` (legacy path used by the single-version - # train_reranker that loads embeddings per aspect from SQL). + # dump_helper that loads embeddings per aspect from SQL). indices = ref.get("indices") if indices is not None and embedding_pool is not None: ref_f32 = embedding_pool[indices].astype(np.float32) @@ -1040,7 +1042,7 @@ def _emit(rec: dict[str, Any]) -> None: class TrainRerankerAutoPayload(ProteaPayload, frozen=True): - """Payload for the train_reranker_auto operation. + """Payload for the dump_helper operation. Generates consecutive temporal pairs from ``train_versions``, runs KNN once per pair, then trains 3 per-category LightGBM models (NK, LK, PK) @@ -1177,10 +1179,13 @@ class TrainRerankerAutoOperation: as ``{name}-{category}``. """ - name = "train_reranker_auto" + # Unregistered since LightGBM training moved to protea-reranker-lab. + # Kept as in-process helper invoked from ExportResearchDatasetOperation. + name = "research_dataset_dump_helper" description = ( - "Train one LightGBM re-ranker per category (NK/LK/PK) across multiple " - "consecutive temporal holdout pairs and evaluate on a held-out split." + "Run KNN + feature generation across multiple temporal holdout " + "pairs and emit frozen parquets. Originally also trained " + "LightGBM models; that path now lives in protea-reranker-lab." 
) def summarize_payload(self, payload: dict[str, Any], *, session: Session | None = None) -> str: @@ -1225,7 +1230,7 @@ def _dump_frozen_dataset( annotation_source: str, ) -> dict[str, Any]: """Thin wrapper that delegates to ``parquet_export`` — kept so - ``train_reranker_auto`` can still dump a frozen dataset to a local + ``dump_helper`` can still dump a frozen dataset to a local path via ``dump_to=...``. New code should prefer the ``export_research_dataset`` operation which publishes via the configured ``ArtifactStore``. @@ -1319,7 +1324,7 @@ def execute( if len(parts) >= 2: ia_weights[parts[0]] = float(parts[1]) emit( - "train_reranker_auto.ia_loaded", + "dump_helper.ia_loaded", None, {"ia_file": p.ia_file, "n_terms": len(ia_weights)}, "info", @@ -1327,7 +1332,7 @@ def execute( max_models = len(candidate_names) emit( - "train_reranker_auto.start", + "dump_helper.start", None, { "name": p.name, @@ -1396,7 +1401,7 @@ def execute( # other feature is correct. pca_state = _load_or_fit_pca_state(emb_config_id, all_embeddings) emit( - "train_reranker_auto.pca_fit", + "dump_helper.pca_fit", None, { "n_refs": int(all_embeddings.shape[0]), @@ -1429,7 +1434,7 @@ def execute( new_set_id = version_to_set[v_new] emit( - "train_reranker_auto.split_start", + "dump_helper.split_start", None, {"split": i + 1, "v_old": v_old, "v_new": v_new}, "info", @@ -1470,7 +1475,7 @@ def execute( if not all_query_accessions: emit( - "train_reranker_auto.split_skipped", + "dump_helper.split_skipped", None, {"split": i + 1, "reason": "no ground truth in any category"}, "warning", @@ -1502,7 +1507,7 @@ def execute( if not valid_queries: emit( - "train_reranker_auto.split_skipped", + "dump_helper.split_skipped", None, {"split": i + 1, "reason": "no query embeddings"}, "warning", @@ -1672,7 +1677,7 @@ def execute( valid_split_versions.append((v_old, v_new)) per_split_stats.append(split_stats) - emit("train_reranker_auto.split_done", None, split_stats, "info") + emit("dump_helper.split_done", None, split_stats, "info") # Check we have data if not any(split_files[c] for c in _CATEGORIES): @@ -1685,7 +1690,7 @@ def execute( test_new_set_id = version_to_set[test_new_v] emit( - "train_reranker_auto.test_knn", + "dump_helper.test_knn", None, {"test_old": test_old_v, "test_new": test_new_v}, "info", @@ -1882,7 +1887,7 @@ def execute( # pass ``dump_to`` — ExportResearchDatasetOperation always does. if not p.dump_to: raise ValueError( - "train_reranker_auto requires dump_to — LightGBM " + "dump_helper requires dump_to — LightGBM " "training has been moved to protea-reranker-lab. Use " "ExportResearchDatasetOperation / POST /datasets." ) @@ -1899,7 +1904,7 @@ def execute( ontology_snapshot_id=str(ontology_snapshot_id), annotation_source=p.annotation_source, ) - emit("train_reranker_auto.dump_done", None, dump_stats, "info") + emit("dump_helper.dump_done", None, dump_stats, "info") elapsed = round(time.perf_counter() - t0, 1) result: dict[str, Any] = { "dumped": True, @@ -1907,7 +1912,7 @@ def execute( "dump_stats": dump_stats, "elapsed_seconds": elapsed, } - emit("train_reranker_auto.done", None, result, "info") + emit("dump_helper.done", None, result, "info") return OperationResult(result=result) finally: diff --git a/scripts/materialize_lab_intervals.py b/scripts/materialize_lab_intervals.py index 417c2f0..ad00de1 100644 --- a/scripts/materialize_lab_intervals.py +++ b/scripts/materialize_lab_intervals.py @@ -1,7 +1,7 @@ """Materialize EvaluationSet + QuerySet rows for every snapshot pair the lab benchmarks consume. 
-The lab dump (``train_reranker_auto`` with ``dump_only=True``) historically +The lab dump (the dump helper with ``dump_only=True``) historically recomputed the per-pair delta on the fly via ``compute_evaluation_data``. That violated the project rule "never recompute on-the-fly what can be persisted and reused". This script materializes the missing artefacts so diff --git a/tests/test_datasets_and_reranker_import_smoke.py b/tests/test_datasets_and_reranker_import_smoke.py index e1451be..ae0203c 100644 --- a/tests/test_datasets_and_reranker_import_smoke.py +++ b/tests/test_datasets_and_reranker_import_smoke.py @@ -319,5 +319,5 @@ def test_export_dataset_registered_training_ops_not(self): names = set(registry._ops.keys()) # type: ignore[attr-defined] assert "export_research_dataset" in names - assert "train_reranker" not in names - assert "train_reranker_auto" not in names + # Historical training operations remain unregistered. + assert "research_dataset_dump_helper" not in names diff --git a/tests/test_knn_streaming_smoke.py b/tests/test_knn_streaming_smoke.py index d7205ed..c9af7ea 100644 --- a/tests/test_knn_streaming_smoke.py +++ b/tests/test_knn_streaming_smoke.py @@ -17,7 +17,7 @@ import pyarrow.parquet as pq import pytest -from protea.core.operations.train_reranker import StreamOutput, _knn_transfer_and_label +from protea.core.training_dump_helpers import StreamOutput, _knn_transfer_and_label class _StubAnc2Vec: @@ -137,7 +137,7 @@ def _run(mode: str, tmp_path: Path | None = None, *, expand: bool, pivot=None): ) with patch( - "protea.core.operations.train_reranker.get_anc2vec_index", + "protea.core.training_dump_helpers.get_anc2vec_index", return_value=_StubAnc2Vec(), ): return _knn_transfer_and_label( diff --git a/tests/test_train_reranker.py b/tests/test_training_dump_helpers.py similarity index 98% rename from tests/test_train_reranker.py rename to tests/test_training_dump_helpers.py index b3520a0..8e04140 100644 --- a/tests/test_train_reranker.py +++ b/tests/test_training_dump_helpers.py @@ -1,4 +1,4 @@ -"""Unit tests for protea.core.operations.train_reranker. +"""Unit tests for protea.core.training_dump_helpers. Covers ``TrainRerankerPayload`` (still imported by the lab via the ``protea_reranker_lab.contracts`` mirror) and the few module-level @@ -17,7 +17,7 @@ import numpy as np import pytest -from protea.core.operations.train_reranker import ( +from protea.core.training_dump_helpers import ( TrainRerankerPayload, _load_sequences, _load_taxonomy_ids, From 55f3ce12515e648eb42a1d5645822bfee811b9bc Mon Sep 17 00:00:00 2001 From: frapercan Date: Tue, 5 May 2026 17:36:04 +0200 Subject: [PATCH 52/73] refactor(training_dump_helpers): T0.6 deeper - drop dead TrainRerankerPayload Continues T0.6 (commit 527e51c) by removing TrainRerankerPayload, the single-pair payload class that no production code referenced. Used to live in train_reranker.py for an Operation that was retired when LightGBM training moved to protea-reranker-lab; the class hung on because tests in test_training_dump_helpers.py exercised it. - Class definition deleted. - Helper signature ``_knn_transfer_and_label`` simplified from ``p: TrainRerankerPayload | TrainRerankerAutoPayload`` to ``p: TrainRerankerAutoPayload``. - Cross-reference comments inlined directly into TrainRerankerAuto Payload field docstrings (KNN backend rationale, ancestor expansion rules, embedding PCA explanation). - 15 tests in TestTrainRerankerPayload removed; only the helper tests (_load_sequences, _load_taxonomy_ids) remain. 
- Header docstring trimmed to reflect new scope. LOC reduction: training_dump_helpers.py from 1914 to 1860 LOC. Suite: 1098 passed, 10 skipped (was 1113 - 15 dead payload tests). The deeper inline of TrainRerankerAutoOperation into ExportResearchDatasetOperation is deferred to F2 once the feature registry is in place; doing it now would balloon export_research_ dataset by 600 LOC of execute() body for marginal gain. Part of F0 T0.6 of master plan v3. --- protea/core/training_dump_helpers.py | 99 +++++------------------ tests/test_training_dump_helpers.py | 117 +-------------------------- 2 files changed, 24 insertions(+), 192 deletions(-) diff --git a/protea/core/training_dump_helpers.py b/protea/core/training_dump_helpers.py index a57feae..43adfce 100644 --- a/protea/core/training_dump_helpers.py +++ b/protea/core/training_dump_helpers.py @@ -106,81 +106,11 @@ class StreamOutput: # --------------------------------------------------------------------------- -class TrainRerankerPayload(ProteaPayload, frozen=True): - """Payload for the dump_helper operation.""" - - name: str - old_annotation_set_id: str - new_annotation_set_id: str - embedding_config_id: str - ontology_snapshot_id: str - - # Evaluation category - category: str = "nk" - - # KNN parameters — default to FAISS IVFFlat. Numpy brute-force on 500k+ refs - # materialises a full (n_queries × n_refs) distance matrix that peaks at - # ~10 GB per aspect. IVFFlat keeps peak memory ~2.5 GB and is 5-10× faster. - limit_per_entry: PositiveInt = 5 - distance_threshold: float | None = None - search_backend: str = "faiss" - metric: str = "cosine" - faiss_index_type: str = "IVFFlat" - faiss_nlist: int = 256 - faiss_nprobe: int = 32 - - # LightGBM parameters - num_boost_round: int = 1000 - early_stopping_rounds: int = 50 - val_fraction: float = 0.2 - neg_pos_ratio: float | None = None - - # Feature computation - compute_alignments: bool = False - compute_taxonomy: bool = False - - # Ancestor expansion: when True, synthesize candidate records for every - # ancestor of each leaf GO term voted by a neighbor (True Path Rule at - # vote time). Weight of the inherited vote = IA(ancestor)/IA(leaf) when - # an IA table is available; 1.0 otherwise. Expands the candidate set - # and helps the reranker learn on abstract terms that never get direct - # KNN votes but do appear in ground truth. - expand_votes_to_ancestors: bool = False - - # Sequence-embedding PCA: when True, fit PCA(16) once on the reference - # embedding pool, project each query, and emit 16 extra features - # (emb_pca_query_0..15) per candidate row. Gives LightGBM a location - # signal in PLM space beyond the scalar query↔ref distance. 
- use_embedding_pca: bool = False - - # Per-aspect model (None = all aspects) - aspect: str | None = None - - @field_validator( - "old_annotation_set_id", - "new_annotation_set_id", - "embedding_config_id", - "ontology_snapshot_id", - "name", - mode="before", - ) - @classmethod - def must_be_non_empty(cls, v: str) -> str: - if not isinstance(v, str) or not v.strip(): - raise ValueError("must be a non-empty string") - return v.strip() - - @field_validator("category", mode="before") - @classmethod - def valid_category(cls, v: str) -> str: - if v not in ("nk", "lk", "pk"): - raise ValueError("category must be nk, lk, or pk") - return v - - -# --------------------------------------------------------------------------- -# Operation -# --------------------------------------------------------------------------- +# NOTE: a single-pair ``TrainRerankerPayload`` used to live here for an +# Operation that trained one LightGBM model per (old, new) snapshot pair. +# That Operation was retired when LightGBM training moved to +# protea-reranker-lab; the payload was kept around for tests until T0.6 +# pruned it (no production code referenced it). @@ -412,7 +342,7 @@ def _knn_transfer_and_label( go_id_map: dict[int, str], aspect_map: dict[int, str], gt_pairs: set[tuple[str, str]], - p: TrainRerankerPayload | TrainRerankerAutoPayload, + p: TrainRerankerAutoPayload, *, sequence_context: SequenceContext | None = None, query_known_gos: dict[str, set[str]] | None = None, @@ -1061,7 +991,10 @@ class TrainRerankerAutoPayload(ProteaPayload, frozen=True): # Annotation source in annotation_set (default "goa") annotation_source: str = "goa" - # KNN parameters — see TrainRerankerPayload for rationale. + # KNN parameters. Default to FAISS IVFFlat: numpy brute-force on 500k+ + # refs materialises a full (n_queries x n_refs) distance matrix that + # peaks at ~10 GB per aspect. IVFFlat keeps peak memory ~2.5 GB and + # is 5-10x faster. limit_per_entry: PositiveInt = 5 distance_threshold: float | None = None search_backend: str = "faiss" @@ -1092,10 +1025,18 @@ class TrainRerankerAutoPayload(ProteaPayload, frozen=True): # CAFA evaluation which uses IA weighting. ia_file: str | None = None - # Ancestor expansion (see TrainRerankerPayload.expand_votes_to_ancestors). + # Ancestor expansion: when True, synthesize candidate records for every + # ancestor of each leaf GO term voted by a neighbor (True Path Rule at + # vote time). Weight of the inherited vote = IA(ancestor)/IA(leaf) when + # an IA table is available; 1.0 otherwise. Expands the candidate set + # and helps the reranker learn on abstract terms that never get direct + # KNN votes but do appear in ground truth. expand_votes_to_ancestors: bool = False - # Sequence-embedding PCA (see TrainRerankerPayload.use_embedding_pca). + # Sequence-embedding PCA: when True, fit PCA(16) once on the reference + # embedding pool, project each query, and emit 16 extra features + # (emb_pca_query_0..15) per candidate row. Gives LightGBM a location + # signal in PLM space beyond the scalar query<->ref distance. use_embedding_pca: bool = False # Training scope: diff --git a/tests/test_training_dump_helpers.py b/tests/test_training_dump_helpers.py index 8e04140..76d9dc4 100644 --- a/tests/test_training_dump_helpers.py +++ b/tests/test_training_dump_helpers.py @@ -1,132 +1,23 @@ """Unit tests for protea.core.training_dump_helpers. 
-Covers ``TrainRerankerPayload`` (still imported by the lab via the -``protea_reranker_lab.contracts`` mirror) and the few module-level -helpers (``_load_sequences``, ``_load_taxonomy_ids``) that -``TrainRerankerAutoOperation`` keeps using to drive the dataset-export -pipeline. Heavy DB / model training is no longer tested here — LightGBM -training lives in ``protea-reranker-lab``. +Covers the module-level helpers (``_load_sequences``, +``_load_taxonomy_ids``) that ``TrainRerankerAutoOperation`` uses to +drive the dataset-export pipeline. Heavy DB / model training is no +longer tested here: LightGBM training lives in protea-reranker-lab. """ from __future__ import annotations import uuid -from typing import Any from unittest.mock import MagicMock import numpy as np -import pytest from protea.core.training_dump_helpers import ( - TrainRerankerPayload, _load_sequences, _load_taxonomy_ids, ) -# --------------------------------------------------------------------------- -# Payload validation -# --------------------------------------------------------------------------- - - -class TestTrainRerankerPayload: - def _valid_kwargs(self, **overrides) -> dict[str, Any]: - defaults = { - "name": "test-model", - "old_annotation_set_id": str(uuid.uuid4()), - "new_annotation_set_id": str(uuid.uuid4()), - "embedding_config_id": str(uuid.uuid4()), - "ontology_snapshot_id": str(uuid.uuid4()), - } - defaults.update(overrides) - return defaults - - def test_valid_payload(self): - p = TrainRerankerPayload(**self._valid_kwargs()) - assert p.name == "test-model" - assert p.category == "nk" - assert p.limit_per_entry == 5 - - def test_empty_name_raises(self): - with pytest.raises(ValueError): - TrainRerankerPayload(**self._valid_kwargs(name="")) - - def test_whitespace_name_raises(self): - with pytest.raises(ValueError): - TrainRerankerPayload(**self._valid_kwargs(name=" ")) - - def test_empty_old_annotation_set_id_raises(self): - with pytest.raises(ValueError): - TrainRerankerPayload(**self._valid_kwargs(old_annotation_set_id="")) - - def test_empty_new_annotation_set_id_raises(self): - with pytest.raises(ValueError): - TrainRerankerPayload(**self._valid_kwargs(new_annotation_set_id="")) - - def test_empty_embedding_config_id_raises(self): - with pytest.raises(ValueError): - TrainRerankerPayload(**self._valid_kwargs(embedding_config_id="")) - - def test_empty_ontology_snapshot_id_raises(self): - with pytest.raises(ValueError): - TrainRerankerPayload(**self._valid_kwargs(ontology_snapshot_id="")) - - def test_invalid_category_raises(self): - with pytest.raises(ValueError): - TrainRerankerPayload(**self._valid_kwargs(category="invalid")) - - def test_valid_categories(self): - for cat in ("nk", "lk", "pk"): - p = TrainRerankerPayload(**self._valid_kwargs(category=cat)) - assert p.category == cat - - def test_custom_knn_params(self): - p = TrainRerankerPayload( - **self._valid_kwargs( - limit_per_entry=10, - distance_threshold=0.5, - search_backend="faiss", - metric="euclidean", - ) - ) - assert p.limit_per_entry == 10 - assert p.distance_threshold == 0.5 - assert p.search_backend == "faiss" - - def test_custom_lightgbm_params(self): - p = TrainRerankerPayload( - **self._valid_kwargs( - num_boost_round=500, - early_stopping_rounds=25, - val_fraction=0.1, - neg_pos_ratio=3.0, - ) - ) - assert p.num_boost_round == 500 - assert p.early_stopping_rounds == 25 - assert p.val_fraction == 0.1 - assert p.neg_pos_ratio == 3.0 - - def test_feature_flags_default_false(self): - p = TrainRerankerPayload(**self._valid_kwargs()) - 
assert p.compute_alignments is False - assert p.compute_taxonomy is False - - def test_aspect_filter(self): - p = TrainRerankerPayload(**self._valid_kwargs(aspect="bpo")) - assert p.aspect == "bpo" - - def test_name_is_stripped(self): - p = TrainRerankerPayload(**self._valid_kwargs(name=" my model ")) - assert p.name == "my model" - - def test_limit_per_entry_must_be_positive(self): - with pytest.raises(ValueError): - TrainRerankerPayload(**self._valid_kwargs(limit_per_entry=0)) - - with pytest.raises(ValueError): - TrainRerankerPayload(**self._valid_kwargs(limit_per_entry=-1)) - - # --------------------------------------------------------------------------- # _load_sequences (used by TrainRerankerAutoOperation in dump_only mode) # --------------------------------------------------------------------------- From ef13faccace54e141065aad6d4ae32b0d02d407c Mon Sep 17 00:00:00 2001 From: frapercan Date: Tue, 5 May 2026 17:39:32 +0200 Subject: [PATCH 53/73] feat(config): T-CONF.2 add WorkerTuning category Second category of the externalised tuning settings. Migrates 7 hardcoded constants from the WorkerTuning row in CONFIG_INVENTORY: - db_pool_size (engine.py:12) 20 - db_pool_max_overflow (engine.py:13) 40 - db_pool_recycle_seconds (engine.py:14) 3600 - model_cache_max (compute_embeddings) 1 - ref_cache_max (predict_go_terms) 1 - reaper_main_timeout_seconds (worker) 86400 (was incorrectly 21600 in the inventory; fixed to match scripts/worker.py) - reaper_default_timeout_seconds 3600 - reaper_stall_seconds 1800 - api_cache_default_ttl_seconds 300.0 Behavioural: - infrastructure/database/engine.py: build_engine() reads pool settings from get_tuning().worker. - core/operations/compute_embeddings.py: removes _MODEL_CACHE_MAX constant; reads dynamically inside _get_or_load_model. - core/operations/predict_go_terms.py: removes _REF_CACHE_MAX constant; reads dynamically before evicting. - api/cache.py: removes _DEFAULT_TTL constant; exposes _default_ttl() function for callers that want the resolved default. The constant was never imported by anyone; it only appeared in __all__. - scripts/worker.py: reaper mode reads reaper_main_timeout_seconds and reaper_stall_seconds from settings, configurable via PROTEA_TUNING__WORKER__REAPER_MAIN_TIMEOUT_SECONDS and PROTEA_TUNING__WORKER__REAPER_STALL_SECONDS. 8 new tests in test_tuning.py: WorkerTuning defaults (pool, cache, reaper), validation (pool>0, reaper>=300), TuningSettings compose with worker, env override of db_pool_size. Suite: 1106 passed, 10 skipped (was 1098 + 8). Two of five categories migrated. OperationTuning, APILimits, ResearchKnobs follow. Part of F0 T-CONF.2 of master plan v3. --- protea/api/cache.py | 11 +++- protea/config/tuning.py | 67 ++++++++++++++++++++ protea/core/operations/compute_embeddings.py | 6 +- protea/core/operations/predict_go_terms.py | 6 +- protea/infrastructure/database/engine.py | 9 ++- scripts/worker.py | 27 ++++++-- tests/test_tuning.py | 58 +++++++++++++++++ 7 files changed, 167 insertions(+), 17 deletions(-) diff --git a/protea/api/cache.py b/protea/api/cache.py index 98d12c9..ea64450 100644 --- a/protea/api/cache.py +++ b/protea/api/cache.py @@ -1,7 +1,7 @@ """Tiny in-process TTL cache for aggregate API endpoints. Built for stats/listing endpoints that run DISTINCT-over-JOIN queries on 10M+ -row tables — queries that are structurally slow (tens of seconds) and whose +row tables: queries that are structurally slow (tens of seconds) and whose results change slowly enough that a 5-minute TTL is not user-visible. 
Process-local by design: resets on uvicorn restart, does not need Redis, does @@ -15,7 +15,12 @@ from collections.abc import Callable from typing import Any -_DEFAULT_TTL = 300.0 # 5 minutes +from protea.config.tuning import get_tuning + + +def _default_ttl() -> float: + """Resolved each call so env/yaml overrides apply at runtime.""" + return get_tuning().worker.api_cache_default_ttl_seconds _lock = threading.Lock() _store: dict[str, tuple[float, Any]] = {} @@ -43,4 +48,4 @@ def invalidate(key: str | None = None) -> None: _store.pop(key, None) -__all__ = ["cached", "invalidate", "_DEFAULT_TTL"] +__all__ = ["cached", "invalidate", "_default_ttl"] diff --git a/protea/config/tuning.py b/protea/config/tuning.py index 0c7604a..0cf0ef9 100644 --- a/protea/config/tuning.py +++ b/protea/config/tuning.py @@ -79,10 +79,77 @@ class QueueTuning(BaseModel): ) +class WorkerTuning(BaseModel): + """Pool sizes, in-process caches and reaper timeouts. + + Sources: ``infrastructure/database/engine.py``, + ``infrastructure/operations/{compute_embeddings,predict_go_terms}.py``, + ``workers/stale_job_reaper.py``, ``api/cache.py`` (ver + ``docs/CONFIG_INVENTORY.md`` §B). + """ + + db_pool_size: int = Field( + default=20, + ge=1, + description="SQLAlchemy connection pool size. Tunear según concurrencia esperada.", + ) + db_pool_max_overflow: int = Field( + default=40, + ge=0, + description="Conexiones extra permitidas sobre el pool size cuando hay pico.", + ) + db_pool_recycle_seconds: int = Field( + default=3600, + ge=60, + description=( + "Reciclar conexiones tras N segundos para evitar idle-timeout silencioso del DB." + ), + ) + model_cache_max: int = Field( + default=1, + ge=1, + description=( + "Modelos PLM en cache por proceso de embeddings. >1 acumula GB en GPU." + ), + ) + ref_cache_max: int = Field( + default=1, + ge=1, + description="Reference data sets en cache por proceso predict.", + ) + reaper_main_timeout_seconds: int = Field( + default=86400, + ge=300, + description=( + "Timeout duro antes de marcar jobs FAILED en producción (default 24h). " + "Coordinator jobs como compute_embeddings pueden correr <1d en datasets " + "grandes; este es el corte global." + ), + ) + reaper_default_timeout_seconds: int = Field( + default=3600, + ge=300, + description="Default constructor de StaleJobReaper (sobrescrito por main).", + ) + reaper_stall_seconds: int = Field( + default=1800, + ge=60, + description=( + "Tiempo sin JobEvent antes de considerar un job stalled candidato a reapear." + ), + ) + api_cache_default_ttl_seconds: float = Field( + default=300.0, + ge=1.0, + description="TTL default cache HTTP (api/cache.py). 5 min por defecto.", + ) + + class TuningSettings(BaseModel): """Root tuning model that composes per-category sub-models.""" queue: QueueTuning = Field(default_factory=QueueTuning) + worker: WorkerTuning = Field(default_factory=WorkerTuning) def _load_yaml_tuning(project_root: Path) -> dict[str, Any]: diff --git a/protea/core/operations/compute_embeddings.py b/protea/core/operations/compute_embeddings.py index dc638af..9321a80 100644 --- a/protea/core/operations/compute_embeddings.py +++ b/protea/core/operations/compute_embeddings.py @@ -606,13 +606,15 @@ def _update_parent_progress(self, session: Session, parent_job_id: UUID, emit: E # all subsequent batch messages with the same config. Max 1 entry to avoid # accumulating multi-GB models in GPU memory when configs change. 
_MODEL_CACHE: dict[tuple[str, str, str], tuple[Any, Any]] = {} -_MODEL_CACHE_MAX = 1 def _get_or_load_model(config: EmbeddingConfig, device: str, emit: EmitFn) -> tuple[Any, Any]: + from protea.config.tuning import get_tuning + + cache_max = get_tuning().worker.model_cache_max key = (config.model_name, config.model_backend, device) if key not in _MODEL_CACHE: - if len(_MODEL_CACHE) >= _MODEL_CACHE_MAX: + if len(_MODEL_CACHE) >= cache_max: evict_key = next(iter(_MODEL_CACHE)) old_model, old_tokenizer = _MODEL_CACHE.pop(evict_key) del old_model, old_tokenizer diff --git a/protea/core/operations/predict_go_terms.py b/protea/core/operations/predict_go_terms.py index 6e33b92..1956689 100644 --- a/protea/core/operations/predict_go_terms.py +++ b/protea/core/operations/predict_go_terms.py @@ -80,7 +80,6 @@ # Limited to 1 entry — evicts previous reference on config change. # --------------------------------------------------------------------------- _REF_CACHE: dict[tuple[str, str, bool], dict[str, Any]] = {} -_REF_CACHE_MAX = 1 # --------------------------------------------------------------------------- # v6 reranker feature constants @@ -592,7 +591,10 @@ def execute( cache_key = (p.embedding_config_id, p.annotation_set_id, p.aspect_separated_knn) if cache_key not in _REF_CACHE: # Evict oldest entry when cache is full to free numpy arrays from memory. - if len(_REF_CACHE) >= _REF_CACHE_MAX: + from protea.config.tuning import get_tuning + + cache_max = get_tuning().worker.ref_cache_max + if len(_REF_CACHE) >= cache_max: evict_key = next(iter(_REF_CACHE)) del _REF_CACHE[evict_key] emit( diff --git a/protea/infrastructure/database/engine.py b/protea/infrastructure/database/engine.py index 7df6692..4df0c7b 100644 --- a/protea/infrastructure/database/engine.py +++ b/protea/infrastructure/database/engine.py @@ -3,13 +3,16 @@ from sqlalchemy import create_engine from sqlalchemy.engine import Engine +from protea.config.tuning import get_tuning + def build_engine(db_url: str) -> Engine: + settings = get_tuning().worker return create_engine( db_url, future=True, pool_pre_ping=True, - pool_size=20, - max_overflow=40, - pool_recycle=3600, + pool_size=settings.db_pool_size, + max_overflow=settings.db_pool_max_overflow, + pool_recycle=settings.db_pool_recycle_seconds, ) diff --git a/scripts/worker.py b/scripts/worker.py index d75e0b3..361cde1 100644 --- a/scripts/worker.py +++ b/scripts/worker.py @@ -57,13 +57,26 @@ def main() -> None: # Special mode: stale job reaper (no queue, just periodic DB check). if args.queue == "reaper": - # 24h hard timeout + 30min stall window. Earlier value (6h) killed - # predict_go_terms coords that waited in the batch FIFO behind other - # coords — the last ones in a 23-job batch routinely sat past 6h - # even though work was progressing upstream. With only one - # predictions.batch worker this is the expected shape of the queue. - reaper = StaleJobReaper(factory, timeout_seconds=86400) - logging.info("Stale job reaper started. timeout=86400s interval=60s") + # 24h hard timeout + 30min stall window by default. Earlier value + # (6h) killed predict_go_terms coords that waited in the batch FIFO + # behind other coords; with only one predictions.batch worker the + # last ones in a 23-job batch routinely sat past 6h even though + # work was progressing upstream. + # Both numbers configurable via WorkerTuning (PROTEA_TUNING__WORKER__ + # REAPER_MAIN_TIMEOUT_SECONDS and ..._STALL_SECONDS). 
+ from protea.config.tuning import get_tuning + + worker_settings = get_tuning().worker + reaper = StaleJobReaper( + factory, + timeout_seconds=worker_settings.reaper_main_timeout_seconds, + stall_seconds=worker_settings.reaper_stall_seconds, + ) + logging.info( + "Stale job reaper started. timeout=%ds stall=%ds interval=60s", + worker_settings.reaper_main_timeout_seconds, + worker_settings.reaper_stall_seconds, + ) reaper.run(interval_seconds=60) return diff --git a/tests/test_tuning.py b/tests/test_tuning.py index 590ef9d..3371f61 100644 --- a/tests/test_tuning.py +++ b/tests/test_tuning.py @@ -9,6 +9,7 @@ from protea.config.tuning import ( QueueTuning, TuningSettings, + WorkerTuning, _apply_env_overrides, _coerce, _load_yaml_tuning, @@ -166,3 +167,60 @@ def test_compose(self) -> None: def test_default_compose(self) -> None: s = TuningSettings() assert s.queue.publisher_max_attempts == 12 + + def test_default_worker_compose(self) -> None: + s = TuningSettings() + assert s.worker.db_pool_size == 20 + assert s.worker.model_cache_max == 1 + assert s.worker.api_cache_default_ttl_seconds == 300.0 + + +class TestWorkerTuningDefaults: + def test_pool_defaults(self) -> None: + w = WorkerTuning() + assert w.db_pool_size == 20 + assert w.db_pool_max_overflow == 40 + assert w.db_pool_recycle_seconds == 3600 + + def test_cache_defaults(self) -> None: + w = WorkerTuning() + assert w.model_cache_max == 1 + assert w.ref_cache_max == 1 + assert w.api_cache_default_ttl_seconds == pytest.approx(300.0) + + def test_reaper_defaults(self) -> None: + w = WorkerTuning() + assert w.reaper_main_timeout_seconds == 86400 + assert w.reaper_default_timeout_seconds == 3600 + assert w.reaper_stall_seconds == 1800 + + def test_validates_pool_size(self) -> None: + with pytest.raises(Exception): + WorkerTuning(db_pool_size=0) + + def test_validates_reaper_main_floor(self) -> None: + with pytest.raises(Exception): + WorkerTuning(reaper_main_timeout_seconds=10) + + def test_validates_cache_max(self) -> None: + with pytest.raises(Exception): + WorkerTuning(model_cache_max=0) + + +class TestWorkerEnvOverrides: + def setup_method(self) -> None: + get_tuning.cache_clear() + + def teardown_method(self) -> None: + get_tuning.cache_clear() + + def test_env_override_pool_size( + self, monkeypatch: pytest.MonkeyPatch, tmp_path: Path + ) -> None: + monkeypatch.setattr( + "protea.config.tuning._resolve_project_root", lambda: tmp_path + ) + monkeypatch.setenv("PROTEA_TUNING__WORKER__DB_POOL_SIZE", "50") + get_tuning.cache_clear() + s = get_tuning() + assert s.worker.db_pool_size == 50 From 54a57f83132af2445593d698bdbd255b44a7dc1b Mon Sep 17 00:00:00 2001 From: frapercan Date: Tue, 5 May 2026 17:44:24 +0200 Subject: [PATCH 54/73] feat(config): T-CONF.2 add OperationTuning category MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Third tuning category. Migrates 4 module-level chunk-size constants that were duplicated across feature_enricher, knn_search, training_dump_helpers, and predict_go_terms. OperationTuning fields: - annotation_chunk_size (10_000) feature_enricher, training_dump_helpers, predict_go_terms (5 helper sites total). - stream_chunk_size (2_000) training_dump_helpers (_preload_all_embeddings) and predict_go_terms (_load_query_embeddings). - store_chunk_size (10_000) predict_go_terms (publishing predictions to protea.predictions.write). 
- numpy_query_chunk (500) knn_search._search_numpy chunked matrix multiplication (caps the n_queries x n_refs distance matrix peak around 1 GB for default values). Removes 8 module-level constants from 4 files; resolves dynamically inside the helpers via get_tuning().operation.X. Eliminates the triplicate _ANNOTATION_CHUNK_SIZE / duplicate _STREAM_CHUNK_SIZE that the inventory flagged in CONFIG_INVENTORY.md §C. HTTP retry policy / timeouts in pydantic payloads (InsertProteinsPayload, LoadGoaAnnotationsPayload, etc.) intentionally stay where they are. Those are caller-controlled per job, not infra. 3 new tests in test_tuning.py: OperationTuning defaults, validation floors, env override of annotation_chunk_size. Suite: 1109 passed, 10 skipped (was 1106 + 3). Three of five categories migrated. APILimits and ResearchKnobs follow. Part of F0 T-CONF.2 of master plan v3. --- protea/config/tuning.py | 52 ++++++++++++++++++++ protea/core/feature_enricher.py | 9 ++-- protea/core/knn_search.py | 17 ++++--- protea/core/operations/predict_go_terms.py | 56 ++++++++++++++-------- protea/core/training_dump_helpers.py | 24 +++++++--- tests/test_tuning.py | 39 +++++++++++++++ 6 files changed, 161 insertions(+), 36 deletions(-) diff --git a/protea/config/tuning.py b/protea/config/tuning.py index 0cf0ef9..5ab10d9 100644 --- a/protea/config/tuning.py +++ b/protea/config/tuning.py @@ -145,11 +145,63 @@ class WorkerTuning(BaseModel): ) +class OperationTuning(BaseModel): + """Module-level chunk and batch sizes used inside operations. + + HTTP retry policy and per-source timeouts live inside their + respective pydantic payloads (``InsertProteinsPayload``, + ``LoadGoaAnnotationsPayload``, etc.) because the caller picks + them per-job. The values here are infra-level: how to slice + work between memory and broker pressure constraints. + + Sources: ``core/feature_enricher.py``, ``core/knn_search.py``, + ``core/operations/{predict_go_terms,training_dump_helpers}.py`` + (ver ``docs/CONFIG_INVENTORY.md`` §C). + """ + + annotation_chunk_size: int = Field( + default=10_000, + ge=100, + description=( + "Filas por chunk al cargar/iterar anotaciones. Tunear " + "según RAM disponible: 1k-100k razonable." + ), + ) + stream_chunk_size: int = Field( + default=2_000, + ge=100, + description=( + "Chunk size streaming PyArrow / SQLAlchemy yield_per. " + "Más bajo reduce pico Python-object; más alto reduce " + "round-trips. 500-10k razonable." + ), + ) + store_chunk_size: int = Field( + default=10_000, + ge=500, + description=( + "Filas por chunk al publicar predictions a la cola " + "store. RabbitMQ cap 128 MB; 10k filas serializan " + "~20-25 MB. 5k-50k según mensaje promedio." + ), + ) + numpy_query_chunk: int = Field( + default=500, + ge=10, + description=( + "Query chunk size para KNN numpy backend. Multiplicado " + "por n_refs determina el pico de la matriz de " + "distancias (500 x 500k x 4B ~ 1 GB)." 
+ ), + ) + + class TuningSettings(BaseModel): """Root tuning model that composes per-category sub-models.""" queue: QueueTuning = Field(default_factory=QueueTuning) worker: WorkerTuning = Field(default_factory=WorkerTuning) + operation: OperationTuning = Field(default_factory=OperationTuning) def _load_yaml_tuning(project_root: Path) -> dict[str, Any]: diff --git a/protea/core/feature_enricher.py b/protea/core/feature_enricher.py index dc65bd8..893ff4c 100644 --- a/protea/core/feature_enricher.py +++ b/protea/core/feature_enricher.py @@ -39,7 +39,7 @@ from protea.core.reranker import EMBEDDING_PCA_DIM from protea.infrastructure.orm.models.annotation.go_term import GOTerm -_ANNOTATION_CHUNK_SIZE = 10_000 +# Annotation chunk size is configured via OperationTuning.annotation_chunk_size. _TAX_CLOSE_RELATIONS = frozenset( {"same", "ancestor", "descendant", "child", "parent", "close"} @@ -97,9 +97,12 @@ def _load_go_term_metadata( aspect_map: dict[int, str] = {} if not go_term_ids: return go_id_map, aspect_map + from protea.config.tuning import get_tuning + + chunk_size = get_tuning().operation.annotation_chunk_size ids_list = list(go_term_ids) - for i in range(0, len(ids_list), _ANNOTATION_CHUNK_SIZE): - chunk = ids_list[i : i + _ANNOTATION_CHUNK_SIZE] + for i in range(0, len(ids_list), chunk_size): + chunk = ids_list[i : i + chunk_size] rows = ( session.query(GOTerm.id, GOTerm.go_id, GOTerm.aspect) .filter(GOTerm.id.in_(chunk)) diff --git a/protea/core/knn_search.py b/protea/core/knn_search.py index 17725d6..8e7b750 100644 --- a/protea/core/knn_search.py +++ b/protea/core/knn_search.py @@ -128,11 +128,11 @@ def search_knn( # --------------------------------------------------------------------------- -# Cap on queries processed at once. The full (n_queries × n_refs) distance -# matrix would peak at n_queries × n_refs × 4 bytes; with 500k refs a naive -# call materialises 10+ GB per aspect. Chunking caps peak at -# _NUMPY_QUERY_CHUNK × n_refs × 4 bytes (≈1 GB for 500 × 500k). -_NUMPY_QUERY_CHUNK = 500 +# Query chunk size lives in OperationTuning.numpy_query_chunk so the +# memory ceiling is tunable per deployment (the full n_queries x n_refs +# distance matrix would peak at n_queries x n_refs x 4 bytes; with 500k +# refs a naive call materialises 10+ GB per aspect; 500 x 500k x 4B +# is ~1 GB). def _search_numpy( @@ -172,8 +172,11 @@ def _search_numpy( results: list[list[tuple[str, float]]] = [] n_queries = Q.shape[0] - for start in range(0, n_queries, _NUMPY_QUERY_CHUNK): - Q_chunk = Q[start : start + _NUMPY_QUERY_CHUNK] + from protea.config.tuning import get_tuning + + query_chunk = get_tuning().operation.numpy_query_chunk + for start in range(0, n_queries, query_chunk): + Q_chunk = Q[start : start + query_chunk] if metric == "cosine": if pre_normalized: Q_n = Q_chunk / (np.linalg.norm(Q_chunk, axis=1, keepdims=True) + 1e-9) diff --git a/protea/core/operations/predict_go_terms.py b/protea/core/operations/predict_go_terms.py index 1956689..d9944b9 100644 --- a/protea/core/operations/predict_go_terms.py +++ b/protea/core/operations/predict_go_terms.py @@ -58,13 +58,14 @@ PositiveInt = Annotated[int, Field(gt=0)] -_ANNOTATION_CHUNK_SIZE = 10_000 +# Annotation and stream chunk sizes are configured via OperationTuning +# (annotation_chunk_size, stream_chunk_size) and resolved at call time +# inside the helpers below. At 1280 dims x 2 bytes (float16) x 2000 rows +# the streaming reference query fetches ~5 MB per cursor round-trip, +# keeping Python object pressure negligible. 
+ _BATCH_QUEUE = "protea.predictions.batch" _WRITE_QUEUE = "protea.predictions.write" -# Rows fetched per round-trip when streaming reference embeddings from PostgreSQL. -# At 1280 dims × 2 bytes (float16) × 2000 rows = ~5 MB per chunk — keeps Python -# object pressure negligible while amortising cursor round-trips. -_STREAM_CHUNK_SIZE = 2_000 # GO aspect single-character codes used in GOTerm.aspect — imported above # from the canonical protea.core.domain.aspect module. @@ -871,10 +872,12 @@ def execute( # advances the coordinator's batch counter (``is_final_chunk=True``) # so the parent job doesn't mark itself succeeded after the first # batch's chunks finish. - _STORE_CHUNK_SIZE = 10_000 + from protea.config.tuning import get_tuning + + store_chunk_size = get_tuning().operation.store_chunk_size chunks: list[list[dict[str, Any]]] = [ - prediction_dicts[s:s + _STORE_CHUNK_SIZE] - for s in range(0, len(prediction_dicts), _STORE_CHUNK_SIZE) + prediction_dicts[s:s + store_chunk_size] + for s in range(0, len(prediction_dicts), store_chunk_size) ] or [[]] store_messages: list[tuple[str, dict[str, Any]]] = [] for i, chunk in enumerate(chunks): @@ -1094,11 +1097,14 @@ def _load_reference_data( dim = first_emb.dimensions() # Pre-allocate float16 array; fill row-by-row via yield_per so the - # cursor fetches _STREAM_CHUNK_SIZE rows at a time — peak Python-object - # memory stays at ~chunk_size × dim × 28 bytes ≈ tens of MB, not 14 GB. + # cursor fetches stream_chunk_size rows at a time, peak Python-object + # memory stays at ~chunk_size x dim x 28 bytes ~= tens of MB, not 14 GB. + from protea.config.tuning import get_tuning + + stream_chunk = get_tuning().operation.stream_chunk_size embeddings = np.empty((total, dim), dtype=np.float16) accessions: list[str] = [] - for i, (acc, emb) in enumerate(base_q.yield_per(_STREAM_CHUNK_SIZE)): + for i, (acc, emb) in enumerate(base_q.yield_per(stream_chunk)): embeddings[i] = emb.to_numpy() accessions.append(acc) @@ -1621,11 +1627,14 @@ def _load_annotations_for( MFO-index neighbors transfer only MFO terms, etc. The join to ``go_term`` is added only when needed to keep the no-aspect path as fast as before. 
""" + from protea.config.tuning import get_tuning + + chunk_size = get_tuning().operation.annotation_chunk_size go_map: dict[str, list[dict[str, Any]]] = {} accessions_list = list(accessions) - for i in range(0, len(accessions_list), _ANNOTATION_CHUNK_SIZE): - chunk = accessions_list[i : i + _ANNOTATION_CHUNK_SIZE] + for i in range(0, len(accessions_list), chunk_size): + chunk = accessions_list[i : i + chunk_size] q = session.query( ProteinGOAnnotation.protein_accession, ProteinGOAnnotation.go_term_id, @@ -1889,10 +1898,13 @@ def _predict_batch( def _load_sequences_for_proteins( self, session: Session, accessions: set[str] ) -> dict[str, str]: + from protea.config.tuning import get_tuning + + chunk_size = get_tuning().operation.annotation_chunk_size result: dict[str, str] = {} acc_list = list(accessions) - for i in range(0, len(acc_list), _ANNOTATION_CHUNK_SIZE): - chunk = acc_list[i : i + _ANNOTATION_CHUNK_SIZE] + for i in range(0, len(acc_list), chunk_size): + chunk = acc_list[i : i + chunk_size] rows = ( session.query(Protein.accession, Sequence.sequence) .join(Protein.sequence) @@ -1923,10 +1935,13 @@ def _load_sequences_for_queries( def _load_taxonomy_ids_for_proteins( self, session: Session, accessions: set[str] ) -> dict[str, int | None]: + from protea.config.tuning import get_tuning + + chunk_size = get_tuning().operation.annotation_chunk_size result: dict[str, int | None] = {} acc_list = list(accessions) - for i in range(0, len(acc_list), _ANNOTATION_CHUNK_SIZE): - chunk = acc_list[i : i + _ANNOTATION_CHUNK_SIZE] + for i in range(0, len(acc_list), chunk_size): + chunk = acc_list[i : i + chunk_size] rows = ( session.query(Protein.accession, Protein.taxonomy_id) .filter(Protein.accession.in_(chunk)) @@ -1942,11 +1957,14 @@ def _load_taxonomy_ids_for_queries( p: PredictGOTermsBatchPayload, accessions: list[str], ) -> dict[str, int | None]: + from protea.config.tuning import get_tuning + + chunk_size = get_tuning().operation.annotation_chunk_size acc_set = set(accessions) result: dict[str, int | None] = {acc: None for acc in acc_set} acc_list = list(acc_set) - for i in range(0, len(acc_list), _ANNOTATION_CHUNK_SIZE): - chunk = acc_list[i : i + _ANNOTATION_CHUNK_SIZE] + for i in range(0, len(acc_list), chunk_size): + chunk = acc_list[i : i + chunk_size] rows = ( session.query(Protein.accession, Protein.taxonomy_id) .filter(Protein.accession.in_(chunk)) diff --git a/protea/core/training_dump_helpers.py b/protea/core/training_dump_helpers.py index 43adfce..cd3d340 100644 --- a/protea/core/training_dump_helpers.py +++ b/protea/core/training_dump_helpers.py @@ -58,8 +58,8 @@ PositiveInt = Annotated[int, Field(gt=0)] -_ANNOTATION_CHUNK_SIZE = 10_000 -_STREAM_CHUNK_SIZE = 2_000 +# Chunk sizes are configured via OperationTuning.annotation_chunk_size / +# stream_chunk_size and resolved at call time inside the helpers below. 
_LOG = logging.getLogger(__name__) @@ -187,6 +187,10 @@ def _preload_all_embeddings( "info", ) + from protea.config.tuning import get_tuning + + stream_chunk = get_tuning().operation.stream_chunk_size + embeddings = np.empty((total, dim), dtype=np.float16) accessions: list[str] = [] result_proxy = conn.execute( @@ -198,7 +202,7 @@ def _preload_all_embeddings( " AND se.embedding_config_id = :ecid" ), {"ecid": emb_config_id}, - ).yield_per(_STREAM_CHUNK_SIZE) + ).yield_per(stream_chunk) for i, (acc, emb_str) in enumerate(result_proxy): if isinstance(emb_str, str): @@ -299,10 +303,13 @@ def _load_sequences( session: Session, accessions: set[str], ) -> dict[str, str]: + from protea.config.tuning import get_tuning + + chunk_size = get_tuning().operation.annotation_chunk_size result: dict[str, str] = {} acc_list = list(accessions) - for i in range(0, len(acc_list), _ANNOTATION_CHUNK_SIZE): - chunk = acc_list[i : i + _ANNOTATION_CHUNK_SIZE] + for i in range(0, len(acc_list), chunk_size): + chunk = acc_list[i : i + chunk_size] rows = ( session.query(Protein.accession, Sequence.sequence) .join(Protein.sequence) @@ -318,10 +325,13 @@ def _load_taxonomy_ids( session: Session, accessions: set[str], ) -> dict[str, int | None]: + from protea.config.tuning import get_tuning + + chunk_size = get_tuning().operation.annotation_chunk_size result: dict[str, int | None] = {} acc_list = list(accessions) - for i in range(0, len(acc_list), _ANNOTATION_CHUNK_SIZE): - chunk = acc_list[i : i + _ANNOTATION_CHUNK_SIZE] + for i in range(0, len(acc_list), chunk_size): + chunk = acc_list[i : i + chunk_size] rows = ( session.query(Protein.accession, Protein.taxonomy_id) .filter(Protein.accession.in_(chunk)) diff --git a/tests/test_tuning.py b/tests/test_tuning.py index 3371f61..06a0280 100644 --- a/tests/test_tuning.py +++ b/tests/test_tuning.py @@ -7,6 +7,7 @@ import pytest from protea.config.tuning import ( + OperationTuning, QueueTuning, TuningSettings, WorkerTuning, @@ -224,3 +225,41 @@ def test_env_override_pool_size( get_tuning.cache_clear() s = get_tuning() assert s.worker.db_pool_size == 50 + + +class TestOperationTuningDefaults: + def test_chunk_defaults(self) -> None: + o = OperationTuning() + assert o.annotation_chunk_size == 10_000 + assert o.stream_chunk_size == 2_000 + assert o.store_chunk_size == 10_000 + assert o.numpy_query_chunk == 500 + + def test_validates_floor(self) -> None: + with pytest.raises(Exception): + OperationTuning(annotation_chunk_size=10) + with pytest.raises(Exception): + OperationTuning(stream_chunk_size=10) + with pytest.raises(Exception): + OperationTuning(store_chunk_size=100) + with pytest.raises(Exception): + OperationTuning(numpy_query_chunk=0) + + +class TestOperationEnvOverrides: + def setup_method(self) -> None: + get_tuning.cache_clear() + + def teardown_method(self) -> None: + get_tuning.cache_clear() + + def test_env_override_chunk_size( + self, monkeypatch: pytest.MonkeyPatch, tmp_path: Path + ) -> None: + monkeypatch.setattr( + "protea.config.tuning._resolve_project_root", lambda: tmp_path + ) + monkeypatch.setenv("PROTEA_TUNING__OPERATION__ANNOTATION_CHUNK_SIZE", "50000") + get_tuning.cache_clear() + s = get_tuning() + assert s.operation.annotation_chunk_size == 50_000 From 15bc87e276e14419965128071b64e55c954f7c9d Mon Sep 17 00:00:00 2001 From: frapercan Date: Tue, 5 May 2026 17:46:16 +0200 Subject: [PATCH 55/73] feat(config): T-CONF.2 add APILimits category MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fourth tuning category. 
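For reference, the override path shared by every tuning category, as exercised by the tests added in this series; a minimal sketch, with the chunk-size value purely illustrative:

    # Nested env vars override YAML and defaults; get_tuning() caches the
    # resolved settings, so callers clear the cache before re-reading.
    import os

    from protea.config.tuning import get_tuning

    os.environ["PROTEA_TUNING__OPERATION__ANNOTATION_CHUNK_SIZE"] = "50000"
    get_tuning.cache_clear()
    assert get_tuning().operation.annotation_chunk_size == 50_000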
Migrates 4 hardcoded boundary limits from the FastAPI router layer. APILimits fields: - max_fasta_bytes (50 MB) duplicated as _MAX_FASTA_BYTES in api/routers/annotate.py and api/routers/query_sets.py; the externalisation dedupes by construction. - max_comment_length (500) api/routers/support.py - recent_limit (20) api/routers/support.py - page_limit (100) api/routers/support.py Behavioural: - annotate.py + query_sets.py: read max_fasta_bytes from get_tuning().api at request time. Error message now formats the configured limit instead of a literal "50 MB" so an operator-set override is reflected back to clients. - support.py: the SupportCreate pydantic Field's static max_length= moves to a field_validator that resolves max_comment_length dynamically. The /support GET reads page_limit and recent_limit from settings. 3 new tests in test_tuning.py: APILimits defaults, validation floors, env override of max_fasta_bytes. Suite: 1112 passed, 10 skipped (was 1109 + 3). Four of five categories migrated. Only ResearchKnobs (mostly config-exempt: PCA dim and N_THRESHOLDS sweep are research-side methodology constants documented in CONFIG_INVENTORY §E) left. Part of F0 T-CONF.2 of master plan v3. --- protea/api/routers/annotate.py | 18 +++++++++++---- protea/api/routers/query_sets.py | 11 ++++++--- protea/api/routers/support.py | 25 +++++++++++++------- protea/config/tuning.py | 34 ++++++++++++++++++++++++++++ tests/test_tuning.py | 39 ++++++++++++++++++++++++++++++++ 5 files changed, 111 insertions(+), 16 deletions(-) diff --git a/protea/api/routers/annotate.py b/protea/api/routers/annotate.py index d3e7050..ee22899 100644 --- a/protea/api/routers/annotate.py +++ b/protea/api/routers/annotate.py @@ -92,11 +92,16 @@ async def annotate( ``predict_go_terms`` once embeddings are ready. """ # ── Parse FASTA ────────────────────────────────────────────────── - _MAX_FASTA_BYTES = 50 * 1024 * 1024 # 50 MB + from protea.config.tuning import get_tuning + + max_bytes = get_tuning().api.max_fasta_bytes if file is not None: raw = await file.read() - if len(raw) > _MAX_FASTA_BYTES: - raise HTTPException(status_code=413, detail="FASTA file exceeds 50 MB limit") + if len(raw) > max_bytes: + raise HTTPException( + status_code=413, + detail=f"FASTA file exceeds {max_bytes // (1024 * 1024)} MB limit", + ) try: content = raw.decode("utf-8") except UnicodeDecodeError: @@ -104,8 +109,11 @@ async def annotate( status_code=422, detail="FASTA file must be UTF-8 encoded" ) from None elif fasta_text: - if len(fasta_text.encode("utf-8")) > _MAX_FASTA_BYTES: - raise HTTPException(status_code=413, detail="FASTA text exceeds 50 MB limit") + if len(fasta_text.encode("utf-8")) > max_bytes: + raise HTTPException( + status_code=413, + detail=f"FASTA text exceeds {max_bytes // (1024 * 1024)} MB limit", + ) content = fasta_text else: raise HTTPException(status_code=422, detail="Provide a FASTA file or fasta_text") diff --git a/protea/api/routers/query_sets.py b/protea/api/routers/query_sets.py index 4a140e6..ad05a3e 100644 --- a/protea/api/routers/query_sets.py +++ b/protea/api/routers/query_sets.py @@ -109,10 +109,15 @@ async def create_query_set( preserving the original FASTA accession. Duplicate accessions within the same upload are rejected with 422. 
""" - _MAX_FASTA_BYTES = 50 * 1024 * 1024 # 50 MB + from protea.config.tuning import get_tuning + + max_bytes = get_tuning().api.max_fasta_bytes raw = await file.read() - if len(raw) > _MAX_FASTA_BYTES: - raise HTTPException(status_code=413, detail="FASTA file exceeds 50 MB limit") + if len(raw) > max_bytes: + raise HTTPException( + status_code=413, + detail=f"FASTA file exceeds {max_bytes // (1024 * 1024)} MB limit", + ) try: content = raw.decode("utf-8") except UnicodeDecodeError: diff --git a/protea/api/routers/support.py b/protea/api/routers/support.py index 65228da..102bcc2 100644 --- a/protea/api/routers/support.py +++ b/protea/api/routers/support.py @@ -3,21 +3,28 @@ from typing import Any from fastapi import APIRouter, Depends, Query -from pydantic import BaseModel, Field +from pydantic import BaseModel, Field, field_validator from protea.api.deps import get_session_factory +from protea.config.tuning import get_tuning from protea.infrastructure.orm.models.support_entry import SupportEntry from protea.infrastructure.session import session_scope router = APIRouter(prefix="/support", tags=["support"]) -_MAX_COMMENT_LENGTH = 500 -_RECENT_LIMIT = 20 -_PAGE_LIMIT = 100 - class SupportCreate(BaseModel): - comment: str | None = Field(default=None, max_length=_MAX_COMMENT_LENGTH) + comment: str | None = Field(default=None) + + @field_validator("comment") + @classmethod + def comment_within_limit(cls, v: str | None) -> str | None: + if v is None: + return v + max_len = get_tuning().api.max_comment_length + if len(v) > max_len: + raise ValueError(f"comment exceeds max length {max_len}") + return v @router.get("") @@ -27,11 +34,13 @@ def get_support( ) -> dict[str, Any]: """Return total thumbs-up count and comments. - Pass ``all_comments=true`` to get all comments (up to 100) instead of the 20 most recent. + Pass ``all_comments=true`` to get all comments (up to the configured + page limit) instead of the recent_limit most recent. """ + api_limits = get_tuning().api with session_scope(factory) as session: total = session.query(SupportEntry).count() - limit = _PAGE_LIMIT if all_comments else _RECENT_LIMIT + limit = api_limits.page_limit if all_comments else api_limits.recent_limit recent = ( session.query(SupportEntry) .filter(SupportEntry.comment.isnot(None)) diff --git a/protea/config/tuning.py b/protea/config/tuning.py index 5ab10d9..6880d2d 100644 --- a/protea/config/tuning.py +++ b/protea/config/tuning.py @@ -196,12 +196,46 @@ class OperationTuning(BaseModel): ) +class APILimits(BaseModel): + """HTTP boundary limits enforced at the FastAPI router layer. + + Sources: ``api/routers/{annotate,query_sets,support}.py`` (ver + ``docs/CONFIG_INVENTORY.md`` §D). + """ + + max_fasta_bytes: int = Field( + default=50 * 1024 * 1024, + ge=1024, + description=( + "Tope upload FASTA en bytes. 50 MB cubre la mayoría de " + "submissions; subir si el caso de uso lo justifica. " + "Hardcodeado antes en dos routers; este campo dedupica." 
+ ), + ) + max_comment_length: int = Field( + default=500, + ge=1, + description="Caracteres máximos por comentario en /support.", + ) + recent_limit: int = Field( + default=20, + ge=1, + description="Items devueltos por defecto en /support/recent.", + ) + page_limit: int = Field( + default=100, + ge=1, + description="Page size hard cap para list endpoints de soporte.", + ) + + class TuningSettings(BaseModel): """Root tuning model that composes per-category sub-models.""" queue: QueueTuning = Field(default_factory=QueueTuning) worker: WorkerTuning = Field(default_factory=WorkerTuning) operation: OperationTuning = Field(default_factory=OperationTuning) + api: APILimits = Field(default_factory=APILimits) def _load_yaml_tuning(project_root: Path) -> dict[str, Any]: diff --git a/tests/test_tuning.py b/tests/test_tuning.py index 06a0280..cfb9ec6 100644 --- a/tests/test_tuning.py +++ b/tests/test_tuning.py @@ -7,6 +7,7 @@ import pytest from protea.config.tuning import ( + APILimits, OperationTuning, QueueTuning, TuningSettings, @@ -263,3 +264,41 @@ def test_env_override_chunk_size( get_tuning.cache_clear() s = get_tuning() assert s.operation.annotation_chunk_size == 50_000 + + +class TestAPILimitsDefaults: + def test_defaults(self) -> None: + a = APILimits() + assert a.max_fasta_bytes == 50 * 1024 * 1024 + assert a.max_comment_length == 500 + assert a.recent_limit == 20 + assert a.page_limit == 100 + + def test_validates_floor(self) -> None: + with pytest.raises(Exception): + APILimits(max_fasta_bytes=100) + with pytest.raises(Exception): + APILimits(max_comment_length=0) + with pytest.raises(Exception): + APILimits(recent_limit=0) + with pytest.raises(Exception): + APILimits(page_limit=0) + + +class TestAPILimitsEnvOverrides: + def setup_method(self) -> None: + get_tuning.cache_clear() + + def teardown_method(self) -> None: + get_tuning.cache_clear() + + def test_env_override_fasta_bytes( + self, monkeypatch: pytest.MonkeyPatch, tmp_path: Path + ) -> None: + monkeypatch.setattr( + "protea.config.tuning._resolve_project_root", lambda: tmp_path + ) + monkeypatch.setenv("PROTEA_TUNING__API__MAX_FASTA_BYTES", "104857600") # 100 MB + get_tuning.cache_clear() + s = get_tuning() + assert s.api.max_fasta_bytes == 104_857_600 From cb24813a4f276d69e2e9896cba46d91f618f3e7f Mon Sep 17 00:00:00 2001 From: frapercan Date: Tue, 5 May 2026 17:47:21 +0200 Subject: [PATCH 56/73] docs(config): T-CONF.3 add Tuning settings section to configuration.rst Documents the four migrated TuningSettings categories (Queue, Worker, Operation, APILimits) with field/default/purpose tables, YAML and env-override examples, and config-exempt category callouts (PCA dim, N_THRESHOLDS, GAF indices). Lives inside the existing appendix/configuration.rst so the reference is a single document. Part of F0 T-CONF.3 of master plan v3. --- docs/source/appendix/configuration.rst | 172 +++++++++++++++++++++++++ 1 file changed, 172 insertions(+) diff --git a/docs/source/appendix/configuration.rst b/docs/source/appendix/configuration.rst index 099bdc6..fde52b3 100644 --- a/docs/source/appendix/configuration.rst +++ b/docs/source/appendix/configuration.rst @@ -163,3 +163,175 @@ credentials ``guest`` / ``guest``). The seven PROTEA queues are: - ``store_predictions`` — bulk GOPrediction insert (ephemeral) Queues are declared at worker startup and survive broker restarts. + +Tuning settings +--------------- + +PROTEA exposes throughput, retry policy and boundary limits through +``protea.config.tuning.TuningSettings`` (pydantic). 
Values are
+resolved per call (defaults < ``tuning:`` section in
+``protea/config/system.yaml`` < env vars).
+
+Env var convention: ``PROTEA_TUNING__<CATEGORY>__<FIELD>``. Double
+underscore is the path separator (matches pydantic-settings'
+``env_nested_delimiter``) so it never collides with single
+underscores inside field names.
+
+Categories are derived from ``docs/CONFIG_INVENTORY.md`` (T-CONF.1
+of master plan v3) and migrated incrementally in T-CONF.2.
+
+QueueTuning
+~~~~~~~~~~~
+
+RabbitMQ publisher and consumer policy.
+
+.. list-table::
+   :widths: 30 12 58
+   :header-rows: 1
+
+   * - Field
+     - Default
+     - Purpose
+   * - ``publisher_max_attempts``
+     - 12
+     - Maximum retries when publishing to RabbitMQ. 12 attempts cover ~4 min of broker downtime with exponential backoff capped at 30s.
+   * - ``publisher_base_delay``
+     - 1.0
+     - Initial publisher backoff in seconds. Doubles on every attempt.
+   * - ``oom_max_retries``
+     - 5
+     - Retries when the GPU worker hits CUDA OOM.
+   * - ``oom_base_delay``
+     - 5
+     - Initial OOM backoff in seconds.
+   * - ``oom_max_delay``
+     - 300
+     - Cap on the OOM backoff in seconds (5 min).
+
+YAML excerpt::
+
+    tuning:
+      queue:
+        publisher_max_attempts: 12
+        oom_max_retries: 5
+
+Env override example::
+
+    PROTEA_TUNING__QUEUE__PUBLISHER_MAX_ATTEMPTS=20
+
+WorkerTuning
+~~~~~~~~~~~~
+
+Pool sizes, in-process caches, reaper timeouts, HTTP cache TTL.
+
+.. list-table::
+   :widths: 32 12 56
+   :header-rows: 1
+
+   * - Field
+     - Default
+     - Purpose
+   * - ``db_pool_size``
+     - 20
+     - SQLAlchemy connection pool size.
+   * - ``db_pool_max_overflow``
+     - 40
+     - Extra connections allowed during peaks.
+   * - ``db_pool_recycle_seconds``
+     - 3600
+     - Recycle connections after N seconds.
+   * - ``model_cache_max``
+     - 1
+     - PLM models cached per embeddings process.
+   * - ``ref_cache_max``
+     - 1
+     - Reference data sets cached per predict process.
+   * - ``reaper_main_timeout_seconds``
+     - 86400
+     - Hard timeout before marking jobs FAILED in production (24h).
+   * - ``reaper_default_timeout_seconds``
+     - 3600
+     - StaleJobReaper constructor default.
+   * - ``reaper_stall_seconds``
+     - 1800
+     - Time without a JobEvent before a job is considered stalled.
+   * - ``api_cache_default_ttl_seconds``
+     - 300.0
+     - Default TTL for the HTTP cache.
+
+OperationTuning
+~~~~~~~~~~~~~~~
+
+Module-level chunk and batch sizes used inside operations.
+
+.. list-table::
+   :widths: 28 12 60
+   :header-rows: 1
+
+   * - Field
+     - Default
+     - Purpose
+   * - ``annotation_chunk_size``
+     - 10_000
+     - Rows per chunk when loading/iterating annotations.
+   * - ``stream_chunk_size``
+     - 2_000
+     - Chunk size for PyArrow streaming / SQLAlchemy yield_per.
+   * - ``store_chunk_size``
+     - 10_000
+     - Rows per chunk when publishing predictions to the store queue.
+   * - ``numpy_query_chunk``
+     - 500
+     - Query chunk size for the numpy KNN backend (caps the distance-matrix memory).
+
+HTTP retry policy and per-source timeouts (UniProt, GOA, QuickGO,
+ontology) live inside the respective pydantic payloads
+(``InsertProteinsPayload``, ``LoadGoaAnnotationsPayload``, etc.) by
+design: callers pick them per-job rather than as global infra
+defaults.
+
+APILimits
+~~~~~~~~~
+
+HTTP boundary limits enforced at the FastAPI router layer.
+
+.. list-table::
+   :widths: 26 14 60
+   :header-rows: 1
+
+   * - Field
+     - Default
+     - Purpose
+   * - ``max_fasta_bytes``
+     - 52428800 (50 MB)
+     - Tope upload FASTA en bytes. Aplica a ``annotate`` y ``query_sets``.
+   * - ``max_comment_length``
+     - 500
+ * - ``recent_limit`` + - 20 + - Items devueltos por defecto en /support/recent. + * - ``page_limit`` + - 100 + - Page size hard cap para list endpoints de soporte. + +Config-exempt: research methodology constants +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The following constants are **deliberately not** in TuningSettings +because changing them would shift the canonical numbers reported +in the thesis and papers: + +- ``EMBEDDING_PCA_DIM = 16`` (``core/reranker.py``): part of the + feature schema contract that ``protea-contracts`` will own; it + gates compatibility with trained boosters. +- ``N_THRESHOLDS = 101`` (``core/metrics.py``): CAFA Fmax sweep + granularity. Changing it produces non-comparable Fmax numbers. + +Structural exempt +~~~~~~~~~~~~~~~~~ + +Format-spec positional indices live in code (e.g. GAF column indices +in ``core/operations/load_goa_annotations.py``). They are not +configurable because doing so would mean PROTEA stops reading the +GAF format. From f5e7d4407392df4a13153da1d47764d406bb91ec Mon Sep 17 00:00:00 2001 From: frapercan Date: Tue, 5 May 2026 18:01:22 +0200 Subject: [PATCH 57/73] chore(deps): T0.15 add C-stack repos as plugins poetry group Adds protea-contracts, protea-method, protea-sources, protea-runners and protea-backends as develop=true path-deps under [tool.poetry.group.plugins.dependencies]. Install with poetry install --with plugins. End-to-end discovery verified: importlib.metadata.entry_points(group='protea.sources|runners|backends') resolves correctly from inside PROTEA's venv: 3 sources (goa/quickgo/uniprot), 3 runners (baseline/knn/lightgbm), 4 backends (ankh/esm/esm3c/t5). Suite still 1112 passed, 10 skipped. Part of F0 T0.15 of master plan v3. --- poetry.lock | 131 ++++++++++++++++++++++++++++++++++++++++++------- pyproject.toml | 14 ++++++ 2 files changed, 127 insertions(+), 18 deletions(-) diff --git a/poetry.lock b/poetry.lock index e9dca0f..74124bb 100644 --- a/poetry.lock +++ b/poetry.lock @@ -69,7 +69,7 @@ version = "0.7.0" description = "Reusable constraint types to use with typing.Annotated" optional = false python-versions = ">=3.8" -groups = ["main", "dev"] +groups = ["main", "dev", "plugins"] files = [ {file = "annotated_types-0.7.0-py3-none-any.whl", hash = "sha256:1f02e8b43a8fbbc3f3e0d4f0f4bfc8131bcb4eebe8849b8e5c773f3a1c582a53"}, {file = "annotated_types-0.7.0.tar.gz", hash = "sha256:aff07c09a53a08bc8cfccb9c85b05f1aa9a2a6f23728d790723543408344ce89"}, @@ -496,7 +496,7 @@ version = "2026.2.25" description = "Python package for providing Mozilla's CA Bundle." optional = false python-versions = ">=3.7" -groups = ["main", "dev"] +groups = ["main", "dev", "plugins"] files = [ {file = "certifi-2026.2.25-py3-none-any.whl", hash = "sha256:027692e4402ad994f1c42e52a4997a9763c646b73e4096e4d5d6db8af1d6f0fa"}, {file = "certifi-2026.2.25.tar.gz", hash = "sha256:e887ab5cee78ea814d3472169153c2d12cd43b14bd03329a39a9c6e2e80bfba7"}, @@ -606,7 +606,7 @@ version = "3.4.5" description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet." 
optional = false python-versions = ">=3.7" -groups = ["main", "dev"] +groups = ["main", "dev", "plugins"] files = [ {file = "charset_normalizer-3.4.5-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:4167a621a9a1a986c73777dbc15d4b5eac8ac5c10393374109a343d4013ec765"}, {file = "charset_normalizer-3.4.5-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3f64c6bf8f32f9133b668c7f7a7cbdbc453412bc95ecdbd157f3b1e377a92990"}, @@ -1147,7 +1147,7 @@ version = "1.11.0.post1" description = "A library for efficient similarity search and clustering of dense vectors." optional = false python-versions = ">=3.9" -groups = ["main"] +groups = ["main", "plugins"] markers = "python_version >= \"3.14\"" files = [ {file = "faiss_cpu-1.11.0.post1-cp310-cp310-macosx_13_0_x86_64.whl", hash = "sha256:e079d44ea22919f6477fea553b05854c68838ab553e1c6b1237437a8becdf89d"}, @@ -1200,7 +1200,7 @@ version = "1.13.2" description = "A library for efficient similarity search and clustering of dense vectors." optional = false python-versions = "<3.15,>=3.10" -groups = ["main"] +groups = ["main", "plugins"] markers = "python_version < \"3.14\"" files = [ {file = "faiss_cpu-1.13.2-cp310-abi3-macosx_14_0_arm64.whl", hash = "sha256:a9064eb34f8f64438dd5b95c8f03a780b1a3f0b99c46eeacb1f0b5d15fc02dc1"}, @@ -1673,7 +1673,7 @@ version = "3.11" description = "Internationalized Domain Names in Applications (IDNA)" optional = false python-versions = ">=3.8" -groups = ["main", "dev"] +groups = ["main", "dev", "plugins"] files = [ {file = "idna-3.11-py3-none-any.whl", hash = "sha256:771a87f49d9defaf64091e6e6fe9c18d4833f140bd19464795bc32d966ca37ea"}, {file = "idna-3.11.tar.gz", hash = "sha256:795dafcc9c04ed0c1fb032c2aa73654d8e8c5023a7df64a53f39190ada629902"}, @@ -2051,7 +2051,7 @@ version = "4.6.0" description = "LightGBM Python-package" optional = false python-versions = ">=3.7" -groups = ["main", "dev"] +groups = ["main", "dev", "plugins"] files = [ {file = "lightgbm-4.6.0-py3-none-macosx_10_15_x86_64.whl", hash = "sha256:b7a393de8a334d5c8e490df91270f0763f83f959574d504c7ccb9eee4aef70ed"}, {file = "lightgbm-4.6.0-py3-none-macosx_12_0_arm64.whl", hash = "sha256:2dafd98d4e02b844ceb0b61450a660681076b1ea6c7adb8c566dfd66832aafad"}, @@ -2565,7 +2565,7 @@ version = "2.4.3" description = "Fundamental package for array computing in Python" optional = false python-versions = ">=3.11" -groups = ["main", "dev"] +groups = ["main", "dev", "plugins"] files = [ {file = "numpy-2.4.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:33b3bf58ee84b172c067f56aeadc7ee9ab6de69c5e800ab5b10295d54c581adb"}, {file = "numpy-2.4.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:8ba7b51e71c05aa1f9bc3641463cd82308eab40ce0d5c7e1fd4038cbf9938147"}, @@ -3090,7 +3090,7 @@ version = "26.0" description = "Core utilities for Python packages" optional = false python-versions = ">=3.8" -groups = ["main", "dev"] +groups = ["main", "dev", "plugins"] files = [ {file = "packaging-26.0-py3-none-any.whl", hash = "sha256:b36f1fef9334a5588b4166f8bcd26a14e521f2b55e6b9de3aaa80d3ff7a37529"}, {file = "packaging-26.0.tar.gz", hash = "sha256:00243ae351a257117b6a241061796684b084ed1c516a08c48a3f7e147a9d80b4"}, @@ -3441,6 +3441,63 @@ files = [ [package.dependencies] wcwidth = "*" +[[package]] +name = "protea-backends" +version = "0.0.1" +description = "Protein language model embedding backends for the PROTEA stack: ESM family, T5/ProtT5, Ankh, ESM-C." 
+optional = false +python-versions = ">=3.12,<4.0" +groups = ["plugins"] +files = [] +develop = true + +[package.dependencies] +numpy = ">=1.24" +protea-contracts = {path = "../protea-contracts", develop = true} + +[package.source] +type = "directory" +url = "../protea-backends" + +[[package]] +name = "protea-contracts" +version = "0.0.1" +description = "Shared ABCs, payload schemas, feature registry contract and compute_schema_sha helper for the PROTEA stack." +optional = false +python-versions = ">=3.12,<4.0" +groups = ["plugins"] +files = [] +develop = true + +[package.dependencies] +numpy = ">=1.24" +pyarrow = ">=14" +pydantic = ">=2.5" + +[package.source] +type = "directory" +url = "../protea-contracts" + +[[package]] +name = "protea-method" +version = "0.0.1" +description = "Pure inference: KNN search, feature compute, apply reranker. Standalone library; no FastAPI, no SQLAlchemy." +optional = false +python-versions = ">=3.12,<4.0" +groups = ["plugins"] +files = [] +develop = true + +[package.dependencies] +faiss-cpu = ">=1.7" +lightgbm = ">=4.0" +numpy = ">=1.24" +protea-contracts = {path = "../protea-contracts", develop = true} + +[package.source] +type = "directory" +url = "../protea-method" + [[package]] name = "protea-reranker-lab" version = "0.2.0" @@ -3465,6 +3522,44 @@ wandb = ">=0.16" type = "directory" url = "../protea-reranker-lab" +[[package]] +name = "protea-runners" +version = "0.0.1" +description = "Experiment runner plugins for the PROTEA stack: LightGBM training, KNN baseline, future GNN and retrieval-neural runners." +optional = false +python-versions = ">=3.12,<4.0" +groups = ["plugins"] +files = [] +develop = true + +[package.dependencies] +lightgbm = ">=4.0" +numpy = ">=1.24" +protea-contracts = {path = "../protea-contracts", develop = true} +pyarrow = ">=14" + +[package.source] +type = "directory" +url = "../protea-runners" + +[[package]] +name = "protea-sources" +version = "0.0.1" +description = "Annotation source plugins for the PROTEA stack: GOA, QuickGO, UniProt, future InterProScan." 
+optional = false +python-versions = ">=3.12,<4.0" +groups = ["plugins"] +files = [] +develop = true + +[package.dependencies] +protea-contracts = {path = "../protea-contracts", develop = true} +requests = ">=2.31" + +[package.source] +type = "directory" +url = "../protea-sources" + [[package]] name = "protobuf" version = "7.34.1" @@ -3639,7 +3734,7 @@ version = "23.0.1" description = "Python library for Apache Arrow" optional = false python-versions = ">=3.10" -groups = ["main", "dev"] +groups = ["main", "dev", "plugins"] files = [ {file = "pyarrow-23.0.1-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:3fab8f82571844eb3c460f90a75583801d14ca0cc32b1acc8c361650e006fd56"}, {file = "pyarrow-23.0.1-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:3f91c038b95f71ddfc865f11d5876c42f343b4495535bd262c7b321b0b94507c"}, @@ -3776,7 +3871,7 @@ version = "2.12.5" description = "Data validation using Python type hints" optional = false python-versions = ">=3.9" -groups = ["main", "dev"] +groups = ["main", "dev", "plugins"] files = [ {file = "pydantic-2.12.5-py3-none-any.whl", hash = "sha256:e561593fccf61e8a20fc46dfc2dfe075b8be7d0188df33f221ad1f0139180f9d"}, {file = "pydantic-2.12.5.tar.gz", hash = "sha256:4d351024c75c0f085a9febbb665ce8c0c6ec5d30e903bdb6394b7ede26aebb49"}, @@ -3798,7 +3893,7 @@ version = "2.41.5" description = "Core functionality for Pydantic validation and serialization" optional = false python-versions = ">=3.9" -groups = ["main", "dev"] +groups = ["main", "dev", "plugins"] files = [ {file = "pydantic_core-2.41.5-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:77b63866ca88d804225eaa4af3e664c5faf3568cea95360d21f4725ab6e07146"}, {file = "pydantic_core-2.41.5-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:dfa8a0c812ac681395907e71e1274819dec685fec28273a28905df579ef137e2"}, @@ -4265,7 +4360,7 @@ version = "2.32.5" description = "Python HTTP for Humans." 
optional = false python-versions = ">=3.9" -groups = ["main", "dev"] +groups = ["main", "dev", "plugins"] files = [ {file = "requests-2.32.5-py3-none-any.whl", hash = "sha256:2462f94637a34fd532264295e186976db0f5d453d1cdd31473c85a6a161affb6"}, {file = "requests-2.32.5.tar.gz", hash = "sha256:dbba0bac56e100853db0ea71b82b4dfd5fe2bf6d3754a8893c3af500cec7d7cf"}, @@ -4436,7 +4531,7 @@ version = "1.17.1" description = "Fundamental algorithms for scientific computing in Python" optional = false python-versions = ">=3.11" -groups = ["main", "dev"] +groups = ["main", "dev", "plugins"] files = [ {file = "scipy-1.17.1-cp311-cp311-macosx_10_14_x86_64.whl", hash = "sha256:1f95b894f13729334fb990162e911c9e5dc1ab390c58aa6cbecb389c5b5e28ec"}, {file = "scipy-1.17.1-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:e18f12c6b0bc5a592ed23d3f7b891f68fd7f8241d69b7883769eb5d5dfb52696"}, @@ -5689,7 +5784,7 @@ version = "4.15.0" description = "Backported and Experimental Type Hints for Python 3.9+" optional = false python-versions = ">=3.9" -groups = ["main", "dev"] +groups = ["main", "dev", "plugins"] files = [ {file = "typing_extensions-4.15.0-py3-none-any.whl", hash = "sha256:f0fa19c6845758ab08074a0cfa8b7aecb71c999ca73d62883bc25cc018c4e548"}, {file = "typing_extensions-4.15.0.tar.gz", hash = "sha256:0cea48d173cc12fa28ecabc3b837ea3cf6f38c6d1136f85cbaaf598984861466"}, @@ -5701,7 +5796,7 @@ version = "0.4.2" description = "Runtime typing introspection tools" optional = false python-versions = ">=3.9" -groups = ["main", "dev"] +groups = ["main", "dev", "plugins"] files = [ {file = "typing_inspection-0.4.2-py3-none-any.whl", hash = "sha256:4ed1cacbdc298c220f1bd249ed5287caa16f34d44ef4e9c3d0cbad5b521545e7"}, {file = "typing_inspection-0.4.2.tar.gz", hash = "sha256:ba561c48a67c5958007083d386c3295464928b01faa735ab8547c5692e87f464"}, @@ -5729,7 +5824,7 @@ version = "2.6.3" description = "HTTP library with thread-safe connection pooling, file post, and more." optional = false python-versions = ">=3.9" -groups = ["main", "dev"] +groups = ["main", "dev", "plugins"] files = [ {file = "urllib3-2.6.3-py3-none-any.whl", hash = "sha256:bf272323e553dfb2e87d9bfd225ca7b0f467b919d7bbd355436d3fd37cb0acd4"}, {file = "urllib3-2.6.3.tar.gz", hash = "sha256:1b62b6884944a57dbe321509ab94fd4d3b307075e0c2eae991ac71ee15ad38ed"}, @@ -5996,4 +6091,4 @@ storage = ["minio"] [metadata] lock-version = "2.1" python-versions = ">=3.12,<4.0" -content-hash = "e448bd2c78d67963819913f12afa9a2489d621e20673fb9fb97afe0de3849438" +content-hash = "9d6d78a4fadabf20d04a86de20085774d552e1b099c2ba704c567cbfacef676a" diff --git a/pyproject.toml b/pyproject.toml index f9b3ff5..21eb055 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,6 +32,20 @@ dependencies = [ storage = ["minio (>=7.2,<8.0)"] +[tool.poetry.group.plugins.dependencies] +# F0 T0.15 of master plan v3: protea-core references the C-stack repos +# as a poetry group so contributors can install with +# ``poetry install --with plugins`` once the bootstrap repos exist on +# disk. Real ABCs / payloads land in F1 (T1.1-T1.5); during F0 these +# packages export only their version sentinel and entry_point stubs, +# which is enough to validate the multi-repo layout. 
+protea-contracts = { path = "../protea-contracts", develop = true } +protea-method = { path = "../protea-method", develop = true } +protea-sources = { path = "../protea-sources", develop = true } +protea-runners = { path = "../protea-runners", develop = true } +protea-backends = { path = "../protea-backends", develop = true } + + [build-system] requires = ["poetry-core>=2.0.0,<3.0.0"] build-backend = "poetry.core.masonry.api" From 780d3cf87d23e505c35ffad64b989e5c42423822 Mon Sep 17 00:00:00 2001 From: frapercan Date: Tue, 5 May 2026 19:00:56 +0200 Subject: [PATCH 58/73] feat(ci): T0.4 add security workflow (pip-audit + bandit) Adds .github/workflows/security.yml with two jobs: - pip-audit: scans installed dependencies against the OSV database. Non-blocking in F0 (the existing surface has 22 known CVEs, all in third-party transitive deps; transformers 4.48.x dominates with 11 CVEs that need a coordinated bump). Master plan v3 F-OPS T-OPS.7 will flip this to fail on severity HIGH. - bandit: security static analysis against protea/. Runs in HIGH severity + HIGH confidence mode at F0 (zero findings now); will tighten in F-OPS. Triggers: push, PR, and a weekly cron (Mon 06:00 UTC) so freshly disclosed CVEs surface even when no PR has landed. Inline fixes for the two bandit B324 findings (weak MD5 hash): - protea/core/reranker.py: cache key tag in _load_artifact_to_disk. - protea/infrastructure/orm/models/sequence/sequence.py: sequence dedup key. Both pass usedforsecurity=False (Python 3.9+ flag) to declare intent; collision resistance is irrelevant in either context (cache key tag and dedup hash, not security primitives). Bandit config in pyproject.toml [tool.bandit]: excludes tests/ and the lab archeology dump script; skips B404/B603/B101 (subprocess imports + assert usage) which are project-level acceptable. Suite: 1112 passed, 10 skipped (unchanged). Part of F0 T0.4 of master plan v3. --- .github/workflows/security.yml | 70 +++++++++++++++++++ protea/core/reranker.py | 6 +- .../orm/models/sequence/sequence.py | 5 +- pyproject.toml | 13 ++++ 4 files changed, 91 insertions(+), 3 deletions(-) create mode 100644 .github/workflows/security.yml diff --git a/.github/workflows/security.yml b/.github/workflows/security.yml new file mode 100644 index 0000000..45a6db0 --- /dev/null +++ b/.github/workflows/security.yml @@ -0,0 +1,70 @@ +name: Security + +on: + push: + pull_request: + schedule: + # Weekly Monday 06:00 UTC: catches new CVEs against pinned deps + # even if no PR has landed. + - cron: "0 6 * * 1" + +jobs: + audit: + name: pip-audit + runs-on: ubuntu-22.04 + strategy: + fail-fast: false + matrix: + python-version: ["3.12"] + poetry-version: ["2.1.0"] + + steps: + - uses: actions/checkout@v6 + + - uses: actions/setup-python@v6 + with: + python-version: ${{ matrix.python-version }} + + - uses: abatilo/actions-poetry@v3 + with: + poetry-version: ${{ matrix.poetry-version }} + + - name: Add poetry to PATH + run: echo "$HOME/.local/bin" >> $GITHUB_PATH + + - name: Install production deps in the poetry venv + # poetry 2.x removed `poetry export`; install with the main group + # only and then audit the resolved environment in-place. + run: poetry install --only main + + - name: Install pip-audit + run: pip install pip-audit + + - name: pip-audit (non-blocking in F0; blocking once F-OPS T-OPS.7 lands) + # F0 stance: surface findings without breaking the pipeline so + # the team can triage. F-OPS T-OPS.7 of master plan v3 will + # flip this to fail on severity HIGH. 
+ run: poetry run pip-audit --strict --vulnerability-service osv || true + + bandit: + name: bandit (security static analysis) + runs-on: ubuntu-22.04 + strategy: + fail-fast: false + matrix: + python-version: ["3.12"] + + steps: + - uses: actions/checkout@v6 + + - uses: actions/setup-python@v6 + with: + python-version: ${{ matrix.python-version }} + + - name: Install bandit + run: pip install "bandit[toml]" + + - name: Run bandit on protea/ + # Severity HIGH and confidence HIGH only at F0; tighten in F-OPS. + # Bandit reads its config from pyproject.toml ([tool.bandit]). + run: bandit --severity-level high --confidence-level high -r protea/ -c pyproject.toml || true diff --git a/protea/core/reranker.py b/protea/core/reranker.py index 9a59693..57a787b 100644 --- a/protea/core/reranker.py +++ b/protea/core/reranker.py @@ -340,9 +340,11 @@ def load_reranker( cache_dir = cache_dir or _default_cache_dir() cache_dir.mkdir(parents=True, exist_ok=True) # Disambiguate the on-disk cache file by URI hash so two boosters with - # the same schema_sha don't overwrite each other's blobs. + # the same schema_sha don't overwrite each other's blobs. usedforsecurity=False + # because this is a cache key tag, not a security primitive (MD5 collision + # resistance is irrelevant here). import hashlib - uri_tag = hashlib.md5(artifact_uri.encode()).hexdigest()[:8] + uri_tag = hashlib.md5(artifact_uri.encode(), usedforsecurity=False).hexdigest()[:8] path = cache_dir / f"{feature_schema_sha}_{uri_tag}.txt" if not path.exists(): diff --git a/protea/infrastructure/orm/models/sequence/sequence.py b/protea/infrastructure/orm/models/sequence/sequence.py index 24f026c..9f8d547 100644 --- a/protea/infrastructure/orm/models/sequence/sequence.py +++ b/protea/infrastructure/orm/models/sequence/sequence.py @@ -41,7 +41,10 @@ class Sequence(Base): @staticmethod def compute_hash(seq: str) -> str: - return hashlib.md5(seq.encode("utf-8")).hexdigest() + # usedforsecurity=False: this hash is the dedup key for the + # protein sequence table, not a security primitive. MD5 collision + # resistance is irrelevant; we just need a stable 32-hex digest. + return hashlib.md5(seq.encode("utf-8"), usedforsecurity=False).hexdigest() def __init__(self, *args: object, **kwargs: object) -> None: super().__init__(*args, **kwargs) diff --git a/pyproject.toml b/pyproject.toml index 21eb055..77eac8d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -85,9 +85,22 @@ target-version = "py312" line-length = 100 [tool.ruff.lint] +# F0 ruleset: keeps the existing baseline (E/F/W/I/UP/B) and ignores +# enforced for project conventions. Master plan v3 F-OPS phase will +# extend this to the full ruff catalog (N, RUF, SIM, PT, etc.). select = ["E", "F", "W", "I", "UP", "B"] ignore = ["E501", "B008"] +[tool.bandit] +# Bandit security scan (run via .github/workflows/security.yml). +# Skips test files (assert statements + magic mocks would create noise) +# and the lab archeology that lives outside protea/. +exclude_dirs = ["tests", "scripts/dump_reranker_dataset.py"] +# B404: subprocess import (legitimate in scripts/manage.sh wrappers). +# B603: subprocess call without shell=True (we use it correctly). +# B101: assert_used (assertions in non-test code; project keeps a few). 
+skips = ["B404", "B603", "B101"] + [tool.ruff.lint.per-file-ignores] # Standalone runner scripts insert PROJECT_ROOT into sys.path before # importing the protea package, so module-level imports cannot live at From 16f1e58bf435f8116d071e43c4c1dbeb4738b873 Mon Sep 17 00:00:00 2001 From: frapercan Date: Tue, 5 May 2026 19:24:06 +0200 Subject: [PATCH 59/73] refactor(contracts): T1.5 PROTEA imports from protea-contracts Removes the duplicated definitions of feature schema, payload classes and ProteaPayload base from PROTEA. They now live exclusively in ``protea-contracts`` (v0.1.0). PROTEA modules re-export the names from their original module locations so existing imports keep working; new code should import from ``protea_contracts`` directly. Files touched: - protea/core/reranker.py - Drops 73 lines of NUMERIC_FEATURES / EMBEDDING_PCA_DIM / CATEGORICAL_FEATURES / ALL_FEATURES / LABEL_COLUMN definitions. - Re-exports the same names from protea_contracts. - fit_embedding_pca remains local (it's logic, not contract). - protea/core/contracts/operation.py - Drops the 11-line ProteaPayload class definition. - Re-exports it from protea_contracts. - Drops the now-unused ``BaseModel, ConfigDict`` import. - protea/core/operations/predict_go_terms.py - Drops 119 lines of PredictGOTermsPayload / PredictGOTermsBatchPayload / StorePredictionsPayload classes. - Re-exports them from protea_contracts. - Drops now-unused imports (Annotated, Field, field_validator) and the local PositiveInt alias. Net diff: -218 / +30 in PROTEA. Logic preserved exactly: every existing call site (15 files imported one of these names) keeps working through the re-exports. Suite: 1112 passed, 10 skipped (unchanged). The protea-contracts suite (71 passed, cov 95%) covers the moved definitions; PROTEA's existing tests cover the integration. Part of F1 T1.5 of master plan v3. --- protea/core/contracts/operation.py | 18 +-- protea/core/operations/predict_go_terms.py | 135 ++------------------- protea/core/reranker.py | 92 ++------------ 3 files changed, 27 insertions(+), 218 deletions(-) diff --git a/protea/core/contracts/operation.py b/protea/core/contracts/operation.py index a05245a..3304982 100644 --- a/protea/core/contracts/operation.py +++ b/protea/core/contracts/operation.py @@ -7,9 +7,13 @@ from typing import Any, Literal, Protocol from uuid import UUID -from pydantic import BaseModel, ConfigDict from sqlalchemy.orm import Session +# T1.5 of master plan v3: ProteaPayload is owned by protea-contracts. +# Re-export here so existing imports of ``ProteaPayload`` from this +# module keep working; new code should import from ``protea_contracts``. +from protea_contracts import ProteaPayload + Level = Literal["info", "warning", "error"] EmitFn = Callable[[str, str | None, dict[str, Any], Level], None] @@ -83,18 +87,6 @@ def __init__(self, reason: str, delay_seconds: int = 60) -> None: # noqa: B042 self.delay_seconds = delay_seconds -class ProteaPayload(BaseModel): - """Immutable, strictly-typed base class for all operation payloads. - - Subclass and declare fields using Pydantic annotations. Validation runs - automatically via ``model_validate(dict)`` — no manual parsing needed. - ``strict=True`` prevents silent type coercion (e.g. ``"yes"`` is not a - valid ``bool``). - """ - - model_config = ConfigDict(strict=True, frozen=True) - - class Operation(Protocol): """Protocol that every domain operation must satisfy. 
diff --git a/protea/core/operations/predict_go_terms.py b/protea/core/operations/predict_go_terms.py index d9944b9..ca98eb6 100644 --- a/protea/core/operations/predict_go_terms.py +++ b/protea/core/operations/predict_go_terms.py @@ -3,11 +3,10 @@ import time import uuid from pathlib import Path -from typing import Annotated, Any +from typing import Any from uuid import UUID import numpy as np -from pydantic import Field, field_validator from sqlalchemy import update as sa_update from sqlalchemy.dialects.postgresql import insert as pg_insert from sqlalchemy.orm import Session @@ -56,8 +55,6 @@ from protea.infrastructure.settings import load_settings from protea.infrastructure.storage import get_artifact_store -PositiveInt = Annotated[int, Field(gt=0)] - # Annotation and stream chunk sizes are configured via OperationTuning # (annotation_chunk_size, stream_chunk_size) and resolved at call time # inside the helpers below. At 1280 dims x 2 bytes (float16) x 2000 rows @@ -156,127 +153,15 @@ def _row_from_prediction( # --------------------------------------------------------------------------- # Payloads # --------------------------------------------------------------------------- - - -class PredictGOTermsPayload(ProteaPayload, frozen=True): - """Payload for the predict_go_terms coordinator job.""" - - embedding_config_id: str - annotation_set_id: str - ontology_snapshot_id: str - query_accessions: list[str] | None = None - query_set_id: str | None = None - limit_per_entry: PositiveInt = 5 - distance_threshold: float | None = None - batch_size: PositiveInt = 1024 - - # Search backend - search_backend: str = "numpy" - metric: str = "cosine" - faiss_index_type: str = "Flat" - faiss_nlist: int = 100 - faiss_nprobe: int = 10 - faiss_hnsw_m: int = 32 - faiss_hnsw_ef_search: int = 64 - - # Feature engineering — enabled by default so that every PredictionSet - # carries the full scoring/reranking feature set. Callers must opt *out* - # explicitly when they want a lean KNN-only run (e.g. for a quick smoke - # test or a backend where NW/SW alignment is prohibitive). - compute_alignments: bool = True - compute_taxonomy: bool = True - compute_reranker_features: bool = True - - # v6 reranker features (opt-in): 6 Anc2Vec + 3 tax_voters + 16 emb_pca. - # When enabled, the PCA state is fit once per ``EmbeddingConfig`` (or - # reused from ``artifacts/pca/{config_id}.npz``) and the 25 extra columns - # are persisted on every ``GOPrediction`` row. Required at predict time - # for any prediction_set that will be rerank-scored with a v6 model. - compute_v6_features: bool = False - - # Ancestor expansion of leaf GO predictions (opt-in). When enabled, - # every leaf candidate gets its is_a / part_of ancestor closure - # synthesised as additional records — required to match the candidate - # distribution the lab booster saw at training time. Without this the - # live PredictionSet has ~5-10× fewer candidates per (protein, aspect) - # than ``the dump helper``'s dump, and LK / PK fmax collapses - # because the booster's score distribution is calibrated against the - # richer expanded set. See ``feature_enricher.expand_predictions_to_ancestors``. - expand_votes_to_ancestors: bool = False - - # Per-aspect KNN indices (opt-in) - # When True, three separate KNN indices are built — one per GO aspect (P/F/C). - # Each index contains only reference proteins annotated in that aspect, and only - # annotations of that aspect are transferred from matched neighbors. 
- # This guarantees that every query protein receives BPO, MFO, and CCO candidates - # even if its nearest neighbors in a unified index happen to be annotated only in - # one or two aspects (a common cause of BPO recall ceilings). - # Memory cost: 3× the reference embedding array; search time: 3 KNN calls per batch. - aspect_separated_knn: bool = True - - # Optional reranker promoted from protea-reranker-lab. When set, the - # batch worker scores predictions with the referenced booster after - # validating ``feature_schema_sha`` against the live feature set — - # mismatch degrades to KNN-distance ordering (never crashes). - reranker_model_id: str | None = None - - @field_validator( - "embedding_config_id", "annotation_set_id", "ontology_snapshot_id", mode="before" - ) - @classmethod - def must_be_non_empty(cls, v: str) -> str: - if not isinstance(v, str) or not v.strip(): - raise ValueError("must be a non-empty string") - return v.strip() - - -class PredictGOTermsBatchPayload(ProteaPayload, frozen=True): - """Payload for one KNN batch dispatched by the coordinator.""" - - embedding_config_id: str - annotation_set_id: str - ontology_snapshot_id: str - prediction_set_id: str - parent_job_id: str - query_accessions: list[str] - query_set_id: str | None = None - limit_per_entry: PositiveInt = 5 - distance_threshold: float | None = None - search_backend: str = "numpy" - metric: str = "cosine" - faiss_index_type: str = "Flat" - faiss_nlist: int = 100 - faiss_nprobe: int = 10 - faiss_hnsw_m: int = 32 - faiss_hnsw_ef_search: int = 64 - # Feature engineering — kept in sync with PredictGOTermsPayload defaults. - compute_alignments: bool = True - compute_taxonomy: bool = True - compute_reranker_features: bool = True - compute_v6_features: bool = False - expand_votes_to_ancestors: bool = False - aspect_separated_knn: bool = True - - # Reranker context propagated from the coordinator. ``artifact_uri`` - # and ``feature_schema_sha`` are snapshotted at dispatch time so the - # worker does not have to re-query the RerankerModel row. - reranker_model_id: str | None = None - reranker_artifact_uri: str | None = None - reranker_feature_schema_sha: str | None = None - - -class StorePredictionsPayload(ProteaPayload, frozen=True): - """Payload carrying serialized prediction dicts to the write worker.""" - - parent_job_id: str - prediction_set_id: str - predictions: list[dict[str, Any]] - # When the upstream batch chunks its predictions across multiple write - # messages (because the full payload exceeds the RabbitMQ 128 MB limit), - # only the last chunk should advance the coordinator's batch counter — - # otherwise ``progress_current`` ticks once per chunk and the parent - # job marks itself succeeded long before all batches finish. - is_final_chunk: bool = True +# T1.5 of master plan v3: payloads now live in protea-contracts. +# Re-export here so existing imports of these classes from this module +# keep working; new code should import from ``protea_contracts``. 
+ +from protea_contracts import ( # noqa: E402 + PredictGOTermsBatchPayload, + PredictGOTermsPayload, + StorePredictionsPayload, +) # --------------------------------------------------------------------------- diff --git a/protea/core/reranker.py b/protea/core/reranker.py index 57a787b..a5ecfdd 100644 --- a/protea/core/reranker.py +++ b/protea/core/reranker.py @@ -30,85 +30,19 @@ from protea.infrastructure.storage import ArtifactStore, LocalFsArtifactStore -logger = logging.getLogger(__name__) - -# --------------------------------------------------------------------------- -# Feature definitions -# --------------------------------------------------------------------------- +# T1.5 of master plan v3: the feature schema is owned by protea-contracts. +# Re-export here so existing call sites that import from +# ``protea.core.reranker`` keep working; new code should import from +# ``protea_contracts`` directly. +from protea_contracts import ( + ALL_FEATURES, + CATEGORICAL_FEATURES, + EMBEDDING_PCA_DIM, + LABEL_COLUMN, + NUMERIC_FEATURES, +) -NUMERIC_FEATURES: list[str] = [ - "distance", - # NW alignment - "identity_nw", - "similarity_nw", - "alignment_score_nw", - "gaps_pct_nw", - "alignment_length_nw", - # SW alignment - "identity_sw", - "similarity_sw", - "alignment_score_sw", - "gaps_pct_sw", - "alignment_length_sw", - # Lengths - "length_query", - "length_ref", - # Taxonomy - "taxonomic_distance", - "taxonomic_common_ancestors", - # Re-ranker features - "vote_count", - "k_position", - "go_term_frequency", - "ref_annotation_density", - "neighbor_distance_std", - # Consensus features (per candidate term, computed over voting neighbors) - "neighbor_vote_fraction", - "neighbor_min_distance", - "neighbor_mean_distance", - # Anc2Vec semantic-coherence features (GO release 2020-10-06 pretrained) - "anc2vec_neighbor_cos", - "anc2vec_neighbor_maxcos", - "anc2vec_has_emb", - # Query-side Anc2Vec (PK-killer): candidate vs query's pre-cutoff annotations - "anc2vec_query_known_cos", - "anc2vec_query_known_maxcos", - "anc2vec_query_known_count", - # Taxonomic consensus across voting neighbors (requires compute_taxonomy=True) - "tax_voters_same_frac", - "tax_voters_close_frac", - "tax_voters_mean_common_ancestors", - # Sequence-embedding PCA — 16-dim query projection onto the top principal - # components of the reference embedding pool (use_embedding_pca flag). - # NaN when the flag is disabled: LightGBM treats them as missing. 
- "emb_pca_query_0", - "emb_pca_query_1", - "emb_pca_query_2", - "emb_pca_query_3", - "emb_pca_query_4", - "emb_pca_query_5", - "emb_pca_query_6", - "emb_pca_query_7", - "emb_pca_query_8", - "emb_pca_query_9", - "emb_pca_query_10", - "emb_pca_query_11", - "emb_pca_query_12", - "emb_pca_query_13", - "emb_pca_query_14", - "emb_pca_query_15", -] - -EMBEDDING_PCA_DIM = 16 - -CATEGORICAL_FEATURES: list[str] = [ - "qualifier", - "evidence_code", - "taxonomic_relation", - "aspect", -] - -ALL_FEATURES: list[str] = NUMERIC_FEATURES + CATEGORICAL_FEATURES +logger = logging.getLogger(__name__) def fit_embedding_pca( @@ -141,8 +75,6 @@ def fit_embedding_pca( components = vh[:k].astype(np.float32) return mean.astype(np.float32), components -LABEL_COLUMN = "label" - # --------------------------------------------------------------------------- # Data preparation From ce26517c95683717784b99df8824113f7a038150 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Francisco=20Miguel=20P=C3=A9rez=20Canales?= Date: Tue, 5 May 2026 19:36:22 +0200 Subject: [PATCH 60/73] test(contracts): T1.7 cross-repo invariant tests Pins the contract between protea_contracts (canonical) and PROTEA's re-exports / future registry. 14 tests in 4 classes: - TestReexportIdentity (7 tests, active): every constant PROTEA still re-exports must be the same object as protea_contracts (ALL_FEATURES, NUMERIC, CATEGORICAL, EMBEDDING_PCA_DIM, LABEL_COLUMN, ProteaPayload, the 3 predict payloads). Hard guarantee that 'from protea.core.reranker import ALL_FEATURES' will not silently diverge from 'from protea_contracts import ALL_FEATURES'. - TestShaConsistency (2 tests, active): compute_schema_sha produces the same digest regardless of caller path; pinned to the golden 145592ed186c so PROTEA CI fails before the booster cache invalidates. - TestFeatureFamilyCoverage (3 tests, active): every family member lives in ALL_FEATURES; emb_pca family size matches EMBEDDING_PCA_DIM; canonical naming. - TestRegistryCoversContracts (2 tests, skipped): activates automatically when F2B.1 ships protea/core/features/registry.py; asserts set(REGISTRY.names()) == set(ALL_FEATURES) and family map equality. Suite: 1124 passed, 12 skipped (was 1112 + 12 active + 2 dormant). Part of F1 T1.7 of master plan v3. --- tests/test_feature_contract.py | 181 +++++++++++++++++++++++++++++++++ 1 file changed, 181 insertions(+) create mode 100644 tests/test_feature_contract.py diff --git a/tests/test_feature_contract.py b/tests/test_feature_contract.py new file mode 100644 index 0000000..7569dac --- /dev/null +++ b/tests/test_feature_contract.py @@ -0,0 +1,181 @@ +"""Invariant tests cross-repo (T1.7 of master plan v3). + +Pins the contract between ``protea_contracts`` (canonical schema) +and PROTEA's re-exports / future registry. Any drift here means the +booster cache is at risk. + +Two tiers of assertions: + +1. **Re-export identity** (active now): every name PROTEA still + exposes from its old module locations must be the *same object* + (or equal value) as the canonical one in ``protea_contracts``. +2. **Registry cover** (skipped until F2B.1 of master plan v3 lands + the in-process registry in ``protea/core/features/``): the live + registry must cover exactly ``ALL_FEATURES`` and group them + under the same family map. + +Once F2B.1 ships, the skipped tests turn green automatically and +the suite gains a hard guarantee that the platform's feature +generation matches what every booster was trained against. 
+""" + +from __future__ import annotations + +import importlib + +import pytest + +import protea_contracts + + +# --------------------------------------------------------------------------- +# Re-export identity (active) +# --------------------------------------------------------------------------- + + +class TestReexportIdentity: + """Every constant PROTEA still re-exports must be the same object + or equal value as the canonical one in protea_contracts. Hard + guarantee that ``from protea.core.reranker import ALL_FEATURES`` + does NOT diverge from ``from protea_contracts import ALL_FEATURES``. + """ + + def test_all_features_identity(self) -> None: + from protea.core.reranker import ALL_FEATURES as proto_all + + assert proto_all is protea_contracts.ALL_FEATURES + + def test_numeric_features_identity(self) -> None: + from protea.core.reranker import NUMERIC_FEATURES as proto_num + + assert proto_num is protea_contracts.NUMERIC_FEATURES + + def test_categorical_features_identity(self) -> None: + from protea.core.reranker import CATEGORICAL_FEATURES as proto_cat + + assert proto_cat is protea_contracts.CATEGORICAL_FEATURES + + def test_embedding_pca_dim_value(self) -> None: + from protea.core.reranker import EMBEDDING_PCA_DIM as proto_dim + + assert proto_dim == protea_contracts.EMBEDDING_PCA_DIM + + def test_label_column_value(self) -> None: + from protea.core.reranker import LABEL_COLUMN as proto_label + + assert proto_label == protea_contracts.LABEL_COLUMN + + def test_protea_payload_identity(self) -> None: + from protea.core.contracts.operation import ProteaPayload as proto_base + + assert proto_base is protea_contracts.ProteaPayload + + def test_predict_payloads_identity(self) -> None: + from protea.core.operations.predict_go_terms import ( + PredictGOTermsBatchPayload, + PredictGOTermsPayload, + StorePredictionsPayload, + ) + + assert PredictGOTermsPayload is protea_contracts.PredictGOTermsPayload + assert PredictGOTermsBatchPayload is protea_contracts.PredictGOTermsBatchPayload + assert StorePredictionsPayload is protea_contracts.StorePredictionsPayload + + +# --------------------------------------------------------------------------- +# Sha consistency (active) +# --------------------------------------------------------------------------- + + +class TestShaConsistency: + """compute_schema_sha must produce the same digest from any caller + on the same column list. Pinning here catches accidental drift + if PROTEA ever stops re-exporting and grows its own copy.""" + + def test_all_features_sha_matches_canonical(self) -> None: + sha_via_contracts = protea_contracts.compute_schema_sha( + protea_contracts.ALL_FEATURES + ) + # Re-import through PROTEA path — must hit the same constant + # and the same function (re-exported). + from protea.core.reranker import ALL_FEATURES as proto_all + + sha_via_protea = protea_contracts.compute_schema_sha(proto_all) + assert sha_via_contracts == sha_via_protea + + def test_sha_is_pinned_to_golden(self) -> None: + # Mirrors the golden test inside protea-contracts: bumping any + # column forces a SemVer major bump on protea-contracts AND a + # re-train of every downstream LightGBM booster. Pinned here + # too so PROTEA's CI fails before the booster cache invalidates. 
+ sha = protea_contracts.compute_schema_sha(protea_contracts.ALL_FEATURES) + assert sha == "145592ed186c" + + +# --------------------------------------------------------------------------- +# Feature-family coverage invariants (active) +# --------------------------------------------------------------------------- + + +class TestFeatureFamilyCoverage: + def test_every_family_member_in_all_features(self) -> None: + all_set = set(protea_contracts.ALL_FEATURES) + offenders = [] + for family, cols in protea_contracts.FEATURE_FAMILIES.items(): + for col in cols: + if col not in all_set: + offenders.append((family, col)) + assert offenders == [], ( + "FEATURE_FAMILIES references columns missing from ALL_FEATURES: " + f"{offenders}" + ) + + def test_emb_pca_family_size_matches_dim(self) -> None: + emb_pca = protea_contracts.FEATURE_FAMILIES["emb_pca"] + assert len(emb_pca) == protea_contracts.EMBEDDING_PCA_DIM + + def test_emb_pca_family_names_are_canonical(self) -> None: + expected = [ + f"emb_pca_query_{i}" for i in range(protea_contracts.EMBEDDING_PCA_DIM) + ] + assert protea_contracts.FEATURE_FAMILIES["emb_pca"] == expected + + +# --------------------------------------------------------------------------- +# Future registry cover (skipped until F2B.1) +# --------------------------------------------------------------------------- + + +def _registry_module_available() -> bool: + try: + importlib.import_module("protea.core.features.registry") + return True + except ImportError: + return False + + +@pytest.mark.skipif( + not _registry_module_available(), + reason=( + "F2B.1 of master plan v3 lands the in-process feature registry " + "in protea/core/features/registry.py. Until then this contract " + "test is dormant; once the registry ships it activates " + "automatically and pins the registry-vs-contracts invariant." + ), +) +class TestRegistryCoversContracts: + def test_registry_names_match_all_features(self) -> None: + from protea.core.features.registry import REGISTRY # type: ignore[import-not-found] + + assert set(REGISTRY.names()) == set(protea_contracts.ALL_FEATURES) + + def test_registry_families_match_contracts(self) -> None: + from protea.core.features.registry import REGISTRY # type: ignore[import-not-found] + + # Order inside each family list does not matter at the registry + # level; the dataset side enforces order via ALL_FEATURES. + registry_fams = {k: sorted(v) for k, v in REGISTRY.families().items()} + contract_fams = { + k: sorted(v) for k, v in protea_contracts.FEATURE_FAMILIES.items() + } + assert registry_fams == contract_fams From 50191007d93cf903d497bf1a77d696f55ff23e30 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Francisco=20Miguel=20P=C3=A9rez=20Canales?= Date: Wed, 6 May 2026 00:06:34 +0200 Subject: [PATCH 61/73] feat(boundary): T1.8 column invariant on export + canonical sha on inference Two boundary validations against the canonical protea_contracts schema: Export side (parquet_export.py): before writing train/eval parquets, compute compute_schema_sha([c for c in shard.columns if c in ALL_FEATURES]) and compare to compute_schema_sha(ALL_FEATURES). Mismatch raises ValueError with the missing/extras list, instead of silently shipping a partial dump that LightGBM training would choke on. Pure invariant check; the legacy schema_sha hash in the manifest is unchanged (T1.6 of master plan v3 owns the migration to schema_sha_v2). 
Inference side (predict_go_terms._apply_reranker_if_aligned): switches the import of compute_feature_schema_sha from protea_reranker_lab.contracts to protea_contracts. Functions are byte-identical so behaviour is preserved; the canonical source is now protea_contracts (single source of truth). 5 new tests in test_parquet_export_boundary.py: full columns pass, missing column in train raises, missing column in eval raises, typo feature name raises, empty eval shard skipped. Suite: 1129 passed, 12 skipped (was 1124 + 5). Part of F1 T1.8 of master plan v3. --- protea/core/operations/predict_go_terms.py | 4 +- protea/core/parquet_export.py | 28 ++++++ tests/test_parquet_export_boundary.py | 107 +++++++++++++++++++++ 3 files changed, 138 insertions(+), 1 deletion(-) create mode 100644 tests/test_parquet_export_boundary.py diff --git a/protea/core/operations/predict_go_terms.py b/protea/core/operations/predict_go_terms.py index ca98eb6..54ec1cd 100644 --- a/protea/core/operations/predict_go_terms.py +++ b/protea/core/operations/predict_go_terms.py @@ -826,7 +826,9 @@ def _apply_reranker_if_aligned( return None try: - from protea_reranker_lab.contracts import compute_feature_schema_sha + # T1.8 boundary validation: live sha computed via the canonical + # protea_contracts implementation (single source of truth). + from protea_contracts import compute_feature_schema_sha except Exception as exc: emit( "reranker.skipped", diff --git a/protea/core/parquet_export.py b/protea/core/parquet_export.py index 7301532..fa046ae 100644 --- a/protea/core/parquet_export.py +++ b/protea/core/parquet_export.py @@ -29,6 +29,7 @@ from protea.core.reranker import ALL_FEATURES, LABEL_COLUMN from protea.infrastructure.storage import ArtifactStore +from protea_contracts import compute_schema_sha as _canonical_schema_sha logger = logging.getLogger(__name__) @@ -181,6 +182,30 @@ def export_reranker_parquets( train_df = _reorder(train_df, reserved) eval_df = _reorder(eval_df, reserved) + # T1.8 boundary validation: before writing, the actual feature columns + # of every non-empty shard must equal ALL_FEATURES exactly. The + # canonical compute_schema_sha (lab format) is used on both sides; if + # the shard is missing or carries unknown feature columns the sha + # differs and we raise instead of silently shipping a partial dump. + canonical_features_sha = _canonical_schema_sha(list(ALL_FEATURES)) + for shard_name, shard in (("train", train_df), ("eval", eval_df)): + if shard.empty: + continue + present_features = [c for c in shard.columns if c in ALL_FEATURES] + present_sha = _canonical_schema_sha(present_features) + if present_sha != canonical_features_sha: + missing = [c for c in ALL_FEATURES if c not in shard.columns] + extras = [ + c + for c in shard.columns + if c not in ALL_FEATURES and c not in reserved + ] + raise ValueError( + f"{shard_name} shard fails the canonical column invariant. " + f"missing={missing!r} extras={extras!r}. " + "All ALL_FEATURES columns must be present before write." + ) + train_path = stage_dir / "train.parquet" eval_path = stage_dir / "eval.parquet" manifest_path = stage_dir / "manifest.json" @@ -189,6 +214,9 @@ def export_reranker_parquets( if not eval_df.empty: eval_df.to_parquet(eval_path, index=False, compression="snappy") + # Legacy schema_sha hash kept in the manifest until T1.6 of master + # plan v3 lands the schema_sha_v2 migration. The T1.8 invariant + # above already guarantees the column set is correct. 
schema_sha = hashlib.sha256( json.dumps(list(ALL_FEATURES), sort_keys=True).encode() ).hexdigest()[:12] diff --git a/tests/test_parquet_export_boundary.py b/tests/test_parquet_export_boundary.py new file mode 100644 index 0000000..30e35f2 --- /dev/null +++ b/tests/test_parquet_export_boundary.py @@ -0,0 +1,107 @@ +"""T1.8 boundary validation tests for parquet_export. + +Pins the boundary invariant: shards written to disk must contain +exactly the canonical ``ALL_FEATURES`` columns (plus reserved +columns). Missing or unknown feature columns must raise instead of +silently shipping a partial dump that future LightGBM training would +choke on. +""" + +from __future__ import annotations + +from pathlib import Path + +import pandas as pd +import pytest + +from protea_contracts import ALL_FEATURES + + +def _full_feature_row() -> dict[str, object]: + """Build a single shard row with reserved cols + every ALL_FEATURES col. + + Per-shard reserved columns differ from the merged-dump reserved set: + shards carry ``go_id`` and ``aspect``; ``category``, ``snapshot_pair`` + and the rename ``go_id->go_term_id`` are added by ``export_reranker_parquets``. + """ + row: dict[str, object] = { + "protein_accession": "P12345", + "go_id": "GO:0000001", + "label": 0, + "aspect": "P", + } + for col in ALL_FEATURES: + if col in row: + continue + if col in {"qualifier", "evidence_code", "taxonomic_relation"}: + row[col] = "x" + else: + row[col] = 0.0 + return row + + +def _write_shard(path: Path, rows: list[dict[str, object]]) -> Path: + df = pd.DataFrame(rows) if rows else pd.DataFrame() + df.to_parquet(path, index=False, compression="snappy") + return path + + +def _call_export( + stage_dir: Path, + train_rows: list[dict[str, object]], + eval_rows: list[dict[str, object]], +) -> dict[str, object]: + from protea.core.parquet_export import export_reranker_parquets + + train_shard = _write_shard(stage_dir / "_train_nk.parquet", train_rows) + eval_shard = _write_shard(stage_dir / "_eval_nk.parquet", eval_rows) + return export_reranker_parquets( + stage_dir=stage_dir, + split_files={"nk": [train_shard]}, + valid_split_versions=[(220, 221)], + test_files={"nk": eval_shard}, + test_old_v=221, + test_new_v=222, + name="t18-test", + k=5, + embedding_config_id="00000000-0000-0000-0000-000000000001", + ontology_snapshot_id="00000000-0000-0000-0000-000000000002", + annotation_source="goa", + store=None, + producer_version="t18", + producer_git_sha=None, + validate_with_contracts=False, + ) + + +class TestExportBoundaryInvariant: + def test_full_columns_passes(self, tmp_path: Path) -> None: + result = _call_export(tmp_path, [_full_feature_row()], [_full_feature_row()]) + assert (tmp_path / "manifest.json").exists() + assert "schema_sha" in result + + def test_missing_feature_column_in_train_raises(self, tmp_path: Path) -> None: + row = _full_feature_row() + del row["distance"] # drop one canonical feature + with pytest.raises(ValueError, match="canonical column invariant"): + _call_export(tmp_path, [row], [_full_feature_row()]) + + def test_missing_feature_column_in_eval_raises(self, tmp_path: Path) -> None: + row = _full_feature_row() + del row["k_position"] + with pytest.raises(ValueError, match="canonical column invariant"): + _call_export(tmp_path, [_full_feature_row()], [row]) + + def test_typo_feature_name_raises(self, tmp_path: Path) -> None: + row = _full_feature_row() + del row["distance"] + row["distnace"] = 0.0 # typo: not in ALL_FEATURES + with pytest.raises(ValueError, match="canonical column invariant"): + 
_call_export(tmp_path, [row], [_full_feature_row()]) + + def test_empty_eval_does_not_trigger(self, tmp_path: Path) -> None: + # Empty eval shard skipped by the writer; the invariant only + # gates non-empty data. + result = _call_export(tmp_path, [_full_feature_row()], []) + assert (tmp_path / "manifest.json").exists() + assert int(result["n_eval_rows"]) == 0 From 6e83561d2f50374e75d21b2c1dfae3d525d894a0 Mon Sep 17 00:00:00 2001 From: frapercan Date: Wed, 6 May 2026 00:51:38 +0200 Subject: [PATCH 62/73] refactor(embeddings): F2A.5 entry_points dispatch in _load_model MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replaces the hardcoded if/elif chain in compute_embeddings._load_model with discovery via the protea.backends entry_points group. The four backend plugins (esm, t5, ankh, esm3c) shipped by protea-backends are now resolved dynamically; adding a new backend is a pyproject entry plus a class — no edits to compute_embeddings required. Scope of this refactor: - Module-level _load_model now calls _resolve_backend(model_backend) + plugin.load_model(model_name, device, emit). The (model, tokenizer) return shape stays exactly the same (tokenizer is None for ESM-C, matching the legacy path). - The legacy "auto" alias maps to "esm" exactly as before. - Plugin discovery is cached in module-level _BACKEND_PLUGINS and populated on first call (lazy: avoids running entry_points scan at import time). - Plugin name attribute is asserted to match its entry_point name on first load. Silent drift would yield confusing "unknown backend" errors; we'd rather fail loud. Out of scope (deferred to F2C): - _embed_batch dispatch keeps the legacy if/elif chain calling _embed_esm / _embed_t5 / _embed_ankh / _embed_esm3c. The plugin's embed_batch returns a flat (batch_size, hidden_dim) ndarray, while the legacy _embed_* return list[list[ ChunkEmbedding]] with full chunk + layer + pooling support. The contract extension is a separate task; this commit only swaps the load path where the API signatures already line up. - Cov gate bump in protea-backends CI: deferred until an integration runner installs an extra and exercises the plugin's load_model. Bumping the gate to 25% on the strength of unit tests alone would just be theatre. Tests: - tests/test_compute_embeddings_backend_dispatch.py: 7 new tests covering plugin discovery, entry_point/name parity, "auto" alias, unknown-backend error path, _load_model emit/delegate behaviour, cache identity, and re-import semantics. Suite: PROTEA 1136 passed, 12 skipped (was 1129 / 12; +7 new). Plugin discovery confirmed working from the PROTEA venv: >>> from importlib.metadata import entry_points >>> {ep.name for ep in entry_points(group="protea.backends")} {'ankh', 'esm', 'esm3c', 't5'} Pairs with the protea-backends 011b27d commit declaring per-backend optional dependency extras. Part of F2A.5 of master plan v3. 
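Illustrative only (not a shipped plugin): given the contract this commit relies on, i.e. a name attribute equal to the entry_point name plus load_model(model_name, device, emit) returning (model, tokenizer), a hypothetical third-party backend is just a class and an entry point. Module, class and entry names below are placeholders, the PEP 621 entry-point syntax is an assumption about the plugin package's own packaging, and the sketch assumes the entry point resolves to an instance:

    # my_backend.py (hypothetical third-party package)
    class MyBackend:
        name = "mybackend"  # must equal the entry_point name; checked at discovery

        def load_model(self, model_name: str, device: str, emit) -> tuple[object, object | None]:
            # lazy-import heavy deps (torch / transformers) here and return
            # (model, tokenizer); tokenizer may be None for SDKs that take
            # raw sequence strings, mirroring the ESM-C path.
            ...

    PLUGIN = MyBackend()

    # pyproject.toml of the plugin package
    [project.entry-points."protea.backends"]
    mybackend = "my_backend:PLUGIN"

With that installed, an EmbeddingConfig whose model_backend is "mybackend" resolves through _resolve_backend without any edit to compute_embeddings.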
--- protea/core/operations/compute_embeddings.py | 130 +++++++++--------- ...est_compute_embeddings_backend_dispatch.py | 99 +++++++++++++ 2 files changed, 164 insertions(+), 65 deletions(-) create mode 100644 tests/test_compute_embeddings_backend_dispatch.py diff --git a/protea/core/operations/compute_embeddings.py b/protea/core/operations/compute_embeddings.py index 9321a80..768ee5a 100644 --- a/protea/core/operations/compute_embeddings.py +++ b/protea/core/operations/compute_embeddings.py @@ -630,78 +630,78 @@ def _get_or_load_model(config: EmbeddingConfig, device: str, emit: EmitFn) -> tu return _MODEL_CACHE[key] -def _load_model(config: EmbeddingConfig, device: str, emit: EmitFn) -> tuple[Any, Any]: - import torch +# Cached map of backend plugins resolved from the ``protea.backends`` +# entry_points group. Lazy: populated on first call to ``_load_model``. +# ``None`` means "not yet discovered"; an empty dict means "no backends +# installed" (which is a hard error at load time, not a registry warning). +_BACKEND_PLUGINS: dict[str, Any] | None = None + + +def _get_backend_plugins() -> dict[str, Any]: + """Discover and cache backend plugins via ``entry_points``. + Returns a dict keyed by ``plugin.name``. Each plugin must implement + :class:`protea_contracts.EmbeddingBackend`. Discovery is performed + once per process; subsequent calls return the cached map. + + A plugin whose ``name`` attribute disagrees with its entry_point + name is a hard error — the entry_points file and the class + declaration must agree, and silently letting them drift would make + "Unknown model_backend" errors confusing. + """ + global _BACKEND_PLUGINS + if _BACKEND_PLUGINS is None: + from importlib.metadata import entry_points + + cache: dict[str, Any] = {} + for ep in entry_points(group="protea.backends"): + plugin = ep.load() + if getattr(plugin, "name", None) != ep.name: + raise RuntimeError( + f"Backend plugin name mismatch: entry_point {ep.name!r} " + f"resolves to plugin with name " + f"{getattr(plugin, 'name', None)!r}" + ) + cache[ep.name] = plugin + _BACKEND_PLUGINS = cache + return _BACKEND_PLUGINS + + +def _resolve_backend(backend_name: str) -> Any: + """Resolve a ``model_backend`` identifier to a plugin instance. + + The ``"auto"`` legacy alias maps to ``"esm"``. Unknown identifiers + raise ``ValueError`` listing the discovered backends so the failure + message is actionable. + """ + plugins = _get_backend_plugins() + key = "esm" if backend_name == "auto" else backend_name + if key not in plugins: + raise ValueError( + f"Unknown model_backend: {backend_name!r}. " + f"Discovered: {sorted(plugins)}" + ) + return plugins[key] + + +def _load_model(config: EmbeddingConfig, device: str, emit: EmitFn) -> tuple[Any, Any]: + """Load ``(model, tokenizer)`` via the ``protea.backends`` plugin + matching ``config.model_backend``. + + Each plugin owns its own torch / transformers / esm imports (lazy + inside ``plugin.load_model``) and the device + dtype dance. The + return shape ``(model, tokenizer)`` matches the legacy hardcoded + dispatch exactly; for ESM-C the tokenizer slot is ``None`` because + the standalone ``esm`` SDK takes raw sequence strings. 
+ """ emit( "compute_embeddings.model_load_start", None, {"model_name": config.model_name, "backend": config.model_backend}, "info", ) - - if config.model_backend == "esm3c": - from esm.models.esmc import ESMC - - device_obj = torch.device(device) - dtype = torch.float16 if device_obj.type == "cuda" else torch.float32 - model = ESMC.from_pretrained(config.model_name) - model = model.to(device) - model = model.to(dtype) - model.eval() - tokenizer = None - - elif config.model_backend in ("esm", "auto"): - from transformers import AutoTokenizer, EsmModel - - device_obj = torch.device(device) - dtype = torch.float16 if device_obj.type == "cuda" else torch.float32 - tokenizer = AutoTokenizer.from_pretrained(config.model_name) - model = EsmModel.from_pretrained(config.model_name, output_hidden_states=True) - model.eval() - model.to(device) - model.to(dtype) - - elif config.model_backend == "t5": - from transformers import T5EncoderModel, T5Tokenizer - - device_obj = torch.device(device) - dtype = torch.float16 if device_obj.type == "cuda" else torch.float32 - tokenizer = T5Tokenizer.from_pretrained(config.model_name, do_lower_case=False) - model = T5EncoderModel.from_pretrained( - config.model_name, - output_hidden_states=True, - torch_dtype=dtype, - ) - model.eval() - model.to(device) - - elif config.model_backend == "ankh": - # Ankh ships a SentencePiece tokenizer; AutoTokenizer resolves to the - # correct class (T5TokenizerFast) without hard-coding T5Tokenizer, - # which has caused loading issues with some Ankh revisions. - # - # Precision: Ankh was pre-trained on TPU in bfloat16 and its - # T5-encoder LayerNorm overflows in FP16 — every forward collapses - # to NaN. Verified on ElnaggarLab/ankh-base 2026-04-10. Use - # bfloat16 on CUDA (same VRAM footprint as FP16 but with FP32 - # dynamic range) and FP32 on CPU. - from transformers import AutoTokenizer, T5EncoderModel - - device_obj = torch.device(device) - dtype = torch.bfloat16 if device_obj.type == "cuda" else torch.float32 - tokenizer = AutoTokenizer.from_pretrained(config.model_name) - model = T5EncoderModel.from_pretrained( - config.model_name, - output_hidden_states=True, - torch_dtype=dtype, - ) - model.eval() - model.to(device) - - else: - raise ValueError(f"Unknown model_backend: {config.model_backend!r}") - + plugin = _resolve_backend(config.model_backend) + model, tokenizer = plugin.load_model(config.model_name, device, emit) emit("compute_embeddings.model_load_done", None, {}, "info") return model, tokenizer diff --git a/tests/test_compute_embeddings_backend_dispatch.py b/tests/test_compute_embeddings_backend_dispatch.py new file mode 100644 index 0000000..04fb1e8 --- /dev/null +++ b/tests/test_compute_embeddings_backend_dispatch.py @@ -0,0 +1,99 @@ +"""Regression tests for the F2A.5 plugin-based backend dispatch in +``compute_embeddings._load_model``. + +The legacy hardcoded ``if/elif config.model_backend == ...`` chain was +replaced with discovery via the ``protea.backends`` entry_points group. +These tests pin the contract: + +* All four bootstrap backends (esm, t5, ankh, esm3c) are discoverable + from the PROTEA venv when ``protea-backends`` is installed. +* The legacy ``"auto"`` alias still maps to the ``esm`` plugin. +* Unknown backends raise ``ValueError`` (not silent fall-through). +* ``_load_model`` delegates to ``plugin.load_model`` and emits the + expected start/done events. 
+ +Heavy ML deps (torch / transformers / esm) are NOT required: the tests +mock the plugin's ``load_model`` so the lazy-import path inside the +plugin never fires. +""" + +from __future__ import annotations + +import importlib +from unittest.mock import MagicMock + +import pytest + +from protea.core.operations import compute_embeddings as ce_module + + +def _reset_plugin_cache() -> None: + """Force the next call to repopulate ``_BACKEND_PLUGINS`` from + entry_points so individual tests don't bleed cached state.""" + ce_module._BACKEND_PLUGINS = None + + +def test_bootstrap_backends_discoverable_via_entry_points() -> None: + _reset_plugin_cache() + plugins = ce_module._get_backend_plugins() + assert set(plugins) >= {"esm", "t5", "ankh", "esm3c"} + + +def test_plugin_name_attribute_matches_entry_point_name() -> None: + _reset_plugin_cache() + plugins = ce_module._get_backend_plugins() + for ep_name, plugin in plugins.items(): + assert plugin.name == ep_name + + +def test_resolve_auto_maps_to_esm_plugin() -> None: + _reset_plugin_cache() + plugins = ce_module._get_backend_plugins() + assert ce_module._resolve_backend("auto") is plugins["esm"] + + +def test_resolve_unknown_backend_raises_value_error() -> None: + _reset_plugin_cache() + with pytest.raises(ValueError, match="Unknown model_backend"): + ce_module._resolve_backend("xgboost-on-proteins") + + +def test_load_model_delegates_to_resolved_plugin() -> None: + _reset_plugin_cache() + fake_plugin = MagicMock() + fake_plugin.name = "esm" + fake_plugin.load_model.return_value = ("fake_model", "fake_tokenizer") + + ce_module._BACKEND_PLUGINS = {"esm": fake_plugin} + + config = MagicMock() + config.model_backend = "esm" + config.model_name = "facebook/esm2_t6_8M_UR50D" + emit_calls: list[tuple[str, object, dict[str, object] | None, str]] = [] + + def emit(event: str, payload: object, fields: dict[str, object] | None, level: str) -> None: + emit_calls.append((event, payload, fields, level)) + + model, tokenizer = ce_module._load_model(config, "cpu", emit) + assert model == "fake_model" + assert tokenizer == "fake_tokenizer" + fake_plugin.load_model.assert_called_once_with( + "facebook/esm2_t6_8M_UR50D", "cpu", emit + ) + event_names = {call[0] for call in emit_calls} + assert "compute_embeddings.model_load_start" in event_names + assert "compute_embeddings.model_load_done" in event_names + + +def test_plugin_cache_persists_across_calls() -> None: + _reset_plugin_cache() + first = ce_module._get_backend_plugins() + second = ce_module._get_backend_plugins() + assert first is second # cached identity, not just equality + + +def test_module_re_import_redoes_discovery() -> None: + importlib.reload(ce_module) + assert ce_module._BACKEND_PLUGINS is None + plugins = ce_module._get_backend_plugins() + assert "esm" in plugins From 7db0e0d792048e13c9c27344f1d3a4ef1259633b Mon Sep 17 00:00:00 2001 From: frapercan Date: Wed, 6 May 2026 01:38:20 +0200 Subject: [PATCH 63/73] docs(adr): add 30 strategic ADR stubs (D1-D30) covering master plan v3 Map every strategic decision in master plan v3 (2026-05-05) to a navigable ADR stub under docs/source/adr/. Uniform format per file: status (Accepted, Pending, Deferred or Obsolete), date, phase introduced, gate (if pending), context (2-3 sentences), decision (1-2 sentences), consequences (1-2 bullets) and resolution. Index reorganised into two layers: - Implementation decisions (001-008): runtime, ORM, queue topology and similar choices that surfaced while building. 
- Strategic decisions (D1-D30): plan-level decisions from the master plan. Each row in the strategic table carries a status badge so the open work is visible at a glance. Eight gates pending human action are explicitly listed: D4 API versioning (gate F4), D6 authentication (gate F5), D7 observability stack (gate F-OPS), D10 schema_sha v2 migration (T1.6 gate D10), D25 HPC mode (gate F-OPS), D27 image registry (gate F-OPS), D28 secrets management (gate F-OPS), D29 release pipeline (gate F-OPS). Sphinx build clean: build succeeded, 4 pre-existing warnings (none from the new files). --- docs/source/adr/D01-project-structure.rst | 38 ++++ .../D02-export-research-dataset-location.rst | 30 +++ .../adr/D03-goprediction-features-jsonb.rst | 31 +++ docs/source/adr/D04-api-versioning.rst | 30 +++ docs/source/adr/D05-frontend-in-core.rst | 28 +++ docs/source/adr/D06-authentication.rst | 34 +++ docs/source/adr/D07-observability-stack.rst | 35 ++++ docs/source/adr/D08-ui-components.rst | 28 +++ .../adr/D09-obsolete-lab-runtime-dep.rst | 31 +++ docs/source/adr/D10-schema-sha-v2.rst | 35 ++++ docs/source/adr/D11-job-narrative-model.rst | 36 ++++ docs/source/adr/D12-fexp-qa-reproduction.rst | 32 +++ docs/source/adr/D13-early-ui-track.rst | 33 +++ docs/source/adr/D14-plugin-granularity.rst | 30 +++ .../source/adr/D15-protea-method-shipping.rst | 35 ++++ docs/source/adr/D16-thesis-location.rst | 29 +++ .../adr/D17-obsolete-thesis-template.rst | 23 ++ docs/source/adr/D18-thesis-writing-model.rst | 30 +++ docs/source/adr/D19-fresearch-targets.rst | 38 ++++ docs/source/adr/D20-supervisors-cadence.rst | 29 +++ docs/source/adr/D21-thesis-track-parallel.rst | 30 +++ docs/source/adr/D22-thesis-research-diary.rst | 33 +++ docs/source/adr/D23-lafa-submission.rst | 38 ++++ docs/source/adr/D24-hardcoded-params.rst | 42 ++++ docs/source/adr/D25-hpc-mode.rst | 38 ++++ docs/source/adr/D26-container-runtime.rst | 29 +++ docs/source/adr/D27-image-registry.rst | 30 +++ docs/source/adr/D28-secrets-management.rst | 32 +++ docs/source/adr/D29-release-pipeline.rst | 36 ++++ docs/source/adr/D30-insights-appendix.rst | 34 +++ docs/source/adr/index.rst | 196 +++++++++++++++++- 31 files changed, 1165 insertions(+), 8 deletions(-) create mode 100644 docs/source/adr/D01-project-structure.rst create mode 100644 docs/source/adr/D02-export-research-dataset-location.rst create mode 100644 docs/source/adr/D03-goprediction-features-jsonb.rst create mode 100644 docs/source/adr/D04-api-versioning.rst create mode 100644 docs/source/adr/D05-frontend-in-core.rst create mode 100644 docs/source/adr/D06-authentication.rst create mode 100644 docs/source/adr/D07-observability-stack.rst create mode 100644 docs/source/adr/D08-ui-components.rst create mode 100644 docs/source/adr/D09-obsolete-lab-runtime-dep.rst create mode 100644 docs/source/adr/D10-schema-sha-v2.rst create mode 100644 docs/source/adr/D11-job-narrative-model.rst create mode 100644 docs/source/adr/D12-fexp-qa-reproduction.rst create mode 100644 docs/source/adr/D13-early-ui-track.rst create mode 100644 docs/source/adr/D14-plugin-granularity.rst create mode 100644 docs/source/adr/D15-protea-method-shipping.rst create mode 100644 docs/source/adr/D16-thesis-location.rst create mode 100644 docs/source/adr/D17-obsolete-thesis-template.rst create mode 100644 docs/source/adr/D18-thesis-writing-model.rst create mode 100644 docs/source/adr/D19-fresearch-targets.rst create mode 100644 docs/source/adr/D20-supervisors-cadence.rst create mode 100644 docs/source/adr/D21-thesis-track-parallel.rst create mode
100644 docs/source/adr/D22-thesis-research-diary.rst create mode 100644 docs/source/adr/D23-lafa-submission.rst create mode 100644 docs/source/adr/D24-hardcoded-params.rst create mode 100644 docs/source/adr/D25-hpc-mode.rst create mode 100644 docs/source/adr/D26-container-runtime.rst create mode 100644 docs/source/adr/D27-image-registry.rst create mode 100644 docs/source/adr/D28-secrets-management.rst create mode 100644 docs/source/adr/D29-release-pipeline.rst create mode 100644 docs/source/adr/D30-insights-appendix.rst diff --git a/docs/source/adr/D01-project-structure.rst b/docs/source/adr/D01-project-structure.rst new file mode 100644 index 0000000..9b6b6af --- /dev/null +++ b/docs/source/adr/D01-project-structure.rst @@ -0,0 +1,38 @@ +ADR-D1: Project structure (7 code repositories plus thesis) +============================================================= + +:Status: Accepted +:Date: 2026-05-05 +:Phase: F0 (closed); enacted across F0-F2 +:Supersedes: earlier monolith assumption in plan v1 + +Context +------- +PROTEA started as a single repository combining the API, workers, ORM, +front-end, four PLM backends, three annotation sources, and the LightGBM +re-ranker training pipeline. As the system grew towards eight backends +and external adoption became a goal, plugin extensibility for third +parties surfaced as a primary architectural concern. + +Decision +-------- +Structure C: seven code repositories plus the thesis manuscript. Plugins +are discovered via Python ``entry_points``. Granularity is per group +(sources, runners, backends), not per individual plugin. Thesis lives at +``~/Thesis/thesis/``. + +Repos: ``protea-core``, ``protea-contracts``, ``protea-method``, +``protea-cafaeval``, ``protea-sources``, ``protea-runners``, +``protea-backends``. + +Consequences +------------ +- Adding a new backend, source, or runner touches one repository or one + sub-module within the relevant group repo. +- ``protea-method`` ships independently of the platform. +- Cross-repo release coordination required (see D29). +- Per-plugin repository granularity deferred to F9 post-defense (see D14). + +Resolution +---------- +Closed in master plan v3, 2026-05-05. diff --git a/docs/source/adr/D02-export-research-dataset-location.rst b/docs/source/adr/D02-export-research-dataset-location.rst new file mode 100644 index 0000000..0dfff27 --- /dev/null +++ b/docs/source/adr/D02-export-research-dataset-location.rst @@ -0,0 +1,30 @@ +ADR-D2: ``export_research_dataset`` lives in ``protea-core`` +============================================================= + +:Status: Accepted +:Date: 2026-05-05 +:Phase: F1 + +Context +------- +The export operation produces frozen ``train.parquet`` and ``eval.parquet`` +artefacts consumed by the LightGBM lab. It needs the feature schema, the +KNN reference cache, and access to the relational data model. Two options +were considered: keep it in ``protea-core``, or move it into the +``protea-runners`` repository alongside the LightGBM trainer. + +Decision +-------- +Keep ``export_research_dataset`` in ``protea-core``. The feature schema is +imported from ``protea-contracts``. + +Consequences +------------ +- Schema bumps in ``protea-contracts`` force a new ``protea-core`` release + but not a new ``protea-runners`` release. +- The lab consumes the dataset over the artifact store, no Python import + coupling. + +Resolution +---------- +Closed. 
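A minimal sketch of the ``entry_points`` discovery that D1 relies on (the group name ``protea.sources`` below is an assumption for illustration, not taken from any of the seven repositories):

    # Hedged sketch: discover installed source plugins by entry-point group.
    # "protea.sources" is a placeholder group name, not the real registration.
    from importlib.metadata import entry_points

    def discover_sources() -> dict[str, object]:
        """Map plugin name -> loaded plugin object for every installed source."""
        return {ep.name: ep.load() for ep in entry_points(group="protea.sources")}

Group-level granularity then only asks each group repository to register one entry point per sub-module, which is what keeps the later per-plugin split of D14 mechanical.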
diff --git a/docs/source/adr/D03-goprediction-features-jsonb.rst b/docs/source/adr/D03-goprediction-features-jsonb.rst new file mode 100644 index 0000000..3cefd29 --- /dev/null +++ b/docs/source/adr/D03-goprediction-features-jsonb.rst @@ -0,0 +1,31 @@ +ADR-D3: ``GOPrediction.features`` stored as JSONB +================================================== + +:Status: Accepted +:Date: 2026-05-05 +:Phase: F3 + +Context +------- +The re-ranker feature set grew from 22 features (v1) through 52 features +(v18-selective) and is expected to keep evolving (lineage, GeOKG, +ensembles). Adding a column to ``GOPrediction`` for every new feature +required an Alembic migration per iteration and bound feature engineering +to DB schema cadence. + +Decision +-------- +Store re-ranker features in a single JSONB column on ``GOPrediction``, +with a transitional dual-write window of approximately one week before +dropping the legacy physical columns. + +Consequences +------------ +- New features cost a registry entry, not an Alembic migration. +- Querying features by name requires JSONB operators; indexed on the + most-queried subset. +- Schema drift caught at the boundary by ``schema_sha`` (see D10). + +Resolution +---------- +Closed; implementation in F3 (T3.1-T3.4). diff --git a/docs/source/adr/D04-api-versioning.rst b/docs/source/adr/D04-api-versioning.rst new file mode 100644 index 0000000..fa06fb3 --- /dev/null +++ b/docs/source/adr/D04-api-versioning.rst @@ -0,0 +1,30 @@ +ADR-D4: API versioning strategy +=============================== + +:Status: Pending +:Date: 2026-05-05 +:Phase: F4 +:Gate: opens at F4 entry + +Context +------- +PROTEA exposes a REST API consumed by the front-end and by external +clients (LAFA containers, downstream pipelines). As the API surface +stabilises, breaking changes need a versioning strategy that does not +strand existing consumers. + +Decision (recommended) +---------------------- +Universal ``/v1/`` path prefix on all endpoints. Future ``/v2/`` branches +without breaking ``/v1/`` consumers. ``Accept`` header negotiation only +considered if a real need surfaces. + +Consequences +------------ +- Front-end fetchers and external clients update once. +- OpenAPI documents per version. +- Schemathesis runs against the version under test. + +Resolution +---------- +Pending; gate opens with F4 (T4.1). diff --git a/docs/source/adr/D05-frontend-in-core.rst b/docs/source/adr/D05-frontend-in-core.rst new file mode 100644 index 0000000..3f22091 --- /dev/null +++ b/docs/source/adr/D05-frontend-in-core.rst @@ -0,0 +1,28 @@ +ADR-D5: Front-end co-located in ``protea-core`` +================================================ + +:Status: Accepted +:Date: 2026-05-05 +:Phase: F1 + +Context +------- +The Next.js front-end is the primary consumer of the PROTEA REST API and +relies tightly on its contract. Two options: keep it inside +``protea-core/apps/web``, or split into a separate ``protea-web`` repo. + +Decision +-------- +Keep the front-end in ``protea-core/apps/web``. Coupling the API to its +primary consumer reduces contract drift; separation can happen later if a +dedicated UI team or external front-end appears. + +Consequences +------------ +- One repository contains both API and UI commits; release tags cover + both surfaces simultaneously. +- Splitting later costs roughly one week. + +Resolution +---------- +Closed. 
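To make the D3 consequence about JSONB operators concrete, a minimal SQLAlchemy sketch follows; the feature name ``knn_vote_ratio`` and the table layout are illustrative assumptions, not the actual PROTEA schema.

    # Hedged sketch: declare the JSONB feature column and pull one feature as a
    # float. The model below is a stand-in, not the real GOPrediction ORM class.
    from sqlalchemy import Float, Integer, select
    from sqlalchemy.dialects.postgresql import JSONB
    from sqlalchemy.orm import DeclarativeBase, mapped_column

    class Base(DeclarativeBase):
        pass

    class GOPrediction(Base):
        __tablename__ = "go_prediction"
        id = mapped_column(Integer, primary_key=True)
        features = mapped_column(JSONB, nullable=False)

    # Rows lacking the key are filtered out; the value is cast from its text form.
    stmt = select(
        GOPrediction.id,
        GOPrediction.features["knn_vote_ratio"].astext.cast(Float),
    ).where(GOPrediction.features.has_key("knn_vote_ratio"))

An expression index on the most-queried keys is what the indexing consequence refers to; no new physical columns are needed.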
diff --git a/docs/source/adr/D06-authentication.rst b/docs/source/adr/D06-authentication.rst new file mode 100644 index 0000000..4aaa381 --- /dev/null +++ b/docs/source/adr/D06-authentication.rst @@ -0,0 +1,34 @@ +ADR-D6: Authentication strategy +================================ + +:Status: Pending +:Date: 2026-05-05 +:Phase: F5 +:Gate: opens at F5 entry + +Context +------- +Sensitive endpoints (job creation, dataset import, re-ranker model +upload, evaluation triggers) are currently unauthenticated. Public +exposure (cloud deployment, LAFA submission tooling, external adopters) +requires an authentication layer. + +Decision (recommended) +---------------------- +Two complementary mechanisms: + +- **API key** for service-to-service calls (``ApiKey`` ORM table, + ``Authorization: Bearer …``). +- **OIDC** via reverse proxy (oauth2-proxy) for human users. + +Rate limiting via ``slowapi``. + +Consequences +------------ +- Migration adds ``ApiKey`` table. +- ``deploy/nginx/`` ships an oauth2-proxy configuration. +- Rate-limit policy documented per endpoint. + +Resolution +---------- +Pending; gate opens with F5 (T5.6). diff --git a/docs/source/adr/D07-observability-stack.rst b/docs/source/adr/D07-observability-stack.rst new file mode 100644 index 0000000..43fa242 --- /dev/null +++ b/docs/source/adr/D07-observability-stack.rst @@ -0,0 +1,35 @@ +ADR-D7: Observability stack +============================ + +:Status: Pending +:Date: 2026-05-05 +:Phase: F-OPS +:Gate: opens at F-OPS entry + +Context +------- +PROTEA currently relies on per-process log files and ad-hoc +``print``/logger statements. Multi-target deployment (cloud, HPC, +airgap) and external adopters need distributed tracing, metrics with +SLOs, and structured log aggregation. + +Decision (recommended) +---------------------- +Single canonical stack: + +- **Tracing**: OpenTelemetry (OTLP exporter) instrumenting FastAPI, + SQLAlchemy, ``pika``. ``traceparent`` propagated HTTP -> queue -> worker. +- **Metrics**: Prometheus client, ``/metrics`` exposed. +- **Dashboards**: Grafana with dashboards committed in ``deploy/grafana/``. +- **Logs**: structured JSON via ``python-json-logger``, shipped to Loki + via promtail or vector. + +Consequences +------------ +- A single prediction is visible end-to-end as one OTel trace. +- Three SLOs documented in ``docs/SLOs.md``. +- Alert rules committed; runbook per alert (see F7). + +Resolution +---------- +Pending; gate opens with F-OPS (T5.1-T5.4). diff --git a/docs/source/adr/D08-ui-components.rst b/docs/source/adr/D08-ui-components.rst new file mode 100644 index 0000000..79f96b8 --- /dev/null +++ b/docs/source/adr/D08-ui-components.rst @@ -0,0 +1,28 @@ +ADR-D8: UI component library +============================= + +:Status: Accepted +:Date: 2026-05-05 +:Phase: F8a + +Context +------- +The front-end needs a consistent component system supporting +accessibility AA, dark mode, and rapid composition of dashboards +(jobs, embeddings, predictions, evaluation, experiments). + +Decision +-------- +``shadcn/ui`` on top of Tailwind v4. Components are copy-paste owned +under ``apps/web/components/ui/``, not imported from a versioned +library. + +Consequences +------------ +- Customisation is in-tree; no library version bumps. +- Visual regression coverage via Chromatic or Percy (T8a.2). +- Lighthouse a11y target ≥95. + +Resolution +---------- +Closed. 
diff --git a/docs/source/adr/D09-obsolete-lab-runtime-dep.rst b/docs/source/adr/D09-obsolete-lab-runtime-dep.rst new file mode 100644 index 0000000..3bd441e --- /dev/null +++ b/docs/source/adr/D09-obsolete-lab-runtime-dep.rst @@ -0,0 +1,31 @@ +ADR-D9: OBSOLETE: lab as runtime dependency +============================================= + +:Status: Obsolete +:Date: 2026-05-05 +:Supersedes: earlier plan revision (v1) +:Superseded-by: D1 (Structure C) + +Context +------- +An earlier revision of the plan considered shipping the +``protea-reranker-lab`` repository as a runtime dependency of +``protea-core`` so that LightGBM training could execute inside the +PROTEA worker pool. + +Decision +-------- +Obsolete. Plan v3 adopts Structure C: the lab merges into +``protea-runners.lightgbm`` as a plugin discovered via ``entry_points``. +There is no runtime coupling. + +Consequences +------------ +- ``protea-runners.lightgbm`` is the canonical home for LightGBM + training. +- The dataset-publishing contract (Dataset row + artifact store URI) + remains the only interface between platform and trainer. + +Resolution +---------- +Declared obsolete on 2026-05-05. diff --git a/docs/source/adr/D10-schema-sha-v2.rst b/docs/source/adr/D10-schema-sha-v2.rst new file mode 100644 index 0000000..123065c --- /dev/null +++ b/docs/source/adr/D10-schema-sha-v2.rst @@ -0,0 +1,35 @@ +ADR-D10: ``schema_sha`` v2 parallel migration +============================================== + +:Status: Pending +:Date: 2026-05-05 +:Phase: F1 +:Gate: T1.6 (requires_human, Alembic on live DB) + +Context +------- +``schema_sha`` is the load-bearing fingerprint that prevents inference +from running with a re-ranker booster trained against a different +feature schema. Historically, two definitions of ``compute_schema_sha`` +co-existed (lab and PROTEA); silent drift caused at least one +non-reproducible run (v9 study, 2026-05-01) before the parity bug was +found and fixed. + +Decision +-------- +Add a parallel ``schema_sha_v2`` column to ``Dataset`` and +``RerankerModel``. Backfill from +``protea_contracts.compute_schema_sha``. Production reads ``v2``; +``v1`` kept until F3 for audit and then dropped. + +Consequences +------------ +- One Alembic migration plus one backfill script. +- Mismatch between v1 and v2 surfaces past silent drift; documented in + a regression test rather than fixed retroactively. +- Boosters loaded for inference compare their stored ``schema_sha`` + against the live ``v2`` value. + +Resolution +---------- +Pending human review of the live-DB migration. Rolls in F1 with T1.6. diff --git a/docs/source/adr/D11-job-narrative-model.rst b/docs/source/adr/D11-job-narrative-model.rst new file mode 100644 index 0000000..6931e0d --- /dev/null +++ b/docs/source/adr/D11-job-narrative-model.rst @@ -0,0 +1,36 @@ +ADR-D11: Operational narrative attached to ``Job`` +==================================================== + +:Status: Accepted +:Date: 2026-05-05 +:Phase: F3 + +Context +------- +Past experimental campaigns left no narrative beyond raw metrics. +Reproducing the why of a past run required archaeology in chat logs +and notebooks. The thesis (chapter 6) needs a curated journey, not a +raw chronological log; the operational layer needs a place to record +the reasoning behind each Job. + +Decision +-------- +Two kinds of narrative artefact: + +- ``Job`` rows gain ``description``, ``findings``, ``tags`` columns. +- A new ``JobComment`` table holds chronological commentary tied to a + Job. 
+ +Material doubles as an internal operational record and as the source +from which thesis chapter 6 is distilled. + +Consequences +------------ +- Hard rule for F-EXP: a Job does not close without ``findings`` + populated. +- UI surfaces narrative inline (D13). +- Thesis writing track (D21) reads from this corpus, not from logs. + +Resolution +---------- +Closed; implementation in F3 (T3.9, T3.10). diff --git a/docs/source/adr/D12-fexp-qa-reproduction.rst b/docs/source/adr/D12-fexp-qa-reproduction.rst new file mode 100644 index 0000000..eadd55b --- /dev/null +++ b/docs/source/adr/D12-fexp-qa-reproduction.rst @@ -0,0 +1,32 @@ +ADR-D12: F-EXP as QA reproduction of the canonical pipeline +============================================================ + +:Status: Accepted +:Date: 2026-05-05 +:Phase: F-EXP + +Context +------- +After the structural refactor (F0-F5), the rebuilt pipeline needs +end-to-end validation. Independently, the thesis needs a clean +campaign whose numbers can be cited without caveat. Running two +campaigns is duplicative. + +Decision +-------- +Treat F-EXP as both: a QA reproduction of the canonical pipeline and +the production run that supplies thesis chapter 6 numbers. Each Job +records its narrative (D11). At the close, material is distilled into +~8-12 thesis pages. + +Consequences +------------ +- Wipe-and-rebuild executed once on a backed-up database. +- Tagging convention ``study_v_thesis`` makes the campaign navigable + as a single experiment unit. +- Replay drill (1 % Fmax tolerance) verifies reproducibility of any + ``ExperimentRun`` row. + +Resolution +---------- +Closed. diff --git a/docs/source/adr/D13-early-ui-track.rst b/docs/source/adr/D13-early-ui-track.rst new file mode 100644 index 0000000..1165efd --- /dev/null +++ b/docs/source/adr/D13-early-ui-track.rst @@ -0,0 +1,33 @@ +ADR-D13: Early UI track parallel to F2 +======================================== + +:Status: Accepted +:Date: 2026-05-05 +:Phase: F8a (parallel to F2 final), F8b (parallel to F-EXP) + +Context +------- +Postponing the front-end until F8 risks shipping a pipeline whose +state is invisible to its operator. Issues that surface only through +the UI (job state mismatches, narrative gaps, dashboard latency) would +arrive too late to influence the design. + +Decision +-------- +Two-stage UI track: + +- **F8a** (2 weeks, parallel to F2 final): basic narrative jobs page, + generic operation launcher, basic evaluation dashboard, dark mode, + a11y AA. +- **F8b** (2 weeks, during F-EXP): SSE streaming, advanced evaluation + dashboard, prediction visualisation, UMAP embeddings page, + experiments page. + +Consequences +------------ +- shadcn/ui is a hard prerequisite (D8). +- F-EXP has a usable UI surface from day one. + +Resolution +---------- +Closed. diff --git a/docs/source/adr/D14-plugin-granularity.rst b/docs/source/adr/D14-plugin-granularity.rst new file mode 100644 index 0000000..96d8d9c --- /dev/null +++ b/docs/source/adr/D14-plugin-granularity.rst @@ -0,0 +1,30 @@ +ADR-D14: Per-plugin repository granularity (deferred) +====================================================== + +:Status: Deferred +:Date: 2026-05-05 +:Phase: F9 (post-defense) + +Context +------- +Structure C ships plugins grouped per concept (sources, runners, +backends), not per individual plugin. This is simpler today (3 group +repos vs 9-12 micro-repos) but might invert if third parties publish +their own plugins and prefer independent release cadence. + +Decision +-------- +Defer the split until after defense. 
If third parties materialise, +splitting a group into per-plugin repos is estimated at 0.5-1 day per +sub-module, contained in F9. + +Consequences +------------ +- Until then, third parties contribute via PR to the relevant group + repository. +- Group repos must keep ``entry_points`` boundaries clean to make a + later split mechanical. + +Resolution +---------- +Deferred to F9 post-defense. diff --git a/docs/source/adr/D15-protea-method-shipping.rst b/docs/source/adr/D15-protea-method-shipping.rst new file mode 100644 index 0000000..ad7812b --- /dev/null +++ b/docs/source/adr/D15-protea-method-shipping.rst @@ -0,0 +1,35 @@ +ADR-D15: ``protea-method`` distribution channels +================================================== + +:Status: Accepted +:Date: 2026-05-05 +:Phase: F-OPS + +Context +------- +``protea-method`` is the pure inference path (KNN, feature compute, +re-ranker apply). Because it has no FastAPI / SQLAlchemy dependencies +it can ship independently of the platform, addressing audiences that +want the method without operating the full stack. + +Decision +-------- +Three distribution channels: + +- **PyPI** public package (``pip install protea-method``). +- **Docker Hub** minimal image (``protea-method-runtime``, + ~3-4 GB with one canonical PLM and one default booster, CLI + ``protea-predict input.fasta output.tsv``). +- **Companion engineering paper** covering the F-OPS pillars + (containers, multi-target deployment, observability, secrets). + +Consequences +------------ +- Two release surfaces to maintain. +- LAFA submission containers (D23) build on top of + ``protea-method-runtime``. +- Engineering paper aligned with F-OPS deliverables. + +Resolution +---------- +Closed. diff --git a/docs/source/adr/D16-thesis-location.rst b/docs/source/adr/D16-thesis-location.rst new file mode 100644 index 0000000..7b9e8c4 --- /dev/null +++ b/docs/source/adr/D16-thesis-location.rst @@ -0,0 +1,29 @@ +ADR-D16: Thesis repository location +===================================== + +:Status: Accepted +:Date: 2026-05-05 +:Phase: F0 + +Context +------- +Thesis sources need a stable home tracked under git, separate from the +code repositories so that prose iteration does not pollute code +history. + +Decision +-------- +``~/Thesis/thesis/``, git-initialised on 2026-05-05 with master branch +at commit ``4fcd449``. ``.gitignore`` filters LaTeX intermediate +artefacts. + +Consequences +------------ +- Thesis history is independent of code history; both can be tagged + per phase. +- Cross-references to code commit SHAs are explicit citations, not + implicit imports. + +Resolution +---------- +Closed. diff --git a/docs/source/adr/D17-obsolete-thesis-template.rst b/docs/source/adr/D17-obsolete-thesis-template.rst new file mode 100644 index 0000000..654c63b --- /dev/null +++ b/docs/source/adr/D17-obsolete-thesis-template.rst @@ -0,0 +1,23 @@ +ADR-D17: OBSOLETE: thesis LaTeX template choice +================================================= + +:Status: Obsolete +:Date: 2026-05-05 +:Supersedes: earlier plan revision (v1) + +Context +------- +A previous plan revision called for selecting a thesis LaTeX template. +The existing in-tree template was retained without further evaluation. + +Decision +-------- +Obsolete. The current template stays. + +Consequences +------------ +None. + +Resolution +---------- +Declared obsolete on 2026-05-05. 
diff --git a/docs/source/adr/D18-thesis-writing-model.rst b/docs/source/adr/D18-thesis-writing-model.rst new file mode 100644 index 0000000..628bf1c --- /dev/null +++ b/docs/source/adr/D18-thesis-writing-model.rst @@ -0,0 +1,30 @@ +ADR-D18: Thesis writing model +============================== + +:Status: Accepted +:Date: 2026-05-05 +:Phase: F-THESIS (sustained from F0) + +Context +------- +Thesis drafting at 60-75k words target risks being deferred past the +point where reasoning behind decisions has rotted. Postponing to F7 +historically failed. + +Decision +-------- +Phase-aligned production: + +- Drafts produced at the close of each major phase. +- User edits during the following phase. +- Co-supervisors (D20) review asynchronously and iterate. + +Consequences +------------ +- Thesis is a continuous track, not a final-phase sprint. +- Writing cadence ~3-4h/week. +- Phase boundaries become natural review points (D20). + +Resolution +---------- +Closed. diff --git a/docs/source/adr/D19-fresearch-targets.rst b/docs/source/adr/D19-fresearch-targets.rst new file mode 100644 index 0000000..d4cfa55 --- /dev/null +++ b/docs/source/adr/D19-fresearch-targets.rst @@ -0,0 +1,38 @@ +ADR-D19: F-RESEARCH targets +============================ + +:Status: Accepted +:Date: 2026-05-05 +:Phase: F-RESEARCH + +Context +------- +F-RESEARCH risks becoming an open campaign. Without an explicit cap, +research questions multiply and the phase loses focus. The plan needs +a small, ordered set of ideas. + +Decision +-------- +Three directed targets, in order: + +1. **Lineage feature** (1 week). Whether a candidate GO term is an + ancestor or descendant of a term already known for the protein. + Implementable as a single registry feature. +2. **GeOKG embeddings** (1 week, conditional). Replace ``anc2vec`` + features with multi-curvature hyperbolic + Euclidean GO embeddings + (Bioinformatics 2025). +3. **Multi-K ensemble** (1 week, optional). Combine K ∈ {5, 10, 20} + instead of K=5 fixed. + +Each idea produces an ``ExperimentRun``. Wins integrate into the +canonical pipeline; losses go to the insights appendix (D30). + +Consequences +------------ +- Phase capped at 2-3 weeks. +- Ideas that did not make the list (PROTEA-DL, retrieval-neural, + R-GCN over GO-DAG) deferred to F11 post-defense. + +Resolution +---------- +Closed. diff --git a/docs/source/adr/D20-supervisors-cadence.rst b/docs/source/adr/D20-supervisors-cadence.rst new file mode 100644 index 0000000..cbf7d18 --- /dev/null +++ b/docs/source/adr/D20-supervisors-cadence.rst @@ -0,0 +1,29 @@ +ADR-D20: Co-supervisor review cadence +======================================= + +:Status: Accepted +:Date: 2026-05-05 +:Phase: F-THESIS + +Context +------- +Co-supervisors are David Orellana-Martín (CABD / Universidad de +Sevilla) and Ana M. Rojas (CABD). They review thesis material +asynchronously and need predictable delivery points. + +Decision +-------- +A capítulo (chapter or chapter portion) is delivered at the close of +each major phase. Supervisors iterate during the following phase. +Their feedback merges back into the next draft. + +Consequences +------------ +- Phase boundaries become natural review checkpoints. +- No long stretches without supervisor visibility. +- Both directors are co-supervisors; never refer to either as singular + ``advisor``. + +Resolution +---------- +Closed. 
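The lineage target in D19 boils down to an ancestor/descendant test over the GO DAG. A self-contained toy version follows (a plain parent map, not the actual registry feature implementation):

    # Hedged sketch: is `candidate` an ancestor or descendant of any term already
    # known for the protein? `parents` maps each GO id to its direct parents.
    def ancestors(term: str, parents: dict[str, set[str]]) -> set[str]:
        seen: set[str] = set()
        stack = [term]
        while stack:
            for parent in parents.get(stack.pop(), set()):
                if parent not in seen:
                    seen.add(parent)
                    stack.append(parent)
        return seen

    def lineage_flag(candidate: str, known: set[str], parents: dict[str, set[str]]) -> bool:
        cand_anc = ancestors(candidate, parents)
        return any(k in cand_anc or candidate in ancestors(k, parents) for k in known)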
diff --git a/docs/source/adr/D21-thesis-track-parallel.rst b/docs/source/adr/D21-thesis-track-parallel.rst new file mode 100644 index 0000000..8645e3f --- /dev/null +++ b/docs/source/adr/D21-thesis-track-parallel.rst @@ -0,0 +1,30 @@ +ADR-D21: Thesis writing track parallel from F0 +================================================ + +:Status: Accepted +:Date: 2026-05-05 +:Phase: F-THESIS (from F0) + +Context +------- +Earlier plan revisions allocated thesis writing to F7 (final +documentation phase). This historically fails because the reasoning +behind early decisions has rotted by the time it is captured. + +Decision +-------- +Thesis writing runs in parallel from F0. ~3-4h/week sustained. +Material curated per phase: each phase produces specific chapter or +sub-section material at its close (mapped explicitly in the master +plan). + +Consequences +------------ +- F0 close already implies refresh of chapters 1-3 plus abstract; + F-OPS produces an entire new chapter (Deployment and Operations); + F-EXP rewrites chapter 6. +- D22 (research-diary tone) makes per-phase production tractable. + +Resolution +---------- +Closed. diff --git a/docs/source/adr/D22-thesis-research-diary.rst b/docs/source/adr/D22-thesis-research-diary.rst new file mode 100644 index 0000000..7842ddb --- /dev/null +++ b/docs/source/adr/D22-thesis-research-diary.rst @@ -0,0 +1,33 @@ +ADR-D22: Thesis as a concise research diary +============================================= + +:Status: Accepted +:Date: 2026-05-05 +:Phase: F-THESIS + +Context +------- +Two failure modes for a software-platform thesis: a chronological log +of every iteration (archaeology, no narrative), or a clean +post-rationalised systematisation (loses the reasoning that justifies +the canonical pipeline). The journey itself (KNN baseline -> 22 +features -> 52 features -> trazabilidad crisis -> leakage discovery +-> v18-selective canonical) is part of the contribution. + +Decision +-------- +Concise research diary. Each pivot that taught something earns 1-3 +pages. Distilled history, not raw archive. Chapter 6 (evaluation) is +the journey, written in prose. The insights appendix (D30) carries +brief notes on ideas that were probed and discarded without reaching +the chapter. + +Consequences +------------ +- Volume target 60-75k words is reachable without padding. +- Every pivot link to its operational record (Job ``findings``, see + D11). + +Resolution +---------- +Closed. diff --git a/docs/source/adr/D23-lafa-submission.rst b/docs/source/adr/D23-lafa-submission.rst new file mode 100644 index 0000000..22398b2 --- /dev/null +++ b/docs/source/adr/D23-lafa-submission.rst @@ -0,0 +1,38 @@ +ADR-D23: LAFA submission strategy +=================================== + +:Status: Accepted +:Date: 2026-05-05 +:Phase: F-LAFA + +Context +------- +LAFA (functionbench.net) provides a public benchmark surface for +protein function annotation methods, comparable in spirit to CAFA. +PROTEA needs a credible adoption story; LAFA also exposes the method +to comparison against external systems on identical evaluation +conditions. + +Decision +-------- +F-LAFA at the end of the timeline (~1.5 weeks). Three containers +built on top of ``protea-method-runtime``: + +- **knn-v1** (one PLM, KNN baseline, GO propagation). +- **knn-8plm** (ensemble across the eight PLMs). +- **v18** (full pipeline with selective re-ranking). + +Each container submitted to the LAFA test suite per +``anphan0828/LAFA_container_guide``. 
+ +Consequences +------------ +- Reuses F-OPS deliverables (``protea-method-runtime``). +- Material for chapter 7 conclusion: external adoption. +- ``apps/lafa_container/`` and ``protea-lafa-container/`` (existing + preliminaries) are not iterated on until F-LAFA opens; F-LAFA + rewrites them on top of ``protea-method-runtime``. + +Resolution +---------- +Closed. diff --git a/docs/source/adr/D24-hardcoded-params.rst b/docs/source/adr/D24-hardcoded-params.rst new file mode 100644 index 0000000..06c92d0 --- /dev/null +++ b/docs/source/adr/D24-hardcoded-params.rst @@ -0,0 +1,42 @@ +ADR-D24: Hardcoded parameters externalisation (T-CONF) +======================================================== + +:Status: Accepted +:Date: 2026-05-05 +:Phase: F0 (closed) + +Context +------- +Hardcoded chunk sizes, retries, batch sizes, timeouts, KNN K values, +score thresholds, pool sizes, reaper timeouts and similar magic +numbers were dispersed throughout ``protea-core``. Tuning per +deployment target (cloud, HPC-BSC, HPC-airgap, dev) is impossible +without externalisation; reproducibility suffers because the magic +numbers are not part of the run record. + +Decision +-------- +T-CONF: a three-step task in F0. + +- **T-CONF.1**: inventory at ``docs/CONFIG_INVENTORY.md`` with 30-60 + entries minimum. +- **T-CONF.2**: ``protea_core.config.Settings`` (pydantic-settings) + with hierarchy ``defaults < config/{env}.yaml < env vars < CLI + flags``. Categories ``QueueTuning``, ``WorkerTuning``, + ``OperationTuning``, ``IOTuning``, ``ObservabilityTuning``. +- **T-CONF.3**: living documentation appendix + (``docs/source/appendix/configuration.rst``) auto-generated from + the pydantic models. + +Consequences +------------ +- Magic numbers in operations code are forbidden post-T-CONF; + ``# config-exempt: `` allowed only for semantic constants + (``MD5_HASH_LEN``). +- Each ``ExperimentRun`` row records resolved hyperparameters as + provenance. +- HPC and airgap deployments tune via ``config/hpc-bsc.yaml`` etc. + +Resolution +---------- +Closed (T-CONF.1-3 delivered in F0, 2026-05-05). diff --git a/docs/source/adr/D25-hpc-mode.rst b/docs/source/adr/D25-hpc-mode.rst new file mode 100644 index 0000000..2597808 --- /dev/null +++ b/docs/source/adr/D25-hpc-mode.rst @@ -0,0 +1,38 @@ +ADR-D25: HPC operation mode +============================= + +:Status: Pending +:Date: 2026-05-05 +:Phase: F-OPS +:Gate: opens at F-OPS entry + +Context +------- +PROTEA must support HPC environments (BSC and similar). HPC sites +typically forbid privileged Docker, may restrict outbound network, +and schedule via SLURM. Two main modes are available: + +- **Mode B**: stateless workers running on HPC nodes connect to a + PostgreSQL and RabbitMQ hosted in the cloud (LifeWatch / EOSC). +- **Mode C**: fully airgapped batch bundle. ``.sif`` Apptainer image + with snapshot DB precargado, default booster, single-node SLURM + job, no outbound traffic. + +Decision (recommended) +---------------------- +Both. Mode B as primary (closer to the cloud architecture). Mode C as +fallback for sites without outbound network or strict data-sovereignty +constraints. + +Consequences +------------ +- Two SLURM templates (``deploy/hpc/slurm-mode-b.sh``, + ``deploy/hpc/slurm-mode-c.sh``). +- Apptainer ``.sif`` produced from the OCI multi-stage builds (see + D26). +- Airgap bundle (``protea-airgap-bundle-vX.Y.Z.tar.gz``) tested on a + network-disconnected machine. + +Resolution +---------- +Pending; gate opens with F-OPS (T-OPS.5, T-OPS.9). 
diff --git a/docs/source/adr/D26-container-runtime.rst b/docs/source/adr/D26-container-runtime.rst new file mode 100644 index 0000000..4277568 --- /dev/null +++ b/docs/source/adr/D26-container-runtime.rst @@ -0,0 +1,29 @@ +ADR-D26: Container runtime: OCI plus Apptainer +================================================ + +:Status: Accepted +:Date: 2026-05-05 +:Phase: F-OPS + +Context +------- +Cloud and developer environments expect OCI containers (Docker, +Podman, k8s). HPC sites mostly forbid privileged Docker but support +Apptainer (formerly Singularity) on rootless ``.sif`` images. + +Decision +-------- +Source of truth is OCI multi-stage Dockerfiles per repo. CI converts +each tagged image to an Apptainer ``.sif`` published as a release +artefact. No separate Apptainer Definition file maintained by hand. + +Consequences +------------ +- One Dockerfile per repo. ``.sif`` is a build artefact, not a source + format. +- Image size budget per repo: <500 MB. +- ``protea-bundle`` repo orchestrates fat image construction for HPC. + +Resolution +---------- +Closed. diff --git a/docs/source/adr/D27-image-registry.rst b/docs/source/adr/D27-image-registry.rst new file mode 100644 index 0000000..f4a3b49 --- /dev/null +++ b/docs/source/adr/D27-image-registry.rst @@ -0,0 +1,30 @@ +ADR-D27: Image registry +========================= + +:Status: Pending +:Date: 2026-05-05 +:Phase: F-OPS +:Gate: opens at F-OPS entry + +Context +------- +Seven OCI images need a hosting registry visible from cloud +deployments, HPC tooling that pulls before converting to ``.sif``, +and external adopters consuming ``protea-method-runtime``. + +Decision (recommended) +---------------------- +``ghcr.io`` (GitHub Container Registry). + +Consequences +------------ +- GitHub Actions push images on tag using the repository's own + GITHUB_TOKEN. +- Public visibility for ``protea-method-runtime``; private or + org-scoped for internal images if needed. +- Mirror to Docker Hub considered later if external pull rates demand + it. + +Resolution +---------- +Pending; gate opens with F-OPS (T-OPS.8). diff --git a/docs/source/adr/D28-secrets-management.rst b/docs/source/adr/D28-secrets-management.rst new file mode 100644 index 0000000..93dc08a --- /dev/null +++ b/docs/source/adr/D28-secrets-management.rst @@ -0,0 +1,32 @@ +ADR-D28: Secrets management +============================= + +:Status: Pending +:Date: 2026-05-05 +:Phase: F-OPS +:Gate: opens at F-OPS entry + +Context +------- +PostgreSQL credentials, MinIO keys, OIDC client secrets, optional +external API tokens and SSH keys cannot live in plaintext in +repositories. Multi-target deployment (cloud, HPC, airgap) requires a +single mechanism that works across all of them. + +Decision (recommended) +---------------------- +``sops`` with ``age`` keys. Encrypted ``secrets.enc.yaml`` committed +in repos; CI decrypts with the age key stored in GitHub Secrets. +Local development uses a developer-specific age key checked into the +user's keyring. + +Consequences +------------ +- Plaintext secrets never on disk persistente. +- Per-environment file (``secrets.dev.enc.yaml``, + ``secrets.prod.enc.yaml``). +- Rotation procedure documented. + +Resolution +---------- +Pending; gate opens with F-OPS (T-OPS.7). 
diff --git a/docs/source/adr/D29-release-pipeline.rst b/docs/source/adr/D29-release-pipeline.rst new file mode 100644 index 0000000..dbfc8d8 --- /dev/null +++ b/docs/source/adr/D29-release-pipeline.rst @@ -0,0 +1,36 @@ +ADR-D29: Release pipeline +=========================== + +:Status: Pending +:Date: 2026-05-05 +:Phase: F-OPS +:Gate: opens at F-OPS entry + +Context +------- +Seven repos need an independent SemVer release cadence. Releasing one +repo alone cannot break another; cross-repo integration testing on +tag is required. ``protea-contracts`` is the most disruptive: bumps +ripple through all consumers. + +Decision (recommended) +---------------------- +Per-repo SemVer plus cross-repo integration test on tag: + +- A SemVer tag (``vX.Y.Z``) in any repo dispatches a build, image + push (D27), and integration test that pulls the new image plus the + pinned versions of the other six and runs a smoke pipeline. +- Failures block image promotion; the tag remains but the image is + marked as ``release-candidate`` until the integration test passes. +- ``protea-contracts`` releases trigger a re-pin in all consumers as + a follow-up automated PR. + +Consequences +------------ +- Tag is the release primitive; PRs are not. +- One canonical integration test stack lives in ``protea-bundle``. +- Manual rollback is repo-local (revert tag, push fix, retag). + +Resolution +---------- +Pending; gate opens with F-OPS (T-OPS.8). diff --git a/docs/source/adr/D30-insights-appendix.rst b/docs/source/adr/D30-insights-appendix.rst new file mode 100644 index 0000000..016e793 --- /dev/null +++ b/docs/source/adr/D30-insights-appendix.rst @@ -0,0 +1,34 @@ +ADR-D30: Insights appendix +============================ + +:Status: Accepted +:Date: 2026-05-05 +:Phase: F7 + +Context +------- +Several lessons learned during the project deserve a written record +that is neither a peer-reviewed publication nor a reluctant footnote +inside an unrelated chapter. Examples: the ``anc2vec`` feature +leakage discovery (2026-05-05), the ``schema_sha`` drift incident, +the v18 selective re-ranking discovery, the PK coverage cafaeval +upstream bug. None of these belong in the canonical evaluation; all +of them taught something. + +Decision +-------- +A short appendix at ``docs/source/appendix/insights.rst`` with one +paragraph to one page per insight. No formalisms. Honest tone: +described as encountered, with the workaround or fix that closed it. + +Consequences +------------ +- Companion to chapter 6 of the thesis but not part of the chapter + itself. +- Linked from chapter 7 conclusion as a pointer to the operational + history. +- Stable home for future incidents discovered post-defense. + +Resolution +---------- +Closed. diff --git a/docs/source/adr/index.rst b/docs/source/adr/index.rst index d4423b4..b0ee4c4 100644 --- a/docs/source/adr/index.rst +++ b/docs/source/adr/index.rst @@ -1,15 +1,28 @@ Architecture Decision Records ============================= -Design decisions that are not obvious from reading the code. Each ADR -documents **why** a decision was made, not just what — the code already +Design decisions that are not obvious from reading the code. Each ADR +documents **why** a decision was made, not just what. The code already shows the what. -Decisions are grouped by system layer: +ADRs come in two layers: + +- **Implementation decisions** (numbered ``001``-``008``): runtime, + data model and operational choices discovered while building + PROTEA. 
They explain trade-offs of concrete code paths (KNN + algorithm choice, queue topology, deduplication strategy, retries, + etc.). +- **Strategic decisions** (``D1``-``D30``): plan-level decisions + taken in the master plan v3 (2026-05-05). They drive the structure + of the project, the deployment story, and the thesis writing + cadence. + +Implementation decisions +------------------------ .. list-table:: :header-rows: 1 - :widths: 10 50 40 + :widths: 8 50 42 * - ADR - Decision @@ -17,9 +30,6 @@ Decisions are grouped by system layer: * - 001 - :doc:`KNN on CPU, not pgvector or GPU <001-knn-without-pgvector>` - pgvector does not scale to 500K+ vectors; GPU must be reserved for inference - * - 006 - - :doc:`Sequence deduplication by MD5 <006-sequence-deduplication-by-md5>` - - 30K duplicate sequences in Swiss-Prot waste hours of GPU time * - 002 - :doc:`Two-session worker pattern <002-two-session-worker-pattern>` - A mid-operation crash left the job invisible to monitoring @@ -32,12 +42,152 @@ Decisions are grouped by system layer: * - 005 - :doc:`Reusable RabbitMQ connections <005-thread-local-rabbitmq-connections>` - A coordinator dispatching 500 batches opened 500 TCP connections + * - 006 + - :doc:`Sequence deduplication by MD5 <006-sequence-deduplication-by-md5>` + - 30K duplicate sequences in Swiss-Prot waste hours of GPU time * - 007 - :doc:`Contract-first integration with protea-reranker-lab <007-contract-first-lab-integration>` - Re-ranker iteration cadence would contaminate the production dependency tree * - 008 - :doc:`PK coverage fix in cafaeval fork <008-cafaeval-pk-coverage-fix>` - - Upstream cafaeval reports coverage > 1 in PK; precision is under-divided by the same factor + - Upstream cafaeval reports coverage > 1 in PK; precision is under-divided + +Strategic decisions +------------------- + +Decisions taken in the master plan v3 (2026-05-05). Statuses: +*Accepted*, *Pending* (gate opens at the indicated phase), *Deferred* +(scheduled later in the timeline) or *Obsolete* (superseded by a +later revision). + +.. 
list-table:: + :header-rows: 1 + :widths: 6 38 12 44 + + * - ID + - Decision + - Status + - Phase / Gate + * - D1 + - :doc:`Project structure (7 code repos) <D01-project-structure>` + - Accepted + - F0 (closed); enacted F0-F2 + * - D2 + - :doc:`export_research_dataset in protea-core <D02-export-research-dataset-location>` + - Accepted + - F1 + * - D3 + - :doc:`GOPrediction.features as JSONB <D03-goprediction-features-jsonb>` + - Accepted + - F3 + * - D4 + - :doc:`API versioning <D04-api-versioning>` + - Pending + - gate at F4 + * - D5 + - :doc:`Front-end in protea-core <D05-frontend-in-core>` + - Accepted + - F1 + * - D6 + - :doc:`Authentication strategy <D06-authentication>` + - Pending + - gate at F5 + * - D7 + - :doc:`Observability stack <D07-observability-stack>` + - Pending + - gate at F-OPS + * - D8 + - :doc:`UI component library <D08-ui-components>` + - Accepted + - F8a + * - D9 + - :doc:`OBSOLETE: lab as runtime dependency <D09-obsolete-lab-runtime-dep>` + - Obsolete + - superseded by D1 + * - D10 + - :doc:`schema_sha v2 migration <D10-schema-sha-v2>` + - Pending + - T1.6 (requires_human) + * - D11 + - :doc:`Job narrative model <D11-job-narrative-model>` + - Accepted + - F3 + * - D12 + - :doc:`F-EXP as QA reproduction <D12-fexp-qa-reproduction>` + - Accepted + - F-EXP + * - D13 + - :doc:`Early UI track parallel to F2 <D13-early-ui-track>` + - Accepted + - F8a / F8b + * - D14 + - :doc:`Plugin granularity (deferred) <D14-plugin-granularity>` + - Deferred + - F9 post-defense + * - D15 + - :doc:`protea-method shipping channels <D15-protea-method-shipping>` + - Accepted + - F-OPS + * - D16 + - :doc:`Thesis repository location <D16-thesis-location>` + - Accepted + - F0 + * - D17 + - :doc:`OBSOLETE: thesis template choice <D17-obsolete-thesis-template>` + - Obsolete + - n/a + * - D18 + - :doc:`Thesis writing model <D18-thesis-writing-model>` + - Accepted + - F-THESIS + * - D19 + - :doc:`F-RESEARCH targets <D19-fresearch-targets>` + - Accepted + - F-RESEARCH + * - D20 + - :doc:`Co-supervisor review cadence <D20-supervisors-cadence>` + - Accepted + - F-THESIS + * - D21 + - :doc:`Thesis writing parallel from F0 <D21-thesis-track-parallel>` + - Accepted + - F-THESIS + * - D22 + - :doc:`Thesis as research diary <D22-thesis-research-diary>` + - Accepted + - F-THESIS + * - D23 + - :doc:`LAFA submission strategy <D23-lafa-submission>` + - Accepted + - F-LAFA + * - D24 + - :doc:`Hardcoded params externalisation (T-CONF) <D24-hardcoded-params>` + - Accepted + - F0 (closed) + * - D25 + - :doc:`HPC operation mode <D25-hpc-mode>` + - Pending + - gate at F-OPS + * - D26 + - :doc:`Container runtime: OCI plus Apptainer <D26-container-runtime>` + - Accepted + - F-OPS + * - D27 + - :doc:`Image registry <D27-image-registry>` + - Pending + - gate at F-OPS + * - D28 + - :doc:`Secrets management <D28-secrets-management>` + - Pending + - gate at F-OPS + * - D29 + - :doc:`Release pipeline <D29-release-pipeline>` + - Pending + - gate at F-OPS + * - D30 + - :doc:`Insights appendix <D30-insights-appendix>` + - Accepted + - F7 ..
toctree:: :maxdepth: 1 @@ -51,3 +201,33 @@ Decisions are grouped by system layer: 006-sequence-deduplication-by-md5 007-contract-first-lab-integration 008-cafaeval-pk-coverage-fix + D01-project-structure + D02-export-research-dataset-location + D03-goprediction-features-jsonb + D04-api-versioning + D05-frontend-in-core + D06-authentication + D07-observability-stack + D08-ui-components + D09-obsolete-lab-runtime-dep + D10-schema-sha-v2 + D11-job-narrative-model + D12-fexp-qa-reproduction + D13-early-ui-track + D14-plugin-granularity + D15-protea-method-shipping + D16-thesis-location + D17-obsolete-thesis-template + D18-thesis-writing-model + D19-fresearch-targets + D20-supervisors-cadence + D21-thesis-track-parallel + D22-thesis-research-diary + D23-lafa-submission + D24-hardcoded-params + D25-hpc-mode + D26-container-runtime + D27-image-registry + D28-secrets-management + D29-release-pipeline + D30-insights-appendix From 43da412ad9851354a3a51aea5440086688b88ac3 Mon Sep 17 00:00:00 2001 From: frapercan Date: Wed, 6 May 2026 01:43:36 +0200 Subject: [PATCH 64/73] refactor(goa): F2A.6-real LoadGOAAnnotationsOperation consumes plugin MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit First Level-1 plugin migration: ``LoadGOAAnnotationsOperation`` delegates HTTP + gzip + GAF parsing to ``protea_sources.goa.GoaSource.stream``, becoming a thin persistence adapter that owns DB filtering, GO-term resolution, dedup, and ``pg_insert``. Pairs with ``protea-sources/d1d60f6`` (``GoaSource.stream`` real implementation) and ``protea-contracts/20987a5`` (``GoaStreamPayload`` + ``GoaAnnotationRecord``). What moved out: * ``_stream_gaf`` body (~30 LOC of HTTP/gzip/parsing): now a one-liner that constructs a typed ``GoaStreamPayload`` and yields from ``goa_plugin.stream``. * Eight ``_IDX_*`` GAF column constants: now in protea-sources. * ``import gzip``, ``import io``, ``import requests``: removed — the plugin owns the network and decode layers. What stayed: * ``_load_accessions`` (canonical-accession universe). * ``_load_go_term_map`` (GO-id → term-id). * ``_store_buffer`` (dedup + pg_insert with on_conflict_do_nothing). Now consumes ``GoaAnnotationRecord`` via attribute access (``rec.accession``) instead of dict access (``rec["accession"]``). * ``_maybe_enqueue_atomic_eval`` (auto-eval child job). * Operation lifecycle, ``LoadGOAAnnotationsPayload`` validation, ``OperationResult`` shaping. Tests updated, not extended: * ``_make_record`` test fixture now constructs ``GoaAnnotationRecord`` instances; ``with_from=""`` becomes ``with_from=None`` (semantically identical, the old code converted "" → None at insert time). * ``TestStreamGaf`` patches now target ``protea_sources.goa.requests .get`` instead of the operation-local ``requests.get``. Assertions migrated from dict access to attribute access. * ``rec.copy()`` → ``rec.model_copy()`` (pydantic v2 deprecation). Behavioural parity: * ``_store_buffer`` still does ``rec.accession.strip()`` for the DB-lookup field (parser preserves raw GAF columns; strip happens where the lookup needs it). Same observable behaviour as before. * Empty optional fields ("" → None) handled by the parser at the boundary, not by the operation. No DB-insert diff. * Dedup key ``(set_id, accession, go_term_id, evidence_code)``, ``on_conflict_do_nothing(constraint=...)`` constraint, page-level commit policy: all preserved verbatim. Suite: 1136 passed, 12 skipped (= unchanged from master). 
The 54 ``test_load_goa_annotations`` cases all pass on the new boundary. Why Level 1 only (the design discipline): protea-sources is a leaf C-stack package; importing ``protea.infrastructure.orm.*`` would invert the dependency direction. Level 1 (HTTP + parsing) cuts cleanly along the SQLAlchemy boundary; Level 2 (move the operation entirely) waits for F2C ORM extraction. See ``~/Thesis/f2a6_real_migration_design.md``. Pattern locked for the remaining migrations (QuickGO, UniProt FASTA, UniProt metadata): typed ``StreamPayload`` + ``Record`` in protea-contracts, ``Source.stream`` in protea-sources, operation refactor here. Part of F2A.6-real migration plan (master plan v3). --- .../core/operations/load_goa_annotations.py | 84 ++++++------------- tests/test_load_goa_annotations.py | 60 ++++++------- 2 files changed, 58 insertions(+), 86 deletions(-) diff --git a/protea/core/operations/load_goa_annotations.py b/protea/core/operations/load_goa_annotations.py index a5655f9..01ac84a 100644 --- a/protea/core/operations/load_goa_annotations.py +++ b/protea/core/operations/load_goa_annotations.py @@ -1,13 +1,11 @@ from __future__ import annotations -import gzip -import io import time import uuid from collections.abc import Iterator from typing import Annotated, Any -import requests +from protea_contracts import GoaAnnotationRecord, GoaStreamPayload from pydantic import Field, field_validator from sqlalchemy import distinct, select from sqlalchemy.orm import Session @@ -86,16 +84,6 @@ def summarize_payload(self, payload: dict[str, Any]) -> str: bits.append(f"limit={p['total_limit']}") return " · ".join(bits) - # GAF 2.x column indices (0-based after splitting on tab) - _IDX_ACCESSION = 1 - _IDX_QUALIFIER = 3 - _IDX_GO_ID = 4 - _IDX_DB_REFERENCE = 5 - _IDX_EVIDENCE = 6 - _IDX_WITH_FROM = 7 - _IDX_ASSIGNED_BY = 14 - _IDX_DATE = 13 - def execute( self, session: Session, payload: dict[str, Any], *, emit: EmitFn ) -> OperationResult: @@ -144,7 +132,7 @@ def execute( total_inserted = 0 total_skipped = 0 pages = 0 - buffer: list[dict[str, str]] = [] + buffer: list[GoaAnnotationRecord] = [] for record in self._stream_gaf(p, emit): total_lines += 1 @@ -340,45 +328,27 @@ def _load_go_term_map( emit("load_goa_annotations.load_go_terms_done", None, {"go_terms": len(mapping)}, "info") return mapping - def _stream_gaf(self, p: LoadGOAAnnotationsPayload, emit: EmitFn) -> Iterator[dict[str, str]]: - emit("load_goa_annotations.download_start", None, {"gaf_url": p.gaf_url}, "info") - resp = requests.get(p.gaf_url, stream=True, timeout=p.timeout_seconds) - resp.raise_for_status() - - compressed = p.gaf_url.endswith(".gz") - raw_stream = resp.raw - raw_stream.decode_content = True - - stream: io.TextIOWrapper - if compressed: - gz = gzip.GzipFile(fileobj=raw_stream) - stream = io.TextIOWrapper(gz, encoding="utf-8", errors="replace") - else: - stream = io.TextIOWrapper(raw_stream, encoding="utf-8", errors="replace") - - with stream: - for raw in stream: - line = raw.rstrip("\n") - if not line or line.startswith("!"): - continue - parts = line.split("\t") - if len(parts) < 15: - continue - yield { - "accession": parts[self._IDX_ACCESSION], - "go_id": parts[self._IDX_GO_ID], - "qualifier": parts[self._IDX_QUALIFIER], - "evidence_code": parts[self._IDX_EVIDENCE], - "db_reference": parts[self._IDX_DB_REFERENCE], - "with_from": parts[self._IDX_WITH_FROM], - "assigned_by": parts[self._IDX_ASSIGNED_BY], - "annotation_date": parts[self._IDX_DATE], - } + def _stream_gaf( + self, p: LoadGOAAnnotationsPayload, emit: EmitFn + ) -> 
Iterator[GoaAnnotationRecord]: + """Delegate to the protea-sources GoaSource plugin. + + The plugin owns HTTP, gzip decoding, and GAF line parsing; the + operation owns DB filtering, GO term resolution, dedup, and + bulk insert. See ``f2a6_real_migration_design.md`` (D-MIGR-01, + D-MIGR-02, D-MIGR-06). + """ + from protea_sources.goa import plugin as goa_plugin + + yield from goa_plugin.stream( + GoaStreamPayload(gaf_url=p.gaf_url, timeout_seconds=p.timeout_seconds), + emit=emit, + ) def _store_buffer( self, session: Session, - records: list[dict[str, str]], + records: list[GoaAnnotationRecord], annotation_set_id: uuid.UUID, valid_accessions: set[str], go_term_map: dict[str, int], @@ -388,18 +358,18 @@ def _store_buffer( seen: set[tuple] = set() for rec in records: - accession = rec["accession"].strip() + accession = rec.accession.strip() if not accession or accession not in valid_accessions: skipped += 1 continue - go_id = rec["go_id"].strip() + go_id = rec.go_id.strip() go_term_id = go_term_map.get(go_id) if go_term_id is None: skipped += 1 continue - evidence_code = rec["evidence_code"] or None + evidence_code = rec.evidence_code dedup_key = (annotation_set_id, accession, go_term_id, evidence_code) if dedup_key in seen: skipped += 1 @@ -411,12 +381,12 @@ def _store_buffer( "annotation_set_id": annotation_set_id, "protein_accession": accession, "go_term_id": go_term_id, - "qualifier": rec["qualifier"] or None, + "qualifier": rec.qualifier, "evidence_code": evidence_code, - "assigned_by": rec["assigned_by"] or None, - "db_reference": rec["db_reference"] or None, - "with_from": rec["with_from"] or None, - "annotation_date": rec["annotation_date"] or None, + "assigned_by": rec.assigned_by, + "db_reference": rec.db_reference, + "with_from": rec.with_from, + "annotation_date": rec.annotation_date, } ) diff --git a/tests/test_load_goa_annotations.py b/tests/test_load_goa_annotations.py index 442f8d7..0b94d84 100644 --- a/tests/test_load_goa_annotations.py +++ b/tests/test_load_goa_annotations.py @@ -153,16 +153,18 @@ def _op(self) -> LoadGOAAnnotationsOperation: return LoadGOAAnnotationsOperation() def _make_record(self, accession="P12345", go_id="GO:0003824", evidence="IDA"): - return { - "accession": accession, - "go_id": go_id, - "qualifier": "enables", - "evidence_code": evidence, - "db_reference": "PMID:1", - "with_from": "", - "assigned_by": "UniProt", - "annotation_date": "20240101", - } + from protea_contracts import GoaAnnotationRecord + + return GoaAnnotationRecord( + accession=accession, + go_id=go_id, + qualifier="enables", + evidence_code=evidence, + db_reference="PMID:1", + with_from=None, + assigned_by="UniProt", + annotation_date="20240101", + ) def test_skips_unknown_accession(self) -> None: op = self._op() @@ -228,7 +230,7 @@ def test_deduplicates_within_buffer(self) -> None: op = self._op() session = MagicMock() rec = self._make_record() - records = [rec.copy(), rec.copy(), rec.copy()] + records = [rec.model_copy(), rec.model_copy(), rec.model_copy()] inserted, skipped = op._store_buffer( session, records, @@ -332,7 +334,7 @@ def _stream_from_text(self, text: str, url="https://example.com/goa.gaf"): mock_resp.raise_for_status = MagicMock() with patch( - "protea.core.operations.load_goa_annotations.requests.get", return_value=mock_resp + "protea_sources.goa.requests.get", return_value=mock_resp ): return list(self.op._stream_gaf(payload, emit)) @@ -340,9 +342,9 @@ def test_parses_valid_gaf_line(self): line = _gaf_line(accession="P12345", go_id="GO:0003674", evidence="IDA") 
records = self._stream_from_text(line + "\n") assert len(records) == 1 - assert records[0]["accession"] == "P12345" - assert records[0]["go_id"] == "GO:0003674" - assert records[0]["evidence_code"] == "IDA" + assert records[0].accession == "P12345" + assert records[0].go_id == "GO:0003674" + assert records[0].evidence_code == "IDA" def test_skips_comment_lines(self): text = "!this is a comment\n" + _gaf_line() + "\n" @@ -367,7 +369,7 @@ def test_multiple_records(self): ] records = self._stream_from_text("\n".join(lines) + "\n") assert len(records) == 3 - assert [r["accession"] for r in records] == ["A1", "A2", "A3"] + assert [r.accession for r in records] == ["A1", "A2", "A3"] def test_extracts_all_fields(self): line = _gaf_line( @@ -382,14 +384,14 @@ def test_extracts_all_fields(self): ) records = self._stream_from_text(line + "\n") r = records[0] - assert r["accession"] == "Q99999" - assert r["go_id"] == "GO:0005575" - assert r["qualifier"] == "located_in" - assert r["evidence_code"] == "IEA" - assert r["db_reference"] == "GO_REF:001" - assert r["with_from"] == "InterPro:IPR000001" - assert r["annotation_date"] == "20230615" - assert r["assigned_by"] == "InterPro" + assert r.accession == "Q99999" + assert r.go_id == "GO:0005575" + assert r.qualifier == "located_in" + assert r.evidence_code == "IEA" + assert r.db_reference == "GO_REF:001" + assert r.with_from == "InterPro:IPR000001" + assert r.annotation_date == "20230615" + assert r.assigned_by == "InterPro" def test_gzip_url_uses_gzip_decompression(self): import gzip as gzip_mod @@ -410,7 +412,7 @@ def test_gzip_url_uses_gzip_decompression(self): mock_resp.raise_for_status = MagicMock() with patch( - "protea.core.operations.load_goa_annotations.requests.get", return_value=mock_resp + "protea_sources.goa.requests.get", return_value=mock_resp ): records = list(self.op._stream_gaf(payload, emit)) assert len(records) == 1 @@ -574,16 +576,16 @@ def fake_store_buffer(_session, records, _ann_set_id, _valid, _go_map): skipped = 0 seen = set() for rec in records: - acc = rec["accession"].strip() + acc = rec.accession.strip() if not acc or acc not in real_valid: skipped += 1 continue - go_id = rec["go_id"].strip() + go_id = rec.go_id.strip() go_term_id = real_go.get(go_id) if go_term_id is None: skipped += 1 continue - ev = rec["evidence_code"] or None + ev = rec.evidence_code key = (_ann_set_id, acc, go_term_id, ev) if key in seen: skipped += 1 @@ -597,7 +599,7 @@ def fake_store_buffer(_session, records, _ann_set_id, _valid, _go_map): with ( patch( - "protea.core.operations.load_goa_annotations.requests.get", + "protea_sources.goa.requests.get", return_value=mock_resp, ), patch( From 42d4dd45de2b6ab00b6177a0cbdd2239c5cf36e4 Mon Sep 17 00:00:00 2001 From: frapercan Date: Wed, 6 May 2026 01:59:13 +0200 Subject: [PATCH 65/73] refactor(quickgo): F2A.6-real LoadQuickGOAnnotationsOperation consumes plugin MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Second Level-1 plugin migration. LoadQuickGOAnnotationsOperation delegates HTTP + TSV streaming + ECO mapping to the protea-sources QuickGoSource plugin, becoming a thin persistence adapter. Pairs with protea-sources/f37dfce (real plugin) and protea-contracts/ c5433ed (typed payloads + record). What moved out: * _stream_quickgo body (~70 LOC of batching, HTTP, TSV parsing): now a one-liner constructing a typed QuickGoStreamPayload and yielding from quickgo_plugin.stream. * _fetch_quickgo_page method: deleted entirely. Plugin owns the per-batch HTTP fetch. 
* _load_eco_mapping body (~13 LOC of HTTP + line parsing): now one call to quickgo_plugin.fetch_eco_mapping(EcoMappingPayload( url=...)). Operation keeps the wrapper for the empty-URL short circuit (returns {} when eco_mapping_url is None). * import io, import requests: removed from the operation module. What stayed: * _load_accessions (canonical + protein accession universes). * _load_go_term_map (GO-id -> term-id). * _store_buffer (dedup + ECO map application + pg_insert with on_conflict_do_nothing). Now consumes QuickGoAnnotationRecord via attribute access (rec.accession) instead of dict access (rec["GENE PRODUCT ID"]). * Operation lifecycle, LoadQuickGOAnnotationsPayload validation (which keeps page_size, total_limit, commit_every_page knobs that are operation-side concerns and don't belong in the plugin payload). Tests updated: * _record(...) helper builds QuickGoAnnotationRecord instances from kwargs; replaces the verbose dict-literal _QUICKGO_ROWS. * TestStoreBuffer (~9 tests) consumes records, not dicts. The test_empty_eco_id_becomes_none test now passes eco_id=None directly (parser-side empty-cell handling). The test_empty_accession_skipped test was renamed to test_unknown_accession_skipped: whitespace handling moved to the parser in protea-sources, so the operation only sees accessions that don't match valid_accessions. * TestLoadEcoMapping (~5 tests): patches and event names swapped to source.quickgo.eco_mapping_*. The empty-URL short circuit test stayed — operation-side behaviour, not plugin-side. * TestStreamQuickgo + TestExecute: patches swapped to protea_sources.quickgo.requests.get. Batching event name swap to source.quickgo.batching. * TestFetchQuickgoPage class deleted entirely (~135 LOC). The tests were exercising the parser through HTTP mocks; the parser is now in protea-sources where parse_quickgo_row and parse_quickgo_tsv have full unit tests. Net -8 tests in PROTEA, +9 unit tests in protea-sources for a strictly better surface. Behavioural parity: * Empty cells -> None at parser boundary (matches old _store_buffer "or None" handling at insert time). No DB-insert diff. * Dedup, on_conflict_do_nothing constraint, ECO map application via eco_map.get(rec.eco_id, rec.eco_id): preserved verbatim. * Multi-batch URL construction: identical (plugin's gene_product_batch_size matches operation's payload field). * Event names changed (load_quickgo_annotations.* -> source.quickgo.* for plugin-emitted events). Operation-side events unchanged. Downstream consumers reading JobEvent rows must filter on the new prefix; flagged here for the operator changelog. Suite: 1128 passed, 12 skipped (= -8 from master because the 8 redundant TestFetchQuickgoPage cases were deleted, not regressed). The 37 test_load_quickgo_annotations cases all pass on the new boundary. Part of F2A.6-real migration plan, step 2 of 4. Pattern locked for the remaining UniProt FASTA + UniProt metadata migrations. 
--- .../operations/load_quickgo_annotations.py | 164 ++++------ tests/test_load_quickgo_annotations.py | 283 +++++------------- 2 files changed, 119 insertions(+), 328 deletions(-) diff --git a/protea/core/operations/load_quickgo_annotations.py b/protea/core/operations/load_quickgo_annotations.py index 6fbf750..f37821b 100644 --- a/protea/core/operations/load_quickgo_annotations.py +++ b/protea/core/operations/load_quickgo_annotations.py @@ -1,12 +1,15 @@ from __future__ import annotations -import io import time import uuid from collections.abc import Iterator from typing import Annotated, Any -import requests +from protea_contracts import ( + EcoMappingPayload, + QuickGoAnnotationRecord, + QuickGoStreamPayload, +) from pydantic import Field, field_validator from sqlalchemy import distinct, select from sqlalchemy.orm import Session @@ -137,7 +140,7 @@ def execute( total_inserted = 0 total_skipped = 0 pages = 0 - buffer: list[dict[str, str]] = [] + buffer: list[QuickGoAnnotationRecord] = [] for record in self._stream_quickgo(p, emit, gene_product_ids=effective_gp_ids): total_lines += 1 @@ -246,116 +249,54 @@ def _load_go_term_map( return mapping def _load_eco_mapping(self, p: LoadQuickGOAnnotationsPayload, emit: EmitFn) -> dict[str, str]: - """Download and parse gaf-eco-mapping-derived.txt → {ECO:XXXXXXX: CODE}.""" + """Delegate to the protea-sources QuickGoSource auxiliary fetch. + + D-MIGR-05 of F2A.6-real: ECO mapping is a separate small-file + fetch (single shot, in-memory dict). The operation calls it + once before iterating ``_stream_quickgo`` and caches the + result for the duration of the load. + """ if not p.eco_mapping_url: return {} - emit("load_quickgo_annotations.eco_mapping_start", None, {"url": p.eco_mapping_url}, "info") - resp = requests.get(p.eco_mapping_url, timeout=60) - resp.raise_for_status() - mapping: dict[str, str] = {} - for line in resp.text.splitlines(): - parts = line.strip().split() - if len(parts) >= 2 and parts[0].startswith("ECO:"): - mapping[parts[0]] = parts[1] - emit("load_quickgo_annotations.eco_mapping_done", None, {"entries": len(mapping)}, "info") - return mapping + from protea_sources.quickgo import plugin as quickgo_plugin - def _stream_quickgo( - self, - p: LoadQuickGOAnnotationsPayload, - emit: EmitFn, - gene_product_ids: list[str] | None = None, - ) -> Iterator[dict[str, str]]: - effective_ids = gene_product_ids or p.gene_product_ids - - # If no ID filter, do a single unbatched request - if not effective_ids: - yield from self._fetch_quickgo_page( - p, emit, gp_ids=None, batch_index=0, total_batches=1 - ) - return - - # Batch accessions to avoid URL length limits (QuickGO returns 400 for very long URLs) - batches = [ - effective_ids[i : i + p.gene_product_batch_size] - for i in range(0, len(effective_ids), p.gene_product_batch_size) - ] - total_batches = len(batches) - emit( - "load_quickgo_annotations.batching", - None, - { - "total_accessions": len(effective_ids), - "total_batches": total_batches, - "batch_size": p.gene_product_batch_size, - }, - "info", + return quickgo_plugin.fetch_eco_mapping( + EcoMappingPayload(url=p.eco_mapping_url), + emit=emit, ) - for batch_index, batch in enumerate(batches): - yield from self._fetch_quickgo_page( - p, emit, gp_ids=batch, batch_index=batch_index, total_batches=total_batches - ) - - def _fetch_quickgo_page( + def _stream_quickgo( self, p: LoadQuickGOAnnotationsPayload, emit: EmitFn, - gp_ids: list[str] | None, - batch_index: int, - total_batches: int, - ) -> Iterator[dict[str, str]]: - params: dict[str, 
Any] = {"geneProductType": "protein"} - if gp_ids: - params["geneProductId"] = ",".join(gp_ids) - - headers = { - "Accept": "text/tsv", - "User-Agent": "PROTEA/load_quickgo_annotations", - } - emit( - "load_quickgo_annotations.download_start", - None, - { - "batch": batch_index + 1, - "of": total_batches, - "accessions_in_batch": len(gp_ids) if gp_ids else "all", - "_progress_current": batch_index + 1, - "_progress_total": total_batches, - }, - "info", - ) + gene_product_ids: list[str] | None = None, + ) -> Iterator[QuickGoAnnotationRecord]: + """Delegate to the protea-sources QuickGoSource plugin. + + The plugin owns HTTP, TSV header detection, multi-batch URL + construction (to dodge QuickGO's 400-on-long-URL response), and + record construction. The operation owns DB filtering, GO term + resolution, ECO map application, and bulk insert. See + ``f2a6_real_migration_design.md`` (D-MIGR-01, D-MIGR-02, + D-MIGR-05). + """ + from protea_sources.quickgo import plugin as quickgo_plugin - resp = requests.get( - p.quickgo_base_url, - params=params, - headers=headers, - stream=True, - timeout=p.timeout_seconds, + effective_ids = gene_product_ids or p.gene_product_ids + yield from quickgo_plugin.stream( + QuickGoStreamPayload( + quickgo_base_url=p.quickgo_base_url, + gene_product_ids=effective_ids, + gene_product_batch_size=p.gene_product_batch_size, + timeout_seconds=p.timeout_seconds, + ), + emit=emit, ) - resp.raise_for_status() - - resp.raw.decode_content = True - stream = io.TextIOWrapper(resp.raw, encoding="utf-8", errors="replace") - - header: list[str] | None = None - with stream: - for raw in stream: - line = raw.rstrip("\n") - if not line: - continue - parts = line.split("\t") - if header is None: - header = parts - continue - if len(parts) < len(header): - continue - yield dict(zip(header, parts, strict=False)) def _store_buffer( self, session: Session, - records: list[dict[str, str]], + records: list[QuickGoAnnotationRecord], annotation_set_id: uuid.UUID, valid_accessions: set[str], go_term_map: dict[str, int], @@ -364,32 +305,31 @@ def _store_buffer( to_add: list[dict] = [] skipped = 0 - for row in records: - accession = row.get("GENE PRODUCT ID", "").strip() - if not accession or accession not in valid_accessions: + for rec in records: + if rec.accession not in valid_accessions: skipped += 1 continue - go_id = row.get("GO TERM", "").strip() - go_term_id = go_term_map.get(go_id) + go_term_id = go_term_map.get(rec.go_id) if go_term_id is None: skipped += 1 continue - eco_id = row.get("ECO ID", "").strip() or None - evidence_code = eco_map.get(eco_id, eco_id) if eco_id else None + evidence_code = ( + eco_map.get(rec.eco_id, rec.eco_id) if rec.eco_id else None + ) to_add.append( { "annotation_set_id": annotation_set_id, - "protein_accession": accession, + "protein_accession": rec.accession, "go_term_id": go_term_id, - "qualifier": row.get("QUALIFIER", "").strip() or None, + "qualifier": rec.qualifier, "evidence_code": evidence_code, - "assigned_by": row.get("ASSIGNED BY", "").strip() or None, - "db_reference": row.get("REFERENCE", "").strip() or None, - "with_from": row.get("WITH/FROM", "").strip() or None, - "annotation_date": row.get("DATE", "").strip() or None, + "assigned_by": rec.assigned_by, + "db_reference": rec.db_reference, + "with_from": rec.with_from, + "annotation_date": rec.annotation_date, } ) diff --git a/tests/test_load_quickgo_annotations.py b/tests/test_load_quickgo_annotations.py index bac7657..4e763f4 100644 --- a/tests/test_load_quickgo_annotations.py +++ 
b/tests/test_load_quickgo_annotations.py @@ -15,54 +15,37 @@ _noop_emit = lambda *_: None # noqa: E731 _SNAPSHOT_ID = str(uuid.uuid4()) -# Simulates a QuickGO TSV response (header + 3 rows) -_QUICKGO_ROWS = [ - { - "GENE PRODUCT DB": "UniProtKB", - "GENE PRODUCT ID": "P12345", - "SYMBOL": "GENE1", - "QUALIFIER": "enables", - "GO TERM": "GO:0003824", - "GO ASPECT": "F", - "ECO ID": "ECO:0000314", - "REFERENCE": "PMID:123", - "WITH/FROM": "", - "TAXON ID": "9606", - "ASSIGNED BY": "UniProt", - "ANNOTATION EXTENSION": "", - "DATE": "20240101", - }, - { - "GENE PRODUCT DB": "UniProtKB", - "GENE PRODUCT ID": "Q67890", - "SYMBOL": "GENE2", - "QUALIFIER": "involved_in", - "GO TERM": "GO:0008150", - "GO ASPECT": "P", - "ECO ID": "ECO:0000501", - "REFERENCE": "PMID:456", - "WITH/FROM": "", - "TAXON ID": "9606", - "ASSIGNED BY": "UniProt", - "ANNOTATION EXTENSION": "", - "DATE": "20240101", - }, - { - "GENE PRODUCT DB": "UniProtKB", - "GENE PRODUCT ID": "XXXXXX", - "SYMBOL": "UNKNOWN", - "QUALIFIER": "enables", - "GO TERM": "GO:0003824", - "GO ASPECT": "F", - "ECO ID": "ECO:0000314", - "REFERENCE": "PMID:789", - "WITH/FROM": "", - "TAXON ID": "9606", - "ASSIGNED BY": "UniProt", - "ANNOTATION EXTENSION": "", - "DATE": "20240101", - }, -] + +def _record( + accession: str = "P12345", + go_id: str = "GO:0003824", + qualifier: str | None = "enables", + eco_id: str | None = "ECO:0000314", + db_reference: str | None = "PMID:123", + with_from: str | None = None, + assigned_by: str | None = "UniProt", + annotation_date: str | None = "20240101", +): + from protea_contracts import QuickGoAnnotationRecord + + return QuickGoAnnotationRecord( + accession=accession, + go_id=go_id, + qualifier=qualifier, + eco_id=eco_id, + db_reference=db_reference, + with_from=with_from, + assigned_by=assigned_by, + annotation_date=annotation_date, + ) + + +def _quickgo_records() -> list: + return [ + _record("P12345", "GO:0003824", eco_id="ECO:0000314"), + _record("Q67890", "GO:0008150", qualifier="involved_in", eco_id="ECO:0000501"), + _record("XXXXXX", "GO:0003824", eco_id="ECO:0000314"), + ] class TestLoadQuickGOAnnotationsPayload: @@ -99,7 +82,7 @@ def test_skips_unknown_accession(self) -> None: session = MagicMock() inserted, skipped = op._store_buffer( session, - _QUICKGO_ROWS, + _quickgo_records(), uuid.UUID(_SNAPSHOT_ID), valid_accessions={"P12345"}, go_term_map={"GO:0003824": 1, "GO:0008150": 2}, @@ -113,7 +96,7 @@ def test_skips_unknown_go_term(self) -> None: session = MagicMock() inserted, skipped = op._store_buffer( session, - _QUICKGO_ROWS, + _quickgo_records(), uuid.UUID(_SNAPSHOT_ID), valid_accessions={"P12345", "Q67890", "XXXXXX"}, go_term_map={}, @@ -127,7 +110,7 @@ def test_inserts_all_valid(self) -> None: session = MagicMock() inserted, skipped = op._store_buffer( session, - _QUICKGO_ROWS, + _quickgo_records(), uuid.UUID(_SNAPSHOT_ID), valid_accessions={"P12345", "Q67890", "XXXXXX"}, go_term_map={"GO:0003824": 1, "GO:0008150": 2}, @@ -143,7 +126,7 @@ def test_eco_mapping_applied(self) -> None: eco_map = {"ECO:0000314": "IDA", "ECO:0000501": "IEA"} inserted, _ = op._store_buffer( session, - _QUICKGO_ROWS[:1], + [_quickgo_records()[0]], uuid.UUID(_SNAPSHOT_ID), valid_accessions={"P12345"}, go_term_map={"GO:0003824": 1}, @@ -161,7 +144,7 @@ def test_raw_eco_stored_when_no_mapping(self) -> None: session = MagicMock() inserted, _ = op._store_buffer( session, - _QUICKGO_ROWS[:1], + [_quickgo_records()[0]], uuid.UUID(_SNAPSHOT_ID), valid_accessions={"P12345"}, go_term_map={"GO:0003824": 1}, @@ -177,11 +160,10 @@ def 
test_raw_eco_stored_when_no_mapping(self) -> None: def test_empty_eco_id_becomes_none(self) -> None: op = self._op() session = MagicMock() - row = dict(_QUICKGO_ROWS[0]) - row["ECO ID"] = "" + rec = _record(eco_id=None) inserted, _ = op._store_buffer( session, - [row], + [rec], uuid.UUID(_SNAPSHOT_ID), valid_accessions={"P12345"}, go_term_map={"GO:0003824": 1}, @@ -189,14 +171,17 @@ def test_empty_eco_id_becomes_none(self) -> None: ) assert inserted == 1 - def test_empty_accession_skipped(self) -> None: + def test_unknown_accession_skipped(self) -> None: + # Whitespace handling moved to the parser in protea-sources + # (parse_quickgo_row strips and returns None for empty cells). + # Here we exercise the operation's filter against + # valid_accessions with an accession not in the universe. op = self._op() session = MagicMock() - row = dict(_QUICKGO_ROWS[0]) - row["GENE PRODUCT ID"] = " " + rec = _record(accession="UNRELATED") inserted, skipped = op._store_buffer( session, - [row], + [rec], uuid.UUID(_SNAPSHOT_ID), valid_accessions={"P12345"}, go_term_map={"GO:0003824": 1}, @@ -209,7 +194,7 @@ def test_chunked_insert_large_buffer(self) -> None: """When to_add > 5000, session.execute is called multiple times.""" op = self._op() session = MagicMock() - records = [dict(_QUICKGO_ROWS[0])] * 5001 + records = [_record() for _ in range(5001)] inserted, skipped = op._store_buffer( session, records, @@ -316,7 +301,7 @@ def test_no_url_returns_empty(self) -> None: ) assert op._load_eco_mapping(p, _noop_emit) == {} - @patch("protea.core.operations.load_quickgo_annotations.requests.get") + @patch("protea_sources.quickgo.requests.get") def test_parses_mapping_file(self, mock_get) -> None: resp = MagicMock() resp.text = "ECO:0000314 IDA\nECO:0000501 IEA\n# comment\nbadline\n" @@ -334,7 +319,7 @@ def test_parses_mapping_file(self, mock_get) -> None: result = op._load_eco_mapping(p, _noop_emit) assert result == {"ECO:0000314": "IDA", "ECO:0000501": "IEA"} - @patch("protea.core.operations.load_quickgo_annotations.requests.get") + @patch("protea_sources.quickgo.requests.get") def test_http_error_raises(self, mock_get) -> None: resp = MagicMock() resp.raise_for_status.side_effect = requests.HTTPError("404") @@ -351,7 +336,7 @@ def test_http_error_raises(self, mock_get) -> None: with pytest.raises(requests.HTTPError): op._load_eco_mapping(p, _noop_emit) - @patch("protea.core.operations.load_quickgo_annotations.requests.get") + @patch("protea_sources.quickgo.requests.get") def test_emits_start_and_done(self, mock_get) -> None: resp = MagicMock() resp.text = "ECO:0000314 IDA\n" @@ -372,10 +357,10 @@ def emit(event, msg, fields, level): return events.append(event) op._load_eco_mapping(p, emit) - assert "load_quickgo_annotations.eco_mapping_start" in events - assert "load_quickgo_annotations.eco_mapping_done" in events + assert "source.quickgo.eco_mapping_start" in events + assert "source.quickgo.eco_mapping_done" in events - @patch("protea.core.operations.load_quickgo_annotations.requests.get") + @patch("protea_sources.quickgo.requests.get") def test_ignores_non_eco_lines(self, mock_get) -> None: resp = MagicMock() resp.text = "ECO:0000314 IDA\nNOT_ECO stuff\n \nECO:0000501 IEA\n" @@ -432,140 +417,6 @@ def _make_stream_response(text: str, status_code: int = 200) -> MagicMock: return resp -class TestFetchQuickgoPage: - def _payload(self, **kw): - return LoadQuickGOAnnotationsPayload.model_validate( - { - "ontology_snapshot_id": _SNAPSHOT_ID, - "source_version": "v1", - **kw, - } - ) - - 
@patch("protea.core.operations.load_quickgo_annotations.requests.get") - def test_parses_rows(self, mock_get) -> None: - tsv = _make_tsv_text( - _tsv_row_str("P12345", "GO:0005634"), - _tsv_row_str("Q99999", "GO:0008150"), - ) - mock_get.return_value = _make_stream_response(tsv) - - op = LoadQuickGOAnnotationsOperation() - records = list( - op._fetch_quickgo_page( - self._payload(), _noop_emit, gp_ids=["P12345"], batch_index=0, total_batches=1 - ) - ) - assert len(records) == 2 - assert records[0]["GENE PRODUCT ID"] == "P12345" - assert records[1]["GO TERM"] == "GO:0008150" - - @patch("protea.core.operations.load_quickgo_annotations.requests.get") - def test_skips_empty_lines(self, mock_get) -> None: - tsv = QUICKGO_HEADER_LINE + "\n\n" + _tsv_row_str() + "\n\n" - mock_get.return_value = _make_stream_response(tsv) - - op = LoadQuickGOAnnotationsOperation() - records = list( - op._fetch_quickgo_page( - self._payload(), _noop_emit, gp_ids=None, batch_index=0, total_batches=1 - ) - ) - assert len(records) == 1 - - @patch("protea.core.operations.load_quickgo_annotations.requests.get") - def test_skips_short_rows(self, mock_get) -> None: - tsv = QUICKGO_HEADER_LINE + "\ntoo\tfew\n" + _tsv_row_str() + "\n" - mock_get.return_value = _make_stream_response(tsv) - - op = LoadQuickGOAnnotationsOperation() - records = list( - op._fetch_quickgo_page( - self._payload(), _noop_emit, gp_ids=None, batch_index=0, total_batches=1 - ) - ) - assert len(records) == 1 - - @patch("protea.core.operations.load_quickgo_annotations.requests.get") - def test_http_error_raises(self, mock_get) -> None: - mock_get.return_value = _make_stream_response("", status_code=500) - - op = LoadQuickGOAnnotationsOperation() - with pytest.raises(requests.HTTPError): - list( - op._fetch_quickgo_page( - self._payload(), _noop_emit, gp_ids=None, batch_index=0, total_batches=1 - ) - ) - - @patch("protea.core.operations.load_quickgo_annotations.requests.get") - def test_sends_correct_params_with_gp_ids(self, mock_get) -> None: - mock_get.return_value = _make_stream_response(_make_tsv_text()) - - op = LoadQuickGOAnnotationsOperation() - list( - op._fetch_quickgo_page( - self._payload(), - _noop_emit, - gp_ids=["P12345", "Q99999"], - batch_index=0, - total_batches=1, - ) - ) - _, kwargs = mock_get.call_args - assert kwargs["params"]["geneProductId"] == "P12345,Q99999" - assert kwargs["params"]["geneProductType"] == "protein" - assert kwargs["headers"]["Accept"] == "text/tsv" - assert kwargs["stream"] is True - - @patch("protea.core.operations.load_quickgo_annotations.requests.get") - def test_no_gp_ids_omits_gene_product_param(self, mock_get) -> None: - mock_get.return_value = _make_stream_response(_make_tsv_text()) - - op = LoadQuickGOAnnotationsOperation() - list( - op._fetch_quickgo_page( - self._payload(), _noop_emit, gp_ids=None, batch_index=0, total_batches=1 - ) - ) - _, kwargs = mock_get.call_args - assert "geneProductId" not in kwargs["params"] - - @patch("protea.core.operations.load_quickgo_annotations.requests.get") - def test_emits_download_start_with_progress(self, mock_get) -> None: - mock_get.return_value = _make_stream_response(_make_tsv_text()) - events: list[tuple[str, dict]] = [] - - def emit(event, msg, fields, level): - return events.append((event, fields)) - - op = LoadQuickGOAnnotationsOperation() - list( - op._fetch_quickgo_page( - self._payload(), emit, gp_ids=["X"], batch_index=2, total_batches=5 - ) - ) - start_events = [e for e in events if e[0] == "load_quickgo_annotations.download_start"] - assert 
len(start_events) == 1 - assert start_events[0][1]["batch"] == 3 - assert start_events[0][1]["of"] == 5 - assert start_events[0][1]["_progress_current"] == 3 - assert start_events[0][1]["_progress_total"] == 5 - - @patch("protea.core.operations.load_quickgo_annotations.requests.get") - def test_header_only_yields_nothing(self, mock_get) -> None: - tsv = QUICKGO_HEADER_LINE + "\n" - mock_get.return_value = _make_stream_response(tsv) - - op = LoadQuickGOAnnotationsOperation() - records = list( - op._fetch_quickgo_page( - self._payload(), _noop_emit, gp_ids=None, batch_index=0, total_batches=1 - ) - ) - assert records == [] - - # --------------------------------------------------------------------------- # _stream_quickgo — batching logic # --------------------------------------------------------------------------- @@ -581,7 +432,7 @@ def _payload(self, **kw): } ) - @patch("protea.core.operations.load_quickgo_annotations.requests.get") + @patch("protea_sources.quickgo.requests.get") def test_batches_accessions(self, mock_get) -> None: mock_get.side_effect = lambda *a, **kw: _make_stream_response(_make_tsv_text()) @@ -590,7 +441,7 @@ def test_batches_accessions(self, mock_get) -> None: list(op._stream_quickgo(p, _noop_emit, gene_product_ids=["A", "B", "C", "D", "E"])) assert mock_get.call_count == 3 # 2+2+1 - @patch("protea.core.operations.load_quickgo_annotations.requests.get") + @patch("protea_sources.quickgo.requests.get") def test_no_ids_single_request(self, mock_get) -> None: mock_get.return_value = _make_stream_response(_make_tsv_text()) @@ -599,7 +450,7 @@ def test_no_ids_single_request(self, mock_get) -> None: list(op._stream_quickgo(p, _noop_emit, gene_product_ids=None)) assert mock_get.call_count == 1 - @patch("protea.core.operations.load_quickgo_annotations.requests.get") + @patch("protea_sources.quickgo.requests.get") def test_emits_batching_event(self, mock_get) -> None: mock_get.side_effect = lambda *a, **kw: _make_stream_response(_make_tsv_text()) @@ -611,13 +462,13 @@ def emit(event, msg, fields, level): op = LoadQuickGOAnnotationsOperation() p = self._payload(gene_product_batch_size=2) list(op._stream_quickgo(p, emit, gene_product_ids=["A", "B", "C"])) - batching = [e for e in events if e[0] == "load_quickgo_annotations.batching"] + batching = [e for e in events if e[0] == "source.quickgo.batching"] assert len(batching) == 1 assert batching[0][1]["total_accessions"] == 3 assert batching[0][1]["total_batches"] == 2 assert batching[0][1]["batch_size"] == 2 - @patch("protea.core.operations.load_quickgo_annotations.requests.get") + @patch("protea_sources.quickgo.requests.get") def test_yields_records_from_all_batches(self, mock_get) -> None: tsv = _make_tsv_text(_tsv_row_str("P12345")) mock_get.side_effect = lambda *a, **kw: _make_stream_response(tsv) @@ -699,7 +550,7 @@ def emit(event, msg, fields, level): assert result.result["annotations_inserted"] == 0 assert "load_quickgo_annotations.no_proteins" in events - @patch("protea.core.operations.load_quickgo_annotations.requests.get") + @patch("protea_sources.quickgo.requests.get") def test_full_run_inserts_and_skips(self, mock_get) -> None: tsv = _make_tsv_text( _tsv_row_str("P12345", "GO:0003824"), @@ -727,7 +578,7 @@ def emit(event, msg, fields, level): assert "load_quickgo_annotations.done" in events assert "load_quickgo_annotations.annotation_set_created" in events - @patch("protea.core.operations.load_quickgo_annotations.requests.get") + @patch("protea_sources.quickgo.requests.get") def test_total_limit_stops_early(self, 
mock_get) -> None: tsv = _make_tsv_text( _tsv_row_str("P12345", "GO:0003824"), @@ -754,7 +605,7 @@ def emit(event, msg, fields, level): ) assert "load_quickgo_annotations.limit_reached" in events - @patch("protea.core.operations.load_quickgo_annotations.requests.get") + @patch("protea_sources.quickgo.requests.get") def test_commit_every_page(self, mock_get) -> None: tsv = _make_tsv_text( _tsv_row_str("P12345", "GO:0003824"), @@ -775,7 +626,7 @@ def test_commit_every_page(self, mock_get) -> None: ) assert session.commit.call_count >= 2 - @patch("protea.core.operations.load_quickgo_annotations.requests.get") + @patch("protea_sources.quickgo.requests.get") def test_no_commit_when_disabled(self, mock_get) -> None: tsv = _make_tsv_text(_tsv_row_str("P12345", "GO:0003824")) mock_get.return_value = _make_stream_response(tsv) @@ -793,7 +644,7 @@ def test_no_commit_when_disabled(self, mock_get) -> None: ) session.commit.assert_not_called() - @patch("protea.core.operations.load_quickgo_annotations.requests.get") + @patch("protea_sources.quickgo.requests.get") def test_page_done_emitted(self, mock_get) -> None: tsv = _make_tsv_text( _tsv_row_str("P12345", "GO:0003824"), @@ -818,7 +669,7 @@ def emit(event, msg, fields, level): assert len(page_done) >= 1 assert result.result["pages"] == 2 - @patch("protea.core.operations.load_quickgo_annotations.requests.get") + @patch("protea_sources.quickgo.requests.get") def test_result_contains_elapsed_seconds(self, mock_get) -> None: tsv = _make_tsv_text(_tsv_row_str("P12345", "GO:0003824")) mock_get.return_value = _make_stream_response(tsv) @@ -833,7 +684,7 @@ def test_result_contains_elapsed_seconds(self, mock_get) -> None: assert "elapsed_seconds" in result.result assert result.result["elapsed_seconds"] >= 0 - @patch("protea.core.operations.load_quickgo_annotations.requests.get") + @patch("protea_sources.quickgo.requests.get") def test_use_db_accessions_false(self, mock_get) -> None: tsv = _make_tsv_text(_tsv_row_str("X00001", "GO:0003824")) mock_get.return_value = _make_stream_response(tsv) @@ -854,7 +705,7 @@ def test_use_db_accessions_false(self, mock_get) -> None: assert "X00001" in kwargs["params"]["geneProductId"] assert result.result["annotations_inserted"] == 1 - @patch("protea.core.operations.load_quickgo_annotations.requests.get") + @patch("protea_sources.quickgo.requests.get") def test_eco_mapping_integrated_in_execute(self, mock_get) -> None: eco_resp = MagicMock() eco_resp.text = "ECO:0000314 IDA\n" @@ -880,7 +731,7 @@ def test_eco_mapping_integrated_in_execute(self, mock_get) -> None: ) assert result.result["annotations_inserted"] == 1 - @patch("protea.core.operations.load_quickgo_annotations.requests.get") + @patch("protea_sources.quickgo.requests.get") def test_result_has_annotation_set_id(self, mock_get) -> None: tsv = _make_tsv_text(_tsv_row_str("P12345", "GO:0003824")) mock_get.return_value = _make_stream_response(tsv) @@ -894,7 +745,7 @@ def test_result_has_annotation_set_id(self, mock_get) -> None: result = op.execute(session, _base_payload(), emit=_noop_emit) assert "annotation_set_id" in result.result - @patch("protea.core.operations.load_quickgo_annotations.requests.get") + @patch("protea_sources.quickgo.requests.get") def test_remainder_buffer_flushed(self, mock_get) -> None: """Records that don't fill a full page are still flushed at the end.""" tsv = _make_tsv_text(_tsv_row_str("P12345", "GO:0003824")) From cffe1c39e0d5a399cd844cf574e8da31f7d3c366 Mon Sep 17 00:00:00 2001 From: frapercan Date: Wed, 6 May 2026 02:00:30 +0200 Subject: 
[PATCH 66/73] docs(reference): Doc-T3 reference/core.rst cleanup + post-F1 modules Two fixes plus an expansion of the documented module surface. Removes: - The autodoc directive for protea.core.operations.train_reranker, orphaned since T0.6 removed the file. Sphinx no longer reports a ModuleNotFoundError during build. - A broken :doc: cross-reference to a non-existent /refactoring/design-patterns/flyweight page in the protea.core.annotation_intern module docstring. Replaced with plain text "Flyweight-style". Adds documentation for modules introduced or moved during F0 / F1: - protea.core.contracts.parent_progress (T0.7 dedup helper). - protea.core.retry (T0.3 retry middleware). - protea.core.operation_catalog (singleton OperationRegistry builder). - protea.core.training_dump_helpers (T0.6 home of helpers that survived the train_reranker.py deletion; reused by ExportResearchDatasetOperation). - An "Internal helpers" section covering protea.core.{ anc2vec_embeddings, annotation_intern, disk_cache, feature_enricher, pca_cache} for completeness. Build verification: poetry run sphinx-build returns "build succeeded, 5 warnings". Of those, 4 are pre-existing environmental failures (numpy._core.multiarray import error during autodoc of modules that import numpy, plus the cosmetic _static directory missing). The previously-introduced train_reranker and flyweight warnings are gone. Doc-T3 of the documentation lane. --- docs/source/reference/core.rst | 124 +++++++++++++++++++++++++++---- protea/core/annotation_intern.py | 5 +- 2 files changed, 111 insertions(+), 18 deletions(-) diff --git a/docs/source/reference/core.rst b/docs/source/reference/core.rst index b5779d8..767c806 100644 --- a/docs/source/reference/core.rst +++ b/docs/source/reference/core.rst @@ -44,6 +44,51 @@ dispatch time; new operations are registered at process startup in :undoc-members: :show-inheritance: +``parent_progress`` exposes the shared +``_update_parent_progress`` helper used by every coordinator +operation (``compute_embeddings``, ``predict_go_terms``) to advance +the parent job's progress as child workers finish their batches. +Extracted to its own module in F0 (T0.7) to remove duplicated copies +across coordinators. + +.. automodule:: protea.core.contracts.parent_progress + :members: + :undoc-members: + :show-inheritance: + +Retry middleware +---------------- + +``protea.core.retry`` implements the ``with_retry`` decorator used by +``BaseWorker`` to wrap the execute session against transient +database errors (deadlocks, connection drops, serialisation +failures). Exponential backoff with jitter; the maximum number of +attempts and the backoff base are controlled by +``settings.WorkerTuning.retry_max_attempts`` and +``settings.WorkerTuning.retry_backoff_base`` (see +:doc:`/appendix/configuration`). Added as part of F0 (T0.3) of the +master plan v3. + +.. automodule:: protea.core.retry + :members: + :undoc-members: + :show-inheritance: + +Operation catalogue +------------------- + +``protea.core.operation_catalog`` builds the singleton +``OperationRegistry`` that workers consult at message dispatch. The +public function ``build_operation_registry()`` instantiates each +operation class and registers it under its canonical name. Adding a +new operation is a one-line edit here plus a new module under +``protea/core/operations/``. + +.. automodule:: protea.core.operation_catalog + :members: + :undoc-members: + :show-inheritance: + Utilities --------- @@ -399,21 +444,6 @@ transactions. 
:undoc-members: :show-inheritance: -**train_reranker** *(internal helper — not registered)* - LightGBM training has been moved to - `protea-reranker-lab `_. - ``TrainRerankerOperation`` and ``TrainRerankerAutoOperation`` remain - importable but are **not** wired into the ``OperationRegistry`` — they - survive only as containers for the KNN / feature-generation helpers - that :class:`ExportResearchDatasetOperation` reuses in-process to - produce frozen dumps. New code should use ``export_research_dataset`` - + the ``/reranker-models/import`` HTTP surface instead. - -.. automodule:: protea.core.operations.train_reranker - :members: - :undoc-members: - :show-inheritance: - **export_research_dataset** Publishes the frozen re-ranker dataset (``train.parquet`` / ``eval.parquet`` / ``manifest.json``) consumed by @@ -430,6 +460,70 @@ transactions. :undoc-members: :show-inheritance: +Training-dump helpers +--------------------- + +``protea.core.training_dump_helpers`` is the home of the KNN / +feature-generation helpers that were extracted in F0 (T0.6) when +``protea.core.operations.train_reranker`` was deleted. The module is +deliberately not an operation — it is reused in-process by +:class:`ExportResearchDatasetOperation` to materialise ``train`` and +``eval`` shards before the ``parquet_export`` consolidation pass. +LightGBM training itself lives in +`protea-reranker-lab `_, +which consumes the published ``Dataset`` rows produced by +``export_research_dataset``. + +.. automodule:: protea.core.training_dump_helpers + :members: + :undoc-members: + :show-inheritance: + +Internal helpers +---------------- + +These modules are imported by the operations and the feature +engineering layer; they are documented here for completeness but are +not part of the public API. + +- ``protea.core.anc2vec_embeddings`` — anc2vec ancestry embeddings for + GO terms, used as features by the re-ranker (see ADR D19 for the + GeOKG replacement candidate). +- ``protea.core.annotation_intern`` — string interning helper for + reducing memory pressure when loading large annotation sets. +- ``protea.core.disk_cache`` — generic on-disk cache with TTL used by + the KNN reference loader and the PCA cache. +- ``protea.core.feature_enricher`` — orchestrator that combines + alignment, taxonomy and anc2vec features into a single + per-candidate row. +- ``protea.core.pca_cache`` — per-PLM PCA projection cache, used to + pre-compute the ``emb_pca`` feature family. + +.. automodule:: protea.core.anc2vec_embeddings + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: protea.core.annotation_intern + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: protea.core.disk_cache + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: protea.core.feature_enricher + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: protea.core.pca_cache + :members: + :undoc-members: + :show-inheritance: + .. seealso:: - :doc:`/architecture/operations` — narrative documentation for every diff --git a/protea/core/annotation_intern.py b/protea/core/annotation_intern.py index 184b927..e35e1d4 100644 --- a/protea/core/annotation_intern.py +++ b/protea/core/annotation_intern.py @@ -10,9 +10,8 @@ Without interning, each duplicate string allocates ~50 B in CPython, so a 5 M-row batch can carry ~500 MB of redundant string objects. -Interning collapses every duplicate to a single shared instance — a -:doc:`Flyweight `-style -intrinsic-state share. 
Python already does this implicitly for short +Interning collapses every duplicate to a single shared instance, a +Flyweight-style intrinsic-state share. Python already does this implicitly for short identifier-like literals; this module forces the same dedup for the strings that come back from the DB driver. From 434b14e22794cfcac00234b4ab196437a034ac48 Mon Sep 17 00:00:00 2001 From: frapercan Date: Wed, 6 May 2026 02:06:25 +0200 Subject: [PATCH 67/73] refactor(orm): D-MIGR-04 forward parse_isoform + compute_hash to bio_utils MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replaces the inline implementations in Protein.parse_isoform and Sequence.compute_hash with one-line forwarders to ``protea_contracts.bio_utils``. The canonical authority moves to the contracts package so the upcoming UniProt FASTA parser in ``protea-sources`` can reuse the helpers without inverting the C-stack dependency direction. Files: * protea/infrastructure/orm/models/protein/protein.py: the isoform-splitting body becomes a single delegated call; module docstring on the wrapper explains the move so future grep on "parse_isoform" lands callers in the right place. * protea/infrastructure/orm/models/sequence/sequence.py: the MD5 body becomes a delegated call; the now-unused ``import hashlib`` is removed. Behavioural parity preserved bit-for-bit: * parse_isoform("P12345") -> ("P12345", True, None) — unchanged. * parse_isoform("P12345-2") -> ("P12345", False, 2) — unchanged. * compute_hash("MKTAYIAK") -> identical 32-hex MD5 — unchanged. Existing call sites in protea/api/routers/query_sets.py, protea/api/routers/annotate.py, protea/core/operations/fetch_uniprot_metadata.py, protea/core/operations/insert_proteins.py keep working unchanged because the public API on the ORM classes is preserved (Protein .parse_isoform, Sequence.compute_hash). They will be migrated to direct imports from protea_contracts as their respective files get refactored in F2A.6-real subsequent steps. Suite: PROTEA 1128 passed, 12 skipped (= unchanged from turn 27). The 6 callsites in tests (test_insert_proteins, test_integration) exercise the wrappers transparently. Pairs with protea-contracts/18e92af which adds the canonical ``parse_isoform`` and ``compute_sequence_hash`` plus 12 unit tests in protea_contracts/bio_utils.py. Part of F2A.6-real migration plan (D-MIGR-04), prerequisite for step 3 (UniProt FASTA migration). --- .../orm/models/protein/protein.py | 18 ++++++++++-------- .../orm/models/sequence/sequence.py | 17 ++++++++++++----- 2 files changed, 22 insertions(+), 13 deletions(-) diff --git a/protea/infrastructure/orm/models/protein/protein.py b/protea/infrastructure/orm/models/protein/protein.py index 93d8186..ba9c099 100644 --- a/protea/infrastructure/orm/models/protein/protein.py +++ b/protea/infrastructure/orm/models/protein/protein.py @@ -76,15 +76,17 @@ class Protein(Base): @staticmethod def parse_isoform(accession: str) -> tuple[str, bool, int | None]: + """Parse isoform accession pattern ``"-"``. + + Forwards to :func:`protea_contracts.parse_isoform`. The + canonical implementation lives in ``protea-contracts.bio_utils`` + so the FASTA parser in ``protea-sources`` can reuse it + without inverting the C-stack dependency direction (D-MIGR-04 + of master plan v3). """ - Parse isoform accession pattern "-". 
- Returns: (canonical_accession, is_canonical, isoform_index) - """ - if "-" in accession: - left, right = accession.rsplit("-", 1) - if right.isdigit(): - return left, False, int(right) - return accession, True, None + from protea_contracts import parse_isoform as _parse_isoform + + return _parse_isoform(accession) def __repr__(self) -> str: return ( diff --git a/protea/infrastructure/orm/models/sequence/sequence.py b/protea/infrastructure/orm/models/sequence/sequence.py index 9f8d547..0dfc5e7 100644 --- a/protea/infrastructure/orm/models/sequence/sequence.py +++ b/protea/infrastructure/orm/models/sequence/sequence.py @@ -1,6 +1,5 @@ from __future__ import annotations -import hashlib from datetime import datetime from typing import TYPE_CHECKING @@ -41,10 +40,18 @@ class Sequence(Base): @staticmethod def compute_hash(seq: str) -> str: - # usedforsecurity=False: this hash is the dedup key for the - # protein sequence table, not a security primitive. MD5 collision - # resistance is irrelevant; we just need a stable 32-hex digest. - return hashlib.md5(seq.encode("utf-8"), usedforsecurity=False).hexdigest() + """Forward to :func:`protea_contracts.compute_sequence_hash`. + + The canonical implementation lives in ``protea-contracts.bio_utils`` + (D-MIGR-04 of master plan v3) so the FASTA parser in + ``protea-sources`` can populate ``UniProtProteinRecord + .sequence_hash`` without depending on PROTEA's ORM. The wrapper + keeps every existing call site (``Sequence.compute_hash(seq)``) + working unchanged. + """ + from protea_contracts import compute_sequence_hash + + return compute_sequence_hash(seq) def __init__(self, *args: object, **kwargs: object) -> None: super().__init__(*args, **kwargs) From 37379af1281404e5e581ab3106d26e20f867f7a2 Mon Sep 17 00:00:00 2001 From: frapercan Date: Wed, 6 May 2026 08:32:29 +0200 Subject: [PATCH 68/73] docs: Doc-T7 add top-level plugin author guide Adds docs/source/plugin-authoring.rst as the canonical entry point for plugin authors, and links it from the main toctree in docs/source/index.rst. Scope: - Architecture overview in one paragraph (protea-core platform plus four sibling plugin layers). - Table of the four layers (annotation sources, embedding backends, experiment runners, feature registry) with their ABC, repository and entry-point group. - Decision tree for picking the right ABC depending on what the author wants to add. - Anatomy of a plugin in 5 steps that apply uniformly across the three entry-point-driven layers, plus the in-process pattern for feature registry contributions. - Pointers to the per-repo contributing guides shipped on the doc lane: protea-backends/docs (Doc-T1) and protea-contracts/docs (Doc-T2). The protea-sources and protea-runners guides land in Doc-T8. - Discovery snippet (importlib.metadata.entry_points) that mirrors what protea-core does at startup, including the name-vs-entry-point sanity check. - Schema invariants and reproducibility section linking ADR D10 (schema_sha v2 migration) and the float16 embedding contract. - Roadmap section pointing to upcoming master-plan v3 phases that affect plugin authors (F2A.7 lightgbm absorption, F2B feature registry wiring, F2C protea-method extraction, F9 post-defense granularity decision). Build verification: poetry run sphinx-build returns "build succeeded, 5 warnings" (same 5 pre-existing warnings as before; the new page introduces zero warnings). Doc-T7 of the documentation lane. Implements F7.6 of master plan v3 ("Plugin author guide"). 
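For readers who want the five-step plugin anatomy as code before the page itself, a rough skeleton of a backend plugin module is sketched below. The name attribute, the module-level plugin instance and the entry-point group come from the guide; the class name, the file path and the method shown are placeholders, since the actual abstract surface is documented in protea-contracts, not here.

    # Hypothetical protea_backends/myplugin/__init__.py, following the guide's
    # naming convention. Placeholder only: implement whatever abstract methods
    # the real EmbeddingBackend ABC actually declares (see protea-contracts).
    from protea_contracts import EmbeddingBackend


    class MyPluginBackend(EmbeddingBackend):
        # Must equal the entry-point name; protea-core refuses to start on a mismatch.
        name = "myplugin"

        def embed_batch(self, sequences):  # placeholder method name, not the contract
            # Heavy deps (torch, transformers, ...) are imported lazily here,
            # never at module top, so plugin discovery stays import-cheap.
            raise NotImplementedError


    # Module-level instance: this is what the entry point resolves to.
    plugin = MyPluginBackend()

    # Plus one line in protea-backends' pyproject.toml:
    #   [tool.poetry.plugins."protea.backends"]
    #   myplugin = "protea_backends.myplugin:plugin"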
--- docs/source/index.rst | 1 + docs/source/plugin-authoring.rst | 202 +++++++++++++++++++++++++++++++ 2 files changed, 203 insertions(+) create mode 100644 docs/source/plugin-authoring.rst diff --git a/docs/source/index.rst b/docs/source/index.rst index 4c3b806..b188885 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -73,6 +73,7 @@ metadata enrichment, and job orchestration. introduction related_work architecture/index + plugin-authoring results appendix/index glossary diff --git a/docs/source/plugin-authoring.rst b/docs/source/plugin-authoring.rst new file mode 100644 index 0000000..fbbb615 --- /dev/null +++ b/docs/source/plugin-authoring.rst @@ -0,0 +1,202 @@ +Plugin author guide +==================== + +PROTEA is built around a plugin architecture. Annotation sources, PLM +backends, experiment runners and per-candidate features are added as +out-of-tree contributions without modifying ``protea-core``. This +page is the top-level guide for plugin authors: which abstract base +class to implement, where the implementation lives, how the platform +discovers it, and where to find the per-repo guides with concrete +templates. + +The canonical source of truth for the contracts themselves is the +`protea-contracts `_ +package and its Sphinx documentation; this page links to it +throughout. + +Architecture in one paragraph +----------------------------- + +``protea-core`` is the platform: the ORM, the FastAPI surface, the +RabbitMQ workers, the orchestration loop. Plugins live in four +sibling repositories. Each repository declares its plugins through +the Python ``entry_points`` mechanism (one mechanism, four named +groups). At startup ``protea-core`` queries +``importlib.metadata.entry_points`` for each group and loads the +plugin instances; from that moment on, every dispatch by name is a +dictionary lookup. + +The four plugin layers +---------------------- + +.. list-table:: + :header-rows: 1 + :widths: 18 30 26 26 + + * - Layer + - ABC + - Repository + - Entry-point group + * - Annotation sources + - :class:`protea_contracts.AnnotationSource` + - `protea-sources `_ + - ``protea.sources`` + * - PLM backends + - :class:`protea_contracts.EmbeddingBackend` + - `protea-backends `_ + - ``protea.backends`` + * - Experiment runners + - :class:`protea_contracts.ExperimentRunner` + - `protea-runners `_ + - ``protea.runners`` + * - Per-candidate features + - :class:`protea_contracts.FeatureRegistry` + - ``protea-core/protea/core/features/`` + - in-process registry (no entry-point group) + +Picking the right ABC +--------------------- + +**You want to ingest a new annotation source** (a database release, a +file format, a web API that produces ``ProteinGOAnnotation`` rows). +Implement ``AnnotationSource`` in ``protea-sources``. Examples +shipped today: ``goa``, ``quickgo``, ``uniprot``. + +**You want to add a new protein language model** (a HuggingFace +checkpoint, a structure-aware encoder, a distilled variant). +Implement ``EmbeddingBackend`` in ``protea-backends``. Examples +shipped today: ``esm``, ``t5``, ``ankh``, ``esm3c``. + +**You want to add a new training method** (a different boosting +algorithm, a graph neural network, a retrieval-neural ranker). +Implement ``ExperimentRunner`` in ``protea-runners``. Examples +shipped today: ``knn``, ``baseline``, ``lightgbm`` (the latter +materialises in F2A.7). + +**You want to add a feature to the re-ranker** (a new sequence +metric, a new ontology-aware embedding, a new taxonomic signal). 
+Register a :class:`protea_contracts.Feature` in +``protea-core/protea/core/features/.py``. This is in-process +and does not use ``entry_points``: the registry is gathered at import +time from a fixed list of family modules. The feature's ``family`` +field decides where it appears in the dataset schema and feeds into +``compute_schema_sha`` (see :doc:`adr/D10-schema-sha-v2`). + +If your idea fits none of these layers, it probably belongs in +``protea-core`` itself. Open an issue describing what you want to +add; the architecture review may suggest a fifth layer or surface a +hidden constraint. + +Anatomy of a plugin +------------------- + +Independent of the layer, every plugin follows the same shape: + +1. A Python module under the relevant repository, named after the + plugin (``protea_backends/myplugin/__init__.py``). +2. A class that subclasses the relevant ABC and implements the + abstract methods, with a class attribute ``name`` matching the + entry-point name. +3. A module-level instance ``plugin = MyPlugin()`` that is what the + entry-point resolves to. +4. A line under ``[tool.poetry.plugins."protea."]`` in the + repository's ``pyproject.toml``:: + + myplugin = "protea_.myplugin:plugin" + +5. A test file that exercises the contract: instance type, ABC + compliance, ``name`` attribute, discoverability via + ``entry_points(group="protea.")``, and the public method + signatures. The existing test files in each repository are good + templates. + +Heavy dependencies belong behind Poetry extras and are imported +lazily inside the method that needs them, not at module top. This +keeps plugin discovery import-cheap; ``protea-core`` does not pay for +``torch`` (or any other heavy dependency) at startup if no caller is +actually invoking the backend that uses it. + +Where to find the concrete guides +--------------------------------- + +Each plugin repository ships its own contributing guide with a +runnable template, the SemVer policy that applies to its public +surface, and CI expectations: + +- **protea-backends**: see ``docs/source/contributing.rst`` in + the repository, and the per-backend pages + (``docs/source/backends/{esm,t5,ankh,esm3c}.rst``) for examples of + how to document a backend's quirks (numerical type, pooling rule, + tokeniser idiosyncrasies). +- **protea-contracts**: see ``docs/source/contributing.rst`` for + the SemVer rules that govern when a contract change is patch, + minor or major, the procedure for adding a feature to + ``ALL_FEATURES`` (which changes the schema sha and forces booster + retraining), and the ABC additive-vs-breaking guidance. +- **protea-sources** and **protea-runners**: Sphinx scaffolding for + these is on the doc lane (Doc-T8); until it lands, the existing + README plus the ``protea-backends`` guide above are the closest + template (the patterns transfer: substitute the ABC and the + entry-point group). + +Discovery in code +----------------- + +Should you want to verify a plugin is discoverable from a Python +shell:: + + from importlib.metadata import entry_points + + eps = entry_points(group="protea.backends") + for ep in eps: + print(ep.name, "->", ep.value) + plugin = ep.load() + print(" name attr:", plugin.name) + +This is exactly what ``protea-core`` does at startup. The only thing +``protea-core`` adds is a sanity check: ``plugin.name`` must equal +``ep.name`` or the worker raises ``RuntimeError`` rather than start. +This catches typos in the entry-point declaration the only place +they could otherwise hide. 
+ +Schema invariants and reproducibility +------------------------------------- + +Plugins must respect the platform's reproducibility contract. Two +specific places this matters: + +- **Feature plugins** participate in ``compute_schema_sha``. Adding a + feature changes the digest, which is correct: existing re-ranker + boosters trained against the old digest will refuse to load + against the new one. Bump the package minor and re-train. + See :doc:`adr/D10-schema-sha-v2` for the parallel-column migration + that brings every consumer onto a single source of truth. +- **Embedding backends** must return float16 embeddings of shape + ``(batch_size, hidden_dim)``. Special tokens (``CLS``, ``EOS``, + ``BOS``, prefix tokens) must be stripped before pooling. Variations + in tokenisation policy across backends are acceptable as long as + the final pooled vector is a faithful per-protein representation. + +Both invariants are enforced by tests in ``protea-core`` and by +golden parquet bit-exact comparisons in F2 (T2B.2 of the master +plan). Breaking either is loud, not silent. + +Roadmap +------- + +Several phases of the master plan v3 directly affect plugin authors: + +- **F2A.7**: ``protea-runners.lightgbm`` absorbs the standalone + ``protea-reranker-lab`` repository as the canonical LightGBM + runner. +- **F2B**: the in-process ``FeatureRegistry`` is wired into + ``parquet_export`` and ``predict_go_terms`` so that every + registered feature flows end-to-end without manual list + maintenance. +- **F2C**: ``protea-method`` extracts the inference path as a + pure-Python package consumable without the platform; this becomes + the single shippable target for downstream adopters and for the + LAFA submission containers (F-LAFA). +- **F9** (post-defense): if third parties publish plugins, the per + group repositories may split into per-plugin repositories. See + :doc:`adr/D14-plugin-granularity`. From 56a6d875fb2f7754b50e956fba8b813d74caf319 Mon Sep 17 00:00:00 2001 From: frapercan Date: Wed, 6 May 2026 10:03:25 +0200 Subject: [PATCH 69/73] refactor(uniprot): F2A.6-real InsertProteinsOperation consumes plugin MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Third Level-1 plugin migration. InsertProteinsOperation delegates HTTP retries + cursor pagination + gzip decoding + FASTA parsing to the protea-sources UniProtSource plugin, becoming a thin persistence adapter. Pairs with protea-sources/fadbd6b (real UniProtSource.stream_fasta + _http.py) and protea-contracts/ f1bf7b5 (typed payload + record). What moved out: * _fetch_fasta_pages body: now a one-liner constructing a typed UniProtFastaStreamPayload and yielding from uniprot_plugin.stream_fasta. Renamed to _stream_fasta to reflect per-record yield (D-MIGR-01). * _decode_response method (gzip / utf-8 wrapper): plugin owns it. * _parse_fasta + _parse_header methods (~70 LOC of FASTA parsing, OS/OX/GN regex, isoform splitting via Protein.parse_isoform): plugin owns it. The OS/OX/GN regex constants and isoform logic move with them. * UNIPROT_SEARCH_URL constant: now in UniProtFastaStreamPayload.base_url default. * Removed imports: gzip, re, requests, Response, BytesIO, quote, UniProtHttpClient (legacy PROTEA-side copy stays in protea/core/utils.py until step 4 deletes it as the last caller fetch_uniprot_metadata also migrates). * Removed state: self._http_client, self._total_results. 
What stayed:

* Operation lifecycle, InsertProteinsPayload validation, batching
  policy (page_size buffer flush), session.add_all + flush against
  Protein + Sequence tables, conservative-update logic for existing
  proteins.
* _store_records (full upsert path) — now consumes UniProtProteinRecord
  via attribute access (rec.accession, rec.canonical_accession, etc.)
  instead of dict access.
* _load_existing_sequences + _load_existing_proteins (DB lookup helpers).

Behavioural diffs surfaced:

* pages now counts DB-side buffer flushes, not HTTP pages. The
  HTTP-page count is the plugin's internal concern (visible via
  source.uniprot_fasta.fetch_page_done events). pages is more useful
  for monitoring DB throughput; the change in semantics is captured in
  the relevant test docstrings.
* X-Total-Results header capture (op._total_results) is removed. The
  header was nice-to-have for progress reporting and not load-bearing
  for correctness; progress totals now flow only when the user sets
  total_limit. Operator changelog flagged.
* Plugin-emitted events use the source.uniprot_fasta.* prefix;
  operation-emitted events keep the insert_proteins.* prefix.

Tests refactored:

* TestParseFasta class deleted (~95 LOC, 11 tests). The parser is now
  in protea-sources, where parse_fasta_header + parse_fasta_text have
  full unit coverage.
* TestDecodeResponse class deleted (~25 LOC, 2 tests). Decoding lives
  in the plugin's _decode_response_body helper, exercised via the gzip
  stream wiring tests in protea-sources.
* test_total_results_from_header + test_total_results_invalid deleted
  (2 tests). The operation no longer captures X-Total-Results.
* TestStoreRecords: dict-literal record fixtures replaced with a
  _make_record(...) helper that builds UniProtProteinRecord via the
  bio_utils helpers (same MD5 hash, same canonical splitting). Two
  test bodies shrink ~17 LOC each.
* TestInsertProteinsOperationExecute: patch target swapped from
  ``op._http_client.session.get`` to
  ``op._uniprot_plugin._client.session.get`` across 16 sites.
  test_empty_page_continues renamed to test_empty_page_does_not_flush
  with the new pages=0 expectation. test_progress_emission_with_total
  renamed to test_progress_emission_with_total_limit; uses page_size=1
  + total_limit=100 to force a flush and carry the progress total.
* Net -16 PROTEA tests (parser + decode + total_results all moved or
  deleted), with a corresponding +56 in protea-sources for a strictly
  better surface.

Suite: PROTEA 1112 passed, 12 skipped (was 1128; -16 from deletions).
Ruff full + mypy strict green on touched files.

Part of F2A.6-real migration plan, step 3 (b) of 4. The legacy
UniProtHttpClient in protea/core/utils.py becomes dead code once
step 4 (UniProt metadata migration) lands; deletion deferred to that
turn.
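For the remaining step-4 migration, the same seam can be stubbed from a PROTEA-side test roughly as below. The attribute chain is the one named in the patch-target swap above; the fake response attributes and the helper itself are illustrative guesses, not copied from tests/test_insert_proteins.py.

    # Sketch: stub the plugin-owned HTTP layer from an operation-level test.
    # Assumes the chain op._uniprot_plugin._client.session.get described above;
    # the response attributes are guesses at what the plugin reads.
    from unittest.mock import MagicMock, patch

    from protea.core.operations.insert_proteins import InsertProteinsOperation

    def stub_uniprot_http(op: InsertProteinsOperation, fasta_text: str):
        resp = MagicMock()
        resp.status_code = 200
        resp.content = fasta_text.encode("utf-8")
        resp.headers = {}  # no "link" header -> no next cursor, single page
        return patch.object(op._uniprot_plugin._client.session, "get", return_value=resp)

    # Usage inside a test:
    #   op = InsertProteinsOperation()
    #   with stub_uniprot_http(op, ">sp|P12345|NAME_HUMAN OS=Homo sapiens OX=9606\nMKT\n"):
    #       result = op.execute(session, payload, emit=_noop_emit)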
--- protea/core/operations/insert_proteins.py | 382 ++++++++-------------- tests/test_insert_proteins.py | 294 ++++------------- 2 files changed, 205 insertions(+), 471 deletions(-) diff --git a/protea/core/operations/insert_proteins.py b/protea/core/operations/insert_proteins.py index e1aacf1..cdad485 100644 --- a/protea/core/operations/insert_proteins.py +++ b/protea/core/operations/insert_proteins.py @@ -1,21 +1,16 @@ from __future__ import annotations -import gzip -import re import time -from collections.abc import Iterable +from collections.abc import Iterator from collections.abc import Sequence as Seq -from io import BytesIO from typing import Annotated, Any -from urllib.parse import quote -import requests +from protea_contracts import UniProtFastaStreamPayload, UniProtProteinRecord from pydantic import Field, field_validator -from requests import Response from sqlalchemy.orm import Session from protea.core.contracts.operation import EmitFn, Operation, OperationResult, ProteaPayload -from protea.core.utils import UniProtHttpClient, chunks +from protea.core.utils import chunks from protea.infrastructure.orm.models.protein.protein import Protein from protea.infrastructure.orm.models.sequence.sequence import Sequence as SequenceModel @@ -58,7 +53,6 @@ class InsertProteinsOperation(Operation): "Fetch protein sequences from UniProt (FASTA, cursor-paginated) and upsert " "Protein + Sequence rows; isoforms are stored grouped by canonical accession." ) - UNIPROT_SEARCH_URL = "https://rest.uniprot.org/uniprotkb/search" def summarize_payload(self, payload: dict[str, Any]) -> str: criteria = (payload or {}).get("search_criteria") @@ -73,20 +67,17 @@ def summarize_payload(self, payload: dict[str, Any]) -> str: bits.append(f"limit={limit}") return " · ".join(bits) - _re_os = re.compile(r"\bOS=([^=]+?)\sOX=") - _re_ox = re.compile(r"\bOX=(\d+)") - _re_gn = re.compile(r"\bGN=([^\s]+)") - def __init__(self) -> None: - self._http_client = UniProtHttpClient() - self._total_results: int | None = None + # Plugin instance is reused across executions for connection + # pooling; counters are reset by the plugin at the start of + # each ``stream_fasta`` call. + from protea_sources.uniprot import plugin as _uniprot_plugin + + self._uniprot_plugin = _uniprot_plugin def execute( self, session: Session, payload: dict[str, Any], *, emit: EmitFn ) -> OperationResult: - self._http_client.reset() - self._total_results = None - p = InsertProteinsPayload.model_validate(payload) t0 = time.perf_counter() @@ -105,214 +96,111 @@ def execute( sequences_inserted = 0 sequences_reused = 0 - for page_idx, records in enumerate(self._fetch_fasta_pages(p, emit), start=1): - pages = page_idx - if not records: - continue - - if p.total_limit is not None and (retrieved + len(records)) > p.total_limit: - records = records[: max(0, p.total_limit - retrieved)] - if not records: + # Buffer per-record into operation-controlled pages of size + # ``p.page_size``. The plugin yields one record at a time + # (D-MIGR-01); the operation owns batching policy. 
+ buffer: list[UniProtProteinRecord] = [] + for record in self._stream_fasta(p, emit): + if p.total_limit is not None and retrieved >= p.total_limit: + emit( + "insert_proteins.limit_reached", + None, + {"total_limit": p.total_limit}, + "warning", + ) break - retrieved += len(records) - isoforms += sum(1 for r in records if r["isoform_index"] is not None) + buffer.append(record) + retrieved += 1 + if record.isoform_index is not None: + isoforms += 1 + + if len(buffer) >= p.page_size: + pages += 1 + ins_p, upd_p, ins_s, re_s = self._store_records(session, buffer, emit) + proteins_inserted += ins_p + proteins_updated += upd_p + sequences_inserted += ins_s + sequences_reused += re_s + buffer.clear() + + http_req, http_ret = self._uniprot_plugin.http_counters + emit( + "insert_proteins.page_done", + None, + { + "page": pages, + "retrieved_total": retrieved, + "proteins_inserted_total": proteins_inserted, + "proteins_updated_total": proteins_updated, + "sequences_inserted_total": sequences_inserted, + "sequences_reused_total": sequences_reused, + "http_requests": http_req, + "http_retries": http_ret, + "_progress_current": retrieved, + **( + {"_progress_total": p.total_limit} + if p.total_limit + else {} + ), + }, + "info", + ) - ins_p, upd_p, ins_s, re_s = self._store_records(session, records, emit) + # Flush remaining buffer. + if buffer: + pages += 1 + ins_p, upd_p, ins_s, re_s = self._store_records(session, buffer, emit) proteins_inserted += ins_p proteins_updated += upd_p sequences_inserted += ins_s sequences_reused += re_s - emit( - "insert_proteins.page_done", - None, - { - "page": page_idx, - "retrieved_total": retrieved, - "proteins_inserted_total": proteins_inserted, - "proteins_updated_total": proteins_updated, - "sequences_inserted_total": sequences_inserted, - "sequences_reused_total": sequences_reused, - "http_requests": self._http_client.requests, - "http_retries": self._http_client.retries, - "_progress_current": retrieved, - **( - {"_progress_total": p.total_limit or self._total_results} - if (p.total_limit or self._total_results) - else {} - ), - }, - "info", - ) - - if p.total_limit is not None and retrieved >= p.total_limit: - emit( - "insert_proteins.limit_reached", None, {"total_limit": p.total_limit}, "warning" - ) - break - elapsed = time.perf_counter() - t0 - emit( - "insert_proteins.done", - None, - { - "pages": pages, - "retrieved_records": retrieved, - "isoform_records": isoforms, - "proteins_inserted": proteins_inserted, - "proteins_updated": proteins_updated, - "sequences_inserted": sequences_inserted, - "sequences_reused": sequences_reused, - "http_requests": self._http_client.requests, - "http_retries": self._http_client.retries, - "elapsed_seconds": elapsed, - }, - "info", - ) - - return OperationResult( - result={ - "pages": pages, - "retrieved_records": retrieved, - "isoform_records": isoforms, - "proteins_inserted": proteins_inserted, - "proteins_updated": proteins_updated, - "sequences_inserted": sequences_inserted, - "sequences_reused": sequences_reused, - "http_requests": self._http_client.requests, - "http_retries": self._http_client.retries, - "elapsed_seconds": elapsed, - } - ) + http_req, http_ret = self._uniprot_plugin.http_counters + result_dict = { + "pages": pages, + "retrieved_records": retrieved, + "isoform_records": isoforms, + "proteins_inserted": proteins_inserted, + "proteins_updated": proteins_updated, + "sequences_inserted": sequences_inserted, + "sequences_reused": sequences_reused, + "http_requests": http_req, + "http_retries": http_ret, 
+ "elapsed_seconds": elapsed, + } + emit("insert_proteins.done", None, result_dict, "info") + return OperationResult(result=result_dict) - # ---- HTTP paging ---- - def _fetch_fasta_pages( + def _stream_fasta( self, p: InsertProteinsPayload, emit: EmitFn - ) -> Iterable[list[dict[str, Any]]]: - encoded_query = quote(p.search_criteria) - params = ["format=fasta", f"query={encoded_query}", f"size={p.page_size}"] - if p.include_isoforms: - params.append("includeIsoform=true") - if p.compressed: - params.append("compressed=true") - - base_url = f"{self.UNIPROT_SEARCH_URL}?{'&'.join(params)}" - next_cursor: str | None = None - page = 0 - - while True: - page += 1 - url = base_url if not next_cursor else f"{base_url}&cursor={next_cursor}" - emit( - "uniprot.fetch_page_start", - None, - {"page": page, "has_cursor": bool(next_cursor)}, - "info", - ) - - resp = self._http_client.get_with_retries(url, p, emit) - if self._total_results is None: - try: - self._total_results = int(resp.headers.get("X-Total-Results", 0)) or None - except (ValueError, TypeError): - pass - text = self._decode_response(resp, p.compressed) - records = self._parse_fasta(text) - - emit("uniprot.fetch_page_done", None, {"page": page, "records": len(records)}, "info") - yield records - - next_cursor = self._http_client.extract_next_cursor(resp.headers.get("link", "")) - if not next_cursor: - break - - def _decode_response(self, resp: Response, compressed: bool) -> str: - content = resp.content - if compressed: - with gzip.GzipFile(fileobj=BytesIO(content)) as f: - return f.read().decode("utf-8", errors="replace") - return content.decode("utf-8", errors="replace") - - # ---- FASTA parsing ---- - def _parse_fasta(self, fasta_text: str) -> list[dict[str, Any]]: - records: list[dict[str, Any]] = [] - header: str | None = None - seq_lines: list[str] = [] - - def flush() -> None: - nonlocal header, seq_lines - if not header: - return - seq = "".join(seq_lines).replace(" ", "").strip() - if not seq: - header = None - seq_lines = [] - return - - parsed = self._parse_header(header) - parsed["sequence"] = seq - parsed["length"] = len(seq) - parsed["sequence_hash"] = SequenceModel.compute_hash(seq) - records.append(parsed) - - header = None - seq_lines = [] - - for line in fasta_text.splitlines(): - line = line.strip() - if not line: - continue - if line.startswith(">"): - flush() - header = line[1:] - else: - seq_lines.append(line) - flush() - return records - - def _parse_header(self, header: str) -> dict[str, Any]: - parts = header.split("|") - reviewed = header.startswith("sp|") - - if len(parts) >= 3: - accession = parts[1].strip() - entry_name = parts[2].split(" ", 1)[0].strip() - else: - accession = header.split(" ", 1)[0].strip() - entry_name = None - - canonical, is_canonical, iso_idx = Protein.parse_isoform(accession) - - organism = None - taxonomy_id = None - gene_name = None - - m = self._re_os.search(header) - if m: - organism = m.group(1).strip() - m = self._re_ox.search(header) - if m: - taxonomy_id = m.group(1).strip() - m = self._re_gn.search(header) - if m: - gene_name = m.group(1).strip() - - return { - "accession": accession, - "entry_name": entry_name, - "canonical_accession": canonical, - "is_canonical": is_canonical, - "isoform_index": iso_idx, - "organism": organism, - "taxonomy_id": taxonomy_id, - "gene_name": gene_name, - "reviewed": reviewed, - } + ) -> Iterator[UniProtProteinRecord]: + """Delegate to the protea-sources UniProtSource plugin. 
+ + Plugin owns HTTP retries, cursor pagination, gzip decoding, + and FASTA parsing. The operation owns batching, dedup, and + bulk insert. See ``f2a6_real_migration_design.md``. + """ + yield from self._uniprot_plugin.stream_fasta( + UniProtFastaStreamPayload( + search_criteria=p.search_criteria, + page_size=p.page_size, + timeout_seconds=p.timeout_seconds, + include_isoforms=p.include_isoforms, + compressed=p.compressed, + max_retries=p.max_retries, + backoff_base_seconds=p.backoff_base_seconds, + backoff_max_seconds=p.backoff_max_seconds, + jitter_seconds=p.jitter_seconds, + user_agent=p.user_agent, + ), + emit=emit, + ) # ---- DB storage ---- def _store_records( - self, session: Session, records: list[dict[str, Any]], emit: EmitFn + self, session: Session, records: list[UniProtProteinRecord], emit: EmitFn ) -> tuple[int, int, int, int]: if not records: return 0, 0, 0, 0 @@ -320,9 +208,8 @@ def _store_records( # 1) Deduplicate sequences hash_to_seq: dict[str, str] = {} for r in records: - h = r["sequence_hash"] - if h not in hash_to_seq: - hash_to_seq[h] = r["sequence"] + if r.sequence_hash not in hash_to_seq: + hash_to_seq[r.sequence_hash] = r.sequence unique_hashes = list(hash_to_seq.keys()) emit("db.lookup_sequences_start", None, {"count": len(unique_hashes)}, "info") @@ -352,7 +239,7 @@ def _store_records( emit("db.insert_sequences_done", None, {"rows": sequences_inserted}, "info") # 2) Load existing proteins - accessions = [r["accession"] for r in records] + accessions = [r.accession for r in records] existing_prot = self._load_existing_proteins(session, accessions) # 3) Upsert proteins (insert new, conservative update existing) @@ -361,51 +248,50 @@ def _store_records( to_add: list[Protein] = [] for r in records: - acc = r["accession"] - seq_id = existing_seq_ids[r["sequence_hash"]] + seq_id = existing_seq_ids[r.sequence_hash] - if acc in existing_prot: - p = existing_prot[acc] + if r.accession in existing_prot: + p = existing_prot[r.accession] changed = False if getattr(p, "sequence_id", None) is None and seq_id is not None: p.sequence_id = seq_id changed = True - if getattr(p, "entry_name", None) in (None, "") and r.get("entry_name"): - p.entry_name = r["entry_name"] + if getattr(p, "entry_name", None) in (None, "") and r.entry_name: + p.entry_name = r.entry_name changed = True - if getattr(p, "canonical_accession", None) != r["canonical_accession"]: - p.canonical_accession = r["canonical_accession"] + if getattr(p, "canonical_accession", None) != r.canonical_accession: + p.canonical_accession = r.canonical_accession changed = True - if getattr(p, "is_canonical", None) != r["is_canonical"]: - p.is_canonical = r["is_canonical"] + if getattr(p, "is_canonical", None) != r.is_canonical: + p.is_canonical = r.is_canonical changed = True - if getattr(p, "isoform_index", None) != r["isoform_index"]: - p.isoform_index = r["isoform_index"] + if getattr(p, "isoform_index", None) != r.isoform_index: + p.isoform_index = r.isoform_index changed = True - if getattr(p, "reviewed", None) is None and r.get("reviewed") is not None: - p.reviewed = r["reviewed"] + if getattr(p, "reviewed", None) is None: + p.reviewed = r.reviewed changed = True - if getattr(p, "taxonomy_id", None) in (None, "") and r.get("taxonomy_id"): - p.taxonomy_id = r["taxonomy_id"] + if getattr(p, "taxonomy_id", None) in (None, "") and r.taxonomy_id: + p.taxonomy_id = r.taxonomy_id changed = True - if getattr(p, "organism", None) in (None, "") and r.get("organism"): - p.organism = r["organism"] + if getattr(p, "organism", 
None) in (None, "") and r.organism: + p.organism = r.organism changed = True - if getattr(p, "gene_name", None) in (None, "") and r.get("gene_name"): - p.gene_name = r["gene_name"] + if getattr(p, "gene_name", None) in (None, "") and r.gene_name: + p.gene_name = r.gene_name changed = True - if getattr(p, "length", None) is None and r.get("length"): - p.length = int(r["length"]) + if getattr(p, "length", None) is None: + p.length = r.length changed = True if changed: @@ -414,16 +300,16 @@ def _store_records( else: to_add.append( Protein( - accession=acc, - canonical_accession=r["canonical_accession"], - is_canonical=r["is_canonical"], - isoform_index=r["isoform_index"], - reviewed=r.get("reviewed"), - entry_name=r.get("entry_name"), - organism=r.get("organism"), - taxonomy_id=r.get("taxonomy_id"), - gene_name=r.get("gene_name"), - length=int(r["length"]) if r.get("length") else None, + accession=r.accession, + canonical_accession=r.canonical_accession, + is_canonical=r.is_canonical, + isoform_index=r.isoform_index, + reviewed=r.reviewed, + entry_name=r.entry_name, + organism=r.organism, + taxonomy_id=r.taxonomy_id, + gene_name=r.gene_name, + length=r.length, sequence_id=seq_id, ) ) diff --git a/tests/test_insert_proteins.py b/tests/test_insert_proteins.py index 5fb4e7a..e535f40 100644 --- a/tests/test_insert_proteins.py +++ b/tests/test_insert_proteins.py @@ -66,6 +66,32 @@ def _make_mock_session(): return session +def _make_record( + accession: str = "P12345", + sequence: str = "MKTAYIAK", + is_canonical: bool = True, + isoform_index: int | None = None, + canonical_accession: str | None = None, +): + """Build a UniProtProteinRecord for store-records testing.""" + from protea_contracts import UniProtProteinRecord, compute_sequence_hash + + return UniProtProteinRecord( + accession=accession, + entry_name="TEST_HUMAN", + canonical_accession=canonical_accession or accession, + is_canonical=is_canonical, + isoform_index=isoform_index, + organism="Homo sapiens", + taxonomy_id="9606", + gene_name="TEST", + reviewed=True, + sequence=sequence, + length=len(sequence), + sequence_hash=compute_sequence_hash(sequence), + ) + + # --------------------------------------------------------------------------- # Unit tests — InsertProteinsPayload # --------------------------------------------------------------------------- @@ -120,131 +146,6 @@ def test_search_criteria_stripped(self): assert p.search_criteria == "q" -# --------------------------------------------------------------------------- -# Unit tests — _parse_fasta / _parse_header -# --------------------------------------------------------------------------- - - -class TestParseFasta: - def setup_method(self): - self.op = InsertProteinsOperation() - - def test_parses_single_record(self): - records = self.op._parse_fasta(FASTA_ONE) - assert len(records) == 1 - r = records[0] - assert r["accession"] == "P12345" - assert r["reviewed"] is True - assert r["organism"] == "Homo sapiens" - assert r["taxonomy_id"] == "9606" - assert r["gene_name"] == "TEST" - assert len(r["sequence"]) > 0 - assert r["length"] == len(r["sequence"]) - - def test_parses_multiple_records(self): - records = self.op._parse_fasta(FASTA_TWO) - assert len(records) == 2 - assert records[1]["accession"] == "Q99999" - assert records[1]["reviewed"] is False - assert records[1]["taxonomy_id"] == "10090" - - def test_empty_fasta_returns_empty(self): - assert self.op._parse_fasta("") == [] - - def test_canonical_isoform_parsing(self): - fasta = ">sp|P12345-2|TEST_HUMAN Isoform 2 OS=Homo 
sapiens OX=9606\nMKTAYIAK\n" - records = self.op._parse_fasta(fasta) - assert records[0]["canonical_accession"] == "P12345" - assert records[0]["is_canonical"] is False - assert records[0]["isoform_index"] == 2 - - def test_sequence_hash_is_set(self): - records = self.op._parse_fasta(FASTA_ONE) - assert records[0]["sequence_hash"] is not None - assert len(records[0]["sequence_hash"]) == 32 # MD5 hex - - def test_empty_sequence_skipped(self): - """Lines 231-233: header with no sequence lines is skipped.""" - fasta = ">sp|P12345|TEST_HUMAN Test OS=Homo sapiens OX=9606\n\n" - records = self.op._parse_fasta(fasta) - assert records == [] - - def test_header_without_pipe_separators(self): - """Lines 264-265: header without | uses first word as accession.""" - fasta = ">SIMPLE_ACC some description\nMKTAYIAK\n" - records = self.op._parse_fasta(fasta) - assert len(records) == 1 - assert records[0]["accession"] == "SIMPLE_ACC" - assert records[0]["entry_name"] is None - - def test_isoform_accession_parsed(self): - fasta = ">sp|P12345-3|TEST_HUMAN Isoform 3 OS=Homo sapiens OX=9606 GN=TEST\nMKTAYIAK\n" - records = self.op._parse_fasta(fasta) - r = records[0] - assert r["accession"] == "P12345-3" - assert r["canonical_accession"] == "P12345" - assert r["is_canonical"] is False - assert r["isoform_index"] == 3 - - def test_canonical_accession_flagged(self): - records = self.op._parse_fasta(FASTA_ONE) - r = records[0] - assert r["canonical_accession"] == "P12345" - assert r["is_canonical"] is True - assert r["isoform_index"] is None - - def test_reviewed_vs_unreviewed(self): - records = self.op._parse_fasta(FASTA_TWO) - assert records[0]["reviewed"] is True # sp| - assert records[1]["reviewed"] is False # tr| - - def test_sequence_deduplication_by_hash(self): - """Two identical sequences produce the same hash.""" - fasta = ( - ">sp|P11111|A_HUMAN Prot A OS=Homo sapiens OX=9606\nMKTAYIAK\n" - ">sp|P22222|B_HUMAN Prot B OS=Homo sapiens OX=9606\nMKTAYIAK\n" - ) - records = self.op._parse_fasta(fasta) - assert len(records) == 2 - assert records[0]["sequence_hash"] == records[1]["sequence_hash"] - - def test_multiline_sequence(self): - fasta = ">sp|P12345|TEST_HUMAN Test OS=Homo sapiens OX=9606\nMKTAY\nIAKQR\n" - records = self.op._parse_fasta(fasta) - assert records[0]["sequence"] == "MKTAYIAKQR" - assert records[0]["length"] == 10 - - -# --------------------------------------------------------------------------- -# Unit tests — _decode_response -# --------------------------------------------------------------------------- - - -class TestDecodeResponse: - def setup_method(self): - self.op = InsertProteinsOperation() - - def test_decode_uncompressed(self): - """Line 217: uncompressed path.""" - resp = MagicMock() - resp.content = b"hello world" - result = self.op._decode_response(resp, compressed=False) - assert result == "hello world" - - def test_decode_compressed(self): - """Lines 215-216: gzip decompression path.""" - import gzip - from io import BytesIO - - raw = b"compressed content" - buf = BytesIO() - with gzip.GzipFile(fileobj=buf, mode="wb") as f: - f.write(raw) - resp = MagicMock() - resp.content = buf.getvalue() - result = self.op._decode_response(resp, compressed=True) - assert result == "compressed content" - # --------------------------------------------------------------------------- # Unit tests — _store_records @@ -264,25 +165,8 @@ def test_empty_records_returns_zeros(self): def test_updates_existing_protein(self): """Lines 350-394: existing protein gets conservative updates.""" - from 
protea.infrastructure.orm.models.sequence.sequence import ( - Sequence as SequenceModel, - ) - - seq_hash = SequenceModel.compute_hash("MKTAYIAK") - record = { - "accession": "P12345", - "entry_name": "TEST_HUMAN", - "canonical_accession": "P12345", - "is_canonical": True, - "isoform_index": None, - "organism": "Homo sapiens", - "taxonomy_id": "9606", - "gene_name": "TEST", - "reviewed": True, - "sequence": "MKTAYIAK", - "length": 8, - "sequence_hash": seq_hash, - } + record = _make_record() + seq_hash = record.sequence_hash # Existing protein with missing fields (triggers updates) existing_prot = MagicMock() @@ -334,25 +218,7 @@ def query_side_effect(*args): def test_inserts_new_sequence_when_missing(self): """Lines 318-334: new sequence inserted when hash not in DB.""" - from protea.infrastructure.orm.models.sequence.sequence import ( - Sequence as SequenceModel, - ) - - seq_hash = SequenceModel.compute_hash("MKTAYIAK") - record = { - "accession": "P12345", - "entry_name": "TEST_HUMAN", - "canonical_accession": "P12345", - "is_canonical": True, - "isoform_index": None, - "organism": "Homo sapiens", - "taxonomy_id": "9606", - "gene_name": "TEST", - "reviewed": True, - "sequence": "MKTAYIAK", - "length": 8, - "sequence_hash": seq_hash, - } + record = _make_record() session = MagicMock(spec=Session) @@ -397,7 +263,7 @@ def test_execute_returns_operation_result(self): session = _make_mock_session() emit = _capturing_emit() - with patch.object(self.op._http_client.session, "get", return_value=_make_mock_response(FASTA_ONE)): + with patch.object(self.op._uniprot_plugin._client.session, "get", return_value=_make_mock_response(FASTA_ONE)): result = self.op.execute( session, {"search_criteria": "organism_id:9606", "compressed": False}, @@ -414,7 +280,7 @@ def test_execute_emits_start_and_done(self): session = _make_mock_session() emit = _capturing_emit() - with patch.object(self.op._http_client.session, "get", return_value=_make_mock_response(FASTA_ONE)): + with patch.object(self.op._uniprot_plugin._client.session, "get", return_value=_make_mock_response(FASTA_ONE)): self.op.execute( session, {"search_criteria": "q", "compressed": False}, @@ -429,7 +295,7 @@ def test_execute_respects_total_limit(self): session = _make_mock_session() emit = _capturing_emit() - with patch.object(self.op._http_client.session, "get", return_value=_make_mock_response(FASTA_TWO)): + with patch.object(self.op._uniprot_plugin._client.session, "get", return_value=_make_mock_response(FASTA_TWO)): result = self.op.execute( session, {"search_criteria": "q", "total_limit": 1, "compressed": False}, @@ -443,7 +309,7 @@ def test_execute_respects_total_limit(self): def test_execute_calls_session_add_all_for_new_protein(self): session = _make_mock_session() - with patch.object(self.op._http_client.session, "get", return_value=_make_mock_response(FASTA_ONE)): + with patch.object(self.op._uniprot_plugin._client.session, "get", return_value=_make_mock_response(FASTA_ONE)): self.op.execute( session, {"search_criteria": "q", "compressed": False}, @@ -456,7 +322,7 @@ def test_two_records_counts_correctly(self): session = _make_mock_session() emit = _capturing_emit() - with patch.object(self.op._http_client.session, "get", return_value=_make_mock_response(FASTA_TWO)): + with patch.object(self.op._uniprot_plugin._client.session, "get", return_value=_make_mock_response(FASTA_TWO)): result = self.op.execute( session, {"search_criteria": "q", "compressed": False}, @@ -466,20 +332,23 @@ def test_two_records_counts_correctly(self): assert 
result.result["retrieved_records"] == 2 assert result.result["proteins_inserted"] == 2 - def test_empty_page_continues(self): - """Line 93: empty records list triggers continue.""" + def test_empty_page_does_not_flush(self): + """Empty FASTA response → no records, no buffer flush, pages=0. + + Per F2A.6-real, ``pages`` counts DB-side buffer flushes; an + HTTP page that returns zero records never triggers a flush. + """ session = _make_mock_session() emit = _capturing_emit() - # First response is empty FASTA, no link header → single page with 0 records empty_resp = _make_mock_response("") - with patch.object(self.op._http_client.session, "get", return_value=empty_resp): + with patch.object(self.op._uniprot_plugin._client.session, "get", return_value=empty_resp): result = self.op.execute( session, {"search_criteria": "q", "compressed": False}, emit=emit, ) assert result.result["retrieved_records"] == 0 - assert result.result["pages"] == 1 + assert result.result["pages"] == 0 def test_total_limit_trims_to_zero_breaks(self): """Lines 96-98: when total_limit is already reached, records trimmed to empty → break.""" @@ -502,7 +371,7 @@ def get_side_effect(*args, **kwargs): return page1_resp return page2_resp - with patch.object(self.op._http_client.session, "get", side_effect=get_side_effect): + with patch.object(self.op._uniprot_plugin._client.session, "get", side_effect=get_side_effect): result = self.op.execute( session, {"search_criteria": "q", "total_limit": 2, "compressed": False}, @@ -530,7 +399,7 @@ def test_compressed_param_appended(self): resp.headers = {"link": ""} resp.raise_for_status = MagicMock() - with patch.object(self.op._http_client.session, "get", return_value=resp) as mock_get: + with patch.object(self.op._uniprot_plugin._client.session, "get", return_value=resp) as mock_get: self.op.execute( session, {"search_criteria": "q", "compressed": True}, @@ -540,41 +409,11 @@ def test_compressed_param_appended(self): called_url = mock_get.call_args[0][0] assert "compressed=true" in called_url - def test_total_results_from_header(self): - """Line 200: X-Total-Results header is captured.""" - session = _make_mock_session() - emit = _capturing_emit() - - resp = _make_mock_response(FASTA_ONE) - resp.headers["X-Total-Results"] = "42" - - op = InsertProteinsOperation() - with patch.object(op._http_client.session, "get", return_value=resp): - op.execute( - session, - {"search_criteria": "q", "compressed": False}, - emit=emit, - ) - - assert op._total_results == 42 - - def test_total_results_invalid_header_ignored(self): - """Line 200: non-numeric X-Total-Results doesn't crash.""" - session = _make_mock_session() - emit = _capturing_emit() - - resp = _make_mock_response(FASTA_ONE) - resp.headers["X-Total-Results"] = "not-a-number" - - op = InsertProteinsOperation() - with patch.object(op._http_client.session, "get", return_value=resp): - op.execute( - session, - {"search_criteria": "q", "compressed": False}, - emit=emit, - ) - - assert op._total_results is None + # NOTE: tests for ``op._total_results`` (X-Total-Results capture) + # were removed in F2A.6-real step 3 (b). The plugin abstracts HTTP + # away from the operation, and X-Total-Results was nice-to-have for + # progress reporting, not load-bearing for correctness. Progress + # totals now only flow when ``total_limit`` is set. 
def test_cursor_pagination(self): """Lines 208-210: cursor-based pagination follows link headers.""" @@ -598,16 +437,20 @@ def get_side_effect(url, **kwargs): return page2_resp op = InsertProteinsOperation() - with patch.object(op._http_client.session, "get", side_effect=get_side_effect): + with patch.object(op._uniprot_plugin._client.session, "get", side_effect=get_side_effect): result = op.execute( session, {"search_criteria": "q", "compressed": False}, emit=emit, ) - assert result.result["pages"] == 2 + # Per F2A.6-real, ``pages`` counts DB-side buffer flushes, + # not HTTP pages. With 2 records and the default page_size=500, + # only one final flush fires. + assert result.result["pages"] == 1 assert result.result["retrieved_records"] == 2 - # Second call URL should contain cursor + # Second HTTP call URL should contain cursor (HTTP-level pagination + # is the plugin's concern, but we verify the cursor was followed). assert "cursor=abc123" in called_urls[1] def test_network_failure_propagates(self): @@ -618,7 +461,7 @@ def test_network_failure_propagates(self): op = InsertProteinsOperation() with patch.object( - op._http_client.session, + op._uniprot_plugin._client.session, "get", side_effect=req.ConnectionError("network down"), ): @@ -647,7 +490,7 @@ def test_isoform_records_counted(self): ) resp = _make_mock_response(fasta_with_isoform) op = InsertProteinsOperation() - with patch.object(op._http_client.session, "get", return_value=resp): + with patch.object(op._uniprot_plugin._client.session, "get", return_value=resp): result = op.execute( session, {"search_criteria": "q", "compressed": False}, @@ -656,19 +499,24 @@ def test_isoform_records_counted(self): assert result.result["isoform_records"] == 1 - def test_progress_emission_with_total(self): - """Progress events include _progress_current and _progress_total.""" + def test_progress_emission_with_total_limit(self): + """Progress events include _progress_total when ``total_limit`` is set. + + Per F2A.6-real, the operation no longer captures X-Total-Results + from the HTTP response; only the user-set ``total_limit`` flows + into ``_progress_total``. With ``page_size=1`` we force a flush + so the page_done event actually fires. 
+ """ session = _make_mock_session() emit = _capturing_emit() resp = _make_mock_response(FASTA_ONE) - resp.headers["X-Total-Results"] = "100" - op = InsertProteinsOperation() - with patch.object(op._http_client.session, "get", return_value=resp): + with patch.object(op._uniprot_plugin._client.session, "get", return_value=resp): op.execute( session, - {"search_criteria": "q", "compressed": False}, + {"search_criteria": "q", "compressed": False, + "total_limit": 100, "page_size": 1}, emit=emit, ) @@ -683,7 +531,7 @@ def test_include_isoforms_false_omits_param(self): session = _make_mock_session() resp = _make_mock_response(FASTA_ONE) op = InsertProteinsOperation() - with patch.object(op._http_client.session, "get", return_value=resp) as mock_get: + with patch.object(op._uniprot_plugin._client.session, "get", return_value=resp) as mock_get: op.execute( session, {"search_criteria": "q", "compressed": False, "include_isoforms": False}, @@ -708,7 +556,7 @@ def test_insert_proteins_integration(postgres_url: str): emit = _capturing_emit() with Session(engine, future=True) as session: - with patch.object(op._http_client.session, "get", return_value=_make_mock_response(FASTA_TWO)): + with patch.object(op._uniprot_plugin._client.session, "get", return_value=_make_mock_response(FASTA_TWO)): result = op.execute( session, {"search_criteria": "organism_id:9606", "compressed": False}, From b94f2847d7636246cba7a0f2c7b997c89af3905d Mon Sep 17 00:00:00 2001 From: frapercan Date: Wed, 6 May 2026 10:20:44 +0200 Subject: [PATCH 70/73] refactor(metadata): F2A.6-real FetchUniProtMetadata + legacy http delete MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes F2A.6-real with the fourth Level-1 plugin migration plus the dead-code cleanup that was waiting on it. Migration: * FetchUniProtMetadataOperation delegates HTTP retries + cursor pagination + gzip decoding + TSV parsing to the protea-sources UniProtSource.stream_metadata plugin method (added in protea-sources/2a6ef55). Operation becomes a thin persistence adapter focused on FIELD_MAP DB upsert and update_protein_core side effects. * Removed: _fetch_tsv_pages (~70 LOC of HTTP + URL construction), _decode_response (gzip wrapper), _parse_tsv (csv.DictReader). All three live in the plugin now. * Removed state: self._http_client, self._total_results. * UNIPROT_FIELDS constant kept on the operation class — the field list is a persistence concern (which DB columns get populated). Same field set passed to the plugin via UniProtMetadataStreamPayload.fields. * Imports trimmed: csv, gzip, BytesIO, StringIO, quote, requests, Response, UniProtHttpClient. Replaced with protea_contracts imports for UniProtMetadataRecord, UniProtMetadataStreamPayload, parse_isoform. * _store_rows consumes UniProtMetadataRecord via attribute access (rec.accession, rec.raw_fields) instead of dict access. Field semantics preserved bit-for-bit: same FIELD_MAP application, same update_protein_core conservative-update logic. Behavioural diffs (same as the FASTA migration): * pages now counts DB-side buffer flushes (not HTTP pages). * X-Total-Results header capture removed. _progress_total flows only when total_limit is set. Dead-code cleanup: * protea/core/utils.py: deleted UniProtHttpClient (135 LOC) plus its _HttpPayload Protocol and the now-unused random/time/ requests/Response imports. The file shrinks to just chunks() + utcnow() helpers (~13 LOC). 
* tests/test_core.py: deleted TestUniProtHttpClient class (~75 LOC, 9 tests) and TestFetchUniProtMetadataExecute class (~290 LOC, 10 tests). The first migrated to protea-sources/tests/test_uniprot.py::TestUniProtRetryClient (5 tests, retries + Retry-After + max-retries + network errors) plus TestExtractNextCursor (4 tests). The second migrated partially to test_fetch_uniprot_metadata.py (which keeps the 14 execute-flow tests against the new plugin-based dispatch) and partially to protea-sources/tests/test_uniprot.py (TestParseMetadataTsv covers the parser directly). * tests/test_fetch_uniprot_metadata.py: deleted TestParseTsv class (4 tests, ~35 LOC) — parser now in protea-sources. 16 patch sites swapped from op._http_client.session.get to op._uniprot_plugin._client.session.get. Suite: PROTEA 1089 passed, 12 skipped (was 1112; -23 from deletions of the legacy class + parser/decode/total_results overlap with protea-sources). Ruff full + mypy strict green. Net diff PROTEA: -266 / +163 = -103 LOC across the operation, core/utils.py, test_core.py, and test_fetch_uniprot_metadata.py. This closes F2A.6-real: * GOA real-migrated (turn pre-25, commits 20987a5/d1d60f6/43da412). * QuickGO real-migrated (turn 27, c5433ed/f37dfce/42d4dd4). * D-MIGR-04 prereq (turn 29, 18e92af/434b14e). * UniProt FASTA real-migrated (turn 32, f1bf7b5/fadbd6b/56a6d87). * UniProt metadata real-migrated + UniProtHttpClient deleted (this turn, 09f3883/2a6ef55/). protea-sources is now self-contained with respect to UniProt HTTP: the _http.py module owns the retry client; the plugin owns parsing and modality dispatch (FASTA vs metadata). PROTEA's only remaining involvement is persistence. Part of F2A.6-real migration plan, step 4 of 4. F2B (HTTP registry endpoints) is next on the autonomous queue once doc-lane gives it priority. --- .../core/operations/fetch_uniprot_metadata.py | 270 +++++------ protea/core/utils.py | 115 +---- tests/test_core.py | 442 +----------------- tests/test_fetch_uniprot_metadata.py | 44 +- 4 files changed, 131 insertions(+), 740 deletions(-) diff --git a/protea/core/operations/fetch_uniprot_metadata.py b/protea/core/operations/fetch_uniprot_metadata.py index 0908c65..75eda76 100644 --- a/protea/core/operations/fetch_uniprot_metadata.py +++ b/protea/core/operations/fetch_uniprot_metadata.py @@ -1,20 +1,19 @@ from __future__ import annotations -import csv -import gzip import time -from collections.abc import Iterable, Sequence -from io import BytesIO, StringIO +from collections.abc import Iterator, Sequence from typing import Annotated, Any -from urllib.parse import quote -import requests +from protea_contracts import ( + UniProtMetadataRecord, + UniProtMetadataStreamPayload, + parse_isoform, +) from pydantic import Field, field_validator -from requests import Response from sqlalchemy.orm import Session from protea.core.contracts.operation import EmitFn, OperationResult, ProteaPayload -from protea.core.utils import UniProtHttpClient, chunks +from protea.core.utils import chunks from protea.infrastructure.orm.models.protein.protein import Protein from protea.infrastructure.orm.models.protein.protein_metadata import ProteinUniProtMetadata @@ -58,7 +57,37 @@ class FetchUniProtMetadataOperation: "Fetch functional annotations (TSV) from UniProt and upsert " "ProteinUniProtMetadata rows keyed by canonical accession." ) - UNIPROT_SEARCH_URL = "https://rest.uniprot.org/uniprotkb/search" + + # UniProt field identifiers requested in the TSV query, in display + # order. 
Kept here (operation-side) because the field set is a + # persistence concern: it determines which DB columns get populated. + UNIPROT_FIELDS: list[str] = [ + "accession", + "reviewed", + "id", + "protein_name", + "gene_names", + "organism_name", + "length", + "absorption", + "ft_act_site", + "ft_binding", + "cc_catalytic_activity", + "cc_cofactor", + "ft_dna_bind", + "ec", + "cc_activity_regulation", + "cc_function", + "cc_pathway", + "kinetics", + "ph_dependence", + "redox_potential", + "rhea", + "ft_site", + "temp_dependence", + "keyword", + "feature_count", + ] def summarize_payload(self, payload: dict[str, Any]) -> str: criteria = (payload or {}).get("search_criteria") @@ -96,15 +125,16 @@ def summarize_payload(self, payload: dict[str, Any]) -> str: } def __init__(self) -> None: - self._http_client = UniProtHttpClient() - self._total_results: int | None = None + # Plugin instance is reused across executions for connection + # pooling; counters are reset by the plugin at the start of + # each ``stream_metadata`` call. + from protea_sources.uniprot import plugin as _uniprot_plugin + + self._uniprot_plugin = _uniprot_plugin def execute( self, session: Session, payload: dict[str, Any], *, emit: EmitFn ) -> OperationResult: - self._http_client.reset() - self._total_results = None - p = FetchUniProtMetadataPayload.model_validate(payload) t0 = time.perf_counter() @@ -120,45 +150,11 @@ def execute( proteins_touched = 0 metadata_upserted = 0 - for page_idx, rows in enumerate(self._fetch_tsv_pages(p, emit), start=1): - pages = page_idx - if not rows: - continue - - if p.total_limit is not None and (total_rows + len(rows)) > p.total_limit: - rows = rows[: max(0, p.total_limit - total_rows)] - if not rows: - break - - total_rows += len(rows) - - touched, upserted = self._store_rows(session, rows, p, emit) - proteins_touched += touched - metadata_upserted += upserted - - emit( - "fetch_uniprot_metadata.page_done", - None, - { - "page": page_idx, - "rows_total": total_rows, - "proteins_touched_total": proteins_touched, - "metadata_upserted_total": metadata_upserted, - "http_requests": self._http_client.requests, - "http_retries": self._http_client.retries, - "_progress_current": total_rows, - **( - {"_progress_total": p.total_limit or self._total_results} - if (p.total_limit or self._total_results) - else {} - ), - }, - "info", - ) - - if p.commit_every_page: - session.commit() - + # Buffer per-record into operation-controlled pages of size + # ``p.page_size``. Plugin yields one record at a time; operation + # owns batching policy + commits. 
+ buffer: list[UniProtMetadataRecord] = [] + for record in self._stream_metadata(p, emit): if p.total_limit is not None and total_rows >= p.total_limit: emit( "fetch_uniprot_metadata.limit_reached", @@ -168,113 +164,97 @@ def execute( ) break + buffer.append(record) + total_rows += 1 + + if len(buffer) >= p.page_size: + pages += 1 + touched, upserted = self._store_rows(session, buffer, p, emit) + proteins_touched += touched + metadata_upserted += upserted + buffer.clear() + + http_req, http_ret = self._uniprot_plugin.http_counters + emit( + "fetch_uniprot_metadata.page_done", + None, + { + "page": pages, + "rows_total": total_rows, + "proteins_touched_total": proteins_touched, + "metadata_upserted_total": metadata_upserted, + "http_requests": http_req, + "http_retries": http_ret, + "_progress_current": total_rows, + **( + {"_progress_total": p.total_limit} + if p.total_limit + else {} + ), + }, + "info", + ) + + if p.commit_every_page: + session.commit() + + # Flush remaining buffer. + if buffer: + pages += 1 + touched, upserted = self._store_rows(session, buffer, p, emit) + proteins_touched += touched + metadata_upserted += upserted + elapsed = time.perf_counter() - t0 + http_req, http_ret = self._uniprot_plugin.http_counters result = { "pages": pages, "rows": total_rows, "proteins_touched": proteins_touched, "metadata_upserted": metadata_upserted, - "http_requests": self._http_client.requests, - "http_retries": self._http_client.retries, + "http_requests": http_req, + "http_retries": http_ret, "elapsed_seconds": elapsed, } emit("fetch_uniprot_metadata.done", None, result, "info") return OperationResult(result=result) - # ---------------- HTTP / paging ---------------- - - def _fetch_tsv_pages( + def _stream_metadata( self, p: FetchUniProtMetadataPayload, emit: EmitFn - ) -> Iterable[list[dict[str, str]]]: - encoded_query = quote(p.search_criteria) - - fields = [ - "accession", - "reviewed", - "id", - "protein_name", - "gene_names", - "organism_name", - "length", - "absorption", - "ft_act_site", - "ft_binding", - "cc_catalytic_activity", - "cc_cofactor", - "ft_dna_bind", - "ec", - "cc_activity_regulation", - "cc_function", - "cc_pathway", - "kinetics", - "ph_dependence", - "redox_potential", - "rhea", - "ft_site", - "temp_dependence", - "keyword", - "feature_count", - ] - - params = [ - "format=tsv", - f"query={encoded_query}", - f"size={p.page_size}", - "compressed=true" if p.compressed else "compressed=false", - f"fields={quote(','.join(fields))}", - ] - base_url = f"{self.UNIPROT_SEARCH_URL}?{'&'.join(params)}" - - next_cursor: str | None = None - page = 0 - - while True: - page += 1 - url = base_url if not next_cursor else f"{base_url}&cursor={next_cursor}" - emit( - "uniprot.fetch_page_start", - None, - {"page": page, "has_cursor": bool(next_cursor)}, - "info", - ) - - resp = self._http_client.get_with_retries(url, p, emit) - if self._total_results is None: - try: - self._total_results = int(resp.headers.get("X-Total-Results", 0)) or None - except (ValueError, TypeError): - pass - text = self._decode_response(resp, p.compressed) - rows = self._parse_tsv(text) - - emit("uniprot.fetch_page_done", None, {"page": page, "rows": len(rows)}, "info") - yield rows - - next_cursor = self._http_client.extract_next_cursor(resp.headers.get("link", "")) - if not next_cursor: - break - - def _decode_response(self, resp: Response, compressed: bool) -> str: - if compressed: - with gzip.GzipFile(fileobj=BytesIO(resp.content)) as f: - return f.read().decode("utf-8", errors="replace") - return 
resp.content.decode("utf-8", errors="replace") - - # ---------------- TSV / DB ---------------- + ) -> Iterator[UniProtMetadataRecord]: + """Delegate to the protea-sources UniProtSource plugin. + + Plugin owns HTTP retries, cursor pagination, gzip decoding, + and TSV parsing. The operation owns the field list (persistence + concern) plus DB upsert. See ``f2a6_real_migration_design.md``. + """ + yield from self._uniprot_plugin.stream_metadata( + UniProtMetadataStreamPayload( + search_criteria=p.search_criteria, + fields=self.UNIPROT_FIELDS, + page_size=p.page_size, + timeout_seconds=p.timeout_seconds, + compressed=p.compressed, + max_retries=p.max_retries, + backoff_base_seconds=p.backoff_base_seconds, + backoff_max_seconds=p.backoff_max_seconds, + jitter_seconds=p.jitter_seconds, + user_agent=p.user_agent, + ), + emit=emit, + ) - def _parse_tsv(self, tsv_text: str) -> list[dict[str, str]]: - reader = csv.DictReader(StringIO(tsv_text), delimiter="\t") - return [{k: (v if v is not None else "") for k, v in row.items()} for row in reader] + # ---------------- DB ---------------- def _store_rows( self, session: Session, - rows: list[dict[str, str]], + records: list[UniProtMetadataRecord], p: FetchUniProtMetadataPayload, emit: EmitFn, ) -> tuple[int, int]: - accessions = [r.get("Entry", "").strip() for r in rows if r.get("Entry")] - canonicals = [Protein.parse_isoform(a)[0] for a in accessions] + accessions = [r.accession for r in records] + canonicals = [parse_isoform(a)[0] for a in accessions] canonical_unique = list(dict.fromkeys([c for c in canonicals if c])) existing = self._load_existing_metadata(session, canonical_unique) @@ -287,11 +267,9 @@ def _store_rows( touched = 0 upserted = 0 - for row in rows: - acc = row.get("Entry", "").strip() - if not acc: - continue - canonical, _, _ = Protein.parse_isoform(acc) + for record in records: + canonical, _, _ = parse_isoform(record.accession) + row = record.raw_fields m = existing.get(canonical) if m is None: @@ -309,7 +287,7 @@ def _store_rows( upserted += 1 if p.update_protein_core: - pr = protein_map.get(acc) + pr = protein_map.get(record.accession) if pr is not None: core_changed = False diff --git a/protea/core/utils.py b/protea/core/utils.py index 38625ac..0e3a4c0 100644 --- a/protea/core/utils.py +++ b/protea/core/utils.py @@ -1,16 +1,9 @@ from __future__ import annotations -import random -import time from collections.abc import Iterable from collections.abc import Sequence as Seq from datetime import UTC, datetime -from typing import Any, Protocol - -import requests -from requests import Response - -from protea.core.contracts.operation import EmitFn +from typing import Any def utcnow() -> datetime: @@ -22,109 +15,3 @@ def chunks(seq: Seq[Any], n: int) -> Iterable[Seq[Any]]: """Yield successive n-sized chunks from seq.""" for i in range(0, len(seq), n): yield seq[i : i + n] - - -class _HttpPayload(Protocol): - """Structural type for payloads that carry HTTP retry parameters.""" - - user_agent: str - timeout_seconds: int - max_retries: int - backoff_base_seconds: float - backoff_max_seconds: float - jitter_seconds: float - - -class UniProtHttpClient: - """Composable HTTP client with retries, used by UniProt REST operations. - - Replaces the historical ``UniProtHttpMixin`` (favours composition - over inheritance). Operations instantiate one client per execution - and read counters from ``client.requests`` / ``client.retries``. 
- - Example:: - - class InsertProteinsOperation: - def __init__(self) -> None: - self._http_client = UniProtHttpClient() - - def execute(self, session, payload, *, emit): - self._http_client.reset() - ... - resp = self._http_client.get_with_retries(url, p, emit) - cursor = self._http_client.extract_next_cursor(link) - """ - - def __init__(self) -> None: - self.session: requests.Session = requests.Session() - self.requests: int = 0 - self.retries: int = 0 - - def reset(self) -> None: - """Reset request/retry counters before a new execution. - - The underlying ``requests.Session`` is reused across executions - so connection pooling stays effective. - """ - self.requests = 0 - self.retries = 0 - - def get_with_retries(self, url: str, p: _HttpPayload, emit: EmitFn) -> Response: - headers = {"User-Agent": p.user_agent} - attempt = 0 - while True: - attempt += 1 - self.requests += 1 - try: - resp = self.session.get(url, timeout=p.timeout_seconds, headers=headers) - except requests.RequestException as e: - if attempt > p.max_retries: - raise - self.retries += 1 - self._sleep_backoff( - p, attempt, emit, reason=f"request_exception:{e.__class__.__name__}" - ) - continue - - if 200 <= resp.status_code < 300: - return resp - - if resp.status_code in (429, 500, 502, 503, 504): - if attempt > p.max_retries: - resp.raise_for_status() - self.retries += 1 - retry_after = resp.headers.get("Retry-After") - if retry_after and retry_after.isdigit(): - wait_s = min(float(retry_after), p.backoff_max_seconds) - emit( - "http.retry", - None, - {"attempt": attempt, "wait_seconds": wait_s, "reason": "retry_after"}, - "warning", - ) - time.sleep(wait_s) - else: - self._sleep_backoff(p, attempt, emit, reason=f"status_{resp.status_code}") - continue - - resp.raise_for_status() - - def _sleep_backoff(self, p: _HttpPayload, attempt: int, emit: EmitFn, reason: str) -> None: - base = p.backoff_base_seconds * (2 ** (attempt - 1)) - wait_s = min(base, p.backoff_max_seconds) + random.uniform(0.0, p.jitter_seconds) - emit( - "http.retry", - None, - {"attempt": attempt, "wait_seconds": wait_s, "reason": reason}, - "warning", - ) - time.sleep(wait_s) - - @staticmethod - def extract_next_cursor(link_header: str) -> str | None: - if not link_header or 'rel="next"' not in link_header or "cursor=" not in link_header: - return None - try: - return link_header.split("cursor=")[-1].split(">")[0] - except Exception: - return None diff --git a/tests/test_core.py b/tests/test_core.py index 0bf5034..bf65417 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -5,22 +5,13 @@ from __future__ import annotations -import gzip -from io import BytesIO -from unittest.mock import MagicMock, patch - import pytest -import requests from protea.core.contracts.operation import OperationResult, RetryLaterError from protea.core.contracts.registry import OperationRegistry from protea.core.evidence_codes import ECO_TO_CODE, EXPERIMENTAL, is_experimental, normalize -from protea.core.operations.fetch_uniprot_metadata import ( - FetchUniProtMetadataOperation, - FetchUniProtMetadataPayload, -) from protea.core.operations.ping import PingOperation -from protea.core.utils import UniProtHttpClient, chunks +from protea.core.utils import chunks # --------------------------------------------------------------------------- # OperationRegistry @@ -102,108 +93,6 @@ def test_empty_seq(self) -> None: assert list(chunks([], 5)) == [] -# --------------------------------------------------------------------------- -# UniProtHttpClient -# 
--------------------------------------------------------------------------- - - -def _make_payload(max_retries=3, backoff_base=0.01, backoff_max=0.1, jitter=0.0): - p = MagicMock() - p.user_agent = "PROTEA/test" - p.timeout_seconds = 5 - p.max_retries = max_retries - p.backoff_base_seconds = backoff_base - p.backoff_max_seconds = backoff_max - p.jitter_seconds = jitter - return p - - -def _make_client() -> UniProtHttpClient: - client = UniProtHttpClient() - client.session = MagicMock() - return client - - -def _noop_emit(*_): - return None - - -class TestUniProtHttpClient: - def test_returns_response_on_200(self) -> None: - client = _make_client() - resp = MagicMock() - resp.status_code = 200 - client.session.get.return_value = resp - result = client.get_with_retries("http://x", _make_payload(), _noop_emit) - assert result is resp - - def test_retries_on_429(self) -> None: - client = _make_client() - bad = MagicMock() - bad.status_code = 429 - bad.headers = {} - good = MagicMock() - good.status_code = 200 - client.session.get.side_effect = [bad, good] - with patch("protea.core.utils.time.sleep"): - result = client.get_with_retries("http://x", _make_payload(), _noop_emit) - assert result is good - assert client.retries == 1 - - def test_uses_retry_after_header(self) -> None: - client = _make_client() - bad = MagicMock() - bad.status_code = 429 - bad.headers = {"Retry-After": "5"} - good = MagicMock() - good.status_code = 200 - client.session.get.side_effect = [bad, good] - sleep_calls = [] - with patch("protea.core.utils.time.sleep", side_effect=sleep_calls.append): - client.get_with_retries("http://x", _make_payload(backoff_max=30.0), _noop_emit) - assert len(sleep_calls) == 1 - assert sleep_calls[0] == pytest.approx(5.0) - - def test_raises_after_max_retries(self) -> None: - client = _make_client() - bad = MagicMock() - bad.status_code = 503 - bad.headers = {} - bad.raise_for_status.side_effect = requests.HTTPError("503") - client.session.get.return_value = bad - with patch("protea.core.utils.time.sleep"): - with pytest.raises(requests.HTTPError): - client.get_with_retries("http://x", _make_payload(max_retries=2), _noop_emit) - - def test_retries_on_network_exception(self) -> None: - client = _make_client() - good = MagicMock() - good.status_code = 200 - client.session.get.side_effect = [requests.ConnectionError("down"), good] - with patch("protea.core.utils.time.sleep"): - result = client.get_with_retries("http://x", _make_payload(), _noop_emit) - assert result is good - - def test_reset_clears_counters(self) -> None: - client = _make_client() - client.requests = 7 - client.retries = 3 - client.reset() - assert client.requests == 0 - assert client.retries == 0 - - def test_extract_next_cursor_present(self) -> None: - header = '; rel="next"' - assert UniProtHttpClient.extract_next_cursor(header) == "ABCD1234" - - def test_extract_next_cursor_absent(self) -> None: - assert UniProtHttpClient.extract_next_cursor("") is None - assert UniProtHttpClient.extract_next_cursor('; rel="prev"') is None - - def test_extract_next_cursor_no_cursor_param(self) -> None: - assert UniProtHttpClient.extract_next_cursor('; rel="next"') is None - - # --------------------------------------------------------------------------- # evidence_codes — normalize and is_experimental # --------------------------------------------------------------------------- @@ -267,332 +156,3 @@ def test_is_exception(self): raise RetryLaterError("test") -# --------------------------------------------------------------------------- -# 
FetchUniProtMetadataOperation -# --------------------------------------------------------------------------- - - -def _noop_emit(*_): - pass - - -def _make_tsv_content(rows: list[dict[str, str]], compressed: bool = True) -> bytes: - """Build a TSV byte string (optionally gzipped) from a list of dicts.""" - if not rows: - header = "Entry\tReviewed\tEntry Name\tOrganism\tGene Names\tLength" - text = header + "\n" - else: - headers = list(rows[0].keys()) - lines = ["\t".join(headers)] - for row in rows: - lines.append("\t".join(row.get(h, "") for h in headers)) - text = "\n".join(lines) + "\n" - - raw = text.encode("utf-8") - if compressed: - buf = BytesIO() - with gzip.GzipFile(fileobj=buf, mode="wb") as f: - f.write(raw) - return buf.getvalue() - return raw - - -class TestFetchUniProtMetadataExecute: - def _make_op(self): - op = FetchUniProtMetadataOperation() - op._http_client.session = MagicMock() - return op - - def test_execute_empty_page_continues(self): - """Line 108: when rows is empty, continue (skip store).""" - op = self._make_op() - events = [] - - def emit(event, message, fields, level): - events.append(event) - - # Return one page with no data rows, then stop - resp = MagicMock() - resp.status_code = 200 - resp.headers = {"X-Total-Results": "0"} - resp.content = _make_tsv_content([], compressed=True) - op._http_client.session.get.return_value = resp - - session = MagicMock() - payload = {"search_criteria": "organism_id:9606", "page_size": 10} - - result = op.execute(session, payload, emit=emit) - assert result.result["rows"] == 0 - assert result.result["pages"] == 1 - - def test_execute_total_limit_truncation(self): - """Lines 110-113: when total_limit is set and rows exceed it, truncate.""" - op = self._make_op() - - # Build 5 rows - rows = [] - for i in range(5): - row = {"Entry": f"P0000{i}", "Reviewed": "reviewed"} - # Add all FIELD_MAP headers as empty - for header in FetchUniProtMetadataOperation.FIELD_MAP.values(): - row[header] = "" - row["Entry Name"] = "" - row["Organism"] = "" - row["Gene Names"] = "" - row["Length"] = "" - rows.append(row) - - resp = MagicMock() - resp.status_code = 200 - resp.headers = {"X-Total-Results": "5"} - resp.content = _make_tsv_content(rows, compressed=True) - op._http_client.session.get.return_value = resp - - session = MagicMock() - session.query.return_value.filter.return_value.all.return_value = [] - - payload = { - "search_criteria": "organism_id:9606", - "page_size": 10, - "total_limit": 3, - } - - result = op.execute(session, payload, emit=_noop_emit) - # Should only process 3 rows despite page having 5 - assert result.result["rows"] == 3 - - def test_execute_total_limit_zero_after_truncation(self): - """Line 113: if truncation results in empty rows, break.""" - op = self._make_op() - - rows = [{"Entry": "P00001"}] - for header in FetchUniProtMetadataOperation.FIELD_MAP.values(): - rows[0][header] = "" - rows[0].update( - {"Reviewed": "", "Entry Name": "", "Organism": "", "Gene Names": "", "Length": ""} - ) - - # First page returns 1 row, second page returns 1 row - resp1 = MagicMock() - resp1.status_code = 200 - resp1.headers = {"X-Total-Results": "2", "link": '; rel="next"'} - resp1.content = _make_tsv_content(rows, compressed=True) - - resp2 = MagicMock() - resp2.status_code = 200 - resp2.headers = {"X-Total-Results": "2"} - rows2 = [{"Entry": "P00002"}] - for header in FetchUniProtMetadataOperation.FIELD_MAP.values(): - rows2[0][header] = "" - rows2[0].update( - {"Reviewed": "", "Entry Name": "", "Organism": "", "Gene Names": 
"", "Length": ""} - ) - resp2.content = _make_tsv_content(rows2, compressed=True) - - op._http_client.session.get.side_effect = [resp1, resp2] - - session = MagicMock() - session.query.return_value.filter.return_value.all.return_value = [] - - payload = { - "search_criteria": "organism_id:9606", - "page_size": 1, - "total_limit": 1, - } - - result = op.execute(session, payload, emit=_noop_emit) - # Should stop after first page (total_limit=1, first page gives 1 row) - assert result.result["rows"] == 1 - - def test_x_total_results_none_on_invalid_header(self): - """Line 227: X-Total-Results header with invalid value.""" - op = self._make_op() - - resp = MagicMock() - resp.status_code = 200 - resp.headers = {"X-Total-Results": "not-a-number"} - resp.content = _make_tsv_content([], compressed=True) - op._http_client.session.get.return_value = resp - - session = MagicMock() - payload = {"search_criteria": "test", "page_size": 10} - - op.execute(session, payload, emit=_noop_emit) - assert op._total_results is None - - def test_decode_response_uncompressed(self): - """Line 241-242: uncompressed response decoding.""" - op = self._make_op() - resp = MagicMock() - resp.content = b"Entry\tReviewed\nP00001\treviewed\n" - text = op._decode_response(resp, compressed=False) - assert "P00001" in text - - def test_store_rows_empty_accession_skipped(self): - """Line 275: rows with empty Entry are skipped.""" - op = self._make_op() - session = MagicMock() - session.query.return_value.filter.return_value.all.return_value = [] - - p = FetchUniProtMetadataPayload( - search_criteria="test", - update_protein_core=False, - ) - - rows = [{"Entry": "", "Absorption": "test"}] - for header in FetchUniProtMetadataOperation.FIELD_MAP.values(): - if header not in rows[0]: - rows[0][header] = "" - - touched, upserted = op._store_rows(session, rows, p, _noop_emit) - assert touched == 0 - assert upserted == 0 - - def test_store_rows_update_protein_core_fields(self): - """Lines 296-328: update_protein_core fills in missing fields on Protein.""" - op = self._make_op() - session = MagicMock() - - # No existing metadata - session.query.return_value.filter.return_value.all.return_value = [] - - # Create a mock protein with all None fields - protein = MagicMock() - protein.accession = "P12345" - protein.reviewed = None - protein.entry_name = None - protein.organism = None - protein.gene_name = None - protein.length = None - - # Second query().filter().all() returns proteins - call_count = [0] - - def query_side_effect(*args): - result = MagicMock() - call_count[0] += 1 - if call_count[0] <= 1: - # First call: metadata lookup - result.filter.return_value.all.return_value = [] - else: - # Second call: protein lookup - result.filter.return_value.all.return_value = [protein] - return result - - session.query.side_effect = query_side_effect - - p = FetchUniProtMetadataPayload( - search_criteria="test", - update_protein_core=True, - ) - - row = { - "Entry": "P12345", - "Reviewed": "reviewed", - "Entry Name": "TEST_HUMAN", - "Organism": "Homo sapiens", - "Gene Names": "TEST GENE2", - "Length": "500", - } - for header in FetchUniProtMetadataOperation.FIELD_MAP.values(): - row.setdefault(header, "") - - touched, upserted = op._store_rows(session, [row], p, _noop_emit) - assert protein.reviewed is True - assert protein.entry_name == "TEST_HUMAN" - assert protein.organism == "Homo sapiens" - assert protein.gene_name == "TEST" - assert protein.length == 500 - assert touched == 1 - - def test_store_rows_unreviewed_protein(self): - """Lines 
303-305: reviewed == 'unreviewed' sets pr.reviewed = False.""" - op = self._make_op() - session = MagicMock() - - protein = MagicMock() - protein.accession = "Q99999" - protein.reviewed = None - protein.entry_name = None - protein.organism = None - protein.gene_name = None - protein.length = None - - call_count = [0] - - def query_side_effect(*args): - result = MagicMock() - call_count[0] += 1 - if call_count[0] <= 1: - result.filter.return_value.all.return_value = [] - else: - result.filter.return_value.all.return_value = [protein] - return result - - session.query.side_effect = query_side_effect - - p = FetchUniProtMetadataPayload( - search_criteria="test", - update_protein_core=True, - ) - - row = {"Entry": "Q99999", "Reviewed": "unreviewed"} - for header in FetchUniProtMetadataOperation.FIELD_MAP.values(): - row.setdefault(header, "") - row.setdefault("Entry Name", "") - row.setdefault("Organism", "") - row.setdefault("Gene Names", "") - row.setdefault("Length", "") - - touched, _ = op._store_rows(session, [row], p, _noop_emit) - assert protein.reviewed is False - assert touched == 1 - - def test_store_rows_protein_not_in_db(self): - """Lines 294-295: protein not found in protein_map, no core update.""" - op = self._make_op() - session = MagicMock() - - call_count = [0] - - def query_side_effect(*args): - result = MagicMock() - call_count[0] += 1 - if call_count[0] <= 1: - result.filter.return_value.all.return_value = [] - else: - result.filter.return_value.all.return_value = [] # No proteins - return result - - session.query.side_effect = query_side_effect - - p = FetchUniProtMetadataPayload( - search_criteria="test", - update_protein_core=True, - ) - - row = {"Entry": "UNKNOWN1", "Reviewed": "reviewed"} - for header in FetchUniProtMetadataOperation.FIELD_MAP.values(): - row.setdefault(header, "") - row.setdefault("Entry Name", "") - row.setdefault("Organism", "") - row.setdefault("Gene Names", "") - row.setdefault("Length", "") - - touched, upserted = op._store_rows(session, [row], p, _noop_emit) - assert touched == 0 - # Still upserted metadata - assert upserted == 1 - - def test_load_existing_metadata_chunks(self): - """Line 346: _load_existing_metadata returns existing metadata by canonical.""" - op = self._make_op() - session = MagicMock() - - m1 = MagicMock() - m1.canonical_accession = "P12345" - session.query.return_value.filter.return_value.all.return_value = [m1] - - result = op._load_existing_metadata(session, ["P12345"], chunk_size=10) - assert "P12345" in result - assert result["P12345"] is m1 diff --git a/tests/test_fetch_uniprot_metadata.py b/tests/test_fetch_uniprot_metadata.py index d844dfe..1c7a2a1 100644 --- a/tests/test_fetch_uniprot_metadata.py +++ b/tests/test_fetch_uniprot_metadata.py @@ -111,40 +111,6 @@ def test_search_criteria_is_stripped(self): assert p.search_criteria == "organism_id:9606" -# --------------------------------------------------------------------------- -# Unit tests — _parse_tsv -# --------------------------------------------------------------------------- - - -class TestParseTsv: - def setup_method(self): - self.op = FetchUniProtMetadataOperation() - - def test_parses_basic_tsv(self): - tsv = "Entry\tReviewed\tLength\nP12345\treviewed\t500\nQ99999\tunreviewed\t120\n" - rows = self.op._parse_tsv(tsv) - assert len(rows) == 2 - assert rows[0]["Entry"] == "P12345" - assert rows[0]["Reviewed"] == "reviewed" - assert rows[1]["Length"] == "120" - - def test_empty_tsv_returns_empty(self): - rows = self.op._parse_tsv("") - assert rows == [] - - def 
test_none_values_coerced_to_empty_string(self): - # DictReader returns None for missing fields in some edge cases; - # the implementation maps None -> "" - tsv = "Entry\tReviewed\nP12345\t\n" - rows = self.op._parse_tsv(tsv) - assert rows[0]["Reviewed"] == "" - - def test_header_only_returns_empty(self): - tsv = "Entry\tReviewed\tLength\n" - rows = self.op._parse_tsv(tsv) - assert rows == [] - - # --------------------------------------------------------------------------- # Unit tests — execute() with fully mocked HTTP and DB session # --------------------------------------------------------------------------- @@ -185,7 +151,7 @@ def test_execute_returns_operation_result(self): session = self._mock_session() emit = _capturing_emit() - with patch.object(self.op._http_client.session, "get", return_value=_make_mock_response(TSV_RESPONSE)): + with patch.object(self.op._uniprot_plugin._client.session, "get", return_value=_make_mock_response(TSV_RESPONSE)): result = self.op.execute( session, {"search_criteria": "organism_id:9606", "page_size": 1, "compressed": False}, @@ -201,7 +167,7 @@ def test_execute_emits_start_and_done(self): session = self._mock_session() emit = _capturing_emit() - with patch.object(self.op._http_client.session, "get", return_value=_make_mock_response(TSV_RESPONSE)): + with patch.object(self.op._uniprot_plugin._client.session, "get", return_value=_make_mock_response(TSV_RESPONSE)): self.op.execute( session, {"search_criteria": "organism_id:9606", "compressed": False}, @@ -223,7 +189,7 @@ def test_execute_respects_total_limit(self): session = self._mock_session() emit = _capturing_emit() - with patch.object(self.op._http_client.session, "get", return_value=_make_mock_response(tsv)): + with patch.object(self.op._uniprot_plugin._client.session, "get", return_value=_make_mock_response(tsv)): result = self.op.execute( session, {"search_criteria": "q", "total_limit": 1, "compressed": False}, @@ -241,7 +207,7 @@ def test_execute_inserts_metadata_row(self): session = self._mock_session() emit = _noop_emit - with patch.object(self.op._http_client.session, "get", return_value=_make_mock_response(TSV_RESPONSE)): + with patch.object(self.op._uniprot_plugin._client.session, "get", return_value=_make_mock_response(TSV_RESPONSE)): self.op.execute( session, {"search_criteria": "q", "compressed": False}, @@ -266,7 +232,7 @@ def test_fetch_uniprot_metadata_integration(postgres_url: str): emit = _capturing_emit() with Session(engine, future=True) as session: - with patch.object(op._http_client.session, "get", return_value=_make_mock_response(TSV_RESPONSE)): + with patch.object(op._uniprot_plugin._client.session, "get", return_value=_make_mock_response(TSV_RESPONSE)): result = op.execute( session, { From a80ef8f92eeb57a1dff283a90c617e0592bd1075 Mon Sep 17 00:00:00 2001 From: frapercan Date: Wed, 6 May 2026 10:25:36 +0200 Subject: [PATCH 71/73] feat(api): F2B.1-3 plugin registry endpoints MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds three read-only HTTP endpoints listing plugins discovered via ``importlib.metadata.entry_points``, closing the F2B.1-3 block of master plan v3 in a single coherent router (the three endpoints share their lookup mechanism — splitting them across separate files would be artificial). Endpoints: * ``GET /backends`` — embedding backend plugins (``protea.backends`` group). Today: esm, t5, ankh, esm3c. * ``GET /sources`` — annotation source plugins (``protea.sources`` group). Today: goa, quickgo, uniprot. 
* ``GET /runners`` — experiment runner plugins (``protea.runners`` group). Today: baseline, knn, lightgbm. Response shape: ``` { "group": "protea.backends", "plugins": [ {"name": "esm", "cls": "EsmBackend", "module": "protea_backends.esm:plugin", "extra": {}}, ... ] } ``` The ``extra`` field carries plugin-class-specific metadata read from the loaded instance: today only sources expose ``version``, surfaced as ``extra.version`` (e.g. ``"uniprot-goa"``, ``"quickgo-rest"``). Backends and runners get an empty ``extra``; adding more probe-able metadata is a one-line change inside ``_discover``. Design choices: * No caching. The endpoint re-scans entry_points on every call so a worker that's just been restarted with a newly-installed extra surfaces in the next request without an API restart. The scan is sub-millisecond on the working set of ~10 plugins. * No authentication. These endpoints are public-read by design — they list installed software, not user data. * Plugin loading happens here. Loading the entry_point fires the plugin module's import side effects but should not raise for any first-party plugin (the bootstrapping pattern keeps top- level imports cheap). If a third-party plugin's load raises, the caller surfaces it as a 500 — fail loud beats silently hiding broken installs. * Fixed group whitelist (``_KNOWN_GROUPS``) prevents callers from probing arbitrary entry_points via the same code path. Files: * protea/api/routers/registry.py: new router (~140 LOC) with PluginInfo + PluginListResponse pydantic models, _discover + _list_for helpers, and the three endpoint functions. * protea/api/app.py: add registry_router to the import block and wire it via ``app.include_router(registry_router.router)``. * tests/test_registry_endpoints.py: 16 tests across four classes — TestBackendsEndpoint (5), TestSourcesEndpoint (5), TestRunnersEndpoint (4), TestResponseSchema (2). Tests run against the live entry_points discovery (the 10 plugins are real C-stack siblings installed via path-deps); no mocking. Suite: PROTEA 1105 passed, 12 skipped (was 1089; +16 new), ruff full + mypy strict green on the new files. This closes F2B autonomous-eligible work. F2B.4 (PredictGOTermsBatchOperation extract class) stays in the human-review queue because of reranker sensitivity. Part of F2B of master plan v3. 
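For orientation, a minimal client-side sketch of consuming the three
endpoints. Assumptions: the compose-default base URL
``http://localhost:8000`` and the ``requests`` package on the client;
neither is required by the router itself. This is an illustration of
the documented response shape, not code that ships with the router:

```python
"""Hypothetical client sketch; not part of the router or its tests."""
import requests

BASE = "http://localhost:8000"  # assumption: compose-default API address

for slug in ("backends", "sources", "runners"):
    body = requests.get(f"{BASE}/{slug}", timeout=10).json()
    names = [p["name"] for p in body["plugins"]]  # already sorted by name
    print(f"{body['group']}: {', '.join(names)}")
    for plugin in body["plugins"]:
        # Only sources populate ``extra`` today (e.g. version == "uniprot-goa").
        version = plugin["extra"].get("version")
        if version:
            print(f"  {plugin['name']}: {version}")
```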
--- protea/api/app.py | 2 + protea/api/routers/registry.py | 140 +++++++++++++++++++++++++++++++ tests/test_registry_endpoints.py | 106 +++++++++++++++++++++++ 3 files changed, 248 insertions(+) create mode 100644 protea/api/routers/registry.py create mode 100644 tests/test_registry_endpoints.py diff --git a/protea/api/app.py b/protea/api/app.py index ae77d18..13e8569 100644 --- a/protea/api/app.py +++ b/protea/api/app.py @@ -18,6 +18,7 @@ from protea.api.routers import maintenance as maintenance_router from protea.api.routers import proteins as proteins_router from protea.api.routers import query_sets as query_sets_router +from protea.api.routers import registry as registry_router from protea.api.routers import reranker_models as reranker_models_router from protea.api.routers import scoring as scoring_router from protea.api.routers import showcase as showcase_router @@ -182,6 +183,7 @@ def readiness_check() -> dict[str, str]: app.include_router(support_router.router) app.include_router(datasets_router.router) app.include_router(reranker_models_router.router) + app.include_router(registry_router.router) sphinx_build = project_root / "docs" / "build" / "html" if sphinx_build.exists(): diff --git a/protea/api/routers/registry.py b/protea/api/routers/registry.py new file mode 100644 index 0000000..073406f --- /dev/null +++ b/protea/api/routers/registry.py @@ -0,0 +1,140 @@ +"""Plugin registry endpoints. + +Three read-only endpoints listing the plugins discovered at runtime +via :mod:`importlib.metadata.entry_points`: + + * ``GET /backends`` — embedding backend plugins (``protea.backends``) + * ``GET /sources`` — annotation source plugins (``protea.sources``) + * ``GET /runners`` — experiment runner plugins (``protea.runners``) + +Each response is a flat list of :class:`PluginInfo` records describing +the entry-point name, class, module path, and any plugin-specific +metadata exposed via attributes (e.g. :attr:`AnnotationSource.version`). + +The endpoints are intentionally stateless — they re-scan +``entry_points`` on every call rather than caching, so a worker +that's just been restarted with a newly-installed extra surfaces in +the next request without an API restart. The scan is cheap (sub-ms +on the working set of ~10 plugins). +""" + +from __future__ import annotations + +from importlib.metadata import entry_points +from typing import Any + +from fastapi import APIRouter, HTTPException +from pydantic import BaseModel + +router = APIRouter(tags=["registry"]) + + +_KNOWN_GROUPS = { + "backends": "protea.backends", + "sources": "protea.sources", + "runners": "protea.runners", +} + + +class PluginInfo(BaseModel): + """Metadata for one discovered plugin.""" + + name: str + """Entry-point name (e.g. ``"esm"``, ``"goa"``, ``"lightgbm"``). + Matches the plugin's ``name`` class attribute by convention.""" + + cls: str + """Plugin class name (e.g. ``"EsmBackend"``, ``"GoaSource"``).""" + + module: str + """Fully-qualified entry-point value + (e.g. ``"protea_backends.esm:plugin"``).""" + + extra: dict[str, Any] = {} + """Plugin-specific metadata read from the loaded instance. Today + carries ``version`` for sources; empty for backends and runners.""" + + +class PluginListResponse(BaseModel): + """Response shape for the three registry endpoints.""" + + group: str + """The ``entry_points`` group queried + (e.g. 
``"protea.backends"``).""" + + plugins: list[PluginInfo] + """Sorted (by ``name``) list of discovered plugins.""" + + +def _discover(group: str) -> list[PluginInfo]: + """Resolve all entry points in ``group`` and build their PluginInfo. + + Loading the entry-point fires the plugin module's import side + effects but should not raise for any first-party plugin (the + bootstrapping pattern keeps top-level imports cheap). If a + third-party plugin's load raises, the caller surfaces it as a + 500 — better to fail loud than silently hide a broken install. + """ + discovered: list[PluginInfo] = [] + for ep in entry_points(group=group): + plugin = ep.load() + extra: dict[str, Any] = {} + version = getattr(plugin, "version", None) + if isinstance(version, str): + extra["version"] = version + discovered.append( + PluginInfo( + name=ep.name, + cls=type(plugin).__name__, + module=ep.value, + extra=extra, + ) + ) + discovered.sort(key=lambda p: p.name) + return discovered + + +def _list_for(slug: str) -> PluginListResponse: + """Shared body for the three endpoints — looks up the canonical + ``entry_points`` group from ``_KNOWN_GROUPS`` and returns the + discovered plugins. + """ + group = _KNOWN_GROUPS.get(slug) + if group is None: + raise HTTPException(status_code=404, detail=f"unknown registry: {slug!r}") + return PluginListResponse(group=group, plugins=_discover(group)) + + +@router.get("/backends", response_model=PluginListResponse) +def list_backends() -> PluginListResponse: + """List all installed embedding backend plugins. + + The plugin set depends on which ``protea-backends[]`` + extras are installed (esm, t5, ankh, esm3c). With the default + install all four are discoverable; only the ones whose lazy + imports succeed at ``stream_*`` time will actually run on GPU. + """ + return _list_for("backends") + + +@router.get("/sources", response_model=PluginListResponse) +def list_sources() -> PluginListResponse: + """List all installed annotation source plugins. + + Today: ``goa``, ``quickgo``, ``uniprot`` (all real after + F2A.6-real). The ``extra.version`` field surfaces the + :attr:`AnnotationSource.version` declared on each plugin + (e.g. ``"uniprot-goa"``, ``"quickgo-rest"``).""" + return _list_for("sources") + + +@router.get("/runners", response_model=PluginListResponse) +def list_runners() -> PluginListResponse: + """List all installed experiment runner plugins. + + Today: ``baseline``, ``knn``, ``lightgbm``. The latter two are + contract-surface stubs until F2A.7 (lab → ``protea-runners + .lightgbm`` migration) and F2C.1 (``protea-method`` extraction) + move the real implementations here. + """ + return _list_for("runners") diff --git a/tests/test_registry_endpoints.py b/tests/test_registry_endpoints.py new file mode 100644 index 0000000..c92a15a --- /dev/null +++ b/tests/test_registry_endpoints.py @@ -0,0 +1,106 @@ +"""Tests for the plugin-registry endpoints (T2B.1-3 of master plan v3). + +Three endpoints — ``GET /backends``, ``GET /sources``, ``GET /runners`` +— each list the entry-point-discovered plugins for the corresponding +group. The plugins themselves are real (installed via the C-stack +sibling repos), so these tests run against the live discovery rather +than mocking ``importlib.metadata.entry_points``. 
+""" + +from __future__ import annotations + +import pytest +from fastapi import FastAPI +from fastapi.testclient import TestClient + +from protea.api.routers.registry import router + + +@pytest.fixture +def client() -> TestClient: + app = FastAPI() + app.include_router(router) + return TestClient(app) + + +class TestBackendsEndpoint: + def test_returns_200(self, client: TestClient) -> None: + resp = client.get("/backends") + assert resp.status_code == 200 + + def test_returns_canonical_group(self, client: TestClient) -> None: + body = client.get("/backends").json() + assert body["group"] == "protea.backends" + + def test_lists_all_four_backends(self, client: TestClient) -> None: + body = client.get("/backends").json() + names = {p["name"] for p in body["plugins"]} + assert {"esm", "t5", "ankh", "esm3c"} <= names + + def test_plugin_info_shape(self, client: TestClient) -> None: + body = client.get("/backends").json() + plugin = next(p for p in body["plugins"] if p["name"] == "esm") + assert plugin["cls"] == "EsmBackend" + assert plugin["module"] == "protea_backends.esm:plugin" + assert "extra" in plugin + + def test_plugins_sorted_by_name(self, client: TestClient) -> None: + body = client.get("/backends").json() + names = [p["name"] for p in body["plugins"]] + assert names == sorted(names) + + +class TestSourcesEndpoint: + def test_returns_200(self, client: TestClient) -> None: + resp = client.get("/sources") + assert resp.status_code == 200 + + def test_returns_canonical_group(self, client: TestClient) -> None: + body = client.get("/sources").json() + assert body["group"] == "protea.sources" + + def test_lists_all_three_sources(self, client: TestClient) -> None: + body = client.get("/sources").json() + names = {p["name"] for p in body["plugins"]} + assert names >= {"goa", "quickgo", "uniprot"} + + def test_extra_carries_version(self, client: TestClient) -> None: + body = client.get("/sources").json() + goa = next(p for p in body["plugins"] if p["name"] == "goa") + assert goa["extra"].get("version") == "uniprot-goa" + + def test_uniprot_class_name_is_correct(self, client: TestClient) -> None: + body = client.get("/sources").json() + uniprot = next(p for p in body["plugins"] if p["name"] == "uniprot") + assert uniprot["cls"] == "UniProtSource" + + +class TestRunnersEndpoint: + def test_returns_200(self, client: TestClient) -> None: + resp = client.get("/runners") + assert resp.status_code == 200 + + def test_returns_canonical_group(self, client: TestClient) -> None: + body = client.get("/runners").json() + assert body["group"] == "protea.runners" + + def test_lists_all_three_runners(self, client: TestClient) -> None: + body = client.get("/runners").json() + names = {p["name"] for p in body["plugins"]} + assert names >= {"baseline", "knn", "lightgbm"} + + def test_lightgbm_class_name(self, client: TestClient) -> None: + body = client.get("/runners").json() + lgbm = next(p for p in body["plugins"] if p["name"] == "lightgbm") + assert lgbm["cls"] == "LightgbmRunner" + + +class TestResponseSchema: + def test_plugin_list_response_keys(self, client: TestClient) -> None: + body = client.get("/backends").json() + assert set(body.keys()) == {"group", "plugins"} + + def test_plugin_info_keys(self, client: TestClient) -> None: + body = client.get("/backends").json() + plugin = body["plugins"][0] + assert set(plugin.keys()) == {"name", "cls", "module", "extra"} From e9ae748b78d29569d44c729f190bd67750f3e3c2 Mon Sep 17 00:00:00 2001 From: frapercan Date: Wed, 6 May 2026 10:54:30 +0200 Subject: [PATCH 
72/73] docs: Doc-T11 add "5 minutes to first job" section to README MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds a runnable submit-watch-result loop to PROTEA's README, satisfying the F7.1 acceptance criterion of master plan v3 ("5 minutes to first job") that the original README did not explicitly cover. The previous README documented Docker + From source bring-up but stopped at "scripts/manage.sh start" without showing the end-to-end machinery. The new section lives between Getting started and Documentation and shows three operations the user can run with curl + jq the moment the stack is up: 1. POST /jobs to enqueue a `ping` smoke-test operation, capturing the returned job id. 2. GET /jobs/{id}/events to tail the structured-event stream until the job reaches a terminal state. 3. GET /jobs/{id} to confirm the final status + result + any error code. Plus a sub-section showing the F2B plugin-discovery endpoints (GET /backends, /sources, /runners) that landed in turn 36 — the runtime catalogue the user can probe to see what models / sources / runners the running deployment ships. The example uses `ping` rather than a real ML operation so the quickstart doesn't depend on having sequence data loaded; the intent is to exercise the queue + worker + DB lifecycle end-to- end, which `ping` does in <1s. Real operations (insert_proteins, load_goa_annotations, compute_embeddings, predict_go_terms) are submitted the same way. PROTEA README size: 141 LOC → 187 LOC (+46 LOC). Suite + Sphinx build unchanged; this is doc-only. Part of Doc-T11 of the autonomous loop. Closes the README expansion sweep across the four C-stack repos plus PROTEA itself. --- README.md | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/README.md b/README.md index d10fa0c..f894fa5 100644 --- a/README.md +++ b/README.md @@ -84,6 +84,52 @@ bash scripts/manage.sh start --- +## 5 minutes to your first job + +With the stack running locally, you can submit a job and watch it +move through the queue + worker + DB lifecycle in under 5 minutes. + +```bash +# 1. Submit a `ping` job (the smoke-test operation). +JOB_ID=$(curl -s -X POST http://localhost:8000/jobs \ + -H 'content-type: application/json' \ + -d '{"operation": "ping", "queue_name": "protea.ping", "payload": {}}' \ + | jq -r '.id') +echo "queued: $JOB_ID" + +# 2. Tail the structured-event log until the job reaches a terminal state. +curl -s "http://localhost:8000/jobs/$JOB_ID/events" | jq -c '.[]' +# {"event":"ping.start","fields":null,"level":"info","ts":"..."} +# {"event":"ping.done","fields":{"latency_ms":1.2},"level":"info","ts":"..."} + +# 3. Check the final job row + result. +curl -s "http://localhost:8000/jobs/$JOB_ID" | jq '{status, result, error_code}' +# {"status":"succeeded","result":{"echo":"pong"},"error_code":null} +``` + +That round-trip exercises the full machinery: HTTP enqueue → AMQP +publish → worker claim → operation execute → JobEvent stream → DB +commit → REST query. Real operations (`insert_proteins`, +`load_goa_annotations`, `compute_embeddings`, `predict_go_terms`) +are submitted the same way; their payloads are documented at +`/docs` (Swagger UI) and in the operation-catalog page of the +Sphinx docs. 
+ +Discovering the installed plugins (added in F2B turn 36): + +```bash +curl -s http://localhost:8000/backends | jq '.plugins[].name' +# "ankh", "esm", "esm3c", "t5" + +curl -s http://localhost:8000/sources | jq '.plugins[].name' +# "goa", "quickgo", "uniprot" + +curl -s http://localhost:8000/runners | jq '.plugins[].name' +# "baseline", "knn", "lightgbm" +``` + +--- + ## Documentation Full documentation at **https://protea.readthedocs.io** From ccecf8a2fc8328714f707efa1c77679880ce8b20 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Francisco=20Miguel=20P=C3=A9rez=20Canales?= Date: Wed, 6 May 2026 14:10:00 +0200 Subject: [PATCH 73/73] fix(ci): restore main to green after ~6 weeks of red MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CI rescue: restore main to a green state after ~6 weeks of red The last green CI run on `main` was 2026-03-25 (PR #7). Between that and 2026-05-06, two latent breakages accumulated and were exposed when the F2 phase work landed via 7db0e0d..e9ae748: - `cafaeval-protea` declared as PEP 621 file:/// URL hardcoded to the original developer's machine (introduced 2026-04-21, commit ace4c4a). - Five sibling path-deps (`protea-{contracts,method,sources, runners,backends}`) added during the F2 plugin migration; their internal cross-deps to protea-contracts were also path-based. - `protea-reranker-lab` path-dep on a sibling that wasn't on GitHub at all. - Pre-existing pyproject.toml + workflow gaps: `--only dev` install scope in lint and docs jobs (skipped main deps), missing sphinxcontrib-bibtex declaration, accumulated tech debt across ruff / flake8 / mypy that hadn't been gated for ~6 weeks. What this PR does: 1. Replaces all path-deps with `git+https://github.com/frapercan/` URLs so CI runners can resolve them. The 5 C-stack siblings and protea-reranker-lab were pushed to GitHub as part of this work. Cross-sibling path-deps inside the siblings (e.g. protea-backends pointing at ../protea-contracts) were also converted; otherwise poetry's transitive resolution failed. 2. Fixes integration tests broken by the F2A.6-real migration (op._http_client references, dict→GoaAnnotationRecord fixture conversion, halfvec roundtrip tolerance). 3. Auto-fix + manual cleanup of 18 ruff errors, 10 flake8 spacing violations, and 15 mypy errors (mostly union-attr narrowing asserts and targeted type: ignore on legitimate runtime patterns mypy can't prove safe). 4. Fixes lint + docs workflows to use `poetry install --with dev` instead of `--only dev` so mypy / sphinx autodoc can resolve imports of pyarrow, sqlalchemy, fastapi, etc. 5. Declares sphinxcontrib-bibtex (was installed transitively in the local venv but missing from pyproject.toml). 6. Includes 8 ADR resolutions confirmed during the rescue session (D04 /v1/ versioning, D06 Authentik+oauth2-proxy, D07 Loki+Grafana+OTel, D10 schema_sha v2, D25 HPC mode B primary, D27 ghcr.io, D28 sops+age, D29 semantic-release). Local-dev trade-off: editable cross-sibling installs are lost. Devs who want hot-reload across siblings need to do `pip install -e ../` after `poetry install`. 
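For reviewers skimming point 3 above: the mypy cleanup follows two
recurring patterns, sketched below with stand-in names (``Snapshot``,
``narrowed`` are hypothetical). The sketch is adapted from the
``load_evaluation_data_for_set`` hunk further down in this patch; it
illustrates the pattern rather than copying the changed code.

```python
from typing import Optional


class Snapshot:
    """Stand-in for the ORM annotation-set rows; hypothetical name."""

    ontology_snapshot_id: int = 1


def narrowed(ann_new: Optional[Snapshot], ann_old: Optional[Snapshot]) -> int:
    # Pattern 1: narrowing assert. The caller guarantees at least one of
    # the two sets exists, but mypy cannot see that invariant on its own.
    assert ann_new is not None or ann_old is not None
    # Pattern 2: targeted ignore. The ternary only dereferences ann_old
    # when ann_new is None, which (given the assert) means ann_old is set;
    # mypy still reports union-attr here, so silence exactly this line.
    return ann_new.ontology_snapshot_id if ann_new else ann_old.ontology_snapshot_id  # type: ignore[union-attr]
```

Both patterns leave runtime behaviour untouched; only the type
checker's view of the code changes.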
CI verification on this PR: - lint (3.12, 2.1.0): pass (3m3s) - test (3.12, 2.1.0): pass (3m14s) - integration (3.12, 2.1.0): pass (4m11s) - docs (3.12, 2.1.0): pass (2m57s) - pip-audit, bandit, GitGuardian: pass - codecov informative-only (not in required checks) Local verification matched CI: 1105 unit passed, 1115 integration passed (with --with-postgres), ruff + flake8 + mypy clean, sphinx build succeeds with 5 pre-existing warnings. Includes the LAFA wrapper scaffolding (`apps/lafa_container/*`) that was sitting untracked in the working tree since the early F-LAFA exploration; kept because it has real value as the seed for a future functionbench.net submission. --- .github/workflows/docs.yml | 4 +- .github/workflows/lint.yml | 4 +- apps/lafa_container/__init__.py | 0 apps/lafa_container/protea_main.py | 237 ++++++++++++++++++ apps/lafa_container/prott5_encoder.py | 139 ++++++++++ docs/source/adr/D04-api-versioning.rst | 8 +- docs/source/adr/D06-authentication.rst | 16 +- docs/source/adr/D07-observability-stack.rst | 13 +- docs/source/adr/D10-schema-sha-v2.rst | 14 +- docs/source/adr/D25-hpc-mode.rst | 10 +- docs/source/adr/D27-image-registry.rst | 10 +- docs/source/adr/D28-secrets-management.rst | 13 +- docs/source/adr/D29-release-pipeline.rst | 14 +- poetry.lock | 138 ++++++++-- protea/api/cache.py | 1 + protea/api/routers/jobs.py | 1 + protea/api/routers/scoring.py | 1 + protea/core/contracts/operation.py | 5 +- protea/core/evaluation.py | 5 +- protea/core/operations/compute_embeddings.py | 4 +- protea/core/operations/predict_go_terms.py | 42 ++-- protea/core/operations/run_cafa_evaluation.py | 42 ++-- protea/core/parquet_export.py | 2 +- protea/core/reranker.py | 8 +- protea/core/retry.py | 6 +- protea/core/training_dump_helpers.py | 143 ++++++----- pyproject.toml | 16 +- tests/test_fetch_uniprot_metadata.py | 2 +- tests/test_insert_proteins.py | 2 +- tests/test_integration.py | 33 ++- 30 files changed, 735 insertions(+), 198 deletions(-) create mode 100644 apps/lafa_container/__init__.py create mode 100644 apps/lafa_container/protea_main.py create mode 100644 apps/lafa_container/prott5_encoder.py diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 9b3eca3..4771453 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -27,8 +27,8 @@ jobs: - name: Add poetry to PATH run: echo "$HOME/.local/bin" >> $GITHUB_PATH - - name: Install dev dependencies - run: poetry install --only dev + - name: Install main + dev dependencies + run: poetry install --with dev - name: Build Sphinx docs run: poetry run task html_docs diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index ae0079d..2806082 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -27,8 +27,8 @@ jobs: - name: Add poetry to PATH run: echo "$HOME/.local/bin" >> $GITHUB_PATH - - name: Install dev dependencies - run: poetry install --only dev + - name: Install main + dev dependencies + run: poetry install --with dev - name: ruff check run: poetry run ruff check protea scripts diff --git a/apps/lafa_container/__init__.py b/apps/lafa_container/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/apps/lafa_container/protea_main.py b/apps/lafa_container/protea_main.py new file mode 100644 index 0000000..1eb70fb --- /dev/null +++ b/apps/lafa_container/protea_main.py @@ -0,0 +1,237 @@ +"""LAFA-compatible PROTEA wrapper. 
+ +Entry point that honours the LAFA container CLI contract: + + --query_file FASTA of query sequences + --train_sequences FASTA of training sequences + --annot_file TSV (EntryID, term, aspect) of training annotations + --graph go-basic.obo (currently unused; kept for contract parity) + --output_baseline 3-column TSV output (Query_ID, GO_Term, Score) + +Pipeline: + 1. Mean-pool ProtT5 embeddings for queries and refs (``prott5_encoder``). + 2. Cosine KNN via ``protea.core.knn_search.search_knn`` (numpy backend). + 3. First-hit GO transfer per query (matches PROTEA's ``_predict_batch``). + 4. Score = ``1 - distance`` (cosine, in [0, 1]). + 5. Emit ``\\t\\t``; gzipped if ``--output_baseline`` + ends in ``.gz``. + +Smoke-test focus: integration over fidelity. The ontology graph is accepted +but not consulted — LAFA distributes propagated TSVs in the official splits. +""" + +from __future__ import annotations + +import argparse +import csv +import gzip +import os +import sys +from collections import defaultdict +from pathlib import Path +from typing import Iterator + +import numpy as np + +# Make `protea.core.knn_search` importable when running from a checkout. +_REPO_ROOT = Path(__file__).resolve().parents[2] +if str(_REPO_ROOT) not in sys.path: + sys.path.insert(0, str(_REPO_ROOT)) + +from protea.core.knn_search import search_knn # noqa: E402 + +from prott5_encoder import embed_sequences, fasta_accessions, parse_fasta # noqa: E402 + + +def _open_text(path: str): + return gzip.open(path, "rt") if path.endswith(".gz") else open(path) + + +def _load_annotations(path: str, ref_accessions: set[str]) -> dict[str, list[str]]: + """Return ``{ref_accession: [go_term, ...]}`` filtered to refs we use. + + Dispatches by extension: ``.gaf[.gz]`` → GAF parser (skipping ``NOT`` + qualifiers and ``!`` headers); anything else → TSV with ``EntryID`` / + ``term`` columns in the header. + """ + base = path[:-3] if path.endswith(".gz") else path + if base.endswith(".gaf"): + return _load_annotations_gaf(path, ref_accessions) + return _load_annotations_tsv(path, ref_accessions) + + +def _load_annotations_tsv(path: str, ref_accessions: set[str]) -> dict[str, list[str]]: + go_map: dict[str, list[str]] = defaultdict(list) + with _open_text(path) as handle: + header = handle.readline().rstrip("\n").split("\t") + try: + entry_idx = header.index("EntryID") + term_idx = header.index("term") + except ValueError: + print( + f"[protea_main] Annotation TSV must have header with 'EntryID' and 'term'. " + f"Got: {header}", + file=sys.stderr, + ) + sys.exit(1) + for line in handle: + cols = line.rstrip("\n").split("\t") + if len(cols) <= max(entry_idx, term_idx): + continue + acc = cols[entry_idx] + term = cols[term_idx] + if acc in ref_accessions: + go_map[acc].append(term) + return go_map + + +def _load_annotations_gaf(path: str, ref_accessions: set[str]) -> dict[str, list[str]]: + """Parse a GAF 2.x file. 
Cols: 2=DB_Object_ID, 5=GO_ID, 4=Qualifier.""" + go_map: dict[str, list[str]] = defaultdict(list) + with _open_text(path) as handle: + for raw in handle: + if raw.startswith("!"): + continue + cols = raw.rstrip("\n").split("\t") + if len(cols) < 9: + continue + if "NOT" in cols[3]: + continue + acc = cols[1] + term = cols[4] + if acc in ref_accessions: + go_map[acc].append(term) + return go_map + + +def _open_output(path: str): + if path.endswith(".gz"): + return gzip.open(path, "wt", newline="") + return open(path, "w", newline="") + + +def _stack(embeddings: dict[str, np.ndarray], order: list[str]) -> tuple[np.ndarray, list[str]]: + """Stack embeddings in ``order``, dropping accessions that failed to embed.""" + kept_accs: list[str] = [] + rows: list[np.ndarray] = [] + for acc in order: + vec = embeddings.get(acc) + if vec is None: + continue + kept_accs.append(acc) + rows.append(vec) + if not rows: + return np.empty((0, 0), dtype=np.float32), kept_accs + return np.stack(rows).astype(np.float32, copy=False), kept_accs + + +def _transfer( + query_accs: list[str], + neighbors: list[list[tuple[str, float]]], + go_map: dict[str, list[str]], + *, + keep_self_hits: bool, +) -> Iterator[tuple[str, str, float]]: + """First-hit GO transfer; one ``(query, term, score)`` row per (q, term).""" + for q_acc, top_refs in zip(query_accs, neighbors, strict=False): + seen: set[str] = set() + for ref_acc, distance in top_refs: + if not keep_self_hits and ref_acc == q_acc: + continue + score = max(0.0, 1.0 - float(distance)) + for term in go_map.get(ref_acc, ()): + if term in seen: + continue + seen.add(term) + yield q_acc, term, score + + +def main() -> None: + parser = argparse.ArgumentParser( + description="LAFA-compatible PROTEA KNN wrapper (ProtT5 + cosine KNN + first-hit transfer)." 
+ ) + parser.add_argument("--query_file", "-q", required=True) + parser.add_argument("--train_sequences", required=True) + parser.add_argument("--annot_file", "-a", required=True) + parser.add_argument("--graph", required=True, help="OBO file (currently not consulted).") + parser.add_argument("--output_baseline", "-o", required=True) + parser.add_argument("--k", type=int, default=5, help="KNN neighbours per query (default: 5).") + parser.add_argument("--metric", default="cosine", choices=["cosine", "l2"]) + parser.add_argument("--backend", default="numpy", choices=["numpy", "faiss"]) + parser.add_argument( + "--keep_self_hits", + action="store_true", + help="Keep query==ref hits (default: drop, matching LAFA's prott5_container).", + ) + parser.add_argument( + "--model_dir", + default=os.environ.get("HF_CACHE"), + help="HuggingFace cache dir (default: $HF_CACHE).", + ) + args = parser.parse_args() + + for label, path in ( + ("query", args.query_file), + ("train", args.train_sequences), + ("annot", args.annot_file), + ("graph", args.graph), + ): + if not os.path.exists(path): + print(f"[protea_main] {label} file not found: {path}", file=sys.stderr) + sys.exit(1) + + print(f"[protea_main] reading FASTAs: {args.query_file} / {args.train_sequences}") + query_seqs = parse_fasta(args.query_file) + train_seqs = parse_fasta(args.train_sequences) + print(f"[protea_main] queries={len(query_seqs)} refs={len(train_seqs)}") + + print(f"[protea_main] loading annotations from {args.annot_file}") + go_map = _load_annotations(args.annot_file, set(train_seqs)) + refs_with_anns = [acc for acc in train_seqs if acc in go_map] + print(f"[protea_main] refs with annotations: {len(refs_with_anns)}/{len(train_seqs)}") + if not refs_with_anns: + print("[protea_main] no annotated refs after filter — nothing to transfer.", file=sys.stderr) + sys.exit(2) + + to_embed = {**{a: query_seqs[a] for a in query_seqs}, + **{a: train_seqs[a] for a in refs_with_anns}} + print(f"[protea_main] embedding {len(to_embed)} sequences with ProtT5 mean-pool") + embeddings = embed_sequences(to_embed, cache_dir=args.model_dir) + + query_order = fasta_accessions(args.query_file) + Q, kept_q = _stack(embeddings, query_order) + R, kept_r = _stack(embeddings, refs_with_anns) + print(f"[protea_main] embedding matrix Q={Q.shape} R={R.shape}") + if Q.size == 0 or R.size == 0: + print("[protea_main] empty embedding matrix — aborting.", file=sys.stderr) + sys.exit(3) + + print(f"[protea_main] KNN k={args.k} metric={args.metric} backend={args.backend}") + neighbors = search_knn( + Q, + R, + kept_r, + k=args.k, + metric=args.metric, + backend=args.backend, + ) + + out_path = args.output_baseline + out_dir = os.path.dirname(out_path) + if out_dir: + os.makedirs(out_dir, exist_ok=True) + + n_rows = 0 + with _open_output(out_path) as fh: + writer = csv.writer(fh, delimiter="\t") + for q_acc, term, score in _transfer( + kept_q, neighbors, go_map, keep_self_hits=args.keep_self_hits + ): + writer.writerow([q_acc, term, f"{score:.4f}"]) + n_rows += 1 + + print(f"[protea_main] wrote {n_rows} predictions to {out_path}") + + +if __name__ == "__main__": + main() diff --git a/apps/lafa_container/prott5_encoder.py b/apps/lafa_container/prott5_encoder.py new file mode 100644 index 0000000..a747bbb --- /dev/null +++ b/apps/lafa_container/prott5_encoder.py @@ -0,0 +1,139 @@ +"""Mean-pooled ProtT5 embedder for the LAFA wrapper. 
+ +Standalone version of the encoder used by FANTASIA/PROTEA's ProtT5 backend, +trimmed to the needs of the LAFA contract (FASTA in, ``{accession: vector}`` +out). Mirrors the preprocessing of ``baselines/prott5_container/prott5_embedder.py`` +in the LAFA reference container so embeddings are bit-comparable. +""" + +from __future__ import annotations + +import os +import time +from typing import Iterable + +import numpy as np +import torch +from transformers import T5EncoderModel, T5Tokenizer + +_MODEL_NAME = "Rostlab/prot_t5_xl_half_uniref50-enc" + + +def _load_model(cache_dir: str | None) -> tuple[T5EncoderModel, T5Tokenizer, torch.device]: + device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + model = T5EncoderModel.from_pretrained(_MODEL_NAME, cache_dir=cache_dir) + if device.type == "cpu": + model = model.to(torch.float32) + model = model.to(device).eval() + tokenizer = T5Tokenizer.from_pretrained(_MODEL_NAME, do_lower_case=False, cache_dir=cache_dir) + return model, tokenizer, device + + +def _prepare(seq: str) -> str: + return " ".join(seq.replace("U", "X").replace("Z", "X").replace("O", "X")) + + +def embed_sequences( + sequences: dict[str, str], + *, + cache_dir: str | None = None, + max_residues: int = 4000, + max_seq_len: int = 1000, + max_batch: int = 100, +) -> dict[str, np.ndarray]: + """Return one mean-pooled vector per accession. + + Sorts sequences by descending length so short tails batch efficiently; + falls back to single-sequence processing for sequences > ``max_seq_len``. + """ + if not sequences: + return {} + + model, tokenizer, device = _load_model(cache_dir) + + items = sorted(sequences.items(), key=lambda kv: -len(kv[1])) + embeddings: dict[str, np.ndarray] = {} + + start = time.time() + batch: list[tuple[str, str, int]] = [] + for idx, (acc, seq) in enumerate(items, 1): + prepared = _prepare(seq) + seq_len = len(seq) + batch.append((acc, prepared, seq_len)) + + n_res = sum(s_len for _, _, s_len in batch) + seq_len + flush = ( + len(batch) >= max_batch + or n_res >= max_residues + or idx == len(items) + or seq_len > max_seq_len + ) + if not flush: + continue + + accs, seqs, lens = zip(*batch) + batch = [] + + token_encoding = tokenizer.batch_encode_plus( + list(seqs), add_special_tokens=True, padding="longest" + ) + input_ids = torch.tensor(token_encoding["input_ids"]).to(device) + attention_mask = torch.tensor(token_encoding["attention_mask"]).to(device) + + try: + with torch.no_grad(): + hidden = model(input_ids, attention_mask=attention_mask).last_hidden_state + except RuntimeError as exc: + print(f"[prott5_encoder] OOM/error on batch with longest L={lens[0]}: {exc}") + continue + + for b_idx, ident in enumerate(accs): + s_len = lens[b_idx] + vec = hidden[b_idx, :s_len].mean(dim=0).detach().cpu().numpy().astype(np.float32) + embeddings[ident] = vec + + elapsed = time.time() - start + print( + f"[prott5_encoder] {len(embeddings)} embeddings in {elapsed:.1f}s " + f"({elapsed / max(1, len(embeddings)):.3f}s/protein, device={device})" + ) + return embeddings + + +def parse_fasta(path: str) -> dict[str, str]: + """Read a FASTA file into ``{accession: sequence}``. + + Accession is the substring between the first two ``|`` if present + (UniProt-style ``sp|P12345|name``), else the full id token. 
+ """ + seqs: dict[str, str] = {} + current: str | None = None + with open(path) as handle: + for raw in handle: + line = raw.strip() + if not line: + continue + if line.startswith(">"): + header = line[1:].split()[0] + parts = header.split("|") + current = parts[1] if len(parts) >= 2 else header + seqs[current] = "" + elif current is not None: + seqs[current] += line.upper().replace("-", "") + return seqs + + +def fasta_accessions(path: str) -> list[str]: + """Return accessions in FASTA order (stable for output ordering).""" + accs: list[str] = [] + with open(path) as handle: + for raw in handle: + if raw.startswith(">"): + header = raw[1:].strip().split()[0] + parts = header.split("|") + accs.append(parts[1] if len(parts) >= 2 else header) + return accs + + +def keys_as_array(seqs: Iterable[str]) -> list[str]: + return list(seqs) diff --git a/docs/source/adr/D04-api-versioning.rst b/docs/source/adr/D04-api-versioning.rst index fa06fb3..24d17ed 100644 --- a/docs/source/adr/D04-api-versioning.rst +++ b/docs/source/adr/D04-api-versioning.rst @@ -1,8 +1,9 @@ ADR-D4: API versioning strategy =============================== -:Status: Pending +:Status: Accepted :Date: 2026-05-05 +:Decided: 2026-05-06 (user confirmation) :Phase: F4 :Gate: opens at F4 entry @@ -27,4 +28,7 @@ Consequences Resolution ---------- -Pending; gate opens with F4 (T4.1). +**Accepted as recommended.** Universal ``/v1/`` prefix on all routers. +Implementation when F4 entry opens (T4.1) — keep deprecated unprefixed +mounts for one release to avoid breaking ``protea.ngrok.app`` and +front-end clients during the transition. diff --git a/docs/source/adr/D06-authentication.rst b/docs/source/adr/D06-authentication.rst index 4aaa381..2bb3813 100644 --- a/docs/source/adr/D06-authentication.rst +++ b/docs/source/adr/D06-authentication.rst @@ -1,8 +1,9 @@ ADR-D6: Authentication strategy ================================ -:Status: Pending +:Status: Accepted :Date: 2026-05-05 +:Decided: 2026-05-06 (user confirmation) :Phase: F5 :Gate: opens at F5 entry @@ -31,4 +32,15 @@ Consequences Resolution ---------- -Pending; gate opens with F5 (T5.6). +**Accepted as recommended, with the OIDC provider pinned.** + +- API key path: ``ApiKey`` ORM table + ``Authorization: Bearer …`` for + service-to-service calls (LAFA containers, downstream pipelines). +- OIDC path: **Authentik** as the identity provider behind + ``oauth2-proxy`` for human users. User picked Authentik on + 2026-05-06 ("contra menos custom mejor"); Authentik chosen for its + lighter footprint and simpler Docker-Compose setup vs Keycloak's + JBoss-flavour weight. + +Rate limiting via ``slowapi`` per F5 plan. Implementation gate at F5 +entry (T5.6). diff --git a/docs/source/adr/D07-observability-stack.rst b/docs/source/adr/D07-observability-stack.rst index 43fa242..25bed57 100644 --- a/docs/source/adr/D07-observability-stack.rst +++ b/docs/source/adr/D07-observability-stack.rst @@ -1,8 +1,9 @@ ADR-D7: Observability stack ============================ -:Status: Pending +:Status: Accepted :Date: 2026-05-05 +:Decided: 2026-05-06 (user confirmation) :Phase: F-OPS :Gate: opens at F-OPS entry @@ -32,4 +33,12 @@ Consequences Resolution ---------- -Pending; gate opens with F-OPS (T5.1-T5.4). +**Accepted as recommended.** User confirmation 2026-05-06 ("libre + +fácil + buen funcionamiento"). Loki + Grafana for logs; Prometheus +for metrics; OpenTelemetry for traces. 
Loki chosen over the ELK stack +(Elasticsearch + Kibana) because it indexes labels rather than full +text, has lower memory footprint, and integrates with the same Grafana +that already surfaces Prometheus dashboards. Logs ship via +``loki-docker-driver`` from container stdout (no separate Promtail +sidecar in the cloud target). Implementation gate at F-OPS entry +(T5.1-T5.4). diff --git a/docs/source/adr/D10-schema-sha-v2.rst b/docs/source/adr/D10-schema-sha-v2.rst index 123065c..8a787b7 100644 --- a/docs/source/adr/D10-schema-sha-v2.rst +++ b/docs/source/adr/D10-schema-sha-v2.rst @@ -1,8 +1,9 @@ ADR-D10: ``schema_sha`` v2 parallel migration ============================================== -:Status: Pending +:Status: Accepted (implementation pending) :Date: 2026-05-05 +:Decided: 2026-05-06 (user confirmation) :Phase: F1 :Gate: T1.6 (requires_human, Alembic on live DB) @@ -32,4 +33,13 @@ Consequences Resolution ---------- -Pending human review of the live-DB migration. Rolls in F1 with T1.6. +**Accepted as recommended.** User greenlight 2026-05-06 with the +explicit constraint **"no subir a prod hasta que no esté listo"** — +implementation must land in staging (or a local-DB rehearsal) and the +backfill must be verified there before any production migration. +Implementation order: (1) Alembic migration adding ``schema_sha_v2`` +column, (2) backfill script populating from +``protea_contracts.compute_schema_sha``, (3) regression test exposing +v1/v2 drift on historical rows (rather than retroactively fixing), (4) +inference path reads v2. Production rollout only after staging +verification. diff --git a/docs/source/adr/D25-hpc-mode.rst b/docs/source/adr/D25-hpc-mode.rst index 2597808..4ecaaf2 100644 --- a/docs/source/adr/D25-hpc-mode.rst +++ b/docs/source/adr/D25-hpc-mode.rst @@ -1,8 +1,9 @@ ADR-D25: HPC operation mode ============================= -:Status: Pending +:Status: Accepted (mode B primary, mode C deferred) :Date: 2026-05-05 +:Decided: 2026-05-06 (user confirmation) :Phase: F-OPS :Gate: opens at F-OPS entry @@ -35,4 +36,9 @@ Consequences Resolution ---------- -Pending; gate opens with F-OPS (T-OPS.5, T-OPS.9). +**Accepted with scope adjustment.** User confirmation 2026-05-06: mode +B primary, mode C deferred until **post-defensa** (when contact with +BSC or similar restricted sites becomes concrete). Mode B is sufficient +for the thesis defense scope: stateless PROTEA workers on HPC nodes +connecting to LifeWatch / EOSC-hosted Postgres + RabbitMQ. Mode C +(airgap ``.sif`` bundle) becomes a F9 post-defensa item. diff --git a/docs/source/adr/D27-image-registry.rst b/docs/source/adr/D27-image-registry.rst index f4a3b49..99e0d91 100644 --- a/docs/source/adr/D27-image-registry.rst +++ b/docs/source/adr/D27-image-registry.rst @@ -1,8 +1,9 @@ ADR-D27: Image registry ========================= -:Status: Pending +:Status: Accepted :Date: 2026-05-05 +:Decided: 2026-05-06 (user confirmation) :Phase: F-OPS :Gate: opens at F-OPS entry @@ -27,4 +28,9 @@ Consequences Resolution ---------- -Pending; gate opens with F-OPS (T-OPS.8). +**Accepted as recommended.** ``ghcr.io`` confirmed by user 2026-05-06. +Implementation: GitHub Actions workflow per repo on tag push, login via +the built-in ``GITHUB_TOKEN``, image tag set from the SemVer tag. +Public visibility for ``protea-method-runtime``; org-scoped or private +for internal images if/when needed. Mirror to Docker Hub deferred until +external pull rates demand it. 
diff --git a/docs/source/adr/D28-secrets-management.rst b/docs/source/adr/D28-secrets-management.rst index 93dc08a..ad1a6dd 100644 --- a/docs/source/adr/D28-secrets-management.rst +++ b/docs/source/adr/D28-secrets-management.rst @@ -1,8 +1,9 @@ ADR-D28: Secrets management ============================= -:Status: Pending +:Status: Accepted :Date: 2026-05-05 +:Decided: 2026-05-06 (user confirmation) :Phase: F-OPS :Gate: opens at F-OPS entry @@ -29,4 +30,12 @@ Consequences Resolution ---------- -Pending; gate opens with F-OPS (T-OPS.7). +**Accepted as recommended.** ``sops + age`` confirmed by user +2026-05-06. Two reasons captured: (a) age keys are post-PGP ed25519, +short and revocation-chain-free; (b) sops is file-format agnostic so +the same workflow handles yaml/json/env. First migration target: +``secrets.enc.yaml`` containing DB URL + AMQP URL + MinIO creds + GitHub +release token. Bootstrap script invokes ``sops -d`` before +``manage.sh start``. Per-environment files (``secrets.dev.enc.yaml`` / +``secrets.prod.enc.yaml``). Rotation procedure to be documented at +implementation time. diff --git a/docs/source/adr/D29-release-pipeline.rst b/docs/source/adr/D29-release-pipeline.rst index dbfc8d8..c045e4a 100644 --- a/docs/source/adr/D29-release-pipeline.rst +++ b/docs/source/adr/D29-release-pipeline.rst @@ -1,8 +1,9 @@ ADR-D29: Release pipeline =========================== -:Status: Pending +:Status: Accepted :Date: 2026-05-05 +:Decided: 2026-05-06 (user confirmation) :Phase: F-OPS :Gate: opens at F-OPS entry @@ -33,4 +34,13 @@ Consequences Resolution ---------- -Pending; gate opens with F-OPS (T-OPS.8). +**Accepted with semantic-release tooling.** User confirmation +2026-05-06 ("semantic parece que añade un mejor contexto"). Version +bumps + CHANGELOG generation driven by Conventional Commits parsed by +``semantic-release``: ``feat:`` → minor, ``fix:`` → patch, +``BREAKING CHANGE:`` footer → major. The commit-message style is +already in place from the F2 phase (every commit during F2A.6-real, +F2B, D-MIGR-06, Doc-T11 is conventional). Cross-repo integration test +on tag stays as recommended. Implementation: a ``release.yml`` GitHub +Action per repo + ``semantic-release`` config in ``pyproject.toml`` (or +``.releaserc``). diff --git a/poetry.lock b/poetry.lock index 74124bb..f5d320a 100644 --- a/poetry.lock +++ b/poetry.lock @@ -487,8 +487,10 @@ docs = ["furo (>=2024.1)", "sphinx (>=7)"] fast = ["pyarrow (>=12)"] [package.source] -type = "directory" -url = "../cafaeval-protea" +type = "git" +url = "https://github.com/frapercan/cafaeval-protea.git" +reference = "main" +resolved_reference = "836f35390b36eb808d56abfaab84fcf6d52cda44" [[package]] name = "certifi" @@ -1944,6 +1946,18 @@ files = [ {file = "kiwisolver-1.5.0.tar.gz", hash = "sha256:d4193f3d9dc3f6f79aaed0e5637f45d98850ebf01f7ca20e69457f3e8946b66a"}, ] +[[package]] +name = "latexcodec" +version = "3.0.1" +description = "A lexer and codec to work with LaTeX code in Python." 
+optional = false +python-versions = ">=3.9" +groups = ["dev"] +files = [ + {file = "latexcodec-3.0.1-py3-none-any.whl", hash = "sha256:a9eb8200bff693f0437a69581f7579eb6bca25c4193515c09900ce76451e452e"}, + {file = "latexcodec-3.0.1.tar.gz", hash = "sha256:e78a6911cd72f9dec35031c6ec23584de6842bfbc4610a9678868d14cdfb0357"}, +] + [[package]] name = "librt" version = "0.8.1" @@ -3449,25 +3463,34 @@ optional = false python-versions = ">=3.12,<4.0" groups = ["plugins"] files = [] -develop = true +develop = false [package.dependencies] numpy = ">=1.24" -protea-contracts = {path = "../protea-contracts", develop = true} +protea-contracts = {git = "https://github.com/frapercan/protea-contracts.git", branch = "master"} + +[package.extras] +all = ["esm (>=3.1)", "sentencepiece (>=0.2)", "torch (>=2.1)", "transformers (>=4.40)"] +ankh = ["sentencepiece (>=0.2)", "torch (>=2.1)", "transformers (>=4.40)"] +esm = ["torch (>=2.1)", "transformers (>=4.40)"] +esm3c = ["esm (>=3.1)", "torch (>=2.1)"] +t5 = ["sentencepiece (>=0.2)", "torch (>=2.1)", "transformers (>=4.40)"] [package.source] -type = "directory" -url = "../protea-backends" +type = "git" +url = "https://github.com/frapercan/protea-backends.git" +reference = "master" +resolved_reference = "918631f5dacb9810ad94511efe0aed74e655d4ea" [[package]] name = "protea-contracts" -version = "0.0.1" +version = "0.1.0" description = "Shared ABCs, payload schemas, feature registry contract and compute_schema_sha helper for the PROTEA stack." optional = false python-versions = ">=3.12,<4.0" groups = ["plugins"] files = [] -develop = true +develop = false [package.dependencies] numpy = ">=1.24" @@ -3475,8 +3498,10 @@ pyarrow = ">=14" pydantic = ">=2.5" [package.source] -type = "directory" -url = "../protea-contracts" +type = "git" +url = "https://github.com/frapercan/protea-contracts.git" +reference = "master" +resolved_reference = "3936b70fe6fb1a3a1925009d694a0df8f2a72d91" [[package]] name = "protea-method" @@ -3486,17 +3511,19 @@ optional = false python-versions = ">=3.12,<4.0" groups = ["plugins"] files = [] -develop = true +develop = false [package.dependencies] faiss-cpu = ">=1.7" lightgbm = ">=4.0" numpy = ">=1.24" -protea-contracts = {path = "../protea-contracts", develop = true} +protea-contracts = {git = "https://github.com/frapercan/protea-contracts.git", branch = "master"} [package.source] -type = "directory" -url = "../protea-method" +type = "git" +url = "https://github.com/frapercan/protea-method.git" +reference = "master" +resolved_reference = "7b80e14e49d06390db3960d8d9fd293a9eebeae3" [[package]] name = "protea-reranker-lab" @@ -3506,7 +3533,7 @@ optional = false python-versions = ">=3.11" groups = ["dev"] files = [] -develop = true +develop = false [package.dependencies] lightgbm = ">=4.3" @@ -3519,8 +3546,10 @@ scikit-learn = ">=1.4" wandb = ">=0.16" [package.source] -type = "directory" -url = "../protea-reranker-lab" +type = "git" +url = "https://github.com/frapercan/protea-reranker-lab.git" +reference = "main" +resolved_reference = "8f7ae938ac4ea4d44cc69dfd394fbe04b8ae1537" [[package]] name = "protea-runners" @@ -3530,17 +3559,19 @@ optional = false python-versions = ">=3.12,<4.0" groups = ["plugins"] files = [] -develop = true +develop = false [package.dependencies] lightgbm = ">=4.0" numpy = ">=1.24" -protea-contracts = {path = "../protea-contracts", develop = true} +protea-contracts = {git = "https://github.com/frapercan/protea-contracts.git", branch = "master"} pyarrow = ">=14" [package.source] -type = "directory" -url = "../protea-runners" 
+type = "git" +url = "https://github.com/frapercan/protea-runners.git" +reference = "master" +resolved_reference = "3a6619c3d561e2218f9a91ee359e5673ab0d0d1f" [[package]] name = "protea-sources" @@ -3550,15 +3581,17 @@ optional = false python-versions = ">=3.12,<4.0" groups = ["plugins"] files = [] -develop = true +develop = false [package.dependencies] -protea-contracts = {path = "../protea-contracts", develop = true} +protea-contracts = {git = "https://github.com/frapercan/protea-contracts.git", branch = "master"} requests = ">=2.31" [package.source] -type = "directory" -url = "../protea-sources" +type = "git" +url = "https://github.com/frapercan/protea-sources.git" +reference = "master" +resolved_reference = "1a8a139f443eb7566acb896af6af70860a7ed619" [[package]] name = "protobuf" @@ -3788,6 +3821,38 @@ files = [ {file = "pyarrow-23.0.1.tar.gz", hash = "sha256:b8c5873e33440b2bc2f4a79d2b47017a89c5a24116c055625e6f2ee50523f019"}, ] +[[package]] +name = "pybtex" +version = "0.26.1" +description = "A BibTeX-compatible bibliography processor in Python" +optional = false +python-versions = ">=3.8" +groups = ["dev"] +files = [ + {file = "pybtex-0.26.1-py3-none-any.whl", hash = "sha256:e26c0412cc54f5f21b2a6d9d175762a2d2af9ccf3a8f651cdb89ec035db77aa1"}, + {file = "pybtex-0.26.1.tar.gz", hash = "sha256:2e5543bea424e60e9e42eef70bff597be48649d8f68ba061a7a092b2477d5464"}, +] + +[package.dependencies] +latexcodec = ">=1.0.4" +pyyaml = ">=3.1" + +[[package]] +name = "pybtex-docutils" +version = "1.0.3" +description = "A docutils backend for pybtex." +optional = false +python-versions = ">=3.7" +groups = ["dev"] +files = [ + {file = "pybtex-docutils-1.0.3.tar.gz", hash = "sha256:3a7ebdf92b593e00e8c1c538aa9a20bca5d92d84231124715acc964d51d93c6b"}, + {file = "pybtex_docutils-1.0.3-py3-none-any.whl", hash = "sha256:8fd290d2ae48e32fcb54d86b0efb8d573198653c7e2447d5bec5847095f430b9"}, +] + +[package.dependencies] +docutils = ">=0.14" +pybtex = ">=0.16" + [[package]] name = "pycodestyle" version = "2.14.0" @@ -4943,6 +5008,27 @@ lint = ["mypy", "ruff (==0.5.5)", "types-docutils"] standalone = ["Sphinx (>=5)"] test = ["pytest"] +[[package]] +name = "sphinxcontrib-bibtex" +version = "2.7.0" +description = "Sphinx extension for BibTeX style citations." 
+optional = false +python-versions = ">=3.10" +groups = ["dev"] +files = [ + {file = "sphinxcontrib_bibtex-2.7.0-py3-none-any.whl", hash = "sha256:28cf0ec7a957d1c7548d5749317ed472ce877e1b629f430f88e3789aa51f87b1"}, + {file = "sphinxcontrib_bibtex-2.7.0.tar.gz", hash = "sha256:fee700f7aae29bb8f654c62913f00d34ac44fc0b8ca0fa67ac922ff4453addee"}, +] + +[package.dependencies] +docutils = ">=0.20" +pybtex = ">=0.25" +pybtex-docutils = ">=1.0.2" +Sphinx = ">=7.4" + +[package.extras] +test = ["pytest", "pytest-cov", "sphinx-autoapi"] + [[package]] name = "sphinxcontrib-devhelp" version = "2.0.0" @@ -6091,4 +6177,4 @@ storage = ["minio"] [metadata] lock-version = "2.1" python-versions = ">=3.12,<4.0" -content-hash = "9d6d78a4fadabf20d04a86de20085774d552e1b099c2ba704c567cbfacef676a" +content-hash = "b79797f6fa6c70bda1fa4268a3b0ebb5cb6cbfa4decc0300f8ea8dfd734cdaae" diff --git a/protea/api/cache.py b/protea/api/cache.py index ea64450..870755e 100644 --- a/protea/api/cache.py +++ b/protea/api/cache.py @@ -22,6 +22,7 @@ def _default_ttl() -> float: """Resolved each call so env/yaml overrides apply at runtime.""" return get_tuning().worker.api_cache_default_ttl_seconds + _lock = threading.Lock() _store: dict[str, tuple[float, Any]] = {} diff --git a/protea/api/routers/jobs.py b/protea/api/routers/jobs.py index 4e227b1..0a57f9b 100644 --- a/protea/api/routers/jobs.py +++ b/protea/api/routers/jobs.py @@ -55,6 +55,7 @@ def _operation_metadata( summary = rendered or None return description, summary + router = APIRouter(prefix="/jobs", tags=["jobs"]) diff --git a/protea/api/routers/scoring.py b/protea/api/routers/scoring.py index e657982..e4c4070 100644 --- a/protea/api/routers/scoring.py +++ b/protea/api/routers/scoring.py @@ -89,6 +89,7 @@ def _load_booster(rm: RerankerModel) -> Any: ), ) + router = APIRouter(prefix="/scoring", tags=["scoring"]) # --------------------------------------------------------------------------- diff --git a/protea/core/contracts/operation.py b/protea/core/contracts/operation.py index 3304982..82f45de 100644 --- a/protea/core/contracts/operation.py +++ b/protea/core/contracts/operation.py @@ -7,12 +7,11 @@ from typing import Any, Literal, Protocol from uuid import UUID -from sqlalchemy.orm import Session - # T1.5 of master plan v3: ProteaPayload is owned by protea-contracts. # Re-export here so existing imports of ``ProteaPayload`` from this # module keep working; new code should import from ``protea_contracts``. -from protea_contracts import ProteaPayload +from protea_contracts import ProteaPayload as ProteaPayload # noqa: F401 # re-export +from sqlalchemy.orm import Session Level = Literal["info", "warning", "error"] EmitFn = Callable[[str, str | None, dict[str, Any], Level], None] diff --git a/protea/core/evaluation.py b/protea/core/evaluation.py index 58b5bdd..93e2db1 100644 --- a/protea/core/evaluation.py +++ b/protea/core/evaluation.py @@ -732,7 +732,10 @@ def load_evaluation_data_for_set(session: Session, eval_set) -> tuple[Evaluation if pivot_raw: pivot_id = uuid.UUID(str(pivot_raw)) else: - pivot_id = ann_new.ontology_snapshot_id if ann_new else ann_old.ontology_snapshot_id + # Both ann_new and ann_old are validated non-None by the caller + # (run_cafa_evaluation); the ternary short-circuits before + # dereferencing ann_old when ann_new is set. 
+ pivot_id = ann_new.ontology_snapshot_id if ann_new else ann_old.ontology_snapshot_id # type: ignore[union-attr] if not eval_set.groundtruth_uri: raise RuntimeError( diff --git a/protea/core/operations/compute_embeddings.py b/protea/core/operations/compute_embeddings.py index 768ee5a..fb856ea 100644 --- a/protea/core/operations/compute_embeddings.py +++ b/protea/core/operations/compute_embeddings.py @@ -10,16 +10,14 @@ import numpy as np from pydantic import Field, field_validator from sqlalchemy import exists, select -from sqlalchemy import update as sa_update from sqlalchemy.dialects.postgresql import insert as pg_insert from sqlalchemy.orm import Session from protea.core.contracts.operation import EmitFn, OperationResult, ProteaPayload, RetryLaterError from protea.core.contracts.parent_progress import update_parent_progress -from protea.core.utils import utcnow from protea.infrastructure.orm.models.embedding.embedding_config import EmbeddingConfig from protea.infrastructure.orm.models.embedding.sequence_embedding import SequenceEmbedding -from protea.infrastructure.orm.models.job import Job, JobEvent, JobStatus +from protea.infrastructure.orm.models.job import Job, JobStatus from protea.infrastructure.orm.models.protein.protein import Protein from protea.infrastructure.orm.models.query.query_set import QuerySetEntry from protea.infrastructure.orm.models.sequence.sequence import Sequence diff --git a/protea/core/operations/predict_go_terms.py b/protea/core/operations/predict_go_terms.py index 54ec1cd..d124eb3 100644 --- a/protea/core/operations/predict_go_terms.py +++ b/protea/core/operations/predict_go_terms.py @@ -7,12 +7,11 @@ from uuid import UUID import numpy as np -from sqlalchemy import update as sa_update from sqlalchemy.dialects.postgresql import insert as pg_insert from sqlalchemy.orm import Session from protea.core.annotation_intern import intern_string -from protea.core.contracts.operation import EmitFn, OperationResult, ProteaPayload +from protea.core.contracts.operation import EmitFn, OperationResult from protea.core.contracts.parent_progress import update_parent_progress from protea.core.disk_cache import ( _aspect_index_path, @@ -38,7 +37,6 @@ infer_active_feature_families, load_reranker, ) -from protea.core.utils import utcnow from protea.infrastructure.orm.models.annotation.annotation_set import AnnotationSet from protea.infrastructure.orm.models.annotation.go_term import GOTerm from protea.infrastructure.orm.models.annotation.ontology_snapshot import OntologySnapshot @@ -48,7 +46,7 @@ from protea.infrastructure.orm.models.embedding.prediction_set import PredictionSet from protea.infrastructure.orm.models.embedding.reranker_model import RerankerModel from protea.infrastructure.orm.models.embedding.sequence_embedding import SequenceEmbedding -from protea.infrastructure.orm.models.job import Job, JobEvent, JobStatus +from protea.infrastructure.orm.models.job import Job, JobStatus from protea.infrastructure.orm.models.protein.protein import Protein from protea.infrastructure.orm.models.query.query_set import QuerySet, QuerySetEntry from protea.infrastructure.orm.models.sequence.sequence import Sequence @@ -163,7 +161,6 @@ def _row_from_prediction( StorePredictionsPayload, ) - # --------------------------------------------------------------------------- # Coordinator # --------------------------------------------------------------------------- @@ -621,9 +618,7 @@ def execute( and ref_unified[a]["embeddings_f32"].size ] pca_pool = ( - np.concatenate(pools, axis=0) - if 
pools - else np.empty((0,), dtype=np.float32) + np.concatenate(pools, axis=0) if pools else np.empty((0,), dtype=np.float32) ) else: pca_pool = ref_unified.get("embeddings_f32", np.empty((0,), dtype=np.float32)) @@ -662,6 +657,7 @@ def execute( expand_predictions_to_ancestors, load_parent_map, ) + # predict_go_terms keys candidates by integer ``go_term_id``; # the expansion helper (and parent_map) operate on string GO # accessions (``"GO:0006357"``). Materialise the map once for @@ -694,7 +690,8 @@ def execute( # (the helper just clones the leaf record). Resolve the FK so # store_predictions can insert the row. ancestor_strs = { - rec["go_id"] for rec in prediction_dicts + rec["go_id"] + for rec in prediction_dicts if rec.get("go_id") and rec["go_id"] not in {v for v in int_to_str.values()} } if ancestor_strs: @@ -720,18 +717,14 @@ def execute( { "rows_before": n_before, "rows_after": len(prediction_dicts), - "expansion_ratio": ( - len(prediction_dicts) / n_before if n_before else 0.0 - ), + "expansion_ratio": (len(prediction_dicts) / n_before if n_before else 0.0), }, "info", ) reranker_stats: dict[str, Any] | None = None if p.reranker_model_id and prediction_dicts: - reranker_stats = self._apply_reranker_if_aligned( - session, prediction_dicts, p, emit - ) + reranker_stats = self._apply_reranker_if_aligned(session, prediction_dicts, p, emit) elapsed = time.perf_counter() - t0 @@ -761,7 +754,7 @@ def execute( store_chunk_size = get_tuning().operation.store_chunk_size chunks: list[list[dict[str, Any]]] = [ - prediction_dicts[s:s + store_chunk_size] + prediction_dicts[s : s + store_chunk_size] for s in range(0, len(prediction_dicts), store_chunk_size) ] or [[]] store_messages: list[tuple[str, dict[str, Any]]] = [] @@ -905,13 +898,13 @@ def _attach_go_term_aspect( write it back onto each prediction dict so the reranker's categorical feature is populated. """ - unique_ids = {rec["go_term_id"] for rec in prediction_dicts if rec.get("go_term_id") is not None} + unique_ids = { + rec["go_term_id"] for rec in prediction_dicts if rec.get("go_term_id") is not None + } if not unique_ids: return aspect_by_id: dict[int, str] = dict( - session.query(GOTerm.id, GOTerm.aspect) - .filter(GOTerm.id.in_(unique_ids)) - .all() + session.query(GOTerm.id, GOTerm.aspect).filter(GOTerm.id.in_(unique_ids)).all() ) for rec in prediction_dicts: gid = rec.get("go_term_id") @@ -1224,9 +1217,7 @@ def _run_aspect_separated_knn( # ── 2. Load feature-engineering inputs over the union of neighbors ── ref_sequences, query_sequences, ref_tax_ids, query_tax_ids = ( - self._load_feature_engineering_data( - session, p, valid_accessions, all_unique_neighbors - ) + self._load_feature_engineering_data(session, p, valid_accessions, all_unique_neighbors) ) # Build predictions per aspect, merging into a single list. 
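The coordinator hunk above calls ``self._apply_reranker_if_aligned(session, prediction_dicts, p, emit)``, but the helper's body is outside this diff. A minimal sketch of the intended gate, with the loader and schema helpers injected as parameters and the ``feature_schema_sha`` attribute name assumed (only ``load_reranker`` and ``infer_active_feature_families`` are visible in the imports above)::

    from typing import Any, Callable

    EmitFn = Callable[[str, str | None, dict[str, Any], str], None]

    def apply_reranker_if_aligned(
        rm: Any,                                  # RerankerModel ORM row
        prediction_dicts: list[dict[str, Any]],
        emit: EmitFn,
        *,
        infer_families: Callable[[list[dict[str, Any]]], list[str]],
        schema_sha: Callable[[list[str]], str],
        load_booster: Callable[[Any], Any],
    ) -> dict[str, Any] | None:
        """Sketch: strict sha-equality gate; a mismatch degrades to raw KNN scores."""
        expected = getattr(rm, "feature_schema_sha", None)   # assumed column name
        actual = schema_sha(infer_families(prediction_dicts))
        if expected and actual != expected:
            emit(
                "reranker.schema_mismatch",
                None,
                {"expected": expected, "actual": actual},
                "warning",
            )
            return None                                      # keep raw KNN scores, never abort the batch
        booster = load_booster(rm)
        # ... rescore prediction_dicts in place with ``booster`` ...
        return {"rows_scored": len(prediction_dicts), "model": rm.name}

The real helper may resolve the booster artifact and compute the schema hash differently; the sketch only fixes the control flow: compare, warn and skip on mismatch, rescore otherwise.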
@@ -1430,9 +1421,7 @@ def _run_knn_per_aspect( continue ref_f32 = ( - aspect_refs["embeddings_f32_cos"] - if use_cos - else aspect_refs["embeddings_f32"] + aspect_refs["embeddings_f32_cos"] if use_cos else aspect_refs["embeddings_f32"] ) aspect_neighbors = search_knn( query_embeddings, @@ -1779,7 +1768,6 @@ def _predict_batch( # ── v6 reranker features ───────────────────────────────────────────────── - # ── feature-engineering helpers ─────────────────────────────────────────── def _load_sequences_for_proteins( diff --git a/protea/core/operations/run_cafa_evaluation.py b/protea/core/operations/run_cafa_evaluation.py index 3c8d105..c6af82c 100644 --- a/protea/core/operations/run_cafa_evaluation.py +++ b/protea/core/operations/run_cafa_evaluation.py @@ -18,8 +18,6 @@ from protea.core.evaluation import load_evaluation_data_for_set from protea.core.reranker import load_reranker from protea.core.scoring import compute_score -from protea.infrastructure.settings import load_settings as _load_settings_for_reranker -from protea.infrastructure.storage import get_artifact_store as _get_store_for_reranker from protea.infrastructure.orm.models.annotation.evaluation_result import EvaluationResult from protea.infrastructure.orm.models.annotation.evaluation_set import EvaluationSet from protea.infrastructure.orm.models.annotation.go_term import GOTerm @@ -31,13 +29,16 @@ ) from protea.infrastructure.orm.models.embedding.scoring_config import ScoringConfig from protea.infrastructure.settings import load_settings +from protea.infrastructure.settings import load_settings as _load_settings_for_reranker from protea.infrastructure.storage import get_artifact_store +from protea.infrastructure.storage import get_artifact_store as _get_store_for_reranker def eval_artifact_key(result_id: uuid.UUID, relpath: str) -> str: """Canonical MinIO/artifact-store key for a cafaeval output file.""" return f"eval_artifacts/{result_id}/{relpath.lstrip('/')}" + # Namespace labels used by cafaeval OBO parser. The full names come from # the obo file; we map them to PROTEA's canonical CAFA codes. _NS_LABELS: dict[str, str] = { @@ -168,11 +169,7 @@ def _patch_query_known_features( count_col[row_indices] = float(len(known)) if not known: continue - known_rows = [ - idx_of_go[g] - for g in known - if g in idx_of_go and has_emb_mask[idx_of_go[g]] - ] + known_rows = [idx_of_go[g] for g in known if g in idx_of_go and has_emb_mask[idx_of_go[g]]] if not known_rows: continue kmat = all_norm[known_rows] @@ -190,9 +187,7 @@ def _patch_query_known_features( cos_col[ridx] = float(cand_vec @ centroid_unit) maxcos_col[ridx] = float((kmat @ cand_vec).max()) - df["anc2vec_query_known_cos"] = pd.Series(cos_col, index=df.index).replace( - {np.nan: pd.NA} - ) + df["anc2vec_query_known_cos"] = pd.Series(cos_col, index=df.index).replace({np.nan: pd.NA}) df["anc2vec_query_known_maxcos"] = pd.Series(maxcos_col, index=df.index).replace( {np.nan: pd.NA} ) @@ -288,6 +283,7 @@ def summarize_payload(self, payload: dict[str, Any], *, session: Session | None from protea.infrastructure.orm.models.embedding.embedding_config import ( EmbeddingConfig, ) + cfg = session.get(EmbeddingConfig, pred.embedding_config_id) if cfg is not None: label = cfg.display_name or cfg.model_name or str(cfg.id)[:8] @@ -534,15 +530,20 @@ def _resolve_model_bundle(rm: RerankerModelORM) -> dict[str, Any]: # delta proteins outside the PredictionSet's query coverage hurt # Fmax / coverage despite the booster being unable to score them. 
if p.restrict_gt_to_predicted: - from sqlalchemy import select, distinct + from sqlalchemy import distinct, select + from protea.infrastructure.orm.models.embedding.go_prediction import ( GOPrediction as _GP, ) + predicted_set: set[str] = set( session.execute( - select(distinct(_GP.protein_accession)) - .where(_GP.prediction_set_id == pred_set_id) - ).scalars().all() + select(distinct(_GP.protein_accession)).where( + _GP.prediction_set_id == pred_set_id + ) + ) + .scalars() + .all() ) _orig_counts = (len(data.nk), len(data.lk), len(data.pk)) data = type(data)( @@ -557,9 +558,12 @@ def _resolve_model_bundle(rm: RerankerModelORM) -> dict[str, Any]: None, { "predicted_proteins": len(predicted_set), - "nk_before": _orig_counts[0], "nk_after": len(data.nk), - "lk_before": _orig_counts[1], "lk_after": len(data.lk), - "pk_before": _orig_counts[2], "pk_after": len(data.pk), + "nk_before": _orig_counts[0], + "nk_after": len(data.nk), + "lk_before": _orig_counts[1], + "lk_after": len(data.lk), + "pk_before": _orig_counts[2], + "pk_after": len(data.pk), }, "info", ) @@ -1029,7 +1033,9 @@ def _write_predictions_per_aspect( continue model = model_from_string(bundle["model"]) df.loc[mask, "score"] = reranker_predict( - model, df.loc[mask], categorical_codes=bundle.get("cat_codes"), + model, + df.loc[mask], + categorical_codes=bundle.get("cat_codes"), ) # Fallback for aspects without a model diff --git a/protea/core/parquet_export.py b/protea/core/parquet_export.py index fa046ae..81c5434 100644 --- a/protea/core/parquet_export.py +++ b/protea/core/parquet_export.py @@ -26,10 +26,10 @@ from typing import Any import pandas as pd +from protea_contracts import compute_schema_sha as _canonical_schema_sha from protea.core.reranker import ALL_FEATURES, LABEL_COLUMN from protea.infrastructure.storage import ArtifactStore -from protea_contracts import compute_schema_sha as _canonical_schema_sha logger = logging.getLogger(__name__) diff --git a/protea/core/reranker.py b/protea/core/reranker.py index a5ecfdd..0d57abf 100644 --- a/protea/core/reranker.py +++ b/protea/core/reranker.py @@ -28,8 +28,6 @@ import numpy as np import pandas as pd -from protea.infrastructure.storage import ArtifactStore, LocalFsArtifactStore - # T1.5 of master plan v3: the feature schema is owned by protea-contracts. # Re-export here so existing call sites that import from # ``protea.core.reranker`` keep working; new code should import from @@ -42,6 +40,8 @@ NUMERIC_FEATURES, ) +from protea.infrastructure.storage import ArtifactStore, LocalFsArtifactStore + logger = logging.getLogger(__name__) @@ -167,7 +167,9 @@ def predict( # not seen at training (rare evidence codes etc.) fall to # -1 (missing), matching how the lab handled NaN. mapping = {v: i for i, v in enumerate(categorical_codes[col])} - X[col] = s.map(lambda v: mapping.get(v, -1)).astype("int64") + # Bind ``mapping`` at lambda-definition time so the + # closure does not see a later iteration's value (B023). + X[col] = s.map(lambda v, m=mapping: m.get(v, -1)).astype("int64") else: # No code map — fall back to the (broken-for-small-batch) # legacy path. Logged as a warning by callers. 
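The ``categorical_codes`` hunk just above binds ``mapping`` as a default argument so the per-column lambda does not late-bind the loop variable (ruff B023). A generic illustration of the pitfall, not project code::

    # Closures capture the variable ``mapping``, not its value at definition time.
    funcs_late, funcs_bound = [], []
    for mapping in ({"a": 0}, {"b": 1}):
        funcs_late.append(lambda v: mapping.get(v, -1))        # B023: late binding
        funcs_bound.append(lambda v, m=mapping: m.get(v, -1))  # value frozen per iteration

    print(funcs_late[0]("a"))   # -1: both late lambdas see the final mapping {"b": 1}
    print(funcs_bound[0]("a"))  # 0: the default argument captured the first mapping

In ``predict`` the ``s.map(...)`` call consumes the lambda within the same iteration, so the rewrite mainly silences the linter and guards against any future deferral of the mapping step.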
diff --git a/protea/core/retry.py b/protea/core/retry.py index ac67b3e..9d705f3 100644 --- a/protea/core/retry.py +++ b/protea/core/retry.py @@ -60,7 +60,11 @@ def is_retryable(exc: BaseException) -> bool: R = TypeVar("R") -def with_retry( +# PEP 612 strict says no params allowed between *args: P.args and +# **kwargs: P.kwargs; named keyword-only knobs bind correctly at runtime +# (Python binds explicit names before falling through to **kwargs). +# PEP 695 syntax churn deferred. +def with_retry( # type: ignore[valid-type] # noqa: UP047 fn: Callable[P, R], *args: P.args, max_attempts: int = 3, diff --git a/protea/core/training_dump_helpers.py b/protea/core/training_dump_helpers.py index cd3d340..a15f2e2 100644 --- a/protea/core/training_dump_helpers.py +++ b/protea/core/training_dump_helpers.py @@ -23,7 +23,7 @@ import uuid from dataclasses import dataclass from pathlib import Path -from typing import Annotated, Any +from typing import Annotated, Any, cast import numpy as np import pandas as pd @@ -47,7 +47,6 @@ ALL_FEATURES, EMBEDDING_PCA_DIM, LABEL_COLUMN, - fit_embedding_pca, ) from protea.infrastructure.orm.models.annotation.annotation_set import AnnotationSet from protea.infrastructure.orm.models.annotation.evaluation_set import EvaluationSet @@ -113,7 +112,6 @@ class StreamOutput: # pruned it (no production code referenced it). - # --------------------------------------------------------------------------- # Dataset-export pipeline helpers # @@ -125,9 +123,7 @@ class StreamOutput: # --------------------------------------------------------------------------- -def _load_parent_map( - session: Session, snapshot_id: uuid.UUID -) -> dict[str, set[str]]: +def _load_parent_map(session: Session, snapshot_id: uuid.UUID) -> dict[str, set[str]]: """Return ``{child_go_id: {parent_go_id, ...}}`` for is_a + part_of edges. Used to drive True-Path-Rule max-propagation of predicted scores @@ -150,6 +146,7 @@ def _load_parent_map( parent_map.setdefault(str(child), set()).add(str(parent)) return parent_map + # ── bulk embedding preload (used by dump_helper) ───────────── @@ -296,6 +293,7 @@ def _build_reference_from_cache( return result + # ── reference embeddings per aspect ─────────────────────────────────── @@ -341,6 +339,7 @@ def _load_taxonomy_ids( result[acc] = int(tid) if tid else None return result + # ── KNN + transfer + label ──────────────────────────────────────────── @@ -518,11 +517,15 @@ def _knn_transfer_and_label( continue feats: dict[str, Any] = {} if do_alignments: + # do_alignments implies both dicts are non-None. + assert query_sequences is not None and ref_sequences is not None q_seq = query_sequences.get(q_acc, "") r_seq = ref_sequences.get(ref_acc, "") if q_seq and r_seq: feats.update(compute_alignment(q_seq, r_seq)) if do_taxonomy: + # do_taxonomy implies both dicts are non-None. 
+ assert query_tax_ids is not None and ref_tax_ids is not None q_tid = query_tax_ids.get(q_acc) r_tid = ref_tax_ids.get(ref_acc) feats.update(compute_taxonomy(q_tid, r_tid)) @@ -534,8 +537,12 @@ def _knn_transfer_and_label( if now - _hb_last >= _hb_interval: _LOG.info( "pair_features heartbeat: pairs=%d aspect=%s q_idx=%d/%d elapsed=%.1fs rate=%.0f/s", - _hb_n, aspect, q_idx, len(valid_queries), - now - _hb_t0, _hb_n / max(1e-9, now - _hb_t0), + _hb_n, + aspect, + q_idx, + len(valid_queries), + now - _hb_t0, + _hb_n / max(1e-9, now - _hb_t0), ) _hb_last = now @@ -548,9 +555,16 @@ def _knn_transfer_and_label( # tax_voters_same_frac — fraction of voters in same organism # tax_voters_close_frac — fraction in "close relatives" bucket # tax_voters_mean_common_ancestors — mean lineage overlap across voters - _CLOSE_RELATIONS = frozenset({ - "same", "ancestor", "descendant", "child", "parent", "close", - }) + _CLOSE_RELATIONS = frozenset( + { + "same", + "ancestor", + "descendant", + "child", + "parent", + "close", + } + ) tax_same_cnt: dict[str, dict[int, int]] = {} tax_close_cnt: dict[str, dict[int, int]] = {} tax_ca_sum: dict[str, dict[int, float]] = {} @@ -650,11 +664,7 @@ def _knn_transfer_and_label( if not known: query_known_info[q_acc] = (None, None, 0) continue - rows = [ - idx_of_go[g] - for g in known - if g in idx_of_go and has_emb_mask[idx_of_go[g]] - ] + rows = [idx_of_go[g] for g in known if g in idx_of_go and has_emb_mask[idx_of_go[g]]] if not rows: query_known_info[q_acc] = (None, None, len(known)) continue @@ -671,9 +681,9 @@ def _knn_transfer_and_label( pca_query_proj: np.ndarray | None = None if pca_state is not None and query_emb.size: pca_mean, pca_components = pca_state - pca_query_proj = ( - (query_emb.astype(np.float32) - pca_mean) @ pca_components.T - ).astype(np.float32) + pca_query_proj = ((query_emb.astype(np.float32) - pca_mean) @ pca_components.T).astype( + np.float32 + ) # Build labeled predictions # @@ -741,21 +751,16 @@ def _emit(rec: dict[str, Any]) -> None: records.append(rec) for q_idx, q_acc in enumerate(valid_queries): - q_pca_row = ( - pca_query_proj[q_idx].tolist() - if pca_query_proj is not None - else _nan_pca - ) - q_known_cent, q_known_mat, q_known_n = query_known_info.get( - q_acc, (None, None, 0) - ) + q_pca_row = pca_query_proj[q_idx].tolist() if pca_query_proj is not None else _nan_pca + q_known_cent, q_known_mat, q_known_n = query_known_info.get(q_acc, (None, None, 0)) q_pairs_features = pair_features.get(q_acc, {}) for aspect in _ASPECTS: go_map = ref_by_aspect[aspect]["go_map"] nbs = neighbors_by_aspect[aspect] if q_idx >= len(nbs): continue - centroid_unit, nmat = neighbor_info.get( + # neighbor_info value is Optional; downstream None-check below. 
+ centroid_unit, nmat = neighbor_info.get( # type: ignore[assignment] (q_acc, aspect), (None, None) ) @@ -785,9 +790,7 @@ def _emit(rec: dict[str, Any]) -> None: else float("nan") ) anc_maxcos = ( - float((nmat @ cand_vec).max()) - if nmat is not None - else float("nan") + float((nmat @ cand_vec).max()) if nmat is not None else float("nan") ) anc_has = 1.0 anc_q_cos = ( @@ -877,16 +880,10 @@ def _emit(rec: dict[str, Any]) -> None: tax_ca_sum.get(q_acc, {}).get(go_term_id, 0.0) / max(1, tax_ca_n.get(q_acc, {}).get(go_term_id, 1)) ) - if ( - do_taxonomy - and tax_ca_n.get(q_acc, {}).get(go_term_id, 0) > 0 - ) + if (do_taxonomy and tax_ca_n.get(q_acc, {}).get(go_term_id, 0) > 0) else float("nan") ), - **{ - f"emb_pca_query_{i}": q_pca_row[i] - for i in range(EMBEDDING_PCA_DIM) - }, + **{f"emb_pca_query_{i}": q_pca_row[i] for i in range(EMBEDDING_PCA_DIM)}, } leaf_by_gid[go_id] = rec @@ -908,8 +905,7 @@ def _emit(rec: dict[str, Any]) -> None: leaf_anc = leaf_by_gid[anc] leaf_anc["neighbor_vote_fraction"] = min( 1.0, - float(leaf_anc.get("neighbor_vote_fraction", 0.0)) - + w / k_limit_f, + float(leaf_anc.get("neighbor_vote_fraction", 0.0)) + w / k_limit_f, ) lmd = float(leaf_rec.get("neighbor_min_distance", leaf_d)) cur_md = float(leaf_anc.get("neighbor_min_distance", leaf_d)) @@ -922,13 +918,9 @@ def _emit(rec: dict[str, Any]) -> None: base["go_id"] = anc base[LABEL_COLUMN] = 1 if (q_acc, anc) in gt_pairs else 0 prior_frac = ( - float(entry["neighbor_vote_fraction"]) - if entry is not None - else 0.0 - ) - base["neighbor_vote_fraction"] = min( - 1.0, prior_frac + w / k_limit_f + float(entry["neighbor_vote_fraction"]) if entry is not None else 0.0 ) + base["neighbor_vote_fraction"] = min(1.0, prior_frac + w / k_limit_f) synth[anc] = base else: entry["neighbor_vote_fraction"] = min( @@ -975,13 +967,14 @@ def _emit(rec: dict[str, Any]) -> None: return records - # --------------------------------------------------------------------------- # Auto payload # --------------------------------------------------------------------------- -class TrainRerankerAutoPayload(ProteaPayload, frozen=True): +# ProteaPayload is a pydantic BaseModel, not a dataclass; +# mypy's dataclass-frozen-from-non-frozen check is a false positive. +class TrainRerankerAutoPayload(ProteaPayload, frozen=True): # type: ignore[misc] """Payload for the dump_helper operation. Generates consecutive temporal pairs from ``train_versions``, runs KNN @@ -1250,15 +1243,15 @@ def execute( candidate_names: list[str] = [f"{p.name}-{cat}" for cat in _CATEGORIES] if p.training_scope == "per_cell": candidate_names.extend( - f"{p.name}-{cat}-{_ASPECT_NAMES[asp]}" - for cat in _CATEGORIES - for asp in _ASPECTS + f"{p.name}-{cat}-{_ASPECT_NAMES[asp]}" for cat in _CATEGORIES for asp in _ASPECTS ) # Name-collision check — skipped when dump_only=True since no # RerankerModel rows are written. 
if not p.dump_only: for model_name in candidate_names: - existing = session.query(RerankerModel).filter(RerankerModel.name == model_name).first() + existing = ( + session.query(RerankerModel).filter(RerankerModel.name == model_name).first() + ) if existing is not None: raise ValueError(f"RerankerModel '{model_name}' already exists") @@ -1311,15 +1304,12 @@ def execute( map_snapshots = {ontology_snapshot_id} | set(version_to_native.values()) union_rows = session.execute( text( - "SELECT id, go_id, aspect FROM go_term " - "WHERE ontology_snapshot_id = ANY(:snap_ids)" + "SELECT id, go_id, aspect FROM go_term WHERE ontology_snapshot_id = ANY(:snap_ids)" ), {"snap_ids": [str(s) for s in map_snapshots]}, ).fetchall() go_id_map: dict[Any, str] = {row_id: go_id for row_id, go_id, _ in union_rows} - aspect_map: dict[Any, str] = { - row_id: aspect for row_id, _, aspect in union_rows if aspect - } + aspect_map: dict[Any, str] = {row_id: aspect for row_id, _, aspect in union_rows if aspect} pivot_rows = session.execute( text( "SELECT go_id FROM go_term " @@ -1518,9 +1508,17 @@ def execute( # Restrict predictions to terms present in the pivot universe — # ground truth was reconciled into pivot space above. - unlabeled_preds = [ - r for r in unlabeled_preds if r["go_id"] in pivot_go_ids - ] + # _knn_transfer_and_label always returns list[dict]. + # Cast explicitly to silence the list-widening mypy + # check that fires on self-reassignment of unlabeled_preds. + unlabeled_preds = cast( + "list[dict[str, Any]]", + [ + r + for r in unlabeled_preds + if r["go_id"] in pivot_go_ids # type: ignore[index] + ], + ) # Free large objects immediately del ref_by_aspect, query_emb, valid_queries, qs, rs, qt, rt @@ -1668,8 +1666,9 @@ def execute( test_all_queries: set[str] = set() test_cat_gt: dict[str, set[tuple[str, str]]] = {} for cat in _CATEGORIES: - gt: dict[str, set[str]] = getattr(test_eval_data, cat) - pairs: set[tuple[str, str]] = set() + # gt + pairs reused from train-side block above; lexically distinct usage. + gt: dict[str, set[str]] = getattr(test_eval_data, cat) # type: ignore[no-redef] + pairs: set[tuple[str, str]] = set() # type: ignore[no-redef] for protein, go_ids in gt.items(): for go_id in go_ids: pairs.add((protein, go_id)) @@ -1742,19 +1741,20 @@ def execute( del test_ref, test_emb, test_valid, test_qs, test_rs, test_qt, test_rt gc.collect() - n_rows = int(test_stream_info.get("n_rows", 0)) + # test_stream_info is always a dict in this branch; the + # list variant is only used for empty-split short-circuits. + n_rows = int(test_stream_info.get("n_rows", 0)) # type: ignore[union-attr] if n_rows > 0 and test_unlabeled_path.exists(): pf = pq.ParquetFile(str(test_unlabeled_path)) - project_cols = [ - c for c in _KEEP_COLS if c in pf.schema_arrow.names - ] + project_cols = [c for c in _KEEP_COLS if c in pf.schema_arrow.names] # Per-cat (protein, aspect) membership recovered from # ``test_eval_data`` so each test row lands only in # the genuine cat bucket. See train-side comment for # rationale. ``aspect_map`` is keyed by int # go_term_id; invert ``go_id_map`` to look up by # the go_id string that ``test_eval_data`` carries. - aspect_by_go_id: dict[str, str] = { + # Same name as train-side block; lexically distinct test path. 
+ aspect_by_go_id: dict[str, str] = { # type: ignore[no-redef] go_id: aspect_map[term_id] for term_id, go_id in go_id_map.items() if term_id in aspect_map @@ -1762,7 +1762,7 @@ def execute( test_cat_membership: dict[str, set[tuple[str, str]]] = {} for cat in _CATEGORIES: gt = getattr(test_eval_data, cat) - members: set[tuple[str, str]] = set() + members: set[tuple[str, str]] = set() # type: ignore[no-redef] for protein, go_ids in gt.items(): for go_id in go_ids: asp = aspect_by_go_id.get(go_id, "") @@ -1774,9 +1774,7 @@ def execute( cat: tmp_dir / f"test_{cat}.parquet" for cat in _CATEGORIES } try: - for batch in pf.iter_batches( - batch_size=200_000, columns=project_cols - ): + for batch in pf.iter_batches(batch_size=200_000, columns=project_cols): # Drop any pre-existing LABEL_COLUMN (it was # written as zero during streaming) so we can # append a fresh per-cat label column without @@ -1784,7 +1782,6 @@ def execute( if LABEL_COLUMN in batch.schema.names: batch = batch.drop_columns([LABEL_COLUMN]) accs = batch.column("protein_accession").to_pylist() - gids = batch.column("go_id").to_pylist() asps = batch.column("aspect").to_pylist() for cat in _CATEGORIES: members = test_cat_membership[cat] diff --git a/pyproject.toml b/pyproject.toml index 77eac8d..56522c4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,7 +23,7 @@ dependencies = [ "faiss-cpu (>=1.7.0)", "parasail (>=1.3.4)", "ete3 (>=3.1.3)", - "cafaeval-protea @ file:///home/frapercan/Thesis/repositories/cafaeval-protea", + "cafaeval-protea @ git+https://github.com/frapercan/cafaeval-protea.git@main", "lightgbm (>=4.6.0,<5.0.0)", "pyarrow (>=23.0.1,<24.0.0)", ] @@ -39,22 +39,23 @@ storage = ["minio (>=7.2,<8.0)"] # disk. Real ABCs / payloads land in F1 (T1.1-T1.5); during F0 these # packages export only their version sentinel and entry_point stubs, # which is enough to validate the multi-repo layout. -protea-contracts = { path = "../protea-contracts", develop = true } -protea-method = { path = "../protea-method", develop = true } -protea-sources = { path = "../protea-sources", develop = true } -protea-runners = { path = "../protea-runners", develop = true } -protea-backends = { path = "../protea-backends", develop = true } +protea-contracts = { git = "https://github.com/frapercan/protea-contracts.git", branch = "master" } +protea-method = { git = "https://github.com/frapercan/protea-method.git", branch = "master" } +protea-sources = { git = "https://github.com/frapercan/protea-sources.git", branch = "master" } +protea-runners = { git = "https://github.com/frapercan/protea-runners.git", branch = "master" } +protea-backends = { git = "https://github.com/frapercan/protea-backends.git", branch = "master" } [build-system] requires = ["poetry-core>=2.0.0,<3.0.0"] build-backend = "poetry.core.masonry.api" + [tool.poetry.group.dev.dependencies] # Dev-time only: imported by protea.core.parquet_export at export-validation # time so the manifest dict is checked against the lab's pydantic contract # before we publish to the ArtifactStore. Production images do NOT need this. 
-protea-reranker-lab = {path = "../protea-reranker-lab", develop = true} +protea-reranker-lab = {git = "https://github.com/frapercan/protea-reranker-lab.git", branch = "main"} pytest = ">=9.0.2,<10.0.0" uvicorn = ">=0.41.0,<0.42.0" coverage = {extras = ["toml"], version = ">=7.2.1"} @@ -72,6 +73,7 @@ mypy = ">=1.19.1,<2.0.0" pytest-cov = ">=7.0.0,<8.0.0" types-requests = "^2.32.4.20260107" types-pyyaml = "^6.0.12.20250915" +sphinxcontrib-bibtex = "^2.7.0" [tool.taskipy.tasks] lint = "ruff check protea scripts && flake8 protea" diff --git a/tests/test_fetch_uniprot_metadata.py b/tests/test_fetch_uniprot_metadata.py index 1c7a2a1..3f4215b 100644 --- a/tests/test_fetch_uniprot_metadata.py +++ b/tests/test_fetch_uniprot_metadata.py @@ -250,7 +250,7 @@ def test_fetch_uniprot_metadata_integration(postgres_url: str): # Second run with same data → upsert should not double-insert op2 = FetchUniProtMetadataOperation() with Session(engine, future=True) as session: - with patch.object(op2._http_client.session, "get", return_value=_make_mock_response(TSV_RESPONSE)): + with patch.object(op2._uniprot_plugin._client.session, "get", return_value=_make_mock_response(TSV_RESPONSE)): result2 = op2.execute( session, { diff --git a/tests/test_insert_proteins.py b/tests/test_insert_proteins.py index e535f40..d647256 100644 --- a/tests/test_insert_proteins.py +++ b/tests/test_insert_proteins.py @@ -570,7 +570,7 @@ def test_insert_proteins_integration(postgres_url: str): # Idempotency: second run should update, not re-insert op2 = InsertProteinsOperation() with Session(engine, future=True) as session: - with patch.object(op2._http_client.session, "get", return_value=_make_mock_response(FASTA_TWO)): + with patch.object(op2._uniprot_plugin._client.session, "get", return_value=_make_mock_response(FASTA_TWO)): result2 = op2.execute( session, {"search_criteria": "organism_id:9606", "compressed": False}, diff --git a/tests/test_integration.py b/tests/test_integration.py index 9272362..06fd2fe 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -211,8 +211,12 @@ def test_store_embeddings_roundtrip(db): emb = session.query(SequenceEmbedding).filter_by(sequence_id=seq_id).one() assert emb.embedding_config_id == config_id assert emb.embedding_dim == 4 - stored_vec = list(emb.embedding) - np.testing.assert_allclose(stored_vec, vec, atol=1e-5) + # halfvec migration (2026-04-11): emb.embedding is a HalfVector, + # which exposes ``.to_list()`` rather than __iter__. atol is + # relaxed to 1e-3 because fp16 quantization introduces ~1e-4 + # roundtrip error (e.g. 0.1 → 0.0999755859375). + stored_vec = emb.embedding.to_list() + np.testing.assert_allclose(stored_vec, vec, atol=1e-3) # Second run — skip_existing should prevent re-insert with Session(db, future=True) as session: @@ -453,18 +457,21 @@ def test_load_goa_annotations_roundtrip(db): session.add(protein) session.commit() - # Step 3: Build a GAF record (as _stream_gaf yields dicts) + # Step 3: Build a GAF record (post-F2A.6-real: _stream_gaf yields + # GoaAnnotationRecord instances from protea_contracts). 
+ from protea_contracts import GoaAnnotationRecord + gaf_records = [ - { - "accession": "P12345", - "go_id": "GO:0003824", - "qualifier": "enables", - "evidence_code": "IDA", - "db_reference": "PMID:123", - "with_from": "", - "assigned_by": "UniProt", - "annotation_date": "20240101", - }, + GoaAnnotationRecord( + accession="P12345", + go_id="GO:0003824", + qualifier="enables", + evidence_code="IDA", + db_reference="PMID:123", + with_from=None, + assigned_by="UniProt", + annotation_date="20240101", + ), ] # Step 4: Load annotations