diff --git a/.smell-baseline.json b/.smell-baseline.json
index f0a5317..749842a 100644
--- a/.smell-baseline.json
+++ b/.smell-baseline.json
@@ -20,8 +20,8 @@
       "kind": "class",
       "path": "protea/core/training_dump_helpers.py",
       "name": "TrainRerankerAutoOperation",
-      "line": 1133,
-      "metric": 770,
+      "line": 1136,
+      "metric": 750,
       "threshold": 500
     },
     {
@@ -48,7 +48,7 @@
       "path": "protea/core/training_dump_helpers.py",
       "name": "",
       "line": 0,
-      "metric": 1902,
+      "metric": 1885,
       "threshold": 800
     },
     {
@@ -434,7 +434,7 @@
       "kind": "method",
       "path": "protea/core/training_dump_helpers.py",
       "name": "_preload_all_embeddings",
-      "line": 179,
+      "line": 182,
       "metric": 73,
       "threshold": 60
     },
@@ -443,7 +443,7 @@
       "kind": "method",
       "path": "protea/core/training_dump_helpers.py",
       "name": "_build_reference_from_cache",
-      "line": 254,
+      "line": 257,
       "metric": 67,
       "threshold": 60
     },
@@ -452,7 +452,7 @@
       "kind": "method",
       "path": "protea/core/training_dump_helpers.py",
       "name": "_knn_transfer_and_label",
-      "line": 372,
+      "line": 375,
       "metric": 624,
       "threshold": 60
     },
@@ -461,8 +461,8 @@
       "kind": "method",
       "path": "protea/core/training_dump_helpers.py",
       "name": "TrainRerankerAutoOperation.execute",
-      "line": 1240,
-      "metric": 663,
+      "line": 1212,
+      "metric": 674,
       "threshold": 60
     },
     {
@@ -618,15 +618,6 @@
       "metric": 7,
       "threshold": 6
     },
-    {
-      "key": "params::protea/core/training_dump_helpers.py::TrainRerankerAutoOperation._dump_frozen_dataset",
-      "kind": "params",
-      "path": "protea/core/training_dump_helpers.py",
-      "name": "TrainRerankerAutoOperation._dump_frozen_dataset",
-      "line": 1190,
-      "metric": 11,
-      "threshold": 6
-    },
     {
       "key": "params::protea/infrastructure/queue/consumer.py::OperationConsumer.__init__",
       "kind": "params",
diff --git a/protea/core/training_dump_helpers.py b/protea/core/training_dump_helpers.py
index 717b0ac..6436c80 100644
--- a/protea/core/training_dump_helpers.py
+++ b/protea/core/training_dump_helpers.py
@@ -23,7 +23,10 @@
 import uuid
 from dataclasses import dataclass
 from pathlib import Path
-from typing import Annotated, Any, cast
+from typing import TYPE_CHECKING, Annotated, Any, cast
+
+if TYPE_CHECKING:
+    from protea.core.parquet_export import ParquetExportContext
 
 import numpy as np
 import pandas as pd
@@ -1187,54 +1190,23 @@ def summarize_payload(self, payload: dict[str, Any], *, session: Session | None
         return " · ".join(bits)
 
     @staticmethod
-    def _dump_frozen_dataset(
-        *,
-        dump_dir: Path,
-        split_files: dict[str, list[Path]],
-        valid_split_versions: list[tuple[int, int]],
-        test_files: dict[str, Path | None],
-        test_old_v: int,
-        test_new_v: int,
-        name: str,
-        k: int,
-        embedding_config_id: str,
-        ontology_snapshot_id: str,
-        annotation_source: str,
-    ) -> dict[str, Any]:
+    def _dump_frozen_dataset(ctx: ParquetExportContext) -> dict[str, Any]:
         """Thin wrapper that delegates to ``parquet_export`` — kept so
         ``dump_helper`` can still dump a frozen dataset to a local path
         via ``dump_to=...``.
 
         New code should prefer the ``export_research_dataset`` operation
         which publishes via the configured ``ArtifactStore``.
+
+        The caller is responsible for filling ``producer_version`` and
+        ``producer_git_sha`` on the context. Pass ``store=None`` to
+        skip artifact-store upload.
         """
-        from protea import __version__ as _protea_version
-        from protea.core.parquet_export import (
-            ParquetExportContext,
-            export_reranker_parquets,
-            resolve_protea_git_sha,
-        )
+        from protea.core.parquet_export import export_reranker_parquets
 
-        result = export_reranker_parquets(
-            ParquetExportContext(
-                stage_dir=dump_dir,
-                split_files=split_files,
-                valid_split_versions=valid_split_versions,
-                test_files=test_files,
-                test_old_v=test_old_v,
-                test_new_v=test_new_v,
-                name=name,
-                k=k,
-                embedding_config_id=embedding_config_id,
-                ontology_snapshot_id=ontology_snapshot_id,
-                annotation_source=annotation_source,
-                store=None,
-                producer_version=_protea_version,
-                producer_git_sha=resolve_protea_git_sha(),
-            )
-        )
+        result = export_reranker_parquets(ctx)
         # Preserve the historical return contract — callers rely on
         # ``dump_dir`` instead of ``stage_dir``.
-        result["dump_dir"] = result.pop("stage_dir", str(dump_dir))
+        result["dump_dir"] = result.pop("stage_dir", str(ctx.stage_dir))
         return result
     def execute(
@@ -1874,18 +1846,29 @@ def execute(
             "training has been moved to protea-reranker-lab. Use "
             "ExportResearchDatasetOperation / POST /datasets."
         )
+        from protea import __version__ as _protea_version
+        from protea.core.parquet_export import (
+            ParquetExportContext,
+            resolve_protea_git_sha,
+        )
+
         dump_stats = self._dump_frozen_dataset(
-            dump_dir=Path(p.dump_to),
-            split_files=split_files,
-            valid_split_versions=valid_split_versions,
-            test_files=test_files,
-            test_old_v=test_old_v,
-            test_new_v=test_new_v,
-            name=p.name,
-            k=int(p.limit_per_entry),
-            embedding_config_id=str(emb_config_id),
-            ontology_snapshot_id=str(ontology_snapshot_id),
-            annotation_source=p.annotation_source,
+            ParquetExportContext(
+                stage_dir=Path(p.dump_to),
+                split_files=split_files,
+                valid_split_versions=valid_split_versions,
+                test_files=test_files,
+                test_old_v=test_old_v,
+                test_new_v=test_new_v,
+                name=p.name,
+                k=int(p.limit_per_entry),
+                embedding_config_id=str(emb_config_id),
+                ontology_snapshot_id=str(ontology_snapshot_id),
+                annotation_source=p.annotation_source,
+                store=None,
+                producer_version=_protea_version,
+                producer_git_sha=resolve_protea_git_sha(),
+            )
         )
         emit("dump_helper.dump_done", None, dump_stats, "info")
         elapsed = round(time.perf_counter() - t0, 1)