Merged
.smell-baseline.json (25 changes: 8 additions & 17 deletions)
@@ -20,8 +20,8 @@
     "kind": "class",
     "path": "protea/core/training_dump_helpers.py",
     "name": "TrainRerankerAutoOperation",
-    "line": 1133,
-    "metric": 770,
+    "line": 1136,
+    "metric": 750,
     "threshold": 500
   },
   {
@@ -48,7 +48,7 @@
     "path": "protea/core/training_dump_helpers.py",
     "name": "",
     "line": 0,
-    "metric": 1902,
+    "metric": 1885,
     "threshold": 800
   },
   {
@@ -434,7 +434,7 @@
     "kind": "method",
     "path": "protea/core/training_dump_helpers.py",
     "name": "_preload_all_embeddings",
-    "line": 179,
+    "line": 182,
     "metric": 73,
     "threshold": 60
   },
@@ -443,7 +443,7 @@
     "kind": "method",
     "path": "protea/core/training_dump_helpers.py",
     "name": "_build_reference_from_cache",
-    "line": 254,
+    "line": 257,
     "metric": 67,
     "threshold": 60
   },
@@ -452,7 +452,7 @@
     "kind": "method",
     "path": "protea/core/training_dump_helpers.py",
     "name": "_knn_transfer_and_label",
-    "line": 372,
+    "line": 375,
     "metric": 624,
     "threshold": 60
   },
@@ -461,8 +461,8 @@
     "kind": "method",
     "path": "protea/core/training_dump_helpers.py",
     "name": "TrainRerankerAutoOperation.execute",
-    "line": 1240,
-    "metric": 663,
+    "line": 1212,
+    "metric": 674,
     "threshold": 60
   },
   {
@@ -618,15 +618,6 @@
     "metric": 7,
     "threshold": 6
   },
-  {
-    "key": "params::protea/core/training_dump_helpers.py::TrainRerankerAutoOperation._dump_frozen_dataset",
-    "kind": "params",
-    "path": "protea/core/training_dump_helpers.py",
-    "name": "TrainRerankerAutoOperation._dump_frozen_dataset",
-    "line": 1190,
-    "metric": 11,
-    "threshold": 6
-  },
   {
     "key": "params::protea/infrastructure/queue/consumer.py::OperationConsumer.__init__",
     "kind": "params",
protea/core/training_dump_helpers.py (85 changes: 34 additions & 51 deletions)
@@ -23,7 +23,10 @@
 import uuid
 from dataclasses import dataclass
 from pathlib import Path
-from typing import Annotated, Any, cast
+from typing import TYPE_CHECKING, Annotated, Any, cast
+
+if TYPE_CHECKING:
+    from protea.core.parquet_export import ParquetExportContext
 
 import numpy as np
 import pandas as pd
@@ -1187,54 +1190,23 @@ def summarize_payload(self, payload: dict[str, Any], *, session: Session | None
         return " · ".join(bits)
 
     @staticmethod
-    def _dump_frozen_dataset(
-        *,
-        dump_dir: Path,
-        split_files: dict[str, list[Path]],
-        valid_split_versions: list[tuple[int, int]],
-        test_files: dict[str, Path | None],
-        test_old_v: int,
-        test_new_v: int,
-        name: str,
-        k: int,
-        embedding_config_id: str,
-        ontology_snapshot_id: str,
-        annotation_source: str,
-    ) -> dict[str, Any]:
+    def _dump_frozen_dataset(ctx: ParquetExportContext) -> dict[str, Any]:
         """Thin wrapper that delegates to ``parquet_export`` — kept so
         ``dump_helper`` can still dump a frozen dataset to a local
         path via ``dump_to=...``. New code should prefer the
         ``export_research_dataset`` operation which publishes via the
         configured ``ArtifactStore``.
+
+        The caller is responsible for filling ``producer_version`` and
+        ``producer_git_sha`` on the context. Pass ``store=None`` to
+        skip artifact-store upload.
         """
-        from protea import __version__ as _protea_version
-        from protea.core.parquet_export import (
-            ParquetExportContext,
-            export_reranker_parquets,
-            resolve_protea_git_sha,
-        )
+        from protea.core.parquet_export import export_reranker_parquets
 
-        result = export_reranker_parquets(
-            ParquetExportContext(
-                stage_dir=dump_dir,
-                split_files=split_files,
-                valid_split_versions=valid_split_versions,
-                test_files=test_files,
-                test_old_v=test_old_v,
-                test_new_v=test_new_v,
-                name=name,
-                k=k,
-                embedding_config_id=embedding_config_id,
-                ontology_snapshot_id=ontology_snapshot_id,
-                annotation_source=annotation_source,
-                store=None,
-                producer_version=_protea_version,
-                producer_git_sha=resolve_protea_git_sha(),
-            )
-        )
+        result = export_reranker_parquets(ctx)
         # Preserve the historical return contract — callers rely on
         # ``dump_dir`` instead of ``stage_dir``.
-        result["dump_dir"] = result.pop("stage_dir", str(dump_dir))
+        result["dump_dir"] = result.pop("stage_dir", str(ctx.stage_dir))
         return result
 
     def execute(
@@ -1874,18 +1846,29 @@ def execute(
                 "training has been moved to protea-reranker-lab. Use "
                 "ExportResearchDatasetOperation / POST /datasets."
             )
+            from protea import __version__ as _protea_version
+            from protea.core.parquet_export import (
+                ParquetExportContext,
+                resolve_protea_git_sha,
+            )
+
             dump_stats = self._dump_frozen_dataset(
-                dump_dir=Path(p.dump_to),
-                split_files=split_files,
-                valid_split_versions=valid_split_versions,
-                test_files=test_files,
-                test_old_v=test_old_v,
-                test_new_v=test_new_v,
-                name=p.name,
-                k=int(p.limit_per_entry),
-                embedding_config_id=str(emb_config_id),
-                ontology_snapshot_id=str(ontology_snapshot_id),
-                annotation_source=p.annotation_source,
+                ParquetExportContext(
+                    stage_dir=Path(p.dump_to),
+                    split_files=split_files,
+                    valid_split_versions=valid_split_versions,
+                    test_files=test_files,
+                    test_old_v=test_old_v,
+                    test_new_v=test_new_v,
+                    name=p.name,
+                    k=int(p.limit_per_entry),
+                    embedding_config_id=str(emb_config_id),
+                    ontology_snapshot_id=str(ontology_snapshot_id),
+                    annotation_source=p.annotation_source,
+                    store=None,
+                    producer_version=_protea_version,
+                    producer_git_sha=resolve_protea_git_sha(),
+                )
             )
             emit("dump_helper.dump_done", None, dump_stats, "info")
             elapsed = round(time.perf_counter() - t0, 1)
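
Migration note: for callers moving off the removed keyword signature, the sketch below shows the new calling convention. It is a minimal sketch assuming the field types implied by the old signature; the concrete paths, IDs, and split values are hypothetical placeholders, not real data.

    from pathlib import Path

    from protea import __version__ as protea_version
    from protea.core.parquet_export import (
        ParquetExportContext,
        export_reranker_parquets,
        resolve_protea_git_sha,
    )

    # All values below are illustrative placeholders.
    ctx = ParquetExportContext(
        stage_dir=Path("/tmp/frozen_dataset"),           # local dump target (was dump_dir)
        split_files={"train": [Path("train.parquet")]},  # per-split file lists
        valid_split_versions=[(2, 3)],                   # (old_v, new_v) pairs
        test_files={"test": None},
        test_old_v=2,
        test_new_v=3,
        name="reranker-frozen",
        k=50,                                            # limit per entry
        embedding_config_id="emb-cfg-0001",
        ontology_snapshot_id="onto-snap-0001",
        annotation_source="manual",
        store=None,                                      # None skips artifact-store upload
        producer_version=protea_version,                 # caller fills provenance fields
        producer_git_sha=resolve_protea_git_sha(),
    )

    result = export_reranker_parquets(ctx)

Going through the _dump_frozen_dataset wrapper instead of export_reranker_parquets keeps the historical dump_dir key in the returned stats; direct calls return stage_dir.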