From 95d3bead3f680261a7e667be8a40093db2cad768 Mon Sep 17 00:00:00 2001 From: Stanislas Laurent Date: Tue, 23 Jun 2026 23:20:13 +0200 Subject: [PATCH 01/28] feat(data): add LabelFormat enum and LabelsConfig.format field (refs #338) --- src/raitap/configs/schema.py | 6 +++++- src/raitap/data/tests/test_label_formats.py | 11 +++++++++++ src/raitap/data/types.py | 16 ++++++++++++++++ 3 files changed, 32 insertions(+), 1 deletion(-) create mode 100644 src/raitap/data/tests/test_label_formats.py diff --git a/src/raitap/configs/schema.py b/src/raitap/configs/schema.py index fbab40f7..1910345c 100644 --- a/src/raitap/configs/schema.py +++ b/src/raitap/configs/schema.py @@ -5,7 +5,7 @@ from omegaconf import MISSING -from raitap.data.types import IdStrategy, LabelEncoding +from raitap.data.types import IdStrategy, LabelEncoding, LabelFormat from raitap.types import Hardware, TaskKind if TYPE_CHECKING: @@ -87,6 +87,10 @@ class LabelsConfig: # (supports nested ImageFolder layouts with colliding stems). # "stem" — flat-dir / basename matching: match by ``Path(id).stem`` only. id_strategy: IdStrategy = IdStrategy.auto + # External label file format. ``native`` (default) reads RAITAP's own + # shape. ``coco`` / ``yolo`` / ``voc`` are converted to the native + # intermediate before alignment. Requires id-based alignment (sample_ids). 
+ format: LabelFormat = LabelFormat.native @dataclass diff --git a/src/raitap/data/tests/test_label_formats.py b/src/raitap/data/tests/test_label_formats.py new file mode 100644 index 00000000..b65742e0 --- /dev/null +++ b/src/raitap/data/tests/test_label_formats.py @@ -0,0 +1,11 @@ +from raitap.data.types import LabelFormat +from raitap.configs.schema import LabelsConfig + + +def test_label_format_members_are_string_values(): + assert LabelFormat.native == "native" + assert {f.value for f in LabelFormat} == {"native", "coco", "yolo", "voc"} + + +def test_labels_config_defaults_to_native_format(): + assert LabelsConfig().format is LabelFormat.native diff --git a/src/raitap/data/types.py b/src/raitap/data/types.py index 2defb94a..fc114554 100644 --- a/src/raitap/data/types.py +++ b/src/raitap/data/types.py @@ -33,6 +33,22 @@ class IdStrategy(StrEnum): stem = "stem" +class LabelFormat(StrEnum): + """On-disk label file format selected by ``LabelsConfig.format``. + + ``native`` is RAITAP's own shape (classification: CSV/TSV/Parquet or the + ``directory`` source; detection: the JSON record list). The others are + converted to the native intermediate by a registered + :class:`~raitap.data.label_formats.LabelFormatAdapter` before the task + family aligns them. StrEnum so YAML users can write the raw value. + """ + + native = "native" + coco = "coco" + yolo = "yolo" + voc = "voc" + + #: Reserved ``LabelsConfig.source`` value selecting folder-as-label ingestion: #: classification labels are derived from each sample's top-level class #: subdirectory (torchvision ``ImageFolder`` style; no labels file). 
Kept as a From d916f4de034384b79d99de1506dc3655ad787b0f Mon Sep 17 00:00:00 2001 From: Stanislas Laurent Date: Tue, 23 Jun 2026 23:20:59 +0200 Subject: [PATCH 02/28] refactor(model): extract _align_detection_records from detection loader (refs #338) --- src/raitap/task_families/detection.py | 171 ++++++++++++++------------ 1 file changed, 93 insertions(+), 78 deletions(-) diff --git a/src/raitap/task_families/detection.py b/src/raitap/task_families/detection.py index 5141992c..15614e64 100644 --- a/src/raitap/task_families/detection.py +++ b/src/raitap/task_families/detection.py @@ -19,6 +19,90 @@ from raitap.task_families.base import ExplainContext, ForwardContext +def _align_detection_records( + records: list[dict[str, Any]], + *, + expected: int, + sample_ids: Any, +) -> list[dict[str, "Any"]]: + """Align native detection records to ``sample_ids`` and build tensors. + + Extracted from ``DetectionFamily.load_labels`` so label-format adapters can + feed converted records through the same alignment + validation path. + """ + import torch + + if sample_ids is not None: + by_id: dict[str, dict[str, Any]] = {} + for index, record in enumerate(records): + record_id = record.get("sample_id") if isinstance(record, dict) else None + if record_id is None: + raise ValueError( + f"Detection labels record {index} is missing 'sample_id' " + "(required when the dataset exposes sample_ids)." + ) + if record_id in by_id: + raise ValueError( + f"Detection labels file contains duplicate sample_id {record_id!r}." + ) + by_id[record_id] = record + ordered_records = [] + missing: list[str] = [] + for sample_id in sample_ids: + record = by_id.get(sample_id) + if record is None: + missing.append(sample_id) + else: + ordered_records.append(record) + if missing: + raise ValueError( + f"Detection labels file is missing entries for sample_ids: {missing!r}." 
+ ) + records_iter: list[dict[str, Any]] = ordered_records + else: + if len(records) != expected: + raise ValueError( + f"Detection labels file has {len(records)} records but the " + f"dataset has {expected} samples; provide sample_id fields and " + "set data.labels.source so records can be aligned by id, or " + "match the record count to the sample count." + ) + records_iter = records + + out: list[dict[str, torch.Tensor]] = [] + for index, record in enumerate(records_iter): + boxes_raw = record.get("boxes", []) + labels_raw = record.get("labels", []) + if len(boxes_raw) != len(labels_raw): + raise ValueError( + f"Sample index {index}: boxes and labels must have matching " + f"length (got {len(boxes_raw)} boxes vs {len(labels_raw)} labels)." + ) + boxes_tensor = ( + torch.tensor(boxes_raw, dtype=torch.float32) + if boxes_raw + else torch.zeros((0, 4), dtype=torch.float32) + ) + labels_tensor = ( + torch.tensor(labels_raw, dtype=torch.int64) + if labels_raw + else torch.zeros((0,), dtype=torch.int64) + ) + if boxes_tensor.ndim != 2 or boxes_tensor.shape[1] != 4: + raise ValueError( + f"Sample index {index}: boxes must be shape (M_i, 4); got " + f"{tuple(boxes_tensor.shape)}." + ) + out.append({"boxes": boxes_tensor, "labels": labels_tensor}) + + if len(out) != expected: + raise ValueError( + f"Detection labels alignment produced {len(out)} entries but the " + f"dataset has {expected} samples." 
+ ) + return out + + @task_family class DetectionFamily: kind: TaskKind = TaskKind.detection @@ -92,96 +176,27 @@ def load_labels(self, cfg: Any, *, tensor: Any, sample_ids: Any) -> Any: """ import json - import torch - - from raitap.data.data import SourceKind, _get_optional_config_value, get_source_path + from raitap.data.data import ( + SourceKind, + _get_optional_config_value, + get_source_path, + ) labels_cfg = _get_optional_config_value(cfg.data, "labels") labels_source = _get_optional_config_value(labels_cfg, "source") if not labels_source: return None - # ``get_source_path`` raises ValueError if the source can't be resolved - # or returns an existing path; no separate existence check needed. labels_path = get_source_path(labels_source, kind=SourceKind.LABELS) - with labels_path.open() as fh: records = json.load(fh) if not isinstance(records, list): - raise ValueError(f"Detection labels file {labels_path} must be a JSON array.") - - expected = len(tensor) - - if sample_ids is not None: - by_id: dict[str, dict[str, Any]] = {} - for index, record in enumerate(records): - record_id = record.get("sample_id") if isinstance(record, dict) else None - if record_id is None: - raise ValueError( - f"Detection labels record {index} is missing 'sample_id' " - "(required when the dataset exposes sample_ids)." - ) - if record_id in by_id: - raise ValueError( - f"Detection labels file contains duplicate sample_id {record_id!r}." - ) - by_id[record_id] = record - ordered_records = [] - missing: list[str] = [] - for sample_id in sample_ids: - record = by_id.get(sample_id) - if record is None: - missing.append(sample_id) - else: - ordered_records.append(record) - if missing: - raise ValueError( - f"Detection labels file is missing entries for sample_ids: {missing!r}." 
- ) - records_iter: list[dict[str, Any]] = ordered_records - else: - if len(records) != expected: - raise ValueError( - f"Detection labels file has {len(records)} records but the " - f"dataset has {expected} samples; provide sample_id fields and " - "set data.labels.source so records can be aligned by id, or " - "match the record count to the sample count." - ) - records_iter = records - - out: list[dict[str, torch.Tensor]] = [] - for index, record in enumerate(records_iter): - boxes_raw = record.get("boxes", []) - labels_raw = record.get("labels", []) - if len(boxes_raw) != len(labels_raw): - raise ValueError( - f"Sample index {index}: boxes and labels must have matching " - f"length (got {len(boxes_raw)} boxes vs {len(labels_raw)} labels)." - ) - boxes_tensor = ( - torch.tensor(boxes_raw, dtype=torch.float32) - if boxes_raw - else torch.zeros((0, 4), dtype=torch.float32) - ) - labels_tensor = ( - torch.tensor(labels_raw, dtype=torch.int64) - if labels_raw - else torch.zeros((0,), dtype=torch.int64) - ) - if boxes_tensor.ndim != 2 or boxes_tensor.shape[1] != 4: - raise ValueError( - f"Sample index {index}: boxes must be shape (M_i, 4); got " - f"{tuple(boxes_tensor.shape)}." - ) - out.append({"boxes": boxes_tensor, "labels": labels_tensor}) - - if len(out) != expected: raise ValueError( - f"Detection labels alignment produced {len(out)} entries but the " - f"dataset has {expected} samples." + f"Detection labels file {labels_path} must be a JSON array." ) - - return out + return _align_detection_records( + records, expected=len(tensor), sample_ids=sample_ids + ) def validate_labels(self, labels: Any) -> None: # The detection loader returns ``list[dict]`` or ``None``. 
A bare tensor From b9fa7bdea8729181b87712145ca59a100c44c066 Mon Sep 17 00:00:00 2001 From: Stanislas Laurent Date: Tue, 23 Jun 2026 23:25:46 +0200 Subject: [PATCH 03/28] feat(data): add label-format adapter protocol and registry (refs #338) --- src/raitap/data/__init__.py | 10 ++- src/raitap/data/_label_format_adapters.py | 9 +++ src/raitap/data/adapters/__init__.py | 1 + src/raitap/data/adapters/coco.py | 0 src/raitap/data/adapters/voc.py | 0 src/raitap/data/adapters/yolo.py | 0 src/raitap/data/label_formats.py | 84 +++++++++++++++++++++ src/raitap/data/tests/test_label_formats.py | 39 ++++++++++ 8 files changed, 142 insertions(+), 1 deletion(-) create mode 100644 src/raitap/data/_label_format_adapters.py create mode 100644 src/raitap/data/adapters/__init__.py create mode 100644 src/raitap/data/adapters/coco.py create mode 100644 src/raitap/data/adapters/voc.py create mode 100644 src/raitap/data/adapters/yolo.py create mode 100644 src/raitap/data/label_formats.py diff --git a/src/raitap/data/__init__.py b/src/raitap/data/__init__.py index 2c5aa3e0..a363f644 100644 --- a/src/raitap/data/__init__.py +++ b/src/raitap/data/__init__.py @@ -13,7 +13,7 @@ from typing import TYPE_CHECKING, Any -from .types import DIRECTORY_LABELS_SOURCE, IdStrategy, LabelEncoding, Preprocessing +from .types import DIRECTORY_LABELS_SOURCE, IdStrategy, LabelEncoding, LabelFormat, Preprocessing if TYPE_CHECKING: from raitap.configs.schema import DataConfig, LabelsConfig @@ -36,6 +36,8 @@ "DataPreprocessingFactory", "IdStrategy", "LabelEncoding", + "LabelFormat", + "LabelFormatAdapter", "LabelsConfig", "ModelInputTransformationFactory", "Preprocessing", @@ -44,6 +46,7 @@ "load_tensor_from_source", "raitap_model_input_transformation_factory", "raitap_preprocessing_factory", + "resolve_label_format_adapter", ] @@ -69,6 +72,11 @@ "raitap.data.preprocessing", "raitap_preprocessing_factory", ), + "LabelFormatAdapter": ("raitap.data.label_formats", "LabelFormatAdapter"), + 
"resolve_label_format_adapter": ( + "raitap.data.label_formats", + "resolve_label_format_adapter", + ), } diff --git a/src/raitap/data/_label_format_adapters.py b/src/raitap/data/_label_format_adapters.py new file mode 100644 index 00000000..d1267e18 --- /dev/null +++ b/src/raitap/data/_label_format_adapters.py @@ -0,0 +1,9 @@ +"""Imports every in-tree label-format adapter so the decorators fire. + +Imported for its side effects by +``raitap.data.label_formats.resolve_label_format_adapter``. +""" + +from __future__ import annotations + +from raitap.data.adapters import coco, voc, yolo # noqa: F401 diff --git a/src/raitap/data/adapters/__init__.py b/src/raitap/data/adapters/__init__.py new file mode 100644 index 00000000..4b68f1da --- /dev/null +++ b/src/raitap/data/adapters/__init__.py @@ -0,0 +1 @@ +"""Built-in label-format adapters (issue #338).""" diff --git a/src/raitap/data/adapters/coco.py b/src/raitap/data/adapters/coco.py new file mode 100644 index 00000000..e69de29b diff --git a/src/raitap/data/adapters/voc.py b/src/raitap/data/adapters/voc.py new file mode 100644 index 00000000..e69de29b diff --git a/src/raitap/data/adapters/yolo.py b/src/raitap/data/adapters/yolo.py new file mode 100644 index 00000000..e69de29b diff --git a/src/raitap/data/label_formats.py b/src/raitap/data/label_formats.py new file mode 100644 index 00000000..11ae1af3 --- /dev/null +++ b/src/raitap/data/label_formats.py @@ -0,0 +1,84 @@ +"""Pluggable label-format adapters (issue #338). + +Each adapter converts an external annotation file (COCO / YOLO / VOC) into +RAITAP's native intermediate record list, which the task-family loaders then +align to ``sample_ids`` with their existing logic. Registry mirrors +``raitap.task_families.registry``: a decorator registers one singleton per +``LabelFormat``. 
+""" + +from __future__ import annotations + +from pathlib import Path +from typing import TYPE_CHECKING, Any, Protocol, TypeVar, runtime_checkable + +from raitap.data.types import LabelFormat + +if TYPE_CHECKING: + from raitap.types import TaskKind + +#: Native intermediate record shapes (match the on-disk native formats). +DetectionRecord = dict[str, Any] +ClassificationRecord = dict[str, Any] + + +@runtime_checkable +class LabelFormatAdapter(Protocol): + """Converts an external label file to native intermediate records.""" + + format: LabelFormat + supported_tasks: frozenset[TaskKind] + + def to_detection_records( + self, + source: Path, + *, + image_dir: Path | None, + class_names: list[str] | None, + ) -> list[DetectionRecord]: + """Return ``[{sample_id, boxes (xyxy), labels}]``. Raise if unsupported.""" + ... + + def to_classification_records(self, source: Path) -> list[ClassificationRecord]: + """Return ``[{sample_id, label}]``. Raise if unsupported.""" + ... + + +#: format -> the adapter singleton serving it. +LABEL_FORMAT_ADAPTERS: dict[LabelFormat, LabelFormatAdapter] = {} + +T = TypeVar("T") + + +def label_format(cls: type[T]) -> type[T]: + """Register ``cls`` (instantiated once) under its ``format`` class attribute.""" + instance = cls() # type: ignore[call-arg] + LABEL_FORMAT_ADAPTERS[instance.format] = instance # type: ignore[attr-defined] + return cls + + +def resolve_label_format_adapter( + fmt: LabelFormat, *, task_kind: TaskKind +) -> LabelFormatAdapter: + """Return the adapter for ``fmt`` that supports ``task_kind``. + + Raises ``ValueError`` when no adapter is registered for ``fmt`` (e.g. + ``native``, which the caller should special-case) or the adapter does not + declare ``task_kind`` in ``supported_tasks``. + """ + # Import side-effect: register the in-tree adapters on first use. 
+ from raitap.data import _label_format_adapters # noqa: F401 + + adapter = LABEL_FORMAT_ADAPTERS.get(fmt) + if adapter is None: + raise ValueError( + f"No adapter registered for label format {fmt.value!r}; " + f"registered: {sorted(f.value for f in LABEL_FORMAT_ADAPTERS)}." + ) + if task_kind not in adapter.supported_tasks: + supported = sorted(t.value for t in adapter.supported_tasks) + raise ValueError( + f"Label format {fmt.value!r} does not support task {task_kind.value!r}; " + f"supported tasks: {supported}." + ) + return adapter diff --git a/src/raitap/data/tests/test_label_formats.py b/src/raitap/data/tests/test_label_formats.py index b65742e0..3bd65616 100644 --- a/src/raitap/data/tests/test_label_formats.py +++ b/src/raitap/data/tests/test_label_formats.py @@ -1,5 +1,12 @@ +import pytest from raitap.data.types import LabelFormat from raitap.configs.schema import LabelsConfig +from raitap.data.label_formats import ( + LABEL_FORMAT_ADAPTERS, + label_format, + resolve_label_format_adapter, +) +from raitap.types import TaskKind def test_label_format_members_are_string_values(): @@ -9,3 +16,35 @@ def test_label_format_members_are_string_values(): def test_labels_config_defaults_to_native_format(): assert LabelsConfig().format is LabelFormat.native + + +def test_label_format_decorator_registers_instance(): + @label_format + class _Dummy: + format = LabelFormat.coco # reuse an enum member; popped below + supported_tasks = frozenset({TaskKind.detection}) + + try: + assert LABEL_FORMAT_ADAPTERS[LabelFormat.coco].supported_tasks == frozenset( + {TaskKind.detection} + ) + finally: + LABEL_FORMAT_ADAPTERS.pop(LabelFormat.coco, None) + + +def test_registry_rejects_unknown_native(): + with pytest.raises(ValueError, match="No adapter"): + resolve_label_format_adapter(LabelFormat.native, task_kind=TaskKind.detection) + + +@pytest.mark.xfail(reason="adapter added in task 4/5", strict=False) +def test_registry_resolves_supported_task(): + adapter = 
resolve_label_format_adapter(LabelFormat.coco, task_kind=TaskKind.detection) + assert adapter.format is LabelFormat.coco + assert TaskKind.detection in adapter.supported_tasks + + +@pytest.mark.xfail(reason="adapter added in task 4/5", strict=False) +def test_registry_rejects_unsupported_task(): + with pytest.raises(ValueError, match="does not support task"): + resolve_label_format_adapter(LabelFormat.yolo, task_kind=TaskKind.classification) From 509040cfce18276fd8b485ed990b5ddcf14d85f7 Mon Sep 17 00:00:00 2001 From: Stanislas Laurent Date: Tue, 23 Jun 2026 23:31:33 +0200 Subject: [PATCH 04/28] feat(data): add COCO label-format adapter (refs #338) --- src/raitap/data/adapters/coco.py | 78 +++++++++++++++++++++ src/raitap/data/tests/test_label_formats.py | 76 +++++++++++++++++++- 2 files changed, 152 insertions(+), 2 deletions(-) diff --git a/src/raitap/data/adapters/coco.py b/src/raitap/data/adapters/coco.py index e69de29b..7fb5b99f 100644 --- a/src/raitap/data/adapters/coco.py +++ b/src/raitap/data/adapters/coco.py @@ -0,0 +1,78 @@ +"""COCO label-format adapter (issue #338).""" + +from __future__ import annotations + +import json +from pathlib import Path +from typing import Any + +from raitap.data.label_formats import ( + ClassificationRecord, + DetectionRecord, + label_format, +) +from raitap.data.types import LabelFormat +from raitap.types import TaskKind + + +@label_format +class CocoAdapter: + """COCO ``instances.json`` -> native records. + + Detection: ``bbox`` is ``[x, y, w, h]`` -> ``[x1, y1, x2, y2]``; + ``category_id`` passes through unchanged so labels stay in the model's + label space. Classification: one label per image (the image's single + annotation category); images with 0 or >1 categories raise. 
+ """ + + format = LabelFormat.coco + supported_tasks = frozenset({TaskKind.detection, TaskKind.classification}) + + def _load(self, source: Path) -> dict[str, Any]: + with source.open() as fh: + data = json.load(fh) + if not isinstance(data, dict) or "images" not in data: + raise ValueError( + f"COCO file {source} must be an object with an 'images' array." + ) + return data + + def to_detection_records( + self, source: Path, *, image_dir: Path | None, class_names: list[str] | None + ) -> list[DetectionRecord]: + data = self._load(source) + file_by_image: dict[int, str] = { + img["id"]: img["file_name"] for img in data["images"] + } + boxes: dict[int, list[list[float]]] = {iid: [] for iid in file_by_image} + labels: dict[int, list[int]] = {iid: [] for iid in file_by_image} + for ann in data.get("annotations", []): + iid = ann["image_id"] + x, y, w, h = ann["bbox"] + boxes[iid].append([x, y, x + w, y + h]) + labels[iid].append(int(ann["category_id"])) + return [ + {"sample_id": file_by_image[iid], "boxes": boxes[iid], "labels": labels[iid]} + for iid in file_by_image + ] + + def to_classification_records( + self, source: Path + ) -> list[ClassificationRecord]: + data = self._load(source) + file_by_image: dict[int, str] = { + img["id"]: img["file_name"] for img in data["images"] + } + cats: dict[int, set[int]] = {iid: set() for iid in file_by_image} + for ann in data.get("annotations", []): + cats[ann["image_id"]].add(int(ann["category_id"])) + records: list[ClassificationRecord] = [] + for iid, name in file_by_image.items(): + cat_set = cats[iid] + if len(cat_set) != 1: + raise ValueError( + f"COCO classification needs exactly one category per image; " + f"image {name!r} has {len(cat_set)}." 
+ ) + records.append({"sample_id": name, "label": next(iter(cat_set))}) + return records diff --git a/src/raitap/data/tests/test_label_formats.py b/src/raitap/data/tests/test_label_formats.py index 3bd65616..a88d9304 100644 --- a/src/raitap/data/tests/test_label_formats.py +++ b/src/raitap/data/tests/test_label_formats.py @@ -37,14 +37,86 @@ def test_registry_rejects_unknown_native(): resolve_label_format_adapter(LabelFormat.native, task_kind=TaskKind.detection) -@pytest.mark.xfail(reason="adapter added in task 4/5", strict=False) def test_registry_resolves_supported_task(): adapter = resolve_label_format_adapter(LabelFormat.coco, task_kind=TaskKind.detection) assert adapter.format is LabelFormat.coco assert TaskKind.detection in adapter.supported_tasks -@pytest.mark.xfail(reason="adapter added in task 4/5", strict=False) +@pytest.mark.xfail(reason="adapter added in task 5 (yolo)", strict=False) def test_registry_rejects_unsupported_task(): with pytest.raises(ValueError, match="does not support task"): resolve_label_format_adapter(LabelFormat.yolo, task_kind=TaskKind.classification) + + +def test_coco_detection_records(tmp_path): + import json + from raitap.data.adapters.coco import CocoAdapter + + coco = { + "images": [ + {"id": 1, "file_name": "a.jpg"}, + {"id": 2, "file_name": "b.jpg"}, + ], + "annotations": [ + {"image_id": 1, "category_id": 3, "bbox": [10, 20, 30, 40]}, + {"image_id": 1, "category_id": 5, "bbox": [0, 0, 5, 5]}, + ], + "categories": [{"id": 3, "name": "car"}, {"id": 5, "name": "dog"}], + } + p = tmp_path / "instances.json" + p.write_text(json.dumps(coco)) + + records = CocoAdapter().to_detection_records(p, image_dir=None, class_names=None) + by_id = {r["sample_id"]: r for r in records} + assert by_id["a.jpg"]["boxes"] == [[10, 20, 40, 60], [0, 0, 5, 5]] + assert by_id["a.jpg"]["labels"] == [3, 5] + assert by_id["b.jpg"] == {"sample_id": "b.jpg", "boxes": [], "labels": []} + + +def test_coco_classification_records(tmp_path): + import json + from 
raitap.data.adapters.coco import CocoAdapter + + coco = { + "images": [{"id": 1, "file_name": "a.jpg"}], + "annotations": [{"image_id": 1, "category_id": 7, "bbox": [0, 0, 1, 1]}], + "categories": [{"id": 7, "name": "cat"}], + } + p = tmp_path / "c.json" + p.write_text(json.dumps(coco)) + records = CocoAdapter().to_classification_records(p) + assert records == [{"sample_id": "a.jpg", "label": 7}] + + +def test_coco_classification_rejects_zero_categories(tmp_path): + import json + from raitap.data.adapters.coco import CocoAdapter + + coco = { + "images": [{"id": 1, "file_name": "a.jpg"}], + "annotations": [], + "categories": [{"id": 7, "name": "cat"}], + } + p = tmp_path / "zero.json" + p.write_text(json.dumps(coco)) + with pytest.raises(ValueError, match="exactly one category per image"): + CocoAdapter().to_classification_records(p) + + +def test_coco_classification_rejects_multiple_categories(tmp_path): + import json + from raitap.data.adapters.coco import CocoAdapter + + coco = { + "images": [{"id": 1, "file_name": "a.jpg"}], + "annotations": [ + {"image_id": 1, "category_id": 3, "bbox": [0, 0, 1, 1]}, + {"image_id": 1, "category_id": 5, "bbox": [0, 0, 1, 1]}, + ], + "categories": [{"id": 3, "name": "car"}, {"id": 5, "name": "dog"}], + } + p = tmp_path / "multi.json" + p.write_text(json.dumps(coco)) + with pytest.raises(ValueError, match="exactly one category per image"): + CocoAdapter().to_classification_records(p) From 46377485c9b298b6c5d12da3e0d4786052b54176 Mon Sep 17 00:00:00 2001 From: Stanislas Laurent Date: Tue, 23 Jun 2026 23:35:48 +0200 Subject: [PATCH 05/28] feat(data): add YOLO label-format adapter (refs #338) --- src/raitap/data/adapters/yolo.py | 74 +++++++++++++++++++++ src/raitap/data/tests/test_label_formats.py | 26 +++++++- 2 files changed, 99 insertions(+), 1 deletion(-) diff --git a/src/raitap/data/adapters/yolo.py b/src/raitap/data/adapters/yolo.py index e69de29b..fac219fa 100644 --- a/src/raitap/data/adapters/yolo.py +++ 
b/src/raitap/data/adapters/yolo.py @@ -0,0 +1,74 @@ +"""YOLO label-format adapter (issue #338).""" + +from __future__ import annotations + +from pathlib import Path + +from PIL import Image + +from raitap.data.label_formats import ( + ClassificationRecord, + DetectionRecord, + label_format, +) +from raitap.data.types import LabelFormat +from raitap.types import TaskKind + +_IMAGE_SUFFIXES = (".jpg", ".jpeg", ".png", ".bmp", ".webp") + + +@label_format +class YoloAdapter: + """YOLO per-image ``.txt`` (``class cx cy w h``, normalised) -> native records. + + Boxes are denormalised with each image's pixel size, read from + ``image_dir``. Class indices pass through unchanged. + """ + + format = LabelFormat.yolo + supported_tasks = frozenset({TaskKind.detection}) + + def _image_for(self, image_dir: Path, stem: str) -> Path: + for suffix in _IMAGE_SUFFIXES: + candidate = image_dir / f"{stem}{suffix}" + if candidate.exists(): + return candidate + raise ValueError( + f"YOLO adapter found no image for label {stem!r} in {image_dir}." + ) + + def to_detection_records( + self, source: Path, *, image_dir: Path | None, class_names: list[str] | None + ) -> list[DetectionRecord]: + if image_dir is None: + raise ValueError( + "YOLO labels need image_dir to denormalise boxes; " + "set data.source to the image directory." 
+ ) + records: list[DetectionRecord] = [] + for txt in sorted(source.glob("*.txt")): + image_path = self._image_for(image_dir, txt.stem) + with Image.open(image_path) as im: + width, height = im.size + boxes: list[list[float]] = [] + labels: list[int] = [] + for line in txt.read_text().splitlines(): + parts = line.split() + if not parts: + continue + cls, cx, cy, bw, bh = (float(p) for p in parts[:5]) + x1 = (cx - bw / 2) * width + y1 = (cy - bh / 2) * height + x2 = (cx + bw / 2) * width + y2 = (cy + bh / 2) * height + boxes.append([x1, y1, x2, y2]) + labels.append(int(cls)) + records.append( + {"sample_id": image_path.name, "boxes": boxes, "labels": labels} + ) + return records + + def to_classification_records( + self, source: Path + ) -> list[ClassificationRecord]: + raise ValueError("YOLO is a detection-only format.") diff --git a/src/raitap/data/tests/test_label_formats.py b/src/raitap/data/tests/test_label_formats.py index a88d9304..3f4774cb 100644 --- a/src/raitap/data/tests/test_label_formats.py +++ b/src/raitap/data/tests/test_label_formats.py @@ -43,7 +43,6 @@ def test_registry_resolves_supported_task(): assert TaskKind.detection in adapter.supported_tasks -@pytest.mark.xfail(reason="adapter added in task 5 (yolo)", strict=False) def test_registry_rejects_unsupported_task(): with pytest.raises(ValueError, match="does not support task"): resolve_label_format_adapter(LabelFormat.yolo, task_kind=TaskKind.classification) @@ -120,3 +119,28 @@ def test_coco_classification_rejects_multiple_categories(tmp_path): p.write_text(json.dumps(coco)) with pytest.raises(ValueError, match="exactly one category per image"): CocoAdapter().to_classification_records(p) + + +def test_yolo_detection_records(tmp_path): + from PIL import Image + from raitap.data.adapters.yolo import YoloAdapter + + image_dir = tmp_path / "images" + image_dir.mkdir() + Image.new("RGB", (100, 200)).save(image_dir / "a.jpg") # w=100, h=200 + + label_dir = tmp_path / "labels" + label_dir.mkdir() + # 
class=2, cx=0.5 cy=0.5 w=0.2 h=0.1 -> center (50,100), box 20x20px + (label_dir / "a.txt").write_text("2 0.5 0.5 0.2 0.1\n") + + records = YoloAdapter().to_detection_records( + label_dir, image_dir=image_dir, class_names=None + ) + assert len(records) == 1 + rec = records[0] + assert rec["sample_id"] == "a.jpg" + assert rec["labels"] == [2] + # x1 = (0.5-0.1)*100=40, y1=(0.5-0.05)*200=90, x2=60, y2=110 + assert len(rec["boxes"]) == 1 + assert rec["boxes"][0] == pytest.approx([40.0, 90.0, 60.0, 110.0]) From ff416640cb09922a6a2cf2eaceec13a6cff45bac Mon Sep 17 00:00:00 2001 From: Stanislas Laurent Date: Tue, 23 Jun 2026 23:39:03 +0200 Subject: [PATCH 06/28] feat(data): add Pascal-VOC label-format adapter (refs #338) --- src/raitap/data/adapters/voc.py | 76 +++++++++++++++++++++ src/raitap/data/tests/test_label_formats.py | 21 ++++++ 2 files changed, 97 insertions(+) diff --git a/src/raitap/data/adapters/voc.py b/src/raitap/data/adapters/voc.py index e69de29b..98fb11bd 100644 --- a/src/raitap/data/adapters/voc.py +++ b/src/raitap/data/adapters/voc.py @@ -0,0 +1,76 @@ +"""Pascal-VOC label-format adapter (issue #338).""" + +from __future__ import annotations + +import xml.etree.ElementTree as ET +from pathlib import Path + +from raitap.data.label_formats import ( + ClassificationRecord, + DetectionRecord, + label_format, +) +from raitap.data.types import LabelFormat +from raitap.types import TaskKind + +#: Canonical Pascal-VOC class order (index = label id) when no class_names given. +_VOC_CLASSES = ( + "aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat", + "chair", "cow", "diningtable", "dog", "horse", "motorbike", "person", + "pottedplant", "sheep", "sofa", "train", "tvmonitor", +) + + +@label_format +class VocAdapter: + """Pascal-VOC per-image ``.xml`` -> native detection records. + + Boxes are already ``[xmin, ymin, xmax, ymax]`` pixels. Class names map to + ids by their position in ``class_names`` (else the standard 20-class VOC + order). 
+ """ + + format = LabelFormat.voc + supported_tasks = frozenset({TaskKind.detection}) + + def to_detection_records( + self, source: Path, *, image_dir: Path | None, class_names: list[str] | None + ) -> list[DetectionRecord]: + name_to_id = { + name: idx + for idx, name in enumerate(class_names if class_names else _VOC_CLASSES) + } + records: list[DetectionRecord] = [] + for xml_path in sorted(source.glob("*.xml")): + root = ET.parse(xml_path).getroot() + filename_el = root.find("filename") + if filename_el is None or not filename_el.text: + raise ValueError(f"VOC file {xml_path} has no .") + boxes: list[list[float]] = [] + labels: list[int] = [] + for obj in root.findall("object"): + name = obj.findtext("name") + if name not in name_to_id: + raise ValueError( + f"VOC class {name!r} in {xml_path.name} is not in the " + f"class list {sorted(name_to_id)}." + ) + box = obj.find("bndbox") + boxes.append( + [ + float(box.findtext("xmin")), + float(box.findtext("ymin")), + float(box.findtext("xmax")), + float(box.findtext("ymax")), + ] + ) + labels.append(name_to_id[name]) + records.append( + {"sample_id": filename_el.text, "boxes": boxes, "labels": labels} + ) + return records + + def to_classification_records( + self, source: Path + ) -> list[ClassificationRecord]: + raise ValueError("VOC is a detection-only format.") diff --git a/src/raitap/data/tests/test_label_formats.py b/src/raitap/data/tests/test_label_formats.py index 3f4774cb..25aa55e0 100644 --- a/src/raitap/data/tests/test_label_formats.py +++ b/src/raitap/data/tests/test_label_formats.py @@ -144,3 +144,24 @@ def test_yolo_detection_records(tmp_path): # x1 = (0.5-0.1)*100=40, y1=(0.5-0.05)*200=90, x2=60, y2=110 assert len(rec["boxes"]) == 1 assert rec["boxes"][0] == pytest.approx([40.0, 90.0, 60.0, 110.0]) + + +def test_voc_detection_records(tmp_path): + from raitap.data.adapters.voc import VocAdapter + + xml = """ + a.jpg + person + 10203040 + + """ + d = tmp_path / "ann" + d.mkdir() + (d / 
"a.xml").write_text(xml) + + records = VocAdapter().to_detection_records( + d, image_dir=None, class_names=["background", "person", "car"] + ) + assert records == [ + {"sample_id": "a.jpg", "boxes": [[10.0, 20.0, 30.0, 40.0]], "labels": [1]} + ] From c248cc016874a1971722b61f4379d1f7b9e2b578 Mon Sep 17 00:00:00 2001 From: Stanislas Laurent Date: Tue, 23 Jun 2026 23:43:49 +0200 Subject: [PATCH 07/28] feat(model): dispatch detection labels on data.labels.format (refs #338) --- src/raitap/data/tests/test_label_formats.py | 33 ++++++++++++++++++ src/raitap/task_families/detection.py | 37 +++++++++++++++++---- 2 files changed, 64 insertions(+), 6 deletions(-) diff --git a/src/raitap/data/tests/test_label_formats.py b/src/raitap/data/tests/test_label_formats.py index 25aa55e0..991b1234 100644 --- a/src/raitap/data/tests/test_label_formats.py +++ b/src/raitap/data/tests/test_label_formats.py @@ -165,3 +165,36 @@ def test_voc_detection_records(tmp_path): assert records == [ {"sample_id": "a.jpg", "boxes": [[10.0, 20.0, 30.0, 40.0]], "labels": [1]} ] + + +def test_detection_load_labels_via_coco(tmp_path, monkeypatch): + import json + import torch + from types import SimpleNamespace + from raitap.task_families.detection import DetectionFamily + from raitap.data.types import LabelFormat + import raitap.data.data as data_mod + + coco = { + "images": [{"id": 1, "file_name": "a.jpg"}, {"id": 2, "file_name": "b.jpg"}], + "annotations": [{"image_id": 1, "category_id": 3, "bbox": [10, 20, 30, 40]}], + "categories": [{"id": 3, "name": "car"}], + } + labels_file = tmp_path / "instances.json" + labels_file.write_text(json.dumps(coco)) + + monkeypatch.setattr( + data_mod, "get_source_path", lambda source, *, kind: tmp_path / source + ) + # tmp_path/"instances.json" is LABELS; tmp_path/"imgs" is DATA (unused by coco). 
+ cfg = SimpleNamespace( + data=SimpleNamespace( + source="imgs", + labels=SimpleNamespace(source="instances.json", format=LabelFormat.coco), + ) + ) + tensor = [object(), object()] # len == 2 samples + out = DetectionFamily().load_labels(cfg, tensor=tensor, sample_ids=["a.jpg", "b.jpg"]) + assert torch.equal(out[0]["boxes"], torch.tensor([[10.0, 20.0, 40.0, 60.0]])) + assert torch.equal(out[0]["labels"], torch.tensor([3])) + assert out[1]["boxes"].shape == (0, 4) diff --git a/src/raitap/task_families/detection.py b/src/raitap/task_families/detection.py index 15614e64..a37920a6 100644 --- a/src/raitap/task_families/detection.py +++ b/src/raitap/task_families/detection.py @@ -24,7 +24,7 @@ def _align_detection_records( *, expected: int, sample_ids: Any, -) -> list[dict[str, "Any"]]: +) -> list[dict[str, "torch.Tensor"]]: """Align native detection records to ``sample_ids`` and build tensors. Extracted from ``DetectionFamily.load_labels`` so label-format adapters can @@ -188,11 +188,36 @@ def load_labels(self, cfg: Any, *, tensor: Any, sample_ids: Any) -> Any: return None labels_path = get_source_path(labels_source, kind=SourceKind.LABELS) - with labels_path.open() as fh: - records = json.load(fh) - if not isinstance(records, list): - raise ValueError( - f"Detection labels file {labels_path} must be a JSON array." + + from raitap.data.types import LabelFormat + + fmt = _get_optional_config_value(labels_cfg, "format") or LabelFormat.native + if fmt == LabelFormat.native: + with labels_path.open() as fh: + records = json.load(fh) + if not isinstance(records, list): + raise ValueError( + f"Detection labels file {labels_path} must be a JSON array." 
+ ) + else: + from raitap.data.label_formats import resolve_label_format_adapter + + data_source = _get_optional_config_value(cfg.data, "source") + image_dir = ( + get_source_path(data_source, kind=SourceKind.DATA) + if data_source + else None + ) + class_names = ( + _get_optional_config_value(cfg.model, "class_names") + if hasattr(cfg, "model") + else None + ) + adapter = resolve_label_format_adapter( + LabelFormat(fmt), task_kind=self.kind + ) + records = adapter.to_detection_records( + labels_path, image_dir=image_dir, class_names=class_names ) return _align_detection_records( records, expected=len(tensor), sample_ids=sample_ids From 87e5d657cfe059715035025438804746de81f264 Mon Sep 17 00:00:00 2001 From: Stanislas Laurent Date: Tue, 23 Jun 2026 23:48:13 +0200 Subject: [PATCH 08/28] feat(data): dispatch classification labels on data.labels.format (refs #338) --- src/raitap/data/data.py | 28 +++++++++++++++++ src/raitap/data/tests/test_label_formats.py | 35 +++++++++++++++++++++ 2 files changed, 63 insertions(+) diff --git a/src/raitap/data/data.py b/src/raitap/data/data.py index b154b1b2..9b81806d 100644 --- a/src/raitap/data/data.py +++ b/src/raitap/data/data.py @@ -17,6 +17,7 @@ IdStrategy, InputModality, LabelEncoding, + LabelFormat, ) from raitap.data.utils import download_file from raitap.tracking.base_tracker import BaseTracker, Trackable @@ -281,6 +282,33 @@ def load_classification_labels( if labels_source == DIRECTORY_LABELS_SOURCE: return _load_directory_labels(sample_ids) + labels_format = _get_optional_config_value(labels_cfg, "format") or LabelFormat.native + if labels_format != LabelFormat.native: + from raitap.data.label_formats import resolve_label_format_adapter + + if not sample_ids: + raise ValueError( + f"Label format {LabelFormat(labels_format).value!r} requires " + "id-based alignment, but no sample ids were discovered." 
+ ) + labels_path = get_source_path(labels_source, kind=SourceKind.LABELS) + adapter = resolve_label_format_adapter( + LabelFormat(labels_format), task_kind=TaskKind.classification + ) + records = adapter.to_classification_records(labels_path) + id_series = pd.Series([r["sample_id"] for r in records]) + record_labels = [int(r["label"]) for r in records] + strategy = _resolve_id_strategy( + _get_optional_config_value(labels_cfg, "id_strategy") or "auto", id_series + ) + aligned = _align_labels_to_samples( + sample_ids=sample_ids, + raw_label_ids=id_series, + encoded_labels=record_labels, + strategy=strategy, + ) + return torch.tensor(aligned, dtype=torch.long) + labels_path = get_source_path(labels_source, kind=SourceKind.LABELS) labels_df = _load_tabular_frame(labels_path) if labels_df.empty: diff --git a/src/raitap/data/tests/test_label_formats.py b/src/raitap/data/tests/test_label_formats.py index 991b1234..db27ab9d 100644 --- a/src/raitap/data/tests/test_label_formats.py +++ b/src/raitap/data/tests/test_label_formats.py @@ -198,3 +198,38 @@ def test_detection_load_labels_via_coco(tmp_path, monkeypatch): assert torch.equal(out[0]["boxes"], torch.tensor([[10.0, 20.0, 40.0, 60.0]])) assert torch.equal(out[0]["labels"], torch.tensor([3])) assert out[1]["boxes"].shape == (0, 4) + + +def test_classification_load_labels_via_coco(tmp_path, monkeypatch): + import json + import torch + from types import SimpleNamespace + import raitap.data.data as data_mod + from raitap.data.data import load_classification_labels + from raitap.data.types import LabelFormat + + coco = { + "images": [{"id": 1, "file_name": "a.jpg"}, {"id": 2, "file_name": "b.jpg"}], + "annotations": [ + {"image_id": 1, "category_id": 0, "bbox": [0, 0, 1, 1]}, + {"image_id": 2, "category_id": 4, "bbox": [0, 0, 1, 1]}, + ], + "categories": [{"id": 0, "name": "x"}, {"id": 4, "name": "y"}], + } + labels_file = tmp_path / "c.json" + labels_file.write_text(json.dumps(coco)) + monkeypatch.setattr( + data_mod, 
"get_source_path", lambda source, *, kind: tmp_path / source + ) + cfg = SimpleNamespace( + data=SimpleNamespace( + source="imgs", + labels=SimpleNamespace( + source="c.json", format=LabelFormat.coco, id_strategy="stem" + ), + ) + ) + out = load_classification_labels( + cfg, tensor=torch.zeros(2), sample_ids=["a.jpg", "b.jpg"] + ) + assert torch.equal(out, torch.tensor([0, 4])) From d3510c5f35bee8f42f680f3ab4123f06c8933cee Mon Sep 17 00:00:00 2001 From: Stanislas Laurent Date: Tue, 23 Jun 2026 23:48:27 +0200 Subject: [PATCH 09/28] docs: document label-format adapters and data.labels.format (refs #338) --- docs/contributor/modules/data.md | 14 ++++++++++++++ docs/modules/data/configuration.md | 24 ++++++++++++++++++++++++ 2 files changed, 38 insertions(+) diff --git a/docs/contributor/modules/data.md b/docs/contributor/modules/data.md index 319d72c4..82bfc9a7 100644 --- a/docs/contributor/modules/data.md +++ b/docs/contributor/modules/data.md @@ -69,6 +69,20 @@ referenceable by name in `data.source`. Registration lives in 5. **Update docs** — add the new sample name to {doc}`/modules/data/own-vs-built-in`. +## Adding a label format + +1. Create `src/raitap/data/adapters/.py` with a class decorated + `@label_format`. Set `format = LabelFormat.` and + `supported_tasks = frozenset({...})`. +2. Implement `to_detection_records` and/or `to_classification_records`, + returning the native record shape (`{sample_id, boxes (xyxy), labels}` or + `{sample_id, label}`). Raise `ValueError` for an unsupported task. +3. Import it in `src/raitap/data/_label_format_adapters.py` so the decorator + fires. +4. Add a `LabelFormat` member in `src/raitap/data/types.py` and a row to the + label-format table in `docs/modules/data/configuration.md`. +5. Add tests in `src/raitap/data/tests/test_label_formats.py`. 
+ ## Sample discovery and label alignment `data.source` directories are walked **recursively** (`Path.rglob`); sample diff --git a/docs/modules/data/configuration.md b/docs/modules/data/configuration.md index 447c29d2..653c9b26 100644 --- a/docs/modules/data/configuration.md +++ b/docs/modules/data/configuration.md @@ -105,6 +105,16 @@ myst: nested ImageFolder layouts (e.g. `NORMAL/IM-0001.jpeg`) — required when filename stems collide across class subdirs. `"stem"` matches by basename only (flat-dir layouts). +:option: labels.format +:allowed: "native", "coco", "yolo", "voc" +:default: "native" +:description: External label file format. `"native"` (default) reads RAITAP's + own shape (classification: CSV/TSV/Parquet or the `"directory"` source; + detection: the JSON record list). `"coco"`, `"yolo"`, and `"voc"` convert a + standard annotation file to the native shape before alignment. `"yolo"` and + `"voc"` are detection only; `"coco"` serves detection and classification. + Non-native formats align by sample id, so a labels id is required. + :option: input_metadata :allowed: dict, null :default: null @@ -180,6 +190,20 @@ data = DataConfig( ) ``` +## Label formats + +RAITAP reads common annotation formats directly via `data.labels.format`. + +| Format | Detection | Classification | Source layout | +| -------- | --------- | -------------- | ---------------------------------------------- | +| `native` | yes | yes | JSON record list / CSV-TSV-Parquet | +| `coco` | yes | yes | single `instances.json` | +| `yolo` | yes | no | dir of per-image `.txt` (needs `data.source`) | +| `voc` | yes | no | dir of per-image `.xml` | + +COCO and YOLO labels keep their category ids unchanged. VOC class names map to +ids by `model.class_names` order, else the standard 20-class VOC order. 
+ For tabular models whose backend expects an unusual per-sample layout (such as ACAS Xu, a Torch network whose forward takes `(N, 1, 1, 5)`), supply `input_metadata.shape` explicitly so the pipeline reshapes the flat feature From 8314068986aec383fc65fa5e71d4f070469e42a1 Mon Sep 17 00:00:00 2001 From: Stanislas Laurent Date: Wed, 24 Jun 2026 00:06:46 +0200 Subject: [PATCH 10/28] style(data): satisfy ruff and pyright for label-format adapters (refs #338) --- src/raitap/data/__init__.py | 1 + src/raitap/data/_label_format_adapters.py | 5 +- src/raitap/data/adapters/coco.py | 22 ++-- src/raitap/data/adapters/voc.py | 56 ++++++--- src/raitap/data/adapters/yolo.py | 17 ++- src/raitap/data/label_formats.py | 14 +-- src/raitap/data/tests/test_label_formats.py | 127 +++++++++++++------- src/raitap/task_families/detection.py | 22 ++-- 8 files changed, 155 insertions(+), 109 deletions(-) diff --git a/src/raitap/data/__init__.py b/src/raitap/data/__init__.py index a363f644..1bcb08f3 100644 --- a/src/raitap/data/__init__.py +++ b/src/raitap/data/__init__.py @@ -19,6 +19,7 @@ from raitap.configs.schema import DataConfig, LabelsConfig from .data import Data, load_numpy_from_source, load_tensor_from_source + from .label_formats import LabelFormatAdapter, resolve_label_format_adapter from .metadata import DataInputMetadata, infer_data_input_metadata from .preprocessing import ( DataPreprocessingFactory, diff --git a/src/raitap/data/_label_format_adapters.py b/src/raitap/data/_label_format_adapters.py index d1267e18..41c06b01 100644 --- a/src/raitap/data/_label_format_adapters.py +++ b/src/raitap/data/_label_format_adapters.py @@ -1,7 +1,10 @@ +# pyright: reportUnusedImport=false """Imports every in-tree label-format adapter so the decorators fire. Imported for its side effects by -``raitap.data.label_formats.resolve_label_format_adapter``. +``raitap.data.label_formats.resolve_label_format_adapter``. 
Every import in this +module is intentionally side-effect-only (registers an adapter), so the +file-level ``reportUnusedImport=false`` above is correct. """ from __future__ import annotations diff --git a/src/raitap/data/adapters/coco.py b/src/raitap/data/adapters/coco.py index 7fb5b99f..3551e5f3 100644 --- a/src/raitap/data/adapters/coco.py +++ b/src/raitap/data/adapters/coco.py @@ -3,8 +3,10 @@ from __future__ import annotations import json -from pathlib import Path -from typing import Any +from typing import TYPE_CHECKING, Any + +if TYPE_CHECKING: + from pathlib import Path from raitap.data.label_formats import ( ClassificationRecord, @@ -32,18 +34,14 @@ def _load(self, source: Path) -> dict[str, Any]: with source.open() as fh: data = json.load(fh) if not isinstance(data, dict) or "images" not in data: - raise ValueError( - f"COCO file {source} must be an object with an 'images' array." - ) + raise ValueError(f"COCO file {source} must be an object with an 'images' array.") return data def to_detection_records( self, source: Path, *, image_dir: Path | None, class_names: list[str] | None ) -> list[DetectionRecord]: data = self._load(source) - file_by_image: dict[int, str] = { - img["id"]: img["file_name"] for img in data["images"] - } + file_by_image: dict[int, str] = {img["id"]: img["file_name"] for img in data["images"]} boxes: dict[int, list[list[float]]] = {iid: [] for iid in file_by_image} labels: dict[int, list[int]] = {iid: [] for iid in file_by_image} for ann in data.get("annotations", []): @@ -56,13 +54,9 @@ def to_detection_records( for iid in file_by_image ] - def to_classification_records( - self, source: Path - ) -> list[ClassificationRecord]: + def to_classification_records(self, source: Path) -> list[ClassificationRecord]: data = self._load(source) - file_by_image: dict[int, str] = { - img["id"]: img["file_name"] for img in data["images"] - } + file_by_image: dict[int, str] = {img["id"]: img["file_name"] for img in data["images"]} cats: dict[int, 
set[int]] = {iid: set() for iid in file_by_image} for ann in data.get("annotations", []): cats[ann["image_id"]].add(int(ann["category_id"])) diff --git a/src/raitap/data/adapters/voc.py b/src/raitap/data/adapters/voc.py index 98fb11bd..02a8f270 100644 --- a/src/raitap/data/adapters/voc.py +++ b/src/raitap/data/adapters/voc.py @@ -3,7 +3,10 @@ from __future__ import annotations import xml.etree.ElementTree as ET -from pathlib import Path +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from pathlib import Path from raitap.data.label_formats import ( ClassificationRecord, @@ -15,12 +18,36 @@ #: Canonical Pascal-VOC class order (index = label id) when no class_names given. _VOC_CLASSES = ( - "aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat", - "chair", "cow", "diningtable", "dog", "horse", "motorbike", "person", - "pottedplant", "sheep", "sofa", "train", "tvmonitor", + "aeroplane", + "bicycle", + "bird", + "boat", + "bottle", + "bus", + "car", + "cat", + "chair", + "cow", + "diningtable", + "dog", + "horse", + "motorbike", + "person", + "pottedplant", + "sheep", + "sofa", + "train", + "tvmonitor", ) +def _coord(box: ET.Element, tag: str, xml_path: Path) -> float: + text = box.findtext(tag) + if text is None: + raise ValueError(f"VOC bndbox in {xml_path.name} missing <{tag}>.") + return float(text) + + @label_format class VocAdapter: """Pascal-VOC per-image ``.xml`` -> native detection records. @@ -37,8 +64,7 @@ def to_detection_records( self, source: Path, *, image_dir: Path | None, class_names: list[str] | None ) -> list[DetectionRecord]: name_to_id = { - name: idx - for idx, name in enumerate(class_names if class_names else _VOC_CLASSES) + name: idx for idx, name in enumerate(class_names if class_names else _VOC_CLASSES) } records: list[DetectionRecord] = [] for xml_path in sorted(source.glob("*.xml")): @@ -56,21 +82,19 @@ def to_detection_records( f"class list {sorted(name_to_id)}." 
) box = obj.find("bndbox") + if box is None: + raise ValueError(f"VOC object in {xml_path.name} has no .") boxes.append( [ - float(box.findtext("xmin")), - float(box.findtext("ymin")), - float(box.findtext("xmax")), - float(box.findtext("ymax")), + _coord(box, "xmin", xml_path), + _coord(box, "ymin", xml_path), + _coord(box, "xmax", xml_path), + _coord(box, "ymax", xml_path), ] ) labels.append(name_to_id[name]) - records.append( - {"sample_id": filename_el.text, "boxes": boxes, "labels": labels} - ) + records.append({"sample_id": filename_el.text, "boxes": boxes, "labels": labels}) return records - def to_classification_records( - self, source: Path - ) -> list[ClassificationRecord]: + def to_classification_records(self, source: Path) -> list[ClassificationRecord]: raise ValueError("VOC is a detection-only format.") diff --git a/src/raitap/data/adapters/yolo.py b/src/raitap/data/adapters/yolo.py index fac219fa..be6419f8 100644 --- a/src/raitap/data/adapters/yolo.py +++ b/src/raitap/data/adapters/yolo.py @@ -2,10 +2,13 @@ from __future__ import annotations -from pathlib import Path +from typing import TYPE_CHECKING from PIL import Image +if TYPE_CHECKING: + from pathlib import Path + from raitap.data.label_formats import ( ClassificationRecord, DetectionRecord, @@ -33,9 +36,7 @@ def _image_for(self, image_dir: Path, stem: str) -> Path: candidate = image_dir / f"{stem}{suffix}" if candidate.exists(): return candidate - raise ValueError( - f"YOLO adapter found no image for label {stem!r} in {image_dir}." 
- ) + raise ValueError(f"YOLO adapter found no image for label {stem!r} in {image_dir}.") def to_detection_records( self, source: Path, *, image_dir: Path | None, class_names: list[str] | None @@ -63,12 +64,8 @@ def to_detection_records( y2 = (cy + bh / 2) * height boxes.append([x1, y1, x2, y2]) labels.append(int(cls)) - records.append( - {"sample_id": image_path.name, "boxes": boxes, "labels": labels} - ) + records.append({"sample_id": image_path.name, "boxes": boxes, "labels": labels}) return records - def to_classification_records( - self, source: Path - ) -> list[ClassificationRecord]: + def to_classification_records(self, source: Path) -> list[ClassificationRecord]: raise ValueError("YOLO is a detection-only format.") diff --git a/src/raitap/data/label_formats.py b/src/raitap/data/label_formats.py index 11ae1af3..19021a95 100644 --- a/src/raitap/data/label_formats.py +++ b/src/raitap/data/label_formats.py @@ -9,12 +9,12 @@ from __future__ import annotations -from pathlib import Path from typing import TYPE_CHECKING, Any, Protocol, TypeVar, runtime_checkable -from raitap.data.types import LabelFormat - if TYPE_CHECKING: + from pathlib import Path + + from raitap.data.types import LabelFormat from raitap.types import TaskKind #: Native intermediate record shapes (match the on-disk native formats). @@ -57,9 +57,7 @@ def label_format(cls: type[T]) -> type[T]: return cls -def resolve_label_format_adapter( - fmt: LabelFormat, *, task_kind: TaskKind -) -> LabelFormatAdapter: +def resolve_label_format_adapter(fmt: LabelFormat, *, task_kind: TaskKind) -> LabelFormatAdapter: """Return the adapter for ``fmt`` that supports ``task_kind``. Raises ``ValueError`` when no adapter is registered for ``fmt`` (e.g. @@ -67,7 +65,9 @@ def resolve_label_format_adapter( declare ``task_kind`` in ``supported_tasks``. """ # Import side-effect: register the in-tree adapters on first use. 
- from raitap.data import _label_format_adapters # noqa: F401 + from raitap.data import ( + _label_format_adapters, # noqa: F401 # pyright: ignore[reportUnusedImport] + ) adapter = LABEL_FORMAT_ADAPTERS.get(fmt) if adapter is None: diff --git a/src/raitap/data/tests/test_label_formats.py b/src/raitap/data/tests/test_label_formats.py index db27ab9d..39a04e18 100644 --- a/src/raitap/data/tests/test_label_formats.py +++ b/src/raitap/data/tests/test_label_formats.py @@ -1,24 +1,34 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, cast + import pytest -from raitap.data.types import LabelFormat + from raitap.configs.schema import LabelsConfig from raitap.data.label_formats import ( LABEL_FORMAT_ADAPTERS, label_format, resolve_label_format_adapter, ) +from raitap.data.types import LabelFormat from raitap.types import TaskKind +if TYPE_CHECKING: + from pathlib import Path + + from raitap.configs.schema import AppConfig -def test_label_format_members_are_string_values(): + +def test_label_format_members_are_string_values() -> None: assert LabelFormat.native == "native" assert {f.value for f in LabelFormat} == {"native", "coco", "yolo", "voc"} -def test_labels_config_defaults_to_native_format(): +def test_labels_config_defaults_to_native_format() -> None: assert LabelsConfig().format is LabelFormat.native -def test_label_format_decorator_registers_instance(): +def test_label_format_decorator_registers_instance() -> None: @label_format class _Dummy: format = LabelFormat.coco # reuse an enum member; popped below @@ -32,24 +42,25 @@ class _Dummy: LABEL_FORMAT_ADAPTERS.pop(LabelFormat.coco, None) -def test_registry_rejects_unknown_native(): +def test_registry_rejects_unknown_native() -> None: with pytest.raises(ValueError, match="No adapter"): resolve_label_format_adapter(LabelFormat.native, task_kind=TaskKind.detection) -def test_registry_resolves_supported_task(): +def test_registry_resolves_supported_task() -> None: adapter = 
resolve_label_format_adapter(LabelFormat.coco, task_kind=TaskKind.detection) assert adapter.format is LabelFormat.coco assert TaskKind.detection in adapter.supported_tasks -def test_registry_rejects_unsupported_task(): +def test_registry_rejects_unsupported_task() -> None: with pytest.raises(ValueError, match="does not support task"): resolve_label_format_adapter(LabelFormat.yolo, task_kind=TaskKind.classification) -def test_coco_detection_records(tmp_path): +def test_coco_detection_records(tmp_path: Path) -> None: import json + from raitap.data.adapters.coco import CocoAdapter coco = { @@ -73,8 +84,9 @@ def test_coco_detection_records(tmp_path): assert by_id["b.jpg"] == {"sample_id": "b.jpg", "boxes": [], "labels": []} -def test_coco_classification_records(tmp_path): +def test_coco_classification_records(tmp_path: Path) -> None: import json + from raitap.data.adapters.coco import CocoAdapter coco = { @@ -88,8 +100,9 @@ def test_coco_classification_records(tmp_path): assert records == [{"sample_id": "a.jpg", "label": 7}] -def test_coco_classification_rejects_zero_categories(tmp_path): +def test_coco_classification_rejects_zero_categories(tmp_path: Path) -> None: import json + from raitap.data.adapters.coco import CocoAdapter coco = { @@ -103,8 +116,9 @@ def test_coco_classification_rejects_zero_categories(tmp_path): CocoAdapter().to_classification_records(p) -def test_coco_classification_rejects_multiple_categories(tmp_path): +def test_coco_classification_rejects_multiple_categories(tmp_path: Path) -> None: import json + from raitap.data.adapters.coco import CocoAdapter coco = { @@ -121,8 +135,9 @@ def test_coco_classification_rejects_multiple_categories(tmp_path): CocoAdapter().to_classification_records(p) -def test_yolo_detection_records(tmp_path): +def test_yolo_detection_records(tmp_path: Path) -> None: from PIL import Image + from raitap.data.adapters.yolo import YoloAdapter image_dir = tmp_path / "images" @@ -134,9 +149,7 @@ def 
test_yolo_detection_records(tmp_path): # class=2, cx=0.5 cy=0.5 w=0.2 h=0.1 -> center (50,100), box 20x20px (label_dir / "a.txt").write_text("2 0.5 0.5 0.2 0.1\n") - records = YoloAdapter().to_detection_records( - label_dir, image_dir=image_dir, class_names=None - ) + records = YoloAdapter().to_detection_records(label_dir, image_dir=image_dir, class_names=None) assert len(records) == 1 rec = records[0] assert rec["sample_id"] == "a.jpg" @@ -146,7 +159,7 @@ def test_yolo_detection_records(tmp_path): assert rec["boxes"][0] == pytest.approx([40.0, 90.0, 60.0, 110.0]) -def test_voc_detection_records(tmp_path): +def test_voc_detection_records(tmp_path: Path) -> None: from raitap.data.adapters.voc import VocAdapter xml = """ @@ -162,18 +175,35 @@ def test_voc_detection_records(tmp_path): records = VocAdapter().to_detection_records( d, image_dir=None, class_names=["background", "person", "car"] ) - assert records == [ - {"sample_id": "a.jpg", "boxes": [[10.0, 20.0, 30.0, 40.0]], "labels": [1]} - ] + assert records == [{"sample_id": "a.jpg", "boxes": [[10.0, 20.0, 30.0, 40.0]], "labels": [1]}] + + +def test_voc_detection_rejects_object_without_bndbox(tmp_path: Path) -> None: + from raitap.data.adapters.voc import VocAdapter + + xml = """ + a.jpg + person + """ + d = tmp_path / "ann" + d.mkdir() + (d / "a.xml").write_text(xml) + + with pytest.raises(ValueError, match="has no "): + VocAdapter().to_detection_records( + d, image_dir=None, class_names=["background", "person", "car"] + ) -def test_detection_load_labels_via_coco(tmp_path, monkeypatch): +def test_detection_load_labels_via_coco(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: import json - import torch from types import SimpleNamespace - from raitap.task_families.detection import DetectionFamily - from raitap.data.types import LabelFormat + + import torch + import raitap.data.data as data_mod + from raitap.data.types import LabelFormat + from raitap.task_families.detection import DetectionFamily coco = { 
"images": [{"id": 1, "file_name": "a.jpg"}, {"id": 2, "file_name": "b.jpg"}], @@ -183,15 +213,16 @@ def test_detection_load_labels_via_coco(tmp_path, monkeypatch): labels_file = tmp_path / "instances.json" labels_file.write_text(json.dumps(coco)) - monkeypatch.setattr( - data_mod, "get_source_path", lambda source, *, kind: tmp_path / source - ) + monkeypatch.setattr(data_mod, "get_source_path", lambda source, *, kind: tmp_path / source) # tmp_path/"instances.json" is LABELS; tmp_path/"imgs" is DATA (unused by coco). - cfg = SimpleNamespace( - data=SimpleNamespace( - source="imgs", - labels=SimpleNamespace(source="instances.json", format=LabelFormat.coco), - ) + cfg = cast( + "AppConfig", + SimpleNamespace( + data=SimpleNamespace( + source="imgs", + labels=SimpleNamespace(source="instances.json", format=LabelFormat.coco), + ) + ), ) tensor = [object(), object()] # len == 2 samples out = DetectionFamily().load_labels(cfg, tensor=tensor, sample_ids=["a.jpg", "b.jpg"]) @@ -200,10 +231,14 @@ def test_detection_load_labels_via_coco(tmp_path, monkeypatch): assert out[1]["boxes"].shape == (0, 4) -def test_classification_load_labels_via_coco(tmp_path, monkeypatch): +def test_classification_load_labels_via_coco( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: import json - import torch from types import SimpleNamespace + + import torch + import raitap.data.data as data_mod from raitap.data.data import load_classification_labels from raitap.data.types import LabelFormat @@ -218,18 +253,18 @@ def test_classification_load_labels_via_coco(tmp_path, monkeypatch): } labels_file = tmp_path / "c.json" labels_file.write_text(json.dumps(coco)) - monkeypatch.setattr( - data_mod, "get_source_path", lambda source, *, kind: tmp_path / source - ) - cfg = SimpleNamespace( - data=SimpleNamespace( - source="imgs", - labels=SimpleNamespace( - source="c.json", format=LabelFormat.coco, id_strategy="stem" - ), - ) - ) - out = load_classification_labels( - cfg, tensor=torch.zeros(2), 
sample_ids=["a.jpg", "b.jpg"] + monkeypatch.setattr(data_mod, "get_source_path", lambda source, *, kind: tmp_path / source) + cfg = cast( + "AppConfig", + SimpleNamespace( + data=SimpleNamespace( + source="imgs", + labels=SimpleNamespace( + source="c.json", format=LabelFormat.coco, id_strategy="stem" + ), + ) + ), ) + out = load_classification_labels(cfg, tensor=torch.zeros(2), sample_ids=["a.jpg", "b.jpg"]) + assert out is not None assert torch.equal(out, torch.tensor([0, 4])) diff --git a/src/raitap/task_families/detection.py b/src/raitap/task_families/detection.py index a37920a6..97d6b2d3 100644 --- a/src/raitap/task_families/detection.py +++ b/src/raitap/task_families/detection.py @@ -15,6 +15,8 @@ from raitap.types import TaskKind if TYPE_CHECKING: + import torch + from raitap.models.torch_backend import TorchBackend from raitap.task_families.base import ExplainContext, ForwardContext @@ -24,7 +26,7 @@ def _align_detection_records( *, expected: int, sample_ids: Any, -) -> list[dict[str, "torch.Tensor"]]: +) -> list[dict[str, torch.Tensor]]: """Align native detection records to ``sample_ids`` and build tensors. Extracted from ``DetectionFamily.load_labels`` so label-format adapters can @@ -196,32 +198,22 @@ def load_labels(self, cfg: Any, *, tensor: Any, sample_ids: Any) -> Any: with labels_path.open() as fh: records = json.load(fh) if not isinstance(records, list): - raise ValueError( - f"Detection labels file {labels_path} must be a JSON array." 
- ) + raise ValueError(f"Detection labels file {labels_path} must be a JSON array.") else: from raitap.data.label_formats import resolve_label_format_adapter data_source = _get_optional_config_value(cfg.data, "source") - image_dir = ( - get_source_path(data_source, kind=SourceKind.DATA) - if data_source - else None - ) + image_dir = get_source_path(data_source, kind=SourceKind.DATA) if data_source else None class_names = ( _get_optional_config_value(cfg.model, "class_names") if hasattr(cfg, "model") else None ) - adapter = resolve_label_format_adapter( - LabelFormat(fmt), task_kind=self.kind - ) + adapter = resolve_label_format_adapter(LabelFormat(fmt), task_kind=self.kind) records = adapter.to_detection_records( labels_path, image_dir=image_dir, class_names=class_names ) - return _align_detection_records( - records, expected=len(tensor), sample_ids=sample_ids - ) + return _align_detection_records(records, expected=len(tensor), sample_ids=sample_ids) def validate_labels(self, labels: Any) -> None: # The detection loader returns ``list[dict]`` or ``None``. A bare tensor From 049475882104c4a877ef0087fb236a45b52de35d Mon Sep 17 00:00:00 2001 From: Stanislas Laurent Date: Wed, 24 Jun 2026 00:09:52 +0200 Subject: [PATCH 11/28] docs: note detection label formats match sample_id by exact name (refs #338) --- docs/modules/data/configuration.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/modules/data/configuration.md b/docs/modules/data/configuration.md index 653c9b26..0649ccfe 100644 --- a/docs/modules/data/configuration.md +++ b/docs/modules/data/configuration.md @@ -204,6 +204,10 @@ RAITAP reads common annotation formats directly via `data.labels.format`. COCO and YOLO labels keep their category ids unchanged. VOC class names map to ids by `model.class_names` order, else the standard 20-class VOC order. 
+Detection formats match each record's `sample_id` against the discovered image +file by exact name, so the image directory must be flat (nested subdirs are not +matched). Classification labels still align via `labels.id_strategy`. + For tabular models whose backend expects an unusual per-sample layout (such as ACAS Xu, a Torch network whose forward takes `(N, 1, 1, 5)`), supply `input_metadata.shape` explicitly so the pipeline reshapes the flat feature From 17a1bd008a435196b7cf378b64588f03bcdaa1cf Mon Sep 17 00:00:00 2001 From: Stanislas Laurent Date: Wed, 24 Jun 2026 02:45:04 +0200 Subject: [PATCH 12/28] docs: fix sphinx cross-ref and heading-level warnings for label formats (refs #338) --- docs/modules/data/configuration.md | 4 +--- src/raitap/data/types.py | 4 ++-- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/docs/modules/data/configuration.md b/docs/modules/data/configuration.md index 0649ccfe..8f030846 100644 --- a/docs/modules/data/configuration.md +++ b/docs/modules/data/configuration.md @@ -190,9 +190,7 @@ data = DataConfig( ) ``` -## Label formats - -RAITAP reads common annotation formats directly via `data.labels.format`. +**Label formats.** RAITAP reads common annotation formats directly via `data.labels.format`. | Format | Detection | Classification | Source layout | | -------- | --------- | -------------- | ---------------------------------------------- | diff --git a/src/raitap/data/types.py b/src/raitap/data/types.py index fc114554..999370e0 100644 --- a/src/raitap/data/types.py +++ b/src/raitap/data/types.py @@ -39,8 +39,8 @@ class LabelFormat(StrEnum): ``native`` is RAITAP's own shape (classification: CSV/TSV/Parquet or the ``directory`` source; detection: the JSON record list). The others are converted to the native intermediate by a registered - :class:`~raitap.data.label_formats.LabelFormatAdapter` before the task - family aligns them. StrEnum so YAML users can write the raw value. 
+ ``LabelFormatAdapter`` before the task family aligns them. StrEnum so YAML + users can write the raw value. """ native = "native" From a150a60d2f51eb00df5c2d57f8858bbb1cbf18ba Mon Sep 17 00:00:00 2001 From: Stanislas Laurent Date: Wed, 24 Jun 2026 02:53:04 +0200 Subject: [PATCH 13/28] test(data): use module-qualified load_classification_labels to avoid dual import (refs #338) --- src/raitap/data/tests/test_label_formats.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/raitap/data/tests/test_label_formats.py b/src/raitap/data/tests/test_label_formats.py index 39a04e18..657de141 100644 --- a/src/raitap/data/tests/test_label_formats.py +++ b/src/raitap/data/tests/test_label_formats.py @@ -240,7 +240,6 @@ def test_classification_load_labels_via_coco( import torch import raitap.data.data as data_mod - from raitap.data.data import load_classification_labels from raitap.data.types import LabelFormat coco = { @@ -265,6 +264,8 @@ def test_classification_load_labels_via_coco( ) ), ) - out = load_classification_labels(cfg, tensor=torch.zeros(2), sample_ids=["a.jpg", "b.jpg"]) + out = data_mod.load_classification_labels( + cfg, tensor=torch.zeros(2), sample_ids=["a.jpg", "b.jpg"] + ) assert out is not None assert torch.equal(out, torch.tensor([0, 4])) From ef74e217eee4b6c02072b83f205b58b9a90fbda5 Mon Sep 17 00:00:00 2001 From: Stanislas Laurent Date: Wed, 24 Jun 2026 03:50:13 +0200 Subject: [PATCH 14/28] feat(config): discriminated LabelsConfig variants, drop LabelFormat (refs #338) --- src/raitap/configs/schema.py | 55 ++++++++++++------- .../configs/tests/test_labels_schema.py | 23 ++++++++ src/raitap/data/__init__.py | 4 +- src/raitap/data/types.py | 24 -------- 4 files changed, 60 insertions(+), 46 deletions(-) create mode 100644 src/raitap/configs/tests/test_labels_schema.py diff --git a/src/raitap/configs/schema.py b/src/raitap/configs/schema.py index 1910345c..b616d85a 100644 --- a/src/raitap/configs/schema.py +++ 
b/src/raitap/configs/schema.py @@ -5,7 +5,7 @@ from omegaconf import MISSING -from raitap.data.types import IdStrategy, LabelEncoding, LabelFormat +from raitap.data.types import IdStrategy, LabelEncoding from raitap.types import Hardware, TaskKind if TYPE_CHECKING: @@ -70,27 +70,44 @@ class ModelConfig: @dataclass class LabelsConfig: - # Optional path to a labels file (currently CSV/TSV/Parquet), OR the reserved - # value "directory" (exposed as ``raitap.data.DIRECTORY_LABELS_SOURCE``) to - # derive classification labels from each sample's top-level class - # subdirectory (torchvision ImageFolder style; no labels file). - source: str | None = None - # Optional sample-id column for filename alignment (e.g. "image"). + _target_: str = MISSING + + +@dataclass +class TabularLabelsConfig(LabelsConfig): + _target_: str = "TabularLabelParser" + source: str = MISSING id_column: str | None = None - # Optional class-label column; when omitted, one-hot numeric columns are used via argmax. column: str | None = None - # Optional parsing strategy for labels: "index", "one_hot", or "argmax". encoding: LabelEncoding | None = None - # Strategy for matching label-file ids to discovered sample files. One of: - # "auto" — pick "relative_path" if any id contains "/" or "\\"; else "stem". - # "relative_path" — ids are resolved as posix-style paths relative to ``data.source`` - # (supports nested ImageFolder layouts with colliding stems). - # "stem" — flat-dir / basename matching: match by ``Path(id).stem`` only. id_strategy: IdStrategy = IdStrategy.auto - # External label file format. ``native`` (default) reads RAITAP's own - # shape. ``coco`` / ``yolo`` / ``voc`` are converted to the native - # intermediate before alignment. Requires id-based alignment (sample_ids). 
- format: LabelFormat = LabelFormat.native + + +@dataclass +class DirectoryLabelsConfig(LabelsConfig): + _target_: str = "DirectoryLabelParser" + + +@dataclass +class CocoLabelsConfig(LabelsConfig): + _target_: str = "CocoLabelParser" + source: str = MISSING + id_strategy: IdStrategy = IdStrategy.auto + + +@dataclass +class YoloLabelsConfig(LabelsConfig): + _target_: str = "YoloLabelParser" + source: str = MISSING + id_strategy: IdStrategy = IdStrategy.auto + + +@dataclass +class VocLabelsConfig(LabelsConfig): + _target_: str = "VocLabelParser" + source: str = MISSING + id_strategy: IdStrategy = IdStrategy.auto + class_names: list[str] | None = None @dataclass @@ -126,7 +143,7 @@ class DataConfig: # Forwarded to ``infer_input_spec`` so semantics and visualisers see the correct # modality for non-image data such as ACAS Xu's 5-feature tabular vector. input_metadata: dict[str, Any] | None = None - labels: LabelsConfig = field(default_factory=LabelsConfig) + labels: LabelsConfig | None = None @dataclass diff --git a/src/raitap/configs/tests/test_labels_schema.py b/src/raitap/configs/tests/test_labels_schema.py new file mode 100644 index 00000000..c22ffece --- /dev/null +++ b/src/raitap/configs/tests/test_labels_schema.py @@ -0,0 +1,23 @@ +import dataclasses + +import pytest + +from raitap.configs.schema import CocoLabelsConfig, DirectoryLabelsConfig + + +def test_coco_config_has_no_tabular_fields() -> None: + names = {f.name for f in dataclasses.fields(CocoLabelsConfig)} + assert "id_column" not in names + assert "column" not in names + assert "encoding" not in names + assert {"_target_", "source", "id_strategy"} <= names + + +def test_directory_config_has_only_target() -> None: + names = {f.name for f in dataclasses.fields(DirectoryLabelsConfig)} + assert names == {"_target_"} + + +def test_labelformat_enum_is_gone() -> None: + with pytest.raises(ImportError): + from raitap.data.types import LabelFormat # noqa: F401 diff --git a/src/raitap/data/__init__.py 
b/src/raitap/data/__init__.py index 1bcb08f3..0365b77f 100644 --- a/src/raitap/data/__init__.py +++ b/src/raitap/data/__init__.py @@ -13,7 +13,7 @@ from typing import TYPE_CHECKING, Any -from .types import DIRECTORY_LABELS_SOURCE, IdStrategy, LabelEncoding, LabelFormat, Preprocessing +from .types import IdStrategy, LabelEncoding, Preprocessing if TYPE_CHECKING: from raitap.configs.schema import DataConfig, LabelsConfig @@ -30,14 +30,12 @@ __all__ = [ - "DIRECTORY_LABELS_SOURCE", "Data", "DataConfig", "DataInputMetadata", "DataPreprocessingFactory", "IdStrategy", "LabelEncoding", - "LabelFormat", "LabelFormatAdapter", "LabelsConfig", "ModelInputTransformationFactory", diff --git a/src/raitap/data/types.py b/src/raitap/data/types.py index 999370e0..943f28d9 100644 --- a/src/raitap/data/types.py +++ b/src/raitap/data/types.py @@ -33,30 +33,6 @@ class IdStrategy(StrEnum): stem = "stem" -class LabelFormat(StrEnum): - """On-disk label file format selected by ``LabelsConfig.format``. - - ``native`` is RAITAP's own shape (classification: CSV/TSV/Parquet or the - ``directory`` source; detection: the JSON record list). The others are - converted to the native intermediate by a registered - ``LabelFormatAdapter`` before the task family aligns them. StrEnum so YAML - users can write the raw value. - """ - - native = "native" - coco = "coco" - yolo = "yolo" - voc = "voc" - - -#: Reserved ``LabelsConfig.source`` value selecting folder-as-label ingestion: -#: classification labels are derived from each sample's top-level class -#: subdirectory (torchvision ``ImageFolder`` style; no labels file). Kept as a -#: plain ``str`` so it round-trips through OmegaConf; ``LabelsConfig.source`` -#: stays ``str | None`` (a path or this sentinel). -DIRECTORY_LABELS_SOURCE = "directory" - - class Preprocessing(StrEnum): """Named values for ``DataConfig.preprocessing``. 
From 164e1513ed5826c8d34cbfbd629645bfb9521dbf Mon Sep 17 00:00:00 2001 From: Stanislas Laurent Date: Wed, 24 Jun 2026 04:13:36 +0200 Subject: [PATCH 15/28] feat(data): label parser family, factory, nested-group registration (refs #338) --- src/raitap/_adapters.py | 8 +- .../configs/tests/test_labels_schema.py | 85 ++++++++++++++++++- src/raitap/configs/zen.py | 1 + src/raitap/data/label_parsers/__init__.py | 14 +++ src/raitap/data/label_parsers/base.py | 28 ++++++ src/raitap/data/label_parsers/directory.py | 27 ++++++ src/raitap/data/label_parsers/factory.py | 34 ++++++++ src/raitap/data/label_parsers/registration.py | 42 +++++++++ 8 files changed, 235 insertions(+), 4 deletions(-) create mode 100644 src/raitap/data/label_parsers/__init__.py create mode 100644 src/raitap/data/label_parsers/base.py create mode 100644 src/raitap/data/label_parsers/directory.py create mode 100644 src/raitap/data/label_parsers/factory.py create mode 100644 src/raitap/data/label_parsers/registration.py diff --git a/src/raitap/_adapters.py b/src/raitap/_adapters.py index e1f9078e..b58b7c2b 100644 --- a/src/raitap/_adapters.py +++ b/src/raitap/_adapters.py @@ -305,10 +305,14 @@ def _register_core( if family is not None: cls._adapter_group = family.group builder = _build_schema_adapter(cls, schema_override or family.schema) + # Hydra groups use ``/`` for nesting; OmegaConf packages use ``.``. + # A nested group like ``data/labels`` must target package + # ``data.labels`` so the composed node lands at ``cfg.data.labels``. 
+ package_base = family.group.replace("/", ".") package = ( - f"{family.group}.{registry_name}" + f"{package_base}.{registry_name}" if family.package_style == "nested" - else family.group + else package_base ) store(builder, group=family.group, name=registry_name, package=package) _BUILDERS.setdefault(family.group, {})[registry_name] = builder diff --git a/src/raitap/configs/tests/test_labels_schema.py b/src/raitap/configs/tests/test_labels_schema.py index c22ffece..64ee9c84 100644 --- a/src/raitap/configs/tests/test_labels_schema.py +++ b/src/raitap/configs/tests/test_labels_schema.py @@ -1,4 +1,5 @@ import dataclasses +import importlib import pytest @@ -19,5 +20,85 @@ def test_directory_config_has_only_target() -> None: def test_labelformat_enum_is_gone() -> None: - with pytest.raises(ImportError): - from raitap.data.types import LabelFormat # noqa: F401 + import importlib + + data_types = importlib.import_module("raitap.data.types") + with pytest.raises(AttributeError): + getattr(data_types, "LabelFormat") # noqa: B009 + + +# Ground truth (see task-2-report.md): composing ``+data/labels=directory`` onto +# the AppConfig schema lands the variant at ``cfg.data.labels`` with the FQN +# ``_target_`` that hydra-zen ``builds()`` injects. +_COMPOSED_TARGET = "raitap.data.label_parsers.directory.DirectoryLabelParser" + + +def _register_labels_group() -> None: + """Register the ``data/labels`` group + AppConfig schema directly. + + Bypasses ``register_configs()`` (which imports transparency and other + families that are broken mid-refactor on this branch) by importing only the + label_parsers package — enough to fire the ``@label_parser`` decorator — and + flushing the hydra-zen store. The AppConfig schema is needed as the compose + base so the ``data.labels`` package has a struct to land in. 
+ """ + importlib.import_module("raitap.data.label_parsers") + from hydra.core.config_store import ConfigStore + + from raitap._adapters import store + from raitap.configs.schema import AppConfig + + store.add_to_hydra_store(overwrite_ok=True) + ConfigStore.instance().store(name="raitap_schema", node=AppConfig) + + +def test_directory_parser_group_lands_at_data_labels() -> None: + """De-risk (Path A): the nested ``data/labels`` group composes onto + ``cfg.data.labels`` as a single config (flat semantics at a nested path).""" + from hydra import compose, initialize + from hydra.core.global_hydra import GlobalHydra + + _register_labels_group() + GlobalHydra.instance().clear() + with initialize(version_base=None, config_path=None): + cfg = compose(config_name="raitap_schema", overrides=["+data/labels=directory"]) + # Assertion runs unconditionally (no swallowing). The composed value is the + # FQN hydra-zen stores, NOT the short dataclass default. + assert cfg.data.labels._target_ == _COMPOSED_TARGET + + +def test_directory_group_rejects_foreign_field() -> None: + """De-risk (Path A): a field the directory variant lacks fails at compose. + + Uses a struct-mode override (``data.labels.id_column=x`` — no ``+``) so + OmegaConf's struct check fires; ``+`` force-adds and would bypass it. 
+ """ + from hydra import compose, initialize + from hydra.core.global_hydra import GlobalHydra + from hydra.errors import ConfigCompositionException + + _register_labels_group() + GlobalHydra.instance().clear() + with pytest.raises(ConfigCompositionException), initialize(version_base=None, config_path=None): + compose( + config_name="raitap_schema", + overrides=["+data/labels=directory", "data.labels.id_column=x"], + ) + + +def test_create_label_parser_handles_both_target_forms() -> None: + """``create_label_parser`` must instantiate for BOTH ``_target_`` shapes: + + * short bare name (``DirectoryLabelsConfig()`` dataclass default), resolved + against the ``raitap.data.label_parsers.`` prefix; + * the dotted FQN hydra-zen ``builds()`` stamps on the group-composed cfg. + """ + _register_labels_group() + from raitap.data.label_parsers.directory import DirectoryLabelParser + from raitap.data.label_parsers.factory import create_label_parser + + short = create_label_parser(DirectoryLabelsConfig()) + assert isinstance(short, DirectoryLabelParser) + + fqn = create_label_parser({"_target_": _COMPOSED_TARGET}) + assert isinstance(fqn, DirectoryLabelParser) diff --git a/src/raitap/configs/zen.py b/src/raitap/configs/zen.py index a52d04e9..c0ac3928 100644 --- a/src/raitap/configs/zen.py +++ b/src/raitap/configs/zen.py @@ -49,6 +49,7 @@ def register_zen_groups() -> None: import importlib for pkg in ( + "raitap.data.label_parsers", "raitap.metrics", "raitap.reporting", "raitap.robustness", diff --git a/src/raitap/data/label_parsers/__init__.py b/src/raitap/data/label_parsers/__init__.py new file mode 100644 index 00000000..2b9e82ec --- /dev/null +++ b/src/raitap/data/label_parsers/__init__.py @@ -0,0 +1,14 @@ +"""Label parser family package. + +Importing this package fires the ``@label_parser`` decorator on every +in-tree parser module, registering them with the hydra-zen store. 
Each +concrete parser is re-exported here so the short ``_target_`` form (a bare +class name resolved against ``raitap.data.label_parsers.``) instantiates, +mirroring how ``raitap.metrics`` re-exports its metric computers. +""" + +from __future__ import annotations + +from .directory import DirectoryLabelParser + +__all__ = ["DirectoryLabelParser"] diff --git a/src/raitap/data/label_parsers/base.py b/src/raitap/data/label_parsers/base.py new file mode 100644 index 00000000..3c8cf47f --- /dev/null +++ b/src/raitap/data/label_parsers/base.py @@ -0,0 +1,28 @@ +"""Base protocol and type alias for label parsers.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, Any, Protocol, runtime_checkable + +if TYPE_CHECKING: + from raitap.types import TaskKind + +# Type alias for the union of parsed label representations. +ParsedLabels = "torch.Tensor | list[dict[str, torch.Tensor]] | None" + + +@runtime_checkable +class LabelParser(Protocol): + """Protocol every label-parser adapter must satisfy.""" + + supported_tasks: frozenset[TaskKind] + + def parse( + self, + *, + task_kind: TaskKind, + tensor: Any, + sample_ids: list[str] | None, + data_source: str | None, + class_names: list[str] | None, + ) -> Any: ... 
diff --git a/src/raitap/data/label_parsers/directory.py b/src/raitap/data/label_parsers/directory.py new file mode 100644 index 00000000..f1770e22 --- /dev/null +++ b/src/raitap/data/label_parsers/directory.py @@ -0,0 +1,27 @@ +"""Directory label parser stub (real logic lands in Task 3).""" + +from __future__ import annotations + +from typing import Any + +from raitap.configs.schema import DirectoryLabelsConfig +from raitap.data.label_parsers.registration import label_parser +from raitap.types import TaskKind + + +@label_parser(registry_name="directory", schema=DirectoryLabelsConfig) +class DirectoryLabelParser: + """Parse labels from directory structure (stub; returns None until Task 3).""" + + supported_tasks: frozenset[TaskKind] = frozenset({TaskKind.classification}) + + def parse( + self, + *, + task_kind: TaskKind, + tensor: Any, + sample_ids: list[str] | None, + data_source: str | None, + class_names: list[str] | None, + ) -> None: + return None diff --git a/src/raitap/data/label_parsers/factory.py b/src/raitap/data/label_parsers/factory.py new file mode 100644 index 00000000..b8d6cfd3 --- /dev/null +++ b/src/raitap/data/label_parsers/factory.py @@ -0,0 +1,34 @@ +"""Instantiation factory for label parsers (mirrors metrics/factory.py:44-60).""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, Any + +from hydra.utils import instantiate + +from raitap import raitap_log +from raitap.configs import cfg_to_dict, resolve_target + +if TYPE_CHECKING: + from raitap.data.label_parsers.base import LabelParser + +_LABELS_PREFIX = "raitap.data.label_parsers." 
+ + +def create_label_parser(labels_config: Any) -> LabelParser: + """Instantiate a label parser from Hydra-style config (``_target_`` + kwargs).""" + labels_cfg = cfg_to_dict(labels_config) + target_path: str = labels_cfg.get("_target_", "") + resolved_target = resolve_target(target_path, _LABELS_PREFIX) + labels_cfg["_target_"] = resolved_target + + try: + parser = instantiate(labels_cfg) + except Exception as e: + raitap_log.exception("Label parser instantiation failed for target %r", target_path) + raise ValueError( + f"Could not instantiate label parser {target_path!r}.\n" + "Check that _target_ points to a valid LabelParser implementation." + ) from e + + return parser diff --git a/src/raitap/data/label_parsers/registration.py b/src/raitap/data/label_parsers/registration.py new file mode 100644 index 00000000..3ade1ab7 --- /dev/null +++ b/src/raitap/data/label_parsers/registration.py @@ -0,0 +1,42 @@ +"""Family decorator for label-parser adapters. + +Mirrors ``raitap.metrics.registration`` exactly, with group ``data/labels`` +and ``package_style="flat"`` so composed configs land at ``cfg.data.labels``. +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, TypeVar, Unpack + +from raitap._adapters import AdapterDecoratorOptions, FamilyConfig, _register_core +from raitap.configs.schema import LabelsConfig + +if TYPE_CHECKING: + from collections.abc import Callable + + from raitap.data.label_parsers.base import LabelParser + +# ``flat``: ``DataConfig.labels`` is a single ``LabelsConfig`` (not a dict of +# named entries), so the composed variant lands directly at ``cfg.data.labels`` +# (package ``data.labels``), with parser names competing for that one slot. 
+LABELS = FamilyConfig( + group="data/labels", + schema=LabelsConfig, + package_style="flat", +) + +T = TypeVar("T", bound="LabelParser") + + +def label_parser( + **common: Unpack[AdapterDecoratorOptions], +) -> Callable[[type[T]], type[T]]: + """Decorator: register a label-parser adapter. + + ``registry_name`` is required. Mirrors ``metrics_adapter`` shape. + """ + + def wrap(cls: type[T]) -> type[T]: + return _register_core(cls, family=LABELS, **common) + + return wrap From f97979b9cb6df5a5e76ee055bd5831085312c94c Mon Sep 17 00:00:00 2001 From: Stanislas Laurent Date: Wed, 24 Jun 2026 04:24:32 +0200 Subject: [PATCH 16/28] refactor(data): resolve labels via parser, drop TaskFamily.load_labels (refs #338) --- src/raitap/data/data.py | 79 +++++++++------ src/raitap/data/label_parsers/directory.py | 36 ++++++- src/raitap/data/tests/test_label_parsers.py | 106 ++++++++++++++++++++ src/raitap/task_families/base.py | 4 - src/raitap/task_families/classification.py | 5 - src/raitap/task_families/detection.py | 60 ----------- 6 files changed, 184 insertions(+), 106 deletions(-) create mode 100644 src/raitap/data/tests/test_label_parsers.py diff --git a/src/raitap/data/data.py b/src/raitap/data/data.py index 9b81806d..46c4ee8d 100644 --- a/src/raitap/data/data.py +++ b/src/raitap/data/data.py @@ -12,12 +12,10 @@ from raitap import raitap_log from raitap.data.preprocessing import module_as_per_image_callable, resolve_preprocessing from raitap.data.types import ( - DIRECTORY_LABELS_SOURCE, MODALITY_EXTENSIONS, IdStrategy, InputModality, LabelEncoding, - LabelFormat, ) from raitap.data.utils import download_file from raitap.tracking.base_tracker import BaseTracker, Trackable @@ -75,7 +73,9 @@ def __init__( self.tensor = family.adapt_loaded_inputs(raw_tensor) family.validate_inputs(self.tensor) self.labels: torch.Tensor | list[dict[str, torch.Tensor]] | None - self.labels = family.load_labels(cfg, tensor=self.tensor, sample_ids=self.sample_ids) + self.labels = 
_resolve_and_parse_labels( + cfg, task_kind=self.task_kind, tensor=self.tensor, sample_ids=self.sample_ids + ) family.validate_labels(self.labels) def _load_data( @@ -237,6 +237,45 @@ def log(self, tracker: BaseTracker, **kwargs: Any) -> None: tracker.log_dataset(self.describe()) +def _resolve_and_parse_labels( + cfg: Any, + *, + task_kind: TaskKind, + tensor: Any, + sample_ids: list[str] | None, +) -> Any: + """Resolve cfg.data.labels to a parser, gate supported_tasks, call parse. + + Returns None when cfg.data.labels is not set. + """ + from raitap.data.label_parsers.factory import create_label_parser + + labels_cfg = _get_optional_config_value(cfg.data, "labels") + if labels_cfg is None: + return None + + parser = create_label_parser(labels_cfg) + + if task_kind not in parser.supported_tasks: + supported = ", ".join(sorted(str(t) for t in parser.supported_tasks)) + raise ValueError( + f"{type(parser).__name__} does not support task_kind={task_kind!r}. " + f"Supported tasks: {supported}." + ) + + data_source = _get_optional_config_value(cfg.data, "source") + model = getattr(cfg, "model", None) + class_names = _get_optional_config_value(model, "class_names") + + return parser.parse( + task_kind=task_kind, + tensor=tensor, + sample_ids=sample_ids, + data_source=data_source, + class_names=class_names, + ) + + def _load_directory_labels(sample_ids: list[str] | None) -> torch.Tensor | None: """Derive classification labels from each sample's top-level class folder (torchvision ImageFolder semantics). Returns None (with a warning) when @@ -273,42 +312,16 @@ def load_classification_labels( Aligns to ``sample_ids`` by id column when available, otherwise falls back to row order. Returns ``None`` when ``data.labels.source`` is unset, the file is empty, or alignment fails (callers then use predictions as targets). + + Note: directory and format-adapter branches have moved to dedicated + ``LabelParser`` implementations. 
This function handles the tabular (native) + path only and will be wrapped by ``TabularLabelParser`` in a later task. """ labels_cfg = _get_optional_config_value(cfg.data, "labels") labels_source = _get_optional_config_value(labels_cfg, "source") if not labels_source: return None - if labels_source == DIRECTORY_LABELS_SOURCE: - return _load_directory_labels(sample_ids) - - labels_format = _get_optional_config_value(labels_cfg, "format") or LabelFormat.native - if labels_format != LabelFormat.native: - from raitap.data.label_formats import resolve_label_format_adapter - - if not sample_ids: - raise ValueError( - f"Label format {LabelFormat(labels_format).value!r} requires " - "id-based alignment, but no sample ids were discovered." - ) - labels_path = get_source_path(labels_source, kind=SourceKind.LABELS) - adapter = resolve_label_format_adapter( - LabelFormat(labels_format), task_kind=TaskKind.classification - ) - records = adapter.to_classification_records(labels_path) - id_series = pd.Series([r["sample_id"] for r in records]) - record_labels = [int(r["label"]) for r in records] - strategy = _resolve_id_strategy( - _get_optional_config_value(labels_cfg, "id_strategy") or "auto", id_series - ) - aligned = _align_labels_to_samples( - sample_ids=sample_ids, - raw_label_ids=id_series, - encoded_labels=record_labels, - strategy=strategy, - ) - return torch.tensor(aligned, dtype=torch.long) - labels_path = get_source_path(labels_source, kind=SourceKind.LABELS) labels_df = _load_tabular_frame(labels_path) if labels_df.empty: diff --git a/src/raitap/data/label_parsers/directory.py b/src/raitap/data/label_parsers/directory.py index f1770e22..78cb5ffe 100644 --- a/src/raitap/data/label_parsers/directory.py +++ b/src/raitap/data/label_parsers/directory.py @@ -1,17 +1,26 @@ -"""Directory label parser stub (real logic lands in Task 3).""" +"""Directory label parser (torchvision ImageFolder semantics).""" from __future__ import annotations +from pathlib import PurePosixPath 
from typing import Any +from raitap import raitap_log from raitap.configs.schema import DirectoryLabelsConfig from raitap.data.label_parsers.registration import label_parser from raitap.types import TaskKind +from raitap.utils.lazy import lazy_import + +torch = lazy_import("torch") @label_parser(registry_name="directory", schema=DirectoryLabelsConfig) class DirectoryLabelParser: - """Parse labels from directory structure (stub; returns None until Task 3).""" + """Parse classification labels from the top-level class subfolder of each sample. + + Mirrors torchvision ``ImageFolder`` semantics: ``<class>/<file>`` layout. + Uses ``sample_ids`` only; ignores ``data_source`` and ``class_names``. + """ supported_tasks: frozenset[TaskKind] = frozenset({TaskKind.classification}) @@ -23,5 +32,24 @@ def parse( sample_ids: list[str] | None, data_source: str | None, class_names: list[str] | None, - ) -> None: - return None + ) -> Any: + """Derive a long-tensor of class indices from sample_ids directory layout.""" + if not sample_ids: + raitap_log.warn( + "DirectoryLabelParser needs image samples organised into " + "class subdirectories; none were found. Falling back to " + "predictions as metric targets." + ) + return None + parts_by_id = [PurePosixPath(sid).parts for sid in sample_ids] + if any(len(parts) < 2 for parts in parts_by_id): + raitap_log.warn( + "DirectoryLabelParser expects a / layout, but " + "one or more samples sit directly under the data source root " + "(no class subdirectory). Falling back to predictions as metric targets." 
+ ) + return None + classes = sorted({parts[0] for parts in parts_by_id}) + class_to_idx = {name: idx for idx, name in enumerate(classes)} + labels = [class_to_idx[parts[0]] for parts in parts_by_id] + return torch.tensor(labels, dtype=torch.long) diff --git a/src/raitap/data/tests/test_label_parsers.py b/src/raitap/data/tests/test_label_parsers.py new file mode 100644 index 00000000..f31351cf --- /dev/null +++ b/src/raitap/data/tests/test_label_parsers.py @@ -0,0 +1,106 @@ +"""Task 3 tests: _resolve_and_parse_labels + DirectoryLabelParser e2e.""" + +from __future__ import annotations + +import importlib +from types import SimpleNamespace +from typing import cast + +import pytest + +from raitap.configs.schema import AppConfig, DirectoryLabelsConfig +from raitap.data.data import _resolve_and_parse_labels +from raitap.types import TaskKind + + +def _make_cfg( + *, + labels: object = None, + source: str | None = None, + class_names: list[str] | None = None, +) -> AppConfig: + """Build a minimal AppConfig-shaped namespace for unit tests.""" + data_ns = SimpleNamespace(labels=labels, source=source) + model_ns = SimpleNamespace(class_names=class_names) + return cast("AppConfig", SimpleNamespace(data=data_ns, model=model_ns)) + + +def test_resolve_returns_none_when_labels_is_none() -> None: + cfg = _make_cfg(labels=None) + result = _resolve_and_parse_labels( + cfg, task_kind=TaskKind.classification, tensor=None, sample_ids=None + ) + assert result is None + + +def test_directory_parser_e2e_returns_label_tensor() -> None: + """DirectoryLabelParser derives class index from top-level folder name.""" + import torch + + cfg = _make_cfg(labels=DirectoryLabelsConfig()) + sample_ids = ["cat/a.jpg", "dog/b.jpg"] + result = _resolve_and_parse_labels( + cfg, task_kind=TaskKind.classification, tensor=None, sample_ids=sample_ids + ) + assert result is not None + assert isinstance(result, torch.Tensor) + assert result.dtype == torch.long + # "cat" < "dog" alphabetically -> cat=0, dog=1 
+ assert result.tolist() == [0, 1] + + +def test_directory_parser_raises_for_unsupported_task() -> None: + cfg = _make_cfg(labels=DirectoryLabelsConfig()) + sample_ids = ["cat/a.jpg", "dog/b.jpg"] + with pytest.raises(ValueError, match="does not support task_kind"): + _resolve_and_parse_labels( + cfg, task_kind=TaskKind.detection, tensor=None, sample_ids=sample_ids + ) + + +def test_directory_parser_returns_none_for_no_sample_ids() -> None: + """No sample_ids -> returns None with a warning (graceful degradation).""" + cfg = _make_cfg(labels=DirectoryLabelsConfig()) + result = _resolve_and_parse_labels( + cfg, task_kind=TaskKind.classification, tensor=None, sample_ids=None + ) + assert result is None + + +def test_directory_parser_returns_none_for_flat_layout() -> None: + """Samples directly under root (no class subdir) -> None with warning.""" + cfg = _make_cfg(labels=DirectoryLabelsConfig()) + sample_ids = ["a.jpg", "b.jpg"] + result = _resolve_and_parse_labels( + cfg, task_kind=TaskKind.classification, tensor=None, sample_ids=sample_ids + ) + assert result is None + + +# --- Integration: full hydra compose path --- + + +def _register_labels_group() -> None: + importlib.import_module("raitap.data.label_parsers") + from hydra.core.config_store import ConfigStore + + from raitap._adapters import store + from raitap.configs.schema import AppConfig + + store.add_to_hydra_store(overwrite_ok=True) + ConfigStore.instance().store(name="raitap_schema", node=AppConfig) + + +_COMPOSED_TARGET = "raitap.data.label_parsers.directory.DirectoryLabelParser" + + +def test_integration_compose_data_labels_directory() -> None: + """Composing +data/labels=directory lands cfg.data.labels._target_ at the FQN.""" + from hydra import compose, initialize + from hydra.core.global_hydra import GlobalHydra + + _register_labels_group() + GlobalHydra.instance().clear() + with initialize(version_base=None, config_path=None): + cfg = compose(config_name="raitap_schema", 
overrides=["+data/labels=directory"]) + assert cfg.data.labels._target_ == _COMPOSED_TARGET diff --git a/src/raitap/task_families/base.py b/src/raitap/task_families/base.py index 167cadad..848c4051 100644 --- a/src/raitap/task_families/base.py +++ b/src/raitap/task_families/base.py @@ -96,10 +96,6 @@ def validate_inputs(self, tensor: object) -> None: """Validate the (post-adapt) inputs match this family's contract.""" raise NotImplementedError - def load_labels(self, cfg: AppConfig, *, tensor: object, sample_ids: object) -> Any: - """Load labels in this family's on-disk shape (or None).""" - raise NotImplementedError - def validate_labels(self, labels: object) -> None: """Raise if loaded labels don't match this family's expected shape.""" raise NotImplementedError diff --git a/src/raitap/task_families/classification.py b/src/raitap/task_families/classification.py index 4f86b759..3f8c1bf1 100644 --- a/src/raitap/task_families/classification.py +++ b/src/raitap/task_families/classification.py @@ -61,11 +61,6 @@ def validate_inputs(self, tensor: Any) -> None: if tensor.shape[0] < 1: raise ValueError("Classification data is empty; loaded zero samples.") - def load_labels(self, cfg: Any, *, tensor: Any, sample_ids: Any) -> Any: - from raitap.data.data import load_classification_labels - - return load_classification_labels(cfg, tensor=tensor, sample_ids=sample_ids) - def validate_labels(self, labels: Any) -> None: # A ``list[dict]`` is a detection-shaped label set; a tensor (or None) # is classification-shaped. Disagreement means model and data declare diff --git a/src/raitap/task_families/detection.py b/src/raitap/task_families/detection.py index 97d6b2d3..eab54c78 100644 --- a/src/raitap/task_families/detection.py +++ b/src/raitap/task_families/detection.py @@ -155,66 +155,6 @@ def validate_inputs(self, tensor: Any) -> None: + (f" with shape {shape}." 
if shape is not None else ".") ) - def load_labels(self, cfg: Any, *, tensor: Any, sample_ids: Any) -> Any: - """Load per-sample detection targets (boxes + labels). - - Expected on-disk shape: JSON file (list of records) with each record - carrying ``sample_id`` (str), ``boxes`` (list of ``[x1, y1, x2, y2]`` - floats), and ``labels`` (list of ints). Returns a list whose length - equals ``len(tensor)``; each entry is a dict with - ``boxes: (M_i, 4) float32`` and ``labels: (M_i,) int64`` tensors. - Samples with no boxes get shape-``(0, 4)`` / shape-``(0,)`` tensors. - - Alignment rules: - - * When ``sample_ids`` is set, records are looked up by ``sample_id`` - and the output is ordered to match ``sample_ids``. Any sample - missing from the labels file → ``ValueError``; duplicate ``sample_id``s - in the labels file → ``ValueError``. - * When ``sample_ids`` is unset, records are consumed in file order - and must equal the dataset length exactly. - - Returns ``None`` when ``data.labels.source`` is unset. 
- """ - import json - - from raitap.data.data import ( - SourceKind, - _get_optional_config_value, - get_source_path, - ) - - labels_cfg = _get_optional_config_value(cfg.data, "labels") - labels_source = _get_optional_config_value(labels_cfg, "source") - if not labels_source: - return None - - labels_path = get_source_path(labels_source, kind=SourceKind.LABELS) - - from raitap.data.types import LabelFormat - - fmt = _get_optional_config_value(labels_cfg, "format") or LabelFormat.native - if fmt == LabelFormat.native: - with labels_path.open() as fh: - records = json.load(fh) - if not isinstance(records, list): - raise ValueError(f"Detection labels file {labels_path} must be a JSON array.") - else: - from raitap.data.label_formats import resolve_label_format_adapter - - data_source = _get_optional_config_value(cfg.data, "source") - image_dir = get_source_path(data_source, kind=SourceKind.DATA) if data_source else None - class_names = ( - _get_optional_config_value(cfg.model, "class_names") - if hasattr(cfg, "model") - else None - ) - adapter = resolve_label_format_adapter(LabelFormat(fmt), task_kind=self.kind) - records = adapter.to_detection_records( - labels_path, image_dir=image_dir, class_names=class_names - ) - return _align_detection_records(records, expected=len(tensor), sample_ids=sample_ids) - def validate_labels(self, labels: Any) -> None: # The detection loader returns ``list[dict]`` or ``None``. 
A bare tensor # is a classification-shaped label set; disagreement means model and From 4970e2cd1d1232b34b80809824c3782103202055 Mon Sep 17 00:00:00 2001 From: Stanislas Laurent Date: Wed, 24 Jun 2026 04:30:34 +0200 Subject: [PATCH 17/28] feat(data): TabularLabelParser (refs #338) --- src/raitap/data/label_parsers/__init__.py | 3 +- src/raitap/data/label_parsers/tabular.py | 109 ++++++++++++++++++++ src/raitap/data/tests/test_label_parsers.py | 63 +++++++++++ 3 files changed, 174 insertions(+), 1 deletion(-) create mode 100644 src/raitap/data/label_parsers/tabular.py diff --git a/src/raitap/data/label_parsers/__init__.py b/src/raitap/data/label_parsers/__init__.py index 2b9e82ec..70234d3b 100644 --- a/src/raitap/data/label_parsers/__init__.py +++ b/src/raitap/data/label_parsers/__init__.py @@ -10,5 +10,6 @@ class name resolved against ``raitap.data.label_parsers.``) instantiates, from __future__ import annotations from .directory import DirectoryLabelParser +from .tabular import TabularLabelParser # pyright: ignore[reportUnusedImport] -__all__ = ["DirectoryLabelParser"] +__all__ = ["DirectoryLabelParser", "TabularLabelParser"] diff --git a/src/raitap/data/label_parsers/tabular.py b/src/raitap/data/label_parsers/tabular.py new file mode 100644 index 00000000..529c4455 --- /dev/null +++ b/src/raitap/data/label_parsers/tabular.py @@ -0,0 +1,109 @@ +"""Tabular label parser (CSV / TSV / Parquet).""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, Any + +from raitap import raitap_log +from raitap.configs.schema import TabularLabelsConfig +from raitap.data.data import ( + SourceKind, + _align_labels_to_samples, + _column_as_series, + _extract_class_labels, + _load_tabular_frame, + _resolve_id_strategy, + _resolve_labels_id_column, + get_source_path, +) +from raitap.data.label_parsers.registration import label_parser +from raitap.data.types import IdStrategy +from raitap.types import TaskKind +from raitap.utils.lazy import lazy_import + +if 
TYPE_CHECKING: + import torch + +torch = lazy_import("torch") # type: ignore[assignment] + + +@label_parser(registry_name="tabular", schema=TabularLabelsConfig) +class TabularLabelParser: + """Parse classification labels from a CSV, TSV, or Parquet file. + + Aligns to ``sample_ids`` via ``id_column`` when available; falls back to + row order otherwise. Returns ``None`` on empty file or count mismatch. + """ + + supported_tasks: frozenset[TaskKind] = frozenset({TaskKind.classification}) + + def __init__( + self, + *, + source: str, + id_column: str | None = None, + column: str | None = None, + encoding: Any = None, + id_strategy: IdStrategy = IdStrategy.auto, + ) -> None: + self.source = source + self.id_column = id_column + self.column = column + self.encoding = encoding + self.id_strategy = id_strategy + + def parse( + self, + *, + task_kind: TaskKind, + tensor: Any, + sample_ids: list[str] | None, + data_source: str | None, + class_names: list[str] | None, + ) -> Any: + """Load tabular labels and align to samples.""" + labels_path = get_source_path(self.source, kind=SourceKind.LABELS) + labels_df = _load_tabular_frame(labels_path) + if labels_df.empty: + raitap_log.warn("Labels file is empty; falling back to predictions as targets.") + return None + + id_column = _resolve_labels_id_column(labels_df, self.id_column) + encoded_labels = _extract_class_labels( + labels_df, + labels_column=self.column, + id_column=id_column, + labels_encoding=self.encoding, + ) + + expected = len(tensor) if tensor is not None else len(encoded_labels) + if sample_ids and id_column: + id_series = _column_as_series(labels_df, id_column) + strategy = _resolve_id_strategy(str(self.id_strategy), id_series) + try: + aligned_labels = _align_labels_to_samples( + sample_ids=sample_ids, + raw_label_ids=id_series, + encoded_labels=encoded_labels, + strategy=strategy, + ) + except ValueError as error: + raitap_log.warn( + f"{error} Falling back to predictions as metric targets.", + ) + return None 
+ return torch.tensor(aligned_labels, dtype=torch.long) + + if sample_ids and not id_column: + raitap_log.warn( + "Could not find a labels id column for filename alignment; using row-order labels.", + ) + + if len(encoded_labels) != expected: + raitap_log.warn( + f"Label count ({len(encoded_labels)}) does not match sample count ({expected}); " + "falling back to predictions as targets.", + ) + return None + + return torch.tensor(encoded_labels, dtype=torch.long) diff --git a/src/raitap/data/tests/test_label_parsers.py b/src/raitap/data/tests/test_label_parsers.py index f31351cf..d6ab391e 100644 --- a/src/raitap/data/tests/test_label_parsers.py +++ b/src/raitap/data/tests/test_label_parsers.py @@ -104,3 +104,66 @@ def test_integration_compose_data_labels_directory() -> None: with initialize(version_base=None, config_path=None): cfg = compose(config_name="raitap_schema", overrides=["+data/labels=directory"]) assert cfg.data.labels._target_ == _COMPOSED_TARGET + + +# --- Task 4: TabularLabelParser --- + + +def _write_csv(path: object, content: str) -> None: + import pathlib + + pathlib.Path(str(path)).write_text(content, encoding="utf-8") + + +def test_tabular_parser_e2e_via_resolve_and_parse_labels(tmp_path: object) -> None: + """CSV with image,label rows + sample_ids -> aligned long tensor via resolve.""" + import pathlib + + import torch + + from raitap.configs.schema import TabularLabelsConfig + from raitap.data.data import _resolve_and_parse_labels + + csv_path = pathlib.Path(str(tmp_path)) / "labels.csv" + _write_csv(csv_path, "image,label\nb.jpg,1\na.jpg,0\n") + + cfg = _make_cfg( + labels=TabularLabelsConfig( + source=str(csv_path), + id_column="image", + ) + ) + sample_ids = ["a.jpg", "b.jpg"] + result = _resolve_and_parse_labels( + cfg, task_kind=TaskKind.classification, tensor=None, sample_ids=sample_ids + ) + assert result is not None + assert isinstance(result, torch.Tensor) + assert result.dtype == torch.long + # a.jpg -> label 0, b.jpg -> label 1 + 
assert result.tolist() == [0, 1] + + +def test_tabular_parser_direct_unit(tmp_path: object) -> None: + """Direct TabularLabelParser.parse unit test without cfg dispatch.""" + import pathlib + + import torch + + from raitap.data.label_parsers.tabular import TabularLabelParser + + csv_path = pathlib.Path(str(tmp_path)) / "labels.csv" + _write_csv(csv_path, "image,label\na.jpg,0\nb.jpg,1\n") + + parser = TabularLabelParser(source=str(csv_path), id_column="image") + result = parser.parse( + task_kind=TaskKind.classification, + tensor=None, + sample_ids=["a.jpg", "b.jpg"], + data_source=None, + class_names=None, + ) + assert result is not None + assert isinstance(result, torch.Tensor) + assert result.dtype == torch.long + assert result.tolist() == [0, 1] From e9907dc8ed44fc5875bd7c285f0d6d0e75523876 Mon Sep 17 00:00:00 2001 From: Stanislas Laurent Date: Wed, 24 Jun 2026 04:38:30 +0200 Subject: [PATCH 18/28] feat(data): CocoLabelParser detection and classification (refs #338) --- src/raitap/data/label_parsers/__init__.py | 3 +- src/raitap/data/label_parsers/coco.py | 124 +++++++++++++++ src/raitap/data/tests/test_label_parsers.py | 168 ++++++++++++++++++++ 3 files changed, 294 insertions(+), 1 deletion(-) create mode 100644 src/raitap/data/label_parsers/coco.py diff --git a/src/raitap/data/label_parsers/__init__.py b/src/raitap/data/label_parsers/__init__.py index 70234d3b..6d122d82 100644 --- a/src/raitap/data/label_parsers/__init__.py +++ b/src/raitap/data/label_parsers/__init__.py @@ -9,7 +9,8 @@ class name resolved against ``raitap.data.label_parsers.``) instantiates, from __future__ import annotations +from .coco import CocoLabelParser # pyright: ignore[reportUnusedImport] from .directory import DirectoryLabelParser from .tabular import TabularLabelParser # pyright: ignore[reportUnusedImport] -__all__ = ["DirectoryLabelParser", "TabularLabelParser"] +__all__ = ["CocoLabelParser", "DirectoryLabelParser", "TabularLabelParser"] diff --git 
a/src/raitap/data/label_parsers/coco.py b/src/raitap/data/label_parsers/coco.py new file mode 100644 index 00000000..71c57011 --- /dev/null +++ b/src/raitap/data/label_parsers/coco.py @@ -0,0 +1,124 @@ +"""COCO label parser (detection + classification).""" + +from __future__ import annotations + +import json +from typing import TYPE_CHECKING, Any + +from raitap.configs.schema import CocoLabelsConfig +from raitap.data.data import ( + SourceKind, + _align_labels_to_samples, + _resolve_id_strategy, + get_source_path, +) +from raitap.data.label_parsers.registration import label_parser +from raitap.data.types import IdStrategy +from raitap.task_families.detection import _align_detection_records +from raitap.types import TaskKind + +if TYPE_CHECKING: + from pathlib import Path + + import pandas as pd + + +@label_parser(registry_name="coco", schema=CocoLabelsConfig) +class CocoLabelParser: + """Parse COCO ``instances.json`` labels for detection or classification. + + Detection: ``bbox`` ``[x, y, w, h]`` -> ``[x1, y1, x2, y2]``; ``category_id`` + passes through unchanged. Classification: one category per image; images with + 0 or >1 categories raise ValueError. 
+ """ + + supported_tasks: frozenset[TaskKind] = frozenset({TaskKind.detection, TaskKind.classification}) + + def __init__( + self, + *, + source: str, + id_strategy: IdStrategy = IdStrategy.auto, + ) -> None: + self.source = source + self.id_strategy = id_strategy + + # --- internal helpers (ported verbatim from adapters/coco.py) --- + + def _load(self, source: Path) -> dict[str, Any]: + with source.open() as fh: + data = json.load(fh) + if not isinstance(data, dict) or "images" not in data: + raise ValueError(f"COCO file {source} must be an object with an 'images' array.") + return data + + def _to_detection_records(self, data: dict[str, Any]) -> list[dict[str, Any]]: + file_by_image: dict[int, str] = {img["id"]: img["file_name"] for img in data["images"]} + boxes: dict[int, list[list[float]]] = {iid: [] for iid in file_by_image} + labels: dict[int, list[int]] = {iid: [] for iid in file_by_image} + for ann in data.get("annotations", []): + iid = ann["image_id"] + x, y, w, h = ann["bbox"] + boxes[iid].append([x, y, x + w, y + h]) + labels[iid].append(int(ann["category_id"])) + return [ + {"sample_id": file_by_image[iid], "boxes": boxes[iid], "labels": labels[iid]} + for iid in file_by_image + ] + + def _to_classification_records(self, data: dict[str, Any]) -> list[dict[str, Any]]: + file_by_image: dict[int, str] = {img["id"]: img["file_name"] for img in data["images"]} + cats: dict[int, set[int]] = {iid: set() for iid in file_by_image} + for ann in data.get("annotations", []): + cats[ann["image_id"]].add(int(ann["category_id"])) + records: list[dict[str, Any]] = [] + for iid, name in file_by_image.items(): + cat_set = cats[iid] + if len(cat_set) != 1: + raise ValueError( + f"COCO classification needs exactly one category per image; " + f"image {name!r} has {len(cat_set)}." 
+ ) + records.append({"sample_id": name, "label": next(iter(cat_set))}) + return records + + # --- public parse method --- + + def parse( + self, + *, + task_kind: TaskKind, + tensor: Any, + sample_ids: list[str] | None, + data_source: str | None, + class_names: list[str] | None, + ) -> Any: + """Load and align COCO labels for detection or classification.""" + import pandas as pd + + labels_path = get_source_path(self.source, kind=SourceKind.LABELS) + data = self._load(labels_path) + + if task_kind is TaskKind.detection: + records = self._to_detection_records(data) + return _align_detection_records( + records, + expected=len(tensor), + sample_ids=sample_ids, + ) + + # classification + records = self._to_classification_records(data) + raw_ids: list[str] = [r["sample_id"] for r in records] + encoded: list[int] = [r["label"] for r in records] + id_series: pd.Series = pd.Series(raw_ids) + strategy = _resolve_id_strategy(str(self.id_strategy), id_series) + aligned = _align_labels_to_samples( + sample_ids=sample_ids or [], + raw_label_ids=id_series, + encoded_labels=encoded, + strategy=strategy, + ) + import torch + + return torch.tensor(aligned, dtype=torch.long) diff --git a/src/raitap/data/tests/test_label_parsers.py b/src/raitap/data/tests/test_label_parsers.py index d6ab391e..295bb4dc 100644 --- a/src/raitap/data/tests/test_label_parsers.py +++ b/src/raitap/data/tests/test_label_parsers.py @@ -167,3 +167,171 @@ def test_tabular_parser_direct_unit(tmp_path: object) -> None: assert isinstance(result, torch.Tensor) assert result.dtype == torch.long assert result.tolist() == [0, 1] + + +# --- Task 5: CocoLabelParser --- + + +def _write_json(path: object, data: object) -> None: + import json + import pathlib + + pathlib.Path(str(path)).write_text(json.dumps(data), encoding="utf-8") + + +def _coco_detection_fixture(tmp_path: object) -> object: + """Two-image COCO with one annotated image and one empty image.""" + import pathlib + + coco = { + "images": [ + {"id": 1, 
"file_name": "a.jpg"}, + {"id": 2, "file_name": "b.jpg"}, + ], + "annotations": [ + {"image_id": 1, "category_id": 3, "bbox": [10, 20, 30, 40]}, + {"image_id": 1, "category_id": 5, "bbox": [0, 0, 5, 5]}, + ], + "categories": [{"id": 3, "name": "car"}, {"id": 5, "name": "dog"}], + } + p = pathlib.Path(str(tmp_path)) / "instances.json" + _write_json(p, coco) + return p + + +def _coco_classification_fixture(tmp_path: object) -> object: + """Two-image COCO for classification (one category per image).""" + import pathlib + + coco = { + "images": [ + {"id": 1, "file_name": "a.jpg"}, + {"id": 2, "file_name": "b.jpg"}, + ], + "annotations": [ + {"image_id": 1, "category_id": 0, "bbox": [0, 0, 1, 1]}, + {"image_id": 2, "category_id": 4, "bbox": [0, 0, 1, 1]}, + ], + "categories": [{"id": 0, "name": "x"}, {"id": 4, "name": "y"}], + } + p = pathlib.Path(str(tmp_path)) / "cls.json" + _write_json(p, coco) + return p + + +def test_coco_parser_detection_direct(tmp_path: object) -> None: + """CocoLabelParser.parse detection: boxes xyxy, labels, empty-image shape.""" + import torch + + from raitap.data.label_parsers.coco import CocoLabelParser + + labels_path = _coco_detection_fixture(tmp_path) + parser = CocoLabelParser(source=str(labels_path)) + tensor = [object(), object()] # two samples + result = parser.parse( + task_kind=TaskKind.detection, + tensor=tensor, + sample_ids=["a.jpg", "b.jpg"], + data_source=None, + class_names=None, + ) + assert isinstance(result, list) + assert len(result) == 2 + # a.jpg: two boxes, xyxy conversion + expected_boxes = torch.tensor([[10.0, 20.0, 40.0, 60.0], [0.0, 0.0, 5.0, 5.0]]) + assert torch.equal(result[0]["boxes"], expected_boxes) + assert torch.equal(result[0]["labels"], torch.tensor([3, 5])) + # b.jpg: empty annotation -> (0, 4) boxes, (0,) labels + assert result[1]["boxes"].shape == (0, 4) + assert result[1]["labels"].shape == (0,) + + +def test_coco_parser_classification_direct(tmp_path: object) -> None: + """CocoLabelParser.parse 
classification: long tensor of category ids.""" + import torch + + from raitap.data.label_parsers.coco import CocoLabelParser + + labels_path = _coco_classification_fixture(tmp_path) + parser = CocoLabelParser(source=str(labels_path)) + result = parser.parse( + task_kind=TaskKind.classification, + tensor=None, + sample_ids=["a.jpg", "b.jpg"], + data_source=None, + class_names=None, + ) + assert isinstance(result, torch.Tensor) + assert result.dtype == torch.long + assert result.tolist() == [0, 4] + + +def test_coco_parser_classification_rejects_multiple_categories(tmp_path: object) -> None: + """Classification parse raises ValueError when an image has >1 categories.""" + import pathlib + + from raitap.data.label_parsers.coco import CocoLabelParser + + coco = { + "images": [{"id": 1, "file_name": "a.jpg"}], + "annotations": [ + {"image_id": 1, "category_id": 3, "bbox": [0, 0, 1, 1]}, + {"image_id": 1, "category_id": 5, "bbox": [0, 0, 1, 1]}, + ], + "categories": [{"id": 3, "name": "car"}, {"id": 5, "name": "dog"}], + } + p = pathlib.Path(str(tmp_path)) / "multi.json" + _write_json(p, coco) + parser = CocoLabelParser(source=str(p)) + with pytest.raises(ValueError, match="exactly one category per image"): + parser.parse( + task_kind=TaskKind.classification, + tensor=None, + sample_ids=["a.jpg"], + data_source=None, + class_names=None, + ) + + +def test_coco_parser_detection_e2e_via_resolve(tmp_path: object) -> None: + """Detection e2e: _resolve_and_parse_labels with CocoLabelsConfig.""" + import torch + + from raitap.configs.schema import CocoLabelsConfig + from raitap.data.data import _resolve_and_parse_labels + + labels_path = _coco_detection_fixture(tmp_path) + cfg = _make_cfg(labels=CocoLabelsConfig(source=str(labels_path))) + tensor = [object(), object()] + result = _resolve_and_parse_labels( + cfg, + task_kind=TaskKind.detection, + tensor=tensor, + sample_ids=["a.jpg", "b.jpg"], + ) + assert isinstance(result, list) + assert len(result) == 2 + expected_boxes = 
torch.tensor([[10.0, 20.0, 40.0, 60.0], [0.0, 0.0, 5.0, 5.0]]) + assert torch.equal(result[0]["boxes"], expected_boxes) + assert torch.equal(result[0]["labels"], torch.tensor([3, 5])) + assert result[1]["boxes"].shape == (0, 4) + + +def test_coco_parser_classification_e2e_via_resolve(tmp_path: object) -> None: + """Classification e2e: _resolve_and_parse_labels with CocoLabelsConfig.""" + import torch + + from raitap.configs.schema import CocoLabelsConfig + from raitap.data.data import _resolve_and_parse_labels + + labels_path = _coco_classification_fixture(tmp_path) + cfg = _make_cfg(labels=CocoLabelsConfig(source=str(labels_path))) + result = _resolve_and_parse_labels( + cfg, + task_kind=TaskKind.classification, + tensor=None, + sample_ids=["a.jpg", "b.jpg"], + ) + assert isinstance(result, torch.Tensor) + assert result.dtype == torch.long + assert result.tolist() == [0, 4] From 6e6972ba926b616b256a11fb8202588f04d6b13d Mon Sep 17 00:00:00 2001 From: Stanislas Laurent Date: Wed, 24 Jun 2026 04:45:03 +0200 Subject: [PATCH 19/28] feat(data): YoloLabelParser with e2e image-dir resolution (refs #338) --- src/raitap/data/label_parsers/__init__.py | 3 +- src/raitap/data/label_parsers/yolo.py | 92 ++++++++++++++++++ src/raitap/data/tests/test_label_parsers.py | 101 ++++++++++++++++++++ 3 files changed, 195 insertions(+), 1 deletion(-) create mode 100644 src/raitap/data/label_parsers/yolo.py diff --git a/src/raitap/data/label_parsers/__init__.py b/src/raitap/data/label_parsers/__init__.py index 6d122d82..38fab7ce 100644 --- a/src/raitap/data/label_parsers/__init__.py +++ b/src/raitap/data/label_parsers/__init__.py @@ -12,5 +12,6 @@ class name resolved against ``raitap.data.label_parsers.``) instantiates, from .coco import CocoLabelParser # pyright: ignore[reportUnusedImport] from .directory import DirectoryLabelParser from .tabular import TabularLabelParser # pyright: ignore[reportUnusedImport] +from .yolo import YoloLabelParser # pyright: ignore[reportUnusedImport] 
-__all__ = ["CocoLabelParser", "DirectoryLabelParser", "TabularLabelParser"] +__all__ = ["CocoLabelParser", "DirectoryLabelParser", "TabularLabelParser", "YoloLabelParser"] diff --git a/src/raitap/data/label_parsers/yolo.py b/src/raitap/data/label_parsers/yolo.py new file mode 100644 index 00000000..75ce80c6 --- /dev/null +++ b/src/raitap/data/label_parsers/yolo.py @@ -0,0 +1,92 @@ +"""YOLO label parser (detection-only).""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, Any + +from PIL import Image + +from raitap.configs.schema import YoloLabelsConfig +from raitap.data.data import SourceKind, get_source_path +from raitap.data.label_parsers.registration import label_parser +from raitap.data.types import IdStrategy +from raitap.task_families.detection import _align_detection_records +from raitap.types import TaskKind + +if TYPE_CHECKING: + from pathlib import Path + +_IMAGE_SUFFIXES = (".jpg", ".jpeg", ".png", ".bmp", ".webp") + + +@label_parser(registry_name="yolo", schema=YoloLabelsConfig) +class YoloLabelParser: + """Parse YOLO per-image ``.txt`` (``class cx cy w h``, normalised) for detection. + + Boxes are denormalised to pixel ``[x1, y1, x2, y2]`` using each image's + size read from PIL. Class indices pass through unchanged. 
+ """ + + supported_tasks: frozenset[TaskKind] = frozenset({TaskKind.detection}) + + def __init__( + self, + *, + source: str, + id_strategy: IdStrategy = IdStrategy.auto, + ) -> None: + self.source = source + self.id_strategy = id_strategy + + def _image_for(self, image_dir: Path, stem: str) -> Path: + for suffix in _IMAGE_SUFFIXES: + candidate = image_dir / f"{stem}{suffix}" + if candidate.exists(): + return candidate + raise ValueError(f"YOLO parser found no image for label {stem!r} in {image_dir}.") + + def _to_detection_records(self, labels_dir: Path, image_dir: Path) -> list[dict[str, Any]]: + records: list[dict[str, Any]] = [] + for txt in sorted(labels_dir.glob("*.txt")): + image_path = self._image_for(image_dir, txt.stem) + with Image.open(image_path) as im: + width, height = im.size + boxes: list[list[float]] = [] + labels: list[int] = [] + for line in txt.read_text().splitlines(): + parts = line.split() + if not parts: + continue + cls, cx, cy, bw, bh = (float(p) for p in parts[:5]) + x1 = (cx - bw / 2) * width + y1 = (cy - bh / 2) * height + x2 = (cx + bw / 2) * width + y2 = (cy + bh / 2) * height + boxes.append([x1, y1, x2, y2]) + labels.append(int(cls)) + records.append({"sample_id": image_path.name, "boxes": boxes, "labels": labels}) + return records + + def parse( + self, + *, + task_kind: TaskKind, + tensor: Any, + sample_ids: list[str] | None, + data_source: str | None, + class_names: list[str] | None, + ) -> Any: + """Load YOLO labels and align to sample_ids for detection.""" + if data_source is None: + raise ValueError( + "YOLO labels need data.source (image directory) to denormalise boxes; " + "set data.source to the image directory." 
+ ) + labels_dir = get_source_path(self.source, kind=SourceKind.LABELS) + image_dir = get_source_path(data_source, kind=SourceKind.DATA) + records = self._to_detection_records(labels_dir, image_dir) + return _align_detection_records( + records, + expected=len(tensor), + sample_ids=sample_ids, + ) diff --git a/src/raitap/data/tests/test_label_parsers.py b/src/raitap/data/tests/test_label_parsers.py index 295bb4dc..51525158 100644 --- a/src/raitap/data/tests/test_label_parsers.py +++ b/src/raitap/data/tests/test_label_parsers.py @@ -335,3 +335,104 @@ def test_coco_parser_classification_e2e_via_resolve(tmp_path: object) -> None: assert isinstance(result, torch.Tensor) assert result.dtype == torch.long assert result.tolist() == [0, 4] + + +# --- Task 6: YoloLabelParser --- + + +def _make_yolo_fixture( + tmp_path: object, +) -> tuple[object, object]: + """Create a minimal YOLO label dir + image dir with two images. + + Returns (labels_dir, image_dir). Images are 200x100 px. + Each .txt has one box: class 0, cx=0.5, cy=0.5, w=0.6, h=0.1. + Denormalised: x1=(0.5-0.3)*200=40, y1=(0.5-0.05)*100=45, + x2=(0.5+0.3)*200=160, y2=(0.5+0.05)*100=55. 
+ """ + import pathlib + + from PIL import Image as PILImage + + tmp = pathlib.Path(str(tmp_path)) + labels_dir = tmp / "labels" + labels_dir.mkdir() + image_dir = tmp / "images" + image_dir.mkdir() + + for stem in ("a", "b"): + img = PILImage.new("RGB", (200, 100)) + img.save(image_dir / f"{stem}.jpg") + (labels_dir / f"{stem}.txt").write_text("0 0.5 0.5 0.6 0.1\n", encoding="utf-8") + + return labels_dir, image_dir + + +def test_yolo_parser_unit(tmp_path: object) -> None: + """YoloLabelParser.parse: boxes denormalised via PIL image size.""" + from raitap.data.label_parsers.yolo import YoloLabelParser + + labels_dir, image_dir = _make_yolo_fixture(tmp_path) + parser = YoloLabelParser(source=str(labels_dir)) + + tensor = [object(), object()] + result = parser.parse( + task_kind=TaskKind.detection, + tensor=tensor, + sample_ids=["a.jpg", "b.jpg"], + data_source=str(image_dir), + class_names=None, + ) + + assert isinstance(result, list) + assert len(result) == 2 + # IEEE-754: (0.5+0.05)*100 = 55.00000000000001 -> use pytest.approx + assert result[0]["boxes"][0].tolist() == pytest.approx([40.0, 45.0, 160.0, (0.5 + 0.05) * 100]) + assert result[0]["labels"].tolist() == [0] + assert result[1]["boxes"].shape == (1, 4) + + +def test_yolo_parser_raises_when_data_source_none(tmp_path: object) -> None: + """parse raises ValueError when data_source is None (no image dir).""" + from raitap.data.label_parsers.yolo import YoloLabelParser + + labels_dir, _ = _make_yolo_fixture(tmp_path) + parser = YoloLabelParser(source=str(labels_dir)) + with pytest.raises(ValueError, match=r"data\.source"): + parser.parse( + task_kind=TaskKind.detection, + tensor=[object()], + sample_ids=None, + data_source=None, + class_names=None, + ) + + +def test_yolo_parser_e2e_via_resolve(tmp_path: object) -> None: + """E2E: _resolve_and_parse_labels with YoloLabelsConfig + real image dir. + + Exercises image_dir resolution through the dispatch (gap #1). 
+ """ + from raitap.configs.schema import YoloLabelsConfig + from raitap.data.data import _resolve_and_parse_labels + + labels_dir, image_dir = _make_yolo_fixture(tmp_path) + + cfg = _make_cfg( + labels=YoloLabelsConfig(source=str(labels_dir)), + source=str(image_dir), + ) + tensor = [object(), object()] + result = _resolve_and_parse_labels( + cfg, + task_kind=TaskKind.detection, + tensor=tensor, + sample_ids=["a.jpg", "b.jpg"], + ) + + assert isinstance(result, list) + assert len(result) == 2 + assert result[0]["boxes"][0].tolist() == pytest.approx([40.0, 45.0, 160.0, (0.5 + 0.05) * 100]) + assert result[0]["labels"].tolist() == [0] + assert result[1]["boxes"].shape == (1, 4) + assert result[1]["labels"].tolist() == [0] From 4898f4d85e472c506d3c00cb9c21fc4040b8d549 Mon Sep 17 00:00:00 2001 From: Stanislas Laurent Date: Wed, 24 Jun 2026 04:53:20 +0200 Subject: [PATCH 20/28] feat(data): VocLabelParser with class_names precedence and e2e (refs #338) --- src/raitap/data/label_parsers/__init__.py | 9 +- src/raitap/data/label_parsers/voc.py | 129 +++++++++++++++ src/raitap/data/tests/test_label_parsers.py | 165 ++++++++++++++++++++ 3 files changed, 302 insertions(+), 1 deletion(-) create mode 100644 src/raitap/data/label_parsers/voc.py diff --git a/src/raitap/data/label_parsers/__init__.py b/src/raitap/data/label_parsers/__init__.py index 38fab7ce..053ad444 100644 --- a/src/raitap/data/label_parsers/__init__.py +++ b/src/raitap/data/label_parsers/__init__.py @@ -12,6 +12,13 @@ class name resolved against ``raitap.data.label_parsers.``) instantiates, from .coco import CocoLabelParser # pyright: ignore[reportUnusedImport] from .directory import DirectoryLabelParser from .tabular import TabularLabelParser # pyright: ignore[reportUnusedImport] +from .voc import VocLabelParser # pyright: ignore[reportUnusedImport] from .yolo import YoloLabelParser # pyright: ignore[reportUnusedImport] -__all__ = ["CocoLabelParser", "DirectoryLabelParser", "TabularLabelParser", 
"YoloLabelParser"] +__all__ = [ + "CocoLabelParser", + "DirectoryLabelParser", + "TabularLabelParser", + "VocLabelParser", + "YoloLabelParser", +] diff --git a/src/raitap/data/label_parsers/voc.py b/src/raitap/data/label_parsers/voc.py new file mode 100644 index 00000000..eccba526 --- /dev/null +++ b/src/raitap/data/label_parsers/voc.py @@ -0,0 +1,129 @@ +"""Pascal-VOC label parser (detection-only).""" + +from __future__ import annotations + +import xml.etree.ElementTree as ET +from typing import TYPE_CHECKING, Any + +from raitap.configs.schema import VocLabelsConfig +from raitap.data.data import SourceKind, get_source_path +from raitap.data.label_parsers.registration import label_parser +from raitap.data.types import IdStrategy +from raitap.task_families.detection import _align_detection_records +from raitap.types import TaskKind + +if TYPE_CHECKING: + from pathlib import Path + +#: Canonical Pascal-VOC class order (index = label id) when no class_names given. +_VOC_CLASSES = ( + "aeroplane", + "bicycle", + "bird", + "boat", + "bottle", + "bus", + "car", + "cat", + "chair", + "cow", + "diningtable", + "dog", + "horse", + "motorbike", + "person", + "pottedplant", + "sheep", + "sofa", + "train", + "tvmonitor", +) + + +def _coord(box: ET.Element, tag: str, xml_path: Path) -> float: + text = box.findtext(tag) + if text is None: + raise ValueError(f"VOC bndbox in {xml_path.name} missing <{tag}>.") + return float(text) + + +@label_parser(registry_name="voc", schema=VocLabelsConfig) +class VocLabelParser: + """Parse Pascal-VOC per-image ``.xml`` for detection. + + Boxes are already ``[xmin, ymin, xmax, ymax]`` pixels. Class names map to + ids by their position in the active name list (parser's own ``class_names``, + else the ``class_names`` arg from ``cfg.model.class_names``, else the + standard 20-class VOC order). 
+ """ + + supported_tasks: frozenset[TaskKind] = frozenset({TaskKind.detection}) + + def __init__( + self, + *, + source: str, + id_strategy: IdStrategy = IdStrategy.auto, + class_names: list[str] | None = None, + ) -> None: + self.source = source + self.id_strategy = id_strategy + self.class_names = class_names + + def _to_detection_records( + self, labels_dir: Path, name_to_id: dict[str, int] + ) -> list[dict[str, Any]]: + records: list[dict[str, Any]] = [] + for xml_path in sorted(labels_dir.glob("*.xml")): + root = ET.parse(xml_path).getroot() + filename_el = root.find("filename") + if filename_el is None or not filename_el.text: + raise ValueError(f"VOC file {xml_path} has no <filename>.") + boxes: list[list[float]] = [] + labels: list[int] = [] + for obj in root.findall("object"): + name = obj.findtext("name") + if name not in name_to_id: + raise ValueError( + f"VOC class {name!r} in {xml_path.name} is not in the " + f"class list {sorted(name_to_id)}." + ) + box = obj.find("bndbox") + if box is None: + raise ValueError(f"VOC object in {xml_path.name} has no <bndbox>.") + boxes.append( + [ + _coord(box, "xmin", xml_path), + _coord(box, "ymin", xml_path), + _coord(box, "xmax", xml_path), + _coord(box, "ymax", xml_path), + ] + ) + labels.append(name_to_id[name]) + records.append({"sample_id": filename_el.text, "boxes": boxes, "labels": labels}) + return records + + def parse( + self, + *, + task_kind: TaskKind, + tensor: Any, + sample_ids: list[str] | None, + data_source: str | None, + class_names: list[str] | None, + ) -> Any: + """Load VOC xml labels and align to sample_ids for detection.""" + labels_dir = get_source_path(self.source, kind=SourceKind.LABELS) + # Precedence: parser's own class_names > model's class_names > _VOC_CLASSES + active_names: list[str] | tuple[str, ...]
= ( + self.class_names + if self.class_names is not None + else (class_names if class_names is not None else _VOC_CLASSES) + ) + name_to_id = {name: idx for idx, name in enumerate(active_names)} + records = self._to_detection_records(labels_dir, name_to_id) + return _align_detection_records( + records, + expected=len(tensor), + sample_ids=sample_ids, + ) diff --git a/src/raitap/data/tests/test_label_parsers.py b/src/raitap/data/tests/test_label_parsers.py index 51525158..8bc39734 100644 --- a/src/raitap/data/tests/test_label_parsers.py +++ b/src/raitap/data/tests/test_label_parsers.py @@ -436,3 +436,168 @@ def test_yolo_parser_e2e_via_resolve(tmp_path: object) -> None: assert result[0]["labels"].tolist() == [0] assert result[1]["boxes"].shape == (1, 4) assert result[1]["labels"].tolist() == [0] + + +# --- Task 7: VocLabelParser --- + + +def _write_voc_xml(path: object, filename: str, objects: list[dict]) -> None: + """Write a minimal Pascal-VOC XML file.""" + import pathlib + + lines = [ + "", + f" {filename}", + ] + for obj in objects: + lines += [ + " ", + f" {obj['name']}", + ] + if obj.get("bndbox") is not None: + b = obj["bndbox"] + lines += [ + " ", + f" {b[0]}", + f" {b[1]}", + f" {b[2]}", + f" {b[3]}", + " ", + ] + lines.append(" ") + lines.append("") + pathlib.Path(str(path)).write_text("\n".join(lines), encoding="utf-8") + + +def _make_voc_fixture(tmp_path: object) -> object: + """Two-image VOC dir with class_names=['background','person','car']. + + a.jpg: person at [10,20,30,40], car at [5,5,15,15]. + b.jpg: person at [0,0,50,50]. 
+ """ + import pathlib + + tmp = pathlib.Path(str(tmp_path)) + voc_dir = tmp / "voc_labels" + voc_dir.mkdir() + _write_voc_xml( + voc_dir / "a.xml", + "a.jpg", + [ + {"name": "person", "bndbox": [10, 20, 30, 40]}, + {"name": "car", "bndbox": [5, 5, 15, 15]}, + ], + ) + _write_voc_xml( + voc_dir / "b.xml", + "b.jpg", + [{"name": "person", "bndbox": [0, 0, 50, 50]}], + ) + return voc_dir + + +def test_voc_parser_unit_with_class_names(tmp_path: object) -> None: + """VocLabelParser.parse: person->1, car->2 with explicit class_names arg.""" + import torch + + from raitap.data.label_parsers.voc import VocLabelParser + + voc_dir = _make_voc_fixture(tmp_path) + parser = VocLabelParser(source=str(voc_dir)) + class_names = ["background", "person", "car"] + tensor = [object(), object()] + result = parser.parse( + task_kind=TaskKind.detection, + tensor=tensor, + sample_ids=["a.jpg", "b.jpg"], + data_source=None, + class_names=class_names, + ) + assert isinstance(result, list) + assert len(result) == 2 + # a.jpg: person(1), car(2) + expected_boxes = torch.tensor([[10.0, 20.0, 30.0, 40.0], [5.0, 5.0, 15.0, 15.0]]) + assert torch.equal(result[0]["boxes"], expected_boxes) + assert torch.equal(result[0]["labels"], torch.tensor([1, 2])) + # b.jpg: person(1) + assert torch.equal(result[1]["boxes"], torch.tensor([[0.0, 0.0, 50.0, 50.0]])) + assert torch.equal(result[1]["labels"], torch.tensor([1])) + + +def test_voc_parser_raises_on_missing_bndbox(tmp_path: object) -> None: + """parse raises ValueError when has no .""" + import pathlib + + from raitap.data.label_parsers.voc import VocLabelParser + + tmp = pathlib.Path(str(tmp_path)) + voc_dir = tmp / "voc_no_box" + voc_dir.mkdir() + _write_voc_xml( + voc_dir / "bad.xml", + "bad.jpg", + [{"name": "person"}], # no bndbox key -> not written + ) + parser = VocLabelParser(source=str(voc_dir)) + with pytest.raises(ValueError, match="no "): + parser.parse( + task_kind=TaskKind.detection, + tensor=[object()], + sample_ids=["bad.jpg"], + 
data_source=None, + class_names=["person"], + ) + + +def test_voc_parser_e2e_class_names_from_model(tmp_path: object) -> None: + """E2E: cfg.model.class_names supplies mapping; person->1 via _resolve_and_parse_labels.""" + import torch + + from raitap.configs.schema import VocLabelsConfig + from raitap.data.data import _resolve_and_parse_labels + + voc_dir = _make_voc_fixture(tmp_path) + # class_names on the config is None; model supplies it instead + cfg = _make_cfg( + labels=VocLabelsConfig(source=str(voc_dir)), + class_names=["background", "person", "car"], + ) + tensor = [object(), object()] + result = _resolve_and_parse_labels( + cfg, + task_kind=TaskKind.detection, + tensor=tensor, + sample_ids=["a.jpg", "b.jpg"], + ) + assert isinstance(result, list) + assert len(result) == 2 + assert torch.equal(result[0]["labels"], torch.tensor([1, 2])) + assert torch.equal(result[1]["labels"], torch.tensor([1])) + + +def test_voc_parser_own_class_names_takes_precedence(tmp_path: object) -> None: + """Parser's VocLabelsConfig.class_names overrides model's class_names.""" + import torch + + from raitap.configs.schema import VocLabelsConfig + from raitap.data.data import _resolve_and_parse_labels + + voc_dir = _make_voc_fixture(tmp_path) + # Parser config has class_names; model has a different (wrong) mapping + cfg = _make_cfg( + labels=VocLabelsConfig( + source=str(voc_dir), + class_names=["background", "person", "car"], + ), + class_names=["car", "background", "person"], # different order -> would give wrong ids + ) + tensor = [object(), object()] + result = _resolve_and_parse_labels( + cfg, + task_kind=TaskKind.detection, + tensor=tensor, + sample_ids=["a.jpg", "b.jpg"], + ) + assert isinstance(result, list) + # Parser's own list wins: person->1, car->2 + assert torch.equal(result[0]["labels"], torch.tensor([1, 2])) From 8407436a66f2c85914aec8ac46f08493626c0eb0 Mon Sep 17 00:00:00 2001 From: Stanislas Laurent Date: Wed, 24 Jun 2026 05:02:20 +0200 Subject: [PATCH 21/28] 
fix(model): detection labels honour id_strategy for nested dirs (refs #338) --- src/raitap/data/label_parsers/coco.py | 1 + src/raitap/data/label_parsers/voc.py | 1 + src/raitap/data/label_parsers/yolo.py | 1 + src/raitap/data/tests/test_label_parsers.py | 70 +++++++++++++++++++++ src/raitap/task_families/detection.py | 26 ++++++-- 5 files changed, 95 insertions(+), 4 deletions(-) diff --git a/src/raitap/data/label_parsers/coco.py b/src/raitap/data/label_parsers/coco.py index 71c57011..673a39e0 100644 --- a/src/raitap/data/label_parsers/coco.py +++ b/src/raitap/data/label_parsers/coco.py @@ -105,6 +105,7 @@ def parse( records, expected=len(tensor), sample_ids=sample_ids, + strategy=str(self.id_strategy), ) # classification diff --git a/src/raitap/data/label_parsers/voc.py b/src/raitap/data/label_parsers/voc.py index eccba526..959f9e48 100644 --- a/src/raitap/data/label_parsers/voc.py +++ b/src/raitap/data/label_parsers/voc.py @@ -126,4 +126,5 @@ def parse( records, expected=len(tensor), sample_ids=sample_ids, + strategy=str(self.id_strategy), ) diff --git a/src/raitap/data/label_parsers/yolo.py b/src/raitap/data/label_parsers/yolo.py index 75ce80c6..3d76c298 100644 --- a/src/raitap/data/label_parsers/yolo.py +++ b/src/raitap/data/label_parsers/yolo.py @@ -89,4 +89,5 @@ def parse( records, expected=len(tensor), sample_ids=sample_ids, + strategy=str(self.id_strategy), ) diff --git a/src/raitap/data/tests/test_label_parsers.py b/src/raitap/data/tests/test_label_parsers.py index 8bc39734..cff4face 100644 --- a/src/raitap/data/tests/test_label_parsers.py +++ b/src/raitap/data/tests/test_label_parsers.py @@ -601,3 +601,73 @@ def test_voc_parser_own_class_names_takes_precedence(tmp_path: object) -> None: assert isinstance(result, list) # Parser's own list wins: person->1, car->2 assert torch.equal(result[0]["labels"], torch.tensor([1, 2])) + + +# --- Task 8: detection id_strategy parity --- + + +def _coco_detection_nested_fixture(tmp_path: object) -> object: + """COCO 
with file_name='a.jpg' (no subdir) but discovered sample_ids=['sub/a.jpg'].""" + import pathlib + + coco = { + "images": [{"id": 1, "file_name": "a.jpg"}], + "annotations": [ + {"image_id": 1, "category_id": 2, "bbox": [1, 2, 3, 4]}, + ], + "categories": [{"id": 2, "name": "cat"}], + } + p = pathlib.Path(str(tmp_path)) / "nested.json" + _write_json(p, coco) + return p + + +def test_coco_detection_nested_sample_ids_with_stem_strategy(tmp_path: object) -> None: + """Gap #2: COCO record 'a.jpg' matches discovered 'sub/a.jpg' via id_strategy='stem'.""" + import torch + + from raitap.data.label_parsers.coco import CocoLabelParser + from raitap.data.types import IdStrategy + + labels_path = _coco_detection_nested_fixture(tmp_path) + parser = CocoLabelParser(source=str(labels_path), id_strategy=IdStrategy.stem) + tensor = [object()] + result = parser.parse( + task_kind=TaskKind.detection, + tensor=tensor, + sample_ids=["sub/a.jpg"], + data_source=None, + class_names=None, + ) + assert isinstance(result, list) + assert len(result) == 1 + # bbox [1,2,3,4] -> xyxy [1, 2, 1+3, 2+4] = [1, 2, 4, 6] + expected_boxes = torch.tensor([[1.0, 2.0, 4.0, 6.0]]) + assert torch.equal(result[0]["boxes"], expected_boxes) + assert torch.equal(result[0]["labels"], torch.tensor([2])) + + +def test_coco_detection_exact_match_regression(tmp_path: object) -> None: + """Regression: exact-match ids still align under id_strategy='auto'.""" + import torch + + from raitap.data.label_parsers.coco import CocoLabelParser + from raitap.data.types import IdStrategy + + labels_path = _coco_detection_fixture(tmp_path) + parser = CocoLabelParser(source=str(labels_path), id_strategy=IdStrategy.auto) + tensor = [object(), object()] + result = parser.parse( + task_kind=TaskKind.detection, + tensor=tensor, + sample_ids=["a.jpg", "b.jpg"], + data_source=None, + class_names=None, + ) + assert isinstance(result, list) + assert len(result) == 2 + expected_boxes = torch.tensor([[10.0, 20.0, 40.0, 60.0], [0.0, 0.0, 
5.0, 5.0]]) + assert torch.equal(result[0]["boxes"], expected_boxes) + assert torch.equal(result[0]["labels"], torch.tensor([3, 5])) + assert result[1]["boxes"].shape == (0, 4) + assert result[1]["labels"].shape == (0,) diff --git a/src/raitap/task_families/detection.py b/src/raitap/task_families/detection.py index eab54c78..fd10bdcc 100644 --- a/src/raitap/task_families/detection.py +++ b/src/raitap/task_families/detection.py @@ -10,6 +10,7 @@ from typing import TYPE_CHECKING, Any, cast +from raitap.data.data import _normalise_sample_id, _resolve_id_strategy from raitap.task_families.registry import task_family from raitap.transparency.contracts import ExplanationOutputSpace from raitap.types import TaskKind @@ -26,16 +27,24 @@ def _align_detection_records( *, expected: int, sample_ids: Any, + strategy: str = "auto", ) -> list[dict[str, torch.Tensor]]: """Align native detection records to ``sample_ids`` and build tensors. Extracted from ``DetectionFamily.load_labels`` so label-format adapters can feed converted records through the same alignment + validation path. + + When ``sample_ids`` is provided, both the discovered ids and record + ``sample_id`` fields are normalised via ``_normalise_sample_id`` using the + resolved ``strategy``, matching how the classification path handles nested + image directories. """ + import pandas as pd import torch if sample_ids is not None: - by_id: dict[str, dict[str, Any]] = {} + # Collect raw record ids first so _resolve_id_strategy can inspect them. + raw_record_ids: list[str] = [] for index, record in enumerate(records): record_id = record.get("sample_id") if isinstance(record, dict) else None if record_id is None: @@ -43,15 +52,24 @@ def _align_detection_records( f"Detection labels record {index} is missing 'sample_id' " "(required when the dataset exposes sample_ids)." 
) - if record_id in by_id: + raw_record_ids.append(str(record_id)) + + resolved = _resolve_id_strategy(strategy, pd.Series(raw_record_ids)) + + by_id: dict[str, dict[str, Any]] = {} + for record, record_id in zip(records, raw_record_ids, strict=True): + norm_id = _normalise_sample_id(record_id, resolved) + if norm_id in by_id: raise ValueError( f"Detection labels file contains duplicate sample_id {record_id!r}." ) - by_id[record_id] = record + by_id[norm_id] = record + ordered_records = [] missing: list[str] = [] for sample_id in sample_ids: - record = by_id.get(sample_id) + norm_sid = _normalise_sample_id(sample_id, resolved) + record = by_id.get(norm_sid) if record is None: missing.append(sample_id) else: From ed503a4f3cbc1f978e93213c72be4827f6babd2e Mon Sep 17 00:00:00 2001 From: Stanislas Laurent Date: Wed, 24 Jun 2026 05:14:38 +0200 Subject: [PATCH 22/28] feat(data): DetectionJsonLabelParser restores native detection JSON format (refs #338) --- src/raitap/configs/schema.py | 7 +++ .../configs/tests/test_labels_schema.py | 7 ++- src/raitap/data/label_parsers/__init__.py | 2 + .../data/label_parsers/detection_json.py | 56 +++++++++++++++++++ .../data/tests/test_detection_labels.py | 54 +++++++++++++----- .../data/tests/test_detection_ragged.py | 29 ++++++---- 6 files changed, 127 insertions(+), 28 deletions(-) create mode 100644 src/raitap/data/label_parsers/detection_json.py diff --git a/src/raitap/configs/schema.py b/src/raitap/configs/schema.py index b616d85a..24fcde0a 100644 --- a/src/raitap/configs/schema.py +++ b/src/raitap/configs/schema.py @@ -110,6 +110,13 @@ class VocLabelsConfig(LabelsConfig): class_names: list[str] | None = None +@dataclass +class DetectionJsonLabelsConfig(LabelsConfig): + _target_: str = "DetectionJsonLabelParser" + source: str = MISSING + id_strategy: IdStrategy = IdStrategy.auto + + @dataclass class DataConfig: name: str = "isic2018" diff --git a/src/raitap/configs/tests/test_labels_schema.py 
b/src/raitap/configs/tests/test_labels_schema.py index 64ee9c84..e97bea36 100644 --- a/src/raitap/configs/tests/test_labels_schema.py +++ b/src/raitap/configs/tests/test_labels_schema.py @@ -3,7 +3,7 @@ import pytest -from raitap.configs.schema import CocoLabelsConfig, DirectoryLabelsConfig +from raitap.configs.schema import CocoLabelsConfig, DetectionJsonLabelsConfig, DirectoryLabelsConfig def test_coco_config_has_no_tabular_fields() -> None: @@ -102,3 +102,8 @@ def test_create_label_parser_handles_both_target_forms() -> None: fqn = create_label_parser({"_target_": _COMPOSED_TARGET}) assert isinstance(fqn, DirectoryLabelParser) + + +def test_detection_json_config_has_exactly_target_source_id_strategy() -> None: + names = {f.name for f in dataclasses.fields(DetectionJsonLabelsConfig)} + assert names == {"_target_", "source", "id_strategy"} diff --git a/src/raitap/data/label_parsers/__init__.py b/src/raitap/data/label_parsers/__init__.py index 053ad444..b1ced9e2 100644 --- a/src/raitap/data/label_parsers/__init__.py +++ b/src/raitap/data/label_parsers/__init__.py @@ -10,6 +10,7 @@ class name resolved against ``raitap.data.label_parsers.``) instantiates, from __future__ import annotations from .coco import CocoLabelParser # pyright: ignore[reportUnusedImport] +from .detection_json import DetectionJsonLabelParser # pyright: ignore[reportUnusedImport] from .directory import DirectoryLabelParser from .tabular import TabularLabelParser # pyright: ignore[reportUnusedImport] from .voc import VocLabelParser # pyright: ignore[reportUnusedImport] @@ -17,6 +18,7 @@ class name resolved against ``raitap.data.label_parsers.``) instantiates, __all__ = [ "CocoLabelParser", + "DetectionJsonLabelParser", "DirectoryLabelParser", "TabularLabelParser", "VocLabelParser", diff --git a/src/raitap/data/label_parsers/detection_json.py b/src/raitap/data/label_parsers/detection_json.py new file mode 100644 index 00000000..6e3cbfdc --- /dev/null +++ 
@label_parser(registry_name="detection_json", schema=DetectionJsonLabelsConfig)
class DetectionJsonLabelParser:
    """Parse native RAITAP detection JSON records for detection.

    The file must be a JSON array of objects with keys ``sample_id``,
    ``boxes`` (list of ``[x1, y1, x2, y2]`` in pixels), and ``labels``
    (list of integer class ids).
    """

    # Task kinds this parser declares support for: detection only.
    supported_tasks: frozenset[TaskKind] = frozenset({TaskKind.detection})

    def __init__(
        self,
        *,
        source: str,
        id_strategy: IdStrategy = IdStrategy.auto,
    ) -> None:
        """Store the parser configuration; no file is read here.

        Args:
            source: Labels source string, resolved via ``get_source_path``
                when :meth:`parse` runs.
            id_strategy: Id-matching strategy, forwarded (as ``str``) to
                ``_align_detection_records``.
        """
        self.source = source
        self.id_strategy = id_strategy

    def parse(
        self,
        *,
        task_kind: TaskKind,
        tensor: Any,
        sample_ids: list[str] | None,
        data_source: str | None,
        class_names: list[str] | None,
    ) -> Any:
        """Load native detection JSON and align records to sample_ids.

        Args:
            task_kind: Not read by this parser.
            tensor: Loaded samples; only ``len(tensor)`` is used, as the
                expected number of records.
            sample_ids: Passed through unchanged to
                ``_align_detection_records``.
            data_source: Not read by this parser.
            class_names: Not read by this parser.

        Returns:
            The result of ``_align_detection_records`` for the loaded records.

        Raises:
            ValueError: If the JSON root is not an array. Malformed JSON
                surfaces as ``json.JSONDecodeError``, which is itself a
                ``ValueError`` subclass.
        """
        labels_path = get_source_path(self.source, kind=SourceKind.LABELS)
        # Decode explicitly as UTF-8: without ``encoding`` the locale default
        # is used, which corrupts non-ASCII sample ids on non-UTF-8 platforms.
        with labels_path.open(encoding="utf-8") as fh:
            records = json.load(fh)
        if not isinstance(records, list):
            raise ValueError(f"Detection labels file {labels_path} must be a JSON array.")
        return _align_detection_records(
            records,
            expected=len(tensor),
            sample_ids=sample_ids,
            strategy=str(self.id_strategy),
        )
b/src/raitap/data/tests/test_detection_labels.py @@ -1,4 +1,4 @@ -"""Tests for DetectionFamily.load_labels — list[dict] per-sample boxes + labels.""" +"""Tests for DetectionJsonLabelParser -- list[dict] per-sample boxes + labels.""" from __future__ import annotations @@ -9,8 +9,9 @@ import pytest import torch -from raitap.data.data import Data -from raitap.task_families.detection import DetectionFamily +from raitap.configs.schema import DetectionJsonLabelsConfig +from raitap.data.data import Data, _resolve_and_parse_labels +from raitap.types import TaskKind if TYPE_CHECKING: from pathlib import Path @@ -40,12 +41,15 @@ def _write_detection_labels_json(path: Path) -> None: def _stub_cfg(labels_source: str | None = None) -> AppConfig: + labels = DetectionJsonLabelsConfig(source=labels_source) if labels_source is not None else None return cast( "AppConfig", SimpleNamespace( data=SimpleNamespace( - labels=SimpleNamespace(source=labels_source), + labels=labels, + source=None, ), + model=SimpleNamespace(class_names=None), ), ) @@ -63,7 +67,9 @@ def test_load_detection_labels_returns_list_of_dicts(tmp_path: Path) -> None: cfg = _stub_cfg(labels_source=str(labels_path)) data = _make_data(num_samples=3) - out = DetectionFamily().load_labels(cfg, tensor=data.tensor, sample_ids=data.sample_ids) + out = _resolve_and_parse_labels( + cfg, task_kind=TaskKind.detection, tensor=data.tensor, sample_ids=data.sample_ids + ) assert out is not None assert isinstance(out, list) assert len(out) == 3 @@ -79,7 +85,7 @@ def test_load_detection_labels_returns_list_of_dicts(tmp_path: Path) -> None: def test_load_detection_labels_aligns_by_sample_id_when_present(tmp_path: Path) -> None: - """Reordered labels file is rewritten to match self.sample_ids ordering.""" + """Reordered labels file is rewritten to match sample_ids ordering.""" labels_path = tmp_path / "boxes.json" # Write records out of order vs sample_ids. 
payload = [ @@ -91,7 +97,9 @@ def test_load_detection_labels_aligns_by_sample_id_when_present(tmp_path: Path) cfg = _stub_cfg(labels_source=str(labels_path)) data = _make_data(num_samples=3, sample_ids=["img_0", "img_1", "img_2"]) - out = DetectionFamily().load_labels(cfg, tensor=data.tensor, sample_ids=data.sample_ids) + out = _resolve_and_parse_labels( + cfg, task_kind=TaskKind.detection, tensor=data.tensor, sample_ids=data.sample_ids + ) assert out is not None assert int(out[0]["labels"].item()) == 7 assert out[1]["labels"].numel() == 0 @@ -110,7 +118,9 @@ def test_load_detection_labels_rejects_missing_sample_id_entries(tmp_path: Path) data = _make_data(num_samples=3, sample_ids=["img_0", "img_1", "img_2"]) with pytest.raises(ValueError, match="missing entries"): - DetectionFamily().load_labels(cfg, tensor=data.tensor, sample_ids=data.sample_ids) + _resolve_and_parse_labels( + cfg, task_kind=TaskKind.detection, tensor=data.tensor, sample_ids=data.sample_ids + ) def test_load_detection_labels_rejects_duplicate_sample_id(tmp_path: Path) -> None: @@ -125,7 +135,9 @@ def test_load_detection_labels_rejects_duplicate_sample_id(tmp_path: Path) -> No data = _make_data(num_samples=2, sample_ids=["img_0", "img_1"]) with pytest.raises(ValueError, match="duplicate sample_id"): - DetectionFamily().load_labels(cfg, tensor=data.tensor, sample_ids=data.sample_ids) + _resolve_and_parse_labels( + cfg, task_kind=TaskKind.detection, tensor=data.tensor, sample_ids=data.sample_ids + ) def test_load_detection_labels_rejects_record_missing_sample_id_field(tmp_path: Path) -> None: @@ -138,7 +150,9 @@ def test_load_detection_labels_rejects_record_missing_sample_id_field(tmp_path: data = _make_data(num_samples=1, sample_ids=["img_0"]) with pytest.raises(ValueError, match="missing 'sample_id'"): - DetectionFamily().load_labels(cfg, tensor=data.tensor, sample_ids=data.sample_ids) + _resolve_and_parse_labels( + cfg, task_kind=TaskKind.detection, tensor=data.tensor, sample_ids=data.sample_ids 
+ ) def test_load_detection_labels_rejects_wrong_length_when_no_sample_ids(tmp_path: Path) -> None: @@ -149,7 +163,9 @@ def test_load_detection_labels_rejects_wrong_length_when_no_sample_ids(tmp_path: data = _make_data(num_samples=5) # dataset bigger than labels with pytest.raises(ValueError, match="5 samples"): - DetectionFamily().load_labels(cfg, tensor=data.tensor, sample_ids=data.sample_ids) + _resolve_and_parse_labels( + cfg, task_kind=TaskKind.detection, tensor=data.tensor, sample_ids=data.sample_ids + ) def test_load_detection_labels_rejects_mismatched_box_label_counts(tmp_path: Path) -> None: @@ -161,7 +177,9 @@ def test_load_detection_labels_rejects_mismatched_box_label_counts(tmp_path: Pat data = _make_data(num_samples=1) with pytest.raises(ValueError, match="boxes and labels"): - DetectionFamily().load_labels(cfg, tensor=data.tensor, sample_ids=data.sample_ids) + _resolve_and_parse_labels( + cfg, task_kind=TaskKind.detection, tensor=data.tensor, sample_ids=data.sample_ids + ) def test_load_detection_labels_rejects_non_list_root(tmp_path: Path) -> None: @@ -171,13 +189,17 @@ def test_load_detection_labels_rejects_non_list_root(tmp_path: Path) -> None: data = _make_data(num_samples=1) with pytest.raises(ValueError, match="must be a JSON array"): - DetectionFamily().load_labels(cfg, tensor=data.tensor, sample_ids=data.sample_ids) + _resolve_and_parse_labels( + cfg, task_kind=TaskKind.detection, tensor=data.tensor, sample_ids=data.sample_ids + ) def test_load_detection_labels_returns_none_when_no_source_configured(tmp_path: Path) -> None: cfg = _stub_cfg(labels_source=None) data = _make_data(num_samples=1) - out = DetectionFamily().load_labels(cfg, tensor=data.tensor, sample_ids=data.sample_ids) + out = _resolve_and_parse_labels( + cfg, task_kind=TaskKind.detection, tensor=data.tensor, sample_ids=data.sample_ids + ) assert out is None @@ -185,4 +207,6 @@ def test_load_detection_labels_raises_when_source_unresolvable(tmp_path: Path) - cfg = 
_stub_cfg(labels_source=str(tmp_path / "missing.json")) data = _make_data(num_samples=1) with pytest.raises(ValueError, match="could not be resolved"): - DetectionFamily().load_labels(cfg, tensor=data.tensor, sample_ids=data.sample_ids) + _resolve_and_parse_labels( + cfg, task_kind=TaskKind.detection, tensor=data.tensor, sample_ids=data.sample_ids + ) diff --git a/src/raitap/data/tests/test_detection_ragged.py b/src/raitap/data/tests/test_detection_ragged.py index b805bbf2..6f642a41 100644 --- a/src/raitap/data/tests/test_detection_ragged.py +++ b/src/raitap/data/tests/test_detection_ragged.py @@ -18,7 +18,8 @@ import torch from PIL import Image -from raitap.data.data import Data +from raitap.configs.schema import DetectionJsonLabelsConfig +from raitap.data.data import Data, _resolve_and_parse_labels from raitap.task_families.classification import ClassificationFamily from raitap.task_families.detection import DetectionFamily from raitap.types import TaskKind @@ -164,7 +165,7 @@ def _write_labels_json(self, path: Path, n: int) -> None: path.write_text(json.dumps(payload)) def test_detection_labels_count_matches_list_tensor(self, tmp_path: Path) -> None: - """DetectionFamily.load_labels: len(tensor) works when tensor is a list.""" + """DetectionJsonLabelParser: len(tensor) works when tensor is a list.""" labels_path = tmp_path / "boxes.json" self._write_labels_json(labels_path, n=3) @@ -181,13 +182,15 @@ def test_detection_labels_count_matches_list_tensor(self, tmp_path: Path) -> Non "AppConfig", SimpleNamespace( data=SimpleNamespace( - labels=SimpleNamespace( - source=str(labels_path), - ) - ) + labels=DetectionJsonLabelsConfig(source=str(labels_path)), + source=None, + ), + model=SimpleNamespace(class_names=None), ), ) - out = DetectionFamily().load_labels(cfg, tensor=data.tensor, sample_ids=data.sample_ids) + out = _resolve_and_parse_labels( + cfg, task_kind=TaskKind.detection, tensor=data.tensor, sample_ids=data.sample_ids + ) assert out is not None assert 
len(out) == 3 @@ -204,14 +207,16 @@ def test_detection_labels_count_mismatch_raises_with_list_tensor(self, tmp_path: "AppConfig", SimpleNamespace( data=SimpleNamespace( - labels=SimpleNamespace( - source=str(labels_path), - ) - ) + labels=DetectionJsonLabelsConfig(source=str(labels_path)), + source=None, + ), + model=SimpleNamespace(class_names=None), ), ) with pytest.raises(ValueError, match="3 samples"): - DetectionFamily().load_labels(cfg, tensor=data.tensor, sample_ids=data.sample_ids) + _resolve_and_parse_labels( + cfg, task_kind=TaskKind.detection, tensor=data.tensor, sample_ids=data.sample_ids + ) # --------------------------------------------------------------------------- From a90c4f36c29291b8377a3d9e776bda82d9ddf89d Mon Sep 17 00:00:00 2001 From: Stanislas Laurent Date: Wed, 24 Jun 2026 05:25:30 +0200 Subject: [PATCH 23/28] refactor(data): remove dead LabelFormat seam superseded by label parsers (refs #338) --- .../configs/tests/test_labels_schema.py | 69 +++++ src/raitap/data/__init__.py | 8 - src/raitap/data/_label_format_adapters.py | 12 - src/raitap/data/adapters/__init__.py | 1 - src/raitap/data/adapters/coco.py | 72 ----- src/raitap/data/adapters/voc.py | 100 ------- src/raitap/data/adapters/yolo.py | 71 ----- src/raitap/data/data.py | 99 +------ src/raitap/data/label_formats.py | 84 ------ src/raitap/data/tests/test_label_formats.py | 271 ------------------ 10 files changed, 70 insertions(+), 717 deletions(-) delete mode 100644 src/raitap/data/_label_format_adapters.py delete mode 100644 src/raitap/data/adapters/__init__.py delete mode 100644 src/raitap/data/adapters/coco.py delete mode 100644 src/raitap/data/adapters/voc.py delete mode 100644 src/raitap/data/adapters/yolo.py delete mode 100644 src/raitap/data/label_formats.py delete mode 100644 src/raitap/data/tests/test_label_formats.py diff --git a/src/raitap/configs/tests/test_labels_schema.py b/src/raitap/configs/tests/test_labels_schema.py index e97bea36..85154c2d 100644 --- 
a/src/raitap/configs/tests/test_labels_schema.py +++ b/src/raitap/configs/tests/test_labels_schema.py @@ -107,3 +107,72 @@ def test_create_label_parser_handles_both_target_forms() -> None: def test_detection_json_config_has_exactly_target_source_id_strategy() -> None: names = {f.name for f in dataclasses.fields(DetectionJsonLabelsConfig)} assert names == {"_target_", "source", "id_strategy"} + + +# --------------------------------------------------------------------------- +# Cross-variant leakage test (Task 10) +# --------------------------------------------------------------------------- + +# Fields that belong exclusively to the tabular variant and must NOT appear +# in any other variant's builder dataclass. +_TABULAR_ONLY_FIELDS = {"id_column", "column", "encoding"} + +# Fields that belong exclusively to the voc variant. +_VOC_ONLY_FIELDS = {"class_names"} + +# Variants that must have ONLY ``_target_`` (no source, no strategy, nothing). +_TARGET_ONLY_VARIANTS: set[str] = {"directory"} + +# Variants that carry source + id_strategy but NO tabular fields and NO +# class_names. +_DETECTION_VARIANTS: set[str] = {"coco", "yolo", "detection_json"} + + +@pytest.mark.parametrize( + "registry_name", + ["directory", "tabular", "coco", "yolo", "voc", "detection_json"], +) +def test_no_cross_variant_field_leakage(registry_name: str) -> None: + """Each label-parser builder dataclass must expose only its own fields. + + Specifically: + - ``directory`` has only ``_target_``. + - ``coco``/``yolo``/``detection_json`` have no tabular-only fields and no + ``class_names``. + - ``voc`` has ``class_names`` but no tabular-only fields. + - ``tabular`` has tabular-only fields but no ``class_names``. 
+ """ + from raitap._adapters import _BUILDERS + + _register_labels_group() + + builders = _BUILDERS.get("data/labels", {}) + assert registry_name in builders, ( + f"Registry name {registry_name!r} not found in _BUILDERS['data/labels']; " + f"registered: {sorted(builders)}" + ) + builder = builders[registry_name] + field_names = {f.name for f in dataclasses.fields(builder)} + + if registry_name in _TARGET_ONLY_VARIANTS: + assert field_names == {"_target_"}, ( + f"{registry_name!r} builder should have only '_target_' but got {field_names}" + ) + + if registry_name in _DETECTION_VARIANTS: + leaked = _TABULAR_ONLY_FIELDS & field_names + assert not leaked, f"{registry_name!r} builder leaks tabular-only fields: {leaked}" + assert "class_names" not in field_names, ( + f"{registry_name!r} builder should not have 'class_names'" + ) + + if registry_name == "voc": + leaked = _TABULAR_ONLY_FIELDS & field_names + assert not leaked, f"voc builder leaks tabular-only fields: {leaked}" + assert "class_names" in field_names, "voc builder must have 'class_names'" + + if registry_name == "tabular": + assert field_names >= _TABULAR_ONLY_FIELDS, ( + f"tabular builder is missing expected fields; got {field_names}" + ) + assert "class_names" not in field_names, "tabular builder should not have 'class_names'" diff --git a/src/raitap/data/__init__.py b/src/raitap/data/__init__.py index 0365b77f..7b6c7dc5 100644 --- a/src/raitap/data/__init__.py +++ b/src/raitap/data/__init__.py @@ -19,7 +19,6 @@ from raitap.configs.schema import DataConfig, LabelsConfig from .data import Data, load_numpy_from_source, load_tensor_from_source - from .label_formats import LabelFormatAdapter, resolve_label_format_adapter from .metadata import DataInputMetadata, infer_data_input_metadata from .preprocessing import ( DataPreprocessingFactory, @@ -36,7 +35,6 @@ "DataPreprocessingFactory", "IdStrategy", "LabelEncoding", - "LabelFormatAdapter", "LabelsConfig", "ModelInputTransformationFactory", "Preprocessing", @@ 
-45,7 +43,6 @@ "load_tensor_from_source", "raitap_model_input_transformation_factory", "raitap_preprocessing_factory", - "resolve_label_format_adapter", ] @@ -71,11 +68,6 @@ "raitap.data.preprocessing", "raitap_preprocessing_factory", ), - "LabelFormatAdapter": ("raitap.data.label_formats", "LabelFormatAdapter"), - "resolve_label_format_adapter": ( - "raitap.data.label_formats", - "resolve_label_format_adapter", - ), } diff --git a/src/raitap/data/_label_format_adapters.py b/src/raitap/data/_label_format_adapters.py deleted file mode 100644 index 41c06b01..00000000 --- a/src/raitap/data/_label_format_adapters.py +++ /dev/null @@ -1,12 +0,0 @@ -# pyright: reportUnusedImport=false -"""Imports every in-tree label-format adapter so the decorators fire. - -Imported for its side effects by -``raitap.data.label_formats.resolve_label_format_adapter``. Every import in this -module is intentionally side-effect-only (registers an adapter), so the -file-level ``reportUnusedImport=false`` above is correct. 
-""" - -from __future__ import annotations - -from raitap.data.adapters import coco, voc, yolo # noqa: F401 diff --git a/src/raitap/data/adapters/__init__.py b/src/raitap/data/adapters/__init__.py deleted file mode 100644 index 4b68f1da..00000000 --- a/src/raitap/data/adapters/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Built-in label-format adapters (issue #338).""" diff --git a/src/raitap/data/adapters/coco.py b/src/raitap/data/adapters/coco.py deleted file mode 100644 index 3551e5f3..00000000 --- a/src/raitap/data/adapters/coco.py +++ /dev/null @@ -1,72 +0,0 @@ -"""COCO label-format adapter (issue #338).""" - -from __future__ import annotations - -import json -from typing import TYPE_CHECKING, Any - -if TYPE_CHECKING: - from pathlib import Path - -from raitap.data.label_formats import ( - ClassificationRecord, - DetectionRecord, - label_format, -) -from raitap.data.types import LabelFormat -from raitap.types import TaskKind - - -@label_format -class CocoAdapter: - """COCO ``instances.json`` -> native records. - - Detection: ``bbox`` is ``[x, y, w, h]`` -> ``[x1, y1, x2, y2]``; - ``category_id`` passes through unchanged so labels stay in the model's - label space. Classification: one label per image (the image's single - annotation category); images with 0 or >1 categories raise. 
- """ - - format = LabelFormat.coco - supported_tasks = frozenset({TaskKind.detection, TaskKind.classification}) - - def _load(self, source: Path) -> dict[str, Any]: - with source.open() as fh: - data = json.load(fh) - if not isinstance(data, dict) or "images" not in data: - raise ValueError(f"COCO file {source} must be an object with an 'images' array.") - return data - - def to_detection_records( - self, source: Path, *, image_dir: Path | None, class_names: list[str] | None - ) -> list[DetectionRecord]: - data = self._load(source) - file_by_image: dict[int, str] = {img["id"]: img["file_name"] for img in data["images"]} - boxes: dict[int, list[list[float]]] = {iid: [] for iid in file_by_image} - labels: dict[int, list[int]] = {iid: [] for iid in file_by_image} - for ann in data.get("annotations", []): - iid = ann["image_id"] - x, y, w, h = ann["bbox"] - boxes[iid].append([x, y, x + w, y + h]) - labels[iid].append(int(ann["category_id"])) - return [ - {"sample_id": file_by_image[iid], "boxes": boxes[iid], "labels": labels[iid]} - for iid in file_by_image - ] - - def to_classification_records(self, source: Path) -> list[ClassificationRecord]: - data = self._load(source) - file_by_image: dict[int, str] = {img["id"]: img["file_name"] for img in data["images"]} - cats: dict[int, set[int]] = {iid: set() for iid in file_by_image} - for ann in data.get("annotations", []): - cats[ann["image_id"]].add(int(ann["category_id"])) - records: list[ClassificationRecord] = [] - for iid, name in file_by_image.items(): - cat_set = cats[iid] - if len(cat_set) != 1: - raise ValueError( - f"COCO classification needs exactly one category per image; " - f"image {name!r} has {len(cat_set)}." 
- ) - records.append({"sample_id": name, "label": next(iter(cat_set))}) - return records diff --git a/src/raitap/data/adapters/voc.py b/src/raitap/data/adapters/voc.py deleted file mode 100644 index 02a8f270..00000000 --- a/src/raitap/data/adapters/voc.py +++ /dev/null @@ -1,100 +0,0 @@ -"""Pascal-VOC label-format adapter (issue #338).""" - -from __future__ import annotations - -import xml.etree.ElementTree as ET -from typing import TYPE_CHECKING - -if TYPE_CHECKING: - from pathlib import Path - -from raitap.data.label_formats import ( - ClassificationRecord, - DetectionRecord, - label_format, -) -from raitap.data.types import LabelFormat -from raitap.types import TaskKind - -#: Canonical Pascal-VOC class order (index = label id) when no class_names given. -_VOC_CLASSES = ( - "aeroplane", - "bicycle", - "bird", - "boat", - "bottle", - "bus", - "car", - "cat", - "chair", - "cow", - "diningtable", - "dog", - "horse", - "motorbike", - "person", - "pottedplant", - "sheep", - "sofa", - "train", - "tvmonitor", -) - - -def _coord(box: ET.Element, tag: str, xml_path: Path) -> float: - text = box.findtext(tag) - if text is None: - raise ValueError(f"VOC bndbox in {xml_path.name} missing <{tag}>.") - return float(text) - - -@label_format -class VocAdapter: - """Pascal-VOC per-image ``.xml`` -> native detection records. - - Boxes are already ``[xmin, ymin, xmax, ymax]`` pixels. Class names map to - ids by their position in ``class_names`` (else the standard 20-class VOC - order). 
- """ - - format = LabelFormat.voc - supported_tasks = frozenset({TaskKind.detection}) - - def to_detection_records( - self, source: Path, *, image_dir: Path | None, class_names: list[str] | None - ) -> list[DetectionRecord]: - name_to_id = { - name: idx for idx, name in enumerate(class_names if class_names else _VOC_CLASSES) - } - records: list[DetectionRecord] = [] - for xml_path in sorted(source.glob("*.xml")): - root = ET.parse(xml_path).getroot() - filename_el = root.find("filename") - if filename_el is None or not filename_el.text: - raise ValueError(f"VOC file {xml_path} has no .") - boxes: list[list[float]] = [] - labels: list[int] = [] - for obj in root.findall("object"): - name = obj.findtext("name") - if name not in name_to_id: - raise ValueError( - f"VOC class {name!r} in {xml_path.name} is not in the " - f"class list {sorted(name_to_id)}." - ) - box = obj.find("bndbox") - if box is None: - raise ValueError(f"VOC object in {xml_path.name} has no .") - boxes.append( - [ - _coord(box, "xmin", xml_path), - _coord(box, "ymin", xml_path), - _coord(box, "xmax", xml_path), - _coord(box, "ymax", xml_path), - ] - ) - labels.append(name_to_id[name]) - records.append({"sample_id": filename_el.text, "boxes": boxes, "labels": labels}) - return records - - def to_classification_records(self, source: Path) -> list[ClassificationRecord]: - raise ValueError("VOC is a detection-only format.") diff --git a/src/raitap/data/adapters/yolo.py b/src/raitap/data/adapters/yolo.py deleted file mode 100644 index be6419f8..00000000 --- a/src/raitap/data/adapters/yolo.py +++ /dev/null @@ -1,71 +0,0 @@ -"""YOLO label-format adapter (issue #338).""" - -from __future__ import annotations - -from typing import TYPE_CHECKING - -from PIL import Image - -if TYPE_CHECKING: - from pathlib import Path - -from raitap.data.label_formats import ( - ClassificationRecord, - DetectionRecord, - label_format, -) -from raitap.data.types import LabelFormat -from raitap.types import TaskKind - 
-_IMAGE_SUFFIXES = (".jpg", ".jpeg", ".png", ".bmp", ".webp") - - -@label_format -class YoloAdapter: - """YOLO per-image ``.txt`` (``class cx cy w h``, normalised) -> native records. - - Boxes are denormalised with each image's pixel size, read from - ``image_dir``. Class indices pass through unchanged. - """ - - format = LabelFormat.yolo - supported_tasks = frozenset({TaskKind.detection}) - - def _image_for(self, image_dir: Path, stem: str) -> Path: - for suffix in _IMAGE_SUFFIXES: - candidate = image_dir / f"{stem}{suffix}" - if candidate.exists(): - return candidate - raise ValueError(f"YOLO adapter found no image for label {stem!r} in {image_dir}.") - - def to_detection_records( - self, source: Path, *, image_dir: Path | None, class_names: list[str] | None - ) -> list[DetectionRecord]: - if image_dir is None: - raise ValueError( - "YOLO labels need image_dir to denormalise boxes; " - "set data.source to the image directory." - ) - records: list[DetectionRecord] = [] - for txt in sorted(source.glob("*.txt")): - image_path = self._image_for(image_dir, txt.stem) - with Image.open(image_path) as im: - width, height = im.size - boxes: list[list[float]] = [] - labels: list[int] = [] - for line in txt.read_text().splitlines(): - parts = line.split() - if not parts: - continue - cls, cx, cy, bw, bh = (float(p) for p in parts[:5]) - x1 = (cx - bw / 2) * width - y1 = (cy - bh / 2) * height - x2 = (cx + bw / 2) * width - y2 = (cy + bh / 2) * height - boxes.append([x1, y1, x2, y2]) - labels.append(int(cls)) - records.append({"sample_id": image_path.name, "boxes": boxes, "labels": labels}) - return records - - def to_classification_records(self, source: Path) -> list[ClassificationRecord]: - raise ValueError("YOLO is a detection-only format.") diff --git a/src/raitap/data/data.py b/src/raitap/data/data.py index 46c4ee8d..15f6227a 100644 --- a/src/raitap/data/data.py +++ b/src/raitap/data/data.py @@ -2,7 +2,7 @@ from collections import Counter from enum import StrEnum -from 
pathlib import Path, PurePosixPath +from pathlib import Path from typing import TYPE_CHECKING, Any import numpy as np @@ -276,103 +276,6 @@ def _resolve_and_parse_labels( ) -def _load_directory_labels(sample_ids: list[str] | None) -> torch.Tensor | None: - """Derive classification labels from each sample's top-level class folder - (torchvision ImageFolder semantics). Returns None (with a warning) when - labels cannot be derived: no sample ids, or a sample with no class subdir.""" - if not sample_ids: - raitap_log.warn( - "data.labels.source='directory' needs image samples organised into " - "class subdirectories; none were found. Falling back to predictions " - "as metric targets." - ) - return None - parts_by_id = [PurePosixPath(sid).parts for sid in sample_ids] - if any(len(parts) < 2 for parts in parts_by_id): - raitap_log.warn( - "data.labels.source='directory' expects a / layout, but " - "one or more samples sit directly under the data source root (no class " - "subdirectory). Falling back to predictions as metric targets." - ) - return None - classes = sorted({parts[0] for parts in parts_by_id}) - class_to_idx = {name: idx for idx, name in enumerate(classes)} - labels = [class_to_idx[parts[0]] for parts in parts_by_id] - return torch.tensor(labels, dtype=torch.long) - - -def load_classification_labels( - cfg: AppConfig, - *, - tensor: torch.Tensor | DetectionInputs, - sample_ids: list[str] | None, -) -> torch.Tensor | None: - """Load tabular classification labels (CSV/TSV/Parquet) → tensor or ``None``. - - Aligns to ``sample_ids`` by id column when available, otherwise falls back - to row order. Returns ``None`` when ``data.labels.source`` is unset, the - file is empty, or alignment fails (callers then use predictions as targets). - - Note: directory and format-adapter branches have moved to dedicated - ``LabelParser`` implementations. This function handles the tabular (native) - path only and will be wrapped by ``TabularLabelParser`` in a later task. 
- """ - labels_cfg = _get_optional_config_value(cfg.data, "labels") - labels_source = _get_optional_config_value(labels_cfg, "source") - if not labels_source: - return None - - labels_path = get_source_path(labels_source, kind=SourceKind.LABELS) - labels_df = _load_tabular_frame(labels_path) - if labels_df.empty: - raitap_log.warn("Labels file is empty; falling back to predictions as targets.") - return None - - labels_id_column = _get_optional_config_value(labels_cfg, "id_column") - id_column = _resolve_labels_id_column(labels_df, labels_id_column) - labels_column = _get_optional_config_value(labels_cfg, "column") - labels_encoding = _get_optional_config_value(labels_cfg, "encoding") - labels_id_strategy = _get_optional_config_value(labels_cfg, "id_strategy") or "auto" - encoded_labels = _extract_class_labels( - labels_df, - labels_column=labels_column, - id_column=id_column, - labels_encoding=labels_encoding, - ) - - expected = len(tensor) - if sample_ids and id_column: - id_series = _column_as_series(labels_df, id_column) - strategy = _resolve_id_strategy(labels_id_strategy, id_series) - try: - aligned_labels = _align_labels_to_samples( - sample_ids=sample_ids, - raw_label_ids=id_series, - encoded_labels=encoded_labels, - strategy=strategy, - ) - except ValueError as error: - raitap_log.warn( - f"{error} Falling back to predictions as metric targets.", - ) - return None - return torch.tensor(aligned_labels, dtype=torch.long) - - if sample_ids and not id_column: - raitap_log.warn( - "Could not find a labels id column for filename alignment; using row-order labels.", - ) - - if len(encoded_labels) != expected: - raitap_log.warn( - f"Label count ({len(encoded_labels)}) does not match sample count ({expected}); " - "falling back to predictions as targets.", - ) - return None - - return torch.tensor(encoded_labels, dtype=torch.long) - - def load_tensor_from_source( source: str, n_samples: int | None = None, diff --git a/src/raitap/data/label_formats.py 
b/src/raitap/data/label_formats.py deleted file mode 100644 index 19021a95..00000000 --- a/src/raitap/data/label_formats.py +++ /dev/null @@ -1,84 +0,0 @@ -"""Pluggable label-format adapters (issue #338). - -Each adapter converts an external annotation file (COCO / YOLO / VOC) into -RAITAP's native intermediate record list, which the task-family loaders then -align to ``sample_ids`` with their existing logic. Registry mirrors -``raitap.task_families.registry``: a decorator registers one singleton per -``LabelFormat``. -""" - -from __future__ import annotations - -from typing import TYPE_CHECKING, Any, Protocol, TypeVar, runtime_checkable - -if TYPE_CHECKING: - from pathlib import Path - - from raitap.data.types import LabelFormat - from raitap.types import TaskKind - -#: Native intermediate record shapes (match the on-disk native formats). -DetectionRecord = dict[str, Any] -ClassificationRecord = dict[str, Any] - - -@runtime_checkable -class LabelFormatAdapter(Protocol): - """Converts an external label file to native intermediate records.""" - - format: LabelFormat - supported_tasks: frozenset[TaskKind] - - def to_detection_records( - self, - source: Path, - *, - image_dir: Path | None, - class_names: list[str] | None, - ) -> list[DetectionRecord]: - """Return ``[{sample_id, boxes (xyxy), labels}]``. Raise if unsupported.""" - ... - - def to_classification_records(self, source: Path) -> list[ClassificationRecord]: - """Return ``[{sample_id, label}]``. Raise if unsupported.""" - ... - - -#: format -> the adapter singleton serving it. 
-LABEL_FORMAT_ADAPTERS: dict[LabelFormat, LabelFormatAdapter] = {} - -T = TypeVar("T") - - -def label_format(cls: type[T]) -> type[T]: - """Register ``cls`` (instantiated once) under its ``format`` class attribute.""" - instance = cls() # type: ignore[call-arg] - LABEL_FORMAT_ADAPTERS[instance.format] = instance # type: ignore[attr-defined] - return cls - - -def resolve_label_format_adapter(fmt: LabelFormat, *, task_kind: TaskKind) -> LabelFormatAdapter: - """Return the adapter for ``fmt`` that supports ``task_kind``. - - Raises ``ValueError`` when no adapter is registered for ``fmt`` (e.g. - ``native``, which the caller should special-case) or the adapter does not - declare ``task_kind`` in ``supported_tasks``. - """ - # Import side-effect: register the in-tree adapters on first use. - from raitap.data import ( - _label_format_adapters, # noqa: F401 # pyright: ignore[reportUnusedImport] - ) - - adapter = LABEL_FORMAT_ADAPTERS.get(fmt) - if adapter is None: - raise ValueError( - f"No adapter registered for label format {fmt.value!r}; " - f"registered: {sorted(f.value for f in LABEL_FORMAT_ADAPTERS)}." - ) - if task_kind not in adapter.supported_tasks: - supported = sorted(t.value for t in adapter.supported_tasks) - raise ValueError( - f"Label format {fmt.value!r} does not support task {task_kind.value!r}; " - f"supported tasks: {supported}." 
- ) - return adapter diff --git a/src/raitap/data/tests/test_label_formats.py b/src/raitap/data/tests/test_label_formats.py deleted file mode 100644 index 657de141..00000000 --- a/src/raitap/data/tests/test_label_formats.py +++ /dev/null @@ -1,271 +0,0 @@ -from __future__ import annotations - -from typing import TYPE_CHECKING, cast - -import pytest - -from raitap.configs.schema import LabelsConfig -from raitap.data.label_formats import ( - LABEL_FORMAT_ADAPTERS, - label_format, - resolve_label_format_adapter, -) -from raitap.data.types import LabelFormat -from raitap.types import TaskKind - -if TYPE_CHECKING: - from pathlib import Path - - from raitap.configs.schema import AppConfig - - -def test_label_format_members_are_string_values() -> None: - assert LabelFormat.native == "native" - assert {f.value for f in LabelFormat} == {"native", "coco", "yolo", "voc"} - - -def test_labels_config_defaults_to_native_format() -> None: - assert LabelsConfig().format is LabelFormat.native - - -def test_label_format_decorator_registers_instance() -> None: - @label_format - class _Dummy: - format = LabelFormat.coco # reuse an enum member; popped below - supported_tasks = frozenset({TaskKind.detection}) - - try: - assert LABEL_FORMAT_ADAPTERS[LabelFormat.coco].supported_tasks == frozenset( - {TaskKind.detection} - ) - finally: - LABEL_FORMAT_ADAPTERS.pop(LabelFormat.coco, None) - - -def test_registry_rejects_unknown_native() -> None: - with pytest.raises(ValueError, match="No adapter"): - resolve_label_format_adapter(LabelFormat.native, task_kind=TaskKind.detection) - - -def test_registry_resolves_supported_task() -> None: - adapter = resolve_label_format_adapter(LabelFormat.coco, task_kind=TaskKind.detection) - assert adapter.format is LabelFormat.coco - assert TaskKind.detection in adapter.supported_tasks - - -def test_registry_rejects_unsupported_task() -> None: - with pytest.raises(ValueError, match="does not support task"): - resolve_label_format_adapter(LabelFormat.yolo, 
task_kind=TaskKind.classification) - - -def test_coco_detection_records(tmp_path: Path) -> None: - import json - - from raitap.data.adapters.coco import CocoAdapter - - coco = { - "images": [ - {"id": 1, "file_name": "a.jpg"}, - {"id": 2, "file_name": "b.jpg"}, - ], - "annotations": [ - {"image_id": 1, "category_id": 3, "bbox": [10, 20, 30, 40]}, - {"image_id": 1, "category_id": 5, "bbox": [0, 0, 5, 5]}, - ], - "categories": [{"id": 3, "name": "car"}, {"id": 5, "name": "dog"}], - } - p = tmp_path / "instances.json" - p.write_text(json.dumps(coco)) - - records = CocoAdapter().to_detection_records(p, image_dir=None, class_names=None) - by_id = {r["sample_id"]: r for r in records} - assert by_id["a.jpg"]["boxes"] == [[10, 20, 40, 60], [0, 0, 5, 5]] - assert by_id["a.jpg"]["labels"] == [3, 5] - assert by_id["b.jpg"] == {"sample_id": "b.jpg", "boxes": [], "labels": []} - - -def test_coco_classification_records(tmp_path: Path) -> None: - import json - - from raitap.data.adapters.coco import CocoAdapter - - coco = { - "images": [{"id": 1, "file_name": "a.jpg"}], - "annotations": [{"image_id": 1, "category_id": 7, "bbox": [0, 0, 1, 1]}], - "categories": [{"id": 7, "name": "cat"}], - } - p = tmp_path / "c.json" - p.write_text(json.dumps(coco)) - records = CocoAdapter().to_classification_records(p) - assert records == [{"sample_id": "a.jpg", "label": 7}] - - -def test_coco_classification_rejects_zero_categories(tmp_path: Path) -> None: - import json - - from raitap.data.adapters.coco import CocoAdapter - - coco = { - "images": [{"id": 1, "file_name": "a.jpg"}], - "annotations": [], - "categories": [{"id": 7, "name": "cat"}], - } - p = tmp_path / "zero.json" - p.write_text(json.dumps(coco)) - with pytest.raises(ValueError, match="exactly one category per image"): - CocoAdapter().to_classification_records(p) - - -def test_coco_classification_rejects_multiple_categories(tmp_path: Path) -> None: - import json - - from raitap.data.adapters.coco import CocoAdapter - - coco = { - 
"images": [{"id": 1, "file_name": "a.jpg"}], - "annotations": [ - {"image_id": 1, "category_id": 3, "bbox": [0, 0, 1, 1]}, - {"image_id": 1, "category_id": 5, "bbox": [0, 0, 1, 1]}, - ], - "categories": [{"id": 3, "name": "car"}, {"id": 5, "name": "dog"}], - } - p = tmp_path / "multi.json" - p.write_text(json.dumps(coco)) - with pytest.raises(ValueError, match="exactly one category per image"): - CocoAdapter().to_classification_records(p) - - -def test_yolo_detection_records(tmp_path: Path) -> None: - from PIL import Image - - from raitap.data.adapters.yolo import YoloAdapter - - image_dir = tmp_path / "images" - image_dir.mkdir() - Image.new("RGB", (100, 200)).save(image_dir / "a.jpg") # w=100, h=200 - - label_dir = tmp_path / "labels" - label_dir.mkdir() - # class=2, cx=0.5 cy=0.5 w=0.2 h=0.1 -> center (50,100), box 20x20px - (label_dir / "a.txt").write_text("2 0.5 0.5 0.2 0.1\n") - - records = YoloAdapter().to_detection_records(label_dir, image_dir=image_dir, class_names=None) - assert len(records) == 1 - rec = records[0] - assert rec["sample_id"] == "a.jpg" - assert rec["labels"] == [2] - # x1 = (0.5-0.1)*100=40, y1=(0.5-0.05)*200=90, x2=60, y2=110 - assert len(rec["boxes"]) == 1 - assert rec["boxes"][0] == pytest.approx([40.0, 90.0, 60.0, 110.0]) - - -def test_voc_detection_records(tmp_path: Path) -> None: - from raitap.data.adapters.voc import VocAdapter - - xml = """ - a.jpg - person - 10203040 - - """ - d = tmp_path / "ann" - d.mkdir() - (d / "a.xml").write_text(xml) - - records = VocAdapter().to_detection_records( - d, image_dir=None, class_names=["background", "person", "car"] - ) - assert records == [{"sample_id": "a.jpg", "boxes": [[10.0, 20.0, 30.0, 40.0]], "labels": [1]}] - - -def test_voc_detection_rejects_object_without_bndbox(tmp_path: Path) -> None: - from raitap.data.adapters.voc import VocAdapter - - xml = """ - a.jpg - person - """ - d = tmp_path / "ann" - d.mkdir() - (d / "a.xml").write_text(xml) - - with pytest.raises(ValueError, match="has 
no "): - VocAdapter().to_detection_records( - d, image_dir=None, class_names=["background", "person", "car"] - ) - - -def test_detection_load_labels_via_coco(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: - import json - from types import SimpleNamespace - - import torch - - import raitap.data.data as data_mod - from raitap.data.types import LabelFormat - from raitap.task_families.detection import DetectionFamily - - coco = { - "images": [{"id": 1, "file_name": "a.jpg"}, {"id": 2, "file_name": "b.jpg"}], - "annotations": [{"image_id": 1, "category_id": 3, "bbox": [10, 20, 30, 40]}], - "categories": [{"id": 3, "name": "car"}], - } - labels_file = tmp_path / "instances.json" - labels_file.write_text(json.dumps(coco)) - - monkeypatch.setattr(data_mod, "get_source_path", lambda source, *, kind: tmp_path / source) - # tmp_path/"instances.json" is LABELS; tmp_path/"imgs" is DATA (unused by coco). - cfg = cast( - "AppConfig", - SimpleNamespace( - data=SimpleNamespace( - source="imgs", - labels=SimpleNamespace(source="instances.json", format=LabelFormat.coco), - ) - ), - ) - tensor = [object(), object()] # len == 2 samples - out = DetectionFamily().load_labels(cfg, tensor=tensor, sample_ids=["a.jpg", "b.jpg"]) - assert torch.equal(out[0]["boxes"], torch.tensor([[10.0, 20.0, 40.0, 60.0]])) - assert torch.equal(out[0]["labels"], torch.tensor([3])) - assert out[1]["boxes"].shape == (0, 4) - - -def test_classification_load_labels_via_coco( - tmp_path: Path, monkeypatch: pytest.MonkeyPatch -) -> None: - import json - from types import SimpleNamespace - - import torch - - import raitap.data.data as data_mod - from raitap.data.types import LabelFormat - - coco = { - "images": [{"id": 1, "file_name": "a.jpg"}, {"id": 2, "file_name": "b.jpg"}], - "annotations": [ - {"image_id": 1, "category_id": 0, "bbox": [0, 0, 1, 1]}, - {"image_id": 2, "category_id": 4, "bbox": [0, 0, 1, 1]}, - ], - "categories": [{"id": 0, "name": "x"}, {"id": 4, "name": "y"}], - } - labels_file = 
tmp_path / "c.json" - labels_file.write_text(json.dumps(coco)) - monkeypatch.setattr(data_mod, "get_source_path", lambda source, *, kind: tmp_path / source) - cfg = cast( - "AppConfig", - SimpleNamespace( - data=SimpleNamespace( - source="imgs", - labels=SimpleNamespace( - source="c.json", format=LabelFormat.coco, id_strategy="stem" - ), - ) - ), - ) - out = data_mod.load_classification_labels( - cfg, tensor=torch.zeros(2), sample_ids=["a.jpg", "b.jpg"] - ) - assert out is not None - assert torch.equal(out, torch.tensor([0, 4])) From 805621c51adbf4a1cf0b5a9d0dd933fa57378885 Mon Sep 17 00:00:00 2001 From: Stanislas Laurent Date: Wed, 24 Jun 2026 05:52:03 +0200 Subject: [PATCH 24/28] chore(config): migrate data.labels to discriminated label parsers (refs #338) --- .../fasterrcnn-udacity/assessment.yaml | 5 + .../imagecorruptions-imagenet/assessment.yaml | 1 + .../lwise-ham10000/assessment.yaml | 6 + .../marabou-mnist/assessment.yaml | 2 + .../noisetunnel-smoothgrad/assessment.yaml | 1 + example/assessment.yaml | 1 + src/raitap/configs/demo.yaml | 1 + src/raitap/data/tests/test_data.py | 60 +++---- src/raitap/data/tests/test_data_class.py | 150 +++++++++++------- .../data/tests/test_detection_ragged.py | 9 +- src/raitap/tests/test_api.py | 4 +- src/raitap/tests/test_example_recipes.py | 5 +- 12 files changed, 143 insertions(+), 102 deletions(-) diff --git a/contributor-configs/fasterrcnn-udacity/assessment.yaml b/contributor-configs/fasterrcnn-udacity/assessment.yaml index d9f38975..7fd84ea7 100644 --- a/contributor-configs/fasterrcnn-udacity/assessment.yaml +++ b/contributor-configs/fasterrcnn-udacity/assessment.yaml @@ -12,6 +12,11 @@ # Hand-authored COCO-class boxes in ``labels/udacity-boxes.json`` — see # ``labelling-data.md`` for the format and a candidate-generation helper. 
+defaults: + - raitap_schema + - data/labels: detection_json + - _self_ + experiment_name: fasterrcnn-udacity-detection-demo hardware: cpu diff --git a/contributor-configs/imagecorruptions-imagenet/assessment.yaml b/contributor-configs/imagecorruptions-imagenet/assessment.yaml index 97c2fbee..659e8708 100644 --- a/contributor-configs/imagecorruptions-imagenet/assessment.yaml +++ b/contributor-configs/imagecorruptions-imagenet/assessment.yaml @@ -12,6 +12,7 @@ defaults: - raitap_schema # required to bind the schema, must come first - reporting: html - metrics: multiclass_classification + - data/labels: tabular - _self_ hardware: gpu diff --git a/contributor-configs/lwise-ham10000/assessment.yaml b/contributor-configs/lwise-ham10000/assessment.yaml index e1d90cf9..a80866ba 100644 --- a/contributor-configs/lwise-ham10000/assessment.yaml +++ b/contributor-configs/lwise-ham10000/assessment.yaml @@ -7,6 +7,12 @@ # TorchScript modules do not support. Override with: # LWISE_HAM10000_MODEL=/path/to/lwise_ham10000_eager.pt +defaults: + - raitap_schema + - metrics: multiclass_classification + - data/labels: tabular + - _self_ + experiment_name: lwise-ham10000-dermoscopy-demo hardware: gpu diff --git a/contributor-configs/marabou-mnist/assessment.yaml b/contributor-configs/marabou-mnist/assessment.yaml index e00b0e7c..5a89aadd 100644 --- a/contributor-configs/marabou-mnist/assessment.yaml +++ b/contributor-configs/marabou-mnist/assessment.yaml @@ -3,6 +3,8 @@ # `data=mnist_samples`, `model=mlp_mnist`, `robustness=marabou_linf`. # No transparency block (the original demo invoked `~transparency`). 
defaults: + - raitap_schema + - data/labels: tabular - _self_ experiment_name: marabou_mnist_uc1 diff --git a/contributor-configs/noisetunnel-smoothgrad/assessment.yaml b/contributor-configs/noisetunnel-smoothgrad/assessment.yaml index cb7b9b3e..44946d44 100644 --- a/contributor-configs/noisetunnel-smoothgrad/assessment.yaml +++ b/contributor-configs/noisetunnel-smoothgrad/assessment.yaml @@ -8,6 +8,7 @@ defaults: - raitap_schema - metrics: multiclass_classification + - data/labels: tabular - _self_ hardware: cpu diff --git a/example/assessment.yaml b/example/assessment.yaml index 4a2df860..13fb6c72 100644 --- a/example/assessment.yaml +++ b/example/assessment.yaml @@ -2,6 +2,7 @@ defaults: - raitap_schema # required to bind the schema, must come first - reporting: html - metrics: multiclass_classification + - data/labels: tabular - _self_ hardware: gpu diff --git a/src/raitap/configs/demo.yaml b/src/raitap/configs/demo.yaml index d0ef4e17..ddb15c34 100644 --- a/src/raitap/configs/demo.yaml +++ b/src/raitap/configs/demo.yaml @@ -3,6 +3,7 @@ defaults: - raitap_schema # binds AppConfig dataclass → unset fields inherit defaults - metrics: multiclass_classification # narrows metrics schema so num_classes is accepted + - data/labels: tabular - _self_ hardware: cpu diff --git a/src/raitap/data/tests/test_data.py b/src/raitap/data/tests/test_data.py index b1e931e4..907778df 100644 --- a/src/raitap/data/tests/test_data.py +++ b/src/raitap/data/tests/test_data.py @@ -195,7 +195,7 @@ class TestDataPreprocessing: @staticmethod def _make_cfg(source: str, *, preprocessing: str | None) -> AppConfig: - from raitap.configs.schema import AppConfig, DataConfig, LabelsConfig, ModelConfig + from raitap.configs.schema import AppConfig, DataConfig, ModelConfig return cast( "AppConfig", @@ -205,7 +205,7 @@ def _make_cfg(source: str, *, preprocessing: str | None) -> AppConfig: name="test", source=source, preprocessing=preprocessing, - labels=LabelsConfig(), + labels=None, ), 
hardware=Hardware.cpu, ), @@ -239,7 +239,7 @@ def test_uniform_dir_without_preprocessing_still_loads(self, tmp_path: Path) -> def test_supplied_resolved_preprocessing_skips_resolution(self, tmp_path: Path) -> None: from torch import nn - from raitap.configs.schema import AppConfig, DataConfig, LabelsConfig, ModelConfig + from raitap.configs.schema import AppConfig, DataConfig, ModelConfig from raitap.data.preprocessing import ResolvedPreprocessing class _ShapeModule(nn.Module): @@ -255,7 +255,7 @@ def forward(self, image: torch.Tensor) -> torch.Tensor: name="test", source=str(tmp_path), preprocessing="model-bundled", - labels=LabelsConfig(), + labels=None, ), hardware=Hardware.cpu, ), @@ -279,7 +279,7 @@ def forward(self, image: torch.Tensor) -> torch.Tensor: def test_onnx_custom_file_data_factory_drives_data_loading( self, monkeypatch: pytest.MonkeyPatch, tmp_path: Path ) -> None: - from raitap.configs.schema import AppConfig, DataConfig, LabelsConfig, ModelConfig + from raitap.configs.schema import AppConfig, DataConfig, ModelConfig _write_image(tmp_path / "a.jpg", 32, 48) _write_image(tmp_path / "b.jpg", 40, 64) @@ -319,7 +319,7 @@ def test_onnx_custom_file_data_factory_drives_data_loading( name="test", source=str(tmp_path), preprocessing=str(preprocessing_path), - labels=LabelsConfig(), + labels=None, ), hardware=Hardware.cpu, ), @@ -369,7 +369,7 @@ def test_sample_source_loads_native_resolution_then_transforms(self, tmp_path: P breaks pretrained-weight accuracy on `raitap --demo`.""" from torch import nn - from raitap.configs.schema import AppConfig, DataConfig, LabelsConfig, ModelConfig + from raitap.configs.schema import AppConfig, DataConfig, ModelConfig from raitap.data.samples import SAMPLE_SOURCES # Stage a fake sample at varied native sizes so the test would fail @@ -411,7 +411,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: name="fake_native_samples", source="fake_native_samples", preprocessing="model-bundled", - labels=LabelsConfig(), + 
labels=None, ), hardware=Hardware.cpu, ), @@ -589,7 +589,7 @@ def test_unknown_extension_raises(self, tmp_path: Path) -> None: def test_tabular_applies_data_module(self, tmp_path: Path) -> None: from torch import nn - from raitap.configs.schema import AppConfig, DataConfig, LabelsConfig, ModelConfig + from raitap.configs.schema import AppConfig, DataConfig, ModelConfig from raitap.data.preprocessing import ResolvedPreprocessing class _ScaleModule(nn.Module): @@ -606,7 +606,7 @@ def forward(self, batch: torch.Tensor) -> torch.Tensor: name="tab", source=str(p), preprocessing="./scale.py", - labels=LabelsConfig(), + labels=None, ), hardware=Hardware.cpu, ), @@ -701,7 +701,9 @@ def test_url_source_loads_image_via_get_source_path(self, tmp_path: Path) -> Non assert data.tensor.shape == (1, 3, 32, 32) def test_sample_labels_align_with_sample_images(self, tmp_path: Path) -> None: + from raitap.configs.schema import AppConfig, DataConfig, ModelConfig, TabularLabelsConfig from raitap.data.samples import SAMPLE_LABELS + from raitap.data.types import LabelEncoding with ( patch("raitap.data.samples._CACHE_DIR", tmp_path), @@ -711,30 +713,20 @@ def test_sample_labels_align_with_sample_images(self, tmp_path: Path) -> None: mock_download.side_effect = lambda _url, dest: _write_image(dest, 32, 32) cfg = cast( "AppConfig", - type( - "AppConfig", - (), - { - "data": type( - "DataConfig", - (), - { - "source": "imagenet_samples", - "name": "imagenet_samples", - "labels": type( - "LabelsConfig", - (), - { - "source": "imagenet_samples", - "id_column": "image", - "column": "label", - "encoding": "index", - }, - )(), - }, - )() - }, - )(), + AppConfig( + model=ModelConfig(source="resnet50"), + data=DataConfig( + name="imagenet_samples", + source="imagenet_samples", + labels=TabularLabelsConfig( + source="imagenet_samples", + id_column="image", + column="label", + encoding=LabelEncoding.index, + ), + ), + hardware=Hardware.cpu, + ), ) data = Data(cfg) diff --git 
a/src/raitap/data/tests/test_data_class.py b/src/raitap/data/tests/test_data_class.py index 44b207f7..72ead53e 100644 --- a/src/raitap/data/tests/test_data_class.py +++ b/src/raitap/data/tests/test_data_class.py @@ -16,9 +16,9 @@ from raitap.configs.schema import AppConfig +from raitap.configs.schema import DirectoryLabelsConfig, TabularLabelsConfig from raitap.data import Data -from raitap.data.data import _load_directory_labels, load_classification_labels -from raitap.data.types import DIRECTORY_LABELS_SOURCE, InputModality +from raitap.data.types import InputModality def _write_image(path: Path) -> None: @@ -35,19 +35,28 @@ def _make_config( labels_encoding: str | None = None, labels_id_strategy: str | None = None, ) -> AppConfig: + from raitap.data.types import IdStrategy, LabelEncoding + + if labels_source is not None: + encoding = LabelEncoding(labels_encoding) if labels_encoding else None + id_strategy = IdStrategy(labels_id_strategy) if labels_id_strategy else IdStrategy.auto + labels = TabularLabelsConfig( + source=labels_source, + id_column=labels_id_column, + column=labels_column, + encoding=encoding, + id_strategy=id_strategy, + ) + else: + labels = None + return cast( "AppConfig", SimpleNamespace( data=SimpleNamespace( source=source, name=name, - labels=SimpleNamespace( - source=labels_source, - id_column=labels_id_column, - column=labels_column, - encoding=labels_encoding, - id_strategy=labels_id_strategy, - ), + labels=labels, ) ), ) @@ -342,16 +351,15 @@ def test_data_raises_for_unsupported_id_strategy(self, tmp_path: Path) -> None: _write_image(data_dir / "x.jpg") labels_file = tmp_path / "labels.csv" labels_file.write_text("image,label\nx,0\n") - config = _make_config( - str(data_dir), - labels_source=str(labels_file), - labels_id_column="image", - labels_column="label", - labels_encoding="index", - labels_id_strategy="bogus", - ) - - with pytest.raises(ValueError, match=r"Unsupported data\.labels\.id_strategy"): + with pytest.raises(ValueError): 
+ config = _make_config( + str(data_dir), + labels_source=str(labels_file), + labels_id_column="image", + labels_column="label", + labels_encoding="index", + labels_id_strategy="bogus", + ) Data(config) def test_data_records_image_modality_for_image_dir(self, tmp_path: Path) -> None: @@ -397,14 +405,13 @@ def test_data_raises_for_unsupported_labels_encoding(self, tmp_path: Path) -> No csv_file.write_text("a\n1\n2") labels_file = tmp_path / "labels.csv" labels_file.write_text("label\n0\n1") - config = _make_config( - str(csv_file), - labels_source=str(labels_file), - labels_column="label", - labels_encoding="ordinal", - ) - - with pytest.raises(ValueError, match=r"Unsupported data\.labels\.encoding"): + with pytest.raises(ValueError): + config = _make_config( + str(csv_file), + labels_source=str(labels_file), + labels_column="label", + labels_encoding="ordinal", + ) Data(config) @@ -482,54 +489,85 @@ def test_log_includes_full_metadata(self, tmp_path: Path) -> None: assert "dtype" in call_args -class TestLoadDirectoryLabels: +class TestLoadDirectoryLabelsViaParser: + """Directory label behavior via DirectoryLabelParser (replaces deleted _load_directory_labels). + + The private _load_directory_labels function and load_classification_labels were removed in + the discriminated-config refactor. Behavior is now covered by DirectoryLabelParser + and _resolve_and_parse_labels. These tests preserve the behavioral contracts. 
+ """ + + def _run_directory_parser(self, sample_ids: list[str] | None) -> torch.Tensor | None: + from types import SimpleNamespace + from typing import cast + + from raitap.data.data import _resolve_and_parse_labels + from raitap.types import TaskKind + + cfg = cast( + "AppConfig", + SimpleNamespace( + data=SimpleNamespace(labels=DirectoryLabelsConfig(), source=None), + model=SimpleNamespace(class_names=None), + ), + ) + return _resolve_and_parse_labels( + cfg, task_kind=TaskKind.classification, tensor=None, sample_ids=sample_ids + ) + def test_derives_labels_from_top_level_class_folder(self) -> None: - result = _load_directory_labels(["NORMAL/a.jpg", "PNEUMONIA/b.jpg", "NORMAL/c.jpg"]) + result = self._run_directory_parser(["NORMAL/a.jpg", "PNEUMONIA/b.jpg", "NORMAL/c.jpg"]) assert result is not None assert torch.equal(result, torch.tensor([0, 1, 0])) def test_nesting_within_class_stays_top_level(self) -> None: - result = _load_directory_labels(["NORMAL/sub/a.jpg", "PNEUMONIA/b.jpg"]) + result = self._run_directory_parser(["NORMAL/sub/a.jpg", "PNEUMONIA/b.jpg"]) assert result is not None assert torch.equal(result, torch.tensor([0, 1])) def test_single_class_is_all_zeros_not_error(self) -> None: - result = _load_directory_labels(["NORMAL/a.jpg", "NORMAL/b.jpg"]) + result = self._run_directory_parser(["NORMAL/a.jpg", "NORMAL/b.jpg"]) assert result is not None assert torch.equal(result, torch.tensor([0, 0])) def test_sample_without_class_subdir_returns_none(self) -> None: - with pytest.warns(UserWarning, match="class subdirectory"): - result = _load_directory_labels(["a.jpg", "NORMAL/b.jpg"]) + result = self._run_directory_parser(["a.jpg", "NORMAL/b.jpg"]) assert result is None def test_none_sample_ids_returns_none(self) -> None: - with pytest.warns(UserWarning, match="class subdirectories"): - result = _load_directory_labels(None) + result = self._run_directory_parser(None) assert result is None def test_empty_sample_ids_returns_none(self) -> None: - with 
pytest.warns(UserWarning, match="class subdirectories"): - result = _load_directory_labels([]) + result = self._run_directory_parser([]) assert result is None + def test_directory_source_derives_labels_from_layout(self, tmp_path: Path) -> None: + """Data with DirectoryLabelsConfig derives labels from the sample layout.""" + from types import SimpleNamespace + from typing import cast + + + img_dir = tmp_path / "images" + (img_dir / "NORMAL").mkdir(parents=True) + (img_dir / "PNEUMONIA").mkdir(parents=True) + _write_image(img_dir / "NORMAL" / "a.jpg") + _write_image(img_dir / "PNEUMONIA" / "b.jpg") + _write_image(img_dir / "NORMAL" / "c.jpg") + + cfg = cast( + "AppConfig", + SimpleNamespace( + data=SimpleNamespace( + source=str(img_dir), + name="test_dir", + labels=DirectoryLabelsConfig(), + ) + ), + ) + data = Data(cfg) -class TestLoadClassificationLabelsDirectorySource: - def test_directory_source_derives_labels(self) -> None: - config = _make_config("images", labels_source=DIRECTORY_LABELS_SOURCE) - sample_ids = ["NORMAL/a.jpg", "PNEUMONIA/b.jpg", "NORMAL/c.jpg"] - tensor = torch.zeros(len(sample_ids), 3, 8, 8) - - result = load_classification_labels(config, tensor=tensor, sample_ids=sample_ids) - - assert result is not None - assert torch.equal(result, torch.tensor([0, 1, 0])) - - def test_directory_source_none_sample_ids_returns_none(self) -> None: - config = _make_config("rows.csv", labels_source=DIRECTORY_LABELS_SOURCE) - tensor = torch.zeros(3, 4) - - with pytest.warns(UserWarning, match="class subdirectories"): - result = load_classification_labels(config, tensor=tensor, sample_ids=None) - - assert result is None + assert data.labels is not None + assert isinstance(data.labels, torch.Tensor) + # NORMAL=0, PNEUMONIA=1; sorted by posix path: NORMAL/a, NORMAL/c, PNEUMONIA/b + assert data.labels.tolist() == [0, 0, 1] diff --git a/src/raitap/data/tests/test_detection_ragged.py b/src/raitap/data/tests/test_detection_ragged.py index 6f642a41..bbad5a7b 100644 --- 
a/src/raitap/data/tests/test_detection_ragged.py +++ b/src/raitap/data/tests/test_detection_ragged.py @@ -48,14 +48,7 @@ def _make_config(source: str, name: str = "test_det") -> AppConfig: data=SimpleNamespace( source=source, name=name, - labels=SimpleNamespace( - source=None, - kind=None, - id_column=None, - column=None, - encoding=None, - id_strategy=None, - ), + labels=None, ) ), ) diff --git a/src/raitap/tests/test_api.py b/src/raitap/tests/test_api.py index 772e0d74..4f2d412c 100644 --- a/src/raitap/tests/test_api.py +++ b/src/raitap/tests/test_api.py @@ -23,10 +23,10 @@ from raitap.api import instantiate from raitap.configs.schema import ( DataConfig, - LabelsConfig, ModelConfig, MulticlassClassificationMetricsConfig, RobustnessConfig, + TabularLabelsConfig, TransparencyConfig, ) from raitap.data.preprocessing import resolve_preprocessing @@ -56,7 +56,7 @@ def _demo_app_config() -> AppConfig: name="imagenet_samples", source="imagenet_samples", forward_batch_size=4, - labels=LabelsConfig( + labels=TabularLabelsConfig( source="imagenet_samples", id_column="image", column="label", diff --git a/src/raitap/tests/test_example_recipes.py b/src/raitap/tests/test_example_recipes.py index 521cfb8e..a6c40a1d 100644 --- a/src/raitap/tests/test_example_recipes.py +++ b/src/raitap/tests/test_example_recipes.py @@ -28,7 +28,8 @@ pytest.importorskip("torchmetrics") # metrics adapter from raitap import AppConfig, Hardware, run -from raitap.data import DataConfig, LabelsConfig +from raitap.configs.schema import TabularLabelsConfig +from raitap.data import DataConfig from raitap.metrics import multiclass_classification as classification from raitap.models import ModelConfig from raitap.pipeline.outputs import RunOutputs @@ -60,7 +61,7 @@ def _base_kwargs(experiment_name: str) -> _BaseKwargs: name="imagenet_samples", source="imagenet_samples", forward_batch_size=4, - labels=LabelsConfig( + labels=TabularLabelsConfig( source="imagenet_samples", id_column="image", column="label", 
From 1224e7b6d50449059be04607c11518e5a8070ce0 Mon Sep 17 00:00:00 2001 From: Stanislas Laurent Date: Wed, 24 Jun 2026 06:01:29 +0200 Subject: [PATCH 25/28] docs: document label parsers and discriminated labels config (refs #338) --- docs/contributor/modules/data.md | 70 ++++++++++--- docs/modules/data/configuration.md | 125 ++++++++--------------- docs/modules/data/own-vs-built-in.md | 37 +++---- src/raitap/data/tests/test_data_class.py | 1 - 4 files changed, 113 insertions(+), 120 deletions(-) diff --git a/docs/contributor/modules/data.md b/docs/contributor/modules/data.md index 82bfc9a7..ef4e4cc6 100644 --- a/docs/contributor/modules/data.md +++ b/docs/contributor/modules/data.md @@ -45,9 +45,14 @@ referenceable by name in `data.source`. Registration lives in } ``` -3. **Use it** from any consumer config: +3. **Use it** from any consumer config (select the `tabular` label variant): ```yaml + defaults: + - raitap_schema + - data/labels: tabular + - _self_ + data: name: cifar10_samples source: cifar10_samples # resolves via SAMPLE_SOURCES @@ -69,19 +74,56 @@ referenceable by name in `data.source`. Registration lives in 5. **Update docs** — add the new sample name to {doc}`/modules/data/own-vs-built-in`. -## Adding a label format - -1. Create `src/raitap/data/adapters/.py` with a class decorated - `@label_format`. Set `format = LabelFormat.` and - `supported_tasks = frozenset({...})`. -2. Implement `to_detection_records` and/or `to_classification_records`, - returning the native record shape (`{sample_id, boxes (xyxy), labels}` or - `{sample_id, label}`). Raise `ValueError` for an unsupported task. -3. Import it in `src/raitap/data/_label_format_adapters.py` so the decorator - fires. -4. Add a `LabelFormat` member in `src/raitap/data/types.py` and a row to the - label-format table in `docs/modules/data/configuration.md`. -5. Add tests in `src/raitap/data/tests/test_label_formats.py`. +## Adding a label parser + +1. Create `src/raitap/data/label_parsers/.py`. 
Add a dataclass in + `src/raitap/configs/schema.py` that subclasses `LabelsConfig`: + + ```python + @dataclass + class MyFormatLabelsConfig(LabelsConfig): + _target_: str = "MyFormatLabelParser" + source: str = MISSING + id_strategy: IdStrategy = IdStrategy.auto + # add only fields this variant uses + ``` + +2. Write the parser class decorated with `@label_parser`: + + ```python + from raitap.data.label_parsers.registration import label_parser + from raitap.configs.schema import MyFormatLabelsConfig + + @label_parser(registry_name="my_format", schema=MyFormatLabelsConfig) + class MyFormatLabelParser: + supported_tasks: frozenset[TaskKind] = frozenset({TaskKind.detection}) + + def __init__(self, *, source: str, id_strategy: IdStrategy = IdStrategy.auto) -> None: + ... + + def parse( + self, + *, + task_kind: TaskKind, + tensor: Any, + sample_ids: list[str] | None, + data_source: str | None, + class_names: list[str] | None, + ) -> Any: + ... + ``` + +3. Import and re-export in `src/raitap/data/label_parsers/__init__.py`: + + ```python + from .my_format import MyFormatLabelParser # pyright: ignore[reportUnusedImport] + ``` + + Add `"MyFormatLabelParser"` to `__all__`. + +4. Add tests in `src/raitap/data/tests/`. + +5. Add a row to the label-variant table in `docs/modules/data/configuration.md`. ## Sample discovery and label alignment diff --git a/docs/modules/data/configuration.md b/docs/modules/data/configuration.md index 8f030846..e5f747e8 100644 --- a/docs/modules/data/configuration.md +++ b/docs/modules/data/configuration.md @@ -66,55 +66,6 @@ myst: --acknowledge-preprocessing-off. See {doc}`preprocessing`. -:option: labels.source -:allowed: string, null -:default: null -:description: Optional path to a labels file (CSV, TSV, or Parquet), URL, or - named sample set. When set to a sample name (e.g. `"imagenet_samples"`), - raitap resolves to the labels CSV bundled with that sample. Sample sets - without bundled labels raise an error. 
The reserved value `"directory"` - derives classification labels from each sample's top-level class - subdirectory (torchvision `ImageFolder` style; no labels file) — see - {doc}`own-vs-built-in`. In that mode `id_column` and `id_strategy` do not - apply. - -:option: labels.id_column -:allowed: string, null -:default: null -:description: Optional sample-ID column used to align labels with filenames, - for example `"image"`. - -:option: labels.column -:allowed: string, null -:default: null -:description: Optional class-label column. If omitted, one-hot numeric columns - are reduced with `argmax`. - -:option: labels.encoding -:allowed: "index", "one_hot", "argmax", null -:default: null -:description: Optional label parsing strategy. - -:option: labels.id_strategy -:allowed: "auto", "relative_path", "stem" -:default: "auto" -:description: How label-file ids are matched against discovered sample - files. `"auto"` (default) inspects the id column and switches to - `"relative_path"` if any value contains `/` or `\`, otherwise falls back - to `"stem"`. `"relative_path"` keeps directory components and supports - nested ImageFolder layouts (e.g. `NORMAL/IM-0001.jpeg`) — required when - filename stems collide across class subdirs. `"stem"` matches by basename only (flat-dir layouts). - -:option: labels.format -:allowed: "native", "coco", "yolo", "voc" -:default: "native" -:description: External label file format. `"native"` (default) reads RAITAP's - own shape (classification: CSV/TSV/Parquet or the `"directory"` source; - detection: the JSON record list). `"coco"`, `"yolo"`, and `"voc"` convert a - standard annotation file to the native shape before alignment. `"yolo"` and - `"voc"` are detection only; `"coco"` serves detection and classification. - Non-native formats align by sample id, so a labels id is required. 
- :option: input_metadata :allowed: dict, null :default: null @@ -155,56 +106,62 @@ data: forward_batch_size: 32 preprocessing: model-bundled model_input_transformation: model-bundled - labels: - source: "./data/labels.csv" - id_column: "image" - column: "label" - encoding: "index" - id_strategy: "auto" -:cli: data.source="./data/images" data.preprocessing=model-bundled data.model_input_transformation=model-bundled data.labels.source="./data/labels.csv" data.labels.column=label +:cli: data.source="./data/images" data.preprocessing=model-bundled data.model_input_transformation=model-bundled :python: -from raitap.data import ( - DataConfig, - IdStrategy, - LabelEncoding, - LabelsConfig, - Preprocessing, -) +from raitap.data import DataConfig data = DataConfig( name="my-dataset", description="Internal validation set", source="./data/images", forward_batch_size=32, - preprocessing=Preprocessing.model_bundled, - model_input_transformation=Preprocessing.model_bundled, - labels=LabelsConfig( - source="./data/labels.csv", - id_column="image", - column="label", - encoding=LabelEncoding.index, - id_strategy=IdStrategy.auto, - ), + preprocessing="model-bundled", + model_input_transformation="model-bundled", ) ``` -**Label formats.** RAITAP reads common annotation formats directly via `data.labels.format`. +**Label variants.** `data.labels` is a Hydra config-group: select the variant with `defaults: [data/labels: ]`, then set its fields under `data.labels:`. Each variant exposes only the fields it accepts — setting a foreign field is a load error. 
+ +```yaml +defaults: + - raitap_schema + - data/labels: tabular # pick one variant + - _self_ + +data: + source: "./data/images" + labels: + source: "./data/labels.csv" + id_column: "image" + column: "label" +``` + +| Variant | Task(s) | Fields | +| ---------------- | -------------------------- | ------------------------------------------------------- | +| `tabular` | classification | `source`, `id_column`, `column`, `encoding`, `id_strategy` | +| `directory` | classification | *(none — class from top-level subdir name)* | +| `coco` | detection + classification | `source`, `id_strategy` | +| `yolo` | detection | `source`, `id_strategy` | +| `voc` | detection | `source`, `id_strategy`, `class_names` | +| `detection_json` | detection | `source`, `id_strategy` | + +**`tabular`** — CSV, TSV, or Parquet file. `id_column` aligns rows to sample filenames; `column` names the label column (omit for one-hot numeric columns, which are reduced with `argmax`). `encoding` is one of `"index"`, `"one_hot"`, `"argmax"`. `id_strategy` controls alignment — see below. + +**`directory`** — no labels file. Class is each sample's top-level subdirectory (torchvision `ImageFolder` style). See {doc}`own-vs-built-in`. + +**`coco`** — single `instances.json` file (`source`). Category ids pass through unchanged. Serves detection and classification. + +**`yolo`** — directory of per-image `.txt` files (`source`). Needs `data.source` set to the image directory so RAITAP can match annotation files to images. Detection only. Category ids pass through unchanged. + +**`voc`** — directory of per-image `.xml` files (`source`). `class_names` maps VOC names to integer ids; falls back to `model.class_names`, then the standard 20-class VOC order. Detection only. 
-| Format | Detection | Classification | Source layout | -| -------- | --------- | -------------- | ---------------------------------------------- | -| `native` | yes | yes | JSON record list / CSV-TSV-Parquet | -| `coco` | yes | yes | single `instances.json` | -| `yolo` | yes | no | dir of per-image `.txt` (needs `data.source`) | -| `voc` | yes | no | dir of per-image `.xml` | +**`detection_json`** — RAITAP native JSON record list `[{"sample_id": ..., "boxes": ..., "labels": ...}]`. Detection only. -COCO and YOLO labels keep their category ids unchanged. VOC class names map to -ids by `model.class_names` order, else the standard 20-class VOC order. +All detection variants honour `id_strategy` for nested image-directory layouts. -Detection formats match each record's `sample_id` against the discovered image -file by exact name, so the image directory must be flat (nested subdirs are not -matched). Classification labels still align via `labels.id_strategy`. +**`id_strategy`** (`"auto"` / `"relative_path"` / `"stem"`, default `"auto"`): how label ids are matched against discovered sample files. `"auto"` inspects the id column and switches to `"relative_path"` if any value contains `/` or `\`, otherwise `"stem"`. `"relative_path"` keeps directory components (e.g. `NORMAL/IM-0001`) — required when filename stems collide across class subdirs. `"stem"` matches by basename only. 
For tabular models whose backend expects an unusual per-sample layout (such as ACAS Xu, a Torch network whose forward takes `(N, 1, 1, 5)`), supply diff --git a/docs/modules/data/own-vs-built-in.md b/docs/modules/data/own-vs-built-in.md index b3364cf7..8a4006fd 100644 --- a/docs/modules/data/own-vs-built-in.md +++ b/docs/modules/data/own-vs-built-in.md @@ -33,11 +33,12 @@ data: column: "label" :python: -from raitap.data import DataConfig, LabelsConfig +from raitap.configs.schema import TabularLabelsConfig +from raitap.data import DataConfig data = DataConfig( source="./data/images", # a directory of images - labels=LabelsConfig( + labels=TabularLabelsConfig( source="./data/labels.csv", id_column="image", column="label", @@ -76,11 +77,12 @@ data: # id_strategy: "auto" # default — relative paths auto-detected :python: -from raitap.data import DataConfig, IdStrategy, LabelsConfig +from raitap.configs.schema import TabularLabelsConfig +from raitap.data import DataConfig data = DataConfig( source="./data/test", - labels=LabelsConfig( + labels=TabularLabelsConfig( source="./data/labels.csv", id_column="image", column="label", @@ -134,8 +136,8 @@ both become `IM-0001`), which raises a duplicate-id error. ### Labels from directory structure If your images are already organised into one folder per class (the -torchvision `ImageFolder` convention), set `labels.source: "directory"` to use -the folder names as labels. No labels file needed. +torchvision `ImageFolder` convention), select the `directory` label variant. +No labels file needed. 
```text data/train/ @@ -144,27 +146,20 @@ data/train/ └── PNEUMONIA/IM-0001.jpeg # label: PNEUMONIA ``` -```{config-tabs} -:yaml: +```yaml +defaults: + - raitap_schema + - data/labels: directory + - _self_ + data: source: "./data/train" - labels: - source: "directory" - -:python: -from raitap.data import DIRECTORY_LABELS_SOURCE, DataConfig, LabelsConfig - -data = DataConfig( - source="./data/train", - labels=LabelsConfig(source=DIRECTORY_LABELS_SOURCE), # == "directory" -) ``` The class is each sample's top-level subdirectory; nesting within a class folder is fine. Class ids are assigned alphabetically (`NORMAL` to `0`, -`PNEUMONIA` to `1`). `id_column` and `id_strategy` do not apply. If images sit -directly under the source with no class subdirs, RAITAP warns and falls back to -predictions as metric targets. +`PNEUMONIA` to `1`). If images sit directly under the source with no class +subdirs, RAITAP warns and falls back to predictions as metric targets. If you want to evaluate metrics against ground-truth labels, configure the optional `data.labels` block as described in {doc}`configuration`. 
diff --git a/src/raitap/data/tests/test_data_class.py b/src/raitap/data/tests/test_data_class.py index 72ead53e..1fba154e 100644 --- a/src/raitap/data/tests/test_data_class.py +++ b/src/raitap/data/tests/test_data_class.py @@ -547,7 +547,6 @@ def test_directory_source_derives_labels_from_layout(self, tmp_path: Path) -> No from types import SimpleNamespace from typing import cast - img_dir = tmp_path / "images" (img_dir / "NORMAL").mkdir(parents=True) (img_dir / "PNEUMONIA").mkdir(parents=True) From 4ee261f1a8d5d4ca5bc756d8c2eea58ac4741c61 Mon Sep 17 00:00:00 2001 From: Stanislas Laurent Date: Thu, 25 Jun 2026 16:38:15 +0200 Subject: [PATCH 26/28] fix(data): migrate e2e detection test to parser seam; warn against inline _target_ (refs #338) --- docs/modules/data/configuration.md | 2 +- src/raitap/task_families/tests/test_base.py | 3 --- src/raitap/tests/test_e2e_detection.py | 19 +++++++++++++------ 3 files changed, 14 insertions(+), 10 deletions(-) diff --git a/docs/modules/data/configuration.md b/docs/modules/data/configuration.md index e5f747e8..84f468cf 100644 --- a/docs/modules/data/configuration.md +++ b/docs/modules/data/configuration.md @@ -122,7 +122,7 @@ data = DataConfig( ) ``` -**Label variants.** `data.labels` is a Hydra config-group: select the variant with `defaults: [data/labels: ]`, then set its fields under `data.labels:`. Each variant exposes only the fields it accepts — setting a foreign field is a load error. +**Label variants.** `data.labels` is a Hydra config-group: select the variant with `defaults: [data/labels: ]`, then set its fields under `data.labels:`. Each variant exposes only the fields it accepts — setting a foreign field is a load error. Always select via the `defaults` group; inlining `_target_` directly (e.g. `data.labels: {_target_: TabularLabelParser, bogus: 1}`) bypasses struct-mode validation and unknown fields are silently dropped. 
```yaml defaults: diff --git a/src/raitap/task_families/tests/test_base.py b/src/raitap/task_families/tests/test_base.py index 2df0e75a..78dcaa79 100644 --- a/src/raitap/task_families/tests/test_base.py +++ b/src/raitap/task_families/tests/test_base.py @@ -27,9 +27,6 @@ def adapt_loaded_inputs(self, tensor: object) -> object: def validate_inputs(self, tensor: object) -> None: pass - def load_labels(self, cfg: object, *, tensor: object, sample_ids: object) -> object: - pass - def validate_labels(self, labels: object) -> None: pass diff --git a/src/raitap/tests/test_e2e_detection.py b/src/raitap/tests/test_e2e_detection.py index 66d22414..9824f28e 100644 --- a/src/raitap/tests/test_e2e_detection.py +++ b/src/raitap/tests/test_e2e_detection.py @@ -97,19 +97,26 @@ def test_detection_pipeline_e2e_via_fasterrcnn_mobilenet(tmp_path: Path) -> None labels_path = tmp_path / "detection_labels.json" labels_path.write_text(json.dumps(labels_payload)) - # Bypass Data.__init__ and call DetectionFamily.load_labels directly; the + # Bypass Data.__init__ and call _resolve_and_parse_labels directly; the # detection label loader has its own dedicated coverage in # src/raitap/data/tests/test_detection_labels.py. This test focuses on the # pipeline plumbing downstream of Data. 
- from raitap.task_families.detection import DetectionFamily + from raitap.configs.schema import DetectionJsonLabelsConfig + from raitap.data.data import _resolve_and_parse_labels + from raitap.types import TaskKind - labels_cfg = SimpleNamespace(source=str(labels_path)) load_cfg = cast( "AppConfig", - SimpleNamespace(data=SimpleNamespace(labels=labels_cfg)), + SimpleNamespace( + data=SimpleNamespace( + labels=DetectionJsonLabelsConfig(source=str(labels_path)), + source=None, + ), + model=SimpleNamespace(class_names=None), + ), ) - data.labels = DetectionFamily().load_labels( - load_cfg, tensor=data.tensor, sample_ids=data.sample_ids + data.labels = _resolve_and_parse_labels( + load_cfg, task_kind=TaskKind.detection, tensor=data.tensor, sample_ids=data.sample_ids ) # --- app config -------------------------------------------------------- From 39340d9a354edc84481a93f27c929ae6a3d5a8d8 Mon Sep 17 00:00:00 2001 From: Stanislas Laurent Date: Thu, 25 Jun 2026 16:45:36 +0200 Subject: [PATCH 27/28] fix(data): label parsers declare no uv extra to satisfy deps static-scan (refs #338) --- src/raitap/data/label_parsers/registration.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/raitap/data/label_parsers/registration.py b/src/raitap/data/label_parsers/registration.py index 3ade1ab7..cc65235f 100644 --- a/src/raitap/data/label_parsers/registration.py +++ b/src/raitap/data/label_parsers/registration.py @@ -34,7 +34,13 @@ def label_parser( """Decorator: register a label-parser adapter. ``registry_name`` is required. Mirrors ``metrics_adapter`` shape. + + Label parsers are core (stdlib only — no optional dependency), so they + declare no uv extra. Without this default the schema-backed auto-extra + (``extra=registry_name``) would register phantom extras like ``tabular`` + that no ``pyproject`` group provides, breaking the deps static-scan gate. 
""" + common.setdefault("extra", "") def wrap(cls: type[T]) -> type[T]: return _register_core(cls, family=LABELS, **common) From a32db1bcc8884f742cfa4d7d183abbe0716ce917 Mon Sep 17 00:00:00 2001 From: Stanislas Laurent Date: Thu, 25 Jun 2026 16:54:25 +0200 Subject: [PATCH 28/28] test(data): drop redundant imports, wire _VOC_ONLY_FIELDS into leakage test (refs #338) --- src/raitap/configs/tests/test_labels_schema.py | 14 +++++--------- src/raitap/data/label_parsers/coco.py | 2 -- 2 files changed, 5 insertions(+), 11 deletions(-) diff --git a/src/raitap/configs/tests/test_labels_schema.py b/src/raitap/configs/tests/test_labels_schema.py index 85154c2d..13f2eaef 100644 --- a/src/raitap/configs/tests/test_labels_schema.py +++ b/src/raitap/configs/tests/test_labels_schema.py @@ -20,8 +20,6 @@ def test_directory_config_has_only_target() -> None: def test_labelformat_enum_is_gone() -> None: - import importlib - data_types = importlib.import_module("raitap.data.types") with pytest.raises(AttributeError): getattr(data_types, "LabelFormat") # noqa: B009 @@ -160,19 +158,17 @@ def test_no_cross_variant_field_leakage(registry_name: str) -> None: ) if registry_name in _DETECTION_VARIANTS: - leaked = _TABULAR_ONLY_FIELDS & field_names - assert not leaked, f"{registry_name!r} builder leaks tabular-only fields: {leaked}" - assert "class_names" not in field_names, ( - f"{registry_name!r} builder should not have 'class_names'" - ) + leaked = (_TABULAR_ONLY_FIELDS | _VOC_ONLY_FIELDS) & field_names + assert not leaked, f"{registry_name!r} builder leaks foreign fields: {leaked}" if registry_name == "voc": leaked = _TABULAR_ONLY_FIELDS & field_names assert not leaked, f"voc builder leaks tabular-only fields: {leaked}" - assert "class_names" in field_names, "voc builder must have 'class_names'" + assert field_names >= _VOC_ONLY_FIELDS, "voc builder must have 'class_names'" if registry_name == "tabular": assert field_names >= _TABULAR_ONLY_FIELDS, ( f"tabular builder is missing expected 
fields; got {field_names}" ) - assert "class_names" not in field_names, "tabular builder should not have 'class_names'" + leaked = _VOC_ONLY_FIELDS & field_names + assert not leaked, "tabular builder should not have 'class_names'" diff --git a/src/raitap/data/label_parsers/coco.py b/src/raitap/data/label_parsers/coco.py index 673a39e0..12b3f332 100644 --- a/src/raitap/data/label_parsers/coco.py +++ b/src/raitap/data/label_parsers/coco.py @@ -20,8 +20,6 @@ if TYPE_CHECKING: from pathlib import Path - import pandas as pd - @label_parser(registry_name="coco", schema=CocoLabelsConfig) class CocoLabelParser: