From 95d3bead3f680261a7e667be8a40093db2cad768 Mon Sep 17 00:00:00 2001
From: Stanislas Laurent <stnsls.lrt.accnts@gmail.com>
Date: Tue, 23 Jun 2026 23:20:13 +0200
Subject: [PATCH 01/28] feat(data): add LabelFormat enum and
 LabelsConfig.format field (refs #338)

---
 src/raitap/configs/schema.py                |  6 +++++-
 src/raitap/data/tests/test_label_formats.py | 11 +++++++++++
 src/raitap/data/types.py                    | 16 ++++++++++++++++
 3 files changed, 32 insertions(+), 1 deletion(-)
 create mode 100644 src/raitap/data/tests/test_label_formats.py

diff --git a/src/raitap/configs/schema.py b/src/raitap/configs/schema.py
index fbab40f7..1910345c 100644
--- a/src/raitap/configs/schema.py
+++ b/src/raitap/configs/schema.py
@@ -5,7 +5,7 @@
 
 from omegaconf import MISSING
 
-from raitap.data.types import IdStrategy, LabelEncoding
+from raitap.data.types import IdStrategy, LabelEncoding, LabelFormat
 from raitap.types import Hardware, TaskKind
 
 if TYPE_CHECKING:
@@ -87,6 +87,10 @@ class LabelsConfig:
     #                     (supports nested ImageFolder layouts with colliding stems).
     #   "stem"          — flat-dir / basename matching: match by ``Path(id).stem`` only.
     id_strategy: IdStrategy = IdStrategy.auto
+    # External label file format. ``native`` (default) reads RAITAP's own
+    # shape. ``coco`` / ``yolo`` / ``voc`` are converted to the native
+    # intermediate before alignment. Requires id-based alignment (sample_ids).
+    format: LabelFormat = LabelFormat.native
 
 
 @dataclass
diff --git a/src/raitap/data/tests/test_label_formats.py b/src/raitap/data/tests/test_label_formats.py
new file mode 100644
index 00000000..b65742e0
--- /dev/null
+++ b/src/raitap/data/tests/test_label_formats.py
@@ -0,0 +1,11 @@
+from raitap.data.types import LabelFormat
+from raitap.configs.schema import LabelsConfig
+
+
+def test_label_format_members_are_string_values():
+    assert LabelFormat.native == "native"
+    assert {f.value for f in LabelFormat} == {"native", "coco", "yolo", "voc"}
+
+
+def test_labels_config_defaults_to_native_format():
+    assert LabelsConfig().format is LabelFormat.native
diff --git a/src/raitap/data/types.py b/src/raitap/data/types.py
index 2defb94a..fc114554 100644
--- a/src/raitap/data/types.py
+++ b/src/raitap/data/types.py
@@ -33,6 +33,22 @@ class IdStrategy(StrEnum):
     stem = "stem"
 
 
+class LabelFormat(StrEnum):
+    """On-disk label file format selected by ``LabelsConfig.format``.
+
+    ``native`` is RAITAP's own shape (classification: CSV/TSV/Parquet or the
+    ``directory`` source; detection: the JSON record list). The others are
+    converted to the native intermediate by a registered
+    :class:`~raitap.data.label_formats.LabelFormatAdapter` before the task
+    family aligns them. StrEnum so YAML users can write the raw value.
+    """
+
+    native = "native"
+    coco = "coco"
+    yolo = "yolo"
+    voc = "voc"
+
+
 #: Reserved ``LabelsConfig.source`` value selecting folder-as-label ingestion:
 #: classification labels are derived from each sample's top-level class
 #: subdirectory (torchvision ``ImageFolder`` style; no labels file). Kept as a

From d916f4de034384b79d99de1506dc3655ad787b0f Mon Sep 17 00:00:00 2001
From: Stanislas Laurent <stnsls.lrt.accnts@gmail.com>
Date: Tue, 23 Jun 2026 23:20:59 +0200
Subject: [PATCH 02/28] refactor(model): extract _align_detection_records from
 detection loader (refs #338)

---
 src/raitap/task_families/detection.py | 171 ++++++++++++++------------
 1 file changed, 93 insertions(+), 78 deletions(-)

diff --git a/src/raitap/task_families/detection.py b/src/raitap/task_families/detection.py
index 5141992c..15614e64 100644
--- a/src/raitap/task_families/detection.py
+++ b/src/raitap/task_families/detection.py
@@ -19,6 +19,90 @@
     from raitap.task_families.base import ExplainContext, ForwardContext
 
 
+def _align_detection_records(
+    records: list[dict[str, Any]],
+    *,
+    expected: int,
+    sample_ids: Any,
+) -> list[dict[str, "Any"]]:
+    """Align native detection records to ``sample_ids`` and build tensors.
+
+    Extracted from ``DetectionFamily.load_labels`` so label-format adapters can
+    feed converted records through the same alignment + validation path.
+    """
+    import torch
+
+    if sample_ids is not None:
+        by_id: dict[str, dict[str, Any]] = {}
+        for index, record in enumerate(records):
+            record_id = record.get("sample_id") if isinstance(record, dict) else None
+            if record_id is None:
+                raise ValueError(
+                    f"Detection labels record {index} is missing 'sample_id' "
+                    "(required when the dataset exposes sample_ids)."
+                )
+            if record_id in by_id:
+                raise ValueError(
+                    f"Detection labels file contains duplicate sample_id {record_id!r}."
+                )
+            by_id[record_id] = record
+        ordered_records = []
+        missing: list[str] = []
+        for sample_id in sample_ids:
+            record = by_id.get(sample_id)
+            if record is None:
+                missing.append(sample_id)
+            else:
+                ordered_records.append(record)
+        if missing:
+            raise ValueError(
+                f"Detection labels file is missing entries for sample_ids: {missing!r}."
+            )
+        records_iter: list[dict[str, Any]] = ordered_records
+    else:
+        if len(records) != expected:
+            raise ValueError(
+                f"Detection labels file has {len(records)} records but the "
+                f"dataset has {expected} samples; provide sample_id fields and "
+                "set data.labels.source so records can be aligned by id, or "
+                "match the record count to the sample count."
+            )
+        records_iter = records
+
+    out: list[dict[str, torch.Tensor]] = []
+    for index, record in enumerate(records_iter):
+        boxes_raw = record.get("boxes", [])
+        labels_raw = record.get("labels", [])
+        if len(boxes_raw) != len(labels_raw):
+            raise ValueError(
+                f"Sample index {index}: boxes and labels must have matching "
+                f"length (got {len(boxes_raw)} boxes vs {len(labels_raw)} labels)."
+            )
+        boxes_tensor = (
+            torch.tensor(boxes_raw, dtype=torch.float32)
+            if boxes_raw
+            else torch.zeros((0, 4), dtype=torch.float32)
+        )
+        labels_tensor = (
+            torch.tensor(labels_raw, dtype=torch.int64)
+            if labels_raw
+            else torch.zeros((0,), dtype=torch.int64)
+        )
+        if boxes_tensor.ndim != 2 or boxes_tensor.shape[1] != 4:
+            raise ValueError(
+                f"Sample index {index}: boxes must be shape (M_i, 4); got "
+                f"{tuple(boxes_tensor.shape)}."
+            )
+        out.append({"boxes": boxes_tensor, "labels": labels_tensor})
+
+    if len(out) != expected:
+        raise ValueError(
+            f"Detection labels alignment produced {len(out)} entries but the "
+            f"dataset has {expected} samples."
+        )
+    return out
+
+
 @task_family
 class DetectionFamily:
     kind: TaskKind = TaskKind.detection
@@ -92,96 +176,27 @@ def load_labels(self, cfg: Any, *, tensor: Any, sample_ids: Any) -> Any:
         """
         import json
 
-        import torch
-
-        from raitap.data.data import SourceKind, _get_optional_config_value, get_source_path
+        from raitap.data.data import (
+            SourceKind,
+            _get_optional_config_value,
+            get_source_path,
+        )
 
         labels_cfg = _get_optional_config_value(cfg.data, "labels")
         labels_source = _get_optional_config_value(labels_cfg, "source")
         if not labels_source:
             return None
 
-        # ``get_source_path`` raises ValueError if the source can't be resolved
-        # or returns an existing path; no separate existence check needed.
         labels_path = get_source_path(labels_source, kind=SourceKind.LABELS)
-
         with labels_path.open() as fh:
             records = json.load(fh)
         if not isinstance(records, list):
-            raise ValueError(f"Detection labels file {labels_path} must be a JSON array.")
-
-        expected = len(tensor)
-
-        if sample_ids is not None:
-            by_id: dict[str, dict[str, Any]] = {}
-            for index, record in enumerate(records):
-                record_id = record.get("sample_id") if isinstance(record, dict) else None
-                if record_id is None:
-                    raise ValueError(
-                        f"Detection labels record {index} is missing 'sample_id' "
-                        "(required when the dataset exposes sample_ids)."
-                    )
-                if record_id in by_id:
-                    raise ValueError(
-                        f"Detection labels file contains duplicate sample_id {record_id!r}."
-                    )
-                by_id[record_id] = record
-            ordered_records = []
-            missing: list[str] = []
-            for sample_id in sample_ids:
-                record = by_id.get(sample_id)
-                if record is None:
-                    missing.append(sample_id)
-                else:
-                    ordered_records.append(record)
-            if missing:
-                raise ValueError(
-                    f"Detection labels file is missing entries for sample_ids: {missing!r}."
-                )
-            records_iter: list[dict[str, Any]] = ordered_records
-        else:
-            if len(records) != expected:
-                raise ValueError(
-                    f"Detection labels file has {len(records)} records but the "
-                    f"dataset has {expected} samples; provide sample_id fields and "
-                    "set data.labels.source so records can be aligned by id, or "
-                    "match the record count to the sample count."
-                )
-            records_iter = records
-
-        out: list[dict[str, torch.Tensor]] = []
-        for index, record in enumerate(records_iter):
-            boxes_raw = record.get("boxes", [])
-            labels_raw = record.get("labels", [])
-            if len(boxes_raw) != len(labels_raw):
-                raise ValueError(
-                    f"Sample index {index}: boxes and labels must have matching "
-                    f"length (got {len(boxes_raw)} boxes vs {len(labels_raw)} labels)."
-                )
-            boxes_tensor = (
-                torch.tensor(boxes_raw, dtype=torch.float32)
-                if boxes_raw
-                else torch.zeros((0, 4), dtype=torch.float32)
-            )
-            labels_tensor = (
-                torch.tensor(labels_raw, dtype=torch.int64)
-                if labels_raw
-                else torch.zeros((0,), dtype=torch.int64)
-            )
-            if boxes_tensor.ndim != 2 or boxes_tensor.shape[1] != 4:
-                raise ValueError(
-                    f"Sample index {index}: boxes must be shape (M_i, 4); got "
-                    f"{tuple(boxes_tensor.shape)}."
-                )
-            out.append({"boxes": boxes_tensor, "labels": labels_tensor})
-
-        if len(out) != expected:
             raise ValueError(
-                f"Detection labels alignment produced {len(out)} entries but the "
-                f"dataset has {expected} samples."
+                f"Detection labels file {labels_path} must be a JSON array."
             )
-
-        return out
+        return _align_detection_records(
+            records, expected=len(tensor), sample_ids=sample_ids
+        )
 
     def validate_labels(self, labels: Any) -> None:
         # The detection loader returns ``list[dict]`` or ``None``. A bare tensor

From b9fa7bdea8729181b87712145ca59a100c44c066 Mon Sep 17 00:00:00 2001
From: Stanislas Laurent <stnsls.lrt.accnts@gmail.com>
Date: Tue, 23 Jun 2026 23:25:46 +0200
Subject: [PATCH 03/28] feat(data): add label-format adapter protocol and
 registry (refs #338)

---
 src/raitap/data/__init__.py                 | 10 ++-
 src/raitap/data/_label_format_adapters.py   |  9 +++
 src/raitap/data/adapters/__init__.py        |  1 +
 src/raitap/data/adapters/coco.py            |  0
 src/raitap/data/adapters/voc.py             |  0
 src/raitap/data/adapters/yolo.py            |  0
 src/raitap/data/label_formats.py            | 84 +++++++++++++++++++++
 src/raitap/data/tests/test_label_formats.py | 39 ++++++++++
 8 files changed, 142 insertions(+), 1 deletion(-)
 create mode 100644 src/raitap/data/_label_format_adapters.py
 create mode 100644 src/raitap/data/adapters/__init__.py
 create mode 100644 src/raitap/data/adapters/coco.py
 create mode 100644 src/raitap/data/adapters/voc.py
 create mode 100644 src/raitap/data/adapters/yolo.py
 create mode 100644 src/raitap/data/label_formats.py

diff --git a/src/raitap/data/__init__.py b/src/raitap/data/__init__.py
index 2c5aa3e0..a363f644 100644
--- a/src/raitap/data/__init__.py
+++ b/src/raitap/data/__init__.py
@@ -13,7 +13,7 @@
 
 from typing import TYPE_CHECKING, Any
 
-from .types import DIRECTORY_LABELS_SOURCE, IdStrategy, LabelEncoding, Preprocessing
+from .types import DIRECTORY_LABELS_SOURCE, IdStrategy, LabelEncoding, LabelFormat, Preprocessing
 
 if TYPE_CHECKING:
     from raitap.configs.schema import DataConfig, LabelsConfig
@@ -36,6 +36,8 @@
     "DataPreprocessingFactory",
     "IdStrategy",
     "LabelEncoding",
+    "LabelFormat",
+    "LabelFormatAdapter",
     "LabelsConfig",
     "ModelInputTransformationFactory",
     "Preprocessing",
@@ -44,6 +46,7 @@
     "load_tensor_from_source",
     "raitap_model_input_transformation_factory",
     "raitap_preprocessing_factory",
+    "resolve_label_format_adapter",
 ]
 
 
@@ -69,6 +72,11 @@
         "raitap.data.preprocessing",
         "raitap_preprocessing_factory",
     ),
+    "LabelFormatAdapter": ("raitap.data.label_formats", "LabelFormatAdapter"),
+    "resolve_label_format_adapter": (
+        "raitap.data.label_formats",
+        "resolve_label_format_adapter",
+    ),
 }
 
 
diff --git a/src/raitap/data/_label_format_adapters.py b/src/raitap/data/_label_format_adapters.py
new file mode 100644
index 00000000..d1267e18
--- /dev/null
+++ b/src/raitap/data/_label_format_adapters.py
@@ -0,0 +1,9 @@
+"""Imports every in-tree label-format adapter so the decorators fire.
+
+Imported for its side effects by
+``raitap.data.label_formats.resolve_label_format_adapter``.
+"""
+
+from __future__ import annotations
+
+from raitap.data.adapters import coco, voc, yolo  # noqa: F401
diff --git a/src/raitap/data/adapters/__init__.py b/src/raitap/data/adapters/__init__.py
new file mode 100644
index 00000000..4b68f1da
--- /dev/null
+++ b/src/raitap/data/adapters/__init__.py
@@ -0,0 +1 @@
+"""Built-in label-format adapters (issue #338)."""
diff --git a/src/raitap/data/adapters/coco.py b/src/raitap/data/adapters/coco.py
new file mode 100644
index 00000000..e69de29b
diff --git a/src/raitap/data/adapters/voc.py b/src/raitap/data/adapters/voc.py
new file mode 100644
index 00000000..e69de29b
diff --git a/src/raitap/data/adapters/yolo.py b/src/raitap/data/adapters/yolo.py
new file mode 100644
index 00000000..e69de29b
diff --git a/src/raitap/data/label_formats.py b/src/raitap/data/label_formats.py
new file mode 100644
index 00000000..11ae1af3
--- /dev/null
+++ b/src/raitap/data/label_formats.py
@@ -0,0 +1,84 @@
+"""Pluggable label-format adapters (issue #338).
+
+Each adapter converts an external annotation file (COCO / YOLO / VOC) into
+RAITAP's native intermediate record list, which the task-family loaders then
+align to ``sample_ids`` with their existing logic. Registry mirrors
+``raitap.task_families.registry``: a decorator registers one singleton per
+``LabelFormat``.
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, Protocol, TypeVar, runtime_checkable
+
+from raitap.data.types import LabelFormat
+
+if TYPE_CHECKING:
+    from raitap.types import TaskKind
+
+#: Native intermediate record shapes (match the on-disk native formats).
+DetectionRecord = dict[str, Any]
+ClassificationRecord = dict[str, Any]
+
+
+@runtime_checkable
+class LabelFormatAdapter(Protocol):
+    """Converts an external label file to native intermediate records."""
+
+    format: LabelFormat
+    supported_tasks: frozenset[TaskKind]
+
+    def to_detection_records(
+        self,
+        source: Path,
+        *,
+        image_dir: Path | None,
+        class_names: list[str] | None,
+    ) -> list[DetectionRecord]:
+        """Return ``[{sample_id, boxes (xyxy), labels}]``. Raise if unsupported."""
+        ...
+
+    def to_classification_records(self, source: Path) -> list[ClassificationRecord]:
+        """Return ``[{sample_id, label}]``. Raise if unsupported."""
+        ...
+
+
+#: format -> the adapter singleton serving it.
+LABEL_FORMAT_ADAPTERS: dict[LabelFormat, LabelFormatAdapter] = {}
+
+T = TypeVar("T")
+
+
+def label_format(cls: type[T]) -> type[T]:
+    """Register ``cls`` (instantiated once) under its ``format`` class attribute."""
+    instance = cls()  # type: ignore[call-arg]
+    LABEL_FORMAT_ADAPTERS[instance.format] = instance  # type: ignore[attr-defined]
+    return cls
+
+
+def resolve_label_format_adapter(
+    fmt: LabelFormat, *, task_kind: TaskKind
+) -> LabelFormatAdapter:
+    """Return the adapter for ``fmt`` that supports ``task_kind``.
+
+    Raises ``ValueError`` when no adapter is registered for ``fmt`` (e.g.
+    ``native``, which the caller should special-case) or the adapter does not
+    declare ``task_kind`` in ``supported_tasks``.
+    """
+    # Import side-effect: register the in-tree adapters on first use.
+    from raitap.data import _label_format_adapters  # noqa: F401
+
+    adapter = LABEL_FORMAT_ADAPTERS.get(fmt)
+    if adapter is None:
+        raise ValueError(
+            f"No adapter registered for label format {fmt.value!r}; "
+            f"registered: {sorted(f.value for f in LABEL_FORMAT_ADAPTERS)}."
+        )
+    if task_kind not in adapter.supported_tasks:
+        supported = sorted(t.value for t in adapter.supported_tasks)
+        raise ValueError(
+            f"Label format {fmt.value!r} does not support task {task_kind.value!r}; "
+            f"supported tasks: {supported}."
+        )
+    return adapter
diff --git a/src/raitap/data/tests/test_label_formats.py b/src/raitap/data/tests/test_label_formats.py
index b65742e0..3bd65616 100644
--- a/src/raitap/data/tests/test_label_formats.py
+++ b/src/raitap/data/tests/test_label_formats.py
@@ -1,5 +1,12 @@
+import pytest
 from raitap.data.types import LabelFormat
 from raitap.configs.schema import LabelsConfig
+from raitap.data.label_formats import (
+    LABEL_FORMAT_ADAPTERS,
+    label_format,
+    resolve_label_format_adapter,
+)
+from raitap.types import TaskKind
 
 
 def test_label_format_members_are_string_values():
@@ -9,3 +16,35 @@ def test_label_format_members_are_string_values():
 
 def test_labels_config_defaults_to_native_format():
     assert LabelsConfig().format is LabelFormat.native
+
+
+def test_label_format_decorator_registers_instance():
+    saved = dict(LABEL_FORMAT_ADAPTERS)  # snapshot: a real coco adapter may be registered
+    try:
+        @label_format
+        class _Dummy:
+            format = LabelFormat.coco  # reuse an enum member; registry restored below
+            supported_tasks = frozenset({TaskKind.detection})
+
+        assert isinstance(LABEL_FORMAT_ADAPTERS[LabelFormat.coco], _Dummy)
+    finally:
+        LABEL_FORMAT_ADAPTERS.clear()
+        LABEL_FORMAT_ADAPTERS.update(saved)
+
+
+def test_registry_rejects_unknown_native():
+    with pytest.raises(ValueError, match="No adapter"):
+        resolve_label_format_adapter(LabelFormat.native, task_kind=TaskKind.detection)
+
+
+@pytest.mark.xfail(reason="adapter added in task 4/5", strict=False)
+def test_registry_resolves_supported_task():
+    adapter = resolve_label_format_adapter(LabelFormat.coco, task_kind=TaskKind.detection)
+    assert adapter.format is LabelFormat.coco
+    assert TaskKind.detection in adapter.supported_tasks
+
+
+@pytest.mark.xfail(reason="adapter added in task 4/5", strict=False)
+def test_registry_rejects_unsupported_task():
+    with pytest.raises(ValueError, match="does not support task"):
+        resolve_label_format_adapter(LabelFormat.yolo, task_kind=TaskKind.classification)

From 509040cfce18276fd8b485ed990b5ddcf14d85f7 Mon Sep 17 00:00:00 2001
From: Stanislas Laurent <stnsls.lrt.accnts@gmail.com>
Date: Tue, 23 Jun 2026 23:31:33 +0200
Subject: [PATCH 04/28] feat(data): add COCO label-format adapter (refs #338)

---
 src/raitap/data/adapters/coco.py            | 78 +++++++++++++++++++++
 src/raitap/data/tests/test_label_formats.py | 76 +++++++++++++++++++-
 2 files changed, 152 insertions(+), 2 deletions(-)

diff --git a/src/raitap/data/adapters/coco.py b/src/raitap/data/adapters/coco.py
index e69de29b..7fb5b99f 100644
--- a/src/raitap/data/adapters/coco.py
+++ b/src/raitap/data/adapters/coco.py
@@ -0,0 +1,78 @@
+"""COCO label-format adapter (issue #338)."""
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+from typing import Any
+
+from raitap.data.label_formats import (
+    ClassificationRecord,
+    DetectionRecord,
+    label_format,
+)
+from raitap.data.types import LabelFormat
+from raitap.types import TaskKind
+
+
+@label_format
+class CocoAdapter:
+    """COCO ``instances.json`` -> native records.
+
+    Detection: ``bbox`` is ``[x, y, w, h]`` -> ``[x1, y1, x2, y2]``;
+    ``category_id`` passes through unchanged so labels stay in the model's
+    label space. Classification: one label per image; images with 0 or >1
+    categories raise. Annotations whose ``image_id`` is not in ``images`` raise.
+    """
+
+    format = LabelFormat.coco
+    supported_tasks = frozenset({TaskKind.detection, TaskKind.classification})
+
+    def _load(self, source: Path) -> dict[str, Any]:
+        with source.open(encoding="utf-8") as fh:  # JSON is UTF-8, not locale text
+            data = json.load(fh)
+        if not isinstance(data, dict) or "images" not in data:
+            raise ValueError(f"COCO file {source} must be an object with an 'images' array.")
+        return data
+
+    def _image_id(self, ann: dict[str, Any], known: dict[int, str]) -> int:
+        """Return ``ann``'s image id, rejecting ids absent from ``images``."""
+        if (iid := ann["image_id"]) not in known:
+            raise ValueError(f"COCO annotation references unknown image_id {iid!r}.")
+        return iid
+
+    def to_detection_records(
+        self, source: Path, *, image_dir: Path | None, class_names: list[str] | None
+    ) -> list[DetectionRecord]:
+        """Return one ``{sample_id, boxes (xyxy), labels}`` record per listed image."""
+        data = self._load(source)
+        file_by_image = {img["id"]: img["file_name"] for img in data["images"]}
+        boxes: dict[int, list[list[float]]] = {iid: [] for iid in file_by_image}
+        labels: dict[int, list[int]] = {iid: [] for iid in file_by_image}
+        for ann in data.get("annotations", []):
+            iid = self._image_id(ann, file_by_image)
+            x, y, w, h = ann["bbox"]
+            boxes[iid].append([x, y, x + w, y + h])
+            labels[iid].append(int(ann["category_id"]))
+        return [
+            {"sample_id": file_by_image[iid], "boxes": boxes[iid], "labels": labels[iid]}
+            for iid in file_by_image
+        ]
+
+    def to_classification_records(self, source: Path) -> list[ClassificationRecord]:
+        """Return one ``{sample_id, label}`` record per listed image."""
+        data = self._load(source)
+        file_by_image = {img["id"]: img["file_name"] for img in data["images"]}
+        cats: dict[int, set[int]] = {iid: set() for iid in file_by_image}
+        for ann in data.get("annotations", []):
+            cats[self._image_id(ann, file_by_image)].add(int(ann["category_id"]))
+        records: list[ClassificationRecord] = []
+        for iid, name in file_by_image.items():
+            cat_set = cats[iid]
+            if len(cat_set) != 1:
+                raise ValueError(
+                    f"COCO classification needs exactly one category per image; "
+                    f"image {name!r} has {len(cat_set)}."
+                )
+            records.append({"sample_id": name, "label": next(iter(cat_set))})
+        return records
diff --git a/src/raitap/data/tests/test_label_formats.py b/src/raitap/data/tests/test_label_formats.py
index 3bd65616..a88d9304 100644
--- a/src/raitap/data/tests/test_label_formats.py
+++ b/src/raitap/data/tests/test_label_formats.py
@@ -37,14 +37,86 @@ def test_registry_rejects_unknown_native():
         resolve_label_format_adapter(LabelFormat.native, task_kind=TaskKind.detection)
 
 
-@pytest.mark.xfail(reason="adapter added in task 4/5", strict=False)
 def test_registry_resolves_supported_task():
     adapter = resolve_label_format_adapter(LabelFormat.coco, task_kind=TaskKind.detection)
     assert adapter.format is LabelFormat.coco
     assert TaskKind.detection in adapter.supported_tasks
 
 
-@pytest.mark.xfail(reason="adapter added in task 4/5", strict=False)
+@pytest.mark.xfail(reason="adapter added in task 5 (yolo)", strict=False)
 def test_registry_rejects_unsupported_task():
     with pytest.raises(ValueError, match="does not support task"):
         resolve_label_format_adapter(LabelFormat.yolo, task_kind=TaskKind.classification)
+
+
+def test_coco_detection_records(tmp_path):
+    import json
+    from raitap.data.adapters.coco import CocoAdapter
+
+    coco = {
+        "images": [
+            {"id": 1, "file_name": "a.jpg"},
+            {"id": 2, "file_name": "b.jpg"},
+        ],
+        "annotations": [
+            {"image_id": 1, "category_id": 3, "bbox": [10, 20, 30, 40]},
+            {"image_id": 1, "category_id": 5, "bbox": [0, 0, 5, 5]},
+        ],
+        "categories": [{"id": 3, "name": "car"}, {"id": 5, "name": "dog"}],
+    }
+    p = tmp_path / "instances.json"
+    p.write_text(json.dumps(coco))
+
+    records = CocoAdapter().to_detection_records(p, image_dir=None, class_names=None)
+    by_id = {r["sample_id"]: r for r in records}
+    assert by_id["a.jpg"]["boxes"] == [[10, 20, 40, 60], [0, 0, 5, 5]]
+    assert by_id["a.jpg"]["labels"] == [3, 5]
+    assert by_id["b.jpg"] == {"sample_id": "b.jpg", "boxes": [], "labels": []}
+
+
+def test_coco_classification_records(tmp_path):
+    import json
+    from raitap.data.adapters.coco import CocoAdapter
+
+    coco = {
+        "images": [{"id": 1, "file_name": "a.jpg"}],
+        "annotations": [{"image_id": 1, "category_id": 7, "bbox": [0, 0, 1, 1]}],
+        "categories": [{"id": 7, "name": "cat"}],
+    }
+    p = tmp_path / "c.json"
+    p.write_text(json.dumps(coco))
+    records = CocoAdapter().to_classification_records(p)
+    assert records == [{"sample_id": "a.jpg", "label": 7}]
+
+
+def test_coco_classification_rejects_zero_categories(tmp_path):
+    import json
+    from raitap.data.adapters.coco import CocoAdapter
+
+    coco = {
+        "images": [{"id": 1, "file_name": "a.jpg"}],
+        "annotations": [],
+        "categories": [{"id": 7, "name": "cat"}],
+    }
+    p = tmp_path / "zero.json"
+    p.write_text(json.dumps(coco))
+    with pytest.raises(ValueError, match="exactly one category per image"):
+        CocoAdapter().to_classification_records(p)
+
+
+def test_coco_classification_rejects_multiple_categories(tmp_path):
+    import json
+    from raitap.data.adapters.coco import CocoAdapter
+
+    coco = {
+        "images": [{"id": 1, "file_name": "a.jpg"}],
+        "annotations": [
+            {"image_id": 1, "category_id": 3, "bbox": [0, 0, 1, 1]},
+            {"image_id": 1, "category_id": 5, "bbox": [0, 0, 1, 1]},
+        ],
+        "categories": [{"id": 3, "name": "car"}, {"id": 5, "name": "dog"}],
+    }
+    p = tmp_path / "multi.json"
+    p.write_text(json.dumps(coco))
+    with pytest.raises(ValueError, match="exactly one category per image"):
+        CocoAdapter().to_classification_records(p)

From 46377485c9b298b6c5d12da3e0d4786052b54176 Mon Sep 17 00:00:00 2001
From: Stanislas Laurent <stnsls.lrt.accnts@gmail.com>
Date: Tue, 23 Jun 2026 23:35:48 +0200
Subject: [PATCH 05/28] feat(data): add YOLO label-format adapter (refs #338)

---
 src/raitap/data/adapters/yolo.py            | 74 +++++++++++++++++++++
 src/raitap/data/tests/test_label_formats.py | 26 +++++++-
 2 files changed, 99 insertions(+), 1 deletion(-)

diff --git a/src/raitap/data/adapters/yolo.py b/src/raitap/data/adapters/yolo.py
index e69de29b..fac219fa 100644
--- a/src/raitap/data/adapters/yolo.py
+++ b/src/raitap/data/adapters/yolo.py
@@ -0,0 +1,74 @@
+"""YOLO label-format adapter (issue #338)."""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+from PIL import Image
+
+from raitap.data.label_formats import (
+    ClassificationRecord,
+    DetectionRecord,
+    label_format,
+)
+from raitap.data.types import LabelFormat
+from raitap.types import TaskKind
+
+_IMAGE_SUFFIXES = (".jpg", ".jpeg", ".png", ".bmp", ".webp")
+
+
+@label_format
+class YoloAdapter:
+    """YOLO per-image ``.txt`` (``class cx cy w h``, normalised) -> native records.
+
+    Boxes are denormalised with each image's pixel size, read from
+    ``image_dir``. Class indices pass through unchanged.
+    """
+
+    format = LabelFormat.yolo
+    supported_tasks = frozenset({TaskKind.detection})
+
+    def _image_for(self, image_dir: Path, stem: str) -> Path:
+        """Return the first ``image_dir/<stem><suffix>`` that exists, else raise."""
+        for suffix in _IMAGE_SUFFIXES:
+            candidate = image_dir / f"{stem}{suffix}"
+            if candidate.exists():
+                return candidate
+        raise ValueError(f"YOLO adapter found no image for label {stem!r} in {image_dir}.")
+
+    def to_detection_records(
+        self, source: Path, *, image_dir: Path | None, class_names: list[str] | None
+    ) -> list[DetectionRecord]:
+        """Return one pixel-space xyxy record per ``*.txt`` file in ``source``."""
+        if image_dir is None:
+            raise ValueError(
+                "YOLO labels need image_dir to denormalise boxes; "
+                "set data.source to the image directory."
+            )
+        records: list[DetectionRecord] = []
+        for txt in sorted(source.glob("*.txt")):
+            image_path = self._image_for(image_dir, txt.stem)
+            with Image.open(image_path) as im:
+                width, height = im.size
+            boxes: list[list[float]] = []
+            labels: list[int] = []
+            for lineno, line in enumerate(txt.read_text().splitlines(), start=1):
+                parts = line.split()
+                if not parts:
+                    continue
+                if len(parts) < 5:
+                    raise ValueError(
+                        f"{txt}:{lineno}: expected 'class cx cy w h', got {line!r}."
+                    )
+                cls, cx, cy, bw, bh = (float(p) for p in parts[:5])
+                x1 = (cx - bw / 2) * width
+                y1 = (cy - bh / 2) * height
+                x2 = (cx + bw / 2) * width
+                y2 = (cy + bh / 2) * height
+                boxes.append([x1, y1, x2, y2])
+                labels.append(int(cls))
+            records.append({"sample_id": image_path.name, "boxes": boxes, "labels": labels})
+        return records
+
+    def to_classification_records(self, source: Path) -> list[ClassificationRecord]:
+        raise ValueError("YOLO is a detection-only format.")
diff --git a/src/raitap/data/tests/test_label_formats.py b/src/raitap/data/tests/test_label_formats.py
index a88d9304..3f4774cb 100644
--- a/src/raitap/data/tests/test_label_formats.py
+++ b/src/raitap/data/tests/test_label_formats.py
@@ -43,7 +43,6 @@ def test_registry_resolves_supported_task():
     assert TaskKind.detection in adapter.supported_tasks
 
 
-@pytest.mark.xfail(reason="adapter added in task 5 (yolo)", strict=False)
 def test_registry_rejects_unsupported_task():
     with pytest.raises(ValueError, match="does not support task"):
         resolve_label_format_adapter(LabelFormat.yolo, task_kind=TaskKind.classification)
@@ -120,3 +119,28 @@ def test_coco_classification_rejects_multiple_categories(tmp_path):
     p.write_text(json.dumps(coco))
     with pytest.raises(ValueError, match="exactly one category per image"):
         CocoAdapter().to_classification_records(p)
+
+
+def test_yolo_detection_records(tmp_path):
+    from PIL import Image
+    from raitap.data.adapters.yolo import YoloAdapter
+
+    image_dir = tmp_path / "images"
+    image_dir.mkdir()
+    Image.new("RGB", (100, 200)).save(image_dir / "a.jpg")  # w=100, h=200
+
+    label_dir = tmp_path / "labels"
+    label_dir.mkdir()
+    # class=2, cx=0.5 cy=0.5 w=0.2 h=0.1  -> center (50,100), box 20x20px
+    (label_dir / "a.txt").write_text("2 0.5 0.5 0.2 0.1\n")
+
+    records = YoloAdapter().to_detection_records(
+        label_dir, image_dir=image_dir, class_names=None
+    )
+    assert len(records) == 1
+    rec = records[0]
+    assert rec["sample_id"] == "a.jpg"
+    assert rec["labels"] == [2]
+    # x1 = (0.5-0.1)*100=40, y1=(0.5-0.05)*200=90, x2=60, y2=110
+    assert len(rec["boxes"]) == 1
+    assert rec["boxes"][0] == pytest.approx([40.0, 90.0, 60.0, 110.0])

From ff416640cb09922a6a2cf2eaceec13a6cff45bac Mon Sep 17 00:00:00 2001
From: Stanislas Laurent <stnsls.lrt.accnts@gmail.com>
Date: Tue, 23 Jun 2026 23:39:03 +0200
Subject: [PATCH 06/28] feat(data): add Pascal-VOC label-format adapter (refs
 #338)

---
 src/raitap/data/adapters/voc.py             | 76 +++++++++++++++++++++
 src/raitap/data/tests/test_label_formats.py | 21 ++++++
 2 files changed, 97 insertions(+)

diff --git a/src/raitap/data/adapters/voc.py b/src/raitap/data/adapters/voc.py
index e69de29b..98fb11bd 100644
--- a/src/raitap/data/adapters/voc.py
+++ b/src/raitap/data/adapters/voc.py
@@ -0,0 +1,76 @@
+"""Pascal-VOC label-format adapter (issue #338)."""
+
+from __future__ import annotations
+
+import xml.etree.ElementTree as ET
+from pathlib import Path
+
+from raitap.data.label_formats import (
+    ClassificationRecord,
+    DetectionRecord,
+    label_format,
+)
+from raitap.data.types import LabelFormat
+from raitap.types import TaskKind
+
+#: Canonical Pascal-VOC class order (index = label id) when no class_names given.
+_VOC_CLASSES = (
+    "aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat",
+    "chair", "cow", "diningtable", "dog", "horse", "motorbike", "person",
+    "pottedplant", "sheep", "sofa", "train", "tvmonitor",
+)
+
+
+@label_format
+class VocAdapter:
+    """Pascal-VOC per-image ``.xml`` -> native detection records.
+
+    Boxes are already ``[xmin, ymin, xmax, ymax]`` pixels. Class names map to
+    ids by their position in ``class_names`` (else the standard 20-class VOC
+    order).
+    """
+
+    format = LabelFormat.voc
+    supported_tasks = frozenset({TaskKind.detection})
+
+    def to_detection_records(
+        self, source: Path, *, image_dir: Path | None, class_names: list[str] | None
+    ) -> list[DetectionRecord]:
+        name_to_id = {
+            name: idx
+            for idx, name in enumerate(class_names if class_names else _VOC_CLASSES)
+        }
+        records: list[DetectionRecord] = []
+        for xml_path in sorted(source.glob("*.xml")):
+            root = ET.parse(xml_path).getroot()
+            filename_el = root.find("filename")
+            if filename_el is None or not filename_el.text:
+                raise ValueError(f"VOC file {xml_path} has no <filename>.")
+            boxes: list[list[float]] = []
+            labels: list[int] = []
+            for obj in root.findall("object"):
+                name = obj.findtext("name")
+                if name not in name_to_id:
+                    raise ValueError(
+                        f"VOC class {name!r} in {xml_path.name} is not in the "
+                        f"class list {sorted(name_to_id)}."
+                    )
+                box = obj.find("bndbox")
+                boxes.append(
+                    [
+                        float(box.findtext("xmin")),
+                        float(box.findtext("ymin")),
+                        float(box.findtext("xmax")),
+                        float(box.findtext("ymax")),
+                    ]
+                )
+                labels.append(name_to_id[name])
+            records.append(
+                {"sample_id": filename_el.text, "boxes": boxes, "labels": labels}
+            )
+        return records
+
+    def to_classification_records(
+        self, source: Path
+    ) -> list[ClassificationRecord]:
+        raise ValueError("VOC is a detection-only format.")
diff --git a/src/raitap/data/tests/test_label_formats.py b/src/raitap/data/tests/test_label_formats.py
index 3f4774cb..25aa55e0 100644
--- a/src/raitap/data/tests/test_label_formats.py
+++ b/src/raitap/data/tests/test_label_formats.py
@@ -144,3 +144,24 @@ def test_yolo_detection_records(tmp_path):
     # x1 = (0.5-0.1)*100=40, y1=(0.5-0.05)*200=90, x2=60, y2=110
     assert len(rec["boxes"]) == 1
     assert rec["boxes"][0] == pytest.approx([40.0, 90.0, 60.0, 110.0])
+
+
+def test_voc_detection_records(tmp_path):
+    from raitap.data.adapters.voc import VocAdapter
+
+    xml = """<annotation>
+      <filename>a.jpg</filename>
+      <object><name>person</name>
+        <bndbox><xmin>10</xmin><ymin>20</ymin><xmax>30</xmax><ymax>40</ymax></bndbox>
+      </object>
+    </annotation>"""
+    d = tmp_path / "ann"
+    d.mkdir()
+    (d / "a.xml").write_text(xml)
+
+    records = VocAdapter().to_detection_records(
+        d, image_dir=None, class_names=["background", "person", "car"]
+    )
+    assert records == [
+        {"sample_id": "a.jpg", "boxes": [[10.0, 20.0, 30.0, 40.0]], "labels": [1]}
+    ]

From c248cc016874a1971722b61f4379d1f7b9e2b578 Mon Sep 17 00:00:00 2001
From: Stanislas Laurent <stnsls.lrt.accnts@gmail.com>
Date: Tue, 23 Jun 2026 23:43:49 +0200
Subject: [PATCH 07/28] feat(model): dispatch detection labels on
 data.labels.format (refs #338)

---
 src/raitap/data/tests/test_label_formats.py | 33 ++++++++++++++++++
 src/raitap/task_families/detection.py       | 37 +++++++++++++++++----
 2 files changed, 64 insertions(+), 6 deletions(-)

diff --git a/src/raitap/data/tests/test_label_formats.py b/src/raitap/data/tests/test_label_formats.py
index 25aa55e0..991b1234 100644
--- a/src/raitap/data/tests/test_label_formats.py
+++ b/src/raitap/data/tests/test_label_formats.py
@@ -165,3 +165,36 @@ def test_voc_detection_records(tmp_path):
     assert records == [
         {"sample_id": "a.jpg", "boxes": [[10.0, 20.0, 30.0, 40.0]], "labels": [1]}
     ]
+
+
+def test_detection_load_labels_via_coco(tmp_path, monkeypatch):
+    import json
+    import torch
+    from types import SimpleNamespace
+    from raitap.task_families.detection import DetectionFamily
+    from raitap.data.types import LabelFormat
+    import raitap.data.data as data_mod
+
+    coco = {
+        "images": [{"id": 1, "file_name": "a.jpg"}, {"id": 2, "file_name": "b.jpg"}],
+        "annotations": [{"image_id": 1, "category_id": 3, "bbox": [10, 20, 30, 40]}],
+        "categories": [{"id": 3, "name": "car"}],
+    }
+    labels_file = tmp_path / "instances.json"
+    labels_file.write_text(json.dumps(coco))
+
+    monkeypatch.setattr(
+        data_mod, "get_source_path", lambda source, *, kind: tmp_path / source
+    )
+    # tmp_path/"instances.json" is LABELS; tmp_path/"imgs" is DATA (unused by coco).
+    cfg = SimpleNamespace(
+        data=SimpleNamespace(
+            source="imgs",
+            labels=SimpleNamespace(source="instances.json", format=LabelFormat.coco),
+        )
+    )
+    tensor = [object(), object()]  # len == 2 samples
+    out = DetectionFamily().load_labels(cfg, tensor=tensor, sample_ids=["a.jpg", "b.jpg"])
+    assert torch.equal(out[0]["boxes"], torch.tensor([[10.0, 20.0, 40.0, 60.0]]))
+    assert torch.equal(out[0]["labels"], torch.tensor([3]))
+    assert out[1]["boxes"].shape == (0, 4)
diff --git a/src/raitap/task_families/detection.py b/src/raitap/task_families/detection.py
index 15614e64..a37920a6 100644
--- a/src/raitap/task_families/detection.py
+++ b/src/raitap/task_families/detection.py
@@ -24,7 +24,7 @@ def _align_detection_records(
     *,
     expected: int,
     sample_ids: Any,
-) -> list[dict[str, "Any"]]:
+) -> list[dict[str, "torch.Tensor"]]:
     """Align native detection records to ``sample_ids`` and build tensors.
 
     Extracted from ``DetectionFamily.load_labels`` so label-format adapters can
@@ -188,11 +188,36 @@ def load_labels(self, cfg: Any, *, tensor: Any, sample_ids: Any) -> Any:
             return None
 
         labels_path = get_source_path(labels_source, kind=SourceKind.LABELS)
-        with labels_path.open() as fh:
-            records = json.load(fh)
-        if not isinstance(records, list):
-            raise ValueError(
-                f"Detection labels file {labels_path} must be a JSON array."
+
+        from raitap.data.types import LabelFormat
+
+        fmt = _get_optional_config_value(labels_cfg, "format") or LabelFormat.native
+        if fmt == LabelFormat.native:
+            with labels_path.open() as fh:
+                records = json.load(fh)
+            if not isinstance(records, list):
+                raise ValueError(
+                    f"Detection labels file {labels_path} must be a JSON array."
+                )
+        else:
+            from raitap.data.label_formats import resolve_label_format_adapter
+
+            data_source = _get_optional_config_value(cfg.data, "source")
+            image_dir = (
+                get_source_path(data_source, kind=SourceKind.DATA)
+                if data_source
+                else None
+            )
+            class_names = (
+                _get_optional_config_value(cfg.model, "class_names")
+                if hasattr(cfg, "model")
+                else None
+            )
+            adapter = resolve_label_format_adapter(
+                LabelFormat(fmt), task_kind=self.kind
+            )
+            records = adapter.to_detection_records(
+                labels_path, image_dir=image_dir, class_names=class_names
             )
         return _align_detection_records(
             records, expected=len(tensor), sample_ids=sample_ids

From 87e5d657cfe059715035025438804746de81f264 Mon Sep 17 00:00:00 2001
From: Stanislas Laurent <stnsls.lrt.accnts@gmail.com>
Date: Tue, 23 Jun 2026 23:48:13 +0200
Subject: [PATCH 08/28] feat(data): dispatch classification labels on
 data.labels.format (refs #338)

---
 src/raitap/data/data.py                     | 28 +++++++++++++++++
 src/raitap/data/tests/test_label_formats.py | 35 +++++++++++++++++++++
 2 files changed, 63 insertions(+)

diff --git a/src/raitap/data/data.py b/src/raitap/data/data.py
index b154b1b2..9b81806d 100644
--- a/src/raitap/data/data.py
+++ b/src/raitap/data/data.py
@@ -17,6 +17,7 @@
     IdStrategy,
     InputModality,
     LabelEncoding,
+    LabelFormat,
 )
 from raitap.data.utils import download_file
 from raitap.tracking.base_tracker import BaseTracker, Trackable
@@ -281,6 +282,33 @@ def load_classification_labels(
     if labels_source == DIRECTORY_LABELS_SOURCE:
         return _load_directory_labels(sample_ids)
 
+    labels_format = _get_optional_config_value(labels_cfg, "format") or LabelFormat.native
+    if labels_format != LabelFormat.native:
+        from raitap.data.label_formats import resolve_label_format_adapter
+
+        if not sample_ids:
+            raise ValueError(
+                f"Label format {LabelFormat(labels_format).value!r} requires "
+                "id-based alignment, but no sample ids were discovered."
+            )
+        labels_path = get_source_path(labels_source, kind=SourceKind.LABELS)
+        adapter = resolve_label_format_adapter(
+            LabelFormat(labels_format), task_kind=TaskKind.classification
+        )
+        records = adapter.to_classification_records(labels_path)
+        id_series = pd.Series([r["sample_id"] for r in records])
+        record_labels = [int(r["label"]) for r in records]
+        strategy = _resolve_id_strategy(
+            _get_optional_config_value(labels_cfg, "id_strategy") or "auto", id_series
+        )
+        aligned = _align_labels_to_samples(
+            sample_ids=sample_ids,
+            raw_label_ids=id_series,
+            encoded_labels=record_labels,
+            strategy=strategy,
+        )
+        return torch.tensor(aligned, dtype=torch.long)
+
     labels_path = get_source_path(labels_source, kind=SourceKind.LABELS)
     labels_df = _load_tabular_frame(labels_path)
     if labels_df.empty:
diff --git a/src/raitap/data/tests/test_label_formats.py b/src/raitap/data/tests/test_label_formats.py
index 991b1234..db27ab9d 100644
--- a/src/raitap/data/tests/test_label_formats.py
+++ b/src/raitap/data/tests/test_label_formats.py
@@ -198,3 +198,38 @@ def test_detection_load_labels_via_coco(tmp_path, monkeypatch):
     assert torch.equal(out[0]["boxes"], torch.tensor([[10.0, 20.0, 40.0, 60.0]]))
     assert torch.equal(out[0]["labels"], torch.tensor([3]))
     assert out[1]["boxes"].shape == (0, 4)
+
+
+def test_classification_load_labels_via_coco(tmp_path, monkeypatch):
+    import json
+    import torch
+    from types import SimpleNamespace
+    import raitap.data.data as data_mod
+    from raitap.data.data import load_classification_labels
+    from raitap.data.types import LabelFormat
+
+    coco = {
+        "images": [{"id": 1, "file_name": "a.jpg"}, {"id": 2, "file_name": "b.jpg"}],
+        "annotations": [
+            {"image_id": 1, "category_id": 0, "bbox": [0, 0, 1, 1]},
+            {"image_id": 2, "category_id": 4, "bbox": [0, 0, 1, 1]},
+        ],
+        "categories": [{"id": 0, "name": "x"}, {"id": 4, "name": "y"}],
+    }
+    labels_file = tmp_path / "c.json"
+    labels_file.write_text(json.dumps(coco))
+    monkeypatch.setattr(
+        data_mod, "get_source_path", lambda source, *, kind: tmp_path / source
+    )
+    cfg = SimpleNamespace(
+        data=SimpleNamespace(
+            source="imgs",
+            labels=SimpleNamespace(
+                source="c.json", format=LabelFormat.coco, id_strategy="stem"
+            ),
+        )
+    )
+    out = load_classification_labels(
+        cfg, tensor=torch.zeros(2), sample_ids=["a.jpg", "b.jpg"]
+    )
+    assert torch.equal(out, torch.tensor([0, 4]))

From d3510c5f35bee8f42f680f3ab4123f06c8933cee Mon Sep 17 00:00:00 2001
From: Stanislas Laurent <stnsls.lrt.accnts@gmail.com>
Date: Tue, 23 Jun 2026 23:48:27 +0200
Subject: [PATCH 09/28] docs: document label-format adapters and
 data.labels.format (refs #338)

---
 docs/contributor/modules/data.md   | 14 ++++++++++++++
 docs/modules/data/configuration.md | 24 ++++++++++++++++++++++++
 2 files changed, 38 insertions(+)

diff --git a/docs/contributor/modules/data.md b/docs/contributor/modules/data.md
index 319d72c4..82bfc9a7 100644
--- a/docs/contributor/modules/data.md
+++ b/docs/contributor/modules/data.md
@@ -69,6 +69,20 @@ referenceable by name in `data.source`. Registration lives in
 5. **Update docs** — add the new sample name to
    {doc}`/modules/data/own-vs-built-in`.
 
+## Adding a label format
+
+1. Create `src/raitap/data/adapters/<format>.py` with a class decorated
+   `@label_format`. Set `format = LabelFormat.<name>` and
+   `supported_tasks = frozenset({...})`.
+2. Implement `to_detection_records` and `to_classification_records`,
+   returning the native record shape (`{sample_id, boxes (xyxy), labels}` or
+   `{sample_id, label}`). The method for an unsupported task raises `ValueError`.
+3. Import it in `src/raitap/data/_label_format_adapters.py` so the decorator
+   fires.
+4. Add a `LabelFormat` member in `src/raitap/data/types.py` and a row to the
+   label-format table in `docs/modules/data/configuration.md`.
+5. Add tests in `src/raitap/data/tests/test_label_formats.py`.
+
 ## Sample discovery and label alignment
 
 `data.source` directories are walked **recursively** (`Path.rglob`); sample
diff --git a/docs/modules/data/configuration.md b/docs/modules/data/configuration.md
index 447c29d2..653c9b26 100644
--- a/docs/modules/data/configuration.md
+++ b/docs/modules/data/configuration.md
@@ -105,6 +105,16 @@ myst:
   nested ImageFolder layouts (e.g. `NORMAL/IM-0001.jpeg`) — required when
   filename stems collide across class subdirs. `"stem"` matches by basename only (flat-dir layouts).
 
+:option: labels.format
+:allowed: "native", "coco", "yolo", "voc"
+:default: "native"
+:description: External label file format. `"native"` (default) reads RAITAP's
+  own shape (classification: CSV/TSV/Parquet or the `"directory"` source;
+  detection: the JSON record list). `"coco"`, `"yolo"`, and `"voc"` convert a
+  standard annotation file to the native shape before alignment. `"yolo"` and
+  `"voc"` are detection only; `"coco"` serves detection and classification.
+  Non-native formats align labels by sample id, so sample ids are required.
+
 :option: input_metadata
 :allowed: dict, null
 :default: null
@@ -180,6 +190,20 @@ data = DataConfig(
 )
 ```
 
+## Label formats
+
+RAITAP reads common annotation formats directly via `data.labels.format`.
+
+| Format   | Detection | Classification | Source layout                                  |
+| -------- | --------- | -------------- | ---------------------------------------------- |
+| `native` | yes       | yes            | JSON record list / CSV-TSV-Parquet             |
+| `coco`   | yes       | yes            | single `instances.json`                        |
+| `yolo`   | yes       | no             | dir of per-image `.txt` (needs `data.source`)  |
+| `voc`    | yes       | no             | dir of per-image `.xml`                        |
+
+COCO and YOLO labels keep their category ids unchanged. VOC class names map to
+ids by `model.class_names` order, else the standard 20-class VOC order.
+
 For tabular models whose backend expects an unusual per-sample layout (such
 as ACAS Xu, a Torch network whose forward takes `(N, 1, 1, 5)`), supply
 `input_metadata.shape` explicitly so the pipeline reshapes the flat feature

From 8314068986aec383fc65fa5e71d4f070469e42a1 Mon Sep 17 00:00:00 2001
From: Stanislas Laurent <stnsls.lrt.accnts@gmail.com>
Date: Wed, 24 Jun 2026 00:06:46 +0200
Subject: [PATCH 10/28] style(data): satisfy ruff and pyright for label-format
 adapters (refs #338)

---
 src/raitap/data/__init__.py                 |   1 +
 src/raitap/data/_label_format_adapters.py   |   5 +-
 src/raitap/data/adapters/coco.py            |  22 ++--
 src/raitap/data/adapters/voc.py             |  56 ++++++---
 src/raitap/data/adapters/yolo.py            |  17 ++-
 src/raitap/data/label_formats.py            |  14 +--
 src/raitap/data/tests/test_label_formats.py | 127 +++++++++++++-------
 src/raitap/task_families/detection.py       |  22 ++--
 8 files changed, 155 insertions(+), 109 deletions(-)

diff --git a/src/raitap/data/__init__.py b/src/raitap/data/__init__.py
index a363f644..1bcb08f3 100644
--- a/src/raitap/data/__init__.py
+++ b/src/raitap/data/__init__.py
@@ -19,6 +19,7 @@
     from raitap.configs.schema import DataConfig, LabelsConfig
 
     from .data import Data, load_numpy_from_source, load_tensor_from_source
+    from .label_formats import LabelFormatAdapter, resolve_label_format_adapter
     from .metadata import DataInputMetadata, infer_data_input_metadata
     from .preprocessing import (
         DataPreprocessingFactory,
diff --git a/src/raitap/data/_label_format_adapters.py b/src/raitap/data/_label_format_adapters.py
index d1267e18..41c06b01 100644
--- a/src/raitap/data/_label_format_adapters.py
+++ b/src/raitap/data/_label_format_adapters.py
@@ -1,7 +1,10 @@
+# pyright: reportUnusedImport=false
 """Imports every in-tree label-format adapter so the decorators fire.
 
 Imported for its side effects by
-``raitap.data.label_formats.resolve_label_format_adapter``.
+``raitap.data.label_formats.resolve_label_format_adapter``. Every import in this
+module is intentionally side-effect-only (registers an adapter), so the
+file-level ``reportUnusedImport=false`` above is correct.
 """
 
 from __future__ import annotations
diff --git a/src/raitap/data/adapters/coco.py b/src/raitap/data/adapters/coco.py
index 7fb5b99f..3551e5f3 100644
--- a/src/raitap/data/adapters/coco.py
+++ b/src/raitap/data/adapters/coco.py
@@ -3,8 +3,10 @@
 from __future__ import annotations
 
 import json
-from pathlib import Path
-from typing import Any
+from typing import TYPE_CHECKING, Any
+
+if TYPE_CHECKING:
+    from pathlib import Path
 
 from raitap.data.label_formats import (
     ClassificationRecord,
@@ -32,18 +34,14 @@ def _load(self, source: Path) -> dict[str, Any]:
         with source.open() as fh:
             data = json.load(fh)
         if not isinstance(data, dict) or "images" not in data:
-            raise ValueError(
-                f"COCO file {source} must be an object with an 'images' array."
-            )
+            raise ValueError(f"COCO file {source} must be an object with an 'images' array.")
         return data
 
     def to_detection_records(
         self, source: Path, *, image_dir: Path | None, class_names: list[str] | None
     ) -> list[DetectionRecord]:
         data = self._load(source)
-        file_by_image: dict[int, str] = {
-            img["id"]: img["file_name"] for img in data["images"]
-        }
+        file_by_image: dict[int, str] = {img["id"]: img["file_name"] for img in data["images"]}
         boxes: dict[int, list[list[float]]] = {iid: [] for iid in file_by_image}
         labels: dict[int, list[int]] = {iid: [] for iid in file_by_image}
         for ann in data.get("annotations", []):
@@ -56,13 +54,9 @@ def to_detection_records(
             for iid in file_by_image
         ]
 
-    def to_classification_records(
-        self, source: Path
-    ) -> list[ClassificationRecord]:
+    def to_classification_records(self, source: Path) -> list[ClassificationRecord]:
         data = self._load(source)
-        file_by_image: dict[int, str] = {
-            img["id"]: img["file_name"] for img in data["images"]
-        }
+        file_by_image: dict[int, str] = {img["id"]: img["file_name"] for img in data["images"]}
         cats: dict[int, set[int]] = {iid: set() for iid in file_by_image}
         for ann in data.get("annotations", []):
             cats[ann["image_id"]].add(int(ann["category_id"]))
diff --git a/src/raitap/data/adapters/voc.py b/src/raitap/data/adapters/voc.py
index 98fb11bd..02a8f270 100644
--- a/src/raitap/data/adapters/voc.py
+++ b/src/raitap/data/adapters/voc.py
@@ -3,7 +3,10 @@
 from __future__ import annotations
 
 import xml.etree.ElementTree as ET
-from pathlib import Path
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from pathlib import Path
 
 from raitap.data.label_formats import (
     ClassificationRecord,
@@ -15,12 +18,36 @@
 
 #: Canonical Pascal-VOC class order (index = label id) when no class_names given.
 _VOC_CLASSES = (
-    "aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat",
-    "chair", "cow", "diningtable", "dog", "horse", "motorbike", "person",
-    "pottedplant", "sheep", "sofa", "train", "tvmonitor",
+    "aeroplane",
+    "bicycle",
+    "bird",
+    "boat",
+    "bottle",
+    "bus",
+    "car",
+    "cat",
+    "chair",
+    "cow",
+    "diningtable",
+    "dog",
+    "horse",
+    "motorbike",
+    "person",
+    "pottedplant",
+    "sheep",
+    "sofa",
+    "train",
+    "tvmonitor",
 )
 
 
+def _coord(box: ET.Element, tag: str, xml_path: Path) -> float:
+    text = box.findtext(tag)
+    if text is None:
+        raise ValueError(f"VOC bndbox in {xml_path.name} missing <{tag}>.")
+    return float(text)
+
+
 @label_format
 class VocAdapter:
     """Pascal-VOC per-image ``.xml`` -> native detection records.
@@ -37,8 +64,7 @@ def to_detection_records(
         self, source: Path, *, image_dir: Path | None, class_names: list[str] | None
     ) -> list[DetectionRecord]:
         name_to_id = {
-            name: idx
-            for idx, name in enumerate(class_names if class_names else _VOC_CLASSES)
+            name: idx for idx, name in enumerate(class_names if class_names else _VOC_CLASSES)
         }
         records: list[DetectionRecord] = []
         for xml_path in sorted(source.glob("*.xml")):
@@ -56,21 +82,19 @@ def to_detection_records(
                         f"class list {sorted(name_to_id)}."
                     )
                 box = obj.find("bndbox")
+                if box is None:
+                    raise ValueError(f"VOC object in {xml_path.name} has no <bndbox>.")
                 boxes.append(
                     [
-                        float(box.findtext("xmin")),
-                        float(box.findtext("ymin")),
-                        float(box.findtext("xmax")),
-                        float(box.findtext("ymax")),
+                        _coord(box, "xmin", xml_path),
+                        _coord(box, "ymin", xml_path),
+                        _coord(box, "xmax", xml_path),
+                        _coord(box, "ymax", xml_path),
                     ]
                 )
                 labels.append(name_to_id[name])
-            records.append(
-                {"sample_id": filename_el.text, "boxes": boxes, "labels": labels}
-            )
+            records.append({"sample_id": filename_el.text, "boxes": boxes, "labels": labels})
         return records
 
-    def to_classification_records(
-        self, source: Path
-    ) -> list[ClassificationRecord]:
+    def to_classification_records(self, source: Path) -> list[ClassificationRecord]:
         raise ValueError("VOC is a detection-only format.")
diff --git a/src/raitap/data/adapters/yolo.py b/src/raitap/data/adapters/yolo.py
index fac219fa..be6419f8 100644
--- a/src/raitap/data/adapters/yolo.py
+++ b/src/raitap/data/adapters/yolo.py
@@ -2,10 +2,13 @@
 
 from __future__ import annotations
 
-from pathlib import Path
+from typing import TYPE_CHECKING
 
 from PIL import Image
 
+if TYPE_CHECKING:
+    from pathlib import Path
+
 from raitap.data.label_formats import (
     ClassificationRecord,
     DetectionRecord,
@@ -33,9 +36,7 @@ def _image_for(self, image_dir: Path, stem: str) -> Path:
             candidate = image_dir / f"{stem}{suffix}"
             if candidate.exists():
                 return candidate
-        raise ValueError(
-            f"YOLO adapter found no image for label {stem!r} in {image_dir}."
-        )
+        raise ValueError(f"YOLO adapter found no image for label {stem!r} in {image_dir}.")
 
     def to_detection_records(
         self, source: Path, *, image_dir: Path | None, class_names: list[str] | None
@@ -63,12 +64,8 @@ def to_detection_records(
                 y2 = (cy + bh / 2) * height
                 boxes.append([x1, y1, x2, y2])
                 labels.append(int(cls))
-            records.append(
-                {"sample_id": image_path.name, "boxes": boxes, "labels": labels}
-            )
+            records.append({"sample_id": image_path.name, "boxes": boxes, "labels": labels})
         return records
 
-    def to_classification_records(
-        self, source: Path
-    ) -> list[ClassificationRecord]:
+    def to_classification_records(self, source: Path) -> list[ClassificationRecord]:
         raise ValueError("YOLO is a detection-only format.")
diff --git a/src/raitap/data/label_formats.py b/src/raitap/data/label_formats.py
index 11ae1af3..19021a95 100644
--- a/src/raitap/data/label_formats.py
+++ b/src/raitap/data/label_formats.py
@@ -9,12 +9,12 @@
 
 from __future__ import annotations
 
-from pathlib import Path
 from typing import TYPE_CHECKING, Any, Protocol, TypeVar, runtime_checkable
 
-from raitap.data.types import LabelFormat
-
 if TYPE_CHECKING:
+    from pathlib import Path
+
+    from raitap.data.types import LabelFormat
     from raitap.types import TaskKind
 
 #: Native intermediate record shapes (match the on-disk native formats).
@@ -57,9 +57,7 @@ def label_format(cls: type[T]) -> type[T]:
     return cls
 
 
-def resolve_label_format_adapter(
-    fmt: LabelFormat, *, task_kind: TaskKind
-) -> LabelFormatAdapter:
+def resolve_label_format_adapter(fmt: LabelFormat, *, task_kind: TaskKind) -> LabelFormatAdapter:
     """Return the adapter for ``fmt`` that supports ``task_kind``.
 
     Raises ``ValueError`` when no adapter is registered for ``fmt`` (e.g.
@@ -67,7 +65,9 @@ def resolve_label_format_adapter(
     declare ``task_kind`` in ``supported_tasks``.
     """
     # Import side-effect: register the in-tree adapters on first use.
-    from raitap.data import _label_format_adapters  # noqa: F401
+    from raitap.data import (
+        _label_format_adapters,  # noqa: F401  # pyright: ignore[reportUnusedImport]
+    )
 
     adapter = LABEL_FORMAT_ADAPTERS.get(fmt)
     if adapter is None:
diff --git a/src/raitap/data/tests/test_label_formats.py b/src/raitap/data/tests/test_label_formats.py
index db27ab9d..39a04e18 100644
--- a/src/raitap/data/tests/test_label_formats.py
+++ b/src/raitap/data/tests/test_label_formats.py
@@ -1,24 +1,34 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, cast
+
 import pytest
-from raitap.data.types import LabelFormat
+
 from raitap.configs.schema import LabelsConfig
 from raitap.data.label_formats import (
     LABEL_FORMAT_ADAPTERS,
     label_format,
     resolve_label_format_adapter,
 )
+from raitap.data.types import LabelFormat
 from raitap.types import TaskKind
 
+if TYPE_CHECKING:
+    from pathlib import Path
+
+    from raitap.configs.schema import AppConfig
 
-def test_label_format_members_are_string_values():
+
+def test_label_format_members_are_string_values() -> None:
     assert LabelFormat.native == "native"
     assert {f.value for f in LabelFormat} == {"native", "coco", "yolo", "voc"}
 
 
-def test_labels_config_defaults_to_native_format():
+def test_labels_config_defaults_to_native_format() -> None:
     assert LabelsConfig().format is LabelFormat.native
 
 
-def test_label_format_decorator_registers_instance():
+def test_label_format_decorator_registers_instance() -> None:
     @label_format
     class _Dummy:
         format = LabelFormat.coco  # reuse an enum member; popped below
@@ -32,24 +42,25 @@ class _Dummy:
         LABEL_FORMAT_ADAPTERS.pop(LabelFormat.coco, None)
 
 
-def test_registry_rejects_unknown_native():
+def test_registry_rejects_unknown_native() -> None:
     with pytest.raises(ValueError, match="No adapter"):
         resolve_label_format_adapter(LabelFormat.native, task_kind=TaskKind.detection)
 
 
-def test_registry_resolves_supported_task():
+def test_registry_resolves_supported_task() -> None:
     adapter = resolve_label_format_adapter(LabelFormat.coco, task_kind=TaskKind.detection)
     assert adapter.format is LabelFormat.coco
     assert TaskKind.detection in adapter.supported_tasks
 
 
-def test_registry_rejects_unsupported_task():
+def test_registry_rejects_unsupported_task() -> None:
     with pytest.raises(ValueError, match="does not support task"):
         resolve_label_format_adapter(LabelFormat.yolo, task_kind=TaskKind.classification)
 
 
-def test_coco_detection_records(tmp_path):
+def test_coco_detection_records(tmp_path: Path) -> None:
     import json
+
     from raitap.data.adapters.coco import CocoAdapter
 
     coco = {
@@ -73,8 +84,9 @@ def test_coco_detection_records(tmp_path):
     assert by_id["b.jpg"] == {"sample_id": "b.jpg", "boxes": [], "labels": []}
 
 
-def test_coco_classification_records(tmp_path):
+def test_coco_classification_records(tmp_path: Path) -> None:
     import json
+
     from raitap.data.adapters.coco import CocoAdapter
 
     coco = {
@@ -88,8 +100,9 @@ def test_coco_classification_records(tmp_path):
     assert records == [{"sample_id": "a.jpg", "label": 7}]
 
 
-def test_coco_classification_rejects_zero_categories(tmp_path):
+def test_coco_classification_rejects_zero_categories(tmp_path: Path) -> None:
     import json
+
     from raitap.data.adapters.coco import CocoAdapter
 
     coco = {
@@ -103,8 +116,9 @@ def test_coco_classification_rejects_zero_categories(tmp_path):
         CocoAdapter().to_classification_records(p)
 
 
-def test_coco_classification_rejects_multiple_categories(tmp_path):
+def test_coco_classification_rejects_multiple_categories(tmp_path: Path) -> None:
     import json
+
     from raitap.data.adapters.coco import CocoAdapter
 
     coco = {
@@ -121,8 +135,9 @@ def test_coco_classification_rejects_multiple_categories(tmp_path):
         CocoAdapter().to_classification_records(p)
 
 
-def test_yolo_detection_records(tmp_path):
+def test_yolo_detection_records(tmp_path: Path) -> None:
     from PIL import Image
+
     from raitap.data.adapters.yolo import YoloAdapter
 
     image_dir = tmp_path / "images"
@@ -134,9 +149,7 @@ def test_yolo_detection_records(tmp_path):
     # class=2, cx=0.5 cy=0.5 w=0.2 h=0.1  -> center (50,100), box 20x20px
     (label_dir / "a.txt").write_text("2 0.5 0.5 0.2 0.1\n")
 
-    records = YoloAdapter().to_detection_records(
-        label_dir, image_dir=image_dir, class_names=None
-    )
+    records = YoloAdapter().to_detection_records(label_dir, image_dir=image_dir, class_names=None)
     assert len(records) == 1
     rec = records[0]
     assert rec["sample_id"] == "a.jpg"
@@ -146,7 +159,7 @@ def test_yolo_detection_records(tmp_path):
     assert rec["boxes"][0] == pytest.approx([40.0, 90.0, 60.0, 110.0])
 
 
-def test_voc_detection_records(tmp_path):
+def test_voc_detection_records(tmp_path: Path) -> None:
     from raitap.data.adapters.voc import VocAdapter
 
     xml = """<annotation>
@@ -162,18 +175,35 @@ def test_voc_detection_records(tmp_path):
     records = VocAdapter().to_detection_records(
         d, image_dir=None, class_names=["background", "person", "car"]
     )
-    assert records == [
-        {"sample_id": "a.jpg", "boxes": [[10.0, 20.0, 30.0, 40.0]], "labels": [1]}
-    ]
+    assert records == [{"sample_id": "a.jpg", "boxes": [[10.0, 20.0, 30.0, 40.0]], "labels": [1]}]
+
+
+def test_voc_detection_rejects_object_without_bndbox(tmp_path: Path) -> None:
+    from raitap.data.adapters.voc import VocAdapter
+
+    xml = """<annotation>
+      <filename>a.jpg</filename>
+      <object><name>person</name></object>
+    </annotation>"""
+    d = tmp_path / "ann"
+    d.mkdir()
+    (d / "a.xml").write_text(xml)
+
+    with pytest.raises(ValueError, match="has no <bndbox>"):
+        VocAdapter().to_detection_records(
+            d, image_dir=None, class_names=["background", "person", "car"]
+        )
 
 
-def test_detection_load_labels_via_coco(tmp_path, monkeypatch):
+def test_detection_load_labels_via_coco(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
     import json
-    import torch
     from types import SimpleNamespace
-    from raitap.task_families.detection import DetectionFamily
-    from raitap.data.types import LabelFormat
+
+    import torch
+
     import raitap.data.data as data_mod
+    from raitap.data.types import LabelFormat
+    from raitap.task_families.detection import DetectionFamily
 
     coco = {
         "images": [{"id": 1, "file_name": "a.jpg"}, {"id": 2, "file_name": "b.jpg"}],
@@ -183,15 +213,16 @@ def test_detection_load_labels_via_coco(tmp_path, monkeypatch):
     labels_file = tmp_path / "instances.json"
     labels_file.write_text(json.dumps(coco))
 
-    monkeypatch.setattr(
-        data_mod, "get_source_path", lambda source, *, kind: tmp_path / source
-    )
+    monkeypatch.setattr(data_mod, "get_source_path", lambda source, *, kind: tmp_path / source)
     # tmp_path/"instances.json" is LABELS; tmp_path/"imgs" is DATA (unused by coco).
-    cfg = SimpleNamespace(
-        data=SimpleNamespace(
-            source="imgs",
-            labels=SimpleNamespace(source="instances.json", format=LabelFormat.coco),
-        )
+    cfg = cast(
+        "AppConfig",
+        SimpleNamespace(
+            data=SimpleNamespace(
+                source="imgs",
+                labels=SimpleNamespace(source="instances.json", format=LabelFormat.coco),
+            )
+        ),
     )
     tensor = [object(), object()]  # len == 2 samples
     out = DetectionFamily().load_labels(cfg, tensor=tensor, sample_ids=["a.jpg", "b.jpg"])
@@ -200,10 +231,14 @@ def test_detection_load_labels_via_coco(tmp_path, monkeypatch):
     assert out[1]["boxes"].shape == (0, 4)
 
 
-def test_classification_load_labels_via_coco(tmp_path, monkeypatch):
+def test_classification_load_labels_via_coco(
+    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
+) -> None:
     import json
-    import torch
     from types import SimpleNamespace
+
+    import torch
+
     import raitap.data.data as data_mod
     from raitap.data.data import load_classification_labels
     from raitap.data.types import LabelFormat
@@ -218,18 +253,18 @@ def test_classification_load_labels_via_coco(tmp_path, monkeypatch):
     }
     labels_file = tmp_path / "c.json"
     labels_file.write_text(json.dumps(coco))
-    monkeypatch.setattr(
-        data_mod, "get_source_path", lambda source, *, kind: tmp_path / source
-    )
-    cfg = SimpleNamespace(
-        data=SimpleNamespace(
-            source="imgs",
-            labels=SimpleNamespace(
-                source="c.json", format=LabelFormat.coco, id_strategy="stem"
-            ),
-        )
-    )
-    out = load_classification_labels(
-        cfg, tensor=torch.zeros(2), sample_ids=["a.jpg", "b.jpg"]
+    monkeypatch.setattr(data_mod, "get_source_path", lambda source, *, kind: tmp_path / source)
+    cfg = cast(
+        "AppConfig",
+        SimpleNamespace(
+            data=SimpleNamespace(
+                source="imgs",
+                labels=SimpleNamespace(
+                    source="c.json", format=LabelFormat.coco, id_strategy="stem"
+                ),
+            )
+        ),
     )
+    out = load_classification_labels(cfg, tensor=torch.zeros(2), sample_ids=["a.jpg", "b.jpg"])
+    assert out is not None
     assert torch.equal(out, torch.tensor([0, 4]))
diff --git a/src/raitap/task_families/detection.py b/src/raitap/task_families/detection.py
index a37920a6..97d6b2d3 100644
--- a/src/raitap/task_families/detection.py
+++ b/src/raitap/task_families/detection.py
@@ -15,6 +15,8 @@
 from raitap.types import TaskKind
 
 if TYPE_CHECKING:
+    import torch
+
     from raitap.models.torch_backend import TorchBackend
     from raitap.task_families.base import ExplainContext, ForwardContext
 
@@ -24,7 +26,7 @@ def _align_detection_records(
     *,
     expected: int,
     sample_ids: Any,
-) -> list[dict[str, "torch.Tensor"]]:
+) -> list[dict[str, torch.Tensor]]:
     """Align native detection records to ``sample_ids`` and build tensors.
 
     Extracted from ``DetectionFamily.load_labels`` so label-format adapters can
@@ -196,32 +198,22 @@ def load_labels(self, cfg: Any, *, tensor: Any, sample_ids: Any) -> Any:
             with labels_path.open() as fh:
                 records = json.load(fh)
             if not isinstance(records, list):
-                raise ValueError(
-                    f"Detection labels file {labels_path} must be a JSON array."
-                )
+                raise ValueError(f"Detection labels file {labels_path} must be a JSON array.")
         else:
             from raitap.data.label_formats import resolve_label_format_adapter
 
             data_source = _get_optional_config_value(cfg.data, "source")
-            image_dir = (
-                get_source_path(data_source, kind=SourceKind.DATA)
-                if data_source
-                else None
-            )
+            image_dir = get_source_path(data_source, kind=SourceKind.DATA) if data_source else None
             class_names = (
                 _get_optional_config_value(cfg.model, "class_names")
                 if hasattr(cfg, "model")
                 else None
             )
-            adapter = resolve_label_format_adapter(
-                LabelFormat(fmt), task_kind=self.kind
-            )
+            adapter = resolve_label_format_adapter(LabelFormat(fmt), task_kind=self.kind)
             records = adapter.to_detection_records(
                 labels_path, image_dir=image_dir, class_names=class_names
             )
-        return _align_detection_records(
-            records, expected=len(tensor), sample_ids=sample_ids
-        )
+        return _align_detection_records(records, expected=len(tensor), sample_ids=sample_ids)
 
     def validate_labels(self, labels: Any) -> None:
         # The detection loader returns ``list[dict]`` or ``None``. A bare tensor

From 049475882104c4a877ef0087fb236a45b52de35d Mon Sep 17 00:00:00 2001
From: Stanislas Laurent <stnsls.lrt.accnts@gmail.com>
Date: Wed, 24 Jun 2026 00:09:52 +0200
Subject: [PATCH 11/28] docs: note detection label formats match sample_id by
 exact name (refs #338)

---
 docs/modules/data/configuration.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/docs/modules/data/configuration.md b/docs/modules/data/configuration.md
index 653c9b26..0649ccfe 100644
--- a/docs/modules/data/configuration.md
+++ b/docs/modules/data/configuration.md
@@ -204,6 +204,10 @@ RAITAP reads common annotation formats directly via `data.labels.format`.
 COCO and YOLO labels keep their category ids unchanged. VOC class names map to
 ids by `model.class_names` order, else the standard 20-class VOC order.
 
+Detection formats match each record's `sample_id` against the discovered image
+file by exact name, so the image directory must be flat (nested subdirs are not
+matched). Classification labels still align via `labels.id_strategy`.
+
 For tabular models whose backend expects an unusual per-sample layout (such
 as ACAS Xu, a Torch network whose forward takes `(N, 1, 1, 5)`), supply
 `input_metadata.shape` explicitly so the pipeline reshapes the flat feature

From 17a1bd008a435196b7cf378b64588f03bcdaa1cf Mon Sep 17 00:00:00 2001
From: Stanislas Laurent <stnsls.lrt.accnts@gmail.com>
Date: Wed, 24 Jun 2026 02:45:04 +0200
Subject: [PATCH 12/28] docs: fix sphinx cross-ref and heading-level warnings
 for label formats (refs #338)

---
 docs/modules/data/configuration.md | 4 +---
 src/raitap/data/types.py           | 4 ++--
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/docs/modules/data/configuration.md b/docs/modules/data/configuration.md
index 0649ccfe..8f030846 100644
--- a/docs/modules/data/configuration.md
+++ b/docs/modules/data/configuration.md
@@ -190,9 +190,7 @@ data = DataConfig(
 )
 ```
 
-## Label formats
-
-RAITAP reads common annotation formats directly via `data.labels.format`.
+**Label formats.** RAITAP reads common annotation formats directly via `data.labels.format`.
 
 | Format   | Detection | Classification | Source layout                                  |
 | -------- | --------- | -------------- | ---------------------------------------------- |
diff --git a/src/raitap/data/types.py b/src/raitap/data/types.py
index fc114554..999370e0 100644
--- a/src/raitap/data/types.py
+++ b/src/raitap/data/types.py
@@ -39,8 +39,8 @@ class LabelFormat(StrEnum):
     ``native`` is RAITAP's own shape (classification: CSV/TSV/Parquet or the
     ``directory`` source; detection: the JSON record list). The others are
     converted to the native intermediate by a registered
-    :class:`~raitap.data.label_formats.LabelFormatAdapter` before the task
-    family aligns them. StrEnum so YAML users can write the raw value.
+    ``LabelFormatAdapter`` before the task family aligns them. StrEnum so YAML
+    users can write the raw value.
     """
 
     native = "native"

From a150a60d2f51eb00df5c2d57f8858bbb1cbf18ba Mon Sep 17 00:00:00 2001
From: Stanislas Laurent <stnsls.lrt.accnts@gmail.com>
Date: Wed, 24 Jun 2026 02:53:04 +0200
Subject: [PATCH 13/28] test(data): use module-qualified
 load_classification_labels to avoid dual import (refs #338)

---
 src/raitap/data/tests/test_label_formats.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/raitap/data/tests/test_label_formats.py b/src/raitap/data/tests/test_label_formats.py
index 39a04e18..657de141 100644
--- a/src/raitap/data/tests/test_label_formats.py
+++ b/src/raitap/data/tests/test_label_formats.py
@@ -240,7 +240,6 @@ def test_classification_load_labels_via_coco(
     import torch
 
     import raitap.data.data as data_mod
-    from raitap.data.data import load_classification_labels
     from raitap.data.types import LabelFormat
 
     coco = {
@@ -265,6 +264,8 @@ def test_classification_load_labels_via_coco(
             )
         ),
     )
-    out = load_classification_labels(cfg, tensor=torch.zeros(2), sample_ids=["a.jpg", "b.jpg"])
+    out = data_mod.load_classification_labels(
+        cfg, tensor=torch.zeros(2), sample_ids=["a.jpg", "b.jpg"]
+    )
     assert out is not None
     assert torch.equal(out, torch.tensor([0, 4]))

From ef74e217eee4b6c02072b83f205b58b9a90fbda5 Mon Sep 17 00:00:00 2001
From: Stanislas Laurent <stnsls.lrt.accnts@gmail.com>
Date: Wed, 24 Jun 2026 03:50:13 +0200
Subject: [PATCH 14/28] feat(config): discriminated LabelsConfig variants, drop
 LabelFormat (refs #338)

---
 src/raitap/configs/schema.py                  | 55 ++++++++++++-------
 .../configs/tests/test_labels_schema.py       | 23 ++++++++
 src/raitap/data/__init__.py                   |  4 +-
 src/raitap/data/types.py                      | 24 --------
 4 files changed, 60 insertions(+), 46 deletions(-)
 create mode 100644 src/raitap/configs/tests/test_labels_schema.py

diff --git a/src/raitap/configs/schema.py b/src/raitap/configs/schema.py
index 1910345c..b616d85a 100644
--- a/src/raitap/configs/schema.py
+++ b/src/raitap/configs/schema.py
@@ -5,7 +5,7 @@
 
 from omegaconf import MISSING
 
-from raitap.data.types import IdStrategy, LabelEncoding, LabelFormat
+from raitap.data.types import IdStrategy, LabelEncoding
 from raitap.types import Hardware, TaskKind
 
 if TYPE_CHECKING:
@@ -70,27 +70,44 @@ class ModelConfig:
 
 @dataclass
 class LabelsConfig:
-    # Optional path to a labels file (currently CSV/TSV/Parquet), OR the reserved
-    # value "directory" (exposed as ``raitap.data.DIRECTORY_LABELS_SOURCE``) to
-    # derive classification labels from each sample's top-level class
-    # subdirectory (torchvision ImageFolder style; no labels file).
-    source: str | None = None
-    # Optional sample-id column for filename alignment (e.g. "image").
+    _target_: str = MISSING
+
+
+@dataclass
+class TabularLabelsConfig(LabelsConfig):
+    _target_: str = "TabularLabelParser"
+    source: str = MISSING
     id_column: str | None = None
-    # Optional class-label column; when omitted, one-hot numeric columns are used via argmax.
     column: str | None = None
-    # Optional parsing strategy for labels: "index", "one_hot", or "argmax".
     encoding: LabelEncoding | None = None
-    # Strategy for matching label-file ids to discovered sample files. One of:
-    #   "auto"          — pick "relative_path" if any id contains "/" or "\\"; else "stem".
-    #   "relative_path" — ids are resolved as posix-style paths relative to ``data.source``
-    #                     (supports nested ImageFolder layouts with colliding stems).
-    #   "stem"          — flat-dir / basename matching: match by ``Path(id).stem`` only.
     id_strategy: IdStrategy = IdStrategy.auto
-    # External label file format. ``native`` (default) reads RAITAP's own
-    # shape. ``coco`` / ``yolo`` / ``voc`` are converted to the native
-    # intermediate before alignment. Requires id-based alignment (sample_ids).
-    format: LabelFormat = LabelFormat.native
+
+
+@dataclass
+class DirectoryLabelsConfig(LabelsConfig):
+    _target_: str = "DirectoryLabelParser"
+
+
+@dataclass
+class CocoLabelsConfig(LabelsConfig):
+    _target_: str = "CocoLabelParser"
+    source: str = MISSING
+    id_strategy: IdStrategy = IdStrategy.auto
+
+
+@dataclass
+class YoloLabelsConfig(LabelsConfig):
+    _target_: str = "YoloLabelParser"
+    source: str = MISSING
+    id_strategy: IdStrategy = IdStrategy.auto
+
+
+@dataclass
+class VocLabelsConfig(LabelsConfig):
+    _target_: str = "VocLabelParser"
+    source: str = MISSING
+    id_strategy: IdStrategy = IdStrategy.auto
+    class_names: list[str] | None = None
 
 
 @dataclass
@@ -126,7 +143,7 @@ class DataConfig:
     # Forwarded to ``infer_input_spec`` so semantics and visualisers see the correct
     # modality for non-image data such as ACAS Xu's 5-feature tabular vector.
     input_metadata: dict[str, Any] | None = None
-    labels: LabelsConfig = field(default_factory=LabelsConfig)
+    labels: LabelsConfig | None = None
 
 
 @dataclass
diff --git a/src/raitap/configs/tests/test_labels_schema.py b/src/raitap/configs/tests/test_labels_schema.py
new file mode 100644
index 00000000..c22ffece
--- /dev/null
+++ b/src/raitap/configs/tests/test_labels_schema.py
@@ -0,0 +1,23 @@
+import dataclasses
+
+import pytest
+
+from raitap.configs.schema import CocoLabelsConfig, DirectoryLabelsConfig
+
+
+def test_coco_config_has_no_tabular_fields() -> None:
+    names = {f.name for f in dataclasses.fields(CocoLabelsConfig)}
+    assert "id_column" not in names
+    assert "column" not in names
+    assert "encoding" not in names
+    assert {"_target_", "source", "id_strategy"} <= names
+
+
+def test_directory_config_has_only_target() -> None:
+    names = {f.name for f in dataclasses.fields(DirectoryLabelsConfig)}
+    assert names == {"_target_"}
+
+
+def test_labelformat_enum_is_gone() -> None:
+    with pytest.raises(ImportError):
+        from raitap.data.types import LabelFormat  # noqa: F401
diff --git a/src/raitap/data/__init__.py b/src/raitap/data/__init__.py
index 1bcb08f3..0365b77f 100644
--- a/src/raitap/data/__init__.py
+++ b/src/raitap/data/__init__.py
@@ -13,7 +13,7 @@
 
 from typing import TYPE_CHECKING, Any
 
-from .types import DIRECTORY_LABELS_SOURCE, IdStrategy, LabelEncoding, LabelFormat, Preprocessing
+from .types import IdStrategy, LabelEncoding, Preprocessing
 
 if TYPE_CHECKING:
     from raitap.configs.schema import DataConfig, LabelsConfig
@@ -30,14 +30,12 @@
 
 
 __all__ = [
-    "DIRECTORY_LABELS_SOURCE",
     "Data",
     "DataConfig",
     "DataInputMetadata",
     "DataPreprocessingFactory",
     "IdStrategy",
     "LabelEncoding",
-    "LabelFormat",
     "LabelFormatAdapter",
     "LabelsConfig",
     "ModelInputTransformationFactory",
diff --git a/src/raitap/data/types.py b/src/raitap/data/types.py
index 999370e0..943f28d9 100644
--- a/src/raitap/data/types.py
+++ b/src/raitap/data/types.py
@@ -33,30 +33,6 @@ class IdStrategy(StrEnum):
     stem = "stem"
 
 
-class LabelFormat(StrEnum):
-    """On-disk label file format selected by ``LabelsConfig.format``.
-
-    ``native`` is RAITAP's own shape (classification: CSV/TSV/Parquet or the
-    ``directory`` source; detection: the JSON record list). The others are
-    converted to the native intermediate by a registered
-    ``LabelFormatAdapter`` before the task family aligns them. StrEnum so YAML
-    users can write the raw value.
-    """
-
-    native = "native"
-    coco = "coco"
-    yolo = "yolo"
-    voc = "voc"
-
-
-#: Reserved ``LabelsConfig.source`` value selecting folder-as-label ingestion:
-#: classification labels are derived from each sample's top-level class
-#: subdirectory (torchvision ``ImageFolder`` style; no labels file). Kept as a
-#: plain ``str`` so it round-trips through OmegaConf; ``LabelsConfig.source``
-#: stays ``str | None`` (a path or this sentinel).
-DIRECTORY_LABELS_SOURCE = "directory"
-
-
 class Preprocessing(StrEnum):
     """Named values for ``DataConfig.preprocessing``.
 

From 164e1513ed5826c8d34cbfbd629645bfb9521dbf Mon Sep 17 00:00:00 2001
From: Stanislas Laurent <stnsls.lrt.accnts@gmail.com>
Date: Wed, 24 Jun 2026 04:13:36 +0200
Subject: [PATCH 15/28] feat(data): label parser family, factory, nested-group
 registration (refs #338)

---
 src/raitap/_adapters.py                       |  8 +-
 .../configs/tests/test_labels_schema.py       | 85 ++++++++++++++++++-
 src/raitap/configs/zen.py                     |  1 +
 src/raitap/data/label_parsers/__init__.py     | 14 +++
 src/raitap/data/label_parsers/base.py         | 28 ++++++
 src/raitap/data/label_parsers/directory.py    | 27 ++++++
 src/raitap/data/label_parsers/factory.py      | 34 ++++++++
 src/raitap/data/label_parsers/registration.py | 42 +++++++++
 8 files changed, 235 insertions(+), 4 deletions(-)
 create mode 100644 src/raitap/data/label_parsers/__init__.py
 create mode 100644 src/raitap/data/label_parsers/base.py
 create mode 100644 src/raitap/data/label_parsers/directory.py
 create mode 100644 src/raitap/data/label_parsers/factory.py
 create mode 100644 src/raitap/data/label_parsers/registration.py

diff --git a/src/raitap/_adapters.py b/src/raitap/_adapters.py
index e1f9078e..b58b7c2b 100644
--- a/src/raitap/_adapters.py
+++ b/src/raitap/_adapters.py
@@ -305,10 +305,14 @@ def _register_core(
         if family is not None:
             cls._adapter_group = family.group
             builder = _build_schema_adapter(cls, schema_override or family.schema)
+            # Hydra groups use ``/`` for nesting; OmegaConf packages use ``.``.
+            # A nested group like ``data/labels`` must target package
+            # ``data.labels`` so the composed node lands at ``cfg.data.labels``.
+            package_base = family.group.replace("/", ".")
             package = (
-                f"{family.group}.{registry_name}"
+                f"{package_base}.{registry_name}"
                 if family.package_style == "nested"
-                else family.group
+                else package_base
             )
             store(builder, group=family.group, name=registry_name, package=package)
             _BUILDERS.setdefault(family.group, {})[registry_name] = builder
diff --git a/src/raitap/configs/tests/test_labels_schema.py b/src/raitap/configs/tests/test_labels_schema.py
index c22ffece..64ee9c84 100644
--- a/src/raitap/configs/tests/test_labels_schema.py
+++ b/src/raitap/configs/tests/test_labels_schema.py
@@ -1,4 +1,5 @@
 import dataclasses
+import importlib
 
 import pytest
 
@@ -19,5 +20,85 @@ def test_directory_config_has_only_target() -> None:
 
 
 def test_labelformat_enum_is_gone() -> None:
-    with pytest.raises(ImportError):
-        from raitap.data.types import LabelFormat  # noqa: F401
+    import importlib
+
+    data_types = importlib.import_module("raitap.data.types")
+    with pytest.raises(AttributeError):
+        getattr(data_types, "LabelFormat")  # noqa: B009
+
+
+# Ground truth (see task-2-report.md): composing ``+data/labels=directory`` onto
+# the AppConfig schema lands the variant at ``cfg.data.labels`` with the FQN
+# ``_target_`` that hydra-zen ``builds()`` injects.
+_COMPOSED_TARGET = "raitap.data.label_parsers.directory.DirectoryLabelParser"
+
+
+def _register_labels_group() -> None:
+    """Register the ``data/labels`` group + AppConfig schema directly.
+
+    Bypasses ``register_configs()`` (which imports transparency and other
+    families that are broken mid-refactor on this branch) by importing only the
+    label_parsers package — enough to fire the ``@label_parser`` decorator — and
+    flushing the hydra-zen store. The AppConfig schema is needed as the compose
+    base so the ``data.labels`` package has a struct to land in.
+    """
+    importlib.import_module("raitap.data.label_parsers")
+    from hydra.core.config_store import ConfigStore
+
+    from raitap._adapters import store
+    from raitap.configs.schema import AppConfig
+
+    store.add_to_hydra_store(overwrite_ok=True)
+    ConfigStore.instance().store(name="raitap_schema", node=AppConfig)
+
+
+def test_directory_parser_group_lands_at_data_labels() -> None:
+    """De-risk (Path A): the nested ``data/labels`` group composes onto
+    ``cfg.data.labels`` as a single config (flat semantics at a nested path)."""
+    from hydra import compose, initialize
+    from hydra.core.global_hydra import GlobalHydra
+
+    _register_labels_group()
+    GlobalHydra.instance().clear()
+    with initialize(version_base=None, config_path=None):
+        cfg = compose(config_name="raitap_schema", overrides=["+data/labels=directory"])
+    # Assertion runs unconditionally (no swallowing). The composed value is the
+    # FQN hydra-zen stores, NOT the short dataclass default.
+    assert cfg.data.labels._target_ == _COMPOSED_TARGET
+
+
+def test_directory_group_rejects_foreign_field() -> None:
+    """De-risk (Path A): a field the directory variant lacks fails at compose.
+
+    Uses a struct-mode override (``data.labels.id_column=x`` — no ``+``) so
+    OmegaConf's struct check fires; ``+`` force-adds and would bypass it.
+    """
+    from hydra import compose, initialize
+    from hydra.core.global_hydra import GlobalHydra
+    from hydra.errors import ConfigCompositionException
+
+    _register_labels_group()
+    GlobalHydra.instance().clear()
+    with pytest.raises(ConfigCompositionException), initialize(version_base=None, config_path=None):
+        compose(
+            config_name="raitap_schema",
+            overrides=["+data/labels=directory", "data.labels.id_column=x"],
+        )
+
+
+def test_create_label_parser_handles_both_target_forms() -> None:
+    """``create_label_parser`` must instantiate for BOTH ``_target_`` shapes:
+
+    * short bare name (``DirectoryLabelsConfig()`` dataclass default), resolved
+      against the ``raitap.data.label_parsers.`` prefix;
+    * the dotted FQN hydra-zen ``builds()`` stamps on the group-composed cfg.
+    """
+    _register_labels_group()
+    from raitap.data.label_parsers.directory import DirectoryLabelParser
+    from raitap.data.label_parsers.factory import create_label_parser
+
+    short = create_label_parser(DirectoryLabelsConfig())
+    assert isinstance(short, DirectoryLabelParser)
+
+    fqn = create_label_parser({"_target_": _COMPOSED_TARGET})
+    assert isinstance(fqn, DirectoryLabelParser)
diff --git a/src/raitap/configs/zen.py b/src/raitap/configs/zen.py
index a52d04e9..c0ac3928 100644
--- a/src/raitap/configs/zen.py
+++ b/src/raitap/configs/zen.py
@@ -49,6 +49,7 @@ def register_zen_groups() -> None:
     import importlib
 
     for pkg in (
+        "raitap.data.label_parsers",
         "raitap.metrics",
         "raitap.reporting",
         "raitap.robustness",
diff --git a/src/raitap/data/label_parsers/__init__.py b/src/raitap/data/label_parsers/__init__.py
new file mode 100644
index 00000000..2b9e82ec
--- /dev/null
+++ b/src/raitap/data/label_parsers/__init__.py
@@ -0,0 +1,14 @@
+"""Label parser family package.
+
+Importing this package fires the ``@label_parser`` decorator on every
+in-tree parser module, registering them with the hydra-zen store. Each
+concrete parser is re-exported here so the short ``_target_`` form (a bare
+class name resolved against ``raitap.data.label_parsers.``) instantiates,
+mirroring how ``raitap.metrics`` re-exports its metric computers.
+"""
+
+from __future__ import annotations
+
+from .directory import DirectoryLabelParser
+
+__all__ = ["DirectoryLabelParser"]
diff --git a/src/raitap/data/label_parsers/base.py b/src/raitap/data/label_parsers/base.py
new file mode 100644
index 00000000..3c8cf47f
--- /dev/null
+++ b/src/raitap/data/label_parsers/base.py
@@ -0,0 +1,28 @@
+"""Base protocol and type alias for label parsers."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any, Protocol, runtime_checkable
+
+if TYPE_CHECKING:
+    from raitap.types import TaskKind
+
+# String spelling of the parsed-label union (a plain ``str``; not a checked type alias).
+ParsedLabels = "torch.Tensor | list[dict[str, torch.Tensor]] | None"
+
+
+@runtime_checkable
+class LabelParser(Protocol):
+    """Protocol every label-parser adapter must satisfy."""
+
+    supported_tasks: frozenset[TaskKind]
+
+    def parse(
+        self,
+        *,
+        task_kind: TaskKind,
+        tensor: Any,
+        sample_ids: list[str] | None,
+        data_source: str | None,
+        class_names: list[str] | None,
+    ) -> Any: ...
diff --git a/src/raitap/data/label_parsers/directory.py b/src/raitap/data/label_parsers/directory.py
new file mode 100644
index 00000000..f1770e22
--- /dev/null
+++ b/src/raitap/data/label_parsers/directory.py
@@ -0,0 +1,27 @@
+"""Directory label parser stub (real logic lands in Task 3)."""
+
+from __future__ import annotations
+
+from typing import Any
+
+from raitap.configs.schema import DirectoryLabelsConfig
+from raitap.data.label_parsers.registration import label_parser
+from raitap.types import TaskKind
+
+
+@label_parser(registry_name="directory", schema=DirectoryLabelsConfig)
+class DirectoryLabelParser:
+    """Parse labels from directory structure (stub; returns None until Task 3)."""
+
+    supported_tasks: frozenset[TaskKind] = frozenset({TaskKind.classification})
+
+    def parse(
+        self,
+        *,
+        task_kind: TaskKind,
+        tensor: Any,
+        sample_ids: list[str] | None,
+        data_source: str | None,
+        class_names: list[str] | None,
+    ) -> None:
+        return None
diff --git a/src/raitap/data/label_parsers/factory.py b/src/raitap/data/label_parsers/factory.py
new file mode 100644
index 00000000..b8d6cfd3
--- /dev/null
+++ b/src/raitap/data/label_parsers/factory.py
@@ -0,0 +1,34 @@
+"""Instantiation factory for label parsers (mirrors ``metrics/factory.py``)."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any
+
+from hydra.utils import instantiate
+
+from raitap import raitap_log
+from raitap.configs import cfg_to_dict, resolve_target
+
+if TYPE_CHECKING:
+    from raitap.data.label_parsers.base import LabelParser
+
+_LABELS_PREFIX = "raitap.data.label_parsers."
+
+
+def create_label_parser(labels_config: Any) -> LabelParser:
+    """Instantiate a label parser from Hydra-style config (``_target_`` + kwargs)."""
+    labels_cfg = cfg_to_dict(labels_config)
+    target_path: str = labels_cfg.get("_target_", "")
+    resolved_target = resolve_target(target_path, _LABELS_PREFIX)
+    labels_cfg["_target_"] = resolved_target
+
+    try:
+        parser = instantiate(labels_cfg)
+    except Exception as e:
+        raitap_log.exception("Label parser instantiation failed for target %r", target_path)
+        raise ValueError(
+            f"Could not instantiate label parser {target_path!r}.\n"
+            "Check that _target_ points to a valid LabelParser implementation."
+        ) from e
+
+    return parser
diff --git a/src/raitap/data/label_parsers/registration.py b/src/raitap/data/label_parsers/registration.py
new file mode 100644
index 00000000..3ade1ab7
--- /dev/null
+++ b/src/raitap/data/label_parsers/registration.py
@@ -0,0 +1,42 @@
+"""Family decorator for label-parser adapters.
+
+Mirrors ``raitap.metrics.registration`` exactly, with group ``data/labels``
+and ``package_style="flat"`` so composed configs land at ``cfg.data.labels``.
+"""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, TypeVar, Unpack
+
+from raitap._adapters import AdapterDecoratorOptions, FamilyConfig, _register_core
+from raitap.configs.schema import LabelsConfig
+
+if TYPE_CHECKING:
+    from collections.abc import Callable
+
+    from raitap.data.label_parsers.base import LabelParser
+
+# ``flat``: ``DataConfig.labels`` is a single ``LabelsConfig`` (not a dict of
+# named entries), so the composed variant lands directly at ``cfg.data.labels``
+# (package ``data.labels``), with parser names competing for that one slot.
+LABELS = FamilyConfig(
+    group="data/labels",
+    schema=LabelsConfig,
+    package_style="flat",
+)
+
+T = TypeVar("T", bound="LabelParser")
+
+
+def label_parser(
+    **common: Unpack[AdapterDecoratorOptions],
+) -> Callable[[type[T]], type[T]]:
+    """Decorator: register a label-parser adapter.
+
+    ``registry_name`` is required. Mirrors ``metrics_adapter`` shape.
+    """
+
+    def wrap(cls: type[T]) -> type[T]:
+        return _register_core(cls, family=LABELS, **common)
+
+    return wrap

From f97979b9cb6df5a5e76ee055bd5831085312c94c Mon Sep 17 00:00:00 2001
From: Stanislas Laurent <stnsls.lrt.accnts@gmail.com>
Date: Wed, 24 Jun 2026 04:24:32 +0200
Subject: [PATCH 16/28] refactor(data): resolve labels via parser, drop
 TaskFamily.load_labels (refs #338)

---
 src/raitap/data/data.py                     |  79 +++++++++------
 src/raitap/data/label_parsers/directory.py  |  36 ++++++-
 src/raitap/data/tests/test_label_parsers.py | 106 ++++++++++++++++++++
 src/raitap/task_families/base.py            |   4 -
 src/raitap/task_families/classification.py  |   5 -
 src/raitap/task_families/detection.py       |  60 -----------
 6 files changed, 184 insertions(+), 106 deletions(-)
 create mode 100644 src/raitap/data/tests/test_label_parsers.py

diff --git a/src/raitap/data/data.py b/src/raitap/data/data.py
index 9b81806d..46c4ee8d 100644
--- a/src/raitap/data/data.py
+++ b/src/raitap/data/data.py
@@ -12,12 +12,10 @@
 from raitap import raitap_log
 from raitap.data.preprocessing import module_as_per_image_callable, resolve_preprocessing
 from raitap.data.types import (
-    DIRECTORY_LABELS_SOURCE,
     MODALITY_EXTENSIONS,
     IdStrategy,
     InputModality,
     LabelEncoding,
-    LabelFormat,
 )
 from raitap.data.utils import download_file
 from raitap.tracking.base_tracker import BaseTracker, Trackable
@@ -75,7 +73,9 @@ def __init__(
         self.tensor = family.adapt_loaded_inputs(raw_tensor)
         family.validate_inputs(self.tensor)
         self.labels: torch.Tensor | list[dict[str, torch.Tensor]] | None
-        self.labels = family.load_labels(cfg, tensor=self.tensor, sample_ids=self.sample_ids)
+        self.labels = _resolve_and_parse_labels(
+            cfg, task_kind=self.task_kind, tensor=self.tensor, sample_ids=self.sample_ids
+        )
         family.validate_labels(self.labels)
 
     def _load_data(
@@ -237,6 +237,45 @@ def log(self, tracker: BaseTracker, **kwargs: Any) -> None:
         tracker.log_dataset(self.describe())
 
 
+def _resolve_and_parse_labels(
+    cfg: Any,
+    *,
+    task_kind: TaskKind,
+    tensor: Any,
+    sample_ids: list[str] | None,
+) -> Any:
+    """Build the parser configured at ``cfg.data.labels`` and run it.
+
+    Returns ``None`` when ``cfg.data.labels`` is unset, otherwise the result of
+    ``parser.parse``. Raises ``ValueError`` when ``task_kind`` is not among the
+    parser's ``supported_tasks``.
+    """
+    from raitap.data.label_parsers.factory import create_label_parser
+
+    labels_cfg = _get_optional_config_value(cfg.data, "labels")
+    if labels_cfg is None:
+        return None
+
+    parser = create_label_parser(labels_cfg)
+    accepted = parser.supported_tasks
+    if task_kind not in accepted:
+        supported = ", ".join(sorted(map(str, accepted)))
+        raise ValueError(
+            f"{type(parser).__name__} does not support task_kind={task_kind!r}. "
+            f"Supported tasks: {supported}."
+        )
+
+    data_source = _get_optional_config_value(cfg.data, "source")
+    model_cfg = getattr(cfg, "model", None)
+    return parser.parse(
+        task_kind=task_kind,
+        tensor=tensor,
+        sample_ids=sample_ids,
+        data_source=data_source,
+        class_names=_get_optional_config_value(model_cfg, "class_names"),
+    )
+
+
 def _load_directory_labels(sample_ids: list[str] | None) -> torch.Tensor | None:
     """Derive classification labels from each sample's top-level class folder
     (torchvision ImageFolder semantics). Returns None (with a warning) when
@@ -273,42 +312,16 @@ def load_classification_labels(
     Aligns to ``sample_ids`` by id column when available, otherwise falls back
     to row order. Returns ``None`` when ``data.labels.source`` is unset, the
     file is empty, or alignment fails (callers then use predictions as targets).
+
+    Note: directory and format-adapter branches have moved to dedicated
+    ``LabelParser`` implementations. This function handles the tabular (native)
+    path only and will be wrapped by ``TabularLabelParser`` in a later task.
     """
     labels_cfg = _get_optional_config_value(cfg.data, "labels")
     labels_source = _get_optional_config_value(labels_cfg, "source")
     if not labels_source:
         return None
 
-    if labels_source == DIRECTORY_LABELS_SOURCE:
-        return _load_directory_labels(sample_ids)
-
-    labels_format = _get_optional_config_value(labels_cfg, "format") or LabelFormat.native
-    if labels_format != LabelFormat.native:
-        from raitap.data.label_formats import resolve_label_format_adapter
-
-        if not sample_ids:
-            raise ValueError(
-                f"Label format {LabelFormat(labels_format).value!r} requires "
-                "id-based alignment, but no sample ids were discovered."
-            )
-        labels_path = get_source_path(labels_source, kind=SourceKind.LABELS)
-        adapter = resolve_label_format_adapter(
-            LabelFormat(labels_format), task_kind=TaskKind.classification
-        )
-        records = adapter.to_classification_records(labels_path)
-        id_series = pd.Series([r["sample_id"] for r in records])
-        record_labels = [int(r["label"]) for r in records]
-        strategy = _resolve_id_strategy(
-            _get_optional_config_value(labels_cfg, "id_strategy") or "auto", id_series
-        )
-        aligned = _align_labels_to_samples(
-            sample_ids=sample_ids,
-            raw_label_ids=id_series,
-            encoded_labels=record_labels,
-            strategy=strategy,
-        )
-        return torch.tensor(aligned, dtype=torch.long)
-
     labels_path = get_source_path(labels_source, kind=SourceKind.LABELS)
     labels_df = _load_tabular_frame(labels_path)
     if labels_df.empty:
diff --git a/src/raitap/data/label_parsers/directory.py b/src/raitap/data/label_parsers/directory.py
index f1770e22..78cb5ffe 100644
--- a/src/raitap/data/label_parsers/directory.py
+++ b/src/raitap/data/label_parsers/directory.py
@@ -1,17 +1,26 @@
-"""Directory label parser stub (real logic lands in Task 3)."""
+"""Directory label parser (torchvision ImageFolder semantics)."""
 
 from __future__ import annotations
 
+from pathlib import PurePosixPath
 from typing import Any
 
+from raitap import raitap_log
 from raitap.configs.schema import DirectoryLabelsConfig
 from raitap.data.label_parsers.registration import label_parser
 from raitap.types import TaskKind
+from raitap.utils.lazy import lazy_import
+
+torch = lazy_import("torch")
 
 
 @label_parser(registry_name="directory", schema=DirectoryLabelsConfig)
 class DirectoryLabelParser:
-    """Parse labels from directory structure (stub; returns None until Task 3)."""
+    """Parse classification labels from the top-level class subfolder of each sample.
+
+    Mirrors torchvision ``ImageFolder`` semantics: ``<class>/<file>`` layout.
+    Uses ``sample_ids`` only; ignores ``data_source`` and ``class_names``.
+    """
 
     supported_tasks: frozenset[TaskKind] = frozenset({TaskKind.classification})
 
@@ -23,5 +32,24 @@ def parse(
         sample_ids: list[str] | None,
         data_source: str | None,
         class_names: list[str] | None,
-    ) -> None:
-        return None
+    ) -> Any:
+        """Derive a long-tensor of class indices from sample_ids directory layout."""
+        if not sample_ids:
+            raitap_log.warn(
+                "DirectoryLabelParser needs image samples organised into "
+                "class subdirectories; none were found. Falling back to "
+                "predictions as metric targets."
+            )
+            return None
+        parts_by_id = [PurePosixPath(sid).parts for sid in sample_ids]
+        if any(len(parts) < 2 for parts in parts_by_id):
+            raitap_log.warn(
+                "DirectoryLabelParser expects a <class>/<file> layout, but "
+                "one or more samples sit directly under the data source root "
+                "(no class subdirectory). Falling back to predictions as metric targets."
+            )
+            return None
+        classes = sorted({parts[0] for parts in parts_by_id})
+        class_to_idx = {name: idx for idx, name in enumerate(classes)}
+        labels = [class_to_idx[parts[0]] for parts in parts_by_id]
+        return torch.tensor(labels, dtype=torch.long)
diff --git a/src/raitap/data/tests/test_label_parsers.py b/src/raitap/data/tests/test_label_parsers.py
new file mode 100644
index 00000000..f31351cf
--- /dev/null
+++ b/src/raitap/data/tests/test_label_parsers.py
@@ -0,0 +1,106 @@
+"""Task 3 tests: _resolve_and_parse_labels + DirectoryLabelParser e2e."""
+
+from __future__ import annotations
+
+import importlib
+from types import SimpleNamespace
+from typing import cast
+
+import pytest
+
+from raitap.configs.schema import AppConfig, DirectoryLabelsConfig
+from raitap.data.data import _resolve_and_parse_labels
+from raitap.types import TaskKind
+
+
+def _make_cfg(
+    *,
+    labels: object = None,
+    source: str | None = None,
+    class_names: list[str] | None = None,
+) -> AppConfig:
+    """Return a ``SimpleNamespace`` stand-in exposing ``data`` and ``model`` like AppConfig."""
+    stub = SimpleNamespace(model=SimpleNamespace(class_names=class_names))
+    stub.data = SimpleNamespace(labels=labels, source=source)
+    return cast("AppConfig", stub)
+
+
+def test_resolve_returns_none_when_labels_is_none() -> None:
+    # No ``data.labels`` entry means there is nothing to parse.
+    outcome = _resolve_and_parse_labels(
+        _make_cfg(labels=None), task_kind=TaskKind.classification, tensor=None, sample_ids=None
+    )
+    assert outcome is None
+
+
+def test_directory_parser_e2e_returns_label_tensor() -> None:
+    """Top-level folder names become class indices, numbered in sorted order."""
+    import torch
+
+    labels = _resolve_and_parse_labels(
+        _make_cfg(labels=DirectoryLabelsConfig()),
+        task_kind=TaskKind.classification,
+        tensor=None,
+        sample_ids=["cat/a.jpg", "dog/b.jpg"],
+    )
+    # ``isinstance`` also rules out ``None``.
+    assert isinstance(labels, torch.Tensor)
+    assert labels.dtype == torch.long
+    assert labels.tolist() == [0, 1]  # sorted: "cat" -> 0, "dog" -> 1
+
+
+def test_directory_parser_raises_for_unsupported_task() -> None:
+    """The directory parser is classification-only, so detection is rejected."""
+    cfg = _make_cfg(labels=DirectoryLabelsConfig())
+    ids = ["cat/a.jpg", "dog/b.jpg"]
+    # The task gate fires before ``parse`` is ever called.
+    with pytest.raises(ValueError, match="does not support task_kind"):
+        _resolve_and_parse_labels(cfg, task_kind=TaskKind.detection, tensor=None, sample_ids=ids)
+
+
+def test_directory_parser_returns_none_for_no_sample_ids() -> None:
+    """Without sample ids the directory parser degrades to ``None`` instead of raising."""
+    directory_cfg = _make_cfg(labels=DirectoryLabelsConfig())
+    labels = _resolve_and_parse_labels(
+        directory_cfg, task_kind=TaskKind.classification, tensor=None, sample_ids=None
+    )
+    assert labels is None
+
+
+def test_directory_parser_returns_none_for_flat_layout() -> None:
+    """Ids with no class folder component cannot be labelled, so the result is ``None``."""
+    flat_ids = ["a.jpg", "b.jpg"]  # no ``<class>/`` component in either id
+    directory_cfg = _make_cfg(labels=DirectoryLabelsConfig())
+    labels = _resolve_and_parse_labels(
+        directory_cfg, task_kind=TaskKind.classification, tensor=None, sample_ids=flat_ids
+    )
+    assert labels is None
+
+
+# --- Integration: full hydra compose path ---
+
+
+def _register_labels_group() -> None:
+    importlib.import_module("raitap.data.label_parsers")  # side effect: registers the parsers
+    from hydra.core.config_store import ConfigStore
+
+    from raitap._adapters import store
+    from raitap.configs.schema import AppConfig
+
+    store.add_to_hydra_store(overwrite_ok=True)
+    ConfigStore.instance().store(name="raitap_schema", node=AppConfig)
+
+
+_COMPOSED_TARGET = "raitap.data.label_parsers.directory.DirectoryLabelParser"
+
+
+def test_integration_compose_data_labels_directory() -> None:
+    """Composing +data/labels=directory lands cfg.data.labels._target_ at the FQN."""
+    from hydra import compose, initialize
+    from hydra.core.global_hydra import GlobalHydra
+
+    _register_labels_group()
+    GlobalHydra.instance().clear()  # presumably clears Hydra state left by other tests
+    with initialize(version_base=None, config_path=None):
+        cfg = compose(config_name="raitap_schema", overrides=["+data/labels=directory"])
+    assert cfg.data.labels._target_ == _COMPOSED_TARGET
diff --git a/src/raitap/task_families/base.py b/src/raitap/task_families/base.py
index 167cadad..848c4051 100644
--- a/src/raitap/task_families/base.py
+++ b/src/raitap/task_families/base.py
@@ -96,10 +96,6 @@ def validate_inputs(self, tensor: object) -> None:
         """Validate the (post-adapt) inputs match this family's contract."""
         raise NotImplementedError
 
-    def load_labels(self, cfg: AppConfig, *, tensor: object, sample_ids: object) -> Any:
-        """Load labels in this family's on-disk shape (or None)."""
-        raise NotImplementedError
-
     def validate_labels(self, labels: object) -> None:
         """Raise if loaded labels don't match this family's expected shape."""
         raise NotImplementedError
diff --git a/src/raitap/task_families/classification.py b/src/raitap/task_families/classification.py
index 4f86b759..3f8c1bf1 100644
--- a/src/raitap/task_families/classification.py
+++ b/src/raitap/task_families/classification.py
@@ -61,11 +61,6 @@ def validate_inputs(self, tensor: Any) -> None:
         if tensor.shape[0] < 1:
             raise ValueError("Classification data is empty; loaded zero samples.")
 
-    def load_labels(self, cfg: Any, *, tensor: Any, sample_ids: Any) -> Any:
-        from raitap.data.data import load_classification_labels
-
-        return load_classification_labels(cfg, tensor=tensor, sample_ids=sample_ids)
-
     def validate_labels(self, labels: Any) -> None:
         # A ``list[dict]`` is a detection-shaped label set; a tensor (or None)
         # is classification-shaped. Disagreement means model and data declare
diff --git a/src/raitap/task_families/detection.py b/src/raitap/task_families/detection.py
index 97d6b2d3..eab54c78 100644
--- a/src/raitap/task_families/detection.py
+++ b/src/raitap/task_families/detection.py
@@ -155,66 +155,6 @@ def validate_inputs(self, tensor: Any) -> None:
                     + (f" with shape {shape}." if shape is not None else ".")
                 )
 
-    def load_labels(self, cfg: Any, *, tensor: Any, sample_ids: Any) -> Any:
-        """Load per-sample detection targets (boxes + labels).
-
-        Expected on-disk shape: JSON file (list of records) with each record
-        carrying ``sample_id`` (str), ``boxes`` (list of ``[x1, y1, x2, y2]``
-        floats), and ``labels`` (list of ints). Returns a list whose length
-        equals ``len(tensor)``; each entry is a dict with
-        ``boxes: (M_i, 4) float32`` and ``labels: (M_i,) int64`` tensors.
-        Samples with no boxes get shape-``(0, 4)`` / shape-``(0,)`` tensors.
-
-        Alignment rules:
-
-        * When ``sample_ids`` is set, records are looked up by ``sample_id``
-          and the output is ordered to match ``sample_ids``. Any sample
-          missing from the labels file → ``ValueError``; duplicate ``sample_id``s
-          in the labels file → ``ValueError``.
-        * When ``sample_ids`` is unset, records are consumed in file order
-          and must equal the dataset length exactly.
-
-        Returns ``None`` when ``data.labels.source`` is unset.
-        """
-        import json
-
-        from raitap.data.data import (
-            SourceKind,
-            _get_optional_config_value,
-            get_source_path,
-        )
-
-        labels_cfg = _get_optional_config_value(cfg.data, "labels")
-        labels_source = _get_optional_config_value(labels_cfg, "source")
-        if not labels_source:
-            return None
-
-        labels_path = get_source_path(labels_source, kind=SourceKind.LABELS)
-
-        from raitap.data.types import LabelFormat
-
-        fmt = _get_optional_config_value(labels_cfg, "format") or LabelFormat.native
-        if fmt == LabelFormat.native:
-            with labels_path.open() as fh:
-                records = json.load(fh)
-            if not isinstance(records, list):
-                raise ValueError(f"Detection labels file {labels_path} must be a JSON array.")
-        else:
-            from raitap.data.label_formats import resolve_label_format_adapter
-
-            data_source = _get_optional_config_value(cfg.data, "source")
-            image_dir = get_source_path(data_source, kind=SourceKind.DATA) if data_source else None
-            class_names = (
-                _get_optional_config_value(cfg.model, "class_names")
-                if hasattr(cfg, "model")
-                else None
-            )
-            adapter = resolve_label_format_adapter(LabelFormat(fmt), task_kind=self.kind)
-            records = adapter.to_detection_records(
-                labels_path, image_dir=image_dir, class_names=class_names
-            )
-        return _align_detection_records(records, expected=len(tensor), sample_ids=sample_ids)
-
     def validate_labels(self, labels: Any) -> None:
         # The detection loader returns ``list[dict]`` or ``None``. A bare tensor
         # is a classification-shaped label set; disagreement means model and

From 4970e2cd1d1232b34b80809824c3782103202055 Mon Sep 17 00:00:00 2001
From: Stanislas Laurent <stnsls.lrt.accnts@gmail.com>
Date: Wed, 24 Jun 2026 04:30:34 +0200
Subject: [PATCH 17/28] feat(data): TabularLabelParser (refs #338)

---
 src/raitap/data/label_parsers/__init__.py   |   3 +-
 src/raitap/data/label_parsers/tabular.py    | 109 ++++++++++++++++++++
 src/raitap/data/tests/test_label_parsers.py |  63 +++++++++++
 3 files changed, 174 insertions(+), 1 deletion(-)
 create mode 100644 src/raitap/data/label_parsers/tabular.py

diff --git a/src/raitap/data/label_parsers/__init__.py b/src/raitap/data/label_parsers/__init__.py
index 2b9e82ec..70234d3b 100644
--- a/src/raitap/data/label_parsers/__init__.py
+++ b/src/raitap/data/label_parsers/__init__.py
@@ -10,5 +10,6 @@ class name resolved against ``raitap.data.label_parsers.``) instantiates,
 from __future__ import annotations
 
 from .directory import DirectoryLabelParser
+from .tabular import TabularLabelParser  # pyright: ignore[reportUnusedImport]
 
-__all__ = ["DirectoryLabelParser"]
+__all__ = ["DirectoryLabelParser", "TabularLabelParser"]
diff --git a/src/raitap/data/label_parsers/tabular.py b/src/raitap/data/label_parsers/tabular.py
new file mode 100644
index 00000000..529c4455
--- /dev/null
+++ b/src/raitap/data/label_parsers/tabular.py
@@ -0,0 +1,109 @@
+"""Tabular label parser (CSV / TSV / Parquet)."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any
+
+from raitap import raitap_log
+from raitap.configs.schema import TabularLabelsConfig
+from raitap.data.data import (
+    SourceKind,
+    _align_labels_to_samples,
+    _column_as_series,
+    _extract_class_labels,
+    _load_tabular_frame,
+    _resolve_id_strategy,
+    _resolve_labels_id_column,
+    get_source_path,
+)
+from raitap.data.label_parsers.registration import label_parser
+from raitap.data.types import IdStrategy
+from raitap.types import TaskKind
+from raitap.utils.lazy import lazy_import
+
+if TYPE_CHECKING:
+    import torch
+
+torch = lazy_import("torch")  # type: ignore[assignment]
+
+
+@label_parser(registry_name="tabular", schema=TabularLabelsConfig)
+class TabularLabelParser:
+    """Classification labels read from a CSV, TSV, or Parquet table.
+
+    When ``sample_ids`` and an id column are both available, rows are aligned
+    to the samples by id; otherwise labels are taken in row order. ``parse``
+    returns ``None`` for an empty file, a failed alignment, or a count mismatch.
+    """
+
+    supported_tasks: frozenset[TaskKind] = frozenset({TaskKind.classification})
+
+    def __init__(
+        self,
+        *,
+        source: str,
+        id_column: str | None = None,
+        column: str | None = None,
+        encoding: Any = None,
+        id_strategy: IdStrategy = IdStrategy.auto,
+    ) -> None:
+        self.source = source
+        self.id_column = id_column
+        self.column = column
+        self.encoding = encoding
+        self.id_strategy = id_strategy
+
+    def parse(
+        self,
+        *,
+        task_kind: TaskKind,
+        tensor: Any,
+        sample_ids: list[str] | None,
+        data_source: str | None,
+        class_names: list[str] | None,
+    ) -> Any:
+        """Return a ``torch.long`` label tensor, or ``None`` to fall back to predictions."""
+        frame = _load_tabular_frame(get_source_path(self.source, kind=SourceKind.LABELS))
+        if frame.empty:
+            raitap_log.warn("Labels file is empty; falling back to predictions as targets.")
+            return None
+
+        id_col = _resolve_labels_id_column(frame, self.id_column)
+        encoded = _extract_class_labels(
+            frame, labels_column=self.column, id_column=id_col, labels_encoding=self.encoding
+        )
+
+        # Without a tensor there is no sample count to check the row-order labels against.
+        expected = len(encoded) if tensor is None else len(tensor)
+        if sample_ids and id_col:
+            label_ids = _column_as_series(frame, id_col)
+            strategy = _resolve_id_strategy(str(self.id_strategy), label_ids)
+            # Alignment errors are downgraded to a warning, not propagated.
+            try:
+                aligned = _align_labels_to_samples(
+                    sample_ids=sample_ids,
+                    raw_label_ids=label_ids,
+                    encoded_labels=encoded,
+                    strategy=strategy,
+                )
+            except ValueError as error:
+                raitap_log.warn(
+                    f"{error} Falling back to predictions as metric targets.",
+                )
+                return None
+            return torch.tensor(aligned, dtype=torch.long)
+
+        if sample_ids:
+            # Reaching here with sample ids means no id column was resolved.
+            raitap_log.warn(
+                "Could not find a labels id column for filename alignment; using row-order labels.",
+            )
+
+        if len(encoded) != expected:
+            raitap_log.warn(
+                f"Label count ({len(encoded)}) does not match sample count ({expected}); "
+                "falling back to predictions as targets.",
+            )
+            return None
+
+        return torch.tensor(encoded, dtype=torch.long)
diff --git a/src/raitap/data/tests/test_label_parsers.py b/src/raitap/data/tests/test_label_parsers.py
index f31351cf..d6ab391e 100644
--- a/src/raitap/data/tests/test_label_parsers.py
+++ b/src/raitap/data/tests/test_label_parsers.py
@@ -104,3 +104,66 @@ def test_integration_compose_data_labels_directory() -> None:
     with initialize(version_base=None, config_path=None):
         cfg = compose(config_name="raitap_schema", overrides=["+data/labels=directory"])
     assert cfg.data.labels._target_ == _COMPOSED_TARGET
+
+
+# --- Task 4: TabularLabelParser ---
+
+
+def _write_csv(path: object, content: str) -> None:
+    from pathlib import Path
+
+    Path(str(path)).write_text(content, encoding="utf-8")  # ``path``: anything str()-able
+
+
+def test_tabular_parser_e2e_via_resolve_and_parse_labels(tmp_path: object) -> None:
+    """Rows listed out of sample order are re-aligned by the ``image`` id column."""
+    from pathlib import Path
+
+    import torch
+
+    from raitap.configs.schema import TabularLabelsConfig
+    from raitap.data.data import _resolve_and_parse_labels
+
+    labels_file = Path(str(tmp_path)) / "labels.csv"
+    # The file lists b.jpg first, so a row-order read would yield [1, 0].
+    _write_csv(labels_file, "image,label\nb.jpg,1\na.jpg,0\n")
+
+    tabular_cfg = TabularLabelsConfig(source=str(labels_file), id_column="image")
+    labels = _resolve_and_parse_labels(
+        _make_cfg(labels=tabular_cfg),
+        task_kind=TaskKind.classification,
+        tensor=None,
+        sample_ids=["a.jpg", "b.jpg"],
+    )
+
+    assert labels is not None
+    assert isinstance(labels, torch.Tensor)
+    assert labels.dtype == torch.long
+    # Output follows sample order: a.jpg -> 0, b.jpg -> 1.
+    assert labels.tolist() == [0, 1]
+
+
+
+def test_tabular_parser_direct_unit(tmp_path: object) -> None:
+    """Call ``TabularLabelParser.parse`` directly, bypassing config resolution."""
+    from pathlib import Path
+
+    import torch
+
+    from raitap.data.label_parsers.tabular import TabularLabelParser
+
+    labels_file = Path(str(tmp_path)) / "labels.csv"
+    _write_csv(labels_file, "image,label\na.jpg,0\nb.jpg,1\n")
+
+    tabular = TabularLabelParser(source=str(labels_file), id_column="image")
+    labels = tabular.parse(
+        task_kind=TaskKind.classification,
+        tensor=None,
+        sample_ids=["a.jpg", "b.jpg"],
+        data_source=None,
+        class_names=None,
+    )
+    assert labels is not None
+    assert isinstance(labels, torch.Tensor)
+    assert labels.dtype == torch.long
+    assert labels.tolist() == [0, 1]

From e9907dc8ed44fc5875bd7c285f0d6d0e75523876 Mon Sep 17 00:00:00 2001
From: Stanislas Laurent <stnsls.lrt.accnts@gmail.com>
Date: Wed, 24 Jun 2026 04:38:30 +0200
Subject: [PATCH 18/28] feat(data): CocoLabelParser detection and
 classification (refs #338)

---
 src/raitap/data/label_parsers/__init__.py   |   3 +-
 src/raitap/data/label_parsers/coco.py       | 124 +++++++++++++++
 src/raitap/data/tests/test_label_parsers.py | 168 ++++++++++++++++++++
 3 files changed, 294 insertions(+), 1 deletion(-)
 create mode 100644 src/raitap/data/label_parsers/coco.py

diff --git a/src/raitap/data/label_parsers/__init__.py b/src/raitap/data/label_parsers/__init__.py
index 70234d3b..6d122d82 100644
--- a/src/raitap/data/label_parsers/__init__.py
+++ b/src/raitap/data/label_parsers/__init__.py
@@ -9,7 +9,8 @@ class name resolved against ``raitap.data.label_parsers.``) instantiates,
 
 from __future__ import annotations
 
+from .coco import CocoLabelParser  # pyright: ignore[reportUnusedImport]
 from .directory import DirectoryLabelParser
 from .tabular import TabularLabelParser  # pyright: ignore[reportUnusedImport]
 
-__all__ = ["DirectoryLabelParser", "TabularLabelParser"]
+__all__ = ["CocoLabelParser", "DirectoryLabelParser", "TabularLabelParser"]
diff --git a/src/raitap/data/label_parsers/coco.py b/src/raitap/data/label_parsers/coco.py
new file mode 100644
index 00000000..71c57011
--- /dev/null
+++ b/src/raitap/data/label_parsers/coco.py
@@ -0,0 +1,124 @@
+"""COCO label parser (detection + classification)."""
+
+from __future__ import annotations
+
+import json
+from typing import TYPE_CHECKING, Any
+
+from raitap.configs.schema import CocoLabelsConfig
+from raitap.data.data import (
+    SourceKind,
+    _align_labels_to_samples,
+    _resolve_id_strategy,
+    get_source_path,
+)
+from raitap.data.label_parsers.registration import label_parser
+from raitap.data.types import IdStrategy
+from raitap.task_families.detection import _align_detection_records
+from raitap.types import TaskKind
+
+if TYPE_CHECKING:
+    from pathlib import Path
+
+    import pandas as pd
+
+
+@label_parser(registry_name="coco", schema=CocoLabelsConfig)
+class CocoLabelParser:
+    """Parse COCO ``instances.json`` labels for detection or classification.
+
+    Detection: ``bbox`` ``[x, y, w, h]`` -> ``[x1, y1, x2, y2]``; ``category_id``
+    passes through unchanged. Classification: one category per image; images with
+    0 or >1 categories raise ValueError, as does a missing ``sample_ids`` list.
+    """
+
+    supported_tasks: frozenset[TaskKind] = frozenset({TaskKind.detection, TaskKind.classification})
+
+    def __init__(
+        self,
+        *,
+        source: str,
+        id_strategy: IdStrategy = IdStrategy.auto,
+    ) -> None:
+        self.source = source
+        self.id_strategy = id_strategy
+
+    # --- internal helpers (ported from adapters/coco.py) ---
+
+    def _load(self, source: Path) -> dict[str, Any]:
+        # JSON is UTF-8 by spec; don't depend on the platform's default encoding.
+        with source.open(encoding="utf-8") as fh:
+            data = json.load(fh)
+        if not isinstance(data, dict) or "images" not in data:
+            raise ValueError(f"COCO file {source} must be an object with an 'images' array.")
+        return data
+
+    def _to_detection_records(self, data: dict[str, Any]) -> list[dict[str, Any]]:
+        file_by_image: dict[int, str] = {img["id"]: img["file_name"] for img in data["images"]}
+        boxes: dict[int, list[list[float]]] = {iid: [] for iid in file_by_image}
+        labels: dict[int, list[int]] = {iid: [] for iid in file_by_image}
+        for ann in data.get("annotations", []):
+            iid = ann["image_id"]
+            x, y, w, h = ann["bbox"]
+            boxes[iid].append([x, y, x + w, y + h])
+            labels[iid].append(int(ann["category_id"]))
+        return [
+            {"sample_id": file_by_image[iid], "boxes": boxes[iid], "labels": labels[iid]}
+            for iid in file_by_image
+        ]
+
+    def _to_classification_records(self, data: dict[str, Any]) -> list[dict[str, Any]]:
+        file_by_image: dict[int, str] = {img["id"]: img["file_name"] for img in data["images"]}
+        cats: dict[int, set[int]] = {iid: set() for iid in file_by_image}
+        for ann in data.get("annotations", []):
+            cats[ann["image_id"]].add(int(ann["category_id"]))
+        records: list[dict[str, Any]] = []
+        for iid, name in file_by_image.items():
+            cat_set = cats[iid]
+            if len(cat_set) != 1:
+                raise ValueError(
+                    f"COCO classification needs exactly one category per image; "
+                    f"image {name!r} has {len(cat_set)}."
+                )
+            records.append({"sample_id": name, "label": next(iter(cat_set))})
+        return records
+
+    # --- public parse method ---
+
+    def parse(
+        self,
+        *,
+        task_kind: TaskKind,
+        tensor: Any,
+        sample_ids: list[str] | None,
+        data_source: str | None,
+        class_names: list[str] | None,
+    ) -> Any:
+        """Load COCO labels and align them to the samples for ``task_kind``.
+
+        Detection delegates to ``_align_detection_records``. Classification returns
+        a ``torch.long`` tensor and raises ``ValueError`` without ``sample_ids``.
+        """
+        import pandas as pd
+        import torch
+
+        data = self._load(get_source_path(self.source, kind=SourceKind.LABELS))
+        if task_kind is TaskKind.detection:
+            records = self._to_detection_records(data)
+            return _align_detection_records(records, expected=len(tensor), sample_ids=sample_ids)
+
+        if not sample_ids:
+            # Same guard the pre-parser loader had: COCO rows can only be matched by id.
+            raise ValueError(
+                "Label format 'coco' requires id-based alignment, "
+                "but no sample ids were discovered."
+            )
+        records = self._to_classification_records(data)
+        id_series: pd.Series = pd.Series([r["sample_id"] for r in records])
+        aligned = _align_labels_to_samples(
+            sample_ids=sample_ids,
+            raw_label_ids=id_series,
+            encoded_labels=[r["label"] for r in records],
+            strategy=_resolve_id_strategy(str(self.id_strategy), id_series),
+        )
+        return torch.tensor(aligned, dtype=torch.long)
diff --git a/src/raitap/data/tests/test_label_parsers.py b/src/raitap/data/tests/test_label_parsers.py
index d6ab391e..295bb4dc 100644
--- a/src/raitap/data/tests/test_label_parsers.py
+++ b/src/raitap/data/tests/test_label_parsers.py
@@ -167,3 +167,171 @@ def test_tabular_parser_direct_unit(tmp_path: object) -> None:
     assert isinstance(result, torch.Tensor)
     assert result.dtype == torch.long
     assert result.tolist() == [0, 1]
+
+
+# --- Task 5: CocoLabelParser ---
+
+
+def _write_json(path: object, data: object) -> None:
+    import json
+    import pathlib
+
+    pathlib.Path(str(path)).write_text(json.dumps(data), encoding="utf-8")
+
+
+def _coco_detection_fixture(tmp_path: object) -> object:
+    """Two-image COCO with one annotated image and one empty image."""
+    import pathlib
+
+    coco = {
+        "images": [
+            {"id": 1, "file_name": "a.jpg"},
+            {"id": 2, "file_name": "b.jpg"},
+        ],
+        "annotations": [
+            {"image_id": 1, "category_id": 3, "bbox": [10, 20, 30, 40]},
+            {"image_id": 1, "category_id": 5, "bbox": [0, 0, 5, 5]},
+        ],
+        "categories": [{"id": 3, "name": "car"}, {"id": 5, "name": "dog"}],
+    }
+    p = pathlib.Path(str(tmp_path)) / "instances.json"
+    _write_json(p, coco)
+    return p
+
+
+def _coco_classification_fixture(tmp_path: object) -> object:
+    """Two-image COCO for classification (one category per image)."""
+    import pathlib
+
+    coco = {
+        "images": [
+            {"id": 1, "file_name": "a.jpg"},
+            {"id": 2, "file_name": "b.jpg"},
+        ],
+        "annotations": [
+            {"image_id": 1, "category_id": 0, "bbox": [0, 0, 1, 1]},
+            {"image_id": 2, "category_id": 4, "bbox": [0, 0, 1, 1]},
+        ],
+        "categories": [{"id": 0, "name": "x"}, {"id": 4, "name": "y"}],
+    }
+    p = pathlib.Path(str(tmp_path)) / "cls.json"
+    _write_json(p, coco)
+    return p
+
+
+def test_coco_parser_detection_direct(tmp_path: object) -> None:
+    """CocoLabelParser.parse detection: boxes xyxy, labels, empty-image shape."""
+    import torch
+
+    from raitap.data.label_parsers.coco import CocoLabelParser
+
+    labels_path = _coco_detection_fixture(tmp_path)
+    parser = CocoLabelParser(source=str(labels_path))
+    tensor = [object(), object()]  # two samples
+    result = parser.parse(
+        task_kind=TaskKind.detection,
+        tensor=tensor,
+        sample_ids=["a.jpg", "b.jpg"],
+        data_source=None,
+        class_names=None,
+    )
+    assert isinstance(result, list)
+    assert len(result) == 2
+    # a.jpg: two boxes, xyxy conversion
+    expected_boxes = torch.tensor([[10.0, 20.0, 40.0, 60.0], [0.0, 0.0, 5.0, 5.0]])
+    assert torch.equal(result[0]["boxes"], expected_boxes)
+    assert torch.equal(result[0]["labels"], torch.tensor([3, 5]))
+    # b.jpg: empty annotation -> (0, 4) boxes, (0,) labels
+    assert result[1]["boxes"].shape == (0, 4)
+    assert result[1]["labels"].shape == (0,)
+
+
+def test_coco_parser_classification_direct(tmp_path: object) -> None:
+    """CocoLabelParser.parse classification: long tensor of category ids."""
+    import torch
+
+    from raitap.data.label_parsers.coco import CocoLabelParser
+
+    labels_path = _coco_classification_fixture(tmp_path)
+    parser = CocoLabelParser(source=str(labels_path))
+    result = parser.parse(
+        task_kind=TaskKind.classification,
+        tensor=None,
+        sample_ids=["a.jpg", "b.jpg"],
+        data_source=None,
+        class_names=None,
+    )
+    assert isinstance(result, torch.Tensor)
+    assert result.dtype == torch.long
+    assert result.tolist() == [0, 4]
+
+
+def test_coco_parser_classification_rejects_multiple_categories(tmp_path: object) -> None:
+    """Classification parse raises ValueError when an image has >1 categories."""
+    import pathlib
+
+    from raitap.data.label_parsers.coco import CocoLabelParser
+
+    coco = {
+        "images": [{"id": 1, "file_name": "a.jpg"}],
+        "annotations": [
+            {"image_id": 1, "category_id": 3, "bbox": [0, 0, 1, 1]},
+            {"image_id": 1, "category_id": 5, "bbox": [0, 0, 1, 1]},
+        ],
+        "categories": [{"id": 3, "name": "car"}, {"id": 5, "name": "dog"}],
+    }
+    p = pathlib.Path(str(tmp_path)) / "multi.json"
+    _write_json(p, coco)
+    parser = CocoLabelParser(source=str(p))
+    with pytest.raises(ValueError, match="exactly one category per image"):
+        parser.parse(
+            task_kind=TaskKind.classification,
+            tensor=None,
+            sample_ids=["a.jpg"],
+            data_source=None,
+            class_names=None,
+        )
+
+
+def test_coco_parser_detection_e2e_via_resolve(tmp_path: object) -> None:
+    """Detection e2e: _resolve_and_parse_labels with CocoLabelsConfig."""
+    import torch
+
+    from raitap.configs.schema import CocoLabelsConfig
+    from raitap.data.data import _resolve_and_parse_labels
+
+    labels_path = _coco_detection_fixture(tmp_path)
+    cfg = _make_cfg(labels=CocoLabelsConfig(source=str(labels_path)))
+    tensor = [object(), object()]
+    result = _resolve_and_parse_labels(
+        cfg,
+        task_kind=TaskKind.detection,
+        tensor=tensor,
+        sample_ids=["a.jpg", "b.jpg"],
+    )
+    assert isinstance(result, list)
+    assert len(result) == 2
+    expected_boxes = torch.tensor([[10.0, 20.0, 40.0, 60.0], [0.0, 0.0, 5.0, 5.0]])
+    assert torch.equal(result[0]["boxes"], expected_boxes)
+    assert torch.equal(result[0]["labels"], torch.tensor([3, 5]))
+    assert result[1]["boxes"].shape == (0, 4)
+
+
+def test_coco_parser_classification_e2e_via_resolve(tmp_path: object) -> None:
+    """Classification e2e: _resolve_and_parse_labels with CocoLabelsConfig."""
+    import torch
+
+    from raitap.configs.schema import CocoLabelsConfig
+    from raitap.data.data import _resolve_and_parse_labels
+
+    labels_path = _coco_classification_fixture(tmp_path)
+    cfg = _make_cfg(labels=CocoLabelsConfig(source=str(labels_path)))
+    result = _resolve_and_parse_labels(
+        cfg,
+        task_kind=TaskKind.classification,
+        tensor=None,
+        sample_ids=["a.jpg", "b.jpg"],
+    )
+    assert isinstance(result, torch.Tensor)
+    assert result.dtype == torch.long
+    assert result.tolist() == [0, 4]

From 6e6972ba926b616b256a11fb8202588f04d6b13d Mon Sep 17 00:00:00 2001
From: Stanislas Laurent <stnsls.lrt.accnts@gmail.com>
Date: Wed, 24 Jun 2026 04:45:03 +0200
Subject: [PATCH 19/28] feat(data): YoloLabelParser with e2e image-dir
 resolution (refs #338)

---
 src/raitap/data/label_parsers/__init__.py   |   3 +-
 src/raitap/data/label_parsers/yolo.py       |  92 ++++++++++++++++++
 src/raitap/data/tests/test_label_parsers.py | 101 ++++++++++++++++++++
 3 files changed, 195 insertions(+), 1 deletion(-)
 create mode 100644 src/raitap/data/label_parsers/yolo.py

diff --git a/src/raitap/data/label_parsers/__init__.py b/src/raitap/data/label_parsers/__init__.py
index 6d122d82..38fab7ce 100644
--- a/src/raitap/data/label_parsers/__init__.py
+++ b/src/raitap/data/label_parsers/__init__.py
@@ -12,5 +12,6 @@ class name resolved against ``raitap.data.label_parsers.``) instantiates,
 from .coco import CocoLabelParser  # pyright: ignore[reportUnusedImport]
 from .directory import DirectoryLabelParser
 from .tabular import TabularLabelParser  # pyright: ignore[reportUnusedImport]
+from .yolo import YoloLabelParser  # pyright: ignore[reportUnusedImport]
 
-__all__ = ["CocoLabelParser", "DirectoryLabelParser", "TabularLabelParser"]
+__all__ = ["CocoLabelParser", "DirectoryLabelParser", "TabularLabelParser", "YoloLabelParser"]
diff --git a/src/raitap/data/label_parsers/yolo.py b/src/raitap/data/label_parsers/yolo.py
new file mode 100644
index 00000000..75ce80c6
--- /dev/null
+++ b/src/raitap/data/label_parsers/yolo.py
@@ -0,0 +1,125 @@
+"""YOLO label parser (detection-only)."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any
+
+from PIL import Image
+
+from raitap.configs.schema import YoloLabelsConfig
+from raitap.data.data import SourceKind, get_source_path
+from raitap.data.label_parsers.registration import label_parser
+from raitap.data.types import IdStrategy
+from raitap.task_families.detection import _align_detection_records
+from raitap.types import TaskKind
+
+if TYPE_CHECKING:
+    from pathlib import Path
+
+_IMAGE_SUFFIXES = (".jpg", ".jpeg", ".png", ".bmp", ".webp")
+
+
+@label_parser(registry_name="yolo", schema=YoloLabelsConfig)
+class YoloLabelParser:
+    """Parse YOLO per-image ``.txt`` (``class cx cy w h``, normalised) for detection.
+
+    Boxes are denormalised to pixel ``[x1, y1, x2, y2]`` using each image's
+    size read from PIL. Class indices pass through unchanged.
+    """
+
+    supported_tasks: frozenset[TaskKind] = frozenset({TaskKind.detection})
+
+    def __init__(
+        self,
+        *,
+        source: str,
+        id_strategy: IdStrategy = IdStrategy.auto,
+    ) -> None:
+        self.source = source
+        self.id_strategy = id_strategy
+
+    def _image_for(self, image_dir: Path, stem: str) -> Path:
+        """Return the first existing ``<stem><suffix>`` in ``image_dir``.
+
+        Suffixes are tried in ``_IMAGE_SUFFIXES`` order.
+
+        Raises:
+            ValueError: If no candidate file exists.
+        """
+        for suffix in _IMAGE_SUFFIXES:
+            candidate = image_dir / f"{stem}{suffix}"
+            if candidate.exists():
+                return candidate
+        raise ValueError(f"YOLO parser found no image for label {stem!r} in {image_dir}.")
+
+    def _to_detection_records(self, labels_dir: Path, image_dir: Path) -> list[dict[str, Any]]:
+        """Build one detection record per ``*.txt`` file in ``labels_dir``.
+
+        Each record holds ``sample_id`` (the matching image's file name), pixel
+        ``boxes`` as ``[x1, y1, x2, y2]`` and integer ``labels``.
+
+        Raises:
+            ValueError: If a label file has no matching image, or a non-blank
+                line has fewer than five fields or a non-numeric field.
+        """
+        records: list[dict[str, Any]] = []
+        for txt in sorted(labels_dir.glob("*.txt")):
+            image_path = self._image_for(image_dir, txt.stem)
+            with Image.open(image_path) as im:
+                width, height = im.size
+            boxes: list[list[float]] = []
+            labels: list[int] = []
+            # utf-8-sig: locale-independent, and a leading BOM (if any) is dropped.
+            text = txt.read_text(encoding="utf-8-sig")
+            for line_no, line in enumerate(text.splitlines(), start=1):
+                parts = line.split()
+                if not parts:
+                    continue
+                try:
+                    cls, cx, cy, bw, bh = (float(p) for p in parts[:5])
+                except ValueError as err:
+                    # Raised for too few fields as well as non-numeric fields; keep
+                    # the exception type but name the offending file and line.
+                    raise ValueError(
+                        f"YOLO label {txt.name} line {line_no} is not "
+                        f"'class cx cy w h': {line!r}."
+                    ) from err
+                x1 = (cx - bw / 2) * width
+                y1 = (cy - bh / 2) * height
+                x2 = (cx + bw / 2) * width
+                y2 = (cy + bh / 2) * height
+                boxes.append([x1, y1, x2, y2])
+                labels.append(int(cls))
+            records.append({"sample_id": image_path.name, "boxes": boxes, "labels": labels})
+        return records
+
+    def parse(
+        self,
+        *,
+        task_kind: TaskKind,
+        tensor: Any,
+        sample_ids: list[str] | None,
+        data_source: str | None,
+        class_names: list[str] | None,
+    ) -> Any:
+        """Load YOLO labels and align them to ``sample_ids`` for detection.
+
+        ``data_source`` must be the image directory (image sizes are needed to
+        denormalise boxes). ``task_kind`` and ``class_names`` are not used here.
+
+        Raises:
+            ValueError: If ``data_source`` is ``None``.
+        """
+        if data_source is None:
+            raise ValueError(
+                "YOLO labels need data.source (image directory) to denormalise boxes; "
+                "set data.source to the image directory."
+            )
+        labels_dir = get_source_path(self.source, kind=SourceKind.LABELS)
+        image_dir = get_source_path(data_source, kind=SourceKind.DATA)
+        records = self._to_detection_records(labels_dir, image_dir)
+        return _align_detection_records(
+            records,
+            expected=len(tensor),
+            sample_ids=sample_ids,
+        )
diff --git a/src/raitap/data/tests/test_label_parsers.py b/src/raitap/data/tests/test_label_parsers.py
index 295bb4dc..51525158 100644
--- a/src/raitap/data/tests/test_label_parsers.py
+++ b/src/raitap/data/tests/test_label_parsers.py
@@ -335,3 +335,104 @@ def test_coco_parser_classification_e2e_via_resolve(tmp_path: object) -> None:
     assert isinstance(result, torch.Tensor)
     assert result.dtype == torch.long
     assert result.tolist() == [0, 4]
+
+
+# --- Task 6: YoloLabelParser ---
+
+
+def _make_yolo_fixture(
+    tmp_path: object,
+) -> tuple[object, object]:
+    """Create a minimal YOLO label dir + image dir with two images.
+
+    Returns (labels_dir, image_dir). Images are 200x100 px.
+    Each .txt has one box: class 0, cx=0.5, cy=0.5, w=0.6, h=0.1.
+    Denormalised: x1=(0.5-0.3)*200=40, y1=(0.5-0.05)*100=45,
+                  x2=(0.5+0.3)*200=160, y2=(0.5+0.05)*100=55.
+    """
+    import pathlib
+
+    from PIL import Image as PILImage
+
+    tmp = pathlib.Path(str(tmp_path))
+    labels_dir = tmp / "labels"
+    labels_dir.mkdir()
+    image_dir = tmp / "images"
+    image_dir.mkdir()
+
+    for stem in ("a", "b"):
+        img = PILImage.new("RGB", (200, 100))
+        img.save(image_dir / f"{stem}.jpg")
+        (labels_dir / f"{stem}.txt").write_text("0 0.5 0.5 0.6 0.1\n", encoding="utf-8")
+
+    return labels_dir, image_dir
+
+
+def test_yolo_parser_unit(tmp_path: object) -> None:
+    """YoloLabelParser.parse: boxes denormalised via PIL image size."""
+    from raitap.data.label_parsers.yolo import YoloLabelParser
+
+    labels_dir, image_dir = _make_yolo_fixture(tmp_path)
+    parser = YoloLabelParser(source=str(labels_dir))
+
+    tensor = [object(), object()]
+    result = parser.parse(
+        task_kind=TaskKind.detection,
+        tensor=tensor,
+        sample_ids=["a.jpg", "b.jpg"],
+        data_source=str(image_dir),
+        class_names=None,
+    )
+
+    assert isinstance(result, list)
+    assert len(result) == 2
+    # IEEE-754: (0.5+0.05)*100 = 55.00000000000001 -> use pytest.approx
+    assert result[0]["boxes"][0].tolist() == pytest.approx([40.0, 45.0, 160.0, (0.5 + 0.05) * 100])
+    assert result[0]["labels"].tolist() == [0]
+    assert result[1]["boxes"].shape == (1, 4)
+
+
+def test_yolo_parser_raises_when_data_source_none(tmp_path: object) -> None:
+    """parse raises ValueError when data_source is None (no image dir)."""
+    from raitap.data.label_parsers.yolo import YoloLabelParser
+
+    labels_dir, _ = _make_yolo_fixture(tmp_path)
+    parser = YoloLabelParser(source=str(labels_dir))
+    with pytest.raises(ValueError, match=r"data\.source"):
+        parser.parse(
+            task_kind=TaskKind.detection,
+            tensor=[object()],
+            sample_ids=None,
+            data_source=None,
+            class_names=None,
+        )
+
+
+def test_yolo_parser_e2e_via_resolve(tmp_path: object) -> None:
+    """E2E: _resolve_and_parse_labels with YoloLabelsConfig + real image dir.
+
+    Exercises image_dir resolution through the dispatch (gap #1).
+    """
+    from raitap.configs.schema import YoloLabelsConfig
+    from raitap.data.data import _resolve_and_parse_labels
+
+    labels_dir, image_dir = _make_yolo_fixture(tmp_path)
+
+    cfg = _make_cfg(
+        labels=YoloLabelsConfig(source=str(labels_dir)),
+        source=str(image_dir),
+    )
+    tensor = [object(), object()]
+    result = _resolve_and_parse_labels(
+        cfg,
+        task_kind=TaskKind.detection,
+        tensor=tensor,
+        sample_ids=["a.jpg", "b.jpg"],
+    )
+
+    assert isinstance(result, list)
+    assert len(result) == 2
+    assert result[0]["boxes"][0].tolist() == pytest.approx([40.0, 45.0, 160.0, (0.5 + 0.05) * 100])
+    assert result[0]["labels"].tolist() == [0]
+    assert result[1]["boxes"].shape == (1, 4)
+    assert result[1]["labels"].tolist() == [0]

From 4898f4d85e472c506d3c00cb9c21fc4040b8d549 Mon Sep 17 00:00:00 2001
From: Stanislas Laurent <stnsls.lrt.accnts@gmail.com>
Date: Wed, 24 Jun 2026 04:53:20 +0200
Subject: [PATCH 20/28] feat(data): VocLabelParser with class_names precedence
 and e2e (refs #338)

---
 src/raitap/data/label_parsers/__init__.py   |   9 +-
 src/raitap/data/label_parsers/voc.py        | 129 +++++++++++++++
 src/raitap/data/tests/test_label_parsers.py | 165 ++++++++++++++++++++
 3 files changed, 302 insertions(+), 1 deletion(-)
 create mode 100644 src/raitap/data/label_parsers/voc.py

diff --git a/src/raitap/data/label_parsers/__init__.py b/src/raitap/data/label_parsers/__init__.py
index 38fab7ce..053ad444 100644
--- a/src/raitap/data/label_parsers/__init__.py
+++ b/src/raitap/data/label_parsers/__init__.py
@@ -12,6 +12,13 @@ class name resolved against ``raitap.data.label_parsers.``) instantiates,
 from .coco import CocoLabelParser  # pyright: ignore[reportUnusedImport]
 from .directory import DirectoryLabelParser
 from .tabular import TabularLabelParser  # pyright: ignore[reportUnusedImport]
+from .voc import VocLabelParser  # pyright: ignore[reportUnusedImport]
 from .yolo import YoloLabelParser  # pyright: ignore[reportUnusedImport]
 
-__all__ = ["CocoLabelParser", "DirectoryLabelParser", "TabularLabelParser", "YoloLabelParser"]
+__all__ = [
+    "CocoLabelParser",
+    "DirectoryLabelParser",
+    "TabularLabelParser",
+    "VocLabelParser",
+    "YoloLabelParser",
+]
diff --git a/src/raitap/data/label_parsers/voc.py b/src/raitap/data/label_parsers/voc.py
new file mode 100644
index 00000000..eccba526
--- /dev/null
+++ b/src/raitap/data/label_parsers/voc.py
@@ -0,0 +1,156 @@
+"""Pascal-VOC label parser (detection-only)."""
+
+from __future__ import annotations
+
+import xml.etree.ElementTree as ET
+from typing import TYPE_CHECKING, Any
+
+from raitap.configs.schema import VocLabelsConfig
+from raitap.data.data import SourceKind, get_source_path
+from raitap.data.label_parsers.registration import label_parser
+from raitap.data.types import IdStrategy
+from raitap.task_families.detection import _align_detection_records
+from raitap.types import TaskKind
+
+if TYPE_CHECKING:
+    from pathlib import Path
+
+#: Canonical Pascal-VOC class order (index = label id) when no class_names given.
+_VOC_CLASSES = (
+    "aeroplane",
+    "bicycle",
+    "bird",
+    "boat",
+    "bottle",
+    "bus",
+    "car",
+    "cat",
+    "chair",
+    "cow",
+    "diningtable",
+    "dog",
+    "horse",
+    "motorbike",
+    "person",
+    "pottedplant",
+    "sheep",
+    "sofa",
+    "train",
+    "tvmonitor",
+)
+
+
+def _coord(box: ET.Element, tag: str, xml_path: Path) -> float:
+    """Return the ``<tag>`` child of ``box`` as a float.
+
+    Raises:
+        ValueError: If the tag is missing or its text is not a number; the
+            message names the XML file in both cases.
+    """
+    text = box.findtext(tag)
+    if text is None:
+        raise ValueError(f"VOC bndbox in {xml_path.name} missing <{tag}>.")
+    try:
+        return float(text)
+    except ValueError as err:
+        raise ValueError(
+            f"VOC bndbox in {xml_path.name} has non-numeric <{tag}>: {text!r}."
+        ) from err
+
+
+@label_parser(registry_name="voc", schema=VocLabelsConfig)
+class VocLabelParser:
+    """Parse Pascal-VOC per-image ``.xml`` for detection.
+
+    Boxes are already ``[xmin, ymin, xmax, ymax]`` pixels. Class names map to
+    ids by their position in the active name list (parser's own ``class_names``,
+    else the ``class_names`` arg from ``cfg.model.class_names``, else the
+    standard 20-class VOC order).
+    """
+
+    supported_tasks: frozenset[TaskKind] = frozenset({TaskKind.detection})
+
+    def __init__(
+        self,
+        *,
+        source: str,
+        id_strategy: IdStrategy = IdStrategy.auto,
+        class_names: list[str] | None = None,
+    ) -> None:
+        self.source = source
+        self.id_strategy = id_strategy
+        self.class_names = class_names
+
+    def _to_detection_records(
+        self, labels_dir: Path, name_to_id: dict[str, int]
+    ) -> list[dict[str, Any]]:
+        """Build one detection record per ``*.xml`` file in ``labels_dir``.
+
+        ``sample_id`` is the ``<filename>`` text; every ``<object>`` adds one
+        ``[xmin, ymin, xmax, ymax]`` box and the id ``name_to_id[<name>]``.
+
+        Raises:
+            ValueError: On a missing/empty ``<filename>``, a ``<name>`` that is
+                not in ``name_to_id``, or a missing/malformed ``<bndbox>``.
+        """
+        records: list[dict[str, Any]] = []
+        for xml_path in sorted(labels_dir.glob("*.xml")):
+            # NOTE(review): xml.etree is not hardened against maliciously crafted
+            # XML; only point this parser at label files from a trusted source.
+            root = ET.parse(xml_path).getroot()
+            # Strip so whitespace from pretty-printed XML cannot break id matching.
+            sample_id = (root.findtext("filename") or "").strip()
+            if not sample_id:
+                raise ValueError(f"VOC file {xml_path} has no <filename>.")
+            boxes: list[list[float]] = []
+            labels: list[int] = []
+            for obj in root.findall("object"):
+                raw_name = obj.findtext("name")
+                name = raw_name.strip() if raw_name is not None else None
+                if name is None or name not in name_to_id:
+                    raise ValueError(
+                        f"VOC class {name!r} in {xml_path.name} is not in the "
+                        f"class list {sorted(name_to_id)}."
+                    )
+                box = obj.find("bndbox")
+                if box is None:
+                    raise ValueError(f"VOC object in {xml_path.name} has no <bndbox>.")
+                boxes.append(
+                    [
+                        _coord(box, "xmin", xml_path),
+                        _coord(box, "ymin", xml_path),
+                        _coord(box, "xmax", xml_path),
+                        _coord(box, "ymax", xml_path),
+                    ]
+                )
+                labels.append(name_to_id[name])
+            records.append({"sample_id": sample_id, "boxes": boxes, "labels": labels})
+        return records
+
+    def parse(
+        self,
+        *,
+        task_kind: TaskKind,
+        tensor: Any,
+        sample_ids: list[str] | None,
+        data_source: str | None,
+        class_names: list[str] | None,
+    ) -> Any:
+        """Load VOC xml labels and align to sample_ids for detection.
+
+        ``task_kind`` and ``data_source`` are not used here.
+        """
+        labels_dir = get_source_path(self.source, kind=SourceKind.LABELS)
+        # Precedence: parser's own class_names > model's class_names > _VOC_CLASSES
+        active_names: list[str] | tuple[str, ...] = (
+            self.class_names
+            if self.class_names is not None
+            else (class_names if class_names is not None else _VOC_CLASSES)
+        )
+        name_to_id = {name: idx for idx, name in enumerate(active_names)}
+        records = self._to_detection_records(labels_dir, name_to_id)
+        return _align_detection_records(
+            records,
+            expected=len(tensor),
+            sample_ids=sample_ids,
+        )
diff --git a/src/raitap/data/tests/test_label_parsers.py b/src/raitap/data/tests/test_label_parsers.py
index 51525158..8bc39734 100644
--- a/src/raitap/data/tests/test_label_parsers.py
+++ b/src/raitap/data/tests/test_label_parsers.py
@@ -436,3 +436,168 @@ def test_yolo_parser_e2e_via_resolve(tmp_path: object) -> None:
     assert result[0]["labels"].tolist() == [0]
     assert result[1]["boxes"].shape == (1, 4)
     assert result[1]["labels"].tolist() == [0]
+
+
+# --- Task 7: VocLabelParser ---
+
+
+def _write_voc_xml(path: object, filename: str, objects: list[dict]) -> None:
+    """Write a minimal Pascal-VOC XML file."""
+    import pathlib
+
+    lines = [
+        "<annotation>",
+        f"  <filename>{filename}</filename>",
+    ]
+    for obj in objects:
+        lines += [
+            "  <object>",
+            f"    <name>{obj['name']}</name>",
+        ]
+        if obj.get("bndbox") is not None:
+            b = obj["bndbox"]
+            lines += [
+                "    <bndbox>",
+                f"      <xmin>{b[0]}</xmin>",
+                f"      <ymin>{b[1]}</ymin>",
+                f"      <xmax>{b[2]}</xmax>",
+                f"      <ymax>{b[3]}</ymax>",
+                "    </bndbox>",
+            ]
+        lines.append("  </object>")
+    lines.append("</annotation>")
+    pathlib.Path(str(path)).write_text("\n".join(lines), encoding="utf-8")
+
+
+def _make_voc_fixture(tmp_path: object) -> object:
+    """Create a two-image VOC label dir (``a.xml``, ``b.xml``) and return its path.
+
+    a.jpg: person at [10,20,30,40], car at [5,5,15,15].
+    b.jpg: person at [0,0,50,50].
+    """
+    import pathlib
+
+    tmp = pathlib.Path(str(tmp_path))
+    voc_dir = tmp / "voc_labels"
+    voc_dir.mkdir()
+    _write_voc_xml(
+        voc_dir / "a.xml",
+        "a.jpg",
+        [
+            {"name": "person", "bndbox": [10, 20, 30, 40]},
+            {"name": "car", "bndbox": [5, 5, 15, 15]},
+        ],
+    )
+    _write_voc_xml(
+        voc_dir / "b.xml",
+        "b.jpg",
+        [{"name": "person", "bndbox": [0, 0, 50, 50]}],
+    )
+    return voc_dir
+
+
+def test_voc_parser_unit_with_class_names(tmp_path: object) -> None:
+    """VocLabelParser.parse: person->1, car->2 with explicit class_names arg."""
+    import torch
+
+    from raitap.data.label_parsers.voc import VocLabelParser
+
+    voc_dir = _make_voc_fixture(tmp_path)
+    parser = VocLabelParser(source=str(voc_dir))
+    class_names = ["background", "person", "car"]
+    tensor = [object(), object()]
+    result = parser.parse(
+        task_kind=TaskKind.detection,
+        tensor=tensor,
+        sample_ids=["a.jpg", "b.jpg"],
+        data_source=None,
+        class_names=class_names,
+    )
+    assert isinstance(result, list)
+    assert len(result) == 2
+    # a.jpg: person(1), car(2)
+    expected_boxes = torch.tensor([[10.0, 20.0, 30.0, 40.0], [5.0, 5.0, 15.0, 15.0]])
+    assert torch.equal(result[0]["boxes"], expected_boxes)
+    assert torch.equal(result[0]["labels"], torch.tensor([1, 2]))
+    # b.jpg: person(1)
+    assert torch.equal(result[1]["boxes"], torch.tensor([[0.0, 0.0, 50.0, 50.0]]))
+    assert torch.equal(result[1]["labels"], torch.tensor([1]))
+
+
+def test_voc_parser_raises_on_missing_bndbox(tmp_path: object) -> None:
+    """parse raises ValueError when <object> has no <bndbox>."""
+    import pathlib
+
+    from raitap.data.label_parsers.voc import VocLabelParser
+
+    tmp = pathlib.Path(str(tmp_path))
+    voc_dir = tmp / "voc_no_box"
+    voc_dir.mkdir()
+    _write_voc_xml(
+        voc_dir / "bad.xml",
+        "bad.jpg",
+        [{"name": "person"}],  # no bndbox key -> not written
+    )
+    parser = VocLabelParser(source=str(voc_dir))
+    with pytest.raises(ValueError, match="no <bndbox>"):
+        parser.parse(
+            task_kind=TaskKind.detection,
+            tensor=[object()],
+            sample_ids=["bad.jpg"],
+            data_source=None,
+            class_names=["person"],
+        )
+
+
+def test_voc_parser_e2e_class_names_from_model(tmp_path: object) -> None:
+    """E2E: cfg.model.class_names supplies mapping; person->1 via _resolve_and_parse_labels."""
+    import torch
+
+    from raitap.configs.schema import VocLabelsConfig
+    from raitap.data.data import _resolve_and_parse_labels
+
+    voc_dir = _make_voc_fixture(tmp_path)
+    # class_names on the config is None; model supplies it instead
+    cfg = _make_cfg(
+        labels=VocLabelsConfig(source=str(voc_dir)),
+        class_names=["background", "person", "car"],
+    )
+    tensor = [object(), object()]
+    result = _resolve_and_parse_labels(
+        cfg,
+        task_kind=TaskKind.detection,
+        tensor=tensor,
+        sample_ids=["a.jpg", "b.jpg"],
+    )
+    assert isinstance(result, list)
+    assert len(result) == 2
+    assert torch.equal(result[0]["labels"], torch.tensor([1, 2]))
+    assert torch.equal(result[1]["labels"], torch.tensor([1]))
+
+
+def test_voc_parser_own_class_names_takes_precedence(tmp_path: object) -> None:
+    """Parser's VocLabelsConfig.class_names overrides model's class_names."""
+    import torch
+
+    from raitap.configs.schema import VocLabelsConfig
+    from raitap.data.data import _resolve_and_parse_labels
+
+    voc_dir = _make_voc_fixture(tmp_path)
+    # Parser config has class_names; model has a different (wrong) mapping
+    cfg = _make_cfg(
+        labels=VocLabelsConfig(
+            source=str(voc_dir),
+            class_names=["background", "person", "car"],
+        ),
+        class_names=["car", "background", "person"],  # different order -> would give wrong ids
+    )
+    tensor = [object(), object()]
+    result = _resolve_and_parse_labels(
+        cfg,
+        task_kind=TaskKind.detection,
+        tensor=tensor,
+        sample_ids=["a.jpg", "b.jpg"],
+    )
+    assert isinstance(result, list)
+    # Parser's own list wins: person->1, car->2
+    assert torch.equal(result[0]["labels"], torch.tensor([1, 2]))

From 8407436a66f2c85914aec8ac46f08493626c0eb0 Mon Sep 17 00:00:00 2001
From: Stanislas Laurent <stnsls.lrt.accnts@gmail.com>
Date: Wed, 24 Jun 2026 05:02:20 +0200
Subject: [PATCH 21/28] fix(model): detection labels honour id_strategy for
 nested dirs (refs #338)

---
 src/raitap/data/label_parsers/coco.py       |  1 +
 src/raitap/data/label_parsers/voc.py        |  1 +
 src/raitap/data/label_parsers/yolo.py       |  1 +
 src/raitap/data/tests/test_label_parsers.py | 70 +++++++++++++++++++++
 src/raitap/task_families/detection.py       | 26 ++++++--
 5 files changed, 95 insertions(+), 4 deletions(-)

diff --git a/src/raitap/data/label_parsers/coco.py b/src/raitap/data/label_parsers/coco.py
index 71c57011..673a39e0 100644
--- a/src/raitap/data/label_parsers/coco.py
+++ b/src/raitap/data/label_parsers/coco.py
@@ -105,6 +105,7 @@ def parse(
                 records,
                 expected=len(tensor),
                 sample_ids=sample_ids,
+                strategy=str(self.id_strategy),
             )
 
         # classification
diff --git a/src/raitap/data/label_parsers/voc.py b/src/raitap/data/label_parsers/voc.py
index eccba526..959f9e48 100644
--- a/src/raitap/data/label_parsers/voc.py
+++ b/src/raitap/data/label_parsers/voc.py
@@ -126,4 +126,5 @@ def parse(
             records,
             expected=len(tensor),
             sample_ids=sample_ids,
+            strategy=str(self.id_strategy),
         )
diff --git a/src/raitap/data/label_parsers/yolo.py b/src/raitap/data/label_parsers/yolo.py
index 75ce80c6..3d76c298 100644
--- a/src/raitap/data/label_parsers/yolo.py
+++ b/src/raitap/data/label_parsers/yolo.py
@@ -89,4 +89,5 @@ def parse(
             records,
             expected=len(tensor),
             sample_ids=sample_ids,
+            strategy=str(self.id_strategy),
         )
diff --git a/src/raitap/data/tests/test_label_parsers.py b/src/raitap/data/tests/test_label_parsers.py
index 8bc39734..cff4face 100644
--- a/src/raitap/data/tests/test_label_parsers.py
+++ b/src/raitap/data/tests/test_label_parsers.py
@@ -601,3 +601,73 @@ def test_voc_parser_own_class_names_takes_precedence(tmp_path: object) -> None:
     assert isinstance(result, list)
     # Parser's own list wins: person->1, car->2
     assert torch.equal(result[0]["labels"], torch.tensor([1, 2]))
+
+
+# --- Task 8: detection id_strategy parity ---
+
+
+def _coco_detection_nested_fixture(tmp_path: object) -> object:
+    """COCO with file_name='a.jpg' (no subdir) but discovered sample_ids=['sub/a.jpg']."""
+    import pathlib
+
+    coco = {
+        "images": [{"id": 1, "file_name": "a.jpg"}],
+        "annotations": [
+            {"image_id": 1, "category_id": 2, "bbox": [1, 2, 3, 4]},
+        ],
+        "categories": [{"id": 2, "name": "cat"}],
+    }
+    p = pathlib.Path(str(tmp_path)) / "nested.json"
+    _write_json(p, coco)
+    return p
+
+
+def test_coco_detection_nested_sample_ids_with_stem_strategy(tmp_path: object) -> None:
+    """Gap #2: COCO record 'a.jpg' matches discovered 'sub/a.jpg' via id_strategy='stem'."""
+    import torch
+
+    from raitap.data.label_parsers.coco import CocoLabelParser
+    from raitap.data.types import IdStrategy
+
+    labels_path = _coco_detection_nested_fixture(tmp_path)
+    parser = CocoLabelParser(source=str(labels_path), id_strategy=IdStrategy.stem)
+    tensor = [object()]
+    result = parser.parse(
+        task_kind=TaskKind.detection,
+        tensor=tensor,
+        sample_ids=["sub/a.jpg"],
+        data_source=None,
+        class_names=None,
+    )
+    assert isinstance(result, list)
+    assert len(result) == 1
+    # bbox [1,2,3,4] -> xyxy [1, 2, 1+3, 2+4] = [1, 2, 4, 6]
+    expected_boxes = torch.tensor([[1.0, 2.0, 4.0, 6.0]])
+    assert torch.equal(result[0]["boxes"], expected_boxes)
+    assert torch.equal(result[0]["labels"], torch.tensor([2]))
+
+
+def test_coco_detection_exact_match_regression(tmp_path: object) -> None:
+    """Regression: exact-match ids still align under id_strategy='auto'."""
+    import torch
+
+    from raitap.data.label_parsers.coco import CocoLabelParser
+    from raitap.data.types import IdStrategy
+
+    labels_path = _coco_detection_fixture(tmp_path)
+    parser = CocoLabelParser(source=str(labels_path), id_strategy=IdStrategy.auto)
+    tensor = [object(), object()]
+    result = parser.parse(
+        task_kind=TaskKind.detection,
+        tensor=tensor,
+        sample_ids=["a.jpg", "b.jpg"],
+        data_source=None,
+        class_names=None,
+    )
+    assert isinstance(result, list)
+    assert len(result) == 2
+    expected_boxes = torch.tensor([[10.0, 20.0, 40.0, 60.0], [0.0, 0.0, 5.0, 5.0]])
+    assert torch.equal(result[0]["boxes"], expected_boxes)
+    assert torch.equal(result[0]["labels"], torch.tensor([3, 5]))
+    assert result[1]["boxes"].shape == (0, 4)
+    assert result[1]["labels"].shape == (0,)
diff --git a/src/raitap/task_families/detection.py b/src/raitap/task_families/detection.py
index eab54c78..fd10bdcc 100644
--- a/src/raitap/task_families/detection.py
+++ b/src/raitap/task_families/detection.py
@@ -10,6 +10,7 @@
 
 from typing import TYPE_CHECKING, Any, cast
 
+from raitap.data.data import _normalise_sample_id, _resolve_id_strategy
 from raitap.task_families.registry import task_family
 from raitap.transparency.contracts import ExplanationOutputSpace
 from raitap.types import TaskKind
@@ -26,16 +27,24 @@ def _align_detection_records(
     *,
     expected: int,
     sample_ids: Any,
+    strategy: str = "auto",
 ) -> list[dict[str, torch.Tensor]]:
     """Align native detection records to ``sample_ids`` and build tensors.
 
     Extracted from ``DetectionFamily.load_labels`` so label-format adapters can
     feed converted records through the same alignment + validation path.
+
+    When ``sample_ids`` is provided, both the discovered ids and record
+    ``sample_id`` fields are normalised via ``_normalise_sample_id`` using the
+    resolved ``strategy``, matching how the classification path handles nested
+    image directories.
     """
+    import pandas as pd
     import torch
 
     if sample_ids is not None:
-        by_id: dict[str, dict[str, Any]] = {}
+        # Collect raw record ids first so _resolve_id_strategy can inspect them.
+        raw_record_ids: list[str] = []
         for index, record in enumerate(records):
             record_id = record.get("sample_id") if isinstance(record, dict) else None
             if record_id is None:
@@ -43,15 +52,24 @@ def _align_detection_records(
                     f"Detection labels record {index} is missing 'sample_id' "
                     "(required when the dataset exposes sample_ids)."
                 )
-            if record_id in by_id:
+            raw_record_ids.append(str(record_id))
+
+        resolved = _resolve_id_strategy(strategy, pd.Series(raw_record_ids))
+
+        by_id: dict[str, dict[str, Any]] = {}
+        for record, record_id in zip(records, raw_record_ids, strict=True):
+            norm_id = _normalise_sample_id(record_id, resolved)
+            if norm_id in by_id:
                 raise ValueError(
                     f"Detection labels file contains duplicate sample_id {record_id!r}."
                 )
-            by_id[record_id] = record
+            by_id[norm_id] = record
+
         ordered_records = []
         missing: list[str] = []
         for sample_id in sample_ids:
-            record = by_id.get(sample_id)
+            norm_sid = _normalise_sample_id(sample_id, resolved)
+            record = by_id.get(norm_sid)
             if record is None:
                 missing.append(sample_id)
             else:

From ed503a4f3cbc1f978e93213c72be4827f6babd2e Mon Sep 17 00:00:00 2001
From: Stanislas Laurent <stnsls.lrt.accnts@gmail.com>
Date: Wed, 24 Jun 2026 05:14:38 +0200
Subject: [PATCH 22/28] feat(data): DetectionJsonLabelParser restores native
 detection JSON format (refs #338)

---
 src/raitap/configs/schema.py                  |  7 +++
 .../configs/tests/test_labels_schema.py       |  7 ++-
 src/raitap/data/label_parsers/__init__.py     |  2 +
 .../data/label_parsers/detection_json.py      | 56 +++++++++++++++++++
 .../data/tests/test_detection_labels.py       | 54 +++++++++++++-----
 .../data/tests/test_detection_ragged.py       | 29 ++++++----
 6 files changed, 127 insertions(+), 28 deletions(-)
 create mode 100644 src/raitap/data/label_parsers/detection_json.py

diff --git a/src/raitap/configs/schema.py b/src/raitap/configs/schema.py
index b616d85a..24fcde0a 100644
--- a/src/raitap/configs/schema.py
+++ b/src/raitap/configs/schema.py
@@ -110,6 +110,13 @@ class VocLabelsConfig(LabelsConfig):
     class_names: list[str] | None = None
 
 
+@dataclass
+class DetectionJsonLabelsConfig(LabelsConfig):
+    _target_: str = "DetectionJsonLabelParser"  # class name in raitap.data.label_parsers
+    source: str = MISSING  # mandatory: omegaconf MISSING means the config must supply it
+    id_strategy: IdStrategy = IdStrategy.auto  # how record ids are matched to sample_ids
+
+
 @dataclass
 class DataConfig:
     name: str = "isic2018"
diff --git a/src/raitap/configs/tests/test_labels_schema.py b/src/raitap/configs/tests/test_labels_schema.py
index 64ee9c84..e97bea36 100644
--- a/src/raitap/configs/tests/test_labels_schema.py
+++ b/src/raitap/configs/tests/test_labels_schema.py
@@ -3,7 +3,7 @@
 
 import pytest
 
-from raitap.configs.schema import CocoLabelsConfig, DirectoryLabelsConfig
+from raitap.configs.schema import CocoLabelsConfig, DetectionJsonLabelsConfig, DirectoryLabelsConfig
 
 
 def test_coco_config_has_no_tabular_fields() -> None:
@@ -102,3 +102,8 @@ def test_create_label_parser_handles_both_target_forms() -> None:
 
     fqn = create_label_parser({"_target_": _COMPOSED_TARGET})
     assert isinstance(fqn, DirectoryLabelParser)
+
+
+def test_detection_json_config_has_exactly_target_source_id_strategy() -> None:
+    names = {f.name for f in dataclasses.fields(DetectionJsonLabelsConfig)}
+    assert names == {"_target_", "source", "id_strategy"}
diff --git a/src/raitap/data/label_parsers/__init__.py b/src/raitap/data/label_parsers/__init__.py
index 053ad444..b1ced9e2 100644
--- a/src/raitap/data/label_parsers/__init__.py
+++ b/src/raitap/data/label_parsers/__init__.py
@@ -10,6 +10,7 @@ class name resolved against ``raitap.data.label_parsers.``) instantiates,
 from __future__ import annotations
 
 from .coco import CocoLabelParser  # pyright: ignore[reportUnusedImport]
+from .detection_json import DetectionJsonLabelParser  # pyright: ignore[reportUnusedImport]
 from .directory import DirectoryLabelParser
 from .tabular import TabularLabelParser  # pyright: ignore[reportUnusedImport]
 from .voc import VocLabelParser  # pyright: ignore[reportUnusedImport]
@@ -17,6 +18,7 @@ class name resolved against ``raitap.data.label_parsers.``) instantiates,
 
 __all__ = [
     "CocoLabelParser",
+    "DetectionJsonLabelParser",
     "DirectoryLabelParser",
     "TabularLabelParser",
     "VocLabelParser",
diff --git a/src/raitap/data/label_parsers/detection_json.py b/src/raitap/data/label_parsers/detection_json.py
new file mode 100644
index 00000000..6e3cbfdc
--- /dev/null
+++ b/src/raitap/data/label_parsers/detection_json.py
@@ -0,0 +1,56 @@
+"""Detection-JSON label parser (native RAITAP detection record format)."""
+
+from __future__ import annotations
+
+import json
+from typing import Any
+
+from raitap.configs.schema import DetectionJsonLabelsConfig
+from raitap.data.data import SourceKind, get_source_path
+from raitap.data.label_parsers.registration import label_parser
+from raitap.data.types import IdStrategy
+from raitap.task_families.detection import _align_detection_records
+from raitap.types import TaskKind
+
+
+@label_parser(registry_name="detection_json", schema=DetectionJsonLabelsConfig)
+class DetectionJsonLabelParser:
+    """Parse native RAITAP detection JSON label files (detection task only).
+
+    The file must be a UTF-8 JSON array of objects with keys ``sample_id``,
+    ``boxes`` (list of ``[x1, y1, x2, y2]`` in pixels), and ``labels``
+    (list of integer class ids).
+    """
+
+    supported_tasks: frozenset[TaskKind] = frozenset({TaskKind.detection})
+
+    def __init__(
+        self,
+        *,
+        source: str,
+        id_strategy: IdStrategy = IdStrategy.auto,
+    ) -> None:
+        self.source = source
+        self.id_strategy = id_strategy
+
+    def parse(
+        self,
+        *,
+        task_kind: TaskKind,  # task_kind/data_source/class_names are not read in this parser
+        tensor: Any,
+        sample_ids: list[str] | None,
+        data_source: str | None,
+        class_names: list[str] | None,
+    ) -> Any:
+        """Load the JSON array at ``source``; align records to ``sample_ids`` (ValueError if bad)."""
+        labels_path = get_source_path(self.source, kind=SourceKind.LABELS)
+        with labels_path.open(encoding="utf-8") as fh:  # JSON is UTF-8; avoid locale default
+            records = json.load(fh)
+        if not isinstance(records, list):
+            raise ValueError(f"Detection labels file {labels_path} must be a JSON array.")
+        return _align_detection_records(
+            records,
+            expected=len(tensor),
+            sample_ids=sample_ids,
+            strategy=str(self.id_strategy),  # NOTE(review): relies on StrEnum str(); confirm
+        )
diff --git a/src/raitap/data/tests/test_detection_labels.py b/src/raitap/data/tests/test_detection_labels.py
index 5a5663da..413c81da 100644
--- a/src/raitap/data/tests/test_detection_labels.py
+++ b/src/raitap/data/tests/test_detection_labels.py
@@ -1,4 +1,4 @@
-"""Tests for DetectionFamily.load_labels — list[dict] per-sample boxes + labels."""
+"""Tests for DetectionJsonLabelParser -- list[dict] per-sample boxes + labels."""
 
 from __future__ import annotations
 
@@ -9,8 +9,9 @@
 import pytest
 import torch
 
-from raitap.data.data import Data
-from raitap.task_families.detection import DetectionFamily
+from raitap.configs.schema import DetectionJsonLabelsConfig
+from raitap.data.data import Data, _resolve_and_parse_labels
+from raitap.types import TaskKind
 
 if TYPE_CHECKING:
     from pathlib import Path
@@ -40,12 +41,15 @@ def _write_detection_labels_json(path: Path) -> None:
 
 
 def _stub_cfg(labels_source: str | None = None) -> AppConfig:
+    labels = DetectionJsonLabelsConfig(source=labels_source) if labels_source is not None else None
     return cast(
         "AppConfig",
         SimpleNamespace(
             data=SimpleNamespace(
-                labels=SimpleNamespace(source=labels_source),
+                labels=labels,
+                source=None,
             ),
+            model=SimpleNamespace(class_names=None),
         ),
     )
 
@@ -63,7 +67,9 @@ def test_load_detection_labels_returns_list_of_dicts(tmp_path: Path) -> None:
     cfg = _stub_cfg(labels_source=str(labels_path))
 
     data = _make_data(num_samples=3)
-    out = DetectionFamily().load_labels(cfg, tensor=data.tensor, sample_ids=data.sample_ids)
+    out = _resolve_and_parse_labels(
+        cfg, task_kind=TaskKind.detection, tensor=data.tensor, sample_ids=data.sample_ids
+    )
     assert out is not None
     assert isinstance(out, list)
     assert len(out) == 3
@@ -79,7 +85,7 @@ def test_load_detection_labels_returns_list_of_dicts(tmp_path: Path) -> None:
 
 
 def test_load_detection_labels_aligns_by_sample_id_when_present(tmp_path: Path) -> None:
-    """Reordered labels file is rewritten to match self.sample_ids ordering."""
+    """Reordered labels file is rewritten to match sample_ids ordering."""
     labels_path = tmp_path / "boxes.json"
     # Write records out of order vs sample_ids.
     payload = [
@@ -91,7 +97,9 @@ def test_load_detection_labels_aligns_by_sample_id_when_present(tmp_path: Path)
     cfg = _stub_cfg(labels_source=str(labels_path))
 
     data = _make_data(num_samples=3, sample_ids=["img_0", "img_1", "img_2"])
-    out = DetectionFamily().load_labels(cfg, tensor=data.tensor, sample_ids=data.sample_ids)
+    out = _resolve_and_parse_labels(
+        cfg, task_kind=TaskKind.detection, tensor=data.tensor, sample_ids=data.sample_ids
+    )
     assert out is not None
     assert int(out[0]["labels"].item()) == 7
     assert out[1]["labels"].numel() == 0
@@ -110,7 +118,9 @@ def test_load_detection_labels_rejects_missing_sample_id_entries(tmp_path: Path)
 
     data = _make_data(num_samples=3, sample_ids=["img_0", "img_1", "img_2"])
     with pytest.raises(ValueError, match="missing entries"):
-        DetectionFamily().load_labels(cfg, tensor=data.tensor, sample_ids=data.sample_ids)
+        _resolve_and_parse_labels(
+            cfg, task_kind=TaskKind.detection, tensor=data.tensor, sample_ids=data.sample_ids
+        )
 
 
 def test_load_detection_labels_rejects_duplicate_sample_id(tmp_path: Path) -> None:
@@ -125,7 +135,9 @@ def test_load_detection_labels_rejects_duplicate_sample_id(tmp_path: Path) -> No
 
     data = _make_data(num_samples=2, sample_ids=["img_0", "img_1"])
     with pytest.raises(ValueError, match="duplicate sample_id"):
-        DetectionFamily().load_labels(cfg, tensor=data.tensor, sample_ids=data.sample_ids)
+        _resolve_and_parse_labels(
+            cfg, task_kind=TaskKind.detection, tensor=data.tensor, sample_ids=data.sample_ids
+        )
 
 
 def test_load_detection_labels_rejects_record_missing_sample_id_field(tmp_path: Path) -> None:
@@ -138,7 +150,9 @@ def test_load_detection_labels_rejects_record_missing_sample_id_field(tmp_path:
 
     data = _make_data(num_samples=1, sample_ids=["img_0"])
     with pytest.raises(ValueError, match="missing 'sample_id'"):
-        DetectionFamily().load_labels(cfg, tensor=data.tensor, sample_ids=data.sample_ids)
+        _resolve_and_parse_labels(
+            cfg, task_kind=TaskKind.detection, tensor=data.tensor, sample_ids=data.sample_ids
+        )
 
 
 def test_load_detection_labels_rejects_wrong_length_when_no_sample_ids(tmp_path: Path) -> None:
@@ -149,7 +163,9 @@ def test_load_detection_labels_rejects_wrong_length_when_no_sample_ids(tmp_path:
 
     data = _make_data(num_samples=5)  # dataset bigger than labels
     with pytest.raises(ValueError, match="5 samples"):
-        DetectionFamily().load_labels(cfg, tensor=data.tensor, sample_ids=data.sample_ids)
+        _resolve_and_parse_labels(
+            cfg, task_kind=TaskKind.detection, tensor=data.tensor, sample_ids=data.sample_ids
+        )
 
 
 def test_load_detection_labels_rejects_mismatched_box_label_counts(tmp_path: Path) -> None:
@@ -161,7 +177,9 @@ def test_load_detection_labels_rejects_mismatched_box_label_counts(tmp_path: Pat
 
     data = _make_data(num_samples=1)
     with pytest.raises(ValueError, match="boxes and labels"):
-        DetectionFamily().load_labels(cfg, tensor=data.tensor, sample_ids=data.sample_ids)
+        _resolve_and_parse_labels(
+            cfg, task_kind=TaskKind.detection, tensor=data.tensor, sample_ids=data.sample_ids
+        )
 
 
 def test_load_detection_labels_rejects_non_list_root(tmp_path: Path) -> None:
@@ -171,13 +189,17 @@ def test_load_detection_labels_rejects_non_list_root(tmp_path: Path) -> None:
 
     data = _make_data(num_samples=1)
     with pytest.raises(ValueError, match="must be a JSON array"):
-        DetectionFamily().load_labels(cfg, tensor=data.tensor, sample_ids=data.sample_ids)
+        _resolve_and_parse_labels(
+            cfg, task_kind=TaskKind.detection, tensor=data.tensor, sample_ids=data.sample_ids
+        )
 
 
 def test_load_detection_labels_returns_none_when_no_source_configured(tmp_path: Path) -> None:
     cfg = _stub_cfg(labels_source=None)
     data = _make_data(num_samples=1)
-    out = DetectionFamily().load_labels(cfg, tensor=data.tensor, sample_ids=data.sample_ids)
+    out = _resolve_and_parse_labels(
+        cfg, task_kind=TaskKind.detection, tensor=data.tensor, sample_ids=data.sample_ids
+    )
     assert out is None
 
 
@@ -185,4 +207,6 @@ def test_load_detection_labels_raises_when_source_unresolvable(tmp_path: Path) -
     cfg = _stub_cfg(labels_source=str(tmp_path / "missing.json"))
     data = _make_data(num_samples=1)
     with pytest.raises(ValueError, match="could not be resolved"):
-        DetectionFamily().load_labels(cfg, tensor=data.tensor, sample_ids=data.sample_ids)
+        _resolve_and_parse_labels(
+            cfg, task_kind=TaskKind.detection, tensor=data.tensor, sample_ids=data.sample_ids
+        )
diff --git a/src/raitap/data/tests/test_detection_ragged.py b/src/raitap/data/tests/test_detection_ragged.py
index b805bbf2..6f642a41 100644
--- a/src/raitap/data/tests/test_detection_ragged.py
+++ b/src/raitap/data/tests/test_detection_ragged.py
@@ -18,7 +18,8 @@
 import torch
 from PIL import Image
 
-from raitap.data.data import Data
+from raitap.configs.schema import DetectionJsonLabelsConfig
+from raitap.data.data import Data, _resolve_and_parse_labels
 from raitap.task_families.classification import ClassificationFamily
 from raitap.task_families.detection import DetectionFamily
 from raitap.types import TaskKind
@@ -164,7 +165,7 @@ def _write_labels_json(self, path: Path, n: int) -> None:
         path.write_text(json.dumps(payload))
 
     def test_detection_labels_count_matches_list_tensor(self, tmp_path: Path) -> None:
-        """DetectionFamily.load_labels: len(tensor) works when tensor is a list."""
+        """DetectionJsonLabelParser: len(tensor) works when tensor is a list."""
         labels_path = tmp_path / "boxes.json"
         self._write_labels_json(labels_path, n=3)
 
@@ -181,13 +182,15 @@ def test_detection_labels_count_matches_list_tensor(self, tmp_path: Path) -> Non
             "AppConfig",
             SimpleNamespace(
                 data=SimpleNamespace(
-                    labels=SimpleNamespace(
-                        source=str(labels_path),
-                    )
-                )
+                    labels=DetectionJsonLabelsConfig(source=str(labels_path)),
+                    source=None,
+                ),
+                model=SimpleNamespace(class_names=None),
             ),
         )
-        out = DetectionFamily().load_labels(cfg, tensor=data.tensor, sample_ids=data.sample_ids)
+        out = _resolve_and_parse_labels(
+            cfg, task_kind=TaskKind.detection, tensor=data.tensor, sample_ids=data.sample_ids
+        )
         assert out is not None
         assert len(out) == 3
 
@@ -204,14 +207,16 @@ def test_detection_labels_count_mismatch_raises_with_list_tensor(self, tmp_path:
             "AppConfig",
             SimpleNamespace(
                 data=SimpleNamespace(
-                    labels=SimpleNamespace(
-                        source=str(labels_path),
-                    )
-                )
+                    labels=DetectionJsonLabelsConfig(source=str(labels_path)),
+                    source=None,
+                ),
+                model=SimpleNamespace(class_names=None),
             ),
         )
         with pytest.raises(ValueError, match="3 samples"):
-            DetectionFamily().load_labels(cfg, tensor=data.tensor, sample_ids=data.sample_ids)
+            _resolve_and_parse_labels(
+                cfg, task_kind=TaskKind.detection, tensor=data.tensor, sample_ids=data.sample_ids
+            )
 
 
 # ---------------------------------------------------------------------------

From a90c4f36c29291b8377a3d9e776bda82d9ddf89d Mon Sep 17 00:00:00 2001
From: Stanislas Laurent <stnsls.lrt.accnts@gmail.com>
Date: Wed, 24 Jun 2026 05:25:30 +0200
Subject: [PATCH 23/28] refactor(data): remove dead LabelFormat seam superseded
 by label parsers (refs #338)

---
 .../configs/tests/test_labels_schema.py       |  69 +++++
 src/raitap/data/__init__.py                   |   8 -
 src/raitap/data/_label_format_adapters.py     |  12 -
 src/raitap/data/adapters/__init__.py          |   1 -
 src/raitap/data/adapters/coco.py              |  72 -----
 src/raitap/data/adapters/voc.py               | 100 -------
 src/raitap/data/adapters/yolo.py              |  71 -----
 src/raitap/data/data.py                       |  99 +------
 src/raitap/data/label_formats.py              |  84 ------
 src/raitap/data/tests/test_label_formats.py   | 271 ------------------
 10 files changed, 70 insertions(+), 717 deletions(-)
 delete mode 100644 src/raitap/data/_label_format_adapters.py
 delete mode 100644 src/raitap/data/adapters/__init__.py
 delete mode 100644 src/raitap/data/adapters/coco.py
 delete mode 100644 src/raitap/data/adapters/voc.py
 delete mode 100644 src/raitap/data/adapters/yolo.py
 delete mode 100644 src/raitap/data/label_formats.py
 delete mode 100644 src/raitap/data/tests/test_label_formats.py

diff --git a/src/raitap/configs/tests/test_labels_schema.py b/src/raitap/configs/tests/test_labels_schema.py
index e97bea36..85154c2d 100644
--- a/src/raitap/configs/tests/test_labels_schema.py
+++ b/src/raitap/configs/tests/test_labels_schema.py
@@ -107,3 +107,72 @@ def test_create_label_parser_handles_both_target_forms() -> None:
 def test_detection_json_config_has_exactly_target_source_id_strategy() -> None:
     names = {f.name for f in dataclasses.fields(DetectionJsonLabelsConfig)}
     assert names == {"_target_", "source", "id_strategy"}
+
+
+# ---------------------------------------------------------------------------
+# Cross-variant leakage test (Task 10)
+# ---------------------------------------------------------------------------
+
+# Fields that belong exclusively to the tabular variant and must NOT appear
+# in any other variant's builder dataclass.
+_TABULAR_ONLY_FIELDS = {"id_column", "column", "encoding"}
+
+# Fields that belong exclusively to the voc variant.
+_VOC_ONLY_FIELDS = {"class_names"}
+
+# Variants that must have ONLY ``_target_`` (no source, no strategy, nothing).
+_TARGET_ONLY_VARIANTS: set[str] = {"directory"}
+
+# Variants that carry source + id_strategy but NO tabular fields and NO
+# class_names.
+_DETECTION_VARIANTS: set[str] = {"coco", "yolo", "detection_json"}
+
+
+@pytest.mark.parametrize(
+    "registry_name",
+    ["directory", "tabular", "coco", "yolo", "voc", "detection_json"],
+)
+def test_no_cross_variant_field_leakage(registry_name: str) -> None:
+    """Each label-parser builder dataclass must expose only its own fields.
+
+    Specifically:
+    - ``directory`` has only ``_target_``.
+    - ``coco``/``yolo``/``detection_json`` have no tabular-only fields and no
+      ``class_names``.
+    - ``voc`` has ``class_names`` but no tabular-only fields.
+    - ``tabular`` has tabular-only fields but no ``class_names``.
+    """
+    from raitap._adapters import _BUILDERS
+
+    _register_labels_group()  # presumably fills _BUILDERS['data/labels']; defined outside this hunk
+
+    builders = _BUILDERS.get("data/labels", {})
+    assert registry_name in builders, (
+        f"Registry name {registry_name!r} not found in _BUILDERS['data/labels']; "
+        f"registered: {sorted(builders)}"
+    )
+    builder = builders[registry_name]
+    field_names = {f.name for f in dataclasses.fields(builder)}
+
+    if registry_name in _TARGET_ONLY_VARIANTS:  # exact match: no extra fields tolerated
+        assert field_names == {"_target_"}, (
+            f"{registry_name!r} builder should have only '_target_' but got {field_names}"
+        )
+
+    if registry_name in _DETECTION_VARIANTS:
+        leaked = _TABULAR_ONLY_FIELDS & field_names
+        assert not leaked, f"{registry_name!r} builder leaks tabular-only fields: {leaked}"
+        assert "class_names" not in field_names, (
+            f"{registry_name!r} builder should not have 'class_names'"
+        )
+
+    if registry_name == "voc":  # NOTE(review): could reuse _VOC_ONLY_FIELDS here
+        leaked = _TABULAR_ONLY_FIELDS & field_names
+        assert not leaked, f"voc builder leaks tabular-only fields: {leaked}"
+        assert "class_names" in field_names, "voc builder must have 'class_names'"
+
+    if registry_name == "tabular":  # superset check below: extra tabular fields are fine
+        assert field_names >= _TABULAR_ONLY_FIELDS, (
+            f"tabular builder is missing expected fields; got {field_names}"
+        )
+        assert "class_names" not in field_names, "tabular builder should not have 'class_names'"
diff --git a/src/raitap/data/__init__.py b/src/raitap/data/__init__.py
index 0365b77f..7b6c7dc5 100644
--- a/src/raitap/data/__init__.py
+++ b/src/raitap/data/__init__.py
@@ -19,7 +19,6 @@
     from raitap.configs.schema import DataConfig, LabelsConfig
 
     from .data import Data, load_numpy_from_source, load_tensor_from_source
-    from .label_formats import LabelFormatAdapter, resolve_label_format_adapter
     from .metadata import DataInputMetadata, infer_data_input_metadata
     from .preprocessing import (
         DataPreprocessingFactory,
@@ -36,7 +35,6 @@
     "DataPreprocessingFactory",
     "IdStrategy",
     "LabelEncoding",
-    "LabelFormatAdapter",
     "LabelsConfig",
     "ModelInputTransformationFactory",
     "Preprocessing",
@@ -45,7 +43,6 @@
     "load_tensor_from_source",
     "raitap_model_input_transformation_factory",
     "raitap_preprocessing_factory",
-    "resolve_label_format_adapter",
 ]
 
 
@@ -71,11 +68,6 @@
         "raitap.data.preprocessing",
         "raitap_preprocessing_factory",
     ),
-    "LabelFormatAdapter": ("raitap.data.label_formats", "LabelFormatAdapter"),
-    "resolve_label_format_adapter": (
-        "raitap.data.label_formats",
-        "resolve_label_format_adapter",
-    ),
 }
 
 
diff --git a/src/raitap/data/_label_format_adapters.py b/src/raitap/data/_label_format_adapters.py
deleted file mode 100644
index 41c06b01..00000000
--- a/src/raitap/data/_label_format_adapters.py
+++ /dev/null
@@ -1,12 +0,0 @@
-# pyright: reportUnusedImport=false
-"""Imports every in-tree label-format adapter so the decorators fire.
-
-Imported for its side effects by
-``raitap.data.label_formats.resolve_label_format_adapter``. Every import in this
-module is intentionally side-effect-only (registers an adapter), so the
-file-level ``reportUnusedImport=false`` above is correct.
-"""
-
-from __future__ import annotations
-
-from raitap.data.adapters import coco, voc, yolo  # noqa: F401
diff --git a/src/raitap/data/adapters/__init__.py b/src/raitap/data/adapters/__init__.py
deleted file mode 100644
index 4b68f1da..00000000
--- a/src/raitap/data/adapters/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-"""Built-in label-format adapters (issue #338)."""
diff --git a/src/raitap/data/adapters/coco.py b/src/raitap/data/adapters/coco.py
deleted file mode 100644
index 3551e5f3..00000000
--- a/src/raitap/data/adapters/coco.py
+++ /dev/null
@@ -1,72 +0,0 @@
-"""COCO label-format adapter (issue #338)."""
-
-from __future__ import annotations
-
-import json
-from typing import TYPE_CHECKING, Any
-
-if TYPE_CHECKING:
-    from pathlib import Path
-
-from raitap.data.label_formats import (
-    ClassificationRecord,
-    DetectionRecord,
-    label_format,
-)
-from raitap.data.types import LabelFormat
-from raitap.types import TaskKind
-
-
-@label_format
-class CocoAdapter:
-    """COCO ``instances.json`` -> native records.
-
-    Detection: ``bbox`` is ``[x, y, w, h]`` -> ``[x1, y1, x2, y2]``;
-    ``category_id`` passes through unchanged so labels stay in the model's
-    label space. Classification: one label per image (the image's single
-    annotation category); images with 0 or >1 categories raise.
-    """
-
-    format = LabelFormat.coco
-    supported_tasks = frozenset({TaskKind.detection, TaskKind.classification})
-
-    def _load(self, source: Path) -> dict[str, Any]:
-        with source.open() as fh:
-            data = json.load(fh)
-        if not isinstance(data, dict) or "images" not in data:
-            raise ValueError(f"COCO file {source} must be an object with an 'images' array.")
-        return data
-
-    def to_detection_records(
-        self, source: Path, *, image_dir: Path | None, class_names: list[str] | None
-    ) -> list[DetectionRecord]:
-        data = self._load(source)
-        file_by_image: dict[int, str] = {img["id"]: img["file_name"] for img in data["images"]}
-        boxes: dict[int, list[list[float]]] = {iid: [] for iid in file_by_image}
-        labels: dict[int, list[int]] = {iid: [] for iid in file_by_image}
-        for ann in data.get("annotations", []):
-            iid = ann["image_id"]
-            x, y, w, h = ann["bbox"]
-            boxes[iid].append([x, y, x + w, y + h])
-            labels[iid].append(int(ann["category_id"]))
-        return [
-            {"sample_id": file_by_image[iid], "boxes": boxes[iid], "labels": labels[iid]}
-            for iid in file_by_image
-        ]
-
-    def to_classification_records(self, source: Path) -> list[ClassificationRecord]:
-        data = self._load(source)
-        file_by_image: dict[int, str] = {img["id"]: img["file_name"] for img in data["images"]}
-        cats: dict[int, set[int]] = {iid: set() for iid in file_by_image}
-        for ann in data.get("annotations", []):
-            cats[ann["image_id"]].add(int(ann["category_id"]))
-        records: list[ClassificationRecord] = []
-        for iid, name in file_by_image.items():
-            cat_set = cats[iid]
-            if len(cat_set) != 1:
-                raise ValueError(
-                    f"COCO classification needs exactly one category per image; "
-                    f"image {name!r} has {len(cat_set)}."
-                )
-            records.append({"sample_id": name, "label": next(iter(cat_set))})
-        return records
diff --git a/src/raitap/data/adapters/voc.py b/src/raitap/data/adapters/voc.py
deleted file mode 100644
index 02a8f270..00000000
--- a/src/raitap/data/adapters/voc.py
+++ /dev/null
@@ -1,100 +0,0 @@
-"""Pascal-VOC label-format adapter (issue #338)."""
-
-from __future__ import annotations
-
-import xml.etree.ElementTree as ET
-from typing import TYPE_CHECKING
-
-if TYPE_CHECKING:
-    from pathlib import Path
-
-from raitap.data.label_formats import (
-    ClassificationRecord,
-    DetectionRecord,
-    label_format,
-)
-from raitap.data.types import LabelFormat
-from raitap.types import TaskKind
-
-#: Canonical Pascal-VOC class order (index = label id) when no class_names given.
-_VOC_CLASSES = (
-    "aeroplane",
-    "bicycle",
-    "bird",
-    "boat",
-    "bottle",
-    "bus",
-    "car",
-    "cat",
-    "chair",
-    "cow",
-    "diningtable",
-    "dog",
-    "horse",
-    "motorbike",
-    "person",
-    "pottedplant",
-    "sheep",
-    "sofa",
-    "train",
-    "tvmonitor",
-)
-
-
-def _coord(box: ET.Element, tag: str, xml_path: Path) -> float:
-    text = box.findtext(tag)
-    if text is None:
-        raise ValueError(f"VOC bndbox in {xml_path.name} missing <{tag}>.")
-    return float(text)
-
-
-@label_format
-class VocAdapter:
-    """Pascal-VOC per-image ``.xml`` -> native detection records.
-
-    Boxes are already ``[xmin, ymin, xmax, ymax]`` pixels. Class names map to
-    ids by their position in ``class_names`` (else the standard 20-class VOC
-    order).
-    """
-
-    format = LabelFormat.voc
-    supported_tasks = frozenset({TaskKind.detection})
-
-    def to_detection_records(
-        self, source: Path, *, image_dir: Path | None, class_names: list[str] | None
-    ) -> list[DetectionRecord]:
-        name_to_id = {
-            name: idx for idx, name in enumerate(class_names if class_names else _VOC_CLASSES)
-        }
-        records: list[DetectionRecord] = []
-        for xml_path in sorted(source.glob("*.xml")):
-            root = ET.parse(xml_path).getroot()
-            filename_el = root.find("filename")
-            if filename_el is None or not filename_el.text:
-                raise ValueError(f"VOC file {xml_path} has no <filename>.")
-            boxes: list[list[float]] = []
-            labels: list[int] = []
-            for obj in root.findall("object"):
-                name = obj.findtext("name")
-                if name not in name_to_id:
-                    raise ValueError(
-                        f"VOC class {name!r} in {xml_path.name} is not in the "
-                        f"class list {sorted(name_to_id)}."
-                    )
-                box = obj.find("bndbox")
-                if box is None:
-                    raise ValueError(f"VOC object in {xml_path.name} has no <bndbox>.")
-                boxes.append(
-                    [
-                        _coord(box, "xmin", xml_path),
-                        _coord(box, "ymin", xml_path),
-                        _coord(box, "xmax", xml_path),
-                        _coord(box, "ymax", xml_path),
-                    ]
-                )
-                labels.append(name_to_id[name])
-            records.append({"sample_id": filename_el.text, "boxes": boxes, "labels": labels})
-        return records
-
-    def to_classification_records(self, source: Path) -> list[ClassificationRecord]:
-        raise ValueError("VOC is a detection-only format.")
diff --git a/src/raitap/data/adapters/yolo.py b/src/raitap/data/adapters/yolo.py
deleted file mode 100644
index be6419f8..00000000
--- a/src/raitap/data/adapters/yolo.py
+++ /dev/null
@@ -1,71 +0,0 @@
-"""YOLO label-format adapter (issue #338)."""
-
-from __future__ import annotations
-
-from typing import TYPE_CHECKING
-
-from PIL import Image
-
-if TYPE_CHECKING:
-    from pathlib import Path
-
-from raitap.data.label_formats import (
-    ClassificationRecord,
-    DetectionRecord,
-    label_format,
-)
-from raitap.data.types import LabelFormat
-from raitap.types import TaskKind
-
-_IMAGE_SUFFIXES = (".jpg", ".jpeg", ".png", ".bmp", ".webp")
-
-
-@label_format
-class YoloAdapter:
-    """YOLO per-image ``.txt`` (``class cx cy w h``, normalised) -> native records.
-
-    Boxes are denormalised with each image's pixel size, read from
-    ``image_dir``. Class indices pass through unchanged.
-    """
-
-    format = LabelFormat.yolo
-    supported_tasks = frozenset({TaskKind.detection})
-
-    def _image_for(self, image_dir: Path, stem: str) -> Path:
-        for suffix in _IMAGE_SUFFIXES:
-            candidate = image_dir / f"{stem}{suffix}"
-            if candidate.exists():
-                return candidate
-        raise ValueError(f"YOLO adapter found no image for label {stem!r} in {image_dir}.")
-
-    def to_detection_records(
-        self, source: Path, *, image_dir: Path | None, class_names: list[str] | None
-    ) -> list[DetectionRecord]:
-        if image_dir is None:
-            raise ValueError(
-                "YOLO labels need image_dir to denormalise boxes; "
-                "set data.source to the image directory."
-            )
-        records: list[DetectionRecord] = []
-        for txt in sorted(source.glob("*.txt")):
-            image_path = self._image_for(image_dir, txt.stem)
-            with Image.open(image_path) as im:
-                width, height = im.size
-            boxes: list[list[float]] = []
-            labels: list[int] = []
-            for line in txt.read_text().splitlines():
-                parts = line.split()
-                if not parts:
-                    continue
-                cls, cx, cy, bw, bh = (float(p) for p in parts[:5])
-                x1 = (cx - bw / 2) * width
-                y1 = (cy - bh / 2) * height
-                x2 = (cx + bw / 2) * width
-                y2 = (cy + bh / 2) * height
-                boxes.append([x1, y1, x2, y2])
-                labels.append(int(cls))
-            records.append({"sample_id": image_path.name, "boxes": boxes, "labels": labels})
-        return records
-
-    def to_classification_records(self, source: Path) -> list[ClassificationRecord]:
-        raise ValueError("YOLO is a detection-only format.")
diff --git a/src/raitap/data/data.py b/src/raitap/data/data.py
index 46c4ee8d..15f6227a 100644
--- a/src/raitap/data/data.py
+++ b/src/raitap/data/data.py
@@ -2,7 +2,7 @@
 
 from collections import Counter
 from enum import StrEnum
-from pathlib import Path, PurePosixPath
+from pathlib import Path
 from typing import TYPE_CHECKING, Any
 
 import numpy as np
@@ -276,103 +276,6 @@ def _resolve_and_parse_labels(
     )
 
 
-def _load_directory_labels(sample_ids: list[str] | None) -> torch.Tensor | None:
-    """Derive classification labels from each sample's top-level class folder
-    (torchvision ImageFolder semantics). Returns None (with a warning) when
-    labels cannot be derived: no sample ids, or a sample with no class subdir."""
-    if not sample_ids:
-        raitap_log.warn(
-            "data.labels.source='directory' needs image samples organised into "
-            "class subdirectories; none were found. Falling back to predictions "
-            "as metric targets."
-        )
-        return None
-    parts_by_id = [PurePosixPath(sid).parts for sid in sample_ids]
-    if any(len(parts) < 2 for parts in parts_by_id):
-        raitap_log.warn(
-            "data.labels.source='directory' expects a <class>/<file> layout, but "
-            "one or more samples sit directly under the data source root (no class "
-            "subdirectory). Falling back to predictions as metric targets."
-        )
-        return None
-    classes = sorted({parts[0] for parts in parts_by_id})
-    class_to_idx = {name: idx for idx, name in enumerate(classes)}
-    labels = [class_to_idx[parts[0]] for parts in parts_by_id]
-    return torch.tensor(labels, dtype=torch.long)
-
-
-def load_classification_labels(
-    cfg: AppConfig,
-    *,
-    tensor: torch.Tensor | DetectionInputs,
-    sample_ids: list[str] | None,
-) -> torch.Tensor | None:
-    """Load tabular classification labels (CSV/TSV/Parquet) → tensor or ``None``.
-
-    Aligns to ``sample_ids`` by id column when available, otherwise falls back
-    to row order. Returns ``None`` when ``data.labels.source`` is unset, the
-    file is empty, or alignment fails (callers then use predictions as targets).
-
-    Note: directory and format-adapter branches have moved to dedicated
-    ``LabelParser`` implementations. This function handles the tabular (native)
-    path only and will be wrapped by ``TabularLabelParser`` in a later task.
-    """
-    labels_cfg = _get_optional_config_value(cfg.data, "labels")
-    labels_source = _get_optional_config_value(labels_cfg, "source")
-    if not labels_source:
-        return None
-
-    labels_path = get_source_path(labels_source, kind=SourceKind.LABELS)
-    labels_df = _load_tabular_frame(labels_path)
-    if labels_df.empty:
-        raitap_log.warn("Labels file is empty; falling back to predictions as targets.")
-        return None
-
-    labels_id_column = _get_optional_config_value(labels_cfg, "id_column")
-    id_column = _resolve_labels_id_column(labels_df, labels_id_column)
-    labels_column = _get_optional_config_value(labels_cfg, "column")
-    labels_encoding = _get_optional_config_value(labels_cfg, "encoding")
-    labels_id_strategy = _get_optional_config_value(labels_cfg, "id_strategy") or "auto"
-    encoded_labels = _extract_class_labels(
-        labels_df,
-        labels_column=labels_column,
-        id_column=id_column,
-        labels_encoding=labels_encoding,
-    )
-
-    expected = len(tensor)
-    if sample_ids and id_column:
-        id_series = _column_as_series(labels_df, id_column)
-        strategy = _resolve_id_strategy(labels_id_strategy, id_series)
-        try:
-            aligned_labels = _align_labels_to_samples(
-                sample_ids=sample_ids,
-                raw_label_ids=id_series,
-                encoded_labels=encoded_labels,
-                strategy=strategy,
-            )
-        except ValueError as error:
-            raitap_log.warn(
-                f"{error} Falling back to predictions as metric targets.",
-            )
-            return None
-        return torch.tensor(aligned_labels, dtype=torch.long)
-
-    if sample_ids and not id_column:
-        raitap_log.warn(
-            "Could not find a labels id column for filename alignment; using row-order labels.",
-        )
-
-    if len(encoded_labels) != expected:
-        raitap_log.warn(
-            f"Label count ({len(encoded_labels)}) does not match sample count ({expected}); "
-            "falling back to predictions as targets.",
-        )
-        return None
-
-    return torch.tensor(encoded_labels, dtype=torch.long)
-
-
 def load_tensor_from_source(
     source: str,
     n_samples: int | None = None,
diff --git a/src/raitap/data/label_formats.py b/src/raitap/data/label_formats.py
deleted file mode 100644
index 19021a95..00000000
--- a/src/raitap/data/label_formats.py
+++ /dev/null
@@ -1,84 +0,0 @@
-"""Pluggable label-format adapters (issue #338).
-
-Each adapter converts an external annotation file (COCO / YOLO / VOC) into
-RAITAP's native intermediate record list, which the task-family loaders then
-align to ``sample_ids`` with their existing logic. Registry mirrors
-``raitap.task_families.registry``: a decorator registers one singleton per
-``LabelFormat``.
-"""
-
-from __future__ import annotations
-
-from typing import TYPE_CHECKING, Any, Protocol, TypeVar, runtime_checkable
-
-if TYPE_CHECKING:
-    from pathlib import Path
-
-    from raitap.data.types import LabelFormat
-    from raitap.types import TaskKind
-
-#: Native intermediate record shapes (match the on-disk native formats).
-DetectionRecord = dict[str, Any]
-ClassificationRecord = dict[str, Any]
-
-
-@runtime_checkable
-class LabelFormatAdapter(Protocol):
-    """Converts an external label file to native intermediate records."""
-
-    format: LabelFormat
-    supported_tasks: frozenset[TaskKind]
-
-    def to_detection_records(
-        self,
-        source: Path,
-        *,
-        image_dir: Path | None,
-        class_names: list[str] | None,
-    ) -> list[DetectionRecord]:
-        """Return ``[{sample_id, boxes (xyxy), labels}]``. Raise if unsupported."""
-        ...
-
-    def to_classification_records(self, source: Path) -> list[ClassificationRecord]:
-        """Return ``[{sample_id, label}]``. Raise if unsupported."""
-        ...
-
-
-#: format -> the adapter singleton serving it.
-LABEL_FORMAT_ADAPTERS: dict[LabelFormat, LabelFormatAdapter] = {}
-
-T = TypeVar("T")
-
-
-def label_format(cls: type[T]) -> type[T]:
-    """Register ``cls`` (instantiated once) under its ``format`` class attribute."""
-    instance = cls()  # type: ignore[call-arg]
-    LABEL_FORMAT_ADAPTERS[instance.format] = instance  # type: ignore[attr-defined]
-    return cls
-
-
-def resolve_label_format_adapter(fmt: LabelFormat, *, task_kind: TaskKind) -> LabelFormatAdapter:
-    """Return the adapter for ``fmt`` that supports ``task_kind``.
-
-    Raises ``ValueError`` when no adapter is registered for ``fmt`` (e.g.
-    ``native``, which the caller should special-case) or the adapter does not
-    declare ``task_kind`` in ``supported_tasks``.
-    """
-    # Import side-effect: register the in-tree adapters on first use.
-    from raitap.data import (
-        _label_format_adapters,  # noqa: F401  # pyright: ignore[reportUnusedImport]
-    )
-
-    adapter = LABEL_FORMAT_ADAPTERS.get(fmt)
-    if adapter is None:
-        raise ValueError(
-            f"No adapter registered for label format {fmt.value!r}; "
-            f"registered: {sorted(f.value for f in LABEL_FORMAT_ADAPTERS)}."
-        )
-    if task_kind not in adapter.supported_tasks:
-        supported = sorted(t.value for t in adapter.supported_tasks)
-        raise ValueError(
-            f"Label format {fmt.value!r} does not support task {task_kind.value!r}; "
-            f"supported tasks: {supported}."
-        )
-    return adapter
diff --git a/src/raitap/data/tests/test_label_formats.py b/src/raitap/data/tests/test_label_formats.py
deleted file mode 100644
index 657de141..00000000
--- a/src/raitap/data/tests/test_label_formats.py
+++ /dev/null
@@ -1,271 +0,0 @@
-from __future__ import annotations
-
-from typing import TYPE_CHECKING, cast
-
-import pytest
-
-from raitap.configs.schema import LabelsConfig
-from raitap.data.label_formats import (
-    LABEL_FORMAT_ADAPTERS,
-    label_format,
-    resolve_label_format_adapter,
-)
-from raitap.data.types import LabelFormat
-from raitap.types import TaskKind
-
-if TYPE_CHECKING:
-    from pathlib import Path
-
-    from raitap.configs.schema import AppConfig
-
-
-def test_label_format_members_are_string_values() -> None:
-    assert LabelFormat.native == "native"
-    assert {f.value for f in LabelFormat} == {"native", "coco", "yolo", "voc"}
-
-
-def test_labels_config_defaults_to_native_format() -> None:
-    assert LabelsConfig().format is LabelFormat.native
-
-
-def test_label_format_decorator_registers_instance() -> None:
-    @label_format
-    class _Dummy:
-        format = LabelFormat.coco  # reuse an enum member; popped below
-        supported_tasks = frozenset({TaskKind.detection})
-
-    try:
-        assert LABEL_FORMAT_ADAPTERS[LabelFormat.coco].supported_tasks == frozenset(
-            {TaskKind.detection}
-        )
-    finally:
-        LABEL_FORMAT_ADAPTERS.pop(LabelFormat.coco, None)
-
-
-def test_registry_rejects_unknown_native() -> None:
-    with pytest.raises(ValueError, match="No adapter"):
-        resolve_label_format_adapter(LabelFormat.native, task_kind=TaskKind.detection)
-
-
-def test_registry_resolves_supported_task() -> None:
-    adapter = resolve_label_format_adapter(LabelFormat.coco, task_kind=TaskKind.detection)
-    assert adapter.format is LabelFormat.coco
-    assert TaskKind.detection in adapter.supported_tasks
-
-
-def test_registry_rejects_unsupported_task() -> None:
-    with pytest.raises(ValueError, match="does not support task"):
-        resolve_label_format_adapter(LabelFormat.yolo, task_kind=TaskKind.classification)
-
-
-def test_coco_detection_records(tmp_path: Path) -> None:
-    import json
-
-    from raitap.data.adapters.coco import CocoAdapter
-
-    coco = {
-        "images": [
-            {"id": 1, "file_name": "a.jpg"},
-            {"id": 2, "file_name": "b.jpg"},
-        ],
-        "annotations": [
-            {"image_id": 1, "category_id": 3, "bbox": [10, 20, 30, 40]},
-            {"image_id": 1, "category_id": 5, "bbox": [0, 0, 5, 5]},
-        ],
-        "categories": [{"id": 3, "name": "car"}, {"id": 5, "name": "dog"}],
-    }
-    p = tmp_path / "instances.json"
-    p.write_text(json.dumps(coco))
-
-    records = CocoAdapter().to_detection_records(p, image_dir=None, class_names=None)
-    by_id = {r["sample_id"]: r for r in records}
-    assert by_id["a.jpg"]["boxes"] == [[10, 20, 40, 60], [0, 0, 5, 5]]
-    assert by_id["a.jpg"]["labels"] == [3, 5]
-    assert by_id["b.jpg"] == {"sample_id": "b.jpg", "boxes": [], "labels": []}
-
-
-def test_coco_classification_records(tmp_path: Path) -> None:
-    import json
-
-    from raitap.data.adapters.coco import CocoAdapter
-
-    coco = {
-        "images": [{"id": 1, "file_name": "a.jpg"}],
-        "annotations": [{"image_id": 1, "category_id": 7, "bbox": [0, 0, 1, 1]}],
-        "categories": [{"id": 7, "name": "cat"}],
-    }
-    p = tmp_path / "c.json"
-    p.write_text(json.dumps(coco))
-    records = CocoAdapter().to_classification_records(p)
-    assert records == [{"sample_id": "a.jpg", "label": 7}]
-
-
-def test_coco_classification_rejects_zero_categories(tmp_path: Path) -> None:
-    import json
-
-    from raitap.data.adapters.coco import CocoAdapter
-
-    coco = {
-        "images": [{"id": 1, "file_name": "a.jpg"}],
-        "annotations": [],
-        "categories": [{"id": 7, "name": "cat"}],
-    }
-    p = tmp_path / "zero.json"
-    p.write_text(json.dumps(coco))
-    with pytest.raises(ValueError, match="exactly one category per image"):
-        CocoAdapter().to_classification_records(p)
-
-
-def test_coco_classification_rejects_multiple_categories(tmp_path: Path) -> None:
-    import json
-
-    from raitap.data.adapters.coco import CocoAdapter
-
-    coco = {
-        "images": [{"id": 1, "file_name": "a.jpg"}],
-        "annotations": [
-            {"image_id": 1, "category_id": 3, "bbox": [0, 0, 1, 1]},
-            {"image_id": 1, "category_id": 5, "bbox": [0, 0, 1, 1]},
-        ],
-        "categories": [{"id": 3, "name": "car"}, {"id": 5, "name": "dog"}],
-    }
-    p = tmp_path / "multi.json"
-    p.write_text(json.dumps(coco))
-    with pytest.raises(ValueError, match="exactly one category per image"):
-        CocoAdapter().to_classification_records(p)
-
-
-def test_yolo_detection_records(tmp_path: Path) -> None:
-    from PIL import Image
-
-    from raitap.data.adapters.yolo import YoloAdapter
-
-    image_dir = tmp_path / "images"
-    image_dir.mkdir()
-    Image.new("RGB", (100, 200)).save(image_dir / "a.jpg")  # w=100, h=200
-
-    label_dir = tmp_path / "labels"
-    label_dir.mkdir()
-    # class=2, cx=0.5 cy=0.5 w=0.2 h=0.1  -> center (50,100), box 20x20px
-    (label_dir / "a.txt").write_text("2 0.5 0.5 0.2 0.1\n")
-
-    records = YoloAdapter().to_detection_records(label_dir, image_dir=image_dir, class_names=None)
-    assert len(records) == 1
-    rec = records[0]
-    assert rec["sample_id"] == "a.jpg"
-    assert rec["labels"] == [2]
-    # x1 = (0.5-0.1)*100=40, y1=(0.5-0.05)*200=90, x2=60, y2=110
-    assert len(rec["boxes"]) == 1
-    assert rec["boxes"][0] == pytest.approx([40.0, 90.0, 60.0, 110.0])
-
-
-def test_voc_detection_records(tmp_path: Path) -> None:
-    from raitap.data.adapters.voc import VocAdapter
-
-    xml = """<annotation>
-      <filename>a.jpg</filename>
-      <object><name>person</name>
-        <bndbox><xmin>10</xmin><ymin>20</ymin><xmax>30</xmax><ymax>40</ymax></bndbox>
-      </object>
-    </annotation>"""
-    d = tmp_path / "ann"
-    d.mkdir()
-    (d / "a.xml").write_text(xml)
-
-    records = VocAdapter().to_detection_records(
-        d, image_dir=None, class_names=["background", "person", "car"]
-    )
-    assert records == [{"sample_id": "a.jpg", "boxes": [[10.0, 20.0, 30.0, 40.0]], "labels": [1]}]
-
-
-def test_voc_detection_rejects_object_without_bndbox(tmp_path: Path) -> None:
-    from raitap.data.adapters.voc import VocAdapter
-
-    xml = """<annotation>
-      <filename>a.jpg</filename>
-      <object><name>person</name></object>
-    </annotation>"""
-    d = tmp_path / "ann"
-    d.mkdir()
-    (d / "a.xml").write_text(xml)
-
-    with pytest.raises(ValueError, match="has no <bndbox>"):
-        VocAdapter().to_detection_records(
-            d, image_dir=None, class_names=["background", "person", "car"]
-        )
-
-
-def test_detection_load_labels_via_coco(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
-    import json
-    from types import SimpleNamespace
-
-    import torch
-
-    import raitap.data.data as data_mod
-    from raitap.data.types import LabelFormat
-    from raitap.task_families.detection import DetectionFamily
-
-    coco = {
-        "images": [{"id": 1, "file_name": "a.jpg"}, {"id": 2, "file_name": "b.jpg"}],
-        "annotations": [{"image_id": 1, "category_id": 3, "bbox": [10, 20, 30, 40]}],
-        "categories": [{"id": 3, "name": "car"}],
-    }
-    labels_file = tmp_path / "instances.json"
-    labels_file.write_text(json.dumps(coco))
-
-    monkeypatch.setattr(data_mod, "get_source_path", lambda source, *, kind: tmp_path / source)
-    # tmp_path/"instances.json" is LABELS; tmp_path/"imgs" is DATA (unused by coco).
-    cfg = cast(
-        "AppConfig",
-        SimpleNamespace(
-            data=SimpleNamespace(
-                source="imgs",
-                labels=SimpleNamespace(source="instances.json", format=LabelFormat.coco),
-            )
-        ),
-    )
-    tensor = [object(), object()]  # len == 2 samples
-    out = DetectionFamily().load_labels(cfg, tensor=tensor, sample_ids=["a.jpg", "b.jpg"])
-    assert torch.equal(out[0]["boxes"], torch.tensor([[10.0, 20.0, 40.0, 60.0]]))
-    assert torch.equal(out[0]["labels"], torch.tensor([3]))
-    assert out[1]["boxes"].shape == (0, 4)
-
-
-def test_classification_load_labels_via_coco(
-    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
-) -> None:
-    import json
-    from types import SimpleNamespace
-
-    import torch
-
-    import raitap.data.data as data_mod
-    from raitap.data.types import LabelFormat
-
-    coco = {
-        "images": [{"id": 1, "file_name": "a.jpg"}, {"id": 2, "file_name": "b.jpg"}],
-        "annotations": [
-            {"image_id": 1, "category_id": 0, "bbox": [0, 0, 1, 1]},
-            {"image_id": 2, "category_id": 4, "bbox": [0, 0, 1, 1]},
-        ],
-        "categories": [{"id": 0, "name": "x"}, {"id": 4, "name": "y"}],
-    }
-    labels_file = tmp_path / "c.json"
-    labels_file.write_text(json.dumps(coco))
-    monkeypatch.setattr(data_mod, "get_source_path", lambda source, *, kind: tmp_path / source)
-    cfg = cast(
-        "AppConfig",
-        SimpleNamespace(
-            data=SimpleNamespace(
-                source="imgs",
-                labels=SimpleNamespace(
-                    source="c.json", format=LabelFormat.coco, id_strategy="stem"
-                ),
-            )
-        ),
-    )
-    out = data_mod.load_classification_labels(
-        cfg, tensor=torch.zeros(2), sample_ids=["a.jpg", "b.jpg"]
-    )
-    assert out is not None
-    assert torch.equal(out, torch.tensor([0, 4]))

From 805621c51adbf4a1cf0b5a9d0dd933fa57378885 Mon Sep 17 00:00:00 2001
From: Stanislas Laurent <stnsls.lrt.accnts@gmail.com>
Date: Wed, 24 Jun 2026 05:52:03 +0200
Subject: [PATCH 24/28] chore(config): migrate data.labels to discriminated
 label parsers (refs #338)

---
 .../fasterrcnn-udacity/assessment.yaml        |   5 +
 .../imagecorruptions-imagenet/assessment.yaml |   1 +
 .../lwise-ham10000/assessment.yaml            |   6 +
 .../marabou-mnist/assessment.yaml             |   2 +
 .../noisetunnel-smoothgrad/assessment.yaml    |   1 +
 example/assessment.yaml                       |   1 +
 src/raitap/configs/demo.yaml                  |   1 +
 src/raitap/data/tests/test_data.py            |  60 +++----
 src/raitap/data/tests/test_data_class.py      | 150 +++++++++++-------
 .../data/tests/test_detection_ragged.py       |   9 +-
 src/raitap/tests/test_api.py                  |   4 +-
 src/raitap/tests/test_example_recipes.py      |   5 +-
 12 files changed, 143 insertions(+), 102 deletions(-)

diff --git a/contributor-configs/fasterrcnn-udacity/assessment.yaml b/contributor-configs/fasterrcnn-udacity/assessment.yaml
index d9f38975..7fd84ea7 100644
--- a/contributor-configs/fasterrcnn-udacity/assessment.yaml
+++ b/contributor-configs/fasterrcnn-udacity/assessment.yaml
@@ -12,6 +12,11 @@
 #   Hand-authored COCO-class boxes in ``labels/udacity-boxes.json`` — see
 #   ``labelling-data.md`` for the format and a candidate-generation helper.
 
+defaults:
+  - raitap_schema
+  - data/labels: detection_json
+  - _self_
+
 experiment_name: fasterrcnn-udacity-detection-demo
 hardware: cpu
 
diff --git a/contributor-configs/imagecorruptions-imagenet/assessment.yaml b/contributor-configs/imagecorruptions-imagenet/assessment.yaml
index 97c2fbee..659e8708 100644
--- a/contributor-configs/imagecorruptions-imagenet/assessment.yaml
+++ b/contributor-configs/imagecorruptions-imagenet/assessment.yaml
@@ -12,6 +12,7 @@ defaults:
   - raitap_schema      # required to bind the schema, must come first
   - reporting: html
   - metrics: multiclass_classification
+  - data/labels: tabular
   - _self_
 
 hardware: gpu
diff --git a/contributor-configs/lwise-ham10000/assessment.yaml b/contributor-configs/lwise-ham10000/assessment.yaml
index e1d90cf9..a80866ba 100644
--- a/contributor-configs/lwise-ham10000/assessment.yaml
+++ b/contributor-configs/lwise-ham10000/assessment.yaml
@@ -7,6 +7,12 @@
 #   TorchScript modules do not support. Override with:
 #   LWISE_HAM10000_MODEL=/path/to/lwise_ham10000_eager.pt
 
+defaults:
+  - raitap_schema
+  - metrics: multiclass_classification
+  - data/labels: tabular
+  - _self_
+
 experiment_name: lwise-ham10000-dermoscopy-demo
 hardware: gpu
 
diff --git a/contributor-configs/marabou-mnist/assessment.yaml b/contributor-configs/marabou-mnist/assessment.yaml
index e00b0e7c..5a89aadd 100644
--- a/contributor-configs/marabou-mnist/assessment.yaml
+++ b/contributor-configs/marabou-mnist/assessment.yaml
@@ -3,6 +3,8 @@
 # `data=mnist_samples`, `model=mlp_mnist`, `robustness=marabou_linf`.
 # No transparency block (the original demo invoked `~transparency`).
 defaults:
+  - raitap_schema
+  - data/labels: tabular
   - _self_
 
 experiment_name: marabou_mnist_uc1
diff --git a/contributor-configs/noisetunnel-smoothgrad/assessment.yaml b/contributor-configs/noisetunnel-smoothgrad/assessment.yaml
index cb7b9b3e..44946d44 100644
--- a/contributor-configs/noisetunnel-smoothgrad/assessment.yaml
+++ b/contributor-configs/noisetunnel-smoothgrad/assessment.yaml
@@ -8,6 +8,7 @@
 defaults:
   - raitap_schema
   - metrics: multiclass_classification
+  - data/labels: tabular
   - _self_
 
 hardware: cpu
diff --git a/example/assessment.yaml b/example/assessment.yaml
index 4a2df860..13fb6c72 100644
--- a/example/assessment.yaml
+++ b/example/assessment.yaml
@@ -2,6 +2,7 @@ defaults:
   - raitap_schema      # required to bind the schema, must come first
   - reporting: html
   - metrics: multiclass_classification
+  - data/labels: tabular
   - _self_
 
 hardware: gpu
diff --git a/src/raitap/configs/demo.yaml b/src/raitap/configs/demo.yaml
index d0ef4e17..ddb15c34 100644
--- a/src/raitap/configs/demo.yaml
+++ b/src/raitap/configs/demo.yaml
@@ -3,6 +3,7 @@
 defaults:
   - raitap_schema     # binds AppConfig dataclass → unset fields inherit defaults
   - metrics: multiclass_classification   # narrows metrics schema so num_classes is accepted
+  - data/labels: tabular
   - _self_
 
 hardware: cpu
diff --git a/src/raitap/data/tests/test_data.py b/src/raitap/data/tests/test_data.py
index b1e931e4..907778df 100644
--- a/src/raitap/data/tests/test_data.py
+++ b/src/raitap/data/tests/test_data.py
@@ -195,7 +195,7 @@ class TestDataPreprocessing:
 
     @staticmethod
     def _make_cfg(source: str, *, preprocessing: str | None) -> AppConfig:
-        from raitap.configs.schema import AppConfig, DataConfig, LabelsConfig, ModelConfig
+        from raitap.configs.schema import AppConfig, DataConfig, ModelConfig
 
         return cast(
             "AppConfig",
@@ -205,7 +205,7 @@ def _make_cfg(source: str, *, preprocessing: str | None) -> AppConfig:
                     name="test",
                     source=source,
                     preprocessing=preprocessing,
-                    labels=LabelsConfig(),
+                    labels=None,
                 ),
                 hardware=Hardware.cpu,
             ),
@@ -239,7 +239,7 @@ def test_uniform_dir_without_preprocessing_still_loads(self, tmp_path: Path) ->
     def test_supplied_resolved_preprocessing_skips_resolution(self, tmp_path: Path) -> None:
         from torch import nn
 
-        from raitap.configs.schema import AppConfig, DataConfig, LabelsConfig, ModelConfig
+        from raitap.configs.schema import AppConfig, DataConfig, ModelConfig
         from raitap.data.preprocessing import ResolvedPreprocessing
 
         class _ShapeModule(nn.Module):
@@ -255,7 +255,7 @@ def forward(self, image: torch.Tensor) -> torch.Tensor:
                     name="test",
                     source=str(tmp_path),
                     preprocessing="model-bundled",
-                    labels=LabelsConfig(),
+                    labels=None,
                 ),
                 hardware=Hardware.cpu,
             ),
@@ -279,7 +279,7 @@ def forward(self, image: torch.Tensor) -> torch.Tensor:
     def test_onnx_custom_file_data_factory_drives_data_loading(
         self, monkeypatch: pytest.MonkeyPatch, tmp_path: Path
     ) -> None:
-        from raitap.configs.schema import AppConfig, DataConfig, LabelsConfig, ModelConfig
+        from raitap.configs.schema import AppConfig, DataConfig, ModelConfig
 
         _write_image(tmp_path / "a.jpg", 32, 48)
         _write_image(tmp_path / "b.jpg", 40, 64)
@@ -319,7 +319,7 @@ def test_onnx_custom_file_data_factory_drives_data_loading(
                     name="test",
                     source=str(tmp_path),
                     preprocessing=str(preprocessing_path),
-                    labels=LabelsConfig(),
+                    labels=None,
                 ),
                 hardware=Hardware.cpu,
             ),
@@ -369,7 +369,7 @@ def test_sample_source_loads_native_resolution_then_transforms(self, tmp_path: P
         breaks pretrained-weight accuracy on `raitap --demo`."""
         from torch import nn
 
-        from raitap.configs.schema import AppConfig, DataConfig, LabelsConfig, ModelConfig
+        from raitap.configs.schema import AppConfig, DataConfig, ModelConfig
         from raitap.data.samples import SAMPLE_SOURCES
 
         # Stage a fake sample at varied native sizes so the test would fail
@@ -411,7 +411,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
                             name="fake_native_samples",
                             source="fake_native_samples",
                             preprocessing="model-bundled",
-                            labels=LabelsConfig(),
+                            labels=None,
                         ),
                         hardware=Hardware.cpu,
                     ),
@@ -589,7 +589,7 @@ def test_unknown_extension_raises(self, tmp_path: Path) -> None:
     def test_tabular_applies_data_module(self, tmp_path: Path) -> None:
         from torch import nn
 
-        from raitap.configs.schema import AppConfig, DataConfig, LabelsConfig, ModelConfig
+        from raitap.configs.schema import AppConfig, DataConfig, ModelConfig
         from raitap.data.preprocessing import ResolvedPreprocessing
 
         class _ScaleModule(nn.Module):
@@ -606,7 +606,7 @@ def forward(self, batch: torch.Tensor) -> torch.Tensor:
                     name="tab",
                     source=str(p),
                     preprocessing="./scale.py",
-                    labels=LabelsConfig(),
+                    labels=None,
                 ),
                 hardware=Hardware.cpu,
             ),
@@ -701,7 +701,9 @@ def test_url_source_loads_image_via_get_source_path(self, tmp_path: Path) -> Non
         assert data.tensor.shape == (1, 3, 32, 32)
 
     def test_sample_labels_align_with_sample_images(self, tmp_path: Path) -> None:
+        from raitap.configs.schema import AppConfig, DataConfig, ModelConfig, TabularLabelsConfig
         from raitap.data.samples import SAMPLE_LABELS
+        from raitap.data.types import LabelEncoding
 
         with (
             patch("raitap.data.samples._CACHE_DIR", tmp_path),
@@ -711,30 +713,20 @@ def test_sample_labels_align_with_sample_images(self, tmp_path: Path) -> None:
             mock_download.side_effect = lambda _url, dest: _write_image(dest, 32, 32)
             cfg = cast(
                 "AppConfig",
-                type(
-                    "AppConfig",
-                    (),
-                    {
-                        "data": type(
-                            "DataConfig",
-                            (),
-                            {
-                                "source": "imagenet_samples",
-                                "name": "imagenet_samples",
-                                "labels": type(
-                                    "LabelsConfig",
-                                    (),
-                                    {
-                                        "source": "imagenet_samples",
-                                        "id_column": "image",
-                                        "column": "label",
-                                        "encoding": "index",
-                                    },
-                                )(),
-                            },
-                        )()
-                    },
-                )(),
+                AppConfig(
+                    model=ModelConfig(source="resnet50"),
+                    data=DataConfig(
+                        name="imagenet_samples",
+                        source="imagenet_samples",
+                        labels=TabularLabelsConfig(
+                            source="imagenet_samples",
+                            id_column="image",
+                            column="label",
+                            encoding=LabelEncoding.index,
+                        ),
+                    ),
+                    hardware=Hardware.cpu,
+                ),
             )
             data = Data(cfg)
 
diff --git a/src/raitap/data/tests/test_data_class.py b/src/raitap/data/tests/test_data_class.py
index 44b207f7..72ead53e 100644
--- a/src/raitap/data/tests/test_data_class.py
+++ b/src/raitap/data/tests/test_data_class.py
@@ -16,9 +16,9 @@
 
     from raitap.configs.schema import AppConfig
 
+from raitap.configs.schema import DirectoryLabelsConfig, TabularLabelsConfig
 from raitap.data import Data
-from raitap.data.data import _load_directory_labels, load_classification_labels
-from raitap.data.types import DIRECTORY_LABELS_SOURCE, InputModality
+from raitap.data.types import InputModality
 
 
 def _write_image(path: Path) -> None:
@@ -35,19 +35,28 @@ def _make_config(
     labels_encoding: str | None = None,
     labels_id_strategy: str | None = None,
 ) -> AppConfig:
+    from raitap.data.types import IdStrategy, LabelEncoding
+
+    if labels_source is not None:
+        encoding = LabelEncoding(labels_encoding) if labels_encoding else None
+        id_strategy = IdStrategy(labels_id_strategy) if labels_id_strategy else IdStrategy.auto
+        labels = TabularLabelsConfig(
+            source=labels_source,
+            id_column=labels_id_column,
+            column=labels_column,
+            encoding=encoding,
+            id_strategy=id_strategy,
+        )
+    else:
+        labels = None
+
     return cast(
         "AppConfig",
         SimpleNamespace(
             data=SimpleNamespace(
                 source=source,
                 name=name,
-                labels=SimpleNamespace(
-                    source=labels_source,
-                    id_column=labels_id_column,
-                    column=labels_column,
-                    encoding=labels_encoding,
-                    id_strategy=labels_id_strategy,
-                ),
+                labels=labels,
             )
         ),
     )
@@ -342,16 +351,15 @@ def test_data_raises_for_unsupported_id_strategy(self, tmp_path: Path) -> None:
         _write_image(data_dir / "x.jpg")
         labels_file = tmp_path / "labels.csv"
         labels_file.write_text("image,label\nx,0\n")
-        config = _make_config(
-            str(data_dir),
-            labels_source=str(labels_file),
-            labels_id_column="image",
-            labels_column="label",
-            labels_encoding="index",
-            labels_id_strategy="bogus",
-        )
-
-        with pytest.raises(ValueError, match=r"Unsupported data\.labels\.id_strategy"):
+        with pytest.raises(ValueError):
+            config = _make_config(
+                str(data_dir),
+                labels_source=str(labels_file),
+                labels_id_column="image",
+                labels_column="label",
+                labels_encoding="index",
+                labels_id_strategy="bogus",
+            )
             Data(config)
 
     def test_data_records_image_modality_for_image_dir(self, tmp_path: Path) -> None:
@@ -397,14 +405,13 @@ def test_data_raises_for_unsupported_labels_encoding(self, tmp_path: Path) -> No
         csv_file.write_text("a\n1\n2")
         labels_file = tmp_path / "labels.csv"
         labels_file.write_text("label\n0\n1")
-        config = _make_config(
-            str(csv_file),
-            labels_source=str(labels_file),
-            labels_column="label",
-            labels_encoding="ordinal",
-        )
-
-        with pytest.raises(ValueError, match=r"Unsupported data\.labels\.encoding"):
+        with pytest.raises(ValueError):
+            config = _make_config(
+                str(csv_file),
+                labels_source=str(labels_file),
+                labels_column="label",
+                labels_encoding="ordinal",
+            )
             Data(config)
 
 
@@ -482,54 +489,85 @@ def test_log_includes_full_metadata(self, tmp_path: Path) -> None:
         assert "dtype" in call_args
 
 
-class TestLoadDirectoryLabels:
+class TestLoadDirectoryLabelsViaParser:
+    """Directory label behavior via DirectoryLabelParser (replaces deleted _load_directory_labels).
+
+    The private _load_directory_labels function and load_classification_labels were removed in
+    the discriminated-config refactor. Behavior is now covered by DirectoryLabelParser and
+    _resolve_and_parse_labels. These tests keep the return-value contracts (warnings unchecked).
+    """
+
+    def _run_directory_parser(self, sample_ids: list[str] | None) -> torch.Tensor | None:
+        from types import SimpleNamespace
+        from typing import cast
+
+        from raitap.data.data import _resolve_and_parse_labels
+        from raitap.types import TaskKind
+
+        cfg = cast(
+            "AppConfig",
+            SimpleNamespace(
+                data=SimpleNamespace(labels=DirectoryLabelsConfig(), source=None),
+                model=SimpleNamespace(class_names=None),
+            ),
+        )
+        return _resolve_and_parse_labels(
+            cfg, task_kind=TaskKind.classification, tensor=None, sample_ids=sample_ids
+        )
+
     def test_derives_labels_from_top_level_class_folder(self) -> None:
-        result = _load_directory_labels(["NORMAL/a.jpg", "PNEUMONIA/b.jpg", "NORMAL/c.jpg"])
+        result = self._run_directory_parser(["NORMAL/a.jpg", "PNEUMONIA/b.jpg", "NORMAL/c.jpg"])
         assert result is not None
         assert torch.equal(result, torch.tensor([0, 1, 0]))
 
     def test_nesting_within_class_stays_top_level(self) -> None:
-        result = _load_directory_labels(["NORMAL/sub/a.jpg", "PNEUMONIA/b.jpg"])
+        result = self._run_directory_parser(["NORMAL/sub/a.jpg", "PNEUMONIA/b.jpg"])
         assert result is not None
         assert torch.equal(result, torch.tensor([0, 1]))
 
     def test_single_class_is_all_zeros_not_error(self) -> None:
-        result = _load_directory_labels(["NORMAL/a.jpg", "NORMAL/b.jpg"])
+        result = self._run_directory_parser(["NORMAL/a.jpg", "NORMAL/b.jpg"])
         assert result is not None
         assert torch.equal(result, torch.tensor([0, 0]))
 
     def test_sample_without_class_subdir_returns_none(self) -> None:
-        with pytest.warns(UserWarning, match="class subdirectory"):
-            result = _load_directory_labels(["a.jpg", "NORMAL/b.jpg"])
+        result = self._run_directory_parser(["a.jpg", "NORMAL/b.jpg"])
         assert result is None
 
     def test_none_sample_ids_returns_none(self) -> None:
-        with pytest.warns(UserWarning, match="class subdirectories"):
-            result = _load_directory_labels(None)
+        result = self._run_directory_parser(None)
         assert result is None
 
     def test_empty_sample_ids_returns_none(self) -> None:
-        with pytest.warns(UserWarning, match="class subdirectories"):
-            result = _load_directory_labels([])
+        result = self._run_directory_parser([])
         assert result is None
 
+    def test_directory_source_derives_labels_from_layout(self, tmp_path: Path) -> None:
+        """Data with DirectoryLabelsConfig derives labels from the sample layout."""
+        from types import SimpleNamespace
+        from typing import cast
+
+
+        img_dir = tmp_path / "images"
+        (img_dir / "NORMAL").mkdir(parents=True)
+        (img_dir / "PNEUMONIA").mkdir(parents=True)
+        _write_image(img_dir / "NORMAL" / "a.jpg")
+        _write_image(img_dir / "PNEUMONIA" / "b.jpg")
+        _write_image(img_dir / "NORMAL" / "c.jpg")
+
+        cfg = cast(
+            "AppConfig",
+            SimpleNamespace(
+                data=SimpleNamespace(
+                    source=str(img_dir),
+                    name="test_dir",
+                    labels=DirectoryLabelsConfig(),
+                )
+            ),
+        )
+        data = Data(cfg)
 
-class TestLoadClassificationLabelsDirectorySource:
-    def test_directory_source_derives_labels(self) -> None:
-        config = _make_config("images", labels_source=DIRECTORY_LABELS_SOURCE)
-        sample_ids = ["NORMAL/a.jpg", "PNEUMONIA/b.jpg", "NORMAL/c.jpg"]
-        tensor = torch.zeros(len(sample_ids), 3, 8, 8)
-
-        result = load_classification_labels(config, tensor=tensor, sample_ids=sample_ids)
-
-        assert result is not None
-        assert torch.equal(result, torch.tensor([0, 1, 0]))
-
-    def test_directory_source_none_sample_ids_returns_none(self) -> None:
-        config = _make_config("rows.csv", labels_source=DIRECTORY_LABELS_SOURCE)
-        tensor = torch.zeros(3, 4)
-
-        with pytest.warns(UserWarning, match="class subdirectories"):
-            result = load_classification_labels(config, tensor=tensor, sample_ids=None)
-
-        assert result is None
+        assert data.labels is not None
+        assert isinstance(data.labels, torch.Tensor)
+        # NORMAL=0, PNEUMONIA=1; sorted by posix path: NORMAL/a, NORMAL/c, PNEUMONIA/b
+        assert data.labels.tolist() == [0, 0, 1]
diff --git a/src/raitap/data/tests/test_detection_ragged.py b/src/raitap/data/tests/test_detection_ragged.py
index 6f642a41..bbad5a7b 100644
--- a/src/raitap/data/tests/test_detection_ragged.py
+++ b/src/raitap/data/tests/test_detection_ragged.py
@@ -48,14 +48,7 @@ def _make_config(source: str, name: str = "test_det") -> AppConfig:
             data=SimpleNamespace(
                 source=source,
                 name=name,
-                labels=SimpleNamespace(
-                    source=None,
-                    kind=None,
-                    id_column=None,
-                    column=None,
-                    encoding=None,
-                    id_strategy=None,
-                ),
+                labels=None,
             )
         ),
     )
diff --git a/src/raitap/tests/test_api.py b/src/raitap/tests/test_api.py
index 772e0d74..4f2d412c 100644
--- a/src/raitap/tests/test_api.py
+++ b/src/raitap/tests/test_api.py
@@ -23,10 +23,10 @@
 from raitap.api import instantiate
 from raitap.configs.schema import (
     DataConfig,
-    LabelsConfig,
     ModelConfig,
     MulticlassClassificationMetricsConfig,
     RobustnessConfig,
+    TabularLabelsConfig,
     TransparencyConfig,
 )
 from raitap.data.preprocessing import resolve_preprocessing
@@ -56,7 +56,7 @@ def _demo_app_config() -> AppConfig:
             name="imagenet_samples",
             source="imagenet_samples",
             forward_batch_size=4,
-            labels=LabelsConfig(
+            labels=TabularLabelsConfig(
                 source="imagenet_samples",
                 id_column="image",
                 column="label",
diff --git a/src/raitap/tests/test_example_recipes.py b/src/raitap/tests/test_example_recipes.py
index 521cfb8e..a6c40a1d 100644
--- a/src/raitap/tests/test_example_recipes.py
+++ b/src/raitap/tests/test_example_recipes.py
@@ -28,7 +28,8 @@
 pytest.importorskip("torchmetrics")  # metrics adapter
 
 from raitap import AppConfig, Hardware, run
-from raitap.data import DataConfig, LabelsConfig
+from raitap.configs.schema import TabularLabelsConfig
+from raitap.data import DataConfig
 from raitap.metrics import multiclass_classification as classification
 from raitap.models import ModelConfig
 from raitap.pipeline.outputs import RunOutputs
@@ -60,7 +61,7 @@ def _base_kwargs(experiment_name: str) -> _BaseKwargs:
             name="imagenet_samples",
             source="imagenet_samples",
             forward_batch_size=4,
-            labels=LabelsConfig(
+            labels=TabularLabelsConfig(
                 source="imagenet_samples",
                 id_column="image",
                 column="label",

From 1224e7b6d50449059be04607c11518e5a8070ce0 Mon Sep 17 00:00:00 2001
From: Stanislas Laurent <stnsls.lrt.accnts@gmail.com>
Date: Wed, 24 Jun 2026 06:01:29 +0200
Subject: [PATCH 25/28] docs: document label parsers and discriminated labels
 config (refs #338)

---
 docs/contributor/modules/data.md         |  70 ++++++++++---
 docs/modules/data/configuration.md       | 125 ++++++++---------------
 docs/modules/data/own-vs-built-in.md     |  37 +++----
 src/raitap/data/tests/test_data_class.py |   1 -
 4 files changed, 113 insertions(+), 120 deletions(-)

diff --git a/docs/contributor/modules/data.md b/docs/contributor/modules/data.md
index 82bfc9a7..ef4e4cc6 100644
--- a/docs/contributor/modules/data.md
+++ b/docs/contributor/modules/data.md
@@ -45,9 +45,14 @@ referenceable by name in `data.source`. Registration lives in
     }
     ```
 
-3. **Use it** from any consumer config:
+3. **Use it** from any consumer config (select the `tabular` label variant):
 
     ```yaml
+    defaults:
+      - raitap_schema
+      - data/labels: tabular
+      - _self_
+
     data:
       name: cifar10_samples
       source: cifar10_samples            # resolves via SAMPLE_SOURCES
@@ -69,19 +74,56 @@ referenceable by name in `data.source`. Registration lives in
 5. **Update docs** — add the new sample name to
    {doc}`/modules/data/own-vs-built-in`.
 
-## Adding a label format
-
-1. Create `src/raitap/data/adapters/<format>.py` with a class decorated
-   `@label_format`. Set `format = LabelFormat.<name>` and
-   `supported_tasks = frozenset({...})`.
-2. Implement `to_detection_records` and/or `to_classification_records`,
-   returning the native record shape (`{sample_id, boxes (xyxy), labels}` or
-   `{sample_id, label}`). Raise `ValueError` for an unsupported task.
-3. Import it in `src/raitap/data/_label_format_adapters.py` so the decorator
-   fires.
-4. Add a `LabelFormat` member in `src/raitap/data/types.py` and a row to the
-   label-format table in `docs/modules/data/configuration.md`.
-5. Add tests in `src/raitap/data/tests/test_label_formats.py`.
+## Adding a label parser
+
+1. Create `src/raitap/data/label_parsers/<name>.py`. Add a dataclass in
+   `src/raitap/configs/schema.py` that subclasses `LabelsConfig`:
+
+    ```python
+    @dataclass
+    class MyFormatLabelsConfig(LabelsConfig):
+        _target_: str = "MyFormatLabelParser"
+        source: str = MISSING
+        id_strategy: IdStrategy = IdStrategy.auto
+        # add only fields this variant uses
+    ```
+
+2. Write the parser class decorated with `@label_parser`:
+
+    ```python
+    from raitap.data.label_parsers.registration import label_parser
+    from raitap.configs.schema import MyFormatLabelsConfig
+
+    @label_parser(registry_name="my_format", schema=MyFormatLabelsConfig)
+    class MyFormatLabelParser:
+        supported_tasks: frozenset[TaskKind] = frozenset({TaskKind.detection})
+
+        def __init__(self, *, source: str, id_strategy: IdStrategy = IdStrategy.auto) -> None:
+            ...
+
+        def parse(
+            self,
+            *,
+            task_kind: TaskKind,
+            tensor: Any,
+            sample_ids: list[str] | None,
+            data_source: str | None,
+            class_names: list[str] | None,
+        ) -> Any:
+            ...
+    ```
+
+3. Import and re-export in `src/raitap/data/label_parsers/__init__.py`:
+
+    ```python
+    from .my_format import MyFormatLabelParser  # pyright: ignore[reportUnusedImport]
+    ```
+
+    Add `"MyFormatLabelParser"` to `__all__`.
+
+4. Add tests in `src/raitap/data/tests/`.
+
+5. Add a row to the label-variant table in `docs/modules/data/configuration.md`.
 
 ## Sample discovery and label alignment
 
diff --git a/docs/modules/data/configuration.md b/docs/modules/data/configuration.md
index 8f030846..e5f747e8 100644
--- a/docs/modules/data/configuration.md
+++ b/docs/modules/data/configuration.md
@@ -66,55 +66,6 @@ myst:
   <a href="../../using-raitap/flags.html#flag-acknowledge-preprocessing-off"><code>--acknowledge-preprocessing-off</code></a>.
   See {doc}`preprocessing`.
 
-:option: labels.source
-:allowed: string, null
-:default: null
-:description: Optional path to a labels file (CSV, TSV, or Parquet), URL, or
-  named sample set. When set to a sample name (e.g. `"imagenet_samples"`),
-  raitap resolves to the labels CSV bundled with that sample. Sample sets
-  without bundled labels raise an error. The reserved value `"directory"`
-  derives classification labels from each sample's top-level class
-  subdirectory (torchvision `ImageFolder` style; no labels file) — see
-  {doc}`own-vs-built-in`. In that mode `id_column` and `id_strategy` do not
-  apply.
-
-:option: labels.id_column
-:allowed: string, null
-:default: null
-:description: Optional sample-ID column used to align labels with filenames,
-  for example `"image"`.
-
-:option: labels.column
-:allowed: string, null
-:default: null
-:description: Optional class-label column. If omitted, one-hot numeric columns
-  are reduced with `argmax`.
-
-:option: labels.encoding
-:allowed: "index", "one_hot", "argmax", null
-:default: null
-:description: Optional label parsing strategy.
-
-:option: labels.id_strategy
-:allowed: "auto", "relative_path", "stem"
-:default: "auto"
-:description: How label-file ids are matched against discovered sample
-  files. `"auto"` (default) inspects the id column and switches to
-  `"relative_path"` if any value contains `/` or `\`, otherwise falls back
-  to `"stem"`. `"relative_path"` keeps directory components and supports
-  nested ImageFolder layouts (e.g. `NORMAL/IM-0001.jpeg`) — required when
-  filename stems collide across class subdirs. `"stem"` matches by basename only (flat-dir layouts).
-
-:option: labels.format
-:allowed: "native", "coco", "yolo", "voc"
-:default: "native"
-:description: External label file format. `"native"` (default) reads RAITAP's
-  own shape (classification: CSV/TSV/Parquet or the `"directory"` source;
-  detection: the JSON record list). `"coco"`, `"yolo"`, and `"voc"` convert a
-  standard annotation file to the native shape before alignment. `"yolo"` and
-  `"voc"` are detection only; `"coco"` serves detection and classification.
-  Non-native formats align by sample id, so a labels id is required.
-
 :option: input_metadata
 :allowed: dict, null
 :default: null
@@ -155,56 +106,62 @@ data:
   forward_batch_size: 32
   preprocessing: model-bundled
   model_input_transformation: model-bundled
-  labels:
-    source: "./data/labels.csv"
-    id_column: "image"
-    column: "label"
-    encoding: "index"
-    id_strategy: "auto"
 
-:cli: data.source="./data/images" data.preprocessing=model-bundled data.model_input_transformation=model-bundled data.labels.source="./data/labels.csv" data.labels.column=label
+:cli: data.source="./data/images" data.preprocessing=model-bundled data.model_input_transformation=model-bundled
 
 :python:
-from raitap.data import (
-    DataConfig,
-    IdStrategy,
-    LabelEncoding,
-    LabelsConfig,
-    Preprocessing,
-)
+from raitap.data import DataConfig
 
 data = DataConfig(
     name="my-dataset",
     description="Internal validation set",
     source="./data/images",
     forward_batch_size=32,
-    preprocessing=Preprocessing.model_bundled,
-    model_input_transformation=Preprocessing.model_bundled,
-    labels=LabelsConfig(
-        source="./data/labels.csv",
-        id_column="image",
-        column="label",
-        encoding=LabelEncoding.index,
-        id_strategy=IdStrategy.auto,
-    ),
+    preprocessing="model-bundled",
+    model_input_transformation="model-bundled",
 )
 ```
 
-**Label formats.** RAITAP reads common annotation formats directly via `data.labels.format`.
+**Label variants.** `data.labels` is a Hydra config-group: select the variant with `defaults: [data/labels: <name>]`, then set its fields under `data.labels:`. Each variant exposes only the fields it accepts — setting a foreign field is a load error.
+
+```yaml
+defaults:
+  - raitap_schema
+  - data/labels: tabular   # pick one variant
+  - _self_
+
+data:
+  source: "./data/images"
+  labels:
+    source: "./data/labels.csv"
+    id_column: "image"
+    column: "label"
+```
+
+| Variant          | Task(s)                    | Fields                                                  |
+| ---------------- | -------------------------- | ------------------------------------------------------- |
+| `tabular`        | classification             | `source`, `id_column`, `column`, `encoding`, `id_strategy` |
+| `directory`      | classification             | *(none — class from top-level subdir name)*             |
+| `coco`           | detection + classification | `source`, `id_strategy`                                 |
+| `yolo`           | detection                  | `source`, `id_strategy`                                 |
+| `voc`            | detection                  | `source`, `id_strategy`, `class_names`                  |
+| `detection_json` | detection                  | `source`, `id_strategy`                                 |
+
+**`tabular`** — CSV, TSV, or Parquet file. `id_column` aligns rows to sample filenames; `column` names the label column (omit for one-hot numeric columns, which are reduced with `argmax`). `encoding` is one of `"index"`, `"one_hot"`, `"argmax"`. `id_strategy` controls alignment — see below.
+
+**`directory`** — no labels file. Class is each sample's top-level subdirectory (torchvision `ImageFolder` style). See {doc}`own-vs-built-in`.
+
+**`coco`** — single `instances.json` file (`source`). Category ids pass through unchanged. Serves detection and classification.
+
+**`yolo`** — directory of per-image `.txt` files (`source`). Needs `data.source` set to the image directory so RAITAP can match annotation files to images. Detection only. Category ids pass through unchanged.
+
+**`voc`** — directory of per-image `.xml` files (`source`). `class_names` maps VOC names to integer ids; falls back to `model.class_names`, then the standard 20-class VOC order. Detection only.
 
-| Format   | Detection | Classification | Source layout                                  |
-| -------- | --------- | -------------- | ---------------------------------------------- |
-| `native` | yes       | yes            | JSON record list / CSV-TSV-Parquet             |
-| `coco`   | yes       | yes            | single `instances.json`                        |
-| `yolo`   | yes       | no             | dir of per-image `.txt` (needs `data.source`)  |
-| `voc`    | yes       | no             | dir of per-image `.xml`                        |
+**`detection_json`** — RAITAP native JSON record list `[{"sample_id": ..., "boxes": ..., "labels": ...}]`. Detection only.
 
-COCO and YOLO labels keep their category ids unchanged. VOC class names map to
-ids by `model.class_names` order, else the standard 20-class VOC order.
+All detection variants honour `id_strategy` for nested image-directory layouts.
 
-Detection formats match each record's `sample_id` against the discovered image
-file by exact name, so the image directory must be flat (nested subdirs are not
-matched). Classification labels still align via `labels.id_strategy`.
+**`id_strategy`** (`"auto"` / `"relative_path"` / `"stem"`, default `"auto"`): how label ids are matched against discovered sample files. `"auto"` inspects the id column and switches to `"relative_path"` if any value contains `/` or `\`, otherwise `"stem"`. `"relative_path"` keeps directory components (e.g. `NORMAL/IM-0001`) — required when filename stems collide across class subdirs. `"stem"` matches by basename only.
 
 For tabular models whose backend expects an unusual per-sample layout (such
 as ACAS Xu, a Torch network whose forward takes `(N, 1, 1, 5)`), supply
diff --git a/docs/modules/data/own-vs-built-in.md b/docs/modules/data/own-vs-built-in.md
index b3364cf7..8a4006fd 100644
--- a/docs/modules/data/own-vs-built-in.md
+++ b/docs/modules/data/own-vs-built-in.md
@@ -33,11 +33,12 @@ data:
     column: "label"
 
 :python:
-from raitap.data import DataConfig, LabelsConfig
+from raitap.configs.schema import TabularLabelsConfig
+from raitap.data import DataConfig
 
 data = DataConfig(
     source="./data/images",  # a directory of images
-    labels=LabelsConfig(
+    labels=TabularLabelsConfig(
         source="./data/labels.csv",
         id_column="image",
         column="label",
@@ -76,11 +77,12 @@ data:
     # id_strategy: "auto"   # default — relative paths auto-detected
 
 :python:
-from raitap.data import DataConfig, IdStrategy, LabelsConfig
+from raitap.configs.schema import TabularLabelsConfig
+from raitap.data import DataConfig
 
 data = DataConfig(
     source="./data/test",
-    labels=LabelsConfig(
+    labels=TabularLabelsConfig(
         source="./data/labels.csv",
         id_column="image",
         column="label",
@@ -134,8 +136,8 @@ both become `IM-0001`), which raises a duplicate-id error.
 ### Labels from directory structure
 
 If your images are already organised into one folder per class (the
-torchvision `ImageFolder` convention), set `labels.source: "directory"` to use
-the folder names as labels. No labels file needed.
+torchvision `ImageFolder` convention), select the `directory` label variant.
+No labels file needed.
 
 ```text
 data/train/
@@ -144,27 +146,20 @@ data/train/
 └── PNEUMONIA/IM-0001.jpeg   # label: PNEUMONIA
 ```
 
-```{config-tabs}
-:yaml:
+```yaml
+defaults:
+  - raitap_schema
+  - data/labels: directory
+  - _self_
+
 data:
   source: "./data/train"
-  labels:
-    source: "directory"
-
-:python:
-from raitap.data import DIRECTORY_LABELS_SOURCE, DataConfig, LabelsConfig
-
-data = DataConfig(
-    source="./data/train",
-    labels=LabelsConfig(source=DIRECTORY_LABELS_SOURCE),  # == "directory"
-)
 ```
 
 The class is each sample's top-level subdirectory; nesting within a class
 folder is fine. Class ids are assigned alphabetically (`NORMAL` to `0`,
-`PNEUMONIA` to `1`). `id_column` and `id_strategy` do not apply. If images sit
-directly under the source with no class subdirs, RAITAP warns and falls back to
-predictions as metric targets.
+`PNEUMONIA` to `1`). If images sit directly under the source with no class
+subdirs, RAITAP warns and falls back to predictions as metric targets.
 
 If you want to evaluate metrics against ground-truth labels, configure the
 optional `data.labels` block as described in {doc}`configuration`.
diff --git a/src/raitap/data/tests/test_data_class.py b/src/raitap/data/tests/test_data_class.py
index 72ead53e..1fba154e 100644
--- a/src/raitap/data/tests/test_data_class.py
+++ b/src/raitap/data/tests/test_data_class.py
@@ -547,7 +547,6 @@ def test_directory_source_derives_labels_from_layout(self, tmp_path: Path) -> No
         from types import SimpleNamespace
         from typing import cast
 
-
         img_dir = tmp_path / "images"
         (img_dir / "NORMAL").mkdir(parents=True)
         (img_dir / "PNEUMONIA").mkdir(parents=True)

From 4ee261f1a8d5d4ca5bc756d8c2eea58ac4741c61 Mon Sep 17 00:00:00 2001
From: Stanislas Laurent <stnsls.lrt.accnts@gmail.com>
Date: Thu, 25 Jun 2026 16:38:15 +0200
Subject: [PATCH 26/28] fix(data): migrate e2e detection test to parser seam;
 warn against inline _target_ (refs #338)

---
 docs/modules/data/configuration.md          |  2 +-
 src/raitap/task_families/tests/test_base.py |  3 ---
 src/raitap/tests/test_e2e_detection.py      | 19 +++++++++++++------
 3 files changed, 14 insertions(+), 10 deletions(-)

diff --git a/docs/modules/data/configuration.md b/docs/modules/data/configuration.md
index e5f747e8..84f468cf 100644
--- a/docs/modules/data/configuration.md
+++ b/docs/modules/data/configuration.md
@@ -122,7 +122,7 @@ data = DataConfig(
 )
 ```
 
-**Label variants.** `data.labels` is a Hydra config-group: select the variant with `defaults: [data/labels: <name>]`, then set its fields under `data.labels:`. Each variant exposes only the fields it accepts — setting a foreign field is a load error.
+**Label variants.** `data.labels` is a Hydra config-group: select the variant with `defaults: [data/labels: <name>]`, then set its fields under `data.labels:`. Each variant exposes only the fields it accepts — setting a foreign field is a load error. Always select via the `defaults` group; inlining `_target_` directly (e.g. `data.labels: {_target_: TabularLabelParser, bogus: 1}`) bypasses struct-mode validation, so unknown fields are silently dropped instead of raising a load error.
 
 ```yaml
 defaults:
diff --git a/src/raitap/task_families/tests/test_base.py b/src/raitap/task_families/tests/test_base.py
index 2df0e75a..78dcaa79 100644
--- a/src/raitap/task_families/tests/test_base.py
+++ b/src/raitap/task_families/tests/test_base.py
@@ -27,9 +27,6 @@ def adapt_loaded_inputs(self, tensor: object) -> object:
         def validate_inputs(self, tensor: object) -> None:
             pass
 
-        def load_labels(self, cfg: object, *, tensor: object, sample_ids: object) -> object:
-            pass
-
         def validate_labels(self, labels: object) -> None:
             pass
 
diff --git a/src/raitap/tests/test_e2e_detection.py b/src/raitap/tests/test_e2e_detection.py
index 66d22414..9824f28e 100644
--- a/src/raitap/tests/test_e2e_detection.py
+++ b/src/raitap/tests/test_e2e_detection.py
@@ -97,19 +97,26 @@ def test_detection_pipeline_e2e_via_fasterrcnn_mobilenet(tmp_path: Path) -> None
     labels_path = tmp_path / "detection_labels.json"
     labels_path.write_text(json.dumps(labels_payload))
 
-    # Bypass Data.__init__ and call DetectionFamily.load_labels directly; the
+    # Bypass Data.__init__ and call _resolve_and_parse_labels directly; the
     # detection label loader has its own dedicated coverage in
     # src/raitap/data/tests/test_detection_labels.py. This test focuses on the
     # pipeline plumbing downstream of Data.
-    from raitap.task_families.detection import DetectionFamily
+    from raitap.configs.schema import DetectionJsonLabelsConfig
+    from raitap.data.data import _resolve_and_parse_labels
+    from raitap.types import TaskKind
 
-    labels_cfg = SimpleNamespace(source=str(labels_path))
     load_cfg = cast(
         "AppConfig",
-        SimpleNamespace(data=SimpleNamespace(labels=labels_cfg)),
+        SimpleNamespace(
+            data=SimpleNamespace(
+                labels=DetectionJsonLabelsConfig(source=str(labels_path)),
+                source=None,
+            ),
+            model=SimpleNamespace(class_names=None),
+        ),
     )
-    data.labels = DetectionFamily().load_labels(
-        load_cfg, tensor=data.tensor, sample_ids=data.sample_ids
+    data.labels = _resolve_and_parse_labels(
+        load_cfg, task_kind=TaskKind.detection, tensor=data.tensor, sample_ids=data.sample_ids
     )
 
     # --- app config --------------------------------------------------------

From 39340d9a354edc84481a93f27c929ae6a3d5a8d8 Mon Sep 17 00:00:00 2001
From: Stanislas Laurent <stnsls.lrt.accnts@gmail.com>
Date: Thu, 25 Jun 2026 16:45:36 +0200
Subject: [PATCH 27/28] fix(data): label parsers declare no uv extra to satisfy
 deps static-scan (refs #338)

---
 src/raitap/data/label_parsers/registration.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/raitap/data/label_parsers/registration.py b/src/raitap/data/label_parsers/registration.py
index 3ade1ab7..cc65235f 100644
--- a/src/raitap/data/label_parsers/registration.py
+++ b/src/raitap/data/label_parsers/registration.py
@@ -34,7 +34,13 @@ def label_parser(
     """Decorator: register a label-parser adapter.
 
     ``registry_name`` is required. Mirrors ``metrics_adapter`` shape.
+
+    Label parsers are core (stdlib only — no optional dependency), so they
+    declare no uv extra. Without this default the schema-backed auto-extra
+    (``extra=registry_name``) would register phantom extras like ``tabular``
+    that no ``pyproject`` group provides, breaking the deps static-scan gate.
     """
+    common.setdefault("extra", "")
 
     def wrap(cls: type[T]) -> type[T]:
         return _register_core(cls, family=LABELS, **common)

From a32db1bcc8884f742cfa4d7d183abbe0716ce917 Mon Sep 17 00:00:00 2001
From: Stanislas Laurent <stnsls.lrt.accnts@gmail.com>
Date: Thu, 25 Jun 2026 16:54:25 +0200
Subject: [PATCH 28/28] test(data): drop redundant imports, wire
 _VOC_ONLY_FIELDS into leakage test (refs #338)

---
 src/raitap/configs/tests/test_labels_schema.py | 14 +++++---------
 src/raitap/data/label_parsers/coco.py          |  2 --
 2 files changed, 5 insertions(+), 11 deletions(-)

diff --git a/src/raitap/configs/tests/test_labels_schema.py b/src/raitap/configs/tests/test_labels_schema.py
index 85154c2d..13f2eaef 100644
--- a/src/raitap/configs/tests/test_labels_schema.py
+++ b/src/raitap/configs/tests/test_labels_schema.py
@@ -20,8 +20,6 @@ def test_directory_config_has_only_target() -> None:
 
 
 def test_labelformat_enum_is_gone() -> None:
-    import importlib
-
     data_types = importlib.import_module("raitap.data.types")
     with pytest.raises(AttributeError):
         getattr(data_types, "LabelFormat")  # noqa: B009
@@ -160,19 +158,17 @@ def test_no_cross_variant_field_leakage(registry_name: str) -> None:
         )
 
     if registry_name in _DETECTION_VARIANTS:
-        leaked = _TABULAR_ONLY_FIELDS & field_names
-        assert not leaked, f"{registry_name!r} builder leaks tabular-only fields: {leaked}"
-        assert "class_names" not in field_names, (
-            f"{registry_name!r} builder should not have 'class_names'"
-        )
+        leaked = (_TABULAR_ONLY_FIELDS | _VOC_ONLY_FIELDS) & field_names
+        assert not leaked, f"{registry_name!r} builder leaks foreign fields: {leaked}"
 
     if registry_name == "voc":
         leaked = _TABULAR_ONLY_FIELDS & field_names
         assert not leaked, f"voc builder leaks tabular-only fields: {leaked}"
-        assert "class_names" in field_names, "voc builder must have 'class_names'"
+        assert field_names >= _VOC_ONLY_FIELDS, "voc builder must have 'class_names'"
 
     if registry_name == "tabular":
         assert field_names >= _TABULAR_ONLY_FIELDS, (
             f"tabular builder is missing expected fields; got {field_names}"
         )
-        assert "class_names" not in field_names, "tabular builder should not have 'class_names'"
+        leaked = _VOC_ONLY_FIELDS & field_names
+        assert not leaked, "tabular builder should not have 'class_names'"
diff --git a/src/raitap/data/label_parsers/coco.py b/src/raitap/data/label_parsers/coco.py
index 673a39e0..12b3f332 100644
--- a/src/raitap/data/label_parsers/coco.py
+++ b/src/raitap/data/label_parsers/coco.py
@@ -20,8 +20,6 @@
 if TYPE_CHECKING:
     from pathlib import Path
 
-    import pandas as pd
-
 
 @label_parser(registry_name="coco", schema=CocoLabelsConfig)
 class CocoLabelParser: