diff --git a/contributor-configs/fasterrcnn-udacity/assessment.yaml b/contributor-configs/fasterrcnn-udacity/assessment.yaml index d9f38975..7fd84ea7 100644 --- a/contributor-configs/fasterrcnn-udacity/assessment.yaml +++ b/contributor-configs/fasterrcnn-udacity/assessment.yaml @@ -12,6 +12,11 @@ # Hand-authored COCO-class boxes in ``labels/udacity-boxes.json`` — see # ``labelling-data.md`` for the format and a candidate-generation helper. +defaults: + - raitap_schema + - data/labels: detection_json + - _self_ + experiment_name: fasterrcnn-udacity-detection-demo hardware: cpu diff --git a/contributor-configs/imagecorruptions-imagenet/assessment.yaml b/contributor-configs/imagecorruptions-imagenet/assessment.yaml index 97c2fbee..659e8708 100644 --- a/contributor-configs/imagecorruptions-imagenet/assessment.yaml +++ b/contributor-configs/imagecorruptions-imagenet/assessment.yaml @@ -12,6 +12,7 @@ defaults: - raitap_schema # required to bind the schema, must come first - reporting: html - metrics: multiclass_classification + - data/labels: tabular - _self_ hardware: gpu diff --git a/contributor-configs/lwise-ham10000/assessment.yaml b/contributor-configs/lwise-ham10000/assessment.yaml index e1d90cf9..a80866ba 100644 --- a/contributor-configs/lwise-ham10000/assessment.yaml +++ b/contributor-configs/lwise-ham10000/assessment.yaml @@ -7,6 +7,12 @@ # TorchScript modules do not support. Override with: # LWISE_HAM10000_MODEL=/path/to/lwise_ham10000_eager.pt +defaults: + - raitap_schema + - metrics: multiclass_classification + - data/labels: tabular + - _self_ + experiment_name: lwise-ham10000-dermoscopy-demo hardware: gpu diff --git a/contributor-configs/marabou-mnist/assessment.yaml b/contributor-configs/marabou-mnist/assessment.yaml index e00b0e7c..5a89aadd 100644 --- a/contributor-configs/marabou-mnist/assessment.yaml +++ b/contributor-configs/marabou-mnist/assessment.yaml @@ -3,6 +3,8 @@ # `data=mnist_samples`, `model=mlp_mnist`, `robustness=marabou_linf`. 
# No transparency block (the original demo invoked `~transparency`). defaults: + - raitap_schema + - data/labels: tabular - _self_ experiment_name: marabou_mnist_uc1 diff --git a/contributor-configs/noisetunnel-smoothgrad/assessment.yaml b/contributor-configs/noisetunnel-smoothgrad/assessment.yaml index cb7b9b3e..44946d44 100644 --- a/contributor-configs/noisetunnel-smoothgrad/assessment.yaml +++ b/contributor-configs/noisetunnel-smoothgrad/assessment.yaml @@ -8,6 +8,7 @@ defaults: - raitap_schema - metrics: multiclass_classification + - data/labels: tabular - _self_ hardware: cpu diff --git a/docs/contributor/modules/data.md b/docs/contributor/modules/data.md index 319d72c4..ef4e4cc6 100644 --- a/docs/contributor/modules/data.md +++ b/docs/contributor/modules/data.md @@ -45,9 +45,14 @@ referenceable by name in `data.source`. Registration lives in } ``` -3. **Use it** from any consumer config: +3. **Use it** from any consumer config (select the `tabular` label variant): ```yaml + defaults: + - raitap_schema + - data/labels: tabular + - _self_ + data: name: cifar10_samples source: cifar10_samples # resolves via SAMPLE_SOURCES @@ -69,6 +74,57 @@ referenceable by name in `data.source`. Registration lives in 5. **Update docs** — add the new sample name to {doc}`/modules/data/own-vs-built-in`. +## Adding a label parser + +1. Create `src/raitap/data/label_parsers/my_format.py`. Add a dataclass in + `src/raitap/configs/schema.py` that subclasses `LabelsConfig`: + + ```python + @dataclass + class MyFormatLabelsConfig(LabelsConfig): + _target_: str = "MyFormatLabelParser" + source: str = MISSING + id_strategy: IdStrategy = IdStrategy.auto + # add only fields this variant uses + ``` + +2. 
Write the parser class decorated with `@label_parser`: + + ```python + from raitap.data.label_parsers.registration import label_parser + from raitap.configs.schema import MyFormatLabelsConfig + + @label_parser(registry_name="my_format", schema=MyFormatLabelsConfig) + class MyFormatLabelParser: + supported_tasks: frozenset[TaskKind] = frozenset({TaskKind.detection}) + + def __init__(self, *, source: str, id_strategy: IdStrategy = IdStrategy.auto) -> None: + ... + + def parse( + self, + *, + task_kind: TaskKind, + tensor: Any, + sample_ids: list[str] | None, + data_source: str | None, + class_names: list[str] | None, + ) -> Any: + ... + ``` + +3. Import and re-export in `src/raitap/data/label_parsers/__init__.py`: + + ```python + from .my_format import MyFormatLabelParser # pyright: ignore[reportUnusedImport] + ``` + + Add `"MyFormatLabelParser"` to `__all__`. + +4. Add tests in `src/raitap/data/tests/`. + +5. Add a row to the label-variant table in `docs/modules/data/configuration.md`. + ## Sample discovery and label alignment `data.source` directories are walked **recursively** (`Path.rglob`); sample diff --git a/docs/modules/data/configuration.md b/docs/modules/data/configuration.md index 447c29d2..84f468cf 100644 --- a/docs/modules/data/configuration.md +++ b/docs/modules/data/configuration.md @@ -66,45 +66,6 @@ myst: --acknowledge-preprocessing-off. See {doc}`preprocessing`. -:option: labels.source -:allowed: string, null -:default: null -:description: Optional path to a labels file (CSV, TSV, or Parquet), URL, or - named sample set. When set to a sample name (e.g. `"imagenet_samples"`), - raitap resolves to the labels CSV bundled with that sample. Sample sets - without bundled labels raise an error. The reserved value `"directory"` - derives classification labels from each sample's top-level class - subdirectory (torchvision `ImageFolder` style; no labels file) — see - {doc}`own-vs-built-in`. In that mode `id_column` and `id_strategy` do not - apply. 
- -:option: labels.id_column -:allowed: string, null -:default: null -:description: Optional sample-ID column used to align labels with filenames, - for example `"image"`. - -:option: labels.column -:allowed: string, null -:default: null -:description: Optional class-label column. If omitted, one-hot numeric columns - are reduced with `argmax`. - -:option: labels.encoding -:allowed: "index", "one_hot", "argmax", null -:default: null -:description: Optional label parsing strategy. - -:option: labels.id_strategy -:allowed: "auto", "relative_path", "stem" -:default: "auto" -:description: How label-file ids are matched against discovered sample - files. `"auto"` (default) inspects the id column and switches to - `"relative_path"` if any value contains `/` or `\`, otherwise falls back - to `"stem"`. `"relative_path"` keeps directory components and supports - nested ImageFolder layouts (e.g. `NORMAL/IM-0001.jpeg`) — required when - filename stems collide across class subdirs. `"stem"` matches by basename only (flat-dir layouts). 
- :option: input_metadata :allowed: dict, null :default: null @@ -145,41 +106,63 @@ data: forward_batch_size: 32 preprocessing: model-bundled model_input_transformation: model-bundled - labels: - source: "./data/labels.csv" - id_column: "image" - column: "label" - encoding: "index" - id_strategy: "auto" -:cli: data.source="./data/images" data.preprocessing=model-bundled data.model_input_transformation=model-bundled data.labels.source="./data/labels.csv" data.labels.column=label +:cli: data.source="./data/images" data.preprocessing=model-bundled data.model_input_transformation=model-bundled :python: -from raitap.data import ( - DataConfig, - IdStrategy, - LabelEncoding, - LabelsConfig, - Preprocessing, -) +from raitap.data import DataConfig data = DataConfig( name="my-dataset", description="Internal validation set", source="./data/images", forward_batch_size=32, - preprocessing=Preprocessing.model_bundled, - model_input_transformation=Preprocessing.model_bundled, - labels=LabelsConfig( - source="./data/labels.csv", - id_column="image", - column="label", - encoding=LabelEncoding.index, - id_strategy=IdStrategy.auto, - ), + preprocessing="model-bundled", + model_input_transformation="model-bundled", ) ``` +**Label variants.** `data.labels` is a Hydra config-group: select the variant with `defaults: [data/labels: <variant>]`, then set its fields under `data.labels:`. Each variant exposes only the fields it accepts — setting a foreign field is a load error. Always select via the `defaults` group; inlining `_target_` directly (e.g. `data.labels: {_target_: TabularLabelParser, bogus: 1}`) bypasses struct-mode validation and unknown fields are silently dropped. 
+ +```yaml +defaults: + - raitap_schema + - data/labels: tabular # pick one variant + - _self_ + +data: + source: "./data/images" + labels: + source: "./data/labels.csv" + id_column: "image" + column: "label" +``` + +| Variant | Task(s) | Fields | +| ---------------- | -------------------------- | ------------------------------------------------------- | +| `tabular` | classification | `source`, `id_column`, `column`, `encoding`, `id_strategy` | +| `directory` | classification | *(none — class from top-level subdir name)* | +| `coco` | detection + classification | `source`, `id_strategy` | +| `yolo` | detection | `source`, `id_strategy` | +| `voc` | detection | `source`, `id_strategy`, `class_names` | +| `detection_json` | detection | `source`, `id_strategy` | + +**`tabular`** — CSV, TSV, or Parquet file. `id_column` aligns rows to sample filenames; `column` names the label column (omit for one-hot numeric columns, which are reduced with `argmax`). `encoding` is one of `"index"`, `"one_hot"`, `"argmax"`. `id_strategy` controls alignment — see below. + +**`directory`** — no labels file. Class is each sample's top-level subdirectory (torchvision `ImageFolder` style). See {doc}`own-vs-built-in`. + +**`coco`** — single `instances.json` file (`source`). Category ids pass through unchanged. Serves detection and classification. + +**`yolo`** — directory of per-image `.txt` files (`source`). Needs `data.source` set to the image directory so RAITAP can match annotation files to images. Detection only. Category ids pass through unchanged. + +**`voc`** — directory of per-image `.xml` files (`source`). `class_names` maps VOC names to integer ids; falls back to `model.class_names`, then the standard 20-class VOC order. Detection only. + +**`detection_json`** — RAITAP native JSON record list `[{"sample_id": ..., "boxes": ..., "labels": ...}]`. Detection only. + +All detection variants honour `id_strategy` for nested image-directory layouts. 
+ +**`id_strategy`** (`"auto"` / `"relative_path"` / `"stem"`, default `"auto"`): how label ids are matched against discovered sample files. `"auto"` inspects the id column and switches to `"relative_path"` if any value contains `/` or `\`, otherwise `"stem"`. `"relative_path"` keeps directory components (e.g. `NORMAL/IM-0001`) — required when filename stems collide across class subdirs. `"stem"` matches by basename only. + For tabular models whose backend expects an unusual per-sample layout (such as ACAS Xu, a Torch network whose forward takes `(N, 1, 1, 5)`), supply `input_metadata.shape` explicitly so the pipeline reshapes the flat feature diff --git a/docs/modules/data/own-vs-built-in.md b/docs/modules/data/own-vs-built-in.md index b3364cf7..8a4006fd 100644 --- a/docs/modules/data/own-vs-built-in.md +++ b/docs/modules/data/own-vs-built-in.md @@ -33,11 +33,12 @@ data: column: "label" :python: -from raitap.data import DataConfig, LabelsConfig +from raitap.configs.schema import TabularLabelsConfig +from raitap.data import DataConfig data = DataConfig( source="./data/images", # a directory of images - labels=LabelsConfig( + labels=TabularLabelsConfig( source="./data/labels.csv", id_column="image", column="label", @@ -76,11 +77,12 @@ data: # id_strategy: "auto" # default — relative paths auto-detected :python: -from raitap.data import DataConfig, IdStrategy, LabelsConfig +from raitap.configs.schema import TabularLabelsConfig +from raitap.data import DataConfig data = DataConfig( source="./data/test", - labels=LabelsConfig( + labels=TabularLabelsConfig( source="./data/labels.csv", id_column="image", column="label", @@ -134,8 +136,8 @@ both become `IM-0001`), which raises a duplicate-id error. ### Labels from directory structure If your images are already organised into one folder per class (the -torchvision `ImageFolder` convention), set `labels.source: "directory"` to use -the folder names as labels. No labels file needed. 
+torchvision `ImageFolder` convention), select the `directory` label variant. +No labels file needed. ```text data/train/ @@ -144,27 +146,20 @@ data/train/ └── PNEUMONIA/IM-0001.jpeg # label: PNEUMONIA ``` -```{config-tabs} -:yaml: +```yaml +defaults: + - raitap_schema + - data/labels: directory + - _self_ + data: source: "./data/train" - labels: - source: "directory" - -:python: -from raitap.data import DIRECTORY_LABELS_SOURCE, DataConfig, LabelsConfig - -data = DataConfig( - source="./data/train", - labels=LabelsConfig(source=DIRECTORY_LABELS_SOURCE), # == "directory" -) ``` The class is each sample's top-level subdirectory; nesting within a class folder is fine. Class ids are assigned alphabetically (`NORMAL` to `0`, -`PNEUMONIA` to `1`). `id_column` and `id_strategy` do not apply. If images sit -directly under the source with no class subdirs, RAITAP warns and falls back to -predictions as metric targets. +`PNEUMONIA` to `1`). If images sit directly under the source with no class +subdirs, RAITAP warns and falls back to predictions as metric targets. If you want to evaluate metrics against ground-truth labels, configure the optional `data.labels` block as described in {doc}`configuration`. diff --git a/example/assessment.yaml b/example/assessment.yaml index 4a2df860..13fb6c72 100644 --- a/example/assessment.yaml +++ b/example/assessment.yaml @@ -2,6 +2,7 @@ defaults: - raitap_schema # required to bind the schema, must come first - reporting: html - metrics: multiclass_classification + - data/labels: tabular - _self_ hardware: gpu diff --git a/src/raitap/_adapters.py b/src/raitap/_adapters.py index e1f9078e..b58b7c2b 100644 --- a/src/raitap/_adapters.py +++ b/src/raitap/_adapters.py @@ -305,10 +305,14 @@ def _register_core( if family is not None: cls._adapter_group = family.group builder = _build_schema_adapter(cls, schema_override or family.schema) + # Hydra groups use ``/`` for nesting; OmegaConf packages use ``.``. 
@dataclass
class LabelsConfig:
    # Base schema for every ``data/labels`` variant. ``_target_`` names the
    # label-parser class; it is MISSING here and pinned by each subclass.
    _target_: str = MISSING


@dataclass
class TabularLabelsConfig(LabelsConfig):
    # Short (bare class name) target form; the label_parsers package re-exports
    # each parser so this form can be resolved against it.
    _target_: str = "TabularLabelParser"
    # Labels file (CSV/TSV/Parquet per the data-module docs). Required.
    source: str = MISSING
    # Optional sample-id column for filename alignment (e.g. "image").
    id_column: str | None = None
    # Optional class-label column; when omitted, one-hot numeric columns are used via argmax.
    column: str | None = None
    # Optional parsing strategy for labels: "index", "one_hot", or "argmax".
    encoding: LabelEncoding | None = None
    # Strategy for matching label-file ids to discovered sample files. One of:
    #   "auto"          — pick "relative_path" if any id contains "/" or "\\"; else "stem".
    #   "relative_path" — ids are resolved as posix-style paths relative to ``data.source``
    #                     (supports nested ImageFolder layouts with colliding stems).
    #   "stem"          — flat-dir / basename matching: match by ``Path(id).stem`` only.
    id_strategy: IdStrategy = IdStrategy.auto


@dataclass
class DirectoryLabelsConfig(LabelsConfig):
    # No fields besides ``_target_``: per the docs the class comes from each
    # sample's top-level subdirectory, so there is no labels file to point at.
    _target_: str = "DirectoryLabelParser"


@dataclass
class CocoLabelsConfig(LabelsConfig):
    _target_: str = "CocoLabelParser"
    # COCO ``instances.json`` file (per the data-module docs). Required.
    source: str = MISSING
    # Same id-matching strategies as the tabular variant.
    id_strategy: IdStrategy = IdStrategy.auto


@dataclass
class YoloLabelsConfig(LabelsConfig):
    _target_: str = "YoloLabelParser"
    # Directory of per-image ``.txt`` annotation files (per the docs). Required.
    source: str = MISSING
    # Same id-matching strategies as the tabular variant.
    id_strategy: IdStrategy = IdStrategy.auto


@dataclass
class VocLabelsConfig(LabelsConfig):
    _target_: str = "VocLabelParser"
    # Directory of per-image ``.xml`` annotation files (per the docs). Required.
    source: str = MISSING
    # Same id-matching strategies as the tabular variant.
    id_strategy: IdStrategy = IdStrategy.auto
    # Optional VOC class-name list used to map names to integer ids; the docs
    # describe a fallback to ``model.class_names`` and then the standard VOC
    # order when this is left unset.
    class_names: list[str] | None = None


@dataclass
class DetectionJsonLabelsConfig(LabelsConfig):
    _target_: str = "DetectionJsonLabelParser"
    # RAITAP-native JSON record list (per the docs). Required.
    source: str = MISSING
    # Same id-matching strategies as the tabular variant.
    id_strategy: IdStrategy = IdStrategy.auto
import dataclasses
import importlib

import pytest

from raitap.configs.schema import CocoLabelsConfig, DetectionJsonLabelsConfig, DirectoryLabelsConfig


def test_coco_config_has_no_tabular_fields() -> None:
    """The COCO variant exposes ``_target_``/``source``/``id_strategy`` and
    none of the tabular-only fields."""
    names = {f.name for f in dataclasses.fields(CocoLabelsConfig)}
    assert "id_column" not in names
    assert "column" not in names
    assert "encoding" not in names
    assert {"_target_", "source", "id_strategy"} <= names


def test_directory_config_has_only_target() -> None:
    """The directory variant declares ``_target_`` and nothing else."""
    names = {f.name for f in dataclasses.fields(DirectoryLabelsConfig)}
    assert names == {"_target_"}


def test_labelformat_enum_is_gone() -> None:
    """``raitap.data.types`` must not expose a ``LabelFormat`` attribute."""
    data_types = importlib.import_module("raitap.data.types")
    with pytest.raises(AttributeError):
        getattr(data_types, "LabelFormat")  # noqa: B009


# Ground truth (see task-2-report.md): composing ``+data/labels=directory`` onto
# the AppConfig schema lands the variant at ``cfg.data.labels`` with the FQN
# ``_target_`` that hydra-zen ``builds()`` injects.
_COMPOSED_TARGET = "raitap.data.label_parsers.directory.DirectoryLabelParser"


def _register_labels_group() -> None:
    """Register the ``data/labels`` group + AppConfig schema directly.

    Bypasses ``register_configs()`` (which imports transparency and other
    families that are broken mid-refactor on this branch) by importing only the
    label_parsers package — enough to fire the ``@label_parser`` decorator — and
    flushing the hydra-zen store. The AppConfig schema is needed as the compose
    base so the ``data.labels`` package has a struct to land in.
    """
    importlib.import_module("raitap.data.label_parsers")
    from hydra.core.config_store import ConfigStore

    from raitap._adapters import store
    from raitap.configs.schema import AppConfig

    # ``overwrite_ok=True`` lets every test call this helper repeatedly.
    store.add_to_hydra_store(overwrite_ok=True)
    ConfigStore.instance().store(name="raitap_schema", node=AppConfig)


def test_directory_parser_group_lands_at_data_labels() -> None:
    """De-risk (Path A): the nested ``data/labels`` group composes onto
    ``cfg.data.labels`` as a single config (flat semantics at a nested path)."""
    from hydra import compose, initialize
    from hydra.core.global_hydra import GlobalHydra

    _register_labels_group()
    # Clear any Hydra instance left initialised before calling ``initialize``.
    GlobalHydra.instance().clear()
    with initialize(version_base=None, config_path=None):
        cfg = compose(config_name="raitap_schema", overrides=["+data/labels=directory"])
    # Assertion runs unconditionally (no swallowing). The composed value is the
    # FQN hydra-zen stores, NOT the short dataclass default.
    assert cfg.data.labels._target_ == _COMPOSED_TARGET


def test_directory_group_rejects_foreign_field() -> None:
    """De-risk (Path A): a field the directory variant lacks fails at compose.

    Uses a struct-mode override (``data.labels.id_column=x`` — no ``+``) so
    OmegaConf's struct check fires; ``+`` force-adds and would bypass it.
    """
    from hydra import compose, initialize
    from hydra.core.global_hydra import GlobalHydra
    from hydra.errors import ConfigCompositionException

    _register_labels_group()
    GlobalHydra.instance().clear()
    with pytest.raises(ConfigCompositionException), initialize(version_base=None, config_path=None):
        compose(
            config_name="raitap_schema",
            overrides=["+data/labels=directory", "data.labels.id_column=x"],
        )


def test_create_label_parser_handles_both_target_forms() -> None:
    """``create_label_parser`` must instantiate for BOTH ``_target_`` shapes:

    * short bare name (``DirectoryLabelsConfig()`` dataclass default), resolved
      against the ``raitap.data.label_parsers.`` prefix;
    * the dotted FQN hydra-zen ``builds()`` stamps on the group-composed cfg.
    """
    _register_labels_group()
    from raitap.data.label_parsers.directory import DirectoryLabelParser
    from raitap.data.label_parsers.factory import create_label_parser

    short = create_label_parser(DirectoryLabelsConfig())
    assert isinstance(short, DirectoryLabelParser)

    fqn = create_label_parser({"_target_": _COMPOSED_TARGET})
    assert isinstance(fqn, DirectoryLabelParser)


def test_detection_json_config_has_exactly_target_source_id_strategy() -> None:
    """The detection-JSON variant declares exactly three fields."""
    names = {f.name for f in dataclasses.fields(DetectionJsonLabelsConfig)}
    assert names == {"_target_", "source", "id_strategy"}


# ---------------------------------------------------------------------------
# Cross-variant leakage test (Task 10)
# ---------------------------------------------------------------------------

# Fields that belong exclusively to the tabular variant and must NOT appear
# in any other variant's builder dataclass.
_TABULAR_ONLY_FIELDS = {"id_column", "column", "encoding"}

# Fields that belong exclusively to the voc variant.
_VOC_ONLY_FIELDS = {"class_names"}

# Variants that must have ONLY ``_target_`` (no source, no strategy, nothing).
_TARGET_ONLY_VARIANTS: set[str] = {"directory"}

# Detection-style variants: ``source`` + ``id_strategy`` only, i.e. no
# tabular fields and no ``class_names``.
_DETECTION_VARIANTS: set[str] = {"coco", "yolo", "detection_json"}


@pytest.mark.parametrize(
    "registry_name",
    ["directory", "tabular", "coco", "yolo", "voc", "detection_json"],
)
def test_no_cross_variant_field_leakage(registry_name: str) -> None:
    """Check that a variant's builder dataclass carries only its own fields.

    Expectations per variant:
    - ``directory``: exactly ``_target_``.
    - ``coco`` / ``yolo`` / ``detection_json``: neither tabular-only fields
      nor ``class_names``.
    - ``voc``: ``class_names`` present, tabular-only fields absent.
    - ``tabular``: tabular-only fields present, ``class_names`` absent.
    """
    from raitap._adapters import _BUILDERS

    _register_labels_group()

    registered = _BUILDERS.get("data/labels", {})
    assert registry_name in registered, (
        f"Registry name {registry_name!r} not found in _BUILDERS['data/labels']; "
        f"registered: {sorted(registered)}"
    )
    exposed = {spec.name for spec in dataclasses.fields(registered[registry_name])}

    # The four categories are disjoint, so a single branch applies per variant.
    if registry_name in _TARGET_ONLY_VARIANTS:
        assert exposed == {"_target_"}, (
            f"{registry_name!r} builder should have only '_target_' but got {exposed}"
        )
    elif registry_name in _DETECTION_VARIANTS:
        leaked = (_TABULAR_ONLY_FIELDS | _VOC_ONLY_FIELDS) & exposed
        assert not leaked, f"{registry_name!r} builder leaks foreign fields: {leaked}"
    elif registry_name == "voc":
        leaked = _TABULAR_ONLY_FIELDS & exposed
        assert not leaked, f"voc builder leaks tabular-only fields: {leaked}"
        assert exposed >= _VOC_ONLY_FIELDS, "voc builder must have 'class_names'"
    elif registry_name == "tabular":
        assert exposed >= _TABULAR_ONLY_FIELDS, (
            f"tabular builder is missing expected fields; got {exposed}"
        )
        leaked = _VOC_ONLY_FIELDS & exposed
        assert not leaked, "tabular builder should not have 'class_names'"
a/src/raitap/configs/zen.py b/src/raitap/configs/zen.py index a52d04e9..c0ac3928 100644 --- a/src/raitap/configs/zen.py +++ b/src/raitap/configs/zen.py @@ -49,6 +49,7 @@ def register_zen_groups() -> None: import importlib for pkg in ( + "raitap.data.label_parsers", "raitap.metrics", "raitap.reporting", "raitap.robustness", diff --git a/src/raitap/data/__init__.py b/src/raitap/data/__init__.py index 2c5aa3e0..7b6c7dc5 100644 --- a/src/raitap/data/__init__.py +++ b/src/raitap/data/__init__.py @@ -13,7 +13,7 @@ from typing import TYPE_CHECKING, Any -from .types import DIRECTORY_LABELS_SOURCE, IdStrategy, LabelEncoding, Preprocessing +from .types import IdStrategy, LabelEncoding, Preprocessing if TYPE_CHECKING: from raitap.configs.schema import DataConfig, LabelsConfig @@ -29,7 +29,6 @@ __all__ = [ - "DIRECTORY_LABELS_SOURCE", "Data", "DataConfig", "DataInputMetadata", diff --git a/src/raitap/data/data.py b/src/raitap/data/data.py index b154b1b2..15f6227a 100644 --- a/src/raitap/data/data.py +++ b/src/raitap/data/data.py @@ -2,7 +2,7 @@ from collections import Counter from enum import StrEnum -from pathlib import Path, PurePosixPath +from pathlib import Path from typing import TYPE_CHECKING, Any import numpy as np @@ -12,7 +12,6 @@ from raitap import raitap_log from raitap.data.preprocessing import module_as_per_image_callable, resolve_preprocessing from raitap.data.types import ( - DIRECTORY_LABELS_SOURCE, MODALITY_EXTENSIONS, IdStrategy, InputModality, @@ -74,7 +73,9 @@ def __init__( self.tensor = family.adapt_loaded_inputs(raw_tensor) family.validate_inputs(self.tensor) self.labels: torch.Tensor | list[dict[str, torch.Tensor]] | None - self.labels = family.load_labels(cfg, tensor=self.tensor, sample_ids=self.sample_ids) + self.labels = _resolve_and_parse_labels( + cfg, task_kind=self.task_kind, tensor=self.tensor, sample_ids=self.sample_ids + ) family.validate_labels(self.labels) def _load_data( @@ -236,100 +237,43 @@ def log(self, tracker: BaseTracker, **kwargs: 
def _resolve_and_parse_labels(
    cfg: Any,
    *,
    task_kind: TaskKind,
    tensor: Any,
    sample_ids: list[str] | None,
) -> Any:
    """Build the configured label parser and run it for ``task_kind``.

    Args:
        cfg: App config; ``cfg.data.labels``, ``cfg.data.source`` and (when
            present) ``cfg.model.class_names`` are read from it.
        task_kind: Task the labels are needed for; must be one of the
            parser's ``supported_tasks``.
        tensor: Loaded inputs, passed through to the parser unchanged.
        sample_ids: Discovered sample ids (or ``None``), passed through.

    Returns:
        Whatever the parser's ``parse`` returns, or ``None`` when
        ``cfg.data.labels`` is not set.

    Raises:
        ValueError: If the parser does not list ``task_kind`` among its
            ``supported_tasks``.
    """
    # Imported lazily inside the function, as in the original.
    from raitap.data.label_parsers.factory import create_label_parser

    labels_node = _get_optional_config_value(cfg.data, "labels")
    if labels_node is None:
        return None

    label_parser = create_label_parser(labels_node)

    accepted_tasks = label_parser.supported_tasks
    if task_kind not in accepted_tasks:
        task_listing = ", ".join(sorted(map(str, accepted_tasks)))
        raise ValueError(
            f"{type(label_parser).__name__} does not support task_kind={task_kind!r}. "
            f"Supported tasks: {task_listing}."
        )

    source_value = _get_optional_config_value(cfg.data, "source")
    # ``cfg`` may have no ``model`` attribute; fall back to ``None`` then.
    model_node = getattr(cfg, "model", None)
    names = _get_optional_config_value(model_node, "class_names")

    return label_parser.parse(
        task_kind=task_kind,
        tensor=tensor,
        sample_ids=sample_ids,
        data_source=source_value,
        class_names=names,
    )
+""" + +from __future__ import annotations + +from .coco import CocoLabelParser # pyright: ignore[reportUnusedImport] +from .detection_json import DetectionJsonLabelParser # pyright: ignore[reportUnusedImport] +from .directory import DirectoryLabelParser +from .tabular import TabularLabelParser # pyright: ignore[reportUnusedImport] +from .voc import VocLabelParser # pyright: ignore[reportUnusedImport] +from .yolo import YoloLabelParser # pyright: ignore[reportUnusedImport] + +__all__ = [ + "CocoLabelParser", + "DetectionJsonLabelParser", + "DirectoryLabelParser", + "TabularLabelParser", + "VocLabelParser", + "YoloLabelParser", +] diff --git a/src/raitap/data/label_parsers/base.py b/src/raitap/data/label_parsers/base.py new file mode 100644 index 00000000..3c8cf47f --- /dev/null +++ b/src/raitap/data/label_parsers/base.py @@ -0,0 +1,28 @@ +"""Base protocol and type alias for label parsers.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, Any, Protocol, runtime_checkable + +if TYPE_CHECKING: + from raitap.types import TaskKind + +# Type alias for the union of parsed label representations. +ParsedLabels = "torch.Tensor | list[dict[str, torch.Tensor]] | None" + + +@runtime_checkable +class LabelParser(Protocol): + """Protocol every label-parser adapter must satisfy.""" + + supported_tasks: frozenset[TaskKind] + + def parse( + self, + *, + task_kind: TaskKind, + tensor: Any, + sample_ids: list[str] | None, + data_source: str | None, + class_names: list[str] | None, + ) -> Any: ... 
diff --git a/src/raitap/data/label_parsers/coco.py b/src/raitap/data/label_parsers/coco.py new file mode 100644 index 00000000..12b3f332 --- /dev/null +++ b/src/raitap/data/label_parsers/coco.py @@ -0,0 +1,123 @@ +"""COCO label parser (detection + classification).""" + +from __future__ import annotations + +import json +from typing import TYPE_CHECKING, Any + +from raitap.configs.schema import CocoLabelsConfig +from raitap.data.data import ( + SourceKind, + _align_labels_to_samples, + _resolve_id_strategy, + get_source_path, +) +from raitap.data.label_parsers.registration import label_parser +from raitap.data.types import IdStrategy +from raitap.task_families.detection import _align_detection_records +from raitap.types import TaskKind + +if TYPE_CHECKING: + from pathlib import Path + + +@label_parser(registry_name="coco", schema=CocoLabelsConfig) +class CocoLabelParser: + """Parse COCO ``instances.json`` labels for detection or classification. + + Detection: ``bbox`` ``[x, y, w, h]`` -> ``[x1, y1, x2, y2]``; ``category_id`` + passes through unchanged. Classification: one category per image; images with + 0 or >1 categories raise ValueError. 
+ """ + + supported_tasks: frozenset[TaskKind] = frozenset({TaskKind.detection, TaskKind.classification}) + + def __init__( + self, + *, + source: str, + id_strategy: IdStrategy = IdStrategy.auto, + ) -> None: + self.source = source + self.id_strategy = id_strategy + + # --- internal helpers (ported verbatim from adapters/coco.py) --- + + def _load(self, source: Path) -> dict[str, Any]: + with source.open() as fh: + data = json.load(fh) + if not isinstance(data, dict) or "images" not in data: + raise ValueError(f"COCO file {source} must be an object with an 'images' array.") + return data + + def _to_detection_records(self, data: dict[str, Any]) -> list[dict[str, Any]]: + file_by_image: dict[int, str] = {img["id"]: img["file_name"] for img in data["images"]} + boxes: dict[int, list[list[float]]] = {iid: [] for iid in file_by_image} + labels: dict[int, list[int]] = {iid: [] for iid in file_by_image} + for ann in data.get("annotations", []): + iid = ann["image_id"] + x, y, w, h = ann["bbox"] + boxes[iid].append([x, y, x + w, y + h]) + labels[iid].append(int(ann["category_id"])) + return [ + {"sample_id": file_by_image[iid], "boxes": boxes[iid], "labels": labels[iid]} + for iid in file_by_image + ] + + def _to_classification_records(self, data: dict[str, Any]) -> list[dict[str, Any]]: + file_by_image: dict[int, str] = {img["id"]: img["file_name"] for img in data["images"]} + cats: dict[int, set[int]] = {iid: set() for iid in file_by_image} + for ann in data.get("annotations", []): + cats[ann["image_id"]].add(int(ann["category_id"])) + records: list[dict[str, Any]] = [] + for iid, name in file_by_image.items(): + cat_set = cats[iid] + if len(cat_set) != 1: + raise ValueError( + f"COCO classification needs exactly one category per image; " + f"image {name!r} has {len(cat_set)}." 
+ ) + records.append({"sample_id": name, "label": next(iter(cat_set))}) + return records + + # --- public parse method --- + + def parse( + self, + *, + task_kind: TaskKind, + tensor: Any, + sample_ids: list[str] | None, + data_source: str | None, + class_names: list[str] | None, + ) -> Any: + """Load and align COCO labels for detection or classification.""" + import pandas as pd + + labels_path = get_source_path(self.source, kind=SourceKind.LABELS) + data = self._load(labels_path) + + if task_kind is TaskKind.detection: + records = self._to_detection_records(data) + return _align_detection_records( + records, + expected=len(tensor), + sample_ids=sample_ids, + strategy=str(self.id_strategy), + ) + + # classification + records = self._to_classification_records(data) + raw_ids: list[str] = [r["sample_id"] for r in records] + encoded: list[int] = [r["label"] for r in records] + id_series: pd.Series = pd.Series(raw_ids) + strategy = _resolve_id_strategy(str(self.id_strategy), id_series) + aligned = _align_labels_to_samples( + sample_ids=sample_ids or [], + raw_label_ids=id_series, + encoded_labels=encoded, + strategy=strategy, + ) + import torch + + return torch.tensor(aligned, dtype=torch.long) diff --git a/src/raitap/data/label_parsers/detection_json.py b/src/raitap/data/label_parsers/detection_json.py new file mode 100644 index 00000000..6e3cbfdc --- /dev/null +++ b/src/raitap/data/label_parsers/detection_json.py @@ -0,0 +1,56 @@ +"""Detection-JSON label parser (native RAITAP detection record format).""" + +from __future__ import annotations + +import json +from typing import Any + +from raitap.configs.schema import DetectionJsonLabelsConfig +from raitap.data.data import SourceKind, get_source_path +from raitap.data.label_parsers.registration import label_parser +from raitap.data.types import IdStrategy +from raitap.task_families.detection import _align_detection_records +from raitap.types import TaskKind + + +@label_parser(registry_name="detection_json", 
schema=DetectionJsonLabelsConfig) +class DetectionJsonLabelParser: + """Parse native RAITAP detection JSON records for detection. + + The file must be a JSON array of objects with keys ``sample_id``, + ``boxes`` (list of ``[x1, y1, x2, y2]`` in pixels), and ``labels`` + (list of integer class ids). + """ + + supported_tasks: frozenset[TaskKind] = frozenset({TaskKind.detection}) + + def __init__( + self, + *, + source: str, + id_strategy: IdStrategy = IdStrategy.auto, + ) -> None: + self.source = source + self.id_strategy = id_strategy + + def parse( + self, + *, + task_kind: TaskKind, + tensor: Any, + sample_ids: list[str] | None, + data_source: str | None, + class_names: list[str] | None, + ) -> Any: + """Load native detection JSON and align records to sample_ids.""" + labels_path = get_source_path(self.source, kind=SourceKind.LABELS) + with labels_path.open() as fh: + records = json.load(fh) + if not isinstance(records, list): + raise ValueError(f"Detection labels file {labels_path} must be a JSON array.") + return _align_detection_records( + records, + expected=len(tensor), + sample_ids=sample_ids, + strategy=str(self.id_strategy), + ) diff --git a/src/raitap/data/label_parsers/directory.py b/src/raitap/data/label_parsers/directory.py new file mode 100644 index 00000000..78cb5ffe --- /dev/null +++ b/src/raitap/data/label_parsers/directory.py @@ -0,0 +1,55 @@ +"""Directory label parser (torchvision ImageFolder semantics).""" + +from __future__ import annotations + +from pathlib import PurePosixPath +from typing import Any + +from raitap import raitap_log +from raitap.configs.schema import DirectoryLabelsConfig +from raitap.data.label_parsers.registration import label_parser +from raitap.types import TaskKind +from raitap.utils.lazy import lazy_import + +torch = lazy_import("torch") + + +@label_parser(registry_name="directory", schema=DirectoryLabelsConfig) +class DirectoryLabelParser: + """Parse classification labels from the top-level class subfolder of each 
sample. + + Mirrors torchvision ``ImageFolder`` semantics: ``/`` layout. + Uses ``sample_ids`` only; ignores ``data_source`` and ``class_names``. + """ + + supported_tasks: frozenset[TaskKind] = frozenset({TaskKind.classification}) + + def parse( + self, + *, + task_kind: TaskKind, + tensor: Any, + sample_ids: list[str] | None, + data_source: str | None, + class_names: list[str] | None, + ) -> Any: + """Derive a long-tensor of class indices from sample_ids directory layout.""" + if not sample_ids: + raitap_log.warn( + "DirectoryLabelParser needs image samples organised into " + "class subdirectories; none were found. Falling back to " + "predictions as metric targets." + ) + return None + parts_by_id = [PurePosixPath(sid).parts for sid in sample_ids] + if any(len(parts) < 2 for parts in parts_by_id): + raitap_log.warn( + "DirectoryLabelParser expects a / layout, but " + "one or more samples sit directly under the data source root " + "(no class subdirectory). Falling back to predictions as metric targets." + ) + return None + classes = sorted({parts[0] for parts in parts_by_id}) + class_to_idx = {name: idx for idx, name in enumerate(classes)} + labels = [class_to_idx[parts[0]] for parts in parts_by_id] + return torch.tensor(labels, dtype=torch.long) diff --git a/src/raitap/data/label_parsers/factory.py b/src/raitap/data/label_parsers/factory.py new file mode 100644 index 00000000..b8d6cfd3 --- /dev/null +++ b/src/raitap/data/label_parsers/factory.py @@ -0,0 +1,34 @@ +"""Instantiation factory for label parsers (mirrors metrics/factory.py:44-60).""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, Any + +from hydra.utils import instantiate + +from raitap import raitap_log +from raitap.configs import cfg_to_dict, resolve_target + +if TYPE_CHECKING: + from raitap.data.label_parsers.base import LabelParser + +_LABELS_PREFIX = "raitap.data.label_parsers." 
+ + +def create_label_parser(labels_config: Any) -> LabelParser: + """Instantiate a label parser from Hydra-style config (``_target_`` + kwargs).""" + labels_cfg = cfg_to_dict(labels_config) + target_path: str = labels_cfg.get("_target_", "") + resolved_target = resolve_target(target_path, _LABELS_PREFIX) + labels_cfg["_target_"] = resolved_target + + try: + parser = instantiate(labels_cfg) + except Exception as e: + raitap_log.exception("Label parser instantiation failed for target %r", target_path) + raise ValueError( + f"Could not instantiate label parser {target_path!r}.\n" + "Check that _target_ points to a valid LabelParser implementation." + ) from e + + return parser diff --git a/src/raitap/data/label_parsers/registration.py b/src/raitap/data/label_parsers/registration.py new file mode 100644 index 00000000..cc65235f --- /dev/null +++ b/src/raitap/data/label_parsers/registration.py @@ -0,0 +1,48 @@ +"""Family decorator for label-parser adapters. + +Mirrors ``raitap.metrics.registration`` exactly, with group ``data/labels`` +and ``package_style="flat"`` so composed configs land at ``cfg.data.labels``. +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, TypeVar, Unpack + +from raitap._adapters import AdapterDecoratorOptions, FamilyConfig, _register_core +from raitap.configs.schema import LabelsConfig + +if TYPE_CHECKING: + from collections.abc import Callable + + from raitap.data.label_parsers.base import LabelParser + +# ``flat``: ``DataConfig.labels`` is a single ``LabelsConfig`` (not a dict of +# named entries), so the composed variant lands directly at ``cfg.data.labels`` +# (package ``data.labels``), with parser names competing for that one slot. 
+LABELS = FamilyConfig( + group="data/labels", + schema=LabelsConfig, + package_style="flat", +) + +T = TypeVar("T", bound="LabelParser") + + +def label_parser( + **common: Unpack[AdapterDecoratorOptions], +) -> Callable[[type[T]], type[T]]: + """Decorator: register a label-parser adapter. + + ``registry_name`` is required. Mirrors ``metrics_adapter`` shape. + + Label parsers are core (stdlib only — no optional dependency), so they + declare no uv extra. Without this default the schema-backed auto-extra + (``extra=registry_name``) would register phantom extras like ``tabular`` + that no ``pyproject`` group provides, breaking the deps static-scan gate. + """ + common.setdefault("extra", "") + + def wrap(cls: type[T]) -> type[T]: + return _register_core(cls, family=LABELS, **common) + + return wrap diff --git a/src/raitap/data/label_parsers/tabular.py b/src/raitap/data/label_parsers/tabular.py new file mode 100644 index 00000000..529c4455 --- /dev/null +++ b/src/raitap/data/label_parsers/tabular.py @@ -0,0 +1,109 @@ +"""Tabular label parser (CSV / TSV / Parquet).""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, Any + +from raitap import raitap_log +from raitap.configs.schema import TabularLabelsConfig +from raitap.data.data import ( + SourceKind, + _align_labels_to_samples, + _column_as_series, + _extract_class_labels, + _load_tabular_frame, + _resolve_id_strategy, + _resolve_labels_id_column, + get_source_path, +) +from raitap.data.label_parsers.registration import label_parser +from raitap.data.types import IdStrategy +from raitap.types import TaskKind +from raitap.utils.lazy import lazy_import + +if TYPE_CHECKING: + import torch + +torch = lazy_import("torch") # type: ignore[assignment] + + +@label_parser(registry_name="tabular", schema=TabularLabelsConfig) +class TabularLabelParser: + """Parse classification labels from a CSV, TSV, or Parquet file. 
+ + Aligns to ``sample_ids`` via ``id_column`` when available; falls back to + row order otherwise. Returns ``None`` on empty file or count mismatch. + """ + + supported_tasks: frozenset[TaskKind] = frozenset({TaskKind.classification}) + + def __init__( + self, + *, + source: str, + id_column: str | None = None, + column: str | None = None, + encoding: Any = None, + id_strategy: IdStrategy = IdStrategy.auto, + ) -> None: + self.source = source + self.id_column = id_column + self.column = column + self.encoding = encoding + self.id_strategy = id_strategy + + def parse( + self, + *, + task_kind: TaskKind, + tensor: Any, + sample_ids: list[str] | None, + data_source: str | None, + class_names: list[str] | None, + ) -> Any: + """Load tabular labels and align to samples.""" + labels_path = get_source_path(self.source, kind=SourceKind.LABELS) + labels_df = _load_tabular_frame(labels_path) + if labels_df.empty: + raitap_log.warn("Labels file is empty; falling back to predictions as targets.") + return None + + id_column = _resolve_labels_id_column(labels_df, self.id_column) + encoded_labels = _extract_class_labels( + labels_df, + labels_column=self.column, + id_column=id_column, + labels_encoding=self.encoding, + ) + + expected = len(tensor) if tensor is not None else len(encoded_labels) + if sample_ids and id_column: + id_series = _column_as_series(labels_df, id_column) + strategy = _resolve_id_strategy(str(self.id_strategy), id_series) + try: + aligned_labels = _align_labels_to_samples( + sample_ids=sample_ids, + raw_label_ids=id_series, + encoded_labels=encoded_labels, + strategy=strategy, + ) + except ValueError as error: + raitap_log.warn( + f"{error} Falling back to predictions as metric targets.", + ) + return None + return torch.tensor(aligned_labels, dtype=torch.long) + + if sample_ids and not id_column: + raitap_log.warn( + "Could not find a labels id column for filename alignment; using row-order labels.", + ) + + if len(encoded_labels) != expected: + 
raitap_log.warn( + f"Label count ({len(encoded_labels)}) does not match sample count ({expected}); " + "falling back to predictions as targets.", + ) + return None + + return torch.tensor(encoded_labels, dtype=torch.long) diff --git a/src/raitap/data/label_parsers/voc.py b/src/raitap/data/label_parsers/voc.py new file mode 100644 index 00000000..959f9e48 --- /dev/null +++ b/src/raitap/data/label_parsers/voc.py @@ -0,0 +1,130 @@ +"""Pascal-VOC label parser (detection-only).""" + +from __future__ import annotations + +import xml.etree.ElementTree as ET +from typing import TYPE_CHECKING, Any + +from raitap.configs.schema import VocLabelsConfig +from raitap.data.data import SourceKind, get_source_path +from raitap.data.label_parsers.registration import label_parser +from raitap.data.types import IdStrategy +from raitap.task_families.detection import _align_detection_records +from raitap.types import TaskKind + +if TYPE_CHECKING: + from pathlib import Path + +#: Canonical Pascal-VOC class order (index = label id) when no class_names given. +_VOC_CLASSES = ( + "aeroplane", + "bicycle", + "bird", + "boat", + "bottle", + "bus", + "car", + "cat", + "chair", + "cow", + "diningtable", + "dog", + "horse", + "motorbike", + "person", + "pottedplant", + "sheep", + "sofa", + "train", + "tvmonitor", +) + + +def _coord(box: ET.Element, tag: str, xml_path: Path) -> float: + text = box.findtext(tag) + if text is None: + raise ValueError(f"VOC bndbox in {xml_path.name} missing <{tag}>.") + return float(text) + + +@label_parser(registry_name="voc", schema=VocLabelsConfig) +class VocLabelParser: + """Parse Pascal-VOC per-image ``.xml`` for detection. + + Boxes are already ``[xmin, ymin, xmax, ymax]`` pixels. Class names map to + ids by their position in the active name list (parser's own ``class_names``, + else the ``class_names`` arg from ``cfg.model.class_names``, else the + standard 20-class VOC order). 
+ """ + + supported_tasks: frozenset[TaskKind] = frozenset({TaskKind.detection}) + + def __init__( + self, + *, + source: str, + id_strategy: IdStrategy = IdStrategy.auto, + class_names: list[str] | None = None, + ) -> None: + self.source = source + self.id_strategy = id_strategy + self.class_names = class_names + + def _to_detection_records( + self, labels_dir: Path, name_to_id: dict[str, int] + ) -> list[dict[str, Any]]: + records: list[dict[str, Any]] = [] + for xml_path in sorted(labels_dir.glob("*.xml")): + root = ET.parse(xml_path).getroot() + filename_el = root.find("filename") + if filename_el is None or not filename_el.text: + raise ValueError(f"VOC file {xml_path} has no .") + boxes: list[list[float]] = [] + labels: list[int] = [] + for obj in root.findall("object"): + name = obj.findtext("name") + if name not in name_to_id: + raise ValueError( + f"VOC class {name!r} in {xml_path.name} is not in the " + f"class list {sorted(name_to_id)}." + ) + box = obj.find("bndbox") + if box is None: + raise ValueError(f"VOC object in {xml_path.name} has no .") + boxes.append( + [ + _coord(box, "xmin", xml_path), + _coord(box, "ymin", xml_path), + _coord(box, "xmax", xml_path), + _coord(box, "ymax", xml_path), + ] + ) + labels.append(name_to_id[name]) + records.append({"sample_id": filename_el.text, "boxes": boxes, "labels": labels}) + return records + + def parse( + self, + *, + task_kind: TaskKind, + tensor: Any, + sample_ids: list[str] | None, + data_source: str | None, + class_names: list[str] | None, + ) -> Any: + """Load VOC xml labels and align to sample_ids for detection.""" + labels_dir = get_source_path(self.source, kind=SourceKind.LABELS) + # Precedence: parser's own class_names > model's class_names > _VOC_CLASSES + active_names: list[str] | tuple[str, ...] 
= ( + self.class_names + if self.class_names is not None + else (class_names if class_names is not None else _VOC_CLASSES) + ) + name_to_id = {name: idx for idx, name in enumerate(active_names)} + records = self._to_detection_records(labels_dir, name_to_id) + return _align_detection_records( + records, + expected=len(tensor), + sample_ids=sample_ids, + strategy=str(self.id_strategy), + ) diff --git a/src/raitap/data/label_parsers/yolo.py b/src/raitap/data/label_parsers/yolo.py new file mode 100644 index 00000000..3d76c298 --- /dev/null +++ b/src/raitap/data/label_parsers/yolo.py @@ -0,0 +1,93 @@ +"""YOLO label parser (detection-only).""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, Any + +from PIL import Image + +from raitap.configs.schema import YoloLabelsConfig +from raitap.data.data import SourceKind, get_source_path +from raitap.data.label_parsers.registration import label_parser +from raitap.data.types import IdStrategy +from raitap.task_families.detection import _align_detection_records +from raitap.types import TaskKind + +if TYPE_CHECKING: + from pathlib import Path + +_IMAGE_SUFFIXES = (".jpg", ".jpeg", ".png", ".bmp", ".webp") + + +@label_parser(registry_name="yolo", schema=YoloLabelsConfig) +class YoloLabelParser: + """Parse YOLO per-image ``.txt`` (``class cx cy w h``, normalised) for detection. + + Boxes are denormalised to pixel ``[x1, y1, x2, y2]`` using each image's + size read from PIL. Class indices pass through unchanged. 
+ """ + + supported_tasks: frozenset[TaskKind] = frozenset({TaskKind.detection}) + + def __init__( + self, + *, + source: str, + id_strategy: IdStrategy = IdStrategy.auto, + ) -> None: + self.source = source + self.id_strategy = id_strategy + + def _image_for(self, image_dir: Path, stem: str) -> Path: + for suffix in _IMAGE_SUFFIXES: + candidate = image_dir / f"{stem}{suffix}" + if candidate.exists(): + return candidate + raise ValueError(f"YOLO parser found no image for label {stem!r} in {image_dir}.") + + def _to_detection_records(self, labels_dir: Path, image_dir: Path) -> list[dict[str, Any]]: + records: list[dict[str, Any]] = [] + for txt in sorted(labels_dir.glob("*.txt")): + image_path = self._image_for(image_dir, txt.stem) + with Image.open(image_path) as im: + width, height = im.size + boxes: list[list[float]] = [] + labels: list[int] = [] + for line in txt.read_text().splitlines(): + parts = line.split() + if not parts: + continue + cls, cx, cy, bw, bh = (float(p) for p in parts[:5]) + x1 = (cx - bw / 2) * width + y1 = (cy - bh / 2) * height + x2 = (cx + bw / 2) * width + y2 = (cy + bh / 2) * height + boxes.append([x1, y1, x2, y2]) + labels.append(int(cls)) + records.append({"sample_id": image_path.name, "boxes": boxes, "labels": labels}) + return records + + def parse( + self, + *, + task_kind: TaskKind, + tensor: Any, + sample_ids: list[str] | None, + data_source: str | None, + class_names: list[str] | None, + ) -> Any: + """Load YOLO labels and align to sample_ids for detection.""" + if data_source is None: + raise ValueError( + "YOLO labels need data.source (image directory) to denormalise boxes; " + "set data.source to the image directory." 
+ ) + labels_dir = get_source_path(self.source, kind=SourceKind.LABELS) + image_dir = get_source_path(data_source, kind=SourceKind.DATA) + records = self._to_detection_records(labels_dir, image_dir) + return _align_detection_records( + records, + expected=len(tensor), + sample_ids=sample_ids, + strategy=str(self.id_strategy), + ) diff --git a/src/raitap/data/tests/test_data.py b/src/raitap/data/tests/test_data.py index b1e931e4..907778df 100644 --- a/src/raitap/data/tests/test_data.py +++ b/src/raitap/data/tests/test_data.py @@ -195,7 +195,7 @@ class TestDataPreprocessing: @staticmethod def _make_cfg(source: str, *, preprocessing: str | None) -> AppConfig: - from raitap.configs.schema import AppConfig, DataConfig, LabelsConfig, ModelConfig + from raitap.configs.schema import AppConfig, DataConfig, ModelConfig return cast( "AppConfig", @@ -205,7 +205,7 @@ def _make_cfg(source: str, *, preprocessing: str | None) -> AppConfig: name="test", source=source, preprocessing=preprocessing, - labels=LabelsConfig(), + labels=None, ), hardware=Hardware.cpu, ), @@ -239,7 +239,7 @@ def test_uniform_dir_without_preprocessing_still_loads(self, tmp_path: Path) -> def test_supplied_resolved_preprocessing_skips_resolution(self, tmp_path: Path) -> None: from torch import nn - from raitap.configs.schema import AppConfig, DataConfig, LabelsConfig, ModelConfig + from raitap.configs.schema import AppConfig, DataConfig, ModelConfig from raitap.data.preprocessing import ResolvedPreprocessing class _ShapeModule(nn.Module): @@ -255,7 +255,7 @@ def forward(self, image: torch.Tensor) -> torch.Tensor: name="test", source=str(tmp_path), preprocessing="model-bundled", - labels=LabelsConfig(), + labels=None, ), hardware=Hardware.cpu, ), @@ -279,7 +279,7 @@ def forward(self, image: torch.Tensor) -> torch.Tensor: def test_onnx_custom_file_data_factory_drives_data_loading( self, monkeypatch: pytest.MonkeyPatch, tmp_path: Path ) -> None: - from raitap.configs.schema import AppConfig, DataConfig, 
LabelsConfig, ModelConfig + from raitap.configs.schema import AppConfig, DataConfig, ModelConfig _write_image(tmp_path / "a.jpg", 32, 48) _write_image(tmp_path / "b.jpg", 40, 64) @@ -319,7 +319,7 @@ def test_onnx_custom_file_data_factory_drives_data_loading( name="test", source=str(tmp_path), preprocessing=str(preprocessing_path), - labels=LabelsConfig(), + labels=None, ), hardware=Hardware.cpu, ), @@ -369,7 +369,7 @@ def test_sample_source_loads_native_resolution_then_transforms(self, tmp_path: P breaks pretrained-weight accuracy on `raitap --demo`.""" from torch import nn - from raitap.configs.schema import AppConfig, DataConfig, LabelsConfig, ModelConfig + from raitap.configs.schema import AppConfig, DataConfig, ModelConfig from raitap.data.samples import SAMPLE_SOURCES # Stage a fake sample at varied native sizes so the test would fail @@ -411,7 +411,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: name="fake_native_samples", source="fake_native_samples", preprocessing="model-bundled", - labels=LabelsConfig(), + labels=None, ), hardware=Hardware.cpu, ), @@ -589,7 +589,7 @@ def test_unknown_extension_raises(self, tmp_path: Path) -> None: def test_tabular_applies_data_module(self, tmp_path: Path) -> None: from torch import nn - from raitap.configs.schema import AppConfig, DataConfig, LabelsConfig, ModelConfig + from raitap.configs.schema import AppConfig, DataConfig, ModelConfig from raitap.data.preprocessing import ResolvedPreprocessing class _ScaleModule(nn.Module): @@ -606,7 +606,7 @@ def forward(self, batch: torch.Tensor) -> torch.Tensor: name="tab", source=str(p), preprocessing="./scale.py", - labels=LabelsConfig(), + labels=None, ), hardware=Hardware.cpu, ), @@ -701,7 +701,9 @@ def test_url_source_loads_image_via_get_source_path(self, tmp_path: Path) -> Non assert data.tensor.shape == (1, 3, 32, 32) def test_sample_labels_align_with_sample_images(self, tmp_path: Path) -> None: + from raitap.configs.schema import AppConfig, DataConfig, ModelConfig, 
TabularLabelsConfig from raitap.data.samples import SAMPLE_LABELS + from raitap.data.types import LabelEncoding with ( patch("raitap.data.samples._CACHE_DIR", tmp_path), @@ -711,30 +713,20 @@ def test_sample_labels_align_with_sample_images(self, tmp_path: Path) -> None: mock_download.side_effect = lambda _url, dest: _write_image(dest, 32, 32) cfg = cast( "AppConfig", - type( - "AppConfig", - (), - { - "data": type( - "DataConfig", - (), - { - "source": "imagenet_samples", - "name": "imagenet_samples", - "labels": type( - "LabelsConfig", - (), - { - "source": "imagenet_samples", - "id_column": "image", - "column": "label", - "encoding": "index", - }, - )(), - }, - )() - }, - )(), + AppConfig( + model=ModelConfig(source="resnet50"), + data=DataConfig( + name="imagenet_samples", + source="imagenet_samples", + labels=TabularLabelsConfig( + source="imagenet_samples", + id_column="image", + column="label", + encoding=LabelEncoding.index, + ), + ), + hardware=Hardware.cpu, + ), ) data = Data(cfg) diff --git a/src/raitap/data/tests/test_data_class.py b/src/raitap/data/tests/test_data_class.py index 44b207f7..1fba154e 100644 --- a/src/raitap/data/tests/test_data_class.py +++ b/src/raitap/data/tests/test_data_class.py @@ -16,9 +16,9 @@ from raitap.configs.schema import AppConfig +from raitap.configs.schema import DirectoryLabelsConfig, TabularLabelsConfig from raitap.data import Data -from raitap.data.data import _load_directory_labels, load_classification_labels -from raitap.data.types import DIRECTORY_LABELS_SOURCE, InputModality +from raitap.data.types import InputModality def _write_image(path: Path) -> None: @@ -35,19 +35,28 @@ def _make_config( labels_encoding: str | None = None, labels_id_strategy: str | None = None, ) -> AppConfig: + from raitap.data.types import IdStrategy, LabelEncoding + + if labels_source is not None: + encoding = LabelEncoding(labels_encoding) if labels_encoding else None + id_strategy = IdStrategy(labels_id_strategy) if labels_id_strategy else 
IdStrategy.auto + labels = TabularLabelsConfig( + source=labels_source, + id_column=labels_id_column, + column=labels_column, + encoding=encoding, + id_strategy=id_strategy, + ) + else: + labels = None + return cast( "AppConfig", SimpleNamespace( data=SimpleNamespace( source=source, name=name, - labels=SimpleNamespace( - source=labels_source, - id_column=labels_id_column, - column=labels_column, - encoding=labels_encoding, - id_strategy=labels_id_strategy, - ), + labels=labels, ) ), ) @@ -342,16 +351,15 @@ def test_data_raises_for_unsupported_id_strategy(self, tmp_path: Path) -> None: _write_image(data_dir / "x.jpg") labels_file = tmp_path / "labels.csv" labels_file.write_text("image,label\nx,0\n") - config = _make_config( - str(data_dir), - labels_source=str(labels_file), - labels_id_column="image", - labels_column="label", - labels_encoding="index", - labels_id_strategy="bogus", - ) - - with pytest.raises(ValueError, match=r"Unsupported data\.labels\.id_strategy"): + with pytest.raises(ValueError): + config = _make_config( + str(data_dir), + labels_source=str(labels_file), + labels_id_column="image", + labels_column="label", + labels_encoding="index", + labels_id_strategy="bogus", + ) Data(config) def test_data_records_image_modality_for_image_dir(self, tmp_path: Path) -> None: @@ -397,14 +405,13 @@ def test_data_raises_for_unsupported_labels_encoding(self, tmp_path: Path) -> No csv_file.write_text("a\n1\n2") labels_file = tmp_path / "labels.csv" labels_file.write_text("label\n0\n1") - config = _make_config( - str(csv_file), - labels_source=str(labels_file), - labels_column="label", - labels_encoding="ordinal", - ) - - with pytest.raises(ValueError, match=r"Unsupported data\.labels\.encoding"): + with pytest.raises(ValueError): + config = _make_config( + str(csv_file), + labels_source=str(labels_file), + labels_column="label", + labels_encoding="ordinal", + ) Data(config) @@ -482,54 +489,84 @@ def test_log_includes_full_metadata(self, tmp_path: Path) -> None: 
assert "dtype" in call_args -class TestLoadDirectoryLabels: +class TestLoadDirectoryLabelsViaParser: + """Directory label behavior via DirectoryLabelParser (replaces deleted _load_directory_labels). + + The private _load_directory_labels function and load_classification_labels were removed in + the discriminated-config refactor. Behavior is now covered by DirectoryLabelParser + and _resolve_and_parse_labels. These tests preserve the behavioral contracts. + """ + + def _run_directory_parser(self, sample_ids: list[str] | None) -> torch.Tensor | None: + from types import SimpleNamespace + from typing import cast + + from raitap.data.data import _resolve_and_parse_labels + from raitap.types import TaskKind + + cfg = cast( + "AppConfig", + SimpleNamespace( + data=SimpleNamespace(labels=DirectoryLabelsConfig(), source=None), + model=SimpleNamespace(class_names=None), + ), + ) + return _resolve_and_parse_labels( + cfg, task_kind=TaskKind.classification, tensor=None, sample_ids=sample_ids + ) + def test_derives_labels_from_top_level_class_folder(self) -> None: - result = _load_directory_labels(["NORMAL/a.jpg", "PNEUMONIA/b.jpg", "NORMAL/c.jpg"]) + result = self._run_directory_parser(["NORMAL/a.jpg", "PNEUMONIA/b.jpg", "NORMAL/c.jpg"]) assert result is not None assert torch.equal(result, torch.tensor([0, 1, 0])) def test_nesting_within_class_stays_top_level(self) -> None: - result = _load_directory_labels(["NORMAL/sub/a.jpg", "PNEUMONIA/b.jpg"]) + result = self._run_directory_parser(["NORMAL/sub/a.jpg", "PNEUMONIA/b.jpg"]) assert result is not None assert torch.equal(result, torch.tensor([0, 1])) def test_single_class_is_all_zeros_not_error(self) -> None: - result = _load_directory_labels(["NORMAL/a.jpg", "NORMAL/b.jpg"]) + result = self._run_directory_parser(["NORMAL/a.jpg", "NORMAL/b.jpg"]) assert result is not None assert torch.equal(result, torch.tensor([0, 0])) def test_sample_without_class_subdir_returns_none(self) -> None: - with pytest.warns(UserWarning, 
match="class subdirectory"): - result = _load_directory_labels(["a.jpg", "NORMAL/b.jpg"]) + result = self._run_directory_parser(["a.jpg", "NORMAL/b.jpg"]) assert result is None def test_none_sample_ids_returns_none(self) -> None: - with pytest.warns(UserWarning, match="class subdirectories"): - result = _load_directory_labels(None) + result = self._run_directory_parser(None) assert result is None def test_empty_sample_ids_returns_none(self) -> None: - with pytest.warns(UserWarning, match="class subdirectories"): - result = _load_directory_labels([]) + result = self._run_directory_parser([]) assert result is None + def test_directory_source_derives_labels_from_layout(self, tmp_path: Path) -> None: + """Data with DirectoryLabelsConfig derives labels from the sample layout.""" + from types import SimpleNamespace + from typing import cast + + img_dir = tmp_path / "images" + (img_dir / "NORMAL").mkdir(parents=True) + (img_dir / "PNEUMONIA").mkdir(parents=True) + _write_image(img_dir / "NORMAL" / "a.jpg") + _write_image(img_dir / "PNEUMONIA" / "b.jpg") + _write_image(img_dir / "NORMAL" / "c.jpg") + + cfg = cast( + "AppConfig", + SimpleNamespace( + data=SimpleNamespace( + source=str(img_dir), + name="test_dir", + labels=DirectoryLabelsConfig(), + ) + ), + ) + data = Data(cfg) -class TestLoadClassificationLabelsDirectorySource: - def test_directory_source_derives_labels(self) -> None: - config = _make_config("images", labels_source=DIRECTORY_LABELS_SOURCE) - sample_ids = ["NORMAL/a.jpg", "PNEUMONIA/b.jpg", "NORMAL/c.jpg"] - tensor = torch.zeros(len(sample_ids), 3, 8, 8) - - result = load_classification_labels(config, tensor=tensor, sample_ids=sample_ids) - - assert result is not None - assert torch.equal(result, torch.tensor([0, 1, 0])) - - def test_directory_source_none_sample_ids_returns_none(self) -> None: - config = _make_config("rows.csv", labels_source=DIRECTORY_LABELS_SOURCE) - tensor = torch.zeros(3, 4) - - with pytest.warns(UserWarning, match="class 
subdirectories"): - result = load_classification_labels(config, tensor=tensor, sample_ids=None) - - assert result is None + assert data.labels is not None + assert isinstance(data.labels, torch.Tensor) + # NORMAL=0, PNEUMONIA=1; sorted by posix path: NORMAL/a, NORMAL/c, PNEUMONIA/b + assert data.labels.tolist() == [0, 0, 1] diff --git a/src/raitap/data/tests/test_detection_labels.py b/src/raitap/data/tests/test_detection_labels.py index 5a5663da..413c81da 100644 --- a/src/raitap/data/tests/test_detection_labels.py +++ b/src/raitap/data/tests/test_detection_labels.py @@ -1,4 +1,4 @@ -"""Tests for DetectionFamily.load_labels — list[dict] per-sample boxes + labels.""" +"""Tests for DetectionJsonLabelParser -- list[dict] per-sample boxes + labels.""" from __future__ import annotations @@ -9,8 +9,9 @@ import pytest import torch -from raitap.data.data import Data -from raitap.task_families.detection import DetectionFamily +from raitap.configs.schema import DetectionJsonLabelsConfig +from raitap.data.data import Data, _resolve_and_parse_labels +from raitap.types import TaskKind if TYPE_CHECKING: from pathlib import Path @@ -40,12 +41,15 @@ def _write_detection_labels_json(path: Path) -> None: def _stub_cfg(labels_source: str | None = None) -> AppConfig: + labels = DetectionJsonLabelsConfig(source=labels_source) if labels_source is not None else None return cast( "AppConfig", SimpleNamespace( data=SimpleNamespace( - labels=SimpleNamespace(source=labels_source), + labels=labels, + source=None, ), + model=SimpleNamespace(class_names=None), ), ) @@ -63,7 +67,9 @@ def test_load_detection_labels_returns_list_of_dicts(tmp_path: Path) -> None: cfg = _stub_cfg(labels_source=str(labels_path)) data = _make_data(num_samples=3) - out = DetectionFamily().load_labels(cfg, tensor=data.tensor, sample_ids=data.sample_ids) + out = _resolve_and_parse_labels( + cfg, task_kind=TaskKind.detection, tensor=data.tensor, sample_ids=data.sample_ids + ) assert out is not None assert isinstance(out, 
list) assert len(out) == 3 @@ -79,7 +85,7 @@ def test_load_detection_labels_returns_list_of_dicts(tmp_path: Path) -> None: def test_load_detection_labels_aligns_by_sample_id_when_present(tmp_path: Path) -> None: - """Reordered labels file is rewritten to match self.sample_ids ordering.""" + """Reordered labels file is rewritten to match sample_ids ordering.""" labels_path = tmp_path / "boxes.json" # Write records out of order vs sample_ids. payload = [ @@ -91,7 +97,9 @@ def test_load_detection_labels_aligns_by_sample_id_when_present(tmp_path: Path) cfg = _stub_cfg(labels_source=str(labels_path)) data = _make_data(num_samples=3, sample_ids=["img_0", "img_1", "img_2"]) - out = DetectionFamily().load_labels(cfg, tensor=data.tensor, sample_ids=data.sample_ids) + out = _resolve_and_parse_labels( + cfg, task_kind=TaskKind.detection, tensor=data.tensor, sample_ids=data.sample_ids + ) assert out is not None assert int(out[0]["labels"].item()) == 7 assert out[1]["labels"].numel() == 0 @@ -110,7 +118,9 @@ def test_load_detection_labels_rejects_missing_sample_id_entries(tmp_path: Path) data = _make_data(num_samples=3, sample_ids=["img_0", "img_1", "img_2"]) with pytest.raises(ValueError, match="missing entries"): - DetectionFamily().load_labels(cfg, tensor=data.tensor, sample_ids=data.sample_ids) + _resolve_and_parse_labels( + cfg, task_kind=TaskKind.detection, tensor=data.tensor, sample_ids=data.sample_ids + ) def test_load_detection_labels_rejects_duplicate_sample_id(tmp_path: Path) -> None: @@ -125,7 +135,9 @@ def test_load_detection_labels_rejects_duplicate_sample_id(tmp_path: Path) -> No data = _make_data(num_samples=2, sample_ids=["img_0", "img_1"]) with pytest.raises(ValueError, match="duplicate sample_id"): - DetectionFamily().load_labels(cfg, tensor=data.tensor, sample_ids=data.sample_ids) + _resolve_and_parse_labels( + cfg, task_kind=TaskKind.detection, tensor=data.tensor, sample_ids=data.sample_ids + ) def 
test_load_detection_labels_rejects_record_missing_sample_id_field(tmp_path: Path) -> None: @@ -138,7 +150,9 @@ def test_load_detection_labels_rejects_record_missing_sample_id_field(tmp_path: data = _make_data(num_samples=1, sample_ids=["img_0"]) with pytest.raises(ValueError, match="missing 'sample_id'"): - DetectionFamily().load_labels(cfg, tensor=data.tensor, sample_ids=data.sample_ids) + _resolve_and_parse_labels( + cfg, task_kind=TaskKind.detection, tensor=data.tensor, sample_ids=data.sample_ids + ) def test_load_detection_labels_rejects_wrong_length_when_no_sample_ids(tmp_path: Path) -> None: @@ -149,7 +163,9 @@ def test_load_detection_labels_rejects_wrong_length_when_no_sample_ids(tmp_path: data = _make_data(num_samples=5) # dataset bigger than labels with pytest.raises(ValueError, match="5 samples"): - DetectionFamily().load_labels(cfg, tensor=data.tensor, sample_ids=data.sample_ids) + _resolve_and_parse_labels( + cfg, task_kind=TaskKind.detection, tensor=data.tensor, sample_ids=data.sample_ids + ) def test_load_detection_labels_rejects_mismatched_box_label_counts(tmp_path: Path) -> None: @@ -161,7 +177,9 @@ def test_load_detection_labels_rejects_mismatched_box_label_counts(tmp_path: Pat data = _make_data(num_samples=1) with pytest.raises(ValueError, match="boxes and labels"): - DetectionFamily().load_labels(cfg, tensor=data.tensor, sample_ids=data.sample_ids) + _resolve_and_parse_labels( + cfg, task_kind=TaskKind.detection, tensor=data.tensor, sample_ids=data.sample_ids + ) def test_load_detection_labels_rejects_non_list_root(tmp_path: Path) -> None: @@ -171,13 +189,17 @@ def test_load_detection_labels_rejects_non_list_root(tmp_path: Path) -> None: data = _make_data(num_samples=1) with pytest.raises(ValueError, match="must be a JSON array"): - DetectionFamily().load_labels(cfg, tensor=data.tensor, sample_ids=data.sample_ids) + _resolve_and_parse_labels( + cfg, task_kind=TaskKind.detection, tensor=data.tensor, sample_ids=data.sample_ids + ) def 
test_load_detection_labels_returns_none_when_no_source_configured(tmp_path: Path) -> None: cfg = _stub_cfg(labels_source=None) data = _make_data(num_samples=1) - out = DetectionFamily().load_labels(cfg, tensor=data.tensor, sample_ids=data.sample_ids) + out = _resolve_and_parse_labels( + cfg, task_kind=TaskKind.detection, tensor=data.tensor, sample_ids=data.sample_ids + ) assert out is None @@ -185,4 +207,6 @@ def test_load_detection_labels_raises_when_source_unresolvable(tmp_path: Path) - cfg = _stub_cfg(labels_source=str(tmp_path / "missing.json")) data = _make_data(num_samples=1) with pytest.raises(ValueError, match="could not be resolved"): - DetectionFamily().load_labels(cfg, tensor=data.tensor, sample_ids=data.sample_ids) + _resolve_and_parse_labels( + cfg, task_kind=TaskKind.detection, tensor=data.tensor, sample_ids=data.sample_ids + ) diff --git a/src/raitap/data/tests/test_detection_ragged.py b/src/raitap/data/tests/test_detection_ragged.py index b805bbf2..bbad5a7b 100644 --- a/src/raitap/data/tests/test_detection_ragged.py +++ b/src/raitap/data/tests/test_detection_ragged.py @@ -18,7 +18,8 @@ import torch from PIL import Image -from raitap.data.data import Data +from raitap.configs.schema import DetectionJsonLabelsConfig +from raitap.data.data import Data, _resolve_and_parse_labels from raitap.task_families.classification import ClassificationFamily from raitap.task_families.detection import DetectionFamily from raitap.types import TaskKind @@ -47,14 +48,7 @@ def _make_config(source: str, name: str = "test_det") -> AppConfig: data=SimpleNamespace( source=source, name=name, - labels=SimpleNamespace( - source=None, - kind=None, - id_column=None, - column=None, - encoding=None, - id_strategy=None, - ), + labels=None, ) ), ) @@ -164,7 +158,7 @@ def _write_labels_json(self, path: Path, n: int) -> None: path.write_text(json.dumps(payload)) def test_detection_labels_count_matches_list_tensor(self, tmp_path: Path) -> None: - """DetectionFamily.load_labels: 
len(tensor) works when tensor is a list.""" + """DetectionJsonLabelParser: len(tensor) works when tensor is a list.""" labels_path = tmp_path / "boxes.json" self._write_labels_json(labels_path, n=3) @@ -181,13 +175,15 @@ def test_detection_labels_count_matches_list_tensor(self, tmp_path: Path) -> Non "AppConfig", SimpleNamespace( data=SimpleNamespace( - labels=SimpleNamespace( - source=str(labels_path), - ) - ) + labels=DetectionJsonLabelsConfig(source=str(labels_path)), + source=None, + ), + model=SimpleNamespace(class_names=None), ), ) - out = DetectionFamily().load_labels(cfg, tensor=data.tensor, sample_ids=data.sample_ids) + out = _resolve_and_parse_labels( + cfg, task_kind=TaskKind.detection, tensor=data.tensor, sample_ids=data.sample_ids + ) assert out is not None assert len(out) == 3 @@ -204,14 +200,16 @@ def test_detection_labels_count_mismatch_raises_with_list_tensor(self, tmp_path: "AppConfig", SimpleNamespace( data=SimpleNamespace( - labels=SimpleNamespace( - source=str(labels_path), - ) - ) + labels=DetectionJsonLabelsConfig(source=str(labels_path)), + source=None, + ), + model=SimpleNamespace(class_names=None), ), ) with pytest.raises(ValueError, match="3 samples"): - DetectionFamily().load_labels(cfg, tensor=data.tensor, sample_ids=data.sample_ids) + _resolve_and_parse_labels( + cfg, task_kind=TaskKind.detection, tensor=data.tensor, sample_ids=data.sample_ids + ) # --------------------------------------------------------------------------- diff --git a/src/raitap/data/tests/test_label_parsers.py b/src/raitap/data/tests/test_label_parsers.py new file mode 100644 index 00000000..cff4face --- /dev/null +++ b/src/raitap/data/tests/test_label_parsers.py @@ -0,0 +1,673 @@ +"""Task 3 tests: _resolve_and_parse_labels + DirectoryLabelParser e2e.""" + +from __future__ import annotations + +import importlib +from types import SimpleNamespace +from typing import cast + +import pytest + +from raitap.configs.schema import AppConfig, DirectoryLabelsConfig +from 
raitap.data.data import _resolve_and_parse_labels +from raitap.types import TaskKind + + +def _make_cfg( + *, + labels: object = None, + source: str | None = None, + class_names: list[str] | None = None, +) -> AppConfig: + """Build a minimal AppConfig-shaped namespace for unit tests.""" + data_ns = SimpleNamespace(labels=labels, source=source) + model_ns = SimpleNamespace(class_names=class_names) + return cast("AppConfig", SimpleNamespace(data=data_ns, model=model_ns)) + + +def test_resolve_returns_none_when_labels_is_none() -> None: + cfg = _make_cfg(labels=None) + result = _resolve_and_parse_labels( + cfg, task_kind=TaskKind.classification, tensor=None, sample_ids=None + ) + assert result is None + + +def test_directory_parser_e2e_returns_label_tensor() -> None: + """DirectoryLabelParser derives class index from top-level folder name.""" + import torch + + cfg = _make_cfg(labels=DirectoryLabelsConfig()) + sample_ids = ["cat/a.jpg", "dog/b.jpg"] + result = _resolve_and_parse_labels( + cfg, task_kind=TaskKind.classification, tensor=None, sample_ids=sample_ids + ) + assert result is not None + assert isinstance(result, torch.Tensor) + assert result.dtype == torch.long + # "cat" < "dog" alphabetically -> cat=0, dog=1 + assert result.tolist() == [0, 1] + + +def test_directory_parser_raises_for_unsupported_task() -> None: + cfg = _make_cfg(labels=DirectoryLabelsConfig()) + sample_ids = ["cat/a.jpg", "dog/b.jpg"] + with pytest.raises(ValueError, match="does not support task_kind"): + _resolve_and_parse_labels( + cfg, task_kind=TaskKind.detection, tensor=None, sample_ids=sample_ids + ) + + +def test_directory_parser_returns_none_for_no_sample_ids() -> None: + """No sample_ids -> returns None with a warning (graceful degradation).""" + cfg = _make_cfg(labels=DirectoryLabelsConfig()) + result = _resolve_and_parse_labels( + cfg, task_kind=TaskKind.classification, tensor=None, sample_ids=None + ) + assert result is None + + +def 
test_directory_parser_returns_none_for_flat_layout() -> None: + """Samples directly under root (no class subdir) -> None with warning.""" + cfg = _make_cfg(labels=DirectoryLabelsConfig()) + sample_ids = ["a.jpg", "b.jpg"] + result = _resolve_and_parse_labels( + cfg, task_kind=TaskKind.classification, tensor=None, sample_ids=sample_ids + ) + assert result is None + + +# --- Integration: full hydra compose path --- + + +def _register_labels_group() -> None: + importlib.import_module("raitap.data.label_parsers") + from hydra.core.config_store import ConfigStore + + from raitap._adapters import store + from raitap.configs.schema import AppConfig + + store.add_to_hydra_store(overwrite_ok=True) + ConfigStore.instance().store(name="raitap_schema", node=AppConfig) + + +_COMPOSED_TARGET = "raitap.data.label_parsers.directory.DirectoryLabelParser" + + +def test_integration_compose_data_labels_directory() -> None: + """Composing +data/labels=directory lands cfg.data.labels._target_ at the FQN.""" + from hydra import compose, initialize + from hydra.core.global_hydra import GlobalHydra + + _register_labels_group() + GlobalHydra.instance().clear() + with initialize(version_base=None, config_path=None): + cfg = compose(config_name="raitap_schema", overrides=["+data/labels=directory"]) + assert cfg.data.labels._target_ == _COMPOSED_TARGET + + +# --- Task 4: TabularLabelParser --- + + +def _write_csv(path: object, content: str) -> None: + import pathlib + + pathlib.Path(str(path)).write_text(content, encoding="utf-8") + + +def test_tabular_parser_e2e_via_resolve_and_parse_labels(tmp_path: object) -> None: + """CSV with image,label rows + sample_ids -> aligned long tensor via resolve.""" + import pathlib + + import torch + + from raitap.configs.schema import TabularLabelsConfig + from raitap.data.data import _resolve_and_parse_labels + + csv_path = pathlib.Path(str(tmp_path)) / "labels.csv" + _write_csv(csv_path, "image,label\nb.jpg,1\na.jpg,0\n") + + cfg = _make_cfg( + 
labels=TabularLabelsConfig( + source=str(csv_path), + id_column="image", + ) + ) + sample_ids = ["a.jpg", "b.jpg"] + result = _resolve_and_parse_labels( + cfg, task_kind=TaskKind.classification, tensor=None, sample_ids=sample_ids + ) + assert result is not None + assert isinstance(result, torch.Tensor) + assert result.dtype == torch.long + # a.jpg -> label 0, b.jpg -> label 1 + assert result.tolist() == [0, 1] + + +def test_tabular_parser_direct_unit(tmp_path: object) -> None: + """Direct TabularLabelParser.parse unit test without cfg dispatch.""" + import pathlib + + import torch + + from raitap.data.label_parsers.tabular import TabularLabelParser + + csv_path = pathlib.Path(str(tmp_path)) / "labels.csv" + _write_csv(csv_path, "image,label\na.jpg,0\nb.jpg,1\n") + + parser = TabularLabelParser(source=str(csv_path), id_column="image") + result = parser.parse( + task_kind=TaskKind.classification, + tensor=None, + sample_ids=["a.jpg", "b.jpg"], + data_source=None, + class_names=None, + ) + assert result is not None + assert isinstance(result, torch.Tensor) + assert result.dtype == torch.long + assert result.tolist() == [0, 1] + + +# --- Task 5: CocoLabelParser --- + + +def _write_json(path: object, data: object) -> None: + import json + import pathlib + + pathlib.Path(str(path)).write_text(json.dumps(data), encoding="utf-8") + + +def _coco_detection_fixture(tmp_path: object) -> object: + """Two-image COCO with one annotated image and one empty image.""" + import pathlib + + coco = { + "images": [ + {"id": 1, "file_name": "a.jpg"}, + {"id": 2, "file_name": "b.jpg"}, + ], + "annotations": [ + {"image_id": 1, "category_id": 3, "bbox": [10, 20, 30, 40]}, + {"image_id": 1, "category_id": 5, "bbox": [0, 0, 5, 5]}, + ], + "categories": [{"id": 3, "name": "car"}, {"id": 5, "name": "dog"}], + } + p = pathlib.Path(str(tmp_path)) / "instances.json" + _write_json(p, coco) + return p + + +def _coco_classification_fixture(tmp_path: object) -> object: + """Two-image COCO for 
classification (one category per image).""" + import pathlib + + coco = { + "images": [ + {"id": 1, "file_name": "a.jpg"}, + {"id": 2, "file_name": "b.jpg"}, + ], + "annotations": [ + {"image_id": 1, "category_id": 0, "bbox": [0, 0, 1, 1]}, + {"image_id": 2, "category_id": 4, "bbox": [0, 0, 1, 1]}, + ], + "categories": [{"id": 0, "name": "x"}, {"id": 4, "name": "y"}], + } + p = pathlib.Path(str(tmp_path)) / "cls.json" + _write_json(p, coco) + return p + + +def test_coco_parser_detection_direct(tmp_path: object) -> None: + """CocoLabelParser.parse detection: boxes xyxy, labels, empty-image shape.""" + import torch + + from raitap.data.label_parsers.coco import CocoLabelParser + + labels_path = _coco_detection_fixture(tmp_path) + parser = CocoLabelParser(source=str(labels_path)) + tensor = [object(), object()] # two samples + result = parser.parse( + task_kind=TaskKind.detection, + tensor=tensor, + sample_ids=["a.jpg", "b.jpg"], + data_source=None, + class_names=None, + ) + assert isinstance(result, list) + assert len(result) == 2 + # a.jpg: two boxes, xyxy conversion + expected_boxes = torch.tensor([[10.0, 20.0, 40.0, 60.0], [0.0, 0.0, 5.0, 5.0]]) + assert torch.equal(result[0]["boxes"], expected_boxes) + assert torch.equal(result[0]["labels"], torch.tensor([3, 5])) + # b.jpg: empty annotation -> (0, 4) boxes, (0,) labels + assert result[1]["boxes"].shape == (0, 4) + assert result[1]["labels"].shape == (0,) + + +def test_coco_parser_classification_direct(tmp_path: object) -> None: + """CocoLabelParser.parse classification: long tensor of category ids.""" + import torch + + from raitap.data.label_parsers.coco import CocoLabelParser + + labels_path = _coco_classification_fixture(tmp_path) + parser = CocoLabelParser(source=str(labels_path)) + result = parser.parse( + task_kind=TaskKind.classification, + tensor=None, + sample_ids=["a.jpg", "b.jpg"], + data_source=None, + class_names=None, + ) + assert isinstance(result, torch.Tensor) + assert result.dtype == torch.long 
+ assert result.tolist() == [0, 4] + + +def test_coco_parser_classification_rejects_multiple_categories(tmp_path: object) -> None: + """Classification parse raises ValueError when an image has >1 categories.""" + import pathlib + + from raitap.data.label_parsers.coco import CocoLabelParser + + coco = { + "images": [{"id": 1, "file_name": "a.jpg"}], + "annotations": [ + {"image_id": 1, "category_id": 3, "bbox": [0, 0, 1, 1]}, + {"image_id": 1, "category_id": 5, "bbox": [0, 0, 1, 1]}, + ], + "categories": [{"id": 3, "name": "car"}, {"id": 5, "name": "dog"}], + } + p = pathlib.Path(str(tmp_path)) / "multi.json" + _write_json(p, coco) + parser = CocoLabelParser(source=str(p)) + with pytest.raises(ValueError, match="exactly one category per image"): + parser.parse( + task_kind=TaskKind.classification, + tensor=None, + sample_ids=["a.jpg"], + data_source=None, + class_names=None, + ) + + +def test_coco_parser_detection_e2e_via_resolve(tmp_path: object) -> None: + """Detection e2e: _resolve_and_parse_labels with CocoLabelsConfig.""" + import torch + + from raitap.configs.schema import CocoLabelsConfig + from raitap.data.data import _resolve_and_parse_labels + + labels_path = _coco_detection_fixture(tmp_path) + cfg = _make_cfg(labels=CocoLabelsConfig(source=str(labels_path))) + tensor = [object(), object()] + result = _resolve_and_parse_labels( + cfg, + task_kind=TaskKind.detection, + tensor=tensor, + sample_ids=["a.jpg", "b.jpg"], + ) + assert isinstance(result, list) + assert len(result) == 2 + expected_boxes = torch.tensor([[10.0, 20.0, 40.0, 60.0], [0.0, 0.0, 5.0, 5.0]]) + assert torch.equal(result[0]["boxes"], expected_boxes) + assert torch.equal(result[0]["labels"], torch.tensor([3, 5])) + assert result[1]["boxes"].shape == (0, 4) + + +def test_coco_parser_classification_e2e_via_resolve(tmp_path: object) -> None: + """Classification e2e: _resolve_and_parse_labels with CocoLabelsConfig.""" + import torch + + from raitap.configs.schema import CocoLabelsConfig + from 
raitap.data.data import _resolve_and_parse_labels + + labels_path = _coco_classification_fixture(tmp_path) + cfg = _make_cfg(labels=CocoLabelsConfig(source=str(labels_path))) + result = _resolve_and_parse_labels( + cfg, + task_kind=TaskKind.classification, + tensor=None, + sample_ids=["a.jpg", "b.jpg"], + ) + assert isinstance(result, torch.Tensor) + assert result.dtype == torch.long + assert result.tolist() == [0, 4] + + +# --- Task 6: YoloLabelParser --- + + +def _make_yolo_fixture( + tmp_path: object, +) -> tuple[object, object]: + """Create a minimal YOLO label dir + image dir with two images. + + Returns (labels_dir, image_dir). Images are 200x100 px. + Each .txt has one box: class 0, cx=0.5, cy=0.5, w=0.6, h=0.1. + Denormalised: x1=(0.5-0.3)*200=40, y1=(0.5-0.05)*100=45, + x2=(0.5+0.3)*200=160, y2=(0.5+0.05)*100=55. + """ + import pathlib + + from PIL import Image as PILImage + + tmp = pathlib.Path(str(tmp_path)) + labels_dir = tmp / "labels" + labels_dir.mkdir() + image_dir = tmp / "images" + image_dir.mkdir() + + for stem in ("a", "b"): + img = PILImage.new("RGB", (200, 100)) + img.save(image_dir / f"{stem}.jpg") + (labels_dir / f"{stem}.txt").write_text("0 0.5 0.5 0.6 0.1\n", encoding="utf-8") + + return labels_dir, image_dir + + +def test_yolo_parser_unit(tmp_path: object) -> None: + """YoloLabelParser.parse: boxes denormalised via PIL image size.""" + from raitap.data.label_parsers.yolo import YoloLabelParser + + labels_dir, image_dir = _make_yolo_fixture(tmp_path) + parser = YoloLabelParser(source=str(labels_dir)) + + tensor = [object(), object()] + result = parser.parse( + task_kind=TaskKind.detection, + tensor=tensor, + sample_ids=["a.jpg", "b.jpg"], + data_source=str(image_dir), + class_names=None, + ) + + assert isinstance(result, list) + assert len(result) == 2 + # IEEE-754: (0.5+0.05)*100 = 55.00000000000001 -> use pytest.approx + assert result[0]["boxes"][0].tolist() == pytest.approx([40.0, 45.0, 160.0, (0.5 + 0.05) * 100]) + assert 
result[0]["labels"].tolist() == [0] + assert result[1]["boxes"].shape == (1, 4) + + +def test_yolo_parser_raises_when_data_source_none(tmp_path: object) -> None: + """parse raises ValueError when data_source is None (no image dir).""" + from raitap.data.label_parsers.yolo import YoloLabelParser + + labels_dir, _ = _make_yolo_fixture(tmp_path) + parser = YoloLabelParser(source=str(labels_dir)) + with pytest.raises(ValueError, match=r"data\.source"): + parser.parse( + task_kind=TaskKind.detection, + tensor=[object()], + sample_ids=None, + data_source=None, + class_names=None, + ) + + +def test_yolo_parser_e2e_via_resolve(tmp_path: object) -> None: + """E2E: _resolve_and_parse_labels with YoloLabelsConfig + real image dir. + + Exercises image_dir resolution through the dispatch (gap #1). + """ + from raitap.configs.schema import YoloLabelsConfig + from raitap.data.data import _resolve_and_parse_labels + + labels_dir, image_dir = _make_yolo_fixture(tmp_path) + + cfg = _make_cfg( + labels=YoloLabelsConfig(source=str(labels_dir)), + source=str(image_dir), + ) + tensor = [object(), object()] + result = _resolve_and_parse_labels( + cfg, + task_kind=TaskKind.detection, + tensor=tensor, + sample_ids=["a.jpg", "b.jpg"], + ) + + assert isinstance(result, list) + assert len(result) == 2 + assert result[0]["boxes"][0].tolist() == pytest.approx([40.0, 45.0, 160.0, (0.5 + 0.05) * 100]) + assert result[0]["labels"].tolist() == [0] + assert result[1]["boxes"].shape == (1, 4) + assert result[1]["labels"].tolist() == [0] + + +# --- Task 7: VocLabelParser --- + + +def _write_voc_xml(path: object, filename: str, objects: list[dict]) -> None: + """Write a minimal Pascal-VOC XML file.""" + import pathlib + + lines = [ + "", + f" {filename}", + ] + for obj in objects: + lines += [ + " ", + f" {obj['name']}", + ] + if obj.get("bndbox") is not None: + b = obj["bndbox"] + lines += [ + " ", + f" {b[0]}", + f" {b[1]}", + f" {b[2]}", + f" {b[3]}", + " ", + ] + lines.append(" ") + 
lines.append("") + pathlib.Path(str(path)).write_text("\n".join(lines), encoding="utf-8") + + +def _make_voc_fixture(tmp_path: object) -> object: + """Two-image VOC dir with class_names=['background','person','car']. + + a.jpg: person at [10,20,30,40], car at [5,5,15,15]. + b.jpg: person at [0,0,50,50]. + """ + import pathlib + + tmp = pathlib.Path(str(tmp_path)) + voc_dir = tmp / "voc_labels" + voc_dir.mkdir() + _write_voc_xml( + voc_dir / "a.xml", + "a.jpg", + [ + {"name": "person", "bndbox": [10, 20, 30, 40]}, + {"name": "car", "bndbox": [5, 5, 15, 15]}, + ], + ) + _write_voc_xml( + voc_dir / "b.xml", + "b.jpg", + [{"name": "person", "bndbox": [0, 0, 50, 50]}], + ) + return voc_dir + + +def test_voc_parser_unit_with_class_names(tmp_path: object) -> None: + """VocLabelParser.parse: person->1, car->2 with explicit class_names arg.""" + import torch + + from raitap.data.label_parsers.voc import VocLabelParser + + voc_dir = _make_voc_fixture(tmp_path) + parser = VocLabelParser(source=str(voc_dir)) + class_names = ["background", "person", "car"] + tensor = [object(), object()] + result = parser.parse( + task_kind=TaskKind.detection, + tensor=tensor, + sample_ids=["a.jpg", "b.jpg"], + data_source=None, + class_names=class_names, + ) + assert isinstance(result, list) + assert len(result) == 2 + # a.jpg: person(1), car(2) + expected_boxes = torch.tensor([[10.0, 20.0, 30.0, 40.0], [5.0, 5.0, 15.0, 15.0]]) + assert torch.equal(result[0]["boxes"], expected_boxes) + assert torch.equal(result[0]["labels"], torch.tensor([1, 2])) + # b.jpg: person(1) + assert torch.equal(result[1]["boxes"], torch.tensor([[0.0, 0.0, 50.0, 50.0]])) + assert torch.equal(result[1]["labels"], torch.tensor([1])) + + +def test_voc_parser_raises_on_missing_bndbox(tmp_path: object) -> None: + """parse raises ValueError when has no .""" + import pathlib + + from raitap.data.label_parsers.voc import VocLabelParser + + tmp = pathlib.Path(str(tmp_path)) + voc_dir = tmp / "voc_no_box" + voc_dir.mkdir() + 
_write_voc_xml( + voc_dir / "bad.xml", + "bad.jpg", + [{"name": "person"}], # no bndbox key -> not written + ) + parser = VocLabelParser(source=str(voc_dir)) + with pytest.raises(ValueError, match="no "): + parser.parse( + task_kind=TaskKind.detection, + tensor=[object()], + sample_ids=["bad.jpg"], + data_source=None, + class_names=["person"], + ) + + +def test_voc_parser_e2e_class_names_from_model(tmp_path: object) -> None: + """E2E: cfg.model.class_names supplies mapping; person->1 via _resolve_and_parse_labels.""" + import torch + + from raitap.configs.schema import VocLabelsConfig + from raitap.data.data import _resolve_and_parse_labels + + voc_dir = _make_voc_fixture(tmp_path) + # class_names on the config is None; model supplies it instead + cfg = _make_cfg( + labels=VocLabelsConfig(source=str(voc_dir)), + class_names=["background", "person", "car"], + ) + tensor = [object(), object()] + result = _resolve_and_parse_labels( + cfg, + task_kind=TaskKind.detection, + tensor=tensor, + sample_ids=["a.jpg", "b.jpg"], + ) + assert isinstance(result, list) + assert len(result) == 2 + assert torch.equal(result[0]["labels"], torch.tensor([1, 2])) + assert torch.equal(result[1]["labels"], torch.tensor([1])) + + +def test_voc_parser_own_class_names_takes_precedence(tmp_path: object) -> None: + """Parser's VocLabelsConfig.class_names overrides model's class_names.""" + import torch + + from raitap.configs.schema import VocLabelsConfig + from raitap.data.data import _resolve_and_parse_labels + + voc_dir = _make_voc_fixture(tmp_path) + # Parser config has class_names; model has a different (wrong) mapping + cfg = _make_cfg( + labels=VocLabelsConfig( + source=str(voc_dir), + class_names=["background", "person", "car"], + ), + class_names=["car", "background", "person"], # different order -> would give wrong ids + ) + tensor = [object(), object()] + result = _resolve_and_parse_labels( + cfg, + task_kind=TaskKind.detection, + tensor=tensor, + sample_ids=["a.jpg", "b.jpg"], + ) 
+ assert isinstance(result, list) + # Parser's own list wins: person->1, car->2 + assert torch.equal(result[0]["labels"], torch.tensor([1, 2])) + + +# --- Task 8: detection id_strategy parity --- + + +def _coco_detection_nested_fixture(tmp_path: object) -> object: + """COCO with file_name='a.jpg' (no subdir) but discovered sample_ids=['sub/a.jpg'].""" + import pathlib + + coco = { + "images": [{"id": 1, "file_name": "a.jpg"}], + "annotations": [ + {"image_id": 1, "category_id": 2, "bbox": [1, 2, 3, 4]}, + ], + "categories": [{"id": 2, "name": "cat"}], + } + p = pathlib.Path(str(tmp_path)) / "nested.json" + _write_json(p, coco) + return p + + +def test_coco_detection_nested_sample_ids_with_stem_strategy(tmp_path: object) -> None: + """Gap #2: COCO record 'a.jpg' matches discovered 'sub/a.jpg' via id_strategy='stem'.""" + import torch + + from raitap.data.label_parsers.coco import CocoLabelParser + from raitap.data.types import IdStrategy + + labels_path = _coco_detection_nested_fixture(tmp_path) + parser = CocoLabelParser(source=str(labels_path), id_strategy=IdStrategy.stem) + tensor = [object()] + result = parser.parse( + task_kind=TaskKind.detection, + tensor=tensor, + sample_ids=["sub/a.jpg"], + data_source=None, + class_names=None, + ) + assert isinstance(result, list) + assert len(result) == 1 + # bbox [1,2,3,4] -> xyxy [1, 2, 1+3, 2+4] = [1, 2, 4, 6] + expected_boxes = torch.tensor([[1.0, 2.0, 4.0, 6.0]]) + assert torch.equal(result[0]["boxes"], expected_boxes) + assert torch.equal(result[0]["labels"], torch.tensor([2])) + + +def test_coco_detection_exact_match_regression(tmp_path: object) -> None: + """Regression: exact-match ids still align under id_strategy='auto'.""" + import torch + + from raitap.data.label_parsers.coco import CocoLabelParser + from raitap.data.types import IdStrategy + + labels_path = _coco_detection_fixture(tmp_path) + parser = CocoLabelParser(source=str(labels_path), id_strategy=IdStrategy.auto) + tensor = [object(), object()] + result 
= parser.parse( + task_kind=TaskKind.detection, + tensor=tensor, + sample_ids=["a.jpg", "b.jpg"], + data_source=None, + class_names=None, + ) + assert isinstance(result, list) + assert len(result) == 2 + expected_boxes = torch.tensor([[10.0, 20.0, 40.0, 60.0], [0.0, 0.0, 5.0, 5.0]]) + assert torch.equal(result[0]["boxes"], expected_boxes) + assert torch.equal(result[0]["labels"], torch.tensor([3, 5])) + assert result[1]["boxes"].shape == (0, 4) + assert result[1]["labels"].shape == (0,) diff --git a/src/raitap/data/types.py b/src/raitap/data/types.py index 2defb94a..943f28d9 100644 --- a/src/raitap/data/types.py +++ b/src/raitap/data/types.py @@ -33,14 +33,6 @@ class IdStrategy(StrEnum): stem = "stem" -#: Reserved ``LabelsConfig.source`` value selecting folder-as-label ingestion: -#: classification labels are derived from each sample's top-level class -#: subdirectory (torchvision ``ImageFolder`` style; no labels file). Kept as a -#: plain ``str`` so it round-trips through OmegaConf; ``LabelsConfig.source`` -#: stays ``str | None`` (a path or this sentinel). -DIRECTORY_LABELS_SOURCE = "directory" - - class Preprocessing(StrEnum): """Named values for ``DataConfig.preprocessing``. 
diff --git a/src/raitap/task_families/base.py b/src/raitap/task_families/base.py index 167cadad..848c4051 100644 --- a/src/raitap/task_families/base.py +++ b/src/raitap/task_families/base.py @@ -96,10 +96,6 @@ def validate_inputs(self, tensor: object) -> None: """Validate the (post-adapt) inputs match this family's contract.""" raise NotImplementedError - def load_labels(self, cfg: AppConfig, *, tensor: object, sample_ids: object) -> Any: - """Load labels in this family's on-disk shape (or None).""" - raise NotImplementedError - def validate_labels(self, labels: object) -> None: """Raise if loaded labels don't match this family's expected shape.""" raise NotImplementedError diff --git a/src/raitap/task_families/classification.py b/src/raitap/task_families/classification.py index 4f86b759..3f8c1bf1 100644 --- a/src/raitap/task_families/classification.py +++ b/src/raitap/task_families/classification.py @@ -61,11 +61,6 @@ def validate_inputs(self, tensor: Any) -> None: if tensor.shape[0] < 1: raise ValueError("Classification data is empty; loaded zero samples.") - def load_labels(self, cfg: Any, *, tensor: Any, sample_ids: Any) -> Any: - from raitap.data.data import load_classification_labels - - return load_classification_labels(cfg, tensor=tensor, sample_ids=sample_ids) - def validate_labels(self, labels: Any) -> None: # A ``list[dict]`` is a detection-shaped label set; a tensor (or None) # is classification-shaped. 
Disagreement means model and data declare diff --git a/src/raitap/task_families/detection.py b/src/raitap/task_families/detection.py index 5141992c..fd10bdcc 100644 --- a/src/raitap/task_families/detection.py +++ b/src/raitap/task_families/detection.py @@ -10,15 +10,119 @@ from typing import TYPE_CHECKING, Any, cast +from raitap.data.data import _normalise_sample_id, _resolve_id_strategy from raitap.task_families.registry import task_family from raitap.transparency.contracts import ExplanationOutputSpace from raitap.types import TaskKind if TYPE_CHECKING: + import torch + from raitap.models.torch_backend import TorchBackend from raitap.task_families.base import ExplainContext, ForwardContext +def _align_detection_records( + records: list[dict[str, Any]], + *, + expected: int, + sample_ids: Any, + strategy: str = "auto", +) -> list[dict[str, torch.Tensor]]: + """Align native detection records to ``sample_ids`` and build tensors. + + Extracted from ``DetectionFamily.load_labels`` so label-format adapters can + feed converted records through the same alignment + validation path. + + When ``sample_ids`` is provided, both the discovered ids and record + ``sample_id`` fields are normalised via ``_normalise_sample_id`` using the + resolved ``strategy``, matching how the classification path handles nested + image directories. + """ + import pandas as pd + import torch + + if sample_ids is not None: + # Collect raw record ids first so _resolve_id_strategy can inspect them. + raw_record_ids: list[str] = [] + for index, record in enumerate(records): + record_id = record.get("sample_id") if isinstance(record, dict) else None + if record_id is None: + raise ValueError( + f"Detection labels record {index} is missing 'sample_id' " + "(required when the dataset exposes sample_ids)." 
+ ) + raw_record_ids.append(str(record_id)) + + resolved = _resolve_id_strategy(strategy, pd.Series(raw_record_ids)) + + by_id: dict[str, dict[str, Any]] = {} + for record, record_id in zip(records, raw_record_ids, strict=True): + norm_id = _normalise_sample_id(record_id, resolved) + if norm_id in by_id: + raise ValueError( + f"Detection labels file contains duplicate sample_id {record_id!r}." + ) + by_id[norm_id] = record + + ordered_records = [] + missing: list[str] = [] + for sample_id in sample_ids: + norm_sid = _normalise_sample_id(sample_id, resolved) + record = by_id.get(norm_sid) + if record is None: + missing.append(sample_id) + else: + ordered_records.append(record) + if missing: + raise ValueError( + f"Detection labels file is missing entries for sample_ids: {missing!r}." + ) + records_iter: list[dict[str, Any]] = ordered_records + else: + if len(records) != expected: + raise ValueError( + f"Detection labels file has {len(records)} records but the " + f"dataset has {expected} samples; provide sample_id fields and " + "set data.labels.source so records can be aligned by id, or " + "match the record count to the sample count." + ) + records_iter = records + + out: list[dict[str, torch.Tensor]] = [] + for index, record in enumerate(records_iter): + boxes_raw = record.get("boxes", []) + labels_raw = record.get("labels", []) + if len(boxes_raw) != len(labels_raw): + raise ValueError( + f"Sample index {index}: boxes and labels must have matching " + f"length (got {len(boxes_raw)} boxes vs {len(labels_raw)} labels)." + ) + boxes_tensor = ( + torch.tensor(boxes_raw, dtype=torch.float32) + if boxes_raw + else torch.zeros((0, 4), dtype=torch.float32) + ) + labels_tensor = ( + torch.tensor(labels_raw, dtype=torch.int64) + if labels_raw + else torch.zeros((0,), dtype=torch.int64) + ) + if boxes_tensor.ndim != 2 or boxes_tensor.shape[1] != 4: + raise ValueError( + f"Sample index {index}: boxes must be shape (M_i, 4); got " + f"{tuple(boxes_tensor.shape)}." 
+ ) + out.append({"boxes": boxes_tensor, "labels": labels_tensor}) + + if len(out) != expected: + raise ValueError( + f"Detection labels alignment produced {len(out)} entries but the " + f"dataset has {expected} samples." + ) + return out + + @task_family class DetectionFamily: kind: TaskKind = TaskKind.detection @@ -69,120 +173,6 @@ def validate_inputs(self, tensor: Any) -> None: + (f" with shape {shape}." if shape is not None else ".") ) - def load_labels(self, cfg: Any, *, tensor: Any, sample_ids: Any) -> Any: - """Load per-sample detection targets (boxes + labels). - - Expected on-disk shape: JSON file (list of records) with each record - carrying ``sample_id`` (str), ``boxes`` (list of ``[x1, y1, x2, y2]`` - floats), and ``labels`` (list of ints). Returns a list whose length - equals ``len(tensor)``; each entry is a dict with - ``boxes: (M_i, 4) float32`` and ``labels: (M_i,) int64`` tensors. - Samples with no boxes get shape-``(0, 4)`` / shape-``(0,)`` tensors. - - Alignment rules: - - * When ``sample_ids`` is set, records are looked up by ``sample_id`` - and the output is ordered to match ``sample_ids``. Any sample - missing from the labels file → ``ValueError``; duplicate ``sample_id``s - in the labels file → ``ValueError``. - * When ``sample_ids`` is unset, records are consumed in file order - and must equal the dataset length exactly. - - Returns ``None`` when ``data.labels.source`` is unset. - """ - import json - - import torch - - from raitap.data.data import SourceKind, _get_optional_config_value, get_source_path - - labels_cfg = _get_optional_config_value(cfg.data, "labels") - labels_source = _get_optional_config_value(labels_cfg, "source") - if not labels_source: - return None - - # ``get_source_path`` raises ValueError if the source can't be resolved - # or returns an existing path; no separate existence check needed. 
- labels_path = get_source_path(labels_source, kind=SourceKind.LABELS) - - with labels_path.open() as fh: - records = json.load(fh) - if not isinstance(records, list): - raise ValueError(f"Detection labels file {labels_path} must be a JSON array.") - - expected = len(tensor) - - if sample_ids is not None: - by_id: dict[str, dict[str, Any]] = {} - for index, record in enumerate(records): - record_id = record.get("sample_id") if isinstance(record, dict) else None - if record_id is None: - raise ValueError( - f"Detection labels record {index} is missing 'sample_id' " - "(required when the dataset exposes sample_ids)." - ) - if record_id in by_id: - raise ValueError( - f"Detection labels file contains duplicate sample_id {record_id!r}." - ) - by_id[record_id] = record - ordered_records = [] - missing: list[str] = [] - for sample_id in sample_ids: - record = by_id.get(sample_id) - if record is None: - missing.append(sample_id) - else: - ordered_records.append(record) - if missing: - raise ValueError( - f"Detection labels file is missing entries for sample_ids: {missing!r}." - ) - records_iter: list[dict[str, Any]] = ordered_records - else: - if len(records) != expected: - raise ValueError( - f"Detection labels file has {len(records)} records but the " - f"dataset has {expected} samples; provide sample_id fields and " - "set data.labels.source so records can be aligned by id, or " - "match the record count to the sample count." - ) - records_iter = records - - out: list[dict[str, torch.Tensor]] = [] - for index, record in enumerate(records_iter): - boxes_raw = record.get("boxes", []) - labels_raw = record.get("labels", []) - if len(boxes_raw) != len(labels_raw): - raise ValueError( - f"Sample index {index}: boxes and labels must have matching " - f"length (got {len(boxes_raw)} boxes vs {len(labels_raw)} labels)." 
- ) - boxes_tensor = ( - torch.tensor(boxes_raw, dtype=torch.float32) - if boxes_raw - else torch.zeros((0, 4), dtype=torch.float32) - ) - labels_tensor = ( - torch.tensor(labels_raw, dtype=torch.int64) - if labels_raw - else torch.zeros((0,), dtype=torch.int64) - ) - if boxes_tensor.ndim != 2 or boxes_tensor.shape[1] != 4: - raise ValueError( - f"Sample index {index}: boxes must be shape (M_i, 4); got " - f"{tuple(boxes_tensor.shape)}." - ) - out.append({"boxes": boxes_tensor, "labels": labels_tensor}) - - if len(out) != expected: - raise ValueError( - f"Detection labels alignment produced {len(out)} entries but the " - f"dataset has {expected} samples." - ) - - return out - def validate_labels(self, labels: Any) -> None: # The detection loader returns ``list[dict]`` or ``None``. A bare tensor # is a classification-shaped label set; disagreement means model and diff --git a/src/raitap/task_families/tests/test_base.py b/src/raitap/task_families/tests/test_base.py index 2df0e75a..78dcaa79 100644 --- a/src/raitap/task_families/tests/test_base.py +++ b/src/raitap/task_families/tests/test_base.py @@ -27,9 +27,6 @@ def adapt_loaded_inputs(self, tensor: object) -> object: def validate_inputs(self, tensor: object) -> None: pass - def load_labels(self, cfg: object, *, tensor: object, sample_ids: object) -> object: - pass - def validate_labels(self, labels: object) -> None: pass diff --git a/src/raitap/tests/test_api.py b/src/raitap/tests/test_api.py index 772e0d74..4f2d412c 100644 --- a/src/raitap/tests/test_api.py +++ b/src/raitap/tests/test_api.py @@ -23,10 +23,10 @@ from raitap.api import instantiate from raitap.configs.schema import ( DataConfig, - LabelsConfig, ModelConfig, MulticlassClassificationMetricsConfig, RobustnessConfig, + TabularLabelsConfig, TransparencyConfig, ) from raitap.data.preprocessing import resolve_preprocessing @@ -56,7 +56,7 @@ def _demo_app_config() -> AppConfig: name="imagenet_samples", source="imagenet_samples", forward_batch_size=4, - 
labels=LabelsConfig( + labels=TabularLabelsConfig( source="imagenet_samples", id_column="image", column="label", diff --git a/src/raitap/tests/test_e2e_detection.py b/src/raitap/tests/test_e2e_detection.py index 66d22414..9824f28e 100644 --- a/src/raitap/tests/test_e2e_detection.py +++ b/src/raitap/tests/test_e2e_detection.py @@ -97,19 +97,26 @@ def test_detection_pipeline_e2e_via_fasterrcnn_mobilenet(tmp_path: Path) -> None labels_path = tmp_path / "detection_labels.json" labels_path.write_text(json.dumps(labels_payload)) - # Bypass Data.__init__ and call DetectionFamily.load_labels directly; the + # Bypass Data.__init__ and call _resolve_and_parse_labels directly; the # detection label loader has its own dedicated coverage in # src/raitap/data/tests/test_detection_labels.py. This test focuses on the # pipeline plumbing downstream of Data. - from raitap.task_families.detection import DetectionFamily + from raitap.configs.schema import DetectionJsonLabelsConfig + from raitap.data.data import _resolve_and_parse_labels + from raitap.types import TaskKind - labels_cfg = SimpleNamespace(source=str(labels_path)) load_cfg = cast( "AppConfig", - SimpleNamespace(data=SimpleNamespace(labels=labels_cfg)), + SimpleNamespace( + data=SimpleNamespace( + labels=DetectionJsonLabelsConfig(source=str(labels_path)), + source=None, + ), + model=SimpleNamespace(class_names=None), + ), ) - data.labels = DetectionFamily().load_labels( - load_cfg, tensor=data.tensor, sample_ids=data.sample_ids + data.labels = _resolve_and_parse_labels( + load_cfg, task_kind=TaskKind.detection, tensor=data.tensor, sample_ids=data.sample_ids ) # --- app config -------------------------------------------------------- diff --git a/src/raitap/tests/test_example_recipes.py b/src/raitap/tests/test_example_recipes.py index 521cfb8e..a6c40a1d 100644 --- a/src/raitap/tests/test_example_recipes.py +++ b/src/raitap/tests/test_example_recipes.py @@ -28,7 +28,8 @@ pytest.importorskip("torchmetrics") # metrics adapter 
from raitap import AppConfig, Hardware, run -from raitap.data import DataConfig, LabelsConfig +from raitap.configs.schema import TabularLabelsConfig +from raitap.data import DataConfig from raitap.metrics import multiclass_classification as classification from raitap.models import ModelConfig from raitap.pipeline.outputs import RunOutputs @@ -60,7 +61,7 @@ def _base_kwargs(experiment_name: str) -> _BaseKwargs: name="imagenet_samples", source="imagenet_samples", forward_batch_size=4, - labels=LabelsConfig( + labels=TabularLabelsConfig( source="imagenet_samples", id_column="image", column="label",