MMIV-ML · skaliy · Feb 18, 2026 · Feb 18, 2026
diff --git a/fastMONAI/__init__.py b/fastMONAI/__init__.py
@@ -1 +1 @@
-__version__ = "0.8.1"
+__version__ = "0.8.2"
diff --git a/fastMONAI/dataset_info.py b/fastMONAI/dataset_info.py
@@ -554,11 +554,11 @@ def round_to_divisor(val, div):
 def preprocess_dataset(df, img_col, mask_col=None, output_dir='preprocessed',
                        target_spacing=None, apply_reorder=True, transforms=None,
                        max_workers=4, skip_existing=True):
-    """Preprocess dataset to disk and update DataFrame path columns in-place.
+    """Preprocess dataset to disk, creating new columns for preprocessed paths.
 
     Processes images (and optionally masks) through a transform pipeline,
-    saves to output_dir, then updates df[img_col] and df[mask_col] in-place
-    to point to the preprocessed files.
+    saves to output_dir, then adds new '{col}_preprocessed' path columns to
+    df in-place. The original img_col/mask_col columns are left unchanged.
 
     Transform pipeline order:
         CopyAffine (if masks) -> ToCanonical (if apply_reorder)
@@ -678,10 +678,11 @@ def _process_case(item):
                     failed_cases.append(Path(item['img_path']).name)
                     warnings.warn(f"Failed to process {item['img_path']}: {e}")
 
-    # Update DataFrame in-place
-    df[img_col] = [str(img_dir / Path(p).name) for p in df[img_col]]
+    # Create new columns for preprocessed paths (preserve originals)
+    df[f'{img_col}_preprocessed'] = [str(img_dir / Path(p).name) for p in df[img_col]]
+
     if mask_col is not None:
-        df[mask_col] = [str(mask_dir / Path(p).name) for p in df[mask_col]]
+        df[f'{mask_col}_preprocessed'] = [str(mask_dir / Path(p).name) for p in df[mask_col]]
 
     print(f"Preprocessing complete: {processed} processed, {skipped} skipped, {failed} failed")
     if failed_cases:

diff --git a/fastMONAI/utils.py b/fastMONAI/utils.py
@@ -237,6 +237,7 @@ def _extract_patch_config(learn) -> dict:
             'aggregation_mode': patch_config.aggregation_mode,
             'padding_mode': patch_config.padding_mode,
             'keep_largest_component': patch_config.keep_largest_component,
+            'preprocessed': patch_config.preprocessed,
         }
     else:
         config['patch_config'] = None

diff --git a/fastMONAI/vision_patch.py b/fastMONAI/vision_patch.py
@@ -116,6 +116,11 @@ class PatchConfig:
             training and inference. Defaults to True (the common case).
         target_spacing: Target voxel spacing [x, y, z] for resampling. Must match between
             training and inference.
+        preprocessed: If True, data has been preprocessed externally (e.g., via
+            preprocess_dataset()). Training will skip reorder, resample, AND
+            pre_patch_tfms (e.g., normalization) since they were already applied.
+            Inference is unaffected and always applies pre_inference_tfms to raw
+            images. Defaults to False.
         padding_mode: Padding mode for CropOrPad when image < patch_size. Default is 0 (zero padding)
             to align with nnU-Net's approach. Can be int, float, or string (e.g., 'minimum', 'mean').
         keep_largest_component: If True, keep only the largest connected component
@@ -142,6 +147,7 @@ class PatchConfig:
     # Preprocessing parameters - must match between training and inference
     apply_reorder: bool = True  # Defaults to True (the common case)
     target_spacing: list = None
+    preprocessed: bool = False  # True = data already preprocessed, skip all preprocessing during training
     padding_mode: int | float | str = 0  # Zero padding (nnU-Net standard)
     # Post-processing (binary segmentation only)
     keep_largest_component: bool = False
@@ -653,6 +659,8 @@ def from_df(
             pre_patch_tfms: TorchIO transforms applied before patch extraction
                            (after reorder/resample). Example: [tio.ZNormalization()].
                            Accepts both fastMONAI wrappers and raw TorchIO transforms.
+                           Skipped when patch_config.preprocessed=True; pass them to
+                           preprocess_dataset(transforms=...) instead. Inference still needs them (pre_inference_tfms).
             patch_tfms: TorchIO transforms applied to extracted patches (training only).
                 Mutually exclusive with gpu_augmentation.
             gpu_augmentation: GpuPatchAugmentation instance for GPU-batched augmentation
@@ -725,17 +733,19 @@ def from_df(
         # Build preprocessing transforms
         all_pre_tfms = []
 
-        # Add reorder transform (reorder to RAS+ orientation)
-        if _apply_reorder:
-            all_pre_tfms.append(tio.ToCanonical())
+        # Skip all preprocessing if data was already preprocessed externally
+        if not patch_config.preprocessed:
+            # Add reorder transform (reorder to RAS+ orientation)
+            if _apply_reorder:
+                all_pre_tfms.append(tio.ToCanonical())
 
-        # Add resample transform
-        if _target_spacing is not None:
-            all_pre_tfms.append(tio.Resample(_target_spacing))
+            # Add resample transform
+            if _target_spacing is not None:
+                all_pre_tfms.append(tio.Resample(_target_spacing))
 
-        # Add user-provided transforms (normalize to raw TorchIO transforms)
-        if pre_patch_tfms:
-            all_pre_tfms.extend(normalize_patch_transforms(pre_patch_tfms))
+            # Add user-provided transforms (normalize to raw TorchIO transforms)
+            if pre_patch_tfms:
+                all_pre_tfms.extend(normalize_patch_transforms(pre_patch_tfms))
 
         # Create subjects datasets with lazy loading (paths only, ~0 MB)
         train_subjects = create_subjects_dataset(

diff --git a/nbs/07_utils.ipynb b/nbs/07_utils.ipynb
@@ -199,7 +199,7 @@
    "id": "czquspt567w",
    "metadata": {},
    "outputs": [],
-   "source": "#| export\ndef _detect_patch_workflow(dls) -> bool:\n    \"\"\"Detect if DataLoaders are patch-based (MedPatchDataLoaders).\n    \n    Args:\n        dls: DataLoaders instance\n        \n    Returns:\n        True if dls is a MedPatchDataLoaders instance\n    \"\"\"\n    return hasattr(dls, 'patch_config') or hasattr(dls, '_patch_config')\n\n\ndef _extract_size_from_transforms(tfms) -> list | None:\n    \"\"\"Extract target size from PadOrCrop transform if present.\n    \n    Args:\n        tfms: List of transforms\n        \n    Returns:\n        Target size as list, or None if not found\n    \"\"\"\n    if tfms is None:\n        return None\n    for tfm in tfms:\n        if hasattr(tfm, 'pad_or_crop') and hasattr(tfm.pad_or_crop, 'target_shape'):\n            return list(tfm.pad_or_crop.target_shape)\n    return None\n\n\ndef _extract_standard_config(learn) -> dict:\n    \"\"\"Extract config from standard MedDataBlock workflow.\n    \n    Args:\n        learn: fastai Learner instance\n        \n    Returns:\n        Dictionary with extracted configuration\n    \"\"\"\n    from fastMONAI.vision_core import MedBase\n    dls = learn.dls\n\n    # Get preprocessing from MedBase class attributes\n    apply_reorder = MedBase.apply_reorder\n    target_spacing = MedBase.target_spacing\n\n    # Extract item_tfms from DataLoaders pipeline\n    item_tfms = []\n    if hasattr(dls, 'after_item') and dls.after_item:\n        item_tfms = list(dls.after_item.fs)\n\n    # Extract size from PadOrCrop transform\n    size = _extract_size_from_transforms(item_tfms)\n\n    return {\n        'apply_reorder': apply_reorder,\n        'target_spacing': target_spacing,\n        'size': size,\n        'item_tfms': item_tfms,\n        'batch_size': dls.bs,\n        'patch_config': None,\n    }\n\n\ndef _extract_patch_config(learn) -> dict:\n    \"\"\"Extract config from MedPatchDataLoaders workflow.\n    \n    Args:\n        learn: fastai Learner instance\n        \n    
Returns:\n        Dictionary with extracted configuration including patch-specific params\n    \"\"\"\n    dls = learn.dls\n    patch_config = getattr(dls, '_patch_config', None) or getattr(dls, 'patch_config', None)\n\n    config = {\n        'apply_reorder': getattr(dls, '_apply_reorder', patch_config.apply_reorder if patch_config else False),\n        'target_spacing': getattr(dls, '_target_spacing', patch_config.target_spacing if patch_config else None),\n        'size': patch_config.patch_size if patch_config else None,\n        'item_tfms': getattr(dls, '_pre_patch_tfms', []) or [],\n        'batch_size': dls.bs,\n    }\n\n    # Add patch-specific params for logging\n    if patch_config:\n        config['patch_config'] = {\n            'patch_size': patch_config.patch_size,\n            'patch_overlap': patch_config.patch_overlap,\n            'samples_per_volume': patch_config.samples_per_volume,\n            'sampler_type': patch_config.sampler_type,\n            'label_probabilities': str(patch_config.label_probabilities) if patch_config.label_probabilities else None,\n            'queue_length': patch_config.queue_length,\n            'aggregation_mode': patch_config.aggregation_mode,\n            'padding_mode': patch_config.padding_mode,\n            'keep_largest_component': patch_config.keep_largest_component,\n        }\n    else:\n        config['patch_config'] = None\n\n    return config\n\n\ndef _extract_loss_name(learn) -> str:\n    \"\"\"Extract loss function name from Learner.\n    \n    Args:\n        learn: fastai Learner instance\n        \n    Returns:\n        Name of the loss function\n    \"\"\"\n    loss_func = learn.loss_func\n    # Handle CustomLoss wrapper\n    if hasattr(loss_func, 'loss_func'):\n        inner = loss_func.loss_func\n        return inner._get_name() if hasattr(inner, '_get_name') else inner.__class__.__name__\n    return loss_func._get_name() if hasattr(loss_func, '_get_name') else 
loss_func.__class__.__name__\n\n\ndef _extract_model_name(learn) -> str:\n    \"\"\"Extract model architecture name from Learner.\n    \n    Args:\n        learn: fastai Learner instance\n        \n    Returns:\n        Name of the model architecture\n    \"\"\"\n    model = learn.model\n    return model._get_name() if hasattr(model, '_get_name') else model.__class__.__name__"
+   "source": "#| export\ndef _detect_patch_workflow(dls) -> bool:\n    \"\"\"Detect if DataLoaders are patch-based (MedPatchDataLoaders).\n    \n    Args:\n        dls: DataLoaders instance\n        \n    Returns:\n        True if dls is a MedPatchDataLoaders instance\n    \"\"\"\n    return hasattr(dls, 'patch_config') or hasattr(dls, '_patch_config')\n\n\ndef _extract_size_from_transforms(tfms) -> list | None:\n    \"\"\"Extract target size from PadOrCrop transform if present.\n    \n    Args:\n        tfms: List of transforms\n        \n    Returns:\n        Target size as list, or None if not found\n    \"\"\"\n    if tfms is None:\n        return None\n    for tfm in tfms:\n        if hasattr(tfm, 'pad_or_crop') and hasattr(tfm.pad_or_crop, 'target_shape'):\n            return list(tfm.pad_or_crop.target_shape)\n    return None\n\n\ndef _extract_standard_config(learn) -> dict:\n    \"\"\"Extract config from standard MedDataBlock workflow.\n    \n    Args:\n        learn: fastai Learner instance\n        \n    Returns:\n        Dictionary with extracted configuration\n    \"\"\"\n    from fastMONAI.vision_core import MedBase\n    dls = learn.dls\n\n    # Get preprocessing from MedBase class attributes\n    apply_reorder = MedBase.apply_reorder\n    target_spacing = MedBase.target_spacing\n\n    # Extract item_tfms from DataLoaders pipeline\n    item_tfms = []\n    if hasattr(dls, 'after_item') and dls.after_item:\n        item_tfms = list(dls.after_item.fs)\n\n    # Extract size from PadOrCrop transform\n    size = _extract_size_from_transforms(item_tfms)\n\n    return {\n        'apply_reorder': apply_reorder,\n        'target_spacing': target_spacing,\n        'size': size,\n        'item_tfms': item_tfms,\n        'batch_size': dls.bs,\n        'patch_config': None,\n    }\n\n\ndef _extract_patch_config(learn) -> dict:\n    \"\"\"Extract config from MedPatchDataLoaders workflow.\n    \n    Args:\n        learn: fastai Learner instance\n        \n    
Returns:\n        Dictionary with extracted configuration including patch-specific params\n    \"\"\"\n    dls = learn.dls\n    patch_config = getattr(dls, '_patch_config', None) or getattr(dls, 'patch_config', None)\n\n    config = {\n        'apply_reorder': getattr(dls, '_apply_reorder', patch_config.apply_reorder if patch_config else False),\n        'target_spacing': getattr(dls, '_target_spacing', patch_config.target_spacing if patch_config else None),\n        'size': patch_config.patch_size if patch_config else None,\n        'item_tfms': getattr(dls, '_pre_patch_tfms', []) or [],\n        'batch_size': dls.bs,\n    }\n\n    # Add patch-specific params for logging\n    if patch_config:\n        config['patch_config'] = {\n            'patch_size': patch_config.patch_size,\n            'patch_overlap': patch_config.patch_overlap,\n            'samples_per_volume': patch_config.samples_per_volume,\n            'sampler_type': patch_config.sampler_type,\n            'label_probabilities': str(patch_config.label_probabilities) if patch_config.label_probabilities else None,\n            'queue_length': patch_config.queue_length,\n            'aggregation_mode': patch_config.aggregation_mode,\n            'padding_mode': patch_config.padding_mode,\n            'keep_largest_component': patch_config.keep_largest_component,\n            'preprocessed': patch_config.preprocessed,\n        }\n    else:\n        config['patch_config'] = None\n\n    return config\n\n\ndef _extract_loss_name(learn) -> str:\n    \"\"\"Extract loss function name from Learner.\n    \n    Args:\n        learn: fastai Learner instance\n        \n    Returns:\n        Name of the loss function\n    \"\"\"\n    loss_func = learn.loss_func\n    # Handle CustomLoss wrapper\n    if hasattr(loss_func, 'loss_func'):\n        inner = loss_func.loss_func\n        return inner._get_name() if hasattr(inner, '_get_name') else inner.__class__.__name__\n    return loss_func._get_name() if 
hasattr(loss_func, '_get_name') else loss_func.__class__.__name__\n\n\ndef _extract_model_name(learn) -> str:\n    \"\"\"Extract model architecture name from Learner.\n    \n    Args:\n        learn: fastai Learner instance\n        \n    Returns:\n        Name of the model architecture\n    \"\"\"\n    model = learn.model\n    return model._get_name() if hasattr(model, '_get_name') else model.__class__.__name__"
   },
   {
    "cell_type": "code",