From d6ba7155901ca2c248c2ac95ec24d298c3dbcfa6 Mon Sep 17 00:00:00 2001
From: ayaanmustafa <ayaanmustafa07dpsn@gmail.com>
Date: Thu, 11 Jun 2026 03:09:11 +0530
Subject: [PATCH 1/3] fix(autobackend): handle non-leaf parameters in fused
 checkpoints

PyTorch forbids changing requires_grad on non-leaf tensors. When a model
loaded with fuse=True exposes non-leaf parameters (e.g. from an
already-fused checkpoint), the old code crashed with:
RuntimeError: you can only change requires_grad flags of leaf variables.

Fix: set requires_grad=False only on leaf parameters; for non-leaf
parameters, leave the flag alone and reassign p.data to p.data.detach().
---
 doclayout_yolo/nn/autobackend.py | 153 ++++++++++++++++---------------
 1 file changed, 79 insertions(+), 74 deletions(-)

diff --git a/doclayout_yolo/nn/autobackend.py b/doclayout_yolo/nn/autobackend.py
index 6b5eeef..6bc6464 100644
--- a/doclayout_yolo/nn/autobackend.py
+++ b/doclayout_yolo/nn/autobackend.py
@@ -18,6 +18,7 @@
 from doclayout_yolo.utils.checks import check_requirements, check_suffix, check_version, check_yaml
 from doclayout_yolo.utils.downloads import attempt_download_asset, is_url
 
+
 def check_class_names(names):
     """
     Check class names.
@@ -56,22 +57,22 @@ class AutoBackend(nn.Module):
     The AutoBackend class is designed to provide an abstraction layer for various inference engines. It supports a wide
     range of formats, each with specific naming conventions as outlined below:
 
-        Supported Formats and Naming Conventions:
-            | Format                | File Suffix      |
-            |-----------------------|------------------|
-            | PyTorch               | *.pt             |
-            | TorchScript           | *.torchscript    |
-            | ONNX Runtime          | *.onnx           |
-            | ONNX OpenCV DNN       | *.onnx (dnn=True)|
-            | OpenVINO              | *openvino_model/ |
-            | CoreML                | *.mlpackage      |
-            | TensorRT              | *.engine         |
-            | TensorFlow SavedModel | *_saved_model    |
-            | TensorFlow GraphDef   | *.pb             |
-            | TensorFlow Lite       | *.tflite         |
-            | TensorFlow Edge TPU   | *_edgetpu.tflite |
-            | PaddlePaddle          | *_paddle_model   |
-            | NCNN                  | *_ncnn_model     |
+    Supported Formats and Naming Conventions:
+    | Format                 | File Suffix       |
+    |------------------------|-------------------|
+    | PyTorch                | *.pt              |
+    | TorchScript            | *.torchscript     |
+    | ONNX Runtime           | *.onnx            |
+    | ONNX OpenCV DNN        | *.onnx (dnn=True) |
+    | OpenVINO               | *openvino_model/  |
+    | CoreML                 | *.mlpackage       |
+    | TensorRT               | *.engine          |
+    | TensorFlow SavedModel  | *_saved_model     |
+    | TensorFlow GraphDef    | *.pb              |
+    | TensorFlow Lite        | *.tflite          |
+    | TensorFlow Edge TPU    | *_edgetpu.tflite  |
+    | PaddlePaddle           | *_paddle_model    |
+    | NCNN                   | *_ncnn_model      |
 
     This class offers dynamic backend switching capabilities based on the input model format, making it easier to deploy
     models across various platforms.
@@ -220,8 +221,8 @@ def __init__(
             except ImportError:
                 if LINUX:
                     check_requirements("nvidia-tensorrt", cmds="-U --index-url https://pypi.ngc.nvidia.com")
-                import tensorrt as trt  # noqa
-            check_version(trt.__version__, "7.0.0", hard=True)  # require tensorrt>=7.0.0
+                import tensorrt as trt  # noqa
+            check_version(trt.__version__, "7.0.0", hard=True)  # require tensorrt>=7.0.0
             if device.type == "cpu":
                 device = torch.device("cuda:0")
             Binding = namedtuple("Binding", ("name", "dtype", "shape", "data", "ptr"))
@@ -231,27 +232,27 @@ def __init__(
                 meta_len = int.from_bytes(f.read(4), byteorder="little")  # read metadata length
                 metadata = json.loads(f.read(meta_len).decode("utf-8"))  # read metadata
                 model = runtime.deserialize_cuda_engine(f.read())  # read engine
-            context = model.create_execution_context()
-            bindings = OrderedDict()
-            output_names = []
-            fp16 = False  # default updated below
-            dynamic = False
-            for i in range(model.num_bindings):
-                name = model.get_binding_name(i)
-                dtype = trt.nptype(model.get_binding_dtype(i))
-                if model.binding_is_input(i):
-                    if -1 in tuple(model.get_binding_shape(i)):  # dynamic
-                        dynamic = True
-                        context.set_binding_shape(i, tuple(model.get_profile_shape(0, i)[2]))
-                    if dtype == np.float16:
-                        fp16 = True
-                else:  # output
-                    output_names.append(name)
-                shape = tuple(context.get_binding_shape(i))
-                im = torch.from_numpy(np.empty(shape, dtype=dtype)).to(device)
-                bindings[name] = Binding(name, dtype, shape, im, int(im.data_ptr()))
-            binding_addrs = OrderedDict((n, d.ptr) for n, d in bindings.items())
-            batch_size = bindings["images"].shape[0]  # if dynamic, this is instead max batch size
+                context = model.create_execution_context()
+                bindings = OrderedDict()
+                output_names = []
+                fp16 = False  # default updated below
+                dynamic = False
+                for i in range(model.num_bindings):
+                    name = model.get_binding_name(i)
+                    dtype = trt.nptype(model.get_binding_dtype(i))
+                    if model.binding_is_input(i):
+                        if -1 in tuple(model.get_binding_shape(i)):  # dynamic
+                            dynamic = True
+                            context.set_binding_shape(i, tuple(model.get_profile_shape(0, i)[2]))
+                        if dtype == np.float16:
+                            fp16 = True
+                    else:  # output
+                        output_names.append(name)
+                    shape = tuple(context.get_binding_shape(i))
+                    im = torch.from_numpy(np.empty(shape, dtype=dtype)).to(device)
+                    bindings[name] = Binding(name, dtype, shape, im, int(im.data_ptr()))
+                binding_addrs = OrderedDict((n, d.ptr) for n, d in bindings.items())
+                batch_size = bindings["images"].shape[0]  # if dynamic, this is instead max batch size
 
         # CoreML
         elif coreml:
@@ -363,7 +364,8 @@ def wrap_frozen_graph(gd, inputs, outputs):
 
             raise TypeError(
                 f"model='{w}' is not a supported model format. "
-                f"See https://docs.doclayout_yolo.com/modes/predict for help.\n\n{export_formats()}"
+                f"See https://docs.doclayout_yolo.com/modes/predict for help.\n\n"
+                f"{export_formats()}"
             )
 
         # Load external metadata YAML
@@ -387,12 +389,15 @@ def wrap_frozen_graph(gd, inputs, outputs):
         # Check names
         if "names" not in locals():  # names missing
             names = default_class_names(data)
-        names = check_class_names(names)
+        names = check_class_names(names)
 
-        # Disable gradients
+        # Disable gradients — FIX: handle non-leaf parameters from fused checkpoints
         if pt:
             for p in model.parameters():
-                p.requires_grad = False
+                if not p.is_leaf:
+                    p.data = p.data.detach()
+                else:
+                    p.requires_grad = False
 
         self.__dict__.update(locals())  # assign all variables to self
 
@@ -484,11 +489,11 @@ def callback(request, userdata):
                     "Ultralytics only supports inference of non-pipelined CoreML models exported with "
                     f"'nms=False', but 'model={w}' has an NMS pipeline created by an 'nms=True' export."
                 )
-                # TODO: CoreML NMS inference handling
-                # from doclayout_yolo.utils.ops import xywh2xyxy
-                # box = xywh2xyxy(y['coordinates'] * [[w, h, w, h]])  # xyxy pixels
-                # conf, cls = y['confidence'].max(1), y['confidence'].argmax(1).astype(np.float32)
-                # y = np.concatenate((box, conf.reshape(-1, 1), cls.reshape(-1, 1)), 1)
+            # TODO: CoreML NMS inference handling
+            # from doclayout_yolo.utils.ops import xywh2xyxy
+            # box = xywh2xyxy(y['coordinates'] * [[w, h, w, h]])  # xyxy pixels
+            # conf, cls = y['confidence'].max(1), y['confidence'].argmax(1).astype(np.float32)
+            # y = np.concatenate((box, conf.reshape(-1, 1), cls.reshape(-1, 1)), 1)
             elif len(y) == 1:  # classification model
                 y = list(y.values())
             elif len(y) == 2:  # segmentation model
@@ -526,32 +531,32 @@ def callback(request, userdata):
                     ip, ib = (0, 1) if len(y[0].shape) == 4 else (1, 0)  # index of protos, boxes
                     nc = y[ib].shape[1] - y[ip].shape[3] - 4  # y = (1, 160, 160, 32), (1, 116, 8400)
                     self.names = {i: f"class{i}" for i in range(nc)}
-            else:  # Lite or Edge TPU
-                details = self.input_details[0]
-                integer = details["dtype"] in (np.int8, np.int16)  # is TFLite quantized int8 or int16 model
-                if integer:
-                    scale, zero_point = details["quantization"]
-                    im = (im / scale + zero_point).astype(details["dtype"])  # de-scale
-                self.interpreter.set_tensor(details["index"], im)
-                self.interpreter.invoke()
-                y = []
-                for output in self.output_details:
-                    x = self.interpreter.get_tensor(output["index"])
+            else:  # Lite or Edge TPU
+                details = self.input_details[0]
+                integer = details["dtype"] in (np.int8, np.int16)  # is TFLite quantized int8 or int16 model
+                if integer:
+                    scale, zero_point = details["quantization"]
+                    im = (im / scale + zero_point).astype(details["dtype"])  # de-scale
+                self.interpreter.set_tensor(details["index"], im)
+                self.interpreter.invoke()
+                y = []
+                for output in self.output_details:
+                    x = self.interpreter.get_tensor(output["index"])
                     if integer:
-                        scale, zero_point = output["quantization"]
-                        x = (x.astype(np.float32) - zero_point) * scale  # re-scale
-                    if x.ndim > 2:  # if task is not classification
-                        # Denormalize xywh by image size. See https://github.com/doclayout_yolo/doclayout_yolo/pull/1695
-                        # xywh are normalized in TFLite/EdgeTPU to mitigate quantization error of integer models
-                        x[:, [0, 2]] *= w
-                        x[:, [1, 3]] *= h
-                    y.append(x)
-            # TF segment fixes: export is reversed vs ONNX export and protos are transposed
-            if len(y) == 2:  # segment with (det, proto) output order reversed
-                if len(y[1].shape) != 4:
-                    y = list(reversed(y))  # should be y = (1, 116, 8400), (1, 160, 160, 32)
-                y[1] = np.transpose(y[1], (0, 3, 1, 2))  # should be y = (1, 116, 8400), (1, 32, 160, 160)
-            y = [x if isinstance(x, np.ndarray) else x.numpy() for x in y]
+                        scale, zero_point = output["quantization"]
+                        x = (x.astype(np.float32) - zero_point) * scale  # re-scale
+                    if x.ndim > 2:  # if task is not classification
+                        # Denormalize xywh by image size. See https://github.com/doclayout_yolo/doclayout_yolo/pull/1695
+                        # xywh are normalized in TFLite/EdgeTPU to mitigate quantization error of integer models
+                        x[:, [0, 2]] *= w
+                        x[:, [1, 3]] *= h
+                    y.append(x)
+            # TF segment fixes: export is reversed vs ONNX export and protos are transposed
+            if len(y) == 2:  # segment with (det, proto) output order reversed
+                if len(y[1].shape) != 4:
+                    y = list(reversed(y))  # should be y = (1, 116, 8400), (1, 160, 160, 32)
+                y[1] = np.transpose(y[1], (0, 3, 1, 2))  # should be y = (1, 116, 8400), (1, 32, 160, 160)
+            y = [x if isinstance(x, np.ndarray) else x.numpy() for x in y]
 
         # for x in y:
         #     print(type(x), len(x)) if isinstance(x, (list, tuple)) else print(type(x), x.shape)  # debug shapes
@@ -615,4 +620,4 @@ def _model_type(p="path/to/model.pt"):
             url = urlsplit(p)
             triton = bool(url.netloc) and bool(url.path) and url.scheme in {"http", "grpc"}
 
-        return types + [triton]
+        return types + [triton]
\ No newline at end of file

From 2c556da9562751251846331c672fc40c471ed7df Mon Sep 17 00:00:00 2001
From: ayaanmustafa <ayaanmustafa07dpsn@gmail.com>
Date: Thu, 11 Jun 2026 03:09:34 +0530
Subject: [PATCH 2/3] feat(fuse): add fuse_custom() to all block modules + CPU
 thread config

- Add fuse_custom() to 20+ block modules (SPP, SPPF, C1-C3, C2f,
  Bottleneck, BottleneckCSP, ResNetBlock, RepC3, RepCSP, RepNCSPELAN4,
  ADown, SPPELAN, HGStem, HGBlock, Proto, GhostBottleneck, RepBottleneck,
  C2fAttn, Attention, PSA, SCDown, CIB, C2fCIB, G2L_CRM, DilatedBlock,
  DilatedBottleneck)
- Optimize CBFuse: use sum() instead of torch.sum(torch.stack())
- Optimize DilatedBlock: handle fused checkpoints (nn.Identity bn)
- Fix RepVGGDW.forward to support fused state (conv1 deleted)
- Fix MaxSigmoidAttnBlock.ec type (was Conv, should be int)
- Add threads config field to default.yaml for CPU inference tuning
---
 doclayout_yolo/cfg/default.yaml       |   1 +
 doclayout_yolo/nn/modules/__init__.py |   2 +
 doclayout_yolo/nn/modules/block.py    | 242 ++++++++++++++++++++++++--
 doclayout_yolo/nn/modules/conv.py     |  45 ++++-
 doclayout_yolo/nn/modules/g2l_crm.py  |  74 ++++++--
 5 files changed, 333 insertions(+), 31 deletions(-)

diff --git a/doclayout_yolo/cfg/default.yaml b/doclayout_yolo/cfg/default.yaml
index 19df0f8..432a7e3 100644
--- a/doclayout_yolo/cfg/default.yaml
+++ b/doclayout_yolo/cfg/default.yaml
@@ -18,6 +18,7 @@ val_period: 1 # (int) Validation every x epochs
 cache: False # (bool) True/ram, disk or False. Use cache for data loading
 device: # (int | str | list, optional) device to run on, i.e. cuda device=0 or device=0,1,2,3 or device=cpu
 workers: 8 # (int) number of worker threads for data loading (per RANK if DDP)
+threads: # (int, optional) number of threads for PyTorch CPU inference
 project: # (str, optional) project name
 name: # (str, optional) experiment name, results saved to 'project/name' directory
 exist_ok: True # (bool) whether to overwrite existing experiment
diff --git a/doclayout_yolo/nn/modules/__init__.py b/doclayout_yolo/nn/modules/__init__.py
index ad19bc9..a6f1618 100644
--- a/doclayout_yolo/nn/modules/__init__.py
+++ b/doclayout_yolo/nn/modules/__init__.py
@@ -48,6 +48,7 @@
     CBLinear,
     Silence,
     PSA,
+    CIB,
     C2fCIB,
     SCDown,
     RepVGGDW,
@@ -141,6 +142,7 @@
     "CBLinear",
     "Silence",
     "PSA",
+    "CIB",
     "C2fCIB",
     "SCDown",
     "RepVGGDW",
diff --git a/doclayout_yolo/nn/modules/block.py b/doclayout_yolo/nn/modules/block.py
index 9c17929..056b0c0 100644
--- a/doclayout_yolo/nn/modules/block.py
+++ b/doclayout_yolo/nn/modules/block.py
@@ -43,7 +43,6 @@
 )
 
 
-
 class DFL(nn.Module):
     """
     Integral module of Distribution Focal Loss (DFL).
@@ -85,6 +84,12 @@ def forward(self, x):
         """Performs a forward pass through layers using an upsampled input image."""
         return self.cv3(self.cv2(self.upsample(self.cv1(x))))
 
+    def fuse_custom(self):
+        for m in [self.cv1, self.cv2, self.cv3]:
+            if hasattr(m, "fuse_custom") and not getattr(m, "fused", False):
+                m.fuse_custom()
+        return self
+
 
 class HGStem(nn.Module):
     """
@@ -116,6 +121,12 @@ def forward(self, x):
         x = self.stem4(x)
         return x
 
+    def fuse_custom(self):
+        for m in [self.stem1, self.stem2a, self.stem2b, self.stem3, self.stem4]:
+            if hasattr(m, "fuse_custom") and not getattr(m, "fused", False):
+                m.fuse_custom()
+        return self
+
 
 class HGBlock(nn.Module):
     """
@@ -140,6 +151,15 @@ def forward(self, x):
         y = self.ec(self.sc(torch.cat(y, 1)))
         return y + x if self.add else y
 
+    def fuse_custom(self):
+        for m in self.m:
+            if hasattr(m, "fuse_custom") and not getattr(m, "fused", False):
+                m.fuse_custom()
+        for m in [self.sc, self.ec]:
+            if hasattr(m, "fuse_custom") and not getattr(m, "fused", False):
+                m.fuse_custom()
+        return self
+
 
 class SPP(nn.Module):
     """Spatial Pyramid Pooling (SPP) layer https://arxiv.org/abs/1406.4729."""
@@ -157,6 +177,13 @@ def forward(self, x):
         x = self.cv1(x)
         return self.cv2(torch.cat([x] + [m(x) for m in self.m], 1))
 
+    def fuse_custom(self):
+        for m in [self.cv1, self.cv2]:
+            if hasattr(m, "fuse_custom") and not getattr(m, "fused", False):
+                m.fuse_custom()
+        return self
+
+
 class SPPF(nn.Module):
     """Spatial Pyramid Pooling - Fast (SPPF) layer for YOLOv5 by Glenn Jocher."""
 
@@ -179,6 +206,12 @@ def forward(self, x):
         y2 = self.m(y1)
         return self.cv2(torch.cat((x, y1, y2, self.m(y2)), 1))
 
+    def fuse_custom(self):
+        for m in [self.cv1, self.cv2]:
+            if hasattr(m, "fuse_custom") and not getattr(m, "fused", False):
+                m.fuse_custom()
+        return self
+
 
 class C1(nn.Module):
     """CSP Bottleneck with 1 convolution."""
@@ -194,6 +227,12 @@ def forward(self, x):
         y = self.cv1(x)
         return self.m(y) + y
 
+    def fuse_custom(self):
+        for m in [self.cv1] + list(self.m):
+            if hasattr(m, "fuse_custom") and not getattr(m, "fused", False):
+                m.fuse_custom()
+        return self
+
 
 class C2(nn.Module):
     """CSP Bottleneck with 2 convolutions."""
@@ -213,7 +252,17 @@ def forward(self, x):
         """Forward pass through the CSP bottleneck with 2 convolutions."""
         a, b = self.cv1(x).chunk(2, 1)
         return self.cv2(torch.cat((self.m(a), b), 1))
-    
+
+    def fuse_custom(self):
+        for m in [self.cv1, self.cv2]:
+            if hasattr(m, "fuse_custom") and not getattr(m, "fused", False):
+                m.fuse_custom()
+        for m in self.m:
+            if hasattr(m, "fuse_custom"):
+                m.fuse_custom()
+        return self
+
+
 class C2f(nn.Module):
     """Faster Implementation of CSP Bottleneck with 2 convolutions."""
 
@@ -233,6 +282,15 @@ def forward(self, x):
         y.extend(m(y[-1]) for m in self.m)
         return self.cv2(torch.cat(y, 1))
 
+    def fuse_custom(self):
+        for m in [self.cv1, self.cv2]:
+            if hasattr(m, "fuse_custom") and not getattr(m, "fused", False):
+                m.fuse_custom()
+        for m in self.m:
+            if hasattr(m, "fuse_custom"):
+                m.fuse_custom()
+        return self
+
     def forward_split(self, x):
         """Forward pass using split() instead of chunk()."""
         y = list(self.cv1(x).split((self.c, self.c), 1))
@@ -256,6 +314,15 @@ def forward(self, x):
         """Forward pass through the CSP bottleneck with 2 convolutions."""
         return self.cv3(torch.cat((self.m(self.cv1(x)), self.cv2(x)), 1))
 
+    def fuse_custom(self):
+        for m in [self.cv1, self.cv2, self.cv3]:
+            if hasattr(m, "fuse_custom") and not getattr(m, "fused", False):
+                m.fuse_custom()
+        for m in self.m:
+            if hasattr(m, "fuse_custom"):
+                m.fuse_custom()
+        return self
+
 
 class C3x(C3):
     """C3 module with cross-convolutions."""
@@ -283,6 +350,15 @@ def forward(self, x):
         """Forward pass of RT-DETR neck layer."""
         return self.cv3(self.m(self.cv1(x)) + self.cv2(x))
 
+    def fuse_custom(self):
+        for m in [self.cv1, self.cv2, self.cv3]:
+            if hasattr(m, "fuse_custom") and not getattr(m, "fused", False):
+                m.fuse_custom()
+        for m in self.m:
+            if hasattr(m, "fuse_custom") and not getattr(m, "fused", False):
+                m.fuse_custom()
+        return self
+
 
 class C3TR(C3):
     """C3 module with TransformerBlock()."""
@@ -324,6 +400,17 @@ def forward(self, x):
         """Applies skip connection and concatenation to input tensor."""
         return self.conv(x) + self.shortcut(x)
 
+    def fuse_custom(self):
+        """Fuse the fusable children of `conv` and `shortcut` in place; returns self."""
+        # Walk each branch's children instead of testing the container itself: a plain
+        # container (e.g. nn.Sequential) defines no fuse_custom() of its own.
+        for branch in (self.conv, self.shortcut):
+            for m in branch.children():
+                if hasattr(m, "fuse_custom") and not getattr(m, "fused", False):
+                    m.fuse_custom()
+        return self
+
+
 class Bottleneck(nn.Module):
     """Standard bottleneck."""
 
@@ -340,7 +427,13 @@ def __init__(self, c1, c2, shortcut=True, g=1, k=(3, 3), e=0.5):
     def forward(self, x):
         """'forward()' applies the YOLO FPN to input data."""
         return x + self.cv2(self.cv1(x)) if self.add else self.cv2(self.cv1(x))
-    
+
+    def fuse_custom(self):
+        for m in [self.cv1, self.cv2]:
+            if hasattr(m, "fuse_custom") and not getattr(m, "fused", False):
+                m.fuse_custom()
+        return self
+
 
 class BottleneckCSP(nn.Module):
     """CSP Bottleneck https://github.com/WongKinYiu/CrossStagePartialNetworks."""
@@ -363,6 +456,15 @@ def forward(self, x):
         y2 = self.cv2(x)
         return self.cv4(self.act(self.bn(torch.cat((y1, y2), 1))))
 
+    def fuse_custom(self):
+        for m in [self.cv1, self.cv4]:
+            if hasattr(m, "fuse_custom") and not getattr(m, "fused", False):
+                m.fuse_custom()
+        for m in self.m:
+            if hasattr(m, "fuse_custom"):
+                m.fuse_custom()
+        return self
+
 
 class ResNetBlock(nn.Module):
     """ResNet block with standard convolution layers."""
@@ -380,6 +482,16 @@ def forward(self, x):
         """Forward pass through the ResNet block."""
         return F.relu(self.cv3(self.cv2(self.cv1(x))) + self.shortcut(x))
 
+    def fuse_custom(self):
+        """Fuse cv1-cv3 and the shortcut's fusable children in place; returns self."""
+        # Walk the shortcut's children instead of testing the container itself:
+        # a plain container (e.g. nn.Sequential) defines no fuse_custom(), so a
+        # hasattr() test on the container would skip the shortcut's layers entirely.
+        for m in [self.cv1, self.cv2, self.cv3, *self.shortcut.children()]:
+            if hasattr(m, "fuse_custom") and not getattr(m, "fused", False):
+                m.fuse_custom()
+        return self
+
 
 class ResNetLayer(nn.Module):
     """ResNet layer with multiple ResNet blocks."""
@@ -411,7 +523,7 @@ def __init__(self, c1, c2, nh=1, ec=128, gc=512, scale=False):
         super().__init__()
         self.nh = nh
         self.hc = c2 // nh
-        self.ec = Conv(c1, ec, k=1, act=False) if c1 != ec else None
+        self.ec = ec
         self.gl = nn.Linear(gc, ec)
         self.bias = nn.Parameter(torch.zeros(nh))
         self.proj_conv = Conv(c1, c2, k=3, s=1, act=False)
@@ -466,6 +578,15 @@ def forward_split(self, x, guide):
         y.append(self.attn(y[-1], guide))
         return self.cv2(torch.cat(y, 1))
 
+    def fuse_custom(self):
+        for m in [self.cv1, self.cv2]:
+            if hasattr(m, "fuse_custom") and not getattr(m, "fused", False):
+                m.fuse_custom()
+        for m in self.m:
+            if hasattr(m, "fuse_custom"):
+                m.fuse_custom()
+        return self
+
 
 class ImagePoolingAttn(nn.Module):
     """ImagePoolingAttn: Enhance the text embeddings with image-aware information."""
@@ -573,6 +694,12 @@ def forward(self, x):
         """Forward pass through RepBottleneck layer."""
         return x + self.cv2(self.cv1(x)) if self.add else self.cv2(self.cv1(x))
 
+    def fuse_custom(self):
+        for m in [self.cv1, self.cv2]:
+            if hasattr(m, "fuse_custom") and not getattr(m, "fused", False):
+                m.fuse_custom()
+        return self
+
 
 class RepCSP(nn.Module):
     """Rep CSP Bottleneck with 3 convolutions."""
@@ -590,6 +717,15 @@ def forward(self, x):
         """Forward pass through RepCSP layer."""
         return self.cv3(torch.cat((self.m(self.cv1(x)), self.cv2(x)), 1))
 
+    def fuse_custom(self):
+        for m in [self.cv1, self.cv2, self.cv3]:
+            if hasattr(m, "fuse_custom") and not getattr(m, "fused", False):
+                m.fuse_custom()
+        for m in self.m:
+            if hasattr(m, "fuse_custom"):
+                m.fuse_custom()
+        return self
+
 
 class RepNCSPELAN4(nn.Module):
     """CSP-ELAN."""
@@ -615,6 +751,18 @@ def forward_split(self, x):
         y.extend(m(y[-1]) for m in [self.cv2, self.cv3])
         return self.cv4(torch.cat(y, 1))
 
+    def fuse_custom(self):
+        """Fuse cv1, cv4 and the fusable children of the cv2/cv3 containers; returns self."""
+        for m in [self.cv1, self.cv4]:
+            if hasattr(m, "fuse_custom") and not getattr(m, "fused", False):
+                m.fuse_custom()
+        # A plain container (e.g. nn.Sequential) has no fuse_custom() itself: fuse cv2/cv3 children directly.
+        for branch in (self.cv2, self.cv3):
+            for m in branch.children():
+                if hasattr(m, "fuse_custom"):
+                    m.fuse_custom()
+        return self
+
 
 class ADown(nn.Module):
     """ADown."""
@@ -635,6 +783,12 @@ def forward(self, x):
         x2 = self.cv2(x2)
         return torch.cat((x1, x2), 1)
 
+    def fuse_custom(self):
+        for m in [self.cv1, self.cv2]:
+            if hasattr(m, "fuse_custom") and not getattr(m, "fused", False):
+                m.fuse_custom()
+        return self
+
 
 class SPPELAN(nn.Module):
     """SPP-ELAN."""
@@ -655,6 +809,12 @@ def forward(self, x):
         y.extend(m(y[-1]) for m in [self.cv2, self.cv3, self.cv4])
         return self.cv5(torch.cat(y, 1))
 
+    def fuse_custom(self):
+        for m in [self.cv1, self.cv5]:
+            if hasattr(m, "fuse_custom") and not getattr(m, "fused", False):
+                m.fuse_custom()
+        return self
+
 
 class Silence(nn.Module):
     """Silence."""
@@ -695,7 +855,8 @@ def forward(self, xs):
         """Forward pass through CBFuse layer."""
         target_size = xs[-1].shape[2:]
         res = [F.interpolate(x[self.idx[i]], size=target_size, mode="nearest") for i, x in enumerate(xs[:-1])]
-        out = torch.sum(torch.stack(res + xs[-1:]), dim=0)
+        # Optimized: use sum() instead of torch.sum(torch.stack(...)) to avoid extra allocation
+        out = sum(res + xs[-1:])
         return out
 
 
@@ -706,10 +867,14 @@ def __init__(self, ed) -> None:
         self.conv1 = Conv(ed, ed, 3, 1, 1, g=ed, act=False)
         self.dim = ed
         self.act = nn.SiLU()
-    
+
     def forward(self, x):
-        return self.act(self.conv(x) + self.conv1(x))
-    
+        # Support both unfused (has conv1) and fused (conv1 removed) states
+        c1 = getattr(self, 'conv1', None)
+        if c1 is None:
+            return self.act(self.conv(x))
+        return self.act(self.conv(x) + c1(x))
+
     def forward_fuse(self, x):
         return self.act(self.conv(x))
 
@@ -717,12 +882,12 @@ def forward_fuse(self, x):
     def fuse(self):
         conv = fuse_conv_and_bn(self.conv.conv, self.conv.bn)
         conv1 = fuse_conv_and_bn(self.conv1.conv, self.conv1.bn)
-        
+
         conv_w = conv.weight
         conv_b = conv.bias
         conv1_w = conv1.weight
         conv1_b = conv1.bias
-        
+
         conv1_w = torch.nn.functional.pad(conv1_w, [2,2,2,2])
 
         final_conv_w = conv_w + conv1_w
@@ -732,7 +897,12 @@ def fuse(self):
         conv.bias.data.copy_(final_conv_b)
 
         self.conv = conv
-        del self.conv1
+        # Remove conv1 and switch to fused forward for efficiency
+        if hasattr(self, 'conv1'):
+            del self.conv1
+        self.forward = self.forward_fuse
+        self.fused = True
+
 
 class CIB(nn.Module):
     """Standard bottleneck."""
@@ -753,10 +923,19 @@ def __init__(self, c1, c2, shortcut=True, e=0.5, lk=False):
 
         self.add = shortcut and c1 == c2
 
+    def fuse_custom(self):
+        for m in self.cv1:
+            if isinstance(m, Conv) and not getattr(m, 'fused', False):
+                m.fuse_custom()
+            elif isinstance(m, RepVGGDW) and not getattr(m, 'fused', False):
+                m.fuse()
+        return self
+
     def forward(self, x):
         """'forward()' applies the YOLO FPN to input data."""
         return x + self.cv1(x) if self.add else self.cv1(x)
 
+
 class C2fCIB(C2f):
     """Faster Implementation of CSP Bottleneck with 2 convolutions."""
 
@@ -767,6 +946,15 @@ def __init__(self, c1, c2, n=1, shortcut=False, lk=False, g=1, e=0.5):
         super().__init__(c1, c2, n, shortcut, g, e)
         self.m = nn.ModuleList(CIB(self.c, self.c, shortcut, e=1.0, lk=lk) for _ in range(n))
 
+    def fuse_custom(self):
+        if not getattr(self.cv1, 'fused', False):
+            self.cv1.fuse_custom()
+        if not getattr(self.cv2, 'fused', False):
+            self.cv2.fuse_custom()
+        for block in self.m:
+            block.fuse_custom()
+        return self
+
 
 class Attention(nn.Module):
     def __init__(self, dim, num_heads=8,
@@ -796,6 +984,13 @@ def forward(self, x):
         x = self.proj(x)
         return x
 
+    def fuse_custom(self):
+        for m in [self.qkv, self.proj, self.pe]:
+            if hasattr(m, "fuse_custom") and not getattr(m, "fused", False):
+                m.fuse_custom()
+        return self
+
+
 class PSA(nn.Module):
 
     def __init__(self, c1, c2, e=0.5):
@@ -804,19 +999,32 @@ def __init__(self, c1, c2, e=0.5):
         self.c = int(c1 * e)
         self.cv1 = Conv(c1, 2 * self.c, 1, 1)
         self.cv2 = Conv(2 * self.c, c1, 1)
-        
+
         self.attn = Attention(self.c, attn_ratio=0.5, num_heads=self.c // 64)
         self.ffn = nn.Sequential(
             Conv(self.c, self.c*2, 1),
             Conv(self.c*2, self.c, 1, act=False)
         )
-        
+
     def forward(self, x):
         a, b = self.cv1(x).split((self.c, self.c), dim=1)
         b = b + self.attn(b)
         b = b + self.ffn(b)
         return self.cv2(torch.cat((a, b), 1))
 
+    def fuse_custom(self):
+        if not getattr(self.cv1, 'fused', False):
+            self.cv1.fuse_custom()
+        if not getattr(self.cv2, 'fused', False):
+            self.cv2.fuse_custom()
+        for m in self.ffn:
+            if isinstance(m, Conv) and not getattr(m, 'fused', False):
+                m.fuse_custom()
+        if hasattr(self.attn, "fuse_custom"):
+            self.attn.fuse_custom()
+        return self
+
+
 class SCDown(nn.Module):
     def __init__(self, c1, c2, k, s):
         super().__init__()
@@ -824,4 +1032,10 @@ def __init__(self, c1, c2, k, s):
         self.cv2 = Conv(c2, c2, k=k, s=s, g=c2, act=False)
 
     def forward(self, x):
-        return self.cv2(self.cv1(x))
\ No newline at end of file
+        return self.cv2(self.cv1(x))
+
+    def fuse_custom(self):  # fuse cv1 and cv2 in place; returns self for chaining
+        for m in [self.cv1, self.cv2]:
+            if hasattr(m, "fuse_custom") and not getattr(m, "fused", False):  # only fusable, not-yet-fused children
+                m.fuse_custom()
+        return self
\ No newline at end of file
diff --git a/doclayout_yolo/nn/modules/conv.py b/doclayout_yolo/nn/modules/conv.py
index 399c422..d9c6c39 100644
--- a/doclayout_yolo/nn/modules/conv.py
+++ b/doclayout_yolo/nn/modules/conv.py
@@ -44,15 +44,58 @@ def __init__(self, c1, c2, k=1, s=1, p=None, g=1, d=1, act=True):
         self.conv = nn.Conv2d(c1, c2, k, s, autopad(k, p, d), groups=g, dilation=d, bias=False)
         self.bn = nn.BatchNorm2d(c2)
         self.act = self.default_act if act is True else act if isinstance(act, nn.Module) else nn.Identity()
+        self.fused = False
 
     def forward(self, x):
         """Apply convolution, batch normalization and activation to input tensor."""
+        # Guard: skip bn when the module is flagged fused, has no bn attribute, or bn is nn.Identity (same path as forward_fuse)
+        if getattr(self, "fused", False) or not hasattr(self, "bn") or isinstance(getattr(self, "bn", None), nn.Identity):
+            return self.act(self.conv(x))
         return self.act(self.bn(self.conv(x)))
 
     def forward_fuse(self, x):
         """Perform transposed convolution of 2D data."""
         return self.act(self.conv(x))
 
+    def fuse_custom(self):
+        """Fold the BatchNorm2d running statistics into the Conv2d weight and bias (inference-time rewrite).
+
+        Idempotent: a module already flagged `fused`, or whose `bn` is missing or nn.Identity (the cases
+        forward() also treats as fused), is only marked `fused = True`. Returns self for chaining.
+        """
+        bn = getattr(self, "bn", None)
+        if getattr(self, "fused", False) or bn is None or isinstance(bn, nn.Identity):
+            self.fused = True
+            return self
+
+        conv = self.conv
+        with torch.no_grad():  # the folded tensors are constants; do not record them in an autograd graph
+            w = conv.weight
+            b = conv.bias if conv.bias is not None else torch.zeros(
+                conv.out_channels, device=w.device, dtype=w.dtype
+            )
+            # bn(W*x + b) = (gamma / std) * W*x + gamma * (b - mean) / std + beta, folded per output channel
+            std = torch.sqrt(bn.running_var + bn.eps)
+            fused_weight = w * (bn.weight / std).view(-1, 1, 1, 1)
+            fused_bias = bn.weight * (b - bn.running_mean) / std + bn.bias
+
+        self.conv = nn.Conv2d(
+            conv.in_channels,
+            conv.out_channels,
+            conv.kernel_size,
+            conv.stride,
+            conv.padding,
+            conv.dilation,
+            conv.groups,
+            bias=True,
+            padding_mode=conv.padding_mode,  # carry over a non-default padding mode
+        )
+        self.conv.weight.data = fused_weight
+        self.conv.bias.data = fused_bias
+        self.bn = nn.Identity()
+        self.fused = True
+        return self
+
 
 class Conv2(Conv):
     """Simplified RepConv module with Conv fusing."""
@@ -330,4 +373,4 @@ def __init__(self, dimension=1):
 
     def forward(self, x):
         """Forward pass for the YOLOv8 mask Proto module."""
-        return torch.cat(x, self.d)
+        return torch.cat(x, self.d)
\ No newline at end of file
diff --git a/doclayout_yolo/nn/modules/g2l_crm.py b/doclayout_yolo/nn/modules/g2l_crm.py
index 62466b1..4b4dcab 100644
--- a/doclayout_yolo/nn/modules/g2l_crm.py
+++ b/doclayout_yolo/nn/modules/g2l_crm.py
@@ -9,6 +9,7 @@
 from torch import nn, Tensor
 import torch.nn.functional as F
 
+
 class DilatedBlock(nn.Module):
     """Standard bottleneck with dilated convolution."""
 
@@ -21,23 +22,38 @@ def __init__(self, c, dilation, k, fuse="sum", shortcut=True):
         self.k = k
         self.cv2 = Conv(c, c, k=1, s=1)
         self.add = shortcut
-        
+
         self.fuse = fuse
         if fuse == "glu":
             self.conv_gating = Conv(c*len(self.dilation), c*len(self.dilation), k=1, s=1, g=c*len(self.dilation))
             self.conv1x1 = Conv(c*len(self.dilation), c, k=1, s=1, g=c)
         elif fuse == "sum":
             self.conv1x1 = Conv(c, c, k=1, s=1, g=c)
-            
+
         self.dcv = Conv(c, c, k=self.k, s=1)
 
     def dilated_conv(self, x, dilation):
-        act = self.dcv.act
-        bn = self.dcv.bn
+        """Convolve `x` with the shared `dcv` kernel at the given dilation, then apply BN (if any) and activation.
+
+        Padding is dilation * (k // 2). BN is skipped only in the cases Conv.forward() skips it; then the conv bias is used.
+        """
         weight = self.dcv.conv.weight
-        padding = dilation * (self.k//2)
-        return act(bn(F.conv2d(x, weight, stride=1, padding=padding, dilation=dilation)))
-    
+        padding = dilation * (self.k // 2)
+
+        # Mirror Conv.forward(): bn is bypassed only when dcv is flagged fused, or bn is missing / nn.Identity
+        bn = getattr(self.dcv, "bn", None)
+        has_bn = not (getattr(self.dcv, "fused", False) or bn is None or isinstance(bn, nn.Identity))
+
+        if has_bn:
+            x = F.conv2d(x, weight, stride=1, padding=padding, dilation=dilation)
+            x = bn(x)
+        else:
+            # Fused: BN is folded into the kernel, so the conv bias (None if the conv has none) replaces it
+            bias = getattr(self.dcv.conv, "bias", None)
+            x = F.conv2d(x, weight, bias=bias, stride=1, padding=padding, dilation=dilation)
+
+        return self.dcv.act(x)
+
     def forward(self, x):
         """'forward()' applies the YOLO FPN to input data."""
         dx = [self.dilated_conv(x, d) for d in self.dilation]
@@ -48,13 +64,19 @@ def forward(self, x):
             dx = dx * G  # Element-wise multiplication
             dx = self.conv1x1(dx)
         elif self.fuse == "sum":
-            dx = [_dx.unsqueeze(0) for _dx in dx]
-            dx = torch.cat(dx, dim=0)
-            dx = torch.sum(dx, dim=0)
+            # Sum the branch outputs directly instead of first concatenating them into one [n_dilation, ...] tensor
+            dx = sum(dx)
             dx = self.conv1x1(dx)
-            
+
         return x + dx if self.add else dx
-        
+
+    def fuse_custom(self):
+        """Fuse Conv+BN in every Conv child this mode created (`conv_gating` exists only for "glu"); returns self."""
+        for m in (getattr(self, n, None) for n in ("cv2", "conv_gating", "conv1x1", "dcv")):
+            if hasattr(m, "fuse_custom") and not getattr(m, "fused", False):
+                m.fuse_custom()
+        return self
+
 
 class DilatedBottleneck(nn.Module):
     """Standard bottleneck with dilated convolution."""
@@ -64,18 +86,28 @@ def __init__(self, c1, c2, shortcut=True, dilation=[1,2,3], block_k=3, fuse="sum
         expansion.
         """
         super().__init__()
-        
+
         c_ = int(c2 * e)  # hidden channels
         self.cv1 = Conv(c1, c_, k[0], 1)
         self.cv2 = Conv(c_, c2, k[1], 1, g=g)
-        
+
         self.dilated_block = DilatedBlock(c_, dilation, block_k, fuse)
         self.add = shortcut and c1 == c2
 
     def forward(self, x):
         """'forward()' applies the YOLO FPN to input data."""
         return x + self.cv2(self.dilated_block(self.cv1(x))) if self.add else self.cv2(self.dilated_block(self.cv1(x)))
-    
+
+    def fuse_custom(self):
+        """Fuse Conv+BN in cv1 and cv2 (unless already fused), then delegate to dilated_block; returns self."""
+        for m in [self.cv1, self.cv2]:
+            if hasattr(m, "fuse_custom") and not getattr(m, "fused", False):
+                m.fuse_custom()
+        if hasattr(self.dilated_block, "fuse_custom"):
+            self.dilated_block.fuse_custom()
+        return self
+
+
 class G2L_CRM(nn.Module):
     """Faster Implementation of CSP Bottleneck with 2 convolutions."""
 
@@ -119,4 +151,14 @@ def forward_split(self, x):
         y = list(self.cv1(x).split((self.c, self.c), 1))
         for m in self.m:
             y.append(m(y[-1]))
-        return self.cv2(torch.cat(y, 1))
\ No newline at end of file
+        return self.cv2(torch.cat(y, 1))
+
+    def fuse_custom(self):
+        """Fuse Conv+BN in cv1 and cv2 (unless already fused) and call fuse_custom() on each block of self.m; returns self."""
+        for m in [self.cv1, self.cv2]:
+            if hasattr(m, "fuse_custom") and not getattr(m, "fused", False):
+                m.fuse_custom()
+        for m in self.m:
+            if hasattr(m, "fuse_custom"):
+                m.fuse_custom()
+        return self
\ No newline at end of file

From 12a89e9cddf18d45563c5f814a2c8631862fc32d Mon Sep 17 00:00:00 2001
From: ayaanmustafa <ayaanmustafa07dpsn@gmail.com>
Date: Thu, 11 Jun 2026 03:09:45 +0530
Subject: [PATCH 3/3] chore(benchmarks): add CPU inference benchmark with
 opt-in toggles

benchmark_cpu_inference.py tests --channels-last and --fuse as
opt-in toggles, measuring raw forward and full predict pipeline.
No hardcoded paths; uses argparse for model/image paths.
---
 benchmarks/benchmark_cpu_inference.py | 185 ++++++++++++++++++++++++++
 1 file changed, 185 insertions(+)
 create mode 100644 benchmarks/benchmark_cpu_inference.py

diff --git a/benchmarks/benchmark_cpu_inference.py b/benchmarks/benchmark_cpu_inference.py
new file mode 100644
index 0000000..05b28a1
--- /dev/null
+++ b/benchmarks/benchmark_cpu_inference.py
@@ -0,0 +1,185 @@
+#!/usr/bin/env python3
+"""
+DocLayout-YOLO CPU Inference Benchmark
+======================================
+Benchmarks CPU inference optimizations as opt-in toggles:
+  --channels-last    : Use NHWC memory format (typically ~1.3-1.4x speedup on CPU)
+  --fuse             : Apply fuse_custom() recursively (safe, idempotent)
+
+Usage:
+    python benchmark_cpu_inference.py --model model.pt --image img.png
+    python benchmark_cpu_inference.py --model model.pt --image img.png --channels-last
+    python benchmark_cpu_inference.py --model model.pt --image img.png --channels-last --fuse
+"""
+import argparse
+import gc
+import time
+import warnings
+from contextlib import contextmanager
+from pathlib import Path
+
+import cv2
+import numpy as np
+import torch
+import torch.nn as nn
+
+from doclayout_yolo import YOLOv10
+
+warnings.filterwarnings("ignore", category=UserWarning)
+
+
+# ── CONFIG ─────────────────────────────────────────────
+DEFAULT_IMGSZ = 1024
+DEFAULT_CONF = 0.2
+DEFAULT_DEVICE = "cpu"
+N_WARMUP_FWD = 5
+N_ITERS_FWD = 20
+N_WARMUP_PRED = 2
+N_ITERS_PRED = 5
+# ───────────────────────────────────────────────────────
+
+
+@contextmanager
+def cache_flush():  # context manager: collect garbage and churn a large scratch tensor before the body, collect again after it
+    gc.collect()
+    _ = torch.randn(20_000_000).mul_(2).sum().item()  # 20M-element throwaway tensor; presumably meant to evict CPU caches (confirm)
+    yield
+    gc.collect()  # not reached if the with-body raises (there is no try/finally around the yield)
+
+
+def preprocess(path: str, size: int) -> torch.Tensor:  # load an image file as a float RGB tensor of shape (1, 3, size, size)
+    im = cv2.imread(path)
+    if im is None:  # imread signals an unreadable or missing file by returning None
+        raise FileNotFoundError(path)
+    im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)
+    im = cv2.resize(im, (size, size))  # plain resize to a square: aspect ratio is not preserved (no letterboxing)
+    return torch.from_numpy(im).permute(2, 0, 1).unsqueeze(0).float() / 255.0  # HWC -> 1xCxHxW, values divided by 255
+
+
+def recursive_fuse(model: nn.Module) -> int:
+    """Walk model.named_modules(), call fuse_custom() (or fuse() where only that exists) on each module not flagged `fused`; return the number of calls that succeeded."""
+    fused_count = 0
+    for name, m in model.named_modules():
+        if hasattr(m, "fuse_custom") and callable(m.fuse_custom):
+            if not getattr(m, "fused", False):
+                try:
+                    m.fuse_custom()
+                    fused_count += 1  # counts successful calls (one per module visited), not individual Conv layers
+                except Exception as e:  # best effort: report the failure and keep walking
+                    print(f"  [WARN] fuse_custom failed on {name}: {e}")
+        elif hasattr(m, "fuse") and callable(m.fuse):  # fallback for modules without a callable fuse_custom
+            if not getattr(m, "fused", False):
+                try:
+                    m.fuse()
+                    fused_count += 1
+                except Exception as e:
+                    print(f"  [WARN] fuse failed on {name}: {e}")
+    return fused_count
+
+
+def benchmark_forward(model, x, n_warmup=5, n_iters=20, channels_last=False, device="cpu"):  # returns mean milliseconds per model(x) call
+    model = model.eval().to(device)
+    x = x.to(device)
+    if channels_last:  # optionally convert both the input and the model to channels_last memory format
+        x = x.to(memory_format=torch.channels_last)
+        model = model.to(memory_format=torch.channels_last)
+    with torch.no_grad():
+        for _ in range(n_warmup):  # untimed warm-up passes
+            _ = model(x)
+    with cache_flush():  # the flush runs on entry, i.e. before t0 is taken
+        t0 = time.perf_counter()
+        with torch.no_grad():
+            for _ in range(n_iters):
+                _ = model(x)
+        return (time.perf_counter() - t0) / n_iters * 1000  # NOTE(review): no torch.cuda.synchronize(), so timings on a CUDA device are not reliable
+
+
+def benchmark_predict(model, path, imgsz, conf, n_warmup=2, n_iters=5, device="cpu"):  # returns mean milliseconds per model.predict() call
+    for _ in range(n_warmup):  # untimed warm-up calls
+        _ = model.predict(path, imgsz=imgsz, conf=conf, device=device, verbose=False)
+    gc.collect()
+    t0 = time.perf_counter()
+    for _ in range(n_iters):  # predict() is given the image path, so whatever loading it does is inside the timed region
+        _ = model.predict(path, imgsz=imgsz, conf=conf, device=device, verbose=False)
+    return (time.perf_counter() - t0) / n_iters * 1000  # seconds -> milliseconds, averaged over n_iters
+
+
+def main():  # CLI entry point: parse options, load the model, apply the opt-in toggles, run both benchmarks, print a summary
+    parser = argparse.ArgumentParser(description="DocLayout-YOLO CPU Inference Benchmark")
+    parser.add_argument("--model", required=True, help="Path to .pt model")
+    parser.add_argument("--image", required=True, help="Path to test image")
+    parser.add_argument("--imgsz", type=int, default=DEFAULT_IMGSZ, help="Inference size")
+    parser.add_argument("--conf", type=float, default=DEFAULT_CONF, help="Confidence threshold")
+    parser.add_argument("--device", default=DEFAULT_DEVICE, help="Device (cpu/cuda)")
+    parser.add_argument("--channels-last", action="store_true", help="Use channels_last (NHWC) memory format")
+    parser.add_argument("--fuse", action="store_true", help="Apply recursive fuse_custom() (safe, idempotent)")
+    parser.add_argument("--save", default=None, help="Optional path to save annotated result image")
+    args = parser.parse_args()
+
+    print("=" * 70)
+    print("  DocLayout-YOLO CPU Inference Benchmark")
+    print("=" * 70)
+    print(f"PyTorch:  {torch.__version__}")
+    print(f"Device:   {args.device} ({torch.get_num_threads()} threads)")
+    print(f"Model:    {args.model}")
+    print(f"Image:    {args.image}")
+    print(f"Size:     {args.imgsz}")
+    print(f"Opts:     channels_last={args.channels_last}, fuse={args.fuse}")
+    print("=" * 70)
+
+    x = preprocess(args.image, args.imgsz)  # tensor used by the raw-forward benchmark only; predict() gets the image path
+
+    # ── Load model ──
+    print("\nLoading model...")
+    model = YOLOv10(args.model)
+    model = model.to(args.device)
+
+    # ── Optional: recursive fuse_custom ──
+    if args.fuse:
+        print("Applying fuse_custom()...")
+        n = recursive_fuse(model.model)  # operates on the wrapped network (model.model), mutating it in place
+        print(f"  Fused {n} modules.")
+
+    # ── Optional: channels_last ──
+    if args.channels_last:
+        print("Converting to channels_last (NHWC)...")
+        x = x.to(memory_format=torch.channels_last)
+        model.model = model.model.to(memory_format=torch.channels_last)
+
+    # ── Benchmark raw forward ──
+    print("\nBenchmarking raw forward pass...")
+    t_fwd = benchmark_forward(
+        model.model, x,
+        n_warmup=N_WARMUP_FWD, n_iters=N_ITERS_FWD,
+        channels_last=False,  # already applied above
+        device=args.device,
+    )
+    print(f"  Raw forward: {t_fwd:.2f} ms")
+
+    # ── Benchmark full predict pipeline ──
+    print("\nBenchmarking full predict pipeline...")
+    t_pred = benchmark_predict(
+        model, args.image,
+        imgsz=args.imgsz, conf=args.conf,
+        n_warmup=N_WARMUP_PRED, n_iters=N_ITERS_PRED,
+        device=args.device,
+    )
+    print(f"  Full predict: {t_pred:.2f} ms")
+
+    # ── Optional: save result ──
+    if args.save:
+        print(f"\nSaving result to {args.save}...")
+        result = model.predict(args.image, imgsz=args.imgsz, conf=args.conf, device=args.device, verbose=False)[0]  # first result of an extra, untimed predict call
+        result.save(args.save)
+
+    # ── Summary ──
+    print(f"\n{'='*70}")
+    print("  RESULTS")
+    print(f"{'='*70}")
+    print(f"  Raw forward:  {t_fwd:>10.2f} ms")
+    print(f"  Full predict: {t_pred:>10.2f} ms")
+    print(f"{'='*70}")
+
+
+if __name__ == "__main__":
+    main()