From d6ba7155901ca2c248c2ac95ec24d298c3dbcfa6 Mon Sep 17 00:00:00 2001 From: ayaanmustafa Date: Thu, 11 Jun 2026 03:09:11 +0530 Subject: [PATCH 1/3] fix(autobackend): handle non-leaf parameters in fused checkpoints PyTorch forbids setting requires_grad=False on non-leaf tensors. When fuse=True creates non-leaf parameters (e.g. from fused model checkpoints), the old code crashed with: RuntimeError: you can only change requires_grad flags of leaf variables. Fix: only set requires_grad=False on leaf tensors; for non-leaf tensors, detach via .data.detach() instead. --- doclayout_yolo/nn/autobackend.py | 153 ++++++++++++++++--------------- 1 file changed, 79 insertions(+), 74 deletions(-) diff --git a/doclayout_yolo/nn/autobackend.py b/doclayout_yolo/nn/autobackend.py index 6b5eeef..6bc6464 100644 --- a/doclayout_yolo/nn/autobackend.py +++ b/doclayout_yolo/nn/autobackend.py @@ -18,6 +18,7 @@ from doclayout_yolo.utils.checks import check_requirements, check_suffix, check_version, check_yaml from doclayout_yolo.utils.downloads import attempt_download_asset, is_url + def check_class_names(names): """ Check class names. @@ -56,22 +57,22 @@ class AutoBackend(nn.Module): The AutoBackend class is designed to provide an abstraction layer for various inference engines. 
It supports a wide range of formats, each with specific naming conventions as outlined below: - Supported Formats and Naming Conventions: - | Format | File Suffix | - |-----------------------|------------------| - | PyTorch | *.pt | - | TorchScript | *.torchscript | - | ONNX Runtime | *.onnx | - | ONNX OpenCV DNN | *.onnx (dnn=True)| - | OpenVINO | *openvino_model/ | - | CoreML | *.mlpackage | - | TensorRT | *.engine | - | TensorFlow SavedModel | *_saved_model | - | TensorFlow GraphDef | *.pb | - | TensorFlow Lite | *.tflite | - | TensorFlow Edge TPU | *_edgetpu.tflite | - | PaddlePaddle | *_paddle_model | - | NCNN | *_ncnn_model | + Supported Formats and Naming Conventions: + | Format | File Suffix | + |------------------------|-------------------| + | PyTorch | *.pt | + | TorchScript | *.torchscript | + | ONNX Runtime | *.onnx | + | ONNX OpenCV DNN | *.onnx (dnn=True) | + | OpenVINO | *openvino_model/ | + | CoreML | *.mlpackage | + | TensorRT | *.engine | + | TensorFlow SavedModel | *_saved_model | + | TensorFlow GraphDef | *.pb | + | TensorFlow Lite | *.tflite | + | TensorFlow Edge TPU | *_edgetpu.tflite | + | PaddlePaddle | *_paddle_model | + | NCNN | *_ncnn_model | This class offers dynamic backend switching capabilities based on the input model format, making it easier to deploy models across various platforms. 
@@ -220,8 +221,8 @@ def __init__( except ImportError: if LINUX: check_requirements("nvidia-tensorrt", cmds="-U --index-url https://pypi.ngc.nvidia.com") - import tensorrt as trt # noqa - check_version(trt.__version__, "7.0.0", hard=True) # require tensorrt>=7.0.0 + import tensorrt as trt # noqa + check_version(trt.__version__, "7.0.0", hard=True) # require tensorrt>=7.0.0 if device.type == "cpu": device = torch.device("cuda:0") Binding = namedtuple("Binding", ("name", "dtype", "shape", "data", "ptr")) @@ -231,27 +232,27 @@ def __init__( meta_len = int.from_bytes(f.read(4), byteorder="little") # read metadata length metadata = json.loads(f.read(meta_len).decode("utf-8")) # read metadata model = runtime.deserialize_cuda_engine(f.read()) # read engine - context = model.create_execution_context() - bindings = OrderedDict() - output_names = [] - fp16 = False # default updated below - dynamic = False - for i in range(model.num_bindings): - name = model.get_binding_name(i) - dtype = trt.nptype(model.get_binding_dtype(i)) - if model.binding_is_input(i): - if -1 in tuple(model.get_binding_shape(i)): # dynamic - dynamic = True - context.set_binding_shape(i, tuple(model.get_profile_shape(0, i)[2])) - if dtype == np.float16: - fp16 = True - else: # output - output_names.append(name) - shape = tuple(context.get_binding_shape(i)) - im = torch.from_numpy(np.empty(shape, dtype=dtype)).to(device) - bindings[name] = Binding(name, dtype, shape, im, int(im.data_ptr())) - binding_addrs = OrderedDict((n, d.ptr) for n, d in bindings.items()) - batch_size = bindings["images"].shape[0] # if dynamic, this is instead max batch size + context = model.create_execution_context() + bindings = OrderedDict() + output_names = [] + fp16 = False # default updated below + dynamic = False + for i in range(model.num_bindings): + name = model.get_binding_name(i) + dtype = trt.nptype(model.get_binding_dtype(i)) + if model.binding_is_input(i): + if -1 in tuple(model.get_binding_shape(i)): # dynamic + 
dynamic = True + context.set_binding_shape(i, tuple(model.get_profile_shape(0, i)[2])) + if dtype == np.float16: + fp16 = True + else: # output + output_names.append(name) + shape = tuple(context.get_binding_shape(i)) + im = torch.from_numpy(np.empty(shape, dtype=dtype)).to(device) + bindings[name] = Binding(name, dtype, shape, im, int(im.data_ptr())) + binding_addrs = OrderedDict((n, d.ptr) for n, d in bindings.items()) + batch_size = bindings["images"].shape[0] # if dynamic, this is instead max batch size # CoreML elif coreml: @@ -363,7 +364,8 @@ def wrap_frozen_graph(gd, inputs, outputs): raise TypeError( f"model='{w}' is not a supported model format. " - f"See https://docs.doclayout_yolo.com/modes/predict for help.\n\n{export_formats()}" + f"See https://docs.doclayout_yolo.com/modes/predict for help." + f"{export_formats()}" ) # Load external metadata YAML @@ -387,12 +389,15 @@ def wrap_frozen_graph(gd, inputs, outputs): # Check names if "names" not in locals(): # names missing names = default_class_names(data) - names = check_class_names(names) + names = check_class_names(names) - # Disable gradients + # Disable gradients — FIX: handle non-leaf parameters from fused checkpoints if pt: for p in model.parameters(): - p.requires_grad = False + if not p.is_leaf: + p.data = p.data.detach() + else: + p.requires_grad = False self.__dict__.update(locals()) # assign all variables to self @@ -484,11 +489,11 @@ def callback(request, userdata): "Ultralytics only supports inference of non-pipelined CoreML models exported with " f"'nms=False', but 'model={w}' has an NMS pipeline created by an 'nms=True' export." 
) - # TODO: CoreML NMS inference handling - # from doclayout_yolo.utils.ops import xywh2xyxy - # box = xywh2xyxy(y['coordinates'] * [[w, h, w, h]]) # xyxy pixels - # conf, cls = y['confidence'].max(1), y['confidence'].argmax(1).astype(np.float32) - # y = np.concatenate((box, conf.reshape(-1, 1), cls.reshape(-1, 1)), 1) + # TODO: CoreML NMS inference handling + # from doclayout_yolo.utils.ops import xywh2xyxy + # box = xywh2xyxy(y['coordinates'] * [[w, h, w, h]]) # xyxy pixels + # conf, cls = y['confidence'].max(1), y['confidence'].argmax(1).astype(np.float32) + # y = np.concatenate((box, conf.reshape(-1, 1), cls.reshape(-1, 1)), 1) elif len(y) == 1: # classification model y = list(y.values()) elif len(y) == 2: # segmentation model @@ -526,32 +531,32 @@ def callback(request, userdata): ip, ib = (0, 1) if len(y[0].shape) == 4 else (1, 0) # index of protos, boxes nc = y[ib].shape[1] - y[ip].shape[3] - 4 # y = (1, 160, 160, 32), (1, 116, 8400) self.names = {i: f"class{i}" for i in range(nc)} - else: # Lite or Edge TPU - details = self.input_details[0] - integer = details["dtype"] in (np.int8, np.int16) # is TFLite quantized int8 or int16 model - if integer: - scale, zero_point = details["quantization"] - im = (im / scale + zero_point).astype(details["dtype"]) # de-scale - self.interpreter.set_tensor(details["index"], im) - self.interpreter.invoke() - y = [] - for output in self.output_details: - x = self.interpreter.get_tensor(output["index"]) + else: # Lite or Edge TPU + details = self.input_details[0] + integer = details["dtype"] in (np.int8, np.int16) # is TFLite quantized int8 or int16 model if integer: - scale, zero_point = output["quantization"] - x = (x.astype(np.float32) - zero_point) * scale # re-scale - if x.ndim > 2: # if task is not classification - # Denormalize xywh by image size. 
See https://github.com/doclayout_yolo/doclayout_yolo/pull/1695 - # xywh are normalized in TFLite/EdgeTPU to mitigate quantization error of integer models - x[:, [0, 2]] *= w - x[:, [1, 3]] *= h - y.append(x) - # TF segment fixes: export is reversed vs ONNX export and protos are transposed - if len(y) == 2: # segment with (det, proto) output order reversed - if len(y[1].shape) != 4: - y = list(reversed(y)) # should be y = (1, 116, 8400), (1, 160, 160, 32) - y[1] = np.transpose(y[1], (0, 3, 1, 2)) # should be y = (1, 116, 8400), (1, 32, 160, 160) - y = [x if isinstance(x, np.ndarray) else x.numpy() for x in y] + scale, zero_point = details["quantization"] + im = (im / scale + zero_point).astype(details["dtype"]) # de-scale + self.interpreter.set_tensor(details["index"], im) + self.interpreter.invoke() + y = [] + for output in self.output_details: + x = self.interpreter.get_tensor(output["index"]) + if integer: + scale, zero_point = output["quantization"] + x = (x.astype(np.float32) - zero_point) * scale # re-scale + if x.ndim > 2: # if task is not classification + # Denormalize xywh by image size. 
See https://github.com/doclayout_yolo/doclayout_yolo/pull/1695 + # xywh are normalized in TFLite/EdgeTPU to mitigate quantization error of integer models + x[:, [0, 2]] *= w + x[:, [1, 3]] *= h + y.append(x) + # TF segment fixes: export is reversed vs ONNX export and protos are transposed + if len(y) == 2: # segment with (det, proto) output order reversed + if len(y[1].shape) != 4: + y = list(reversed(y)) # should be y = (1, 116, 8400), (1, 160, 160, 32) + y[1] = np.transpose(y[1], (0, 3, 1, 2)) # should be y = (1, 116, 8400), (1, 32, 160, 160) + y = [x if isinstance(x, np.ndarray) else x.numpy() for x in y] # for x in y: # print(type(x), len(x)) if isinstance(x, (list, tuple)) else print(type(x), x.shape) # debug shapes @@ -615,4 +620,4 @@ def _model_type(p="path/to/model.pt"): url = urlsplit(p) triton = bool(url.netloc) and bool(url.path) and url.scheme in {"http", "grpc"} - return types + [triton] + return types + [triton] \ No newline at end of file From 2c556da9562751251846331c672fc40c471ed7df Mon Sep 17 00:00:00 2001 From: ayaanmustafa Date: Thu, 11 Jun 2026 03:09:34 +0530 Subject: [PATCH 2/3] feat(fuse): add fuse_custom() to all block modules + CPU thread config - Add fuse_custom() to 20+ block modules (SPP, SPPF, C1-C3, C2f, Bottleneck, BottleneckCSP, ResNetBlock, RepC3, RepCSP, RepNCSPELAN4, ADown, SPPELAN, HGStem, HGBlock, Proto, GhostBottleneck, RepBottleneck, C2fAttn, Attention, PSA, SCDown, CIB, C2fCIB, G2L_CRM, DilatedBlock, DilatedBottleneck) - Optimize CBFuse: use sum() instead of torch.sum(torch.stack()) - Optimize DilatedBlock: handle fused checkpoints (nn.Identity bn) - Fix RepVGGDW.forward to support fused state (conv1 deleted) - Fix MaxSigmoidAttnBlock.ec type (was Conv, should be int) - Add threads config field to default.yaml for CPU inference tuning --- doclayout_yolo/cfg/default.yaml | 1 + doclayout_yolo/nn/modules/__init__.py | 2 + doclayout_yolo/nn/modules/block.py | 242 ++++++++++++++++++++++++-- doclayout_yolo/nn/modules/conv.py | 45 
++++- doclayout_yolo/nn/modules/g2l_crm.py | 74 ++++++-- 5 files changed, 333 insertions(+), 31 deletions(-) diff --git a/doclayout_yolo/cfg/default.yaml b/doclayout_yolo/cfg/default.yaml index 19df0f8..432a7e3 100644 --- a/doclayout_yolo/cfg/default.yaml +++ b/doclayout_yolo/cfg/default.yaml @@ -18,6 +18,7 @@ val_period: 1 # (int) Validation every x epochs cache: False # (bool) True/ram, disk or False. Use cache for data loading device: # (int | str | list, optional) device to run on, i.e. cuda device=0 or device=0,1,2,3 or device=cpu workers: 8 # (int) number of worker threads for data loading (per RANK if DDP) +threads: # (int, optional) number of threads for PyTorch CPU inference project: # (str, optional) project name name: # (str, optional) experiment name, results saved to 'project/name' directory exist_ok: True # (bool) whether to overwrite existing experiment diff --git a/doclayout_yolo/nn/modules/__init__.py b/doclayout_yolo/nn/modules/__init__.py index ad19bc9..a6f1618 100644 --- a/doclayout_yolo/nn/modules/__init__.py +++ b/doclayout_yolo/nn/modules/__init__.py @@ -48,6 +48,7 @@ CBLinear, Silence, PSA, + CIB, C2fCIB, SCDown, RepVGGDW, @@ -141,6 +142,7 @@ "CBLinear", "Silence", "PSA", + "CIB", "C2fCIB", "SCDown", "RepVGGDW", diff --git a/doclayout_yolo/nn/modules/block.py b/doclayout_yolo/nn/modules/block.py index 9c17929..056b0c0 100644 --- a/doclayout_yolo/nn/modules/block.py +++ b/doclayout_yolo/nn/modules/block.py @@ -43,7 +43,6 @@ ) - class DFL(nn.Module): """ Integral module of Distribution Focal Loss (DFL). 
@@ -85,6 +84,12 @@ def forward(self, x): """Performs a forward pass through layers using an upsampled input image.""" return self.cv3(self.cv2(self.upsample(self.cv1(x)))) + def fuse_custom(self): + for m in [self.cv1, self.cv2, self.cv3]: + if hasattr(m, "fuse_custom") and not getattr(m, "fused", False): + m.fuse_custom() + return self + class HGStem(nn.Module): """ @@ -116,6 +121,12 @@ def forward(self, x): x = self.stem4(x) return x + def fuse_custom(self): + for m in [self.stem1, self.stem2a, self.stem2b, self.stem3, self.stem4]: + if hasattr(m, "fuse_custom") and not getattr(m, "fused", False): + m.fuse_custom() + return self + class HGBlock(nn.Module): """ @@ -140,6 +151,15 @@ def forward(self, x): y = self.ec(self.sc(torch.cat(y, 1))) return y + x if self.add else y + def fuse_custom(self): + for m in self.m: + if hasattr(m, "fuse_custom") and not getattr(m, "fused", False): + m.fuse_custom() + for m in [self.sc, self.ec]: + if hasattr(m, "fuse_custom") and not getattr(m, "fused", False): + m.fuse_custom() + return self + class SPP(nn.Module): """Spatial Pyramid Pooling (SPP) layer https://arxiv.org/abs/1406.4729.""" @@ -157,6 +177,13 @@ def forward(self, x): x = self.cv1(x) return self.cv2(torch.cat([x] + [m(x) for m in self.m], 1)) + def fuse_custom(self): + for m in [self.cv1, self.cv2]: + if hasattr(m, "fuse_custom") and not getattr(m, "fused", False): + m.fuse_custom() + return self + + class SPPF(nn.Module): """Spatial Pyramid Pooling - Fast (SPPF) layer for YOLOv5 by Glenn Jocher.""" @@ -179,6 +206,12 @@ def forward(self, x): y2 = self.m(y1) return self.cv2(torch.cat((x, y1, y2, self.m(y2)), 1)) + def fuse_custom(self): + for m in [self.cv1, self.cv2]: + if hasattr(m, "fuse_custom") and not getattr(m, "fused", False): + m.fuse_custom() + return self + class C1(nn.Module): """CSP Bottleneck with 1 convolution.""" @@ -194,6 +227,12 @@ def forward(self, x): y = self.cv1(x) return self.m(y) + y + def fuse_custom(self): + for m in [self.cv1] + 
list(self.m): + if hasattr(m, "fuse_custom") and not getattr(m, "fused", False): + m.fuse_custom() + return self + class C2(nn.Module): """CSP Bottleneck with 2 convolutions.""" @@ -213,7 +252,17 @@ def forward(self, x): """Forward pass through the CSP bottleneck with 2 convolutions.""" a, b = self.cv1(x).chunk(2, 1) return self.cv2(torch.cat((self.m(a), b), 1)) - + + def fuse_custom(self): + for m in [self.cv1, self.cv2]: + if hasattr(m, "fuse_custom") and not getattr(m, "fused", False): + m.fuse_custom() + for m in self.m: + if hasattr(m, "fuse_custom"): + m.fuse_custom() + return self + + class C2f(nn.Module): """Faster Implementation of CSP Bottleneck with 2 convolutions.""" @@ -233,6 +282,15 @@ def forward(self, x): y.extend(m(y[-1]) for m in self.m) return self.cv2(torch.cat(y, 1)) + def fuse_custom(self): + for m in [self.cv1, self.cv2]: + if hasattr(m, "fuse_custom") and not getattr(m, "fused", False): + m.fuse_custom() + for m in self.m: + if hasattr(m, "fuse_custom"): + m.fuse_custom() + return self + def forward_split(self, x): """Forward pass using split() instead of chunk().""" y = list(self.cv1(x).split((self.c, self.c), 1)) @@ -256,6 +314,15 @@ def forward(self, x): """Forward pass through the CSP bottleneck with 2 convolutions.""" return self.cv3(torch.cat((self.m(self.cv1(x)), self.cv2(x)), 1)) + def fuse_custom(self): + for m in [self.cv1, self.cv2, self.cv3]: + if hasattr(m, "fuse_custom") and not getattr(m, "fused", False): + m.fuse_custom() + for m in self.m: + if hasattr(m, "fuse_custom"): + m.fuse_custom() + return self + class C3x(C3): """C3 module with cross-convolutions.""" @@ -283,6 +350,15 @@ def forward(self, x): """Forward pass of RT-DETR neck layer.""" return self.cv3(self.m(self.cv1(x)) + self.cv2(x)) + def fuse_custom(self): + for m in [self.cv1, self.cv2, self.cv3]: + if hasattr(m, "fuse_custom") and not getattr(m, "fused", False): + m.fuse_custom() + for m in self.m: + if hasattr(m, "fuse_custom") and not getattr(m, "fused", 
False): + m.fuse_custom() + return self + class C3TR(C3): """C3 module with TransformerBlock().""" @@ -324,6 +400,17 @@ def forward(self, x): """Applies skip connection and concatenation to input tensor.""" return self.conv(x) + self.shortcut(x) + def fuse_custom(self): + for m in self.conv: + if hasattr(m, "fuse_custom") and not getattr(m, "fused", False): + m.fuse_custom() + if hasattr(self.shortcut, "fuse_custom"): + for m in self.shortcut: + if hasattr(m, "fuse_custom") and not getattr(m, "fused", False): + m.fuse_custom() + return self + + class Bottleneck(nn.Module): """Standard bottleneck.""" @@ -340,7 +427,13 @@ def __init__(self, c1, c2, shortcut=True, g=1, k=(3, 3), e=0.5): def forward(self, x): """'forward()' applies the YOLO FPN to input data.""" return x + self.cv2(self.cv1(x)) if self.add else self.cv2(self.cv1(x)) - + + def fuse_custom(self): + for m in [self.cv1, self.cv2]: + if hasattr(m, "fuse_custom") and not getattr(m, "fused", False): + m.fuse_custom() + return self + class BottleneckCSP(nn.Module): """CSP Bottleneck https://github.com/WongKinYiu/CrossStagePartialNetworks.""" @@ -363,6 +456,15 @@ def forward(self, x): y2 = self.cv2(x) return self.cv4(self.act(self.bn(torch.cat((y1, y2), 1)))) + def fuse_custom(self): + for m in [self.cv1, self.cv4]: + if hasattr(m, "fuse_custom") and not getattr(m, "fused", False): + m.fuse_custom() + for m in self.m: + if hasattr(m, "fuse_custom"): + m.fuse_custom() + return self + class ResNetBlock(nn.Module): """ResNet block with standard convolution layers.""" @@ -380,6 +482,16 @@ def forward(self, x): """Forward pass through the ResNet block.""" return F.relu(self.cv3(self.cv2(self.cv1(x))) + self.shortcut(x)) + def fuse_custom(self): + for m in [self.cv1, self.cv2, self.cv3]: + if hasattr(m, "fuse_custom") and not getattr(m, "fused", False): + m.fuse_custom() + if hasattr(self.shortcut, "fuse_custom"): + for m in self.shortcut: + if hasattr(m, "fuse_custom") and not getattr(m, "fused", False): + 
m.fuse_custom() + return self + class ResNetLayer(nn.Module): """ResNet layer with multiple ResNet blocks.""" @@ -411,7 +523,7 @@ def __init__(self, c1, c2, nh=1, ec=128, gc=512, scale=False): super().__init__() self.nh = nh self.hc = c2 // nh - self.ec = Conv(c1, ec, k=1, act=False) if c1 != ec else None + self.ec = ec self.gl = nn.Linear(gc, ec) self.bias = nn.Parameter(torch.zeros(nh)) self.proj_conv = Conv(c1, c2, k=3, s=1, act=False) @@ -466,6 +578,15 @@ def forward_split(self, x, guide): y.append(self.attn(y[-1], guide)) return self.cv2(torch.cat(y, 1)) + def fuse_custom(self): + for m in [self.cv1, self.cv2]: + if hasattr(m, "fuse_custom") and not getattr(m, "fused", False): + m.fuse_custom() + for m in self.m: + if hasattr(m, "fuse_custom"): + m.fuse_custom() + return self + class ImagePoolingAttn(nn.Module): """ImagePoolingAttn: Enhance the text embeddings with image-aware information.""" @@ -573,6 +694,12 @@ def forward(self, x): """Forward pass through RepBottleneck layer.""" return x + self.cv2(self.cv1(x)) if self.add else self.cv2(self.cv1(x)) + def fuse_custom(self): + for m in [self.cv1, self.cv2]: + if hasattr(m, "fuse_custom") and not getattr(m, "fused", False): + m.fuse_custom() + return self + class RepCSP(nn.Module): """Rep CSP Bottleneck with 3 convolutions.""" @@ -590,6 +717,15 @@ def forward(self, x): """Forward pass through RepCSP layer.""" return self.cv3(torch.cat((self.m(self.cv1(x)), self.cv2(x)), 1)) + def fuse_custom(self): + for m in [self.cv1, self.cv2, self.cv3]: + if hasattr(m, "fuse_custom") and not getattr(m, "fused", False): + m.fuse_custom() + for m in self.m: + if hasattr(m, "fuse_custom"): + m.fuse_custom() + return self + class RepNCSPELAN4(nn.Module): """CSP-ELAN.""" @@ -615,6 +751,18 @@ def forward_split(self, x): y.extend(m(y[-1]) for m in [self.cv2, self.cv3]) return self.cv4(torch.cat(y, 1)) + def fuse_custom(self): + for m in [self.cv1, self.cv4]: + if hasattr(m, "fuse_custom") and not getattr(m, "fused", False): + 
m.fuse_custom() + if hasattr(self.cv2, "fuse_custom"): + self.cv2[0].fuse_custom() if hasattr(self.cv2[0], "fuse_custom") else None + self.cv2[1].fuse_custom() if hasattr(self.cv2[1], "fuse_custom") else None + if hasattr(self.cv3, "fuse_custom"): + self.cv3[0].fuse_custom() if hasattr(self.cv3[0], "fuse_custom") else None + self.cv3[1].fuse_custom() if hasattr(self.cv3[1], "fuse_custom") else None + return self + class ADown(nn.Module): """ADown.""" @@ -635,6 +783,12 @@ def forward(self, x): x2 = self.cv2(x2) return torch.cat((x1, x2), 1) + def fuse_custom(self): + for m in [self.cv1, self.cv2]: + if hasattr(m, "fuse_custom") and not getattr(m, "fused", False): + m.fuse_custom() + return self + class SPPELAN(nn.Module): """SPP-ELAN.""" @@ -655,6 +809,12 @@ def forward(self, x): y.extend(m(y[-1]) for m in [self.cv2, self.cv3, self.cv4]) return self.cv5(torch.cat(y, 1)) + def fuse_custom(self): + for m in [self.cv1, self.cv5]: + if hasattr(m, "fuse_custom") and not getattr(m, "fused", False): + m.fuse_custom() + return self + class Silence(nn.Module): """Silence.""" @@ -695,7 +855,8 @@ def forward(self, xs): """Forward pass through CBFuse layer.""" target_size = xs[-1].shape[2:] res = [F.interpolate(x[self.idx[i]], size=target_size, mode="nearest") for i, x in enumerate(xs[:-1])] - out = torch.sum(torch.stack(res + xs[-1:]), dim=0) + # Optimized: use sum() instead of torch.sum(torch.stack(...)) to avoid extra allocation + out = sum(res + xs[-1:]) return out @@ -706,10 +867,14 @@ def __init__(self, ed) -> None: self.conv1 = Conv(ed, ed, 3, 1, 1, g=ed, act=False) self.dim = ed self.act = nn.SiLU() - + def forward(self, x): - return self.act(self.conv(x) + self.conv1(x)) - + # Support both unfused (has conv1) and fused (conv1 removed) states + c1 = getattr(self, 'conv1', None) + if c1 is None: + return self.act(self.conv(x)) + return self.act(self.conv(x) + c1(x)) + def forward_fuse(self, x): return self.act(self.conv(x)) @@ -717,12 +882,12 @@ def forward_fuse(self, 
x): def fuse(self): conv = fuse_conv_and_bn(self.conv.conv, self.conv.bn) conv1 = fuse_conv_and_bn(self.conv1.conv, self.conv1.bn) - + conv_w = conv.weight conv_b = conv.bias conv1_w = conv1.weight conv1_b = conv1.bias - + conv1_w = torch.nn.functional.pad(conv1_w, [2,2,2,2]) final_conv_w = conv_w + conv1_w @@ -732,7 +897,12 @@ def fuse(self): conv.bias.data.copy_(final_conv_b) self.conv = conv - del self.conv1 + # Remove conv1 and switch to fused forward for efficiency + if hasattr(self, 'conv1'): + del self.conv1 + self.forward = self.forward_fuse + self.fused = True + class CIB(nn.Module): """Standard bottleneck.""" @@ -753,10 +923,19 @@ def __init__(self, c1, c2, shortcut=True, e=0.5, lk=False): self.add = shortcut and c1 == c2 + def fuse_custom(self): + for m in self.cv1: + if isinstance(m, Conv) and not getattr(m, 'fused', False): + m.fuse_custom() + elif isinstance(m, RepVGGDW) and not getattr(m, 'fused', False): + m.fuse() + return self + def forward(self, x): """'forward()' applies the YOLO FPN to input data.""" return x + self.cv1(x) if self.add else self.cv1(x) + class C2fCIB(C2f): """Faster Implementation of CSP Bottleneck with 2 convolutions.""" @@ -767,6 +946,15 @@ def __init__(self, c1, c2, n=1, shortcut=False, lk=False, g=1, e=0.5): super().__init__(c1, c2, n, shortcut, g, e) self.m = nn.ModuleList(CIB(self.c, self.c, shortcut, e=1.0, lk=lk) for _ in range(n)) + def fuse_custom(self): + if not getattr(self.cv1, 'fused', False): + self.cv1.fuse_custom() + if not getattr(self.cv2, 'fused', False): + self.cv2.fuse_custom() + for block in self.m: + block.fuse_custom() + return self + class Attention(nn.Module): def __init__(self, dim, num_heads=8, @@ -796,6 +984,13 @@ def forward(self, x): x = self.proj(x) return x + def fuse_custom(self): + for m in [self.qkv, self.proj, self.pe]: + if hasattr(m, "fuse_custom") and not getattr(m, "fused", False): + m.fuse_custom() + return self + + class PSA(nn.Module): def __init__(self, c1, c2, e=0.5): @@ -804,19 
+999,32 @@ def __init__(self, c1, c2, e=0.5): self.c = int(c1 * e) self.cv1 = Conv(c1, 2 * self.c, 1, 1) self.cv2 = Conv(2 * self.c, c1, 1) - + self.attn = Attention(self.c, attn_ratio=0.5, num_heads=self.c // 64) self.ffn = nn.Sequential( Conv(self.c, self.c*2, 1), Conv(self.c*2, self.c, 1, act=False) ) - + def forward(self, x): a, b = self.cv1(x).split((self.c, self.c), dim=1) b = b + self.attn(b) b = b + self.ffn(b) return self.cv2(torch.cat((a, b), 1)) + def fuse_custom(self): + if not getattr(self.cv1, 'fused', False): + self.cv1.fuse_custom() + if not getattr(self.cv2, 'fused', False): + self.cv2.fuse_custom() + for m in self.ffn: + if isinstance(m, Conv) and not getattr(m, 'fused', False): + m.fuse_custom() + if hasattr(self.attn, "fuse_custom"): + self.attn.fuse_custom() + return self + + class SCDown(nn.Module): def __init__(self, c1, c2, k, s): super().__init__() @@ -824,4 +1032,10 @@ def __init__(self, c1, c2, k, s): self.cv2 = Conv(c2, c2, k=k, s=s, g=c2, act=False) def forward(self, x): - return self.cv2(self.cv1(x)) \ No newline at end of file + return self.cv2(self.cv1(x)) + + def fuse_custom(self): + for m in [self.cv1, self.cv2]: + if hasattr(m, "fuse_custom") and not getattr(m, "fused", False): + m.fuse_custom() + return self \ No newline at end of file diff --git a/doclayout_yolo/nn/modules/conv.py b/doclayout_yolo/nn/modules/conv.py index 399c422..d9c6c39 100644 --- a/doclayout_yolo/nn/modules/conv.py +++ b/doclayout_yolo/nn/modules/conv.py @@ -44,15 +44,58 @@ def __init__(self, c1, c2, k=1, s=1, p=None, g=1, d=1, act=True): self.conv = nn.Conv2d(c1, c2, k, s, autopad(k, p, d), groups=g, dilation=d, bias=False) self.bn = nn.BatchNorm2d(c2) self.act = self.default_act if act is True else act if isinstance(act, nn.Module) else nn.Identity() + self.fused = False def forward(self, x): """Apply convolution, batch normalization and activation to input tensor.""" + # Guard: if already fused or bn was removed/Identity, skip bn + if getattr(self, 
"fused", False) or not hasattr(self, "bn") or isinstance(getattr(self, "bn", None), nn.Identity): + return self.act(self.conv(x)) return self.act(self.bn(self.conv(x))) def forward_fuse(self, x): """Perform transposed convolution of 2D data.""" return self.act(self.conv(x)) + def fuse_custom(self): + """Fuse Conv2d() and BatchNorm2d() layers into a single Conv2d with bias.""" + if getattr(self, "fused", False) or not hasattr(self, "bn"): + self.fused = True + return self + + w = self.conv.weight + b = self.conv.bias if self.conv.bias is not None else torch.zeros( + self.conv.out_channels, device=w.device, dtype=w.dtype + ) + + bn_mean = self.bn.running_mean + bn_var = self.bn.running_var + bn_weight = self.bn.weight + bn_bias = self.bn.bias + + std = torch.sqrt(bn_var + self.bn.eps) + scale = (bn_weight / std).view(-1, 1, 1, 1) + + fused_weight = w * scale + fused_bias = bn_weight * (b - bn_mean) / std + bn_bias + + self.conv = nn.Conv2d( + self.conv.in_channels, + self.conv.out_channels, + self.conv.kernel_size, + self.conv.stride, + self.conv.padding, + self.conv.dilation, + self.conv.groups, + bias=True + ) + self.conv.weight.data = fused_weight + self.conv.bias.data = fused_bias + + self.bn = nn.Identity() + self.fused = True + return self + class Conv2(Conv): """Simplified RepConv module with Conv fusing.""" @@ -330,4 +373,4 @@ def __init__(self, dimension=1): def forward(self, x): """Forward pass for the YOLOv8 mask Proto module.""" - return torch.cat(x, self.d) + return torch.cat(x, self.d) \ No newline at end of file diff --git a/doclayout_yolo/nn/modules/g2l_crm.py b/doclayout_yolo/nn/modules/g2l_crm.py index 62466b1..4b4dcab 100644 --- a/doclayout_yolo/nn/modules/g2l_crm.py +++ b/doclayout_yolo/nn/modules/g2l_crm.py @@ -9,6 +9,7 @@ from torch import nn, Tensor import torch.nn.functional as F + class DilatedBlock(nn.Module): """Standard bottleneck with dilated convolution.""" @@ -21,23 +22,38 @@ def __init__(self, c, dilation, k, fuse="sum", 
shortcut=True): self.k = k self.cv2 = Conv(c, c, k=1, s=1) self.add = shortcut - + self.fuse = fuse if fuse == "glu": self.conv_gating = Conv(c*len(self.dilation), c*len(self.dilation), k=1, s=1, g=c*len(self.dilation)) self.conv1x1 = Conv(c*len(self.dilation), c, k=1, s=1, g=c) elif fuse == "sum": self.conv1x1 = Conv(c, c, k=1, s=1, g=c) - + self.dcv = Conv(c, c, k=self.k, s=1) def dilated_conv(self, x, dilation): - act = self.dcv.act - bn = self.dcv.bn + """Apply dilated convolution using the shared Conv weight. + + Handles fused checkpoints where `bn` may be missing or replaced with nn.Identity. + """ weight = self.dcv.conv.weight - padding = dilation * (self.k//2) - return act(bn(F.conv2d(x, weight, stride=1, padding=padding, dilation=dilation))) - + padding = dilation * (self.k // 2) + + # Check if bn exists and is a real BatchNorm (not Identity) + bn = getattr(self.dcv, "bn", None) + has_bn = isinstance(bn, nn.BatchNorm2d) + + if has_bn: + x = F.conv2d(x, weight, stride=1, padding=padding, dilation=dilation) + x = bn(x) + else: + # Fused: use bias directly from conv + bias = getattr(self.dcv.conv, "bias", None) + x = F.conv2d(x, weight, bias=bias, stride=1, padding=padding, dilation=dilation) + + return self.dcv.act(x) + def forward(self, x): """'forward()' applies the YOLO FPN to input data.""" dx = [self.dilated_conv(x, d) for d in self.dilation] @@ -48,13 +64,19 @@ def forward(self, x): dx = dx * G # Element-wise multiplication dx = self.conv1x1(dx) elif self.fuse == "sum": - dx = [_dx.unsqueeze(0) for _dx in dx] - dx = torch.cat(dx, dim=0) - dx = torch.sum(dx, dim=0) + # Optimized: avoid torch.stack allocation of [n_dilation, B, C, H, W] + dx = sum(dx) dx = self.conv1x1(dx) - + return x + dx if self.add else dx - + + def fuse_custom(self): + """Fuse Conv+BN in all submodules.""" + for m in [self.cv2, self.conv1x1, self.dcv]: + if hasattr(m, "fuse_custom") and not getattr(m, "fused", False): + m.fuse_custom() + return self + class 
DilatedBottleneck(nn.Module): """Standard bottleneck with dilated convolution.""" @@ -64,18 +86,28 @@ def __init__(self, c1, c2, shortcut=True, dilation=[1,2,3], block_k=3, fuse="sum expansion. """ super().__init__() - + c_ = int(c2 * e) # hidden channels self.cv1 = Conv(c1, c_, k[0], 1) self.cv2 = Conv(c_, c2, k[1], 1, g=g) - + self.dilated_block = DilatedBlock(c_, dilation, block_k, fuse) self.add = shortcut and c1 == c2 def forward(self, x): """'forward()' applies the YOLO FPN to input data.""" return x + self.cv2(self.dilated_block(self.cv1(x))) if self.add else self.cv2(self.dilated_block(self.cv1(x))) - + + def fuse_custom(self): + """Fuse Conv+BN in all submodules.""" + for m in [self.cv1, self.cv2]: + if hasattr(m, "fuse_custom") and not getattr(m, "fused", False): + m.fuse_custom() + if hasattr(self.dilated_block, "fuse_custom"): + self.dilated_block.fuse_custom() + return self + + class G2L_CRM(nn.Module): """Faster Implementation of CSP Bottleneck with 2 convolutions.""" @@ -119,4 +151,14 @@ def forward_split(self, x): y = list(self.cv1(x).split((self.c, self.c), 1)) for m in self.m: y.append(m(y[-1])) - return self.cv2(torch.cat(y, 1)) \ No newline at end of file + return self.cv2(torch.cat(y, 1)) + + def fuse_custom(self): + """Fuse all Conv+BN in G2L_CRM.""" + for m in [self.cv1, self.cv2]: + if hasattr(m, "fuse_custom") and not getattr(m, "fused", False): + m.fuse_custom() + for m in self.m: + if hasattr(m, "fuse_custom"): + m.fuse_custom() + return self \ No newline at end of file From 12a89e9cddf18d45563c5f814a2c8631862fc32d Mon Sep 17 00:00:00 2001 From: ayaanmustafa Date: Thu, 11 Jun 2026 03:09:45 +0530 Subject: [PATCH 3/3] chore(benchmarks): add CPU inference benchmark with opt-in toggles benchmark_cpu_inference.py tests --channels-last and --fuse as opt-in toggles, measuring raw forward and full predict pipeline. No hardcoded paths; uses argparse for model/image paths. 
---
 benchmarks/benchmark_cpu_inference.py | 247 ++++++++++++++++++++++++++
 1 file changed, 247 insertions(+)
 create mode 100644 benchmarks/benchmark_cpu_inference.py

diff --git a/benchmarks/benchmark_cpu_inference.py b/benchmarks/benchmark_cpu_inference.py
new file mode 100644
index 0000000..05b28a1
--- /dev/null
+++ b/benchmarks/benchmark_cpu_inference.py
@@ -0,0 +1,247 @@
+#!/usr/bin/env python3
+"""
+DocLayout-YOLO CPU Inference Benchmark
+======================================
+Benchmarks CPU inference optimizations as opt-in toggles:
+  --channels-last : Use NHWC memory format (typically ~1.3-1.4x speedup on CPU)
+  --fuse          : Apply fuse_custom() recursively (safe, idempotent)
+
+Usage:
+    python benchmark_cpu_inference.py --model model.pt --image img.png
+    python benchmark_cpu_inference.py --model model.pt --image img.png --channels-last
+    python benchmark_cpu_inference.py --model model.pt --image img.png --channels-last --fuse
+"""
+import argparse
+import gc
+import time
+import warnings
+from contextlib import contextmanager
+from pathlib import Path
+
+import cv2
+import numpy as np
+import torch
+import torch.nn as nn
+
+from doclayout_yolo import YOLOv10
+
+warnings.filterwarnings("ignore", category=UserWarning)
+
+
+# ── CONFIG ─────────────────────────────────────────────
+DEFAULT_IMGSZ = 1024
+DEFAULT_CONF = 0.2
+DEFAULT_DEVICE = "cpu"
+N_WARMUP_FWD = 5
+N_ITERS_FWD = 20
+N_WARMUP_PRED = 2
+N_ITERS_PRED = 5
+# ───────────────────────────────────────────────────────
+
+# Fuse entry points probed on each module, in priority order.
+_FUSE_METHODS = ("fuse_custom", "fuse")
+
+
+@contextmanager
+def cache_flush():
+    """Collect garbage and churn a large scratch tensor, then yield; collect again on exit.
+
+    NOTE(review): the 20M-element scratch tensor is presumably there to evict the
+    benchmark's working set from CPU caches -- confirm the intent with the author.
+    """
+    gc.collect()
+    _ = torch.randn(20_000_000).mul_(2).sum().item()
+    try:
+        yield
+    finally:
+        # Run the trailing collection even if the timed body raises.
+        gc.collect()
+
+
+def _sync_device(device) -> None:
+    """Wait for queued CUDA kernels to finish; do nothing for non-CUDA devices.
+
+    CUDA launches are asynchronous, so a wall-clock timer read without this
+    barrier measures launch overhead rather than execution time.
+    """
+    name = str(device)
+    if name.startswith("cuda") and torch.cuda.is_available():
+        torch.cuda.synchronize(torch.device(name))
+
+
+def preprocess(path: str, size: int) -> torch.Tensor:
+    """Read an image and return it as a float tensor of shape (1, 3, size, size) in [0, 1].
+
+    The image is converted from BGR to RGB and resized to a square without
+    preserving its aspect ratio.
+
+    Raises:
+        FileNotFoundError: If OpenCV cannot load ``path``.
+    """
+    im = cv2.imread(path)
+    if im is None:
+        raise FileNotFoundError(path)
+    im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)
+    im = cv2.resize(im, (size, size))
+    return torch.from_numpy(im).permute(2, 0, 1).unsqueeze(0).float() / 255.0
+
+
+def recursive_fuse(model: nn.Module) -> int:
+    """Call fuse_custom() -- or fuse() when that is absent -- on every module of ``model``.
+
+    Modules whose ``fused`` attribute is truthy are skipped. A module that raises
+    while fusing is reported and left as it is so the benchmark can continue.
+
+    Returns:
+        Number of modules whose fuse method completed without raising.
+    """
+    fused_count = 0
+    for name, m in model.named_modules():
+        method = next((n for n in _FUSE_METHODS if callable(getattr(m, n, None))), None)
+        if method is None or getattr(m, "fused", False):
+            continue
+        try:
+            getattr(m, method)()
+        except Exception as e:  # best effort: one unfusable layer must not abort the run
+            print(f" [WARN] {method} failed on {name}: {e}")
+        else:
+            fused_count += 1
+    return fused_count
+
+
+def benchmark_forward(model, x, n_warmup=5, n_iters=20, channels_last=False, device="cpu"):
+    """Return the mean latency, in milliseconds, of ``model(x)`` over ``n_iters`` calls.
+
+    Args:
+        model: Module to time; it is put in eval mode and moved to ``device``.
+        x: Input tensor, moved to ``device``.
+        n_warmup: Untimed forward passes run first.
+        n_iters: Timed forward passes; must be at least 1.
+        channels_last: Convert ``x`` and ``model`` to channels_last memory format.
+        device: Target device string, e.g. "cpu" or "cuda".
+
+    Raises:
+        ValueError: If ``n_iters`` is less than 1.
+    """
+    if n_iters < 1:
+        raise ValueError(f"n_iters must be >= 1, got {n_iters}")
+    model = model.eval().to(device)
+    x = x.to(device)
+    if channels_last:
+        x = x.to(memory_format=torch.channels_last)
+        model = model.to(memory_format=torch.channels_last)
+    with torch.no_grad():
+        for _ in range(n_warmup):
+            _ = model(x)
+        _sync_device(device)  # warmup work must not leak into the timed region
+        with cache_flush():
+            t0 = time.perf_counter()
+            for _ in range(n_iters):
+                _ = model(x)
+            _sync_device(device)
+            # Read the clock inside the block so cache_flush's exit-time gc is not timed.
+            elapsed = time.perf_counter() - t0
+    return elapsed / n_iters * 1000
+
+
+def benchmark_predict(model, path, imgsz, conf, n_warmup=2, n_iters=5, device="cpu"):
+    """Return the mean latency, in milliseconds, of ``model.predict`` on one image.
+
+    Unlike ``benchmark_forward`` this times whatever ``predict`` does end to end.
+
+    Raises:
+        ValueError: If ``n_iters`` is less than 1.
+    """
+    if n_iters < 1:
+        raise ValueError(f"n_iters must be >= 1, got {n_iters}")
+    for _ in range(n_warmup):
+        _ = model.predict(path, imgsz=imgsz, conf=conf, device=device, verbose=False)
+    _sync_device(device)
+    gc.collect()
+    t0 = time.perf_counter()
+    for _ in range(n_iters):
+        _ = model.predict(path, imgsz=imgsz, conf=conf, device=device, verbose=False)
+    _sync_device(device)
+    return (time.perf_counter() - t0) / n_iters * 1000
+
+
+def main():
+    """Parse the command line, run both benchmarks and print a summary."""
+    parser = argparse.ArgumentParser(description="DocLayout-YOLO CPU Inference Benchmark")
+    parser.add_argument("--model", required=True, help="Path to .pt model")
+    parser.add_argument("--image", required=True, help="Path to test image")
+    parser.add_argument("--imgsz", type=int, default=DEFAULT_IMGSZ, help="Inference size")
+    parser.add_argument("--conf", type=float, default=DEFAULT_CONF, help="Confidence threshold")
+    parser.add_argument("--device", default=DEFAULT_DEVICE, help="Device (cpu/cuda)")
+    parser.add_argument("--channels-last", action="store_true", help="Use channels_last (NHWC) memory format")
+    parser.add_argument("--fuse", action="store_true", help="Apply recursive fuse_custom() (safe, idempotent)")
+    parser.add_argument("--save", default=None, help="Optional path to save annotated result image")
+    args = parser.parse_args()
+
+    print("=" * 70)
+    print(" DocLayout-YOLO CPU Inference Benchmark")
+    print("=" * 70)
+    print(f"PyTorch: {torch.__version__}")
+    print(f"Device: {args.device} ({torch.get_num_threads()} threads)")
+    print(f"Model: {args.model}")
+    print(f"Image: {args.image}")
+    print(f"Size: {args.imgsz}")
+    print(f"Opts: channels_last={args.channels_last}, fuse={args.fuse}")
+    print("=" * 70)
+
+    x = preprocess(args.image, args.imgsz)
+
+    # ── Load model ──
+    print("\nLoading model...")
+    model = YOLOv10(args.model)
+    model = model.to(args.device)
+
+    # ── Optional: recursive fuse_custom ──
+    if args.fuse:
+        print("Applying fuse_custom()...")
+        n = recursive_fuse(model.model)
+        print(f" Fused {n} modules.")
+
+    # ── Optional: channels_last ──
+    if args.channels_last:
+        print("Converting to channels_last (NHWC)...")
+        x = x.to(memory_format=torch.channels_last)
+        model.model = model.model.to(memory_format=torch.channels_last)
+
+    # ── Benchmark raw forward ──
+    print("\nBenchmarking raw forward pass...")
+    t_fwd = benchmark_forward(
+        model.model, x,
+        n_warmup=N_WARMUP_FWD, n_iters=N_ITERS_FWD,
+        channels_last=False,  # already applied above
+        device=args.device,
+    )
+    print(f" Raw forward: {t_fwd:.2f} ms")
+
+    # ── Benchmark full predict pipeline ──
+    print("\nBenchmarking full predict pipeline...")
+    t_pred = benchmark_predict(
+        model, args.image,
+        imgsz=args.imgsz, conf=args.conf,
+        n_warmup=N_WARMUP_PRED, n_iters=N_ITERS_PRED,
+        device=args.device,
+    )
+    print(f" Full predict: {t_pred:.2f} ms")
+
+    # ── Optional: save result ──
+    if args.save:
+        print(f"\nSaving result to {args.save}...")
+        result = model.predict(args.image, imgsz=args.imgsz, conf=args.conf, device=args.device, verbose=False)[0]
+        result.save(args.save)
+
+    # ── Summary ──
+    print(f"\n{'='*70}")
+    print(" RESULTS")
+    print(f"{'='*70}")
+    print(f" Raw forward: {t_fwd:>10.2f} ms")
+    print(f" Full predict: {t_pred:>10.2f} ms")
+    print(f"{'='*70}")
+
+
+if __name__ == "__main__":
+    main()