
Commit 06d37df

[Advanced Compiler] Add Unfold backward (flagos-ai#1784)
1 parent 8bc56db commit 06d37df

5 files changed

Lines changed: 162 additions & 0 deletions
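
For context, a minimal sketch of the op being differentiated (not part of the commit): Tensor.unfold(dim, size, step) extracts sliding windows of length size with stride step along dim, replacing that dim with the window count and appending the window size as a new last dim. aten.unfold_backward is its gradient: it scatters the windowed gradient back into the input shape, summing where windows overlap.

import torch

x = torch.arange(6.0)    # shape (6,)
w = x.unfold(0, 4, 2)    # shape (2, 4): windows x[0:4] and x[2:6]
print(w)
# tensor([[0., 1., 2., 3.],
#         [2., 3., 4., 5.]])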


benchmark/test_special_perf.py

Lines changed: 40 additions & 0 deletions
@@ -905,3 +905,43 @@ def input_kwargs(shape, dtype, device):
     )
     bench.set_gems(flag_gems.per_token_group_quant_fp8)
     bench.run()
+
+
+@pytest.mark.unfold
+def test_perf_unfold_backward():
+    def unfold_backward_input_fn(config, dtype, device):
+        input_sizes, dim, size, step = config
+        d = dim % len(input_sizes)
+        num_windows = (input_sizes[d] - size) // step + 1
+        grad_shape = (
+            list(input_sizes[:d]) + [num_windows] + list(input_sizes[d + 1 :]) + [size]
+        )
+        grad_in = torch.randn(grad_shape, dtype=dtype, device=device)
+        yield grad_in, list(input_sizes), dim, size, step
+
+    class UnfoldBackwardBenchmark(Benchmark):
+        def set_shapes(self, shape_file_path=None):
+            self.shapes = [
+                ((32, 64), 1, 16, 16),
+                ((16, 33), 0, 5, 2),
+                ((4, 8, 12), -1, 6, 4),
+                ((7, 13), 1, 13, 3),
+                ((6, 20), 1, 7, 4),
+                ((2, 3, 17), -1, 9, 1),
+                ((2, 17), 1, 4, 6),
+            ]
+
+        def set_more_shapes(self):
+            return None
+
+        def get_input_iter(self, cur_dtype):
+            for config in self.shapes:
+                yield from unfold_backward_input_fn(config, cur_dtype, self.device)
+
+    bench = UnfoldBackwardBenchmark(
+        op_name="unfold_backward",
+        torch_op=torch.ops.aten.unfold_backward,
+        dtypes=[torch.float16, torch.float32, torch.bfloat16],
+    )
+    bench.set_gems(flag_gems.unfold_backward)
+    bench.run()
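
To make the generator's shape arithmetic concrete, a worked example for the second config above (illustrative, not part of the diff): unfolding dim 0 of a (16, 33) input with size=5, step=2 gives (16 - 5) // 2 + 1 = 6 windows, so grad_in must have shape (6, 33, 5).

import torch

x = torch.empty(16, 33)
print(x.unfold(0, 5, 2).shape)  # torch.Size([6, 33, 5]), the expected grad_in shape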

src/flag_gems/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -344,6 +344,7 @@ def torch_ge(v):
     ("true_divide.Tensor", true_divide),
     ("true_divide_.Scalar", true_divide_),
     ("true_divide_.Tensor", true_divide_),
+    ("unfold_backward", unfold_backward),
     ("uniform_", uniform_),
     ("upsample_linear1d", upsample_linear1d),
     ("upsample_nearest1d", upsample_nearest1d),

src/flag_gems/ops/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -217,6 +217,7 @@
 from flag_gems.ops.topk import topk
 from flag_gems.ops.trace import trace
 from flag_gems.ops.triu import triu, triu_
+from flag_gems.ops.unfold_backward import unfold_backward
 from flag_gems.ops.uniform import uniform_
 from flag_gems.ops.unique import _unique2
 from flag_gems.ops.upsample_bicubic2d_aa import _upsample_bicubic2d_aa
@@ -528,6 +529,7 @@
     "true_divide",
     "true_divide_",
     "true_divide_out",
+    "unfold_backward",
     "uniform_",
     "upsample_linear1d",
     "upsample_nearest1d",
src/flag_gems/ops/unfold_backward.py

Lines changed: 88 additions & 0 deletions

import logging

import torch
import triton
import triton.language as tl

logger = logging.getLogger(__name__)


@triton.jit
def _unfold_backward_kernel(
    grad_in_ptr,
    grad_out_ptr,
    numel_in,
    prod_after,
    L,
    size,
    step,
    D,
    inner_total,
    BLOCK: tl.constexpr,
):
    pid = tl.program_id(0)
    offs = pid * BLOCK + tl.arange(0, BLOCK)
    mask = offs < numel_in

    vals = tl.load(grad_in_ptr + offs, mask=mask, other=0)
    vals_f32 = tl.cast(vals, tl.float32)

    # grad_in is treated as a contiguous (before..., L, after..., size)
    # tensor; decompose the linear offset into those coordinates.
    k = offs % size  # position inside the window
    tmp1 = offs // size
    after_lin = tmp1 % prod_after  # linear index over trailing dims
    tmp2 = offs // (prod_after * size)
    s = tmp2 % L  # window index along the unfolded dim
    before_lin = offs // inner_total  # linear index over leading dims

    # Window s starts at s * step, so element k of the window maps to
    # position s * step + k of the original dimension (length D).
    pos = s * step + k

    out_id = ((before_lin * D) + pos) * prod_after + after_lin

    # Overlapping windows (step < size) write to the same output slot,
    # so the accumulation must be atomic.
    tl.atomic_add(grad_out_ptr + out_id, vals_f32, mask=mask)


def unfold_backward(
    grad_in: torch.Tensor, input_sizes, dim: int, size: int, step: int
) -> torch.Tensor:
    logger.debug("GEMS UNFOLD BACKWARD")
    if step <= 0:
        raise ValueError("step must be > 0")

    if not isinstance(input_sizes, (list, tuple)):
        input_sizes = list(input_sizes)
    input_sizes = [int(s) for s in input_sizes]
    ndim = len(input_sizes)
    d = dim % ndim

    D = int(input_sizes[d])
    L = (D - int(size)) // int(step) + 1  # window count, as in Tensor.unfold

    prod_after = 1
    for s_ in input_sizes[d + 1 :]:
        prod_after *= int(s_)
    inner_total = int(L) * int(prod_after) * int(size)

    device = grad_in.device
    # Accumulate in float32 regardless of input dtype: atomic adds on
    # low-precision dtypes lose accuracy.
    grad_out_f32 = torch.zeros(input_sizes, dtype=torch.float32, device=device)

    numel_in = grad_in.numel()

    BLOCK = 128
    grid = lambda meta: (triton.cdiv(numel_in, meta["BLOCK"]),)

    _unfold_backward_kernel[grid](
        grad_in,  # NOTE: the kernel's linear indexing assumes grad_in is contiguous
        grad_out_f32,
        numel_in,
        prod_after,
        L,
        size,
        step,
        D,
        inner_total,
        BLOCK=BLOCK,
    )

    if grad_in.dtype != torch.float32:
        return grad_out_f32.to(grad_in.dtype)
    return grad_out_f32
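
The tl.atomic_add is needed because windows overlap whenever step < size, so several grad_in elements land on the same output slot. A tiny worked check against the ATen reference (illustrative only, not part of the commit):

import torch

# size=4, step=2 on a length-6 dim: windows [0:4] and [2:6] overlap at
# positions 2 and 3, so the gradient sums two contributions there.
g = torch.ones(2, 4)  # gradient for 2 windows of size 4
print(torch.ops.aten.unfold_backward(g, [6], 0, 4, 2))
# tensor([1., 1., 2., 2., 1., 1.])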

tests/test_special_ops.py

Lines changed: 31 additions & 0 deletions
@@ -1929,3 +1929,34 @@ def _verify_expert_level_sorting(
     gems_assert_close(
         num_tokens_post_pad, to_reference(num_tokens_post_pad_vllm), dtype=dtype
     )
+
+
+@pytest.mark.unfold
+@pytest.mark.parametrize(
+    "input_sizes, dim, size, step",
+    [
+        ((32, 64), 1, 16, 16),
+        ((16, 33), 0, 5, 2),
+        ((4, 8, 12), -1, 6, 4),
+        ((7, 13), 1, 13, 3),
+        ((6, 20), 1, 7, 4),
+        ((2, 3, 17), -1, 9, 1),
+        ((2, 17), 1, 4, 6),
+    ],
+)
+@pytest.mark.parametrize("dtype", [torch.float16, torch.float32, torch.bfloat16])
+def test_unfold_backward(input_sizes, dim, size, step, dtype):
+    d = dim % len(input_sizes)
+    num_windows = (input_sizes[d] - size) // step + 1
+    grad_shape = (
+        list(input_sizes[:d]) + [num_windows] + list(input_sizes[d + 1 :]) + [size]
+    )
+
+    grad_in = torch.randn(grad_shape, dtype=dtype, device=device)
+
+    ref_grad = to_reference(grad_in, True)
+    ref_out = torch.ops.aten.unfold_backward(ref_grad, input_sizes, dim, size, step)
+
+    with flag_gems.use_gems():
+        res_out = flag_gems.unfold_backward(grad_in, input_sizes, dim, size, step)
+    gems_assert_close(res_out, ref_out, dtype, reduce_dim=size)
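
An independent cross-check of the kernel's index math, for anyone extending the tests: backpropagating through Tensor.unfold itself yields the reference gradient by construction. A sketch (the helper name is hypothetical, not part of the suite):

import torch

def unfold_backward_autograd(grad_in, input_sizes, dim, size, step):
    # Differentiate the forward op directly; what autograd produces is,
    # by definition, what unfold_backward must return.
    x = torch.zeros(
        input_sizes, dtype=grad_in.dtype, device=grad_in.device, requires_grad=True
    )
    x.unfold(dim, size, step).backward(grad_in)
    return x.grad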
