[SiliconFlow] Op: Reflection pad2d (flagos-ai#1930)

fpzh2011 · web-flow · commit 5589e3bc46c8 · 2026-03-24T16:57:32.000+08:00
diff --git a/benchmark/test_special_perf.py b/benchmark/test_special_perf.py
@@ -3,6 +3,7 @@
 
 import pytest
 import torch
+import triton
 
 import flag_gems
 from benchmark.attri_util import BOOL_DTYPES, FLOAT_DTYPES, INT_DTYPES, BenchLevel
@@ -1025,6 +1026,54 @@ def input_kwargs(shape, dtype, device):
     bench.run()
 
 
+@pytest.mark.reflection_pad2d
+@pytest.mark.parametrize(
+    "shape",
+    [
+        (3, 33, 33),
+        (2, 4, 32, 64),
+        (8, 16, 64, 64),
+        (32, 64, 128, 256),
+        (16, 32, 64, 128),
+    ],
+)
+@pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16])
+@pytest.mark.parametrize(
+    "padding",
+    [
+        (1, 1, 1, 1),
+        (2, 3, 2, 3),
+        (3, 5, 3, 5),
+        (0, 4, 0, 4),
+    ],
+)
+def test_reflection_pad2d_benchmark_tensor(shape, dtype, padding):
+    quantiles = [0.5, 0.2, 0.8]
+
+    x = torch.randn(shape, dtype=dtype, device=flag_gems.device)
+    ref_x = x.clone()
+
+    # PyTorch reference implementation
+    ms_torch, _, _ = triton.testing.do_bench(
+        lambda: torch.ops.aten.reflection_pad2d(ref_x, padding),
+        rep=100,
+        quantiles=quantiles,
+    )
+
+    # Triton implementation
+    with flag_gems.use_gems():
+        ms_triton, _, _ = triton.testing.do_bench(
+            lambda: flag_gems.reflection_pad2d(x, padding), rep=100, quantiles=quantiles
+        )
+
+    # Calculate speedup and return result
+    speedup = ms_torch / ms_triton
+
+    print(f"reflection_pad2d {shape} {dtype}:")
+    print(f"  FlagGems: {ms_triton:.3f}ms")
+    print(f"  Speedup: {speedup:.2f}x")
+
+
 @pytest.mark.upsample_bicubic2d
 @pytest.mark.parametrize("align_corners", [False, True])
 def test_perf_upsample_bicubic2d(align_corners):
diff --git a/src/flag_gems/__init__.py b/src/flag_gems/__init__.py
@@ -307,6 +307,8 @@ def torch_ge(v):
     ("randperm", randperm),
     ("reciprocal", reciprocal),
     ("reciprocal_", reciprocal_),
+    ("reflection_pad2d", reflection_pad2d),
+    ("reflection_pad2d.out", reflection_pad2d_out),
     ("relu", relu),
     ("relu_", relu_),
     ("relu6", relu6),
diff --git a/src/flag_gems/ops/__init__.py b/src/flag_gems/ops/__init__.py
@@ -205,6 +205,7 @@
 from flag_gems.ops.randn_like import randn_like
 from flag_gems.ops.randperm import randperm
 from flag_gems.ops.reciprocal import reciprocal, reciprocal_
+from flag_gems.ops.reflection_pad2d import reflection_pad2d, reflection_pad2d_out
 from flag_gems.ops.relu import relu, relu_
 from flag_gems.ops.relu6 import relu6
 from flag_gems.ops.repeat import repeat
@@ -523,6 +524,8 @@
     "randperm",
     "reciprocal",
     "reciprocal_",
+    "reflection_pad2d",
+    "reflection_pad2d_out",
     "relu",
     "relu_",
     "relu6",
diff --git a/src/flag_gems/ops/reflection_pad2d.py b/src/flag_gems/ops/reflection_pad2d.py
@@ -0,0 +1,157 @@
+import logging
+import math
+
+import torch
+import triton
+import triton.language as tl
+
+logger = logging.getLogger(__name__)
+
+
+@triton.jit
+def reflection_pad2d_kernel(
+    in_ptr,
+    out_ptr,
+    B,
+    H_in,
+    W_in,
+    pad_left,
+    pad_top,
+    H_out,
+    W_out,
+    BLOCK_HW: tl.constexpr,
+):
+    pid_b = tl.program_id(axis=0)
+    pid_n = tl.program_id(axis=1)
+
+    # Flatten 2D index to 1D for block processing
+    offs_n = pid_n * BLOCK_HW + tl.arange(0, BLOCK_HW)
+    # Decode to (h, w) coordinates
+    h_idx = offs_n // W_out
+    w_idx = offs_n % W_out
+
+    mask = (offs_n < H_out * W_out) & (pid_b < B)
+
+    base_in = pid_b * (H_in * W_in)
+    base_out = pid_b * (H_out * W_out)
+
+    # Compute reflected indices for height
+    y = h_idx.to(tl.int32) - pad_top
+    Hm1 = H_in - 1
+    pH = 2 * Hm1
+    t_h = tl.abs(y)
+    m_h = t_h % pH
+    ih = tl.where(m_h < H_in, m_h, pH - m_h)
+
+    # Compute reflected indices for width
+    x = w_idx.to(tl.int32) - pad_left
+    Wm1 = W_in - 1
+    pW = 2 * Wm1
+    t_w = tl.abs(x)
+    m_w = t_w % pW
+    iw = tl.where(m_w < W_in, m_w, pW - m_w)
+
+    # Load from input and store to output
+    in_offs = ih * W_in + iw
+    vals = tl.load(in_ptr + base_in + in_offs, mask=mask, other=0)
+    tl.store(out_ptr + base_out + offs_n, vals, mask=mask)
+
+
+@triton.jit
+def copy_tensor_kernel(in_ptr, out_ptr, B, H, W, BLOCK_HW: tl.constexpr):
+    pid_b = tl.program_id(axis=0)
+    pid_n = tl.program_id(axis=1)
+
+    offs_n = pid_n * BLOCK_HW + tl.arange(0, BLOCK_HW)
+    mask = (offs_n < H * W) & (pid_b < B)
+
+    base = pid_b * (H * W)
+    vals = tl.load(in_ptr + base + offs_n, mask=mask, other=0)
+    tl.store(out_ptr + base + offs_n, vals, mask=mask)
+
+
+def launch_reflection_pad2d(input: torch.Tensor, padding, out: torch.Tensor = None):
+    # Validate padding format
+    if not isinstance(padding, (list, tuple)):
+        raise ValueError("padding must be a sequence")
+    if len(padding) != 4:
+        raise ValueError(
+            "padding must be a sequence of length 4: (pad_left, pad_right, pad_top, pad_bottom)"
+        )
+    pad_left, pad_right, pad_top, pad_bottom = [int(p) for p in padding]
+
+    # Validate padding values
+    if pad_left < 0 or pad_right < 0 or pad_top < 0 or pad_bottom < 0:
+        raise ValueError("padding values must be >= 0")
+
+    # Validate input
+    if input.dim() < 3:
+        raise ValueError("input must have at least 3 dimensions")
+    if not input.is_cuda:
+        raise ValueError("input must be a CUDA tensor")
+
+    x = input.contiguous()
+    H_in = int(x.shape[-2])
+    W_in = int(x.shape[-1])
+    # Validate reflection padding constraints
+    if H_in < 2 or W_in < 2:
+        raise ValueError(
+            "input spatial dimensions must be at least 2 for reflection padding when padding > 0"
+        )
+    if H_in <= 0 or W_in <= 0:
+        raise ValueError("spatial dimensions must be > 0")
+    if pad_left >= W_in or pad_right >= W_in or pad_top >= H_in or pad_bottom >= H_in:
+        raise ValueError(
+            "padding values must be less than the input spatial dimensions for reflection padding"
+        )
+
+    H_out = H_in + pad_top + pad_bottom
+    W_out = W_in + pad_left + pad_right
+
+    leading_shape = x.shape[:-2]
+    B = int(math.prod(leading_shape)) if len(leading_shape) > 0 else 1
+
+    # Handle output tensor
+    if out is None:
+        out = torch.empty(
+            (*leading_shape, H_out, W_out), device=x.device, dtype=x.dtype
+        )
+    else:
+        if not out.is_cuda:
+            raise ValueError("out must be a CUDA tensor")
+        expected_shape = (*leading_shape, H_out, W_out)
+        if tuple(out.shape) != expected_shape:
+            raise ValueError(
+                f"out tensor has shape {tuple(out.shape)}, expected {expected_shape}"
+            )
+        if out.dtype != x.dtype:
+            raise ValueError(
+                f"out dtype {out.dtype} does not match input dtype {x.dtype}"
+            )
+        if out.device != x.device:
+            raise ValueError("out must be on the same device as input")
+        out = out.contiguous()
+
+    # No padding: just copy
+    if pad_left == 0 and pad_right == 0 and pad_top == 0 and pad_bottom == 0:
+        BLOCK_HW = 256
+        grid = (B, triton.cdiv(H_in * W_in, BLOCK_HW))
+        copy_tensor_kernel[grid](x, out, B, H_in, W_in, BLOCK_HW=BLOCK_HW)
+        return out
+
+    BLOCK_HW = 256
+    grid = (B, triton.cdiv(H_out * W_out, BLOCK_HW))
+    reflection_pad2d_kernel[grid](
+        x, out, B, H_in, W_in, pad_left, pad_top, H_out, W_out, BLOCK_HW=BLOCK_HW
+    )
+    return out
+
+
+def reflection_pad2d(input: torch.Tensor, padding):
+    logger.debug("GEMS REFLECTION_PAD2D")
+    return launch_reflection_pad2d(input, padding, out=None)
+
+
+def reflection_pad2d_out(input: torch.Tensor, padding, out: torch.Tensor):
+    logger.debug("GEMS REFLECTION_PAD2D_OUT")
+    return launch_reflection_pad2d(input, padding, out=out)
diff --git a/tests/test_special_ops.py b/tests/test_special_ops.py
@@ -2060,6 +2060,83 @@ def _verify_expert_level_sorting(
     )
 
 
+@pytest.mark.reflection_pad2d
+@pytest.mark.parametrize(
+    "shape", [(3, 33, 33), (2, 4, 32, 64), (8, 16, 64, 64), (32, 64, 128, 256)]
+)
+@pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16])
+@pytest.mark.parametrize(
+    "padding",
+    [
+        (1, 1, 1, 1),
+        (2, 3, 2, 3),
+        (3, 5, 3, 5),
+        (0, 4, 0, 4),
+        (4, 0, 4, 0),
+    ],
+)
+def test_reflection_pad2d(shape, dtype, padding):
+    x = torch.randn(shape, dtype=dtype, device=flag_gems.device)
+
+    ref_x = to_reference(x)
+    ref_out = torch.ops.aten.reflection_pad2d(ref_x, padding)
+
+    with flag_gems.use_gems():
+        act_out = flag_gems.reflection_pad2d(x, padding)
+
+    gems_assert_close(act_out, ref_out, dtype, equal_nan=True)
+
+
+@pytest.mark.reflection_pad2d
+@pytest.mark.parametrize("padding", [[1, 1, 1, 1], [2, 3, 4, 5]])
+def test_reflection_pad2d_list_padding(padding):
+    # Test with list format: [pad_left, pad_right, pad_top, pad_bottom]
+    shape = (2, 4, 32, 64)
+    dtype = torch.float32
+    x = torch.randn(shape, dtype=dtype, device=flag_gems.device)
+
+    ref_x = to_reference(x.clone())
+    ref_out = torch.ops.aten.reflection_pad2d(ref_x, padding)
+
+    with flag_gems.use_gems():
+        act_out = flag_gems.reflection_pad2d(x, padding)
+
+    gems_assert_close(act_out, ref_out, dtype, equal_nan=True)
+
+
+@pytest.mark.reflection_pad2d
+def test_reflection_pad2d_empty_padding():
+    shape = (2, 4, 32, 64)
+    dtype = torch.float32
+    padding = (0, 0, 0, 0)
+    x = torch.randn(shape, dtype=dtype, device=flag_gems.device)
+
+    ref_x = to_reference(x.clone())
+    ref_out = torch.ops.aten.reflection_pad2d(ref_x, padding)
+
+    with flag_gems.use_gems():
+        act_out = flag_gems.reflection_pad2d(x, padding)
+
+    gems_assert_close(act_out, ref_out, dtype, equal_nan=True)
+
+
+@pytest.mark.reflection_pad2d
+@pytest.mark.parametrize("padding", [(1, 1, 1, 1), (2, 3, 4, 5)])
+def test_reflection_pad2d_3d_input(padding):
+    # Test with 3D input (C, H, W) - no batch dimension
+    shape = (3, 32, 64)
+    dtype = torch.float32
+    x = torch.randn(shape, dtype=dtype, device=flag_gems.device)
+
+    ref_x = to_reference(x.clone())
+    ref_out = torch.ops.aten.reflection_pad2d(ref_x, padding)
+
+    with flag_gems.use_gems():
+        act_out = flag_gems.reflection_pad2d(x, padding)
+
+    gems_assert_close(act_out, ref_out, dtype, equal_nan=True)
+
+
 @pytest.mark.upsample_bicubic2d
 @pytest.mark.parametrize(
     "N, C, H, W, outH, outW, align_corners, use_scale",