From 69fba2c4d160aaca93adbf88c71fe5aa2e756c68 Mon Sep 17 00:00:00 2001
From: Dmitry Nikolaev <139769634+dnikolaev-amd@users.noreply.github.com>
Date: Tue, 2 Jun 2026 16:59:50 +0200
Subject: [PATCH] [ROCm] skip two gaussian_blur tests on gfx90a

Skip two gaussian_blur CUDA tests on AMD gfx90a (MI200, MI250) that fail
due to small numerical differences with reference values. Other GPUs and
CPU paths are unchanged.

1. `test_transforms_tensor.py::test_gaussian_blur[3-meth_kwargs4-cuda]`
Failure: Batched GaussianBlur vs per-image calls disagree by 1 on a
single uint8 pixel after rounding from fp32.
Cause: MIOpen conv2d returns batched and per-image results that differ by
1 float32 ULP at a half-integer (batched: 188.50000000, per-image:
188.50001526), so rounding gives 188 vs 189. Not a transform logic bug.

2. `test_functional_tensor.py::test_gaussian_blur[gaussian_blur-sigma3-ksize2-dt3-large-cuda]`
Failure: Output exceeds atol=1.0 vs stored OpenCV reference (max diff
1.125 at known pixels).
Cause: Looks like an incorrect fp16 OpenCV reference value. CPU (174.0)
and gfx90a (173.875) both differ from OpenCV (175.0) but agree with each
other within ~0.125 (1 fp16 ULP).

Add gfx90a + ROCm + PYTEST_CURRENT_TEST guards to skip the failing tests.
---
 test/test_functional_tensor.py | 12 ++++++++++++
 test/test_transforms_tensor.py | 12 ++++++++++++
 2 files changed, 24 insertions(+)

diff --git a/test/test_functional_tensor.py b/test/test_functional_tensor.py
index 7d491372b77..aa124e7f20f 100644
--- a/test/test_functional_tensor.py
+++ b/test/test_functional_tensor.py
@@ -1009,6 +1009,18 @@ def test_crop(device, top, left, height, width):
 @pytest.mark.parametrize("fn", [F.gaussian_blur, torch.jit.script(F.gaussian_blur)])
 def test_gaussian_blur(device, image_size, dt, ksize, sigma, fn):
 
+    if all(
+        [
+            device == "cuda",
+            torch.version.hip is not None,
+            torch.cuda.is_available() and "gfx90a" in torch.cuda.get_device_properties().gcnArchName,
+            "[gaussian_blur-sigma3-ksize2-dt3-large-cuda]" in os.environ.get("PYTEST_CURRENT_TEST", "")
+        ]
+    ):
+        pytest.skip(
+            "Skipped on gfx90a because fp16 gaussian_blur differs from stored OpenCV reference by more then atol+ULP"
+        )
+
     # true_cv2_results = {
     #     # np_img = np.arange(3 * 10 * 12, dtype="uint8").reshape((10, 12, 3))
     #     # cv2.GaussianBlur(np_img, ksize=(3, 3), sigmaX=0.8)
diff --git a/test/test_transforms_tensor.py b/test/test_transforms_tensor.py
index eac52dafc17..0294ac43a5e 100644
--- a/test/test_transforms_tensor.py
+++ b/test/test_transforms_tensor.py
@@ -847,6 +847,18 @@ def test_gaussian_blur(device, channels, meth_kwargs):
     ):
         pytest.skip("Fails on Windows, see https://github.com/pytorch/vision/issues/5464")
 
+    if all(
+        [
+            device == "cuda",
+            torch.version.hip is not None,
+            torch.cuda.is_available() and "gfx90a" in torch.cuda.get_device_properties().gcnArchName,
+            "test_gaussian_blur[3-meth_kwargs4-cuda]" in os.environ.get("PYTEST_CURRENT_TEST", "")
+        ]
+    ):
+        pytest.skip(
+            "Skipped on gfx90a because of uint8 rounding difference for batched and single conv2d"
+        )
+
     tol = 1.0 + 1e-10
     torch.manual_seed(12)
     _test_class_op(