From d31e6d2f604c7c2b219bb29bf7153653e8c8f372 Mon Sep 17 00:00:00 2001
From: Nueramarcos <marcosrodriguez0324@gmail.com>
Date: Sun, 14 Jun 2026 06:57:27 -0700
Subject: [PATCH] Fix deform_conv2d kernels to use current CUDA stream

CUDA kernels should respect PyTorch stream semantics. Previously,
deformable_im2col_kernel, deformable_col2im_kernel and
deformable_col2im_coord_kernel (the int and int64_t variants of each)
were launched on the default stream, causing race conditions when users
use multiple streams.

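This changes all six kernel launches to pass the stream returned by
at::cuda::getCurrentCUDAStream() as the fourth launch-configuration
argument (with 0 bytes of dynamic shared memory), matching the pattern
in roi_pool_kernel.cu.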

Fixes #9513
---
 torchvision/csrc/ops/cuda/deform_conv2d_kernel.cu | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/torchvision/csrc/ops/cuda/deform_conv2d_kernel.cu b/torchvision/csrc/ops/cuda/deform_conv2d_kernel.cu
index ae496b37d48..155064670a3 100644
--- a/torchvision/csrc/ops/cuda/deform_conv2d_kernel.cu
+++ b/torchvision/csrc/ops/cuda/deform_conv2d_kernel.cu
@@ -247,10 +247,11 @@ void deformable_im2col(
            out_w >
        std::numeric_limits<int32_t>::max());
 
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
   if (use_64bits_indexing) {
     AT_DISPATCH_FLOATING_TYPES_AND_HALF(
         input.scalar_type(), "deformable_im2col", ([&] {
-          deformable_im2col_kernel<scalar_t, int64_t><<<blocks, threads>>>(
+          deformable_im2col_kernel<scalar_t, int64_t><<<blocks, threads, 0, stream>>>(
               num_kernels,
               input.data_ptr<scalar_t>(),
               data_offset.data_ptr<scalar_t>(),
@@ -277,7 +278,7 @@ void deformable_im2col(
   } else {
     AT_DISPATCH_FLOATING_TYPES_AND_HALF(
         input.scalar_type(), "deformable_im2col", ([&] {
-          deformable_im2col_kernel<scalar_t, int><<<blocks, threads>>>(
+          deformable_im2col_kernel<scalar_t, int><<<blocks, threads, 0, stream>>>(
               num_kernels,
               input.data_ptr<scalar_t>(),
               data_offset.data_ptr<scalar_t>(),
@@ -436,10 +437,11 @@ void compute_grad_input(
 
   at::globalContext().alertNotDeterministic("compute_grad_input");
 
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
   if (use_64bits_indexing) {
     AT_DISPATCH_FLOATING_TYPES_AND_HALF(
         columns.scalar_type(), "compute_grad_input", ([&] {
-          deformable_col2im_kernel<scalar_t, int64_t><<<blocks, threads>>>(
+          deformable_col2im_kernel<scalar_t, int64_t><<<blocks, threads, 0, stream>>>(
               num_kernels,
               columns.data_ptr<scalar_t>(),
               offset.data_ptr<scalar_t>(),
@@ -465,7 +467,7 @@ void compute_grad_input(
   } else {
     AT_DISPATCH_FLOATING_TYPES_AND_HALF(
         columns.scalar_type(), "compute_grad_input", ([&] {
-          deformable_col2im_kernel<scalar_t, int><<<blocks, threads>>>(
+          deformable_col2im_kernel<scalar_t, int><<<blocks, threads, 0, stream>>>(
               num_kernels,
               columns.data_ptr<scalar_t>(),
               offset.data_ptr<scalar_t>(),
@@ -678,11 +680,12 @@ void compute_grad_offset_and_mask(
       ((int64_t)channels * weight_h * weight_w * parallel_imgs * out_h * out_w >
        std::numeric_limits<int32_t>::max());
 
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
   if (use_64bits_indexing) {
     AT_DISPATCH_FLOATING_TYPES_AND_HALF(
         columns.scalar_type(), "compute_grad_offset_and_mask", ([&] {
           deformable_col2im_coord_kernel<scalar_t, int64_t>
-              <<<blocks, threads>>>(
+              <<<blocks, threads, 0, stream>>>(
                   num_kernels,
                   columns.data_ptr<scalar_t>(),
                   input.data_ptr<scalar_t>(),
@@ -711,7 +714,7 @@ void compute_grad_offset_and_mask(
   } else {
     AT_DISPATCH_FLOATING_TYPES_AND_HALF(
         columns.scalar_type(), "compute_grad_offset_and_mask", ([&] {
-          deformable_col2im_coord_kernel<scalar_t, int><<<blocks, threads>>>(
+          deformable_col2im_coord_kernel<scalar_t, int><<<blocks, threads, 0, stream>>>(
               num_kernels,
               columns.data_ptr<scalar_t>(),
               input.data_ptr<scalar_t>(),