From d31e6d2f604c7c2b219bb29bf7153653e8c8f372 Mon Sep 17 00:00:00 2001 From: Nueramarcos Date: Sun, 14 Jun 2026 06:57:27 -0700 Subject: [PATCH] Fix deform_conv2d kernels to use current CUDA stream CUDA kernels should respect PyTorch stream semantics. Previously deformable_im2col_kernel and deformable_col2im_kernel (both int and int64_t variants) and deformable_col2im_coord_kernel launched on the default stream, causing race conditions when users use multiple streams. This changes all 6 kernel launches to use at::cuda::getCurrentCUDAStream(), matching the pattern in roi_pool_kernel.cu. Fixes #9513 --- torchvision/csrc/ops/cuda/deform_conv2d_kernel.cu | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/torchvision/csrc/ops/cuda/deform_conv2d_kernel.cu b/torchvision/csrc/ops/cuda/deform_conv2d_kernel.cu index ae496b37d48..155064670a3 100644 --- a/torchvision/csrc/ops/cuda/deform_conv2d_kernel.cu +++ b/torchvision/csrc/ops/cuda/deform_conv2d_kernel.cu @@ -247,10 +247,11 @@ void deformable_im2col( out_w > std::numeric_limits::max()); + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); if (use_64bits_indexing) { AT_DISPATCH_FLOATING_TYPES_AND_HALF( input.scalar_type(), "deformable_im2col", ([&] { - deformable_im2col_kernel<<>>( + deformable_im2col_kernel<<>>( num_kernels, input.data_ptr(), data_offset.data_ptr(), @@ -277,7 +278,7 @@ void deformable_im2col( } else { AT_DISPATCH_FLOATING_TYPES_AND_HALF( input.scalar_type(), "deformable_im2col", ([&] { - deformable_im2col_kernel<<>>( + deformable_im2col_kernel<<>>( num_kernels, input.data_ptr(), data_offset.data_ptr(), @@ -436,10 +437,11 @@ void compute_grad_input( at::globalContext().alertNotDeterministic("compute_grad_input"); + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); if (use_64bits_indexing) { AT_DISPATCH_FLOATING_TYPES_AND_HALF( columns.scalar_type(), "compute_grad_input", ([&] { - deformable_col2im_kernel<<>>( + deformable_col2im_kernel<<>>( num_kernels, columns.data_ptr(), offset.data_ptr(), @@ -465,7 +467,7 @@ void compute_grad_input( } else { AT_DISPATCH_FLOATING_TYPES_AND_HALF( columns.scalar_type(), "compute_grad_input", ([&] { - deformable_col2im_kernel<<>>( + deformable_col2im_kernel<<>>( num_kernels, columns.data_ptr(), offset.data_ptr(), @@ -678,11 +680,12 @@ void compute_grad_offset_and_mask( ((int64_t)channels * weight_h * weight_w * parallel_imgs * out_h * out_w > std::numeric_limits::max()); + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); if (use_64bits_indexing) { AT_DISPATCH_FLOATING_TYPES_AND_HALF( columns.scalar_type(), "compute_grad_offset_and_mask", ([&] { deformable_col2im_coord_kernel - <<>>( + <<>>( num_kernels, columns.data_ptr(), input.data_ptr(), @@ -711,7 +714,7 @@ void compute_grad_offset_and_mask( } else { AT_DISPATCH_FLOATING_TYPES_AND_HALF( columns.scalar_type(), "compute_grad_offset_and_mask", ([&] { - deformable_col2im_coord_kernel<<>>( + deformable_col2im_coord_kernel<<>>( num_kernels, columns.data_ptr(), input.data_ptr(),