diff --git a/include/cudecomp.h b/include/cudecomp.h
index 9d5e5dd..902a628 100644
--- a/include/cudecomp.h
+++ b/include/cudecomp.h
@@ -48,7 +48,8 @@ typedef enum {
   CUDECOMP_TRANSPOSE_COMM_NCCL = 4,       ///< NCCL backend
   CUDECOMP_TRANSPOSE_COMM_NCCL_PL = 5,    ///< NCCL backend with pipelining
   CUDECOMP_TRANSPOSE_COMM_NVSHMEM = 6,    ///< NVSHMEM backend
-  CUDECOMP_TRANSPOSE_COMM_NVSHMEM_PL = 7  ///< NVSHMEM backend with pipelining
+  CUDECOMP_TRANSPOSE_COMM_NVSHMEM_PL = 7, ///< NVSHMEM backend with pipelining
+  CUDECOMP_TRANSPOSE_COMM_NVSHMEM_SM = 8  ///< NVSHMEM backend using SM-based P2P transfers
 } cudecompTransposeCommBackend_t;
 
 /**
diff --git a/include/internal/comm_routines.h b/include/internal/comm_routines.h
index 138de8f..aebd048 100644
--- a/include/internal/comm_routines.h
+++ b/include/internal/comm_routines.h
@@ -90,11 +90,12 @@ static inline void checkMpiInt32Limit(int64_t val, cudecompHaloCommBackend_t bac
 #ifdef ENABLE_NVSHMEM
 #define CUDECOMP_NVSHMEM_INTRAGROUP_SYNC_FREQ 8 // max number of intra-group transfers to schedule between team syncs
 template <typename T>
-static void
-nvshmemAlltoallV(const cudecompHandle_t& handle, const cudecompGridDesc_t& grid_desc, T* send_buff,
-                 const std::vector<comm_count_t>& send_counts, const std::vector<comm_count_t>& send_offsets,
-                 T* recv_buff, const std::vector<comm_count_t>& recv_counts,
-                 const std::vector<comm_count_t>& recv_offsets, cudecompCommAxis comm_axis, cudaStream_t stream) {
+static void nvshmemAlltoallV(const cudecompHandle_t& handle, const cudecompGridDesc_t& grid_desc, T* send_buff,
+                             const std::vector<comm_count_t>& send_counts,
+                             const std::vector<comm_count_t>& send_offsets, T* recv_buff,
+                             const std::vector<comm_count_t>& recv_counts,
+                             const std::vector<comm_count_t>& recv_offsets, cudecompCommAxis comm_axis, bool use_sm,
+                             cudaStream_t stream) {
   auto& comm_info = (comm_axis == CUDECOMP_COMM_ROW) ? grid_desc->row_comm_info : grid_desc->col_comm_info;
   auto team = comm_info.nvshmem_team;
   int self_rank = comm_info.rank;
@@ -104,16 +105,25 @@ nvshmemAlltoallV(const cudecompHandle_t& handle, const cudecompGridDesc_t& grid_
   CHECK_CUDA(cudaStreamWaitEvent(stream, grid_desc->nvshmem_sync_event));
 
   // Event dependency on external stream for intra-group transfers
-  CHECK_CUDA(cudaEventRecord(grid_desc->events[0], stream));
-  for (int i = 0; i < handle->device_p2p_ce_count; ++i) {
-    CHECK_CUDA(cudaStreamWaitEvent(handle->streams[i], grid_desc->events[0], 0));
+  if (!use_sm) {
+    CHECK_CUDA(cudaEventRecord(grid_desc->events[0], stream));
+    for (int i = 0; i < handle->device_p2p_ce_count; ++i) {
+      CHECK_CUDA(cudaStreamWaitEvent(handle->streams[i], grid_desc->events[0], 0));
+    }
   }
 
+  bool need_barrier = false;
+  bool need_quiet = false;
   cudecompNvshmemA2AParams<T> params;
+  cudecompNvshmemP2PParams<T> p2p_params;
+  p2p_params.send_buff = send_buff;
+  p2p_params.recv_buff = recv_buff;
+  p2p_params.block_counters = grid_desc->nvshmem_block_counters;
 
   // Inter-group transfers (non-blocking)
   params.send_buff = send_buff;
   params.recv_buff = recv_buff;
+
   int count = 0;
   for (int i = 1; i < send_counts.size(); ++i) {
     int src_rank, dst_rank;
@@ -121,6 +131,8 @@ nvshmemAlltoallV(const cudecompHandle_t& handle, const cudecompGridDesc_t& grid_
     int dst_rank_global = getGlobalRank(handle, grid_desc, comm_axis, dst_rank);
     if (nvshmem_ptr(recv_buff, dst_rank_global)) { continue; }
 
+    if (!use_sm) need_barrier = true;
+    need_quiet = true;
     params.send_offsets[count] = send_offsets[dst_rank];
     params.recv_offsets[count] = recv_offsets[dst_rank];
     params.send_counts[count] = send_counts[dst_rank];
@@ -129,59 +141,87 @@ nvshmemAlltoallV(const cudecompHandle_t& handle, const cudecompGridDesc_t& grid_
 
     if (count == CUDECOMP_NVSHMEM_A2A_PARAM_CAPACITY) {
       params.ntransfers = count;
-      cudecomp_nvshmem_alltoallv(params, stream);
+      cudecomp_nvshmem_alltoallv(params, use_sm ? &comm_info.nvshmem_signals[0] : nullptr, stream);
       count = 0;
     }
   }
   if (count != 0) {
     params.ntransfers = count;
-    cudecomp_nvshmem_alltoallv(params, stream);
+    cudecomp_nvshmem_alltoallv(params, use_sm ? &comm_info.nvshmem_signals[0] : nullptr, stream);
   }
 
   // Intra-group transfers (blocking, scheduled after non-blocking inter-group transfers for concurrency)
   count = 0;
-  for (int i = 1; i < send_counts.size(); ++i) {
+  for (int i = (use_sm ? 0 : 1); i < send_counts.size(); ++i) {
     int src_rank, dst_rank;
     getAlltoallPeerRanks(grid_desc, comm_axis, i, src_rank, dst_rank);
     int dst_rank_global = getGlobalRank(handle, grid_desc, comm_axis, dst_rank);
     if (nvshmem_ptr(recv_buff, dst_rank_global)) {
 
-      if (comm_info.ngroups == 1 && handle->device_p2p_ce_count == 1 && count != 0 &&
-          count % CUDECOMP_NVSHMEM_INTRAGROUP_SYNC_FREQ == 0) {
-        // For single group, single P2P CE (e.g. NVSwitch), synchronize NVSHMEM team every
-        // CUDECOMP_NVSHMEM_INTRAGROUP_SYNC_FREQ transfers This helps reduce CE contention due to accumulation of
-        // jitter.
-        for (int i = 0; i < handle->device_p2p_ce_count; ++i) {
-          CHECK_CUDA(cudaEventRecord(grid_desc->events[0], handle->streams[i]));
-          CHECK_CUDA(cudaStreamWaitEvent(aux_stream, grid_desc->events[0], 0));
-        }
+      if (!use_sm) {
+        need_barrier = true;
+        if (comm_info.ngroups == 1 && handle->device_p2p_ce_count == 1 && count != 0 &&
+            count % CUDECOMP_NVSHMEM_INTRAGROUP_SYNC_FREQ == 0) {
+          // For single group, single P2P CE (e.g. NVSwitch), synchronize NVSHMEM team every
+          // CUDECOMP_NVSHMEM_INTRAGROUP_SYNC_FREQ transfers This helps reduce CE contention due to accumulation of
+          // jitter.
+          for (int i = 0; i < handle->device_p2p_ce_count; ++i) {
+            CHECK_CUDA(cudaEventRecord(grid_desc->events[0], handle->streams[i]));
+            CHECK_CUDA(cudaStreamWaitEvent(aux_stream, grid_desc->events[0], 0));
+          }
 
-        nvshmemx_team_sync_on_stream(team, aux_stream);
+          nvshmemx_team_sync_on_stream(team, aux_stream);
 
-        CHECK_CUDA(cudaEventRecord(grid_desc->events[0], aux_stream));
-        for (int i = 0; i < handle->device_p2p_ce_count; ++i) {
-          CHECK_CUDA(cudaStreamWaitEvent(handle->streams[i], grid_desc->events[0], 0));
+          CHECK_CUDA(cudaEventRecord(grid_desc->events[0], aux_stream));
+          for (int i = 0; i < handle->device_p2p_ce_count; ++i) {
+            CHECK_CUDA(cudaStreamWaitEvent(handle->streams[i], grid_desc->events[0], 0));
+          }
         }
-      }
 
-      nvshmemx_putmem_on_stream(recv_buff + recv_offsets[dst_rank], send_buff + send_offsets[dst_rank],
-                                send_counts[dst_rank] * sizeof(T), dst_rank_global,
-                                handle->streams[count % handle->device_p2p_ce_count]);
-      count++;
+        nvshmemx_putmem_on_stream(recv_buff + recv_offsets[dst_rank], send_buff + send_offsets[dst_rank],
+                                  send_counts[dst_rank] * sizeof(T), dst_rank_global,
+                                  handle->streams[count % handle->device_p2p_ce_count]);
+        count++;
+      } else {
+        p2p_params.send_offsets[count] = send_offsets[dst_rank];
+        p2p_params.recv_offsets[count] = recv_offsets[dst_rank];
+        p2p_params.send_counts[count] = send_counts[dst_rank];
+        p2p_params.peer_ranks[count] = dst_rank_global;
+        count++;
+
+        if (count == CUDECOMP_NVSHMEM_P2P_PARAM_CAPACITY) {
+          p2p_params.ntransfers = count;
+          cudecomp_nvshmem_alltoallv_p2p(handle, p2p_params, &comm_info.nvshmem_signals[0], stream);
+          count = 0;
+        }
+      }
     }
   }
 
-  // Self-copy with cudaMemcpy
-  CHECK_CUDA(cudaMemcpyAsync(recv_buff + recv_offsets[self_rank], send_buff + send_offsets[self_rank],
-                             send_counts[self_rank] * sizeof(T), cudaMemcpyDeviceToDevice, stream));
+  if (use_sm) {
+    if (count != 0) {
+      p2p_params.ntransfers = count;
+      cudecomp_nvshmem_alltoallv_p2p(handle, p2p_params, &comm_info.nvshmem_signals[0], stream);
+    }
+
+    if (need_quiet) { nvshmemx_quiet_on_stream(stream); }
+    nvshmemx_signal_wait_until_on_stream(&comm_info.nvshmem_signals[0], NVSHMEM_CMP_EQ,
+                                         static_cast<uint64_t>(comm_info.nranks), stream);
+  } else {
+    // Self-copy with cudaMemcpy
+    CHECK_CUDA(cudaMemcpyAsync(recv_buff + recv_offsets[self_rank], send_buff + send_offsets[self_rank],
+                               send_counts[self_rank] * sizeof(T), cudaMemcpyDeviceToDevice, stream));
+  }
 
-  // Event dependency on internal streams for completion of intra-group transfers
-  for (int i = 0; i < handle->device_p2p_ce_count; ++i) {
-    CHECK_CUDA(cudaEventRecord(grid_desc->events[0], handle->streams[i]));
-    CHECK_CUDA(cudaStreamWaitEvent(stream, grid_desc->events[0], 0));
+  // Event dependency on internal streams for completion of intra-group transfers (not needed for SM path)
+  if (!use_sm) {
+    for (int i = 0; i < handle->device_p2p_ce_count; ++i) {
+      CHECK_CUDA(cudaEventRecord(grid_desc->events[0], handle->streams[i]));
+      CHECK_CUDA(cudaStreamWaitEvent(stream, grid_desc->events[0], 0));
+    }
   }
 
-  nvshmemx_barrier_on_stream(team, stream);
+  if (need_barrier) { nvshmemx_barrier_on_stream(team, stream); }
 }
 #endif
 
@@ -213,11 +253,13 @@ static void cudecompAlltoall(const cudecompHandle_t& handle, const cudecompGridD
 
   std::vector<MPI_Request> reqs;
   switch (grid_desc->config.transpose_comm_backend) {
-  case CUDECOMP_TRANSPOSE_COMM_NVSHMEM: {
+  case CUDECOMP_TRANSPOSE_COMM_NVSHMEM:
+  case CUDECOMP_TRANSPOSE_COMM_NVSHMEM_SM: {
 #ifdef ENABLE_NVSHMEM
     if (nvshmem_ptr(send_buff, handle->rank) && nvshmem_ptr(recv_buff, handle->rank)) {
       nvshmemAlltoallV(handle, grid_desc, send_buff, send_counts, send_offsets, recv_buff, recv_counts,
-                       recv_offsets_nvshmem, comm_axis, stream);
+                       recv_offsets_nvshmem, comm_axis,
+                       grid_desc->config.transpose_comm_backend == CUDECOMP_TRANSPOSE_COMM_NVSHMEM_SM, stream);
       break;
     } else {
       THROW_INVALID_USAGE("NVSHMEM communication backends require workspace allocated via cudecompMalloc.");
diff --git a/include/internal/common.h b/include/internal/common.h
index fde5680..ea52a73 100644
--- a/include/internal/common.h
+++ b/include/internal/common.h
@@ -114,6 +114,8 @@ struct cudecompHandle {
 
   // Miscellaneous
   int32_t device_p2p_ce_count = 0;       // number of P2P CEs available
+  int32_t device_num_sms = 0;            // number of SMs on the device
+  int32_t device_max_threads_per_sm = 0; // maximum threads per SM
   bool use_col_major_rank_order = false; // Flag to control whether to use column-major rank order
 };
 
@@ -183,6 +185,10 @@ struct cudecompGridDesc {
   std::vector<cudaEvent_t> events{nullptr}; // CUDA events used for scheduling
   cudaEvent_t nvshmem_sync_event = nullptr; // NVSHMEM event used for synchronization
 
+#ifdef ENABLE_NVSHMEM
+  int* nvshmem_block_counters = nullptr; // device memory counters for SM alltoallv last-block detection
+#endif
+
   cudecomp::graphCache graph_cache; // CUDA graph cache
 
   cudecomp::ncclComm nccl_comm; // NCCL communicator (global), shared from handle
@@ -292,7 +298,8 @@ static inline bool haloBackendRequiresNccl(cudecompHaloCommBackend_t comm_backen
 }
 
 static inline bool transposeBackendRequiresNvshmem(cudecompTransposeCommBackend_t comm_backend) {
-  return (comm_backend == CUDECOMP_TRANSPOSE_COMM_NVSHMEM || comm_backend == CUDECOMP_TRANSPOSE_COMM_NVSHMEM_PL);
+  return (comm_backend == CUDECOMP_TRANSPOSE_COMM_NVSHMEM || comm_backend == CUDECOMP_TRANSPOSE_COMM_NVSHMEM_PL ||
+          comm_backend == CUDECOMP_TRANSPOSE_COMM_NVSHMEM_SM);
 }
 
 static inline bool haloBackendRequiresNvshmem(cudecompHaloCommBackend_t comm_backend) {
@@ -380,8 +387,8 @@ static inline void getAlltoallPeerRanks(cudecompGridDesc_t grid_desc, cudecompCo
 
   const auto& info = (comm_axis == CUDECOMP_COMM_ROW) ? grid_desc->row_comm_info : grid_desc->col_comm_info;
 
-  // Quick return for single rank case
-  if (info.nranks == 1) {
+  // Return self for single rank case or when iter is zero
+  if (info.nranks == 1 || iter == 0) {
     src_rank = info.rank;
     dst_rank = info.rank;
     return;
diff --git a/include/internal/cudecomp_kernels.cuh b/include/internal/cudecomp_kernels.cuh
index f09c3d2..97bcfc9 100644
--- a/include/internal/cudecomp_kernels.cuh
+++ b/include/internal/cudecomp_kernels.cuh
@@ -18,6 +18,8 @@
 #ifndef CUDECOMP_KERNELS_CUH
 #define CUDECOMP_KERNELS_CUH
 
+#include <algorithm>
+
 #ifdef ENABLE_NVSHMEM
 #include <nvshmem.h>
 #endif
@@ -28,6 +30,8 @@
 #define CUDECOMP_UNROLL_FACTOR (4)
 #define CUDECOMP_MIN_BLOCKS_PER_SM (16)
 
+#define CUDECOMP_NVSHMEM_NTHREADS (1024)
+
 namespace cudecomp {
 
 #ifdef ENABLE_NVSHMEM
@@ -47,6 +51,61 @@ __launch_bounds__(CUDECOMP_CUDA_NTHREADS) __global__
 
   nvshmem_putmem_nbi(recv_buff + recv_offset, send_buff + send_offset, send_count * sizeof(T), peer_rank);
 }
+
+template <typename T>
+__launch_bounds__(CUDECOMP_CUDA_NTHREADS) __global__
+    void cudecomp_nvshmem_alltoallv_signal_k(cudecompNvshmemA2AParams<T> params, uint64_t* sig_addr) {
+
+  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  if (tid >= params.ntransfers) return;
+
+  int peer_rank = params.peer_ranks[tid];
+  T* send_buff = params.send_buff;
+  T* recv_buff = params.recv_buff;
+  size_t send_offset = params.send_offsets[tid];
+  size_t recv_offset = params.recv_offsets[tid];
+  size_t send_count = params.send_counts[tid];
+
+  nvshmem_putmem_signal_nbi(recv_buff + recv_offset, send_buff + send_offset, send_count * sizeof(T), sig_addr, 1,
+                            NVSHMEM_SIGNAL_ADD, peer_rank);
+}
+
+template <typename T>
+__launch_bounds__(CUDECOMP_NVSHMEM_NTHREADS) __global__
+    void cudecomp_nvshmem_alltoallv_p2p_k(cudecompNvshmemP2PParams<T> params, uint64_t* sig_addr) {
+
+  T* send_buff = params.send_buff;
+  T* recv_buff = params.recv_buff;
+  int bid = blockIdx.x;
+
+  if (params.ntransfers > 0) {
+    int blocks_per_copy = gridDim.x / params.ntransfers;
+    int copyid = bid / blocks_per_copy;
+    int block_within_copy = bid % blocks_per_copy;
+    int peer_rank = params.peer_ranks[copyid];
+    size_t send_offset = params.send_offsets[copyid];
+    size_t recv_offset = params.recv_offsets[copyid];
+    size_t send_count = params.send_counts[copyid];
+
+    size_t nelems_per_block = (send_count + blocks_per_copy - 1) / blocks_per_copy;
+    size_t block_offset = (size_t)block_within_copy * nelems_per_block;
+    if (block_offset < send_count) {
+      size_t block_count = min(nelems_per_block, send_count - block_offset);
+      nvshmemx_putmem_block(recv_buff + recv_offset + block_offset, send_buff + send_offset + block_offset,
+                            block_count * sizeof(T), peer_rank);
+    }
+
+    // Last block to finish this copy signals the destination PE.
+    nvshmem_fence();
+    __syncthreads();
+    if (threadIdx.x == 0) {
+      if (atomicAdd(&params.block_counters[peer_rank], 1) + 1 == blocks_per_copy) {
+        params.block_counters[peer_rank] = 0;
+        nvshmemx_signal_op(sig_addr, 1, NVSHMEM_SIGNAL_ADD, peer_rank);
+      }
+    }
+  }
+}
 #endif
 
 template <int src_nd, int dest_nd, typename T>
@@ -107,7 +166,8 @@ __launch_bounds__(CUDECOMP_CUDA_NTHREADS) __global__
 }
 
 template <typename T>
-void cudecomp_batched_d2d_memcpy_3d_nd_dispatch(const cudecompBatchedD2DMemcpy3DParams<T>& params,
+void cudecomp_batched_d2d_memcpy_3d_nd_dispatch(cudecompHandle_t handle,
+                                                const cudecompBatchedD2DMemcpy3DParams<T>& params,
                                                 cudaStream_t stream) {
   size_t N = params.extents[0][0] * params.extents[1][0] * params.extents[2][0];
 
@@ -138,12 +198,9 @@ void cudecomp_batched_d2d_memcpy_3d_nd_dispatch(const cudecompBatchedD2DMemcpy3D
   int blocks_per_copy_unroll = (blocks_per_copy + CUDECOMP_UNROLL_FACTOR - 1) / CUDECOMP_UNROLL_FACTOR;
   size_t total_blocks_unroll = params.ncopies * blocks_per_copy_unroll;
 
-  // Clamp minimum number of blocks from unrolling
-  int dev, num_sms;
-  CHECK_CUDA(cudaGetDevice(&dev));
-  CHECK_CUDA(cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, dev));
-
-  if (total_blocks_unroll > CUDECOMP_MIN_BLOCKS_PER_SM * num_sms) { blocks_per_copy = blocks_per_copy_unroll; }
+  if (total_blocks_unroll > CUDECOMP_MIN_BLOCKS_PER_SM * handle->device_num_sms) {
+    blocks_per_copy = blocks_per_copy_unroll;
+  }
 
   switch (src_nd) {
   case 1:
diff --git a/include/internal/cudecomp_kernels.h b/include/internal/cudecomp_kernels.h
index 6fcb9d7..17025aa 100644
--- a/include/internal/cudecomp_kernels.h
+++ b/include/internal/cudecomp_kernels.h
@@ -20,42 +20,77 @@
 
 #include <cuda/std/complex>
 
+#include "internal/common.h"
+
 namespace cudecomp {
 
 #ifdef ENABLE_NVSHMEM
-#define CUDECOMP_NVSHMEM_A2A_PARAM_CAPACITY 96
+
+// Capacity for the inter-group alltoallv kernel.
+#define CUDECOMP_NVSHMEM_A2A_PARAM_CAPACITY (96)
 template <typename T> struct cudecompNvshmemA2AParams {
-  int ntransfers;
   T* send_buff = nullptr;
   T* recv_buff = nullptr;
   size_t send_offsets[CUDECOMP_NVSHMEM_A2A_PARAM_CAPACITY];
   size_t recv_offsets[CUDECOMP_NVSHMEM_A2A_PARAM_CAPACITY];
   size_t send_counts[CUDECOMP_NVSHMEM_A2A_PARAM_CAPACITY];
   int peer_ranks[CUDECOMP_NVSHMEM_A2A_PARAM_CAPACITY];
+  int ntransfers;
+};
+
+// Capacity for the intra-group SM P2P kernel.
+#define CUDECOMP_NVSHMEM_MAX_SMS (32)
+#define CUDECOMP_NVSHMEM_P2P_PARAM_CAPACITY (CUDECOMP_NVSHMEM_MAX_SMS * 2)
+template <typename T> struct cudecompNvshmemP2PParams {
+  T* send_buff = nullptr;
+  T* recv_buff = nullptr;
+  int* block_counters = nullptr;
+  size_t send_offsets[CUDECOMP_NVSHMEM_P2P_PARAM_CAPACITY];
+  size_t recv_offsets[CUDECOMP_NVSHMEM_P2P_PARAM_CAPACITY];
+  size_t send_counts[CUDECOMP_NVSHMEM_P2P_PARAM_CAPACITY];
+  int peer_ranks[CUDECOMP_NVSHMEM_P2P_PARAM_CAPACITY];
+  int ntransfers;
 };
 
-void cudecomp_nvshmem_alltoallv(const cudecompNvshmemA2AParams<float>& params, cudaStream_t stream);
-void cudecomp_nvshmem_alltoallv(const cudecompNvshmemA2AParams<double>& params, cudaStream_t stream);
-void cudecomp_nvshmem_alltoallv(const cudecompNvshmemA2AParams<cuda::std::complex<float>>& params, cudaStream_t stream);
-void cudecomp_nvshmem_alltoallv(const cudecompNvshmemA2AParams<cuda::std::complex<double>>& params,
+void cudecomp_nvshmem_alltoallv(const cudecompNvshmemA2AParams<float>& params, uint64_t* sig_addr, cudaStream_t stream);
+void cudecomp_nvshmem_alltoallv(const cudecompNvshmemA2AParams<double>& params, uint64_t* sig_addr,
+                                cudaStream_t stream);
+void cudecomp_nvshmem_alltoallv(const cudecompNvshmemA2AParams<cuda::std::complex<float>>& params, uint64_t* sig_addr,
                                 cudaStream_t stream);
+void cudecomp_nvshmem_alltoallv(const cudecompNvshmemA2AParams<cuda::std::complex<double>>& params, uint64_t* sig_addr,
+                                cudaStream_t stream);
+
+void cudecomp_nvshmem_alltoallv_p2p(cudecompHandle_t handle, const cudecompNvshmemP2PParams<float>& params,
+                                    uint64_t* sig_addr, cudaStream_t stream);
+void cudecomp_nvshmem_alltoallv_p2p(cudecompHandle_t handle, const cudecompNvshmemP2PParams<double>& params,
+                                    uint64_t* sig_addr, cudaStream_t stream);
+void cudecomp_nvshmem_alltoallv_p2p(cudecompHandle_t handle,
+                                    const cudecompNvshmemP2PParams<cuda::std::complex<float>>& params,
+                                    uint64_t* sig_addr, cudaStream_t stream);
+void cudecomp_nvshmem_alltoallv_p2p(cudecompHandle_t handle,
+                                    const cudecompNvshmemP2PParams<cuda::std::complex<double>>& params,
+                                    uint64_t* sig_addr, cudaStream_t stream);
 #endif
 
 #define CUDECOMP_BATCHED_D2D_3D_PARAM_CAPACITY 56
 template <typename T> struct cudecompBatchedD2DMemcpy3DParams {
-  int ncopies;
   T* src[CUDECOMP_BATCHED_D2D_3D_PARAM_CAPACITY];
   T* dest[CUDECOMP_BATCHED_D2D_3D_PARAM_CAPACITY];
   size_t src_strides[2][CUDECOMP_BATCHED_D2D_3D_PARAM_CAPACITY];  // [depth stride, row stride] col_stride=1 assumed
   size_t dest_strides[2][CUDECOMP_BATCHED_D2D_3D_PARAM_CAPACITY]; // [depth stride, row stride] col_stride=1 assumed
   size_t extents[3][CUDECOMP_BATCHED_D2D_3D_PARAM_CAPACITY];      // [depth, height, width]
+  int ncopies;
 };
 
-void cudecomp_batched_d2d_memcpy_3d(cudecompBatchedD2DMemcpy3DParams<float>& params, cudaStream_t stream);
-void cudecomp_batched_d2d_memcpy_3d(cudecompBatchedD2DMemcpy3DParams<double>& params, cudaStream_t stream);
-void cudecomp_batched_d2d_memcpy_3d(cudecompBatchedD2DMemcpy3DParams<cuda::std::complex<float>>& params,
+void cudecomp_batched_d2d_memcpy_3d(cudecompHandle_t handle, cudecompBatchedD2DMemcpy3DParams<float>& params,
+                                    cudaStream_t stream);
+void cudecomp_batched_d2d_memcpy_3d(cudecompHandle_t handle, cudecompBatchedD2DMemcpy3DParams<double>& params,
+                                    cudaStream_t stream);
+void cudecomp_batched_d2d_memcpy_3d(cudecompHandle_t handle,
+                                    cudecompBatchedD2DMemcpy3DParams<cuda::std::complex<float>>& params,
                                     cudaStream_t stream);
-void cudecomp_batched_d2d_memcpy_3d(cudecompBatchedD2DMemcpy3DParams<cuda::std::complex<double>>& params,
+void cudecomp_batched_d2d_memcpy_3d(cudecompHandle_t handle,
+                                    cudecompBatchedD2DMemcpy3DParams<cuda::std::complex<double>>& params,
                                     cudaStream_t stream);
 
 } // namespace cudecomp
diff --git a/include/internal/halo.h b/include/internal/halo.h
index c9d890f..dd9a5e2 100644
--- a/include/internal/halo.h
+++ b/include/internal/halo.h
@@ -183,7 +183,7 @@ void cudecompUpdateHalos_(int ax, const cudecompHandle_t handle, const cudecompG
     }
 
     memcpy_params.ncopies = 2;
-    cudecomp_batched_d2d_memcpy_3d(memcpy_params, stream);
+    cudecomp_batched_d2d_memcpy_3d(handle, memcpy_params, stream);
   } break;
 
   case 1: {
@@ -218,7 +218,7 @@ void cudecompUpdateHalos_(int ax, const cudecompHandle_t handle, const cudecompG
     }
 
     memcpy_params.ncopies = 2;
-    cudecomp_batched_d2d_memcpy_3d(memcpy_params, stream);
+    cudecomp_batched_d2d_memcpy_3d(handle, memcpy_params, stream);
 
     std::array<comm_count_t, 2> counts{static_cast<comm_count_t>(halo_size), static_cast<comm_count_t>(halo_size)};
     std::array<size_t, 2> offsets{};
@@ -266,7 +266,7 @@ void cudecompUpdateHalos_(int ax, const cudecompHandle_t handle, const cudecompG
       memcpy_params.ncopies = 2;
     }
 
-    cudecomp_batched_d2d_memcpy_3d(memcpy_params, stream);
+    cudecomp_batched_d2d_memcpy_3d(handle, memcpy_params, stream);
   } break;
 
   case 2: {
diff --git a/include/internal/transpose.h b/include/internal/transpose.h
index 6328f93..5275379 100644
--- a/include/internal/transpose.h
+++ b/include/internal/transpose.h
@@ -579,7 +579,7 @@ static void cudecompTranspose_(int ax, int dir, const cudecompHandle_t handle, c
           memcpy_count++;
           if (memcpy_count == memcpy_limit || j == splits_a.size()) {
             memcpy_params.ncopies = memcpy_count;
-            cudecomp_batched_d2d_memcpy_3d(memcpy_params, graph_stream);
+            cudecomp_batched_d2d_memcpy_3d(handle, memcpy_params, graph_stream);
             memcpy_count = 0;
           }
 #if CUDART_VERSION >= 11010
@@ -873,7 +873,7 @@ static void cudecompTranspose_(int ax, int dir, const cudecompHandle_t handle, c
         memcpy_count++;
         if (memcpy_count == memcpy_limit || j == splits_a.size() - 1) {
           memcpy_params.ncopies = memcpy_count;
-          cudecomp_batched_d2d_memcpy_3d(memcpy_params, stream);
+          cudecomp_batched_d2d_memcpy_3d(handle, memcpy_params, stream);
           memcpy_count = 0;
         }
       }
diff --git a/src/autotune.cc b/src/autotune.cc
index d4872c8..15a678c 100644
--- a/src/autotune.cc
+++ b/src/autotune.cc
@@ -105,6 +105,7 @@ void autotuneTransposeBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_d
     if (!options->disable_nvshmem_backends) {
       comm_backend_list.push_back(CUDECOMP_TRANSPOSE_COMM_NVSHMEM);
       comm_backend_list.push_back(CUDECOMP_TRANSPOSE_COMM_NVSHMEM_PL);
+      comm_backend_list.push_back(CUDECOMP_TRANSPOSE_COMM_NVSHMEM_SM);
       need_nvshmem = true;
     }
 #endif
@@ -283,6 +284,8 @@ void autotuneTransposeBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_d
           (uint64_t*)nvshmem_malloc(grid_desc->col_comm_info.nranks * sizeof(uint64_t));
       CHECK_CUDA(
           cudaMemset(grid_desc->col_comm_info.nvshmem_signals, 0, grid_desc->col_comm_info.nranks * sizeof(uint64_t)));
+      CHECK_CUDA(cudaMalloc(&grid_desc->nvshmem_block_counters, handle->nranks * sizeof(int)));
+      CHECK_CUDA(cudaMemset(grid_desc->nvshmem_block_counters, 0, handle->nranks * sizeof(int)));
 #endif
     }
 
@@ -461,6 +464,7 @@ void autotuneTransposeBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_d
       nvshmem_team_destroy(grid_desc->col_comm_info.nvshmem_team);
       nvshmem_free(grid_desc->row_comm_info.nvshmem_signals);
       nvshmem_free(grid_desc->col_comm_info.nvshmem_signals);
+      CHECK_CUDA(cudaFree(grid_desc->nvshmem_block_counters));
       grid_desc->row_comm_info.nvshmem_team = NVSHMEM_TEAM_INVALID;
       grid_desc->col_comm_info.nvshmem_team = NVSHMEM_TEAM_INVALID;
 #endif
@@ -709,6 +713,8 @@ void autotuneHaloBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_desc,
           (uint64_t*)nvshmem_malloc(grid_desc->col_comm_info.nranks * sizeof(uint64_t));
       CHECK_CUDA(
           cudaMemset(grid_desc->col_comm_info.nvshmem_signals, 0, grid_desc->col_comm_info.nranks * sizeof(uint64_t)));
+      CHECK_CUDA(cudaMalloc(&grid_desc->nvshmem_block_counters, handle->nranks * sizeof(int)));
+      CHECK_CUDA(cudaMemset(grid_desc->nvshmem_block_counters, 0, handle->nranks * sizeof(int)));
 #endif
     }
 
@@ -824,6 +830,7 @@ void autotuneHaloBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_desc,
       nvshmem_team_destroy(grid_desc->col_comm_info.nvshmem_team);
       nvshmem_free(grid_desc->row_comm_info.nvshmem_signals);
       nvshmem_free(grid_desc->col_comm_info.nvshmem_signals);
+      CHECK_CUDA(cudaFree(grid_desc->nvshmem_block_counters));
       grid_desc->row_comm_info.nvshmem_team = NVSHMEM_TEAM_INVALID;
       grid_desc->col_comm_info.nvshmem_team = NVSHMEM_TEAM_INVALID;
 #endif
diff --git a/src/cudecomp.cc b/src/cudecomp.cc
index c37902f..9a841cd 100644
--- a/src/cudecomp.cc
+++ b/src/cudecomp.cc
@@ -102,11 +102,13 @@ static void checkTransposeCommBackend(cudecompTransposeCommBackend_t comm_backen
   case CUDECOMP_TRANSPOSE_COMM_MPI_A2A:
 #ifdef ENABLE_NVSHMEM
   case CUDECOMP_TRANSPOSE_COMM_NVSHMEM:
-  case CUDECOMP_TRANSPOSE_COMM_NVSHMEM_PL: return;
+  case CUDECOMP_TRANSPOSE_COMM_NVSHMEM_PL:
+  case CUDECOMP_TRANSPOSE_COMM_NVSHMEM_SM: return;
 #else
     return;
   case CUDECOMP_TRANSPOSE_COMM_NVSHMEM:
-  case CUDECOMP_TRANSPOSE_COMM_NVSHMEM_PL: THROW_NOT_SUPPORTED("transpose communication type unsupported");
+  case CUDECOMP_TRANSPOSE_COMM_NVSHMEM_PL:
+  case CUDECOMP_TRANSPOSE_COMM_NVSHMEM_SM: THROW_NOT_SUPPORTED("transpose communication type unsupported");
 #endif
 
   default: THROW_INVALID_USAGE("unknown transpose communication type");
@@ -529,6 +531,10 @@ cudecompResult_t cudecompInit(cudecompHandle_t* handle_in, MPI_Comm mpi_comm) {
       handle->device_p2p_ce_count = 2; // Assume 2 P2P CE otherwise (shared D2H/H2D CE)
     }
 
+    // Cache device attributes
+    CHECK_CUDA(cudaDeviceGetAttribute(&handle->device_num_sms, cudaDevAttrMultiProcessorCount, dev));
+    CHECK_CUDA(cudaDeviceGetAttribute(&handle->device_max_threads_per_sm, cudaDevAttrMaxThreadsPerMultiProcessor, dev));
+
     handle->initialized = true;
     cudecomp_initialized = true;
 
@@ -791,6 +797,8 @@ cudecompResult_t cudecompGridDescCreate(cudecompHandle_t handle, cudecompGridDes
           (uint64_t*)nvshmem_malloc(grid_desc->col_comm_info.nranks * sizeof(uint64_t));
       CHECK_CUDA(
           cudaMemset(grid_desc->col_comm_info.nvshmem_signals, 0, grid_desc->col_comm_info.nranks * sizeof(uint64_t)));
+      CHECK_CUDA(cudaMalloc(&grid_desc->nvshmem_block_counters, handle->nranks * sizeof(int)));
+      CHECK_CUDA(cudaMemset(grid_desc->nvshmem_block_counters, 0, handle->nranks * sizeof(int)));
       handle->n_grid_descs_using_nvshmem++;
     } else {
       // Finalize nvshmem to reclaim symmetric heap memory if not used
@@ -924,6 +932,7 @@ cudecompResult_t cudecompGridDescDestroy(cudecompHandle_t handle, cudecompGridDe
         nvshmem_team_destroy(grid_desc->col_comm_info.nvshmem_team);
         nvshmem_free(grid_desc->col_comm_info.nvshmem_signals);
       }
+      CHECK_CUDA(cudaFree(grid_desc->nvshmem_block_counters));
       handle->n_grid_descs_using_nvshmem--;
 
       // Finalize nvshmem to reclaim symmetric heap memory if not used
@@ -1338,6 +1347,7 @@ const char* cudecompTransposeCommBackendToString(cudecompTransposeCommBackend_t
   case CUDECOMP_TRANSPOSE_COMM_MPI_A2A: return "MPI_A2A";
   case CUDECOMP_TRANSPOSE_COMM_NVSHMEM: return "NVSHMEM";
   case CUDECOMP_TRANSPOSE_COMM_NVSHMEM_PL: return "NVSHMEM (pipelined)";
+  case CUDECOMP_TRANSPOSE_COMM_NVSHMEM_SM: return "NVSHMEM_SM";
   default: return "ERROR";
   }
 }
diff --git a/src/cudecomp_kernels.cu b/src/cudecomp_kernels.cu
index db6b345..311ebb9 100644
--- a/src/cudecomp_kernels.cu
+++ b/src/cudecomp_kernels.cu
@@ -22,19 +22,23 @@
 
 namespace cudecomp {
 
-void cudecomp_batched_d2d_memcpy_3d(cudecompBatchedD2DMemcpy3DParams<float>& params, cudaStream_t stream) {
-  cudecomp_batched_d2d_memcpy_3d_nd_dispatch(params, stream);
+void cudecomp_batched_d2d_memcpy_3d(cudecompHandle_t handle, cudecompBatchedD2DMemcpy3DParams<float>& params,
+                                    cudaStream_t stream) {
+  cudecomp_batched_d2d_memcpy_3d_nd_dispatch(handle, params, stream);
 }
-void cudecomp_batched_d2d_memcpy_3d(cudecompBatchedD2DMemcpy3DParams<double>& params, cudaStream_t stream) {
-  cudecomp_batched_d2d_memcpy_3d_nd_dispatch(params, stream);
+void cudecomp_batched_d2d_memcpy_3d(cudecompHandle_t handle, cudecompBatchedD2DMemcpy3DParams<double>& params,
+                                    cudaStream_t stream) {
+  cudecomp_batched_d2d_memcpy_3d_nd_dispatch(handle, params, stream);
 }
-void cudecomp_batched_d2d_memcpy_3d(cudecompBatchedD2DMemcpy3DParams<cuda::std::complex<float>>& params,
+void cudecomp_batched_d2d_memcpy_3d(cudecompHandle_t handle,
+                                    cudecompBatchedD2DMemcpy3DParams<cuda::std::complex<float>>& params,
                                     cudaStream_t stream) {
-  cudecomp_batched_d2d_memcpy_3d_nd_dispatch(params, stream);
+  cudecomp_batched_d2d_memcpy_3d_nd_dispatch(handle, params, stream);
 }
-void cudecomp_batched_d2d_memcpy_3d(cudecompBatchedD2DMemcpy3DParams<cuda::std::complex<double>>& params,
+void cudecomp_batched_d2d_memcpy_3d(cudecompHandle_t handle,
+                                    cudecompBatchedD2DMemcpy3DParams<cuda::std::complex<double>>& params,
                                     cudaStream_t stream) {
-  cudecomp_batched_d2d_memcpy_3d_nd_dispatch(params, stream);
+  cudecomp_batched_d2d_memcpy_3d_nd_dispatch(handle, params, stream);
 }
 
 } // namespace cudecomp
diff --git a/src/cudecomp_kernels_rdc.cu b/src/cudecomp_kernels_rdc.cu
index 2215f39..2711f2b 100644
--- a/src/cudecomp_kernels_rdc.cu
+++ b/src/cudecomp_kernels_rdc.cu
@@ -23,27 +23,77 @@
 namespace cudecomp {
 
 #ifdef ENABLE_NVSHMEM
-void cudecomp_nvshmem_alltoallv(const cudecompNvshmemA2AParams<float>& params, cudaStream_t stream) {
-  cudecomp_nvshmem_alltoallv_k<<<1, CUDECOMP_CUDA_NTHREADS, 0, stream>>>(params);
+template <typename T>
+static void launch_nvshmem_alltoallv(const cudecompNvshmemA2AParams<T>& params, uint64_t* sig_addr,
+                                     cudaStream_t stream) {
+  if (sig_addr) {
+    cudecomp_nvshmem_alltoallv_signal_k<<<1, CUDECOMP_CUDA_NTHREADS, 0, stream>>>(params, sig_addr);
+  } else {
+    cudecomp_nvshmem_alltoallv_k<<<1, CUDECOMP_CUDA_NTHREADS, 0, stream>>>(params);
+  }
   CHECK_CUDA_LAUNCH();
 }
 
-void cudecomp_nvshmem_alltoallv(const cudecompNvshmemA2AParams<double>& params, cudaStream_t stream) {
-  cudecomp_nvshmem_alltoallv_k<<<1, CUDECOMP_CUDA_NTHREADS, 0, stream>>>(params);
-  CHECK_CUDA_LAUNCH();
+void cudecomp_nvshmem_alltoallv(const cudecompNvshmemA2AParams<float>& params, uint64_t* sig_addr,
+                                cudaStream_t stream) {
+  launch_nvshmem_alltoallv(params, sig_addr, stream);
 }
 
-void cudecomp_nvshmem_alltoallv(const cudecompNvshmemA2AParams<cuda::std::complex<float>>& params,
+void cudecomp_nvshmem_alltoallv(const cudecompNvshmemA2AParams<double>& params, uint64_t* sig_addr,
                                 cudaStream_t stream) {
-  cudecomp_nvshmem_alltoallv_k<<<1, CUDECOMP_CUDA_NTHREADS, 0, stream>>>(params);
-  CHECK_CUDA_LAUNCH();
+  launch_nvshmem_alltoallv(params, sig_addr, stream);
 }
 
-void cudecomp_nvshmem_alltoallv(const cudecompNvshmemA2AParams<cuda::std::complex<double>>& params,
+void cudecomp_nvshmem_alltoallv(const cudecompNvshmemA2AParams<cuda::std::complex<float>>& params, uint64_t* sig_addr,
                                 cudaStream_t stream) {
-  cudecomp_nvshmem_alltoallv_k<<<1, CUDECOMP_CUDA_NTHREADS, 0, stream>>>(params);
+  launch_nvshmem_alltoallv(params, sig_addr, stream);
+}
+
+void cudecomp_nvshmem_alltoallv(const cudecompNvshmemA2AParams<cuda::std::complex<double>>& params, uint64_t* sig_addr,
+                                cudaStream_t stream) {
+  launch_nvshmem_alltoallv(params, sig_addr, stream);
+}
+
+static int nvshmem_p2p_nblocks(int ntransfers, cudecompHandle_t handle) {
+  // Theoretical max blocks per SM based on thread count. Round down to a multiple of ntransfers so
+  // that all launched blocks fit in a single resident wave and each copy receives exactly the same
+  // number of blocks, ensuring even NVLink subscription across peers.
+  int max_blocks_per_sm = handle->device_max_threads_per_sm / CUDECOMP_NVSHMEM_NTHREADS;
+  int nblocks_max = max_blocks_per_sm * std::min(handle->device_num_sms, (int32_t)CUDECOMP_NVSHMEM_MAX_SMS);
+  if (ntransfers == 0) return nblocks_max;
+  int blocks_per_copy = std::max(1, nblocks_max / ntransfers);
+  return blocks_per_copy * ntransfers;
+}
+
+template <typename T>
+static void launch_nvshmem_alltoallv_p2p(cudecompHandle_t handle, const cudecompNvshmemP2PParams<T>& params,
+                                         uint64_t* sig_addr, cudaStream_t stream) {
+  int nblocks = nvshmem_p2p_nblocks(params.ntransfers, handle);
+  cudecomp_nvshmem_alltoallv_p2p_k<<<nblocks, CUDECOMP_NVSHMEM_NTHREADS, 0, stream>>>(params, sig_addr);
   CHECK_CUDA_LAUNCH();
 }
+
+void cudecomp_nvshmem_alltoallv_p2p(cudecompHandle_t handle, const cudecompNvshmemP2PParams<float>& params,
+                                    uint64_t* sig_addr, cudaStream_t stream) {
+  launch_nvshmem_alltoallv_p2p(handle, params, sig_addr, stream);
+}
+
+void cudecomp_nvshmem_alltoallv_p2p(cudecompHandle_t handle, const cudecompNvshmemP2PParams<double>& params,
+                                    uint64_t* sig_addr, cudaStream_t stream) {
+  launch_nvshmem_alltoallv_p2p(handle, params, sig_addr, stream);
+}
+
+void cudecomp_nvshmem_alltoallv_p2p(cudecompHandle_t handle,
+                                    const cudecompNvshmemP2PParams<cuda::std::complex<float>>& params,
+                                    uint64_t* sig_addr, cudaStream_t stream) {
+  launch_nvshmem_alltoallv_p2p(handle, params, sig_addr, stream);
+}
+
+void cudecomp_nvshmem_alltoallv_p2p(cudecompHandle_t handle,
+                                    const cudecompNvshmemP2PParams<cuda::std::complex<double>>& params,
+                                    uint64_t* sig_addr, cudaStream_t stream) {
+  launch_nvshmem_alltoallv_p2p(handle, params, sig_addr, stream);
+}
 #endif
 
 } // namespace cudecomp
diff --git a/src/cudecomp_m.cuf b/src/cudecomp_m.cuf
index 8d207b6..fb7bb04 100644
--- a/src/cudecomp_m.cuf
+++ b/src/cudecomp_m.cuf
@@ -30,6 +30,7 @@ module cudecomp
     enumerator :: CUDECOMP_TRANSPOSE_COMM_NCCL_PL = 5
     enumerator :: CUDECOMP_TRANSPOSE_COMM_NVSHMEM = 6
     enumerator :: CUDECOMP_TRANSPOSE_COMM_NVSHMEM_PL = 7
+    enumerator :: CUDECOMP_TRANSPOSE_COMM_NVSHMEM_SM = 8
   end enum
 
   ! enum for cuDecomp halo backend options
diff --git a/tests/fortran/transpose_test.f90 b/tests/fortran/transpose_test.f90
index d11af09..7060d99 100644
--- a/tests/fortran/transpose_test.f90
+++ b/tests/fortran/transpose_test.f90
@@ -48,8 +48,8 @@ module transpose_CUDECOMP_DOUBLE_COMPLEX_mod
 
   type(cudecompHandle) :: handle
   integer :: rank, nranks
-  type(cudecompGridDesc) :: grid_desc_cache(7)
-  logical :: grid_desc_cache_set(7) = .false.
+  type(cudecompGridDesc) :: grid_desc_cache(8)
+  logical :: grid_desc_cache_set(8) = .false.
   ARRTYPE, pointer, device, contiguous :: work_d(:)
   integer :: work_backend = -1
 
@@ -642,7 +642,7 @@ program main
   endif
 
   ! Free grid descriptors
-  do i = 1, 7
+  do i = 1, 8
     if (grid_desc_cache_set(i)) then
       ! Free workspace with correct grid descriptor
       if (work_backend == i) then
diff --git a/tests/test_config.yaml b/tests/test_config.yaml
index 08dcbed..f1a94d5 100644
--- a/tests/test_config.yaml
+++ b/tests/test_config.yaml
@@ -18,7 +18,7 @@ transpose_test_base: &transpose_test_base
           'hex' ,'hey', 'hez',
           'pdx', 'pdy', 'pdz']
 
-  backend: [1, 2, 3, 4, 5, 6, 7]
+  backend: [1, 2, 3, 4, 5, 6, 7, 8]
 
   gx: [128]
   gy: [124]