2 changes: 0 additions & 2 deletions examples/distributed/all_reduce.py
@@ -240,8 +240,6 @@ def main() -> None:
Sets up the distributed environment, initializes CUDA devices, runs the
all-reduce test, and then cleans up.
"""
- # Only NVSHMEM backend implements `get_remote_tensor` for now.
- symm_mem.set_backend("NVSHMEM")

Contributor
curious, should we explicitly set it to "cuda"?

Contributor Author
I think that's the default already

rank = int(os.environ["LOCAL_RANK"])
torch.manual_seed(42 + rank)
device = torch.device(f"cuda:{rank}")
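For orientation, here is a minimal sketch of the main() structure these examples share after the change, with no explicit backend selection. It is an illustration based on the context lines above, not the examples' actual code: the NCCL process-group setup and the symm_mem.empty / symm_mem.rendezvous calls are assumptions about what the surrounding example does, and the snippet simply relies on whatever default backend symmetric memory picks.

```python
# Hypothetical sketch: shared setup after dropping symm_mem.set_backend("NVSHMEM").
# Assumes a torchrun launch (LOCAL_RANK is set); the empty/rendezvous usage below
# is illustrative, not copied from the example files.
import os

import torch
import torch.distributed as dist
import torch.distributed._symmetric_memory as symm_mem


def main() -> None:
    rank = int(os.environ["LOCAL_RANK"])
    torch.manual_seed(42 + rank)
    device = torch.device(f"cuda:{rank}")
    torch.cuda.set_device(device)

    dist.init_process_group("nccl")
    try:
        # Allocate a symmetric-memory buffer and rendezvous across ranks;
        # whichever backend is the default on this platform is used.
        buf = symm_mem.empty(4096, dtype=torch.bfloat16, device=device)
        symm_mem.rendezvous(buf, dist.group.WORLD.group_name)
        # ... run the actual all-reduce / reduce-scatter test here ...
    finally:
        dist.destroy_process_group()


if __name__ == "__main__":
    main()
```

Launched with something like `torchrun --nproc-per-node=<ngpus> all_reduce.py`, each rank picks its device from LOCAL_RANK as shown above.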
1 change: 0 additions & 1 deletion examples/distributed/allreduce_bias_rmsnorm.py
@@ -285,7 +285,6 @@ def test(N: int, D: int, device: torch.device, dtype: torch.dtype) -> None:


def main() -> None:
- symm_mem.set_backend("NVSHMEM")
rank = int(os.environ["LOCAL_RANK"])
torch.manual_seed(42 + rank)
device = torch.device(f"cuda:{rank}")
1 change: 0 additions & 1 deletion examples/distributed/matmul_reduce_scatter.py
@@ -200,7 +200,6 @@ def test(M: int, N: int, K: int, device: torch.device, dtype: torch.dtype) -> None:


def main() -> None:
- symm_mem.set_backend("NVSHMEM")
rank = int(os.environ["LOCAL_RANK"])
torch.manual_seed(42 + rank)
device = torch.device(f"cuda:{rank}")
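If a particular platform still needs the NVSHMEM backend explicitly (for example, for ops that only that backend implements, as the removed comment noted for `get_remote_tensor`), the call these examples dropped can simply be restored where it was, at the top of main(). A minimal sketch, using only the set_backend call that appears in the diff above:

```python
import torch.distributed._symmetric_memory as symm_mem


def main() -> None:
    # Opt back into NVSHMEM explicitly; by default the examples now rely on
    # whatever backend symmetric memory selects on its own.
    symm_mem.set_backend("NVSHMEM")
    ...
```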