From 19757aed403b353791b745f0c2cb32657edada84 Mon Sep 17 00:00:00 2001 From: Nero10578 Date: Thu, 16 Oct 2025 17:49:22 -0700 Subject: [PATCH 1/3] add force p2p in custom_all_reduce --- .../device_communicators/custom_all_reduce.py | 6 +- .../marlin_moe_wna16/kernel_bf16_ite::ku4.cu | 89 -------------- .../kernel_bf16_ite::ku4b8.cu | 109 ------------------ .../kernel_bf16_ite::ku8b128.cu | 109 ------------------ .../marlin_moe_wna16/kernel_fp16_ite::ku4.cu | 89 -------------- .../kernel_fp16_ite::ku4b8.cu | 109 ------------------ .../kernel_fp16_ite::ku8b128.cu | 109 ------------------ 7 files changed, 3 insertions(+), 617 deletions(-) delete mode 100644 kernels/moe/marlin_moe_wna16/kernel_bf16_ite::ku4.cu delete mode 100644 kernels/moe/marlin_moe_wna16/kernel_bf16_ite::ku4b8.cu delete mode 100644 kernels/moe/marlin_moe_wna16/kernel_bf16_ite::ku8b128.cu delete mode 100644 kernels/moe/marlin_moe_wna16/kernel_fp16_ite::ku4.cu delete mode 100644 kernels/moe/marlin_moe_wna16/kernel_fp16_ite::ku4b8.cu delete mode 100644 kernels/moe/marlin_moe_wna16/kernel_fp16_ite::ku8b128.cu diff --git a/aphrodite/distributed/device_communicators/custom_all_reduce.py b/aphrodite/distributed/device_communicators/custom_all_reduce.py index 9572020840..28e7f5a56b 100644 --- a/aphrodite/distributed/device_communicators/custom_all_reduce.py +++ b/aphrodite/distributed/device_communicators/custom_all_reduce.py @@ -141,13 +141,13 @@ def __init__(self, assert current_platform.is_cuda_alike() fully_connected = current_platform.is_fully_connected( physical_device_ids) - if world_size > 2 and not fully_connected: - if self.rank == 0: + if envs.VLLM_FORCE_P2P == 0: + if world_size > 2 and not fully_connected: logger.warning( "Custom allreduce is disabled because it's not supported on" " more than two PCIe-only GPUs. To silence this warning, " "specify disable_custom_all_reduce=True explicitly.") - return + return # test P2P capability, this checks software/cudaruntime support # this is expensive to compute at the first time # then we cache the result diff --git a/kernels/moe/marlin_moe_wna16/kernel_bf16_ite::ku4.cu b/kernels/moe/marlin_moe_wna16/kernel_bf16_ite::ku4.cu deleted file mode 100644 index b23cf1d921..0000000000 --- a/kernels/moe/marlin_moe_wna16/kernel_bf16_ite::ku4.cu +++ /dev/null @@ -1,89 +0,0 @@ -// auto generated by generate.py -// clang-format off - -#include "kernel.h" -#include "marlin_template.h" - -namespace MARLIN_NAMESPACE_NAME { - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -} diff --git a/kernels/moe/marlin_moe_wna16/kernel_bf16_ite::ku4b8.cu b/kernels/moe/marlin_moe_wna16/kernel_bf16_ite::ku4b8.cu deleted file mode 100644 index d6602efbff..0000000000 --- a/kernels/moe/marlin_moe_wna16/kernel_bf16_ite::ku4b8.cu +++ /dev/null @@ -1,109 +0,0 @@ -// auto generated by generate.py -// clang-format off - -#include "kernel.h" -#include "marlin_template.h" - -namespace MARLIN_NAMESPACE_NAME { - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -} diff --git a/kernels/moe/marlin_moe_wna16/kernel_bf16_ite::ku8b128.cu b/kernels/moe/marlin_moe_wna16/kernel_bf16_ite::ku8b128.cu deleted file mode 100644 index 121e784b01..0000000000 --- a/kernels/moe/marlin_moe_wna16/kernel_bf16_ite::ku8b128.cu +++ /dev/null @@ -1,109 +0,0 @@ -// auto generated by generate.py -// clang-format off - -#include "kernel.h" -#include "marlin_template.h" - -namespace MARLIN_NAMESPACE_NAME { - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -} diff --git a/kernels/moe/marlin_moe_wna16/kernel_fp16_ite::ku4.cu b/kernels/moe/marlin_moe_wna16/kernel_fp16_ite::ku4.cu deleted file mode 100644 index 3e12c2016a..0000000000 --- a/kernels/moe/marlin_moe_wna16/kernel_fp16_ite::ku4.cu +++ /dev/null @@ -1,89 +0,0 @@ -// auto generated by generate.py -// clang-format off - -#include "kernel.h" -#include "marlin_template.h" - -namespace MARLIN_NAMESPACE_NAME { - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -} diff --git a/kernels/moe/marlin_moe_wna16/kernel_fp16_ite::ku4b8.cu b/kernels/moe/marlin_moe_wna16/kernel_fp16_ite::ku4b8.cu deleted file mode 100644 index 46e62b623b..0000000000 --- a/kernels/moe/marlin_moe_wna16/kernel_fp16_ite::ku4b8.cu +++ /dev/null @@ -1,109 +0,0 @@ -// auto generated by generate.py -// clang-format off - -#include "kernel.h" -#include "marlin_template.h" - -namespace MARLIN_NAMESPACE_NAME { - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -} diff --git a/kernels/moe/marlin_moe_wna16/kernel_fp16_ite::ku8b128.cu b/kernels/moe/marlin_moe_wna16/kernel_fp16_ite::ku8b128.cu deleted file mode 100644 index e9b524eebe..0000000000 --- a/kernels/moe/marlin_moe_wna16/kernel_fp16_ite::ku8b128.cu +++ /dev/null @@ -1,109 +0,0 @@ -// auto generated by generate.py -// clang-format off - -#include "kernel.h" -#include "marlin_template.h" - -namespace MARLIN_NAMESPACE_NAME { - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); - -} From 47af728165757846909830e7fe21cad53f82101b Mon Sep 17 00:00:00 2001 From: Nero10578 Date: Thu, 16 Oct 2025 17:53:23 -0700 Subject: [PATCH 2/3] fix env for p2p --- aphrodite/common/envs.py | 6 ++++++ .../distributed/device_communicators/custom_all_reduce.py | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/aphrodite/common/envs.py b/aphrodite/common/envs.py index a67c62c9ba..2e68c6c9a3 100755 --- a/aphrodite/common/envs.py +++ b/aphrodite/common/envs.py @@ -86,6 +86,7 @@ APHRODITE_USE_TRITON_AWQ: bool = False APHRODITE_ALLOW_RUNTIME_LORA_UPDATING: bool = False APHRODITE_SKIP_P2P_CHECK: bool = False + APHRODITE_FORCE_P2P: bool = False APHRODITE_DISABLED_KERNELS: list[str] = [] APHRODITE_USE_V1: bool = True APHRODITE_ROCM_USE_AITER: bool = False @@ -717,6 +718,11 @@ def get_aphrodite_port() -> Optional[int]: "APHRODITE_SKIP_P2P_CHECK": lambda: os.getenv("APHRODITE_SKIP_P2P_CHECK", "1") == "1", + # If set, Aphrodite will skip the P2P check and assume that P2P is + # available. Used for custom all-reduce kernels. + "APHRODITE_FORCE_P2P": + lambda: bool(int(os.getenv("APHRODITE_FORCE_P2P", "0"))), + # List of quantization kernels that should be disabled, used for testing # and performance comparisons. Currently only affects MPLinearKernel # selection diff --git a/aphrodite/distributed/device_communicators/custom_all_reduce.py b/aphrodite/distributed/device_communicators/custom_all_reduce.py index 28e7f5a56b..1c334c82d7 100644 --- a/aphrodite/distributed/device_communicators/custom_all_reduce.py +++ b/aphrodite/distributed/device_communicators/custom_all_reduce.py @@ -141,7 +141,7 @@ def __init__(self, assert current_platform.is_cuda_alike() fully_connected = current_platform.is_fully_connected( physical_device_ids) - if envs.VLLM_FORCE_P2P == 0: + if envs.APHRODITE_FORCE_P2P == 0: if world_size > 2 and not fully_connected: logger.warning( "Custom allreduce is disabled because it's not supported on" From 46fc948ffac2d56c8d5588a5ba4d480d64dd5291 Mon Sep 17 00:00:00 2001 From: Nero10578 Date: Thu, 16 Oct 2025 19:59:11 -0700 Subject: [PATCH 3/3] fix check self.rank==0 remove mistake --- .../device_communicators/custom_all_reduce.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/aphrodite/distributed/device_communicators/custom_all_reduce.py b/aphrodite/distributed/device_communicators/custom_all_reduce.py index 1c334c82d7..f260548d49 100644 --- a/aphrodite/distributed/device_communicators/custom_all_reduce.py +++ b/aphrodite/distributed/device_communicators/custom_all_reduce.py @@ -141,12 +141,13 @@ def __init__(self, assert current_platform.is_cuda_alike() fully_connected = current_platform.is_fully_connected( physical_device_ids) - if envs.APHRODITE_FORCE_P2P == 0: + if not envs.APHRODITE_FORCE_P2P: if world_size > 2 and not fully_connected: - logger.warning( - "Custom allreduce is disabled because it's not supported on" - " more than two PCIe-only GPUs. To silence this warning, " - "specify disable_custom_all_reduce=True explicitly.") + if self.rank == 0: + logger.warning( + "Custom allreduce is disabled because it's not supported on" + " more than two PCIe-only GPUs. To silence this warning, " + "specify disable_custom_all_reduce=True explicitly.") return # test P2P capability, this checks software/cudaruntime support # this is expensive to compute at the first time