Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions aphrodite/common/envs.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@
APHRODITE_USE_TRITON_AWQ: bool = False
APHRODITE_ALLOW_RUNTIME_LORA_UPDATING: bool = False
APHRODITE_SKIP_P2P_CHECK: bool = False
APHRODITE_FORCE_P2P: bool = False
APHRODITE_DISABLED_KERNELS: list[str] = []
APHRODITE_USE_V1: bool = True
APHRODITE_ROCM_USE_AITER: bool = False
Expand Down Expand Up @@ -717,6 +718,11 @@ def get_aphrodite_port() -> Optional[int]:
"APHRODITE_SKIP_P2P_CHECK":
lambda: os.getenv("APHRODITE_SKIP_P2P_CHECK", "1") == "1",

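# If set, Aphrodite will not disable custom all-reduce when more than two
# GPUs are not fully connected (e.g. PCIe-only), and will instead assume
# that P2P is available between them. Used for custom all-reduce kernels.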
"APHRODITE_FORCE_P2P":
lambda: bool(int(os.getenv("APHRODITE_FORCE_P2P", "0"))),

# List of quantization kernels that should be disabled, used for testing
# and performance comparisons. Currently only affects MPLinearKernel
# selection
Expand Down
15 changes: 8 additions & 7 deletions aphrodite/distributed/device_communicators/custom_all_reduce.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,13 +141,14 @@ def __init__(self,
assert current_platform.is_cuda_alike()
fully_connected = current_platform.is_fully_connected(
physical_device_ids)
if world_size > 2 and not fully_connected:
if self.rank == 0:
logger.warning(
"Custom allreduce is disabled because it's not supported on"
" more than two PCIe-only GPUs. To silence this warning, "
"specify disable_custom_all_reduce=True explicitly.")
return
if not envs.APHRODITE_FORCE_P2P:
if world_size > 2 and not fully_connected:
if self.rank == 0:
logger.warning(
"Custom allreduce is disabled because it's not supported on"
" more than two PCIe-only GPUs. To silence this warning, "
"specify disable_custom_all_reduce=True explicitly.")
return
# test P2P capability, this checks software/cudaruntime support
# this is expensive to compute at the first time
# then we cache the result
Expand Down
89 changes: 0 additions & 89 deletions kernels/moe/marlin_moe_wna16/kernel_bf16_ite::ku4.cu

This file was deleted.

109 changes: 0 additions & 109 deletions kernels/moe/marlin_moe_wna16/kernel_bf16_ite::ku4b8.cu

This file was deleted.

Loading
Loading