diff --git a/ci/rocm_test.sh b/ci/rocm_test.sh new file mode 100644 index 00000000000000..51891c832ffea5 --- /dev/null +++ b/ci/rocm_test.sh @@ -0,0 +1,61 @@ +# Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# ROCm GPU Test Script - based on coverage_test.sh + +source $(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/utils.sh +init + +# Set ROCm environment +export WITH_ROCM=ON +export WITH_TESTING=ON + +mkdir -p ${PADDLE_ROOT}/build +cd ${PADDLE_ROOT}/build + +echo "::group::Install dependencies" +pip install hypothesis 2>/dev/null || true +pip install -r ${PADDLE_ROOT}/python/unittest_py/requirements.txt 2>/dev/null || true +echo "::endgroup::" + +echo "::group::Install paddle" +if ls ${PADDLE_ROOT}/build/python/dist/*whl >/dev/null 2>&1; then + pip install ${PADDLE_ROOT}/build/python/dist/*whl --force-reinstall +elif ls ${PADDLE_ROOT}/dist/*whl >/dev/null 2>&1; then + pip install ${PADDLE_ROOT}/dist/*whl --force-reinstall +fi +echo "::endgroup::" + +# Copy test support files from source directory (not build directory) +# Note: coverage_test.sh has a bug copying empty file from build dir +cp ${PADDLE_ROOT}/test/legacy_test/testsuite.py ${PADDLE_ROOT}/build/test/legacy_test/ 2>/dev/null || true +cp ${PADDLE_ROOT}/test/legacy_test/testsuite.py ${PADDLE_ROOT}/build/python 2>/dev/null || true +cp -r ${PADDLE_ROOT}/build/test/white_list ${PADDLE_ROOT}/build/python 2>/dev/null || true + +# Add source test directories to PYTHONPATH for module imports +export PYTHONPATH=${PADDLE_ROOT}/test:${PADDLE_ROOT}/test/legacy_test:${PYTHONPATH} + +ut_total_startTime_s=`date +%s` + +parallel_test_base_gpu_test + +ut_total_endTime_s=`date +%s` +echo "TestCases Total Time: $[ $ut_total_endTime_s - $ut_total_startTime_s ]s" +echo "ipipe_log_param_TestCases_Total_Time: $[ $ut_total_endTime_s - $ut_total_startTime_s ]s" >> ${PADDLE_ROOT}/build/build_summary.txt + +if [[ -f ${PADDLE_ROOT}/build/build_summary.txt ]];then +echo "=====================build summary======================" +cat ${PADDLE_ROOT}/build/build_summary.txt +echo "========================================================" +fi diff --git a/ci/utils.sh b/ci/utils.sh index f4552fa7d16602..9d062ba111be8b 100644 --- a/ci/utils.sh +++ b/ci/utils.sh @@ -693,7 +693,12 @@ function card_test() { if [ "${WITH_XPU}" == "ON" ];then CUDA_DEVICE_COUNT=1 elif [ "${WITH_ROCM}" == "ON" ];then - CUDA_DEVICE_COUNT=$(rocm-smi -i | grep DCU | wc -l) + # Support both DCU (Hygon) and AMD ROCm GPUs + # Each GPU appears multiple times in -i output, count unique GPU IDs + CUDA_DEVICE_COUNT=$(rocm-smi -i 2>/dev/null | grep "^GPU\[" | cut -d']' -f1 | sort -u | wc -l || echo 0) + if [ "${CUDA_DEVICE_COUNT}" -eq 0 ]; then + CUDA_DEVICE_COUNT=1 + fi elif [ "${WITH_IPU}" == "ON" ];then CUDA_DEVICE_COUNT=1 else @@ -821,6 +826,27 @@ set +x cp -r ${PADDLE_ROOT}/build/CTestCostData.txt ${PADDLE_ROOT}/build/Testing/Temporary/ get_quickly_disable_ut||disable_ut_quickly='disable_ut' # indicate whether the case was in quickly disable list + + # ROCm: Disable tests not supported on ROCm platform + if [ "$WITH_ROCM" == "ON" ]; then + # OneDNN/MKL-DNN/cuDNN: Intel/NVIDIA specific + rocm_skip_tests="onednn|mkldnn|cudnn" + # BF16 kernels not registered for ROCm: argsort, mode, randperm + rocm_skip_tests="${rocm_skip_tests}|test_argsort_op|test_mode_op|test_randperm_op" + # FlashAttention not available (libflashattn.so missing) + rocm_skip_tests="${rocm_skip_tests}|test_scaled_dot_product_attention|test_compat_nn_multihead_attention" + # Graphviz dependency (dot command) + rocm_skip_tests="${rocm_skip_tests}|test_capture_backward_subgraph|test_capture_fwd_graph" + # c_embedding: distributed operator not registered + rocm_skip_tests="${rocm_skip_tests}|test_c_embedding_op" + if [ -n "$disable_ut_quickly" ] && [ "$disable_ut_quickly" != "disable_ut" ]; then + disable_ut_quickly="${disable_ut_quickly}|${rocm_skip_tests}" + else + disable_ut_quickly="${rocm_skip_tests}" + fi + echo "ROCm: Skipping OneDNN/MKL-DNN/cuDNN and ROCm-incompatible tests" + fi + test_cases=$(ctest -N -V) # get all test cases if [ ${WITH_CINN:-OFF} == "ON" ]; then diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake index 17ef70b4a071c9..1c1381a154c41c 100644 --- a/cmake/external/warpctc.cmake +++ b/cmake/external/warpctc.cmake @@ -52,8 +52,8 @@ if(WITH_ROCM) set(WARPCTC_PATCH_ROCM_COMMAND patch -p1 < ${PADDLE_SOURCE_DIR}/patches/warpctc/CMakeLists.txt.rocm.patch && patch - -p1 < ${PADDLE_SOURCE_DIR}/patches/warpctc/devicetypes.cuh.patch && patch - -p1 < ${PADDLE_SOURCE_DIR}/patches/warpctc/hip.cmake.patch) + -p1 < ${PADDLE_SOURCE_DIR}/patches/warpctc/devicetypes.cuh.patch && cp + ${PADDLE_SOURCE_DIR}/patches/warpctc/hip.cmake.rocm70 cmake/hip.cmake) endif() set(WARPCTC_INCLUDE_DIR diff --git a/cmake/external/warprnnt.cmake b/cmake/external/warprnnt.cmake index ce4b43343a4e96..98d7fef40fdfb9 100644 --- a/cmake/external/warprnnt.cmake +++ b/cmake/external/warprnnt.cmake @@ -44,7 +44,8 @@ endif() if(WITH_ROCM) set(WARPRNNT_PATCH_ROCM_COMMAND patch -p1 < - ${PADDLE_SOURCE_DIR}/patches/warprnnt/CMakeLists.txt.rocm.patch) + ${PADDLE_SOURCE_DIR}/patches/warprnnt/CMakeLists.txt.rocm.patch && cp + ${PADDLE_SOURCE_DIR}/patches/warprnnt/hip.cmake.rocm70 cmake/hip.cmake) endif() if(NOT WIN32 AND WITH_GPU) if(${CMAKE_CUDA_COMPILER_VERSION} LESS 12.0 AND ${CMAKE_CXX_COMPILER_VERSION} diff --git a/cmake/hip.cmake b/cmake/hip.cmake index 5ef94239595379..549b6bc8cfc45a 100644 --- a/cmake/hip.cmake +++ b/cmake/hip.cmake @@ -6,24 +6,37 @@ if(NOT DEFINED ENV{ROCM_PATH}) set(ROCM_PATH "/opt/rocm" CACHE PATH "Path to which ROCm has been installed") - set(HIP_PATH - ${ROCM_PATH}/hip - CACHE PATH "Path to which HIP has been installed") - set(HIP_CLANG_PATH - ${ROCM_PATH}/llvm/bin - CACHE PATH "Path to which clang has been installed") else() set(ROCM_PATH $ENV{ROCM_PATH} CACHE PATH "Path to which ROCm has been installed") +endif() + +# ROCm 7.0+: HIP is now directly under ROCM_PATH, not in a separate hip subdirectory +# Check if we're using newer ROCm layout (7.0+) or older layout +if(EXISTS "${ROCM_PATH}/lib/cmake/hip/FindHIP.cmake") + # ROCm 7.0+ layout + set(HIP_PATH + ${ROCM_PATH} + CACHE PATH "Path to which HIP has been installed") + set(CMAKE_MODULE_PATH "${ROCM_PATH}/lib/cmake/hip" ${CMAKE_MODULE_PATH}) +elseif(EXISTS "${ROCM_PATH}/hip/cmake") + # Legacy ROCm layout (< 7.0) set(HIP_PATH ${ROCM_PATH}/hip CACHE PATH "Path to which HIP has been installed") - set(HIP_CLANG_PATH - ${ROCM_PATH}/llvm/bin - CACHE PATH "Path to which clang has been installed") + set(CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH}) +else() + # Fallback: assume ROCm 7.0+ layout + set(HIP_PATH + ${ROCM_PATH} + CACHE PATH "Path to which HIP has been installed") + set(CMAKE_MODULE_PATH "${ROCM_PATH}/lib/cmake/hip" ${CMAKE_MODULE_PATH}) endif() -set(CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH}) + +set(HIP_CLANG_PATH + ${ROCM_PATH}/llvm/bin + CACHE PATH "Path to which clang has been installed") set(CMAKE_PREFIX_PATH "${ROCM_PATH}" ${CMAKE_PREFIX_PATH}) find_package(HIP REQUIRED) @@ -65,11 +78,23 @@ macro(find_hip_version hip_header_file) ) endif() endmacro() -find_hip_version(${HIP_PATH}/include/hip/hip_version.h) +# ROCm 7.0+: hip_version.h is directly under ROCM_PATH/include +if(EXISTS "${ROCM_PATH}/include/hip/hip_version.h") + find_hip_version(${ROCM_PATH}/include/hip/hip_version.h) +elseif(EXISTS "${HIP_PATH}/include/hip/hip_version.h") + find_hip_version(${HIP_PATH}/include/hip/hip_version.h) +else() + message(WARNING "Cannot find hip_version.h") +endif() macro(find_package_and_include PACKAGE_NAME) find_package("${PACKAGE_NAME}" REQUIRED) - include_directories("${ROCM_PATH}/${PACKAGE_NAME}/include") + # ROCm 7.0+ uses /opt/rocm/include// instead of /opt/rocm//include/ + if(EXISTS "${ROCM_PATH}/include/${PACKAGE_NAME}") + include_directories("${ROCM_PATH}/include/${PACKAGE_NAME}") + elseif(EXISTS "${ROCM_PATH}/${PACKAGE_NAME}/include") + include_directories("${ROCM_PATH}/${PACKAGE_NAME}/include") + endif() message(STATUS "${PACKAGE_NAME} version: ${${PACKAGE_NAME}_VERSION}") endmacro() @@ -93,10 +118,10 @@ endif() # set CXX flags for HIP set(CMAKE_C_FLAGS - "${CMAKE_C_FLAGS} -D__HIP_PLATFORM_HCC__ -D__HIP_PLATFORM_AMD__ -DROCM_NO_WRAPPER_HEADER_WARNING" + "${CMAKE_C_FLAGS} -D__HIP_PLATFORM_HCC__ -D__HIP_PLATFORM_AMD__ -D__HIP__=1 -DROCM_NO_WRAPPER_HEADER_WARNING" ) set(CMAKE_CXX_FLAGS - "${CMAKE_CXX_FLAGS} -D__HIP_PLATFORM_HCC__ -D__HIP_PLATFORM_AMD__ -DROCM_NO_WRAPPER_HEADER_WARNING" + "${CMAKE_CXX_FLAGS} -D__HIP_PLATFORM_HCC__ -D__HIP_PLATFORM_AMD__ -D__HIP__=1 -DROCM_NO_WRAPPER_HEADER_WARNING" ) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_HIP") @@ -106,6 +131,7 @@ set(THRUST_DEVICE_SYSTEM THRUST_DEVICE_SYSTEM_HIP) list(APPEND HIP_CXX_FLAGS -fPIC) list(APPEND HIP_CXX_FLAGS -D__HIP_PLATFORM_HCC__=1) list(APPEND HIP_CXX_FLAGS -D__HIP_PLATFORM_AMD__=1) +list(APPEND HIP_CXX_FLAGS -D__HIP__=1) # Note(qili93): HIP has compile conflicts of float16.h as platform::float16 overload std::is_floating_point and std::is_integer list(APPEND HIP_CXX_FLAGS -D__HIP_NO_HALF_CONVERSIONS__=1) list(APPEND HIP_CXX_FLAGS -DROCM_NO_WRAPPER_HEADER_WARNING) @@ -159,15 +185,11 @@ set(HIP_CLANG_FLAGS ${HIP_CXX_FLAGS}) # Ask hcc to generate device code during compilation so we can use # host linker to link. list(APPEND HIP_HCC_FLAGS -fno-gpu-rdc) -list(APPEND HIP_HCC_FLAGS --offload-arch=gfx906) # Z100 (ZIFANG) -list(APPEND HIP_HCC_FLAGS --offload-arch=gfx926) # K100 (KONGING) -list(APPEND HIP_HCC_FLAGS --offload-arch=gfx928) # K100_AI (KONGING_AI) -list(APPEND HIP_HCC_FLAGS --offload-arch=gfx936) # BW1000 (BOWEN) +list(APPEND HIP_HCC_FLAGS --offload-arch=gfx942) # MI300 +list(APPEND HIP_HCC_FLAGS --offload-arch=gfx950) # MI350X list(APPEND HIP_CLANG_FLAGS -fno-gpu-rdc) -list(APPEND HIP_CLANG_FLAGS --offload-arch=gfx906) # Z100 (ZIFANG) -list(APPEND HIP_CLANG_FLAGS --offload-arch=gfx926) # K100 (KONGING) -list(APPEND HIP_CLANG_FLAGS --offload-arch=gfx928) # K100_AI (KONGING_AI) -list(APPEND HIP_CLANG_FLAGS --offload-arch=gfx936) # BW1000 (BOWEN) +list(APPEND HIP_CLANG_FLAGS --offload-arch=gfx942) # MI300 +list(APPEND HIP_CLANG_FLAGS --offload-arch=gfx950) # MI350X if(HIP_COMPILER STREQUAL clang) set(hip_library_name amdhip64) diff --git a/cmake/rccl.cmake b/cmake/rccl.cmake index 1f78c74f40e640..4b744253240733 100644 --- a/cmake/rccl.cmake +++ b/cmake/rccl.cmake @@ -11,13 +11,28 @@ if(WITH_RCCL) set(RCCL_ROOT ${ROCM_PATH}/rccl CACHE PATH "RCCL ROOT") - find_path( - RCCL_INCLUDE_DIR rccl.h - PATHS ${RCCL_ROOT} ${RCCL_ROOT}/include ${RCCL_ROOT}/local/include - $ENV{RCCL_ROOT} $ENV{RCCL_ROOT}/include $ENV{RCCL_ROOT}/local/include + # ROCm 7.0+: rccl.h is under include/rccl/ directory + # First try to find rccl.h directly (handles both old and new layouts) + find_file( + RCCL_HEADER_FILE rccl.h + PATHS ${ROCM_PATH}/include/rccl + ${ROCM_PATH}/include + ${RCCL_ROOT} + ${RCCL_ROOT}/include + ${RCCL_ROOT}/local/include + $ENV{RCCL_ROOT} + $ENV{RCCL_ROOT}/include + $ENV{RCCL_ROOT}/local/include NO_DEFAULT_PATH) - file(READ ${RCCL_INCLUDE_DIR}/rccl.h RCCL_VERSION_FILE_CONTENTS) + if(NOT RCCL_HEADER_FILE) + message(FATAL_ERROR "Cannot find rccl.h. Please check RCCL installation.") + endif() + + # Get the directory containing rccl.h + get_filename_component(RCCL_INCLUDE_DIR ${RCCL_HEADER_FILE} DIRECTORY) + + file(READ ${RCCL_HEADER_FILE} RCCL_VERSION_FILE_CONTENTS) string(REGEX MATCH "define NCCL_VERSION_CODE +([0-9]+)" RCCL_VERSION "${RCCL_VERSION_FILE_CONTENTS}") @@ -25,6 +40,6 @@ if(WITH_RCCL) "${RCCL_VERSION}") # 2604 for ROCM3.5 and 2708 for ROCM 3.9 - message(STATUS "Current RCCL header is ${RCCL_INCLUDE_DIR}/rccl.h. " + message(STATUS "Current RCCL header is ${RCCL_HEADER_FILE}. " "Current RCCL version is v${RCCL_VERSION}. ") endif() diff --git a/cmake/third_party.cmake b/cmake/third_party.cmake index b709c8181162fd..2ba4ebe41031a6 100755 --- a/cmake/third_party.cmake +++ b/cmake/third_party.cmake @@ -406,15 +406,11 @@ endif() list(APPEND third_party_deps extern_eigen3 extern_gflags extern_glog extern_xxhash) -list( - APPEND - third_party_deps - extern_zlib - extern_dlpack - extern_warpctc - extern_warprnnt - extern_threadpool - extern_lapack) +list(APPEND third_party_deps extern_zlib extern_dlpack extern_threadpool + extern_lapack) +if(NOT WITH_ROCM) + list(APPEND third_party_deps extern_warpctc extern_warprnnt) +endif() if(WITH_MAGMA) list(APPEND third_party_deps extern_magma) diff --git a/cmake/thrust.cmake b/cmake/thrust.cmake index 73c2c29847a34c..d690de75fb6ebd 100644 --- a/cmake/thrust.cmake +++ b/cmake/thrust.cmake @@ -1,26 +1,38 @@ function(add_thrust_patches_if_necessary) - set(thrust_detect_file ${PROJECT_BINARY_DIR}/detect_thrust.cu) - file( - WRITE ${thrust_detect_file} - "" - "#include \"thrust/version.h\"\n" - "#include \"thrust/shuffle.h\"\n" - "#include \"stdio.h\"\n" - "int main() {\n" - " int version = THRUST_VERSION;\n" - " printf(\"%d\", version);\n" - " return 0;\n" - "}\n") + # ROCm 7.0+ has rocThrust with shuffle support built-in, so no patches needed + if(WITH_ROCM) + # Check if rocThrust has shuffle.h + if(EXISTS "${ROCM_PATH}/include/thrust/shuffle.h") + message(STATUS "ROCm thrust has native shuffle support, skipping patches") + return() + endif() + endif() + + # For CUDA, check if thrust has shuffle support + if(WITH_GPU) + set(thrust_detect_file ${PROJECT_BINARY_DIR}/detect_thrust.cu) + file( + WRITE ${thrust_detect_file} + "" + "#include \"thrust/version.h\"\n" + "#include \"thrust/shuffle.h\"\n" + "#include \"stdio.h\"\n" + "int main() {\n" + " int version = THRUST_VERSION;\n" + " printf(\"%d\", version);\n" + " return 0;\n" + "}\n") - execute_process( - COMMAND "${CUDA_NVCC_EXECUTABLE}" "--run" "${thrust_detect_file}" - WORKING_DIRECTORY "${PROJECT_BINARY_DIR}/CMakeFiles/" - RESULT_VARIABLE nvcc_res - ERROR_QUIET) - if(NOT nvcc_res EQUAL 0) - set(thrust_patches "${PADDLE_SOURCE_DIR}/patches/thrust") - message(STATUS "Add thrust patches: ${thrust_patches}") - include_directories(${thrust_patches}) + execute_process( + COMMAND "${CUDA_NVCC_EXECUTABLE}" "--run" "${thrust_detect_file}" + WORKING_DIRECTORY "${PROJECT_BINARY_DIR}/CMakeFiles/" + RESULT_VARIABLE nvcc_res + ERROR_QUIET) + if(NOT nvcc_res EQUAL 0) + set(thrust_patches "${PADDLE_SOURCE_DIR}/patches/thrust") + message(STATUS "Add thrust patches: ${thrust_patches}") + include_directories(${thrust_patches}) + endif() endif() endfunction() diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index 243f798e3fee7f..48e4245ed201a6 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -42,9 +42,12 @@ limitations under the License. */ #include #include #include +// thrust headers require hipcc (rocThrust 7.0+ pulls in rocprim) +#ifdef __HIPCC__ #include #include // NOLINT #endif +#endif #include #include diff --git a/paddle/phi/api/lib/tensor_utils.cc b/paddle/phi/api/lib/tensor_utils.cc index aa62b2e7300c2c..f27595a9301f4d 100644 --- a/paddle/phi/api/lib/tensor_utils.cc +++ b/paddle/phi/api/lib/tensor_utils.cc @@ -44,7 +44,8 @@ PADDLE_API phi::Place GetPlaceFromPtr(void* data) { #else hipPointerAttribute_t attr = {}; hipError_t status = hipPointerGetAttributes(&attr, data); - if (status == hipSuccess && attr.memoryType == hipMemoryTypeDevice) { + // ROCm 7.0+ uses 'type' instead of 'memoryType' + if (status == hipSuccess && attr.type == hipMemoryTypeDevice) { return phi::GPUPlace(attr.device); } #endif diff --git a/paddle/phi/backends/dynload/magma.h b/paddle/phi/backends/dynload/magma.h index 45dc5fb73012e8..b787e0c62151a6 100644 --- a/paddle/phi/backends/dynload/magma.h +++ b/paddle/phi/backends/dynload/magma.h @@ -17,7 +17,8 @@ limitations under the License. */ #ifdef PADDLE_WITH_HIP #include -#include +// Note: thrust/complex.h is not needed for magma type definitions +// and should only be included when compiled with hipcc typedef hipDoubleComplex magmaDoubleComplex; typedef hipFloatComplex magmaFloatComplex; #endif // PADDLE_WITH_HIP diff --git a/paddle/phi/common/complex.h b/paddle/phi/common/complex.h index 20fdf1e0d1917d..0c0302b552abbd 100644 --- a/paddle/phi/common/complex.h +++ b/paddle/phi/common/complex.h @@ -28,8 +28,12 @@ #ifdef PADDLE_WITH_HIP #include +// thrust/complex.h requires hipcc compiler +// (rocThrust 7.0+ pulls in rocprim) +#if defined(__HIPCC__) || defined(__HIP_DEVICE_COMPILE__) #include // NOLINT #endif +#endif #ifndef PADDLE_WITH_HIP #if !defined(_WIN32) @@ -66,7 +70,9 @@ struct PADDLE_ALIGN(sizeof(T) * 2) complex { HOSTDEVICE constexpr complex(T real, T imag) : real(real), imag(imag) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +// thrust::complex interop: CUDA always, HIP only with hipcc +#if defined(PADDLE_WITH_CUDA) || \ + (defined(PADDLE_WITH_HIP) && defined(__HIPCC__)) template HOSTDEVICE inline explicit complex(const thrust::complex& c) { @@ -86,6 +92,9 @@ struct PADDLE_ALIGN(sizeof(T) * 2) complex { HOSTDEVICE inline explicit operator thrust::complex() const { return thrust::complex(real, imag); } +#endif + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #ifdef PADDLE_WITH_HIP HOSTDEVICE inline explicit operator hipFloatComplex() const { diff --git a/paddle/phi/core/enforce.h b/paddle/phi/core/enforce.h index 024a7de73eb72e..abcd26c54afb49 100644 --- a/paddle/phi/core/enforce.h +++ b/paddle/phi/core/enforce.h @@ -27,9 +27,12 @@ limitations under the License. */ #include #include #include +// thrust headers require hipcc (rocThrust 7.0+ pulls in rocprim) +#ifdef __HIPCC__ #include #include // NOLINT #endif +#endif #include #include diff --git a/paddle/phi/core/memory/allocation/allocator_facade.cc b/paddle/phi/core/memory/allocation/allocator_facade.cc index 61389a64a24183..aedc265585cb2c 100644 --- a/paddle/phi/core/memory/allocation/allocator_facade.cc +++ b/paddle/phi/core/memory/allocation/allocator_facade.cc @@ -42,17 +42,13 @@ #include "paddle/phi/core/platform/device/gpu/gpu_info.h" #if defined(PADDLE_WITH_CUDA) -#include "paddle/phi/backends/gpu/cuda/cuda_graph.h" -#elif defined(PADDLE_WITH_HIP) -#include "paddle/phi/backends/gpu/rocm/hip_graph.h" -#endif - #include "paddle/phi/backends/dynload/cuda_driver.h" +#include "paddle/phi/backends/gpu/cuda/cuda_graph.h" #include "paddle/phi/core/memory/allocation/cuda_malloc_async_allocator.h" #include "paddle/phi/core/memory/allocation/cuda_virtual_mem_allocator.h" #include "paddle/phi/core/memory/allocation/virtual_memory_auto_growth_best_fit_allocator.h" - -#ifdef PADDLE_WITH_HIP +#elif defined(PADDLE_WITH_HIP) +#include "paddle/phi/backends/gpu/rocm/hip_graph.h" #include "paddle/phi/core/memory/allocation/cuda_malloc_async_allocator.h" // NOLINT #endif #endif diff --git a/paddle/phi/kernels/CMakeLists.txt b/paddle/phi/kernels/CMakeLists.txt index afd80c02dbeff4..63cb903e7a4062 100644 --- a/paddle/phi/kernels/CMakeLists.txt +++ b/paddle/phi/kernels/CMakeLists.txt @@ -123,6 +123,22 @@ if(WITH_GPU "gpu/moe_unpermute_kernel.cu") endif() +# Note(ROCm 7.0): Exclude kernels that have compatibility issues with ROCm 7.0 +# These kernels use rocprim/thrust radix_sort with custom float16/bfloat16 types which +# require complex trait specializations that are incompatible with ROCm 7.0's new trait system. +# See ROCM70_UNSUPPORTED_OPS.md for details. +# TODO: Re-enable these kernels once ROCm compatibility is resolved or implement alternative sorting. +if(WITH_ROCM) + list( + REMOVE_ITEM + kernel_gpu + "gpu/argsort_kernel.cu" + "gpu/argsort_grad_kernel.cu" + "gpu/mode_kernel.cu" + "gpu/mode_grad_kernel.cu" + "gpu/randperm_kernel.cu") +endif() + if(NOT WITH_DGC) list(REMOVE_ITEM kernel_gpu "gpu/dgc_kernel.cu") endif() diff --git a/paddle/phi/kernels/funcs/blas/blas_impl.hip.h b/paddle/phi/kernels/funcs/blas/blas_impl.hip.h index a74d3347af3c97..5db84e8880f1e1 100644 --- a/paddle/phi/kernels/funcs/blas/blas_impl.hip.h +++ b/paddle/phi/kernels/funcs/blas/blas_impl.hip.h @@ -1038,9 +1038,10 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, "but received %d", dev_ctx_.GetComputeCapability())); - thrust::complex c_alpha = - thrust::complex(alpha.real, alpha.imag); - thrust::complex c_beta = thrust::complex(beta.real, beta.imag); + // Use rocblas complex types directly to avoid pulling + // in rocprim via thrust/complex.h in non-hipcc builds. + rocblas_float_complex c_alpha = {alpha.real, alpha.imag}; + rocblas_float_complex c_beta = {beta.real, beta.imag}; auto &cuda_ctx = const_cast(dev_ctx_); CUBlas::GEMM_EX(&cuda_ctx, @@ -1099,10 +1100,10 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, "but received %d", dev_ctx_.GetComputeCapability())); - thrust::complex c_alpha = - thrust::complex(alpha.real, alpha.imag); - thrust::complex c_beta = - thrust::complex(beta.real, beta.imag); + // Use rocblas complex types directly to avoid pulling + // in rocprim via thrust/complex.h in non-hipcc builds. + rocblas_double_complex c_alpha = {alpha.real, alpha.imag}; + rocblas_double_complex c_beta = {beta.real, beta.imag}; auto &cuda_ctx = const_cast(dev_ctx_); CUBlas::GEMM_EX(&cuda_ctx, diff --git a/paddle/phi/kernels/funcs/rocprim_traits.h b/paddle/phi/kernels/funcs/rocprim_traits.h new file mode 100644 index 00000000000000..0b030214541f3b --- /dev/null +++ b/paddle/phi/kernels/funcs/rocprim_traits.h @@ -0,0 +1,75 @@ +// Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#ifdef __HIPCC__ + +#include +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/common/float16.h" + +// ROCm 7.0+ uses a new traits system based on rocprim::traits::define +// This header provides trait definitions for phi::float16 and phi::bfloat16 +// to enable radix sort and other rocprim algorithms on these types. + +#if defined(ROCPRIM_VERSION) && ROCPRIM_VERSION >= 400000 +// ROCm 7.0+ (rocprim 4.0.0+) +namespace rocprim { +namespace traits { + +template <> +struct define { + // float16: sign=0x8000, exponent=0x7C00, mantissa=0x03FF + using float_bit_mask = + float_bit_mask::values; +}; + +template <> +struct define { + // bfloat16: sign=0x8000, exponent=0x7F80, mantissa=0x007F + using float_bit_mask = + float_bit_mask::values; +}; + +} // namespace traits +} // namespace rocprim + +#else +// ROCm < 7.0 uses the old traits system +namespace rocprim { +namespace detail { + +template <> +struct radix_key_codec_base + : radix_key_codec_integral {}; + +template <> +struct radix_key_codec_base + : radix_key_codec_integral {}; + +#if HIP_VERSION >= 50400000 +template <> +struct float_bit_mask : float_bit_mask {}; + +template <> +struct float_bit_mask : float_bit_mask {}; +#endif + +} // namespace detail +} // namespace rocprim + +#endif // ROCPRIM_VERSION + +#endif // __HIPCC__ diff --git a/paddle/phi/kernels/funcs/top_k_function_cuda.h b/paddle/phi/kernels/funcs/top_k_function_cuda.h index d9209f8c67d4ab..3e8061c40322b4 100644 --- a/paddle/phi/kernels/funcs/top_k_function_cuda.h +++ b/paddle/phi/kernels/funcs/top_k_function_cuda.h @@ -47,25 +47,7 @@ inline static size_t round_up(size_t n, size_t q) { } #ifdef __HIPCC__ -namespace rocprim { -namespace detail { -template <> -struct radix_key_codec_base - : radix_key_codec_integral {}; - -template <> -struct radix_key_codec_base - : radix_key_codec_integral {}; - -#if HIP_VERSION >= 50400000 -template <> -struct float_bit_mask : float_bit_mask {}; - -template <> -struct float_bit_mask : float_bit_mask {}; -#endif -} // namespace detail -} // namespace rocprim +#include "paddle/phi/kernels/funcs/rocprim_traits.h" namespace cub = hipcub; #else // set cub base traits in order to handle float16 diff --git a/paddle/phi/kernels/funcs/values_vectors_functor.h b/paddle/phi/kernels/funcs/values_vectors_functor.h index a89598417f9b7f..09a2f30e9b4f70 100644 --- a/paddle/phi/kernels/funcs/values_vectors_functor.h +++ b/paddle/phi/kernels/funcs/values_vectors_functor.h @@ -17,7 +17,11 @@ #include "paddle/phi/backends/dynload/cusolver.h" #endif // PADDLE_WITH_CUDA #ifdef PADDLE_WITH_HIP +// thrust/device_vector.h requires hipcc +// (rocThrust 7.0+ pulls in rocprim) +#ifdef __HIPCC__ #include +#endif #include "paddle/phi/backends/dynload/rocsolver.h" #endif // PADDLE_WITH_HIP #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) @@ -327,7 +331,9 @@ struct MatrixEighFunctor { } }; -#ifdef PADDLE_WITH_HIP +// HIP code using thrust::device_vector requires hipcc +// (rocThrust 7.0+ pulls in rocprim) +#if defined(PADDLE_WITH_HIP) && defined(__HIPCC__) #define ROCSOLVER_SYEVJ_BATCHED_ARGTYPES(scalar_t, value_t) \ solverHandle_t handle, rocblas_esort esort, rocblas_evect evect, \ rocblas_fill uplo, int n, scalar_t *const A[], int lda, \ diff --git a/paddle/phi/kernels/gpu/argsort_grad_kernel.cu b/paddle/phi/kernels/gpu/argsort_grad_kernel.cu index 9e9efc87de7965..7f45d48dd9c8b8 100644 --- a/paddle/phi/kernels/gpu/argsort_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/argsort_grad_kernel.cu @@ -26,17 +26,7 @@ #include "paddle/phi/kernels/transpose_kernel.h" #ifdef __HIPCC__ -namespace rocprim { -namespace detail { -template <> -struct radix_key_codec_base - : radix_key_codec_integral {}; - -template <> -struct radix_key_codec_base - : radix_key_codec_integral {}; -} // namespace detail -} // namespace rocprim +#include "paddle/phi/kernels/funcs/rocprim_traits.h" #else // set cub base traits in order to handle float16 namespace cub { diff --git a/paddle/phi/kernels/gpu/argsort_kernel.cu b/paddle/phi/kernels/gpu/argsort_kernel.cu index b351fe22a13104..a97e10670488bc 100644 --- a/paddle/phi/kernels/gpu/argsort_kernel.cu +++ b/paddle/phi/kernels/gpu/argsort_kernel.cu @@ -29,25 +29,7 @@ #include "paddle/phi/kernels/transpose_kernel.h" #ifdef __HIPCC__ -namespace rocprim { -namespace detail { -template <> -struct radix_key_codec_base - : radix_key_codec_integral {}; - -template <> -struct radix_key_codec_base - : radix_key_codec_integral {}; - -#if HIP_VERSION >= 50400000 -template <> -struct float_bit_mask : float_bit_mask {}; - -template <> -struct float_bit_mask : float_bit_mask {}; -#endif -} // namespace detail -} // namespace rocprim +#include "paddle/phi/kernels/funcs/rocprim_traits.h" #else // set cub base traits in order to handle float16 namespace cub { diff --git a/paddle/phi/kernels/gpu/graph_send_ue_recv_funcs.h b/paddle/phi/kernels/gpu/graph_send_ue_recv_funcs.h index 3d6eb173d10f47..64be2659737efb 100644 --- a/paddle/phi/kernels/gpu/graph_send_ue_recv_funcs.h +++ b/paddle/phi/kernels/gpu/graph_send_ue_recv_funcs.h @@ -14,8 +14,12 @@ // limitations under the License. #pragma once +// thrust headers require nvcc/hipcc +// (rocThrust 7.0+ pulls in rocprim) +#if defined(__NVCC__) || defined(__HIPCC__) #include #include +#endif #include "paddle/common/hostdevice.h" #include "paddle/phi/backends/gpu/gpu_context.h" diff --git a/patches/thrust/thrust/detail/shuffle.inl b/patches/thrust/thrust/detail/shuffle.inl index edccc878731ef4..83c4f135d4f941 100644 --- a/patches/thrust/thrust/detail/shuffle.inl +++ b/patches/thrust/thrust/detail/shuffle.inl @@ -19,8 +19,9 @@ */ #include -#include +// Note: ROCm 7.0+ rocThrust removed cpp11_required.h +// The cpp dialect check is now handled by config/cpp_dialect.h which is included via config.h #if THRUST_CPP_DIALECT >= 2011 #include diff --git a/patches/thrust/thrust/shuffle.h b/patches/thrust/thrust/shuffle.h index 427414df7c11b9..1b0cbedeba6c25 100644 --- a/patches/thrust/thrust/shuffle.h +++ b/patches/thrust/thrust/shuffle.h @@ -35,8 +35,9 @@ #pragma once #include -#include +// Note: ROCm 7.0+ rocThrust removed cpp11_required.h +// The cpp dialect check is now handled by config/cpp_dialect.h which is included via config.h #if THRUST_CPP_DIALECT >= 2011 #include diff --git a/patches/thrust/thrust/system/detail/generic/shuffle.h b/patches/thrust/thrust/system/detail/generic/shuffle.h index 3b5feb1c3def82..825e4353ff180f 100644 --- a/patches/thrust/thrust/system/detail/generic/shuffle.h +++ b/patches/thrust/thrust/system/detail/generic/shuffle.h @@ -35,8 +35,9 @@ #pragma once #include -#include +// Note: ROCm 7.0+ rocThrust removed cpp11_required.h +// The cpp dialect check is now handled by config/cpp_dialect.h which is included via config.h #if THRUST_CPP_DIALECT >= 2011 #include diff --git a/patches/warpctc/hip.cmake.rocm70 b/patches/warpctc/hip.cmake.rocm70 new file mode 100644 index 00000000000000..79ee7e6ff61994 --- /dev/null +++ b/patches/warpctc/hip.cmake.rocm70 @@ -0,0 +1,108 @@ +if(NOT WITH_ROCM) + return() +endif() + +# ROCm 7.0+: HIP is now directly under ROCM_PATH, not in a separate hip subdirectory +if(NOT DEFINED ENV{ROCM_PATH}) + set(ROCM_PATH "/opt/rocm" CACHE PATH "Path to which ROCm has been installed") +else() + set(ROCM_PATH $ENV{ROCM_PATH} CACHE PATH "Path to which ROCm has been installed") +endif() + +# Check if we're using newer ROCm layout (7.0+) or older layout +if(EXISTS "${ROCM_PATH}/lib/cmake/hip/FindHIP.cmake") + # ROCm 7.0+ layout + set(HIP_PATH ${ROCM_PATH} CACHE PATH "Path to which HIP has been installed") + set(CMAKE_MODULE_PATH "${ROCM_PATH}/lib/cmake/hip" ${CMAKE_MODULE_PATH}) +elseif(EXISTS "${ROCM_PATH}/hip/cmake") + # Legacy ROCm layout (< 7.0) + set(HIP_PATH ${ROCM_PATH}/hip CACHE PATH "Path to which HIP has been installed") + set(CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH}) +else() + # Fallback: assume ROCm 7.0+ layout + set(HIP_PATH ${ROCM_PATH} CACHE PATH "Path to which HIP has been installed") + set(CMAKE_MODULE_PATH "${ROCM_PATH}/lib/cmake/hip" ${CMAKE_MODULE_PATH}) +endif() + +set(HIP_CLANG_PATH ${ROCM_PATH}/llvm/bin CACHE PATH "Path to which clang has been installed") +set(CMAKE_PREFIX_PATH "${ROCM_PATH}" ${CMAKE_PREFIX_PATH}) + +find_package(HIP REQUIRED) +include_directories(${ROCM_PATH}/include) +message(STATUS "HIP version: ${HIP_VERSION}") +message(STATUS "HIP_CLANG_PATH: ${HIP_CLANG_PATH}") +MESSAGE(STATUS "HIP_ROOT_DIR: ${HIP_ROOT_DIR}") + +macro(find_package_and_include PACKAGE_NAME) + find_package("${PACKAGE_NAME}" REQUIRED) + include_directories("${ROCM_PATH}/${PACKAGE_NAME}/include") + message(STATUS "${PACKAGE_NAME} version: ${${PACKAGE_NAME}_VERSION}") +endmacro() + +find_package_and_include(hiprand) +find_package_and_include(rocrand) +find_package_and_include(rocthrust) + +# set CXX flags for HIP +set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -D__HIP_PLATFORM_HCC__") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D__HIP_PLATFORM_HCC__") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_HIP") +set(THRUST_DEVICE_SYSTEM THRUST_DEVICE_SYSTEM_HIP) + +# define HIP_CXX_FLAGS +list(APPEND HIP_CXX_FLAGS -fPIC) +list(APPEND HIP_CXX_FLAGS -D__HIP_PLATFORM_HCC__=1) +# Note(qili93): HIP has compile conflicts of float16.h as platform::float16 overload std::is_floating_point and std::is_integer +list(APPEND HIP_CXX_FLAGS -D__HIP_NO_HALF_CONVERSIONS__=1) +list(APPEND HIP_CXX_FLAGS -Wno-macro-redefined) +list(APPEND HIP_CXX_FLAGS -Wno-inconsistent-missing-override) +list(APPEND HIP_CXX_FLAGS -Wno-exceptions) +list(APPEND HIP_CXX_FLAGS -Wno-shift-count-negative) +list(APPEND HIP_CXX_FLAGS -Wno-shift-count-overflow) +list(APPEND HIP_CXX_FLAGS -Wno-unused-command-line-argument) +list(APPEND HIP_CXX_FLAGS -Wno-duplicate-decl-specifier) +list(APPEND HIP_CXX_FLAGS -Wno-implicit-int-float-conversion) +list(APPEND HIP_CXX_FLAGS -Wno-pass-failed) +list(APPEND HIP_CXX_FLAGS -DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_HIP) +list(APPEND HIP_CXX_FLAGS -std=c++14) + +if(CMAKE_BUILD_TYPE MATCHES Debug) + list(APPEND HIP_CXX_FLAGS -g2) + list(APPEND HIP_CXX_FLAGS -O0) + list(APPEND HIP_HIPCC_FLAGS -fdebug-info-for-profiling) +endif(CMAKE_BUILD_TYPE MATCHES Debug) + +set(HIP_HCC_FLAGS ${HIP_CXX_FLAGS}) +set(HIP_CLANG_FLAGS ${HIP_CXX_FLAGS}) +# Ask hcc to generate device code during compilation so we can use +# host linker to link. +list(APPEND HIP_HCC_FLAGS -fno-gpu-rdc) +list(APPEND HIP_HCC_FLAGS --amdgpu-target=gfx906) +list(APPEND HIP_HCC_FLAGS --amdgpu-target=gfx908) +list(APPEND HIP_HCC_FLAGS --amdgpu-target=gfx90a) +list(APPEND HIP_HCC_FLAGS --amdgpu-target=gfx942) +list(APPEND HIP_HCC_FLAGS --amdgpu-target=gfx950) +list(APPEND HIP_CLANG_FLAGS -fno-gpu-rdc) +list(APPEND HIP_CLANG_FLAGS --amdgpu-target=gfx906) +list(APPEND HIP_CLANG_FLAGS --amdgpu-target=gfx908) +list(APPEND HIP_CLANG_FLAGS --amdgpu-target=gfx90a) +list(APPEND HIP_CLANG_FLAGS --amdgpu-target=gfx942) +list(APPEND HIP_CLANG_FLAGS --amdgpu-target=gfx950) + + +if(HIP_COMPILER STREQUAL clang) + set(hip_library_name amdhip64) +else() + set(hip_library_name hip_hcc) +endif() +message(STATUS "HIP library name: ${hip_library_name}") + +# set HIP link libs - check multiple possible library locations for ROCm 7.0+ +find_library(ROCM_HIPRTC_LIB ${hip_library_name} + HINTS ${ROCM_PATH}/lib ${HIP_PATH}/lib + NO_DEFAULT_PATH) +if(NOT ROCM_HIPRTC_LIB) + find_library(ROCM_HIPRTC_LIB ${hip_library_name}) +endif() +message(STATUS "ROCM_HIPRTC_LIB: ${ROCM_HIPRTC_LIB}") + diff --git a/patches/warprnnt/hip.cmake.rocm70 b/patches/warprnnt/hip.cmake.rocm70 new file mode 100644 index 00000000000000..79ee7e6ff61994 --- /dev/null +++ b/patches/warprnnt/hip.cmake.rocm70 @@ -0,0 +1,108 @@ +if(NOT WITH_ROCM) + return() +endif() + +# ROCm 7.0+: HIP is now directly under ROCM_PATH, not in a separate hip subdirectory +if(NOT DEFINED ENV{ROCM_PATH}) + set(ROCM_PATH "/opt/rocm" CACHE PATH "Path to which ROCm has been installed") +else() + set(ROCM_PATH $ENV{ROCM_PATH} CACHE PATH "Path to which ROCm has been installed") +endif() + +# Check if we're using newer ROCm layout (7.0+) or older layout +if(EXISTS "${ROCM_PATH}/lib/cmake/hip/FindHIP.cmake") + # ROCm 7.0+ layout + set(HIP_PATH ${ROCM_PATH} CACHE PATH "Path to which HIP has been installed") + set(CMAKE_MODULE_PATH "${ROCM_PATH}/lib/cmake/hip" ${CMAKE_MODULE_PATH}) +elseif(EXISTS "${ROCM_PATH}/hip/cmake") + # Legacy ROCm layout (< 7.0) + set(HIP_PATH ${ROCM_PATH}/hip CACHE PATH "Path to which HIP has been installed") + set(CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH}) +else() + # Fallback: assume ROCm 7.0+ layout + set(HIP_PATH ${ROCM_PATH} CACHE PATH "Path to which HIP has been installed") + set(CMAKE_MODULE_PATH "${ROCM_PATH}/lib/cmake/hip" ${CMAKE_MODULE_PATH}) +endif() + +set(HIP_CLANG_PATH ${ROCM_PATH}/llvm/bin CACHE PATH "Path to which clang has been installed") +set(CMAKE_PREFIX_PATH "${ROCM_PATH}" ${CMAKE_PREFIX_PATH}) + +find_package(HIP REQUIRED) +include_directories(${ROCM_PATH}/include) +message(STATUS "HIP version: ${HIP_VERSION}") +message(STATUS "HIP_CLANG_PATH: ${HIP_CLANG_PATH}") +MESSAGE(STATUS "HIP_ROOT_DIR: ${HIP_ROOT_DIR}") + +macro(find_package_and_include PACKAGE_NAME) + find_package("${PACKAGE_NAME}" REQUIRED) + include_directories("${ROCM_PATH}/${PACKAGE_NAME}/include") + message(STATUS "${PACKAGE_NAME} version: ${${PACKAGE_NAME}_VERSION}") +endmacro() + +find_package_and_include(hiprand) +find_package_and_include(rocrand) +find_package_and_include(rocthrust) + +# set CXX flags for HIP +set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -D__HIP_PLATFORM_HCC__") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D__HIP_PLATFORM_HCC__") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_HIP") +set(THRUST_DEVICE_SYSTEM THRUST_DEVICE_SYSTEM_HIP) + +# define HIP_CXX_FLAGS +list(APPEND HIP_CXX_FLAGS -fPIC) +list(APPEND HIP_CXX_FLAGS -D__HIP_PLATFORM_HCC__=1) +# Note(qili93): HIP has compile conflicts of float16.h as platform::float16 overload std::is_floating_point and std::is_integer +list(APPEND HIP_CXX_FLAGS -D__HIP_NO_HALF_CONVERSIONS__=1) +list(APPEND HIP_CXX_FLAGS -Wno-macro-redefined) +list(APPEND HIP_CXX_FLAGS -Wno-inconsistent-missing-override) +list(APPEND HIP_CXX_FLAGS -Wno-exceptions) +list(APPEND HIP_CXX_FLAGS -Wno-shift-count-negative) +list(APPEND HIP_CXX_FLAGS -Wno-shift-count-overflow) +list(APPEND HIP_CXX_FLAGS -Wno-unused-command-line-argument) +list(APPEND HIP_CXX_FLAGS -Wno-duplicate-decl-specifier) +list(APPEND HIP_CXX_FLAGS -Wno-implicit-int-float-conversion) +list(APPEND HIP_CXX_FLAGS -Wno-pass-failed) +list(APPEND HIP_CXX_FLAGS -DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_HIP) +list(APPEND HIP_CXX_FLAGS -std=c++14) + +if(CMAKE_BUILD_TYPE MATCHES Debug) + list(APPEND HIP_CXX_FLAGS -g2) + list(APPEND HIP_CXX_FLAGS -O0) + list(APPEND HIP_HIPCC_FLAGS -fdebug-info-for-profiling) +endif(CMAKE_BUILD_TYPE MATCHES Debug) + +set(HIP_HCC_FLAGS ${HIP_CXX_FLAGS}) +set(HIP_CLANG_FLAGS ${HIP_CXX_FLAGS}) +# Ask hcc to generate device code during compilation so we can use +# host linker to link. +list(APPEND HIP_HCC_FLAGS -fno-gpu-rdc) +list(APPEND HIP_HCC_FLAGS --amdgpu-target=gfx906) +list(APPEND HIP_HCC_FLAGS --amdgpu-target=gfx908) +list(APPEND HIP_HCC_FLAGS --amdgpu-target=gfx90a) +list(APPEND HIP_HCC_FLAGS --amdgpu-target=gfx942) +list(APPEND HIP_HCC_FLAGS --amdgpu-target=gfx950) +list(APPEND HIP_CLANG_FLAGS -fno-gpu-rdc) +list(APPEND HIP_CLANG_FLAGS --amdgpu-target=gfx906) +list(APPEND HIP_CLANG_FLAGS --amdgpu-target=gfx908) +list(APPEND HIP_CLANG_FLAGS --amdgpu-target=gfx90a) +list(APPEND HIP_CLANG_FLAGS --amdgpu-target=gfx942) +list(APPEND HIP_CLANG_FLAGS --amdgpu-target=gfx950) + + +if(HIP_COMPILER STREQUAL clang) + set(hip_library_name amdhip64) +else() + set(hip_library_name hip_hcc) +endif() +message(STATUS "HIP library name: ${hip_library_name}") + +# set HIP link libs - check multiple possible library locations for ROCm 7.0+ +find_library(ROCM_HIPRTC_LIB ${hip_library_name} + HINTS ${ROCM_PATH}/lib ${HIP_PATH}/lib + NO_DEFAULT_PATH) +if(NOT ROCM_HIPRTC_LIB) + find_library(ROCM_HIPRTC_LIB ${hip_library_name}) +endif() +message(STATUS "ROCM_HIPRTC_LIB: ${ROCM_HIPRTC_LIB}") + diff --git a/python/paddle/utils/cpp_extension/extension_utils.py b/python/paddle/utils/cpp_extension/extension_utils.py index 66caea43acff11..4f95777ea9c2da 100644 --- a/python/paddle/utils/cpp_extension/extension_utils.py +++ b/python/paddle/utils/cpp_extension/extension_utils.py @@ -429,6 +429,10 @@ def _get_cuda_arch_flags(cflags: list[str] | None = None) -> list[str]: For an added "+PTX", an additional ``-gencode=arch=compute_xx,code=compute_xx`` is added. """ + # ROCm uses get_rocm_arch_flags instead, not CUDA arch flags + if core.is_compiled_with_rocm(): + return [] + # If cflags is given, there may already be user-provided arch flags in it if cflags is not None: for flag in cflags: diff --git a/test/compat/test_cpp_extension_api.py b/test/compat/test_cpp_extension_api.py index 292e04036a8b08..78d128908945f0 100644 --- a/test/compat/test_cpp_extension_api.py +++ b/test/compat/test_cpp_extension_api.py @@ -26,9 +26,10 @@ ) -@unittest.skipIf(not core.is_compiled_with_cuda(), 'should compile with cuda.') class TestGetCudaArchFlags(unittest.TestCase): def setUp(self): + if not core.is_compiled_with_cuda() or core.is_compiled_with_rocm(): + self.skipTest('should compile with cuda (not rocm).') self._old_env = dict(os.environ) def tearDown(self): @@ -91,6 +92,12 @@ def test_skip_paddle_extension_name_flag(self): flags = _get_cuda_arch_flags(cflags=["-DPADDLE_EXTENSION_NAME=my_ext"]) self.assertNotEqual(flags, []) + def test_rocm_returns_empty_flags(self): + with mock.patch.object( + extension_utils.core, "is_compiled_with_rocm", return_value=True + ): + self.assertEqual(_get_cuda_arch_flags(), []) + class TestCppExtensionUtils(unittest.TestCase): def test_cuda_home(self): diff --git a/test/legacy_test/test_registered_phi_kernels.py b/test/legacy_test/test_registered_phi_kernels.py index cc1a89ba87d43e..429115f4e4c9a8 100644 --- a/test/legacy_test/test_registered_phi_kernels.py +++ b/test/legacy_test/test_registered_phi_kernels.py @@ -61,7 +61,16 @@ def setUp(self): self.forward_ops = [] self.backward_ops = [] - root_path = pathlib.Path(__file__).parents[3] + root_path = None + for parent in pathlib.Path(__file__).parents: + if parent.joinpath('paddle/phi/ops/yaml/ops.yaml').is_file(): + root_path = parent + break + if root_path is None: + raise FileNotFoundError( + "Cannot locate repo root containing " + "paddle/phi/ops/yaml/ops.yaml" + ) ops_yaml_path = [ 'paddle/phi/ops/yaml/ops.yaml', diff --git a/tools/test_runner.py b/tools/test_runner.py index 96278b9050a3e4..e85411c8c1f62c 100644 --- a/tools/test_runner.py +++ b/tools/test_runner.py @@ -23,6 +23,7 @@ from paddle.base import core sys.path.append(os.path.abspath(os.path.dirname(__file__))) + sys.path.append( os.path.abspath( os.path.join(os.path.dirname(__file__), "..", "build", "test")