From 2fe20007ef2563e56d61c8670e555cf0dcee55c6 Mon Sep 17 00:00:00 2001 From: Cole Kendrick Date: Tue, 27 Jul 2021 09:26:02 -0600 Subject: [PATCH 01/15] Initial hipify of source --- kgemm_nn_batched.cpp | 2 +- kgemm_nt_batched.cpp | 2 +- kroncommon.hpp | 3 +-- kronmult1_batched.cpp | 2 +- kronmult1_pbatched.cpp | 2 +- kronmult1_xbatched.cpp | 2 +- kronmult2_batched.cpp | 2 +- kronmult2_pbatched.cpp | 2 +- kronmult2_xbatched.cpp | 2 +- kronmult3_batched.cpp | 2 +- kronmult3_pbatched.cpp | 2 +- kronmult3_xbatched.cpp | 2 +- kronmult4_batched.cpp | 2 +- kronmult4_pbatched.cpp | 2 +- kronmult4_xbatched.cpp | 2 +- kronmult5_batched.cpp | 2 +- kronmult5_pbatched.cpp | 2 +- kronmult5_xbatched.cpp | 2 +- kronmult6_batched.cpp | 2 +- kronmult6_pbatched.cpp | 2 +- kronmult6_xbatched.cpp | 2 +- test_kgemm_nn_batched.cpp | 40 +++++++++++++++++------------------ test_kgemm_nt_batched.cpp | 36 +++++++++++++++---------------- test_kronmult6_batched.cpp | 42 ++++++++++++++++++------------------- test_kronmult6_pbatched.cpp | 42 ++++++++++++++++++------------------- test_kronmult6_xbatched.cpp | 42 ++++++++++++++++++------------------- 26 files changed, 122 insertions(+), 123 deletions(-) diff --git a/kgemm_nn_batched.cpp b/kgemm_nn_batched.cpp index 3ba54fd..96f2a08 100644 --- a/kgemm_nn_batched.cpp +++ b/kgemm_nn_batched.cpp @@ -20,7 +20,7 @@ void kgemm_nn_batched( int const mm, int const nn, int const kk, int constexpr nwarps = 8; int constexpr nthreads = nwarps * warpsize; - kgemm_nn_batched<<< batchCount, nthreads>>>( mm,nn,kk, + hipLaunchKernelGGL(HIP_KERNEL_NAME(kgemm_nn_batched), dim3(batchCount), dim3(nthreads), 0, 0, mm,nn,kk, alpha, Aarray_, ldAarray_, Barray_, ldBarray_, diff --git a/kgemm_nt_batched.cpp b/kgemm_nt_batched.cpp index 62c443e..059360b 100644 --- a/kgemm_nt_batched.cpp +++ b/kgemm_nt_batched.cpp @@ -19,7 +19,7 @@ void kgemm_nt_batched( int const mm, int const nn, int const kk, int constexpr nwarps = 8; int constexpr nthreads = nwarps * warpsize; - kgemm_nt_batched<<< batchCount, nthreads >>>( mm,nn,kk, + hipLaunchKernelGGL(HIP_KERNEL_NAME(kgemm_nt_batched), dim3(batchCount), dim3(nthreads ), 0, 0, mm,nn,kk, alpha, Aarray_, ldAarray_, Barray_, ldBarray_, diff --git a/kroncommon.hpp b/kroncommon.hpp index 750bc57..a5c9df5 100644 --- a/kroncommon.hpp +++ b/kroncommon.hpp @@ -4,8 +4,7 @@ #ifdef USE_GPU -#include -#include +#include #define GLOBAL_FUNCTION __global__ #define SYNCTHREADS __syncthreads() #define SHARED_MEMORY __shared__ diff --git a/kronmult1_batched.cpp b/kronmult1_batched.cpp index edfb3e0..917172d 100644 --- a/kronmult1_batched.cpp +++ b/kronmult1_batched.cpp @@ -13,7 +13,7 @@ void kronmult1_batched( int constexpr nwarps = 8; int constexpr nthreads = nwarps * warpsize; - kronmult1_batched<<< batchCount, nthreads >>>( n, + hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult1_batched), dim3(batchCount), dim3(nthreads ), 0, 0, n, Aarray_, Xarray_, Yarray_, Warray_, batchCount); #else kronmult1_batched( n, diff --git a/kronmult1_pbatched.cpp b/kronmult1_pbatched.cpp index afaa266..413191e 100644 --- a/kronmult1_pbatched.cpp +++ b/kronmult1_pbatched.cpp @@ -13,7 +13,7 @@ void kronmult1_pbatched( int constexpr nwarps = 8; int constexpr nthreads = nwarps * warpsize; - kronmult1_pbatched<<< batchCount, nthreads >>>( n, + hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult1_pbatched), dim3(batchCount), dim3(nthreads ), 0, 0, n, Aarray_, Xarray_, Yarray_, Warray_, batchCount); #else kronmult1_pbatched( n, diff --git a/kronmult1_xbatched.cpp b/kronmult1_xbatched.cpp index 61d5314..14a698b 100644 --- a/kronmult1_xbatched.cpp +++ b/kronmult1_xbatched.cpp @@ -14,7 +14,7 @@ void kronmult1_xbatched( int constexpr nwarps = 8; int constexpr nthreads = nwarps * warpsize; - kronmult1_xbatched<<< batchCount, nthreads >>>( n, + hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult1_xbatched), dim3(batchCount), dim3(nthreads ), 0, 0, n, Aarray_, lda, Xarray_, Yarray_, Warray_, batchCount); #else diff --git a/kronmult2_batched.cpp b/kronmult2_batched.cpp index 54a4b24..72ca53c 100644 --- a/kronmult2_batched.cpp +++ b/kronmult2_batched.cpp @@ -13,7 +13,7 @@ void kronmult2_batched( int constexpr nwarps = 8; int constexpr nthreads = nwarps * warpsize; - kronmult2_batched<<< batchCount, nthreads >>>( n, + hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult2_batched), dim3(batchCount), dim3(nthreads ), 0, 0, n, Aarray_, Xarray_, Yarray_, Warray_, batchCount); #else kronmult2_batched( n, diff --git a/kronmult2_pbatched.cpp b/kronmult2_pbatched.cpp index bcfe204..ace712e 100644 --- a/kronmult2_pbatched.cpp +++ b/kronmult2_pbatched.cpp @@ -13,7 +13,7 @@ void kronmult2_pbatched( int constexpr nwarps = 8; int constexpr nthreads = nwarps * warpsize; - kronmult2_pbatched<<< batchCount, nthreads >>>( n, + hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult2_pbatched), dim3(batchCount), dim3(nthreads ), 0, 0, n, Aarray_, Xarray_, Yarray_, Warray_, batchCount); #else kronmult2_pbatched( n, diff --git a/kronmult2_xbatched.cpp b/kronmult2_xbatched.cpp index bee62d4..039f244 100644 --- a/kronmult2_xbatched.cpp +++ b/kronmult2_xbatched.cpp @@ -14,7 +14,7 @@ void kronmult2_xbatched( int constexpr nwarps = 8; int constexpr nthreads = nwarps * warpsize; - kronmult2_xbatched<<< batchCount, nthreads >>>( n, + hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult2_xbatched), dim3(batchCount), dim3(nthreads ), 0, 0, n, Aarray_, lda, Xarray_, Yarray_, Warray_, batchCount); #else diff --git a/kronmult3_batched.cpp b/kronmult3_batched.cpp index 4a7e719..a69e2fb 100644 --- a/kronmult3_batched.cpp +++ b/kronmult3_batched.cpp @@ -13,7 +13,7 @@ void kronmult3_batched( int constexpr nwarps = 8; int constexpr nthreads = nwarps * warpsize; - kronmult3_batched<<< batchCount, nthreads >>>( n, + hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult3_batched), dim3(batchCount), dim3(nthreads ), 0, 0, n, Aarray_, Xarray_, Yarray_, Warray_, batchCount); #else kronmult3_batched( n, diff --git a/kronmult3_pbatched.cpp b/kronmult3_pbatched.cpp index 822f822..706fbe3 100644 --- a/kronmult3_pbatched.cpp +++ b/kronmult3_pbatched.cpp @@ -13,7 +13,7 @@ void kronmult3_pbatched( int constexpr nwarps = 8; int constexpr nthreads = nwarps * warpsize; - kronmult3_pbatched<<< batchCount, nthreads >>>( n, + hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult3_pbatched), dim3(batchCount), dim3(nthreads ), 0, 0, n, Aarray_, Xarray_, Yarray_, Warray_, batchCount); #else kronmult3_pbatched( n, diff --git a/kronmult3_xbatched.cpp b/kronmult3_xbatched.cpp index 4b609d5..6f8c785 100644 --- a/kronmult3_xbatched.cpp +++ b/kronmult3_xbatched.cpp @@ -14,7 +14,7 @@ void kronmult3_xbatched( int constexpr nwarps = 8; int constexpr nthreads = nwarps * warpsize; - kronmult3_xbatched<<< batchCount, nthreads >>>( n, + hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult3_xbatched), dim3(batchCount), dim3(nthreads ), 0, 0, n, Aarray_, lda, Xarray_, Yarray_, Warray_, batchCount); #else diff --git a/kronmult4_batched.cpp b/kronmult4_batched.cpp index e9327ae..059b1a3 100644 --- a/kronmult4_batched.cpp +++ b/kronmult4_batched.cpp @@ -13,7 +13,7 @@ void kronmult4_batched( int constexpr nwarps = 8; int constexpr nthreads = nwarps * warpsize; - kronmult4_batched<<< batchCount, nthreads >>>( n, + hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult4_batched), dim3(batchCount), dim3(nthreads ), 0, 0, n, Aarray_, Xarray_, Yarray_, Warray_, batchCount); #else kronmult4_batched( n, diff --git a/kronmult4_pbatched.cpp b/kronmult4_pbatched.cpp index fae2eea..0ce2b19 100644 --- a/kronmult4_pbatched.cpp +++ b/kronmult4_pbatched.cpp @@ -13,7 +13,7 @@ void kronmult4_pbatched( int constexpr nwarps = 8; int constexpr nthreads = nwarps * warpsize; - kronmult4_pbatched<<< batchCount, nthreads >>>( n, + hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult4_pbatched), dim3(batchCount), dim3(nthreads ), 0, 0, n, Aarray_, Xarray_, Yarray_, Warray_, batchCount); #else kronmult4_pbatched( n, diff --git a/kronmult4_xbatched.cpp b/kronmult4_xbatched.cpp index 4148c1a..6c891c5 100644 --- a/kronmult4_xbatched.cpp +++ b/kronmult4_xbatched.cpp @@ -14,7 +14,7 @@ void kronmult4_xbatched( int constexpr nwarps = 8; int constexpr nthreads = nwarps * warpsize; - kronmult4_xbatched<<< batchCount, nthreads >>>( n, + hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult4_xbatched), dim3(batchCount), dim3(nthreads ), 0, 0, n, Aarray_, lda, Xarray_, Yarray_, Warray_, batchCount); #else diff --git a/kronmult5_batched.cpp b/kronmult5_batched.cpp index 417833d..1889aa1 100644 --- a/kronmult5_batched.cpp +++ b/kronmult5_batched.cpp @@ -13,7 +13,7 @@ void kronmult5_batched( int constexpr nwarps = 8; int constexpr nthreads = nwarps * warpsize; - kronmult5_batched<<< batchCount, nthreads >>>( n, + hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult5_batched), dim3(batchCount), dim3(nthreads ), 0, 0, n, Aarray_, Xarray_, Yarray_, Warray_, batchCount); #else kronmult5_batched( n, diff --git a/kronmult5_pbatched.cpp b/kronmult5_pbatched.cpp index 5398ff6..ceeff45 100644 --- a/kronmult5_pbatched.cpp +++ b/kronmult5_pbatched.cpp @@ -13,7 +13,7 @@ void kronmult5_pbatched( int constexpr nwarps = 8; int constexpr nthreads = nwarps * warpsize; - kronmult5_pbatched<<< batchCount, nthreads >>>( n, + hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult5_pbatched), dim3(batchCount), dim3(nthreads ), 0, 0, n, Aarray_, Xarray_, Yarray_, Warray_, batchCount); #else kronmult5_pbatched( n, diff --git a/kronmult5_xbatched.cpp b/kronmult5_xbatched.cpp index f3e14da..23495b4 100644 --- a/kronmult5_xbatched.cpp +++ b/kronmult5_xbatched.cpp @@ -14,7 +14,7 @@ void kronmult5_xbatched( int constexpr nwarps = 8; int constexpr nthreads = nwarps * warpsize; - kronmult5_xbatched<<< batchCount, nthreads >>>( n, + hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult5_xbatched), dim3(batchCount), dim3(nthreads ), 0, 0, n, Aarray_, lda, Xarray_, Yarray_, Warray_, batchCount); #else diff --git a/kronmult6_batched.cpp b/kronmult6_batched.cpp index 7b3a624..f4f0454 100644 --- a/kronmult6_batched.cpp +++ b/kronmult6_batched.cpp @@ -13,7 +13,7 @@ void kronmult6_batched( int constexpr nwarps = 8; int constexpr nthreads = nwarps * warpsize; - kronmult6_batched<<< batchCount, nthreads >>>( n, + hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult6_batched), dim3(batchCount), dim3(nthreads ), 0, 0, n, Aarray_, Xarray_, Yarray_, Warray_, batchCount); #else kronmult6_batched( n, diff --git a/kronmult6_pbatched.cpp b/kronmult6_pbatched.cpp index 30e29fd..7ceed69 100644 --- a/kronmult6_pbatched.cpp +++ b/kronmult6_pbatched.cpp @@ -13,7 +13,7 @@ void kronmult6_pbatched( int constexpr nwarps = 8; int constexpr nthreads = nwarps * warpsize; - kronmult6_pbatched<<< batchCount, nthreads >>>( n, + hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult6_pbatched), dim3(batchCount), dim3(nthreads ), 0, 0, n, Aarray_, Xarray_, Yarray_, Warray_, batchCount); #else kronmult6_pbatched( n, diff --git a/kronmult6_xbatched.cpp b/kronmult6_xbatched.cpp index a5ab2f9..66d8825 100644 --- a/kronmult6_xbatched.cpp +++ b/kronmult6_xbatched.cpp @@ -14,7 +14,7 @@ void kronmult6_xbatched( int constexpr nwarps = 8; int constexpr nthreads = nwarps * warpsize; - kronmult6_xbatched<<< batchCount, nthreads >>>( n, + hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult6_xbatched), dim3(batchCount), dim3(nthreads ), 0, 0, n, Aarray_, lda, Xarray_, Yarray_, Warray_, batchCount); #else diff --git a/test_kgemm_nn_batched.cpp b/test_kgemm_nn_batched.cpp index e83a728..0cfd5be 100644 --- a/test_kgemm_nn_batched.cpp +++ b/test_kgemm_nn_batched.cpp @@ -5,7 +5,7 @@ #include #ifdef USE_GPU -#include +#include #else #include #include @@ -18,11 +18,11 @@ static inline void host2gpu( void *dest, void *src, size_t nbytes ) { #ifdef USE_GPU - cudaError_t istat = cudaMemcpy( dest, - src, - nbytes, - cudaMemcpyHostToDevice ); - expect( istat == cudaSuccess ); + hipError_t istat = hipMemcpy( dest, + src, + nbytes, + hipMemcpyHostToDevice ); + assert( istat == hipSuccess ); #else memcpy( dest, src, nbytes ); #endif @@ -32,11 +32,11 @@ static inline void gpu2host( void * dest, void * src, size_t nbytes ) { #ifdef USE_GPU - cudaError_t istat = cudaMemcpy( dest, - src, - nbytes, - cudaMemcpyDeviceToHost); - expect( istat == cudaSuccess ); + hipError_t istat = hipMemcpy( dest, + src, + nbytes, + hipMemcpyDeviceToHost); + expect( istat == hipSuccess ); #else memcpy( dest, src, nbytes ); #endif @@ -47,8 +47,8 @@ static inline void *myalloc( size_t const nbytes ) { void *devPtr = nullptr; #ifdef USE_GPU - cudaError_t istat = cudaMalloc( &devPtr, nbytes ); - expect( istat == cudaSuccess ); + hipError_t istat = hipMalloc( &devPtr, nbytes ); + expect( istat == hipSuccess ); #else devPtr = malloc( nbytes ); #endif @@ -59,8 +59,8 @@ void *myalloc( size_t const nbytes ) { static inline void myfree( void * devPtr ) { #ifdef USE_GPU - cudaError_t istat = cudaFree( devPtr); - expect( istat == cudaSuccess ); + hipError_t istat = hipFree( devPtr); + expect( istat == hipSuccess ); #else free( devPtr ); #endif @@ -328,11 +328,11 @@ T test_kgemm_nn_batched( int const mm, int const nwarps = min( min(32,mm), min(nn,kk)); int const nthreads = nwarps * warpsize; - cudaError_t istat_sync_start = cudaDeviceSynchronize(); - expect( istat_sync_start == cudaSuccess ); + hipError_t istat_sync_start = hipDeviceSynchronize(); + expect( istat_sync_start == hipSuccess ); - kgemm_nn_batched<<< batchCount, nthreads >>>( mm,nn,kk, + hipLaunchKernelGGL(HIP_KERNEL_NAME(kgemm_nn_batched), dim3(batchCount), dim3(nthreads ), 0, 0, mm,nn,kk, alpha, ddAarray_, dldAarray_, ddBarray_, dldBarray_, @@ -340,8 +340,8 @@ T test_kgemm_nn_batched( int const mm, ddCarray_, dldCarray_, batchCount); - cudaError_t istat_sync_end = cudaDeviceSynchronize(); - expect( istat_sync_end == cudaSuccess ); + hipError_t istat_sync_end = hipDeviceSynchronize(); + expect( istat_sync_end == hipSuccess ); } #else { diff --git a/test_kgemm_nt_batched.cpp b/test_kgemm_nt_batched.cpp index 58c68e5..a126b8d 100644 --- a/test_kgemm_nt_batched.cpp +++ b/test_kgemm_nt_batched.cpp @@ -6,7 +6,7 @@ #include #ifdef USE_GPU -#include +#include #else #include #include @@ -19,11 +19,11 @@ static inline void host2gpu( void *dest, void *src, size_t nbytes ) { #ifdef USE_GPU - cudaError_t istat = cudaMemcpy( dest, - src, - nbytes, - cudaMemcpyHostToDevice ); - expect( istat == cudaSuccess ); + hipError_t istat = hipMemcpy( dest, + src, + nbytes, + hipMemcpyHostToDevice ); + expect( istat == hipSuccess ); #else memcpy( dest, src, nbytes ); #endif @@ -33,11 +33,11 @@ static inline void gpu2host( void * dest, void * src, size_t nbytes ) { #ifdef USE_GPU - cudaError_t istat = cudaMemcpy( dest, + hipError_t istat = hipMemcpy( dest, src, nbytes, - cudaMemcpyDeviceToHost); - expect( istat == cudaSuccess ); + hipMemcpyDeviceToHost); + expect( istat == hipSuccess ); #else memcpy( dest, src, nbytes ); #endif @@ -48,8 +48,8 @@ static inline void *myalloc( size_t const nbytes ) { void *devPtr = nullptr; #ifdef USE_GPU - cudaError_t istat = cudaMalloc( &devPtr, nbytes ); - expect( istat == cudaSuccess ); + hipError_t istat = hipMalloc( &devPtr, nbytes ); + expect( istat == hipSuccess ); #else devPtr = malloc( nbytes ); #endif @@ -60,8 +60,8 @@ void *myalloc( size_t const nbytes ) { static inline void myfree( void * devPtr ) { #ifdef USE_GPU - cudaError_t istat = cudaFree( devPtr); - expect( istat == cudaSuccess ); + hipError_t istat = hipFree( devPtr); + expect( istat == hipSuccess ); #else free( devPtr ); #endif @@ -327,11 +327,11 @@ T test_kgemm_nt_batched( int const mm, int const nwarps = min( min(32,mm), min(nn,kk)); int const nthreads = nwarps * warpsize; - cudaError_t istat_sync_start = cudaDeviceSynchronize(); - expect( istat_sync_start == cudaSuccess ); + hipError_t istat_sync_start = hipDeviceSynchronize(); + expect( istat_sync_start == hipSuccess ); - kgemm_nt_batched<<< batchCount, nthreads >>>( mm,nn,kk, + hipLaunchKernelGGL(HIP_KERNEL_NAME(kgemm_nt_batched), dim3(batchCount), dim3(nthreads ), 0, 0, mm,nn,kk, alpha, ddAarray_, dldAarray_, ddBarray_, dldBarray_, @@ -339,8 +339,8 @@ T test_kgemm_nt_batched( int const mm, ddCarray_, dldCarray_, batchCount); - cudaError_t istat_sync_end = cudaDeviceSynchronize(); - expect( istat_sync_end == cudaSuccess ); + hipError_t istat_sync_end = hipDeviceSynchronize(); + expect( istat_sync_end == hipSuccess ); } #else { diff --git a/test_kronmult6_batched.cpp b/test_kronmult6_batched.cpp index 6a100b4..609ce79 100644 --- a/test_kronmult6_batched.cpp +++ b/test_kronmult6_batched.cpp @@ -13,7 +13,7 @@ #ifdef USE_GPU -#include +#include #else #include #include @@ -24,11 +24,11 @@ static inline void host2gpu( void *dest, void *src, size_t nbytes ) { #ifdef USE_GPU - cudaError_t istat = cudaMemcpy( dest, - src, - nbytes, - cudaMemcpyHostToDevice ); - expect( istat == cudaSuccess ); + hipError_t istat = hipMemcpy( dest, + src, + nbytes, + hipMemcpyHostToDevice ); + expect( istat == hipSuccess ); #else memcpy( dest, src, nbytes ); #endif @@ -38,11 +38,11 @@ static inline void gpu2host( void *dest, void *src, size_t nbytes ) { #ifdef USE_GPU - cudaError_t istat = cudaMemcpy( dest, + hipError_t istat = hipMemcpy( dest, src, nbytes, - cudaMemcpyDeviceToHost); - expect( istat == cudaSuccess ); + hipMemcpyDeviceToHost); + expect( istat == hipSuccess ); #else memcpy( dest, src, nbytes ); #endif @@ -53,8 +53,8 @@ static inline void *myalloc( size_t nbytes ) { void *devPtr = nullptr; #ifdef USE_GPU - cudaError_t istat = cudaMalloc( &devPtr, nbytes ); - expect( istat == cudaSuccess ); + hipError_t istat = hipMalloc( &devPtr, nbytes ); + expect( istat == hipSuccess ); #else devPtr = malloc( nbytes ); #endif @@ -65,8 +65,8 @@ void *myalloc( size_t nbytes ) { static inline void myfree( void * devPtr ) { #ifdef USE_GPU - cudaError_t istat = cudaFree( devPtr); - expect( istat == cudaSuccess ); + hipError_t istat = hipFree( devPtr); + expect( istat == hipSuccess ); #else free( devPtr ); #endif @@ -195,42 +195,42 @@ T test_kronmult_batched( int const idim, // note the input Zarray will be over-written // -------------------------------------------- switch(idim) { - case 1: kronmult1_batched<<< batchCount, nthreads >>>( n, + case 1: hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult1_batched), dim3(batchCount), dim3(nthreads ), 0, 0, n, dAarray_, dZarray_, dYarray_, dWarray_, batchCount ); break; - case 2: kronmult2_batched<<< batchCount, nthreads >>>( n, + case 2: hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult2_batched), dim3(batchCount), dim3(nthreads ), 0, 0, n, dAarray_, dZarray_, dYarray_, dWarray_, batchCount ); break; - case 3: kronmult3_batched<<< batchCount, nthreads >>>( n, + case 3: hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult3_batched), dim3(batchCount), dim3(nthreads ), 0, 0, n, dAarray_, dZarray_, dYarray_, dWarray_, batchCount ); break; - case 4: kronmult4_batched<<< batchCount, nthreads >>>( n, + case 4: hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult4_batched), dim3(batchCount), dim3(nthreads ), 0, 0, n, dAarray_, dZarray_, dYarray_, dWarray_, batchCount ); break; - case 5: kronmult5_batched<<< batchCount, nthreads >>>( n, + case 5: hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult5_batched), dim3(batchCount), dim3(nthreads ), 0, 0, n, dAarray_, dZarray_, dYarray_, dWarray_, batchCount ); break; - case 6: kronmult6_batched<<< batchCount, nthreads >>>( n, + case 6: hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult6_batched), dim3(batchCount), dim3(nthreads ), 0, 0, n, dAarray_, dZarray_, dYarray_, @@ -244,8 +244,8 @@ T test_kronmult_batched( int const idim, // ------------------------------------------- // note important to wait for kernel to finish // ------------------------------------------- - cudaError_t istat = cudaDeviceSynchronize(); - expect( istat == cudaSuccess ); + hipError_t istat = hipDeviceSynchronize(); + expect( istat == hipSuccess ); } #else diff --git a/test_kronmult6_pbatched.cpp b/test_kronmult6_pbatched.cpp index f8bfc1d..62d41c4 100644 --- a/test_kronmult6_pbatched.cpp +++ b/test_kronmult6_pbatched.cpp @@ -13,7 +13,7 @@ #ifdef USE_GPU -#include +#include #else #include #include @@ -24,11 +24,11 @@ static inline void host2gpu( void *dest, void *src, size_t nbytes ) { #ifdef USE_GPU - cudaError_t istat = cudaMemcpy( dest, - src, - nbytes, - cudaMemcpyHostToDevice ); - expect( istat == cudaSuccess ); + hipError_t istat = hipMemcpy( dest, + src, + nbytes, + hipMemcpyHostToDevice ); + expect( istat == hipSuccess ); #else memcpy( dest, src, nbytes ); #endif @@ -38,11 +38,11 @@ static inline void gpu2host( void *dest, void *src, size_t nbytes ) { #ifdef USE_GPU - cudaError_t istat = cudaMemcpy( dest, + hipError_t istat = hipMemcpy( dest, src, nbytes, - cudaMemcpyDeviceToHost); - expect( istat == cudaSuccess ); + hipMemcpyDeviceToHost); + expect( istat == hipSuccess ); #else memcpy( dest, src, nbytes ); #endif @@ -53,8 +53,8 @@ static inline void *myalloc( size_t nbytes ) { void *devPtr = nullptr; #ifdef USE_GPU - cudaError_t istat = cudaMalloc( &devPtr, nbytes ); - expect( istat == cudaSuccess ); + hipError_t istat = hipMalloc( &devPtr, nbytes ); + expect( istat == hipSuccess ); #else devPtr = malloc( nbytes ); #endif @@ -65,8 +65,8 @@ void *myalloc( size_t nbytes ) { static inline void myfree( void * devPtr ) { #ifdef USE_GPU - cudaError_t istat = cudaFree( devPtr); - expect( istat == cudaSuccess ); + hipError_t istat = hipFree( devPtr); + expect( istat == hipSuccess ); #else free( devPtr ); #endif @@ -269,42 +269,42 @@ T test_kronmult_pbatched( int const idim, // note the input Zarray will be over-written // -------------------------------------------- switch(idim) { - case 1: kronmult1_pbatched<<< batchCount, nthreads >>>( n, + case 1: hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult1_pbatched), dim3(batchCount), dim3(nthreads ), 0, 0, n, dAarray_, dpdZarray_, dpdYarray_, dpdWarray_, batchCount ); break; - case 2: kronmult2_pbatched<<< batchCount, nthreads >>>( n, + case 2: hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult2_pbatched), dim3(batchCount), dim3(nthreads ), 0, 0, n, dAarray_, dpdZarray_, dpdYarray_, dpdWarray_, batchCount ); break; - case 3: kronmult3_pbatched<<< batchCount, nthreads >>>( n, + case 3: hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult3_pbatched), dim3(batchCount), dim3(nthreads ), 0, 0, n, dAarray_, dpdZarray_, dpdYarray_, dpdWarray_, batchCount ); break; - case 4: kronmult4_pbatched<<< batchCount, nthreads >>>( n, + case 4: hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult4_pbatched), dim3(batchCount), dim3(nthreads ), 0, 0, n, dAarray_, dpdZarray_, dpdYarray_, dpdWarray_, batchCount ); break; - case 5: kronmult5_pbatched<<< batchCount, nthreads >>>( n, + case 5: hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult5_pbatched), dim3(batchCount), dim3(nthreads ), 0, 0, n, dAarray_, dpdZarray_, dpdYarray_, dpdWarray_, batchCount ); break; - case 6: kronmult6_pbatched<<< batchCount, nthreads >>>( n, + case 6: hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult6_pbatched), dim3(batchCount), dim3(nthreads ), 0, 0, n, dAarray_, dpdZarray_, dpdYarray_, @@ -318,8 +318,8 @@ T test_kronmult_pbatched( int const idim, // ------------------------------------------- // note important to wait for kernel to finish // ------------------------------------------- - cudaError_t istat = cudaDeviceSynchronize(); - expect( istat == cudaSuccess ); + hipError_t istat = hipDeviceSynchronize(); + expect( istat == hipSuccess ); } #else diff --git a/test_kronmult6_xbatched.cpp b/test_kronmult6_xbatched.cpp index 90a938a..7534b59 100644 --- a/test_kronmult6_xbatched.cpp +++ b/test_kronmult6_xbatched.cpp @@ -13,7 +13,7 @@ #ifdef USE_GPU -#include +#include #else #include #include @@ -24,11 +24,11 @@ static inline void host2gpu( void *dest, void *src, size_t nbytes ) { #ifdef USE_GPU - cudaError_t istat = cudaMemcpy( dest, - src, - nbytes, - cudaMemcpyHostToDevice ); - expect( istat == cudaSuccess ); + hipError_t istat = hipMemcpy( dest, + src, + nbytes, + hipMemcpyHostToDevice ); + expect( istat == hipSuccess ); #else memcpy( dest, src, nbytes ); #endif @@ -38,11 +38,11 @@ static inline void gpu2host( void *dest, void *src, size_t nbytes ) { #ifdef USE_GPU - cudaError_t istat = cudaMemcpy( dest, + hipError_t istat = hipMemcpy( dest, src, nbytes, - cudaMemcpyDeviceToHost); - expect( istat == cudaSuccess ); + hipMemcpyDeviceToHost); + expect( istat == hipSuccess ); #else memcpy( dest, src, nbytes ); #endif @@ -53,8 +53,8 @@ static inline void *myalloc( size_t nbytes ) { void *devPtr = nullptr; #ifdef USE_GPU - cudaError_t istat = cudaMalloc( &devPtr, nbytes ); - expect( istat == cudaSuccess ); + hipError_t istat = hipMalloc( &devPtr, nbytes ); + expect( istat == hipSuccess ); #else devPtr = malloc( nbytes ); #endif @@ -65,8 +65,8 @@ void *myalloc( size_t nbytes ) { static inline void myfree( void * devPtr ) { #ifdef USE_GPU - cudaError_t istat = cudaFree( devPtr); - expect( istat == cudaSuccess ); + hipError_t istat = hipFree( devPtr); + expect( istat == hipSuccess ); #else free( devPtr ); #endif @@ -302,42 +302,42 @@ T test_kronmult_xbatched( int const idim, // note the input Zarray will be over-written // -------------------------------------------- switch(idim) { - case 1: kronmult1_xbatched<<< batchCount, nthreads >>>( n, + case 1: hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult1_xbatched), dim3(batchCount), dim3(nthreads ), 0, 0, n, dAparray_, lda, dpdZarray_, dpdYarray_, dpdWarray_, batchCount ); break; - case 2: kronmult2_xbatched<<< batchCount, nthreads >>>( n, + case 2: hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult2_xbatched), dim3(batchCount), dim3(nthreads ), 0, 0, n, dAparray_, lda, dpdZarray_, dpdYarray_, dpdWarray_, batchCount ); break; - case 3: kronmult3_xbatched<<< batchCount, nthreads >>>( n, + case 3: hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult3_xbatched), dim3(batchCount), dim3(nthreads ), 0, 0, n, dAparray_, lda, dpdZarray_, dpdYarray_, dpdWarray_, batchCount ); break; - case 4: kronmult4_xbatched<<< batchCount, nthreads >>>( n, + case 4: hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult4_xbatched), dim3(batchCount), dim3(nthreads ), 0, 0, n, dAparray_, lda, dpdZarray_, dpdYarray_, dpdWarray_, batchCount ); break; - case 5: kronmult5_xbatched<<< batchCount, nthreads >>>( n, + case 5: hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult5_xbatched), dim3(batchCount), dim3(nthreads ), 0, 0, n, dAparray_, lda, dpdZarray_, dpdYarray_, dpdWarray_, batchCount ); break; - case 6: kronmult6_xbatched<<< batchCount, nthreads >>>( n, + case 6: hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult6_xbatched), dim3(batchCount), dim3(nthreads ), 0, 0, n, dAparray_, lda, dpdZarray_, dpdYarray_, @@ -351,8 +351,8 @@ T test_kronmult_xbatched( int const idim, // ------------------------------------------- // note important to wait for kernel to finish // ------------------------------------------- - cudaError_t istat = cudaDeviceSynchronize(); - expect( istat == cudaSuccess ); + hipError_t istat = hipDeviceSynchronize(); + expect( istat == hipSuccess ); } #else From 09d92ed8ea34de18049408f9c1024712942c826e Mon Sep 17 00:00:00 2001 From: Cole Kendrick Date: Tue, 27 Jul 2021 10:10:54 -0600 Subject: [PATCH 02/15] Start hipifying CMake --- CMakeLists.txt | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index e67f30c..0b4157b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -5,9 +5,21 @@ #------------------------------------------------------------------------------- project (kronmult LANGUAGES CXX) - option (USE_GPU "Use CUDA for gpu support" OFF) + option (USE_GPU "Use HIP for gpu support" OFF) if (USE_GPU) - enable_language (CUDA) + # search for HIP and libraries + if(NOT DEFINED HIP_PATH) + if(NOT DEFINED ENV{HIP_PATH}) + set(HIP_PATH "/opt/rocm/hip" CACHE PATH "Path to which HIP has been installed") + else() + set(HIP_PATH $ENV{HIP_PATH} CACHE PATH "Path to which HIP has been installed") + endif() + endif() + + # look for HIP cmake configs in different locations + list(APPEND CMAKE_MODULE_PATH "${HIP_PATH}/cmake" "${ROCM_PATH}") + find_package(HIP REQUIRED 4.0) + #enable_language (HIP) endif () #------------------------------------------------------------------------------- @@ -60,8 +72,8 @@ if (USE_GPU) target_compile_features (${target} PUBLIC cuda_std_14) get_target_property (SOURCES ${target} SOURCES) - set_source_files_properties (${SOURCES} PROPERTIES LANGUAGE CUDA) - set_target_properties (${target} PROPERTIES CUDA_ARCHITECTURES 60) + set_source_files_properties (${SOURCES} PROPERTIES LANGUAGE HIP) + set_target_properties (${target} PROPERTIES HIP_ARCHITECTURES 60) endif () endmacro () From 5b768ee9adacc2a4a87cb27cbb1fde42f248bbc6 Mon Sep 17 00:00:00 2001 From: Cole Kendrick Date: Wed, 28 Jul 2021 15:44:38 -0600 Subject: [PATCH 03/15] Modify target properties for hip --- CMakeLists.txt | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 0b4157b..e0cca2f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -16,10 +16,18 @@ endif() endif() + set(HIP_VERBOSE_BUILD ON CACHE STRING "Verbose compilation for HIP") + # look for HIP cmake configs in different locations list(APPEND CMAKE_MODULE_PATH "${HIP_PATH}/cmake" "${ROCM_PATH}") find_package(HIP REQUIRED 4.0) #enable_language (HIP) + + message(STATUS "HIP PLATFORM: ${HIP_PLATFORM}") + message(STATUS "HIP COMPILER: ${HIP_COMPILER}") + message(STATUS "HIP RUNTIME: ${HIP_RUNTIME}") + message(STATUS "HIP Includes: ${HIP_INCLUDE_DIRS}") + message(STATUS "HIP Libraries: ${HIP_LIBRARIES}") endif () #------------------------------------------------------------------------------- @@ -70,10 +78,12 @@ #------------------------------------------------------------------------------- macro (target_set_cuda target) if (USE_GPU) - target_compile_features (${target} PUBLIC cuda_std_14) + #target_compile_features (${target} PUBLIC cuda_std_14) get_target_property (SOURCES ${target} SOURCES) - set_source_files_properties (${SOURCES} PROPERTIES LANGUAGE HIP) - set_target_properties (${target} PROPERTIES HIP_ARCHITECTURES 60) + #set_source_files_properties (${SOURCES} PROPERTIES LANGUAGE HIP) + set_source_files_properties (${SOURCES} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) + set_target_properties (${target} PROPERTIES HIP_ARCHITECTURES 86) + set_target_properties (${target} PROPERTIES LINKER_LANGUAGE HIP) endif () endmacro () From 825d94886b62d503b9753a5d1ba9e505eabfeb89 Mon Sep 17 00:00:00 2001 From: Cole Kendrick Date: Mon, 2 Aug 2021 08:51:23 -0600 Subject: [PATCH 04/15] Workaround for hip compile lang with older cmake --- CMakeLists.txt | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index e0cca2f..9b4c566 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -20,6 +20,7 @@ # look for HIP cmake configs in different locations list(APPEND CMAKE_MODULE_PATH "${HIP_PATH}/cmake" "${ROCM_PATH}") + list(APPEND CMAKE_PREFIX_PATH "${HIP_PATH}/lib/cmake" "${ROCM_PATH}") find_package(HIP REQUIRED 4.0) #enable_language (HIP) @@ -84,6 +85,8 @@ set_source_files_properties (${SOURCES} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) set_target_properties (${target} PROPERTIES HIP_ARCHITECTURES 86) set_target_properties (${target} PROPERTIES LINKER_LANGUAGE HIP) + set_target_properties (${target} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) + target_include_directories(${target} PRIVATE ${HIP_INCLUDE_DIRS} ${HIP_PATH}/include) endif () endmacro () @@ -136,12 +139,13 @@ target_compile_options (kron PUBLIC $<$:-Wall -Wextra -Wpedantic> - $<$:--compiler-options -fPIC --keep-device-functions> + #$<$:--compiler-options -fPIC --keep-device-functions> + $<$:--compiler-options -fPIC --keep-device-functions> ) target_compile_definitions (kron PUBLIC - $<$:USE_GPU> + $<$:USE_GPU> ) if (OpenMP_CXX_FOUND) From 6ebd5e189973014cd6f503d68c10ee24ee491be8 Mon Sep 17 00:00:00 2001 From: Cole Kendrick Date: Mon, 2 Aug 2021 10:31:13 -0600 Subject: [PATCH 05/15] Add setting clang paths for amd platform --- CMakeLists.txt | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 9b4c566..9999d0b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -16,11 +16,31 @@ endif() endif() + # set HIP_CLANG_PATH for potential installs in non-standard locations (such as rocm with spack) + if (NOT DEFINED HIP_CLANG_PATH) + if(NOT DEFINED ENV{HIP_CLANG_PATH}) + set(HIP_CLANG_PATH "${ROCM_PATH}/llvm/bin" CACHE PATH "Path to HIP clang binaries") + else() + set(HIP_CLANG_PATH $ENV{HIP_CLANG_PATH} CACHE PATH "Path to HIP clang binaries") + endif() + endif() + + # note: could probably grab this path directly using hipconfig? + if (NOT DEFINED HIP_CLANG_INCLUDE_PATH) + if(NOT DEFINED ENV{HIP_CLANG_INCLUDE_PATH}) + # probably need a better way to get the compiler version.. this will cause non-existent p\aths for non-clang compilers + set(HIP_CLANG_INCLUDE_PATH "${HIP_CLANG_PATH}/../lib/clang/${CMAKE_CXX_COMPILER_VERSION}/include" CACHE PATH "Path to HIP clang include directory") + else() + set(HIP_CLANG_INCLUDE_PATH $ENV{HIP_CLANG_INCLUDE_PATH} CACHE PATH "Path to HIP clang include directory") + endif() + endif() + set(HIP_VERBOSE_BUILD ON CACHE STRING "Verbose compilation for HIP") # look for HIP cmake configs in different locations list(APPEND CMAKE_MODULE_PATH "${HIP_PATH}/cmake" "${ROCM_PATH}") list(APPEND CMAKE_PREFIX_PATH "${HIP_PATH}/lib/cmake" "${ROCM_PATH}") + #set(CMAKE_HIP_ARCHITECTURES OFF) find_package(HIP REQUIRED 4.0) #enable_language (HIP) @@ -86,6 +106,7 @@ set_target_properties (${target} PROPERTIES HIP_ARCHITECTURES 86) set_target_properties (${target} PROPERTIES LINKER_LANGUAGE HIP) set_target_properties (${target} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) + # HIP_INCLUDE_DIRS is not getting set for nvidia platforms target_include_directories(${target} PRIVATE ${HIP_INCLUDE_DIRS} ${HIP_PATH}/include) endif () endmacro () From 40ee876724f24ca63dd8b67b7b8fe61654637979 Mon Sep 17 00:00:00 2001 From: Cole Kendrick Date: Tue, 3 Aug 2021 11:26:42 -0600 Subject: [PATCH 06/15] Start adding platform detection --- CMakeLists.txt | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 9999d0b..0a4bdb8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -35,11 +35,32 @@ endif() endif() + # try to find hipconfig executable which can help detect platforms and include dirs + find_program(HIPCONFIG_PATH hipconfig HINTS "${HIP_PATH}/bin") + if(HIPCONFIG_PATH) + execute_process(COMMAND ${HIPCONFIG_PATH} --platform OUTPUT_VARIABLE KRONMULT_HIP_PLATFORM) + elseif(DEFINED ENV{HIP_PLATFORM}) + set(KRONMULT_HIP_PLATFORM "$ENV{HIP_PLATFORM}") + else() + message(FATAL_ERROR "Could not determine HIP platform, make sure HIP_PLATFORM is set") + endif() + + message(STATUS "HIP platform has been detected as ${KRONMULT_HIP_PLATFORM}") + # hip >= 4.2 is now using "amd" to identify platform + if(KRONMULT_HIP_PLATFORM STREQUAL "hcc" OR KRONMULT_HIP_PLATFORM STREQUAL "amd") + set(KRONMULT_PLATFORM_AMD 1) + # hip <= 4.1 uses "nvcc" to identify nvidia platforms, >= 4.2 uses "nvidia" + elseif(KRONMULT_HIP_PLATFORM STREQUAL "nvcc" OR KRONMULT_HIP_PLATFORM STREQUAL "nvidia") + set(KRONMULT_PLATFORM_NVCC 1) + endif() + set(HIP_VERBOSE_BUILD ON CACHE STRING "Verbose compilation for HIP") # look for HIP cmake configs in different locations list(APPEND CMAKE_MODULE_PATH "${HIP_PATH}/cmake" "${ROCM_PATH}") + list(APPEND CMAKE_PREFIX_PATH "${HIP_PATH}/cmake" "${ROCM_PATH}") list(APPEND CMAKE_PREFIX_PATH "${HIP_PATH}/lib/cmake" "${ROCM_PATH}") + list(APPEND CMAKE_MODULE_PATH "${HIP_PATH}/lib/cmake" "${ROCM_PATH}") #set(CMAKE_HIP_ARCHITECTURES OFF) find_package(HIP REQUIRED 4.0) #enable_language (HIP) @@ -161,7 +182,7 @@ PUBLIC $<$:-Wall -Wextra -Wpedantic> #$<$:--compiler-options -fPIC --keep-device-functions> - $<$:--compiler-options -fPIC --keep-device-functions> + #$<$:--compiler-options -fPIC --keep-device-functions> ) target_compile_definitions (kron From 07c6d0b40805c2573be27cceeca1e823b30b9fb7 Mon Sep 17 00:00:00 2001 From: Cole Kendrick Date: Wed, 4 Aug 2021 13:20:11 -0600 Subject: [PATCH 07/15] Add hip compile definitions based on platform --- CMakeLists.txt | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 0a4bdb8..97c134a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,6 +7,9 @@ option (USE_GPU "Use HIP for gpu support" OFF) if (USE_GPU) + set(KRONMULT_PLATFORM_AMD 0) + set(KRONMULT_PLATFORM_NVCC 0) + # search for HIP and libraries if(NOT DEFINED HIP_PATH) if(NOT DEFINED ENV{HIP_PATH}) @@ -70,6 +73,18 @@ message(STATUS "HIP RUNTIME: ${HIP_RUNTIME}") message(STATUS "HIP Includes: ${HIP_INCLUDE_DIRS}") message(STATUS "HIP Libraries: ${HIP_LIBRARIES}") + + if(KRONMULT_PLATFORM_NVCC) + find_package(CUDA 9.0 REQUIRED) + include_directories(${CUDA_INCLUDE_DIRS}) + enable_language(CUDA) + set (CMAKE_CUDA_STANDARD 14) + set (CMAKE_CUDA_STANDARD_REQUIRED ON) + add_compile_definitions(__HIP_PLATFORM_NVCC__ __HIP_PLATFORM_NVIDIA__) + else() + add_compile_definitions(__HIP_PLATFORM_HCC__ __HIP_PLATFORM_AMD__) + endif() + endif () #------------------------------------------------------------------------------- From eac800945dedfdf08a991d651d54388b2f138d34 Mon Sep 17 00:00:00 2001 From: Cole Kendrick Date: Thu, 5 Aug 2021 14:03:48 -0600 Subject: [PATCH 08/15] Add compile flags for each platform --- CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 97c134a..6cea17f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -196,8 +196,8 @@ target_compile_options (kron PUBLIC $<$:-Wall -Wextra -Wpedantic> - #$<$:--compiler-options -fPIC --keep-device-functions> - #$<$:--compiler-options -fPIC --keep-device-functions> + $<$:--compiler-options -fPIC --keep-device-functions> + $<$:--offload-arch=gfx906> ) target_compile_definitions (kron From b78906f697a2f973bfa0fcee6e6cebd68cd0b57e Mon Sep 17 00:00:00 2001 From: Cole Kendrick Date: Fri, 6 Aug 2021 10:16:27 -0600 Subject: [PATCH 09/15] Use cuda options for nvidia platforms --- CMakeLists.txt | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6cea17f..a19962e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -31,7 +31,7 @@ # note: could probably grab this path directly using hipconfig? if (NOT DEFINED HIP_CLANG_INCLUDE_PATH) if(NOT DEFINED ENV{HIP_CLANG_INCLUDE_PATH}) - # probably need a better way to get the compiler version.. this will cause non-existent p\aths for non-clang compilers + # probably need a better way to get the compiler version.. this will cause non-existent paths for non-clang compilers set(HIP_CLANG_INCLUDE_PATH "${HIP_CLANG_PATH}/../lib/clang/${CMAKE_CXX_COMPILER_VERSION}/include" CACHE PATH "Path to HIP clang include directory") else() set(HIP_CLANG_INCLUDE_PATH $ENV{HIP_CLANG_INCLUDE_PATH} CACHE PATH "Path to HIP clang include directory") @@ -60,13 +60,11 @@ set(HIP_VERBOSE_BUILD ON CACHE STRING "Verbose compilation for HIP") # look for HIP cmake configs in different locations - list(APPEND CMAKE_MODULE_PATH "${HIP_PATH}/cmake" "${ROCM_PATH}") - list(APPEND CMAKE_PREFIX_PATH "${HIP_PATH}/cmake" "${ROCM_PATH}") - list(APPEND CMAKE_PREFIX_PATH "${HIP_PATH}/lib/cmake" "${ROCM_PATH}") - list(APPEND CMAKE_MODULE_PATH "${HIP_PATH}/lib/cmake" "${ROCM_PATH}") - #set(CMAKE_HIP_ARCHITECTURES OFF) + list(APPEND CMAKE_MODULE_PATH "${HIP_PATH}/cmake" "${HIP_PATH}/lib/cmake" "${ROCM_PATH}") + list(APPEND CMAKE_PREFIX_PATH "${HIP_PATH}/cmake" "${HIP_PATH}/lib/cmake" "${ROCM_PATH}") + #set(CMAKE_HIP_ARCHITECTURES OFF) # enable if using CMake 3.21 find_package(HIP REQUIRED 4.0) - #enable_language (HIP) + #enable_language (HIP) # enable if using CMake 3.21 message(STATUS "HIP PLATFORM: ${HIP_PLATFORM}") message(STATUS "HIP COMPILER: ${HIP_COMPILER}") @@ -135,13 +133,20 @@ #------------------------------------------------------------------------------- macro (target_set_cuda target) if (USE_GPU) - #target_compile_features (${target} PUBLIC cuda_std_14) get_target_property (SOURCES ${target} SOURCES) - #set_source_files_properties (${SOURCES} PROPERTIES LANGUAGE HIP) - set_source_files_properties (${SOURCES} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) - set_target_properties (${target} PROPERTIES HIP_ARCHITECTURES 86) - set_target_properties (${target} PROPERTIES LINKER_LANGUAGE HIP) + #set_source_files_properties (${SOURCES} PROPERTIES LANGUAGE HIP) # enable if using CMake 3.21 + + if(KRONMULT_PLATFORM_NVCC) + target_compile_features (${target} PUBLIC cuda_std_14) + set_source_files_properties (${SOURCES} PROPERTIES LANGUAGE CUDA) + set_target_properties (${target} PROPERTIES CUDA_ARCHITECTURES 86) + else() + set_source_files_properties (${SOURCES} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) + set_target_properties (${target} PROPERTIES HIP_ARCHITECTURES 86) + set_target_properties (${target} PROPERTIES LINKER_LANGUAGE HIP) + endif() set_target_properties (${target} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) + # HIP_INCLUDE_DIRS is not getting set for nvidia platforms target_include_directories(${target} PRIVATE ${HIP_INCLUDE_DIRS} ${HIP_PATH}/include) endif () From e6beda85a8b5f9a0e44a88f0c6284712e6f034ba Mon Sep 17 00:00:00 2001 From: Cole Kendrick Date: Wed, 11 Aug 2021 15:45:07 -0600 Subject: [PATCH 10/15] Add cmake option for gpu arch --- CMakeLists.txt | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a19962e..d2ab938 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,6 +6,7 @@ project (kronmult LANGUAGES CXX) option (USE_GPU "Use HIP for gpu support" OFF) + set(GPU_ARCH "86" CACHE STRING "GPU architecture code for AMD/NVIDIA") if (USE_GPU) set(KRONMULT_PLATFORM_AMD 0) set(KRONMULT_PLATFORM_NVCC 0) @@ -139,10 +140,10 @@ if(KRONMULT_PLATFORM_NVCC) target_compile_features (${target} PUBLIC cuda_std_14) set_source_files_properties (${SOURCES} PROPERTIES LANGUAGE CUDA) - set_target_properties (${target} PROPERTIES CUDA_ARCHITECTURES 86) + set_target_properties (${target} PROPERTIES CUDA_ARCHITECTURES ${GPU_ARCH}) else() set_source_files_properties (${SOURCES} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) - set_target_properties (${target} PROPERTIES HIP_ARCHITECTURES 86) + set_target_properties (${target} PROPERTIES HIP_ARCHITECTURES ${GPU_ARCH}) set_target_properties (${target} PROPERTIES LINKER_LANGUAGE HIP) endif() set_target_properties (${target} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) @@ -202,7 +203,7 @@ PUBLIC $<$:-Wall -Wextra -Wpedantic> $<$:--compiler-options -fPIC --keep-device-functions> - $<$:--offload-arch=gfx906> + $<$:--offload-arch=${GPU_ARCH}> ) target_compile_definitions (kron From 398dba066ca2d7c38cec7d7937481ad4e29069e8 Mon Sep 17 00:00:00 2001 From: Cole Kendrick Date: Thu, 19 Aug 2021 11:21:36 -0600 Subject: [PATCH 11/15] Adjust hip include directories to suppress warnings --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d2ab938..236fe20 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -149,7 +149,7 @@ set_target_properties (${target} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) # HIP_INCLUDE_DIRS is not getting set for nvidia platforms - target_include_directories(${target} PRIVATE ${HIP_INCLUDE_DIRS} ${HIP_PATH}/include) + target_include_directories(${target} SYSTEM PRIVATE ${HIP_INCLUDE_DIRS} ${HIP_PATH}/include) endif () endmacro () From 433766c8d42f0d11d907d810297a17fb1a6eb0aa Mon Sep 17 00:00:00 2001 From: Cole Kendrick Date: Tue, 31 Aug 2021 10:19:54 -0600 Subject: [PATCH 12/15] Fix gpu flag for non-gpu and nvidia builds --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 236fe20..475bd6a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -208,7 +208,7 @@ target_compile_definitions (kron PUBLIC - $<$:USE_GPU> + $<$:USE_GPU> ) if (OpenMP_CXX_FOUND) From 2d746ee0f0e155443845e6adad4e14c327ffc00a Mon Sep 17 00:00:00 2001 From: Cole Kendrick Date: Wed, 1 Sep 2021 13:42:29 -0600 Subject: [PATCH 13/15] Switch to using cmake 3.21 --- CMakeLists.txt | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 475bd6a..dab78a7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,4 +1,4 @@ - cmake_minimum_required (VERSION 3.17 FATAL_ERROR) + cmake_minimum_required (VERSION 3.21 FATAL_ERROR) #------------------------------------------------------------------------------- # Setup languages to use. Only enable CUDA if GPU's are in use. @@ -63,9 +63,12 @@ # look for HIP cmake configs in different locations list(APPEND CMAKE_MODULE_PATH "${HIP_PATH}/cmake" "${HIP_PATH}/lib/cmake" "${ROCM_PATH}") list(APPEND CMAKE_PREFIX_PATH "${HIP_PATH}/cmake" "${HIP_PATH}/lib/cmake" "${ROCM_PATH}") - #set(CMAKE_HIP_ARCHITECTURES OFF) # enable if using CMake 3.21 + set(CMAKE_HIP_ARCHITECTURES OFF) # enable if using CMake 3.21 find_package(HIP REQUIRED 4.0) - #enable_language (HIP) # enable if using CMake 3.21 + if(KRONMULT_PLATFORM_AMD) + enable_language (HIP) # enable if using CMake 3.21 + set_target_properties(hip-lang::device PROPERTIES INTERFACE_INCLUDE_DIRECTORIES ${HIP_PATH}/include) + endif() message(STATUS "HIP PLATFORM: ${HIP_PLATFORM}") message(STATUS "HIP COMPILER: ${HIP_COMPILER}") @@ -135,7 +138,7 @@ macro (target_set_cuda target) if (USE_GPU) get_target_property (SOURCES ${target} SOURCES) - #set_source_files_properties (${SOURCES} PROPERTIES LANGUAGE HIP) # enable if using CMake 3.21 + set_source_files_properties (${SOURCES} PROPERTIES LANGUAGE HIP) # enable if using CMake 3.21 if(KRONMULT_PLATFORM_NVCC) target_compile_features (${target} PUBLIC cuda_std_14) @@ -145,6 +148,7 @@ set_source_files_properties (${SOURCES} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) set_target_properties (${target} PROPERTIES HIP_ARCHITECTURES ${GPU_ARCH}) set_target_properties (${target} PROPERTIES LINKER_LANGUAGE HIP) + #target_link_libraries(${target} PUBLIC hip-lang::device) endif() set_target_properties (${target} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) From 9a8d70fcc4b8d945613aa66bef62f543da1f389a Mon Sep 17 00:00:00 2001 From: Cole Kendrick Date: Tue, 2 Nov 2021 12:10:46 -0400 Subject: [PATCH 14/15] Rename leftover cudas and update amd offload arch for asgard --- CMakeLists.txt | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index dab78a7..e4f8bac 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,12 +1,12 @@ cmake_minimum_required (VERSION 3.21 FATAL_ERROR) #------------------------------------------------------------------------------- -# Setup languages to use. Only enable CUDA if GPU's are in use. +# Setup languages to use. Only enable HIP if GPU's are in use. #------------------------------------------------------------------------------- project (kronmult LANGUAGES CXX) option (USE_GPU "Use HIP for gpu support" OFF) - set(GPU_ARCH "86" CACHE STRING "GPU architecture code for AMD/NVIDIA") + set(GPU_ARCH "70" CACHE STRING "GPU architecture code for AMD/NVIDIA") if (USE_GPU) set(KRONMULT_PLATFORM_AMD 0) set(KRONMULT_PLATFORM_NVCC 0) @@ -63,10 +63,10 @@ # look for HIP cmake configs in different locations list(APPEND CMAKE_MODULE_PATH "${HIP_PATH}/cmake" "${HIP_PATH}/lib/cmake" "${ROCM_PATH}") list(APPEND CMAKE_PREFIX_PATH "${HIP_PATH}/cmake" "${HIP_PATH}/lib/cmake" "${ROCM_PATH}") - set(CMAKE_HIP_ARCHITECTURES OFF) # enable if using CMake 3.21 + set(CMAKE_HIP_ARCHITECTURES OFF) find_package(HIP REQUIRED 4.0) if(KRONMULT_PLATFORM_AMD) - enable_language (HIP) # enable if using CMake 3.21 + enable_language (HIP) set_target_properties(hip-lang::device PROPERTIES INTERFACE_INCLUDE_DIRECTORIES ${HIP_PATH}/include) endif() @@ -104,7 +104,7 @@ # Sanitizer options #------------------------------------------------------------------------------- set (CMAKE_CXX_FLAGS_SANITIZED -fno-omit-frame-pointer) - set (CMAKE_CUDA_FLAGS_SANITIZED -fno-omit-frame-pointer) + set (CMAKE_HIP_FLAGS_SANITIZED -fno-omit-frame-pointer) set (CMAKE_EXE_LINKER_FLAGS_SANITIZED "") set (CMAKE_SHARED_LINKER_FLAGS_SANITIZED "") @@ -115,7 +115,7 @@ if (${SANITIZE_${upper_name}}) set (CMAKE_CXX_FLAGS_SANITIZED "${CMAKE_CXX_FLAGS_SANITIZED} -fsanitize=${name}") - set (CMAKE_CUDA_FLAGS_SANITIZED "${CMAKE_CUDA_FLAGS_SANITIZED} -fsanitize=${name}") + set (CMAKE_HIP_FLAGS_SANITIZED "${CMAKE_HIP_FLAGS_SANITIZED} -fsanitize=${name}") set (CMAKE_EXE_LINKER_FLAGS_SANITIZED "${CMAKE_EXE_LINKER_FLAGS_SANITIZED} -fsanitize=${name}") set (CMAKE_SHARED_LINKER_FLAGS_SANITIZED "${CMAKE_SHARED_LINKER_FLAGS_SANITIZED} -fsanitize=${name}") endif () @@ -128,17 +128,17 @@ register_sanitizer_option (undefined ON) #------------------------------------------------------------------------------- -# Define a macro function to set a targets source files to the CUDA language. +# Define a macro function to set a targets source files to the HIP language. # # Can't use $<$:foo> generator expressions for # target_compile_features # # Need to handle this with an explicit if statement. #------------------------------------------------------------------------------- - macro (target_set_cuda target) + macro (target_set_hip target) if (USE_GPU) get_target_property (SOURCES ${target} SOURCES) - set_source_files_properties (${SOURCES} PROPERTIES LANGUAGE HIP) # enable if using CMake 3.21 + set_source_files_properties (${SOURCES} PROPERTIES LANGUAGE HIP) if(KRONMULT_PLATFORM_NVCC) target_compile_features (${target} PUBLIC cuda_std_14) @@ -146,7 +146,7 @@ set_target_properties (${target} PROPERTIES CUDA_ARCHITECTURES ${GPU_ARCH}) else() set_source_files_properties (${SOURCES} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) - set_target_properties (${target} PROPERTIES HIP_ARCHITECTURES ${GPU_ARCH}) + set_target_properties (${target} PROPERTIES HIP_ARCHITECTURES gfx${GPU_ARCH}) set_target_properties (${target} PROPERTIES LINKER_LANGUAGE HIP) #target_link_libraries(${target} PUBLIC hip-lang::device) endif() @@ -168,7 +168,7 @@ add_library (kron SHARED) # Note can't use generator expressions on the source files since it interfers -# with setting the source property to target CUDA. +# with setting the source property to target HIP. target_sources (kron PRIVATE @@ -201,13 +201,13 @@ ) target_compile_features (kron PUBLIC cxx_std_17) - target_set_cuda (kron) + target_set_hip (kron) target_compile_options (kron PUBLIC $<$:-Wall -Wextra -Wpedantic> $<$:--compiler-options -fPIC --keep-device-functions> - $<$:--offload-arch=${GPU_ARCH}> + $<$:--offload-arch=gfx${GPU_ARCH}> ) target_compile_definitions (kron @@ -233,7 +233,7 @@ PRIVATE ${source} ) - target_set_cuda (${target}) + target_set_hip (${target}) target_link_libraries (${target} PUBLIC kron) add_test (NAME ${target} From 3ffccd75c9a18a91e5d6082af1147267cd643ec8 Mon Sep 17 00:00:00 2001 From: Cole Kendrick Date: Thu, 18 Nov 2021 18:29:45 -0500 Subject: [PATCH 15/15] Remove find_package for cuda --- CMakeLists.txt | 2 -- 1 file changed, 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index e4f8bac..4fee9bb 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -77,8 +77,6 @@ message(STATUS "HIP Libraries: ${HIP_LIBRARIES}") if(KRONMULT_PLATFORM_NVCC) - find_package(CUDA 9.0 REQUIRED) - include_directories(${CUDA_INCLUDE_DIRS}) enable_language(CUDA) set (CMAKE_CUDA_STANDARD 14) set (CMAKE_CUDA_STANDARD_REQUIRED ON)