From adbf971fe5ccc0bb1fce9971523f1af5185f5d55 Mon Sep 17 00:00:00 2001 From: Stephan Hageboeck Date: Fri, 3 Feb 2023 11:18:40 +0100 Subject: [PATCH 1/4] [cuda/gg_ttgg] Simplify bridge code in gg_ttgg.mad std::copy implementations are supposed to use memmove where possible (depending on the template parameters). Therefore, a manual check of the copied types is unnecessary. When fortran type and C++ type are identical, std::copy automatically decays to memcpy. --- .../cudacpp/gg_ttgg.mad/SubProcesses/Bridge.h | 61 ++++--------------- 1 file changed, 13 insertions(+), 48 deletions(-) diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/Bridge.h index 60eb101a6a..3f48ce1001 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/Bridge.h @@ -315,18 +315,10 @@ namespace mg5amcCpu //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } - if constexpr( std::is_same_v ) - { - memcpy( m_hstGs.data(), gs, m_nevt * sizeof( FORTRANFPTYPE ) ); - memcpy( m_hstRndHel.data(), rndhel, m_nevt * sizeof( FORTRANFPTYPE ) ); - memcpy( m_hstRndCol.data(), rndcol, m_nevt * sizeof( FORTRANFPTYPE ) ); - } - else - { - std::copy( gs, gs + m_nevt, m_hstGs.data() ); - std::copy( rndhel, rndhel + m_nevt, m_hstRndHel.data() ); - std::copy( rndcol, rndcol + m_nevt, m_hstRndCol.data() ); - } + std::copy( gs, gs + m_nevt, m_hstGs.data() ); + std::copy( rndhel, rndhel + m_nevt, m_hstRndHel.data() ); + std::copy( rndcol, rndcol + m_nevt, m_hstRndCol.data() ); + copyDeviceFromHost( m_devGs, m_hstGs ); copyDeviceFromHost( m_devRndHel, m_hstRndHel ); copyDeviceFromHost( m_devRndCol, m_hstRndCol ); @@ -341,18 +333,9 @@ namespace mg5amcCpu flagAbnormalMEs( m_hstMEs.data(), m_nevt ); copyHostFromDevice( m_hstSelHel, m_devSelHel ); 
copyHostFromDevice( m_hstSelCol, m_devSelCol ); - if constexpr( std::is_same_v ) - { - memcpy( mes, m_hstMEs.data(), m_hstMEs.bytes() ); - memcpy( selhel, m_hstSelHel.data(), m_hstSelHel.bytes() ); - memcpy( selcol, m_hstSelCol.data(), m_hstSelCol.bytes() ); - } - else - { - std::copy( m_hstMEs.data(), m_hstMEs.data() + m_nevt, mes ); - std::copy( m_hstSelHel.data(), m_hstSelHel.data() + m_nevt, selhel ); - std::copy( m_hstSelCol.data(), m_hstSelCol.data() + m_nevt, selcol ); - } + std::copy( m_hstMEs.data(), m_hstMEs.data() + m_nevt, mes ); + std::copy( m_hstSelHel.data(), m_hstSelHel.data() + m_nevt, selhel ); + std::copy( m_hstSelCol.data(), m_hstSelCol.data() + m_nevt, selcol ); } #endif @@ -369,18 +352,9 @@ namespace mg5amcCpu const bool goodHelOnly ) { hst_transposeMomentaF2C( momenta, m_hstMomentaC.data(), m_nevt ); - if constexpr( std::is_same_v ) - { - memcpy( m_hstGs.data(), gs, m_nevt * sizeof( FORTRANFPTYPE ) ); - memcpy( m_hstRndHel.data(), rndhel, m_nevt * sizeof( FORTRANFPTYPE ) ); - memcpy( m_hstRndCol.data(), rndcol, m_nevt * sizeof( FORTRANFPTYPE ) ); - } - else - { - std::copy( gs, gs + m_nevt, m_hstGs.data() ); - std::copy( rndhel, rndhel + m_nevt, m_hstRndHel.data() ); - std::copy( rndcol, rndcol + m_nevt, m_hstRndCol.data() ); - } + std::copy( gs, gs + m_nevt, m_hstGs.data() ); + std::copy( rndhel, rndhel + m_nevt, m_hstRndHel.data() ); + std::copy( rndcol, rndcol + m_nevt, m_hstRndCol.data() ); if( m_nGoodHel < 0 ) { m_nGoodHel = m_pmek->computeGoodHelicities(); @@ -389,18 +363,9 @@ namespace mg5amcCpu if( goodHelOnly ) return; m_pmek->computeMatrixElements( channelId ); flagAbnormalMEs( m_hstMEs.data(), m_nevt ); - if constexpr( std::is_same_v ) - { - memcpy( mes, m_hstMEs.data(), m_hstMEs.bytes() ); - memcpy( selhel, m_hstSelHel.data(), m_hstSelHel.bytes() ); - memcpy( selcol, m_hstSelCol.data(), m_hstSelCol.bytes() ); - } - else - { - std::copy( m_hstMEs.data(), m_hstMEs.data() + m_nevt, mes ); - std::copy( m_hstSelHel.data(), 
m_hstSelHel.data() + m_nevt, selhel ); - std::copy( m_hstSelCol.data(), m_hstSelCol.data() + m_nevt, selcol ); - } + std::copy( m_hstMEs.data(), m_hstMEs.data() + m_nevt, mes ); + std::copy( m_hstSelHel.data(), m_hstSelHel.data() + m_nevt, selhel ); + std::copy( m_hstSelCol.data(), m_hstSelCol.data() + m_nevt, selcol ); } #endif From 56d61292466ba9a18e89d9a7ab78aa327bf9d098 Mon Sep 17 00:00:00 2001 From: Stephan Hageboeck Date: Wed, 8 Feb 2023 16:03:04 +0100 Subject: [PATCH 2/4] [cuda/gg_ttgg] Add computation of max event weight to fortran bridge. Add kernels and bridge code to compute event weights on GPU. Using the weights of Jacobians and PDF from Fortran, the GPU can compute the total event weight in device memory. A second kernel computes the maximum of each batch, and returns this to the host. --- .../cudacpp/gg_ttgg.mad/SubProcesses/Bridge.h | 60 +++++++++++++++++++ 1 file changed, 60 insertions(+) diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/Bridge.h index 3f48ce1001..e0aef783ae 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/Bridge.h @@ -47,6 +47,41 @@ namespace mg5amcCpu virtual ~CppObjectInFortran() {} }; +#ifdef __CUDACC__ + template + __global__ void computeFinalWeightKernel(T const * matrixElmSq, U * weights, const std::size_t n) { + for( unsigned int i = threadIdx.x + blockIdx.x * blockDim.x; i < n; i += blockDim.x * gridDim.x) { + weights[i] = weights[i] * matrixElmSq[i]; + } + } + + template + __device__ T max(T a, T b) { + return a < b ? 
b : a; + } + + template + __global__ void computeMaxWeightKernel(T * weights, const std::size_t n) { + T maxWeight = 0; + __shared__ T sharedMaxWeight[1024]; + + for( unsigned int i = threadIdx.x; blockIdx.x == 0 && i < n; i += blockDim.x ) + { + maxWeight = max(maxWeight, weights[i]); + } + sharedMaxWeight[threadIdx.x] = maxWeight; + + __syncthreads(); + + if (threadIdx.x == 0) { + for( unsigned int i = 0; i < blockDim.x; ++i) { + maxWeight = max( maxWeight, sharedMaxWeight[i] ); + } + weights[0] = maxWeight; + } + } +#endif + //-------------------------------------------------------------------------- /** * A templated class for calling the CUDA/C++ matrix element calculations of the event generation workflow. @@ -122,6 +157,14 @@ namespace mg5amcCpu int* selhel, int* selcol, const bool goodHelOnly = false ); + + /** + * Compute the maximum event weight on the device. + * + * @param jacobianWeights Jacobian and PDF weights from madevent + */ + FORTRANFPTYPE computeMaxWeight( const FORTRANFPTYPE* jacobianWeights ); + #else /** * Sequence to be executed for the vectorized CPU matrix element calculation @@ -168,6 +211,7 @@ namespace mg5amcCpu DeviceBufferMatrixElements m_devMEs; DeviceBufferSelectedHelicity m_devSelHel; DeviceBufferSelectedColor m_devSelCol; + std::unique_ptr> m_devJacobianWeights = nullptr; PinnedHostBufferGs m_hstGs; PinnedHostBufferRndNumHelicity m_hstRndHel; PinnedHostBufferRndNumColor m_hstRndCol; @@ -337,6 +381,22 @@ namespace mg5amcCpu std::copy( m_hstSelHel.data(), m_hstSelHel.data() + m_nevt, selhel ); std::copy( m_hstSelCol.data(), m_hstSelCol.data() + m_nevt, selcol ); } + + template + FORTRANFPTYPE Bridge::computeMaxWeight( const FORTRANFPTYPE* jacobianWeights ) + { + if( !m_devJacobianWeights ) m_devJacobianWeights = std::make_unique>( m_nevt ); + checkCuda( cudaMemcpy(m_devJacobianWeights->data(), jacobianWeights, m_nevt*sizeof(FORTRANFPTYPE), cudaMemcpyDefault) ); + + computeFinalWeightKernel<<>>( m_devMEs.data(), 
m_devJacobianWeights->data(), m_nevt ); computeMaxWeightKernel<<>>( m_devJacobianWeights->data(), m_nevt ); + + FORTRANFPTYPE maxWeight = 0.; + checkCuda( cudaMemcpy( &maxWeight, m_devJacobianWeights->data(), sizeof( FORTRANFPTYPE ), cudaMemcpyDefault ) ); + + return maxWeight; + } + #endif #ifndef MGONGPUCPP_GPUIMPL From 2181d6c3bb701720199429a23c3e5d5d7195e2a5 Mon Sep 17 00:00:00 2001 From: Stephan Hageboeck Date: Wed, 8 Feb 2023 15:57:15 +0100 Subject: [PATCH 3/4] [cuda/gg_ttgg] Add a first version for max weight computation to bridge. - For each batch, compute the maximum event weight on the GPU - Transfer this into a common block for the unweighting steps - This allows for rejecting events a lot earlier (instead of writing them to tmp) --- .../gg_tt.mad/SubProcesses/fbridge.inc | 14 +++++++++++ .../SubProcesses/P1_gg_ttxgg/auto_dsig1.f | 25 +++++++++++++++++-- .../gg_ttgg.mad/SubProcesses/fbridge.cc | 13 ++++++++++ 3 files changed, 50 insertions(+), 2 deletions(-) diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/fbridge.inc b/epochX/cudacpp/gg_tt.mad/SubProcesses/fbridge.inc index a28622cdb6..6a4a82004c 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/fbridge.inc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/fbridge.inc @@ -85,6 +85,20 @@ C END SUBROUTINE FBRIDGESEQUENCE_NOMULTICHANNEL END INTERFACE +C +C Compute the maximum event weight of the batch on the GPU +C - PBRIDGE: the memory address of the C++ Bridge +C - ALL_WGT: the Jacobian+PDF weights from Fortran +C - MAX_WGT: maximum observed weight on GPU (output) +C + INTERFACE + SUBROUTINE FBRIDGECOMPUTEMAXWEIGHT(PBRIDGE, ALL_WGT, MAX_WGT) + INTEGER*8 PBRIDGE + DOUBLE PRECISION ALL_WGT(*) + DOUBLE PRECISION MAX_WGT + END SUBROUTINE FBRIDGECOMPUTEMAXWEIGHT + END INTERFACE + C C Retrieve the number of good helicities for helicity filtering in the Bridge. 
C - PBRIDGE: the memory address of the C++ Bridge diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/auto_dsig1.f b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/auto_dsig1.f index ddc480ec63..a6987f3944 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/auto_dsig1.f +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/auto_dsig1.f @@ -251,6 +251,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C DOUBLE PRECISION ALL_PP(0:3,NEXTERNAL,VECSIZE_MEMMAX) DOUBLE PRECISION ALL_WGT(VECSIZE_MEMMAX) + DOUBLE PRECISION ALL_JACANDPDF(VECSIZE_MEMMAX) DOUBLE PRECISION ALL_XBK(2,VECSIZE_MEMMAX) DOUBLE PRECISION ALL_Q2FACT(2,VECSIZE_MEMMAX) DOUBLE PRECISION ALL_CM_RAP(VECSIZE_MEMMAX) @@ -397,8 +398,15 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, CALL RANMAR(HEL_RAND(IVEC)) CALL RANMAR(COL_RAND(IVEC)) ENDDO + + ! Prepare computation of max weight on GPU + ! This requires the jacobian and PDF weights + DO IVEC=1,VECSIZE_USED + ALL_JACANDPDF(IVEC) = ALL_WGT(IVEC)*ALL_PD(0,IVEC) + ENDDO + CALL SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL, - $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED, ALL_JACANDPDF) DO IVEC=1,VECSIZE_USED @@ -465,7 +473,7 @@ SUBROUTINE PRINT_ZERO_AMP1() SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL, - $ OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED, ALL_WGT) USE OMP_LIB IMPLICIT NONE @@ -482,6 +490,8 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL, INTEGER SELECTED_HEL(VECSIZE_MEMMAX) INTEGER SELECTED_COL(VECSIZE_MEMMAX) INTEGER VECSIZE_USED + DOUBLE PRECISION ALL_WGT(VECSIZE_MEMMAX) + INTEGER IVEC INTEGER IEXT @@ -506,6 +516,11 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL, DOUBLE PRECISION CBYF1 INTEGER*4 NGOODHEL, NTOTHEL +C Pull in twgt to allow for more efficient unweighting + double precision twgt, 
maxwgt,swgt(maxevents) + integer lun, nw, itmin + common/to_unwgt/twgt, maxwgt, swgt, lun, nw, itmin + INTEGER*4 NWARNINGS SAVE NWARNINGS DATA NWARNINGS/0/ @@ -576,6 +591,12 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL, & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ENDIF call counters_smatrix1multi_stop( 0 ) ! cudacppMEs=0 + + ! Compute the maximum weight of this batch. It needs to be + ! rescaled by CONV=389379660d0 to be compatible with the scale + ! used in the common block to_unwgt + CALL FBRIDGECOMPUTEMAXWEIGHT(FBRIDGE_PBRIDGE, ALL_WGT, twgt) + twgt = twgt * 389379660d0 ENDIF IF( FBRIDGE_MODE .LT. 0 ) THEN ! (BothQuiet=-1 or BothDebug=-2) diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/fbridge.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/fbridge.cc index 99efcb1dbe..8da8dbd2ab 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/fbridge.cc @@ -110,6 +110,19 @@ extern "C" #endif } + void fbridgecomputemaxweight_( CppObjectInFortran** ppbridge, + const FORTRANFPTYPE* jacobianWeights, + double * maxWeight) + { +#ifdef __CUDACC__ + Bridge* pbridge = static_cast*>( *ppbridge ); + *maxWeight = pbridge->computeMaxWeight(jacobianWeights); +#else + std::cerr << __func__ << " only implemented on GPU.\n"; + *maxWeight = 0.; +#endif + } + /** * Execute the matrix-element calculation "sequence" via a Bridge on GPU/CUDA or CUDA/C++, without multi-channel mode. * This is a C symbol that should be called from the Fortran code (in auto_dsig1.f). From 82963b7df07de8499fa2309a0eb09f2c9a16efe8 Mon Sep 17 00:00:00 2001 From: Stephan Hageboeck Date: Wed, 8 Feb 2023 16:17:10 +0100 Subject: [PATCH 4/4] [cuda/gg_ttgg] Use conservative unweight fudge factor Now that the max event weight can be computed in each batch, the unweight fudge factor for accepting / rejecting an event can be chosen much closer to one. 
Here we go on the conservative side, where we accept about twice as many events as go to the final sample. --- epochX/cudacpp/gg_ttgg.mad/SubProcesses/unwgt.f | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/unwgt.f b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/unwgt.f index e8f2aa1d26..3bb9f60731 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/unwgt.f +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/unwgt.f @@ -232,7 +232,7 @@ SUBROUTINE unwgt(px,wgt,numproc, ihel, icol, ivec) c data idum/-1/ data yran/1d0/ - data fudge/10d0/ + data fudge/0.5d0/ C----- C BEGIN CODE C-----