diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/fbridge.inc b/epochX/cudacpp/gg_tt.mad/SubProcesses/fbridge.inc index a28622cdb6..6a4a82004c 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/fbridge.inc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/fbridge.inc @@ -85,6 +85,20 @@ C END SUBROUTINE FBRIDGESEQUENCE_NOMULTICHANNEL END INTERFACE +C +C Compute the maximum event weight of the batch on the GPU +C - PBRIDGE: the memory address of the C++ Bridge +C - ALL_WGT: the Jacobian+PDF weights from Fortran +C - MAX_WGT: maximum observed weight on GPU (output) +C + INTERFACE + SUBROUTINE FBRIDGECOMPUTEMAXWEIGHT(PBRIDGE, ALL_WGT, MAX_WGT) + INTEGER*8 PBRIDGE + DOUBLE PRECISION ALL_WGT(*) + DOUBLE PRECISION MAX_WGT + END SUBROUTINE FBRIDGECOMPUTEMAXWEIGHT + END INTERFACE + C C Retrieve the number of good helicities for helicity filtering in the Bridge. C - PBRIDGE: the memory address of the C++ Bridge diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/Bridge.h index 60eb101a6a..e0aef783ae 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/Bridge.h @@ -47,6 +47,41 @@ namespace mg5amcCpu virtual ~CppObjectInFortran() {} }; +#ifdef __CUDACC__ + template<typename T, typename U> + __global__ void computeFinalWeightKernel(T const * matrixElmSq, U * weights, const std::size_t n) { + for( unsigned int i = threadIdx.x + blockIdx.x * blockDim.x; i < n; i += blockDim.x * gridDim.x) { + weights[i] = weights[i] * matrixElmSq[i]; + } + } + + template<typename T> + __device__ T max(T a, T b) { + return a < b ? 
b : a; + } + + template<typename T> + __global__ void computeMaxWeightKernel(T * weights, const std::size_t n) { + T maxWeight = 0; + __shared__ T sharedMaxWeight[1024]; + + for( unsigned int i = threadIdx.x; blockIdx.x == 0 && i < n; i += blockDim.x ) + { + maxWeight = max(maxWeight, weights[i]); + } + sharedMaxWeight[threadIdx.x] = maxWeight; + + __syncthreads(); + + if (threadIdx.x == 0) { + for( unsigned int i = 0; i < blockDim.x; ++i) { + maxWeight = max( maxWeight, sharedMaxWeight[i] ); + } + weights[0] = maxWeight; + } + } +#endif + //-------------------------------------------------------------------------- /** * A templated class for calling the CUDA/C++ matrix element calculations of the event generation workflow. @@ -122,6 +157,14 @@ int* selhel, int* selcol, const bool goodHelOnly = false ); + + /** + * Compute the maximum event weight on the device. + * + * @param jacobianWeights Jacobian and PDF weights from madevent + */ + FORTRANFPTYPE computeMaxWeight( const FORTRANFPTYPE* jacobianWeights ); + #else /** * Sequence to be executed for the vectorized CPU matrix element calculation @@ -168,6 +211,7 @@ DeviceBufferMatrixElements m_devMEs; DeviceBufferSelectedHelicity m_devSelHel; DeviceBufferSelectedColor m_devSelCol; + std::unique_ptr<DeviceBuffer<FORTRANFPTYPE, 1>> m_devJacobianWeights = nullptr; PinnedHostBufferGs m_hstGs; PinnedHostBufferRndNumHelicity m_hstRndHel; PinnedHostBufferRndNumColor m_hstRndCol; @@ -315,18 +359,10 @@ //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... 
this seems slower gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } - if constexpr( std::is_same_v<FORTRANFPTYPE, double> ) - { - memcpy( m_hstGs.data(), gs, m_nevt * sizeof( FORTRANFPTYPE ) ); - memcpy( m_hstRndHel.data(), rndhel, m_nevt * sizeof( FORTRANFPTYPE ) ); - memcpy( m_hstRndCol.data(), rndcol, m_nevt * sizeof( FORTRANFPTYPE ) ); - } - else - { - std::copy( gs, gs + m_nevt, m_hstGs.data() ); - std::copy( rndhel, rndhel + m_nevt, m_hstRndHel.data() ); - std::copy( rndcol, rndcol + m_nevt, m_hstRndCol.data() ); - } + std::copy( gs, gs + m_nevt, m_hstGs.data() ); + std::copy( rndhel, rndhel + m_nevt, m_hstRndHel.data() ); + std::copy( rndcol, rndcol + m_nevt, m_hstRndCol.data() ); + copyDeviceFromHost( m_devGs, m_hstGs ); copyDeviceFromHost( m_devRndHel, m_hstRndHel ); copyDeviceFromHost( m_devRndCol, m_hstRndCol ); @@ -341,19 +377,26 @@ flagAbnormalMEs( m_hstMEs.data(), m_nevt ); copyHostFromDevice( m_hstSelHel, m_devSelHel ); copyHostFromDevice( m_hstSelCol, m_devSelCol ); - if constexpr( std::is_same_v<FORTRANFPTYPE, double> ) - { - memcpy( mes, m_hstMEs.data(), m_hstMEs.bytes() ); - memcpy( selhel, m_hstSelHel.data(), m_hstSelHel.bytes() ); - memcpy( selcol, m_hstSelCol.data(), m_hstSelCol.bytes() ); - } - else - { - std::copy( m_hstMEs.data(), m_hstMEs.data() + m_nevt, mes ); - std::copy( m_hstSelHel.data(), m_hstSelHel.data() + m_nevt, selhel ); - std::copy( m_hstSelCol.data(), m_hstSelCol.data() + m_nevt, selcol ); - } + std::copy( m_hstMEs.data(), m_hstMEs.data() + m_nevt, mes ); + std::copy( m_hstSelHel.data(), m_hstSelHel.data() + m_nevt, selhel ); + std::copy( m_hstSelCol.data(), m_hstSelCol.data() + m_nevt, selcol ); + } + + template<typename FORTRANFPTYPE> + FORTRANFPTYPE Bridge<FORTRANFPTYPE>::computeMaxWeight( const FORTRANFPTYPE* jacobianWeights ) + { + if( !m_devJacobianWeights ) m_devJacobianWeights = std::make_unique<DeviceBuffer<FORTRANFPTYPE, 1>>( m_nevt ); + checkCuda( cudaMemcpy(m_devJacobianWeights->data(), jacobianWeights, 
m_nevt*sizeof(FORTRANFPTYPE), cudaMemcpyDefault) ); + + computeFinalWeightKernel<<<m_gpublocks, m_gputhreads>>>( m_devMEs.data(), m_devJacobianWeights->data(), m_nevt ); + computeMaxWeightKernel<<<1, m_gputhreads>>>( m_devJacobianWeights->data(), m_nevt ); + + FORTRANFPTYPE maxWeight = 0.; + checkCuda( cudaMemcpy( &maxWeight, m_devJacobianWeights->data(), sizeof( FORTRANFPTYPE ), cudaMemcpyDefault ) ); + + return maxWeight; } + #endif #ifndef MGONGPUCPP_GPUIMPL @@ -369,18 +412,9 @@ const bool goodHelOnly ) { hst_transposeMomentaF2C( momenta, m_hstMomentaC.data(), m_nevt ); - if constexpr( std::is_same_v<FORTRANFPTYPE, double> ) - { - memcpy( m_hstGs.data(), gs, m_nevt * sizeof( FORTRANFPTYPE ) ); - memcpy( m_hstRndHel.data(), rndhel, m_nevt * sizeof( FORTRANFPTYPE ) ); - memcpy( m_hstRndCol.data(), rndcol, m_nevt * sizeof( FORTRANFPTYPE ) ); - } - else - { - std::copy( gs, gs + m_nevt, m_hstGs.data() ); - std::copy( rndhel, rndhel + m_nevt, m_hstRndHel.data() ); - std::copy( rndcol, rndcol + m_nevt, m_hstRndCol.data() ); - } + std::copy( gs, gs + m_nevt, m_hstGs.data() ); + std::copy( rndhel, rndhel + m_nevt, m_hstRndHel.data() ); + std::copy( rndcol, rndcol + m_nevt, m_hstRndCol.data() ); if( m_nGoodHel < 0 ) { m_nGoodHel = m_pmek->computeGoodHelicities(); @@ -389,18 +423,9 @@ if( goodHelOnly ) return; m_pmek->computeMatrixElements( channelId ); flagAbnormalMEs( m_hstMEs.data(), m_nevt ); - if constexpr( std::is_same_v<FORTRANFPTYPE, double> ) - { - memcpy( mes, m_hstMEs.data(), m_hstMEs.bytes() ); - memcpy( selhel, m_hstSelHel.data(), m_hstSelHel.bytes() ); - memcpy( selcol, m_hstSelCol.data(), m_hstSelCol.bytes() ); - } - else - { - std::copy( m_hstMEs.data(), m_hstMEs.data() + m_nevt, mes ); - std::copy( m_hstSelHel.data(), m_hstSelHel.data() + m_nevt, selhel ); - std::copy( m_hstSelCol.data(), m_hstSelCol.data() + m_nevt, selcol ); - } + std::copy( m_hstMEs.data(), m_hstMEs.data() + m_nevt, mes ); + std::copy( m_hstSelHel.data(), m_hstSelHel.data() + m_nevt, selhel ); + std::copy( m_hstSelCol.data(), 
m_hstSelCol.data() + m_nevt, selcol ); } #endif diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/auto_dsig1.f b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/auto_dsig1.f index ddc480ec63..a6987f3944 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/auto_dsig1.f +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/auto_dsig1.f @@ -251,6 +251,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C DOUBLE PRECISION ALL_PP(0:3,NEXTERNAL,VECSIZE_MEMMAX) DOUBLE PRECISION ALL_WGT(VECSIZE_MEMMAX) + DOUBLE PRECISION ALL_JACANDPDF(VECSIZE_MEMMAX) DOUBLE PRECISION ALL_XBK(2,VECSIZE_MEMMAX) DOUBLE PRECISION ALL_Q2FACT(2,VECSIZE_MEMMAX) DOUBLE PRECISION ALL_CM_RAP(VECSIZE_MEMMAX) @@ -397,8 +398,15 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, CALL RANMAR(HEL_RAND(IVEC)) CALL RANMAR(COL_RAND(IVEC)) ENDDO + + ! Prepare computation of max weight on GPU + ! This requires the jacobian and PDF weights + DO IVEC=1,VECSIZE_USED + ALL_JACANDPDF(IVEC) = ALL_WGT(IVEC)*ALL_PD(0,IVEC) + ENDDO + CALL SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL, - $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED, ALL_JACANDPDF) DO IVEC=1,VECSIZE_USED @@ -465,7 +473,7 @@ SUBROUTINE PRINT_ZERO_AMP1() SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL, - $ OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED) + $ OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED, ALL_WGT) USE OMP_LIB IMPLICIT NONE @@ -482,6 +490,8 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL, INTEGER SELECTED_HEL(VECSIZE_MEMMAX) INTEGER SELECTED_COL(VECSIZE_MEMMAX) INTEGER VECSIZE_USED + DOUBLE PRECISION ALL_WGT(VECSIZE_MEMMAX) + INTEGER IVEC INTEGER IEXT @@ -506,6 +516,11 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL, DOUBLE PRECISION CBYF1 INTEGER*4 NGOODHEL, NTOTHEL +C Pull in twgt to allow for more efficient unweighting + double precision twgt, 
maxwgt,swgt(maxevents) + integer lun, nw, itmin + common/to_unwgt/twgt, maxwgt, swgt, lun, nw, itmin + INTEGER*4 NWARNINGS SAVE NWARNINGS DATA NWARNINGS/0/ @@ -576,6 +591,12 @@ & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities ENDIF call counters_smatrix1multi_stop( 0 ) ! cudacppMEs=0 + + ! Compute the maximum weight of this batch. It needs to be + ! rescaled by CONV=389379660d0 to be compatible with the scale + ! used in the common block to_unwgt + CALL FBRIDGECOMPUTEMAXWEIGHT(FBRIDGE_PBRIDGE, ALL_WGT, twgt) + twgt = twgt * 389379660d0 ENDIF IF( FBRIDGE_MODE .LT. 0 ) THEN ! (BothQuiet=-1 or BothDebug=-2) diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/fbridge.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/fbridge.cc index 99efcb1dbe..8da8dbd2ab 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/fbridge.cc @@ -110,6 +110,19 @@ extern "C" #endif } + void fbridgecomputemaxweight_( CppObjectInFortran** ppbridge, + const FORTRANFPTYPE* jacobianWeights, + double * maxWeight) + { +#ifdef __CUDACC__ + Bridge<FORTRANFPTYPE>* pbridge = static_cast<Bridge<FORTRANFPTYPE>*>( *ppbridge ); + *maxWeight = pbridge->computeMaxWeight(jacobianWeights); +#else + std::cerr << __func__ << " only implemented on GPU.\n"; + *maxWeight = 0.; +#endif + } + /** * Execute the matrix-element calculation "sequence" via a Bridge on GPU/CUDA or CUDA/C++, without multi-channel mode. * This is a C symbol that should be called from the Fortran code (in auto_dsig1.f). diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/unwgt.f b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/unwgt.f index e8f2aa1d26..3bb9f60731 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/unwgt.f +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/unwgt.f @@ -232,7 +232,7 @@ SUBROUTINE unwgt(px,wgt,numproc, ihel, icol, ivec) c data idum/-1/ data yran/1d0/ - data fudge/10d0/ + data fudge/0.5d0/ C----- C BEGIN CODE C-----