From 648d866c0a89e0e135910ed699353f753402c19b Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Tue, 7 Feb 2017 11:54:00 -0800 Subject: [PATCH 1/2] improved OpenMP usage 1) reduced the number of fork-joins per iteration 'omp parallel for' does a fork-join, which can get expensive at large thread-counts. when this construct is used many times in a function, it should be replaced with a single 'omp parallel' around multiple 'omp for'. the code previously found between parallel regions is assumed to require serialization and uses 'pragma omp single' for protection. 'single' is used instead of 'master' to allow the first encountering thread in the team to do the work, rather than waiting for the master thread. technically, but never in practice, 'single' requires MPI_THREAD_SERIALIZED instead of MPI_THREAD_FUNNELED. 'master' only requires MPI_THREAD_FUNNELED. it is possible that 'single nowait' is sufficient, in which case a few barriers can be eliminated. (aside: 'master' does not imply a barrier). 2) pragma omp simd wherever pragma ivdep is used the OpenMP standard defines 'pragma omp simd' semantics identical to the conventional meaning of the non-standard 'pragma ivdep'. the Intel compiler treats 'pragma omp simd' as an assertion rather than a hint, so if SIMD isn't appropriate, this pragma should be conditionalized using the preprocessor (C99/C++11 _Pragma being the O(1) solution here). 
--- src/Hydro.cc | 47 +++++++++++++++++++++++++++++++++++------------ 1 file changed, 35 insertions(+), 12 deletions(-) diff --git a/src/Hydro.cc b/src/Hydro.cc index 23fab68..8cde61b 100644 --- a/src/Hydro.cc +++ b/src/Hydro.cc @@ -116,6 +116,7 @@ void Hydro::init() { if (!subrgn.empty()) { const double eps = 1.e-12; #pragma ivdep + #pragma omp simd for (int z = zfirst; z < zlast; ++z) { if (zx[z].x > (subrgn[0] - eps) && zx[z].x < (subrgn[1] + eps) && @@ -128,6 +129,7 @@ void Hydro::init() { } #pragma ivdep + #pragma omp simd for (int z = zfirst; z < zlast; ++z) { zm[z] = zr[z] * zvol[z]; zetot[z] = ze[z] * zm[z]; @@ -157,6 +159,7 @@ void Hydro::initRadialVel( const double eps = 1.e-12; #pragma ivdep + #pragma omp simd for (int p = pfirst; p < plast; ++p) { double pmag = length(px[p]); if (pmag > eps) @@ -194,7 +197,9 @@ void Hydro::doCycle( double* zdl = mesh->zdl; // Begin hydro cycle - #pragma omp parallel for schedule(static) + #pragma omp parallel + { + #pragma omp for schedule(static) for (int pch = 0; pch < numpch; ++pch) { int pfirst = mesh->pchpfirst[pch]; int plast = mesh->pchplast[pch]; @@ -208,7 +213,7 @@ void Hydro::doCycle( advPosHalf(px0, pu0, dt, pxp, pfirst, plast); } // for pch - #pragma omp parallel for schedule(static) + #pragma omp for schedule(static) for (int sch = 0; sch < numsch; ++sch) { int sfirst = mesh->schsfirst[sch]; int slast = mesh->schslast[sch]; @@ -241,13 +246,17 @@ void Hydro::doCycle( qcs->calcForce(sfq, sfirst, slast); sumCrnrForce(sfp, sfq, sft, cftot, sfirst, slast); } // for sch - mesh->checkBadSides(); - // sum corner masses, forces to points - mesh->sumToPoints(cmaswt, pmaswt); - mesh->sumToPoints(cftot, pf); + #pragma omp single + { + mesh->checkBadSides(); - #pragma omp parallel for schedule(static) + // sum corner masses, forces to points + mesh->sumToPoints(cmaswt, pmaswt); + mesh->sumToPoints(cftot, pf); + } + + #pragma omp for schedule(static) for (int pch = 0; pch < numpch; ++pch) { int pfirst = 
mesh->pchpfirst[pch]; int plast = mesh->pchplast[pch]; @@ -266,10 +275,12 @@ void Hydro::doCycle( // 6. advance mesh to end of time step advPosFull(px0, pu0, pap, dt, px, pu, pfirst, plast); } // for pch + #pragma omp single + { + resetDtHydro(); + } - resetDtHydro(); - - #pragma omp parallel for schedule(static) + #pragma omp for schedule(static) for (int sch = 0; sch < numsch; ++sch) { int sfirst = mesh->schsfirst[sch]; int slast = mesh->schslast[sch]; @@ -286,9 +297,12 @@ void Hydro::doCycle( calcWork(sfp, sfq, pu0, pu, pxp, dt, zw, zetot, sfirst, slast); } // for sch - mesh->checkBadSides(); + #pragma omp single + { + mesh->checkBadSides(); + } - #pragma omp parallel for schedule(static) + #pragma omp for schedule(static) for (int zch = 0; zch < mesh->numzch; ++zch) { int zfirst = mesh->zchzfirst[zch]; int zlast = mesh->zchzlast[zch]; @@ -304,6 +318,7 @@ void Hydro::doCycle( calcDtHydro(zdl, zvol, zvol0, dt, zfirst, zlast); } // for zch + } // omp parallel } @@ -318,6 +333,7 @@ void Hydro::advPosHalf( double dth = 0.5 * dt; #pragma ivdep + #pragma omp simd for (int p = pfirst; p < plast; ++p) { pxp[p] = px0[p] + pu0[p] * dth; } @@ -335,6 +351,7 @@ void Hydro::advPosFull( const int plast) { #pragma ivdep + #pragma omp simd for (int p = pfirst; p < plast; ++p) { pu[p] = pu0[p] + pa[p] * dt; px[p] = px0[p] + 0.5 * (pu[p] + pu0[p]) * dt; @@ -352,6 +369,7 @@ void Hydro::calcCrnrMass( const int slast) { #pragma ivdep + #pragma omp simd for (int s = sfirst; s < slast; ++s) { int s3 = mesh->mapss3[s]; int z = mesh->mapsz[s]; @@ -371,6 +389,7 @@ void Hydro::sumCrnrForce( const int slast) { #pragma ivdep + #pragma omp simd for (int s = sfirst; s < slast; ++s) { int s3 = mesh->mapss3[s]; @@ -391,6 +410,7 @@ void Hydro::calcAccel( const double fuzz = 1.e-99; #pragma ivdep + #pragma omp simd for (int p = pfirst; p < plast; ++p) { pa[p] = pf[p] / max(pmass[p], fuzz); } @@ -406,6 +426,7 @@ void Hydro::calcRho( const int zlast) { #pragma ivdep + #pragma omp simd for (int z = 
zfirst; z < zlast; ++z) { zr[z] = zm[z] / zvol[z]; } @@ -461,6 +482,7 @@ void Hydro::calcWorkRate( const int zlast) { double dtinv = 1. / dt; #pragma ivdep + #pragma omp simd for (int z = zfirst; z < zlast; ++z) { double dvol = zvol[z] - zvol0[z]; zwrate[z] = (zw[z] + zp[z] * dvol) * dtinv; @@ -478,6 +500,7 @@ void Hydro::calcEnergy( const double fuzz = 1.e-99; #pragma ivdep + #pragma omp simd for (int z = zfirst; z < zlast; ++z) { ze[z] = zetot[z] / (zm[z] + fuzz); } From 09288b866fd5bb9dcc68142f8a16a99875d1def7 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Fri, 10 Feb 2017 10:25:52 -0800 Subject: [PATCH 2/2] fuse a pair of parallel regions --- src/Hydro.cc | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/Hydro.cc b/src/Hydro.cc index 8cde61b..5fe6671 100644 --- a/src/Hydro.cc +++ b/src/Hydro.cc @@ -103,7 +103,9 @@ void Hydro::init() { cftot = Memory::alloc(nums); // initialize hydro vars - #pragma omp parallel for schedule(static) + #pragma omp parallel + { + #pragma omp for schedule(static) for (int zch = 0; zch < numzch; ++zch) { int zfirst = mesh->zchzfirst[zch]; int zlast = mesh->zchzlast[zch]; @@ -136,7 +138,7 @@ void Hydro::init() { } } // for sch - #pragma omp parallel for schedule(static) + #pragma omp for schedule(static) for (int pch = 0; pch < numpch; ++pch) { int pfirst = mesh->pchpfirst[pch]; int plast = mesh->pchplast[pch]; @@ -146,8 +148,11 @@ void Hydro::init() { fill(&pu[pfirst], &pu[plast], double2(0., 0.)); } // for pch - resetDtHydro(); - + #pragma omp single + { + resetDtHydro(); + } + } // omp parallel }