61 changes: 42 additions & 19 deletions src/LocalMesh.C
@@ -64,13 +64,14 @@ LocalMesh::~LocalMesh()
FVSAND_FREE_DEVICE(dqupdate_d);
}

LocalMesh::LocalMesh(GlobalMesh *g, int myid, MPI_Comm comm)
LocalMesh::LocalMesh(GlobalMesh *g, int myid, MPI_Comm comm, bool usecuda)
{
Timer stopwatch;
mycomm=comm;
int ierr=MPI_Comm_rank(comm,&myid);
ierr=MPI_Comm_size(comm,&ngroup);
parallelComm pc;
usecudampi = usecuda;

// create communication patterns and ghost cells
stopwatch.tick();
@@ -339,25 +340,47 @@ void LocalMesh::UpdateFringes(double *qd)
{
nthreads=device2host.size();
if(nthreads == 0) return;
FVSAND_GPU_KERNEL_LAUNCH( updateHost, nthreads,

if ( usecudampi )
{
// uses CUDA-aware MPI and passes device pointers to MPI
FVSAND_GPU_KERNEL_LAUNCH( updateHost, nthreads,
qbuf_d, qd,device2host_d,nthreads);
// separate sends and receives so that we can overlap comm and calculation
// in the residual and iteration loops.
int reqcount=0;
pc.postRecvs_direct(qbuf_d2,nfields_d,rcvmap,ireq,mycomm,&reqcount);

// wait for the pack kernel to finish filling qbuf_d before posting sends
FVSAND::gpu::synchronize();

pc.postSends_direct(qbuf_d,nfields_d,sndmap,ireq,mycomm,&reqcount);
pc.finish_comm(reqcount,ireq,istatus);

// unpack
FVSAND_GPU_KERNEL_LAUNCH( updateDevice, nthreads, qd,
qbuf_d2, host2device_d, nthreads );
}
else
{
// stages buffers through host memory (memcpy to/from device global memory) and passes only host pointers to MPI
FVSAND_GPU_KERNEL_LAUNCH( updateHost, nthreads,
qbuf_d,qd,device2host_d,nthreads);
// separate sends and receives so that we can overlap comm and calculation
// in the residual and iteration loops.
// TODO (george) use qbuf2_d and qbuf_d instead of qbuf2 and qbuf for cuda-aware
int reqcount=0;
pc.postRecvs_direct(qbuf2,nfields_d,rcvmap,ireq,mycomm,&reqcount);
// TODO (george) with cuda-aware this pull is not required
// but it doesn't work now
gpu::pull_from_device<double>(qbuf,qbuf_d,sizeof(double)*device2host.size());
pc.postSends_direct(qbuf,nfields_d,sndmap,ireq,mycomm,&reqcount);
pc.finish_comm(reqcount,ireq,istatus);
// same as above
// not doing cuda-aware now
gpu::copy_to_device(qbuf_d2,qbuf2,sizeof(double)*host2device.size());

nthreads=host2device.size();
FVSAND_GPU_KERNEL_LAUNCH( updateDevice, nthreads,
qd,qbuf_d2,host2device_d,nthreads);

int reqcount=0;
pc.postRecvs_direct(qbuf2,nfields_d,rcvmap,ireq,mycomm,&reqcount);
gpu::pull_from_device<double>(qbuf,qbuf_d,sizeof(double)*device2host.size());

pc.postSends_direct(qbuf,nfields_d,sndmap,ireq,mycomm,&reqcount);
pc.finish_comm(reqcount,ireq,istatus);

gpu::copy_to_device(qbuf_d2,qbuf2,sizeof(double)*host2device.size());

nthreads=host2device.size();
FVSAND_GPU_KERNEL_LAUNCH( updateDevice, nthreads,
qd,qbuf_d2,host2device_d,nthreads);
}

}


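For orientation, here is a minimal sketch of the two exchange paths in UpdateFringes above, contrasting the CUDA-aware route (device pointers handed directly to MPI) with the host-staged route (explicit copies through host buffers before and after the MPI calls). The function and buffer names are illustrative placeholders, not the FVSAND API.

```cpp
// Hypothetical sketch: exchange one packed buffer with a single neighbor rank.
// In both cases the data is packed on the device by a kernel beforehand.
#include <mpi.h>
#include <cuda_runtime.h>

void exchange(double* sendbuf_d, double* recvbuf_d,   // device buffers
              double* sendbuf_h, double* recvbuf_h,   // host staging buffers
              int nvals, int neighbor, MPI_Comm comm, bool usecudampi)
{
  MPI_Request req[2];
  if (usecudampi) {
    // CUDA-aware path: MPI reads/writes device memory directly.
    MPI_Irecv(recvbuf_d, nvals, MPI_DOUBLE, neighbor, 0, comm, &req[0]);
    cudaDeviceSynchronize();   // make sure the pack kernel has finished
    MPI_Isend(sendbuf_d, nvals, MPI_DOUBLE, neighbor, 0, comm, &req[1]);
    MPI_Waitall(2, req, MPI_STATUSES_IGNORE);
    // unpack kernel can read recvbuf_d directly
  } else {
    // Host-staged path: copy through host memory, pass host pointers to MPI.
    MPI_Irecv(recvbuf_h, nvals, MPI_DOUBLE, neighbor, 0, comm, &req[0]);
    cudaMemcpy(sendbuf_h, sendbuf_d, nvals * sizeof(double), cudaMemcpyDeviceToHost);
    MPI_Isend(sendbuf_h, nvals, MPI_DOUBLE, neighbor, 0, comm, &req[1]);
    MPI_Waitall(2, req, MPI_STATUSES_IGNORE);
    cudaMemcpy(recvbuf_d, recvbuf_h, nvals * sizeof(double), cudaMemcpyHostToDevice);
    // unpack kernel then reads recvbuf_d
  }
}
```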
5 changes: 4 additions & 1 deletion src/LocalMesh.h
@@ -94,6 +94,8 @@ class LocalMesh
MPI_Request *ireq{nullptr};
MPI_Status *istatus{nullptr};

bool usecudampi{false}; // indicates whether cuda-aware MPI is used

public:

// solution fields at n+1,n & n-1
@@ -108,7 +110,8 @@
~LocalMesh();
LocalMesh(GlobalMesh *g,
int myid,
MPI_Comm comm);
MPI_Comm comm,
bool usecudampi=false );
void WriteMesh(int label);
void CreateGridMetrics(int);
void CreateFaces();
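Because the new constructor parameter defaults to false, existing call sites that pass only (g, myid, comm) keep the host-staged communication path unchanged.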
23 changes: 18 additions & 5 deletions src/fvsand.C
@@ -9,6 +9,7 @@
#include <string>
#include "NVTXMacros.h"
#include <sstream> // for std::ostringstream
#include <cstring> // for std::strcmp()
#include "inputParser.h"
using namespace FVSAND;

@@ -76,10 +77,22 @@ int main(int argc, char *argv[])
int nsweep = 2; // Jacobi Sweeps (=0 means explicit)
int istoreJac =3; // Jacobian storage or not
int restype=0; // restype = 0 (cell-based) 1 (face-based)
if (argc > 1) {
parseInputs(argv[1],fname,&dsmin,&stretch,&nlevels,
flovar,&nsteps,&nsave,&dt,reOrderCells,&nsweep,
&istoreJac,&restype);
bool usecudampi = false; // optional argument to fvsand
if (argc > 1)
{
parseInputs(argv[1],fname,&dsmin,&stretch,&nlevels,
flovar,&nsteps,&nsave,&dt,reOrderCells,&nsweep,
&istoreJac,&restype);

for ( int iarg=2; iarg < argc; ++iarg )
{
if ( std::strcmp(argv[iarg], "--usecudampi" ) == 0 )
{
usecudampi = true;
}

} // end for all command line arguments

}

// runge-kutta tableau
@@ -95,7 +108,7 @@
// create local mesh partitions
// and compute grid metrics
LocalMesh *lm;
lm= new LocalMesh(sm,myid,MPI_COMM_WORLD);
lm= new LocalMesh(sm,myid,MPI_COMM_WORLD,usecudampi);
lm->CreateGridMetrics(istoreJac);

// initialize solution
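With this change, every argument after the input file is scanned for the flag, so a run would look something like `mpirun -np 4 ./fvsand <inputfile> --usecudampi` (launcher, rank count, and input-file name are illustrative); without the flag the solver keeps the host-staged default.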
32 changes: 17 additions & 15 deletions src/parallelComm.h
@@ -288,6 +288,7 @@ namespace FVSAND {
delete [] istatus;
}

inline
void postRecvs_direct(double *qbuf, int nfields,
std::unordered_map <int, std::vector<int>> rcvmap,
MPI_Request *ireq,
@@ -296,17 +297,17 @@
{
int offset=0;
int rcount=*k;
for(auto r:rcvmap)
{
MPI_Irecv(qbuf+offset,
r.second.size()*nfields, MPI_DOUBLE,
r.first,0,comm,&ireq[rcount++]);
offset+=r.second.size()*nfields;
}
for(const auto& r: rcvmap)
{
MPI_Irecv(qbuf+offset,
r.second.size()*nfields, MPI_DOUBLE,
r.first,0,comm,&ireq[rcount++]);
offset+=r.second.size()*nfields;
}
*k=rcount;
}


inline
void postSends_direct(double *qbuf, int nfields,
std::unordered_map <int, std::vector<int>> sndmap,
MPI_Request *ireq,
@@ -315,16 +316,17 @@
{
int offset=0;
int scount=*k;
for(auto s:sndmap)
{
MPI_Isend(qbuf+offset,
s.second.size()*nfields, MPI_DOUBLE,
s.first,0,comm,&ireq[scount++]);
offset+=s.second.size()*nfields;
}
for(const auto& s:sndmap)
{
MPI_Isend(qbuf+offset,
s.second.size()*nfields, MPI_DOUBLE,
s.first,0,comm,&ireq[scount++]);
offset+=s.second.size()*nfields;
}
*k=scount;
}

inline
void finish_comm(int nrequests, MPI_Request *ireq, MPI_Status *istatus)
{
MPI_Waitall(nrequests,ireq,istatus);
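A hedged sketch of how these helpers are meant to compose, following the call order used in UpdateFringes (receives posted first, sends after the buffer is ready, interior work overlapped before the wait). Only the parallelComm methods are real; the commented-out pack/compute/unpack helpers are placeholders for the caller's own steps.

```cpp
#include <mpi.h>
#include <unordered_map>
#include <vector>
#include "parallelComm.h"   // FVSAND::parallelComm

// Illustrative wrapper around the communication helpers; not FVSAND code.
void exchange_and_overlap(FVSAND::parallelComm& pc,
                          double* sendbuf, double* recvbuf, int nfields,
                          std::unordered_map<int, std::vector<int>>& sndmap,
                          std::unordered_map<int, std::vector<int>>& rcvmap,
                          MPI_Request* ireq, MPI_Status* istatus, MPI_Comm comm)
{
  int reqcount = 0;
  pc.postRecvs_direct(recvbuf, nfields, rcvmap, ireq, comm, &reqcount); // receives first
  // packSendBuffer(sendbuf);        // fill the packed send buffer (caller's step)
  pc.postSends_direct(sendbuf, nfields, sndmap, ireq, comm, &reqcount); // then the sends
  // computeInteriorResidual();      // overlap: work that needs no ghost data (caller's step)
  pc.finish_comm(reqcount, ireq, istatus);   // MPI_Waitall on all posted requests
  // unpackRecvBuffer(recvbuf);      // ghost values are now valid (caller's step)
}
```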