QMCPACK · ye-luo · Feb 7, 2019 · Dec 22, 2017 · Apr 12, 2018 · Apr 12, 2018
diff --git a/CMake/GNUCompilers.cmake b/CMake/GNUCompilers.cmake
@@ -43,8 +43,8 @@ if(CMAKE_CXX_FLAGS MATCHES "-march=" OR CMAKE_C_FLAGS MATCHES "-march=")
   endif() #(CMAKE_CXX_FLAGS MATCHES "-march=" AND CMAKE_C_FLAGS MATCHES "-march=")
 else() #(CMAKE_CXX_FLAGS MATCHES "-march=" OR CMAKE_C_FLAGS MATCHES "-march=")
   # use -march=native
-  SET(CMAKE_C_FLAGS     "${CMAKE_C_FLAGS} -march=native")
-  SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native")
+  SET(CMAKE_C_FLAGS     "${CMAKE_C_FLAGS}")
+  SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
 endif() #(CMAKE_CXX_FLAGS MATCHES "-march=" OR CMAKE_C_FLAGS MATCHES "-march=")
 
 ENDIF((NOT $ENV{CRAYPE_VERSION} MATCHES ".") AND (NOT CMAKE_SYSTEM_PROCESSOR MATCHES "ppc64"))

diff --git a/CMake/IBMCompilers.cmake b/CMake/IBMCompilers.cmake
@@ -19,14 +19,14 @@ SET(CMAKE_C_FLAGS   "${CMAKE_C_FLAGS} -Wno-deprecated -Wno-unused-value")
 SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-deprecated -Wno-unused-value")
 
 # Set extra optimization specific flags
-SET( CMAKE_C_FLAGS_RELEASE   "-O3 -DNDEBUG" )
-SET( CMAKE_CXX_FLAGS_RELEASE "-O3 -DNDEBUG" )
-SET( CMAKE_C_FLAGS_RELWITHDEBINFO   "-g -O3" )
-SET( CMAKE_CXX_FLAGS_RELWITHDEBINFO "-g -O3" )
+# SET( CMAKE_C_FLAGS_RELEASE   "-O3 -DNDEBUG" )
+# SET( CMAKE_CXX_FLAGS_RELEASE "-O3 -DNDEBUG" )
+# SET( CMAKE_C_FLAGS_RELWITHDEBINFO   "-g -O3" )
+# SET( CMAKE_CXX_FLAGS_RELWITHDEBINFO "-g -O3" )
 
 # Set language standardards
-SET(CMAKE_C_FLAGS   "${CMAKE_C_FLAGS} -std=gnu11")
-SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=gnu++11 -qnoxlcompatmacros")
+SET(CMAKE_C_FLAGS   "${CMAKE_C_FLAGS} -std=gnu11 -O3") # NOTE(review): -O3 is now part of the configuration-independent flags, so it is also passed to Debug builds
+SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=gnu++11 -qnoxlcompatmacros -O3") # NOTE(review): same for C++; the per-configuration RELEASE/RELWITHDEBINFO overrides above are disabled
 
 IF(QMC_OMP)
   SET(ENABLE_OPENMP 1)

diff --git a/src/CUDA/gpu_vector.h b/src/CUDA/gpu_vector.h
@@ -360,6 +360,45 @@ class device_vector
 #endif
   }
 
+  /// Asynchronously copy datalen host elements from vec_ptr to element offset of
+  /// this device vector; an owned vector whose size differs is resized to len.
+  void
+  asyncCopy(const T* vec_ptr, size_t len, size_t offset, size_t datalen)
+  {
+    if ((offset > len) || (datalen > len-offset)) // overflow-safe form of offset+datalen > len
+    {
+      fprintf (stderr, "Trying to write more than the length of the vector.\n");
+      fprintf (stderr, "Name = %s.  This size = %zu, vec size = %zu, needed length = %zu\n",
+               name.c_str(), (size_t)size(), len, offset+datalen);
+      abort();
+    }
+    if (this->size() != len)
+    {
+      if (!own_data) // a referenced (non-owning) vector is never resized here
+      {
+        fprintf (stderr, "Assigning referenced GPU vector, but it has the wrong size.\n");
+        fprintf (stderr, "Name = %s.  This size = %zu, vec size = %zu\n",
+                 name.c_str(), (size_t)size(), len);
+        abort();
+      }
+      resize(len);
+    }
+#ifdef QMC_CUDA
+    cudaMemcpyAsync (&((*this)[offset]), vec_ptr, datalen*sizeof(T),
+                     cudaMemcpyHostToDevice, kernelStream);
+    cudaError_t err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+      fprintf (stderr, "In asyncCopy, name=%s, size=%zu  len=%zu  offset=%zu  datalen=%zu\n",
+               name.c_str(), (size_t)size(), len, offset, datalen);
+      fprintf (stderr, "this pointer = %p  vec pointer=%p\n", (const void*)data_pointer, (const void*)vec_ptr);
+      fprintf (stderr, "CUDA error in device_vector::asyncCopy(const T* vec_ptr, len, offset, datalen) for %s:\n  %s\n",
+               name.c_str(), cudaGetErrorString(err));
+      abort();
+    }
+#endif
+  }
+
   void
   asyncCopy(const std::vector<T, std::allocator<T> > &vec)
   {

diff --git a/src/Numerics/CUDA/cuda_inverse.cu b/src/Numerics/CUDA/cuda_inverse.cu
diff --git a/src/Numerics/CUDA/cuda_inverse.h b/src/Numerics/CUDA/cuda_inverse.h
@@ -29,6 +29,26 @@ cublas_inverse (cublasHandle_t handle,
                 int N, int rowStride, int numMats,
                 bool useHigherPrecision = true);
 
+void // single-precision overload; NOTE(review): the *_d[] arguments look like lists of device pointers, one per walker (nw) — confirm in cuda_inverse.cu
+cublas_lemma_mats (cublasHandle_t handle,
+                   float *AList_d[], float *AWorkList_d[],
+                   float *AinvList_d[], float *AinvkList_d[], float *U_d[],
+                   float *lemma_d[], float *AinvUList_d[],
+                   int k, int N, int nw, int RowStride);
+// Single-precision overload; NOTE(review): presumably works on a row of the inverse matrices — confirm in cuda_inverse.cu
+void
+cublas_ainv_row (cublasHandle_t handle,
+                 float *AinvkList_d[], float *AWorkList_d[], float *AinvList_d[],
+                 int k, int N, int nw, int RowStride);
+// Single-precision overload; NOTE(review): "smw" presumably stands for Sherman-Morrison-Woodbury — confirm in cuda_inverse.cu
+void
+cublas_smw_update (cublasHandle_t handle,
+                   float *AinvkList_d[], float *AinvList_d[], 
+                   float *AinvUList_d[], float *AWorkList_d[],
+                   float *lemma_inv[], float *lemma_lu[],
+                   int *info_array,
+                   int k, int kd, int M, int N, int nw, int RowStride);
+
 //////////////////////
 // Double precision //
 //////////////////////
@@ -40,6 +60,25 @@ cublas_inverse (cublasHandle_t handle,
                 int N, int rowStride, int numMats, 
                 bool useHigherPrecision = true);
 
+void // double-precision overload with the same parameter list as the float version declared earlier in this header
+cublas_lemma_mats (cublasHandle_t handle,
+                   double *AList_d[], double *AWorkList_d[],
+                   double *AinvList_d[], double *AinvkList_d[], double *U_d[],
+                   double *lemma_d[], double *AinvUList_d[],
+                   int k, int N, int nw, int RowStride);
+// Double-precision overload; NOTE(review): presumably works on a row of the inverse matrices — confirm in cuda_inverse.cu
+void
+cublas_ainv_row (cublasHandle_t handle,
+                 double *AinvkList_d[], double *AWorkList_d[], double *AinvList_d[],
+                 int k, int N, int nw, int RowStride);
+// Double-precision overload; NOTE(review): "smw" presumably stands for Sherman-Morrison-Woodbury — confirm in cuda_inverse.cu
+void
+cublas_smw_update (cublasHandle_t handle,
+                   double *AinvkList_d[], double *AinvList_d[], 
+                   double *AinvUList_d[], double *AWorkList_d[],
+                   double *lemma_inv[], double *lemma_lu[],
+                   int *info_array,
+                   int k, int kd, int M, int N, int nw, int RowStride);
 
 //////////////////////////////////////
 // Complex single / mixed precision //

diff --git a/src/Particle/MCWalkerConfiguration.cpp b/src/Particle/MCWalkerConfiguration.cpp
@@ -555,14 +555,14 @@ void MCWalkerConfiguration::updateLists_GPU()
 {
   int nw = WalkerList.size();
   int NumSpecies = getSpeciesSet().TotalNum;
-  if (Rnew_GPU.size() != nw)
+  if (Rnew_GPU.size() != nw*kblocksize)
   {
-    Rnew_GPU.resize(nw);
+    Rnew_GPU.resize(nw*kblocksize);
     RhokLists_GPU.resize(NumSpecies);
     for (int isp=0; isp<NumSpecies; isp++)
       RhokLists_GPU[isp].resize(nw);
-    Rnew_host.resize(nw);
-    Rnew.resize(nw);
+    Rnew_host.resize(nw*kblocksize);
+    Rnew.resize(nw*kblocksize);
     AcceptList_GPU.resize(nw);
     AcceptList_host.resize(nw);
     RList_GPU.resize(nw);
@@ -574,6 +574,7 @@ void MCWalkerConfiguration::updateLists_GPU()
   hostlist_valueType.resize(nw);
   hostlist_AA.resize(nw);
 
+// TODO: May need to think about R_GPU now potentially being a block... (AT)
   for (int iw=0; iw<nw; iw++)
   {
     if (WalkerList[iw]->R_GPU.size() != R.size())
@@ -654,20 +655,45 @@ void MCWalkerConfiguration::copyWalkerGradToGPU()
 }
 
 void MCWalkerConfiguration::proposeMove_GPU
-(std::vector<PosType> &newPos, int iat)
+(std::vector<PosType> &newPos, int iat, int nat)
 {
-  if (Rnew_host.size() < newPos.size())
-    Rnew_host.resize(newPos.size());
-  for (int i=0; i<newPos.size(); i++)
+  int nw=newPos.size();
+  if (Rnew_host.size() < nw*kblocksize)
+  {
+    Rnew.resize(nw*kblocksize);
+    Rnew_host.resize(nw*kblocksize);
+  }
+  // store things sequentially with k to make evaluation more straight-forward:
+  //           k=0     k=1     k=kblocksize
+  // Rnew = [0,..,nw|0,..,nw|...|0,..,nw]
+  int offset=kcurr*nw;
+  for (int i=0; i<nw; i++)
+  {
     for (int dim=0; dim<OHMMS_DIM; dim++)
-      Rnew_host[i][dim] = newPos[i][dim];
-  Rnew_GPU.asyncCopy(Rnew_host);
-  Rnew = newPos;
+    {
+      Rnew[i+offset][dim] = newPos[i][dim];
+      Rnew_host[i+offset][dim] = newPos[i][dim];
+    }
+  }
+  if(kDelay){
+    kcurr=(kcurr+1)%kblocksize; // loop kcurr around every k blocks
+    kstart=kblock*kblocksize;
+    if(kcurr==0)
+      kblock++; // keep increasing kblock (even beyond available matrix blocks) - the update check takes care of self-consistency
+    // only copy new position matrix when needed (when update is imminent)
+    if(klinear)
+    {
+      Rnew_GPU.asyncCopy(&(Rnew_host[offset]),nw*kblocksize,offset,nw);
+    } else
+      if(kcurr==0 || (kcurr+kblock*kblocksize>=nat/2)) // TODO: Make sure we do *not* divide by two if there is no spin up/down matrix savings
+        Rnew_GPU.asyncCopy(Rnew_host);
+  } else
+    Rnew_GPU.asyncCopy(Rnew_host);
   CurrentParticle = iat;
 }
 
 
-void MCWalkerConfiguration::acceptMove_GPU(std::vector<bool> &toAccept)
+void MCWalkerConfiguration::acceptMove_GPU(std::vector<bool> &toAccept, int k)
 {
   if (AcceptList_host.size() < toAccept.size())
     AcceptList_host.resize(toAccept.size());
@@ -682,13 +708,13 @@ void MCWalkerConfiguration::acceptMove_GPU(std::vector<bool> &toAccept)
 //   app_log() << "RList_GPU.size()       = " << RList_GPU.size() << std::endl;
   if (RList_GPU.size() != WalkerList.size())
     std::cerr << "Error in RList_GPU size.\n";
-  if (Rnew_GPU.size() != WalkerList.size())
+  if (Rnew_GPU.size() != WalkerList.size()*kblocksize)
     std::cerr << "Error in Rnew_GPU size.\n";
   if (AcceptList_GPU.size() != WalkerList.size())
-    std::cerr << "Error in AcceptList_GPU_GPU size.\n";
+    std::cerr << "Error in AcceptList_GPU size.\n";
   accept_move_GPU_cuda
   (RList_GPU.data(), (CUDA_PRECISION*)Rnew_GPU.data(),
-   AcceptList_GPU.data(), CurrentParticle, WalkerList.size());
+   AcceptList_GPU.data(), CurrentParticle++, WalkerList.size(), k);
 }
 
 
@@ -703,6 +729,7 @@ void MCWalkerConfiguration::NLMove_GPU(std::vector<Walker_t*> &walkers,
     NLlist_GPU.resize(N);
     NLlist_host.resize(N);
   }
+// TODO: In case delayed updates are including DMC, need to work here (AT)
   if (Rnew_GPU.size() < N)
   {
     Rnew_host.resize(N);

diff --git a/src/Particle/MCWalkerConfiguration.h b/src/Particle/MCWalkerConfiguration.h
@@ -122,11 +122,16 @@ class MCWalkerConfiguration: public ParticleSet
   GPU_XRAY_TRACE void updateLists_GPU();
   int CurrentParticle;
   GPU_XRAY_TRACE void proposeMove_GPU
-  (std::vector<PosType> &newPos, int iat);
-  GPU_XRAY_TRACE void acceptMove_GPU(std::vector<bool> &toAccept);
+  (std::vector<PosType> &newPos, int iat, int nat);
+  GPU_XRAY_TRACE void proposeMove_GPU
+  (std::vector<PosType> &newPos, int iat){ proposeMove_GPU(newPos, iat, 0); }
+  GPU_XRAY_TRACE void acceptMove_GPU
+  (std::vector<bool> &toAccept, int k);
+  GPU_XRAY_TRACE void acceptMove_GPU
+  (std::vector<bool> &toAccept){ acceptMove_GPU(toAccept,0); }
   GPU_XRAY_TRACE void NLMove_GPU (std::vector<Walker_t*> &walkers,
-				  std::vector<PosType> &Rnew,
-				  std::vector<int> &iat);
+                                  std::vector<PosType> &Rnew,
+                                  std::vector<int> &iat);
 #endif
 
   ///default constructor
@@ -379,6 +384,100 @@ class MCWalkerConfiguration: public ParticleSet
     }
   }
 
+  /// Switches on the klinear flag (values are calculated sequentially, the
+  /// mode used by algorithms with drift).
+  inline void setklinear()
+  { klinear=true; }
+
+  /// Returns the klinear flag: true after setklinear, false again after
+  /// setkDelay, which clears it.
+  inline bool getklinear() const
+  { return klinear; }
+
+  /// Configures delayed updates with streak length k and restarts the block
+  /// bookkeeping. A negative k is reported and treated as 0 (no delayed updates).
+  /// Also switches klinear off.
+  inline void setkDelay(int k)
+  {
+    const bool negative=(k<0);
+    if (negative)
+      app_log() << "  Warning: Specified negative delayed updates k = " << k << ", setting to zero." << std::endl;
+    kDelay=negative ? 0 : k;
+    klinear=false;
+    // start again from the first column of the first block
+    kblock=0;
+    kcurr=0;
+    kstart=0;
+    kupdate=1;
+    if (kDelay)
+      app_log() << "  Using delayed updates (k = " << kDelay << ") for all walkers" << std::endl;
+    // the block dimension is k when delaying and a single column otherwise
+    kblocksize=kDelay ? kDelay : 1;
+  }
+
+  /// Returns the delayed-update streak parameter k stored by setkDelay
+  /// (0 means delayed updates are disabled).
+  inline int getkDelay() const
+  { return kDelay; }
+
+  /// Returns the index of the current block of delayed updates
+  /// (advanced by proposeMove_GPU, reset by update_now and setkDelay).
+  inline int getkblock() const
+  { return kblock; }
+
+  /// Returns the block dimension used for delayed updates
+  /// (kDelay when delayed updates are on, otherwise 1).
+  inline int getkblocksize() const
+  { return kblocksize; }
+
+  // this function is a bit dangerous but needed to pass the current k of the determinant look-ahead to the Jastrow calculation
+  /// Overwrites the position within the current block without any range check;
+  /// proposeMove_GPU uses kcurr to compute the offset it writes to.
+  inline void setkcurr(int k)
+  { kcurr=k; }
+
+  /// Returns the current position k within the current block
+  /// (always below kblocksize when only proposeMove_GPU advances it).
+  inline int getkcurr() const
+  { return kcurr; }
+
+  /// Returns the first k of the pending update range
+  /// (written by proposeMove_GPU and update_now).
+  inline int getkstart() const
+  { return kstart; }
+
+  /// Returns the number of columns covered by the pending update,
+  /// as computed by update_now (1 right after setkDelay).
+  inline int getkupdate() const
+  { return kupdate; }
+
+  inline bool update_now(int nat) // returns true when the accumulated delayed updates have to be applied now; also advances the k bookkeeping
+  {
+    // in case that we finished the current k-block (kcurr=0) *or* (<- This case also takes care of no delayed updates as kcurr will always be zero then)
+    // if we run out of electrons (nat) but still have some k's in the current k-block, an update needs to happen now
+    bool update=((!kcurr) || (kcurr+kblock*kblocksize>=nat/2));
+    if(update)
+    {
+      if(kblock>0) // NOTE(review): with kblock==0 and kcurr!=0 an update is reported but kstart/kupdate/kcurr stay untouched — confirm this is intended
+      {
+         kstart=kblock*kblocksize;
+         if(kcurr==0) kstart-=kblocksize; // when kcurr wrapped in proposeMove_GPU, kblock was already incremented there, so step back one block
+         kupdate=kcurr+kblock*kblocksize-kstart; // kblocksize for a finished block, kcurr for a partial one
+         kcurr=0;
+#ifdef QMC_CUDA
+         if(!klinear) CurrentParticle-=kupdate-1; // NOTE(review): presumably rewinds the CurrentParticle++ done in acceptMove_GPU — confirm
+#endif
+      }
+    }
+    // reset kblock if we're out of matrix blocks
+    if(kblock*kblocksize>=nat/2) // TODO: Make sure we do *not* divide by two if there is no spin up/down matrix savings
+      kblock=0;
+    return update;
+  }
+
+  ///overwrite make_clones function
+  void make_clones(int n);
+
 protected:
 
   ///boolean for cleanup
@@ -391,6 +490,20 @@ class MCWalkerConfiguration: public ParticleSet
   int GlobalNumWalkers;
   ///update-mode index
   int UpdateMode;
+  ///delayed update streak parameter k (0 disables delayed updates; assigned in setkDelay)
+  int kDelay;
+  ///block dimension (kDelay, or 1 without delayed updates) in case delayed updates are used (there are nat/kblocksize blocks available)
+  int kblocksize;
+  ///current block (incremented in proposeMove_GPU, reset in update_now and setkDelay)
+  int kblock;
+  ///current k within current block
+  int kcurr;
+  ///current k to start from update
+  int kstart;
+  ///number of columns to update (computed in update_now)
+  int kupdate;
+  ///klinear switch to indicate if values are calculated sequentially for algorithms using drift (set by setklinear, cleared by setkDelay)
+  bool klinear;
 
   RealType LocalEnergy;