GPU CUDA delayed updates #1279

Merged: 33 commits, Feb 7, 2019

Commits (33), changes from all commits:
7ca8750  First implementation of delayed updates into VMC_CUDA. Both w/ and w/… (Dec 22, 2017)
85af24f  Compile fix for GCC on SummitDev/Summit and optimization of calc_lemm… (Apr 12, 2018)
0596d39  Added output in the case a user chooses a negative k for the delayed … (Apr 12, 2018)
ba41772  Replaced second smw_update call with small addon to update_onemove ke… (Apr 17, 2018)
145bee1  Merge remote-tracking branch 'upstream/develop' into delayed_updates (Dec 10, 2018)
b5cf582  Minor bugfixes caused by incomplete merge with new/renamed files (AoS… (Dec 12, 2018)
9349e94  Added DMC functionality. (Dec 14, 2018)
d6368ec  Slight performance update for GPU delayed updates code. (Dec 20, 2018)
20892cc  Fix the compilation of GPU real. (ye-luo, Dec 21, 2018)
43a6560  Fixed hamiltonian unit test failure. (Jan 2, 2019)
245438b  First implementation of complex GPU delayed updates code path. (Jan 3, 2019)
1757bf2  Bugfix for complex code path. Things are working now. (Jan 3, 2019)
0cb4d5f  Fix compile issues for slightly older CUDA version (i.e. on Rhea). (Jan 4, 2019)
d5e984d  Minor compile fix. (Jan 4, 2019)
e82c929  Moved delayed update routines into QMCWaveFunctions/Fermion/delayed_u… (Jan 7, 2019)
d59cec9  Implemented pass-through of Ye's delay_rank parameter to my code path. (Jan 9, 2019)
b39bd00  Compile fix. (Jan 9, 2019)
ab6f5e3  Minor bugfix. (Jan 10, 2019)
a9ff651  Revert to original cmake files and fix minor bug in DiracDeterminantC… (Jan 15, 2019)
f849398  Merge branch 'develop' of https://github.com/QMCPACK/qmcpack into del… (Jan 15, 2019)
aefab61  Fix merge issue which caused tests to fail. (Jan 15, 2019)
29c9a0c  Fix drift case when delay rank not an integer fraction of electron nu… (Jan 18, 2019)
3cbd58a  Code should be feature complete now as the last TODO (nat/2 use in MC… (Jan 18, 2019)
17b7bdc  Removed resolved TODO comments. (Jan 18, 2019)
c68fe6a  Cleaned up code and fixed bug of sometimes not properly initialized l… (Jan 24, 2019)
5f279c4  Reverted changed files in utils directory back to original versions. (Jan 24, 2019)
a5c369a  Fix complex compile bug. (Jan 24, 2019)
49b908b  Added pivot table to cublas inverse to improve delayed update stabili… (Jan 31, 2019)
13cf03e  Fixed numerical instability and improved performance for VMC w/o drift. (Feb 4, 2019)
9f21534  Added missing synchronization before lemma matrix calculation. (Feb 5, 2019)
004c09f  Added warning to cap GPU delayed updates at k = 64 for VMC w/ drift d… (Feb 5, 2019)
7bda6a2  Fixed VMC w/o drift error for larger delay ranks. Removed cap and war… (Feb 6, 2019)
55d6e47  Merge branch 'develop' into delayed_updates (ye-luo, Feb 7, 2019)
39 changes: 39 additions & 0 deletions src/CUDA/gpu_vector.h

@@ -360,6 +360,45 @@ class device_vector
 #endif
   }
 
+  void
+  asyncCopy(const T* vec_ptr, size_t len, size_t offset, size_t datalen)
+  {
+    if ((this->size() != len) || (this->size() < offset+datalen))
+    {
+      if (!own_data)
+      {
+        fprintf (stderr, "Assigning referenced GPU vector, but it has the "
+                         "wrong size.\n");
+        fprintf (stderr, "Name = %s.  This size = %ld, vec size = %ld\n",
+                 name.c_str(), size(), len);
+        abort();
+      }
+      if (len < offset+datalen)
+      {
+        fprintf (stderr, "Trying to write more than the length of the vector.\n");
+        fprintf (stderr, "Name = %s.  This size = %ld, vec size = %ld, needed length = %ld\n",
+                 name.c_str(), size(), len, offset+datalen);
+        abort();
+      }
+      resize(len);
+    }
+#ifdef QMC_CUDA
+    cudaMemcpyAsync (&((*this)[offset]), vec_ptr, datalen*sizeof(T),
+                     cudaMemcpyHostToDevice, kernelStream);
+    cudaError_t err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+      fprintf (stderr, "In asyncCopy, name=%s, size=%ld vec.size()=%ld\n",
+               name.c_str(), size(), len);
+      fprintf (stderr, "this pointer = %p  vec pointer=%p\n",
+               data_pointer, vec_ptr);
+      fprintf (stderr, "CUDA error in device_vector::asyncCopy(const T* vec_ptr, len, offset, datalen) for %s:\n  %s\n",
+               name.c_str(), cudaGetErrorString(err));
+      abort();
+    }
+#endif
+  }
+
   void
   asyncCopy(const std::vector<T, std::allocator<T> > &vec)
   {
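For reference, the new overload's contract: len is the required total size of the device vector, offset is the element index where writing starts, and datalen is the number of elements copied; the transfer runs asynchronously on kernelStream. A minimal usage sketch, not PR code, assuming gpu::device_vector<float>, the src-relative include path, and illustrative names (refresh_slice, nw, kblocksize, k):

#include <vector>
#include "CUDA/gpu_vector.h"

// Sketch: refresh just slice k of a device buffer that holds kblocksize
// slices of nw elements each, leaving the other slices untouched.
void refresh_slice(gpu::device_vector<float> &buf,
                   const std::vector<float> &host,
                   int nw, int kblocksize, int k)
{
  buf.asyncCopy(&host[k*nw],      // source: start of slice k on the host
                nw*kblocksize,    // len: required total device size
                k*nw,             // offset: where slice k starts on the device
                nw);              // datalen: one slice's worth of elements
}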
2 changes: 0 additions & 2 deletions src/Numerics/CUDA/cuda_inverse.cu

@@ -144,7 +144,6 @@ convert_complex (Tdest **dest_list, Tsrc **src_list, int len)
   }
 }
 
-
 // Four matrix inversion functions
 // 1. for float matrices
 //    useHigherPrecision = false --> single precision operations
@@ -246,7 +245,6 @@ cublas_inverse (cublasHandle_t handle,
   cudaDeviceSynchronize();
 }
 
-
 // 3. for complex float matrices
 //    useHigherPrecision = false --> single precision operations
 //    useHigherPrecision = true  --> double precision operations (default)
1 change: 0 additions & 1 deletion src/Numerics/CUDA/cuda_inverse.h

@@ -40,7 +40,6 @@ cublas_inverse (cublasHandle_t handle,
                 int N, int rowStride, int numMats,
                 bool useHigherPrecision = true);
 
-
 //////////////////////////////////////
 // Complex single / mixed precision //
 //////////////////////////////////////
54 changes: 40 additions & 14 deletions src/Particle/MCWalkerConfiguration.cpp

@@ -555,14 +555,14 @@ void MCWalkerConfiguration::updateLists_GPU()
 {
   int nw = WalkerList.size();
   int NumSpecies = getSpeciesSet().TotalNum;
-  if (Rnew_GPU.size() != nw)
+  if (Rnew_GPU.size() != nw*kblocksize)
   {
-    Rnew_GPU.resize(nw);
+    Rnew_GPU.resize(nw*kblocksize);
     RhokLists_GPU.resize(NumSpecies);
     for (int isp=0; isp<NumSpecies; isp++)
       RhokLists_GPU[isp].resize(nw);
-    Rnew_host.resize(nw);
-    Rnew.resize(nw);
+    Rnew_host.resize(nw*kblocksize);
+    Rnew.resize(nw*kblocksize);
     AcceptList_GPU.resize(nw);
     AcceptList_host.resize(nw);
     RList_GPU.resize(nw);
@@ -656,18 +656,44 @@ void MCWalkerConfiguration::copyWalkerGradToGPU()
 void MCWalkerConfiguration::proposeMove_GPU
 (std::vector<PosType> &newPos, int iat)
 {
-  if (Rnew_host.size() < newPos.size())
-    Rnew_host.resize(newPos.size());
-  for (int i=0; i<newPos.size(); i++)
+  int nw=newPos.size();
+  if (Rnew_host.size() < nw*kblocksize)
+  {
+    Rnew.resize(nw*kblocksize);
+    Rnew_host.resize(nw*kblocksize);
+  }
+  // store things sequentially with k to make evaluation more straight-forward:
+  //           k=0      k=1          k=kblocksize
+  // Rnew = [0,..,nw|0,..,nw|...|0,..,nw]
+  int offset=kcurr*nw;
+  for (int i=0; i<nw; i++)
+  {
     for (int dim=0; dim<OHMMS_DIM; dim++)
-      Rnew_host[i][dim] = newPos[i][dim];
-  Rnew_GPU.asyncCopy(Rnew_host);
-  Rnew = newPos;
+    {
+      Rnew[i+offset][dim]      = newPos[i][dim];
+      Rnew_host[i+offset][dim] = newPos[i][dim];
+    }
+  }
+  if(kDelay)
+  {
+    kcurr=(kcurr+1)%kblocksize; // loop kcurr around every k blocks
+    kstart=kblock*kblocksize;
+    if(kcurr==0)
+      kblock++; // keep increasing kblock (even beyond available matrix blocks) - the update check takes care of self-consistency
+    // only copy new position matrix when needed (when update is imminent)
+    if(klinear)
+    {
+      Rnew_GPU.asyncCopy(&(Rnew_host[offset]),nw*kblocksize,offset,nw);
+    } else
+      if(kcurr==0 || (kcurr+kblock*kblocksize>=getnat(iat)))
+        Rnew_GPU.asyncCopy(Rnew_host);
+  } else
+    Rnew_GPU.asyncCopy(Rnew_host);
   CurrentParticle = iat;
 }
 
 
-void MCWalkerConfiguration::acceptMove_GPU(std::vector<bool> &toAccept)
+void MCWalkerConfiguration::acceptMove_GPU(std::vector<bool> &toAccept, int k)
 {
   if (AcceptList_host.size() < toAccept.size())
     AcceptList_host.resize(toAccept.size());
@@ -682,13 +708,13 @@ void MCWalkerConfiguration::acceptMove_GPU(std::vector<bool> &toAccept)
   //    app_log() << "RList_GPU.size() = " << RList_GPU.size() << std::endl;
   if (RList_GPU.size() != WalkerList.size())
     std::cerr << "Error in RList_GPU size.\n";
-  if (Rnew_GPU.size() != WalkerList.size())
+  if (Rnew_GPU.size() != WalkerList.size()*kblocksize)
     std::cerr << "Error in Rnew_GPU size.\n";
   if (AcceptList_GPU.size() != WalkerList.size())
-    std::cerr << "Error in AcceptList_GPU_GPU size.\n";
+    std::cerr << "Error in AcceptList_GPU size.\n";
   accept_move_GPU_cuda
     (RList_GPU.data(), (CUDA_PRECISION*)Rnew_GPU.data(),
-     AcceptList_GPU.data(), CurrentParticle, WalkerList.size());
+     AcceptList_GPU.data(), CurrentParticle++, WalkerList.size(), k);
 }
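To make the new indexing concrete: Rnew holds kblocksize contiguous slices of nw proposed positions, proposeMove_GPU fills slice kcurr, and acceptMove_GPU(toAccept, k) consumes the slice it is told to. A standalone sketch of the host-side index arithmetic (values assumed for illustration; not PR code):

#include <cstdio>

int main()
{
  const int nw = 4;         // number of walkers (assumed)
  const int kblocksize = 3; // delay rank k (assumed)
  for (int kcurr = 0; kcurr < kblocksize; ++kcurr)
  {
    // proposeMove_GPU writes walker i's proposal to Rnew[kcurr*nw + i];
    // acceptMove_GPU(toAccept, kcurr) later reads back exactly this slice.
    int offset = kcurr * nw;
    printf("slice k=%d occupies Rnew[%d..%d]\n", kcurr, offset, offset + nw - 1);
  }
  return 0;
}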
121 changes: 118 additions & 3 deletions src/Particle/MCWalkerConfiguration.h

@@ -123,10 +123,13 @@ class MCWalkerConfiguration: public ParticleSet
   int CurrentParticle;
   GPU_XRAY_TRACE void proposeMove_GPU
   (std::vector<PosType> &newPos, int iat);
-  GPU_XRAY_TRACE void acceptMove_GPU(std::vector<bool> &toAccept);
+  GPU_XRAY_TRACE void acceptMove_GPU
+  (std::vector<bool> &toAccept, int k);
+  GPU_XRAY_TRACE void acceptMove_GPU
+  (std::vector<bool> &toAccept){ acceptMove_GPU(toAccept,0); }
   GPU_XRAY_TRACE void NLMove_GPU (std::vector<Walker_t*> &walkers,
                                   std::vector<PosType> &Rnew,
                                   std::vector<int> &iat);
 #endif
 
   ///default constructor
@@ -379,6 +382,102 @@ class MCWalkerConfiguration: public ParticleSet
     }
   }
 
+#ifdef QMC_CUDA
+  inline void setklinear()
+  {
+    klinear=true;
+  }
+
+  inline bool getklinear()
+  {
+    return klinear;
+  }
+
+  inline void setkDelay(int k)
+  {
+    klinear=false;
+    kDelay=k;
+    if (kDelay<0)
+    {
+      app_log() << "  Warning: Specified negative delayed updates k = " << k << ", setting to zero (no delay)." << std::endl;
+      kDelay=0;
+    }
+    if (kDelay==1)
+      kDelay=0; // use old algorithm as additional overhead for k=1 is not doing anything useful outside of code development
+    kblocksize=1;
+    kblock=0;
+    kcurr=0;
+    kstart=0;
+    if(kDelay)
+    {
+      app_log() << "  Using delayed updates (k = " << kDelay << ") for all walkers" << std::endl;
+      kblocksize=kDelay;
+    }
+    kupdate=kblocksize;
+  }
+
+  inline int getkDelay()
+  {
+    return kDelay;
+  }
+
+  inline int getkblock()
+  {
+    return kblock;
+  }
+
+  inline int getkblocksize()
+  {
+    return kblocksize;
+  }
+
+  inline int getkcurr()
+  {
+    return kcurr;
+  }
+
+  inline int getkstart()
+  {
+    return kstart;
+  }
+
+  inline int getkupdate()
+  {
+    return kupdate;
+  }
+
+  inline int getnat(int iat)
+  {
+    for(unsigned int gid=0; gid<groups(); gid++)
+      if(last(gid)>iat)
+        return last(gid)-first(gid);
+  }
+
+  inline bool update_now(int iat)
+  {
+    // in case that we finished the current k-block (kcurr=0) *or* (<- This case also takes care of no delayed updates as kcurr will always be zero then)
+    // if we run out of electrons (nat) but still have some k's in the current k-block, an update needs to happen now
+    bool end_of_matrix = (kcurr+kblock*kblocksize>=getnat(iat));
+    bool update=((!kcurr) || end_of_matrix);
+    kupdate=kblocksize;
+    if(update)
+    {
+      if(kblock>0)
+      {
+        kstart=kblock*kblocksize;
+        if(kcurr==0) kstart-=kblocksize; // means we looped cleanly within kblocksize matrix (and kblock is too large by 1), hence start is at (kblock-1)*kblocksize
+        kupdate=kcurr+kblock*kblocksize-kstart;
+        kcurr=0;
+        if(!klinear) CurrentParticle-=kupdate-1;
+      }
+    }
+    // reset kblock if we're out of matrix blocks
+    if(end_of_matrix)
+      kblock=0;
+    return update;
+  }
+#endif
+
 protected:
 
   ///boolean for cleanup
@@ -391,6 +490,22 @@ class MCWalkerConfiguration: public ParticleSet
   int GlobalNumWalkers;
   ///update-mode index
   int UpdateMode;
+#ifdef QMC_CUDA
+  ///delayed update streak parameter k
+  int kDelay;
+  ///block dimension (usually k) in case delayed updates are used (there are nat/kblocksize blocks available)
+  int kblocksize=1;
+  ///current block
+  int kblock;
+  ///current k within current block
+  int kcurr;
+  ///current k to start from update
+  int kstart;
+  ///number of columns to update
+  int kupdate;
+  ///klinear switch to indicate if values are calculated sequentially for algorithms using drift
+  bool klinear;
+#endif
+
   RealType LocalEnergy;
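To see how these counters interact, the following standalone sketch replays the propose/update cycle for one walker population. It is an illustrative re-implementation of the kcurr/kblock/kstart arithmetic above, with nat and kblocksize assumed; the real code splits this logic between proposeMove_GPU and update_now:

#include <cstdio>

int main()
{
  const int nat = 10;       // electrons in the group (assumed)
  const int kblocksize = 4; // delay rank k (assumed)
  int kcurr = 0, kblock = 0;
  for (int iat = 0; iat < nat; ++iat)
  {
    // proposeMove_GPU: advance kcurr; a clean rollover finishes a block
    kcurr = (kcurr + 1) % kblocksize;
    if (kcurr == 0)
      kblock++;
    // update_now: fire when a block completes or the electrons run out
    bool end_of_matrix = (kcurr + kblock*kblocksize >= nat);
    if (((kcurr == 0) || end_of_matrix) && kblock > 0)
    {
      int kstart = kblock*kblocksize;
      if (kcurr == 0)
        kstart -= kblocksize; // clean rollover: kblock is ahead by one
      int kupdate = kcurr + kblock*kblocksize - kstart;
      printf("iat=%d: apply delayed update to columns [%d,%d)\n",
             iat, kstart, kstart + kupdate);
      kcurr = 0;
    }
    if (end_of_matrix)
      kblock = 0;
  }
  return 0;
}

With nat = 10 and k = 4 this fires updates covering columns [0,4), [4,8), and finally the remainder [8,10), which is the situation addressed by commit 29c9a0c (delay rank not an integer fraction of the electron number).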
5 changes: 5 additions & 0 deletions src/Particle/ParticleSet.h

@@ -624,6 +624,11 @@ class ParticleSet
   {
     return SubPtcl[igroup+1];
   }
+
+  inline int groupsize(int igroup) const
+  {
+    return SubPtcl[igroup+1]-SubPtcl[igroup];
+  }
 
   ///add attributes to list for IO
   template<typename ATList>
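The new groupsize() accessor complements first()/last(): SubPtcl stores cumulative group boundaries, so the size of a group is the difference of adjacent entries. A tiny sketch with assumed boundary values (not PR code):

#include <cstdio>
#include <vector>

// Sketch of what groupsize() computes: SubPtcl = {0, 5, 9} means
// group 0 holds particles [0,5) and group 1 holds particles [5,9).
int main()
{
  std::vector<int> SubPtcl = {0, 5, 9}; // assumed boundaries
  for (size_t ig = 0; ig + 1 < SubPtcl.size(); ++ig)
    printf("group %zu: size = %d\n", ig, SubPtcl[ig+1] - SubPtcl[ig]);
  return 0;
}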
12 changes: 6 additions & 6 deletions src/Particle/accept_kernel.cu

@@ -17,7 +17,7 @@
 
 template<typename T, int BS>
 __global__ void accept_kernel (T** Rlist, T* Rnew,
-                               int* toAccept, int iat, int N)
+                               int* toAccept, int iat, int N, int k)
 {
   int tid = threadIdx.x;
   __shared__ T* myR[BS];
@@ -26,7 +26,7 @@ __global__ void accept_kernel (T** Rlist, T* Rnew,
   int block = blockIdx.x;
   for (int i=0; i<3; i++)
     if ((3*block+i)*BS + tid < 3*N)
-      Rnew_shared[i*BS + tid] = Rnew[(3*block+i)*BS + tid];
+      Rnew_shared[i*BS + tid] = Rnew[(3*block+i)*BS + tid + 3*k*N];
   __syncthreads();
   if (block*BS + tid < N)
   {
@@ -55,27 +55,27 @@ __global__ void accept_kernel (T** Rlist, T* Rnew,
 
 void
 accept_move_GPU_cuda (float** Rlist, float new_pos[],
-                      int toAccept[], int iat, int N)
+                      int toAccept[], int iat, int N, int k)
 {
   const int BS=32;
   int NB = N / BS + ((N % BS) ? 1 : 0);
   dim3 dimBlock(BS);
   dim3 dimGrid(NB);
   accept_kernel<float,BS><<<dimGrid,dimBlock, 0, gpu::kernelStream>>>
-    (Rlist, new_pos, toAccept, iat, N);
+    (Rlist, new_pos, toAccept, iat, N, k);
 }
 
 
 void
 accept_move_GPU_cuda (double** Rlist, double* new_pos,
-                      int* toAccept, int iat, int N)
+                      int* toAccept, int iat, int N, int k)
 {
   const int BS=32;
   int NB = N / BS + ((N % BS) ? 1 : 0);
   dim3 dimBlock(BS);
   dim3 dimGrid(NB);
   accept_kernel<double,BS><<<dimGrid,dimBlock, 0, gpu::kernelStream>>>
-    (Rlist, new_pos, toAccept, iat, N);
+    (Rlist, new_pos, toAccept, iat, N, k);
 }
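Read serially, the kernel change means accepted walkers copy their proposed position out of slice k of Rnew instead of always slice 0. A scalar sketch of the same indexing (illustrative only; the actual kernel stages Rnew through shared memory, and its per-walker write is elided by the diff):

// Serial equivalent of accept_kernel's indexing (sketch, not PR code).
// Rnew holds kblocksize slices of N proposed positions, 3 floats each.
void accept_serial(float **Rlist, const float *Rnew,
                   const int *toAccept, int iat, int N, int k)
{
  for (int iw = 0; iw < N; ++iw)
    if (toAccept[iw])
      for (int d = 0; d < 3; ++d)
        Rlist[iw][3*iat + d] = Rnew[3*(k*N + iw) + d];
}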
4 changes: 2 additions & 2 deletions src/Particle/accept_kernel.h

@@ -19,10 +19,10 @@
 
 void
 accept_move_GPU_cuda (float* Rlist[], float new_pos[],
-                      int toAccept[], int iat, int N);
+                      int toAccept[], int iat, int N, int k);
 void
 accept_move_GPU_cuda (double* Rlist[], double new_pos[],
-                      int* toAccept, int iat, int N);
+                      int* toAccept, int iat, int N, int k);
 
 void NL_move_cuda ( float* Rlist[], float new_pos[], int N);
 void NL_move_cuda (double* Rlist[], double new_pos[], int N);