Skip to content

Commit

Permalink
Merge pull request #1279 from atillack/delayed_updates
Browse files Browse the repository at this point in the history
GPU CUDA delayed updates
  • Loading branch information
ye-luo authored Feb 7, 2019
2 parents e4798a4 + 55d6e47 commit bf1a98e
Show file tree
Hide file tree
Showing 42 changed files with 2,633 additions and 457 deletions.
39 changes: 39 additions & 0 deletions src/CUDA/gpu_vector.h
Original file line number Diff line number Diff line change
Expand Up @@ -360,6 +360,45 @@ class device_vector
#endif
}

void
asyncCopy(const T* vec_ptr, size_t len, size_t offset, size_t datalen)
{
if ((this->size() != len) || (this->size() < offset+datalen))
{
if (!own_data)
{
fprintf (stderr, "Assigning referenced GPU vector, but it has the "
"wrong size.\n");
fprintf (stderr, "Name = %s. This size = %ld, vec size = %ld\n",
name.c_str(), size(), len);
abort();
}
if (len<offset+datalen)
{
fprintf(stderr, "Trying to write more than the length of the vector.\n");
fprintf (stderr, "Name = %s. This size = %ld, vec size = %ld, needed length = %ld\n",
name.c_str(), size(), len, offset+datalen);
abort();
}
resize(len);
}
#ifdef QMC_CUDA
cudaMemcpyAsync (&((*this)[offset]), vec_ptr, datalen*sizeof(T),
cudaMemcpyHostToDevice, kernelStream);
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
{
fprintf (stderr, "In operator=, name=%s, size=%ld vec.size()=%ld\n",
name.c_str(), size(), len);
fprintf (stderr, "this pointer = %p vec pointer=%p\n",
data_pointer, vec_ptr);
fprintf (stderr, "CUDA error in device_vector::asyncCopy(const T* vec_ptr, len, offset, datalen) for %s:\n %s\n",
name.c_str(), cudaGetErrorString(err));
abort();
}
#endif
}

void
asyncCopy(const std::vector<T, std::allocator<T> > &vec)
{
Expand Down
2 changes: 0 additions & 2 deletions src/Numerics/CUDA/cuda_inverse.cu
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,6 @@ convert_complex (Tdest **dest_list, Tsrc **src_list, int len)
}
}


// Four matrix inversion functions
// 1. for float matrices
// useHigherPrecision = false --> single precision operations
Expand Down Expand Up @@ -246,7 +245,6 @@ cublas_inverse (cublasHandle_t handle,
cudaDeviceSynchronize();
}


// 3. for complex float matrices
// useHigherPrecision = false --> single precision operations
// useHigherPrecision = true --> double precision operations (default)
Expand Down
1 change: 0 additions & 1 deletion src/Numerics/CUDA/cuda_inverse.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,6 @@ cublas_inverse (cublasHandle_t handle,
int N, int rowStride, int numMats,
bool useHigherPrecision = true);


//////////////////////////////////////
// Complex single / mixed precision //
//////////////////////////////////////
Expand Down
54 changes: 40 additions & 14 deletions src/Particle/MCWalkerConfiguration.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -555,14 +555,14 @@ void MCWalkerConfiguration::updateLists_GPU()
{
int nw = WalkerList.size();
int NumSpecies = getSpeciesSet().TotalNum;
if (Rnew_GPU.size() != nw)
if (Rnew_GPU.size() != nw*kblocksize)
{
Rnew_GPU.resize(nw);
Rnew_GPU.resize(nw*kblocksize);
RhokLists_GPU.resize(NumSpecies);
for (int isp=0; isp<NumSpecies; isp++)
RhokLists_GPU[isp].resize(nw);
Rnew_host.resize(nw);
Rnew.resize(nw);
Rnew_host.resize(nw*kblocksize);
Rnew.resize(nw*kblocksize);
AcceptList_GPU.resize(nw);
AcceptList_host.resize(nw);
RList_GPU.resize(nw);
Expand Down Expand Up @@ -656,18 +656,44 @@ void MCWalkerConfiguration::copyWalkerGradToGPU()
void MCWalkerConfiguration::proposeMove_GPU
(std::vector<PosType> &newPos, int iat)
{
if (Rnew_host.size() < newPos.size())
Rnew_host.resize(newPos.size());
for (int i=0; i<newPos.size(); i++)
int nw=newPos.size();
if (Rnew_host.size() < nw*kblocksize)
{
Rnew.resize(nw*kblocksize);
Rnew_host.resize(nw*kblocksize);
}
// store things sequentially with k to make evaluation more straight-forward:
// k=0 k=1 k=kblocksize
// Rnew = [0,..,nw|0,..,nw|...|0,..,nw]
int offset=kcurr*nw;
for (int i=0; i<nw; i++)
{
for (int dim=0; dim<OHMMS_DIM; dim++)
Rnew_host[i][dim] = newPos[i][dim];
Rnew_GPU.asyncCopy(Rnew_host);
Rnew = newPos;
{
Rnew[i+offset][dim] = newPos[i][dim];
Rnew_host[i+offset][dim] = newPos[i][dim];
}
}
if(kDelay)
{
kcurr=(kcurr+1)%kblocksize; // loop kcurr around every k blocks
kstart=kblock*kblocksize;
if(kcurr==0)
kblock++; // keep increasing kblock (even beyond available matrix blocks) - the update check takes care of self-consistency
// only copy new position matrix when needed (when update is imminent)
if(klinear)
{
Rnew_GPU.asyncCopy(&(Rnew_host[offset]),nw*kblocksize,offset,nw);
} else
if(kcurr==0 || (kcurr+kblock*kblocksize>=getnat(iat)))
Rnew_GPU.asyncCopy(Rnew_host);
} else
Rnew_GPU.asyncCopy(Rnew_host);
CurrentParticle = iat;
}


void MCWalkerConfiguration::acceptMove_GPU(std::vector<bool> &toAccept)
void MCWalkerConfiguration::acceptMove_GPU(std::vector<bool> &toAccept, int k)
{
if (AcceptList_host.size() < toAccept.size())
AcceptList_host.resize(toAccept.size());
Expand All @@ -682,13 +708,13 @@ void MCWalkerConfiguration::acceptMove_GPU(std::vector<bool> &toAccept)
// app_log() << "RList_GPU.size() = " << RList_GPU.size() << std::endl;
if (RList_GPU.size() != WalkerList.size())
std::cerr << "Error in RList_GPU size.\n";
if (Rnew_GPU.size() != WalkerList.size())
if (Rnew_GPU.size() != WalkerList.size()*kblocksize)
std::cerr << "Error in Rnew_GPU size.\n";
if (AcceptList_GPU.size() != WalkerList.size())
std::cerr << "Error in AcceptList_GPU_GPU size.\n";
std::cerr << "Error in AcceptList_GPU size.\n";
accept_move_GPU_cuda
(RList_GPU.data(), (CUDA_PRECISION*)Rnew_GPU.data(),
AcceptList_GPU.data(), CurrentParticle, WalkerList.size());
AcceptList_GPU.data(), CurrentParticle++, WalkerList.size(), k);
}


Expand Down
121 changes: 118 additions & 3 deletions src/Particle/MCWalkerConfiguration.h
Original file line number Diff line number Diff line change
Expand Up @@ -123,10 +123,13 @@ class MCWalkerConfiguration: public ParticleSet
int CurrentParticle;
GPU_XRAY_TRACE void proposeMove_GPU
(std::vector<PosType> &newPos, int iat);
GPU_XRAY_TRACE void acceptMove_GPU(std::vector<bool> &toAccept);
GPU_XRAY_TRACE void acceptMove_GPU
(std::vector<bool> &toAccept, int k);
GPU_XRAY_TRACE void acceptMove_GPU
(std::vector<bool> &toAccept){ acceptMove_GPU(toAccept,0); }
GPU_XRAY_TRACE void NLMove_GPU (std::vector<Walker_t*> &walkers,
std::vector<PosType> &Rnew,
std::vector<int> &iat);
std::vector<PosType> &Rnew,
std::vector<int> &iat);
#endif

///default constructor
Expand Down Expand Up @@ -379,6 +382,102 @@ class MCWalkerConfiguration: public ParticleSet
}
}

#ifdef QMC_CUDA
inline void setklinear()
{
klinear=true;
}

inline bool getklinear()
{
return klinear;
}

inline void setkDelay(int k)
{
klinear=false;
kDelay=k;
if (kDelay<0)
{
app_log() << " Warning: Specified negative delayed updates k = " << k << ", setting to zero (no delay)." << std::endl;
kDelay=0;
}
if (kDelay==1)
kDelay=0; // use old algorithm as additional overhead for k=1 is not doing anything useful outside of code development
kblocksize=1;
kblock=0;
kcurr=0;
kstart=0;
if(kDelay)
{
app_log() << " Using delayed updates (k = " << kDelay << ") for all walkers" << std::endl;
kblocksize=kDelay;
}
kupdate=kblocksize;
}

inline int getkDelay()
{
return kDelay;
}

inline int getkblock()
{
return kblock;
}

inline int getkblocksize()
{
return kblocksize;
}

inline int getkcurr()
{
return kcurr;
}

inline int getkstart()
{
return kstart;
}

inline int getkupdate()
{
return kupdate;
}

inline int getnat(int iat)
{
for(unsigned int gid=0; gid<groups(); gid++)
if(last(gid)>iat)
return last(gid)-first(gid);
}

inline bool update_now(int iat)
{
// in case that we finished the current k-block (kcurr=0) *or* (<- This case also takes care of no delayed updates as kcurr will always be zero then)
// if we run out of electrons (nat) but still have some k's in the current k-block, an update needs to happen now
bool end_of_matrix = (kcurr+kblock*kblocksize>=getnat(iat));
bool update=((!kcurr) || end_of_matrix);
kupdate=kblocksize;
if(update)
{
if(kblock>0)
{
kstart=kblock*kblocksize;
if(kcurr==0) kstart-=kblocksize; // means we looped cleanly within kblocksize matrix (and kblock is too large by 1), hence start is at (kblock-1)*kblocksize
kupdate=kcurr+kblock*kblocksize-kstart;
kcurr=0;
if(!klinear) CurrentParticle-=kupdate-1;
}
}
// reset kblock if we're out of matrix blocks
if(end_of_matrix)
kblock=0;
return update;
}
#endif

protected:

///boolean for cleanup
Expand All @@ -391,6 +490,22 @@ class MCWalkerConfiguration: public ParticleSet
int GlobalNumWalkers;
///update-mode index
int UpdateMode;
#ifdef QMC_CUDA
///delayed update streak parameter k
int kDelay;
///block dimension (usually k) in case delayed updates are used (there are nat/kblocksize blocks available)
int kblocksize=1;
///current block
int kblock;
///current k within current block
int kcurr;
///current k to start from update
int kstart;
///number of columns to update
int kupdate;
///klinear switch to indicate if values are calculated sequentially for algorithms using drift
bool klinear;
#endif

RealType LocalEnergy;

Expand Down
5 changes: 5 additions & 0 deletions src/Particle/ParticleSet.h
Original file line number Diff line number Diff line change
Expand Up @@ -624,6 +624,11 @@ class ParticleSet
{
return SubPtcl[igroup+1];
}

inline int groupsize(int igroup) const
{
return SubPtcl[igroup+1]-SubPtcl[igroup];
}

///add attributes to list for IO
template<typename ATList>
Expand Down
12 changes: 6 additions & 6 deletions src/Particle/accept_kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

template<typename T, int BS>
__global__ void accept_kernel (T** Rlist, T* Rnew,
int* toAccept, int iat, int N)
int* toAccept, int iat, int N, int k)
{
int tid = threadIdx.x;
__shared__ T* myR[BS];
Expand All @@ -26,7 +26,7 @@ __global__ void accept_kernel (T** Rlist, T* Rnew,
int block = blockIdx.x;
for (int i=0; i<3; i++)
if ((3*block+i)*BS + tid < 3*N)
Rnew_shared[i*BS + tid] = Rnew[(3*block+i)*BS + tid];
Rnew_shared[i*BS + tid] = Rnew[(3*block+i)*BS + tid + 3*k*N];
__syncthreads();
if (block*BS + tid < N)
{
Expand Down Expand Up @@ -55,27 +55,27 @@ __global__ void accept_kernel (T** Rlist, T* Rnew,

void
accept_move_GPU_cuda (float** Rlist, float new_pos[],
int toAccept[], int iat, int N)
int toAccept[], int iat, int N, int k)
{
const int BS=32;
int NB = N / BS + ((N % BS) ? 1 : 0);
dim3 dimBlock(BS);
dim3 dimGrid(NB);
accept_kernel<float,BS><<<dimGrid,dimBlock, 0, gpu::kernelStream>>>
(Rlist, new_pos, toAccept, iat, N);
(Rlist, new_pos, toAccept, iat, N, k);
}


void
accept_move_GPU_cuda (double** Rlist, double* new_pos,
int* toAccept, int iat, int N)
int* toAccept, int iat, int N, int k)
{
const int BS=32;
int NB = N / BS + ((N % BS) ? 1 : 0);
dim3 dimBlock(BS);
dim3 dimGrid(NB);
accept_kernel<double,BS><<<dimGrid,dimBlock, 0, gpu::kernelStream>>>
(Rlist, new_pos, toAccept, iat, N);
(Rlist, new_pos, toAccept, iat, N, k);
}


Expand Down
4 changes: 2 additions & 2 deletions src/Particle/accept_kernel.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,10 @@

void
accept_move_GPU_cuda (float* Rlist[], float new_pos[],
int toAccept[], int iat, int N);
int toAccept[], int iat, int N, int k);
void
accept_move_GPU_cuda (double* Rlist[], double new_pos[],
int toAccept[], int iat, int N);
int toAccept[], int iat, int N, int k);

void NL_move_cuda ( float* Rlist[], float new_pos[], int N);
void NL_move_cuda (double* Rlist[], double new_pos[], int N);
Expand Down
Loading

0 comments on commit bf1a98e

Please sign in to comment.