Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GPU CUDA delayed updates #1279

Merged
merged 33 commits into from
Feb 7, 2019
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
7ca8750
First implementation of delayed updates into VMC_CUDA. Both w/ and w/…
Dec 22, 2017
85af24f
Compile fix for GCC on SummitDev/Summit and optimization of calc_lemm…
Apr 12, 2018
0596d39
Added output in the case a user chooses a negative k for the delayed …
Apr 12, 2018
ba41772
Replaced second smw_update call with small addon to update_onemove ke…
Apr 17, 2018
145bee1
Merge remote-tracking branch 'upstream/develop' into delayed_updates
Dec 10, 2018
b5cf582
Minor bugfixes caused by incomplete merge with new/renamed files (AoS…
Dec 12, 2018
9349e94
Added DMC functionality.
Dec 14, 2018
d6368ec
Slight performance update for GPU delayed updates code.
Dec 20, 2018
20892cc
Fix the compilation of GPU real.
ye-luo Dec 21, 2018
43a6560
Fixed hamiltonian unit test failure.
Jan 2, 2019
245438b
First implementation of complex GPU delayed updates code path.
Jan 3, 2019
1757bf2
Bugfix for complex code path. Things are working now.
Jan 3, 2019
0cb4d5f
Fix compile issues for slightly older CUDA version (i.e. on Rhea).
Jan 4, 2019
d5e984d
Minor compile fix.
Jan 4, 2019
e82c929
Moved delayed update routines into QMCWaveFunctions/Fermion/delayed_u…
Jan 7, 2019
d59cec9
Implemented pass-through of Ye's delay_rank parameter to my code path.
Jan 9, 2019
b39bd00
Compile fix.
Jan 9, 2019
ab6f5e3
Minor bugfix.
Jan 10, 2019
a9ff651
Revert to original cmake files and fix minor bug in DiracDeterminantC…
Jan 15, 2019
f849398
Merge branch 'develop' of https://github.com/QMCPACK/qmcpack into del…
Jan 15, 2019
aefab61
Fix merge issue which caused tests to fail.
Jan 15, 2019
29c9a0c
Fix drift case when delay rank not an integer fraction of electron nu…
Jan 18, 2019
3cbd58a
Code should be feature complete now as the last TODO (nat/2 use in MC…
Jan 18, 2019
17b7bdc
Removed resolved TODO comments.
Jan 18, 2019
c68fe6a
Cleaned up code and fixed bug of sometimes not properly initialized l…
Jan 24, 2019
5f279c4
Reverted changed files in utils directory back to original versions.
Jan 24, 2019
a5c369a
Fix complex compile bug.
Jan 24, 2019
49b908b
Added pivot table to cublas inverse to improve delayed update stabili…
Jan 31, 2019
13cf03e
Fixed numerical instability and improved performance for VMC w/o drift.
Feb 4, 2019
9f21534
Added missing synchronization before lemma matrix calculation.
Feb 5, 2019
004c09f
Added warning to cap GPU delayed updates at k = 64 for VMC w/ drift d…
Feb 5, 2019
7bda6a2
Fixed VMC w/o drift error for larger delay ranks. Removed cap and war…
Feb 6, 2019
55d6e47
Merge branch 'develop' into delayed_updates
ye-luo Feb 7, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions CMake/GNUCompilers.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,8 @@ if(CMAKE_CXX_FLAGS MATCHES "-march=" OR CMAKE_C_FLAGS MATCHES "-march=")
endif() #(CMAKE_CXX_FLAGS MATCHES "-march=" AND CMAKE_C_FLAGS MATCHES "-march=")
else() #(CMAKE_CXX_FLAGS MATCHES "-march=" OR CMAKE_C_FLAGS MATCHES "-march=")
# use -march=native
SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=native")
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native")
SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
endif() #(CMAKE_CXX_FLAGS MATCHES "-march=" OR CMAKE_C_FLAGS MATCHES "-march=")

ENDIF((NOT $ENV{CRAYPE_VERSION} MATCHES ".") AND (NOT CMAKE_SYSTEM_PROCESSOR MATCHES "ppc64"))
Expand Down
12 changes: 6 additions & 6 deletions CMake/IBMCompilers.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -19,14 +19,14 @@ SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-deprecated -Wno-unused-value")
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-deprecated -Wno-unused-value")

# Set extra optimization specific flags
SET( CMAKE_C_FLAGS_RELEASE "-O3 -DNDEBUG" )
SET( CMAKE_CXX_FLAGS_RELEASE "-O3 -DNDEBUG" )
SET( CMAKE_C_FLAGS_RELWITHDEBINFO "-g -O3" )
SET( CMAKE_CXX_FLAGS_RELWITHDEBINFO "-g -O3" )
# SET( CMAKE_C_FLAGS_RELEASE "-O3 -DNDEBUG" )
# SET( CMAKE_CXX_FLAGS_RELEASE "-O3 -DNDEBUG" )
# SET( CMAKE_C_FLAGS_RELWITHDEBINFO "-g -O3" )
# SET( CMAKE_CXX_FLAGS_RELWITHDEBINFO "-g -O3" )

# Set language standardards
SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -std=gnu11")
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=gnu++11 -qnoxlcompatmacros")
SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -std=gnu11 -O3")
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=gnu++11 -qnoxlcompatmacros -O3")

IF(QMC_OMP)
SET(ENABLE_OPENMP 1)
Expand Down
39 changes: 39 additions & 0 deletions src/CUDA/gpu_vector.h
Original file line number Diff line number Diff line change
Expand Up @@ -360,6 +360,45 @@ class device_vector
#endif
}

void
asyncCopy(const T* vec_ptr, size_t len, size_t offset, size_t datalen)
{
if ((this->size() != len) || (this->size() < offset+datalen))
{
if (!own_data)
{
fprintf (stderr, "Assigning referenced GPU vector, but it has the "
"wrong size.\n");
fprintf (stderr, "Name = %s. This size = %ld, vec size = %ld\n",
name.c_str(), size(), len);
abort();
}
if (len<offset+datalen)
{
fprintf(stderr, "Trying to write more than the length of the vector.\n");
fprintf (stderr, "Name = %s. This size = %ld, vec size = %ld, needed length = %ld\n",
name.c_str(), size(), len, offset+datalen);
abort();
}
resize(len);
}
#ifdef QMC_CUDA
cudaMemcpyAsync (&((*this)[offset]), vec_ptr, datalen*sizeof(T),
cudaMemcpyHostToDevice, kernelStream);
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
{
fprintf (stderr, "In operator=, name=%s, size=%ld vec.size()=%ld\n",
name.c_str(), size(), len);
fprintf (stderr, "this pointer = %p vec pointer=%p\n",
data_pointer, vec_ptr);
fprintf (stderr, "CUDA error in device_vector::asyncCopy(const T* vec_ptr, len, offset, datalen) for %s:\n %s\n",
name.c_str(), cudaGetErrorString(err));
abort();
}
#endif
}

void
asyncCopy(const std::vector<T, std::allocator<T> > &vec)
{
Expand Down
355 changes: 354 additions & 1 deletion src/Numerics/CUDA/cuda_inverse.cu

Large diffs are not rendered by default.

39 changes: 39 additions & 0 deletions src/Numerics/CUDA/cuda_inverse.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,26 @@ cublas_inverse (cublasHandle_t handle,
int N, int rowStride, int numMats,
bool useHigherPrecision = true);

void
cublas_lemma_mats (cublasHandle_t handle,
Copy link
Contributor

@ye-luo ye-luo Dec 21, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

cuda_inverse.h/cu is for matrix inversion.
cublas_lemma_mats, cublas_ainv_row, cublas_smw_update are not generic wrapper functions of cublas.
Please move them to src/QMCWaveFunctions/Fermion

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No production code should be in sandbox. Put distinct functionality in different files or simply rename the cuda_inverse.h file e.g. cuda_matrices.h

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The path I put previously was wrong. It was just my local path. Functions under Numerics should not be only used a specific algorithm. So cuda_matrices.h is not good. Please create a new set of .h and .cu file under src/QMCWaveFunctions/Fermion.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

float *AList_d[], float *AWorkList_d[],
float *AinvList_d[], float *AinvkList_d[], float *U_d[],
float *lemma_d[], float *AinvUList_d[],
int k, int N, int nw, int RowStride);

void
cublas_ainv_row (cublasHandle_t handle,
float *AinvkList_d[], float *AWorkList_d[], float *AinvList_d[],
int k, int N, int nw, int RowStride);

void
cublas_smw_update (cublasHandle_t handle,
float *AinvkList_d[], float *AinvList_d[],
float *AinvUList_d[], float *AWorkList_d[],
float *lemma_inv[], float *lemma_lu[],
int *info_array,
int k, int kd, int M, int N, int nw, int RowStride);

//////////////////////
// Double precision //
//////////////////////
Expand All @@ -40,6 +60,25 @@ cublas_inverse (cublasHandle_t handle,
int N, int rowStride, int numMats,
bool useHigherPrecision = true);

void
cublas_lemma_mats (cublasHandle_t handle,
double *AList_d[], double *AWorkList_d[],
double *AinvList_d[], double *AinvkList_d[], double *U_d[],
double *lemma_d[], double *AinvUList_d[],
int k, int N, int nw, int RowStride);

void
cublas_ainv_row (cublasHandle_t handle,
double *AinvkList_d[], double *AWorkList_d[], double *AinvList_d[],
int k, int N, int nw, int RowStride);

void
cublas_smw_update (cublasHandle_t handle,
double *AinvkList_d[], double *AinvList_d[],
double *AinvUList_d[], double *AWorkList_d[],
double *lemma_inv[], double *lemma_lu[],
int *info_array,
int k, int kd, int M, int N, int nw, int RowStride);

//////////////////////////////////////
// Complex single / mixed precision //
Expand Down
57 changes: 42 additions & 15 deletions src/Particle/MCWalkerConfiguration.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -555,14 +555,14 @@ void MCWalkerConfiguration::updateLists_GPU()
{
int nw = WalkerList.size();
int NumSpecies = getSpeciesSet().TotalNum;
if (Rnew_GPU.size() != nw)
if (Rnew_GPU.size() != nw*kblocksize)
{
Rnew_GPU.resize(nw);
Rnew_GPU.resize(nw*kblocksize);
RhokLists_GPU.resize(NumSpecies);
for (int isp=0; isp<NumSpecies; isp++)
RhokLists_GPU[isp].resize(nw);
Rnew_host.resize(nw);
Rnew.resize(nw);
Rnew_host.resize(nw*kblocksize);
Rnew.resize(nw*kblocksize);
AcceptList_GPU.resize(nw);
AcceptList_host.resize(nw);
RList_GPU.resize(nw);
Expand All @@ -574,6 +574,7 @@ void MCWalkerConfiguration::updateLists_GPU()
hostlist_valueType.resize(nw);
hostlist_AA.resize(nw);

// TODO: May need to think about R_GPU now potentially being a block... (AT)
for (int iw=0; iw<nw; iw++)
{
if (WalkerList[iw]->R_GPU.size() != R.size())
Expand Down Expand Up @@ -654,20 +655,45 @@ void MCWalkerConfiguration::copyWalkerGradToGPU()
}

void MCWalkerConfiguration::proposeMove_GPU
(std::vector<PosType> &newPos, int iat)
(std::vector<PosType> &newPos, int iat, int nat)
{
if (Rnew_host.size() < newPos.size())
Rnew_host.resize(newPos.size());
for (int i=0; i<newPos.size(); i++)
int nw=newPos.size();
if (Rnew_host.size() < nw*kblocksize)
{
Rnew.resize(nw*kblocksize);
Rnew_host.resize(nw*kblocksize);
}
// store things sequentially with k to make evaluation more straight-forward:
// k=0 k=1 k=kblocksize
// Rnew = [0,..,nw|0,..,nw|...|0,..,nw]
int offset=kcurr*nw;
for (int i=0; i<nw; i++)
{
for (int dim=0; dim<OHMMS_DIM; dim++)
Rnew_host[i][dim] = newPos[i][dim];
Rnew_GPU.asyncCopy(Rnew_host);
Rnew = newPos;
{
Rnew[i+offset][dim] = newPos[i][dim];
Rnew_host[i+offset][dim] = newPos[i][dim];
}
}
if(kDelay){
kcurr=(kcurr+1)%kblocksize; // loop kcurr around every k blocks
kstart=kblock*kblocksize;
if(kcurr==0)
kblock++; // keep increasing kblock (even beyond available matrix blocks) - the update check takes care of self-consistency
// only copy new position matrix when needed (when update is imminent)
if(klinear)
{
Rnew_GPU.asyncCopy(&(Rnew_host[offset]),nw*kblocksize,offset,nw);
} else
if(kcurr==0 || (kcurr+kblock*kblocksize>=nat/2)) // TODO: Make sure we do *not* divide by two if there is no spin up/down matrix savings
Rnew_GPU.asyncCopy(Rnew_host);
} else
Rnew_GPU.asyncCopy(Rnew_host);
CurrentParticle = iat;
}


void MCWalkerConfiguration::acceptMove_GPU(std::vector<bool> &toAccept)
void MCWalkerConfiguration::acceptMove_GPU(std::vector<bool> &toAccept, int k)
{
if (AcceptList_host.size() < toAccept.size())
AcceptList_host.resize(toAccept.size());
Expand All @@ -682,13 +708,13 @@ void MCWalkerConfiguration::acceptMove_GPU(std::vector<bool> &toAccept)
// app_log() << "RList_GPU.size() = " << RList_GPU.size() << std::endl;
if (RList_GPU.size() != WalkerList.size())
std::cerr << "Error in RList_GPU size.\n";
if (Rnew_GPU.size() != WalkerList.size())
if (Rnew_GPU.size() != WalkerList.size()*kblocksize)
std::cerr << "Error in Rnew_GPU size.\n";
if (AcceptList_GPU.size() != WalkerList.size())
std::cerr << "Error in AcceptList_GPU_GPU size.\n";
std::cerr << "Error in AcceptList_GPU size.\n";
accept_move_GPU_cuda
(RList_GPU.data(), (CUDA_PRECISION*)Rnew_GPU.data(),
AcceptList_GPU.data(), CurrentParticle, WalkerList.size());
AcceptList_GPU.data(), CurrentParticle++, WalkerList.size(), k);
}


Expand All @@ -703,6 +729,7 @@ void MCWalkerConfiguration::NLMove_GPU(std::vector<Walker_t*> &walkers,
NLlist_GPU.resize(N);
NLlist_host.resize(N);
}
// TODO: In case delayed updates are including DMC, need to work here (AT)
if (Rnew_GPU.size() < N)
{
Rnew_host.resize(N);
Expand Down
121 changes: 117 additions & 4 deletions src/Particle/MCWalkerConfiguration.h
Original file line number Diff line number Diff line change
Expand Up @@ -122,11 +122,16 @@ class MCWalkerConfiguration: public ParticleSet
GPU_XRAY_TRACE void updateLists_GPU();
int CurrentParticle;
GPU_XRAY_TRACE void proposeMove_GPU
(std::vector<PosType> &newPos, int iat);
GPU_XRAY_TRACE void acceptMove_GPU(std::vector<bool> &toAccept);
(std::vector<PosType> &newPos, int iat, int nat);
GPU_XRAY_TRACE void proposeMove_GPU
(std::vector<PosType> &newPos, int iat){ proposeMove_GPU(newPos, iat, 0); }
GPU_XRAY_TRACE void acceptMove_GPU
(std::vector<bool> &toAccept, int k);
GPU_XRAY_TRACE void acceptMove_GPU
(std::vector<bool> &toAccept){ acceptMove_GPU(toAccept,0); }
GPU_XRAY_TRACE void NLMove_GPU (std::vector<Walker_t*> &walkers,
std::vector<PosType> &Rnew,
std::vector<int> &iat);
std::vector<PosType> &Rnew,
std::vector<int> &iat);
#endif

///default constructor
Expand Down Expand Up @@ -379,6 +384,100 @@ class MCWalkerConfiguration: public ParticleSet
}
}

inline void setklinear()
{
klinear=true;
}

inline bool getklinear()
{
return klinear;
}

inline void setkDelay(int k)
{
klinear=false;
kDelay=k;
if (kDelay<0)
{
app_log() << " Warning: Specified negative delayed updates k = " << k << ", setting to zero." << std::endl;
kDelay=0;
}
kblocksize=1;
kblock=0;
kcurr=0;
kstart=0;
kupdate=1;
if(kDelay)
{
app_log() << " Using delayed updates (k = " << kDelay << ") for all walkers" << std::endl;
kblocksize=kDelay;
}
}

inline int getkDelay()
{
return kDelay;
}

inline int getkblock()
{
return kblock;
}

inline int getkblocksize()
{
return kblocksize;
}

// this function is a bit dangerous but needed to pass the current k of the determinant look-ahead to the Jastrow calculation
inline void setkcurr(int k)
{
kcurr=k;
}

inline int getkcurr()
{
return kcurr;
}

inline int getkstart()
{
return kstart;
}

inline int getkupdate()
{
return kupdate;
}

inline bool update_now(int nat)
{
// in case that we finished the current k-block (kcurr=0) *or* (<- This case also takes care of no delayed updates as kcurr will always be zero then)
// if we run out of electrons (nat) but still have some k's in the current k-block, an update needs to happen now
bool update=((!kcurr) || (kcurr+kblock*kblocksize>=nat/2));
if(update)
{
if(kblock>0)
{
kstart=kblock*kblocksize;
if(kcurr==0) kstart-=kblocksize;
kupdate=kcurr+kblock*kblocksize-kstart;
kcurr=0;
#ifdef QMC_CUDA
if(!klinear) CurrentParticle-=kupdate-1;
#endif
}
}
// reset kblock if we're out of matrix blocks
if(kblock*kblocksize>=nat/2) // TODO: Make sure we do *not* divide by two if there is no spin up/down matrix savings
kblock=0;
return update;
}

///overwrite make_clones function
void make_clones(int n);

protected:

///boolean for cleanup
Expand All @@ -391,6 +490,20 @@ class MCWalkerConfiguration: public ParticleSet
int GlobalNumWalkers;
///update-mode index
int UpdateMode;
///delayed update streak parameter k
int kDelay;
///block dimension (usually k) in case delayed updates are used (there are nat/kblocksize blocks available)
int kblocksize;
///current block
int kblock;
///current k within current block
int kcurr;
///current k to start from update
int kstart;
///number of columns to update
int kupdate;
///klinear switch to indicate if values are calculated sequentially for algorithms using drift
bool klinear;

RealType LocalEnergy;

Expand Down
Loading