Replace CUDA API wrapper memory operations with native CUDA calls (#395)
waredjeb authored and fwyzard committed Oct 20, 2020
1 parent 504e15c commit c7b23f0
Showing 3 changed files with 36 additions and 35 deletions.
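
All three files follow the same pattern: calls into the cuda-api-wrappers library (cuda::memory::copy, cuda::memory::async::copy, cuda::memory::device::zero) become the corresponding native runtime calls (cudaMemcpy, cudaMemcpyAsync, cudaMemset), each wrapped in cudaCheck so that a failing call is reported rather than silently ignored. Below is a minimal standalone sketch of that pattern; CUDA_CHECK is a local stand-in for the repository's cudaCheck helper (whose implementation is not part of this diff), the buffer size is arbitrary, and the "was" comments quote the wrapper forms as they appear in the hunks that follow.

#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <vector>

#include <cuda_runtime.h>

// Local stand-in for cudaCheck from
// HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h: report a failing
// runtime call with file/line information and abort.
#define CUDA_CHECK(cmd)                                                     \
  do {                                                                      \
    cudaError_t err_ = (cmd);                                               \
    if (err_ != cudaSuccess) {                                              \
      std::fprintf(stderr, "CUDA error \"%s\" at %s:%d\n",                  \
                   cudaGetErrorString(err_), __FILE__, __LINE__);           \
      std::abort();                                                         \
    }                                                                       \
  } while (false)

int main() {
  constexpr size_t n = 2001;  // arbitrary buffer size
  constexpr size_t nBytes = n * sizeof(uint32_t);

  std::vector<uint32_t> host(n, 42);
  uint32_t* device = nullptr;
  CUDA_CHECK(cudaMalloc((void**)&device, nBytes));

  // was: cuda::memory::copy(device, host.data(), nBytes);
  CUDA_CHECK(cudaMemcpy(device, host.data(), nBytes, cudaMemcpyHostToDevice));

  // was: cuda::memory::device::zero(device, nBytes);
  CUDA_CHECK(cudaMemset(device, 0, nBytes));

  // was: cuda::memory::async::copy(host.data(), device, nBytes, stream);
  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));
  CUDA_CHECK(cudaMemcpyAsync(host.data(), device, nBytes, cudaMemcpyDeviceToHost, stream));
  CUDA_CHECK(cudaStreamSynchronize(stream));

  std::printf("host[0] after memset and copy back: %u\n", static_cast<unsigned>(host[0]));

  CUDA_CHECK(cudaStreamDestroy(stream));
  CUDA_CHECK(cudaFree(device));
  return 0;
}
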
2 changes: 1 addition & 1 deletion CUDADataFormats/TrackingRecHit/src/TrackingRecHit2DCUDA.cc
@@ -14,6 +14,6 @@ cudautils::host::unique_ptr<float[]> TrackingRecHit2DCUDA::localCoordToHostAsync
template <>
cudautils::host::unique_ptr<uint32_t[]> TrackingRecHit2DCUDA::hitsModuleStartToHostAsync(cudaStream_t stream) const {
auto ret = cudautils::make_host_unique<uint32_t[]>(2001, stream);
-  cudaMemcpyAsync(ret.get(), m_hitsModuleStart, 4 * 2001, cudaMemcpyDefault, stream);
+  cudaCheck(cudaMemcpyAsync(ret.get(), m_hitsModuleStart, 4 * 2001, cudaMemcpyDefault, stream));
return ret;
}
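
In the hunk above, ret comes from cudautils::make_host_unique (pinned host memory), the copy is only enqueued on stream, and cudaMemcpyDefault leaves the runtime to infer the device-to-host direction from the pointers; the caller is expected to synchronize the stream before reading the buffer. A self-contained sketch of that read-back pattern, with checkCuda as a compact stand-in for cudaCheck:

#include <cstdint>
#include <cstdio>
#include <cstdlib>

#include <cuda_runtime.h>

// Compact stand-in for cudaCheck.
inline void checkCuda(cudaError_t err) {
  if (err != cudaSuccess) {
    std::fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
    std::abort();
  }
}

int main() {
  constexpr size_t n = 2001;  // same element count as the hitsModuleStart copy above
  constexpr size_t nBytes = n * sizeof(uint32_t);

  uint32_t* d_buf = nullptr;
  uint32_t* h_buf = nullptr;  // pinned host memory, as make_host_unique provides
  checkCuda(cudaMalloc((void**)&d_buf, nBytes));
  checkCuda(cudaMallocHost((void**)&h_buf, nBytes));
  checkCuda(cudaMemset(d_buf, 0, nBytes));

  cudaStream_t stream;
  checkCuda(cudaStreamCreate(&stream));

  // cudaMemcpyDefault: the runtime infers device-to-host from the pointers.
  checkCuda(cudaMemcpyAsync(h_buf, d_buf, nBytes, cudaMemcpyDefault, stream));

  // The copy is only enqueued above; synchronize before reading the host buffer.
  checkCuda(cudaStreamSynchronize(stream));
  std::printf("h_buf[0] = %u\n", static_cast<unsigned>(h_buf[0]));

  checkCuda(cudaStreamDestroy(stream));
  checkCuda(cudaFreeHost(h_buf));
  checkCuda(cudaFree(d_buf));
  return 0;
}
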
@@ -156,8 +156,11 @@ cudautils::device::unique_ptr<unsigned char[]> SiPixelFedCablingMapGPUWrapper::g
}
}

-  cuda::memory::async::copy(
-      modToUnpDevice.get(), modToUnpHost.get(), pixelgpudetails::MAX_SIZE * sizeof(unsigned char), cudaStream);
+  cudaCheck(cudaMemcpyAsync(modToUnpDevice.get(),
+                            modToUnpHost.get(),
+                            pixelgpudetails::MAX_SIZE * sizeof(unsigned char),
+                            cudaMemcpyHostToDevice,
+                            cudaStream));
return modToUnpDevice;
}
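
Besides adding the error check, this hunk spells out the copy direction (cudaMemcpyHostToDevice) rather than relying on cudaMemcpyDefault, and, as with any asynchronous copy, the host staging buffer (modToUnpHost) has to stay alive and unmodified until the stream has executed it. A sketch of that upload pattern, with kMaxSize as a hypothetical stand-in for pixelgpudetails::MAX_SIZE and checkCuda standing in for cudaCheck:

#include <cstdio>
#include <cstdlib>

#include <cuda_runtime.h>

// Compact stand-in for cudaCheck.
inline void checkCuda(cudaError_t err) {
  if (err != cudaSuccess) {
    std::fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
    std::abort();
  }
}

int main() {
  constexpr size_t kMaxSize = 1 << 16;  // hypothetical stand-in for pixelgpudetails::MAX_SIZE

  unsigned char* h_map = nullptr;  // pinned staging buffer, like modToUnpHost
  unsigned char* d_map = nullptr;  // device destination, like modToUnpDevice
  checkCuda(cudaMallocHost((void**)&h_map, kMaxSize * sizeof(unsigned char)));
  checkCuda(cudaMalloc((void**)&d_map, kMaxSize * sizeof(unsigned char)));
  for (size_t i = 0; i < kMaxSize; ++i)
    h_map[i] = static_cast<unsigned char>(i & 0xff);

  cudaStream_t stream;
  checkCuda(cudaStreamCreate(&stream));

  // Explicit direction: the call documents that this is a host-to-device upload.
  checkCuda(cudaMemcpyAsync(d_map, h_map, kMaxSize * sizeof(unsigned char),
                            cudaMemcpyHostToDevice, stream));

  // h_map must remain allocated and unmodified until the stream has run the copy.
  checkCuda(cudaStreamSynchronize(stream));

  checkCuda(cudaStreamDestroy(stream));
  checkCuda(cudaFree(d_map));
  checkCuda(cudaFreeHost(h_map));
  return 0;
}
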

62 changes: 30 additions & 32 deletions RecoLocalTracker/SiPixelClusterizer/test/gpuClustering_t.h
@@ -12,6 +12,7 @@
#ifdef __CUDACC__
#include <cuda/api_wrappers.h>

#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h"
#include "HeterogeneousCore/CUDAUtilities/interface/launch.h"
#endif
@@ -252,12 +253,12 @@ int main(void) {
size_t size16 = n * sizeof(unsigned short);
// size_t size8 = n * sizeof(uint8_t);

-  cuda::memory::copy(d_moduleStart.get(), &nModules, sizeof(uint32_t));
+  cudaCheck(cudaMemcpy(d_moduleStart.get(), &nModules, sizeof(uint32_t), cudaMemcpyHostToDevice));

-  cuda::memory::copy(d_id.get(), h_id.get(), size16);
-  cuda::memory::copy(d_x.get(), h_x.get(), size16);
-  cuda::memory::copy(d_y.get(), h_y.get(), size16);
-  cuda::memory::copy(d_adc.get(), h_adc.get(), size16);
+  cudaCheck(cudaMemcpy(d_id.get(), h_id.get(), size16, cudaMemcpyHostToDevice));
+  cudaCheck(cudaMemcpy(d_x.get(), h_x.get(), size16, cudaMemcpyHostToDevice));
+  cudaCheck(cudaMemcpy(d_y.get(), h_y.get(), size16, cudaMemcpyHostToDevice));
+  cudaCheck(cudaMemcpy(d_adc.get(), h_adc.get(), size16, cudaMemcpyHostToDevice));
// Launch CUDA Kernels
int threadsPerBlock = (kkk == 5) ? 512 : ((kkk == 3) ? 128 : 256);
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
@@ -270,26 +271,23 @@ int main(void) {

std::cout << "CUDA findModules kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock
<< " threads\n";

-  cuda::memory::device::zero(d_clusInModule.get(), MaxNumModules * sizeof(uint32_t));
+  cudaCheck(cudaMemset(d_clusInModule.get(), 0, MaxNumModules * sizeof(uint32_t)));

cudautils::launch(findClus,
-    {blocksPerGrid, threadsPerBlock},
-    d_id.get(),
-    d_x.get(),
-    d_y.get(),
-    d_moduleStart.get(),
-    d_clusInModule.get(),
-    d_moduleId.get(),
-    d_clus.get(),
-    n);
+                  {blocksPerGrid, threadsPerBlock},
+                  d_id.get(),
+                  d_x.get(),
+                  d_y.get(),
+                  d_moduleStart.get(),
+                  d_clusInModule.get(),
+                  d_moduleId.get(),
+                  d_clus.get(),
+                  n);
cudaDeviceSynchronize();

-  cuda::memory::copy(&nModules, d_moduleStart.get(), sizeof(uint32_t));
+  cudaCheck(cudaMemcpy(&nModules, d_moduleStart.get(), sizeof(uint32_t), cudaMemcpyDeviceToHost));

uint32_t nclus[MaxNumModules], moduleId[nModules];

-  cuda::memory::copy(&nclus, d_clusInModule.get(), MaxNumModules * sizeof(uint32_t));
+  cudaCheck(cudaMemcpy(&nclus, d_clusInModule.get(), MaxNumModules * sizeof(uint32_t), cudaMemcpyDeviceToHost));

std::cout << "before charge cut found " << std::accumulate(nclus, nclus + MaxNumModules, 0) << " clusters"
<< std::endl;
@@ -302,14 +300,14 @@ int main(void) {
std::cout << "ERROR!!!!! wrong number of cluster found" << std::endl;

cudautils::launch(clusterChargeCut,
-    {blocksPerGrid, threadsPerBlock},
-    d_id.get(),
-    d_adc.get(),
-    d_moduleStart.get(),
-    d_clusInModule.get(),
-    d_moduleId.get(),
-    d_clus.get(),
-    n);
+                  {blocksPerGrid, threadsPerBlock},
+                  d_id.get(),
+                  d_adc.get(),
+                  d_moduleStart.get(),
+                  d_clusInModule.get(),
+                  d_moduleId.get(),
+                  d_clus.get(),
+                  n);

cudaDeviceSynchronize();
#else
@@ -354,10 +352,10 @@ int main(void) {
std::cout << "found " << nModules << " Modules active" << std::endl;

#ifdef __CUDACC__
-  cuda::memory::copy(h_id.get(), d_id.get(), size16);
-  cuda::memory::copy(h_clus.get(), d_clus.get(), size32);
-  cuda::memory::copy(&nclus, d_clusInModule.get(), MaxNumModules * sizeof(uint32_t));
-  cuda::memory::copy(&moduleId, d_moduleId.get(), nModules * sizeof(uint32_t));
+  cudaCheck(cudaMemcpy(h_id.get(), d_id.get(), size16, cudaMemcpyDeviceToHost));
+  cudaCheck(cudaMemcpy(h_clus.get(), d_clus.get(), size32, cudaMemcpyDeviceToHost));
+  cudaCheck(cudaMemcpy(&nclus, d_clusInModule.get(), MaxNumModules * sizeof(uint32_t), cudaMemcpyDeviceToHost));
+  cudaCheck(cudaMemcpy(&moduleId, d_moduleId.get(), nModules * sizeof(uint32_t), cudaMemcpyDeviceToHost));
#endif

std::set<unsigned int> clids;
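
Taken together, the changes in gpuClustering_t.h leave the test with this flow: zero the per-module counters with a checked cudaMemset, launch the kernels, synchronize, and read the results back with checked synchronous copies. The condensed sketch below mirrors that flow; tagElements is a hypothetical stand-in kernel (the real test launches findClus and clusterChargeCut through cudautils::launch) and kMaxNumModules stands in for the test's MaxNumModules.

#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <numeric>

#include <cuda_runtime.h>

// Compact stand-in for cudaCheck.
inline void checkCuda(cudaError_t err) {
  if (err != cudaSuccess) {
    std::fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
    std::abort();
  }
}

// Hypothetical stand-in kernel; it just tags each element.
__global__ void tagElements(uint32_t* out, uint32_t n) {
  uint32_t i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n)
    out[i] = i % 10;
}

int main() {
  constexpr uint32_t kMaxNumModules = 2000;  // stand-in for the test's MaxNumModules
  constexpr size_t nBytes = kMaxNumModules * sizeof(uint32_t);

  uint32_t* d_clusInModule = nullptr;
  checkCuda(cudaMalloc((void**)&d_clusInModule, nBytes));

  // 1. zero the per-module counters (was cuda::memory::device::zero)
  checkCuda(cudaMemset(d_clusInModule, 0, nBytes));

  // 2. launch the kernel and wait for it to finish
  int threadsPerBlock = 256;
  int blocksPerGrid = (kMaxNumModules + threadsPerBlock - 1) / threadsPerBlock;
  tagElements<<<blocksPerGrid, threadsPerBlock>>>(d_clusInModule, kMaxNumModules);
  checkCuda(cudaGetLastError());
  checkCuda(cudaDeviceSynchronize());

  // 3. read the results back with a checked synchronous copy (was cuda::memory::copy)
  uint32_t nclus[kMaxNumModules];
  checkCuda(cudaMemcpy(nclus, d_clusInModule, nBytes, cudaMemcpyDeviceToHost));
  std::printf("sum of tags: %u\n", std::accumulate(nclus, nclus + kMaxNumModules, 0u));

  checkCuda(cudaFree(d_clusInModule));
  return 0;
}
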
