Skip to content

Commit

Permalink
Implement changes from the CUDA framework review (#429)
Browse files Browse the repository at this point in the history
Rename the cudautils namespace to cms::cuda or cms::cudatest, and drop the CUDA prefix from the symbols defined there.

Always record and query the CUDA event, to minimize need for error checking in CUDAScopedContextProduce destructor.

Add comments to highlight the pieces in CachingDeviceAllocator that have been changed wrt. cub.

Various other updates and clean up:
  - enable CUDA for compute capability 3.5.
  - clean up CUDAService, CUDA tests and plugins.
  - add CUDA existence protections to BuildFiles.
  - mark thread-safe static variables with CMS_THREAD_SAFE.
  • Loading branch information
makortel authored and fwyzard committed Nov 16, 2020
1 parent 2e52dc1 commit f4bda7b
Show file tree
Hide file tree
Showing 4 changed files with 15 additions and 15 deletions.
4 changes: 2 additions & 2 deletions CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@

using ZVertexHeterogeneous = HeterogeneousSoA<ZVertexSoA>;
#ifndef __CUDACC__
#include "CUDADataFormats/Common/interface/CUDAProduct.h"
using ZVertexCUDAProduct = CUDAProduct<ZVertexHeterogeneous>;
#include "CUDADataFormats/Common/interface/Product.h"
using ZVertexCUDAProduct = cms::cuda::Product<ZVertexHeterogeneous>;
#endif

#endif
2 changes: 1 addition & 1 deletion CUDADataFormats/Vertex/src/classes.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
#define CUDADataFormats__src_classes_h

#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h"
#include "CUDADataFormats/Common/interface/CUDAProduct.h"
#include "CUDADataFormats/Common/interface/Product.h"
#include "DataFormats/Common/interface/Wrapper.h"

#endif
2 changes: 1 addition & 1 deletion CUDADataFormats/Vertex/src/classes_def.xml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
<lcgdict>
<class name="CUDAProduct<ZVertexHeterogeneous>" persistent="false"/>
<class name="cms::cuda::Product<ZVertexHeterogeneous>" persistent="false"/>
<class name="edm::Wrapper<ZVertexCUDAProduct>" persistent="false"/>
<class name="ZVertexHeterogeneous" persistent="false"/>
<class name="edm::Wrapper<ZVertexHeterogeneous>" persistent="false"/>
Expand Down
22 changes: 11 additions & 11 deletions RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
#include <vector>

#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#include "HeterogeneousCore/CUDAUtilities/interface/requireCUDADevices.h"
#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h"
#include "HeterogeneousCore/CUDAUtilities/interface/launch.h"
#ifdef USE_DBSCAN
#include "RecoPixelVertexing/PixelVertexFinding/src/gpuClusterTracksDBSCAN.h"
Expand Down Expand Up @@ -114,10 +114,10 @@ __global__ void print(ZVertices const* pdata, WorkSpace const* pws) {

int main() {
#ifdef __CUDACC__
requireCUDADevices();
cms::cudatest::requireDevices();

auto onGPU_d = cudautils::make_device_unique<ZVertices[]>(1, nullptr);
auto ws_d = cudautils::make_device_unique<WorkSpace[]>(1, nullptr);
auto onGPU_d = cms::cuda::make_device_unique<ZVertices[]>(1, nullptr);
auto ws_d = cms::cuda::make_device_unique<WorkSpace[]>(1, nullptr);
#else
auto onGPU_d = std::make_unique<ZVertices>();
auto ws_d = std::make_unique<WorkSpace>();
Expand Down Expand Up @@ -174,16 +174,16 @@ int main() {
cudaDeviceSynchronize();

#ifdef ONE_KERNEL
cudautils::launch(vertexFinderOneKernel, {1, 512 + 256}, onGPU_d.get(), ws_d.get(), kk, par[0], par[1], par[2]);
cms::cuda::launch(vertexFinderOneKernel, {1, 512 + 256}, onGPU_d.get(), ws_d.get(), kk, par[0], par[1], par[2]);
#else
cudautils::launch(CLUSTERIZE, {1, 512 + 256}, onGPU_d.get(), ws_d.get(), kk, par[0], par[1], par[2]);
cms::cuda::launch(CLUSTERIZE, {1, 512 + 256}, onGPU_d.get(), ws_d.get(), kk, par[0], par[1], par[2]);
#endif
print<<<1, 1, 0, 0>>>(onGPU_d.get(), ws_d.get());

cudaCheck(cudaGetLastError());
cudaDeviceSynchronize();

cudautils::launch(fitVerticesKernel, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 50.f);
cms::cuda::launch(fitVerticesKernel, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 50.f);
cudaCheck(cudaGetLastError());
cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost));

Expand Down Expand Up @@ -245,7 +245,7 @@ int main() {
}

#ifdef __CUDACC__
cudautils::launch(fitVerticesKernel, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 50.f);
cms::cuda::launch(fitVerticesKernel, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 50.f);
cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost));
cudaCheck(cudaMemcpy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t), cudaMemcpyDeviceToHost));
cudaCheck(cudaMemcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float), cudaMemcpyDeviceToHost));
Expand All @@ -265,7 +265,7 @@ int main() {

#ifdef __CUDACC__
// one vertex per block!!!
cudautils::launch(splitVerticesKernel, {1024, 64}, onGPU_d.get(), ws_d.get(), 9.f);
cms::cuda::launch(splitVerticesKernel, {1024, 64}, onGPU_d.get(), ws_d.get(), 9.f);
cudaCheck(cudaMemcpy(&nv, LOC_WS(nvIntermediate), sizeof(uint32_t), cudaMemcpyDeviceToHost));
#else
gridDim.x = 1;
Expand All @@ -277,10 +277,10 @@ int main() {
std::cout << "after split " << nv << std::endl;

#ifdef __CUDACC__
cudautils::launch(fitVerticesKernel, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 5000.f);
cms::cuda::launch(fitVerticesKernel, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 5000.f);
cudaCheck(cudaGetLastError());

cudautils::launch(sortByPt2Kernel, {1, 256}, onGPU_d.get(), ws_d.get());
cms::cuda::launch(sortByPt2Kernel, {1, 256}, onGPU_d.get(), ws_d.get());
cudaCheck(cudaGetLastError());
cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost));
#else
Expand Down

0 comments on commit f4bda7b

Please sign in to comment.