From 2ee863c326c9f8174bb2fd704a17fae9a572de7f Mon Sep 17 00:00:00 2001 From: Wahid Redjeb Date: Sat, 26 Oct 2019 22:48:53 +0200 Subject: [PATCH 01/13] Solve conflicts with #389 --- CUDADataFormats/BeamSpot/src/BeamSpotCUDA.cc | 2 +- .../test/gpuFrameTransformTest.cpp | 18 +- DataFormats/Math/test/CholeskyInvert_t.cu | 8 +- .../test/CholeskyInvert_t_BACKUP_195321.cu | 225 ++++++++++++++++++ .../Math/test/CholeskyInvert_t_BASE_195321.cu | 214 +++++++++++++++++ .../test/CholeskyInvert_t_LOCAL_195321.cu | 215 +++++++++++++++++ .../test/CholeskyInvert_t_REMOTE_195321.cu | 216 +++++++++++++++++ DataFormats/Math/test/cudaAtan2Test.cu | 4 +- DataFormats/Math/test/cudaMathTest.cu | 8 +- .../CUDACore/test/test_CUDAScopedContext.cc | 10 +- .../CUDATest/plugins/TestCUDAProducerGPUEW.cc | 3 +- .../plugins/TestCUDAProducerGPUEWTask.cc | 5 +- .../plugins/TestCUDAProducerGPUKernel.cu | 10 +- .../plugins/TestCUDAProducerGPUtoCPU.cc | 3 +- .../test/test_TestCUDAProducerGPUFirst.cc | 4 +- .../CUDAUtilities/interface/copyAsync.h | 8 +- .../CUDAUtilities/test/HistoContainer_t.cu | 11 +- .../CUDAUtilities/test/OneHistoContainer_t.cu | 2 +- .../CUDAUtilities/test/OneToManyAssoc_t.h | 8 +- .../CUDAUtilities/test/copyAsync_t.cpp | 15 +- .../CUDAUtilities/test/radixSort_t.cu | 9 +- .../TestHeterogeneousEDProducerGPUHelpers.cu | 25 +- .../src/SiPixelFedCablingMapGPUWrapper.cc | 6 +- .../SiPixelClusterizer/test/gpuClustering_t.h | 27 +-- .../PixelVertexFinding/test/VertexFinder_t.h | 36 +-- 25 files changed, 981 insertions(+), 111 deletions(-) create mode 100644 DataFormats/Math/test/CholeskyInvert_t_BACKUP_195321.cu create mode 100644 DataFormats/Math/test/CholeskyInvert_t_BASE_195321.cu create mode 100644 DataFormats/Math/test/CholeskyInvert_t_LOCAL_195321.cu create mode 100644 DataFormats/Math/test/CholeskyInvert_t_REMOTE_195321.cu diff --git a/CUDADataFormats/BeamSpot/src/BeamSpotCUDA.cc b/CUDADataFormats/BeamSpot/src/BeamSpotCUDA.cc index cdc49aa46bfa4..da9cd4d776c92 100644 --- a/CUDADataFormats/BeamSpot/src/BeamSpotCUDA.cc +++ b/CUDADataFormats/BeamSpot/src/BeamSpotCUDA.cc @@ -4,5 +4,5 @@ BeamSpotCUDA::BeamSpotCUDA(Data const* data_h, cudaStream_t stream) { data_d_ = cudautils::make_device_unique(stream); - cuda::memory::async::copy(data_d_.get(), data_h, sizeof(Data), stream); + cudaMemcpyAsync(data_d_.get(), data_h, sizeof(Data), cudaMemcpyHostToDevice, stream); } diff --git a/DataFormats/GeometrySurface/test/gpuFrameTransformTest.cpp b/DataFormats/GeometrySurface/test/gpuFrameTransformTest.cpp index 4f3c11212e2e9..75b5418eeaffb 100644 --- a/DataFormats/GeometrySurface/test/gpuFrameTransformTest.cpp +++ b/DataFormats/GeometrySurface/test/gpuFrameTransformTest.cpp @@ -73,7 +73,8 @@ int main(void) { // auto d_sf = cuda::memory::device::make_unique(current_device, 1); auto d_sf = cuda::memory::device::make_unique(current_device, sizeof(SFrame)); - cuda::memory::copy(d_sf.get(), &sf1, sizeof(SFrame)); + cudaMemcpy(d_sf.get(), &sf1, sizeof(SFrame), cudaMemcpyHostToDevice); + for (auto i = 0U; i < size; ++i) { xl[i] = yl[i] = 0.1f * float(i) - float(size / 2); @@ -84,9 +85,9 @@ int main(void) { std::random_shuffle(xl, xl + size); std::random_shuffle(yl, yl + size); - cuda::memory::copy(d_xl.get(), xl, size32); - cuda::memory::copy(d_yl.get(), yl, size32); - cuda::memory::copy(d_le.get(), le, 3 * size32); + cudaMemcpy(d_xl.get(), xl, size32, cudaMemcpyHostToDevice); + cudaMemcpy(d_yl.get(), yl, size32, cudaMemcpyHostToDevice); + cudaMemcpy(d_le.get(), le, 3 * size32, cudaMemcpyHostToDevice); toGlobalWrapper((SFrame const *)(d_sf.get()), d_xl.get(), @@ -97,11 +98,10 @@ int main(void) { d_le.get(), d_ge.get(), size); - - cuda::memory::copy(x, d_x.get(), size32); - cuda::memory::copy(y, d_y.get(), size32); - cuda::memory::copy(z, d_z.get(), size32); - cuda::memory::copy(ge, d_ge.get(), 6 * size32); + cudaMemcpy(x, d_x.get(), size32, cudaMemcpyDeviceToHost); + cudaMemcpy(y, d_y.get(), size32, cudaMemcpyDeviceToHost); + cudaMemcpy(z, d_z.get(), size32, cudaMemcpyDeviceToHost); + cudaMemcpy(ge, d_ge.get(), 6 * size32, cudaMemcpyDeviceToHost); float eps = 0.; for (auto i = 0U; i < size; ++i) { diff --git a/DataFormats/Math/test/CholeskyInvert_t.cu b/DataFormats/Math/test/CholeskyInvert_t.cu index 3e2cf041bae16..65e8adca7e5ee 100644 --- a/DataFormats/Math/test/CholeskyInvert_t.cu +++ b/DataFormats/Math/test/CholeskyInvert_t.cu @@ -132,7 +132,7 @@ void go(bool soa) { std::cout << mm[SIZE / 2](1, 1) << std::endl; auto m_d = cuda::memory::device::make_unique(current_device, DIM * DIM * stride()); - cuda::memory::copy(m_d.get(), (double const *)(mm), stride() * sizeof(MX)); + cudaMemcpy(m_d.get(), (double const *)(mm), stride() * sizeof(MX), cudaMemcpyHostToDevice); constexpr int NKK = #ifdef DOPROF @@ -150,8 +150,9 @@ void go(bool soa) { cudautils::launch(invertSOA, {blocksPerGrid, threadsPerBlock}, m_d.get(), SIZE); else cudautils::launch(invert, {blocksPerGrid, threadsPerBlock}, (MX *)(m_d.get()), SIZE); + + cudaMemcpy(&mm, m_d.get(), stride() * sizeof(MX), cudaMemcpyDeviceToHost); - cuda::memory::copy(&mm, m_d.get(), stride() * sizeof(MX)); delta += (std::chrono::high_resolution_clock::now() - start); if (0 == kk) @@ -162,8 +163,9 @@ void go(bool soa) { #ifndef DOPROF cudautils::launch(invertSeq, {blocksPerGrid, threadsPerBlock}, (MX *)(m_d.get()), SIZE); + cuda::launch(invertSeq, {blocksPerGrid, threadsPerBlock}, (MX *)(m_d.get()), SIZE); + cudaMemcpy(&mm, m_d.get(), stride() * sizeof(MX), cudaMemcpyDeviceToHost); - cuda::memory::copy(&mm, m_d.get(), stride() * sizeof(MX)); #endif delta1 += (std::chrono::high_resolution_clock::now() - start); diff --git a/DataFormats/Math/test/CholeskyInvert_t_BACKUP_195321.cu b/DataFormats/Math/test/CholeskyInvert_t_BACKUP_195321.cu new file mode 100644 index 0000000000000..e5c714acea1f5 --- /dev/null +++ b/DataFormats/Math/test/CholeskyInvert_t_BACKUP_195321.cu @@ -0,0 +1,225 @@ +// nvcc -O3 CholeskyDecomp_t.cu -Icuda-api-wrappers/src/ --expt-relaxed-constexpr -gencode arch=compute_61,code=sm_61 --compiler-options="-Ofast -march=native" +// add -DDOPROF to run nvprof --metrics all + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +#include "DataFormats/Math/interface/choleskyInversion.h" +#include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h" +#include "HeterogeneousCore/CUDAUtilities/interface/launch.h" + +constexpr int stride() { return 5 * 1024; } +template +using MXN = Eigen::Matrix; +template +using MapMX = Eigen::Map, 0, Eigen::Stride>; + +template +__global__ void invertSOA(double *__restrict__ p, unsigned int n) { + auto i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= n) + return; + + MapMX m(p + i); + choleskyInversion::invert(m, m); +} + +template +__global__ void invert(M *mm, unsigned int n) { + auto i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= n) + return; + + auto &m = mm[i]; + choleskyInversion::invert(m, m); +} + +template +__global__ void invertSeq(M *mm, unsigned int n) { + if (threadIdx.x != 0) + return; + auto first = blockIdx.x * blockDim.x; + auto last = std::min(first + blockDim.x, n); + + for (auto i = first; i < last; ++i) { + auto &m = mm[i]; + choleskyInversion::invert(m, m); + } +} + +// generate matrices +template +void genMatrix(M &m) { + using T = typename std::remove_reference::type; + int n = M::ColsAtCompileTime; + std::mt19937 eng; + // std::mt19937 eng2; + std::uniform_real_distribution rgen(0., 1.); + + // generate first diagonal elemets + for (int i = 0; i < n; ++i) { + double maxVal = i * 10000 / (n - 1) + 1; // max condition is 10^4 + m(i, i) = maxVal * rgen(eng); + } + for (int i = 0; i < n; ++i) { + for (int j = 0; j < i; ++j) { + double v = 0.3 * std::sqrt(m(i, i) * m(j, j)); // this makes the matrix pos defined + m(i, j) = v * rgen(eng); + m(j, i) = m(i, j); + } + } +} + +template +void go(bool soa) { + constexpr unsigned int DIM = N; + using MX = MXN; + std::cout << "testing Matrix of dimension " << DIM << " size " << sizeof(MX) << std::endl; + + auto start = std::chrono::high_resolution_clock::now(); + auto delta = start - start; + auto delta1 = delta; + auto delta2 = delta; + + if (cuda::device::count() == 0) { + std::cerr << "No CUDA devices on this system" + << "\n"; + exit(EXIT_FAILURE); + } + + auto current_device = cuda::device::current::get(); + + constexpr unsigned int SIZE = 4 * 1024; + + MX mm[stride()]; // just storage in case of SOA + double *__restrict__ p = (double *)(mm); + + if (soa) { + for (unsigned int i = 0; i < SIZE; ++i) { + MapMX m(p + i); + genMatrix(m); + } + } else { + for (auto &m : mm) + genMatrix(m); + } + + std::cout << mm[SIZE / 2](1, 1) << std::endl; + + if (soa) + for (unsigned int i = 0; i < SIZE; ++i) { + MapMX m(p + i); + choleskyInversion::invert(m, m); + choleskyInversion::invert(m, m); + } + else + for (auto &m : mm) { + choleskyInversion::invert(m, m); + choleskyInversion::invert(m, m); + } + + std::cout << mm[SIZE / 2](1, 1) << std::endl; + + auto m_d = cuda::memory::device::make_unique(current_device, DIM * DIM * stride()); + cudaMemcpy(m_d.get(), (double const *)(mm), stride() * sizeof(MX), cudaMemcpyHostToDevice); + + constexpr int NKK = +#ifdef DOPROF + 2; +#else + 1000; +#endif + for (int kk = 0; kk < NKK; ++kk) { + int threadsPerBlock = 128; + int blocksPerGrid = SIZE / threadsPerBlock; + + delta -= (std::chrono::high_resolution_clock::now() - start); + + if (soa) + cudautils::launch(invertSOA, {blocksPerGrid, threadsPerBlock}, m_d.get(), SIZE); + else +<<<<<<< HEAD + cudautils::launch(invert, {blocksPerGrid, threadsPerBlock}, (MX *)(m_d.get()), SIZE); +======= + cuda::launch(invert, {blocksPerGrid, threadsPerBlock}, (MX *)(m_d.get()), SIZE); + + cudaMemcpy(&mm, m_d.get(), stride() * sizeof(MX), cudaMemcpyDeviceToHost); +>>>>>>> Replace cuda::memory[::async]::copy() with cudaMemcpy[Async](), cuda::memory[::async]::zero() and + + delta += (std::chrono::high_resolution_clock::now() - start); + + if (0 == kk) + std::cout << mm[SIZE / 2](1, 1) << std::endl; + + if (!soa) { + delta1 -= (std::chrono::high_resolution_clock::now() - start); + +#ifndef DOPROF +<<<<<<< HEAD + cudautils::launch(invertSeq, {blocksPerGrid, threadsPerBlock}, (MX *)(m_d.get()), SIZE); +======= + cuda::launch(invertSeq, {blocksPerGrid, threadsPerBlock}, (MX *)(m_d.get()), SIZE); + + cudaMemcpy(&mm, m_d.get(), stride() * sizeof(MX), cudaMemcpyDeviceToHost); +>>>>>>> Replace cuda::memory[::async]::copy() with cudaMemcpy[Async](), cuda::memory[::async]::zero() and + +#endif + delta1 += (std::chrono::high_resolution_clock::now() - start); + + if (0 == kk) + std::cout << mm[SIZE / 2](1, 1) << std::endl; + } + + delta2 -= (std::chrono::high_resolution_clock::now() - start); + if (soa) +#pragma GCC ivdep + for (unsigned int i = 0; i < SIZE; ++i) { + MapMX m(p + i); + choleskyInversion::invert(m, m); + } + else +#pragma GCC ivdep + for (auto &m : mm) { + choleskyInversion::invert(m, m); + } + + delta2 += (std::chrono::high_resolution_clock::now() - start); + } + + std::cout << mm[SIZE / 2](1, 1) << std::endl; + + double DNNK = NKK; + std::cout << "cuda/cudaSeq/x86 computation took " + << std::chrono::duration_cast(delta).count() / DNNK << ' ' + << std::chrono::duration_cast(delta1).count() / DNNK << ' ' + << std::chrono::duration_cast(delta2).count() / DNNK << ' ' << " ms" + << std::endl; +} + +int main() { + exitSansCUDADevices(); + + go<2>(false); + go<4>(false); + go<5>(false); + go<6>(false); + + go<2>(true); + go<4>(true); + go<5>(true); + go<6>(true); + + // go<10>(); + return 0; +} diff --git a/DataFormats/Math/test/CholeskyInvert_t_BASE_195321.cu b/DataFormats/Math/test/CholeskyInvert_t_BASE_195321.cu new file mode 100644 index 0000000000000..6c1915126073f --- /dev/null +++ b/DataFormats/Math/test/CholeskyInvert_t_BASE_195321.cu @@ -0,0 +1,214 @@ +// nvcc -O3 CholeskyDecomp_t.cu -Icuda-api-wrappers/src/ --expt-relaxed-constexpr -gencode arch=compute_61,code=sm_61 --compiler-options="-Ofast -march=native" +// add -DDOPROF to run nvprof --metrics all + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +#include "DataFormats/Math/interface/choleskyInversion.h" +#include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h" + +constexpr int stride() { return 5 * 1024; } +template +using MXN = Eigen::Matrix; +template +using MapMX = Eigen::Map, 0, Eigen::Stride>; + +template +__global__ void invertSOA(double *__restrict__ p, unsigned int n) { + auto i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= n) + return; + + MapMX m(p + i); + choleskyInversion::invert(m, m); +} + +template +__global__ void invert(M *mm, unsigned int n) { + auto i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= n) + return; + + auto &m = mm[i]; + choleskyInversion::invert(m, m); +} + +template +__global__ void invertSeq(M *mm, unsigned int n) { + if (threadIdx.x != 0) + return; + auto first = blockIdx.x * blockDim.x; + auto last = std::min(first + blockDim.x, n); + + for (auto i = first; i < last; ++i) { + auto &m = mm[i]; + choleskyInversion::invert(m, m); + } +} + +// generate matrices +template +void genMatrix(M &m) { + using T = typename std::remove_reference::type; + int n = M::ColsAtCompileTime; + std::mt19937 eng; + // std::mt19937 eng2; + std::uniform_real_distribution rgen(0., 1.); + + // generate first diagonal elemets + for (int i = 0; i < n; ++i) { + double maxVal = i * 10000 / (n - 1) + 1; // max condition is 10^4 + m(i, i) = maxVal * rgen(eng); + } + for (int i = 0; i < n; ++i) { + for (int j = 0; j < i; ++j) { + double v = 0.3 * std::sqrt(m(i, i) * m(j, j)); // this makes the matrix pos defined + m(i, j) = v * rgen(eng); + m(j, i) = m(i, j); + } + } +} + +template +void go(bool soa) { + constexpr unsigned int DIM = N; + using MX = MXN; + std::cout << "testing Matrix of dimension " << DIM << " size " << sizeof(MX) << std::endl; + + auto start = std::chrono::high_resolution_clock::now(); + auto delta = start - start; + auto delta1 = delta; + auto delta2 = delta; + + if (cuda::device::count() == 0) { + std::cerr << "No CUDA devices on this system" + << "\n"; + exit(EXIT_FAILURE); + } + + auto current_device = cuda::device::current::get(); + + constexpr unsigned int SIZE = 4 * 1024; + + MX mm[stride()]; // just storage in case of SOA + double *__restrict__ p = (double *)(mm); + + if (soa) { + for (unsigned int i = 0; i < SIZE; ++i) { + MapMX m(p + i); + genMatrix(m); + } + } else { + for (auto &m : mm) + genMatrix(m); + } + + std::cout << mm[SIZE / 2](1, 1) << std::endl; + + if (soa) + for (unsigned int i = 0; i < SIZE; ++i) { + MapMX m(p + i); + choleskyInversion::invert(m, m); + choleskyInversion::invert(m, m); + } + else + for (auto &m : mm) { + choleskyInversion::invert(m, m); + choleskyInversion::invert(m, m); + } + + std::cout << mm[SIZE / 2](1, 1) << std::endl; + + auto m_d = cuda::memory::device::make_unique(current_device, DIM * DIM * stride()); + cuda::memory::copy(m_d.get(), (double const *)(mm), stride() * sizeof(MX)); + + constexpr int NKK = +#ifdef DOPROF + 2; +#else + 1000; +#endif + for (int kk = 0; kk < NKK; ++kk) { + int threadsPerBlock = 128; + int blocksPerGrid = SIZE / threadsPerBlock; + + delta -= (std::chrono::high_resolution_clock::now() - start); + + if (soa) + cuda::launch(invertSOA, {blocksPerGrid, threadsPerBlock}, m_d.get(), SIZE); + else + cuda::launch(invert, {blocksPerGrid, threadsPerBlock}, (MX *)(m_d.get()), SIZE); + + cuda::memory::copy(&mm, m_d.get(), stride() * sizeof(MX)); + delta += (std::chrono::high_resolution_clock::now() - start); + + if (0 == kk) + std::cout << mm[SIZE / 2](1, 1) << std::endl; + + if (!soa) { + delta1 -= (std::chrono::high_resolution_clock::now() - start); + +#ifndef DOPROF + cuda::launch(invertSeq, {blocksPerGrid, threadsPerBlock}, (MX *)(m_d.get()), SIZE); + + cuda::memory::copy(&mm, m_d.get(), stride() * sizeof(MX)); +#endif + delta1 += (std::chrono::high_resolution_clock::now() - start); + + if (0 == kk) + std::cout << mm[SIZE / 2](1, 1) << std::endl; + } + + delta2 -= (std::chrono::high_resolution_clock::now() - start); + if (soa) +#pragma GCC ivdep + for (unsigned int i = 0; i < SIZE; ++i) { + MapMX m(p + i); + choleskyInversion::invert(m, m); + } + else +#pragma GCC ivdep + for (auto &m : mm) { + choleskyInversion::invert(m, m); + } + + delta2 += (std::chrono::high_resolution_clock::now() - start); + } + + std::cout << mm[SIZE / 2](1, 1) << std::endl; + + double DNNK = NKK; + std::cout << "cuda/cudaSeq/x86 computation took " + << std::chrono::duration_cast(delta).count() / DNNK << ' ' + << std::chrono::duration_cast(delta1).count() / DNNK << ' ' + << std::chrono::duration_cast(delta2).count() / DNNK << ' ' << " ms" + << std::endl; +} + +int main() { + exitSansCUDADevices(); + + go<2>(false); + go<4>(false); + go<5>(false); + go<6>(false); + + go<2>(true); + go<4>(true); + go<5>(true); + go<6>(true); + + // go<10>(); + return 0; +} diff --git a/DataFormats/Math/test/CholeskyInvert_t_LOCAL_195321.cu b/DataFormats/Math/test/CholeskyInvert_t_LOCAL_195321.cu new file mode 100644 index 0000000000000..3e2cf041bae16 --- /dev/null +++ b/DataFormats/Math/test/CholeskyInvert_t_LOCAL_195321.cu @@ -0,0 +1,215 @@ +// nvcc -O3 CholeskyDecomp_t.cu -Icuda-api-wrappers/src/ --expt-relaxed-constexpr -gencode arch=compute_61,code=sm_61 --compiler-options="-Ofast -march=native" +// add -DDOPROF to run nvprof --metrics all + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +#include "DataFormats/Math/interface/choleskyInversion.h" +#include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h" +#include "HeterogeneousCore/CUDAUtilities/interface/launch.h" + +constexpr int stride() { return 5 * 1024; } +template +using MXN = Eigen::Matrix; +template +using MapMX = Eigen::Map, 0, Eigen::Stride>; + +template +__global__ void invertSOA(double *__restrict__ p, unsigned int n) { + auto i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= n) + return; + + MapMX m(p + i); + choleskyInversion::invert(m, m); +} + +template +__global__ void invert(M *mm, unsigned int n) { + auto i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= n) + return; + + auto &m = mm[i]; + choleskyInversion::invert(m, m); +} + +template +__global__ void invertSeq(M *mm, unsigned int n) { + if (threadIdx.x != 0) + return; + auto first = blockIdx.x * blockDim.x; + auto last = std::min(first + blockDim.x, n); + + for (auto i = first; i < last; ++i) { + auto &m = mm[i]; + choleskyInversion::invert(m, m); + } +} + +// generate matrices +template +void genMatrix(M &m) { + using T = typename std::remove_reference::type; + int n = M::ColsAtCompileTime; + std::mt19937 eng; + // std::mt19937 eng2; + std::uniform_real_distribution rgen(0., 1.); + + // generate first diagonal elemets + for (int i = 0; i < n; ++i) { + double maxVal = i * 10000 / (n - 1) + 1; // max condition is 10^4 + m(i, i) = maxVal * rgen(eng); + } + for (int i = 0; i < n; ++i) { + for (int j = 0; j < i; ++j) { + double v = 0.3 * std::sqrt(m(i, i) * m(j, j)); // this makes the matrix pos defined + m(i, j) = v * rgen(eng); + m(j, i) = m(i, j); + } + } +} + +template +void go(bool soa) { + constexpr unsigned int DIM = N; + using MX = MXN; + std::cout << "testing Matrix of dimension " << DIM << " size " << sizeof(MX) << std::endl; + + auto start = std::chrono::high_resolution_clock::now(); + auto delta = start - start; + auto delta1 = delta; + auto delta2 = delta; + + if (cuda::device::count() == 0) { + std::cerr << "No CUDA devices on this system" + << "\n"; + exit(EXIT_FAILURE); + } + + auto current_device = cuda::device::current::get(); + + constexpr unsigned int SIZE = 4 * 1024; + + MX mm[stride()]; // just storage in case of SOA + double *__restrict__ p = (double *)(mm); + + if (soa) { + for (unsigned int i = 0; i < SIZE; ++i) { + MapMX m(p + i); + genMatrix(m); + } + } else { + for (auto &m : mm) + genMatrix(m); + } + + std::cout << mm[SIZE / 2](1, 1) << std::endl; + + if (soa) + for (unsigned int i = 0; i < SIZE; ++i) { + MapMX m(p + i); + choleskyInversion::invert(m, m); + choleskyInversion::invert(m, m); + } + else + for (auto &m : mm) { + choleskyInversion::invert(m, m); + choleskyInversion::invert(m, m); + } + + std::cout << mm[SIZE / 2](1, 1) << std::endl; + + auto m_d = cuda::memory::device::make_unique(current_device, DIM * DIM * stride()); + cuda::memory::copy(m_d.get(), (double const *)(mm), stride() * sizeof(MX)); + + constexpr int NKK = +#ifdef DOPROF + 2; +#else + 1000; +#endif + for (int kk = 0; kk < NKK; ++kk) { + int threadsPerBlock = 128; + int blocksPerGrid = SIZE / threadsPerBlock; + + delta -= (std::chrono::high_resolution_clock::now() - start); + + if (soa) + cudautils::launch(invertSOA, {blocksPerGrid, threadsPerBlock}, m_d.get(), SIZE); + else + cudautils::launch(invert, {blocksPerGrid, threadsPerBlock}, (MX *)(m_d.get()), SIZE); + + cuda::memory::copy(&mm, m_d.get(), stride() * sizeof(MX)); + delta += (std::chrono::high_resolution_clock::now() - start); + + if (0 == kk) + std::cout << mm[SIZE / 2](1, 1) << std::endl; + + if (!soa) { + delta1 -= (std::chrono::high_resolution_clock::now() - start); + +#ifndef DOPROF + cudautils::launch(invertSeq, {blocksPerGrid, threadsPerBlock}, (MX *)(m_d.get()), SIZE); + + cuda::memory::copy(&mm, m_d.get(), stride() * sizeof(MX)); +#endif + delta1 += (std::chrono::high_resolution_clock::now() - start); + + if (0 == kk) + std::cout << mm[SIZE / 2](1, 1) << std::endl; + } + + delta2 -= (std::chrono::high_resolution_clock::now() - start); + if (soa) +#pragma GCC ivdep + for (unsigned int i = 0; i < SIZE; ++i) { + MapMX m(p + i); + choleskyInversion::invert(m, m); + } + else +#pragma GCC ivdep + for (auto &m : mm) { + choleskyInversion::invert(m, m); + } + + delta2 += (std::chrono::high_resolution_clock::now() - start); + } + + std::cout << mm[SIZE / 2](1, 1) << std::endl; + + double DNNK = NKK; + std::cout << "cuda/cudaSeq/x86 computation took " + << std::chrono::duration_cast(delta).count() / DNNK << ' ' + << std::chrono::duration_cast(delta1).count() / DNNK << ' ' + << std::chrono::duration_cast(delta2).count() / DNNK << ' ' << " ms" + << std::endl; +} + +int main() { + exitSansCUDADevices(); + + go<2>(false); + go<4>(false); + go<5>(false); + go<6>(false); + + go<2>(true); + go<4>(true); + go<5>(true); + go<6>(true); + + // go<10>(); + return 0; +} diff --git a/DataFormats/Math/test/CholeskyInvert_t_REMOTE_195321.cu b/DataFormats/Math/test/CholeskyInvert_t_REMOTE_195321.cu new file mode 100644 index 0000000000000..0751af78a7289 --- /dev/null +++ b/DataFormats/Math/test/CholeskyInvert_t_REMOTE_195321.cu @@ -0,0 +1,216 @@ +// nvcc -O3 CholeskyDecomp_t.cu -Icuda-api-wrappers/src/ --expt-relaxed-constexpr -gencode arch=compute_61,code=sm_61 --compiler-options="-Ofast -march=native" +// add -DDOPROF to run nvprof --metrics all + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +#include "DataFormats/Math/interface/choleskyInversion.h" +#include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h" + +constexpr int stride() { return 5 * 1024; } +template +using MXN = Eigen::Matrix; +template +using MapMX = Eigen::Map, 0, Eigen::Stride>; + +template +__global__ void invertSOA(double *__restrict__ p, unsigned int n) { + auto i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= n) + return; + + MapMX m(p + i); + choleskyInversion::invert(m, m); +} + +template +__global__ void invert(M *mm, unsigned int n) { + auto i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= n) + return; + + auto &m = mm[i]; + choleskyInversion::invert(m, m); +} + +template +__global__ void invertSeq(M *mm, unsigned int n) { + if (threadIdx.x != 0) + return; + auto first = blockIdx.x * blockDim.x; + auto last = std::min(first + blockDim.x, n); + + for (auto i = first; i < last; ++i) { + auto &m = mm[i]; + choleskyInversion::invert(m, m); + } +} + +// generate matrices +template +void genMatrix(M &m) { + using T = typename std::remove_reference::type; + int n = M::ColsAtCompileTime; + std::mt19937 eng; + // std::mt19937 eng2; + std::uniform_real_distribution rgen(0., 1.); + + // generate first diagonal elemets + for (int i = 0; i < n; ++i) { + double maxVal = i * 10000 / (n - 1) + 1; // max condition is 10^4 + m(i, i) = maxVal * rgen(eng); + } + for (int i = 0; i < n; ++i) { + for (int j = 0; j < i; ++j) { + double v = 0.3 * std::sqrt(m(i, i) * m(j, j)); // this makes the matrix pos defined + m(i, j) = v * rgen(eng); + m(j, i) = m(i, j); + } + } +} + +template +void go(bool soa) { + constexpr unsigned int DIM = N; + using MX = MXN; + std::cout << "testing Matrix of dimension " << DIM << " size " << sizeof(MX) << std::endl; + + auto start = std::chrono::high_resolution_clock::now(); + auto delta = start - start; + auto delta1 = delta; + auto delta2 = delta; + + if (cuda::device::count() == 0) { + std::cerr << "No CUDA devices on this system" + << "\n"; + exit(EXIT_FAILURE); + } + + auto current_device = cuda::device::current::get(); + + constexpr unsigned int SIZE = 4 * 1024; + + MX mm[stride()]; // just storage in case of SOA + double *__restrict__ p = (double *)(mm); + + if (soa) { + for (unsigned int i = 0; i < SIZE; ++i) { + MapMX m(p + i); + genMatrix(m); + } + } else { + for (auto &m : mm) + genMatrix(m); + } + + std::cout << mm[SIZE / 2](1, 1) << std::endl; + + if (soa) + for (unsigned int i = 0; i < SIZE; ++i) { + MapMX m(p + i); + choleskyInversion::invert(m, m); + choleskyInversion::invert(m, m); + } + else + for (auto &m : mm) { + choleskyInversion::invert(m, m); + choleskyInversion::invert(m, m); + } + + std::cout << mm[SIZE / 2](1, 1) << std::endl; + + auto m_d = cuda::memory::device::make_unique(current_device, DIM * DIM * stride()); + cudaMemcpy(m_d.get(), (double const *)(mm), stride() * sizeof(MX), cudaMemcpyHostToDevice); + + constexpr int NKK = +#ifdef DOPROF + 2; +#else + 1000; +#endif + for (int kk = 0; kk < NKK; ++kk) { + int threadsPerBlock = 128; + int blocksPerGrid = SIZE / threadsPerBlock; + + delta -= (std::chrono::high_resolution_clock::now() - start); + + if (soa) + cuda::launch(invertSOA, {blocksPerGrid, threadsPerBlock}, m_d.get(), SIZE); + else + cuda::launch(invert, {blocksPerGrid, threadsPerBlock}, (MX *)(m_d.get()), SIZE); + + cudaMemcpy(&mm, m_d.get(), stride() * sizeof(MX), cudaMemcpyDeviceToHost); + + delta += (std::chrono::high_resolution_clock::now() - start); + + if (0 == kk) + std::cout << mm[SIZE / 2](1, 1) << std::endl; + + if (!soa) { + delta1 -= (std::chrono::high_resolution_clock::now() - start); + +#ifndef DOPROF + cuda::launch(invertSeq, {blocksPerGrid, threadsPerBlock}, (MX *)(m_d.get()), SIZE); + + cudaMemcpy(&mm, m_d.get(), stride() * sizeof(MX), cudaMemcpyDeviceToHost); + +#endif + delta1 += (std::chrono::high_resolution_clock::now() - start); + + if (0 == kk) + std::cout << mm[SIZE / 2](1, 1) << std::endl; + } + + delta2 -= (std::chrono::high_resolution_clock::now() - start); + if (soa) +#pragma GCC ivdep + for (unsigned int i = 0; i < SIZE; ++i) { + MapMX m(p + i); + choleskyInversion::invert(m, m); + } + else +#pragma GCC ivdep + for (auto &m : mm) { + choleskyInversion::invert(m, m); + } + + delta2 += (std::chrono::high_resolution_clock::now() - start); + } + + std::cout << mm[SIZE / 2](1, 1) << std::endl; + + double DNNK = NKK; + std::cout << "cuda/cudaSeq/x86 computation took " + << std::chrono::duration_cast(delta).count() / DNNK << ' ' + << std::chrono::duration_cast(delta1).count() / DNNK << ' ' + << std::chrono::duration_cast(delta2).count() / DNNK << ' ' << " ms" + << std::endl; +} + +int main() { + exitSansCUDADevices(); + + go<2>(false); + go<4>(false); + go<5>(false); + go<6>(false); + + go<2>(true); + go<4>(true); + go<5>(true); + go<6>(true); + + // go<10>(); + return 0; +} diff --git a/DataFormats/Math/test/cudaAtan2Test.cu b/DataFormats/Math/test/cudaAtan2Test.cu index c436801640b09..eec6020b44179 100644 --- a/DataFormats/Math/test/cudaAtan2Test.cu +++ b/DataFormats/Math/test/cudaAtan2Test.cu @@ -72,7 +72,7 @@ void go() { auto diff_d = cuda::memory::device::make_unique(current_device, 3); int diffs[3]; - cuda::memory::device::zero(diff_d.get(), 3 * 4); + cudaMemset(diff_d.get(), 0, 3 * 4); // Launch the diff CUDA Kernel dim3 threadsPerBlock(32, 32, 1); @@ -83,7 +83,7 @@ void go() { cudautils::launch(diffAtan, {blocksPerGrid, threadsPerBlock}, diff_d.get()); - cuda::memory::copy(diffs, diff_d.get(), 3 * 4); + cudaMemcpy(diffs, diff_d.get(), 3 * 4, cudaMemcpyDeviceToHost); delta += (std::chrono::high_resolution_clock::now() - start); float mdiff = diffs[0] * 1.e-7; diff --git a/DataFormats/Math/test/cudaMathTest.cu b/DataFormats/Math/test/cudaMathTest.cu index 1a77456a1c061..e443544e2e3c0 100644 --- a/DataFormats/Math/test/cudaMathTest.cu +++ b/DataFormats/Math/test/cudaMathTest.cu @@ -105,9 +105,9 @@ void go() { auto d_A = cuda::memory::device::make_unique(current_device, numElements); auto d_B = cuda::memory::device::make_unique(current_device, numElements); auto d_C = cuda::memory::device::make_unique(current_device, numElements); - - cuda::memory::copy(d_A.get(), h_A.get(), size); - cuda::memory::copy(d_B.get(), h_B.get(), size); + + cudaMemcpy(d_A.get(), h_A.get(), size, cudaMemcpyHostToDevice); + cudaMemcpy(d_B.get(), h_B.get(), size, cudaMemcpyHostToDevice); delta += (std::chrono::high_resolution_clock::now() - start); std::cout << "cuda alloc+copy took " << std::chrono::duration_cast(delta).count() << " ms" << std::endl; @@ -130,7 +130,7 @@ void go() { << std::endl; delta -= (std::chrono::high_resolution_clock::now() - start); - cuda::memory::copy(h_C.get(), d_C.get(), size); + cudaMemcpy(h_C.get(), d_C.get(), size, cudaMemcpyDeviceToHost); delta += (std::chrono::high_resolution_clock::now() - start); std::cout << "cuda copy back took " << std::chrono::duration_cast(delta).count() << " ms" << std::endl; diff --git a/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc b/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc index 5e6d67b24e479..bab3d0a347509 100644 --- a/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc +++ b/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc @@ -31,8 +31,7 @@ namespace cudatest { namespace { std::unique_ptr> produce(int device, int* d, int* h) { auto ctx = cudatest::TestCUDAScopedContext::make(device, true); - - cuda::memory::async::copy(d, h, sizeof(int), ctx.stream()); + cudaMemcpyAsync(d, h, sizeof(int), cudaMemcpyHostToDevice, ctx.stream()); testCUDAScopedContextKernels_single(d, ctx.stream()); return ctx.wrap(d); } @@ -116,9 +115,10 @@ TEST_CASE("Use of CUDAScopedContext", "[CUDACore]") { h_a1 = 0; h_a2 = 0; int h_a3 = 0; - cuda::memory::async::copy(&h_a1, d_a1.get(), sizeof(int), ctx.stream()); - cuda::memory::async::copy(&h_a2, d_a2.get(), sizeof(int), ctx.stream()); - cuda::memory::async::copy(&h_a3, d_a3.get(), sizeof(int), ctx.stream()); + + cudaMemcpyAsync(&h_a1, d_a1.get(), sizeof(int), cudaMemcpyDeviceToHost, ctx.stream()); + cudaMemcpyAsync(&h_a2, d_a2.get(), sizeof(int), cudaMemcpyDeviceToHost, ctx.stream()); + cudaMemcpyAsync(&h_a3, d_a3.get(), sizeof(int), cudaMemcpyDeviceToHost, ctx.stream()); REQUIRE(h_a1 == 2); REQUIRE(h_a2 == 4); diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEW.cc b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEW.cc index abded61899096..95af5ba5c0685 100644 --- a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEW.cc +++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEW.cc @@ -67,8 +67,7 @@ void TestCUDAProducerGPUEW::acquire(const edm::Event& iEvent, // Mimick the need to transfer some of the GPU data back to CPU to // be used for something within this module, or to be put in the // event. - cuda::memory::async::copy(hostData_.get(), devicePtr_.get() + 10, sizeof(float), ctx.stream()); - + cudaMemcpyAsync(hostData_.get(), devicePtr_.get() + 10, sizeof(float), cudaMemcpyDeviceToHost, ctx.stream()); edm::LogVerbatim("TestCUDAProducerGPUEW") << label_ << " TestCUDAProducerGPUEW::acquire end event " << iEvent.id().event() << " stream " << iEvent.streamID(); } diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEWTask.cc b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEWTask.cc index 2b628ae93051e..e38322b7f5713 100644 --- a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEWTask.cc +++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEWTask.cc @@ -75,8 +75,7 @@ void TestCUDAProducerGPUEWTask::acquire(const edm::Event& iEvent, // Mimick the need to transfer some of the GPU data back to CPU to // be used for something within this module, or to be put in the // event. - cuda::memory::async::copy(hostData_.get(), devicePtr_.get() + 10, sizeof(float), ctx.stream()); - + cudaMemcpyAsync(hostData_.get(), devicePtr_.get() + 10, sizeof(float), cudaMemcpyDeviceToHost, ctx.stream()); // Push a task to run addSimpleWork() after the asynchronous work // (and acquire()) has finished instead of produce() ctx.pushNextTask([iev = iEvent.id().event(), istr = iEvent.streamID(), this](CUDAScopedContextTask ctx) { @@ -94,7 +93,7 @@ void TestCUDAProducerGPUEWTask::addSimpleWork(edm::EventNumber_t eventID, edm::LogVerbatim("TestCUDAProducerGPUEWTask") << label_ << " TestCUDAProducerGPUEWTask::addSimpleWork begin event " << eventID << " stream " << streamID << " 10th element " << *hostData_ << " not satisfied, queueing more work"; - cuda::memory::async::copy(hostData_.get(), devicePtr_.get() + 10, sizeof(float), ctx.stream()); + cudaMemcpyAsync(hostData_.get(), devicePtr_.get() + 10, sizeof(float), cudaMemcpyDeviceToHost, ctx.stream()); ctx.pushNextTask([eventID, streamID, this](CUDAScopedContextTask ctx) { addSimpleWork(eventID, streamID, ctx); }); gpuAlgo_.runSimpleAlgo(devicePtr_.get(), ctx.stream()); diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUKernel.cu b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUKernel.cu index 73a6615259ebc..0015cdcdc7bb6 100644 --- a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUKernel.cu +++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUKernel.cu @@ -69,7 +69,7 @@ cudautils::device::unique_ptr TestCUDAProducerGPUKernel::runAlgo(const // First make the sanity check if (d_input != nullptr) { auto h_check = std::make_unique(NUM_VALUES); - cuda::memory::copy(h_check.get(), d_input, NUM_VALUES * sizeof(float)); + cudaMemcpy(h_check.get(), d_input, NUM_VALUES* sizeof(float), cudaMemcpyDeviceToHost); for (int i = 0; i < NUM_VALUES; ++i) { if (h_check[i] != i) { throw cms::Exception("Assert") << "Sanity check on element " << i << " failed, expected " << i << " got " @@ -88,10 +88,10 @@ cudautils::device::unique_ptr TestCUDAProducerGPUKernel::runAlgo(const auto d_a = cudautils::make_device_unique(NUM_VALUES, stream); auto d_b = cudautils::make_device_unique(NUM_VALUES, stream); - - cuda::memory::async::copy(d_a.get(), h_a.get(), NUM_VALUES * sizeof(float), stream); - cuda::memory::async::copy(d_b.get(), h_b.get(), NUM_VALUES * sizeof(float), stream); - + + cudaMemcpyAsync(d_a.get(), h_a.get(), NUM_VALUES * sizeof(float), cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(d_b.get(), h_b.get(), NUM_VALUES * sizeof(float), cudaMemcpyHostToDevice, stream); + int threadsPerBlock{32}; int blocksPerGrid = (NUM_VALUES + threadsPerBlock - 1) / threadsPerBlock; diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUtoCPU.cc b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUtoCPU.cc index 06f099073e1ea..73f2123521dc2 100644 --- a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUtoCPU.cc +++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUtoCPU.cc @@ -58,8 +58,7 @@ void TestCUDAProducerGPUtoCPU::acquire(const edm::Event& iEvent, buffer_ = cudautils::make_host_unique(TestCUDAProducerGPUKernel::NUM_VALUES, ctx.stream()); // Enqueue async copy, continue in produce once finished - cuda::memory::async::copy( - buffer_.get(), device.get(), TestCUDAProducerGPUKernel::NUM_VALUES * sizeof(float), ctx.stream()); + cudaMemcpyAsync(buffer_.get(), device.get(), TestCUDAProducerGPUKernel::NUM_VALUES * sizeof(float), cudaMemcpyDeviceToHost, ctx.stream()); edm::LogVerbatim("TestCUDAProducerGPUtoCPU") << label_ << " TestCUDAProducerGPUtoCPU::acquire end event " << iEvent.id().event() << " stream " << iEvent.streamID(); diff --git a/HeterogeneousCore/CUDATest/test/test_TestCUDAProducerGPUFirst.cc b/HeterogeneousCore/CUDATest/test/test_TestCUDAProducerGPUFirst.cc index 3ab110806ee78..8c3ae29a8b343 100644 --- a/HeterogeneousCore/CUDATest/test/test_TestCUDAProducerGPUFirst.cc +++ b/HeterogeneousCore/CUDATest/test/test_TestCUDAProducerGPUFirst.cc @@ -74,8 +74,8 @@ process.moduleToTest(process.toTest) REQUIRE(data != nullptr); float firstElements[10]; - cuda::memory::async::copy(firstElements, data, sizeof(float) * 10, prod->stream()); - + cudaMemcpyAsync(firstElements, data, sizeof(float) * 10, cudaMemcpyDeviceToHost, prod->stream()); + std::cout << "Synchronizing with CUDA stream" << std::endl; auto stream = prod->stream(); cudaCheck(cudaStreamSynchronize(stream)); diff --git a/HeterogeneousCore/CUDAUtilities/interface/copyAsync.h b/HeterogeneousCore/CUDAUtilities/interface/copyAsync.h index a327402b2f3d3..1f827c4d6bb0e 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/copyAsync.h +++ b/HeterogeneousCore/CUDAUtilities/interface/copyAsync.h @@ -17,7 +17,7 @@ namespace cudautils { // Shouldn't compile for array types because of sizeof(T), but // let's add an assert with a more helpful message static_assert(std::is_array::value == false, "For array types, use the other overload with the size parameter"); - cuda::memory::async::copy(dst.get(), src.get(), sizeof(T), stream); + cudaMemcpyAsync(dst.get(), src.get(), sizeof(T), cudaMemcpyHostToDevice, stream); } template @@ -25,7 +25,7 @@ namespace cudautils { const cudautils::device::unique_ptr& src, cudaStream_t stream) { static_assert(std::is_array::value == false, "For array types, use the other overload with the size parameter"); - cuda::memory::async::copy(dst.get(), src.get(), sizeof(T), stream); + cudaMemcpyAsync(dst.get(), src.get(), sizeof(T), cudaMemcpyHostToDevice, stream); } // Multiple elements @@ -34,7 +34,7 @@ namespace cudautils { const cudautils::host::unique_ptr& src, size_t nelements, cudaStream_t stream) { - cuda::memory::async::copy(dst.get(), src.get(), nelements * sizeof(T), stream); + cudaMemcpyAsync(dst.get(), src.get(), nelements * sizeof(T), cudaMemcpyHostToDevice, stream); } template @@ -42,7 +42,7 @@ namespace cudautils { const cudautils::device::unique_ptr& src, size_t nelements, cudaStream_t stream) { - cuda::memory::async::copy(dst.get(), src.get(), nelements * sizeof(T), stream); + cudaMemcpyAsync(dst.get(), src.get(), nelements * sizeof(T), cudaMemcpyHostToDevice, stream); } } // namespace cudautils diff --git a/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu b/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu index e3ac77f1556a4..3db8b611012c6 100644 --- a/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu +++ b/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu @@ -25,8 +25,8 @@ void go() { constexpr int N = 12000; T v[N]; auto v_d = cuda::memory::device::make_unique(current_device, N); - - cuda::memory::copy(v_d.get(), v, N * sizeof(T)); + + cudaMemcpy(v_d.get(), v, N * sizeof(T), cudaMemcpyHostToDevice); constexpr uint32_t nParts = 10; constexpr uint32_t partSize = N / nParts; @@ -65,7 +65,7 @@ void go() { offsets[10] = 3297 + offsets[9]; } - cuda::memory::copy(off_d.get(), offsets, 4 * (nParts + 1)); + cudaMemcpy(off_d.get(), offsets, 4 * (nParts + 1), cudaMemcpyHostToDevice); for (long long j = 0; j < N; j++) v[j] = rgen(eng); @@ -75,11 +75,10 @@ void go() { v[j] = sizeof(T) == 1 ? 22 : 3456; } - cuda::memory::copy(v_d.get(), v, N * sizeof(T)); + cudaMemcpy(v_d.get(), v, N * sizeof(T), cudaMemcpyHostToDevice); cudautils::fillManyFromVector(h_d.get(), ws_d.get(), nParts, v_d.get(), off_d.get(), offsets[10], 256, 0); - - cuda::memory::copy(&h, h_d.get(), sizeof(Hist)); + cudaMemcpy(&h, h_d.get(), sizeof(Hist), cudaMemcpyDeviceToHost); assert(0 == h.off[0]); assert(offsets[10] == h.size()); diff --git a/HeterogeneousCore/CUDAUtilities/test/OneHistoContainer_t.cu b/HeterogeneousCore/CUDAUtilities/test/OneHistoContainer_t.cu index 4514ddd55e20b..4307da7abcf23 100644 --- a/HeterogeneousCore/CUDAUtilities/test/OneHistoContainer_t.cu +++ b/HeterogeneousCore/CUDAUtilities/test/OneHistoContainer_t.cu @@ -131,7 +131,7 @@ void go() { assert(v_d.get()); assert(v); - cuda::memory::copy(v_d.get(), v, N * sizeof(T)); + cudaMemcpy(v_d.get(), v , N * sizeof(T), cudaMemcpyHostToDevice); assert(v_d.get()); cudautils::launch(mykernel, {1, 256}, v_d.get(), N); } diff --git a/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.h b/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.h index b2d85cf1cf732..d361e0b128c78 100644 --- a/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.h +++ b/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.h @@ -169,8 +169,8 @@ int main() { auto a_d = cuda::memory::device::make_unique(current_device, 1); auto sa_d = cuda::memory::device::make_unique(current_device, 1); auto ws_d = cuda::memory::device::make_unique(current_device, Assoc::wsSize()); - - cuda::memory::copy(v_d.get(), tr.data(), N * sizeof(std::array)); + + cudaMemcpy(v_d.get(), tr.data(), N * sizeof(std::array), cudaMemcpyHostToDevice); #else auto a_d = std::make_unique(); auto sa_d = std::make_unique(); @@ -198,7 +198,7 @@ int main() { Assoc la; #ifdef __CUDACC__ - cuda::memory::copy(&la, a_d.get(), sizeof(Assoc)); + cudaMemcpy(&la, a_d.get(), sizeof(Assoc), cudaMemcpyDeviceToHost); #else memcpy(&la, a_d.get(), sizeof(Assoc)); // not required, easier #endif @@ -231,7 +231,7 @@ int main() { cudautils::finalizeBulk<<>>(dc_d, a_d.get()); verifyBulk<<<1, 1>>>(a_d.get(), dc_d); - cuda::memory::copy(&la, a_d.get(), sizeof(Assoc)); + cudaMemcpy(&la, a_d.get(), sizeof(Assoc), cudaMemcpyDeviceToHost); cudaMemcpy(&dc, dc_d, sizeof(AtomicPairCounter), cudaMemcpyDeviceToHost); cudaMemset(dc_d, 0, sizeof(AtomicPairCounter)); diff --git a/HeterogeneousCore/CUDAUtilities/test/copyAsync_t.cpp b/HeterogeneousCore/CUDAUtilities/test/copyAsync_t.cpp index cec4df12ca99b..d29b41dffc7d3 100644 --- a/HeterogeneousCore/CUDAUtilities/test/copyAsync_t.cpp +++ b/HeterogeneousCore/CUDAUtilities/test/copyAsync_t.cpp @@ -21,8 +21,9 @@ TEST_CASE("copyAsync", "[cudaMemTools]") { auto host = cudautils::make_host_unique(stream); cudautils::copyAsync(device, host_orig, stream); - cuda::memory::async::copy(host.get(), device.get(), sizeof(int), stream); cudaCheck(cudaStreamSynchronize(stream)); + cudaMemcpyAsync(host.get(), device.get(), sizeof(int), cudaMemcpyDeviceToHost, stream); + stream.synchronize(); REQUIRE(*host == 42); } @@ -40,8 +41,9 @@ TEST_CASE("copyAsync", "[cudaMemTools]") { SECTION("Copy all") { cudautils::copyAsync(device, host_orig, N, stream); - cuda::memory::async::copy(host.get(), device.get(), N * sizeof(int), stream); cudaCheck(cudaStreamSynchronize(stream)); + cudaMemcpyAsync(host.get(), device.get(), N * sizeof(int), cudaMemcpyDeviceToHost, stream); + stream.synchronize(); for (int i = 0; i < N; ++i) { CHECK(host[i] == i); } @@ -53,8 +55,9 @@ TEST_CASE("copyAsync", "[cudaMemTools]") { SECTION("Copy some") { cudautils::copyAsync(device, host_orig, 42, stream); - cuda::memory::async::copy(host.get(), device.get(), 42 * sizeof(int), stream); cudaCheck(cudaStreamSynchronize(stream)); + cudaMemcpyAsync(host.get(), device.get(), 42 * sizeof(int), cudaMemcpyDeviceToHost, stream); + stream.synchronize(); for (int i = 0; i < 42; ++i) { CHECK(host[i] == 200 + i); } @@ -70,7 +73,7 @@ TEST_CASE("copyAsync", "[cudaMemTools]") { auto device = cudautils::make_device_unique(stream); auto host = cudautils::make_host_unique(stream); - cuda::memory::async::copy(device.get(), host_orig.get(), sizeof(int), stream); + cudaMemcpyAsync(device.get(), host_orig.get(), sizeof(int), cudaMemcpyHostToDevice, stream); cudautils::copyAsync(host, device, stream); cudaCheck(cudaStreamSynchronize(stream)); @@ -89,7 +92,7 @@ TEST_CASE("copyAsync", "[cudaMemTools]") { auto host = cudautils::make_host_unique(N, stream); SECTION("Copy all") { - cuda::memory::async::copy(device.get(), host_orig.get(), N * sizeof(int), stream); + cudaMemcpyAsync(device.get(), host_orig.get(), N * sizeof(int), cudaMemcpyHostToDevice, stream); cudautils::copyAsync(host, device, N, stream); cudaCheck(cudaStreamSynchronize(stream)); for (int i = 0; i < N; ++i) { @@ -102,7 +105,7 @@ TEST_CASE("copyAsync", "[cudaMemTools]") { } SECTION("Copy some") { - cuda::memory::async::copy(device.get(), host_orig.get(), 42 * sizeof(int), stream); + cudaMemcpyAsync(device.get(), host_orig.get(), 42 * sizeof(int), cudaMemcpyHostToDevice, stream); cudautils::copyAsync(host, device, 42, stream); cudaCheck(cudaStreamSynchronize(stream)); for (int i = 0; i < 42; ++i) { diff --git a/HeterogeneousCore/CUDAUtilities/test/radixSort_t.cu b/HeterogeneousCore/CUDAUtilities/test/radixSort_t.cu index 9be80547837d4..e1d0a42252482 100644 --- a/HeterogeneousCore/CUDAUtilities/test/radixSort_t.cu +++ b/HeterogeneousCore/CUDAUtilities/test/radixSort_t.cu @@ -101,9 +101,9 @@ void go(bool useShared) { auto ind_d = cuda::memory::device::make_unique(current_device, N); auto ws_d = cuda::memory::device::make_unique(current_device, N); auto off_d = cuda::memory::device::make_unique(current_device, blocks + 1); - - cuda::memory::copy(v_d.get(), v, N * sizeof(T)); - cuda::memory::copy(off_d.get(), offsets, 4 * (blocks + 1)); + + cudaMemcpy(v_d.get(), v , N * sizeof(T), cudaMemcpyHostToDevice); + cudaMemcpy(off_d.get(), offsets, 4 * (blocks+1), cudaMemcpyHostToDevice); if (i < 2) std::cout << "lauch for " << offsets[blocks] << std::endl; @@ -121,8 +121,7 @@ void go(bool useShared) { if (i == 0) std::cout << "done for " << offsets[blocks] << std::endl; - // cuda::memory::copy(v, v_d.get(), 2*N); - cuda::memory::copy(ind, ind_d.get(), 2 * N); + cudaMemcpy(ind, ind_d.get(), 2 * N, cudaMemcpyDeviceToHost); delta += (std::chrono::high_resolution_clock::now() - start); diff --git a/HeterogeneousCore/Producer/test/TestHeterogeneousEDProducerGPUHelpers.cu b/HeterogeneousCore/Producer/test/TestHeterogeneousEDProducerGPUHelpers.cu index 7e531a48b6b82..b28ee0cbc9949 100644 --- a/HeterogeneousCore/Producer/test/TestHeterogeneousEDProducerGPUHelpers.cu +++ b/HeterogeneousCore/Producer/test/TestHeterogeneousEDProducerGPUHelpers.cu @@ -6,7 +6,6 @@ #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/launch.h" #include "TestHeterogeneousEDProducerGPUHelpers.h" - // // Vector Addition Kernel // @@ -59,8 +58,8 @@ namespace { int TestAcceleratorServiceProducerGPUHelpers_simple_kernel(int input) { // Example from Viktor/cuda-api-wrappers - constexpr int NUM_VALUES = 10000; - + constexpr int NUM_VALUES = 10000; + auto current_device = cuda::device::current::get(); auto stream = current_device.create_stream(cuda::stream::implicitly_synchronizes_with_default_stream); @@ -77,8 +76,8 @@ int TestAcceleratorServiceProducerGPUHelpers_simple_kernel(int input) { auto d_b = cuda::memory::device::make_unique(current_device, NUM_VALUES); auto d_c = cuda::memory::device::make_unique(current_device, NUM_VALUES); - cuda::memory::async::copy(d_a.get(), h_a.get(), NUM_VALUES * sizeof(int), stream.id()); - cuda::memory::async::copy(d_b.get(), h_b.get(), NUM_VALUES * sizeof(int), stream.id()); + cudaMemcpyAsync(d_a.get(), h_a.get(), NUM_VALUES * sizeof(int), cudaMemcpyHostToDevice, stream.id()); + cudaMemcpyAsync(d_b.get(), h_b.get(), NUM_VALUES * sizeof(int), cudaMemcpyHostToDevice, stream.id()); int threadsPerBlock{256}; int blocksPerGrid = (NUM_VALUES + threadsPerBlock - 1) / threadsPerBlock; @@ -90,8 +89,8 @@ int TestAcceleratorServiceProducerGPUHelpers_simple_kernel(int input) { cudautils::launch(vectorAdd, {blocksPerGrid, threadsPerBlock}, d_a.get(), d_b.get(), d_c.get(), NUM_VALUES); */ - - cuda::memory::async::copy(h_c.get(), d_c.get(), NUM_VALUES * sizeof(int), stream.id()); + + cudaMemcpyAsync(h_c.get(), d_c.get(), NUM_VALUES * sizeof(int), cudaMemcpyDeviceToHost, stream.id()); stream.synchronize(); @@ -111,6 +110,7 @@ TestHeterogeneousEDProducerGPUTask::TestHeterogeneousEDProducerGPUTask() { h_a = cuda::memory::host::make_unique(NUM_VALUES); h_b = cuda::memory::host::make_unique(NUM_VALUES); + auto current_device = cuda::device::current::get(); d_b = cuda::memory::device::make_unique(current_device, NUM_VALUES); @@ -124,7 +124,8 @@ TestHeterogeneousEDProducerGPUTask::ResultType TestHeterogeneousEDProducerGPUTas // First make the sanity check if (inputArrays.first != nullptr) { auto h_check = std::make_unique(NUM_VALUES); - cuda::memory::copy(h_check.get(), inputArrays.first, NUM_VALUES * sizeof(float)); + cudaMemcpy(h_check.get(), inputArrays.first, NUM_VALUES * sizeof(float), cudaMemcpyDeviceToHost); + for (int i = 0; i < NUM_VALUES; ++i) { if (h_check[i] != i) { throw cms::Exception("Assert") << "Sanity check on element " << i << " failed, expected " << i << " got " @@ -139,15 +140,17 @@ TestHeterogeneousEDProducerGPUTask::ResultType TestHeterogeneousEDProducerGPUTas } auto current_device = cuda::device::current::get(); + auto d_a = cuda::memory::device::make_unique(current_device, NUM_VALUES); auto d_c = cuda::memory::device::make_unique(current_device, NUM_VALUES); if (inputArrays.second != nullptr) { + d_d = cuda::memory::device::make_unique(current_device, NUM_VALUES); } // Create stream - cuda::memory::async::copy(d_a.get(), h_a.get(), NUM_VALUES * sizeof(float), stream.id()); - cuda::memory::async::copy(d_b.get(), h_b.get(), NUM_VALUES * sizeof(float), stream.id()); + cudaMemcpyAsync(d_a.get(), h_a.get(), NUM_VALUES * sizeof(float), cudaMemcpyHostToDevice, stream.id()); + cudaMemcpyAsync(d_b.get(), h_b.get(), NUM_VALUES * sizeof(float), cudaMemcpyHostToDevice, stream.id()); int threadsPerBlock{32}; int blocksPerGrid = (NUM_VALUES + threadsPerBlock - 1) / threadsPerBlock; @@ -195,7 +198,7 @@ void TestHeterogeneousEDProducerGPUTask::release(const std::string &label, cuda: int TestHeterogeneousEDProducerGPUTask::getResult(const ResultTypeRaw &d_ac, cuda::stream_t<> &stream) { auto h_c = cuda::memory::host::make_unique(NUM_VALUES); - cuda::memory::async::copy(h_c.get(), d_ac.second, NUM_VALUES * sizeof(int), stream.id()); + cudaMemcpyAsync(h_c.get(), d_ac.second, NUM_VALUES * sizeof(int), cudaMemcpyDeviceToHost, stream.id()); stream.synchronize(); float ret = 0; diff --git a/RecoLocalTracker/SiPixelClusterizer/src/SiPixelFedCablingMapGPUWrapper.cc b/RecoLocalTracker/SiPixelClusterizer/src/SiPixelFedCablingMapGPUWrapper.cc index 4a554b47bbf63..d484d9eeadea9 100644 --- a/RecoLocalTracker/SiPixelClusterizer/src/SiPixelFedCablingMapGPUWrapper.cc +++ b/RecoLocalTracker/SiPixelClusterizer/src/SiPixelFedCablingMapGPUWrapper.cc @@ -155,9 +155,9 @@ cudautils::device::unique_ptr SiPixelFedCablingMapGPUWrapper::g } } } - - cuda::memory::async::copy( - modToUnpDevice.get(), modToUnpHost.get(), pixelgpudetails::MAX_SIZE * sizeof(unsigned char), cudaStream); + + cudaMemcpyAsync( + modToUnpDevice.get(), modToUnpHost.get(), pixelgpudetails::MAX_SIZE * sizeof(unsigned char), cudaMemcpyHostToDevice, cudaStream); return modToUnpDevice; } diff --git a/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering_t.h b/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering_t.h index 4f7ced9b7e309..d65ca1033d454 100644 --- a/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering_t.h +++ b/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering_t.h @@ -252,12 +252,12 @@ int main(void) { size_t size16 = n * sizeof(unsigned short); // size_t size8 = n * sizeof(uint8_t); - cuda::memory::copy(d_moduleStart.get(), &nModules, sizeof(uint32_t)); + cudaMemcpy(d_moduleStart.get(), &nModules, sizeof(uint32_t), cudaMemcpyHostToDevice); - cuda::memory::copy(d_id.get(), h_id.get(), size16); - cuda::memory::copy(d_x.get(), h_x.get(), size16); - cuda::memory::copy(d_y.get(), h_y.get(), size16); - cuda::memory::copy(d_adc.get(), h_adc.get(), size16); + cudaMemcpy(d_id.get(), h_id.get(), size16, cudaMemcpyHostToDevice); + cudaMemcpy(d_x.get(), h_x.get(), size16, cudaMemcpyHostToDevice); + cudaMemcpy(d_y.get(), h_y.get(), size16, cudaMemcpyHostToDevice); + cudaMemcpy(d_adc.get(), h_adc.get(), size16, cudaMemcpyHostToDevice); // Launch CUDA Kernels int threadsPerBlock = (kkk == 5) ? 512 : ((kkk == 3) ? 128 : 256); int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; @@ -270,8 +270,7 @@ int main(void) { std::cout << "CUDA findModules kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads\n"; - - cuda::memory::device::zero(d_clusInModule.get(), MaxNumModules * sizeof(uint32_t)); + cudaMemset(d_clusInModule.get(), 0, MaxNumModules * sizeof(uint32_t)); cudautils::launch(findClus, {blocksPerGrid, threadsPerBlock}, @@ -284,12 +283,10 @@ int main(void) { d_clus.get(), n); cudaDeviceSynchronize(); - - cuda::memory::copy(&nModules, d_moduleStart.get(), sizeof(uint32_t)); + cudaMemcpy(&nModules, d_moduleStart.get(), sizeof(uint32_t), cudaMemcpyDeviceToHost); uint32_t nclus[MaxNumModules], moduleId[nModules]; - - cuda::memory::copy(&nclus, d_clusInModule.get(), MaxNumModules * sizeof(uint32_t)); + cudaMemcpy(&nclus, d_clusInModule.get(), MaxNumModules * sizeof(uint32_t), cudaMemcpyDeviceToHost); std::cout << "before charge cut found " << std::accumulate(nclus, nclus + MaxNumModules, 0) << " clusters" << std::endl; @@ -354,10 +351,10 @@ int main(void) { std::cout << "found " << nModules << " Modules active" << std::endl; #ifdef __CUDACC__ - cuda::memory::copy(h_id.get(), d_id.get(), size16); - cuda::memory::copy(h_clus.get(), d_clus.get(), size32); - cuda::memory::copy(&nclus, d_clusInModule.get(), MaxNumModules * sizeof(uint32_t)); - cuda::memory::copy(&moduleId, d_moduleId.get(), nModules * sizeof(uint32_t)); + cudaMemcpy(h_id.get(), d_id.get(), size16, cudaMemcpyDeviceToHost); + cudaMemcpy(h_clus.get(), d_clus.get(), size16, cudaMemcpyDeviceToHost); + cudaMemcpy(&nclus, d_clusInModule.get(), MaxNumModules * sizeof(uint32_t), cudaMemcpyDeviceToHost); + cudaMemcpy(&moduleId, d_moduleId.get(), nModules * sizeof(uint32_t), cudaMemcpyDeviceToHost); #endif std::set clids; diff --git a/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h b/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h index 0df7af362ac0d..2313bbe7803bd 100644 --- a/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h +++ b/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h @@ -126,10 +126,10 @@ int main() { std::cout << "v,t size " << ev.zvert.size() << ' ' << ev.ztrack.size() << std::endl; auto nt = ev.ztrack.size(); #ifdef __CUDACC__ - cuda::memory::copy(LOC_WS(ntrks), &nt, sizeof(uint32_t)); - cuda::memory::copy(LOC_WS(zt), ev.ztrack.data(), sizeof(float) * ev.ztrack.size()); - cuda::memory::copy(LOC_WS(ezt2), ev.eztrack.data(), sizeof(float) * ev.eztrack.size()); - cuda::memory::copy(LOC_WS(ptt2), ev.pttrack.data(), sizeof(float) * ev.eztrack.size()); + cudaMemcpy(LOC_WS(ntrks), &nt, sizeof(uint32_t), cudaMemcpyHostToDevice); + cudaMemcpy(LOC_WS(zt), ev.ztrack.data(), sizeof(float) * ev.ztrack.size(), cudaMemcpyHostToDevice); + cudaMemcpy(LOC_WS(ezt2), ev.eztrack.data(), sizeof(float) * ev.eztrack.size(), cudaMemcpyHostToDevice); + cudaMemcpy(LOC_WS(ptt2), ev.pttrack.data(), sizeof(float) * ev.eztrack.size(), cudaMemcpyHostToDevice); #else ::memcpy(LOC_WS(ntrks), &nt, sizeof(uint32_t)); ::memcpy(LOC_WS(zt), ev.ztrack.data(), sizeof(float) * ev.ztrack.size()); @@ -162,7 +162,7 @@ int main() { cudautils::launch(fitVertices, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 50.f); cudaCheck(cudaGetLastError()); - cuda::memory::copy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t)); + cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost); #else print(onGPU_d.get(), ws_d.get()); @@ -207,8 +207,8 @@ int main() { #endif #ifdef __CUDACC__ - cuda::memory::copy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t)); - cuda::memory::copy(chi2, LOC_ONGPU(chi2), nv * sizeof(float)); + cudaMemcpy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t), cudaMemcpyDeviceToHost); + cudaMemcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float), cudaMemcpyDeviceToHost); #else memcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float)); #endif @@ -223,9 +223,9 @@ int main() { #ifdef __CUDACC__ cudautils::launch(fitVertices, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 50.f); - cuda::memory::copy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t)); - cuda::memory::copy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t)); - cuda::memory::copy(chi2, LOC_ONGPU(chi2), nv * sizeof(float)); + cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost); + cudaMemcpy(nn, LOC_ONGPU(ndof), nv * sizeof(uint32_t), cudaMemcpyDeviceToHost); + cudaMemcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float), cudaMemcpyDeviceToHost); #else fitVertices(onGPU_d.get(), ws_d.get(), 50.f); nv = onGPU_d->nvFinal; @@ -243,7 +243,7 @@ int main() { #ifdef __CUDACC__ // one vertex per block!!! cudautils::launch(splitVertices, {1024, 64}, onGPU_d.get(), ws_d.get(), 9.f); - cuda::memory::copy(&nv, LOC_WS(nvIntermediate), sizeof(uint32_t)); + cudaMemcpy(&nv, LOC_WS(nvIntermediate), sizeof(uint32_t), cudaMemcpyDeviceToHost); #else gridDim.x = 1024; // nv ???? assert(blockIdx.x == 0); @@ -260,7 +260,7 @@ int main() { cudautils::launch(sortByPt2, {1, 256}, onGPU_d.get(), ws_d.get()); cudaCheck(cudaGetLastError()); - cuda::memory::copy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t)); + cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost); #else fitVertices(onGPU_d.get(), ws_d.get(), 5000.f); sortByPt2(onGPU_d.get(), ws_d.get()); @@ -274,12 +274,12 @@ int main() { } #ifdef __CUDACC__ - cuda::memory::copy(zv, LOC_ONGPU(zv), nv * sizeof(float)); - cuda::memory::copy(wv, LOC_ONGPU(wv), nv * sizeof(float)); - cuda::memory::copy(chi2, LOC_ONGPU(chi2), nv * sizeof(float)); - cuda::memory::copy(ptv2, LOC_ONGPU(ptv2), nv * sizeof(float)); - cuda::memory::copy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t)); - cuda::memory::copy(ind, LOC_ONGPU(sortInd), nv * sizeof(uint16_t)); + cudaMemcpy(zv, LOC_ONGPU(zv), nv * sizeof(float), cudaMemcpyDeviceToHost); + cudaMemcpy(wv, LOC_ONGPU(wv), nv * sizeof(float), cudaMemcpyDeviceToHost); + cudaMemcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float), cudaMemcpyDeviceToHost); + cudaMemcpy(ptv2, LOC_ONGPU(ptv2), nv * sizeof(float), cudaMemcpyDeviceToHost); + cudaMemcpy(nn , LOC_ONGPU(ndof), nv * sizeof(int32_t), cudaMemcpyDeviceToHost); + cudaMemcpy(ind ,LOC_ONGPU(sortInd), nv * sizeof(uint16_t), cudaMemcpyDeviceToHost); #endif for (auto j = 0U; j < nv; ++j) if (nn[j] > 0) From ede0cdd9bd90a2c9d3a4e0fb23ad4b92fa918eef Mon Sep 17 00:00:00 2001 From: Wahid Redjeb Date: Sat, 26 Oct 2019 22:51:01 +0200 Subject: [PATCH 02/13] Solve conflicts with #389 --- .../Math/test/CholeskyInvert_t_BACKUP_195321.cu | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/DataFormats/Math/test/CholeskyInvert_t_BACKUP_195321.cu b/DataFormats/Math/test/CholeskyInvert_t_BACKUP_195321.cu index e5c714acea1f5..bad32accd2d00 100644 --- a/DataFormats/Math/test/CholeskyInvert_t_BACKUP_195321.cu +++ b/DataFormats/Math/test/CholeskyInvert_t_BACKUP_195321.cu @@ -149,13 +149,8 @@ void go(bool soa) { if (soa) cudautils::launch(invertSOA, {blocksPerGrid, threadsPerBlock}, m_d.get(), SIZE); else -<<<<<<< HEAD cudautils::launch(invert, {blocksPerGrid, threadsPerBlock}, (MX *)(m_d.get()), SIZE); -======= - cuda::launch(invert, {blocksPerGrid, threadsPerBlock}, (MX *)(m_d.get()), SIZE); - cudaMemcpy(&mm, m_d.get(), stride() * sizeof(MX), cudaMemcpyDeviceToHost); ->>>>>>> Replace cuda::memory[::async]::copy() with cudaMemcpy[Async](), cuda::memory[::async]::zero() and delta += (std::chrono::high_resolution_clock::now() - start); @@ -166,13 +161,8 @@ void go(bool soa) { delta1 -= (std::chrono::high_resolution_clock::now() - start); #ifndef DOPROF -<<<<<<< HEAD cudautils::launch(invertSeq, {blocksPerGrid, threadsPerBlock}, (MX *)(m_d.get()), SIZE); -======= - cuda::launch(invertSeq, {blocksPerGrid, threadsPerBlock}, (MX *)(m_d.get()), SIZE); - cudaMemcpy(&mm, m_d.get(), stride() * sizeof(MX), cudaMemcpyDeviceToHost); ->>>>>>> Replace cuda::memory[::async]::copy() with cudaMemcpy[Async](), cuda::memory[::async]::zero() and #endif delta1 += (std::chrono::high_resolution_clock::now() - start); From a731751a0dd965f5a18b69d94ed11f2d20ba9f62 Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Sat, 26 Oct 2019 23:41:03 +0200 Subject: [PATCH 03/13] Delete spurious file --- .../test/CholeskyInvert_t_BACKUP_195321.cu | 215 ------------------ 1 file changed, 215 deletions(-) delete mode 100644 DataFormats/Math/test/CholeskyInvert_t_BACKUP_195321.cu diff --git a/DataFormats/Math/test/CholeskyInvert_t_BACKUP_195321.cu b/DataFormats/Math/test/CholeskyInvert_t_BACKUP_195321.cu deleted file mode 100644 index bad32accd2d00..0000000000000 --- a/DataFormats/Math/test/CholeskyInvert_t_BACKUP_195321.cu +++ /dev/null @@ -1,215 +0,0 @@ -// nvcc -O3 CholeskyDecomp_t.cu -Icuda-api-wrappers/src/ --expt-relaxed-constexpr -gencode arch=compute_61,code=sm_61 --compiler-options="-Ofast -march=native" -// add -DDOPROF to run nvprof --metrics all - -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include - -#include "DataFormats/Math/interface/choleskyInversion.h" -#include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h" -#include "HeterogeneousCore/CUDAUtilities/interface/launch.h" - -constexpr int stride() { return 5 * 1024; } -template -using MXN = Eigen::Matrix; -template -using MapMX = Eigen::Map, 0, Eigen::Stride>; - -template -__global__ void invertSOA(double *__restrict__ p, unsigned int n) { - auto i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= n) - return; - - MapMX m(p + i); - choleskyInversion::invert(m, m); -} - -template -__global__ void invert(M *mm, unsigned int n) { - auto i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= n) - return; - - auto &m = mm[i]; - choleskyInversion::invert(m, m); -} - -template -__global__ void invertSeq(M *mm, unsigned int n) { - if (threadIdx.x != 0) - return; - auto first = blockIdx.x * blockDim.x; - auto last = std::min(first + blockDim.x, n); - - for (auto i = first; i < last; ++i) { - auto &m = mm[i]; - choleskyInversion::invert(m, m); - } -} - -// generate matrices -template -void genMatrix(M &m) { - using T = typename std::remove_reference::type; - int n = M::ColsAtCompileTime; - std::mt19937 eng; - // std::mt19937 eng2; - std::uniform_real_distribution rgen(0., 1.); - - // generate first diagonal elemets - for (int i = 0; i < n; ++i) { - double maxVal = i * 10000 / (n - 1) + 1; // max condition is 10^4 - m(i, i) = maxVal * rgen(eng); - } - for (int i = 0; i < n; ++i) { - for (int j = 0; j < i; ++j) { - double v = 0.3 * std::sqrt(m(i, i) * m(j, j)); // this makes the matrix pos defined - m(i, j) = v * rgen(eng); - m(j, i) = m(i, j); - } - } -} - -template -void go(bool soa) { - constexpr unsigned int DIM = N; - using MX = MXN; - std::cout << "testing Matrix of dimension " << DIM << " size " << sizeof(MX) << std::endl; - - auto start = std::chrono::high_resolution_clock::now(); - auto delta = start - start; - auto delta1 = delta; - auto delta2 = delta; - - if (cuda::device::count() == 0) { - std::cerr << "No CUDA devices on this system" - << "\n"; - exit(EXIT_FAILURE); - } - - auto current_device = cuda::device::current::get(); - - constexpr unsigned int SIZE = 4 * 1024; - - MX mm[stride()]; // just storage in case of SOA - double *__restrict__ p = (double *)(mm); - - if (soa) { - for (unsigned int i = 0; i < SIZE; ++i) { - MapMX m(p + i); - genMatrix(m); - } - } else { - for (auto &m : mm) - genMatrix(m); - } - - std::cout << mm[SIZE / 2](1, 1) << std::endl; - - if (soa) - for (unsigned int i = 0; i < SIZE; ++i) { - MapMX m(p + i); - choleskyInversion::invert(m, m); - choleskyInversion::invert(m, m); - } - else - for (auto &m : mm) { - choleskyInversion::invert(m, m); - choleskyInversion::invert(m, m); - } - - std::cout << mm[SIZE / 2](1, 1) << std::endl; - - auto m_d = cuda::memory::device::make_unique(current_device, DIM * DIM * stride()); - cudaMemcpy(m_d.get(), (double const *)(mm), stride() * sizeof(MX), cudaMemcpyHostToDevice); - - constexpr int NKK = -#ifdef DOPROF - 2; -#else - 1000; -#endif - for (int kk = 0; kk < NKK; ++kk) { - int threadsPerBlock = 128; - int blocksPerGrid = SIZE / threadsPerBlock; - - delta -= (std::chrono::high_resolution_clock::now() - start); - - if (soa) - cudautils::launch(invertSOA, {blocksPerGrid, threadsPerBlock}, m_d.get(), SIZE); - else - cudautils::launch(invert, {blocksPerGrid, threadsPerBlock}, (MX *)(m_d.get()), SIZE); - cudaMemcpy(&mm, m_d.get(), stride() * sizeof(MX), cudaMemcpyDeviceToHost); - - delta += (std::chrono::high_resolution_clock::now() - start); - - if (0 == kk) - std::cout << mm[SIZE / 2](1, 1) << std::endl; - - if (!soa) { - delta1 -= (std::chrono::high_resolution_clock::now() - start); - -#ifndef DOPROF - cudautils::launch(invertSeq, {blocksPerGrid, threadsPerBlock}, (MX *)(m_d.get()), SIZE); - cudaMemcpy(&mm, m_d.get(), stride() * sizeof(MX), cudaMemcpyDeviceToHost); - -#endif - delta1 += (std::chrono::high_resolution_clock::now() - start); - - if (0 == kk) - std::cout << mm[SIZE / 2](1, 1) << std::endl; - } - - delta2 -= (std::chrono::high_resolution_clock::now() - start); - if (soa) -#pragma GCC ivdep - for (unsigned int i = 0; i < SIZE; ++i) { - MapMX m(p + i); - choleskyInversion::invert(m, m); - } - else -#pragma GCC ivdep - for (auto &m : mm) { - choleskyInversion::invert(m, m); - } - - delta2 += (std::chrono::high_resolution_clock::now() - start); - } - - std::cout << mm[SIZE / 2](1, 1) << std::endl; - - double DNNK = NKK; - std::cout << "cuda/cudaSeq/x86 computation took " - << std::chrono::duration_cast(delta).count() / DNNK << ' ' - << std::chrono::duration_cast(delta1).count() / DNNK << ' ' - << std::chrono::duration_cast(delta2).count() / DNNK << ' ' << " ms" - << std::endl; -} - -int main() { - exitSansCUDADevices(); - - go<2>(false); - go<4>(false); - go<5>(false); - go<6>(false); - - go<2>(true); - go<4>(true); - go<5>(true); - go<6>(true); - - // go<10>(); - return 0; -} From c00e7bd769ad7e932a7c89daab5c34654cd37dc3 Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Sat, 26 Oct 2019 23:41:17 +0200 Subject: [PATCH 04/13] Delete spurious file --- .../Math/test/CholeskyInvert_t_BASE_195321.cu | 214 ------------------ 1 file changed, 214 deletions(-) delete mode 100644 DataFormats/Math/test/CholeskyInvert_t_BASE_195321.cu diff --git a/DataFormats/Math/test/CholeskyInvert_t_BASE_195321.cu b/DataFormats/Math/test/CholeskyInvert_t_BASE_195321.cu deleted file mode 100644 index 6c1915126073f..0000000000000 --- a/DataFormats/Math/test/CholeskyInvert_t_BASE_195321.cu +++ /dev/null @@ -1,214 +0,0 @@ -// nvcc -O3 CholeskyDecomp_t.cu -Icuda-api-wrappers/src/ --expt-relaxed-constexpr -gencode arch=compute_61,code=sm_61 --compiler-options="-Ofast -march=native" -// add -DDOPROF to run nvprof --metrics all - -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include - -#include "DataFormats/Math/interface/choleskyInversion.h" -#include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h" - -constexpr int stride() { return 5 * 1024; } -template -using MXN = Eigen::Matrix; -template -using MapMX = Eigen::Map, 0, Eigen::Stride>; - -template -__global__ void invertSOA(double *__restrict__ p, unsigned int n) { - auto i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= n) - return; - - MapMX m(p + i); - choleskyInversion::invert(m, m); -} - -template -__global__ void invert(M *mm, unsigned int n) { - auto i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= n) - return; - - auto &m = mm[i]; - choleskyInversion::invert(m, m); -} - -template -__global__ void invertSeq(M *mm, unsigned int n) { - if (threadIdx.x != 0) - return; - auto first = blockIdx.x * blockDim.x; - auto last = std::min(first + blockDim.x, n); - - for (auto i = first; i < last; ++i) { - auto &m = mm[i]; - choleskyInversion::invert(m, m); - } -} - -// generate matrices -template -void genMatrix(M &m) { - using T = typename std::remove_reference::type; - int n = M::ColsAtCompileTime; - std::mt19937 eng; - // std::mt19937 eng2; - std::uniform_real_distribution rgen(0., 1.); - - // generate first diagonal elemets - for (int i = 0; i < n; ++i) { - double maxVal = i * 10000 / (n - 1) + 1; // max condition is 10^4 - m(i, i) = maxVal * rgen(eng); - } - for (int i = 0; i < n; ++i) { - for (int j = 0; j < i; ++j) { - double v = 0.3 * std::sqrt(m(i, i) * m(j, j)); // this makes the matrix pos defined - m(i, j) = v * rgen(eng); - m(j, i) = m(i, j); - } - } -} - -template -void go(bool soa) { - constexpr unsigned int DIM = N; - using MX = MXN; - std::cout << "testing Matrix of dimension " << DIM << " size " << sizeof(MX) << std::endl; - - auto start = std::chrono::high_resolution_clock::now(); - auto delta = start - start; - auto delta1 = delta; - auto delta2 = delta; - - if (cuda::device::count() == 0) { - std::cerr << "No CUDA devices on this system" - << "\n"; - exit(EXIT_FAILURE); - } - - auto current_device = cuda::device::current::get(); - - constexpr unsigned int SIZE = 4 * 1024; - - MX mm[stride()]; // just storage in case of SOA - double *__restrict__ p = (double *)(mm); - - if (soa) { - for (unsigned int i = 0; i < SIZE; ++i) { - MapMX m(p + i); - genMatrix(m); - } - } else { - for (auto &m : mm) - genMatrix(m); - } - - std::cout << mm[SIZE / 2](1, 1) << std::endl; - - if (soa) - for (unsigned int i = 0; i < SIZE; ++i) { - MapMX m(p + i); - choleskyInversion::invert(m, m); - choleskyInversion::invert(m, m); - } - else - for (auto &m : mm) { - choleskyInversion::invert(m, m); - choleskyInversion::invert(m, m); - } - - std::cout << mm[SIZE / 2](1, 1) << std::endl; - - auto m_d = cuda::memory::device::make_unique(current_device, DIM * DIM * stride()); - cuda::memory::copy(m_d.get(), (double const *)(mm), stride() * sizeof(MX)); - - constexpr int NKK = -#ifdef DOPROF - 2; -#else - 1000; -#endif - for (int kk = 0; kk < NKK; ++kk) { - int threadsPerBlock = 128; - int blocksPerGrid = SIZE / threadsPerBlock; - - delta -= (std::chrono::high_resolution_clock::now() - start); - - if (soa) - cuda::launch(invertSOA, {blocksPerGrid, threadsPerBlock}, m_d.get(), SIZE); - else - cuda::launch(invert, {blocksPerGrid, threadsPerBlock}, (MX *)(m_d.get()), SIZE); - - cuda::memory::copy(&mm, m_d.get(), stride() * sizeof(MX)); - delta += (std::chrono::high_resolution_clock::now() - start); - - if (0 == kk) - std::cout << mm[SIZE / 2](1, 1) << std::endl; - - if (!soa) { - delta1 -= (std::chrono::high_resolution_clock::now() - start); - -#ifndef DOPROF - cuda::launch(invertSeq, {blocksPerGrid, threadsPerBlock}, (MX *)(m_d.get()), SIZE); - - cuda::memory::copy(&mm, m_d.get(), stride() * sizeof(MX)); -#endif - delta1 += (std::chrono::high_resolution_clock::now() - start); - - if (0 == kk) - std::cout << mm[SIZE / 2](1, 1) << std::endl; - } - - delta2 -= (std::chrono::high_resolution_clock::now() - start); - if (soa) -#pragma GCC ivdep - for (unsigned int i = 0; i < SIZE; ++i) { - MapMX m(p + i); - choleskyInversion::invert(m, m); - } - else -#pragma GCC ivdep - for (auto &m : mm) { - choleskyInversion::invert(m, m); - } - - delta2 += (std::chrono::high_resolution_clock::now() - start); - } - - std::cout << mm[SIZE / 2](1, 1) << std::endl; - - double DNNK = NKK; - std::cout << "cuda/cudaSeq/x86 computation took " - << std::chrono::duration_cast(delta).count() / DNNK << ' ' - << std::chrono::duration_cast(delta1).count() / DNNK << ' ' - << std::chrono::duration_cast(delta2).count() / DNNK << ' ' << " ms" - << std::endl; -} - -int main() { - exitSansCUDADevices(); - - go<2>(false); - go<4>(false); - go<5>(false); - go<6>(false); - - go<2>(true); - go<4>(true); - go<5>(true); - go<6>(true); - - // go<10>(); - return 0; -} From 3b7e845832cd6bbff0e72d2519c39ee14367ec70 Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Sat, 26 Oct 2019 23:44:42 +0200 Subject: [PATCH 05/13] Delete spurious file --- .../test/CholeskyInvert_t_LOCAL_195321.cu | 215 ------------------ 1 file changed, 215 deletions(-) delete mode 100644 DataFormats/Math/test/CholeskyInvert_t_LOCAL_195321.cu diff --git a/DataFormats/Math/test/CholeskyInvert_t_LOCAL_195321.cu b/DataFormats/Math/test/CholeskyInvert_t_LOCAL_195321.cu deleted file mode 100644 index 3e2cf041bae16..0000000000000 --- a/DataFormats/Math/test/CholeskyInvert_t_LOCAL_195321.cu +++ /dev/null @@ -1,215 +0,0 @@ -// nvcc -O3 CholeskyDecomp_t.cu -Icuda-api-wrappers/src/ --expt-relaxed-constexpr -gencode arch=compute_61,code=sm_61 --compiler-options="-Ofast -march=native" -// add -DDOPROF to run nvprof --metrics all - -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include - -#include "DataFormats/Math/interface/choleskyInversion.h" -#include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h" -#include "HeterogeneousCore/CUDAUtilities/interface/launch.h" - -constexpr int stride() { return 5 * 1024; } -template -using MXN = Eigen::Matrix; -template -using MapMX = Eigen::Map, 0, Eigen::Stride>; - -template -__global__ void invertSOA(double *__restrict__ p, unsigned int n) { - auto i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= n) - return; - - MapMX m(p + i); - choleskyInversion::invert(m, m); -} - -template -__global__ void invert(M *mm, unsigned int n) { - auto i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= n) - return; - - auto &m = mm[i]; - choleskyInversion::invert(m, m); -} - -template -__global__ void invertSeq(M *mm, unsigned int n) { - if (threadIdx.x != 0) - return; - auto first = blockIdx.x * blockDim.x; - auto last = std::min(first + blockDim.x, n); - - for (auto i = first; i < last; ++i) { - auto &m = mm[i]; - choleskyInversion::invert(m, m); - } -} - -// generate matrices -template -void genMatrix(M &m) { - using T = typename std::remove_reference::type; - int n = M::ColsAtCompileTime; - std::mt19937 eng; - // std::mt19937 eng2; - std::uniform_real_distribution rgen(0., 1.); - - // generate first diagonal elemets - for (int i = 0; i < n; ++i) { - double maxVal = i * 10000 / (n - 1) + 1; // max condition is 10^4 - m(i, i) = maxVal * rgen(eng); - } - for (int i = 0; i < n; ++i) { - for (int j = 0; j < i; ++j) { - double v = 0.3 * std::sqrt(m(i, i) * m(j, j)); // this makes the matrix pos defined - m(i, j) = v * rgen(eng); - m(j, i) = m(i, j); - } - } -} - -template -void go(bool soa) { - constexpr unsigned int DIM = N; - using MX = MXN; - std::cout << "testing Matrix of dimension " << DIM << " size " << sizeof(MX) << std::endl; - - auto start = std::chrono::high_resolution_clock::now(); - auto delta = start - start; - auto delta1 = delta; - auto delta2 = delta; - - if (cuda::device::count() == 0) { - std::cerr << "No CUDA devices on this system" - << "\n"; - exit(EXIT_FAILURE); - } - - auto current_device = cuda::device::current::get(); - - constexpr unsigned int SIZE = 4 * 1024; - - MX mm[stride()]; // just storage in case of SOA - double *__restrict__ p = (double *)(mm); - - if (soa) { - for (unsigned int i = 0; i < SIZE; ++i) { - MapMX m(p + i); - genMatrix(m); - } - } else { - for (auto &m : mm) - genMatrix(m); - } - - std::cout << mm[SIZE / 2](1, 1) << std::endl; - - if (soa) - for (unsigned int i = 0; i < SIZE; ++i) { - MapMX m(p + i); - choleskyInversion::invert(m, m); - choleskyInversion::invert(m, m); - } - else - for (auto &m : mm) { - choleskyInversion::invert(m, m); - choleskyInversion::invert(m, m); - } - - std::cout << mm[SIZE / 2](1, 1) << std::endl; - - auto m_d = cuda::memory::device::make_unique(current_device, DIM * DIM * stride()); - cuda::memory::copy(m_d.get(), (double const *)(mm), stride() * sizeof(MX)); - - constexpr int NKK = -#ifdef DOPROF - 2; -#else - 1000; -#endif - for (int kk = 0; kk < NKK; ++kk) { - int threadsPerBlock = 128; - int blocksPerGrid = SIZE / threadsPerBlock; - - delta -= (std::chrono::high_resolution_clock::now() - start); - - if (soa) - cudautils::launch(invertSOA, {blocksPerGrid, threadsPerBlock}, m_d.get(), SIZE); - else - cudautils::launch(invert, {blocksPerGrid, threadsPerBlock}, (MX *)(m_d.get()), SIZE); - - cuda::memory::copy(&mm, m_d.get(), stride() * sizeof(MX)); - delta += (std::chrono::high_resolution_clock::now() - start); - - if (0 == kk) - std::cout << mm[SIZE / 2](1, 1) << std::endl; - - if (!soa) { - delta1 -= (std::chrono::high_resolution_clock::now() - start); - -#ifndef DOPROF - cudautils::launch(invertSeq, {blocksPerGrid, threadsPerBlock}, (MX *)(m_d.get()), SIZE); - - cuda::memory::copy(&mm, m_d.get(), stride() * sizeof(MX)); -#endif - delta1 += (std::chrono::high_resolution_clock::now() - start); - - if (0 == kk) - std::cout << mm[SIZE / 2](1, 1) << std::endl; - } - - delta2 -= (std::chrono::high_resolution_clock::now() - start); - if (soa) -#pragma GCC ivdep - for (unsigned int i = 0; i < SIZE; ++i) { - MapMX m(p + i); - choleskyInversion::invert(m, m); - } - else -#pragma GCC ivdep - for (auto &m : mm) { - choleskyInversion::invert(m, m); - } - - delta2 += (std::chrono::high_resolution_clock::now() - start); - } - - std::cout << mm[SIZE / 2](1, 1) << std::endl; - - double DNNK = NKK; - std::cout << "cuda/cudaSeq/x86 computation took " - << std::chrono::duration_cast(delta).count() / DNNK << ' ' - << std::chrono::duration_cast(delta1).count() / DNNK << ' ' - << std::chrono::duration_cast(delta2).count() / DNNK << ' ' << " ms" - << std::endl; -} - -int main() { - exitSansCUDADevices(); - - go<2>(false); - go<4>(false); - go<5>(false); - go<6>(false); - - go<2>(true); - go<4>(true); - go<5>(true); - go<6>(true); - - // go<10>(); - return 0; -} From 61d12f5dabbb9d60680ed8b34f1021708a2993c8 Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Sat, 26 Oct 2019 23:45:00 +0200 Subject: [PATCH 06/13] Delete spurious file --- .../test/CholeskyInvert_t_REMOTE_195321.cu | 216 ------------------ 1 file changed, 216 deletions(-) delete mode 100644 DataFormats/Math/test/CholeskyInvert_t_REMOTE_195321.cu diff --git a/DataFormats/Math/test/CholeskyInvert_t_REMOTE_195321.cu b/DataFormats/Math/test/CholeskyInvert_t_REMOTE_195321.cu deleted file mode 100644 index 0751af78a7289..0000000000000 --- a/DataFormats/Math/test/CholeskyInvert_t_REMOTE_195321.cu +++ /dev/null @@ -1,216 +0,0 @@ -// nvcc -O3 CholeskyDecomp_t.cu -Icuda-api-wrappers/src/ --expt-relaxed-constexpr -gencode arch=compute_61,code=sm_61 --compiler-options="-Ofast -march=native" -// add -DDOPROF to run nvprof --metrics all - -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include - -#include "DataFormats/Math/interface/choleskyInversion.h" -#include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h" - -constexpr int stride() { return 5 * 1024; } -template -using MXN = Eigen::Matrix; -template -using MapMX = Eigen::Map, 0, Eigen::Stride>; - -template -__global__ void invertSOA(double *__restrict__ p, unsigned int n) { - auto i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= n) - return; - - MapMX m(p + i); - choleskyInversion::invert(m, m); -} - -template -__global__ void invert(M *mm, unsigned int n) { - auto i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= n) - return; - - auto &m = mm[i]; - choleskyInversion::invert(m, m); -} - -template -__global__ void invertSeq(M *mm, unsigned int n) { - if (threadIdx.x != 0) - return; - auto first = blockIdx.x * blockDim.x; - auto last = std::min(first + blockDim.x, n); - - for (auto i = first; i < last; ++i) { - auto &m = mm[i]; - choleskyInversion::invert(m, m); - } -} - -// generate matrices -template -void genMatrix(M &m) { - using T = typename std::remove_reference::type; - int n = M::ColsAtCompileTime; - std::mt19937 eng; - // std::mt19937 eng2; - std::uniform_real_distribution rgen(0., 1.); - - // generate first diagonal elemets - for (int i = 0; i < n; ++i) { - double maxVal = i * 10000 / (n - 1) + 1; // max condition is 10^4 - m(i, i) = maxVal * rgen(eng); - } - for (int i = 0; i < n; ++i) { - for (int j = 0; j < i; ++j) { - double v = 0.3 * std::sqrt(m(i, i) * m(j, j)); // this makes the matrix pos defined - m(i, j) = v * rgen(eng); - m(j, i) = m(i, j); - } - } -} - -template -void go(bool soa) { - constexpr unsigned int DIM = N; - using MX = MXN; - std::cout << "testing Matrix of dimension " << DIM << " size " << sizeof(MX) << std::endl; - - auto start = std::chrono::high_resolution_clock::now(); - auto delta = start - start; - auto delta1 = delta; - auto delta2 = delta; - - if (cuda::device::count() == 0) { - std::cerr << "No CUDA devices on this system" - << "\n"; - exit(EXIT_FAILURE); - } - - auto current_device = cuda::device::current::get(); - - constexpr unsigned int SIZE = 4 * 1024; - - MX mm[stride()]; // just storage in case of SOA - double *__restrict__ p = (double *)(mm); - - if (soa) { - for (unsigned int i = 0; i < SIZE; ++i) { - MapMX m(p + i); - genMatrix(m); - } - } else { - for (auto &m : mm) - genMatrix(m); - } - - std::cout << mm[SIZE / 2](1, 1) << std::endl; - - if (soa) - for (unsigned int i = 0; i < SIZE; ++i) { - MapMX m(p + i); - choleskyInversion::invert(m, m); - choleskyInversion::invert(m, m); - } - else - for (auto &m : mm) { - choleskyInversion::invert(m, m); - choleskyInversion::invert(m, m); - } - - std::cout << mm[SIZE / 2](1, 1) << std::endl; - - auto m_d = cuda::memory::device::make_unique(current_device, DIM * DIM * stride()); - cudaMemcpy(m_d.get(), (double const *)(mm), stride() * sizeof(MX), cudaMemcpyHostToDevice); - - constexpr int NKK = -#ifdef DOPROF - 2; -#else - 1000; -#endif - for (int kk = 0; kk < NKK; ++kk) { - int threadsPerBlock = 128; - int blocksPerGrid = SIZE / threadsPerBlock; - - delta -= (std::chrono::high_resolution_clock::now() - start); - - if (soa) - cuda::launch(invertSOA, {blocksPerGrid, threadsPerBlock}, m_d.get(), SIZE); - else - cuda::launch(invert, {blocksPerGrid, threadsPerBlock}, (MX *)(m_d.get()), SIZE); - - cudaMemcpy(&mm, m_d.get(), stride() * sizeof(MX), cudaMemcpyDeviceToHost); - - delta += (std::chrono::high_resolution_clock::now() - start); - - if (0 == kk) - std::cout << mm[SIZE / 2](1, 1) << std::endl; - - if (!soa) { - delta1 -= (std::chrono::high_resolution_clock::now() - start); - -#ifndef DOPROF - cuda::launch(invertSeq, {blocksPerGrid, threadsPerBlock}, (MX *)(m_d.get()), SIZE); - - cudaMemcpy(&mm, m_d.get(), stride() * sizeof(MX), cudaMemcpyDeviceToHost); - -#endif - delta1 += (std::chrono::high_resolution_clock::now() - start); - - if (0 == kk) - std::cout << mm[SIZE / 2](1, 1) << std::endl; - } - - delta2 -= (std::chrono::high_resolution_clock::now() - start); - if (soa) -#pragma GCC ivdep - for (unsigned int i = 0; i < SIZE; ++i) { - MapMX m(p + i); - choleskyInversion::invert(m, m); - } - else -#pragma GCC ivdep - for (auto &m : mm) { - choleskyInversion::invert(m, m); - } - - delta2 += (std::chrono::high_resolution_clock::now() - start); - } - - std::cout << mm[SIZE / 2](1, 1) << std::endl; - - double DNNK = NKK; - std::cout << "cuda/cudaSeq/x86 computation took " - << std::chrono::duration_cast(delta).count() / DNNK << ' ' - << std::chrono::duration_cast(delta1).count() / DNNK << ' ' - << std::chrono::duration_cast(delta2).count() / DNNK << ' ' << " ms" - << std::endl; -} - -int main() { - exitSansCUDADevices(); - - go<2>(false); - go<4>(false); - go<5>(false); - go<6>(false); - - go<2>(true); - go<4>(true); - go<5>(true); - go<6>(true); - - // go<10>(); - return 0; -} From b17e7f9b482999f02f6f6b6d77ddca6c64dbce44 Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Sat, 26 Oct 2019 23:49:03 +0200 Subject: [PATCH 07/13] Whitespaces --- .../test/TestHeterogeneousEDProducerGPUHelpers.cu | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/HeterogeneousCore/Producer/test/TestHeterogeneousEDProducerGPUHelpers.cu b/HeterogeneousCore/Producer/test/TestHeterogeneousEDProducerGPUHelpers.cu index b28ee0cbc9949..017092452c1e1 100644 --- a/HeterogeneousCore/Producer/test/TestHeterogeneousEDProducerGPUHelpers.cu +++ b/HeterogeneousCore/Producer/test/TestHeterogeneousEDProducerGPUHelpers.cu @@ -6,6 +6,7 @@ #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/launch.h" #include "TestHeterogeneousEDProducerGPUHelpers.h" + // // Vector Addition Kernel // @@ -58,8 +59,8 @@ namespace { int TestAcceleratorServiceProducerGPUHelpers_simple_kernel(int input) { // Example from Viktor/cuda-api-wrappers - constexpr int NUM_VALUES = 10000; - + constexpr int NUM_VALUES = 10000; + auto current_device = cuda::device::current::get(); auto stream = current_device.create_stream(cuda::stream::implicitly_synchronizes_with_default_stream); @@ -89,7 +90,7 @@ int TestAcceleratorServiceProducerGPUHelpers_simple_kernel(int input) { cudautils::launch(vectorAdd, {blocksPerGrid, threadsPerBlock}, d_a.get(), d_b.get(), d_c.get(), NUM_VALUES); */ - + cudaMemcpyAsync(h_c.get(), d_c.get(), NUM_VALUES * sizeof(int), cudaMemcpyDeviceToHost, stream.id()); stream.synchronize(); @@ -110,7 +111,6 @@ TestHeterogeneousEDProducerGPUTask::TestHeterogeneousEDProducerGPUTask() { h_a = cuda::memory::host::make_unique(NUM_VALUES); h_b = cuda::memory::host::make_unique(NUM_VALUES); - auto current_device = cuda::device::current::get(); d_b = cuda::memory::device::make_unique(current_device, NUM_VALUES); @@ -125,7 +125,6 @@ TestHeterogeneousEDProducerGPUTask::ResultType TestHeterogeneousEDProducerGPUTas if (inputArrays.first != nullptr) { auto h_check = std::make_unique(NUM_VALUES); cudaMemcpy(h_check.get(), inputArrays.first, NUM_VALUES * sizeof(float), cudaMemcpyDeviceToHost); - for (int i = 0; i < NUM_VALUES; ++i) { if (h_check[i] != i) { throw cms::Exception("Assert") << "Sanity check on element " << i << " failed, expected " << i << " got " @@ -140,11 +139,9 @@ TestHeterogeneousEDProducerGPUTask::ResultType TestHeterogeneousEDProducerGPUTas } auto current_device = cuda::device::current::get(); - auto d_a = cuda::memory::device::make_unique(current_device, NUM_VALUES); auto d_c = cuda::memory::device::make_unique(current_device, NUM_VALUES); if (inputArrays.second != nullptr) { - d_d = cuda::memory::device::make_unique(current_device, NUM_VALUES); } From 01bb99567d64a6ff1897e276709df7ecbb2f40df Mon Sep 17 00:00:00 2001 From: Wahid Redjeb Date: Mon, 28 Oct 2019 12:00:13 +0100 Subject: [PATCH 08/13] Wrap cudaMem calls in call to cudaCheck --- CUDADataFormats/BeamSpot/src/BeamSpotCUDA.cc | 3 +- .../src/TrackingRecHit2DCUDA.cc | 2 +- .../test/gpuFrameTransformTest.cpp | 17 ++++--- DataFormats/Math/test/CholeskyInvert_t.cu | 9 ++-- DataFormats/Math/test/cudaAtan2Test.cu | 5 +- DataFormats/Math/test/cudaMathTest.cu | 7 +-- .../CUDACore/test/testStreamEvent.cu | 7 +-- .../CUDACore/test/test_CUDAScopedContext.cc | 8 +-- .../CUDATest/plugins/TestCUDAProducerGPUEW.cc | 3 +- .../plugins/TestCUDAProducerGPUEWTask.cc | 5 +- .../plugins/TestCUDAProducerGPUKernel.cu | 6 +-- .../plugins/TestCUDAProducerGPUtoCPU.cc | 3 +- .../test/test_TestCUDAProducerGPUFirst.cc | 2 +- .../CUDAUtilities/interface/HistoContainer.h | 2 +- .../CUDAUtilities/interface/copyAsync.h | 9 ++-- .../CUDAUtilities/interface/memsetAsync.h | 7 +-- .../CUDAUtilities/test/AtomicPairCounter_t.cu | 11 ++-- .../CUDAUtilities/test/HistoContainer_t.cu | 9 ++-- .../CUDAUtilities/test/OneHistoContainer_t.cu | 3 +- .../CUDAUtilities/test/OneToManyAssoc_t.h | 15 +++--- .../CUDAUtilities/test/copyAsync_t.cpp | 15 +++--- .../CUDAUtilities/test/prefixScan_t.cu | 13 ++--- .../CUDAUtilities/test/radixSort_t.cu | 7 +-- .../TestHeterogeneousEDProducerGPUHelpers.cu | 14 +++--- .../src/SiPixelFedCablingMapGPUWrapper.cc | 4 +- .../SiPixelClusterizer/test/gpuClustering_t.h | 32 ++++++------ .../PixelTrackFitting/test/testEigenGPU.cu | 8 +-- .../test/testEigenGPUNoFit.cu | 50 +++++++++---------- .../PixelVertexFinding/test/VertexFinder_t.h | 37 +++++++------- 29 files changed, 163 insertions(+), 150 deletions(-) diff --git a/CUDADataFormats/BeamSpot/src/BeamSpotCUDA.cc b/CUDADataFormats/BeamSpot/src/BeamSpotCUDA.cc index da9cd4d776c92..a297ae11dc327 100644 --- a/CUDADataFormats/BeamSpot/src/BeamSpotCUDA.cc +++ b/CUDADataFormats/BeamSpot/src/BeamSpotCUDA.cc @@ -1,8 +1,9 @@ #include "CUDADataFormats/BeamSpot/interface/BeamSpotCUDA.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" BeamSpotCUDA::BeamSpotCUDA(Data const* data_h, cudaStream_t stream) { data_d_ = cudautils::make_device_unique(stream); - cudaMemcpyAsync(data_d_.get(), data_h, sizeof(Data), cudaMemcpyHostToDevice, stream); + cudaCheck(cudaMemcpyAsync(data_d_.get(), data_h, sizeof(Data), cudaMemcpyHostToDevice, stream)); } diff --git a/CUDADataFormats/TrackingRecHit/src/TrackingRecHit2DCUDA.cc b/CUDADataFormats/TrackingRecHit/src/TrackingRecHit2DCUDA.cc index c4cb13e3a0bd8..e6f223bfec4e3 100644 --- a/CUDADataFormats/TrackingRecHit/src/TrackingRecHit2DCUDA.cc +++ b/CUDADataFormats/TrackingRecHit/src/TrackingRecHit2DCUDA.cc @@ -14,6 +14,6 @@ cudautils::host::unique_ptr TrackingRecHit2DCUDA::localCoordToHostAsync template <> cudautils::host::unique_ptr TrackingRecHit2DCUDA::hitsModuleStartToHostAsync(cudaStream_t stream) const { auto ret = cudautils::make_host_unique(2001, stream); - cudaMemcpyAsync(ret.get(), m_hitsModuleStart, 4 * 2001, cudaMemcpyDefault, stream); + cudaCheck(cudaMemcpyAsync(ret.get(), m_hitsModuleStart, 4 * 2001, cudaMemcpyDefault, stream)); return ret; } diff --git a/DataFormats/GeometrySurface/test/gpuFrameTransformTest.cpp b/DataFormats/GeometrySurface/test/gpuFrameTransformTest.cpp index 75b5418eeaffb..cae4e48f07b6b 100644 --- a/DataFormats/GeometrySurface/test/gpuFrameTransformTest.cpp +++ b/DataFormats/GeometrySurface/test/gpuFrameTransformTest.cpp @@ -10,6 +10,7 @@ #include +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "DataFormats/GeometrySurface/interface/GloballyPositioned.h" #include "DataFormats/GeometrySurface/interface/SOARotation.h" #include "DataFormats/GeometrySurface/interface/TkRotation.h" @@ -73,7 +74,7 @@ int main(void) { // auto d_sf = cuda::memory::device::make_unique(current_device, 1); auto d_sf = cuda::memory::device::make_unique(current_device, sizeof(SFrame)); - cudaMemcpy(d_sf.get(), &sf1, sizeof(SFrame), cudaMemcpyHostToDevice); + cudaCheck(cudaMemcpy(d_sf.get(), &sf1, sizeof(SFrame), cudaMemcpyHostToDevice)); for (auto i = 0U; i < size; ++i) { @@ -85,9 +86,9 @@ int main(void) { std::random_shuffle(xl, xl + size); std::random_shuffle(yl, yl + size); - cudaMemcpy(d_xl.get(), xl, size32, cudaMemcpyHostToDevice); - cudaMemcpy(d_yl.get(), yl, size32, cudaMemcpyHostToDevice); - cudaMemcpy(d_le.get(), le, 3 * size32, cudaMemcpyHostToDevice); + cudaCheck(cudaMemcpy(d_xl.get(), xl, size32, cudaMemcpyHostToDevice)); + cudaCheck(cudaMemcpy(d_yl.get(), yl, size32, cudaMemcpyHostToDevice)); + cudaCheck(cudaMemcpy(d_le.get(), le, 3 * size32, cudaMemcpyHostToDevice)); toGlobalWrapper((SFrame const *)(d_sf.get()), d_xl.get(), @@ -98,10 +99,10 @@ int main(void) { d_le.get(), d_ge.get(), size); - cudaMemcpy(x, d_x.get(), size32, cudaMemcpyDeviceToHost); - cudaMemcpy(y, d_y.get(), size32, cudaMemcpyDeviceToHost); - cudaMemcpy(z, d_z.get(), size32, cudaMemcpyDeviceToHost); - cudaMemcpy(ge, d_ge.get(), 6 * size32, cudaMemcpyDeviceToHost); + cudaCheck(cudaMemcpy(x, d_x.get(), size32, cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(y, d_y.get(), size32, cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(z, d_z.get(), size32, cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(ge, d_ge.get(), 6 * size32, cudaMemcpyDeviceToHost)); float eps = 0.; for (auto i = 0U; i < size; ++i) { diff --git a/DataFormats/Math/test/CholeskyInvert_t.cu b/DataFormats/Math/test/CholeskyInvert_t.cu index 65e8adca7e5ee..4adbd885eab7b 100644 --- a/DataFormats/Math/test/CholeskyInvert_t.cu +++ b/DataFormats/Math/test/CholeskyInvert_t.cu @@ -16,6 +16,7 @@ #include #include "DataFormats/Math/interface/choleskyInversion.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h" #include "HeterogeneousCore/CUDAUtilities/interface/launch.h" @@ -132,7 +133,7 @@ void go(bool soa) { std::cout << mm[SIZE / 2](1, 1) << std::endl; auto m_d = cuda::memory::device::make_unique(current_device, DIM * DIM * stride()); - cudaMemcpy(m_d.get(), (double const *)(mm), stride() * sizeof(MX), cudaMemcpyHostToDevice); + cudaCheck(cudaMemcpy(m_d.get(), (double const *)(mm), stride() * sizeof(MX), cudaMemcpyHostToDevice)); constexpr int NKK = #ifdef DOPROF @@ -151,7 +152,7 @@ void go(bool soa) { else cudautils::launch(invert, {blocksPerGrid, threadsPerBlock}, (MX *)(m_d.get()), SIZE); - cudaMemcpy(&mm, m_d.get(), stride() * sizeof(MX), cudaMemcpyDeviceToHost); + cudaCheck(cudaMemcpy(&mm, m_d.get(), stride() * sizeof(MX), cudaMemcpyDeviceToHost)); delta += (std::chrono::high_resolution_clock::now() - start); @@ -163,9 +164,7 @@ void go(bool soa) { #ifndef DOPROF cudautils::launch(invertSeq, {blocksPerGrid, threadsPerBlock}, (MX *)(m_d.get()), SIZE); - cuda::launch(invertSeq, {blocksPerGrid, threadsPerBlock}, (MX *)(m_d.get()), SIZE); - cudaMemcpy(&mm, m_d.get(), stride() * sizeof(MX), cudaMemcpyDeviceToHost); - + cudaCheck(cudaMemcpy(&mm, m_d.get(), stride() * sizeof(MX), cudaMemcpyDeviceToHost)); #endif delta1 += (std::chrono::high_resolution_clock::now() - start); diff --git a/DataFormats/Math/test/cudaAtan2Test.cu b/DataFormats/Math/test/cudaAtan2Test.cu index eec6020b44179..298d8b784f322 100644 --- a/DataFormats/Math/test/cudaAtan2Test.cu +++ b/DataFormats/Math/test/cudaAtan2Test.cu @@ -29,6 +29,7 @@ end #include "cuda/api_wrappers.h" #include "DataFormats/Math/interface/approx_atan2.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h" #include "HeterogeneousCore/CUDAUtilities/interface/launch.h" @@ -72,7 +73,7 @@ void go() { auto diff_d = cuda::memory::device::make_unique(current_device, 3); int diffs[3]; - cudaMemset(diff_d.get(), 0, 3 * 4); + cudaCheck(cudaMemset(diff_d.get(), 0, 3 * 4)); // Launch the diff CUDA Kernel dim3 threadsPerBlock(32, 32, 1); @@ -83,7 +84,7 @@ void go() { cudautils::launch(diffAtan, {blocksPerGrid, threadsPerBlock}, diff_d.get()); - cudaMemcpy(diffs, diff_d.get(), 3 * 4, cudaMemcpyDeviceToHost); + cudaCheck(cudaMemcpy(diffs, diff_d.get(), 3 * 4, cudaMemcpyDeviceToHost)); delta += (std::chrono::high_resolution_clock::now() - start); float mdiff = diffs[0] * 1.e-7; diff --git a/DataFormats/Math/test/cudaMathTest.cu b/DataFormats/Math/test/cudaMathTest.cu index e443544e2e3c0..efb99d12ced9c 100644 --- a/DataFormats/Math/test/cudaMathTest.cu +++ b/DataFormats/Math/test/cudaMathTest.cu @@ -39,6 +39,7 @@ end #include "DataFormats/Math/interface/approx_log.h" #include "DataFormats/Math/interface/approx_exp.h" #include "DataFormats/Math/interface/approx_atan2.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h" #include "HeterogeneousCore/CUDAUtilities/interface/launch.h" @@ -106,8 +107,8 @@ void go() { auto d_B = cuda::memory::device::make_unique(current_device, numElements); auto d_C = cuda::memory::device::make_unique(current_device, numElements); - cudaMemcpy(d_A.get(), h_A.get(), size, cudaMemcpyHostToDevice); - cudaMemcpy(d_B.get(), h_B.get(), size, cudaMemcpyHostToDevice); + cudaCheck(cudaMemcpy(d_A.get(), h_A.get(), size, cudaMemcpyHostToDevice)); + cudaCheck(cudaMemcpy(d_B.get(), h_B.get(), size, cudaMemcpyHostToDevice)); delta += (std::chrono::high_resolution_clock::now() - start); std::cout << "cuda alloc+copy took " << std::chrono::duration_cast(delta).count() << " ms" << std::endl; @@ -130,7 +131,7 @@ void go() { << std::endl; delta -= (std::chrono::high_resolution_clock::now() - start); - cudaMemcpy(h_C.get(), d_C.get(), size, cudaMemcpyDeviceToHost); + cudaCheck(cudaMemcpy(h_C.get(), d_C.get(), size, cudaMemcpyDeviceToHost)); delta += (std::chrono::high_resolution_clock::now() - start); std::cout << "cuda copy back took " << std::chrono::duration_cast(delta).count() << " ms" << std::endl; diff --git a/HeterogeneousCore/CUDACore/test/testStreamEvent.cu b/HeterogeneousCore/CUDACore/test/testStreamEvent.cu index 1c889f8f75a04..bd9ce4f29fba3 100644 --- a/HeterogeneousCore/CUDACore/test/testStreamEvent.cu +++ b/HeterogeneousCore/CUDACore/test/testStreamEvent.cu @@ -12,6 +12,7 @@ #include +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h" namespace { @@ -39,8 +40,8 @@ int main() { cudaStream_t stream1, stream2; cudaEvent_t event1, event2; - cudaMalloc(&dev_points1, ARRAY_SIZE * sizeof(float)); - cudaMallocHost(&host_points1, ARRAY_SIZE * sizeof(float)); + cudaCheck(cudaMalloc(&dev_points1, ARRAY_SIZE * sizeof(float))); + cudaCheck(cudaMallocHost(&host_points1, ARRAY_SIZE * sizeof(float))); cudaStreamCreateWithFlags(&stream1, cudaStreamNonBlocking); cudaStreamCreateWithFlags(&stream2, cudaStreamNonBlocking); cudaEventCreate(&event1); @@ -50,7 +51,7 @@ int main() { host_points1[j] = static_cast(j); } - cudaMemcpyAsync(dev_points1, host_points1, ARRAY_SIZE * sizeof(float), cudaMemcpyHostToDevice, stream1); + cudaCheck(cudaMemcpyAsync(dev_points1, host_points1, ARRAY_SIZE * sizeof(float), cudaMemcpyHostToDevice, stream1)); kernel_looping<<<1, 16, 0, stream1>>>(dev_points1, ARRAY_SIZE); if (debug) std::cout << "Kernel launched on stream1" << std::endl; diff --git a/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc b/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc index bab3d0a347509..04f2789f83f43 100644 --- a/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc +++ b/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc @@ -31,7 +31,7 @@ namespace cudatest { namespace { std::unique_ptr> produce(int device, int* d, int* h) { auto ctx = cudatest::TestCUDAScopedContext::make(device, true); - cudaMemcpyAsync(d, h, sizeof(int), cudaMemcpyHostToDevice, ctx.stream()); + cudaCheck(cudaMemcpyAsync(d, h, sizeof(int), cudaMemcpyHostToDevice, ctx.stream())); testCUDAScopedContextKernels_single(d, ctx.stream()); return ctx.wrap(d); } @@ -116,9 +116,9 @@ TEST_CASE("Use of CUDAScopedContext", "[CUDACore]") { h_a2 = 0; int h_a3 = 0; - cudaMemcpyAsync(&h_a1, d_a1.get(), sizeof(int), cudaMemcpyDeviceToHost, ctx.stream()); - cudaMemcpyAsync(&h_a2, d_a2.get(), sizeof(int), cudaMemcpyDeviceToHost, ctx.stream()); - cudaMemcpyAsync(&h_a3, d_a3.get(), sizeof(int), cudaMemcpyDeviceToHost, ctx.stream()); + cudaCheck(cudaMemcpyAsync(&h_a1, d_a1.get(), sizeof(int), cudaMemcpyDeviceToHost, ctx.stream())); + cudaCheck(cudaMemcpyAsync(&h_a2, d_a2.get(), sizeof(int), cudaMemcpyDeviceToHost, ctx.stream())); + cudaCheck(cudaMemcpyAsync(&h_a3, d_a3.get(), sizeof(int), cudaMemcpyDeviceToHost, ctx.stream())); REQUIRE(h_a1 == 2); REQUIRE(h_a2 == 4); diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEW.cc b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEW.cc index 95af5ba5c0685..046150f2b6930 100644 --- a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEW.cc +++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEW.cc @@ -7,6 +7,7 @@ #include "FWCore/ServiceRegistry/interface/Service.h" #include "CUDADataFormats/Common/interface/CUDAProduct.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" #include "HeterogeneousCore/CUDACore/interface/CUDAContextState.h" #include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" @@ -67,7 +68,7 @@ void TestCUDAProducerGPUEW::acquire(const edm::Event& iEvent, // Mimick the need to transfer some of the GPU data back to CPU to // be used for something within this module, or to be put in the // event. - cudaMemcpyAsync(hostData_.get(), devicePtr_.get() + 10, sizeof(float), cudaMemcpyDeviceToHost, ctx.stream()); + cudaCheck(cudaMemcpyAsync(hostData_.get(), devicePtr_.get() + 10, sizeof(float), cudaMemcpyDeviceToHost, ctx.stream())); edm::LogVerbatim("TestCUDAProducerGPUEW") << label_ << " TestCUDAProducerGPUEW::acquire end event " << iEvent.id().event() << " stream " << iEvent.streamID(); } diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEWTask.cc b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEWTask.cc index e38322b7f5713..db2d6593d7a95 100644 --- a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEWTask.cc +++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEWTask.cc @@ -10,6 +10,7 @@ #include "FWCore/ServiceRegistry/interface/Service.h" #include "CUDADataFormats/Common/interface/CUDAProduct.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" #include "HeterogeneousCore/CUDACore/interface/CUDAContextState.h" #include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" @@ -75,7 +76,7 @@ void TestCUDAProducerGPUEWTask::acquire(const edm::Event& iEvent, // Mimick the need to transfer some of the GPU data back to CPU to // be used for something within this module, or to be put in the // event. - cudaMemcpyAsync(hostData_.get(), devicePtr_.get() + 10, sizeof(float), cudaMemcpyDeviceToHost, ctx.stream()); + cudaCheck(cudaMemcpyAsync(hostData_.get(), devicePtr_.get() + 10, sizeof(float), cudaMemcpyDeviceToHost, ctx.stream())); // Push a task to run addSimpleWork() after the asynchronous work // (and acquire()) has finished instead of produce() ctx.pushNextTask([iev = iEvent.id().event(), istr = iEvent.streamID(), this](CUDAScopedContextTask ctx) { @@ -93,7 +94,7 @@ void TestCUDAProducerGPUEWTask::addSimpleWork(edm::EventNumber_t eventID, edm::LogVerbatim("TestCUDAProducerGPUEWTask") << label_ << " TestCUDAProducerGPUEWTask::addSimpleWork begin event " << eventID << " stream " << streamID << " 10th element " << *hostData_ << " not satisfied, queueing more work"; - cudaMemcpyAsync(hostData_.get(), devicePtr_.get() + 10, sizeof(float), cudaMemcpyDeviceToHost, ctx.stream()); + cudaCheck(cudaMemcpyAsync(hostData_.get(), devicePtr_.get() + 10, sizeof(float), cudaMemcpyDeviceToHost, ctx.stream())); ctx.pushNextTask([eventID, streamID, this](CUDAScopedContextTask ctx) { addSimpleWork(eventID, streamID, ctx); }); gpuAlgo_.runSimpleAlgo(devicePtr_.get(), ctx.stream()); diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUKernel.cu b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUKernel.cu index 0015cdcdc7bb6..1facd8ef44b1e 100644 --- a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUKernel.cu +++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUKernel.cu @@ -69,7 +69,7 @@ cudautils::device::unique_ptr TestCUDAProducerGPUKernel::runAlgo(const // First make the sanity check if (d_input != nullptr) { auto h_check = std::make_unique(NUM_VALUES); - cudaMemcpy(h_check.get(), d_input, NUM_VALUES* sizeof(float), cudaMemcpyDeviceToHost); + cudaCheck(cudaMemcpy(h_check.get(), d_input, NUM_VALUES* sizeof(float), cudaMemcpyDeviceToHost)); for (int i = 0; i < NUM_VALUES; ++i) { if (h_check[i] != i) { throw cms::Exception("Assert") << "Sanity check on element " << i << " failed, expected " << i << " got " @@ -89,8 +89,8 @@ cudautils::device::unique_ptr TestCUDAProducerGPUKernel::runAlgo(const auto d_a = cudautils::make_device_unique(NUM_VALUES, stream); auto d_b = cudautils::make_device_unique(NUM_VALUES, stream); - cudaMemcpyAsync(d_a.get(), h_a.get(), NUM_VALUES * sizeof(float), cudaMemcpyHostToDevice, stream); - cudaMemcpyAsync(d_b.get(), h_b.get(), NUM_VALUES * sizeof(float), cudaMemcpyHostToDevice, stream); + cudaCheck(cudaMemcpyAsync(d_a.get(), h_a.get(), NUM_VALUES * sizeof(float), cudaMemcpyHostToDevice, stream)); + cudaCheck(cudaMemcpyAsync(d_b.get(), h_b.get(), NUM_VALUES * sizeof(float), cudaMemcpyHostToDevice, stream)); int threadsPerBlock{32}; int blocksPerGrid = (NUM_VALUES + threadsPerBlock - 1) / threadsPerBlock; diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUtoCPU.cc b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUtoCPU.cc index 73f2123521dc2..6a0de45e12d95 100644 --- a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUtoCPU.cc +++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUtoCPU.cc @@ -6,6 +6,7 @@ #include "FWCore/ParameterSet/interface/ParameterSetDescription.h" #include "CUDADataFormats/Common/interface/CUDAProduct.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" #include "HeterogeneousCore/CUDATest/interface/CUDAThing.h" #include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" @@ -58,7 +59,7 @@ void TestCUDAProducerGPUtoCPU::acquire(const edm::Event& iEvent, buffer_ = cudautils::make_host_unique(TestCUDAProducerGPUKernel::NUM_VALUES, ctx.stream()); // Enqueue async copy, continue in produce once finished - cudaMemcpyAsync(buffer_.get(), device.get(), TestCUDAProducerGPUKernel::NUM_VALUES * sizeof(float), cudaMemcpyDeviceToHost, ctx.stream()); + cudaCheck(cudaMemcpyAsync(buffer_.get(), device.get(), TestCUDAProducerGPUKernel::NUM_VALUES * sizeof(float), cudaMemcpyDeviceToHost, ctx.stream())); edm::LogVerbatim("TestCUDAProducerGPUtoCPU") << label_ << " TestCUDAProducerGPUtoCPU::acquire end event " << iEvent.id().event() << " stream " << iEvent.streamID(); diff --git a/HeterogeneousCore/CUDATest/test/test_TestCUDAProducerGPUFirst.cc b/HeterogeneousCore/CUDATest/test/test_TestCUDAProducerGPUFirst.cc index 8c3ae29a8b343..b3f3c3bad8a9b 100644 --- a/HeterogeneousCore/CUDATest/test/test_TestCUDAProducerGPUFirst.cc +++ b/HeterogeneousCore/CUDATest/test/test_TestCUDAProducerGPUFirst.cc @@ -74,7 +74,7 @@ process.moduleToTest(process.toTest) REQUIRE(data != nullptr); float firstElements[10]; - cudaMemcpyAsync(firstElements, data, sizeof(float) * 10, cudaMemcpyDeviceToHost, prod->stream()); + cudaCheck(cudaMemcpyAsync(firstElements, data, sizeof(float) * 10, cudaMemcpyDeviceToHost, prod->stream())); std::cout << "Synchronizing with CUDA stream" << std::endl; auto stream = prod->stream(); diff --git a/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h b/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h index 4501f98e39376..ca7053740f6f7 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h +++ b/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h @@ -62,7 +62,7 @@ namespace cudautils { ) { uint32_t *off = (uint32_t *)((char *)(h) + offsetof(Histo, off)); #ifdef __CUDACC__ - cudaMemsetAsync(off, 0, 4 * Histo::totbins(), stream); + cudaCheck(cudaMemsetAsync(off, 0, 4 * Histo::totbins(), stream)); #else ::memset(off, 0, 4 * Histo::totbins()); #endif diff --git a/HeterogeneousCore/CUDAUtilities/interface/copyAsync.h b/HeterogeneousCore/CUDAUtilities/interface/copyAsync.h index 1f827c4d6bb0e..9d93ca6260334 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/copyAsync.h +++ b/HeterogeneousCore/CUDAUtilities/interface/copyAsync.h @@ -1,6 +1,7 @@ #ifndef HeterogeneousCore_CUDAUtilities_copyAsync_h #define HeterogeneousCore_CUDAUtilities_copyAsync_h +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" #include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" @@ -17,7 +18,7 @@ namespace cudautils { // Shouldn't compile for array types because of sizeof(T), but // let's add an assert with a more helpful message static_assert(std::is_array::value == false, "For array types, use the other overload with the size parameter"); - cudaMemcpyAsync(dst.get(), src.get(), sizeof(T), cudaMemcpyHostToDevice, stream); + cudaCheck(cudaMemcpyAsync(dst.get(), src.get(), sizeof(T), cudaMemcpyHostToDevice, stream)); } template @@ -25,7 +26,7 @@ namespace cudautils { const cudautils::device::unique_ptr& src, cudaStream_t stream) { static_assert(std::is_array::value == false, "For array types, use the other overload with the size parameter"); - cudaMemcpyAsync(dst.get(), src.get(), sizeof(T), cudaMemcpyHostToDevice, stream); + cudaCheck(cudaMemcpyAsync(dst.get(), src.get(), sizeof(T), cudaMemcpyHostToDevice, stream)); } // Multiple elements @@ -34,7 +35,7 @@ namespace cudautils { const cudautils::host::unique_ptr& src, size_t nelements, cudaStream_t stream) { - cudaMemcpyAsync(dst.get(), src.get(), nelements * sizeof(T), cudaMemcpyHostToDevice, stream); + cudaCheck(cudaMemcpyAsync(dst.get(), src.get(), nelements * sizeof(T), cudaMemcpyHostToDevice, stream)); } template @@ -42,7 +43,7 @@ namespace cudautils { const cudautils::device::unique_ptr& src, size_t nelements, cudaStream_t stream) { - cudaMemcpyAsync(dst.get(), src.get(), nelements * sizeof(T), cudaMemcpyHostToDevice, stream); + cudaCheck(cudaMemcpyAsync(dst.get(), src.get(), nelements * sizeof(T), cudaMemcpyHostToDevice, stream)); } } // namespace cudautils diff --git a/HeterogeneousCore/CUDAUtilities/interface/memsetAsync.h b/HeterogeneousCore/CUDAUtilities/interface/memsetAsync.h index b0167dcb9ed25..e20a606042793 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/memsetAsync.h +++ b/HeterogeneousCore/CUDAUtilities/interface/memsetAsync.h @@ -1,6 +1,7 @@ #ifndef HeterogeneousCore_CUDAUtilities_memsetAsync_h #define HeterogeneousCore_CUDAUtilities_memsetAsync_h +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" #include @@ -12,8 +13,8 @@ namespace cudautils { inline void memsetAsync(cudautils::device::unique_ptr& ptr, T value, cudaStream_t stream) { // Shouldn't compile for array types because of sizeof(T), but // let's add an assert with a more helpful message - static_assert(std::is_array::value == false, "For array types, use the other overload with the size parameter"); - cuda::memory::device::async::set(ptr.get(), value, sizeof(T), stream); + static_assert(std::is_array::value == false, "For array types, use the other overload with the size parameter");i + cudaCheck(cudaMemsetAsync(ptr.get(), value, sizeof(T), stream)); } /** @@ -24,7 +25,7 @@ namespace cudautils { */ template inline void memsetAsync(cudautils::device::unique_ptr& ptr, int value, size_t nelements, cudaStream_t stream) { - cuda::memory::device::async::set(ptr.get(), value, nelements * sizeof(T), stream); + cudaCheck(cudaMemsetAsync(ptr.get(), value, nelements * sizeof(T), stream)); } } // namespace cudautils diff --git a/HeterogeneousCore/CUDAUtilities/test/AtomicPairCounter_t.cu b/HeterogeneousCore/CUDAUtilities/test/AtomicPairCounter_t.cu index 1b9669aa88368..1c28758fb970e 100644 --- a/HeterogeneousCore/CUDAUtilities/test/AtomicPairCounter_t.cu +++ b/HeterogeneousCore/CUDAUtilities/test/AtomicPairCounter_t.cu @@ -1,3 +1,4 @@ +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/AtomicPairCounter.h" #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" @@ -39,24 +40,24 @@ __global__ void verify(AtomicPairCounter const *dc, uint32_t const *ind, uint32_ #include int main() { AtomicPairCounter *dc_d; - cudaMalloc(&dc_d, sizeof(AtomicPairCounter)); - cudaMemset(dc_d, 0, sizeof(AtomicPairCounter)); + cudaCheck(cudaMalloc(&dc_d, sizeof(AtomicPairCounter))); + cudaCheck(cudaMemset(dc_d, 0, sizeof(AtomicPairCounter))); std::cout << "size " << sizeof(AtomicPairCounter) << std::endl; constexpr uint32_t N = 20000; constexpr uint32_t M = N * 6; uint32_t *n_d, *m_d; - cudaMalloc(&n_d, N * sizeof(int)); + cudaCheck(cudaMalloc(&n_d, N * sizeof(int))); // cudaMemset(n_d, 0, N*sizeof(int)); - cudaMalloc(&m_d, M * sizeof(int)); + cudaCheck(cudaMalloc(&m_d, M * sizeof(int))); update<<<2000, 512>>>(dc_d, n_d, m_d, 10000); finalize<<<1, 1>>>(dc_d, n_d, m_d, 10000); verify<<<2000, 512>>>(dc_d, n_d, m_d, 10000); AtomicPairCounter dc; - cudaMemcpy(&dc, dc_d, sizeof(AtomicPairCounter), cudaMemcpyDeviceToHost); + cudaCheck(cudaMemcpy(&dc, dc_d, sizeof(AtomicPairCounter), cudaMemcpyDeviceToHost)); std::cout << dc.get().n << ' ' << dc.get().m << std::endl; diff --git a/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu b/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu index 3db8b611012c6..c0b374b427d4f 100644 --- a/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu +++ b/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu @@ -6,6 +6,7 @@ #include +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" #include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h" @@ -26,7 +27,7 @@ void go() { T v[N]; auto v_d = cuda::memory::device::make_unique(current_device, N); - cudaMemcpy(v_d.get(), v, N * sizeof(T), cudaMemcpyHostToDevice); + cudaCheck(cudaMemcpy(v_d.get(), v, N * sizeof(T), cudaMemcpyHostToDevice)); constexpr uint32_t nParts = 10; constexpr uint32_t partSize = N / nParts; @@ -65,7 +66,7 @@ void go() { offsets[10] = 3297 + offsets[9]; } - cudaMemcpy(off_d.get(), offsets, 4 * (nParts + 1), cudaMemcpyHostToDevice); + cudaCheck(cudaMemcpy(off_d.get(), offsets, 4 * (nParts + 1), cudaMemcpyHostToDevice)); for (long long j = 0; j < N; j++) v[j] = rgen(eng); @@ -75,10 +76,10 @@ void go() { v[j] = sizeof(T) == 1 ? 22 : 3456; } - cudaMemcpy(v_d.get(), v, N * sizeof(T), cudaMemcpyHostToDevice); + cudaCheck(cudaMemcpy(v_d.get(), v, N * sizeof(T), cudaMemcpyHostToDevice)); cudautils::fillManyFromVector(h_d.get(), ws_d.get(), nParts, v_d.get(), off_d.get(), offsets[10], 256, 0); - cudaMemcpy(&h, h_d.get(), sizeof(Hist), cudaMemcpyDeviceToHost); + cudaCheck(cudaMemcpy(&h, h_d.get(), sizeof(Hist), cudaMemcpyDeviceToHost)); assert(0 == h.off[0]); assert(offsets[10] == h.size()); diff --git a/HeterogeneousCore/CUDAUtilities/test/OneHistoContainer_t.cu b/HeterogeneousCore/CUDAUtilities/test/OneHistoContainer_t.cu index 4307da7abcf23..0bb4e4a353b86 100644 --- a/HeterogeneousCore/CUDAUtilities/test/OneHistoContainer_t.cu +++ b/HeterogeneousCore/CUDAUtilities/test/OneHistoContainer_t.cu @@ -6,6 +6,7 @@ #include +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" #include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h" #include "HeterogeneousCore/CUDAUtilities/interface/launch.h" @@ -131,7 +132,7 @@ void go() { assert(v_d.get()); assert(v); - cudaMemcpy(v_d.get(), v , N * sizeof(T), cudaMemcpyHostToDevice); + cudaCheck(cudaMemcpy(v_d.get(), v , N * sizeof(T), cudaMemcpyHostToDevice)); assert(v_d.get()); cudautils::launch(mykernel, {1, 256}, v_d.get(), N); } diff --git a/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.h b/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.h index a82a3c0121212..4533c2c2882b0 100644 --- a/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.h +++ b/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.h @@ -8,6 +8,7 @@ #ifdef __CUDACC__ #include +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h" #endif @@ -170,7 +171,7 @@ int main() { auto sa_d = cuda::memory::device::make_unique(current_device, 1); auto ws_d = cuda::memory::device::make_unique(current_device, Assoc::wsSize()); - cudaMemcpy(v_d.get(), tr.data(), N * sizeof(std::array), cudaMemcpyHostToDevice); + cudaCheck(cudaMemcpy(v_d.get(), tr.data(), N * sizeof(std::array), cudaMemcpyHostToDevice)); #else auto a_d = std::make_unique(); auto sa_d = std::make_unique(); @@ -198,7 +199,7 @@ int main() { Assoc la; #ifdef __CUDACC__ - cudaMemcpy(&la, a_d.get(), sizeof(Assoc), cudaMemcpyDeviceToHost); + cudaCheck(cudaMemcpy(&la, a_d.get(), sizeof(Assoc), cudaMemcpyDeviceToHost)); #else memcpy(&la, a_d.get(), sizeof(Assoc)); // not required, easier #endif @@ -224,17 +225,17 @@ int main() { AtomicPairCounter dc(0); #ifdef __CUDACC__ - cudaMalloc(&dc_d, sizeof(AtomicPairCounter)); - cudaMemset(dc_d, 0, sizeof(AtomicPairCounter)); + cudaCheck(cudaMalloc(&dc_d, sizeof(AtomicPairCounter))); + cudaCheck(cudaMemset(dc_d, 0, sizeof(AtomicPairCounter))); nBlocks = (N + nThreads - 1) / nThreads; fillBulk<<>>(dc_d, v_d.get(), a_d.get(), N); cudautils::finalizeBulk<<>>(dc_d, a_d.get()); verifyBulk<<<1, 1>>>(a_d.get(), dc_d); - cudaMemcpy(&la, a_d.get(), sizeof(Assoc), cudaMemcpyDeviceToHost); - cudaMemcpy(&dc, dc_d, sizeof(AtomicPairCounter), cudaMemcpyDeviceToHost); + cudaCheck(cudaMemcpy(&la, a_d.get(), sizeof(Assoc), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(&dc, dc_d, sizeof(AtomicPairCounter), cudaMemcpyDeviceToHost)); - cudaMemset(dc_d, 0, sizeof(AtomicPairCounter)); + cudaCheck(cudaMemset(dc_d, 0, sizeof(AtomicPairCounter))); fillBulk<<>>(dc_d, v_d.get(), sa_d.get(), N); cudautils::finalizeBulk<<>>(dc_d, sa_d.get()); verifyBulk<<<1, 1>>>(sa_d.get(), dc_d); diff --git a/HeterogeneousCore/CUDAUtilities/test/copyAsync_t.cpp b/HeterogeneousCore/CUDAUtilities/test/copyAsync_t.cpp index d29b41dffc7d3..0168271baa4fa 100644 --- a/HeterogeneousCore/CUDAUtilities/test/copyAsync_t.cpp +++ b/HeterogeneousCore/CUDAUtilities/test/copyAsync_t.cpp @@ -20,10 +20,9 @@ TEST_CASE("copyAsync", "[cudaMemTools]") { auto device = cudautils::make_device_unique(stream); auto host = cudautils::make_host_unique(stream); + cudaCheck(cudaMemcpyAsync(host.get(), device.get(), sizeof(int), cudaMemcpyDeviceToHost, stream)); cudautils::copyAsync(device, host_orig, stream); cudaCheck(cudaStreamSynchronize(stream)); - cudaMemcpyAsync(host.get(), device.get(), sizeof(int), cudaMemcpyDeviceToHost, stream); - stream.synchronize(); REQUIRE(*host == 42); } @@ -40,10 +39,9 @@ TEST_CASE("copyAsync", "[cudaMemTools]") { auto host = cudautils::make_host_unique(N, stream); SECTION("Copy all") { + cudaCheck(cudaMemcpyAsync(host.get(), device.get(), N * sizeof(int), cudaMemcpyDeviceToHost, stream)); cudautils::copyAsync(device, host_orig, N, stream); cudaCheck(cudaStreamSynchronize(stream)); - cudaMemcpyAsync(host.get(), device.get(), N * sizeof(int), cudaMemcpyDeviceToHost, stream); - stream.synchronize(); for (int i = 0; i < N; ++i) { CHECK(host[i] == i); } @@ -54,10 +52,9 @@ TEST_CASE("copyAsync", "[cudaMemTools]") { } SECTION("Copy some") { + cudaCheck(cudaMemcpyAsync(host.get(), device.get(), 42 * sizeof(int), cudaMemcpyDeviceToHost, stream)); cudautils::copyAsync(device, host_orig, 42, stream); cudaCheck(cudaStreamSynchronize(stream)); - cudaMemcpyAsync(host.get(), device.get(), 42 * sizeof(int), cudaMemcpyDeviceToHost, stream); - stream.synchronize(); for (int i = 0; i < 42; ++i) { CHECK(host[i] == 200 + i); } @@ -73,7 +70,7 @@ TEST_CASE("copyAsync", "[cudaMemTools]") { auto device = cudautils::make_device_unique(stream); auto host = cudautils::make_host_unique(stream); - cudaMemcpyAsync(device.get(), host_orig.get(), sizeof(int), cudaMemcpyHostToDevice, stream); + cudaCheck(cudaMemcpyAsync(device.get(), host_orig.get(), sizeof(int), cudaMemcpyHostToDevice, stream)); cudautils::copyAsync(host, device, stream); cudaCheck(cudaStreamSynchronize(stream)); @@ -92,7 +89,7 @@ TEST_CASE("copyAsync", "[cudaMemTools]") { auto host = cudautils::make_host_unique(N, stream); SECTION("Copy all") { - cudaMemcpyAsync(device.get(), host_orig.get(), N * sizeof(int), cudaMemcpyHostToDevice, stream); + cudaCheck(cudaMemcpyAsync(device.get(), host_orig.get(), N * sizeof(int), cudaMemcpyHostToDevice, stream)); cudautils::copyAsync(host, device, N, stream); cudaCheck(cudaStreamSynchronize(stream)); for (int i = 0; i < N; ++i) { @@ -105,7 +102,7 @@ TEST_CASE("copyAsync", "[cudaMemTools]") { } SECTION("Copy some") { - cudaMemcpyAsync(device.get(), host_orig.get(), 42 * sizeof(int), cudaMemcpyHostToDevice, stream); + cudaCheck(cudaMemcpyAsync(device.get(), host_orig.get(), 42 * sizeof(int), cudaMemcpyHostToDevice, stream)); cudautils::copyAsync(host, device, 42, stream); cudaCheck(cudaStreamSynchronize(stream)); for (int i = 0; i < 42; ++i) { diff --git a/HeterogeneousCore/CUDAUtilities/test/prefixScan_t.cu b/HeterogeneousCore/CUDAUtilities/test/prefixScan_t.cu index f8266898323cd..b83c03f710012 100644 --- a/HeterogeneousCore/CUDAUtilities/test/prefixScan_t.cu +++ b/HeterogeneousCore/CUDAUtilities/test/prefixScan_t.cu @@ -2,6 +2,7 @@ #include +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/prefixScan.h" #include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h" @@ -107,9 +108,9 @@ int main() { uint32_t *d_out1; uint32_t *d_out2; - cudaMalloc(&d_in, num_items * sizeof(uint32_t)); - cudaMalloc(&d_out1, num_items * sizeof(uint32_t)); - cudaMalloc(&d_out2, num_items * sizeof(uint32_t)); + cudaCheck(cudaMalloc(&d_in, num_items * sizeof(uint32_t))); + cudaCheck(cudaMalloc(&d_out1, num_items * sizeof(uint32_t))); + cudaCheck(cudaMalloc(&d_out2, num_items * sizeof(uint32_t))); auto nthreads = 256; auto nblocks = (num_items + nthreads - 1) / nthreads; @@ -118,8 +119,8 @@ int main() { // the block counter int32_t *d_pc; - cudaMalloc(&d_pc, sizeof(int32_t)); - cudaMemset(d_pc, 0, 4); + cudaCheck(cudaMalloc(&d_pc, sizeof(int32_t))); + cudaCheck(cudaMemset(d_pc, 0, 4)); nthreads = 1024; nblocks = (num_items + nthreads - 1) / nthreads; @@ -139,7 +140,7 @@ int main() { // Allocate temporary storage for inclusive prefix sum // fake larger ws already available temp_storage_bytes *= 8; - cudaMalloc(&d_temp_storage, temp_storage_bytes); + cudaCheck(cudaMalloc(&d_temp_storage, temp_storage_bytes)); std::cout << "temp storage " << temp_storage_bytes << std::endl; // Run inclusive prefix sum CubDebugExit(cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out2, num_items)); diff --git a/HeterogeneousCore/CUDAUtilities/test/radixSort_t.cu b/HeterogeneousCore/CUDAUtilities/test/radixSort_t.cu index e1d0a42252482..4573ae106238c 100644 --- a/HeterogeneousCore/CUDAUtilities/test/radixSort_t.cu +++ b/HeterogeneousCore/CUDAUtilities/test/radixSort_t.cu @@ -10,6 +10,7 @@ #include +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h" #include "HeterogeneousCore/CUDAUtilities/interface/launch.h" #include "HeterogeneousCore/CUDAUtilities/interface/radixSort.h" @@ -102,8 +103,8 @@ void go(bool useShared) { auto ws_d = cuda::memory::device::make_unique(current_device, N); auto off_d = cuda::memory::device::make_unique(current_device, blocks + 1); - cudaMemcpy(v_d.get(), v , N * sizeof(T), cudaMemcpyHostToDevice); - cudaMemcpy(off_d.get(), offsets, 4 * (blocks+1), cudaMemcpyHostToDevice); + cudaCheck(cudaMemcpy(v_d.get(), v , N * sizeof(T), cudaMemcpyHostToDevice)); + cudaCheck(cudaMemcpy(off_d.get(), offsets, 4 * (blocks+1), cudaMemcpyHostToDevice)); if (i < 2) std::cout << "lauch for " << offsets[blocks] << std::endl; @@ -121,7 +122,7 @@ void go(bool useShared) { if (i == 0) std::cout << "done for " << offsets[blocks] << std::endl; - cudaMemcpy(ind, ind_d.get(), 2 * N, cudaMemcpyDeviceToHost); + cudaCheck(cudaMemcpy(ind, ind_d.get(), 2 * N, cudaMemcpyDeviceToHost)); delta += (std::chrono::high_resolution_clock::now() - start); diff --git a/HeterogeneousCore/Producer/test/TestHeterogeneousEDProducerGPUHelpers.cu b/HeterogeneousCore/Producer/test/TestHeterogeneousEDProducerGPUHelpers.cu index 017092452c1e1..fc6e0684660b7 100644 --- a/HeterogeneousCore/Producer/test/TestHeterogeneousEDProducerGPUHelpers.cu +++ b/HeterogeneousCore/Producer/test/TestHeterogeneousEDProducerGPUHelpers.cu @@ -77,8 +77,8 @@ int TestAcceleratorServiceProducerGPUHelpers_simple_kernel(int input) { auto d_b = cuda::memory::device::make_unique(current_device, NUM_VALUES); auto d_c = cuda::memory::device::make_unique(current_device, NUM_VALUES); - cudaMemcpyAsync(d_a.get(), h_a.get(), NUM_VALUES * sizeof(int), cudaMemcpyHostToDevice, stream.id()); - cudaMemcpyAsync(d_b.get(), h_b.get(), NUM_VALUES * sizeof(int), cudaMemcpyHostToDevice, stream.id()); + cudaCheck(cudaMemcpyAsync(d_a.get(), h_a.get(), NUM_VALUES * sizeof(int), cudaMemcpyHostToDevice, stream.id())); + cudaCheck(cudaMemcpyAsync(d_b.get(), h_b.get(), NUM_VALUES * sizeof(int), cudaMemcpyHostToDevice, stream.id())); int threadsPerBlock{256}; int blocksPerGrid = (NUM_VALUES + threadsPerBlock - 1) / threadsPerBlock; @@ -91,7 +91,7 @@ int TestAcceleratorServiceProducerGPUHelpers_simple_kernel(int input) { d_a.get(), d_b.get(), d_c.get(), NUM_VALUES); */ - cudaMemcpyAsync(h_c.get(), d_c.get(), NUM_VALUES * sizeof(int), cudaMemcpyDeviceToHost, stream.id()); + cudaCheck(cudaMemcpyAsync(h_c.get(), d_c.get(), NUM_VALUES * sizeof(int), cudaMemcpyDeviceToHost, stream.id())); stream.synchronize(); @@ -124,7 +124,7 @@ TestHeterogeneousEDProducerGPUTask::ResultType TestHeterogeneousEDProducerGPUTas // First make the sanity check if (inputArrays.first != nullptr) { auto h_check = std::make_unique(NUM_VALUES); - cudaMemcpy(h_check.get(), inputArrays.first, NUM_VALUES * sizeof(float), cudaMemcpyDeviceToHost); + cudaCheck(cudaMemcpy(h_check.get(), inputArrays.first, NUM_VALUES * sizeof(float), cudaMemcpyDeviceToHost)); for (int i = 0; i < NUM_VALUES; ++i) { if (h_check[i] != i) { throw cms::Exception("Assert") << "Sanity check on element " << i << " failed, expected " << i << " got " @@ -146,8 +146,8 @@ TestHeterogeneousEDProducerGPUTask::ResultType TestHeterogeneousEDProducerGPUTas } // Create stream - cudaMemcpyAsync(d_a.get(), h_a.get(), NUM_VALUES * sizeof(float), cudaMemcpyHostToDevice, stream.id()); - cudaMemcpyAsync(d_b.get(), h_b.get(), NUM_VALUES * sizeof(float), cudaMemcpyHostToDevice, stream.id()); + cudaCheck(cudaMemcpyAsync(d_a.get(), h_a.get(), NUM_VALUES * sizeof(float), cudaMemcpyHostToDevice, stream.id())); + cudaCheck(cudaMemcpyAsync(d_b.get(), h_b.get(), NUM_VALUES * sizeof(float), cudaMemcpyHostToDevice, stream.id())); int threadsPerBlock{32}; int blocksPerGrid = (NUM_VALUES + threadsPerBlock - 1) / threadsPerBlock; @@ -195,7 +195,7 @@ void TestHeterogeneousEDProducerGPUTask::release(const std::string &label, cuda: int TestHeterogeneousEDProducerGPUTask::getResult(const ResultTypeRaw &d_ac, cuda::stream_t<> &stream) { auto h_c = cuda::memory::host::make_unique(NUM_VALUES); - cudaMemcpyAsync(h_c.get(), d_ac.second, NUM_VALUES * sizeof(int), cudaMemcpyDeviceToHost, stream.id()); + cudaCheck(cudaMemcpyAsync(h_c.get(), d_ac.second, NUM_VALUES * sizeof(int), cudaMemcpyDeviceToHost, stream.id())); stream.synchronize(); float ret = 0; diff --git a/RecoLocalTracker/SiPixelClusterizer/src/SiPixelFedCablingMapGPUWrapper.cc b/RecoLocalTracker/SiPixelClusterizer/src/SiPixelFedCablingMapGPUWrapper.cc index d484d9eeadea9..451fe2fb6d75b 100644 --- a/RecoLocalTracker/SiPixelClusterizer/src/SiPixelFedCablingMapGPUWrapper.cc +++ b/RecoLocalTracker/SiPixelClusterizer/src/SiPixelFedCablingMapGPUWrapper.cc @@ -156,8 +156,8 @@ cudautils::device::unique_ptr SiPixelFedCablingMapGPUWrapper::g } } - cudaMemcpyAsync( - modToUnpDevice.get(), modToUnpHost.get(), pixelgpudetails::MAX_SIZE * sizeof(unsigned char), cudaMemcpyHostToDevice, cudaStream); + cudaCheck(cudaMemcpyAsync( + modToUnpDevice.get(), modToUnpHost.get(), pixelgpudetails::MAX_SIZE * sizeof(unsigned char), cudaMemcpyHostToDevice, cudaStream)); return modToUnpDevice; } diff --git a/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering_t.h b/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering_t.h index d65ca1033d454..dd93a8f622898 100644 --- a/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering_t.h +++ b/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering_t.h @@ -12,8 +12,8 @@ #ifdef __CUDACC__ #include +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h" -#include "HeterogeneousCore/CUDAUtilities/interface/launch.h" #endif #include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h" @@ -250,14 +250,14 @@ int main(void) { #ifdef __CUDACC__ size_t size32 = n * sizeof(unsigned int); size_t size16 = n * sizeof(unsigned short); - // size_t size8 = n * sizeof(uint8_t); - - cudaMemcpy(d_moduleStart.get(), &nModules, sizeof(uint32_t), cudaMemcpyHostToDevice); - - cudaMemcpy(d_id.get(), h_id.get(), size16, cudaMemcpyHostToDevice); - cudaMemcpy(d_x.get(), h_x.get(), size16, cudaMemcpyHostToDevice); - cudaMemcpy(d_y.get(), h_y.get(), size16, cudaMemcpyHostToDevice); - cudaMemcpy(d_adc.get(), h_adc.get(), size16, cudaMemcpyHostToDevice); + // size_t size8 = n * sizeof(uint8_t); + + cudaCheck(cudaMemcpy(d_moduleStart.get(), &nModules, sizeof(uint32_t), cudaMemcpyHostToDevice)); + + cudaCheck(cudaMemcpy(d_id.get(), h_id.get(), size16, cudaMemcpyHostToDevice)); + cudaCheck(cudaMemcpy(d_x.get(), h_x.get(), size16, cudaMemcpyHostToDevice)); + cudaCheck(cudaMemcpy(d_y.get(), h_y.get(), size16, cudaMemcpyHostToDevice)); + cudaCheck(cudaMemcpy(d_adc.get(), h_adc.get(), size16, cudaMemcpyHostToDevice)); // Launch CUDA Kernels int threadsPerBlock = (kkk == 5) ? 512 : ((kkk == 3) ? 128 : 256); int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; @@ -270,7 +270,7 @@ int main(void) { std::cout << "CUDA findModules kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads\n"; - cudaMemset(d_clusInModule.get(), 0, MaxNumModules * sizeof(uint32_t)); + cudaCheck(cudaMemset(d_clusInModule.get(), 0, MaxNumModules * sizeof(uint32_t))); cudautils::launch(findClus, {blocksPerGrid, threadsPerBlock}, @@ -283,10 +283,10 @@ int main(void) { d_clus.get(), n); cudaDeviceSynchronize(); - cudaMemcpy(&nModules, d_moduleStart.get(), sizeof(uint32_t), cudaMemcpyDeviceToHost); + cudaCheck(cudaMemcpy(&nModules, d_moduleStart.get(), sizeof(uint32_t), cudaMemcpyDeviceToHost)); uint32_t nclus[MaxNumModules], moduleId[nModules]; - cudaMemcpy(&nclus, d_clusInModule.get(), MaxNumModules * sizeof(uint32_t), cudaMemcpyDeviceToHost); + cudaCheck(cudaMemcpy(&nclus, d_clusInModule.get(), MaxNumModules * sizeof(uint32_t), cudaMemcpyDeviceToHost)); std::cout << "before charge cut found " << std::accumulate(nclus, nclus + MaxNumModules, 0) << " clusters" << std::endl; @@ -351,10 +351,10 @@ int main(void) { std::cout << "found " << nModules << " Modules active" << std::endl; #ifdef __CUDACC__ - cudaMemcpy(h_id.get(), d_id.get(), size16, cudaMemcpyDeviceToHost); - cudaMemcpy(h_clus.get(), d_clus.get(), size16, cudaMemcpyDeviceToHost); - cudaMemcpy(&nclus, d_clusInModule.get(), MaxNumModules * sizeof(uint32_t), cudaMemcpyDeviceToHost); - cudaMemcpy(&moduleId, d_moduleId.get(), nModules * sizeof(uint32_t), cudaMemcpyDeviceToHost); + cudaCheck(cudaMemcpy(h_id.get(), d_id.get(), size16, cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(h_clus.get(), d_clus.get(), size32, cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(&nclus, d_clusInModule.get(), MaxNumModules * sizeof(uint32_t), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(&moduleId, d_moduleId.get(), nModules * sizeof(uint32_t), cudaMemcpyDeviceToHost)); #endif std::set clids; diff --git a/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPU.cu b/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPU.cu index a206feca83b52..44d1a375bcabe 100644 --- a/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPU.cu +++ b/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPU.cu @@ -261,10 +261,10 @@ void testFit() { kernelFastFit<<>>(hitsGPU, fast_fit_resultsGPU); cudaDeviceSynchronize(); - cudaMemcpy(fast_fit_resultsGPUret, + cudaCheck(cudaMemcpy(fast_fit_resultsGPUret, fast_fit_resultsGPU, Rfit::maxNumberOfTracks() * sizeof(Vector4d), - cudaMemcpyDeviceToHost); + cudaMemcpyDeviceToHost)); Rfit::Map4d fast_fit(fast_fit_resultsGPUret + 10, 4); std::cout << "Fitted values (FastFit, [X0, Y0, R, tan(theta)]): GPU\n" << fast_fit << std::endl; assert(isEqualFuzzy(fast_fit_results, fast_fit)); @@ -311,13 +311,13 @@ void testFit() { std::cout << "Fitted values (CircleFit):\n" << circle_fit_results.par << std::endl; - cudaMemcpy(circle_fit_resultsGPUret, circle_fit_resultsGPU, sizeof(Rfit::circle_fit), cudaMemcpyDeviceToHost); + cudaCheck(cudaMemcpy(circle_fit_resultsGPUret, circle_fit_resultsGPU, sizeof(Rfit::circle_fit), cudaMemcpyDeviceToHost)); std::cout << "Fitted values (CircleFit) GPU:\n" << circle_fit_resultsGPUret->par << std::endl; assert(isEqualFuzzy(circle_fit_results.par, circle_fit_resultsGPUret->par)); std::cout << "Fitted values (LineFit):\n" << line_fit_results.par << std::endl; // LINE_FIT GPU - cudaMemcpy(line_fit_resultsGPUret, line_fit_resultsGPU, sizeof(Rfit::line_fit), cudaMemcpyDeviceToHost); + cudaCheck(cudaMemcpy(line_fit_resultsGPUret, line_fit_resultsGPU, sizeof(Rfit::line_fit), cudaMemcpyDeviceToHost)); std::cout << "Fitted values (LineFit) GPU:\n" << line_fit_resultsGPUret->par << std::endl; assert(isEqualFuzzy(line_fit_results.par, line_fit_resultsGPUret->par, N == 5 ? 1e-4 : 1e-6)); // requires fma on CPU diff --git a/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPUNoFit.cu b/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPUNoFit.cu index ebaea2037eb2a..ff0d6aa5f68f5 100644 --- a/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPUNoFit.cu +++ b/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPUNoFit.cu @@ -73,18 +73,18 @@ void testMultiply() { Eigen::Matrix *multiply_resultGPU = nullptr; Eigen::Matrix *multiply_resultGPUret = new Eigen::Matrix(); - cudaMalloc((void **)&JGPU, sizeof(Eigen::Matrix)); - cudaMalloc((void **)&CGPU, sizeof(Eigen::Matrix)); - cudaMalloc((void **)&multiply_resultGPU, sizeof(Eigen::Matrix)); - cudaMemcpy(JGPU, &J, sizeof(Eigen::Matrix), cudaMemcpyHostToDevice); - cudaMemcpy(CGPU, &C, sizeof(Eigen::Matrix), cudaMemcpyHostToDevice); - cudaMemcpy(multiply_resultGPU, &multiply_result, sizeof(Eigen::Matrix), cudaMemcpyHostToDevice); + cudaCheck(cudaMalloc((void **)&JGPU, sizeof(Eigen::Matrix))); + cudaCheck(cudaMalloc((void **)&CGPU, sizeof(Eigen::Matrix))); + cudaCheck(cudaMalloc((void **)&multiply_resultGPU, sizeof(Eigen::Matrix))); + cudaCheck(cudaMemcpy(JGPU, &J, sizeof(Eigen::Matrix), cudaMemcpyHostToDevice)); + cudaCheck(cudaMemcpy(CGPU, &C, sizeof(Eigen::Matrix), cudaMemcpyHostToDevice)); + cudaCheck(cudaMemcpy(multiply_resultGPU, &multiply_result, sizeof(Eigen::Matrix), cudaMemcpyHostToDevice)); kernelMultiply<<<1, 1>>>(JGPU, CGPU, multiply_resultGPU); cudaDeviceSynchronize(); - cudaMemcpy( - multiply_resultGPUret, multiply_resultGPU, sizeof(Eigen::Matrix), cudaMemcpyDeviceToHost); + cudaCheck(cudaMemcpy( + multiply_resultGPUret, multiply_resultGPU, sizeof(Eigen::Matrix), cudaMemcpyDeviceToHost)); printIt(multiply_resultGPUret); assert(isEqualFuzzy(multiply_result, (*multiply_resultGPUret))); } @@ -104,14 +104,14 @@ void testInverse3x3() { std::cout << "Here is the matrix m:" << std::endl << m << std::endl; std::cout << "Its inverse is:" << std::endl << m.inverse() << std::endl; #endif - cudaMalloc((void **)&mGPU, sizeof(Matrix3d)); - cudaMalloc((void **)&mGPUret, sizeof(Matrix3d)); - cudaMemcpy(mGPU, &m, sizeof(Matrix3d), cudaMemcpyHostToDevice); + cudaCheck(cudaMalloc((void **)&mGPU, sizeof(Matrix3d))); + cudaCheck(cudaMalloc((void **)&mGPUret, sizeof(Matrix3d))); + cudaCheck(cudaMemcpy(mGPU, &m, sizeof(Matrix3d), cudaMemcpyHostToDevice)); kernelInverse3x3<<<1, 1>>>(mGPU, mGPUret); cudaDeviceSynchronize(); - cudaMemcpy(mCPUret, mGPUret, sizeof(Matrix3d), cudaMemcpyDeviceToHost); + cudaCheck(cudaMemcpy(mCPUret, mGPUret, sizeof(Matrix3d), cudaMemcpyDeviceToHost)); #if TEST_DEBUG std::cout << "Its GPU inverse is:" << std::endl << (*mCPUret) << std::endl; #endif @@ -133,14 +133,14 @@ void testInverse4x4() { std::cout << "Here is the matrix m:" << std::endl << m << std::endl; std::cout << "Its inverse is:" << std::endl << m.inverse() << std::endl; #endif - cudaMalloc((void **)&mGPU, sizeof(Matrix4d)); - cudaMalloc((void **)&mGPUret, sizeof(Matrix4d)); - cudaMemcpy(mGPU, &m, sizeof(Matrix4d), cudaMemcpyHostToDevice); + cudaCheck(cudaMalloc((void **)&mGPU, sizeof(Matrix4d))); + cudaCheck(cudaMalloc((void **)&mGPUret, sizeof(Matrix4d))); + cudaCheck(cudaMemcpy(mGPU, &m, sizeof(Matrix4d), cudaMemcpyHostToDevice)); kernelInverse4x4<<<1, 1>>>(mGPU, mGPUret); cudaDeviceSynchronize(); - cudaMemcpy(mCPUret, mGPUret, sizeof(Matrix4d), cudaMemcpyDeviceToHost); + cudaCheck(cudaMemcpy(mCPUret, mGPUret, sizeof(Matrix4d), cudaMemcpyDeviceToHost)); #if TEST_DEBUG std::cout << "Its GPU inverse is:" << std::endl << (*mCPUret) << std::endl; #endif @@ -162,14 +162,14 @@ void testInverse5x5() { std::cout << "Here is the matrix m:" << std::endl << m << std::endl; std::cout << "Its inverse is:" << std::endl << m.inverse() << std::endl; #endif - cudaMalloc((void **)&mGPU, sizeof(Matrix5d)); - cudaMalloc((void **)&mGPUret, sizeof(Matrix5d)); - cudaMemcpy(mGPU, &m, sizeof(Matrix5d), cudaMemcpyHostToDevice); + cudaCheck(cudaMalloc((void **)&mGPU, sizeof(Matrix5d))); + cudaCheck(cudaMalloc((void **)&mGPUret, sizeof(Matrix5d))); + cudaCheck(cudaMemcpy(mGPU, &m, sizeof(Matrix5d), cudaMemcpyHostToDevice)); kernelInverse5x5<<<1, 1>>>(mGPU, mGPUret); cudaDeviceSynchronize(); - cudaMemcpy(mCPUret, mGPUret, sizeof(Matrix5d), cudaMemcpyDeviceToHost); + cudaCheck(cudaMemcpy(mCPUret, mGPUret, sizeof(Matrix5d), cudaMemcpyDeviceToHost)); #if TEST_DEBUG std::cout << "Its GPU inverse is:" << std::endl << (*mCPUret) << std::endl; #endif @@ -195,15 +195,15 @@ void testEigenvalues() { std::cout << "The eigenvalues of M are:" << std::endl << (*ret) << std::endl; std::cout << "*************************\n\n" << std::endl; #endif - cudaMalloc((void **)&m_gpu, sizeof(Matrix3d)); - cudaMalloc((void **)&ret_gpu, sizeof(Eigen::SelfAdjointEigenSolver::RealVectorType)); - cudaMemcpy(m_gpu, &m, sizeof(Matrix3d), cudaMemcpyHostToDevice); + cudaCheck(cudaMalloc((void **)&m_gpu, sizeof(Matrix3d))); + cudaCheck(cudaMalloc((void **)&ret_gpu, sizeof(Eigen::SelfAdjointEigenSolver::RealVectorType))); + cudaCheck(cudaMemcpy(m_gpu, &m, sizeof(Matrix3d), cudaMemcpyHostToDevice)); kernel<<<1, 1>>>(m_gpu, ret_gpu); cudaDeviceSynchronize(); - cudaMemcpy(mgpudebug, m_gpu, sizeof(Matrix3d), cudaMemcpyDeviceToHost); - cudaMemcpy(ret1, ret_gpu, sizeof(Eigen::SelfAdjointEigenSolver::RealVectorType), cudaMemcpyDeviceToHost); + cudaCheck(cudaMemcpy(mgpudebug, m_gpu, sizeof(Matrix3d), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(ret1, ret_gpu, sizeof(Eigen::SelfAdjointEigenSolver::RealVectorType), cudaMemcpyDeviceToHost)); #if TEST_DEBUG std::cout << "GPU Generated Matrix M 3x3:\n" << (*mgpudebug) << std::endl; std::cout << "GPU The eigenvalues of M are:" << std::endl << (*ret1) << std::endl; diff --git a/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h b/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h index 2313bbe7803bd..59f586eae9812 100644 --- a/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h +++ b/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h @@ -6,6 +6,7 @@ #include +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h" #include "HeterogeneousCore/CUDAUtilities/interface/launch.h" #ifdef USE_DBSCAN @@ -126,10 +127,10 @@ int main() { std::cout << "v,t size " << ev.zvert.size() << ' ' << ev.ztrack.size() << std::endl; auto nt = ev.ztrack.size(); #ifdef __CUDACC__ - cudaMemcpy(LOC_WS(ntrks), &nt, sizeof(uint32_t), cudaMemcpyHostToDevice); - cudaMemcpy(LOC_WS(zt), ev.ztrack.data(), sizeof(float) * ev.ztrack.size(), cudaMemcpyHostToDevice); - cudaMemcpy(LOC_WS(ezt2), ev.eztrack.data(), sizeof(float) * ev.eztrack.size(), cudaMemcpyHostToDevice); - cudaMemcpy(LOC_WS(ptt2), ev.pttrack.data(), sizeof(float) * ev.eztrack.size(), cudaMemcpyHostToDevice); + cudaCheck(cudaMemcpy(LOC_WS(ntrks), &nt, sizeof(uint32_t), cudaMemcpyHostToDevice)); + cudaCheck(cudaMemcpy(LOC_WS(zt), ev.ztrack.data(), sizeof(float) * ev.ztrack.size(), cudaMemcpyHostToDevice)); + cudaCheck(cudaMemcpy(LOC_WS(ezt2), ev.eztrack.data(), sizeof(float) * ev.eztrack.size(), cudaMemcpyHostToDevice)); + cudaCheck(cudaMemcpy(LOC_WS(ptt2), ev.pttrack.data(), sizeof(float) * ev.eztrack.size(), cudaMemcpyHostToDevice)); #else ::memcpy(LOC_WS(ntrks), &nt, sizeof(uint32_t)); ::memcpy(LOC_WS(zt), ev.ztrack.data(), sizeof(float) * ev.ztrack.size()); @@ -162,7 +163,7 @@ int main() { cudautils::launch(fitVertices, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 50.f); cudaCheck(cudaGetLastError()); - cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost); + cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost)); #else print(onGPU_d.get(), ws_d.get()); @@ -207,8 +208,8 @@ int main() { #endif #ifdef __CUDACC__ - cudaMemcpy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t), cudaMemcpyDeviceToHost); - cudaMemcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float), cudaMemcpyDeviceToHost); + cudaCheck(cudaMemcpy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float), cudaMemcpyDeviceToHost)); #else memcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float)); #endif @@ -223,9 +224,9 @@ int main() { #ifdef __CUDACC__ cudautils::launch(fitVertices, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 50.f); - cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost); - cudaMemcpy(nn, LOC_ONGPU(ndof), nv * sizeof(uint32_t), cudaMemcpyDeviceToHost); - cudaMemcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float), cudaMemcpyDeviceToHost); + cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float), cudaMemcpyDeviceToHost)); #else fitVertices(onGPU_d.get(), ws_d.get(), 50.f); nv = onGPU_d->nvFinal; @@ -243,7 +244,7 @@ int main() { #ifdef __CUDACC__ // one vertex per block!!! cudautils::launch(splitVertices, {1024, 64}, onGPU_d.get(), ws_d.get(), 9.f); - cudaMemcpy(&nv, LOC_WS(nvIntermediate), sizeof(uint32_t), cudaMemcpyDeviceToHost); + cudaCheck(cudaMemcpy(&nv, LOC_WS(nvIntermediate), sizeof(uint32_t), cudaMemcpyDeviceToHost)); #else gridDim.x = 1024; // nv ???? assert(blockIdx.x == 0); @@ -260,7 +261,7 @@ int main() { cudautils::launch(sortByPt2, {1, 256}, onGPU_d.get(), ws_d.get()); cudaCheck(cudaGetLastError()); - cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost); + cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost)); #else fitVertices(onGPU_d.get(), ws_d.get(), 5000.f); sortByPt2(onGPU_d.get(), ws_d.get()); @@ -274,12 +275,12 @@ int main() { } #ifdef __CUDACC__ - cudaMemcpy(zv, LOC_ONGPU(zv), nv * sizeof(float), cudaMemcpyDeviceToHost); - cudaMemcpy(wv, LOC_ONGPU(wv), nv * sizeof(float), cudaMemcpyDeviceToHost); - cudaMemcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float), cudaMemcpyDeviceToHost); - cudaMemcpy(ptv2, LOC_ONGPU(ptv2), nv * sizeof(float), cudaMemcpyDeviceToHost); - cudaMemcpy(nn , LOC_ONGPU(ndof), nv * sizeof(int32_t), cudaMemcpyDeviceToHost); - cudaMemcpy(ind ,LOC_ONGPU(sortInd), nv * sizeof(uint16_t), cudaMemcpyDeviceToHost); + cudaCheck(cudaMemcpy(zv, LOC_ONGPU(zv), nv * sizeof(float), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(wv, LOC_ONGPU(wv), nv * sizeof(float), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(ptv2, LOC_ONGPU(ptv2), nv * sizeof(float), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(nn , LOC_ONGPU(ndof), nv * sizeof(int32_t), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(ind ,LOC_ONGPU(sortInd), nv * sizeof(uint16_t), cudaMemcpyDeviceToHost)); #endif for (auto j = 0U; j < nv; ++j) if (nn[j] > 0) From c7b7f03fbb0e52a858f816364b30fa4e75bca33e Mon Sep 17 00:00:00 2001 From: Wahid Redjeb Date: Mon, 28 Oct 2019 12:31:44 +0100 Subject: [PATCH 09/13] Fix errors, missing include of launch.h --- HeterogeneousCore/CUDAUtilities/interface/memsetAsync.h | 2 +- RecoLocalTracker/SiPixelClusterizer/test/gpuClustering_t.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/HeterogeneousCore/CUDAUtilities/interface/memsetAsync.h b/HeterogeneousCore/CUDAUtilities/interface/memsetAsync.h index e20a606042793..b9ce5a001d41f 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/memsetAsync.h +++ b/HeterogeneousCore/CUDAUtilities/interface/memsetAsync.h @@ -13,7 +13,7 @@ namespace cudautils { inline void memsetAsync(cudautils::device::unique_ptr& ptr, T value, cudaStream_t stream) { // Shouldn't compile for array types because of sizeof(T), but // let's add an assert with a more helpful message - static_assert(std::is_array::value == false, "For array types, use the other overload with the size parameter");i + static_assert(std::is_array::value == false, "For array types, use the other overload with the size parameter"); cudaCheck(cudaMemsetAsync(ptr.get(), value, sizeof(T), stream)); } diff --git a/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering_t.h b/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering_t.h index dd93a8f622898..edc29d6452e69 100644 --- a/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering_t.h +++ b/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering_t.h @@ -14,6 +14,7 @@ #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h" +#include "HeterogeneousCore/CUDAUtilities/interface/launch.h" #endif #include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h" From e007a5eb70b96edbd89a56e34ba12c9fe478ded2 Mon Sep 17 00:00:00 2001 From: Wahid Redjeb Date: Mon, 28 Oct 2019 12:33:14 +0100 Subject: [PATCH 10/13] Apply code-format --- .../test/gpuFrameTransformTest.cpp | 1 - DataFormats/Math/test/CholeskyInvert_t.cu | 2 +- DataFormats/Math/test/cudaMathTest.cu | 8 ++-- .../CUDACore/test/test_CUDAScopedContext.cc | 2 +- .../CUDATest/plugins/TestCUDAProducerGPUEW.cc | 3 +- .../plugins/TestCUDAProducerGPUEWTask.cc | 6 ++- .../plugins/TestCUDAProducerGPUKernel.cu | 6 +-- .../plugins/TestCUDAProducerGPUtoCPU.cc | 6 ++- .../test/test_TestCUDAProducerGPUFirst.cc | 4 +- .../CUDAUtilities/interface/copyAsync.h | 4 +- .../CUDAUtilities/interface/prefixScan.h | 2 +- .../CUDAUtilities/test/HistoContainer_t.cu | 6 +-- .../CUDAUtilities/test/OneHistoContainer_t.cu | 2 +- .../CUDAUtilities/test/OneToManyAssoc_t.h | 6 +-- .../CUDAUtilities/test/copyAsync_t.cpp | 6 +-- .../CUDAUtilities/test/radixSort_t.cu | 11 +++--- .../TestHeterogeneousEDProducerGPUHelpers.cu | 8 ++-- .../src/SiPixelFedCablingMapGPUWrapper.cc | 9 +++-- .../SiPixelClusterizer/test/gpuClustering_t.h | 38 +++++++++---------- .../PixelTrackFitting/test/testEigenGPU.cu | 9 +++-- .../test/testEigenGPUNoFit.cu | 6 ++- .../PixelVertexFinding/test/VertexFinder_t.h | 14 +++---- 22 files changed, 87 insertions(+), 72 deletions(-) diff --git a/DataFormats/GeometrySurface/test/gpuFrameTransformTest.cpp b/DataFormats/GeometrySurface/test/gpuFrameTransformTest.cpp index cae4e48f07b6b..d02672c08d5d1 100644 --- a/DataFormats/GeometrySurface/test/gpuFrameTransformTest.cpp +++ b/DataFormats/GeometrySurface/test/gpuFrameTransformTest.cpp @@ -76,7 +76,6 @@ int main(void) { auto d_sf = cuda::memory::device::make_unique(current_device, sizeof(SFrame)); cudaCheck(cudaMemcpy(d_sf.get(), &sf1, sizeof(SFrame), cudaMemcpyHostToDevice)); - for (auto i = 0U; i < size; ++i) { xl[i] = yl[i] = 0.1f * float(i) - float(size / 2); le[3 * i] = 0.01f; diff --git a/DataFormats/Math/test/CholeskyInvert_t.cu b/DataFormats/Math/test/CholeskyInvert_t.cu index 4adbd885eab7b..dca89682113fe 100644 --- a/DataFormats/Math/test/CholeskyInvert_t.cu +++ b/DataFormats/Math/test/CholeskyInvert_t.cu @@ -151,7 +151,7 @@ void go(bool soa) { cudautils::launch(invertSOA, {blocksPerGrid, threadsPerBlock}, m_d.get(), SIZE); else cudautils::launch(invert, {blocksPerGrid, threadsPerBlock}, (MX *)(m_d.get()), SIZE); - + cudaCheck(cudaMemcpy(&mm, m_d.get(), stride() * sizeof(MX), cudaMemcpyDeviceToHost)); delta += (std::chrono::high_resolution_clock::now() - start); diff --git a/DataFormats/Math/test/cudaMathTest.cu b/DataFormats/Math/test/cudaMathTest.cu index efb99d12ced9c..46aae5a64f217 100644 --- a/DataFormats/Math/test/cudaMathTest.cu +++ b/DataFormats/Math/test/cudaMathTest.cu @@ -106,7 +106,7 @@ void go() { auto d_A = cuda::memory::device::make_unique(current_device, numElements); auto d_B = cuda::memory::device::make_unique(current_device, numElements); auto d_C = cuda::memory::device::make_unique(current_device, numElements); - + cudaCheck(cudaMemcpy(d_A.get(), h_A.get(), size, cudaMemcpyHostToDevice)); cudaCheck(cudaMemcpy(d_B.get(), h_B.get(), size, cudaMemcpyHostToDevice)); delta += (std::chrono::high_resolution_clock::now() - start); @@ -119,13 +119,15 @@ void go() { std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads\n"; delta -= (std::chrono::high_resolution_clock::now() - start); - cudautils::launch(vectorOp, {blocksPerGrid, threadsPerBlock}, d_A.get(), d_B.get(), d_C.get(), numElements); + cudautils::launch( + vectorOp, {blocksPerGrid, threadsPerBlock}, d_A.get(), d_B.get(), d_C.get(), numElements); delta += (std::chrono::high_resolution_clock::now() - start); std::cout << "cuda computation took " << std::chrono::duration_cast(delta).count() << " ms" << std::endl; delta -= (std::chrono::high_resolution_clock::now() - start); - cudautils::launch(vectorOp, {blocksPerGrid, threadsPerBlock}, d_A.get(), d_B.get(), d_C.get(), numElements); + cudautils::launch( + vectorOp, {blocksPerGrid, threadsPerBlock}, d_A.get(), d_B.get(), d_C.get(), numElements); delta += (std::chrono::high_resolution_clock::now() - start); std::cout << "cuda computation took " << std::chrono::duration_cast(delta).count() << " ms" << std::endl; diff --git a/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc b/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc index 04f2789f83f43..3e06ed15d7594 100644 --- a/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc +++ b/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc @@ -115,7 +115,7 @@ TEST_CASE("Use of CUDAScopedContext", "[CUDACore]") { h_a1 = 0; h_a2 = 0; int h_a3 = 0; - + cudaCheck(cudaMemcpyAsync(&h_a1, d_a1.get(), sizeof(int), cudaMemcpyDeviceToHost, ctx.stream())); cudaCheck(cudaMemcpyAsync(&h_a2, d_a2.get(), sizeof(int), cudaMemcpyDeviceToHost, ctx.stream())); cudaCheck(cudaMemcpyAsync(&h_a3, d_a3.get(), sizeof(int), cudaMemcpyDeviceToHost, ctx.stream())); diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEW.cc b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEW.cc index 046150f2b6930..74e5af7c46baf 100644 --- a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEW.cc +++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEW.cc @@ -68,7 +68,8 @@ void TestCUDAProducerGPUEW::acquire(const edm::Event& iEvent, // Mimick the need to transfer some of the GPU data back to CPU to // be used for something within this module, or to be put in the // event. - cudaCheck(cudaMemcpyAsync(hostData_.get(), devicePtr_.get() + 10, sizeof(float), cudaMemcpyDeviceToHost, ctx.stream())); + cudaCheck( + cudaMemcpyAsync(hostData_.get(), devicePtr_.get() + 10, sizeof(float), cudaMemcpyDeviceToHost, ctx.stream())); edm::LogVerbatim("TestCUDAProducerGPUEW") << label_ << " TestCUDAProducerGPUEW::acquire end event " << iEvent.id().event() << " stream " << iEvent.streamID(); } diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEWTask.cc b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEWTask.cc index db2d6593d7a95..0c8aad0931f15 100644 --- a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEWTask.cc +++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEWTask.cc @@ -76,7 +76,8 @@ void TestCUDAProducerGPUEWTask::acquire(const edm::Event& iEvent, // Mimick the need to transfer some of the GPU data back to CPU to // be used for something within this module, or to be put in the // event. - cudaCheck(cudaMemcpyAsync(hostData_.get(), devicePtr_.get() + 10, sizeof(float), cudaMemcpyDeviceToHost, ctx.stream())); + cudaCheck( + cudaMemcpyAsync(hostData_.get(), devicePtr_.get() + 10, sizeof(float), cudaMemcpyDeviceToHost, ctx.stream())); // Push a task to run addSimpleWork() after the asynchronous work // (and acquire()) has finished instead of produce() ctx.pushNextTask([iev = iEvent.id().event(), istr = iEvent.streamID(), this](CUDAScopedContextTask ctx) { @@ -94,7 +95,8 @@ void TestCUDAProducerGPUEWTask::addSimpleWork(edm::EventNumber_t eventID, edm::LogVerbatim("TestCUDAProducerGPUEWTask") << label_ << " TestCUDAProducerGPUEWTask::addSimpleWork begin event " << eventID << " stream " << streamID << " 10th element " << *hostData_ << " not satisfied, queueing more work"; - cudaCheck(cudaMemcpyAsync(hostData_.get(), devicePtr_.get() + 10, sizeof(float), cudaMemcpyDeviceToHost, ctx.stream())); + cudaCheck( + cudaMemcpyAsync(hostData_.get(), devicePtr_.get() + 10, sizeof(float), cudaMemcpyDeviceToHost, ctx.stream())); ctx.pushNextTask([eventID, streamID, this](CUDAScopedContextTask ctx) { addSimpleWork(eventID, streamID, ctx); }); gpuAlgo_.runSimpleAlgo(devicePtr_.get(), ctx.stream()); diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUKernel.cu b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUKernel.cu index 1facd8ef44b1e..aaa6b9148c74c 100644 --- a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUKernel.cu +++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUKernel.cu @@ -69,7 +69,7 @@ cudautils::device::unique_ptr TestCUDAProducerGPUKernel::runAlgo(const // First make the sanity check if (d_input != nullptr) { auto h_check = std::make_unique(NUM_VALUES); - cudaCheck(cudaMemcpy(h_check.get(), d_input, NUM_VALUES* sizeof(float), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(h_check.get(), d_input, NUM_VALUES * sizeof(float), cudaMemcpyDeviceToHost)); for (int i = 0; i < NUM_VALUES; ++i) { if (h_check[i] != i) { throw cms::Exception("Assert") << "Sanity check on element " << i << " failed, expected " << i << " got " @@ -88,10 +88,10 @@ cudautils::device::unique_ptr TestCUDAProducerGPUKernel::runAlgo(const auto d_a = cudautils::make_device_unique(NUM_VALUES, stream); auto d_b = cudautils::make_device_unique(NUM_VALUES, stream); - + cudaCheck(cudaMemcpyAsync(d_a.get(), h_a.get(), NUM_VALUES * sizeof(float), cudaMemcpyHostToDevice, stream)); cudaCheck(cudaMemcpyAsync(d_b.get(), h_b.get(), NUM_VALUES * sizeof(float), cudaMemcpyHostToDevice, stream)); - + int threadsPerBlock{32}; int blocksPerGrid = (NUM_VALUES + threadsPerBlock - 1) / threadsPerBlock; diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUtoCPU.cc b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUtoCPU.cc index 6a0de45e12d95..168ac1daa14b9 100644 --- a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUtoCPU.cc +++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUtoCPU.cc @@ -59,7 +59,11 @@ void TestCUDAProducerGPUtoCPU::acquire(const edm::Event& iEvent, buffer_ = cudautils::make_host_unique(TestCUDAProducerGPUKernel::NUM_VALUES, ctx.stream()); // Enqueue async copy, continue in produce once finished - cudaCheck(cudaMemcpyAsync(buffer_.get(), device.get(), TestCUDAProducerGPUKernel::NUM_VALUES * sizeof(float), cudaMemcpyDeviceToHost, ctx.stream())); + cudaCheck(cudaMemcpyAsync(buffer_.get(), + device.get(), + TestCUDAProducerGPUKernel::NUM_VALUES * sizeof(float), + cudaMemcpyDeviceToHost, + ctx.stream())); edm::LogVerbatim("TestCUDAProducerGPUtoCPU") << label_ << " TestCUDAProducerGPUtoCPU::acquire end event " << iEvent.id().event() << " stream " << iEvent.streamID(); diff --git a/HeterogeneousCore/CUDATest/test/test_TestCUDAProducerGPUFirst.cc b/HeterogeneousCore/CUDATest/test/test_TestCUDAProducerGPUFirst.cc index b3f3c3bad8a9b..2b137d3483c5a 100644 --- a/HeterogeneousCore/CUDATest/test/test_TestCUDAProducerGPUFirst.cc +++ b/HeterogeneousCore/CUDATest/test/test_TestCUDAProducerGPUFirst.cc @@ -74,8 +74,8 @@ process.moduleToTest(process.toTest) REQUIRE(data != nullptr); float firstElements[10]; - cudaCheck(cudaMemcpyAsync(firstElements, data, sizeof(float) * 10, cudaMemcpyDeviceToHost, prod->stream())); - + cudaCheck(cudaMemcpyAsync(firstElements, data, sizeof(float) * 10, cudaMemcpyDeviceToHost, prod->stream())); + std::cout << "Synchronizing with CUDA stream" << std::endl; auto stream = prod->stream(); cudaCheck(cudaStreamSynchronize(stream)); diff --git a/HeterogeneousCore/CUDAUtilities/interface/copyAsync.h b/HeterogeneousCore/CUDAUtilities/interface/copyAsync.h index 9d93ca6260334..5726736166fdc 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/copyAsync.h +++ b/HeterogeneousCore/CUDAUtilities/interface/copyAsync.h @@ -35,7 +35,7 @@ namespace cudautils { const cudautils::host::unique_ptr& src, size_t nelements, cudaStream_t stream) { - cudaCheck(cudaMemcpyAsync(dst.get(), src.get(), nelements * sizeof(T), cudaMemcpyHostToDevice, stream)); + cudaCheck(cudaMemcpyAsync(dst.get(), src.get(), nelements * sizeof(T), cudaMemcpyHostToDevice, stream)); } template @@ -43,7 +43,7 @@ namespace cudautils { const cudautils::device::unique_ptr& src, size_t nelements, cudaStream_t stream) { - cudaCheck(cudaMemcpyAsync(dst.get(), src.get(), nelements * sizeof(T), cudaMemcpyHostToDevice, stream)); + cudaCheck(cudaMemcpyAsync(dst.get(), src.get(), nelements * sizeof(T), cudaMemcpyHostToDevice, stream)); } } // namespace cudautils diff --git a/HeterogeneousCore/CUDAUtilities/interface/prefixScan.h b/HeterogeneousCore/CUDAUtilities/interface/prefixScan.h index 9e591f2be4d69..8b784bdd61bfe 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/prefixScan.h +++ b/HeterogeneousCore/CUDAUtilities/interface/prefixScan.h @@ -151,7 +151,7 @@ __global__ void multiBlockPrefixScan(T const* __restrict__ ci, T* __restrict__ c // let's get the partial sums from each block __shared__ T psum[1024]; - for (int i = threadIdx.x, ni = gridDim.x; i(current_device, N); - + cudaCheck(cudaMemcpy(v_d.get(), v, N * sizeof(T), cudaMemcpyHostToDevice)); constexpr uint32_t nParts = 10; @@ -66,7 +66,7 @@ void go() { offsets[10] = 3297 + offsets[9]; } - cudaCheck(cudaMemcpy(off_d.get(), offsets, 4 * (nParts + 1), cudaMemcpyHostToDevice)); + cudaCheck(cudaMemcpy(off_d.get(), offsets, 4 * (nParts + 1), cudaMemcpyHostToDevice)); for (long long j = 0; j < N; j++) v[j] = rgen(eng); @@ -76,7 +76,7 @@ void go() { v[j] = sizeof(T) == 1 ? 22 : 3456; } - cudaCheck(cudaMemcpy(v_d.get(), v, N * sizeof(T), cudaMemcpyHostToDevice)); + cudaCheck(cudaMemcpy(v_d.get(), v, N * sizeof(T), cudaMemcpyHostToDevice)); cudautils::fillManyFromVector(h_d.get(), ws_d.get(), nParts, v_d.get(), off_d.get(), offsets[10], 256, 0); cudaCheck(cudaMemcpy(&h, h_d.get(), sizeof(Hist), cudaMemcpyDeviceToHost)); diff --git a/HeterogeneousCore/CUDAUtilities/test/OneHistoContainer_t.cu b/HeterogeneousCore/CUDAUtilities/test/OneHistoContainer_t.cu index 0bb4e4a353b86..03a969102ee1b 100644 --- a/HeterogeneousCore/CUDAUtilities/test/OneHistoContainer_t.cu +++ b/HeterogeneousCore/CUDAUtilities/test/OneHistoContainer_t.cu @@ -132,7 +132,7 @@ void go() { assert(v_d.get()); assert(v); - cudaCheck(cudaMemcpy(v_d.get(), v , N * sizeof(T), cudaMemcpyHostToDevice)); + cudaCheck(cudaMemcpy(v_d.get(), v, N * sizeof(T), cudaMemcpyHostToDevice)); assert(v_d.get()); cudautils::launch(mykernel, {1, 256}, v_d.get(), N); } diff --git a/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.h b/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.h index 4533c2c2882b0..8782d6db07e3a 100644 --- a/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.h +++ b/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.h @@ -170,8 +170,8 @@ int main() { auto a_d = cuda::memory::device::make_unique(current_device, 1); auto sa_d = cuda::memory::device::make_unique(current_device, 1); auto ws_d = cuda::memory::device::make_unique(current_device, Assoc::wsSize()); - - cudaCheck(cudaMemcpy(v_d.get(), tr.data(), N * sizeof(std::array), cudaMemcpyHostToDevice)); + + cudaCheck(cudaMemcpy(v_d.get(), tr.data(), N * sizeof(std::array), cudaMemcpyHostToDevice)); #else auto a_d = std::make_unique(); auto sa_d = std::make_unique(); @@ -199,7 +199,7 @@ int main() { Assoc la; #ifdef __CUDACC__ - cudaCheck(cudaMemcpy(&la, a_d.get(), sizeof(Assoc), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(&la, a_d.get(), sizeof(Assoc), cudaMemcpyDeviceToHost)); #else memcpy(&la, a_d.get(), sizeof(Assoc)); // not required, easier #endif diff --git a/HeterogeneousCore/CUDAUtilities/test/copyAsync_t.cpp b/HeterogeneousCore/CUDAUtilities/test/copyAsync_t.cpp index 0168271baa4fa..dbd1e6a40dcb8 100644 --- a/HeterogeneousCore/CUDAUtilities/test/copyAsync_t.cpp +++ b/HeterogeneousCore/CUDAUtilities/test/copyAsync_t.cpp @@ -39,7 +39,7 @@ TEST_CASE("copyAsync", "[cudaMemTools]") { auto host = cudautils::make_host_unique(N, stream); SECTION("Copy all") { - cudaCheck(cudaMemcpyAsync(host.get(), device.get(), N * sizeof(int), cudaMemcpyDeviceToHost, stream)); + cudaCheck(cudaMemcpyAsync(host.get(), device.get(), N * sizeof(int), cudaMemcpyDeviceToHost, stream)); cudautils::copyAsync(device, host_orig, N, stream); cudaCheck(cudaStreamSynchronize(stream)); for (int i = 0; i < N; ++i) { @@ -52,7 +52,7 @@ TEST_CASE("copyAsync", "[cudaMemTools]") { } SECTION("Copy some") { - cudaCheck(cudaMemcpyAsync(host.get(), device.get(), 42 * sizeof(int), cudaMemcpyDeviceToHost, stream)); + cudaCheck(cudaMemcpyAsync(host.get(), device.get(), 42 * sizeof(int), cudaMemcpyDeviceToHost, stream)); cudautils::copyAsync(device, host_orig, 42, stream); cudaCheck(cudaStreamSynchronize(stream)); for (int i = 0; i < 42; ++i) { @@ -89,7 +89,7 @@ TEST_CASE("copyAsync", "[cudaMemTools]") { auto host = cudautils::make_host_unique(N, stream); SECTION("Copy all") { - cudaCheck(cudaMemcpyAsync(device.get(), host_orig.get(), N * sizeof(int), cudaMemcpyHostToDevice, stream)); + cudaCheck(cudaMemcpyAsync(device.get(), host_orig.get(), N * sizeof(int), cudaMemcpyHostToDevice, stream)); cudautils::copyAsync(host, device, N, stream); cudaCheck(cudaStreamSynchronize(stream)); for (int i = 0; i < N; ++i) { diff --git a/HeterogeneousCore/CUDAUtilities/test/radixSort_t.cu b/HeterogeneousCore/CUDAUtilities/test/radixSort_t.cu index 4573ae106238c..bc042cc012185 100644 --- a/HeterogeneousCore/CUDAUtilities/test/radixSort_t.cu +++ b/HeterogeneousCore/CUDAUtilities/test/radixSort_t.cu @@ -102,9 +102,9 @@ void go(bool useShared) { auto ind_d = cuda::memory::device::make_unique(current_device, N); auto ws_d = cuda::memory::device::make_unique(current_device, N); auto off_d = cuda::memory::device::make_unique(current_device, blocks + 1); - - cudaCheck(cudaMemcpy(v_d.get(), v , N * sizeof(T), cudaMemcpyHostToDevice)); - cudaCheck(cudaMemcpy(off_d.get(), offsets, 4 * (blocks+1), cudaMemcpyHostToDevice)); + + cudaCheck(cudaMemcpy(v_d.get(), v, N * sizeof(T), cudaMemcpyHostToDevice)); + cudaCheck(cudaMemcpy(off_d.get(), offsets, 4 * (blocks + 1), cudaMemcpyHostToDevice)); if (i < 2) std::cout << "lauch for " << offsets[blocks] << std::endl; @@ -117,12 +117,13 @@ void go(bool useShared) { cudautils::launch( radixSortMultiWrapper, {blocks, ntXBl, MaxSize * 2}, v_d.get(), ind_d.get(), off_d.get(), nullptr); else - cudautils::launch(radixSortMultiWrapper2, {blocks, ntXBl}, v_d.get(), ind_d.get(), off_d.get(), ws_d.get()); + cudautils::launch( + radixSortMultiWrapper2, {blocks, ntXBl}, v_d.get(), ind_d.get(), off_d.get(), ws_d.get()); if (i == 0) std::cout << "done for " << offsets[blocks] << std::endl; - cudaCheck(cudaMemcpy(ind, ind_d.get(), 2 * N, cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(ind, ind_d.get(), 2 * N, cudaMemcpyDeviceToHost)); delta += (std::chrono::high_resolution_clock::now() - start); diff --git a/HeterogeneousCore/Producer/test/TestHeterogeneousEDProducerGPUHelpers.cu b/HeterogeneousCore/Producer/test/TestHeterogeneousEDProducerGPUHelpers.cu index fc6e0684660b7..14e9245e19e2e 100644 --- a/HeterogeneousCore/Producer/test/TestHeterogeneousEDProducerGPUHelpers.cu +++ b/HeterogeneousCore/Producer/test/TestHeterogeneousEDProducerGPUHelpers.cu @@ -77,8 +77,8 @@ int TestAcceleratorServiceProducerGPUHelpers_simple_kernel(int input) { auto d_b = cuda::memory::device::make_unique(current_device, NUM_VALUES); auto d_c = cuda::memory::device::make_unique(current_device, NUM_VALUES); - cudaCheck(cudaMemcpyAsync(d_a.get(), h_a.get(), NUM_VALUES * sizeof(int), cudaMemcpyHostToDevice, stream.id())); - cudaCheck(cudaMemcpyAsync(d_b.get(), h_b.get(), NUM_VALUES * sizeof(int), cudaMemcpyHostToDevice, stream.id())); + cudaCheck(cudaMemcpyAsync(d_a.get(), h_a.get(), NUM_VALUES * sizeof(int), cudaMemcpyHostToDevice, stream.id())); + cudaCheck(cudaMemcpyAsync(d_b.get(), h_b.get(), NUM_VALUES * sizeof(int), cudaMemcpyHostToDevice, stream.id())); int threadsPerBlock{256}; int blocksPerGrid = (NUM_VALUES + threadsPerBlock - 1) / threadsPerBlock; @@ -146,8 +146,8 @@ TestHeterogeneousEDProducerGPUTask::ResultType TestHeterogeneousEDProducerGPUTas } // Create stream - cudaCheck(cudaMemcpyAsync(d_a.get(), h_a.get(), NUM_VALUES * sizeof(float), cudaMemcpyHostToDevice, stream.id())); - cudaCheck(cudaMemcpyAsync(d_b.get(), h_b.get(), NUM_VALUES * sizeof(float), cudaMemcpyHostToDevice, stream.id())); + cudaCheck(cudaMemcpyAsync(d_a.get(), h_a.get(), NUM_VALUES * sizeof(float), cudaMemcpyHostToDevice, stream.id())); + cudaCheck(cudaMemcpyAsync(d_b.get(), h_b.get(), NUM_VALUES * sizeof(float), cudaMemcpyHostToDevice, stream.id())); int threadsPerBlock{32}; int blocksPerGrid = (NUM_VALUES + threadsPerBlock - 1) / threadsPerBlock; diff --git a/RecoLocalTracker/SiPixelClusterizer/src/SiPixelFedCablingMapGPUWrapper.cc b/RecoLocalTracker/SiPixelClusterizer/src/SiPixelFedCablingMapGPUWrapper.cc index 451fe2fb6d75b..d4b8e40dea76b 100644 --- a/RecoLocalTracker/SiPixelClusterizer/src/SiPixelFedCablingMapGPUWrapper.cc +++ b/RecoLocalTracker/SiPixelClusterizer/src/SiPixelFedCablingMapGPUWrapper.cc @@ -155,9 +155,12 @@ cudautils::device::unique_ptr SiPixelFedCablingMapGPUWrapper::g } } } - - cudaCheck(cudaMemcpyAsync( - modToUnpDevice.get(), modToUnpHost.get(), pixelgpudetails::MAX_SIZE * sizeof(unsigned char), cudaMemcpyHostToDevice, cudaStream)); + + cudaCheck(cudaMemcpyAsync(modToUnpDevice.get(), + modToUnpHost.get(), + pixelgpudetails::MAX_SIZE * sizeof(unsigned char), + cudaMemcpyHostToDevice, + cudaStream)); return modToUnpDevice; } diff --git a/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering_t.h b/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering_t.h index edc29d6452e69..bb86c1392cdf9 100644 --- a/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering_t.h +++ b/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering_t.h @@ -251,8 +251,8 @@ int main(void) { #ifdef __CUDACC__ size_t size32 = n * sizeof(unsigned int); size_t size16 = n * sizeof(unsigned short); - // size_t size8 = n * sizeof(uint8_t); - + // size_t size8 = n * sizeof(uint8_t); + cudaCheck(cudaMemcpy(d_moduleStart.get(), &nModules, sizeof(uint32_t), cudaMemcpyHostToDevice)); cudaCheck(cudaMemcpy(d_id.get(), h_id.get(), size16, cudaMemcpyHostToDevice)); @@ -274,15 +274,15 @@ int main(void) { cudaCheck(cudaMemset(d_clusInModule.get(), 0, MaxNumModules * sizeof(uint32_t))); cudautils::launch(findClus, - {blocksPerGrid, threadsPerBlock}, - d_id.get(), - d_x.get(), - d_y.get(), - d_moduleStart.get(), - d_clusInModule.get(), - d_moduleId.get(), - d_clus.get(), - n); + {blocksPerGrid, threadsPerBlock}, + d_id.get(), + d_x.get(), + d_y.get(), + d_moduleStart.get(), + d_clusInModule.get(), + d_moduleId.get(), + d_clus.get(), + n); cudaDeviceSynchronize(); cudaCheck(cudaMemcpy(&nModules, d_moduleStart.get(), sizeof(uint32_t), cudaMemcpyDeviceToHost)); @@ -300,14 +300,14 @@ int main(void) { std::cout << "ERROR!!!!! wrong number of cluster found" << std::endl; cudautils::launch(clusterChargeCut, - {blocksPerGrid, threadsPerBlock}, - d_id.get(), - d_adc.get(), - d_moduleStart.get(), - d_clusInModule.get(), - d_moduleId.get(), - d_clus.get(), - n); + {blocksPerGrid, threadsPerBlock}, + d_id.get(), + d_adc.get(), + d_moduleStart.get(), + d_clusInModule.get(), + d_moduleId.get(), + d_clus.get(), + n); cudaDeviceSynchronize(); #else diff --git a/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPU.cu b/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPU.cu index 44d1a375bcabe..7b02a23c41dca 100644 --- a/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPU.cu +++ b/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPU.cu @@ -262,9 +262,9 @@ void testFit() { cudaDeviceSynchronize(); cudaCheck(cudaMemcpy(fast_fit_resultsGPUret, - fast_fit_resultsGPU, - Rfit::maxNumberOfTracks() * sizeof(Vector4d), - cudaMemcpyDeviceToHost)); + fast_fit_resultsGPU, + Rfit::maxNumberOfTracks() * sizeof(Vector4d), + cudaMemcpyDeviceToHost)); Rfit::Map4d fast_fit(fast_fit_resultsGPUret + 10, 4); std::cout << "Fitted values (FastFit, [X0, Y0, R, tan(theta)]): GPU\n" << fast_fit << std::endl; assert(isEqualFuzzy(fast_fit_results, fast_fit)); @@ -311,7 +311,8 @@ void testFit() { std::cout << "Fitted values (CircleFit):\n" << circle_fit_results.par << std::endl; - cudaCheck(cudaMemcpy(circle_fit_resultsGPUret, circle_fit_resultsGPU, sizeof(Rfit::circle_fit), cudaMemcpyDeviceToHost)); + cudaCheck( + cudaMemcpy(circle_fit_resultsGPUret, circle_fit_resultsGPU, sizeof(Rfit::circle_fit), cudaMemcpyDeviceToHost)); std::cout << "Fitted values (CircleFit) GPU:\n" << circle_fit_resultsGPUret->par << std::endl; assert(isEqualFuzzy(circle_fit_results.par, circle_fit_resultsGPUret->par)); diff --git a/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPUNoFit.cu b/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPUNoFit.cu index ff0d6aa5f68f5..e16ac3dbbcbc3 100644 --- a/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPUNoFit.cu +++ b/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPUNoFit.cu @@ -78,7 +78,8 @@ void testMultiply() { cudaCheck(cudaMalloc((void **)&multiply_resultGPU, sizeof(Eigen::Matrix))); cudaCheck(cudaMemcpy(JGPU, &J, sizeof(Eigen::Matrix), cudaMemcpyHostToDevice)); cudaCheck(cudaMemcpy(CGPU, &C, sizeof(Eigen::Matrix), cudaMemcpyHostToDevice)); - cudaCheck(cudaMemcpy(multiply_resultGPU, &multiply_result, sizeof(Eigen::Matrix), cudaMemcpyHostToDevice)); + cudaCheck(cudaMemcpy( + multiply_resultGPU, &multiply_result, sizeof(Eigen::Matrix), cudaMemcpyHostToDevice)); kernelMultiply<<<1, 1>>>(JGPU, CGPU, multiply_resultGPU); cudaDeviceSynchronize(); @@ -203,7 +204,8 @@ void testEigenvalues() { cudaDeviceSynchronize(); cudaCheck(cudaMemcpy(mgpudebug, m_gpu, sizeof(Matrix3d), cudaMemcpyDeviceToHost)); - cudaCheck(cudaMemcpy(ret1, ret_gpu, sizeof(Eigen::SelfAdjointEigenSolver::RealVectorType), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy( + ret1, ret_gpu, sizeof(Eigen::SelfAdjointEigenSolver::RealVectorType), cudaMemcpyDeviceToHost)); #if TEST_DEBUG std::cout << "GPU Generated Matrix M 3x3:\n" << (*mgpudebug) << std::endl; std::cout << "GPU The eigenvalues of M are:" << std::endl << (*ret1) << std::endl; diff --git a/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h b/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h index 59f586eae9812..14263ed7b3d18 100644 --- a/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h +++ b/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h @@ -127,10 +127,10 @@ int main() { std::cout << "v,t size " << ev.zvert.size() << ' ' << ev.ztrack.size() << std::endl; auto nt = ev.ztrack.size(); #ifdef __CUDACC__ - cudaCheck(cudaMemcpy(LOC_WS(ntrks), &nt, sizeof(uint32_t), cudaMemcpyHostToDevice)); - cudaCheck(cudaMemcpy(LOC_WS(zt), ev.ztrack.data(), sizeof(float) * ev.ztrack.size(), cudaMemcpyHostToDevice)); - cudaCheck(cudaMemcpy(LOC_WS(ezt2), ev.eztrack.data(), sizeof(float) * ev.eztrack.size(), cudaMemcpyHostToDevice)); - cudaCheck(cudaMemcpy(LOC_WS(ptt2), ev.pttrack.data(), sizeof(float) * ev.eztrack.size(), cudaMemcpyHostToDevice)); + cudaCheck(cudaMemcpy(LOC_WS(ntrks), &nt, sizeof(uint32_t), cudaMemcpyHostToDevice)); + cudaCheck(cudaMemcpy(LOC_WS(zt), ev.ztrack.data(), sizeof(float) * ev.ztrack.size(), cudaMemcpyHostToDevice)); + cudaCheck(cudaMemcpy(LOC_WS(ezt2), ev.eztrack.data(), sizeof(float) * ev.eztrack.size(), cudaMemcpyHostToDevice)); + cudaCheck(cudaMemcpy(LOC_WS(ptt2), ev.pttrack.data(), sizeof(float) * ev.eztrack.size(), cudaMemcpyHostToDevice)); #else ::memcpy(LOC_WS(ntrks), &nt, sizeof(uint32_t)); ::memcpy(LOC_WS(zt), ev.ztrack.data(), sizeof(float) * ev.ztrack.size()); @@ -261,7 +261,7 @@ int main() { cudautils::launch(sortByPt2, {1, 256}, onGPU_d.get(), ws_d.get()); cudaCheck(cudaGetLastError()); - cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost)); #else fitVertices(onGPU_d.get(), ws_d.get(), 5000.f); sortByPt2(onGPU_d.get(), ws_d.get()); @@ -279,8 +279,8 @@ int main() { cudaCheck(cudaMemcpy(wv, LOC_ONGPU(wv), nv * sizeof(float), cudaMemcpyDeviceToHost)); cudaCheck(cudaMemcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float), cudaMemcpyDeviceToHost)); cudaCheck(cudaMemcpy(ptv2, LOC_ONGPU(ptv2), nv * sizeof(float), cudaMemcpyDeviceToHost)); - cudaCheck(cudaMemcpy(nn , LOC_ONGPU(ndof), nv * sizeof(int32_t), cudaMemcpyDeviceToHost)); - cudaCheck(cudaMemcpy(ind ,LOC_ONGPU(sortInd), nv * sizeof(uint16_t), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(ind, LOC_ONGPU(sortInd), nv * sizeof(uint16_t), cudaMemcpyDeviceToHost)); #endif for (auto j = 0U; j < nv; ++j) if (nn[j] > 0) From 9a1ca2496aa21a5af9c66e9cbf80dd6fb509084f Mon Sep 17 00:00:00 2001 From: Wahid Redjeb Date: Mon, 28 Oct 2019 14:50:12 +0100 Subject: [PATCH 11/13] Reorders memory copy operations --- HeterogeneousCore/CUDAUtilities/test/copyAsync_t.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/HeterogeneousCore/CUDAUtilities/test/copyAsync_t.cpp b/HeterogeneousCore/CUDAUtilities/test/copyAsync_t.cpp index dbd1e6a40dcb8..a9c7774006fa3 100644 --- a/HeterogeneousCore/CUDAUtilities/test/copyAsync_t.cpp +++ b/HeterogeneousCore/CUDAUtilities/test/copyAsync_t.cpp @@ -20,8 +20,8 @@ TEST_CASE("copyAsync", "[cudaMemTools]") { auto device = cudautils::make_device_unique(stream); auto host = cudautils::make_host_unique(stream); - cudaCheck(cudaMemcpyAsync(host.get(), device.get(), sizeof(int), cudaMemcpyDeviceToHost, stream)); cudautils::copyAsync(device, host_orig, stream); + cudaCheck(cudaMemcpyAsync(host.get(), device.get(), sizeof(int), cudaMemcpyDeviceToHost, stream)); cudaCheck(cudaStreamSynchronize(stream)); REQUIRE(*host == 42); @@ -39,8 +39,8 @@ TEST_CASE("copyAsync", "[cudaMemTools]") { auto host = cudautils::make_host_unique(N, stream); SECTION("Copy all") { - cudaCheck(cudaMemcpyAsync(host.get(), device.get(), N * sizeof(int), cudaMemcpyDeviceToHost, stream)); cudautils::copyAsync(device, host_orig, N, stream); + cudaCheck(cudaMemcpyAsync(host.get(), device.get(), N * sizeof(int), cudaMemcpyDeviceToHost, stream)); cudaCheck(cudaStreamSynchronize(stream)); for (int i = 0; i < N; ++i) { CHECK(host[i] == i); @@ -52,8 +52,8 @@ TEST_CASE("copyAsync", "[cudaMemTools]") { } SECTION("Copy some") { - cudaCheck(cudaMemcpyAsync(host.get(), device.get(), 42 * sizeof(int), cudaMemcpyDeviceToHost, stream)); cudautils::copyAsync(device, host_orig, 42, stream); + cudaCheck(cudaMemcpyAsync(host.get(), device.get(), 42 * sizeof(int), cudaMemcpyDeviceToHost, stream)); cudaCheck(cudaStreamSynchronize(stream)); for (int i = 0; i < 42; ++i) { CHECK(host[i] == 200 + i); @@ -70,8 +70,8 @@ TEST_CASE("copyAsync", "[cudaMemTools]") { auto device = cudautils::make_device_unique(stream); auto host = cudautils::make_host_unique(stream); - cudaCheck(cudaMemcpyAsync(device.get(), host_orig.get(), sizeof(int), cudaMemcpyHostToDevice, stream)); cudautils::copyAsync(host, device, stream); + cudaCheck(cudaMemcpyAsync(device.get(), host_orig.get(), sizeof(int), cudaMemcpyHostToDevice, stream)); cudaCheck(cudaStreamSynchronize(stream)); REQUIRE(*host == 42); @@ -89,8 +89,8 @@ TEST_CASE("copyAsync", "[cudaMemTools]") { auto host = cudautils::make_host_unique(N, stream); SECTION("Copy all") { - cudaCheck(cudaMemcpyAsync(device.get(), host_orig.get(), N * sizeof(int), cudaMemcpyHostToDevice, stream)); cudautils::copyAsync(host, device, N, stream); + cudaCheck(cudaMemcpyAsync(device.get(), host_orig.get(), N * sizeof(int), cudaMemcpyHostToDevice, stream)); cudaCheck(cudaStreamSynchronize(stream)); for (int i = 0; i < N; ++i) { CHECK(host[i] == i); @@ -102,8 +102,8 @@ TEST_CASE("copyAsync", "[cudaMemTools]") { } SECTION("Copy some") { - cudaCheck(cudaMemcpyAsync(device.get(), host_orig.get(), 42 * sizeof(int), cudaMemcpyHostToDevice, stream)); cudautils::copyAsync(host, device, 42, stream); + cudaCheck(cudaMemcpyAsync(device.get(), host_orig.get(), 42 * sizeof(int), cudaMemcpyHostToDevice, stream)); cudaCheck(cudaStreamSynchronize(stream)); for (int i = 0; i < 42; ++i) { CHECK(host[i] == 200 + i); From 463b4958fd2384d76d3ff64d23e7597afd564299 Mon Sep 17 00:00:00 2001 From: Wahid Redjeb Date: Mon, 28 Oct 2019 15:03:50 +0100 Subject: [PATCH 12/13] Reoders memory copy in Device_to_Host section --- HeterogeneousCore/CUDAUtilities/test/copyAsync_t.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/HeterogeneousCore/CUDAUtilities/test/copyAsync_t.cpp b/HeterogeneousCore/CUDAUtilities/test/copyAsync_t.cpp index a9c7774006fa3..750551161a161 100644 --- a/HeterogeneousCore/CUDAUtilities/test/copyAsync_t.cpp +++ b/HeterogeneousCore/CUDAUtilities/test/copyAsync_t.cpp @@ -70,8 +70,8 @@ TEST_CASE("copyAsync", "[cudaMemTools]") { auto device = cudautils::make_device_unique(stream); auto host = cudautils::make_host_unique(stream); - cudautils::copyAsync(host, device, stream); cudaCheck(cudaMemcpyAsync(device.get(), host_orig.get(), sizeof(int), cudaMemcpyHostToDevice, stream)); + cudautils::copyAsync(host, device, stream); cudaCheck(cudaStreamSynchronize(stream)); REQUIRE(*host == 42); @@ -89,8 +89,8 @@ TEST_CASE("copyAsync", "[cudaMemTools]") { auto host = cudautils::make_host_unique(N, stream); SECTION("Copy all") { - cudautils::copyAsync(host, device, N, stream); cudaCheck(cudaMemcpyAsync(device.get(), host_orig.get(), N * sizeof(int), cudaMemcpyHostToDevice, stream)); + cudautils::copyAsync(host, device, N, stream); cudaCheck(cudaStreamSynchronize(stream)); for (int i = 0; i < N; ++i) { CHECK(host[i] == i); @@ -102,8 +102,8 @@ TEST_CASE("copyAsync", "[cudaMemTools]") { } SECTION("Copy some") { - cudautils::copyAsync(host, device, 42, stream); cudaCheck(cudaMemcpyAsync(device.get(), host_orig.get(), 42 * sizeof(int), cudaMemcpyHostToDevice, stream)); + cudautils::copyAsync(host, device, 42, stream); cudaCheck(cudaStreamSynchronize(stream)); for (int i = 0; i < 42; ++i) { CHECK(host[i] == 200 + i); From 49d83d653727c19433d45a732ce4ca7d958d06f4 Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Mon, 28 Oct 2019 18:26:42 +0100 Subject: [PATCH 13/13] Fix direction of the copies from device to host --- HeterogeneousCore/CUDAUtilities/interface/copyAsync.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/HeterogeneousCore/CUDAUtilities/interface/copyAsync.h b/HeterogeneousCore/CUDAUtilities/interface/copyAsync.h index 5726736166fdc..bfa1bdee9a03d 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/copyAsync.h +++ b/HeterogeneousCore/CUDAUtilities/interface/copyAsync.h @@ -26,7 +26,7 @@ namespace cudautils { const cudautils::device::unique_ptr& src, cudaStream_t stream) { static_assert(std::is_array::value == false, "For array types, use the other overload with the size parameter"); - cudaCheck(cudaMemcpyAsync(dst.get(), src.get(), sizeof(T), cudaMemcpyHostToDevice, stream)); + cudaCheck(cudaMemcpyAsync(dst.get(), src.get(), sizeof(T), cudaMemcpyDeviceToHost, stream)); } // Multiple elements @@ -43,7 +43,7 @@ namespace cudautils { const cudautils::device::unique_ptr& src, size_t nelements, cudaStream_t stream) { - cudaCheck(cudaMemcpyAsync(dst.get(), src.get(), nelements * sizeof(T), cudaMemcpyHostToDevice, stream)); + cudaCheck(cudaMemcpyAsync(dst.get(), src.get(), nelements * sizeof(T), cudaMemcpyDeviceToHost, stream)); } } // namespace cudautils