From 2ee863c326c9f8174bb2fd704a17fae9a572de7f Mon Sep 17 00:00:00 2001
From: Wahid Redjeb <wahid.redjeb.wr@gmail.com>
Date: Sat, 26 Oct 2019 22:48:53 +0200
Subject: [PATCH 01/13] Solve conflicts with #389

---
 CUDADataFormats/BeamSpot/src/BeamSpotCUDA.cc  |   2 +-
 .../test/gpuFrameTransformTest.cpp            |  18 +-
 DataFormats/Math/test/CholeskyInvert_t.cu     |   8 +-
 .../test/CholeskyInvert_t_BACKUP_195321.cu    | 225 ++++++++++++++++++
 .../Math/test/CholeskyInvert_t_BASE_195321.cu | 214 +++++++++++++++++
 .../test/CholeskyInvert_t_LOCAL_195321.cu     | 215 +++++++++++++++++
 .../test/CholeskyInvert_t_REMOTE_195321.cu    | 216 +++++++++++++++++
 DataFormats/Math/test/cudaAtan2Test.cu        |   4 +-
 DataFormats/Math/test/cudaMathTest.cu         |   8 +-
 .../CUDACore/test/test_CUDAScopedContext.cc   |  10 +-
 .../CUDATest/plugins/TestCUDAProducerGPUEW.cc |   3 +-
 .../plugins/TestCUDAProducerGPUEWTask.cc      |   5 +-
 .../plugins/TestCUDAProducerGPUKernel.cu      |  10 +-
 .../plugins/TestCUDAProducerGPUtoCPU.cc       |   3 +-
 .../test/test_TestCUDAProducerGPUFirst.cc     |   4 +-
 .../CUDAUtilities/interface/copyAsync.h       |   8 +-
 .../CUDAUtilities/test/HistoContainer_t.cu    |  11 +-
 .../CUDAUtilities/test/OneHistoContainer_t.cu |   2 +-
 .../CUDAUtilities/test/OneToManyAssoc_t.h     |   8 +-
 .../CUDAUtilities/test/copyAsync_t.cpp        |  15 +-
 .../CUDAUtilities/test/radixSort_t.cu         |   9 +-
 .../TestHeterogeneousEDProducerGPUHelpers.cu  |  25 +-
 .../src/SiPixelFedCablingMapGPUWrapper.cc     |   6 +-
 .../SiPixelClusterizer/test/gpuClustering_t.h |  27 +--
 .../PixelVertexFinding/test/VertexFinder_t.h  |  36 +--
 25 files changed, 981 insertions(+), 111 deletions(-)
 create mode 100644 DataFormats/Math/test/CholeskyInvert_t_BACKUP_195321.cu
 create mode 100644 DataFormats/Math/test/CholeskyInvert_t_BASE_195321.cu
 create mode 100644 DataFormats/Math/test/CholeskyInvert_t_LOCAL_195321.cu
 create mode 100644 DataFormats/Math/test/CholeskyInvert_t_REMOTE_195321.cu
diff --git a/CUDADataFormats/BeamSpot/src/BeamSpotCUDA.cc b/CUDADataFormats/BeamSpot/src/BeamSpotCUDA.cc
index cdc49aa46bfa4..da9cd4d776c92 100644
--- a/CUDADataFormats/BeamSpot/src/BeamSpotCUDA.cc
+++ b/CUDADataFormats/BeamSpot/src/BeamSpotCUDA.cc
@@ -4,5 +4,5 @@
 
 BeamSpotCUDA::BeamSpotCUDA(Data const* data_h, cudaStream_t stream) {
   data_d_ = cudautils::make_device_unique<Data>(stream);
-  cuda::memory::async::copy(data_d_.get(), data_h, sizeof(Data), stream);
+  cudaMemcpyAsync(data_d_.get(), data_h, sizeof(Data), cudaMemcpyHostToDevice, stream);
 }
diff --git a/DataFormats/GeometrySurface/test/gpuFrameTransformTest.cpp b/DataFormats/GeometrySurface/test/gpuFrameTransformTest.cpp
index 4f3c11212e2e9..75b5418eeaffb 100644
--- a/DataFormats/GeometrySurface/test/gpuFrameTransformTest.cpp
+++ b/DataFormats/GeometrySurface/test/gpuFrameTransformTest.cpp
@@ -73,7 +73,8 @@ int main(void) {
 
   // auto d_sf = cuda::memory::device::make_unique<SFrame[]>(current_device, 1);
   auto d_sf = cuda::memory::device::make_unique<char[]>(current_device, sizeof(SFrame));
-  cuda::memory::copy(d_sf.get(), &sf1, sizeof(SFrame));
+  cudaMemcpy(d_sf.get(), &sf1, sizeof(SFrame), cudaMemcpyHostToDevice);
+
 
   for (auto i = 0U; i < size; ++i) {
     xl[i] = yl[i] = 0.1f * float(i) - float(size / 2);
@@ -84,9 +85,9 @@ int main(void) {
   std::random_shuffle(xl, xl + size);
   std::random_shuffle(yl, yl + size);
 
-  cuda::memory::copy(d_xl.get(), xl, size32);
-  cuda::memory::copy(d_yl.get(), yl, size32);
-  cuda::memory::copy(d_le.get(), le, 3 * size32);
+  cudaMemcpy(d_xl.get(), xl, size32, cudaMemcpyHostToDevice);
+  cudaMemcpy(d_yl.get(), yl, size32, cudaMemcpyHostToDevice);
+  cudaMemcpy(d_le.get(), le, 3 * size32, cudaMemcpyHostToDevice);
 
   toGlobalWrapper((SFrame const *)(d_sf.get()),
                   d_xl.get(),
@@ -97,11 +98,10 @@ int main(void) {
                   d_le.get(),
                   d_ge.get(),
                   size);
-
-  cuda::memory::copy(x, d_x.get(), size32);
-  cuda::memory::copy(y, d_y.get(), size32);
-  cuda::memory::copy(z, d_z.get(), size32);
-  cuda::memory::copy(ge, d_ge.get(), 6 * size32);
+  cudaMemcpy(x, d_x.get(), size32, cudaMemcpyDeviceToHost);
+  cudaMemcpy(y, d_y.get(), size32, cudaMemcpyDeviceToHost);
+  cudaMemcpy(z, d_z.get(), size32, cudaMemcpyDeviceToHost);
+  cudaMemcpy(ge, d_ge.get(), 6 * size32, cudaMemcpyDeviceToHost);
 
   float eps = 0.;
   for (auto i = 0U; i < size; ++i) {
diff --git a/DataFormats/Math/test/CholeskyInvert_t.cu b/DataFormats/Math/test/CholeskyInvert_t.cu
index 3e2cf041bae16..65e8adca7e5ee 100644
--- a/DataFormats/Math/test/CholeskyInvert_t.cu
+++ b/DataFormats/Math/test/CholeskyInvert_t.cu
@@ -132,7 +132,7 @@ void go(bool soa) {
   std::cout << mm[SIZE / 2](1, 1) << std::endl;
 
   auto m_d = cuda::memory::device::make_unique<double[]>(current_device, DIM * DIM * stride());
-  cuda::memory::copy(m_d.get(), (double const *)(mm), stride() * sizeof(MX));
+  cudaMemcpy(m_d.get(), (double const *)(mm), stride() * sizeof(MX), cudaMemcpyHostToDevice);
 
   constexpr int NKK =
 #ifdef DOPROF
@@ -150,8 +150,9 @@ void go(bool soa) {
       cudautils::launch(invertSOA<DIM>, {blocksPerGrid, threadsPerBlock}, m_d.get(), SIZE);
     else
       cudautils::launch(invert<MX, DIM>, {blocksPerGrid, threadsPerBlock}, (MX *)(m_d.get()), SIZE);
+    
+    cudaMemcpy(&mm, m_d.get(), stride() * sizeof(MX), cudaMemcpyDeviceToHost);
 
-    cuda::memory::copy(&mm, m_d.get(), stride() * sizeof(MX));
     delta += (std::chrono::high_resolution_clock::now() - start);
 
     if (0 == kk)
@@ -162,8 +163,9 @@ void go(bool soa) {
 
 #ifndef DOPROF
       cudautils::launch(invertSeq<MX, DIM>, {blocksPerGrid, threadsPerBlock}, (MX *)(m_d.get()), SIZE);
+      cuda::launch(invertSeq<MX, DIM>, {blocksPerGrid, threadsPerBlock}, (MX *)(m_d.get()), SIZE);
+      cudaMemcpy(&mm, m_d.get(), stride() * sizeof(MX), cudaMemcpyDeviceToHost);
 
-      cuda::memory::copy(&mm, m_d.get(), stride() * sizeof(MX));
 #endif
       delta1 += (std::chrono::high_resolution_clock::now() - start);
 
diff --git a/DataFormats/Math/test/CholeskyInvert_t_BACKUP_195321.cu b/DataFormats/Math/test/CholeskyInvert_t_BACKUP_195321.cu
new file mode 100644
index 0000000000000..e5c714acea1f5
--- /dev/null
+++ b/DataFormats/Math/test/CholeskyInvert_t_BACKUP_195321.cu
@@ -0,0 +1,225 @@
+// nvcc -O3 CholeskyDecomp_t.cu -Icuda-api-wrappers/src/ --expt-relaxed-constexpr -gencode arch=compute_61,code=sm_61 --compiler-options="-Ofast -march=native"
+// add -DDOPROF to run  nvprof --metrics all
+
+#include <algorithm>
+#include <cassert>
+#include <chrono>
+#include <iomanip>
+#include <iostream>
+#include <limits>
+#include <memory>
+#include <random>
+
+#include <Eigen/Core>
+#include <Eigen/Eigenvalues>
+
+#include <cuda/api_wrappers.h>
+
+#include "DataFormats/Math/interface/choleskyInversion.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/launch.h"
+
+constexpr int stride() { return 5 * 1024; }
+template <int DIM>
+using MXN = Eigen::Matrix<double, DIM, DIM>;
+template <int DIM>
+using MapMX = Eigen::Map<MXN<DIM>, 0, Eigen::Stride<DIM * stride(), stride()>>;
+
+template <int N>
+__global__ void invertSOA(double *__restrict__ p, unsigned int n) {
+  auto i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i >= n)
+    return;
+
+  MapMX<N> m(p + i);
+  choleskyInversion::invert(m, m);
+}
+
+template <typename M, int N>
+__global__ void invert(M *mm, unsigned int n) {
+  auto i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i >= n)
+    return;
+
+  auto &m = mm[i];
+  choleskyInversion::invert(m, m);
+}
+
+template <typename M, int N>
+__global__ void invertSeq(M *mm, unsigned int n) {
+  if (threadIdx.x != 0)
+    return;
+  auto first = blockIdx.x * blockDim.x;
+  auto last = std::min(first + blockDim.x, n);
+
+  for (auto i = first; i < last; ++i) {
+    auto &m = mm[i];
+    choleskyInversion::invert(m, m);
+  }
+}
+
+// generate matrices
+template <class M>
+void genMatrix(M &m) {
+  using T = typename std::remove_reference<decltype(m(0, 0))>::type;
+  int n = M::ColsAtCompileTime;
+  std::mt19937 eng;
+  // std::mt19937 eng2;
+  std::uniform_real_distribution<T> rgen(0., 1.);
+
+  // generate first diagonal elemets
+  for (int i = 0; i < n; ++i) {
+    double maxVal = i * 10000 / (n - 1) + 1;  // max condition is 10^4
+    m(i, i) = maxVal * rgen(eng);
+  }
+  for (int i = 0; i < n; ++i) {
+    for (int j = 0; j < i; ++j) {
+      double v = 0.3 * std::sqrt(m(i, i) * m(j, j));  // this makes the matrix pos defined
+      m(i, j) = v * rgen(eng);
+      m(j, i) = m(i, j);
+    }
+  }
+}
+
+template <int N>
+void go(bool soa) {
+  constexpr unsigned int DIM = N;
+  using MX = MXN<DIM>;
+  std::cout << "testing Matrix of dimension " << DIM << " size " << sizeof(MX) << std::endl;
+
+  auto start = std::chrono::high_resolution_clock::now();
+  auto delta = start - start;
+  auto delta1 = delta;
+  auto delta2 = delta;
+
+  if (cuda::device::count() == 0) {
+    std::cerr << "No CUDA devices on this system"
+              << "\n";
+    exit(EXIT_FAILURE);
+  }
+
+  auto current_device = cuda::device::current::get();
+
+  constexpr unsigned int SIZE = 4 * 1024;
+
+  MX mm[stride()];  // just storage in case of SOA
+  double *__restrict__ p = (double *)(mm);
+
+  if (soa) {
+    for (unsigned int i = 0; i < SIZE; ++i) {
+      MapMX<N> m(p + i);
+      genMatrix(m);
+    }
+  } else {
+    for (auto &m : mm)
+      genMatrix(m);
+  }
+
+  std::cout << mm[SIZE / 2](1, 1) << std::endl;
+
+  if (soa)
+    for (unsigned int i = 0; i < SIZE; ++i) {
+      MapMX<N> m(p + i);
+      choleskyInversion::invert(m, m);
+      choleskyInversion::invert(m, m);
+    }
+  else
+    for (auto &m : mm) {
+      choleskyInversion::invert(m, m);
+      choleskyInversion::invert(m, m);
+    }
+
+  std::cout << mm[SIZE / 2](1, 1) << std::endl;
+
+  auto m_d = cuda::memory::device::make_unique<double[]>(current_device, DIM * DIM * stride());
+  cudaMemcpy(m_d.get(), (double const *)(mm), stride() * sizeof(MX), cudaMemcpyHostToDevice);
+
+  constexpr int NKK =
+#ifdef DOPROF
+      2;
+#else
+      1000;
+#endif
+  for (int kk = 0; kk < NKK; ++kk) {
+    int threadsPerBlock = 128;
+    int blocksPerGrid = SIZE / threadsPerBlock;
+
+    delta -= (std::chrono::high_resolution_clock::now() - start);
+
+    if (soa)
+      cudautils::launch(invertSOA<DIM>, {blocksPerGrid, threadsPerBlock}, m_d.get(), SIZE);
+    else
+<<<<<<< HEAD
+      cudautils::launch(invert<MX, DIM>, {blocksPerGrid, threadsPerBlock}, (MX *)(m_d.get()), SIZE);
+=======
+      cuda::launch(invert<MX, DIM>, {blocksPerGrid, threadsPerBlock}, (MX *)(m_d.get()), SIZE);
+    
+    cudaMemcpy(&mm, m_d.get(), stride() * sizeof(MX), cudaMemcpyDeviceToHost);
+>>>>>>> Replace cuda::memory[::async]::copy() with cudaMemcpy[Async](), cuda::memory[::async]::zero() and
+
+    delta += (std::chrono::high_resolution_clock::now() - start);
+
+    if (0 == kk)
+      std::cout << mm[SIZE / 2](1, 1) << std::endl;
+
+    if (!soa) {
+      delta1 -= (std::chrono::high_resolution_clock::now() - start);
+
+#ifndef DOPROF
+<<<<<<< HEAD
+      cudautils::launch(invertSeq<MX, DIM>, {blocksPerGrid, threadsPerBlock}, (MX *)(m_d.get()), SIZE);
+=======
+      cuda::launch(invertSeq<MX, DIM>, {blocksPerGrid, threadsPerBlock}, (MX *)(m_d.get()), SIZE);
+      
+      cudaMemcpy(&mm, m_d.get(), stride() * sizeof(MX), cudaMemcpyDeviceToHost);
+>>>>>>> Replace cuda::memory[::async]::copy() with cudaMemcpy[Async](), cuda::memory[::async]::zero() and
+
+#endif
+      delta1 += (std::chrono::high_resolution_clock::now() - start);
+
+      if (0 == kk)
+        std::cout << mm[SIZE / 2](1, 1) << std::endl;
+    }
+
+    delta2 -= (std::chrono::high_resolution_clock::now() - start);
+    if (soa)
+#pragma GCC ivdep
+      for (unsigned int i = 0; i < SIZE; ++i) {
+        MapMX<N> m(p + i);
+        choleskyInversion::invert(m, m);
+      }
+    else
+#pragma GCC ivdep
+      for (auto &m : mm) {
+        choleskyInversion::invert(m, m);
+      }
+
+    delta2 += (std::chrono::high_resolution_clock::now() - start);
+  }
+
+  std::cout << mm[SIZE / 2](1, 1) << std::endl;
+
+  double DNNK = NKK;
+  std::cout << "cuda/cudaSeq/x86 computation took "
+            << std::chrono::duration_cast<std::chrono::milliseconds>(delta).count() / DNNK << ' '
+            << std::chrono::duration_cast<std::chrono::milliseconds>(delta1).count() / DNNK << ' '
+            << std::chrono::duration_cast<std::chrono::milliseconds>(delta2).count() / DNNK << ' ' << " ms"
+            << std::endl;
+}
+
+int main() {
+  exitSansCUDADevices();
+
+  go<2>(false);
+  go<4>(false);
+  go<5>(false);
+  go<6>(false);
+
+  go<2>(true);
+  go<4>(true);
+  go<5>(true);
+  go<6>(true);
+
+  // go<10>();
+  return 0;
+}
diff --git a/DataFormats/Math/test/CholeskyInvert_t_BASE_195321.cu b/DataFormats/Math/test/CholeskyInvert_t_BASE_195321.cu
new file mode 100644
index 0000000000000..6c1915126073f
--- /dev/null
+++ b/DataFormats/Math/test/CholeskyInvert_t_BASE_195321.cu
@@ -0,0 +1,214 @@
+// nvcc -O3 CholeskyDecomp_t.cu -Icuda-api-wrappers/src/ --expt-relaxed-constexpr -gencode arch=compute_61,code=sm_61 --compiler-options="-Ofast -march=native"
+// add -DDOPROF to run  nvprof --metrics all
+
+#include <algorithm>
+#include <cassert>
+#include <chrono>
+#include <iomanip>
+#include <iostream>
+#include <limits>
+#include <memory>
+#include <random>
+
+#include <Eigen/Core>
+#include <Eigen/Eigenvalues>
+
+#include <cuda/api_wrappers.h>
+
+#include "DataFormats/Math/interface/choleskyInversion.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h"
+
+constexpr int stride() { return 5 * 1024; }
+template <int DIM>
+using MXN = Eigen::Matrix<double, DIM, DIM>;
+template <int DIM>
+using MapMX = Eigen::Map<MXN<DIM>, 0, Eigen::Stride<DIM * stride(), stride()>>;
+
+template <int N>
+__global__ void invertSOA(double *__restrict__ p, unsigned int n) {
+  auto i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i >= n)
+    return;
+
+  MapMX<N> m(p + i);
+  choleskyInversion::invert(m, m);
+}
+
+template <typename M, int N>
+__global__ void invert(M *mm, unsigned int n) {
+  auto i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i >= n)
+    return;
+
+  auto &m = mm[i];
+  choleskyInversion::invert(m, m);
+}
+
+template <typename M, int N>
+__global__ void invertSeq(M *mm, unsigned int n) {
+  if (threadIdx.x != 0)
+    return;
+  auto first = blockIdx.x * blockDim.x;
+  auto last = std::min(first + blockDim.x, n);
+
+  for (auto i = first; i < last; ++i) {
+    auto &m = mm[i];
+    choleskyInversion::invert(m, m);
+  }
+}
+
+// generate matrices
+template <class M>
+void genMatrix(M &m) {
+  using T = typename std::remove_reference<decltype(m(0, 0))>::type;
+  int n = M::ColsAtCompileTime;
+  std::mt19937 eng;
+  // std::mt19937 eng2;
+  std::uniform_real_distribution<T> rgen(0., 1.);
+
+  // generate first diagonal elemets
+  for (int i = 0; i < n; ++i) {
+    double maxVal = i * 10000 / (n - 1) + 1;  // max condition is 10^4
+    m(i, i) = maxVal * rgen(eng);
+  }
+  for (int i = 0; i < n; ++i) {
+    for (int j = 0; j < i; ++j) {
+      double v = 0.3 * std::sqrt(m(i, i) * m(j, j));  // this makes the matrix pos defined
+      m(i, j) = v * rgen(eng);
+      m(j, i) = m(i, j);
+    }
+  }
+}
+
+template <int N>
+void go(bool soa) {
+  constexpr unsigned int DIM = N;
+  using MX = MXN<DIM>;
+  std::cout << "testing Matrix of dimension " << DIM << " size " << sizeof(MX) << std::endl;
+
+  auto start = std::chrono::high_resolution_clock::now();
+  auto delta = start - start;
+  auto delta1 = delta;
+  auto delta2 = delta;
+
+  if (cuda::device::count() == 0) {
+    std::cerr << "No CUDA devices on this system"
+              << "\n";
+    exit(EXIT_FAILURE);
+  }
+
+  auto current_device = cuda::device::current::get();
+
+  constexpr unsigned int SIZE = 4 * 1024;
+
+  MX mm[stride()];  // just storage in case of SOA
+  double *__restrict__ p = (double *)(mm);
+
+  if (soa) {
+    for (unsigned int i = 0; i < SIZE; ++i) {
+      MapMX<N> m(p + i);
+      genMatrix(m);
+    }
+  } else {
+    for (auto &m : mm)
+      genMatrix(m);
+  }
+
+  std::cout << mm[SIZE / 2](1, 1) << std::endl;
+
+  if (soa)
+    for (unsigned int i = 0; i < SIZE; ++i) {
+      MapMX<N> m(p + i);
+      choleskyInversion::invert(m, m);
+      choleskyInversion::invert(m, m);
+    }
+  else
+    for (auto &m : mm) {
+      choleskyInversion::invert(m, m);
+      choleskyInversion::invert(m, m);
+    }
+
+  std::cout << mm[SIZE / 2](1, 1) << std::endl;
+
+  auto m_d = cuda::memory::device::make_unique<double[]>(current_device, DIM * DIM * stride());
+  cuda::memory::copy(m_d.get(), (double const *)(mm), stride() * sizeof(MX));
+
+  constexpr int NKK =
+#ifdef DOPROF
+      2;
+#else
+      1000;
+#endif
+  for (int kk = 0; kk < NKK; ++kk) {
+    int threadsPerBlock = 128;
+    int blocksPerGrid = SIZE / threadsPerBlock;
+
+    delta -= (std::chrono::high_resolution_clock::now() - start);
+
+    if (soa)
+      cuda::launch(invertSOA<DIM>, {blocksPerGrid, threadsPerBlock}, m_d.get(), SIZE);
+    else
+      cuda::launch(invert<MX, DIM>, {blocksPerGrid, threadsPerBlock}, (MX *)(m_d.get()), SIZE);
+
+    cuda::memory::copy(&mm, m_d.get(), stride() * sizeof(MX));
+    delta += (std::chrono::high_resolution_clock::now() - start);
+
+    if (0 == kk)
+      std::cout << mm[SIZE / 2](1, 1) << std::endl;
+
+    if (!soa) {
+      delta1 -= (std::chrono::high_resolution_clock::now() - start);
+
+#ifndef DOPROF
+      cuda::launch(invertSeq<MX, DIM>, {blocksPerGrid, threadsPerBlock}, (MX *)(m_d.get()), SIZE);
+
+      cuda::memory::copy(&mm, m_d.get(), stride() * sizeof(MX));
+#endif
+      delta1 += (std::chrono::high_resolution_clock::now() - start);
+
+      if (0 == kk)
+        std::cout << mm[SIZE / 2](1, 1) << std::endl;
+    }
+
+    delta2 -= (std::chrono::high_resolution_clock::now() - start);
+    if (soa)
+#pragma GCC ivdep
+      for (unsigned int i = 0; i < SIZE; ++i) {
+        MapMX<N> m(p + i);
+        choleskyInversion::invert(m, m);
+      }
+    else
+#pragma GCC ivdep
+      for (auto &m : mm) {
+        choleskyInversion::invert(m, m);
+      }
+
+    delta2 += (std::chrono::high_resolution_clock::now() - start);
+  }
+
+  std::cout << mm[SIZE / 2](1, 1) << std::endl;
+
+  double DNNK = NKK;
+  std::cout << "cuda/cudaSeq/x86 computation took "
+            << std::chrono::duration_cast<std::chrono::milliseconds>(delta).count() / DNNK << ' '
+            << std::chrono::duration_cast<std::chrono::milliseconds>(delta1).count() / DNNK << ' '
+            << std::chrono::duration_cast<std::chrono::milliseconds>(delta2).count() / DNNK << ' ' << " ms"
+            << std::endl;
+}
+
+int main() {
+  exitSansCUDADevices();
+
+  go<2>(false);
+  go<4>(false);
+  go<5>(false);
+  go<6>(false);
+
+  go<2>(true);
+  go<4>(true);
+  go<5>(true);
+  go<6>(true);
+
+  // go<10>();
+  return 0;
+}
diff --git a/DataFormats/Math/test/CholeskyInvert_t_LOCAL_195321.cu b/DataFormats/Math/test/CholeskyInvert_t_LOCAL_195321.cu
new file mode 100644
index 0000000000000..3e2cf041bae16
--- /dev/null
+++ b/DataFormats/Math/test/CholeskyInvert_t_LOCAL_195321.cu
@@ -0,0 +1,215 @@
+// nvcc -O3 CholeskyDecomp_t.cu -Icuda-api-wrappers/src/ --expt-relaxed-constexpr -gencode arch=compute_61,code=sm_61 --compiler-options="-Ofast -march=native"
+// add -DDOPROF to run  nvprof --metrics all
+
+#include <algorithm>
+#include <cassert>
+#include <chrono>
+#include <iomanip>
+#include <iostream>
+#include <limits>
+#include <memory>
+#include <random>
+
+#include <Eigen/Core>
+#include <Eigen/Eigenvalues>
+
+#include <cuda/api_wrappers.h>
+
+#include "DataFormats/Math/interface/choleskyInversion.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/launch.h"
+
+constexpr int stride() { return 5 * 1024; }
+template <int DIM>
+using MXN = Eigen::Matrix<double, DIM, DIM>;
+template <int DIM>
+using MapMX = Eigen::Map<MXN<DIM>, 0, Eigen::Stride<DIM * stride(), stride()>>;
+
+template <int N>
+__global__ void invertSOA(double *__restrict__ p, unsigned int n) {
+  auto i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i >= n)
+    return;
+
+  MapMX<N> m(p + i);
+  choleskyInversion::invert(m, m);
+}
+
+template <typename M, int N>
+__global__ void invert(M *mm, unsigned int n) {
+  auto i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i >= n)
+    return;
+
+  auto &m = mm[i];
+  choleskyInversion::invert(m, m);
+}
+
+template <typename M, int N>
+__global__ void invertSeq(M *mm, unsigned int n) {
+  if (threadIdx.x != 0)
+    return;
+  auto first = blockIdx.x * blockDim.x;
+  auto last = std::min(first + blockDim.x, n);
+
+  for (auto i = first; i < last; ++i) {
+    auto &m = mm[i];
+    choleskyInversion::invert(m, m);
+  }
+}
+
+// generate matrices
+template <class M>
+void genMatrix(M &m) {
+  using T = typename std::remove_reference<decltype(m(0, 0))>::type;
+  int n = M::ColsAtCompileTime;
+  std::mt19937 eng;
+  // std::mt19937 eng2;
+  std::uniform_real_distribution<T> rgen(0., 1.);
+
+  // generate first diagonal elemets
+  for (int i = 0; i < n; ++i) {
+    double maxVal = i * 10000 / (n - 1) + 1;  // max condition is 10^4
+    m(i, i) = maxVal * rgen(eng);
+  }
+  for (int i = 0; i < n; ++i) {
+    for (int j = 0; j < i; ++j) {
+      double v = 0.3 * std::sqrt(m(i, i) * m(j, j));  // this makes the matrix pos defined
+      m(i, j) = v * rgen(eng);
+      m(j, i) = m(i, j);
+    }
+  }
+}
+
+template <int N>
+void go(bool soa) {
+  constexpr unsigned int DIM = N;
+  using MX = MXN<DIM>;
+  std::cout << "testing Matrix of dimension " << DIM << " size " << sizeof(MX) << std::endl;
+
+  auto start = std::chrono::high_resolution_clock::now();
+  auto delta = start - start;
+  auto delta1 = delta;
+  auto delta2 = delta;
+
+  if (cuda::device::count() == 0) {
+    std::cerr << "No CUDA devices on this system"
+              << "\n";
+    exit(EXIT_FAILURE);
+  }
+
+  auto current_device = cuda::device::current::get();
+
+  constexpr unsigned int SIZE = 4 * 1024;
+
+  MX mm[stride()];  // just storage in case of SOA
+  double *__restrict__ p = (double *)(mm);
+
+  if (soa) {
+    for (unsigned int i = 0; i < SIZE; ++i) {
+      MapMX<N> m(p + i);
+      genMatrix(m);
+    }
+  } else {
+    for (auto &m : mm)
+      genMatrix(m);
+  }
+
+  std::cout << mm[SIZE / 2](1, 1) << std::endl;
+
+  if (soa)
+    for (unsigned int i = 0; i < SIZE; ++i) {
+      MapMX<N> m(p + i);
+      choleskyInversion::invert(m, m);
+      choleskyInversion::invert(m, m);
+    }
+  else
+    for (auto &m : mm) {
+      choleskyInversion::invert(m, m);
+      choleskyInversion::invert(m, m);
+    }
+
+  std::cout << mm[SIZE / 2](1, 1) << std::endl;
+
+  auto m_d = cuda::memory::device::make_unique<double[]>(current_device, DIM * DIM * stride());
+  cuda::memory::copy(m_d.get(), (double const *)(mm), stride() * sizeof(MX));
+
+  constexpr int NKK =
+#ifdef DOPROF
+      2;
+#else
+      1000;
+#endif
+  for (int kk = 0; kk < NKK; ++kk) {
+    int threadsPerBlock = 128;
+    int blocksPerGrid = SIZE / threadsPerBlock;
+
+    delta -= (std::chrono::high_resolution_clock::now() - start);
+
+    if (soa)
+      cudautils::launch(invertSOA<DIM>, {blocksPerGrid, threadsPerBlock}, m_d.get(), SIZE);
+    else
+      cudautils::launch(invert<MX, DIM>, {blocksPerGrid, threadsPerBlock}, (MX *)(m_d.get()), SIZE);
+
+    cuda::memory::copy(&mm, m_d.get(), stride() * sizeof(MX));
+    delta += (std::chrono::high_resolution_clock::now() - start);
+
+    if (0 == kk)
+      std::cout << mm[SIZE / 2](1, 1) << std::endl;
+
+    if (!soa) {
+      delta1 -= (std::chrono::high_resolution_clock::now() - start);
+
+#ifndef DOPROF
+      cudautils::launch(invertSeq<MX, DIM>, {blocksPerGrid, threadsPerBlock}, (MX *)(m_d.get()), SIZE);
+
+      cuda::memory::copy(&mm, m_d.get(), stride() * sizeof(MX));
+#endif
+      delta1 += (std::chrono::high_resolution_clock::now() - start);
+
+      if (0 == kk)
+        std::cout << mm[SIZE / 2](1, 1) << std::endl;
+    }
+
+    delta2 -= (std::chrono::high_resolution_clock::now() - start);
+    if (soa)
+#pragma GCC ivdep
+      for (unsigned int i = 0; i < SIZE; ++i) {
+        MapMX<N> m(p + i);
+        choleskyInversion::invert(m, m);
+      }
+    else
+#pragma GCC ivdep
+      for (auto &m : mm) {
+        choleskyInversion::invert(m, m);
+      }
+
+    delta2 += (std::chrono::high_resolution_clock::now() - start);
+  }
+
+  std::cout << mm[SIZE / 2](1, 1) << std::endl;
+
+  double DNNK = NKK;
+  std::cout << "cuda/cudaSeq/x86 computation took "
+            << std::chrono::duration_cast<std::chrono::milliseconds>(delta).count() / DNNK << ' '
+            << std::chrono::duration_cast<std::chrono::milliseconds>(delta1).count() / DNNK << ' '
+            << std::chrono::duration_cast<std::chrono::milliseconds>(delta2).count() / DNNK << ' ' << " ms"
+            << std::endl;
+}
+
+int main() {
+  exitSansCUDADevices();
+
+  go<2>(false);
+  go<4>(false);
+  go<5>(false);
+  go<6>(false);
+
+  go<2>(true);
+  go<4>(true);
+  go<5>(true);
+  go<6>(true);
+
+  // go<10>();
+  return 0;
+}
diff --git a/DataFormats/Math/test/CholeskyInvert_t_REMOTE_195321.cu b/DataFormats/Math/test/CholeskyInvert_t_REMOTE_195321.cu
new file mode 100644
index 0000000000000..0751af78a7289
--- /dev/null
+++ b/DataFormats/Math/test/CholeskyInvert_t_REMOTE_195321.cu
@@ -0,0 +1,216 @@
+// nvcc -O3 CholeskyDecomp_t.cu -Icuda-api-wrappers/src/ --expt-relaxed-constexpr -gencode arch=compute_61,code=sm_61 --compiler-options="-Ofast -march=native"
+// add -DDOPROF to run  nvprof --metrics all
+
+#include <algorithm>
+#include <cassert>
+#include <chrono>
+#include <iomanip>
+#include <iostream>
+#include <limits>
+#include <memory>
+#include <random>
+
+#include <Eigen/Core>
+#include <Eigen/Eigenvalues>
+
+#include <cuda/api_wrappers.h>
+
+#include "DataFormats/Math/interface/choleskyInversion.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h"
+
+constexpr int stride() { return 5 * 1024; }
+template <int DIM>
+using MXN = Eigen::Matrix<double, DIM, DIM>;
+template <int DIM>
+using MapMX = Eigen::Map<MXN<DIM>, 0, Eigen::Stride<DIM * stride(), stride()>>;
+
+template <int N>
+__global__ void invertSOA(double *__restrict__ p, unsigned int n) {
+  auto i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i >= n)
+    return;
+
+  MapMX<N> m(p + i);
+  choleskyInversion::invert(m, m);
+}
+
+template <typename M, int N>
+__global__ void invert(M *mm, unsigned int n) {
+  auto i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i >= n)
+    return;
+
+  auto &m = mm[i];
+  choleskyInversion::invert(m, m);
+}
+
+template <typename M, int N>
+__global__ void invertSeq(M *mm, unsigned int n) {
+  if (threadIdx.x != 0)
+    return;
+  auto first = blockIdx.x * blockDim.x;
+  auto last = std::min(first + blockDim.x, n);
+
+  for (auto i = first; i < last; ++i) {
+    auto &m = mm[i];
+    choleskyInversion::invert(m, m);
+  }
+}
+
+// generate matrices
+template <class M>
+void genMatrix(M &m) {
+  using T = typename std::remove_reference<decltype(m(0, 0))>::type;
+  int n = M::ColsAtCompileTime;
+  std::mt19937 eng;
+  // std::mt19937 eng2;
+  std::uniform_real_distribution<T> rgen(0., 1.);
+
+  // generate first diagonal elemets
+  for (int i = 0; i < n; ++i) {
+    double maxVal = i * 10000 / (n - 1) + 1;  // max condition is 10^4
+    m(i, i) = maxVal * rgen(eng);
+  }
+  for (int i = 0; i < n; ++i) {
+    for (int j = 0; j < i; ++j) {
+      double v = 0.3 * std::sqrt(m(i, i) * m(j, j));  // this makes the matrix pos defined
+      m(i, j) = v * rgen(eng);
+      m(j, i) = m(i, j);
+    }
+  }
+}
+
+template <int N>
+void go(bool soa) {
+  constexpr unsigned int DIM = N;
+  using MX = MXN<DIM>;
+  std::cout << "testing Matrix of dimension " << DIM << " size " << sizeof(MX) << std::endl;
+
+  auto start = std::chrono::high_resolution_clock::now();
+  auto delta = start - start;
+  auto delta1 = delta;
+  auto delta2 = delta;
+
+  if (cuda::device::count() == 0) {
+    std::cerr << "No CUDA devices on this system"
+              << "\n";
+    exit(EXIT_FAILURE);
+  }
+
+  auto current_device = cuda::device::current::get();
+
+  constexpr unsigned int SIZE = 4 * 1024;
+
+  MX mm[stride()];  // just storage in case of SOA
+  double *__restrict__ p = (double *)(mm);
+
+  if (soa) {
+    for (unsigned int i = 0; i < SIZE; ++i) {
+      MapMX<N> m(p + i);
+      genMatrix(m);
+    }
+  } else {
+    for (auto &m : mm)
+      genMatrix(m);
+  }
+
+  std::cout << mm[SIZE / 2](1, 1) << std::endl;
+
+  if (soa)
+    for (unsigned int i = 0; i < SIZE; ++i) {
+      MapMX<N> m(p + i);
+      choleskyInversion::invert(m, m);
+      choleskyInversion::invert(m, m);
+    }
+  else
+    for (auto &m : mm) {
+      choleskyInversion::invert(m, m);
+      choleskyInversion::invert(m, m);
+    }
+
+  std::cout << mm[SIZE / 2](1, 1) << std::endl;
+
+  auto m_d = cuda::memory::device::make_unique<double[]>(current_device, DIM * DIM * stride());
+  cudaMemcpy(m_d.get(), (double const *)(mm), stride() * sizeof(MX), cudaMemcpyHostToDevice);
+
+  constexpr int NKK =
+#ifdef DOPROF
+      2;
+#else
+      1000;
+#endif
+  for (int kk = 0; kk < NKK; ++kk) {
+    int threadsPerBlock = 128;
+    int blocksPerGrid = SIZE / threadsPerBlock;
+
+    delta -= (std::chrono::high_resolution_clock::now() - start);
+
+    if (soa)
+      cuda::launch(invertSOA<DIM>, {blocksPerGrid, threadsPerBlock}, m_d.get(), SIZE);
+    else
+      cuda::launch(invert<MX, DIM>, {blocksPerGrid, threadsPerBlock}, (MX *)(m_d.get()), SIZE);
+    
+    cudaMemcpy(&mm, m_d.get(), stride() * sizeof(MX), cudaMemcpyDeviceToHost);
+
+    delta += (std::chrono::high_resolution_clock::now() - start);
+
+    if (0 == kk)
+      std::cout << mm[SIZE / 2](1, 1) << std::endl;
+
+    if (!soa) {
+      delta1 -= (std::chrono::high_resolution_clock::now() - start);
+
+#ifndef DOPROF
+      cuda::launch(invertSeq<MX, DIM>, {blocksPerGrid, threadsPerBlock}, (MX *)(m_d.get()), SIZE);
+      
+      cudaMemcpy(&mm, m_d.get(), stride() * sizeof(MX), cudaMemcpyDeviceToHost);
+
+#endif
+      delta1 += (std::chrono::high_resolution_clock::now() - start);
+
+      if (0 == kk)
+        std::cout << mm[SIZE / 2](1, 1) << std::endl;
+    }
+
+    delta2 -= (std::chrono::high_resolution_clock::now() - start);
+    if (soa)
+#pragma GCC ivdep
+      for (unsigned int i = 0; i < SIZE; ++i) {
+        MapMX<N> m(p + i);
+        choleskyInversion::invert(m, m);
+      }
+    else
+#pragma GCC ivdep
+      for (auto &m : mm) {
+        choleskyInversion::invert(m, m);
+      }
+
+    delta2 += (std::chrono::high_resolution_clock::now() - start);
+  }
+
+  std::cout << mm[SIZE / 2](1, 1) << std::endl;
+
+  double DNNK = NKK;
+  std::cout << "cuda/cudaSeq/x86 computation took "
+            << std::chrono::duration_cast<std::chrono::milliseconds>(delta).count() / DNNK << ' '
+            << std::chrono::duration_cast<std::chrono::milliseconds>(delta1).count() / DNNK << ' '
+            << std::chrono::duration_cast<std::chrono::milliseconds>(delta2).count() / DNNK << ' ' << " ms"
+            << std::endl;
+}
+
+int main() {
+  exitSansCUDADevices();
+
+  go<2>(false);
+  go<4>(false);
+  go<5>(false);
+  go<6>(false);
+
+  go<2>(true);
+  go<4>(true);
+  go<5>(true);
+  go<6>(true);
+
+  // go<10>();
+  return 0;
+}
diff --git a/DataFormats/Math/test/cudaAtan2Test.cu b/DataFormats/Math/test/cudaAtan2Test.cu
index c436801640b09..eec6020b44179 100644
--- a/DataFormats/Math/test/cudaAtan2Test.cu
+++ b/DataFormats/Math/test/cudaAtan2Test.cu
@@ -72,7 +72,7 @@ void go() {
   auto diff_d = cuda::memory::device::make_unique<int[]>(current_device, 3);
 
   int diffs[3];
-  cuda::memory::device::zero(diff_d.get(), 3 * 4);
+  cudaMemset(diff_d.get(), 0, 3 * 4);
 
   // Launch the diff CUDA Kernel
   dim3 threadsPerBlock(32, 32, 1);
@@ -83,7 +83,7 @@ void go() {
 
   cudautils::launch(diffAtan<DEGREE>, {blocksPerGrid, threadsPerBlock}, diff_d.get());
 
-  cuda::memory::copy(diffs, diff_d.get(), 3 * 4);
+  cudaMemcpy(diffs, diff_d.get(), 3 * 4, cudaMemcpyDeviceToHost);
   delta += (std::chrono::high_resolution_clock::now() - start);
 
   float mdiff = diffs[0] * 1.e-7;
diff --git a/DataFormats/Math/test/cudaMathTest.cu b/DataFormats/Math/test/cudaMathTest.cu
index 1a77456a1c061..e443544e2e3c0 100644
--- a/DataFormats/Math/test/cudaMathTest.cu
+++ b/DataFormats/Math/test/cudaMathTest.cu
@@ -105,9 +105,9 @@ void go() {
   auto d_A = cuda::memory::device::make_unique<float[]>(current_device, numElements);
   auto d_B = cuda::memory::device::make_unique<float[]>(current_device, numElements);
   auto d_C = cuda::memory::device::make_unique<float[]>(current_device, numElements);
-
-  cuda::memory::copy(d_A.get(), h_A.get(), size);
-  cuda::memory::copy(d_B.get(), h_B.get(), size);
+  
+  cudaMemcpy(d_A.get(), h_A.get(), size, cudaMemcpyHostToDevice);
+  cudaMemcpy(d_B.get(), h_B.get(), size, cudaMemcpyHostToDevice);
   delta += (std::chrono::high_resolution_clock::now() - start);
   std::cout << "cuda alloc+copy took " << std::chrono::duration_cast<std::chrono::milliseconds>(delta).count() << " ms"
             << std::endl;
@@ -130,7 +130,7 @@ void go() {
             << std::endl;
 
   delta -= (std::chrono::high_resolution_clock::now() - start);
-  cuda::memory::copy(h_C.get(), d_C.get(), size);
+  cudaMemcpy(h_C.get(), d_C.get(), size, cudaMemcpyDeviceToHost);
   delta += (std::chrono::high_resolution_clock::now() - start);
   std::cout << "cuda copy back took " << std::chrono::duration_cast<std::chrono::milliseconds>(delta).count() << " ms"
             << std::endl;
diff --git a/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc b/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc
index 5e6d67b24e479..bab3d0a347509 100644
--- a/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc
+++ b/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc
@@ -31,8 +31,7 @@ namespace cudatest {
 namespace {
   std::unique_ptr<CUDAProduct<int*>> produce(int device, int* d, int* h) {
     auto ctx = cudatest::TestCUDAScopedContext::make(device, true);
-
-    cuda::memory::async::copy(d, h, sizeof(int), ctx.stream());
+    cudaMemcpyAsync(d, h, sizeof(int), cudaMemcpyHostToDevice, ctx.stream());
     testCUDAScopedContextKernels_single(d, ctx.stream());
     return ctx.wrap(d);
   }
@@ -116,9 +115,10 @@ TEST_CASE("Use of CUDAScopedContext", "[CUDACore]") {
       h_a1 = 0;
       h_a2 = 0;
       int h_a3 = 0;
-      cuda::memory::async::copy(&h_a1, d_a1.get(), sizeof(int), ctx.stream());
-      cuda::memory::async::copy(&h_a2, d_a2.get(), sizeof(int), ctx.stream());
-      cuda::memory::async::copy(&h_a3, d_a3.get(), sizeof(int), ctx.stream());
+      
+      cudaMemcpyAsync(&h_a1, d_a1.get(), sizeof(int), cudaMemcpyDeviceToHost, ctx.stream());
+      cudaMemcpyAsync(&h_a2, d_a2.get(), sizeof(int), cudaMemcpyDeviceToHost, ctx.stream());
+      cudaMemcpyAsync(&h_a3, d_a3.get(), sizeof(int), cudaMemcpyDeviceToHost, ctx.stream());
 
       REQUIRE(h_a1 == 2);
       REQUIRE(h_a2 == 4);
diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEW.cc b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEW.cc
index abded61899096..95af5ba5c0685 100644
--- a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEW.cc
+++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEW.cc
@@ -67,8 +67,7 @@ void TestCUDAProducerGPUEW::acquire(const edm::Event& iEvent,
   // Mimick the need to transfer some of the GPU data back to CPU to
   // be used for something within this module, or to be put in the
   // event.
-  cuda::memory::async::copy(hostData_.get(), devicePtr_.get() + 10, sizeof(float), ctx.stream());
-
+  cudaMemcpyAsync(hostData_.get(), devicePtr_.get() + 10, sizeof(float), cudaMemcpyDeviceToHost, ctx.stream());
   edm::LogVerbatim("TestCUDAProducerGPUEW") << label_ << " TestCUDAProducerGPUEW::acquire end event "
                                             << iEvent.id().event() << " stream " << iEvent.streamID();
 }
diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEWTask.cc b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEWTask.cc
index 2b628ae93051e..e38322b7f5713 100644
--- a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEWTask.cc
+++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEWTask.cc
@@ -75,8 +75,7 @@ void TestCUDAProducerGPUEWTask::acquire(const edm::Event& iEvent,
   // Mimick the need to transfer some of the GPU data back to CPU to
   // be used for something within this module, or to be put in the
   // event.
-  cuda::memory::async::copy(hostData_.get(), devicePtr_.get() + 10, sizeof(float), ctx.stream());
-
+ cudaMemcpyAsync(hostData_.get(), devicePtr_.get() + 10, sizeof(float), cudaMemcpyDeviceToHost, ctx.stream());
   // Push a task to run addSimpleWork() after the asynchronous work
   // (and acquire()) has finished instead of produce()
   ctx.pushNextTask([iev = iEvent.id().event(), istr = iEvent.streamID(), this](CUDAScopedContextTask ctx) {
@@ -94,7 +93,7 @@ void TestCUDAProducerGPUEWTask::addSimpleWork(edm::EventNumber_t eventID,
     edm::LogVerbatim("TestCUDAProducerGPUEWTask")
         << label_ << " TestCUDAProducerGPUEWTask::addSimpleWork begin event " << eventID << " stream " << streamID
         << " 10th element " << *hostData_ << " not satisfied, queueing more work";
-    cuda::memory::async::copy(hostData_.get(), devicePtr_.get() + 10, sizeof(float), ctx.stream());
+ cudaMemcpyAsync(hostData_.get(), devicePtr_.get() + 10, sizeof(float), cudaMemcpyDeviceToHost, ctx.stream());   
 
     ctx.pushNextTask([eventID, streamID, this](CUDAScopedContextTask ctx) { addSimpleWork(eventID, streamID, ctx); });
     gpuAlgo_.runSimpleAlgo(devicePtr_.get(), ctx.stream());
diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUKernel.cu b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUKernel.cu
index 73a6615259ebc..0015cdcdc7bb6 100644
--- a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUKernel.cu
+++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUKernel.cu
@@ -69,7 +69,7 @@ cudautils::device::unique_ptr<float[]> TestCUDAProducerGPUKernel::runAlgo(const
   // First make the sanity check
   if (d_input != nullptr) {
     auto h_check = std::make_unique<float[]>(NUM_VALUES);
-    cuda::memory::copy(h_check.get(), d_input, NUM_VALUES * sizeof(float));
+    cudaMemcpy(h_check.get(), d_input, NUM_VALUES* sizeof(float), cudaMemcpyDeviceToHost);
     for (int i = 0; i < NUM_VALUES; ++i) {
       if (h_check[i] != i) {
         throw cms::Exception("Assert") << "Sanity check on element " << i << " failed, expected " << i << " got "
@@ -88,10 +88,10 @@ cudautils::device::unique_ptr<float[]> TestCUDAProducerGPUKernel::runAlgo(const
 
   auto d_a = cudautils::make_device_unique<float[]>(NUM_VALUES, stream);
   auto d_b = cudautils::make_device_unique<float[]>(NUM_VALUES, stream);
-
-  cuda::memory::async::copy(d_a.get(), h_a.get(), NUM_VALUES * sizeof(float), stream);
-  cuda::memory::async::copy(d_b.get(), h_b.get(), NUM_VALUES * sizeof(float), stream);
-
+  
+  cudaMemcpyAsync(d_a.get(), h_a.get(), NUM_VALUES * sizeof(float), cudaMemcpyHostToDevice, stream);
+  cudaMemcpyAsync(d_b.get(), h_b.get(), NUM_VALUES * sizeof(float), cudaMemcpyHostToDevice, stream);
+ 
   int threadsPerBlock{32};
   int blocksPerGrid = (NUM_VALUES + threadsPerBlock - 1) / threadsPerBlock;
 
diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUtoCPU.cc b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUtoCPU.cc
index 06f099073e1ea..73f2123521dc2 100644
--- a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUtoCPU.cc
+++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUtoCPU.cc
@@ -58,8 +58,7 @@ void TestCUDAProducerGPUtoCPU::acquire(const edm::Event& iEvent,
 
   buffer_ = cudautils::make_host_unique<float[]>(TestCUDAProducerGPUKernel::NUM_VALUES, ctx.stream());
   // Enqueue async copy, continue in produce once finished
-  cuda::memory::async::copy(
-      buffer_.get(), device.get(), TestCUDAProducerGPUKernel::NUM_VALUES * sizeof(float), ctx.stream());
+  cudaMemcpyAsync(buffer_.get(), device.get(), TestCUDAProducerGPUKernel::NUM_VALUES * sizeof(float), cudaMemcpyDeviceToHost, ctx.stream());
 
   edm::LogVerbatim("TestCUDAProducerGPUtoCPU") << label_ << " TestCUDAProducerGPUtoCPU::acquire end event "
                                                << iEvent.id().event() << " stream " << iEvent.streamID();
diff --git a/HeterogeneousCore/CUDATest/test/test_TestCUDAProducerGPUFirst.cc b/HeterogeneousCore/CUDATest/test/test_TestCUDAProducerGPUFirst.cc
index 3ab110806ee78..8c3ae29a8b343 100644
--- a/HeterogeneousCore/CUDATest/test/test_TestCUDAProducerGPUFirst.cc
+++ b/HeterogeneousCore/CUDATest/test/test_TestCUDAProducerGPUFirst.cc
@@ -74,8 +74,8 @@ process.moduleToTest(process.toTest)
     REQUIRE(data != nullptr);
 
     float firstElements[10];
-    cuda::memory::async::copy(firstElements, data, sizeof(float) * 10, prod->stream());
-
+    cudaMemcpyAsync(firstElements, data, sizeof(float) * 10, cudaMemcpyDeviceToHost, prod->stream()); 
+    
     std::cout << "Synchronizing with CUDA stream" << std::endl;
     auto stream = prod->stream();
     cudaCheck(cudaStreamSynchronize(stream));
diff --git a/HeterogeneousCore/CUDAUtilities/interface/copyAsync.h b/HeterogeneousCore/CUDAUtilities/interface/copyAsync.h
index a327402b2f3d3..1f827c4d6bb0e 100644
--- a/HeterogeneousCore/CUDAUtilities/interface/copyAsync.h
+++ b/HeterogeneousCore/CUDAUtilities/interface/copyAsync.h
@@ -17,7 +17,7 @@ namespace cudautils {
     // Shouldn't compile for array types because of sizeof(T), but
     // let's add an assert with a more helpful message
     static_assert(std::is_array<T>::value == false, "For array types, use the other overload with the size parameter");
-    cuda::memory::async::copy(dst.get(), src.get(), sizeof(T), stream);
+    cudaMemcpyAsync(dst.get(), src.get(), sizeof(T), cudaMemcpyHostToDevice, stream);
   }
 
   template <typename T>
@@ -25,7 +25,7 @@ namespace cudautils {
                         const cudautils::device::unique_ptr<T>& src,
                         cudaStream_t stream) {
     static_assert(std::is_array<T>::value == false, "For array types, use the other overload with the size parameter");
-    cuda::memory::async::copy(dst.get(), src.get(), sizeof(T), stream);
+    cudaMemcpyAsync(dst.get(), src.get(), sizeof(T), cudaMemcpyHostToDevice, stream);
   }
 
   // Multiple elements
@@ -34,7 +34,7 @@ namespace cudautils {
                         const cudautils::host::unique_ptr<T[]>& src,
                         size_t nelements,
                         cudaStream_t stream) {
-    cuda::memory::async::copy(dst.get(), src.get(), nelements * sizeof(T), stream);
+  cudaMemcpyAsync(dst.get(), src.get(), nelements *  sizeof(T), cudaMemcpyHostToDevice, stream); 
   }
 
   template <typename T>
@@ -42,7 +42,7 @@ namespace cudautils {
                         const cudautils::device::unique_ptr<T[]>& src,
                         size_t nelements,
                         cudaStream_t stream) {
-    cuda::memory::async::copy(dst.get(), src.get(), nelements * sizeof(T), stream);
+  cudaMemcpyAsync(dst.get(), src.get(), nelements *  sizeof(T), cudaMemcpyHostToDevice, stream);
   }
 }  // namespace cudautils
 
diff --git a/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu b/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu
index e3ac77f1556a4..3db8b611012c6 100644
--- a/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu
+++ b/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu
@@ -25,8 +25,8 @@ void go() {
   constexpr int N = 12000;
   T v[N];
   auto v_d = cuda::memory::device::make_unique<T[]>(current_device, N);
-
-  cuda::memory::copy(v_d.get(), v, N * sizeof(T));
+  
+  cudaMemcpy(v_d.get(), v, N * sizeof(T), cudaMemcpyHostToDevice);
 
   constexpr uint32_t nParts = 10;
   constexpr uint32_t partSize = N / nParts;
@@ -65,7 +65,7 @@ void go() {
       offsets[10] = 3297 + offsets[9];
     }
 
-    cuda::memory::copy(off_d.get(), offsets, 4 * (nParts + 1));
+   cudaMemcpy(off_d.get(), offsets, 4 * (nParts + 1), cudaMemcpyHostToDevice);
 
     for (long long j = 0; j < N; j++)
       v[j] = rgen(eng);
@@ -75,11 +75,10 @@ void go() {
         v[j] = sizeof(T) == 1 ? 22 : 3456;
     }
 
-    cuda::memory::copy(v_d.get(), v, N * sizeof(T));
+   cudaMemcpy(v_d.get(), v, N * sizeof(T), cudaMemcpyHostToDevice);
 
     cudautils::fillManyFromVector(h_d.get(), ws_d.get(), nParts, v_d.get(), off_d.get(), offsets[10], 256, 0);
-
-    cuda::memory::copy(&h, h_d.get(), sizeof(Hist));
+    cudaMemcpy(&h, h_d.get(), sizeof(Hist), cudaMemcpyDeviceToHost);
     assert(0 == h.off[0]);
     assert(offsets[10] == h.size());
 
diff --git a/HeterogeneousCore/CUDAUtilities/test/OneHistoContainer_t.cu b/HeterogeneousCore/CUDAUtilities/test/OneHistoContainer_t.cu
index 4514ddd55e20b..4307da7abcf23 100644
--- a/HeterogeneousCore/CUDAUtilities/test/OneHistoContainer_t.cu
+++ b/HeterogeneousCore/CUDAUtilities/test/OneHistoContainer_t.cu
@@ -131,7 +131,7 @@ void go() {
 
     assert(v_d.get());
     assert(v);
-    cuda::memory::copy(v_d.get(), v, N * sizeof(T));
+    cudaMemcpy(v_d.get(), v , N * sizeof(T), cudaMemcpyHostToDevice);
     assert(v_d.get());
     cudautils::launch(mykernel<T, NBINS, S, DELTA>, {1, 256}, v_d.get(), N);
   }
diff --git a/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.h b/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.h
index b2d85cf1cf732..d361e0b128c78 100644
--- a/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.h
+++ b/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.h
@@ -169,8 +169,8 @@ int main() {
   auto a_d = cuda::memory::device::make_unique<Assoc[]>(current_device, 1);
   auto sa_d = cuda::memory::device::make_unique<SmallAssoc[]>(current_device, 1);
   auto ws_d = cuda::memory::device::make_unique<uint8_t[]>(current_device, Assoc::wsSize());
-
-  cuda::memory::copy(v_d.get(), tr.data(), N * sizeof(std::array<uint16_t, 4>));
+  
+  cudaMemcpy(v_d.get(), tr.data(), N *  sizeof(std::array<uint16_t, 4>), cudaMemcpyHostToDevice);
 #else
   auto a_d = std::make_unique<Assoc>();
   auto sa_d = std::make_unique<SmallAssoc>();
@@ -198,7 +198,7 @@ int main() {
   Assoc la;
 
 #ifdef __CUDACC__
-  cuda::memory::copy(&la, a_d.get(), sizeof(Assoc));
+  cudaMemcpy(&la, a_d.get(), sizeof(Assoc), cudaMemcpyDeviceToHost); 
 #else
   memcpy(&la, a_d.get(), sizeof(Assoc));  // not required, easier
 #endif
@@ -231,7 +231,7 @@ int main() {
   cudautils::finalizeBulk<<<nBlocks, nThreads>>>(dc_d, a_d.get());
   verifyBulk<<<1, 1>>>(a_d.get(), dc_d);
 
-  cuda::memory::copy(&la, a_d.get(), sizeof(Assoc));
+  cudaMemcpy(&la, a_d.get(), sizeof(Assoc), cudaMemcpyDeviceToHost);
   cudaMemcpy(&dc, dc_d, sizeof(AtomicPairCounter), cudaMemcpyDeviceToHost);
 
   cudaMemset(dc_d, 0, sizeof(AtomicPairCounter));
diff --git a/HeterogeneousCore/CUDAUtilities/test/copyAsync_t.cpp b/HeterogeneousCore/CUDAUtilities/test/copyAsync_t.cpp
index cec4df12ca99b..d29b41dffc7d3 100644
--- a/HeterogeneousCore/CUDAUtilities/test/copyAsync_t.cpp
+++ b/HeterogeneousCore/CUDAUtilities/test/copyAsync_t.cpp
@@ -21,8 +21,9 @@ TEST_CASE("copyAsync", "[cudaMemTools]") {
       auto host = cudautils::make_host_unique<int>(stream);
 
       cudautils::copyAsync(device, host_orig, stream);
-      cuda::memory::async::copy(host.get(), device.get(), sizeof(int), stream);
       cudaCheck(cudaStreamSynchronize(stream));
+      cudaMemcpyAsync(host.get(), device.get(), sizeof(int), cudaMemcpyDeviceToHost, stream);
+      stream.synchronize();
 
       REQUIRE(*host == 42);
     }
@@ -40,8 +41,9 @@ TEST_CASE("copyAsync", "[cudaMemTools]") {
 
       SECTION("Copy all") {
         cudautils::copyAsync(device, host_orig, N, stream);
-        cuda::memory::async::copy(host.get(), device.get(), N * sizeof(int), stream);
         cudaCheck(cudaStreamSynchronize(stream));
+	cudaMemcpyAsync(host.get(), device.get(), N *  sizeof(int), cudaMemcpyDeviceToHost, stream);
+        stream.synchronize();
         for (int i = 0; i < N; ++i) {
           CHECK(host[i] == i);
         }
@@ -53,8 +55,9 @@ TEST_CASE("copyAsync", "[cudaMemTools]") {
 
       SECTION("Copy some") {
         cudautils::copyAsync(device, host_orig, 42, stream);
-        cuda::memory::async::copy(host.get(), device.get(), 42 * sizeof(int), stream);
         cudaCheck(cudaStreamSynchronize(stream));
+	cudaMemcpyAsync(host.get(), device.get(), 42 * sizeof(int), cudaMemcpyDeviceToHost, stream);
+        stream.synchronize();
         for (int i = 0; i < 42; ++i) {
           CHECK(host[i] == 200 + i);
         }
@@ -70,7 +73,7 @@ TEST_CASE("copyAsync", "[cudaMemTools]") {
       auto device = cudautils::make_device_unique<int>(stream);
       auto host = cudautils::make_host_unique<int>(stream);
 
-      cuda::memory::async::copy(device.get(), host_orig.get(), sizeof(int), stream);
+      cudaMemcpyAsync(device.get(), host_orig.get(), sizeof(int), cudaMemcpyHostToDevice, stream);
       cudautils::copyAsync(host, device, stream);
       cudaCheck(cudaStreamSynchronize(stream));
 
@@ -89,7 +92,7 @@ TEST_CASE("copyAsync", "[cudaMemTools]") {
       auto host = cudautils::make_host_unique<int[]>(N, stream);
 
       SECTION("Copy all") {
-        cuda::memory::async::copy(device.get(), host_orig.get(), N * sizeof(int), stream);
+	cudaMemcpyAsync(device.get(), host_orig.get(), N *  sizeof(int), cudaMemcpyHostToDevice, stream);
         cudautils::copyAsync(host, device, N, stream);
         cudaCheck(cudaStreamSynchronize(stream));
         for (int i = 0; i < N; ++i) {
@@ -102,7 +105,7 @@ TEST_CASE("copyAsync", "[cudaMemTools]") {
       }
 
       SECTION("Copy some") {
-        cuda::memory::async::copy(device.get(), host_orig.get(), 42 * sizeof(int), stream);
+        cudaMemcpyAsync(device.get(), host_orig.get(), 42 * sizeof(int), cudaMemcpyHostToDevice, stream);
         cudautils::copyAsync(host, device, 42, stream);
         cudaCheck(cudaStreamSynchronize(stream));
         for (int i = 0; i < 42; ++i) {
diff --git a/HeterogeneousCore/CUDAUtilities/test/radixSort_t.cu b/HeterogeneousCore/CUDAUtilities/test/radixSort_t.cu
index 9be80547837d4..e1d0a42252482 100644
--- a/HeterogeneousCore/CUDAUtilities/test/radixSort_t.cu
+++ b/HeterogeneousCore/CUDAUtilities/test/radixSort_t.cu
@@ -101,9 +101,9 @@ void go(bool useShared) {
     auto ind_d = cuda::memory::device::make_unique<uint16_t[]>(current_device, N);
     auto ws_d = cuda::memory::device::make_unique<uint16_t[]>(current_device, N);
     auto off_d = cuda::memory::device::make_unique<uint32_t[]>(current_device, blocks + 1);
-
-    cuda::memory::copy(v_d.get(), v, N * sizeof(T));
-    cuda::memory::copy(off_d.get(), offsets, 4 * (blocks + 1));
+    
+    cudaMemcpy(v_d.get(), v , N * sizeof(T), cudaMemcpyHostToDevice);
+    cudaMemcpy(off_d.get(), offsets, 4 * (blocks+1), cudaMemcpyHostToDevice);
 
     if (i < 2)
       std::cout << "lauch for " << offsets[blocks] << std::endl;
@@ -121,8 +121,7 @@ void go(bool useShared) {
     if (i == 0)
       std::cout << "done for " << offsets[blocks] << std::endl;
 
-    //  cuda::memory::copy(v, v_d.get(), 2*N);
-    cuda::memory::copy(ind, ind_d.get(), 2 * N);
+    cudaMemcpy(ind, ind_d.get(), 2 * N, cudaMemcpyDeviceToHost);  
 
     delta += (std::chrono::high_resolution_clock::now() - start);
 
diff --git a/HeterogeneousCore/Producer/test/TestHeterogeneousEDProducerGPUHelpers.cu b/HeterogeneousCore/Producer/test/TestHeterogeneousEDProducerGPUHelpers.cu
index 7e531a48b6b82..b28ee0cbc9949 100644
--- a/HeterogeneousCore/Producer/test/TestHeterogeneousEDProducerGPUHelpers.cu
+++ b/HeterogeneousCore/Producer/test/TestHeterogeneousEDProducerGPUHelpers.cu
@@ -6,7 +6,6 @@
 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/launch.h"
 #include "TestHeterogeneousEDProducerGPUHelpers.h"
-
 //
 // Vector Addition Kernel
 //
@@ -59,8 +58,8 @@ namespace {
 
 int TestAcceleratorServiceProducerGPUHelpers_simple_kernel(int input) {
   // Example from Viktor/cuda-api-wrappers
-  constexpr int NUM_VALUES = 10000;
-
+  constexpr int  NUM_VALUES = 10000;
+   
   auto current_device = cuda::device::current::get();
   auto stream = current_device.create_stream(cuda::stream::implicitly_synchronizes_with_default_stream);
 
@@ -77,8 +76,8 @@ int TestAcceleratorServiceProducerGPUHelpers_simple_kernel(int input) {
   auto d_b = cuda::memory::device::make_unique<int[]>(current_device, NUM_VALUES);
   auto d_c = cuda::memory::device::make_unique<int[]>(current_device, NUM_VALUES);
 
-  cuda::memory::async::copy(d_a.get(), h_a.get(), NUM_VALUES * sizeof(int), stream.id());
-  cuda::memory::async::copy(d_b.get(), h_b.get(), NUM_VALUES * sizeof(int), stream.id());
+   cudaMemcpyAsync(d_a.get(), h_a.get(), NUM_VALUES * sizeof(int), cudaMemcpyHostToDevice, stream.id());
+   cudaMemcpyAsync(d_b.get(), h_b.get(), NUM_VALUES * sizeof(int), cudaMemcpyHostToDevice, stream.id());
 
   int threadsPerBlock{256};
   int blocksPerGrid = (NUM_VALUES + threadsPerBlock - 1) / threadsPerBlock;
@@ -90,8 +89,8 @@ int TestAcceleratorServiceProducerGPUHelpers_simple_kernel(int input) {
   cudautils::launch(vectorAdd, {blocksPerGrid, threadsPerBlock},
                d_a.get(), d_b.get(), d_c.get(), NUM_VALUES);
   */
-
-  cuda::memory::async::copy(h_c.get(), d_c.get(), NUM_VALUES * sizeof(int), stream.id());
+  
+  cudaMemcpyAsync(h_c.get(), d_c.get(), NUM_VALUES * sizeof(int), cudaMemcpyDeviceToHost, stream.id());
 
   stream.synchronize();
 
@@ -111,6 +110,7 @@ TestHeterogeneousEDProducerGPUTask::TestHeterogeneousEDProducerGPUTask() {
   h_a = cuda::memory::host::make_unique<float[]>(NUM_VALUES);
   h_b = cuda::memory::host::make_unique<float[]>(NUM_VALUES);
 
+  
   auto current_device = cuda::device::current::get();
   d_b = cuda::memory::device::make_unique<float[]>(current_device, NUM_VALUES);
 
@@ -124,7 +124,8 @@ TestHeterogeneousEDProducerGPUTask::ResultType TestHeterogeneousEDProducerGPUTas
   // First make the sanity check
   if (inputArrays.first != nullptr) {
     auto h_check = std::make_unique<float[]>(NUM_VALUES);
-    cuda::memory::copy(h_check.get(), inputArrays.first, NUM_VALUES * sizeof(float));
+    cudaMemcpy(h_check.get(), inputArrays.first, NUM_VALUES * sizeof(float), cudaMemcpyDeviceToHost);
+	
     for (int i = 0; i < NUM_VALUES; ++i) {
       if (h_check[i] != i) {
         throw cms::Exception("Assert") << "Sanity check on element " << i << " failed, expected " << i << " got "
@@ -139,15 +140,17 @@ TestHeterogeneousEDProducerGPUTask::ResultType TestHeterogeneousEDProducerGPUTas
   }
 
   auto current_device = cuda::device::current::get();
+
   auto d_a = cuda::memory::device::make_unique<float[]>(current_device, NUM_VALUES);
   auto d_c = cuda::memory::device::make_unique<float[]>(current_device, NUM_VALUES);
   if (inputArrays.second != nullptr) {
+
     d_d = cuda::memory::device::make_unique<float[]>(current_device, NUM_VALUES);
   }
 
   // Create stream
-  cuda::memory::async::copy(d_a.get(), h_a.get(), NUM_VALUES * sizeof(float), stream.id());
-  cuda::memory::async::copy(d_b.get(), h_b.get(), NUM_VALUES * sizeof(float), stream.id());
+ cudaMemcpyAsync(d_a.get(), h_a.get(), NUM_VALUES * sizeof(float), cudaMemcpyHostToDevice, stream.id());
+ cudaMemcpyAsync(d_b.get(), h_b.get(), NUM_VALUES * sizeof(float), cudaMemcpyHostToDevice, stream.id());
 
   int threadsPerBlock{32};
   int blocksPerGrid = (NUM_VALUES + threadsPerBlock - 1) / threadsPerBlock;
@@ -195,7 +198,7 @@ void TestHeterogeneousEDProducerGPUTask::release(const std::string &label, cuda:
 
 int TestHeterogeneousEDProducerGPUTask::getResult(const ResultTypeRaw &d_ac, cuda::stream_t<> &stream) {
   auto h_c = cuda::memory::host::make_unique<float[]>(NUM_VALUES);
-  cuda::memory::async::copy(h_c.get(), d_ac.second, NUM_VALUES * sizeof(int), stream.id());
+  cudaMemcpyAsync(h_c.get(), d_ac.second, NUM_VALUES * sizeof(int), cudaMemcpyDeviceToHost, stream.id());
   stream.synchronize();
 
   float ret = 0;
diff --git a/RecoLocalTracker/SiPixelClusterizer/src/SiPixelFedCablingMapGPUWrapper.cc b/RecoLocalTracker/SiPixelClusterizer/src/SiPixelFedCablingMapGPUWrapper.cc
index 4a554b47bbf63..d484d9eeadea9 100644
--- a/RecoLocalTracker/SiPixelClusterizer/src/SiPixelFedCablingMapGPUWrapper.cc
+++ b/RecoLocalTracker/SiPixelClusterizer/src/SiPixelFedCablingMapGPUWrapper.cc
@@ -155,9 +155,9 @@ cudautils::device::unique_ptr<unsigned char[]> SiPixelFedCablingMapGPUWrapper::g
       }
     }
   }
-
-  cuda::memory::async::copy(
-      modToUnpDevice.get(), modToUnpHost.get(), pixelgpudetails::MAX_SIZE * sizeof(unsigned char), cudaStream);
+  
+  cudaMemcpyAsync(
+      modToUnpDevice.get(), modToUnpHost.get(), pixelgpudetails::MAX_SIZE * sizeof(unsigned char), cudaMemcpyHostToDevice, cudaStream);
   return modToUnpDevice;
 }
 
diff --git a/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering_t.h b/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering_t.h
index 4f7ced9b7e309..d65ca1033d454 100644
--- a/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering_t.h
+++ b/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering_t.h
@@ -252,12 +252,12 @@ int main(void) {
     size_t size16 = n * sizeof(unsigned short);
     // size_t size8 = n * sizeof(uint8_t);
 
-    cuda::memory::copy(d_moduleStart.get(), &nModules, sizeof(uint32_t));
+    cudaMemcpy(d_moduleStart.get(), &nModules, sizeof(uint32_t), cudaMemcpyHostToDevice);
 
-    cuda::memory::copy(d_id.get(), h_id.get(), size16);
-    cuda::memory::copy(d_x.get(), h_x.get(), size16);
-    cuda::memory::copy(d_y.get(), h_y.get(), size16);
-    cuda::memory::copy(d_adc.get(), h_adc.get(), size16);
+    cudaMemcpy(d_id.get(), h_id.get(), size16, cudaMemcpyHostToDevice);
+    cudaMemcpy(d_x.get(), h_x.get(), size16, cudaMemcpyHostToDevice);
+    cudaMemcpy(d_y.get(), h_y.get(), size16, cudaMemcpyHostToDevice);
+    cudaMemcpy(d_adc.get(), h_adc.get(), size16, cudaMemcpyHostToDevice);
     // Launch CUDA Kernels
     int threadsPerBlock = (kkk == 5) ? 512 : ((kkk == 3) ? 128 : 256);
     int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
@@ -270,8 +270,7 @@ int main(void) {
 
     std::cout << "CUDA findModules kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock
               << " threads\n";
-
-    cuda::memory::device::zero(d_clusInModule.get(), MaxNumModules * sizeof(uint32_t));
+    cudaMemset(d_clusInModule.get(), 0, MaxNumModules * sizeof(uint32_t));
 
     cudautils::launch(findClus,
                  {blocksPerGrid, threadsPerBlock},
@@ -284,12 +283,10 @@ int main(void) {
                  d_clus.get(),
                  n);
     cudaDeviceSynchronize();
-
-    cuda::memory::copy(&nModules, d_moduleStart.get(), sizeof(uint32_t));
+    cudaMemcpy(&nModules, d_moduleStart.get(), sizeof(uint32_t), cudaMemcpyDeviceToHost);
 
     uint32_t nclus[MaxNumModules], moduleId[nModules];
-
-    cuda::memory::copy(&nclus, d_clusInModule.get(), MaxNumModules * sizeof(uint32_t));
+    cudaMemcpy(&nclus, d_clusInModule.get(), MaxNumModules * sizeof(uint32_t), cudaMemcpyDeviceToHost);
 
     std::cout << "before charge cut found " << std::accumulate(nclus, nclus + MaxNumModules, 0) << " clusters"
               << std::endl;
@@ -354,10 +351,10 @@ int main(void) {
     std::cout << "found " << nModules << " Modules active" << std::endl;
 
 #ifdef __CUDACC__
-    cuda::memory::copy(h_id.get(), d_id.get(), size16);
-    cuda::memory::copy(h_clus.get(), d_clus.get(), size32);
-    cuda::memory::copy(&nclus, d_clusInModule.get(), MaxNumModules * sizeof(uint32_t));
-    cuda::memory::copy(&moduleId, d_moduleId.get(), nModules * sizeof(uint32_t));
+    cudaMemcpy(h_id.get(), d_id.get(), size16, cudaMemcpyDeviceToHost);
+    cudaMemcpy(h_clus.get(), d_clus.get(), size16, cudaMemcpyDeviceToHost);
+    cudaMemcpy(&nclus, d_clusInModule.get(), MaxNumModules * sizeof(uint32_t), cudaMemcpyDeviceToHost);
+    cudaMemcpy(&moduleId, d_moduleId.get(), nModules * sizeof(uint32_t), cudaMemcpyDeviceToHost);
 #endif
 
     std::set<unsigned int> clids;
diff --git a/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h b/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h
index 0df7af362ac0d..2313bbe7803bd 100644
--- a/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h
+++ b/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h
@@ -126,10 +126,10 @@ int main() {
       std::cout << "v,t size " << ev.zvert.size() << ' ' << ev.ztrack.size() << std::endl;
       auto nt = ev.ztrack.size();
 #ifdef __CUDACC__
-      cuda::memory::copy(LOC_WS(ntrks), &nt, sizeof(uint32_t));
-      cuda::memory::copy(LOC_WS(zt), ev.ztrack.data(), sizeof(float) * ev.ztrack.size());
-      cuda::memory::copy(LOC_WS(ezt2), ev.eztrack.data(), sizeof(float) * ev.eztrack.size());
-      cuda::memory::copy(LOC_WS(ptt2), ev.pttrack.data(), sizeof(float) * ev.eztrack.size());
+      cudaMemcpy(LOC_WS(ntrks), &nt, sizeof(uint32_t), cudaMemcpyHostToDevice);
+      cudaMemcpy(LOC_WS(zt), ev.ztrack.data(), sizeof(float) * ev.ztrack.size(), cudaMemcpyHostToDevice);
+      cudaMemcpy(LOC_WS(ezt2), ev.eztrack.data(), sizeof(float) * ev.eztrack.size(), cudaMemcpyHostToDevice);
+      cudaMemcpy(LOC_WS(ptt2), ev.pttrack.data(), sizeof(float) * ev.eztrack.size(), cudaMemcpyHostToDevice);
 #else
       ::memcpy(LOC_WS(ntrks), &nt, sizeof(uint32_t));
       ::memcpy(LOC_WS(zt), ev.ztrack.data(), sizeof(float) * ev.ztrack.size());
@@ -162,7 +162,7 @@ int main() {
 
       cudautils::launch(fitVertices, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 50.f);
       cudaCheck(cudaGetLastError());
-      cuda::memory::copy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t));
+      cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost);
 
 #else
       print(onGPU_d.get(), ws_d.get());
@@ -207,8 +207,8 @@ int main() {
 #endif
 
 #ifdef __CUDACC__
-      cuda::memory::copy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t));
-      cuda::memory::copy(chi2, LOC_ONGPU(chi2), nv * sizeof(float));
+      cudaMemcpy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t), cudaMemcpyDeviceToHost);
+      cudaMemcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float), cudaMemcpyDeviceToHost);
 #else
       memcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float));
 #endif
@@ -223,9 +223,9 @@ int main() {
 
 #ifdef __CUDACC__
       cudautils::launch(fitVertices, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 50.f);
-      cuda::memory::copy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t));
-      cuda::memory::copy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t));
-      cuda::memory::copy(chi2, LOC_ONGPU(chi2), nv * sizeof(float));
+      cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost);
+      cudaMemcpy(nn, LOC_ONGPU(ndof), nv * sizeof(uint32_t), cudaMemcpyDeviceToHost);
+      cudaMemcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float), cudaMemcpyDeviceToHost);
 #else
       fitVertices(onGPU_d.get(), ws_d.get(), 50.f);
       nv = onGPU_d->nvFinal;
@@ -243,7 +243,7 @@ int main() {
 #ifdef __CUDACC__
       // one vertex per block!!!
       cudautils::launch(splitVertices, {1024, 64}, onGPU_d.get(), ws_d.get(), 9.f);
-      cuda::memory::copy(&nv, LOC_WS(nvIntermediate), sizeof(uint32_t));
+      cudaMemcpy(&nv, LOC_WS(nvIntermediate), sizeof(uint32_t), cudaMemcpyDeviceToHost);
 #else
       gridDim.x = 1024;  // nv ????
       assert(blockIdx.x == 0);
@@ -260,7 +260,7 @@ int main() {
 
       cudautils::launch(sortByPt2, {1, 256}, onGPU_d.get(), ws_d.get());
       cudaCheck(cudaGetLastError());
-      cuda::memory::copy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t));
+      cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost); 
 #else
       fitVertices(onGPU_d.get(), ws_d.get(), 5000.f);
       sortByPt2(onGPU_d.get(), ws_d.get());
@@ -274,12 +274,12 @@ int main() {
       }
 
 #ifdef __CUDACC__
-      cuda::memory::copy(zv, LOC_ONGPU(zv), nv * sizeof(float));
-      cuda::memory::copy(wv, LOC_ONGPU(wv), nv * sizeof(float));
-      cuda::memory::copy(chi2, LOC_ONGPU(chi2), nv * sizeof(float));
-      cuda::memory::copy(ptv2, LOC_ONGPU(ptv2), nv * sizeof(float));
-      cuda::memory::copy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t));
-      cuda::memory::copy(ind, LOC_ONGPU(sortInd), nv * sizeof(uint16_t));
+      cudaMemcpy(zv, LOC_ONGPU(zv), nv * sizeof(float), cudaMemcpyDeviceToHost);
+      cudaMemcpy(wv, LOC_ONGPU(wv), nv * sizeof(float), cudaMemcpyDeviceToHost);
+      cudaMemcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float), cudaMemcpyDeviceToHost);
+      cudaMemcpy(ptv2, LOC_ONGPU(ptv2), nv * sizeof(float), cudaMemcpyDeviceToHost);
+      cudaMemcpy(nn , LOC_ONGPU(ndof), nv * sizeof(int32_t), cudaMemcpyDeviceToHost);
+      cudaMemcpy(ind ,LOC_ONGPU(sortInd), nv * sizeof(uint16_t), cudaMemcpyDeviceToHost); 
 #endif
       for (auto j = 0U; j < nv; ++j)
         if (nn[j] > 0)

From ede0cdd9bd90a2c9d3a4e0fb23ad4b92fa918eef Mon Sep 17 00:00:00 2001
From: Wahid Redjeb <wahid.redjeb.wr@gmail.com>
Date: Sat, 26 Oct 2019 22:51:01 +0200
Subject: [PATCH 02/13] Solve conflicts with #389

---
 .../Math/test/CholeskyInvert_t_BACKUP_195321.cu        | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/DataFormats/Math/test/CholeskyInvert_t_BACKUP_195321.cu b/DataFormats/Math/test/CholeskyInvert_t_BACKUP_195321.cu
index e5c714acea1f5..bad32accd2d00 100644
--- a/DataFormats/Math/test/CholeskyInvert_t_BACKUP_195321.cu
+++ b/DataFormats/Math/test/CholeskyInvert_t_BACKUP_195321.cu
@@ -149,13 +149,8 @@ void go(bool soa) {
     if (soa)
       cudautils::launch(invertSOA<DIM>, {blocksPerGrid, threadsPerBlock}, m_d.get(), SIZE);
     else
-<<<<<<< HEAD
       cudautils::launch(invert<MX, DIM>, {blocksPerGrid, threadsPerBlock}, (MX *)(m_d.get()), SIZE);
-=======
-      cuda::launch(invert<MX, DIM>, {blocksPerGrid, threadsPerBlock}, (MX *)(m_d.get()), SIZE);
-    
     cudaMemcpy(&mm, m_d.get(), stride() * sizeof(MX), cudaMemcpyDeviceToHost);
->>>>>>> Replace cuda::memory[::async]::copy() with cudaMemcpy[Async](), cuda::memory[::async]::zero() and
 
     delta += (std::chrono::high_resolution_clock::now() - start);
 
@@ -166,13 +161,8 @@ void go(bool soa) {
       delta1 -= (std::chrono::high_resolution_clock::now() - start);
 
 #ifndef DOPROF
-<<<<<<< HEAD
       cudautils::launch(invertSeq<MX, DIM>, {blocksPerGrid, threadsPerBlock}, (MX *)(m_d.get()), SIZE);
-=======
-      cuda::launch(invertSeq<MX, DIM>, {blocksPerGrid, threadsPerBlock}, (MX *)(m_d.get()), SIZE);
-      
       cudaMemcpy(&mm, m_d.get(), stride() * sizeof(MX), cudaMemcpyDeviceToHost);
->>>>>>> Replace cuda::memory[::async]::copy() with cudaMemcpy[Async](), cuda::memory[::async]::zero() and
 
 #endif
       delta1 += (std::chrono::high_resolution_clock::now() - start);

From a731751a0dd965f5a18b69d94ed11f2d20ba9f62 Mon Sep 17 00:00:00 2001
From: Andrea Bocci <andrea.bocci@cern.ch>
Date: Sat, 26 Oct 2019 23:41:03 +0200
Subject: [PATCH 03/13] Delete spurious file

---
 .../test/CholeskyInvert_t_BACKUP_195321.cu    | 215 ------------------
 1 file changed, 215 deletions(-)
 delete mode 100644 DataFormats/Math/test/CholeskyInvert_t_BACKUP_195321.cu

diff --git a/DataFormats/Math/test/CholeskyInvert_t_BACKUP_195321.cu b/DataFormats/Math/test/CholeskyInvert_t_BACKUP_195321.cu
deleted file mode 100644
index bad32accd2d00..0000000000000
--- a/DataFormats/Math/test/CholeskyInvert_t_BACKUP_195321.cu
+++ /dev/null
@@ -1,215 +0,0 @@
-// nvcc -O3 CholeskyDecomp_t.cu -Icuda-api-wrappers/src/ --expt-relaxed-constexpr -gencode arch=compute_61,code=sm_61 --compiler-options="-Ofast -march=native"
-// add -DDOPROF to run  nvprof --metrics all
-
-#include <algorithm>
-#include <cassert>
-#include <chrono>
-#include <iomanip>
-#include <iostream>
-#include <limits>
-#include <memory>
-#include <random>
-
-#include <Eigen/Core>
-#include <Eigen/Eigenvalues>
-
-#include <cuda/api_wrappers.h>
-
-#include "DataFormats/Math/interface/choleskyInversion.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/launch.h"
-
-constexpr int stride() { return 5 * 1024; }
-template <int DIM>
-using MXN = Eigen::Matrix<double, DIM, DIM>;
-template <int DIM>
-using MapMX = Eigen::Map<MXN<DIM>, 0, Eigen::Stride<DIM * stride(), stride()>>;
-
-template <int N>
-__global__ void invertSOA(double *__restrict__ p, unsigned int n) {
-  auto i = blockIdx.x * blockDim.x + threadIdx.x;
-  if (i >= n)
-    return;
-
-  MapMX<N> m(p + i);
-  choleskyInversion::invert(m, m);
-}
-
-template <typename M, int N>
-__global__ void invert(M *mm, unsigned int n) {
-  auto i = blockIdx.x * blockDim.x + threadIdx.x;
-  if (i >= n)
-    return;
-
-  auto &m = mm[i];
-  choleskyInversion::invert(m, m);
-}
-
-template <typename M, int N>
-__global__ void invertSeq(M *mm, unsigned int n) {
-  if (threadIdx.x != 0)
-    return;
-  auto first = blockIdx.x * blockDim.x;
-  auto last = std::min(first + blockDim.x, n);
-
-  for (auto i = first; i < last; ++i) {
-    auto &m = mm[i];
-    choleskyInversion::invert(m, m);
-  }
-}
-
-// generate matrices
-template <class M>
-void genMatrix(M &m) {
-  using T = typename std::remove_reference<decltype(m(0, 0))>::type;
-  int n = M::ColsAtCompileTime;
-  std::mt19937 eng;
-  // std::mt19937 eng2;
-  std::uniform_real_distribution<T> rgen(0., 1.);
-
-  // generate first diagonal elemets
-  for (int i = 0; i < n; ++i) {
-    double maxVal = i * 10000 / (n - 1) + 1;  // max condition is 10^4
-    m(i, i) = maxVal * rgen(eng);
-  }
-  for (int i = 0; i < n; ++i) {
-    for (int j = 0; j < i; ++j) {
-      double v = 0.3 * std::sqrt(m(i, i) * m(j, j));  // this makes the matrix pos defined
-      m(i, j) = v * rgen(eng);
-      m(j, i) = m(i, j);
-    }
-  }
-}
-
-template <int N>
-void go(bool soa) {
-  constexpr unsigned int DIM = N;
-  using MX = MXN<DIM>;
-  std::cout << "testing Matrix of dimension " << DIM << " size " << sizeof(MX) << std::endl;
-
-  auto start = std::chrono::high_resolution_clock::now();
-  auto delta = start - start;
-  auto delta1 = delta;
-  auto delta2 = delta;
-
-  if (cuda::device::count() == 0) {
-    std::cerr << "No CUDA devices on this system"
-              << "\n";
-    exit(EXIT_FAILURE);
-  }
-
-  auto current_device = cuda::device::current::get();
-
-  constexpr unsigned int SIZE = 4 * 1024;
-
-  MX mm[stride()];  // just storage in case of SOA
-  double *__restrict__ p = (double *)(mm);
-
-  if (soa) {
-    for (unsigned int i = 0; i < SIZE; ++i) {
-      MapMX<N> m(p + i);
-      genMatrix(m);
-    }
-  } else {
-    for (auto &m : mm)
-      genMatrix(m);
-  }
-
-  std::cout << mm[SIZE / 2](1, 1) << std::endl;
-
-  if (soa)
-    for (unsigned int i = 0; i < SIZE; ++i) {
-      MapMX<N> m(p + i);
-      choleskyInversion::invert(m, m);
-      choleskyInversion::invert(m, m);
-    }
-  else
-    for (auto &m : mm) {
-      choleskyInversion::invert(m, m);
-      choleskyInversion::invert(m, m);
-    }
-
-  std::cout << mm[SIZE / 2](1, 1) << std::endl;
-
-  auto m_d = cuda::memory::device::make_unique<double[]>(current_device, DIM * DIM * stride());
-  cudaMemcpy(m_d.get(), (double const *)(mm), stride() * sizeof(MX), cudaMemcpyHostToDevice);
-
-  constexpr int NKK =
-#ifdef DOPROF
-      2;
-#else
-      1000;
-#endif
-  for (int kk = 0; kk < NKK; ++kk) {
-    int threadsPerBlock = 128;
-    int blocksPerGrid = SIZE / threadsPerBlock;
-
-    delta -= (std::chrono::high_resolution_clock::now() - start);
-
-    if (soa)
-      cudautils::launch(invertSOA<DIM>, {blocksPerGrid, threadsPerBlock}, m_d.get(), SIZE);
-    else
-      cudautils::launch(invert<MX, DIM>, {blocksPerGrid, threadsPerBlock}, (MX *)(m_d.get()), SIZE);
-    cudaMemcpy(&mm, m_d.get(), stride() * sizeof(MX), cudaMemcpyDeviceToHost);
-
-    delta += (std::chrono::high_resolution_clock::now() - start);
-
-    if (0 == kk)
-      std::cout << mm[SIZE / 2](1, 1) << std::endl;
-
-    if (!soa) {
-      delta1 -= (std::chrono::high_resolution_clock::now() - start);
-
-#ifndef DOPROF
-      cudautils::launch(invertSeq<MX, DIM>, {blocksPerGrid, threadsPerBlock}, (MX *)(m_d.get()), SIZE);
-      cudaMemcpy(&mm, m_d.get(), stride() * sizeof(MX), cudaMemcpyDeviceToHost);
-
-#endif
-      delta1 += (std::chrono::high_resolution_clock::now() - start);
-
-      if (0 == kk)
-        std::cout << mm[SIZE / 2](1, 1) << std::endl;
-    }
-
-    delta2 -= (std::chrono::high_resolution_clock::now() - start);
-    if (soa)
-#pragma GCC ivdep
-      for (unsigned int i = 0; i < SIZE; ++i) {
-        MapMX<N> m(p + i);
-        choleskyInversion::invert(m, m);
-      }
-    else
-#pragma GCC ivdep
-      for (auto &m : mm) {
-        choleskyInversion::invert(m, m);
-      }
-
-    delta2 += (std::chrono::high_resolution_clock::now() - start);
-  }
-
-  std::cout << mm[SIZE / 2](1, 1) << std::endl;
-
-  double DNNK = NKK;
-  std::cout << "cuda/cudaSeq/x86 computation took "
-            << std::chrono::duration_cast<std::chrono::milliseconds>(delta).count() / DNNK << ' '
-            << std::chrono::duration_cast<std::chrono::milliseconds>(delta1).count() / DNNK << ' '
-            << std::chrono::duration_cast<std::chrono::milliseconds>(delta2).count() / DNNK << ' ' << " ms"
-            << std::endl;
-}
-
-int main() {
-  exitSansCUDADevices();
-
-  go<2>(false);
-  go<4>(false);
-  go<5>(false);
-  go<6>(false);
-
-  go<2>(true);
-  go<4>(true);
-  go<5>(true);
-  go<6>(true);
-
-  // go<10>();
-  return 0;
-}

From c00e7bd769ad7e932a7c89daab5c34654cd37dc3 Mon Sep 17 00:00:00 2001
From: Andrea Bocci <andrea.bocci@cern.ch>
Date: Sat, 26 Oct 2019 23:41:17 +0200
Subject: [PATCH 04/13] Delete spurious file

---
 .../Math/test/CholeskyInvert_t_BASE_195321.cu | 214 ------------------
 1 file changed, 214 deletions(-)
 delete mode 100644 DataFormats/Math/test/CholeskyInvert_t_BASE_195321.cu

diff --git a/DataFormats/Math/test/CholeskyInvert_t_BASE_195321.cu b/DataFormats/Math/test/CholeskyInvert_t_BASE_195321.cu
deleted file mode 100644
index 6c1915126073f..0000000000000
--- a/DataFormats/Math/test/CholeskyInvert_t_BASE_195321.cu
+++ /dev/null
@@ -1,214 +0,0 @@
-// nvcc -O3 CholeskyDecomp_t.cu -Icuda-api-wrappers/src/ --expt-relaxed-constexpr -gencode arch=compute_61,code=sm_61 --compiler-options="-Ofast -march=native"
-// add -DDOPROF to run  nvprof --metrics all
-
-#include <algorithm>
-#include <cassert>
-#include <chrono>
-#include <iomanip>
-#include <iostream>
-#include <limits>
-#include <memory>
-#include <random>
-
-#include <Eigen/Core>
-#include <Eigen/Eigenvalues>
-
-#include <cuda/api_wrappers.h>
-
-#include "DataFormats/Math/interface/choleskyInversion.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h"
-
-constexpr int stride() { return 5 * 1024; }
-template <int DIM>
-using MXN = Eigen::Matrix<double, DIM, DIM>;
-template <int DIM>
-using MapMX = Eigen::Map<MXN<DIM>, 0, Eigen::Stride<DIM * stride(), stride()>>;
-
-template <int N>
-__global__ void invertSOA(double *__restrict__ p, unsigned int n) {
-  auto i = blockIdx.x * blockDim.x + threadIdx.x;
-  if (i >= n)
-    return;
-
-  MapMX<N> m(p + i);
-  choleskyInversion::invert(m, m);
-}
-
-template <typename M, int N>
-__global__ void invert(M *mm, unsigned int n) {
-  auto i = blockIdx.x * blockDim.x + threadIdx.x;
-  if (i >= n)
-    return;
-
-  auto &m = mm[i];
-  choleskyInversion::invert(m, m);
-}
-
-template <typename M, int N>
-__global__ void invertSeq(M *mm, unsigned int n) {
-  if (threadIdx.x != 0)
-    return;
-  auto first = blockIdx.x * blockDim.x;
-  auto last = std::min(first + blockDim.x, n);
-
-  for (auto i = first; i < last; ++i) {
-    auto &m = mm[i];
-    choleskyInversion::invert(m, m);
-  }
-}
-
-// generate matrices
-template <class M>
-void genMatrix(M &m) {
-  using T = typename std::remove_reference<decltype(m(0, 0))>::type;
-  int n = M::ColsAtCompileTime;
-  std::mt19937 eng;
-  // std::mt19937 eng2;
-  std::uniform_real_distribution<T> rgen(0., 1.);
-
-  // generate first diagonal elemets
-  for (int i = 0; i < n; ++i) {
-    double maxVal = i * 10000 / (n - 1) + 1;  // max condition is 10^4
-    m(i, i) = maxVal * rgen(eng);
-  }
-  for (int i = 0; i < n; ++i) {
-    for (int j = 0; j < i; ++j) {
-      double v = 0.3 * std::sqrt(m(i, i) * m(j, j));  // this makes the matrix pos defined
-      m(i, j) = v * rgen(eng);
-      m(j, i) = m(i, j);
-    }
-  }
-}
-
-template <int N>
-void go(bool soa) {
-  constexpr unsigned int DIM = N;
-  using MX = MXN<DIM>;
-  std::cout << "testing Matrix of dimension " << DIM << " size " << sizeof(MX) << std::endl;
-
-  auto start = std::chrono::high_resolution_clock::now();
-  auto delta = start - start;
-  auto delta1 = delta;
-  auto delta2 = delta;
-
-  if (cuda::device::count() == 0) {
-    std::cerr << "No CUDA devices on this system"
-              << "\n";
-    exit(EXIT_FAILURE);
-  }
-
-  auto current_device = cuda::device::current::get();
-
-  constexpr unsigned int SIZE = 4 * 1024;
-
-  MX mm[stride()];  // just storage in case of SOA
-  double *__restrict__ p = (double *)(mm);
-
-  if (soa) {
-    for (unsigned int i = 0; i < SIZE; ++i) {
-      MapMX<N> m(p + i);
-      genMatrix(m);
-    }
-  } else {
-    for (auto &m : mm)
-      genMatrix(m);
-  }
-
-  std::cout << mm[SIZE / 2](1, 1) << std::endl;
-
-  if (soa)
-    for (unsigned int i = 0; i < SIZE; ++i) {
-      MapMX<N> m(p + i);
-      choleskyInversion::invert(m, m);
-      choleskyInversion::invert(m, m);
-    }
-  else
-    for (auto &m : mm) {
-      choleskyInversion::invert(m, m);
-      choleskyInversion::invert(m, m);
-    }
-
-  std::cout << mm[SIZE / 2](1, 1) << std::endl;
-
-  auto m_d = cuda::memory::device::make_unique<double[]>(current_device, DIM * DIM * stride());
-  cuda::memory::copy(m_d.get(), (double const *)(mm), stride() * sizeof(MX));
-
-  constexpr int NKK =
-#ifdef DOPROF
-      2;
-#else
-      1000;
-#endif
-  for (int kk = 0; kk < NKK; ++kk) {
-    int threadsPerBlock = 128;
-    int blocksPerGrid = SIZE / threadsPerBlock;
-
-    delta -= (std::chrono::high_resolution_clock::now() - start);
-
-    if (soa)
-      cuda::launch(invertSOA<DIM>, {blocksPerGrid, threadsPerBlock}, m_d.get(), SIZE);
-    else
-      cuda::launch(invert<MX, DIM>, {blocksPerGrid, threadsPerBlock}, (MX *)(m_d.get()), SIZE);
-
-    cuda::memory::copy(&mm, m_d.get(), stride() * sizeof(MX));
-    delta += (std::chrono::high_resolution_clock::now() - start);
-
-    if (0 == kk)
-      std::cout << mm[SIZE / 2](1, 1) << std::endl;
-
-    if (!soa) {
-      delta1 -= (std::chrono::high_resolution_clock::now() - start);
-
-#ifndef DOPROF
-      cuda::launch(invertSeq<MX, DIM>, {blocksPerGrid, threadsPerBlock}, (MX *)(m_d.get()), SIZE);
-
-      cuda::memory::copy(&mm, m_d.get(), stride() * sizeof(MX));
-#endif
-      delta1 += (std::chrono::high_resolution_clock::now() - start);
-
-      if (0 == kk)
-        std::cout << mm[SIZE / 2](1, 1) << std::endl;
-    }
-
-    delta2 -= (std::chrono::high_resolution_clock::now() - start);
-    if (soa)
-#pragma GCC ivdep
-      for (unsigned int i = 0; i < SIZE; ++i) {
-        MapMX<N> m(p + i);
-        choleskyInversion::invert(m, m);
-      }
-    else
-#pragma GCC ivdep
-      for (auto &m : mm) {
-        choleskyInversion::invert(m, m);
-      }
-
-    delta2 += (std::chrono::high_resolution_clock::now() - start);
-  }
-
-  std::cout << mm[SIZE / 2](1, 1) << std::endl;
-
-  double DNNK = NKK;
-  std::cout << "cuda/cudaSeq/x86 computation took "
-            << std::chrono::duration_cast<std::chrono::milliseconds>(delta).count() / DNNK << ' '
-            << std::chrono::duration_cast<std::chrono::milliseconds>(delta1).count() / DNNK << ' '
-            << std::chrono::duration_cast<std::chrono::milliseconds>(delta2).count() / DNNK << ' ' << " ms"
-            << std::endl;
-}
-
-int main() {
-  exitSansCUDADevices();
-
-  go<2>(false);
-  go<4>(false);
-  go<5>(false);
-  go<6>(false);
-
-  go<2>(true);
-  go<4>(true);
-  go<5>(true);
-  go<6>(true);
-
-  // go<10>();
-  return 0;
-}

From 3b7e845832cd6bbff0e72d2519c39ee14367ec70 Mon Sep 17 00:00:00 2001
From: Andrea Bocci <andrea.bocci@cern.ch>
Date: Sat, 26 Oct 2019 23:44:42 +0200
Subject: [PATCH 05/13] Delete spurious file

---
 .../test/CholeskyInvert_t_LOCAL_195321.cu     | 215 ------------------
 1 file changed, 215 deletions(-)
 delete mode 100644 DataFormats/Math/test/CholeskyInvert_t_LOCAL_195321.cu

diff --git a/DataFormats/Math/test/CholeskyInvert_t_LOCAL_195321.cu b/DataFormats/Math/test/CholeskyInvert_t_LOCAL_195321.cu
deleted file mode 100644
index 3e2cf041bae16..0000000000000
--- a/DataFormats/Math/test/CholeskyInvert_t_LOCAL_195321.cu
+++ /dev/null
@@ -1,215 +0,0 @@
-// nvcc -O3 CholeskyDecomp_t.cu -Icuda-api-wrappers/src/ --expt-relaxed-constexpr -gencode arch=compute_61,code=sm_61 --compiler-options="-Ofast -march=native"
-// add -DDOPROF to run  nvprof --metrics all
-
-#include <algorithm>
-#include <cassert>
-#include <chrono>
-#include <iomanip>
-#include <iostream>
-#include <limits>
-#include <memory>
-#include <random>
-
-#include <Eigen/Core>
-#include <Eigen/Eigenvalues>
-
-#include <cuda/api_wrappers.h>
-
-#include "DataFormats/Math/interface/choleskyInversion.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/launch.h"
-
-constexpr int stride() { return 5 * 1024; }
-template <int DIM>
-using MXN = Eigen::Matrix<double, DIM, DIM>;
-template <int DIM>
-using MapMX = Eigen::Map<MXN<DIM>, 0, Eigen::Stride<DIM * stride(), stride()>>;
-
-template <int N>
-__global__ void invertSOA(double *__restrict__ p, unsigned int n) {
-  auto i = blockIdx.x * blockDim.x + threadIdx.x;
-  if (i >= n)
-    return;
-
-  MapMX<N> m(p + i);
-  choleskyInversion::invert(m, m);
-}
-
-template <typename M, int N>
-__global__ void invert(M *mm, unsigned int n) {
-  auto i = blockIdx.x * blockDim.x + threadIdx.x;
-  if (i >= n)
-    return;
-
-  auto &m = mm[i];
-  choleskyInversion::invert(m, m);
-}
-
-template <typename M, int N>
-__global__ void invertSeq(M *mm, unsigned int n) {
-  if (threadIdx.x != 0)
-    return;
-  auto first = blockIdx.x * blockDim.x;
-  auto last = std::min(first + blockDim.x, n);
-
-  for (auto i = first; i < last; ++i) {
-    auto &m = mm[i];
-    choleskyInversion::invert(m, m);
-  }
-}
-
-// generate matrices
-template <class M>
-void genMatrix(M &m) {
-  using T = typename std::remove_reference<decltype(m(0, 0))>::type;
-  int n = M::ColsAtCompileTime;
-  std::mt19937 eng;
-  // std::mt19937 eng2;
-  std::uniform_real_distribution<T> rgen(0., 1.);
-
-  // generate first diagonal elemets
-  for (int i = 0; i < n; ++i) {
-    double maxVal = i * 10000 / (n - 1) + 1;  // max condition is 10^4
-    m(i, i) = maxVal * rgen(eng);
-  }
-  for (int i = 0; i < n; ++i) {
-    for (int j = 0; j < i; ++j) {
-      double v = 0.3 * std::sqrt(m(i, i) * m(j, j));  // this makes the matrix pos defined
-      m(i, j) = v * rgen(eng);
-      m(j, i) = m(i, j);
-    }
-  }
-}
-
-template <int N>
-void go(bool soa) {
-  constexpr unsigned int DIM = N;
-  using MX = MXN<DIM>;
-  std::cout << "testing Matrix of dimension " << DIM << " size " << sizeof(MX) << std::endl;
-
-  auto start = std::chrono::high_resolution_clock::now();
-  auto delta = start - start;
-  auto delta1 = delta;
-  auto delta2 = delta;
-
-  if (cuda::device::count() == 0) {
-    std::cerr << "No CUDA devices on this system"
-              << "\n";
-    exit(EXIT_FAILURE);
-  }
-
-  auto current_device = cuda::device::current::get();
-
-  constexpr unsigned int SIZE = 4 * 1024;
-
-  MX mm[stride()];  // just storage in case of SOA
-  double *__restrict__ p = (double *)(mm);
-
-  if (soa) {
-    for (unsigned int i = 0; i < SIZE; ++i) {
-      MapMX<N> m(p + i);
-      genMatrix(m);
-    }
-  } else {
-    for (auto &m : mm)
-      genMatrix(m);
-  }
-
-  std::cout << mm[SIZE / 2](1, 1) << std::endl;
-
-  if (soa)
-    for (unsigned int i = 0; i < SIZE; ++i) {
-      MapMX<N> m(p + i);
-      choleskyInversion::invert(m, m);
-      choleskyInversion::invert(m, m);
-    }
-  else
-    for (auto &m : mm) {
-      choleskyInversion::invert(m, m);
-      choleskyInversion::invert(m, m);
-    }
-
-  std::cout << mm[SIZE / 2](1, 1) << std::endl;
-
-  auto m_d = cuda::memory::device::make_unique<double[]>(current_device, DIM * DIM * stride());
-  cuda::memory::copy(m_d.get(), (double const *)(mm), stride() * sizeof(MX));
-
-  constexpr int NKK =
-#ifdef DOPROF
-      2;
-#else
-      1000;
-#endif
-  for (int kk = 0; kk < NKK; ++kk) {
-    int threadsPerBlock = 128;
-    int blocksPerGrid = SIZE / threadsPerBlock;
-
-    delta -= (std::chrono::high_resolution_clock::now() - start);
-
-    if (soa)
-      cudautils::launch(invertSOA<DIM>, {blocksPerGrid, threadsPerBlock}, m_d.get(), SIZE);
-    else
-      cudautils::launch(invert<MX, DIM>, {blocksPerGrid, threadsPerBlock}, (MX *)(m_d.get()), SIZE);
-
-    cuda::memory::copy(&mm, m_d.get(), stride() * sizeof(MX));
-    delta += (std::chrono::high_resolution_clock::now() - start);
-
-    if (0 == kk)
-      std::cout << mm[SIZE / 2](1, 1) << std::endl;
-
-    if (!soa) {
-      delta1 -= (std::chrono::high_resolution_clock::now() - start);
-
-#ifndef DOPROF
-      cudautils::launch(invertSeq<MX, DIM>, {blocksPerGrid, threadsPerBlock}, (MX *)(m_d.get()), SIZE);
-
-      cuda::memory::copy(&mm, m_d.get(), stride() * sizeof(MX));
-#endif
-      delta1 += (std::chrono::high_resolution_clock::now() - start);
-
-      if (0 == kk)
-        std::cout << mm[SIZE / 2](1, 1) << std::endl;
-    }
-
-    delta2 -= (std::chrono::high_resolution_clock::now() - start);
-    if (soa)
-#pragma GCC ivdep
-      for (unsigned int i = 0; i < SIZE; ++i) {
-        MapMX<N> m(p + i);
-        choleskyInversion::invert(m, m);
-      }
-    else
-#pragma GCC ivdep
-      for (auto &m : mm) {
-        choleskyInversion::invert(m, m);
-      }
-
-    delta2 += (std::chrono::high_resolution_clock::now() - start);
-  }
-
-  std::cout << mm[SIZE / 2](1, 1) << std::endl;
-
-  double DNNK = NKK;
-  std::cout << "cuda/cudaSeq/x86 computation took "
-            << std::chrono::duration_cast<std::chrono::milliseconds>(delta).count() / DNNK << ' '
-            << std::chrono::duration_cast<std::chrono::milliseconds>(delta1).count() / DNNK << ' '
-            << std::chrono::duration_cast<std::chrono::milliseconds>(delta2).count() / DNNK << ' ' << " ms"
-            << std::endl;
-}
-
-int main() {
-  exitSansCUDADevices();
-
-  go<2>(false);
-  go<4>(false);
-  go<5>(false);
-  go<6>(false);
-
-  go<2>(true);
-  go<4>(true);
-  go<5>(true);
-  go<6>(true);
-
-  // go<10>();
-  return 0;
-}

From 61d12f5dabbb9d60680ed8b34f1021708a2993c8 Mon Sep 17 00:00:00 2001
From: Andrea Bocci <andrea.bocci@cern.ch>
Date: Sat, 26 Oct 2019 23:45:00 +0200
Subject: [PATCH 06/13] Delete spurious file

---
 .../test/CholeskyInvert_t_REMOTE_195321.cu    | 216 ------------------
 1 file changed, 216 deletions(-)
 delete mode 100644 DataFormats/Math/test/CholeskyInvert_t_REMOTE_195321.cu

diff --git a/DataFormats/Math/test/CholeskyInvert_t_REMOTE_195321.cu b/DataFormats/Math/test/CholeskyInvert_t_REMOTE_195321.cu
deleted file mode 100644
index 0751af78a7289..0000000000000
--- a/DataFormats/Math/test/CholeskyInvert_t_REMOTE_195321.cu
+++ /dev/null
@@ -1,216 +0,0 @@
-// nvcc -O3 CholeskyDecomp_t.cu -Icuda-api-wrappers/src/ --expt-relaxed-constexpr -gencode arch=compute_61,code=sm_61 --compiler-options="-Ofast -march=native"
-// add -DDOPROF to run  nvprof --metrics all
-
-#include <algorithm>
-#include <cassert>
-#include <chrono>
-#include <iomanip>
-#include <iostream>
-#include <limits>
-#include <memory>
-#include <random>
-
-#include <Eigen/Core>
-#include <Eigen/Eigenvalues>
-
-#include <cuda/api_wrappers.h>
-
-#include "DataFormats/Math/interface/choleskyInversion.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h"
-
-constexpr int stride() { return 5 * 1024; }
-template <int DIM>
-using MXN = Eigen::Matrix<double, DIM, DIM>;
-template <int DIM>
-using MapMX = Eigen::Map<MXN<DIM>, 0, Eigen::Stride<DIM * stride(), stride()>>;
-
-template <int N>
-__global__ void invertSOA(double *__restrict__ p, unsigned int n) {
-  auto i = blockIdx.x * blockDim.x + threadIdx.x;
-  if (i >= n)
-    return;
-
-  MapMX<N> m(p + i);
-  choleskyInversion::invert(m, m);
-}
-
-template <typename M, int N>
-__global__ void invert(M *mm, unsigned int n) {
-  auto i = blockIdx.x * blockDim.x + threadIdx.x;
-  if (i >= n)
-    return;
-
-  auto &m = mm[i];
-  choleskyInversion::invert(m, m);
-}
-
-template <typename M, int N>
-__global__ void invertSeq(M *mm, unsigned int n) {
-  if (threadIdx.x != 0)
-    return;
-  auto first = blockIdx.x * blockDim.x;
-  auto last = std::min(first + blockDim.x, n);
-
-  for (auto i = first; i < last; ++i) {
-    auto &m = mm[i];
-    choleskyInversion::invert(m, m);
-  }
-}
-
-// generate matrices
-template <class M>
-void genMatrix(M &m) {
-  using T = typename std::remove_reference<decltype(m(0, 0))>::type;
-  int n = M::ColsAtCompileTime;
-  std::mt19937 eng;
-  // std::mt19937 eng2;
-  std::uniform_real_distribution<T> rgen(0., 1.);
-
-  // generate first diagonal elemets
-  for (int i = 0; i < n; ++i) {
-    double maxVal = i * 10000 / (n - 1) + 1;  // max condition is 10^4
-    m(i, i) = maxVal * rgen(eng);
-  }
-  for (int i = 0; i < n; ++i) {
-    for (int j = 0; j < i; ++j) {
-      double v = 0.3 * std::sqrt(m(i, i) * m(j, j));  // this makes the matrix pos defined
-      m(i, j) = v * rgen(eng);
-      m(j, i) = m(i, j);
-    }
-  }
-}
-
-template <int N>
-void go(bool soa) {
-  constexpr unsigned int DIM = N;
-  using MX = MXN<DIM>;
-  std::cout << "testing Matrix of dimension " << DIM << " size " << sizeof(MX) << std::endl;
-
-  auto start = std::chrono::high_resolution_clock::now();
-  auto delta = start - start;
-  auto delta1 = delta;
-  auto delta2 = delta;
-
-  if (cuda::device::count() == 0) {
-    std::cerr << "No CUDA devices on this system"
-              << "\n";
-    exit(EXIT_FAILURE);
-  }
-
-  auto current_device = cuda::device::current::get();
-
-  constexpr unsigned int SIZE = 4 * 1024;
-
-  MX mm[stride()];  // just storage in case of SOA
-  double *__restrict__ p = (double *)(mm);
-
-  if (soa) {
-    for (unsigned int i = 0; i < SIZE; ++i) {
-      MapMX<N> m(p + i);
-      genMatrix(m);
-    }
-  } else {
-    for (auto &m : mm)
-      genMatrix(m);
-  }
-
-  std::cout << mm[SIZE / 2](1, 1) << std::endl;
-
-  if (soa)
-    for (unsigned int i = 0; i < SIZE; ++i) {
-      MapMX<N> m(p + i);
-      choleskyInversion::invert(m, m);
-      choleskyInversion::invert(m, m);
-    }
-  else
-    for (auto &m : mm) {
-      choleskyInversion::invert(m, m);
-      choleskyInversion::invert(m, m);
-    }
-
-  std::cout << mm[SIZE / 2](1, 1) << std::endl;
-
-  auto m_d = cuda::memory::device::make_unique<double[]>(current_device, DIM * DIM * stride());
-  cudaMemcpy(m_d.get(), (double const *)(mm), stride() * sizeof(MX), cudaMemcpyHostToDevice);
-
-  constexpr int NKK =
-#ifdef DOPROF
-      2;
-#else
-      1000;
-#endif
-  for (int kk = 0; kk < NKK; ++kk) {
-    int threadsPerBlock = 128;
-    int blocksPerGrid = SIZE / threadsPerBlock;
-
-    delta -= (std::chrono::high_resolution_clock::now() - start);
-
-    if (soa)
-      cuda::launch(invertSOA<DIM>, {blocksPerGrid, threadsPerBlock}, m_d.get(), SIZE);
-    else
-      cuda::launch(invert<MX, DIM>, {blocksPerGrid, threadsPerBlock}, (MX *)(m_d.get()), SIZE);
-    
-    cudaMemcpy(&mm, m_d.get(), stride() * sizeof(MX), cudaMemcpyDeviceToHost);
-
-    delta += (std::chrono::high_resolution_clock::now() - start);
-
-    if (0 == kk)
-      std::cout << mm[SIZE / 2](1, 1) << std::endl;
-
-    if (!soa) {
-      delta1 -= (std::chrono::high_resolution_clock::now() - start);
-
-#ifndef DOPROF
-      cuda::launch(invertSeq<MX, DIM>, {blocksPerGrid, threadsPerBlock}, (MX *)(m_d.get()), SIZE);
-      
-      cudaMemcpy(&mm, m_d.get(), stride() * sizeof(MX), cudaMemcpyDeviceToHost);
-
-#endif
-      delta1 += (std::chrono::high_resolution_clock::now() - start);
-
-      if (0 == kk)
-        std::cout << mm[SIZE / 2](1, 1) << std::endl;
-    }
-
-    delta2 -= (std::chrono::high_resolution_clock::now() - start);
-    if (soa)
-#pragma GCC ivdep
-      for (unsigned int i = 0; i < SIZE; ++i) {
-        MapMX<N> m(p + i);
-        choleskyInversion::invert(m, m);
-      }
-    else
-#pragma GCC ivdep
-      for (auto &m : mm) {
-        choleskyInversion::invert(m, m);
-      }
-
-    delta2 += (std::chrono::high_resolution_clock::now() - start);
-  }
-
-  std::cout << mm[SIZE / 2](1, 1) << std::endl;
-
-  double DNNK = NKK;
-  std::cout << "cuda/cudaSeq/x86 computation took "
-            << std::chrono::duration_cast<std::chrono::milliseconds>(delta).count() / DNNK << ' '
-            << std::chrono::duration_cast<std::chrono::milliseconds>(delta1).count() / DNNK << ' '
-            << std::chrono::duration_cast<std::chrono::milliseconds>(delta2).count() / DNNK << ' ' << " ms"
-            << std::endl;
-}
-
-int main() {
-  exitSansCUDADevices();
-
-  go<2>(false);
-  go<4>(false);
-  go<5>(false);
-  go<6>(false);
-
-  go<2>(true);
-  go<4>(true);
-  go<5>(true);
-  go<6>(true);
-
-  // go<10>();
-  return 0;
-}

From b17e7f9b482999f02f6f6b6d77ddca6c64dbce44 Mon Sep 17 00:00:00 2001
From: Andrea Bocci <andrea.bocci@cern.ch>
Date: Sat, 26 Oct 2019 23:49:03 +0200
Subject: [PATCH 07/13] Whitespaces

---
 .../test/TestHeterogeneousEDProducerGPUHelpers.cu     | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/HeterogeneousCore/Producer/test/TestHeterogeneousEDProducerGPUHelpers.cu b/HeterogeneousCore/Producer/test/TestHeterogeneousEDProducerGPUHelpers.cu
index b28ee0cbc9949..017092452c1e1 100644
--- a/HeterogeneousCore/Producer/test/TestHeterogeneousEDProducerGPUHelpers.cu
+++ b/HeterogeneousCore/Producer/test/TestHeterogeneousEDProducerGPUHelpers.cu
@@ -6,6 +6,7 @@
 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/launch.h"
 #include "TestHeterogeneousEDProducerGPUHelpers.h"
+
 //
 // Vector Addition Kernel
 //
@@ -58,8 +59,8 @@ namespace {
 
 int TestAcceleratorServiceProducerGPUHelpers_simple_kernel(int input) {
   // Example from Viktor/cuda-api-wrappers
-  constexpr int  NUM_VALUES = 10000;
-   
+  constexpr int NUM_VALUES = 10000;
+
   auto current_device = cuda::device::current::get();
   auto stream = current_device.create_stream(cuda::stream::implicitly_synchronizes_with_default_stream);
 
@@ -89,7 +90,7 @@ int TestAcceleratorServiceProducerGPUHelpers_simple_kernel(int input) {
   cudautils::launch(vectorAdd, {blocksPerGrid, threadsPerBlock},
                d_a.get(), d_b.get(), d_c.get(), NUM_VALUES);
   */
-  
+
   cudaMemcpyAsync(h_c.get(), d_c.get(), NUM_VALUES * sizeof(int), cudaMemcpyDeviceToHost, stream.id());
 
   stream.synchronize();
@@ -110,7 +111,6 @@ TestHeterogeneousEDProducerGPUTask::TestHeterogeneousEDProducerGPUTask() {
   h_a = cuda::memory::host::make_unique<float[]>(NUM_VALUES);
   h_b = cuda::memory::host::make_unique<float[]>(NUM_VALUES);
 
-  
   auto current_device = cuda::device::current::get();
   d_b = cuda::memory::device::make_unique<float[]>(current_device, NUM_VALUES);
 
@@ -125,7 +125,6 @@ TestHeterogeneousEDProducerGPUTask::ResultType TestHeterogeneousEDProducerGPUTas
   if (inputArrays.first != nullptr) {
     auto h_check = std::make_unique<float[]>(NUM_VALUES);
     cudaMemcpy(h_check.get(), inputArrays.first, NUM_VALUES * sizeof(float), cudaMemcpyDeviceToHost);
-	
     for (int i = 0; i < NUM_VALUES; ++i) {
       if (h_check[i] != i) {
         throw cms::Exception("Assert") << "Sanity check on element " << i << " failed, expected " << i << " got "
@@ -140,11 +139,9 @@ TestHeterogeneousEDProducerGPUTask::ResultType TestHeterogeneousEDProducerGPUTas
   }
 
   auto current_device = cuda::device::current::get();
-
   auto d_a = cuda::memory::device::make_unique<float[]>(current_device, NUM_VALUES);
   auto d_c = cuda::memory::device::make_unique<float[]>(current_device, NUM_VALUES);
   if (inputArrays.second != nullptr) {
-
     d_d = cuda::memory::device::make_unique<float[]>(current_device, NUM_VALUES);
   }
 

From 01bb99567d64a6ff1897e276709df7ecbb2f40df Mon Sep 17 00:00:00 2001
From: Wahid Redjeb <wahid.redjeb.wr@gmail.com>
Date: Mon, 28 Oct 2019 12:00:13 +0100
Subject: [PATCH 08/13] Wrap cudaMem calls in call to cudaCheck

---
 CUDADataFormats/BeamSpot/src/BeamSpotCUDA.cc  |  3 +-
 .../src/TrackingRecHit2DCUDA.cc               |  2 +-
 .../test/gpuFrameTransformTest.cpp            | 17 ++++---
 DataFormats/Math/test/CholeskyInvert_t.cu     |  9 ++--
 DataFormats/Math/test/cudaAtan2Test.cu        |  5 +-
 DataFormats/Math/test/cudaMathTest.cu         |  7 +--
 .../CUDACore/test/testStreamEvent.cu          |  7 +--
 .../CUDACore/test/test_CUDAScopedContext.cc   |  8 +--
 .../CUDATest/plugins/TestCUDAProducerGPUEW.cc |  3 +-
 .../plugins/TestCUDAProducerGPUEWTask.cc      |  5 +-
 .../plugins/TestCUDAProducerGPUKernel.cu      |  6 +--
 .../plugins/TestCUDAProducerGPUtoCPU.cc       |  3 +-
 .../test/test_TestCUDAProducerGPUFirst.cc     |  2 +-
 .../CUDAUtilities/interface/HistoContainer.h  |  2 +-
 .../CUDAUtilities/interface/copyAsync.h       |  9 ++--
 .../CUDAUtilities/interface/memsetAsync.h     |  7 +--
 .../CUDAUtilities/test/AtomicPairCounter_t.cu | 11 ++--
 .../CUDAUtilities/test/HistoContainer_t.cu    |  9 ++--
 .../CUDAUtilities/test/OneHistoContainer_t.cu |  3 +-
 .../CUDAUtilities/test/OneToManyAssoc_t.h     | 15 +++---
 .../CUDAUtilities/test/copyAsync_t.cpp        | 15 +++---
 .../CUDAUtilities/test/prefixScan_t.cu        | 13 ++---
 .../CUDAUtilities/test/radixSort_t.cu         |  7 +--
 .../TestHeterogeneousEDProducerGPUHelpers.cu  | 14 +++---
 .../src/SiPixelFedCablingMapGPUWrapper.cc     |  4 +-
 .../SiPixelClusterizer/test/gpuClustering_t.h | 32 ++++++------
 .../PixelTrackFitting/test/testEigenGPU.cu    |  8 +--
 .../test/testEigenGPUNoFit.cu                 | 50 +++++++++----------
 .../PixelVertexFinding/test/VertexFinder_t.h  | 37 +++++++-------
 29 files changed, 163 insertions(+), 150 deletions(-)

diff --git a/CUDADataFormats/BeamSpot/src/BeamSpotCUDA.cc b/CUDADataFormats/BeamSpot/src/BeamSpotCUDA.cc
index da9cd4d776c92..a297ae11dc327 100644
--- a/CUDADataFormats/BeamSpot/src/BeamSpotCUDA.cc
+++ b/CUDADataFormats/BeamSpot/src/BeamSpotCUDA.cc
@@ -1,8 +1,9 @@
 #include "CUDADataFormats/BeamSpot/interface/BeamSpotCUDA.h"
 
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
 
 BeamSpotCUDA::BeamSpotCUDA(Data const* data_h, cudaStream_t stream) {
   data_d_ = cudautils::make_device_unique<Data>(stream);
-  cudaMemcpyAsync(data_d_.get(), data_h, sizeof(Data), cudaMemcpyHostToDevice, stream);
+  cudaCheck(cudaMemcpyAsync(data_d_.get(), data_h, sizeof(Data), cudaMemcpyHostToDevice, stream));
 }
diff --git a/CUDADataFormats/TrackingRecHit/src/TrackingRecHit2DCUDA.cc b/CUDADataFormats/TrackingRecHit/src/TrackingRecHit2DCUDA.cc
index c4cb13e3a0bd8..e6f223bfec4e3 100644
--- a/CUDADataFormats/TrackingRecHit/src/TrackingRecHit2DCUDA.cc
+++ b/CUDADataFormats/TrackingRecHit/src/TrackingRecHit2DCUDA.cc
@@ -14,6 +14,6 @@ cudautils::host::unique_ptr<float[]> TrackingRecHit2DCUDA::localCoordToHostAsync
 template <>
 cudautils::host::unique_ptr<uint32_t[]> TrackingRecHit2DCUDA::hitsModuleStartToHostAsync(cudaStream_t stream) const {
   auto ret = cudautils::make_host_unique<uint32_t[]>(2001, stream);
-  cudaMemcpyAsync(ret.get(), m_hitsModuleStart, 4 * 2001, cudaMemcpyDefault, stream);
+  cudaCheck(cudaMemcpyAsync(ret.get(), m_hitsModuleStart, 4 * 2001, cudaMemcpyDefault, stream));
   return ret;
 }
diff --git a/DataFormats/GeometrySurface/test/gpuFrameTransformTest.cpp b/DataFormats/GeometrySurface/test/gpuFrameTransformTest.cpp
index 75b5418eeaffb..cae4e48f07b6b 100644
--- a/DataFormats/GeometrySurface/test/gpuFrameTransformTest.cpp
+++ b/DataFormats/GeometrySurface/test/gpuFrameTransformTest.cpp
@@ -10,6 +10,7 @@
 
 #include <cuda/api_wrappers.h>
 
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 #include "DataFormats/GeometrySurface/interface/GloballyPositioned.h"
 #include "DataFormats/GeometrySurface/interface/SOARotation.h"
 #include "DataFormats/GeometrySurface/interface/TkRotation.h"
@@ -73,7 +74,7 @@ int main(void) {
 
   // auto d_sf = cuda::memory::device::make_unique<SFrame[]>(current_device, 1);
   auto d_sf = cuda::memory::device::make_unique<char[]>(current_device, sizeof(SFrame));
-  cudaMemcpy(d_sf.get(), &sf1, sizeof(SFrame), cudaMemcpyHostToDevice);
+  cudaCheck(cudaMemcpy(d_sf.get(), &sf1, sizeof(SFrame), cudaMemcpyHostToDevice));
 
 
   for (auto i = 0U; i < size; ++i) {
@@ -85,9 +86,9 @@ int main(void) {
   std::random_shuffle(xl, xl + size);
   std::random_shuffle(yl, yl + size);
 
-  cudaMemcpy(d_xl.get(), xl, size32, cudaMemcpyHostToDevice);
-  cudaMemcpy(d_yl.get(), yl, size32, cudaMemcpyHostToDevice);
-  cudaMemcpy(d_le.get(), le, 3 * size32, cudaMemcpyHostToDevice);
+  cudaCheck(cudaMemcpy(d_xl.get(), xl, size32, cudaMemcpyHostToDevice));
+  cudaCheck(cudaMemcpy(d_yl.get(), yl, size32, cudaMemcpyHostToDevice));
+  cudaCheck(cudaMemcpy(d_le.get(), le, 3 * size32, cudaMemcpyHostToDevice));
 
   toGlobalWrapper((SFrame const *)(d_sf.get()),
                   d_xl.get(),
@@ -98,10 +99,10 @@ int main(void) {
                   d_le.get(),
                   d_ge.get(),
                   size);
-  cudaMemcpy(x, d_x.get(), size32, cudaMemcpyDeviceToHost);
-  cudaMemcpy(y, d_y.get(), size32, cudaMemcpyDeviceToHost);
-  cudaMemcpy(z, d_z.get(), size32, cudaMemcpyDeviceToHost);
-  cudaMemcpy(ge, d_ge.get(), 6 * size32, cudaMemcpyDeviceToHost);
+  cudaCheck(cudaMemcpy(x, d_x.get(), size32, cudaMemcpyDeviceToHost));
+  cudaCheck(cudaMemcpy(y, d_y.get(), size32, cudaMemcpyDeviceToHost));
+  cudaCheck(cudaMemcpy(z, d_z.get(), size32, cudaMemcpyDeviceToHost));
+  cudaCheck(cudaMemcpy(ge, d_ge.get(), 6 * size32, cudaMemcpyDeviceToHost));
 
   float eps = 0.;
   for (auto i = 0U; i < size; ++i) {
diff --git a/DataFormats/Math/test/CholeskyInvert_t.cu b/DataFormats/Math/test/CholeskyInvert_t.cu
index 65e8adca7e5ee..4adbd885eab7b 100644
--- a/DataFormats/Math/test/CholeskyInvert_t.cu
+++ b/DataFormats/Math/test/CholeskyInvert_t.cu
@@ -16,6 +16,7 @@
 #include <cuda/api_wrappers.h>
 
 #include "DataFormats/Math/interface/choleskyInversion.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/launch.h"
 
@@ -132,7 +133,7 @@ void go(bool soa) {
   std::cout << mm[SIZE / 2](1, 1) << std::endl;
 
   auto m_d = cuda::memory::device::make_unique<double[]>(current_device, DIM * DIM * stride());
-  cudaMemcpy(m_d.get(), (double const *)(mm), stride() * sizeof(MX), cudaMemcpyHostToDevice);
+  cudaCheck(cudaMemcpy(m_d.get(), (double const *)(mm), stride() * sizeof(MX), cudaMemcpyHostToDevice));
 
   constexpr int NKK =
 #ifdef DOPROF
@@ -151,7 +152,7 @@ void go(bool soa) {
     else
       cudautils::launch(invert<MX, DIM>, {blocksPerGrid, threadsPerBlock}, (MX *)(m_d.get()), SIZE);
     
-    cudaMemcpy(&mm, m_d.get(), stride() * sizeof(MX), cudaMemcpyDeviceToHost);
+    cudaCheck(cudaMemcpy(&mm, m_d.get(), stride() * sizeof(MX), cudaMemcpyDeviceToHost));
 
     delta += (std::chrono::high_resolution_clock::now() - start);
 
@@ -163,9 +164,7 @@ void go(bool soa) {
 
 #ifndef DOPROF
       cudautils::launch(invertSeq<MX, DIM>, {blocksPerGrid, threadsPerBlock}, (MX *)(m_d.get()), SIZE);
-      cuda::launch(invertSeq<MX, DIM>, {blocksPerGrid, threadsPerBlock}, (MX *)(m_d.get()), SIZE);
-      cudaMemcpy(&mm, m_d.get(), stride() * sizeof(MX), cudaMemcpyDeviceToHost);
-
+      cudaCheck(cudaMemcpy(&mm, m_d.get(), stride() * sizeof(MX), cudaMemcpyDeviceToHost));
 #endif
       delta1 += (std::chrono::high_resolution_clock::now() - start);
 
diff --git a/DataFormats/Math/test/cudaAtan2Test.cu b/DataFormats/Math/test/cudaAtan2Test.cu
index eec6020b44179..298d8b784f322 100644
--- a/DataFormats/Math/test/cudaAtan2Test.cu
+++ b/DataFormats/Math/test/cudaAtan2Test.cu
@@ -29,6 +29,7 @@ end
 #include "cuda/api_wrappers.h"
 
 #include "DataFormats/Math/interface/approx_atan2.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/launch.h"
 
@@ -72,7 +73,7 @@ void go() {
   auto diff_d = cuda::memory::device::make_unique<int[]>(current_device, 3);
 
   int diffs[3];
-  cudaMemset(diff_d.get(), 0, 3 * 4);
+  cudaCheck(cudaMemset(diff_d.get(), 0, 3 * 4));
 
   // Launch the diff CUDA Kernel
   dim3 threadsPerBlock(32, 32, 1);
@@ -83,7 +84,7 @@ void go() {
 
   cudautils::launch(diffAtan<DEGREE>, {blocksPerGrid, threadsPerBlock}, diff_d.get());
 
-  cudaMemcpy(diffs, diff_d.get(), 3 * 4, cudaMemcpyDeviceToHost);
+  cudaCheck(cudaMemcpy(diffs, diff_d.get(), 3 * 4, cudaMemcpyDeviceToHost));
   delta += (std::chrono::high_resolution_clock::now() - start);
 
   float mdiff = diffs[0] * 1.e-7;
diff --git a/DataFormats/Math/test/cudaMathTest.cu b/DataFormats/Math/test/cudaMathTest.cu
index e443544e2e3c0..efb99d12ced9c 100644
--- a/DataFormats/Math/test/cudaMathTest.cu
+++ b/DataFormats/Math/test/cudaMathTest.cu
@@ -39,6 +39,7 @@ end
 #include "DataFormats/Math/interface/approx_log.h"
 #include "DataFormats/Math/interface/approx_exp.h"
 #include "DataFormats/Math/interface/approx_atan2.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/launch.h"
 
@@ -106,8 +107,8 @@ void go() {
   auto d_B = cuda::memory::device::make_unique<float[]>(current_device, numElements);
   auto d_C = cuda::memory::device::make_unique<float[]>(current_device, numElements);
   
-  cudaMemcpy(d_A.get(), h_A.get(), size, cudaMemcpyHostToDevice);
-  cudaMemcpy(d_B.get(), h_B.get(), size, cudaMemcpyHostToDevice);
+  cudaCheck(cudaMemcpy(d_A.get(), h_A.get(), size, cudaMemcpyHostToDevice));
+  cudaCheck(cudaMemcpy(d_B.get(), h_B.get(), size, cudaMemcpyHostToDevice));
   delta += (std::chrono::high_resolution_clock::now() - start);
   std::cout << "cuda alloc+copy took " << std::chrono::duration_cast<std::chrono::milliseconds>(delta).count() << " ms"
             << std::endl;
@@ -130,7 +131,7 @@ void go() {
             << std::endl;
 
   delta -= (std::chrono::high_resolution_clock::now() - start);
-  cudaMemcpy(h_C.get(), d_C.get(), size, cudaMemcpyDeviceToHost);
+  cudaCheck(cudaMemcpy(h_C.get(), d_C.get(), size, cudaMemcpyDeviceToHost));
   delta += (std::chrono::high_resolution_clock::now() - start);
   std::cout << "cuda copy back took " << std::chrono::duration_cast<std::chrono::milliseconds>(delta).count() << " ms"
             << std::endl;
diff --git a/HeterogeneousCore/CUDACore/test/testStreamEvent.cu b/HeterogeneousCore/CUDACore/test/testStreamEvent.cu
index 1c889f8f75a04..bd9ce4f29fba3 100644
--- a/HeterogeneousCore/CUDACore/test/testStreamEvent.cu
+++ b/HeterogeneousCore/CUDACore/test/testStreamEvent.cu
@@ -12,6 +12,7 @@
 
 #include <cuda_runtime.h>
 
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h"
 
 namespace {
@@ -39,8 +40,8 @@ int main() {
   cudaStream_t stream1, stream2;
   cudaEvent_t event1, event2;
 
-  cudaMalloc(&dev_points1, ARRAY_SIZE * sizeof(float));
-  cudaMallocHost(&host_points1, ARRAY_SIZE * sizeof(float));
+  cudaCheck(cudaMalloc(&dev_points1, ARRAY_SIZE * sizeof(float)));
+  cudaCheck(cudaMallocHost(&host_points1, ARRAY_SIZE * sizeof(float)));
   cudaStreamCreateWithFlags(&stream1, cudaStreamNonBlocking);
   cudaStreamCreateWithFlags(&stream2, cudaStreamNonBlocking);
   cudaEventCreate(&event1);
@@ -50,7 +51,7 @@ int main() {
     host_points1[j] = static_cast<float>(j);
   }
 
-  cudaMemcpyAsync(dev_points1, host_points1, ARRAY_SIZE * sizeof(float), cudaMemcpyHostToDevice, stream1);
+  cudaCheck(cudaMemcpyAsync(dev_points1, host_points1, ARRAY_SIZE * sizeof(float), cudaMemcpyHostToDevice, stream1));
   kernel_looping<<<1, 16, 0, stream1>>>(dev_points1, ARRAY_SIZE);
   if (debug)
     std::cout << "Kernel launched on stream1" << std::endl;
diff --git a/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc b/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc
index bab3d0a347509..04f2789f83f43 100644
--- a/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc
+++ b/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc
@@ -31,7 +31,7 @@ namespace cudatest {
 namespace {
   std::unique_ptr<CUDAProduct<int*>> produce(int device, int* d, int* h) {
     auto ctx = cudatest::TestCUDAScopedContext::make(device, true);
-    cudaMemcpyAsync(d, h, sizeof(int), cudaMemcpyHostToDevice, ctx.stream());
+    cudaCheck(cudaMemcpyAsync(d, h, sizeof(int), cudaMemcpyHostToDevice, ctx.stream()));
     testCUDAScopedContextKernels_single(d, ctx.stream());
     return ctx.wrap(d);
   }
@@ -116,9 +116,9 @@ TEST_CASE("Use of CUDAScopedContext", "[CUDACore]") {
       h_a2 = 0;
       int h_a3 = 0;
       
-      cudaMemcpyAsync(&h_a1, d_a1.get(), sizeof(int), cudaMemcpyDeviceToHost, ctx.stream());
-      cudaMemcpyAsync(&h_a2, d_a2.get(), sizeof(int), cudaMemcpyDeviceToHost, ctx.stream());
-      cudaMemcpyAsync(&h_a3, d_a3.get(), sizeof(int), cudaMemcpyDeviceToHost, ctx.stream());
+      cudaCheck(cudaMemcpyAsync(&h_a1, d_a1.get(), sizeof(int), cudaMemcpyDeviceToHost, ctx.stream()));
+      cudaCheck(cudaMemcpyAsync(&h_a2, d_a2.get(), sizeof(int), cudaMemcpyDeviceToHost, ctx.stream()));
+      cudaCheck(cudaMemcpyAsync(&h_a3, d_a3.get(), sizeof(int), cudaMemcpyDeviceToHost, ctx.stream()));
 
       REQUIRE(h_a1 == 2);
       REQUIRE(h_a2 == 4);
diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEW.cc b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEW.cc
index 95af5ba5c0685..046150f2b6930 100644
--- a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEW.cc
+++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEW.cc
@@ -7,6 +7,7 @@
 #include "FWCore/ServiceRegistry/interface/Service.h"
 
 #include "CUDADataFormats/Common/interface/CUDAProduct.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 #include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h"
 #include "HeterogeneousCore/CUDACore/interface/CUDAContextState.h"
 #include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
@@ -67,7 +68,7 @@ void TestCUDAProducerGPUEW::acquire(const edm::Event& iEvent,
   // Mimick the need to transfer some of the GPU data back to CPU to
   // be used for something within this module, or to be put in the
   // event.
-  cudaMemcpyAsync(hostData_.get(), devicePtr_.get() + 10, sizeof(float), cudaMemcpyDeviceToHost, ctx.stream());
+  cudaCheck(cudaMemcpyAsync(hostData_.get(), devicePtr_.get() + 10, sizeof(float), cudaMemcpyDeviceToHost, ctx.stream()));
   edm::LogVerbatim("TestCUDAProducerGPUEW") << label_ << " TestCUDAProducerGPUEW::acquire end event "
                                             << iEvent.id().event() << " stream " << iEvent.streamID();
 }
diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEWTask.cc b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEWTask.cc
index e38322b7f5713..db2d6593d7a95 100644
--- a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEWTask.cc
+++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEWTask.cc
@@ -10,6 +10,7 @@
 #include "FWCore/ServiceRegistry/interface/Service.h"
 
 #include "CUDADataFormats/Common/interface/CUDAProduct.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 #include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h"
 #include "HeterogeneousCore/CUDACore/interface/CUDAContextState.h"
 #include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
@@ -75,7 +76,7 @@ void TestCUDAProducerGPUEWTask::acquire(const edm::Event& iEvent,
   // Mimick the need to transfer some of the GPU data back to CPU to
   // be used for something within this module, or to be put in the
   // event.
- cudaMemcpyAsync(hostData_.get(), devicePtr_.get() + 10, sizeof(float), cudaMemcpyDeviceToHost, ctx.stream());
+ cudaCheck(cudaMemcpyAsync(hostData_.get(), devicePtr_.get() + 10, sizeof(float), cudaMemcpyDeviceToHost, ctx.stream()));
   // Push a task to run addSimpleWork() after the asynchronous work
   // (and acquire()) has finished instead of produce()
   ctx.pushNextTask([iev = iEvent.id().event(), istr = iEvent.streamID(), this](CUDAScopedContextTask ctx) {
@@ -93,7 +94,7 @@ void TestCUDAProducerGPUEWTask::addSimpleWork(edm::EventNumber_t eventID,
     edm::LogVerbatim("TestCUDAProducerGPUEWTask")
         << label_ << " TestCUDAProducerGPUEWTask::addSimpleWork begin event " << eventID << " stream " << streamID
         << " 10th element " << *hostData_ << " not satisfied, queueing more work";
- cudaMemcpyAsync(hostData_.get(), devicePtr_.get() + 10, sizeof(float), cudaMemcpyDeviceToHost, ctx.stream());   
+ cudaCheck(cudaMemcpyAsync(hostData_.get(), devicePtr_.get() + 10, sizeof(float), cudaMemcpyDeviceToHost, ctx.stream()));   
 
     ctx.pushNextTask([eventID, streamID, this](CUDAScopedContextTask ctx) { addSimpleWork(eventID, streamID, ctx); });
     gpuAlgo_.runSimpleAlgo(devicePtr_.get(), ctx.stream());
diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUKernel.cu b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUKernel.cu
index 0015cdcdc7bb6..1facd8ef44b1e 100644
--- a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUKernel.cu
+++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUKernel.cu
@@ -69,7 +69,7 @@ cudautils::device::unique_ptr<float[]> TestCUDAProducerGPUKernel::runAlgo(const
   // First make the sanity check
   if (d_input != nullptr) {
     auto h_check = std::make_unique<float[]>(NUM_VALUES);
-    cudaMemcpy(h_check.get(), d_input, NUM_VALUES* sizeof(float), cudaMemcpyDeviceToHost);
+    cudaCheck(cudaMemcpy(h_check.get(), d_input, NUM_VALUES* sizeof(float), cudaMemcpyDeviceToHost));
     for (int i = 0; i < NUM_VALUES; ++i) {
       if (h_check[i] != i) {
         throw cms::Exception("Assert") << "Sanity check on element " << i << " failed, expected " << i << " got "
@@ -89,8 +89,8 @@ cudautils::device::unique_ptr<float[]> TestCUDAProducerGPUKernel::runAlgo(const
   auto d_a = cudautils::make_device_unique<float[]>(NUM_VALUES, stream);
   auto d_b = cudautils::make_device_unique<float[]>(NUM_VALUES, stream);
   
-  cudaMemcpyAsync(d_a.get(), h_a.get(), NUM_VALUES * sizeof(float), cudaMemcpyHostToDevice, stream);
-  cudaMemcpyAsync(d_b.get(), h_b.get(), NUM_VALUES * sizeof(float), cudaMemcpyHostToDevice, stream);
+  cudaCheck(cudaMemcpyAsync(d_a.get(), h_a.get(), NUM_VALUES * sizeof(float), cudaMemcpyHostToDevice, stream));
+  cudaCheck(cudaMemcpyAsync(d_b.get(), h_b.get(), NUM_VALUES * sizeof(float), cudaMemcpyHostToDevice, stream));
  
   int threadsPerBlock{32};
   int blocksPerGrid = (NUM_VALUES + threadsPerBlock - 1) / threadsPerBlock;
diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUtoCPU.cc b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUtoCPU.cc
index 73f2123521dc2..6a0de45e12d95 100644
--- a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUtoCPU.cc
+++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUtoCPU.cc
@@ -6,6 +6,7 @@
 #include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
 
 #include "CUDADataFormats/Common/interface/CUDAProduct.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 #include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h"
 #include "HeterogeneousCore/CUDATest/interface/CUDAThing.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
@@ -58,7 +59,7 @@ void TestCUDAProducerGPUtoCPU::acquire(const edm::Event& iEvent,
 
   buffer_ = cudautils::make_host_unique<float[]>(TestCUDAProducerGPUKernel::NUM_VALUES, ctx.stream());
   // Enqueue async copy, continue in produce once finished
-  cudaMemcpyAsync(buffer_.get(), device.get(), TestCUDAProducerGPUKernel::NUM_VALUES * sizeof(float), cudaMemcpyDeviceToHost, ctx.stream());
+  cudaCheck(cudaMemcpyAsync(buffer_.get(), device.get(), TestCUDAProducerGPUKernel::NUM_VALUES * sizeof(float), cudaMemcpyDeviceToHost, ctx.stream()));
 
   edm::LogVerbatim("TestCUDAProducerGPUtoCPU") << label_ << " TestCUDAProducerGPUtoCPU::acquire end event "
                                                << iEvent.id().event() << " stream " << iEvent.streamID();
diff --git a/HeterogeneousCore/CUDATest/test/test_TestCUDAProducerGPUFirst.cc b/HeterogeneousCore/CUDATest/test/test_TestCUDAProducerGPUFirst.cc
index 8c3ae29a8b343..b3f3c3bad8a9b 100644
--- a/HeterogeneousCore/CUDATest/test/test_TestCUDAProducerGPUFirst.cc
+++ b/HeterogeneousCore/CUDATest/test/test_TestCUDAProducerGPUFirst.cc
@@ -74,7 +74,7 @@ process.moduleToTest(process.toTest)
     REQUIRE(data != nullptr);
 
     float firstElements[10];
-    cudaMemcpyAsync(firstElements, data, sizeof(float) * 10, cudaMemcpyDeviceToHost, prod->stream()); 
+    cudaCheck(cudaMemcpyAsync(firstElements, data, sizeof(float) * 10, cudaMemcpyDeviceToHost, prod->stream())); 
     
     std::cout << "Synchronizing with CUDA stream" << std::endl;
     auto stream = prod->stream();
diff --git a/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h b/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h
index 4501f98e39376..ca7053740f6f7 100644
--- a/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h
+++ b/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h
@@ -62,7 +62,7 @@ namespace cudautils {
   ) {
     uint32_t *off = (uint32_t *)((char *)(h) + offsetof(Histo, off));
 #ifdef __CUDACC__
-    cudaMemsetAsync(off, 0, 4 * Histo::totbins(), stream);
+    cudaCheck(cudaMemsetAsync(off, 0, 4 * Histo::totbins(), stream));
 #else
     ::memset(off, 0, 4 * Histo::totbins());
 #endif
diff --git a/HeterogeneousCore/CUDAUtilities/interface/copyAsync.h b/HeterogeneousCore/CUDAUtilities/interface/copyAsync.h
index 1f827c4d6bb0e..9d93ca6260334 100644
--- a/HeterogeneousCore/CUDAUtilities/interface/copyAsync.h
+++ b/HeterogeneousCore/CUDAUtilities/interface/copyAsync.h
@@ -1,6 +1,7 @@
 #ifndef HeterogeneousCore_CUDAUtilities_copyAsync_h
 #define HeterogeneousCore_CUDAUtilities_copyAsync_h
 
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
 
@@ -17,7 +18,7 @@ namespace cudautils {
     // Shouldn't compile for array types because of sizeof(T), but
     // let's add an assert with a more helpful message
     static_assert(std::is_array<T>::value == false, "For array types, use the other overload with the size parameter");
-    cudaMemcpyAsync(dst.get(), src.get(), sizeof(T), cudaMemcpyHostToDevice, stream);
+    cudaCheck(cudaMemcpyAsync(dst.get(), src.get(), sizeof(T), cudaMemcpyHostToDevice, stream));
   }
 
   template <typename T>
@@ -25,7 +26,7 @@ namespace cudautils {
                         const cudautils::device::unique_ptr<T>& src,
                         cudaStream_t stream) {
     static_assert(std::is_array<T>::value == false, "For array types, use the other overload with the size parameter");
-    cudaMemcpyAsync(dst.get(), src.get(), sizeof(T), cudaMemcpyHostToDevice, stream);
+    cudaCheck(cudaMemcpyAsync(dst.get(), src.get(), sizeof(T), cudaMemcpyHostToDevice, stream));
   }
 
   // Multiple elements
@@ -34,7 +35,7 @@ namespace cudautils {
                         const cudautils::host::unique_ptr<T[]>& src,
                         size_t nelements,
                         cudaStream_t stream) {
-  cudaMemcpyAsync(dst.get(), src.get(), nelements *  sizeof(T), cudaMemcpyHostToDevice, stream); 
+  cudaCheck(cudaMemcpyAsync(dst.get(), src.get(), nelements *  sizeof(T), cudaMemcpyHostToDevice, stream)); 
   }
 
   template <typename T>
@@ -42,7 +43,7 @@ namespace cudautils {
                         const cudautils::device::unique_ptr<T[]>& src,
                         size_t nelements,
                         cudaStream_t stream) {
-  cudaMemcpyAsync(dst.get(), src.get(), nelements *  sizeof(T), cudaMemcpyHostToDevice, stream);
+  cudaCheck(cudaMemcpyAsync(dst.get(), src.get(), nelements *  sizeof(T), cudaMemcpyHostToDevice, stream));
   }
 }  // namespace cudautils
 
diff --git a/HeterogeneousCore/CUDAUtilities/interface/memsetAsync.h b/HeterogeneousCore/CUDAUtilities/interface/memsetAsync.h
index b0167dcb9ed25..e20a606042793 100644
--- a/HeterogeneousCore/CUDAUtilities/interface/memsetAsync.h
+++ b/HeterogeneousCore/CUDAUtilities/interface/memsetAsync.h
@@ -1,6 +1,7 @@
 #ifndef HeterogeneousCore_CUDAUtilities_memsetAsync_h
 #define HeterogeneousCore_CUDAUtilities_memsetAsync_h
 
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
 
 #include <cuda/api_wrappers.h>
@@ -12,8 +13,8 @@ namespace cudautils {
   inline void memsetAsync(cudautils::device::unique_ptr<T>& ptr, T value, cudaStream_t stream) {
     // Shouldn't compile for array types because of sizeof(T), but
     // let's add an assert with a more helpful message
-    static_assert(std::is_array<T>::value == false, "For array types, use the other overload with the size parameter");
-    cuda::memory::device::async::set(ptr.get(), value, sizeof(T), stream);
+    static_assert(std::is_array<T>::value == false, "For array types, use the other overload with the size parameter");i
+    cudaCheck(cudaMemsetAsync(ptr.get(), value, sizeof(T), stream));
   }
 
   /**
@@ -24,7 +25,7 @@ namespace cudautils {
    */
   template <typename T>
   inline void memsetAsync(cudautils::device::unique_ptr<T[]>& ptr, int value, size_t nelements, cudaStream_t stream) {
-    cuda::memory::device::async::set(ptr.get(), value, nelements * sizeof(T), stream);
+    cudaCheck(cudaMemsetAsync(ptr.get(), value, nelements * sizeof(T), stream));
   }
 }  // namespace cudautils
 
diff --git a/HeterogeneousCore/CUDAUtilities/test/AtomicPairCounter_t.cu b/HeterogeneousCore/CUDAUtilities/test/AtomicPairCounter_t.cu
index 1b9669aa88368..1c28758fb970e 100644
--- a/HeterogeneousCore/CUDAUtilities/test/AtomicPairCounter_t.cu
+++ b/HeterogeneousCore/CUDAUtilities/test/AtomicPairCounter_t.cu
@@ -1,3 +1,4 @@
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/AtomicPairCounter.h"
 
 #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"
@@ -39,24 +40,24 @@ __global__ void verify(AtomicPairCounter const *dc, uint32_t const *ind, uint32_
 #include <iostream>
 int main() {
   AtomicPairCounter *dc_d;
-  cudaMalloc(&dc_d, sizeof(AtomicPairCounter));
-  cudaMemset(dc_d, 0, sizeof(AtomicPairCounter));
+  cudaCheck(cudaMalloc(&dc_d, sizeof(AtomicPairCounter)));
+  cudaCheck(cudaMemset(dc_d, 0, sizeof(AtomicPairCounter)));
 
   std::cout << "size " << sizeof(AtomicPairCounter) << std::endl;
 
   constexpr uint32_t N = 20000;
   constexpr uint32_t M = N * 6;
   uint32_t *n_d, *m_d;
-  cudaMalloc(&n_d, N * sizeof(int));
+  cudaCheck(cudaMalloc(&n_d, N * sizeof(int)));
   // cudaMemset(n_d, 0, N*sizeof(int));
-  cudaMalloc(&m_d, M * sizeof(int));
+  cudaCheck(cudaMalloc(&m_d, M * sizeof(int)));
 
   update<<<2000, 512>>>(dc_d, n_d, m_d, 10000);
   finalize<<<1, 1>>>(dc_d, n_d, m_d, 10000);
   verify<<<2000, 512>>>(dc_d, n_d, m_d, 10000);
 
   AtomicPairCounter dc;
-  cudaMemcpy(&dc, dc_d, sizeof(AtomicPairCounter), cudaMemcpyDeviceToHost);
+  cudaCheck(cudaMemcpy(&dc, dc_d, sizeof(AtomicPairCounter), cudaMemcpyDeviceToHost));
 
   std::cout << dc.get().n << ' ' << dc.get().m << std::endl;
 
diff --git a/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu b/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu
index 3db8b611012c6..c0b374b427d4f 100644
--- a/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu
+++ b/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu
@@ -6,6 +6,7 @@
 
 #include <cuda/api_wrappers.h>
 
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h"
 
@@ -26,7 +27,7 @@ void go() {
   T v[N];
   auto v_d = cuda::memory::device::make_unique<T[]>(current_device, N);
   
-  cudaMemcpy(v_d.get(), v, N * sizeof(T), cudaMemcpyHostToDevice);
+  cudaCheck(cudaMemcpy(v_d.get(), v, N * sizeof(T), cudaMemcpyHostToDevice));
 
   constexpr uint32_t nParts = 10;
   constexpr uint32_t partSize = N / nParts;
@@ -65,7 +66,7 @@ void go() {
       offsets[10] = 3297 + offsets[9];
     }
 
-   cudaMemcpy(off_d.get(), offsets, 4 * (nParts + 1), cudaMemcpyHostToDevice);
+   cudaCheck(cudaMemcpy(off_d.get(), offsets, 4 * (nParts + 1), cudaMemcpyHostToDevice));
 
     for (long long j = 0; j < N; j++)
       v[j] = rgen(eng);
@@ -75,10 +76,10 @@ void go() {
         v[j] = sizeof(T) == 1 ? 22 : 3456;
     }
 
-   cudaMemcpy(v_d.get(), v, N * sizeof(T), cudaMemcpyHostToDevice);
+   cudaCheck(cudaMemcpy(v_d.get(), v, N * sizeof(T), cudaMemcpyHostToDevice));
 
     cudautils::fillManyFromVector(h_d.get(), ws_d.get(), nParts, v_d.get(), off_d.get(), offsets[10], 256, 0);
-    cudaMemcpy(&h, h_d.get(), sizeof(Hist), cudaMemcpyDeviceToHost);
+    cudaCheck(cudaMemcpy(&h, h_d.get(), sizeof(Hist), cudaMemcpyDeviceToHost));
     assert(0 == h.off[0]);
     assert(offsets[10] == h.size());
 
diff --git a/HeterogeneousCore/CUDAUtilities/test/OneHistoContainer_t.cu b/HeterogeneousCore/CUDAUtilities/test/OneHistoContainer_t.cu
index 4307da7abcf23..0bb4e4a353b86 100644
--- a/HeterogeneousCore/CUDAUtilities/test/OneHistoContainer_t.cu
+++ b/HeterogeneousCore/CUDAUtilities/test/OneHistoContainer_t.cu
@@ -6,6 +6,7 @@
 
 #include <cuda/api_wrappers.h>
 
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/launch.h"
@@ -131,7 +132,7 @@ void go() {
 
     assert(v_d.get());
     assert(v);
-    cudaMemcpy(v_d.get(), v , N * sizeof(T), cudaMemcpyHostToDevice);
+    cudaCheck(cudaMemcpy(v_d.get(), v , N * sizeof(T), cudaMemcpyHostToDevice));
     assert(v_d.get());
     cudautils::launch(mykernel<T, NBINS, S, DELTA>, {1, 256}, v_d.get(), N);
   }
diff --git a/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.h b/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.h
index a82a3c0121212..4533c2c2882b0 100644
--- a/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.h
+++ b/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.h
@@ -8,6 +8,7 @@
 
 #ifdef __CUDACC__
 #include <cuda/api_wrappers.h>
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h"
 #endif
 
@@ -170,7 +171,7 @@ int main() {
   auto sa_d = cuda::memory::device::make_unique<SmallAssoc[]>(current_device, 1);
   auto ws_d = cuda::memory::device::make_unique<uint8_t[]>(current_device, Assoc::wsSize());
   
-  cudaMemcpy(v_d.get(), tr.data(), N *  sizeof(std::array<uint16_t, 4>), cudaMemcpyHostToDevice);
+  cudaCheck(cudaMemcpy(v_d.get(), tr.data(), N *  sizeof(std::array<uint16_t, 4>), cudaMemcpyHostToDevice));
 #else
   auto a_d = std::make_unique<Assoc>();
   auto sa_d = std::make_unique<SmallAssoc>();
@@ -198,7 +199,7 @@ int main() {
   Assoc la;
 
 #ifdef __CUDACC__
-  cudaMemcpy(&la, a_d.get(), sizeof(Assoc), cudaMemcpyDeviceToHost); 
+  cudaCheck(cudaMemcpy(&la, a_d.get(), sizeof(Assoc), cudaMemcpyDeviceToHost)); 
 #else
   memcpy(&la, a_d.get(), sizeof(Assoc));  // not required, easier
 #endif
@@ -224,17 +225,17 @@ int main() {
   AtomicPairCounter dc(0);
 
 #ifdef __CUDACC__
-  cudaMalloc(&dc_d, sizeof(AtomicPairCounter));
-  cudaMemset(dc_d, 0, sizeof(AtomicPairCounter));
+  cudaCheck(cudaMalloc(&dc_d, sizeof(AtomicPairCounter)));
+  cudaCheck(cudaMemset(dc_d, 0, sizeof(AtomicPairCounter)));
   nBlocks = (N + nThreads - 1) / nThreads;
   fillBulk<<<nBlocks, nThreads>>>(dc_d, v_d.get(), a_d.get(), N);
   cudautils::finalizeBulk<<<nBlocks, nThreads>>>(dc_d, a_d.get());
   verifyBulk<<<1, 1>>>(a_d.get(), dc_d);
 
-  cudaMemcpy(&la, a_d.get(), sizeof(Assoc), cudaMemcpyDeviceToHost);
-  cudaMemcpy(&dc, dc_d, sizeof(AtomicPairCounter), cudaMemcpyDeviceToHost);
+  cudaCheck(cudaMemcpy(&la, a_d.get(), sizeof(Assoc), cudaMemcpyDeviceToHost));
+  cudaCheck(cudaMemcpy(&dc, dc_d, sizeof(AtomicPairCounter), cudaMemcpyDeviceToHost));
 
-  cudaMemset(dc_d, 0, sizeof(AtomicPairCounter));
+  cudaCheck(cudaMemset(dc_d, 0, sizeof(AtomicPairCounter)));
   fillBulk<<<nBlocks, nThreads>>>(dc_d, v_d.get(), sa_d.get(), N);
   cudautils::finalizeBulk<<<nBlocks, nThreads>>>(dc_d, sa_d.get());
   verifyBulk<<<1, 1>>>(sa_d.get(), dc_d);
diff --git a/HeterogeneousCore/CUDAUtilities/test/copyAsync_t.cpp b/HeterogeneousCore/CUDAUtilities/test/copyAsync_t.cpp
index d29b41dffc7d3..0168271baa4fa 100644
--- a/HeterogeneousCore/CUDAUtilities/test/copyAsync_t.cpp
+++ b/HeterogeneousCore/CUDAUtilities/test/copyAsync_t.cpp
@@ -20,10 +20,9 @@ TEST_CASE("copyAsync", "[cudaMemTools]") {
       auto device = cudautils::make_device_unique<int>(stream);
       auto host = cudautils::make_host_unique<int>(stream);
 
+      cudaCheck(cudaMemcpyAsync(host.get(), device.get(), sizeof(int), cudaMemcpyDeviceToHost, stream));
       cudautils::copyAsync(device, host_orig, stream);
       cudaCheck(cudaStreamSynchronize(stream));
-      cudaMemcpyAsync(host.get(), device.get(), sizeof(int), cudaMemcpyDeviceToHost, stream);
-      stream.synchronize();
 
       REQUIRE(*host == 42);
     }
@@ -40,10 +39,9 @@ TEST_CASE("copyAsync", "[cudaMemTools]") {
       auto host = cudautils::make_host_unique<int[]>(N, stream);
 
       SECTION("Copy all") {
+	cudaCheck(cudaMemcpyAsync(host.get(), device.get(), N *  sizeof(int), cudaMemcpyDeviceToHost, stream));
         cudautils::copyAsync(device, host_orig, N, stream);
         cudaCheck(cudaStreamSynchronize(stream));
-	cudaMemcpyAsync(host.get(), device.get(), N *  sizeof(int), cudaMemcpyDeviceToHost, stream);
-        stream.synchronize();
         for (int i = 0; i < N; ++i) {
           CHECK(host[i] == i);
         }
@@ -54,10 +52,9 @@ TEST_CASE("copyAsync", "[cudaMemTools]") {
       }
 
       SECTION("Copy some") {
+	cudaCheck(cudaMemcpyAsync(host.get(), device.get(), 42 * sizeof(int), cudaMemcpyDeviceToHost, stream));
         cudautils::copyAsync(device, host_orig, 42, stream);
         cudaCheck(cudaStreamSynchronize(stream));
-	cudaMemcpyAsync(host.get(), device.get(), 42 * sizeof(int), cudaMemcpyDeviceToHost, stream);
-        stream.synchronize();
         for (int i = 0; i < 42; ++i) {
           CHECK(host[i] == 200 + i);
         }
@@ -73,7 +70,7 @@ TEST_CASE("copyAsync", "[cudaMemTools]") {
       auto device = cudautils::make_device_unique<int>(stream);
       auto host = cudautils::make_host_unique<int>(stream);
 
-      cudaMemcpyAsync(device.get(), host_orig.get(), sizeof(int), cudaMemcpyHostToDevice, stream);
+      cudaCheck(cudaMemcpyAsync(device.get(), host_orig.get(), sizeof(int), cudaMemcpyHostToDevice, stream));
       cudautils::copyAsync(host, device, stream);
       cudaCheck(cudaStreamSynchronize(stream));
 
@@ -92,7 +89,7 @@ TEST_CASE("copyAsync", "[cudaMemTools]") {
       auto host = cudautils::make_host_unique<int[]>(N, stream);
 
       SECTION("Copy all") {
-	cudaMemcpyAsync(device.get(), host_orig.get(), N *  sizeof(int), cudaMemcpyHostToDevice, stream);
+	cudaCheck(cudaMemcpyAsync(device.get(), host_orig.get(), N *  sizeof(int), cudaMemcpyHostToDevice, stream));
         cudautils::copyAsync(host, device, N, stream);
         cudaCheck(cudaStreamSynchronize(stream));
         for (int i = 0; i < N; ++i) {
@@ -105,7 +102,7 @@ TEST_CASE("copyAsync", "[cudaMemTools]") {
       }
 
       SECTION("Copy some") {
-        cudaMemcpyAsync(device.get(), host_orig.get(), 42 * sizeof(int), cudaMemcpyHostToDevice, stream);
+        cudaCheck(cudaMemcpyAsync(device.get(), host_orig.get(), 42 * sizeof(int), cudaMemcpyHostToDevice, stream));
         cudautils::copyAsync(host, device, 42, stream);
         cudaCheck(cudaStreamSynchronize(stream));
         for (int i = 0; i < 42; ++i) {
diff --git a/HeterogeneousCore/CUDAUtilities/test/prefixScan_t.cu b/HeterogeneousCore/CUDAUtilities/test/prefixScan_t.cu
index f8266898323cd..b83c03f710012 100644
--- a/HeterogeneousCore/CUDAUtilities/test/prefixScan_t.cu
+++ b/HeterogeneousCore/CUDAUtilities/test/prefixScan_t.cu
@@ -2,6 +2,7 @@
 
 #include <cub/cub.cuh>
 
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/prefixScan.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h"
 
@@ -107,9 +108,9 @@ int main() {
     uint32_t *d_out1;
     uint32_t *d_out2;
 
-    cudaMalloc(&d_in, num_items * sizeof(uint32_t));
-    cudaMalloc(&d_out1, num_items * sizeof(uint32_t));
-    cudaMalloc(&d_out2, num_items * sizeof(uint32_t));
+    cudaCheck(cudaMalloc(&d_in, num_items * sizeof(uint32_t)));
+    cudaCheck(cudaMalloc(&d_out1, num_items * sizeof(uint32_t)));
+    cudaCheck(cudaMalloc(&d_out2, num_items * sizeof(uint32_t)));
 
     auto nthreads = 256;
     auto nblocks = (num_items + nthreads - 1) / nthreads;
@@ -118,8 +119,8 @@ int main() {
 
     // the block counter
     int32_t *d_pc;
-    cudaMalloc(&d_pc, sizeof(int32_t));
-    cudaMemset(d_pc, 0, 4);
+    cudaCheck(cudaMalloc(&d_pc, sizeof(int32_t)));
+    cudaCheck(cudaMemset(d_pc, 0, 4));
 
     nthreads = 1024;
     nblocks = (num_items + nthreads - 1) / nthreads;
@@ -139,7 +140,7 @@ int main() {
     // Allocate temporary storage for inclusive prefix sum
     // fake larger ws already available
     temp_storage_bytes *= 8;
-    cudaMalloc(&d_temp_storage, temp_storage_bytes);
+    cudaCheck(cudaMalloc(&d_temp_storage, temp_storage_bytes));
     std::cout << "temp storage " << temp_storage_bytes << std::endl;
     // Run inclusive prefix sum
     CubDebugExit(cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out2, num_items));
diff --git a/HeterogeneousCore/CUDAUtilities/test/radixSort_t.cu b/HeterogeneousCore/CUDAUtilities/test/radixSort_t.cu
index e1d0a42252482..4573ae106238c 100644
--- a/HeterogeneousCore/CUDAUtilities/test/radixSort_t.cu
+++ b/HeterogeneousCore/CUDAUtilities/test/radixSort_t.cu
@@ -10,6 +10,7 @@
 
 #include <cuda/api_wrappers.h>
 
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/launch.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/radixSort.h"
@@ -102,8 +103,8 @@ void go(bool useShared) {
     auto ws_d = cuda::memory::device::make_unique<uint16_t[]>(current_device, N);
     auto off_d = cuda::memory::device::make_unique<uint32_t[]>(current_device, blocks + 1);
     
-    cudaMemcpy(v_d.get(), v , N * sizeof(T), cudaMemcpyHostToDevice);
-    cudaMemcpy(off_d.get(), offsets, 4 * (blocks+1), cudaMemcpyHostToDevice);
+    cudaCheck(cudaMemcpy(v_d.get(), v , N * sizeof(T), cudaMemcpyHostToDevice));
+    cudaCheck(cudaMemcpy(off_d.get(), offsets, 4 * (blocks+1), cudaMemcpyHostToDevice));
 
     if (i < 2)
       std::cout << "lauch for " << offsets[blocks] << std::endl;
@@ -121,7 +122,7 @@ void go(bool useShared) {
     if (i == 0)
       std::cout << "done for " << offsets[blocks] << std::endl;
 
-    cudaMemcpy(ind, ind_d.get(), 2 * N, cudaMemcpyDeviceToHost);  
+    cudaCheck(cudaMemcpy(ind, ind_d.get(), 2 * N, cudaMemcpyDeviceToHost));  
 
     delta += (std::chrono::high_resolution_clock::now() - start);
 
diff --git a/HeterogeneousCore/Producer/test/TestHeterogeneousEDProducerGPUHelpers.cu b/HeterogeneousCore/Producer/test/TestHeterogeneousEDProducerGPUHelpers.cu
index 017092452c1e1..fc6e0684660b7 100644
--- a/HeterogeneousCore/Producer/test/TestHeterogeneousEDProducerGPUHelpers.cu
+++ b/HeterogeneousCore/Producer/test/TestHeterogeneousEDProducerGPUHelpers.cu
@@ -77,8 +77,8 @@ int TestAcceleratorServiceProducerGPUHelpers_simple_kernel(int input) {
   auto d_b = cuda::memory::device::make_unique<int[]>(current_device, NUM_VALUES);
   auto d_c = cuda::memory::device::make_unique<int[]>(current_device, NUM_VALUES);
 
-   cudaMemcpyAsync(d_a.get(), h_a.get(), NUM_VALUES * sizeof(int), cudaMemcpyHostToDevice, stream.id());
-   cudaMemcpyAsync(d_b.get(), h_b.get(), NUM_VALUES * sizeof(int), cudaMemcpyHostToDevice, stream.id());
+   cudaCheck(cudaMemcpyAsync(d_a.get(), h_a.get(), NUM_VALUES * sizeof(int), cudaMemcpyHostToDevice, stream.id()));
+   cudaCheck(cudaMemcpyAsync(d_b.get(), h_b.get(), NUM_VALUES * sizeof(int), cudaMemcpyHostToDevice, stream.id()));
 
   int threadsPerBlock{256};
   int blocksPerGrid = (NUM_VALUES + threadsPerBlock - 1) / threadsPerBlock;
@@ -91,7 +91,7 @@ int TestAcceleratorServiceProducerGPUHelpers_simple_kernel(int input) {
                d_a.get(), d_b.get(), d_c.get(), NUM_VALUES);
   */
 
-  cudaMemcpyAsync(h_c.get(), d_c.get(), NUM_VALUES * sizeof(int), cudaMemcpyDeviceToHost, stream.id());
+  cudaCheck(cudaMemcpyAsync(h_c.get(), d_c.get(), NUM_VALUES * sizeof(int), cudaMemcpyDeviceToHost, stream.id()));
 
   stream.synchronize();
 
@@ -124,7 +124,7 @@ TestHeterogeneousEDProducerGPUTask::ResultType TestHeterogeneousEDProducerGPUTas
   // First make the sanity check
   if (inputArrays.first != nullptr) {
     auto h_check = std::make_unique<float[]>(NUM_VALUES);
-    cudaMemcpy(h_check.get(), inputArrays.first, NUM_VALUES * sizeof(float), cudaMemcpyDeviceToHost);
+    cudaCheck(cudaMemcpy(h_check.get(), inputArrays.first, NUM_VALUES * sizeof(float), cudaMemcpyDeviceToHost));
     for (int i = 0; i < NUM_VALUES; ++i) {
       if (h_check[i] != i) {
         throw cms::Exception("Assert") << "Sanity check on element " << i << " failed, expected " << i << " got "
@@ -146,8 +146,8 @@ TestHeterogeneousEDProducerGPUTask::ResultType TestHeterogeneousEDProducerGPUTas
   }
 
   // Create stream
- cudaMemcpyAsync(d_a.get(), h_a.get(), NUM_VALUES * sizeof(float), cudaMemcpyHostToDevice, stream.id());
- cudaMemcpyAsync(d_b.get(), h_b.get(), NUM_VALUES * sizeof(float), cudaMemcpyHostToDevice, stream.id());
+ cudaCheck(cudaMemcpyAsync(d_a.get(), h_a.get(), NUM_VALUES * sizeof(float), cudaMemcpyHostToDevice, stream.id()));
+ cudaCheck(cudaMemcpyAsync(d_b.get(), h_b.get(), NUM_VALUES * sizeof(float), cudaMemcpyHostToDevice, stream.id()));
 
   int threadsPerBlock{32};
   int blocksPerGrid = (NUM_VALUES + threadsPerBlock - 1) / threadsPerBlock;
@@ -195,7 +195,7 @@ void TestHeterogeneousEDProducerGPUTask::release(const std::string &label, cuda:
 
 int TestHeterogeneousEDProducerGPUTask::getResult(const ResultTypeRaw &d_ac, cuda::stream_t<> &stream) {
   auto h_c = cuda::memory::host::make_unique<float[]>(NUM_VALUES);
-  cudaMemcpyAsync(h_c.get(), d_ac.second, NUM_VALUES * sizeof(int), cudaMemcpyDeviceToHost, stream.id());
+  cudaCheck(cudaMemcpyAsync(h_c.get(), d_ac.second, NUM_VALUES * sizeof(int), cudaMemcpyDeviceToHost, stream.id()));
   stream.synchronize();
 
   float ret = 0;
diff --git a/RecoLocalTracker/SiPixelClusterizer/src/SiPixelFedCablingMapGPUWrapper.cc b/RecoLocalTracker/SiPixelClusterizer/src/SiPixelFedCablingMapGPUWrapper.cc
index d484d9eeadea9..451fe2fb6d75b 100644
--- a/RecoLocalTracker/SiPixelClusterizer/src/SiPixelFedCablingMapGPUWrapper.cc
+++ b/RecoLocalTracker/SiPixelClusterizer/src/SiPixelFedCablingMapGPUWrapper.cc
@@ -156,8 +156,8 @@ cudautils::device::unique_ptr<unsigned char[]> SiPixelFedCablingMapGPUWrapper::g
     }
   }
   
-  cudaMemcpyAsync(
-      modToUnpDevice.get(), modToUnpHost.get(), pixelgpudetails::MAX_SIZE * sizeof(unsigned char), cudaMemcpyHostToDevice, cudaStream);
+  cudaCheck(cudaMemcpyAsync(
+      modToUnpDevice.get(), modToUnpHost.get(), pixelgpudetails::MAX_SIZE * sizeof(unsigned char), cudaMemcpyHostToDevice, cudaStream));
   return modToUnpDevice;
 }
 
diff --git a/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering_t.h b/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering_t.h
index d65ca1033d454..dd93a8f622898 100644
--- a/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering_t.h
+++ b/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering_t.h
@@ -12,8 +12,8 @@
 #ifdef __CUDACC__
 #include <cuda/api_wrappers.h>
 
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/launch.h"
 #endif
 
 #include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h"
@@ -250,14 +250,14 @@ int main(void) {
 #ifdef __CUDACC__
     size_t size32 = n * sizeof(unsigned int);
     size_t size16 = n * sizeof(unsigned short);
-    // size_t size8 = n * sizeof(uint8_t);
-
-    cudaMemcpy(d_moduleStart.get(), &nModules, sizeof(uint32_t), cudaMemcpyHostToDevice);
-
-    cudaMemcpy(d_id.get(), h_id.get(), size16, cudaMemcpyHostToDevice);
-    cudaMemcpy(d_x.get(), h_x.get(), size16, cudaMemcpyHostToDevice);
-    cudaMemcpy(d_y.get(), h_y.get(), size16, cudaMemcpyHostToDevice);
-    cudaMemcpy(d_adc.get(), h_adc.get(), size16, cudaMemcpyHostToDevice);
+    // size_t size8 = n * sizeof(uint8_t);    
+    
+    cudaCheck(cudaMemcpy(d_moduleStart.get(), &nModules, sizeof(uint32_t), cudaMemcpyHostToDevice));
+
+    cudaCheck(cudaMemcpy(d_id.get(), h_id.get(), size16, cudaMemcpyHostToDevice));
+    cudaCheck(cudaMemcpy(d_x.get(), h_x.get(), size16, cudaMemcpyHostToDevice));
+    cudaCheck(cudaMemcpy(d_y.get(), h_y.get(), size16, cudaMemcpyHostToDevice));
+    cudaCheck(cudaMemcpy(d_adc.get(), h_adc.get(), size16, cudaMemcpyHostToDevice));
     // Launch CUDA Kernels
     int threadsPerBlock = (kkk == 5) ? 512 : ((kkk == 3) ? 128 : 256);
     int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
@@ -270,7 +270,7 @@ int main(void) {
 
     std::cout << "CUDA findModules kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock
               << " threads\n";
-    cudaMemset(d_clusInModule.get(), 0, MaxNumModules * sizeof(uint32_t));
+    cudaCheck(cudaMemset(d_clusInModule.get(), 0, MaxNumModules * sizeof(uint32_t)));
 
     cudautils::launch(findClus,
                  {blocksPerGrid, threadsPerBlock},
@@ -283,10 +283,10 @@ int main(void) {
                  d_clus.get(),
                  n);
     cudaDeviceSynchronize();
-    cudaMemcpy(&nModules, d_moduleStart.get(), sizeof(uint32_t), cudaMemcpyDeviceToHost);
+    cudaCheck(cudaMemcpy(&nModules, d_moduleStart.get(), sizeof(uint32_t), cudaMemcpyDeviceToHost));
 
     uint32_t nclus[MaxNumModules], moduleId[nModules];
-    cudaMemcpy(&nclus, d_clusInModule.get(), MaxNumModules * sizeof(uint32_t), cudaMemcpyDeviceToHost);
+    cudaCheck(cudaMemcpy(&nclus, d_clusInModule.get(), MaxNumModules * sizeof(uint32_t), cudaMemcpyDeviceToHost));
 
     std::cout << "before charge cut found " << std::accumulate(nclus, nclus + MaxNumModules, 0) << " clusters"
               << std::endl;
@@ -351,10 +351,10 @@ int main(void) {
     std::cout << "found " << nModules << " Modules active" << std::endl;
 
 #ifdef __CUDACC__
-    cudaMemcpy(h_id.get(), d_id.get(), size16, cudaMemcpyDeviceToHost);
-    cudaMemcpy(h_clus.get(), d_clus.get(), size16, cudaMemcpyDeviceToHost);
-    cudaMemcpy(&nclus, d_clusInModule.get(), MaxNumModules * sizeof(uint32_t), cudaMemcpyDeviceToHost);
-    cudaMemcpy(&moduleId, d_moduleId.get(), nModules * sizeof(uint32_t), cudaMemcpyDeviceToHost);
+    cudaCheck(cudaMemcpy(h_id.get(), d_id.get(), size16, cudaMemcpyDeviceToHost));
+    cudaCheck(cudaMemcpy(h_clus.get(), d_clus.get(), size32, cudaMemcpyDeviceToHost));
+    cudaCheck(cudaMemcpy(&nclus, d_clusInModule.get(), MaxNumModules * sizeof(uint32_t), cudaMemcpyDeviceToHost));
+    cudaCheck(cudaMemcpy(&moduleId, d_moduleId.get(), nModules * sizeof(uint32_t), cudaMemcpyDeviceToHost));
 #endif
 
     std::set<unsigned int> clids;
diff --git a/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPU.cu b/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPU.cu
index a206feca83b52..44d1a375bcabe 100644
--- a/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPU.cu
+++ b/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPU.cu
@@ -261,10 +261,10 @@ void testFit() {
   kernelFastFit<N><<<Ntracks / 64, 64>>>(hitsGPU, fast_fit_resultsGPU);
   cudaDeviceSynchronize();
 
-  cudaMemcpy(fast_fit_resultsGPUret,
+  cudaCheck(cudaMemcpy(fast_fit_resultsGPUret,
              fast_fit_resultsGPU,
              Rfit::maxNumberOfTracks() * sizeof(Vector4d),
-             cudaMemcpyDeviceToHost);
+             cudaMemcpyDeviceToHost));
   Rfit::Map4d fast_fit(fast_fit_resultsGPUret + 10, 4);
   std::cout << "Fitted values (FastFit, [X0, Y0, R, tan(theta)]): GPU\n" << fast_fit << std::endl;
   assert(isEqualFuzzy(fast_fit_results, fast_fit));
@@ -311,13 +311,13 @@ void testFit() {
 
   std::cout << "Fitted values (CircleFit):\n" << circle_fit_results.par << std::endl;
 
-  cudaMemcpy(circle_fit_resultsGPUret, circle_fit_resultsGPU, sizeof(Rfit::circle_fit), cudaMemcpyDeviceToHost);
+  cudaCheck(cudaMemcpy(circle_fit_resultsGPUret, circle_fit_resultsGPU, sizeof(Rfit::circle_fit), cudaMemcpyDeviceToHost));
   std::cout << "Fitted values (CircleFit) GPU:\n" << circle_fit_resultsGPUret->par << std::endl;
   assert(isEqualFuzzy(circle_fit_results.par, circle_fit_resultsGPUret->par));
 
   std::cout << "Fitted values (LineFit):\n" << line_fit_results.par << std::endl;
   // LINE_FIT GPU
-  cudaMemcpy(line_fit_resultsGPUret, line_fit_resultsGPU, sizeof(Rfit::line_fit), cudaMemcpyDeviceToHost);
+  cudaCheck(cudaMemcpy(line_fit_resultsGPUret, line_fit_resultsGPU, sizeof(Rfit::line_fit), cudaMemcpyDeviceToHost));
   std::cout << "Fitted values (LineFit) GPU:\n" << line_fit_resultsGPUret->par << std::endl;
   assert(isEqualFuzzy(line_fit_results.par, line_fit_resultsGPUret->par, N == 5 ? 1e-4 : 1e-6));  // requires fma on CPU
 
diff --git a/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPUNoFit.cu b/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPUNoFit.cu
index ebaea2037eb2a..ff0d6aa5f68f5 100644
--- a/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPUNoFit.cu
+++ b/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPUNoFit.cu
@@ -73,18 +73,18 @@ void testMultiply() {
   Eigen::Matrix<double, row1, col2> *multiply_resultGPU = nullptr;
   Eigen::Matrix<double, row1, col2> *multiply_resultGPUret = new Eigen::Matrix<double, row1, col2>();
 
-  cudaMalloc((void **)&JGPU, sizeof(Eigen::Matrix<double, row1, col1>));
-  cudaMalloc((void **)&CGPU, sizeof(Eigen::Matrix<double, row2, col2>));
-  cudaMalloc((void **)&multiply_resultGPU, sizeof(Eigen::Matrix<double, row1, col2>));
-  cudaMemcpy(JGPU, &J, sizeof(Eigen::Matrix<double, row1, col1>), cudaMemcpyHostToDevice);
-  cudaMemcpy(CGPU, &C, sizeof(Eigen::Matrix<double, row2, col2>), cudaMemcpyHostToDevice);
-  cudaMemcpy(multiply_resultGPU, &multiply_result, sizeof(Eigen::Matrix<double, row1, col2>), cudaMemcpyHostToDevice);
+  cudaCheck(cudaMalloc((void **)&JGPU, sizeof(Eigen::Matrix<double, row1, col1>)));
+  cudaCheck(cudaMalloc((void **)&CGPU, sizeof(Eigen::Matrix<double, row2, col2>)));
+  cudaCheck(cudaMalloc((void **)&multiply_resultGPU, sizeof(Eigen::Matrix<double, row1, col2>)));
+  cudaCheck(cudaMemcpy(JGPU, &J, sizeof(Eigen::Matrix<double, row1, col1>), cudaMemcpyHostToDevice));
+  cudaCheck(cudaMemcpy(CGPU, &C, sizeof(Eigen::Matrix<double, row2, col2>), cudaMemcpyHostToDevice));
+  cudaCheck(cudaMemcpy(multiply_resultGPU, &multiply_result, sizeof(Eigen::Matrix<double, row1, col2>), cudaMemcpyHostToDevice));
 
   kernelMultiply<<<1, 1>>>(JGPU, CGPU, multiply_resultGPU);
   cudaDeviceSynchronize();
 
-  cudaMemcpy(
-      multiply_resultGPUret, multiply_resultGPU, sizeof(Eigen::Matrix<double, row1, col2>), cudaMemcpyDeviceToHost);
+  cudaCheck(cudaMemcpy(
+      multiply_resultGPUret, multiply_resultGPU, sizeof(Eigen::Matrix<double, row1, col2>), cudaMemcpyDeviceToHost));
   printIt(multiply_resultGPUret);
   assert(isEqualFuzzy(multiply_result, (*multiply_resultGPUret)));
 }
@@ -104,14 +104,14 @@ void testInverse3x3() {
   std::cout << "Here is the matrix m:" << std::endl << m << std::endl;
   std::cout << "Its inverse is:" << std::endl << m.inverse() << std::endl;
 #endif
-  cudaMalloc((void **)&mGPU, sizeof(Matrix3d));
-  cudaMalloc((void **)&mGPUret, sizeof(Matrix3d));
-  cudaMemcpy(mGPU, &m, sizeof(Matrix3d), cudaMemcpyHostToDevice);
+  cudaCheck(cudaMalloc((void **)&mGPU, sizeof(Matrix3d)));
+  cudaCheck(cudaMalloc((void **)&mGPUret, sizeof(Matrix3d)));
+  cudaCheck(cudaMemcpy(mGPU, &m, sizeof(Matrix3d), cudaMemcpyHostToDevice));
 
   kernelInverse3x3<<<1, 1>>>(mGPU, mGPUret);
   cudaDeviceSynchronize();
 
-  cudaMemcpy(mCPUret, mGPUret, sizeof(Matrix3d), cudaMemcpyDeviceToHost);
+  cudaCheck(cudaMemcpy(mCPUret, mGPUret, sizeof(Matrix3d), cudaMemcpyDeviceToHost));
 #if TEST_DEBUG
   std::cout << "Its GPU inverse is:" << std::endl << (*mCPUret) << std::endl;
 #endif
@@ -133,14 +133,14 @@ void testInverse4x4() {
   std::cout << "Here is the matrix m:" << std::endl << m << std::endl;
   std::cout << "Its inverse is:" << std::endl << m.inverse() << std::endl;
 #endif
-  cudaMalloc((void **)&mGPU, sizeof(Matrix4d));
-  cudaMalloc((void **)&mGPUret, sizeof(Matrix4d));
-  cudaMemcpy(mGPU, &m, sizeof(Matrix4d), cudaMemcpyHostToDevice);
+  cudaCheck(cudaMalloc((void **)&mGPU, sizeof(Matrix4d)));
+  cudaCheck(cudaMalloc((void **)&mGPUret, sizeof(Matrix4d)));
+  cudaCheck(cudaMemcpy(mGPU, &m, sizeof(Matrix4d), cudaMemcpyHostToDevice));
 
   kernelInverse4x4<<<1, 1>>>(mGPU, mGPUret);
   cudaDeviceSynchronize();
 
-  cudaMemcpy(mCPUret, mGPUret, sizeof(Matrix4d), cudaMemcpyDeviceToHost);
+  cudaCheck(cudaMemcpy(mCPUret, mGPUret, sizeof(Matrix4d), cudaMemcpyDeviceToHost));
 #if TEST_DEBUG
   std::cout << "Its GPU inverse is:" << std::endl << (*mCPUret) << std::endl;
 #endif
@@ -162,14 +162,14 @@ void testInverse5x5() {
   std::cout << "Here is the matrix m:" << std::endl << m << std::endl;
   std::cout << "Its inverse is:" << std::endl << m.inverse() << std::endl;
 #endif
-  cudaMalloc((void **)&mGPU, sizeof(Matrix5d));
-  cudaMalloc((void **)&mGPUret, sizeof(Matrix5d));
-  cudaMemcpy(mGPU, &m, sizeof(Matrix5d), cudaMemcpyHostToDevice);
+  cudaCheck(cudaMalloc((void **)&mGPU, sizeof(Matrix5d)));
+  cudaCheck(cudaMalloc((void **)&mGPUret, sizeof(Matrix5d)));
+  cudaCheck(cudaMemcpy(mGPU, &m, sizeof(Matrix5d), cudaMemcpyHostToDevice));
 
   kernelInverse5x5<<<1, 1>>>(mGPU, mGPUret);
   cudaDeviceSynchronize();
 
-  cudaMemcpy(mCPUret, mGPUret, sizeof(Matrix5d), cudaMemcpyDeviceToHost);
+  cudaCheck(cudaMemcpy(mCPUret, mGPUret, sizeof(Matrix5d), cudaMemcpyDeviceToHost));
 #if TEST_DEBUG
   std::cout << "Its GPU inverse is:" << std::endl << (*mCPUret) << std::endl;
 #endif
@@ -195,15 +195,15 @@ void testEigenvalues() {
   std::cout << "The eigenvalues of M are:" << std::endl << (*ret) << std::endl;
   std::cout << "*************************\n\n" << std::endl;
 #endif
-  cudaMalloc((void **)&m_gpu, sizeof(Matrix3d));
-  cudaMalloc((void **)&ret_gpu, sizeof(Eigen::SelfAdjointEigenSolver<Matrix3d>::RealVectorType));
-  cudaMemcpy(m_gpu, &m, sizeof(Matrix3d), cudaMemcpyHostToDevice);
+  cudaCheck(cudaMalloc((void **)&m_gpu, sizeof(Matrix3d)));
+  cudaCheck(cudaMalloc((void **)&ret_gpu, sizeof(Eigen::SelfAdjointEigenSolver<Matrix3d>::RealVectorType)));
+  cudaCheck(cudaMemcpy(m_gpu, &m, sizeof(Matrix3d), cudaMemcpyHostToDevice));
 
   kernel<<<1, 1>>>(m_gpu, ret_gpu);
   cudaDeviceSynchronize();
 
-  cudaMemcpy(mgpudebug, m_gpu, sizeof(Matrix3d), cudaMemcpyDeviceToHost);
-  cudaMemcpy(ret1, ret_gpu, sizeof(Eigen::SelfAdjointEigenSolver<Matrix3d>::RealVectorType), cudaMemcpyDeviceToHost);
+  cudaCheck(cudaMemcpy(mgpudebug, m_gpu, sizeof(Matrix3d), cudaMemcpyDeviceToHost));
+  cudaCheck(cudaMemcpy(ret1, ret_gpu, sizeof(Eigen::SelfAdjointEigenSolver<Matrix3d>::RealVectorType), cudaMemcpyDeviceToHost));
 #if TEST_DEBUG
   std::cout << "GPU Generated Matrix M 3x3:\n" << (*mgpudebug) << std::endl;
   std::cout << "GPU The eigenvalues of M are:" << std::endl << (*ret1) << std::endl;
diff --git a/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h b/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h
index 2313bbe7803bd..59f586eae9812 100644
--- a/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h
+++ b/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h
@@ -6,6 +6,7 @@
 
 #include <cuda/api_wrappers.h>
 
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/launch.h"
 #ifdef USE_DBSCAN
@@ -126,10 +127,10 @@ int main() {
       std::cout << "v,t size " << ev.zvert.size() << ' ' << ev.ztrack.size() << std::endl;
       auto nt = ev.ztrack.size();
 #ifdef __CUDACC__
-      cudaMemcpy(LOC_WS(ntrks), &nt, sizeof(uint32_t), cudaMemcpyHostToDevice);
-      cudaMemcpy(LOC_WS(zt), ev.ztrack.data(), sizeof(float) * ev.ztrack.size(), cudaMemcpyHostToDevice);
-      cudaMemcpy(LOC_WS(ezt2), ev.eztrack.data(), sizeof(float) * ev.eztrack.size(), cudaMemcpyHostToDevice);
-      cudaMemcpy(LOC_WS(ptt2), ev.pttrack.data(), sizeof(float) * ev.eztrack.size(), cudaMemcpyHostToDevice);
+     cudaCheck(cudaMemcpy(LOC_WS(ntrks), &nt, sizeof(uint32_t), cudaMemcpyHostToDevice));
+     cudaCheck(cudaMemcpy(LOC_WS(zt), ev.ztrack.data(), sizeof(float) * ev.ztrack.size(), cudaMemcpyHostToDevice));
+     cudaCheck(cudaMemcpy(LOC_WS(ezt2), ev.eztrack.data(), sizeof(float) * ev.eztrack.size(), cudaMemcpyHostToDevice));
+     cudaCheck(cudaMemcpy(LOC_WS(ptt2), ev.pttrack.data(), sizeof(float) * ev.eztrack.size(), cudaMemcpyHostToDevice));
 #else
       ::memcpy(LOC_WS(ntrks), &nt, sizeof(uint32_t));
       ::memcpy(LOC_WS(zt), ev.ztrack.data(), sizeof(float) * ev.ztrack.size());
@@ -162,7 +163,7 @@ int main() {
 
       cudautils::launch(fitVertices, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 50.f);
       cudaCheck(cudaGetLastError());
-      cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost);
+      cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost));
 
 #else
       print(onGPU_d.get(), ws_d.get());
@@ -207,8 +208,8 @@ int main() {
 #endif
 
 #ifdef __CUDACC__
-      cudaMemcpy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t), cudaMemcpyDeviceToHost);
-      cudaMemcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float), cudaMemcpyDeviceToHost);
+      cudaCheck(cudaMemcpy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t), cudaMemcpyDeviceToHost));
+      cudaCheck(cudaMemcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float), cudaMemcpyDeviceToHost));
 #else
       memcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float));
 #endif
@@ -223,9 +224,9 @@ int main() {
 
 #ifdef __CUDACC__
       cudautils::launch(fitVertices, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 50.f);
-      cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost);
-      cudaMemcpy(nn, LOC_ONGPU(ndof), nv * sizeof(uint32_t), cudaMemcpyDeviceToHost);
-      cudaMemcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float), cudaMemcpyDeviceToHost);
+      cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost));
+      cudaCheck(cudaMemcpy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t), cudaMemcpyDeviceToHost));
+      cudaCheck(cudaMemcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float), cudaMemcpyDeviceToHost));
 #else
       fitVertices(onGPU_d.get(), ws_d.get(), 50.f);
       nv = onGPU_d->nvFinal;
@@ -243,7 +244,7 @@ int main() {
 #ifdef __CUDACC__
       // one vertex per block!!!
       cudautils::launch(splitVertices, {1024, 64}, onGPU_d.get(), ws_d.get(), 9.f);
-      cudaMemcpy(&nv, LOC_WS(nvIntermediate), sizeof(uint32_t), cudaMemcpyDeviceToHost);
+      cudaCheck(cudaMemcpy(&nv, LOC_WS(nvIntermediate), sizeof(uint32_t), cudaMemcpyDeviceToHost));
 #else
       gridDim.x = 1024;  // nv ????
       assert(blockIdx.x == 0);
@@ -260,7 +261,7 @@ int main() {
 
       cudautils::launch(sortByPt2, {1, 256}, onGPU_d.get(), ws_d.get());
       cudaCheck(cudaGetLastError());
-      cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost); 
+      cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost)); 
 #else
       fitVertices(onGPU_d.get(), ws_d.get(), 5000.f);
       sortByPt2(onGPU_d.get(), ws_d.get());
@@ -274,12 +275,12 @@ int main() {
       }
 
 #ifdef __CUDACC__
-      cudaMemcpy(zv, LOC_ONGPU(zv), nv * sizeof(float), cudaMemcpyDeviceToHost);
-      cudaMemcpy(wv, LOC_ONGPU(wv), nv * sizeof(float), cudaMemcpyDeviceToHost);
-      cudaMemcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float), cudaMemcpyDeviceToHost);
-      cudaMemcpy(ptv2, LOC_ONGPU(ptv2), nv * sizeof(float), cudaMemcpyDeviceToHost);
-      cudaMemcpy(nn , LOC_ONGPU(ndof), nv * sizeof(int32_t), cudaMemcpyDeviceToHost);
-      cudaMemcpy(ind ,LOC_ONGPU(sortInd), nv * sizeof(uint16_t), cudaMemcpyDeviceToHost); 
+      cudaCheck(cudaMemcpy(zv, LOC_ONGPU(zv), nv * sizeof(float), cudaMemcpyDeviceToHost));
+      cudaCheck(cudaMemcpy(wv, LOC_ONGPU(wv), nv * sizeof(float), cudaMemcpyDeviceToHost));
+      cudaCheck(cudaMemcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float), cudaMemcpyDeviceToHost));
+      cudaCheck(cudaMemcpy(ptv2, LOC_ONGPU(ptv2), nv * sizeof(float), cudaMemcpyDeviceToHost));
+      cudaCheck(cudaMemcpy(nn , LOC_ONGPU(ndof), nv * sizeof(int32_t), cudaMemcpyDeviceToHost));
+      cudaCheck(cudaMemcpy(ind ,LOC_ONGPU(sortInd), nv * sizeof(uint16_t), cudaMemcpyDeviceToHost)); 
 #endif
       for (auto j = 0U; j < nv; ++j)
         if (nn[j] > 0)

From c7b7f03fbb0e52a858f816364b30fa4e75bca33e Mon Sep 17 00:00:00 2001
From: Wahid Redjeb <wahid.redjeb.wr@gmail.com>
Date: Mon, 28 Oct 2019 12:31:44 +0100
Subject: [PATCH 09/13] Fix errors,  missing include of launch.h

---
 HeterogeneousCore/CUDAUtilities/interface/memsetAsync.h    | 2 +-
 RecoLocalTracker/SiPixelClusterizer/test/gpuClustering_t.h | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/HeterogeneousCore/CUDAUtilities/interface/memsetAsync.h b/HeterogeneousCore/CUDAUtilities/interface/memsetAsync.h
index e20a606042793..b9ce5a001d41f 100644
--- a/HeterogeneousCore/CUDAUtilities/interface/memsetAsync.h
+++ b/HeterogeneousCore/CUDAUtilities/interface/memsetAsync.h
@@ -13,7 +13,7 @@ namespace cudautils {
   inline void memsetAsync(cudautils::device::unique_ptr<T>& ptr, T value, cudaStream_t stream) {
     // Shouldn't compile for array types because of sizeof(T), but
     // let's add an assert with a more helpful message
-    static_assert(std::is_array<T>::value == false, "For array types, use the other overload with the size parameter");i
+    static_assert(std::is_array<T>::value == false, "For array types, use the other overload with the size parameter");
     cudaCheck(cudaMemsetAsync(ptr.get(), value, sizeof(T), stream));
   }
 
diff --git a/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering_t.h b/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering_t.h
index dd93a8f622898..edc29d6452e69 100644
--- a/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering_t.h
+++ b/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering_t.h
@@ -14,6 +14,7 @@
 
 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/launch.h"
 #endif
 
 #include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h"

From e007a5eb70b96edbd89a56e34ba12c9fe478ded2 Mon Sep 17 00:00:00 2001
From: Wahid Redjeb <wahid.redjeb.wr@gmail.com>
Date: Mon, 28 Oct 2019 12:33:14 +0100
Subject: [PATCH 10/13] Apply code-format

---
 .../test/gpuFrameTransformTest.cpp            |  1 -
 DataFormats/Math/test/CholeskyInvert_t.cu     |  2 +-
 DataFormats/Math/test/cudaMathTest.cu         |  8 ++--
 .../CUDACore/test/test_CUDAScopedContext.cc   |  2 +-
 .../CUDATest/plugins/TestCUDAProducerGPUEW.cc |  3 +-
 .../plugins/TestCUDAProducerGPUEWTask.cc      |  6 ++-
 .../plugins/TestCUDAProducerGPUKernel.cu      |  6 +--
 .../plugins/TestCUDAProducerGPUtoCPU.cc       |  6 ++-
 .../test/test_TestCUDAProducerGPUFirst.cc     |  4 +-
 .../CUDAUtilities/interface/copyAsync.h       |  4 +-
 .../CUDAUtilities/interface/prefixScan.h      |  2 +-
 .../CUDAUtilities/test/HistoContainer_t.cu    |  6 +--
 .../CUDAUtilities/test/OneHistoContainer_t.cu |  2 +-
 .../CUDAUtilities/test/OneToManyAssoc_t.h     |  6 +--
 .../CUDAUtilities/test/copyAsync_t.cpp        |  6 +--
 .../CUDAUtilities/test/radixSort_t.cu         | 11 +++---
 .../TestHeterogeneousEDProducerGPUHelpers.cu  |  8 ++--
 .../src/SiPixelFedCablingMapGPUWrapper.cc     |  9 +++--
 .../SiPixelClusterizer/test/gpuClustering_t.h | 38 +++++++++----------
 .../PixelTrackFitting/test/testEigenGPU.cu    |  9 +++--
 .../test/testEigenGPUNoFit.cu                 |  6 ++-
 .../PixelVertexFinding/test/VertexFinder_t.h  | 14 +++----
 22 files changed, 87 insertions(+), 72 deletions(-)

diff --git a/DataFormats/GeometrySurface/test/gpuFrameTransformTest.cpp b/DataFormats/GeometrySurface/test/gpuFrameTransformTest.cpp
index cae4e48f07b6b..d02672c08d5d1 100644
--- a/DataFormats/GeometrySurface/test/gpuFrameTransformTest.cpp
+++ b/DataFormats/GeometrySurface/test/gpuFrameTransformTest.cpp
@@ -76,7 +76,6 @@ int main(void) {
   auto d_sf = cuda::memory::device::make_unique<char[]>(current_device, sizeof(SFrame));
   cudaCheck(cudaMemcpy(d_sf.get(), &sf1, sizeof(SFrame), cudaMemcpyHostToDevice));
 
-
   for (auto i = 0U; i < size; ++i) {
     xl[i] = yl[i] = 0.1f * float(i) - float(size / 2);
     le[3 * i] = 0.01f;
diff --git a/DataFormats/Math/test/CholeskyInvert_t.cu b/DataFormats/Math/test/CholeskyInvert_t.cu
index 4adbd885eab7b..dca89682113fe 100644
--- a/DataFormats/Math/test/CholeskyInvert_t.cu
+++ b/DataFormats/Math/test/CholeskyInvert_t.cu
@@ -151,7 +151,7 @@ void go(bool soa) {
       cudautils::launch(invertSOA<DIM>, {blocksPerGrid, threadsPerBlock}, m_d.get(), SIZE);
     else
       cudautils::launch(invert<MX, DIM>, {blocksPerGrid, threadsPerBlock}, (MX *)(m_d.get()), SIZE);
-    
+
     cudaCheck(cudaMemcpy(&mm, m_d.get(), stride() * sizeof(MX), cudaMemcpyDeviceToHost));
 
     delta += (std::chrono::high_resolution_clock::now() - start);
diff --git a/DataFormats/Math/test/cudaMathTest.cu b/DataFormats/Math/test/cudaMathTest.cu
index efb99d12ced9c..46aae5a64f217 100644
--- a/DataFormats/Math/test/cudaMathTest.cu
+++ b/DataFormats/Math/test/cudaMathTest.cu
@@ -106,7 +106,7 @@ void go() {
   auto d_A = cuda::memory::device::make_unique<float[]>(current_device, numElements);
   auto d_B = cuda::memory::device::make_unique<float[]>(current_device, numElements);
   auto d_C = cuda::memory::device::make_unique<float[]>(current_device, numElements);
-  
+
   cudaCheck(cudaMemcpy(d_A.get(), h_A.get(), size, cudaMemcpyHostToDevice));
   cudaCheck(cudaMemcpy(d_B.get(), h_B.get(), size, cudaMemcpyHostToDevice));
   delta += (std::chrono::high_resolution_clock::now() - start);
@@ -119,13 +119,15 @@ void go() {
   std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads\n";
 
   delta -= (std::chrono::high_resolution_clock::now() - start);
-  cudautils::launch(vectorOp<USE, ADDY>, {blocksPerGrid, threadsPerBlock}, d_A.get(), d_B.get(), d_C.get(), numElements);
+  cudautils::launch(
+      vectorOp<USE, ADDY>, {blocksPerGrid, threadsPerBlock}, d_A.get(), d_B.get(), d_C.get(), numElements);
   delta += (std::chrono::high_resolution_clock::now() - start);
   std::cout << "cuda computation took " << std::chrono::duration_cast<std::chrono::milliseconds>(delta).count() << " ms"
             << std::endl;
 
   delta -= (std::chrono::high_resolution_clock::now() - start);
-  cudautils::launch(vectorOp<USE, ADDY>, {blocksPerGrid, threadsPerBlock}, d_A.get(), d_B.get(), d_C.get(), numElements);
+  cudautils::launch(
+      vectorOp<USE, ADDY>, {blocksPerGrid, threadsPerBlock}, d_A.get(), d_B.get(), d_C.get(), numElements);
   delta += (std::chrono::high_resolution_clock::now() - start);
   std::cout << "cuda computation took " << std::chrono::duration_cast<std::chrono::milliseconds>(delta).count() << " ms"
             << std::endl;
diff --git a/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc b/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc
index 04f2789f83f43..3e06ed15d7594 100644
--- a/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc
+++ b/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc
@@ -115,7 +115,7 @@ TEST_CASE("Use of CUDAScopedContext", "[CUDACore]") {
       h_a1 = 0;
       h_a2 = 0;
       int h_a3 = 0;
-      
+
       cudaCheck(cudaMemcpyAsync(&h_a1, d_a1.get(), sizeof(int), cudaMemcpyDeviceToHost, ctx.stream()));
       cudaCheck(cudaMemcpyAsync(&h_a2, d_a2.get(), sizeof(int), cudaMemcpyDeviceToHost, ctx.stream()));
       cudaCheck(cudaMemcpyAsync(&h_a3, d_a3.get(), sizeof(int), cudaMemcpyDeviceToHost, ctx.stream()));
diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEW.cc b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEW.cc
index 046150f2b6930..74e5af7c46baf 100644
--- a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEW.cc
+++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEW.cc
@@ -68,7 +68,8 @@ void TestCUDAProducerGPUEW::acquire(const edm::Event& iEvent,
   // Mimick the need to transfer some of the GPU data back to CPU to
   // be used for something within this module, or to be put in the
   // event.
-  cudaCheck(cudaMemcpyAsync(hostData_.get(), devicePtr_.get() + 10, sizeof(float), cudaMemcpyDeviceToHost, ctx.stream()));
+  cudaCheck(
+      cudaMemcpyAsync(hostData_.get(), devicePtr_.get() + 10, sizeof(float), cudaMemcpyDeviceToHost, ctx.stream()));
   edm::LogVerbatim("TestCUDAProducerGPUEW") << label_ << " TestCUDAProducerGPUEW::acquire end event "
                                             << iEvent.id().event() << " stream " << iEvent.streamID();
 }
diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEWTask.cc b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEWTask.cc
index db2d6593d7a95..0c8aad0931f15 100644
--- a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEWTask.cc
+++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEWTask.cc
@@ -76,7 +76,8 @@ void TestCUDAProducerGPUEWTask::acquire(const edm::Event& iEvent,
   // Mimick the need to transfer some of the GPU data back to CPU to
   // be used for something within this module, or to be put in the
   // event.
- cudaCheck(cudaMemcpyAsync(hostData_.get(), devicePtr_.get() + 10, sizeof(float), cudaMemcpyDeviceToHost, ctx.stream()));
+  cudaCheck(
+      cudaMemcpyAsync(hostData_.get(), devicePtr_.get() + 10, sizeof(float), cudaMemcpyDeviceToHost, ctx.stream()));
   // Push a task to run addSimpleWork() after the asynchronous work
   // (and acquire()) has finished instead of produce()
   ctx.pushNextTask([iev = iEvent.id().event(), istr = iEvent.streamID(), this](CUDAScopedContextTask ctx) {
@@ -94,7 +95,8 @@ void TestCUDAProducerGPUEWTask::addSimpleWork(edm::EventNumber_t eventID,
     edm::LogVerbatim("TestCUDAProducerGPUEWTask")
         << label_ << " TestCUDAProducerGPUEWTask::addSimpleWork begin event " << eventID << " stream " << streamID
         << " 10th element " << *hostData_ << " not satisfied, queueing more work";
- cudaCheck(cudaMemcpyAsync(hostData_.get(), devicePtr_.get() + 10, sizeof(float), cudaMemcpyDeviceToHost, ctx.stream()));   
+    cudaCheck(
+        cudaMemcpyAsync(hostData_.get(), devicePtr_.get() + 10, sizeof(float), cudaMemcpyDeviceToHost, ctx.stream()));
 
     ctx.pushNextTask([eventID, streamID, this](CUDAScopedContextTask ctx) { addSimpleWork(eventID, streamID, ctx); });
     gpuAlgo_.runSimpleAlgo(devicePtr_.get(), ctx.stream());
diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUKernel.cu b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUKernel.cu
index 1facd8ef44b1e..aaa6b9148c74c 100644
--- a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUKernel.cu
+++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUKernel.cu
@@ -69,7 +69,7 @@ cudautils::device::unique_ptr<float[]> TestCUDAProducerGPUKernel::runAlgo(const
   // First make the sanity check
   if (d_input != nullptr) {
     auto h_check = std::make_unique<float[]>(NUM_VALUES);
-    cudaCheck(cudaMemcpy(h_check.get(), d_input, NUM_VALUES* sizeof(float), cudaMemcpyDeviceToHost));
+    cudaCheck(cudaMemcpy(h_check.get(), d_input, NUM_VALUES * sizeof(float), cudaMemcpyDeviceToHost));
     for (int i = 0; i < NUM_VALUES; ++i) {
       if (h_check[i] != i) {
         throw cms::Exception("Assert") << "Sanity check on element " << i << " failed, expected " << i << " got "
@@ -88,10 +88,10 @@ cudautils::device::unique_ptr<float[]> TestCUDAProducerGPUKernel::runAlgo(const
 
   auto d_a = cudautils::make_device_unique<float[]>(NUM_VALUES, stream);
   auto d_b = cudautils::make_device_unique<float[]>(NUM_VALUES, stream);
-  
+
   cudaCheck(cudaMemcpyAsync(d_a.get(), h_a.get(), NUM_VALUES * sizeof(float), cudaMemcpyHostToDevice, stream));
   cudaCheck(cudaMemcpyAsync(d_b.get(), h_b.get(), NUM_VALUES * sizeof(float), cudaMemcpyHostToDevice, stream));
- 
+
   int threadsPerBlock{32};
   int blocksPerGrid = (NUM_VALUES + threadsPerBlock - 1) / threadsPerBlock;
 
diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUtoCPU.cc b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUtoCPU.cc
index 6a0de45e12d95..168ac1daa14b9 100644
--- a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUtoCPU.cc
+++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUtoCPU.cc
@@ -59,7 +59,11 @@ void TestCUDAProducerGPUtoCPU::acquire(const edm::Event& iEvent,
 
   buffer_ = cudautils::make_host_unique<float[]>(TestCUDAProducerGPUKernel::NUM_VALUES, ctx.stream());
   // Enqueue async copy, continue in produce once finished
-  cudaCheck(cudaMemcpyAsync(buffer_.get(), device.get(), TestCUDAProducerGPUKernel::NUM_VALUES * sizeof(float), cudaMemcpyDeviceToHost, ctx.stream()));
+  cudaCheck(cudaMemcpyAsync(buffer_.get(),
+                            device.get(),
+                            TestCUDAProducerGPUKernel::NUM_VALUES * sizeof(float),
+                            cudaMemcpyDeviceToHost,
+                            ctx.stream()));
 
   edm::LogVerbatim("TestCUDAProducerGPUtoCPU") << label_ << " TestCUDAProducerGPUtoCPU::acquire end event "
                                                << iEvent.id().event() << " stream " << iEvent.streamID();
diff --git a/HeterogeneousCore/CUDATest/test/test_TestCUDAProducerGPUFirst.cc b/HeterogeneousCore/CUDATest/test/test_TestCUDAProducerGPUFirst.cc
index b3f3c3bad8a9b..2b137d3483c5a 100644
--- a/HeterogeneousCore/CUDATest/test/test_TestCUDAProducerGPUFirst.cc
+++ b/HeterogeneousCore/CUDATest/test/test_TestCUDAProducerGPUFirst.cc
@@ -74,8 +74,8 @@ process.moduleToTest(process.toTest)
     REQUIRE(data != nullptr);
 
     float firstElements[10];
-    cudaCheck(cudaMemcpyAsync(firstElements, data, sizeof(float) * 10, cudaMemcpyDeviceToHost, prod->stream())); 
-    
+    cudaCheck(cudaMemcpyAsync(firstElements, data, sizeof(float) * 10, cudaMemcpyDeviceToHost, prod->stream()));
+
     std::cout << "Synchronizing with CUDA stream" << std::endl;
     auto stream = prod->stream();
     cudaCheck(cudaStreamSynchronize(stream));
diff --git a/HeterogeneousCore/CUDAUtilities/interface/copyAsync.h b/HeterogeneousCore/CUDAUtilities/interface/copyAsync.h
index 9d93ca6260334..5726736166fdc 100644
--- a/HeterogeneousCore/CUDAUtilities/interface/copyAsync.h
+++ b/HeterogeneousCore/CUDAUtilities/interface/copyAsync.h
@@ -35,7 +35,7 @@ namespace cudautils {
                         const cudautils::host::unique_ptr<T[]>& src,
                         size_t nelements,
                         cudaStream_t stream) {
-  cudaCheck(cudaMemcpyAsync(dst.get(), src.get(), nelements *  sizeof(T), cudaMemcpyHostToDevice, stream)); 
+    cudaCheck(cudaMemcpyAsync(dst.get(), src.get(), nelements * sizeof(T), cudaMemcpyHostToDevice, stream));
   }
 
   template <typename T>
@@ -43,7 +43,7 @@ namespace cudautils {
                         const cudautils::device::unique_ptr<T[]>& src,
                         size_t nelements,
                         cudaStream_t stream) {
-  cudaCheck(cudaMemcpyAsync(dst.get(), src.get(), nelements *  sizeof(T), cudaMemcpyHostToDevice, stream));
+    cudaCheck(cudaMemcpyAsync(dst.get(), src.get(), nelements * sizeof(T), cudaMemcpyHostToDevice, stream));
   }
 }  // namespace cudautils
 
diff --git a/HeterogeneousCore/CUDAUtilities/interface/prefixScan.h b/HeterogeneousCore/CUDAUtilities/interface/prefixScan.h
index 9e591f2be4d69..8b784bdd61bfe 100644
--- a/HeterogeneousCore/CUDAUtilities/interface/prefixScan.h
+++ b/HeterogeneousCore/CUDAUtilities/interface/prefixScan.h
@@ -151,7 +151,7 @@ __global__ void multiBlockPrefixScan(T const* __restrict__ ci, T* __restrict__ c
 
   // let's get the partial sums from each block
   __shared__ T psum[1024];
-  for (int i = threadIdx.x, ni = gridDim.x; i<ni; i += blockDim.x) {
+  for (int i = threadIdx.x, ni = gridDim.x; i < ni; i += blockDim.x) {
     auto j = 1024 * i + 1023;
     psum[i] = (j < size) ? co[j] : T(0);
   }
diff --git a/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu b/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu
index c0b374b427d4f..f9e5fa28c3ee9 100644
--- a/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu
+++ b/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu
@@ -26,7 +26,7 @@ void go() {
   constexpr int N = 12000;
   T v[N];
   auto v_d = cuda::memory::device::make_unique<T[]>(current_device, N);
-  
+
   cudaCheck(cudaMemcpy(v_d.get(), v, N * sizeof(T), cudaMemcpyHostToDevice));
 
   constexpr uint32_t nParts = 10;
@@ -66,7 +66,7 @@ void go() {
       offsets[10] = 3297 + offsets[9];
     }
 
-   cudaCheck(cudaMemcpy(off_d.get(), offsets, 4 * (nParts + 1), cudaMemcpyHostToDevice));
+    cudaCheck(cudaMemcpy(off_d.get(), offsets, 4 * (nParts + 1), cudaMemcpyHostToDevice));
 
     for (long long j = 0; j < N; j++)
       v[j] = rgen(eng);
@@ -76,7 +76,7 @@ void go() {
         v[j] = sizeof(T) == 1 ? 22 : 3456;
     }
 
-   cudaCheck(cudaMemcpy(v_d.get(), v, N * sizeof(T), cudaMemcpyHostToDevice));
+    cudaCheck(cudaMemcpy(v_d.get(), v, N * sizeof(T), cudaMemcpyHostToDevice));
 
     cudautils::fillManyFromVector(h_d.get(), ws_d.get(), nParts, v_d.get(), off_d.get(), offsets[10], 256, 0);
     cudaCheck(cudaMemcpy(&h, h_d.get(), sizeof(Hist), cudaMemcpyDeviceToHost));
diff --git a/HeterogeneousCore/CUDAUtilities/test/OneHistoContainer_t.cu b/HeterogeneousCore/CUDAUtilities/test/OneHistoContainer_t.cu
index 0bb4e4a353b86..03a969102ee1b 100644
--- a/HeterogeneousCore/CUDAUtilities/test/OneHistoContainer_t.cu
+++ b/HeterogeneousCore/CUDAUtilities/test/OneHistoContainer_t.cu
@@ -132,7 +132,7 @@ void go() {
 
     assert(v_d.get());
     assert(v);
-    cudaCheck(cudaMemcpy(v_d.get(), v , N * sizeof(T), cudaMemcpyHostToDevice));
+    cudaCheck(cudaMemcpy(v_d.get(), v, N * sizeof(T), cudaMemcpyHostToDevice));
     assert(v_d.get());
     cudautils::launch(mykernel<T, NBINS, S, DELTA>, {1, 256}, v_d.get(), N);
   }
diff --git a/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.h b/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.h
index 4533c2c2882b0..8782d6db07e3a 100644
--- a/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.h
+++ b/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.h
@@ -170,8 +170,8 @@ int main() {
   auto a_d = cuda::memory::device::make_unique<Assoc[]>(current_device, 1);
   auto sa_d = cuda::memory::device::make_unique<SmallAssoc[]>(current_device, 1);
   auto ws_d = cuda::memory::device::make_unique<uint8_t[]>(current_device, Assoc::wsSize());
-  
-  cudaCheck(cudaMemcpy(v_d.get(), tr.data(), N *  sizeof(std::array<uint16_t, 4>), cudaMemcpyHostToDevice));
+
+  cudaCheck(cudaMemcpy(v_d.get(), tr.data(), N * sizeof(std::array<uint16_t, 4>), cudaMemcpyHostToDevice));
 #else
   auto a_d = std::make_unique<Assoc>();
   auto sa_d = std::make_unique<SmallAssoc>();
@@ -199,7 +199,7 @@ int main() {
   Assoc la;
 
 #ifdef __CUDACC__
-  cudaCheck(cudaMemcpy(&la, a_d.get(), sizeof(Assoc), cudaMemcpyDeviceToHost)); 
+  cudaCheck(cudaMemcpy(&la, a_d.get(), sizeof(Assoc), cudaMemcpyDeviceToHost));
 #else
   memcpy(&la, a_d.get(), sizeof(Assoc));  // not required, easier
 #endif
diff --git a/HeterogeneousCore/CUDAUtilities/test/copyAsync_t.cpp b/HeterogeneousCore/CUDAUtilities/test/copyAsync_t.cpp
index 0168271baa4fa..dbd1e6a40dcb8 100644
--- a/HeterogeneousCore/CUDAUtilities/test/copyAsync_t.cpp
+++ b/HeterogeneousCore/CUDAUtilities/test/copyAsync_t.cpp
@@ -39,7 +39,7 @@ TEST_CASE("copyAsync", "[cudaMemTools]") {
       auto host = cudautils::make_host_unique<int[]>(N, stream);
 
       SECTION("Copy all") {
-	cudaCheck(cudaMemcpyAsync(host.get(), device.get(), N *  sizeof(int), cudaMemcpyDeviceToHost, stream));
+        cudaCheck(cudaMemcpyAsync(host.get(), device.get(), N * sizeof(int), cudaMemcpyDeviceToHost, stream));
         cudautils::copyAsync(device, host_orig, N, stream);
         cudaCheck(cudaStreamSynchronize(stream));
         for (int i = 0; i < N; ++i) {
@@ -52,7 +52,7 @@ TEST_CASE("copyAsync", "[cudaMemTools]") {
       }
 
       SECTION("Copy some") {
-	cudaCheck(cudaMemcpyAsync(host.get(), device.get(), 42 * sizeof(int), cudaMemcpyDeviceToHost, stream));
+        cudaCheck(cudaMemcpyAsync(host.get(), device.get(), 42 * sizeof(int), cudaMemcpyDeviceToHost, stream));
         cudautils::copyAsync(device, host_orig, 42, stream);
         cudaCheck(cudaStreamSynchronize(stream));
         for (int i = 0; i < 42; ++i) {
@@ -89,7 +89,7 @@ TEST_CASE("copyAsync", "[cudaMemTools]") {
       auto host = cudautils::make_host_unique<int[]>(N, stream);
 
       SECTION("Copy all") {
-	cudaCheck(cudaMemcpyAsync(device.get(), host_orig.get(), N *  sizeof(int), cudaMemcpyHostToDevice, stream));
+        cudaCheck(cudaMemcpyAsync(device.get(), host_orig.get(), N * sizeof(int), cudaMemcpyHostToDevice, stream));
         cudautils::copyAsync(host, device, N, stream);
         cudaCheck(cudaStreamSynchronize(stream));
         for (int i = 0; i < N; ++i) {
diff --git a/HeterogeneousCore/CUDAUtilities/test/radixSort_t.cu b/HeterogeneousCore/CUDAUtilities/test/radixSort_t.cu
index 4573ae106238c..bc042cc012185 100644
--- a/HeterogeneousCore/CUDAUtilities/test/radixSort_t.cu
+++ b/HeterogeneousCore/CUDAUtilities/test/radixSort_t.cu
@@ -102,9 +102,9 @@ void go(bool useShared) {
     auto ind_d = cuda::memory::device::make_unique<uint16_t[]>(current_device, N);
     auto ws_d = cuda::memory::device::make_unique<uint16_t[]>(current_device, N);
     auto off_d = cuda::memory::device::make_unique<uint32_t[]>(current_device, blocks + 1);
-    
-    cudaCheck(cudaMemcpy(v_d.get(), v , N * sizeof(T), cudaMemcpyHostToDevice));
-    cudaCheck(cudaMemcpy(off_d.get(), offsets, 4 * (blocks+1), cudaMemcpyHostToDevice));
+
+    cudaCheck(cudaMemcpy(v_d.get(), v, N * sizeof(T), cudaMemcpyHostToDevice));
+    cudaCheck(cudaMemcpy(off_d.get(), offsets, 4 * (blocks + 1), cudaMemcpyHostToDevice));
 
     if (i < 2)
       std::cout << "lauch for " << offsets[blocks] << std::endl;
@@ -117,12 +117,13 @@ void go(bool useShared) {
       cudautils::launch(
           radixSortMultiWrapper<U, NS>, {blocks, ntXBl, MaxSize * 2}, v_d.get(), ind_d.get(), off_d.get(), nullptr);
     else
-      cudautils::launch(radixSortMultiWrapper2<U, NS>, {blocks, ntXBl}, v_d.get(), ind_d.get(), off_d.get(), ws_d.get());
+      cudautils::launch(
+          radixSortMultiWrapper2<U, NS>, {blocks, ntXBl}, v_d.get(), ind_d.get(), off_d.get(), ws_d.get());
 
     if (i == 0)
       std::cout << "done for " << offsets[blocks] << std::endl;
 
-    cudaCheck(cudaMemcpy(ind, ind_d.get(), 2 * N, cudaMemcpyDeviceToHost));  
+    cudaCheck(cudaMemcpy(ind, ind_d.get(), 2 * N, cudaMemcpyDeviceToHost));
 
     delta += (std::chrono::high_resolution_clock::now() - start);
 
diff --git a/HeterogeneousCore/Producer/test/TestHeterogeneousEDProducerGPUHelpers.cu b/HeterogeneousCore/Producer/test/TestHeterogeneousEDProducerGPUHelpers.cu
index fc6e0684660b7..14e9245e19e2e 100644
--- a/HeterogeneousCore/Producer/test/TestHeterogeneousEDProducerGPUHelpers.cu
+++ b/HeterogeneousCore/Producer/test/TestHeterogeneousEDProducerGPUHelpers.cu
@@ -77,8 +77,8 @@ int TestAcceleratorServiceProducerGPUHelpers_simple_kernel(int input) {
   auto d_b = cuda::memory::device::make_unique<int[]>(current_device, NUM_VALUES);
   auto d_c = cuda::memory::device::make_unique<int[]>(current_device, NUM_VALUES);
 
-   cudaCheck(cudaMemcpyAsync(d_a.get(), h_a.get(), NUM_VALUES * sizeof(int), cudaMemcpyHostToDevice, stream.id()));
-   cudaCheck(cudaMemcpyAsync(d_b.get(), h_b.get(), NUM_VALUES * sizeof(int), cudaMemcpyHostToDevice, stream.id()));
+  cudaCheck(cudaMemcpyAsync(d_a.get(), h_a.get(), NUM_VALUES * sizeof(int), cudaMemcpyHostToDevice, stream.id()));
+  cudaCheck(cudaMemcpyAsync(d_b.get(), h_b.get(), NUM_VALUES * sizeof(int), cudaMemcpyHostToDevice, stream.id()));
 
   int threadsPerBlock{256};
   int blocksPerGrid = (NUM_VALUES + threadsPerBlock - 1) / threadsPerBlock;
@@ -146,8 +146,8 @@ TestHeterogeneousEDProducerGPUTask::ResultType TestHeterogeneousEDProducerGPUTas
   }
 
   // Create stream
- cudaCheck(cudaMemcpyAsync(d_a.get(), h_a.get(), NUM_VALUES * sizeof(float), cudaMemcpyHostToDevice, stream.id()));
- cudaCheck(cudaMemcpyAsync(d_b.get(), h_b.get(), NUM_VALUES * sizeof(float), cudaMemcpyHostToDevice, stream.id()));
+  cudaCheck(cudaMemcpyAsync(d_a.get(), h_a.get(), NUM_VALUES * sizeof(float), cudaMemcpyHostToDevice, stream.id()));
+  cudaCheck(cudaMemcpyAsync(d_b.get(), h_b.get(), NUM_VALUES * sizeof(float), cudaMemcpyHostToDevice, stream.id()));
 
   int threadsPerBlock{32};
   int blocksPerGrid = (NUM_VALUES + threadsPerBlock - 1) / threadsPerBlock;
diff --git a/RecoLocalTracker/SiPixelClusterizer/src/SiPixelFedCablingMapGPUWrapper.cc b/RecoLocalTracker/SiPixelClusterizer/src/SiPixelFedCablingMapGPUWrapper.cc
index 451fe2fb6d75b..d4b8e40dea76b 100644
--- a/RecoLocalTracker/SiPixelClusterizer/src/SiPixelFedCablingMapGPUWrapper.cc
+++ b/RecoLocalTracker/SiPixelClusterizer/src/SiPixelFedCablingMapGPUWrapper.cc
@@ -155,9 +155,12 @@ cudautils::device::unique_ptr<unsigned char[]> SiPixelFedCablingMapGPUWrapper::g
       }
     }
   }
-  
-  cudaCheck(cudaMemcpyAsync(
-      modToUnpDevice.get(), modToUnpHost.get(), pixelgpudetails::MAX_SIZE * sizeof(unsigned char), cudaMemcpyHostToDevice, cudaStream));
+
+  cudaCheck(cudaMemcpyAsync(modToUnpDevice.get(),
+                            modToUnpHost.get(),
+                            pixelgpudetails::MAX_SIZE * sizeof(unsigned char),
+                            cudaMemcpyHostToDevice,
+                            cudaStream));
   return modToUnpDevice;
 }
 
diff --git a/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering_t.h b/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering_t.h
index edc29d6452e69..bb86c1392cdf9 100644
--- a/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering_t.h
+++ b/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering_t.h
@@ -251,8 +251,8 @@ int main(void) {
 #ifdef __CUDACC__
     size_t size32 = n * sizeof(unsigned int);
     size_t size16 = n * sizeof(unsigned short);
-    // size_t size8 = n * sizeof(uint8_t);    
-    
+    // size_t size8 = n * sizeof(uint8_t);
+
     cudaCheck(cudaMemcpy(d_moduleStart.get(), &nModules, sizeof(uint32_t), cudaMemcpyHostToDevice));
 
     cudaCheck(cudaMemcpy(d_id.get(), h_id.get(), size16, cudaMemcpyHostToDevice));
@@ -274,15 +274,15 @@ int main(void) {
     cudaCheck(cudaMemset(d_clusInModule.get(), 0, MaxNumModules * sizeof(uint32_t)));
 
     cudautils::launch(findClus,
-                 {blocksPerGrid, threadsPerBlock},
-                 d_id.get(),
-                 d_x.get(),
-                 d_y.get(),
-                 d_moduleStart.get(),
-                 d_clusInModule.get(),
-                 d_moduleId.get(),
-                 d_clus.get(),
-                 n);
+                      {blocksPerGrid, threadsPerBlock},
+                      d_id.get(),
+                      d_x.get(),
+                      d_y.get(),
+                      d_moduleStart.get(),
+                      d_clusInModule.get(),
+                      d_moduleId.get(),
+                      d_clus.get(),
+                      n);
     cudaDeviceSynchronize();
     cudaCheck(cudaMemcpy(&nModules, d_moduleStart.get(), sizeof(uint32_t), cudaMemcpyDeviceToHost));
 
@@ -300,14 +300,14 @@ int main(void) {
       std::cout << "ERROR!!!!! wrong number of cluster found" << std::endl;
 
     cudautils::launch(clusterChargeCut,
-                 {blocksPerGrid, threadsPerBlock},
-                 d_id.get(),
-                 d_adc.get(),
-                 d_moduleStart.get(),
-                 d_clusInModule.get(),
-                 d_moduleId.get(),
-                 d_clus.get(),
-                 n);
+                      {blocksPerGrid, threadsPerBlock},
+                      d_id.get(),
+                      d_adc.get(),
+                      d_moduleStart.get(),
+                      d_clusInModule.get(),
+                      d_moduleId.get(),
+                      d_clus.get(),
+                      n);
 
     cudaDeviceSynchronize();
 #else
diff --git a/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPU.cu b/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPU.cu
index 44d1a375bcabe..7b02a23c41dca 100644
--- a/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPU.cu
+++ b/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPU.cu
@@ -262,9 +262,9 @@ void testFit() {
   cudaDeviceSynchronize();
 
   cudaCheck(cudaMemcpy(fast_fit_resultsGPUret,
-             fast_fit_resultsGPU,
-             Rfit::maxNumberOfTracks() * sizeof(Vector4d),
-             cudaMemcpyDeviceToHost));
+                       fast_fit_resultsGPU,
+                       Rfit::maxNumberOfTracks() * sizeof(Vector4d),
+                       cudaMemcpyDeviceToHost));
   Rfit::Map4d fast_fit(fast_fit_resultsGPUret + 10, 4);
   std::cout << "Fitted values (FastFit, [X0, Y0, R, tan(theta)]): GPU\n" << fast_fit << std::endl;
   assert(isEqualFuzzy(fast_fit_results, fast_fit));
@@ -311,7 +311,8 @@ void testFit() {
 
   std::cout << "Fitted values (CircleFit):\n" << circle_fit_results.par << std::endl;
 
-  cudaCheck(cudaMemcpy(circle_fit_resultsGPUret, circle_fit_resultsGPU, sizeof(Rfit::circle_fit), cudaMemcpyDeviceToHost));
+  cudaCheck(
+      cudaMemcpy(circle_fit_resultsGPUret, circle_fit_resultsGPU, sizeof(Rfit::circle_fit), cudaMemcpyDeviceToHost));
   std::cout << "Fitted values (CircleFit) GPU:\n" << circle_fit_resultsGPUret->par << std::endl;
   assert(isEqualFuzzy(circle_fit_results.par, circle_fit_resultsGPUret->par));
 
diff --git a/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPUNoFit.cu b/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPUNoFit.cu
index ff0d6aa5f68f5..e16ac3dbbcbc3 100644
--- a/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPUNoFit.cu
+++ b/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPUNoFit.cu
@@ -78,7 +78,8 @@ void testMultiply() {
   cudaCheck(cudaMalloc((void **)&multiply_resultGPU, sizeof(Eigen::Matrix<double, row1, col2>)));
   cudaCheck(cudaMemcpy(JGPU, &J, sizeof(Eigen::Matrix<double, row1, col1>), cudaMemcpyHostToDevice));
   cudaCheck(cudaMemcpy(CGPU, &C, sizeof(Eigen::Matrix<double, row2, col2>), cudaMemcpyHostToDevice));
-  cudaCheck(cudaMemcpy(multiply_resultGPU, &multiply_result, sizeof(Eigen::Matrix<double, row1, col2>), cudaMemcpyHostToDevice));
+  cudaCheck(cudaMemcpy(
+      multiply_resultGPU, &multiply_result, sizeof(Eigen::Matrix<double, row1, col2>), cudaMemcpyHostToDevice));
 
   kernelMultiply<<<1, 1>>>(JGPU, CGPU, multiply_resultGPU);
   cudaDeviceSynchronize();
@@ -203,7 +204,8 @@ void testEigenvalues() {
   cudaDeviceSynchronize();
 
   cudaCheck(cudaMemcpy(mgpudebug, m_gpu, sizeof(Matrix3d), cudaMemcpyDeviceToHost));
-  cudaCheck(cudaMemcpy(ret1, ret_gpu, sizeof(Eigen::SelfAdjointEigenSolver<Matrix3d>::RealVectorType), cudaMemcpyDeviceToHost));
+  cudaCheck(cudaMemcpy(
+      ret1, ret_gpu, sizeof(Eigen::SelfAdjointEigenSolver<Matrix3d>::RealVectorType), cudaMemcpyDeviceToHost));
 #if TEST_DEBUG
   std::cout << "GPU Generated Matrix M 3x3:\n" << (*mgpudebug) << std::endl;
   std::cout << "GPU The eigenvalues of M are:" << std::endl << (*ret1) << std::endl;
diff --git a/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h b/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h
index 59f586eae9812..14263ed7b3d18 100644
--- a/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h
+++ b/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h
@@ -127,10 +127,10 @@ int main() {
       std::cout << "v,t size " << ev.zvert.size() << ' ' << ev.ztrack.size() << std::endl;
       auto nt = ev.ztrack.size();
 #ifdef __CUDACC__
-     cudaCheck(cudaMemcpy(LOC_WS(ntrks), &nt, sizeof(uint32_t), cudaMemcpyHostToDevice));
-     cudaCheck(cudaMemcpy(LOC_WS(zt), ev.ztrack.data(), sizeof(float) * ev.ztrack.size(), cudaMemcpyHostToDevice));
-     cudaCheck(cudaMemcpy(LOC_WS(ezt2), ev.eztrack.data(), sizeof(float) * ev.eztrack.size(), cudaMemcpyHostToDevice));
-     cudaCheck(cudaMemcpy(LOC_WS(ptt2), ev.pttrack.data(), sizeof(float) * ev.eztrack.size(), cudaMemcpyHostToDevice));
+      cudaCheck(cudaMemcpy(LOC_WS(ntrks), &nt, sizeof(uint32_t), cudaMemcpyHostToDevice));
+      cudaCheck(cudaMemcpy(LOC_WS(zt), ev.ztrack.data(), sizeof(float) * ev.ztrack.size(), cudaMemcpyHostToDevice));
+      cudaCheck(cudaMemcpy(LOC_WS(ezt2), ev.eztrack.data(), sizeof(float) * ev.eztrack.size(), cudaMemcpyHostToDevice));
+      cudaCheck(cudaMemcpy(LOC_WS(ptt2), ev.pttrack.data(), sizeof(float) * ev.eztrack.size(), cudaMemcpyHostToDevice));
 #else
       ::memcpy(LOC_WS(ntrks), &nt, sizeof(uint32_t));
       ::memcpy(LOC_WS(zt), ev.ztrack.data(), sizeof(float) * ev.ztrack.size());
@@ -261,7 +261,7 @@ int main() {
 
       cudautils::launch(sortByPt2, {1, 256}, onGPU_d.get(), ws_d.get());
       cudaCheck(cudaGetLastError());
-      cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost)); 
+      cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost));
 #else
       fitVertices(onGPU_d.get(), ws_d.get(), 5000.f);
       sortByPt2(onGPU_d.get(), ws_d.get());
@@ -279,8 +279,8 @@ int main() {
       cudaCheck(cudaMemcpy(wv, LOC_ONGPU(wv), nv * sizeof(float), cudaMemcpyDeviceToHost));
       cudaCheck(cudaMemcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float), cudaMemcpyDeviceToHost));
       cudaCheck(cudaMemcpy(ptv2, LOC_ONGPU(ptv2), nv * sizeof(float), cudaMemcpyDeviceToHost));
-      cudaCheck(cudaMemcpy(nn , LOC_ONGPU(ndof), nv * sizeof(int32_t), cudaMemcpyDeviceToHost));
-      cudaCheck(cudaMemcpy(ind ,LOC_ONGPU(sortInd), nv * sizeof(uint16_t), cudaMemcpyDeviceToHost)); 
+      cudaCheck(cudaMemcpy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t), cudaMemcpyDeviceToHost));
+      cudaCheck(cudaMemcpy(ind, LOC_ONGPU(sortInd), nv * sizeof(uint16_t), cudaMemcpyDeviceToHost));
 #endif
       for (auto j = 0U; j < nv; ++j)
         if (nn[j] > 0)

From 9a1ca2496aa21a5af9c66e9cbf80dd6fb509084f Mon Sep 17 00:00:00 2001
From: Wahid Redjeb <wahid.redjeb.wr@gmail.com>
Date: Mon, 28 Oct 2019 14:50:12 +0100
Subject: [PATCH 11/13] Reorders memory copy operations

---
 HeterogeneousCore/CUDAUtilities/test/copyAsync_t.cpp | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/HeterogeneousCore/CUDAUtilities/test/copyAsync_t.cpp b/HeterogeneousCore/CUDAUtilities/test/copyAsync_t.cpp
index dbd1e6a40dcb8..a9c7774006fa3 100644
--- a/HeterogeneousCore/CUDAUtilities/test/copyAsync_t.cpp
+++ b/HeterogeneousCore/CUDAUtilities/test/copyAsync_t.cpp
@@ -20,8 +20,8 @@ TEST_CASE("copyAsync", "[cudaMemTools]") {
       auto device = cudautils::make_device_unique<int>(stream);
       auto host = cudautils::make_host_unique<int>(stream);
 
-      cudaCheck(cudaMemcpyAsync(host.get(), device.get(), sizeof(int), cudaMemcpyDeviceToHost, stream));
       cudautils::copyAsync(device, host_orig, stream);
+      cudaCheck(cudaMemcpyAsync(host.get(), device.get(), sizeof(int), cudaMemcpyDeviceToHost, stream));
       cudaCheck(cudaStreamSynchronize(stream));
 
       REQUIRE(*host == 42);
@@ -39,8 +39,8 @@ TEST_CASE("copyAsync", "[cudaMemTools]") {
       auto host = cudautils::make_host_unique<int[]>(N, stream);
 
       SECTION("Copy all") {
-        cudaCheck(cudaMemcpyAsync(host.get(), device.get(), N * sizeof(int), cudaMemcpyDeviceToHost, stream));
         cudautils::copyAsync(device, host_orig, N, stream);
+        cudaCheck(cudaMemcpyAsync(host.get(), device.get(), N * sizeof(int), cudaMemcpyDeviceToHost, stream));
         cudaCheck(cudaStreamSynchronize(stream));
         for (int i = 0; i < N; ++i) {
           CHECK(host[i] == i);
@@ -52,8 +52,8 @@ TEST_CASE("copyAsync", "[cudaMemTools]") {
       }
 
       SECTION("Copy some") {
-        cudaCheck(cudaMemcpyAsync(host.get(), device.get(), 42 * sizeof(int), cudaMemcpyDeviceToHost, stream));
         cudautils::copyAsync(device, host_orig, 42, stream);
+        cudaCheck(cudaMemcpyAsync(host.get(), device.get(), 42 * sizeof(int), cudaMemcpyDeviceToHost, stream));
         cudaCheck(cudaStreamSynchronize(stream));
         for (int i = 0; i < 42; ++i) {
           CHECK(host[i] == 200 + i);
@@ -70,8 +70,8 @@ TEST_CASE("copyAsync", "[cudaMemTools]") {
       auto device = cudautils::make_device_unique<int>(stream);
       auto host = cudautils::make_host_unique<int>(stream);
 
-      cudaCheck(cudaMemcpyAsync(device.get(), host_orig.get(), sizeof(int), cudaMemcpyHostToDevice, stream));
       cudautils::copyAsync(host, device, stream);
+      cudaCheck(cudaMemcpyAsync(device.get(), host_orig.get(), sizeof(int), cudaMemcpyHostToDevice, stream));
       cudaCheck(cudaStreamSynchronize(stream));
 
       REQUIRE(*host == 42);
@@ -89,8 +89,8 @@ TEST_CASE("copyAsync", "[cudaMemTools]") {
       auto host = cudautils::make_host_unique<int[]>(N, stream);
 
       SECTION("Copy all") {
-        cudaCheck(cudaMemcpyAsync(device.get(), host_orig.get(), N * sizeof(int), cudaMemcpyHostToDevice, stream));
         cudautils::copyAsync(host, device, N, stream);
+        cudaCheck(cudaMemcpyAsync(device.get(), host_orig.get(), N * sizeof(int), cudaMemcpyHostToDevice, stream));
         cudaCheck(cudaStreamSynchronize(stream));
         for (int i = 0; i < N; ++i) {
           CHECK(host[i] == i);
@@ -102,8 +102,8 @@ TEST_CASE("copyAsync", "[cudaMemTools]") {
       }
 
       SECTION("Copy some") {
-        cudaCheck(cudaMemcpyAsync(device.get(), host_orig.get(), 42 * sizeof(int), cudaMemcpyHostToDevice, stream));
         cudautils::copyAsync(host, device, 42, stream);
+        cudaCheck(cudaMemcpyAsync(device.get(), host_orig.get(), 42 * sizeof(int), cudaMemcpyHostToDevice, stream));
         cudaCheck(cudaStreamSynchronize(stream));
         for (int i = 0; i < 42; ++i) {
           CHECK(host[i] == 200 + i);

From 463b4958fd2384d76d3ff64d23e7597afd564299 Mon Sep 17 00:00:00 2001
From: Wahid Redjeb <wahid.redjeb.wr@gmail.com>
Date: Mon, 28 Oct 2019 15:03:50 +0100
Subject: [PATCH 12/13] Reoders memory copy in Device_to_Host section

---
 HeterogeneousCore/CUDAUtilities/test/copyAsync_t.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/HeterogeneousCore/CUDAUtilities/test/copyAsync_t.cpp b/HeterogeneousCore/CUDAUtilities/test/copyAsync_t.cpp
index a9c7774006fa3..750551161a161 100644
--- a/HeterogeneousCore/CUDAUtilities/test/copyAsync_t.cpp
+++ b/HeterogeneousCore/CUDAUtilities/test/copyAsync_t.cpp
@@ -70,8 +70,8 @@ TEST_CASE("copyAsync", "[cudaMemTools]") {
       auto device = cudautils::make_device_unique<int>(stream);
       auto host = cudautils::make_host_unique<int>(stream);
 
-      cudautils::copyAsync(host, device, stream);
       cudaCheck(cudaMemcpyAsync(device.get(), host_orig.get(), sizeof(int), cudaMemcpyHostToDevice, stream));
+      cudautils::copyAsync(host, device, stream);
       cudaCheck(cudaStreamSynchronize(stream));
 
       REQUIRE(*host == 42);
@@ -89,8 +89,8 @@ TEST_CASE("copyAsync", "[cudaMemTools]") {
       auto host = cudautils::make_host_unique<int[]>(N, stream);
 
       SECTION("Copy all") {
-        cudautils::copyAsync(host, device, N, stream);
         cudaCheck(cudaMemcpyAsync(device.get(), host_orig.get(), N * sizeof(int), cudaMemcpyHostToDevice, stream));
+        cudautils::copyAsync(host, device, N, stream);
         cudaCheck(cudaStreamSynchronize(stream));
         for (int i = 0; i < N; ++i) {
           CHECK(host[i] == i);
@@ -102,8 +102,8 @@ TEST_CASE("copyAsync", "[cudaMemTools]") {
       }
 
       SECTION("Copy some") {
-        cudautils::copyAsync(host, device, 42, stream);
         cudaCheck(cudaMemcpyAsync(device.get(), host_orig.get(), 42 * sizeof(int), cudaMemcpyHostToDevice, stream));
+        cudautils::copyAsync(host, device, 42, stream);
         cudaCheck(cudaStreamSynchronize(stream));
         for (int i = 0; i < 42; ++i) {
           CHECK(host[i] == 200 + i);

From 49d83d653727c19433d45a732ce4ca7d958d06f4 Mon Sep 17 00:00:00 2001
From: Andrea Bocci <andrea.bocci@cern.ch>
Date: Mon, 28 Oct 2019 18:26:42 +0100
Subject: [PATCH 13/13] Fix direction of the copies from device to host

---
 HeterogeneousCore/CUDAUtilities/interface/copyAsync.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/HeterogeneousCore/CUDAUtilities/interface/copyAsync.h b/HeterogeneousCore/CUDAUtilities/interface/copyAsync.h
index 5726736166fdc..bfa1bdee9a03d 100644
--- a/HeterogeneousCore/CUDAUtilities/interface/copyAsync.h
+++ b/HeterogeneousCore/CUDAUtilities/interface/copyAsync.h
@@ -26,7 +26,7 @@ namespace cudautils {
                         const cudautils::device::unique_ptr<T>& src,
                         cudaStream_t stream) {
     static_assert(std::is_array<T>::value == false, "For array types, use the other overload with the size parameter");
-    cudaCheck(cudaMemcpyAsync(dst.get(), src.get(), sizeof(T), cudaMemcpyHostToDevice, stream));
+    cudaCheck(cudaMemcpyAsync(dst.get(), src.get(), sizeof(T), cudaMemcpyDeviceToHost, stream));
   }
 
   // Multiple elements
@@ -43,7 +43,7 @@ namespace cudautils {
                         const cudautils::device::unique_ptr<T[]>& src,
                         size_t nelements,
                         cudaStream_t stream) {
-    cudaCheck(cudaMemcpyAsync(dst.get(), src.get(), nelements * sizeof(T), cudaMemcpyHostToDevice, stream));
+    cudaCheck(cudaMemcpyAsync(dst.get(), src.get(), nelements * sizeof(T), cudaMemcpyDeviceToHost, stream));
   }
 }  // namespace cudautils